├── .clang-format ├── .clang-tidy ├── .editorconfig ├── .github ├── license_template.txt ├── pull_request_template.md └── workflows │ ├── Scaner_BDBA.yml │ ├── Scaner_Trivil.yml │ ├── cpp-graph-test.yml │ ├── docker │ ├── codeScan.dockerfile │ └── devel.dockerfile │ ├── format_scan.yml │ ├── scripts │ ├── change_color.sh │ ├── formatScan │ │ ├── bandit.sh │ │ ├── clangformat.sh │ │ ├── clangtidy.sh │ │ ├── cloc.sh │ │ ├── hadolint.sh │ │ ├── nlp_dict.txt │ │ ├── pydocstyle.sh │ │ ├── pylint.sh │ │ └── trellix.sh │ ├── install_binary.sh │ ├── models │ │ ├── calculate_percertiles.py │ │ ├── cpp_graph_inference.sh │ │ └── generate_report.sh │ ├── prepare_env.sh │ ├── prepare_env_with_conda.bat │ └── prepare_env_with_conda.sh │ ├── trellix.yml │ ├── unit-test-bestla.yml │ ├── unit-test-llmruntime.yml │ ├── unitTest │ ├── env_setup.sh │ └── unittest_llmruntime.sh │ └── windows-test.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CMakePresets.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── bestla ├── CMakeLists.txt ├── CMakePresets.json ├── README.md ├── bestla │ ├── bestla.h │ ├── bestla_device.h │ ├── bestla_epilogue.h │ ├── bestla_gemm.h │ ├── bestla_jit.h │ ├── bestla_parallel.h │ ├── bestla_prologue_a.h │ ├── bestla_prologue_b.h │ ├── bestla_storage.h │ ├── bestla_utils.h │ ├── bestla_wrapper.h │ ├── kernel_avx2.h │ ├── kernel_avx512_bf16.h │ ├── kernel_avx512_fp16.h │ ├── kernel_avx512_vnni.h │ ├── kernel_avx512f.h │ ├── kernel_avx_vnni.h │ ├── kernel_jit.h │ ├── kernel_jit_injector.h │ ├── kernel_ref.h │ ├── kernel_wrapper.h │ ├── sycl │ │ ├── sycl_device.h │ │ ├── sycl_epilogue.h │ │ ├── sycl_gemm.h │ │ ├── sycl_prologue_a.h │ │ ├── sycl_prologue_b.h │ │ ├── sycl_storage.h │ │ ├── sycl_utils.h │ │ └── sycl_wrapper.h │ └── ut │ │ ├── bestla.cpp │ │ ├── bestla_benchmark.cpp │ │ ├── bestla_epilogue.cpp │ │ ├── bestla_gemm.cpp │ │ ├── bestla_parallel.cpp │ │ ├── bestla_prologue_a.cpp │ │ ├── bestla_prologue_b.cpp │ │ ├── bestla_ut.cpp │ │ ├── bestla_ut.h │ │ ├── bestla_utils.cpp │ │ ├── bestla_wrapper.cpp │ │ ├── kernel_intrin.cpp │ │ ├── kernel_jit.cpp │ │ ├── kernel_ut.h │ │ ├── kernel_wrapper.cpp │ │ ├── sycl_benchmark.cpp │ │ ├── sycl_gemm.cpp │ │ ├── sycl_misc.cpp │ │ └── sycl_ut.h └── cmake │ ├── FindSIMD.cmake │ └── sycl.cmake ├── clang-format.py ├── developer_document.md ├── docker ├── DockerFile └── README.md ├── docs ├── advanced_usage.md ├── continuous_batching.md ├── customized_stop.md ├── fused_attention.md ├── gguf.md ├── gptq_and_awq.md ├── imgs │ ├── Attention.PNG │ ├── FFN.PNG │ ├── ORCA_batching.png │ ├── shift-rope.svg │ └── shuffle-attn.svg ├── infinite_inference.md ├── install.md ├── prompt_template.md ├── supported_models.md └── tensor_parallelism.md ├── neural_speed ├── CMakeLists.txt ├── __init__.py ├── application │ ├── CMakeLists.txt │ ├── audio_run.cpp │ ├── common.cpp │ ├── common.h │ ├── main_pybind.cpp │ ├── main_run.cpp │ ├── pybind_gptj.cpp │ ├── quant_model.cpp │ ├── quant_whisper.cpp │ └── whisper_pybind.cpp ├── cmake │ ├── ClangTidy.cmake │ ├── Common.cmake │ └── ISA.cmake ├── convert │ ├── __init__.py │ ├── common.py │ ├── convert-hf-to-gguf.py │ ├── convert_baichuan.py │ ├── convert_bloom.py │ ├── convert_chatglm.py │ ├── convert_dolly.py │ ├── convert_falcon.py │ ├── convert_gemma.py │ ├── convert_gptj.py │ ├── convert_gptneox.py │ ├── convert_grok.py │ ├── convert_llama.py │ ├── convert_mistral.py │ ├── convert_mixtral.py │ ├── convert_mpt.py │ ├── convert_opt.py │ ├── convert_phi.py │ 
├── convert_phi3.py │ ├── convert_quantized_baichuan.py │ ├── convert_quantized_falcon.py │ ├── convert_quantized_gptj.py │ ├── convert_quantized_llama.py │ ├── convert_quantized_mistral.py │ ├── convert_quantized_mixtral.py │ ├── convert_quantized_phi.py │ ├── convert_quantized_qwen.py │ ├── convert_qwen.py │ ├── convert_stablelm.py │ ├── convert_starcoder.py │ └── convert_whisper.py ├── core │ ├── CMakeLists.txt │ ├── README.md │ ├── data_types.h │ ├── layers │ │ ├── CMakeLists.txt │ │ ├── Ops.h │ │ ├── argsort.cpp │ │ ├── argsort.h │ │ ├── bestla_common.hpp │ │ ├── bestla_defs.h │ │ ├── bestla_gemm.cpp │ │ ├── bestla_gemm.h │ │ ├── conv.cpp │ │ ├── conv.h │ │ ├── ele_reduce.h │ │ ├── ele_wise.h │ │ ├── inner_product.cpp │ │ ├── ip_fusion_ffn.cpp │ │ ├── ip_fusion_qkv.cpp │ │ ├── layers.h │ │ ├── memory.cpp │ │ ├── memory.h │ │ ├── mha_dense.cpp │ │ ├── mha_dense.h │ │ ├── mha_dense_tests.cpp │ │ ├── mha_dense_wrapper.h │ │ ├── ne_bestla.cpp │ │ ├── ne_bestla_sycl.cpp │ │ ├── ne_test_layers_utils.hpp │ │ └── vec_dot.h │ ├── ne.h │ ├── ne_bestla.h │ ├── ne_layers.c │ ├── ne_layers.h │ ├── parallel_context.cpp │ ├── parallel_context.h │ └── shared_memory_ccl.hpp ├── models │ ├── CMakeLists.txt │ ├── baichuan │ │ ├── baichuan.cpp │ │ ├── baichuan.h │ │ └── baichuan_utils.cpp │ ├── bloom │ │ ├── bloom.cpp │ │ ├── bloom.h │ │ └── bloom_utils.cpp │ ├── chatglm │ │ ├── chatglm.cpp │ │ ├── chatglm.h │ │ ├── chatglm2.cpp │ │ ├── chatglm2.h │ │ ├── chatglm2_utils.cpp │ │ └── chatglm_utils.cpp │ ├── falcon │ │ ├── falcon.cpp │ │ ├── falcon.h │ │ └── falcon_utils.cpp │ ├── gemma │ │ ├── gemma.cpp │ │ ├── gemma.h │ │ └── gemma_utils.cpp │ ├── gptj │ │ ├── gptj.cpp │ │ ├── gptj.h │ │ └── gptj_utils.cpp │ ├── gptneox │ │ ├── gptneox.cpp │ │ ├── gptneox.h │ │ └── gptneox_utils.cpp │ ├── grok │ │ ├── grok.cpp │ │ ├── grok.h │ │ └── grok_utils.cpp │ ├── llama │ │ ├── llama.cpp │ │ ├── llama.h │ │ └── llama_utils.cpp │ ├── model_utils │ │ ├── CMakeLists.txt │ │ ├── arg_parse.cpp │ │ ├── gguf.h │ │ ├── model_config.h │ │ ├── model_files.h │ │ ├── model_types.h │ │ ├── model_utils.cpp │ │ ├── model_utils.h │ │ ├── pool.cpp │ │ ├── pool.h │ │ ├── quant_config.h │ │ ├── quant_utils.cpp │ │ ├── quant_utils.h │ │ ├── scheduler.cpp │ │ ├── scheduler.h │ │ ├── util.cpp │ │ └── util.h │ ├── models.h │ ├── mpt │ │ ├── mpt.cpp │ │ ├── mpt.h │ │ └── mpt_utils.cpp │ ├── opt │ │ ├── opt.cpp │ │ ├── opt.h │ │ └── opt_utils.cpp │ ├── phi │ │ ├── phi.cpp │ │ ├── phi.h │ │ ├── phi3.cpp │ │ ├── phi3.h │ │ ├── phi3_utils.cpp │ │ └── phi_utils.cpp │ ├── qwen │ │ ├── qwen.cpp │ │ ├── qwen.h │ │ └── qwen_utils.cpp │ ├── requirements │ │ ├── baichuan.sh │ │ ├── baichuan13b-gptq.sh │ │ ├── chatglm-6b.sh │ │ ├── common.txt │ │ ├── mistral.sh │ │ └── mixtral-gptq.sh │ ├── stablelm │ │ ├── stablelm.cpp │ │ ├── stablelm.h │ │ └── stablelm_utils.cpp │ ├── starcoder │ │ ├── starcoder.cpp │ │ ├── starcoder.h │ │ └── starcoder_utils.cpp │ └── whisper │ │ ├── dr_wav.h │ │ ├── whisper.cpp │ │ ├── whisper.h │ │ └── whisper_utils.cpp └── vectors │ ├── CMakeLists.txt │ ├── cpu │ ├── CMakeLists.txt │ ├── quantize.h │ ├── simd.h │ ├── vec.hpp │ ├── vec.hpp.gch │ ├── vec_arithmetic.cpp │ ├── vec_arithmetic.hpp │ ├── vec_base.hpp │ ├── vec_compare.cpp │ ├── vec_compare.hpp │ ├── vec_convert.cpp │ ├── vec_convert.hpp │ ├── vec_load.cpp │ ├── vec_load.hpp │ ├── vec_set.cpp │ ├── vec_set.hpp │ ├── vec_store.cpp │ └── vec_store.hpp │ ├── ele_reduce.cpp │ ├── ele_reduce.h │ ├── fp16.h │ ├── gpu │ ├── CMakeLists.txt │ ├── ele_reduce.cpp │ ├── 
ele_wise.cpp │ ├── reduce.h │ ├── test.cpp │ ├── vector_func.h │ └── vector_kernel.h │ └── parallel_for.h ├── requirements.txt ├── scripts ├── accuracy.py ├── cal_acc.py ├── cal_diff.py ├── convert.py ├── huggingface.py ├── inference.py ├── load_peft_and_merge.py ├── ns_evaluator.py ├── perplexity.py ├── python_api_example.py ├── python_api_example_for_gguf.py ├── python_api_example_for_gptq.py ├── python_api_example_for_model_server.py ├── python_api_example_for_modelscope.py ├── quantize.py ├── requirements.txt ├── run.py └── whisper_example.py ├── security.md ├── setup.py └── tests ├── model-test ├── calculate_percentiles.py ├── cpp_graph_inference.sh ├── cpp_graph_prompts.json └── run_tp.sh ├── requirements.txt ├── test_model_server.py ├── test_modelscope.py └── test_python_api.py /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: Google 3 | DerivePointerAlignment: false 4 | ColumnLimit: 120 5 | SpaceBeforeParens: ControlStatements 6 | SpaceBeforeRangeBasedForLoopColon: true 7 | SortIncludes: false 8 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | Checks: > 2 | -*, 3 | readability-identifier-naming, 4 | readability-const-return-type, 5 | readability-redundant-smartptr-get, 6 | readability-misleading-indentation, 7 | readability-redundant-control-flow, 8 | readability-redundant-member-init, 9 | readability-redundant-string-cstr, 10 | readability-redundant-string-init, 11 | readability-simplify-subscript-expr, 12 | readability-static-accessed-through-instance, 13 | readability-static-definition-in-anonymous-namespace, 14 | readability-uniqueptr-delete-release, 15 | readability-container-size-empty, 16 | # readability-delete-null-pointer, // not applicable for gcc/msvc 17 | readability-make-member-function-const, 18 | readability-redundant-access-specifiers, 19 | performance-for-range-copy, 20 | performance-implicit-conversion-in-loop, 21 | performance-inefficient-algorithm, 22 | performance-inefficient-string-concatenation, 23 | performance-inefficient-vector-operation, 24 | performance-move-const-arg, 25 | performance-unnecessary-copy-initialization, 26 | performance-unnecessary-value-param, 27 | performance-no-automatic-move, 28 | performance-trivially-destructible, 29 | modernize-make-shared, 30 | modernize-use-bool-literals, 31 | modernize-use-emplace, 32 | modernize-use-equals-default, 33 | modernize-use-override, 34 | modernize-use-nullptr, 35 | modernize-use-using, 36 | bugprone-assert-side-effect, 37 | bugprone-copy-constructor-init, 38 | bugprone-forward-declaration-namespace, 39 | bugprone-move-forwarding-reference, 40 | bugprone-parent-virtual-call, 41 | bugprone-too-small-loop-variable, 42 | bugprone-undefined-memory-manipulation, 43 | bugprone-unhandled-self-assignment, 44 | bugprone-multiple-statement-macro, 45 | bugprone-macro-parentheses, 46 | # google-default-arguments, 47 | misc-misplaced-const, 48 | misc-definitions-in-headers, 49 | misc-redundant-expression, 50 | misc-uniqueptr-reset-release, 51 | misc-unused-alias-decls, 52 | misc-unused-using-decls, 53 | cppcoreguidelines-prefer-member-initializer, 54 | 55 | CheckOptions: 56 | - key: readability-identifier-naming.ClassCase 57 | value: lower_case 58 | - key: readability-identifier-naming.StructCase 59 | value: lower_case
60 | - key: readability-identifier-naming.ClassSuffix 61 | value: _t 62 | - key: readability-identifier-naming.StructSuffix 63 | value: _t 64 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 2 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.py] 12 | indent_size = 4 13 | -------------------------------------------------------------------------------- /.github/license_template.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Intel Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Type of Change 2 | 3 | feature or bug fix or documentation or others 4 | API changed or not 5 | 6 | ## Description 7 | 8 | detailed description 9 | Issues: xxx 10 | 11 | ## Expected Behavior & Potential Risk 12 | 13 | the expected behavior triggered by this PR 14 | 15 | ## How has this PR been tested? 16 | 17 | how to reproduce the test (including hardware information) 18 | 19 | ## Dependency Change?
20 | 21 | any library dependency introduced or removed 22 | -------------------------------------------------------------------------------- /.github/workflows/Scaner_BDBA.yml: -------------------------------------------------------------------------------- 1 | name: Scanner BDBA 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: write-all 7 | jobs: 8 | bdba_job: 9 | name: BDBA Scan 10 | uses: intel-innersource/frameworks.ai.infrastructure.code-scan-tools/.github/workflows/Scanner_Bdba.yml@one-ci-cd 11 | with: 12 | repos: ${{ github.event.repository.name }} 13 | refs: ${{ github.ref_name }} 14 | group: "22" 15 | runners: "['inner-source']" 16 | secrets: 17 | token: ${{ secrets.GITHUB_TOKEN }} 18 | BDBA_TOKEN: ${{ secrets.BDBA_TOKEN }} 19 | -------------------------------------------------------------------------------- /.github/workflows/Scaner_Trivil.yml: -------------------------------------------------------------------------------- 1 | name: Trivy Scan for Containers 2 | 3 | on: 4 | workflow_dispatch: 5 | permissions: write-all 6 | jobs: 7 | trivy_container_job: 8 | uses: "intel-innersource/frameworks.ai.infrastructure.code-scan-tools/.github/workflows/Scanner_Trivy.yml@one-ci-cd" 9 | with: 10 | container: ${{ vars.TRIVY_CONTAINER_NAME }} 11 | runners: "['inner-source']" 12 | lmc: false 13 | -------------------------------------------------------------------------------- /.github/workflows/docker/codeScan.dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | ARG UBUNTU_VER=22.04 17 | FROM ubuntu:${UBUNTU_VER} as devel 18 | 19 | # See http://bugs.python.org/issue19846 20 | ENV LANG C.UTF-8 21 | 22 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 23 | aspell \ 24 | aspell-en \ 25 | python3 \ 26 | python3-pip \ 27 | python3-dev \ 28 | python3-distutils \ 29 | build-essential \ 30 | cloc \ 31 | python3.10-venv \ 32 | git 33 | 34 | RUN ln -sf $(which python3) /usr/bin/python 35 | 36 | RUN python -m pip install --no-cache-dir pylint==2.17.5\ 37 | bandit==1.7.4\ 38 | pyspelling\ 39 | pydocstyle 40 | 41 | WORKDIR / 42 | -------------------------------------------------------------------------------- /.github/workflows/docker/devel.dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ARG UBUNTU_VER=22.04 16 | FROM ubuntu:${UBUNTU_VER} as devel 17 | 18 | # See http://bugs.python.org/issue19846 19 | ENV LANG C.UTF-8 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 22 | python3 \ 23 | python3-pip \ 24 | python3-dev \ 25 | python3-distutils \ 26 | autoconf \ 27 | build-essential \ 28 | git \ 29 | libgl1-mesa-glx \ 30 | libglib2.0-0 \ 31 | numactl \ 32 | time \ 33 | wget \ 34 | bc \ 35 | gawk \ 36 | jq \ 37 | python3.10-venv \ 38 | vim 39 | 40 | RUN ln -sf $(which python3) /usr/bin/python 41 | 42 | RUN python -m pip --no-cache-dir install --upgrade pip 43 | RUN python -m pip install --no-cache-dir setuptools 44 | 45 | RUN pip list 46 | 47 | WORKDIR / 48 | 49 | -------------------------------------------------------------------------------- /.github/workflows/format_scan.yml: -------------------------------------------------------------------------------- 1 | name: Format Scan 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | paths: 7 | - neural_speed/** 8 | - bestla/** 9 | - scripts/** 10 | - clang-format.py 11 | - setup.py 12 | - .github/workflows/format_scan.yml 13 | - .github/workflows/scripts/formatScan/** 14 | - "!bestla/*.md" 15 | workflow_dispatch: 16 | 17 | # If there is a new commit, the previous jobs will be canceled 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | env: 23 | DOCKER_CONFIG_NAME: "commonDockerConfig" 24 | REPO_NAME: "code-scan" 25 | REPO_TAG: "1.0" 26 | DOCKER_FILE_NAME: "codeScan" 27 | CONTAINER_NAME: "codeScan" 28 | 29 | jobs: 30 | format-scan: 31 | runs-on: ubuntu-latest 32 | strategy: 33 | matrix: 34 | job_name: [ 35 | "pylint", 36 | "bandit", 37 | "clangformat", 38 | "cloc", 39 | "clangtidy", 40 | # "pydocstyle", 41 | #"pyspelling", 42 | "hadolint" 43 | ] 44 | fail-fast: false 45 | steps: 46 | - name: Docker Clean Up 47 | run: | 48 | docker ps -a 49 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then 50 | docker start ${{ env.CONTAINER_NAME }} 51 | echo "remove left files through container ..." 52 | docker exec ${{ env.CONTAINER_NAME }} bash -c "ls -a /neural-speed && rm -fr /neural-speed/* && rm -fr /neural-speed/.* || true" 53 | fi 54 | 55 | - name: Checkout out Repo 56 | uses: actions/checkout@v3 57 | 58 | - name: Docker Build 59 | run: | 60 | docker build -f ${{ github.workspace }}/.github/workflows/docker/${{ env.DOCKER_FILE_NAME }}.dockerfile -t ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} . 
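# the image built above is tagged code-scan:1.0 (REPO_NAME:REPO_TAG from the env block) and is reused by the Docker Run step below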
61 | 62 | - name: Docker Run 63 | run: | 64 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then 65 | docker stop ${{ env.CONTAINER_NAME }} 66 | docker rm -vf ${{ env.CONTAINER_NAME }} || true 67 | fi 68 | docker run -dit --memory="4g" --memory-reservation="1g" --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} --shm-size="1g" \ 69 | -v ${{ github.workspace }}:/neural-speed \ 70 | ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} 71 | 72 | - name: Env build 73 | run: | 74 | docker exec ${{ env.CONTAINER_NAME }} \ 75 | bash /neural-speed/.github/workflows/scripts/prepare_env.sh 76 | 77 | - name: Code scan check 78 | run: | 79 | docker exec ${{ env.CONTAINER_NAME }} \ 80 | bash -c "bash /neural-speed/.github/workflows/scripts/formatScan/${{ matrix.job_name }}.sh" 81 | 82 | - name: Publish pipeline artifact 83 | if: ${{ !cancelled() }} 84 | uses: actions/upload-artifact@v3 85 | with: 86 | name: ${{ matrix.job_name }} 87 | path: ${{ github.workspace }}/.github/workflows/scripts/formatScan/${{ matrix.job_name }}.* 88 | -------------------------------------------------------------------------------- /.github/workflows/scripts/change_color.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------- general approach start---------------- 4 | 5 | # 1. import this file: 6 | # source path/change_color.sh 7 | # 2. use COLOR/BG: 8 | # $VARIABLE_NAME && output_content && $RESET 9 | # 3. COLOR + BG: 10 | # $COLOR/BG_VARIABLE_NAME && $BG/COLOR_VARIABLE_NAME && output_content && $RESET 11 | # 4. custom 12 | # build your own escape code (change the numbers) 13 | # txt number range (30, 37) 14 | # bg number range (40, 47) 15 | # special effects number range (1, 7) 16 | # echo -en "\\E[number1;number2;number3m" 17 | # e.g. BG_GRAY+LIGHT_RED = "echo -en \\E[47;31m" 18 | 19 | # -------------- general approach end---------------- 20 | 21 | # general setting 22 | # ------------- light_color start---------------- 23 | # black 24 | LIGHT_BLACK="echo -en \\E[30m" 25 | # red 26 | LIGHT_RED="echo -en \\E[31m" 27 | # green 28 | LIGHT_GREEN="echo -en \\E[32m" 29 | # yellow 30 | LIGHT_YELLOW="echo -en \\E[33m" 31 | # blue 32 | LIGHT_BLUE="echo -en \\E[34m" 33 | # purple 34 | LIGHT_PURPLE="echo -en \\E[35m" 35 | # cyan 36 | LIGHT_CYAN="echo -en \\E[36m" 37 | # gray 38 | LIGHT_GRAY="echo -en \\E[37m" 39 | # ------------- light_color end---------------- 40 | 41 | # ------------- bold_color start---------------- 42 | # black 43 | BOLD_BLACK="echo -en \\E[1;30m" 44 | # red 45 | BOLD_RED="echo -en \\E[1;31m" 46 | # green 47 | BOLD_GREEN="echo -en \\E[1;32m" 48 | # yellow 49 | BOLD_YELLOW="echo -en \\E[1;33m" 50 | # blue 51 | BOLD_BLUE="echo -en \\E[1;34m" 52 | # purple 53 | BOLD_PURPLE="echo -en \\E[1;35m" 54 | # cyan 55 | BOLD_CYAN="echo -en \\E[1;36m" 56 | # gray 57 | BOLD_GRAY="echo -en \\E[1;37m" 58 | # ------------- bold_color end---------------- 59 | 60 | # ------------- background_color start---------------- 61 | # black 62 | BG_BLACK="echo -en \\E[40m" 63 | # red 64 | BG_RED="echo -en \\E[41m" 65 | # green 66 | BG_GREEN="echo -en \\E[42m" 67 | # yellow 68 | BG_YELLOW="echo -en \\E[43m" 69 | # blue 70 | BG_BLUE="echo -en \\E[44m" 71 | # purple 72 | BG_PURPLE="echo -en \\E[45m" 73 | # cyan 74 | BG_CYAN="echo -en \\E[46m" 75 | # gray 76 | BG_GRAY="echo -en \\E[47m" 77 | # ------------- background_color end---------------- 78 | 79 | # close 80 | RESET="echo -en \\E[0m" 81 |
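change_color.sh above is sourced by the formatScan scripts that follow; a minimal usage sketch, using only the variables the script itself defines:

```bash
#!/bin/bash
source .github/workflows/scripts/change_color.sh
# each variable holds an `echo -en` command, so "executing" it emits the escape code
$BOLD_RED && echo "scan failed" && $RESET               # bold red text, then restore defaults
$BG_GRAY && $LIGHT_RED && echo "highlighted" && $RESET  # background + foreground combined
```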
-------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/bandit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /neural-speed/.github/workflows/scripts/change_color.sh 3 | pip install bandit==1.7.4 4 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 5 | python -m bandit -r -lll -iii /neural-speed >${log_dir}/bandit.log 6 | exit_code=$? 7 | 8 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" 9 | cat ${log_dir}/bandit.log 10 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 11 | 12 | if [ ${exit_code} -ne 0 ]; then 13 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Bandit error details." && $RESET 14 | exit 1 15 | fi 16 | 17 | $BOLD_PURPLE && echo "Congratulations, Bandit check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 18 | exit 0 19 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/clangformat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /neural-speed/.github/workflows/scripts/change_color.sh 3 | 4 | pip install clang-format==14.0.0 5 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 6 | log_path=${log_dir}/clangformat.log 7 | 8 | cd /neural-speed 9 | git config --global --add safe.directory "*" 10 | 11 | python clang-format.py 12 | 13 | echo "run git diff" 14 | git diff 2>&1 | tee -a ${log_path} 15 | 16 | if [[ ! -f ${log_path} ]] || [[ $(grep -c "diff" ${log_path}) != 0 ]]; then 17 | exit 1 18 | fi 19 | $BOLD_PURPLE && echo "Congratulations, check passed!" && $LIGHT_PURPLE && echo "You can click on the artifact button to see the log details." && $RESET 20 | exit 0 21 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/clangtidy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /neural-speed/.github/workflows/scripts/change_color.sh 4 | 5 | pip install cmake ninja clang-tidy==16.0.4 6 | REPO_DIR=/neural-speed 7 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 8 | log_path=${log_dir}/clangtidy.log 9 | 10 | # compile binary 11 | cd ${REPO_DIR} 12 | mkdir build 13 | cd build 14 | cmake .. -G Ninja -DNS_USE_CLANG_TIDY=CHECK -DBTLA_ENABLE_OPENMP=OFF -DNS_USE_OMP=OFF 15 | ninja 2>&1 | tee ${log_path} 16 | 17 | if [[ ! -f ${log_path} ]] || [[ $(grep -c "warning:" ${log_path}) != 0 ]] || [[ $(grep -c "error" ${log_path}) != 0 ]]; then 18 | exit 1 19 | fi 20 | $BOLD_PURPLE && echo "Congratulations, check passed!" && $LIGHT_PURPLE && echo "You can click on the artifact button to see the log details."
&& $RESET 21 | exit 0 22 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/cloc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /neural-speed/.github/workflows/scripts/change_color.sh 4 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 5 | cloc --include-lang=Python --csv --out=${log_dir}/cloc.csv /neural-speed 6 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/hadolint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | source /neural-speed/.github/workflows/scripts/change_color.sh 17 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 18 | 19 | find . -type f \( -name "Dockerfile*" \) -print -exec hadolint --ignore DL3006 --ignore DL3007 --ignore DL3008 {} \; 2>&1 | tee ${log_dir}/hadolint.log 20 | 21 | if [[ $(grep -c "error" ${log_dir}/hadolint.log) != 0 ]]; then 22 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and check error details." && $RESET 23 | exit 1 24 | fi 25 | 26 | $BOLD_PURPLE && echo "Congratulations, Hadolint check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 27 | exit 0 -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/nlp_dict.txt: -------------------------------------------------------------------------------- 1 | aadd 2 | aas 3 | alse 4 | ans 5 | bu 6 | charactor 7 | daa 8 | datas 9 | dota 10 | dout 11 | endianess 12 | fo 13 | followings 14 | haa 15 | inout 16 | iterm 17 | mata 18 | matc 19 | mone 20 | nd 21 | ore 22 | ot 23 | parm 24 | ques 25 | rouge 26 | ser 27 | sie 28 | te 29 | tne 30 | tye 31 | ue 32 | wya 33 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/pydocstyle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /neural-speed/.github/workflows/scripts/change_color.sh 4 | 5 | REPO_DIR=/neural-speed 6 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 7 | pydocstyle --convention=google ${REPO_DIR} >${log_dir}/pydocstyle.log 8 | exit_code=$? 
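# $? must be read on the very next line: the log-printing commands below would overwrite the pydocstyle exit status before the final check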
9 | 10 | $BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd start --------------------------" && $RESET 11 | echo "pydocstyle --convention=google ${REPO_DIR} >${log_dir}/pydocstyle.log" 12 | $BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd end --------------------------" && $RESET 13 | 14 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" 15 | cat $log_dir/pydocstyle.log 16 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 17 | 18 | if [ ${exit_code} -ne 0 ]; then 19 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view error details." && $RESET 20 | exit 1 21 | fi 22 | 23 | $BOLD_PURPLE && echo "Congratulations, check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 24 | exit 0 25 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/pylint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /neural-speed/.github/workflows/scripts/change_color.sh 4 | cd /neural-speed 5 | $BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET 6 | git config --global --add safe.directory "*" 7 | git submodule update --init --recursive 8 | 9 | $BOLD_YELLOW && echo "---------------- install NeuralSpeed -------------" && $RESET 10 | export PYTHONPATH=`pwd` 11 | pip list 12 | 13 | 14 | cd /neural-speed 15 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 16 | if [ -f "requirements.txt" ]; then 17 | python -m pip install --default-timeout=100 -r requirements.txt 18 | pip list 19 | else 20 | echo "Not found requirements.txt file." 21 | fi 22 | 23 | echo "[DEBUG] list pipdeptree..." 24 | pip install pipdeptree 25 | pipdeptree 26 | 27 | python -m pylint -f json --disable=R,C,W,E1129 \ 28 | --enable=line-too-long \ 29 | --max-line-length=120 \ 30 | --disable=no-name-in-module,import-error,no-member,undefined-variable,no-value-for-parameter,unexpected-keyword-arg,not-callable,no-self-argument,too-many-format-args,invalid-unary-operand-type,too-many-function-args \ 31 | --extension-pkg-whitelist=numpy,nltk \ 32 | --ignored-classes=TensorProto,NodeProto \ 33 | --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,cv2,PIL.Image \ 34 | /neural-speed/neural_speed >${log_dir}/pylint.json 35 | exit_code=$? 36 | 37 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET 38 | cat ${log_dir}/pylint.json 39 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 40 | 41 | if [ ${exit_code} -ne 0 ]; then 42 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Pylint error details." && $RESET 43 | exit 1 44 | fi 45 | $BOLD_PURPLE && echo "Congratulations, Pylint check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." 
&& $RESET 46 | exit 0 47 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/trellix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | source /neural-speed/.github/workflows/scripts/change_color.sh 17 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 18 | 19 | 20 | echo "---Updating definition (DAT) files ---" 21 | DEFS_URL=https://update.nai.com/products/commonupdater/current/vscandat1000/dat/0000 22 | echo "Finding latest defs at $DEFS_URL/avvdat.ini..." \ 23 | && wget -q $DEFS_URL/avvdat.ini \ 24 | && echo "SUCCESS" || exit 1 25 | 26 | inifile="avvdat.ini" 27 | filename=`awk -F"=" '$2 ~ /avvdat.*zip/ { print $2 } ' $inifile` 28 | filename2="$(echo -e "${filename}" | tr -d '[:space:]')" 29 | 30 | if [ -z "$filename2" ] 31 | then 32 | echo "Cannot get defs information from INI file:" 33 | cat $inifile 34 | exit 1 35 | fi 36 | 37 | echo "Downloading latest defs from $DEFS_URL/$filename2..." \ 38 | && wget -q $DEFS_URL/$filename2 \ 39 | && echo "SUCCESS" || exit 1 40 | 41 | echo "Extracting latest defs..." \ 42 | && unzip -o $filename2 -d /usr/local/uvscan \ 43 | && echo "SUCCESS" || exit 1 44 | 45 | echo "--- Scanning ---" 46 | ENV_SCAN_OPTS="--analyze --mime --program --recursive --unzip --threads 4 --summary --verbose --html=${workspace}/.github/workflows/scripts/formatScan/report.html" 47 | echo "Scan Options: $ENV_SCAN_OPTS" 48 | 49 | rm -r ${workspace}/avvdat* 50 | rm -r ${workspace}/.git 51 | uvscan $ENV_SCAN_OPTS ${workspace} 2>&1 | tee ${log_dir}/trellix.log 52 | 53 | 54 | if [[ $(grep "Possibly Infected" ${log_dir}/trellix.log | sed 's/[^0-9]//g') != 0 ]]; then 55 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and check error details." && $RESET 56 | exit 1 57 | fi 58 | 59 | $BOLD_PURPLE && echo "Congratulations, Trellix Scan passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details."
&& $RESET 60 | exit 0 -------------------------------------------------------------------------------- /.github/workflows/scripts/install_binary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /neural-speed/.github/workflows/scripts/change_color.sh 3 | 4 | cd /neural-speed 5 | $BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET 6 | git config --global --add safe.directory "*" 7 | git submodule update --init --recursive 8 | 9 | 10 | $BOLD_YELLOW && echo "---------------- run python setup.py sdist bdist_wheel -------------" && $RESET 11 | python setup.py bdist_wheel 12 | 13 | $BOLD_YELLOW && echo "---------------- pip install binary -------------" && $RESET 14 | pip install dist/neural_speed*.whl 15 | pip list 16 | -------------------------------------------------------------------------------- /.github/workflows/scripts/prepare_env.sh: -------------------------------------------------------------------------------- 1 | cd /neural-speed 2 | 3 | pip install -U pip 4 | 5 | if [ -f "requirements.txt" ]; then 6 | python -m pip install --default-timeout=100 -r requirements.txt 7 | pip list 8 | else 9 | echo "Not found requirements.txt file." 10 | fi 11 | -------------------------------------------------------------------------------- /.github/workflows/scripts/prepare_env_with_conda.bat: -------------------------------------------------------------------------------- 1 | SET conda_env_name=windows_build_ns 2 | SET python_version=3.10 3 | cd ../../.. 4 | 5 | FOR /F %%i IN ('conda info -e ^| find /c "%conda_env_name%"') do SET CONDA_COUNT=%%i 6 | if %CONDA_COUNT% EQU 0 ( 7 | CALL conda create python=%python_version% -y -n %conda_env_name% 8 | ) 9 | 10 | IF %ERRORLEVEL% NEQ 0 ( 11 | echo "Could not create new conda environment." 12 | exit 1 13 | ) 14 | CALL conda activate %conda_env_name% 15 | CALL pip uninstall neural-speed -y 16 | echo "pip list all the components------------->" 17 | CALL pip list 18 | CALL pip config set global.proxy proxy-prc.intel.com:913 19 | CALL pip install -U pip 20 | echo "Installing requirements for validation scripts..." 21 | CALL pip install -i https://pypi.python.org/simple setuptools_scm 22 | CALL pip install -r requirements.txt 23 | echo "pip list all the components------------->" 24 | CALL pip list 25 | echo "------------------------------------------" 26 | IF %ERRORLEVEL% NEQ 0 ( 27 | echo "Could not install requirements." 28 | exit 1 29 | ) 30 | 31 | git submodule update --init --recursive 32 | python setup.py sdist bdist_wheel 33 | IF %ERRORLEVEL% NEQ 0 ( 34 | echo "Could not build binary." 35 | exit 1 36 | ) 37 | -------------------------------------------------------------------------------- /.github/workflows/scripts/prepare_env_with_conda.sh: -------------------------------------------------------------------------------- 1 | cd ${WORKING_DIR} 2 | conda_env_name=$1 3 | python_version=$2 4 | if [[ -z "${conda_env_name}" ]] || [[ -z "${python_version}" ]]; then 5 | $BOLD_RED && echo "need provide with conda env name and python version" && $RESET 6 | exit 1 7 | fi 8 | source ~/.bashrc 9 | conda create -n ${conda_env_name} python=${python_version} -y 10 | source activate ${conda_env_name} || conda activate ${conda_env_name} 11 | pip install -U pip 12 | 13 | if [ -f "requirements.txt" ]; then 14 | python -m pip install --default-timeout=100 -r requirements.txt 15 | pip list 16 | else 17 | echo "Not found requirements.txt file." 
18 | fi 19 | -------------------------------------------------------------------------------- /.github/workflows/trellix.yml: -------------------------------------------------------------------------------- 1 | name: Trellix Command Line Scanner 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: write-all 7 | jobs: 8 | Trellix: 9 | runs-on: inner-source 10 | steps: 11 | - name: Clean Up Working Directory 12 | run: sudo rm -rf ${{github.workspace}}/* 13 | 14 | - name: Checkout out Repo 15 | uses: actions/checkout@v4 16 | 17 | - name: Run Trellix Scanner 18 | env: 19 | workspace: ${{ github.workspace }} 20 | run: bash .github/workflows/scripts/formatScan/trellix.sh 21 | 22 | - name: Publish pipeline artifact 23 | if: ${{ !cancelled() }} 24 | uses: actions/upload-artifact@v4 25 | with: 26 | path: ${{ github.workspace }}/.github/workflows/scripts/formatScan/report.html 27 | -------------------------------------------------------------------------------- /.github/workflows/unit-test-bestla.yml: -------------------------------------------------------------------------------- 1 | name: Bestla Unit Test 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | paths: 7 | - bestla/** 8 | - .github/workflows/unit-test-bestla.yml 9 | - '!bestla/README.md' 10 | workflow_dispatch: 11 | inputs: 12 | compiler_version: 13 | description: 'compiler_version' 14 | required: false 15 | type: string 16 | default: '13.2.0' 17 | 18 | # If there is a new commit, the previous jobs will be canceled 19 | concurrency: 20 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 21 | cancel-in-progress: true 22 | 23 | env: 24 | INPUT_COMPILER_VERSION: ${{ inputs.compiler_version || '13.2.0' }} 25 | WORKING_DIR: ${{ github.workspace }} 26 | CONTAINER_NAME: "utTest" 27 | 28 | jobs: 29 | unit-test: 30 | runs-on: [self-hosted, linux, X64, spr] 31 | steps: 32 | - name: Docker Clean Up 33 | run: | 34 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then 35 | docker start ${{ env.CONTAINER_NAME }}-${{ runner.name }} 36 | echo "remove left files through container ..." 37 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} bash -c "ls -a /neural-speed && rm -fr /neural-speed/* && rm -fr /neural-speed/.* || true" 38 | fi 39 | - name: Checkout out Repo 40 | uses: actions/checkout@v3 41 | with: 42 | submodules: "recursive" 43 | fetch-tags: true 44 | 45 | - name: Env build 46 | run: | 47 | echo "set up conda env for bestla unit test" 48 | source ~/.bashrc 49 | bash ${{ github.workspace }}/.github/workflows/scripts/prepare_env_with_conda.sh "unit-test-bestla" "3.10" 50 | conda activate unit-test-bestla || source activate unit-test-bestla 51 | conda install --update-deps -c conda-forge gxx==${{ env.INPUT_COMPILER_VERSION }} gcc==${{ env.INPUT_COMPILER_VERSION }} gxx_linux-64==${{ env.INPUT_COMPILER_VERSION }} libstdcxx-ng sysroot_linux-64 -y 52 | 53 | - name: Run UT 54 | run: | 55 | #source /opt/rh/gcc-toolset-12/enable 56 | source ~/.bashrc 57 | conda activate unit-test-bestla || source activate unit-test-bestla 58 | export LD_LIBRARY_PATH=${HOME}/miniforge/envs/unit-test-bestla/lib/:$LD_LIBRARY_PATH 59 | cd ${{ github.workspace }}/bestla && mkdir build && cd build && cmake ..
-DBTLA_UT_ALL=ON && make -j 60 | ./bestla_ut 2>&1 | tee unit_test_bestla.log 61 | 62 | - name: Check Result 63 | run: | 64 | if [[ $(grep -c "No such file or directory" ${{ github.workspace }}/bestla/build/unit_test_bestla.log) != 0 ]]; then 65 | echo "neural-speed Compile Failed" 66 | exit 1 67 | fi 68 | if [[ $(grep -c "Case Failed" ${{ github.workspace }}/bestla/build/unit_test_bestla.log) != 0 ]]; then 69 | echo "UT Failed! Please check UT log." 70 | exit 1 71 | fi 72 | 73 | - name: Publish pipeline artifact 74 | uses: actions/upload-artifact@v3 75 | if: ${{ !cancelled() }} 76 | with: 77 | name: Bestla Unit Test 78 | path: ${{ github.workspace }}/bestla/build/unit_test*.* 79 | -------------------------------------------------------------------------------- /.github/workflows/unit-test-llmruntime.yml: -------------------------------------------------------------------------------- 1 | name: Python Unit Test 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | paths: 7 | - neural_speed/** 8 | - bestla/** 9 | - tests/** 10 | - .github/workflows/unit-test-llmruntime.yml 11 | - .github/workflows/unitTest/** 12 | - 'CMakeLists.txt' 13 | - 'setup.py' 14 | - '!**/*.md' 15 | workflow_dispatch: 16 | 17 | # If there is a new commit, the previous jobs will be canceled 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | env: 23 | DOCKER_CONFIG_NAME: "commonDockerConfig" 24 | REPO_NAME: "neural-speed" 25 | REPO_TAG: "py39" 26 | DOCKER_FILE_NAME: "devel" 27 | CONTAINER_NAME: "utTest" 28 | 29 | jobs: 30 | unit-test: 31 | runs-on: [self-hosted, linux, X64, llmruntime-node] 32 | steps: 33 | - name: Load environment variables 34 | run: cat ~/actions-runner3/.env >> $GITHUB_ENV 35 | 36 | - name: Docker Clean Up 37 | run: | 38 | docker ps -a 39 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then 40 | docker start ${{ env.CONTAINER_NAME }}-${{ runner.name }} 41 | echo "remove left files through container ..." 42 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} bash -c "ls -a /neural-speed && rm -fr /neural-speed/* && rm -fr /neural-speed/.* || true" 43 | fi 44 | 45 | - name: Checkout out Repo 46 | uses: actions/checkout@v3 47 | with: 48 | submodules: "recursive" 49 | fetch-tags: true 50 | 51 | - name: Docker Build 52 | run: | 53 | docker build -f ${{ github.workspace }}/.github/workflows/docker/${{ env.DOCKER_FILE_NAME }}.dockerfile --build-arg http_proxy="${{ env.HTTP_PROXY }}" --build-arg https_proxy="${{ env.HTTPS_PROXY }}" -t ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} . 
54 | 55 | - name: Docker Run 56 | run: | 57 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then 58 | docker stop ${{ env.CONTAINER_NAME }}-${{ runner.name }} 59 | docker rm -vf ${{ env.CONTAINER_NAME }}-${{ runner.name }} || true 60 | fi 61 | docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }}-${{ runner.name }} -v /dev/shm:/dev/shm \ 62 | -e http_proxy="${{ env.HTTP_PROXY }}" \ 63 | -e https_proxy="${{ env.HTTPS_PROXY }}" \ 64 | -v ${{ github.workspace }}:/neural-speed \ 65 | -v /tf_dataset2:/tf_dataset2 \ 66 | -v ~/.cache/oneAPI:/cache \ 67 | ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} 68 | 69 | - name: Env build 70 | run: | 71 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \ 72 | bash /neural-speed/.github/workflows/scripts/prepare_env.sh 73 | 74 | - name: Binary build 75 | run: | 76 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \ 77 | bash -c "cd /neural-speed/.github/workflows/scripts \ 78 | && bash install_binary.sh" 79 | 80 | - name: Run UT 81 | run: | 82 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \ 83 | bash -c "cd /neural-speed/.github/workflows/unitTest \ 84 | && bash unittest_llmruntime.sh" 85 | 86 | - name: Publish pipeline artifact 87 | uses: actions/upload-artifact@v3 88 | if: ${{ !cancelled() }} 89 | with: 90 | name: Python Unit Test 91 | path: ${{ github.workspace }}/log_dir/unit_test*.* 92 | -------------------------------------------------------------------------------- /.github/workflows/unitTest/env_setup.sh: -------------------------------------------------------------------------------- 1 | pip list 2 | 3 | # Install test requirements 4 | echo "Install Tests Requirements" 5 | cd $1 || exit 1 6 | pwd 7 | if [ -f "requirements.txt" ]; then 8 | python -m pip install --default-timeout=100 -r requirements.txt 9 | pip list 10 | else 11 | echo "Not found requirements.txt file." 12 | fi 13 | 14 | pip install coverage 15 | pip install pytest 16 | -------------------------------------------------------------------------------- /.github/workflows/unitTest/unittest_llmruntime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /neural-speed/.github/workflows/scripts/change_color.sh 3 | test_install_backend="true" 4 | LOG_DIR=/neural-speed/log_dir 5 | mkdir -p ${LOG_DIR} 6 | WORKING_DIR="/neural-speed/tests" 7 | 8 | # -------------------LLM Runtime Test------------------- 9 | function llmruntime_test() { 10 | cd ${WORKING_DIR} 11 | local ut_log_name=${LOG_DIR}/unit_test_llm_runtime.log 12 | find . -name "test*.py" | sed 's,\.\/,python ,g' | sed 's/$/ --verbose/' >run.sh 13 | # run UT 14 | $BOLD_YELLOW && echo "cat run.sh..." && $RESET 15 | cat run.sh | tee ${ut_log_name} 16 | $BOLD_YELLOW && echo "------UT start-------" && $RESET 17 | bash run.sh 2>&1 | tee -a ${ut_log_name} 18 | $BOLD_YELLOW && echo "------UT end -------" && $RESET 19 | 20 | if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || 21 | [ $(grep -c "OK" ${ut_log_name}) == 0 ] || 22 | [ $(grep -c "Segmentation fault" ${ut_log_name}) != 0 ] || 23 | [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || 24 | [ $(grep -c "==ERROR:" ${ut_log_name}) != 0 ] || 25 | [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ]; then 26 | $BOLD_RED && echo "Find errors in engine test, please check the output..." && $RESET 27 | exit 1 28 | else 29 | $BOLD_GREEN && echo "engine test finished successfully!" 
&& $RESET 30 | fi 31 | } 32 | 33 | function main() { 34 | bash /neural-speed/.github/workflows/unitTest/env_setup.sh "${WORKING_DIR}" 35 | llmruntime_test 36 | } 37 | 38 | main 39 | -------------------------------------------------------------------------------- /.github/workflows/windows-test.yml: -------------------------------------------------------------------------------- 1 | name: Windows Binary Test 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | paths: 7 | - ".github/workflows/windows-test.yml" 8 | - "requirements.txt" 9 | - "setup.py" 10 | - "neural_speed/**" 11 | - "bestla/**" 12 | - '!bestla/ut/**' 13 | - '!bestla/xbyak/**' 14 | - '!bestla/xbyak/*.md' 15 | - '!neural_speed/*.md' 16 | 17 | workflow_dispatch: 18 | 19 | # If there is a new commit, the previous jobs will be canceled 20 | concurrency: 21 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 22 | cancel-in-progress: true 23 | 24 | env: 25 | SCRIPT_PATH: ${{ github.workspace }}\.github\workflows\scripts 26 | WORKING_DIR: ${{ github.workspace }} 27 | 28 | jobs: 29 | Windows-Binary-Test: 30 | runs-on: 'Windows' 31 | steps: 32 | - name: Checkout out Repo 33 | uses: actions/checkout@v4 34 | with: 35 | submodules: "recursive" 36 | fetch-tags: true 37 | path: "a" 38 | 39 | - name: Binary build 40 | shell: cmd 41 | run: | 42 | SET HTTP_PROXY=http://proxy-dmz.intel.com:912 43 | SET HTTPS_PROXY=http://proxy-dmz.intel.com:912 44 | SET http_proxy=http://proxy-dmz.intel.com:912 45 | SET https_proxy=http://proxy-dmz.intel.com:912 46 | cd ${{ github.workspace }}\a\.github\workflows\scripts 47 | prepare_env_with_conda.bat 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### ignore binary files in llm-runtime ### 2 | /neural_speed/* 3 | !/neural_speed/*.* 4 | !/neural_speed/*/ 5 | 6 | *.exe 7 | *.dll 8 | *.dylib 9 | *.pyd 10 | *.so 11 | *.so.* 12 | 13 | 14 | .vs 15 | .vscode 16 | /out 17 | __pycache__ 18 | neural_speed.egg-info/ 19 | build 20 | runtime_outs 21 | out 22 | debug/ 23 | .eggs/ 24 | dist/ 25 | .cache/ 26 | .clangd 27 | CMakeUserPresets.json 28 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/pybind11"] 2 | path = third_party/pybind11 3 | url = https://github.com/pybind/pybind11.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: true 3 | autoupdate_schedule: quarterly 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: debug-statements 10 | - id: file-contents-sorter 11 | files: | 12 | (?x)^( 13 | .github/workflows/scripts/formatScan/nlp_dict.txt 14 | )$ 15 | args: [--unique] 16 | - id: end-of-file-fixer 17 | files: (.*\.(py|md|rst|yaml|yml))$ 18 | exclude: | 19 | (?x)^( 20 | third-party/.+ 21 | )$ 22 | - id: check-json 23 | - id: check-yaml 24 | exclude: | 25 | (?x)^( 26 | third-party/.+ 27 | )$ 28 | - id: requirements-txt-fixer 29 | exclude: | 30 | (?x)^( 31 | third-party/.+ 32 | )$ 33 | - id: trailing-whitespace 34 | files: (.*\.(py|rst|cmake|yaml|yml))$ 35 | exclude: | 36 | (?x)^( 37 | third-party/.+ 38 | )$ 39 | 40 | - repo: 
https://github.com/codespell-project/codespell 41 | rev: v2.2.6 42 | hooks: 43 | - id: codespell 44 | args: 45 | [-w, --ignore-words=.github/workflows/scripts/formatScan/nlp_dict.txt] 46 | exclude: | 47 | (?x)^( 48 | .+.po|.+.ts|.+.js|.+.map|.+.js.map|.+.css.map| 49 | .github/workflows/scripts/formatScan/nlp_dict.txt| 50 | tests/model-test/cpp_graph_prompts.json 51 | )$ 52 | 53 | - repo: https://github.com/Lucas-C/pre-commit-hooks 54 | rev: v1.5.5 55 | hooks: 56 | - id: insert-license 57 | files: | 58 | (?x)^( 59 | neural_speed/.*(py|yaml|yml|sh)| 60 | bestla/.*(py|yaml|yml|sh)| 61 | tests/.*(py|yaml|yml|sh) 62 | )$ 63 | args: 64 | [ 65 | --license-filepath=.github/license_template.txt, 66 | --use-current-year, 67 | --detect-license-in-X-top-lines=40, 68 | --skip-license-insertion-comment=Copyright, 69 | ] 70 | # - repo: https://github.com/asottile/yesqa 71 | # rev: v1.5.0 72 | # hooks: 73 | # - id: yesqa 74 | # name: Unused noqa 75 | # 76 | # - repo: https://github.com/pycqa/isort 77 | # rev: 5.13.2 78 | # hooks: 79 | # - id: isort 80 | # exclude: | 81 | # (?x)^( 82 | # examples/.+ 83 | # )$ 84 | # 85 | # - repo: https://github.com/PyCQA/docformatter 86 | # rev: v1.7.5 87 | # hooks: 88 | # - id: docformatter 89 | # args: [ 90 | # --in-place, 91 | # --wrap-summaries=0, # 0 means disable wrap 92 | # --wrap-descriptions=0, # 0 means disable wrap 93 | # --black, 94 | # --style=google, 95 | # ] 96 | # exclude: | 97 | # (?x)^( 98 | # examples/.+ 99 | # )$ 100 | # 101 | # - repo: https://github.com/psf/black.git 102 | # rev: 23.12.1 103 | # hooks: 104 | # - id: black 105 | # files: (.*\.py)$ 106 | # exclude: | 107 | # (?x)^( 108 | # examples/.+ 109 | # )$ 110 | # 111 | # - repo: https://github.com/asottile/blacken-docs 112 | # rev: 1.16.0 113 | # hooks: 114 | # - id: blacken-docs 115 | # args: [--line-length=120, --skip-errors] 116 | # exclude: | 117 | # (?x)^( 118 | # examples/.+| 119 | # docs/source-app 120 | # )$ 121 | # 122 | # - repo: https://github.com/astral-sh/ruff-pre-commit 123 | # rev: v0.1.9 124 | # hooks: 125 | # - id: ruff 126 | # args: [--fix, --exit-non-zero-on-fix, --no-cache] 127 | # exclude: | 128 | # (?x)^( 129 | # examples/.+ 130 | # )$ 131 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | Neural Speed is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed.
22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith <joe.smith@email.com> 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /bestla/bestla/bestla.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
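// bestla.h: BesTLA's core public definitions: the BTLA_CODE status codes, the BTLA_ISA dispatch levels, the bit-packed BTLA_DTYPE element-type encoding, and the BTLA_PROLOGUEB_IDS weight-format identifiers.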
14 | #pragma once 15 | #include <stdint.h> 16 | enum class BTLA_CODE { 17 | Success = 0, 18 | InvalidParam = 1, 19 | InvalidISA = 2, 20 | RuntimeError = 4, 21 | NotSupport = 8, 22 | }; 23 | enum class BTLA_ISA : uint8_t { 24 | NoSIMD = 0, 25 | AVX, 26 | AVX2, 27 | AVX_VNNI, 28 | AVX512F, 29 | AVX512BW, 30 | AVX512_VNNI, 31 | AVX512_BF16, 32 | AVX512_FP16, 33 | AMX_BF16, 34 | AMX_INT8, 35 | AMX_FP16, 36 | ISA_COUNT, 37 | }; 38 | enum class BTLA_DTYPE : uint32_t { 39 | EleBitsMask = 0xff, 40 | EleBitsShift = 0, 41 | EleBitsUndef = 0, 42 | EleBits1 = 1, 43 | EleBits2 = 2, 44 | EleBits3 = 3, 45 | EleBits4 = 4, 46 | EleBits5 = 5, 47 | EleBits6 = 6, 48 | EleBits7 = 7, 49 | EleBits8 = 8, 50 | EleBits16 = 16, 51 | EleBits32 = 32, 52 | EleBits64 = 64, 53 | TypeMask = 0xff00, 54 | TypeShift = 8, 55 | TypeFloat = 0 << TypeShift, 56 | TypeInt = 1 << TypeShift, 57 | SubTypeMask = 0xff0000, 58 | SubTypeShift = 16, 59 | SubType0 = 0 << SubTypeShift, 60 | SubType1 = 1 << SubTypeShift, 61 | SubType2 = 2 << SubTypeShift, 62 | SubType3 = 3 << SubTypeShift, 63 | SubType4 = 4 << SubTypeShift, 64 | F64 = EleBits64 | TypeFloat, 65 | F32 = EleBits32 | TypeFloat, 66 | F16 = EleBits16 | TypeFloat, 67 | BF16 = EleBits16 | TypeFloat | SubType1, 68 | F8_E4M3 = EleBits8 | TypeFloat, 69 | F8_E5M2 = EleBits8 | TypeFloat | SubType1, 70 | F8_E3M4 = EleBits8 | TypeFloat | SubType2, 71 | F8_E8M0 = EleBits8 | TypeFloat | SubType3, 72 | DQ8_BNB = EleBits8 | TypeFloat | SubType4, 73 | S8 = EleBits8 | TypeInt, 74 | U8 = EleBits8 | TypeInt | SubType1, 75 | S1_CLIP = EleBits1 | TypeInt, 76 | S2_CLIP = EleBits2 | TypeInt, 77 | S3_CLIP = EleBits3 | TypeInt, 78 | S4_CLIP = EleBits4 | TypeInt, 79 | S5_CLIP = EleBits5 | TypeInt, 80 | S6_CLIP = EleBits6 | TypeInt, 81 | S7_CLIP = EleBits7 | TypeInt, 82 | F4_E2M1 = EleBits4 | TypeFloat, 83 | F4_BNB = EleBits4 | TypeFloat | SubType1, 84 | F4_NF4 = EleBits4 | TypeFloat | SubType2, 85 | S32 = EleBits32 | TypeInt, 86 | U32 = EleBits32 | TypeInt | SubType1, 87 | }; 88 | 89 | enum class BTLA_ELTWISEOP { GELU, SWISH, TANH, EXP, LOW_PRECISION_EXP, RELU, LINEAR }; 90 | 91 | enum class BTLA_PROLOGUEB_IDS : uint32_t { 92 | Undef = (uint32_t)-1, 93 | Begin = 0, 94 | NormalBegin = Begin, 95 | WeightPack = NormalBegin, 96 | NormalEnd, 97 | KBlockBegin = NormalEnd, 98 | WeightKBlockNInteger = KBlockBegin, 99 | WeightKBlockNFloat, 100 | KBlockEnd, 101 | End, 102 | }; 103 | -------------------------------------------------------------------------------- /bestla/bestla/sycl/sycl_epilogue.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
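A note on the BTLA_DTYPE values defined in bestla.h above: each enumerator packs the element bit-width (bits 0-7), the base type (bits 8-15), and a sub-type tag (bits 16-23) into one uint32_t, so the masks and shifts in the enum recover each field. A minimal decode sketch (the `field` helper is illustrative, not part of the library):

```cpp
#include <cstdint>
// assumes bestla.h (above) is in scope; dt is shorthand for its enum
using dt = BTLA_DTYPE;
constexpr uint32_t field(dt v, dt mask, dt shift) {
  return (static_cast<uint32_t>(v) & static_cast<uint32_t>(mask)) >> static_cast<uint32_t>(shift);
}
// BF16 = EleBits16 | TypeFloat | SubType1, so:
static_assert(field(dt::BF16, dt::EleBitsMask, dt::EleBitsShift) == 16, "16-bit elements");
static_assert(field(dt::BF16, dt::TypeMask, dt::TypeShift) == 0, "TypeFloat encodes as 0");
static_assert(field(dt::BF16, dt::SubTypeMask, dt::SubTypeShift) == 1, "SubType1 distinguishes bf16 from fp16");
```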
14 | #pragma once 15 | 16 | #ifdef BTLA_SYCL 17 | #include 18 | 19 | #include "sycl_utils.h" 20 | 21 | namespace bestla { 22 | namespace sycl_epilogue { 23 | template <typename DstT> 24 | struct ParamOutputBase { 25 | DstT* C; 26 | int ldc; 27 | }; 28 | template <class GemmCoreT, typename DstT> 29 | class OutputBase { 30 | public: 31 | using CType = typename GemmCoreT::TACC; 32 | using DstType = DstT; 33 | using Param = ParamOutputBase<DstType>; 34 | static inline void store(const Param& _param, CType* tmpAcc, const sycl_utils::nd_item_helper<GemmCoreT>& helper) { 35 | #pragma unroll 36 | for (int im = 0; im < GemmCoreT::TileM; im++) { 37 | #pragma unroll 38 | for (int in = 0; in < GemmCoreT::TileN; in++) { 39 | _param.C[(helper.item_g_m() + im) * _param.ldc + helper.item_g_n() + in] = tmpAcc[im * GemmCoreT::TileN + in]; 40 | } 41 | } 42 | } 43 | 44 | static inline void store_tail(const Param& _param, CType* tmpAcc, const sycl_utils::nd_item_helper<GemmCoreT>& helper, 45 | int m_tail) { 46 | if (m_tail) { 47 | for (int im = 0; im < m_tail; im++) { 48 | #pragma unroll 49 | for (int in = 0; in < GemmCoreT::TileN; in++) { 50 | _param.C[(helper.item_g_m() + im) * _param.ldc + helper.item_g_n() + in] = tmpAcc[im * GemmCoreT::TileN + in]; 51 | } 52 | } 53 | } 54 | } 55 | }; 56 | 57 | } // namespace sycl_epilogue 58 | } // namespace bestla 59 | #endif 60 | -------------------------------------------------------------------------------- /bestla/bestla/sycl/sycl_prologue_a.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | #pragma once 15 | 16 | #ifdef BTLA_SYCL 17 | #include 18 | 19 | #include "bestla/bestla_utils.h" 20 | #include 21 | 22 | namespace bestla { 23 | namespace sycl_prologue_a { 24 | 25 | template <typename SrcT> 26 | struct ParamActivationBase { 27 | const SrcT* A; 28 | int lda; 29 | }; 30 | template <class GemmCoreT, typename SrcT> 31 | class ActivationBase { 32 | public: 33 | using AType = typename GemmCoreT::TA; 34 | using SrcType = SrcT; 35 | using Param = ParamActivationBase<SrcType>; 36 | static inline void getActivation(const Param& _param, AType* aptr, sycl_utils::nd_item_helper<GemmCoreT>& helper) {} 37 | }; 38 | 39 | } // namespace sycl_prologue_a 40 | } // namespace bestla 41 | #endif 42 | -------------------------------------------------------------------------------- /bestla/bestla/ut/bestla.cpp: -------------------------------------------------------------------------------- 1 | #include "../bestla.h" 2 | -------------------------------------------------------------------------------- /bestla/bestla/ut/bestla_ut.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() { 5 | printf("BesTLA UT done\n"); 6 | return 0; 7 | } 8 | -------------------------------------------------------------------------------- /bestla/bestla/ut/bestla_utils.cpp: -------------------------------------------------------------------------------- 1 | 2 | namespace bestla { 3 | namespace ut {} // namespace ut 4 | } // namespace bestla 5 | -------------------------------------------------------------------------------- /bestla/bestla/ut/kernel_ut.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "bestla_ut.h" 3 | #include "kernel_ref.h" 4 | 5 | #ifdef _MSC_VER 6 | #define __PRETTY_FUNCTION__ __FUNCSIG__ 7 | #endif 8 | -------------------------------------------------------------------------------- /bestla/bestla/ut/sycl_ut.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "sycl/sycl_device.h" 4 | 5 | namespace bestla { 6 | namespace sycl_ut { 7 | 8 | class UT_Device { 9 | public: 10 | static bestla::sycl_device::SyclDevice* get() { 11 | static bestla::sycl_device::SyclDevice Instance(true); 12 | return &Instance; 13 | } 14 | }; 15 | }; // namespace sycl_ut 16 | } // namespace bestla 17 | -------------------------------------------------------------------------------- /bestla/cmake/sycl.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.23) 2 | 3 | find_package(IntelSYCL REQUIRED) 4 | -------------------------------------------------------------------------------- /clang-format.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
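# Usage sketch (mirrors the defaults set in __main__ below):
#   python clang-format.py --dirs neural_speed bestla
# Requires clang-format (clang-format.exe on Windows) on PATH; '--style=file'
# picks up the repository's .clang-format configuration.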
14 | 15 | import cmd 16 | import os 17 | import platform 18 | import sys 19 | import glob 20 | import argparse 21 | import fnmatch 22 | import subprocess 23 | 24 | ProjectEXT = ['h', 'hpp', 'c', 'cpp'] 25 | 26 | 27 | def glob_files(dirs): 28 | files = [] 29 | for directory in dirs: 30 | for root, _, filenames in os.walk(directory): 31 | for ext in ProjectEXT: 32 | for filename in fnmatch.filter(filenames, '*.' + ext): 33 | files.append(os.path.join(root, filename)) 34 | return files 35 | 36 | 37 | if sys.platform == "linux": 38 | ClangBin = 'clang-format' 39 | elif sys.platform == 'win32': 40 | ClangBin = 'clang-format.exe' 41 | 42 | 43 | def clang_format_dir(args): 44 | files = glob_files(args.dirs) 45 | for file in files: 46 | cmds = [ClangBin, '-i', '--style=file', file] 47 | subprocess.run(cmds, check=True) 48 | 49 | 50 | def parse_args(argv=None): 51 | if argv is None: 52 | argv = sys.argv 53 | parser = argparse.ArgumentParser(description='Recursively clang-format') 54 | parser.add_argument('--dirs', nargs='+', help='paths to clang-format') 55 | args = parser.parse_args(argv[1:]) 56 | if not args.dirs: 57 | sys.exit(-1) 58 | return args 59 | 60 | 61 | if __name__ == '__main__': 62 | if len(sys.argv) == 1: 63 | args = parse_args(['', '--dirs', 'neural_speed', 'bestla']) 64 | else: 65 | args = parse_args() 66 | clang_format_dir(args) 67 | -------------------------------------------------------------------------------- /docker/DockerFile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2023 Intel Corporation 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | ARG IMAGE_NAME=ubuntu 5 | ARG IMAGE_TAG=22.04 6 | FROM ${IMAGE_NAME}:${IMAGE_TAG} as base 7 | 8 | FROM base as neural-speed 9 | 10 | ARG PYTHON=python3.10 11 | 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | 14 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 15 | libgl1-mesa-glx \ 16 | libglib2.0-0 \ 17 | ${PYTHON} \ 18 | python3-pip 19 | 20 | RUN ln -sf $(which ${PYTHON}) /usr/bin/python 21 | 22 | RUN ${PYTHON} -m pip install -U pip 23 | 24 | FROM neural-speed as devel 25 | 26 | ENV DEBIAN_FRONTEND=noninteractive 27 | 28 | ENV LANG C.UTF-8 29 | ARG PYTHON=python3.10 30 | 31 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 32 | autoconf \ 33 | build-essential \ 34 | ca-certificates \ 35 | cmake \ 36 | git \ 37 | gcc g++ make 38 | 39 | RUN mkdir -p /neural_speed 40 | WORKDIR /neural_speed 41 | COPY . /neural_speed 42 | 43 | RUN pip install cmake ninja psutil && \ 44 | cd /neural_speed && \ 45 | git submodule update --init --recursive && \ 46 | mkdir -p build && cd build && cmake .. -G Ninja && ninja && cd .. && \ 47 | pip install -r requirements.txt 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | Follow these instructions to set up and run our provided Docker image. 3 | 4 | ## Set Up Docker Image 5 | Build or Pull the provided docker images. 6 | 7 | ### Build Docker Image 8 | ```bash 9 | git clone https://github.com/intel/neural-speed.git neuralspeed 10 | cd neuralspeed 11 | docker build -f docker/DockerFile -t neuralspeed:latest . 
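# The base image is parameterized in docker/DockerFile (ARG IMAGE_NAME=ubuntu, ARG IMAGE_TAG=22.04),
# so you can build against another base, e.g. (assuming the requested tag provides the packages the
# DockerFile installs):
docker build --build-arg IMAGE_TAG=20.04 -f docker/DockerFile -t neuralspeed:20.04 .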
12 | ``` 13 | If you need to use a proxy, please use the following command: 14 | ```bash 15 | docker build --build-arg http_proxy=${http_proxy} --build-arg https_proxy=${https_proxy} -f docker/DockerFile -t neuralspeed:latest . 16 | ``` 17 | 18 | ### Pull From Docker Hub 19 | 20 | 21 | ## Use Docker Image 22 | Start a container from the image and attach to it. 23 | ```bash 24 | docker run -itd --name="neural-speed-docker" neuralspeed:latest /bin/bash 25 | docker exec -it neural-speed-docker /bin/bash 26 | ``` 27 | 28 | ## Run Simple Test 29 | ```bash 30 | docker exec -it neural-speed-docker /bin/bash 31 | cd /neural_speed 32 | ## convert to model.bin 33 | python scripts/convert.py --outtype f32 --outfile llama-fp32.bin ${input_model_path} 34 | ## quantize to Q4 with groupsize=128 35 | ./build/bin/quant_llama --model_file llama-fp32.bin --out_file llama-q4_j_i8_g128.bin --weight_dtype int4 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg sym 36 | ## inference 37 | ./build/bin/run_llama --seed 1234 -b 2047 -c 64 -n 32 -m llama-q4_j_i8_g128.bin -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/customized_stop.md: -------------------------------------------------------------------------------- 1 | You can customize the stopping criteria according to your own needs by processing the input_ids to determine if text generation needs to be stopped. 2 | Here is a simple example, which requires a minimum generation length of 80 tokens. Once the `min_length` is met, encountering a terminator `eos_token_id` will end the generation. 3 | 4 | ```python 5 | import torch 6 | from typing import List 7 | from transformers import StoppingCriteria, StoppingCriteriaList 8 | 9 | class StopOnTokens(StoppingCriteria): 10 | def __init__(self, min_length: int, start_length: int, stop_token_id: List[int]): 11 | self.min_length = min_length 12 | self.start_length = start_length 13 | self.stop_token_id = stop_token_id 14 | 15 | def __call__( 16 | self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs 17 | ) -> bool: 18 | if input_ids.shape[-1] - self.start_length > self.min_length: 19 | for stop_id in self.stop_token_id: 20 | if input_ids[0][input_ids.shape[-1] - 1] == stop_id: 21 | return True 22 | return False 23 | 24 | stopping_criteria = StoppingCriteriaList( 25 | [ 26 | StopOnTokens( 27 | min_length=80, 28 | start_length=inputs.shape[1], 29 | stop_token_id=[tokenizer.eos_token_id], 30 | ) 31 | ] 32 | ) 33 | 34 | outputs = model.generate(inputs, streamer=streamer, stopping_criteria=stopping_criteria) 35 | ``` 36 | -------------------------------------------------------------------------------- /docs/gguf.md: -------------------------------------------------------------------------------- 1 | GGUF 2 | ======= 3 | 4 | Neural Speed also supports GGUF models generated by [llama.cpp](https://github.com/ggerganov/llama.cpp); download the model and use llama.cpp to create the GGUF file first. 5 | 6 | Validated models: [llama2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [falcon-7b](https://huggingface.co/tiiuae/falcon-7b), [falcon-40b](https://huggingface.co/tiiuae/falcon-40b), [mpt-7b](https://huggingface.co/mosaicml/mpt-7b), [mpt-40b](https://huggingface.co/mosaicml/mpt-40b) and [bloom-7b1](https://huggingface.co/bigscience/bloomz-7b1).
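As noted above, the GGUF file itself is produced with llama.cpp; a conversion sketch (the converter script name and flags vary across llama.cpp versions, so treat the exact command as an assumption):

```bash
# Hypothetical llama.cpp conversion step; check your llama.cpp checkout for the exact
# script name (convert.py in older releases, convert_hf_to_gguf.py in newer ones).
python convert_hf_to_gguf.py /model_path/Llama-2-7b-chat-hf --outtype f32
```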
7 | 8 | Please check the [list](./supported_models.md) for more validated GGUF models from Hugging Face. 9 | 10 | ## Examples 11 | 12 | How to create the GGUF file in Neural Speed: 13 | ```bash 14 | # Example: 15 | # please provide the local model path as the arg, 16 | # which means you need to `git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf` first. 17 | python neural_speed/convert/convert-hf-to-gguf.py /model_path/Llama-2-7b-chat-hf/ 18 | 19 | ``` 20 | 21 | How to load the GGUF bin file in Neural Speed: 22 | 23 | ```python 24 | prompt = "Once upon a time" 25 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) 26 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 27 | streamer = TextStreamer(tokenizer) 28 | 29 | model = Model() 30 | model.init_from_bin(args.model_name, gguf_path) 31 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 32 | 33 | # Please check this script for more details and input parameters. 34 | # python scripts/python_api_example_for_gguf.py --model_name falcon --model_path /home/model/falcon-7b -m /home/model/falcon-7b/ggml-model-f32.gguf 35 | ``` 36 | 37 | Note: These GGUF models can be accelerated by [Neural Speed BesTLA](https://github.com/intel/neural-speed/blob/c0312283f528d4a9ffebc283cd0f15a7a8eabf1a/bestla/README.md#L1). 38 | 39 | How to accelerate GGUF models with BesTLA: 40 | ```bash 41 | # quantize first, then re-run the python_api_example_for_gguf.py step above 42 | ./build/bin/quant_falcon --model_file /home/model/falcon-7b/ggml-model-f32.gguf --out_file ne-falcon-q4_j.bin --weight_dtype int4 --compute_dtype int8 43 | 44 | python scripts/python_api_example_for_gguf.py --model_name falcon --model_path /home/model/falcon-7b -m ne-falcon-q4_j.bin 45 | ``` 46 | 47 | How to load the GGUF bin file in [intel-extension-for-transformers](https://github.com/intel/intel-extension-for-transformers/pull/1151): 48 | ```python 49 | from transformers import AutoTokenizer, TextStreamer 50 | from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig 51 | 52 | # Specify the GGUF repo on Hugging Face 53 | model_name = "TheBloke/Llama-2-7B-Chat-GGUF" 54 | # Download the specific GGUF model file from the above repo 55 | gguf_file = "llama-2-7b-chat.Q4_0.gguf" 56 | # Make sure you have been granted access to this model on Hugging Face. 57 | tokenizer_name = "meta-llama/Llama-2-7b-chat-hf" 58 | 59 | prompt = "Once upon a time" 60 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) 61 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 62 | streamer = TextStreamer(tokenizer) 63 | model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file) 64 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300) 65 | ``` 66 | -------------------------------------------------------------------------------- /docs/gptq_and_awq.md: -------------------------------------------------------------------------------- 1 | GPTQ & AWQ 2 | ======= 3 | 4 | Neural Speed supports multiple weight-only quantization algorithms, such as GPTQ and AWQ. 5 | 6 | For more algorithm details, please check [GPTQ](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978).
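Such pre-quantized checkpoints can also be converted ahead of time through `neural_speed.convert.convert_model` with `use_quantized_model=True`, which dispatches to the matching `convert_quantized_*.py` script (a sketch; the local checkpoint path and output file name are assumptions):

```python
from neural_speed.convert import convert_model

# Assumed local path to a GPTQ checkpoint downloaded from the list below.
convert_model("./Llama-2-7B-Chat-GPTQ", "llama-2-7b-chat-gptq.bin", outtype="f32", use_quantized_model=True)
```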
7 | 8 | Validated GPTQ & AWQ models directly from Hugging Face: 9 | * [Llama-2-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ) & [Llama-2-13B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-Chat-GPTQ) & [Llama-2-7B-AWQ](https://huggingface.co/TheBloke/Llama-2-7B-AWQ) & [Llama-2-13B-chat-AWQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ) 10 | * [CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) & [CodeLlama-13B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GPTQ) & [CodeLlama-7B-AWQ](https://huggingface.co/TheBloke/CodeLlama-7B-AWQ) & [CodeLlama-13B-AWQ](https://huggingface.co/TheBloke/CodeLlama-13B-AWQ) 11 | * [Mistral-7B-Instruct-v0.1-GPTQ](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GPTQ) & [Mistral-7B-Instruct-v0.1-AWQ](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-AWQ) 12 | * [Mixtral-8x7B-Instruct-v0.1-GPTQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ) & [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ) 13 | * [Qwen-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-GPTQ) & [Qwen-7B-Chat-AWQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-AWQ) & [Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4) 14 | * [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ) 15 | * [Baichuan2-13B-Chat-GPTQ](https://hf-mirror.com/TheBloke/Baichuan2-13B-Chat-GPTQ) 16 | * [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b/tree/main) 17 | * [onlinex/phi-1_5-gptq-4bit](https://hf-mirror.com/onlinex/phi-1_5-gptq-4bit) 18 | 19 | For more details, please check the list of [supported_models](./supported_models.md). 20 | 21 | ## Examples 22 | 23 | How to run GPTQ or AWQ models in Neural Speed: 24 | ```python 25 | import sys 26 | from transformers import AutoTokenizer, TextStreamer 27 | from neural_speed import Model 28 | 29 | if len(sys.argv) != 2: 30 | sys.exit("Usage: python python_api_example.py model_path") 31 | model_name = sys.argv[1] 32 | 33 | prompt = "Once upon a time, a little girl" 34 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 35 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 36 | streamer = TextStreamer(tokenizer) 37 | 38 | model = Model() 39 | # Inference with GPTQ models. 40 | model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_gptq=True) 41 | # Inference with AWQ models. 42 | # model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_awq=True) 43 | 44 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 45 | ``` 46 | 47 | Note: we have provided the [script](../scripts/python_api_example.py) to run these models.
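A minimal invocation of that script (assuming a locally downloaded GPTQ checkpoint directory):

```bash
python scripts/python_api_example.py ./Llama-2-7B-Chat-GPTQ
```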
48 | -------------------------------------------------------------------------------- /docs/imgs/Attention.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/docs/imgs/Attention.PNG -------------------------------------------------------------------------------- /docs/imgs/FFN.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/docs/imgs/FFN.PNG -------------------------------------------------------------------------------- /docs/imgs/ORCA_batching.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/docs/imgs/ORCA_batching.png -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | ## Install 2 | 3 | ### Build Python package 4 | ```shell 5 | pip install -r requirements.txt 6 | pip install . 7 | ``` 8 | 9 | ### Build executable only 10 | 11 | ```shell 12 | # Linux and WSL 13 | git submodule update --init --recursive 14 | mkdir build 15 | cd build 16 | cmake .. -G Ninja 17 | ninja 18 | ``` 19 | 20 | ```powershell 21 | # Windows 22 | # Install VisualStudio 2022 and open 'Developer PowerShell for VS 2022' 23 | mkdir build 24 | cd build 25 | cmake .. 26 | cmake --build . -j --config Release 27 | ``` 28 | -------------------------------------------------------------------------------- /neural_speed/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | include(cmake/ISA.cmake) 16 | include(cmake/Common.cmake) 17 | include(cmake/ClangTidy.cmake) 18 | 19 | set(COMMON_HEADER_DIRS ./) 20 | if(NS_GPU) 21 | list(APPEND COMMON_HEADER_DIRS ${GPU_ROOT}/include) 22 | list(APPEND COMMON_LIB_DIRS ${GPU_ROOT}) 23 | endif() 24 | 25 | include_directories(${COMMON_HEADER_DIRS}) 26 | link_directories(${COMMON_LIB_DIRS}) 27 | 28 | add_subdirectory(core) 29 | add_subdirectory(vectors) 30 | add_subdirectory(models) 31 | 32 | if (NS_BUILD_APPLICATIONS) 33 | add_subdirectory(application) 34 | endif() 35 | -------------------------------------------------------------------------------- /neural_speed/application/quant_model.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // Defines sigaction on msys: 15 | #ifndef _GNU_SOURCE 16 | #define _GNU_SOURCE 17 | #endif 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "common.h" 29 | #include "models/model_utils/quant_utils.h" 30 | #include "models/model_utils/model_utils.h" 31 | 32 | std::shared_ptr<quant_layer_base> get_model_quant_layer(const std::string& model_name) { 33 | return ql_registry::create_ql(model_name); 34 | } 35 | 36 | int main(int argc, char** argv) { 37 | model_init_backend(); 38 | quant_params q_params; 39 | #ifdef MODEL_NAME 40 | q_params.model_name = MODEL_NAME; 41 | #endif 42 | 43 | if (quant_params_parse(argc, argv, q_params) == false) { 44 | return 1; 45 | } 46 | model_archs mt = model_name_to_arch::init().find(q_params.model_name); 47 | if (mt == MODEL_UNKNOWN) { 48 | fprintf(stderr, "error: please set model_name\n"); 49 | exit(1); 50 | } 51 | q_params.model_arch = mt; 52 | 53 | const std::string fname_inp = q_params.model_file; 54 | const std::string fname_out = q_params.out_file; 55 | ne_ftype ftype = quant_params_to_ftype(q_params); 56 | printf("%s: quant_params_to_ftype: %d\n", __func__, ftype); 57 | const int nthread = q_params.nthread; 58 | 59 | const int64_t t_main_start_us = ne_time_us(); 60 | 61 | int64_t t_quantize_us = 0; 62 | auto quant_layer = get_model_quant_layer(q_params.model_name); 63 | // load the model 64 | { 65 | const int64_t t_start_us = ne_time_us(); 66 | 67 | if (model_quantize(q_params, quant_layer)) { 68 | fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); 69 | return 1; 70 | } 71 | 72 | t_quantize_us = ne_time_us() - t_start_us; 73 | } 74 | // report timing 75 | { 76 | const int64_t t_main_end_us = ne_time_us(); 77 | 78 | printf("\n"); 79 | printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0); 80 | printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0); 81 | } 82 | 83 | return 0; 84 | } 85 | -------------------------------------------------------------------------------- /neural_speed/application/quant_whisper.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
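// Illustrative invocation (a sketch; the file names are assumptions and the flags follow
// quant_params_parse, as with the other quant tools in this directory):
//   ./build/bin/quant_whisper --model_file whisper-f32.bin --out_file whisper-q4_0.bin
// Note that this tool only accepts q4_0 output (see the ftype check in main below).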
14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include //NOLINT 23 | #include "models/model_utils/quant_utils.h" 24 | #include "common.h" 25 | 26 | #define F_OK 0 27 | 28 | inline bool exists_model(const std::string& name) { return (access(name.c_str(), F_OK) != -1); } 29 | int main(int argc, char** argv) { 30 | quant_params q_params; 31 | if (quant_params_parse(argc, argv, q_params) == false) { 32 | return 1; 33 | } 34 | 35 | // needed to initialize f16 tables 36 | { 37 | struct ne_init_params params = {0, nullptr, false}; 38 | struct ne_context* ctx = ne_init(params); 39 | ne_free(ctx); 40 | } 41 | const std::string fname_inp = q_params.model_file; 42 | const std::string fname_out = q_params.out_file; 43 | // printf("input_model_file:%s \n",fname_inp.c_str()); 44 | 45 | const ne_ftype ftype = quant_params_to_ftype(q_params); 46 | if (ftype != NE_FTYPE_MOSTLY_Q4_0) { 47 | fprintf(stderr, "%s: ITREX currently only supports quantizing the model to q4_0\n", __func__); 48 | return 1; 49 | } 50 | 51 | const int64_t t_main_start_us = ne_time_us(); 52 | 53 | int64_t t_quantize_us = 0; 54 | 55 | // load the model 56 | { 57 | const int64_t t_start_us = ne_time_us(); 58 | if (exists_model(fname_inp)) { 59 | if (!whisper_model_quantize(fname_inp, fname_out, ne_ftype(ftype))) { 60 | fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); 61 | return 1; 62 | } 63 | } else { 64 | fprintf(stderr, "%s: model does not exist: '%s'\n", __func__, fname_inp.c_str()); 65 | return 1; 66 | } 67 | 68 | t_quantize_us = ne_time_us() - t_start_us; 69 | } 70 | 71 | // report timing 72 | { 73 | const int64_t t_main_end_us = ne_time_us(); 74 | 75 | printf("\n"); 76 | printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f); 77 | printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); 78 | } 79 | 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /neural_speed/cmake/ClangTidy.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
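# Usage sketch: configure with -DNS_USE_CLANG_TIDY=CHECK to run clang-tidy during
# compilation, or -DNS_USE_CLANG_TIDY=FIX to also apply the suggested fixes, e.g.:
#   cmake .. -DNS_USE_CLANG_TIDY=CHECK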
14 | 15 | if (NS_USE_CLANG_TIDY MATCHES "(CHECK|FIX)" AND ${CMAKE_VERSION} VERSION_LESS "3.6.0") 16 | message(FATAL_ERROR "Using clang-tidy requires CMake 3.6.0 or newer") 17 | elseif(NS_USE_CLANG_TIDY MATCHES "(CHECK|FIX)") 18 | find_program(CLANG_TIDY NAMES clang-tidy) 19 | if(NOT CLANG_TIDY) 20 | message(FATAL_ERROR "Clang-tidy not found") 21 | else() 22 | add_compile_definitions(CLANGTIDY) 23 | if(NS_USE_CLANG_TIDY STREQUAL "CHECK") 24 | set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY}) 25 | message(STATUS "Using clang-tidy to run checks") 26 | elseif(NS_USE_CLANG_TIDY STREQUAL "FIX") 27 | set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY} -fix) 28 | message(STATUS "Using clang-tidy to run checks and fix found issues") 29 | endif() 30 | endif() 31 | endif() 32 | -------------------------------------------------------------------------------- /neural_speed/cmake/ISA.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | if (MSVC) 16 | if(NS_F16C) 17 | add_compile_definitions(__F16C__) 18 | endif() 19 | if (NS_AVX512) 20 | add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>) 21 | add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>) 22 | # MSVC has no compile-time flags enabling specific 23 | # AVX512 extensions, nor does it define the 24 | # macros corresponding to the extensions. 25 | # Do it manually. 26 | if (NS_AVX512_VBMI) 27 | add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>) 28 | add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>) 29 | endif() 30 | if (NS_AVX512_VNNI) 31 | add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>) 32 | add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) 33 | endif() 34 | elseif (NS_AVX2) 35 | add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>) 36 | add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>) 37 | elseif (NS_AVX) 38 | add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>) 39 | add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>) 40 | endif() 41 | else() 42 | if (NS_F16C) 43 | add_compile_options(-mf16c) 44 | endif() 45 | if (NS_FMA) 46 | add_compile_options(-mfma) 47 | endif() 48 | if (NS_AVX) 49 | add_compile_options(-mavx) 50 | endif() 51 | if (NS_AVX2) 52 | add_compile_options(-mavx2) 53 | endif() 54 | if (NS_AVX512) 55 | add_compile_options(-mavx512f) 56 | add_compile_options(-mavx512bw) 57 | endif() 58 | if (NS_AVX512_VBMI) 59 | add_compile_options(-mavx512vbmi) 60 | endif() 61 | if (NS_AVX512_VNNI) 62 | add_compile_options(-mavx512vnni) 63 | endif() 64 | if (NS_AMX) 65 | add_compile_options(-mamx-tile -mamx-int8 -mamx-bf16) 66 | endif() 67 | endif() 68 | -------------------------------------------------------------------------------- /neural_speed/convert/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from pathlib import Path 19 | import subprocess 20 | 21 | model_maps = { 22 | "gpt_neox": "gptneox", 23 | "gpt_bigcode": "starcoder", 24 | "whisper": "whisper", 25 | "qwen2": "qwen", 26 | "RefinedWebModel": "falcon", 27 | "RefinedWeb": "falcon", 28 | "phi-msft": "phi" 29 | } 30 | 31 | 32 | def convert_model(model, outfile, outtype="f32", format="NE", model_hub="huggingface", use_quantized_model=False): 33 | if model_hub == "modelscope": 34 | from modelscope import AutoConfig 35 | else: 36 | from transformers import AutoConfig 37 | config = AutoConfig.from_pretrained(model, trust_remote_code=True) 38 | model_type = model_maps.get(config.model_type, config.model_type) 39 | 40 | cmd = [] 41 | if use_quantized_model: 42 | path = Path(Path(__file__).parent.absolute(), "convert_quantized_{}.py".format(model_type)) 43 | else: 44 | path = Path(Path(__file__).parent.absolute(), "convert_{}.py".format(model_type)) 45 | 46 | cmd.extend(["python", path]) 47 | cmd.extend(["--outfile", outfile]) 48 | cmd.extend(["--outtype", outtype]) 49 | if model_type in {"phi", "stablelm"}: 50 | cmd.extend(["--format", format]) 51 | cmd.extend(["--model_hub", model_hub]) 52 | cmd.extend([model]) 53 | 54 | print("cmd:", cmd) 55 | subprocess.run(cmd, check=True) 56 | -------------------------------------------------------------------------------- /neural_speed/core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | find_package(Threads REQUIRED) 16 | file(GLOB layers_srcs "layers/*.cpp") 17 | file(GLOB test_srcs "layers/*test*.cpp") 18 | list(REMOVE_ITEM layers_srcs ${test_srcs}) 19 | set(sources ne_layers.c ${layers_srcs}) 20 | 21 | add_shareable_library_w_warning(ne_layers "${sources}") 22 | 23 | target_include_directories(ne_layers PUBLIC .) 
24 | target_compile_features(ne_layers PUBLIC c_std_11) # don't bump 25 | set_target_properties(ne_layers PROPERTIES POSITION_INDEPENDENT_CODE ON) 26 | if (NS_TP) 27 | find_package(oneCCL REQUIRED) 28 | find_package(MPI REQUIRED) 29 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 30 | add_library(parallel_context STATIC parallel_context.cpp) 31 | target_link_libraries(ne_layers PUBLIC Threads::Threads bestla ne_vec MPI::MPI_CXX ccl parallel_context) 32 | else () 33 | target_link_libraries(ne_layers PUBLIC Threads::Threads bestla ne_vec) 34 | endif() 35 | 36 | if(NOT WIN32) 37 | target_link_libraries(ne_layers PUBLIC rt) 38 | else() 39 | target_link_options(ne_layers PUBLIC /STACK:5242880 /F5242880) 40 | endif() 41 | 42 | 43 | if (NS_BUILD_TESTS) 44 | 45 | function(add_test_target src) # ARGN: additional source 46 | get_filename_component(test_target ${src} NAME_WE) 47 | get_filename_component(src_dir ${src} DIRECTORY) 48 | string(REGEX REPLACE [/\\] "_" src_dir ${src_dir}) 49 | if(src_dir) 50 | set (test_target "${src_dir}_${test_target}") 51 | endif() 52 | set (test_target "test_${test_target}") 53 | add_executable_w_warning(${test_target} ${src} ${ARGN}) 54 | target_compile_definitions(${test_target} PRIVATE NS_TESTS) 55 | target_compile_options(${test_target} PRIVATE -fsanitize=address) 56 | target_link_options(${test_target} PRIVATE -fsanitize=address) 57 | target_include_directories(${test_target} PUBLIC .) 58 | target_link_libraries(${test_target} PUBLIC Threads::Threads bestla ne_vec) 59 | if(NOT WIN32) 60 | target_link_libraries(${test_target} PUBLIC rt) 61 | endif() 62 | add_test(NAME ${test_target} COMMAND ${test_target}) 63 | set_tests_properties(${test_target} PROPERTIES LABELS "${src_dir}_test") 64 | endfunction() 65 | 66 | add_test_target(layers/mha_dense.cpp layers/mha_dense_tests.cpp) 67 | 68 | endif() 69 | -------------------------------------------------------------------------------- /neural_speed/core/layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /neural_speed/core/layers/Ops.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #pragma once 15 | #ifdef __cplusplus 16 | extern "C" { 17 | #endif 18 | 19 | // available tensor operations: 20 | enum ne_op { 21 | NE_OP_NONE = 0, 22 | 23 | NE_OP_DUP, 24 | NE_OP_ADD, 25 | NE_OP_ADD1, 26 | NE_OP_ACC, 27 | NE_OP_SUB, 28 | NE_OP_MUL, 29 | NE_OP_DIV, 30 | NE_OP_SQR, 31 | NE_OP_SQRT, 32 | NE_OP_LOG, 33 | NE_OP_SUM, 34 | NE_OP_SUM_ROWS, 35 | NE_OP_TANH, 36 | NE_OP_MEAN, 37 | NE_OP_REPEAT, 38 | NE_OP_ABS, 39 | NE_OP_SGN, 40 | NE_OP_NEG, 41 | NE_OP_STEP, 42 | NE_OP_RELU, 43 | NE_OP_GELU, 44 | NE_OP_SILU, 45 | NE_OP_SILU_BACK, 46 | NE_OP_NORM, // normalize 47 | NE_OP_RMS_NORM, 48 | NE_OP_RMS_NORM_BACK, 49 | NE_OP_RMS_ARGSORT, 50 | 51 | NE_OP_MUL_MAT, 52 | NE_OP_MUL_MAT_BIAS, 53 | NE_OP_MUL_MAT_ID, 54 | NE_OP_SCALE, 55 | NE_OP_SET, 56 | NE_OP_CPY, 57 | NE_OP_CONT, 58 | NE_OP_RESHAPE, 59 | NE_OP_VIEW, 60 | NE_OP_PERMUTE, 61 | NE_OP_TRANSPOSE, 62 | NE_OP_GET_ROWS, 63 | NE_OP_GET_ROWS_BACK, 64 | NE_OP_DIAG, 65 | NE_OP_DIAG_MASK_INF, 66 | NE_OP_DIAG_MASK_ZERO, 67 | NE_OP_PADDING_MASK_INF, 68 | NE_OP_SOFT_MAX, 69 | NE_OP_ROPE, 70 | NE_OP_ROPE_BACK, 71 | NE_OP_ALIBI, 72 | NE_OP_CLAMP, 73 | NE_OP_CONV_1D_1S, 74 | NE_OP_CONV_1D_2S, 75 | 76 | // LLM related 77 | NE_OP_MUL_QKV, 78 | NE_OP_MUL_FFN_SILU, 79 | NE_OP_MUL_FFN_GELU, 80 | NE_OP_MUL_FFN_GELU_MUL, 81 | NE_OP_MUL_FFN_ADD_GELU, 82 | NE_OP_MUL_ID_FFN_SILU, 83 | NE_OP_MUL_ID_FFN_GELU, 84 | NE_OP_FLASH_ATTN, 85 | NE_OP_FLASH_ATTN_KV_UPDATE, 86 | NE_OP_FLASH_FF, 87 | 88 | NE_OP_MAP_UNARY, 89 | NE_OP_MAP_BINARY, 90 | 91 | NE_OP_SPLIT, 92 | NE_OP_ALL_REDUCE, 93 | NE_OP_TP_CONCAT, 94 | NE_OP_DUMP_TENSOR, 95 | NE_OP_DEBUG, 96 | NE_OP_CONV_1D, 97 | NE_OP_ARGSORT, 98 | NE_OP_COUNT, 99 | }; 100 | 101 | #ifdef __cplusplus 102 | } 103 | #endif 104 | -------------------------------------------------------------------------------- /neural_speed/core/layers/argsort.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
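// Behavior sketch (added note): for each row, dst receives a descending argsort of src0,
// since the comparator below orders indices by src_data[pos1] > src_data[pos2].
// Example: src row {0.1f, 0.9f, 0.5f} -> dst row {1, 2, 0}.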
14 | 15 | #include "argsort.h" 16 | #include 17 | #include 18 | 19 | static void ne_compute_forward_argsort_f32(const struct ne_compute_params* params, const struct ne_tensor* src0, 20 | struct ne_tensor* dst) { 21 | if (params->type == NE_TASK_INIT || params->type == NE_TASK_FINALIZE) { 22 | return; 23 | } 24 | const int64_t ne00 = src0->ne[0]; 25 | const int64_t ne01 = src0->ne[1]; 26 | const int64_t ne02 = src0->ne[2]; 27 | const int64_t ne03 = src0->ne[3]; 28 | 29 | const int64_t ne0 = dst->ne[0]; 30 | const int64_t ne1 = dst->ne[1]; 31 | const int64_t ne2 = dst->ne[2]; 32 | const int64_t ne3 = dst->ne[3]; 33 | 34 | const size_t nb00 = src0->nb[0]; 35 | 36 | const size_t nb01 = src0->nb[1]; 37 | const size_t nb02 = src0->nb[2]; 38 | const size_t nb03 = src0->nb[3]; 39 | 40 | const size_t nb0 = dst->nb[0]; 41 | const size_t nb1 = dst->nb[1]; 42 | const size_t nb2 = dst->nb[2]; 43 | const size_t nb3 = dst->nb[3]; 44 | const int ith = params->ith; 45 | const int nth = params->nth; 46 | 47 | const int64_t nr = src0->ne[1] * src0->ne[2] * src0->ne[3]; 48 | 49 | for (int64_t i = ith; i < nr; i += nth) { 50 | int32_t* dst_data = (int32_t*)((char*)dst->data + i * nb1); 51 | const float* src_data = (float*)((char*)src0->data + i * nb01); 52 | 53 | for (int64_t j = 0; j < ne0; j++) { 54 | dst_data[j] = j; 55 | } 56 | std::sort(dst_data, dst_data + ne0, [src_data](int pos1, int pos2) { return (src_data[pos1] > src_data[pos2]); }); 57 | } 58 | } 59 | void ne_compute_forward_argsort(const struct ne_compute_params* params, const struct ne_tensor* src0, 60 | struct ne_tensor* dst) { 61 | switch (src0->type) { 62 | case NE_TYPE_F32: { 63 | ne_compute_forward_argsort_f32(params, src0, dst); 64 | } break; 65 | default: { 66 | NE_ASSERT(false); 67 | } break; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /neural_speed/core/layers/argsort.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | #include "core/ne.h" 17 | #include "core/data_types.h" 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | void ne_compute_forward_argsort(const struct ne_compute_params* params, const struct ne_tensor* src0, 24 | struct ne_tensor* dst); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | -------------------------------------------------------------------------------- /neural_speed/core/layers/bestla_gemm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /*++ 16 | Module Name: 17 | 18 | bestla_gemm.h 19 | 20 | Abstract: 21 | 22 | C APIs of BesTLA GEMMs. 23 | --*/ 24 | 25 | #pragma once 26 | 27 | #include "data_types.h" 28 | #include "bestla/bestla.h" 29 | 30 | struct BTLA_GEMM_DATA_PACKED_PARAMS { 31 | const float* A = nullptr; /**< address of A (float32 matrix)*/ 32 | const void* B = nullptr; /**< address of B (packed nbits blob)*/ 33 | float* C = nullptr; /**< address of result matrix */ 34 | int lda = 0; /**< leading dimension of A */ 35 | int ldc = 0; /**< leading dimension of C*/ 36 | }; 37 | 38 | size_t BTLAGemmPackBSize(size_t N, size_t K, size_t BlkSize, BTLA_DTYPE QuantType, BTLA_DTYPE ScaleDtype, bool isAsym, 39 | ne_comp_type CompType, int* shuffle_indice); 40 | 41 | bool BTLAGemmQuantPackB(void* PackedBuf, const float* FpData, size_t N, size_t K, size_t ldb, size_t BlkSize, 42 | BTLA_DTYPE QuantType, BTLA_DTYPE ScaleDtype, bool isAsym, ne_comp_type CompType, bool isTrans, 43 | void* ThreadPool); 44 | 45 | // QData: K*N quantized int8 weight 46 | // Scales: K/BlkSize * N scales 47 | // Zp: K/BlkSize * N zero points 48 | bool BTLAGemmPackB(void* PackedBuf, const int8_t* QData, const float* Scales, const int8_t* Zp, size_t N, size_t K, 49 | size_t ldb, size_t BlkSize, BTLA_DTYPE QuantType, BTLA_DTYPE ScaleDtype, bool isAsym, 50 | ne_comp_type CompType, int* shuffle_indice, void* ThreadPool); 51 | 52 | bool BTLAGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* ThreadPool); 53 | 54 | bool BTLAGemmBatchDriver(const size_t M, const size_t N, const size_t K, const size_t BatchN, 55 | const BTLA_GEMM_DATA_PACKED_PARAMS* DataParams, int8_t* WorkSpace, void* ThreadPool); 56 | 57 | bool BTLALayerNorm(size_t norm_count, size_t norm_size, bool isrms, float epsilon, const float* FpIn, float* FpOut, 58 | void* ThreadPool); 59 | -------------------------------------------------------------------------------- /neural_speed/core/layers/conv.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
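// Call-sequence sketch for the BesTLA C API declared in bestla_gemm.h above (illustrative
// only; quantization parameters, buffer allocation, workspace and threadpool setup are elided,
// and comp_type stands for whichever ne_comp_type value you target):
//   size_t packed_size = BTLAGemmPackBSize(N, K, BlkSize, BTLA_DTYPE::S4_CLIP,
//                                          BTLA_DTYPE::F32, /*isAsym=*/false, comp_type, nullptr);
//   BTLAGemmQuantPackB(packed_b, fp32_b, N, K, /*ldb=*/N, BlkSize, BTLA_DTYPE::S4_CLIP,
//                      BTLA_DTYPE::F32, /*isAsym=*/false, comp_type, /*isTrans=*/false, threadpool);
//   BTLA_GEMM_DATA_PACKED_PARAMS params{A, packed_b, C, /*lda=*/K, /*ldc=*/N};
//   BTLAGemmBatchDriver(M, N, K, /*BatchN=*/1, &params, workspace, threadpool);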
14 | #pragma once 15 | 16 | #include "core/ne.h" 17 | #include "core/data_types.h" 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | void ne_compute_forward_conv_1d_s1_ph(const struct ne_compute_params* params, const struct ne_tensor* src0, 24 | const struct ne_tensor* src1, struct ne_tensor* dst); 25 | void ne_compute_forward_conv_1d_2s(const struct ne_compute_params* params, const struct ne_tensor* src0, 26 | const struct ne_tensor* src1, struct ne_tensor* dst); 27 | void ne_compute_forward_conv_1d(const struct ne_compute_params* params, const struct ne_tensor* src0, 28 | const struct ne_tensor* src1, struct ne_tensor* dst); 29 | void ne_compute_forward_conv_1d_1s(const struct ne_compute_params* params, const struct ne_tensor* src0, 30 | const struct ne_tensor* src1, struct ne_tensor* dst); 31 | void ne_compute_forward_conv_1d_2s(const struct ne_compute_params* params, const struct ne_tensor* src0, 32 | const struct ne_tensor* src1, struct ne_tensor* dst); 33 | #ifdef __cplusplus 34 | } 35 | #endif 36 | -------------------------------------------------------------------------------- /neural_speed/core/layers/ele_reduce.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #pragma once 15 | 16 | #include "core/data_types.h" 17 | #include "vectors/cpu/simd.h" 18 | #include "vec_dot.h" 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | inline static void ne_vec_norm_f32(const int n, float* s, const float* x) { 25 | ne_vec_dot_f32(n, s, x, x); 26 | *s = sqrtf(*s); 27 | } 28 | 29 | inline static void ne_vec_sum_f32(const int n, float* s, const float* x) { 30 | ne_float sum = 0.0; 31 | for (int i = 0; i < n; ++i) { 32 | sum += (ne_float)x[i]; 33 | } 34 | *s = sum; 35 | } 36 | 37 | inline static void ne_vec_sum_ggf(const int n, ne_float* s, const float* x) { 38 | ne_float sum = 0.0; 39 | for (int i = 0; i < n; ++i) { 40 | sum += (ne_float)x[i]; 41 | } 42 | *s = sum; 43 | } 44 | 45 | inline static void ne_vec_max_f32(const int n, float* s, const float* x) { 46 | float max = -INFINITY; 47 | for (int i = 0; i < n; ++i) { 48 | max = x[i] > max ? x[i] : max; 49 | } 50 | *s = max; 51 | } 52 | 53 | inline static void ne_vec_norm_inv_f32(const int n, float* s, const float* x) { 54 | ne_vec_norm_f32(n, s, x); 55 | *s = 1.f / (*s); 56 | } 57 | 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | -------------------------------------------------------------------------------- /neural_speed/core/layers/layers.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include "ele_wise.h" 15 | #include "ele_reduce.h" 16 | 17 | #include "conv.h" 18 | #include "memory.h" 19 | #include "argsort.h" 20 | -------------------------------------------------------------------------------- /neural_speed/core/layers/memory.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "memory.h" 16 | 17 | void ne_attention_padding_mask_f32_forward(const int bs, const int nr_qk, const int qlen, const int ith, const int nth, 18 | const void* padding, const float p_value, struct ne_tensor* dst) { 19 | // mask padding token (padding left) 20 | for (int b = 0; b < bs; b++) { 21 | const int n_padding = (reinterpret_cast<const int*>(padding))[b]; 22 | if (n_padding == 0) continue; 23 | for (int k = 0; k < (nr_qk / bs); k++) { 24 | for (int j = ith; j < qlen; j += nth) { 25 | // it will not affect the next token if we don't mask the pad_token row 26 | ne_vec_set_f32(n_padding, 27 | reinterpret_cast<float*>(reinterpret_cast<char*>(dst->data) + b * dst->nb[3] + k * dst->nb[2] + 28 | j * dst->nb[1]), 29 | p_value); 30 | } 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /neural_speed/core/layers/memory.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
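// Behavior sketch for ne_attention_padding_mask_f32_forward (defined in memory.cpp above):
// with left padding, batch b has the first padding[b] positions of every row set to p_value.
// Example: bs = 2, padding = {2, 0}, p_value = -INFINITY masks columns 0..1 of all rows in
// batch 0 and leaves batch 1 untouched.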
14 | 15 | #pragma once 16 | 17 | #include "ele_wise.h" 18 | #include "core/ne.h" 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | void ne_attention_padding_mask_f32_forward(const int bs, const int nr_qk, const int qlen, const int ith, const int nth, 25 | const void* padding, const float p_value, struct ne_tensor* dst); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | -------------------------------------------------------------------------------- /neural_speed/core/layers/ne_test_layers_utils.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef NE_CORE_GRAPH_NE_TEST_LAYERS_UTILS_H 15 | #define NE_CORE_GRAPH_NE_TEST_LAYERS_UTILS_H 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "bestla/bestla_utils.h" 25 | 26 | #ifndef NS_TESTS 27 | static_assert(false, "Only include this header file for testing!"); 28 | #endif 29 | 30 | template <typename T> 31 | inline void init_vector(T* v, size_t size, float v_min = -10, float v_max = 10, int seed = 5489u) { 32 | float low_value = std::max(v_min, static_cast<float>(std::numeric_limits<T>::lowest()) + 1); 33 | std::mt19937 gen(seed); 34 | std::uniform_real_distribution<float> u(low_value, v_max); 35 | for (size_t i = 0; i < size; ++i) v[i] = u(gen); 36 | } 37 | 38 | template <> 39 | inline void init_vector(bestla::utils::bf16* v, size_t size, float v_min, float v_max, int seed) { 40 | std::mt19937 gen(seed); 41 | std::uniform_real_distribution<float> u(v_min, v_max); 42 | for (size_t i = 0; i < size; ++i) v[i] = bestla::utils::bf16(u(gen)); 43 | } 44 | 45 | template <> 46 | inline void init_vector(bestla::utils::fp16* v, size_t size, float v_min, float v_max, int seed) { 47 | std::mt19937 gen(seed); 48 | std::uniform_real_distribution<float> u(v_min, v_max); 49 | for (size_t i = 0; i < size; ++i) v[i] = bestla::utils::fp16(u(gen)); 50 | } 51 | 52 | template <typename T> 53 | inline void init_vector(std::vector<T>* v, float v_min = -10, float v_max = 10, int seed = 5489u) { 54 | init_vector(v->data(), v->size(), v_min, v_max, seed); 55 | } 56 | 57 | template <typename T> 58 | struct s_is_u8s8 { 59 | enum { value = false }; 60 | }; 61 | 62 | template <> 63 | struct s_is_u8s8<int8_t> { 64 | enum { value = true }; 65 | }; 66 | 67 | template <> 68 | struct s_is_u8s8<uint8_t> { 69 | enum { value = true }; 70 | }; 71 | 72 | template <typename T> 73 | inline typename std::enable_if<!s_is_u8s8<T>::value, float>::type get_err(const T& a, const T& b) { 74 | // we compare float relative error ratio here 75 | return fabs(static_cast<float>(a) - static_cast<float>(b)) / 76 | std::max(static_cast<float>(fabs(static_cast<float>(b))), 1.0f); 77 | } 78 | template <typename T> 79 | inline typename std::enable_if<s_is_u8s8<T>::value, float>::type get_err(const T& a, const T& b) { 80 | // for quantized value, error ratio was calculated with its data range 81 | return fabs(static_cast<float>(a) - static_cast<float>(b)) / UINT8_MAX; 82 | } 83 | 84 | template <typename T> 85 | bool compare_data(const T*
buf1, const T* buf2, size_t size, float eps = 1e-6) { 86 | if (buf1 == buf2) return false; 87 | 88 | for (size_t i = 0; i < size; ++i) { 89 | if (get_err(buf1[i], buf2[i]) > eps) { 90 | std::cerr << static_cast<float>(buf1[i]) << " vs " << static_cast<float>(buf2[i]) << " idx=" << i << std::endl; 91 | return false; 92 | } 93 | } 94 | return true; 95 | } 96 | #endif // NE_CORE_GRAPH_NE_TEST_LAYERS_UTILS_H 97 | -------------------------------------------------------------------------------- /neural_speed/core/parallel_context.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #pragma once 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | // Opaque handle for a C++ class used from C code 21 | typedef struct parallel_context parallel_context; 22 | 23 | enum parallel_mode { 24 | TENSOR_NO_CHANGE, 25 | TENSOR_1D_ROW, 26 | TENSOR_1D_COL, 27 | TENSOR_2D_ROW, 28 | TENSOR_2D_COL, 29 | 30 | TENSOR_3D_INPUT, 31 | TENSOR_3D_WEIGHT, 32 | TENSOR_3D_OUTPUT, 33 | TENSOR_3D_INPUT_X_WEIGHT, 34 | TENSOR_3D_OUTPUT_X_WEIGHT, 35 | 36 | TENSOR_2P5D_ROW, 37 | TENSOR_2P5D_COL, 38 | TENSOR_2P5D_DEP 39 | }; 40 | parallel_context* init_parallel_context(); 41 | int get_tp_size(parallel_context* p); 42 | int get_tp_rank(parallel_context* p); 43 | bool is_master(parallel_context* p); 44 | void barrier(parallel_context* p); 45 | void broadcast(parallel_context* p, float* buffer, size_t count); 46 | void alltoall(parallel_context* p, float* send_buffer, float* recv_buffer, size_t count); 47 | void reduce_add(parallel_context* p, float* send_buffer, float* recv_buffer, size_t count); 48 | 49 | #ifdef __cplusplus 50 | } 51 | #endif 52 | -------------------------------------------------------------------------------- /neural_speed/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
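# Illustrative expansion, not part of the original file: add_model(), defined below,
# turns each architecture into its own warning-enabled, position-independent C++11
# library linked against ne_layers and bestla. For example,
#   add_model(llama llama/llama.cpp llama/llama_utils.cpp ${MODEL_UTILS_SOURCE})
# behaves roughly like:
#   add_library_w_warning(llama llama/llama.cpp llama/llama_utils.cpp ${MODEL_UTILS_SOURCE})
#   target_compile_features(llama PUBLIC cxx_std_11)
#   set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
#   target_link_libraries(llama PUBLIC ne_layers bestla)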
14 | 15 | file(GLOB MODEL_UTILS_SOURCE "model_utils/*.cpp") 16 | 17 | function(add_model target) 18 | add_library_w_warning(${target} ${ARGN}) # no (gpt) model utils needed 19 | target_compile_features(${target} PUBLIC cxx_std_11) # don't bump 20 | set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON) 21 | target_link_libraries(${target} PUBLIC ne_layers bestla) 22 | endfunction() 23 | 24 | add_model(llama llama/llama.cpp llama/llama_utils.cpp ${MODEL_UTILS_SOURCE}) 25 | add_model(gptj gptj/gptj.cpp gptj/gptj_utils.cpp ${MODEL_UTILS_SOURCE}) 26 | add_model(mpt mpt/mpt.cpp mpt/mpt_utils.cpp ${MODEL_UTILS_SOURCE}) 27 | add_model(gptneox gptneox/gptneox.cpp gptneox/gptneox_utils.cpp ${MODEL_UTILS_SOURCE}) 28 | add_model(starcoder starcoder/starcoder.cpp starcoder/starcoder_utils.cpp ${MODEL_UTILS_SOURCE}) 29 | add_model(falcon falcon/falcon.cpp falcon/falcon_utils.cpp ${MODEL_UTILS_SOURCE}) 30 | add_model(opt opt/opt.cpp opt/opt_utils.cpp ${MODEL_UTILS_SOURCE}) 31 | add_model(bloom bloom/bloom.cpp bloom/bloom_utils.cpp ${MODEL_UTILS_SOURCE}) 32 | add_model(baichuan baichuan/baichuan.cpp baichuan/baichuan_utils.cpp ${MODEL_UTILS_SOURCE}) 33 | add_model(qwen qwen/qwen.cpp qwen/qwen_utils.cpp ${MODEL_UTILS_SOURCE}) 34 | add_model(whisper whisper/whisper.cpp whisper/whisper_utils.cpp ${MODEL_UTILS_SOURCE}) 35 | add_model(chatglm chatglm/chatglm.cpp chatglm/chatglm_utils.cpp ${MODEL_UTILS_SOURCE}) 36 | add_model(chatglm2 chatglm/chatglm2.cpp chatglm/chatglm2_utils.cpp ${MODEL_UTILS_SOURCE}) 37 | add_model(gemma gemma/gemma.cpp gemma/gemma_utils.cpp ${MODEL_UTILS_SOURCE}) 38 | add_model(phi phi/phi.cpp phi/phi_utils.cpp ${MODEL_UTILS_SOURCE}) 39 | add_model(stablelm stablelm/stablelm.cpp stablelm/stablelm_utils.cpp ${MODEL_UTILS_SOURCE}) 40 | add_model(chatglm3 chatglm/chatglm2.cpp chatglm/chatglm2_utils.cpp ${MODEL_UTILS_SOURCE}) 41 | add_model(grok grok/grok.cpp grok/grok_utils.cpp ${MODEL_UTILS_SOURCE}) 42 | add_model(phi3 phi/phi3.cpp phi/phi3_utils.cpp ${MODEL_UTILS_SOURCE}) 43 | -------------------------------------------------------------------------------- /neural_speed/models/baichuan/baichuan.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
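// Illustrative note, not part of the original header: every model header in this
// directory follows the same pattern -- a <name>_mem_req(n_layers, ratio) helper maps
// a layer count to three scratch-buffer sizes, and init() (defined in the matching
// *_utils.cpp, not shown here) is expected to call it once the GGUF hyperparameters
// are known, e.g.
//   scratch = baichuan_mem_req(40, scratch_size_ratio);  // Baichuan-13B: 4096/2048/4096 MB
// Unrecognized layer counts fall through to MODEL_ASSERT(false).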
14 | 15 | #ifndef BAICHUAN_H 16 | #define BAICHUAN_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum baichuan_model { 22 | BAICHUAN_UNKNOWN, 23 | BAICHUAN_13B, 24 | }; 25 | 26 | static const model_scratch baichuan_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 40: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | case 32: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class BAICHUAN : public IModel { 46 | private: 47 | model_archs name = MODEL_BAICHUAN; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab; 50 | int n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // BAICHUAN_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/bloom/bloom.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef BLOOM_H 16 | #define BLOOM_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum bloom_model { 22 | BLOOM_UNKNOWN, 23 | BLOOM_7B, 24 | }; 25 | 26 | static const model_scratch bloom_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 30: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | default: 35 | MODEL_ASSERT(false); 36 | } 37 | } 38 | 39 | class BLOOM : public IModel { 40 | private: 41 | model_archs arch = MODEL_BLOOM; 42 | std::unique_ptr<model_model_loader> ml; 43 | uint32_t n_layer, n_embd, n_ff, n_vocab; 44 | int n_gpu_layer; 45 | bool use_mmap, use_mlock, vocab_only; 46 | model_scratch scratch; 47 | 48 | public: 49 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 50 | bool vocab_only_) override; 51 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 52 | }; 53 | 54 | #endif // BLOOM_H 55 | -------------------------------------------------------------------------------- /neural_speed/models/chatglm/chatglm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef CHATGLM1_H 16 | #define CHATGLM1_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum chatglm_model { 22 | CHATGLM_UNKNOWN, 23 | CHATGLM_6B, 24 | }; 25 | 26 | static const model_scratch chatglm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 28: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | default: 35 | MODEL_ASSERT(false); 36 | } 37 | } 38 | 39 | class CHATGLM : public IModel { 40 | private: 41 | model_archs name = MODEL_CHATGLM; 42 | std::unique_ptr<model_model_loader> ml; 43 | uint32_t n_layer, n_embd, n_ff, n_vocab; 44 | int n_gpu_layer; 45 | bool use_mmap, use_mlock, vocab_only; 46 | model_scratch scratch; 47 | 48 | public: 49 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 50 | bool vocab_only_) override; 51 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 52 | }; 53 | 54 | #endif // CHATGLM1_H 55 | -------------------------------------------------------------------------------- /neural_speed/models/chatglm/chatglm2.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef CHATGLM2_H 16 | #define CHATGLM2_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum chatglm2_model { 22 | CHATGLM2_UNKNOWN, 23 | CHATGLM2_6B, 24 | }; 25 | 26 | static const model_scratch chatglm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 28: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | case 40: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class CHATGLM2 : public IModel { 46 | private: 47 | model_archs name = MODEL_CHATGLM2; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab; 50 | int n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // CHATGLM2_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/falcon/falcon.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef FALCON_H 16 | #define FALCON_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum falcon_model { 22 | FALCON_UNKNOWN, 23 | FALCON_7B, 24 | }; 25 | 26 | static const model_scratch falcon_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 32: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | case 60: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 2 * 3072) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 2 * 2048) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 2 * 3072) * MB, 39 | }; 40 | case 80: 41 | return { 42 | static_cast<size_t>(scratch_size_ratio * 3 * 3072) * MB, 43 | static_cast<size_t>(scratch_size_ratio * 3 * 2048) * MB, 44 | static_cast<size_t>(scratch_size_ratio * 3 * 3072) * MB, 45 | }; 46 | default: 47 | MODEL_ASSERT(false); 48 | } 49 | } 50 | 51 | class FALCON : public IModel { 52 | private: 53 | model_archs arch = MODEL_FALCON; 54 | std::unique_ptr<model_model_loader> ml; 55 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head_kv; 56 | int n_gpu_layer; 57 | bool use_mmap, use_mlock, vocab_only; 58 | model_scratch scratch; 59 | 60 | public: 61 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 62 | bool vocab_only_) override; 63 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 64 | }; 65 | 66 | #endif // FALCON_H 67 | -------------------------------------------------------------------------------- /neural_speed/models/gemma/gemma.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef GEMMA_H 16 | #define GEMMA_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum gemma_model { 22 | GEMMA_2B, 23 | GEMMA_7B, 24 | }; 25 | 26 | static const model_scratch gemma_mem_req(int n_layers, float enlarge_scale = 1.0f) { 27 | switch (n_layers) { 28 | case 18: 29 | return { 30 | static_cast<size_t>(enlarge_scale * 1024) * MB, 31 | static_cast<size_t>(enlarge_scale * 1024) * MB, 32 | static_cast<size_t>(enlarge_scale * 1608) * MB, 33 | }; 34 | case 28: 35 | return { 36 | static_cast<size_t>(enlarge_scale * 1024) * MB, 37 | static_cast<size_t>(enlarge_scale * 1024) * MB, 38 | static_cast<size_t>(enlarge_scale * 1608) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class Gemma : public IModel { 46 | private: 47 | model_archs arch = MODEL_GEMMA; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv, n_expert, n_expert_used, n_embd_head_k; 50 | int n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // GEMMA_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/gptj/gptj.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef GPTJ_H 16 | #define GPTJ_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum gptj_model { 22 | GPTJ_UNKNOWN, 23 | GPTJ_7B, 24 | GPTJ_13B, 25 | GPTJ_30B, 26 | GPTJ_65B, 27 | }; 28 | 29 | static const model_scratch gptj_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 30 | switch (n_layers) { 31 | case 28: 32 | // should be enough for batch=8 * beam=4 33 | return { 34 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 35 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 36 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 37 | }; 38 | default: 39 | MODEL_ASSERT(false); 40 | } 41 | } 42 | 43 | class GPTJ : public IModel { 44 | private: 45 | model_archs arch = MODEL_GPTJ; 46 | std::unique_ptr<model_model_loader> ml; 47 | uint32_t n_layer, n_embd, n_ff, n_vocab; 48 | int n_gpu_layer; 49 | bool use_mmap, use_mlock, vocab_only; 50 | model_scratch scratch; 51 | 52 | public: 53 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 54 | bool vocab_only_) override; 55 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 56 | }; 57 | 58 | #endif // GPTJ_H 59 | -------------------------------------------------------------------------------- /neural_speed/models/gptneox/gptneox.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef GPTNEOX_H 16 | #define GPTNEOX_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum gptneox_model { 22 | GPTNEOX_UNKNOWN, 23 | GPTNEOX_7B, 24 | }; 25 | 26 | static const model_scratch gptneox_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 44: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | case 32: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 39 | }; 40 | case 28: // 5.8B 41 | return { 42 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 43 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 44 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 45 | }; 46 | default: 47 | MODEL_ASSERT(false); 48 | } 49 | } 50 | 51 | class GPTNEOX : public IModel { 52 | private: 53 | model_archs arch = MODEL_GPTNEOX; 54 | std::unique_ptr<model_model_loader> ml; 55 | uint32_t n_layer, n_embd, n_ff, n_vocab; 56 | int n_gpu_layer; 57 | bool use_mmap, use_mlock, vocab_only; 58 | model_scratch scratch; 59 | 60 | public: 61 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 62 | bool vocab_only_) override; 63 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 64 | }; 65 | 66 | #endif // GPTNEOX_H 67 | -------------------------------------------------------------------------------- /neural_speed/models/grok/grok.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef GROK_H 16 | #define GROK_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum grok_model { 22 | GROK_314B, 23 | }; 24 | 25 | static const model_scratch grok_mem_req(int n_layers, float enlarge_scale = 1.0f) { 26 | switch (n_layers) { 27 | case 64: 28 | return { 29 | static_cast<size_t>(enlarge_scale * 4096) * MB, 30 | static_cast<size_t>(enlarge_scale * 2048) * MB, 31 | static_cast<size_t>(enlarge_scale * 4096 * 10) * MB, 32 | }; 33 | default: 34 | MODEL_ASSERT(false); 35 | } 36 | } 37 | 38 | class Grok : public IModel { 39 | private: 40 | model_archs arch = MODEL_GROK; 41 | std::unique_ptr<model_model_loader> ml; 42 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv, n_expert, n_expert_used, n_embd_head_k; 43 | int n_gpu_layer; 44 | bool use_mmap, use_mlock, vocab_only; 45 | model_scratch scratch; 46 | 47 | public: 48 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 49 | bool vocab_only_) override; 50 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 51 | }; 52 | 53 | #endif // GROK_H 54 | -------------------------------------------------------------------------------- /neural_speed/models/model_utils/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/neural_speed/models/model_utils/CMakeLists.txt -------------------------------------------------------------------------------- /neural_speed/models/model_utils/pool.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
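// Illustrative usage, a sketch rather than code from this repository: a serving loop
// is expected to funnel requests through a thread-safe serve_pool and drain it in
// first-come-first-served order:
//
//   serve_pool waiting(parse_serve_policy("fcfs"), pool_property::WAITING);
//   waiting.add(seq);                 // enqueue under the pool mutex
//   sequence next;
//   while (waiting.pop(&next)) { /* schedule `next` for prefill */ }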
14 | 15 | #include "models/model_utils/pool.h" 16 | 17 | serve_policy parse_serve_policy(const std::string& policy) { 18 | if (policy == "fcfs") { 19 | return serve_policy::FCFS; 20 | } else { 21 | fprintf(stderr, "Unexpected serve_policy %s!\n", policy.c_str()); 22 | return serve_policy::UNKNOWN; 23 | } 24 | } 25 | 26 | // fcfs_pool 27 | bool fcfs_pool::add(sequence seq) { 28 | context.emplace(seq); 29 | return true; 30 | } 31 | 32 | bool fcfs_pool::pop(sequence* seq) { 33 | if (empty()) { 34 | fprintf(stderr, "%s: pool is empty.\n", __func__); 35 | return false; 36 | } 37 | *seq = context.front(); 38 | context.pop(); 39 | return true; 40 | } 41 | 42 | void fcfs_pool::clear() { 43 | std::queue<sequence> empty_q; 44 | context.swap(empty_q); 45 | } 46 | 47 | bool fcfs_pool::empty() { return context.empty(); } 48 | 49 | int fcfs_pool::size() { return context.size(); } 50 | 51 | // serve_pool 52 | serve_pool::serve_pool(const pool_property& property) { 53 | // default policy = FCFS 54 | std::lock_guard<std::mutex> lock(mtx); 55 | if (internal_pool != nullptr) return; 56 | internal_pool = new fcfs_pool(property); 57 | } 58 | 59 | serve_pool::serve_pool(const serve_policy& policy, const pool_property& property) { 60 | std::lock_guard<std::mutex> lock(mtx); 61 | if (internal_pool != nullptr) return; 62 | switch (policy) { 63 | case serve_policy::FCFS: 64 | internal_pool = new fcfs_pool(property); break; 65 | default: 66 | NE_ASSERT(false); 67 | } 68 | } 69 | 70 | serve_pool::~serve_pool() { 71 | std::lock_guard<std::mutex> lock(mtx); 72 | if (internal_pool != nullptr) { 73 | delete internal_pool; 74 | } 75 | } 76 | 77 | bool serve_pool::add(sequence seq) { 78 | std::lock_guard<std::mutex> lock(mtx); 79 | return internal_pool->add(std::move(seq)); 80 | } 81 | 82 | bool serve_pool::pop(sequence* seq) { 83 | std::lock_guard<std::mutex> lock(mtx); 84 | return internal_pool->pop(seq); 85 | } 86 | 87 | void serve_pool::clear() { 88 | std::lock_guard<std::mutex> lock(mtx); 89 | internal_pool->clear(); 90 | } 91 | 92 | bool serve_pool::empty() { return internal_pool->empty(); } 93 | 94 | int serve_pool::size() { return internal_pool->size(); } 95 | -------------------------------------------------------------------------------- /neural_speed/models/model_utils/pool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
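// Inferred lifecycle -- a reading of the enums below, not documented in the original:
// a sequence normally advances WAITING -> PREFILL -> DECODING -> FINISHED, while the
// serving layer keeps separate WAITING / RUNNING / FINISHED pools and migrates each
// sequence between them as its seq_status changes.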
14 | 15 | #ifndef POOL_H 16 | #define POOL_H 17 | 18 | #include <mutex> 19 | #include <queue> 20 | #include "models/model_utils/model_types.h" 21 | 22 | enum class seq_status : int { 23 | UNKNOWN = 0, 24 | WAITING, 25 | PREFILL, 26 | DECODING, 27 | FINISHED, 28 | }; 29 | 30 | enum class pool_property : int { 31 | WAITING = 0, 32 | RUNNING, 33 | FINISHED, 34 | }; 35 | 36 | enum class serve_policy : int { 37 | UNKNOWN = 0, 38 | FCFS, // first come, first served 39 | }; 40 | 41 | serve_policy parse_serve_policy(const std::string& policy); 42 | 43 | struct sequence { 44 | int request_idx = -1; // -1 means unknown 45 | int64_t receive_time; 46 | int64_t end_time; 47 | std::vector<model_token> prompt_ids; 48 | std::vector<model_token> generated_ids; 49 | uint32_t n_prompt_tokens; 50 | uint32_t n_past; 51 | uint32_t n_total; 52 | uint32_t n_tokens; 53 | generation_config gen_conf; 54 | seq_status status = seq_status::UNKNOWN; 55 | uint64_t query_id; // query_id for the pybind response 56 | }; 57 | 58 | // abstract base class 59 | class pool { 60 | public: 61 | explicit pool(const pool_property& property) : property(property) {} 62 | virtual ~pool() {} 63 | virtual bool add(sequence seq) = 0; 64 | virtual bool pop(sequence* seq) = 0; 65 | virtual void clear() = 0; 66 | virtual bool empty() = 0; 67 | virtual int size() = 0; 68 | 69 | protected: 70 | const pool_property property; 71 | }; 72 | 73 | class fcfs_pool : public pool { 74 | public: 75 | explicit fcfs_pool(const pool_property& property) : pool(property) {} 76 | ~fcfs_pool() {} 77 | bool add(sequence seq) override; 78 | bool pop(sequence* seq) override; 79 | void clear() override; 80 | bool empty() override; 81 | int size() override; 82 | 83 | protected: 84 | std::queue<sequence> context; 85 | }; 86 | 87 | class serve_pool { 88 | public: 89 | explicit serve_pool(const pool_property& property); 90 | serve_pool(const serve_policy& policy, const pool_property& property); 91 | ~serve_pool(); 92 | bool add(sequence seq); 93 | bool pop(sequence* seq); 94 | void clear(); 95 | bool empty(); 96 | int size(); 97 | 98 | protected: 99 | pool* internal_pool = nullptr; 100 | std::mutex mtx; 101 | }; 102 | 103 | #endif // POOL_H 104 | -------------------------------------------------------------------------------- /neural_speed/models/model_utils/quant_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
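// Illustrative call sequence, a sketch only (the way the quant_layer argument is
// constructed is an assumption; this header only declares model_quantize itself):
//
//   quant_params p;                              // populated from CLI flags
//   std::shared_ptr<quant_layer_base> ql = ...;  // per-arch layer config, see quant_config.h
//   if (model_quantize(p, ql) != 0) { /* quantization failed */ }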
14 | #ifndef QUANT_UTILS_H 15 | #define QUANT_UTILS_H 16 | 17 | #include "application/common.h" 18 | #include "models/model_utils/quant_config.h" 19 | 20 | #ifdef MODEL_SHARED 21 | #if defined(_WIN32) && !defined(__MINGW32__) 22 | #ifdef MODEL_BUILD 23 | #define QUANT_API __declspec(dllexport) 24 | #else 25 | #define QUANT_API __declspec(dllimport) 26 | #endif 27 | #else 28 | #define QUANT_API __attribute__((visibility("default"))) 29 | #endif 30 | #else 31 | #define QUANT_API 32 | #endif 33 | 34 | QUANT_API int model_quantize(const quant_params& param, std::shared_ptr<quant_layer_base> quant_layer); 35 | size_t bestla_qpack(const int8_t* src_w, const float* src_scales, const int8_t* src_zps, void* dstpr, 36 | const quant_params_internal params, int nthread, int n, int k, int* g_idx); 37 | size_t bestla_quantize(const float* f32ptr, void* dstpr, const quant_params_internal params, int nthread, size_t n, 38 | size_t k); 39 | QUANT_API bool model_quantize_special(std::ifstream& finp, std::ofstream& fout, const ne_ftype ftype, 40 | const std::vector<std::string>& to_quant, 41 | const std::vector<std::string>& to_skip); 42 | QUANT_API bool whisper_model_quantize(const std::string& fname_inp, const std::string& fname_out, ne_ftype ftype); 43 | #endif // QUANT_UTILS_H 44 | -------------------------------------------------------------------------------- /neural_speed/models/model_utils/util.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include "util.h" 15 | 16 | int32_t get_num_physical_cores() { 17 | #ifdef __linux__ 18 | // enumerate the set of thread siblings; the number of unique entries is the core count 19 | std::unordered_set<std::string> siblings; 20 | for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) { 21 | std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); 22 | if (!thread_siblings.is_open()) { 23 | break; // no more cpus 24 | } 25 | std::string line; 26 | if (std::getline(thread_siblings, line)) { 27 | siblings.insert(line); 28 | } 29 | } 30 | if (!siblings.empty()) { 31 | return static_cast<int32_t>(siblings.size()); 32 | } 33 | #elif defined(__APPLE__) && defined(__MACH__) 34 | int32_t num_physical_cores; 35 | size_t len = sizeof(num_physical_cores); 36 | int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, nullptr, 0); 37 | if (result == 0) { 38 | return num_physical_cores; 39 | } 40 | result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, nullptr, 0); 41 | if (result == 0) { 42 | return num_physical_cores; 43 | } 44 | #elif defined(_WIN32) 45 | // TODO: implement physical-core detection on Windows 46 | #endif 47 | unsigned int n_threads = std::thread::hardware_concurrency(); 48 | return n_threads > 0 ? (n_threads <= 4 ?
n_threads : n_threads / 2) : 4; 49 | } 50 | -------------------------------------------------------------------------------- /neural_speed/models/models.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef MODELS_H 15 | #define MODELS_H 16 | 17 | #include "models/model_utils/model_types.h" 18 | 19 | struct IModel { 20 | virtual ~IModel() = default;  // allow safe deletion through the interface pointer 21 | virtual void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap, bool use_mlock, 22 | bool vocab_only) = 0; 23 | virtual void load(model_context* ctx, model_progress_callback progress_callback, 24 | void* progress_callback_user_data) = 0; 25 | }; 26 | 27 | #endif // MODELS_H 28 | -------------------------------------------------------------------------------- /neural_speed/models/mpt/mpt.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef MPT_H 16 | #define MPT_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum mpt_model { 22 | MPT_UNKNOWN, 23 | MPT_7B, 24 | MPT_30B, 25 | }; 26 | 27 | static const model_scratch mpt_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 28 | switch (n_layers) { 29 | case 32: 30 | return { 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 33 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 34 | }; 35 | case 48: 36 | return { 37 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 39 | static_cast<size_t>(scratch_size_ratio * 8192) * MB, 40 | }; 41 | default: 42 | MODEL_ASSERT(false); 43 | } 44 | } 45 | 46 | class MPT : public IModel { 47 | private: 48 | model_archs arch = MODEL_MPT; 49 | std::unique_ptr<model_model_loader> ml; 50 | uint32_t n_layer, n_embd, n_ff, n_vocab; 51 | int n_gpu_layer; 52 | bool use_mmap, use_mlock, vocab_only; 53 | model_scratch scratch; 54 | 55 | public: 56 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 57 | bool vocab_only_) override; 58 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 59 | }; 60 | 61 | #endif // MPT_H 62 | -------------------------------------------------------------------------------- /neural_speed/models/phi/phi.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef PHI_H 16 | #define PHI_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum phi_model { 22 | PHI_UNKNOWN, 23 | PHI, 24 | }; 25 | 26 | static const model_scratch phi_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 24: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 33 | }; 34 | case 32: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class phi : public IModel { 46 | private: 47 | model_archs name = MODEL_PHI; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab; 50 | int n_ctx, n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // PHI_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/phi/phi3.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef PHI3_H 16 | #define PHI3_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum phi3_model { 22 | PHI3_UNKNOWN, 23 | PHI3, 24 | }; 25 | 26 | static const model_scratch phi3_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 24: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 33 | }; 34 | case 32: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class phi3 : public IModel { 46 | private: 47 | model_archs name = MODEL_PHI3; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab; 50 | int n_ctx, n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // PHI3_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/qwen/qwen.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef QWEN_H 16 | #define QWEN_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum QWEN_model { 22 | QWEN_UNKNOWN, 23 | QWEN_7B, 24 | QWEN_14B, 25 | }; 26 | 27 | static const model_scratch qwen_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 28 | switch (n_layers) { 29 | case 40: 30 | return { 31 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 33 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 34 | }; 35 | case 32: 36 | return { 37 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 39 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 40 | }; 41 | case 24: 42 | return { 43 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 44 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 45 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 46 | }; 47 | case 28: 48 | return { 49 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 50 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 51 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 52 | }; 53 | case 80: 54 | return { 55 | static_cast<size_t>(scratch_size_ratio * 3 * 4096) * MB, 56 | static_cast<size_t>(scratch_size_ratio * 3 * 2048) * MB, 57 | static_cast<size_t>(scratch_size_ratio * 3 * 4096) * MB, 58 | }; 59 | default: 60 | MODEL_ASSERT(false); 61 | } 62 | } 63 | 64 | class QWEN : public IModel { 65 | private: 66 | model_archs arch = MODEL_QWEN; 67 | std::unique_ptr<model_model_loader> ml; 68 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv; 69 | int n_gpu_layer; 70 | bool use_mmap, use_mlock, vocab_only; 71 | model_scratch scratch; 72 | 73 | public: 74 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 75 | bool vocab_only_) override; 76 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 77 | }; 78 | 79 | #endif // QWEN_H 80 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/baichuan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | #=============================================================================== 17 | 18 | # To avoid the error: 'ChatGLMTokenizer' object has no attribute 'sp_tokenizer' 19 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" transformers==4.33.1 20 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/baichuan13b-gptq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #=============================================================================== 17 | 18 | # To avoid the error: 'ChatGLMTokenizer' object has no attribute 'sp_tokenizer' 19 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" transformers==4.33.1 20 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/chatglm-6b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | #=============================================================================== 17 | 18 | # To avoid the error: 'ChatGLMTokenizer' object has no attribute 'sp_tokenizer' 19 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" transformers==4.33.1 20 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/common.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | accelerate 3 | datasets 4 | einops 5 | gguf 6 | huggingface_hub 7 | lm_eval==0.4.2 8 | matplotlib 9 | numpy 10 | peft 11 | protobuf<3.20 12 | sentencepiece 13 | tiktoken 14 | torch 15 | transformers 16 | transformers_stream_generator 17 | zipfile38 18 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/mistral.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #=============================================================================== 17 | 18 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" "transformers>=4.34.0" 19 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/mixtral-gptq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #=============================================================================== 17 | 18 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" "transformers>=4.34.0" 19 | -------------------------------------------------------------------------------- /neural_speed/models/stablelm/stablelm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef STABLELM_H 16 | #define STABLELM_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum stablelm_model { 22 | STABLELM_UNKNOWN, 23 | STABLELM_2_1_6B, 24 | STABLELM_2_12B, 25 | STABLELM_3B, 26 | }; 27 | 28 | static const model_scratch stablelm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 29 | switch (n_layers) { 30 | case 24: // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B 31 | return { 32 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 33 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 34 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 35 | }; 36 | case 32: // StableLM-3B & Stable-Code-3B 37 | return { 38 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 39 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 40 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 41 | }; 42 | case 40: // StableLM-2-12B 43 | return { 44 | static_cast<size_t>(scratch_size_ratio * 2560) * MB, 45 | static_cast<size_t>(scratch_size_ratio * 2560) * MB, 46 | static_cast<size_t>(scratch_size_ratio * 5120) * MB, 47 | }; 48 | default: 49 | MODEL_ASSERT(false); 50 | } 51 | } 52 | 53 | class stablelm : public IModel { 54 | private: 55 | model_archs name = MODEL_STABLELM; 56 | std::unique_ptr<model_model_loader> ml; 57 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv, n_embd_head_k; 58 | int n_ctx, n_gpu_layer; 59 | bool use_mmap, use_mlock, vocab_only; 60 | model_scratch scratch; 61 | 62 | public: 63 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 64 | bool vocab_only_) override; 65 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 66 | }; 67 | 68 | #endif // STABLELM_H 69 | -------------------------------------------------------------------------------- /neural_speed/models/starcoder/starcoder.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef STARCODER_H 16 | #define STARCODER_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum starcoder_model { 22 | STARCODER_UNKNOWN, 23 | STARCODER_7B, 24 | STARCODER_13B, 25 | STARCODER_30B, 26 | STARCODER_65B, 27 | }; 28 | 29 | static const model_scratch starcoder_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 30 | switch (n_layers) { 31 | case 24: 32 | return { 33 | static_cast<size_t>(scratch_size_ratio * 3072 * 2) * MB, 34 | static_cast<size_t>(scratch_size_ratio * 2048 * 2) * MB, 35 | static_cast<size_t>(scratch_size_ratio * 3072 * 2) * MB, 36 | }; 37 | case 36: 38 | return { 39 | static_cast<size_t>(scratch_size_ratio * 3072 * 2) * MB, 40 | static_cast<size_t>(scratch_size_ratio * 2048 * 2) * MB, 41 | static_cast<size_t>(scratch_size_ratio * 3072 * 2) * MB, 42 | }; 43 | case 40: 44 | return { 45 | static_cast<size_t>(scratch_size_ratio * 3072 * 8) * MB, 46 | static_cast<size_t>(scratch_size_ratio * 2048 * 8) * MB, 47 | static_cast<size_t>(scratch_size_ratio * 3072 * 8) * MB, 48 | }; 49 | default: 50 | MODEL_ASSERT(false); 51 | } 52 | } 53 | 54 | class STARCODER : public IModel { 55 | private: 56 | model_archs arch = MODEL_STARCODER; 57 | std::unique_ptr<model_model_loader> ml; 58 | uint32_t n_layer, n_embd, n_ff, n_vocab; 59 | int n_gpu_layer; 60 | bool use_mmap, use_mlock, vocab_only; 61 | model_scratch scratch; 62 | 63 | public: 64 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 65 | bool vocab_only_) override; 66 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 67 | }; 68 | 69 | #endif // STARCODER_H 70 | -------------------------------------------------------------------------------- /neural_speed/vectors/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(cpu) 16 | if (NS_GPU) 17 | add_subdirectory(gpu) 18 | endif() 19 | 20 | add_library_w_warning(ne_vec ele_reduce.cpp) 21 | target_link_libraries(ne_vec PUBLIC cpu_vec) 22 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | add_library_w_warning(cpu_vec vec_arithmetic.cpp vec_compare.cpp vec_convert.cpp vec_set.cpp vec_store.cpp vec_load.cpp) 16 | set_target_properties(cpu_vec PROPERTIES LINKER_LANGUAGE CXX) 17 | set_property(TARGET cpu_vec PROPERTY POSITION_INDEPENDENT_CODE ON) 18 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_HPP_ 17 | 18 | #include "vec_arithmetic.hpp" 19 | #include "vec_base.hpp" 20 | #include "vec_compare.hpp" 21 | #include "vec_convert.hpp" 22 | #include "vec_set.hpp" 23 | 24 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_HPP_ 25 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec.hpp.gch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/neural_speed/vectors/cpu/vec.hpp.gch -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_arithmetic.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
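The arithmetic declarations below all operate on fp32x16, the 16-float "virtual register" defined in vec_base.hpp (not shown in this tree). The idea, sketched here with illustrative stand-in types, is that one fp32x16 is a single __m512 when AVX-512 is available and a pair of __m256 halves otherwise, with every operation applied to both halves:

#include <immintrin.h>

struct f32x16_demo {  // stand-in for the real fp32x16 from vec_base.hpp
#if __AVX512F__
  __m512 first;
#else
  __m256 first;   // lanes 0..7
  __m256 second;  // lanes 8..15
#endif
};

inline f32x16_demo add_demo(f32x16_demo a, f32x16_demo b) {
#if __AVX512F__
  return {_mm512_add_ps(a.first, b.first)};
#else
  return {_mm256_add_ps(a.first, b.first), _mm256_add_ps(a.second, b.second)};
#endif
}

This is why every definition in the cpu/ sources comes in an #if __AVX512F__ / #else pair: the AVX2 path simply repeats the same instruction on .first and .second.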
14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_ARITHMETIC_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_ARITHMETIC_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y); 21 | REGISTER_KERNEL_T(sub_fp32x16, fp32x16, fp32x16, fp32x16); 22 | 23 | fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z); 24 | REGISTER_KERNEL_T(fmsub_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16); 25 | 26 | fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z); 27 | 28 | fp32x16 add_fp32x16(fp32x16 x, fp32x16 y); 29 | REGISTER_KERNEL_T(add_fp32x16, fp32x16, fp32x16, fp32x16); 30 | 31 | fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z); 32 | REGISTER_KERNEL_T(fmadd_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16); 33 | 34 | fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y); 35 | REGISTER_KERNEL_T(mul_fp32x16, fp32x16, fp32x16, fp32x16); 36 | 37 | fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y); 38 | 39 | template <int rounding> 40 | fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y); 41 | 42 | fp32x16 div_fp32x16(fp32x16 x, fp32x16 y); 43 | REGISTER_KERNEL_T(div_fp32x16, fp32x16, fp32x16, fp32x16); 44 | 45 | float reduce_add_fp32x16(fp32x16 x); 46 | REGISTER_KERNEL_T(reduce_add_fp32x16, float, fp32x16); 47 | 48 | fp32x16 sqrt_fp32x16(fp32x16 x); 49 | REGISTER_KERNEL_T(sqrt_fp32x16, fp32x16, fp32x16); 50 | 51 | fp32x16 rsqrt14_fp32x16(fp32x16 x); 52 | REGISTER_KERNEL_T(rsqrt14_fp32x16, fp32x16, fp32x16); 53 | 54 | fp32x16 ceil_fp32x16(fp32x16 x); 55 | REGISTER_KERNEL_T(ceil_fp32x16, fp32x16, fp32x16); 56 | 57 | fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y); 58 | REGISTER_KERNEL_T(scale_fp32x16, fp32x16, fp32x16, fp32x16); 59 | 60 | float dot_fp32x16(fp32x16 x, fp32x16 y); 61 | REGISTER_KERNEL_T(dot_fp32x16, float, fp32x16, fp32x16); 62 | 63 | fp32x16 abs_fp32x16(fp32x16 x); 64 | REGISTER_KERNEL_T(abs_fp32x16, fp32x16, fp32x16); 65 | 66 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_ARITHMETIC_HPP_ 67 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_compare.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
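reduce_max_fp32x16 in this file shows the standard horizontal-reduction ladder for the AVX2 path: combine the two 256-bit halves, then keep halving the width (256 -> 128 -> 64 -> 32 bits) until one lane remains. The same ladder with addition in place of max, as a standalone illustration:

#include <immintrin.h>

inline float hsum256_demo(__m256 v) {
  const __m128 lo = _mm256_castps256_ps128(v);    // lanes 0..3
  const __m128 hi = _mm256_extractf128_ps(v, 1);  // lanes 4..7
  __m128 s = _mm_add_ps(lo, hi);                  // 8 -> 4 partial sums
  s = _mm_add_ps(s, _mm_movehl_ps(s, s));         // 4 -> 2
  s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));  // 2 -> 1 (lane 1 folded onto lane 0)
  return _mm_cvtss_f32(s);
}

Each step costs one shuffle plus one arithmetic op; a full 16-wide reduction only adds one more combine up front to merge the two __m256 halves.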
14 | 15 | #include "vec_compare.hpp" 16 | 17 | fp32x16 min_fp32x16(fp32x16 a, fp32x16 b) { 18 | #if __AVX512F__ 19 | return {_mm512_min_ps(a.first, b.first)}; 20 | #else 21 | return {_mm256_min_ps(a.first, b.first), _mm256_min_ps(a.second, b.second)}; 22 | #endif 23 | } 24 | 25 | s32x16 max_s32x16(s32x16 a, s32x16 b) { 26 | #if __AVX512F__ 27 | return {_mm512_max_epi32(a.first, b.first)}; 28 | #else 29 | return {_mm256_max_epi32(a.first, b.first), _mm256_max_epi32(a.second, b.second)}; 30 | #endif 31 | } 32 | 33 | fp32x16 max_fp32x16(fp32x16 a, fp32x16 b) { 34 | #if __AVX512F__ 35 | return {_mm512_max_ps(a.first, b.first)}; 36 | #else 37 | return {_mm256_max_ps(a.first, b.first), _mm256_max_ps(a.second, b.second)}; 38 | #endif 39 | } 40 | 41 | float reduce_max_fp32x16(fp32x16 x) { 42 | #if __AVX512F__ 43 | return {_mm512_reduce_max_ps(x.first)}; 44 | #else 45 | const __m256 x256 = _mm256_max_ps(x.first, x.second); 46 | const __m128 x128 = _mm_max_ps(_mm256_extractf128_ps(x256, 1), _mm256_castps256_ps128(x256)); 47 | const __m128 x64 = _mm_max_ps(x128, _mm_movehl_ps(x128, x128)); 48 | const __m128 x32 = _mm_max_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); 49 | return _mm_cvtss_f32(x32); 50 | #endif 51 | } 52 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_compare.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_COMPARE_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_COMPARE_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | fp32x16 min_fp32x16(fp32x16 a, fp32x16 b); 21 | 22 | s32x16 max_s32x16(s32x16 a, s32x16 b); 23 | 24 | fp32x16 max_fp32x16(fp32x16 a, fp32x16 b); 25 | 26 | float reduce_max_fp32x16(fp32x16 x); 27 | REGISTER_KERNEL_T(reduce_max_fp32x16, float, fp32x16); 28 | 29 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_COMPARE_HPP_ 30 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_convert.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_CONVERT_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_CONVERT_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | template <int rounding> 21 | s32x16 cvt_roundfp32x16_s32x16(fp32x16 a); 22 | template <int rounding> 23 | struct ne_cvt_roundfp32x16_s32x16_kernel_t : public kernel_t { 24 | ne_cvt_roundfp32x16_s32x16_kernel_t() { func_ = cvt_roundfp32x16_s32x16<rounding>; } 25 | }; 26 | 27 | template <int rounding> 28 | s32x16 maskz_cvt_roundfp32x16_s32x16(int mask, fp32x16 a); 29 | bf16x16 cvt_fp32x16_bf16x16(fp32x16 a); 30 | 31 | fp32x16 cvt_bf16x16_fp32x16(bf16x16 a); 32 | 33 | fp32x16 maskz_cvt_bf16x16_fp32x16(int mask, bf16x16 a); 34 | 35 | u8x16 cvt_u32x16_u8x16(u32x16 a); 36 | u8x16 maskz_cvt_u32x16_u8x16(int mask, u32x16 a); 37 | 38 | s8x16 cvt_s32x16_s8x16(s32x16 a); 39 | s8x16 maskz_cvt_s32x16_s8x16(const int mask, s32x16 a); 40 | 41 | void cvtu32x16_store_u8x16(void* base_addr, u32x16 a); 42 | void mask_cvtu32x16_store_u8x16(void* base_addr, int mask, u32x16 a); 43 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_CONVERT_HPP_ 44 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_load.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "vec_load.hpp" 16 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_load.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
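The load helpers below are typically driven in a chunk-plus-tail loop: process full 16-float blocks with the SIMD path, then finish the remainder with scalar code (ele_reduce.cpp uses exactly this shape). A self-contained sketch of the pattern using raw intrinsics, independent of this header's types:

#include <immintrin.h>
#include <cstddef>

// scales x[0..n) in place; illustrative, not a repo API
inline void scale_inplace_demo(float* x, std::size_t n, float s) {
  std::size_t i = 0;
#if __AVX512F__
  const __m512 vs = _mm512_set1_ps(s);
  for (; i + 16 <= n; i += 16)
    _mm512_storeu_ps(x + i, _mm512_mul_ps(_mm512_loadu_ps(x + i), vs));
#endif
  for (; i < n; ++i) x[i] *= s;  // scalar tail (and full fallback without AVX-512)
}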
14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_LOAD_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_LOAD_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | inline fp32x16 load_fp32x16(void const* mem_addr) { 21 | #if __AVX512F__ 22 | return {_mm512_loadu_ps(mem_addr)}; 23 | #else 24 | float const* mem_addr_fp32 = reinterpret_cast<float const*>(mem_addr); 25 | return {_mm256_loadu_ps(mem_addr_fp32), _mm256_loadu_ps(mem_addr_fp32 + 8)}; 26 | #endif 27 | } 28 | template <> 29 | inline fp32x16 load_kernel_t<fp32x16>(const void* src) { 30 | return load_fp32x16(src); 31 | } 32 | inline fp32x16 mask_load_fp32x16(fp32x16 src, int mask, void const* mem_addr) { 33 | #if __AVX512F__ 34 | return {_mm512_mask_loadu_ps(src.first, mask, mem_addr)}; 35 | #else 36 | float const* mem_addr_fp32 = reinterpret_cast<float const*>(mem_addr); 37 | return {_mm256_loadu_ps(mem_addr_fp32), _mm256_loadu_ps(mem_addr_fp32 + 8)};  // WARNING: this fallback ignores `src` and `mask` and loads all 16 lanes; callers must not rely on masking semantics without AVX-512 38 | #endif 39 | } 40 | 41 | inline bf16x16 load_bf16x16(void const* mem_addr) { 42 | __m256i const* mem_addr_bf16 = reinterpret_cast<__m256i const*>(mem_addr); 43 | return {_mm256_loadu_si256(mem_addr_bf16)}; 44 | } 45 | template <> 46 | inline bf16x16 load_kernel_t<bf16x16>(const void* src) { 47 | return load_bf16x16(src); 48 | } 49 | 50 | inline bf16x16 maskz_load_bf16x16(int mask, void const* mem_addr); 51 | 52 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_LOAD_HPP_ 53 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_set.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #include "vec_set.hpp" 16 | 17 | fp32x16 set1_fp32x16(const float x) { 18 | #if __AVX512F__ 19 | return {_mm512_set1_ps(x)}; 20 | #else 21 | return {_mm256_set1_ps(x), _mm256_set1_ps(x)}; 22 | #endif 23 | } 24 | 25 | s8x16 set1_s8x16(const int8_t x) { return {_mm_set1_epi8(x)}; } 26 | 27 | s16x16 set1_s16x16(const int16_t x) { return {_mm256_set1_epi16(x)}; } 28 | 29 | fp16x16 set1_fp16x16(const uint16_t x) { return {_mm256_set1_epi16(x)}; } 30 | 31 | s32x16 set1_s32x16(const int32_t x) { 32 | #if __AVX512F__ 33 | return {_mm512_set1_epi32(x)}; 34 | #else 35 | return {_mm256_set1_epi32(x), _mm256_set1_epi32(x)}; 36 | #endif 37 | } 38 | 39 | s32x16 setzero_s32x16() { 40 | #if __AVX512F__ 41 | return {_mm512_setzero_epi32()}; 42 | #else 43 | return {_mm256_setzero_si256(), _mm256_setzero_si256()}; 44 | #endif 45 | } 46 | 47 | fp32x16 setzero_fp32x16() { 48 | #if __AVX512F__ 49 | return {_mm512_setzero_ps()}; 50 | #else 51 | return {_mm256_setzero_ps(), _mm256_setzero_ps()}; 52 | #endif 53 | } 54 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_set.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | fp32x16 set1_fp32x16(const float x); 21 | REGISTER_KERNEL_T(set1_fp32x16, fp32x16, float); 22 | 23 | s8x16 set1_s8x16(const int8_t x); 24 | REGISTER_KERNEL_T(set1_s8x16, s8x16, int8_t); 25 | 26 | s16x16 set1_s16x16(const int16_t x); 27 | REGISTER_KERNEL_T(set1_s16x16, s16x16, int16_t); 28 | 29 | fp16x16 set1_fp16x16(const uint16_t x); 30 | REGISTER_KERNEL_T(set1_fp16x16, fp16x16, uint16_t); 31 | 32 | s32x16 set1_s32x16(const int32_t x); 33 | REGISTER_KERNEL_T(set1_s32x16, s32x16, int32_t); 34 | 35 | s32x16 setzero_s32x16(); 36 | 37 | fp32x16 setzero_fp32x16(); 38 | 39 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_ 40 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_store.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "vec_store.hpp" 16 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_store.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_STORE_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_STORE_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | inline void store_s8x16(void* mem_addr, s8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); } 21 | inline void store_u8x16(void* mem_addr, u8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); } 22 | template <> 23 | inline void store_kernel_t(void* dst, s8x16 src) { 24 | store_s8x16(dst, src); 25 | } 26 | 27 | inline void mask_store_s8x16(void* mem_addr, const int mask, s8x16 a) { 28 | #ifdef __AVX512F__ 29 | _mm_mask_storeu_epi8(mem_addr, mask, a.first); 30 | #else 31 | __m128i mask_reg = 32 | _mm_set_epi8(mask & 32768, mask & 16384, mask & 8192, mask & 4096, mask & 2048, mask & 1024, mask & 512, 33 | mask & 256, mask & 128, mask & 64, mask & 32, mask & 16, mask & 8, mask & 4, mask & 2, mask & 1); 34 | _mm_maskmoveu_si128(a.first, mask_reg, reinterpret_cast(mem_addr)); 35 | #endif 36 | } 37 | 38 | inline void mask_store_u8x16(void* mem_addr, const int mask, u8x16 a) { 39 | #ifdef __AVX512F__ 40 | _mm_mask_storeu_epi8(mem_addr, mask, a.first); 41 | #else 42 | __m128i mask_reg = 43 | _mm_set_epi8(mask & 32768, mask & 16384, mask & 8192, mask & 4096, mask & 2048, mask & 1024, mask & 512, 44 | mask & 256, mask & 128, mask & 64, mask & 32, mask & 16, mask & 8, mask & 4, mask & 2, mask & 1); 45 | _mm_maskmoveu_si128(a.first, mask_reg, reinterpret_cast(mem_addr)); 46 | #endif 47 | } 48 | 49 | inline void store_fp32x16(void* mem_addr, fp32x16 a) { 50 | #ifdef __AVX512F__ 51 | _mm512_storeu_ps(mem_addr, a.first); 52 | #else 53 | float* mem_addr_fp32 = reinterpret_cast(mem_addr); 54 | _mm256_storeu_ps(mem_addr_fp32, a.first); 55 | _mm256_storeu_ps(mem_addr_fp32 + 8, a.second); 56 | #endif 57 | } 58 | 59 | template <> 60 | inline void store_kernel_t(void* dst, fp32x16 src) { 61 | store_fp32x16(dst, src); 62 | } 63 | 64 | inline void store_bf16x16(void* mem_addr, bf16x16 a) { 65 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(mem_addr), a.first); 66 | } 67 | 68 | template <> 69 | inline void store_kernel_t(void* dst, bf16x16 src) { 70 | store_bf16x16(dst, src); 71 | } 72 | 73 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_STORE_HPP_ 74 | -------------------------------------------------------------------------------- /neural_speed/vectors/ele_reduce.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include "vectors/cpu/vec.hpp" 15 | #include "vectors/ele_reduce.h" 16 | #include <cmath> 17 | 18 | void ne_vec_norm_f32_(const int n, float* s, const float* x) { 19 | float sum = 0.0; 20 | ne_dot_fp32x16_kernel_t k_t; 21 | for (int i = 0; i < n / 16; ++i) { 22 | float tmp; 23 | k_t(reinterpret_cast<void*>(&tmp), reinterpret_cast<const void*>(x + i * 16), 24 | reinterpret_cast<const void*>(x + i * 16)); 25 | sum += tmp; 26 | } 27 | for (int i = n / 16 * 16; i < n; i++) sum += x[i] * x[i]; 28 | *s = sqrtf(sum); 29 | } 30 | 31 | void ne_vec_sum_f32_(const int n, float* s, const float* x) { 32 | float sum = 0.0; 33 | ne_reduce_add_fp32x16_kernel_t k_t; 34 | for (int i = 0; i < n / 16; ++i) { 35 | float tmp; 36 | k_t(reinterpret_cast<void*>(&tmp), reinterpret_cast<const void*>(x + i * 16)); 37 | sum += tmp; 38 | } 39 | for (int i = n / 16 * 16; i < n; i++) sum += x[i]; 40 | *s = sum; 41 | } 42 | 43 | void ne_vec_max_f32_(const int n, float* s, const float* x) { 44 | float max = -INFINITY; 45 | ne_reduce_max_fp32x16_kernel_t k_t; 46 | for (int i = 0; i < n / 16; ++i) { 47 | float tmp; 48 | k_t(reinterpret_cast<void*>(&tmp), reinterpret_cast<const void*>(x + i * 16)); 49 | max = max > tmp ? max : tmp; 50 | } 51 | for (int i = n / 16 * 16; i < n; i++) { 52 | max = x[i] > max ? x[i] : max; 53 | } 54 | *s = max; 55 | } 56 | 57 | void ne_vec_norm_inv_f32_(const int n, float* s, const float* x) { 58 | ne_vec_norm_f32_(n, s, x); 59 | *s = 1.f / (*s); 60 | } 61 | void ne_vec_sum_ggf_(const int n, double* s, const float* x) { 62 | double sum = 0.0;  // accumulate in double; this _ggf variant exists precisely for the higher-precision path 63 | for (int i = 0; i < n; ++i) { 64 | sum += static_cast<double>(x[i]); 65 | } 66 | *s = sum; 67 | } 68 | -------------------------------------------------------------------------------- /neural_speed/vectors/ele_reduce.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
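This header exports the handful of C-callable reductions implemented in ele_reduce.cpp above; the VEC_API machinery only matters when the library is built as a shared object. A hedged usage sketch, assuming the declarations below are in scope and the ne_vec library is linked:

#include <cstdio>
#include <vector>

int reduce_usage_demo() {
  std::vector<float> x(100, 0.5f);
  float sum = 0.f, mx = 0.f;
  ne_vec_sum_f32_(static_cast<int>(x.size()), &sum, x.data());
  ne_vec_max_f32_(static_cast<int>(x.size()), &mx, x.data());
  std::printf("sum=%f max=%f\n", sum, mx);  // expect 50.0 and 0.5
  return 0;
}

The trailing-underscore names distinguish these CPU entry points from the GPU variants under vectors/gpu, which drop the underscore.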
14 | #pragma once 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | #ifdef VEC_SHARED 21 | #if defined(_WIN32) && !defined(__MINGW32__) 22 | #ifdef VEC_BUILD 23 | #define VEC_API __declspec(dllexport) 24 | #else 25 | #define VEC_API __declspec(dllimport) 26 | #endif 27 | #else 28 | #define VEC_API __attribute__((visibility("default"))) 29 | #endif 30 | #else 31 | #define VEC_API 32 | #endif 33 | VEC_API void ne_vec_norm_f32_(const int n, float* s, const float* x); 34 | VEC_API void ne_vec_sum_f32_(const int n, float* s, const float* x); 35 | 36 | VEC_API void ne_vec_sum_ggf_(const int n, double* s, const float* x); 37 | 38 | VEC_API void ne_vec_max_f32_(const int n, float* s, const float* x); 39 | 40 | VEC_API void ne_vec_norm_inv_f32_(const int n, float* s, const float* x); 41 | 42 | #ifdef __cplusplus 43 | } 44 | #endif 45 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | cmake_minimum_required(VERSION 3.11) 15 | project(gpu_vectors) 16 | 17 | set (CMAKE_CXX_COMPILER "icpx") 18 | set (CMAKE_C_COMPILER "icx") 19 | add_compile_options(-fsycl) 20 | add_library(gpu_vectors STATIC ele_wise.cpp ele_reduce.cpp) 21 | set_property(TARGET gpu_vectors PROPERTY POSITION_INDEPENDENT_CODE ON) 22 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/ele_reduce.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include <iostream> 16 | #include "reduce.h" 17 | 18 | static sycl::queue q = sycl::queue(); 19 | // inline static void ne_vec_norm_f32 (const int n, float * s, const float * x) { ne_vec_dot_f32(n, s, x, x); *s = 20 | // sqrtf(*s); } 21 | 22 | void ne_vec_sum_f32(const int n, float* s, const float* x) { reduce<float, sycl::plus<float>, 16>(n, s, x, q); } 23 | 24 | // inline static void ne_vec_sum_ggf(const int n, ne_float * s, const float * x) { 25 | // ne_float sum = 0.0; 26 | // for (int i = 0; i < n; ++i) { 27 | // sum += (ne_float)x[i]; 28 | // } 29 | // *s = sum; 30 | // } 31 | 32 | void ne_vec_max_f32(const int n, float* s, const float* x) { reduce<float, sycl::maximum<float>, 16>(n, s, x, q); } 33 | 34 | // inline static void ne_vec_norm_inv_f32(const int n, float * s, const float * x) { 35 | // ne_vec_norm_f32(n, s, x); 36 | // *s = 1.f/(*s); 37 | // } 38 | 39 | int main() { 40 | size_t n = 32 * 10; 41 | std::vector<float> h_src(n); 42 | std::vector<float> h_dst(n); 43 | for (size_t i = 0; i < n; i++) { 44 | h_src[i] = 1.f; 45 | } 46 | h_src[1] = 5.f; 47 | ne_vec_max_f32(n, h_dst.data(), h_src.data()); 48 | std::cout << h_dst[0] << std::endl; 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/reduce.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include <cassert> 15 | #include <sycl/sycl.hpp> 16 | #include "sycl/reduction.hpp" 17 | 18 | template <typename T, typename BinaryOperation, int VL> 19 | void reduce(const int n, T* s, const T* x, sycl::queue& q) { 20 | assert(n % VL == 0); 21 | 22 | sycl::buffer buf(const_cast<T*>(x), sycl::range<1>(n)); 23 | sycl::buffer sum_buf(s, sycl::range<1>(1)); 24 | BinaryOperation BOp; 25 | q.submit([&](auto& h) { 26 | sycl::accessor buf_acc(buf, h, sycl::read_only); 27 | auto retr = sycl::reduction(sum_buf, h, BOp); 28 | h.parallel_for(sycl::nd_range<1>{n, 32}, retr, [=](sycl::nd_item<1> item, auto& retr_arg) { 29 | int glob_id = item.get_global_id(0); 30 | retr_arg.combine(buf_acc[glob_id]); 31 | }); 32 | }); 33 | } 34 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/test.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
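The kernel in this test is the classic shared-memory tree reduction: the grid launches n/2 work-items, each first combines two global elements, then each block halves the active stride log2(block-size) times, and work-item 0 writes the block's partial result. A scalar model of one block, for illustration (bs must be a power of two):

#include <cstddef>
#include <vector>

inline double block_reduce_model(const std::vector<double>& src, std::size_t g, std::size_t bs) {
  std::vector<double> tmp(bs);
  for (std::size_t l = 0; l < bs; ++l)  // each work-item's first combine of two elements
    tmp[l] = src[g * bs * 2 + l] + src[g * bs * 2 + l + bs];
  for (std::size_t s = bs / 2; s > 0; s >>= 1)  // the in-block halving ladder
    for (std::size_t l = 0; l < s; ++l) tmp[l] += tmp[l + s];
  return tmp[0];  // what work-item 0 stores to dst[g]
}

The host then folds the per-block partials together, which is why main() below finishes the job with a plain loop over h_dst.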
14 | 15 | // Standard C++ includes 16 | #include <iostream> 17 | #include <vector> 18 | 19 | // SYCL include 20 | #include <CL/sycl.hpp> 21 | 22 | constexpr double Pi = 3.1415926535897932384626433; 23 | 24 | template <typename F, typename RA, typename RWA, typename WA> 25 | void reduce(F f, RA src, RWA tmp, WA dst, cl::sycl::nd_item<1> id) { 26 | auto g = id.get_group().get_id(); 27 | auto bs = id.get_local_range().get(0); 28 | auto l = id.get_local_id().get(0); 29 | 30 | auto i = g * bs * 2 + l; 31 | 32 | tmp[l] = f(src[i], src[i + bs]); 33 | 34 | id.barrier(cl::sycl::access::fence_space::local_space); 35 | 36 | // do reduction in shared mem 37 | for (auto s = bs / 2; s > 0; s >>= 1) { 38 | if (l < s) { 39 | tmp[l] = f(tmp[l], tmp[l + s]); 40 | } 41 | id.barrier(cl::sycl::access::fence_space::local_space); 42 | } 43 | 44 | // write result for this block to global mem 45 | if (l == 0) { 46 | dst[g] = tmp[0]; 47 | } 48 | } 49 | 50 | int main() { 51 | using T = double; 52 | 53 | // Size of vectors 54 | size_t n = 8192; 55 | // block size 56 | size_t local_count = 32; 57 | 58 | // Host vectors 59 | std::vector<T> h_src(n); 60 | std::vector<T> h_dst(n); 61 | 62 | // Initialize vectors on host 63 | for (size_t i = 0; i < n; i++) { 64 | auto k = n - i; 65 | h_src[i] = 1.0 / (k * k); 66 | } 67 | 68 | for (size_t i = 0; i < h_dst.size(); i++) { 69 | h_dst[i] = 0; 70 | } 71 | 72 | auto sum = [](auto const& x, auto const& y) { return x + y; }; 73 | 74 | try { 75 | cl::sycl::queue queue{cl::sycl::gpu_selector()}; 76 | std::cout << "Selected platform: " << queue.get_context().get_platform().get_info<cl::sycl::info::platform::name>() 77 | << "\n"; 78 | std::cout << "Selected device: " << queue.get_device().get_info<cl::sycl::info::device::name>() << "\n"; 79 | 80 | cl::sycl::buffer b_src(h_src.data(), n); 81 | cl::sycl::buffer b_dst(h_dst.data(), n); 82 | 83 | cl::sycl::nd_range<1> r(n / 2, local_count); 84 | 85 | queue.submit([&](cl::sycl::handler& cgh) { 86 | auto a_src = b_src.get_access<cl::sycl::access::mode::read>(cgh); 87 | 88 | cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> a_tmp( 89 | cl::sycl::range<1>(local_count), cgh); 90 | 91 | auto a_dst = b_dst.get_access<cl::sycl::access::mode::write>(cgh); 92 | 93 | cgh.parallel_for(r, [=](cl::sycl::nd_item<1> i) { reduce(sum, a_src, a_tmp, a_dst, i); }); 94 | }); 95 | queue.wait(); 96 | } catch (const cl::sycl::exception& e) { 97 | std::cout << "Exception encountered in SYCL: " << e.what() << "\n"; 98 | return -1; 99 | } 100 | 101 | T res = 0.0; 102 | for (size_t i = 0; i < h_dst.size(); i++) { 103 | res = sum(res, h_dst[i]); 104 | } 105 | 106 | std::cout.precision(16); 107 | std::cout << "Riemann zeta(2) approximation by explicit summing:\n"; 108 | std::cout << "result = " << res << "\n"; 109 | std::cout << "exact = " << Pi * Pi / 6.0 << "\n"; 110 | 111 | return 0; 112 | } 113 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/vector_func.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
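The helpers in this header wrap ESIMD's simd<T, VL>::copy_from / copy_to, which move VL contiguous elements between USM memory and the register file. A compilable example would need full DPC++/ESIMD kernel scaffolding, so only a hedged in-kernel usage sketch is shown (chunk index i, 16-wide, assuming float* USM pointers src and dst):

// inside an ESIMD kernel body (sketch only):
//   auto v = usm_copy_from<float, 16>(src, i);  // load chunk i into a simd<float, 16>
//   v = vec_tanh<float, 16>(v);                 // elementwise tanh on all 16 lanes
//   usm_copy_to<float, 16>(dst, v, i);          // store chunk i back

vec_tanh computes tanh(x) = (e^{2x} - 1) / (e^{2x} + 1) from a single exp, trading one extra divide for not needing a native tanh on the device.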
14 | #pragma once 15 | #include <sycl/ext/intel/esimd.hpp> 16 | 17 | template <typename T, int VL> 18 | SYCL_EXTERNAL void usm_copy_from(T* src, sycl::ext::intel::esimd::simd<T, VL> vec, int i) SYCL_ESIMD_FUNCTION {  // NOTE: `vec` is taken by value, so the loaded data is not visible to the caller; prefer the returning overload below 19 | vec.copy_from(src + i * VL); 20 | } 21 | 22 | template <typename T, int VL> 23 | SYCL_EXTERNAL sycl::ext::intel::esimd::simd<T, VL> usm_copy_from(T* src, int i) SYCL_ESIMD_FUNCTION { 24 | sycl::ext::intel::esimd::simd<T, VL> vec; 25 | vec.copy_from(src + i * VL); 26 | return vec; 27 | } 28 | 29 | template <typename T, int VL> 30 | SYCL_EXTERNAL void usm_copy_to(T* dst, sycl::ext::intel::esimd::simd<T, VL> vec, int i) SYCL_ESIMD_FUNCTION { 31 | vec.copy_to(dst + i * VL); 32 | } 33 | 34 | template <typename T, int VL> 35 | SYCL_EXTERNAL void set_value(sycl::ext::intel::esimd::simd<T, VL>& vec, T value) SYCL_ESIMD_FUNCTION { 36 | vec = sycl::ext::intel::esimd::simd<T, VL>(value, 0); 37 | } 38 | 39 | template <typename T, int VL> 40 | SYCL_EXTERNAL sycl::ext::intel::esimd::simd<T, VL> set_value(T value) SYCL_ESIMD_FUNCTION { 41 | return sycl::ext::intel::esimd::simd<T, VL>(value, 0); 42 | } 43 | template <typename T, int VL> 44 | SYCL_EXTERNAL sycl::ext::intel::esimd::simd<T, VL> vec_tanh(sycl::ext::intel::esimd::simd<T, VL> src) 45 | SYCL_ESIMD_FUNCTION { 46 | auto exp2x = sycl::ext::intel::esimd::exp(src * 2.f); 47 | return (exp2x - 1.f) / (exp2x + 1.f); 48 | } 49 | -------------------------------------------------------------------------------- /neural_speed/vectors/parallel_for.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #pragma once 15 | #ifdef GPU_BACKEND 16 | #include <sycl/sycl.hpp> 17 | 18 | template <int VL, typename kernel_t, typename kernel_tail_t> 19 | void parallel_for(sycl::queue& q, size_t size, kernel_t kernel, kernel_tail_t kernel_tail) { 20 | constexpr unsigned GroupSize = 1; 21 | 22 | sycl::range<1> GlobalRange{size / VL}; 23 | sycl::range<1> LocalRange{GroupSize}; 24 | sycl::nd_range<1> Range(GlobalRange, LocalRange); 25 | 26 | sycl::range<1> GlobalRange_tail{size % VL}; 27 | sycl::range<1> LocalRange_tail{GroupSize}; 28 | sycl::nd_range<1> Range_tail(GlobalRange_tail, LocalRange_tail); 29 | 30 | auto e = q.submit([&](sycl::handler& cgh) { cgh.parallel_for(Range, kernel); }); 31 | auto e_tail = q.submit([&](sycl::handler& cgh) { cgh.parallel_for(Range_tail, kernel_tail); }); 32 | e.wait(); 33 | e_tail.wait(); 34 | } 35 | // #endif 36 | 37 | // Example: 38 | // float* input; 39 | // float* output; 40 | // size_t size = 128 + 1; 41 | // size_t VL = 16; 42 | // ...
43 | // Kernel kernel(input, output); 44 | // Kernel_tail kernel_tail(input, output); 45 | // parallel_for<16>(128, kernel, kernel_tail); 46 | template <int VL, typename kernel_t, typename kernel_tail_t> 47 | void parallel_for(size_t size, kernel_t kernel, kernel_tail_t kernel_tail) { 48 | for (size_t i = 0; i + VL <= size; i += VL) {  // stop at the last full VL-wide chunk so the tail loop below does not overlap 49 | kernel(i); 50 | } 51 | for (size_t i = size / VL * VL; i < size; i++) { 52 | kernel_tail(i); 53 | } 54 | } 55 | #endif 56 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | accelerate 3 | cmake 4 | datasets 5 | huggingface_hub 6 | matplotlib 7 | numpy 8 | peft 9 | protobuf<3.20 10 | py-cpuinfo 11 | sentencepiece 12 | setuptools>=61 13 | tiktoken 14 | torch 15 | transformers 16 | transformers_stream_generator 17 | zipfile38 18 | -------------------------------------------------------------------------------- /scripts/cal_acc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import os 15 | import sys 16 | import shutil 17 | import argparse 18 | from ns_evaluator import LMEvalParser 19 | from accuracy import cli_evaluate 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser(description="Evaluate accuracy for a model") 23 | parser.add_argument('--model_name', type=str, default="~/Llama-2-7b-chat-hf") 24 | parser.add_argument('--tasks', type=str, default="lambada_openai") 25 | parser.add_argument("--clear", action="store_true") 26 | parser.add_argument("--use_gptq", action="store_true") 27 | parser.add_argument("--use_awq", action="store_true") 28 | parser.add_argument("--use_autoround", action="store_true") 29 | parser.add_argument('--batch_size', type=int, default=1) 30 | parser.add_argument('--weight_dtype', type=str, default="fp32") 31 | parser.add_argument('--compute_dtype', type=str, default="fp32") 32 | parser.add_argument('--group_size', type=int, default=32) 33 | parser.add_argument('--use_ggml', action="store_true") 34 | parser.add_argument('--alg', type=str, default="sym") 35 | parser.add_argument('--scale_dtype', type=str, default="fp32") 36 | parser.add_argument('--init_from_bin', type=str, default="default_none") 37 | parser.add_argument('--model_format', type=str, default="neural_speed") 38 | args = parser.parse_args() 39 | 40 | model_args=f'pretrained={args.model_name},model_format={args.model_format},dtype=float32,trust_remote_code=True' 41 | # model_args += f'use_gptq={args.use_gptq},use_awq={args.use_awq},use_autoround={args.use_autoround}' 42 | eval_args = LMEvalParser(model="hf", 43 | model_args=model_args, 44 | tasks=f"{args.tasks}", 45 | device="cpu", 46 | batch_size=args.batch_size, 47 | use_gptq=args.use_gptq, 48 | use_autoround=args.use_autoround, 49 | use_awq=args.use_awq, 50 | weight_dtype=args.weight_dtype, 51 |
compute_dtype=args.compute_dtype, 52 | group_size=args.group_size, 53 | use_ggml=args.use_ggml, 54 | alg=args.alg, 55 | scale_dtype=args.scale_dtype, 56 | init_from_bin=args.init_from_bin 57 | ) 58 | results = cli_evaluate(eval_args) 59 | print(results) 60 | 61 | if args.clear and os.path.isdir('runtime_outs'): 62 | shutil.rmtree('runtime_outs') 63 | -------------------------------------------------------------------------------- /scripts/cal_diff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import numpy as np 19 | import argparse 20 | from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM 21 | from neural_speed import Model 22 | 23 | def cmpData(numa, numb): 24 | totalErr = ((numa - numb)**2).sum() 25 | totalNum = (numa**2).sum() 26 | diff2 = np.sqrt(totalErr / totalNum) 27 | 28 | cos = np.dot(numa, numb) / (np.linalg.norm(numa) * np.linalg.norm(numb)) 29 | return {"diff2": diff2, "cos": cos} 30 | 31 | 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser(description="Evaluate diff for a model") 34 | parser.add_argument('--model_name', type=str, default="~/Llama-2-7b-chat-hf") 35 | args = parser.parse_args() 36 | 37 | woq_configs = { 38 | "fp32": {"use_quant":False}, 39 | # "ggml_int4": {"compute_dtype":"int8", "weight_dtype":"int4", "use_ggml":True}, 40 | "jblas_int4": {"compute_dtype":"int8", "weight_dtype":"int4"}, 41 | # "jblas_int8": {"compute_dtype":"bf16", "weight_dtype":"int8"}, 42 | 43 | } 44 | prompt = "What is the meaning of life?" 45 | 46 | model_name = args.model_name 47 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 48 | inputs = tokenizer(prompt, return_tensors="pt") 49 | 50 | pt_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True) 51 | pt_model.eval() 52 | pt_logits = pt_model(input_ids=inputs.input_ids).logits[:, -1] 53 | 54 | for config_type in woq_configs: 55 | itrex_model = Model() 56 | itrex_model.init(model_name, **woq_configs[config_type]) 57 | itrex_logits = itrex_model(inputs.input_ids) 58 | 59 | print(config_type, cmpData(pt_logits.detach().numpy().flatten(), itrex_logits.flatten())) 60 | -------------------------------------------------------------------------------- /scripts/convert.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import sys 17 | from pathlib import Path 18 | from typing import List, Optional 19 | from huggingface_hub import snapshot_download 20 | from neural_speed.convert import convert_model 21 | 22 | def main(args_in: Optional[List[str]] = None) -> None: 23 | parser = argparse.ArgumentParser(description="Convert a PyTorch model to a NE compatible file") 24 | parser.add_argument( 25 | "--outtype", 26 | choices=["f32", "f16"], 27 | help="output format, default: f32", 28 | default="f32", 29 | ) 30 | parser.add_argument( 31 | "--token", 32 | type=str, 33 | help="Access token ID for models that require it (LLaMa2, etc..)", 34 | ) 35 | parser.add_argument( 36 | "--outfile", 37 | type=Path, 38 | required=True, 39 | help="path to write to" 40 | ) 41 | parser.add_argument( 42 | "--format", 43 | type=str, 44 | default="NE", 45 | choices=["NE", "GGUF"], 46 | help="Convert to the GGUF or NE format" 47 | ) 48 | parser.add_argument( 49 | "--use_quantized_model", 50 | action="store_true", 51 | help="use quantized model: awq/gptq/autoround" 52 | ) 53 | parser.add_argument( 54 | "model", 55 | type=Path, 56 | help="directory containing model file or model id" 57 | ) 58 | 59 | args = parser.parse_args(args_in) 60 | 61 | if args.model.exists(): 62 | dir_model = args.model.as_posix() 63 | else: 64 | try: 65 | dir_model = snapshot_download(repo_id=str(args.model), resume_download=True, token=args.token) 66 | except Exception as e: 67 | if getattr(e, "response", None) is not None and e.response.status_code == 401:  # only HTTP errors carry a response; guard so other failures still exit cleanly 68 | print("You are required to input an access token ID for {}, please add it in option --token or download model weights locally".format(args.model)) 69 | sys.exit(f"{e}") 70 | 71 | convert_model(dir_model, args.outfile, args.outtype, format=args.format, use_quantized_model=args.use_quantized_model) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /scripts/load_peft_and_merge.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | from peft import PeftModel 17 | from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM 18 | 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser(description="Load CausalLM and Peft model, then merge and save.") 22 | parser.add_argument( 23 | "--model_name_or_path", 24 | type=str, 25 | required=True, 26 | help="The model checkpoint for weights initialization."
27 | "Set to model id of huggingface model hub or local path to the model.", 28 | ) 29 | parser.add_argument( 30 | "--peft_name_or_path", 31 | type=str, 32 | required=True, 33 | help="The peft model checkpoint for weights initialization." 34 | "Set to model id of huggingface model hub or local path to the model.", 35 | ) 36 | parser.add_argument("--save_path", type=str, default=None, help="Path to save merged model checkpoint.") 37 | args = parser.parse_args() 38 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) 39 | config = AutoConfig.from_pretrained(args.model_name_or_path) 40 | model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, config=config) 41 | model = PeftModel.from_pretrained(model, args.peft_name_or_path) 42 | model = model.merge_and_unload() 43 | save_path = args.save_path 44 | if save_path is None: 45 | save_path = "./{}_{}".format( 46 | args.model_name_or_path.strip('/').split('/')[-1], 47 | args.peft_name_or_path.strip('/').split('/')[-1]) 48 | tokenizer.save_pretrained(save_path) 49 | model.save_pretrained(save_path) 50 | print(f"Merged model saved in {save_path}") 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /scripts/python_api_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | import sys 18 | from transformers import AutoTokenizer, TextStreamer 19 | from neural_speed import Model 20 | 21 | if len(sys.argv) != 2: 22 | print("Usage: python python_api_example.py model_path") 23 | model_name = sys.argv[1] 24 | 25 | prompt = "Once upon a time, a little girl" 26 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 27 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 28 | streamer = TextStreamer(tokenizer) 29 | 30 | model = Model() 31 | # If you want to run GPTQ or AWQ models, just set use_gptq = True or use_awq = True. 32 | model.init(model_name, weight_dtype="int4", compute_dtype="int8") 33 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 34 | -------------------------------------------------------------------------------- /scripts/python_api_example_for_gguf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | import sys 18 | import argparse 19 | from pathlib import Path 20 | from typing import List, Optional 21 | from transformers import AutoTokenizer, TextStreamer 22 | from neural_speed import Model 23 | 24 | # Usage: 25 | # python python_api_example_for_gguf.py \ 26 | # --model_name falcon \ 27 | # --model_path /model_path/falcon-7b \ 28 | # -m /model_path/falcon-7b/ggml-model-f32.gguf 29 | 30 | def main(args_in: Optional[List[str]] = None) -> None: 31 | parser = argparse.ArgumentParser(description="main program llm running") 32 | parser.add_argument("--model_name", type=str, help="Model name: String", required=True) 33 | parser.add_argument("--model_path", type=Path, help="Path to the model: String", required=True) 34 | parser.add_argument("-m", "--model", type=Path, help="Path to the executed model: String", required=True) 35 | parser.add_argument("--format", 36 | type=str, 37 | default="GGUF", 38 | choices=["NE", "GGUF"], 39 | help="convert to the GGUF or NE format") 40 | parser.add_argument( 41 | "-p", 42 | "--prompt", 43 | type=str, 44 | help="Prompt to start generation with: String (default: empty)", 45 | default="Once upon a time", 46 | ) 47 | 48 | args = parser.parse_args(args_in) 49 | print(args) 50 | 51 | gguf_path = args.model.as_posix() 52 | 53 | prompt = args.prompt 54 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) 55 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 56 | streamer = TextStreamer(tokenizer) 57 | 58 | model = Model() 59 | model.init_from_bin(args.model_name, gguf_path) 60 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /scripts/python_api_example_for_gptq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | import sys 18 | import argparse 19 | from pathlib import Path 20 | from transformers import AutoTokenizer, TextStreamer 21 | from neural_speed import Model 22 | from typing import List, Optional 23 | 24 | 25 | def main(args_in: Optional[List[str]] = None) -> None: 26 | parser = argparse.ArgumentParser(description="pythonAPI example for gptq") 27 | parser.add_argument("model", type=Path, help="directory containing model file") 28 | parser.add_argument( 29 | "-p", 30 | "--prompt", 31 | type=str, 32 | help="Prompt to start generation with: String (default: empty)", 33 | default="Once upon a time, a little girl", 34 | ) 35 | args = parser.parse_args(args_in) 36 | 37 | prompt = args.prompt 38 | model_name = args.model 39 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 40 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 41 | streamer = TextStreamer(tokenizer) 42 | 43 | model = Model() 44 | # If you want to run AWQ models, just set use_awq = True. 45 | model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_gptq=True) 46 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /scripts/python_api_example_for_modelscope.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | import sys 18 | from modelscope import AutoTokenizer 19 | from transformers import TextStreamer 20 | from neural_speed import Model 21 | 22 | if len(sys.argv) != 2: 23 | print("Usage: python python_api_example.py model_path") 24 | model_name = sys.argv[1] 25 | 26 | prompt = "Once upon a time, a little girl" 27 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 28 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 29 | streamer = TextStreamer(tokenizer) 30 | 31 | model = Model() 32 | # If you want to run GPTQ or AWQ models, just set use_gptq = True or use_awq = True. 
33 | model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope") 34 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 35 | -------------------------------------------------------------------------------- /scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | lm_eval==0.4.2 2 | -------------------------------------------------------------------------------- /scripts/whisper_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | import sys 18 | from transformers import AutoTokenizer, TextStreamer 19 | from neural_speed import Model 20 | 21 | if len(sys.argv) != 3: 22 | print("Usage: python whisper_example.py model_path and audio_file") 23 | model_name = sys.argv[1] 24 | audio_file = sys.argv[2] 25 | 26 | model = Model() 27 | model.init(model_name, use_ggml=True) 28 | model(audio_file) 29 | -------------------------------------------------------------------------------- /security.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | lm_eval==0.4.2 2 | modelscope==1.13.1 3 | optimum==1.13.2 4 | optimum-intel==1.11.0 5 | zipfile38 6 | -------------------------------------------------------------------------------- /tests/test_modelscope.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2024 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | import sys 18 | from modelscope import AutoTokenizer 19 | from transformers import TextStreamer 20 | from neural_speed import Model 21 | 22 | model_name = "/tf_dataset2/models/pytorch/Qwen-7B" 23 | 24 | prompt = "Once upon a time, a little girl" 25 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 26 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 27 | streamer = TextStreamer(tokenizer) 28 | 29 | model = Model() 30 | # If you want to run GPTQ or AWQ models, just set use_gptq = True or use_awq = True. 31 | model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope") 32 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 33 | --------------------------------------------------------------------------------