├── README.md
├── bigdl-core-xe
    ├── version.txt
    ├── bigdl-core-xe-addons
    │   ├── version.txt
    │   ├── includes
    │   │   ├── norm.h
    │   │   └── utils.h
    │   ├── CMakeLists.txt
    │   ├── xpu_addon_ops.cpp
    │   ├── setup.py
    │   └── norm.cpp
    ├── setup.py
    ├── ggml
    │   ├── quantize.h
    │   ├── CMakeLists.txt
    │   └── quantize.c
    └── build.sh
├── SECURITY.md
└── LICENSE

/README.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/bigdl-core-xe/version.txt:
--------------------------------------------------------------------------------
1 | 2.7.0.dev0
--------------------------------------------------------------------------------
/bigdl-core-xe/bigdl-core-xe-addons/version.txt:
--------------------------------------------------------------------------------
1 | 2.7.0.dev0
--------------------------------------------------------------------------------
/bigdl-core-xe/bigdl-core-xe-addons/includes/norm.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <torch/extension.h>
4 | 
5 | torch::Tensor rms_norm(
6 |     torch::Tensor weight,
7 |     torch::Tensor input,
8 |     double eps
9 | );
10 | 
11 | torch::Tensor layer_norm(
12 |     torch::Tensor input,
13 |     std::optional<torch::Tensor> weight,
14 |     std::optional<torch::Tensor> bias,
15 |     double eps
16 | );
17 | 
--------------------------------------------------------------------------------
/bigdl-core-xe/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import setuptools
3 | 
4 | VERSION = open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
5 |                             'version.txt'), 'r').read().strip()
6 | 
7 | setuptools.setup(
8 |     name='bigdl-core',
9 |     version=VERSION,
10 |     package_dir={".": "."},
11 |     packages=["."],
12 |     package_data={".": ["*.pyd", "*.so"]},
13 |     include_package_data=True,
14 |     ext_modules=[setuptools.Extension(name='no_ext', sources=[])]
15 | )
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | 
3 | ## Report a Vulnerability
4 | 
5 | Please report security issues or vulnerabilities to the [Intel® Security Center].
6 | 
7 | For more information on how Intel® works to resolve security issues, see
8 | [Vulnerability Handling Guidelines].
9 | 
10 | [Intel® Security Center]:https://www.intel.com/security
11 | 
12 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html
13 | 
--------------------------------------------------------------------------------
/bigdl-core-xe/ggml/quantize.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | #include
6 | #include
7 | 
8 | #define QK4_0 64
9 | 
10 | #ifdef GGML_SHARED
11 | #if defined(_WIN32) && !defined(__MINGW32__)
12 | #ifdef GGML_BUILD
13 | #define GGML_API __declspec(dllexport)
14 | #else
15 | #define GGML_API __declspec(dllimport)
16 | #endif
17 | #else
18 | #define GGML_API __attribute__((visibility("default")))
19 | #endif
20 | #else
21 | #define GGML_API
22 | #endif
23 | 
24 | #ifdef __cplusplus
25 | extern "C" {
26 | #endif
27 | #ifdef __ARM_NEON
28 | // we use the built-in 16-bit float type
29 | typedef __fp16 ggml_fp16_t;
30 | #else
31 | typedef uint16_t ggml_fp16_t;
32 | #endif
33 | 
34 | #ifdef __cplusplus
35 | #define RESTRICT __restrict__
36 | #else
37 | #define RESTRICT restrict
38 | #endif
39 | 
40 | // [Change]: added the block_size parameter
41 | GGML_API size_t quantize_q4_0_to_qweight_and_scale(
42 |     const float *src,
43 |     int32_t *qweight,
44 |     ggml_fp16_t *scale,
45 |     int out_features,
46 |     int in_features,
47 |     int block_size);
48 | 
49 | #ifdef __cplusplus
50 | }
51 | #endif
52 | 
--------------------------------------------------------------------------------
/bigdl-core-xe/build.sh:
--------------------------------------------------------------------------------
1 | set -e
2 | 
3 | sk_root=$(python3 -c 'import os; import skbuild; sk_root=os.path.dirname(skbuild.__file__); print(sk_root)')
4 | torch_root=$(python3 -c 'import os; import torch; torch_root=os.path.dirname(torch.__file__); print(torch_root)')
5 | 
6 | cur_path=./bigdl-core-xe-addons
7 | cur_build_path=${cur_path}/build
8 | 
9 | export PYTHON_EXECUTABLE=$(which python3)
10 | 
11 | cmake -GNinja -Wno-dev \
12 |     -DCMAKE_BUILD_TYPE=Release \
13 |     -DCMAKE_MODULE_PATH=${sk_root}/resources/cmake \
14 |     -DCMAKE_PREFIX_PATH=${torch_root} \
15 |     -DCMAKE_CXX_COMPILER=icpx \
16 |     -DCMAKE_CXX_STANDARD=20 \
17 |     -DPython_EXECUTABLE=${PYTHON_EXECUTABLE} \
18 |     -B ${cur_build_path} ${cur_path}
19 | 
20 | cmake --build ${cur_build_path} --config Release -j
21 | 
22 | cp ${cur_build_path}/*.so .
23 | 
24 | cur_path=./ggml
25 | cur_build_path=${cur_path}/build
26 | 
27 | cmake -B ${cur_build_path} ${cur_path}
28 | 
29 | cmake --build ${cur_build_path}
30 | 
31 | cp ${cur_build_path}/*.so .
32 | 
33 | mv ./libquantize.so ./vllm_int4_for_multi_arc.so
34 | 
35 | python3 setup.py clean --all bdist_wheel --plat-name manylinux2010_x86_64 --python-tag py3
36 | 
--------------------------------------------------------------------------------
/bigdl-core-xe/bigdl-core-xe-addons/includes/utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <sycl/sycl.hpp>
4 | #include <torch/extension.h>
5 | 
6 | #if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3
7 | #include <c10/xpu/XPUStream.h>
8 | #else
9 | #include <ipex.h>
10 | #endif
11 | 
12 | namespace utils {
13 | static inline sycl::queue& get_queue(const torch::Device& device) {
14 | #if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3
15 |     return c10::xpu::getCurrentXPUStream(device.index()).queue();
16 | #else
17 |     c10::impl::VirtualGuardImpl impl(device.type());
18 |     c10::Stream c10_stream = impl.getStream(c10::Device(device));
19 |     return xpu::get_queue_from_stream(c10_stream);
20 | #endif
21 | }
22 | 
23 | static inline sycl::event submit_kernel(std::function<void(sycl::handler&)> kernel, const at::Device& device, const char * desc) {
24 |     sycl::queue& queue = get_queue(device);
25 |     sycl::event event = queue.submit(kernel);
26 | #if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 3
27 |     // xpu::profiler_record(desc, event);
28 | #else
29 |     xpu::profiler_record(desc, event);
30 | #endif
31 |     return event;
32 | }
33 | }
34 | 
--------------------------------------------------------------------------------
/bigdl-core-xe/bigdl-core-xe-addons/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.16)
2 | project(bigdl_core LANGUAGES C CXX)
3 | 
4 | find_package(IntelSYCL REQUIRED)
5 | find_package(Torch REQUIRED)
6 | find_package(PythonExtensions REQUIRED)
7 | find_library(TORCH_PYTHON_LIBRARYS torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
8 | 
9 | set(MODULE_NAME bigdl_core)
10 | 
11 | set(SYCL_SRCS norm.cpp)
12 | 
13 | set(ALL_SRCS ${SYCL_SRCS}
14 |              xpu_addon_ops.cpp)
15 | 
16 | if(WIN32)
17 |     add_compile_options(-Wno-ignored-attributes)
18 |     add_python_library(${MODULE_NAME} SHARED SOURCES ${ALL_SRCS})
19 |     python_extension_module(${MODULE_NAME})
20 |     set_target_properties(${MODULE_NAME} PROPERTIES SUFFIX ${PYTHON_EXTENSION_MODULE_SUFFIX})
21 | else()
22 |     add_python_extension(${MODULE_NAME} SOURCES ${ALL_SRCS})
23 | endif()
24 | 
25 | add_compile_definitions(TORCH_EXTENSION_NAME=${MODULE_NAME})
26 | add_sycl_to_target(TARGET ${MODULE_NAME} SOURCES ${SYCL_SRCS})
27 | target_include_directories(${MODULE_NAME} PRIVATE includes)
28 | target_include_directories(${MODULE_NAME} PRIVATE ${TORCH_INCLUDE_DIRS})
29 | target_link_libraries(${MODULE_NAME} ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARYS})
30 | 
--------------------------------------------------------------------------------
/bigdl-core-xe/bigdl-core-xe-addons/xpu_addon_ops.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Copyright 2016 The BigDL Authors.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | //     http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | 17 | 18 | #include 19 | #include 20 | 21 | #include "norm.h" 22 | 23 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 24 | m.def("layer_norm", &layer_norm, "fused layer norm"); 25 | } 26 | 27 | #if TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 6 28 | 29 | TORCH_LIBRARY_FRAGMENT(ipex_llm, m) { 30 | m.def("layer_norm(Tensor input, Tensor? weight, Tensor? bias, float eps) -> Tensor"); 31 | } 32 | 33 | TORCH_LIBRARY_IMPL(ipex_llm, XPU, m) { 34 | m.impl("layer_norm", &layer_norm); 35 | } 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /bigdl-core-xe/bigdl-core-xe-addons/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 The BigDL Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | from setuptools import setup 19 | import torch 20 | import intel_extension_for_pytorch as ipex 21 | from torch.xpu.cpp_extension import DPCPPExtension, DpcppBuildExtension 22 | 23 | VERSION = open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 24 | 'version.txt'), 'r').read().strip() 25 | include_dir = str(os.path.join(os.path.dirname(os.path.realpath(__file__)), 26 | "includes")) 27 | _ipex_version = ipex.__version__ 28 | _major_version = ''.join(_ipex_version.split('.')[:2]) # equal to 21 29 | suffix_name = "-" + _major_version 30 | 31 | setup( 32 | name="bigdl-core-llm" + suffix_name, 33 | version=VERSION, 34 | ext_modules=[ 35 | DPCPPExtension('bigdl_core_llm', [ 36 | 'xpu_addon_ops.cpp', 37 | 'norm.cpp', 38 | ], 39 | extra_compile_args=["-std=c++20"], 40 | include_dirs=[include_dir]) 41 | ], 42 | cmdclass={ 43 | 'build_ext': DpcppBuildExtension.with_options(use_ninja=False) 44 | }) 45 | -------------------------------------------------------------------------------- /bigdl-core-xe/ggml/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | project(llm C CXX) 3 | 4 | 5 | if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) 6 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) 7 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") 8 | endif() 9 | 10 | 11 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/build) 12 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/build) 13 | 14 | 15 | include_directories( 16 | ${CMAKE_CURRENT_SOURCE_DIR} 17 | # ${CMAKE_CURRENT_SOURCE_DIR}/.. 
18 | )
19 | 
20 | if (MSVC)
21 |     if (ONLYAVX)
22 |         # Enable AVX
23 |         add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
24 |         add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
25 |     else()
26 |         # Enable AVX2
27 |         add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
28 |         add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
29 |         if (AVXVNNI)
30 |             add_compile_options(/D__AVXVNNI__)
31 |         endif()
32 |     endif()
33 |     # Disable Warnings
34 |     add_compile_options(/W0)
35 |     # Others
36 |     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
37 |     set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
38 | else()
39 |     if (ONLYAVX)
40 |         add_compile_options(-fPIC -mavx)
41 |     elseif(ONLYAVX2)
42 |         add_compile_options(-fPIC -march=haswell -mtune=haswell)
43 |     else()
44 |         add_compile_options(-fPIC -march=native -mtune=native)
45 |     endif()
46 | endif()
47 | 
48 | add_library(quantize SHARED quantize.h quantize.c)
49 | if (MSVC)
50 |     target_link_libraries(quantize kernel32)
51 | else()
52 |     set_target_properties(quantize PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}")
53 |     target_link_libraries(quantize -ldl)
54 | endif()
--------------------------------------------------------------------------------
/bigdl-core-xe/ggml/quantize.c:
--------------------------------------------------------------------------------
1 | #include "quantize.h"
2 | #include
3 | #include
4 | #include
5 | 
6 | #define MIN(a, b) ((a) < (b) ? (a) : (b))
7 | #ifdef __ARM_NEON
8 | 
9 | // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
10 | //
11 | //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
12 | //
13 | #include <arm_neon.h>
14 | 
15 | #define GGML_COMPUTE_FP16_TO_FP32(x) ((float)(x))
16 | #define GGML_COMPUTE_FP32_TO_FP16(x) (x)
17 | 
18 | #define GGML_FP16_TO_FP32(x) ((float)(x))
19 | #define GGML_FP32_TO_FP16(x) (x)
20 | 
21 | #else
22 | 
23 | static inline bool is_genuine_intel()
24 | {
25 |     unsigned int regs[4];
26 | #ifdef _WIN32
27 |     __cpuid((int *)regs, 0);
28 | #else
29 |     __get_cpuid(0, &regs[0], &regs[1], &regs[2], &regs[3]);
30 | #endif
31 | 
32 |     if (regs[1] == 1970169159 && regs[2] == 1818588270 && regs[3] == 1231384169)
33 |     {
34 |         return true;
35 |     }
36 |     else
37 |     {
38 |         return false;
39 |     }
40 | }
41 | 
42 | static bool is_intel;
43 | 
44 | #if defined(_MSC_VER)
45 | #define _mm256_dpbusd_epi32 _mm256_dpbusd_avx_epi32
46 | #endif
47 | 
48 | #ifdef __F16C__
49 | 
50 | #ifdef _MSC_VER
51 | #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
52 | #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
53 | #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
54 | #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
55 | #else
56 | #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
57 | #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
58 | #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
59 | #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
60 | #endif
61 | 
62 | #elif defined(__POWER9_VECTOR__)
63 | 
64 | #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
65 | #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
66 | /* the inline asm below is about 12% faster than the lookup method */
67 | #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
68 | #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
69 | 
70 | static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h)
71 | {
72 |     register float f;
73 |     register double d;
74 |     __asm__(
75 |         "mtfprd %0,%2\n"
76 |         "xscvhpdp %0,%0\n"
77 |         "frsp %1,%0\n" :
78 |         /* temp */ "=d"(d),
79 |         /* out */ 
"=f"(f) : 80 | /* in */ "r"(h)); 81 | return f; 82 | } 83 | 84 | static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) 85 | { 86 | register double d; 87 | register ggml_fp16_t r; 88 | __asm__(/* xscvdphp can work on double or single precision */ 89 | "xscvdphp %0,%2\n" 90 | "mffprd %1,%0\n" : 91 | /* temp */ "=d"(d), 92 | /* out */ "=r"(r) : 93 | /* in */ "f"(f)); 94 | return r; 95 | } 96 | 97 | #else 98 | 99 | // FP16 <-> FP32 100 | // ref: https://github.com/Maratyszcza/FP16 101 | 102 | static inline float fp32_from_bits(uint32_t w) 103 | { 104 | union 105 | { 106 | uint32_t as_bits; 107 | float as_value; 108 | } fp32; 109 | fp32.as_bits = w; 110 | return fp32.as_value; 111 | } 112 | 113 | static inline uint32_t fp32_to_bits(float f) 114 | { 115 | union 116 | { 117 | float as_value; 118 | uint32_t as_bits; 119 | } fp32; 120 | fp32.as_value = f; 121 | return fp32.as_bits; 122 | } 123 | 124 | static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) 125 | { 126 | const uint32_t w = (uint32_t)h << 16; 127 | const uint32_t sign = w & UINT32_C(0x80000000); 128 | const uint32_t two_w = w + w; 129 | 130 | const uint32_t exp_offset = UINT32_C(0xE0) << 23; 131 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) 132 | const float exp_scale = 0x1.0p-112f; 133 | #else 134 | const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); 135 | #endif 136 | const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; 137 | 138 | const uint32_t magic_mask = UINT32_C(126) << 23; 139 | const float magic_bias = 0.5f; 140 | const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; 141 | 142 | const uint32_t denormalized_cutoff = UINT32_C(1) << 27; 143 | const uint32_t result = sign | 144 | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); 145 | return fp32_from_bits(result); 146 | } 147 | 148 | static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) 149 | { 150 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) 151 | const float scale_to_inf = 0x1.0p+112f; 152 | const float scale_to_zero = 0x1.0p-110f; 153 | #else 154 | const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); 155 | const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); 156 | #endif 157 | float base = (fabsf(f) * scale_to_inf) * scale_to_zero; 158 | 159 | const uint32_t w = fp32_to_bits(f); 160 | const uint32_t shl1_w = w + w; 161 | const uint32_t sign = w & UINT32_C(0x80000000); 162 | uint32_t bias = shl1_w & UINT32_C(0xFF000000); 163 | if (bias < UINT32_C(0x71000000)) 164 | { 165 | bias = UINT32_C(0x71000000); 166 | } 167 | 168 | base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; 169 | const uint32_t bits = fp32_to_bits(base); 170 | const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); 171 | const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); 172 | const uint32_t nonsign = exp_bits + mantissa_bits; 173 | return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); 174 | } 175 | 176 | #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) 177 | #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) 178 | 179 | #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) 180 | #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) 181 | 182 | #endif // __F16C__ 183 | 184 | #endif // __ARM_NEON 185 | 186 | static void quantize_row_q4_0_gptq_reference(const float * RESTRICT in_src, int32_t * RESTRICT out_qweight, ggml_fp16_t * RESTRICT out_scale, int in_features, int block_size) { 187 | // static const int qk = QK4_0; 188 | const int qk = block_size; 189 | 190 | assert(qk > 0 && qk % 8 == 0); 191 | assert(in_features % qk == 0); 192 | 193 | const int nb = in_features / qk; 194 | 195 | for (int i = 0; i < nb; i ++) { 196 | float amax = 0.0f; // absolute max 197 | float max = 0.0f; 198 | 199 | for (int j = 0; j < qk; j++) { 200 | const float v = in_src[i*qk + j]; 201 | if (amax < fabsf(v)) { 202 | amax = fabsf(v); 203 | max = v; 204 | } 205 | } 206 | 207 | const float d = max / -8; 208 | const float id = d ? 1.0f/d : 0.0f; 209 | 210 | // Store the scale -> for each block 211 | out_scale[i] = GGML_FP32_TO_FP16(d); 212 | 213 | for (int j = 0; j < qk; j += 8) { 214 | int index = i * qk + j; 215 | #pragma unroll 8 216 | for (int z = 0; z < 8; z += 1) { 217 | const float x0 = in_src[index + z] * id; 218 | const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); 219 | // For each of the 8 elements 220 | out_qweight[i * (qk / 8) + j / 8] |= (xi0 << (4 * z)); 221 | } 222 | } 223 | } 224 | } 225 | 226 | size_t quantize_q4_0_to_qweight_and_scale( 227 | const float *src, 228 | int32_t *qweight, 229 | ggml_fp16_t *scale, 230 | int out_features, 231 | int in_features, 232 | int block_size) 233 | { 234 | assert(block_size > 0); 235 | assert(in_features % block_size == 0); 236 | 237 | const int n = out_features * in_features; 238 | 239 | int64_t b = 0; 240 | #pragma omp parallel for schedule(dynamic, 1) 241 | for (b = 0; b < n; b += in_features) 242 | { 243 | int32_t *qweight_out = qweight + b / 8; 244 | 245 | ggml_fp16_t *scale_out = scale + b / block_size; 246 | 247 | quantize_row_q4_0_gptq_reference(src + b, qweight_out, scale_out, in_features, block_size); 248 | } 249 | 250 | return 0; 251 | } -------------------------------------------------------------------------------- /bigdl-core-xe/bigdl-core-xe-addons/norm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "utils.h" 5 | 6 | using fp16 = sycl::half; 7 | using ST = at::ScalarType; 8 | 9 | using namespace sycl::ext::intel::esimd; 10 | 11 | template 12 | void rms_norm_kernel( 13 | const void * weight_ptr, 14 | const void * input_ptr, 15 | void * output_ptr, 16 | float eps, 17 | const int input_size, 18 | const int hidden_size, 19 | const at::Device & device 20 | ) { 21 | assert(hidden_size % BS == 0); 22 | assert(hidden_size <= 8 * 1024); 23 | 24 | const int nb = hidden_size / BS; 25 | const int sub_nb = nb / GS; 26 | const int rem_nb = nb % GS; 27 | const int acc_offset = hidden_size * sizeof(IT); 28 | 29 | sycl::range<2> global_size(input_size, GS); 30 | sycl::range<2> local_size(1, GS); 31 | 32 | auto cgf = [&](sycl::handler& handle) { 33 | handle.parallel_for( 34 | sycl::nd_range<2>(global_size, local_size), 35 | [=](sycl::nd_item<2> item) SYCL_ESIMD_KERNEL { 36 | // slm cannot be dynamic size, so use fixed size 37 | slm_init<8 * 1024 * sizeof(IT) + GS * sizeof(float)>(); 38 | 39 | const 
int rid = item.get_global_id(0); 40 | const int tid = item.get_local_id(1); 41 | 42 | const IT * weight = (const IT *)weight_ptr; 43 | const IT * input = (const IT *)input_ptr + hidden_size * (size_t)rid; 44 | IT * output = (IT *)output_ptr + hidden_size * (size_t)rid; 45 | 46 | const int start_blk = sub_nb * tid + std::min(tid, rem_nb); 47 | const int end_blk = start_blk + sub_nb + (tid < rem_nb); 48 | 49 | simd accv = 0; 50 | for (int i = start_blk; i < end_blk; ++i) { 51 | simd xv = block_load(input + i * BS); 52 | slm_block_store(i * BS * sizeof(IT), xv); 53 | simd xv_f32 = xv; 54 | accv += xv_f32 * xv_f32; 55 | } 56 | float acc = sycl::ext::intel::esimd::detail::sum(accv) / hidden_size; 57 | slm_block_store(acc_offset + tid * sizeof(float), acc); 58 | 59 | barrier(); 60 | 61 | simd accs = slm_block_load(acc_offset); 62 | float mean = sycl::ext::intel::esimd::detail::sum(accs); 63 | float scale = rsqrt(mean + eps); 64 | 65 | for (int i = start_blk; i < end_blk; ++i) { 66 | simd xv = slm_block_load(i * BS * sizeof(IT)); 67 | simd yv = block_load(weight + i * BS); 68 | simd result = xv * scale * yv; 69 | block_store(output + i * BS, result); 70 | } 71 | } 72 | ); 73 | }; 74 | 75 | utils::submit_kernel(cgf, device, "rms norm"); 76 | } 77 | 78 | torch::Tensor rms_norm( 79 | torch::Tensor weight, 80 | torch::Tensor input, 81 | double eps 82 | ) { 83 | int64_t input_size = input.size(0); 84 | int64_t hidden_size = input.size(1); 85 | 86 | assert(input.dim() == 2); 87 | assert(weight.dim() == 1); 88 | assert(weight.size(0) == input.size(1)); // hidden_size 89 | assert(input.scalar_type() == weight.scalar_type()); 90 | 91 | auto output = torch::empty({input_size, hidden_size}, 92 | torch::device(input.device()).dtype(input.dtype())); 93 | 94 | auto func = [&] () { 95 | switch (input.scalar_type()) { 96 | case ST::Float: return rms_norm_kernel; 97 | case ST::Half: return rms_norm_kernel; 98 | default: throw std::runtime_error("unsupported dtype, only fp32 and fp16 are supported"); 99 | } 100 | } (); 101 | 102 | func( 103 | weight.data_ptr(), input.data_ptr(), output.data_ptr(), 104 | eps, input_size, hidden_size, 105 | input.device() 106 | ); 107 | 108 | return output; 109 | } 110 | 111 | template 112 | void layer_norm_kernel( 113 | const void * input_ptr, 114 | const uint64_t weight_ptr, // use uint64_t instead of void * to workaround a bug 115 | const uint64_t bias_ptr, // use uint64_t instead of void * to workaround a bug 116 | void * output_ptr, 117 | float eps, 118 | const int input_size, 119 | const int hidden_size, 120 | const at::Device & device 121 | ) { 122 | assert(hidden_size % BS == 0); 123 | assert(hidden_size <= 8 * 1024); 124 | 125 | const int nb = hidden_size / BS; 126 | const int sub_nb = nb / GS; 127 | const int rem_nb = nb % GS; 128 | const int mean_offset = hidden_size * sizeof(IT); 129 | const int var_offset = mean_offset + GS * sizeof(float); 130 | 131 | sycl::range<2> global_size(input_size, GS); 132 | sycl::range<2> local_size(1, GS); 133 | 134 | auto cgf = [&](sycl::handler& handle) { 135 | handle.parallel_for( 136 | sycl::nd_range<2>(global_size, local_size), 137 | [=](sycl::nd_item<2> item) SYCL_ESIMD_KERNEL { 138 | // slm cannot be dynamic size, so use fixed size 139 | slm_init<8 * 1024 * sizeof(IT) + 2 * GS * sizeof(float)>(); 140 | 141 | const int rid = item.get_global_id(0); 142 | const int tid = item.get_local_id(1); 143 | 144 | const IT * input = (const IT *)input_ptr + hidden_size * (size_t)rid; 145 | IT * output = (IT *)output_ptr + hidden_size * (size_t)rid; 
146 | 147 | const int start_blk = sub_nb * tid + std::min(tid, rem_nb); 148 | const int end_blk = start_blk + sub_nb + (tid < rem_nb); 149 | 150 | simd sumv = 0; 151 | for (int i = start_blk; i < end_blk; ++i) { 152 | simd xv = block_load(input + i * BS); 153 | slm_block_store(i * BS * sizeof(IT), xv); 154 | simd xv_f32 = xv; 155 | sumv += xv_f32; 156 | } 157 | float par_mean = sycl::ext::intel::esimd::detail::sum(sumv) / hidden_size; 158 | slm_block_store(mean_offset + tid * sizeof(float), par_mean); 159 | 160 | barrier(); 161 | 162 | simd means = slm_block_load(mean_offset); 163 | float mean = sycl::ext::intel::esimd::detail::sum(means); 164 | 165 | simd varv = 0; 166 | for (int i = start_blk; i < end_blk; ++i) { 167 | simd xv = slm_block_load(i * BS * sizeof(IT)); 168 | varv += (xv - mean) * (xv - mean); 169 | } 170 | float par_var = sycl::ext::intel::esimd::detail::sum(varv) / hidden_size; 171 | slm_block_store(var_offset + tid * sizeof(float), par_var); 172 | 173 | barrier(); 174 | 175 | simd vars = slm_block_load(var_offset); 176 | float var = sycl::ext::intel::esimd::detail::sum(vars); 177 | float scale = rsqrt(var + eps); 178 | 179 | for (int i = start_blk; i < end_blk; ++i) { 180 | simd xv = slm_block_load(i * BS * sizeof(IT)); 181 | simd result = (xv - mean) * scale; 182 | 183 | if (weight_ptr != 0) { 184 | simd yv = block_load((const IT *)weight_ptr + i * BS); 185 | result = result * yv; 186 | } 187 | 188 | if (bias_ptr != 0) { 189 | simd bv = block_load((const IT *)bias_ptr + i * BS); 190 | result = result + bv; 191 | } 192 | 193 | block_store(output + i * BS, result); 194 | } 195 | } 196 | ); 197 | }; 198 | 199 | utils::submit_kernel(cgf, device, "layer norm"); 200 | } 201 | 202 | torch::Tensor layer_norm( 203 | torch::Tensor input, 204 | std::optional weight, 205 | std::optional bias, 206 | double eps 207 | ) { 208 | int64_t input_size = input.size(0); 209 | int64_t hidden_size = input.size(1); 210 | 211 | assert(input.dim() == 2); 212 | assert(!weight || (weight->numel() == hidden_size 213 | && weight->scalar_type() == input.scalar_type() 214 | && weight->is_contiguous())); 215 | assert(!bias || (bias->numel() == hidden_size 216 | && bias->scalar_type() == input.scalar_type() 217 | && bias->is_contiguous())); 218 | 219 | auto output = torch::empty({input_size, hidden_size}, 220 | torch::device(input.device()).dtype(input.dtype())); 221 | 222 | auto func = [&] () { 223 | switch (input.scalar_type()) { 224 | case ST::Float: return layer_norm_kernel; 225 | case ST::Half: return layer_norm_kernel; 226 | default: throw std::runtime_error("unsupported dtype, only fp32 and fp16 are supported"); 227 | } 228 | } (); 229 | 230 | const uint64_t weight_ptr = weight ? (uint64_t)(weight->data_ptr()) : 0; 231 | const uint64_t bias_ptr = bias ? (uint64_t)(bias->data_ptr()) : 0; 232 | 233 | func( 234 | input.data_ptr(), weight_ptr, bias_ptr, output.data_ptr(), 235 | eps, input_size, hidden_size, 236 | input.device() 237 | ); 238 | 239 | return output; 240 | } 241 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------
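For reference, a minimal smoke test of the quantization entry point exported by the ggml library. This is a sketch, not part of the repository: it assumes the library has been built and renamed to vllm_int4_for_multi_arc.so as done in build.sh, and it takes the symbol name and signature from ggml/quantize.h. Note that quantize.c ORs packed nibbles into qweight, so the caller must pass a zero-initialized buffer; the shapes chosen below (out_features=4, in_features=128, block_size=64) are illustrative only.

# smoke_test_quantize.py -- hypothetical usage sketch, not shipped with the repo
import ctypes
import numpy as np

# Load the shared library produced by build.sh (libquantize.so renamed there).
lib = ctypes.CDLL("./vllm_int4_for_multi_arc.so")

fn = lib.quantize_q4_0_to_qweight_and_scale
fn.restype = ctypes.c_size_t
fn.argtypes = [
    ctypes.POINTER(ctypes.c_float),   # const float *src
    ctypes.POINTER(ctypes.c_int32),   # int32_t *qweight (8 x 4-bit values per int32)
    ctypes.POINTER(ctypes.c_uint16),  # ggml_fp16_t *scale (fp16 bit patterns)
    ctypes.c_int,                     # int out_features
    ctypes.c_int,                     # int in_features
    ctypes.c_int,                     # int block_size
]

out_features, in_features, block_size = 4, 128, 64
src = np.random.randn(out_features, in_features).astype(np.float32)
# qweight must start at zero because the C code ORs nibbles into it.
qweight = np.zeros((out_features, in_features // 8), dtype=np.int32)
scale = np.zeros((out_features, in_features // block_size), dtype=np.uint16)

fn(
    src.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    qweight.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
    scale.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)),
    out_features, in_features, block_size,
)

# One fp16 scale per block_size-sized block of each output row.
print(qweight[0, :4], scale[0].view(np.float16))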