├── .clang-format ├── .flake8 ├── .github └── workflows │ └── doxygen.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── Doxyfile ├── LICENSE ├── README.md ├── assets ├── figures │ ├── chat.gif │ ├── chat_demo_gpu.gif │ ├── chat_demo_m1.gif │ ├── coding_demo_gpu.gif │ ├── coding_demo_m1.gif │ ├── overview.png │ ├── smoothquant_intuition.png │ ├── tinychat_logo.png │ ├── vlm_demo │ │ ├── CPR.jpg │ │ ├── Wall_fissure.png │ │ ├── animal_blocking.png │ │ ├── car.png │ │ ├── pedestrian.png │ │ ├── statue.jpg │ │ └── windmill_people.png │ └── vlm_demo_m1.gif └── slides.pdf ├── kernels ├── avx │ ├── matmul_avx_fp32.cc │ ├── matmul_avx_int4.cc │ ├── matmul_avx_int8.cc │ └── matmul_avx_int8_int4.cc ├── cuda │ ├── gemv_cuda.cu │ ├── matmul_int4.cu │ ├── matmul_ref_fp32.cc │ └── matmul_ref_int8.cc ├── matmul.h ├── matmul_imp.cc ├── matmul_int4.cc ├── matmul_int8.cc ├── metal │ ├── Makefile │ ├── download_metal-cpp.sh │ ├── include │ │ ├── MetalMatmulInt4.hpp │ │ └── opParams.h │ ├── kernel │ │ └── op.metal │ ├── matmul_metal_int4.cc │ ├── matmul_metal_int4_imp.cc │ ├── matmul_metal_int4_imp.h │ ├── matmul_ref_fp32.cc │ ├── matmul_ref_int8.cc │ └── src │ │ └── MetalMatmulInt4.cpp ├── neon │ ├── matmul_neon_fp32.cc │ ├── matmul_neon_int4.cc │ ├── matmul_neon_int4_offset.cc │ ├── matmul_neon_int8_int4.cc │ └── matmul_ref_int8.cc ├── pthread_pool.cc ├── pthread_pool.h └── ref │ ├── matmul_ref_fp32.cc │ ├── matmul_ref_int4.cc │ └── matmul_ref_int8.cc ├── llm ├── Makefile ├── application │ ├── README.md │ ├── chat.cc │ └── sts_utils │ │ ├── clean_up.patch │ │ ├── listen │ │ └── speak ├── chat_llama2-13b ├── chat_llama2-7b ├── code ├── half-2.2.0 │ └── include │ │ ├── README.md │ │ └── half.hpp ├── include │ ├── GPTBigCodeTokenizer.h │ ├── Generate.h │ ├── LLaMATokenizer.h │ ├── OPTTokenizer.h │ ├── common.h │ ├── interface.h │ ├── model.h │ ├── nn_modules │ │ ├── Fp32CLIPAttention.h │ │ ├── Fp32CLIPEncoder.h │ │ ├── Fp32CLIPEncoderLayer.h │ │ ├── Fp32CLIPVisionTransformer.h │ │ ├── Fp32GPTBigCodeAttention.h │ │ ├── Fp32GPTBigCodeDecoder.h │ │ ├── Fp32GPTBigCodeDecoderLayer.h │ │ ├── Fp32GPTBigCodeForCausalLM.h │ │ ├── Fp32OPTAttention.h │ │ ├── Fp32OPTDecoder.h │ │ ├── Fp32OPTDecoderLayer.h │ │ ├── Fp32OPTForCausalLM.h │ │ ├── Fp32llamaAttention.h │ │ ├── Fp32llamaDecoder.h │ │ ├── Fp32llamaDecoderLayer.h │ │ ├── Fp32llamaForCausalLM.h │ │ ├── Int4GPTBigCodeAttention.h │ │ ├── Int4GPTBigCodeDecoder.h │ │ ├── Int4GPTBigCodeDecoderLayer.h │ │ ├── Int4GPTBigCodeForCausalLM.h │ │ ├── Int4OPTAttention.h │ │ ├── Int4OPTDecoder.h │ │ ├── Int4OPTDecoderLayer.h │ │ ├── Int4OPTForCausalLM.h │ │ ├── Int4llamaAttention.h │ │ ├── Int4llamaDecoder.h │ │ ├── Int4llamaDecoderLayer.h │ │ ├── Int4llamaForCausalLM.h │ │ ├── Int8OPTAttention.h │ │ ├── Int8OPTDecoder.h │ │ ├── Int8OPTDecoderLayer.h │ │ └── OPTForCausalLM.h │ ├── operators.h │ ├── ops │ │ ├── BMM_F32T.h │ │ ├── BMM_S8T_S8N_F32T.h │ │ ├── BMM_S8T_S8N_S8T.h │ │ ├── Conv2D.h │ │ ├── Embedding.h │ │ ├── Gelu.h │ │ ├── LayerNorm.h │ │ ├── LayerNormQ.h │ │ ├── LlamaRMSNorm.h │ │ ├── RotaryPosEmb.h │ │ ├── W8A8B8O8Linear.h │ │ ├── W8A8B8O8LinearReLU.h │ │ ├── W8A8BFP32OFP32Linear.h │ │ ├── arg_max.h │ │ ├── cuda │ │ │ ├── BMM_F16T.cuh │ │ │ ├── Embedding.cuh │ │ │ ├── LlamaRMSNorm.cuh │ │ │ ├── RotaryPosEmb.cuh │ │ │ └── reduction.cuh │ │ └── linear.h │ ├── profiler.h │ ├── stb_image.h │ └── utils.h ├── mistral ├── models │ ├── llama3_vocab.bin │ ├── llama_vocab.bin │ ├── mistral_vocab.bin │ ├── opt_merges.txt │ ├── opt_vocab.json │ └── 
starcoder_vocab.bin ├── scripts │ ├── chat-13b.sh │ ├── chat.sh │ ├── code.sh │ ├── llava.sh │ ├── vila.sh │ ├── voice_llava.sh │ ├── voice_vila.sh │ └── voicechat.sh ├── src │ ├── GPTBigCodeGenerate.cc │ ├── GPTBigCodeTokenizer.cc │ ├── Generate.cc │ ├── LLaMATokenizer.cc │ ├── OPTGenerate.cc │ ├── OPTTokenizer.cc │ ├── interface.cc │ ├── nn_modules │ │ ├── Fp32CLIPAttention.cc │ │ ├── Fp32CLIPEncoder.cc │ │ ├── Fp32CLIPEncoderLayer.cc │ │ ├── Fp32CLIPVisionTransformer.cc │ │ ├── Fp32GPTBigCodeAttention.cc │ │ ├── Fp32GPTBigCodeDecoder.cc │ │ ├── Fp32GPTBigCodeDecoderLayer.cc │ │ ├── Fp32GPTBigCodeForCausalLM.cc │ │ ├── Fp32OPTAttention.cc │ │ ├── Fp32OPTDecoder.cc │ │ ├── Fp32OPTDecoderLayer.cc │ │ ├── Fp32OPTForCausalLM.cc │ │ ├── Fp32llamaAttention.cc │ │ ├── Fp32llamaDecoder.cc │ │ ├── Fp32llamaDecoderLayer.cc │ │ ├── Fp32llamaForCausalLM.cc │ │ ├── Int4GPTBigCodeAttention.cc │ │ ├── Int4GPTBigCodeDecoder.cc │ │ ├── Int4GPTBigCodeDecoderLayer.cc │ │ ├── Int4GPTBigCodeForCausalLM.cc │ │ ├── Int4OPTAttention.cc │ │ ├── Int4OPTDecoder.cc │ │ ├── Int4OPTDecoderLayer.cc │ │ ├── Int4OPTForCausalLM.cc │ │ ├── Int8OPTAttention.cc │ │ ├── Int8OPTDecoder.cc │ │ ├── Int8OPTDecoderLayer.cc │ │ ├── OPTForCausalLM.cc │ │ ├── cuda │ │ │ ├── Int4llamaAttention.cu │ │ │ ├── Int4llamaDecoder.cu │ │ │ ├── Int4llamaDecoderLayer.cu │ │ │ ├── Int4llamaForCausalLM.cu │ │ │ ├── LLaMA3Generate.cu │ │ │ ├── LLaMAGenerate.cu │ │ │ ├── LLaVAGenerate.cu │ │ │ ├── MistralGenerate.cu │ │ │ └── utils.cu │ │ └── non_cuda │ │ │ ├── Int4llamaAttention.cc │ │ │ ├── Int4llamaDecoder.cc │ │ │ ├── Int4llamaDecoderLayer.cc │ │ │ ├── Int4llamaForCausalLM.cc │ │ │ ├── LLaMA3Generate.cc │ │ │ ├── LLaMAGenerate.cc │ │ │ ├── LLaVAGenerate.cc │ │ │ └── MistralGenerate.cc │ ├── ops │ │ ├── BMM_F32T.cc │ │ ├── BMM_S8T_S8N_F32T.cc │ │ ├── BMM_S8T_S8N_S8T.cc │ │ ├── Conv2D.cc │ │ ├── Gelu.cc │ │ ├── LayerNorm.cc │ │ ├── LayerNormQ.cc │ │ ├── LlamaRMSNorm.cc │ │ ├── RotaryPosEmb.cc │ │ ├── W8A8B8O8Linear.cc │ │ ├── W8A8B8O8LinearReLU.cc │ │ ├── W8A8BFP32OFP32Linear.cc │ │ ├── arg_max.cc │ │ ├── batch_add.cc │ │ ├── cuda │ │ │ ├── BMM_F16T.cu │ │ │ ├── LlamaRMSNorm.cu │ │ │ ├── RotaryPosEmb.cu │ │ │ ├── batch_add.cu │ │ │ ├── embedding.cu │ │ │ ├── linear.cu │ │ │ └── softmax.cu │ │ ├── embedding.cc │ │ ├── linear.cc │ │ └── softmax.cc │ └── utils.cc ├── tests │ ├── cuda │ │ ├── test_Int4llamaAttention.cu │ │ ├── test_Int4llamaDecoder.cu │ │ ├── test_Int4llamaDecoderLayer.cu │ │ ├── test_Int4llamaForCausalLM.cu │ │ └── test_ops.cu │ ├── non_cuda │ │ ├── test_Int4llamaAttention.cc │ │ ├── test_Int4llamaDecoder.cc │ │ ├── test_Int4llamaDecoderLayer.cc │ │ ├── test_Int4llamaForCausalLM.cc │ │ └── test_ops.cc │ ├── test_Fp32OPTAttention.cc │ ├── test_Fp32OPTDecoder.cc │ ├── test_Fp32OPTDecoderLayer.cc │ ├── test_Fp32OPTForCausalLM.cc │ ├── test_Fp32llamaAttention.cc │ ├── test_Fp32llamaDecoder.cc │ ├── test_Fp32llamaDecoderLayer.cc │ ├── test_Fp32llamaForCausalLM.cc │ ├── test_Int8OPTAttention.cc │ ├── test_Int8OPTDecoder.cc │ ├── test_Int8OPTDecoderLayer.cc │ ├── test_LLaMATokenizer.cc │ ├── test_OPTForCausalLM.cc │ ├── test_OPTGenerate.cc │ ├── test_OPTTokenizer.cc │ └── utils_memalloc.h ├── tools │ ├── clip_exporter.py │ ├── copy_rotary_emb.sh │ ├── download_assets.sh │ ├── download_model.py │ ├── download_model_from_dropbox.py │ ├── export_model.sh │ ├── llama3_exporter.py │ ├── llama_exporter.py │ ├── llama_qkv_merger.py │ ├── llava_exporter.py │ ├── mistral_exporter.py │ ├── model_quantizer.py │ ├── opt_smooth_exporter.py │ ├── 
profile.sh │ ├── quantize_and_upload.py │ ├── quantize_constants.py │ ├── quantize_methods.py │ ├── rotary_emb_exporter.py │ ├── starcoder_exporter.py │ ├── test.sh │ ├── upload.py │ ├── upload_to_dropbox.py │ ├── vila_exporter.py │ └── zip_assets.sh ├── vila ├── vila_2.7b ├── voice_mistral ├── voice_vila └── voicechat_setup.sh ├── pyproject.toml └── requirements.txt /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | ColumnLimit: 120 3 | ContinuationIndentWidth: 4 4 | IndentWidth: 4 5 | TabWidth: 4 6 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /.github/workflows/doxygen.yml: -------------------------------------------------------------------------------- 1 | name: Generate and Deploy Doxygen Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: Doxygen Action 16 | uses: mattnotmitt/doxygen-action@v1.1.0 17 | with: 18 | doxyfile-path: "./Doxyfile" # default is ./Doxyfile 19 | working-directory: "." # default is . 20 | 21 | - name: Deploy 22 | uses: peaceiris/actions-gh-pages@v3 23 | with: 24 | github_token: ${{ secrets.GITHUB_TOKEN }} 25 | publish_dir: ./docs 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.a 3 | *.pyc 4 | *.cuu 5 | *.ccc 6 | .DS_Store 7 | .build/ 8 | .cache/ 9 | .direnv/ 10 | .envrc 11 | .swiftpm 12 | .venv 13 | .vs/ 14 | .vscode/ 15 | 16 | llm/assets/ 17 | models/ 18 | *.bin 19 | !llama_vocab.bin 20 | !starcoder_vocab.bin 21 | !mistral_vocab.bin 22 | !llama3_vocab.bin 23 | *.zip 24 | *.txt 25 | !requirements.txt 26 | *.pt 27 | *.json 28 | test_* 29 | !test_*.cc 30 | !test_*.cu 31 | demo 32 | chat 33 | voicechat 34 | profile_* 35 | !profile_*.cc 36 | libtorch/ 37 | checkpoints/ 38 | 39 | output.wav 40 | tmpfile 41 | TTS/ 42 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "json"] 2 | path = json 3 | url = https://github.com/nlohmann/json 4 | [submodule "transformer/json"] 5 | path = llm/json 6 | url = https://github.com/nlohmann/json 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: "code_generator/tflite/.*" 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.0.1 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: mixed-line-ending 8 | args: ["--fix=lf"] 9 | - id: end-of-file-fixer 10 | - id: check-merge-conflict 11 | - id: requirements-txt-fixer 12 | - id: fix-encoding-pragma 13 | args: ["--remove"] 14 | - id: debug-statements 15 | - id: check-toml 16 | - repo: https://github.com/executablebooks/mdformat 17 | rev: 0.7.10 18 | hooks: 19 | - id: mdformat 20 | - repo: https://github.com/psf/black 21 | rev: 22.3.0 22 | hooks: 23 | - id: black 24 | - repo: https://github.com/pycqa/isort 25 | rev: 5.12.0 26 | hooks: 27 | - id: isort 28 | args: ["--sp", 
"pyproject.toml"] 29 | - repo: https://github.com/pycqa/flake8 30 | rev: 4.0.1 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: 34 | - flake8-comprehensions==3.7.0 35 | - flake8-docstrings==1.6.0 36 | - repo: local 37 | hooks: 38 | - id: pylint 39 | name: pylint 40 | entry: pylint 41 | language: system 42 | types: [python] 43 | require_serial: true 44 | - repo: https://github.com/pre-commit/mirrors-mypy 45 | rev: v0.910-1 46 | hooks: 47 | - id: mypy 48 | - repo: https://github.com/pre-commit/mirrors-clang-format 49 | rev: v13.0.0 50 | hooks: 51 | - id: clang-format 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 MIT HAN Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /assets/figures/chat.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/chat.gif -------------------------------------------------------------------------------- /assets/figures/chat_demo_gpu.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/chat_demo_gpu.gif -------------------------------------------------------------------------------- /assets/figures/chat_demo_m1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/chat_demo_m1.gif -------------------------------------------------------------------------------- /assets/figures/coding_demo_gpu.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/coding_demo_gpu.gif -------------------------------------------------------------------------------- /assets/figures/coding_demo_m1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/coding_demo_m1.gif -------------------------------------------------------------------------------- /assets/figures/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/overview.png -------------------------------------------------------------------------------- /assets/figures/smoothquant_intuition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/smoothquant_intuition.png -------------------------------------------------------------------------------- /assets/figures/tinychat_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/tinychat_logo.png -------------------------------------------------------------------------------- /assets/figures/vlm_demo/CPR.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/CPR.jpg -------------------------------------------------------------------------------- /assets/figures/vlm_demo/Wall_fissure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/Wall_fissure.png -------------------------------------------------------------------------------- /assets/figures/vlm_demo/animal_blocking.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/animal_blocking.png
--------------------------------------------------------------------------------
/assets/figures/vlm_demo/car.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/car.png
--------------------------------------------------------------------------------
/assets/figures/vlm_demo/pedestrian.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/pedestrian.png
--------------------------------------------------------------------------------
/assets/figures/vlm_demo/statue.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/statue.jpg
--------------------------------------------------------------------------------
/assets/figures/vlm_demo/windmill_people.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/windmill_people.png
--------------------------------------------------------------------------------
/assets/figures/vlm_demo_m1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo_m1.gif
--------------------------------------------------------------------------------
/assets/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/slides.pdf
--------------------------------------------------------------------------------
/kernels/cuda/matmul_int4.cu:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | 4 | #include "../matmul.h" 5 | 6 | namespace matmul { 7 | 8 | void MatmulOperator::naive_mat_mul_fp16_int4(const struct matmul_params *params) { 9 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 10 | const int block_size = params->block_size; 11 | // CHECK_MATRICES_int4weight(A, B, C); 12 | 13 | naive_float16_t weight; 14 | for (int i = 0; i < C->row; i++) { 15 | for (int j = 0; j < C->column; j++) { 16 | naive_float16_t acc = (naive_float16_t)0.0; 17 | 18 | for (int k = 0; k < B->row; k++) { 19 | naive_float16_t s = params->fp16_scales[(k / block_size) * C->column + j]; 20 | naive_float16_t z = static_cast<naive_float16_t>(8.0f); // TODO: support dynamic zeropoint 21 | naive_float16_t input = A->fp16_data_ptr[i * A->column + k]; 22 | 23 | // order of weights is 0 2 4 6 1 3 5 7: each int32 packs eight 4-bit weights, with even-indexed weights in the low 16 bits and odd-indexed weights in the high 16 bits 24 | if (j % 8 == 0) 25 | weight = ((naive_float16_t)(B->int32_data_ptr[k * B->column + (j / 8)] & 0x0000000F) - z) * s; 26 | else if (j % 8 == 1) 27 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x000F0000) >> 16) - z) * s; 28 | else if (j % 8 == 2) 29 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x000000F0) >> 4) - z) * s; 30 | else if (j % 8 == 3) 31 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x00F00000) >> 20) - z) * s; 32 | else if (j % 8 == 4) 33 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x00000F00) >> 8) - z) * s; 34 | else if (j % 8 == 5) 35 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x0F000000) >> 24) - z) * s; 36 | else if (j % 8 == 6) 37 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x0000F000) >> 12) - z) * s; 38 | else if (j % 8 == 7) 39 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0xF0000000) >> 28) - z) * s; 40 | 41 | acc += input * weight; 42 | // printf("naive_mat_mul_fp16_int4 - s: %f, input: %f, weight: %f, acc: %f\n", static_cast<float>(s), static_cast<float>(input), static_cast<float>(weight), static_cast<float>(acc)); 43 | } 44 | 45 | C->fp16_data_ptr[i * C->column + j] = acc; 46 | } 47 | } 48 | } 49 | 50 | } // namespace matmul 51 |
--------------------------------------------------------------------------------
/kernels/cuda/matmul_ref_fp32.cc:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include "../matmul.h" 8 | 9 | namespace matmul { 10 | void fp32_ref_matmul(const struct matmul_params *params) { 11 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 12 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 13 | 14 | assert(A->column == B->row); 15 | assert(C->row == A->row); 16 | assert(C->column == B->column); 17 | int m = A->row, n = B->column, k = A->column; 18 | 19 | for (int i = 0; i < m; i++) { 20 | for (int j = 0; j < n; j++) { 21 | float acc = 0; 22 | for (int kk = 0; kk < k; kk++) { 23 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 24 | } 25 | 26 | data_C[i * n + j] = acc; 27 | } 28 | } 29 | } 30 | 31 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 32 | fp32_ref_matmul(params); 33 | } 34 | 35 | } // namespace matmul 36 |
--------------------------------------------------------------------------------
/kernels/matmul_imp.cc:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "matmul.h" 9 | 10 | namespace matmul { 11 | 12 | void MatmulOperator::CHECK_MATRICES(const struct matrix *A, const struct matrix *B, const struct matrix *C) { 13 | assert(A->column == B->row); 14 | assert(C->column == B->column); 15 | assert(C->row == A->row); 16 | } 17 | 18 | void MatmulOperator::CHECK_MATRICES_int4weight(const struct matrix *A, const struct matrix *B, const struct matrix *C) { 19 | assert(B->row * B->column == A->column * C->column / 2); 20 | assert(C->row == A->row); 21 | } 22 | 23 | void MatmulOperator::mat_mul_transposed(const struct matmul_params *params) { 24 | int i, j, k; 25 | 26 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 27 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 28 | 29 | for (i = 0; i < C->row; i++) 30 | for (j = 0; j < C->column; j++) { 31 | float acc = 0; 32 | for (k = 0; k < A->column; k++) acc += data_A[i * A->column + k] * data_B[j * B->column + k]; 33 | data_C[i * C->column + j] = acc; 34 | } 35 | } 36 | 37 | float interval_to_ms(struct timeval *start, struct timeval *end) { 38 | float us_seconds = (end->tv_sec - start->tv_sec) * 1000000 + (end->tv_usec - start->tv_usec); 39 | return
us_seconds / 1000; 40 | } 41 | 42 | } // namespace matmul 43 | -------------------------------------------------------------------------------- /kernels/matmul_int8.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "matmul.h" 5 | 6 | namespace matmul { 7 | 8 | void MatmulOperator::naive_mat_mul_int8(const struct matmul_params *params) { 9 | int i, j, k; 10 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 11 | int32_t A_zp = A->qparams.zero_point, C_zp = C->qparams.zero_point; 12 | float A_sc = A->qparams.scale, B_sc = B->qparams.scale, C_sc = C->qparams.scale; 13 | float effective_scale = A_sc * B_sc / C_sc; 14 | int8_t *data_A = A->int8_data_ptr, *data_B = B->int8_data_ptr, *data_C = C->int8_data_ptr; 15 | const int8_t q_min = C->qparams.q_min, q_max = C->qparams.q_max; 16 | CHECK_MATRICES(A, B, C); 17 | 18 | for (i = 0; i < C->row; i++) 19 | for (j = 0; j < C->column; j++) { 20 | int acc = 0; 21 | for (k = 0; k < A->column; k++) 22 | acc += ((int32_t)data_A[i * A->column + k] - A_zp) * data_B[k * B->column + j]; 23 | 24 | acc = (int32_t)((float)acc * effective_scale); 25 | acc -= C_zp; 26 | acc = MAX(acc, q_min); 27 | acc = MIN(acc, q_max); 28 | data_C[i * C->column + j] = (int8_t)acc; 29 | } 30 | } 31 | } // namespace matmul 32 | -------------------------------------------------------------------------------- /kernels/metal/Makefile: -------------------------------------------------------------------------------- 1 | CXX = /opt/homebrew/opt/llvm/bin/clang++ 2 | CXXFLAGS = -std=c++17 -stdlib=libc++ -O3 3 | 4 | # Executable and source files 5 | TEST_TARGET = benchmark 6 | TARGET = $(TEST_TARGET) 7 | KERNEL_SRC = $(wildcard ./src/*.cpp) 8 | 9 | SRC = $(KERNEL_SRC) 10 | INCLUDE_DIRS = -I./metal-cpp -I./include 11 | LIB = -framework Metal -framework Foundation -framework MetalKit 12 | 13 | 14 | # Default target 15 | all: $(TARGET) 16 | 17 | # Linking 18 | benchmark: build_metallib 19 | $(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o benchmark.x app/main.cpp $(SRC) $(LIB) $(LDFLAGS) 20 | 21 | build_air: 22 | xcrun -sdk macosx metal -ffast-math -fno-fast-math $(INCLUDE_DIRS) -c kernel/op.metal -o library.air 23 | 24 | build_metallib: build_air 25 | xcrun -sdk macosx metallib library.air -o default.metallib 26 | 27 | # Clean up 28 | clean: 29 | rm -f benchmark.x library.air library.metallib default.metallib 30 | -------------------------------------------------------------------------------- /kernels/metal/download_metal-cpp.sh: -------------------------------------------------------------------------------- 1 | wget https://developer.apple.com/metal/cpp/files/metal-cpp_macOS13_iOS16.zip 2 | unzip metal-cpp_macOS13_iOS16.zip 3 | -------------------------------------------------------------------------------- /kernels/metal/include/MetalMatmulInt4.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Foundation/Foundation.hpp" 4 | #include "Metal/Metal.hpp" 5 | #include "opParams.h" 6 | 7 | class MetalMatmulInt4 { 8 | public: 9 | MTL::Device *_mDevice; 10 | 11 | // The compute pipeline generated from the compute kernel in the .metal shader file. 12 | MTL::ComputePipelineState *_mMatmulFunctionPSO; 13 | 14 | // The command queue used to pass commands to the device. 15 | MTL::CommandQueue *_mCommandQueue; 16 | 17 | // Buffers to hold data. 
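// A holds the fp32 activations, B the packed int4 weights, Scales the per-group
// dequantization scales, and Result the fp32 output; Params carries the
// MetalMatMulParams (m, n, k, group_size) defined in opParams.h. (Buffer roles
// are inferred from the int4 matmul kernels in this directory.)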
18 | MTL::Buffer *_mBufferA; 19 | MTL::Buffer *_mBufferB; 20 | MTL::Buffer *_mBufferScales; 21 | MTL::Buffer *_mBufferResult; 22 | MTL::Buffer *_mParams; 23 | 24 | // Matmul params 25 | MetalMatMulParams *_mParamsPtr; 26 | 27 | MetalMatmulInt4(MTL::Device *device, MetalMatMulParams param); 28 | ~MetalMatmulInt4(); 29 | 30 | void prepareData(); 31 | void sendComputeCommand(); 32 | void verifyResults(); 33 | 34 | private: 35 | void encodeCommand(MTL::ComputeCommandEncoder *computeEncoder); 36 | void generateRandomFloatData(MTL::Buffer *buffer, int length); 37 | void generateRandomIn4Data(MTL::Buffer *buffer, int length); 38 | }; 39 | -------------------------------------------------------------------------------- /kernels/metal/include/opParams.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | typedef struct { 4 | unsigned int m; 5 | unsigned int n; 6 | unsigned int k; 7 | unsigned int group_size; 8 | } MetalMatMulParams; 9 | -------------------------------------------------------------------------------- /kernels/metal/matmul_metal_int4_imp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "Foundation/Foundation.hpp" 6 | #include "Metal/Metal.hpp" 7 | #include "include/opParams.h" 8 | 9 | typedef struct { 10 | float *A, *C, *scales, *offset; 11 | unsigned char *B; 12 | } MetalMatmulBuffers; 13 | 14 | class MetalMatmulInt4IMP { 15 | public: 16 | static MTL::Device *_mDevice; 17 | 18 | // The compute pipeline generated from the compute kernel in the .metal shader file. 19 | static MTL::ComputePipelineState *_mMatmulFunctionPSO; 20 | 21 | // The command queue used to pass commands to the device. 22 | static MTL::CommandQueue *_mCommandQueue; 23 | 24 | // Buffers to hold data. 
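// Static members: a single Metal device/pipeline/command queue is reused across
// calls, and host pointers are mapped to their MTL::Buffer objects through
// _mumap (see getBufferfromPtr() and allocateSharedMem() below).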
25 | static MTL::Buffer *_mBufferA; 26 | static MTL::Buffer *_mBufferB; 27 | static MTL::Buffer *_mBufferScales; 28 | static MTL::Buffer *_mBufferResult; 29 | static MTL::Buffer *_mParams; 30 | 31 | static std::unordered_map _mumap; 32 | 33 | static bool has_init; 34 | static void init(); 35 | static void run(MetalMatMulParams param, MetalMatmulBuffers *bufferParams); 36 | static void *allocateSharedMem(size_t size); 37 | 38 | static MetalMatMulParams *_mParamsPtr; 39 | static void sendComputeCommand(); 40 | static void encodeCommand(MTL::ComputeCommandEncoder *computeEncoder); 41 | static MTL::Buffer *getBufferfromPtr(void *ptr); 42 | }; 43 | -------------------------------------------------------------------------------- /kernels/metal/matmul_ref_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params *params) { 12 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | acc = acc; 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 | -------------------------------------------------------------------------------- /kernels/pthread_pool.cc: -------------------------------------------------------------------------------- 1 | #include "pthread_pool.h" 2 | #include 3 | #include 4 | #include 5 | 6 | struct pool_queue { 7 | void *arg; 8 | char free; 9 | struct pool_queue *next; 10 | }; 11 | 12 | struct pool { 13 | char cancelled; 14 | void *(*fn)(void *); 15 | unsigned int remaining; 16 | unsigned int nthreads; 17 | struct pool_queue *q; 18 | struct pool_queue *end; 19 | pthread_mutex_t q_mtx; 20 | pthread_cond_t q_cnd; 21 | pthread_t threads[1]; 22 | }; 23 | 24 | static void * thread(void *arg); 25 | 26 | void * pool_start(void * (*thread_func)(void *), unsigned int threads) { 27 | struct pool *p = (struct pool *) malloc(sizeof(struct pool) + (threads-1) * sizeof(pthread_t)); 28 | int i; 29 | 30 | pthread_mutex_init(&p->q_mtx, NULL); 31 | pthread_cond_init(&p->q_cnd, NULL); 32 | p->nthreads = threads; 33 | p->fn = thread_func; 34 | p->cancelled = 0; 35 | p->remaining = 0; 36 | p->end = NULL; 37 | p->q = NULL; 38 | 39 | for (i = 0; i < threads; i++) { 40 | pthread_create(&p->threads[i], NULL, &thread, p); 41 | } 42 | 43 | return p; 44 | } 45 | 46 | void pool_enqueue(void *pool, void *arg, char free) { 47 | struct pool *p = (struct pool *) pool; 48 | struct pool_queue *q = (struct pool_queue *) malloc(sizeof(struct pool_queue)); 49 | q->arg = arg; 50 | q->next = NULL; 51 | q->free = free; 52 | 53 | pthread_mutex_lock(&p->q_mtx); 54 | if (p->end != NULL) p->end->next = q; 55 | if (p->q == NULL) p->q = q; 56 | p->end = q; 57 | p->remaining++; 58 | pthread_cond_signal(&p->q_cnd); 59 | pthread_mutex_unlock(&p->q_mtx); 60 | } 61 | 62 | void pool_wait(void *pool) { 63 | 
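// Block until the queue drains: sleep on q_cnd, which worker threads broadcast
// after finishing a task and decrementing `remaining` (see thread() below).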
struct pool *p = (struct pool *) pool; 64 | 65 | pthread_mutex_lock(&p->q_mtx); 66 | while (!p->cancelled && p->remaining) { 67 | pthread_cond_wait(&p->q_cnd, &p->q_mtx); 68 | } 69 | pthread_mutex_unlock(&p->q_mtx); 70 | } 71 | 72 | void pool_end(void *pool) { 73 | struct pool *p = (struct pool *) pool; 74 | struct pool_queue *q; 75 | int i; 76 | 77 | p->cancelled = 1; 78 | 79 | pthread_mutex_lock(&p->q_mtx); 80 | pthread_cond_broadcast(&p->q_cnd); 81 | pthread_mutex_unlock(&p->q_mtx); 82 | 83 | for (i = 0; i < p->nthreads; i++) { 84 | pthread_join(p->threads[i], NULL); 85 | } 86 | 87 | while (p->q != NULL) { 88 | q = p->q; 89 | p->q = q->next; 90 | 91 | if (q->free) free(q->arg); 92 | free(q); 93 | } 94 | 95 | free(p); 96 | } 97 | 98 | static void * thread(void *arg) { 99 | struct pool_queue *q; 100 | struct pool *p = (struct pool *) arg; 101 | 102 | while (!p->cancelled) { 103 | pthread_mutex_lock(&p->q_mtx); 104 | while (!p->cancelled && p->q == NULL) { 105 | pthread_cond_wait(&p->q_cnd, &p->q_mtx); 106 | } 107 | if (p->cancelled) { 108 | pthread_mutex_unlock(&p->q_mtx); 109 | return NULL; 110 | } 111 | q = p->q; 112 | p->q = q->next; 113 | p->end = (q == p->end ? NULL : p->end); 114 | pthread_mutex_unlock(&p->q_mtx); 115 | 116 | p->fn(q->arg); 117 | 118 | if (q->free) free(q->arg); 119 | free(q); 120 | q = NULL; 121 | 122 | pthread_mutex_lock(&p->q_mtx); 123 | p->remaining--; 124 | pthread_cond_broadcast(&p->q_cnd); 125 | pthread_mutex_unlock(&p->q_mtx); 126 | } 127 | 128 | return NULL; 129 | } 130 |
--------------------------------------------------------------------------------
/kernels/pthread_pool.h:
--------------------------------------------------------------------------------
1 | /** \file 2 | * This file provides prototypes for an implementation of a pthread pool. 3 | */ 4 | 5 | #ifndef __PTHREAD_POOL_H__ #define __PTHREAD_POOL_H__ 6 | /** 7 | * Create a new thread pool. 8 | * 9 | * New tasks should be enqueued with pool_enqueue. thread_func will be called 10 | * once per queued task with its sole argument being the argument given to 11 | * pool_enqueue. 12 | * 13 | * \param thread_func The function executed by each thread for each work item. 14 | * \param threads The number of threads in the pool. 15 | * \return A pointer to the thread pool. 16 | */ 17 | void * pool_start(void * (*thread_func)(void *), unsigned int threads); 18 | 19 | /** 20 | * Enqueue a new task for the thread pool. 21 | * 22 | * \param pool A thread pool returned by pool_start. 23 | * \param arg The argument to pass to the thread worker function. 24 | * \param free If true, the argument will be freed after the task has completed. 25 | */ 26 | void pool_enqueue(void *pool, void *arg, char free); 27 | 28 | /** 29 | * Wait for all queued tasks to be completed. 30 | */ 31 | void pool_wait(void *pool); 32 | 33 | /** 34 | * Stop all threads in the pool. 35 | * 36 | * Note that this function will block until all threads have terminated. 37 | * All queued items will also be freed, along with the pool itself. 38 | * Remaining work item arguments will be freed depending on the free argument to 39 | * pool_enqueue.
40 | */ 41 | void pool_end(void *pool); 42 | 43 | #endif /* __PTHREAD_POOL_H__ */ 44 |
--------------------------------------------------------------------------------
/kernels/ref/matmul_ref_fp32.cc:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params *params) { 12 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 |
--------------------------------------------------------------------------------
/kernels/ref/matmul_ref_int4.cc:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { 12 | int i, j, k; 13 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 14 | const int block_size = params->block_size; 15 | float *scale = params->scales, *offset = params->offset; 16 | 17 | assert(params->block_size == 32); // only block size 32 is supported for now 18 | 19 | for (i = 0; i < C->row; i++) { 20 | for (j = 0; j < C->column; j++) { 21 | float acc = 0; 22 | for (k = 0; k < B->row; k += block_size) { 23 | float s = scale[j * (B->row / 16) + k / 32]; // B->row / 16: B is packed with two 4-bit weights per byte 24 | float o = offset[j * (B->row / 16) + k / 32]; 25 | uint8_t *weight_32_int4 = &B->int4_data_ptr[j * B->row + k / 2]; 26 | float *x_ptr = &A->data_ptr[i * A->column + k]; 27 | for (int qi = 0; qi < block_size / 2; qi++) { 28 | uint8_t packed_int4 = weight_32_int4[qi]; 29 | float deq_0 = (float)(packed_int4 & 0x0F) * s + o; 30 | float deq_1 = (float)(packed_int4 >> 4) * s + o; 31 | acc += *x_ptr++ * deq_0; 32 | acc += *x_ptr++ * deq_1; 33 | } 34 | } 35 | C->data_ptr[i * C->column + j] = acc; 36 | } 37 | } 38 | }; 39 | 40 | } // namespace matmul 41 |
--------------------------------------------------------------------------------
/llm/application/README.md:
--------------------------------------------------------------------------------
1 | ## Demo video of our speech-to-speech chatbot 2 | 3 | - Please find the speech-to-speech demo video using TinyChatEngine [here](https://youtu.be/Bw5Dm3aWMnA?si=CCvZDmq3HwowEQcC). 4 | 5 | ## Instructions to run a speech-to-speech chatbot demo 6 | 7 | - Follow the [instructions](../../README.md) to download and deploy LLaMA2-7B-chat. 8 | 9 | - Configure whisper.cpp. You may need to update the Makefile and ggml.h files of whisper.cpp to get it running. For related issues, please refer to the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repository.
10 | 11 | ```bash 12 | # Get whisper.cpp for speech recognition 13 | cd llm 14 | git clone https://github.com/ggerganov/whisper.cpp 15 | cd whisper.cpp 16 | git checkout a4bb2df 17 | 18 | # Install SDL2 on Linux 19 | sudo apt-get install libsdl2-dev 20 | # Install SDL2 on macOS 21 | brew install sdl2 22 | 23 | git apply ../application/sts_utils/clean_up.patch 24 | bash ./models/download-ggml-model.sh base.en 25 | # NVIDIA GPU (Note: you may need to change the Makefile of whisper.cpp depending on your environment or device) 26 | WHISPER_CUBLAS=1 make -j stream 27 | # Otherwise 28 | make stream 29 | cd ../ 30 | ``` 31 | 32 | - If you have an edge device and want a better TTS program than espeak, download [piper](https://github.com/rhasspy/piper). 33 | 34 | ```bash 35 | mkdir TTS 36 | cd TTS 37 | wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_arm64.tar.gz 38 | tar -xvzf piper_arm64.tar.gz 39 | ``` 40 | 41 | - Download your preferred voice from the [Hugging Face repo](https://huggingface.co/rhasspy/piper-voices/tree/v1.0.0) and drag both the .onnx and .onnx.json files into the TTS directory. 42 | 43 | - Edit the listen shell script under application/sts_utils so that whisper.cpp uses your preferred parameters. 44 | 45 | ```bash 46 | nano application/sts_utils/listen 47 | ``` 48 | 49 | - Edit the speak shell script under application/sts_utils so that the demo uses your preferred TTS program. 50 | 51 | ```bash 52 | nano application/sts_utils/speak 53 | ``` 54 | 55 | - Test each of the submodules to ensure they are working as intended. 56 | 57 | ```bash 58 | ./application/sts_utils/listen 59 | cat tmpfile 60 | ./application/sts_utils/speak hello 61 | ``` 62 | 63 | - Compile and start the voicechat locally. 64 | 65 | ```bash 66 | make -j chat 67 | ./chat -v # chat.exe -v on Windows 68 | ``` 69 |
--------------------------------------------------------------------------------
/llm/application/sts_utils/listen:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | 3 | < tmpfile 29 |
--------------------------------------------------------------------------------
/llm/application/sts_utils/speak:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | 3 | # Usage: 4 | # speak.sh <text> 5 | 6 | # espeak 7 | # macOS: brew install espeak 8 | # Linux: apt-get install espeak 9 | # 10 | #espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$1" 11 | 12 | # for Mac 13 | say "$1" 14 | 15 | # for edge devices 16 | # echo "$1" | ./TTS/piper/piper --model ./TTS/en_US-ryan-low.onnx --output_file output.wav && aplay output.wav 17 | 18 | # Eleven Labs 19 | # To use it, install the elevenlabs module from pip (pip install elevenlabs) 20 | # It's possible to use the API for free with a limited number of characters.
To increase this limit, register at https://beta.elevenlabs.io to get an API key and paste it after 'ELEVEN_API_KEY='. 21 | # Keep the line commented to use the free version without an API key. 22 | # 23 | #export ELEVEN_API_KEY=your_api_key 24 | #wd=$(dirname $0) 25 | #script=$wd/eleven-labs.py 26 | #python3 $script $1 "$1" >/dev/null 2>&1 27 | #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1 28 |
--------------------------------------------------------------------------------
/llm/chat_llama2-13b:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | ./chat LLaMA2_13B_chat INT4 5 3 |
--------------------------------------------------------------------------------
/llm/chat_llama2-7b:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | ./chat LLaMA2_7B_chat INT4 5 3 |
--------------------------------------------------------------------------------
/llm/code:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | ./chat CodeLLaMA_7B_Instruct INT4 5 3 |
--------------------------------------------------------------------------------
/llm/half-2.2.0/include/README.md:
--------------------------------------------------------------------------------
1 | This is the IEEE 754-based half-precision floating-point library by Christian Rau: https://half.sourceforge.net/index.html. 2 |
--------------------------------------------------------------------------------
/llm/include/GPTBigCodeTokenizer.h:
--------------------------------------------------------------------------------
1 | /* 2 | 3 | Adapted from llama.cpp and starcoder.cpp: 4 | https://github.com/ggerganov/llama.cpp 5 | https://github.com/bigcode-project/starcoder.cpp 6 | 7 | */ 8 | 9 | #ifndef GPTBIGCODE_TOKENIZER_H 10 | #define GPTBIGCODE_TOKENIZER_H 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | // 25 | // Vocab utils 26 | // 27 | 28 | std::string trim(const std::string & s); 29 | 30 | std::string replace( 31 | const std::string & s, 32 | const std::string & from, 33 | const std::string & to); 34 | 35 | struct starcoder_vocab { 36 | std::map<std::string, int> token_to_id; 37 | std::map<int, std::string> id_to_token; 38 | std::vector<std::string> special_tokens; 39 | 40 | void add_special_token(const std::string & token); 41 | }; 42 | 43 | /* 44 | * Tokenizer 45 | */ 46 | starcoder_vocab starcoder_init_vocab(const std::string & vocab_file); 47 | 48 | const char* starcoder_id_to_token(starcoder_vocab& vocab, int id); 49 | 50 | int starcoder_tokenize(const starcoder_vocab &vocab, const std::string &text, std::vector<int> &final_tokens, int n_max_tokens); 51 | 52 | #endif 53 |
--------------------------------------------------------------------------------
/llm/include/LLaMATokenizer.h:
--------------------------------------------------------------------------------
1 | /* 2 | 3 | Adapted from llama.cpp: 4 | https://github.com/ggerganov/llama.cpp 5 | 6 | */ 7 | 8 | #ifndef LLaMA_TOKENIZER_H 9 | #define LLaMA_TOKENIZER_H 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | static int llama_token_bos() { return 1; } 21 | 22 | static int llama_token_eos() { return 2; } 23 | 24 | static int llama_token_nl() { return 13; } 25 | 26 | struct llama_vocab { 27 | struct token_score { 28 | std::string tok; 29 | float score; 30 | }; 31 |
32 | std::unordered_map<std::string, int> token_to_id; 33 | std::vector<token_score> id_to_token; 34 | }; 35 | 36 | /* 37 | * Tokenizer 38 | */ 39 | static size_t utf8_len(char src) { 40 | const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; 41 | uint8_t highbits = static_cast<uint8_t>(src) >> 4; 42 | 43 | return lookup[highbits]; 44 | } 45 | 46 | struct llama_sp_symbol { 47 | using index = int; 48 | index prev; 49 | index next; 50 | const char* text; 51 | size_t n; 52 | }; 53 | 54 | struct llama_sp_bigram { 55 | struct comparator { 56 | bool operator()(llama_sp_bigram& l, llama_sp_bigram& r) { 57 | return (l.score < r.score) || (l.score == r.score && l.left > r.left); 58 | } 59 | }; 60 | using queue_storage = std::vector<llama_sp_bigram>; 61 | using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>; 62 | llama_sp_symbol::index left; 63 | llama_sp_symbol::index right; 64 | float score; 65 | size_t size; 66 | }; 67 | 68 | llama_vocab llama_init_vocab(const char* vocab_file); 69 | 70 | const char* llama_id_to_token(const llama_vocab& vocab, int id); 71 | 72 | int llama_tokenize(const llama_vocab& vocab, const char* text, int* tokens, int n_max_tokens, bool add_bos); 73 | 74 | #endif 75 |
--------------------------------------------------------------------------------
/llm/include/OPTTokenizer.h:
--------------------------------------------------------------------------------
1 | #ifndef OPT_TOKENIZER_H 2 | #define OPT_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | // #include // Tricky to support this in windows 22 | #include 23 | 24 | // std::vector OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos); 25 | 26 | struct pair_hash { 27 | template <class T1, class T2> 28 | std::size_t operator()(const std::pair<T1, T2> &p) const { 29 | auto h1 = std::hash<T1>{}(p.first); 30 | auto h2 = std::hash<T2>{}(p.second); 31 | return h1 ^ h2; 32 | } 33 | }; 34 | 35 | class Encoder { 36 | public: 37 | Encoder(std::map<std::string, int> encoder, std::vector<std::pair<std::string, std::string>> bpe_merges); 38 | std::unordered_map<int, std::string> bytes_to_unicode(); 39 | std::set<std::pair<std::string, std::string>> get_pairs(std::vector<std::string> word); 40 | std::string bpe(std::string token); 41 | std::vector<int> encode(std::string text); 42 | std::string decode(std::vector<int> tokens); 43 | 44 | private: 45 | std::map<std::string, int> encoder; 46 | std::map<int, std::string> decoder; 47 | std::unordered_map<int, std::string> byte_encoder; 48 | std::unordered_map<std::string, int> byte_decoder; 49 | std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks; 50 | std::unordered_map<std::string, std::string> cache; 51 | }; 52 | 53 | Encoder get_encoder(std::string vocab_file, std::string bpe_file); 54 | 55 | #endif 56 |
--------------------------------------------------------------------------------
/llm/include/interface.h:
--------------------------------------------------------------------------------
1 | #ifndef INTERFACE_H 2 | #define INTERFACE_H 3 | 4 | void set_print_black(); 5 | void set_print_red(); 6 | void set_print_yellow(); 7 | void set_print_bold_yellow(); 8 | void set_print_blue(); 9 | void set_print_white(); 10 | void set_print_reset(); 11 | 12 | #endif 13 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32CLIPAttention.h:
--------------------------------------------------------------------------------
1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Fp32CLIPAttention_output { 7 | Matrix3D<float> attn_output; 8 | Matrix3D<float> attn_probs_reshaped; 9 | std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value; 10 | }; 11 | struct
Fp32CLIPAttention_input { 12 | Matrix3D<float> hidden_states; 13 | Matrix3D<float> attention_mask; 14 | Matrix3D<float> past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Fp32CLIPAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Fp32CLIPAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, Matrix3D<float> past_key_, 22 | Matrix3D<float> past_value_, bool has_past_key_value_, int layer_idx_) 23 | : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Fp32CLIPAttention { 32 | public: 33 | Fp32CLIPAttention(std::string param_path, const struct model_config config); 34 | Fp32CLIPAttention() {} 35 | static void initialized_memory(const struct model_config config); 36 | struct Fp32CLIPAttention_output forward(const struct Fp32CLIPAttention_input &input); 37 | 38 | private: 39 | void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen); 40 | void shape(Matrix3D<float> unshape, Matrix3D<float> shaped, int sqlen); 41 | // void shape_qkv(Matrix3D<float> unshape, Matrix3D<float> shaped_q, Matrix3D<float> shaped_k, 42 | // Matrix3D<float> shaped_v, int sqlen); 43 | int embed_dim, num_heads, head_dim; 44 | Linear_FP k_proj, v_proj, q_proj, out_proj, qkv_proj; 45 | BMM_F32T qk_bmm, pv_bmm; 46 | std::string profile_name = "Fp32CLIPAttention"; 47 | }; 48 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32CLIPEncoder.h:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include "Fp32CLIPEncoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Fp32CLIPEncoder_output { 10 | Matrix3D<float> last_hidden_state; 11 | std::vector<Matrix3D<float>> past_keys, past_values; 12 | }; 13 | struct Fp32CLIPEncoder_input { 14 | Matrix3D<float> hidden_states; 15 | Matrix3D<float> attention_mask; 16 | std::vector<Matrix3D<float>> past_keys, past_values; 17 | bool has_past_keys_values; 18 | 19 | Fp32CLIPEncoder_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_) 20 | : hidden_states(hidden_states_), attention_mask(attention_mask_) { 21 | has_past_keys_values = false; 22 | } 23 | Fp32CLIPEncoder_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, 24 | std::vector<Matrix3D<float>> past_keys_, std::vector<Matrix3D<float>> past_values_) 25 | : hidden_states(hidden_states_), attention_mask(attention_mask_), past_keys(past_keys_), past_values(past_values_) { 26 | has_past_keys_values = true; 27 | } 28 | }; 29 | 30 | class Fp32CLIPEncoder { 31 | public: 32 | Fp32CLIPEncoder(std::string param_path, const struct model_config config); 33 | Fp32CLIPEncoder(){}; 34 | struct Fp32CLIPEncoder_output forward(const struct Fp32CLIPEncoder_input& input); 35 | std::vector<Fp32CLIPEncoderLayer> layers; 36 | std::string profile_name = "Fp32CLIPEncoder"; 37 | }; 38 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32CLIPEncoderLayer.h:
--------------------------------------------------------------------------------
1 | #include "Fp32CLIPAttention.h" 2 | #include "common.h" 3 | #include "operators.h" 4 | 5 | struct Fp32CLIPEncoderLayer_output { 6 | Matrix3D<float> hidden_states; 7 | Matrix3D<float> attentions; 8 | std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value; 9 | 10 | Fp32CLIPEncoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_, 11 | std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) { 12 | hidden_states = hidden_states_; 13 | attentions = attentions_; 14 | past_key_value = past_key_value_; 15 | }; 16 | }; 17 | struct Fp32CLIPEncoderLayer_input { 18 | Matrix3D<float> hidden_states; 19 | Matrix3D<float> attention_mask; 20 | Matrix3D<float> past_key, past_value; 21 | bool has_past_key_value = false; 22 | 23 | Fp32CLIPEncoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> attention_mask_) { 24 | hidden_states = hidden_states_; 25 | attention_mask = attention_mask_; 26 | has_past_key_value = false; 27 | } 28 | 29 | Fp32CLIPEncoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> attention_mask_, 30 | Matrix3D<float> past_key_, Matrix3D<float> past_value_) { 31 | hidden_states = hidden_states_; 32 | attention_mask = attention_mask_; 33 | past_key = past_key_; 34 | past_value = past_value_; 35 | has_past_key_value = true; 36 | } 37 | }; 38 | 39 | class Fp32CLIPEncoderLayer { 40 | public: 41 | Fp32CLIPEncoderLayer(std::string param_path, const struct model_config config, int layer_idx); 42 | struct Fp32CLIPEncoderLayer_output forward(const struct Fp32CLIPEncoderLayer_input &input); 43 | 44 | int embed_dim, num_attention_heads, hidden_dim, layer_idx; 45 | LayerNorm layer_norm1, layer_norm2; 46 | Linear_FP mlp_fc1, mlp_fc2; 47 | Fp32CLIPAttention attn; 48 | std::string profile_name = "Fp32CLIPEncoderLayer"; 49 | }; 50 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32CLIPVisionTransformer.h:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include "Fp32CLIPEncoder.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Fp32CLIPVisionTransformer_output { 10 | Matrix3D<float> last_hidden_state; 11 | std::vector<Matrix3D<float>> past_keys, past_values; 12 | }; 13 | struct Fp32CLIPVisionTransformer_input { 14 | Matrix3D<float> input_image; 15 | std::vector<Matrix3D<float>> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Fp32CLIPVisionTransformer_input() {} 19 | Fp32CLIPVisionTransformer_input(Matrix3D<float> input_image_) : input_image(input_image_) { has_past_keys_values = false; } 20 | Fp32CLIPVisionTransformer_input(Matrix3D<float> input_image_, std::vector<Matrix3D<float>> past_keys_, 21 | std::vector<Matrix3D<float>> past_values_) 22 | : input_image(input_image_), past_keys(past_keys_), past_values(past_values_) { 23 | has_past_keys_values = true; 24 | } 25 | }; 26 | 27 | class Fp32CLIPVisionTransformer { 28 | public: 29 | Fp32CLIPVisionTransformer(std::string param_path, const struct model_config config, bool is_vila); 30 | Fp32CLIPVisionTransformer(){}; 31 | struct Fp32CLIPVisionTransformer_output forward(const struct Fp32CLIPVisionTransformer_input& input, bool is_vila); 32 | Embedding embed_positions; 33 | Conv2D embed_patch; 34 | LayerNorm pre_layernorm; 35 | Linear_FP mm_proj_0, mm_proj_2; 36 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads, image_size, patch_size, num_patches, num_positions, 37 | projection_dim, mmproj_dim; 38 | std::vector<Fp32CLIPEncoderLayer> layers; 39 | std::string profile_name = "Fp32CLIPVisionTransformer"; 40 | 41 | private: 42 | Fp32CLIPEncoder encoder; 43 | float* patch_embeds_buf; 44 | float* class_embeds_buf; 45 | float* pos_embeds_buf; 46 | float* last_hidden_states_buf; 47 | float* hidden_states_buf; 48 | float* embeddings_buf; 49 | float* mm_proj_0_arr; 50 | float* mm_proj_2_arr; 51 | }; 52 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32GPTBigCodeAttention.h:
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Fp32GPTBigCodeAttention_output { 7 | Matrix3D attn_output; 8 | Matrix3D attn_probs_reshaped; 9 | std::pair, Matrix3D> past_key_value; 10 | }; 11 | struct Fp32GPTBigCodeAttention_input { 12 | Matrix3D hidden_states; 13 | Matrix3D attention_mask; 14 | Matrix3D past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Fp32GPTBigCodeAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Fp32GPTBigCodeAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, Matrix3D past_key_, 22 | Matrix3D past_value_, bool has_past_key_value_, int layer_idx_) 23 | : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Fp32GPTBigCodeAttention { 32 | public: 33 | Fp32GPTBigCodeAttention(std::string param_path, const struct model_config config); 34 | Fp32GPTBigCodeAttention() {} 35 | static void initialized_memory(const struct model_config config); 36 | struct Fp32GPTBigCodeAttention_output forward(const struct Fp32GPTBigCodeAttention_input &input); 37 | 38 | private: 39 | void unshape(Matrix3D shaped, Matrix3D unshape, int sqlen); 40 | void shape_qkv(Matrix3D unshape, Matrix3D shaped_q, Matrix3D shaped_k, 41 | Matrix3D shaped_v, int sqlen); 42 | float scaling; 43 | int embed_dim, num_heads, head_dim, kv_heads, kv_dim; 44 | BMM_F32T qk_bmm, pv_bmm; 45 | Linear_FP c_attn, c_proj; 46 | std::string profile_name = "Fp32GPTBigCodeAttention"; 47 | }; 48 | -------------------------------------------------------------------------------- /llm/include/nn_modules/Fp32GPTBigCodeDecoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Fp32GPTBigCodeDecoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Fp32GPTBigCodeDecoder_output { 10 | Matrix3D last_hidden_state; 11 | std::vector> past_keys, past_values; 12 | }; 13 | struct Fp32GPTBigCodeDecoder_input { 14 | Matrix3D input_ids; 15 | std::vector> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Fp32GPTBigCodeDecoder_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 19 | Fp32GPTBigCodeDecoder_input(Matrix3D input_ids_, std::vector> past_keys_, 20 | std::vector> past_values_) 21 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 22 | has_past_keys_values = true; 23 | } 24 | }; 25 | 26 | class Fp32GPTBigCodeDecoder { 27 | public: 28 | Fp32GPTBigCodeDecoder(std::string param_path, const struct model_config config); 29 | Fp32GPTBigCodeDecoder(){}; 30 | Matrix3D prepare_decoder_attention_mask(int length, int past_length); 31 | Matrix3D get_position_embed(int sql_length, int past_length); 32 | struct Fp32GPTBigCodeDecoder_output forward(const struct Fp32GPTBigCodeDecoder_input& input); 33 | Embedding wte, wpe; 34 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads, max_position_embeddings; 35 | std::vector layers; 36 | LayerNorm ln_f; 37 | std::string profile_name = "Fp32GPTBigCodeDecoder"; 38 | 39 | private: 40 | float* attention_mask_buf; 41 | float* pos_embeds_buf; 42 | 
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32GPTBigCodeDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Fp32GPTBigCodeDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Fp32GPTBigCodeDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32GPTBigCodeDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Fp32GPTBigCodeDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Fp32GPTBigCodeDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                                std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Fp32GPTBigCodeDecoder {
   public:
    Fp32GPTBigCodeDecoder(std::string param_path, const struct model_config config);
    Fp32GPTBigCodeDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Fp32GPTBigCodeDecoder_output forward(const struct Fp32GPTBigCodeDecoder_input& input);
    Embedding wte, wpe;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads, max_position_embeddings;
    std::vector<Fp32GPTBigCodeDecoderLayer> layers;
    LayerNorm ln_f;
    std::string profile_name = "Fp32GPTBigCodeDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32GPTBigCodeDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Fp32GPTBigCodeAttention.h"
#include "common.h"
#include "operators.h"

struct Fp32GPTBigCodeDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Fp32GPTBigCodeDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                                      std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Fp32GPTBigCodeDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Fp32GPTBigCodeDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Fp32GPTBigCodeDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                                     Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Fp32GPTBigCodeDecoderLayer {
   public:
    Fp32GPTBigCodeDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Fp32GPTBigCodeDecoderLayer_output forward(const struct Fp32GPTBigCodeDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNorm ln_1, ln_2;  // from torch_int.nn
    Linear_FP fc1, fc2;
    Fp32GPTBigCodeAttention attn;
    std::string profile_name = "Fp32GPTBigCodeDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32GPTBigCodeForCausalLM.h:
--------------------------------------------------------------------------------
#include "Fp32GPTBigCodeDecoder.h"

struct Fp32GPTBigCodeForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32GPTBigCodeForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Fp32GPTBigCodeForCausalLM_input() {}
    Fp32GPTBigCodeForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
    }
    Fp32GPTBigCodeForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                                    std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Fp32GPTBigCodeForCausalLM {
   public:
    Fp32GPTBigCodeForCausalLM(std::string param_path, const struct model_config config);
    struct Fp32GPTBigCodeForCausalLM_output forward(const struct Fp32GPTBigCodeForCausalLM_input& input);

   private:
    Fp32GPTBigCodeDecoder decoder;
    Linear_FP lm_head;
    std::string profile_name = "Fp32GPTBigCodeForCausalLM";
    float* logits_output;
    float* lm_head_weight;
};
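
// Usage sketch (editorial annotation): a minimal greedy decode loop over this
// interface. `max_new_tokens`, `eos_token_id`, and the argmax helper are
// illustrative assumptions; the real loop lives in src/GPTBigCodeGenerate.cc.
//
//     Fp32GPTBigCodeForCausalLM model("models/StarCoder", config);
//     struct Fp32GPTBigCodeForCausalLM_input in(input_ids);
//     for (int step = 0; step < max_new_tokens; step++) {
//         auto out = model.forward(in);
//         int next = pick_argmax_of_last_row(out.logits);  // hypothetical helper
//         if (next == eos_token_id) break;
//         Matrix3D<int> next_ids(&next, 1, 1, 1);
//         in = Fp32GPTBigCodeForCausalLM_input(next_ids, out.past_keys, out.past_values);
//     }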
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32OPTAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Fp32OPTAttention_output {
    Matrix3D<float> attn_output;
    Matrix3D<float> attn_probs_reshaped;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
};
struct Fp32OPTAttention_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Fp32OPTAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Fp32OPTAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, Matrix3D<float> past_key_,
                           Matrix3D<float> past_value_, bool has_past_key_value_, int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Fp32OPTAttention {
   public:
    Fp32OPTAttention(std::string param_path, const struct model_config config);
    Fp32OPTAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Fp32OPTAttention_output forward(const struct Fp32OPTAttention_input &input);

   private:
    void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen);
    void shpae(Matrix3D<float> unshape, Matrix3D<float> shaped, int sqlen);
    float scaling;
    int embed_dim, num_heads, head_dim;
    BMM_F32T qk_bmm, pv_bmm;
    Linear_FP k_proj, v_proj, q_proj, out_proj;
    std::string profile_name = "Fp32OPTAttention";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32OPTDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Fp32OPTDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Fp32OPTDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32OPTDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Fp32OPTDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Fp32OPTDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                         std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Fp32OPTDecoder {
   public:
    Fp32OPTDecoder(std::string param_path, const struct model_config config);
    Fp32OPTDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Fp32OPTDecoder_output forward(const struct Fp32OPTDecoder_input& input);
    Embedding embed_tokens, embed_positions;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads;
    std::vector<Fp32OPTDecoderLayer> layers;
    LayerNorm final_layer_norm;
    std::string profile_name = "Fp32OPTDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
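
// Note (editorial annotation): prepare_decoder_attention_mask builds the causal
// mask consumed by the attention layers above. A plausible reading of its
// contract, for `length` new tokens on top of `past_length` cached tokens: it
// returns a (1 x length x (past_length + length)) matrix holding 0 where
// attention is allowed and a large negative value where it is blocked, i.e.
//
//     mask[i][j] = 0                    if j <= past_length + i
//     mask[i][j] = large negative       otherwise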
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32OPTDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Fp32OPTAttention.h"
#include "common.h"
#include "operators.h"

struct Fp32OPTDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Fp32OPTDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                               std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Fp32OPTDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Fp32OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Fp32OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                              Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Fp32OPTDecoderLayer {
   public:
    Fp32OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Fp32OPTDecoderLayer_output forward(const struct Fp32OPTDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNorm self_attn_layer_norm, final_layer_norm;  // from torch_int.nn
    Linear_FP fc1, fc2;
    Fp32OPTAttention attn;
    std::string profile_name = "Fp32OPTDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32OPTForCausalLM.h:
--------------------------------------------------------------------------------
#include "Fp32OPTDecoder.h"

struct Fp32OPTForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32OPTForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Fp32OPTForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Fp32OPTForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                             std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Fp32OPTForCausalLM {
   public:
    Fp32OPTForCausalLM(std::string param_path, const struct model_config config);
    struct Fp32OPTForCausalLM_output forward(const struct Fp32OPTForCausalLM_input& input);

   private:
    Fp32OPTDecoder decoder;
    Linear_FP lm_head;
    std::string profile_name = "Fp32OPTForCausalLM";
    float* logits_output;
    float* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32llamaAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Fp32llamaAttention_output {
    Matrix3D<float> attn_output;
    Matrix3D<float> attn_probs_reshaped;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
};
struct Fp32llamaAttention_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Fp32llamaAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Fp32llamaAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_,
                             Matrix3D<float> past_key_, Matrix3D<float> past_value_, bool has_past_key_value_,
                             int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Fp32llamaAttention {
   public:
    Fp32llamaAttention(std::string param_path, const struct model_config config);
    Fp32llamaAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Fp32llamaAttention_output forward(const struct Fp32llamaAttention_input &input);

   private:
    void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen);
    void shape(Matrix3D<float> unshape, Matrix3D<float> shaped, int sqlen);
    int embed_dim, num_heads, head_dim;
    Linear_FP k_proj, v_proj, q_proj, o_proj;
    RotaryPosEmb rotary_pos_emb;
    BMM_F32T qk_bmm, pv_bmm;
    std::string profile_name = "Fp32llamaAttention";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32llamaDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Fp32llamaDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Fp32llamaDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32llamaDecoder_input {
    Matrix3D<int> input_ids;
    Matrix3D<float> image_embed;
    Matrix3D<int> second_input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;
    bool is_llava;

    Fp32llamaDecoder_input() {}
    Fp32llamaDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
        is_llava = false;
    }
    Fp32llamaDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                           std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
        is_llava = false;
    }
    Fp32llamaDecoder_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_, Matrix3D<int> second_input_ids_)
        : input_ids(input_ids_), image_embed(image_embed_), second_input_ids(second_input_ids_) {
        has_past_keys_values = false;
        is_llava = true;
    }
    Fp32llamaDecoder_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_)
        : input_ids(input_ids_), image_embed(image_embed_) {
        has_past_keys_values = false;
        is_llava = true;
    }
};

class Fp32llamaDecoder {
   public:
    Fp32llamaDecoder(std::string param_path, const struct model_config config);
    Fp32llamaDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    struct Fp32llamaDecoder_output forward(const struct Fp32llamaDecoder_input& input);
    Embedding embed_tokens;
    LlamaRMSNorm norm;
    float rms_norm_eps;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads;
    std::vector<Fp32llamaDecoderLayer> layers;
    std::string profile_name = "Fp32llamaDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
    float* inputs_embeds_buf;
    float* first_input_ids_buf;
    float* image_embed_buf;
    float* second_input_ids_buf;
};
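
// Usage sketch (editorial annotation): the LLaVA-style constructors splice an
// image embedding between two halves of the tokenized prompt. The split shown
// here is an assumption about how callers use this interface.
//
//     // Prompt: "<text before image>" + <image> + "<text after image>"
//     Fp32llamaDecoder_input in(ids_before_image,              // Matrix3D<int>
//                               vit_output.last_hidden_state,  // image embedding
//                               ids_after_image);              // Matrix3D<int>
//     // in.is_llava == true; the decoder embeds both id spans and concatenates
//     // [embed(first_ids), image_embed, embed(second_ids)] along the sequence.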
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32llamaDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Fp32llamaAttention.h"
#include "common.h"
#include "operators.h"

struct Fp32llamaDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Fp32llamaDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                                 std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Fp32llamaDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Fp32llamaDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Fp32llamaDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                                Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Fp32llamaDecoderLayer {
   public:
    Fp32llamaDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Fp32llamaDecoderLayer_output forward(const struct Fp32llamaDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    float rms_norm_eps;
    LlamaRMSNorm input_layernorm, post_attention_layernorm;
    Linear_FP gate_proj, down_proj, up_proj;
    Fp32llamaAttention attn;
    std::string profile_name = "Fp32llamaDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32llamaForCausalLM.h:
--------------------------------------------------------------------------------
#include "Fp32llamaDecoder.h"

struct Fp32LlamaForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32LlamaForCausalLM_input {
    Matrix3D<int> input_ids;
    Matrix3D<float> image_embed;
    Matrix3D<int> second_input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;
    bool is_llava;

    Fp32LlamaForCausalLM_input() {}
    Fp32LlamaForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
        is_llava = false;
    }
    Fp32LlamaForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                               std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
        is_llava = false;
    }
    Fp32LlamaForCausalLM_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_, Matrix3D<int> second_input_ids_)
        : input_ids(input_ids_), image_embed(image_embed_), second_input_ids(second_input_ids_) {
        has_past_keys_values = false;
        is_llava = true;
    }
    Fp32LlamaForCausalLM_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_)
        : input_ids(input_ids_), image_embed(image_embed_) {
        has_past_keys_values = false;
        is_llava = true;
    }
};

class Fp32LlamaForCausalLM {
   public:
    Fp32LlamaForCausalLM(std::string param_path, const struct model_config config);

    struct Fp32LlamaForCausalLM_output forward(const struct Fp32LlamaForCausalLM_input& input);

   private:
    Fp32llamaDecoder decoder;
    Linear_FP lm_head;
    std::string profile_name = "Fp32LlamaForCausalLM";
    float* logits_output;
    float* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4GPTBigCodeAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Int4GPTBigCodeAttention_output {
    Matrix3D<float> attn_output;
    Matrix3D<float> attn_probs_reshaped;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
};
struct Int4GPTBigCodeAttention_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Int4GPTBigCodeAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Int4GPTBigCodeAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_,
                                  Matrix3D<float> past_key_, Matrix3D<float> past_value_, bool has_past_key_value_,
                                  int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Int4GPTBigCodeAttention {
   public:
    Int4GPTBigCodeAttention(std::string param_path, const struct model_config config);
    Int4GPTBigCodeAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Int4GPTBigCodeAttention_output forward(const struct Int4GPTBigCodeAttention_input &input);

   private:
    void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen);
    void shape_qkv(Matrix3D<float> unshape, Matrix3D<float> shaped_q, Matrix3D<float> shaped_k,
                   Matrix3D<float> shaped_v, int sqlen);
    int embed_dim, num_heads, head_dim, kv_heads, kv_dim;
    BMM_F32T qk_bmm, pv_bmm;
    Linear_FP_int4 c_attn, c_proj;
    std::string profile_name = "Int4GPTBigCodeAttention";
};
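
// Note (editorial annotation): the Int4* modules mirror their Fp32 twins but
// swap Linear_FP for Linear_FP_int4, i.e. 4-bit quantized weights with float
// activations. A rough mental model of the weight layout, assuming two 4-bit
// values packed per byte with per-group scales and zero points:
//
//     uint8_t packed = (uint8_t)(w0 & 0xF) | (uint8_t)((w1 & 0xF) << 4);  // two int4 weights
//     float dequant  = (float)(q - zero_point) * scale;                   // per group
//
// The exact packing order and group size live in the kernels/ implementations.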
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4GPTBigCodeDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Int4GPTBigCodeDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Int4GPTBigCodeDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Int4GPTBigCodeDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Int4GPTBigCodeDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Int4GPTBigCodeDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                                std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int4GPTBigCodeDecoder {
   public:
    Int4GPTBigCodeDecoder(std::string param_path, const struct model_config config);
    Int4GPTBigCodeDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Int4GPTBigCodeDecoder_output forward(const struct Int4GPTBigCodeDecoder_input& input);
    Embedding wte, wpe;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads, max_position_embeddings;
    std::vector<Int4GPTBigCodeDecoderLayer> layers;
    LayerNorm ln_f;
    std::string profile_name = "Int4GPTBigCodeDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4GPTBigCodeDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Int4GPTBigCodeAttention.h"
#include "common.h"
#include "operators.h"

struct Int4GPTBigCodeDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Int4GPTBigCodeDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                                      std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Int4GPTBigCodeDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Int4GPTBigCodeDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Int4GPTBigCodeDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                                     Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Int4GPTBigCodeDecoderLayer {
   public:
    Int4GPTBigCodeDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Int4GPTBigCodeDecoderLayer_output forward(const struct Int4GPTBigCodeDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNorm ln_1, ln_2;  // from torch_int.nn
    Linear_FP_int4 fc1, fc2;
    Int4GPTBigCodeAttention attn;
    std::string profile_name = "Int4GPTBigCodeDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4GPTBigCodeForCausalLM.h:
--------------------------------------------------------------------------------
#include "Int4GPTBigCodeDecoder.h"

struct Int4GPTBigCodeForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Int4GPTBigCodeForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Int4GPTBigCodeForCausalLM_input() {}
    Int4GPTBigCodeForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
    }
    Int4GPTBigCodeForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                                    std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int4GPTBigCodeForCausalLM {
   public:
    Int4GPTBigCodeForCausalLM(std::string param_path, const struct model_config config);
    struct Int4GPTBigCodeForCausalLM_output forward(std::string param_path,
                                                    const struct Int4GPTBigCodeForCausalLM_input& input);

   private:
    Int4GPTBigCodeDecoder decoder;
    Linear_FP_int4 lm_head;
    std::string profile_name = "Int4GPTBigCodeForCausalLM";
    float* logits_output;
    uint8_t* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4OPTAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Int4OPTAttention_output {
    Matrix3D<float> attn_output;
    Matrix3D<float> attn_probs_reshaped;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
};
struct Int4OPTAttention_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Int4OPTAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Int4OPTAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, Matrix3D<float> past_key_,
                           Matrix3D<float> past_value_, bool has_past_key_value_, int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Int4OPTAttention {
   public:
    Int4OPTAttention(std::string param_path, const struct model_config config);
    Int4OPTAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Int4OPTAttention_output forward(const struct Int4OPTAttention_input &input);

   private:
    void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen);
    void shpae(Matrix3D<float> unshape, Matrix3D<float> shaped, int sqlen);
    float scaling;
    int embed_dim, num_heads, head_dim;
    BMM_F32T qk_bmm, pv_bmm;
    Linear_FP_int4 k_proj, v_proj, q_proj, out_proj;
    std::string profile_name = "Int4OPTAttention";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4OPTDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Int4OPTDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Int4OPTDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Int4OPTDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Int4OPTDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Int4OPTDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                         std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int4OPTDecoder {
   public:
    Int4OPTDecoder(std::string param_path, const struct model_config config);
    Int4OPTDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Int4OPTDecoder_output forward(const struct Int4OPTDecoder_input& input);
    Embedding embed_tokens, embed_positions;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads;
    std::vector<Int4OPTDecoderLayer> layers;
    LayerNorm final_layer_norm;
    std::string profile_name = "Int4OPTDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4OPTDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Int4OPTAttention.h"
#include "common.h"
#include "operators.h"

struct Int4OPTDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Int4OPTDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                               std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Int4OPTDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Int4OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Int4OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                              Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Int4OPTDecoderLayer {
   public:
    Int4OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Int4OPTDecoderLayer_output forward(const struct Int4OPTDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNorm self_attn_layer_norm, final_layer_norm;  // from torch_int.nn
    Linear_FP_int4 fc1, fc2;
    Int4OPTAttention attn;
    std::string profile_name = "Int4OPTDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4OPTForCausalLM.h:
--------------------------------------------------------------------------------
#include "Int4OPTDecoder.h"

struct Int4OPTForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Int4OPTForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Int4OPTForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Int4OPTForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                             std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int4OPTForCausalLM {
   public:
    Int4OPTForCausalLM(std::string param_path, const struct model_config config);
    struct Int4OPTForCausalLM_output forward(const struct Int4OPTForCausalLM_input& input);

   private:
    Int4OPTDecoder decoder;
    Linear_FP_int4 lm_head;
    std::string profile_name = "Int4OPTForCausalLM";
    float* logits_output;
    uint8_t* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4llamaForCausalLM.h:
--------------------------------------------------------------------------------
#include "Int4llamaDecoder.h"

struct Int4LlamaForCausalLM_output {
    Matrix3D<float> logits;
#ifdef QM_CUDA
    std::vector<Matrix3D<float16_t>> past_keys, past_values;
#else
    std::vector<Matrix3D<float>> past_keys, past_values;
#endif
};
struct Int4LlamaForCausalLM_input {
    Matrix3D<int> input_ids;
    Matrix3D<float> image_embed;
    Matrix3D<int> second_input_ids;
    bool has_past_keys_values;
    bool is_llava;
#ifdef QM_CUDA
    std::vector<Matrix3D<float16_t>> past_keys, past_values;
#else
    std::vector<Matrix3D<float>> past_keys, past_values;
#endif

    Int4LlamaForCausalLM_input() {}
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
        is_llava = false;
    }
#ifdef QM_CUDA
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float16_t>> past_keys_,
                               std::vector<Matrix3D<float16_t>> past_values_)
#else
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                               std::vector<Matrix3D<float>> past_values_)
#endif
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
        is_llava = false;
    }
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_, Matrix3D<int> second_input_ids_)
        : input_ids(input_ids_), image_embed(image_embed_), second_input_ids(second_input_ids_) {
        has_past_keys_values = false;
        is_llava = true;
    }
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_)
        : input_ids(input_ids_), image_embed(image_embed_) {
        has_past_keys_values = false;
        is_llava = true;
    }
};

class Int4LlamaForCausalLM {
   public:
    Int4LlamaForCausalLM(std::string param_path, const struct model_config config);
    Int4LlamaForCausalLM(){};
    struct Int4LlamaForCausalLM_output forward(std::string param_path, const struct Int4LlamaForCausalLM_input& input);
    float* logits_output = nullptr;
#ifdef QM_CUDA
    void free_cuda_memory();
    int* lm_head_weight = nullptr;
    float16_t* logits_output_half = nullptr;
#else
    uint8_t* lm_head_weight;
#endif

   private:
    std::string profile_name = "Int4LlamaForCausalLM";
    Int4llamaDecoder decoder;
#ifdef QM_CUDA
    Linear_half_int4 lm_head;
#else
    Linear_FP_int4 lm_head;
#endif
};
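
// Note (editorial annotation): QM_CUDA selects the GPU build at compile time;
// the KV cache and lm_head switch from float / Linear_FP_int4 to
// float16_t / Linear_half_int4. The flag name is taken from the #ifdefs above;
// exactly how it is set (e.g. a -DQM_CUDA define in the Makefile) is an
// assumption here.
//
//     // CPU build (illustrative):  g++  ... application/chat.cc
//     // GPU build (illustrative):  nvcc ... -DQM_CUDA application/chat.cc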
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int8OPTAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Int8OPTAttention_output {
    Matrix3D<int8_t> attn_output;
    Matrix3D<int8_t> attn_probs_reshaped;
    std::pair<Matrix3D<int8_t>, Matrix3D<int8_t>> past_key_value;
};
struct Int8OPTAttention_input {
    Matrix3D<int8_t> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<int8_t> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Int8OPTAttention_input(Matrix3D<int8_t> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Int8OPTAttention_input(Matrix3D<int8_t> hidden_states_, Matrix3D<float> attention_mask_,
                           Matrix3D<int8_t> past_key_, Matrix3D<int8_t> past_value_, bool has_past_key_value_,
                           int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Int8OPTAttention {
   public:
    Int8OPTAttention(std::string param_path, const struct model_config config, BMM_S8T_S8N_F32T &qk_bmm,
                     BMM_S8T_S8N_S8T &pv_bmm, W8A8B8O8Linear &k_proj, W8A8B8O8Linear &v_proj, W8A8B8O8Linear &q_proj,
                     W8A8BFP32OFP32Linear &out_proj);
    Int8OPTAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Int8OPTAttention_output forward(const struct Int8OPTAttention_input &input);

   private:
    void unshape(Matrix3D<int8_t> shaped, Matrix3D<int8_t> unshape, int sqlen);
    void shpae(Matrix3D<int8_t> unshape, Matrix3D<int8_t> shaped, int sqlen);
    int embed_dim, num_heads, head_dim;
    BMM_S8T_S8N_F32T qk_bmm;
    BMM_S8T_S8N_S8T pv_bmm;
    W8A8B8O8Linear k_proj, v_proj, q_proj;
    W8A8BFP32OFP32Linear out_proj;
    std::string profile_name = "Int8OPTAttention";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int8OPTDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Int8OPTDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Int8OPTDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<int8_t>> past_keys, past_values;
};
struct Int8OPTDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<int8_t>> past_keys, past_values;
    bool has_past_keys_values;

    Int8OPTDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Int8OPTDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<int8_t>> past_keys_,
                         std::vector<Matrix3D<int8_t>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int8OPTDecoder {
   public:
    Int8OPTDecoder(std::string param_path, const struct model_config config);
    Int8OPTDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Int8OPTDecoder_output forward(const struct Int8OPTDecoder_input& input);
    Embedding embed_tokens, embed_positions;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads;
    std::vector<Int8OPTDecoderLayer> layers;
    LayerNorm final_layer_norm;
    std::string profile_name = "Int8OPTDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
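
// Note (editorial annotation): in the W8A8... op names, W/A/B/O spell out the
// dtypes of Weights, Activations, Bias, and Output, so W8A8B8O8Linear is fully
// int8 while W8A8BFP32OFP32Linear keeps bias and output in fp32. This is the
// SmoothQuant-style pipeline: requantization scales (alpha, beta) are folded
// into each op, which is why the attention module receives its quantized ops
// pre-built through the constructor instead of loading them itself. A plausible
// dataflow, given the member types above:
//
//     int8 x --W8A8B8O8Linear--> int8 q/k/v --BMM_S8T_S8N_F32T--> fp32 scores
//            --softmax + requant--> int8 probs --BMM_S8T_S8N_S8T--> int8 ctx
//            --W8A8BFP32OFP32Linear--> fp32 attn_output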
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int8OPTDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Int8OPTAttention.h"
#include "common.h"
#include "operators.h"

struct Int8OPTDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<int8_t> attentions;
    std::pair<Matrix3D<int8_t>, Matrix3D<int8_t>> past_key_value;

    Int8OPTDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<int8_t> attentions_,
                               std::pair<Matrix3D<int8_t>, Matrix3D<int8_t>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Int8OPTDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<int8_t> past_key, past_value;
    bool has_past_key_value = false;

    Int8OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Int8OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                              Matrix3D<int8_t> past_key_, Matrix3D<int8_t> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Int8OPTDecoderLayer {
   public:
    Int8OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx,
                        LayerNormQ self_attn_layer_norm, LayerNormQ final_layer_norm, W8A8B8O8LinearReLU fc1,
                        W8A8BFP32OFP32Linear fc2, BMM_S8T_S8N_F32T qk_bmm, BMM_S8T_S8N_S8T pv_bmm,
                        W8A8B8O8Linear k_proj, W8A8B8O8Linear v_proj, W8A8B8O8Linear q_proj,
                        W8A8BFP32OFP32Linear out_proj);
    struct Int8OPTDecoderLayer_output forward(const struct Int8OPTDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNormQ self_attn_layer_norm, final_layer_norm;  // from torch_int.nn
    W8A8B8O8LinearReLU fc1;
    W8A8BFP32OFP32Linear fc2;
    Int8OPTAttention attn;
    std::string profile_name = "Int8OPTDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/OPTForCausalLM.h:
--------------------------------------------------------------------------------
#include "Int8OPTDecoder.h"

struct OPTForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<int8_t>> past_keys, past_values;
};
struct OPTForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<int8_t>> past_keys, past_values;
    bool has_past_keys_values;

    OPTForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    OPTForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<int8_t>> past_keys_,
                         std::vector<Matrix3D<int8_t>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class OPTForCausalLM {
   public:
    OPTForCausalLM(std::string param_path, const struct model_config config);
    struct OPTForCausalLM_output forward(const struct OPTForCausalLM_input& input);

   private:
    Int8OPTDecoder decoder;
    Linear_FP lm_head;
    std::string profile_name = "OPTForCausalLM";
    float* logits_output;
    float* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/operators.h:
--------------------------------------------------------------------------------
#ifndef OPERATORS_H
#define OPERATORS_H
#include <cmath>

#include "common.h"
#include "matmul.h"

#define BLK_SIZE 16
// #define NUM_THREAD 8
extern int NUM_THREAD;

// include all ops
#include "ops/BMM_F32T.h"
#include "ops/BMM_S8T_S8N_F32T.h"
#include "ops/BMM_S8T_S8N_S8T.h"
#include "ops/Embedding.h"
#include "ops/LayerNorm.h"
#include "ops/LayerNormQ.h"
#include "ops/LlamaRMSNorm.h"
#include "ops/RotaryPosEmb.h" 21 | #include "ops/W8A8B8O8Linear.h" 22 | #include "ops/W8A8B8O8LinearReLU.h" 23 | #include "ops/W8A8BFP32OFP32Linear.h" 24 | #include "ops/arg_max.h" 25 | #include "ops/linear.h" 26 | #include "ops/Conv2D.h" 27 | #include "ops/Gelu.h" 28 | 29 | void softmax(const Matrix3D &input, Matrix3D &output, int dim); 30 | void batch_Add(const Matrix3D &input, const Matrix3D &input2, Matrix3D &output); 31 | template 32 | void linear(Matrix3D &a, Matrix3D &b, Matrix3D &c); 33 | 34 | 35 | #ifdef QM_CUDA 36 | #include "ops/cuda/BMM_F16T.cuh" 37 | #include "ops/cuda/Embedding.cuh" 38 | #include "ops/cuda/LlamaRMSNorm.cuh" 39 | #include "ops/cuda/RotaryPosEmb.cuh" 40 | 41 | __global__ void batch_Add_float(const Matrix3D input, const Matrix3D input2, Matrix3D output); 42 | __global__ void batch_Add_cuda(const Matrix3D input, const Matrix3D input2, 43 | Matrix3D output); 44 | __global__ void batch_Add_cuda_half2(Matrix3D input, Matrix3D input2, Matrix3D output); 45 | __global__ void softmax_float(Matrix3D input, Matrix3D output); 46 | __global__ void softmax_cuda(Matrix3D input, Matrix3D output); 47 | #endif 48 | 49 | #endif // OPERATORS_H 50 | -------------------------------------------------------------------------------- /llm/include/ops/BMM_F32T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | class BMM_F32T { 4 | public: 5 | BMM_F32T(float _alpha); 6 | BMM_F32T(){}; 7 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 8 | void forward_weight_untransposed(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 9 | float alpha; 10 | 11 | private: 12 | std::string profile_name = "BMM_F32T"; 13 | }; 14 | 15 | void load_BMM_F32T(BMM_F32T &op, std::string prefix); 16 | -------------------------------------------------------------------------------- /llm/include/ops/BMM_S8T_S8N_F32T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct BMM_S8T_S8N_F32T_params { 4 | float alpha; 5 | }; 6 | 7 | class BMM_S8T_S8N_F32T { 8 | public: 9 | BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T_params ¶ms_); 10 | BMM_S8T_S8N_F32T(){}; 11 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 12 | struct matmul_params params; 13 | float alpha; 14 | 15 | private: 16 | std::string profile_name = "BMM_S8T_S8N_F32T"; 17 | }; 18 | 19 | void load_BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T &op, std::string prefix); 20 | -------------------------------------------------------------------------------- /llm/include/ops/BMM_S8T_S8N_S8T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct BMM_S8T_S8N_S8T_params { 4 | float alpha; 5 | }; 6 | 7 | class BMM_S8T_S8N_S8T { 8 | public: 9 | BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T_params ¶ms_); 10 | BMM_S8T_S8N_S8T(){}; 11 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 12 | struct matmul_params params; 13 | float alpha; 14 | 15 | private: 16 | std::string profile_name = "BMM_S8T_S8N_S8T"; 17 | }; 18 | 19 | void load_BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T &op, std::string prefix); 20 | -------------------------------------------------------------------------------- /llm/include/ops/Conv2D.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | 4 | struct Conv2D_params { 5 | Matrix4D weight; 6 | Matrix3D bias; 7 | int stride_width = 1; 8 | int 
--------------------------------------------------------------------------------
/llm/include/ops/Conv2D.h:
--------------------------------------------------------------------------------
#include "common.h"
#include <limits>

struct Conv2D_params {
    Matrix4D<float> weight;
    Matrix3D<float> bias;
    int stride_width = 1;
    int stride_height = 1;
    int dilation_width_factor = 1;
    int dilation_height_factor = 1;
    int padding_width = 0;
    int padding_height = 0;
    float float_activation_min = -std::numeric_limits<float>::max();
    float float_activation_max = std::numeric_limits<float>::max();
};

class Conv2D {
   public:
    Conv2D(Conv2D_params params_) : params(params_){};
    Conv2D(){};
    void forward(const Matrix3D<float> &input, Matrix3D<float> &output);
    struct Conv2D_params params;
    bool has_bias = false;

   private:
    std::string profile_name = "Conv2D";
};

void load_Conv2D(Conv2D &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/Embedding.h:
--------------------------------------------------------------------------------
#include <cassert>

#include "common.h"

class Embedding {
   public:
    Embedding(int embed_dim_, int voc_size_, int padding_idx_, Matrix3D<float> lookup_)
        : embed_dim(embed_dim_), voc_size(voc_size_), padding_idx(padding_idx_), lookup(lookup_) {
        assert(lookup_.m_dim_y == voc_size_);
        assert(lookup_.m_dim_z == embed_dim_);
    }
    Embedding(){};
    void forward(Matrix3D<int> input_id, Matrix3D<float> output);
    int embed_dim, voc_size, padding_idx;
    Matrix3D<float> lookup;

   private:
    std::string profile_name = "Embedding";
};

void load_Embedding_params(Embedding &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/Gelu.h:
--------------------------------------------------------------------------------
#include "common.h"

float Gelu_imp(float x);
void Gelu(Matrix3D<float> a);
float Gelu_quick_imp(float x);
void Gelu_quick(Matrix3D<float> a);
--------------------------------------------------------------------------------
/llm/include/ops/LayerNorm.h:
--------------------------------------------------------------------------------
#include "common.h"

struct LayerNorm_params {
    Matrix3D<float> weight;
    Matrix3D<float> bias;
};

class LayerNorm {
   public:
    LayerNorm(LayerNorm_params params_) : params(params_){};
    LayerNorm(){};
    void forward(const Matrix3D<float> &x, Matrix3D<float> &output);
    struct LayerNorm_params params;

   private:
    std::string profile_name = "LayerNorm";
};

void load_LayerNorm(LayerNorm &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/LayerNormQ.h:
--------------------------------------------------------------------------------
#include "common.h"

struct LayerNormQ_params {
    Matrix3D<float> weight;
    Matrix3D<float> bias;
};

class LayerNormQ {
   public:
    LayerNormQ(LayerNormQ_params &params_) : params(params_){};
    LayerNormQ(){};
    void forward(const Matrix3D<float> &x, Matrix3D<int8_t> &output);
    struct LayerNormQ_params params;

   private:
    std::string profile_name = "LayerNormQ";
};

void load_LayerNormQ(LayerNormQ &op, std::string prefix);
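
// Note (editorial annotation): LayerNorm and LayerNormQ share the classic
// normalization, y = (x - mean(x)) / sqrt(var(x) + eps) * weight + bias;
// LayerNormQ additionally quantizes the fp32 result to int8 so it can feed the
// W8A8 linears directly. The eps value and the quantization rounding mode live
// in the .cc implementations.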
--------------------------------------------------------------------------------
/llm/include/ops/LlamaRMSNorm.h:
--------------------------------------------------------------------------------
#include "common.h"
#include "utils.h"

class LlamaRMSNorm {
   public:
    LlamaRMSNorm(Matrix3D<float> _weight) : weight(_weight){};
    LlamaRMSNorm(){};
    void forward(const Matrix3D<float> &x, Matrix3D<float> &output, float eps);
    Matrix3D<float> weight;

   private:
    std::string profile_name = "LlamaRMSNorm";
};
--------------------------------------------------------------------------------
/llm/include/ops/RotaryPosEmb.h:
--------------------------------------------------------------------------------
#include <cmath>

#include "common.h"
#include "utils.h"

class RotaryPosEmb {
   public:
    RotaryPosEmb(Matrix3D<float> _cos, Matrix3D<float> _sin, std::string path) {
        sin = _sin;
        cos = _cos;
        read_to_array((path + "/cos_cached.bin").c_str(), cos.m_data, cos.length());
        read_to_array((path + "/sin_cached.bin").c_str(), sin.m_data, sin.length());
    };
    RotaryPosEmb(){};
    void forward(Matrix3D<float> &key, Matrix3D<float> &value, int start_idx, int len);
    Matrix3D<float> cos, sin;

   private:
    std::string profile_name = "RotaryPosEmb";
};

void load_RotaryPosEmb(RotaryPosEmb &op, std::string prefix);
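
// Note (editorial annotation): rotary position embedding rotates each pair of
// channels by a position-dependent angle using the precomputed cos/sin tables
// loaded above. For position p and a channel pair (x1, x2):
//
//     x1' = x1 * cos[p] - x2 * sin[p]
//     x2' = x2 * cos[p] + x1 * sin[p]
//
// start_idx offsets p by the number of cached tokens, so decode steps keep
// absolute positions consistent with the prefill.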
--------------------------------------------------------------------------------
/llm/include/ops/W8A8B8O8Linear.h:
--------------------------------------------------------------------------------
#include "common.h"

struct W8A8B8O8Linear_params {
    Matrix3D<int8_t> weight;
    Matrix3D<int8_t> bias;
    float alpha;
    float beta;
};

class W8A8B8O8Linear {
   public:
    W8A8B8O8Linear(W8A8B8O8Linear_params &params_);
    W8A8B8O8Linear(){};
    void forward(const Matrix3D<int8_t> &x, Matrix3D<int8_t> &output);
    struct matmul_params params;
    float alpha;
    float beta;

   private:
    std::string profile_name = "W8A8B8O8Linear";
};

void load_W8A8B8O8Linear_params(W8A8B8O8Linear &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/W8A8B8O8LinearReLU.h:
--------------------------------------------------------------------------------
#include "common.h"

struct W8A8B8O8LinearReLU_params {
    Matrix3D<int8_t> weight;
    Matrix3D<int8_t> bias_int8;
    float alpha;
    float beta;
};

class W8A8B8O8LinearReLU {
   public:
    W8A8B8O8LinearReLU(W8A8B8O8LinearReLU_params &params_);
    W8A8B8O8LinearReLU(){};
    void forward(const Matrix3D<int8_t> &x, Matrix3D<int8_t> &output);
    struct matmul_params params;
    float alpha;
    float beta;

   private:
    std::string profile_name = "W8A8B8O8LinearReLU";
};

void load_W8A8B8O8LinearReLU_params(W8A8B8O8LinearReLU &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/W8A8BFP32OFP32Linear.h:
--------------------------------------------------------------------------------
#include "common.h"

struct W8A8BFP32OFP32Linear_params {
    Matrix3D<int8_t> weight;
    Matrix3D<float> bias;
    float alpha;
};

class W8A8BFP32OFP32Linear {
   public:
    W8A8BFP32OFP32Linear(W8A8BFP32OFP32Linear_params &params_);
    W8A8BFP32OFP32Linear(){};
    void forward(const Matrix3D<int8_t> &x, Matrix3D<float> &output);
    struct matmul_params params;
    float alpha;

   private:
    std::string profile_name = "W8A8BFP32OFP32Linear";
};

void load_W8A8BFP32OFP32Linear_params(W8A8BFP32OFP32Linear &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/arg_max.h:
--------------------------------------------------------------------------------
#include "common.h"

#define FLOAT_MIN -1000000.0

void arg_max_dim2(Matrix3D<float> &input, Matrix3D<int> &output);
--------------------------------------------------------------------------------
/llm/include/ops/cuda/BMM_F16T.cuh:
--------------------------------------------------------------------------------
#include "common.h"

class BMM_F16T {
   public:
    BMM_F16T(half _alpha);
    BMM_F16T(){};
    void forward(const Matrix3D<float16_t> &x, const Matrix3D<float16_t> &weight,
                 Matrix3D<float16_t> &output);  // TODO: convert weight to half
    void forward_weight_untransposed(const Matrix3D<float16_t> &a, const Matrix3D<float16_t> &weight,
                                     Matrix3D<float16_t> &c);
    half alpha;

   private:
    std::string profile_name = "BMM_F16T";
};

void load_BMM_F16T(BMM_F16T &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/cuda/Embedding.cuh:
--------------------------------------------------------------------------------
#include <cassert>
#include "common.h"

class Embedding_cuda {
   public:
    Embedding_cuda(int embed_dim_, int voc_size_, int padding_idx_, Matrix3D<float> lookup_)
        : embed_dim(embed_dim_), voc_size(voc_size_), padding_idx(padding_idx_), lookup(lookup_) {
        assert(lookup_.m_dim_y == voc_size_);
        assert(lookup_.m_dim_z == embed_dim_);
    }
    Embedding_cuda(){};
    void forward(Matrix3D<int> input_id, Matrix3D<float16_t> output);
    int embed_dim, voc_size, padding_idx;
    Matrix3D<float> lookup;

   private:
    std::string profile_name = "Embedding";
};

void load_Embedding_params_cuda(Embedding_cuda &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/cuda/LlamaRMSNorm.cuh:
--------------------------------------------------------------------------------
#include "common.h"

class LlamaRMSNorm_cuda {
   public:
    LlamaRMSNorm_cuda(Matrix3D<float> _weight) : weight(_weight){};
    LlamaRMSNorm_cuda(){};
    void forward(const Matrix3D<float16_t> &x, Matrix3D<float16_t> &output, float eps);
    Matrix3D<float> weight;
    // half half_eps = 6.10352e-05;

   private:
    std::string profile_name = "LlamaRMSNorm_cuda";
};
--------------------------------------------------------------------------------
/llm/include/ops/cuda/RotaryPosEmb.cuh:
--------------------------------------------------------------------------------
#include <cmath>

#include "utils.h"
#include "common.h"

class RotaryPosEmb_cuda {
   public:
    RotaryPosEmb_cuda(Matrix3D<float16_t> _cos, Matrix3D<float16_t> _sin, std::string path) {
        sin = _sin;
        cos = _cos;
        read_to_array_half((path + "/cos_cached_half.bin").c_str(), cos.m_data, cos.length());
        read_to_array_half((path + "/sin_cached_half.bin").c_str(), sin.m_data, sin.length());
    };
    RotaryPosEmb_cuda(){};
    void forward(Matrix3D<float16_t> &key, Matrix3D<float16_t> &value, int start_idx, int len);
    Matrix3D<float16_t> cos, sin;

   private:
    std::string profile_name = "RotaryPosEmb_cuda";
};

void load_RotaryPosEmb_cuda(RotaryPosEmb_cuda &op, std::string prefix);

__global__ void RotaryPosEmb_float_forward(Matrix3D<float> query, Matrix3D<float> key, Matrix3D<float> cos,
                                           Matrix3D<float> sin, int start_idx, int len);
__global__ void RotaryPosEmb_cuda_forward(Matrix3D<float16_t> query, Matrix3D<float16_t> key, Matrix3D<float16_t> cos,
                                          Matrix3D<float16_t> sin, int start_idx, int len);
__global__ void RotaryPosEmb_cuda_forward_shared(Matrix3D<float16_t> query, Matrix3D<float16_t> key,
                                                 Matrix3D<float16_t> cos, Matrix3D<float16_t> sin, int start_idx,
                                                 int len);
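
// Note (editorial annotation): the three kernel declarations above appear to be
// precision/optimization variants of the same rotation: a float reference, a
// half version, and a "_shared" half version that presumably stages the cos/sin
// rows in shared memory once per block. Which one is launched is decided in the
// .cu implementation.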
-------------------------------------------------------------------------------- 1 | /* 2 | Adapted from NVIDIA FasterTransformer: 3 | https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kernels/reduce_kernel_utils.cuh 4 | */ 5 | 6 | #pragma once 7 | #include 8 | #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) 9 | #include 10 | #else 11 | #include 12 | #endif 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | static const float HALF_FLT_MAX = 65504.F; 19 | #define FINAL_MASK 0xffffffff 20 | 21 | 22 | template 23 | inline __device__ T add(T a, T b) { 24 | return a + b; 25 | } 26 | 27 | template<> 28 | inline __device__ half2 add(half2 a, half2 b) { 29 | return __hadd2(a, b); 30 | } 31 | 32 | template<> 33 | inline __device__ half add(half a, half b) { 34 | return __hadd(a, b); 35 | } 36 | 37 | template 38 | __inline__ __device__ T warpReduceSum(T val) 39 | { 40 | #pragma unroll 41 | for (int mask = 16; mask > 0; mask >>= 1) 42 | val = add(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32)); //__shfl_sync bf16 return float when sm < 80 43 | return val; 44 | } 45 | 46 | /* Calculate the sum of all elements in a block */ 47 | template 48 | __inline__ __device__ T blockReduceSum(T val) 49 | { 50 | static __shared__ T shared[32]; 51 | int lane = threadIdx.x & 0x1f; 52 | int wid = threadIdx.x >> 5; 53 | 54 | val = warpReduceSum(val); 55 | 56 | if (lane == 0) 57 | shared[wid] = val; 58 | 59 | __syncthreads(); 60 | 61 | // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent 62 | // blockDim.x is not divided by 32 63 | val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); 64 | val = warpReduceSum(val); 65 | 66 | return val; 67 | } 68 | 69 | 70 | template 71 | __device__ __forceinline__ T clamp_inf_for_half(const float input) 72 | { 73 | return input; 74 | } 75 | 76 | template<> 77 | __device__ __forceinline__ half clamp_inf_for_half(const float input) 78 | { 79 | // clamp inf values to enable fp16 training 80 | return input > 0.0f ? 
__float2half(min(input, 65504.F - 1000)) : __float2half(max(input, -65504.F + 1000)); 81 | } 82 | -------------------------------------------------------------------------------- /llm/include/profiler.h: -------------------------------------------------------------------------------- 1 | #include <chrono> 2 | #include <iostream> 3 | #include <map> 4 | #include <string> 5 | 6 | class Profiler { 7 | public: 8 | bool for_demo = false; 9 | static Profiler& getInstance() { 10 | static Profiler instance; 11 | return instance; 12 | } 13 | 14 | void start(const std::string& section) { start_times[section] = std::chrono::high_resolution_clock::now(); } 15 | 16 | void start(const std::string& section, const long long section_flops) { 17 | start_times[section] = std::chrono::high_resolution_clock::now(); 18 | if (flops.count(section) == 0) 19 | flops[section] = section_flops; 20 | else 21 | flops[section] += section_flops; 22 | } 23 | 24 | void reset() { 25 | start_times.clear(); 26 | durations.clear(); 27 | counts.clear(); 28 | flops.clear(); 29 | } 30 | 31 | void stop(const std::string& section) { 32 | auto end_time = std::chrono::high_resolution_clock::now(); 33 | auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_times[section]).count(); 34 | durations[section] += duration; 35 | counts[section]++; 36 | } 37 | 38 | void report_internal() const { 39 | if (for_demo) { 40 | for (const auto& entry : durations) { 41 | std::string row; 42 | std::cout << entry.first + ", "; 43 | float s = (float)(entry.second) / 1000000; 44 | float ts = (float)counts.at(entry.first); 45 | printf("Total time: %.1f s, %.1f ms/token, %.1f token/s, %d tokens\n\n", s, s / ts * 1000, ts / s, 46 | counts.at(entry.first)); 47 | } 48 | } else { 49 | std::cout << "Section, Total time(us), Average time(us), Count, GOPs:" << std::endl; 50 | for (const auto& entry : durations) { 51 | std::string row; 52 | row += entry.first + ", "; 53 | row += std::to_string(entry.second) + ", "; 54 | row += std::to_string(entry.second / counts.at(entry.first)) + ", "; 55 | if (flops.count(entry.first) == 0) 56 | row += std::to_string(counts.at(entry.first)) + ", N/A"; 57 | else { 58 | row += std::to_string(counts.at(entry.first)) + ", "; 59 | // (ops / microseconds) / 1000 = GOP/s 60 | row += std::to_string((((float)flops.at(entry.first)) / (float)(entry.second)) / 1000.0); 61 | } 62 | std::cout << row << std::endl; 63 | } 64 | } 65 | } 66 | 67 | void report() const { 68 | #ifdef PROFILER 69 | report_internal(); 70 | #endif 71 | } 72 | 73 | private: 74 | Profiler() {} 75 | Profiler(const Profiler&) = delete; 76 | Profiler& operator=(const Profiler&) = delete; 77 | 78 | std::map<std::string, std::chrono::high_resolution_clock::time_point> start_times; 79 | std::map<std::string, long long> flops; 80 | std::map<std::string, long long> durations; 81 | std::map<std::string, int> counts; 82 | }; 83 | -------------------------------------------------------------------------------- /llm/mistral: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./chat Mistral_7B INT4 5 0 3 | -------------------------------------------------------------------------------- /llm/models/llama3_vocab.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/llm/models/llama3_vocab.bin -------------------------------------------------------------------------------- /llm/models/llama_vocab.bin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/llm/models/llama_vocab.bin -------------------------------------------------------------------------------- /llm/models/mistral_vocab.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/llm/models/mistral_vocab.bin -------------------------------------------------------------------------------- /llm/models/starcoder_vocab.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/llm/models/starcoder_vocab.bin -------------------------------------------------------------------------------- /llm/scripts/chat-13b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./chat LLaMA2_13B_chat INT4 5 3 | -------------------------------------------------------------------------------- /llm/scripts/chat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./chat LLaMA2_7B_chat INT4 5 3 | -------------------------------------------------------------------------------- /llm/scripts/code.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./chat CodeLLaMA_7B_Instruct INT4 5 3 | -------------------------------------------------------------------------------- /llm/scripts/llava.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat LLaVA_7B INT4 6 $image_path 8 | -------------------------------------------------------------------------------- /llm/scripts/vila.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat VILA_7B INT4 5 $image_path 8 | -------------------------------------------------------------------------------- /llm/scripts/voice_llava.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat -v LLaVA_7B INT4 6 $image_path 8 | -------------------------------------------------------------------------------- /llm/scripts/voice_vila.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo
"=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat -v VILA_7B INT4 5 $image_path 8 | -------------------------------------------------------------------------------- /llm/scripts/voicechat.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | ./chat -v LLaMA2_7B_chat INT4 5 3 | -------------------------------------------------------------------------------- /llm/src/interface.cc: -------------------------------------------------------------------------------- 1 | #include "interface.h" 2 | #include 3 | 4 | void set_print_black() { 5 | printf("\033[0;30m"); 6 | } 7 | 8 | void set_print_red() { 9 | printf("\033[1;31m"); 10 | } 11 | 12 | void set_print_yellow() { 13 | printf("\033[0;33m"); 14 | } 15 | 16 | void set_print_bold_yellow() { 17 | printf("\033[1;33m"); 18 | } 19 | 20 | void set_print_blue() { 21 | printf("\033[1;34m"); 22 | } 23 | 24 | void set_print_white() { 25 | printf("\033[0;37m"); 26 | } 27 | 28 | void set_print_reset() { 29 | printf("\033[0m"); 30 | } 31 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Fp32CLIPEncoder.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32CLIPEncoder.h" 2 | #include "utils.h" 3 | 4 | #include 5 | #include 6 | 7 | Fp32CLIPEncoder::Fp32CLIPEncoder(std::string param_path, const struct model_config config) { 8 | // Load all the encoder layers 9 | for (int layer_idx = 0; layer_idx < config.num_layers; layer_idx++) { 10 | DEBUG_INS(std::cout << "Start loading layer:" << layer_idx << "..." 
<< std::endl;) 11 | 12 | std::string path = param_path + "/layer" + std::to_string(layer_idx); 13 | Fp32CLIPEncoderLayer layer = Fp32CLIPEncoderLayer(path, config, layer_idx); 14 | 15 | this->layers.push_back(layer); 16 | } 17 | }; 18 | 19 | // Fp32CLIPEncoder 20 | struct Fp32CLIPEncoder_output Fp32CLIPEncoder::forward(const struct Fp32CLIPEncoder_input &input) { 21 | PROFILE_START(profile_name); 22 | int sqlen = input.hidden_states.m_dim_y; 23 | 24 | // Go through each layer 25 | Matrix3D hidden_states = input.hidden_states; 26 | std::vector> past_keys, past_values; 27 | for (int i = 0; i < this->layers.size(); i++) { 28 | if (!input.has_past_keys_values) { 29 | struct Fp32CLIPEncoderLayer_input l_i = {hidden_states, input.attention_mask}; 30 | struct Fp32CLIPEncoderLayer_output l_o = this->layers[i].forward(l_i); 31 | hidden_states = l_o.hidden_states; 32 | past_keys.push_back(l_o.past_key_value.first); 33 | past_values.push_back(l_o.past_key_value.second); 34 | } else { 35 | struct Fp32CLIPEncoderLayer_input l_i = {hidden_states, input.attention_mask, input.past_keys[i], 36 | input.past_values[i]}; 37 | struct Fp32CLIPEncoderLayer_output l_o = this->layers[i].forward(l_i); 38 | hidden_states = l_o.hidden_states; 39 | past_keys.push_back(l_o.past_key_value.first); 40 | past_values.push_back(l_o.past_key_value.second); 41 | } 42 | } 43 | 44 | struct Fp32CLIPEncoder_output output = {hidden_states, past_keys, past_values}; 45 | PROFILE_END(profile_name); 46 | return output; 47 | } 48 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Fp32GPTBigCodeForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32GPTBigCodeForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Fp32GPTBigCodeForCausalLM::Fp32GPTBigCodeForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Fp32GPTBigCodeDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct Fp32GPTBigCodeForCausalLM_output Fp32GPTBigCodeForCausalLM::forward(const struct Fp32GPTBigCodeForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Fp32GPTBigCodeDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 24 | struct Fp32GPTBigCodeDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Fp32GPTBigCodeDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct Fp32GPTBigCodeForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Fp32OPTForCausalLM.cc: 
-------------------------------------------------------------------------------- 1 | #include "Fp32OPTForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Fp32OPTForCausalLM::Fp32OPTForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Fp32OPTDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct Fp32OPTForCausalLM_output Fp32OPTForCausalLM::forward(const struct Fp32OPTForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Fp32OPTDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 24 | struct Fp32OPTDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Fp32OPTDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct Fp32OPTForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Fp32llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Fp32LlamaForCausalLM::Fp32LlamaForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Fp32llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct Fp32LlamaForCausalLM_output Fp32LlamaForCausalLM::forward(const struct Fp32LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | 20 | struct Fp32llamaDecoder_output decoder_output; 21 | 22 | // Call decoder 23 | if (input.has_past_keys_values) { 24 | struct Fp32llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | } else { 27 | struct Fp32llamaDecoder_input decoder_input; 28 | if (input.is_llava) { 29 | decoder_input = {input.input_ids, input.image_embed}; 30 | decoder_input.has_past_keys_values = false; 31 | decoder_input.is_llava = true; 32 | } else { 33 | decoder_input = {input.input_ids}; 34 | decoder_input.has_past_keys_values = false; 35 | decoder_input.is_llava = false; 36 | } 37 | decoder_output = this->decoder.forward(decoder_input); 38 | } 39 | 40 | // Get logits 41 | int sqlen; 42 | if (input.is_llava) { 43 | sqlen = input.input_ids.m_dim_z + input.image_embed.m_dim_y; 44 | } else { 45 | sqlen = input.input_ids.m_dim_z; 
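            // Text-only case: one logit row per prompt token. (In the LLaVA branch
            // above, the image-embedding rows are counted too, because the decoder's
            // last_hidden_state covers the image tokens as well.)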
46 | } 47 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 48 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 49 | 50 | struct Fp32LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 51 | PROFILE_END(profile_name); 52 | return LMoutput; 53 | } 54 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Int4GPTBigCodeForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Int4GPTBigCodeForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Int4GPTBigCodeForCausalLM::Int4GPTBigCodeForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(uint8_t) / 2); 11 | this->decoder = Int4GPTBigCodeDecoder(param_path + "/decoder", config); 12 | this->lm_head = 13 | Linear_FP_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 2), param_path + "/lm_head"); 14 | } 15 | 16 | struct Int4GPTBigCodeForCausalLM_output Int4GPTBigCodeForCausalLM::forward(std::string param_path, const struct Int4GPTBigCodeForCausalLM_input &input) { 17 | // printf(("Int4GPTBigCodeForCausalLM::forward\n"); 18 | PROFILE_START(profile_name); 19 | // printf(("Int4GPTBigCodeForCausalLM starts\n"); 20 | int sqlen = input.input_ids.m_dim_z; 21 | 22 | struct Int4GPTBigCodeDecoder_output decoder_output; 23 | // printf(("Before this->decoder.forward\n"); 24 | if (input.has_past_keys_values) { 25 | struct Int4GPTBigCodeDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 26 | decoder_output = this->decoder.forward(decoder_input); 27 | } else { 28 | // printf(("00000000\n"); 29 | struct Int4GPTBigCodeDecoder_input decoder_input = {input.input_ids}; 30 | // printf(("11111111\n"); 31 | decoder_output = this->decoder.forward(decoder_input); 32 | } 33 | 34 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 35 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 36 | 37 | struct Int4GPTBigCodeForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 38 | PROFILE_END(profile_name); 39 | return LMoutput; 40 | } 41 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Int4OPTForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Int4OPTForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Int4OPTForCausalLM::Int4OPTForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(uint8_t) / 2); 11 | 12 | this->decoder = Int4OPTDecoder(param_path + "/decoder", config); 13 | this->lm_head = Linear_FP_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 2), 14 | param_path + "/lm_head"); 15 | } 16 | 17 | struct Int4OPTForCausalLM_output Int4OPTForCausalLM::forward(const struct Int4OPTForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Int4OPTDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 
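        // Generation step: only the newly sampled token ids are passed in; the
        // decoder reuses the cached past_keys/past_values instead of re-running
        // attention over the whole prompt.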
24 | struct Int4OPTDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Int4OPTDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct Int4OPTForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /llm/src/nn_modules/OPTForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "OPTForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | OPTForCausalLM::OPTForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Int8OPTDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct OPTForCausalLM_output OPTForCausalLM::forward(const struct OPTForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Int8OPTDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 24 | struct Int8OPTDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Int8OPTDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct OPTForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /llm/src/nn_modules/cuda/Int4llamaForCausalLM.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Int4llamaForCausalLM.h" 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | Int4LlamaForCausalLM::Int4LlamaForCausalLM(std::string param_path, const struct model_config config) { 8 | allocate_aligned_memory_gpu(logits_output_half, config.max_sqlen * config.vocsize * sizeof(float16_t)); 9 | allocate_aligned_memory_gpu(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory_gpu(lm_head_weight, (config.embed_dim * config.vocsize * sizeof(int)) / 8); 11 | 12 | this->decoder = Int4llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = Linear_half_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 8), 14 | param_path + "/lm_head"); 15 | } 16 | 17 | struct Int4LlamaForCausalLM_output Int4LlamaForCausalLM::forward(std::string param_path, const struct Int4LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = 
input.input_ids.m_dim_z; 20 | 21 | struct Int4llamaDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 24 | struct Int4llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(param_path + "/decoder", decoder_input); 26 | 27 | } else { 28 | struct Int4llamaDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(param_path + "/decoder", decoder_input); 30 | } 31 | 32 | Matrix3D logits_half(logits_output_half, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits_half); 34 | 35 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 36 | int threadsPerBlock_1D = 1024; 37 | int blocksPerGrid = (sqlen * this->decoder.voc_size + threadsPerBlock_1D - 1) / threadsPerBlock_1D; 38 | half2float<<<blocksPerGrid, threadsPerBlock_1D>>>(logits_output_half, logits_output, sqlen * this->decoder.voc_size); 39 | 40 | cudaEvent_t event; 41 | cudaEventCreate(&event); 42 | cudaEventRecord(event, 0); 43 | cudaEventSynchronize(event); 44 | cudaEventDestroy(event); 45 | 46 | struct Int4LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 47 | PROFILE_END(profile_name); 48 | 49 | return LMoutput; 50 | } 51 | 52 | void Int4LlamaForCausalLM::free_cuda_memory() { 53 | free_aligned_memory_gpu(logits_output_half); 54 | free_aligned_memory_gpu(logits_output); 55 | free_aligned_memory_gpu(lm_head_weight); 56 | } 57 | -------------------------------------------------------------------------------- /llm/src/nn_modules/non_cuda/Int4llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Int4LlamaForCausalLM::Int4LlamaForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, (config.embed_dim * config.vocsize * sizeof(uint8_t)) / 2); 11 | 12 | this->decoder = Int4llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = Linear_FP_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 2), 14 | param_path + "/lm_head"); 15 | } 16 | 17 | struct Int4LlamaForCausalLM_output Int4LlamaForCausalLM::forward(std::string param_path, const struct Int4LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | 20 | struct Int4llamaDecoder_output decoder_output; 21 | 22 | // Call decoder 23 | if (input.has_past_keys_values) { 24 | struct Int4llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(param_path + "/decoder", decoder_input); 26 | } else { 27 | struct Int4llamaDecoder_input decoder_input; 28 | if (input.is_llava) { 29 | decoder_input = {input.input_ids, input.image_embed}; 30 | decoder_input.has_past_keys_values = false; 31 | decoder_input.is_llava = true; 32 | } else { 33 | decoder_input = {input.input_ids}; 34 | decoder_input.has_past_keys_values = false; 35 | decoder_input.is_llava = false; 36 | } 37 | decoder_output = this->decoder.forward(param_path + "/decoder", decoder_input); 38 | } 39 | 40 | // Get logits 41 | int sqlen; 42 | if (input.is_llava) { 43 | // sqlen = input.input_ids.m_dim_z + input.image_embed.m_dim_y + input.second_input_ids.m_dim_z;  // dead store: immediately overwritten by the next line 44 | sqlen = input.input_ids.m_dim_z +
input.image_embed.m_dim_y; 45 | } else { 46 | sqlen = input.input_ids.m_dim_z; 47 | } 48 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 49 | PROFILE_START("Int4LlamaForCausalLM::lm_head"); 50 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 51 | PROFILE_END("Int4LlamaForCausalLM::lm_head"); 52 | 53 | struct Int4LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 54 | PROFILE_END(profile_name); 55 | return LMoutput; 56 | } 57 | -------------------------------------------------------------------------------- /llm/src/ops/BMM_S8T_S8N_F32T.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T &op, std::string prefix) { 7 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 8 | } 9 | 10 | BMM_S8T_S8N_F32T::BMM_S8T_S8N_F32T(struct BMM_S8T_S8N_F32T_params &op_params) { alpha = op_params.alpha; } 11 | 12 | void BMM_S8T_S8N_F32T::forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output) { 13 | const int m = x.m_dim_y, k = x.m_dim_z, n = weight.m_dim_y, b = x.m_dim_x; 14 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 15 | PROFILE_START_FLOPS(profile_name, ops); 16 | assert(output.m_dim_x == x.m_dim_x); 17 | assert(output.m_dim_y == x.m_dim_y); 18 | assert(output.m_dim_z == weight.m_dim_y); 19 | assert(x.m_dim_z == weight.m_dim_z); 20 | 21 | struct matmul_params params; 22 | 23 | params.A.row = m; 24 | params.A.column = k; 25 | params.A.int8_data_ptr = x.m_data; 26 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 27 | params.B.qparams.scale = 1.0; 28 | params.C.qparams.scale = 1.0; 29 | params.A.qparams.zero_point = 0; 30 | params.B.row = k; 31 | params.B.column = n; 32 | params.B.int8_data_ptr = weight.m_data; 33 | params.B.qparams.zero_point = 0; 34 | params.C.row = m; 35 | params.C.column = n; 36 | params.C.data_ptr = output.m_data; 37 | params.C.qparams.zero_point = 0; 38 | params.opt_params.blk_size = BLK_SIZE; 39 | params.opt_params.num_thread = NUM_THREAD; 40 | params.C.qparams.q_max = 127; 41 | params.C.qparams.q_min = -128; 42 | params.alpha = alpha; 43 | 44 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 45 | if (m == 1 && x.m_dim_x > 1) { 46 | // merge each batch 47 | params.A.row = x.m_dim_x; 48 | params.C.row = x.m_dim_x; 49 | // B is batched, need a new op for this! 
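        // Decode-time fast path: with m == 1, the b per-batch GEMVs all share one
        // shape, so the batch dim is folded into A's/C's row dimension and a single
        // *_batch kernel call covers every batch entry (the note above flags that
        // B must still be strided per output row inside that kernel).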
50 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(&params); 51 | } else { 52 | // process each batch 53 | for (int bz = 0; bz < x.m_dim_x; bz++) { 54 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(&params); 55 | params.A.int8_data_ptr += m * k; 56 | params.B.int8_data_ptr += k * n; 57 | params.C.data_ptr += m * n; 58 | } 59 | } 60 | 61 | PROFILE_END(profile_name); 62 | } 63 | -------------------------------------------------------------------------------- /llm/src/ops/BMM_S8T_S8N_S8T.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T &op, std::string prefix) { 7 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 8 | } 9 | 10 | BMM_S8T_S8N_S8T::BMM_S8T_S8N_S8T(struct BMM_S8T_S8N_S8T_params &op_params) { alpha = op_params.alpha; } 11 | 12 | void BMM_S8T_S8N_S8T::forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output) { 13 | const int m = x.m_dim_y, k = x.m_dim_z, n = weight.m_dim_y, b = x.m_dim_x; 14 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 15 | PROFILE_START_FLOPS(profile_name, ops); 16 | assert(output.m_dim_x == x.m_dim_x); 17 | assert(output.m_dim_y == x.m_dim_y); 18 | assert(output.m_dim_z == weight.m_dim_y); 19 | assert(x.m_dim_z == weight.m_dim_z); 20 | 21 | struct matmul_params params; 22 | 23 | params.A.row = m; 24 | params.A.column = k; 25 | params.A.int8_data_ptr = x.m_data; 26 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 27 | params.B.qparams.scale = 1.0; 28 | params.C.qparams.scale = 1.0; 29 | params.A.qparams.zero_point = 0; 30 | params.B.row = k; 31 | params.B.column = n; 32 | params.B.int8_data_ptr = weight.m_data; 33 | params.B.qparams.zero_point = 0; 34 | params.C.row = m; 35 | params.C.column = n; 36 | params.C.int8_data_ptr = output.m_data; 37 | params.C.qparams.zero_point = 0; 38 | params.opt_params.blk_size = BLK_SIZE; 39 | params.opt_params.num_thread = NUM_THREAD; 40 | params.C.qparams.q_max = 127; 41 | params.C.qparams.q_min = -128; 42 | params.alpha = alpha; 43 | 44 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 45 | 46 | // process each batch 47 | if (m == 1 && x.m_dim_x > 1) { 48 | // merge each batch 49 | params.A.row = x.m_dim_x; 50 | params.C.row = x.m_dim_x; 51 | // B is batched, need a new op for this!
matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(&params); 53 | } else { 54 | for (int bz = 0; bz < x.m_dim_x; bz++) { 55 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(&params); 56 | params.A.int8_data_ptr += m * k; 57 | params.B.int8_data_ptr += k * n; 58 | params.C.int8_data_ptr += m * n; 59 | } 60 | } 61 | 62 | PROFILE_END(profile_name); 63 | } 64 | -------------------------------------------------------------------------------- /llm/src/ops/Gelu.cc: -------------------------------------------------------------------------------- 1 | #include "ops/Gelu.h" 2 | 3 | #include 4 | #include 5 | 6 | static const float GELU_COEF_A = 0.044715f; 7 | static const float GELU_QUICK_COEF = -1.702f; 8 | static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; 9 | 10 | float Gelu_imp(float x) { 11 | return 0.5f * x * (1.0f + tanhf(SQRT_2_OVER_PI * x * (1.0f + GELU_COEF_A * x * x))); 12 | } 13 | 14 | void Gelu(Matrix3D a) { 15 | for (int i = 0; i < a.length(); i++) { 16 | a.m_data[i] = Gelu_imp(a.m_data[i]); 17 | } 18 | } 19 | 20 | float Gelu_quick_imp(float x) { 21 | return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x))); 22 | } 23 | 24 | void Gelu_quick(Matrix3D a) { 25 | for (int i = 0; i < a.length(); i++) { 26 | a.m_data[i] = Gelu_quick_imp(a.m_data[i]); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /llm/src/ops/LayerNorm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void load_LayerNorm(LayerNorm &op, std::string prefix) { 8 | read_to_array((prefix + "/weight.bin").c_str(), op.params.weight.m_data, op.params.weight.length()); 9 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.m_data, op.params.bias.length()); 10 | } 11 | 12 | void LayerNorm::forward(const Matrix3D &x, Matrix3D &output) { 13 | PROFILE_START(profile_name); 14 | Matrix3D weight = params.weight; 15 | Matrix3D bias = params.bias; 16 | const int last_dims = 2; 17 | const float eps = 1e-5; 18 | 19 | assert(last_dims == 2); // support the last dim for now 20 | assert(output.m_dim_x == x.m_dim_x); 21 | assert(output.m_dim_y == x.m_dim_y); 22 | assert(output.m_dim_z == x.m_dim_z); 23 | assert(x.m_dim_z == weight.m_dim_z); 24 | assert(x.m_dim_z == bias.m_dim_z); 25 | 26 | for (int i = 0; i < x.m_dim_x; i++) { // batches 27 | for (int j = 0; j < x.m_dim_y; j++) { // samples 28 | float mean = 0; 29 | for (int k = 0; k < x.m_dim_z; k++) { // hidden states 30 | mean += x(i, j, k); 31 | } 32 | mean /= static_cast(x.m_dim_z); 33 | float squared_diff_sum = 0; 34 | for (int k = 0; k < x.m_dim_z; k++) { 35 | float value = static_cast(x(i, j, k)); 36 | squared_diff_sum += (value - mean) * (value - mean); 37 | } 38 | float std_dev = sqrtl(squared_diff_sum / static_cast(x.m_dim_z) + eps); 39 | 40 | for (int k = 0; k < x.m_dim_z; k++) { 41 | float value = static_cast(x(i, j, k)); 42 | float fp_out = (((value - mean) / (std_dev)) * static_cast(weight(0, 0, k))) + 43 | static_cast(bias(0, 0, k)); 44 | output(i, j, k) = static_cast(fp_out); 45 | } 46 | } 47 | } 48 | PROFILE_END(profile_name); 49 | } 50 | -------------------------------------------------------------------------------- /llm/src/ops/LayerNormQ.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void load_LayerNormQ(LayerNormQ &op,
std::string prefix) { 8 | read_to_array((prefix + "/weight.bin").c_str(), op.params.weight.m_data, op.params.weight.length()); 9 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.m_data, op.params.bias.length()); 10 | } 11 | 12 | void LayerNormQ::forward(const Matrix3D &x, Matrix3D &output) { 13 | PROFILE_START(profile_name); 14 | Matrix3D weight = params.weight; 15 | Matrix3D bias = params.bias; 16 | const int last_dims = 2; 17 | const float eps = 0.00001; 18 | 19 | assert(last_dims == 2); // support the last dim for now 20 | assert(output.m_dim_x == x.m_dim_x); 21 | assert(output.m_dim_y == x.m_dim_y); 22 | assert(output.m_dim_z == x.m_dim_z); 23 | assert(x.m_dim_z == weight.m_dim_z); 24 | assert(x.m_dim_z == bias.m_dim_z); 25 | 26 | for (int i = 0; i < x.m_dim_x; i++) { // batches 27 | for (int j = 0; j < x.m_dim_y; j++) { // samples 28 | float mean = 0; 29 | for (int k = 0; k < x.m_dim_z; k++) { // hidden states 30 | mean += x(i, j, k); 31 | } 32 | mean /= static_cast(x.m_dim_z); 33 | float squared_diff_sum = 0; 34 | for (int k = 0; k < x.m_dim_z; k++) { 35 | float value = static_cast(x(i, j, k)); 36 | squared_diff_sum += (value - mean) * (value - mean); 37 | } 38 | 39 | float var = squared_diff_sum / static_cast(x.m_dim_z); 40 | float std_dev = sqrt(var + eps); 41 | 42 | for (int k = 0; k < x.m_dim_z; k++) { 43 | float value = static_cast(x(i, j, k)); 44 | float fp_out = ((value - mean) / (std_dev) * static_cast(weight(0, 0, k))) + 45 | static_cast(bias(0, 0, k)); 46 | output(i, j, k) = static_cast(std::round(fp_out)); 47 | } 48 | } 49 | } 50 | 51 | PROFILE_END(profile_name); 52 | } 53 | -------------------------------------------------------------------------------- /llm/src/ops/LlamaRMSNorm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void LlamaRMSNorm::forward(const Matrix3D &x, Matrix3D &output, float eps) { 8 | PROFILE_START(profile_name); 9 | const int last_dims = 2; 10 | 11 | assert(last_dims == 2); // support the last dim for now 12 | assert(output.m_dim_x == x.m_dim_x); 13 | assert(output.m_dim_y == x.m_dim_y); 14 | assert(output.m_dim_z == x.m_dim_z); 15 | assert(x.m_dim_z == weight.m_dim_z); 16 | 17 | for (int i = 0; i < x.m_dim_x; i++) { // batches 18 | for (int j = 0; j < x.m_dim_y; j++) { // samples 19 | float var = 0; 20 | 21 | for (int k = 0; k < x.m_dim_z; k++) { // hidden states 22 | var += x(i, j, k) * x(i, j, k); 23 | } 24 | var /= static_cast(x.m_dim_z); 25 | float inv_std = 1.0 / sqrt(var + eps); 26 | 27 | for (int k = 0; k < x.m_dim_z; k++) { 28 | float value = static_cast(x(i, j, k)); 29 | float fp_out = (value * inv_std) * weight(0, 0, k); 30 | output(i, j, k) = fp_out; 31 | } 32 | } 33 | } 34 | 35 | PROFILE_END(profile_name); 36 | } 37 | -------------------------------------------------------------------------------- /llm/src/ops/RotaryPosEmb.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | 5 | float q_buf[4096], k_buf[4096]; 6 | // TODO: optimize this with multithreading 7 | void RotaryPosEmb::forward(Matrix3D &query, Matrix3D &key, int start_idx, int len) { 8 | PROFILE_START(profile_name); 9 | int num_heads = query.m_dim_x; 10 | int num_kv_heads = key.m_dim_x; 11 | int head_embed = cos.m_dim_z; 12 | int max_sqlen = cos.m_dim_y; 13 | 14 | assert(query.m_dim_z == cos.m_dim_z); 15 | assert(key.m_dim_z == cos.m_dim_z); 16 |
assert(max_sqlen > len + start_idx); 17 | 18 | // cos, sin = self.rotary_emb(key_states, seq_len=kv_seq_len) 19 | // query_states, key_states = apply_rotary_pos_emb(query_states, key_states, 20 | // cos, sin, position_ids) cos = cos[position_ids].unsqueeze(1) # [bs, 1, 21 | // seq_len, dim] sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] 22 | // q_embed = (q * cos) + (rotate_half(q) * sin) 23 | // k_embed = (k * cos) + (rotate_half(k) * sin) 24 | // x1 = x[..., : x.shape[-1] // 2] 25 | // x2 = x[..., x.shape[-1] // 2 :] 26 | // rotate_half: torch.cat((-x2, x1), dim=-1) 27 | 28 | int half = head_embed / 2; 29 | // Query 30 | for (int b = 0; b < num_heads; b++) { 31 | for (int i = 0; i < len; i++) { 32 | // first half 33 | for (int j = 0; j < half; j++) { 34 | q_buf[j] = -1 * query(b, i, j + half); 35 | // k_buf[j] = -1 * key(b, i, j + half); 36 | } 37 | // second half 38 | for (int j = half; j < head_embed; j++) { 39 | q_buf[j] = query(b, i, j - half); 40 | // k_buf[j] = key(b, i, j - half); 41 | } 42 | 43 | for (int j = 0; j < head_embed; j++) { 44 | query(b, i, j) = ((query(b, i, j) * cos(0, i + start_idx, j)) + (q_buf[j] * sin(0, i + start_idx, j))); 45 | // key(b, i, j) = ((key(b, i, j) * cos(0, i + start_idx, j)) + (k_buf[j] * sin(0, i + start_idx, j))); 46 | } 47 | } 48 | } 49 | 50 | // Key 51 | for (int b = 0; b < num_kv_heads; b++) { 52 | for (int i = 0; i < len; i++) { 53 | // first half 54 | for (int j = 0; j < half; j++) { 55 | k_buf[j] = -1 * key(b, i, j + half); 56 | } 57 | // second half 58 | for (int j = half; j < head_embed; j++) { 59 | k_buf[j] = key(b, i, j - half); 60 | } 61 | 62 | for (int j = 0; j < head_embed; j++) { 63 | key(b, i, j) = ((key(b, i, j) * cos(0, i + start_idx, j)) + (k_buf[j] * sin(0, i + start_idx, j))); 64 | } 65 | } 66 | } 67 | 68 | PROFILE_END(profile_name); 69 | } 70 | -------------------------------------------------------------------------------- /llm/src/ops/W8A8B8O8Linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_W8A8B8O8Linear_params(W8A8B8O8Linear &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias_int8.bin").c_str(), op.params.bias.int8_data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.params.alpha, 1); 10 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 11 | read_to_array((prefix + "/beta.bin").c_str(), &op.params.beta, 1); 12 | read_to_array((prefix + "/beta.bin").c_str(), &op.beta, 1); 13 | } 14 | 15 | W8A8B8O8Linear::W8A8B8O8Linear(struct W8A8B8O8Linear_params &op_params) { 16 | Matrix3D weight = op_params.weight; 17 | Matrix3D bias = op_params.bias; 18 | 19 | int k = weight.m_dim_z, n = weight.m_dim_y; 20 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 21 | params.B.qparams.scale = 1.0; 22 | params.C.qparams.scale = 1.0; 23 | params.A.qparams.zero_point = 0; 24 | params.B.row = k; 25 | params.B.column = n; 26 | params.B.int8_data_ptr = weight.m_data; 27 | params.B.qparams.zero_point = 0; 28 | params.C.qparams.zero_point = 0; 29 | params.opt_params.blk_size = BLK_SIZE; 30 | params.opt_params.num_thread = NUM_THREAD; 31 | params.C.qparams.q_max = 127; 32 | params.C.qparams.q_min = -128; 33 | params.bias.int8_data_ptr = bias.m_data; 34 | params.bias.row = 1; 35 | params.bias.column = n; 36 | } 37 | 38 | void 
W8A8B8O8Linear::forward(const Matrix3D &x, Matrix3D &output) { 39 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 40 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 41 | PROFILE_START_FLOPS(profile_name, ops); 42 | assert(output.m_dim_x == x.m_dim_x); 43 | assert(output.m_dim_y == x.m_dim_y); 44 | assert(output.m_dim_z == params.B.column); 45 | assert(x.m_dim_z == params.B.row); 46 | assert(output.m_dim_z == params.bias.column); 47 | 48 | params.A.row = m; 49 | params.A.column = k; 50 | params.A.int8_data_ptr = x.m_data; 51 | params.C.row = m; 52 | params.C.column = n; 53 | params.C.int8_data_ptr = output.m_data; 54 | params.A.qparams.scale = alpha; 55 | params.alpha = alpha; 56 | params.beta = beta; 57 | 58 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 59 | 60 | // printf("W8A8B8O8Linear-m,n,k: %d, %d, %d\n", m,n,k); 61 | if (m == 1) { 62 | // params.opt_params.num_thread = 8; 63 | // let's loop over the column dim instead of row 64 | for (int bz = 0; bz < x.m_dim_x; bz++) { 65 | matmul_op.mat_mul_accelerator_int8_fast_32unroll_over_column(&params); 66 | params.A.int8_data_ptr += m * k; 67 | params.C.int8_data_ptr += m * n; 68 | } 69 | } else { 70 | for (int bz = 0; bz < x.m_dim_x; bz++) { 71 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll(&params); 72 | params.A.int8_data_ptr += m * k; 73 | params.C.int8_data_ptr += m * n; 74 | } 75 | } 76 | 77 | PROFILE_END(profile_name); 78 | } 79 | -------------------------------------------------------------------------------- /llm/src/ops/W8A8BFP32OFP32Linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_W8A8BFP32OFP32Linear_params(W8A8BFP32OFP32Linear &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 10 | } 11 | 12 | W8A8BFP32OFP32Linear::W8A8BFP32OFP32Linear(struct W8A8BFP32OFP32Linear_params &op_params) { 13 | Matrix3D weight = op_params.weight; 14 | Matrix3D bias = op_params.bias; 15 | alpha = op_params.alpha; 16 | 17 | int k = weight.m_dim_z, n = weight.m_dim_y; 18 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 19 | params.B.qparams.scale = 1.0; 20 | params.C.qparams.scale = 1.0; 21 | params.A.qparams.zero_point = 0; 22 | params.B.row = k; 23 | params.B.column = n; 24 | params.B.int8_data_ptr = weight.m_data; 25 | params.B.qparams.zero_point = 0; 26 | params.C.column = n; 27 | params.C.qparams.zero_point = 0; 28 | params.opt_params.blk_size = BLK_SIZE; 29 | params.opt_params.num_thread = NUM_THREAD; 30 | params.bias.data_ptr = bias.m_data; 31 | params.bias.row = 1; 32 | params.bias.column = bias.m_dim_z; 33 | } 34 | 35 | void W8A8BFP32OFP32Linear::forward(const Matrix3D &x, Matrix3D &output) { 36 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 37 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 38 | PROFILE_START_FLOPS(profile_name, ops); 39 | assert(output.m_dim_x == x.m_dim_x); 40 | assert(output.m_dim_y == x.m_dim_y); 41 | assert(output.m_dim_z == params.B.column); 42 | assert(x.m_dim_z == params.B.row); 43 | assert(output.m_dim_z ==
params.bias.column); 44 | 45 | params.A.row = m; 46 | params.A.column = k; 47 | params.A.int8_data_ptr = x.m_data; 48 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 49 | params.C.row = m; 50 | params.C.column = n; 51 | params.C.data_ptr = output.m_data; 52 | params.C.qparams.zero_point = 0; 53 | params.alpha = alpha; 54 | 55 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 56 | 57 | if (m == 1) { 58 | // let's loop over the column dim instead of row 59 | for (int bz = 0; bz < x.m_dim_x; bz++) { 60 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(&params); 61 | params.A.int8_data_ptr += m * k; 62 | params.C.data_ptr += m * n; 63 | } 64 | } else { 65 | for (int bz = 0; bz < x.m_dim_x; bz++) { 66 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(&params); 67 | params.A.int8_data_ptr += m * k; 68 | params.C.data_ptr += m * n; 69 | } 70 | } 71 | 72 | PROFILE_END(profile_name); 73 | } 74 | -------------------------------------------------------------------------------- /llm/src/ops/arg_max.cc: -------------------------------------------------------------------------------- 1 | #include "ops/arg_max.h" 2 | 3 | #include 4 | 5 | void arg_max_dim2(Matrix3D &input, Matrix3D &output) { 6 | int bz = input.m_dim_x; 7 | int sqlen = input.m_dim_y; 8 | int voc_size = input.m_dim_z; 9 | 10 | assert(sqlen == output.m_dim_z); 11 | assert(bz == output.m_dim_x); 12 | 13 | for (int b = 0; b < bz; b++) { 14 | for (int i = 0; i < sqlen; i++) { 15 | float max = FLOAT_MIN; 16 | int max_idx = -1; 17 | for (int j = 0; j < voc_size; j++) { 18 | float v = input(b, i, j); 19 | if (max < v) { 20 | max = v; 21 | max_idx = j; 22 | } 23 | } 24 | output(b, 0, i) = max_idx; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /llm/src/ops/batch_add.cc: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | 3 | void batch_Add(const Matrix3D &input, const Matrix3D &input2, Matrix3D &output) { 4 | PROFILE_START("batch_Add"); 5 | assert(input.m_dim_y == input2.m_dim_y); 6 | assert(input.m_dim_z == input2.m_dim_z); 7 | assert(input.m_dim_x == output.m_dim_x); 8 | assert(input.m_dim_y == output.m_dim_y); 9 | assert(input.m_dim_z == output.m_dim_z); 10 | 11 | if (input.m_dim_x != input2.m_dim_x && input2.m_dim_x == 1) { 12 | // Broadcast input2 (batch size 1) across input's batch dimension 13 | for (int i = 0; i < input.m_dim_x; i++) { 14 | for (int j = 0; j < input.m_dim_y; j++) { 15 | for (int k = 0; k < input.m_dim_z; k++) { 16 | output(i, j, k) = input(i, j, k) + input2(0, j, k); 17 | } 18 | } 19 | } 20 | } else { 21 | throw("Unsupported dimension for batch_Add"); 22 | } 23 | PROFILE_END("batch_Add"); 24 | } 25 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/RotaryPosEmb.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "operators.h" 3 | 4 | __global__ void RotaryPosEmb_cuda_forward(Matrix3D query, Matrix3D key, Matrix3D cos, Matrix3D sin, int start_idx, int len) { 5 | half query_buf[128], key_buf[128]; 6 | 7 | int num_heads = query.m_dim_x; 8 | int head_embed = cos.m_dim_z; 9 | int half_pos = head_embed / 2; 10 | 11 | // Map the block index to a head and the thread index to a token position 12 | int b = blockIdx.x; 13 | int i = threadIdx.x; 14 | 15 | if(b < num_heads && i < len) { 16 | for(int j = 0; j < half_pos; j++) { 17 | query_buf[j] = __hneg(query(b, i, j + half_pos)); 18 |
key_buf[j] = __hneg(key(b, i, j + half_pos)); 19 | } 20 | 21 | for(int j = half_pos; j < head_embed; j++) { 22 | query_buf[j] = query(b, i, j - half_pos); 23 | key_buf[j] = key(b, i, j - half_pos); 24 | } 25 | 26 | for(int j = 0; j < head_embed; j++) { 27 | half cos_half = cos(0, i + start_idx, j); 28 | half sin_half = sin(0, i + start_idx, j); 29 | 30 | query(b, i, j) = __hfma(query(b, i, j), cos_half, __hmul(query_buf[j], sin_half)); 31 | key(b, i, j) = __hfma(key(b, i, j), cos_half, __hmul(key_buf[j], sin_half)); 32 | } 33 | } 34 | } 35 | 36 | __global__ void RotaryPosEmb_cuda_forward_shared(Matrix3D query, Matrix3D key, Matrix3D cos, Matrix3D sin, int start_idx, int len) { 37 | extern __shared__ half shared_memory[]; 38 | 39 | half *query_buf = &shared_memory[0]; 40 | half *key_buf = &shared_memory[4096]; 41 | 42 | int num_heads = query.m_dim_x; 43 | int head_embed = cos.m_dim_z; 44 | int half_pos = head_embed / 2; 45 | 46 | int b = blockIdx.x; 47 | int i = threadIdx.x; 48 | 49 | if(b < num_heads && i < len) { 50 | // Load data into shared memory for faster access. 51 | for(int j = 0; j < half_pos; j++) { 52 | query_buf[threadIdx.x * head_embed + j] = __hneg(query(b, i, j + half_pos)); 53 | key_buf[threadIdx.x * head_embed + j] = __hneg(key(b, i, j + half_pos)); 54 | } 55 | 56 | for(int j = half_pos; j < head_embed; j++) { 57 | query_buf[threadIdx.x * head_embed + j] = query(b, i, j - half_pos); 58 | key_buf[threadIdx.x * head_embed + j] = key(b, i, j - half_pos); 59 | } 60 | 61 | __syncthreads(); // Synchronize to ensure all data is loaded before processing. 62 | 63 | for(int j = 0; j < head_embed; j++) { 64 | half cos_half = cos(0, i + start_idx, j); 65 | half sin_half = sin(0, i + start_idx, j); 66 | 67 | // Use the __hfma intrinsic function for faster multiply-add operations. 
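                // Computes q' = q*cos + rotate_half(q)*sin and k' = k*cos + rotate_half(k)*sin,
                // matching the fp32 reference implementation in RotaryPosEmb.cc.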
68 | query(b, i, j) = __hfma(query(b, i, j), cos_half, __hmul(query_buf[threadIdx.x * head_embed + j], sin_half)); 69 | key(b, i, j) = __hfma(key(b, i, j), cos_half, __hmul(key_buf[threadIdx.x * head_embed + j], sin_half)); 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/batch_add.cu: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | 3 | // __global__ void batch_Add_float(Matrix3D input, Matrix3D input2, Matrix3D output) { 4 | // int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | // int j = blockIdx.y * blockDim.y + threadIdx.y; 6 | // int k = blockIdx.z * blockDim.z + threadIdx.z; 7 | 8 | // if (i < input.m_dim_x && j < input.m_dim_y && k < input.m_dim_z) { 9 | // output(i, j, k) = input(i, j, k) + input2(0, j, k); 10 | // } 11 | // } 12 | 13 | __global__ void batch_Add_cuda(Matrix3D input, Matrix3D input2, Matrix3D output) { 14 | int i = blockIdx.x * blockDim.x + threadIdx.x; 15 | int j = blockIdx.y * blockDim.y + threadIdx.y; 16 | int k = blockIdx.z * blockDim.z + threadIdx.z; 17 | 18 | //// half version 19 | if (i < input.m_dim_x && j < input.m_dim_y && k < input.m_dim_z) { 20 | output(i, j, k) = __hadd(input(i, j, k), input2(0, j, k)); 21 | } 22 | } 23 | 24 | __global__ void batch_Add_cuda_half2(Matrix3D input, Matrix3D input2, Matrix3D output) { 25 | int i = blockIdx.x * blockDim.x + threadIdx.x; 26 | int j = blockIdx.y * blockDim.y + threadIdx.y; 27 | int k = blockIdx.z * blockDim.z + threadIdx.z; 28 | 29 | if (i < input.m_dim_x && j < input.m_dim_y && k < input.m_dim_z / 2) { 30 | half2* input_half2 = reinterpret_cast(input.m_data); 31 | half2* input2_half2 = reinterpret_cast(input2.m_data); 32 | half2* output_half2 = reinterpret_cast(output.m_data); 33 | int input_half2_dim_y = input.m_dim_y; 34 | int input_half2_dim_z = input.m_dim_z / 2; 35 | // int input2_half2_dim_y = input2.m_dim_y; 36 | int input2_half2_dim_z = input2.m_dim_z / 2; 37 | int output_half2_dim_y = output.m_dim_y; 38 | int output_half2_dim_z = output.m_dim_z / 2; 39 | 40 | output_half2[i * output_half2_dim_y * output_half2_dim_z + j * output_half2_dim_z + k] = 41 | __hadd2(input_half2[i * input_half2_dim_y * input_half2_dim_z + j * input_half2_dim_z + k], 42 | input2_half2[j * input2_half2_dim_z + k]); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/embedding.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | __global__ void EmbeddingKernel(Matrix3D input_id, Matrix3D output, float* lookup, int embed_dim) { 7 | int i = blockIdx.x * blockDim.x + threadIdx.x; 8 | 9 | if (i < input_id.m_dim_z) { 10 | int token_id = input_id(0, 0, i); 11 | half* output_sample_ptr = &output.m_data[i * embed_dim]; 12 | float* target_embed = &lookup[token_id * embed_dim]; 13 | 14 | for (int j = 0; j < embed_dim; ++j) { 15 | output_sample_ptr[j] = __float2half(target_embed[j]); 16 | } 17 | } 18 | } 19 | 20 | void load_Embedding_params_cuda(Embedding_cuda& op, std::string prefix) { 21 | op.lookup.load((prefix + "/weight.bin").c_str()); 22 | } 23 | 24 | void Embedding_cuda::forward(Matrix3D input_id, Matrix3D output) { 25 | PROFILE_START(profile_name); 26 | assert(input_id.m_dim_x == 1); 27 | assert(input_id.m_dim_y == 1); 28 | assert(input_id.m_dim_z == output.m_dim_y); 29 | assert(output.m_dim_z == this->embed_dim); 30 | 31 | 
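    // Launch one thread per token id; the ceil-division below sizes the grid
    // (e.g. a 9-token prompt with 1024-thread blocks needs a single block).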
int threadsPerBlock = 1024; 32 | int blocksPerGrid = (input_id.m_dim_z + threadsPerBlock - 1) / threadsPerBlock; 33 | EmbeddingKernel<<<blocksPerGrid, threadsPerBlock>>>(input_id, output, this->lookup.m_data, this->embed_dim); 34 | 35 | PROFILE_END(profile_name); 36 | } 37 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/linear.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "operators.h" 3 | #include "utils.h" 4 | 5 | void Linear_half_int4::forward(const Matrix3D &x, Matrix3D &output) { 6 | const int num_thread = 8; 7 | Matrix3D b = this->weight; 8 | PROFILE_START(profile_name); 9 | 10 | // a: m x k b: n x k c: m x n 11 | assert(output.m_dim_x == 1); 12 | assert(output.m_dim_y == x.m_dim_y); 13 | // assert(output.m_dim_z == weight.m_dim_y); 14 | // assert(x.m_dim_z / 8 == weight.m_dim_z); 15 | 16 | assert(output.m_dim_z > num_thread); 17 | assert(output.m_dim_z % (num_thread * 2) == 0); // unroll column by 2 18 | 19 | struct matmul_params params; 20 | params.A.row = x.m_dim_y; 21 | params.A.column = x.m_dim_z; 22 | params.A.half_data_ptr = x.m_data; 23 | params.B.row = b.m_dim_z; // k 24 | params.B.column = b.m_dim_y; // n 25 | params.B.int32_data_ptr = b.m_data; 26 | params.C.row = output.m_dim_y; 27 | params.C.column = output.m_dim_z; 28 | params.C.half_data_ptr = output.m_data; 29 | params.opt_params.num_thread = num_thread; 30 | params.half_scales = this->scale.m_data; 31 | // params.offset = this->offset.m_data; // TODO: Currently, we don't need offset 32 | params.int32_zero_point = this->zero_point.m_data; 33 | params.block_size = QK; 34 | 35 | matmul::MatmulOperator op = matmul::MatmulOperator(); 36 | op.gemv_forward_cuda(&params); 37 | 38 | PROFILE_END(profile_name); 39 | return; 40 | } 41 | 42 | 43 | void Linear_FP16_int4_ref::forward_ref(const Matrix3D &a, Matrix3D &c) { 44 | Matrix3D b = this->weight; 45 | PROFILE_START(profile_name); 46 | 47 | // a: m x k b: n x k c: m x n 48 | assert(a.m_dim_x == b.m_dim_x); // batch dim 49 | assert(a.m_dim_z == b.m_dim_z); // k 50 | assert(a.m_dim_y == c.m_dim_y); // m 51 | assert(b.m_dim_y == c.m_dim_z / 8); // n 52 | 53 | // batch dim == 1 only support MM for now 54 | assert(a.m_dim_x == 1); 55 | assert(b.m_dim_x == 1); 56 | 57 | struct matmul_params params; 58 | params.A.row = a.m_dim_y; 59 | params.A.column = a.m_dim_z; 60 | params.A.fp16_data_ptr = a.m_data; 61 | params.B.row = b.m_dim_z; 62 | params.B.column = b.m_dim_y; 63 | params.B.int32_data_ptr = b.m_data; 64 | params.C.row = c.m_dim_y; 65 | params.C.column = c.m_dim_z; 66 | params.C.fp16_data_ptr = c.m_data; 67 | params.fp16_scales = this->scale.m_data; 68 | // params.offset = this->offset.m_data; // TODO: Currently, we don't need offset 69 | params.int32_zero_point = this->zero_point.m_data; 70 | params.block_size = QK; 71 | 72 | matmul::MatmulOperator op = matmul::MatmulOperator(); 73 | op.naive_mat_mul_fp16_int4((const struct matmul_params *)&params); 74 | 75 | PROFILE_END(profile_name); 76 | return; 77 | } 78 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/softmax.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "operators.h" 3 | 4 | __global__ void softmax_cuda(Matrix3D input, Matrix3D output) { 5 | // Calculate indices i, j in the input array 6 | int i = blockIdx.x * blockDim.x + threadIdx.x; 7 | int j = blockIdx.y * blockDim.y + threadIdx.y; 8 | 9 | if (i < input.m_dim_x && j <
input.m_dim_y) { 10 | // half max_value = __float2half(-INFINITY); 11 | half max_value = -65504;  // -65504 is the lowest finite half value 12 | half sum = 0; 13 | 14 | // Find the maximum value in the input array 15 | for (int k = 0; k < input.m_dim_z; k++) { 16 | half value = input(i, j, k); 17 | #if defined(__CUDA_ARCH__) 18 | #if __CUDA_ARCH__ >= 860  // Compute Capability >= 8.6 19 | max_value = __hmax(max_value, value); 20 | #else 21 | max_value = __hgt(max_value, value) ? max_value : value; 22 | #endif 23 | #endif 24 | } 25 | 26 | // Compute the sum 27 | for (int k = 0; k < input.m_dim_z; k++) { 28 | half value = input(i, j, k); 29 | // atomicAdd(&sum, value); 30 | sum = __hadd(sum, hexp(__hsub(value, max_value))); 31 | // sum = __hfma(__hsub(value, max_value), sum, sum);  // TODO: Check if this is correct and faster 32 | } 33 | 34 | // Compute the final softmax values 35 | for (int k = 0; k < input.m_dim_z; k++) { 36 | half value = input(i, j, k); 37 | output(i, j, k) = __hdiv(hexp(__hsub(value, max_value)), sum); 38 | } 39 | } 40 | } 41 |
-------------------------------------------------------------------------------- /llm/src/ops/embedding.cc: --------------------------------------------------------------------------------
1 | #include <cstring> 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_Embedding_params(Embedding& op, std::string prefix) { 7 | op.lookup.load((prefix + "/weight.bin").c_str()); 8 | // read_to_array((prefix + "/weight.bin").c_str(), op.lookup.m_data, op.lookup.length()); 9 | } 10 | 11 | void Embedding::forward(Matrix3D<int> input_id, Matrix3D<float> output) { 12 | PROFILE_START(profile_name); 13 | assert(input_id.m_dim_x == 1); 14 | assert(input_id.m_dim_y == 1); 15 | assert(input_id.m_dim_z == output.m_dim_y); 16 | assert(output.m_dim_z == this->embed_dim); 17 | 18 | for (int i = 0; i < input_id.m_dim_z; i++) { 19 | int token_id = input_id(0, 0, i); 20 | float* output_sample_ptr = &output.m_data[i * this->embed_dim]; 21 | float* target_embed = &this->lookup.m_data[token_id * this->embed_dim]; 22 | memcpy(output_sample_ptr, target_embed, sizeof(float) * this->embed_dim); 23 | } 24 | PROFILE_END(profile_name); 25 | } 26 |
-------------------------------------------------------------------------------- /llm/src/ops/softmax.cc: --------------------------------------------------------------------------------
1 | #include <cmath> 2 | 3 | #include "operators.h" 4 | 5 | void softmax(const Matrix3D<float> &input, Matrix3D<float> &output, const int dim) { 6 | PROFILE_START("softmax"); 7 | int len = input.length(); 8 | 9 | if (dim == 2) { 10 | // Apply softmax along the last dimension for each (i, j) row 11 | for (int i = 0; i < input.m_dim_x; i++) { 12 | for (int j = 0; j < input.m_dim_y; j++) { 13 | float max_value = input(i, j, 0); 14 | float sum = 0; 15 | // Find the maximum value in the input array 16 | for (int k = 0; k < input.m_dim_z; k++) { 17 | float value = input(i, j, k); 18 | if (value > max_value) { 19 | max_value = value; 20 | } 21 | } 22 | 23 | // Compute the sum of the exponentials 24 | for (int k = 0; k < input.m_dim_z; k++) { 25 | float value = input(i, j, k); 26 | sum += std::exp(value - max_value); 27 | } 28 | 29 | // Normalize the softmax values and store them in the output array 30 | for (int k = 0; k < input.m_dim_z; k++) { 31 | float value = input(i, j, k); 32 | float final_v = (std::exp(value - max_value) / (sum + 1e-10)); 33 | output(i, j, k) = final_v; 34 | } 35 | } 36 | } 37 | } else { 38 | throw("Unsupported dimension for softmax"); 39 | } 40 | PROFILE_END("softmax"); 41 | } 42 |
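A minimal usage sketch of the reference float softmax above (illustrative only; the softmax_example function and its buffers are not part of the repo):

#include "operators.h"

// Softmax over the last dimension (dim == 2) of a 1 x 1 x 4 tensor.
// Matrix3D<float> wraps a caller-owned buffer, as in the tests below.
void softmax_example() {
    float in_buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float out_buf[4] = {0.0f};
    Matrix3D<float> input(in_buf, 1, 1, 4);
    Matrix3D<float> output(out_buf, 1, 1, 4);
    softmax(input, output, 2);
    // output(0, 0, k) now holds exp(in_buf[k] - 4.0f) / (sum + 1e-10)
}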
-------------------------------------------------------------------------------- /llm/tests/cuda/test_Int4llamaForCausalLM.cu: --------------------------------------------------------------------------------
1 | #include <cstring> 2 | #include <iostream> 3 | 4 | #include "Int4llamaForCausalLM.h" 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | int NUM_THREAD = 8; 9 | 10 | static void Int4LLaMAFreeMemory() { 11 | // Int4LlamaForCausalLM 12 | Int4LlamaForCausalLM LlamaForCausalLM; 13 | LlamaForCausalLM.free_cuda_memory(); 14 | 15 | // Int4llamaDecoder 16 | Int4llamaDecoder llamaDecoder; 17 | llamaDecoder.free_cuda_memory(); 18 | 19 | // Int4llamaDecoderLayer 20 | Int4llamaDecoderLayer llamaDecoderLayer; 21 | llamaDecoderLayer.free_cuda_memory(); 22 | 23 | // Int4llamaAttention 24 | Int4llamaAttention llamaAttention; 25 | llamaAttention.free_cuda_memory(); 26 | } 27 | 28 | void test_Int4LlamaForCausalLM() { 29 | struct model_config config = get_opt_model_config(LLaMA_7B); 30 | const int voc_size = config.vocsize, sqlen = 9, b = 1; 31 | 32 | // reasoning phase: 1st run 33 | int* buffer_1; 34 | cudaMallocManaged(&buffer_1, sizeof(int) * sqlen); 35 | Matrix3D<int> input_ids(buffer_1, b, 1, sqlen); 36 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 37 | struct Int4LlamaForCausalLM_input input_1st = {input_ids}; 38 | 39 | Int4LlamaForCausalLM model = Int4LlamaForCausalLM("INT4/models/LLaMA_7B_2_chat", config); 40 | struct Int4LlamaForCausalLM_output output_1st = model.forward("INT4/models/LLaMA_7B_2_chat", input_1st); 41 | 42 | float* buffer_2; 43 | cudaMallocManaged(&buffer_2, sizeof(float) * b * sqlen * voc_size); 44 | Matrix3D<float> logits(buffer_2, b, sqlen, voc_size); 45 | logits.load("assets/llama/tests/model/1st_logits_cuda.bin"); 46 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 47 | 48 | Profiler::getInstance().report(); 49 | Profiler::getInstance().reset(); 50 | 51 | // generating phase: 2nd run 52 | int* buffer_3; 53 | cudaMallocManaged(&buffer_3, sizeof(int) * sqlen); 54 | Matrix3D<int> input_ids_2nd(buffer_3, b, 1, 1); 55 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 56 | 57 | struct Int4LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 58 | struct Int4LlamaForCausalLM_output output_2nd = model.forward("INT4/models/LLaMA_7B_2_chat", input_2nd); 59 | 60 | float* buffer_4; 61 | cudaMallocManaged(&buffer_4, sizeof(float) * b * 1 * voc_size); 62 | logits = Matrix3D<float>(buffer_4, b, 1, voc_size); 63 | logits.load("assets/llama/tests/model/2nd_logits_cuda.bin"); 64 | 65 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 66 | 67 | Profiler::getInstance().report(); 68 | 69 | if (!success) 70 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 71 | else 72 | std::cout << "-------- Test of " << __func__ << ": Passed!
-------- " << std::endl; 73 | 74 | // Free memory 75 | free_aligned_memory_gpu(buffer_1); 76 | free_aligned_memory_gpu(buffer_2); 77 | free_aligned_memory_gpu(buffer_3); 78 | free_aligned_memory_gpu(buffer_4); 79 | Int4LLaMAFreeMemory(); 80 | } 81 | 82 | int main() { test_Int4LlamaForCausalLM(); } 83 | -------------------------------------------------------------------------------- /llm/tests/non_cuda/test_Int4llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../utils_memalloc.h" 5 | #include "Int4llamaForCausalLM.h" 6 | #include "operators.h" 7 | #include "utils.h" 8 | 9 | int NUM_THREAD = 8; 10 | 11 | void test_Int4LlamaForCausalLM() { 12 | struct model_config config = get_opt_model_config(LLaMA_7B); 13 | const int num_heads = config.num_heads, embed_dim = config.embed_dim, sqlen = 600, b = 1, 14 | hidden_dim = config.hidden_dim; 15 | const int voc_size = config.vocsize, padding_idx = 1, num_layers = config.num_layers; 16 | MemoryAllocator mem_buf; 17 | 18 | // reasoning phase: 1st run 19 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 20 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 21 | struct Int4LlamaForCausalLM_input input_1st = {input_ids}; 22 | 23 | Int4LlamaForCausalLM model = Int4LlamaForCausalLM("models/LLaMA_7B", config); 24 | 25 | struct Int4LlamaForCausalLM_output output_1st = model.forward("models/LLaMA_7B", input_1st); 26 | 27 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 28 | logits.load("assets/llama/tests/model/1st_logits.bin"); 29 | // print_first_k_elelment("O", output_1st.logits.m_data, 20); 30 | // print_first_k_elelment("G", logits.m_data, 20); 31 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 32 | 33 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 34 | embed_dim / num_heads); 35 | Profiler::getInstance().report(); 36 | Profiler::getInstance().reset(); 37 | 38 | // generating phase: 2nd run 39 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 40 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 41 | struct Int4LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 42 | 43 | struct Int4LlamaForCausalLM_output output_2nd; 44 | for (int i = 0; i < 10; i++) output_2nd = model.forward("models/LLaMA_7B", input_2nd); 45 | 46 | logits = Matrix3D(mem_buf.get_fpbuffer(b * 1 * voc_size), b, 1, voc_size); 47 | logits.load("assets/llama/tests/model/2nd_logits.bin"); 48 | // print_first_k_elelment("O", output_2nd.logits.m_data, 20); 49 | // print_first_k_elelment("G", logits.m_data, 20); 50 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 51 | 52 | Profiler::getInstance().report(); 53 | if (!success) 54 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 55 | else 56 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 57 | } 58 | 59 | int main() { 60 | // This tests are directly from fp32 and are not completed yet! 
61 | test_Int4LlamaForCausalLM(); 62 | } 63 |
-------------------------------------------------------------------------------- /llm/tests/test_Fp32OPTAttention.cc: --------------------------------------------------------------------------------
1 | #include "Fp32OPTAttention.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | int NUM_THREAD = 8; 7 | 8 | void test_Fp32OPTAttention() { 9 | const int num_heads = 12, embed_dim = 768, sqlen = 2, b = 1; 10 | MemoryAllocator mem_buf; 11 | 12 | Fp32OPTAttention::initialized_memory(get_opt_model_config(OPT_125M)); 13 | Fp32OPTAttention attn = 14 | Fp32OPTAttention("FP32/models/OPT_125m/decoder/layer0/self_attn", get_opt_model_config(OPT_125M)); 15 | 16 | Matrix3D<float> hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 17 | hidden_states.load("assets/OPT/tests/attn/OPT_125m/Fp32_atten_input.bin"); 18 | Matrix3D<float> attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 19 | attention_mask.load("assets/OPT/tests/attn/OPT_125m/Fp32_atten_mask.bin"); 20 | struct Fp32OPTAttention_input input(hidden_states, attention_mask, 0); 21 | 22 | struct Fp32OPTAttention_output output = attn.forward(input); 23 | 24 | Matrix3D<float> attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 25 | attn_outputGT.load("assets/OPT/tests/attn/OPT_125m/Fp32_atten_output.bin"); 26 | 27 | bool success = check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, b * sqlen * embed_dim); 28 | if (!success) 29 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 30 | else 31 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 32 | } 33 | 34 | void test_Fp32OPTAttention_1_3B() { 35 | const int embed_dim = 2048, sqlen = 2, b = 1; 36 | MemoryAllocator mem_buf; 37 | 38 | Fp32OPTAttention::initialized_memory(get_opt_model_config(OPT_1_3B)); 39 | Fp32OPTAttention attn = 40 | Fp32OPTAttention("FP32/models/OPT_1.3B/decoder/layer0/self_attn", get_opt_model_config(OPT_1_3B)); 41 | 42 | Matrix3D<float> hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 43 | hidden_states.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_atten_input.bin"); 44 | Matrix3D<float> attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 45 | attention_mask.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_atten_mask.bin"); 46 | struct Fp32OPTAttention_input input(hidden_states, attention_mask, 0); 47 | 48 | struct Fp32OPTAttention_output output = attn.forward(input); 49 | 50 | Matrix3D<float> attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 51 | attn_outputGT.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_atten_output.bin"); 52 | 53 | bool success = check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, b * sqlen * embed_dim, 1e-5); 54 | if (!success) 55 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 56 | else 57 | std::cout << "-------- Test of " << __func__ << ": Passed!
-------- " << std::endl; 58 | } 59 | 60 | int main() { 61 | test_Fp32OPTAttention(); 62 | test_Fp32OPTAttention_1_3B(); 63 | } 64 | -------------------------------------------------------------------------------- /llm/tests/test_Fp32OPTDecoder.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32OPTDecoder.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | int NUM_THREAD = 8; 7 | 8 | void test_Fp32OPTDecoder() { 9 | const int embed_dim = 2048, sqlen = 2, b = 1; 10 | MemoryAllocator mem_buf; 11 | 12 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 13 | input_ids.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_decoder_1st_input_ids.bin"); 14 | struct Fp32OPTDecoder_input input_1st = {input_ids}; 15 | 16 | Fp32OPTDecoder decoder = Fp32OPTDecoder("FP32/models/OPT_1.3B/decoder", get_opt_model_config(OPT_1_3B)); 17 | 18 | struct Fp32OPTDecoder_output output_1st = decoder.forward(input_1st); 19 | 20 | // reasoning phase: 1st run 21 | Matrix3D last_hidden_state1_GT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 22 | last_hidden_state1_GT.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_decoder_1st_last_hidden_state.bin"); 23 | 24 | print_first_k_elelment("Fp32_decoder_1st_last_hidden_state", last_hidden_state1_GT.m_data, 20); 25 | print_first_k_elelment("output_1st", output_1st.last_hidden_state.m_data, 20); 26 | bool success = check_two_equal(output_1st.last_hidden_state.m_data, last_hidden_state1_GT.m_data, 27 | last_hidden_state1_GT.length(), 1e-5); 28 | 29 | if (!success) 30 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 31 | else 32 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 33 | } 34 | 35 | int main() { test_Fp32OPTDecoder(); } 36 | -------------------------------------------------------------------------------- /llm/tests/test_Fp32OPTDecoderLayer.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32OPTDecoderLayer.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | int NUM_THREAD = 8; 7 | 8 | void test_Fp32OPTDecoderLayer() { 9 | const int num_heads = 12, embed_dim = 768, sqlen = 2, b = 1; 10 | MemoryAllocator mem_buf; 11 | 12 | Fp32OPTDecoderLayer layer = 13 | Fp32OPTDecoderLayer("FP32/models/OPT_125m/decoder/layer0", get_opt_model_config(OPT_125M), 0); 14 | 15 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 16 | hidden_states.load("assets/OPT/tests/attn/OPT_125m/Fp32_layer_input.bin"); 17 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | attention_mask.load("assets/OPT/tests/attn/OPT_125m/Fp32_layer_mask.bin"); 19 | struct Fp32OPTDecoderLayer_input input(hidden_states, attention_mask); 20 | 21 | struct Fp32OPTDecoderLayer_output output = layer.forward(input); 22 | 23 | Matrix3D attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 24 | attn_outputGT.load("assets/OPT/tests/attn/OPT_125m/Fp32_layer_output.bin"); 25 | 26 | bool success = check_two_equal(attn_outputGT.m_data, output.hidden_states.m_data, b * sqlen * embed_dim, 1e-4); 27 | if (!success) 28 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 29 | else 30 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 31 | } 32 | 33 | void test_Fp32OPTDecoderLayer_1_3B() { 34 | const int embed_dim = 2048, sqlen = 2, b = 1; 35 | MemoryAllocator mem_buf; 36 | 37 | Fp32OPTDecoderLayer layer = 38 | Fp32OPTDecoderLayer("FP32/models/OPT_1.3B/decoder/layer0", get_opt_model_config(OPT_1_3B), 0); 39 | 40 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 41 | hidden_states.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_layer_input.bin"); 42 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 43 | attention_mask.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_layer_mask.bin"); 44 | struct Fp32OPTDecoderLayer_input input(hidden_states, attention_mask); 45 | 46 | struct Fp32OPTDecoderLayer_output output = layer.forward(input); 47 | 48 | Matrix3D attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 49 | attn_outputGT.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_layer_output.bin"); 50 | 51 | bool success = check_two_equal(attn_outputGT.m_data, output.hidden_states.m_data, b * sqlen * embed_dim, 1e-4); 52 | if (!success) 53 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 54 | else 55 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 56 | } 57 | 58 | int main() { 59 | test_Fp32OPTDecoderLayer(); 60 | test_Fp32OPTDecoderLayer_1_3B(); 61 | } 62 | -------------------------------------------------------------------------------- /llm/tests/test_Fp32OPTForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32OPTForCausalLM.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | int NUM_THREAD = 8; 7 | 8 | void test_Fp32OPTForCausalLM() { 9 | struct model_config config = get_opt_model_config(OPT_1_3B); 10 | const int embed_dim = config.embed_dim, sqlen = 2, b = 1; 11 | const int voc_size = config.vocsize; 12 | MemoryAllocator mem_buf; 13 | 14 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 15 | input_ids.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_decoder_1st_input_ids.bin"); 16 | struct Fp32OPTForCausalLM_input input_1st = {input_ids}; 17 | 18 | Fp32OPTForCausalLM model = Fp32OPTForCausalLM("FP32/models/OPT_1.3B", get_opt_model_config(OPT_1_3B)); 19 | 20 | struct Fp32OPTForCausalLM_output output_1st = model.forward(input_1st); 21 | 22 | // reasoning phase: 1st run 23 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 24 | logits.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_causallm_logits.bin"); 25 | 26 | // print_first_k_elelment("logits", logits.m_data, 20); 27 | // print_first_k_elelment("output_1st.logits.m_data", output_1st.logits.m_data, 20); 28 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-5); 29 | 30 | if (!success) 31 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 32 | else 33 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 34 | } 35 | 36 | int main() { test_Fp32OPTForCausalLM(); } 37 | -------------------------------------------------------------------------------- /llm/tests/test_Fp32llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Fp32llamaForCausalLM.h" 5 | #include "operators.h" 6 | #include "utils.h" 7 | #include "utils_memalloc.h" 8 | 9 | int NUM_THREAD = 8; 10 | 11 | void test_Fp32LlamaForCausalLM() { 12 | struct model_config config = get_opt_model_config(LLaMA_7B); 13 | const int num_heads = config.num_heads, embed_dim = config.embed_dim, sqlen = 9, b = 1, 14 | hidden_dim = config.hidden_dim; 15 | const int voc_size = config.vocsize, padding_idx = 1, num_layers = config.num_layers; 16 | MemoryAllocator mem_buf; 17 | 18 | // reasoning phase: 1st run 19 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 20 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 21 | struct Fp32LlamaForCausalLM_input input_1st = {input_ids}; 22 | 23 | Fp32LlamaForCausalLM model = Fp32LlamaForCausalLM("models/LLaMA_7B", config); 24 | 25 | struct Fp32LlamaForCausalLM_output output_1st = model.forward(input_1st); 26 | 27 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 28 | logits.load("assets/llama/tests/model/1st_logits.bin"); 29 | // print_first_k_elelment("O", output_1st.logits.m_data, 20); 30 | // print_first_k_elelment("G", logits.m_data, 20); 31 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 32 | 33 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 34 | embed_dim / num_heads); 35 | Profiler::getInstance().report(); 36 | Profiler::getInstance().reset(); 37 | 38 | // generating phase: 2nd run 39 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 40 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 41 | struct Fp32LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 42 | 43 | struct Fp32LlamaForCausalLM_output output_2nd = model.forward(input_2nd); 44 | 45 | logits = Matrix3D(mem_buf.get_fpbuffer(b * 1 * voc_size), b, 1, voc_size); 46 | logits.load("assets/llama/tests/model/2nd_logits.bin"); 47 | // print_first_k_elelment("O", output_2nd.logits.m_data, 20); 48 | // print_first_k_elelment("G", logits.m_data, 20); 49 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 50 | 51 | Profiler::getInstance().report(); 52 | if (!success) 53 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 54 | else 55 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 56 | } 57 | 58 | int main() { test_Fp32LlamaForCausalLM(); } 59 | -------------------------------------------------------------------------------- /llm/tests/test_LLaMATokenizer.cc: -------------------------------------------------------------------------------- 1 | #include "LLaMATokenizer.h" 2 | 3 | int NUM_THREAD = 8; 4 | 5 | static const std::map> &test_LLaMATokenizer() { 6 | static std::map> llama_answer = { 7 | /* 1. */ { 8 | "Hello World", 9 | { 10 | 1, 11 | 10994, 12 | 2787, 13 | }, 14 | }, 15 | /* 2. */ 16 | { 17 | " Hello World!", 18 | { 19 | 1, 20 | 15043, 21 | 2787, 22 | 29991, 23 | }, 24 | }, 25 | /* 3. 
*/ 26 | { 27 | "This is Tiny LLM Engine.", 28 | { 29 | 1, 30 | 4013, 31 | 338, 32 | 323, 33 | 4901, 34 | 365, 35 | 26369, 36 | 10863, 37 | 29889, 38 | }, 39 | }, 40 | /* 4. */ 41 | { 42 | "Please introduce Massachusetts Institute of Technology (MIT)", 43 | { 44 | 1, 45 | 12148, 46 | 14944, 47 | 16167, 48 | 8907, 49 | 310, 50 | 17968, 51 | 313, 52 | 26349, 53 | 29897, 54 | }, 55 | }, 56 | /* 5. */ 57 | { 58 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume " 59 | "they don't have basic concepts.", 60 | { 61 | 1, 8893, 292, 263, 4700, 508, 367, 2309, 297, 29871, 29896, 29900, 2560, 6576, 29889, 910, 2643, 62 | 338, 363, 2498, 2305, 29892, 577, 591, 5251, 896, 1016, 29915, 29873, 505, 6996, 22001, 29889, 63 | }, 64 | }, 65 | }; 66 | 67 | return llama_answer; 68 | }; 69 | 70 | int main(int argc, char **argv) { 71 | // load the vocab 72 | const std::string fname = "models/llama_vocab.bin"; 73 | llama_vocab vocab = llama_init_vocab(fname.c_str()); 74 | 75 | bool is_equal = true; 76 | int test_count = 1; 77 | for (const auto &llama_answer : test_LLaMATokenizer()) { 78 | std::vector<int> input_ids(llama_answer.first.size()); 79 | const int n = llama_tokenize(vocab, llama_answer.first.c_str(), input_ids.data(), input_ids.size(), true); 80 | input_ids.resize(n); 81 | 82 | is_equal = is_equal && (input_ids.size() == llama_answer.second.size()); 83 | 84 | for (int i = 0; i < (int)input_ids.size() && is_equal; ++i) { 85 | if (input_ids[i] != llama_answer.second[i]) { 86 | is_equal = false; 87 | } 88 | } 89 | 90 | test_count++; 91 | } 92 | 93 | if (!is_equal) 94 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 95 | else 96 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 97 | 98 | return 0; 99 | } 100 |
-------------------------------------------------------------------------------- /llm/tests/test_OPTGenerate.cc: --------------------------------------------------------------------------------
1 | #include <iostream> 2 | 3 | #include "Generate.h" 4 | 5 | int NUM_THREAD = 8; 6 | 7 | int main() { 8 | // std::vector<int> input_ids = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 9 | //                               13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 10 | std::string vocab_file = "./models/OPT_125m/vocab.json"; 11 | std::string bpe_file = "./models/OPT_125m/merges.txt"; 12 | 13 | Encoder encoder = get_encoder(vocab_file, bpe_file); 14 | std::vector<int> input_ids = encoder.encode("John went to MIT and study Computer Science."); 15 | 16 | std::string decoded = encoder.decode(input_ids); 17 | std::cout << "input:" << decoded << std::endl; 18 | 19 | OPTForCausalLM model = OPTForCausalLM("models/OPT_125m", get_opt_model_config(OPT_125M)); 20 | const struct opt_params generation_config; 21 | std::vector<int> generated_ids = OPTGenerate(&model, OPT_INT8, input_ids, generation_config); 22 | 23 | decoded = encoder.decode(generated_ids); 24 | std::cout << "generated:" << decoded << std::endl; 25 | }; 26 |
-------------------------------------------------------------------------------- /llm/tests/test_OPTTokenizer.cc: --------------------------------------------------------------------------------
1 | #include <iostream> 2 | 3 | #include "OPTTokenizer.h" 4 | 5 | int NUM_THREAD = 8; 6 | 7 | void test_OPTEncode() { 8 | std::string bpe_file = "models/opt_merges.txt"; 9 | std::string vocab_file = "models/opt_vocab.json"; 10 | 11 | Encoder encoder = get_encoder(vocab_file, bpe_file); 12 | std::vector<int> encoded = encoder.encode( 13 | "Building
a website can be done in 10 simple steps. This message is for general people, so we assume they " 14 | "don't have basic concepts."); 15 | std::vector<int> encoded_answer = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 16 | 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 17 | bool is_equal = true; 18 | for (int i = 0; i < (int)encoded.size(); i++) { 19 | if (encoded[i] != encoded_answer[i]) { 20 | is_equal = false; 21 | break; 22 | } 23 | } 24 | if (!is_equal) 25 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 26 | else 27 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 28 | } 29 | 30 | void test_OPTDecode() { 31 | std::string bpe_file = "models/opt_merges.txt"; 32 | std::string vocab_file = "models/opt_vocab.json"; 33 | 34 | 35 | Encoder encoder = get_encoder(vocab_file, bpe_file); 36 | std::vector<int> encoded_answer = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 37 | 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 38 | std::string decoded = encoder.decode(encoded_answer); 39 | std::string decoded_answer = 40 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume they " 41 | "don't have basic concepts."; 42 | bool is_equal = true; 43 | if (decoded != decoded_answer) is_equal = false; 44 | if (!is_equal) 45 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 46 | else 47 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 48 | } 49 | 50 | int main() { 51 | test_OPTEncode(); 52 | test_OPTDecode(); 53 | }; 54 |
-------------------------------------------------------------------------------- /llm/tests/utils_memalloc.h: --------------------------------------------------------------------------------
1 | #include "utils.h" 2 | class MemoryAllocator { 3 | // TODO: use allocate_aligned_memory instead!
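// Note: this test-only allocator never frees the buffers it hands out; each
// get_*buffer() call below allocates fresh aligned memory, and `counter` is unused.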
4 | public: 5 | MemoryAllocator() { this->counter = 0; } 6 | float* get_fpbuffer(int size) { 7 | float* ptr; 8 | allocate_aligned_memory(ptr, size * sizeof(float)); 9 | return ptr; 10 | } 11 | int8_t* get_int8buffer(int size) { 12 | int8_t* ptr; 13 | allocate_aligned_memory(ptr, size * sizeof(int8_t)); 14 | return ptr; 15 | } 16 | int* get_intbuffer(int size) { 17 | int* ptr; 18 | allocate_aligned_memory(ptr, size * sizeof(int)); 19 | return ptr; 20 | } 21 | 22 | private: 23 | int counter; 24 | }; 25 | -------------------------------------------------------------------------------- /llm/tools/copy_rotary_emb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copy from layer 0 to layer 31 4 | for i in {0..31}; do 5 | cp -r INT4/models/CodeLLaMA_7B_Instruct/decoder/layer${i}/self_attn/rotary_emb/* INT4/models/Mistral_7B/decoder/layer${i}/self_attn/rotary_emb/ 6 | done -------------------------------------------------------------------------------- /llm/tools/download_assets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # List of files to download, their corresponding MD5 checksums, and target local paths 4 | files_and_checksums=( 5 | "https://huggingface.co/mit-han-lab/tinychatengine-model-zoo/resolve/main/assets.zip?download=true 8527788105acccfada9c89d075fa8764 assets.zip" 6 | ) 7 | 8 | OS=`uname` 9 | 10 | # Function to download a file if it doesn't exist or if its MD5 checksum is incorrect 11 | download_if_needed() { 12 | url="$1" 13 | expected_md5="$2" 14 | target_path="$3" 15 | 16 | # Ensure the target directory exists 17 | target_dir=$(dirname "$target_path") 18 | mkdir -p "$target_dir" 19 | 20 | # Download the file if it does not exist 21 | if [ ! -e "$target_path" ]; then 22 | echo "File '$target_path' does not exist. Downloading..." 23 | wget -q -O "$target_path" "$url" 24 | fi 25 | 26 | # Use md5 on MacOS 27 | if [ $OS = "Darwin" ] 28 | then 29 | actual_md5=$(md5 -q "$target_path") 30 | # Use md5sum on Ubuntu 31 | elif [ $OS = "Linux" ] 32 | then 33 | actual_md5=$(md5sum "$target_path" | cut -d ' ' -f1) 34 | fi 35 | 36 | if [ "$actual_md5" != "$expected_md5" ]; then 37 | echo "MD5 checksum for '$target_path' is incorrect. Downloading again..." 38 | wget -q -O "$target_path" "$url" 39 | else 40 | echo "File '$target_path' exists and its MD5 checksum is correct." 
41 | fi 42 | } 43 | 44 | # Process each file, its corresponding MD5 checksum, and target local path 45 | for file_and_checksum in "${files_and_checksums[@]}"; do 46 | url=$(echo "$file_and_checksum" | awk '{ print $1 }') 47 | expected_md5=$(echo "$file_and_checksum" | awk '{ print $2 }') 48 | target_path=$(echo "$file_and_checksum" | awk '{ print $3 }') 49 | 50 | download_if_needed "$url" "$expected_md5" "$target_path" 51 | unzip "$target_path" 52 | done 53 | -------------------------------------------------------------------------------- /llm/tools/export_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # # E.g., Quantize and export Mistral-7B model 4 | # python tools/mistral_exporter.py --model ../../llm-awq-mistral/quant_cache/mistral-7b-w4-g32-awq-v2.pt --output models/Mistral_7B 5 | # python tools/rotary_emb_exporter.py 6 | # # For x86 7 | # python tools/model_quantizer.py --model_path models/Mistral_7B --method QM_x86 8 | # mkdir Mistral_7B_for_x86 9 | # mkdir Mistral_7B_for_x86/INT4 10 | # mkdir Mistral_7B_for_x86/INT4/models 11 | # mv INT4/models/Mistral_7B Mistral_7B_for_x86/INT4/models 12 | # cd Mistral_7B_for_x86/ 13 | # zip -r Mistral_7B_v0.2_Instruct.zip INT4 14 | # cd .. 15 | # # For ARM 16 | # python tools/model_quantizer.py --model_path models/Mistral_7B --method QM_ARM 17 | # mkdir Mistral_7B_for_ARM 18 | # mkdir Mistral_7B_for_ARM/INT4 19 | # mkdir Mistral_7B_for_ARM/INT4/models 20 | # mv INT4/models/Mistral_7B Mistral_7B_for_ARM/INT4/models 21 | # cd Mistral_7B_for_ARM/ 22 | # zip -r Mistral_7B_v0.2_Instruct.zip INT4 23 | # cd .. 24 | # # fp32 25 | # mkdir Mistral_7B_FP32 26 | # mkdir Mistral_7B_FP32/models 27 | # mv models/Mistral_7B Mistral_7B_FP32/models 28 | # cd Mistral_7B_FP32/ 29 | # zip -r Mistral_7B_v0.2_Instruct.zip models 30 | # cd .. 31 | 32 | 33 | # E.g., Quantize and export LLaMA3-8B model 34 | python tools/llama3_exporter.py --model ../../llm-awq/quant_cache/llama3-8b-w4-g32-awq-v2.pt --output models/LLaMA_3_8B_Instruct 35 | python tools/rotary_emb_exporter.py 36 | # For ARM 37 | python tools/model_quantizer.py --model_path models/LLaMA_3_8B_Instruct --method QM_ARM 38 | mkdir LLaMA_3_8B_Instruct_for_ARM 39 | mkdir LLaMA_3_8B_Instruct_for_ARM/INT4 40 | mkdir LLaMA_3_8B_Instruct_for_ARM/INT4/models 41 | mv INT4/models/LLaMA_3_8B_Instruct LLaMA_3_8B_Instruct_for_ARM/INT4/models 42 | cd LLaMA_3_8B_Instruct_for_ARM/ 43 | zip -r LLaMA_3_8B_Instruct.zip INT4 44 | cd .. 45 | # For x86 46 | python tools/model_quantizer.py --model_path models/LLaMA_3_8B_Instruct --method QM_x86 47 | mkdir LLaMA_3_8B_Instruct_for_x86 48 | mkdir LLaMA_3_8B_Instruct_for_x86/INT4 49 | mkdir LLaMA_3_8B_Instruct_for_x86/INT4/models 50 | mv INT4/models/LLaMA_3_8B_Instruct LLaMA_3_8B_Instruct_for_x86/INT4/models 51 | cd LLaMA_3_8B_Instruct_for_x86/ 52 | zip -r LLaMA_3_8B_Instruct.zip INT4 53 | cd .. 54 | # fp32 55 | mkdir LLaMA_3_8B_Instruct_FP32 56 | mkdir LLaMA_3_8B_Instruct_FP32/models 57 | mv models/LLaMA_3_8B_Instruct LLaMA_3_8B_Instruct_FP32/models 58 | cd LLaMA_3_8B_Instruct_FP32/ 59 | zip -r LLaMA_3_8B_Instruct.zip models 60 | cd .. 
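# The per-target sections above repeat the same mkdir/mv/zip pattern. A possible
# consolidation (untested sketch; the package_model name and its arguments are
# illustrative, not part of this repo):
#   package_model() {  # usage: package_model <model_name> <out_dir> <subdir>
#       mkdir -p "$2/$3/models"
#       mv "$3/models/$1" "$2/$3/models"
#       (cd "$2" && zip -r "$1.zip" "$3")
#   }
#   e.g. package_model LLaMA_3_8B_Instruct LLaMA_3_8B_Instruct_for_ARM INT4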
61 |
-------------------------------------------------------------------------------- /llm/tools/profile.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | 3 | make clean && make -j 4 | 5 | # Find all executable files in the current directory starting with 'profile_' 6 | for file in profile_*; do 7 |     # Check if the file is executable 8 |     if [ -x "$file" ]; then 9 |         echo "Running '$file'..." 10 |         ./"$file" 11 |         exit_code=$? 12 |     fi 13 | done 14 |
-------------------------------------------------------------------------------- /llm/tools/quantize_and_upload.py: --------------------------------------------------------------------------------
1 | """A script to quantize supported models and upload them to the model zoo. 2 | 3 | Example usage: 4 |    python quantize_and_upload.py --method <method> --token <token> 5 | 6 | Note: This script is for developers. 7 | """ 8 | import argparse 9 | import hashlib 10 | import os 11 | 12 | from upload import subebackups 13 | 14 | model_paths = ["models/LLaMA_13B_2_chat"] 15 | 16 | quantized_dir = "INT4" 17 | db_prefix = "/MIT/transformer_assets/" 18 | 19 | 20 | def _get_md5sum(file_path): 21 |     hash_md5 = hashlib.md5() 22 |     with open(file_path, "rb") as f: 23 |         for chunk in iter(lambda: f.read(4096), b""): 24 |             hash_md5.update(chunk) 25 |     return hash_md5.hexdigest() 26 | 27 | 28 | def main(): 29 |     """Take arguments and quantize all models and upload to dropbox.""" 30 | 31 |     def _get_parser(): 32 |         parser = argparse.ArgumentParser(description="Quantize model") 33 |         parser.add_argument("--model_path", type=str, help="Path of the model to quantize", default=None) 34 |         parser.add_argument("--method", type=str, help="Quantization method") 35 |         parser.add_argument("--token", help="Your Dropbox OAuth2 token.") 36 |         return parser 37 | 38 |     parser = _get_parser() 39 |     args = parser.parse_args() 40 | 41 |     if args.method not in ["QM_x86", "QM_ARM", "QM_CUDA", "FP32", "INT8"]: 42 |         raise ValueError("expect method to be one of ['QM_x86', 'QM_ARM', 'QM_CUDA', 'FP32', 'INT8']") 43 |     QM_method = args.method 44 | 45 |     if args.model_path: 46 |         target_paths = [args.model_path] 47 |     else: 48 |         target_paths = model_paths 49 | 50 |     for model_path in target_paths: 51 |         # quantize 52 |         if args.method in ["QM_x86", "QM_CUDA", "QM_ARM"]: 53 |             out_dir = quantized_dir 54 |             quantize_cmd = ( 55 |                 f"python model_quantizer.py --model_path {model_path} --method {QM_method} --output_path {out_dir}" 56 |             ) 57 |             os.system(quantize_cmd) 58 |         else: 59 |             out_dir = "./" 60 |         # zip 61 |         print("zipping...") 62 |         model_name_size = model_path.rsplit("/", maxsplit=1)[-1] 63 |         zip_path = "/tmp/" + model_name_size + ".zip" 64 |         zip_cmd = f"zip -qq -r {zip_path} {os.path.join(out_dir, model_path)}" 65 |         os.system(zip_cmd) 66 |         # md5sum 67 |         print(f"md5sum is {_get_md5sum(zip_path)}.") 68 |         print("uploading...") 69 |         # upload 70 |         upload_path = os.path.join(db_prefix, QM_method, model_name_size + ".zip") 71 |         subebackups(zip_path, upload_path, args.token) 72 |         print("removing temporary zip file...") 73 |         # rm zip 74 |         os.system(f"rm {zip_path}") 75 | 76 | 77 | if __name__ == "__main__": 78 |     main() 79 |
-------------------------------------------------------------------------------- /llm/tools/quantize_constants.py: --------------------------------------------------------------------------------
1 | STORE_FP16 = False 2 | 3 | QK4_0 = 32 4 | QK4_1 = 32 5 | QK4_2 = 32 6 | QK4_3 = 32 7 | QK4_5 = 128 8 | QK4_6 = 128 9 |
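The QK4_* values above are presumably the group (block) sizes used by the INT4 quantization tools (cf. params.block_size = QK in linear.cu). As a rough illustration of what a group size of 32 means, this standalone sketch quantizes one group of weights with a single shared scale; the quantize_group name and the symmetric scheme are illustrative assumptions, not the repo's actual packing logic:

import numpy as np

QK = 32  # group size, matching QK4_0 above


def quantize_group(weights):
    """Quantize one group of QK floats to int4 values sharing one scale (sketch)."""
    assert weights.shape == (QK,)
    scale = float(np.max(np.abs(weights))) / 7.0  # symmetric int4 range is [-8, 7]
    if scale == 0.0:
        scale = 1.0
    q = np.clip(np.round(weights / scale), -8, 7).astype(np.int8)
    return q, scale  # approximate dequantization: q * scale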
-------------------------------------------------------------------------------- /llm/tools/test.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | 3 | make clean && make -j 4 | 5 | # Find all executable files in the current directory starting with 'test_' 6 | for file in test_*; do 7 |     # Check if the file is executable 8 |     if [ -x "$file" ]; then 9 |         echo "Running '$file'..." 10 |         ./"$file" 11 |         exit_code=$? 12 |     fi 13 | done 14 |
-------------------------------------------------------------------------------- /llm/tools/upload.py: --------------------------------------------------------------------------------
1 | """ Python script to upload models to Hugging Face. 2 | 3 | Usage: 4 |    python tools/upload.py --filename <filename> --QM <QM> --hf_token <hf_token> 5 | 6 | Example commandline: 7 |    python tools/upload.py --filename LLaMA_3_8B_Instruct.zip --QM QM_ARM --hf_token <hf_token> 8 | """ 9 | import argparse 10 | import hashlib 11 | import os 12 | import zipfile 13 | 14 | import requests 15 | from tqdm import tqdm 16 | from huggingface_hub import HfApi 17 | 18 | 19 | def _upload_file_to_HF(filename, folder_name, hf_token): 20 |     # Check if the file is a zip file 21 |     if zipfile.is_zipfile(filename): 22 |         print(f"Start uploading the model to Huggingface: mit-han-lab/tinychatengine-model-zoo/{folder_name}/{filename}") 23 |         api = HfApi() 24 |         api.upload_file( 25 |             path_or_fileobj=filename, 26 |             path_in_repo=f"{folder_name}/{filename}", 27 |             repo_id="mit-han-lab/tinychatengine-model-zoo", 28 |             repo_type="model", 29 |             commit_message="Upload models", 30 |             token=hf_token 31 |         ) 32 |         print(f"File uploaded successfully: mit-han-lab/tinychatengine-model-zoo/{folder_name}/{filename}") 33 |     else: 34 |         print(f"The file is not a zip file: {filename}") 35 | 36 | def _remove_file(filepath): 37 |     if os.path.isfile(filepath): 38 |         os.remove(filepath) 39 |         print(f"File removed successfully: {filepath}") 40 |     else: 41 |         print(f"Error: {filepath} not a valid filename") 42 | 43 | def _main(): 44 |     parser = argparse.ArgumentParser(description="Upload a model zip file to Hugging Face") 45 |     parser.add_argument("--filename", help="The name of the file to upload.") 46 |     parser.add_argument("--QM", default="FP32", help="Quantization method.") 47 |     parser.add_argument("--hf_token", help="Huggingface write token.") 48 |     parser.add_argument("--remove_file", action="store_true", help="Remove the file after uploading.") 49 |     args = parser.parse_args() 50 | 51 |     Qmodels = ["FP32", "QM_ARM", "QM_x86", "QM_CUDA", "INT8"] 52 | 53 |     if args.QM not in Qmodels: 54 |         raise NotImplementedError(f"{args.QM} is not supported.") 55 | 56 |     _upload_file_to_HF(args.filename, args.QM, args.hf_token)  # Upload the file to Huggingface 57 | 58 |     if args.remove_file: 59 |         _remove_file(args.filename)  # Remove the zip file 60 | 61 | 62 | if __name__ == "__main__": 63 |     _main() 64 |
-------------------------------------------------------------------------------- /llm/tools/upload_to_dropbox.py: --------------------------------------------------------------------------------
1 | """ DEPRECATED: This script is deprecated. Please use `upload.py` to upload models to Hugging Face instead. 2 | 3 | Uploading models and assets to the Dropbox storage.
4 | 5 | Example commandline: 6 |    python upload_to_dropbox.py <your_dropbox_token> 7 | """ 8 | import argparse 9 | import os 10 | 11 | import dropbox 12 | 13 | files_to_upload = [ 14 |     "CodeLLaMA_13B_Instruct.zip", 15 |     "CodeLLaMA_7B_Instruct.zip", 16 |     # "LLaMA_13B_2_chat.zip", 17 |     # "LLaMA_7B_2_chat.zip", 18 |     # "assets.zip", 19 | ] 20 | 21 | 22 | def subebackups(file_path, target_path, token): 23 |     """Upload a file to the Dropbox storage.""" 24 |     dbx = dropbox.Dropbox(token, timeout=36000) 25 |     file_size = os.path.getsize(file_path) 26 |     CHUNK_SIZE = 50 * 1024 * 1024 27 |     dest_path = target_path 28 | 29 |     with open(file_path, "rb") as f: 30 |         if file_size <= CHUNK_SIZE: 31 |             dbx.files_upload(f.read(), dest_path) 32 | 33 |         else: 34 |             upload_session_start_result = dbx.files_upload_session_start(f.read(CHUNK_SIZE)) 35 |             cursor = dropbox.files.UploadSessionCursor( 36 |                 session_id=upload_session_start_result.session_id, offset=f.tell() 37 |             ) 38 |             commit = dropbox.files.CommitInfo(path=dest_path, mode=dropbox.files.WriteMode("overwrite")) 39 | 40 |             while f.tell() < file_size: 41 |                 if (file_size - f.tell()) <= CHUNK_SIZE: 42 |                     print(dbx.files_upload_session_finish(f.read(CHUNK_SIZE), cursor, commit)) 43 |                 else: 44 |                     dbx.files_upload_session_append(f.read(CHUNK_SIZE), cursor.session_id, cursor.offset) 45 |                     cursor.offset = f.tell() 46 | 47 | 48 | if __name__ == "__main__": 49 |     parser = argparse.ArgumentParser(description="Upload a file to Dropbox.") 50 |     parser.add_argument("token", help="Your Dropbox OAuth2 token.") 51 |     args = parser.parse_args() 52 | 53 |     db_prefix = "/HAN Lab Public Space/Projects/TinyChatEngine/assets and models/QM_CUDA/" 54 |     local_prefix = "uploads" 55 | 56 |     for file in files_to_upload: 57 |         subebackups(file, db_prefix + file, args.token) 58 |
-------------------------------------------------------------------------------- /llm/tools/zip_assets.sh: --------------------------------------------------------------------------------
1 | zip -r assets.zip assets 2 |
-------------------------------------------------------------------------------- /llm/vila: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 70 5 | echo "=============================================================================================================================" 6 | 7 | ./chat VILA1.5_8B INT4 5 $image_path 8 |
-------------------------------------------------------------------------------- /llm/vila_2.7b: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat VILA_2.7B INT4 5 $image_path 8 |
-------------------------------------------------------------------------------- /llm/voice_mistral: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | ./chat -v Mistral_7B INT4 5 0 3 |
-------------------------------------------------------------------------------- /llm/voice_vila: --------------------------------------------------------------------------------
1 | #!/bin/bash 2
| echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat -v VILA1.5_8B INT4 5 $image_path 8 | -------------------------------------------------------------------------------- /llm/voicechat_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Clone whisper.cpp and checkout the specific commit 4 | git clone https://github.com/ggerganov/whisper.cpp 5 | cd whisper.cpp 6 | git checkout a4bb2df 7 | 8 | # Determine the platform 9 | OS="$(uname)" 10 | if [ "$OS" = "Linux" ]; then 11 | # Install SDL2 on Linux 12 | sudo apt-get install libsdl2-dev 13 | elif [ "$OS" = "Darwin" ]; then 14 | # Install SDL2 on Mac OS 15 | brew install sdl2 16 | else 17 | echo "Unsupported operating system: $OS" 18 | exit 1 19 | fi 20 | 21 | # Apply patch and download model 22 | git apply ../application/sts_utils/clean_up.patch 23 | bash ./models/download-ggml-model.sh base.en 24 | 25 | # Check for NVIDIA GPU 26 | if lspci | grep -i nvidia > /dev/null; then 27 | # Compile with CUDA support 28 | WHISPER_CUBLAS=1 make -j stream 29 | else 30 | # Compile without CUDA support 31 | make -j stream 32 | fi 33 | 34 | # Set up TTS 35 | cd ../ 36 | mkdir TTS 37 | cd TTS 38 | wget "https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_arm64.tar.gz" 39 | tar -xvzf piper_arm64.tar.gz 40 | rm piper_arm64.tar.gz 41 | 42 | # Download default voice 43 | wget "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/amy/medium/en_US-amy-medium.onnx?download=true" -O en_US-amy-medium.onnx 44 | wget "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/amy/medium/en_US-amy-medium.onnx.json?download=true" -O en_US-amy-medium.onnx.json 45 | 46 | # Return to the parent directory and compile chat 47 | cd ../ 48 | make clean 49 | make -j chat 50 | 51 | echo "" 52 | echo "TinyChatEngine's speech-to-speech chatbot setup completed successfully!" 53 | echo "Use './chat -v' on Linux/MacOS or 'chat.exe -v' on Windows." 54 | echo "" 55 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | include = '\.pyi?$' 4 | extend-exclude = "codegen/.*" 5 | 6 | [tool.isort] 7 | profile = "black" 8 | known_first_party = ["code_generator"] 9 | extend_skip = ["codegen"] 10 | multi_line_output = 3 11 | include_trailing_comma = true 12 | force_grid_wrap = 0 13 | use_parentheses = true 14 | ensure_newline_before_comments = true 15 | line_length = 120 16 | 17 | [tool.pylint] 18 | [tool.pylint.master] 19 | ignore-paths = ["codegen"] 20 | [tool.pylint.messages_control] 21 | disable = [ 22 | "C0103", 23 | "C0114", 24 | "C0115", 25 | "C0116", 26 | "C0123", 27 | "C0209", 28 | "C0330", 29 | "C0301", 30 | "C0302", 31 | "C0411", 32 | "C0415", 33 | "E0401", 34 | "E1121", 35 | "E1123", 36 | "E1101", 37 | "R", 38 | "W" 39 | ] 40 | [tool.pylint.basic] 41 | good-names-rgxs = "^[_a-z][_a-z0-9]?$" # allow 1 or 2 character names 42 | [tool.pylint.format] 43 | max-line-length = 120 44 | max-module-lines = 5000 45 | [tool.pylint.design] 46 | max-args = 10 47 | max-attributes = 15 48 | max-parents = 10 49 | 50 | [tool.mypy] 51 | files = "." 
52 | exclude ="codegen/.*" 53 | install_types = true 54 | non_interactive = true 55 | show_error_codes = true 56 | disable_error_code = [ 57 | "import", 58 | "assignment", 59 | "operator", 60 | "has-type", 61 | "var-annotated", 62 | "operator", 63 | "call-arg", 64 | ] 65 | explicit_package_bases = true 66 | namespace_packages = true 67 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | tqdm 3 | torch 4 | transformers 5 | pillow 6 | huggingface_hub --------------------------------------------------------------------------------