├── .gitattributes ├── .gitmodules ├── models └── whisper-mel-filters.gguf ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── demo ├── safetensors.cpp ├── basic.cpp ├── dyt-rms.cpp ├── fastvlm.cpp ├── random.cpp ├── svd.cpp ├── 2d-rope.cpp ├── ultravox-encoder.cpp ├── whisper-encoder.cpp └── kyutai-mimi.cpp ├── README.md ├── convert_safetensors_to_gguf.py └── ggml-easy.h /.gitattributes: -------------------------------------------------------------------------------- 1 | *.gguf binary 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ggml"] 2 | path = ggml 3 | url = https://github.com/ggml-org/ggml 4 | -------------------------------------------------------------------------------- /models/whisper-mel-filters.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ngxson/ggml-easy/HEAD/models/whisper-mel-filters.gguf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | build 35 | .cache 36 | tmp 37 | .vscode 38 | 39 | /*.gguf 40 | /*.safetensors 41 | /*.dot 42 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. 2 | project("ggml-easy" C CXX) 3 | include(CheckIncludeFileCXX) 4 | 5 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 6 | add_subdirectory(ggml) 7 | 8 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 9 | # demo 10 | set(DEMO_TARGETS 11 | basic 12 | dyt-rms 13 | svd 14 | kyutai-mimi 15 | safetensors 16 | ultravox-encoder 17 | whisper-encoder 18 | 2d-rope 19 | fastvlm 20 | random) 21 | 22 | foreach(TARGET ${DEMO_TARGETS}) 23 | add_executable(${TARGET} demo/${TARGET}.cpp) 24 | target_link_libraries(${TARGET} PRIVATE ggml) 25 | target_compile_features(${TARGET} PRIVATE cxx_std_17) 26 | endforeach() 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Xuan-Son Nguyen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /demo/safetensors.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | 6 | /** 7 | * This example demonstrates how to load safetensors directly to GGML without any conversions. 8 | * 9 | * We load both the GGUF and safetensors of the same model, then compare the tensors. 10 | * All tensors are expected to be equal. 11 | * 12 | * I'm using https://huggingface.co/kyutai/mimi as the model. But you can use any model. 13 | * 14 | * To get the safetensors: 15 | * 1. Download the model.safetensors file 16 | * 2. Rename the "model.safetensors" to "mimi.safetensors" 17 | * 18 | * To get the gguf: 19 | * 1. Download the model.safetensors file 20 | * 2. Run: python convert_safetensors_to_gguf.py --outtype f32 model.safetensors mimi.gguf 21 | * 22 | */ 23 | 24 | int main() { 25 | ggml_easy::ctx_params params; 26 | params.use_gpu = false; 27 | params.log_level = GGML_LOG_LEVEL_DEBUG; 28 | 29 | ggml_easy::ctx ctx0(params); 30 | ctx0.load_safetensors("mimi.safetensors", { 31 | {".acoustic_residual_vector_quantizer", ".acoustic_rvq"}, 32 | {".semantic_residual_vector_quantizer", ".semantic_rvq"}, 33 | }); 34 | 35 | ggml_easy::ctx ctx1(params); 36 | ctx1.load_gguf("mimi.gguf"); 37 | 38 | GGML_ASSERT(ctx0.tensors.size() == ctx1.tensors.size()); 39 | 40 | GGML_ASSERT(ggml_backend_buft_is_host(ctx0.backend_buft[0])); 41 | GGML_ASSERT(ggml_backend_buft_is_host(ctx1.backend_buft[0])); 42 | 43 | // compare the tensors 44 | for (auto & t : ctx0.tensors) { 45 | auto tensor0 = t.second; 46 | auto tensor1 = ctx1.get_weight(t.first.c_str()); 47 | 48 | GGML_ASSERT(ggml_are_same_shape(tensor0, tensor1)); 49 | GGML_ASSERT(tensor0->type == GGML_TYPE_F32); 50 | GGML_ASSERT(tensor1->type == GGML_TYPE_F32); 51 | 52 | float diff = 0.0; 53 | for (size_t i = 0; i < ggml_nelements(tensor0); ++i) { 54 | float v0 = ggml_get_f32_1d(tensor0, i); 55 | float v1 = ggml_get_f32_1d(tensor1, i); 56 | diff += std::abs(v0 - v1); 57 | } 58 | 59 | printf("%-60s: diff = %f\n", t.first.c_str(), diff); 60 | GGML_ASSERT(diff < 1e-6); 61 | } 62 | 63 | printf("\nOK: All tensors are equal\n"); 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /demo/basic.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | 5 | /** 6 | * This example demonstrates how to perform matrix multiplication using ggml-easy.h 7 | * 8 | * Given 2 matrices A and B, the result matrix C is calculated as follows: 9 | * C = (A x B) * 2 10 | * 11 | * We will use utils.debug_print() to debug the intermediate result of (A x B) 12 | * Then, we will use utils.mark_output() to get the final result of C 13 | * 14 | * The final result can be printed using ggml_easy::debug::print_tensor_data() 15 | * Or, can be used to perform further 
computations 16 | */ 17 | 18 | int main() { 19 | ggml_easy::ctx_params params; 20 | ggml_easy::ctx ctx(params); 21 | 22 | // initialize data of matrices to perform matrix multiplication 23 | const int rows_A = 4, cols_A = 2; 24 | float matrix_A[rows_A * cols_A] = { 25 | 2, 8, 26 | 5, 1, 27 | 4, 2, 28 | 8, 6 29 | }; 30 | const int rows_B = 3, cols_B = 2; 31 | float matrix_B[rows_B * cols_B] = { 32 | 10, 5, 33 | 9, 9, 34 | 5, 4 35 | }; 36 | 37 | // create cgraph 38 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 39 | ggml_tensor * a = utils.new_input("a", GGML_TYPE_F32, cols_A, rows_A); 40 | ggml_tensor * b = utils.new_input("b", GGML_TYPE_F32, cols_B, rows_B); 41 | ggml_tensor * a_mul_b = ggml_mul_mat(ctx_gf, a, b); 42 | utils.debug_print(a_mul_b, "a_mul_b"); 43 | ggml_tensor * result = ggml_scale(ctx_gf, a_mul_b, 2); 44 | utils.mark_output(result, "result"); 45 | }); 46 | 47 | // set data 48 | ctx.set_tensor_data("a", matrix_A); 49 | ctx.set_tensor_data("b", matrix_B); 50 | 51 | // optional: print backend buffer info 52 | ggml_easy::debug::print_backend_buffer_info(ctx); 53 | 54 | // compute 55 | ggml_status status = ctx.compute(); 56 | if (status != GGML_STATUS_SUCCESS) { 57 | std::cerr << "error: ggml compute return status: " << status << std::endl; 58 | return 1; 59 | } 60 | 61 | // get result 62 | auto result = ctx.get_tensor_data("result"); 63 | ggml_tensor * result_tensor = result.first; 64 | std::vector & result_data = result.second; 65 | 66 | // print result 67 | ggml_easy::debug::print_tensor_data(result_tensor, result_data.data()); 68 | 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /demo/dyt-rms.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | 5 | /** 6 | * Demo to compare performance of RMS Norm vs Dynamic Tanh (DyT) 7 | * Paper: https://arxiv.org/abs/2503.10622 8 | * 9 | * Result on my Macbook M3: 10 | * RMS Norm: 37 ms 11 | * DyT : 135 ms 12 | */ 13 | 14 | int main() { 15 | const int n_embd = 4096; 16 | const int n_tokens = 1024; 17 | const int n_run = 300; 18 | 19 | ggml_easy::ctx_params params; 20 | params.log_level = GGML_LOG_LEVEL_ERROR; 21 | 22 | // benchmark RMS Norm 23 | { 24 | ggml_easy::ctx ctx(params); 25 | 26 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 27 | ggml_tensor * cur = utils.new_input("input", GGML_TYPE_F32, n_embd, n_tokens); 28 | for (int i = 0; i < n_run; i++) { 29 | cur = ggml_rms_norm(ctx_gf, cur, 1e-6); 30 | // skip bias 31 | } 32 | utils.mark_output(cur, "result"); 33 | }); 34 | 35 | std::vector vec(n_embd * n_tokens, 0.5f); 36 | ctx.set_tensor_data("input", vec.data()); 37 | 38 | int64_t t_start = ggml_time_ms(); 39 | ctx.compute(); 40 | int64_t t_end = ggml_time_ms(); 41 | 42 | std::cout << "RMS Norm: " << (t_end - t_start) << " ms" << std::endl; 43 | } 44 | 45 | // benchmark DyT 46 | { 47 | ggml_easy::ctx ctx(params); 48 | 49 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 50 | ggml_tensor * cur = utils.new_input("input", GGML_TYPE_F32, n_embd, n_tokens); 51 | ggml_tensor * alpha = utils.new_input("alpha", GGML_TYPE_F32, n_embd); 52 | ggml_tensor * gamma = utils.new_input("gamma", GGML_TYPE_F32, n_embd); 53 | for (int i = 0; i < n_run; i++) { 54 | // DyT(x) = gamma * tanh(alpha * x) + β 55 | cur = ggml_mul(ctx_gf, cur, alpha); 56 | cur = ggml_tanh(ctx_gf, cur); 57 | cur = ggml_mul(ctx_gf, 
cur, gamma); 58 | // skip beta 59 | } 60 | utils.mark_output(cur, "result"); 61 | }); 62 | 63 | std::vector vec(n_embd * n_tokens, 0.5f); 64 | ctx.set_tensor_data("input", vec.data()); 65 | ctx.set_tensor_data("alpha", vec.data()); 66 | ctx.set_tensor_data("gamma", vec.data()); 67 | 68 | int64_t t_start = ggml_time_ms(); 69 | ctx.compute(); 70 | int64_t t_end = ggml_time_ms(); 71 | 72 | std::cout << "DyT : " << (t_end - t_start) << " ms" << std::endl; 73 | } 74 | 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /demo/fastvlm.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | 5 | /** 6 | * Experiment on fastvlm implementation 7 | * 8 | * This is non-complete code, do not ask how to use it 9 | */ 10 | 11 | 12 | struct layer { 13 | struct rep_mixer { 14 | ggml_tensor * convffn_bn_w; 15 | ggml_tensor * convffn_bn_b; 16 | ggml_tensor * convffn_bn_mean; 17 | ggml_tensor * convffn_bn_std; 18 | ggml_tensor * convffn_w; 19 | ggml_tensor * convffn_fc1; 20 | ggml_tensor * convffn_fc2; 21 | ggml_tensor * layer_scale; 22 | ggml_tensor * token_mixer_conv_w; 23 | ggml_tensor * token_mixer_conv_b; 24 | }; 25 | std::array mixers; 26 | }; 27 | 28 | int main() { 29 | ggml_easy::ctx_params params; 30 | // params.log_level = GGML_LOG_LEVEL_DEBUG; 31 | params.safetensors_ignore_unknown_dtype = true; 32 | params.use_gpu = false; 33 | ggml_easy::ctx ctx(params); 34 | ctx.load_safetensors("fastvlm.safetensors", { 35 | {"model.vision_tower.vision_tower.model.", ""}, 36 | }); 37 | 38 | const int image_size = 1024; 39 | 40 | auto * _patch_embed_0_w = ctx.get_weight("patch_embed.0.reparam_conv.weight"); 41 | auto * _patch_embed_0_b = ctx.get_weight("patch_embed.0.reparam_conv.bias"); 42 | auto * _patch_embed_1_w = ctx.get_weight("patch_embed.1.reparam_conv.weight"); 43 | auto * _patch_embed_1_b = ctx.get_weight("patch_embed.1.reparam_conv.bias"); 44 | auto * _patch_embed_2_w = ctx.get_weight("patch_embed.2.reparam_conv.weight"); 45 | auto * _patch_embed_2_b = ctx.get_weight("patch_embed.2.reparam_conv.bias"); 46 | 47 | // create cgraph 48 | ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 49 | ggml_tensor * inp = utils.new_input("inp", GGML_TYPE_F32, image_size, image_size, 3); 50 | ggml_tensor * tmp; 51 | 52 | auto * patch_embed_0_w = ggml_cast(ctx0, _patch_embed_0_w, GGML_TYPE_F16); 53 | auto * patch_embed_0_b = ggml_cast(ctx0, _patch_embed_0_b, GGML_TYPE_F32); 54 | auto * patch_embed_1_w = ggml_cast(ctx0, _patch_embed_1_w, GGML_TYPE_F16); 55 | auto * patch_embed_1_b = ggml_cast(ctx0, _patch_embed_1_b, GGML_TYPE_F32); 56 | auto * patch_embed_2_w = ggml_cast(ctx0, _patch_embed_2_w, GGML_TYPE_F16); 57 | auto * patch_embed_2_b = ggml_cast(ctx0, _patch_embed_2_b, GGML_TYPE_F32); 58 | 59 | inp = ggml_conv_2d(ctx0, patch_embed_0_w, inp, 2, 2, 1, 1, 1, 1); 60 | tmp = ggml_reshape_3d(ctx0, patch_embed_0_b, 1, 1, ggml_nelements(patch_embed_0_b)); 61 | inp = ggml_add(ctx0, inp, tmp); 62 | inp = ggml_gelu(ctx0, inp); 63 | 64 | inp = ggml_conv_2d_dw(ctx0, patch_embed_1_w, inp, 2, 2, 1, 1, 1, 1); 65 | tmp = ggml_reshape_3d(ctx0, patch_embed_1_b, 1, 1, ggml_nelements(patch_embed_1_b)); 66 | inp = ggml_add(ctx0, inp, tmp); 67 | inp = ggml_gelu(ctx0, inp); 68 | 69 | inp = ggml_conv_2d(ctx0, patch_embed_2_w, inp, 1, 1, 0, 0, 1, 1); 70 | tmp = ggml_reshape_3d(ctx0, patch_embed_2_b, 1, 1, ggml_nelements(patch_embed_2_b)); 71 | inp = ggml_add(ctx0, inp, 
tmp); 72 | inp = ggml_gelu(ctx0, inp); 73 | 74 | utils.debug_print(inp, "after_conv"); 75 | }); 76 | 77 | std::vector inp(image_size * image_size * 3); 78 | for (int i = 0; i < image_size * image_size * 3; ++i) { 79 | inp[i] = (float)0.1f; 80 | } 81 | ctx.set_tensor_data("inp", inp.data()); 82 | 83 | // compute 84 | ggml_status status = ctx.compute(); 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ggml-easy 2 | 3 | A simple C++ wrapper around [GGML](https://github.com/ggml-org/ggml) to make model loading and execution easier with GPU acceleration support. 4 | 5 | ## Introduction 6 | 7 | `ggml-easy` is a lightweight header-only C++ library that simplifies working with GGML, the tensor library used in projects like llama.cpp. It provides a clean interface for loading GGUF models, creating computation graphs, and executing them on CPU or GPU with minimal boilerplate code. 8 | 9 | ## Setup 10 | 11 | As a header-only library, using ggml-easy is straightforward: 12 | 13 | 1. Include the headers in your project 14 | 2. Make sure you have GGML as a dependency in `CMakeLists.txt` 15 | 3. Use the `ggml_easy` namespace in your code 16 | 17 | Example: 18 | ```cpp 19 | #include "ggml-easy.h" 20 | 21 | // Your code here 22 | ``` 23 | 24 | See [demo/basic.cpp](demo/basic.cpp) for a complete example of how to use `ggml-easy` in a project. 25 | 26 | ## Compile examples 27 | 28 | To compile everything inside `demo/*` 29 | 30 | ```sh 31 | cmake -B build 32 | cmake --build build -j 33 | # output: build/bin/* 34 | ``` 35 | 36 | ## Features 37 | 38 | ### Effortless GPU support 39 | 40 | ggml-easy abstracted out all the scheduler and buffer setup. GPU is enabled by default. 41 | 42 | To disable it explicitly: 43 | 44 | ```cpp 45 | ggml_easy::ctx_params params; 46 | params.use_gpu = false; // true by default 47 | ggml_easy::ctx ctx(params); 48 | ``` 49 | 50 | Please note that the GPU support is for convenience and is not aimed to have the best performance. Some operations will fallback to CPU if the GPU does not support them. 51 | 52 | ### Load safetensors without converting to GGUF 53 | 54 | You can directly load `.safetensors` file to `ggml-easy` without having to convert it to GGUF! Currently, F32, F16 and BF16 types are supported. 55 | 56 | ```cpp 57 | ggml_easy::ctx_params params; 58 | ggml_easy::ctx ctx(params); 59 | ctx.load_safetensors("mimi.safetensors", { 60 | // optionally, rename tensor to make it shorter (name length limit in ggml is 64 characters) 61 | {".acoustic_residual_vector_quantizer", ".acoustic_rvq"}, 62 | {".semantic_residual_vector_quantizer", ".semantic_rvq"}, 63 | }); 64 | ``` 65 | 66 | For a complete example, please have a look on [demo/safetensors.cpp](demo/safetensors.cpp) where I load both GGUF + safetensors files, then compare them. 67 | 68 | TODO: multi-shards are not supported for now, will add it soon! 69 | 70 | ### Define input, output easily 71 | 72 | When building computation graph, each input and output nodes can be added with single line of code: 73 | 74 | ```cpp 75 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 76 | ggml_tensor * a = utils.new_input("a", GGML_TYPE_F32, cols_A, rows_A); 77 | ggml_tensor * b = utils.new_input("b", GGML_TYPE_F32, cols_B, rows_B); 78 | ... 
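    // ... any ggml_* ops go here; for example, demo/basic.cpp builds
    //     ggml_tensor * result = ggml_scale(ctx_gf, ggml_mul_mat(ctx_gf, a, b), 2);
    // (i.e. C = (A x B) * 2) before marking it as an output: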
79 | utils.mark_output(result, "result"); 80 | }); 81 | ``` 82 | 83 | ### Easy debugging 84 | 85 | You can also print the intermediate results with minimal effort: 86 | 87 | ```cpp 88 | ggml_tensor * a = utils.new_input("a", GGML_TYPE_F32, cols_A, rows_A); 89 | ggml_tensor * b = utils.new_input("b", GGML_TYPE_F32, cols_B, rows_B); 90 | ggml_tensor * a_mul_b = ggml_mul_mat(ctx_gf, a, b); 91 | utils.debug_print(a_mul_b, "a_mul_b"); 92 | ``` 93 | 94 | This will print the intermediate result of `A * B` upon `compute()` is called, no more manual `ggml_backend_tensor_get`! 95 | 96 | ``` 97 | a_mul_b.shape = [4, 3] 98 | a_mul_b.data: [ 99 | [ 100 | [ 60.0000, 55.0000, 50.0000, 110.0000], 101 | [ 90.0000, 54.0000, 54.0000, 126.0000], 102 | [ 42.0000, 29.0000, 28.0000, 64.0000], 103 | ], 104 | ] 105 | ``` 106 | -------------------------------------------------------------------------------- /demo/random.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | 5 | /** 6 | * Random experiment, do not use it 7 | */ 8 | 9 | int main() { 10 | ggml_easy::ctx_params params; 11 | ggml_easy::ctx ctx(params); 12 | 13 | // experiment with torch unfold equivalent in GGML 14 | { 15 | const int h = 12; 16 | const int w = 2; 17 | const int hidden_size = 8; 18 | ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 19 | ggml_tensor * inp = utils.new_input("inp", GGML_TYPE_F32, hidden_size, h*w); 20 | ggml_tensor * x = inp; 21 | utils.debug_print(ggml_scale(ctx0, inp, 1.0f), "inp0"); 22 | 23 | x = ggml_reshape_3d(ctx0, x, hidden_size, w, h); 24 | x = ggml_permute(ctx0, x, 2, 0, 1, 3); // [x, y, hidden_size] 25 | x = ggml_cont(ctx0, x); 26 | utils.debug_print_full(x, "grid"); 27 | 28 | ggml_tensor * kernel = ggml_view_3d(ctx0, inp, 2, 2, x->ne[2], 0, 0, 0); 29 | x = ggml_im2col(ctx0, kernel, x, 2, 2, 0, 0, 1, 1, true, inp->type); 30 | 31 | utils.debug_print_full(x, "im2col"); 32 | 33 | x = ggml_reshape_2d(ctx0, x, x->ne[0], x->ne[1] * x->ne[2]); 34 | utils.debug_print(x, "result"); 35 | }); 36 | std::vector inp_data(h * w * hidden_size); 37 | for (int i = 0; i < h * w * hidden_size; ++i) { 38 | inp_data[i] = (float)i; 39 | } 40 | ctx.set_tensor_data("inp", inp_data.data()); 41 | ctx.compute(); 42 | } 43 | 44 | printf("\n\n\nLlama4UnfoldConvolution\n\n"); 45 | { 46 | ggml_easy::ctx ctx(params); 47 | ctx.load_safetensors("../models/llama4vit.safetensors", {}); 48 | 49 | ggml_tensor * patch_embeddings_0 = ctx.get_weight("vision_model.patch_embedding.linear.weight"); 50 | 51 | const int h = 336; 52 | const int w = 336; 53 | const int patch_size = 14; 54 | const int n_embd = 1408; 55 | const int n_patches = (h / patch_size) * (w / patch_size); 56 | 57 | ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 58 | ggml_tensor * inp = utils.new_input("inp", GGML_TYPE_F32, h, w, 3); 59 | 60 | // Llama4UnfoldConvolution 61 | { 62 | ggml_tensor * kernel = ggml_reshape_4d(ctx0, patch_embeddings_0, 63 | patch_size, patch_size, 3, n_embd); 64 | inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); 65 | //inp = ggml_reshape_2d(ctx0, inp, inp->ne[0], inp->ne[1] * inp->ne[2]); // flatten to 2D 66 | utils.debug_print(inp, "im2col"); 67 | utils.debug_print(ggml_sum(ctx0, inp), "im2col_sum"); 68 | 69 | utils.debug_print(ggml_cast(ctx0, patch_embeddings_0, GGML_TYPE_F32), "patch_embeddings_0"); 70 | 71 | inp = ggml_mul_mat(ctx0, patch_embeddings_0, inp); 72 | 
utils.debug_print(inp, "patch_conv"); 73 | utils.debug_print(ggml_sum(ctx0, inp), "patch_conv_sum"); 74 | 75 | inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); 76 | } 77 | 78 | //inp = ggml_reshape_2d(ctx0, inp, inp->ne[0], inp->ne[1] * inp->ne[2]); 79 | utils.debug_print(inp, "result"); 80 | }); 81 | 82 | std::vector inp_data(h * w * 3, 0.0); 83 | for (int i = 0; i < h * w; ++i) { 84 | inp_data[i] = 1.0; //(float)i * 0.1; 85 | } 86 | ctx.set_tensor_data("inp", inp_data.data()); 87 | ctx.compute(); 88 | } 89 | 90 | // https://github.com/ggml-org/llama.cpp/pull/13772 91 | // { 92 | // const int h = 12; 93 | // const int w = 2; 94 | // const int hidden_size = 8; 95 | // ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 96 | // ggml_tensor * inp = utils.new_input("inp", GGML_TYPE_F32, hidden_size, h*w); 97 | // inp = ggml_fill(ctx0, inp, 1.234f); 98 | // utils.debug_print(inp, "inp"); 99 | // }); 100 | // ctx.compute(); 101 | // } 102 | 103 | // https://github.com/ggml-org/ggml/issues/1230 104 | { 105 | ggml_easy::ctx_params params_no_gpu; 106 | params_no_gpu.use_gpu = false; 107 | ggml_easy::ctx ctx_no_gpu(params_no_gpu); 108 | ggml_easy::ctx ctx(params); 109 | 110 | auto builder = [&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 111 | ggml_tensor * x = ggml_ones(ctx0, 512, 512); 112 | x = ggml_scale(ctx0, x, 0.12432f); 113 | ggml_tensor * y = ggml_ones(ctx0, 512, 512); 114 | y = ggml_scale(ctx0, y, 0.34636f); 115 | 116 | ggml_tensor * result = ggml_mul(ctx0, x, y); 117 | utils.debug_print(result, "result"); 118 | utils.mark_output(result, "result"); 119 | }; 120 | 121 | ctx.build_graph(builder); 122 | ctx.compute(); 123 | ctx_no_gpu.build_graph(builder); 124 | ctx_no_gpu.compute(); 125 | 126 | float max_diff = 0.0f; 127 | auto res0 = ctx.get_tensor_data("result"); 128 | auto res1 = ctx_no_gpu.get_tensor_data("result"); 129 | GGML_ASSERT(ggml_nelements(res0.first) == ggml_nelements(res1.first)); 130 | for (size_t i = 0; i < ggml_nelements(res0.first); ++i) { 131 | float v0 = ((float *)res0.second.data())[i]; 132 | float v1 = ((float *)res1.second.data())[i]; 133 | float diff = std::abs(v0 - v1); 134 | if (diff > max_diff) { 135 | max_diff = diff; 136 | } 137 | } 138 | 139 | printf("max diff: %f\n", max_diff); 140 | } 141 | 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /demo/svd.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | #include 6 | 7 | const time_t seed = std::time(0); 8 | 9 | const int rank = 4; 10 | const float delta = 0.001; 11 | const float eps = 0.97; 12 | const float lambda = 2; 13 | 14 | const int rows_A = 3; 15 | const int cols_A = 2; 16 | float matrix_A[rows_A * cols_A] = { 17 | 1, 2, 18 | 3, 4, 19 | 5, 6, 20 | }; 21 | 22 | /** 23 | * This program computes the singular value decomposition (SVD) of a matrix A using the power iteration method. 24 | * The matrix A is decomposed into the product of three matrices U, S, and V such that A = U * S * V^T. 25 | * 26 | * After decomposed the matrix A, the program reconstructs the matrix A using the decomposed matrices U, S, and V. 27 | * The reconstructed matrix should be the same as the original matrix A. 
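 *
 * Roughly, each power_iteration() call below performs the standard deflation-based
 * power method (names match the code, the description is only a sketch):
 *   1. form B = ggml_mul_mat(A, A), i.e. A multiplied with its own transpose, and
 *      repeatedly update x <- (B x) / ||B x||; after n_iters steps x approximates
 *      the leading singular direction v
 *   2. take s = ||A v|| as the singular value and u = (A v) / s as its partner vector
 *   3. deflate A <- A - s * u * v^T and repeat, collecting `rank` triples (u, s, v)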
28 | * 29 | * Ref python implementation: https://gist.github.com/Zhenye-Na/cbf4e534b44ef94fdbad663ef56dd333 30 | */ 31 | 32 | int main() { 33 | ggml_easy::ctx_params params; 34 | ggml_easy::ctx ctx(params); 35 | 36 | const int n_iters = log(4.0f * log(2.0f * rows_A / delta) / (eps * delta)) / (2 * lambda); 37 | printf("n_iters = %d\n", n_iters); 38 | 39 | auto norm = [&](ggml_context * ctx_gf, ggml_tensor * t) { 40 | return ggml_sqrt(ctx_gf, ggml_sum_rows(ctx_gf, ggml_sqr(ctx_gf, t))); 41 | }; 42 | 43 | auto power_iteration = [&](ggml_context * ctx_gf, ggml_cgraph * gf, ggml_tensor * A, ggml_tensor * x) { 44 | ggml_tensor * B = ggml_mul_mat(ctx_gf, A, A); 45 | for (int i = 0; i < n_iters; i++) { 46 | x = ggml_mul_mat(ctx_gf, B, x); 47 | x = ggml_div(ctx_gf, x, norm(ctx_gf, x)); 48 | } 49 | ggml_tensor * v = x; 50 | ggml_tensor * AT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, A)); 51 | ggml_tensor * A_v = ggml_mul_mat(ctx_gf, AT, v); 52 | ggml_tensor * s = norm(ctx_gf, A_v); 53 | ggml_tensor * u = ggml_div(ctx_gf, A_v, s); 54 | return std::vector{u, s, v}; 55 | }; 56 | 57 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 58 | ggml_tensor * A = utils.new_input("A", GGML_TYPE_F32, cols_A, rows_A); 59 | ggml_tensor * x = utils.new_input("x", GGML_TYPE_F32, rows_A); 60 | 61 | // normalize x 62 | x = ggml_div(ctx_gf, x, norm(ctx_gf, x)); 63 | 64 | ggml_tensor * out_u; // final shape: [cols_A, rank] 65 | ggml_tensor * out_s; // final shape: [rank] 66 | ggml_tensor * out_v; // final shape: [rows_A, rank] 67 | 68 | for (int i = 0; i < rank; i++) { 69 | std::vector result = power_iteration(ctx_gf, gf, A, x); 70 | ggml_tensor * u = result[0]; 71 | ggml_tensor * s = result[1]; 72 | ggml_tensor * v = result[2]; 73 | 74 | ggml_tensor * vT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, v)); 75 | ggml_tensor * uT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, u)); 76 | ggml_tensor * A_minus = ggml_mul(ctx_gf, ggml_mul_mat(ctx_gf, uT, vT), s); 77 | A = ggml_add(ctx_gf, A, ggml_scale(ctx_gf, A_minus, -1)); 78 | // utils.debug_print(u, "u_intermediate"); 79 | // utils.debug_print(v, "v_intermediate"); 80 | // utils.debug_print(A, "A_intermediate"); 81 | 82 | if (i == 0) { 83 | out_u = u; 84 | out_s = s; 85 | out_v = v; 86 | } else { 87 | out_u = ggml_concat(ctx_gf, out_u, u, 1); 88 | out_s = ggml_concat(ctx_gf, out_s, s, 0); 89 | out_v = ggml_concat(ctx_gf, out_v, v, 1); 90 | } 91 | } 92 | 93 | utils.mark_output(out_u, "u"); 94 | utils.mark_output(out_s, "s"); 95 | utils.mark_output(out_v, "v"); 96 | }); 97 | 98 | // set data 99 | { 100 | ctx.set_tensor_data("A", matrix_A); 101 | 102 | // initialize eigenvector to random vector 103 | std::default_random_engine generator(static_cast(seed)); 104 | std::uniform_real_distribution distribution(0.0, 1.0); 105 | ctx.set_tensor_data("x", [&](int, int, int, int) { 106 | return distribution(generator); 107 | }); 108 | } 109 | 110 | // optional: print backend buffer info 111 | ggml_easy::debug::print_backend_buffer_info(ctx); 112 | 113 | // compute 114 | ggml_status status = ctx.compute(); 115 | if (status != GGML_STATUS_SUCCESS) { 116 | std::cerr << "error: ggml compute return status: " << status << std::endl; 117 | return 1; 118 | } 119 | 120 | // get result 121 | auto print_result = [&](ggml_easy::ctx & ctx, const char * tensor_name) { 122 | auto result = ctx.get_tensor_data(tensor_name); 123 | ggml_tensor * result_tensor = result.first; 124 | std::vector & result_data = result.second; 125 | std::cout << "\n\n" << tensor_name << ":\n"; 126 | 
ggml_easy::debug::print_tensor_data(result_tensor, result_data.data()); 127 | return result_data; 128 | }; 129 | 130 | std::vector data_u = print_result(ctx, "u"); 131 | std::vector data_s = print_result(ctx, "s"); 132 | std::vector data_v = print_result(ctx, "v"); 133 | 134 | 135 | // VERIFY THE RESULT!! 136 | 137 | 138 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 139 | ggml_tensor * u = utils.new_input("u", GGML_TYPE_F32, cols_A, rank); 140 | ggml_tensor * s = utils.new_input("s", GGML_TYPE_F32, rank); 141 | ggml_tensor * v = utils.new_input("v", GGML_TYPE_F32, rows_A, rank); 142 | 143 | s = ggml_diag(ctx_gf, s); 144 | 145 | ggml_tensor * uT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, u)); 146 | ggml_tensor * vT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, v)); 147 | ggml_tensor * temp = ggml_mul_mat(ctx_gf, s, uT); 148 | ggml_tensor * A_reconstructed = ggml_mul_mat(ctx_gf, temp, vT); 149 | utils.mark_output(A_reconstructed, "A_reconstructed"); 150 | 151 | ggml_tensor * A = utils.new_input("A", GGML_TYPE_F32, cols_A, rows_A); 152 | ggml_tensor * diff = ggml_sum(ctx_gf, ggml_sub(ctx_gf, A, A_reconstructed)); 153 | utils.mark_output(diff, "diff"); 154 | }); 155 | 156 | ctx.set_tensor_data("u", data_u.data()); 157 | ctx.set_tensor_data("s", data_s.data()); 158 | ctx.set_tensor_data("v", data_v.data()); 159 | ctx.set_tensor_data("A", matrix_A); 160 | 161 | status = ctx.compute(); 162 | if (status != GGML_STATUS_SUCCESS) { 163 | std::cerr << "error: ggml compute return status: " << status << std::endl; 164 | return 1; 165 | } 166 | 167 | print_result(ctx, "A_reconstructed"); 168 | print_result(ctx, "diff"); 169 | 170 | return 0; 171 | } 172 | -------------------------------------------------------------------------------- /convert_safetensors_to_gguf.py: -------------------------------------------------------------------------------- 1 | import gguf 2 | import argparse 3 | import logging 4 | import sys 5 | import torch 6 | import json 7 | import os 8 | import numpy as np 9 | from typing import cast, ContextManager, Any, Iterator 10 | from pathlib import Path 11 | from torch import Tensor 12 | 13 | # some tensor names are too long, ggml refuses to load them 14 | # this function renames them to shorter names 15 | def rename_tensor(name: str) -> str: 16 | replacements = { 17 | "quantizer.acoustic_residual_vector_quantizer": "quantizer.acoustic_rvq", # kyutai mimi 18 | "quantizer.semantic_residual_vector_quantizer": "quantizer.semantic_rvq", # kyutai mimi 19 | } 20 | for old, new in replacements.items(): 21 | name = name.replace(old, new) 22 | return name 23 | 24 | # (copied from convert_hf_to_gguf.py) 25 | # tree of lazy tensors 26 | class LazyTorchTensor(gguf.LazyBase): 27 | _tensor_type = torch.Tensor 28 | # to keep the type-checker happy 29 | dtype: torch.dtype 30 | shape: torch.Size 31 | 32 | # only used when converting a torch.Tensor to a np.ndarray 33 | _dtype_map: dict[torch.dtype, type] = { 34 | torch.float16: np.float16, 35 | torch.float32: np.float32, 36 | } 37 | 38 | # used for safetensors slices 39 | # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 40 | # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 41 | _dtype_str_map: dict[str, torch.dtype] = { 42 | "F64": torch.float64, 43 | "F32": torch.float32, 44 | "BF16": torch.bfloat16, 45 | "F16": torch.float16, 46 | # "U64": torch.uint64, 47 | "I64": torch.int64, 48 | # "U32": 
torch.uint32, 49 | "I32": torch.int32, 50 | # "U16": torch.uint16, 51 | "I16": torch.int16, 52 | "U8": torch.uint8, 53 | "I8": torch.int8, 54 | "BOOL": torch.bool, 55 | "F8_E4M3": torch.float8_e4m3fn, 56 | "F8_E5M2": torch.float8_e5m2, 57 | } 58 | 59 | def numpy(self) -> gguf.LazyNumpyTensor: 60 | dtype = self._dtype_map[self.dtype] 61 | return gguf.LazyNumpyTensor( 62 | meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), 63 | args=(self,), 64 | func=(lambda s: s.numpy()) 65 | ) 66 | 67 | @classmethod 68 | def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: 69 | return torch.empty(size=shape, dtype=dtype, device="meta") 70 | 71 | @classmethod 72 | def from_safetensors_slice(cls, st_slice: Any) -> Tensor: 73 | dtype = cls._dtype_str_map[st_slice.get_dtype()] 74 | shape: tuple[int, ...] = tuple(st_slice.get_shape()) 75 | lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) 76 | return cast(torch.Tensor, lazy) 77 | 78 | @classmethod 79 | def __torch_function__(cls, func, types, args=(), kwargs=None): 80 | del types # unused 81 | 82 | if kwargs is None: 83 | kwargs = {} 84 | 85 | if func is torch.Tensor.numpy: 86 | return args[0].numpy() 87 | 88 | return cls._wrap_fn(func)(*args, **kwargs) 89 | 90 | class Converter: 91 | in_file: Path 92 | out_file: Path 93 | ftype: gguf.LlamaFileType 94 | gguf_writer: gguf.GGUFWriter 95 | 96 | def __init__(self, in_file: Path, out_file: Path, ftype: gguf.LlamaFileType): 97 | self.in_file = in_file 98 | self.out_file = out_file 99 | self.ftype = ftype 100 | endianess = gguf.GGUFEndian.LITTLE 101 | self.gguf_writer = gguf.GGUFWriter(path=None, arch="unknown", endianess=endianess) 102 | 103 | def convert(self): 104 | print(f"Converting {self.in_file} to {self.out_file} with {self.ftype} data type.") 105 | 106 | for name, data_torch in self.get_tensors(): 107 | old_dtype = data_torch.dtype 108 | is_1d = len(data_torch.shape) == 1 109 | can_quantize = not is_1d 110 | 111 | data_qtype = gguf.GGMLQuantizationType.F32 112 | if can_quantize: 113 | if self.ftype == gguf.LlamaFileType.ALL_F32: 114 | data_qtype = gguf.GGMLQuantizationType.F32 115 | elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: 116 | data_qtype = gguf.GGMLQuantizationType.F16 117 | elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: 118 | data_qtype = gguf.GGMLQuantizationType.BF16 119 | elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: 120 | data_qtype = gguf.GGMLQuantizationType.Q8_0 121 | else: 122 | raise ValueError(f"Unsupported file type: {self.ftype}") 123 | 124 | data = data_torch.numpy() 125 | try: 126 | data = gguf.quants.quantize(data, data_qtype) 127 | except Exception as e: 128 | print(f"Error quantizing tensor '{name}': {e}, fallback to F16") 129 | data_qtype = gguf.GGMLQuantizationType.F16 130 | data = gguf.quants.quantize(data, data_qtype) 131 | 132 | name = rename_tensor(name) 133 | 134 | # reverse shape to make it similar to the internal ggml dimension order 135 | shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" 136 | print(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") 137 | 138 | self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) 139 | 140 | def get_tensors(self) -> Iterator[tuple[str, Tensor]]: 141 | # TODO: support multiple shards in the future 142 | from safetensors import safe_open 143 | ctx = cast(ContextManager[Any], safe_open(self.in_file, framework="pt", device="cpu")) 144 | with ctx as model_part: 
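            # note: model_part.get_slice() returns a lazy safetensors slice; wrapping it
            # in LazyTorchTensor.from_safetensors_slice() (defined above) keeps the data
            # on disk and only materializes it when the tensor is actually quantized and
            # written, so large checkpoints do not have to fit in RAM all at once.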
145 | for name in model_part.keys(): 146 | data = model_part.get_slice(name) 147 | data = LazyTorchTensor.from_safetensors_slice(data) 148 | yield name, data 149 | 150 | def write(self): 151 | self.gguf_writer.write_header_to_file(path=self.out_file) 152 | self.gguf_writer.write_kv_data_to_file() 153 | self.gguf_writer.write_tensors_to_file(progress=True) 154 | self.gguf_writer.close() 155 | 156 | def parse_args(): 157 | parser = argparse.ArgumentParser(description="Convert safetensors to GGUF format.") 158 | parser.add_argument( 159 | "--outtype", 160 | choices=["f32", "f16", "bf16", "q8_0"], 161 | default="f32", 162 | help="Output data type (default: f32)" 163 | ) 164 | parser.add_argument( 165 | "input_file", type=Path, 166 | help="Path to the input file (required)" 167 | ) 168 | parser.add_argument( 169 | "output_file", type=Path, 170 | nargs="?", 171 | help="Path to the output file (optional). Default to input file with .gguf extension" 172 | ) 173 | return parser.parse_args() 174 | 175 | if __name__ == "__main__": 176 | args = parse_args() 177 | 178 | ftype_map: dict[str, gguf.LlamaFileType] = { 179 | "f32": gguf.LlamaFileType.ALL_F32, 180 | "f16": gguf.LlamaFileType.MOSTLY_F16, 181 | "bf16": gguf.LlamaFileType.MOSTLY_BF16, 182 | "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, 183 | } 184 | 185 | if args.outtype not in ftype_map: 186 | raise ValueError(f"Unsupported output data type: {args.outtype}") 187 | 188 | if args.output_file is None: 189 | args.output_file = args.input_file.with_suffix(".gguf") 190 | 191 | converter = Converter(args.input_file, args.output_file, ftype_map[args.outtype]) 192 | converter.convert() 193 | converter.write() 194 | -------------------------------------------------------------------------------- /demo/2d-rope.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | 6 | /** 7 | * Experiment with 2D RoPE used on Mistral's Pixtral model 8 | */ 9 | 10 | // implementation of the 2D RoPE without adding a new op in ggml 11 | // this is not efficient (use double the memory), but works on all backends 12 | static ggml_tensor * build_rope_2d( 13 | ggml_context * ctx0, 14 | ggml_tensor * cur, 15 | ggml_tensor * pos_a, // first half 16 | ggml_tensor * pos_b, // second half 17 | const float freq_base, 18 | const bool interleave_freq 19 | ) { 20 | const int64_t n_dim = cur->ne[0]; 21 | const int64_t n_head = cur->ne[1]; 22 | const int64_t n_pos = cur->ne[2]; 23 | 24 | // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) 25 | // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 26 | // first half of cur will use 1e-0, 1e-2 (even) 27 | // second half of cur will use 1e-1, 1e-3 (odd) 28 | // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even 29 | // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) 30 | // then for the second half, we use freq_scale to shift the inv_freq 31 | // ^ why? replace (2i) with (2i+1) in the above equation 32 | const float freq_scale_odd = interleave_freq 33 | ? 
std::pow(freq_base, (float)-2/n_dim) 34 | : 1.0; 35 | 36 | // first half 37 | ggml_tensor * first; 38 | { 39 | first = ggml_view_3d(ctx0, cur, 40 | n_dim/2, n_head, n_pos, 41 | ggml_row_size(cur->type, n_dim), 42 | ggml_row_size(cur->type, n_dim*n_head), 43 | 0); 44 | first = ggml_rope_ext( 45 | ctx0, 46 | first, 47 | pos_a, // positions 48 | nullptr, // freq factors 49 | n_dim/2, // n_dims 50 | 0, 0, freq_base, 51 | 1.0f, 0.0f, 1.0f, 0.0f, 0.0f 52 | ); 53 | } 54 | 55 | // second half 56 | ggml_tensor * second; 57 | { 58 | second = ggml_view_3d(ctx0, cur, 59 | n_dim/2, n_head, n_pos, 60 | ggml_row_size(cur->type, n_dim), 61 | ggml_row_size(cur->type, n_dim*n_head), 62 | n_dim/2 * ggml_element_size(cur)); 63 | second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors 64 | second = ggml_rope_ext( 65 | ctx0, 66 | second, 67 | pos_b, // positions 68 | nullptr, // freq factors 69 | n_dim/2, // n_dims 70 | 0, 0, freq_base, 71 | freq_scale_odd, 72 | 0.0f, 1.0f, 0.0f, 0.0f 73 | ); 74 | } 75 | 76 | cur = ggml_concat(ctx0, first, second, 0); 77 | return cur; 78 | } 79 | 80 | void test_mrope(ggml_easy::ctx & ctx); 81 | 82 | int main() { 83 | ggml_easy::ctx_params params; 84 | ggml_easy::ctx ctx(params); 85 | 86 | const bool is_llama = true; // false meaning pixtral 87 | 88 | const int n_sz = 336/14; 89 | const int n_pos = n_sz * n_sz + (is_llama ? 1 : 0); // 1 for CLS token 90 | const int n_dim = 88; 91 | const int n_head = 1; 92 | 93 | // create cgraph 94 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 95 | ggml_tensor * pos_h = utils.new_input("pos_h", GGML_TYPE_I32, n_pos); 96 | ggml_tensor * pos_w = utils.new_input("pos_w", GGML_TYPE_I32, n_pos); 97 | ggml_tensor * vector = utils.new_input("vector", GGML_TYPE_F32, n_dim*n_head, n_pos); 98 | vector = ggml_reshape_3d(ctx_gf, vector, n_dim, n_head, n_pos); 99 | ggml_tensor * result = is_llama 100 | ? 
build_rope_2d(ctx_gf, vector, pos_w, pos_h, 10000.0f, false) 101 | : build_rope_2d(ctx_gf, vector, pos_h, pos_w, 10000.0f, true); 102 | result = ggml_reshape_2d(ctx_gf, result, n_dim*n_head, n_pos); 103 | utils.debug_print(result, "result"); 104 | utils.debug_print(ggml_sum(ctx_gf, result), "result_sum"); 105 | }); 106 | 107 | // set data 108 | if (is_llama) { 109 | std::vector positions(n_pos - 1, 0); 110 | for (int i = 0; i < n_pos- 1; ++i) { 111 | positions[i] = (i / n_sz) + 1; 112 | // printf("pos_h[%d] = %d\n", i, positions[i]); 113 | } 114 | printf("\n"); 115 | ctx.set_tensor_data("pos_h", positions.data()); 116 | for (int i = 0; i < n_pos- 1; ++i) { 117 | positions[i] = (i % n_sz) + 1; 118 | // printf("pos_w[%d] = %d\n", i, positions[i]); 119 | } 120 | ctx.set_tensor_data("pos_w", positions.data()); 121 | } else { 122 | std::vector positions(n_pos); 123 | for (int i = 0; i < n_pos; ++i) { 124 | positions[i] = i / n_sz; 125 | } 126 | ctx.set_tensor_data("pos_h", positions.data()); 127 | for (int i = 0; i < n_pos; ++i) { 128 | positions[i] = i % n_sz; 129 | } 130 | ctx.set_tensor_data("pos_w", positions.data()); 131 | } 132 | ctx.set_tensor_data("vector", [](int i0, int i1, int i2, int i3) { 133 | //return i0 * 0.1; 134 | return 1.0; 135 | }); 136 | 137 | // compute 138 | ggml_status status = ctx.compute(); 139 | 140 | test_mrope(ctx); 141 | 142 | return 0; 143 | } 144 | 145 | // 146 | // experiment with ggml_rope_multi 147 | // 148 | 149 | void test_mrope(ggml_easy::ctx & ctx) { 150 | //const int n_sz = 3; 151 | const int n_dim = 12; 152 | const int n_head = 1; 153 | const int n_pos = 6; 154 | 155 | printf("\n\n--- test_mrope ---\n"); 156 | 157 | // create cgraph 158 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 159 | ggml_tensor * pos = utils.new_input("pos", GGML_TYPE_I32, n_pos*4); 160 | ggml_tensor * vector = utils.new_input("vector", GGML_TYPE_F32, n_dim*n_head, n_pos); 161 | 162 | ggml_tensor * cur; 163 | ggml_tensor * x = ggml_reshape_3d(ctx_gf, vector, n_dim, n_head, n_pos); 164 | { 165 | const int n_dim = x->ne[0]; 166 | const int n_head = x->ne[1]; 167 | const int n_pos = x->ne[2]; 168 | int sections[4] = {1, 1, 1, 0}; 169 | cur = ggml_rope_multi( 170 | ctx_gf, 171 | x, 172 | pos, // positions 173 | nullptr, // freq factors 174 | n_dim, // n_dims 175 | sections, // sections 176 | GGML_ROPE_TYPE_MROPE, 177 | 0, 10000.0f, 178 | 1.0f, 0.0f, 1.0f, 0.0f, 0.0f 179 | ); 180 | } 181 | 182 | cur = ggml_reshape_2d(ctx_gf, cur, n_dim*n_head, n_pos); 183 | utils.debug_print_full(cur, "mrope"); 184 | 185 | { 186 | ggml_tensor * pos_a = ggml_view_1d(ctx_gf, pos, n_pos, 0); 187 | const int n_dim = x->ne[0]; 188 | const int n_head = x->ne[1]; 189 | const int n_pos = x->ne[2]; 190 | int sections[4] = {1, 1, 1, 0}; 191 | cur = ggml_rope_ext( 192 | ctx_gf, 193 | x, 194 | pos_a, // positions 195 | nullptr, // freq factors 196 | n_dim, // n_dims 197 | GGML_ROPE_TYPE_NEOX, 0, 10000.0f, 198 | 1.0f, 0.0f, 1.0f, 0.0f, 0.0f 199 | ); 200 | } 201 | 202 | cur = ggml_reshape_2d(ctx_gf, cur, n_dim*n_head, n_pos); 203 | utils.debug_print_full(cur, "normal_rope"); 204 | }); 205 | 206 | // set data 207 | std::vector positions(n_pos*4, 0); 208 | //for (int i = 0; i < n_pos; ++i) positions[i + n_pos*0] = i / n_sz; 209 | for (int i = 0; i < n_pos; ++i) positions[i + n_pos*0] = i; 210 | for (int i = 0; i < n_pos; ++i) positions[i + n_pos*1] = i; 211 | for (int i = 0; i < n_pos; ++i) positions[i + n_pos*2] = i; 212 | for (int i = 0; i < n_pos; ++i) positions[i + n_pos*3] = 0; 213 
| for (int i = 0; i < 4; ++i) { 214 | for (int j = 0; j < n_pos; ++j) { 215 | printf("%d ", positions[i*n_pos + j]); 216 | } 217 | printf("\n"); 218 | } 219 | ctx.set_tensor_data("pos", positions.data()); 220 | ctx.set_tensor_data("vector", [](int i0, int i1, int i2, int i3) { 221 | return 1.0; 222 | }); 223 | 224 | // compute 225 | ctx.compute(); 226 | } 227 | -------------------------------------------------------------------------------- /demo/ultravox-encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | #include 6 | 7 | // placeholder, to be removed when it's upstreamed 8 | ggml_tensor * ggml_gelu_erf(ggml_context * ctx, ggml_tensor * a) { 9 | return a; 10 | } 11 | 12 | struct ultravox_encoder { 13 | float norm_eps = 1e-5; 14 | int n_head = 20; 15 | int n_embd; 16 | int n_ctx = 1500; 17 | 18 | ggml_tensor * position_embeddings; 19 | 20 | ggml_tensor * conv1d_1_w; 21 | ggml_tensor * conv1d_1_b; 22 | ggml_tensor * conv1d_2_w; 23 | ggml_tensor * conv1d_2_b; 24 | 25 | ggml_tensor * post_ln_w; 26 | ggml_tensor * post_ln_b; 27 | 28 | // projector 29 | ggml_tensor * mm_norm_pre_w; 30 | ggml_tensor * mm_norm_mid_w; 31 | ggml_tensor * mm_1_w; 32 | ggml_tensor * mm_2_w; 33 | 34 | struct layer { 35 | ggml_tensor * ln_1_w; 36 | ggml_tensor * ln_1_b; 37 | 38 | ggml_tensor * q_w; 39 | ggml_tensor * q_b; 40 | ggml_tensor * k_w; 41 | ggml_tensor * v_w; 42 | ggml_tensor * v_b; 43 | ggml_tensor * o_w; 44 | ggml_tensor * o_b; 45 | ggml_tensor * ln_2_w; 46 | ggml_tensor * ln_2_b; 47 | 48 | ggml_tensor * ff_up_w; 49 | ggml_tensor * ff_up_b; 50 | ggml_tensor * ff_down_w; 51 | ggml_tensor * ff_down_b; 52 | }; 53 | std::vector layers; 54 | 55 | ultravox_encoder(ggml_easy::ctx & ctx, int n_layers) { 56 | const char * prefix = "a"; // audio 57 | position_embeddings = ctx.get_weight("%s.position_embd.weight", prefix); 58 | n_embd = position_embeddings->ne[0]; 59 | conv1d_1_b = ctx.get_weight("%s.conv1d.1.bias", prefix); 60 | conv1d_1_w = ctx.get_weight("%s.conv1d.1.weight", prefix); 61 | conv1d_2_b = ctx.get_weight("%s.conv1d.2.bias", prefix); 62 | conv1d_2_w = ctx.get_weight("%s.conv1d.2.weight", prefix); 63 | post_ln_b = ctx.get_weight("%s.post_ln.bias", prefix); 64 | post_ln_w = ctx.get_weight("%s.post_ln.weight", prefix); 65 | 66 | mm_norm_pre_w = ctx.get_weight("mm.%s.norm_pre.weight", prefix); 67 | mm_norm_mid_w = ctx.get_weight("mm.%s.norm_mid.weight", prefix); 68 | mm_1_w = ctx.get_weight("mm.%s.mlp.1.weight", prefix); 69 | mm_2_w = ctx.get_weight("mm.%s.mlp.2.weight", prefix); 70 | 71 | for (int il = 0; il < n_layers; il++) { 72 | layers.push_back({ 73 | .ln_1_w = ctx.get_weight("%s.blk.%d.ln1.weight", prefix, il), 74 | .ln_1_b = ctx.get_weight("%s.blk.%d.ln1.bias", prefix, il), 75 | 76 | .q_w = ctx.get_weight("%s.blk.%d.attn_q.weight", prefix, il), 77 | .q_b = ctx.get_weight("%s.blk.%d.attn_q.bias", prefix, il), 78 | .k_w = ctx.get_weight("%s.blk.%d.attn_k.weight", prefix, il), 79 | .v_w = ctx.get_weight("%s.blk.%d.attn_v.weight", prefix, il), 80 | .v_b = ctx.get_weight("%s.blk.%d.attn_v.bias", prefix, il), 81 | .o_w = ctx.get_weight("%s.blk.%d.attn_out.weight", prefix, il), 82 | .o_b = ctx.get_weight("%s.blk.%d.attn_out.bias", prefix, il), 83 | .ln_2_w = ctx.get_weight("%s.blk.%d.ln2.weight", prefix, il), 84 | .ln_2_b = ctx.get_weight("%s.blk.%d.ln2.bias", prefix, il), 85 | 86 | .ff_up_w = ctx.get_weight("%s.blk.%d.ffn_up.weight", prefix, il), 87 | .ff_up_b = 
ctx.get_weight("%s.blk.%d.ffn_up.bias", prefix, il), 88 | .ff_down_w = ctx.get_weight("%s.blk.%d.ffn_down.weight", prefix, il), 89 | .ff_down_b = ctx.get_weight("%s.blk.%d.ffn_down.bias", prefix, il), 90 | }); 91 | } 92 | } 93 | }; 94 | 95 | // unused, but just keep it here for fun 96 | static ggml_tensor * custom_gelu(ggml_context * ctx, ggml_tensor * a) { 97 | ggml_tensor * one = ggml_arange(ctx, 1.0f, 2.0f, 1.0f); 98 | one = ggml_view_1d(ctx, one, 1, 0); 99 | ggml_tensor * a3 = ggml_mul(ctx, a, ggml_mul(ctx, a, a)); 100 | ggml_tensor * a3_s = ggml_scale(ctx, a3, 0.035677f); 101 | ggml_tensor * inner = ggml_add(ctx, a3_s, ggml_scale(ctx, a, 0.797885f)); 102 | inner = ggml_scale(ctx, inner, 0.7978845608f); 103 | ggml_tensor * out = ggml_tanh(ctx, inner); 104 | out = ggml_add(ctx, out, one); 105 | out = ggml_mul(ctx, out, a); 106 | out = ggml_scale(ctx, out, 0.5f); 107 | return out; 108 | } 109 | 110 | int main() { 111 | ggml_easy::ctx_params params; 112 | params.use_gpu = false; 113 | ggml_easy::ctx ctx(params); 114 | ctx.load_gguf("ultravox-f32.gguf"); 115 | 116 | const int n_step = 1024; 117 | const int n_mel = 128; 118 | const int n_pos = n_step / 2; 119 | 120 | // model 121 | ultravox_encoder model(ctx, 32); 122 | 123 | const int n_layer = 32; 124 | const int n_head = model.n_head; 125 | const int n_embd = model.n_embd; 126 | const int d_head = n_embd / n_head; 127 | const float eps = model.norm_eps; 128 | 129 | const int proj_stack_factor = 8; 130 | 131 | // create cgraph 132 | ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 133 | ggml_tensor * inp_raw = utils.new_input("inp_raw", GGML_TYPE_F32, n_step, n_mel); 134 | ggml_tensor * positions = utils.new_input("positions", GGML_TYPE_I32, n_pos); 135 | 136 | ggml_tensor * inp; 137 | 138 | // conv1d block 139 | { 140 | // convolution + gelu 141 | ggml_tensor * cur = ggml_conv_1d(ctx0, model.conv1d_1_w, inp_raw, 1, 1, 1); 142 | cur = ggml_add(ctx0, cur, model.conv1d_1_b); 143 | 144 | //cur = ggml_cast(ctx0, cur, GGML_TYPE_F16); 145 | cur = ggml_gelu_erf(ctx0, cur); 146 | //cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); 147 | utils.debug_print(cur, "first conv"); 148 | utils.debug_print(ggml_sum(ctx0, cur), "first conv sum"); 149 | 150 | cur = ggml_conv_1d(ctx0, model.conv1d_2_w, cur, 2, 1, 1); 151 | cur = ggml_add(ctx0, cur, model.conv1d_2_b); 152 | utils.debug_print(cur, "second conv"); 153 | utils.debug_print(ggml_sum(ctx0, cur), "second conv sum"); 154 | 155 | //cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); 156 | cur = ggml_gelu_erf(ctx0, cur); 157 | //cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); 158 | // transpose 159 | inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); 160 | } 161 | 162 | //inp = ggml_scale(ctx0, inp, 0.0); // test 163 | 164 | utils.debug_print(inp, "after conv1d"); 165 | utils.debug_print(ggml_sum(ctx0, inp), "after conv1d sum"); 166 | 167 | // add position embeddings 168 | inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); 169 | 170 | utils.debug_print(inp, "after added pos"); 171 | 172 | // iterate layers 173 | for (int il = 0; il < n_layer; ++il) { 174 | auto & layer = model.layers[il]; 175 | ggml_tensor * cur = inp; 176 | 177 | cur = ggml_norm(ctx0, cur, eps); 178 | cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b); 179 | 180 | // attention 181 | { 182 | ggml_tensor * q = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); 183 | ggml_tensor * k = ggml_mul_mat(ctx0, layer.k_w, cur); // no bias for key 184 | ggml_tensor * v = 
ggml_add(ctx0, ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); 185 | 186 | q = ggml_reshape_3d(ctx0, q, d_head, n_head, n_pos); 187 | k = ggml_reshape_3d(ctx0, k, d_head, n_head, n_pos); 188 | v = ggml_reshape_3d(ctx0, v, d_head, n_head, n_pos); 189 | 190 | q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); 191 | k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); 192 | 193 | ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); 194 | ggml_mul_mat_set_prec(kq, GGML_PREC_F32); 195 | kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f / std::sqrt(d_head), 0.0f); 196 | 197 | v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); 198 | 199 | ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); 200 | //kqv = ggml_reshape_3d(ctx0, kqv, d_head, n_pos, n_head); 201 | kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); 202 | kqv = ggml_cont_2d(ctx0, kqv, n_embd, n_pos); 203 | 204 | cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.o_w, kqv), layer.o_b); 205 | } 206 | 207 | utils.debug_print(cur, "layer %d after attn", il); 208 | utils.debug_print(ggml_sum(ctx0, cur), "layer %d after attn sum", il); 209 | 210 | // residual 211 | cur = ggml_add(ctx0, cur, inp); 212 | 213 | inp = cur; // inp = residual, cur = hidden_states 214 | cur = ggml_norm(ctx0, cur, eps); 215 | cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b); 216 | 217 | // mlp 218 | { 219 | cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_up_w, cur), layer.ff_up_b); 220 | cur = ggml_gelu_erf(ctx0, cur); 221 | //cur = custom_gelu(ctx0, cur); 222 | cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_down_w, cur), layer.ff_down_b); 223 | } 224 | 225 | utils.debug_print(cur, "layer %d after ffn", il); 226 | utils.debug_print(ggml_sum(ctx0, cur), "layer %d after ffn sum", il); 227 | 228 | // residual 229 | cur = ggml_add(ctx0, cur, inp); 230 | 231 | inp = cur; 232 | 233 | utils.debug_print(inp, "layer %d out", il); 234 | utils.debug_print(ggml_sum(ctx0, inp), "layer %d out", il); 235 | } 236 | 237 | ggml_tensor * embeddings = inp; 238 | 239 | //embeddings = utils.new_input("test", GGML_TYPE_F32, 1280, 512); 240 | //embeddings = ggml_scale(ctx0, embeddings, 0.0); 241 | 242 | // output norm 243 | embeddings = ggml_norm(ctx0, embeddings, eps); 244 | embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); 245 | 246 | utils.debug_print(embeddings, "after output norm"); 247 | utils.debug_print(ggml_sum(ctx0, embeddings), "after output norm sum"); 248 | 249 | utils.debug_print(ggml_scale(ctx0, model.post_ln_w, 1.0), "post_ln_w"); 250 | utils.debug_print(ggml_scale(ctx0, model.post_ln_b, 1.0), "post_ln_b"); 251 | 252 | // StackAudioFrames 253 | // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py 254 | { 255 | int64_t stride = n_embd * proj_stack_factor; 256 | int64_t padded_len = GGML_PAD(ggml_nelements(embeddings), stride); 257 | int64_t pad = padded_len - ggml_nelements(embeddings); 258 | if (pad > 0) { 259 | embeddings = ggml_view_1d(ctx0, embeddings, ggml_nelements(embeddings), 0); 260 | embeddings = ggml_pad(ctx0, embeddings, pad, 0, 0, 0); 261 | } 262 | embeddings = ggml_view_2d(ctx0, embeddings, stride, padded_len / stride, 263 | ggml_row_size(embeddings->type, stride), 0); 264 | } 265 | 266 | utils.debug_print(embeddings, "after stack"); 267 | utils.debug_print(ggml_sum(ctx0, embeddings), "after stack sum"); 268 | 269 | // UltravoxProjector 270 | { 271 | ggml_tensor * cur = embeddings; 272 | // pre-norm 273 | cur = ggml_rms_norm(ctx0, cur, 1e-6); 274 | cur = ggml_mul(ctx0, cur, 
model.mm_norm_pre_w); 275 | 276 | // ffn in 277 | cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); 278 | 279 | utils.debug_print(cur, "before swiglu"); 280 | 281 | // swiglu 282 | { 283 | int64_t split_point = cur->ne[0] / 2; 284 | ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); 285 | ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); 286 | 287 | // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half 288 | x1 = ggml_silu(ctx0, x1); 289 | cur = ggml_mul(ctx0, x0, x1); 290 | } 291 | 292 | utils.debug_print(cur, "after swiglu"); 293 | 294 | // mid-norm 295 | cur = ggml_rms_norm(ctx0, cur, 1e-6); 296 | cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); 297 | 298 | // ffn out 299 | cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); 300 | 301 | embeddings = cur; 302 | } 303 | 304 | utils.debug_print(embeddings, "output"); 305 | utils.debug_print(ggml_sum(ctx0, embeddings), "output_sum"); 306 | }); 307 | 308 | // set the input 309 | { 310 | std::vector inp_raw(n_mel*n_step, 0.0f); 311 | for (int i = 0; i < n_step*n_mel; i++) { 312 | inp_raw[i] = (float)std::sin((float)i)*0.1f; 313 | //inp_raw[i] = 1.0f / (float)(i+1); 314 | } 315 | ctx.set_tensor_data("inp_raw", inp_raw.data()); 316 | 317 | std::vector positions(n_pos); 318 | for (int i = 0; i < n_pos; i++) positions[i] = i; 319 | ctx.set_tensor_data("positions", positions.data()); 320 | 321 | //std::vector test(1280*512, 0.1f); 322 | //for (int i = 0; i < (int)test.size(); i++) test[i] = (float)std::sin((float)i)*0.1f; 323 | //ctx.set_tensor_data("test", test.data()); 324 | } 325 | 326 | // compute 327 | ctx.compute(); 328 | 329 | return 0; 330 | } 331 | -------------------------------------------------------------------------------- /demo/whisper-encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | #include 6 | 7 | #define WHISPER_ASSERT GGML_ASSERT 8 | 9 | #define WHISPER_SAMPLE_RATE 16000 10 | #define WHISPER_N_FFT 400 11 | #define WHISPER_HOP_LENGTH 160 12 | #define WHISPER_CHUNK_SIZE 30 13 | 14 | namespace whisper_preprocessor { 15 | 16 | struct whisper_mel { 17 | int n_len; 18 | int n_len_org; 19 | int n_mel; 20 | 21 | std::vector data; 22 | }; 23 | 24 | struct whisper_filters { 25 | int32_t n_mel; 26 | int32_t n_fft; 27 | 28 | std::vector data; 29 | }; 30 | 31 | #define SIN_COS_N_COUNT WHISPER_N_FFT 32 | namespace { 33 | struct whisper_global_cache { 34 | // In FFT, we frequently use sine and cosine operations with the same values. 35 | // We can use precalculated values to speed up the process. 
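        // Concretely: sin_vals[i] = sin(2*pi*i / SIN_COS_N_COUNT) and
        // cos_vals[i] = cos(2*pi*i / SIN_COS_N_COUNT). dft()/fft() below then look up
        // these tables with an index stride of SIN_COS_N_COUNT / N instead of calling
        // sinf()/cosf() for every sample.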
36 | float sin_vals[SIN_COS_N_COUNT]; 37 | float cos_vals[SIN_COS_N_COUNT]; 38 | 39 | // Hann window (Use cosf to eliminate difference) 40 | // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html 41 | // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 42 | float hann_window[WHISPER_N_FFT]; 43 | 44 | whisper_global_cache() { 45 | fill_sin_cos_table(); 46 | fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window); 47 | } 48 | 49 | void fill_sin_cos_table() { 50 | for (int i = 0; i < SIN_COS_N_COUNT; i++) { 51 | double theta = (2 * M_PI * i) / SIN_COS_N_COUNT; 52 | sin_vals[i] = sinf(theta); 53 | cos_vals[i] = cosf(theta); 54 | } 55 | } 56 | 57 | void fill_hann_window(int length, bool periodic, float * output) { 58 | int offset = -1; 59 | if (periodic) { 60 | offset = 0; 61 | } 62 | for (int i = 0; i < length; i++) { 63 | output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); 64 | } 65 | } 66 | } global_cache; 67 | } 68 | 69 | // naive Discrete Fourier Transform 70 | // input is real-valued 71 | // output is complex-valued 72 | static void dft(const float* in, int N, float* out) { 73 | const int sin_cos_step = SIN_COS_N_COUNT / N; 74 | 75 | for (int k = 0; k < N; k++) { 76 | float re = 0; 77 | float im = 0; 78 | 79 | for (int n = 0; n < N; n++) { 80 | int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N 81 | re += in[n]*global_cache.cos_vals[idx]; // cos(t) 82 | im -= in[n]*global_cache.sin_vals[idx]; // sin(t) 83 | } 84 | 85 | out[k*2 + 0] = re; 86 | out[k*2 + 1] = im; 87 | } 88 | } 89 | 90 | // Cooley-Tukey FFT 91 | // poor man's implementation - use something better 92 | // input is real-valued 93 | // output is complex-valued 94 | static void fft(float* in, int N, float* out) { 95 | if (N == 1) { 96 | out[0] = in[0]; 97 | out[1] = 0; 98 | return; 99 | } 100 | 101 | const int half_N = N / 2; 102 | if (N - half_N*2 == 1) { 103 | dft(in, N, out); 104 | return; 105 | } 106 | 107 | float* even = in + N; 108 | for (int i = 0; i < half_N; ++i) { 109 | even[i]= in[2*i]; 110 | } 111 | float* even_fft = out + 2 * N; 112 | fft(even, half_N, even_fft); 113 | 114 | float* odd = even; 115 | for (int i = 0; i < half_N; ++i) { 116 | odd[i] = in[2*i + 1]; 117 | } 118 | float* odd_fft = even_fft + N; 119 | fft(odd, half_N, odd_fft); 120 | 121 | const int sin_cos_step = SIN_COS_N_COUNT / N; 122 | for (int k = 0; k < half_N; k++) { 123 | int idx = k * sin_cos_step; // t = 2*M_PI*k/N 124 | float re = global_cache.cos_vals[idx]; // cos(t) 125 | float im = -global_cache.sin_vals[idx]; // sin(t) 126 | 127 | float re_odd = odd_fft[2*k + 0]; 128 | float im_odd = odd_fft[2*k + 1]; 129 | 130 | out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd; 131 | out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd; 132 | 133 | out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; 134 | out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; 135 | } 136 | } 137 | 138 | static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector & samples, 139 | int n_samples, int frame_size, int frame_step, int n_threads, 140 | const whisper_filters & filters, whisper_mel & mel) { 141 | std::vector fft_in(frame_size * 2, 0.0); 142 | std::vector fft_out(frame_size * 2 * 2 * 2); 143 | 144 | int n_fft = filters.n_fft; 145 | int i = ith; 146 | 147 | // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist 148 | WHISPER_ASSERT(n_fft == 1 + (frame_size / 2)); 149 | 150 | 
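    // Frames are interleaved across threads: thread `ith` processes frames ith, ith + n_threads, ...
    // Per frame: apply the Hann window, run the FFT, keep the power of the first n_fft bins,
    // take the dot product with each mel filter, then log10 (clamped at 1e-10).
    // Results are stored row-major as mel.data[j * mel.n_len + i] for mel bin j and frame i.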
// calculate FFT only when fft_in are not all zero 151 | for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) { 152 | const int offset = i * frame_step; 153 | 154 | // apply Hann window (~10% faster) 155 | for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { 156 | fft_in[j] = hann[j] * samples[offset + j]; 157 | } 158 | 159 | // fill the rest with zeros 160 | if (n_samples - offset < frame_size) { 161 | std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); 162 | } 163 | 164 | // FFT 165 | fft(fft_in.data(), frame_size, fft_out.data()); 166 | 167 | // Calculate modulus^2 of complex numbers 168 | // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. 169 | for (int j = 0; j < n_fft; j++) { 170 | fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); 171 | } 172 | 173 | // mel spectrogram 174 | for (int j = 0; j < mel.n_mel; j++) { 175 | double sum = 0.0; 176 | // unroll loop (suggested by GH user @lunixbochs) 177 | int k = 0; 178 | for (k = 0; k < n_fft - 3; k += 4) { 179 | sum += 180 | fft_out[k + 0] * filters.data[j * n_fft + k + 0] + 181 | fft_out[k + 1] * filters.data[j * n_fft + k + 1] + 182 | fft_out[k + 2] * filters.data[j * n_fft + k + 2] + 183 | fft_out[k + 3] * filters.data[j * n_fft + k + 3]; 184 | } 185 | // handle n_fft remainder 186 | for (; k < n_fft; k++) { 187 | sum += fft_out[k] * filters.data[j * n_fft + k]; 188 | } 189 | sum = log10(std::max(sum, 1e-10)); 190 | mel.data[j * mel.n_len + i] = sum; 191 | } 192 | } 193 | 194 | // Otherwise fft_out are all zero 195 | double sum = log10(1e-10); 196 | for (; i < mel.n_len; i += n_threads) { 197 | for (int j = 0; j < mel.n_mel; j++) { 198 | mel.data[j * mel.n_len + i] = sum; 199 | } 200 | } 201 | } 202 | 203 | // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 204 | static bool log_mel_spectrogram( 205 | const float * samples, 206 | const int n_samples, 207 | const int /*sample_rate*/, 208 | const int frame_size, 209 | const int frame_step, 210 | const int n_mel, 211 | const int n_threads, 212 | const whisper_filters & filters, 213 | const bool debug, 214 | whisper_mel & mel) { 215 | const int64_t t_start_us = ggml_time_us(); 216 | 217 | // Hann window 218 | WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size"); 219 | const float * hann = global_cache.hann_window; 220 | 221 | // Calculate the length of padding 222 | int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; 223 | int64_t stage_2_pad = frame_size / 2; 224 | 225 | // Initialize a vector and copy data from C array to it. 
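    // Resulting layout (total length = n_samples + stage_1_pad + 2 * stage_2_pad):
    //   [ reflect pad (stage_2_pad) | original samples (n_samples) | zeros (stage_1_pad + stage_2_pad) ]
    // i.e. 200 reflected samples in front, the audio itself, then 30 s of silence plus 200 trailing zeros.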
226 | std::vector samples_padded; 227 | samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); 228 | std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); 229 | 230 | // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio 231 | std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0); 232 | 233 | // reflective pad 200 samples at the beginning of audio 234 | std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin()); 235 | 236 | mel.n_mel = n_mel; 237 | // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936 238 | // Calculate number of frames + remove the last frame 239 | mel.n_len = (samples_padded.size() - frame_size) / frame_step; 240 | // Calculate semi-padded sample length to ensure compatibility 241 | mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step; 242 | mel.data.resize(mel.n_mel * mel.n_len); 243 | 244 | { 245 | std::vector workers(n_threads - 1); 246 | for (int iw = 0; iw < n_threads - 1; ++iw) { 247 | workers[iw] = std::thread( 248 | log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), 249 | n_samples + stage_2_pad, frame_size, frame_step, n_threads, 250 | std::cref(filters), std::ref(mel)); 251 | } 252 | 253 | // main thread 254 | log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel); 255 | 256 | for (int iw = 0; iw < n_threads - 1; ++iw) { 257 | workers[iw].join(); 258 | } 259 | } 260 | 261 | // clamping and normalization 262 | double mmax = -1e20; 263 | for (int i = 0; i < mel.n_mel*mel.n_len; i++) { 264 | if (mel.data[i] > mmax) { 265 | mmax = mel.data[i]; 266 | } 267 | } 268 | 269 | mmax -= 8.0; 270 | 271 | for (int i = 0; i < mel.n_mel*mel.n_len; i++) { 272 | if (mel.data[i] < mmax) { 273 | mel.data[i] = mmax; 274 | } 275 | 276 | mel.data[i] = (mel.data[i] + 4.0)/4.0; 277 | } 278 | 279 | // Dump log_mel_spectrogram 280 | if (debug) { 281 | std::ofstream outFile("log_mel_spectrogram.json"); 282 | outFile << "["; 283 | for (uint64_t i = 0; i < mel.data.size() - 1; i++) { 284 | outFile << mel.data[i] << ", "; 285 | } 286 | outFile << mel.data[mel.data.size() - 1] << "]"; 287 | outFile.close(); 288 | } 289 | 290 | return true; 291 | } 292 | 293 | } // namespace whisper_preprocessor 294 | 295 | struct whisper_encoder { 296 | float norm_eps = 1e-5; 297 | int n_head = 6; 298 | int n_embd; 299 | int n_ctx = 1500; 300 | 301 | ggml_tensor * pos_embd; 302 | 303 | ggml_tensor * conv_1_w; 304 | ggml_tensor * conv_1_b; 305 | ggml_tensor * conv_2_w; 306 | ggml_tensor * conv_2_b; 307 | 308 | ggml_tensor * out_norm_w; 309 | ggml_tensor * out_norm_b; 310 | 311 | struct layer { 312 | ggml_tensor * inp_norm_w; 313 | ggml_tensor * inp_norm_b; 314 | 315 | ggml_tensor * attn_q; 316 | ggml_tensor * attn_q_b; 317 | ggml_tensor * attn_k; 318 | ggml_tensor * attn_v; 319 | ggml_tensor * attn_v_b; 320 | ggml_tensor * attn_o; 321 | ggml_tensor * attn_o_b; 322 | ggml_tensor * attn_post_norm_w; 323 | ggml_tensor * attn_post_norm_b; 324 | 325 | ggml_tensor * ffn_up; 326 | ggml_tensor * ffn_up_b; 327 | ggml_tensor * ffn_down; 328 | ggml_tensor * ffn_down_b; 329 | }; 330 | std::vector layers; 331 | 332 | whisper_encoder(ggml_easy::ctx & ctx, int n_layers) { 333 | const char * prefix = "encoder"; 334 | pos_embd = ctx.get_weight("model.%s.embed_positions.weight", 
prefix); 335 | n_embd = pos_embd->ne[0]; 336 | conv_1_b = ctx.get_weight("model.%s.conv1.bias", prefix); 337 | conv_1_w = ctx.get_weight("model.%s.conv1.weight", prefix); 338 | conv_2_b = ctx.get_weight("model.%s.conv2.bias", prefix); 339 | conv_2_w = ctx.get_weight("model.%s.conv2.weight", prefix); 340 | out_norm_b = ctx.get_weight("model.%s.layer_norm.bias", prefix); 341 | out_norm_w = ctx.get_weight("model.%s.layer_norm.weight", prefix); 342 | for (int il = 0; il < n_layers; il++) { 343 | layers.push_back({ 344 | .inp_norm_w = ctx.get_weight("model.%s.layers.%d.self_attn_layer_norm.weight", prefix, il), 345 | .inp_norm_b = ctx.get_weight("model.%s.layers.%d.self_attn_layer_norm.bias", prefix, il), 346 | 347 | .attn_q = ctx.get_weight("model.%s.layers.%d.self_attn.q_proj.weight", prefix, il), 348 | .attn_q_b = ctx.get_weight("model.%s.layers.%d.self_attn.q_proj.bias", prefix, il), 349 | .attn_k = ctx.get_weight("model.%s.layers.%d.self_attn.k_proj.weight", prefix, il), 350 | .attn_v = ctx.get_weight("model.%s.layers.%d.self_attn.v_proj.weight", prefix, il), 351 | .attn_v_b = ctx.get_weight("model.%s.layers.%d.self_attn.v_proj.bias", prefix, il), 352 | .attn_o = ctx.get_weight("model.%s.layers.%d.self_attn.out_proj.weight", prefix, il), 353 | .attn_o_b = ctx.get_weight("model.%s.layers.%d.self_attn.out_proj.bias", prefix, il), 354 | .attn_post_norm_w = ctx.get_weight("model.%s.layers.%d.final_layer_norm.weight", prefix, il), 355 | .attn_post_norm_b = ctx.get_weight("model.%s.layers.%d.final_layer_norm.bias", prefix, il), 356 | 357 | .ffn_up = ctx.get_weight("model.%s.layers.%d.fc1.weight", prefix, il), 358 | .ffn_up_b = ctx.get_weight("model.%s.layers.%d.fc1.bias", prefix, il), 359 | .ffn_down = ctx.get_weight("model.%s.layers.%d.fc2.weight", prefix, il), 360 | .ffn_down_b = ctx.get_weight("model.%s.layers.%d.fc2.bias", prefix, il), 361 | }); 362 | } 363 | } 364 | 365 | ggml_tensor * forward(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input, ggml_tensor * input_pos) { 366 | int n_tokens = n_ctx; //;input->ne[1]; 367 | ggml_tensor * x = input; 368 | 369 | auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { 370 | x = ggml_norm(ctx0, x, norm_eps); 371 | x = ggml_mul(ctx0, x, w); 372 | x = ggml_add(ctx0, x, b); 373 | return x; 374 | }; 375 | 376 | auto add_pos = [&](ggml_tensor * x) { 377 | //ggml_tensor * pos_embd_selected = ggml_get_rows(ctx0, pos_embd, input_pos); 378 | //x = ggml_add(ctx0, x, pos_embd_selected); 379 | x = ggml_add(ctx0, x, pos_embd); 380 | return x; 381 | }; 382 | 383 | // TODO: do this at conversion time, see LlamaModel.permute in convert_hf_to_gguf.py 384 | auto llama_permute = [&](ggml_tensor * w) { 385 | ggml_tensor * tmp = ggml_reshape_4d(ctx0, w, w->ne[0], w->ne[1] / n_head / 2, 2, n_head); 386 | tmp = ggml_permute(ctx0, tmp, 0, 2, 1, 3); 387 | tmp = ggml_cont(ctx0, tmp); 388 | return ggml_reshape_2d(ctx0, tmp, w->ne[0], w->ne[1]); 389 | }; 390 | 391 | // convolution + gelu 392 | { 393 | ggml_tensor * tmp; 394 | tmp = ggml_cast(ctx0, conv_1_w, GGML_TYPE_F16); // TODO: do this at conversion time 395 | x = ggml_conv_1d_ph(ctx0, tmp, input, 1, 1); 396 | tmp = ggml_cont(ctx0, ggml_transpose(ctx0, conv_1_b)); // TODO: do this at conversion time 397 | x = ggml_add(ctx0, x, tmp); 398 | 399 | x = ggml_gelu(ctx0, x); 400 | 401 | tmp = ggml_cast(ctx0, conv_2_w, GGML_TYPE_F16); // TODO: do this at conversion time 402 | x = ggml_conv_1d_ph(ctx0, tmp, x, 2, 1); 403 | tmp = ggml_cont(ctx0, ggml_transpose(ctx0, conv_2_b)); // 
TODO: do this at conversion time 404 | x = ggml_add(ctx0, x, tmp); 405 | 406 | x = ggml_gelu(ctx0, x); 407 | } 408 | 409 | x = ggml_cont(ctx0, ggml_transpose(ctx0, x)); 410 | x = add_pos(x); 411 | ggml_tensor * residual = x; 412 | 413 | int i = 0; // for debugging 414 | for (auto & layer : layers) { 415 | residual = x; 416 | 417 | // input layer norm 418 | x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b); 419 | 420 | // self attention 421 | { 422 | ggml_tensor * q = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.attn_q, x), layer.attn_q_b); 423 | ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x); // no bias for key 424 | ggml_tensor * v = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.attn_v, x), layer.attn_v_b); 425 | 426 | int n_embd_head = n_embd / n_head; 427 | q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); 428 | k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head, n_tokens); 429 | v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head, n_tokens); 430 | 431 | int n_rot = n_embd_head; 432 | q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); 433 | q = ggml_scale(ctx0, q, 1.0f / std::sqrt(n_embd_head)); 434 | // utils.debug_print(q, "q rope"); 435 | 436 | k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); 437 | // utils.debug_print(k, "k rope"); 438 | 439 | ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); 440 | kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f, 0.0f); 441 | // utils.debug_print(kq, "kq softmax"); 442 | 443 | v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); 444 | 445 | ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); 446 | //kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, n_head); 447 | kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); 448 | kqv = ggml_cont_2d(ctx0, kqv, n_embd, n_tokens); 449 | // utils.debug_print(kqv, "kqv"); 450 | // utils.debug_print(ggml_sum(ctx0, kqv), "kqv_sum"); 451 | 452 | x = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.attn_o, kqv), layer.attn_o_b); 453 | } 454 | 455 | // residual 456 | x = ggml_add(ctx0, x, residual); 457 | 458 | residual = x; 459 | x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b); 460 | 461 | // mlp 462 | { 463 | x = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ffn_up, x), layer.ffn_up_b); 464 | x = ggml_gelu(ctx0, x); 465 | x = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ffn_down, x), layer.ffn_down_b); 466 | } 467 | 468 | // residual 469 | x = ggml_add(ctx0, x, residual); 470 | // utils.debug_print(x, "output_layer_%d", i); 471 | // utils.debug_print(ggml_sum(ctx0, x), "output_layer_%d_sum", i); i++; 472 | } 473 | 474 | // output norm 475 | x = layer_norm(x, out_norm_w, out_norm_b); 476 | 477 | return x; 478 | } 479 | }; 480 | 481 | int main() { 482 | ggml_easy::ctx_params params; 483 | ggml_easy::ctx ctx(params); 484 | ctx.load_gguf("models/whisper-mel-filters.gguf"); 485 | ctx.load_safetensors("whisper-tiny.safetensors", {}); 486 | 487 | whisper_preprocessor::whisper_filters mel_filters; 488 | { 489 | auto mel_80 = ctx.get_weight("mel_80"); 490 | ggml_easy::debug::print_tensor_shape(mel_80); 491 | mel_filters.n_mel = mel_80->ne[1]; 492 | mel_filters.n_fft = mel_80->ne[0]; 493 | mel_filters.data.resize(ggml_nelements(mel_80)); 494 | ggml_backend_tensor_get(mel_80, mel_filters.data.data(), 0, mel_filters.data.size()); 495 | 496 | // for (int row = 0; row < mel_filters.n_mel; row++) { 497 | // for (int i = 0; i < mel_filters.n_fft; i++) { 498 | // float elem = mel_filters.data[row * mel_filters.n_fft + i]; 499 | // if (elem != 0.0) { 500 | // printf("[%d, %d] %f\n", row, i, elem); 501 | // } 502 | // } 503 | // 
printf("\n"); 504 | // } 505 | } 506 | 507 | std::vector samples(3000, 1.0); 508 | 509 | whisper_preprocessor::whisper_mel mel; 510 | whisper_preprocessor::log_mel_spectrogram( 511 | samples.data(), 512 | samples.size(), 513 | WHISPER_SAMPLE_RATE, 514 | WHISPER_N_FFT, 515 | WHISPER_HOP_LENGTH, 516 | mel_filters.n_mel, 517 | 4, // threads 518 | mel_filters, 519 | false, 520 | mel); 521 | 522 | printf("mel.n_len: %d\n", mel.n_len); 523 | printf("mel.n_mel: %d\n", mel.n_mel); 524 | printf("mel.size: %zu\n", mel.data.size()); 525 | // print first and last 10 elements 526 | for (int i = 0; i < 10; i++) { 527 | printf("%f ", mel.data[i]); 528 | } 529 | printf("\n"); 530 | for (int i = mel.data.size() - 10; i < mel.data.size(); i++) { 531 | printf("%f ", mel.data[i]); 532 | } 533 | printf("\n"); 534 | 535 | 536 | // model 537 | whisper_encoder encoder(ctx, 4); 538 | 539 | // create cgraph 540 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 541 | ggml_tensor * input = utils.new_input("mel", GGML_TYPE_F32, 2*encoder.n_ctx, mel.n_mel); 542 | ggml_easy::debug::print_tensor_shape(input); 543 | utils.debug_print(input, "input"); 544 | ggml_tensor * pos = nullptr; //utils.new_input("pos", GGML_TYPE_I32, mel.n_len); 545 | ggml_tensor * result = encoder.forward(ctx_gf, utils, input, pos); 546 | utils.debug_print(result, "result"); 547 | utils.mark_output(result, "result"); 548 | }); 549 | 550 | // set data 551 | ctx.set_tensor_data("mel", mel.data.data()); 552 | // set the input 553 | { 554 | int mel_offset = 0; 555 | int n_ctx = encoder.n_ctx; 556 | std::vector dst(2*n_ctx * mel.n_mel, 0.0f); 557 | 558 | const int i0 = std::min(mel_offset, mel.n_len); 559 | const int i1 = std::min(mel_offset + 2*n_ctx, mel.n_len); 560 | 561 | for (int j = 0; j < mel.n_mel; ++j) { 562 | for (int i = i0; i < i1; ++i) { 563 | dst[j*2*n_ctx + (i - i0)] = mel.data[j*mel.n_len + i]; 564 | } 565 | } 566 | 567 | ctx.set_tensor_data("mel", dst.data()); 568 | } 569 | 570 | // set pos 571 | // std::vector pos(mel.n_len); 572 | // for (size_t i = 0; i < pos.size(); i++) { 573 | // pos[i] = i; 574 | // } 575 | // ctx.set_tensor_data("pos", pos.data()); 576 | 577 | // compute 578 | ctx.compute(); 579 | 580 | return 0; 581 | } 582 | -------------------------------------------------------------------------------- /ggml-easy.h: -------------------------------------------------------------------------------- 1 | // 2 | // ggml-easy.hpp 3 | // 4 | // Copyright (c) 2025 Xuan-Son Nguyen. All rights reserved. 
5 | // MIT License 6 | // 7 | 8 | #include "ggml.h" 9 | #include "ggml-cpp.h" 10 | #include "ggml-cpu.h" 11 | #include "ggml-alloc.h" 12 | #include "ggml-backend.h" 13 | #include "gguf.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace ggml_easy { 25 | 26 | struct ctx_params { 27 | bool use_gpu = true; 28 | int max_nodes = 8192; 29 | ggml_log_level log_level = GGML_LOG_LEVEL_INFO; 30 | bool safetensors_ignore_unknown_dtype = false; 31 | }; 32 | 33 | void log_cb(ggml_log_level level, const char * text, void * cur_lvl_ptr) { 34 | ggml_log_level cur_lvl = *(ggml_log_level *) cur_lvl_ptr; 35 | if (cur_lvl > level) { 36 | return; 37 | } 38 | fputs(text, stderr); 39 | fflush(stderr); 40 | } 41 | 42 | // forward declaration 43 | namespace debug { 44 | static void print_tensor_shape(ggml_tensor * t); 45 | static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n = 3); 46 | } 47 | 48 | // forward declaration for safetensors (lightweight) JSON parser 49 | struct safetensors_json_parser { 50 | enum state { 51 | STATE_ROOT, 52 | STATE_OBJ_METADATA, 53 | STATE_OBJ_TENSOR, 54 | }; 55 | struct tensor { 56 | std::string name; 57 | bool ignored = false; 58 | ggml_type type = GGML_TYPE_F32; // only F32, F16, BF16 are supported 59 | std::array shape = {0, 1, 1, 1}; // row-major order 60 | uint64_t offset = 0; 61 | void print() { 62 | printf("tensor: %-60s, type: %s, shape: [%4" PRId64 ", %4" PRId64 ", %4" PRId64 ", %4" PRId64 "], offset: %" PRIu64 "\n", 63 | name.c_str(), ggml_type_name(type), shape[0], shape[1], shape[2], shape[3], offset); 64 | } 65 | }; 66 | bool ignore_unknown_dtype = false; 67 | std::vector tensors; 68 | size_t metadata_size = 0; 69 | safetensors_json_parser(const char * json, size_t metadata_size, std::map name_replace_map, bool ignore_unknown_dtype); 70 | uint64_t get_data_offset(); 71 | }; 72 | 73 | std::string string_format(const char * fmt, ...); 74 | void string_replace_all(std::string & s, const std::string & search, const std::string & replace); 75 | 76 | //////////////////////////////////////// 77 | 78 | struct ctx { 79 | ggml_log_level log_level; 80 | 81 | std::unordered_map tensors; 82 | 83 | ggml_cgraph * gf = nullptr; 84 | ggml_context * ctx_gf = nullptr; 85 | std::vector buf_compute_meta; 86 | int max_nodes; 87 | 88 | std::vector backend_ptrs; 89 | std::vector backend_buft; 90 | 91 | ggml_backend_t backend = nullptr; 92 | ggml_backend_t backend_cpu = nullptr; 93 | ggml_backend_buffer_t buf = nullptr; 94 | 95 | ggml_backend_sched_ptr sched; 96 | 97 | private: 98 | // private data members 99 | struct loaded_gguf { 100 | gguf_context_ptr ctx_gguf; 101 | ggml_context_ptr ctx_data; 102 | }; 103 | std::vector loaded_ggufs; 104 | 105 | struct printed_tensor { 106 | ggml_tensor * t; 107 | bool full; 108 | }; 109 | std::vector dbg_printed_tensors; 110 | bool safetensors_ignore_unknown_dtype; 111 | 112 | 113 | public: 114 | /** 115 | * Construct a new ctx object 116 | * If use_gpu is true, the GPU backend will be used, otherwise the CPU backend will be used 117 | */ 118 | ctx(const ctx_params & params) : log_level(params.log_level), max_nodes(params.max_nodes) { 119 | ggml_log_set(log_cb, &log_level); 120 | safetensors_ignore_unknown_dtype = params.safetensors_ignore_unknown_dtype; 121 | backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); 122 | backend = params.use_gpu 123 | ? 
ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr) 124 | : nullptr; 125 | 126 | if (backend) { 127 | log(GGML_LOG_LEVEL_INFO, "%s: using %s backend\n", __func__, ggml_backend_name(backend)); 128 | backend_ptrs.push_back(backend); 129 | backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); 130 | } else { 131 | backend = backend_cpu; 132 | log(GGML_LOG_LEVEL_INFO, "%s: using CPU backend\n", __func__); 133 | } 134 | 135 | backend_ptrs.push_back(backend_cpu); 136 | backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); 137 | 138 | sched.reset( 139 | ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, true) 140 | ); 141 | 142 | buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); 143 | } 144 | 145 | /** 146 | * Get a weight tensor by name, can only be used after model is loaded. 147 | * Throws an exception if the tensor is not found. 148 | */ 149 | ggml_tensor * get_weight(const char *fmt, ...) { 150 | std::vector str(128); 151 | va_list va; 152 | va_start(va, fmt); 153 | vsnprintf(str.data(), 128, fmt, va); 154 | va_end(va); 155 | auto it = tensors.find(str.data()); 156 | if (it == tensors.end()) { 157 | throw std::runtime_error(string_format("weight tensor not found: %s", str.data())); 158 | } 159 | return it->second; 160 | } 161 | 162 | /** 163 | * Load a GGUF model file 164 | * The tensors will be loaded into the context and can be accessed via `ctx.get_weight(name)` 165 | * The GGUF metadata will be loaded into `ctx.ctx_gguf` 166 | */ 167 | void load_gguf(const char * fname) { 168 | ggml_context * meta = nullptr; 169 | 170 | gguf_init_params params = { 171 | /*.no_alloc = */ true, 172 | /*.ctx = */ &meta, 173 | }; 174 | 175 | gguf_context * ctx_gguf = gguf_init_from_file(fname, params); 176 | 177 | // load tensors 178 | const int n_tensors = gguf_get_n_tensors(ctx_gguf); 179 | ggml_init_params ggml_params = { 180 | /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), 181 | /*.mem_buffer =*/ NULL, 182 | /*.no_alloc =*/ true, 183 | }; 184 | 185 | ggml_context * ctx_data = ggml_init(ggml_params); 186 | auto fin = std::ifstream(fname, std::ios::binary); 187 | if (!fin) { 188 | ggml_free(meta); 189 | throw std::runtime_error("cannot open model file for loading tensors"); 190 | } 191 | 192 | // add tensors to context 193 | for (int i = 0; i < n_tensors; ++i) { 194 | const char * name = gguf_get_tensor_name(ctx_gguf, i); 195 | ggml_tensor * t = ggml_get_tensor(meta, name); 196 | ggml_tensor * cur = ggml_dup_tensor(ctx_data, t); 197 | ggml_set_name(cur, name); 198 | tensors.insert({name, cur}); 199 | } 200 | 201 | // alloc memory and offload data 202 | std::map offset_map; // empty map, use default value 203 | if (!load_tensors_to_backend(fin, offset_map, ctx_gguf, ctx_data)) { 204 | ggml_free(meta); 205 | throw std::runtime_error("failed to load tensors to backend"); 206 | } 207 | log(GGML_LOG_LEVEL_INFO, "%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname); 208 | ggml_free(meta); 209 | 210 | loaded_ggufs.push_back({ 211 | gguf_context_ptr(ctx_gguf), 212 | ggml_context_ptr(ctx_data), 213 | }); 214 | } 215 | 216 | /** 217 | * Load a Safetensors model file 218 | * The tensors will be loaded into the context and can be accessed via `ctx.get_weight(name)` 219 | * In some cases, the tensor name is too long and GGML won't accept it. You can provide a name_replace_map to replace the name. 
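     * (GGML rejects names of GGML_MAX_NAME characters or more, and the safetensors parser below
     * throws in that case, so long HF module paths have to be shortened with this map.)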
220 | * For example: 221 | * name_replace_map = {{".acoustic_residual_vector_quantizer", ".arvq"}} 222 | */ 223 | void load_safetensors(const char * fname, std::map name_replace_map) { 224 | auto fin = std::ifstream(fname, std::ios::binary); 225 | if (!fin) { 226 | throw std::runtime_error("cannot open model file: " + std::string(fname)); 227 | } 228 | 229 | uint64_t metadata_size = 0; 230 | fin.read(reinterpret_cast(&metadata_size), sizeof(metadata_size)); 231 | if (metadata_size < 2) { 232 | throw std::runtime_error("invalid metadata size, got " + std::to_string(metadata_size)); 233 | } 234 | 235 | std::vector buf(metadata_size); 236 | fin.read(buf.data(), metadata_size); 237 | if (!fin) { 238 | throw std::runtime_error("failed to read metadata"); 239 | } 240 | 241 | safetensors_json_parser parser(buf.data(), metadata_size, name_replace_map, safetensors_ignore_unknown_dtype); 242 | 243 | ggml_init_params ggml_params = { 244 | /*.mem_size =*/ (parser.tensors.size() + 1) * ggml_tensor_overhead(), 245 | /*.mem_buffer =*/ NULL, 246 | /*.no_alloc =*/ true, 247 | }; 248 | ggml_context * ctx_data = ggml_init(ggml_params); 249 | gguf_context * ctx_gguf = gguf_init_empty(); 250 | 251 | std::map offset_map; 252 | for (auto & tensor : parser.tensors) { 253 | ggml_tensor * t = ggml_new_tensor(ctx_data, tensor.type, 4, tensor.shape.data()); 254 | ggml_set_name(t, tensor.name.c_str()); 255 | gguf_add_tensor(ctx_gguf, t); 256 | tensors.insert({tensor.name, t}); 257 | offset_map.insert({t, parser.get_data_offset() + tensor.offset}); 258 | } 259 | 260 | // alloc memory and offload data 261 | if (!load_tensors_to_backend(fin, offset_map, ctx_gguf, ctx_data)) { 262 | throw std::runtime_error("failed to load tensors to backend"); 263 | } 264 | log(GGML_LOG_LEVEL_INFO, "%s: Loaded %d tensors from %s\n", __func__, (int)gguf_get_n_tensors(ctx_gguf), fname); 265 | 266 | loaded_ggufs.push_back({ 267 | gguf_context_ptr(ctx_gguf), 268 | ggml_context_ptr(ctx_data), 269 | }); 270 | } 271 | 272 | /** 273 | * Various utility functions for building a cgraph. 274 | * 275 | * This object will be provided to the user's builder function as the last argument. 276 | */ 277 | struct build_utils { 278 | ggml_context * gf_ctx; 279 | ggml_cgraph * gf; 280 | std::vector printed_tensors; 281 | build_utils(ggml_context * gf_ctx, ggml_cgraph * gf) : gf_ctx(gf_ctx), gf(gf) {} 282 | /** 283 | * Add an input tensor, this function does these steps: 284 | * 1. ggml_new_tensor_4d 285 | * 2. ggml_set_name 286 | * 3. ggml_set_input 287 | */ 288 | ggml_tensor * new_input(const char * name, ggml_type dtype, int64_t ne0, int64_t ne1 = 1, int64_t ne2 = 1, int64_t ne3 = 1) { 289 | ggml_tensor * t = ggml_new_tensor_4d(gf_ctx, dtype, ne0, ne1, ne2, ne3); 290 | ggml_set_name(t, name); 291 | ggml_set_input(t); 292 | return t; 293 | } 294 | /** 295 | * Mark this tensor as output, this function does these steps: 296 | * 1. ggml_set_name 297 | * 2. ggml_set_output 298 | * 3. ggml_build_forward_expand 299 | */ 300 | void mark_output(ggml_tensor * t, const char * name) { 301 | ggml_set_name(t, name); 302 | ggml_set_output(t); 303 | ggml_build_forward_expand(gf, t); 304 | } 305 | /** 306 | * Print this tensor as soon as it is computed, useful for debugging. 307 | * name is optional, if not provided, the existing name of the tensor will be used 308 | */ 309 | template 310 | void debug_print(ggml_tensor * t, Params&&... 
params) { 311 | std::string name = string_format(std::forward(params)...); 312 | if (t->flags) { 313 | // prevent renaming input/output tensor name by accident 314 | t = ggml_cpy(gf_ctx, t, ggml_dup_tensor(gf_ctx, t)); 315 | } 316 | mark_output(t, name.c_str()); 317 | printed_tensors.push_back({t, false}); 318 | } 319 | /** 320 | * Same with `debug_print` but also print the full tensor shape and data. 321 | */ 322 | template 323 | void debug_print_full(ggml_tensor * t, Params&&... params) { 324 | std::string name = string_format(std::forward(params)...); 325 | if (t->flags) { 326 | // prevent renaming input/output tensor name by accident 327 | t = ggml_cpy(gf_ctx, t, ggml_dup_tensor(gf_ctx, t)); 328 | } 329 | mark_output(t, name.c_str()); 330 | printed_tensors.push_back({t, true}); 331 | } 332 | }; 333 | 334 | /** 335 | * Build a cgraph using the given builder function. 336 | * 337 | * The built cgraph will be stored in `ctx.gf` 338 | */ 339 | void build_graph(std::function builder_fn) { 340 | ggml_free(ctx_gf); 341 | struct ggml_init_params params = { 342 | /*.mem_size =*/ buf_compute_meta.size(), 343 | /*.mem_buffer =*/ buf_compute_meta.data(), 344 | /*.no_alloc =*/ true, 345 | }; 346 | 347 | ctx_gf = ggml_init(params); 348 | ggml_backend_sched_reset(sched.get()); 349 | gf = ggml_new_graph_custom(ctx_gf, max_nodes, false); 350 | 351 | build_utils utils(ctx_gf, gf); 352 | 353 | builder_fn(ctx_gf, gf, utils); 354 | ggml_backend_sched_alloc_graph(sched.get(), gf); 355 | dbg_printed_tensors = std::move(utils.printed_tensors); 356 | } 357 | 358 | /** 359 | * Same as `build_graph` but without `build_utils` 360 | */ 361 | void build_graph(std::function builder_fn) { 362 | build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, build_utils & utils) { 363 | builder_fn(ctx_gf, gf); 364 | }); 365 | } 366 | 367 | /** 368 | * Compute the given cgraph 369 | */ 370 | ggml_status compute() { 371 | ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf); 372 | if (status == GGML_STATUS_SUCCESS) { 373 | for (auto & p : dbg_printed_tensors) { 374 | std::vector data(ggml_nbytes(p.t)); 375 | ggml_backend_tensor_get(p.t, data.data(), 0, ggml_nbytes(p.t)); 376 | ggml_easy::debug::print_tensor_shape(p.t); 377 | ggml_easy::debug::print_tensor_data(p.t, data.data(), p.full ? LONG_MAX : 3); 378 | } 379 | } 380 | return status; 381 | } 382 | 383 | /** 384 | * Set the data of a tensor by name 385 | */ 386 | void set_tensor_data(const std::string & name, const void * data) { 387 | ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); 388 | if (!t) { 389 | throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); 390 | } 391 | ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); 392 | } 393 | 394 | /** 395 | * Set the data of a tensor by name using a function. 
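     * The callback receives indices (i0, i1, i2, i3) in ggml's ne order, i0 being the innermost
     * dimension. Only GGML_TYPE_F32 tensors are supported; other types throw.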
396 | * 397 | * Example usage: 398 | * 399 | * ``` 400 | * ctx.set_tensor_data("x", [](int i0, int i1, int i2, int i3) { 401 | * return i0 + i1 + i2 + i3; 402 | * }); 403 | * ``` 404 | */ 405 | void set_tensor_data(const std::string & name, std::function data_fn) { 406 | ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); 407 | if (!t) { 408 | throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); 409 | } 410 | if (t->type != GGML_TYPE_F32) { 411 | throw std::runtime_error(string_format("tensor type must be GGML_TYPE_F32: %s", name.c_str())); 412 | } 413 | std::vector data(ggml_nelements(t)); 414 | for (int d3 = 0; d3 < t->ne[3]; ++d3) { 415 | for (int d2 = 0; d2 < t->ne[2]; ++d2) { 416 | for (int d1 = 0; d1 < t->ne[1]; ++d1) { 417 | for (int d0 = 0; d0 < t->ne[0]; ++d0) { 418 | int i = d3 * t->ne[2] + d2 * t->ne[1] + d1 * t->ne[0] + d0; 419 | data[i] = data_fn(d0, d1, d2, d3); 420 | } 421 | } 422 | } 423 | } 424 | ggml_backend_tensor_set(t, data.data(), 0, ggml_nbytes(t)); 425 | } 426 | 427 | /** 428 | * Get the data of a tensor by name. 429 | * 430 | * Example usage: 431 | * 432 | * ``` 433 | * auto result = ctx.get_tensor_data("result"); 434 | * ggml_tensor * result_tensor = result.first; 435 | * std::vector & result_data = result.second; 436 | * float * result_data_f32 = (float *) result_data.data(); 437 | * ``` 438 | */ 439 | std::pair> get_tensor_data(const std::string & name) { 440 | ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); 441 | if (!t) { 442 | throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); 443 | } 444 | std::vector data(ggml_nbytes(t)); 445 | ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); 446 | return std::make_pair(t, data); 447 | } 448 | 449 | ~ctx() { 450 | ggml_backend_buffer_free(buf); 451 | } 452 | 453 | private: 454 | bool load_tensors_to_backend(std::ifstream & fin, std::map & offset_map, gguf_context * ctx_gguf, ggml_context * ctx_data) { 455 | std::vector read_buf; 456 | const bool use_custom_offset = !offset_map.empty(); 457 | 458 | // alloc memory and offload data 459 | ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); 460 | buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft); 461 | ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); 462 | for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); ++i) { 463 | const char * name = gguf_get_tensor_name(ctx_gguf, i); 464 | ggml_tensor * cur = ggml_get_tensor(ctx_data, name); 465 | const size_t offset = use_custom_offset 466 | ? offset_map[cur] 467 | : gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i); 468 | log(GGML_LOG_LEVEL_DEBUG, "%s: Loading tensor \"%s\"\n", __func__, name); 469 | fin.seekg(offset, std::ios::beg); 470 | if (!fin) { 471 | log(GGML_LOG_LEVEL_ERROR, "failed to seek for tensor: %s", name); 472 | } 473 | int num_bytes = ggml_nbytes(cur); 474 | if (ggml_backend_buft_is_host(buft)) { 475 | // for the CPU and Metal backend, we can read directly into the tensor 476 | fin.read(reinterpret_cast(cur->data), num_bytes); 477 | } else { 478 | // read into a temporary buffer first, then copy to device memory 479 | read_buf.resize(num_bytes); 480 | fin.read(reinterpret_cast(read_buf.data()), num_bytes); 481 | ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); 482 | } 483 | } 484 | return true; 485 | } 486 | 487 | void log(ggml_log_level level, const char * format, ...) 
{ 488 | va_list args; 489 | va_start(args, format); 490 | log_impl(level, format, args); 491 | va_end(args); 492 | } 493 | 494 | void log_impl(ggml_log_level level, const char * format, va_list args) { 495 | va_list args_copy; 496 | va_copy(args_copy, args); 497 | char buffer[128]; 498 | int len = vsnprintf(buffer, 128, format, args); 499 | if (len < 128) { 500 | log_cb(level, buffer, &log_level); 501 | } else { 502 | char * buffer2 = new char[len + 1]; 503 | vsnprintf(buffer2, len + 1, format, args_copy); 504 | buffer2[len] = 0; 505 | log_cb(level, buffer2, &log_level); 506 | delete[] buffer2; 507 | } 508 | va_end(args_copy); 509 | } 510 | }; // struct ctx 511 | 512 | using gf_build_fn = std::function; 513 | 514 | //////////////////////////////////////// 515 | 516 | safetensors_json_parser::safetensors_json_parser( 517 | const char * json, size_t metadata_size, std::map name_replace_map, bool ignore_unknown_dtype 518 | ) : metadata_size(metadata_size), ignore_unknown_dtype(ignore_unknown_dtype) { 519 | size_t i = 0; 520 | state s = STATE_ROOT; 521 | std::vector buf; 522 | tensor cur_tensor; 523 | buf.reserve(128); 524 | auto i_pp = [&]() { 525 | if (++i > metadata_size) { 526 | throw std::runtime_error("unexpected end of JSON"); 527 | } 528 | return i - 1; 529 | }; 530 | auto pp_i = [&]() { 531 | if (++i > metadata_size) { 532 | throw std::runtime_error("unexpected end of JSON"); 533 | } 534 | return i; 535 | }; 536 | auto read_until = [&](char end) -> std::string { 537 | buf.clear(); i_pp(); 538 | while (json[i] != end) buf.push_back(json[i_pp()]); 539 | return std::string(buf.data(), buf.size()); 540 | }; 541 | auto read_number = [&]() -> std::string { 542 | buf.clear(); i_pp(); 543 | while ('0' <= json[i] && json[i] <= '9') buf.push_back(json[i_pp()]); 544 | return std::string(buf.data(), buf.size()); 545 | }; 546 | while (i < metadata_size) { 547 | char c = json[i]; 548 | if (i == 0) GGML_ASSERT(c == '{' && "json must start with open curly bracket"); 549 | 550 | // string 551 | if (c == '\"') { 552 | std::string key = read_until('\"'); 553 | 554 | if (s == STATE_ROOT) { 555 | if (key == "__metadata__") { 556 | s = STATE_OBJ_METADATA; 557 | i_pp(); 558 | continue; 559 | } else { 560 | cur_tensor.name = key; 561 | for (auto & p : name_replace_map) { 562 | string_replace_all(cur_tensor.name, p.first, p.second); 563 | } 564 | if (cur_tensor.name.empty()) { 565 | throw std::runtime_error("empty tensor name"); 566 | } 567 | if (cur_tensor.name.size() > GGML_MAX_NAME - 1) { 568 | throw std::runtime_error("tensor name too long: '" + cur_tensor.name + "'; please use name_replace_map to rename it"); 569 | } 570 | i_pp(); 571 | s = STATE_OBJ_TENSOR; 572 | continue; 573 | } 574 | } else if (s == STATE_OBJ_TENSOR) { 575 | if (key == "dtype") { 576 | GGML_ASSERT(json[pp_i()] == ':'); 577 | GGML_ASSERT(json[pp_i()] == '\"'); 578 | std::string value = read_until('\"'); 579 | /**/ if (value == "F32") cur_tensor.type = GGML_TYPE_F32; 580 | else if (value == "F16") cur_tensor.type = GGML_TYPE_F16; 581 | else if (value == "BF16") cur_tensor.type = GGML_TYPE_BF16; 582 | else if (ignore_unknown_dtype) cur_tensor.ignored = true; 583 | else throw std::runtime_error("unknown dtype: " + value); 584 | } else if (key == "shape") { 585 | GGML_ASSERT(json[pp_i()] == ':'); 586 | GGML_ASSERT(json[pp_i()] == '['); 587 | std::vector values; 588 | for (int j = 0; j < 4; j++) { 589 | std::string value = read_number(); 590 | if (value.empty()) break; 591 | values.push_back(std::stoll(value)); 592 | } 593 | 
GGML_ASSERT(values.size() >= 0); 594 | // flip column-major to row-major 595 | for (size_t j = 0; j < values.size(); j++) { 596 | cur_tensor.shape[j] = values[values.size() - j - 1]; 597 | } 598 | } else if (key == "data_offsets") { 599 | GGML_ASSERT(json[pp_i()] == ':'); 600 | GGML_ASSERT(json[pp_i()] == '['); 601 | std::string off_start = read_number(); 602 | GGML_ASSERT(!off_start.empty()); 603 | cur_tensor.offset = std::stoull(off_start); 604 | std::string off_end = read_number(); 605 | GGML_ASSERT(!off_end.empty()); // unused 606 | } 607 | } 608 | } 609 | 610 | // object 611 | else if (c == '{') { 612 | if (s == STATE_OBJ_METADATA) { 613 | // skip metadata object 614 | while (json[pp_i()] != '}') {} 615 | s = STATE_ROOT; 616 | } else if (s == STATE_OBJ_TENSOR) { 617 | // read next string 618 | } 619 | } else if (c == '}') { 620 | if (s == STATE_OBJ_TENSOR) { 621 | // cur_tensor.print(); // debug 622 | if (!cur_tensor.ignored) { 623 | tensors.push_back(cur_tensor); 624 | } 625 | cur_tensor = {}; 626 | s = STATE_ROOT; 627 | } 628 | } 629 | 630 | // ignore ',' and ':' 631 | i++; 632 | } 633 | } 634 | 635 | uint64_t safetensors_json_parser::get_data_offset() { 636 | // alignment: https://github.com/huggingface/safetensors/blob/7d5af853631628137a79341ddc5611d18a17f3fe/safetensors/src/tensor.rs#L202 637 | static const int alignment = 8; // bytes 638 | return GGML_PAD(8 + metadata_size, alignment); 639 | } 640 | 641 | //////////////////////////////////////// 642 | 643 | namespace debug { 644 | static void print_backend_buffer_info(ctx & gctx) { 645 | if (gctx.backend && gctx.buf) { 646 | auto buft_weight = ggml_backend_get_default_buffer_type(gctx.backend); 647 | size_t size_weight = ggml_backend_buffer_get_size(gctx.buf); 648 | if (size_weight > 1) { 649 | printf("%s: %10s weight buffer size = %8.2f MiB\n", __func__, 650 | ggml_backend_buft_name(buft_weight), 651 | size_weight / 1024.0 / 1024.0); 652 | } 653 | } 654 | for (size_t i = 0; i < gctx.backend_ptrs.size(); ++i) { 655 | ggml_backend_t backend = gctx.backend_ptrs[i]; 656 | ggml_backend_buffer_type_t buft = gctx.backend_buft[i]; 657 | size_t size_sched = ggml_backend_sched_get_buffer_size(gctx.sched.get(), backend); 658 | if (size_sched > 1) { 659 | printf("%s: %10s compute buffer size = %8.2f MiB\n", __func__, 660 | ggml_backend_buft_name(buft), 661 | size_sched / 1024.0 / 1024.0); 662 | } 663 | } 664 | } 665 | 666 | static void print_tensor_shape(ggml_tensor * t) { 667 | printf("%s.shape = [", t->name); 668 | for (int i = 0; i < ggml_n_dims(t); ++i) { 669 | printf("%" PRId64, t->ne[i]); 670 | if (i < ggml_n_dims(t) - 1) { 671 | printf(", "); 672 | } 673 | } 674 | printf("]\n"); 675 | } 676 | 677 | static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { 678 | ggml_type type = t->type; 679 | int64_t * ne = t->ne; 680 | size_t * nb = t->nb; 681 | for (int64_t i3 = 0; i3 < ne[3]; i3++) { 682 | printf("%s.data: [\n", t->name); 683 | for (int64_t i2 = 0; i2 < ne[2]; i2++) { 684 | if (i2 == n && ne[2] > 2*n) { 685 | printf(" ..., \n"); 686 | i2 = ne[2] - n; 687 | } 688 | printf(" [\n"); 689 | for (int64_t i1 = 0; i1 < ne[1]; i1++) { 690 | if (i1 == n && ne[1] > 2*n) { 691 | printf(" ..., \n"); 692 | i1 = ne[1] - n; 693 | } 694 | printf(" ["); 695 | for (int64_t i0 = 0; i0 < ne[0]; i0++) { 696 | if (i0 == n && ne[0] > 2*n) { 697 | printf("..., "); 698 | i0 = ne[0] - n; 699 | } 700 | size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; 701 | float v; 702 | if (type == GGML_TYPE_F16) { 703 | v = 
ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); 704 | } else if (type == GGML_TYPE_F32) { 705 | v = *(float *) &data[i]; 706 | } else if (type == GGML_TYPE_I32) { 707 | v = (float) *(int32_t *) &data[i]; 708 | } else if (type == GGML_TYPE_I16) { 709 | v = (float) *(int16_t *) &data[i]; 710 | } else if (type == GGML_TYPE_I8) { 711 | v = (float) *(int8_t *) &data[i]; 712 | } else { 713 | GGML_ABORT("fatal error"); 714 | } 715 | printf("%8.4f", v); 716 | if (i0 < ne[0] - 1) printf(", "); 717 | } 718 | printf("],\n"); 719 | } 720 | printf(" ],\n"); 721 | } 722 | printf(" ]\n"); 723 | //printf(" sum = %f\n", sum); 724 | } 725 | } 726 | } // namespace debug 727 | 728 | //////////////////////////////////////// 729 | 730 | std::string string_format(const char * fmt, ...) { 731 | va_list ap; 732 | va_list ap2; 733 | va_start(ap, fmt); 734 | va_copy(ap2, ap); 735 | int size = vsnprintf(NULL, 0, fmt, ap); 736 | GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT 737 | std::vector buf(size + 1); 738 | int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); 739 | GGML_ASSERT(size2 == size); 740 | va_end(ap2); 741 | va_end(ap); 742 | return std::string(buf.data(), size); 743 | } 744 | 745 | void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { 746 | if (search.empty()) { 747 | return; 748 | } 749 | std::string builder; 750 | builder.reserve(s.length()); 751 | size_t pos = 0; 752 | size_t last_pos = 0; 753 | while ((pos = s.find(search, last_pos)) != std::string::npos) { 754 | builder.append(s, last_pos, pos - last_pos); 755 | builder.append(replace); 756 | last_pos = pos + search.length(); 757 | } 758 | builder.append(s, last_pos, std::string::npos); 759 | s = std::move(builder); 760 | } 761 | 762 | } // namespace ggml_easy 763 | 764 | // 765 | // extension to ggml functions 766 | // 767 | 768 | // create tensor with all elements set to 1.0 769 | ggml_tensor * ggml_ones(ggml_context * ctx, int64_t ne0, int64_t ne1 = 1, int64_t ne2 = 1, int64_t ne3 = 1) { 770 | ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 771 | x = ggml_cos(ctx, ggml_scale(ctx, x, 0.0f)); // cos(0) = 1 772 | return ggml_repeat_4d(ctx, x, ne0, ne1, ne2, ne3); 773 | } 774 | -------------------------------------------------------------------------------- /demo/kyutai-mimi.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | /** 9 | * (Stil WIP) This is my trial to reimplement the Mimi model from Kyutai using ggml, the code is based on HF transformers implementation. See "modeling_mimi.py" for the original code. 10 | * 11 | * To get the model (we are using safetensors directly, no need to convert to GGUF): 12 | * 1. Download the model.safetensors file from https://huggingface.co/kyutai/mimi 13 | * 2. Rename the "model.safetensors" to "mimi.safetensors" 14 | * 15 | * Note: do NOT upload the gguf to the internet, it is NOT compatible with llama.cpp and people will complain. 16 | * 17 | * --- 18 | * 19 | * For the ENCODER, it takes raw audio waveform as input and output audio codes. Steps are: 20 | * 1. Convert waveform to embeddings using mimi_encoder (SEANet encoder), basically just a bunch of Conv1d but the padding is quite tricky. 21 | * 2. Process the embeddings using a transformer, here we use an auto-aggressive one (causal mask). This is because Laurent told me that they only trained the model with auto-regressive setting. 22 | * 3. 
Quantize the embeddings using a residual vector quantizer (RVQ) to get the audio codes. The RVQ has 32 codebooks, one for semantic and 31 for acoustic. Doing this on ggml is a bit tricky because I need to reimplement euclidean distance from scratch. 23 | * 24 | * In the code below, we take 2048 samples of audio waveform as input (value = 1.0f), expected output is 2 tokens (according to python implementation). 25 | * 26 | * Python code: 27 | * model = MimiModel.from_pretrained("/Users/ngxson/work/models/mimi") 28 | * input_values = torch.ones((1, 1, 2048)) 29 | * encoder_outputs = model.encode(input_values) # this should match the output of ggml 30 | * 31 | * --- 32 | * 33 | * For the DECODER, we simply do the reverse of the above steps. 34 | * The good thing is that this time, we don't need to care about euclidean distance. 35 | * 36 | * Python code: 37 | * model = MimiModel.from_pretrained("/Users/ngxson/work/models/mimi") 38 | * input_values = torch.tensor([[ [i, i+1, i+2] for i in range(0, 3*32, 3) ]], dtype=torch.long) 39 | * audio_values = model.decode(input_values)[0] # this should match the output of ggml 40 | * 41 | * Expected output: 42 | * torch.Size([1, 1, 5760]) 43 | * tensor([[[ 0.0117, 0.0130, -0.0007, ..., -0.1295, -0.1258, -0.1343]]]) 44 | */ 45 | 46 | struct mimi_config_t { 47 | bool causal = true; 48 | int max_position_embeddings = 8000; 49 | int num_hidden_layers = 8; 50 | int n_embd = 512; 51 | int n_ffn = 2048; 52 | int n_head = 8; 53 | int n_head_kv = 8; 54 | int n_rot = 64; 55 | float norm_eps = 1e-5; 56 | float rope_theta = 10000.0f; 57 | int sliding_window = 250; 58 | std::array upsampling_ratio = {8, 6, 5, 4}; 59 | std::array downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio 60 | // vector quantizer 61 | float frame_rate = 12.5; 62 | int audio_channels = 1; 63 | int codebook_size = 2048; 64 | int codebook_dim = 256; 65 | int n_semantic_components = 1; 66 | int n_acoustic_components = 31; 67 | // decode 68 | float trim_right_ratio = 1.0f; 69 | } mimi_config; 70 | 71 | 72 | /////////////////////////////////////////////////////////////////////////// 73 | // extension to ggml.h 74 | // TODO: add these ops to the library (ofc with a more optimized kernel) 75 | 76 | 77 | // mode: (0) constant, (1) reflect, (2) replicate, (3) circular 78 | // value is only used in "constant" 79 | // only "constant" with 0.0f and "replicate" are implemented here 80 | static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode, 81 | int64_t pad_left, int64_t pad_right, float value = 0.0f) { 82 | GGML_ASSERT(value == 0.0f); // we can technically use ggml_arange, but for simplication we only support 0.0f 83 | GGML_ASSERT(mode == 0 || mode == 2); 84 | if (pad_left > 0) { 85 | ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]); 86 | if (mode == 0) { 87 | tmp = ggml_scale(ctx0, tmp, value); 88 | } else if (mode == 2) { 89 | ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column 90 | tmp = ggml_repeat(ctx0, elem, tmp); 91 | } 92 | x = ggml_concat(ctx0, tmp, x, 0); 93 | } 94 | if (pad_right > 0) { 95 | ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]); 96 | if (mode == 0) { 97 | tmp = ggml_scale(ctx0, tmp, value); 98 | } else if (mode == 2) { 99 | int64_t last = x->ne[0] - 1; 100 | ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column 101 | tmp = ggml_repeat(ctx0, elem, tmp); 102 | } 103 | x = ggml_concat(ctx0, x, tmp, 
0); 104 | } 105 | return x; 106 | } 107 | 108 | static ggml_tensor * ggml_argmin(ggml_context * ctx0, ggml_tensor * x) { 109 | ggml_tensor * tmp = ggml_scale(ctx0, x, -1.0f); 110 | return ggml_argmax(ctx0, tmp); 111 | } 112 | 113 | // lookup nearest vector in codebook based on euclidean distance 114 | // return index of the vector in codebook, single element with I32 type 115 | static ggml_tensor * ggml_lookup_vec(ggml_context * ctx0, ggml_tensor * codebook, ggml_tensor * x) { 116 | ggml_tensor * tmp = ggml_add(ctx0, codebook, ggml_scale(ctx0, x, -1.0f)); // a - x 117 | tmp = ggml_mul(ctx0, tmp, tmp); // (a - x) ** 2 118 | tmp = ggml_sum_rows(ctx0, tmp); 119 | tmp = ggml_sqrt(ctx0, tmp); 120 | tmp = ggml_cont(ctx0, ggml_transpose(ctx0, tmp)); 121 | // villain version of argmin :-) 122 | tmp = ggml_argmax(ctx0, ggml_scale(ctx0, tmp, -1.0f)); 123 | GGML_ASSERT(ggml_nelements(tmp) == 1); 124 | return tmp; 125 | } 126 | 127 | // lookup vectors in codebook based on euclidean distance 128 | // return indices of the vectors in codebook, 1D tensor with I32 type 129 | static ggml_tensor * ggml_lookup_vectors(ggml_easy::ctx::build_utils & utils, ggml_context * ctx0, ggml_tensor * codebook, ggml_tensor * list_vec, ggml_tensor * out, size_t offset) { 130 | int64_t n_col = list_vec->ne[0]; 131 | int64_t n_row = list_vec->ne[1]; 132 | for (int64_t ir = 0; ir < n_row; ir++) { 133 | ggml_tensor * row = ggml_view_1d(ctx0, list_vec, n_col, ir*n_col*ggml_element_size(list_vec)); 134 | ggml_tensor * idx = ggml_lookup_vec(ctx0, codebook, row); 135 | ggml_tensor * dst = ggml_view_1d(ctx0, out, 1, offset + ir*ggml_element_size(out)); 136 | ggml_build_forward_expand(utils.gf, ggml_cpy(ctx0, idx, dst)); 137 | } 138 | return out; 139 | } 140 | 141 | 142 | /////////////////////////////////////////////////////////////////////////// 143 | 144 | 145 | static int64_t div_ceil(int64_t a, int64_t b) { 146 | return a / b + (a % b ? 1 : 0); 147 | } 148 | 149 | static ggml_tensor * mimi_conv_1d(ggml_easy::ctx::build_utils & utils, ggml_context * ctx0, ggml_tensor * x, 150 | ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool pad_zero = true) { 151 | int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1; 152 | int64_t p_total = kernel_size - stride; // padding total 153 | int64_t p_half = p_total / 2; 154 | int64_t is_p_odd = p_total % 2; // is padding odd 155 | 156 | int64_t n_frames = div_ceil(x->ne[0] - kernel_size + p_total, stride); 157 | int64_t ideal_len = n_frames * stride + kernel_size - p_total; 158 | int64_t p_extra = ideal_len - x->ne[0]; 159 | 160 | int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra; 161 | int64_t p_left = p_total - (mimi_config.causal ? 0 : p_half); 162 | 163 | x = ggml_pad_ext(ctx0, x, pad_zero ? 
0 : 2, p_left, p_right); 164 | // utils.debug_print(x, "mimi_conv_1d_padded"); 165 | 166 | kernel = ggml_cast(ctx0, kernel, GGML_TYPE_F16); // TODO: do this at conversion time 167 | x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation); 168 | if (bias) { 169 | bias = ggml_cont(ctx0, ggml_transpose(ctx0, bias)); // TODO: do this at conversion time 170 | x = ggml_add(ctx0, x, bias); 171 | } 172 | ggml_set_name(x, "mimi_conv_1d"); 173 | return x; 174 | }; 175 | 176 | static ggml_tensor * mimi_conv_transpose_1d(ggml_easy::ctx::build_utils & utils, ggml_context * ctx0, ggml_tensor * x, 177 | ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool depthwise) { 178 | GGML_ASSERT(x->ne[1] == kernel->ne[2]); 179 | int64_t n_rows = x->ne[1]; 180 | int64_t kernel_size = kernel->ne[0]; 181 | int64_t p_total = kernel_size - stride; // padding total 182 | 183 | int64_t p_right = mimi_config.causal 184 | ? (float)p_total / mimi_config.trim_right_ratio 185 | : p_total / 2; 186 | int64_t p_left = p_total - p_right; 187 | 188 | ggml_tensor * out = nullptr; 189 | 190 | kernel = ggml_cast(ctx0, kernel, GGML_TYPE_F16); // TODO: do this at conversion time 191 | 192 | if (depthwise) { 193 | for (int64_t ir = 0; ir < n_rows; ir++) { 194 | ggml_tensor * row = ggml_view_1d(ctx0, x, 195 | x->ne[0], ir*x->ne[0]*ggml_element_size(x)); 196 | ggml_tensor * krn = ggml_view_1d(ctx0, kernel, 197 | kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel)); 198 | if (ir == 0) { 199 | ggml_set_name(krn, "krn"); 200 | ggml_easy::debug::print_tensor_shape(krn); 201 | } 202 | row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation); 203 | if (ir == 0) { 204 | ggml_set_name(row, "ggml_conv_transpose_1d __________"); 205 | ggml_easy::debug::print_tensor_shape(row); 206 | } 207 | // unpad (remove p_right and p_left columns) 208 | row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row)); 209 | 210 | // TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc 211 | out = out ? ggml_concat(ctx0, out, row, 1) : row; 212 | } 213 | 214 | } else { 215 | out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation); 216 | // unpad 217 | out = ggml_view_2d(ctx0, out, 218 | out->ne[0] - p_total, out->ne[1], 219 | out->nb[1], p_left*ggml_element_size(out)); 220 | } 221 | 222 | if (bias) { 223 | bias = ggml_cont(ctx0, ggml_transpose(ctx0, bias)); // TODO: do this at conversion time 224 | out = ggml_add(ctx0, out, bias); 225 | } 226 | 227 | return out; 228 | } 229 | 230 | // based on MimiEncoder 231 | // SEANet encoder as used by Mimi. 232 | struct mimi_encoder_decoder { 233 | ggml_easy::ctx & ctx; 234 | struct layer { 235 | bool is_elu = false; 236 | bool is_resnet = false; 237 | bool is_transposed_conv = false; 238 | ggml_tensor * conv_0_w; 239 | ggml_tensor * conv_0_b; 240 | ggml_tensor * conv_1_w; 241 | ggml_tensor * conv_1_b; 242 | int stride = 1; 243 | }; 244 | int dilation_growth_rate = 2; // TODO: unused? 
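    // HF layer indices (see load_encoder()/load_decoder() below): layer 0 is the input conv,
    // layer 13 an ELU and layer 14 the output conv. Each entry of repeated_pattern marks where a group starts:
    //   encoder group: resnet block (block.1 + block.3) -> ELU -> strided conv (strides 4, 5, 6, 8)
    //   decoder group: ELU -> transposed conv (strides 8, 6, 5, 4) -> resnet block (block.1 + block.3)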
245 | std::vector layers; 246 | 247 | std::array repeated_pattern = {1, 4, 7, 10}; 248 | 249 | mimi_encoder_decoder(ggml_easy::ctx & ctx) : ctx(ctx) {} 250 | 251 | void load_encoder() { 252 | layers.push_back({ 253 | .conv_0_w = ctx.get_weight("encoder.layers.0.conv.weight"), 254 | .conv_0_b = ctx.get_weight("encoder.layers.0.conv.bias"), 255 | }); 256 | for (int i = 0; i < (int)repeated_pattern.size(); ++i) { 257 | int i_start = repeated_pattern[i]; 258 | // residual layers 259 | layers.push_back({ 260 | .is_resnet = true, 261 | .conv_0_w = ctx.get_weight("encoder.layers.%d.block.1.conv.weight", i_start), 262 | .conv_0_b = ctx.get_weight("encoder.layers.%d.block.1.conv.bias", i_start), 263 | .conv_1_w = ctx.get_weight("encoder.layers.%d.block.3.conv.weight", i_start), 264 | .conv_1_b = ctx.get_weight("encoder.layers.%d.block.3.conv.bias", i_start), 265 | }); 266 | // downsampling layers 267 | layers.push_back({ 268 | .is_elu = true, // layer (i_start + 1) 269 | }); 270 | layers.push_back({ 271 | .conv_0_w = ctx.get_weight("encoder.layers.%d.conv.weight", i_start + 2), 272 | .conv_0_b = ctx.get_weight("encoder.layers.%d.conv.bias", i_start + 2), 273 | .stride = mimi_config.downsampling_ratio[i], 274 | }); 275 | } 276 | layers.push_back({ 277 | .is_elu = true, // layer 13 278 | }); 279 | layers.push_back({ 280 | .conv_0_w = ctx.get_weight("encoder.layers.14.conv.weight"), 281 | .conv_0_b = ctx.get_weight("encoder.layers.14.conv.bias"), 282 | }); 283 | } 284 | 285 | void load_decoder() { 286 | layers.push_back({ 287 | .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"), 288 | .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"), 289 | }); 290 | for (int i = 0; i < (int)repeated_pattern.size(); ++i) { 291 | int i_start = repeated_pattern[i]; 292 | // upsampling layers 293 | layers.push_back({ 294 | .is_elu = true, // layer (i_start) 295 | }); 296 | layers.push_back({ 297 | .is_transposed_conv = true, 298 | .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), 299 | .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), 300 | .stride = mimi_config.upsampling_ratio[i], 301 | }); 302 | // residual layers 303 | layers.push_back({ 304 | .is_resnet = true, 305 | .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2), 306 | .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2), 307 | .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2), 308 | .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2), 309 | }); 310 | } 311 | layers.push_back({ 312 | .is_elu = true, // layer 13 313 | }); 314 | layers.push_back({ 315 | .conv_0_w = ctx.get_weight("decoder.layers.14.conv.weight"), 316 | .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"), 317 | }); 318 | } 319 | 320 | ggml_tensor * forward(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input) { 321 | ggml_tensor * x = input; 322 | 323 | // int i = 0; // for debugging 324 | for (auto & layer : layers) { 325 | if (layer.is_elu) { 326 | x = ggml_elu(ctx0, x); 327 | } else if (layer.is_resnet) { 328 | ggml_tensor * residual = x; 329 | x = ggml_elu(ctx0, x); 330 | ggml_easy::debug::print_tensor_shape(x); 331 | ggml_easy::debug::print_tensor_shape(layer.conv_0_w); 332 | x = mimi_conv_1d(utils, ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1); 333 | x = ggml_elu(ctx0, x); 334 | x = mimi_conv_1d(utils, ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1); 335 | x = ggml_add(ctx0, x, residual); 
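                // with stride 1, mimi_conv_1d pads both convs back to the input length,
                // so this element-wise residual add is shape-safe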

struct mimi_transformer {
    struct layer {
        ggml_tensor * inp_norm_w;
        ggml_tensor * inp_norm_b;

        ggml_tensor * attn_q;
        ggml_tensor * attn_k;
        ggml_tensor * attn_v;
        ggml_tensor * attn_o;
        ggml_tensor * attn_post_norm_w;
        ggml_tensor * attn_post_norm_b;
        ggml_tensor * attn_layer_scale;

        ggml_tensor * ffn_up;
        ggml_tensor * ffn_down;
        ggml_tensor * mlp_layer_scale;
    };
    std::vector<layer> layers;

    mimi_transformer(ggml_easy::ctx & ctx, const char * prefix, int n_layers) {
        for (int il = 0; il < n_layers; il++) {
            layers.push_back({
                .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il),
                .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il),

                .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il),
                .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il),
                .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il),
                .attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il),
                .attn_post_norm_w = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il),
                .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il),
                .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il),

                .ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il),
                .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il),
                .mlp_layer_scale = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il),
            });
        }
    }

    ggml_tensor * forward(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input, ggml_tensor * inp_pos) {
        int n_tokens = input->ne[1];
        ggml_tensor * x = input;

        auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
            x = ggml_norm(ctx0, x, mimi_config.norm_eps);
            x = ggml_mul(ctx0, x, w);
            x = ggml_add(ctx0, x, b);
            return x;
        };

        // TODO: do this at conversion time, see LlamaModel.permute in convert_hf_to_gguf.py
        auto llama_permute = [&](ggml_tensor * w) {
            int n_head = mimi_config.n_head;
            ggml_tensor * tmp = ggml_reshape_4d(ctx0, w, w->ne[0], w->ne[1] / n_head / 2, 2, n_head);
            tmp = ggml_permute(ctx0, tmp, 0, 2, 1, 3);
            tmp = ggml_cont(ctx0, tmp);
            return ggml_reshape_2d(ctx0, tmp, w->ne[0], w->ne[1]);
        };
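
        // llama_permute reorders the rows of the Q/K projection weights so that ggml's default
        // rope (used below with mode 0, which rotates interleaved pairs) matches the HF
        // checkpoint, which stores each head's dims in the "rotate half" layout. Conceptually,
        // per head, rows [x0 .. x_{k-1}, x_k .. x_{2k-1}] become [x0, x_k, x1, x_{k+1}, ...]
        // with k = head_dim / 2. This appears to be the same transform as LlamaModel.permute in
        // convert_hf_to_gguf.py (see the TODO above); doing it at graph-build time keeps the
        // conversion script unchanged, at the cost of extra permute/cont nodes per layer.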

        ggml_tensor * residual = input;

        int i = 0; // for debugging
        for (auto & layer : layers) {
            residual = x;

            // input layer norm
            x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b);

            // self attention
            {
                ggml_tensor * q = ggml_mul_mat(ctx0, llama_permute(layer.attn_q), x);
                ggml_tensor * k = ggml_mul_mat(ctx0, llama_permute(layer.attn_k), x);
                ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x);

                int n_embd_head = mimi_config.n_embd / mimi_config.n_head;
                q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens);
                k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens);
                v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens);

                int n_rot = n_embd_head;
                q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0);
                q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3));
                // utils.debug_print(q, "q rope");

                k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0);
                k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));
                // utils.debug_print(k, "k rope");

                ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
                ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp
                kq = ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head));
                ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens);
                kq = ggml_soft_max_inplace(ctx0, kq_masked);
                // utils.debug_print(kq, "kq softmax");

                v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));

                ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
                kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head);
                kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
                kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens);
                // utils.debug_print(kqv, "kqv");
                // utils.debug_print(ggml_sum(ctx0, kqv), "kqv_sum");

                x = ggml_mul_mat(ctx0, layer.attn_o, kqv);
            }

            // residual
            x = ggml_mul(ctx0, x, layer.attn_layer_scale);
            x = ggml_add(ctx0, x, residual);
            // utils.debug_print(x, "after_attn_%d", i);

            residual = x;
            x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b);

            // mlp
            {
                x = ggml_mul_mat(ctx0, layer.ffn_up, x);
                x = ggml_gelu(ctx0, x);
                x = ggml_mul_mat(ctx0, layer.ffn_down, x);
            }

            // residual
            x = ggml_mul(ctx0, x, layer.mlp_layer_scale);
            x = ggml_add(ctx0, x, residual);
            // utils.debug_print(x, "output_layer_%d", i);
            // utils.debug_print(ggml_sum(ctx0, x), "output_layer_%d_sum", i); i++;
        }

        return x;
    }
};

struct mimi_residual_vector_quantizer {
    struct component {
        ggml_tensor * codebook_embed_sum;
        ggml_tensor * codebook_cluster_usage;
        ggml_tensor * get_embd(ggml_context * ctx0) {
            // TODO: do this at conversion time
            ggml_tensor * tmp = ggml_cont(ctx0, ggml_transpose(ctx0, codebook_cluster_usage));
            tmp = ggml_clamp(ctx0, tmp, mimi_config.norm_eps, FLT_MAX);
            return ggml_div(ctx0, codebook_embed_sum, tmp);
        }
    };
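
    // get_embd() reconstructs the actual codebook vectors from what is stored in the checkpoint:
    // embed_sum holds the accumulated sum of the vectors assigned to each code and cluster_usage
    // holds how many vectors contributed, so embed = embed_sum / max(cluster_usage, eps). This
    // matches how EMA-trained VQ codebooks are typically stored; precomputing the division at
    // conversion time (per the TODO above) would remove these extra nodes from the graph.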
ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook.cluster_usage", i), 509 | }); 510 | } 511 | acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight"); 512 | acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight"); 513 | for (int i = 0; i < mimi_config.n_acoustic_components; i++) { 514 | acoustic_components.push_back({ 515 | .codebook_embed_sum = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook.embed_sum", i), 516 | .codebook_cluster_usage = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook.cluster_usage", i), 517 | }); 518 | } 519 | } 520 | 521 | // 🆘🆘🆘🆘🆘 FIXME: this does not work correcly, about 50% of the output codes are incorrect 522 | ggml_tensor * encode(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input) { 523 | int64_t n_embd = input->ne[1]; 524 | int64_t n_codes_per_embd = (semantic_components.size() + acoustic_components.size()); 525 | ggml_tensor * codes = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_embd, n_codes_per_embd); 526 | ggml_set_input(codes); 527 | ggml_set_name(codes, "codes"); 528 | 529 | size_t pos = 0; 530 | { 531 | // semantic 532 | ggml_tensor * proj = ggml_reshape_2d(ctx0, semantic_inp_proj, 533 | semantic_inp_proj->ne[1], semantic_inp_proj->ne[2]); // TODO: do this at conversion time 534 | ggml_tensor * x = ggml_mul_mat(ctx0, proj, input); 535 | for (size_t i = 0; i < semantic_components.size(); i++) { 536 | ggml_tensor * codebook = semantic_components[i].get_embd(ctx0); 537 | codes = ggml_lookup_vectors(utils, ctx0, codebook, x, codes, pos); 538 | ggml_build_forward_expand(utils.gf, codes); 539 | pos += n_embd*ggml_element_size(codes); 540 | } 541 | } 542 | 543 | { 544 | // acoustic 545 | ggml_tensor * proj = ggml_reshape_2d(ctx0, acoustic_inp_proj, 546 | acoustic_inp_proj->ne[1], acoustic_inp_proj->ne[2]); // TODO: do this at conversion time 547 | ggml_tensor * x = ggml_mul_mat(ctx0, proj, input); 548 | for (size_t i = 0; i < acoustic_components.size(); i++) { 549 | ggml_tensor * codebook = acoustic_components[i].get_embd(ctx0); 550 | codes = ggml_lookup_vectors(utils, ctx0, codebook, x, codes, pos); 551 | ggml_build_forward_expand(utils.gf, codes); 552 | pos += n_embd*ggml_element_size(codes); 553 | } 554 | } 555 | 556 | return codes; 557 | } 558 | 559 | // the input has shape [n_codes, n_codes_per_embd] 560 | // first row is semantic, the rest are acoustic 561 | // example: [ [semantic], [acoustic1], [acoustic2], ... 

    // the input has shape [n_codes, n_codes_per_embd]
    // first row is semantic, the rest are acoustic
    // example: [ [semantic], [acoustic1], [acoustic2], ... ]
    ggml_tensor * decode(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input) {
        GGML_ASSERT(input->type == GGML_TYPE_I32);

        size_t n_semantic = semantic_components.size();
        int64_t n_codes_per_embd = (n_semantic + acoustic_components.size());
        int64_t n_codes = input->ne[0] / n_codes_per_embd;

        GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0);

        ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
        ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
        out_s = ggml_scale(ctx0, out_s, 0.0f); // clear
        out_a = ggml_scale(ctx0, out_a, 0.0f); // clear

        for (size_t ir = 0; ir < n_codes_per_embd; ir++) {
            ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input));
            if (ir < n_semantic) {
                // semantic
                ggml_tensor * codebook = semantic_components[ir].get_embd(ctx0);
                ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
                out_s = ggml_add(ctx0, out_s, embd);
            } else {
                // acoustic
                ggml_tensor * codebook = acoustic_components[ir-n_semantic].get_embd(ctx0);
                ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
                out_a = ggml_add(ctx0, out_a, embd);
            }
        }

        ggml_tensor * proj_s = ggml_reshape_2d(ctx0, semantic_out_proj,
            semantic_out_proj->ne[1], semantic_out_proj->ne[2]); // TODO: do this at conversion time
        ggml_tensor * proj_a = ggml_reshape_2d(ctx0, acoustic_out_proj,
            acoustic_out_proj->ne[1], acoustic_out_proj->ne[2]); // TODO: do this at conversion time

        out_s = ggml_mul_mat(ctx0, proj_s, out_s);
        out_a = ggml_mul_mat(ctx0, proj_a, out_a);

        return ggml_add(ctx0, out_s, out_a);
    }
};

int main() {
    ggml_easy::ctx_params params;
    //params.log_level = GGML_LOG_LEVEL_DEBUG;
    params.max_nodes = 1024*16;
    params.use_gpu = false;
    ggml_easy::ctx ctx(params);

    // ctx.load_gguf("mimi.gguf");
    ctx.load_safetensors("mimi.safetensors", {
        {".acoustic_residual_vector_quantizer", ".acoustic_rvq"},
        {".semantic_residual_vector_quantizer", ".semantic_rvq"},
    });

    // optional: print backend buffer info
    ggml_easy::debug::print_backend_buffer_info(ctx);

    mimi_encoder_decoder encoder(ctx);
    mimi_encoder_decoder decoder(ctx);
    mimi_transformer encoder_transformer(ctx, "encoder", 8);
    mimi_transformer decoder_transformer(ctx, "decoder", 8);
    mimi_residual_vector_quantizer quantizer(ctx);

    encoder.load_encoder();
    decoder.load_decoder();

    // create cgraph
    ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) {
        ggml_tensor * input = utils.new_input("input", GGML_TYPE_F32, 2048);

        // encoder
        {
            // SEANET encoder
            ggml_tensor * embeddings = encoder.forward(ctx_gf, utils, input);
            utils.debug_print(embeddings, "embeddings");

            // transformer
            int n_pos = embeddings->ne[0];
            ggml_tensor * pos_enc = utils.new_input("pos_enc", GGML_TYPE_I32, n_pos);
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = encoder_transformer.forward(ctx_gf, utils, embeddings, pos_enc);
            utils.debug_print(embeddings, "embeddings_after_transformer");

            // downsample
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = mimi_conv_1d(utils, ctx_gf, embeddings, ctx.get_weight("downsample.conv.weight"), nullptr, 2, 1, false);
            utils.debug_print(embeddings, "downsample");

            // residual vector quantizer
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = quantizer.encode(ctx_gf, utils, embeddings);

            //utils.debug_print_full(embeddings, "output_codes");
            utils.mark_output(embeddings, "output_codes");
        }
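
        // Note: the decoder half below is independent of the encoder half above -- it reads its
        // codes from the separate "inp_dec" input (filled with dummy values in main) rather than
        // from "output_codes", so both halves can be exercised and inspected in a single run.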

        // decoder
        {
            ggml_tensor * inp_dec = utils.new_input("inp_dec", GGML_TYPE_I32, 3 * 32);
            ggml_tensor * embeddings = quantizer.decode(ctx_gf, utils, inp_dec);
            utils.debug_print(embeddings, "read from codebook");

            // upsample
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = mimi_conv_transpose_1d(utils, ctx_gf, embeddings, ctx.get_weight("upsample.conv.weight"), nullptr, 2, 1, true);
            utils.debug_print(embeddings, "upscaled");

            // transformer
            int n_pos = embeddings->ne[0];
            ggml_tensor * pos_dec = utils.new_input("pos_dec", GGML_TYPE_I32, n_pos);
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = decoder_transformer.forward(ctx_gf, utils, embeddings, pos_dec);
            utils.debug_print(embeddings, "embeddings_after_transformer");

            // SEANET decoder
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            ggml_tensor * output = decoder.forward(ctx_gf, utils, embeddings);
            utils.debug_print(output, "output decoded");
        }
    });

    // equivalent to python code: torch.ones((1, 1, 2048))
    ctx.set_tensor_data("input", [](int, int, int, int) { return 1.0f; });

    // position data
    std::vector<int32_t> pos_data(1024);
    for (int i = 0; i < (int)pos_data.size(); i++) {
        pos_data[i] = i;
    }
    ctx.set_tensor_data("pos_enc", pos_data.data());
    ctx.set_tensor_data("pos_dec", pos_data.data());

    // inp_dec data
    // equivalent to python code: torch.tensor([[ [i, i+1, i+2] for i in range(0, 3*32, 3) ]], dtype=torch.long)
    std::vector<int32_t> inp_dec(3 * 32);
    for (size_t i = 0; i < inp_dec.size(); i++) {
        inp_dec[i] = i;
    }
    ctx.set_tensor_data("inp_dec", inp_dec.data());

    ctx.compute();

    // print result
    //ggml_easy::debug::print_tensor_data(result_tensor, result_data.data());

    return 0;
}
--------------------------------------------------------------------------------