├── .gitattributes ├── .gitmodules ├── models └── whisper-mel-filters.gguf ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── demo ├── safetensors.cpp ├── basic.cpp ├── dyt-rms.cpp ├── fastvlm.cpp ├── random.cpp ├── svd.cpp ├── 2d-rope.cpp ├── ultravox-encoder.cpp ├── whisper-encoder.cpp └── kyutai-mimi.cpp ├── README.md ├── convert_safetensors_to_gguf.py └── ggml-easy.h /.gitattributes: -------------------------------------------------------------------------------- 1 | *.gguf binary 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ggml"] 2 | path = ggml 3 | url = https://github.com/ggml-org/ggml 4 | -------------------------------------------------------------------------------- /models/whisper-mel-filters.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ngxson/ggml-easy/HEAD/models/whisper-mel-filters.gguf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | build 35 | .cache 36 | tmp 37 | .vscode 38 | 39 | /*.gguf 40 | /*.safetensors 41 | /*.dot 42 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. 2 | project("ggml-easy" C CXX) 3 | include(CheckIncludeFileCXX) 4 | 5 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 6 | add_subdirectory(ggml) 7 | 8 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 9 | # demo 10 | set(DEMO_TARGETS 11 | basic 12 | dyt-rms 13 | svd 14 | kyutai-mimi 15 | safetensors 16 | ultravox-encoder 17 | whisper-encoder 18 | 2d-rope 19 | fastvlm 20 | random) 21 | 22 | foreach(TARGET ${DEMO_TARGETS}) 23 | add_executable(${TARGET} demo/${TARGET}.cpp) 24 | target_link_libraries(${TARGET} PRIVATE ggml) 25 | target_compile_features(${TARGET} PRIVATE cxx_std_17) 26 | endforeach() 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Xuan-Son Nguyen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /demo/safetensors.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | 6 | /** 7 | * This example demonstrates how to load safetensors directly to GGML without any conversions. 8 | * 9 | * We load both the GGUF and safetensors of the same model, then compare the tensors. 10 | * All tensors are expected to be equal. 11 | * 12 | * I'm using https://huggingface.co/kyutai/mimi as the model. But you can use any model. 13 | * 14 | * To get the safetensors: 15 | * 1. Download the model.safetensors file 16 | * 2. Rename the "model.safetensors" to "mimi.safetensors" 17 | * 18 | * To get the gguf: 19 | * 1. Download the model.safetensors file 20 | * 2. Run: python convert_safetensors_to_gguf.py --outtype f32 model.safetensors mimi.gguf 21 | * 22 | */ 23 | 24 | int main() { 25 | ggml_easy::ctx_params params; 26 | params.use_gpu = false; 27 | params.log_level = GGML_LOG_LEVEL_DEBUG; 28 | 29 | ggml_easy::ctx ctx0(params); 30 | ctx0.load_safetensors("mimi.safetensors", { 31 | {".acoustic_residual_vector_quantizer", ".acoustic_rvq"}, 32 | {".semantic_residual_vector_quantizer", ".semantic_rvq"}, 33 | }); 34 | 35 | ggml_easy::ctx ctx1(params); 36 | ctx1.load_gguf("mimi.gguf"); 37 | 38 | GGML_ASSERT(ctx0.tensors.size() == ctx1.tensors.size()); 39 | 40 | GGML_ASSERT(ggml_backend_buft_is_host(ctx0.backend_buft[0])); 41 | GGML_ASSERT(ggml_backend_buft_is_host(ctx1.backend_buft[0])); 42 | 43 | // compare the tensors 44 | for (auto & t : ctx0.tensors) { 45 | auto tensor0 = t.second; 46 | auto tensor1 = ctx1.get_weight(t.first.c_str()); 47 | 48 | GGML_ASSERT(ggml_are_same_shape(tensor0, tensor1)); 49 | GGML_ASSERT(tensor0->type == GGML_TYPE_F32); 50 | GGML_ASSERT(tensor1->type == GGML_TYPE_F32); 51 | 52 | float diff = 0.0; 53 | for (size_t i = 0; i < ggml_nelements(tensor0); ++i) { 54 | float v0 = ggml_get_f32_1d(tensor0, i); 55 | float v1 = ggml_get_f32_1d(tensor1, i); 56 | diff += std::abs(v0 - v1); 57 | } 58 | 59 | printf("%-60s: diff = %f\n", t.first.c_str(), diff); 60 | GGML_ASSERT(diff < 1e-6); 61 | } 62 | 63 | printf("\nOK: All tensors are equal\n"); 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /demo/basic.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | 5 | /** 6 | * This example demonstrates how to perform matrix multiplication using ggml-easy.h 7 | * 8 | * Given 2 matrices A and B, the result matrix C is calculated as follows: 9 | * C = (A x B) * 2 10 | * 11 | * We will use utils.debug_print() to debug the intermediate result of (A x B) 12 | * Then, we will use utils.mark_output() to get the final result of C 13 | * 14 | * The final result can be printed using ggml_easy::debug::print_tensor_data() 15 | * Or, can be used to perform further 
computations 16 | */ 17 | 18 | int main() { 19 | ggml_easy::ctx_params params; 20 | ggml_easy::ctx ctx(params); 21 | 22 | // initialize data of matrices to perform matrix multiplication 23 | const int rows_A = 4, cols_A = 2; 24 | float matrix_A[rows_A * cols_A] = { 25 | 2, 8, 26 | 5, 1, 27 | 4, 2, 28 | 8, 6 29 | }; 30 | const int rows_B = 3, cols_B = 2; 31 | float matrix_B[rows_B * cols_B] = { 32 | 10, 5, 33 | 9, 9, 34 | 5, 4 35 | }; 36 | 37 | // create cgraph 38 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 39 | ggml_tensor * a = utils.new_input("a", GGML_TYPE_F32, cols_A, rows_A); 40 | ggml_tensor * b = utils.new_input("b", GGML_TYPE_F32, cols_B, rows_B); 41 | ggml_tensor * a_mul_b = ggml_mul_mat(ctx_gf, a, b); 42 | utils.debug_print(a_mul_b, "a_mul_b"); 43 | ggml_tensor * result = ggml_scale(ctx_gf, a_mul_b, 2); 44 | utils.mark_output(result, "result"); 45 | }); 46 | 47 | // set data 48 | ctx.set_tensor_data("a", matrix_A); 49 | ctx.set_tensor_data("b", matrix_B); 50 | 51 | // optional: print backend buffer info 52 | ggml_easy::debug::print_backend_buffer_info(ctx); 53 | 54 | // compute 55 | ggml_status status = ctx.compute(); 56 | if (status != GGML_STATUS_SUCCESS) { 57 | std::cerr << "error: ggml compute return status: " << status << std::endl; 58 | return 1; 59 | } 60 | 61 | // get result 62 | auto result = ctx.get_tensor_data("result"); 63 | ggml_tensor * result_tensor = result.first; 64 | std::vector & result_data = result.second; 65 | 66 | // print result 67 | ggml_easy::debug::print_tensor_data(result_tensor, result_data.data()); 68 | 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /demo/dyt-rms.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | 5 | /** 6 | * Demo to compare performance of RMS Norm vs Dynamic Tanh (DyT) 7 | * Paper: https://arxiv.org/abs/2503.10622 8 | * 9 | * Result on my Macbook M3: 10 | * RMS Norm: 37 ms 11 | * DyT : 135 ms 12 | */ 13 | 14 | int main() { 15 | const int n_embd = 4096; 16 | const int n_tokens = 1024; 17 | const int n_run = 300; 18 | 19 | ggml_easy::ctx_params params; 20 | params.log_level = GGML_LOG_LEVEL_ERROR; 21 | 22 | // benchmark RMS Norm 23 | { 24 | ggml_easy::ctx ctx(params); 25 | 26 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 27 | ggml_tensor * cur = utils.new_input("input", GGML_TYPE_F32, n_embd, n_tokens); 28 | for (int i = 0; i < n_run; i++) { 29 | cur = ggml_rms_norm(ctx_gf, cur, 1e-6); 30 | // skip bias 31 | } 32 | utils.mark_output(cur, "result"); 33 | }); 34 | 35 | std::vector vec(n_embd * n_tokens, 0.5f); 36 | ctx.set_tensor_data("input", vec.data()); 37 | 38 | int64_t t_start = ggml_time_ms(); 39 | ctx.compute(); 40 | int64_t t_end = ggml_time_ms(); 41 | 42 | std::cout << "RMS Norm: " << (t_end - t_start) << " ms" << std::endl; 43 | } 44 | 45 | // benchmark DyT 46 | { 47 | ggml_easy::ctx ctx(params); 48 | 49 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 50 | ggml_tensor * cur = utils.new_input("input", GGML_TYPE_F32, n_embd, n_tokens); 51 | ggml_tensor * alpha = utils.new_input("alpha", GGML_TYPE_F32, n_embd); 52 | ggml_tensor * gamma = utils.new_input("gamma", GGML_TYPE_F32, n_embd); 53 | for (int i = 0; i < n_run; i++) { 54 | // DyT(x) = gamma * tanh(alpha * x) + β 55 | cur = ggml_mul(ctx_gf, cur, alpha); 56 | cur = ggml_tanh(ctx_gf, cur); 57 | cur = ggml_mul(ctx_gf, 
cur, gamma); 58 | // skip beta 59 | } 60 | utils.mark_output(cur, "result"); 61 | }); 62 | 63 | std::vector vec(n_embd * n_tokens, 0.5f); 64 | ctx.set_tensor_data("input", vec.data()); 65 | ctx.set_tensor_data("alpha", vec.data()); 66 | ctx.set_tensor_data("gamma", vec.data()); 67 | 68 | int64_t t_start = ggml_time_ms(); 69 | ctx.compute(); 70 | int64_t t_end = ggml_time_ms(); 71 | 72 | std::cout << "DyT : " << (t_end - t_start) << " ms" << std::endl; 73 | } 74 | 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /demo/fastvlm.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | 5 | /** 6 | * Experiment on fastvlm implementation 7 | * 8 | * This is non-complete code, do not ask how to use it 9 | */ 10 | 11 | 12 | struct layer { 13 | struct rep_mixer { 14 | ggml_tensor * convffn_bn_w; 15 | ggml_tensor * convffn_bn_b; 16 | ggml_tensor * convffn_bn_mean; 17 | ggml_tensor * convffn_bn_std; 18 | ggml_tensor * convffn_w; 19 | ggml_tensor * convffn_fc1; 20 | ggml_tensor * convffn_fc2; 21 | ggml_tensor * layer_scale; 22 | ggml_tensor * token_mixer_conv_w; 23 | ggml_tensor * token_mixer_conv_b; 24 | }; 25 | std::array mixers; 26 | }; 27 | 28 | int main() { 29 | ggml_easy::ctx_params params; 30 | // params.log_level = GGML_LOG_LEVEL_DEBUG; 31 | params.safetensors_ignore_unknown_dtype = true; 32 | params.use_gpu = false; 33 | ggml_easy::ctx ctx(params); 34 | ctx.load_safetensors("fastvlm.safetensors", { 35 | {"model.vision_tower.vision_tower.model.", ""}, 36 | }); 37 | 38 | const int image_size = 1024; 39 | 40 | auto * _patch_embed_0_w = ctx.get_weight("patch_embed.0.reparam_conv.weight"); 41 | auto * _patch_embed_0_b = ctx.get_weight("patch_embed.0.reparam_conv.bias"); 42 | auto * _patch_embed_1_w = ctx.get_weight("patch_embed.1.reparam_conv.weight"); 43 | auto * _patch_embed_1_b = ctx.get_weight("patch_embed.1.reparam_conv.bias"); 44 | auto * _patch_embed_2_w = ctx.get_weight("patch_embed.2.reparam_conv.weight"); 45 | auto * _patch_embed_2_b = ctx.get_weight("patch_embed.2.reparam_conv.bias"); 46 | 47 | // create cgraph 48 | ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 49 | ggml_tensor * inp = utils.new_input("inp", GGML_TYPE_F32, image_size, image_size, 3); 50 | ggml_tensor * tmp; 51 | 52 | auto * patch_embed_0_w = ggml_cast(ctx0, _patch_embed_0_w, GGML_TYPE_F16); 53 | auto * patch_embed_0_b = ggml_cast(ctx0, _patch_embed_0_b, GGML_TYPE_F32); 54 | auto * patch_embed_1_w = ggml_cast(ctx0, _patch_embed_1_w, GGML_TYPE_F16); 55 | auto * patch_embed_1_b = ggml_cast(ctx0, _patch_embed_1_b, GGML_TYPE_F32); 56 | auto * patch_embed_2_w = ggml_cast(ctx0, _patch_embed_2_w, GGML_TYPE_F16); 57 | auto * patch_embed_2_b = ggml_cast(ctx0, _patch_embed_2_b, GGML_TYPE_F32); 58 | 59 | inp = ggml_conv_2d(ctx0, patch_embed_0_w, inp, 2, 2, 1, 1, 1, 1); 60 | tmp = ggml_reshape_3d(ctx0, patch_embed_0_b, 1, 1, ggml_nelements(patch_embed_0_b)); 61 | inp = ggml_add(ctx0, inp, tmp); 62 | inp = ggml_gelu(ctx0, inp); 63 | 64 | inp = ggml_conv_2d_dw(ctx0, patch_embed_1_w, inp, 2, 2, 1, 1, 1, 1); 65 | tmp = ggml_reshape_3d(ctx0, patch_embed_1_b, 1, 1, ggml_nelements(patch_embed_1_b)); 66 | inp = ggml_add(ctx0, inp, tmp); 67 | inp = ggml_gelu(ctx0, inp); 68 | 69 | inp = ggml_conv_2d(ctx0, patch_embed_2_w, inp, 1, 1, 0, 0, 1, 1); 70 | tmp = ggml_reshape_3d(ctx0, patch_embed_2_b, 1, 1, ggml_nelements(patch_embed_2_b)); 71 | inp = ggml_add(ctx0, inp, 
tmp); 72 | inp = ggml_gelu(ctx0, inp); 73 | 74 | utils.debug_print(inp, "after_conv"); 75 | }); 76 | 77 | std::vector inp(image_size * image_size * 3); 78 | for (int i = 0; i < image_size * image_size * 3; ++i) { 79 | inp[i] = (float)0.1f; 80 | } 81 | ctx.set_tensor_data("inp", inp.data()); 82 | 83 | // compute 84 | ggml_status status = ctx.compute(); 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ggml-easy 2 | 3 | A simple C++ wrapper around [GGML](https://github.com/ggml-org/ggml) to make model loading and execution easier with GPU acceleration support. 4 | 5 | ## Introduction 6 | 7 | `ggml-easy` is a lightweight header-only C++ library that simplifies working with GGML, the tensor library used in projects like llama.cpp. It provides a clean interface for loading GGUF models, creating computation graphs, and executing them on CPU or GPU with minimal boilerplate code. 8 | 9 | ## Setup 10 | 11 | As a header-only library, using ggml-easy is straightforward: 12 | 13 | 1. Include the headers in your project 14 | 2. Make sure you have GGML as a dependency in `CMakeLists.txt` 15 | 3. Use the `ggml_easy` namespace in your code 16 | 17 | Example: 18 | ```cpp 19 | #include "ggml-easy.h" 20 | 21 | // Your code here 22 | ``` 23 | 24 | See [demo/basic.cpp](demo/basic.cpp) for a complete example of how to use `ggml-easy` in a project. 25 | 26 | ## Compile examples 27 | 28 | To compile everything inside `demo/*` 29 | 30 | ```sh 31 | cmake -B build 32 | cmake --build build -j 33 | # output: build/bin/* 34 | ``` 35 | 36 | ## Features 37 | 38 | ### Effortless GPU support 39 | 40 | ggml-easy abstracted out all the scheduler and buffer setup. GPU is enabled by default. 41 | 42 | To disable it explicitly: 43 | 44 | ```cpp 45 | ggml_easy::ctx_params params; 46 | params.use_gpu = false; // true by default 47 | ggml_easy::ctx ctx(params); 48 | ``` 49 | 50 | Please note that the GPU support is for convenience and is not aimed to have the best performance. Some operations will fallback to CPU if the GPU does not support them. 51 | 52 | ### Load safetensors without converting to GGUF 53 | 54 | You can directly load `.safetensors` file to `ggml-easy` without having to convert it to GGUF! Currently, F32, F16 and BF16 types are supported. 55 | 56 | ```cpp 57 | ggml_easy::ctx_params params; 58 | ggml_easy::ctx ctx(params); 59 | ctx.load_safetensors("mimi.safetensors", { 60 | // optionally, rename tensor to make it shorter (name length limit in ggml is 64 characters) 61 | {".acoustic_residual_vector_quantizer", ".acoustic_rvq"}, 62 | {".semantic_residual_vector_quantizer", ".semantic_rvq"}, 63 | }); 64 | ``` 65 | 66 | For a complete example, please have a look on [demo/safetensors.cpp](demo/safetensors.cpp) where I load both GGUF + safetensors files, then compare them. 67 | 68 | TODO: multi-shards are not supported for now, will add it soon! 69 | 70 | ### Define input, output easily 71 | 72 | When building computation graph, each input and output nodes can be added with single line of code: 73 | 74 | ```cpp 75 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 76 | ggml_tensor * a = utils.new_input("a", GGML_TYPE_F32, cols_A, rows_A); 77 | ggml_tensor * b = utils.new_input("b", GGML_TYPE_F32, cols_B, rows_B); 78 | ... 
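    // ... any ggml_* ops go here; for example, demo/basic.cpp builds
    //     ggml_tensor * result = ggml_scale(ctx_gf, ggml_mul_mat(ctx_gf, a, b), 2);
    // (i.e. C = (A x B) * 2) before marking it as an output: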
79 | utils.mark_output(result, "result"); 80 | }); 81 | ``` 82 | 83 | ### Easy debugging 84 | 85 | You can also print the intermediate results with minimal effort: 86 | 87 | ```cpp 88 | ggml_tensor * a = utils.new_input("a", GGML_TYPE_F32, cols_A, rows_A); 89 | ggml_tensor * b = utils.new_input("b", GGML_TYPE_F32, cols_B, rows_B); 90 | ggml_tensor * a_mul_b = ggml_mul_mat(ctx_gf, a, b); 91 | utils.debug_print(a_mul_b, "a_mul_b"); 92 | ``` 93 | 94 | This will print the intermediate result of `A * B` upon `compute()` is called, no more manual `ggml_backend_tensor_get`! 95 | 96 | ``` 97 | a_mul_b.shape = [4, 3] 98 | a_mul_b.data: [ 99 | [ 100 | [ 60.0000, 55.0000, 50.0000, 110.0000], 101 | [ 90.0000, 54.0000, 54.0000, 126.0000], 102 | [ 42.0000, 29.0000, 28.0000, 64.0000], 103 | ], 104 | ] 105 | ``` 106 | -------------------------------------------------------------------------------- /demo/random.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | 5 | /** 6 | * Random experiment, do not use it 7 | */ 8 | 9 | int main() { 10 | ggml_easy::ctx_params params; 11 | ggml_easy::ctx ctx(params); 12 | 13 | // experiment with torch unfold equivalent in GGML 14 | { 15 | const int h = 12; 16 | const int w = 2; 17 | const int hidden_size = 8; 18 | ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 19 | ggml_tensor * inp = utils.new_input("inp", GGML_TYPE_F32, hidden_size, h*w); 20 | ggml_tensor * x = inp; 21 | utils.debug_print(ggml_scale(ctx0, inp, 1.0f), "inp0"); 22 | 23 | x = ggml_reshape_3d(ctx0, x, hidden_size, w, h); 24 | x = ggml_permute(ctx0, x, 2, 0, 1, 3); // [x, y, hidden_size] 25 | x = ggml_cont(ctx0, x); 26 | utils.debug_print_full(x, "grid"); 27 | 28 | ggml_tensor * kernel = ggml_view_3d(ctx0, inp, 2, 2, x->ne[2], 0, 0, 0); 29 | x = ggml_im2col(ctx0, kernel, x, 2, 2, 0, 0, 1, 1, true, inp->type); 30 | 31 | utils.debug_print_full(x, "im2col"); 32 | 33 | x = ggml_reshape_2d(ctx0, x, x->ne[0], x->ne[1] * x->ne[2]); 34 | utils.debug_print(x, "result"); 35 | }); 36 | std::vector inp_data(h * w * hidden_size); 37 | for (int i = 0; i < h * w * hidden_size; ++i) { 38 | inp_data[i] = (float)i; 39 | } 40 | ctx.set_tensor_data("inp", inp_data.data()); 41 | ctx.compute(); 42 | } 43 | 44 | printf("\n\n\nLlama4UnfoldConvolution\n\n"); 45 | { 46 | ggml_easy::ctx ctx(params); 47 | ctx.load_safetensors("../models/llama4vit.safetensors", {}); 48 | 49 | ggml_tensor * patch_embeddings_0 = ctx.get_weight("vision_model.patch_embedding.linear.weight"); 50 | 51 | const int h = 336; 52 | const int w = 336; 53 | const int patch_size = 14; 54 | const int n_embd = 1408; 55 | const int n_patches = (h / patch_size) * (w / patch_size); 56 | 57 | ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 58 | ggml_tensor * inp = utils.new_input("inp", GGML_TYPE_F32, h, w, 3); 59 | 60 | // Llama4UnfoldConvolution 61 | { 62 | ggml_tensor * kernel = ggml_reshape_4d(ctx0, patch_embeddings_0, 63 | patch_size, patch_size, 3, n_embd); 64 | inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); 65 | //inp = ggml_reshape_2d(ctx0, inp, inp->ne[0], inp->ne[1] * inp->ne[2]); // flatten to 2D 66 | utils.debug_print(inp, "im2col"); 67 | utils.debug_print(ggml_sum(ctx0, inp), "im2col_sum"); 68 | 69 | utils.debug_print(ggml_cast(ctx0, patch_embeddings_0, GGML_TYPE_F32), "patch_embeddings_0"); 70 | 71 | inp = ggml_mul_mat(ctx0, patch_embeddings_0, inp); 72 | 
utils.debug_print(inp, "patch_conv"); 73 | utils.debug_print(ggml_sum(ctx0, inp), "patch_conv_sum"); 74 | 75 | inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); 76 | } 77 | 78 | //inp = ggml_reshape_2d(ctx0, inp, inp->ne[0], inp->ne[1] * inp->ne[2]); 79 | utils.debug_print(inp, "result"); 80 | }); 81 | 82 | std::vector inp_data(h * w * 3, 0.0); 83 | for (int i = 0; i < h * w; ++i) { 84 | inp_data[i] = 1.0; //(float)i * 0.1; 85 | } 86 | ctx.set_tensor_data("inp", inp_data.data()); 87 | ctx.compute(); 88 | } 89 | 90 | // https://github.com/ggml-org/llama.cpp/pull/13772 91 | // { 92 | // const int h = 12; 93 | // const int w = 2; 94 | // const int hidden_size = 8; 95 | // ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 96 | // ggml_tensor * inp = utils.new_input("inp", GGML_TYPE_F32, hidden_size, h*w); 97 | // inp = ggml_fill(ctx0, inp, 1.234f); 98 | // utils.debug_print(inp, "inp"); 99 | // }); 100 | // ctx.compute(); 101 | // } 102 | 103 | // https://github.com/ggml-org/ggml/issues/1230 104 | { 105 | ggml_easy::ctx_params params_no_gpu; 106 | params_no_gpu.use_gpu = false; 107 | ggml_easy::ctx ctx_no_gpu(params_no_gpu); 108 | ggml_easy::ctx ctx(params); 109 | 110 | auto builder = [&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 111 | ggml_tensor * x = ggml_ones(ctx0, 512, 512); 112 | x = ggml_scale(ctx0, x, 0.12432f); 113 | ggml_tensor * y = ggml_ones(ctx0, 512, 512); 114 | y = ggml_scale(ctx0, y, 0.34636f); 115 | 116 | ggml_tensor * result = ggml_mul(ctx0, x, y); 117 | utils.debug_print(result, "result"); 118 | utils.mark_output(result, "result"); 119 | }; 120 | 121 | ctx.build_graph(builder); 122 | ctx.compute(); 123 | ctx_no_gpu.build_graph(builder); 124 | ctx_no_gpu.compute(); 125 | 126 | float max_diff = 0.0f; 127 | auto res0 = ctx.get_tensor_data("result"); 128 | auto res1 = ctx_no_gpu.get_tensor_data("result"); 129 | GGML_ASSERT(ggml_nelements(res0.first) == ggml_nelements(res1.first)); 130 | for (size_t i = 0; i < ggml_nelements(res0.first); ++i) { 131 | float v0 = ((float *)res0.second.data())[i]; 132 | float v1 = ((float *)res1.second.data())[i]; 133 | float diff = std::abs(v0 - v1); 134 | if (diff > max_diff) { 135 | max_diff = diff; 136 | } 137 | } 138 | 139 | printf("max diff: %f\n", max_diff); 140 | } 141 | 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /demo/svd.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | #include 6 | 7 | const time_t seed = std::time(0); 8 | 9 | const int rank = 4; 10 | const float delta = 0.001; 11 | const float eps = 0.97; 12 | const float lambda = 2; 13 | 14 | const int rows_A = 3; 15 | const int cols_A = 2; 16 | float matrix_A[rows_A * cols_A] = { 17 | 1, 2, 18 | 3, 4, 19 | 5, 6, 20 | }; 21 | 22 | /** 23 | * This program computes the singular value decomposition (SVD) of a matrix A using the power iteration method. 24 | * The matrix A is decomposed into the product of three matrices U, S, and V such that A = U * S * V^T. 25 | * 26 | * After decomposed the matrix A, the program reconstructs the matrix A using the decomposed matrices U, S, and V. 27 | * The reconstructed matrix should be the same as the original matrix A. 
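 *
 * Roughly, each power_iteration() call below performs the standard deflation-based
 * power method (names match the code, the description is only a sketch):
 *   1. form B = ggml_mul_mat(A, A), i.e. A multiplied with its own transpose, and
 *      repeatedly update x <- (B x) / ||B x||; after n_iters steps x approximates
 *      the leading singular direction v
 *   2. take s = ||A v|| as the singular value and u = (A v) / s as its partner vector
 *   3. deflate A <- A - s * u * v^T and repeat, collecting `rank` triples (u, s, v)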
28 | * 29 | * Ref python implementation: https://gist.github.com/Zhenye-Na/cbf4e534b44ef94fdbad663ef56dd333 30 | */ 31 | 32 | int main() { 33 | ggml_easy::ctx_params params; 34 | ggml_easy::ctx ctx(params); 35 | 36 | const int n_iters = log(4.0f * log(2.0f * rows_A / delta) / (eps * delta)) / (2 * lambda); 37 | printf("n_iters = %d\n", n_iters); 38 | 39 | auto norm = [&](ggml_context * ctx_gf, ggml_tensor * t) { 40 | return ggml_sqrt(ctx_gf, ggml_sum_rows(ctx_gf, ggml_sqr(ctx_gf, t))); 41 | }; 42 | 43 | auto power_iteration = [&](ggml_context * ctx_gf, ggml_cgraph * gf, ggml_tensor * A, ggml_tensor * x) { 44 | ggml_tensor * B = ggml_mul_mat(ctx_gf, A, A); 45 | for (int i = 0; i < n_iters; i++) { 46 | x = ggml_mul_mat(ctx_gf, B, x); 47 | x = ggml_div(ctx_gf, x, norm(ctx_gf, x)); 48 | } 49 | ggml_tensor * v = x; 50 | ggml_tensor * AT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, A)); 51 | ggml_tensor * A_v = ggml_mul_mat(ctx_gf, AT, v); 52 | ggml_tensor * s = norm(ctx_gf, A_v); 53 | ggml_tensor * u = ggml_div(ctx_gf, A_v, s); 54 | return std::vector{u, s, v}; 55 | }; 56 | 57 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 58 | ggml_tensor * A = utils.new_input("A", GGML_TYPE_F32, cols_A, rows_A); 59 | ggml_tensor * x = utils.new_input("x", GGML_TYPE_F32, rows_A); 60 | 61 | // normalize x 62 | x = ggml_div(ctx_gf, x, norm(ctx_gf, x)); 63 | 64 | ggml_tensor * out_u; // final shape: [cols_A, rank] 65 | ggml_tensor * out_s; // final shape: [rank] 66 | ggml_tensor * out_v; // final shape: [rows_A, rank] 67 | 68 | for (int i = 0; i < rank; i++) { 69 | std::vector result = power_iteration(ctx_gf, gf, A, x); 70 | ggml_tensor * u = result[0]; 71 | ggml_tensor * s = result[1]; 72 | ggml_tensor * v = result[2]; 73 | 74 | ggml_tensor * vT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, v)); 75 | ggml_tensor * uT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, u)); 76 | ggml_tensor * A_minus = ggml_mul(ctx_gf, ggml_mul_mat(ctx_gf, uT, vT), s); 77 | A = ggml_add(ctx_gf, A, ggml_scale(ctx_gf, A_minus, -1)); 78 | // utils.debug_print(u, "u_intermediate"); 79 | // utils.debug_print(v, "v_intermediate"); 80 | // utils.debug_print(A, "A_intermediate"); 81 | 82 | if (i == 0) { 83 | out_u = u; 84 | out_s = s; 85 | out_v = v; 86 | } else { 87 | out_u = ggml_concat(ctx_gf, out_u, u, 1); 88 | out_s = ggml_concat(ctx_gf, out_s, s, 0); 89 | out_v = ggml_concat(ctx_gf, out_v, v, 1); 90 | } 91 | } 92 | 93 | utils.mark_output(out_u, "u"); 94 | utils.mark_output(out_s, "s"); 95 | utils.mark_output(out_v, "v"); 96 | }); 97 | 98 | // set data 99 | { 100 | ctx.set_tensor_data("A", matrix_A); 101 | 102 | // initialize eigenvector to random vector 103 | std::default_random_engine generator(static_cast(seed)); 104 | std::uniform_real_distribution distribution(0.0, 1.0); 105 | ctx.set_tensor_data("x", [&](int, int, int, int) { 106 | return distribution(generator); 107 | }); 108 | } 109 | 110 | // optional: print backend buffer info 111 | ggml_easy::debug::print_backend_buffer_info(ctx); 112 | 113 | // compute 114 | ggml_status status = ctx.compute(); 115 | if (status != GGML_STATUS_SUCCESS) { 116 | std::cerr << "error: ggml compute return status: " << status << std::endl; 117 | return 1; 118 | } 119 | 120 | // get result 121 | auto print_result = [&](ggml_easy::ctx & ctx, const char * tensor_name) { 122 | auto result = ctx.get_tensor_data(tensor_name); 123 | ggml_tensor * result_tensor = result.first; 124 | std::vector & result_data = result.second; 125 | std::cout << "\n\n" << tensor_name << ":\n"; 126 | 
ggml_easy::debug::print_tensor_data(result_tensor, result_data.data()); 127 | return result_data; 128 | }; 129 | 130 | std::vector data_u = print_result(ctx, "u"); 131 | std::vector data_s = print_result(ctx, "s"); 132 | std::vector data_v = print_result(ctx, "v"); 133 | 134 | 135 | // VERIFY THE RESULT!! 136 | 137 | 138 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 139 | ggml_tensor * u = utils.new_input("u", GGML_TYPE_F32, cols_A, rank); 140 | ggml_tensor * s = utils.new_input("s", GGML_TYPE_F32, rank); 141 | ggml_tensor * v = utils.new_input("v", GGML_TYPE_F32, rows_A, rank); 142 | 143 | s = ggml_diag(ctx_gf, s); 144 | 145 | ggml_tensor * uT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, u)); 146 | ggml_tensor * vT = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, v)); 147 | ggml_tensor * temp = ggml_mul_mat(ctx_gf, s, uT); 148 | ggml_tensor * A_reconstructed = ggml_mul_mat(ctx_gf, temp, vT); 149 | utils.mark_output(A_reconstructed, "A_reconstructed"); 150 | 151 | ggml_tensor * A = utils.new_input("A", GGML_TYPE_F32, cols_A, rows_A); 152 | ggml_tensor * diff = ggml_sum(ctx_gf, ggml_sub(ctx_gf, A, A_reconstructed)); 153 | utils.mark_output(diff, "diff"); 154 | }); 155 | 156 | ctx.set_tensor_data("u", data_u.data()); 157 | ctx.set_tensor_data("s", data_s.data()); 158 | ctx.set_tensor_data("v", data_v.data()); 159 | ctx.set_tensor_data("A", matrix_A); 160 | 161 | status = ctx.compute(); 162 | if (status != GGML_STATUS_SUCCESS) { 163 | std::cerr << "error: ggml compute return status: " << status << std::endl; 164 | return 1; 165 | } 166 | 167 | print_result(ctx, "A_reconstructed"); 168 | print_result(ctx, "diff"); 169 | 170 | return 0; 171 | } 172 | -------------------------------------------------------------------------------- /convert_safetensors_to_gguf.py: -------------------------------------------------------------------------------- 1 | import gguf 2 | import argparse 3 | import logging 4 | import sys 5 | import torch 6 | import json 7 | import os 8 | import numpy as np 9 | from typing import cast, ContextManager, Any, Iterator 10 | from pathlib import Path 11 | from torch import Tensor 12 | 13 | # some tensor names are too long, ggml refuses to load them 14 | # this function renames them to shorter names 15 | def rename_tensor(name: str) -> str: 16 | replacements = { 17 | "quantizer.acoustic_residual_vector_quantizer": "quantizer.acoustic_rvq", # kyutai mimi 18 | "quantizer.semantic_residual_vector_quantizer": "quantizer.semantic_rvq", # kyutai mimi 19 | } 20 | for old, new in replacements.items(): 21 | name = name.replace(old, new) 22 | return name 23 | 24 | # (copied from convert_hf_to_gguf.py) 25 | # tree of lazy tensors 26 | class LazyTorchTensor(gguf.LazyBase): 27 | _tensor_type = torch.Tensor 28 | # to keep the type-checker happy 29 | dtype: torch.dtype 30 | shape: torch.Size 31 | 32 | # only used when converting a torch.Tensor to a np.ndarray 33 | _dtype_map: dict[torch.dtype, type] = { 34 | torch.float16: np.float16, 35 | torch.float32: np.float32, 36 | } 37 | 38 | # used for safetensors slices 39 | # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 40 | # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 41 | _dtype_str_map: dict[str, torch.dtype] = { 42 | "F64": torch.float64, 43 | "F32": torch.float32, 44 | "BF16": torch.bfloat16, 45 | "F16": torch.float16, 46 | # "U64": torch.uint64, 47 | "I64": torch.int64, 48 | # "U32": 
torch.uint32, 49 | "I32": torch.int32, 50 | # "U16": torch.uint16, 51 | "I16": torch.int16, 52 | "U8": torch.uint8, 53 | "I8": torch.int8, 54 | "BOOL": torch.bool, 55 | "F8_E4M3": torch.float8_e4m3fn, 56 | "F8_E5M2": torch.float8_e5m2, 57 | } 58 | 59 | def numpy(self) -> gguf.LazyNumpyTensor: 60 | dtype = self._dtype_map[self.dtype] 61 | return gguf.LazyNumpyTensor( 62 | meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), 63 | args=(self,), 64 | func=(lambda s: s.numpy()) 65 | ) 66 | 67 | @classmethod 68 | def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: 69 | return torch.empty(size=shape, dtype=dtype, device="meta") 70 | 71 | @classmethod 72 | def from_safetensors_slice(cls, st_slice: Any) -> Tensor: 73 | dtype = cls._dtype_str_map[st_slice.get_dtype()] 74 | shape: tuple[int, ...] = tuple(st_slice.get_shape()) 75 | lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) 76 | return cast(torch.Tensor, lazy) 77 | 78 | @classmethod 79 | def __torch_function__(cls, func, types, args=(), kwargs=None): 80 | del types # unused 81 | 82 | if kwargs is None: 83 | kwargs = {} 84 | 85 | if func is torch.Tensor.numpy: 86 | return args[0].numpy() 87 | 88 | return cls._wrap_fn(func)(*args, **kwargs) 89 | 90 | class Converter: 91 | in_file: Path 92 | out_file: Path 93 | ftype: gguf.LlamaFileType 94 | gguf_writer: gguf.GGUFWriter 95 | 96 | def __init__(self, in_file: Path, out_file: Path, ftype: gguf.LlamaFileType): 97 | self.in_file = in_file 98 | self.out_file = out_file 99 | self.ftype = ftype 100 | endianess = gguf.GGUFEndian.LITTLE 101 | self.gguf_writer = gguf.GGUFWriter(path=None, arch="unknown", endianess=endianess) 102 | 103 | def convert(self): 104 | print(f"Converting {self.in_file} to {self.out_file} with {self.ftype} data type.") 105 | 106 | for name, data_torch in self.get_tensors(): 107 | old_dtype = data_torch.dtype 108 | is_1d = len(data_torch.shape) == 1 109 | can_quantize = not is_1d 110 | 111 | data_qtype = gguf.GGMLQuantizationType.F32 112 | if can_quantize: 113 | if self.ftype == gguf.LlamaFileType.ALL_F32: 114 | data_qtype = gguf.GGMLQuantizationType.F32 115 | elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: 116 | data_qtype = gguf.GGMLQuantizationType.F16 117 | elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: 118 | data_qtype = gguf.GGMLQuantizationType.BF16 119 | elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: 120 | data_qtype = gguf.GGMLQuantizationType.Q8_0 121 | else: 122 | raise ValueError(f"Unsupported file type: {self.ftype}") 123 | 124 | data = data_torch.numpy() 125 | try: 126 | data = gguf.quants.quantize(data, data_qtype) 127 | except Exception as e: 128 | print(f"Error quantizing tensor '{name}': {e}, fallback to F16") 129 | data_qtype = gguf.GGMLQuantizationType.F16 130 | data = gguf.quants.quantize(data, data_qtype) 131 | 132 | name = rename_tensor(name) 133 | 134 | # reverse shape to make it similar to the internal ggml dimension order 135 | shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" 136 | print(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") 137 | 138 | self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) 139 | 140 | def get_tensors(self) -> Iterator[tuple[str, Tensor]]: 141 | # TODO: support multiple shards in the future 142 | from safetensors import safe_open 143 | ctx = cast(ContextManager[Any], safe_open(self.in_file, framework="pt", device="cpu")) 144 | with ctx as model_part: 
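            # note: model_part.get_slice() returns a lazy safetensors slice; wrapping it
            # in LazyTorchTensor.from_safetensors_slice() (defined above) keeps the data
            # on disk and only materializes it when the tensor is actually quantized and
            # written, so large checkpoints do not have to fit in RAM all at once.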
145 | for name in model_part.keys(): 146 | data = model_part.get_slice(name) 147 | data = LazyTorchTensor.from_safetensors_slice(data) 148 | yield name, data 149 | 150 | def write(self): 151 | self.gguf_writer.write_header_to_file(path=self.out_file) 152 | self.gguf_writer.write_kv_data_to_file() 153 | self.gguf_writer.write_tensors_to_file(progress=True) 154 | self.gguf_writer.close() 155 | 156 | def parse_args(): 157 | parser = argparse.ArgumentParser(description="Convert safetensors to GGUF format.") 158 | parser.add_argument( 159 | "--outtype", 160 | choices=["f32", "f16", "bf16", "q8_0"], 161 | default="f32", 162 | help="Output data type (default: f32)" 163 | ) 164 | parser.add_argument( 165 | "input_file", type=Path, 166 | help="Path to the input file (required)" 167 | ) 168 | parser.add_argument( 169 | "output_file", type=Path, 170 | nargs="?", 171 | help="Path to the output file (optional). Default to input file with .gguf extension" 172 | ) 173 | return parser.parse_args() 174 | 175 | if __name__ == "__main__": 176 | args = parse_args() 177 | 178 | ftype_map: dict[str, gguf.LlamaFileType] = { 179 | "f32": gguf.LlamaFileType.ALL_F32, 180 | "f16": gguf.LlamaFileType.MOSTLY_F16, 181 | "bf16": gguf.LlamaFileType.MOSTLY_BF16, 182 | "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, 183 | } 184 | 185 | if args.outtype not in ftype_map: 186 | raise ValueError(f"Unsupported output data type: {args.outtype}") 187 | 188 | if args.output_file is None: 189 | args.output_file = args.input_file.with_suffix(".gguf") 190 | 191 | converter = Converter(args.input_file, args.output_file, ftype_map[args.outtype]) 192 | converter.convert() 193 | converter.write() 194 | -------------------------------------------------------------------------------- /demo/2d-rope.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | 6 | /** 7 | * Experiment with 2D RoPE used on Mistral's Pixtral model 8 | */ 9 | 10 | // implementation of the 2D RoPE without adding a new op in ggml 11 | // this is not efficient (use double the memory), but works on all backends 12 | static ggml_tensor * build_rope_2d( 13 | ggml_context * ctx0, 14 | ggml_tensor * cur, 15 | ggml_tensor * pos_a, // first half 16 | ggml_tensor * pos_b, // second half 17 | const float freq_base, 18 | const bool interleave_freq 19 | ) { 20 | const int64_t n_dim = cur->ne[0]; 21 | const int64_t n_head = cur->ne[1]; 22 | const int64_t n_pos = cur->ne[2]; 23 | 24 | // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) 25 | // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 26 | // first half of cur will use 1e-0, 1e-2 (even) 27 | // second half of cur will use 1e-1, 1e-3 (odd) 28 | // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even 29 | // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) 30 | // then for the second half, we use freq_scale to shift the inv_freq 31 | // ^ why? replace (2i) with (2i+1) in the above equation 32 | const float freq_scale_odd = interleave_freq 33 | ? 
std::pow(freq_base, (float)-2/n_dim) 34 | : 1.0; 35 | 36 | // first half 37 | ggml_tensor * first; 38 | { 39 | first = ggml_view_3d(ctx0, cur, 40 | n_dim/2, n_head, n_pos, 41 | ggml_row_size(cur->type, n_dim), 42 | ggml_row_size(cur->type, n_dim*n_head), 43 | 0); 44 | first = ggml_rope_ext( 45 | ctx0, 46 | first, 47 | pos_a, // positions 48 | nullptr, // freq factors 49 | n_dim/2, // n_dims 50 | 0, 0, freq_base, 51 | 1.0f, 0.0f, 1.0f, 0.0f, 0.0f 52 | ); 53 | } 54 | 55 | // second half 56 | ggml_tensor * second; 57 | { 58 | second = ggml_view_3d(ctx0, cur, 59 | n_dim/2, n_head, n_pos, 60 | ggml_row_size(cur->type, n_dim), 61 | ggml_row_size(cur->type, n_dim*n_head), 62 | n_dim/2 * ggml_element_size(cur)); 63 | second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors 64 | second = ggml_rope_ext( 65 | ctx0, 66 | second, 67 | pos_b, // positions 68 | nullptr, // freq factors 69 | n_dim/2, // n_dims 70 | 0, 0, freq_base, 71 | freq_scale_odd, 72 | 0.0f, 1.0f, 0.0f, 0.0f 73 | ); 74 | } 75 | 76 | cur = ggml_concat(ctx0, first, second, 0); 77 | return cur; 78 | } 79 | 80 | void test_mrope(ggml_easy::ctx & ctx); 81 | 82 | int main() { 83 | ggml_easy::ctx_params params; 84 | ggml_easy::ctx ctx(params); 85 | 86 | const bool is_llama = true; // false meaning pixtral 87 | 88 | const int n_sz = 336/14; 89 | const int n_pos = n_sz * n_sz + (is_llama ? 1 : 0); // 1 for CLS token 90 | const int n_dim = 88; 91 | const int n_head = 1; 92 | 93 | // create cgraph 94 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 95 | ggml_tensor * pos_h = utils.new_input("pos_h", GGML_TYPE_I32, n_pos); 96 | ggml_tensor * pos_w = utils.new_input("pos_w", GGML_TYPE_I32, n_pos); 97 | ggml_tensor * vector = utils.new_input("vector", GGML_TYPE_F32, n_dim*n_head, n_pos); 98 | vector = ggml_reshape_3d(ctx_gf, vector, n_dim, n_head, n_pos); 99 | ggml_tensor * result = is_llama 100 | ? 
build_rope_2d(ctx_gf, vector, pos_w, pos_h, 10000.0f, false) 101 | : build_rope_2d(ctx_gf, vector, pos_h, pos_w, 10000.0f, true); 102 | result = ggml_reshape_2d(ctx_gf, result, n_dim*n_head, n_pos); 103 | utils.debug_print(result, "result"); 104 | utils.debug_print(ggml_sum(ctx_gf, result), "result_sum"); 105 | }); 106 | 107 | // set data 108 | if (is_llama) { 109 | std::vector positions(n_pos - 1, 0); 110 | for (int i = 0; i < n_pos- 1; ++i) { 111 | positions[i] = (i / n_sz) + 1; 112 | // printf("pos_h[%d] = %d\n", i, positions[i]); 113 | } 114 | printf("\n"); 115 | ctx.set_tensor_data("pos_h", positions.data()); 116 | for (int i = 0; i < n_pos- 1; ++i) { 117 | positions[i] = (i % n_sz) + 1; 118 | // printf("pos_w[%d] = %d\n", i, positions[i]); 119 | } 120 | ctx.set_tensor_data("pos_w", positions.data()); 121 | } else { 122 | std::vector positions(n_pos); 123 | for (int i = 0; i < n_pos; ++i) { 124 | positions[i] = i / n_sz; 125 | } 126 | ctx.set_tensor_data("pos_h", positions.data()); 127 | for (int i = 0; i < n_pos; ++i) { 128 | positions[i] = i % n_sz; 129 | } 130 | ctx.set_tensor_data("pos_w", positions.data()); 131 | } 132 | ctx.set_tensor_data("vector", [](int i0, int i1, int i2, int i3) { 133 | //return i0 * 0.1; 134 | return 1.0; 135 | }); 136 | 137 | // compute 138 | ggml_status status = ctx.compute(); 139 | 140 | test_mrope(ctx); 141 | 142 | return 0; 143 | } 144 | 145 | // 146 | // experiment with ggml_rope_multi 147 | // 148 | 149 | void test_mrope(ggml_easy::ctx & ctx) { 150 | //const int n_sz = 3; 151 | const int n_dim = 12; 152 | const int n_head = 1; 153 | const int n_pos = 6; 154 | 155 | printf("\n\n--- test_mrope ---\n"); 156 | 157 | // create cgraph 158 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 159 | ggml_tensor * pos = utils.new_input("pos", GGML_TYPE_I32, n_pos*4); 160 | ggml_tensor * vector = utils.new_input("vector", GGML_TYPE_F32, n_dim*n_head, n_pos); 161 | 162 | ggml_tensor * cur; 163 | ggml_tensor * x = ggml_reshape_3d(ctx_gf, vector, n_dim, n_head, n_pos); 164 | { 165 | const int n_dim = x->ne[0]; 166 | const int n_head = x->ne[1]; 167 | const int n_pos = x->ne[2]; 168 | int sections[4] = {1, 1, 1, 0}; 169 | cur = ggml_rope_multi( 170 | ctx_gf, 171 | x, 172 | pos, // positions 173 | nullptr, // freq factors 174 | n_dim, // n_dims 175 | sections, // sections 176 | GGML_ROPE_TYPE_MROPE, 177 | 0, 10000.0f, 178 | 1.0f, 0.0f, 1.0f, 0.0f, 0.0f 179 | ); 180 | } 181 | 182 | cur = ggml_reshape_2d(ctx_gf, cur, n_dim*n_head, n_pos); 183 | utils.debug_print_full(cur, "mrope"); 184 | 185 | { 186 | ggml_tensor * pos_a = ggml_view_1d(ctx_gf, pos, n_pos, 0); 187 | const int n_dim = x->ne[0]; 188 | const int n_head = x->ne[1]; 189 | const int n_pos = x->ne[2]; 190 | int sections[4] = {1, 1, 1, 0}; 191 | cur = ggml_rope_ext( 192 | ctx_gf, 193 | x, 194 | pos_a, // positions 195 | nullptr, // freq factors 196 | n_dim, // n_dims 197 | GGML_ROPE_TYPE_NEOX, 0, 10000.0f, 198 | 1.0f, 0.0f, 1.0f, 0.0f, 0.0f 199 | ); 200 | } 201 | 202 | cur = ggml_reshape_2d(ctx_gf, cur, n_dim*n_head, n_pos); 203 | utils.debug_print_full(cur, "normal_rope"); 204 | }); 205 | 206 | // set data 207 | std::vector positions(n_pos*4, 0); 208 | //for (int i = 0; i < n_pos; ++i) positions[i + n_pos*0] = i / n_sz; 209 | for (int i = 0; i < n_pos; ++i) positions[i + n_pos*0] = i; 210 | for (int i = 0; i < n_pos; ++i) positions[i + n_pos*1] = i; 211 | for (int i = 0; i < n_pos; ++i) positions[i + n_pos*2] = i; 212 | for (int i = 0; i < n_pos; ++i) positions[i + n_pos*3] = 0; 213 
| for (int i = 0; i < 4; ++i) { 214 | for (int j = 0; j < n_pos; ++j) { 215 | printf("%d ", positions[i*n_pos + j]); 216 | } 217 | printf("\n"); 218 | } 219 | ctx.set_tensor_data("pos", positions.data()); 220 | ctx.set_tensor_data("vector", [](int i0, int i1, int i2, int i3) { 221 | return 1.0; 222 | }); 223 | 224 | // compute 225 | ctx.compute(); 226 | } 227 | -------------------------------------------------------------------------------- /demo/ultravox-encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | #include 6 | 7 | // placeholder, to be removed when it's upstreamed 8 | ggml_tensor * ggml_gelu_erf(ggml_context * ctx, ggml_tensor * a) { 9 | return a; 10 | } 11 | 12 | struct ultravox_encoder { 13 | float norm_eps = 1e-5; 14 | int n_head = 20; 15 | int n_embd; 16 | int n_ctx = 1500; 17 | 18 | ggml_tensor * position_embeddings; 19 | 20 | ggml_tensor * conv1d_1_w; 21 | ggml_tensor * conv1d_1_b; 22 | ggml_tensor * conv1d_2_w; 23 | ggml_tensor * conv1d_2_b; 24 | 25 | ggml_tensor * post_ln_w; 26 | ggml_tensor * post_ln_b; 27 | 28 | // projector 29 | ggml_tensor * mm_norm_pre_w; 30 | ggml_tensor * mm_norm_mid_w; 31 | ggml_tensor * mm_1_w; 32 | ggml_tensor * mm_2_w; 33 | 34 | struct layer { 35 | ggml_tensor * ln_1_w; 36 | ggml_tensor * ln_1_b; 37 | 38 | ggml_tensor * q_w; 39 | ggml_tensor * q_b; 40 | ggml_tensor * k_w; 41 | ggml_tensor * v_w; 42 | ggml_tensor * v_b; 43 | ggml_tensor * o_w; 44 | ggml_tensor * o_b; 45 | ggml_tensor * ln_2_w; 46 | ggml_tensor * ln_2_b; 47 | 48 | ggml_tensor * ff_up_w; 49 | ggml_tensor * ff_up_b; 50 | ggml_tensor * ff_down_w; 51 | ggml_tensor * ff_down_b; 52 | }; 53 | std::vector layers; 54 | 55 | ultravox_encoder(ggml_easy::ctx & ctx, int n_layers) { 56 | const char * prefix = "a"; // audio 57 | position_embeddings = ctx.get_weight("%s.position_embd.weight", prefix); 58 | n_embd = position_embeddings->ne[0]; 59 | conv1d_1_b = ctx.get_weight("%s.conv1d.1.bias", prefix); 60 | conv1d_1_w = ctx.get_weight("%s.conv1d.1.weight", prefix); 61 | conv1d_2_b = ctx.get_weight("%s.conv1d.2.bias", prefix); 62 | conv1d_2_w = ctx.get_weight("%s.conv1d.2.weight", prefix); 63 | post_ln_b = ctx.get_weight("%s.post_ln.bias", prefix); 64 | post_ln_w = ctx.get_weight("%s.post_ln.weight", prefix); 65 | 66 | mm_norm_pre_w = ctx.get_weight("mm.%s.norm_pre.weight", prefix); 67 | mm_norm_mid_w = ctx.get_weight("mm.%s.norm_mid.weight", prefix); 68 | mm_1_w = ctx.get_weight("mm.%s.mlp.1.weight", prefix); 69 | mm_2_w = ctx.get_weight("mm.%s.mlp.2.weight", prefix); 70 | 71 | for (int il = 0; il < n_layers; il++) { 72 | layers.push_back({ 73 | .ln_1_w = ctx.get_weight("%s.blk.%d.ln1.weight", prefix, il), 74 | .ln_1_b = ctx.get_weight("%s.blk.%d.ln1.bias", prefix, il), 75 | 76 | .q_w = ctx.get_weight("%s.blk.%d.attn_q.weight", prefix, il), 77 | .q_b = ctx.get_weight("%s.blk.%d.attn_q.bias", prefix, il), 78 | .k_w = ctx.get_weight("%s.blk.%d.attn_k.weight", prefix, il), 79 | .v_w = ctx.get_weight("%s.blk.%d.attn_v.weight", prefix, il), 80 | .v_b = ctx.get_weight("%s.blk.%d.attn_v.bias", prefix, il), 81 | .o_w = ctx.get_weight("%s.blk.%d.attn_out.weight", prefix, il), 82 | .o_b = ctx.get_weight("%s.blk.%d.attn_out.bias", prefix, il), 83 | .ln_2_w = ctx.get_weight("%s.blk.%d.ln2.weight", prefix, il), 84 | .ln_2_b = ctx.get_weight("%s.blk.%d.ln2.bias", prefix, il), 85 | 86 | .ff_up_w = ctx.get_weight("%s.blk.%d.ffn_up.weight", prefix, il), 87 | .ff_up_b = 
ctx.get_weight("%s.blk.%d.ffn_up.bias", prefix, il), 88 | .ff_down_w = ctx.get_weight("%s.blk.%d.ffn_down.weight", prefix, il), 89 | .ff_down_b = ctx.get_weight("%s.blk.%d.ffn_down.bias", prefix, il), 90 | }); 91 | } 92 | } 93 | }; 94 | 95 | // unused, but just keep it here for fun 96 | static ggml_tensor * custom_gelu(ggml_context * ctx, ggml_tensor * a) { 97 | ggml_tensor * one = ggml_arange(ctx, 1.0f, 2.0f, 1.0f); 98 | one = ggml_view_1d(ctx, one, 1, 0); 99 | ggml_tensor * a3 = ggml_mul(ctx, a, ggml_mul(ctx, a, a)); 100 | ggml_tensor * a3_s = ggml_scale(ctx, a3, 0.035677f); 101 | ggml_tensor * inner = ggml_add(ctx, a3_s, ggml_scale(ctx, a, 0.797885f)); 102 | inner = ggml_scale(ctx, inner, 0.7978845608f); 103 | ggml_tensor * out = ggml_tanh(ctx, inner); 104 | out = ggml_add(ctx, out, one); 105 | out = ggml_mul(ctx, out, a); 106 | out = ggml_scale(ctx, out, 0.5f); 107 | return out; 108 | } 109 | 110 | int main() { 111 | ggml_easy::ctx_params params; 112 | params.use_gpu = false; 113 | ggml_easy::ctx ctx(params); 114 | ctx.load_gguf("ultravox-f32.gguf"); 115 | 116 | const int n_step = 1024; 117 | const int n_mel = 128; 118 | const int n_pos = n_step / 2; 119 | 120 | // model 121 | ultravox_encoder model(ctx, 32); 122 | 123 | const int n_layer = 32; 124 | const int n_head = model.n_head; 125 | const int n_embd = model.n_embd; 126 | const int d_head = n_embd / n_head; 127 | const float eps = model.norm_eps; 128 | 129 | const int proj_stack_factor = 8; 130 | 131 | // create cgraph 132 | ctx.build_graph([&](ggml_context * ctx0, ggml_cgraph * gf, auto & utils) { 133 | ggml_tensor * inp_raw = utils.new_input("inp_raw", GGML_TYPE_F32, n_step, n_mel); 134 | ggml_tensor * positions = utils.new_input("positions", GGML_TYPE_I32, n_pos); 135 | 136 | ggml_tensor * inp; 137 | 138 | // conv1d block 139 | { 140 | // convolution + gelu 141 | ggml_tensor * cur = ggml_conv_1d(ctx0, model.conv1d_1_w, inp_raw, 1, 1, 1); 142 | cur = ggml_add(ctx0, cur, model.conv1d_1_b); 143 | 144 | //cur = ggml_cast(ctx0, cur, GGML_TYPE_F16); 145 | cur = ggml_gelu_erf(ctx0, cur); 146 | //cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); 147 | utils.debug_print(cur, "first conv"); 148 | utils.debug_print(ggml_sum(ctx0, cur), "first conv sum"); 149 | 150 | cur = ggml_conv_1d(ctx0, model.conv1d_2_w, cur, 2, 1, 1); 151 | cur = ggml_add(ctx0, cur, model.conv1d_2_b); 152 | utils.debug_print(cur, "second conv"); 153 | utils.debug_print(ggml_sum(ctx0, cur), "second conv sum"); 154 | 155 | //cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); 156 | cur = ggml_gelu_erf(ctx0, cur); 157 | //cur = ggml_cast(ctx0, cur, GGML_TYPE_F32); 158 | // transpose 159 | inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); 160 | } 161 | 162 | //inp = ggml_scale(ctx0, inp, 0.0); // test 163 | 164 | utils.debug_print(inp, "after conv1d"); 165 | utils.debug_print(ggml_sum(ctx0, inp), "after conv1d sum"); 166 | 167 | // add position embeddings 168 | inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); 169 | 170 | utils.debug_print(inp, "after added pos"); 171 | 172 | // iterate layers 173 | for (int il = 0; il < n_layer; ++il) { 174 | auto & layer = model.layers[il]; 175 | ggml_tensor * cur = inp; 176 | 177 | cur = ggml_norm(ctx0, cur, eps); 178 | cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b); 179 | 180 | // attention 181 | { 182 | ggml_tensor * q = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); 183 | ggml_tensor * k = ggml_mul_mat(ctx0, layer.k_w, cur); // no bias for key 184 | ggml_tensor * v = 
ggml_add(ctx0, ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); 185 | 186 | q = ggml_reshape_3d(ctx0, q, d_head, n_head, n_pos); 187 | k = ggml_reshape_3d(ctx0, k, d_head, n_head, n_pos); 188 | v = ggml_reshape_3d(ctx0, v, d_head, n_head, n_pos); 189 | 190 | q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); 191 | k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); 192 | 193 | ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); 194 | ggml_mul_mat_set_prec(kq, GGML_PREC_F32); 195 | kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f / std::sqrt(d_head), 0.0f); 196 | 197 | v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); 198 | 199 | ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); 200 | //kqv = ggml_reshape_3d(ctx0, kqv, d_head, n_pos, n_head); 201 | kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); 202 | kqv = ggml_cont_2d(ctx0, kqv, n_embd, n_pos); 203 | 204 | cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.o_w, kqv), layer.o_b); 205 | } 206 | 207 | utils.debug_print(cur, "layer %d after attn", il); 208 | utils.debug_print(ggml_sum(ctx0, cur), "layer %d after attn sum", il); 209 | 210 | // residual 211 | cur = ggml_add(ctx0, cur, inp); 212 | 213 | inp = cur; // inp = residual, cur = hidden_states 214 | cur = ggml_norm(ctx0, cur, eps); 215 | cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b); 216 | 217 | // mlp 218 | { 219 | cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_up_w, cur), layer.ff_up_b); 220 | cur = ggml_gelu_erf(ctx0, cur); 221 | //cur = custom_gelu(ctx0, cur); 222 | cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_down_w, cur), layer.ff_down_b); 223 | } 224 | 225 | utils.debug_print(cur, "layer %d after ffn", il); 226 | utils.debug_print(ggml_sum(ctx0, cur), "layer %d after ffn sum", il); 227 | 228 | // residual 229 | cur = ggml_add(ctx0, cur, inp); 230 | 231 | inp = cur; 232 | 233 | utils.debug_print(inp, "layer %d out", il); 234 | utils.debug_print(ggml_sum(ctx0, inp), "layer %d out", il); 235 | } 236 | 237 | ggml_tensor * embeddings = inp; 238 | 239 | //embeddings = utils.new_input("test", GGML_TYPE_F32, 1280, 512); 240 | //embeddings = ggml_scale(ctx0, embeddings, 0.0); 241 | 242 | // output norm 243 | embeddings = ggml_norm(ctx0, embeddings, eps); 244 | embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); 245 | 246 | utils.debug_print(embeddings, "after output norm"); 247 | utils.debug_print(ggml_sum(ctx0, embeddings), "after output norm sum"); 248 | 249 | utils.debug_print(ggml_scale(ctx0, model.post_ln_w, 1.0), "post_ln_w"); 250 | utils.debug_print(ggml_scale(ctx0, model.post_ln_b, 1.0), "post_ln_b"); 251 | 252 | // StackAudioFrames 253 | // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py 254 | { 255 | int64_t stride = n_embd * proj_stack_factor; 256 | int64_t padded_len = GGML_PAD(ggml_nelements(embeddings), stride); 257 | int64_t pad = padded_len - ggml_nelements(embeddings); 258 | if (pad > 0) { 259 | embeddings = ggml_view_1d(ctx0, embeddings, ggml_nelements(embeddings), 0); 260 | embeddings = ggml_pad(ctx0, embeddings, pad, 0, 0, 0); 261 | } 262 | embeddings = ggml_view_2d(ctx0, embeddings, stride, padded_len / stride, 263 | ggml_row_size(embeddings->type, stride), 0); 264 | } 265 | 266 | utils.debug_print(embeddings, "after stack"); 267 | utils.debug_print(ggml_sum(ctx0, embeddings), "after stack sum"); 268 | 269 | // UltravoxProjector 270 | { 271 | ggml_tensor * cur = embeddings; 272 | // pre-norm 273 | cur = ggml_rms_norm(ctx0, cur, 1e-6); 274 | cur = ggml_mul(ctx0, cur, 
model.mm_norm_pre_w); 275 | 276 | // ffn in 277 | cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); 278 | 279 | utils.debug_print(cur, "before swiglu"); 280 | 281 | // swiglu 282 | { 283 | int64_t split_point = cur->ne[0] / 2; 284 | ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); 285 | ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); 286 | 287 | // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half 288 | x1 = ggml_silu(ctx0, x1); 289 | cur = ggml_mul(ctx0, x0, x1); 290 | } 291 | 292 | utils.debug_print(cur, "after swiglu"); 293 | 294 | // mid-norm 295 | cur = ggml_rms_norm(ctx0, cur, 1e-6); 296 | cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); 297 | 298 | // ffn out 299 | cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); 300 | 301 | embeddings = cur; 302 | } 303 | 304 | utils.debug_print(embeddings, "output"); 305 | utils.debug_print(ggml_sum(ctx0, embeddings), "output_sum"); 306 | }); 307 | 308 | // set the input 309 | { 310 | std::vector inp_raw(n_mel*n_step, 0.0f); 311 | for (int i = 0; i < n_step*n_mel; i++) { 312 | inp_raw[i] = (float)std::sin((float)i)*0.1f; 313 | //inp_raw[i] = 1.0f / (float)(i+1); 314 | } 315 | ctx.set_tensor_data("inp_raw", inp_raw.data()); 316 | 317 | std::vector positions(n_pos); 318 | for (int i = 0; i < n_pos; i++) positions[i] = i; 319 | ctx.set_tensor_data("positions", positions.data()); 320 | 321 | //std::vector test(1280*512, 0.1f); 322 | //for (int i = 0; i < (int)test.size(); i++) test[i] = (float)std::sin((float)i)*0.1f; 323 | //ctx.set_tensor_data("test", test.data()); 324 | } 325 | 326 | // compute 327 | ctx.compute(); 328 | 329 | return 0; 330 | } 331 | -------------------------------------------------------------------------------- /demo/whisper-encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | #include 6 | 7 | #define WHISPER_ASSERT GGML_ASSERT 8 | 9 | #define WHISPER_SAMPLE_RATE 16000 10 | #define WHISPER_N_FFT 400 11 | #define WHISPER_HOP_LENGTH 160 12 | #define WHISPER_CHUNK_SIZE 30 13 | 14 | namespace whisper_preprocessor { 15 | 16 | struct whisper_mel { 17 | int n_len; 18 | int n_len_org; 19 | int n_mel; 20 | 21 | std::vector data; 22 | }; 23 | 24 | struct whisper_filters { 25 | int32_t n_mel; 26 | int32_t n_fft; 27 | 28 | std::vector data; 29 | }; 30 | 31 | #define SIN_COS_N_COUNT WHISPER_N_FFT 32 | namespace { 33 | struct whisper_global_cache { 34 | // In FFT, we frequently use sine and cosine operations with the same values. 35 | // We can use precalculated values to speed up the process. 
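        // Concretely: sin_vals[i] = sin(2*pi*i / SIN_COS_N_COUNT) and
        // cos_vals[i] = cos(2*pi*i / SIN_COS_N_COUNT). dft()/fft() below then look up
        // these tables with an index stride of SIN_COS_N_COUNT / N instead of calling
        // sinf()/cosf() for every sample.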
36 | float sin_vals[SIN_COS_N_COUNT]; 37 | float cos_vals[SIN_COS_N_COUNT]; 38 | 39 | // Hann window (Use cosf to eliminate difference) 40 | // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html 41 | // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 42 | float hann_window[WHISPER_N_FFT]; 43 | 44 | whisper_global_cache() { 45 | fill_sin_cos_table(); 46 | fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window); 47 | } 48 | 49 | void fill_sin_cos_table() { 50 | for (int i = 0; i < SIN_COS_N_COUNT; i++) { 51 | double theta = (2 * M_PI * i) / SIN_COS_N_COUNT; 52 | sin_vals[i] = sinf(theta); 53 | cos_vals[i] = cosf(theta); 54 | } 55 | } 56 | 57 | void fill_hann_window(int length, bool periodic, float * output) { 58 | int offset = -1; 59 | if (periodic) { 60 | offset = 0; 61 | } 62 | for (int i = 0; i < length; i++) { 63 | output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); 64 | } 65 | } 66 | } global_cache; 67 | } 68 | 69 | // naive Discrete Fourier Transform 70 | // input is real-valued 71 | // output is complex-valued 72 | static void dft(const float* in, int N, float* out) { 73 | const int sin_cos_step = SIN_COS_N_COUNT / N; 74 | 75 | for (int k = 0; k < N; k++) { 76 | float re = 0; 77 | float im = 0; 78 | 79 | for (int n = 0; n < N; n++) { 80 | int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N 81 | re += in[n]*global_cache.cos_vals[idx]; // cos(t) 82 | im -= in[n]*global_cache.sin_vals[idx]; // sin(t) 83 | } 84 | 85 | out[k*2 + 0] = re; 86 | out[k*2 + 1] = im; 87 | } 88 | } 89 | 90 | // Cooley-Tukey FFT 91 | // poor man's implementation - use something better 92 | // input is real-valued 93 | // output is complex-valued 94 | static void fft(float* in, int N, float* out) { 95 | if (N == 1) { 96 | out[0] = in[0]; 97 | out[1] = 0; 98 | return; 99 | } 100 | 101 | const int half_N = N / 2; 102 | if (N - half_N*2 == 1) { 103 | dft(in, N, out); 104 | return; 105 | } 106 | 107 | float* even = in + N; 108 | for (int i = 0; i < half_N; ++i) { 109 | even[i]= in[2*i]; 110 | } 111 | float* even_fft = out + 2 * N; 112 | fft(even, half_N, even_fft); 113 | 114 | float* odd = even; 115 | for (int i = 0; i < half_N; ++i) { 116 | odd[i] = in[2*i + 1]; 117 | } 118 | float* odd_fft = even_fft + N; 119 | fft(odd, half_N, odd_fft); 120 | 121 | const int sin_cos_step = SIN_COS_N_COUNT / N; 122 | for (int k = 0; k < half_N; k++) { 123 | int idx = k * sin_cos_step; // t = 2*M_PI*k/N 124 | float re = global_cache.cos_vals[idx]; // cos(t) 125 | float im = -global_cache.sin_vals[idx]; // sin(t) 126 | 127 | float re_odd = odd_fft[2*k + 0]; 128 | float im_odd = odd_fft[2*k + 1]; 129 | 130 | out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd; 131 | out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd; 132 | 133 | out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; 134 | out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; 135 | } 136 | } 137 | 138 | static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector & samples, 139 | int n_samples, int frame_size, int frame_step, int n_threads, 140 | const whisper_filters & filters, whisper_mel & mel) { 141 | std::vector fft_in(frame_size * 2, 0.0); 142 | std::vector fft_out(frame_size * 2 * 2 * 2); 143 | 144 | int n_fft = filters.n_fft; 145 | int i = ith; 146 | 147 | // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist 148 | WHISPER_ASSERT(n_fft == 1 + (frame_size / 2)); 149 | 150 | 
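    // Frames are interleaved across threads: thread `ith` processes frames ith, ith + n_threads, ...
    // Per frame: apply the Hann window, run the FFT, keep the power of the first n_fft bins,
    // take the dot product with each mel filter, then log10 (clamped at 1e-10).
    // Results are stored row-major as mel.data[j * mel.n_len + i] for mel bin j and frame i.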
// calculate FFT only when fft_in are not all zero 151 | for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) { 152 | const int offset = i * frame_step; 153 | 154 | // apply Hann window (~10% faster) 155 | for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { 156 | fft_in[j] = hann[j] * samples[offset + j]; 157 | } 158 | 159 | // fill the rest with zeros 160 | if (n_samples - offset < frame_size) { 161 | std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); 162 | } 163 | 164 | // FFT 165 | fft(fft_in.data(), frame_size, fft_out.data()); 166 | 167 | // Calculate modulus^2 of complex numbers 168 | // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. 169 | for (int j = 0; j < n_fft; j++) { 170 | fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); 171 | } 172 | 173 | // mel spectrogram 174 | for (int j = 0; j < mel.n_mel; j++) { 175 | double sum = 0.0; 176 | // unroll loop (suggested by GH user @lunixbochs) 177 | int k = 0; 178 | for (k = 0; k < n_fft - 3; k += 4) { 179 | sum += 180 | fft_out[k + 0] * filters.data[j * n_fft + k + 0] + 181 | fft_out[k + 1] * filters.data[j * n_fft + k + 1] + 182 | fft_out[k + 2] * filters.data[j * n_fft + k + 2] + 183 | fft_out[k + 3] * filters.data[j * n_fft + k + 3]; 184 | } 185 | // handle n_fft remainder 186 | for (; k < n_fft; k++) { 187 | sum += fft_out[k] * filters.data[j * n_fft + k]; 188 | } 189 | sum = log10(std::max(sum, 1e-10)); 190 | mel.data[j * mel.n_len + i] = sum; 191 | } 192 | } 193 | 194 | // Otherwise fft_out are all zero 195 | double sum = log10(1e-10); 196 | for (; i < mel.n_len; i += n_threads) { 197 | for (int j = 0; j < mel.n_mel; j++) { 198 | mel.data[j * mel.n_len + i] = sum; 199 | } 200 | } 201 | } 202 | 203 | // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 204 | static bool log_mel_spectrogram( 205 | const float * samples, 206 | const int n_samples, 207 | const int /*sample_rate*/, 208 | const int frame_size, 209 | const int frame_step, 210 | const int n_mel, 211 | const int n_threads, 212 | const whisper_filters & filters, 213 | const bool debug, 214 | whisper_mel & mel) { 215 | const int64_t t_start_us = ggml_time_us(); 216 | 217 | // Hann window 218 | WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size"); 219 | const float * hann = global_cache.hann_window; 220 | 221 | // Calculate the length of padding 222 | int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; 223 | int64_t stage_2_pad = frame_size / 2; 224 | 225 | // Initialize a vector and copy data from C array to it. 
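    // Resulting layout (total length = n_samples + stage_1_pad + 2 * stage_2_pad):
    //   [ reflect pad (stage_2_pad) | original samples (n_samples) | zeros (stage_1_pad + stage_2_pad) ]
    // i.e. 200 reflected samples in front, the audio itself, then 30 s of silence plus 200 trailing zeros.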
226 | std::vector samples_padded; 227 | samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); 228 | std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); 229 | 230 | // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio 231 | std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0); 232 | 233 | // reflective pad 200 samples at the beginning of audio 234 | std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin()); 235 | 236 | mel.n_mel = n_mel; 237 | // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936 238 | // Calculate number of frames + remove the last frame 239 | mel.n_len = (samples_padded.size() - frame_size) / frame_step; 240 | // Calculate semi-padded sample length to ensure compatibility 241 | mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step; 242 | mel.data.resize(mel.n_mel * mel.n_len); 243 | 244 | { 245 | std::vector workers(n_threads - 1); 246 | for (int iw = 0; iw < n_threads - 1; ++iw) { 247 | workers[iw] = std::thread( 248 | log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), 249 | n_samples + stage_2_pad, frame_size, frame_step, n_threads, 250 | std::cref(filters), std::ref(mel)); 251 | } 252 | 253 | // main thread 254 | log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel); 255 | 256 | for (int iw = 0; iw < n_threads - 1; ++iw) { 257 | workers[iw].join(); 258 | } 259 | } 260 | 261 | // clamping and normalization 262 | double mmax = -1e20; 263 | for (int i = 0; i < mel.n_mel*mel.n_len; i++) { 264 | if (mel.data[i] > mmax) { 265 | mmax = mel.data[i]; 266 | } 267 | } 268 | 269 | mmax -= 8.0; 270 | 271 | for (int i = 0; i < mel.n_mel*mel.n_len; i++) { 272 | if (mel.data[i] < mmax) { 273 | mel.data[i] = mmax; 274 | } 275 | 276 | mel.data[i] = (mel.data[i] + 4.0)/4.0; 277 | } 278 | 279 | // Dump log_mel_spectrogram 280 | if (debug) { 281 | std::ofstream outFile("log_mel_spectrogram.json"); 282 | outFile << "["; 283 | for (uint64_t i = 0; i < mel.data.size() - 1; i++) { 284 | outFile << mel.data[i] << ", "; 285 | } 286 | outFile << mel.data[mel.data.size() - 1] << "]"; 287 | outFile.close(); 288 | } 289 | 290 | return true; 291 | } 292 | 293 | } // namespace whisper_preprocessor 294 | 295 | struct whisper_encoder { 296 | float norm_eps = 1e-5; 297 | int n_head = 6; 298 | int n_embd; 299 | int n_ctx = 1500; 300 | 301 | ggml_tensor * pos_embd; 302 | 303 | ggml_tensor * conv_1_w; 304 | ggml_tensor * conv_1_b; 305 | ggml_tensor * conv_2_w; 306 | ggml_tensor * conv_2_b; 307 | 308 | ggml_tensor * out_norm_w; 309 | ggml_tensor * out_norm_b; 310 | 311 | struct layer { 312 | ggml_tensor * inp_norm_w; 313 | ggml_tensor * inp_norm_b; 314 | 315 | ggml_tensor * attn_q; 316 | ggml_tensor * attn_q_b; 317 | ggml_tensor * attn_k; 318 | ggml_tensor * attn_v; 319 | ggml_tensor * attn_v_b; 320 | ggml_tensor * attn_o; 321 | ggml_tensor * attn_o_b; 322 | ggml_tensor * attn_post_norm_w; 323 | ggml_tensor * attn_post_norm_b; 324 | 325 | ggml_tensor * ffn_up; 326 | ggml_tensor * ffn_up_b; 327 | ggml_tensor * ffn_down; 328 | ggml_tensor * ffn_down_b; 329 | }; 330 | std::vector layers; 331 | 332 | whisper_encoder(ggml_easy::ctx & ctx, int n_layers) { 333 | const char * prefix = "encoder"; 334 | pos_embd = ctx.get_weight("model.%s.embed_positions.weight", 
prefix); 335 | n_embd = pos_embd->ne[0]; 336 | conv_1_b = ctx.get_weight("model.%s.conv1.bias", prefix); 337 | conv_1_w = ctx.get_weight("model.%s.conv1.weight", prefix); 338 | conv_2_b = ctx.get_weight("model.%s.conv2.bias", prefix); 339 | conv_2_w = ctx.get_weight("model.%s.conv2.weight", prefix); 340 | out_norm_b = ctx.get_weight("model.%s.layer_norm.bias", prefix); 341 | out_norm_w = ctx.get_weight("model.%s.layer_norm.weight", prefix); 342 | for (int il = 0; il < n_layers; il++) { 343 | layers.push_back({ 344 | .inp_norm_w = ctx.get_weight("model.%s.layers.%d.self_attn_layer_norm.weight", prefix, il), 345 | .inp_norm_b = ctx.get_weight("model.%s.layers.%d.self_attn_layer_norm.bias", prefix, il), 346 | 347 | .attn_q = ctx.get_weight("model.%s.layers.%d.self_attn.q_proj.weight", prefix, il), 348 | .attn_q_b = ctx.get_weight("model.%s.layers.%d.self_attn.q_proj.bias", prefix, il), 349 | .attn_k = ctx.get_weight("model.%s.layers.%d.self_attn.k_proj.weight", prefix, il), 350 | .attn_v = ctx.get_weight("model.%s.layers.%d.self_attn.v_proj.weight", prefix, il), 351 | .attn_v_b = ctx.get_weight("model.%s.layers.%d.self_attn.v_proj.bias", prefix, il), 352 | .attn_o = ctx.get_weight("model.%s.layers.%d.self_attn.out_proj.weight", prefix, il), 353 | .attn_o_b = ctx.get_weight("model.%s.layers.%d.self_attn.out_proj.bias", prefix, il), 354 | .attn_post_norm_w = ctx.get_weight("model.%s.layers.%d.final_layer_norm.weight", prefix, il), 355 | .attn_post_norm_b = ctx.get_weight("model.%s.layers.%d.final_layer_norm.bias", prefix, il), 356 | 357 | .ffn_up = ctx.get_weight("model.%s.layers.%d.fc1.weight", prefix, il), 358 | .ffn_up_b = ctx.get_weight("model.%s.layers.%d.fc1.bias", prefix, il), 359 | .ffn_down = ctx.get_weight("model.%s.layers.%d.fc2.weight", prefix, il), 360 | .ffn_down_b = ctx.get_weight("model.%s.layers.%d.fc2.bias", prefix, il), 361 | }); 362 | } 363 | } 364 | 365 | ggml_tensor * forward(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input, ggml_tensor * input_pos) { 366 | int n_tokens = n_ctx; //;input->ne[1]; 367 | ggml_tensor * x = input; 368 | 369 | auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { 370 | x = ggml_norm(ctx0, x, norm_eps); 371 | x = ggml_mul(ctx0, x, w); 372 | x = ggml_add(ctx0, x, b); 373 | return x; 374 | }; 375 | 376 | auto add_pos = [&](ggml_tensor * x) { 377 | //ggml_tensor * pos_embd_selected = ggml_get_rows(ctx0, pos_embd, input_pos); 378 | //x = ggml_add(ctx0, x, pos_embd_selected); 379 | x = ggml_add(ctx0, x, pos_embd); 380 | return x; 381 | }; 382 | 383 | // TODO: do this at conversion time, see LlamaModel.permute in convert_hf_to_gguf.py 384 | auto llama_permute = [&](ggml_tensor * w) { 385 | ggml_tensor * tmp = ggml_reshape_4d(ctx0, w, w->ne[0], w->ne[1] / n_head / 2, 2, n_head); 386 | tmp = ggml_permute(ctx0, tmp, 0, 2, 1, 3); 387 | tmp = ggml_cont(ctx0, tmp); 388 | return ggml_reshape_2d(ctx0, tmp, w->ne[0], w->ne[1]); 389 | }; 390 | 391 | // convolution + gelu 392 | { 393 | ggml_tensor * tmp; 394 | tmp = ggml_cast(ctx0, conv_1_w, GGML_TYPE_F16); // TODO: do this at conversion time 395 | x = ggml_conv_1d_ph(ctx0, tmp, input, 1, 1); 396 | tmp = ggml_cont(ctx0, ggml_transpose(ctx0, conv_1_b)); // TODO: do this at conversion time 397 | x = ggml_add(ctx0, x, tmp); 398 | 399 | x = ggml_gelu(ctx0, x); 400 | 401 | tmp = ggml_cast(ctx0, conv_2_w, GGML_TYPE_F16); // TODO: do this at conversion time 402 | x = ggml_conv_1d_ph(ctx0, tmp, x, 2, 1); 403 | tmp = ggml_cont(ctx0, ggml_transpose(ctx0, conv_2_b)); // 
TODO: do this at conversion time 404 | x = ggml_add(ctx0, x, tmp); 405 | 406 | x = ggml_gelu(ctx0, x); 407 | } 408 | 409 | x = ggml_cont(ctx0, ggml_transpose(ctx0, x)); 410 | x = add_pos(x); 411 | ggml_tensor * residual = x; 412 | 413 | int i = 0; // for debugging 414 | for (auto & layer : layers) { 415 | residual = x; 416 | 417 | // input layer norm 418 | x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b); 419 | 420 | // self attention 421 | { 422 | ggml_tensor * q = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.attn_q, x), layer.attn_q_b); 423 | ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x); // no bias for key 424 | ggml_tensor * v = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.attn_v, x), layer.attn_v_b); 425 | 426 | int n_embd_head = n_embd / n_head; 427 | q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); 428 | k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head, n_tokens); 429 | v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head, n_tokens); 430 | 431 | int n_rot = n_embd_head; 432 | q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); 433 | q = ggml_scale(ctx0, q, 1.0f / std::sqrt(n_embd_head)); 434 | // utils.debug_print(q, "q rope"); 435 | 436 | k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); 437 | // utils.debug_print(k, "k rope"); 438 | 439 | ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); 440 | kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f, 0.0f); 441 | // utils.debug_print(kq, "kq softmax"); 442 | 443 | v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); 444 | 445 | ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); 446 | //kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, n_head); 447 | kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); 448 | kqv = ggml_cont_2d(ctx0, kqv, n_embd, n_tokens); 449 | // utils.debug_print(kqv, "kqv"); 450 | // utils.debug_print(ggml_sum(ctx0, kqv), "kqv_sum"); 451 | 452 | x = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.attn_o, kqv), layer.attn_o_b); 453 | } 454 | 455 | // residual 456 | x = ggml_add(ctx0, x, residual); 457 | 458 | residual = x; 459 | x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b); 460 | 461 | // mlp 462 | { 463 | x = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ffn_up, x), layer.ffn_up_b); 464 | x = ggml_gelu(ctx0, x); 465 | x = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ffn_down, x), layer.ffn_down_b); 466 | } 467 | 468 | // residual 469 | x = ggml_add(ctx0, x, residual); 470 | // utils.debug_print(x, "output_layer_%d", i); 471 | // utils.debug_print(ggml_sum(ctx0, x), "output_layer_%d_sum", i); i++; 472 | } 473 | 474 | // output norm 475 | x = layer_norm(x, out_norm_w, out_norm_b); 476 | 477 | return x; 478 | } 479 | }; 480 | 481 | int main() { 482 | ggml_easy::ctx_params params; 483 | ggml_easy::ctx ctx(params); 484 | ctx.load_gguf("models/whisper-mel-filters.gguf"); 485 | ctx.load_safetensors("whisper-tiny.safetensors", {}); 486 | 487 | whisper_preprocessor::whisper_filters mel_filters; 488 | { 489 | auto mel_80 = ctx.get_weight("mel_80"); 490 | ggml_easy::debug::print_tensor_shape(mel_80); 491 | mel_filters.n_mel = mel_80->ne[1]; 492 | mel_filters.n_fft = mel_80->ne[0]; 493 | mel_filters.data.resize(ggml_nelements(mel_80)); 494 | ggml_backend_tensor_get(mel_80, mel_filters.data.data(), 0, mel_filters.data.size()); 495 | 496 | // for (int row = 0; row < mel_filters.n_mel; row++) { 497 | // for (int i = 0; i < mel_filters.n_fft; i++) { 498 | // float elem = mel_filters.data[row * mel_filters.n_fft + i]; 499 | // if (elem != 0.0) { 500 | // printf("[%d, %d] %f\n", row, i, elem); 501 | // } 502 | // } 503 | // 
printf("\n"); 504 | // } 505 | } 506 | 507 | std::vector samples(3000, 1.0); 508 | 509 | whisper_preprocessor::whisper_mel mel; 510 | whisper_preprocessor::log_mel_spectrogram( 511 | samples.data(), 512 | samples.size(), 513 | WHISPER_SAMPLE_RATE, 514 | WHISPER_N_FFT, 515 | WHISPER_HOP_LENGTH, 516 | mel_filters.n_mel, 517 | 4, // threads 518 | mel_filters, 519 | false, 520 | mel); 521 | 522 | printf("mel.n_len: %d\n", mel.n_len); 523 | printf("mel.n_mel: %d\n", mel.n_mel); 524 | printf("mel.size: %zu\n", mel.data.size()); 525 | // print first and last 10 elements 526 | for (int i = 0; i < 10; i++) { 527 | printf("%f ", mel.data[i]); 528 | } 529 | printf("\n"); 530 | for (int i = mel.data.size() - 10; i < mel.data.size(); i++) { 531 | printf("%f ", mel.data[i]); 532 | } 533 | printf("\n"); 534 | 535 | 536 | // model 537 | whisper_encoder encoder(ctx, 4); 538 | 539 | // create cgraph 540 | ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) { 541 | ggml_tensor * input = utils.new_input("mel", GGML_TYPE_F32, 2*encoder.n_ctx, mel.n_mel); 542 | ggml_easy::debug::print_tensor_shape(input); 543 | utils.debug_print(input, "input"); 544 | ggml_tensor * pos = nullptr; //utils.new_input("pos", GGML_TYPE_I32, mel.n_len); 545 | ggml_tensor * result = encoder.forward(ctx_gf, utils, input, pos); 546 | utils.debug_print(result, "result"); 547 | utils.mark_output(result, "result"); 548 | }); 549 | 550 | // set data 551 | ctx.set_tensor_data("mel", mel.data.data()); 552 | // set the input 553 | { 554 | int mel_offset = 0; 555 | int n_ctx = encoder.n_ctx; 556 | std::vector dst(2*n_ctx * mel.n_mel, 0.0f); 557 | 558 | const int i0 = std::min(mel_offset, mel.n_len); 559 | const int i1 = std::min(mel_offset + 2*n_ctx, mel.n_len); 560 | 561 | for (int j = 0; j < mel.n_mel; ++j) { 562 | for (int i = i0; i < i1; ++i) { 563 | dst[j*2*n_ctx + (i - i0)] = mel.data[j*mel.n_len + i]; 564 | } 565 | } 566 | 567 | ctx.set_tensor_data("mel", dst.data()); 568 | } 569 | 570 | // set pos 571 | // std::vector pos(mel.n_len); 572 | // for (size_t i = 0; i < pos.size(); i++) { 573 | // pos[i] = i; 574 | // } 575 | // ctx.set_tensor_data("pos", pos.data()); 576 | 577 | // compute 578 | ctx.compute(); 579 | 580 | return 0; 581 | } 582 | -------------------------------------------------------------------------------- /ggml-easy.h: -------------------------------------------------------------------------------- 1 | // 2 | // ggml-easy.hpp 3 | // 4 | // Copyright (c) 2025 Xuan-Son Nguyen. All rights reserved. 
5 | // MIT License 6 | // 7 | 8 | #include "ggml.h" 9 | #include "ggml-cpp.h" 10 | #include "ggml-cpu.h" 11 | #include "ggml-alloc.h" 12 | #include "ggml-backend.h" 13 | #include "gguf.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace ggml_easy { 25 | 26 | struct ctx_params { 27 | bool use_gpu = true; 28 | int max_nodes = 8192; 29 | ggml_log_level log_level = GGML_LOG_LEVEL_INFO; 30 | bool safetensors_ignore_unknown_dtype = false; 31 | }; 32 | 33 | void log_cb(ggml_log_level level, const char * text, void * cur_lvl_ptr) { 34 | ggml_log_level cur_lvl = *(ggml_log_level *) cur_lvl_ptr; 35 | if (cur_lvl > level) { 36 | return; 37 | } 38 | fputs(text, stderr); 39 | fflush(stderr); 40 | } 41 | 42 | // forward declaration 43 | namespace debug { 44 | static void print_tensor_shape(ggml_tensor * t); 45 | static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n = 3); 46 | } 47 | 48 | // forward declaration for safetensors (lightweight) JSON parser 49 | struct safetensors_json_parser { 50 | enum state { 51 | STATE_ROOT, 52 | STATE_OBJ_METADATA, 53 | STATE_OBJ_TENSOR, 54 | }; 55 | struct tensor { 56 | std::string name; 57 | bool ignored = false; 58 | ggml_type type = GGML_TYPE_F32; // only F32, F16, BF16 are supported 59 | std::array shape = {0, 1, 1, 1}; // row-major order 60 | uint64_t offset = 0; 61 | void print() { 62 | printf("tensor: %-60s, type: %s, shape: [%4" PRId64 ", %4" PRId64 ", %4" PRId64 ", %4" PRId64 "], offset: %" PRIu64 "\n", 63 | name.c_str(), ggml_type_name(type), shape[0], shape[1], shape[2], shape[3], offset); 64 | } 65 | }; 66 | bool ignore_unknown_dtype = false; 67 | std::vector tensors; 68 | size_t metadata_size = 0; 69 | safetensors_json_parser(const char * json, size_t metadata_size, std::map name_replace_map, bool ignore_unknown_dtype); 70 | uint64_t get_data_offset(); 71 | }; 72 | 73 | std::string string_format(const char * fmt, ...); 74 | void string_replace_all(std::string & s, const std::string & search, const std::string & replace); 75 | 76 | //////////////////////////////////////// 77 | 78 | struct ctx { 79 | ggml_log_level log_level; 80 | 81 | std::unordered_map tensors; 82 | 83 | ggml_cgraph * gf = nullptr; 84 | ggml_context * ctx_gf = nullptr; 85 | std::vector buf_compute_meta; 86 | int max_nodes; 87 | 88 | std::vector backend_ptrs; 89 | std::vector backend_buft; 90 | 91 | ggml_backend_t backend = nullptr; 92 | ggml_backend_t backend_cpu = nullptr; 93 | ggml_backend_buffer_t buf = nullptr; 94 | 95 | ggml_backend_sched_ptr sched; 96 | 97 | private: 98 | // private data members 99 | struct loaded_gguf { 100 | gguf_context_ptr ctx_gguf; 101 | ggml_context_ptr ctx_data; 102 | }; 103 | std::vector loaded_ggufs; 104 | 105 | struct printed_tensor { 106 | ggml_tensor * t; 107 | bool full; 108 | }; 109 | std::vector dbg_printed_tensors; 110 | bool safetensors_ignore_unknown_dtype; 111 | 112 | 113 | public: 114 | /** 115 | * Construct a new ctx object 116 | * If use_gpu is true, the GPU backend will be used, otherwise the CPU backend will be used 117 | */ 118 | ctx(const ctx_params & params) : log_level(params.log_level), max_nodes(params.max_nodes) { 119 | ggml_log_set(log_cb, &log_level); 120 | safetensors_ignore_unknown_dtype = params.safetensors_ignore_unknown_dtype; 121 | backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); 122 | backend = params.use_gpu 123 | ? 
ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr) 124 | : nullptr; 125 | 126 | if (backend) { 127 | log(GGML_LOG_LEVEL_INFO, "%s: using %s backend\n", __func__, ggml_backend_name(backend)); 128 | backend_ptrs.push_back(backend); 129 | backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); 130 | } else { 131 | backend = backend_cpu; 132 | log(GGML_LOG_LEVEL_INFO, "%s: using CPU backend\n", __func__); 133 | } 134 | 135 | backend_ptrs.push_back(backend_cpu); 136 | backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); 137 | 138 | sched.reset( 139 | ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, true) 140 | ); 141 | 142 | buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); 143 | } 144 | 145 | /** 146 | * Get a weight tensor by name, can only be used after model is loaded. 147 | * Throws an exception if the tensor is not found. 148 | */ 149 | ggml_tensor * get_weight(const char *fmt, ...) { 150 | std::vector str(128); 151 | va_list va; 152 | va_start(va, fmt); 153 | vsnprintf(str.data(), 128, fmt, va); 154 | va_end(va); 155 | auto it = tensors.find(str.data()); 156 | if (it == tensors.end()) { 157 | throw std::runtime_error(string_format("weight tensor not found: %s", str.data())); 158 | } 159 | return it->second; 160 | } 161 | 162 | /** 163 | * Load a GGUF model file 164 | * The tensors will be loaded into the context and can be accessed via `ctx.get_weight(name)` 165 | * The GGUF metadata will be loaded into `ctx.ctx_gguf` 166 | */ 167 | void load_gguf(const char * fname) { 168 | ggml_context * meta = nullptr; 169 | 170 | gguf_init_params params = { 171 | /*.no_alloc = */ true, 172 | /*.ctx = */ &meta, 173 | }; 174 | 175 | gguf_context * ctx_gguf = gguf_init_from_file(fname, params); 176 | 177 | // load tensors 178 | const int n_tensors = gguf_get_n_tensors(ctx_gguf); 179 | ggml_init_params ggml_params = { 180 | /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), 181 | /*.mem_buffer =*/ NULL, 182 | /*.no_alloc =*/ true, 183 | }; 184 | 185 | ggml_context * ctx_data = ggml_init(ggml_params); 186 | auto fin = std::ifstream(fname, std::ios::binary); 187 | if (!fin) { 188 | ggml_free(meta); 189 | throw std::runtime_error("cannot open model file for loading tensors"); 190 | } 191 | 192 | // add tensors to context 193 | for (int i = 0; i < n_tensors; ++i) { 194 | const char * name = gguf_get_tensor_name(ctx_gguf, i); 195 | ggml_tensor * t = ggml_get_tensor(meta, name); 196 | ggml_tensor * cur = ggml_dup_tensor(ctx_data, t); 197 | ggml_set_name(cur, name); 198 | tensors.insert({name, cur}); 199 | } 200 | 201 | // alloc memory and offload data 202 | std::map offset_map; // empty map, use default value 203 | if (!load_tensors_to_backend(fin, offset_map, ctx_gguf, ctx_data)) { 204 | ggml_free(meta); 205 | throw std::runtime_error("failed to load tensors to backend"); 206 | } 207 | log(GGML_LOG_LEVEL_INFO, "%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname); 208 | ggml_free(meta); 209 | 210 | loaded_ggufs.push_back({ 211 | gguf_context_ptr(ctx_gguf), 212 | ggml_context_ptr(ctx_data), 213 | }); 214 | } 215 | 216 | /** 217 | * Load a Safetensors model file 218 | * The tensors will be loaded into the context and can be accessed via `ctx.get_weight(name)` 219 | * In some cases, the tensor name is too long and GGML won't accept it. You can provide a name_replace_map to replace the name. 
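     * (GGML rejects names of GGML_MAX_NAME characters or more, and the safetensors parser below
     * throws in that case, so long HF module paths have to be shortened with this map.)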
220 | * For example: 221 | * name_replace_map = {{".acoustic_residual_vector_quantizer", ".arvq"}} 222 | */ 223 | void load_safetensors(const char * fname, std::map name_replace_map) { 224 | auto fin = std::ifstream(fname, std::ios::binary); 225 | if (!fin) { 226 | throw std::runtime_error("cannot open model file: " + std::string(fname)); 227 | } 228 | 229 | uint64_t metadata_size = 0; 230 | fin.read(reinterpret_cast(&metadata_size), sizeof(metadata_size)); 231 | if (metadata_size < 2) { 232 | throw std::runtime_error("invalid metadata size, got " + std::to_string(metadata_size)); 233 | } 234 | 235 | std::vector buf(metadata_size); 236 | fin.read(buf.data(), metadata_size); 237 | if (!fin) { 238 | throw std::runtime_error("failed to read metadata"); 239 | } 240 | 241 | safetensors_json_parser parser(buf.data(), metadata_size, name_replace_map, safetensors_ignore_unknown_dtype); 242 | 243 | ggml_init_params ggml_params = { 244 | /*.mem_size =*/ (parser.tensors.size() + 1) * ggml_tensor_overhead(), 245 | /*.mem_buffer =*/ NULL, 246 | /*.no_alloc =*/ true, 247 | }; 248 | ggml_context * ctx_data = ggml_init(ggml_params); 249 | gguf_context * ctx_gguf = gguf_init_empty(); 250 | 251 | std::map offset_map; 252 | for (auto & tensor : parser.tensors) { 253 | ggml_tensor * t = ggml_new_tensor(ctx_data, tensor.type, 4, tensor.shape.data()); 254 | ggml_set_name(t, tensor.name.c_str()); 255 | gguf_add_tensor(ctx_gguf, t); 256 | tensors.insert({tensor.name, t}); 257 | offset_map.insert({t, parser.get_data_offset() + tensor.offset}); 258 | } 259 | 260 | // alloc memory and offload data 261 | if (!load_tensors_to_backend(fin, offset_map, ctx_gguf, ctx_data)) { 262 | throw std::runtime_error("failed to load tensors to backend"); 263 | } 264 | log(GGML_LOG_LEVEL_INFO, "%s: Loaded %d tensors from %s\n", __func__, (int)gguf_get_n_tensors(ctx_gguf), fname); 265 | 266 | loaded_ggufs.push_back({ 267 | gguf_context_ptr(ctx_gguf), 268 | ggml_context_ptr(ctx_data), 269 | }); 270 | } 271 | 272 | /** 273 | * Various utility functions for building a cgraph. 274 | * 275 | * This object will be provided to the user's builder function as the last argument. 276 | */ 277 | struct build_utils { 278 | ggml_context * gf_ctx; 279 | ggml_cgraph * gf; 280 | std::vector printed_tensors; 281 | build_utils(ggml_context * gf_ctx, ggml_cgraph * gf) : gf_ctx(gf_ctx), gf(gf) {} 282 | /** 283 | * Add an input tensor, this function does these steps: 284 | * 1. ggml_new_tensor_4d 285 | * 2. ggml_set_name 286 | * 3. ggml_set_input 287 | */ 288 | ggml_tensor * new_input(const char * name, ggml_type dtype, int64_t ne0, int64_t ne1 = 1, int64_t ne2 = 1, int64_t ne3 = 1) { 289 | ggml_tensor * t = ggml_new_tensor_4d(gf_ctx, dtype, ne0, ne1, ne2, ne3); 290 | ggml_set_name(t, name); 291 | ggml_set_input(t); 292 | return t; 293 | } 294 | /** 295 | * Mark this tensor as output, this function does these steps: 296 | * 1. ggml_set_name 297 | * 2. ggml_set_output 298 | * 3. ggml_build_forward_expand 299 | */ 300 | void mark_output(ggml_tensor * t, const char * name) { 301 | ggml_set_name(t, name); 302 | ggml_set_output(t); 303 | ggml_build_forward_expand(gf, t); 304 | } 305 | /** 306 | * Print this tensor as soon as it is computed, useful for debugging. 307 | * name is optional, if not provided, the existing name of the tensor will be used 308 | */ 309 | template 310 | void debug_print(ggml_tensor * t, Params&&... 
params) { 311 | std::string name = string_format(std::forward(params)...); 312 | if (t->flags) { 313 | // prevent renaming input/output tensor name by accident 314 | t = ggml_cpy(gf_ctx, t, ggml_dup_tensor(gf_ctx, t)); 315 | } 316 | mark_output(t, name.c_str()); 317 | printed_tensors.push_back({t, false}); 318 | } 319 | /** 320 | * Same with `debug_print` but also print the full tensor shape and data. 321 | */ 322 | template 323 | void debug_print_full(ggml_tensor * t, Params&&... params) { 324 | std::string name = string_format(std::forward(params)...); 325 | if (t->flags) { 326 | // prevent renaming input/output tensor name by accident 327 | t = ggml_cpy(gf_ctx, t, ggml_dup_tensor(gf_ctx, t)); 328 | } 329 | mark_output(t, name.c_str()); 330 | printed_tensors.push_back({t, true}); 331 | } 332 | }; 333 | 334 | /** 335 | * Build a cgraph using the given builder function. 336 | * 337 | * The built cgraph will be stored in `ctx.gf` 338 | */ 339 | void build_graph(std::function builder_fn) { 340 | ggml_free(ctx_gf); 341 | struct ggml_init_params params = { 342 | /*.mem_size =*/ buf_compute_meta.size(), 343 | /*.mem_buffer =*/ buf_compute_meta.data(), 344 | /*.no_alloc =*/ true, 345 | }; 346 | 347 | ctx_gf = ggml_init(params); 348 | ggml_backend_sched_reset(sched.get()); 349 | gf = ggml_new_graph_custom(ctx_gf, max_nodes, false); 350 | 351 | build_utils utils(ctx_gf, gf); 352 | 353 | builder_fn(ctx_gf, gf, utils); 354 | ggml_backend_sched_alloc_graph(sched.get(), gf); 355 | dbg_printed_tensors = std::move(utils.printed_tensors); 356 | } 357 | 358 | /** 359 | * Same as `build_graph` but without `build_utils` 360 | */ 361 | void build_graph(std::function builder_fn) { 362 | build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, build_utils & utils) { 363 | builder_fn(ctx_gf, gf); 364 | }); 365 | } 366 | 367 | /** 368 | * Compute the given cgraph 369 | */ 370 | ggml_status compute() { 371 | ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf); 372 | if (status == GGML_STATUS_SUCCESS) { 373 | for (auto & p : dbg_printed_tensors) { 374 | std::vector data(ggml_nbytes(p.t)); 375 | ggml_backend_tensor_get(p.t, data.data(), 0, ggml_nbytes(p.t)); 376 | ggml_easy::debug::print_tensor_shape(p.t); 377 | ggml_easy::debug::print_tensor_data(p.t, data.data(), p.full ? LONG_MAX : 3); 378 | } 379 | } 380 | return status; 381 | } 382 | 383 | /** 384 | * Set the data of a tensor by name 385 | */ 386 | void set_tensor_data(const std::string & name, const void * data) { 387 | ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); 388 | if (!t) { 389 | throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); 390 | } 391 | ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); 392 | } 393 | 394 | /** 395 | * Set the data of a tensor by name using a function. 
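     * The callback receives indices (i0, i1, i2, i3) in ggml's ne order, i0 being the innermost
     * dimension. Only GGML_TYPE_F32 tensors are supported; other types throw.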
396 | * 397 | * Example usage: 398 | * 399 | * ``` 400 | * ctx.set_tensor_data("x", [](int i0, int i1, int i2, int i3) { 401 | * return i0 + i1 + i2 + i3; 402 | * }); 403 | * ``` 404 | */ 405 | void set_tensor_data(const std::string & name, std::function data_fn) { 406 | ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); 407 | if (!t) { 408 | throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); 409 | } 410 | if (t->type != GGML_TYPE_F32) { 411 | throw std::runtime_error(string_format("tensor type must be GGML_TYPE_F32: %s", name.c_str())); 412 | } 413 | std::vector data(ggml_nelements(t)); 414 | for (int d3 = 0; d3 < t->ne[3]; ++d3) { 415 | for (int d2 = 0; d2 < t->ne[2]; ++d2) { 416 | for (int d1 = 0; d1 < t->ne[1]; ++d1) { 417 | for (int d0 = 0; d0 < t->ne[0]; ++d0) { 418 | int i = d3 * t->ne[2] + d2 * t->ne[1] + d1 * t->ne[0] + d0; 419 | data[i] = data_fn(d0, d1, d2, d3); 420 | } 421 | } 422 | } 423 | } 424 | ggml_backend_tensor_set(t, data.data(), 0, ggml_nbytes(t)); 425 | } 426 | 427 | /** 428 | * Get the data of a tensor by name. 429 | * 430 | * Example usage: 431 | * 432 | * ``` 433 | * auto result = ctx.get_tensor_data("result"); 434 | * ggml_tensor * result_tensor = result.first; 435 | * std::vector & result_data = result.second; 436 | * float * result_data_f32 = (float *) result_data.data(); 437 | * ``` 438 | */ 439 | std::pair> get_tensor_data(const std::string & name) { 440 | ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); 441 | if (!t) { 442 | throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); 443 | } 444 | std::vector data(ggml_nbytes(t)); 445 | ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); 446 | return std::make_pair(t, data); 447 | } 448 | 449 | ~ctx() { 450 | ggml_backend_buffer_free(buf); 451 | } 452 | 453 | private: 454 | bool load_tensors_to_backend(std::ifstream & fin, std::map & offset_map, gguf_context * ctx_gguf, ggml_context * ctx_data) { 455 | std::vector read_buf; 456 | const bool use_custom_offset = !offset_map.empty(); 457 | 458 | // alloc memory and offload data 459 | ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); 460 | buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft); 461 | ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); 462 | for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); ++i) { 463 | const char * name = gguf_get_tensor_name(ctx_gguf, i); 464 | ggml_tensor * cur = ggml_get_tensor(ctx_data, name); 465 | const size_t offset = use_custom_offset 466 | ? offset_map[cur] 467 | : gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i); 468 | log(GGML_LOG_LEVEL_DEBUG, "%s: Loading tensor \"%s\"\n", __func__, name); 469 | fin.seekg(offset, std::ios::beg); 470 | if (!fin) { 471 | log(GGML_LOG_LEVEL_ERROR, "failed to seek for tensor: %s", name); 472 | } 473 | int num_bytes = ggml_nbytes(cur); 474 | if (ggml_backend_buft_is_host(buft)) { 475 | // for the CPU and Metal backend, we can read directly into the tensor 476 | fin.read(reinterpret_cast(cur->data), num_bytes); 477 | } else { 478 | // read into a temporary buffer first, then copy to device memory 479 | read_buf.resize(num_bytes); 480 | fin.read(reinterpret_cast(read_buf.data()), num_bytes); 481 | ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); 482 | } 483 | } 484 | return true; 485 | } 486 | 487 | void log(ggml_log_level level, const char * format, ...) 
{ 488 | va_list args; 489 | va_start(args, format); 490 | log_impl(level, format, args); 491 | va_end(args); 492 | } 493 | 494 | void log_impl(ggml_log_level level, const char * format, va_list args) { 495 | va_list args_copy; 496 | va_copy(args_copy, args); 497 | char buffer[128]; 498 | int len = vsnprintf(buffer, 128, format, args); 499 | if (len < 128) { 500 | log_cb(level, buffer, &log_level); 501 | } else { 502 | char * buffer2 = new char[len + 1]; 503 | vsnprintf(buffer2, len + 1, format, args_copy); 504 | buffer2[len] = 0; 505 | log_cb(level, buffer2, &log_level); 506 | delete[] buffer2; 507 | } 508 | va_end(args_copy); 509 | } 510 | }; // struct ctx 511 | 512 | using gf_build_fn = std::function; 513 | 514 | //////////////////////////////////////// 515 | 516 | safetensors_json_parser::safetensors_json_parser( 517 | const char * json, size_t metadata_size, std::map name_replace_map, bool ignore_unknown_dtype 518 | ) : metadata_size(metadata_size), ignore_unknown_dtype(ignore_unknown_dtype) { 519 | size_t i = 0; 520 | state s = STATE_ROOT; 521 | std::vector buf; 522 | tensor cur_tensor; 523 | buf.reserve(128); 524 | auto i_pp = [&]() { 525 | if (++i > metadata_size) { 526 | throw std::runtime_error("unexpected end of JSON"); 527 | } 528 | return i - 1; 529 | }; 530 | auto pp_i = [&]() { 531 | if (++i > metadata_size) { 532 | throw std::runtime_error("unexpected end of JSON"); 533 | } 534 | return i; 535 | }; 536 | auto read_until = [&](char end) -> std::string { 537 | buf.clear(); i_pp(); 538 | while (json[i] != end) buf.push_back(json[i_pp()]); 539 | return std::string(buf.data(), buf.size()); 540 | }; 541 | auto read_number = [&]() -> std::string { 542 | buf.clear(); i_pp(); 543 | while ('0' <= json[i] && json[i] <= '9') buf.push_back(json[i_pp()]); 544 | return std::string(buf.data(), buf.size()); 545 | }; 546 | while (i < metadata_size) { 547 | char c = json[i]; 548 | if (i == 0) GGML_ASSERT(c == '{' && "json must start with open curly bracket"); 549 | 550 | // string 551 | if (c == '\"') { 552 | std::string key = read_until('\"'); 553 | 554 | if (s == STATE_ROOT) { 555 | if (key == "__metadata__") { 556 | s = STATE_OBJ_METADATA; 557 | i_pp(); 558 | continue; 559 | } else { 560 | cur_tensor.name = key; 561 | for (auto & p : name_replace_map) { 562 | string_replace_all(cur_tensor.name, p.first, p.second); 563 | } 564 | if (cur_tensor.name.empty()) { 565 | throw std::runtime_error("empty tensor name"); 566 | } 567 | if (cur_tensor.name.size() > GGML_MAX_NAME - 1) { 568 | throw std::runtime_error("tensor name too long: '" + cur_tensor.name + "'; please use name_replace_map to rename it"); 569 | } 570 | i_pp(); 571 | s = STATE_OBJ_TENSOR; 572 | continue; 573 | } 574 | } else if (s == STATE_OBJ_TENSOR) { 575 | if (key == "dtype") { 576 | GGML_ASSERT(json[pp_i()] == ':'); 577 | GGML_ASSERT(json[pp_i()] == '\"'); 578 | std::string value = read_until('\"'); 579 | /**/ if (value == "F32") cur_tensor.type = GGML_TYPE_F32; 580 | else if (value == "F16") cur_tensor.type = GGML_TYPE_F16; 581 | else if (value == "BF16") cur_tensor.type = GGML_TYPE_BF16; 582 | else if (ignore_unknown_dtype) cur_tensor.ignored = true; 583 | else throw std::runtime_error("unknown dtype: " + value); 584 | } else if (key == "shape") { 585 | GGML_ASSERT(json[pp_i()] == ':'); 586 | GGML_ASSERT(json[pp_i()] == '['); 587 | std::vector values; 588 | for (int j = 0; j < 4; j++) { 589 | std::string value = read_number(); 590 | if (value.empty()) break; 591 | values.push_back(std::stoll(value)); 592 | } 593 | 
GGML_ASSERT(values.size() >= 0); 594 | // flip column-major to row-major 595 | for (size_t j = 0; j < values.size(); j++) { 596 | cur_tensor.shape[j] = values[values.size() - j - 1]; 597 | } 598 | } else if (key == "data_offsets") { 599 | GGML_ASSERT(json[pp_i()] == ':'); 600 | GGML_ASSERT(json[pp_i()] == '['); 601 | std::string off_start = read_number(); 602 | GGML_ASSERT(!off_start.empty()); 603 | cur_tensor.offset = std::stoull(off_start); 604 | std::string off_end = read_number(); 605 | GGML_ASSERT(!off_end.empty()); // unused 606 | } 607 | } 608 | } 609 | 610 | // object 611 | else if (c == '{') { 612 | if (s == STATE_OBJ_METADATA) { 613 | // skip metadata object 614 | while (json[pp_i()] != '}') {} 615 | s = STATE_ROOT; 616 | } else if (s == STATE_OBJ_TENSOR) { 617 | // read next string 618 | } 619 | } else if (c == '}') { 620 | if (s == STATE_OBJ_TENSOR) { 621 | // cur_tensor.print(); // debug 622 | if (!cur_tensor.ignored) { 623 | tensors.push_back(cur_tensor); 624 | } 625 | cur_tensor = {}; 626 | s = STATE_ROOT; 627 | } 628 | } 629 | 630 | // ignore ',' and ':' 631 | i++; 632 | } 633 | } 634 | 635 | uint64_t safetensors_json_parser::get_data_offset() { 636 | // alignment: https://github.com/huggingface/safetensors/blob/7d5af853631628137a79341ddc5611d18a17f3fe/safetensors/src/tensor.rs#L202 637 | static const int alignment = 8; // bytes 638 | return GGML_PAD(8 + metadata_size, alignment); 639 | } 640 | 641 | //////////////////////////////////////// 642 | 643 | namespace debug { 644 | static void print_backend_buffer_info(ctx & gctx) { 645 | if (gctx.backend && gctx.buf) { 646 | auto buft_weight = ggml_backend_get_default_buffer_type(gctx.backend); 647 | size_t size_weight = ggml_backend_buffer_get_size(gctx.buf); 648 | if (size_weight > 1) { 649 | printf("%s: %10s weight buffer size = %8.2f MiB\n", __func__, 650 | ggml_backend_buft_name(buft_weight), 651 | size_weight / 1024.0 / 1024.0); 652 | } 653 | } 654 | for (size_t i = 0; i < gctx.backend_ptrs.size(); ++i) { 655 | ggml_backend_t backend = gctx.backend_ptrs[i]; 656 | ggml_backend_buffer_type_t buft = gctx.backend_buft[i]; 657 | size_t size_sched = ggml_backend_sched_get_buffer_size(gctx.sched.get(), backend); 658 | if (size_sched > 1) { 659 | printf("%s: %10s compute buffer size = %8.2f MiB\n", __func__, 660 | ggml_backend_buft_name(buft), 661 | size_sched / 1024.0 / 1024.0); 662 | } 663 | } 664 | } 665 | 666 | static void print_tensor_shape(ggml_tensor * t) { 667 | printf("%s.shape = [", t->name); 668 | for (int i = 0; i < ggml_n_dims(t); ++i) { 669 | printf("%" PRId64, t->ne[i]); 670 | if (i < ggml_n_dims(t) - 1) { 671 | printf(", "); 672 | } 673 | } 674 | printf("]\n"); 675 | } 676 | 677 | static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { 678 | ggml_type type = t->type; 679 | int64_t * ne = t->ne; 680 | size_t * nb = t->nb; 681 | for (int64_t i3 = 0; i3 < ne[3]; i3++) { 682 | printf("%s.data: [\n", t->name); 683 | for (int64_t i2 = 0; i2 < ne[2]; i2++) { 684 | if (i2 == n && ne[2] > 2*n) { 685 | printf(" ..., \n"); 686 | i2 = ne[2] - n; 687 | } 688 | printf(" [\n"); 689 | for (int64_t i1 = 0; i1 < ne[1]; i1++) { 690 | if (i1 == n && ne[1] > 2*n) { 691 | printf(" ..., \n"); 692 | i1 = ne[1] - n; 693 | } 694 | printf(" ["); 695 | for (int64_t i0 = 0; i0 < ne[0]; i0++) { 696 | if (i0 == n && ne[0] > 2*n) { 697 | printf("..., "); 698 | i0 = ne[0] - n; 699 | } 700 | size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; 701 | float v; 702 | if (type == GGML_TYPE_F16) { 703 | v = 
ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); 704 | } else if (type == GGML_TYPE_F32) { 705 | v = *(float *) &data[i]; 706 | } else if (type == GGML_TYPE_I32) { 707 | v = (float) *(int32_t *) &data[i]; 708 | } else if (type == GGML_TYPE_I16) { 709 | v = (float) *(int16_t *) &data[i]; 710 | } else if (type == GGML_TYPE_I8) { 711 | v = (float) *(int8_t *) &data[i]; 712 | } else { 713 | GGML_ABORT("fatal error"); 714 | } 715 | printf("%8.4f", v); 716 | if (i0 < ne[0] - 1) printf(", "); 717 | } 718 | printf("],\n"); 719 | } 720 | printf(" ],\n"); 721 | } 722 | printf(" ]\n"); 723 | //printf(" sum = %f\n", sum); 724 | } 725 | } 726 | } // namespace debug 727 | 728 | //////////////////////////////////////// 729 | 730 | std::string string_format(const char * fmt, ...) { 731 | va_list ap; 732 | va_list ap2; 733 | va_start(ap, fmt); 734 | va_copy(ap2, ap); 735 | int size = vsnprintf(NULL, 0, fmt, ap); 736 | GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT 737 | std::vector buf(size + 1); 738 | int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); 739 | GGML_ASSERT(size2 == size); 740 | va_end(ap2); 741 | va_end(ap); 742 | return std::string(buf.data(), size); 743 | } 744 | 745 | void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { 746 | if (search.empty()) { 747 | return; 748 | } 749 | std::string builder; 750 | builder.reserve(s.length()); 751 | size_t pos = 0; 752 | size_t last_pos = 0; 753 | while ((pos = s.find(search, last_pos)) != std::string::npos) { 754 | builder.append(s, last_pos, pos - last_pos); 755 | builder.append(replace); 756 | last_pos = pos + search.length(); 757 | } 758 | builder.append(s, last_pos, std::string::npos); 759 | s = std::move(builder); 760 | } 761 | 762 | } // namespace ggml_easy 763 | 764 | // 765 | // extension to ggml functions 766 | // 767 | 768 | // create tensor with all elements set to 1.0 769 | ggml_tensor * ggml_ones(ggml_context * ctx, int64_t ne0, int64_t ne1 = 1, int64_t ne2 = 1, int64_t ne3 = 1) { 770 | ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 771 | x = ggml_cos(ctx, ggml_scale(ctx, x, 0.0f)); // cos(0) = 1 772 | return ggml_repeat_4d(ctx, x, ne0, ne1, ne2, ne3); 773 | } 774 | -------------------------------------------------------------------------------- /demo/kyutai-mimi.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | #include "ggml-easy.h" 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | /** 9 | * (Stil WIP) This is my trial to reimplement the Mimi model from Kyutai using ggml, the code is based on HF transformers implementation. See "modeling_mimi.py" for the original code. 10 | * 11 | * To get the model (we are using safetensors directly, no need to convert to GGUF): 12 | * 1. Download the model.safetensors file from https://huggingface.co/kyutai/mimi 13 | * 2. Rename the "model.safetensors" to "mimi.safetensors" 14 | * 15 | * Note: do NOT upload the gguf to the internet, it is NOT compatible with llama.cpp and people will complain. 16 | * 17 | * --- 18 | * 19 | * For the ENCODER, it takes raw audio waveform as input and output audio codes. Steps are: 20 | * 1. Convert waveform to embeddings using mimi_encoder (SEANet encoder), basically just a bunch of Conv1d but the padding is quite tricky. 21 | * 2. Process the embeddings using a transformer, here we use an auto-aggressive one (causal mask). This is because Laurent told me that they only trained the model with auto-regressive setting. 22 | * 3. 
Quantize the embeddings using a residual vector quantizer (RVQ) to get the audio codes. The RVQ has 32 codebooks, one for semantic and 31 for acoustic. Doing this on ggml is a bit tricky because I need to reimplement euclidean distance from scratch. 23 | * 24 | * In the code below, we take 2048 samples of audio waveform as input (value = 1.0f), expected output is 2 tokens (according to python implementation). 25 | * 26 | * Python code: 27 | * model = MimiModel.from_pretrained("/Users/ngxson/work/models/mimi") 28 | * input_values = torch.ones((1, 1, 2048)) 29 | * encoder_outputs = model.encode(input_values) # this should match the output of ggml 30 | * 31 | * --- 32 | * 33 | * For the DECODER, we simply do the reverse of the above steps. 34 | * The good thing is that this time, we don't need to care about euclidean distance. 35 | * 36 | * Python code: 37 | * model = MimiModel.from_pretrained("/Users/ngxson/work/models/mimi") 38 | * input_values = torch.tensor([[ [i, i+1, i+2] for i in range(0, 3*32, 3) ]], dtype=torch.long) 39 | * audio_values = model.decode(input_values)[0] # this should match the output of ggml 40 | * 41 | * Expected output: 42 | * torch.Size([1, 1, 5760]) 43 | * tensor([[[ 0.0117, 0.0130, -0.0007, ..., -0.1295, -0.1258, -0.1343]]]) 44 | */ 45 | 46 | struct mimi_config_t { 47 | bool causal = true; 48 | int max_position_embeddings = 8000; 49 | int num_hidden_layers = 8; 50 | int n_embd = 512; 51 | int n_ffn = 2048; 52 | int n_head = 8; 53 | int n_head_kv = 8; 54 | int n_rot = 64; 55 | float norm_eps = 1e-5; 56 | float rope_theta = 10000.0f; 57 | int sliding_window = 250; 58 | std::array upsampling_ratio = {8, 6, 5, 4}; 59 | std::array downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio 60 | // vector quantizer 61 | float frame_rate = 12.5; 62 | int audio_channels = 1; 63 | int codebook_size = 2048; 64 | int codebook_dim = 256; 65 | int n_semantic_components = 1; 66 | int n_acoustic_components = 31; 67 | // decode 68 | float trim_right_ratio = 1.0f; 69 | } mimi_config; 70 | 71 | 72 | /////////////////////////////////////////////////////////////////////////// 73 | // extension to ggml.h 74 | // TODO: add these ops to the library (ofc with a more optimized kernel) 75 | 76 | 77 | // mode: (0) constant, (1) reflect, (2) replicate, (3) circular 78 | // value is only used in "constant" 79 | // only "constant" with 0.0f and "replicate" are implemented here 80 | static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode, 81 | int64_t pad_left, int64_t pad_right, float value = 0.0f) { 82 | GGML_ASSERT(value == 0.0f); // we can technically use ggml_arange, but for simplication we only support 0.0f 83 | GGML_ASSERT(mode == 0 || mode == 2); 84 | if (pad_left > 0) { 85 | ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]); 86 | if (mode == 0) { 87 | tmp = ggml_scale(ctx0, tmp, value); 88 | } else if (mode == 2) { 89 | ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column 90 | tmp = ggml_repeat(ctx0, elem, tmp); 91 | } 92 | x = ggml_concat(ctx0, tmp, x, 0); 93 | } 94 | if (pad_right > 0) { 95 | ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]); 96 | if (mode == 0) { 97 | tmp = ggml_scale(ctx0, tmp, value); 98 | } else if (mode == 2) { 99 | int64_t last = x->ne[0] - 1; 100 | ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column 101 | tmp = ggml_repeat(ctx0, elem, tmp); 102 | } 103 | x = ggml_concat(ctx0, x, tmp, 
0); 104 | } 105 | return x; 106 | } 107 | 108 | static ggml_tensor * ggml_argmin(ggml_context * ctx0, ggml_tensor * x) { 109 | ggml_tensor * tmp = ggml_scale(ctx0, x, -1.0f); 110 | return ggml_argmax(ctx0, tmp); 111 | } 112 | 113 | // lookup nearest vector in codebook based on euclidean distance 114 | // return index of the vector in codebook, single element with I32 type 115 | static ggml_tensor * ggml_lookup_vec(ggml_context * ctx0, ggml_tensor * codebook, ggml_tensor * x) { 116 | ggml_tensor * tmp = ggml_add(ctx0, codebook, ggml_scale(ctx0, x, -1.0f)); // a - x 117 | tmp = ggml_mul(ctx0, tmp, tmp); // (a - x) ** 2 118 | tmp = ggml_sum_rows(ctx0, tmp); 119 | tmp = ggml_sqrt(ctx0, tmp); 120 | tmp = ggml_cont(ctx0, ggml_transpose(ctx0, tmp)); 121 | // villain version of argmin :-) 122 | tmp = ggml_argmax(ctx0, ggml_scale(ctx0, tmp, -1.0f)); 123 | GGML_ASSERT(ggml_nelements(tmp) == 1); 124 | return tmp; 125 | } 126 | 127 | // lookup vectors in codebook based on euclidean distance 128 | // return indices of the vectors in codebook, 1D tensor with I32 type 129 | static ggml_tensor * ggml_lookup_vectors(ggml_easy::ctx::build_utils & utils, ggml_context * ctx0, ggml_tensor * codebook, ggml_tensor * list_vec, ggml_tensor * out, size_t offset) { 130 | int64_t n_col = list_vec->ne[0]; 131 | int64_t n_row = list_vec->ne[1]; 132 | for (int64_t ir = 0; ir < n_row; ir++) { 133 | ggml_tensor * row = ggml_view_1d(ctx0, list_vec, n_col, ir*n_col*ggml_element_size(list_vec)); 134 | ggml_tensor * idx = ggml_lookup_vec(ctx0, codebook, row); 135 | ggml_tensor * dst = ggml_view_1d(ctx0, out, 1, offset + ir*ggml_element_size(out)); 136 | ggml_build_forward_expand(utils.gf, ggml_cpy(ctx0, idx, dst)); 137 | } 138 | return out; 139 | } 140 | 141 | 142 | /////////////////////////////////////////////////////////////////////////// 143 | 144 | 145 | static int64_t div_ceil(int64_t a, int64_t b) { 146 | return a / b + (a % b ? 1 : 0); 147 | } 148 | 149 | static ggml_tensor * mimi_conv_1d(ggml_easy::ctx::build_utils & utils, ggml_context * ctx0, ggml_tensor * x, 150 | ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool pad_zero = true) { 151 | int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1; 152 | int64_t p_total = kernel_size - stride; // padding total 153 | int64_t p_half = p_total / 2; 154 | int64_t is_p_odd = p_total % 2; // is padding odd 155 | 156 | int64_t n_frames = div_ceil(x->ne[0] - kernel_size + p_total, stride); 157 | int64_t ideal_len = n_frames * stride + kernel_size - p_total; 158 | int64_t p_extra = ideal_len - x->ne[0]; 159 | 160 | int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra; 161 | int64_t p_left = p_total - (mimi_config.causal ? 0 : p_half); 162 | 163 | x = ggml_pad_ext(ctx0, x, pad_zero ? 
0 : 2, p_left, p_right); 164 | // utils.debug_print(x, "mimi_conv_1d_padded"); 165 | 166 | kernel = ggml_cast(ctx0, kernel, GGML_TYPE_F16); // TODO: do this at conversion time 167 | x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation); 168 | if (bias) { 169 | bias = ggml_cont(ctx0, ggml_transpose(ctx0, bias)); // TODO: do this at conversion time 170 | x = ggml_add(ctx0, x, bias); 171 | } 172 | ggml_set_name(x, "mimi_conv_1d"); 173 | return x; 174 | }; 175 | 176 | static ggml_tensor * mimi_conv_transpose_1d(ggml_easy::ctx::build_utils & utils, ggml_context * ctx0, ggml_tensor * x, 177 | ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool depthwise) { 178 | GGML_ASSERT(x->ne[1] == kernel->ne[2]); 179 | int64_t n_rows = x->ne[1]; 180 | int64_t kernel_size = kernel->ne[0]; 181 | int64_t p_total = kernel_size - stride; // padding total 182 | 183 | int64_t p_right = mimi_config.causal 184 | ? (float)p_total / mimi_config.trim_right_ratio 185 | : p_total / 2; 186 | int64_t p_left = p_total - p_right; 187 | 188 | ggml_tensor * out = nullptr; 189 | 190 | kernel = ggml_cast(ctx0, kernel, GGML_TYPE_F16); // TODO: do this at conversion time 191 | 192 | if (depthwise) { 193 | for (int64_t ir = 0; ir < n_rows; ir++) { 194 | ggml_tensor * row = ggml_view_1d(ctx0, x, 195 | x->ne[0], ir*x->ne[0]*ggml_element_size(x)); 196 | ggml_tensor * krn = ggml_view_1d(ctx0, kernel, 197 | kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel)); 198 | if (ir == 0) { 199 | ggml_set_name(krn, "krn"); 200 | ggml_easy::debug::print_tensor_shape(krn); 201 | } 202 | row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation); 203 | if (ir == 0) { 204 | ggml_set_name(row, "ggml_conv_transpose_1d __________"); 205 | ggml_easy::debug::print_tensor_shape(row); 206 | } 207 | // unpad (remove p_right and p_left columns) 208 | row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row)); 209 | 210 | // TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc 211 | out = out ? ggml_concat(ctx0, out, row, 1) : row; 212 | } 213 | 214 | } else { 215 | out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation); 216 | // unpad 217 | out = ggml_view_2d(ctx0, out, 218 | out->ne[0] - p_total, out->ne[1], 219 | out->nb[1], p_left*ggml_element_size(out)); 220 | } 221 | 222 | if (bias) { 223 | bias = ggml_cont(ctx0, ggml_transpose(ctx0, bias)); // TODO: do this at conversion time 224 | out = ggml_add(ctx0, out, bias); 225 | } 226 | 227 | return out; 228 | } 229 | 230 | // based on MimiEncoder 231 | // SEANet encoder as used by Mimi. 232 | struct mimi_encoder_decoder { 233 | ggml_easy::ctx & ctx; 234 | struct layer { 235 | bool is_elu = false; 236 | bool is_resnet = false; 237 | bool is_transposed_conv = false; 238 | ggml_tensor * conv_0_w; 239 | ggml_tensor * conv_0_b; 240 | ggml_tensor * conv_1_w; 241 | ggml_tensor * conv_1_b; 242 | int stride = 1; 243 | }; 244 | int dilation_growth_rate = 2; // TODO: unused? 
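    // HF layer indices (see load_encoder()/load_decoder() below): layer 0 is the input conv,
    // layer 13 an ELU and layer 14 the output conv. Each entry of repeated_pattern marks where a group starts:
    //   encoder group: resnet block (block.1 + block.3) -> ELU -> strided conv (strides 4, 5, 6, 8)
    //   decoder group: ELU -> transposed conv (strides 8, 6, 5, 4) -> resnet block (block.1 + block.3)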
245 | std::vector layers; 246 | 247 | std::array repeated_pattern = {1, 4, 7, 10}; 248 | 249 | mimi_encoder_decoder(ggml_easy::ctx & ctx) : ctx(ctx) {} 250 | 251 | void load_encoder() { 252 | layers.push_back({ 253 | .conv_0_w = ctx.get_weight("encoder.layers.0.conv.weight"), 254 | .conv_0_b = ctx.get_weight("encoder.layers.0.conv.bias"), 255 | }); 256 | for (int i = 0; i < (int)repeated_pattern.size(); ++i) { 257 | int i_start = repeated_pattern[i]; 258 | // residual layers 259 | layers.push_back({ 260 | .is_resnet = true, 261 | .conv_0_w = ctx.get_weight("encoder.layers.%d.block.1.conv.weight", i_start), 262 | .conv_0_b = ctx.get_weight("encoder.layers.%d.block.1.conv.bias", i_start), 263 | .conv_1_w = ctx.get_weight("encoder.layers.%d.block.3.conv.weight", i_start), 264 | .conv_1_b = ctx.get_weight("encoder.layers.%d.block.3.conv.bias", i_start), 265 | }); 266 | // downsampling layers 267 | layers.push_back({ 268 | .is_elu = true, // layer (i_start + 1) 269 | }); 270 | layers.push_back({ 271 | .conv_0_w = ctx.get_weight("encoder.layers.%d.conv.weight", i_start + 2), 272 | .conv_0_b = ctx.get_weight("encoder.layers.%d.conv.bias", i_start + 2), 273 | .stride = mimi_config.downsampling_ratio[i], 274 | }); 275 | } 276 | layers.push_back({ 277 | .is_elu = true, // layer 13 278 | }); 279 | layers.push_back({ 280 | .conv_0_w = ctx.get_weight("encoder.layers.14.conv.weight"), 281 | .conv_0_b = ctx.get_weight("encoder.layers.14.conv.bias"), 282 | }); 283 | } 284 | 285 | void load_decoder() { 286 | layers.push_back({ 287 | .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"), 288 | .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"), 289 | }); 290 | for (int i = 0; i < (int)repeated_pattern.size(); ++i) { 291 | int i_start = repeated_pattern[i]; 292 | // upsampling layers 293 | layers.push_back({ 294 | .is_elu = true, // layer (i_start) 295 | }); 296 | layers.push_back({ 297 | .is_transposed_conv = true, 298 | .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), 299 | .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), 300 | .stride = mimi_config.upsampling_ratio[i], 301 | }); 302 | // residual layers 303 | layers.push_back({ 304 | .is_resnet = true, 305 | .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2), 306 | .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2), 307 | .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2), 308 | .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2), 309 | }); 310 | } 311 | layers.push_back({ 312 | .is_elu = true, // layer 13 313 | }); 314 | layers.push_back({ 315 | .conv_0_w = ctx.get_weight("decoder.layers.14.conv.weight"), 316 | .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"), 317 | }); 318 | } 319 | 320 | ggml_tensor * forward(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input) { 321 | ggml_tensor * x = input; 322 | 323 | // int i = 0; // for debugging 324 | for (auto & layer : layers) { 325 | if (layer.is_elu) { 326 | x = ggml_elu(ctx0, x); 327 | } else if (layer.is_resnet) { 328 | ggml_tensor * residual = x; 329 | x = ggml_elu(ctx0, x); 330 | ggml_easy::debug::print_tensor_shape(x); 331 | ggml_easy::debug::print_tensor_shape(layer.conv_0_w); 332 | x = mimi_conv_1d(utils, ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1); 333 | x = ggml_elu(ctx0, x); 334 | x = mimi_conv_1d(utils, ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1); 335 | x = ggml_add(ctx0, x, residual); 
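                // with stride 1, mimi_conv_1d pads both convs back to the input length,
                // so this element-wise residual add is shape-safe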

struct mimi_transformer {
    struct layer {
        ggml_tensor * inp_norm_w;
        ggml_tensor * inp_norm_b;

        ggml_tensor * attn_q;
        ggml_tensor * attn_k;
        ggml_tensor * attn_v;
        ggml_tensor * attn_o;
        ggml_tensor * attn_post_norm_w;
        ggml_tensor * attn_post_norm_b;
        ggml_tensor * attn_layer_scale;

        ggml_tensor * ffn_up;
        ggml_tensor * ffn_down;
        ggml_tensor * mlp_layer_scale;
    };
    std::vector<layer> layers;

    mimi_transformer(ggml_easy::ctx & ctx, const char * prefix, int n_layers) {
        for (int il = 0; il < n_layers; il++) {
            layers.push_back({
                .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il),
                .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il),

                .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il),
                .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il),
                .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il),
                .attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il),
                .attn_post_norm_w = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il),
                .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il),
                .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il),

                .ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il),
                .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il),
                .mlp_layer_scale = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il),
            });
        }
    }

    ggml_tensor * forward(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input, ggml_tensor * inp_pos) {
        int n_tokens = input->ne[1];
        ggml_tensor * x = input;

        auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
            x = ggml_norm(ctx0, x, mimi_config.norm_eps);
            x = ggml_mul(ctx0, x, w);
            x = ggml_add(ctx0, x, b);
            return x;
        };

        // TODO: do this at conversion time, see LlamaModel.permute in convert_hf_to_gguf.py
        auto llama_permute = [&](ggml_tensor * w) {
            int n_head = mimi_config.n_head;
            ggml_tensor * tmp = ggml_reshape_4d(ctx0, w, w->ne[0], w->ne[1] / n_head / 2, 2, n_head);
            tmp = ggml_permute(ctx0, tmp, 0, 2, 1, 3);
            tmp = ggml_cont(ctx0, tmp);
            return ggml_reshape_2d(ctx0, tmp, w->ne[0], w->ne[1]);
        };
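
        // llama_permute reorders the rows of the Q/K projection weights so that ggml's default
        // rope (used below with mode 0, which rotates interleaved pairs) matches the HF
        // checkpoint, which stores each head's dims in the "rotate half" layout. Conceptually,
        // per head, rows [x0 .. x_{k-1}, x_k .. x_{2k-1}] become [x0, x_k, x1, x_{k+1}, ...]
        // with k = head_dim / 2. This appears to be the same transform as LlamaModel.permute in
        // convert_hf_to_gguf.py (see the TODO above); doing it at graph-build time keeps the
        // conversion script unchanged, at the cost of extra permute/cont nodes per layer.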

        ggml_tensor * residual = input;

        int i = 0; // for debugging
        for (auto & layer : layers) {
            residual = x;

            // input layer norm
            x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b);

            // self attention
            {
                ggml_tensor * q = ggml_mul_mat(ctx0, llama_permute(layer.attn_q), x);
                ggml_tensor * k = ggml_mul_mat(ctx0, llama_permute(layer.attn_k), x);
                ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x);

                int n_embd_head = mimi_config.n_embd / mimi_config.n_head;
                q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens);
                k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens);
                v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens);

                int n_rot = n_embd_head;
                q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0);
                q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3));
                // utils.debug_print(q, "q rope");

                k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0);
                k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));
                // utils.debug_print(k, "k rope");

                ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
                ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp
                kq = ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head));
                ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens);
                kq = ggml_soft_max_inplace(ctx0, kq_masked);
                // utils.debug_print(kq, "kq softmax");

                v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));

                ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
                kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head);
                kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
                kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens);
                // utils.debug_print(kqv, "kqv");
                // utils.debug_print(ggml_sum(ctx0, kqv), "kqv_sum");

                x = ggml_mul_mat(ctx0, layer.attn_o, kqv);
            }

            // residual
            x = ggml_mul(ctx0, x, layer.attn_layer_scale);
            x = ggml_add(ctx0, x, residual);
            // utils.debug_print(x, "after_attn_%d", i);

            residual = x;
            x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b);

            // mlp
            {
                x = ggml_mul_mat(ctx0, layer.ffn_up, x);
                x = ggml_gelu(ctx0, x);
                x = ggml_mul_mat(ctx0, layer.ffn_down, x);
            }

            // residual
            x = ggml_mul(ctx0, x, layer.mlp_layer_scale);
            x = ggml_add(ctx0, x, residual);
            // utils.debug_print(x, "output_layer_%d", i);
            // utils.debug_print(ggml_sum(ctx0, x), "output_layer_%d_sum", i); i++;
        }

        return x;
    }
};

struct mimi_residual_vector_quantizer {
    struct component {
        ggml_tensor * codebook_embed_sum;
        ggml_tensor * codebook_cluster_usage;
        ggml_tensor * get_embd(ggml_context * ctx0) {
            // TODO: do this at conversion time
            ggml_tensor * tmp = ggml_cont(ctx0, ggml_transpose(ctx0, codebook_cluster_usage));
            tmp = ggml_clamp(ctx0, tmp, mimi_config.norm_eps, FLT_MAX);
            return ggml_div(ctx0, codebook_embed_sum, tmp);
        }
    };
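
    // get_embd() reconstructs the actual codebook vectors from what is stored in the checkpoint:
    // embed_sum holds the accumulated sum of the vectors assigned to each code and cluster_usage
    // holds how many vectors contributed, so embed = embed_sum / max(cluster_usage, eps). This
    // matches how EMA-trained VQ codebooks are typically stored; precomputing the division at
    // conversion time (per the TODO above) would remove these extra nodes from the graph.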
ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook.cluster_usage", i), 509 | }); 510 | } 511 | acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight"); 512 | acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight"); 513 | for (int i = 0; i < mimi_config.n_acoustic_components; i++) { 514 | acoustic_components.push_back({ 515 | .codebook_embed_sum = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook.embed_sum", i), 516 | .codebook_cluster_usage = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook.cluster_usage", i), 517 | }); 518 | } 519 | } 520 | 521 | // 🆘🆘🆘🆘🆘 FIXME: this does not work correcly, about 50% of the output codes are incorrect 522 | ggml_tensor * encode(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input) { 523 | int64_t n_embd = input->ne[1]; 524 | int64_t n_codes_per_embd = (semantic_components.size() + acoustic_components.size()); 525 | ggml_tensor * codes = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_embd, n_codes_per_embd); 526 | ggml_set_input(codes); 527 | ggml_set_name(codes, "codes"); 528 | 529 | size_t pos = 0; 530 | { 531 | // semantic 532 | ggml_tensor * proj = ggml_reshape_2d(ctx0, semantic_inp_proj, 533 | semantic_inp_proj->ne[1], semantic_inp_proj->ne[2]); // TODO: do this at conversion time 534 | ggml_tensor * x = ggml_mul_mat(ctx0, proj, input); 535 | for (size_t i = 0; i < semantic_components.size(); i++) { 536 | ggml_tensor * codebook = semantic_components[i].get_embd(ctx0); 537 | codes = ggml_lookup_vectors(utils, ctx0, codebook, x, codes, pos); 538 | ggml_build_forward_expand(utils.gf, codes); 539 | pos += n_embd*ggml_element_size(codes); 540 | } 541 | } 542 | 543 | { 544 | // acoustic 545 | ggml_tensor * proj = ggml_reshape_2d(ctx0, acoustic_inp_proj, 546 | acoustic_inp_proj->ne[1], acoustic_inp_proj->ne[2]); // TODO: do this at conversion time 547 | ggml_tensor * x = ggml_mul_mat(ctx0, proj, input); 548 | for (size_t i = 0; i < acoustic_components.size(); i++) { 549 | ggml_tensor * codebook = acoustic_components[i].get_embd(ctx0); 550 | codes = ggml_lookup_vectors(utils, ctx0, codebook, x, codes, pos); 551 | ggml_build_forward_expand(utils.gf, codes); 552 | pos += n_embd*ggml_element_size(codes); 553 | } 554 | } 555 | 556 | return codes; 557 | } 558 | 559 | // the input has shape [n_codes, n_codes_per_embd] 560 | // first row is semantic, the rest are acoustic 561 | // example: [ [semantic], [acoustic1], [acoustic2], ... 

    // the input has shape [n_codes, n_codes_per_embd]
    // first row is semantic, the rest are acoustic
    // example: [ [semantic], [acoustic1], [acoustic2], ... ]
    ggml_tensor * decode(ggml_context * ctx0, ggml_easy::ctx::build_utils & utils, ggml_tensor * input) {
        GGML_ASSERT(input->type == GGML_TYPE_I32);

        size_t n_semantic = semantic_components.size();
        int64_t n_codes_per_embd = (n_semantic + acoustic_components.size());
        int64_t n_codes = input->ne[0] / n_codes_per_embd;

        GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0);

        ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
        ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes);
        out_s = ggml_scale(ctx0, out_s, 0.0f); // clear
        out_a = ggml_scale(ctx0, out_a, 0.0f); // clear

        for (size_t ir = 0; ir < n_codes_per_embd; ir++) {
            ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input));
            if (ir < n_semantic) {
                // semantic
                ggml_tensor * codebook = semantic_components[ir].get_embd(ctx0);
                ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
                out_s = ggml_add(ctx0, out_s, embd);
            } else {
                // acoustic
                ggml_tensor * codebook = acoustic_components[ir-n_semantic].get_embd(ctx0);
                ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row);
                out_a = ggml_add(ctx0, out_a, embd);
            }
        }

        ggml_tensor * proj_s = ggml_reshape_2d(ctx0, semantic_out_proj,
            semantic_out_proj->ne[1], semantic_out_proj->ne[2]); // TODO: do this at conversion time
        ggml_tensor * proj_a = ggml_reshape_2d(ctx0, acoustic_out_proj,
            acoustic_out_proj->ne[1], acoustic_out_proj->ne[2]); // TODO: do this at conversion time

        out_s = ggml_mul_mat(ctx0, proj_s, out_s);
        out_a = ggml_mul_mat(ctx0, proj_a, out_a);

        return ggml_add(ctx0, out_s, out_a);
    }
};

int main() {
    ggml_easy::ctx_params params;
    //params.log_level = GGML_LOG_LEVEL_DEBUG;
    params.max_nodes = 1024*16;
    params.use_gpu = false;
    ggml_easy::ctx ctx(params);

    // ctx.load_gguf("mimi.gguf");
    ctx.load_safetensors("mimi.safetensors", {
        {".acoustic_residual_vector_quantizer", ".acoustic_rvq"},
        {".semantic_residual_vector_quantizer", ".semantic_rvq"},
    });

    // optional: print backend buffer info
    ggml_easy::debug::print_backend_buffer_info(ctx);

    mimi_encoder_decoder encoder(ctx);
    mimi_encoder_decoder decoder(ctx);
    mimi_transformer encoder_transformer(ctx, "encoder", 8);
    mimi_transformer decoder_transformer(ctx, "decoder", 8);
    mimi_residual_vector_quantizer quantizer(ctx);

    encoder.load_encoder();
    decoder.load_decoder();

    // create cgraph
    ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf, auto & utils) {
        ggml_tensor * input = utils.new_input("input", GGML_TYPE_F32, 2048);

        // encoder
        {
            // SEANET encoder
            ggml_tensor * embeddings = encoder.forward(ctx_gf, utils, input);
            utils.debug_print(embeddings, "embeddings");

            // transformer
            int n_pos = embeddings->ne[0];
            ggml_tensor * pos_enc = utils.new_input("pos_enc", GGML_TYPE_I32, n_pos);
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = encoder_transformer.forward(ctx_gf, utils, embeddings, pos_enc);
            utils.debug_print(embeddings, "embeddings_after_transformer");

            // downsample
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = mimi_conv_1d(utils, ctx_gf, embeddings, ctx.get_weight("downsample.conv.weight"), nullptr, 2, 1, false);
            utils.debug_print(embeddings, "downsample");

            // residual vector quantizer
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = quantizer.encode(ctx_gf, utils, embeddings);

            //utils.debug_print_full(embeddings, "output_codes");
            utils.mark_output(embeddings, "output_codes");
        }
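
        // Note: the decoder half below is independent of the encoder half above -- it reads its
        // codes from the separate "inp_dec" input (filled with dummy values in main) rather than
        // from "output_codes", so both halves can be exercised and inspected in a single run.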

        // decoder
        {
            ggml_tensor * inp_dec = utils.new_input("inp_dec", GGML_TYPE_I32, 3 * 32);
            ggml_tensor * embeddings = quantizer.decode(ctx_gf, utils, inp_dec);
            utils.debug_print(embeddings, "read from codebook");

            // upsample
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = mimi_conv_transpose_1d(utils, ctx_gf, embeddings, ctx.get_weight("upsample.conv.weight"), nullptr, 2, 1, true);
            utils.debug_print(embeddings, "upscaled");

            // transformer
            int n_pos = embeddings->ne[0];
            ggml_tensor * pos_dec = utils.new_input("pos_dec", GGML_TYPE_I32, n_pos);
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            embeddings = decoder_transformer.forward(ctx_gf, utils, embeddings, pos_dec);
            utils.debug_print(embeddings, "embeddings_after_transformer");

            // SEANET decoder
            embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings));
            ggml_tensor * output = decoder.forward(ctx_gf, utils, embeddings);
            utils.debug_print(output, "output decoded");
        }
    });

    // equivalent to python code: torch.ones((1, 1, 2048))
    ctx.set_tensor_data("input", [](int, int, int, int) { return 1.0f; });

    // position data
    std::vector<int32_t> pos_data(1024);
    for (int i = 0; i < (int)pos_data.size(); i++) {
        pos_data[i] = i;
    }
    ctx.set_tensor_data("pos_enc", pos_data.data());
    ctx.set_tensor_data("pos_dec", pos_data.data());

    // inp_dec data
    // equivalent to python code: torch.tensor([[ [i, i+1, i+2] for i in range(0, 3*32, 3) ]], dtype=torch.long)
    std::vector<int32_t> inp_dec(3 * 32);
    for (size_t i = 0; i < inp_dec.size(); i++) {
        inp_dec[i] = i;
    }
    ctx.set_tensor_data("inp_dec", inp_dec.data());

    ctx.compute();

    // print result
    //ggml_easy::debug::print_tensor_data(result_tensor, result_data.data());

    return 0;
}
--------------------------------------------------------------------------------