├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── LICENSE ├── README.md ├── analysis_options.yaml ├── build.dart ├── example ├── embedding.dart ├── main.dart ├── rag │ ├── .gitignore │ ├── CHANGELOG.md │ ├── README.md │ ├── _config.json │ ├── analysis_options.yaml │ ├── bin │ │ ├── ingest.dart │ │ └── rag.dart │ ├── lib │ │ ├── chroma.dart │ │ └── common.dart │ ├── pubspec.yaml │ └── test │ │ └── test_chroma.dart ├── server.dart └── simple.dart ├── ffigen.yaml ├── lib ├── embedding.dart ├── llama_cpp.dart └── src │ ├── common.dart │ ├── embedding.dart │ ├── ffi.dart │ ├── lib_llama_cpp.dart │ ├── llama_params.dart │ ├── native_llama.dart │ └── sampling.dart ├── pubspec.yaml └── test ├── data ├── text.txt └── values.txt └── test_embedding.dart /.gitignore: -------------------------------------------------------------------------------- 1 | .dart_tool 2 | pubspec.lock 3 | lib/libllama_cpp.so 4 | .idea 5 | *.gguf 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src"] 2 | path = src 3 | url = https://gitee.com/lindeer/llama.cpp.git 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 1.2.0 2 | 3 | - upgrade native_assets_cli to 0.8.0 4 | - upgrade llama.cpp to 8854044 5 | 6 | ## 1.1.0 7 | 8 | - Upgrade llama.cpp to 60ed04cf to support qwen1.5. 9 | - Code refine. 10 | - Support embedding. 11 | - Support GPU building. 12 | - Add a RAG example. 13 | - Upgrade llama.cpp to 8c0e8f4e. 14 | 15 | ## 1.0.0 16 | 17 | - Upgrade Dart to 3.3.0. 18 | - Upgrade dependencies and fix issues. 19 | 20 | ## 0.9.0 21 | 22 | - Initial version. 23 | - Integrate with `native_assets_cli`. 24 | - Native helper classes. 25 | - Porting token sampling from `common/sampling.cpp`. 26 | - Passing LLM params to the isolate. 27 | - Token string as raw bytes stream. 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A Dart binding for the popular LLM inference framework [llama.cpp](https://github.com/ggerganov/llama.cpp), bringing AI to the Dart world!
2 | 
3 | ## Note
4 | 
5 | Commit `8854044` of `llama.cpp` is the latest version that builds a single shared library. After that version, separate `libllama.so` and `libggml.so` files are produced, but Dart native assets currently do not support loading multiple shared libraries at the same time.
6 | 
7 | ## Overview
8 | 
9 | - Text generation in a separate Dart isolate.
10 | - Stream-based output in Dart style.
11 | - Integration with `native_assets_cli`.
12 | - Extremely simple usage.
13 | - Support for both LLM and embedding models.
14 | 
15 | ## Trying examples
16 | 
17 | ```
18 | git clone https://github.com/lindeer/llama-cpp.git
19 | cd llama-cpp
20 | git submodule update --init --recursive
21 | dart pub get
22 | ```
23 | 
24 | Just run in the console:
25 | ```
26 | dart --enable-experiment=native-assets run example/main.dart "/path/to/your/LLM.gguf" "your prompt"
27 | ```
28 | 
29 | or run a simple HTTP server:
30 | ```
31 | dart --enable-experiment=native-assets run example/server.dart "/path/to/your/LLM.gguf"
32 | ```
33 | 
34 | or run an embedding model:
35 | ```
36 | dart --enable-experiment=native-assets run example/embedding.dart "/path/to/your/embedding.gguf" "your text line1
37 | your text line2"
38 | ```
39 | 
40 | There is also a minimal RAG example in `example/rag/`, with completely local data and models, inspired by [privateGPT](https://github.com/imartinez/privateGPT):
41 | 
42 | 1. Set up a chroma server:
43 | ```
44 | pip install chromadb
45 | uvicorn chromadb.app:app --reload --workers 1 --host 0.0.0.0 --port 8000
46 | ```
47 | 
48 | 2. `cd example/rag`, then create a `config.json` and configure your local models:
49 | ```json
50 | {
51 |   "gpt_model": "/your/local/gpt/model",
52 |   "embedding_model": "/your/local/embedding/model"
53 | }
54 | 
55 | ```
56 | 
57 | 3. Save the documents in `corpus/` to the vector database (only txt files currently):
58 | ```
59 | dart --enable-experiment=native-assets run bin/ingest.dart
60 | ```
61 | 
62 | 4. Chat with the GPT in the console; you could certainly replace it with a beautiful Flutter GUI:
63 | ```
64 | dart --enable-experiment=native-assets run bin/rag.dart
65 | ```
66 | 
67 | ## Getting started
68 | 
69 | Ask the LLM to answer with a typewriter effect:
70 | 
71 | ```dart
72 | import 'package:llama_cpp/llama_cpp.dart';
73 | 
74 | final path = '/path/to/your/LLM.gguf';
75 | final llama = await LlamaCpp.load(path, verbose: true);
76 | 
77 | await for (final text in llama.answer(prompt)) {
78 |   stdout.write(text);
79 | }
80 | stdout.writeln();
81 | 
82 | await llama.dispose();
83 | ```
84 | or, if you want the full answer at once:
85 | ```dart
86 | final answer = await llama.answer(prompt).join('');
87 | ```
88 | 
89 | More examples can be found in `example/`.
90 | 
91 | ## Notes
92 | 
93 | `native_assets_cli` has breaking changes since 0.1.0 and is not compatible with Dart 3.2; however, it does run with Dart 3.1.5.
--------------------------------------------------------------------------------
/analysis_options.yaml:
--------------------------------------------------------------------------------
1 | # This file configures the static analysis results for your project (errors,
2 | # warnings, and lints).
3 | #
4 | # This enables the 'recommended' set of lints from `package:lints`.
5 | # This set helps identify many issues that may lead to problems when running 6 | # or consuming Dart code, and enforces writing Dart using a single, idiomatic 7 | # style and format. 8 | # 9 | # If you want a smaller set of lints you can change this to specify 10 | # 'package:lints/core.yaml'. These are just the most critical lints 11 | # (the recommended set includes the core lints). 12 | # The core lints are also what is used by pub.dev for scoring packages. 13 | 14 | include: package:lints/recommended.yaml 15 | 16 | # Uncomment the following section to specify additional rules. 17 | 18 | # linter: 19 | # rules: 20 | # - camel_case_types 21 | 22 | analyzer: 23 | exclude: 24 | - lib/src/lib_llama_cpp.dart 25 | 26 | # For more information about the core and recommended set of lints, see 27 | # https://dart.dev/go/core-lints 28 | 29 | # For additional information about configuring this file, see 30 | # https://dart.dev/guides/language/analysis-options 31 | -------------------------------------------------------------------------------- /build.dart: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file 2 | // for details. All rights reserved. Use of this source code is governed by a 3 | // BSD-style license that can be found in the LICENSE file. 4 | 5 | import 'dart:io' show File, Platform, Process, exit, stderr, stdout; 6 | import 'package:path/path.dart' as p; 7 | import 'package:native_assets_cli/native_assets_cli.dart'; 8 | 9 | const packageName = 'llama_cpp'; 10 | const _repoLibName = 'libllama.so'; 11 | 12 | Future _commandPath(String cmd) async { 13 | final proc = await Process.run('which', [cmd]); 14 | stderr.write(proc.stderr); 15 | return proc.exitCode == 0 ? proc.stdout.toString() : ''; 16 | } 17 | 18 | /// Implements the protocol from `package:native_assets_cli` by building 19 | /// the C code in `src/` and reporting what native assets it built. 20 | void main(List args) async { 21 | await build(args, _builder); 22 | } 23 | 24 | Future _builder(BuildConfig buildConfig, BuildOutput buildOutput) async { 25 | final env = Platform.environment; 26 | final nvcc = env['LLAMA_CUDA_NVCC'] ?? await _commandPath('nvcc'); 27 | final arch = env['CUDA_DOCKER_ARCH'] ?? 
'compute_75'; 28 | final pkgRoot = buildConfig.packageRoot; 29 | final srcDir = pkgRoot.resolve('src'); 30 | final proc = await Process.start( 31 | 'make', 32 | [ 33 | '-j', 34 | _repoLibName, 35 | if (nvcc.isNotEmpty) ...['LLAMA_CUBLAS=1', 'CUDA_DOCKER_ARCH=$arch'], 36 | ], 37 | workingDirectory: srcDir.path, 38 | ); 39 | stdout.addStream(proc.stdout); 40 | stderr.addStream(proc.stderr); 41 | final code = await proc.exitCode; 42 | if (code != 0) { 43 | final p = await Process.run('gcc', ['--version']); 44 | if (p.exitCode == 0) { 45 | final gccVer = p.stdout.toString(); 46 | stderr.writeln("Build failed, make sure 'gcc>=9.5.0':\n$gccVer"); 47 | } else { 48 | stderr.writeln("GCC not exists!"); 49 | } 50 | exit(code); 51 | } 52 | 53 | final linkMode = _linkMode(buildConfig.linkModePreference); 54 | final libName = buildConfig.targetOS.libraryFileName(packageName, linkMode); 55 | final libUri = buildConfig.outputDirectory.resolve(libName); 56 | final uri = pkgRoot.resolve(p.join('src', _repoLibName)); 57 | final file = File.fromUri(uri).resolveSymbolicLinksSync(); 58 | File(file).renameSync(libUri.path); 59 | 60 | buildOutput.addAsset(NativeCodeAsset( 61 | package: packageName, 62 | name: 'src/lib_$packageName.dart', 63 | linkMode: linkMode, 64 | os: buildConfig.targetOS, 65 | file: libUri, 66 | architecture: buildConfig.targetArchitecture, 67 | )); 68 | final src = [ 69 | 'src/llama.cpp', 70 | 'src/ggml.c', 71 | 'src/ggml-alloc.c', 72 | 'src/ggml-backend.c', 73 | 'src/ggml-quants.c', 74 | ]; 75 | 76 | buildOutput.addDependencies([ 77 | ...src.map((s) => pkgRoot.resolve(s)), 78 | pkgRoot.resolve('build.dart'), 79 | ]); 80 | } 81 | 82 | LinkMode _linkMode(LinkModePreference preference) { 83 | if (preference == LinkModePreference.dynamic || 84 | preference == LinkModePreference.preferDynamic) { 85 | return DynamicLoadingBundled(); 86 | } 87 | assert(preference == LinkModePreference.static || 88 | preference == LinkModePreference.preferStatic); 89 | return StaticLinking(); 90 | } 91 | -------------------------------------------------------------------------------- /example/embedding.dart: -------------------------------------------------------------------------------- 1 | import 'dart:ffi' as ffi; 2 | import 'dart:io' show stdout, Platform; 3 | import 'dart:math' as m; 4 | 5 | import 'package:ffi/ffi.dart' show calloc; 6 | import 'package:llama_cpp/src/common.dart' as c; 7 | import 'package:llama_cpp/src/ffi.dart'; 8 | import 'package:llama_cpp/src/lib_llama_cpp.dart' as llama_cpp; 9 | import 'package:llama_cpp/src/llama_params.dart'; 10 | 11 | int main(List argv) { 12 | if (argv.isEmpty || argv[0].startsWith('-')) { 13 | print("usage: ${Platform.script.path} MODEL_PATH [PROMPT]"); 14 | return 1; 15 | } 16 | final path = argv[0]; 17 | final prompt = argv.length > 1 ? 
argv[1] : 'Hello my name is'; 18 | 19 | final cStr = CharArray.from(path); 20 | final (model, ctx) = c.loadModel( 21 | cStr, 22 | LlamaParams( 23 | seed: 1234, 24 | nThread: 4, 25 | nThreadBatch: 4, 26 | embedding: true, 27 | ), 28 | ); 29 | 30 | final prompts = prompt 31 | .split('\n') 32 | .map((e) => e.trim()) 33 | .where((e) => e.isNotEmpty) 34 | .toList(growable: false); 35 | const batchSize = 512; 36 | final batch = llama_cpp.llama_batch_init(batchSize, 0, prompts.length); 37 | llama_cpp.llama_reset_timings(ctx); 38 | final maxTokenSize = prompts.map((e) => e.length).reduce(m.max); 39 | final tokens = TokenArray(size: maxTokenSize); 40 | final tokenList = prompts.map((p) { 41 | cStr.pavedBy(p); 42 | tokens.pavedBy(model, cStr, addBos: true); 43 | final l = tokens.toList(); 44 | return l.length > batchSize ? l.sublist(0, batchSize) : l; 45 | }); 46 | 47 | for (final (i, l) in tokenList.indexed) { 48 | print("main: prompt $i: '${prompts[i]}'"); 49 | print("main: number of tokens in prompt = ${l.length}"); 50 | for (final t in l) { 51 | print("${'$t'.padLeft(6)} -> '${cStr.tokenString(model, t)}'"); 52 | } 53 | } 54 | 55 | final dimens = llama_cpp.llama_n_embd(model); 56 | final row = tokenList.length; 57 | final bytes = ffi.sizeOf() * row * dimens; 58 | final data = calloc.allocate(bytes); 59 | var out = data; 60 | var s = 0; 61 | for (final tokens in tokenList) { 62 | final len = tokens.length; 63 | if (batch.n_tokens + len > batchSize) { 64 | c.decodeEmbeddingBatch(ctx, batch, out, s, dimens); 65 | batch.n_tokens = 0; 66 | out += s * dimens; 67 | s = 0; 68 | } 69 | c.addBatchSeq(batch, tokens, s); 70 | s++; 71 | } 72 | c.decodeEmbeddingBatch(ctx, batch, out, s, dimens); 73 | for (var j = 0, pos = 0; j < row; j++, pos += dimens) { 74 | stdout.write("embedding $j: ["); 75 | final p = data + pos; 76 | for (var i = 0; i < dimens; i++) { 77 | final v = (p + i).value; 78 | stdout.write("${v.toStringAsFixed(6)}, "); 79 | } 80 | stdout.writeln("]\n"); 81 | } 82 | 83 | calloc.free(data); 84 | cStr.dispose(); 85 | tokens.dispose(); 86 | llama_cpp.llama_print_timings(ctx); 87 | 88 | llama_cpp.llama_batch_free(batch); 89 | llama_cpp.llama_free(ctx); 90 | llama_cpp.llama_free_model(model); 91 | llama_cpp.llama_backend_free(); 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /example/main.dart: -------------------------------------------------------------------------------- 1 | import 'dart:io'; 2 | 3 | import 'package:llama_cpp/llama_cpp.dart'; 4 | 5 | Future main(List argv) async { 6 | if (argv.isEmpty || argv[0].startsWith('-')) { 7 | print("usage: ${Platform.script.path} MODEL_PATH [PROMPT]"); 8 | return 1; 9 | } 10 | final path = argv[0]; 11 | final prompt = argv.length > 1 ? 
argv[1] : 'Hello my name is'; 12 | final llama = await LlamaCpp.load(path, verbose: false); 13 | 14 | await for (final s in llama.answer(prompt)) { 15 | stdout.write(s); 16 | } 17 | stdout.writeln(); 18 | 19 | await llama.dispose(); 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /example/rag/.gitignore: -------------------------------------------------------------------------------- 1 | # https://dart.dev/guides/libraries/private-files 2 | # Created by `dart pub` 3 | .dart_tool/ 4 | config.json 5 | corpus/ 6 | -------------------------------------------------------------------------------- /example/rag/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 1.0.0 2 | 3 | - Initial version. 4 | -------------------------------------------------------------------------------- /example/rag/README.md: -------------------------------------------------------------------------------- 1 | A sample command-line application with an entrypoint in `bin/`, library code 2 | in `lib/`, and example unit test in `test/`. 3 | -------------------------------------------------------------------------------- /example/rag/_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "source_dir": "corpus", 3 | "db_server_url": "http://0.0.0.0:8000", 4 | "database": "llama_cpp", 5 | "collection": "rag" 6 | } 7 | -------------------------------------------------------------------------------- /example/rag/analysis_options.yaml: -------------------------------------------------------------------------------- 1 | # This file configures the static analysis results for your project (errors, 2 | # warnings, and lints). 3 | # 4 | # This enables the 'recommended' set of lints from `package:lints`. 5 | # This set helps identify many issues that may lead to problems when running 6 | # or consuming Dart code, and enforces writing Dart using a single, idiomatic 7 | # style and format. 8 | # 9 | # If you want a smaller set of lints you can change this to specify 10 | # 'package:lints/core.yaml'. These are just the most critical lints 11 | # (the recommended set includes the core lints). 12 | # The core lints are also what is used by pub.dev for scoring packages. 13 | 14 | include: package:lints/recommended.yaml 15 | 16 | # Uncomment the following section to specify additional rules. 
17 | 18 | # linter: 19 | # rules: 20 | # - camel_case_types 21 | 22 | # analyzer: 23 | # exclude: 24 | # - path/to/excluded/files/** 25 | 26 | # For more information about the core and recommended set of lints, see 27 | # https://dart.dev/go/core-lints 28 | 29 | # For additional information about configuring this file, see 30 | # https://dart.dev/guides/language/analysis-options 31 | -------------------------------------------------------------------------------- /example/rag/bin/ingest.dart: -------------------------------------------------------------------------------- 1 | import 'dart:io'; 2 | 3 | import 'package:rag/chroma.dart'; 4 | import 'package:rag/common.dart' as c; 5 | 6 | const chunkSize = 500; 7 | const overlapSize = 10; 8 | 9 | List _processDocuments(String dir, List ignored) { 10 | bool isValidFile(String path) { 11 | return FileSystemEntity.isFileSync(path) && !ignored.contains(path); 12 | } 13 | 14 | final docs = Directory(dir) 15 | .listSync(recursive: true) 16 | .where((f) => isValidFile(f.path)) 17 | .expand((e) { 18 | final file = File.fromUri(e.uri); 19 | final lines = file.readAsLinesSync().where((l) => l.trim().isNotEmpty); 20 | return _processLines(e.uri, lines); 21 | }).toList(growable: false); 22 | return docs; 23 | } 24 | 25 | List _processLines(Uri file, Iterable lines) { 26 | final result = []; 27 | final filepath = file.path; 28 | for (final line in lines) { 29 | final len = line.length; 30 | if (len > chunkSize) { 31 | for (var i = 0; i < len; i += chunkSize) { 32 | final enough = len - i > chunkSize; 33 | final str = enough ? line.substring(i, chunkSize) : line.substring(i); 34 | final delta = i > overlapSize ? line.substring(i - overlapSize, i) : ''; 35 | result.add(ChromaDoc('$delta$str', filepath)); 36 | } 37 | } else { 38 | result.add(ChromaDoc(line, filepath)); 39 | } 40 | } 41 | return result; 42 | } 43 | 44 | void main(List argv) async { 45 | final config = c.appConfig; 46 | final chroma = await c.setupChroma(config); 47 | final all = await chroma.allItems; 48 | final ignored = all 49 | .map((d) => d.metadata?['source'] as String?) 50 | .whereType() 51 | .toList(growable: false); 52 | final dir = config['source_dir'] ?? 'sources'; 53 | 54 | final docs = _processDocuments(dir, ignored); 55 | if (docs.isNotEmpty) { 56 | final at = DateTime.now().millisecondsSinceEpoch; 57 | await chroma.add(docs); 58 | final cost = DateTime.now().millisecondsSinceEpoch - at; 59 | print("Save [${docs.length}] documents cost $cost ms."); 60 | } 61 | chroma.dispose(); 62 | } 63 | -------------------------------------------------------------------------------- /example/rag/bin/rag.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show json; 2 | import 'dart:io' show stdin, stdout; 3 | import 'package:llama_cpp/llama_cpp.dart'; 4 | import 'package:rag/chroma.dart'; 5 | import 'package:rag/common.dart' as c; 6 | 7 | String _makePrompt(String question, List items) { 8 | return '''根据以下信息: 9 | 10 | ${items.map((e) => e.doc).join("\n\n")} 11 | 12 | 请回答:$question'''; 13 | } 14 | 15 | String? get _readLine { 16 | stdout.write('> '); 17 | return stdin.readLineSync()?.trim(); 18 | } 19 | 20 | void main(List argv) async { 21 | final config = c.appConfig; 22 | final chroma = await c.setupChroma(config); 23 | 24 | final path = config['gpt_model'] as String; 25 | final gpt = await LlamaCpp.load(path, verbose: false); 26 | late String question; 27 | while ((question = (_readLine ?? 
'exit')) != 'exit') { 28 | if (question.isEmpty) { 29 | continue; 30 | } 31 | final items = await chroma.query(question, nResults: 2); 32 | final prompt = _makePrompt(question, items); 33 | final answer = gpt.answer(prompt); 34 | stdout.write('< '); 35 | await for (final str in answer) { 36 | stdout.write(str); 37 | } 38 | stdout.writeln(); 39 | } 40 | await gpt.dispose(); 41 | chroma.dispose(); 42 | } 43 | -------------------------------------------------------------------------------- /example/rag/lib/chroma.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show json, utf8; 2 | 3 | import 'package:http/http.dart' as http; 4 | import 'package:chromadb/chromadb.dart' as db; 5 | import 'package:llama_cpp/embedding.dart'; 6 | import 'package:uuid/uuid.dart' show Uuid; 7 | 8 | final class ChromaDoc { 9 | final String content; 10 | final String source; 11 | 12 | const ChromaDoc(this.content, this.source); 13 | 14 | @override 15 | String toString() => "('$source':'$content')"; 16 | } 17 | 18 | final class ChromaItem { 19 | final String? id; 20 | final String doc; 21 | final Map? metadata; 22 | 23 | const ChromaItem._(this.id, this.doc, this.metadata); 24 | 25 | @override 26 | String toString() => "ChromaItem(id:$id, doc:'$doc', meta:$metadata)"; 27 | } 28 | 29 | class Chroma { 30 | final db.ChromaClient client; 31 | final db.Collection collection; 32 | 33 | /// Not use `EmbeddingFunction` because of its type 34 | final Embedding embed; 35 | 36 | Chroma._(this.client, this.collection, this.embed); 37 | 38 | static Future create({ 39 | required String baseUrl, 40 | required String database, 41 | required collection, 42 | required Embedding embedding, 43 | }) async { 44 | final client = db.ChromaClient( 45 | baseUrl: baseUrl, 46 | database: database, 47 | ); 48 | final c = await client.getOrCreateCollection(name: collection); 49 | return Chroma._(client, c, embedding); 50 | } 51 | 52 | Future add(List docs) async { 53 | final len = docs.length; 54 | final uuid = Uuid(); 55 | final ids = List.generate(len, (i) => uuid.v1()); 56 | final docList = docs.map((e) => e.content).toList(growable: false); 57 | final embeddings = embed.embedBatch(docList); 58 | final metadatas = docs.map((e) => {'source': e.source}).toList(); 59 | await _add( 60 | ids: ids, 61 | documents: docList, 62 | embeddings: embeddings, 63 | metadatas: metadatas, 64 | ); 65 | } 66 | 67 | Future> get allItems async { 68 | final res = await collection.get(); 69 | return res.ids.indexed.map((r) { 70 | final (i, id) = r; 71 | final doc = res.documents?[i] ?? ''; 72 | final metadata = res.metadatas?[i]; 73 | return ChromaItem._(id, doc, metadata); 74 | }).toList(growable: false); 75 | } 76 | 77 | Future> query( 78 | String doc, { 79 | final int nResults = 4, 80 | }) async { 81 | final embeddings = embed.embedSingle(doc); 82 | final result = await _query(embeddings); 83 | final (ids, docs, metadatas) = ( 84 | result.ids.first, 85 | result.documents?.first, 86 | result.metadatas?.first, 87 | ); 88 | return ids.indexed.map((r) { 89 | final (i, id) = r; 90 | final doc = docs?[i] ?? ''; 91 | final metadata = metadatas?[i]; 92 | return ChromaItem._(id, utf8.decode(doc.codeUnits), metadata); 93 | }).toList(growable: false); 94 | } 95 | 96 | void dispose() { 97 | embed.dispose(); 98 | } 99 | 100 | Future _add({ 101 | required final List ids, 102 | final List>? embeddings, 103 | final List>? metadatas, 104 | final List? 
documents, 105 | }) async { 106 | final id = collection.id; 107 | final body = { 108 | "embeddings": embeddings, 109 | "metadatas": metadatas, 110 | "documents": documents, 111 | "ids": ids, 112 | "increment_index": true 113 | }; 114 | final res = await http.post( 115 | Uri.parse('http://0.0.0.0:8000/api/v1/collections/$id/add'), 116 | headers: { 117 | 'Content-Type': 'application/json', 118 | }, 119 | body: json.encode(body), 120 | ); 121 | return res.body; 122 | } 123 | 124 | Future _query(List embedding) async { 125 | final id = collection.id; 126 | final body = { 127 | "where": {}, 128 | "where_document": {}, 129 | "query_embeddings": [embedding], 130 | "n_results": 2, 131 | "include": [ 132 | "metadatas", 133 | "documents", 134 | "distances", 135 | ], 136 | }; 137 | final res = await http.post( 138 | Uri.parse('http://0.0.0.0:8000/api/v1/collections/$id/query'), 139 | headers: { 140 | 'Content-Type': 'application/json', 141 | }, 142 | body: json.encode(body), 143 | ); 144 | final obj = json.decode(res.body) as Map; 145 | return db.QueryResponse.fromJson(obj); 146 | } 147 | 148 | /* 149 | static Future _fetchCollection({ 150 | required final db.ChromaClient client, 151 | required final String name, 152 | }) async { 153 | final body = { 154 | "name": name, 155 | "get_or_create": true, 156 | }; 157 | final res = await http.post( 158 | Uri.parse('http://0.0.0.0:8000/api/v1/collections'), 159 | headers: { 160 | 'Content-Type': 'application/json', 161 | }, 162 | body: json.encode(body), 163 | ); 164 | final obj = json.decode(res.body); 165 | return db.Collection( 166 | name: obj['name']!, 167 | id: obj['id']!, 168 | metadata: obj['metadata'], 169 | tenant: client.tenant, 170 | database: client.database, 171 | api: client.api, 172 | ); 173 | } 174 | */ 175 | } 176 | -------------------------------------------------------------------------------- /example/rag/lib/common.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show json; 2 | import 'dart:io'; 3 | 4 | import 'package:llama_cpp/embedding.dart'; 5 | 6 | import 'chroma.dart'; 7 | 8 | Future setupChroma(Map config) async { 9 | final embeddingPath = config['embedding_model'] as String; 10 | final embedding = Embedding(embeddingPath); 11 | final chroma = await Chroma.create( 12 | baseUrl: config['db_server_url'] as String, 13 | database: config['database'] as String, 14 | collection: config['collection'] as String, 15 | embedding: embedding, 16 | ); 17 | return chroma; 18 | } 19 | 20 | Map get appConfig { 21 | final uri = Directory.current.uri; 22 | final f1 = File.fromUri(uri.resolve('_config.json')); 23 | final f2 = File.fromUri(uri.resolve('config.json')); 24 | if (!f1.existsSync() || !f2.existsSync()) { 25 | print("We need '_config.json' and 'config.json' files"); 26 | return {}; 27 | } 28 | final config = json.decode(f1.readAsStringSync()) as Map; 29 | config.addAll(json.decode(f2.readAsStringSync())); 30 | return config; 31 | } 32 | -------------------------------------------------------------------------------- /example/rag/pubspec.yaml: -------------------------------------------------------------------------------- 1 | name: rag 2 | description: An example of RAG (Retrieval Augment Generation) app. 3 | version: 1.0.0 4 | publish_to: none 5 | 6 | environment: 7 | sdk: ^3.3.0 8 | 9 | dependencies: 10 | chromadb: ^0.1.2 11 | llama_cpp: 12 | path: ../.. 
13 | uuid: ^4.3.3 14 | 15 | dev_dependencies: 16 | lints: ^3.0.0 17 | test: ^1.24.0 18 | -------------------------------------------------------------------------------- /example/rag/test/test_chroma.dart: -------------------------------------------------------------------------------- 1 | import 'dart:io'; 2 | 3 | import 'package:rag/common.dart' as c; 4 | import 'package:rag/chroma.dart'; 5 | import 'package:test/test.dart'; 6 | 7 | void main() async { 8 | // start a chroma server: 9 | // `uvicorn chromadb.app:app --reload --workers 1 --host 0.0.0.0 --port 8000` 10 | 11 | final config = { 12 | 'embedding_model': Platform.environment['EMBEDDING_MODEL_PATH'] ?? 13 | (throw Exception("Model path 'EMBEDDING_MODEL_PATH' not specified!")), 14 | 'db_server_url': 'http://0.0.0.0:8000', 15 | 'database': 'ecr', 16 | 'collection': 'retail', 17 | }; 18 | final chroma = await c.setupChroma(config); 19 | 20 | test('chroma save', () async { 21 | final docs = ['Hello world!', 'Hello Moto!', 'Hi world!', 'Hey world'] 22 | .indexed 23 | .map((r) { 24 | final (i, d) = r; 25 | return ChromaDoc(d, 'file${i % 2}.txt'); 26 | }).toList(); 27 | await chroma.add(docs); 28 | final items = await chroma.query('hello, world~', nResults: 2); 29 | expect(items.length, 2); 30 | final str = items.map((e) => "'${e.id}':'${e.doc}'").join(','); 31 | expect(str.contains('Hello world!'), true); 32 | final list = await Future.wait( 33 | [chroma.allItems.then((v) => v.length), chroma.collection.count()], 34 | ); 35 | expect(list, [4, 4]); 36 | }); 37 | 38 | tearDownAll(() { 39 | chroma.dispose(); 40 | }); 41 | } 42 | -------------------------------------------------------------------------------- /example/server.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show utf8; 2 | import 'dart:io' show HttpServer; 3 | 4 | import 'package:llama_cpp/llama_cpp.dart' show LlamaCpp; 5 | 6 | const _defaultPort = 8080; 7 | void main(List argv) async { 8 | if (argv.isEmpty) { 9 | print("usage: dart server.dart MODEL_PATH [PORT]"); 10 | return; 11 | } 12 | final path = argv[0]; 13 | final port = (argv.length > 1 ? int.tryParse(argv[1]) : null) ?? 
_defaultPort; 14 | final ai = await LlamaCpp.load(path); 15 | 16 | final server = await HttpServer.bind('localhost', port); 17 | print('Serving at http://${server.address.host}:${server.port}'); 18 | await for (final request in server) { 19 | final body = await request 20 | .map((e) => List.from(e)) 21 | .transform(utf8.decoder) 22 | .join(); 23 | final response = request.response; 24 | response.headers 25 | ..set('Content-Type', 'application/octet-stream; charset=utf-8') 26 | ..add("Transfer-Encoding", "chunked"); 27 | response.bufferOutput = false; 28 | final answer = ai.answerWith(body); 29 | // curl should run with `--no-buffer` param 30 | await response.addStream(answer.transform(utf8.encoder)); 31 | await response.close(); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /example/simple.dart: -------------------------------------------------------------------------------- 1 | import 'dart:ffi' as ffi; 2 | import 'dart:io' show stderr, stdout, Platform; 3 | 4 | import 'package:llama_cpp/src/common.dart' as c; 5 | import 'package:llama_cpp/src/ffi.dart'; 6 | import 'package:llama_cpp/src/lib_llama_cpp.dart' as llama_cpp; 7 | import 'package:llama_cpp/src/llama_params.dart'; 8 | 9 | int main(List argv) { 10 | if (argv.isEmpty || argv[0].startsWith('-')) { 11 | print("usage: ${Platform.script.path} MODEL_PATH [PROMPT]"); 12 | return 1; 13 | } 14 | final path = argv[0]; 15 | final prompt = argv.length > 1 ? argv[1] : 'Hello my name is'; 16 | // total length of the sequence including the prompt 17 | const nLen = 32; 18 | llama_cpp.llama_backend_init(); 19 | llama_cpp.llama_numa_init(0); 20 | 21 | final cStr = CharArray.from(path); 22 | final (model, ctx) = c.loadModel( 23 | cStr, 24 | LlamaParams( 25 | seed: 1234, 26 | nCtx: 1024, 27 | nThread: 4, 28 | nThreadBatch: 4, 29 | ), 30 | ); 31 | 32 | final ctxSize = llama_cpp.llama_n_ctx(ctx); 33 | final tokenCapacity = prompt.length + 1; 34 | cStr.pavedBy(prompt); 35 | final tokenBuf = TokenArray(size: tokenCapacity); 36 | tokenBuf.pavedBy(model, cStr); 37 | final tokenNum = tokenBuf.length; 38 | final kvReq = tokenNum + (nLen - tokenNum); 39 | print("\nn_len = $nLen, n_ctx = $ctxSize, n_kv_req = $kvReq, " 40 | "token_n = $tokenNum, len = ${cStr.length}"); 41 | stderr.write("User prompt is:"); 42 | for (var i = 0; i < tokenNum; i++) { 43 | final text = cStr.tokenString(model, tokenBuf[i]); 44 | stderr.write(text); 45 | } 46 | stderr.writeln(); 47 | stderr.flush(); 48 | 49 | // create a llama_batch with size 512 50 | // we use this object to submit token data for decoding 51 | final batch = llama_cpp.llama_batch_init(512, 0, 1); 52 | // evaluate the initial prompt 53 | c.addBatchSeq(batch, tokenBuf.toList(), 0); 54 | batch.logits[batch.n_tokens - 1] = 1; 55 | 56 | if (llama_cpp.llama_decode(ctx, batch) != 0) { 57 | return 1; 58 | } 59 | 60 | llama_cpp.llama_reset_timings(ctx); 61 | var count = batch.n_tokens; 62 | final nVocab = llama_cpp.llama_n_vocab(model); 63 | final array = TokenDataArray(nVocab); 64 | final eosToken = llama_cpp.llama_token_eos(model); 65 | while (count <= nLen) { 66 | final logits = llama_cpp.llama_get_logits_ith(ctx, batch.n_tokens - 1); 67 | array.pavedBy(logits, nVocab); 68 | 69 | final tokenId = llama_cpp.llama_sample_token_greedy(ctx, array.pointer); 70 | if (tokenId == eosToken || count == nLen) { 71 | break; 72 | } 73 | final word = cStr.tokenString(model, tokenId); 74 | stdout.write(word); 75 | // `stdout.flush()` cause 'Bad state: StreamSink is bound to a stream' 
error in Dart 3.1.5 76 | // stdout.flush(); 77 | 78 | // prepare the next batch 79 | batch.n_tokens = 0; 80 | // push this new token for next evaluation 81 | c.addBatchSingle(batch, tokenId, count, true); 82 | 83 | count++; 84 | 85 | // evaluate the current batch with the transformer model 86 | if (llama_cpp.llama_decode(ctx, batch) != 0) { 87 | return 2; 88 | } 89 | } 90 | 91 | array.dispose(); 92 | tokenBuf.dispose(); 93 | cStr.dispose(); 94 | 95 | llama_cpp.llama_print_timings(ctx); 96 | 97 | llama_cpp.llama_batch_free(batch); 98 | llama_cpp.llama_free(ctx); 99 | llama_cpp.llama_free_model(model); 100 | llama_cpp.llama_backend_free(); 101 | 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /ffigen.yaml: -------------------------------------------------------------------------------- 1 | # Run with `dart --enable-experiment=native-assets run ffigen --config ffigen.yaml`. 2 | name: NativeLlamaCppBindings 3 | description: | 4 | Bindings for `src/llama.h`. 5 | 6 | Regenerate bindings with `dart --enable-experiment=native-assets run ffigen --config ffigen.yaml`. 7 | output: 'lib/src/lib_llama_cpp.dart' 8 | headers: 9 | entry-points: 10 | - 'src/llama.h' 11 | include-directives: 12 | - 'src/llama.h' 13 | compiler-opts: 14 | - '-I/opt/programs/miniforge3/envs/clang-10/lib/clang/10.0.0/include' 15 | preamble: | 16 | // Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file 17 | // for details. All rights reserved. Use of this source code is governed by a 18 | // BSD-style license that can be found in the LICENSE file. 19 | comments: 20 | style: any 21 | length: full 22 | ffi-native: 23 | -------------------------------------------------------------------------------- /lib/embedding.dart: -------------------------------------------------------------------------------- 1 | export 'src/embedding.dart'; 2 | -------------------------------------------------------------------------------- /lib/llama_cpp.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show json, utf8; 2 | import 'dart:io' show stdout; 3 | import 'dart:isolate' show Isolate, ReceivePort, SendPort; 4 | 5 | import 'src/llama_params.dart'; 6 | import 'src/native_llama.dart'; 7 | 8 | /// A brief overview of inter-ops among main classes: 9 | /// 10 | /// ``` 11 | /// +----------------+---------------------+--------------------+ 12 | /// | main Isolate | llama Isolate | native world | 13 | /// +----------------+---------------------+--------------------+ 14 | /// | LlamaCpp | NativeLlama | llama_cpp | 15 | /// | | | | 16 | /// | send --> incoming --> + | 17 | /// | | | | | 18 | /// | | ffi | | 19 | /// | | | | | 20 | /// | receiving <-- outgoing <-- + | 21 | /// | | | | 22 | /// +---------------+---------------------+---------------------+ 23 | /// ``` 24 | class LlamaCpp { 25 | final ReceivePort _recv; 26 | final Isolate _isolate; 27 | final SendPort _send; 28 | final Stream _receiving; 29 | final bool verbose; 30 | 31 | const LlamaCpp._( 32 | this._recv, 33 | this._isolate, 34 | this._send, 35 | this._receiving, 36 | this.verbose, 37 | ); 38 | 39 | /// Async create LlamaCpp by given params. 40 | static Future load( 41 | String path, { 42 | int? seed, 43 | int? nThread, 44 | int? nThreadBatch, 45 | int? nPredict, 46 | int? nCtx, 47 | int? nBatch, 48 | int? nKeep, 49 | int? nGpuLayers, 50 | int? 
mainGpu,
51 |     int numa = 0,
52 |     bool verbose = true,
53 |   }) async {
54 |     final recv = ReceivePort('main.incoming');
55 |     final params = LlamaParams(
56 |       seed: seed,
57 |       nThread: nThread,
58 |       nThreadBatch: nThreadBatch,
59 |       nPredict: nPredict,
60 |       nCtx: nCtx,
61 |       nBatch: nBatch,
62 |       nGpuLayers: nGpuLayers,
63 |       mainGpu: mainGpu,
64 |       numa: numa,
65 |     );
66 |     final isolate = await Isolate.spawn<(SendPort, String, LlamaParams)>(
67 |       _llamaIsolate,
68 |       (recv.sendPort, path, params),
69 |       errorsAreFatal: true,
70 |       debugName: '_llamaIsolate',
71 |     );
72 |     final receiving = recv.asBroadcastStream();
73 |     final send = (await receiving.first) as SendPort;
74 |     return LlamaCpp._(recv, isolate, send, receiving.cast<String>(), verbose);
75 |   }
76 | 
77 |   static const _finish = {'cmd': NativeLLama.closeTag};
78 | 
79 |   /// Notify the isolate to free native resources, then shut the isolate down.
80 |   Future<void> dispose() async {
81 |     print("LlamaCpp.dispose: disposing native llama ...");
82 |     _send.send(_finish);
83 |     await _receiving.first;
84 |     print("LlamaCpp.dispose: native llama disposed.");
85 |     _recv.close();
86 |     _isolate.kill();
87 |   }
88 | 
89 |   /// Generate a text stream from the given params.
90 |   /// [params] is a JSON string with the prompt and sampling params, e.g.:
91 |   ///   ai.answerWith('''{
92 |   ///     "prompt": "my question is",
93 |   ///     "min_p": 20
94 |   ///   }''');
95 |   Stream<String> answerWith(String params) {
96 |     final request = json.decode(params);
97 |     if ((request['prompt'] ?? '').isEmpty) {
98 |       throw Exception("Json body without 'prompt'!");
99 |     }
100 |     return _requestAnswer(request);
101 |   }
102 | 
103 |   /// Generate a text stream from the given prompt.
104 |   /// [question] is the prompt passed by the user who wants the model to generate an answer.
105 |   Stream<String> answer(
106 |     String question, {
107 |     int? nPrev,
108 |     int? nProbs,
109 |     int? topK,
110 |     double? topP,
111 |     double? minP,
112 |     double? tfsZ,
113 |     double? typicalP,
114 |     double? temperature,
115 |     int? penaltyLastN,
116 |     double? penaltyRepeat,
117 |     double? penaltyFrequency,
118 |     double? penaltyPresent,
119 |     int? mirostat,
120 |     double? mirostatTau,
121 |     double? mirostatEta,
122 |     bool? penalizeNewline,
123 |     String? samplersSequence,
124 |   }) {
125 |     final request = {
126 |       'prompt': question,
127 |       if (nPrev != null) 'n_prev': nPrev,
128 |       if (nProbs != null) 'n_probs': nProbs,
129 |       if (topK != null) 'top_k': topK,
130 |       if (topP != null) 'top_p': topP,
131 |       if (minP != null) 'min_p': minP,
132 |       if (tfsZ != null) 'tfs_z': tfsZ,
133 |       if (typicalP != null) 'typical_p': typicalP,
134 |       if (temperature != null) 'temperature': temperature,
135 |       if (penaltyLastN != null) 'penalty_last_n': penaltyLastN,
136 |       if (penaltyRepeat != null) 'penalty_repeat': penaltyRepeat,
137 |       if (penaltyFrequency != null) 'penalty_frequency': penaltyFrequency,
138 |       if (penaltyPresent != null) 'penalty_present': penaltyPresent,
139 |       if (mirostat != null) 'mirostat': mirostat,
140 |       if (mirostatTau != null) 'mirostat_tau': mirostatTau,
141 |       if (mirostatEta != null) 'mirostat_eta': mirostatEta,
142 |       if (penalizeNewline != null) 'penalize_newline': penalizeNewline,
143 |       if (samplersSequence != null) 'samplers_sequence': samplersSequence,
144 |     };
145 | 
146 |     return _requestAnswer(request);
147 |   }
148 | 
149 |   Stream<String> _requestAnswer(Map<String, dynamic> request) async* {
150 |     if (verbose) {
151 |       stdout.writeln("<<<<<<<<<<<<<<<");
152 |       stdout.writeln("$request\n---------------");
153 |     }
154 |     _send.send(request);
155 |     await for (final msg in _receiving) {
156 |       if (msg == NativeLLama.engTag) {
157 |         break;
158 |       } else {
159 |         yield msg;
160 |         if (verbose) {
161 |           stdout.write(msg);
162 |         }
163 |       }
164 |     }
165 |     if (verbose) {
166 |       stdout.writeln("\n>>>>>>>>>>>>>>>");
167 |     }
168 |   }
169 | 
170 |   // Runs in the llama isolate; the counterpart of the main isolate.
171 |   static _llamaIsolate((SendPort, String, LlamaParams) r) async {
172 |     final (outgoing, path, params) = r;
173 |     final incoming = ReceivePort('_runIsolate.incoming');
174 | 
175 |     final llama = NativeLLama(path, params);
176 |     outgoing.send(incoming.sendPort);
177 |     final requests = incoming.cast<Map<String, dynamic>>();
178 |     await for (final r in requests) {
179 |       if (r['cmd'] == NativeLLama.closeTag) {
180 |         print("Isolate received '$r', start closing ...");
181 |         break;
182 |       }
183 |       final params = r;
184 |       final prompt = params['prompt'] as String;
185 |       final rawStream = llama.generate(
186 |         prompt,
187 |         nPrev: params['n_prev'],
188 |         nProbs: params['n_probs'],
189 |         topK: params['top_k'],
190 |         topP: params['top_p'],
191 |         minP: params['min_p'],
192 |         tfsZ: params['tfs_z'],
193 |         typicalP: params['typical_p'],
194 |         temperature: params['temperature'],
195 |         penaltyLastN: params['penalty_last_n'],
196 |         penaltyRepeat: params['penalty_repeat'],
197 |         penaltyFrequency: params['penalty_frequency'],
198 |         penaltyPresent: params['penalty_present'],
199 |         mirostat: params['mirostat'],
200 |         mirostatTau: params['mirostat_tau'],
201 |         mirostatEta: params['mirostat_eta'],
202 |         penalizeNewline: params['penalize_newline'],
203 |         samplersSequence: params['samplers_sequence'],
204 |       );
205 |       final s = rawStream.transform(utf8.decoder);
206 |       await for (final str in s) {
207 |         outgoing.send(str);
208 |         if (str == NativeLLama.engTag) {
209 |           break;
210 |         }
211 |       }
212 |     }
213 |     llama.dispose();
214 |     outgoing.send(NativeLLama.closeTag);
215 |   }
216 | }
217 | 
--------------------------------------------------------------------------------
/lib/src/common.dart:
--------------------------------------------------------------------------------
1 | import 'dart:ffi' as ffi;
2 | import 'dart:io' show Platform;
3 | import 'dart:math' as m;
4 | 
5 | import 'ffi.dart';
6 | import 'lib_llama_cpp.dart' as llama_cpp;
7 | import
'llama_params.dart'; 8 | 9 | void _addLlamaBatch( 10 | llama_cpp.llama_batch batch, 11 | int id, 12 | int pos, 13 | int seq, 14 | bool logits, 15 | ) { 16 | final n = batch.n_tokens; 17 | batch.token[n] = id; 18 | batch.pos[n] = pos; 19 | batch.n_seq_id[n] = 1; 20 | batch.seq_id[n][0] = seq; 21 | batch.logits[n] = logits ? 1 : 0; 22 | 23 | batch.n_tokens++; 24 | } 25 | 26 | /// Append single token to batch with given position and logit. 27 | void addBatchSingle(llama_cpp.llama_batch batch, int t, int pos, bool logit) { 28 | _addLlamaBatch(batch, t, pos, 0, logit); 29 | } 30 | 31 | void _addBatchTokens( 32 | llama_cpp.llama_batch batch, 33 | List tokens, 34 | int pos, 35 | int seq, 36 | bool logit, 37 | ) { 38 | for (final (i, t) in tokens.indexed) { 39 | _addLlamaBatch(batch, t, pos + i, seq, logit); 40 | } 41 | } 42 | 43 | /// Add multiple tokens to batch with given [seq] from start. 44 | void addBatchSeq(llama_cpp.llama_batch batch, List tokens, int seq) { 45 | _addBatchTokens(batch, tokens, 0, seq, false); 46 | } 47 | 48 | /// Add multiple tokens to batch from [pos] with given [logit]. 49 | void addBatchPos( 50 | llama_cpp.llama_batch batch, 51 | List tokens, 52 | int pos, 53 | bool logit, 54 | ) { 55 | _addBatchTokens(batch, tokens, pos, 0, logit); 56 | } 57 | 58 | void _normalize(ffi.Pointer vec, ffi.Pointer out, int n) { 59 | var norm = 0.0; 60 | for (var i = 0; i < n; i++) { 61 | final v = vec[i]; 62 | norm += v * v; 63 | } 64 | norm = m.sqrt(norm); 65 | for (var i = 0; i < n; i++) { 66 | out[i] = vec[i] / norm; 67 | } 68 | } 69 | 70 | /// Decode batch only for embedding. 71 | void decodeEmbeddingBatch( 72 | ffi.Pointer ctx, 73 | llama_cpp.llama_batch batch, 74 | ffi.Pointer output, 75 | int seq, 76 | int dimens, 77 | ) { 78 | llama_cpp.llama_kv_cache_clear(ctx); 79 | print('decodeEmbeddingBatch: n_tokens = ${batch.n_tokens}, n_seq = $seq'); 80 | if (llama_cpp.llama_decode(ctx, batch) < 0) { 81 | throw Exception('decodeEmbeddingBatch: failed to decode'); 82 | } 83 | for (var k = 0; k < seq; k++) { 84 | final emb = llama_cpp.llama_get_embeddings_ith(ctx, k); 85 | final out = output + k * dimens; 86 | _normalize(emb, out, dimens); 87 | } 88 | } 89 | 90 | int get _physicalCores { 91 | final n = Platform.numberOfProcessors; 92 | return n > 4 93 | ? n ~/ 2 94 | : n > 0 95 | ? n 96 | : 4; 97 | } 98 | 99 | String _systemInfo(LlamaParams lp, llama_cpp.llama_context_params params) { 100 | final n = lp.nThreadBatch; 101 | final batch = n != null ? ' (n_threads_batch = $n)' : ''; 102 | return 'system_info: n_threads = ${params.n_threads}$batch ' 103 | '/ ${Platform.numberOfProcessors} ' 104 | '| ${CharArray.toDartString(llama_cpp.llama_print_system_info())}'; 105 | } 106 | 107 | /// Load a model from a given path, it could be a LLM also a embedding model. 108 | /// return both model and context. 109 | (ffi.Pointer, ffi.Pointer) 110 | loadModel(CharArray path, LlamaParams params) { 111 | final ctxSize = params.nCtx ?? 512; 112 | final s = params.seed ?? 0; 113 | final seed = s > 0 ? s : DateTime.now().millisecondsSinceEpoch ~/ 1000; 114 | print('seed = $seed'); 115 | print('llama backend init'); 116 | llama_cpp.llama_backend_init(); 117 | llama_cpp.llama_numa_init(params.numa); 118 | final modelParams = llama_cpp.llama_model_default_params(); 119 | final nGpuLayers = params.nGpuLayers; 120 | if (nGpuLayers != null) { 121 | modelParams.n_gpu_layers = nGpuLayers > 0 ? 
nGpuLayers : 0; 122 | } 123 | final mainGpu = params.mainGpu; 124 | if (mainGpu != null) { 125 | modelParams.main_gpu = mainGpu; 126 | } 127 | 128 | final model = llama_cpp.llama_load_model_from_file(path.pointer, modelParams); 129 | if (model.address == 0) { 130 | throw Exception("Load model from '${path.dartString}' failed"); 131 | } 132 | 133 | final ctxParams = llama_cpp.llama_context_default_params()..seed = seed; 134 | if (ctxSize > 0) { 135 | ctxParams.n_ctx = ctxSize; 136 | } 137 | final nBatch = params.nBatch ?? -1; 138 | if (nBatch > 0) { 139 | ctxParams.n_batch = nBatch; 140 | } 141 | final t = params.nThread ?? 0; 142 | ctxParams.n_threads = t > 0 ? t : _physicalCores; 143 | final tb = params.nThreadBatch ?? 0; 144 | ctxParams.n_threads_batch = tb > 0 ? tb : ctxParams.n_threads; 145 | 146 | final ctx = llama_cpp.llama_new_context_with_model(model, ctxParams); 147 | if (ctx.address == 0) { 148 | throw Exception("Create llama context failed"); 149 | } 150 | final nCtxTrain = llama_cpp.llama_n_ctx_train(model); 151 | final nCtx = llama_cpp.llama_n_ctx(ctx); 152 | print('n_ctx: $nCtx, train=$nCtxTrain'); 153 | if (nCtx > nCtxTrain) { 154 | print('warning: model was trained on only $nCtxTrain context tokens ' 155 | '($nCtx specified)'); 156 | } 157 | print(_systemInfo(params, ctxParams)); 158 | _warmup(model, ctx, ctxParams.n_batch); 159 | 160 | return (model, ctx); 161 | } 162 | 163 | void _warmup(ffi.Pointer model, 164 | ffi.Pointer ctx, int batchSize) { 165 | print('warming up the model with an empty run'); 166 | final tokens = TokenArray(size: 2); 167 | tokens.add(llama_cpp.llama_token_bos(model)); 168 | tokens.add(llama_cpp.llama_token_eos(model)); 169 | final batch = llama_cpp.llama_batch_get_one( 170 | tokens.pointerAt(0), 171 | m.min(tokens.length, batchSize), 172 | 0, 173 | 0, 174 | ); 175 | llama_cpp.llama_decode(ctx, batch); 176 | llama_cpp.llama_kv_cache_clear(ctx); 177 | llama_cpp.llama_reset_timings(ctx); 178 | tokens.dispose(); 179 | } 180 | -------------------------------------------------------------------------------- /lib/src/embedding.dart: -------------------------------------------------------------------------------- 1 | import 'dart:ffi' as ffi; 2 | 3 | import 'package:ffi/ffi.dart' show calloc; 4 | 5 | import 'common.dart' as c; 6 | import 'ffi.dart'; 7 | import 'lib_llama_cpp.dart' as llama_cpp; 8 | import 'llama_params.dart'; 9 | 10 | /// Embedding runs in current isolate. 11 | /// Place it in another isolate if you want async embeddings. 12 | final class Embedding { 13 | final ffi.Pointer model; 14 | final ffi.Pointer ctx; 15 | final CharArray cStr; 16 | final bool verbose; 17 | final tokenBuf = TokenArray(size: 64); 18 | 19 | Embedding._( 20 | this.model, 21 | this.ctx, 22 | this.cStr, 23 | this.verbose, 24 | ); 25 | 26 | factory Embedding( 27 | String path, { 28 | int? nThread, 29 | int? nThreadBatch, 30 | int? nCtx, 31 | int? nBatch, 32 | int? nGpuLayers, 33 | bool verbose = false, 34 | }) { 35 | final cStr = CharArray.from(path); 36 | final (model, ctx) = c.loadModel( 37 | cStr, 38 | LlamaParams( 39 | nThread: nThread, 40 | nThreadBatch: nThreadBatch, 41 | nCtx: nCtx, 42 | nBatch: nBatch, 43 | nGpuLayers: nGpuLayers, 44 | embedding: true, 45 | ), 46 | ); 47 | 48 | return Embedding._( 49 | model, 50 | ctx, 51 | cStr, 52 | verbose, 53 | ); 54 | } 55 | 56 | /// Embedding multiple prompts at one time. 57 | List> embedBatch(List prompts) => _embed(prompts); 58 | 59 | /// Embedding one prompt at one time. 
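  /// A minimal usage sketch (the model path is a placeholder; `Embedding`
  /// runs synchronously in the calling isolate):
  /// ```dart
  /// final embedding = Embedding('/path/to/your/embedding.gguf');
  /// final vector = embedding.embedSingle('hello world');
  /// print(vector.length); // dimension reported by the model
  /// embedding.dispose();
  /// ```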
60 | List embedSingle(String prompt) => _embed([prompt]).first; 61 | 62 | List> _embed(List prompts) { 63 | llama_cpp.llama_reset_timings(ctx); 64 | 65 | final batchSize = llama_cpp.llama_n_batch(ctx); 66 | final batch = llama_cpp.llama_batch_init(batchSize, 0, prompts.length); 67 | final arrayList = prompts.map((p) { 68 | cStr.pavedBy(p); 69 | tokenBuf.pavedBy(model, cStr, addBos: true); 70 | final l = tokenBuf.toList(); 71 | return l.length > batchSize ? l.sublist(0, batchSize) : l; 72 | }); 73 | if (verbose) { 74 | for (final (i, l) in arrayList.indexed) { 75 | print("main: prompt $i: '${prompts[i]}'"); 76 | print("main: number of tokens in prompt = ${l.length}"); 77 | for (final t in l) { 78 | print("${'$t'.padLeft(6)} -> '${cStr.tokenString(model, t)}'"); 79 | } 80 | } 81 | } 82 | 83 | final dimens = llama_cpp.llama_n_embd(model); 84 | final row = arrayList.length; 85 | final data = 86 | calloc.allocate(ffi.sizeOf() * row * dimens); 87 | var out = data; 88 | var s = 0; 89 | for (final tokens in arrayList) { 90 | final len = tokens.length; 91 | if (batch.n_tokens + len > batchSize) { 92 | c.decodeEmbeddingBatch(ctx, batch, out, s, dimens); 93 | batch.n_tokens = 0; 94 | out += s * dimens; 95 | s = 0; 96 | } 97 | c.addBatchSeq(batch, tokens, s); 98 | s++; 99 | } 100 | c.decodeEmbeddingBatch(ctx, batch, out, s, dimens); 101 | 102 | final result = List>.generate(row, (r) { 103 | final p = data + r * dimens; 104 | return List.generate( 105 | dimens, 106 | (i) => (p[i] * 1000000).round() / 1000000, 107 | growable: false, 108 | ); 109 | }, growable: false); 110 | llama_cpp.llama_print_timings(ctx); 111 | 112 | calloc.free(data); 113 | llama_cpp.llama_batch_free(batch); 114 | return result; 115 | } 116 | 117 | /// Free context, model and memory objects in C world. 118 | void dispose() { 119 | tokenBuf.dispose(); 120 | cStr.dispose(); 121 | 122 | llama_cpp.llama_free(ctx); 123 | llama_cpp.llama_free_model(model); 124 | llama_cpp.llama_backend_free(); 125 | print('Embedding.dispose: done.'); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /lib/src/ffi.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show utf8; 2 | import 'dart:ffi' as ffi; 3 | import 'dart:typed_data' show Uint8List; 4 | 5 | import 'package:ffi/ffi.dart' show Utf8, Utf8Pointer, calloc; 6 | 7 | import 'lib_llama_cpp.dart' as llama_cpp; 8 | 9 | CharArray _fillChars(String str, CharArray Function(int size) getter) { 10 | final units = utf8.encode(str); 11 | final size = units.length + 1; 12 | final len = size - 1; 13 | final buf = getter(size); 14 | final pointer = buf._buf.cast(); 15 | final raw = pointer.asTypedList(size); 16 | raw.setAll(0, units); 17 | raw[len] = 0; 18 | buf._len = len; 19 | return buf; 20 | } 21 | 22 | /// Util class for data conversion between Dart `String` and C `const char *`. 
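Because `decodeEmbeddingBatch` L2-normalizes every row before `_embed` copies it out, cosine similarity between two vectors returned by `embedSingle`/`embedBatch` reduces to a plain dot product. A small helper, not part of the package, showing that downstream use (for example in the RAG example):

```dart
/// Cosine similarity of two embeddings produced by Embedding.embedSingle;
/// the vectors are already unit-length, so this is just their dot product.
double cosine(List<double> a, List<double> b) {
  assert(a.length == b.length);
  var dot = 0.0;
  for (var i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
  }
  return dot;
}
```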
23 | /// From Dart `String` to C `const char *`: 24 | /// ```dart 25 | /// final cStr = CharArray.from('some thing as string'); 26 | /// final p = cStr.pointer; 27 | /// call_some_C_function(p, cStr.length); 28 | /// cStr.dispose(); 29 | /// ``` 30 | /// To reuse an existing `CharArray`: 31 | /// ```dart 32 | /// CharArray cStr; 33 | /// final p = cStr.pavedBy('some thing as string'); 34 | /// call_some_C_function(p, cStr.length); 35 | /// cStr.dispose(); 36 | /// ``` 37 | /// 38 | /// From C `const char *` to Dart `String`: 39 | /// ```dart 40 | /// final p = call_some_C_function(); 41 | /// final str = CharArray.fromNative(p); 42 | /// ``` 43 | final class CharArray { 44 | int _size; 45 | int _len; 46 | ffi.Pointer _buf; 47 | 48 | CharArray({int size = 32}) 49 | : _size = size, 50 | _len = 0, 51 | _buf = calloc.allocate(size * ffi.sizeOf()); 52 | 53 | /// Create newly a buffer for an existing Dart string. 54 | factory CharArray.from(String str) { 55 | final buf = _fillChars(str, (size) => CharArray(size: size)); 56 | return buf; 57 | } 58 | 59 | /// A helper function that converts the given Dart String to `const char *` 60 | /// with an existing `CharArray`. 61 | /// The capacity is expanded automatically. 62 | ffi.Pointer pavedBy(String str) { 63 | _fillChars(str, (size) => this.._resize(size)); 64 | return _buf; 65 | } 66 | 67 | int get length => _len; 68 | 69 | ffi.Pointer get pointer => _buf; 70 | 71 | /// Convert to Dart string with data in current buf and specified length. 72 | String get dartString => _buf.cast().toDartString(length: _len); 73 | 74 | bool _resize(int size) { 75 | if (size <= _size) { 76 | return false; 77 | } 78 | dispose(); 79 | _buf = calloc.allocate(size * ffi.sizeOf()); 80 | _size = size; 81 | // copy existing elements? 82 | _len = 0; 83 | return true; 84 | } 85 | 86 | /// Convert to Dart string with extern CString pointer without length. 87 | static String toDartString(ffi.Pointer pointer) => 88 | pointer.cast().toDartString(); 89 | 90 | /// A string representation for a token. 91 | /// In some model, one token would not return a full utf8 string. 92 | String tokenString(ffi.Pointer model, int token) { 93 | final bytes = tokenBytes(model, token); 94 | try { 95 | return dartString; 96 | } on Exception catch (_) { 97 | return bytes.toString(); 98 | } 99 | } 100 | 101 | /// Return a raw bytes with a given token Id. 102 | /// We need convert assigned int to unassigned, or else 103 | /// `FormatException: Invalid UTF-8 byte` would be thrown. 104 | List tokenBytes(ffi.Pointer model, int token) { 105 | final len = llama_cpp.llama_token_to_piece( 106 | model, 107 | token, 108 | _buf, 109 | _size, 110 | false, 111 | ); 112 | if (len < 0) { 113 | _resize(-len); 114 | _len = llama_cpp.llama_token_to_piece(model, token, _buf, _size, false); 115 | } else { 116 | _len = len; 117 | } 118 | return Uint8List.fromList(List.generate(_len, (i) => _buf[i])); 119 | } 120 | 121 | /// Release native resources. 
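The note on `tokenString` above (one token may not yield a complete UTF-8 string) matters when streaming output: bytes from several tokens may need to be buffered before they decode cleanly. A sketch of one lenient way to handle that, independent of this class:

```dart
import 'dart:convert' show utf8;

/// Joins the raw byte pieces of several tokens (e.g. from CharArray.tokenBytes)
/// and decodes them in one pass, replacing any still-incomplete UTF-8 sequence.
String decodePieces(Iterable<List<int>> pieces) {
  final bytes = [for (final p in pieces) ...p];
  return utf8.decode(bytes, allowMalformed: true);
}
```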
122 | void dispose() { 123 | calloc.free(_buf); 124 | _len = 0; 125 | _size = 0; 126 | } 127 | } 128 | 129 | final class TokenArray { 130 | int _size; 131 | int _len; 132 | ffi.Pointer _buf; 133 | 134 | TokenArray({int size = 512}) 135 | : _size = size, 136 | _len = 0, 137 | _buf = calloc.allocate( 138 | size * ffi.sizeOf()); 139 | 140 | int get length => _len; 141 | 142 | int get capacity => _size; 143 | 144 | int operator [](int pos) => _buf[pos]; 145 | 146 | ffi.Pointer pointerAt(int pos) => _buf + pos; 147 | 148 | void pavedBy( 149 | ffi.Pointer model, 150 | CharArray text, { 151 | bool addBos = false, 152 | }) { 153 | final size = text.length + 1; 154 | _resize(size); 155 | final len = llama_cpp.llama_tokenize( 156 | model, 157 | text.pointer, 158 | text.length, 159 | _buf, 160 | _size, 161 | addBos, 162 | false, 163 | ); 164 | if (len < 0) { 165 | throw Exception("tokenize '${text.dartString}' failed!"); 166 | } 167 | _len = len; 168 | } 169 | 170 | int clear() { 171 | var n = _len; 172 | _len = 0; 173 | return n; 174 | } 175 | 176 | void add(int token) { 177 | _resize(_len + 1); 178 | _buf[_len++] = token; 179 | } 180 | 181 | bool _resize(int size) { 182 | if (size <= _size) { 183 | return false; 184 | } 185 | dispose(); 186 | _buf = calloc.allocate( 187 | size * ffi.sizeOf()); 188 | _size = size; 189 | // copy existing elements? 190 | _len = 0; 191 | return true; 192 | } 193 | 194 | List toList() => List.generate(_len, (i) => _buf[i]); 195 | 196 | void dispose() { 197 | calloc.free(_buf); 198 | _len = 0; 199 | _size = 0; 200 | } 201 | } 202 | 203 | final class TokenDataArray { 204 | int _size; 205 | int _len; 206 | ffi.Pointer _buf; 207 | final pointer = calloc.allocate( 208 | ffi.sizeOf()); 209 | 210 | TokenDataArray(int size) 211 | : _size = size, 212 | _len = 0, 213 | _buf = calloc.allocate( 214 | size * ffi.sizeOf()); 215 | 216 | int get length => _len; 217 | 218 | llama_cpp.llama_token_data operator [](int pos) => _buf[pos]; 219 | 220 | void setLogit(int pos, double value) { 221 | (_buf + pos).ref.logit = value; 222 | } 223 | 224 | void pavedBy(ffi.Pointer logits, int size) { 225 | _resize(size); 226 | for (var id = 0; id < size; id++) { 227 | _buf[id] 228 | ..id = id 229 | ..logit = logits[id] 230 | ..p = 0; 231 | } 232 | pointer.ref 233 | ..data = _buf 234 | ..size = size 235 | ..sorted = false; 236 | } 237 | 238 | bool _resize(int size) { 239 | if (size <= _size) { 240 | return false; 241 | } 242 | _release(); 243 | _buf = calloc.allocate( 244 | size * ffi.sizeOf()); 245 | _size = size; 246 | // copy existing elements? 247 | _len = 0; 248 | return true; 249 | } 250 | 251 | // not free array pointer 252 | void _release() { 253 | calloc.free(_buf); 254 | _len = 0; 255 | _size = 0; 256 | } 257 | 258 | void dispose() { 259 | _release(); 260 | calloc.free(pointer); 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /lib/src/lib_llama_cpp.dart: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file 2 | // for details. All rights reserved. Use of this source code is governed by a 3 | // BSD-style license that can be found in the LICENSE file. 4 | 5 | // AUTO GENERATED FILE, DO NOT EDIT. 6 | // 7 | // Generated by `package:ffigen`. 
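The bindings in this generated file all follow the same shape: a top-level `external` function annotated with `@ffi.Native`, whose symbol is presumably resolved from the native library that `build.dart` produces through the native-assets mechanism. A hand-written illustration of that shape, duplicating the existing `llama_time_us` binding under a different Dart name purely as an example:

```dart
import 'dart:ffi' as ffi;

// int64_t llama_time_us(void); the symbol comes from the bundled native asset.
@ffi.Native<ffi.Int64 Function()>(symbol: 'llama_time_us')
external int llamaTimeUs();
```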
8 | // ignore_for_file: type=lint, unused_field, unused_element 9 | import 'dart:ffi' as ffi; 10 | 11 | /// Helpers for getting default parameters 12 | @ffi.Native(symbol: 'llama_model_default_params') 13 | external llama_model_params llama_model_default_params(); 14 | 15 | @ffi.Native( 16 | symbol: 'llama_context_default_params') 17 | external llama_context_params llama_context_default_params(); 18 | 19 | @ffi.Native( 20 | symbol: 'llama_model_quantize_default_params') 21 | external llama_model_quantize_params llama_model_quantize_default_params(); 22 | 23 | /// Initialize the llama + ggml backend 24 | /// If numa is true, use NUMA optimizations 25 | /// Call once at the start of the program 26 | @ffi.Native(symbol: 'llama_backend_init') 27 | external void llama_backend_init(); 28 | 29 | /// optional: 30 | @ffi.Native(symbol: 'llama_numa_init') 31 | external void llama_numa_init( 32 | int numa, 33 | ); 34 | 35 | /// Call once at the end of the program - currently only used for MPI 36 | @ffi.Native(symbol: 'llama_backend_free') 37 | external void llama_backend_free(); 38 | 39 | @ffi.Native< 40 | ffi.Pointer Function(ffi.Pointer, 41 | llama_model_params)>(symbol: 'llama_load_model_from_file') 42 | external ffi.Pointer llama_load_model_from_file( 43 | ffi.Pointer path_model, 44 | llama_model_params params, 45 | ); 46 | 47 | @ffi.Native)>( 48 | symbol: 'llama_free_model') 49 | external void llama_free_model( 50 | ffi.Pointer model, 51 | ); 52 | 53 | @ffi.Native< 54 | ffi.Pointer Function(ffi.Pointer, 55 | llama_context_params)>(symbol: 'llama_new_context_with_model') 56 | external ffi.Pointer llama_new_context_with_model( 57 | ffi.Pointer model, 58 | llama_context_params params, 59 | ); 60 | 61 | /// Frees all allocated memory 62 | @ffi.Native)>(symbol: 'llama_free') 63 | external void llama_free( 64 | ffi.Pointer ctx, 65 | ); 66 | 67 | @ffi.Native(symbol: 'llama_time_us') 68 | external int llama_time_us(); 69 | 70 | @ffi.Native(symbol: 'llama_max_devices') 71 | external int llama_max_devices(); 72 | 73 | @ffi.Native(symbol: 'llama_supports_mmap') 74 | external bool llama_supports_mmap(); 75 | 76 | @ffi.Native(symbol: 'llama_supports_mlock') 77 | external bool llama_supports_mlock(); 78 | 79 | @ffi.Native(symbol: 'llama_supports_gpu_offload') 80 | external bool llama_supports_gpu_offload(); 81 | 82 | @ffi.Native Function(ffi.Pointer)>( 83 | symbol: 'llama_get_model') 84 | external ffi.Pointer llama_get_model( 85 | ffi.Pointer ctx, 86 | ); 87 | 88 | @ffi.Native)>( 89 | symbol: 'llama_n_ctx') 90 | external int llama_n_ctx( 91 | ffi.Pointer ctx, 92 | ); 93 | 94 | @ffi.Native)>( 95 | symbol: 'llama_n_batch') 96 | external int llama_n_batch( 97 | ffi.Pointer ctx, 98 | ); 99 | 100 | @ffi.Native)>( 101 | symbol: 'llama_n_ubatch') 102 | external int llama_n_ubatch( 103 | ffi.Pointer ctx, 104 | ); 105 | 106 | @ffi.Native)>( 107 | symbol: 'llama_n_seq_max') 108 | external int llama_n_seq_max( 109 | ffi.Pointer ctx, 110 | ); 111 | 112 | @ffi.Native)>( 113 | symbol: 'llama_pooling_type') 114 | external int llama_pooling_type1( 115 | ffi.Pointer ctx, 116 | ); 117 | 118 | @ffi.Native)>( 119 | symbol: 'llama_vocab_type') 120 | external int llama_vocab_type1( 121 | ffi.Pointer model, 122 | ); 123 | 124 | @ffi.Native)>( 125 | symbol: 'llama_rope_type') 126 | external int llama_rope_type1( 127 | ffi.Pointer model, 128 | ); 129 | 130 | @ffi.Native)>( 131 | symbol: 'llama_n_vocab') 132 | external int llama_n_vocab( 133 | ffi.Pointer model, 134 | ); 135 | 136 | @ffi.Native)>( 137 | symbol: 'llama_n_ctx_train') 
138 | external int llama_n_ctx_train( 139 | ffi.Pointer model, 140 | ); 141 | 142 | @ffi.Native)>( 143 | symbol: 'llama_n_embd') 144 | external int llama_n_embd( 145 | ffi.Pointer model, 146 | ); 147 | 148 | @ffi.Native)>( 149 | symbol: 'llama_n_layer') 150 | external int llama_n_layer( 151 | ffi.Pointer model, 152 | ); 153 | 154 | /// Get the model's RoPE frequency scaling factor 155 | @ffi.Native)>( 156 | symbol: 'llama_rope_freq_scale_train') 157 | external double llama_rope_freq_scale_train( 158 | ffi.Pointer model, 159 | ); 160 | 161 | /// Get metadata value as a string by key name 162 | @ffi.Native< 163 | ffi.Int32 Function(ffi.Pointer, ffi.Pointer, 164 | ffi.Pointer, ffi.Size)>(symbol: 'llama_model_meta_val_str') 165 | external int llama_model_meta_val_str( 166 | ffi.Pointer model, 167 | ffi.Pointer key, 168 | ffi.Pointer buf, 169 | int buf_size, 170 | ); 171 | 172 | /// Get the number of metadata key/value pairs 173 | @ffi.Native)>( 174 | symbol: 'llama_model_meta_count') 175 | external int llama_model_meta_count( 176 | ffi.Pointer model, 177 | ); 178 | 179 | /// Get metadata key name by index 180 | @ffi.Native< 181 | ffi.Int32 Function( 182 | ffi.Pointer, 183 | ffi.Int32, 184 | ffi.Pointer, 185 | ffi.Size)>(symbol: 'llama_model_meta_key_by_index') 186 | external int llama_model_meta_key_by_index( 187 | ffi.Pointer model, 188 | int i, 189 | ffi.Pointer buf, 190 | int buf_size, 191 | ); 192 | 193 | /// Get metadata value as a string by index 194 | @ffi.Native< 195 | ffi.Int32 Function( 196 | ffi.Pointer, 197 | ffi.Int32, 198 | ffi.Pointer, 199 | ffi.Size)>(symbol: 'llama_model_meta_val_str_by_index') 200 | external int llama_model_meta_val_str_by_index( 201 | ffi.Pointer model, 202 | int i, 203 | ffi.Pointer buf, 204 | int buf_size, 205 | ); 206 | 207 | /// Get a string describing the model type 208 | @ffi.Native< 209 | ffi.Int32 Function(ffi.Pointer, ffi.Pointer, 210 | ffi.Size)>(symbol: 'llama_model_desc') 211 | external int llama_model_desc( 212 | ffi.Pointer model, 213 | ffi.Pointer buf, 214 | int buf_size, 215 | ); 216 | 217 | /// Returns the total size of all the tensors in the model in bytes 218 | @ffi.Native)>( 219 | symbol: 'llama_model_size') 220 | external int llama_model_size( 221 | ffi.Pointer model, 222 | ); 223 | 224 | /// Returns the total number of parameters in the model 225 | @ffi.Native)>( 226 | symbol: 'llama_model_n_params') 227 | external int llama_model_n_params( 228 | ffi.Pointer model, 229 | ); 230 | 231 | /// Get a llama model tensor 232 | @ffi.Native< 233 | ffi.Pointer Function(ffi.Pointer, 234 | ffi.Pointer)>(symbol: 'llama_get_model_tensor') 235 | external ffi.Pointer llama_get_model_tensor( 236 | ffi.Pointer model, 237 | ffi.Pointer name, 238 | ); 239 | 240 | /// Returns 0 on success 241 | @ffi.Native< 242 | ffi.Uint32 Function(ffi.Pointer, ffi.Pointer, 243 | ffi.Pointer)>( 244 | symbol: 'llama_model_quantize') 245 | external int llama_model_quantize( 246 | ffi.Pointer fname_inp, 247 | ffi.Pointer fname_out, 248 | ffi.Pointer params, 249 | ); 250 | 251 | /// Apply a LoRA adapter to a loaded model 252 | /// path_base_model is the path to a higher quality model to use as a base for 253 | /// the layers modified by the adapter. Can be NULL to use the current loaded model. 
254 | /// The model needs to be reloaded before applying a new adapter, otherwise the adapter 255 | /// will be applied on top of the previous one 256 | /// Returns 0 on success 257 | @ffi.Native< 258 | ffi.Int32 Function( 259 | ffi.Pointer, 260 | ffi.Pointer, 261 | ffi.Float, 262 | ffi.Pointer, 263 | ffi.Int32)>(symbol: 'llama_model_apply_lora_from_file') 264 | external int llama_model_apply_lora_from_file( 265 | ffi.Pointer model, 266 | ffi.Pointer path_lora, 267 | double scale, 268 | ffi.Pointer path_base_model, 269 | int n_threads, 270 | ); 271 | 272 | /// Apply a loaded control vector to a llama_context, or if data is NULL, clear 273 | /// the currently loaded vector. 274 | /// n_embd should be the size of a single layer's control, and data should point 275 | /// to an n_embd x n_layers buffer starting from layer 1. 276 | /// il_start and il_end are the layer range the vector should apply to (both inclusive) 277 | /// See llama_control_vector_load in common to load a control vector. 278 | @ffi.Native< 279 | ffi.Int32 Function( 280 | ffi.Pointer, 281 | ffi.Pointer, 282 | ffi.Size, 283 | ffi.Int32, 284 | ffi.Int32, 285 | ffi.Int32)>(symbol: 'llama_control_vector_apply') 286 | external int llama_control_vector_apply( 287 | ffi.Pointer lctx, 288 | ffi.Pointer data, 289 | int len, 290 | int n_embd, 291 | int il_start, 292 | int il_end, 293 | ); 294 | 295 | /// Create an empty KV cache view. (use only for debugging purposes) 296 | @ffi.Native< 297 | llama_kv_cache_view Function(ffi.Pointer, ffi.Int32)>( 298 | symbol: 'llama_kv_cache_view_init') 299 | external llama_kv_cache_view llama_kv_cache_view_init( 300 | ffi.Pointer ctx, 301 | int n_seq_max, 302 | ); 303 | 304 | /// Free a KV cache view. (use only for debugging purposes) 305 | @ffi.Native)>( 306 | symbol: 'llama_kv_cache_view_free') 307 | external void llama_kv_cache_view_free( 308 | ffi.Pointer view, 309 | ); 310 | 311 | /// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) 312 | @ffi.Native< 313 | ffi.Void Function(ffi.Pointer, 314 | ffi.Pointer)>(symbol: 'llama_kv_cache_view_update') 315 | external void llama_kv_cache_view_update( 316 | ffi.Pointer ctx, 317 | ffi.Pointer view, 318 | ); 319 | 320 | /// Returns the number of tokens in the KV cache (slow, use only for debug) 321 | /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times 322 | @ffi.Native)>( 323 | symbol: 'llama_get_kv_cache_token_count') 324 | external int llama_get_kv_cache_token_count( 325 | ffi.Pointer ctx, 326 | ); 327 | 328 | /// Returns the number of used KV cells (i.e. have at least one sequence assigned to them) 329 | @ffi.Native)>( 330 | symbol: 'llama_get_kv_cache_used_cells') 331 | external int llama_get_kv_cache_used_cells( 332 | ffi.Pointer ctx, 333 | ); 334 | 335 | /// Clear the KV cache - both cell info is erased and KV data is zeroed 336 | @ffi.Native)>( 337 | symbol: 'llama_kv_cache_clear') 338 | external void llama_kv_cache_clear( 339 | ffi.Pointer ctx, 340 | ); 341 | 342 | /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) 343 | /// Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails 344 | /// seq_id < 0 : match any sequence 345 | /// p0 < 0 : [0, p1] 346 | /// p1 < 0 : [p0, inf) 347 | @ffi.Native< 348 | ffi.Bool Function(ffi.Pointer, llama_seq_id, llama_pos, 349 | llama_pos)>(symbol: 'llama_kv_cache_seq_rm') 350 | external bool llama_kv_cache_seq_rm( 351 | ffi.Pointer ctx, 352 | int seq_id, 353 | int p0, 354 | int p1, 355 | ); 356 | 357 | /// Copy all tokens that belong to the specified sequence to another sequence 358 | /// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence 359 | /// p0 < 0 : [0, p1] 360 | /// p1 < 0 : [p0, inf) 361 | @ffi.Native< 362 | ffi.Void Function(ffi.Pointer, llama_seq_id, llama_seq_id, 363 | llama_pos, llama_pos)>(symbol: 'llama_kv_cache_seq_cp') 364 | external void llama_kv_cache_seq_cp( 365 | ffi.Pointer ctx, 366 | int seq_id_src, 367 | int seq_id_dst, 368 | int p0, 369 | int p1, 370 | ); 371 | 372 | /// Removes all tokens that do not belong to the specified sequence 373 | @ffi.Native, llama_seq_id)>( 374 | symbol: 'llama_kv_cache_seq_keep') 375 | external void llama_kv_cache_seq_keep( 376 | ffi.Pointer ctx, 377 | int seq_id, 378 | ); 379 | 380 | /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) 381 | /// If the KV cache is RoPEd, the KV data is updated accordingly: 382 | /// - lazily on next llama_decode() 383 | /// - explicitly with llama_kv_cache_update() 384 | /// p0 < 0 : [0, p1] 385 | /// p1 < 0 : [p0, inf) 386 | @ffi.Native< 387 | ffi.Void Function(ffi.Pointer, llama_seq_id, llama_pos, 388 | llama_pos, llama_pos)>(symbol: 'llama_kv_cache_seq_add') 389 | external void llama_kv_cache_seq_add( 390 | ffi.Pointer ctx, 391 | int seq_id, 392 | int p0, 393 | int p1, 394 | int delta, 395 | ); 396 | 397 | /// Integer division of the positions by factor of `d > 1` 398 | /// If the KV cache is RoPEd, the KV data is updated accordingly: 399 | /// - lazily on next llama_decode() 400 | /// - explicitly with llama_kv_cache_update() 401 | /// p0 < 0 : [0, p1] 402 | /// p1 < 0 : [p0, inf) 403 | @ffi.Native< 404 | ffi.Void Function(ffi.Pointer, llama_seq_id, llama_pos, 405 | llama_pos, ffi.Int)>(symbol: 'llama_kv_cache_seq_div') 406 | external void llama_kv_cache_seq_div( 407 | ffi.Pointer ctx, 408 | int seq_id, 409 | int p0, 410 | int p1, 411 | int d, 412 | ); 413 | 414 | /// Returns the largest position present in the KV cache for the specified sequence 415 | @ffi.Native, llama_seq_id)>( 416 | symbol: 'llama_kv_cache_seq_pos_max') 417 | external int llama_kv_cache_seq_pos_max( 418 | ffi.Pointer ctx, 419 | int seq_id, 420 | ); 421 | 422 | /// Defragment the KV cache 423 | /// This will be applied: 424 | /// - lazily on next llama_decode() 425 | /// - explicitly with llama_kv_cache_update() 426 | @ffi.Native)>( 427 | symbol: 'llama_kv_cache_defrag') 428 | external void llama_kv_cache_defrag( 429 | ffi.Pointer ctx, 430 | ); 431 | 432 | /// Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
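These sequence-level operations are what is commonly used for "context shifting" once generation outgrows `n_ctx`: remove the oldest tokens of a sequence, then slide the remaining positions left. A hedged sketch on top of these bindings; the `nPast`/`nKeep` bookkeeping belongs to the caller, and imports of `dart:ffi` (as `ffi`) and this file (as `llama_cpp`) are assumed:

```dart
/// Drop roughly half of the tokens between [nKeep] and [nPast] in sequence 0,
/// then shift the survivors left so decoding can continue.
void shiftContext(
  ffi.Pointer<llama_cpp.llama_context> ctx,
  int nPast,
  int nKeep,
) {
  final nDiscard = (nPast - nKeep) ~/ 2;
  llama_cpp.llama_kv_cache_seq_rm(ctx, 0, nKeep, nKeep + nDiscard);
  llama_cpp.llama_kv_cache_seq_add(ctx, 0, nKeep + nDiscard, nPast, -nDiscard);
}
```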
433 | @ffi.Native)>( 434 | symbol: 'llama_kv_cache_update') 435 | external void llama_kv_cache_update( 436 | ffi.Pointer ctx, 437 | ); 438 | 439 | /// Returns the maximum size in bytes of the state (rng, logits, embedding 440 | /// and kv_cache) - will often be smaller after compacting tokens 441 | @ffi.Native)>( 442 | symbol: 'llama_state_get_size') 443 | external int llama_state_get_size( 444 | ffi.Pointer ctx, 445 | ); 446 | 447 | @ffi.Native)>( 448 | symbol: 'llama_get_state_size') 449 | external int llama_get_state_size( 450 | ffi.Pointer ctx, 451 | ); 452 | 453 | /// Copies the state to the specified destination address. 454 | /// Destination needs to have allocated enough memory. 455 | /// Returns the number of bytes copied 456 | @ffi.Native< 457 | ffi.Size Function(ffi.Pointer, ffi.Pointer)>( 458 | symbol: 'llama_state_get_data') 459 | external int llama_state_get_data( 460 | ffi.Pointer ctx, 461 | ffi.Pointer dst, 462 | ); 463 | 464 | @ffi.Native< 465 | ffi.Size Function(ffi.Pointer, ffi.Pointer)>( 466 | symbol: 'llama_copy_state_data') 467 | external int llama_copy_state_data( 468 | ffi.Pointer ctx, 469 | ffi.Pointer dst, 470 | ); 471 | 472 | /// Set the state reading from the specified address 473 | /// Returns the number of bytes read 474 | @ffi.Native< 475 | ffi.Size Function(ffi.Pointer, ffi.Pointer)>( 476 | symbol: 'llama_state_set_data') 477 | external int llama_state_set_data( 478 | ffi.Pointer ctx, 479 | ffi.Pointer src, 480 | ); 481 | 482 | @ffi.Native< 483 | ffi.Size Function(ffi.Pointer, ffi.Pointer)>( 484 | symbol: 'llama_set_state_data') 485 | external int llama_set_state_data( 486 | ffi.Pointer ctx, 487 | ffi.Pointer src, 488 | ); 489 | 490 | /// Save/load session file 491 | @ffi.Native< 492 | ffi.Bool Function( 493 | ffi.Pointer, 494 | ffi.Pointer, 495 | ffi.Pointer, 496 | ffi.Size, 497 | ffi.Pointer)>(symbol: 'llama_state_load_file') 498 | external bool llama_state_load_file( 499 | ffi.Pointer ctx, 500 | ffi.Pointer path_session, 501 | ffi.Pointer tokens_out, 502 | int n_token_capacity, 503 | ffi.Pointer n_token_count_out, 504 | ); 505 | 506 | @ffi.Native< 507 | ffi.Bool Function( 508 | ffi.Pointer, 509 | ffi.Pointer, 510 | ffi.Pointer, 511 | ffi.Size, 512 | ffi.Pointer)>(symbol: 'llama_load_session_file') 513 | external bool llama_load_session_file( 514 | ffi.Pointer ctx, 515 | ffi.Pointer path_session, 516 | ffi.Pointer tokens_out, 517 | int n_token_capacity, 518 | ffi.Pointer n_token_count_out, 519 | ); 520 | 521 | @ffi.Native< 522 | ffi.Bool Function(ffi.Pointer, ffi.Pointer, 523 | ffi.Pointer, ffi.Size)>(symbol: 'llama_state_save_file') 524 | external bool llama_state_save_file( 525 | ffi.Pointer ctx, 526 | ffi.Pointer path_session, 527 | ffi.Pointer tokens, 528 | int n_token_count, 529 | ); 530 | 531 | @ffi.Native< 532 | ffi.Bool Function(ffi.Pointer, ffi.Pointer, 533 | ffi.Pointer, ffi.Size)>(symbol: 'llama_save_session_file') 534 | external bool llama_save_session_file( 535 | ffi.Pointer ctx, 536 | ffi.Pointer path_session, 537 | ffi.Pointer tokens, 538 | int n_token_count, 539 | ); 540 | 541 | /// Get the exact size needed to copy the KV cache of a single sequence 542 | @ffi.Native, llama_seq_id)>( 543 | symbol: 'llama_state_seq_get_size') 544 | external int llama_state_seq_get_size( 545 | ffi.Pointer ctx, 546 | int seq_id, 547 | ); 548 | 549 | /// Copy the KV cache of a single sequence into the specified buffer 550 | @ffi.Native< 551 | ffi.Size Function(ffi.Pointer, ffi.Pointer, 552 | llama_seq_id)>(symbol: 'llama_state_seq_get_data') 553 | external int 
llama_state_seq_get_data( 554 | ffi.Pointer ctx, 555 | ffi.Pointer dst, 556 | int seq_id, 557 | ); 558 | 559 | /// Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence 560 | /// Returns: 561 | /// - Positive: Ok 562 | /// - Zero: Failed to load 563 | @ffi.Native< 564 | ffi.Size Function(ffi.Pointer, ffi.Pointer, 565 | llama_seq_id)>(symbol: 'llama_state_seq_set_data') 566 | external int llama_state_seq_set_data( 567 | ffi.Pointer ctx, 568 | ffi.Pointer src, 569 | int dest_seq_id, 570 | ); 571 | 572 | @ffi.Native< 573 | ffi.Size Function( 574 | ffi.Pointer, 575 | ffi.Pointer, 576 | llama_seq_id, 577 | ffi.Pointer, 578 | ffi.Size)>(symbol: 'llama_state_seq_save_file') 579 | external int llama_state_seq_save_file( 580 | ffi.Pointer ctx, 581 | ffi.Pointer filepath, 582 | int seq_id, 583 | ffi.Pointer tokens, 584 | int n_token_count, 585 | ); 586 | 587 | @ffi.Native< 588 | ffi.Size Function( 589 | ffi.Pointer, 590 | ffi.Pointer, 591 | llama_seq_id, 592 | ffi.Pointer, 593 | ffi.Size, 594 | ffi.Pointer)>(symbol: 'llama_state_seq_load_file') 595 | external int llama_state_seq_load_file( 596 | ffi.Pointer ctx, 597 | ffi.Pointer filepath, 598 | int dest_seq_id, 599 | ffi.Pointer tokens_out, 600 | int n_token_capacity, 601 | ffi.Pointer n_token_count_out, 602 | ); 603 | 604 | /// Return batch for single sequence of tokens starting at pos_0 605 | /// 606 | /// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it 607 | @ffi.Native< 608 | llama_batch Function(ffi.Pointer, ffi.Int32, llama_pos, 609 | llama_seq_id)>(symbol: 'llama_batch_get_one') 610 | external llama_batch llama_batch_get_one( 611 | ffi.Pointer tokens, 612 | int n_tokens, 613 | int pos_0, 614 | int seq_id, 615 | ); 616 | 617 | /// Allocates a batch of tokens on the heap that can hold a maximum of n_tokens 618 | /// Each token can be assigned up to n_seq_max sequence ids 619 | /// The batch has to be freed with llama_batch_free() 620 | /// If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float) 621 | /// Otherwise, llama_batch.token will be allocated to store n_tokens llama_token 622 | /// The rest of the llama_batch members are allocated with size n_tokens 623 | /// All members are left uninitialized 624 | @ffi.Native( 625 | symbol: 'llama_batch_init') 626 | external llama_batch llama_batch_init( 627 | int n_tokens, 628 | int embd, 629 | int n_seq_max, 630 | ); 631 | 632 | /// Frees a batch of tokens allocated with llama_batch_init() 633 | @ffi.Native(symbol: 'llama_batch_free') 634 | external void llama_batch_free( 635 | llama_batch batch, 636 | ); 637 | 638 | /// Positive return values does not mean a fatal error, but rather a warning. 
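Given the return-value convention spelled out below (0, positive, or negative), a caller-side wrapper might look like the following sketch; note that `decodeEmbeddingBatch` in `common.dart` only checks for negative values, so the "no KV slot" case passes silently there. Imports of `dart:ffi` (as `ffi`) and this file (as `llama_cpp`) are assumed:

```dart
/// Decode a batch and surface the three documented outcomes of llama_decode.
void decodeOrThrow(
  ffi.Pointer<llama_cpp.llama_context> ctx,
  llama_cpp.llama_batch batch,
) {
  final ret = llama_cpp.llama_decode(ctx, batch);
  if (ret < 0) {
    throw Exception('llama_decode failed: $ret');
  }
  if (ret > 0) {
    // Not fatal: no KV slot was found for this batch; retry with a smaller
    // batch or create the context with a larger n_ctx.
    print('llama_decode warning: no KV slot found (ret = $ret)');
  }
}
```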
639 | /// 0 - success 640 | /// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) 641 | /// < 0 - error 642 | @ffi.Native, llama_batch)>( 643 | symbol: 'llama_decode') 644 | external int llama_decode( 645 | ffi.Pointer ctx, 646 | llama_batch batch, 647 | ); 648 | 649 | /// Set the number of threads used for decoding 650 | /// n_threads is the number of threads used for generation (single token) 651 | /// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) 652 | @ffi.Native< 653 | ffi.Void Function(ffi.Pointer, ffi.Uint32, ffi.Uint32)>( 654 | symbol: 'llama_set_n_threads') 655 | external void llama_set_n_threads( 656 | ffi.Pointer ctx, 657 | int n_threads, 658 | int n_threads_batch, 659 | ); 660 | 661 | /// Get the number of threads used for generation of a single token. 662 | @ffi.Native)>( 663 | symbol: 'llama_n_threads') 664 | external int llama_n_threads( 665 | ffi.Pointer ctx, 666 | ); 667 | 668 | /// Get the number of threads used for prompt and batch processing (multiple token). 669 | @ffi.Native)>( 670 | symbol: 'llama_n_threads_batch') 671 | external int llama_n_threads_batch( 672 | ffi.Pointer ctx, 673 | ); 674 | 675 | /// Set whether the model is in embeddings mode or not 676 | /// If true, embeddings will be returned but logits will not 677 | @ffi.Native, ffi.Bool)>( 678 | symbol: 'llama_set_embeddings') 679 | external void llama_set_embeddings( 680 | ffi.Pointer ctx, 681 | bool embeddings, 682 | ); 683 | 684 | /// Set whether to use causal attention or not 685 | /// If set to true, the model will only attend to the past tokens 686 | @ffi.Native, ffi.Bool)>( 687 | symbol: 'llama_set_causal_attn') 688 | external void llama_set_causal_attn( 689 | ffi.Pointer ctx, 690 | bool causal_attn, 691 | ); 692 | 693 | /// Set abort callback 694 | @ffi.Native< 695 | ffi.Void Function(ffi.Pointer, ggml_abort_callback, 696 | ffi.Pointer)>(symbol: 'llama_set_abort_callback') 697 | external void llama_set_abort_callback( 698 | ffi.Pointer ctx, 699 | ggml_abort_callback abort_callback, 700 | ffi.Pointer abort_callback_data, 701 | ); 702 | 703 | /// Wait until all computations are finished 704 | /// This is automatically done when using one of the functions below to obtain the computation results 705 | /// and is not necessary to call it explicitly in most cases 706 | @ffi.Native)>( 707 | symbol: 'llama_synchronize') 708 | external void llama_synchronize( 709 | ffi.Pointer ctx, 710 | ); 711 | 712 | /// Token logits obtained from the last call to llama_decode() 713 | /// The logits for which llama_batch.logits[i] != 0 are stored contiguously 714 | /// in the order they have appeared in the batch. 715 | /// Rows: number of tokens for which llama_batch.logits[i] != 0 716 | /// Cols: n_vocab 717 | @ffi.Native Function(ffi.Pointer)>( 718 | symbol: 'llama_get_logits') 719 | external ffi.Pointer llama_get_logits( 720 | ffi.Pointer ctx, 721 | ); 722 | 723 | /// Logits for the ith token. For positive indices, Equivalent to: 724 | /// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab 725 | /// Negative indicies can be used to access logits in reverse order, -1 is the last logit. 726 | /// returns NULL for invalid ids. 727 | @ffi.Native< 728 | ffi.Pointer Function( 729 | ffi.Pointer, ffi.Int32)>(symbol: 'llama_get_logits_ith') 730 | external ffi.Pointer llama_get_logits_ith( 731 | ffi.Pointer ctx, 732 | int i, 733 | ); 734 | 735 | /// Get all output token embeddings. 
736 | /// when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, 737 | /// the embeddings for which llama_batch.logits[i] != 0 are stored contiguously 738 | /// in the order they have appeared in the batch. 739 | /// shape: [n_outputs*n_embd] 740 | /// Otherwise, returns NULL. 741 | @ffi.Native Function(ffi.Pointer)>( 742 | symbol: 'llama_get_embeddings') 743 | external ffi.Pointer llama_get_embeddings( 744 | ffi.Pointer ctx, 745 | ); 746 | 747 | /// Get the embeddings for the ith token. For positive indices, Equivalent to: 748 | /// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd 749 | /// Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding. 750 | /// shape: [n_embd] (1-dimensional) 751 | /// returns NULL for invalid ids. 752 | @ffi.Native< 753 | ffi.Pointer Function(ffi.Pointer, ffi.Int32)>( 754 | symbol: 'llama_get_embeddings_ith') 755 | external ffi.Pointer llama_get_embeddings_ith( 756 | ffi.Pointer ctx, 757 | int i, 758 | ); 759 | 760 | /// Get the embeddings for a sequence id 761 | /// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE 762 | /// shape: [n_embd] (1-dimensional) 763 | @ffi.Native< 764 | ffi.Pointer Function(ffi.Pointer, 765 | llama_seq_id)>(symbol: 'llama_get_embeddings_seq') 766 | external ffi.Pointer llama_get_embeddings_seq( 767 | ffi.Pointer ctx, 768 | int seq_id, 769 | ); 770 | 771 | /// Vocab 772 | @ffi.Native< 773 | ffi.Pointer Function( 774 | ffi.Pointer, llama_token)>(symbol: 'llama_token_get_text') 775 | external ffi.Pointer llama_token_get_text( 776 | ffi.Pointer model, 777 | int token, 778 | ); 779 | 780 | @ffi.Native, llama_token)>( 781 | symbol: 'llama_token_get_score') 782 | external double llama_token_get_score( 783 | ffi.Pointer model, 784 | int token, 785 | ); 786 | 787 | @ffi.Native, llama_token)>( 788 | symbol: 'llama_token_get_attr') 789 | external int llama_token_get_attr( 790 | ffi.Pointer model, 791 | int token, 792 | ); 793 | 794 | /// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) 795 | @ffi.Native, llama_token)>( 796 | symbol: 'llama_token_is_eog') 797 | external bool llama_token_is_eog( 798 | ffi.Pointer model, 799 | int token, 800 | ); 801 | 802 | /// Identify if Token Id is a control token or a render-able token 803 | @ffi.Native, llama_token)>( 804 | symbol: 'llama_token_is_control') 805 | external bool llama_token_is_control( 806 | ffi.Pointer model, 807 | int token, 808 | ); 809 | 810 | /// Special tokens 811 | @ffi.Native)>( 812 | symbol: 'llama_token_bos') 813 | external int llama_token_bos( 814 | ffi.Pointer model, 815 | ); 816 | 817 | @ffi.Native)>( 818 | symbol: 'llama_token_eos') 819 | external int llama_token_eos( 820 | ffi.Pointer model, 821 | ); 822 | 823 | @ffi.Native)>( 824 | symbol: 'llama_token_cls') 825 | external int llama_token_cls( 826 | ffi.Pointer model, 827 | ); 828 | 829 | @ffi.Native)>( 830 | symbol: 'llama_token_sep') 831 | external int llama_token_sep( 832 | ffi.Pointer model, 833 | ); 834 | 835 | @ffi.Native)>( 836 | symbol: 'llama_token_nl') 837 | external int llama_token_nl( 838 | ffi.Pointer model, 839 | ); 840 | 841 | @ffi.Native)>( 842 | symbol: 'llama_token_pad') 843 | external int llama_token_pad( 844 | ffi.Pointer model, 845 | ); 846 | 847 | /// Returns -1 if unknown, 1 for true or 0 for false. 
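The embedding getters above return raw `float*` rows; on the Dart side one straightforward way to materialize a row is `asTypedList` on the returned pointer, as in this sketch. Imports of `dart:ffi` (as `ffi`) and this file (as `llama_cpp`) are assumed, and `i` follows the same indexing rules as `llama_get_embeddings_ith`:

```dart
/// Copy the n_embd floats of output row [i] into a Dart list.
List<double> embeddingRow(
  ffi.Pointer<llama_cpp.llama_context> ctx,
  ffi.Pointer<llama_cpp.llama_model> model,
  int i,
) {
  final p = llama_cpp.llama_get_embeddings_ith(ctx, i);
  if (p == ffi.nullptr) {
    throw ArgumentError('invalid output id: $i');
  }
  final n = llama_cpp.llama_n_embd(model);
  return p.asTypedList(n).toList(growable: false);
}
```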
848 | @ffi.Native)>( 849 | symbol: 'llama_add_bos_token') 850 | external int llama_add_bos_token( 851 | ffi.Pointer model, 852 | ); 853 | 854 | /// Returns -1 if unknown, 1 for true or 0 for false. 855 | @ffi.Native)>( 856 | symbol: 'llama_add_eos_token') 857 | external int llama_add_eos_token( 858 | ffi.Pointer model, 859 | ); 860 | 861 | /// Codellama infill tokens 862 | @ffi.Native)>( 863 | symbol: 'llama_token_prefix') 864 | external int llama_token_prefix( 865 | ffi.Pointer model, 866 | ); 867 | 868 | @ffi.Native)>( 869 | symbol: 'llama_token_middle') 870 | external int llama_token_middle( 871 | ffi.Pointer model, 872 | ); 873 | 874 | @ffi.Native)>( 875 | symbol: 'llama_token_suffix') 876 | external int llama_token_suffix( 877 | ffi.Pointer model, 878 | ); 879 | 880 | @ffi.Native)>( 881 | symbol: 'llama_token_eot') 882 | external int llama_token_eot( 883 | ffi.Pointer model, 884 | ); 885 | 886 | /// @details Convert the provided text into tokens. 887 | /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. 888 | /// @return Returns the number of tokens on success, no more than n_tokens_max 889 | /// @return Returns a negative number on failure - the number of tokens that would have been returned 890 | /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated 891 | /// as plaintext. Does not insert a leading space. 892 | @ffi.Native< 893 | ffi.Int32 Function( 894 | ffi.Pointer, 895 | ffi.Pointer, 896 | ffi.Int32, 897 | ffi.Pointer, 898 | ffi.Int32, 899 | ffi.Bool, 900 | ffi.Bool)>(symbol: 'llama_tokenize') 901 | external int llama_tokenize( 902 | ffi.Pointer model, 903 | ffi.Pointer text, 904 | int text_len, 905 | ffi.Pointer tokens, 906 | int n_tokens_max, 907 | bool add_special, 908 | bool parse_special, 909 | ); 910 | 911 | /// Token Id -> Piece. 912 | /// Uses the vocabulary in the provided context. 913 | /// Does not write null terminator to the buffer. 914 | /// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. 915 | /// @param special If true, special tokens are rendered in the output. 916 | @ffi.Native< 917 | ffi.Int32 Function( 918 | ffi.Pointer, 919 | llama_token, 920 | ffi.Pointer, 921 | ffi.Int32, 922 | ffi.Bool)>(symbol: 'llama_token_to_piece') 923 | external int llama_token_to_piece( 924 | ffi.Pointer model, 925 | int token, 926 | ffi.Pointer buf, 927 | int length, 928 | bool special, 929 | ); 930 | 931 | /// Apply chat template. Inspired by hf apply_chat_template() on python. 932 | /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" 933 | /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template 934 | /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. 935 | /// @param chat Pointer to a list of multiple llama_chat_message 936 | /// @param n_msg Number of llama_chat_message in this chat 937 | /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. 938 | /// @param buf A buffer to hold the output formatted prompt. 
The recommended alloc size is 2 * (total number of characters of all messages) 939 | /// @param length The size of the allocated buffer 940 | /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template. 941 | @ffi.Native< 942 | ffi.Int32 Function( 943 | ffi.Pointer, 944 | ffi.Pointer, 945 | ffi.Pointer, 946 | ffi.Size, 947 | ffi.Bool, 948 | ffi.Pointer, 949 | ffi.Int32)>(symbol: 'llama_chat_apply_template') 950 | external int llama_chat_apply_template( 951 | ffi.Pointer model, 952 | ffi.Pointer tmpl, 953 | ffi.Pointer chat, 954 | int n_msg, 955 | bool add_ass, 956 | ffi.Pointer buf, 957 | int length, 958 | ); 959 | 960 | /// Initialize a llama_grammar. 961 | /// 962 | /// @param rules The rule elements of the grammar to initialize. 963 | /// @param n_rules The number of rules. 964 | /// @param start_rule_index The index of the root rule (the starting point of the grammar). 965 | /// @return The initialized llama_grammar or nullptr if initialization failed. 966 | @ffi.Native< 967 | ffi.Pointer Function( 968 | ffi.Pointer>, 969 | ffi.Size, 970 | ffi.Size)>(symbol: 'llama_grammar_init') 971 | external ffi.Pointer llama_grammar_init( 972 | ffi.Pointer> rules, 973 | int n_rules, 974 | int start_rule_index, 975 | ); 976 | 977 | @ffi.Native)>( 978 | symbol: 'llama_grammar_free') 979 | external void llama_grammar_free( 980 | ffi.Pointer grammar, 981 | ); 982 | 983 | @ffi.Native Function(ffi.Pointer)>( 984 | symbol: 'llama_grammar_copy') 985 | external ffi.Pointer llama_grammar_copy( 986 | ffi.Pointer grammar, 987 | ); 988 | 989 | /// Sets the current rng seed. 990 | @ffi.Native, ffi.Uint32)>( 991 | symbol: 'llama_set_rng_seed') 992 | external void llama_set_rng_seed( 993 | ffi.Pointer ctx, 994 | int seed, 995 | ); 996 | 997 | /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. 998 | /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 999 | @ffi.Native< 1000 | ffi.Void Function( 1001 | ffi.Pointer, 1002 | ffi.Pointer, 1003 | ffi.Pointer, 1004 | ffi.Size, 1005 | ffi.Float, 1006 | ffi.Float, 1007 | ffi.Float)>(symbol: 'llama_sample_repetition_penalties') 1008 | external void llama_sample_repetition_penalties( 1009 | ffi.Pointer ctx, 1010 | ffi.Pointer candidates, 1011 | ffi.Pointer last_tokens, 1012 | int penalty_last_n, 1013 | double penalty_repeat, 1014 | double penalty_freq, 1015 | double penalty_present, 1016 | ); 1017 | 1018 | /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 1019 | /// @param logits Logits extracted from the original generation context. 1020 | /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. 1021 | /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. 
1022 | @ffi.Native< 1023 | ffi.Void Function( 1024 | ffi.Pointer, 1025 | ffi.Pointer, 1026 | ffi.Pointer, 1027 | ffi.Float)>(symbol: 'llama_sample_apply_guidance') 1028 | external void llama_sample_apply_guidance( 1029 | ffi.Pointer ctx, 1030 | ffi.Pointer logits, 1031 | ffi.Pointer logits_guidance, 1032 | double scale, 1033 | ); 1034 | 1035 | /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 1036 | @ffi.Native< 1037 | ffi.Void Function(ffi.Pointer, 1038 | ffi.Pointer)>(symbol: 'llama_sample_softmax') 1039 | external void llama_sample_softmax( 1040 | ffi.Pointer ctx, 1041 | ffi.Pointer candidates, 1042 | ); 1043 | 1044 | /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 1045 | @ffi.Native< 1046 | ffi.Void Function( 1047 | ffi.Pointer, 1048 | ffi.Pointer, 1049 | ffi.Int32, 1050 | ffi.Size)>(symbol: 'llama_sample_top_k') 1051 | external void llama_sample_top_k( 1052 | ffi.Pointer ctx, 1053 | ffi.Pointer candidates, 1054 | int k, 1055 | int min_keep, 1056 | ); 1057 | 1058 | /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 1059 | @ffi.Native< 1060 | ffi.Void Function( 1061 | ffi.Pointer, 1062 | ffi.Pointer, 1063 | ffi.Float, 1064 | ffi.Size)>(symbol: 'llama_sample_top_p') 1065 | external void llama_sample_top_p( 1066 | ffi.Pointer ctx, 1067 | ffi.Pointer candidates, 1068 | double p, 1069 | int min_keep, 1070 | ); 1071 | 1072 | /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 1073 | @ffi.Native< 1074 | ffi.Void Function( 1075 | ffi.Pointer, 1076 | ffi.Pointer, 1077 | ffi.Float, 1078 | ffi.Size)>(symbol: 'llama_sample_min_p') 1079 | external void llama_sample_min_p( 1080 | ffi.Pointer ctx, 1081 | ffi.Pointer candidates, 1082 | double p, 1083 | int min_keep, 1084 | ); 1085 | 1086 | /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 1087 | @ffi.Native< 1088 | ffi.Void Function( 1089 | ffi.Pointer, 1090 | ffi.Pointer, 1091 | ffi.Float, 1092 | ffi.Size)>(symbol: 'llama_sample_tail_free') 1093 | external void llama_sample_tail_free( 1094 | ffi.Pointer ctx, 1095 | ffi.Pointer candidates, 1096 | double z, 1097 | int min_keep, 1098 | ); 1099 | 1100 | /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 1101 | @ffi.Native< 1102 | ffi.Void Function( 1103 | ffi.Pointer, 1104 | ffi.Pointer, 1105 | ffi.Float, 1106 | ffi.Size)>(symbol: 'llama_sample_typical') 1107 | external void llama_sample_typical( 1108 | ffi.Pointer ctx, 1109 | ffi.Pointer candidates, 1110 | double p, 1111 | int min_keep, 1112 | ); 1113 | 1114 | /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. 
1115 | @ffi.Native< 1116 | ffi.Void Function( 1117 | ffi.Pointer, 1118 | ffi.Pointer, 1119 | ffi.Float, 1120 | ffi.Float, 1121 | ffi.Float)>(symbol: 'llama_sample_entropy') 1122 | external void llama_sample_entropy( 1123 | ffi.Pointer ctx, 1124 | ffi.Pointer candidates_p, 1125 | double min_temp, 1126 | double max_temp, 1127 | double exponent_val, 1128 | ); 1129 | 1130 | @ffi.Native< 1131 | ffi.Void Function( 1132 | ffi.Pointer, 1133 | ffi.Pointer, 1134 | ffi.Float)>(symbol: 'llama_sample_temp') 1135 | external void llama_sample_temp( 1136 | ffi.Pointer ctx, 1137 | ffi.Pointer candidates, 1138 | double temp, 1139 | ); 1140 | 1141 | /// @details Apply constraints from grammar 1142 | @ffi.Native< 1143 | ffi.Void Function( 1144 | ffi.Pointer, 1145 | ffi.Pointer, 1146 | ffi.Pointer)>(symbol: 'llama_sample_grammar') 1147 | external void llama_sample_grammar( 1148 | ffi.Pointer ctx, 1149 | ffi.Pointer candidates, 1150 | ffi.Pointer grammar, 1151 | ); 1152 | 1153 | /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. 1154 | /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 1155 | /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 1156 | /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. 1157 | /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. 1158 | /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 1159 | @ffi.Native< 1160 | llama_token Function( 1161 | ffi.Pointer, 1162 | ffi.Pointer, 1163 | ffi.Float, 1164 | ffi.Float, 1165 | ffi.Int32, 1166 | ffi.Pointer)>(symbol: 'llama_sample_token_mirostat') 1167 | external int llama_sample_token_mirostat( 1168 | ffi.Pointer ctx, 1169 | ffi.Pointer candidates, 1170 | double tau, 1171 | double eta, 1172 | int m, 1173 | ffi.Pointer mu, 1174 | ); 1175 | 1176 | /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. 1177 | /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 1178 | /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 1179 | /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. 
A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. 1180 | /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 1181 | @ffi.Native< 1182 | llama_token Function( 1183 | ffi.Pointer, 1184 | ffi.Pointer, 1185 | ffi.Float, 1186 | ffi.Float, 1187 | ffi.Pointer)>(symbol: 'llama_sample_token_mirostat_v2') 1188 | external int llama_sample_token_mirostat_v2( 1189 | ffi.Pointer ctx, 1190 | ffi.Pointer candidates, 1191 | double tau, 1192 | double eta, 1193 | ffi.Pointer mu, 1194 | ); 1195 | 1196 | /// @details Selects the token with the highest probability. 1197 | /// Does not compute the token probabilities. Use llama_sample_softmax() instead. 1198 | @ffi.Native< 1199 | llama_token Function( 1200 | ffi.Pointer, ffi.Pointer)>( 1201 | symbol: 'llama_sample_token_greedy') 1202 | external int llama_sample_token_greedy( 1203 | ffi.Pointer ctx, 1204 | ffi.Pointer candidates, 1205 | ); 1206 | 1207 | /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. 1208 | @ffi.Native< 1209 | llama_token Function(ffi.Pointer, 1210 | ffi.Pointer)>(symbol: 'llama_sample_token') 1211 | external int llama_sample_token( 1212 | ffi.Pointer ctx, 1213 | ffi.Pointer candidates, 1214 | ); 1215 | 1216 | /// @details Accepts the sampled token into the grammar 1217 | @ffi.Native< 1218 | ffi.Void Function(ffi.Pointer, ffi.Pointer, 1219 | llama_token)>(symbol: 'llama_grammar_accept_token') 1220 | external void llama_grammar_accept_token( 1221 | ffi.Pointer ctx, 1222 | ffi.Pointer grammar, 1223 | int token, 1224 | ); 1225 | 1226 | /// @details Build a split GGUF final path for this chunk. 1227 | /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" 1228 | /// Returns the split_path length. 1229 | @ffi.Native< 1230 | ffi.Int Function(ffi.Pointer, ffi.Size, ffi.Pointer, 1231 | ffi.Int, ffi.Int)>(symbol: 'llama_split_path') 1232 | external int llama_split_path( 1233 | ffi.Pointer split_path, 1234 | int maxlen, 1235 | ffi.Pointer path_prefix, 1236 | int split_no, 1237 | int split_count, 1238 | ); 1239 | 1240 | /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. 1241 | /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" 1242 | /// Returns the split_prefix length. 
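Taken together, the functions above form the usual sampling pipeline: narrow the candidate set (top-k, top-p), apply temperature, then draw the token. A sketch of one decoding step built on these bindings and the `TokenDataArray` helper from `ffi.dart`; the cutoff values are illustrative, not defaults of this package, and the same imports as above are assumed:

```dart
/// Pick the next token from the logits of the last decoded position.
int sampleNext(
  ffi.Pointer<llama_cpp.llama_context> ctx,
  ffi.Pointer<llama_cpp.llama_model> model,
  TokenDataArray candidates,
) {
  // -1 addresses the last output row, per the llama_get_logits_ith contract.
  final logits = llama_cpp.llama_get_logits_ith(ctx, -1);
  candidates.pavedBy(logits, llama_cpp.llama_n_vocab(model));
  llama_cpp.llama_sample_top_k(ctx, candidates.pointer, 40, 1);
  llama_cpp.llama_sample_top_p(ctx, candidates.pointer, 0.95, 1);
  llama_cpp.llama_sample_temp(ctx, candidates.pointer, 0.8);
  return llama_cpp.llama_sample_token(ctx, candidates.pointer);
}
```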
1243 | @ffi.Native< 1244 | ffi.Int Function(ffi.Pointer, ffi.Size, ffi.Pointer, 1245 | ffi.Int, ffi.Int)>(symbol: 'llama_split_prefix') 1246 | external int llama_split_prefix( 1247 | ffi.Pointer split_prefix, 1248 | int maxlen, 1249 | ffi.Pointer split_path, 1250 | int split_no, 1251 | int split_count, 1252 | ); 1253 | 1254 | /// Performance information 1255 | @ffi.Native)>( 1256 | symbol: 'llama_get_timings') 1257 | external llama_timings llama_get_timings( 1258 | ffi.Pointer ctx, 1259 | ); 1260 | 1261 | @ffi.Native)>( 1262 | symbol: 'llama_print_timings') 1263 | external void llama_print_timings( 1264 | ffi.Pointer ctx, 1265 | ); 1266 | 1267 | @ffi.Native)>( 1268 | symbol: 'llama_reset_timings') 1269 | external void llama_reset_timings( 1270 | ffi.Pointer ctx, 1271 | ); 1272 | 1273 | /// Print system information 1274 | @ffi.Native Function()>(symbol: 'llama_print_system_info') 1275 | external ffi.Pointer llama_print_system_info(); 1276 | 1277 | /// Set callback for all future logging events. 1278 | /// If this is not called, or NULL is supplied, everything is output on stderr. 1279 | @ffi.Native)>( 1280 | symbol: 'llama_log_set') 1281 | external void llama_log_set( 1282 | ggml_log_callback log_callback, 1283 | ffi.Pointer user_data, 1284 | ); 1285 | 1286 | @ffi.Native, ffi.Pointer)>( 1287 | symbol: 'llama_dump_timing_info_yaml') 1288 | external void llama_dump_timing_info_yaml( 1289 | ffi.Pointer stream, 1290 | ffi.Pointer ctx, 1291 | ); 1292 | 1293 | /// C interface 1294 | /// 1295 | /// TODO: show sample usage 1296 | final class llama_model extends ffi.Opaque {} 1297 | 1298 | final class llama_context extends ffi.Opaque {} 1299 | 1300 | abstract class llama_vocab_type { 1301 | /// For models without vocab 1302 | static const int LLAMA_VOCAB_TYPE_NONE = 0; 1303 | 1304 | /// LLaMA tokenizer based on byte-level BPE with byte fallback 1305 | static const int LLAMA_VOCAB_TYPE_SPM = 1; 1306 | 1307 | /// GPT-2 tokenizer based on byte-level BPE 1308 | static const int LLAMA_VOCAB_TYPE_BPE = 2; 1309 | 1310 | /// BERT tokenizer based on WordPiece 1311 | static const int LLAMA_VOCAB_TYPE_WPM = 3; 1312 | 1313 | /// T5 tokenizer based on Unigram 1314 | static const int LLAMA_VOCAB_TYPE_UGM = 4; 1315 | } 1316 | 1317 | /// pre-tokenization types 1318 | abstract class llama_vocab_pre_type { 1319 | static const int LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0; 1320 | static const int LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1; 1321 | static const int LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2; 1322 | static const int LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3; 1323 | static const int LLAMA_VOCAB_PRE_TYPE_FALCON = 4; 1324 | static const int LLAMA_VOCAB_PRE_TYPE_MPT = 5; 1325 | static const int LLAMA_VOCAB_PRE_TYPE_STARCODER = 6; 1326 | static const int LLAMA_VOCAB_PRE_TYPE_GPT2 = 7; 1327 | static const int LLAMA_VOCAB_PRE_TYPE_REFACT = 8; 1328 | static const int LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9; 1329 | static const int LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10; 1330 | static const int LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11; 1331 | static const int LLAMA_VOCAB_PRE_TYPE_OLMO = 12; 1332 | static const int LLAMA_VOCAB_PRE_TYPE_DBRX = 13; 1333 | static const int LLAMA_VOCAB_PRE_TYPE_SMAUG = 14; 1334 | static const int LLAMA_VOCAB_PRE_TYPE_PORO = 15; 1335 | } 1336 | 1337 | /// note: these values should be synchronized with ggml_rope 1338 | /// TODO: maybe move this enum to ggml.h (ggml_rope_type) 1339 | abstract class llama_rope_type { 1340 | static const int LLAMA_ROPE_TYPE_NONE = -1; 1341 | static const int LLAMA_ROPE_TYPE_NORM = 0; 1342 | static 
const int LLAMA_ROPE_TYPE_NEOX = 2; 1343 | static const int LLAMA_ROPE_TYPE_GLM = 4; 1344 | } 1345 | 1346 | abstract class llama_token_type { 1347 | static const int LLAMA_TOKEN_TYPE_UNDEFINED = 0; 1348 | static const int LLAMA_TOKEN_TYPE_NORMAL = 1; 1349 | static const int LLAMA_TOKEN_TYPE_UNKNOWN = 2; 1350 | static const int LLAMA_TOKEN_TYPE_CONTROL = 3; 1351 | static const int LLAMA_TOKEN_TYPE_USER_DEFINED = 4; 1352 | static const int LLAMA_TOKEN_TYPE_UNUSED = 5; 1353 | static const int LLAMA_TOKEN_TYPE_BYTE = 6; 1354 | } 1355 | 1356 | abstract class llama_token_attr { 1357 | static const int LLAMA_TOKEN_ATTR_UNDEFINED = 0; 1358 | static const int LLAMA_TOKEN_ATTR_UNKNOWN = 1; 1359 | static const int LLAMA_TOKEN_ATTR_UNUSED = 2; 1360 | static const int LLAMA_TOKEN_ATTR_NORMAL = 4; 1361 | 1362 | /// SPECIAL? 1363 | static const int LLAMA_TOKEN_ATTR_CONTROL = 8; 1364 | static const int LLAMA_TOKEN_ATTR_USER_DEFINED = 16; 1365 | static const int LLAMA_TOKEN_ATTR_BYTE = 32; 1366 | static const int LLAMA_TOKEN_ATTR_NORMALIZED = 64; 1367 | static const int LLAMA_TOKEN_ATTR_LSTRIP = 128; 1368 | static const int LLAMA_TOKEN_ATTR_RSTRIP = 256; 1369 | static const int LLAMA_TOKEN_ATTR_SINGLE_WORD = 512; 1370 | } 1371 | 1372 | /// model file types 1373 | abstract class llama_ftype { 1374 | static const int LLAMA_FTYPE_ALL_F32 = 0; 1375 | 1376 | /// except 1d tensors 1377 | static const int LLAMA_FTYPE_MOSTLY_F16 = 1; 1378 | 1379 | /// except 1d tensors 1380 | static const int LLAMA_FTYPE_MOSTLY_Q4_0 = 2; 1381 | 1382 | /// except 1d tensors 1383 | static const int LLAMA_FTYPE_MOSTLY_Q4_1 = 3; 1384 | 1385 | /// tok_embeddings.weight and output.weight are F16 1386 | static const int LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4; 1387 | 1388 | /// except 1d tensors 1389 | static const int LLAMA_FTYPE_MOSTLY_Q8_0 = 7; 1390 | 1391 | /// except 1d tensors 1392 | static const int LLAMA_FTYPE_MOSTLY_Q5_0 = 8; 1393 | 1394 | /// except 1d tensors 1395 | static const int LLAMA_FTYPE_MOSTLY_Q5_1 = 9; 1396 | 1397 | /// except 1d tensors 1398 | static const int LLAMA_FTYPE_MOSTLY_Q2_K = 10; 1399 | 1400 | /// except 1d tensors 1401 | static const int LLAMA_FTYPE_MOSTLY_Q3_K_S = 11; 1402 | 1403 | /// except 1d tensors 1404 | static const int LLAMA_FTYPE_MOSTLY_Q3_K_M = 12; 1405 | 1406 | /// except 1d tensors 1407 | static const int LLAMA_FTYPE_MOSTLY_Q3_K_L = 13; 1408 | 1409 | /// except 1d tensors 1410 | static const int LLAMA_FTYPE_MOSTLY_Q4_K_S = 14; 1411 | 1412 | /// except 1d tensors 1413 | static const int LLAMA_FTYPE_MOSTLY_Q4_K_M = 15; 1414 | 1415 | /// except 1d tensors 1416 | static const int LLAMA_FTYPE_MOSTLY_Q5_K_S = 16; 1417 | 1418 | /// except 1d tensors 1419 | static const int LLAMA_FTYPE_MOSTLY_Q5_K_M = 17; 1420 | 1421 | /// except 1d tensors 1422 | static const int LLAMA_FTYPE_MOSTLY_Q6_K = 18; 1423 | 1424 | /// except 1d tensors 1425 | static const int LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19; 1426 | 1427 | /// except 1d tensors 1428 | static const int LLAMA_FTYPE_MOSTLY_IQ2_XS = 20; 1429 | 1430 | /// except 1d tensors 1431 | static const int LLAMA_FTYPE_MOSTLY_Q2_K_S = 21; 1432 | 1433 | /// except 1d tensors 1434 | static const int LLAMA_FTYPE_MOSTLY_IQ3_XS = 22; 1435 | 1436 | /// except 1d tensors 1437 | static const int LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23; 1438 | 1439 | /// except 1d tensors 1440 | static const int LLAMA_FTYPE_MOSTLY_IQ1_S = 24; 1441 | 1442 | /// except 1d tensors 1443 | static const int LLAMA_FTYPE_MOSTLY_IQ4_NL = 25; 1444 | 1445 | /// except 1d tensors 1446 | static const int 
LLAMA_FTYPE_MOSTLY_IQ3_S = 26; 1447 | 1448 | /// except 1d tensors 1449 | static const int LLAMA_FTYPE_MOSTLY_IQ3_M = 27; 1450 | 1451 | /// except 1d tensors 1452 | static const int LLAMA_FTYPE_MOSTLY_IQ2_S = 28; 1453 | 1454 | /// except 1d tensors 1455 | static const int LLAMA_FTYPE_MOSTLY_IQ2_M = 29; 1456 | 1457 | /// except 1d tensors 1458 | static const int LLAMA_FTYPE_MOSTLY_IQ4_XS = 30; 1459 | 1460 | /// except 1d tensors 1461 | static const int LLAMA_FTYPE_MOSTLY_IQ1_M = 31; 1462 | 1463 | /// except 1d tensors 1464 | static const int LLAMA_FTYPE_MOSTLY_BF16 = 32; 1465 | 1466 | /// not specified in the model file 1467 | static const int LLAMA_FTYPE_GUESSED = 1024; 1468 | } 1469 | 1470 | abstract class llama_rope_scaling_type { 1471 | static const int LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1; 1472 | static const int LLAMA_ROPE_SCALING_TYPE_NONE = 0; 1473 | static const int LLAMA_ROPE_SCALING_TYPE_LINEAR = 1; 1474 | static const int LLAMA_ROPE_SCALING_TYPE_YARN = 2; 1475 | static const int LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = 2; 1476 | } 1477 | 1478 | abstract class llama_pooling_type { 1479 | static const int LLAMA_POOLING_TYPE_UNSPECIFIED = -1; 1480 | static const int LLAMA_POOLING_TYPE_NONE = 0; 1481 | static const int LLAMA_POOLING_TYPE_MEAN = 1; 1482 | static const int LLAMA_POOLING_TYPE_CLS = 2; 1483 | static const int LLAMA_POOLING_TYPE_LAST = 3; 1484 | } 1485 | 1486 | abstract class llama_split_mode { 1487 | /// single GPU 1488 | static const int LLAMA_SPLIT_MODE_NONE = 0; 1489 | 1490 | /// split layers and KV across GPUs 1491 | static const int LLAMA_SPLIT_MODE_LAYER = 1; 1492 | 1493 | /// split rows across GPUs 1494 | static const int LLAMA_SPLIT_MODE_ROW = 2; 1495 | } 1496 | 1497 | final class llama_token_data extends ffi.Struct { 1498 | /// token id 1499 | @llama_token() 1500 | external int id; 1501 | 1502 | /// log-odds of the token 1503 | @ffi.Float() 1504 | external double logit; 1505 | 1506 | /// probability of the token 1507 | @ffi.Float() 1508 | external double p; 1509 | } 1510 | 1511 | typedef llama_token = ffi.Int32; 1512 | typedef Dartllama_token = int; 1513 | 1514 | final class llama_token_data_array extends ffi.Struct { 1515 | external ffi.Pointer data; 1516 | 1517 | @ffi.Size() 1518 | external int size; 1519 | 1520 | @ffi.Bool() 1521 | external bool sorted; 1522 | } 1523 | 1524 | /// Input data for llama_decode 1525 | /// A llama_batch object can contain input about one or many sequences 1526 | /// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens 1527 | /// 1528 | /// - token : the token ids of the input (used when embd is NULL) 1529 | /// - embd : token embeddings (i.e. 
float vector of size n_embd) (used when token is NULL) 1530 | /// - pos : the positions of the respective token in the sequence 1531 | /// - seq_id : the sequence to which the respective token belongs 1532 | /// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output 1533 | final class llama_batch extends ffi.Struct { 1534 | @ffi.Int32() 1535 | external int n_tokens; 1536 | 1537 | external ffi.Pointer token; 1538 | 1539 | external ffi.Pointer embd; 1540 | 1541 | external ffi.Pointer pos; 1542 | 1543 | external ffi.Pointer n_seq_id; 1544 | 1545 | external ffi.Pointer> seq_id; 1546 | 1547 | /// TODO: rename this to "output" 1548 | external ffi.Pointer logits; 1549 | 1550 | /// used if pos == NULL 1551 | @llama_pos() 1552 | external int all_pos_0; 1553 | 1554 | /// used if pos == NULL 1555 | @llama_pos() 1556 | external int all_pos_1; 1557 | 1558 | /// used if seq_id == NULL 1559 | @llama_seq_id() 1560 | external int all_seq_id; 1561 | } 1562 | 1563 | typedef llama_pos = ffi.Int32; 1564 | typedef Dartllama_pos = int; 1565 | typedef llama_seq_id = ffi.Int32; 1566 | typedef Dartllama_seq_id = int; 1567 | 1568 | abstract class llama_model_kv_override_type { 1569 | static const int LLAMA_KV_OVERRIDE_TYPE_INT = 0; 1570 | static const int LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1; 1571 | static const int LLAMA_KV_OVERRIDE_TYPE_BOOL = 2; 1572 | static const int LLAMA_KV_OVERRIDE_TYPE_STR = 3; 1573 | } 1574 | 1575 | final class llama_model_kv_override extends ffi.Struct { 1576 | @ffi.Int32() 1577 | external int tag; 1578 | 1579 | @ffi.Array.multi([128]) 1580 | external ffi.Array key; 1581 | 1582 | external UnnamedUnion1 unnamed; 1583 | } 1584 | 1585 | final class UnnamedUnion1 extends ffi.Union { 1586 | @ffi.Int64() 1587 | external int val_i64; 1588 | 1589 | @ffi.Double() 1590 | external double val_f64; 1591 | 1592 | @ffi.Bool() 1593 | external bool val_bool; 1594 | 1595 | @ffi.Array.multi([128]) 1596 | external ffi.Array val_str; 1597 | } 1598 | 1599 | final class llama_model_params extends ffi.Struct { 1600 | /// number of layers to store in VRAM 1601 | @ffi.Int32() 1602 | external int n_gpu_layers; 1603 | 1604 | /// how to split the model across multiple GPUs 1605 | @ffi.Int32() 1606 | external int split_mode; 1607 | 1608 | /// main_gpu interpretation depends on split_mode: 1609 | /// LLAMA_SPLIT_NONE: the GPU that is used for the entire model 1610 | /// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results 1611 | /// LLAMA_SPLIT_LAYER: ignored 1612 | @ffi.Int32() 1613 | external int main_gpu; 1614 | 1615 | /// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() 1616 | external ffi.Pointer tensor_split; 1617 | 1618 | /// comma separated list of RPC servers to use for offloading 1619 | external ffi.Pointer rpc_servers; 1620 | 1621 | /// Called with a progress value between 0.0 and 1.0. Pass NULL to disable. 1622 | /// If the provided progress_callback returns true, model loading continues. 1623 | /// If it returns false, model loading is immediately aborted. 
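///
/// A Dart function can be turned into such a callback with
/// `ffi.Pointer.fromFunction` (illustrative sketch only; `_onProgress` is not
/// part of these generated bindings, and the `user_data` pointer type is
/// assumed to be `ffi.Pointer<ffi.Void>` as in the C header):
///
/// ```dart
/// bool _onProgress(double progress, ffi.Pointer<ffi.Void> userData) {
///   return true; // returning true lets model loading continue
/// }
///
/// final cb = ffi.Pointer.fromFunction<llama_progress_callbackFunction>(
///     _onProgress, false); // `false` is the exceptional return value
/// ```
///
/// The resulting pointer can then be stored in this field on a
/// `llama_model_params` instance before loading a model.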
1624 | external llama_progress_callback progress_callback; 1625 | 1626 | /// context pointer passed to the progress callback 1627 | external ffi.Pointer progress_callback_user_data; 1628 | 1629 | /// override key-value pairs of the model meta data 1630 | external ffi.Pointer kv_overrides; 1631 | 1632 | /// only load the vocabulary, no weights 1633 | @ffi.Bool() 1634 | external bool vocab_only; 1635 | 1636 | /// use mmap if possible 1637 | @ffi.Bool() 1638 | external bool use_mmap; 1639 | 1640 | /// force system to keep model in RAM 1641 | @ffi.Bool() 1642 | external bool use_mlock; 1643 | 1644 | /// validate model tensor data 1645 | @ffi.Bool() 1646 | external bool check_tensors; 1647 | } 1648 | 1649 | typedef llama_progress_callback 1650 | = ffi.Pointer>; 1651 | typedef llama_progress_callbackFunction = ffi.Bool Function( 1652 | ffi.Float progress, ffi.Pointer user_data); 1653 | typedef Dartllama_progress_callbackFunction = bool Function( 1654 | double progress, ffi.Pointer user_data); 1655 | 1656 | /// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations 1657 | /// https://github.com/ggerganov/llama.cpp/pull/7544 1658 | final class llama_context_params extends ffi.Struct { 1659 | /// RNG seed, -1 for random 1660 | @ffi.Uint32() 1661 | external int seed; 1662 | 1663 | /// text context, 0 = from model 1664 | @ffi.Uint32() 1665 | external int n_ctx; 1666 | 1667 | /// logical maximum batch size that can be submitted to llama_decode 1668 | @ffi.Uint32() 1669 | external int n_batch; 1670 | 1671 | /// physical maximum batch size 1672 | @ffi.Uint32() 1673 | external int n_ubatch; 1674 | 1675 | /// max number of sequences (i.e. distinct states for recurrent models) 1676 | @ffi.Uint32() 1677 | external int n_seq_max; 1678 | 1679 | /// number of threads to use for generation 1680 | @ffi.Uint32() 1681 | external int n_threads; 1682 | 1683 | /// number of threads to use for batch processing 1684 | @ffi.Uint32() 1685 | external int n_threads_batch; 1686 | 1687 | /// RoPE scaling type, from `enum llama_rope_scaling_type` 1688 | @ffi.Int32() 1689 | external int rope_scaling_type; 1690 | 1691 | /// whether to pool (sum) embedding results by sequence id 1692 | @ffi.Int32() 1693 | external int pooling_type; 1694 | 1695 | /// RoPE base frequency, 0 = from model 1696 | @ffi.Float() 1697 | external double rope_freq_base; 1698 | 1699 | /// RoPE frequency scaling factor, 0 = from model 1700 | @ffi.Float() 1701 | external double rope_freq_scale; 1702 | 1703 | /// YaRN extrapolation mix factor, negative = from model 1704 | @ffi.Float() 1705 | external double yarn_ext_factor; 1706 | 1707 | /// YaRN magnitude scaling factor 1708 | @ffi.Float() 1709 | external double yarn_attn_factor; 1710 | 1711 | /// YaRN low correction dim 1712 | @ffi.Float() 1713 | external double yarn_beta_fast; 1714 | 1715 | /// YaRN high correction dim 1716 | @ffi.Float() 1717 | external double yarn_beta_slow; 1718 | 1719 | /// YaRN original context size 1720 | @ffi.Uint32() 1721 | external int yarn_orig_ctx; 1722 | 1723 | /// defragment the KV cache if holes/size > thold, < 0 disabled (default) 1724 | @ffi.Float() 1725 | external double defrag_thold; 1726 | 1727 | external ggml_backend_sched_eval_callback cb_eval; 1728 | 1729 | external ffi.Pointer cb_eval_user_data; 1730 | 1731 | /// data type for K cache [EXPERIMENTAL] 1732 | @ffi.Int32() 1733 | external int type_k; 1734 | 1735 | /// data type for V cache [EXPERIMENTAL] 1736 | @ffi.Int32() 1737 | external 
int type_v; 1738 | 1739 | /// the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) 1740 | @ffi.Bool() 1741 | external bool logits_all; 1742 | 1743 | /// if true, extract embeddings (together with logits) 1744 | @ffi.Bool() 1745 | external bool embeddings; 1746 | 1747 | /// whether to offload the KQV ops (including the KV cache) to GPU 1748 | @ffi.Bool() 1749 | external bool offload_kqv; 1750 | 1751 | /// whether to use flash attention [EXPERIMENTAL] 1752 | @ffi.Bool() 1753 | external bool flash_attn; 1754 | 1755 | /// Abort callback 1756 | /// if it returns true, execution of llama_decode() will be aborted 1757 | /// currently works only with CPU execution 1758 | external ggml_abort_callback abort_callback; 1759 | 1760 | external ffi.Pointer abort_callback_data; 1761 | } 1762 | 1763 | /// when ask == true, the scheduler wants to know if the user wants to observe this node 1764 | /// this allows the scheduler to batch nodes together in order to evaluate them in a single call 1765 | /// 1766 | /// when ask == false, the scheduler is passing the node tensor to the user for observation 1767 | /// if the user returns false, the scheduler will cancel the graph compute 1768 | typedef ggml_backend_sched_eval_callback 1769 | = ffi.Pointer>; 1770 | typedef ggml_backend_sched_eval_callbackFunction = ffi.Bool Function( 1771 | ffi.Pointer t, ffi.Bool ask, ffi.Pointer user_data); 1772 | typedef Dartggml_backend_sched_eval_callbackFunction = bool Function( 1773 | ffi.Pointer t, bool ask, ffi.Pointer user_data); 1774 | 1775 | /// n-dimensional tensor 1776 | final class ggml_tensor extends ffi.Struct { 1777 | @ffi.Int32() 1778 | external int type; 1779 | 1780 | @ffi.Int32() 1781 | external int backend; 1782 | 1783 | external ffi.Pointer buffer; 1784 | 1785 | /// number of elements 1786 | @ffi.Array.multi([4]) 1787 | external ffi.Array ne; 1788 | 1789 | /// stride in bytes: 1790 | /// nb[0] = ggml_type_size(type) 1791 | /// nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding 1792 | /// nb[i] = nb[i-1] * ne[i-1] 1793 | @ffi.Array.multi([4]) 1794 | external ffi.Array nb; 1795 | 1796 | /// compute data 1797 | @ffi.Int32() 1798 | external int op; 1799 | 1800 | /// op params - allocated as int32_t for alignment 1801 | @ffi.Array.multi([16]) 1802 | external ffi.Array op_params; 1803 | 1804 | @ffi.Int32() 1805 | external int flags; 1806 | 1807 | external ffi.Pointer grad; 1808 | 1809 | @ffi.Array.multi([10]) 1810 | external ffi.Array> src; 1811 | 1812 | /// source tensor and offset for views 1813 | external ffi.Pointer view_src; 1814 | 1815 | @ffi.Size() 1816 | external int view_offs; 1817 | 1818 | external ffi.Pointer data; 1819 | 1820 | @ffi.Array.multi([64]) 1821 | external ffi.Array name; 1822 | 1823 | /// extra things e.g. 
for ggml-cuda.cu 1824 | external ffi.Pointer extra; 1825 | } 1826 | 1827 | /// NOTE: always add types at the end of the enum to keep backward compatibility 1828 | abstract class ggml_type { 1829 | static const int GGML_TYPE_F32 = 0; 1830 | static const int GGML_TYPE_F16 = 1; 1831 | static const int GGML_TYPE_Q4_0 = 2; 1832 | static const int GGML_TYPE_Q4_1 = 3; 1833 | 1834 | /// GGML_TYPE_Q4_2 = 4, support has been removed 1835 | /// GGML_TYPE_Q4_3 = 5, support has been removed 1836 | static const int GGML_TYPE_Q5_0 = 6; 1837 | static const int GGML_TYPE_Q5_1 = 7; 1838 | static const int GGML_TYPE_Q8_0 = 8; 1839 | static const int GGML_TYPE_Q8_1 = 9; 1840 | static const int GGML_TYPE_Q2_K = 10; 1841 | static const int GGML_TYPE_Q3_K = 11; 1842 | static const int GGML_TYPE_Q4_K = 12; 1843 | static const int GGML_TYPE_Q5_K = 13; 1844 | static const int GGML_TYPE_Q6_K = 14; 1845 | static const int GGML_TYPE_Q8_K = 15; 1846 | static const int GGML_TYPE_IQ2_XXS = 16; 1847 | static const int GGML_TYPE_IQ2_XS = 17; 1848 | static const int GGML_TYPE_IQ3_XXS = 18; 1849 | static const int GGML_TYPE_IQ1_S = 19; 1850 | static const int GGML_TYPE_IQ4_NL = 20; 1851 | static const int GGML_TYPE_IQ3_S = 21; 1852 | static const int GGML_TYPE_IQ2_S = 22; 1853 | static const int GGML_TYPE_IQ4_XS = 23; 1854 | static const int GGML_TYPE_I8 = 24; 1855 | static const int GGML_TYPE_I16 = 25; 1856 | static const int GGML_TYPE_I32 = 26; 1857 | static const int GGML_TYPE_I64 = 27; 1858 | static const int GGML_TYPE_F64 = 28; 1859 | static const int GGML_TYPE_IQ1_M = 29; 1860 | static const int GGML_TYPE_BF16 = 30; 1861 | static const int GGML_TYPE_COUNT = 31; 1862 | } 1863 | 1864 | abstract class ggml_backend_type { 1865 | static const int GGML_BACKEND_TYPE_CPU = 0; 1866 | static const int GGML_BACKEND_TYPE_GPU = 10; 1867 | static const int GGML_BACKEND_TYPE_GPU_SPLIT = 20; 1868 | } 1869 | 1870 | final class ggml_backend_buffer extends ffi.Opaque {} 1871 | 1872 | /// available tensor operations: 1873 | abstract class ggml_op { 1874 | static const int GGML_OP_NONE = 0; 1875 | static const int GGML_OP_DUP = 1; 1876 | static const int GGML_OP_ADD = 2; 1877 | static const int GGML_OP_ADD1 = 3; 1878 | static const int GGML_OP_ACC = 4; 1879 | static const int GGML_OP_SUB = 5; 1880 | static const int GGML_OP_MUL = 6; 1881 | static const int GGML_OP_DIV = 7; 1882 | static const int GGML_OP_SQR = 8; 1883 | static const int GGML_OP_SQRT = 9; 1884 | static const int GGML_OP_LOG = 10; 1885 | static const int GGML_OP_SUM = 11; 1886 | static const int GGML_OP_SUM_ROWS = 12; 1887 | static const int GGML_OP_MEAN = 13; 1888 | static const int GGML_OP_ARGMAX = 14; 1889 | static const int GGML_OP_REPEAT = 15; 1890 | static const int GGML_OP_REPEAT_BACK = 16; 1891 | static const int GGML_OP_CONCAT = 17; 1892 | static const int GGML_OP_SILU_BACK = 18; 1893 | 1894 | /// normalize 1895 | static const int GGML_OP_NORM = 19; 1896 | static const int GGML_OP_RMS_NORM = 20; 1897 | static const int GGML_OP_RMS_NORM_BACK = 21; 1898 | static const int GGML_OP_GROUP_NORM = 22; 1899 | static const int GGML_OP_MUL_MAT = 23; 1900 | static const int GGML_OP_MUL_MAT_ID = 24; 1901 | static const int GGML_OP_OUT_PROD = 25; 1902 | static const int GGML_OP_SCALE = 26; 1903 | static const int GGML_OP_SET = 27; 1904 | static const int GGML_OP_CPY = 28; 1905 | static const int GGML_OP_CONT = 29; 1906 | static const int GGML_OP_RESHAPE = 30; 1907 | static const int GGML_OP_VIEW = 31; 1908 | static const int GGML_OP_PERMUTE = 32; 1909 | static const int 
GGML_OP_TRANSPOSE = 33; 1910 | static const int GGML_OP_GET_ROWS = 34; 1911 | static const int GGML_OP_GET_ROWS_BACK = 35; 1912 | static const int GGML_OP_DIAG = 36; 1913 | static const int GGML_OP_DIAG_MASK_INF = 37; 1914 | static const int GGML_OP_DIAG_MASK_ZERO = 38; 1915 | static const int GGML_OP_SOFT_MAX = 39; 1916 | static const int GGML_OP_SOFT_MAX_BACK = 40; 1917 | static const int GGML_OP_ROPE = 41; 1918 | static const int GGML_OP_ROPE_BACK = 42; 1919 | static const int GGML_OP_CLAMP = 43; 1920 | static const int GGML_OP_CONV_TRANSPOSE_1D = 44; 1921 | static const int GGML_OP_IM2COL = 45; 1922 | static const int GGML_OP_CONV_TRANSPOSE_2D = 46; 1923 | static const int GGML_OP_POOL_1D = 47; 1924 | static const int GGML_OP_POOL_2D = 48; 1925 | 1926 | /// nearest interpolate 1927 | static const int GGML_OP_UPSCALE = 49; 1928 | static const int GGML_OP_PAD = 50; 1929 | static const int GGML_OP_ARANGE = 51; 1930 | static const int GGML_OP_TIMESTEP_EMBEDDING = 52; 1931 | static const int GGML_OP_ARGSORT = 53; 1932 | static const int GGML_OP_LEAKY_RELU = 54; 1933 | static const int GGML_OP_FLASH_ATTN_EXT = 55; 1934 | static const int GGML_OP_FLASH_ATTN_BACK = 56; 1935 | static const int GGML_OP_SSM_CONV = 57; 1936 | static const int GGML_OP_SSM_SCAN = 58; 1937 | static const int GGML_OP_WIN_PART = 59; 1938 | static const int GGML_OP_WIN_UNPART = 60; 1939 | static const int GGML_OP_GET_REL_POS = 61; 1940 | static const int GGML_OP_ADD_REL_POS = 62; 1941 | static const int GGML_OP_UNARY = 63; 1942 | static const int GGML_OP_MAP_UNARY = 64; 1943 | static const int GGML_OP_MAP_BINARY = 65; 1944 | static const int GGML_OP_MAP_CUSTOM1_F32 = 66; 1945 | static const int GGML_OP_MAP_CUSTOM2_F32 = 67; 1946 | static const int GGML_OP_MAP_CUSTOM3_F32 = 68; 1947 | static const int GGML_OP_MAP_CUSTOM1 = 69; 1948 | static const int GGML_OP_MAP_CUSTOM2 = 70; 1949 | static const int GGML_OP_MAP_CUSTOM3 = 71; 1950 | static const int GGML_OP_CROSS_ENTROPY_LOSS = 72; 1951 | static const int GGML_OP_CROSS_ENTROPY_LOSS_BACK = 73; 1952 | static const int GGML_OP_COUNT = 74; 1953 | } 1954 | 1955 | /// Abort callback 1956 | /// If not NULL, called before ggml computation 1957 | /// If it returns true, the computation is aborted 1958 | typedef ggml_abort_callback 1959 | = ffi.Pointer>; 1960 | typedef ggml_abort_callbackFunction = ffi.Bool Function( 1961 | ffi.Pointer data); 1962 | typedef Dartggml_abort_callbackFunction = bool Function( 1963 | ffi.Pointer data); 1964 | 1965 | /// model quantization parameters 1966 | final class llama_model_quantize_params extends ffi.Struct { 1967 | /// number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() 1968 | @ffi.Int32() 1969 | external int nthread; 1970 | 1971 | /// quantize to this llama_ftype 1972 | @ffi.Int32() 1973 | external int ftype; 1974 | 1975 | /// output tensor type 1976 | @ffi.Int32() 1977 | external int output_tensor_type; 1978 | 1979 | /// itoken embeddings tensor type 1980 | @ffi.Int32() 1981 | external int token_embedding_type; 1982 | 1983 | /// allow quantizing non-f32/f16 tensors 1984 | @ffi.Bool() 1985 | external bool allow_requantize; 1986 | 1987 | /// quantize output.weight 1988 | @ffi.Bool() 1989 | external bool quantize_output_tensor; 1990 | 1991 | /// only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored 1992 | @ffi.Bool() 1993 | external bool only_copy; 1994 | 1995 | /// quantize all tensors to the default type 1996 | @ffi.Bool() 1997 | external bool pure; 1998 | 1999 | /// quantize 
to the same number of shards 2000 | @ffi.Bool() 2001 | external bool keep_split; 2002 | 2003 | /// pointer to importance matrix data 2004 | external ffi.Pointer imatrix; 2005 | 2006 | /// pointer to vector containing overrides 2007 | external ffi.Pointer kv_overrides; 2008 | } 2009 | 2010 | /// grammar types 2011 | final class llama_grammar extends ffi.Opaque {} 2012 | 2013 | /// grammar element type 2014 | abstract class llama_gretype { 2015 | /// end of rule definition 2016 | static const int LLAMA_GRETYPE_END = 0; 2017 | 2018 | /// start of alternate definition for rule 2019 | static const int LLAMA_GRETYPE_ALT = 1; 2020 | 2021 | /// non-terminal element: reference to rule 2022 | static const int LLAMA_GRETYPE_RULE_REF = 2; 2023 | 2024 | /// terminal element: character (code point) 2025 | static const int LLAMA_GRETYPE_CHAR = 3; 2026 | 2027 | /// inverse char(s) ([^a], [^a-b] [^abc]) 2028 | static const int LLAMA_GRETYPE_CHAR_NOT = 4; 2029 | 2030 | /// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to 2031 | /// be an inclusive range ([a-z]) 2032 | static const int LLAMA_GRETYPE_CHAR_RNG_UPPER = 5; 2033 | 2034 | /// modifies a preceding LLAMA_GRETYPE_CHAR or 2035 | /// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) 2036 | static const int LLAMA_GRETYPE_CHAR_ALT = 6; 2037 | 2038 | /// any character (.) 2039 | static const int LLAMA_GRETYPE_CHAR_ANY = 7; 2040 | } 2041 | 2042 | final class llama_grammar_element extends ffi.Struct { 2043 | @ffi.Int32() 2044 | external int type; 2045 | 2046 | /// Unicode code point or rule ID 2047 | @ffi.Uint32() 2048 | external int value; 2049 | } 2050 | 2051 | /// performance timing information 2052 | final class llama_timings extends ffi.Struct { 2053 | @ffi.Double() 2054 | external double t_start_ms; 2055 | 2056 | @ffi.Double() 2057 | external double t_end_ms; 2058 | 2059 | @ffi.Double() 2060 | external double t_load_ms; 2061 | 2062 | @ffi.Double() 2063 | external double t_sample_ms; 2064 | 2065 | @ffi.Double() 2066 | external double t_p_eval_ms; 2067 | 2068 | @ffi.Double() 2069 | external double t_eval_ms; 2070 | 2071 | @ffi.Int32() 2072 | external int n_sample; 2073 | 2074 | @ffi.Int32() 2075 | external int n_p_eval; 2076 | 2077 | @ffi.Int32() 2078 | external int n_eval; 2079 | } 2080 | 2081 | /// used in chat template 2082 | final class llama_chat_message extends ffi.Struct { 2083 | external ffi.Pointer role; 2084 | 2085 | external ffi.Pointer content; 2086 | } 2087 | 2088 | /// numa strategies 2089 | abstract class ggml_numa_strategy { 2090 | static const int GGML_NUMA_STRATEGY_DISABLED = 0; 2091 | static const int GGML_NUMA_STRATEGY_DISTRIBUTE = 1; 2092 | static const int GGML_NUMA_STRATEGY_ISOLATE = 2; 2093 | static const int GGML_NUMA_STRATEGY_NUMACTL = 3; 2094 | static const int GGML_NUMA_STRATEGY_MIRROR = 4; 2095 | static const int GGML_NUMA_STRATEGY_COUNT = 5; 2096 | } 2097 | 2098 | /// Information associated with an individual cell in the KV cache view. 2099 | final class llama_kv_cache_view_cell extends ffi.Struct { 2100 | /// The position for this cell. Takes KV cache shifts into account. 2101 | /// May be negative if the cell is not populated. 2102 | @llama_pos() 2103 | external int pos; 2104 | } 2105 | 2106 | /// An updateable view of the KV cache. 2107 | final class llama_kv_cache_view extends ffi.Struct { 2108 | /// Number of KV cache cells. This will be the same as the context size. 
2109 | @ffi.Int32() 2110 | external int n_cells; 2111 | 2112 | /// Maximum number of sequences that can exist in a cell. It's not an error 2113 | /// if there are more sequences in a cell than this value, however they will 2114 | /// not be visible in the view cells_sequences. 2115 | @ffi.Int32() 2116 | external int n_seq_max; 2117 | 2118 | /// Number of tokens in the cache. For example, if there are two populated 2119 | /// cells, the first with 1 sequence id in it and the second with 2 sequence 2120 | /// ids then you'll have 3 tokens. 2121 | @ffi.Int32() 2122 | external int token_count; 2123 | 2124 | /// Number of populated cache cells. 2125 | @ffi.Int32() 2126 | external int used_cells; 2127 | 2128 | /// Maximum contiguous empty slots in the cache. 2129 | @ffi.Int32() 2130 | external int max_contiguous; 2131 | 2132 | /// Index to the start of the max_contiguous slot range. Can be negative 2133 | /// when cache is full. 2134 | @ffi.Int32() 2135 | external int max_contiguous_idx; 2136 | 2137 | /// Information for an individual cell. 2138 | external ffi.Pointer cells; 2139 | 2140 | /// The sequences for each cell. There will be n_seq_max items per cell. 2141 | external ffi.Pointer cells_sequences; 2142 | } 2143 | 2144 | typedef ggml_log_callback 2145 | = ffi.Pointer>; 2146 | typedef ggml_log_callbackFunction = ffi.Void Function(ffi.Int32 level, 2147 | ffi.Pointer text, ffi.Pointer user_data); 2148 | typedef Dartggml_log_callbackFunction = void Function( 2149 | int level, ffi.Pointer text, ffi.Pointer user_data); 2150 | 2151 | abstract class ggml_log_level { 2152 | static const int GGML_LOG_LEVEL_ERROR = 2; 2153 | static const int GGML_LOG_LEVEL_WARN = 3; 2154 | static const int GGML_LOG_LEVEL_INFO = 4; 2155 | static const int GGML_LOG_LEVEL_DEBUG = 5; 2156 | } 2157 | 2158 | typedef FILE = _IO_FILE; 2159 | 2160 | final class _IO_FILE extends ffi.Struct { 2161 | @ffi.Int() 2162 | external int _flags; 2163 | 2164 | external ffi.Pointer _IO_read_ptr; 2165 | 2166 | external ffi.Pointer _IO_read_end; 2167 | 2168 | external ffi.Pointer _IO_read_base; 2169 | 2170 | external ffi.Pointer _IO_write_base; 2171 | 2172 | external ffi.Pointer _IO_write_ptr; 2173 | 2174 | external ffi.Pointer _IO_write_end; 2175 | 2176 | external ffi.Pointer _IO_buf_base; 2177 | 2178 | external ffi.Pointer _IO_buf_end; 2179 | 2180 | external ffi.Pointer _IO_save_base; 2181 | 2182 | external ffi.Pointer _IO_backup_base; 2183 | 2184 | external ffi.Pointer _IO_save_end; 2185 | 2186 | external ffi.Pointer<_IO_marker> _markers; 2187 | 2188 | external ffi.Pointer<_IO_FILE> _chain; 2189 | 2190 | @ffi.Int() 2191 | external int _fileno; 2192 | 2193 | @ffi.Int() 2194 | external int _flags2; 2195 | 2196 | @__off_t() 2197 | external int _old_offset; 2198 | 2199 | @ffi.UnsignedShort() 2200 | external int _cur_column; 2201 | 2202 | @ffi.SignedChar() 2203 | external int _vtable_offset; 2204 | 2205 | @ffi.Array.multi([1]) 2206 | external ffi.Array _shortbuf; 2207 | 2208 | external ffi.Pointer<_IO_lock_t> _lock; 2209 | 2210 | @__off64_t() 2211 | external int _offset; 2212 | 2213 | external ffi.Pointer<_IO_codecvt> _codecvt; 2214 | 2215 | external ffi.Pointer<_IO_wide_data> _wide_data; 2216 | 2217 | external ffi.Pointer<_IO_FILE> _freeres_list; 2218 | 2219 | external ffi.Pointer _freeres_buf; 2220 | 2221 | @ffi.Size() 2222 | external int __pad5; 2223 | 2224 | @ffi.Int() 2225 | external int _mode; 2226 | 2227 | @ffi.Array.multi([20]) 2228 | external ffi.Array _unused2; 2229 | } 2230 | 2231 | final class _IO_marker extends ffi.Opaque {} 
2232 | 
2233 | typedef __off_t = ffi.Long;
2234 | typedef Dart__off_t = int;
2235 | typedef _IO_lock_t = ffi.Void;
2236 | typedef Dart_IO_lock_t = void;
2237 | typedef __off64_t = ffi.Long;
2238 | typedef Dart__off64_t = int;
2239 | 
2240 | final class _IO_codecvt extends ffi.Opaque {}
2241 | 
2242 | final class _IO_wide_data extends ffi.Opaque {}
2243 | 
2244 | const int LLAMA_DEFAULT_SEED = 4294967295;
2245 | 
2246 | const int LLAMA_MAX_RNG_STATE = 65536;
2247 | 
2248 | const int LLAMA_FILE_MAGIC_GGLA = 1734831201;
2249 | 
2250 | const int LLAMA_FILE_MAGIC_GGSN = 1734833006;
2251 | 
2252 | const int LLAMA_FILE_MAGIC_GGSQ = 1734833009;
2253 | 
2254 | const int LLAMA_SESSION_MAGIC = 1734833006;
2255 | 
2256 | const int LLAMA_SESSION_VERSION = 6;
2257 | 
2258 | const int LLAMA_STATE_SEQ_MAGIC = 1734833009;
2259 | 
2260 | const int LLAMA_STATE_SEQ_VERSION = 1;
2261 | 
--------------------------------------------------------------------------------
/lib/src/llama_params.dart:
--------------------------------------------------------------------------------
1 | /// Params holder, similar to `gpt_params` in `common/common.h`.
2 | final class LlamaParams {
3 |   final int? seed;
4 |   final int? nThread;
5 |   final int? nThreadBatch;
6 |   final int? nPredict;
7 |   final int? nCtx;
8 |   final int? nBatch;
9 |   final int? nGpuLayers;
10 |   final int? mainGpu;
11 |   final bool embedding;
12 |   final int numa;
13 | 
14 |   const LlamaParams({
15 |     this.seed,
16 |     this.nThread,
17 |     this.nThreadBatch,
18 |     this.nPredict,
19 |     this.nCtx,
20 |     this.nBatch,
21 |     this.nGpuLayers,
22 |     this.mainGpu,
23 |     this.embedding = false,
24 |     this.numa = 0,
25 |   });
26 | }
27 | 
--------------------------------------------------------------------------------
/lib/src/native_llama.dart:
--------------------------------------------------------------------------------
1 | import 'dart:convert' show utf8;
2 | import 'dart:ffi' as ffi;
3 | import 'dart:io' show stderr;
4 | import 'dart:math' show max;
5 | 
6 | import 'common.dart' as c;
7 | import 'ffi.dart';
8 | import 'lib_llama_cpp.dart' as llama_cpp;
9 | import 'llama_params.dart';
10 | import 'sampling.dart';
11 | 
12 | bool _shouldAddBosToken(ffi.Pointer<llama_cpp.llama_model> model) {
13 |   final addBos = llama_cpp.llama_add_bos_token(model);
14 |   return addBos != -1
15 |       ? addBos != 0
16 |       : llama_cpp.llama_vocab_type1(model) == 0; // LLAMA_VOCAB_TYPE_SPM
17 | }
18 | 
19 | /// A class representing the native llama data structures, running in a separate isolate.
20 | final class NativeLLama {
21 |   static const engTag = '__end__';
22 |   static const closeTag = '__close__';
23 | 
24 |   final ffi.Pointer<llama_cpp.llama_model> model;
25 |   final ffi.Pointer<llama_cpp.llama_context> ctx;
26 |   final llama_cpp.llama_batch batch;
27 |   final CharArray cStr;
28 |   final bool verbose;
29 |   final tokenBuf = TokenArray(size: 64);
30 |   final array = TokenDataArray(512);
31 | 
32 |   NativeLLama._(
33 |     this.model,
34 |     this.ctx,
35 |     this.batch,
36 |     this.cStr,
37 |     this.verbose,
38 |   );
39 | 
40 |   factory NativeLLama(
41 |     String path,
42 |     LlamaParams params, {
43 |     bool verbose = false,
44 |   }) {
45 |     final cStr = CharArray.from(path);
46 |     final (model, ctx) = c.loadModel(cStr, params);
47 |     print('add_bos: ${_shouldAddBosToken(model)}');
48 | 
49 |     final batchSize = llama_cpp.llama_n_batch(ctx);
50 |     final batch = llama_cpp.llama_batch_init(batchSize, 0, 1);
51 | 
52 |     return NativeLLama._(
53 |       model,
54 |       ctx,
55 |       batch,
56 |       cStr,
57 |       verbose,
58 |     );
59 |   }
60 | 
61 |   /// Frees native resources; must be called explicitly.
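  ///
  /// Typical call order, as a minimal sketch (the model path is hypothetical;
  /// `engTag` marks the end of the stream produced by [generate]):
  ///
  /// ```dart
  /// final llama = NativeLLama('/path/to/model.gguf', const LlamaParams());
  /// await for (final bytes in llama.generate('Hello')) {
  ///   if (utf8.decode(bytes, allowMalformed: true) == NativeLLama.engTag) {
  ///     break; // end-of-generation marker
  ///   }
  ///   stdout.add(bytes); // raw bytes, so multi-token words stay intact
  /// }
  /// llama.dispose();
  /// ```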
62 |   void dispose() {
63 |     array.dispose();
64 |     tokenBuf.dispose();
65 |     cStr.dispose();
66 | 
67 |     llama_cpp.llama_batch_free(batch);
68 |     llama_cpp.llama_free(ctx);
69 |     llama_cpp.llama_free_model(model);
70 |     llama_cpp.llama_backend_free();
71 |   }
72 | 
73 |   void _log(String str) {
74 |     final leadingNewLine = str.startsWith('\n');
75 |     stderr.writeln(leadingNewLine ? str.replaceFirst('\n', '\n ') : ' $str');
76 |   }
77 | 
78 |   /// Generates a token stream for [prompt].
79 |   /// Some models may return two tokens to represent a single word,
80 |   /// so it is better to consume the raw byte stream.
81 |   Stream<List<int>> generate(
82 |     String prompt, {
83 |     int? nPrev,
84 |     int? nProbs,
85 |     int? topK,
86 |     double? topP,
87 |     double? minP,
88 |     double? tfsZ,
89 |     double? typicalP,
90 |     double? temperature,
91 |     int? penaltyLastN,
92 |     double? penaltyRepeat,
93 |     double? penaltyFrequency,
94 |     double? penaltyPresent,
95 |     int? mirostat,
96 |     double? mirostatTau,
97 |     double? mirostatEta,
98 |     bool? penalizeNewline,
99 |     String? samplersSequence,
100 |   }) async* {
101 |     cStr.pavedBy(prompt);
102 |     tokenBuf.pavedBy(model, cStr);
103 |     if (verbose) {
104 |       _log('prompt: "$prompt"');
105 |       _log('tokens: ${_tokensString(tokenBuf.pointerAt(0), tokenBuf.length)}');
106 |     }
107 |     final eosToken = llama_cpp.llama_token_eos(model);
108 | 
109 |     var num = 0;
110 |     var code = 0;
111 | 
112 |     final defaultParams = const SamplingParams();
113 |     final params = SamplingParams(
114 |       nPrev: nPrev ?? defaultParams.nPrev,
115 |       nProbs: nProbs ?? defaultParams.nProbs,
116 |       topK: topK ?? defaultParams.topK,
117 |       topP: topP ?? defaultParams.topP,
118 |       minP: minP ?? defaultParams.minP,
119 |       tfsZ: tfsZ ?? defaultParams.tfsZ,
120 |       typicalP: typicalP ?? defaultParams.typicalP,
121 |       temperature: temperature ?? defaultParams.temperature,
122 |       penaltyLastN: penaltyLastN ?? defaultParams.penaltyLastN,
123 |       penaltyRepeat: penaltyRepeat ?? defaultParams.penaltyRepeat,
124 |       penaltyFrequency: penaltyFrequency ?? defaultParams.penaltyFrequency,
125 |       penaltyPresent: penaltyPresent ?? defaultParams.penaltyPresent,
126 |       mirostat: mirostat ?? defaultParams.mirostat,
127 |       mirostatTau: mirostatTau ?? defaultParams.mirostatTau,
128 |       mirostatEta: mirostatEta ?? defaultParams.mirostatEta,
129 |       penalizeNewline: penalizeNewline ?? defaultParams.penalizeNewline,
130 |       samplersSequence: samplersSequence ?? defaultParams.samplersSequence,
131 |     );
132 |     if (verbose) {
133 |       _log('sampling:\n$params');
134 |       _log('sampling order:\n${params.samplingOrder}');
135 |       _log('generate: n_ctx = ${llama_cpp.llama_n_ctx(ctx)}, '
136 |           'n_batch = ${llama_cpp.llama_n_batch(ctx)}, '
137 |           'n_predict = %d, '
138 |           'n_keep = %d');
139 |     }
140 |     final ctxSampling = SamplingContext.from(params);
141 |     ctxSampling.acceptSampling(
142 |       ctx,
143 |       tokenBuf.toList(),
144 |       false,
145 |     );
146 |     llama_cpp.llama_reset_timings(ctx);
147 |     llama_cpp.llama_kv_cache_clear(ctx);
148 |     while ((code = _decodeBatch(num, num == 0)) == 0) {
149 |       if (verbose) {
150 |         _log('<<<<<<<<<');
151 |         _log('eval: ${_tokensString(tokenBuf.pointerAt(0), tokenBuf.length)}');
152 |       }
153 |       final tokenId = _sampleSampling(ctxSampling, batch.n_tokens - 1);
154 |       if (verbose) {
155 |         _log('sampled token(${params.mirostat}): ${"$tokenId".padLeft(8)}: ');
156 |       }
157 |       if (tokenId == eosToken) {
158 |         /// 3 - finished by ending.
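        /// (The loop above only runs while _decodeBatch() returns 0, so the
        /// final value of `code` is either this 3 for end-of-stream or a
        /// non-zero error code from llama_decode; see the log after the loop.)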
159 | code = 3; 160 | break; 161 | } 162 | final token = cStr.tokenBytes(model, tokenId); 163 | yield token; 164 | ctxSampling.acceptSampling(ctx, [tokenId], true); 165 | if (verbose) { 166 | final str = _tokensString( 167 | ctxSampling.penaltyPointer, 168 | ctxSampling.usedSize, 169 | ); 170 | _log('\nlast: $str'); 171 | _log('>>>>>>>>>'); 172 | } 173 | 174 | num += batch.n_tokens; 175 | batch.n_tokens = 0; 176 | tokenBuf 177 | ..clear() 178 | ..add(tokenId); 179 | } 180 | llama_cpp.llama_print_timings(ctx); 181 | final n = llama_cpp.llama_get_kv_cache_token_count(ctx); 182 | _log("sample llama logits finished with '$code', kv: $n."); 183 | ctxSampling.free(); 184 | yield utf8.encode(engTag); 185 | } 186 | 187 | int _decodeBatch(int count, bool init) { 188 | // evaluate the initial prompt 189 | c.addBatchPos(batch, tokenBuf.toList(), count, !init); 190 | if (init) { 191 | batch.logits[batch.n_tokens - 1] = 1; 192 | } 193 | // TODO: What if llama_decode return 1? 194 | return llama_cpp.llama_decode(ctx, batch); 195 | } 196 | 197 | int _sampleSampling(SamplingContext ctxSampling, int idx, 198 | [bool isResampling = false]) { 199 | final params = ctxSampling.params; 200 | final model = llama_cpp.llama_get_model(ctx); 201 | final nVocab = llama_cpp.llama_n_vocab(model); 202 | final temp = params.temperature; 203 | final penaltyRepeat = params.penaltyRepeat; 204 | final penaltyFrequency = params.penaltyFrequency; 205 | final penaltyPresent = params.penaltyPresent; 206 | final mirostat = params.mirostat; 207 | final mirostatTau = params.mirostatTau; 208 | final mirostatEta = params.mirostatEta; 209 | final penalizeNewline = params.penalizeNewline; 210 | 211 | final logits = llama_cpp.llama_get_logits_ith(ctx, idx); 212 | final logitBias = params.logitBias?.entries; 213 | logitBias?.forEach((e) { 214 | logits[e.key] += e.value; 215 | }); 216 | array.pavedBy(logits, nVocab); 217 | // apply penalties 218 | final usedSize = ctxSampling.usedSize; 219 | if (usedSize > 0) { 220 | final nl = llama_cpp.llama_token_nl(model); 221 | final logit = logits[nl]; 222 | llama_cpp.llama_sample_repetition_penalties( 223 | ctx, 224 | array.pointer, 225 | ctxSampling.penaltyPointer, 226 | usedSize, 227 | penaltyRepeat, 228 | penaltyFrequency, 229 | penaltyPresent, 230 | ); 231 | if (!penalizeNewline) { 232 | for (var i = 0; i < array.length; i++) { 233 | final data = array[i]; 234 | if (data.id == nl) { 235 | final old = data.logit; 236 | array.setLogit(i, logit); 237 | final v = array[i].logit; 238 | _log("$i: $old -> $v"); 239 | break; 240 | } 241 | } 242 | } 243 | } 244 | 245 | final grammar = ctxSampling.grammar; 246 | if (isResampling && grammar != null) { 247 | llama_cpp.llama_sample_grammar(ctx, array.pointer, grammar); 248 | } 249 | var id = 0; 250 | if (temp < 0.0) { 251 | llama_cpp.llama_sample_softmax(ctx, array.pointer); 252 | id = array[0].id; 253 | } else if (temp == 0.0) { 254 | id = llama_cpp.llama_sample_token_greedy(ctx, array.pointer); 255 | } else { 256 | if (mirostat == 1) { 257 | const mirostatM = 100; 258 | llama_cpp.llama_sample_temp(ctx, array.pointer, temp); 259 | id = llama_cpp.llama_sample_token_mirostat(ctx, array.pointer, 260 | mirostatTau, mirostatEta, mirostatM, ctxSampling.mirostatMu); 261 | } else if (mirostat == 2) { 262 | llama_cpp.llama_sample_temp(ctx, array.pointer, temp); 263 | id = llama_cpp.llama_sample_token_mirostat_v2(ctx, array.pointer, 264 | mirostatTau, mirostatEta, ctxSampling.mirostatMu); 265 | } else { 266 | final minKeep = max(1, params.nProbs); 267 | 
_samplerQueue(params, nVocab, minKeep); 268 | id = llama_cpp.llama_sample_token(ctx, array.pointer); 269 | } 270 | } 271 | 272 | if (grammar != null && !isResampling) { 273 | // TODO: consider grammar 274 | } 275 | 276 | return id; 277 | } 278 | 279 | void _samplerQueue(SamplingParams params, int capacity, int minKeep) { 280 | final topK = params.topK <= 0 ? capacity : params.topK; 281 | for (final i in params.samplersSequence.codeUnits) { 282 | switch (i) { 283 | case kChar: 284 | llama_cpp.llama_sample_top_k(ctx, array.pointer, topK, minKeep); 285 | break; 286 | case fChar: 287 | llama_cpp.llama_sample_tail_free( 288 | ctx, 289 | array.pointer, 290 | params.tfsZ, 291 | minKeep, 292 | ); 293 | break; 294 | case yChar: 295 | llama_cpp.llama_sample_typical( 296 | ctx, 297 | array.pointer, 298 | params.typicalP, 299 | minKeep, 300 | ); 301 | break; 302 | case pChar: 303 | llama_cpp.llama_sample_top_p( 304 | ctx, 305 | array.pointer, 306 | params.topP, 307 | minKeep, 308 | ); 309 | break; 310 | case mChar: 311 | llama_cpp.llama_sample_min_p( 312 | ctx, 313 | array.pointer, 314 | params.minP, 315 | minKeep, 316 | ); 317 | break; 318 | case tChar: 319 | llama_cpp.llama_sample_temp( 320 | ctx, 321 | array.pointer, 322 | params.temperature, 323 | ); 324 | break; 325 | default: 326 | break; 327 | } 328 | } 329 | } 330 | 331 | String _tokensString(ffi.Pointer pointer, int len) { 332 | final buf = StringBuffer('['); 333 | for (var i = 0; i < len; i++) { 334 | final id = pointer[i]; 335 | buf.write("'${cStr.tokenString(model, id)}':$id, "); 336 | } 337 | buf.write(']'); 338 | return buf.toString(); 339 | } 340 | } 341 | -------------------------------------------------------------------------------- /lib/src/sampling.dart: -------------------------------------------------------------------------------- 1 | import 'dart:ffi' as ffi; 2 | import 'dart:math'; 3 | 4 | import 'package:ffi/ffi.dart'; 5 | 6 | import 'lib_llama_cpp.dart' as llama_cpp; 7 | 8 | extension _FloatEx on double { 9 | String get str => toStringAsFixed(3); 10 | } 11 | 12 | const fChar = 0x66; 13 | const kChar = 0x6b; 14 | const yChar = 0x79; 15 | const pChar = 0x70; 16 | const mChar = 0x6d; 17 | const tChar = 0x74; 18 | 19 | class SamplingParams { 20 | /// number of previous tokens to remember 21 | final int nPrev; 22 | 23 | /// if greater than 0, output the probabilities of top n_probs tokens. 24 | final int nProbs; 25 | 26 | /// <= 0 to use vocab size 27 | final int topK; 28 | final double topP; 29 | final double minP; 30 | final double tfsZ; 31 | final double typicalP; 32 | final double temperature; 33 | final int penaltyLastN; 34 | final double penaltyRepeat; 35 | final double penaltyFrequency; 36 | final double penaltyPresent; 37 | final int mirostat; 38 | final double mirostatTau; 39 | final double mirostatEta; 40 | final bool penalizeNewline; 41 | final String samplersSequence; 42 | 43 | /// optional BNF-like grammar to constrain sampling 44 | final String? grammar; 45 | 46 | /// string to help guidance 47 | final String? cfgNegativePrompt; 48 | 49 | /// how strong is guidance 50 | final double cfgScale; 51 | final Map? logitBias; 52 | final List? 
penaltyPromptTokens; 53 | final bool usePenaltyPromptTokens; 54 | 55 | const SamplingParams({ 56 | this.nPrev = 64, 57 | this.nProbs = 0, 58 | this.topK = 40, 59 | this.topP = 0.95, 60 | this.minP = 0.05, 61 | this.tfsZ = 1.00, 62 | this.typicalP = 1.0, 63 | this.temperature = 0.80, 64 | this.penaltyLastN = 64, 65 | this.penaltyRepeat = 1.10, 66 | this.penaltyFrequency = 0.00, 67 | this.penaltyPresent = 0.00, 68 | this.mirostat = 0, 69 | this.mirostatTau = 5.00, 70 | this.mirostatEta = 0.10, 71 | this.penalizeNewline = true, 72 | // top_k, tail_free, typical_p, top_p, min_p, temp 73 | this.samplersSequence = "kfypmt", 74 | this.grammar, 75 | this.cfgNegativePrompt, 76 | this.cfgScale = 1.0, 77 | this.logitBias, 78 | this.penaltyPromptTokens, 79 | this.usePenaltyPromptTokens = false, 80 | }); 81 | 82 | String get samplingString => 83 | "\trepeat_last_n = $penaltyLastN, repeat_penalty = ${penaltyRepeat.str}, frequency_penalty = " 84 | "${penaltyFrequency.str}, presence_penalty = ${penaltyPresent.str}\n" 85 | "\ttop_k = $topK, tfs_z = ${tfsZ.str}, top_p = ${topP.str}, min_p = ${minP.str}, " 86 | "typical_p = ${typicalP.str}, temp = ${temperature.str}\n" 87 | "\tmirostat = $mirostat, mirostat_lr = ${mirostatEta.str}, mirostat_ent = ${mirostatTau.str}"; 88 | 89 | String get samplingOrder { 90 | final buf = StringBuffer('CFG -> Penalties '); 91 | if (mirostat == 0) { 92 | for (final c in samplersSequence.codeUnits) { 93 | final seq = _samplersSeq[c]; 94 | if (seq != null) { 95 | buf.write(seq); 96 | } 97 | } 98 | } else { 99 | buf.write('-> mirostat '); 100 | } 101 | return buf.toString(); 102 | } 103 | 104 | @override 105 | String toString() => samplingString; 106 | } 107 | 108 | const _samplersSeq = { 109 | fChar: '-> tfs_z ', 110 | kChar: '-> top_k ', 111 | yChar: '-> typical_p ', 112 | pChar: '-> top_p ', 113 | mChar: '-> min_p ', 114 | tChar: '-> temp ', 115 | }; 116 | 117 | class SamplingContext { 118 | final SamplingParams params; 119 | final ffi.Pointer mirostatMu; 120 | final ffi.Pointer? grammar; 121 | final ffi.Pointer _prev; 122 | final int prevSize; 123 | final int usedSize; 124 | 125 | SamplingContext._( 126 | this.params, 127 | this.mirostatMu, 128 | this.grammar, 129 | this._prev, 130 | this.prevSize, 131 | this.usedSize, 132 | ); 133 | 134 | factory SamplingContext.from(SamplingParams params) { 135 | ffi.Pointer? grammar; 136 | if (params.grammar != null) {} 137 | final mu = calloc.allocate(ffi.sizeOf()); 138 | final (p, len) = _createNativeTokens(params); 139 | final lastN = params.penaltyLastN; 140 | final penaltyLastN = lastN < 0 ? 
params.nPrev : lastN; 141 | final usedSize = min(len, penaltyLastN); 142 | 143 | return SamplingContext._( 144 | params, 145 | mu, 146 | grammar, 147 | p, 148 | len, 149 | usedSize, 150 | ); 151 | } 152 | 153 | void free() { 154 | final g = grammar; 155 | if (g != null) { 156 | llama_cpp.llama_grammar_free(g); 157 | } 158 | calloc.free(mirostatMu); 159 | calloc.free(_prev); 160 | } 161 | 162 | void reset() { 163 | final g = grammar; 164 | if (g != null) { 165 | llama_cpp.llama_grammar_free(g); 166 | } 167 | for (var i = 0; i < prevSize; i++) { 168 | _prev[i] = 0; 169 | } 170 | } 171 | 172 | int get lastSampledToken => _prev[prevSize - 1]; 173 | 174 | ffi.Pointer get penaltyPointer => 175 | _prev + prevSize - usedSize; 176 | 177 | /// A ring buffer to append a list of tokens 178 | void acceptSampling( 179 | ffi.Pointer ctx, 180 | List ids, 181 | bool applyGrammar, 182 | ) { 183 | final n = min(ids.length, prevSize); 184 | for (var i = 0; i < prevSize - n; i++) { 185 | (_prev + i).value = _prev[i + n]; 186 | } 187 | for (var i = 0; i < n; i++) { 188 | (_prev + i + prevSize - n).value = ids[i]; 189 | } 190 | 191 | if (grammar != null && applyGrammar) { 192 | // TODO: consider grammar 193 | } 194 | } 195 | } 196 | 197 | (ffi.Pointer, int) _createNativeTokens( 198 | SamplingParams params) { 199 | final promptTokens = params.penaltyPromptTokens ?? []; 200 | final tokens = params.usePenaltyPromptTokens && promptTokens.isNotEmpty 201 | ? promptTokens 202 | : List.filled(params.nPrev, 0); 203 | final p = calloc.allocate( 204 | ffi.sizeOf() * tokens.length); 205 | for (var i = 0; i < tokens.length; i++) { 206 | p[i] = tokens[i]; 207 | } 208 | return (p, tokens.length); 209 | } 210 | -------------------------------------------------------------------------------- /pubspec.yaml: -------------------------------------------------------------------------------- 1 | name: llama_cpp 2 | description: A dart binding for llama.cpp library, bringing AI to dart world. 3 | version: 1.2.0 4 | repository: https://github.com/lindeer/llama-cpp 5 | 6 | topics: 7 | - ai 8 | - nlp 9 | - llm 10 | 11 | environment: 12 | sdk: '>=3.3.0 <4.0.0' 13 | 14 | dependencies: 15 | ffi: ^2.1.3 16 | native_assets_cli: ^0.8.0 17 | 18 | dev_dependencies: 19 | ffigen: ^11.0.0 20 | lints: ^3.0.0 21 | test: ^1.21.0 22 | -------------------------------------------------------------------------------- /test/data/text.txt: -------------------------------------------------------------------------------- 1 | 人生的意义是什么? 
2 | 老板把几张桌子拼在了一起,很快桌子上变成了一顿丰盛大餐的节日狂欢:盘子和碟子乱七八糟,闪着黄色的油脂、四处碎裂的甲壳动物、面包屑、焦黑的骨头、混乱不堪的瓶子和杯子,几乎所有人都建议不要按预先设想的计划旅行。岛上的每个地方都可以在几个小时内乘车到达。纵横交错的道路能更好地了解西西里的情况。晚些时候,港口辖区的人都走光了,只有零散的几个人坐在桌子旁,这才第一次听到海水的荡漾。港口边一个房子的格子阳台上,一个老人还站着,就像几个小时前一样。他在波斯帽下一动不动,几乎无法把他从倚靠的半明半暗中分辨出来,喧嚣肆意从他身边掠过。记不起是谁先看到的他。现在轮到唐·卡利奇奥(Don Calicchio)开始谈论阳台上的那个人。那个人和岛上居民的典型模型:沉默寡言,纹丝不动,暗中观察,满是忧郁的怀疑。他肯定对发生的事情知之甚少,但这些事情其实塑造了他的思考和生活方式。他说,历史深深地扎根在人们的血液中,三千年来永恒的往复。一遍又一遍的征服。开始是多里亚人和腓尼基人,然后是希腊僭主(Tyrannen)和罗马执政官(Prokonsuln)。接着是伊斯兰宗教领袖(Imame)和诺曼诸侯、霍亨斯陶芬和安茹家族的战事,又接着是加里波第(Garibaldi)和皮埃蒙特小国王1,最后是墨索里尼、德国人和美国的登陆舰队。他总结说,所有这些征服者,无论他们来了多久,都意味着西西里的屈服。 3 | -------------------------------------------------------------------------------- /test/data/values.txt: -------------------------------------------------------------------------------- 1 | [0.002748, 0.018709, -0.004901, 0.033228, 0.019280, -0.007864, 0.009661, 0.021122, -0.004441, 0.060076, -0.046824, 0.039386, -0.018559, -0.016204, -0.015773, 0.037257, 0.008054, 0.047608, -0.031019, -0.028022, 0.039248, -0.067525, 0.022590, 0.019823, 0.012489, 0.097309, -0.013562, 0.000971, 0.005299, 0.017474, 0.013749, 0.058956, 0.031092, 0.004360, -0.039856, 0.047605, -0.009411, 0.016901, -0.000576, -0.039155, -0.008896, 0.027975, 0.004059, -0.012684, 0.041530, -0.028001, -0.002777, -0.004369, -0.011253, 0.056504, 0.039857, 0.193007, 0.002377, -0.007906, -0.005569, -0.011621, 0.058219, -0.026838, -0.003529, -0.002906, -0.043625, 0.008552, -0.013950, 0.094324, -0.016286, -0.006553, 0.024853, 0.031572, -0.015191, 0.016623, 0.024046, -0.021184, 0.007180, -0.031437, 0.003170, 0.020737, -0.036690, 0.022023, -0.041029, -0.003451, -0.021887, -0.024111, -0.029427, -0.013289, 0.010364, 0.063137, -0.001661, -0.036666, 0.011898, 0.052438, -0.013414, -0.005930, -0.077403, 0.000270, 0.041493, -0.052804, 0.003009, -0.052771, 0.037389, 0.005920, -0.016384, -0.059444, -0.091977, 0.043008, 0.035662, -0.039837, -0.049678, 0.028091, -0.028031, -0.015087, 0.000492, -0.013573, 0.022487, -0.058391, -0.036355, -0.035800, 0.014695, -0.021363, -0.068123, -0.029504, -0.022519, -0.032145, 0.054870, -0.061875, -0.016571, -0.009690, 0.006072, 0.063474, -0.026058, 0.057749, 0.016925, -0.027694, 0.004955, -0.012908, -0.053931, 0.000664, -0.016369, -0.031594, 0.034740, -0.046654, 0.049105, 0.040210, 0.008307, 0.016486, 0.006993, -0.003738, 0.023174, -0.004705, -0.009223, 0.020779, 0.036851, 0.037139, 0.041900, 0.007291, -0.003721, 0.046447, -0.025414, -0.003266, 0.038864, -0.022950, -0.035291, -0.009215, 0.026389, 0.044462, 0.061325, -0.009947, -0.060005, -0.003428, 0.018673, 0.018146, 0.029088, -0.042467, -0.013687, -0.044893, 0.005011, 0.028534, 0.038162, -0.015792, -0.001439, -0.027494, -0.038440, -0.023412, 0.024505, 0.029460, -0.038159, -0.003498, -0.015563, -0.030281, 0.028848, -0.048190, -0.001176, 0.024960, 0.003813, 0.038229, 0.010367, 0.036894, -0.030424, -0.011569, -0.009186, -0.030061, -0.019186, 0.053846, 0.033510, 0.031724, 0.001231, 0.056447, -0.027350, -0.003869, 0.012006, 0.012895, 0.003787, 0.017241, 0.008408, -0.009188, -0.026191, 0.053303, -0.030743, 0.004043, 0.007152, 0.029318, 0.076870, -0.024694, -0.031857, 0.007447, -0.023387, -0.027921, 0.026360, -0.036934, 0.050190, -0.030790, 0.000850, -0.049292, -0.061593, 0.014353, -0.006601, -0.004607, -0.014405, 0.034925, -0.007874, -0.043756, -0.025181, -0.003196, -0.011138, -0.030946, -0.048551, -0.019051, -0.015092, -0.059014, 0.045878, -0.017963, 0.047651, 0.011760, -0.013354, -0.015118, -0.024310, 0.009942, 0.017399, -0.003552, -0.007770, 0.007850, -0.025595, -0.021572, 0.033562, 
-0.073414, -0.087019, -0.001477, 0.022035, -0.004415, 0.023038, 0.027339, 0.014867, 0.043897, -0.002117, -0.011837, 0.008205, -0.060910, -0.009666, -0.042757, 0.039289, -0.076998, 0.004185, 0.053701, -0.028735, -0.006410, -0.084053, -0.069257, -0.040670, 0.011475, 0.015218, -0.034976, 0.046817, 0.009171, -0.005876, -0.042356, 0.268232, -0.000565, 0.019373, 0.030529, 0.005708, 0.019563, -0.042336, -0.005031, 0.028721, -0.058943, 0.026717, 0.009334, 0.031044, 0.000609, 0.027129, -0.039009, -0.077983, -0.033732, 0.001198, -0.010092, -0.047769, 0.025994, -0.012584, 0.001873, 0.016263, 0.018904, -0.017769, 0.007070, 0.047912, -0.020331, 0.002351, -0.001581, -0.016761, -0.028518, 0.041098, 0.048682, -0.002197, 0.025649, 0.016151, 0.004874, -0.044923, 0.029337, 0.011326, -0.037092, -0.033458, -0.045397, 0.066314, -0.037488, -0.024838, -0.006526, 0.002329, 0.025500, -0.059235, 0.004991, -0.026183, 0.037629, 0.023049, -0.075650, 0.054310, -0.014055, -0.016602, -0.042717, -0.007421, 0.001940, -0.016317, 0.048993, -0.032705, -0.007555, -0.009179, 0.007807, 0.013147, -0.001102, 0.034557, 0.008237, 0.015749, -0.028601, 0.020702, 0.039658, -0.012211, -0.054478, 0.033384, -0.033875, -0.016401, 0.014640, 0.032412, -0.000189, -0.020176, -0.023004, 0.015024, -0.038237, -0.004677, -0.039297, -0.088269, 0.014198, 0.094159, -0.037502, -0.010474, 0.012121, 0.007789, -0.063851, -0.021236, -0.007159, -0.012587, -0.006184, 0.006667, 0.008835, -0.033254, -0.034641, 0.024939, 0.012026, -0.020599, -0.014796, 0.019221, -0.008406, 0.073629, 0.041525, -0.001637, -0.041848, -0.075896, 0.017085, -0.065918, -0.039616, 0.032568, 0.002438, 0.018990, 0.022434, -0.023249, -0.020664, 0.046687, -0.053676, -0.003266, -0.025215, 0.002564, 0.041080, -0.027405, 0.013426, 0.007414, -0.035200, -0.008517, -0.034125, -0.033089, -0.030880, -0.029554, 0.064523, -0.009705, -0.022755, 0.014105, 0.010406, 0.012919, 0.044742, -0.006403, -0.015069, 0.022184, -0.009986, 0.047907, -0.052769, 0.069356, 0.053625, 0.011936, -0.028786, -0.081952, 0.003949, 0.012676, -0.001549, -0.042265, 0.002343, 0.030193, 0.052115, 0.003902, -0.032469, -0.048285, -0.034673, -0.021152, -0.055607, -0.007042, -0.061425, -0.005131, -0.005833, -0.037384, -0.003194, -0.005009, 0.048388, -0.052405, -0.048759, 0.028373, 0.054570, 0.021434, -0.054493, -0.001462, 0.008312, -0.014067, -0.001033, -0.050160, -0.043146, -0.026299, -0.025467, -0.027003, -0.019606, -0.007006, 0.048621, -0.023143, 0.028375, -0.013187, -0.053275, 0.007346, 0.011086, -0.026073, -0.012925, 0.012604, -0.001384, 0.020499, -0.052935, -0.006661, 0.032204, -0.018983, -0.014345, -0.011039, -0.037490, 0.008708, 0.066112, 0.008486, -0.011448, 0.017040, -0.016320, -0.031394, 0.023015, -0.038683, -0.042025, 0.000194, 0.054627, -0.055844, 0.019194, 0.023265, -0.018689, -0.049089, 0.052777, -0.077946, -0.032353, 0.023846, 0.002256, -0.038973, 0.045957, -0.024696, -0.089713, -0.015449, -0.060195, 0.066317, -0.049429, 0.041355, 0.016323, -0.018458, -0.033797, -0.021226, 0.006238, 0.013123, -0.040336, -0.015303, 0.033016, -0.011223, 0.060414, -0.045488, 0.039477, -0.016573, 0.026991, 0.048341, -0.060210, 0.009705, -0.012491, -0.033483, 0.067254, -0.037381, 0.003392, -0.014488, -0.034642, -0.008514, 0.045176, -0.006417, 0.002323, -0.014924, -0.004272, 0.001454, 0.033591, 0.033394, -0.070911, 0.036816, -0.000787, -0.025466, 0.006998, -0.015088, 0.043901, -0.013712, -0.036422, -0.000961, -0.027177, 0.006389, 0.061785, 0.022760, -0.069621, -0.028772, 0.017829, -0.029324, -0.050558, -0.019491, -0.014544, 0.004695, 
0.033320, -0.005903, 0.016032, -0.049241, 0.004659, 0.025763, 0.013882, 0.017032, -0.020307, 0.040979, 0.081039, 0.050956, 0.018274, 0.015328, -0.022948, 0.025442, 0.009415, -0.037822, 0.021831, -0.025299, 0.054633, -0.028054, 0.028637, -0.046147, 0.003350, -0.050639, -0.020184, -0.045411, 0.011977, 0.046723, 0.027515, 0.015294, 0.041548, 0.024747, -0.047016, 0.021823, -0.049481, 0.065846, -0.000544, -0.013925, 0.055688, 0.077173, 0.052599, -0.026109, 0.002161, 0.028716, 0.014113, 0.020951, 0.014484, -0.017879, 0.027740, -0.041337, -0.036965, 0.032288, 0.014887, 0.060533, 0.073322, -0.037303, 0.007789, -0.001182, 0.001361, -0.025140, -0.011141, -0.027716, -0.032432, -0.000025, 0.049933, 0.000302, 0.044789, -0.002865, 0.094235, -0.030566, 0.010469, 0.002333, 0.040289, -0.005740, -0.047480, -0.015792, 0.009191, 0.021395, 0.003657, -0.036484, 0.004549, -0.001573, 0.008670, 0.039228, -0.007337, -0.037421, 0.035785, 0.003036, -0.001534, 0.046469, -0.002735, 0.042027, -0.038637, -0.036651, 0.135092, -0.005036, -0.035098, -0.033986, -0.035076, -0.016081, 0.021915, 0.082592, 0.006401, 0.013491, -0.008547, -0.054606, -0.007829, -0.012136, 0.034574, 0.049043, -0.050791, -0.059708, -0.021508, -0.039706, 0.000696, 0.006534, 0.009665, 0.001398, -0.011411, 0.020646, 0.010460, 0.004403, -0.004071, -0.004899, 0.012417, 0.014156, -0.053095, 0.049707, -0.048853, -0.034150, -0.060462, -0.042575, -0.020059, 0.027869, 0.025362, -0.037957, 0.012254, -0.009697, 0.046583, -0.020720, 0.039554, 0.063817, 0.085704, -0.036594, 0.018128, 0.035817, -0.043685, 0.029166, -0.001077, 0.031751, 0.004168, -0.021054, -0.028695, -0.016313, 0.015408, 0.000334, 0.084837, 0.012301, 0.059702, -0.025221, 0.026763, 0.041199, -0.036252, -0.031334, 0.044451, -0.071544, 0.002161] 2 | [-0.018143, 0.008852, -0.012980, -0.017932, 0.004372, -0.005681, 0.018440, -0.017154, -0.026233, 0.040064, -0.027006, 0.010853, -0.036699, 0.014419, -0.019105, 0.021649, -0.013456, 0.001155, -0.021316, 0.046883, 0.014310, 0.016424, -0.071366, 0.014739, 0.019437, -0.007247, 0.020553, -0.008878, 0.069753, 0.060115, -0.032874, 0.010057, 0.032123, -0.000345, 0.024318, 0.000402, -0.012160, -0.004174, 0.030701, -0.022710, -0.006988, -0.019127, 0.035932, 0.005848, -0.025857, -0.000143, 0.006460, 0.022864, 0.029600, 0.017288, -0.049645, 0.253160, -0.034791, -0.021239, 0.014576, 0.003085, 0.010481, -0.032058, 0.090785, 0.033968, 0.007878, 0.022380, -0.034588, 0.023633, 0.040676, 0.038403, -0.010366, 0.005184, 0.016591, -0.047127, -0.061293, 0.002111, 0.013041, -0.015932, 0.038606, 0.021251, -0.030351, -0.031395, 0.028123, 0.007267, -0.042682, -0.009201, -0.047993, -0.006835, 0.001766, 0.024612, -0.014592, 0.012323, 0.107377, 0.028368, -0.001037, -0.039827, -0.025936, -0.024666, -0.072203, -0.018609, 0.002930, -0.006727, -0.013415, -0.005397, -0.021885, -0.075212, 0.011520, -0.023104, 0.012526, -0.010788, -0.033539, 0.040659, -0.004011, -0.014619, 0.038324, -0.043777, 0.031286, -0.023233, 0.014507, 0.010989, -0.007007, -0.012117, 0.004525, -0.000170, -0.022546, 0.014299, -0.035408, -0.003327, 0.030012, -0.012777, 0.016996, 0.063880, -0.029673, 0.011598, 0.025941, -0.042055, 0.001732, 0.009818, -0.014632, -0.035351, -0.009402, -0.026646, 0.012423, -0.018959, 0.020423, -0.002210, -0.006109, 0.015463, 0.013806, -0.066499, 0.008910, 0.018189, -0.053944, -0.008109, -0.044242, 0.007260, -0.018921, 0.024620, -0.018877, -0.002768, 0.011710, 0.008474, -0.061399, -0.008161, -0.011569, -0.030398, -0.003328, 0.064313, 0.000425, -0.016365, -0.054703, 0.026104, 0.015441, 
-0.002947, -0.007322, -0.021591, -0.027526, -0.038455, 0.041345, 0.034893, -0.022149, -0.001833, -0.057281, 0.008519, 0.001363, 0.016069, -0.042331, 0.033602, 0.018698, 0.041008, -0.046121, -0.033649, 0.012806, -0.027645, -0.017044, 0.023213, -0.050048, -0.033277, -0.020886, -0.044509, -0.039000, -0.019332, -0.001629, -0.003827, 0.031668, -0.005092, -0.001875, 0.043424, 0.000626, -0.019863, 0.012846, 0.049299, 0.024386, 0.046971, -0.008021, 0.050799, 0.028351, 0.005843, -0.018338, 0.041342, -0.012484, 0.013896, -0.006079, 0.013040, -0.052899, -0.025937, -0.038179, -0.024130, 0.038773, -0.014598, -0.028820, -0.052534, -0.036298, -0.057635, 0.016938, -0.002892, -0.025019, 0.011015, 0.004795, -0.042835, 0.025884, 0.010080, 0.023427, -0.012004, -0.025530, 0.012286, -0.016133, 0.000858, 0.011696, 0.022822, -0.017968, 0.001757, 0.018143, -0.048050, -0.027925, 0.027805, -0.011089, -0.043770, -0.006697, -0.014253, 0.018511, -0.016123, 0.019746, 0.020176, -0.017052, -0.004586, -0.014835, -0.020798, -0.021490, 0.006280, -0.061022, -0.004235, -0.027600, 0.030015, -0.038244, 0.062919, -0.053732, -0.017938, -0.053406, 0.024520, 0.001105, -0.027806, 0.001512, -0.018585, 0.005771, 0.010302, 0.053443, 0.026496, -0.029605, -0.081924, -0.011144, 0.055886, 0.018785, 0.016905, -0.048825, 0.017786, -0.064784, -0.013671, 0.442080, -0.084582, 0.013849, -0.019666, -0.082373, 0.017085, -0.000916, 0.044251, 0.024448, 0.058969, -0.031943, 0.055358, -0.002865, -0.023817, -0.011372, 0.024001, 0.005060, -0.045001, 0.051598, -0.037835, -0.020254, 0.037380, 0.034399, 0.026193, 0.025360, 0.039737, 0.036441, -0.004631, -0.033750, 0.035122, 0.022928, 0.019629, -0.033454, -0.016481, 0.038379, -0.029493, 0.046083, 0.006995, -0.042532, -0.003677, 0.006150, -0.008367, 0.005011, 0.016327, -0.042770, 0.011053, 0.000615, -0.004791, -0.012687, -0.036017, -0.029267, -0.016272, -0.030642, -0.071663, 0.014216, -0.019710, -0.022685, -0.037142, -0.003084, 0.026906, 0.005628, -0.042816, 0.001880, -0.005221, 0.020948, -0.050402, 0.020020, 0.005918, -0.046436, 0.024602, -0.038509, 0.001782, 0.010593, 0.053763, -0.016677, -0.036851, 0.001356, -0.027268, -0.042748, -0.056925, 0.032705, -0.039534, 0.004900, 0.014198, 0.016457, 0.015782, 0.000441, 0.032984, 0.001289, 0.012026, -0.021828, -0.061351, 0.002498, -0.046783, -0.009831, -0.000304, -0.013009, 0.003612, -0.002828, -0.011761, -0.029889, -0.004185, 0.077845, -0.014204, 0.002303, -0.015936, 0.050738, 0.021281, -0.024871, -0.006210, -0.007039, -0.030317, 0.008190, 0.108005, -0.000566, 0.026994, 0.027732, 0.001104, -0.068122, -0.020904, -0.009004, 0.016278, 0.028514, -0.015468, 0.006343, -0.018696, -0.011136, 0.019137, -0.005979, 0.015671, -0.005553, -0.027164, 0.062806, 0.075525, -0.004504, 0.036799, 0.032845, -0.025675, -0.023274, -0.030614, 0.077937, 0.005885, -0.024767, 0.034007, -0.034811, 0.040075, -0.009517, -0.005475, 0.012956, -0.042830, 0.055849, 0.011082, 0.017170, 0.005174, -0.040940, 0.024219, -0.004569, 0.033617, -0.053037, -0.033828, -0.024299, 0.006937, 0.051105, -0.005996, -0.033195, -0.006971, -0.042310, -0.028655, 0.006117, 0.012709, 0.017386, -0.027348, 0.031368, -0.039168, 0.040344, 0.025833, 0.033354, 0.017633, -0.009275, 0.008685, 0.031174, 0.003524, -0.034325, 0.013287, -0.008789, 0.037559, 0.007184, -0.025878, 0.038920, -0.011559, -0.010757, 0.011745, 0.019417, -0.008194, 0.018362, -0.035069, -0.001238, 0.008175, 0.010998, 0.036282, 0.016307, 0.018908, -0.029127, -0.017894, -0.057597, 0.052655, -0.027317, -0.017046, -0.017265, 0.016358, -0.027357, 0.016560, 
-0.030029, 0.017136, 0.008966, 0.049322, -0.016273, -0.044973, 0.005486, 0.038597, 0.026139, 0.052185, -0.013302, -0.021002, -0.005554, -0.013731, -0.009178, -0.004234, 0.019047, -0.043806, -0.024731, 0.007249, -0.005646, -0.022322, -0.028383, -0.020909, -0.020115, -0.017795, -0.004946, -0.042137, -0.012211, 0.023940, -0.028057, -0.021644, -0.024881, -0.003815, 0.007790, 0.030010, -0.020316, -0.031519, 0.045400, -0.026035, 0.035411, -0.041054, 0.016395, -0.058526, -0.050071, -0.005350, -0.000012, 0.033756, -0.005484, -0.039902, 0.021338, 0.049371, -0.023813, -0.015730, 0.024044, 0.001983, 0.033334, 0.056554, -0.071455, -0.019449, 0.058415, -0.031268, -0.031091, -0.011072, -0.039906, 0.029178, -0.018974, 0.048917, 0.032150, -0.008453, -0.042689, -0.021592, -0.021074, 0.021745, 0.059312, -0.071636, 0.001728, -0.015454, 0.006690, -0.031002, -0.031740, -0.021628, 0.004644, -0.051729, -0.009922, 0.018186, -0.021257, -0.023335, 0.064015, 0.025273, -0.000643, -0.021201, 0.001624, 0.007812, 0.032212, 0.034465, 0.014869, -0.009189, -0.015261, 0.016883, 0.055912, 0.030596, 0.029166, -0.023781, 0.052843, 0.011924, 0.012102, -0.029363, 0.007754, 0.015042, -0.055564, -0.022002, 0.013403, -0.005262, -0.021019, -0.029399, 0.041430, -0.007306, 0.011596, -0.057910, 0.006634, 0.036520, 0.032751, 0.043588, -0.007944, -0.017204, -0.065091, -0.005859, 0.002196, 0.003127, -0.093287, 0.031131, -0.013190, -0.022992, 0.033164, 0.044008, 0.028843, -0.026998, -0.016769, -0.055378, 0.016484, 0.006436, -0.000911, -0.049148, 0.010099, -0.009313, 0.006079, -0.025318, -0.027986, -0.001028, 0.044158, 0.001163, -0.073567, -0.020542, -0.015759, -0.018729, -0.008904, -0.012252, 0.020013, 0.009463, 0.035434, -0.029390, -0.016625, 0.012281, 0.020857, -0.024805, -0.006153, -0.028686, 0.022368, 0.025523, 0.019324, 0.003363, 0.013508, -0.011024, 0.015968, 0.077530, 0.053599, -0.017130, -0.024727, -0.043727, 0.008582, -0.068265, -0.029977, -0.020101, -0.016305, 0.042614, 0.016977, -0.032327, -0.029827, 0.153215, 0.004094, -0.045580, 0.019848, 0.002369, 0.024696, -0.003266, 0.021996, -0.061369, -0.064577, -0.050211, -0.043349, 0.013123, 0.002832, -0.033594, -0.016068, -0.030281, 0.072160, -0.044019, 0.015623, 0.072866, 0.032856, 0.040987, 0.037885, 0.000853, 0.014264, -0.023801, 0.029079, 0.016286, -0.004104, 0.002991, -0.005541, -0.022465, -0.060349, -0.035410, -0.013432, -0.000811, -0.037817, 0.024563, 0.006463, -0.032183, 0.013181, 0.019463, 0.046768, 0.005375, -0.025111, -0.004335, 0.012567, 0.051717, -0.005750, 0.001575, -0.010433, 0.002139, 0.016632, 0.001609, 0.042580, -0.026003, -0.020230, -0.038322, 0.034158, 0.011612, -0.027692, 0.021338, -0.057004, 0.001955, -0.002718, -0.013585, -0.019107, 0.030198, 0.018570, -0.067122, 0.040898, -0.028554] 3 | -------------------------------------------------------------------------------- /test/test_embedding.dart: -------------------------------------------------------------------------------- 1 | import 'dart:io'; 2 | 3 | import 'package:llama_cpp/embedding.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | const _url = 'https://hf-mirror.com/CompendiumLabs/bge-base-zh-v1.5-gguf/' 7 | 'blob/main/bge-base-zh-v1.5-q4_k_m.gguf'; 8 | const _embedModelPath = 'test/data/bge-base-zh-v1.5.gguf'; 9 | 10 | void _compareList(List d, List v) { 11 | expect(d.length, v.length); 12 | for (var j = 0; j < d.length; j++) { 13 | final v1 = d[j]; 14 | final v2 = v[j]; 15 | expect((v1 - v2).abs() < 0.0000015, true, reason: "[$j]: '$v1' != '$v2'"); 16 | } 17 | } 18 | 19 | void main() { 20 | final 
embedding = Embedding(_embedModelPath); 21 | final prompts = File('test/data/text.txt').readAsLinesSync(); 22 | final values = File('test/data/values.txt').readAsLinesSync().map((l) { 23 | final v = l 24 | .replaceAll('[', '') 25 | .replaceAll(']', '') 26 | .split(',') 27 | .where((e) => e.trim().isNotEmpty) 28 | .indexed 29 | .map((idx) { 30 | final (j, e) = idx; 31 | try { 32 | return double.parse(e.trim()); 33 | } catch (x) { 34 | print("parse [$j]: '$e' failed!"); 35 | rethrow; 36 | } 37 | }).toList(); 38 | return v; 39 | }).toList(); 40 | 41 | setUp(() { 42 | expect( 43 | File(_embedModelPath).existsSync(), 44 | true, 45 | reason: "Download the model from $_url to '$_embedModelPath' before testing!", 46 | ); 47 | }); 48 | 49 | test('basic embed', () { 50 | final d1 = embedding.embedSingle(prompts[0]); 51 | _compareList(d1, values[0]); 52 | final d2 = embedding.embedSingle(prompts[1]); 53 | _compareList(d2, values[1]); 54 | }); 55 | 56 | test('batch embed', () { 57 | final result = embedding.embedBatch(prompts); 58 | expect(result.length, values.length); 59 | for (final r in result.indexed) { 60 | final (i, d) = r; 61 | _compareList(d, values[i]); 62 | } 63 | }); 64 | 65 | tearDownAll(() { 66 | embedding.dispose(); 67 | }); 68 | } 69 | --------------------------------------------------------------------------------
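
Note: the test above exercises the public `Embedding` API from `package:llama_cpp/embedding.dart`. A minimal usage sketch follows, based only on the calls the test makes (`Embedding(path)`, `embedSingle`, `embedBatch`, `dispose`); the model path is a placeholder, and the vector element types are inferred from how the test consumes the results rather than confirmed against the library source.

import 'package:llama_cpp/embedding.dart';

void main() {
  // Placeholder path: point this at any GGUF embedding model on disk.
  final embedding = Embedding('test/data/bge-base-zh-v1.5.gguf');

  // One input text -> one vector; the test compares such vectors
  // element-wise against the fixtures in test/data/values.txt.
  final single = embedding.embedSingle('an example sentence');
  print('dimensions: ${single.length}');

  // Several inputs at once -> one vector per input, in order.
  final batch = embedding.embedBatch(['first sentence', 'second sentence']);
  print('vectors: ${batch.length}');

  // Release the native llama.cpp resources, as the test does in tearDownAll.
  embedding.dispose();
}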