├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── LICENSE ├── README.md ├── analysis_options.yaml ├── build.dart ├── example ├── embedding.dart ├── main.dart ├── rag │ ├── .gitignore │ ├── CHANGELOG.md │ ├── README.md │ ├── _config.json │ ├── analysis_options.yaml │ ├── bin │ │ ├── ingest.dart │ │ └── rag.dart │ ├── lib │ │ ├── chroma.dart │ │ └── common.dart │ ├── pubspec.yaml │ └── test │ │ └── test_chroma.dart ├── server.dart └── simple.dart ├── ffigen.yaml ├── lib ├── embedding.dart ├── llama_cpp.dart └── src │ ├── common.dart │ ├── embedding.dart │ ├── ffi.dart │ ├── lib_llama_cpp.dart │ ├── llama_params.dart │ ├── native_llama.dart │ └── sampling.dart ├── pubspec.yaml └── test ├── data ├── text.txt └── values.txt └── test_embedding.dart /.gitignore: -------------------------------------------------------------------------------- 1 | .dart_tool 2 | pubspec.lock 3 | lib/libllama_cpp.so 4 | .idea 5 | *.gguf 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src"] 2 | path = src 3 | url = https://gitee.com/lindeer/llama.cpp.git 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 1.2.0 2 | 3 | - upgrade native_assets_cli to 0.8.0 4 | - upgrade llama.cpp to 8854044 5 | 6 | ## 1.1.0 7 | 8 | - Upgrade llama.cpp to 60ed04cf to support qwen1.5. 9 | - Code refine. 10 | - Support embedding. 11 | - Support GPU building. 12 | - Add a RAG example. 13 | - Upgrade llama.cpp to 8c0e8f4e. 14 | 15 | ## 1.0.0 16 | 17 | - Upgrade Dart to 3.3.0. 18 | - Upgrade dependencies and fix issues. 19 | 20 | ## 0.9.0 21 | 22 | - Initial version. 23 | - Integrate with `native_assets_cli`. 24 | - Native helper classes. 25 | - Porting token sampling from `common/sampling.cpp`. 26 | - Passing LLM params to the isolate. 27 | - Token string as raw bytes stream. 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A Dart binding for the popular LLM inference framework [llama.cpp](https://github.com/ggerganov/llama.cpp), bringing AI to the Dart world!
2 | 
3 | ## Note
4 | 
5 | Commit `8854044` of `llama.cpp` is the latest version that builds a single shared library. After that version, separate `libllama.so` and `libggml.so` files are produced, but Dart native assets currently do not support loading multiple shared libraries at the same time.
6 | 
7 | ## Overview
8 | 
9 | - Text generation in a separate Dart isolate.
10 | - Stream-based output in Dart style.
11 | - Integration with `native_assets_cli`.
12 | - Extremely simple usage.
13 | - Support for both LLM and embedding models.
14 | 
15 | ## Trying examples
16 | 
17 | ```
18 | git clone https://github.com/lindeer/llama-cpp.git
19 | cd llama-cpp
20 | git submodule update --init --recursive
21 | dart pub get
22 | ```
23 | 
24 | Just run in the console:
25 | ```
26 | dart --enable-experiment=native-assets run example/main.dart "/path/to/your/LLM.gguf" "your prompt"
27 | ```
28 | 
29 | or run a simple HTTP server:
30 | ```
31 | dart --enable-experiment=native-assets run example/server.dart "/path/to/your/LLM.gguf"
32 | ```
33 | 
34 | or run an embedding model:
35 | ```
36 | dart --enable-experiment=native-assets run example/embedding.dart "/path/to/your/embedding.gguf" "your text line1
37 | your text line2"
38 | ```
39 | 
40 | There is also a minimal RAG example in `example/rag/`, with completely local data and models, inspired by [privateGPT](https://github.com/imartinez/privateGPT):
41 | 
42 | 1. Set up a chroma server:
43 | ```
44 | pip install chromadb
45 | uvicorn chromadb.app:app --reload --workers 1 --host 0.0.0.0 --port 8000
46 | ```
47 | 
48 | 2. `cd example/rag`, then create a `config.json` and configure your local models:
49 | ```json
50 | {
51 |   "gpt_model": "/your/local/gpt/model",
52 |   "embedding_model": "/your/local/embedding/model"
53 | }
54 | 
55 | ```
56 | 
57 | 3. Save the documents in `corpus/` to the vector database (only txt files currently):
58 | ```
59 | dart --enable-experiment=native-assets run bin/ingest.dart
60 | ```
61 | 
62 | 4. Chat with the GPT in the console; you could certainly replace it with a beautiful Flutter GUI:
63 | ```
64 | dart --enable-experiment=native-assets run bin/rag.dart
65 | ```
66 | 
67 | ## Getting started
68 | 
69 | Ask the LLM to answer with a typewriter effect:
70 | 
71 | ```dart
72 | import 'package:llama_cpp/llama_cpp.dart';
73 | 
74 | final path = '/path/to/your/LLM.gguf';
75 | final llama = await LlamaCpp.load(path, verbose: true);
76 | 
77 | await for (final text in llama.answer(prompt)) {
78 |   stdout.write(text);
79 | }
80 | stdout.writeln();
81 | 
82 | await llama.dispose();
83 | ```
84 | or, if you want the full answer at once:
85 | ```dart
86 | final answer = await llama.answer(prompt).join('');
87 | ```
88 | 
89 | More examples can be found in `example/`.
90 | 
91 | ## Notes
92 | 
93 | `native_assets_cli` has breaking changes since 0.1.0 and is not compatible with Dart 3.2; however, it does run with Dart 3.1.5.
--------------------------------------------------------------------------------
/analysis_options.yaml:
--------------------------------------------------------------------------------
1 | # This file configures the static analysis results for your project (errors,
2 | # warnings, and lints).
3 | #
4 | # This enables the 'recommended' set of lints from `package:lints`.
5 | # This set helps identify many issues that may lead to problems when running 6 | # or consuming Dart code, and enforces writing Dart using a single, idiomatic 7 | # style and format. 8 | # 9 | # If you want a smaller set of lints you can change this to specify 10 | # 'package:lints/core.yaml'. These are just the most critical lints 11 | # (the recommended set includes the core lints). 12 | # The core lints are also what is used by pub.dev for scoring packages. 13 | 14 | include: package:lints/recommended.yaml 15 | 16 | # Uncomment the following section to specify additional rules. 17 | 18 | # linter: 19 | # rules: 20 | # - camel_case_types 21 | 22 | analyzer: 23 | exclude: 24 | - lib/src/lib_llama_cpp.dart 25 | 26 | # For more information about the core and recommended set of lints, see 27 | # https://dart.dev/go/core-lints 28 | 29 | # For additional information about configuring this file, see 30 | # https://dart.dev/guides/language/analysis-options 31 | -------------------------------------------------------------------------------- /build.dart: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file 2 | // for details. All rights reserved. Use of this source code is governed by a 3 | // BSD-style license that can be found in the LICENSE file. 4 | 5 | import 'dart:io' show File, Platform, Process, exit, stderr, stdout; 6 | import 'package:path/path.dart' as p; 7 | import 'package:native_assets_cli/native_assets_cli.dart'; 8 | 9 | const packageName = 'llama_cpp'; 10 | const _repoLibName = 'libllama.so'; 11 | 12 | Future _commandPath(String cmd) async { 13 | final proc = await Process.run('which', [cmd]); 14 | stderr.write(proc.stderr); 15 | return proc.exitCode == 0 ? proc.stdout.toString() : ''; 16 | } 17 | 18 | /// Implements the protocol from `package:native_assets_cli` by building 19 | /// the C code in `src/` and reporting what native assets it built. 20 | void main(List args) async { 21 | await build(args, _builder); 22 | } 23 | 24 | Future _builder(BuildConfig buildConfig, BuildOutput buildOutput) async { 25 | final env = Platform.environment; 26 | final nvcc = env['LLAMA_CUDA_NVCC'] ?? await _commandPath('nvcc'); 27 | final arch = env['CUDA_DOCKER_ARCH'] ?? 
'compute_75'; 28 | final pkgRoot = buildConfig.packageRoot; 29 | final srcDir = pkgRoot.resolve('src'); 30 | final proc = await Process.start( 31 | 'make', 32 | [ 33 | '-j', 34 | _repoLibName, 35 | if (nvcc.isNotEmpty) ...['LLAMA_CUBLAS=1', 'CUDA_DOCKER_ARCH=$arch'], 36 | ], 37 | workingDirectory: srcDir.path, 38 | ); 39 | stdout.addStream(proc.stdout); 40 | stderr.addStream(proc.stderr); 41 | final code = await proc.exitCode; 42 | if (code != 0) { 43 | final p = await Process.run('gcc', ['--version']); 44 | if (p.exitCode == 0) { 45 | final gccVer = p.stdout.toString(); 46 | stderr.writeln("Build failed, make sure 'gcc>=9.5.0':\n$gccVer"); 47 | } else { 48 | stderr.writeln("GCC not exists!"); 49 | } 50 | exit(code); 51 | } 52 | 53 | final linkMode = _linkMode(buildConfig.linkModePreference); 54 | final libName = buildConfig.targetOS.libraryFileName(packageName, linkMode); 55 | final libUri = buildConfig.outputDirectory.resolve(libName); 56 | final uri = pkgRoot.resolve(p.join('src', _repoLibName)); 57 | final file = File.fromUri(uri).resolveSymbolicLinksSync(); 58 | File(file).renameSync(libUri.path); 59 | 60 | buildOutput.addAsset(NativeCodeAsset( 61 | package: packageName, 62 | name: 'src/lib_$packageName.dart', 63 | linkMode: linkMode, 64 | os: buildConfig.targetOS, 65 | file: libUri, 66 | architecture: buildConfig.targetArchitecture, 67 | )); 68 | final src = [ 69 | 'src/llama.cpp', 70 | 'src/ggml.c', 71 | 'src/ggml-alloc.c', 72 | 'src/ggml-backend.c', 73 | 'src/ggml-quants.c', 74 | ]; 75 | 76 | buildOutput.addDependencies([ 77 | ...src.map((s) => pkgRoot.resolve(s)), 78 | pkgRoot.resolve('build.dart'), 79 | ]); 80 | } 81 | 82 | LinkMode _linkMode(LinkModePreference preference) { 83 | if (preference == LinkModePreference.dynamic || 84 | preference == LinkModePreference.preferDynamic) { 85 | return DynamicLoadingBundled(); 86 | } 87 | assert(preference == LinkModePreference.static || 88 | preference == LinkModePreference.preferStatic); 89 | return StaticLinking(); 90 | } 91 | -------------------------------------------------------------------------------- /example/embedding.dart: -------------------------------------------------------------------------------- 1 | import 'dart:ffi' as ffi; 2 | import 'dart:io' show stdout, Platform; 3 | import 'dart:math' as m; 4 | 5 | import 'package:ffi/ffi.dart' show calloc; 6 | import 'package:llama_cpp/src/common.dart' as c; 7 | import 'package:llama_cpp/src/ffi.dart'; 8 | import 'package:llama_cpp/src/lib_llama_cpp.dart' as llama_cpp; 9 | import 'package:llama_cpp/src/llama_params.dart'; 10 | 11 | int main(List argv) { 12 | if (argv.isEmpty || argv[0].startsWith('-')) { 13 | print("usage: ${Platform.script.path} MODEL_PATH [PROMPT]"); 14 | return 1; 15 | } 16 | final path = argv[0]; 17 | final prompt = argv.length > 1 ? 
argv[1] : 'Hello my name is'; 18 | 19 | final cStr = CharArray.from(path); 20 | final (model, ctx) = c.loadModel( 21 | cStr, 22 | LlamaParams( 23 | seed: 1234, 24 | nThread: 4, 25 | nThreadBatch: 4, 26 | embedding: true, 27 | ), 28 | ); 29 | 30 | final prompts = prompt 31 | .split('\n') 32 | .map((e) => e.trim()) 33 | .where((e) => e.isNotEmpty) 34 | .toList(growable: false); 35 | const batchSize = 512; 36 | final batch = llama_cpp.llama_batch_init(batchSize, 0, prompts.length); 37 | llama_cpp.llama_reset_timings(ctx); 38 | final maxTokenSize = prompts.map((e) => e.length).reduce(m.max); 39 | final tokens = TokenArray(size: maxTokenSize); 40 | final tokenList = prompts.map((p) { 41 | cStr.pavedBy(p); 42 | tokens.pavedBy(model, cStr, addBos: true); 43 | final l = tokens.toList(); 44 | return l.length > batchSize ? l.sublist(0, batchSize) : l; 45 | }); 46 | 47 | for (final (i, l) in tokenList.indexed) { 48 | print("main: prompt $i: '${prompts[i]}'"); 49 | print("main: number of tokens in prompt = ${l.length}"); 50 | for (final t in l) { 51 | print("${'$t'.padLeft(6)} -> '${cStr.tokenString(model, t)}'"); 52 | } 53 | } 54 | 55 | final dimens = llama_cpp.llama_n_embd(model); 56 | final row = tokenList.length; 57 | final bytes = ffi.sizeOf() * row * dimens; 58 | final data = calloc.allocate(bytes); 59 | var out = data; 60 | var s = 0; 61 | for (final tokens in tokenList) { 62 | final len = tokens.length; 63 | if (batch.n_tokens + len > batchSize) { 64 | c.decodeEmbeddingBatch(ctx, batch, out, s, dimens); 65 | batch.n_tokens = 0; 66 | out += s * dimens; 67 | s = 0; 68 | } 69 | c.addBatchSeq(batch, tokens, s); 70 | s++; 71 | } 72 | c.decodeEmbeddingBatch(ctx, batch, out, s, dimens); 73 | for (var j = 0, pos = 0; j < row; j++, pos += dimens) { 74 | stdout.write("embedding $j: ["); 75 | final p = data + pos; 76 | for (var i = 0; i < dimens; i++) { 77 | final v = (p + i).value; 78 | stdout.write("${v.toStringAsFixed(6)}, "); 79 | } 80 | stdout.writeln("]\n"); 81 | } 82 | 83 | calloc.free(data); 84 | cStr.dispose(); 85 | tokens.dispose(); 86 | llama_cpp.llama_print_timings(ctx); 87 | 88 | llama_cpp.llama_batch_free(batch); 89 | llama_cpp.llama_free(ctx); 90 | llama_cpp.llama_free_model(model); 91 | llama_cpp.llama_backend_free(); 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /example/main.dart: -------------------------------------------------------------------------------- 1 | import 'dart:io'; 2 | 3 | import 'package:llama_cpp/llama_cpp.dart'; 4 | 5 | Future main(List argv) async { 6 | if (argv.isEmpty || argv[0].startsWith('-')) { 7 | print("usage: ${Platform.script.path} MODEL_PATH [PROMPT]"); 8 | return 1; 9 | } 10 | final path = argv[0]; 11 | final prompt = argv.length > 1 ? 
argv[1] : 'Hello my name is'; 12 | final llama = await LlamaCpp.load(path, verbose: false); 13 | 14 | await for (final s in llama.answer(prompt)) { 15 | stdout.write(s); 16 | } 17 | stdout.writeln(); 18 | 19 | await llama.dispose(); 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /example/rag/.gitignore: -------------------------------------------------------------------------------- 1 | # https://dart.dev/guides/libraries/private-files 2 | # Created by `dart pub` 3 | .dart_tool/ 4 | config.json 5 | corpus/ 6 | -------------------------------------------------------------------------------- /example/rag/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 1.0.0 2 | 3 | - Initial version. 4 | -------------------------------------------------------------------------------- /example/rag/README.md: -------------------------------------------------------------------------------- 1 | A sample command-line application with an entrypoint in `bin/`, library code 2 | in `lib/`, and example unit test in `test/`. 3 | -------------------------------------------------------------------------------- /example/rag/_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "source_dir": "corpus", 3 | "db_server_url": "http://0.0.0.0:8000", 4 | "database": "llama_cpp", 5 | "collection": "rag" 6 | } 7 | -------------------------------------------------------------------------------- /example/rag/analysis_options.yaml: -------------------------------------------------------------------------------- 1 | # This file configures the static analysis results for your project (errors, 2 | # warnings, and lints). 3 | # 4 | # This enables the 'recommended' set of lints from `package:lints`. 5 | # This set helps identify many issues that may lead to problems when running 6 | # or consuming Dart code, and enforces writing Dart using a single, idiomatic 7 | # style and format. 8 | # 9 | # If you want a smaller set of lints you can change this to specify 10 | # 'package:lints/core.yaml'. These are just the most critical lints 11 | # (the recommended set includes the core lints). 12 | # The core lints are also what is used by pub.dev for scoring packages. 13 | 14 | include: package:lints/recommended.yaml 15 | 16 | # Uncomment the following section to specify additional rules. 
17 | 18 | # linter: 19 | # rules: 20 | # - camel_case_types 21 | 22 | # analyzer: 23 | # exclude: 24 | # - path/to/excluded/files/** 25 | 26 | # For more information about the core and recommended set of lints, see 27 | # https://dart.dev/go/core-lints 28 | 29 | # For additional information about configuring this file, see 30 | # https://dart.dev/guides/language/analysis-options 31 | -------------------------------------------------------------------------------- /example/rag/bin/ingest.dart: -------------------------------------------------------------------------------- 1 | import 'dart:io'; 2 | 3 | import 'package:rag/chroma.dart'; 4 | import 'package:rag/common.dart' as c; 5 | 6 | const chunkSize = 500; 7 | const overlapSize = 10; 8 | 9 | List _processDocuments(String dir, List ignored) { 10 | bool isValidFile(String path) { 11 | return FileSystemEntity.isFileSync(path) && !ignored.contains(path); 12 | } 13 | 14 | final docs = Directory(dir) 15 | .listSync(recursive: true) 16 | .where((f) => isValidFile(f.path)) 17 | .expand((e) { 18 | final file = File.fromUri(e.uri); 19 | final lines = file.readAsLinesSync().where((l) => l.trim().isNotEmpty); 20 | return _processLines(e.uri, lines); 21 | }).toList(growable: false); 22 | return docs; 23 | } 24 | 25 | List _processLines(Uri file, Iterable lines) { 26 | final result = []; 27 | final filepath = file.path; 28 | for (final line in lines) { 29 | final len = line.length; 30 | if (len > chunkSize) { 31 | for (var i = 0; i < len; i += chunkSize) { 32 | final enough = len - i > chunkSize; 33 | final str = enough ? line.substring(i, chunkSize) : line.substring(i); 34 | final delta = i > overlapSize ? line.substring(i - overlapSize, i) : ''; 35 | result.add(ChromaDoc('$delta$str', filepath)); 36 | } 37 | } else { 38 | result.add(ChromaDoc(line, filepath)); 39 | } 40 | } 41 | return result; 42 | } 43 | 44 | void main(List argv) async { 45 | final config = c.appConfig; 46 | final chroma = await c.setupChroma(config); 47 | final all = await chroma.allItems; 48 | final ignored = all 49 | .map((d) => d.metadata?['source'] as String?) 50 | .whereType() 51 | .toList(growable: false); 52 | final dir = config['source_dir'] ?? 'sources'; 53 | 54 | final docs = _processDocuments(dir, ignored); 55 | if (docs.isNotEmpty) { 56 | final at = DateTime.now().millisecondsSinceEpoch; 57 | await chroma.add(docs); 58 | final cost = DateTime.now().millisecondsSinceEpoch - at; 59 | print("Save [${docs.length}] documents cost $cost ms."); 60 | } 61 | chroma.dispose(); 62 | } 63 | -------------------------------------------------------------------------------- /example/rag/bin/rag.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show json; 2 | import 'dart:io' show stdin, stdout; 3 | import 'package:llama_cpp/llama_cpp.dart'; 4 | import 'package:rag/chroma.dart'; 5 | import 'package:rag/common.dart' as c; 6 | 7 | String _makePrompt(String question, List items) { 8 | return '''根据以下信息: 9 | 10 | ${items.map((e) => e.doc).join("\n\n")} 11 | 12 | 请回答:$question'''; 13 | } 14 | 15 | String? get _readLine { 16 | stdout.write('> '); 17 | return stdin.readLineSync()?.trim(); 18 | } 19 | 20 | void main(List argv) async { 21 | final config = c.appConfig; 22 | final chroma = await c.setupChroma(config); 23 | 24 | final path = config['gpt_model'] as String; 25 | final gpt = await LlamaCpp.load(path, verbose: false); 26 | late String question; 27 | while ((question = (_readLine ?? 
'exit')) != 'exit') { 28 | if (question.isEmpty) { 29 | continue; 30 | } 31 | final items = await chroma.query(question, nResults: 2); 32 | final prompt = _makePrompt(question, items); 33 | final answer = gpt.answer(prompt); 34 | stdout.write('< '); 35 | await for (final str in answer) { 36 | stdout.write(str); 37 | } 38 | stdout.writeln(); 39 | } 40 | await gpt.dispose(); 41 | chroma.dispose(); 42 | } 43 | -------------------------------------------------------------------------------- /example/rag/lib/chroma.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show json, utf8; 2 | 3 | import 'package:http/http.dart' as http; 4 | import 'package:chromadb/chromadb.dart' as db; 5 | import 'package:llama_cpp/embedding.dart'; 6 | import 'package:uuid/uuid.dart' show Uuid; 7 | 8 | final class ChromaDoc { 9 | final String content; 10 | final String source; 11 | 12 | const ChromaDoc(this.content, this.source); 13 | 14 | @override 15 | String toString() => "('$source':'$content')"; 16 | } 17 | 18 | final class ChromaItem { 19 | final String? id; 20 | final String doc; 21 | final Map? metadata; 22 | 23 | const ChromaItem._(this.id, this.doc, this.metadata); 24 | 25 | @override 26 | String toString() => "ChromaItem(id:$id, doc:'$doc', meta:$metadata)"; 27 | } 28 | 29 | class Chroma { 30 | final db.ChromaClient client; 31 | final db.Collection collection; 32 | 33 | /// Not use `EmbeddingFunction` because of its type 34 | final Embedding embed; 35 | 36 | Chroma._(this.client, this.collection, this.embed); 37 | 38 | static Future create({ 39 | required String baseUrl, 40 | required String database, 41 | required collection, 42 | required Embedding embedding, 43 | }) async { 44 | final client = db.ChromaClient( 45 | baseUrl: baseUrl, 46 | database: database, 47 | ); 48 | final c = await client.getOrCreateCollection(name: collection); 49 | return Chroma._(client, c, embedding); 50 | } 51 | 52 | Future add(List docs) async { 53 | final len = docs.length; 54 | final uuid = Uuid(); 55 | final ids = List.generate(len, (i) => uuid.v1()); 56 | final docList = docs.map((e) => e.content).toList(growable: false); 57 | final embeddings = embed.embedBatch(docList); 58 | final metadatas = docs.map((e) => {'source': e.source}).toList(); 59 | await _add( 60 | ids: ids, 61 | documents: docList, 62 | embeddings: embeddings, 63 | metadatas: metadatas, 64 | ); 65 | } 66 | 67 | Future> get allItems async { 68 | final res = await collection.get(); 69 | return res.ids.indexed.map((r) { 70 | final (i, id) = r; 71 | final doc = res.documents?[i] ?? ''; 72 | final metadata = res.metadatas?[i]; 73 | return ChromaItem._(id, doc, metadata); 74 | }).toList(growable: false); 75 | } 76 | 77 | Future> query( 78 | String doc, { 79 | final int nResults = 4, 80 | }) async { 81 | final embeddings = embed.embedSingle(doc); 82 | final result = await _query(embeddings); 83 | final (ids, docs, metadatas) = ( 84 | result.ids.first, 85 | result.documents?.first, 86 | result.metadatas?.first, 87 | ); 88 | return ids.indexed.map((r) { 89 | final (i, id) = r; 90 | final doc = docs?[i] ?? ''; 91 | final metadata = metadatas?[i]; 92 | return ChromaItem._(id, utf8.decode(doc.codeUnits), metadata); 93 | }).toList(growable: false); 94 | } 95 | 96 | void dispose() { 97 | embed.dispose(); 98 | } 99 | 100 | Future _add({ 101 | required final List ids, 102 | final List>? embeddings, 103 | final List>? metadatas, 104 | final List? 
documents, 105 | }) async { 106 | final id = collection.id; 107 | final body = { 108 | "embeddings": embeddings, 109 | "metadatas": metadatas, 110 | "documents": documents, 111 | "ids": ids, 112 | "increment_index": true 113 | }; 114 | final res = await http.post( 115 | Uri.parse('http://0.0.0.0:8000/api/v1/collections/$id/add'), 116 | headers: { 117 | 'Content-Type': 'application/json', 118 | }, 119 | body: json.encode(body), 120 | ); 121 | return res.body; 122 | } 123 | 124 | Future _query(List embedding) async { 125 | final id = collection.id; 126 | final body = { 127 | "where": {}, 128 | "where_document": {}, 129 | "query_embeddings": [embedding], 130 | "n_results": 2, 131 | "include": [ 132 | "metadatas", 133 | "documents", 134 | "distances", 135 | ], 136 | }; 137 | final res = await http.post( 138 | Uri.parse('http://0.0.0.0:8000/api/v1/collections/$id/query'), 139 | headers: { 140 | 'Content-Type': 'application/json', 141 | }, 142 | body: json.encode(body), 143 | ); 144 | final obj = json.decode(res.body) as Map; 145 | return db.QueryResponse.fromJson(obj); 146 | } 147 | 148 | /* 149 | static Future _fetchCollection({ 150 | required final db.ChromaClient client, 151 | required final String name, 152 | }) async { 153 | final body = { 154 | "name": name, 155 | "get_or_create": true, 156 | }; 157 | final res = await http.post( 158 | Uri.parse('http://0.0.0.0:8000/api/v1/collections'), 159 | headers: { 160 | 'Content-Type': 'application/json', 161 | }, 162 | body: json.encode(body), 163 | ); 164 | final obj = json.decode(res.body); 165 | return db.Collection( 166 | name: obj['name']!, 167 | id: obj['id']!, 168 | metadata: obj['metadata'], 169 | tenant: client.tenant, 170 | database: client.database, 171 | api: client.api, 172 | ); 173 | } 174 | */ 175 | } 176 | -------------------------------------------------------------------------------- /example/rag/lib/common.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show json; 2 | import 'dart:io'; 3 | 4 | import 'package:llama_cpp/embedding.dart'; 5 | 6 | import 'chroma.dart'; 7 | 8 | Future setupChroma(Map config) async { 9 | final embeddingPath = config['embedding_model'] as String; 10 | final embedding = Embedding(embeddingPath); 11 | final chroma = await Chroma.create( 12 | baseUrl: config['db_server_url'] as String, 13 | database: config['database'] as String, 14 | collection: config['collection'] as String, 15 | embedding: embedding, 16 | ); 17 | return chroma; 18 | } 19 | 20 | Map get appConfig { 21 | final uri = Directory.current.uri; 22 | final f1 = File.fromUri(uri.resolve('_config.json')); 23 | final f2 = File.fromUri(uri.resolve('config.json')); 24 | if (!f1.existsSync() || !f2.existsSync()) { 25 | print("We need '_config.json' and 'config.json' files"); 26 | return {}; 27 | } 28 | final config = json.decode(f1.readAsStringSync()) as Map; 29 | config.addAll(json.decode(f2.readAsStringSync())); 30 | return config; 31 | } 32 | -------------------------------------------------------------------------------- /example/rag/pubspec.yaml: -------------------------------------------------------------------------------- 1 | name: rag 2 | description: An example of RAG (Retrieval Augment Generation) app. 3 | version: 1.0.0 4 | publish_to: none 5 | 6 | environment: 7 | sdk: ^3.3.0 8 | 9 | dependencies: 10 | chromadb: ^0.1.2 11 | llama_cpp: 12 | path: ../.. 
13 | uuid: ^4.3.3 14 | 15 | dev_dependencies: 16 | lints: ^3.0.0 17 | test: ^1.24.0 18 | -------------------------------------------------------------------------------- /example/rag/test/test_chroma.dart: -------------------------------------------------------------------------------- 1 | import 'dart:io'; 2 | 3 | import 'package:rag/common.dart' as c; 4 | import 'package:rag/chroma.dart'; 5 | import 'package:test/test.dart'; 6 | 7 | void main() async { 8 | // start a chroma server: 9 | // `uvicorn chromadb.app:app --reload --workers 1 --host 0.0.0.0 --port 8000` 10 | 11 | final config = { 12 | 'embedding_model': Platform.environment['EMBEDDING_MODEL_PATH'] ?? 13 | (throw Exception("Model path 'EMBEDDING_MODEL_PATH' not specified!")), 14 | 'db_server_url': 'http://0.0.0.0:8000', 15 | 'database': 'ecr', 16 | 'collection': 'retail', 17 | }; 18 | final chroma = await c.setupChroma(config); 19 | 20 | test('chroma save', () async { 21 | final docs = ['Hello world!', 'Hello Moto!', 'Hi world!', 'Hey world'] 22 | .indexed 23 | .map((r) { 24 | final (i, d) = r; 25 | return ChromaDoc(d, 'file${i % 2}.txt'); 26 | }).toList(); 27 | await chroma.add(docs); 28 | final items = await chroma.query('hello, world~', nResults: 2); 29 | expect(items.length, 2); 30 | final str = items.map((e) => "'${e.id}':'${e.doc}'").join(','); 31 | expect(str.contains('Hello world!'), true); 32 | final list = await Future.wait( 33 | [chroma.allItems.then((v) => v.length), chroma.collection.count()], 34 | ); 35 | expect(list, [4, 4]); 36 | }); 37 | 38 | tearDownAll(() { 39 | chroma.dispose(); 40 | }); 41 | } 42 | -------------------------------------------------------------------------------- /example/server.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show utf8; 2 | import 'dart:io' show HttpServer; 3 | 4 | import 'package:llama_cpp/llama_cpp.dart' show LlamaCpp; 5 | 6 | const _defaultPort = 8080; 7 | void main(List argv) async { 8 | if (argv.isEmpty) { 9 | print("usage: dart server.dart MODEL_PATH [PORT]"); 10 | return; 11 | } 12 | final path = argv[0]; 13 | final port = (argv.length > 1 ? int.tryParse(argv[1]) : null) ?? 
_defaultPort; 14 | final ai = await LlamaCpp.load(path); 15 | 16 | final server = await HttpServer.bind('localhost', port); 17 | print('Serving at http://${server.address.host}:${server.port}'); 18 | await for (final request in server) { 19 | final body = await request 20 | .map((e) => List.from(e)) 21 | .transform(utf8.decoder) 22 | .join(); 23 | final response = request.response; 24 | response.headers 25 | ..set('Content-Type', 'application/octet-stream; charset=utf-8') 26 | ..add("Transfer-Encoding", "chunked"); 27 | response.bufferOutput = false; 28 | final answer = ai.answerWith(body); 29 | // curl should run with `--no-buffer` param 30 | await response.addStream(answer.transform(utf8.encoder)); 31 | await response.close(); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /example/simple.dart: -------------------------------------------------------------------------------- 1 | import 'dart:ffi' as ffi; 2 | import 'dart:io' show stderr, stdout, Platform; 3 | 4 | import 'package:llama_cpp/src/common.dart' as c; 5 | import 'package:llama_cpp/src/ffi.dart'; 6 | import 'package:llama_cpp/src/lib_llama_cpp.dart' as llama_cpp; 7 | import 'package:llama_cpp/src/llama_params.dart'; 8 | 9 | int main(List argv) { 10 | if (argv.isEmpty || argv[0].startsWith('-')) { 11 | print("usage: ${Platform.script.path} MODEL_PATH [PROMPT]"); 12 | return 1; 13 | } 14 | final path = argv[0]; 15 | final prompt = argv.length > 1 ? argv[1] : 'Hello my name is'; 16 | // total length of the sequence including the prompt 17 | const nLen = 32; 18 | llama_cpp.llama_backend_init(); 19 | llama_cpp.llama_numa_init(0); 20 | 21 | final cStr = CharArray.from(path); 22 | final (model, ctx) = c.loadModel( 23 | cStr, 24 | LlamaParams( 25 | seed: 1234, 26 | nCtx: 1024, 27 | nThread: 4, 28 | nThreadBatch: 4, 29 | ), 30 | ); 31 | 32 | final ctxSize = llama_cpp.llama_n_ctx(ctx); 33 | final tokenCapacity = prompt.length + 1; 34 | cStr.pavedBy(prompt); 35 | final tokenBuf = TokenArray(size: tokenCapacity); 36 | tokenBuf.pavedBy(model, cStr); 37 | final tokenNum = tokenBuf.length; 38 | final kvReq = tokenNum + (nLen - tokenNum); 39 | print("\nn_len = $nLen, n_ctx = $ctxSize, n_kv_req = $kvReq, " 40 | "token_n = $tokenNum, len = ${cStr.length}"); 41 | stderr.write("User prompt is:"); 42 | for (var i = 0; i < tokenNum; i++) { 43 | final text = cStr.tokenString(model, tokenBuf[i]); 44 | stderr.write(text); 45 | } 46 | stderr.writeln(); 47 | stderr.flush(); 48 | 49 | // create a llama_batch with size 512 50 | // we use this object to submit token data for decoding 51 | final batch = llama_cpp.llama_batch_init(512, 0, 1); 52 | // evaluate the initial prompt 53 | c.addBatchSeq(batch, tokenBuf.toList(), 0); 54 | batch.logits[batch.n_tokens - 1] = 1; 55 | 56 | if (llama_cpp.llama_decode(ctx, batch) != 0) { 57 | return 1; 58 | } 59 | 60 | llama_cpp.llama_reset_timings(ctx); 61 | var count = batch.n_tokens; 62 | final nVocab = llama_cpp.llama_n_vocab(model); 63 | final array = TokenDataArray(nVocab); 64 | final eosToken = llama_cpp.llama_token_eos(model); 65 | while (count <= nLen) { 66 | final logits = llama_cpp.llama_get_logits_ith(ctx, batch.n_tokens - 1); 67 | array.pavedBy(logits, nVocab); 68 | 69 | final tokenId = llama_cpp.llama_sample_token_greedy(ctx, array.pointer); 70 | if (tokenId == eosToken || count == nLen) { 71 | break; 72 | } 73 | final word = cStr.tokenString(model, tokenId); 74 | stdout.write(word); 75 | // `stdout.flush()` cause 'Bad state: StreamSink is bound to a stream' 
error in Dart 3.1.5 76 | // stdout.flush(); 77 | 78 | // prepare the next batch 79 | batch.n_tokens = 0; 80 | // push this new token for next evaluation 81 | c.addBatchSingle(batch, tokenId, count, true); 82 | 83 | count++; 84 | 85 | // evaluate the current batch with the transformer model 86 | if (llama_cpp.llama_decode(ctx, batch) != 0) { 87 | return 2; 88 | } 89 | } 90 | 91 | array.dispose(); 92 | tokenBuf.dispose(); 93 | cStr.dispose(); 94 | 95 | llama_cpp.llama_print_timings(ctx); 96 | 97 | llama_cpp.llama_batch_free(batch); 98 | llama_cpp.llama_free(ctx); 99 | llama_cpp.llama_free_model(model); 100 | llama_cpp.llama_backend_free(); 101 | 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /ffigen.yaml: -------------------------------------------------------------------------------- 1 | # Run with `dart --enable-experiment=native-assets run ffigen --config ffigen.yaml`. 2 | name: NativeLlamaCppBindings 3 | description: | 4 | Bindings for `src/llama.h`. 5 | 6 | Regenerate bindings with `dart --enable-experiment=native-assets run ffigen --config ffigen.yaml`. 7 | output: 'lib/src/lib_llama_cpp.dart' 8 | headers: 9 | entry-points: 10 | - 'src/llama.h' 11 | include-directives: 12 | - 'src/llama.h' 13 | compiler-opts: 14 | - '-I/opt/programs/miniforge3/envs/clang-10/lib/clang/10.0.0/include' 15 | preamble: | 16 | // Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file 17 | // for details. All rights reserved. Use of this source code is governed by a 18 | // BSD-style license that can be found in the LICENSE file. 19 | comments: 20 | style: any 21 | length: full 22 | ffi-native: 23 | -------------------------------------------------------------------------------- /lib/embedding.dart: -------------------------------------------------------------------------------- 1 | export 'src/embedding.dart'; 2 | -------------------------------------------------------------------------------- /lib/llama_cpp.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show json, utf8; 2 | import 'dart:io' show stdout; 3 | import 'dart:isolate' show Isolate, ReceivePort, SendPort; 4 | 5 | import 'src/llama_params.dart'; 6 | import 'src/native_llama.dart'; 7 | 8 | /// A brief overview of inter-ops among main classes: 9 | /// 10 | /// ``` 11 | /// +----------------+---------------------+--------------------+ 12 | /// | main Isolate | llama Isolate | native world | 13 | /// +----------------+---------------------+--------------------+ 14 | /// | LlamaCpp | NativeLlama | llama_cpp | 15 | /// | | | | 16 | /// | send --> incoming --> + | 17 | /// | | | | | 18 | /// | | ffi | | 19 | /// | | | | | 20 | /// | receiving <-- outgoing <-- + | 21 | /// | | | | 22 | /// +---------------+---------------------+---------------------+ 23 | /// ``` 24 | class LlamaCpp { 25 | final ReceivePort _recv; 26 | final Isolate _isolate; 27 | final SendPort _send; 28 | final Stream _receiving; 29 | final bool verbose; 30 | 31 | const LlamaCpp._( 32 | this._recv, 33 | this._isolate, 34 | this._send, 35 | this._receiving, 36 | this.verbose, 37 | ); 38 | 39 | /// Async create LlamaCpp by given params. 40 | static Future load( 41 | String path, { 42 | int? seed, 43 | int? nThread, 44 | int? nThreadBatch, 45 | int? nPredict, 46 | int? nCtx, 47 | int? nBatch, 48 | int? nKeep, 49 | int? nGpuLayers, 50 | int? 
mainGpu,
51 |     int numa = 0,
52 |     bool verbose = true,
53 |   }) async {
54 |     final recv = ReceivePort('main.incoming');
55 |     final params = LlamaParams(
56 |       seed: seed,
57 |       nThread: nThread,
58 |       nThreadBatch: nThreadBatch,
59 |       nPredict: nPredict,
60 |       nCtx: nCtx,
61 |       nBatch: nBatch,
62 |       nGpuLayers: nGpuLayers,
63 |       mainGpu: mainGpu,
64 |       numa: numa,
65 |     );
66 |     final isolate = await Isolate.spawn<(SendPort, String, LlamaParams)>(
67 |       _llamaIsolate,
68 |       (recv.sendPort, path, params),
69 |       errorsAreFatal: true,
70 |       debugName: '_llamaIsolate',
71 |     );
72 |     final receiving = recv.asBroadcastStream();
73 |     final send = (await receiving.first) as SendPort;
74 |     return LlamaCpp._(recv, isolate, send, receiving.cast<String>(), verbose);
75 |   }
76 | 
77 |   static const _finish = {'cmd': NativeLLama.closeTag};
78 | 
79 |   /// Notify the isolate to free native resources, then shut the isolate down.
80 |   Future<void> dispose() async {
81 |     print("LlamaCpp.dispose: disposing native llama ...");
82 |     _send.send(_finish);
83 |     await _receiving.first;
84 |     print("LlamaCpp.dispose: native llama disposed.");
85 |     _recv.close();
86 |     _isolate.kill();
87 |   }
88 | 
89 |   /// Generate a text stream from the given params.
90 |   /// [params] is a JSON string with the prompt and sampling params, e.g.:
91 |   ///   ai.answerWith('''{
92 |   ///     "prompt": "my question is",
93 |   ///     "min_p": 20
94 |   ///   }''');
95 |   Stream<String> answerWith(String params) {
96 |     final request = json.decode(params);
97 |     if ((request['prompt'] ?? '').isEmpty) {
98 |       throw Exception("Json body without 'prompt'!");
99 |     }
100 |     return _requestAnswer(request);
101 |   }
102 | 
103 |   /// Generate a text stream from the given prompt.
104 |   /// [question] is the prompt passed by the user who wants the model to generate an answer.
105 |   Stream<String> answer(
106 |     String question, {
107 |     int? nPrev,
108 |     int? nProbs,
109 |     int? topK,
110 |     double? topP,
111 |     double? minP,
112 |     double? tfsZ,
113 |     double? typicalP,
114 |     double? temperature,
115 |     int? penaltyLastN,
116 |     double? penaltyRepeat,
117 |     double? penaltyFrequency,
118 |     double? penaltyPresent,
119 |     int? mirostat,
120 |     double? mirostatTau,
121 |     double? mirostatEta,
122 |     bool? penalizeNewline,
123 |     String? samplersSequence,
124 |   }) {
125 |     final request = {
126 |       'prompt': question,
127 |       if (nPrev != null) 'n_prev': nPrev,
128 |       if (nProbs != null) 'n_probs': nProbs,
129 |       if (topK != null) 'top_k': topK,
130 |       if (topP != null) 'top_p': topP,
131 |       if (minP != null) 'min_p': minP,
132 |       if (tfsZ != null) 'tfs_z': tfsZ,
133 |       if (typicalP != null) 'typical_p': typicalP,
134 |       if (temperature != null) 'temperature': temperature,
135 |       if (penaltyLastN != null) 'penalty_last_n': penaltyLastN,
136 |       if (penaltyRepeat != null) 'penalty_repeat': penaltyRepeat,
137 |       if (penaltyFrequency != null) 'penalty_frequency': penaltyFrequency,
138 |       if (penaltyPresent != null) 'penalty_present': penaltyPresent,
139 |       if (mirostat != null) 'mirostat': mirostat,
140 |       if (mirostatTau != null) 'mirostat_tau': mirostatTau,
141 |       if (mirostatEta != null) 'mirostat_eta': mirostatEta,
142 |       if (penalizeNewline != null) 'penalize_newline': penalizeNewline,
143 |       if (samplersSequence != null) 'samplers_sequence': samplersSequence,
144 |     };
145 | 
146 |     return _requestAnswer(request);
147 |   }
148 | 
149 |   Stream<String> _requestAnswer(Map<String, dynamic> request) async* {
150 |     if (verbose) {
151 |       stdout.writeln("<<<<<<<<<<<<<<<");
152 |       stdout.writeln("$request\n---------------");
153 |     }
154 |     _send.send(request);
155 |     await for (final msg in _receiving) {
156 |       if (msg == NativeLLama.engTag) {
157 |         break;
158 |       } else {
159 |         yield msg;
160 |         if (verbose) {
161 |           stdout.write(msg);
162 |         }
163 |       }
164 |     }
165 |     if (verbose) {
166 |       stdout.writeln("\n>>>>>>>>>>>>>>>");
167 |     }
168 |   }
169 | 
170 |   // Runs in the llama isolate; the counterpart of the main isolate.
171 |   static _llamaIsolate((SendPort, String, LlamaParams) r) async {
172 |     final (outgoing, path, params) = r;
173 |     final incoming = ReceivePort('_runIsolate.incoming');
174 | 
175 |     final llama = NativeLLama(path, params);
176 |     outgoing.send(incoming.sendPort);
177 |     final requests = incoming.cast<Map<String, dynamic>>();
178 |     await for (final r in requests) {
179 |       if (r['cmd'] == NativeLLama.closeTag) {
180 |         print("Isolate received '$r', start closing ...");
181 |         break;
182 |       }
183 |       final params = r;
184 |       final prompt = params['prompt'] as String;
185 |       final rawStream = llama.generate(
186 |         prompt,
187 |         nPrev: params['n_prev'],
188 |         nProbs: params['n_probs'],
189 |         topK: params['top_k'],
190 |         topP: params['top_p'],
191 |         minP: params['min_p'],
192 |         tfsZ: params['tfs_z'],
193 |         typicalP: params['typical_p'],
194 |         temperature: params['temperature'],
195 |         penaltyLastN: params['penalty_last_n'],
196 |         penaltyRepeat: params['penalty_repeat'],
197 |         penaltyFrequency: params['penalty_frequency'],
198 |         penaltyPresent: params['penalty_present'],
199 |         mirostat: params['mirostat'],
200 |         mirostatTau: params['mirostat_tau'],
201 |         mirostatEta: params['mirostat_eta'],
202 |         penalizeNewline: params['penalize_newline'],
203 |         samplersSequence: params['samplers_sequence'],
204 |       );
205 |       final s = rawStream.transform(utf8.decoder);
206 |       await for (final str in s) {
207 |         outgoing.send(str);
208 |         if (str == NativeLLama.engTag) {
209 |           break;
210 |         }
211 |       }
212 |     }
213 |     llama.dispose();
214 |     outgoing.send(NativeLLama.closeTag);
215 |   }
216 | }
217 | 
--------------------------------------------------------------------------------
/lib/src/common.dart:
--------------------------------------------------------------------------------
1 | import 'dart:ffi' as ffi;
2 | import 'dart:io' show Platform;
3 | import 'dart:math' as m;
4 | 
5 | import 'ffi.dart';
6 | import 'lib_llama_cpp.dart' as llama_cpp;
7 | import
'llama_params.dart'; 8 | 9 | void _addLlamaBatch( 10 | llama_cpp.llama_batch batch, 11 | int id, 12 | int pos, 13 | int seq, 14 | bool logits, 15 | ) { 16 | final n = batch.n_tokens; 17 | batch.token[n] = id; 18 | batch.pos[n] = pos; 19 | batch.n_seq_id[n] = 1; 20 | batch.seq_id[n][0] = seq; 21 | batch.logits[n] = logits ? 1 : 0; 22 | 23 | batch.n_tokens++; 24 | } 25 | 26 | /// Append single token to batch with given position and logit. 27 | void addBatchSingle(llama_cpp.llama_batch batch, int t, int pos, bool logit) { 28 | _addLlamaBatch(batch, t, pos, 0, logit); 29 | } 30 | 31 | void _addBatchTokens( 32 | llama_cpp.llama_batch batch, 33 | List tokens, 34 | int pos, 35 | int seq, 36 | bool logit, 37 | ) { 38 | for (final (i, t) in tokens.indexed) { 39 | _addLlamaBatch(batch, t, pos + i, seq, logit); 40 | } 41 | } 42 | 43 | /// Add multiple tokens to batch with given [seq] from start. 44 | void addBatchSeq(llama_cpp.llama_batch batch, List tokens, int seq) { 45 | _addBatchTokens(batch, tokens, 0, seq, false); 46 | } 47 | 48 | /// Add multiple tokens to batch from [pos] with given [logit]. 49 | void addBatchPos( 50 | llama_cpp.llama_batch batch, 51 | List tokens, 52 | int pos, 53 | bool logit, 54 | ) { 55 | _addBatchTokens(batch, tokens, pos, 0, logit); 56 | } 57 | 58 | void _normalize(ffi.Pointer vec, ffi.Pointer out, int n) { 59 | var norm = 0.0; 60 | for (var i = 0; i < n; i++) { 61 | final v = vec[i]; 62 | norm += v * v; 63 | } 64 | norm = m.sqrt(norm); 65 | for (var i = 0; i < n; i++) { 66 | out[i] = vec[i] / norm; 67 | } 68 | } 69 | 70 | /// Decode batch only for embedding. 71 | void decodeEmbeddingBatch( 72 | ffi.Pointer ctx, 73 | llama_cpp.llama_batch batch, 74 | ffi.Pointer output, 75 | int seq, 76 | int dimens, 77 | ) { 78 | llama_cpp.llama_kv_cache_clear(ctx); 79 | print('decodeEmbeddingBatch: n_tokens = ${batch.n_tokens}, n_seq = $seq'); 80 | if (llama_cpp.llama_decode(ctx, batch) < 0) { 81 | throw Exception('decodeEmbeddingBatch: failed to decode'); 82 | } 83 | for (var k = 0; k < seq; k++) { 84 | final emb = llama_cpp.llama_get_embeddings_ith(ctx, k); 85 | final out = output + k * dimens; 86 | _normalize(emb, out, dimens); 87 | } 88 | } 89 | 90 | int get _physicalCores { 91 | final n = Platform.numberOfProcessors; 92 | return n > 4 93 | ? n ~/ 2 94 | : n > 0 95 | ? n 96 | : 4; 97 | } 98 | 99 | String _systemInfo(LlamaParams lp, llama_cpp.llama_context_params params) { 100 | final n = lp.nThreadBatch; 101 | final batch = n != null ? ' (n_threads_batch = $n)' : ''; 102 | return 'system_info: n_threads = ${params.n_threads}$batch ' 103 | '/ ${Platform.numberOfProcessors} ' 104 | '| ${CharArray.toDartString(llama_cpp.llama_print_system_info())}'; 105 | } 106 | 107 | /// Load a model from a given path, it could be a LLM also a embedding model. 108 | /// return both model and context. 109 | (ffi.Pointer, ffi.Pointer) 110 | loadModel(CharArray path, LlamaParams params) { 111 | final ctxSize = params.nCtx ?? 512; 112 | final s = params.seed ?? 0; 113 | final seed = s > 0 ? s : DateTime.now().millisecondsSinceEpoch ~/ 1000; 114 | print('seed = $seed'); 115 | print('llama backend init'); 116 | llama_cpp.llama_backend_init(); 117 | llama_cpp.llama_numa_init(params.numa); 118 | final modelParams = llama_cpp.llama_model_default_params(); 119 | final nGpuLayers = params.nGpuLayers; 120 | if (nGpuLayers != null) { 121 | modelParams.n_gpu_layers = nGpuLayers > 0 ? 
nGpuLayers : 0; 122 | } 123 | final mainGpu = params.mainGpu; 124 | if (mainGpu != null) { 125 | modelParams.main_gpu = mainGpu; 126 | } 127 | 128 | final model = llama_cpp.llama_load_model_from_file(path.pointer, modelParams); 129 | if (model.address == 0) { 130 | throw Exception("Load model from '${path.dartString}' failed"); 131 | } 132 | 133 | final ctxParams = llama_cpp.llama_context_default_params()..seed = seed; 134 | if (ctxSize > 0) { 135 | ctxParams.n_ctx = ctxSize; 136 | } 137 | final nBatch = params.nBatch ?? -1; 138 | if (nBatch > 0) { 139 | ctxParams.n_batch = nBatch; 140 | } 141 | final t = params.nThread ?? 0; 142 | ctxParams.n_threads = t > 0 ? t : _physicalCores; 143 | final tb = params.nThreadBatch ?? 0; 144 | ctxParams.n_threads_batch = tb > 0 ? tb : ctxParams.n_threads; 145 | 146 | final ctx = llama_cpp.llama_new_context_with_model(model, ctxParams); 147 | if (ctx.address == 0) { 148 | throw Exception("Create llama context failed"); 149 | } 150 | final nCtxTrain = llama_cpp.llama_n_ctx_train(model); 151 | final nCtx = llama_cpp.llama_n_ctx(ctx); 152 | print('n_ctx: $nCtx, train=$nCtxTrain'); 153 | if (nCtx > nCtxTrain) { 154 | print('warning: model was trained on only $nCtxTrain context tokens ' 155 | '($nCtx specified)'); 156 | } 157 | print(_systemInfo(params, ctxParams)); 158 | _warmup(model, ctx, ctxParams.n_batch); 159 | 160 | return (model, ctx); 161 | } 162 | 163 | void _warmup(ffi.Pointer model, 164 | ffi.Pointer ctx, int batchSize) { 165 | print('warming up the model with an empty run'); 166 | final tokens = TokenArray(size: 2); 167 | tokens.add(llama_cpp.llama_token_bos(model)); 168 | tokens.add(llama_cpp.llama_token_eos(model)); 169 | final batch = llama_cpp.llama_batch_get_one( 170 | tokens.pointerAt(0), 171 | m.min(tokens.length, batchSize), 172 | 0, 173 | 0, 174 | ); 175 | llama_cpp.llama_decode(ctx, batch); 176 | llama_cpp.llama_kv_cache_clear(ctx); 177 | llama_cpp.llama_reset_timings(ctx); 178 | tokens.dispose(); 179 | } 180 | -------------------------------------------------------------------------------- /lib/src/embedding.dart: -------------------------------------------------------------------------------- 1 | import 'dart:ffi' as ffi; 2 | 3 | import 'package:ffi/ffi.dart' show calloc; 4 | 5 | import 'common.dart' as c; 6 | import 'ffi.dart'; 7 | import 'lib_llama_cpp.dart' as llama_cpp; 8 | import 'llama_params.dart'; 9 | 10 | /// Embedding runs in current isolate. 11 | /// Place it in another isolate if you want async embeddings. 12 | final class Embedding { 13 | final ffi.Pointer model; 14 | final ffi.Pointer ctx; 15 | final CharArray cStr; 16 | final bool verbose; 17 | final tokenBuf = TokenArray(size: 64); 18 | 19 | Embedding._( 20 | this.model, 21 | this.ctx, 22 | this.cStr, 23 | this.verbose, 24 | ); 25 | 26 | factory Embedding( 27 | String path, { 28 | int? nThread, 29 | int? nThreadBatch, 30 | int? nCtx, 31 | int? nBatch, 32 | int? nGpuLayers, 33 | bool verbose = false, 34 | }) { 35 | final cStr = CharArray.from(path); 36 | final (model, ctx) = c.loadModel( 37 | cStr, 38 | LlamaParams( 39 | nThread: nThread, 40 | nThreadBatch: nThreadBatch, 41 | nCtx: nCtx, 42 | nBatch: nBatch, 43 | nGpuLayers: nGpuLayers, 44 | embedding: true, 45 | ), 46 | ); 47 | 48 | return Embedding._( 49 | model, 50 | ctx, 51 | cStr, 52 | verbose, 53 | ); 54 | } 55 | 56 | /// Embedding multiple prompts at one time. 57 | List> embedBatch(List prompts) => _embed(prompts); 58 | 59 | /// Embedding one prompt at one time. 
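  /// A minimal usage sketch (the model path is a placeholder; `Embedding`
  /// runs synchronously in the calling isolate):
  /// ```dart
  /// final embedding = Embedding('/path/to/your/embedding.gguf');
  /// final vector = embedding.embedSingle('hello world');
  /// print(vector.length); // dimension reported by the model
  /// embedding.dispose();
  /// ```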
60 | List embedSingle(String prompt) => _embed([prompt]).first; 61 | 62 | List> _embed(List prompts) { 63 | llama_cpp.llama_reset_timings(ctx); 64 | 65 | final batchSize = llama_cpp.llama_n_batch(ctx); 66 | final batch = llama_cpp.llama_batch_init(batchSize, 0, prompts.length); 67 | final arrayList = prompts.map((p) { 68 | cStr.pavedBy(p); 69 | tokenBuf.pavedBy(model, cStr, addBos: true); 70 | final l = tokenBuf.toList(); 71 | return l.length > batchSize ? l.sublist(0, batchSize) : l; 72 | }); 73 | if (verbose) { 74 | for (final (i, l) in arrayList.indexed) { 75 | print("main: prompt $i: '${prompts[i]}'"); 76 | print("main: number of tokens in prompt = ${l.length}"); 77 | for (final t in l) { 78 | print("${'$t'.padLeft(6)} -> '${cStr.tokenString(model, t)}'"); 79 | } 80 | } 81 | } 82 | 83 | final dimens = llama_cpp.llama_n_embd(model); 84 | final row = arrayList.length; 85 | final data = 86 | calloc.allocate(ffi.sizeOf() * row * dimens); 87 | var out = data; 88 | var s = 0; 89 | for (final tokens in arrayList) { 90 | final len = tokens.length; 91 | if (batch.n_tokens + len > batchSize) { 92 | c.decodeEmbeddingBatch(ctx, batch, out, s, dimens); 93 | batch.n_tokens = 0; 94 | out += s * dimens; 95 | s = 0; 96 | } 97 | c.addBatchSeq(batch, tokens, s); 98 | s++; 99 | } 100 | c.decodeEmbeddingBatch(ctx, batch, out, s, dimens); 101 | 102 | final result = List>.generate(row, (r) { 103 | final p = data + r * dimens; 104 | return List.generate( 105 | dimens, 106 | (i) => (p[i] * 1000000).round() / 1000000, 107 | growable: false, 108 | ); 109 | }, growable: false); 110 | llama_cpp.llama_print_timings(ctx); 111 | 112 | calloc.free(data); 113 | llama_cpp.llama_batch_free(batch); 114 | return result; 115 | } 116 | 117 | /// Free context, model and memory objects in C world. 118 | void dispose() { 119 | tokenBuf.dispose(); 120 | cStr.dispose(); 121 | 122 | llama_cpp.llama_free(ctx); 123 | llama_cpp.llama_free_model(model); 124 | llama_cpp.llama_backend_free(); 125 | print('Embedding.dispose: done.'); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /lib/src/ffi.dart: -------------------------------------------------------------------------------- 1 | import 'dart:convert' show utf8; 2 | import 'dart:ffi' as ffi; 3 | import 'dart:typed_data' show Uint8List; 4 | 5 | import 'package:ffi/ffi.dart' show Utf8, Utf8Pointer, calloc; 6 | 7 | import 'lib_llama_cpp.dart' as llama_cpp; 8 | 9 | CharArray _fillChars(String str, CharArray Function(int size) getter) { 10 | final units = utf8.encode(str); 11 | final size = units.length + 1; 12 | final len = size - 1; 13 | final buf = getter(size); 14 | final pointer = buf._buf.cast(); 15 | final raw = pointer.asTypedList(size); 16 | raw.setAll(0, units); 17 | raw[len] = 0; 18 | buf._len = len; 19 | return buf; 20 | } 21 | 22 | /// Util class for data conversion between Dart `String` and C `const char *`. 
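Because `decodeEmbeddingBatch` L2-normalizes every row before `_embed` copies it out, cosine similarity between two vectors returned by `embedSingle`/`embedBatch` reduces to a plain dot product. A small helper, not part of the package, showing that downstream use (for example in the RAG example):

```dart
/// Cosine similarity of two embeddings produced by Embedding.embedSingle;
/// the vectors are already unit-length, so this is just their dot product.
double cosine(List<double> a, List<double> b) {
  assert(a.length == b.length);
  var dot = 0.0;
  for (var i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
  }
  return dot;
}
```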
23 | /// From Dart `String` to C `const char *`: 24 | /// ```dart 25 | /// final cStr = CharArray.from('some thing as string'); 26 | /// final p = cStr.pointer; 27 | /// call_some_C_function(p, cStr.length); 28 | /// cStr.dispose(); 29 | /// ``` 30 | /// To reuse an existing `CharArray`: 31 | /// ```dart 32 | /// CharArray cStr; 33 | /// final p = cStr.pavedBy('some thing as string'); 34 | /// call_some_C_function(p, cStr.length); 35 | /// cStr.dispose(); 36 | /// ``` 37 | /// 38 | /// From C `const char *` to Dart `String`: 39 | /// ```dart 40 | /// final p = call_some_C_function(); 41 | /// final str = CharArray.fromNative(p); 42 | /// ``` 43 | final class CharArray { 44 | int _size; 45 | int _len; 46 | ffi.Pointer _buf; 47 | 48 | CharArray({int size = 32}) 49 | : _size = size, 50 | _len = 0, 51 | _buf = calloc.allocate(size * ffi.sizeOf()); 52 | 53 | /// Create newly a buffer for an existing Dart string. 54 | factory CharArray.from(String str) { 55 | final buf = _fillChars(str, (size) => CharArray(size: size)); 56 | return buf; 57 | } 58 | 59 | /// A helper function that converts the given Dart String to `const char *` 60 | /// with an existing `CharArray`. 61 | /// The capacity is expanded automatically. 62 | ffi.Pointer pavedBy(String str) { 63 | _fillChars(str, (size) => this.._resize(size)); 64 | return _buf; 65 | } 66 | 67 | int get length => _len; 68 | 69 | ffi.Pointer get pointer => _buf; 70 | 71 | /// Convert to Dart string with data in current buf and specified length. 72 | String get dartString => _buf.cast().toDartString(length: _len); 73 | 74 | bool _resize(int size) { 75 | if (size <= _size) { 76 | return false; 77 | } 78 | dispose(); 79 | _buf = calloc.allocate(size * ffi.sizeOf()); 80 | _size = size; 81 | // copy existing elements? 82 | _len = 0; 83 | return true; 84 | } 85 | 86 | /// Convert to Dart string with extern CString pointer without length. 87 | static String toDartString(ffi.Pointer pointer) => 88 | pointer.cast().toDartString(); 89 | 90 | /// A string representation for a token. 91 | /// In some model, one token would not return a full utf8 string. 92 | String tokenString(ffi.Pointer model, int token) { 93 | final bytes = tokenBytes(model, token); 94 | try { 95 | return dartString; 96 | } on Exception catch (_) { 97 | return bytes.toString(); 98 | } 99 | } 100 | 101 | /// Return a raw bytes with a given token Id. 102 | /// We need convert assigned int to unassigned, or else 103 | /// `FormatException: Invalid UTF-8 byte` would be thrown. 104 | List tokenBytes(ffi.Pointer model, int token) { 105 | final len = llama_cpp.llama_token_to_piece( 106 | model, 107 | token, 108 | _buf, 109 | _size, 110 | false, 111 | ); 112 | if (len < 0) { 113 | _resize(-len); 114 | _len = llama_cpp.llama_token_to_piece(model, token, _buf, _size, false); 115 | } else { 116 | _len = len; 117 | } 118 | return Uint8List.fromList(List.generate(_len, (i) => _buf[i])); 119 | } 120 | 121 | /// Release native resources. 
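The note on `tokenString` above (one token may not yield a complete UTF-8 string) matters when streaming output: bytes from several tokens may need to be buffered before they decode cleanly. A sketch of one lenient way to handle that, independent of this class:

```dart
import 'dart:convert' show utf8;

/// Joins the raw byte pieces of several tokens (e.g. from CharArray.tokenBytes)
/// and decodes them in one pass, replacing any still-incomplete UTF-8 sequence.
String decodePieces(Iterable<List<int>> pieces) {
  final bytes = [for (final p in pieces) ...p];
  return utf8.decode(bytes, allowMalformed: true);
}
```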
122 | void dispose() { 123 | calloc.free(_buf); 124 | _len = 0; 125 | _size = 0; 126 | } 127 | } 128 | 129 | final class TokenArray { 130 | int _size; 131 | int _len; 132 | ffi.Pointer _buf; 133 | 134 | TokenArray({int size = 512}) 135 | : _size = size, 136 | _len = 0, 137 | _buf = calloc.allocate( 138 | size * ffi.sizeOf()); 139 | 140 | int get length => _len; 141 | 142 | int get capacity => _size; 143 | 144 | int operator [](int pos) => _buf[pos]; 145 | 146 | ffi.Pointer pointerAt(int pos) => _buf + pos; 147 | 148 | void pavedBy( 149 | ffi.Pointer model, 150 | CharArray text, { 151 | bool addBos = false, 152 | }) { 153 | final size = text.length + 1; 154 | _resize(size); 155 | final len = llama_cpp.llama_tokenize( 156 | model, 157 | text.pointer, 158 | text.length, 159 | _buf, 160 | _size, 161 | addBos, 162 | false, 163 | ); 164 | if (len < 0) { 165 | throw Exception("tokenize '${text.dartString}' failed!"); 166 | } 167 | _len = len; 168 | } 169 | 170 | int clear() { 171 | var n = _len; 172 | _len = 0; 173 | return n; 174 | } 175 | 176 | void add(int token) { 177 | _resize(_len + 1); 178 | _buf[_len++] = token; 179 | } 180 | 181 | bool _resize(int size) { 182 | if (size <= _size) { 183 | return false; 184 | } 185 | dispose(); 186 | _buf = calloc.allocate( 187 | size * ffi.sizeOf()); 188 | _size = size; 189 | // copy existing elements? 190 | _len = 0; 191 | return true; 192 | } 193 | 194 | List toList() => List.generate(_len, (i) => _buf[i]); 195 | 196 | void dispose() { 197 | calloc.free(_buf); 198 | _len = 0; 199 | _size = 0; 200 | } 201 | } 202 | 203 | final class TokenDataArray { 204 | int _size; 205 | int _len; 206 | ffi.Pointer _buf; 207 | final pointer = calloc.allocate( 208 | ffi.sizeOf()); 209 | 210 | TokenDataArray(int size) 211 | : _size = size, 212 | _len = 0, 213 | _buf = calloc.allocate( 214 | size * ffi.sizeOf()); 215 | 216 | int get length => _len; 217 | 218 | llama_cpp.llama_token_data operator [](int pos) => _buf[pos]; 219 | 220 | void setLogit(int pos, double value) { 221 | (_buf + pos).ref.logit = value; 222 | } 223 | 224 | void pavedBy(ffi.Pointer logits, int size) { 225 | _resize(size); 226 | for (var id = 0; id < size; id++) { 227 | _buf[id] 228 | ..id = id 229 | ..logit = logits[id] 230 | ..p = 0; 231 | } 232 | pointer.ref 233 | ..data = _buf 234 | ..size = size 235 | ..sorted = false; 236 | } 237 | 238 | bool _resize(int size) { 239 | if (size <= _size) { 240 | return false; 241 | } 242 | _release(); 243 | _buf = calloc.allocate( 244 | size * ffi.sizeOf()); 245 | _size = size; 246 | // copy existing elements? 247 | _len = 0; 248 | return true; 249 | } 250 | 251 | // not free array pointer 252 | void _release() { 253 | calloc.free(_buf); 254 | _len = 0; 255 | _size = 0; 256 | } 257 | 258 | void dispose() { 259 | _release(); 260 | calloc.free(pointer); 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /lib/src/lib_llama_cpp.dart: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file 2 | // for details. All rights reserved. Use of this source code is governed by a 3 | // BSD-style license that can be found in the LICENSE file. 4 | 5 | // AUTO GENERATED FILE, DO NOT EDIT. 6 | // 7 | // Generated by `package:ffigen`. 
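The bindings in this generated file all follow the same shape: a top-level `external` function annotated with `@ffi.Native`, whose symbol is presumably resolved from the native library that `build.dart` produces through the native-assets mechanism. A hand-written illustration of that shape, duplicating the existing `llama_time_us` binding under a different Dart name purely as an example:

```dart
import 'dart:ffi' as ffi;

// int64_t llama_time_us(void); the symbol comes from the bundled native asset.
@ffi.Native<ffi.Int64 Function()>(symbol: 'llama_time_us')
external int llamaTimeUs();
```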
8 | // ignore_for_file: type=lint, unused_field, unused_element 9 | import 'dart:ffi' as ffi; 10 | 11 | /// Helpers for getting default parameters 12 | @ffi.Native(symbol: 'llama_model_default_params') 13 | external llama_model_params llama_model_default_params(); 14 | 15 | @ffi.Native( 16 | symbol: 'llama_context_default_params') 17 | external llama_context_params llama_context_default_params(); 18 | 19 | @ffi.Native( 20 | symbol: 'llama_model_quantize_default_params') 21 | external llama_model_quantize_params llama_model_quantize_default_params(); 22 | 23 | /// Initialize the llama + ggml backend 24 | /// If numa is true, use NUMA optimizations 25 | /// Call once at the start of the program 26 | @ffi.Native(symbol: 'llama_backend_init') 27 | external void llama_backend_init(); 28 | 29 | /// optional: 30 | @ffi.Native(symbol: 'llama_numa_init') 31 | external void llama_numa_init( 32 | int numa, 33 | ); 34 | 35 | /// Call once at the end of the program - currently only used for MPI 36 | @ffi.Native(symbol: 'llama_backend_free') 37 | external void llama_backend_free(); 38 | 39 | @ffi.Native< 40 | ffi.Pointer Function(ffi.Pointer, 41 | llama_model_params)>(symbol: 'llama_load_model_from_file') 42 | external ffi.Pointer llama_load_model_from_file( 43 | ffi.Pointer path_model, 44 | llama_model_params params, 45 | ); 46 | 47 | @ffi.Native)>( 48 | symbol: 'llama_free_model') 49 | external void llama_free_model( 50 | ffi.Pointer model, 51 | ); 52 | 53 | @ffi.Native< 54 | ffi.Pointer Function(ffi.Pointer, 55 | llama_context_params)>(symbol: 'llama_new_context_with_model') 56 | external ffi.Pointer llama_new_context_with_model( 57 | ffi.Pointer model, 58 | llama_context_params params, 59 | ); 60 | 61 | /// Frees all allocated memory 62 | @ffi.Native)>(symbol: 'llama_free') 63 | external void llama_free( 64 | ffi.Pointer ctx, 65 | ); 66 | 67 | @ffi.Native(symbol: 'llama_time_us') 68 | external int llama_time_us(); 69 | 70 | @ffi.Native(symbol: 'llama_max_devices') 71 | external int llama_max_devices(); 72 | 73 | @ffi.Native(symbol: 'llama_supports_mmap') 74 | external bool llama_supports_mmap(); 75 | 76 | @ffi.Native(symbol: 'llama_supports_mlock') 77 | external bool llama_supports_mlock(); 78 | 79 | @ffi.Native(symbol: 'llama_supports_gpu_offload') 80 | external bool llama_supports_gpu_offload(); 81 | 82 | @ffi.Native Function(ffi.Pointer)>( 83 | symbol: 'llama_get_model') 84 | external ffi.Pointer llama_get_model( 85 | ffi.Pointer ctx, 86 | ); 87 | 88 | @ffi.Native)>( 89 | symbol: 'llama_n_ctx') 90 | external int llama_n_ctx( 91 | ffi.Pointer ctx, 92 | ); 93 | 94 | @ffi.Native)>( 95 | symbol: 'llama_n_batch') 96 | external int llama_n_batch( 97 | ffi.Pointer ctx, 98 | ); 99 | 100 | @ffi.Native)>( 101 | symbol: 'llama_n_ubatch') 102 | external int llama_n_ubatch( 103 | ffi.Pointer ctx, 104 | ); 105 | 106 | @ffi.Native)>( 107 | symbol: 'llama_n_seq_max') 108 | external int llama_n_seq_max( 109 | ffi.Pointer ctx, 110 | ); 111 | 112 | @ffi.Native)>( 113 | symbol: 'llama_pooling_type') 114 | external int llama_pooling_type1( 115 | ffi.Pointer ctx, 116 | ); 117 | 118 | @ffi.Native)>( 119 | symbol: 'llama_vocab_type') 120 | external int llama_vocab_type1( 121 | ffi.Pointer model, 122 | ); 123 | 124 | @ffi.Native)>( 125 | symbol: 'llama_rope_type') 126 | external int llama_rope_type1( 127 | ffi.Pointer model, 128 | ); 129 | 130 | @ffi.Native)>( 131 | symbol: 'llama_n_vocab') 132 | external int llama_n_vocab( 133 | ffi.Pointer model, 134 | ); 135 | 136 | @ffi.Native)>( 137 | symbol: 'llama_n_ctx_train') 
138 | external int llama_n_ctx_train( 139 | ffi.Pointer model, 140 | ); 141 | 142 | @ffi.Native)>( 143 | symbol: 'llama_n_embd') 144 | external int llama_n_embd( 145 | ffi.Pointer model, 146 | ); 147 | 148 | @ffi.Native)>( 149 | symbol: 'llama_n_layer') 150 | external int llama_n_layer( 151 | ffi.Pointer model, 152 | ); 153 | 154 | /// Get the model's RoPE frequency scaling factor 155 | @ffi.Native)>( 156 | symbol: 'llama_rope_freq_scale_train') 157 | external double llama_rope_freq_scale_train( 158 | ffi.Pointer model, 159 | ); 160 | 161 | /// Get metadata value as a string by key name 162 | @ffi.Native< 163 | ffi.Int32 Function(ffi.Pointer, ffi.Pointer, 164 | ffi.Pointer, ffi.Size)>(symbol: 'llama_model_meta_val_str') 165 | external int llama_model_meta_val_str( 166 | ffi.Pointer model, 167 | ffi.Pointer key, 168 | ffi.Pointer buf, 169 | int buf_size, 170 | ); 171 | 172 | /// Get the number of metadata key/value pairs 173 | @ffi.Native)>( 174 | symbol: 'llama_model_meta_count') 175 | external int llama_model_meta_count( 176 | ffi.Pointer model, 177 | ); 178 | 179 | /// Get metadata key name by index 180 | @ffi.Native< 181 | ffi.Int32 Function( 182 | ffi.Pointer, 183 | ffi.Int32, 184 | ffi.Pointer, 185 | ffi.Size)>(symbol: 'llama_model_meta_key_by_index') 186 | external int llama_model_meta_key_by_index( 187 | ffi.Pointer model, 188 | int i, 189 | ffi.Pointer buf, 190 | int buf_size, 191 | ); 192 | 193 | /// Get metadata value as a string by index 194 | @ffi.Native< 195 | ffi.Int32 Function( 196 | ffi.Pointer, 197 | ffi.Int32, 198 | ffi.Pointer, 199 | ffi.Size)>(symbol: 'llama_model_meta_val_str_by_index') 200 | external int llama_model_meta_val_str_by_index( 201 | ffi.Pointer model, 202 | int i, 203 | ffi.Pointer buf, 204 | int buf_size, 205 | ); 206 | 207 | /// Get a string describing the model type 208 | @ffi.Native< 209 | ffi.Int32 Function(ffi.Pointer, ffi.Pointer, 210 | ffi.Size)>(symbol: 'llama_model_desc') 211 | external int llama_model_desc( 212 | ffi.Pointer model, 213 | ffi.Pointer buf, 214 | int buf_size, 215 | ); 216 | 217 | /// Returns the total size of all the tensors in the model in bytes 218 | @ffi.Native)>( 219 | symbol: 'llama_model_size') 220 | external int llama_model_size( 221 | ffi.Pointer model, 222 | ); 223 | 224 | /// Returns the total number of parameters in the model 225 | @ffi.Native)>( 226 | symbol: 'llama_model_n_params') 227 | external int llama_model_n_params( 228 | ffi.Pointer model, 229 | ); 230 | 231 | /// Get a llama model tensor 232 | @ffi.Native< 233 | ffi.Pointer Function(ffi.Pointer, 234 | ffi.Pointer)>(symbol: 'llama_get_model_tensor') 235 | external ffi.Pointer llama_get_model_tensor( 236 | ffi.Pointer model, 237 | ffi.Pointer name, 238 | ); 239 | 240 | /// Returns 0 on success 241 | @ffi.Native< 242 | ffi.Uint32 Function(ffi.Pointer, ffi.Pointer, 243 | ffi.Pointer)>( 244 | symbol: 'llama_model_quantize') 245 | external int llama_model_quantize( 246 | ffi.Pointer fname_inp, 247 | ffi.Pointer fname_out, 248 | ffi.Pointer params, 249 | ); 250 | 251 | /// Apply a LoRA adapter to a loaded model 252 | /// path_base_model is the path to a higher quality model to use as a base for 253 | /// the layers modified by the adapter. Can be NULL to use the current loaded model. 
254 | /// The model needs to be reloaded before applying a new adapter, otherwise the adapter 255 | /// will be applied on top of the previous one 256 | /// Returns 0 on success 257 | @ffi.Native< 258 | ffi.Int32 Function( 259 | ffi.Pointer, 260 | ffi.Pointer, 261 | ffi.Float, 262 | ffi.Pointer, 263 | ffi.Int32)>(symbol: 'llama_model_apply_lora_from_file') 264 | external int llama_model_apply_lora_from_file( 265 | ffi.Pointer model, 266 | ffi.Pointer path_lora, 267 | double scale, 268 | ffi.Pointer path_base_model, 269 | int n_threads, 270 | ); 271 | 272 | /// Apply a loaded control vector to a llama_context, or if data is NULL, clear 273 | /// the currently loaded vector. 274 | /// n_embd should be the size of a single layer's control, and data should point 275 | /// to an n_embd x n_layers buffer starting from layer 1. 276 | /// il_start and il_end are the layer range the vector should apply to (both inclusive) 277 | /// See llama_control_vector_load in common to load a control vector. 278 | @ffi.Native< 279 | ffi.Int32 Function( 280 | ffi.Pointer, 281 | ffi.Pointer, 282 | ffi.Size, 283 | ffi.Int32, 284 | ffi.Int32, 285 | ffi.Int32)>(symbol: 'llama_control_vector_apply') 286 | external int llama_control_vector_apply( 287 | ffi.Pointer lctx, 288 | ffi.Pointer data, 289 | int len, 290 | int n_embd, 291 | int il_start, 292 | int il_end, 293 | ); 294 | 295 | /// Create an empty KV cache view. (use only for debugging purposes) 296 | @ffi.Native< 297 | llama_kv_cache_view Function(ffi.Pointer, ffi.Int32)>( 298 | symbol: 'llama_kv_cache_view_init') 299 | external llama_kv_cache_view llama_kv_cache_view_init( 300 | ffi.Pointer ctx, 301 | int n_seq_max, 302 | ); 303 | 304 | /// Free a KV cache view. (use only for debugging purposes) 305 | @ffi.Native)>( 306 | symbol: 'llama_kv_cache_view_free') 307 | external void llama_kv_cache_view_free( 308 | ffi.Pointer view, 309 | ); 310 | 311 | /// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) 312 | @ffi.Native< 313 | ffi.Void Function(ffi.Pointer, 314 | ffi.Pointer)>(symbol: 'llama_kv_cache_view_update') 315 | external void llama_kv_cache_view_update( 316 | ffi.Pointer ctx, 317 | ffi.Pointer view, 318 | ); 319 | 320 | /// Returns the number of tokens in the KV cache (slow, use only for debug) 321 | /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times 322 | @ffi.Native)>( 323 | symbol: 'llama_get_kv_cache_token_count') 324 | external int llama_get_kv_cache_token_count( 325 | ffi.Pointer ctx, 326 | ); 327 | 328 | /// Returns the number of used KV cells (i.e. have at least one sequence assigned to them) 329 | @ffi.Native)>( 330 | symbol: 'llama_get_kv_cache_used_cells') 331 | external int llama_get_kv_cache_used_cells( 332 | ffi.Pointer ctx, 333 | ); 334 | 335 | /// Clear the KV cache - both cell info is erased and KV data is zeroed 336 | @ffi.Native)>( 337 | symbol: 'llama_kv_cache_clear') 338 | external void llama_kv_cache_clear( 339 | ffi.Pointer ctx, 340 | ); 341 | 342 | /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) 343 | /// Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails 344 | /// seq_id < 0 : match any sequence 345 | /// p0 < 0 : [0, p1] 346 | /// p1 < 0 : [p0, inf) 347 | @ffi.Native< 348 | ffi.Bool Function(ffi.Pointer, llama_seq_id, llama_pos, 349 | llama_pos)>(symbol: 'llama_kv_cache_seq_rm') 350 | external bool llama_kv_cache_seq_rm( 351 | ffi.Pointer ctx, 352 | int seq_id, 353 | int p0, 354 | int p1, 355 | ); 356 | 357 | /// Copy all tokens that belong to the specified sequence to another sequence 358 | /// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence 359 | /// p0 < 0 : [0, p1] 360 | /// p1 < 0 : [p0, inf) 361 | @ffi.Native< 362 | ffi.Void Function(ffi.Pointer, llama_seq_id, llama_seq_id, 363 | llama_pos, llama_pos)>(symbol: 'llama_kv_cache_seq_cp') 364 | external void llama_kv_cache_seq_cp( 365 | ffi.Pointer ctx, 366 | int seq_id_src, 367 | int seq_id_dst, 368 | int p0, 369 | int p1, 370 | ); 371 | 372 | /// Removes all tokens that do not belong to the specified sequence 373 | @ffi.Native, llama_seq_id)>( 374 | symbol: 'llama_kv_cache_seq_keep') 375 | external void llama_kv_cache_seq_keep( 376 | ffi.Pointer ctx, 377 | int seq_id, 378 | ); 379 | 380 | /// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) 381 | /// If the KV cache is RoPEd, the KV data is updated accordingly: 382 | /// - lazily on next llama_decode() 383 | /// - explicitly with llama_kv_cache_update() 384 | /// p0 < 0 : [0, p1] 385 | /// p1 < 0 : [p0, inf) 386 | @ffi.Native< 387 | ffi.Void Function(ffi.Pointer, llama_seq_id, llama_pos, 388 | llama_pos, llama_pos)>(symbol: 'llama_kv_cache_seq_add') 389 | external void llama_kv_cache_seq_add( 390 | ffi.Pointer ctx, 391 | int seq_id, 392 | int p0, 393 | int p1, 394 | int delta, 395 | ); 396 | 397 | /// Integer division of the positions by factor of `d > 1` 398 | /// If the KV cache is RoPEd, the KV data is updated accordingly: 399 | /// - lazily on next llama_decode() 400 | /// - explicitly with llama_kv_cache_update() 401 | /// p0 < 0 : [0, p1] 402 | /// p1 < 0 : [p0, inf) 403 | @ffi.Native< 404 | ffi.Void Function(ffi.Pointer, llama_seq_id, llama_pos, 405 | llama_pos, ffi.Int)>(symbol: 'llama_kv_cache_seq_div') 406 | external void llama_kv_cache_seq_div( 407 | ffi.Pointer ctx, 408 | int seq_id, 409 | int p0, 410 | int p1, 411 | int d, 412 | ); 413 | 414 | /// Returns the largest position present in the KV cache for the specified sequence 415 | @ffi.Native, llama_seq_id)>( 416 | symbol: 'llama_kv_cache_seq_pos_max') 417 | external int llama_kv_cache_seq_pos_max( 418 | ffi.Pointer ctx, 419 | int seq_id, 420 | ); 421 | 422 | /// Defragment the KV cache 423 | /// This will be applied: 424 | /// - lazily on next llama_decode() 425 | /// - explicitly with llama_kv_cache_update() 426 | @ffi.Native)>( 427 | symbol: 'llama_kv_cache_defrag') 428 | external void llama_kv_cache_defrag( 429 | ffi.Pointer ctx, 430 | ); 431 | 432 | /// Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
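These sequence-level operations are what is commonly used for "context shifting" once generation outgrows `n_ctx`: remove the oldest tokens of a sequence, then slide the remaining positions left. A hedged sketch on top of these bindings; the `nPast`/`nKeep` bookkeeping belongs to the caller, and imports of `dart:ffi` (as `ffi`) and this file (as `llama_cpp`) are assumed:

```dart
/// Drop roughly half of the tokens between [nKeep] and [nPast] in sequence 0,
/// then shift the survivors left so decoding can continue.
void shiftContext(
  ffi.Pointer<llama_cpp.llama_context> ctx,
  int nPast,
  int nKeep,
) {
  final nDiscard = (nPast - nKeep) ~/ 2;
  llama_cpp.llama_kv_cache_seq_rm(ctx, 0, nKeep, nKeep + nDiscard);
  llama_cpp.llama_kv_cache_seq_add(ctx, 0, nKeep + nDiscard, nPast, -nDiscard);
}
```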
433 | @ffi.Native)>( 434 | symbol: 'llama_kv_cache_update') 435 | external void llama_kv_cache_update( 436 | ffi.Pointer ctx, 437 | ); 438 | 439 | /// Returns the maximum size in bytes of the state (rng, logits, embedding 440 | /// and kv_cache) - will often be smaller after compacting tokens 441 | @ffi.Native)>( 442 | symbol: 'llama_state_get_size') 443 | external int llama_state_get_size( 444 | ffi.Pointer ctx, 445 | ); 446 | 447 | @ffi.Native)>( 448 | symbol: 'llama_get_state_size') 449 | external int llama_get_state_size( 450 | ffi.Pointer ctx, 451 | ); 452 | 453 | /// Copies the state to the specified destination address. 454 | /// Destination needs to have allocated enough memory. 455 | /// Returns the number of bytes copied 456 | @ffi.Native< 457 | ffi.Size Function(ffi.Pointer, ffi.Pointer)>( 458 | symbol: 'llama_state_get_data') 459 | external int llama_state_get_data( 460 | ffi.Pointer ctx, 461 | ffi.Pointer dst, 462 | ); 463 | 464 | @ffi.Native< 465 | ffi.Size Function(ffi.Pointer, ffi.Pointer)>( 466 | symbol: 'llama_copy_state_data') 467 | external int llama_copy_state_data( 468 | ffi.Pointer ctx, 469 | ffi.Pointer dst, 470 | ); 471 | 472 | /// Set the state reading from the specified address 473 | /// Returns the number of bytes read 474 | @ffi.Native< 475 | ffi.Size Function(ffi.Pointer, ffi.Pointer)>( 476 | symbol: 'llama_state_set_data') 477 | external int llama_state_set_data( 478 | ffi.Pointer ctx, 479 | ffi.Pointer src, 480 | ); 481 | 482 | @ffi.Native< 483 | ffi.Size Function(ffi.Pointer, ffi.Pointer)>( 484 | symbol: 'llama_set_state_data') 485 | external int llama_set_state_data( 486 | ffi.Pointer ctx, 487 | ffi.Pointer src, 488 | ); 489 | 490 | /// Save/load session file 491 | @ffi.Native< 492 | ffi.Bool Function( 493 | ffi.Pointer, 494 | ffi.Pointer, 495 | ffi.Pointer, 496 | ffi.Size, 497 | ffi.Pointer)>(symbol: 'llama_state_load_file') 498 | external bool llama_state_load_file( 499 | ffi.Pointer ctx, 500 | ffi.Pointer path_session, 501 | ffi.Pointer tokens_out, 502 | int n_token_capacity, 503 | ffi.Pointer n_token_count_out, 504 | ); 505 | 506 | @ffi.Native< 507 | ffi.Bool Function( 508 | ffi.Pointer, 509 | ffi.Pointer, 510 | ffi.Pointer, 511 | ffi.Size, 512 | ffi.Pointer)>(symbol: 'llama_load_session_file') 513 | external bool llama_load_session_file( 514 | ffi.Pointer ctx, 515 | ffi.Pointer path_session, 516 | ffi.Pointer tokens_out, 517 | int n_token_capacity, 518 | ffi.Pointer n_token_count_out, 519 | ); 520 | 521 | @ffi.Native< 522 | ffi.Bool Function(ffi.Pointer, ffi.Pointer, 523 | ffi.Pointer, ffi.Size)>(symbol: 'llama_state_save_file') 524 | external bool llama_state_save_file( 525 | ffi.Pointer ctx, 526 | ffi.Pointer path_session, 527 | ffi.Pointer tokens, 528 | int n_token_count, 529 | ); 530 | 531 | @ffi.Native< 532 | ffi.Bool Function(ffi.Pointer, ffi.Pointer, 533 | ffi.Pointer, ffi.Size)>(symbol: 'llama_save_session_file') 534 | external bool llama_save_session_file( 535 | ffi.Pointer ctx, 536 | ffi.Pointer path_session, 537 | ffi.Pointer tokens, 538 | int n_token_count, 539 | ); 540 | 541 | /// Get the exact size needed to copy the KV cache of a single sequence 542 | @ffi.Native, llama_seq_id)>( 543 | symbol: 'llama_state_seq_get_size') 544 | external int llama_state_seq_get_size( 545 | ffi.Pointer ctx, 546 | int seq_id, 547 | ); 548 | 549 | /// Copy the KV cache of a single sequence into the specified buffer 550 | @ffi.Native< 551 | ffi.Size Function(ffi.Pointer, ffi.Pointer, 552 | llama_seq_id)>(symbol: 'llama_state_seq_get_data') 553 | external int 
llama_state_seq_get_data( 554 | ffi.Pointer ctx, 555 | ffi.Pointer dst, 556 | int seq_id, 557 | ); 558 | 559 | /// Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence 560 | /// Returns: 561 | /// - Positive: Ok 562 | /// - Zero: Failed to load 563 | @ffi.Native< 564 | ffi.Size Function(ffi.Pointer, ffi.Pointer, 565 | llama_seq_id)>(symbol: 'llama_state_seq_set_data') 566 | external int llama_state_seq_set_data( 567 | ffi.Pointer ctx, 568 | ffi.Pointer src, 569 | int dest_seq_id, 570 | ); 571 | 572 | @ffi.Native< 573 | ffi.Size Function( 574 | ffi.Pointer, 575 | ffi.Pointer, 576 | llama_seq_id, 577 | ffi.Pointer, 578 | ffi.Size)>(symbol: 'llama_state_seq_save_file') 579 | external int llama_state_seq_save_file( 580 | ffi.Pointer ctx, 581 | ffi.Pointer filepath, 582 | int seq_id, 583 | ffi.Pointer tokens, 584 | int n_token_count, 585 | ); 586 | 587 | @ffi.Native< 588 | ffi.Size Function( 589 | ffi.Pointer, 590 | ffi.Pointer, 591 | llama_seq_id, 592 | ffi.Pointer, 593 | ffi.Size, 594 | ffi.Pointer)>(symbol: 'llama_state_seq_load_file') 595 | external int llama_state_seq_load_file( 596 | ffi.Pointer ctx, 597 | ffi.Pointer filepath, 598 | int dest_seq_id, 599 | ffi.Pointer tokens_out, 600 | int n_token_capacity, 601 | ffi.Pointer n_token_count_out, 602 | ); 603 | 604 | /// Return batch for single sequence of tokens starting at pos_0 605 | /// 606 | /// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it 607 | @ffi.Native< 608 | llama_batch Function(ffi.Pointer, ffi.Int32, llama_pos, 609 | llama_seq_id)>(symbol: 'llama_batch_get_one') 610 | external llama_batch llama_batch_get_one( 611 | ffi.Pointer tokens, 612 | int n_tokens, 613 | int pos_0, 614 | int seq_id, 615 | ); 616 | 617 | /// Allocates a batch of tokens on the heap that can hold a maximum of n_tokens 618 | /// Each token can be assigned up to n_seq_max sequence ids 619 | /// The batch has to be freed with llama_batch_free() 620 | /// If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float) 621 | /// Otherwise, llama_batch.token will be allocated to store n_tokens llama_token 622 | /// The rest of the llama_batch members are allocated with size n_tokens 623 | /// All members are left uninitialized 624 | @ffi.Native( 625 | symbol: 'llama_batch_init') 626 | external llama_batch llama_batch_init( 627 | int n_tokens, 628 | int embd, 629 | int n_seq_max, 630 | ); 631 | 632 | /// Frees a batch of tokens allocated with llama_batch_init() 633 | @ffi.Native(symbol: 'llama_batch_free') 634 | external void llama_batch_free( 635 | llama_batch batch, 636 | ); 637 | 638 | /// Positive return values does not mean a fatal error, but rather a warning. 
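Given the return-value convention spelled out below (0, positive, or negative), a caller-side wrapper might look like the following sketch; note that `decodeEmbeddingBatch` in `common.dart` only checks for negative values, so the "no KV slot" case passes silently there. Imports of `dart:ffi` (as `ffi`) and this file (as `llama_cpp`) are assumed:

```dart
/// Decode a batch and surface the three documented outcomes of llama_decode.
void decodeOrThrow(
  ffi.Pointer<llama_cpp.llama_context> ctx,
  llama_cpp.llama_batch batch,
) {
  final ret = llama_cpp.llama_decode(ctx, batch);
  if (ret < 0) {
    throw Exception('llama_decode failed: $ret');
  }
  if (ret > 0) {
    // Not fatal: no KV slot was found for this batch; retry with a smaller
    // batch or create the context with a larger n_ctx.
    print('llama_decode warning: no KV slot found (ret = $ret)');
  }
}
```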
639 | /// 0 - success 640 | /// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) 641 | /// < 0 - error 642 | @ffi.Native, llama_batch)>( 643 | symbol: 'llama_decode') 644 | external int llama_decode( 645 | ffi.Pointer ctx, 646 | llama_batch batch, 647 | ); 648 | 649 | /// Set the number of threads used for decoding 650 | /// n_threads is the number of threads used for generation (single token) 651 | /// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) 652 | @ffi.Native< 653 | ffi.Void Function(ffi.Pointer, ffi.Uint32, ffi.Uint32)>( 654 | symbol: 'llama_set_n_threads') 655 | external void llama_set_n_threads( 656 | ffi.Pointer ctx, 657 | int n_threads, 658 | int n_threads_batch, 659 | ); 660 | 661 | /// Get the number of threads used for generation of a single token. 662 | @ffi.Native)>( 663 | symbol: 'llama_n_threads') 664 | external int llama_n_threads( 665 | ffi.Pointer ctx, 666 | ); 667 | 668 | /// Get the number of threads used for prompt and batch processing (multiple token). 669 | @ffi.Native)>( 670 | symbol: 'llama_n_threads_batch') 671 | external int llama_n_threads_batch( 672 | ffi.Pointer ctx, 673 | ); 674 | 675 | /// Set whether the model is in embeddings mode or not 676 | /// If true, embeddings will be returned but logits will not 677 | @ffi.Native, ffi.Bool)>( 678 | symbol: 'llama_set_embeddings') 679 | external void llama_set_embeddings( 680 | ffi.Pointer ctx, 681 | bool embeddings, 682 | ); 683 | 684 | /// Set whether to use causal attention or not 685 | /// If set to true, the model will only attend to the past tokens 686 | @ffi.Native, ffi.Bool)>( 687 | symbol: 'llama_set_causal_attn') 688 | external void llama_set_causal_attn( 689 | ffi.Pointer ctx, 690 | bool causal_attn, 691 | ); 692 | 693 | /// Set abort callback 694 | @ffi.Native< 695 | ffi.Void Function(ffi.Pointer, ggml_abort_callback, 696 | ffi.Pointer)>(symbol: 'llama_set_abort_callback') 697 | external void llama_set_abort_callback( 698 | ffi.Pointer ctx, 699 | ggml_abort_callback abort_callback, 700 | ffi.Pointer abort_callback_data, 701 | ); 702 | 703 | /// Wait until all computations are finished 704 | /// This is automatically done when using one of the functions below to obtain the computation results 705 | /// and is not necessary to call it explicitly in most cases 706 | @ffi.Native)>( 707 | symbol: 'llama_synchronize') 708 | external void llama_synchronize( 709 | ffi.Pointer ctx, 710 | ); 711 | 712 | /// Token logits obtained from the last call to llama_decode() 713 | /// The logits for which llama_batch.logits[i] != 0 are stored contiguously 714 | /// in the order they have appeared in the batch. 715 | /// Rows: number of tokens for which llama_batch.logits[i] != 0 716 | /// Cols: n_vocab 717 | @ffi.Native Function(ffi.Pointer)>( 718 | symbol: 'llama_get_logits') 719 | external ffi.Pointer llama_get_logits( 720 | ffi.Pointer ctx, 721 | ); 722 | 723 | /// Logits for the ith token. For positive indices, Equivalent to: 724 | /// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab 725 | /// Negative indicies can be used to access logits in reverse order, -1 is the last logit. 726 | /// returns NULL for invalid ids. 727 | @ffi.Native< 728 | ffi.Pointer Function( 729 | ffi.Pointer, ffi.Int32)>(symbol: 'llama_get_logits_ith') 730 | external ffi.Pointer llama_get_logits_ith( 731 | ffi.Pointer ctx, 732 | int i, 733 | ); 734 | 735 | /// Get all output token embeddings. 
736 | /// when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, 737 | /// the embeddings for which llama_batch.logits[i] != 0 are stored contiguously 738 | /// in the order they have appeared in the batch. 739 | /// shape: [n_outputs*n_embd] 740 | /// Otherwise, returns NULL. 741 | @ffi.Native Function(ffi.Pointer)>( 742 | symbol: 'llama_get_embeddings') 743 | external ffi.Pointer llama_get_embeddings( 744 | ffi.Pointer ctx, 745 | ); 746 | 747 | /// Get the embeddings for the ith token. For positive indices, Equivalent to: 748 | /// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd 749 | /// Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding. 750 | /// shape: [n_embd] (1-dimensional) 751 | /// returns NULL for invalid ids. 752 | @ffi.Native< 753 | ffi.Pointer Function(ffi.Pointer, ffi.Int32)>( 754 | symbol: 'llama_get_embeddings_ith') 755 | external ffi.Pointer llama_get_embeddings_ith( 756 | ffi.Pointer ctx, 757 | int i, 758 | ); 759 | 760 | /// Get the embeddings for a sequence id 761 | /// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE 762 | /// shape: [n_embd] (1-dimensional) 763 | @ffi.Native< 764 | ffi.Pointer Function(ffi.Pointer, 765 | llama_seq_id)>(symbol: 'llama_get_embeddings_seq') 766 | external ffi.Pointer llama_get_embeddings_seq( 767 | ffi.Pointer ctx, 768 | int seq_id, 769 | ); 770 | 771 | /// Vocab 772 | @ffi.Native< 773 | ffi.Pointer Function( 774 | ffi.Pointer, llama_token)>(symbol: 'llama_token_get_text') 775 | external ffi.Pointer llama_token_get_text( 776 | ffi.Pointer model, 777 | int token, 778 | ); 779 | 780 | @ffi.Native, llama_token)>( 781 | symbol: 'llama_token_get_score') 782 | external double llama_token_get_score( 783 | ffi.Pointer model, 784 | int token, 785 | ); 786 | 787 | @ffi.Native, llama_token)>( 788 | symbol: 'llama_token_get_attr') 789 | external int llama_token_get_attr( 790 | ffi.Pointer model, 791 | int token, 792 | ); 793 | 794 | /// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) 795 | @ffi.Native, llama_token)>( 796 | symbol: 'llama_token_is_eog') 797 | external bool llama_token_is_eog( 798 | ffi.Pointer model, 799 | int token, 800 | ); 801 | 802 | /// Identify if Token Id is a control token or a render-able token 803 | @ffi.Native, llama_token)>( 804 | symbol: 'llama_token_is_control') 805 | external bool llama_token_is_control( 806 | ffi.Pointer model, 807 | int token, 808 | ); 809 | 810 | /// Special tokens 811 | @ffi.Native)>( 812 | symbol: 'llama_token_bos') 813 | external int llama_token_bos( 814 | ffi.Pointer model, 815 | ); 816 | 817 | @ffi.Native)>( 818 | symbol: 'llama_token_eos') 819 | external int llama_token_eos( 820 | ffi.Pointer model, 821 | ); 822 | 823 | @ffi.Native)>( 824 | symbol: 'llama_token_cls') 825 | external int llama_token_cls( 826 | ffi.Pointer model, 827 | ); 828 | 829 | @ffi.Native)>( 830 | symbol: 'llama_token_sep') 831 | external int llama_token_sep( 832 | ffi.Pointer model, 833 | ); 834 | 835 | @ffi.Native)>( 836 | symbol: 'llama_token_nl') 837 | external int llama_token_nl( 838 | ffi.Pointer model, 839 | ); 840 | 841 | @ffi.Native)>( 842 | symbol: 'llama_token_pad') 843 | external int llama_token_pad( 844 | ffi.Pointer model, 845 | ); 846 | 847 | /// Returns -1 if unknown, 1 for true or 0 for false. 
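The embedding getters above return raw `float*` rows; on the Dart side one straightforward way to materialize a row is `asTypedList` on the returned pointer, as in this sketch. Imports of `dart:ffi` (as `ffi`) and this file (as `llama_cpp`) are assumed, and `i` follows the same indexing rules as `llama_get_embeddings_ith`:

```dart
/// Copy the n_embd floats of output row [i] into a Dart list.
List<double> embeddingRow(
  ffi.Pointer<llama_cpp.llama_context> ctx,
  ffi.Pointer<llama_cpp.llama_model> model,
  int i,
) {
  final p = llama_cpp.llama_get_embeddings_ith(ctx, i);
  if (p == ffi.nullptr) {
    throw ArgumentError('invalid output id: $i');
  }
  final n = llama_cpp.llama_n_embd(model);
  return p.asTypedList(n).toList(growable: false);
}
```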
848 | @ffi.Native)>( 849 | symbol: 'llama_add_bos_token') 850 | external int llama_add_bos_token( 851 | ffi.Pointer model, 852 | ); 853 | 854 | /// Returns -1 if unknown, 1 for true or 0 for false. 855 | @ffi.Native)>( 856 | symbol: 'llama_add_eos_token') 857 | external int llama_add_eos_token( 858 | ffi.Pointer model, 859 | ); 860 | 861 | /// Codellama infill tokens 862 | @ffi.Native)>( 863 | symbol: 'llama_token_prefix') 864 | external int llama_token_prefix( 865 | ffi.Pointer model, 866 | ); 867 | 868 | @ffi.Native)>( 869 | symbol: 'llama_token_middle') 870 | external int llama_token_middle( 871 | ffi.Pointer model, 872 | ); 873 | 874 | @ffi.Native)>( 875 | symbol: 'llama_token_suffix') 876 | external int llama_token_suffix( 877 | ffi.Pointer model, 878 | ); 879 | 880 | @ffi.Native)>( 881 | symbol: 'llama_token_eot') 882 | external int llama_token_eot( 883 | ffi.Pointer model, 884 | ); 885 | 886 | /// @details Convert the provided text into tokens. 887 | /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. 888 | /// @return Returns the number of tokens on success, no more than n_tokens_max 889 | /// @return Returns a negative number on failure - the number of tokens that would have been returned 890 | /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated 891 | /// as plaintext. Does not insert a leading space. 892 | @ffi.Native< 893 | ffi.Int32 Function( 894 | ffi.Pointer, 895 | ffi.Pointer, 896 | ffi.Int32, 897 | ffi.Pointer, 898 | ffi.Int32, 899 | ffi.Bool, 900 | ffi.Bool)>(symbol: 'llama_tokenize') 901 | external int llama_tokenize( 902 | ffi.Pointer model, 903 | ffi.Pointer text, 904 | int text_len, 905 | ffi.Pointer tokens, 906 | int n_tokens_max, 907 | bool add_special, 908 | bool parse_special, 909 | ); 910 | 911 | /// Token Id -> Piece. 912 | /// Uses the vocabulary in the provided context. 913 | /// Does not write null terminator to the buffer. 914 | /// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. 915 | /// @param special If true, special tokens are rendered in the output. 916 | @ffi.Native< 917 | ffi.Int32 Function( 918 | ffi.Pointer, 919 | llama_token, 920 | ffi.Pointer, 921 | ffi.Int32, 922 | ffi.Bool)>(symbol: 'llama_token_to_piece') 923 | external int llama_token_to_piece( 924 | ffi.Pointer model, 925 | int token, 926 | ffi.Pointer buf, 927 | int length, 928 | bool special, 929 | ); 930 | 931 | /// Apply chat template. Inspired by hf apply_chat_template() on python. 932 | /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" 933 | /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template 934 | /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. 935 | /// @param chat Pointer to a list of multiple llama_chat_message 936 | /// @param n_msg Number of llama_chat_message in this chat 937 | /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. 938 | /// @param buf A buffer to hold the output formatted prompt. 
The recommended alloc size is 2 * (total number of characters of all messages) 939 | /// @param length The size of the allocated buffer 940 | /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template. 941 | @ffi.Native< 942 | ffi.Int32 Function( 943 | ffi.Pointer, 944 | ffi.Pointer, 945 | ffi.Pointer, 946 | ffi.Size, 947 | ffi.Bool, 948 | ffi.Pointer, 949 | ffi.Int32)>(symbol: 'llama_chat_apply_template') 950 | external int llama_chat_apply_template( 951 | ffi.Pointer model, 952 | ffi.Pointer tmpl, 953 | ffi.Pointer chat, 954 | int n_msg, 955 | bool add_ass, 956 | ffi.Pointer buf, 957 | int length, 958 | ); 959 | 960 | /// Initialize a llama_grammar. 961 | /// 962 | /// @param rules The rule elements of the grammar to initialize. 963 | /// @param n_rules The number of rules. 964 | /// @param start_rule_index The index of the root rule (the starting point of the grammar). 965 | /// @return The initialized llama_grammar or nullptr if initialization failed. 966 | @ffi.Native< 967 | ffi.Pointer Function( 968 | ffi.Pointer>, 969 | ffi.Size, 970 | ffi.Size)>(symbol: 'llama_grammar_init') 971 | external ffi.Pointer llama_grammar_init( 972 | ffi.Pointer> rules, 973 | int n_rules, 974 | int start_rule_index, 975 | ); 976 | 977 | @ffi.Native)>( 978 | symbol: 'llama_grammar_free') 979 | external void llama_grammar_free( 980 | ffi.Pointer grammar, 981 | ); 982 | 983 | @ffi.Native Function(ffi.Pointer)>( 984 | symbol: 'llama_grammar_copy') 985 | external ffi.Pointer llama_grammar_copy( 986 | ffi.Pointer grammar, 987 | ); 988 | 989 | /// Sets the current rng seed. 990 | @ffi.Native, ffi.Uint32)>( 991 | symbol: 'llama_set_rng_seed') 992 | external void llama_set_rng_seed( 993 | ffi.Pointer ctx, 994 | int seed, 995 | ); 996 | 997 | /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. 998 | /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 999 | @ffi.Native< 1000 | ffi.Void Function( 1001 | ffi.Pointer, 1002 | ffi.Pointer, 1003 | ffi.Pointer, 1004 | ffi.Size, 1005 | ffi.Float, 1006 | ffi.Float, 1007 | ffi.Float)>(symbol: 'llama_sample_repetition_penalties') 1008 | external void llama_sample_repetition_penalties( 1009 | ffi.Pointer ctx, 1010 | ffi.Pointer candidates, 1011 | ffi.Pointer last_tokens, 1012 | int penalty_last_n, 1013 | double penalty_repeat, 1014 | double penalty_freq, 1015 | double penalty_present, 1016 | ); 1017 | 1018 | /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 1019 | /// @param logits Logits extracted from the original generation context. 1020 | /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. 1021 | /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. 
1022 | @ffi.Native< 1023 | ffi.Void Function( 1024 | ffi.Pointer, 1025 | ffi.Pointer, 1026 | ffi.Pointer, 1027 | ffi.Float)>(symbol: 'llama_sample_apply_guidance') 1028 | external void llama_sample_apply_guidance( 1029 | ffi.Pointer ctx, 1030 | ffi.Pointer logits, 1031 | ffi.Pointer logits_guidance, 1032 | double scale, 1033 | ); 1034 | 1035 | /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 1036 | @ffi.Native< 1037 | ffi.Void Function(ffi.Pointer, 1038 | ffi.Pointer)>(symbol: 'llama_sample_softmax') 1039 | external void llama_sample_softmax( 1040 | ffi.Pointer ctx, 1041 | ffi.Pointer candidates, 1042 | ); 1043 | 1044 | /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 1045 | @ffi.Native< 1046 | ffi.Void Function( 1047 | ffi.Pointer, 1048 | ffi.Pointer, 1049 | ffi.Int32, 1050 | ffi.Size)>(symbol: 'llama_sample_top_k') 1051 | external void llama_sample_top_k( 1052 | ffi.Pointer ctx, 1053 | ffi.Pointer candidates, 1054 | int k, 1055 | int min_keep, 1056 | ); 1057 | 1058 | /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 1059 | @ffi.Native< 1060 | ffi.Void Function( 1061 | ffi.Pointer, 1062 | ffi.Pointer, 1063 | ffi.Float, 1064 | ffi.Size)>(symbol: 'llama_sample_top_p') 1065 | external void llama_sample_top_p( 1066 | ffi.Pointer ctx, 1067 | ffi.Pointer candidates, 1068 | double p, 1069 | int min_keep, 1070 | ); 1071 | 1072 | /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 1073 | @ffi.Native< 1074 | ffi.Void Function( 1075 | ffi.Pointer, 1076 | ffi.Pointer, 1077 | ffi.Float, 1078 | ffi.Size)>(symbol: 'llama_sample_min_p') 1079 | external void llama_sample_min_p( 1080 | ffi.Pointer ctx, 1081 | ffi.Pointer candidates, 1082 | double p, 1083 | int min_keep, 1084 | ); 1085 | 1086 | /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 1087 | @ffi.Native< 1088 | ffi.Void Function( 1089 | ffi.Pointer, 1090 | ffi.Pointer, 1091 | ffi.Float, 1092 | ffi.Size)>(symbol: 'llama_sample_tail_free') 1093 | external void llama_sample_tail_free( 1094 | ffi.Pointer ctx, 1095 | ffi.Pointer candidates, 1096 | double z, 1097 | int min_keep, 1098 | ); 1099 | 1100 | /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 1101 | @ffi.Native< 1102 | ffi.Void Function( 1103 | ffi.Pointer, 1104 | ffi.Pointer, 1105 | ffi.Float, 1106 | ffi.Size)>(symbol: 'llama_sample_typical') 1107 | external void llama_sample_typical( 1108 | ffi.Pointer ctx, 1109 | ffi.Pointer candidates, 1110 | double p, 1111 | int min_keep, 1112 | ); 1113 | 1114 | /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. 
1115 | @ffi.Native< 1116 | ffi.Void Function( 1117 | ffi.Pointer, 1118 | ffi.Pointer, 1119 | ffi.Float, 1120 | ffi.Float, 1121 | ffi.Float)>(symbol: 'llama_sample_entropy') 1122 | external void llama_sample_entropy( 1123 | ffi.Pointer ctx, 1124 | ffi.Pointer candidates_p, 1125 | double min_temp, 1126 | double max_temp, 1127 | double exponent_val, 1128 | ); 1129 | 1130 | @ffi.Native< 1131 | ffi.Void Function( 1132 | ffi.Pointer, 1133 | ffi.Pointer, 1134 | ffi.Float)>(symbol: 'llama_sample_temp') 1135 | external void llama_sample_temp( 1136 | ffi.Pointer ctx, 1137 | ffi.Pointer candidates, 1138 | double temp, 1139 | ); 1140 | 1141 | /// @details Apply constraints from grammar 1142 | @ffi.Native< 1143 | ffi.Void Function( 1144 | ffi.Pointer, 1145 | ffi.Pointer, 1146 | ffi.Pointer)>(symbol: 'llama_sample_grammar') 1147 | external void llama_sample_grammar( 1148 | ffi.Pointer ctx, 1149 | ffi.Pointer candidates, 1150 | ffi.Pointer grammar, 1151 | ); 1152 | 1153 | /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. 1154 | /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 1155 | /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 1156 | /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. 1157 | /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. 1158 | /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 1159 | @ffi.Native< 1160 | llama_token Function( 1161 | ffi.Pointer, 1162 | ffi.Pointer, 1163 | ffi.Float, 1164 | ffi.Float, 1165 | ffi.Int32, 1166 | ffi.Pointer)>(symbol: 'llama_sample_token_mirostat') 1167 | external int llama_sample_token_mirostat( 1168 | ffi.Pointer ctx, 1169 | ffi.Pointer candidates, 1170 | double tau, 1171 | double eta, 1172 | int m, 1173 | ffi.Pointer mu, 1174 | ); 1175 | 1176 | /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. 1177 | /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 1178 | /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 1179 | /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. 
A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. 1180 | /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 1181 | @ffi.Native< 1182 | llama_token Function( 1183 | ffi.Pointer, 1184 | ffi.Pointer, 1185 | ffi.Float, 1186 | ffi.Float, 1187 | ffi.Pointer)>(symbol: 'llama_sample_token_mirostat_v2') 1188 | external int llama_sample_token_mirostat_v2( 1189 | ffi.Pointer ctx, 1190 | ffi.Pointer candidates, 1191 | double tau, 1192 | double eta, 1193 | ffi.Pointer mu, 1194 | ); 1195 | 1196 | /// @details Selects the token with the highest probability. 1197 | /// Does not compute the token probabilities. Use llama_sample_softmax() instead. 1198 | @ffi.Native< 1199 | llama_token Function( 1200 | ffi.Pointer, ffi.Pointer)>( 1201 | symbol: 'llama_sample_token_greedy') 1202 | external int llama_sample_token_greedy( 1203 | ffi.Pointer ctx, 1204 | ffi.Pointer candidates, 1205 | ); 1206 | 1207 | /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. 1208 | @ffi.Native< 1209 | llama_token Function(ffi.Pointer, 1210 | ffi.Pointer)>(symbol: 'llama_sample_token') 1211 | external int llama_sample_token( 1212 | ffi.Pointer ctx, 1213 | ffi.Pointer candidates, 1214 | ); 1215 | 1216 | /// @details Accepts the sampled token into the grammar 1217 | @ffi.Native< 1218 | ffi.Void Function(ffi.Pointer, ffi.Pointer, 1219 | llama_token)>(symbol: 'llama_grammar_accept_token') 1220 | external void llama_grammar_accept_token( 1221 | ffi.Pointer ctx, 1222 | ffi.Pointer grammar, 1223 | int token, 1224 | ); 1225 | 1226 | /// @details Build a split GGUF final path for this chunk. 1227 | /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" 1228 | /// Returns the split_path length. 1229 | @ffi.Native< 1230 | ffi.Int Function(ffi.Pointer, ffi.Size, ffi.Pointer, 1231 | ffi.Int, ffi.Int)>(symbol: 'llama_split_path') 1232 | external int llama_split_path( 1233 | ffi.Pointer split_path, 1234 | int maxlen, 1235 | ffi.Pointer path_prefix, 1236 | int split_no, 1237 | int split_count, 1238 | ); 1239 | 1240 | /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. 1241 | /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" 1242 | /// Returns the split_prefix length. 
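Taken together, the functions above form the usual sampling pipeline: narrow the candidate set (top-k, top-p), apply temperature, then draw the token. A sketch of one decoding step built on these bindings and the `TokenDataArray` helper from `ffi.dart`; the cutoff values are illustrative, not defaults of this package, and the same imports as above are assumed:

```dart
/// Pick the next token from the logits of the last decoded position.
int sampleNext(
  ffi.Pointer<llama_cpp.llama_context> ctx,
  ffi.Pointer<llama_cpp.llama_model> model,
  TokenDataArray candidates,
) {
  // -1 addresses the last output row, per the llama_get_logits_ith contract.
  final logits = llama_cpp.llama_get_logits_ith(ctx, -1);
  candidates.pavedBy(logits, llama_cpp.llama_n_vocab(model));
  llama_cpp.llama_sample_top_k(ctx, candidates.pointer, 40, 1);
  llama_cpp.llama_sample_top_p(ctx, candidates.pointer, 0.95, 1);
  llama_cpp.llama_sample_temp(ctx, candidates.pointer, 0.8);
  return llama_cpp.llama_sample_token(ctx, candidates.pointer);
}
```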
1243 | @ffi.Native< 1244 | ffi.Int Function(ffi.Pointer, ffi.Size, ffi.Pointer, 1245 | ffi.Int, ffi.Int)>(symbol: 'llama_split_prefix') 1246 | external int llama_split_prefix( 1247 | ffi.Pointer split_prefix, 1248 | int maxlen, 1249 | ffi.Pointer split_path, 1250 | int split_no, 1251 | int split_count, 1252 | ); 1253 | 1254 | /// Performance information 1255 | @ffi.Native)>( 1256 | symbol: 'llama_get_timings') 1257 | external llama_timings llama_get_timings( 1258 | ffi.Pointer ctx, 1259 | ); 1260 | 1261 | @ffi.Native)>( 1262 | symbol: 'llama_print_timings') 1263 | external void llama_print_timings( 1264 | ffi.Pointer ctx, 1265 | ); 1266 | 1267 | @ffi.Native)>( 1268 | symbol: 'llama_reset_timings') 1269 | external void llama_reset_timings( 1270 | ffi.Pointer ctx, 1271 | ); 1272 | 1273 | /// Print system information 1274 | @ffi.Native Function()>(symbol: 'llama_print_system_info') 1275 | external ffi.Pointer llama_print_system_info(); 1276 | 1277 | /// Set callback for all future logging events. 1278 | /// If this is not called, or NULL is supplied, everything is output on stderr. 1279 | @ffi.Native)>( 1280 | symbol: 'llama_log_set') 1281 | external void llama_log_set( 1282 | ggml_log_callback log_callback, 1283 | ffi.Pointer user_data, 1284 | ); 1285 | 1286 | @ffi.Native, ffi.Pointer)>( 1287 | symbol: 'llama_dump_timing_info_yaml') 1288 | external void llama_dump_timing_info_yaml( 1289 | ffi.Pointer stream, 1290 | ffi.Pointer ctx, 1291 | ); 1292 | 1293 | /// C interface 1294 | /// 1295 | /// TODO: show sample usage 1296 | final class llama_model extends ffi.Opaque {} 1297 | 1298 | final class llama_context extends ffi.Opaque {} 1299 | 1300 | abstract class llama_vocab_type { 1301 | /// For models without vocab 1302 | static const int LLAMA_VOCAB_TYPE_NONE = 0; 1303 | 1304 | /// LLaMA tokenizer based on byte-level BPE with byte fallback 1305 | static const int LLAMA_VOCAB_TYPE_SPM = 1; 1306 | 1307 | /// GPT-2 tokenizer based on byte-level BPE 1308 | static const int LLAMA_VOCAB_TYPE_BPE = 2; 1309 | 1310 | /// BERT tokenizer based on WordPiece 1311 | static const int LLAMA_VOCAB_TYPE_WPM = 3; 1312 | 1313 | /// T5 tokenizer based on Unigram 1314 | static const int LLAMA_VOCAB_TYPE_UGM = 4; 1315 | } 1316 | 1317 | /// pre-tokenization types 1318 | abstract class llama_vocab_pre_type { 1319 | static const int LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0; 1320 | static const int LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1; 1321 | static const int LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2; 1322 | static const int LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3; 1323 | static const int LLAMA_VOCAB_PRE_TYPE_FALCON = 4; 1324 | static const int LLAMA_VOCAB_PRE_TYPE_MPT = 5; 1325 | static const int LLAMA_VOCAB_PRE_TYPE_STARCODER = 6; 1326 | static const int LLAMA_VOCAB_PRE_TYPE_GPT2 = 7; 1327 | static const int LLAMA_VOCAB_PRE_TYPE_REFACT = 8; 1328 | static const int LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9; 1329 | static const int LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10; 1330 | static const int LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11; 1331 | static const int LLAMA_VOCAB_PRE_TYPE_OLMO = 12; 1332 | static const int LLAMA_VOCAB_PRE_TYPE_DBRX = 13; 1333 | static const int LLAMA_VOCAB_PRE_TYPE_SMAUG = 14; 1334 | static const int LLAMA_VOCAB_PRE_TYPE_PORO = 15; 1335 | } 1336 | 1337 | /// note: these values should be synchronized with ggml_rope 1338 | /// TODO: maybe move this enum to ggml.h (ggml_rope_type) 1339 | abstract class llama_rope_type { 1340 | static const int LLAMA_ROPE_TYPE_NONE = -1; 1341 | static const int LLAMA_ROPE_TYPE_NORM = 0; 1342 | static 
const int LLAMA_ROPE_TYPE_NEOX = 2; 1343 | static const int LLAMA_ROPE_TYPE_GLM = 4; 1344 | } 1345 | 1346 | abstract class llama_token_type { 1347 | static const int LLAMA_TOKEN_TYPE_UNDEFINED = 0; 1348 | static const int LLAMA_TOKEN_TYPE_NORMAL = 1; 1349 | static const int LLAMA_TOKEN_TYPE_UNKNOWN = 2; 1350 | static const int LLAMA_TOKEN_TYPE_CONTROL = 3; 1351 | static const int LLAMA_TOKEN_TYPE_USER_DEFINED = 4; 1352 | static const int LLAMA_TOKEN_TYPE_UNUSED = 5; 1353 | static const int LLAMA_TOKEN_TYPE_BYTE = 6; 1354 | } 1355 | 1356 | abstract class llama_token_attr { 1357 | static const int LLAMA_TOKEN_ATTR_UNDEFINED = 0; 1358 | static const int LLAMA_TOKEN_ATTR_UNKNOWN = 1; 1359 | static const int LLAMA_TOKEN_ATTR_UNUSED = 2; 1360 | static const int LLAMA_TOKEN_ATTR_NORMAL = 4; 1361 | 1362 | /// SPECIAL? 1363 | static const int LLAMA_TOKEN_ATTR_CONTROL = 8; 1364 | static const int LLAMA_TOKEN_ATTR_USER_DEFINED = 16; 1365 | static const int LLAMA_TOKEN_ATTR_BYTE = 32; 1366 | static const int LLAMA_TOKEN_ATTR_NORMALIZED = 64; 1367 | static const int LLAMA_TOKEN_ATTR_LSTRIP = 128; 1368 | static const int LLAMA_TOKEN_ATTR_RSTRIP = 256; 1369 | static const int LLAMA_TOKEN_ATTR_SINGLE_WORD = 512; 1370 | } 1371 | 1372 | /// model file types 1373 | abstract class llama_ftype { 1374 | static const int LLAMA_FTYPE_ALL_F32 = 0; 1375 | 1376 | /// except 1d tensors 1377 | static const int LLAMA_FTYPE_MOSTLY_F16 = 1; 1378 | 1379 | /// except 1d tensors 1380 | static const int LLAMA_FTYPE_MOSTLY_Q4_0 = 2; 1381 | 1382 | /// except 1d tensors 1383 | static const int LLAMA_FTYPE_MOSTLY_Q4_1 = 3; 1384 | 1385 | /// tok_embeddings.weight and output.weight are F16 1386 | static const int LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4; 1387 | 1388 | /// except 1d tensors 1389 | static const int LLAMA_FTYPE_MOSTLY_Q8_0 = 7; 1390 | 1391 | /// except 1d tensors 1392 | static const int LLAMA_FTYPE_MOSTLY_Q5_0 = 8; 1393 | 1394 | /// except 1d tensors 1395 | static const int LLAMA_FTYPE_MOSTLY_Q5_1 = 9; 1396 | 1397 | /// except 1d tensors 1398 | static const int LLAMA_FTYPE_MOSTLY_Q2_K = 10; 1399 | 1400 | /// except 1d tensors 1401 | static const int LLAMA_FTYPE_MOSTLY_Q3_K_S = 11; 1402 | 1403 | /// except 1d tensors 1404 | static const int LLAMA_FTYPE_MOSTLY_Q3_K_M = 12; 1405 | 1406 | /// except 1d tensors 1407 | static const int LLAMA_FTYPE_MOSTLY_Q3_K_L = 13; 1408 | 1409 | /// except 1d tensors 1410 | static const int LLAMA_FTYPE_MOSTLY_Q4_K_S = 14; 1411 | 1412 | /// except 1d tensors 1413 | static const int LLAMA_FTYPE_MOSTLY_Q4_K_M = 15; 1414 | 1415 | /// except 1d tensors 1416 | static const int LLAMA_FTYPE_MOSTLY_Q5_K_S = 16; 1417 | 1418 | /// except 1d tensors 1419 | static const int LLAMA_FTYPE_MOSTLY_Q5_K_M = 17; 1420 | 1421 | /// except 1d tensors 1422 | static const int LLAMA_FTYPE_MOSTLY_Q6_K = 18; 1423 | 1424 | /// except 1d tensors 1425 | static const int LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19; 1426 | 1427 | /// except 1d tensors 1428 | static const int LLAMA_FTYPE_MOSTLY_IQ2_XS = 20; 1429 | 1430 | /// except 1d tensors 1431 | static const int LLAMA_FTYPE_MOSTLY_Q2_K_S = 21; 1432 | 1433 | /// except 1d tensors 1434 | static const int LLAMA_FTYPE_MOSTLY_IQ3_XS = 22; 1435 | 1436 | /// except 1d tensors 1437 | static const int LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23; 1438 | 1439 | /// except 1d tensors 1440 | static const int LLAMA_FTYPE_MOSTLY_IQ1_S = 24; 1441 | 1442 | /// except 1d tensors 1443 | static const int LLAMA_FTYPE_MOSTLY_IQ4_NL = 25; 1444 | 1445 | /// except 1d tensors 1446 | static const int 
LLAMA_FTYPE_MOSTLY_IQ3_S = 26; 1447 | 1448 | /// except 1d tensors 1449 | static const int LLAMA_FTYPE_MOSTLY_IQ3_M = 27; 1450 | 1451 | /// except 1d tensors 1452 | static const int LLAMA_FTYPE_MOSTLY_IQ2_S = 28; 1453 | 1454 | /// except 1d tensors 1455 | static const int LLAMA_FTYPE_MOSTLY_IQ2_M = 29; 1456 | 1457 | /// except 1d tensors 1458 | static const int LLAMA_FTYPE_MOSTLY_IQ4_XS = 30; 1459 | 1460 | /// except 1d tensors 1461 | static const int LLAMA_FTYPE_MOSTLY_IQ1_M = 31; 1462 | 1463 | /// except 1d tensors 1464 | static const int LLAMA_FTYPE_MOSTLY_BF16 = 32; 1465 | 1466 | /// not specified in the model file 1467 | static const int LLAMA_FTYPE_GUESSED = 1024; 1468 | } 1469 | 1470 | abstract class llama_rope_scaling_type { 1471 | static const int LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1; 1472 | static const int LLAMA_ROPE_SCALING_TYPE_NONE = 0; 1473 | static const int LLAMA_ROPE_SCALING_TYPE_LINEAR = 1; 1474 | static const int LLAMA_ROPE_SCALING_TYPE_YARN = 2; 1475 | static const int LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = 2; 1476 | } 1477 | 1478 | abstract class llama_pooling_type { 1479 | static const int LLAMA_POOLING_TYPE_UNSPECIFIED = -1; 1480 | static const int LLAMA_POOLING_TYPE_NONE = 0; 1481 | static const int LLAMA_POOLING_TYPE_MEAN = 1; 1482 | static const int LLAMA_POOLING_TYPE_CLS = 2; 1483 | static const int LLAMA_POOLING_TYPE_LAST = 3; 1484 | } 1485 | 1486 | abstract class llama_split_mode { 1487 | /// single GPU 1488 | static const int LLAMA_SPLIT_MODE_NONE = 0; 1489 | 1490 | /// split layers and KV across GPUs 1491 | static const int LLAMA_SPLIT_MODE_LAYER = 1; 1492 | 1493 | /// split rows across GPUs 1494 | static const int LLAMA_SPLIT_MODE_ROW = 2; 1495 | } 1496 | 1497 | final class llama_token_data extends ffi.Struct { 1498 | /// token id 1499 | @llama_token() 1500 | external int id; 1501 | 1502 | /// log-odds of the token 1503 | @ffi.Float() 1504 | external double logit; 1505 | 1506 | /// probability of the token 1507 | @ffi.Float() 1508 | external double p; 1509 | } 1510 | 1511 | typedef llama_token = ffi.Int32; 1512 | typedef Dartllama_token = int; 1513 | 1514 | final class llama_token_data_array extends ffi.Struct { 1515 | external ffi.Pointer data; 1516 | 1517 | @ffi.Size() 1518 | external int size; 1519 | 1520 | @ffi.Bool() 1521 | external bool sorted; 1522 | } 1523 | 1524 | /// Input data for llama_decode 1525 | /// A llama_batch object can contain input about one or many sequences 1526 | /// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens 1527 | /// 1528 | /// - token : the token ids of the input (used when embd is NULL) 1529 | /// - embd : token embeddings (i.e. 
float vector of size n_embd) (used when token is NULL) 1530 | /// - pos : the positions of the respective token in the sequence 1531 | /// - seq_id : the sequence to which the respective token belongs 1532 | /// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output 1533 | final class llama_batch extends ffi.Struct { 1534 | @ffi.Int32() 1535 | external int n_tokens; 1536 | 1537 | external ffi.Pointer token; 1538 | 1539 | external ffi.Pointer embd; 1540 | 1541 | external ffi.Pointer pos; 1542 | 1543 | external ffi.Pointer n_seq_id; 1544 | 1545 | external ffi.Pointer> seq_id; 1546 | 1547 | /// TODO: rename this to "output" 1548 | external ffi.Pointer logits; 1549 | 1550 | /// used if pos == NULL 1551 | @llama_pos() 1552 | external int all_pos_0; 1553 | 1554 | /// used if pos == NULL 1555 | @llama_pos() 1556 | external int all_pos_1; 1557 | 1558 | /// used if seq_id == NULL 1559 | @llama_seq_id() 1560 | external int all_seq_id; 1561 | } 1562 | 1563 | typedef llama_pos = ffi.Int32; 1564 | typedef Dartllama_pos = int; 1565 | typedef llama_seq_id = ffi.Int32; 1566 | typedef Dartllama_seq_id = int; 1567 | 1568 | abstract class llama_model_kv_override_type { 1569 | static const int LLAMA_KV_OVERRIDE_TYPE_INT = 0; 1570 | static const int LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1; 1571 | static const int LLAMA_KV_OVERRIDE_TYPE_BOOL = 2; 1572 | static const int LLAMA_KV_OVERRIDE_TYPE_STR = 3; 1573 | } 1574 | 1575 | final class llama_model_kv_override extends ffi.Struct { 1576 | @ffi.Int32() 1577 | external int tag; 1578 | 1579 | @ffi.Array.multi([128]) 1580 | external ffi.Array key; 1581 | 1582 | external UnnamedUnion1 unnamed; 1583 | } 1584 | 1585 | final class UnnamedUnion1 extends ffi.Union { 1586 | @ffi.Int64() 1587 | external int val_i64; 1588 | 1589 | @ffi.Double() 1590 | external double val_f64; 1591 | 1592 | @ffi.Bool() 1593 | external bool val_bool; 1594 | 1595 | @ffi.Array.multi([128]) 1596 | external ffi.Array val_str; 1597 | } 1598 | 1599 | final class llama_model_params extends ffi.Struct { 1600 | /// number of layers to store in VRAM 1601 | @ffi.Int32() 1602 | external int n_gpu_layers; 1603 | 1604 | /// how to split the model across multiple GPUs 1605 | @ffi.Int32() 1606 | external int split_mode; 1607 | 1608 | /// main_gpu interpretation depends on split_mode: 1609 | /// LLAMA_SPLIT_NONE: the GPU that is used for the entire model 1610 | /// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results 1611 | /// LLAMA_SPLIT_LAYER: ignored 1612 | @ffi.Int32() 1613 | external int main_gpu; 1614 | 1615 | /// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() 1616 | external ffi.Pointer tensor_split; 1617 | 1618 | /// comma separated list of RPC servers to use for offloading 1619 | external ffi.Pointer rpc_servers; 1620 | 1621 | /// Called with a progress value between 0.0 and 1.0. Pass NULL to disable. 1622 | /// If the provided progress_callback returns true, model loading continues. 1623 | /// If it returns false, model loading is immediately aborted. 
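///
/// A Dart function can be turned into such a callback with
/// `ffi.Pointer.fromFunction` (illustrative sketch only; `_onProgress` is not
/// part of these generated bindings, and the `user_data` pointer type is
/// assumed to be `ffi.Pointer<ffi.Void>` as in the C header):
///
/// ```dart
/// bool _onProgress(double progress, ffi.Pointer<ffi.Void> userData) {
///   return true; // returning true lets model loading continue
/// }
///
/// final cb = ffi.Pointer.fromFunction<llama_progress_callbackFunction>(
///     _onProgress, false); // `false` is the exceptional return value
/// ```
///
/// The resulting pointer can then be stored in this field on a
/// `llama_model_params` instance before loading a model.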
1624 | external llama_progress_callback progress_callback; 1625 | 1626 | /// context pointer passed to the progress callback 1627 | external ffi.Pointer progress_callback_user_data; 1628 | 1629 | /// override key-value pairs of the model meta data 1630 | external ffi.Pointer kv_overrides; 1631 | 1632 | /// only load the vocabulary, no weights 1633 | @ffi.Bool() 1634 | external bool vocab_only; 1635 | 1636 | /// use mmap if possible 1637 | @ffi.Bool() 1638 | external bool use_mmap; 1639 | 1640 | /// force system to keep model in RAM 1641 | @ffi.Bool() 1642 | external bool use_mlock; 1643 | 1644 | /// validate model tensor data 1645 | @ffi.Bool() 1646 | external bool check_tensors; 1647 | } 1648 | 1649 | typedef llama_progress_callback 1650 | = ffi.Pointer>; 1651 | typedef llama_progress_callbackFunction = ffi.Bool Function( 1652 | ffi.Float progress, ffi.Pointer user_data); 1653 | typedef Dartllama_progress_callbackFunction = bool Function( 1654 | double progress, ffi.Pointer user_data); 1655 | 1656 | /// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations 1657 | /// https://github.com/ggerganov/llama.cpp/pull/7544 1658 | final class llama_context_params extends ffi.Struct { 1659 | /// RNG seed, -1 for random 1660 | @ffi.Uint32() 1661 | external int seed; 1662 | 1663 | /// text context, 0 = from model 1664 | @ffi.Uint32() 1665 | external int n_ctx; 1666 | 1667 | /// logical maximum batch size that can be submitted to llama_decode 1668 | @ffi.Uint32() 1669 | external int n_batch; 1670 | 1671 | /// physical maximum batch size 1672 | @ffi.Uint32() 1673 | external int n_ubatch; 1674 | 1675 | /// max number of sequences (i.e. distinct states for recurrent models) 1676 | @ffi.Uint32() 1677 | external int n_seq_max; 1678 | 1679 | /// number of threads to use for generation 1680 | @ffi.Uint32() 1681 | external int n_threads; 1682 | 1683 | /// number of threads to use for batch processing 1684 | @ffi.Uint32() 1685 | external int n_threads_batch; 1686 | 1687 | /// RoPE scaling type, from `enum llama_rope_scaling_type` 1688 | @ffi.Int32() 1689 | external int rope_scaling_type; 1690 | 1691 | /// whether to pool (sum) embedding results by sequence id 1692 | @ffi.Int32() 1693 | external int pooling_type; 1694 | 1695 | /// RoPE base frequency, 0 = from model 1696 | @ffi.Float() 1697 | external double rope_freq_base; 1698 | 1699 | /// RoPE frequency scaling factor, 0 = from model 1700 | @ffi.Float() 1701 | external double rope_freq_scale; 1702 | 1703 | /// YaRN extrapolation mix factor, negative = from model 1704 | @ffi.Float() 1705 | external double yarn_ext_factor; 1706 | 1707 | /// YaRN magnitude scaling factor 1708 | @ffi.Float() 1709 | external double yarn_attn_factor; 1710 | 1711 | /// YaRN low correction dim 1712 | @ffi.Float() 1713 | external double yarn_beta_fast; 1714 | 1715 | /// YaRN high correction dim 1716 | @ffi.Float() 1717 | external double yarn_beta_slow; 1718 | 1719 | /// YaRN original context size 1720 | @ffi.Uint32() 1721 | external int yarn_orig_ctx; 1722 | 1723 | /// defragment the KV cache if holes/size > thold, < 0 disabled (default) 1724 | @ffi.Float() 1725 | external double defrag_thold; 1726 | 1727 | external ggml_backend_sched_eval_callback cb_eval; 1728 | 1729 | external ffi.Pointer cb_eval_user_data; 1730 | 1731 | /// data type for K cache [EXPERIMENTAL] 1732 | @ffi.Int32() 1733 | external int type_k; 1734 | 1735 | /// data type for V cache [EXPERIMENTAL] 1736 | @ffi.Int32() 1737 | external 
int type_v; 1738 | 1739 | /// the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) 1740 | @ffi.Bool() 1741 | external bool logits_all; 1742 | 1743 | /// if true, extract embeddings (together with logits) 1744 | @ffi.Bool() 1745 | external bool embeddings; 1746 | 1747 | /// whether to offload the KQV ops (including the KV cache) to GPU 1748 | @ffi.Bool() 1749 | external bool offload_kqv; 1750 | 1751 | /// whether to use flash attention [EXPERIMENTAL] 1752 | @ffi.Bool() 1753 | external bool flash_attn; 1754 | 1755 | /// Abort callback 1756 | /// if it returns true, execution of llama_decode() will be aborted 1757 | /// currently works only with CPU execution 1758 | external ggml_abort_callback abort_callback; 1759 | 1760 | external ffi.Pointer abort_callback_data; 1761 | } 1762 | 1763 | /// when ask == true, the scheduler wants to know if the user wants to observe this node 1764 | /// this allows the scheduler to batch nodes together in order to evaluate them in a single call 1765 | /// 1766 | /// when ask == false, the scheduler is passing the node tensor to the user for observation 1767 | /// if the user returns false, the scheduler will cancel the graph compute 1768 | typedef ggml_backend_sched_eval_callback 1769 | = ffi.Pointer>; 1770 | typedef ggml_backend_sched_eval_callbackFunction = ffi.Bool Function( 1771 | ffi.Pointer t, ffi.Bool ask, ffi.Pointer user_data); 1772 | typedef Dartggml_backend_sched_eval_callbackFunction = bool Function( 1773 | ffi.Pointer t, bool ask, ffi.Pointer user_data); 1774 | 1775 | /// n-dimensional tensor 1776 | final class ggml_tensor extends ffi.Struct { 1777 | @ffi.Int32() 1778 | external int type; 1779 | 1780 | @ffi.Int32() 1781 | external int backend; 1782 | 1783 | external ffi.Pointer buffer; 1784 | 1785 | /// number of elements 1786 | @ffi.Array.multi([4]) 1787 | external ffi.Array ne; 1788 | 1789 | /// stride in bytes: 1790 | /// nb[0] = ggml_type_size(type) 1791 | /// nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding 1792 | /// nb[i] = nb[i-1] * ne[i-1] 1793 | @ffi.Array.multi([4]) 1794 | external ffi.Array nb; 1795 | 1796 | /// compute data 1797 | @ffi.Int32() 1798 | external int op; 1799 | 1800 | /// op params - allocated as int32_t for alignment 1801 | @ffi.Array.multi([16]) 1802 | external ffi.Array op_params; 1803 | 1804 | @ffi.Int32() 1805 | external int flags; 1806 | 1807 | external ffi.Pointer grad; 1808 | 1809 | @ffi.Array.multi([10]) 1810 | external ffi.Array> src; 1811 | 1812 | /// source tensor and offset for views 1813 | external ffi.Pointer view_src; 1814 | 1815 | @ffi.Size() 1816 | external int view_offs; 1817 | 1818 | external ffi.Pointer data; 1819 | 1820 | @ffi.Array.multi([64]) 1821 | external ffi.Array name; 1822 | 1823 | /// extra things e.g. 
for ggml-cuda.cu 1824 | external ffi.Pointer extra; 1825 | } 1826 | 1827 | /// NOTE: always add types at the end of the enum to keep backward compatibility 1828 | abstract class ggml_type { 1829 | static const int GGML_TYPE_F32 = 0; 1830 | static const int GGML_TYPE_F16 = 1; 1831 | static const int GGML_TYPE_Q4_0 = 2; 1832 | static const int GGML_TYPE_Q4_1 = 3; 1833 | 1834 | /// GGML_TYPE_Q4_2 = 4, support has been removed 1835 | /// GGML_TYPE_Q4_3 = 5, support has been removed 1836 | static const int GGML_TYPE_Q5_0 = 6; 1837 | static const int GGML_TYPE_Q5_1 = 7; 1838 | static const int GGML_TYPE_Q8_0 = 8; 1839 | static const int GGML_TYPE_Q8_1 = 9; 1840 | static const int GGML_TYPE_Q2_K = 10; 1841 | static const int GGML_TYPE_Q3_K = 11; 1842 | static const int GGML_TYPE_Q4_K = 12; 1843 | static const int GGML_TYPE_Q5_K = 13; 1844 | static const int GGML_TYPE_Q6_K = 14; 1845 | static const int GGML_TYPE_Q8_K = 15; 1846 | static const int GGML_TYPE_IQ2_XXS = 16; 1847 | static const int GGML_TYPE_IQ2_XS = 17; 1848 | static const int GGML_TYPE_IQ3_XXS = 18; 1849 | static const int GGML_TYPE_IQ1_S = 19; 1850 | static const int GGML_TYPE_IQ4_NL = 20; 1851 | static const int GGML_TYPE_IQ3_S = 21; 1852 | static const int GGML_TYPE_IQ2_S = 22; 1853 | static const int GGML_TYPE_IQ4_XS = 23; 1854 | static const int GGML_TYPE_I8 = 24; 1855 | static const int GGML_TYPE_I16 = 25; 1856 | static const int GGML_TYPE_I32 = 26; 1857 | static const int GGML_TYPE_I64 = 27; 1858 | static const int GGML_TYPE_F64 = 28; 1859 | static const int GGML_TYPE_IQ1_M = 29; 1860 | static const int GGML_TYPE_BF16 = 30; 1861 | static const int GGML_TYPE_COUNT = 31; 1862 | } 1863 | 1864 | abstract class ggml_backend_type { 1865 | static const int GGML_BACKEND_TYPE_CPU = 0; 1866 | static const int GGML_BACKEND_TYPE_GPU = 10; 1867 | static const int GGML_BACKEND_TYPE_GPU_SPLIT = 20; 1868 | } 1869 | 1870 | final class ggml_backend_buffer extends ffi.Opaque {} 1871 | 1872 | /// available tensor operations: 1873 | abstract class ggml_op { 1874 | static const int GGML_OP_NONE = 0; 1875 | static const int GGML_OP_DUP = 1; 1876 | static const int GGML_OP_ADD = 2; 1877 | static const int GGML_OP_ADD1 = 3; 1878 | static const int GGML_OP_ACC = 4; 1879 | static const int GGML_OP_SUB = 5; 1880 | static const int GGML_OP_MUL = 6; 1881 | static const int GGML_OP_DIV = 7; 1882 | static const int GGML_OP_SQR = 8; 1883 | static const int GGML_OP_SQRT = 9; 1884 | static const int GGML_OP_LOG = 10; 1885 | static const int GGML_OP_SUM = 11; 1886 | static const int GGML_OP_SUM_ROWS = 12; 1887 | static const int GGML_OP_MEAN = 13; 1888 | static const int GGML_OP_ARGMAX = 14; 1889 | static const int GGML_OP_REPEAT = 15; 1890 | static const int GGML_OP_REPEAT_BACK = 16; 1891 | static const int GGML_OP_CONCAT = 17; 1892 | static const int GGML_OP_SILU_BACK = 18; 1893 | 1894 | /// normalize 1895 | static const int GGML_OP_NORM = 19; 1896 | static const int GGML_OP_RMS_NORM = 20; 1897 | static const int GGML_OP_RMS_NORM_BACK = 21; 1898 | static const int GGML_OP_GROUP_NORM = 22; 1899 | static const int GGML_OP_MUL_MAT = 23; 1900 | static const int GGML_OP_MUL_MAT_ID = 24; 1901 | static const int GGML_OP_OUT_PROD = 25; 1902 | static const int GGML_OP_SCALE = 26; 1903 | static const int GGML_OP_SET = 27; 1904 | static const int GGML_OP_CPY = 28; 1905 | static const int GGML_OP_CONT = 29; 1906 | static const int GGML_OP_RESHAPE = 30; 1907 | static const int GGML_OP_VIEW = 31; 1908 | static const int GGML_OP_PERMUTE = 32; 1909 | static const int 
GGML_OP_TRANSPOSE = 33; 1910 | static const int GGML_OP_GET_ROWS = 34; 1911 | static const int GGML_OP_GET_ROWS_BACK = 35; 1912 | static const int GGML_OP_DIAG = 36; 1913 | static const int GGML_OP_DIAG_MASK_INF = 37; 1914 | static const int GGML_OP_DIAG_MASK_ZERO = 38; 1915 | static const int GGML_OP_SOFT_MAX = 39; 1916 | static const int GGML_OP_SOFT_MAX_BACK = 40; 1917 | static const int GGML_OP_ROPE = 41; 1918 | static const int GGML_OP_ROPE_BACK = 42; 1919 | static const int GGML_OP_CLAMP = 43; 1920 | static const int GGML_OP_CONV_TRANSPOSE_1D = 44; 1921 | static const int GGML_OP_IM2COL = 45; 1922 | static const int GGML_OP_CONV_TRANSPOSE_2D = 46; 1923 | static const int GGML_OP_POOL_1D = 47; 1924 | static const int GGML_OP_POOL_2D = 48; 1925 | 1926 | /// nearest interpolate 1927 | static const int GGML_OP_UPSCALE = 49; 1928 | static const int GGML_OP_PAD = 50; 1929 | static const int GGML_OP_ARANGE = 51; 1930 | static const int GGML_OP_TIMESTEP_EMBEDDING = 52; 1931 | static const int GGML_OP_ARGSORT = 53; 1932 | static const int GGML_OP_LEAKY_RELU = 54; 1933 | static const int GGML_OP_FLASH_ATTN_EXT = 55; 1934 | static const int GGML_OP_FLASH_ATTN_BACK = 56; 1935 | static const int GGML_OP_SSM_CONV = 57; 1936 | static const int GGML_OP_SSM_SCAN = 58; 1937 | static const int GGML_OP_WIN_PART = 59; 1938 | static const int GGML_OP_WIN_UNPART = 60; 1939 | static const int GGML_OP_GET_REL_POS = 61; 1940 | static const int GGML_OP_ADD_REL_POS = 62; 1941 | static const int GGML_OP_UNARY = 63; 1942 | static const int GGML_OP_MAP_UNARY = 64; 1943 | static const int GGML_OP_MAP_BINARY = 65; 1944 | static const int GGML_OP_MAP_CUSTOM1_F32 = 66; 1945 | static const int GGML_OP_MAP_CUSTOM2_F32 = 67; 1946 | static const int GGML_OP_MAP_CUSTOM3_F32 = 68; 1947 | static const int GGML_OP_MAP_CUSTOM1 = 69; 1948 | static const int GGML_OP_MAP_CUSTOM2 = 70; 1949 | static const int GGML_OP_MAP_CUSTOM3 = 71; 1950 | static const int GGML_OP_CROSS_ENTROPY_LOSS = 72; 1951 | static const int GGML_OP_CROSS_ENTROPY_LOSS_BACK = 73; 1952 | static const int GGML_OP_COUNT = 74; 1953 | } 1954 | 1955 | /// Abort callback 1956 | /// If not NULL, called before ggml computation 1957 | /// If it returns true, the computation is aborted 1958 | typedef ggml_abort_callback 1959 | = ffi.Pointer>; 1960 | typedef ggml_abort_callbackFunction = ffi.Bool Function( 1961 | ffi.Pointer data); 1962 | typedef Dartggml_abort_callbackFunction = bool Function( 1963 | ffi.Pointer data); 1964 | 1965 | /// model quantization parameters 1966 | final class llama_model_quantize_params extends ffi.Struct { 1967 | /// number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() 1968 | @ffi.Int32() 1969 | external int nthread; 1970 | 1971 | /// quantize to this llama_ftype 1972 | @ffi.Int32() 1973 | external int ftype; 1974 | 1975 | /// output tensor type 1976 | @ffi.Int32() 1977 | external int output_tensor_type; 1978 | 1979 | /// itoken embeddings tensor type 1980 | @ffi.Int32() 1981 | external int token_embedding_type; 1982 | 1983 | /// allow quantizing non-f32/f16 tensors 1984 | @ffi.Bool() 1985 | external bool allow_requantize; 1986 | 1987 | /// quantize output.weight 1988 | @ffi.Bool() 1989 | external bool quantize_output_tensor; 1990 | 1991 | /// only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored 1992 | @ffi.Bool() 1993 | external bool only_copy; 1994 | 1995 | /// quantize all tensors to the default type 1996 | @ffi.Bool() 1997 | external bool pure; 1998 | 1999 | /// quantize 
to the same number of shards 2000 | @ffi.Bool() 2001 | external bool keep_split; 2002 | 2003 | /// pointer to importance matrix data 2004 | external ffi.Pointer imatrix; 2005 | 2006 | /// pointer to vector containing overrides 2007 | external ffi.Pointer kv_overrides; 2008 | } 2009 | 2010 | /// grammar types 2011 | final class llama_grammar extends ffi.Opaque {} 2012 | 2013 | /// grammar element type 2014 | abstract class llama_gretype { 2015 | /// end of rule definition 2016 | static const int LLAMA_GRETYPE_END = 0; 2017 | 2018 | /// start of alternate definition for rule 2019 | static const int LLAMA_GRETYPE_ALT = 1; 2020 | 2021 | /// non-terminal element: reference to rule 2022 | static const int LLAMA_GRETYPE_RULE_REF = 2; 2023 | 2024 | /// terminal element: character (code point) 2025 | static const int LLAMA_GRETYPE_CHAR = 3; 2026 | 2027 | /// inverse char(s) ([^a], [^a-b] [^abc]) 2028 | static const int LLAMA_GRETYPE_CHAR_NOT = 4; 2029 | 2030 | /// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to 2031 | /// be an inclusive range ([a-z]) 2032 | static const int LLAMA_GRETYPE_CHAR_RNG_UPPER = 5; 2033 | 2034 | /// modifies a preceding LLAMA_GRETYPE_CHAR or 2035 | /// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) 2036 | static const int LLAMA_GRETYPE_CHAR_ALT = 6; 2037 | 2038 | /// any character (.) 2039 | static const int LLAMA_GRETYPE_CHAR_ANY = 7; 2040 | } 2041 | 2042 | final class llama_grammar_element extends ffi.Struct { 2043 | @ffi.Int32() 2044 | external int type; 2045 | 2046 | /// Unicode code point or rule ID 2047 | @ffi.Uint32() 2048 | external int value; 2049 | } 2050 | 2051 | /// performance timing information 2052 | final class llama_timings extends ffi.Struct { 2053 | @ffi.Double() 2054 | external double t_start_ms; 2055 | 2056 | @ffi.Double() 2057 | external double t_end_ms; 2058 | 2059 | @ffi.Double() 2060 | external double t_load_ms; 2061 | 2062 | @ffi.Double() 2063 | external double t_sample_ms; 2064 | 2065 | @ffi.Double() 2066 | external double t_p_eval_ms; 2067 | 2068 | @ffi.Double() 2069 | external double t_eval_ms; 2070 | 2071 | @ffi.Int32() 2072 | external int n_sample; 2073 | 2074 | @ffi.Int32() 2075 | external int n_p_eval; 2076 | 2077 | @ffi.Int32() 2078 | external int n_eval; 2079 | } 2080 | 2081 | /// used in chat template 2082 | final class llama_chat_message extends ffi.Struct { 2083 | external ffi.Pointer role; 2084 | 2085 | external ffi.Pointer content; 2086 | } 2087 | 2088 | /// numa strategies 2089 | abstract class ggml_numa_strategy { 2090 | static const int GGML_NUMA_STRATEGY_DISABLED = 0; 2091 | static const int GGML_NUMA_STRATEGY_DISTRIBUTE = 1; 2092 | static const int GGML_NUMA_STRATEGY_ISOLATE = 2; 2093 | static const int GGML_NUMA_STRATEGY_NUMACTL = 3; 2094 | static const int GGML_NUMA_STRATEGY_MIRROR = 4; 2095 | static const int GGML_NUMA_STRATEGY_COUNT = 5; 2096 | } 2097 | 2098 | /// Information associated with an individual cell in the KV cache view. 2099 | final class llama_kv_cache_view_cell extends ffi.Struct { 2100 | /// The position for this cell. Takes KV cache shifts into account. 2101 | /// May be negative if the cell is not populated. 2102 | @llama_pos() 2103 | external int pos; 2104 | } 2105 | 2106 | /// An updateable view of the KV cache. 2107 | final class llama_kv_cache_view extends ffi.Struct { 2108 | /// Number of KV cache cells. This will be the same as the context size. 
2109 | @ffi.Int32() 2110 | external int n_cells; 2111 | 2112 | /// Maximum number of sequences that can exist in a cell. It's not an error 2113 | /// if there are more sequences in a cell than this value, however they will 2114 | /// not be visible in the view cells_sequences. 2115 | @ffi.Int32() 2116 | external int n_seq_max; 2117 | 2118 | /// Number of tokens in the cache. For example, if there are two populated 2119 | /// cells, the first with 1 sequence id in it and the second with 2 sequence 2120 | /// ids then you'll have 3 tokens. 2121 | @ffi.Int32() 2122 | external int token_count; 2123 | 2124 | /// Number of populated cache cells. 2125 | @ffi.Int32() 2126 | external int used_cells; 2127 | 2128 | /// Maximum contiguous empty slots in the cache. 2129 | @ffi.Int32() 2130 | external int max_contiguous; 2131 | 2132 | /// Index to the start of the max_contiguous slot range. Can be negative 2133 | /// when cache is full. 2134 | @ffi.Int32() 2135 | external int max_contiguous_idx; 2136 | 2137 | /// Information for an individual cell. 2138 | external ffi.Pointer cells; 2139 | 2140 | /// The sequences for each cell. There will be n_seq_max items per cell. 2141 | external ffi.Pointer cells_sequences; 2142 | } 2143 | 2144 | typedef ggml_log_callback 2145 | = ffi.Pointer>; 2146 | typedef ggml_log_callbackFunction = ffi.Void Function(ffi.Int32 level, 2147 | ffi.Pointer text, ffi.Pointer user_data); 2148 | typedef Dartggml_log_callbackFunction = void Function( 2149 | int level, ffi.Pointer text, ffi.Pointer user_data); 2150 | 2151 | abstract class ggml_log_level { 2152 | static const int GGML_LOG_LEVEL_ERROR = 2; 2153 | static const int GGML_LOG_LEVEL_WARN = 3; 2154 | static const int GGML_LOG_LEVEL_INFO = 4; 2155 | static const int GGML_LOG_LEVEL_DEBUG = 5; 2156 | } 2157 | 2158 | typedef FILE = _IO_FILE; 2159 | 2160 | final class _IO_FILE extends ffi.Struct { 2161 | @ffi.Int() 2162 | external int _flags; 2163 | 2164 | external ffi.Pointer _IO_read_ptr; 2165 | 2166 | external ffi.Pointer _IO_read_end; 2167 | 2168 | external ffi.Pointer _IO_read_base; 2169 | 2170 | external ffi.Pointer _IO_write_base; 2171 | 2172 | external ffi.Pointer _IO_write_ptr; 2173 | 2174 | external ffi.Pointer _IO_write_end; 2175 | 2176 | external ffi.Pointer _IO_buf_base; 2177 | 2178 | external ffi.Pointer _IO_buf_end; 2179 | 2180 | external ffi.Pointer _IO_save_base; 2181 | 2182 | external ffi.Pointer _IO_backup_base; 2183 | 2184 | external ffi.Pointer _IO_save_end; 2185 | 2186 | external ffi.Pointer<_IO_marker> _markers; 2187 | 2188 | external ffi.Pointer<_IO_FILE> _chain; 2189 | 2190 | @ffi.Int() 2191 | external int _fileno; 2192 | 2193 | @ffi.Int() 2194 | external int _flags2; 2195 | 2196 | @__off_t() 2197 | external int _old_offset; 2198 | 2199 | @ffi.UnsignedShort() 2200 | external int _cur_column; 2201 | 2202 | @ffi.SignedChar() 2203 | external int _vtable_offset; 2204 | 2205 | @ffi.Array.multi([1]) 2206 | external ffi.Array _shortbuf; 2207 | 2208 | external ffi.Pointer<_IO_lock_t> _lock; 2209 | 2210 | @__off64_t() 2211 | external int _offset; 2212 | 2213 | external ffi.Pointer<_IO_codecvt> _codecvt; 2214 | 2215 | external ffi.Pointer<_IO_wide_data> _wide_data; 2216 | 2217 | external ffi.Pointer<_IO_FILE> _freeres_list; 2218 | 2219 | external ffi.Pointer _freeres_buf; 2220 | 2221 | @ffi.Size() 2222 | external int __pad5; 2223 | 2224 | @ffi.Int() 2225 | external int _mode; 2226 | 2227 | @ffi.Array.multi([20]) 2228 | external ffi.Array _unused2; 2229 | } 2230 | 2231 | final class _IO_marker extends ffi.Opaque {} 
2232 | 
2233 | typedef __off_t = ffi.Long;
2234 | typedef Dart__off_t = int;
2235 | typedef _IO_lock_t = ffi.Void;
2236 | typedef Dart_IO_lock_t = void;
2237 | typedef __off64_t = ffi.Long;
2238 | typedef Dart__off64_t = int;
2239 | 
2240 | final class _IO_codecvt extends ffi.Opaque {}
2241 | 
2242 | final class _IO_wide_data extends ffi.Opaque {}
2243 | 
2244 | const int LLAMA_DEFAULT_SEED = 4294967295;
2245 | 
2246 | const int LLAMA_MAX_RNG_STATE = 65536;
2247 | 
2248 | const int LLAMA_FILE_MAGIC_GGLA = 1734831201;
2249 | 
2250 | const int LLAMA_FILE_MAGIC_GGSN = 1734833006;
2251 | 
2252 | const int LLAMA_FILE_MAGIC_GGSQ = 1734833009;
2253 | 
2254 | const int LLAMA_SESSION_MAGIC = 1734833006;
2255 | 
2256 | const int LLAMA_SESSION_VERSION = 6;
2257 | 
2258 | const int LLAMA_STATE_SEQ_MAGIC = 1734833009;
2259 | 
2260 | const int LLAMA_STATE_SEQ_VERSION = 1;
2261 | 
--------------------------------------------------------------------------------
/lib/src/llama_params.dart:
--------------------------------------------------------------------------------
1 | /// Params holder, similar to `gpt_params` in `common/common.h`.
2 | final class LlamaParams {
3 |   final int? seed;
4 |   final int? nThread;
5 |   final int? nThreadBatch;
6 |   final int? nPredict;
7 |   final int? nCtx;
8 |   final int? nBatch;
9 |   final int? nGpuLayers;
10 |   final int? mainGpu;
11 |   final bool embedding;
12 |   final int numa;
13 | 
14 |   const LlamaParams({
15 |     this.seed,
16 |     this.nThread,
17 |     this.nThreadBatch,
18 |     this.nPredict,
19 |     this.nCtx,
20 |     this.nBatch,
21 |     this.nGpuLayers,
22 |     this.mainGpu,
23 |     this.embedding = false,
24 |     this.numa = 0,
25 |   });
26 | }
27 | 
--------------------------------------------------------------------------------
/lib/src/native_llama.dart:
--------------------------------------------------------------------------------
1 | import 'dart:convert' show utf8;
2 | import 'dart:ffi' as ffi;
3 | import 'dart:io' show stderr;
4 | import 'dart:math' show max;
5 | 
6 | import 'common.dart' as c;
7 | import 'ffi.dart';
8 | import 'lib_llama_cpp.dart' as llama_cpp;
9 | import 'llama_params.dart';
10 | import 'sampling.dart';
11 | 
12 | bool _shouldAddBosToken(ffi.Pointer<llama_cpp.llama_model> model) {
13 |   final addBos = llama_cpp.llama_add_bos_token(model);
14 |   return addBos != -1
15 |       ? addBos != 0
16 |       : llama_cpp.llama_vocab_type1(model) == 0; // LLAMA_VOCAB_TYPE_SPM
17 | }
18 | 
19 | /// A class representing the native llama data structures, running in a separate isolate.
20 | final class NativeLLama {
21 |   static const engTag = '__end__';
22 |   static const closeTag = '__close__';
23 | 
24 |   final ffi.Pointer<llama_cpp.llama_model> model;
25 |   final ffi.Pointer<llama_cpp.llama_context> ctx;
26 |   final llama_cpp.llama_batch batch;
27 |   final CharArray cStr;
28 |   final bool verbose;
29 |   final tokenBuf = TokenArray(size: 64);
30 |   final array = TokenDataArray(512);
31 | 
32 |   NativeLLama._(
33 |     this.model,
34 |     this.ctx,
35 |     this.batch,
36 |     this.cStr,
37 |     this.verbose,
38 |   );
39 | 
40 |   factory NativeLLama(
41 |     String path,
42 |     LlamaParams params, {
43 |     bool verbose = false,
44 |   }) {
45 |     final cStr = CharArray.from(path);
46 |     final (model, ctx) = c.loadModel(cStr, params);
47 |     print('add_bos: ${_shouldAddBosToken(model)}');
48 | 
49 |     final batchSize = llama_cpp.llama_n_batch(ctx);
50 |     final batch = llama_cpp.llama_batch_init(batchSize, 0, 1);
51 | 
52 |     return NativeLLama._(
53 |       model,
54 |       ctx,
55 |       batch,
56 |       cStr,
57 |       verbose,
58 |     );
59 |   }
60 | 
61 |   /// Frees native resources; must be called explicitly.
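  ///
  /// Typical call order, as a minimal sketch (the model path is hypothetical;
  /// `engTag` marks the end of the stream produced by [generate]):
  ///
  /// ```dart
  /// final llama = NativeLLama('/path/to/model.gguf', const LlamaParams());
  /// await for (final bytes in llama.generate('Hello')) {
  ///   if (utf8.decode(bytes, allowMalformed: true) == NativeLLama.engTag) {
  ///     break; // end-of-generation marker
  ///   }
  ///   stdout.add(bytes); // raw bytes, so multi-token words stay intact
  /// }
  /// llama.dispose();
  /// ```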
62 |   void dispose() {
63 |     array.dispose();
64 |     tokenBuf.dispose();
65 |     cStr.dispose();
66 | 
67 |     llama_cpp.llama_batch_free(batch);
68 |     llama_cpp.llama_free(ctx);
69 |     llama_cpp.llama_free_model(model);
70 |     llama_cpp.llama_backend_free();
71 |   }
72 | 
73 |   void _log(String str) {
74 |     final leadingNewLine = str.startsWith('\n');
75 |     stderr.writeln(leadingNewLine ? str.replaceFirst('\n', '\n ') : ' $str');
76 |   }
77 | 
78 |   /// Generates a token stream for [prompt].
79 |   /// Some models may return two tokens to represent a single word,
80 |   /// so it is better to consume the raw byte stream.
81 |   Stream<List<int>> generate(
82 |     String prompt, {
83 |     int? nPrev,
84 |     int? nProbs,
85 |     int? topK,
86 |     double? topP,
87 |     double? minP,
88 |     double? tfsZ,
89 |     double? typicalP,
90 |     double? temperature,
91 |     int? penaltyLastN,
92 |     double? penaltyRepeat,
93 |     double? penaltyFrequency,
94 |     double? penaltyPresent,
95 |     int? mirostat,
96 |     double? mirostatTau,
97 |     double? mirostatEta,
98 |     bool? penalizeNewline,
99 |     String? samplersSequence,
100 |   }) async* {
101 |     cStr.pavedBy(prompt);
102 |     tokenBuf.pavedBy(model, cStr);
103 |     if (verbose) {
104 |       _log('prompt: "$prompt"');
105 |       _log('tokens: ${_tokensString(tokenBuf.pointerAt(0), tokenBuf.length)}');
106 |     }
107 |     final eosToken = llama_cpp.llama_token_eos(model);
108 | 
109 |     var num = 0;
110 |     var code = 0;
111 | 
112 |     final defaultParams = const SamplingParams();
113 |     final params = SamplingParams(
114 |       nPrev: nPrev ?? defaultParams.nPrev,
115 |       nProbs: nProbs ?? defaultParams.nProbs,
116 |       topK: topK ?? defaultParams.topK,
117 |       topP: topP ?? defaultParams.topP,
118 |       minP: minP ?? defaultParams.minP,
119 |       tfsZ: tfsZ ?? defaultParams.tfsZ,
120 |       typicalP: typicalP ?? defaultParams.typicalP,
121 |       temperature: temperature ?? defaultParams.temperature,
122 |       penaltyLastN: penaltyLastN ?? defaultParams.penaltyLastN,
123 |       penaltyRepeat: penaltyRepeat ?? defaultParams.penaltyRepeat,
124 |       penaltyFrequency: penaltyFrequency ?? defaultParams.penaltyFrequency,
125 |       penaltyPresent: penaltyPresent ?? defaultParams.penaltyPresent,
126 |       mirostat: mirostat ?? defaultParams.mirostat,
127 |       mirostatTau: mirostatTau ?? defaultParams.mirostatTau,
128 |       mirostatEta: mirostatEta ?? defaultParams.mirostatEta,
129 |       penalizeNewline: penalizeNewline ?? defaultParams.penalizeNewline,
130 |       samplersSequence: samplersSequence ?? defaultParams.samplersSequence,
131 |     );
132 |     if (verbose) {
133 |       _log('sampling:\n$params');
134 |       _log('sampling order:\n${params.samplingOrder}');
135 |       _log('generate: n_ctx = ${llama_cpp.llama_n_ctx(ctx)}, '
136 |           'n_batch = ${llama_cpp.llama_n_batch(ctx)}, '
137 |           'n_predict = %d, '
138 |           'n_keep = %d');
139 |     }
140 |     final ctxSampling = SamplingContext.from(params);
141 |     ctxSampling.acceptSampling(
142 |       ctx,
143 |       tokenBuf.toList(),
144 |       false,
145 |     );
146 |     llama_cpp.llama_reset_timings(ctx);
147 |     llama_cpp.llama_kv_cache_clear(ctx);
148 |     while ((code = _decodeBatch(num, num == 0)) == 0) {
149 |       if (verbose) {
150 |         _log('<<<<<<<<<');
151 |         _log('eval: ${_tokensString(tokenBuf.pointerAt(0), tokenBuf.length)}');
152 |       }
153 |       final tokenId = _sampleSampling(ctxSampling, batch.n_tokens - 1);
154 |       if (verbose) {
155 |         _log('sampled token(${params.mirostat}): ${"$tokenId".padLeft(8)}: ');
156 |       }
157 |       if (tokenId == eosToken) {
158 |         /// 3 - finished by ending.
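        /// (The loop above only runs while _decodeBatch() returns 0, so the
        /// final value of `code` is either this 3 for end-of-stream or a
        /// non-zero error code from llama_decode; see the log after the loop.)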
159 | code = 3; 160 | break; 161 | } 162 | final token = cStr.tokenBytes(model, tokenId); 163 | yield token; 164 | ctxSampling.acceptSampling(ctx, [tokenId], true); 165 | if (verbose) { 166 | final str = _tokensString( 167 | ctxSampling.penaltyPointer, 168 | ctxSampling.usedSize, 169 | ); 170 | _log('\nlast: $str'); 171 | _log('>>>>>>>>>'); 172 | } 173 | 174 | num += batch.n_tokens; 175 | batch.n_tokens = 0; 176 | tokenBuf 177 | ..clear() 178 | ..add(tokenId); 179 | } 180 | llama_cpp.llama_print_timings(ctx); 181 | final n = llama_cpp.llama_get_kv_cache_token_count(ctx); 182 | _log("sample llama logits finished with '$code', kv: $n."); 183 | ctxSampling.free(); 184 | yield utf8.encode(engTag); 185 | } 186 | 187 | int _decodeBatch(int count, bool init) { 188 | // evaluate the initial prompt 189 | c.addBatchPos(batch, tokenBuf.toList(), count, !init); 190 | if (init) { 191 | batch.logits[batch.n_tokens - 1] = 1; 192 | } 193 | // TODO: What if llama_decode return 1? 194 | return llama_cpp.llama_decode(ctx, batch); 195 | } 196 | 197 | int _sampleSampling(SamplingContext ctxSampling, int idx, 198 | [bool isResampling = false]) { 199 | final params = ctxSampling.params; 200 | final model = llama_cpp.llama_get_model(ctx); 201 | final nVocab = llama_cpp.llama_n_vocab(model); 202 | final temp = params.temperature; 203 | final penaltyRepeat = params.penaltyRepeat; 204 | final penaltyFrequency = params.penaltyFrequency; 205 | final penaltyPresent = params.penaltyPresent; 206 | final mirostat = params.mirostat; 207 | final mirostatTau = params.mirostatTau; 208 | final mirostatEta = params.mirostatEta; 209 | final penalizeNewline = params.penalizeNewline; 210 | 211 | final logits = llama_cpp.llama_get_logits_ith(ctx, idx); 212 | final logitBias = params.logitBias?.entries; 213 | logitBias?.forEach((e) { 214 | logits[e.key] += e.value; 215 | }); 216 | array.pavedBy(logits, nVocab); 217 | // apply penalties 218 | final usedSize = ctxSampling.usedSize; 219 | if (usedSize > 0) { 220 | final nl = llama_cpp.llama_token_nl(model); 221 | final logit = logits[nl]; 222 | llama_cpp.llama_sample_repetition_penalties( 223 | ctx, 224 | array.pointer, 225 | ctxSampling.penaltyPointer, 226 | usedSize, 227 | penaltyRepeat, 228 | penaltyFrequency, 229 | penaltyPresent, 230 | ); 231 | if (!penalizeNewline) { 232 | for (var i = 0; i < array.length; i++) { 233 | final data = array[i]; 234 | if (data.id == nl) { 235 | final old = data.logit; 236 | array.setLogit(i, logit); 237 | final v = array[i].logit; 238 | _log("$i: $old -> $v"); 239 | break; 240 | } 241 | } 242 | } 243 | } 244 | 245 | final grammar = ctxSampling.grammar; 246 | if (isResampling && grammar != null) { 247 | llama_cpp.llama_sample_grammar(ctx, array.pointer, grammar); 248 | } 249 | var id = 0; 250 | if (temp < 0.0) { 251 | llama_cpp.llama_sample_softmax(ctx, array.pointer); 252 | id = array[0].id; 253 | } else if (temp == 0.0) { 254 | id = llama_cpp.llama_sample_token_greedy(ctx, array.pointer); 255 | } else { 256 | if (mirostat == 1) { 257 | const mirostatM = 100; 258 | llama_cpp.llama_sample_temp(ctx, array.pointer, temp); 259 | id = llama_cpp.llama_sample_token_mirostat(ctx, array.pointer, 260 | mirostatTau, mirostatEta, mirostatM, ctxSampling.mirostatMu); 261 | } else if (mirostat == 2) { 262 | llama_cpp.llama_sample_temp(ctx, array.pointer, temp); 263 | id = llama_cpp.llama_sample_token_mirostat_v2(ctx, array.pointer, 264 | mirostatTau, mirostatEta, ctxSampling.mirostatMu); 265 | } else { 266 | final minKeep = max(1, params.nProbs); 267 | 
_samplerQueue(params, nVocab, minKeep); 268 | id = llama_cpp.llama_sample_token(ctx, array.pointer); 269 | } 270 | } 271 | 272 | if (grammar != null && !isResampling) { 273 | // TODO: consider grammar 274 | } 275 | 276 | return id; 277 | } 278 | 279 | void _samplerQueue(SamplingParams params, int capacity, int minKeep) { 280 | final topK = params.topK <= 0 ? capacity : params.topK; 281 | for (final i in params.samplersSequence.codeUnits) { 282 | switch (i) { 283 | case kChar: 284 | llama_cpp.llama_sample_top_k(ctx, array.pointer, topK, minKeep); 285 | break; 286 | case fChar: 287 | llama_cpp.llama_sample_tail_free( 288 | ctx, 289 | array.pointer, 290 | params.tfsZ, 291 | minKeep, 292 | ); 293 | break; 294 | case yChar: 295 | llama_cpp.llama_sample_typical( 296 | ctx, 297 | array.pointer, 298 | params.typicalP, 299 | minKeep, 300 | ); 301 | break; 302 | case pChar: 303 | llama_cpp.llama_sample_top_p( 304 | ctx, 305 | array.pointer, 306 | params.topP, 307 | minKeep, 308 | ); 309 | break; 310 | case mChar: 311 | llama_cpp.llama_sample_min_p( 312 | ctx, 313 | array.pointer, 314 | params.minP, 315 | minKeep, 316 | ); 317 | break; 318 | case tChar: 319 | llama_cpp.llama_sample_temp( 320 | ctx, 321 | array.pointer, 322 | params.temperature, 323 | ); 324 | break; 325 | default: 326 | break; 327 | } 328 | } 329 | } 330 | 331 | String _tokensString(ffi.Pointer pointer, int len) { 332 | final buf = StringBuffer('['); 333 | for (var i = 0; i < len; i++) { 334 | final id = pointer[i]; 335 | buf.write("'${cStr.tokenString(model, id)}':$id, "); 336 | } 337 | buf.write(']'); 338 | return buf.toString(); 339 | } 340 | } 341 | -------------------------------------------------------------------------------- /lib/src/sampling.dart: -------------------------------------------------------------------------------- 1 | import 'dart:ffi' as ffi; 2 | import 'dart:math'; 3 | 4 | import 'package:ffi/ffi.dart'; 5 | 6 | import 'lib_llama_cpp.dart' as llama_cpp; 7 | 8 | extension _FloatEx on double { 9 | String get str => toStringAsFixed(3); 10 | } 11 | 12 | const fChar = 0x66; 13 | const kChar = 0x6b; 14 | const yChar = 0x79; 15 | const pChar = 0x70; 16 | const mChar = 0x6d; 17 | const tChar = 0x74; 18 | 19 | class SamplingParams { 20 | /// number of previous tokens to remember 21 | final int nPrev; 22 | 23 | /// if greater than 0, output the probabilities of top n_probs tokens. 24 | final int nProbs; 25 | 26 | /// <= 0 to use vocab size 27 | final int topK; 28 | final double topP; 29 | final double minP; 30 | final double tfsZ; 31 | final double typicalP; 32 | final double temperature; 33 | final int penaltyLastN; 34 | final double penaltyRepeat; 35 | final double penaltyFrequency; 36 | final double penaltyPresent; 37 | final int mirostat; 38 | final double mirostatTau; 39 | final double mirostatEta; 40 | final bool penalizeNewline; 41 | final String samplersSequence; 42 | 43 | /// optional BNF-like grammar to constrain sampling 44 | final String? grammar; 45 | 46 | /// string to help guidance 47 | final String? cfgNegativePrompt; 48 | 49 | /// how strong is guidance 50 | final double cfgScale; 51 | final Map? logitBias; 52 | final List? 
penaltyPromptTokens; 53 | final bool usePenaltyPromptTokens; 54 | 55 | const SamplingParams({ 56 | this.nPrev = 64, 57 | this.nProbs = 0, 58 | this.topK = 40, 59 | this.topP = 0.95, 60 | this.minP = 0.05, 61 | this.tfsZ = 1.00, 62 | this.typicalP = 1.0, 63 | this.temperature = 0.80, 64 | this.penaltyLastN = 64, 65 | this.penaltyRepeat = 1.10, 66 | this.penaltyFrequency = 0.00, 67 | this.penaltyPresent = 0.00, 68 | this.mirostat = 0, 69 | this.mirostatTau = 5.00, 70 | this.mirostatEta = 0.10, 71 | this.penalizeNewline = true, 72 | // top_k, tail_free, typical_p, top_p, min_p, temp 73 | this.samplersSequence = "kfypmt", 74 | this.grammar, 75 | this.cfgNegativePrompt, 76 | this.cfgScale = 1.0, 77 | this.logitBias, 78 | this.penaltyPromptTokens, 79 | this.usePenaltyPromptTokens = false, 80 | }); 81 | 82 | String get samplingString => 83 | "\trepeat_last_n = $penaltyLastN, repeat_penalty = ${penaltyRepeat.str}, frequency_penalty = " 84 | "${penaltyFrequency.str}, presence_penalty = ${penaltyPresent.str}\n" 85 | "\ttop_k = $topK, tfs_z = ${tfsZ.str}, top_p = ${topP.str}, min_p = ${minP.str}, " 86 | "typical_p = ${typicalP.str}, temp = ${temperature.str}\n" 87 | "\tmirostat = $mirostat, mirostat_lr = ${mirostatEta.str}, mirostat_ent = ${mirostatTau.str}"; 88 | 89 | String get samplingOrder { 90 | final buf = StringBuffer('CFG -> Penalties '); 91 | if (mirostat == 0) { 92 | for (final c in samplersSequence.codeUnits) { 93 | final seq = _samplersSeq[c]; 94 | if (seq != null) { 95 | buf.write(seq); 96 | } 97 | } 98 | } else { 99 | buf.write('-> mirostat '); 100 | } 101 | return buf.toString(); 102 | } 103 | 104 | @override 105 | String toString() => samplingString; 106 | } 107 | 108 | const _samplersSeq = { 109 | fChar: '-> tfs_z ', 110 | kChar: '-> top_k ', 111 | yChar: '-> typical_p ', 112 | pChar: '-> top_p ', 113 | mChar: '-> min_p ', 114 | tChar: '-> temp ', 115 | }; 116 | 117 | class SamplingContext { 118 | final SamplingParams params; 119 | final ffi.Pointer mirostatMu; 120 | final ffi.Pointer? grammar; 121 | final ffi.Pointer _prev; 122 | final int prevSize; 123 | final int usedSize; 124 | 125 | SamplingContext._( 126 | this.params, 127 | this.mirostatMu, 128 | this.grammar, 129 | this._prev, 130 | this.prevSize, 131 | this.usedSize, 132 | ); 133 | 134 | factory SamplingContext.from(SamplingParams params) { 135 | ffi.Pointer? grammar; 136 | if (params.grammar != null) {} 137 | final mu = calloc.allocate(ffi.sizeOf()); 138 | final (p, len) = _createNativeTokens(params); 139 | final lastN = params.penaltyLastN; 140 | final penaltyLastN = lastN < 0 ? 
params.nPrev : lastN; 141 | final usedSize = min(len, penaltyLastN); 142 | 143 | return SamplingContext._( 144 | params, 145 | mu, 146 | grammar, 147 | p, 148 | len, 149 | usedSize, 150 | ); 151 | } 152 | 153 | void free() { 154 | final g = grammar; 155 | if (g != null) { 156 | llama_cpp.llama_grammar_free(g); 157 | } 158 | calloc.free(mirostatMu); 159 | calloc.free(_prev); 160 | } 161 | 162 | void reset() { 163 | final g = grammar; 164 | if (g != null) { 165 | llama_cpp.llama_grammar_free(g); 166 | } 167 | for (var i = 0; i < prevSize; i++) { 168 | _prev[i] = 0; 169 | } 170 | } 171 | 172 | int get lastSampledToken => _prev[prevSize - 1]; 173 | 174 | ffi.Pointer get penaltyPointer => 175 | _prev + prevSize - usedSize; 176 | 177 | /// A ring buffer to append a list of tokens 178 | void acceptSampling( 179 | ffi.Pointer ctx, 180 | List ids, 181 | bool applyGrammar, 182 | ) { 183 | final n = min(ids.length, prevSize); 184 | for (var i = 0; i < prevSize - n; i++) { 185 | (_prev + i).value = _prev[i + n]; 186 | } 187 | for (var i = 0; i < n; i++) { 188 | (_prev + i + prevSize - n).value = ids[i]; 189 | } 190 | 191 | if (grammar != null && applyGrammar) { 192 | // TODO: consider grammar 193 | } 194 | } 195 | } 196 | 197 | (ffi.Pointer, int) _createNativeTokens( 198 | SamplingParams params) { 199 | final promptTokens = params.penaltyPromptTokens ?? []; 200 | final tokens = params.usePenaltyPromptTokens && promptTokens.isNotEmpty 201 | ? promptTokens 202 | : List.filled(params.nPrev, 0); 203 | final p = calloc.allocate( 204 | ffi.sizeOf() * tokens.length); 205 | for (var i = 0; i < tokens.length; i++) { 206 | p[i] = tokens[i]; 207 | } 208 | return (p, tokens.length); 209 | } 210 | -------------------------------------------------------------------------------- /pubspec.yaml: -------------------------------------------------------------------------------- 1 | name: llama_cpp 2 | description: A dart binding for llama.cpp library, bringing AI to dart world. 3 | version: 1.2.0 4 | repository: https://github.com/lindeer/llama-cpp 5 | 6 | topics: 7 | - ai 8 | - nlp 9 | - llm 10 | 11 | environment: 12 | sdk: '>=3.3.0 <4.0.0' 13 | 14 | dependencies: 15 | ffi: ^2.1.3 16 | native_assets_cli: ^0.8.0 17 | 18 | dev_dependencies: 19 | ffigen: ^11.0.0 20 | lints: ^3.0.0 21 | test: ^1.21.0 22 | -------------------------------------------------------------------------------- /test/data/text.txt: -------------------------------------------------------------------------------- 1 | 人生的意义是什么? 
2 | 老板把几张桌子拼在了一起,很快桌子上变成了一顿丰盛大餐的节日狂欢:盘子和碟子乱七八糟,闪着黄色的油脂、四处碎裂的甲壳动物、面包屑、焦黑的骨头、混乱不堪的瓶子和杯子,几乎所有人都建议不要按预先设想的计划旅行。岛上的每个地方都可以在几个小时内乘车到达。纵横交错的道路能更好地了解西西里的情况。晚些时候,港口辖区的人都走光了,只有零散的几个人坐在桌子旁,这才第一次听到海水的荡漾。港口边一个房子的格子阳台上,一个老人还站着,就像几个小时前一样。他在波斯帽下一动不动,几乎无法把他从倚靠的半明半暗中分辨出来,喧嚣肆意从他身边掠过。记不起是谁先看到的他。现在轮到唐·卡利奇奥(Don Calicchio)开始谈论阳台上的那个人。那个人和岛上居民的典型模型:沉默寡言,纹丝不动,暗中观察,满是忧郁的怀疑。他肯定对发生的事情知之甚少,但这些事情其实塑造了他的思考和生活方式。他说,历史深深地扎根在人们的血液中,三千年来永恒的往复。一遍又一遍的征服。开始是多里亚人和腓尼基人,然后是希腊僭主(Tyrannen)和罗马执政官(Prokonsuln)。接着是伊斯兰宗教领袖(Imame)和诺曼诸侯、霍亨斯陶芬和安茹家族的战事,又接着是加里波第(Garibaldi)和皮埃蒙特小国王1,最后是墨索里尼、德国人和美国的登陆舰队。他总结说,所有这些征服者,无论他们来了多久,都意味着西西里的屈服。 3 | -------------------------------------------------------------------------------- /test/data/values.txt: -------------------------------------------------------------------------------- 1 | [0.002748, 0.018709, -0.004901, 0.033228, 0.019280, -0.007864, 0.009661, 0.021122, -0.004441, 0.060076, -0.046824, 0.039386, -0.018559, -0.016204, -0.015773, 0.037257, 0.008054, 0.047608, -0.031019, -0.028022, 0.039248, -0.067525, 0.022590, 0.019823, 0.012489, 0.097309, -0.013562, 0.000971, 0.005299, 0.017474, 0.013749, 0.058956, 0.031092, 0.004360, -0.039856, 0.047605, -0.009411, 0.016901, -0.000576, -0.039155, -0.008896, 0.027975, 0.004059, -0.012684, 0.041530, -0.028001, -0.002777, -0.004369, -0.011253, 0.056504, 0.039857, 0.193007, 0.002377, -0.007906, -0.005569, -0.011621, 0.058219, -0.026838, -0.003529, -0.002906, -0.043625, 0.008552, -0.013950, 0.094324, -0.016286, -0.006553, 0.024853, 0.031572, -0.015191, 0.016623, 0.024046, -0.021184, 0.007180, -0.031437, 0.003170, 0.020737, -0.036690, 0.022023, -0.041029, -0.003451, -0.021887, -0.024111, -0.029427, -0.013289, 0.010364, 0.063137, -0.001661, -0.036666, 0.011898, 0.052438, -0.013414, -0.005930, -0.077403, 0.000270, 0.041493, -0.052804, 0.003009, -0.052771, 0.037389, 0.005920, -0.016384, -0.059444, -0.091977, 0.043008, 0.035662, -0.039837, -0.049678, 0.028091, -0.028031, -0.015087, 0.000492, -0.013573, 0.022487, -0.058391, -0.036355, -0.035800, 0.014695, -0.021363, -0.068123, -0.029504, -0.022519, -0.032145, 0.054870, -0.061875, -0.016571, -0.009690, 0.006072, 0.063474, -0.026058, 0.057749, 0.016925, -0.027694, 0.004955, -0.012908, -0.053931, 0.000664, -0.016369, -0.031594, 0.034740, -0.046654, 0.049105, 0.040210, 0.008307, 0.016486, 0.006993, -0.003738, 0.023174, -0.004705, -0.009223, 0.020779, 0.036851, 0.037139, 0.041900, 0.007291, -0.003721, 0.046447, -0.025414, -0.003266, 0.038864, -0.022950, -0.035291, -0.009215, 0.026389, 0.044462, 0.061325, -0.009947, -0.060005, -0.003428, 0.018673, 0.018146, 0.029088, -0.042467, -0.013687, -0.044893, 0.005011, 0.028534, 0.038162, -0.015792, -0.001439, -0.027494, -0.038440, -0.023412, 0.024505, 0.029460, -0.038159, -0.003498, -0.015563, -0.030281, 0.028848, -0.048190, -0.001176, 0.024960, 0.003813, 0.038229, 0.010367, 0.036894, -0.030424, -0.011569, -0.009186, -0.030061, -0.019186, 0.053846, 0.033510, 0.031724, 0.001231, 0.056447, -0.027350, -0.003869, 0.012006, 0.012895, 0.003787, 0.017241, 0.008408, -0.009188, -0.026191, 0.053303, -0.030743, 0.004043, 0.007152, 0.029318, 0.076870, -0.024694, -0.031857, 0.007447, -0.023387, -0.027921, 0.026360, -0.036934, 0.050190, -0.030790, 0.000850, -0.049292, -0.061593, 0.014353, -0.006601, -0.004607, -0.014405, 0.034925, -0.007874, -0.043756, -0.025181, -0.003196, -0.011138, -0.030946, -0.048551, -0.019051, -0.015092, -0.059014, 0.045878, -0.017963, 0.047651, 0.011760, -0.013354, -0.015118, -0.024310, 0.009942, 0.017399, -0.003552, -0.007770, 0.007850, -0.025595, -0.021572, 0.033562, 
-0.073414, -0.087019, -0.001477, 0.022035, -0.004415, 0.023038, 0.027339, 0.014867, 0.043897, -0.002117, -0.011837, 0.008205, -0.060910, -0.009666, -0.042757, 0.039289, -0.076998, 0.004185, 0.053701, -0.028735, -0.006410, -0.084053, -0.069257, -0.040670, 0.011475, 0.015218, -0.034976, 0.046817, 0.009171, -0.005876, -0.042356, 0.268232, -0.000565, 0.019373, 0.030529, 0.005708, 0.019563, -0.042336, -0.005031, 0.028721, -0.058943, 0.026717, 0.009334, 0.031044, 0.000609, 0.027129, -0.039009, -0.077983, -0.033732, 0.001198, -0.010092, -0.047769, 0.025994, -0.012584, 0.001873, 0.016263, 0.018904, -0.017769, 0.007070, 0.047912, -0.020331, 0.002351, -0.001581, -0.016761, -0.028518, 0.041098, 0.048682, -0.002197, 0.025649, 0.016151, 0.004874, -0.044923, 0.029337, 0.011326, -0.037092, -0.033458, -0.045397, 0.066314, -0.037488, -0.024838, -0.006526, 0.002329, 0.025500, -0.059235, 0.004991, -0.026183, 0.037629, 0.023049, -0.075650, 0.054310, -0.014055, -0.016602, -0.042717, -0.007421, 0.001940, -0.016317, 0.048993, -0.032705, -0.007555, -0.009179, 0.007807, 0.013147, -0.001102, 0.034557, 0.008237, 0.015749, -0.028601, 0.020702, 0.039658, -0.012211, -0.054478, 0.033384, -0.033875, -0.016401, 0.014640, 0.032412, -0.000189, -0.020176, -0.023004, 0.015024, -0.038237, -0.004677, -0.039297, -0.088269, 0.014198, 0.094159, -0.037502, -0.010474, 0.012121, 0.007789, -0.063851, -0.021236, -0.007159, -0.012587, -0.006184, 0.006667, 0.008835, -0.033254, -0.034641, 0.024939, 0.012026, -0.020599, -0.014796, 0.019221, -0.008406, 0.073629, 0.041525, -0.001637, -0.041848, -0.075896, 0.017085, -0.065918, -0.039616, 0.032568, 0.002438, 0.018990, 0.022434, -0.023249, -0.020664, 0.046687, -0.053676, -0.003266, -0.025215, 0.002564, 0.041080, -0.027405, 0.013426, 0.007414, -0.035200, -0.008517, -0.034125, -0.033089, -0.030880, -0.029554, 0.064523, -0.009705, -0.022755, 0.014105, 0.010406, 0.012919, 0.044742, -0.006403, -0.015069, 0.022184, -0.009986, 0.047907, -0.052769, 0.069356, 0.053625, 0.011936, -0.028786, -0.081952, 0.003949, 0.012676, -0.001549, -0.042265, 0.002343, 0.030193, 0.052115, 0.003902, -0.032469, -0.048285, -0.034673, -0.021152, -0.055607, -0.007042, -0.061425, -0.005131, -0.005833, -0.037384, -0.003194, -0.005009, 0.048388, -0.052405, -0.048759, 0.028373, 0.054570, 0.021434, -0.054493, -0.001462, 0.008312, -0.014067, -0.001033, -0.050160, -0.043146, -0.026299, -0.025467, -0.027003, -0.019606, -0.007006, 0.048621, -0.023143, 0.028375, -0.013187, -0.053275, 0.007346, 0.011086, -0.026073, -0.012925, 0.012604, -0.001384, 0.020499, -0.052935, -0.006661, 0.032204, -0.018983, -0.014345, -0.011039, -0.037490, 0.008708, 0.066112, 0.008486, -0.011448, 0.017040, -0.016320, -0.031394, 0.023015, -0.038683, -0.042025, 0.000194, 0.054627, -0.055844, 0.019194, 0.023265, -0.018689, -0.049089, 0.052777, -0.077946, -0.032353, 0.023846, 0.002256, -0.038973, 0.045957, -0.024696, -0.089713, -0.015449, -0.060195, 0.066317, -0.049429, 0.041355, 0.016323, -0.018458, -0.033797, -0.021226, 0.006238, 0.013123, -0.040336, -0.015303, 0.033016, -0.011223, 0.060414, -0.045488, 0.039477, -0.016573, 0.026991, 0.048341, -0.060210, 0.009705, -0.012491, -0.033483, 0.067254, -0.037381, 0.003392, -0.014488, -0.034642, -0.008514, 0.045176, -0.006417, 0.002323, -0.014924, -0.004272, 0.001454, 0.033591, 0.033394, -0.070911, 0.036816, -0.000787, -0.025466, 0.006998, -0.015088, 0.043901, -0.013712, -0.036422, -0.000961, -0.027177, 0.006389, 0.061785, 0.022760, -0.069621, -0.028772, 0.017829, -0.029324, -0.050558, -0.019491, -0.014544, 0.004695, 
0.033320, -0.005903, 0.016032, -0.049241, 0.004659, 0.025763, 0.013882, 0.017032, -0.020307, 0.040979, 0.081039, 0.050956, 0.018274, 0.015328, -0.022948, 0.025442, 0.009415, -0.037822, 0.021831, -0.025299, 0.054633, -0.028054, 0.028637, -0.046147, 0.003350, -0.050639, -0.020184, -0.045411, 0.011977, 0.046723, 0.027515, 0.015294, 0.041548, 0.024747, -0.047016, 0.021823, -0.049481, 0.065846, -0.000544, -0.013925, 0.055688, 0.077173, 0.052599, -0.026109, 0.002161, 0.028716, 0.014113, 0.020951, 0.014484, -0.017879, 0.027740, -0.041337, -0.036965, 0.032288, 0.014887, 0.060533, 0.073322, -0.037303, 0.007789, -0.001182, 0.001361, -0.025140, -0.011141, -0.027716, -0.032432, -0.000025, 0.049933, 0.000302, 0.044789, -0.002865, 0.094235, -0.030566, 0.010469, 0.002333, 0.040289, -0.005740, -0.047480, -0.015792, 0.009191, 0.021395, 0.003657, -0.036484, 0.004549, -0.001573, 0.008670, 0.039228, -0.007337, -0.037421, 0.035785, 0.003036, -0.001534, 0.046469, -0.002735, 0.042027, -0.038637, -0.036651, 0.135092, -0.005036, -0.035098, -0.033986, -0.035076, -0.016081, 0.021915, 0.082592, 0.006401, 0.013491, -0.008547, -0.054606, -0.007829, -0.012136, 0.034574, 0.049043, -0.050791, -0.059708, -0.021508, -0.039706, 0.000696, 0.006534, 0.009665, 0.001398, -0.011411, 0.020646, 0.010460, 0.004403, -0.004071, -0.004899, 0.012417, 0.014156, -0.053095, 0.049707, -0.048853, -0.034150, -0.060462, -0.042575, -0.020059, 0.027869, 0.025362, -0.037957, 0.012254, -0.009697, 0.046583, -0.020720, 0.039554, 0.063817, 0.085704, -0.036594, 0.018128, 0.035817, -0.043685, 0.029166, -0.001077, 0.031751, 0.004168, -0.021054, -0.028695, -0.016313, 0.015408, 0.000334, 0.084837, 0.012301, 0.059702, -0.025221, 0.026763, 0.041199, -0.036252, -0.031334, 0.044451, -0.071544, 0.002161] 2 | [-0.018143, 0.008852, -0.012980, -0.017932, 0.004372, -0.005681, 0.018440, -0.017154, -0.026233, 0.040064, -0.027006, 0.010853, -0.036699, 0.014419, -0.019105, 0.021649, -0.013456, 0.001155, -0.021316, 0.046883, 0.014310, 0.016424, -0.071366, 0.014739, 0.019437, -0.007247, 0.020553, -0.008878, 0.069753, 0.060115, -0.032874, 0.010057, 0.032123, -0.000345, 0.024318, 0.000402, -0.012160, -0.004174, 0.030701, -0.022710, -0.006988, -0.019127, 0.035932, 0.005848, -0.025857, -0.000143, 0.006460, 0.022864, 0.029600, 0.017288, -0.049645, 0.253160, -0.034791, -0.021239, 0.014576, 0.003085, 0.010481, -0.032058, 0.090785, 0.033968, 0.007878, 0.022380, -0.034588, 0.023633, 0.040676, 0.038403, -0.010366, 0.005184, 0.016591, -0.047127, -0.061293, 0.002111, 0.013041, -0.015932, 0.038606, 0.021251, -0.030351, -0.031395, 0.028123, 0.007267, -0.042682, -0.009201, -0.047993, -0.006835, 0.001766, 0.024612, -0.014592, 0.012323, 0.107377, 0.028368, -0.001037, -0.039827, -0.025936, -0.024666, -0.072203, -0.018609, 0.002930, -0.006727, -0.013415, -0.005397, -0.021885, -0.075212, 0.011520, -0.023104, 0.012526, -0.010788, -0.033539, 0.040659, -0.004011, -0.014619, 0.038324, -0.043777, 0.031286, -0.023233, 0.014507, 0.010989, -0.007007, -0.012117, 0.004525, -0.000170, -0.022546, 0.014299, -0.035408, -0.003327, 0.030012, -0.012777, 0.016996, 0.063880, -0.029673, 0.011598, 0.025941, -0.042055, 0.001732, 0.009818, -0.014632, -0.035351, -0.009402, -0.026646, 0.012423, -0.018959, 0.020423, -0.002210, -0.006109, 0.015463, 0.013806, -0.066499, 0.008910, 0.018189, -0.053944, -0.008109, -0.044242, 0.007260, -0.018921, 0.024620, -0.018877, -0.002768, 0.011710, 0.008474, -0.061399, -0.008161, -0.011569, -0.030398, -0.003328, 0.064313, 0.000425, -0.016365, -0.054703, 0.026104, 0.015441, 
-0.002947, -0.007322, -0.021591, -0.027526, -0.038455, 0.041345, 0.034893, -0.022149, -0.001833, -0.057281, 0.008519, 0.001363, 0.016069, -0.042331, 0.033602, 0.018698, 0.041008, -0.046121, -0.033649, 0.012806, -0.027645, -0.017044, 0.023213, -0.050048, -0.033277, -0.020886, -0.044509, -0.039000, -0.019332, -0.001629, -0.003827, 0.031668, -0.005092, -0.001875, 0.043424, 0.000626, -0.019863, 0.012846, 0.049299, 0.024386, 0.046971, -0.008021, 0.050799, 0.028351, 0.005843, -0.018338, 0.041342, -0.012484, 0.013896, -0.006079, 0.013040, -0.052899, -0.025937, -0.038179, -0.024130, 0.038773, -0.014598, -0.028820, -0.052534, -0.036298, -0.057635, 0.016938, -0.002892, -0.025019, 0.011015, 0.004795, -0.042835, 0.025884, 0.010080, 0.023427, -0.012004, -0.025530, 0.012286, -0.016133, 0.000858, 0.011696, 0.022822, -0.017968, 0.001757, 0.018143, -0.048050, -0.027925, 0.027805, -0.011089, -0.043770, -0.006697, -0.014253, 0.018511, -0.016123, 0.019746, 0.020176, -0.017052, -0.004586, -0.014835, -0.020798, -0.021490, 0.006280, -0.061022, -0.004235, -0.027600, 0.030015, -0.038244, 0.062919, -0.053732, -0.017938, -0.053406, 0.024520, 0.001105, -0.027806, 0.001512, -0.018585, 0.005771, 0.010302, 0.053443, 0.026496, -0.029605, -0.081924, -0.011144, 0.055886, 0.018785, 0.016905, -0.048825, 0.017786, -0.064784, -0.013671, 0.442080, -0.084582, 0.013849, -0.019666, -0.082373, 0.017085, -0.000916, 0.044251, 0.024448, 0.058969, -0.031943, 0.055358, -0.002865, -0.023817, -0.011372, 0.024001, 0.005060, -0.045001, 0.051598, -0.037835, -0.020254, 0.037380, 0.034399, 0.026193, 0.025360, 0.039737, 0.036441, -0.004631, -0.033750, 0.035122, 0.022928, 0.019629, -0.033454, -0.016481, 0.038379, -0.029493, 0.046083, 0.006995, -0.042532, -0.003677, 0.006150, -0.008367, 0.005011, 0.016327, -0.042770, 0.011053, 0.000615, -0.004791, -0.012687, -0.036017, -0.029267, -0.016272, -0.030642, -0.071663, 0.014216, -0.019710, -0.022685, -0.037142, -0.003084, 0.026906, 0.005628, -0.042816, 0.001880, -0.005221, 0.020948, -0.050402, 0.020020, 0.005918, -0.046436, 0.024602, -0.038509, 0.001782, 0.010593, 0.053763, -0.016677, -0.036851, 0.001356, -0.027268, -0.042748, -0.056925, 0.032705, -0.039534, 0.004900, 0.014198, 0.016457, 0.015782, 0.000441, 0.032984, 0.001289, 0.012026, -0.021828, -0.061351, 0.002498, -0.046783, -0.009831, -0.000304, -0.013009, 0.003612, -0.002828, -0.011761, -0.029889, -0.004185, 0.077845, -0.014204, 0.002303, -0.015936, 0.050738, 0.021281, -0.024871, -0.006210, -0.007039, -0.030317, 0.008190, 0.108005, -0.000566, 0.026994, 0.027732, 0.001104, -0.068122, -0.020904, -0.009004, 0.016278, 0.028514, -0.015468, 0.006343, -0.018696, -0.011136, 0.019137, -0.005979, 0.015671, -0.005553, -0.027164, 0.062806, 0.075525, -0.004504, 0.036799, 0.032845, -0.025675, -0.023274, -0.030614, 0.077937, 0.005885, -0.024767, 0.034007, -0.034811, 0.040075, -0.009517, -0.005475, 0.012956, -0.042830, 0.055849, 0.011082, 0.017170, 0.005174, -0.040940, 0.024219, -0.004569, 0.033617, -0.053037, -0.033828, -0.024299, 0.006937, 0.051105, -0.005996, -0.033195, -0.006971, -0.042310, -0.028655, 0.006117, 0.012709, 0.017386, -0.027348, 0.031368, -0.039168, 0.040344, 0.025833, 0.033354, 0.017633, -0.009275, 0.008685, 0.031174, 0.003524, -0.034325, 0.013287, -0.008789, 0.037559, 0.007184, -0.025878, 0.038920, -0.011559, -0.010757, 0.011745, 0.019417, -0.008194, 0.018362, -0.035069, -0.001238, 0.008175, 0.010998, 0.036282, 0.016307, 0.018908, -0.029127, -0.017894, -0.057597, 0.052655, -0.027317, -0.017046, -0.017265, 0.016358, -0.027357, 0.016560, 
-0.030029, 0.017136, 0.008966, 0.049322, -0.016273, -0.044973, 0.005486, 0.038597, 0.026139, 0.052185, -0.013302, -0.021002, -0.005554, -0.013731, -0.009178, -0.004234, 0.019047, -0.043806, -0.024731, 0.007249, -0.005646, -0.022322, -0.028383, -0.020909, -0.020115, -0.017795, -0.004946, -0.042137, -0.012211, 0.023940, -0.028057, -0.021644, -0.024881, -0.003815, 0.007790, 0.030010, -0.020316, -0.031519, 0.045400, -0.026035, 0.035411, -0.041054, 0.016395, -0.058526, -0.050071, -0.005350, -0.000012, 0.033756, -0.005484, -0.039902, 0.021338, 0.049371, -0.023813, -0.015730, 0.024044, 0.001983, 0.033334, 0.056554, -0.071455, -0.019449, 0.058415, -0.031268, -0.031091, -0.011072, -0.039906, 0.029178, -0.018974, 0.048917, 0.032150, -0.008453, -0.042689, -0.021592, -0.021074, 0.021745, 0.059312, -0.071636, 0.001728, -0.015454, 0.006690, -0.031002, -0.031740, -0.021628, 0.004644, -0.051729, -0.009922, 0.018186, -0.021257, -0.023335, 0.064015, 0.025273, -0.000643, -0.021201, 0.001624, 0.007812, 0.032212, 0.034465, 0.014869, -0.009189, -0.015261, 0.016883, 0.055912, 0.030596, 0.029166, -0.023781, 0.052843, 0.011924, 0.012102, -0.029363, 0.007754, 0.015042, -0.055564, -0.022002, 0.013403, -0.005262, -0.021019, -0.029399, 0.041430, -0.007306, 0.011596, -0.057910, 0.006634, 0.036520, 0.032751, 0.043588, -0.007944, -0.017204, -0.065091, -0.005859, 0.002196, 0.003127, -0.093287, 0.031131, -0.013190, -0.022992, 0.033164, 0.044008, 0.028843, -0.026998, -0.016769, -0.055378, 0.016484, 0.006436, -0.000911, -0.049148, 0.010099, -0.009313, 0.006079, -0.025318, -0.027986, -0.001028, 0.044158, 0.001163, -0.073567, -0.020542, -0.015759, -0.018729, -0.008904, -0.012252, 0.020013, 0.009463, 0.035434, -0.029390, -0.016625, 0.012281, 0.020857, -0.024805, -0.006153, -0.028686, 0.022368, 0.025523, 0.019324, 0.003363, 0.013508, -0.011024, 0.015968, 0.077530, 0.053599, -0.017130, -0.024727, -0.043727, 0.008582, -0.068265, -0.029977, -0.020101, -0.016305, 0.042614, 0.016977, -0.032327, -0.029827, 0.153215, 0.004094, -0.045580, 0.019848, 0.002369, 0.024696, -0.003266, 0.021996, -0.061369, -0.064577, -0.050211, -0.043349, 0.013123, 0.002832, -0.033594, -0.016068, -0.030281, 0.072160, -0.044019, 0.015623, 0.072866, 0.032856, 0.040987, 0.037885, 0.000853, 0.014264, -0.023801, 0.029079, 0.016286, -0.004104, 0.002991, -0.005541, -0.022465, -0.060349, -0.035410, -0.013432, -0.000811, -0.037817, 0.024563, 0.006463, -0.032183, 0.013181, 0.019463, 0.046768, 0.005375, -0.025111, -0.004335, 0.012567, 0.051717, -0.005750, 0.001575, -0.010433, 0.002139, 0.016632, 0.001609, 0.042580, -0.026003, -0.020230, -0.038322, 0.034158, 0.011612, -0.027692, 0.021338, -0.057004, 0.001955, -0.002718, -0.013585, -0.019107, 0.030198, 0.018570, -0.067122, 0.040898, -0.028554] 3 | -------------------------------------------------------------------------------- /test/test_embedding.dart: -------------------------------------------------------------------------------- 1 | import 'dart:io'; 2 | 3 | import 'package:llama_cpp/embedding.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | const _url = 'https://hf-mirror.com/CompendiumLabs/bge-base-zh-v1.5-gguf/' 7 | 'blob/main/bge-base-zh-v1.5-q4_k_m.gguf'; 8 | const _embedModelPath = 'test/data/bge-base-zh-v1.5.gguf'; 9 | 10 | void _compareList(List d, List v) { 11 | expect(d.length, v.length); 12 | for (var j = 0; j < d.length; j++) { 13 | final v1 = d[j]; 14 | final v2 = v[j]; 15 | expect((v1 - v2).abs() < 0.0000015, true, reason: "[$j]: '$v1' != '$v2'"); 16 | } 17 | } 18 | 19 | void main() { 20 | final 
embedding = Embedding(_embedModelPath); 21 | final prompts = File('test/data/text.txt').readAsLinesSync(); 22 | final values = File('test/data/values.txt').readAsLinesSync().map((l) { 23 | final v = l 24 | .replaceAll('[', '') 25 | .replaceAll(']', '') 26 | .split(',') 27 | .where((e) => e.trim().isNotEmpty) 28 | .indexed 29 | .map((idx) { 30 | final (j, e) = idx; 31 | try { 32 | return double.parse(e.trim()); 33 | } catch (x) { 34 | print("parse [$j]: '$e' failed!"); 35 | rethrow; 36 | } 37 | }).toList(); 38 | return v; 39 | }).toList(); 40 | 41 | setUp(() { 42 | expect( 43 | File(_embedModelPath).existsSync(), 44 | true, 45 | reason: "Download the model from $_url to '$_embedModelPath' before testing!", 46 | ); 47 | }); 48 | 49 | test('basic embed', () { 50 | final d1 = embedding.embedSingle(prompts[0]); 51 | _compareList(d1, values[0]); 52 | final d2 = embedding.embedSingle(prompts[1]); 53 | _compareList(d2, values[1]); 54 | }); 55 | 56 | test('batch embed', () { 57 | final result = embedding.embedBatch(prompts); 58 | expect(result.length, values.length); 59 | for (final r in result.indexed) { 60 | final (i, d) = r; 61 | _compareList(d, values[i]); 62 | } 63 | }); 64 | 65 | tearDownAll(() { 66 | embedding.dispose(); 67 | }); 68 | } 69 | --------------------------------------------------------------------------------
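
Note: the test above exercises the public `Embedding` API from `package:llama_cpp/embedding.dart`. A minimal usage sketch follows, based only on the calls the test makes (`Embedding(path)`, `embedSingle`, `embedBatch`, `dispose`); the model path is a placeholder, and the vector element types are inferred from how the test consumes the results rather than confirmed against the library source.

import 'package:llama_cpp/embedding.dart';

void main() {
  // Placeholder path: point this at any GGUF embedding model on disk.
  final embedding = Embedding('test/data/bge-base-zh-v1.5.gguf');

  // One input text -> one vector; the test compares such vectors
  // element-wise against the fixtures in test/data/values.txt.
  final single = embedding.embedSingle('an example sentence');
  print('dimensions: ${single.length}');

  // Several inputs at once -> one vector per input, in order.
  final batch = embedding.embedBatch(['first sentence', 'second sentence']);
  print('vectors: ${batch.length}');

  // Release the native llama.cpp resources, as the test does in tearDownAll.
  embedding.dispose();
}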