├── .gitattributes
├── .gitignore
├── .gitmodules
├── LICENSE.txt
├── LlamaCpp.sln
├── LlamaCppCli
│   ├── LlamaCppCli.csproj
│   ├── Program.cs
│   ├── SampleClient.cs
│   ├── SampleEmbedding.cs
│   ├── SampleLibrary.cs
│   ├── SampleMeta.cs
│   └── SampleRaw.cs
├── LlamaCppLib
│   ├── BlockingQueue.cs
│   ├── Extensions.cs
│   ├── Interop.cs
│   ├── LlamaCppLib.csproj
│   ├── LlmClient.cs
│   ├── LlmEngine.cs
│   ├── LlmPrompt.cs
│   ├── LlmSequence.cs
│   ├── MultibyteCharAssembler.cs
│   ├── Native.cs
│   ├── Options.cs
│   ├── Slots.cs
│   └── UnmanagedResource.cs
├── LlamaCppWeb
│   ├── LlamaCppWeb.csproj
│   ├── Program.cs
│   ├── Properties
│   │   └── launchSettings.json
│   ├── appsettings.Development.json
│   └── appsettings.json
├── README.md
├── clean.cmd
└── clean.sh
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is need for earlier builds of msysgit that does not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Mono auto generated files
17 | mono_crash.*
18 |
19 | # Build results
20 | [Dd]ebug/
21 | [Dd]ebugPublic/
22 | [Rr]elease/
23 | [Rr]eleases/
24 | x64/
25 | x86/
26 | [Ww][Ii][Nn]32/
27 | [Aa][Rr][Mm]/
28 | [Aa][Rr][Mm]64/
29 | bld/
30 | [Bb]in/
31 | [Oo]bj/
32 | [Oo]ut/
33 | [Ll]og/
34 | [Ll]ogs/
35 |
36 | # Visual Studio 2015/2017 cache/options directory
37 | .vs/
38 | # Uncomment if you have tasks that create the project's static files in wwwroot
39 | #wwwroot/
40 |
41 | # Visual Studio 2017 auto generated files
42 | Generated\ Files/
43 |
44 | # MSTest test Results
45 | [Tt]est[Rr]esult*/
46 | [Bb]uild[Ll]og.*
47 |
48 | # NUnit
49 | *.VisualState.xml
50 | TestResult.xml
51 | nunit-*.xml
52 |
53 | # Build Results of an ATL Project
54 | [Dd]ebugPS/
55 | [Rr]eleasePS/
56 | dlldata.c
57 |
58 | # Benchmark Results
59 | BenchmarkDotNet.Artifacts/
60 |
61 | # .NET Core
62 | project.lock.json
63 | project.fragment.lock.json
64 | artifacts/
65 |
66 | # ASP.NET Scaffolding
67 | ScaffoldingReadMe.txt
68 |
69 | # StyleCop
70 | StyleCopReport.xml
71 |
72 | # Files built by Visual Studio
73 | *_i.c
74 | *_p.c
75 | *_h.h
76 | *.ilk
77 | *.meta
78 | *.obj
79 | *.iobj
80 | *.pch
81 | *.pdb
82 | *.ipdb
83 | *.pgc
84 | *.pgd
85 | *.rsp
86 | *.sbr
87 | *.tlb
88 | *.tli
89 | *.tlh
90 | *.tmp
91 | *.tmp_proj
92 | *_wpftmp.csproj
93 | *.log
94 | *.vspscc
95 | *.vssscc
96 | .builds
97 | *.pidb
98 | *.svclog
99 | *.scc
100 |
101 | # Chutzpah Test files
102 | _Chutzpah*
103 |
104 | # Visual C++ cache files
105 | ipch/
106 | *.aps
107 | *.ncb
108 | *.opendb
109 | *.opensdf
110 | *.sdf
111 | *.cachefile
112 | *.VC.db
113 | *.VC.VC.opendb
114 |
115 | # Visual Studio profiler
116 | *.psess
117 | *.vsp
118 | *.vspx
119 | *.sap
120 |
121 | # Visual Studio Trace Files
122 | *.e2e
123 |
124 | # TFS 2012 Local Workspace
125 | $tf/
126 |
127 | # Guidance Automation Toolkit
128 | *.gpState
129 |
130 | # ReSharper is a .NET coding add-in
131 | _ReSharper*/
132 | *.[Rr]e[Ss]harper
133 | *.DotSettings.user
134 |
135 | # TeamCity is a build add-in
136 | _TeamCity*
137 |
138 | # DotCover is a Code Coverage Tool
139 | *.dotCover
140 |
141 | # AxoCover is a Code Coverage Tool
142 | .axoCover/*
143 | !.axoCover/settings.json
144 |
145 | # Coverlet is a free, cross platform Code Coverage Tool
146 | coverage*.json
147 | coverage*.xml
148 | coverage*.info
149 |
150 | # Visual Studio code coverage results
151 | *.coverage
152 | *.coveragexml
153 |
154 | # NCrunch
155 | _NCrunch_*
156 | .*crunch*.local.xml
157 | nCrunchTemp_*
158 |
159 | # MightyMoose
160 | *.mm.*
161 | AutoTest.Net/
162 |
163 | # Web workbench (sass)
164 | .sass-cache/
165 |
166 | # Installshield output folder
167 | [Ee]xpress/
168 |
169 | # DocProject is a documentation generator add-in
170 | DocProject/buildhelp/
171 | DocProject/Help/*.HxT
172 | DocProject/Help/*.HxC
173 | DocProject/Help/*.hhc
174 | DocProject/Help/*.hhk
175 | DocProject/Help/*.hhp
176 | DocProject/Help/Html2
177 | DocProject/Help/html
178 |
179 | # Click-Once directory
180 | publish/
181 |
182 | # Publish Web Output
183 | *.[Pp]ublish.xml
184 | *.azurePubxml
185 | # Note: Comment the next line if you want to checkin your web deploy settings,
186 | # but database connection strings (with potential passwords) will be unencrypted
187 | *.pubxml
188 | *.publishproj
189 |
190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
191 | # checkin your Azure Web App publish settings, but sensitive information contained
192 | # in these scripts will be unencrypted
193 | PublishScripts/
194 |
195 | # NuGet Packages
196 | *.nupkg
197 | # NuGet Symbol Packages
198 | *.snupkg
199 | # The packages folder can be ignored because of Package Restore
200 | **/[Pp]ackages/*
201 | # except build/, which is used as an MSBuild target.
202 | !**/[Pp]ackages/build/
203 | # Uncomment if necessary however generally it will be regenerated when needed
204 | #!**/[Pp]ackages/repositories.config
205 | # NuGet v3's project.json files produces more ignorable files
206 | *.nuget.props
207 | *.nuget.targets
208 |
209 | # Microsoft Azure Build Output
210 | csx/
211 | *.build.csdef
212 |
213 | # Microsoft Azure Emulator
214 | ecf/
215 | rcf/
216 |
217 | # Windows Store app package directories and files
218 | AppPackages/
219 | BundleArtifacts/
220 | Package.StoreAssociation.xml
221 | _pkginfo.txt
222 | *.appx
223 | *.appxbundle
224 | *.appxupload
225 |
226 | # Visual Studio cache files
227 | # files ending in .cache can be ignored
228 | *.[Cc]ache
229 | # but keep track of directories ending in .cache
230 | !?*.[Cc]ache/
231 |
232 | # Others
233 | ClientBin/
234 | ~$*
235 | *~
236 | *.dbmdl
237 | *.dbproj.schemaview
238 | *.jfm
239 | *.pfx
240 | *.publishsettings
241 | orleans.codegen.cs
242 |
243 | # Including strong name files can present a security risk
244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245 | #*.snk
246 |
247 | # Since there are multiple workflows, uncomment next line to ignore bower_components
248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249 | #bower_components/
250 |
251 | # RIA/Silverlight projects
252 | Generated_Code/
253 |
254 | # Backup & report files from converting an old project file
255 | # to a newer Visual Studio version. Backup files are not needed,
256 | # because we have git ;-)
257 | _UpgradeReport_Files/
258 | Backup*/
259 | UpgradeLog*.XML
260 | UpgradeLog*.htm
261 | ServiceFabricBackup/
262 | *.rptproj.bak
263 |
264 | # SQL Server files
265 | *.mdf
266 | *.ldf
267 | *.ndf
268 |
269 | # Business Intelligence projects
270 | *.rdl.data
271 | *.bim.layout
272 | *.bim_*.settings
273 | *.rptproj.rsuser
274 | *- [Bb]ackup.rdl
275 | *- [Bb]ackup ([0-9]).rdl
276 | *- [Bb]ackup ([0-9][0-9]).rdl
277 |
278 | # Microsoft Fakes
279 | FakesAssemblies/
280 |
281 | # GhostDoc plugin setting file
282 | *.GhostDoc.xml
283 |
284 | # Node.js Tools for Visual Studio
285 | .ntvs_analysis.dat
286 | node_modules/
287 |
288 | # Visual Studio 6 build log
289 | *.plg
290 |
291 | # Visual Studio 6 workspace options file
292 | *.opt
293 |
294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295 | *.vbw
296 |
297 | # Visual Studio LightSwitch build output
298 | **/*.HTMLClient/GeneratedArtifacts
299 | **/*.DesktopClient/GeneratedArtifacts
300 | **/*.DesktopClient/ModelManifest.xml
301 | **/*.Server/GeneratedArtifacts
302 | **/*.Server/ModelManifest.xml
303 | _Pvt_Extensions
304 |
305 | # Paket dependency manager
306 | .paket/paket.exe
307 | paket-files/
308 |
309 | # FAKE - F# Make
310 | .fake/
311 |
312 | # CodeRush personal settings
313 | .cr/personal
314 |
315 | # Python Tools for Visual Studio (PTVS)
316 | __pycache__/
317 | *.pyc
318 |
319 | # Cake - Uncomment if you are using it
320 | # tools/**
321 | # !tools/packages.config
322 |
323 | # Tabs Studio
324 | *.tss
325 |
326 | # Telerik's JustMock configuration file
327 | *.jmconfig
328 |
329 | # BizTalk build output
330 | *.btp.cs
331 | *.btm.cs
332 | *.odx.cs
333 | *.xsd.cs
334 |
335 | # OpenCover UI analysis results
336 | OpenCover/
337 |
338 | # Azure Stream Analytics local run output
339 | ASALocalRun/
340 |
341 | # MSBuild Binary and Structured Log
342 | *.binlog
343 |
344 | # NVidia Nsight GPU debugger configuration file
345 | *.nvuser
346 |
347 | # MFractors (Xamarin productivity tool) working folder
348 | .mfractor/
349 |
350 | # Local History for Visual Studio
351 | .localhistory/
352 |
353 | # BeatPulse healthcheck temp database
354 | healthchecksdb
355 |
356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
357 | MigrationBackup/
358 |
359 | # Ionide (cross platform F# VS Code tools) working folder
360 | .ionide/
361 |
362 | # Fody - auto-generated XML schema
363 | FodyWeavers.xsd
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "llama.cpp"]
2 | path = llama.cpp
3 | url = https://github.com/dranger003/llama.cpp.git
4 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 LlamaCppDotNet
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/LlamaCpp.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.5.33530.505
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LlamaCppLib", "LlamaCppLib\LlamaCppLib.csproj", "{92E1B971-305F-4526-AB1C-E89B43E21C24}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LlamaCppCli", "LlamaCppCli\LlamaCppCli.csproj", "{F85A71D0-CD44-4CDA-AB68-17D9090C3B1C}"
9 | EndProject
10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LlamaCppWeb", "LlamaCppWeb\LlamaCppWeb.csproj", "{9C1B4C2E-F84D-421E-B924-C5C394288AC5}"
11 | EndProject
12 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{22BD050B-B732-4B02-989A-CFBC1A43C440}"
13 | ProjectSection(SolutionItems) = preProject
14 | clean.cmd = clean.cmd
15 | LICENSE.txt = LICENSE.txt
16 | README.md = README.md
17 | EndProjectSection
18 | EndProject
19 | Global
20 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
21 | Debug|Any CPU = Debug|Any CPU
22 | Release|Any CPU = Release|Any CPU
23 | EndGlobalSection
24 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
25 | {92E1B971-305F-4526-AB1C-E89B43E21C24}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
26 | {92E1B971-305F-4526-AB1C-E89B43E21C24}.Debug|Any CPU.Build.0 = Debug|Any CPU
27 | {92E1B971-305F-4526-AB1C-E89B43E21C24}.Release|Any CPU.ActiveCfg = Release|Any CPU
28 | {92E1B971-305F-4526-AB1C-E89B43E21C24}.Release|Any CPU.Build.0 = Release|Any CPU
29 | {F85A71D0-CD44-4CDA-AB68-17D9090C3B1C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
30 | {F85A71D0-CD44-4CDA-AB68-17D9090C3B1C}.Debug|Any CPU.Build.0 = Debug|Any CPU
31 | {F85A71D0-CD44-4CDA-AB68-17D9090C3B1C}.Release|Any CPU.ActiveCfg = Release|Any CPU
32 | {F85A71D0-CD44-4CDA-AB68-17D9090C3B1C}.Release|Any CPU.Build.0 = Release|Any CPU
33 | {9C1B4C2E-F84D-421E-B924-C5C394288AC5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
34 | {9C1B4C2E-F84D-421E-B924-C5C394288AC5}.Debug|Any CPU.Build.0 = Debug|Any CPU
35 | {9C1B4C2E-F84D-421E-B924-C5C394288AC5}.Release|Any CPU.ActiveCfg = Release|Any CPU
36 | {9C1B4C2E-F84D-421E-B924-C5C394288AC5}.Release|Any CPU.Build.0 = Release|Any CPU
37 | {59159D33-CD95-4225-9EC5-5FF0D4937367}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
38 | {59159D33-CD95-4225-9EC5-5FF0D4937367}.Debug|Any CPU.Build.0 = Debug|Any CPU
39 | {59159D33-CD95-4225-9EC5-5FF0D4937367}.Release|Any CPU.ActiveCfg = Release|Any CPU
40 | {59159D33-CD95-4225-9EC5-5FF0D4937367}.Release|Any CPU.Build.0 = Release|Any CPU
41 | EndGlobalSection
42 | GlobalSection(SolutionProperties) = preSolution
43 | HideSolutionNode = FALSE
44 | EndGlobalSection
45 | GlobalSection(ExtensibilityGlobals) = postSolution
46 | SolutionGuid = {2E1CE567-CBFE-47D2-8451-8AD76D10702A}
47 | EndGlobalSection
48 | EndGlobal
49 |
--------------------------------------------------------------------------------
/LlamaCppCli/LlamaCppCli.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | net8.0
6 | enable
7 | enable
8 | ..
9 | preview
10 | false
11 | false
12 | True
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/LlamaCppCli/Program.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.InteropServices;
3 | using System.Text;
4 |
5 | namespace LlamaCppCli
6 | {
7 | internal partial class Program
8 | {
9 | static async Task Main(string[] args)
10 | {
11 | // Multibyte encoding handling (e.g. emojis, etc.)
12 | Console.OutputEncoding = Encoding.UTF8;
13 |
14 | // If you need to support runtime native library loading,
15 | // uncomment this line and implement `ResolveLibrary()` below.
16 | //NativeLibrary.SetDllImportResolver(typeof(LlamaCppLib.Native).Assembly, ResolveLibrary);
17 |
18 | var samples = new Dictionary<string, Func<string[], Task>>
19 | {
20 | // Native API using raw function calls (standalone)
21 | [nameof(RunSampleRawAsync)] = RunSampleRawAsync,
22 | // Library API using wrapped native calls (standalone)
23 | [nameof(RunSampleLibraryAsync)] = RunSampleLibraryAsync,
24 | // Remote API using wrapped client calls (first run `LlamaCppWeb.exe` for the API hosting)
25 | [nameof(RunSampleClientAsync)] = RunSampleClientAsync,
26 | // Dump GGUF meta data
27 | [nameof(RunDumpMetaAsync)] = RunDumpMetaAsync,
28 | // State load/save using raw function calls
29 | [nameof(RunSampleStateRawAsync)] = RunSampleStateRawAsync,
30 | // Embeddings API using raw function calls (intfloat/e5-mistral-7b-instruct)
31 | [nameof(RunSampleEmbeddingAsync)] = RunSampleEmbeddingAsync,
32 | }
33 | .Select((x, i) => (Index: i + 1, Sample: (Name: x.Key, Func: x.Value)))
34 | .ToList();
35 |
36 | if (args.Length < 1 || !Int32.TryParse(args[0], out var sampleIndex))
37 | {
38 | Console.WriteLine($"Usage: LlamaCppCli [SampleOpt1] [SampleOpt2] [...]");
39 | Console.WriteLine($"SampleNo:");
40 | samples.ForEach(x => Console.WriteLine($" {x.Index}. {x.Sample.Name}"));
41 | return;
42 | }
43 |
44 | if (sampleIndex > 0 && sampleIndex <= samples.Count)
45 | {
46 | await samples[sampleIndex - 1].Sample.Func(args.Skip(1).ToArray());
47 | }
48 | else
49 | {
50 | Console.WriteLine($"Invalid sample no. {sampleIndex}.");
51 | }
52 | }
53 |
54 | static nint ResolveLibrary(string libraryName, Assembly assembly, DllImportSearchPath? searchPath)
55 | {
56 | // TODO: Determine which DLL to load here, i.e.:
57 | //if (cpuOnly) libraryName = "CPU-Only.dll";
58 | //else if (nvidiaGpu) libraryName = "nVIDIA-CUDA.dll";
59 | //else if (amdGpu) libraryName = "AMD-ROCm.dll";
60 |
61 | if (NativeLibrary.TryLoad(libraryName, out var handle))
62 | {
63 | return handle;
64 | }
65 |
66 | throw new DllNotFoundException($"Unable to load library: {libraryName}");
67 | }
68 | }
69 |
70 | internal static class Extensions
71 | {
72 | public static string TruncateWithEllipsis(this String text, float percentWidth = 0.75f)
73 | {
74 | var maxWidth = (int)(Console.WindowWidth * percentWidth);
75 | return text.Length > maxWidth ? String.Concat(text.AsSpan(0, maxWidth - 3), "...") : text;
76 | }
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
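Program.cs leaves `ResolveLibrary()` as a TODO behind a commented-out `NativeLibrary.SetDllImportResolver` call. Below is a minimal, illustrative sketch of how such a resolver could choose a native backend at runtime; the candidate library names ("llama-cuda", "llama-cpu") are placeholders, not files shipped by this repository, and only standard `System.Runtime.InteropServices.NativeLibrary` APIs are used.

    using System.Reflection;
    using System.Runtime.InteropServices;

    internal static class NativeBackendLoader
    {
        // Register once at startup, before the first P/Invoke into LlamaCppLib.Native.
        public static void Register() =>
            NativeLibrary.SetDllImportResolver(typeof(LlamaCppLib.Native).Assembly, Resolve);

        static nint Resolve(string libraryName, Assembly assembly, DllImportSearchPath? searchPath)
        {
            // Hypothetical backend names, tried in order of preference.
            foreach (var candidate in new[] { "llama-cuda", "llama-cpu", libraryName })
            {
                if (NativeLibrary.TryLoad(candidate, assembly, searchPath, out var handle))
                    return handle;
            }

            return nint.Zero; // zero defers to the default library resolution
        }
    }
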
/LlamaCppCli/SampleClient.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 | using System.Text.RegularExpressions;
3 |
4 | using LlamaCppLib;
5 |
6 | namespace LlamaCppCli
7 | {
8 | internal partial class Program
9 | {
10 | static async Task RunSampleClientAsync(string[] args)
11 | {
12 | if (args.Length < 1)
13 | {
14 | Console.WriteLine($"Usage: RunSampleClientAsync [] [] []");
15 | return;
16 | }
17 |
18 | var cancellationTokenSource = new CancellationTokenSource();
19 | Console.CancelKeyPress += (s, e) => { cancellationTokenSource.Cancel(); e.Cancel = true; };
20 |
21 | using var client = new LlmClient(args[0]);
22 |
23 | Console.WriteLine($"Available model(s):");
24 | var modelNames = await client.ListAsync();
25 | var state = await client.StateAsync();
26 | modelNames
27 | .Select((x, i) => (Name: x, Index: i))
28 | .ToList()
29 | .ForEach(model => Console.WriteLine($" {model.Index}) {model.Name} {(state.ModelName == model.Name && state.ModelStatus == LlmModelStatus.Loaded ? "(loaded)" : String.Empty)}"));
30 | Console.WriteLine();
31 |
32 | var GetSelection = () =>
33 | {
34 | while (true)
35 | {
36 | Console.Write($"Select model # to load: ");
37 | var key = Console.ReadKey();
38 | Console.WriteLine();
39 |
40 | if (Int32.TryParse($"{key.KeyChar}", out var index) && index >= 0 && index < modelNames.Count)
41 | return index;
42 |
43 | Console.WriteLine();
44 | }
45 | };
46 |
47 | var index = GetSelection();
48 | var unload = state.ModelStatus == LlmModelStatus.Loaded && state.ModelName != modelNames[index];
49 |
50 | if (state.ModelStatus == LlmModelStatus.Loaded && state.ModelName == modelNames[index])
51 | {
52 | Console.Write("Model already loaded, reload [y/N]? ");
53 | var key = Console.ReadKey();
54 | Console.WriteLine();
55 |
56 | if (key.Key == ConsoleKey.Y)
57 | unload = true;
58 | }
59 |
60 | if (unload)
61 | {
62 | Console.Write("Unloading model...");
63 | await client.UnloadAsync();
64 | Console.WriteLine();
65 | state = await client.StateAsync();
66 | }
67 |
68 | var gpuLayers = args.Length > 1 ? Int32.Parse(args[1]) : 0;
69 | var ctxLength = args.Length > 2 ? Int32.Parse(args[2]) : 0;
70 | var ropeFreq = args.Length > 3 ? Single.Parse(args[3]) : 0.0f;
71 |
72 | if (state.ModelStatus == LlmModelStatus.Unloaded)
73 | {
74 | Console.Write("Loading model...");
75 | state = await client.LoadAsync(modelNames[index], new LlmModelOptions { GpuLayers = gpuLayers, ContextLength = ctxLength, RopeFrequeceBase = ropeFreq, UseFlashAttention = true });
76 | Console.WriteLine();
77 | }
78 |
79 | Console.WriteLine($"Model name: {state.ModelName}");
80 | Console.WriteLine($"Model status: {state.ModelStatus}");
81 |
82 | Console.WriteLine();
83 | Console.WriteLine($"Input prompt below (or to load a prompt from file, i.e. '/load \"prompt.txt\"').");
84 | Console.WriteLine($"You can also type '/clear' to erase chat history.");
85 | Console.WriteLine($"To quit, leave input blank and press .");
86 |
87 | var messages = new List<LlmMessage> { new() { Role = "system", Content = "You are a helpful assistant." } };
88 |
89 | while (true)
90 | {
91 | Console.Write("\n> ");
92 | var prompt = (Console.ReadLine() ?? String.Empty).Replace("\\n", "\n");
93 | if (String.IsNullOrWhiteSpace(prompt))
94 | break;
95 |
96 | if (prompt == "/clear")
97 | {
98 | messages = new(messages.Take(1));
99 | continue;
100 | }
101 |
102 | if (prompt == "/messages")
103 | {
104 | foreach (var message in messages)
105 | {
106 | Console.WriteLine($"[{message.Role}][{message.Content}]");
107 | }
108 |
109 | continue;
110 | }
111 |
112 | var match = Regex.Match(prompt, @"/load\s+""?([^""\s]+)""?");
113 | if (match.Success)
114 | {
115 | if (File.Exists(match.Groups[1].Value))
116 | {
117 | Console.WriteLine($"Loading prompt from file \"{Path.GetFullPath(match.Groups[1].Value)}\"...");
118 | prompt = File.ReadAllText(match.Groups[1].Value);
119 | }
120 | else
121 | {
122 | Console.WriteLine($"File not found \"{match.Groups[1].Value}\".");
123 | continue;
124 | }
125 | }
126 |
127 | messages.Add(new() { Role = "user", Content = prompt });
128 | var response = new StringBuilder();
129 |
130 | //var samplingOptions = new SamplingOptions { Temperature = 0.3f, ExtraStopTokens = ["<|EOT|>", "<|end_of_turn|>", "<|endoftext|>", "<|im_end|>"] };
131 | var samplingOptions = new SamplingOptions() /*{ TopK = 50, TopP = 0.95f, Temperature = 0.7f }*/;
132 |
133 | await foreach (var token in client.PromptAsync(messages, samplingOptions, cancellationTokenSource.Token))
134 | {
135 | Console.Write(token);
136 | response.Append(token);
137 | }
138 |
139 | if (cancellationTokenSource.IsCancellationRequested)
140 | {
141 | messages.Remove(messages.Last());
142 |
143 | Console.WriteLine(" [Cancelled]");
144 | cancellationTokenSource = new();
145 | }
146 | else
147 | {
148 | messages.Add(new() { Role = "assistant", Content = response.ToString() });
149 | }
150 | }
151 | }
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
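Stripped of the model selection, history management and file loading shown above, the remote client interaction reduces to a short streaming loop. A minimal sketch using the same calls as the sample (`LlmClient`, `LlmMessage`, `SamplingOptions`, `PromptAsync`); the endpoint URL is a placeholder for wherever LlamaCppWeb is listening.

    using LlamaCppLib;

    // Placeholder endpoint; start LlamaCppWeb first and point this at it.
    using var client = new LlmClient("http://localhost:5021");

    var messages = new List<LlmMessage>
    {
        new() { Role = "system", Content = "You are a helpful assistant." },
        new() { Role = "user", Content = "Hello?" },
    };

    // Tokens arrive as they are sampled; print them as a stream.
    await foreach (var token in client.PromptAsync(messages, new SamplingOptions(), CancellationToken.None))
        Console.Write(token);
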
/LlamaCppCli/SampleEmbedding.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | using static LlamaCppLib.Native;
4 | using static LlamaCppLib.Interop;
5 |
6 | namespace LlamaCppCli
7 | {
8 | using llama_context = System.IntPtr;
9 | using llama_token = System.Int32;
10 | using llama_seq_id = System.Int32;
11 |
12 | internal partial class Program
13 | {
14 | static async Task RunSampleEmbeddingAsync(string[] args)
15 | {
16 | if (args.Length < 1)
17 | {
18 | Console.WriteLine($"Usage: RunSampleEmbeddingAsync [GpuLayers]");
19 | return;
20 | }
21 |
22 | RunSampleEmbedding(args);
23 | await Task.CompletedTask;
24 | }
25 |
26 | // Tested using https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
27 | static unsafe void RunSampleEmbedding(string[] args)
28 | {
29 | var mparams = llama_model_default_params();
30 | mparams.n_gpu_layers = args.Length > 1 ? Int32.Parse(args[1]) : 0;
31 |
32 | var cparams = llama_context_default_params();
33 | cparams.n_ctx = 2048;
34 | cparams.embeddings = true ? 1 : 0;
35 | cparams.pooling_type = _llama_pooling_type.LLAMA_POOLING_TYPE_MEAN;
36 |
37 | // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
38 | var embd_normalize = 2;
39 |
40 | var mdl = llama_load_model_from_file(args[0], mparams);
41 | var ctx = llama_new_context_with_model(mdl, cparams);
42 |
43 | var n_ctx_train = llama_n_ctx_train(mdl);
44 | var n_ctx = llama_n_ctx(ctx);
45 |
46 | var pooling_type = llama_pooling_type(ctx);
47 |
48 | if (llama_model_has_encoder(mdl) && llama_model_has_decoder(mdl))
49 | {
50 | Console.WriteLine("computing embeddings in encoder-decoder models is not supported.");
51 | return;
52 | }
53 |
54 | if (n_ctx > n_ctx_train)
55 | {
56 | Console.WriteLine($"warning: model was trained on only {n_ctx_train} context tokens ({n_ctx} specified)");
57 | }
58 |
59 | var n_batch = (int)cparams.n_batch;
60 | if (n_batch < n_ctx)
61 | {
62 | Console.WriteLine($"error: cparams.n_batch < n_ctx ({cparams.n_batch} < {n_ctx})");
63 | return;
64 | }
65 |
66 | var prompts = new[]
67 | {
68 | "Hello world!",
69 | };
70 |
71 | var inputs = new List<llama_token[]>();
72 | foreach (var prompt in prompts)
73 | {
74 | var inp = llama_tokenize(mdl, Encoding.UTF8.GetBytes(prompt), true, true);
75 | if (inp.Length > cparams.n_batch)
76 | {
77 | Console.WriteLine($"number of tokens in input line ({inp.Length}) exceeds batch size ({cparams.n_batch}), increase batch size and re-run.");
78 | return;
79 | }
80 | inputs.Add(inp);
81 | }
82 |
83 | var n_prompts = prompts.Length;
84 | var batch = llama_batch_init(n_batch, 0, 1);
85 |
86 | var n_embd_count = 0;
87 | if (pooling_type == _llama_pooling_type.LLAMA_POOLING_TYPE_NONE)
88 | {
89 | for (var k = 0; k < n_prompts; k++)
90 | {
91 | n_embd_count += inputs[k].Length;
92 | }
93 | }
94 | else
95 | {
96 | n_embd_count = n_prompts;
97 | }
98 |
99 | var n_embd = llama_n_embd(mdl);
100 | var embeddings = new float[n_embd_count * n_embd];
101 |
102 | fixed (float* emb = &embeddings[0])
103 | {
104 | float* @out = null;
105 | var e = 0;
106 | var s = 0;
107 |
108 | for (var k = 0; k < n_prompts; k++)
109 | {
110 | var inp = inputs[k];
111 | var n_toks = inp.Length;
112 |
113 | if (batch.n_tokens + n_toks > n_batch)
114 | {
115 | @out = emb + e * n_embd;
116 | batch_decode(ctx, ref batch, @out, s, n_embd, embd_normalize);
117 | e += pooling_type == _llama_pooling_type.LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
118 | s = 0;
119 | llama_batch_clear(ref batch);
120 | }
121 |
122 | batch_add_seq(ref batch, inp, s);
123 | s += 1;
124 | }
125 |
126 | @out = emb + e * n_embd;
127 | batch_decode(ctx, ref batch, @out, s, n_embd, embd_normalize);
128 |
129 | if (pooling_type == _llama_pooling_type.LLAMA_POOLING_TYPE_NONE)
130 | {
131 | for (var j = 0; j < n_embd_count; j++)
132 | {
133 | Console.Write($"embedding {j}: ");
134 | for (var i = 0; i < Math.Min(3, n_embd); i++)
135 | {
136 | if (embd_normalize == 0)
137 | {
138 | Console.Write($"{emb[j * n_embd + i],6:0} ");
139 | }
140 | else
141 | {
142 | Console.Write($"{emb[j * n_embd + i],9:F6} ");
143 | }
144 | }
145 | Console.Write(" ... ");
146 | for (var i = n_embd - 3; i < n_embd; i++)
147 | {
148 | if (embd_normalize == 0)
149 | {
150 | Console.Write($"{emb[j * n_embd + i],6:0} ");
151 | }
152 | else
153 | {
154 | Console.Write($"{emb[j * n_embd + i],9:F6} ");
155 | }
156 | }
157 | Console.WriteLine();
158 | }
159 | }
160 | else if (pooling_type == _llama_pooling_type.LLAMA_POOLING_TYPE_RANK)
161 | {
162 | for (var j = 0; j < n_embd_count; j++)
163 | {
164 | Console.WriteLine($"rerank score {j}: {emb[j * n_embd],8:F3}");
165 | }
166 | }
167 | else
168 | {
169 | // print the first part of the embeddings or for a single prompt, the full embedding
170 | for (var j = 0; j < n_prompts; j++)
171 | {
172 | Console.Write($"embedding {j}: ");
173 | for (var i = 0; i < (n_prompts > 1 ? Math.Min(16, n_embd) : n_embd); i++)
174 | {
175 | if (embd_normalize == 0)
176 | {
177 | Console.Write($"{emb[j * n_embd + i],6:0} ");
178 | }
179 | else
180 | {
181 | Console.Write($"{emb[j * n_embd + i],9:F6} ");
182 | }
183 | }
184 | Console.WriteLine();
185 | }
186 |
187 | // print cosine similarity matrix
188 | if (n_prompts > 1)
189 | {
190 | Console.Write("\n");
191 | Console.Write("cosine similarity matrix:\n\n");
192 | for (var i = 0; i < n_prompts; i++)
193 | {
194 | Console.Write($"{prompts[i][..6]} ");
195 | }
196 | Console.WriteLine();
197 | for (var i = 0; i < n_prompts; i++)
198 | {
199 | for (var j = 0; j < n_prompts; j++)
200 | {
201 | var sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
202 | Console.Write($"{sim,6:F2} ");
203 | }
204 | Console.WriteLine($"{prompts[i][..10]}");
205 | }
206 | }
207 | }
208 | }
209 |
210 | //var documents = new[]
211 | //{
212 | // "Carson City is the capital city of the American state of Nevada. At the 2010 United States Census, Carson City had a population of 55,274.",
213 | // "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean that are a political division controlled by the United States. Its capital is Saipan.",
214 | // "Charlotte Amalie is the capital and largest city of the United States Virgin Islands. It has about 20,000 people. The city is on the island of Saint Thomas.",
215 | // "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
216 | // "Proteins are the building blocks of muscle tissue and other important structures in chickens, helping them grow strong and healthy!",
217 | // "Capital punishment (the death penalty) has existed in the United States since before the United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states.",
218 | // "North Dakota is a state in the United States. 672,591 people lived in North Dakota in the year 2010. The capital and seat of government is Bismarck.",
219 | // "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
220 | // "The World Summit on Climate Change is an international conference aimed at addressing global warming and promoting sustainable development efforts around the globe.",
221 | // "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
222 | //};
223 |
224 | //var queries = new[]
225 | //{
226 | // "how much protein should a female eat",
227 | // "summit define",
228 | // "What is the capital of the United States?",
229 | //}.ToList();
230 |
231 | //var documentsEmbeddings = documents
232 | // .Select(x => GetEmbeddings(x).ToArray())
233 | // .ToList();
234 |
235 | //foreach (var query in queries)
236 | //{
237 | // var queryEmbeddings = GetEmbeddings(query).ToArray();
238 |
239 | // var cosineSimilarities = documentsEmbeddings
240 | // .Select(documentEmbeddings => TensorPrimitives.CosineSimilarity(queryEmbeddings, documentEmbeddings))
241 | // .ToList();
242 |
243 | // var topResults = documents
244 | // .Zip(cosineSimilarities, (x, similarity) => new { Document = x, CosineSimilarity = similarity })
245 | // .OrderByDescending(x => x.CosineSimilarity)
246 | // .Take(3)
247 | // .ToList();
248 |
249 | // Console.WriteLine($"\n[{query}]");
250 | // topResults.ForEach(result => Console.WriteLine($" [{result.CosineSimilarity * 100:0.00}%][{result.Document.TruncateWithEllipsis()}]"));
251 | //}
252 |
253 | llama_batch_free(batch);
254 |
255 | llama_free(ctx);
256 | llama_free_model(mdl);
257 | }
258 |
259 | static unsafe void batch_decode(llama_context ctx, ref llama_batch batch, float* output, int n_seq, int n_embd, int embd_norm)
260 | {
261 | var pooling_type = llama_pooling_type(ctx);
262 | var model = llama_get_model(ctx);
263 |
264 | // clear previous kv_cache values (irrelevant for embeddings)
265 | llama_kv_cache_clear(ctx);
266 |
267 | // run model
268 | Console.WriteLine($"n_tokens = {batch.n_tokens}, n_seq = {n_seq}");
269 | if (llama_model_has_encoder(model) && !llama_model_has_decoder(model))
270 | {
271 | // encoder-only model
272 | if (llama_encode(ctx, batch) < 0)
273 | {
274 | Console.WriteLine("failed to encode");
275 | }
276 | }
277 | else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model))
278 | {
279 | // decoder-only model
280 | if (llama_decode(ctx, batch) < 0)
281 | {
282 | Console.WriteLine("failed to decode");
283 | }
284 | }
285 |
286 | for (var i = 0; i < batch.n_tokens; i++)
287 | {
288 | if (batch.logits[i] == 0)
289 | continue;
290 |
291 | float* embd = null;
292 | var embd_pos = 0;
293 |
294 | if (pooling_type == _llama_pooling_type.LLAMA_POOLING_TYPE_NONE)
295 | {
296 | // try to get token embeddings
297 | embd = llama_get_embeddings_ith(ctx, i);
298 | embd_pos = i;
299 | //GGML_ASSERT(embd != NULL && "failed to get token embeddings");
300 | }
301 | else
302 | {
303 | // try to get sequence embeddings - supported only when pooling_type is not NONE
304 | embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
305 | embd_pos = batch.seq_id[i][0];
306 | //GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
307 | }
308 |
309 | float* @out = output + embd_pos * n_embd;
310 | common_embd_normalize(embd, @out, n_embd, embd_norm);
311 | }
312 | }
313 |
314 | static void batch_add_seq(ref llama_batch batch, llama_token[] tokens, llama_seq_id seq_id)
315 | {
316 | var n_tokens = tokens.Length;
317 | for (var i = 0; i < n_tokens; i++)
318 | {
319 | llama_batch_add(ref batch, tokens[i], i, [seq_id], true);
320 | }
321 | }
322 |
323 | static unsafe void common_embd_normalize(float* inp, float* @out, int n, int embd_norm)
324 | {
325 | var sum = 0.0;
326 |
327 | switch (embd_norm)
328 | {
329 | case -1: // no normalisation
330 | sum = 1.0;
331 | break;
332 | case 0: // max absolute
333 | for (var i = 0; i < n; i++)
334 | {
335 | if (sum < Math.Abs(inp[i])) sum = Math.Abs(inp[i]);
336 | }
337 | sum /= 32760.0; // make an int16 range
338 | break;
339 | case 2: // euclidean
340 | for (var i = 0; i < n; i++)
341 | {
342 | sum += inp[i] * inp[i];
343 | }
344 | sum = Math.Sqrt(sum);
345 | break;
346 | default: // p-norm (euclidean is p-norm p=2)
347 | for (var i = 0; i < n; i++)
348 | {
349 | sum += Math.Pow(Math.Abs(inp[i]), embd_norm);
350 | }
351 | sum = Math.Pow(sum, 1.0 / embd_norm);
352 | break;
353 | }
354 |
355 | var norm = (float)(sum > 0.0 ? 1.0 / sum : 0.0f);
356 |
357 | for (var i = 0; i < n; i++)
358 | {
359 | @out[i] = inp[i] * norm;
360 | }
361 | }
362 |
363 | static unsafe float common_embd_similarity_cos(float* embd1, float* embd2, int n)
364 | {
365 | var sum = 0.0;
366 | var sum1 = 0.0;
367 | var sum2 = 0.0;
368 |
369 | for (var i = 0; i < n; i++)
370 | {
371 | sum += embd1[i] * embd2[i];
372 | sum1 += embd1[i] * embd1[i];
373 | sum2 += embd2[i] * embd2[i];
374 | }
375 |
376 | // Handle the case where one or both vectors are zero vectors
377 | if (sum1 == 0.0 || sum2 == 0.0)
378 | {
379 | if (sum1 == 0.0 && sum2 == 0.0)
380 | {
381 | return 1.0f; // two zero vectors are similar
382 | }
383 | return 0.0f;
384 | }
385 |
386 | return (float)(sum / (Math.Sqrt(sum1) * Math.Sqrt(sum2)));
387 | }
388 | }
389 | }
390 |
--------------------------------------------------------------------------------
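Both the commented-out ranking block and `common_embd_similarity_cos` above rely on cosine similarity, i.e. dot(a, b) / (|a| * |b|). A managed-only sketch of the same ranking idea, assuming the embeddings have already been produced as float arrays (for example by the routine above) and using `TensorPrimitives.CosineSimilarity` from the System.Numerics.Tensors package referenced in the commented code:

    using System.Collections.Generic;
    using System.Linq;
    using System.Numerics.Tensors;

    static List<(string Document, float Score)> RankByCosine(
        float[] queryEmbedding,
        IReadOnlyList<(string Text, float[] Embedding)> documents,
        int top = 3)
    {
        // Score every document against the query and keep the best matches.
        return documents
            .Select(d => (Document: d.Text, Score: TensorPrimitives.CosineSimilarity(queryEmbedding, d.Embedding)))
            .OrderByDescending(x => x.Score)
            .Take(top)
            .ToList();
    }
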
/LlamaCppCli/SampleLibrary.cs:
--------------------------------------------------------------------------------
1 | using System.Text.RegularExpressions;
2 | using System.Text;
3 |
4 | using LlamaCppLib;
5 |
6 | namespace LlamaCppCli
7 | {
8 | internal partial class Program
9 | {
10 | static async Task RunSampleLibraryAsync(string[] args)
11 | {
12 | if (args.Length < 1)
13 | {
14 | Console.WriteLine($"Usage: RunSampleLibraryAsync [GpuLayers] [CtxLength]");
15 | return;
16 | }
17 |
18 | var cancellationTokenSource = new CancellationTokenSource();
19 | Console.CancelKeyPress += (s, e) => { cancellationTokenSource.Cancel(); e.Cancel = true; };
20 |
21 | using var llm = new LlmEngine(new LlmEngineOptions { MaxParallel = 8 });
22 | llm.LoadModel(
23 | args[0],
24 | new LlmModelOptions
25 | {
26 | //Seed = 0,
27 | ContextLength = args.Length > 2 ? Int32.Parse(args[2]) : 0,
28 | GpuLayers = args.Length > 1 ? Int32.Parse(args[1]) : 0,
29 | ThreadCount = 8,
30 | BatchThreadCount = 8,
31 | UseFlashAttention = true,
32 | //RopeFrequeceBase = 8000000,
33 | },
34 | (float progress) => { Console.Write($"{new string(' ', 32)}\rLoading model... {progress:0.00}%\r"); }
35 | );
36 |
37 | Console.WriteLine();
38 | Console.WriteLine($"Model context length: {llm.ContextLength}");
39 | Console.WriteLine($"Model training context length: {llm.TrainingContextLength}");
40 | Console.WriteLine($"Model layer count: {llm.LayerCount}");
41 | Console.WriteLine();
42 | Console.WriteLine("Press to cancel or press with an empty input to quit.");
43 |
44 | var messages = new List<LlmMessage> { new() { Role = "system", Content = "You are a helpful assistant." } };
45 |
46 | while (true)
47 | {
48 | if (cancellationTokenSource.IsCancellationRequested)
49 | cancellationTokenSource = new();
50 |
51 | Console.Write($"[{llm.Tokenize(messages).Length}/{llm.ContextLength}]> ");
52 | var promptText = (Console.ReadLine() ?? String.Empty).Replace("\\n", "\n");
53 | if (String.IsNullOrWhiteSpace(promptText))
54 | break;
55 |
56 | // Parallel prompts w/o streaming for multiple files - e.g.
57 | // `/load "prompt_file-1.txt" "prompt_file-2.txt" ...`
58 | var match = Regex.Match(promptText, @"\/load\s+("".*?""(?:\s+|$))+");
59 | var fileNames = match.Success ? Regex.Matches(promptText, "\"(.*?)\"").Select(x => x.Groups[1].Value).ToList() : [];
60 |
61 | if (fileNames.Count > 1)
62 | {
63 | fileNames
64 | .Where(fileName => !File.Exists(fileName))
65 | .ToList()
66 | .ForEach(fileName => Console.WriteLine($"File \"{fileName}\" not found."));
67 |
68 | var promptTasks = fileNames
69 | .Where(File.Exists)
70 | .Select(
71 | fileName => llm.Prompt(
72 | [new LlmMessage { Role = "user", Content = File.ReadAllText(fileName) }],
73 | new SamplingOptions { Temperature = 0.5f }
74 | )
75 | )
76 | .Select(
77 | async prompt =>
78 | {
79 | var response = new List<byte>();
80 |
81 | // In non-streaming mode, we can collect tokens as raw byte arrays and assemble the response at the end
82 | await foreach (var token in prompt.NextToken(cancellationTokenSource.Token))
83 | response.AddRange(token);
84 |
85 | return (Request: prompt, Response: Encoding.UTF8.GetString(response.ToArray()));
86 | }
87 | )
88 | .ToList();
89 |
90 | while (promptTasks.Any())
91 | {
92 | var task = await Task.WhenAny(promptTasks);
93 |
94 | Console.WriteLine(new String('=', Console.WindowWidth));
95 | Console.WriteLine($"Request {task.Result.Request.GetHashCode()} | Prompting {task.Result.Request.PromptingSpeed:F2} t/s | Sampling {task.Result.Request.SamplingSpeed:F2} t/s");
96 | Console.WriteLine(new String('-', Console.WindowWidth));
97 | Console.WriteLine($"{task.Result.Response}{(task.Result.Request.Cancelled ? " [Cancelled]" : "")}");
98 | Console.WriteLine(new String('=', Console.WindowWidth));
99 |
100 | promptTasks.Remove(task);
101 | }
102 |
103 | continue;
104 | }
105 |
106 | // Single prompt w/streaming - e.g.
107 | // `/load "D:\LLM_MODELS\PROMPT.txt"`
108 | // `<|im_start|>system\nYou are an astrophysicist.<|im_end|>\n<|im_start|>user\nDescribe the solar system.<|im_end|>\n<|im_start|>assistant\n`
109 | // `[INST] <<SYS>>\nYou are an astrophysicist.\n<</SYS>>\n\nDescribe the solar system. [/INST]\n`
110 | if (fileNames.Count == 1)
111 | promptText = File.ReadAllText(fileNames[0]);
112 |
113 | messages.Add(new LlmMessage { Role = "user", Content = promptText });
114 | var prompt = llm.Prompt(messages, new SamplingOptions { TopK = 50, TopP = 0.95f, Temperature = 0.7f });
115 |
116 | // In streaming mode, we must re-assemble multibyte characters using a TokenEnumerator
117 | await foreach (var token in new TokenEnumerator(prompt, cancellationTokenSource.Token))
118 | {
119 | if (messages.Last().Role != "assistant")
120 | messages.Add(new() { Role = "assistant" });
121 |
122 | Console.Write(token);
123 | messages.Last().Content += token;
124 | }
125 |
126 | Console.WriteLine($"{(prompt.Cancelled ? " [Cancelled]" : "")}");
127 | Console.WriteLine($"\nPrompting {prompt.PromptingSpeed:F2} t/s | Sampling {prompt.SamplingSpeed:F2} t/s");
128 | }
129 |
130 | Console.WriteLine("Bye.");
131 | }
132 | }
133 | }
134 |
--------------------------------------------------------------------------------
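Reduced to its essentials, the in-process engine usage demonstrated above looks roughly like the sketch below, using the same types as the sample (`LlmEngine`, `LlmEngineOptions`, `LlmModelOptions`, `LlmMessage`, `SamplingOptions`, `TokenEnumerator`); the model path and option values are placeholders.

    using LlamaCppLib;

    using var llm = new LlmEngine(new LlmEngineOptions { MaxParallel = 1 });
    llm.LoadModel("model.gguf", new LlmModelOptions { GpuLayers = 0 }, (float progress) => { });

    var messages = new List<LlmMessage>
    {
        new() { Role = "user", Content = "Describe the solar system." },
    };

    var prompt = llm.Prompt(messages, new SamplingOptions { Temperature = 0.7f });

    // TokenEnumerator re-assembles multibyte characters while streaming.
    await foreach (var token in new TokenEnumerator(prompt, CancellationToken.None))
        Console.Write(token);
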
/LlamaCppCli/SampleMeta.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | using static LlamaCppLib.Native;
4 |
5 | namespace LlamaCppCli
6 | {
7 | internal partial class Program
8 | {
9 | static async Task RunDumpMetaAsync(string[] args)
10 | {
11 | if (args.Length < 1)
12 | {
13 | Console.WriteLine($"Usage: RunDumpMetaAsync [Key]");
14 | return;
15 | }
16 |
17 | RunDumpMeta(args);
18 | await Task.CompletedTask;
19 | }
20 |
21 | static void RunDumpMeta(string[] args)
22 | {
23 | llama_backend_init();
24 | llama_numa_init(ggml_numa_strategy.GGML_NUMA_STRATEGY_DISABLED);
25 |
26 | var mparams = llama_model_default_params();
27 | var model = llama_load_model_from_file(args[0], mparams);
28 | var buffer = new byte[0x100000]; // 1 MiB
29 |
30 | try
31 | {
32 | if (args.Length < 2)
33 | {
34 | for (var i = 0; i < llama_model_meta_count(model); i++)
35 | {
36 | var length = llama_model_meta_key_by_index(model, i, buffer, (nuint)buffer.Length);
37 | var key = Encoding.UTF8.GetString(new ReadOnlySpan<byte>(buffer, 0, length));
38 |
39 | length = llama_model_meta_val_str(model, Encoding.UTF8.GetBytes(key), buffer, (nuint)buffer.Length);
40 | var value = Encoding.UTF8.GetString(new ReadOnlySpan<byte>(buffer, 0, length));
41 |
42 | Console.WriteLine($"[{key}]=[{value}]");
43 | }
44 | }
45 | else
46 | {
47 | var key = args[1];
48 |
49 | var length = llama_model_meta_val_str(model, Encoding.UTF8.GetBytes(key), buffer, (nuint)buffer.Length);
50 | var value = Encoding.UTF8.GetString(new ReadOnlySpan<byte>(buffer, 0, length));
51 |
52 | Console.WriteLine($"[{key}]=[{value}]");
53 | }
54 | }
55 | catch (Exception ex)
56 | {
57 | Console.WriteLine(ex.ToString());
58 | }
59 |
60 | llama_free_model(model);
61 | llama_backend_free();
62 | }
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/LlamaCppCli/SampleRaw.cs:
--------------------------------------------------------------------------------
1 | using System.Diagnostics;
2 | using System.Diagnostics.CodeAnalysis;
3 | using System.Runtime.CompilerServices;
4 | using System.Runtime.InteropServices;
5 | using System.Text;
6 | using System.Text.RegularExpressions;
7 |
8 | using LlamaCppLib;
9 |
10 | using static LlamaCppLib.Native;
11 | using static LlamaCppLib.Interop;
12 |
13 | namespace LlamaCppCli
14 | {
15 | internal partial class Program
16 | {
17 | static async Task RunSampleRawAsync(string[] args)
18 | {
19 | if (args.Length < 1)
20 | {
21 | Console.WriteLine($"Usage: RunSampleRawAsync [GpuLayers] [CtxLength]");
22 | return;
23 | }
24 |
25 | RunSampleRaw(args);
26 | await Task.CompletedTask;
27 | }
28 |
29 | [UnmanagedCallersOnly(CallConvs = [typeof(CallConvCdecl)])]
30 | static unsafe sbyte ProgressCallback(float progress, void* state)
31 | {
32 | Console.Write($"{new string(' ', 32)}\rLoading model... {(byte)(progress * 100)}%\r");
33 | return true ? 1 : 0;
34 | }
35 |
36 | [UnmanagedCallersOnly(CallConvs = [typeof(CallConvCdecl)])]
37 | static unsafe sbyte AbortCallback(void* state)
38 | {
39 | var cancel = (bool?)GCHandle.FromIntPtr(new(state)).Target ?? false;
40 | return (sbyte)(cancel ? 1 : 0);
41 | }
42 |
43 | static unsafe void RunSampleRaw(string[] args)
44 | {
45 | var requests = new List<Request>();
46 | var assembler = new MultibyteCharAssembler();
47 | var stream = true;
48 |
49 | var cancel = false;
50 | var cancel_handle = GCHandle.Alloc(cancel, GCHandleType.Pinned);
51 | Console.CancelKeyPress += (s, e) => { e.Cancel = cancel = true; };
52 |
53 | var sw = Stopwatch.StartNew();
54 | var run = 1;
55 | var tc = 0;
56 |
57 | //================================================================================================================================================================================================
58 |
59 | var seed = unchecked((uint)-1);
60 |
61 | var top_k = 50;
62 | var top_p = 0.95f;
63 | var min_p = 0.05f;
64 | var typical_p = 1.0f;
65 | var temp = 0.7f;
66 |
67 | //var mirostat = 0;
68 | //var mirostat_tau = 5.0f;
69 | //var mirostat_eta = 0.1f;
70 | //var mirostat_m = 100;
71 |
72 | //var penalty_last_n = 64;
73 | //var penalty_repeat = 1.0f;
74 | //var penalty_freq = 0.0f;
75 | //var penalty_present = 0.0f;
76 |
77 | var mparams = llama_model_default_params();
78 | mparams.n_gpu_layers = args.Length > 1 ? Int32.Parse(args[1]) : 0;
79 | mparams.progress_callback = &ProgressCallback;
80 |
81 | var cparams = llama_context_default_params();
82 | cparams.n_ctx = args.Length > 2 ? UInt32.Parse(args[2]) : 0;
83 | cparams.flash_attn = true ? 1 : 0;
84 | cparams.abort_callback = &AbortCallback;
85 | cparams.abort_callback_data = GCHandle.ToIntPtr(cancel_handle).ToPointer();
86 | //cparams.n_batch = 512;
87 | //cparams.n_threads = 8;
88 | //cparams.n_threads_batch = 8;
89 | //cparams.rope_freq_base = 8000000;
90 | //cparams.type_k = ggml_type.GGML_TYPE_F16;
91 | //cparams.type_v = ggml_type.GGML_TYPE_F16;
92 | //cparams.logits_all = false ? 1 : 0;
93 |
94 | var sparams = llama_sampler_chain_default_params();
95 | sparams.no_perf = 0;
96 |
97 | llama_backend_init();
98 | llama_numa_init(ggml_numa_strategy.GGML_NUMA_STRATEGY_DISABLED);
99 |
100 | var mdl = llama_load_model_from_file(args[0], mparams);
101 | var ctx = llama_new_context_with_model(mdl, cparams);
102 | var bat = llama_batch_init((int)llama_n_ctx(ctx), 0, 1);
103 | var spl = llama_sampler_chain_init(sparams);
104 |
105 | if (temp > 0.0f)
106 | {
107 | llama_sampler_chain_add(spl, llama_sampler_init_top_k(top_k));
108 | llama_sampler_chain_add(spl, llama_sampler_init_typical(typical_p, 1));
109 | llama_sampler_chain_add(spl, llama_sampler_init_top_p(top_p, 1));
110 | llama_sampler_chain_add(spl, llama_sampler_init_min_p(min_p, 1));
111 | llama_sampler_chain_add(spl, llama_sampler_init_temp(temp));
112 | llama_sampler_chain_add(spl, llama_sampler_init_dist(seed));
113 | }
114 | else
115 | {
116 | llama_sampler_chain_add(spl, llama_sampler_init_greedy());
117 | }
118 |
119 | var messages = new List<LlmMessage> { new() { Role = "system", Content = "You are a helpful assistant." } };
120 |
121 | while (true)
122 | {
123 | {
124 | cancel = false;
125 |
126 | Console.Write($"\n[{sw.Elapsed}]{(run++ == 1 ? "" : $"[{tc / sw.Elapsed.TotalSeconds:F2} t/s]")}> ");
127 | var line = Console.ReadLine() ?? String.Empty;
128 |
129 | if (line == "q" || line == "quit")
130 | {
131 | Console.WriteLine("Bye.");
132 | break;
133 | }
134 | else if (line == "/clear")
135 | {
136 | messages = new(messages.Take(1));
137 | continue;
138 | }
139 |
140 | var prompt = String.Empty;
141 | var match = Regex.Match(line, @"\/load\s+("".*?""(?:\s+|$))");
142 | if (match.Success)
143 | {
144 | var fileName = Path.GetFullPath(Regex.Match(line, "\"(.*?)\"").Groups[1].Value);
145 | prompt = File.ReadAllText(fileName);
146 | }
147 | else
148 | {
149 | prompt = line.Replace("\\n", "\n");
150 | }
151 |
152 | if (String.IsNullOrWhiteSpace(prompt))
153 | continue;
154 |
155 | messages.Add(new() { Role = "user", Content = prompt });
156 | prompt = llama_apply_template(ctx, messages);
157 |
158 | var tokens = llama_tokenize(mdl, Encoding.UTF8.GetBytes(prompt), llama_add_bos_token(mdl) != 0, true);
159 |
160 | var responseMargin = 512;
161 | Console.WriteLine($"{tokens.Length}/{llama_n_ctx(ctx)} token(s)");
162 | if (tokens.Length >= llama_n_ctx(ctx) - responseMargin)
163 | {
164 | Console.WriteLine($"Out of context (with response margin of {responseMargin}.");
165 | continue;
166 | }
167 |
168 | requests.Add(new Request((int)llama_n_ctx(ctx), tokens) { Messages = messages });
169 |
170 | sw.Restart();
171 | tc = 0;
172 | }
173 |
174 | //============================================================================================================================================================================================
175 |
176 | while (true)
177 | {
178 | llama_batch_clear(ref bat);
179 |
180 | foreach (var request in requests)
181 | {
182 | for (; request.PosBatch < request.PosToken; request.PosBatch++)
183 | llama_batch_add(ref bat, request.Tokens[request.PosBatch], request.PosBatch, [request.Id], false);
184 |
185 | request.PosLogit = bat.n_tokens - 1;
186 | bat.logits[request.PosLogit] = true ? 1 : 0;
187 |
188 | if (request.T0 == default)
189 | request.T0 = DateTime.Now;
190 | }
191 |
192 | if (bat.n_tokens == 0)
193 | break;
194 |
195 | var n_batch = (int)cparams.n_batch;
196 | for (var i = 0; i < bat.n_tokens; i += n_batch)
197 | {
198 | var n_tokens = Math.Min(n_batch, bat.n_tokens - i);
199 |
200 | var res = llama_decode(
201 | ctx,
202 | new llama_batch
203 | {
204 | n_tokens = n_tokens,
205 | token = &bat.token[i],
206 | embd = null,
207 | pos = &bat.pos[i],
208 | n_seq_id = &bat.n_seq_id[i],
209 | seq_id = &bat.seq_id[i],
210 | logits = &bat.logits[i],
211 | }
212 | );
213 |
214 | if (res != 0)
215 | {
216 | Console.WriteLine($"llama_decode() = {res}");
217 | return;
218 | }
219 |
220 | foreach (var request in requests)
221 | {
222 | if (stream && n_tokens > 1)
223 | {
224 | var count = n_tokens + i;
225 | var progress = count / (double)bat.n_tokens * 100;
226 | var elapsed = DateTime.Now - (request.T0 ?? DateTime.Now);
227 | var speed = count / elapsed.TotalSeconds;
228 | var remaining = TimeSpan.FromSeconds((bat.n_tokens - count) / speed);
229 | Console.Write($"{new String(' ', 32)}\rDecoding... {progress:F2}% [C:{count}/{bat.n_tokens}][S:{speed:F2} t/s][E:{elapsed:hh\\:mm\\:ss\\.fff}][R:{remaining:hh\\:mm\\:ss\\.fff}]\r");
230 | if (count == bat.n_tokens) Console.WriteLine();
231 | }
232 |
233 | if (request.PosLogit < i || request.PosLogit >= i + n_tokens)
234 | continue;
235 |
236 | var token = llama_sampler_sample(spl, ctx, request.PosLogit - i);
237 |
238 | if (request.PosResponse == 0)
239 | request.PosResponse = request.PosToken;
240 |
241 | if (cancel)
242 | token = llama_token_eos(mdl); // Override stop token with EOS token
243 |
244 | if (request.PosToken >= request.Tokens.Length)
245 | {
246 | if (stream)
247 | Console.Write(" [Out of context]");
248 |
249 | request.Tokens[request.Tokens.Length - 1] = llama_token_eos(mdl);
250 | break;
251 | }
252 | else
253 | {
254 | request.Tokens[request.PosToken++] = token;
255 | ++tc;
256 |
257 | var tokenText = assembler.Consume(
258 | //llama_detokenize(mdl, [token])
259 | Interop.llama_token_to_piece(mdl, token, true)
260 | );
261 |
262 | if (request.Messages.Last().Role != "assistant")
263 | {
264 | request.Messages.Add(new() { Role = "assistant" });
265 | }
266 |
267 | if (!llama_token_is_eog(mdl, token))
268 | {
269 | request.Messages.Last().Content += tokenText;
270 | }
271 |
272 | if (stream)
273 | {
274 | if (!llama_token_is_eog(mdl, token))
275 | {
276 | Console.Write(tokenText);
277 | }
278 |
279 | if (cancel)
280 | Console.Write(" [Cancelled]");
281 | }
282 | }
283 |
284 | if (request.T1 == default)
285 | request.T1 = DateTime.Now;
286 |
287 | if (llama_token_is_eog(mdl, token))
288 | request.T2 = DateTime.Now;
289 | }
290 | }
291 |
292 | foreach (var r in requests.Where(r => llama_token_is_eog(mdl, r.Tokens[r.PosToken - 1])))
293 | {
294 | llama_kv_cache_seq_rm(ctx, r.Id, 0, -1);
295 |
296 | if (!stream)
297 | {
298 | var promptTokens = r.Tokens.Take(r.PosResponse).SelectMany(token => llama_detokenize(mdl, [token])).ToArray();
299 | var responseTokens = r.Tokens.Skip(r.PosResponse).Take(r.PosToken - r.PosResponse).SelectMany(token => llama_detokenize(mdl, [token], false, true).ToArray()).ToArray();
300 |
301 | Console.WriteLine(new String('=', 128));
302 | Console.WriteLine($"request id {r.Id} [{r.PosToken / r.Elapsed.TotalMilliseconds * 1000:F2} t/s]");
303 | Console.WriteLine(new String('-', 128));
304 | Console.WriteLine(Encoding.UTF8.GetString(promptTokens));
305 | Console.WriteLine(new String('-', 128));
306 | Console.WriteLine(Encoding.UTF8.GetString(responseTokens));
307 | Console.WriteLine(new String('=', 128));
308 | }
309 | else
310 | {
311 | Console.WriteLine();
312 | }
313 | }
314 |
315 | requests.RemoveAll(r => llama_token_is_eog(mdl, r.Tokens[r.PosToken - 1]));
316 | }
317 | }
318 |
319 | cancel_handle.Free();
320 |
321 | llama_sampler_free(spl);
322 | llama_batch_free(bat);
323 | llama_free(ctx);
324 | llama_free_model(mdl);
325 |
326 | llama_backend_free();
327 | }
328 |
329 | static async Task RunSampleStateRawAsync(string[] args)
330 | {
331 | if (args.Length < 1)
332 | {
333 | Console.WriteLine($"Usage: RunSampleStateRawAsync ");
334 | return;
335 | }
336 |
337 | RunSampleStateRaw(args);
338 | await Task.CompletedTask;
339 | }
340 |
341 | static void RunSampleStateRaw(string[] args)
342 | {
343 | var mparams = llama_model_default_params();
344 | mparams.n_gpu_layers = 999;
345 |
346 | var cparams = llama_context_default_params();
347 | cparams.n_ctx = 4096;
348 | cparams.flash_attn = true ? 1 : 0;
349 |
350 | var sparams = llama_sampler_chain_default_params();
351 | sparams.no_perf = true ? 1 : 0;
352 |
353 | var mdl = llama_load_model_from_file(args[0], mparams);
354 |
355 | var seed = 42;
356 |
357 | { // First run
358 | var ctx = llama_new_context_with_model(mdl, cparams);
359 | var bat = llama_batch_init((int)llama_n_ctx(ctx), 0, 1);
360 | var spl = llama_sampler_chain_init(sparams);
361 | llama_sampler_chain_add(spl, llama_sampler_init_dist((uint)seed));
362 |
363 | Console.Write(new String('=', Console.WindowWidth));
364 |
365 | var messages = new List<LlmMessage>
366 | {
367 | new() { Role = "system", Content = "You are a helpful assistant." },
368 | new() { Role = "user", Content = "Hello?" }
369 | };
370 |
371 | var prompt = llama_apply_template(ctx, messages);
372 | var tokens = llama_tokenize(mdl, Encoding.UTF8.GetBytes(prompt), llama_add_bos_token(mdl) != 0, true);
373 |
374 | var n_past = 0;
375 | var run = 0;
376 |
377 | while (true)
378 | {
379 | llama_batch_clear(ref bat);
380 |
381 | for (var i = 0; i < tokens.Length; i++)
382 | llama_batch_add(ref bat, tokens[i], n_past++, [0], i == tokens.Length - 1);
383 |
384 | llama_decode(ctx, bat);
385 |
386 | if (++run == 1)
387 | {
388 | var state_mem = new byte[(int)llama_state_get_size(ctx)];
389 | var written = (int)llama_state_get_data(ctx, state_mem, (nuint)state_mem.Length);
390 | File.WriteAllBytes($"dump_state_{n_past}.bin", new Span<byte>(state_mem, 0, written).ToArray());
391 | }
392 |
393 | tokens = [llama_sampler_sample(spl, ctx, -1)];
394 | if (tokens[0] == llama_token_eos(mdl))
395 | break;
396 |
397 | var piece = llama_token_to_piece(mdl, tokens[0], true);
398 | Console.Write(Encoding.UTF8.GetString(piece));
399 | }
400 |
401 | llama_sampler_free(spl);
402 | llama_batch_free(bat);
403 | llama_free(ctx);
404 |
405 | Console.Write($"\n{new String('=', Console.WindowWidth)}");
406 | }
407 |
408 | { // Second run
409 | var ctx = llama_new_context_with_model(mdl, cparams);
410 | var bat = llama_batch_init((int)llama_n_ctx(ctx), 0, 1);
411 | var spl = llama_sampler_chain_init(sparams);
412 | llama_sampler_chain_add(spl, llama_sampler_init_dist((uint)seed));
413 |
414 | Console.Write(new String('=', Console.WindowWidth));
415 |
416 | var n_past = 0;
417 | {
418 | var path = Directory.EnumerateFiles(".", "dump_state_*.bin").Single();
419 | n_past = Int32.Parse(Regex.Match(path, @"dump_state_(?<n_past>\d+)\.bin").Groups["n_past"].Value);
420 | var state_mem = File.ReadAllBytes(path) ?? [];
421 | llama_state_set_data(ctx, state_mem, (nuint)state_mem.Length);
422 | }
423 |
424 | var tokens = new int[0];
425 |
426 | while (true)
427 | {
428 | tokens = [llama_sampler_sample(spl, ctx, -1)];
429 | if (tokens[0] == llama_token_eos(mdl))
430 | break;
431 |
432 | var piece = llama_token_to_piece(mdl, tokens[0], true);
433 | Console.Write(Encoding.UTF8.GetString(piece));
434 |
435 | llama_batch_clear(ref bat);
436 | llama_batch_add(ref bat, tokens[0], n_past++, [0], true);
437 |
438 | llama_decode(ctx, bat);
439 | }
440 |
441 | llama_sampler_free(spl);
442 | llama_batch_free(bat);
443 | llama_free(ctx);
444 |
445 | Console.Write($"\n{new String('=', Console.WindowWidth)}");
446 | }
447 |
448 | llama_free_model(mdl);
449 | }
450 | }
451 |
452 | file class Request : IEquatable<Request>
453 | {
454 | public int Id { get; set; }
455 | public List<LlmMessage> Messages { get; set; } = [];
456 |
457 | public int PosBatch { get; set; }
458 | public int PosLogit { get; set; }
459 |
460 | public int PosResponse { get; set; }
461 | public int PosToken { get; set; }
462 | public int[] Tokens { get; set; }
463 |
464 | public float MirostatMU = 0.0f;
465 |
466 | public DateTime? T0 { get; set; } // Decoding
467 | public DateTime? T1 { get; set; } // Sampling
468 | public DateTime? T2 { get; set; } // End
469 | public TimeSpan Elapsed => (T2 - T1) ?? TimeSpan.FromSeconds(0);
470 |
471 | public Request(int n_ctx, ReadOnlySpan<int> tokens)
472 | {
473 | this.Tokens = new int[n_ctx];
474 | tokens.CopyTo(Tokens);
475 | this.PosToken += tokens.Length;
476 | }
477 |
478 | public override bool Equals([NotNullWhen(true)] object? obj) => obj is Request request && Equals(request);
479 | public override int GetHashCode() => Id.GetHashCode();
480 |
481 | // IEquatable<Request>
482 | public bool Equals(Request? other) => other?.Id == this.Id;
483 | }
484 | }
485 |
--------------------------------------------------------------------------------
/LlamaCppLib/BlockingQueue.cs:
--------------------------------------------------------------------------------
1 | namespace LlamaCppLib
2 | {
3 | public class BlockingQueue<T>
4 | {
5 | private readonly Queue<T> _queue;
6 | private readonly ManualResetEvent _event = new(false);
7 |
8 | public BlockingQueue() => _queue = new Queue<T>();
9 |
10 | public void Enqueue(T item)
11 | {
12 | _queue.Enqueue(item);
13 | _event.Set();
14 | }
15 |
16 | public T Dequeue(CancellationToken? cancellationToken = default)
17 | {
18 | if (_queue.Count == 0)
19 | WaitForNext(cancellationToken);
20 |
21 | if (_queue.Count == 1)
22 | _event.Reset();
23 |
24 | return _queue.Dequeue();
25 | }
26 |
27 | public bool Any() => _queue.Count > 0;
28 |
29 | public void WaitForNext(CancellationToken? cancellationToken = default) => WaitHandle.WaitAny(new[] { _event, (cancellationToken ?? new()).WaitHandle });
30 | }
31 | }
32 |
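A minimal usage sketch for BlockingQueue<T> (illustrative, not part of the repository): one producer enqueues items while a single consumer blocks in WaitForNext/Dequeue until an item arrives or the supplied token is cancelled, mirroring how LlmEngine drains queued prompts.

    using LlamaCppLib;

    var queue = new BlockingQueue<string>();
    using var cts = new CancellationTokenSource();

    // Producer: pushes a few items, then signals the consumer to stop.
    var producer = Task.Run(() =>
    {
        foreach (var item in new[] { "alpha", "beta", "gamma" })
            queue.Enqueue(item);
        cts.Cancel();
    });

    // Single consumer: WaitForNext returns when an item arrives or the token is signalled.
    while (!cts.IsCancellationRequested || queue.Any())
    {
        if (!queue.Any())
        {
            queue.WaitForNext(cts.Token);
            continue;
        }
        Console.Write($"{queue.Dequeue(cts.Token)} ");
    }

    await producer;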
--------------------------------------------------------------------------------
/LlamaCppLib/Extensions.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace LlamaCppLib
4 | {
5 | public static class Extensions
6 | {
7 | private static Encoding? _utf8;
8 |
9 | public static bool TryGetUtf8String(this byte[] bytes, out string? str)
10 | {
11 | if (_utf8 == null)
12 | {
13 | _utf8 = (Encoding)Encoding.UTF8.Clone();
14 | _utf8.DecoderFallback = new DecoderExceptionFallback();
15 | }
16 |
17 | try
18 | {
19 | _utf8.DecoderFallback = new DecoderExceptionFallback();
20 | str = _utf8.GetString(bytes);
21 | return true;
22 | }
23 | catch (DecoderFallbackException)
24 | {
25 | str = null;
26 | return false;
27 | }
28 | }
29 |
30 | public static Task<HttpResponseMessage> PostAsync(
31 | this HttpClient client,
32 | string? requestUri,
33 | HttpContent? content,
34 | HttpCompletionOption? completionOption = default,
35 | CancellationToken? cancellationToken = default)
36 | {
37 | return client.SendAsync(
38 | new HttpRequestMessage(HttpMethod.Post, requestUri) { Content = content },
39 | completionOption ?? HttpCompletionOption.ResponseContentRead,
40 | cancellationToken ?? default
41 | );
42 | }
43 |
44 | public static Task<HttpResponseMessage> PostAsync(
45 | this HttpClient client,
46 | Uri? requestUri,
47 | HttpContent? content,
48 | HttpCompletionOption? completionOption = default,
49 | CancellationToken? cancellationToken = default)
50 | {
51 | return client.SendAsync(
52 | new HttpRequestMessage(HttpMethod.Post, requestUri) { Content = content },
53 | completionOption ?? HttpCompletionOption.ResponseContentRead,
54 | cancellationToken ?? default
55 | );
56 | }
57 | }
58 | }
59 |
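A short sketch of TryGetUtf8String on complete versus truncated input (illustrative only); the truncated buffer ends in a dangling UTF-8 lead byte, so decoding hits the exception fallback and the method returns false.

    using System.Text;
    using LlamaCppLib;

    var complete = Encoding.UTF8.GetBytes("café");            // 5 bytes; 'é' encodes as C3 A9
    Console.WriteLine(complete.TryGetUtf8String(out var ok) ? ok : "<invalid>");     // café

    var truncated = complete[..^1];                            // cuts the 'é' sequence in half
    Console.WriteLine(truncated.TryGetUtf8String(out var bad) ? bad : "<invalid>");  // <invalid>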
--------------------------------------------------------------------------------
/LlamaCppLib/Interop.cs:
--------------------------------------------------------------------------------
1 | using System.Buffers;
2 | using System.Text;
3 |
4 | namespace LlamaCppLib
5 | {
6 | using llama_model = System.IntPtr;
7 | using llama_context = System.IntPtr;
8 | using llama_token = System.Int32;
9 | using llama_pos = System.Int32;
10 | using llama_seq_id = System.Int32;
11 |
12 | public static unsafe class Interop
13 | {
14 | public static void llama_batch_add(ref Native.llama_batch batch, llama_token id, llama_pos pos, llama_seq_id[] seq_ids, bool logits)
15 | {
16 | batch.token[batch.n_tokens] = id;
17 | batch.pos[batch.n_tokens] = pos;
18 | batch.n_seq_id[batch.n_tokens] = seq_ids.Length;
19 |
20 | for (var i = 0; i < seq_ids.Length; ++i)
21 | batch.seq_id[batch.n_tokens][i] = seq_ids[i];
22 |
23 | batch.logits[batch.n_tokens] = (sbyte)(logits ? 1 : 0);
24 | batch.n_tokens++;
25 | }
26 |
27 | public static void llama_batch_clear(ref Native.llama_batch batch)
28 | {
29 | batch.n_tokens = 0;
30 | }
31 |
32 | public static int[] llama_tokenize(llama_model model, byte[] text, bool add_special = false, bool parse_special = false)
33 | {
34 | var length = -Native.llama_tokenize(model, text, text.Length, [], 0, add_special, parse_special);
35 |
36 | var tokens = new int[length];
37 | Native.llama_tokenize(model, text, text.Length, tokens, tokens.Length, add_special, parse_special);
38 |
39 | return tokens;
40 | }
41 |
42 | public static byte[] llama_detokenize(llama_model model, int[] tokens, bool remove_special = false, bool unparse_special = false)
43 | {
44 | var length = -Native.llama_detokenize(model, tokens, tokens.Length, [], 0, remove_special, unparse_special);
45 |
46 | var text = new byte[length];
47 | Native.llama_detokenize(model, tokens, tokens.Length, text, text.Length, remove_special, unparse_special);
48 |
49 | return text;
50 | }
51 |
52 | private static byte[] _bytes = new byte[1024];
53 |
54 | public static byte[] llama_token_to_piece(llama_model model, int token, bool special)
55 | {
56 | var count = Native.llama_token_to_piece(model, token, _bytes, _bytes.Length, 0, special);
57 | return _bytes[0..count];
58 | }
59 |
60 | public static unsafe string llama_apply_template(llama_context context, List<LlmMessage> messages, bool appendAssistant = true)
61 | {
62 | var encoding = Encoding.UTF8;
63 |
64 | var chat = new Native.llama_chat_message[messages.Count];
65 |
66 | var pinnedRoles = new Memory<byte>[messages.Count];
67 | var pinnedContents = new Memory<byte>[messages.Count];
68 |
69 | var roleHandles = new MemoryHandle[messages.Count];
70 | var contentHandles = new MemoryHandle[messages.Count];
71 |
72 | try
73 | {
74 | for (var i = 0; i < messages.Count; i++)
75 | {
76 | pinnedRoles[i] = encoding.GetBytes(messages[i].Role ?? String.Empty);
77 | pinnedContents[i] = encoding.GetBytes(messages[i].Content ?? String.Empty);
78 |
79 | roleHandles[i] = pinnedRoles[i].Pin();
80 | contentHandles[i] = pinnedContents[i].Pin();
81 |
82 | chat[i] = new()
83 | {
84 | role = (byte*)roleHandles[i].Pointer,
85 | content = (byte*)contentHandles[i].Pointer
86 | };
87 | }
88 |
89 | var buffer = new byte[Native.llama_n_ctx(context) * 8];
90 | var length = Native.llama_chat_apply_template(Native.llama_get_model(context), null, chat, (nuint)chat.Length, appendAssistant, buffer, buffer.Length);
91 | var text = encoding.GetString(buffer, 0, length);
92 |
93 | return text;
94 | }
95 | finally
96 | {
97 | for (var i = 0; i < messages.Count; i++)
98 | {
99 | roleHandles[i].Dispose();
100 | contentHandles[i].Dispose();
101 | }
102 | }
103 | }
104 | }
105 | }
106 |
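A minimal tokenize/detokenize round trip using the helpers above (a sketch only; the model path is a placeholder assumption).

    using System.Text;
    using static LlamaCppLib.Native;
    using static LlamaCppLib.Interop;

    llama_backend_init();

    var mdl = llama_load_model_from_file("model.gguf", llama_model_default_params()); // hypothetical path

    var tokens = llama_tokenize(mdl, Encoding.UTF8.GetBytes("Hello, world!"), add_special: true, parse_special: true);
    var bytes = llama_detokenize(mdl, tokens, remove_special: true);
    Console.WriteLine($"{tokens.Length} tokens -> \"{Encoding.UTF8.GetString(bytes)}\"");

    llama_free_model(mdl);
    llama_backend_free();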
--------------------------------------------------------------------------------
/LlamaCppLib/LlamaCppLib.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net8.0
5 | enable
6 | enable
7 | ..
8 | preview
9 | false
10 | false
11 | true
12 | WINDOWS
13 | LINUX
14 | MACOS
15 | $([MSBuild]::IsOSPlatform('Windows'))
16 | $([MSBuild]::IsOSPlatform('Linux'))
17 | $([MSBuild]::IsOSPlatform('OSX'))
18 |
19 |
20 |
21 |
22 | $(VsInstallRoot)
23 | "$(VsInstallRoot)\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe"
24 | "$([System.IO.Path]::GetFullPath('$(VS170COMNTOOLS)\..\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe'))"
25 |
26 |
27 | cmake
28 |
29 |
30 | $([System.IO.Path]::GetFullPath('$(MSBuildProjectDirectory)\..\llama.cpp'))
31 |
32 |
33 |
34 |
35 |
36 | -DCMAKE_CXX_FLAGS="/W0 /EHsc /w /D _MBCS" -DCMAKE_C_FLAGS="/W0 /w"
37 | -DCMAKE_CXX_FLAGS=-w -DCMAKE_C_FLAGS=-w
38 | -DGGML_CCACHE=OFF
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 | *.dll
52 | lib*.so
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/LlamaCppLib/LlmClient.cs:
--------------------------------------------------------------------------------
1 | using System.Net.Http.Json;
2 | using System.Runtime.CompilerServices;
3 | using System.Text;
4 | using System.Text.Json.Serialization;
5 | using System.Text.RegularExpressions;
6 |
7 | namespace LlamaCppLib
8 | {
9 | [JsonConverter(typeof(JsonStringEnumConverter))]
10 | public enum LlmModelStatus { Unknown, Unloaded, Loaded };
11 |
12 | public class LlmStateResponse
13 | {
14 | public string? ModelName { get; set; }
15 | public LlmModelStatus? ModelStatus { get; set; }
16 | }
17 |
18 | public class LlmLoadRequest
19 | {
20 | public string? ModelName { get; set; }
21 | public LlmModelOptions? ModelOptions { get; set; }
22 | }
23 |
24 | public class LlmPromptRequest
25 | {
26 | public List<LlmMessage>? Messages { get; set; }
27 | public SamplingOptions? SamplingOptions { get; set; }
28 | }
29 |
30 | public class LlmClient : IDisposable
31 | {
32 | private HttpClient _httpClient = new();
33 | private readonly Uri _baseUri;
34 |
35 | public LlmClient(string uri) : this(new Uri(uri))
36 | {
37 | _httpClient.Timeout = TimeSpan.FromHours(1);
38 | }
39 |
40 | public LlmClient(Uri uri) => _baseUri = uri;
41 |
42 | public void Dispose() => _httpClient.Dispose();
43 |
44 | public async Task<List<string>> ListAsync()
45 | {
46 | using var response = await _httpClient.GetAsync(new Uri(_baseUri, $"/list"));
47 | return await response.Content.ReadFromJsonAsync<List<string>>() ?? new();
48 | }
49 |
50 | public async Task<LlmStateResponse> StateAsync()
51 | {
52 | using var response = await _httpClient.GetAsync(new Uri(_baseUri, $"/state"));
53 | return (await response.Content.ReadFromJsonAsync<LlmStateResponse>()) ?? new();
54 | }
55 |
56 | public async Task<LlmStateResponse> LoadAsync(string modelName, LlmModelOptions? options = default)
57 | {
58 | using var response = await _httpClient.PostAsync(new Uri(_baseUri, $"/load"), JsonContent.Create(new { ModelName = modelName, ModelOptions = options ?? new() }));
59 | return (await response.Content.ReadFromJsonAsync<LlmStateResponse>()) ?? new();
60 | }
61 |
62 | public async Task<LlmStateResponse> UnloadAsync()
63 | {
64 | using var response = await _httpClient.GetAsync(new Uri(_baseUri, $"/unload"));
65 | return (await response.Content.ReadFromJsonAsync<LlmStateResponse>()) ?? new();
66 | }
67 |
68 | public async IAsyncEnumerable<string> PromptAsync(List<LlmMessage> messages, SamplingOptions? samplingOptions = default, [EnumeratorCancellation] CancellationToken cancellationToken = default)
69 | {
70 | using var response = await _httpClient.PostAsync(
71 | new Uri(_baseUri, $"/prompt"),
72 | JsonContent.Create(new { Messages = messages, SamplingOptions = samplingOptions }),
73 | HttpCompletionOption.ResponseHeadersRead,
74 | cancellationToken
75 | );
76 |
77 | await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
78 | using var reader = new StreamReader(stream);
79 |
80 | while (!reader.EndOfStream && !cancellationToken.IsCancellationRequested)
81 | {
82 | var data = await reader.ReadLineAsync(cancellationToken) ?? String.Empty;
83 | yield return Encoding.UTF8.GetString(Convert.FromBase64String(Regex.Replace(data, @"^data: |\n\n$", String.Empty)));
84 | }
85 | }
86 | }
87 | }
88 |
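A minimal client sketch against a running LlamaCppWeb instance (the base address and model name below are assumptions, not values from the repository).

    using LlamaCppLib;

    using var client = new LlmClient("http://localhost:5021"); // hypothetical endpoint

    Console.WriteLine(String.Join("\n", await client.ListAsync()));
    await client.LoadAsync("my-model", new LlmModelOptions { GpuLayers = 999 }); // hypothetical model name

    var messages = new List<LlmMessage>
    {
        new() { Role = "system", Content = "You are a helpful assistant." },
        new() { Role = "user", Content = "Hello?" },
    };

    await foreach (var token in client.PromptAsync(messages, new SamplingOptions { Temperature = 0.3f }))
        Console.Write(token);

    await client.UnloadAsync();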
--------------------------------------------------------------------------------
/LlamaCppLib/LlmEngine.cs:
--------------------------------------------------------------------------------
1 | using System.Runtime.CompilerServices;
2 | using System.Runtime.InteropServices;
3 | using System.Text;
4 |
5 | using static LlamaCppLib.Native;
6 | using static LlamaCppLib.Interop;
7 |
8 | namespace LlamaCppLib
9 | {
10 | public class LlmEngine : IDisposable
11 | {
12 | private bool _disposed = default;
13 |
14 | private UnmanagedResource _backend = new();
15 | private UnmanagedResource<nint> _model = new();
16 | private UnmanagedResource<nint> _context = new();
17 | private UnmanagedResource<nint> _sampler = new();
18 | private UnmanagedResource<llama_batch> _batch = new();
19 |
20 | private LlmEngineOptions _engineOptions = new();
21 | private LlmModelOptions _modelOptions = new();
22 |
23 | private BlockingQueue<LlmPrompt> _prompts = new();
24 |
25 | private CancellationTokenSource _cancellationTokenSource = new();
26 | private UnmanagedResource<GCHandle> _cancellationTokenHandle = new();
27 |
28 | private Task? _mainLoop = default;
29 |
30 | public LlmEngine(LlmEngineOptions? engineOptions = default)
31 | {
32 | if (engineOptions != default)
33 | _engineOptions = engineOptions;
34 | }
35 |
36 | ~LlmEngine() => Dispose(false);
37 |
38 | protected virtual void Dispose(bool disposing)
39 | {
40 | if (!_disposed)
41 | {
42 | if (disposing)
43 | {
44 | // Managed
45 | _StopAsync().Wait();
46 | }
47 |
48 | // Unmanaged
49 | _batch.Dispose();
50 | _sampler.Dispose();
51 | _context.Dispose();
52 | _model.Dispose();
53 | _backend.Dispose();
54 |
55 | _disposed = true;
56 | }
57 | }
58 |
59 | public void Dispose()
60 | {
61 | Dispose(true);
62 | GC.SuppressFinalize(this);
63 | }
64 |
65 | public unsafe void LoadModel(string modelPath, LlmModelOptions? modelOptions = default, Action<float>? progressCallback = default, bool waitForMainLoop = true)
66 | {
67 | if (_model.Created)
68 | throw new InvalidOperationException("Model already loaded.");
69 |
70 | if (modelOptions != default)
71 | _modelOptions = modelOptions;
72 |
73 | if (!_backend.Created)
74 | {
75 | _backend.Create(() =>
76 | {
77 | llama_backend_init();
78 | llama_numa_init(_engineOptions.NumaOptimizations ? ggml_numa_strategy.GGML_NUMA_STRATEGY_DISTRIBUTE : ggml_numa_strategy.GGML_NUMA_STRATEGY_DISABLED);
79 | }, llama_backend_free);
80 | }
81 |
82 | var mparams = llama_model_default_params();
83 | mparams.n_gpu_layers = _modelOptions.GpuLayers;
84 | mparams.use_mmap = (sbyte)(_modelOptions.UseMemoryMap ? 1 : 0);
85 |
86 | using var progressCallbackHandle = new UnmanagedResource<GCHandle>();
87 | if (progressCallback != default)
88 | {
89 | progressCallbackHandle.Create(() => GCHandle.Alloc(progressCallback), handle => handle.Free());
90 | mparams.progress_callback = &LlmEngine._ProgressCallback;
91 | mparams.progress_callback_user_data = GCHandle.ToIntPtr(progressCallbackHandle.Handle).ToPointer();
92 | }
93 |
94 | _model.Create(() => llama_load_model_from_file(modelPath, mparams), llama_free_model);
95 |
96 | var cparams = llama_context_default_params();
97 | cparams.n_ctx = (uint)_modelOptions.ContextLength;
98 | cparams.n_batch = (uint)_modelOptions.BatchSize;
99 | cparams.n_threads = _modelOptions.ThreadCount;
100 | cparams.n_threads_batch = _modelOptions.BatchThreadCount;
101 | cparams.flash_attn = (sbyte)(_modelOptions.UseFlashAttention ? 1 : 0);
102 | cparams.rope_freq_base = _modelOptions.RopeFrequeceBase;
103 | cparams.rope_freq_scale = _modelOptions.RopeFrequenceScale;
104 |
105 | _cancellationTokenHandle.Create(() => GCHandle.Alloc(_cancellationTokenSource.Token), handle => handle.Free());
106 | cparams.abort_callback = &AbortCallback;
107 | cparams.abort_callback_data = GCHandle.ToIntPtr(_cancellationTokenHandle.Handle).ToPointer();
108 |
109 | _context.Create(() => llama_new_context_with_model(_model.Handle, cparams), llama_free);
110 |
111 | var sparams = llama_sampler_chain_default_params();
112 | sparams.no_perf = 0;
113 |
114 | _sampler.Create(() => llama_sampler_chain_init(sparams), llama_sampler_free);
115 |
116 | _batch.Create(() => llama_batch_init((int)llama_n_ctx(_context.Handle), 0, 1), llama_batch_free);
117 |
118 | _StartAsync();
119 |
120 | if (waitForMainLoop)
121 | {
122 | while (!Loaded)
123 | {
124 | Task.Delay(TimeSpan.FromMilliseconds(100)).Wait();
125 | }
126 | }
127 | }
128 |
129 | public void UnloadModel()
130 | {
131 | _StopAsync().Wait();
132 |
133 | _batch.Dispose();
134 | _sampler.Dispose();
135 | _context.Dispose();
136 | _model.Dispose();
137 | }
138 |
139 | public Span<int> Tokenize(string prompt, bool prependBosToken = false, bool processSpecialTokens = false) => llama_tokenize(_model.Handle, Encoding.UTF8.GetBytes(prompt), prependBosToken, processSpecialTokens);
140 |
141 | public Span<int> Tokenize(List<LlmMessage> messages, bool prependBosToken = false, bool processSpecialTokens = false)
142 | {
143 | var text = llama_apply_template(_context.Handle, messages);
144 | return Tokenize(text, prependBosToken, processSpecialTokens);
145 | }
146 |
147 | public nint ModelNativeHandle { get => _model.Handle; }
148 | public nint ContextNativeHandle { get => _context.Handle; }
149 |
150 | public bool Loaded => _mainLoop?.Status == TaskStatus.Running;
151 |
152 | public int ContextLength => Loaded ? (int)llama_n_ctx(_context.Handle) : 0;
153 | public int TrainingContextLength => Loaded ? llama_n_ctx_train(_model.Handle) : 0;
154 | public int LayerCount => Loaded ? llama_n_layer(_model.Handle) : 0;
155 |
156 | public LlmPrompt Prompt(
157 | List<LlmMessage>? messages,
158 | SamplingOptions? samplingOptions = default
159 | )
160 | {
161 | if (messages == null)
162 | {
163 | throw new InvalidDataException("No prompt was provided.");
164 | }
165 |
166 | var prompt = new LlmPrompt(
167 | messages ?? [],
168 | samplingOptions ?? new()
169 | );
170 |
171 | _prompts.Enqueue(prompt);
172 |
173 | return prompt;
174 | }
175 |
176 | [UnmanagedCallersOnly(CallConvs = [typeof(CallConvCdecl)])]
177 | private static unsafe sbyte _ProgressCallback(float progress, void* state)
178 | {
179 | var callback = (Action<float>?)GCHandle.FromIntPtr(new(state)).Target;
180 | callback?.Invoke(progress * 100);
181 | return true ? 1 : 0;
182 | }
183 |
184 | [UnmanagedCallersOnly(CallConvs = [typeof(CallConvCdecl)])]
185 | static unsafe sbyte AbortCallback(void* state)
186 | {
187 | var cancellationToken = (CancellationToken?)GCHandle.FromIntPtr(new(state)).Target;
188 | return (sbyte)(cancellationToken?.IsCancellationRequested ?? false ? 1 : 0);
189 | }
190 |
191 | private void _StartAsync()
192 | {
193 | if (_mainLoop != default)
194 | return;
195 |
196 | _mainLoop = Task.Run(_Run);
197 | }
198 |
199 | private async Task _StopAsync()
200 | {
201 | if (_mainLoop == default)
202 | return;
203 |
204 | _cancellationTokenSource.Cancel();
205 | await (_mainLoop ?? Task.CompletedTask).ConfigureAwait(false);
206 | _cancellationTokenSource = new();
207 |
208 | _mainLoop = default;
209 | }
210 |
211 | private unsafe void _Run()
212 | {
213 | _batch.GetResource(out var batch);
214 |
215 | var sequences = new Slots<LlmSequence>(_engineOptions.MaxParallel);
216 |
217 | //var candidates = new llama_token_data[llama_n_vocab(_model.Handle)];
218 | var batchView = new llama_batch();
219 |
220 | var cancellationToken = _cancellationTokenSource.Token;
221 | while (!cancellationToken.IsCancellationRequested)
222 | {
223 | // Fill as many sequence slots as possible given pending requests
224 | while (sequences.HasFreeSlot && _prompts.Any())
225 | {
226 | if (cancellationToken.IsCancellationRequested)
227 | break;
228 |
229 | var prompt = _prompts.Dequeue(cancellationToken);
230 |
231 | var extraStopTokens = prompt.SamplingOptions.ExtraStopTokens?
232 | .Select(tokenText => Tokenize(tokenText, false, true).ToArray())
233 | .Where(tokens => tokens.Length == 1)
234 | .Select(tokens => tokens.Single())
235 | .ToArray();
236 |
237 | var ctxLength = (int)llama_n_ctx(_context.Handle);
238 | var tokens = Tokenize(prompt.Messages, llama_add_bos_token(_model.Handle) > 0, true);
239 |
240 | // TODO: implement proper error handling/logging
241 | if (tokens.Length < ctxLength - 512)
242 | {
243 | var sequence = new LlmSequence(prompt, ctxLength, tokens, extraStopTokens) { T1 = DateTime.Now };
244 | var id = sequences.Add(sequence);
245 | sequence.Id = id;
246 | }
247 | else
248 | {
249 | Console.WriteLine($"ERROR: Sequence token limit reached, {tokens.Length} > ({ctxLength} - 512)");
250 | }
251 | }
252 |
253 | if (cancellationToken.IsCancellationRequested)
254 | continue;
255 |
256 | batch.n_tokens = 0;
257 |
258 | foreach (var sequence in sequences)
259 | {
260 | if (cancellationToken.IsCancellationRequested)
261 | break;
262 |
263 | for (; sequence.PosBatch < sequence.PosTokens; sequence.PosBatch++)
264 | llama_batch_add(ref batch, sequence.Tokens[sequence.PosBatch], sequence.PosBatch, [sequence.Id], false);
265 |
266 | sequence.PosLogit = batch.n_tokens - 1;
267 | batch.logits[sequence.PosLogit] = true ? 1 : 0;
268 | }
269 |
270 | if (cancellationToken.IsCancellationRequested)
271 | continue;
272 |
273 | if (batch.n_tokens == 0)
274 | {
275 | _prompts.WaitForNext(cancellationToken);
276 | continue;
277 | }
278 |
279 | var batchSize = _modelOptions.BatchSize;
280 | for (var i = 0; i < batch.n_tokens; i += batchSize)
281 | {
282 | var n_tokens = Math.Min(batchSize, batch.n_tokens - i);
283 |
284 | batchView.n_tokens = n_tokens;
285 | batchView.token = batch.token + i;
286 | batchView.embd = null;
287 | batchView.pos = batch.pos + i;
288 | batchView.n_seq_id = batch.n_seq_id + i;
289 | batchView.seq_id = batch.seq_id + i;
290 | batchView.logits = batch.logits + i;
291 |
292 | var result = llama_decode(_context.Handle, batchView);
293 |
294 | if (cancellationToken.IsCancellationRequested)
295 | break;
296 |
297 | if (result != 0)
298 | {
299 | foreach (var sequence in sequences)
300 | sequence.Prompt.TokenChannel.Writer.Complete(new InsufficientMemoryException());
301 |
302 | sequences.RemoveAll(sequence => true);
303 | llama_kv_cache_clear(_context.Handle);
304 |
305 | continue;
306 | }
307 |
308 | foreach (var sequence in sequences)
309 | {
310 | if (cancellationToken.IsCancellationRequested)
311 | break;
312 |
313 | if (sequence.PosLogit < i || sequence.PosLogit >= i + n_tokens)
314 | continue;
315 |
316 | // This isn't a fully dynamic sampling chain per sequence, but ideally here we would confirm whether
317 | // we need to reset the sampler (i.e. by comparing the current chain with the requested chain).
318 | // For now, this is just a static default temperature chain vs greedy sampling based on temperature.
319 | llama_sampler_reset(_sampler.Handle);
320 | if (sequence.SamplingOptions.Temperature > 0.0f)
321 | {
322 | // TODO: Add new DRY sampler
323 | llama_sampler_chain_add(_sampler.Handle, llama_sampler_init_top_k(sequence.SamplingOptions.TopK));
324 | llama_sampler_chain_add(_sampler.Handle, llama_sampler_init_typical(sequence.SamplingOptions.TypicalP, 1));
325 | llama_sampler_chain_add(_sampler.Handle, llama_sampler_init_top_p(sequence.SamplingOptions.TopP, 1));
326 | llama_sampler_chain_add(_sampler.Handle, llama_sampler_init_min_p(sequence.SamplingOptions.MinP, 1));
327 | // TODO: Add new XTC sampler
328 | llama_sampler_chain_add(_sampler.Handle, llama_sampler_init_temp(sequence.SamplingOptions.Temperature));
329 | llama_sampler_chain_add(_sampler.Handle, llama_sampler_init_dist((uint)sequence.SamplingOptions.Seed));
330 | }
331 | else
332 | {
333 | llama_sampler_chain_add(_sampler.Handle, llama_sampler_init_greedy());
334 | }
335 |
336 | if (cancellationToken.IsCancellationRequested)
337 | continue;
338 |
339 | var token = llama_sampler_sample(_sampler.Handle, _context.Handle, sequence.PosLogit - i);
340 |
341 | if (sequence.T2 == default)
342 | {
343 | sequence.T2 = DateTime.Now;
344 | sequence.Prompt.PromptingSpeed = sequence.PosResponse / ((sequence.T2 - sequence.T1) ?? new()).TotalSeconds;
345 | }
346 |
347 | var stop = false
348 | || sequence.PosTokens >= sequence.Tokens.Length - 1
349 | || sequence.PosTokens - sequence.PosResponse >= sequence.SamplingOptions.ResponseMaxTokenCount
350 | || (sequence.StopTokens?.Contains(token) ?? false)
351 | || llama_token_is_eog(_model.Handle, token);
352 |
353 | if (!stop)
354 | {
355 | sequence.Prompt.TokenChannel.Writer.TryWrite(
356 | //llama_detokenize(_model.Handle, [token])
357 | Interop.llama_token_to_piece(_model.Handle, token, true)
358 | );
359 |
360 | sequence.Tokens[sequence.PosTokens++] = token;
361 | }
362 |
363 | if (sequence.Prompt.Cancelled || stop)
364 | {
365 | sequence.T3 = DateTime.Now;
366 | sequence.Prompt.SamplingSpeed = (sequence.PosTokens - sequence.PosResponse - 1) / ((sequence.T3 - sequence.T2) ?? new()).TotalSeconds;
367 |
368 | if (sequence.Prompt.Cancelled)
369 | sequence.Prompt.TokenChannel.Writer.Complete(new OperationCanceledException());
370 | else if (stop)
371 | sequence.Prompt.TokenChannel.Writer.Complete();
372 |
373 | llama_kv_cache_seq_rm(_context.Handle, sequence.Id, -1, -1);
374 | sequences.Remove(sequence.Id);
375 | }
376 | }
377 | }
378 | }
379 |
380 | if (cancellationToken.IsCancellationRequested)
381 | {
382 | // Notify outstanding requests of cancellation
383 | foreach (var sequence in sequences)
384 | sequence.Prompt.TokenChannel.Writer.Complete(new OperationCanceledException());
385 | }
386 | }
387 | }
388 | }
389 |
--------------------------------------------------------------------------------
/LlamaCppLib/LlmPrompt.cs:
--------------------------------------------------------------------------------
1 | using System.Runtime.CompilerServices;
2 | using System.Threading.Channels;
3 |
4 | namespace LlamaCppLib
5 | {
6 | public class LlmMessage
7 | {
8 | public string? Role { get; set; }
9 | public string? Content { get; set; }
10 | }
11 |
12 | public class LlmPrompt
13 | {
14 | public LlmPrompt(List<LlmMessage> messages)
15 | {
16 | this.Messages = messages;
17 | this.TokenChannel = Channel.CreateUnbounded<byte[]>();
18 | }
19 |
20 | public LlmPrompt(List<LlmMessage> messages, SamplingOptions samplingOptions) : this(messages)
21 | {
22 | this.SamplingOptions = samplingOptions;
23 | }
24 |
25 | public bool Cancelled { get; private set; }
26 |
27 | public SamplingOptions SamplingOptions { get; private set; } = new();
28 | public List<LlmMessage> Messages { get; private set; }
29 |
30 | public Channel<byte[]> TokenChannel { get; private set; }
31 |
32 | public async IAsyncEnumerable<byte[]> NextToken([EnumeratorCancellation] CancellationToken cancellationToken = default)
33 | {
34 | var result = default(byte[]?);
35 |
36 | while (true)
37 | {
38 | try
39 | {
40 | if ((result = await this.TokenChannel.Reader.ReadAsync(cancellationToken)).Length == 0)
41 | break;
42 | }
43 | catch (OperationCanceledException)
44 | {
45 | this.Cancelled = true;
46 | break;
47 | }
48 | catch (ChannelClosedException)
49 | {
50 | break;
51 | }
52 |
53 | yield return result;
54 | }
55 | }
56 |
57 | public double PromptingSpeed { get; set; }
58 | public double SamplingSpeed { get; set; }
59 | }
60 |
61 | public class TokenEnumerator : IAsyncEnumerable<string>
62 | {
63 | private MultibyteCharAssembler _assembler = new();
64 | private LlmPrompt _prompt;
65 | private CancellationToken? _cancellationToken;
66 |
67 | public TokenEnumerator(LlmPrompt prompt, CancellationToken? cancellationToken = default)
68 | {
69 | _prompt = prompt;
70 | _cancellationToken = cancellationToken;
71 | }
72 |
73 | public async IAsyncEnumerator<string> GetAsyncEnumerator(CancellationToken cancellationToken = default)
74 | {
75 | var ct = _cancellationToken != null
76 | ? CancellationTokenSource.CreateLinkedTokenSource(_cancellationToken.Value, cancellationToken).Token
77 | : cancellationToken;
78 |
79 | await foreach (var token in _prompt.NextToken(ct))
80 | yield return _assembler.Consume(token);
81 |
82 | yield return _assembler.Consume();
83 | }
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/LlamaCppLib/LlmSequence.cs:
--------------------------------------------------------------------------------
1 | using System.Diagnostics.CodeAnalysis;
2 |
3 | namespace LlamaCppLib
4 | {
5 | internal class LlmSequence : IEquatable<LlmSequence>
6 | {
7 | public int Id { get; set; }
8 |
9 | public int PosBatch { get; set; }
10 | public int PosLogit { get; set; }
11 |
12 | public int PosTokens { get; set; }
13 | public int PosResponse { get; set; }
14 | public int[] Tokens { get; set; }
15 | public int[] StopTokens { get; set; }
16 |
17 | public SamplingOptions SamplingOptions { get; set; } = new();
18 | public int MirostatM { get; private set; }
19 | public float MirostatMu = 0.0f;
20 |
21 | public DateTime? T1 { get; set; }
22 | public DateTime? T2 { get; set; }
23 | public DateTime? T3 { get; set; }
24 |
25 | public LlmPrompt Prompt { get; private set; }
26 |
27 | public LlmSequence(LlmPrompt prompt, int tokenCount, ReadOnlySpan<int> tokens, ReadOnlySpan<int> stopTokens, int mirostatM = 100)
28 | {
29 | this.Tokens = new int[tokenCount];
30 | tokens.CopyTo(Tokens);
31 |
32 | this.StopTokens = new int[stopTokens.Length];
33 | stopTokens.CopyTo(StopTokens);
34 |
35 | this.MirostatM = mirostatM;
36 | this.Prompt = prompt;
37 | this.SamplingOptions = prompt.SamplingOptions;
38 |
39 | PosTokens += tokens.Length;
40 | PosResponse = PosTokens;
41 | }
42 |
43 | public override bool Equals([NotNullWhen(true)] object? obj) => obj is LlmSequence request && Equals(request);
44 | public override int GetHashCode() => Id.GetHashCode();
45 |
46 | // IEquatable<LlmSequence>
47 | public bool Equals(LlmSequence? other) => other?.Id == this.Id;
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/LlamaCppLib/MultibyteCharAssembler.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace LlamaCppLib
4 | {
5 | public class MultibyteCharAssembler
6 | {
7 | private List<byte> _buffer = new();
8 |
9 | public string Consume(Span<byte> bytes)
10 | {
11 | var result = new StringBuilder();
12 |
13 | _buffer.AddRange(bytes.ToArray());
14 | while (_buffer.Count > 0)
15 | {
16 | var validUtf8Length = _Find(_buffer.ToArray());
17 | if (validUtf8Length == 0)
18 | break;
19 |
20 | result.Append(Encoding.UTF8.GetString(_buffer.GetRange(0, validUtf8Length).ToArray()));
21 | _buffer.RemoveRange(0, validUtf8Length);
22 | }
23 |
24 | return result.ToString();
25 | }
26 |
27 | public string Consume()
28 | {
29 | if (_buffer.Count == 0)
30 | return String.Empty;
31 |
32 | var result = Encoding.UTF8.GetString(_buffer.ToArray());
33 | _buffer.Clear();
34 | return result;
35 | }
36 |
37 | private int _Find(byte[] bytes)
38 | {
39 | var index = 0;
40 | while (index < bytes.Length)
41 | {
42 | var byteCount = _Count(bytes[index]);
43 | if (index + byteCount > bytes.Length)
44 | break;
45 |
46 | index += byteCount;
47 | }
48 |
49 | return index;
50 | }
51 |
52 | private int _Count(byte startByte)
53 | {
54 | return startByte switch
55 | {
56 | _ when (startByte & 0x80) == 0x00 => 1, // 1-byte character
57 | _ when (startByte & 0xE0) == 0xC0 => 2, // 2-byte character
58 | _ when (startByte & 0xF0) == 0xE0 => 3, // 3-byte character
59 | _ when (startByte & 0xF8) == 0xF0 => 4, // 4-byte character
60 | _ => 0 // Invalid start-byte
61 | };
62 | }
63 | }
64 | }
65 |
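A small sketch of the assembler handling a multi-byte character split across chunks, which is exactly what happens when a model emits an emoji or accented character as several token pieces.

    using System.Text;
    using LlamaCppLib;

    var assembler = new MultibyteCharAssembler();
    var utf8 = Encoding.UTF8.GetBytes("naïve 🙂");
    var text = new StringBuilder();

    // Feed one byte at a time, as if token pieces arrived split mid-character;
    // partial sequences are buffered and only emitted once complete.
    foreach (var b in utf8)
        text.Append(assembler.Consume(new[] { b }));

    text.Append(assembler.Consume()); // flush whatever is left in the buffer
    Console.WriteLine(text);          // naïve 🙂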
--------------------------------------------------------------------------------
/LlamaCppLib/Native.cs:
--------------------------------------------------------------------------------
1 | using System.Runtime.InteropServices;
2 |
3 | namespace LlamaCppLib
4 | {
5 | using llama_model = System.IntPtr;
6 | using llama_context = System.IntPtr;
7 | using llama_sampler = System.IntPtr;
8 | using llama_token = System.Int32;
9 | using llama_pos = System.Int32;
10 | using llama_seq_id = System.Int32;
11 |
12 | // ggml.h
13 |
14 | using unsafe ggml_backend_sched_eval_callback = delegate* unmanaged[Cdecl]<void*, sbyte, void*, sbyte>;
15 | using unsafe ggml_abort_callback = delegate* unmanaged[Cdecl]<void*, sbyte>;
16 |
17 | // llama.h
18 |
19 | using unsafe llama_progress_callback = delegate* unmanaged[Cdecl]<float, void*, sbyte>;
20 |
21 | public static unsafe partial class Native
22 | {
23 | #if WINDOWS
24 | private const string LibName = $"{nameof(LlamaCppLib)}/llama";
25 | #elif LINUX || MACOS
26 | private const string LibName = $"{nameof(LlamaCppLib)}/libllama";
27 | #endif
28 |
29 | // ggml.h
30 |
31 | public enum ggml_type
32 | {
33 | GGML_TYPE_F32 = 0,
34 | GGML_TYPE_F16 = 1,
35 | GGML_TYPE_Q4_0 = 2,
36 | GGML_TYPE_Q4_1 = 3,
37 | // GGML_TYPE_Q4_2 = 4, // removed
38 | // GGML_TYPE_Q4_3 = 5, // removed
39 | GGML_TYPE_Q5_0 = 6,
40 | GGML_TYPE_Q5_1 = 7,
41 | GGML_TYPE_Q8_0 = 8,
42 | GGML_TYPE_Q8_1 = 9,
43 | GGML_TYPE_Q2_K = 10,
44 | GGML_TYPE_Q3_K = 11,
45 | GGML_TYPE_Q4_K = 12,
46 | GGML_TYPE_Q5_K = 13,
47 | GGML_TYPE_Q6_K = 14,
48 | GGML_TYPE_Q8_K = 15,
49 | GGML_TYPE_IQ2_XXS = 16,
50 | GGML_TYPE_IQ2_XS = 17,
51 | GGML_TYPE_IQ3_XXS = 18,
52 | GGML_TYPE_IQ1_S = 19,
53 | GGML_TYPE_IQ4_NL = 20,
54 | GGML_TYPE_IQ3_S = 21,
55 | GGML_TYPE_IQ2_S = 22,
56 | GGML_TYPE_IQ4_XS = 23,
57 | GGML_TYPE_I8 = 24,
58 | GGML_TYPE_I16 = 25,
59 | GGML_TYPE_I32 = 26,
60 | GGML_TYPE_I64 = 27,
61 | GGML_TYPE_F64 = 28,
62 | GGML_TYPE_IQ1_M = 29,
63 | GGML_TYPE_BF16 = 30,
64 | GGML_TYPE_Q4_0_4_4 = 31,
65 | GGML_TYPE_Q4_0_4_8 = 32,
66 | GGML_TYPE_Q4_0_8_8 = 33,
67 | GGML_TYPE_TQ1_0 = 34,
68 | GGML_TYPE_TQ2_0 = 35,
69 | GGML_TYPE_COUNT,
70 | }
71 |
72 | public enum ggml_numa_strategy
73 | {
74 | GGML_NUMA_STRATEGY_DISABLED = 0,
75 | GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
76 | GGML_NUMA_STRATEGY_ISOLATE = 2,
77 | GGML_NUMA_STRATEGY_NUMACTL = 3,
78 | GGML_NUMA_STRATEGY_MIRROR = 4,
79 | GGML_NUMA_STRATEGY_COUNT
80 | }
81 |
82 | // llama.h
83 |
84 | public enum _llama_vocab_type
85 | {
86 | LLAMA_VOCAB_TYPE_NONE = 0,
87 | LLAMA_VOCAB_TYPE_SPM = 1,
88 | LLAMA_VOCAB_TYPE_BPE = 2,
89 | LLAMA_VOCAB_TYPE_WPM = 3,
90 | LLAMA_VOCAB_TYPE_UGM = 4,
91 | LLAMA_VOCAB_TYPE_RWKV = 5,
92 | };
93 |
94 | public enum llama_rope_scaling_type
95 | {
96 | LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
97 | LLAMA_ROPE_SCALING_TYPE_NONE = 0,
98 | LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
99 | LLAMA_ROPE_SCALING_TYPE_YARN = 2,
100 | LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
101 | }
102 |
103 | public enum _llama_pooling_type
104 | {
105 | LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
106 | LLAMA_POOLING_TYPE_NONE = 0,
107 | LLAMA_POOLING_TYPE_MEAN = 1,
108 | LLAMA_POOLING_TYPE_CLS = 2,
109 | LLAMA_POOLING_TYPE_LAST = 3,
110 | LLAMA_POOLING_TYPE_RANK = 4,
111 | }
112 |
113 | public enum llama_attention_type
114 | {
115 | LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
116 | LLAMA_ATTENTION_TYPE_CAUSAL = 0,
117 | LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
118 | }
119 |
120 | public enum llama_split_mode
121 | {
122 | LLAMA_SPLIT_NONE = 0,
123 | LLAMA_SPLIT_LAYER = 1,
124 | LLAMA_SPLIT_ROW = 2,
125 | }
126 |
127 | [StructLayout(LayoutKind.Sequential)]
128 | public struct llama_batch
129 | {
130 | public int n_tokens;
131 |
132 | public llama_token* token;
133 | public float* embd;
134 | public llama_pos* pos;
135 | public int* n_seq_id;
136 | public llama_seq_id** seq_id;
137 | public sbyte* logits;
138 | }
139 |
140 | public enum llama_model_kv_override_type
141 | {
142 | LLAMA_KV_OVERRIDE_TYPE_INT,
143 | LLAMA_KV_OVERRIDE_TYPE_FLOAT,
144 | LLAMA_KV_OVERRIDE_TYPE_BOOL,
145 | LLAMA_KV_OVERRIDE_TYPE_STR,
146 | };
147 |
148 | [StructLayout(LayoutKind.Explicit)]
149 | public struct llama_model_kv_override_value
150 | {
151 | [FieldOffset(0)] public long val_i64;
152 | [FieldOffset(0)] public double val_f64;
153 | [FieldOffset(0)] public sbyte val_bool;
154 | [FieldOffset(0)] public fixed byte val_str[128];
155 | }
156 |
157 | [StructLayout(LayoutKind.Sequential)]
158 | public struct llama_model_kv_override
159 | {
160 | public llama_model_kv_override_type tag;
161 | public fixed byte key[128];
162 | public llama_model_kv_override_value value;
163 | }
164 |
165 | [StructLayout(LayoutKind.Sequential)]
166 | public struct llama_model_params
167 | {
168 | public void* devices;
169 |
170 | public int n_gpu_layers;
171 | public llama_split_mode split_mode;
172 |
173 | public int main_gpu;
174 |
175 | public readonly float* tensor_split;
176 |
177 | public byte* rpc_servers;
178 |
179 | public llama_progress_callback progress_callback;
180 |
181 | public void* progress_callback_user_data;
182 |
183 | public readonly llama_model_kv_override* kv_overrides;
184 |
185 | public sbyte vocab_only;
186 | public sbyte use_mmap;
187 | public sbyte use_mlock;
188 | public sbyte check_tensors;
189 | }
190 |
191 | [StructLayout(LayoutKind.Sequential)]
192 | public struct llama_context_params
193 | {
194 | public uint n_ctx;
195 | public uint n_batch;
196 | public uint n_ubatch;
197 | public uint n_seq_max;
198 | public int n_threads;
199 | public int n_threads_batch;
200 |
201 | public llama_rope_scaling_type rope_scaling_type;
202 | public _llama_pooling_type pooling_type;
203 | public llama_attention_type attention_type;
204 |
205 | public float rope_freq_base;
206 | public float rope_freq_scale;
207 | public float yarn_ext_factor;
208 | public float yarn_attn_factor;
209 | public float yarn_beta_fast;
210 | public float yarn_beta_slow;
211 | public uint yarn_orig_ctx;
212 | public float defrag_thold;
213 |
214 | public ggml_backend_sched_eval_callback cb_eval;
215 | public void* cb_eval_user_data;
216 |
217 | public ggml_type type_k;
218 | public ggml_type type_v;
219 |
220 | public sbyte logits_all;
221 | public sbyte embeddings;
222 | public sbyte offload_kqv;
223 | public sbyte flash_attn;
224 | public sbyte no_perf;
225 |
226 | public ggml_abort_callback abort_callback;
227 | public void* abort_callback_data;
228 | }
229 |
230 | [StructLayout(LayoutKind.Sequential)]
231 | public struct llama_sampler_chain_params
232 | {
233 | public sbyte no_perf;
234 | }
235 |
236 | [StructLayout(LayoutKind.Sequential)]
237 | public struct llama_chat_message
238 | {
239 | public byte* role;
240 | public byte* content;
241 | }
242 |
243 | [LibraryImport(LibName)]
244 | public static partial llama_model_params llama_model_default_params();
245 |
246 | [LibraryImport(LibName)]
247 | public static partial llama_context_params llama_context_default_params();
248 |
249 | [LibraryImport(LibName)]
250 | public static partial llama_sampler_chain_params llama_sampler_chain_default_params();
251 |
252 | [LibraryImport(LibName)]
253 | public static partial void llama_backend_init();
254 |
255 | [LibraryImport(LibName)]
256 | public static partial void llama_numa_init(
257 | ggml_numa_strategy numa);
258 |
259 | [LibraryImport(LibName)]
260 | public static partial void llama_backend_free();
261 |
262 | [LibraryImport(LibName)]
263 | public static partial llama_model llama_load_model_from_file(
264 | [MarshalAs(UnmanagedType.LPStr)] string path_model,
265 | llama_model_params mparams);
266 |
267 | [LibraryImport(LibName)]
268 | public static partial void llama_free_model(
269 | llama_model model);
270 |
271 | [LibraryImport(LibName)]
272 | public static partial llama_context llama_new_context_with_model(
273 | llama_model model,
274 | llama_context_params cparams);
275 |
276 | [LibraryImport(LibName)]
277 | public static partial void llama_free(
278 | llama_context ctx);
279 |
280 | [LibraryImport(LibName)]
281 | public static partial uint llama_n_ctx(
282 | llama_context ctx);
283 |
284 | [LibraryImport(LibName)]
285 | public static partial uint llama_n_batch(
286 | llama_context ctx);
287 |
288 | [LibraryImport(LibName)]
289 | public static partial uint llama_n_ubatch(
290 | llama_context ctx);
291 |
292 | [LibraryImport(LibName)]
293 | public static partial uint llama_n_seq_max(
294 | llama_context ctx);
295 |
296 | [LibraryImport(LibName)]
297 | public static partial int llama_n_vocab(
298 | llama_model model);
299 |
300 | [LibraryImport(LibName)]
301 | public static partial int llama_n_ctx_train(
302 | llama_model model);
303 |
304 | [LibraryImport(LibName)]
305 | public static partial int llama_n_embd(
306 | llama_model model);
307 |
308 | [LibraryImport(LibName)]
309 | public static partial int llama_n_layer(
310 | llama_model model);
311 |
312 | [LibraryImport(LibName)]
313 | public static partial llama_model llama_get_model(
314 | llama_context ctx);
315 |
316 | [LibraryImport(LibName)]
317 | public static partial _llama_pooling_type llama_pooling_type(
318 | llama_context ctx);
319 |
320 | [LibraryImport(LibName)]
321 | public static partial _llama_vocab_type llama_vocab_type(
322 | llama_model model);
323 |
324 | [LibraryImport(LibName)]
325 | public static partial int llama_model_meta_val_str(
326 | llama_model model,
327 | [In] byte[] key,
328 | [In, Out] byte[] buf,
329 | nuint buf_size);
330 |
331 | [LibraryImport(LibName)]
332 | public static partial int llama_model_meta_count(
333 | llama_model model);
334 |
335 | [LibraryImport(LibName)]
336 | public static partial int llama_model_meta_key_by_index(
337 | llama_model model,
338 | int i,
339 | [In, Out] byte[] buf,
340 | nuint buf_size);
341 |
342 | [LibraryImport(LibName)]
343 | public static partial int llama_model_meta_val_str_by_index(
344 | llama_model model,
345 | int i,
346 | [In, Out] byte[] buf,
347 | nuint buf_size);
348 |
349 | [LibraryImport(LibName)]
350 | [return: MarshalAs(UnmanagedType.I1)]
351 | public static partial bool llama_model_has_encoder(
352 | llama_model model);
353 |
354 | [LibraryImport(LibName)]
355 | [return: MarshalAs(UnmanagedType.I1)]
356 | public static partial bool llama_model_has_decoder(
357 | llama_model model);
358 |
359 | //
360 | // KV cache
361 | //
362 |
363 | [LibraryImport(LibName)]
364 | public static partial void llama_kv_cache_clear(
365 | llama_context ctx);
366 |
367 | [LibraryImport(LibName)]
368 | [return: MarshalAs(UnmanagedType.I1)]
369 | public static partial bool llama_kv_cache_seq_rm(
370 | llama_context ctx,
371 | llama_seq_id seq_id,
372 | llama_pos p0,
373 | llama_pos p1);
374 |
375 | //
376 | // State / sessions
377 | //
378 |
379 | [LibraryImport(LibName)]
380 | public static partial nuint llama_state_get_size(
381 | llama_context ctx);
382 |
383 | [LibraryImport(LibName)]
384 | public static partial nuint llama_state_get_data(
385 | llama_context ctx,
386 | [In, Out] byte[] dst,
387 | nuint size);
388 |
389 | [LibraryImport(LibName)]
390 | public static partial nuint llama_state_set_data(
391 | llama_context ctx,
392 | [In] byte[] src,
393 | nuint size);
394 |
395 | [LibraryImport(LibName)]
396 | [return: MarshalAs(UnmanagedType.I1)]
397 | public static partial bool llama_state_load_file(
398 | llama_context ctx,
399 | [In] byte[] path_session,
400 | [In, Out] llama_token[] tokens_out,
401 | nuint n_token_capacity,
402 | ref nuint n_token_count_out);
403 |
404 | [LibraryImport(LibName)]
405 | [return: MarshalAs(UnmanagedType.I1)]
406 | public static partial bool llama_state_save_file(
407 | llama_context ctx,
408 | [In] byte[] path_session,
409 | [In] llama_token[] tokens,
410 | nuint n_token_count);
411 |
412 | [LibraryImport(LibName)]
413 | public static partial nuint llama_state_seq_get_size(
414 | llama_context ctx,
415 | llama_seq_id seq_id);
416 |
417 | [LibraryImport(LibName)]
418 | public static partial nuint llama_state_seq_get_data(
419 | llama_context ctx,
420 | [In, Out] byte[] dst,
421 | nuint size,
422 | llama_seq_id seq_id);
423 |
424 | [LibraryImport(LibName)]
425 | public static partial nuint llama_state_seq_set_data(
426 | llama_context ctx,
427 | [In] byte[] src,
428 | nuint size,
429 | llama_seq_id dest_seq_id);
430 |
431 | [LibraryImport(LibName)]
432 | public static partial nuint llama_state_seq_save_file(
433 | llama_context ctx,
434 | [In] byte[] filepath,
435 | llama_seq_id seq_id,
436 | [In] llama_token[] tokens,
437 | nuint n_token_count);
438 |
439 | [LibraryImport(LibName)]
440 | public static partial nuint llama_state_seq_load_file(
441 | llama_context ctx,
442 | [In] byte[] filepath,
443 | llama_seq_id dest_seq_id,
444 | [In, Out] llama_token[] tokens_out,
445 | nuint n_token_capacity,
446 | ref nuint n_token_count_out);
447 |
448 | //
449 | // Decoding
450 | //
451 |
452 | [LibraryImport(LibName)]
453 | public static partial llama_batch llama_batch_init(
454 | int n_tokens,
455 | int embd,
456 | int n_seq_max);
457 |
458 | [LibraryImport(LibName)]
459 | public static partial void llama_batch_free(
460 | llama_batch batch);
461 |
462 | [LibraryImport(LibName)]
463 | public static partial int llama_encode(
464 | llama_context ctx,
465 | llama_batch batch);
466 |
467 | [LibraryImport(LibName)]
468 | public static partial int llama_decode(
469 | llama_context ctx,
470 | llama_batch batch);
471 |
472 | [LibraryImport(LibName)]
473 | public static partial void llama_set_embeddings(
474 | llama_context ctx,
475 | [MarshalAs(UnmanagedType.I1)] bool embeddings);
476 |
477 | [LibraryImport(LibName)]
478 | public static partial void llama_set_causal_attn(
479 | llama_context ctx,
480 | [MarshalAs(UnmanagedType.I1)] bool causal_attn);
481 |
482 | [LibraryImport(LibName)]
483 | public static partial float* llama_get_embeddings(
484 | llama_context ctx);
485 |
486 | [LibraryImport(LibName)]
487 | public static partial float* llama_get_embeddings_ith(
488 | llama_context ctx,
489 | int i);
490 |
491 | [LibraryImport(LibName)]
492 | public static partial float* llama_get_embeddings_seq(
493 | llama_context ctx,
494 | llama_seq_id seq_id);
495 |
496 | //
497 | // Vocab
498 | //
499 |
500 | [LibraryImport(LibName)]
501 | [return: MarshalAs(UnmanagedType.I1)]
502 | public static partial bool llama_token_is_eog(
503 | llama_model model,
504 | llama_token token);
505 |
506 | [LibraryImport(LibName)]
507 | public static partial llama_token llama_token_eos(
508 | llama_model model);
509 |
510 | [LibraryImport(LibName)]
511 | public static partial int llama_add_bos_token(
512 | llama_model model);
513 |
514 | [LibraryImport(LibName)]
515 | public static partial int llama_add_eos_token(
516 | llama_model model);
517 |
518 | //
519 | // Tokenization
520 | //
521 |
522 | [LibraryImport(LibName)]
523 | public static partial int llama_tokenize(
524 | llama_model model,
525 | [In] byte[] text,
526 | int text_len,
527 | [In, Out] llama_token[] tokens,
528 | int n_tokens_max,
529 | [MarshalAs(UnmanagedType.I1)] bool add_special,
530 | [MarshalAs(UnmanagedType.I1)] bool parse_special);
531 |
532 | [LibraryImport(LibName)]
533 | public static partial int llama_token_to_piece(
534 | llama_model model,
535 | llama_token token,
536 | [In, Out] byte[] buf,
537 | int length,
538 | int lstrip,
539 | [MarshalAs(UnmanagedType.I1)] bool special);
540 |
541 | [LibraryImport(LibName)]
542 | public static partial int llama_detokenize(
543 | llama_model model,
544 | [In] llama_token[] tokens,
545 | int n_tokens,
546 | [In, Out] byte[] text,
547 | int text_len_max,
548 | [MarshalAs(UnmanagedType.I1)] bool remove_special,
549 | [MarshalAs(UnmanagedType.I1)] bool unparse_special);
550 |
551 | //
552 | // Chat templates
553 | //
554 |
555 | [LibraryImport(LibName)]
556 | public static partial int llama_chat_apply_template(
557 | nint model,
558 | [In] byte[]? tmpl,
559 | [In] llama_chat_message[] chat,
560 | nuint n_msg,
561 | [MarshalAs(UnmanagedType.I1)] bool add_ass,
562 | [In, Out] byte[] buf,
563 | int length);
564 |
565 | //
566 | // Sampling API
567 | //
568 |
569 | [LibraryImport(LibName)]
570 | public static partial void llama_sampler_reset(
571 | llama_sampler smpl);
572 |
573 | [LibraryImport(LibName)]
574 | public static partial void llama_sampler_free(
575 | nint smpl);
576 |
577 | [LibraryImport(LibName)]
578 | public static partial llama_sampler llama_sampler_chain_init(
579 | llama_sampler_chain_params sparams);
580 |
581 | [LibraryImport(LibName)]
582 | public static partial void llama_sampler_chain_add(
583 | llama_sampler chain,
584 | llama_sampler smpl);
585 |
586 | [LibraryImport(LibName)]
587 | public static partial llama_sampler llama_sampler_init_greedy();
588 |
589 | [LibraryImport(LibName)]
590 | public static partial llama_sampler llama_sampler_init_dist(
591 | uint seed);
592 |
593 | [LibraryImport(LibName)]
594 | public static partial llama_sampler llama_sampler_init_top_k(
595 | int k);
596 |
597 | [LibraryImport(LibName)]
598 | public static partial llama_sampler llama_sampler_init_top_p(
599 | float p,
600 | nuint min_keep);
601 |
602 | [LibraryImport(LibName)]
603 | public static partial llama_sampler llama_sampler_init_min_p(
604 | float p,
605 | nuint min_keep);
606 |
607 | [LibraryImport(LibName)]
608 | public static partial llama_sampler llama_sampler_init_typical(
609 | float p,
610 | nuint min_keep);
611 |
612 | [LibraryImport(LibName)]
613 | public static partial llama_sampler llama_sampler_init_temp(
614 | float t);
615 |
616 | [LibraryImport(LibName)]
617 | public static partial llama_sampler llama_sampler_init_temp_ext(
618 | float t,
619 | float delta,
620 | float exponent);
621 |
622 | [LibraryImport(LibName)]
623 | public static partial llama_sampler llama_sampler_init_xtc(
624 | float p,
625 | float t,
626 | nuint min_keep,
627 | uint seed);
628 |
629 | [LibraryImport(LibName)]
630 | public static partial llama_sampler llama_sampler_init_mirostat(
631 | int n_vocab,
632 | uint seed,
633 | float tau,
634 | float eta,
635 | int m);
636 |
637 | [LibraryImport(LibName)]
638 | public static partial llama_sampler llama_sampler_init_mirostat_v2(
639 | uint seed,
640 | float tau,
641 | float eta);
642 |
643 | [LibraryImport(LibName)]
644 | public static partial llama_sampler llama_sampler_init_penalties(
645 | int penalty_last_n,
646 | float penalty_repeat,
647 | float penalty_freq,
648 | float penalty_present);
649 |
650 | [LibraryImport(LibName)]
651 | public static partial llama_sampler llama_sampler_init_dry(
652 | llama_model model,
653 | float dry_multiplier,
654 | float dry_base,
655 | int dry_allowed_length,
656 | int dry_penalty_last_n,
657 | [In] byte[][] seq_breakers,
658 | nuint num_breakers);
659 |
660 | [LibraryImport(LibName)]
661 | public static partial int llama_sampler_sample(
662 | llama_sampler smpl,
663 | llama_context ctx, int idx);
664 | }
665 | }
666 |
--------------------------------------------------------------------------------
/LlamaCppLib/Options.cs:
--------------------------------------------------------------------------------
1 | namespace LlamaCppLib
2 | {
3 | public class LlmEngineOptions
4 | {
5 | public bool NumaOptimizations { get; set; } = false;
6 | public int MaxParallel { get; set; } = 1;
7 | }
8 |
9 | public class LlmModelOptions
10 | {
11 | public int GpuLayers { get; set; } = 0;
12 | public int MainGpu { get; set; } = 0;
13 | public float[]? TensorSplit { get; set; } = null;
14 | public bool UseMemoryMap { get; set; } = true;
15 | public bool UseMemoryLock { get; set; } = false;
16 |
17 | public int ContextLength { get; set; } = 0;
18 | public int BatchSize { get; set; } = 512;
19 | public int ThreadCount { get; set; } = 4;
20 | public int BatchThreadCount { get; set; } = 4;
21 | public bool UseFlashAttention { get; set; } = false;
22 |
23 | public float RopeFrequeceBase { get; set; } = 0.0f;
24 | public float RopeFrequenceScale { get; set; } = 0.0f;
25 | }
26 |
27 | public enum Mirostat : int { Disabled, MirostatV1, MirostatV2 }
28 |
29 | public class SamplingOptions
30 | {
31 | public int Seed { get; set; } = -1;
32 | public int TopK { get; set; } = 40;
33 | public float TopP { get; set; } = 0.95f;
34 | public float MinP { get; set; } = 0.05f;
35 | public float TfsZ { get; set; } = 1.0f;
36 | public float TypicalP { get; set; } = 1.0f;
37 | public float Temperature { get; set; } = 0.8f;
38 |
39 | public Mirostat Mirostat { get; set; } = Mirostat.Disabled;
40 | public float MirostatTau { get; set; } = 5.0f;
41 | public float MirostatEta { get; set; } = 0.1f;
42 |
43 | public int PenaltyLastN { get; set; } = 64;
44 | public float PenaltyRepeat { get; set; } = 1.0f;
45 | public float PenaltyFreq { get; set; } = 0.0f;
46 | public float PenaltyPresent { get; set; } = 0.0f;
47 |
48 | public int? ResponseMaxTokenCount { get; set; } = default;
49 | public string[]? ExtraStopTokens { get; set; } = default;
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/LlamaCppLib/Slots.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 |
3 | namespace LlamaCppLib
4 | {
5 | public class Slots<T> : IEnumerable<T>
6 | {
7 | private readonly Dictionary<int, T> _items = new();
8 | private readonly Queue<int> _ids;
9 | private readonly object _lock = new();
10 |
11 | public Slots(int capacity)
12 | {
13 | _ids = new Queue<int>(capacity);
14 |
15 | for (var i = 0; i < capacity; i++)
16 | _ids.Enqueue(i);
17 | }
18 |
19 | public bool HasFreeSlot
20 | {
21 | get
22 | {
23 | lock (_lock)
24 | {
25 | return _ids.Count > 0;
26 | }
27 | }
28 | }
29 |
30 | public int Add(T item)
31 | {
32 | lock (_lock)
33 | {
34 | if (_ids.Count == 0)
35 | throw new InvalidOperationException($"No free slots available.");
36 |
37 | var id = _ids.Dequeue();
38 | _items[id] = item;
39 |
40 | return id;
41 | }
42 | }
43 |
44 | public void Remove(int id)
45 | {
46 | lock (_lock)
47 | {
48 | if (!_items.ContainsKey(id))
49 | throw new KeyNotFoundException($"Item ID \"{id}\" not found.");
50 |
51 | _items.Remove(id);
52 | _ids.Enqueue(id);
53 | }
54 | }
55 |
56 | public void RemoveAll(Func<T, bool> predicate)
57 | {
58 | lock (_lock)
59 | {
60 | var ids = new List<int>();
61 |
62 | foreach (var item in _items)
63 | {
64 | if (predicate(item.Value))
65 | ids.Add(item.Key);
66 | }
67 |
68 | foreach (var id in ids)
69 | {
70 | _items.Remove(id);
71 | _ids.Enqueue(id);
72 | }
73 | }
74 | }
75 |
76 | // IEnumerable
77 | public IEnumerator<T> GetEnumerator()
78 | {
79 | lock (_lock)
80 | {
81 | return _items.Values.ToList().GetEnumerator(); // Snapshot
82 | }
83 | }
84 |
85 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
86 | }
87 | }
88 |
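A small sketch of the slot pool (illustrative only): ids are handed out from a fixed-capacity queue and recycled on removal, which is how the engine bounds the number of parallel sequences.

    using LlamaCppLib;

    var slots = new Slots<string>(2);

    var a = slots.Add("first");
    var b = slots.Add("second");
    Console.WriteLine(slots.HasFreeSlot);      // False

    slots.Remove(a);                           // the freed slot id goes back into the pool
    Console.WriteLine(slots.HasFreeSlot);      // True

    slots.RemoveAll(item => item == "second");
    foreach (var item in slots)                // enumeration works on a snapshot
        Console.WriteLine(item);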
--------------------------------------------------------------------------------
/LlamaCppLib/UnmanagedResource.cs:
--------------------------------------------------------------------------------
1 | namespace LlamaCppLib
2 | {
3 | internal class UnmanagedResource<T> : IDisposable
4 | {
5 | protected Action<T>? _dealloc;
6 | protected T? _handle;
7 |
8 | public void Dispose()
9 | {
10 | if (EqualityComparer<T>.Default.Equals(_handle, default) || _handle == null)
11 | return;
12 |
13 | _dealloc?.Invoke(_handle);
14 | _handle = default;
15 | }
16 |
17 | public bool Created => !EqualityComparer<T>.Default.Equals(_handle, default);
18 | public T Handle => EqualityComparer<T>.Default.Equals(_handle, default) || _handle == null ? throw new NullReferenceException() : _handle;
19 |
20 | public T Create(Func<T> alloc, Action<T> dealloc)
21 | {
22 | _handle = alloc();
23 | _dealloc = dealloc;
24 |
25 | return _handle;
26 | }
27 |
28 | public void GetResource(out T? resource) => resource = _handle;
29 | }
30 |
31 | internal class UnmanagedResource : UnmanagedResource<bool>
32 | {
33 | public void Create(Action alloc, Action dealloc)
34 | {
35 | try
36 | {
37 | alloc();
38 | _handle = true;
39 | }
40 | catch
41 | {
42 | _handle = false;
43 | }
44 |
45 | _dealloc = _ => dealloc();
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/LlamaCppWeb/LlamaCppWeb.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk.Web">
2 |
3 | <PropertyGroup>
4 | <TargetFramework>net8.0</TargetFramework>
5 | <Nullable>enable</Nullable>
6 | <ImplicitUsings>enable</ImplicitUsings>
7 | <DockerfileContext>..</DockerfileContext>
8 | <LangVersion>preview</LangVersion>
9 | <InvariantGlobalization>false</InvariantGlobalization>
10 | <PublishAot>false</PublishAot>
11 | </PropertyGroup>
12 |
13 | <ItemGroup>
14 | <ProjectReference Include="..\LlamaCppLib\LlamaCppLib.csproj" />
15 | </ItemGroup>
16 |
17 | </Project>
18 |
--------------------------------------------------------------------------------
/LlamaCppWeb/Program.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | using LlamaCppLib;
4 |
5 | namespace LlamaCppWeb
6 | {
7 | file class LlmConfig(IConfiguration configuration)
8 | {
9 | public class Model
10 | {
11 | public string? Name { get; set; }
12 | public string? Path { get; set; }
13 | }
14 |
15 | public List<Model> Models { get; set; } = [];
16 |
17 | public IConfiguration Configuration = configuration;
18 |
19 | public void Load() => Configuration.GetSection(nameof(LlmConfig)).Bind(this);
20 | public void Reload() => Load();
21 | }
22 |
23 | file class LlmState
24 | {
25 | public string? ModelPath { get; private set; }
26 | public string? ModelName { get; private set; }
27 | public LlmModelOptions? ModelOptions { get; private set; }
28 |
29 | public void Set(string? modelName = default, string? modelPath = default, LlmModelOptions? modelOptions = default)
30 | {
31 | ModelName = modelName != default ? modelName : default;
32 | ModelPath = modelPath != default ? modelPath : default;
33 | ModelOptions = modelOptions != default ? modelOptions : default;
34 | }
35 |
36 | public void Clear()
37 | {
38 | ModelName = default;
39 | ModelPath = default;
40 | ModelOptions = default;
41 | }
42 | }
43 |
44 | internal class Program
45 | {
46 | private static async Task Main(string[] args)
47 | {
48 | var builder = WebApplication.CreateBuilder(args);
49 |
50 | builder.Services.AddSingleton(serviceProvider =>
51 | {
52 | var config = new LlmConfig(serviceProvider.GetRequiredService<IConfiguration>());
53 | config.Load();
54 | return config;
55 | });
56 |
57 | builder.Services.AddSingleton<LlmEngine>(serviceProvider => new(new LlmEngineOptions { MaxParallel = 8 }));
58 | builder.Services.AddSingleton<LlmState>();
59 |
60 | builder.Services.AddCors();
61 |
62 | var app = builder.Build();
63 |
64 | app.UseCors(configure => configure.AllowAnyOrigin());
65 |
66 | app.MapGet("/", async (HttpContext httpContext) => await httpContext.Response.WriteAsync("Welcome to LLaMA C++ (dotnet)!"));
67 |
68 | app.MapGet("/list", async (HttpContext httpContext, LlmConfig config) =>
69 | {
70 | var models = config.Models.Select(model => model.Name).ToList();
71 | await httpContext.Response.WriteAsJsonAsync(models);
72 | });
73 |
74 | app.MapGet("/state", async (HttpContext httpContext, LlmEngine engine, LlmState state) =>
75 | {
76 | var response = new LlmStateResponse { ModelName = state.ModelName, ModelStatus = engine.Loaded ? LlmModelStatus.Loaded : LlmModelStatus.Unloaded };
77 | await httpContext.Response.WriteAsJsonAsync(response);
78 | });
79 |
80 | app.MapPost("/load", async (HttpContext httpContext, LlmConfig config, LlmEngine engine, LlmState state) =>
81 | {
82 | var request = await httpContext.Request.ReadFromJsonAsync<LlmLoadRequest>() ?? new();
83 | var modelName = request.ModelName ?? String.Empty;
84 | var modelPath = config.Models.SingleOrDefault(model => model.Name == request.ModelName)?.Path ?? String.Empty;
85 | engine.LoadModel(modelPath, request.ModelOptions);
86 | state.Set(modelName, modelPath);
87 | var response = new LlmStateResponse { ModelName = state.ModelName, ModelStatus = engine.Loaded ? LlmModelStatus.Loaded : LlmModelStatus.Unloaded };
88 | await httpContext.Response.WriteAsJsonAsync(response);
89 | });
90 |
91 | app.MapGet("/unload", async (HttpContext httpContext, LlmEngine engine, LlmState state) =>
92 | {
93 | engine.UnloadModel();
94 | var response = new LlmStateResponse { ModelName = state.ModelName, ModelStatus = engine.Loaded ? LlmModelStatus.Loaded : LlmModelStatus.Unloaded };
95 | state.Clear();
96 | await httpContext.Response.WriteAsJsonAsync(response);
97 | });
98 |
99 | app.MapPost("/prompt", async (HttpContext httpContext, IHostApplicationLifetime lifetime, LlmEngine engine) =>
100 | {
101 | using var cancellationTokenSource = CancellationTokenSource.CreateLinkedTokenSource(httpContext.RequestAborted, lifetime.ApplicationStopping);
102 |
103 | var request = await httpContext.Request.ReadFromJsonAsync<LlmPromptRequest>(cancellationTokenSource.Token) ?? new();
104 | var prompt = engine.Prompt(request.Messages, request.SamplingOptions);
105 |
106 | httpContext.Response.ContentType = "text/event-stream; charset=utf-8";
107 |
108 | try
109 | {
110 | await foreach (var token in new TokenEnumerator(prompt, cancellationTokenSource.Token))
111 | {
112 | await httpContext.Response.WriteAsync($"data: {Convert.ToBase64String(Encoding.UTF8.GetBytes(token))}\n\n", cancellationTokenSource.Token);
113 | }
114 | }
115 | catch (OperationCanceledException)
116 | { }
117 | });
118 |
119 | await app.RunAsync();
120 | }
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/LlamaCppWeb/Properties/launchSettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "iisSettings": {
3 | "windowsAuthentication": false,
4 | "anonymousAuthentication": true,
5 | "iisExpress": {
6 | "applicationUrl": "http://localhost:13084",
7 | "sslPort": 0
8 | }
9 | },
10 | "profiles": {
11 | "http": {
12 | "commandName": "Project",
13 | "dotnetRunMessages": true,
14 | "launchBrowser": false,
15 | "applicationUrl": "http://localhost:5021",
16 | "environmentVariables": {
17 | "ASPNETCORE_ENVIRONMENT": "Development"
18 | }
19 | },
20 | "IIS Express": {
21 | "commandName": "IISExpress",
22 | "launchBrowser": false,
23 | "environmentVariables": {
24 | "ASPNETCORE_ENVIRONMENT": "Development"
25 | }
26 | }
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/LlamaCppWeb/appsettings.Development.json:
--------------------------------------------------------------------------------
1 | {
2 | "Logging": {
3 | "LogLevel": {
4 | "Default": "Information",
5 | "Microsoft.AspNetCore": "Warning"
6 | }
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/LlamaCppWeb/appsettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "Logging": {
3 | "LogLevel": {
4 | "Default": "Information",
5 | "Microsoft.AspNetCore": "Warning"
6 | }
7 | },
8 | "AllowedHosts": "*",
9 | "Kestrel": {
10 | "Endpoints": {
11 | "Http": {
12 | "Url": "http://0.0.0.0:5021"
13 | }
14 | }
15 | },
16 | "LlmConfig": {
17 | "Models": [
18 | { "Name": "meta-llama-3-70b-instruct", "Path": "/md0/models/meta-llama/ggml-meta-llama-3-70b-instruct-q8_0.gguf" },
19 | { "Name": "c4ai-command-r-plus", "Path": "/md0/models/CohereForAI/c4ai-command-r-plus/ggml-c4ai-command-r-plus-q6_k.gguf" }
20 | ]
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # llama.cpp-dotnet
2 |
3 | [](https://opensource.org/licenses/MIT)
4 |
5 | ### Demo
6 |
7 | This shows `LlamaCppWeb.exe` hosting the API on the left and four `LlamaCppCli.exe` clients running in parallel on the right.
8 |
9 | 
10 |
11 | This one shows the new text embedding sample for feature extraction (using one of the models below):
12 | https://huggingface.co/dranger003/SFR-Embedding-Mistral-GGUF
13 | https://huggingface.co/dranger003/e5-mistral-7b-instruct-GGUF
14 |
15 | 
16 |
17 | ### Description
18 |
19 | High-performance, minimal C# bindings for llama.cpp, including a .NET core library, an API server/client, and samples.
20 | The imported API surface is kept to a bare minimum, as the upstream API changes quite rapidly.
21 |
22 | ### Quick Start
23 |
24 | Build - requires CUDA to be installed (on Windows, use the VS2022 x64 command prompt; on Linux, make sure to install cmake and [dotnet](https://learn.microsoft.com/en-us/dotnet/core/install/linux)):
25 | ```
26 | git clone --recursive https://github.com/dranger003/llama.cpp-dotnet.git
27 | cd llama.cpp-dotnet
28 | dotnet build -c Release /p:Platform="Any CPU"
29 | ```
30 | If you don't need to compile the native libraries, append `/p:NativeLibraries=OFF` to the `dotnet build` command above (i.e. `dotnet build -c Release /p:Platform="Any CPU" /p:NativeLibraries=OFF`).
31 |
32 | ### Basic Sample
33 |
34 | ```
35 | using LlamaCppLib;
36 |
37 | // Initialize
38 | using var llm = new LlmEngine(new EngineOptions { MaxParallel = 8 });
39 | llm.LoadModel(args[0], new ModelOptions { Seed = 1234, GpuLayers = 32 });
40 |
41 | // Prompting
42 | var prompt = llm.Prompt(
43 | String.Format(promptTemplate, systemPrompt, userPrompt),
44 | new SamplingOptions { Temperature = 0.0f }
45 | );
46 |
47 | // Inference
48 | await foreach (var token in new TokenEnumerator(prompt))
49 | Console.Write(token);
50 | ```
51 |
52 | The included CLI samples provide further examples of using the library, such as processing several prompts in parallel; a rough sketch of that pattern follows below.
53 |
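The sketch reuses the `llm` engine, `SamplingOptions` and `TokenEnumerator` from the sample above; the question strings and the use of `Task.WhenAll` are illustrative only, and the exact `Prompt()` overloads may vary, so refer to the CLI samples for the authoritative usage.

```
using System.Text;

// Each prompt occupies its own engine slot (up to MaxParallel) and streams independently.
var questions = new[]
{
    "What is the capital of France?",
    "Explain GGUF quantization in one sentence.",
    "Write a haiku about GPUs.",
};

var tasks = questions.Select(async question =>
{
    var prompt = llm.Prompt(question, new SamplingOptions { Temperature = 0.0f });

    var response = new StringBuilder();
    await foreach (var token in new TokenEnumerator(prompt))
        response.Append(token);

    return response.ToString();
});

foreach (var response in await Task.WhenAll(tasks))
    Console.WriteLine(response);
```
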
54 | ### API Endpoints
55 | ```
56 | GET /list
57 | GET /state
58 | POST /load [LlmLoadRequest]
59 | GET /unload
60 | POST /prompt [LlmPromptRequest]
61 | ```
62 |
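Below is a hedged sketch of driving these endpoints from C# with `HttpClient`. The paths, the default port (5021, from `appsettings.json`) and the `data: <base64(UTF-8 token)>` streaming format come from `LlamaCppWeb/Program.cs`; the request property names (`ModelName`, `Messages`, `SamplingOptions`) mirror what the server binds to, but the message shape (role/content) is an assumption here, so treat `LlmClient.cs` and the CLI client sample as the authoritative reference.

```
using System.Net.Http.Json;
using System.Text;

using var http = new HttpClient { BaseAddress = new Uri("http://localhost:5021") };

// List the configured models and load the first one.
var models = await http.GetFromJsonAsync<string[]>("/list") ?? [];
using var load = await http.PostAsJsonAsync("/load", new { ModelName = models[0] });
load.EnsureSuccessStatusCode();

// Request a completion (the exact contract is LlmPromptRequest in LlamaCppLib).
using var request = new HttpRequestMessage(HttpMethod.Post, "/prompt")
{
    Content = JsonContent.Create(new
    {
        Messages = new[] { new { Role = "user", Content = "Hello!" } },
        SamplingOptions = new { Temperature = 0.0f },
    }),
};

// The server streams lines of the form "data: <base64>", one per generated token.
using var response = await http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
using var reader = new StreamReader(await response.Content.ReadAsStreamAsync());

while (await reader.ReadLineAsync() is { } line)
{
    if (!line.StartsWith("data: "))
        continue;

    Console.Write(Encoding.UTF8.GetString(Convert.FromBase64String(line["data: ".Length..])));
}
```
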
63 | ### Models
64 |
65 | You will need a model in GGUF format; a 13B-parameter model appears to perform well if you have the memory (8-12 GB depending on the quantization).
66 | If you have a lot of RAM (i.e. 48 GB+), you could try a 65B version, though prediction is much slower, especially without a GPU.
67 |
68 | Many models can be found at the links below.
69 |
70 | - [dranger003 on Hugging Face](https://huggingface.co/dranger003?sort_models=created#models)
71 | - [TheBloke on Hugging Face](https://huggingface.co/TheBloke?sort_models=created&search_models=GGUF#models)
72 | - [LoneStriker on Hugging Face](https://huggingface.co/LoneStriker?sort_models=created&search_models=GGUF#models)
73 |
74 | ### Features
75 |
76 | - [x] Model loading/unloading
77 | - [x] Parallel decoding
78 | - [x] Minimal API host/client
79 | - [x] Support Windows/Linux
80 |
81 | ### Acknowledgments
82 |
83 | [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) for the LLaMA implementation in C++
84 |
--------------------------------------------------------------------------------
/clean.cmd:
--------------------------------------------------------------------------------
1 | @echo off
2 | git submodule foreach --recursive git clean -fdx
3 | for /d /r . %%d in (bin) do @if exist "%%d" rmdir /s /q "%%d"
4 | for /d /r . %%d in (obj) do @if exist "%%d" rmdir /s /q "%%d"
5 | for /d /r . %%d in (Debug) do @if exist "%%d" rmdir /s /q "%%d"
6 | for /d /r . %%d in (Release) do @if exist "%%d" rmdir /s /q "%%d"
7 |
--------------------------------------------------------------------------------
/clean.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | git submodule foreach --recursive git clean -fdx
3 | find . -type d -name bin -exec rm -rf {} \; 2>/dev/null
4 | find . -type d -name obj -exec rm -rf {} \; 2>/dev/null
5 | find . -type d -name Debug -exec rm -rf {} \; 2>/dev/null
6 | find . -type d -name Release -exec rm -rf {} \; 2>/dev/null
7 |
--------------------------------------------------------------------------------