├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.yaml
│   │   └── feature_request.yaml
│   ├── pull_request_template.md
│   └── workflows
│       ├── build.yml
│       └── warmup.yml
├── .gitignore
├── BUILDING.md
├── LICENSE
├── README.md
├── api
│   ├── OAI
│   │   ├── router.ts
│   │   ├── types
│   │   │   ├── chatCompletions.ts
│   │   │   └── completions.ts
│   │   └── utils
│   │       ├── chatCompletion.ts
│   │       ├── completion.ts
│   │       └── generation.ts
│   ├── core
│   │   ├── router.ts
│   │   └── types
│   │       ├── auth.ts
│   │       ├── health.ts
│   │       ├── model.ts
│   │       ├── template.ts
│   │       └── token.ts
│   ├── middleware
│   │   ├── authMiddleware.ts
│   │   ├── checkModelMiddleware.ts
│   │   └── requestLogMiddleware.ts
│   └── server.ts
├── assets
│   ├── icon.ico
│   └── icon.png
├── bindings
│   ├── CMakeLists.txt
│   ├── bindings.ps1
│   ├── bindings.sh
│   ├── bindings.ts
│   ├── generationResources.ts
│   ├── grammar.ts
│   ├── job.ts
│   ├── lib.ts
│   ├── minimal_cpp_test.cpp
│   ├── readbackBuffer.ts
│   ├── samplers.ts
│   ├── server
│   │   ├── c_library.cpp
│   │   ├── c_library.h
│   │   ├── generation_resources.hpp
│   │   ├── inference_args.hpp
│   │   ├── json_status.hpp
│   │   ├── presampler.hpp
│   │   ├── processor.hpp
│   │   ├── readback_buffer.hpp
│   │   ├── request.hpp
│   │   ├── rule_stream.hpp
│   │   ├── samplers.hpp
│   │   ├── sequence_stream.hpp
│   │   ├── server_basic_example.cpp
│   │   ├── slot.hpp
│   │   ├── tokenization.hpp
│   │   └── trie.hpp
│   ├── symbols.ts
│   ├── types.ts
│   └── utils.ts
├── common
│   ├── args.ts
│   ├── auth.ts
│   ├── config.ts
│   ├── configModels.ts
│   ├── errors.ts
│   ├── logging.ts
│   ├── modelContainer.ts
│   ├── myZod.ts
│   ├── networking.ts
│   ├── samplerOverrides.ts
│   ├── sampling.ts
│   ├── templating.ts
│   └── utils.ts
├── config_sample.yml
├── deno.json
├── deno.lock
├── generateGitSha.ts
├── lib
│   └── place_libs_here.txt
├── main.ts
├── minimal_test_setup.ts
├── models
│   └── place_your_models_here.txt
├── sampler_overrides
│   └── sample_preset.yml
├── templates
│   ├── alpaca.jinja
│   ├── chatml.jinja
│   └── place_your_templates_here.txt
└── types
    ├── jinja.d.ts
    └── utils.ts
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | ko_fi: kingbri
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yaml:
--------------------------------------------------------------------------------
1 | name: Bug report
2 | description: Report code related issues
3 | title: "[BUG]"
4 | labels: bug
5 | body:
6 |
7 | - type: markdown
8 | attributes:
9 | value: |
10 | ### Disclaimer:
11 |         GitHub Issues are **only** for code-related bugs.
12 |         If you do not understand how to start up or use YALS, please ask in the [Discord Server](https://discord.gg/sYQxnuD7Fj)
13 |
14 | - type: dropdown
15 | attributes:
16 | label: OS
17 | options:
18 | - Windows
19 | - macOS
20 | - Linux
21 | validations:
22 | required: true
23 |
24 | - type: dropdown
25 | attributes:
26 | label: GPU Library
27 | description: Ex. CUDA, ROCm
28 | options:
29 | - CUDA
30 | - AMD ROCm
31 | - Metal
32 | - CPU
33 | validations:
34 | required: true
35 |
36 | - type: input
37 | attributes:
38 | label: YALS commit sha
39 | description: Enter the commit SHA you're using (found on startup)
40 | placeholder: "ex. a1b4da3"
41 | validations:
42 | required: true
43 |
44 | - type: textarea
45 | attributes:
46 | label: Describe the bug
47 | description: A clear and concise description of what the bug is.
48 | validations:
49 | required: true
50 |
51 | - type: textarea
52 | attributes:
53 | label: Reproduction steps
54 | description: Walk us through how the bug occurred and how to make it happen.
55 | validations:
56 | required: true
57 |
58 | - type: textarea
59 | attributes:
60 | label: Expected behavior
61 | description: What was expected to happen?
62 | validations:
63 | required: true
64 |
65 | - type: textarea
66 | attributes:
67 | label: Logs
68 | description: If applicable, add logs and call stacks to help explain your problem.
69 | validations:
70 | required: false
71 |
72 | - type: textarea
73 | attributes:
74 | label: Additional context
75 | description: Add any other context about the problem here.
76 | validations:
77 | required: false
78 |
79 | - type: checkboxes
80 | attributes:
81 | label: Acknowledgements
82 | description: Before submitting this issue, please make sure you have completed the following checklist.
83 | options:
84 | - label: I have looked for similar issues before submitting this one.
85 | required: true
86 | - label: I have read the disclaimer, and this issue is related to a code bug. If I have a question, I will use the Discord server.
87 | required: true
88 | - label: I understand that the developers have lives and my issue will be answered when possible.
89 | required: true
90 | - label: I understand the developers of this program are human, and I will ask my questions politely.
91 | required: true
92 |
93 | - type: markdown
94 | attributes:
95 | value: |
96 | ## Thanks!
97 | Well-formatted issues improve YALS and make the development process smoother.
98 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.yaml:
--------------------------------------------------------------------------------
1 | name: Feature request
2 | description: Suggest a new idea
3 | title: "[REQUEST]"
4 | body:
5 |
6 | - type: textarea
7 | attributes:
8 | label: Problem
9 | description: Is the feature request related to a problem? If so, please describe.
10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
11 | validations:
12 | required: false
13 |
14 | - type: textarea
15 | attributes:
16 | label: Solution
17 | description: Describe the solution you'd like.
18 | placeholder: A clear and concise description of what you want to happen.
19 | validations:
20 | required: true
21 |
22 | - type: textarea
23 | attributes:
24 | label: Alternatives
25 | description: What alternative options did you consider?
26 | validations:
27 | required: false
28 |
29 | - type: textarea
30 | attributes:
31 | label: Explanation
32 | description: Why should this feature be added?
33 | validations:
34 | required: true
35 |
36 | - type: textarea
37 | attributes:
38 | label: Examples
39 | description: |
40 | Examples of the feature in action and its significance.
41 |
42 | Not required, but will make your request easier to understand.
43 | validations:
44 | required: false
45 |
46 | - type: textarea
47 | attributes:
48 | label: Additional context
49 | description: Anything else to add?
50 | validations:
51 | required: false
52 |
53 | - type: checkboxes
54 | attributes:
55 | label: Acknowledgements
56 | description: Before submitting this issue, please make sure you have completed the following checklist.
57 | options:
58 | - label: I have looked for similar requests before submitting this one.
59 | required: true
60 | - label: I understand that the developers have lives and my issue will be answered when possible.
61 | required: true
62 | - label: I understand the developers of this program are human, and I will make my requests politely.
63 | required: true
64 |
65 | - type: markdown
66 | attributes:
67 | value: |
68 | ## Thanks!
69 | Well-formatted issues improve YALS and make the development process smoother.
70 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | **Warning**
2 | Make all Pull Requests to the dev branch. Main is reserved for stability and building actions.
3 |
4 | **Is your pull request related to a problem? Please describe.**
5 | A clear and concise description of what the problem is. You can also link to an existing issue.
6 |
7 | **Why should this feature be added?**
8 | An explanation of why the feature should be added. Please be as specific as possible to help us understand the reasoning.
9 |
10 | **Examples**
11 | Examples of the feature in action and its significance compared to not having the feature.
12 |
13 | **Additional context**
14 | Add any other context or screenshots about the pull request here.
15 |
--------------------------------------------------------------------------------
/.github/workflows/warmup.yml:
--------------------------------------------------------------------------------
1 | name: Cache Management
2 |
3 | on:
4 | schedule:
5 | - cron: '0 0 */2 * *'
6 | workflow_dispatch:
7 | inputs:
8 | force_warmup:
9 | description: 'Force cache warmup regardless of age'
10 | required: false
11 | default: false
12 | type: boolean
13 |
14 | jobs:
15 | cleanup-cache:
16 | runs-on: ubuntu-latest
17 | permissions:
18 | actions: write
19 | steps:
20 | - name: Cleanup old caches
21 | uses: actions/github-script@v7
22 | with:
23 | script: |
24 | const retentionCount = 2; // Keep the 2 most recent caches per prefix
25 | const forceWarmupDays = 5; // Force warmup if newest cache is older than this
26 | const caches = await github.rest.actions.getActionsCacheList({
27 | owner: context.repo.owner,
28 | repo: context.repo.repo
29 | });
30 |
31 | // Track if any cache needs warming up
32 | let needsWarmup = false;
33 |
34 | // Group caches by their prefix pattern (e.g., sccache-Windows-cuda-main)
35 | const cacheGroups = {};
36 | for (const cache of caches.data.actions_caches) {
37 | if (cache.key.startsWith('sccache-')) {
38 | // Extract the prefix pattern (everything before the last hyphen + timestamp)
39 | const prefixPattern = cache.key.replace(/-\d+$/, '');
40 |
41 | if (!cacheGroups[prefixPattern]) {
42 | cacheGroups[prefixPattern] = [];
43 | }
44 | cacheGroups[prefixPattern].push(cache);
45 | }
46 | }
47 |
48 | const now = new Date();
49 | // Process each group of caches
50 | for (const prefix in cacheGroups) {
51 | // Sort caches by creation date (newest first)
52 | const sortedCaches = cacheGroups[prefix].sort((a, b) =>
53 | new Date(b.created_at) - new Date(a.created_at));
54 |
55 | // Check if most recent cache is older than forceWarmupDays
56 | if (sortedCaches.length > 0) {
57 | const newestCache = sortedCaches[0];
58 | const createdAt = new Date(newestCache.created_at);
59 | const ageInDays = (now - createdAt) / (1000 * 60 * 60 * 24);
60 |
61 | if (ageInDays > forceWarmupDays) {
62 | console.log(`Cache ${prefix} is stale (${ageInDays.toFixed(1)} days old). Will force warmup.`);
63 | needsWarmup = true;
64 | }
65 |
66 | // Log the kept most recent cache
67 | console.log(`Keeping most recent cache: ${newestCache.key}, created ${ageInDays.toFixed(1)} days ago`);
68 |
69 | // Keep second most recent cache if it exists
70 | if (sortedCaches.length > 1) {
71 | const secondCache = sortedCaches[1];
72 | const secondCreatedAt = new Date(secondCache.created_at);
73 | const secondAgeInDays = (now - secondCreatedAt) / (1000 * 60 * 60 * 24);
74 | console.log(`Keeping second most recent cache: ${secondCache.key}, created ${secondAgeInDays.toFixed(1)} days ago`);
75 | }
76 | }
77 |
78 | // Delete all caches beyond the retention count
79 | for (let i = retentionCount; i < sortedCaches.length; i++) {
80 | const cache = sortedCaches[i];
81 | const createdAt = new Date(cache.created_at);
82 | const ageInDays = (now - createdAt) / (1000 * 60 * 60 * 24);
83 |
84 | console.log(`Deleting old cache: ${cache.key}, created ${ageInDays.toFixed(1)} days ago`);
85 | await github.rest.actions.deleteActionsCacheByKey({
86 | owner: context.repo.owner,
87 | repo: context.repo.repo,
88 | key: cache.key
89 | });
90 | }
91 | }
92 |
93 | // Set output to control whether to run warmup jobs
94 | core.setOutput('needs_warmup', needsWarmup.toString());
95 |
96 | warmup-unix:
97 | needs: cleanup-cache
98 | if: ${{ needs.cleanup-cache.outputs.needs_warmup == 'true' || (github.event_name == 'workflow_dispatch' && github.event.inputs.force_warmup == 'true') }}
99 | runs-on: ${{ matrix.os }}
100 | strategy:
101 | matrix:
102 | os: [ubuntu-22.04, macos-15]
103 | device: [cpu, metal, cuda]
104 | exclude:
105 | - os: macos-15
106 | device: cpu
107 | - os: macos-15
108 | device: cuda
109 | - os: ubuntu-22.04
110 | device: metal
111 |
112 | container: ${{ matrix.device == 'cuda' && 'nvidia/cuda:12.8.0-devel-ubuntu22.04' || '' }}
113 | steps:
114 | - uses: actions/checkout@v4
115 | - name: Run sccache-cache
116 | uses: mozilla-actions/sccache-action@v0.0.7
117 | - name: Configure sccache
118 | id: sccache
119 | run: |
120 | mkdir -p "$PWD/bindings/.sccache"
121 | export SCCACHE_DIR="$PWD/bindings/.sccache"
122 | echo "SCCACHE_DIR=$SCCACHE_DIR" >> $GITHUB_ENV
123 | - name: Cache sccache storage
124 | uses: actions/cache@v4
125 | with:
126 | path: ${{ env.SCCACHE_DIR }}
127 | key: sccache-${{ runner.os }}-${{ matrix.device }}-${{ github.ref_name }}-${{ github.run_id }}
128 | restore-keys: |
129 | sccache-${{ runner.os }}-${{ matrix.device }}-${{ github.ref_name }}-
130 | sccache-${{ runner.os }}-${{ matrix.device }}-
131 |
132 | warmup-win:
133 | needs: cleanup-cache
134 | if: ${{ needs.cleanup-cache.outputs.needs_warmup == 'true' || (github.event_name == 'workflow_dispatch' && github.event.inputs.force_warmup == 'true') }}
135 | runs-on: ${{ matrix.os }}
136 | strategy:
137 | matrix:
138 | os: [windows-2022]
139 | device: [cpu, cuda]
140 |
141 | steps:
142 | - uses: actions/checkout@v4
143 | - name: Run sccache-cache
144 | uses: mozilla-actions/sccache-action@v0.0.7
145 | - name: Configure sccache
146 | run: |
147 | New-Item -ItemType Directory -Force -Path "$PWD/bindings/.sccache"
148 | $env:SCCACHE_DIR="$PWD/bindings/.sccache"
149 | echo "SCCACHE_DIR=$env:SCCACHE_DIR" >> $env:GITHUB_ENV
150 | - name: Cache sccache storage
151 | uses: actions/cache@v4
152 | with:
153 | path: ${{ env.SCCACHE_DIR }}
154 | key: sccache-${{ runner.os }}-${{ matrix.device }}-${{ github.ref_name }}-${{ github.run_id }}
155 | restore-keys: |
156 | sccache-${{ runner.os }}-${{ matrix.device }}-${{ github.ref_name }}-
157 | sccache-${{ runner.os }}-${{ matrix.device }}-
158 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Deno ###
2 | /.idea
3 | */.idea/
4 | /.vscode/
5 |
6 | /node_modules
7 | /bindings/home/
8 | /bindings/gguf
9 | bindings/.sccache
10 |
11 | .env
12 | *.orig
13 | *.pyc
14 | *.swp
15 | *.gguf
16 |
17 | # End of https://www.toptal.com/developers/gitignore/api/deno
18 |
19 | # C++ ignores
20 | cmake-build-debug*
21 | */cmake-build-debug*
22 | cmake-build-release*
23 | */cmake-build-release*
24 | bindings/build
25 |
26 | # Lib
27 | lib/*
28 | !lib/place_libs_here.txt
29 |
30 | # build
31 | build/*
32 | venv/*
33 | .venv/*
34 |
35 | # User configuration
36 | config.yml
37 | api_tokens.yml
38 |
39 | # Models folder
40 | models/*
41 | !models/place_your_models_here.txt
42 |
43 | # Templates folder
44 | templates/*
45 | !templates/place_your_templates_here.txt
46 | !templates/alpaca.jinja
47 | !templates/chatml.jinja
48 |
49 | # Sampler overrides folder
50 | sampler_overrides/*
51 | !sampler_overrides/sample_preset.yml
52 |
53 | # Compiled binaries and embedded assets
54 | gitSha.txt
55 | YALS.exe
56 | YALS
57 |
58 | # Markdown
59 | .obsidian/
60 |
61 | # macOS
62 | *.DS_Store
63 |
64 | # Exclude all .yml except config sample
65 | *.yml
66 | !config_sample.yml
--------------------------------------------------------------------------------
/BUILDING.md:
--------------------------------------------------------------------------------
1 | # Build Instructions
2 |
3 | YALS contains two components:
4 | 1. TypeScript code: Universally buildable on any OS
5 | 2. C++ bindings: Requires an OS-specific C++ compiler and additional setup
6 |
7 | The C++ bindings need to be built to integrate the `llama.cpp` library and provide the necessary "glue" required by YALS.
8 |
9 | ## Prerequisites
10 |
11 | To get started, install the following prerequisites:
12 | - [Deno](https://deno.com)
13 | - A C++ compiler:
14 | - Windows: Visual Studio 2022 build tools
15 | - macOS: Xcode command-line tools (`xcode-select --install`)
16 | - Linux: GCC (`sudo apt install build-essential`)
17 | - CMake:
18 | - Windows: Installed with Visual Studio build tools
19 | - macOS (homebrew): `brew install cmake`
20 | - Linux: `sudo apt install cmake` (For Ubuntu 22.04, follow this [askubuntu](https://askubuntu.com/a/865294) answer to install the latest version)
21 | - Ninja (Makes builds faster)
22 | - Windows: `winget install -e --id Ninja-build.Ninja`
23 | - macOS (homebrew): `brew install ninja`
24 | - Linux: `sudo apt install ninja-build`
25 | - [sccache](https://github.com/mozilla/sccache) (optional, but speeds up subsequent builds)
26 | - [Rust](https://rustup.rs) (Used for improved grammar parsing via LLGuidance)
27 |
28 | ## Building
29 |
30 | Clone the repository and navigate to the project folder:
31 | ```sh
32 | git clone https://github.com/theroyallab/YALS.git
33 | cd YALS
34 | ```
35 |
36 | All build commands are encapsulated in Deno tasks, similar to npm scripts in NodeJS.
37 |
38 | > [!NOTE]
39 | > Unlike llama.cpp and its derivatives, YALS uses an extremely fast grammar tool called llguidance for JSON schemas, regex, and Lark grammars.
40 | >
41 | > Because llguidance requires an extra dependency on users' systems, it is off by default, but it is **highly recommended** to enable it at build time for improved grammar handling.
42 |
43 | To enable it, set `LLGUIDANCE=1` in your shell before invoking the deno task.
44 |
45 | To build the C++ bindings:
46 |
47 | - Windows: `deno task bindings-win`
48 | - macOS/Linux: `deno task bindings`
49 |
50 | This will invoke CMake to build the bindings and copy the resulting shared libraries to the `lib` folder.
51 |
52 | Optionally, environment variables can be set for certain architectures when building (ex. CUDA):
53 | - `MAX_JOBS`: Number of parallel jobs (defaults to the number of CPU cores)
54 | - `LLAMACPP_REPO`: Point to a custom repository for llama.cpp (Here be dragons!)
55 | - `LLAMACPP_TAG`: Set a specific tag for llama.cpp (Here be dragons!)
56 | - `GGML_CUDA=1`: Enables CUDA support
57 | - `CMAKE_CUDA_ARCHITECTURES`: Specifies CUDA compute capabilities (defaults to `native` if using CMake > 3.24)
58 | - `GGML_VULKAN=1`: Enables Vulkan Support
59 | - `GGML_HIP=1`: Enables HIP ROCm support (requires specifying `AMDGPU_TARGETS`; Linux only)
60 | - `AMDGPU_TARGETS`: Specify the ROCm target (example: `gfx1030`)
61 | - `LLGUIDANCE=1`: (Recommended) Enable llguidance for grammars. Requires Rust on the system. (default `0`)
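For example, a CUDA build that also enables llguidance can combine these variables on the command line before the task runs (a sketch with example values; adjust `MAX_JOBS` and the flags for your hardware):

```sh
# Example only: CUDA + llguidance build with 8 parallel jobs
GGML_CUDA=1 LLGUIDANCE=1 MAX_JOBS=8 deno task bindings
```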
62 |
63 | ## Running
64 |
65 | To start the server with necessary permissions:
66 | ```sh
67 | deno task start
68 | ```
69 |
70 | With full permissions (useful for testing new features):
71 | ```sh
72 | deno run -A main.ts
73 | ```
74 |
75 | ## Packaging
76 |
77 | > [!NOTE]
78 | > All YALS commits are built via GitHub Actions, so manual packaging is typically unnecessary unless you need to distribute builds with a custom build configuration.
79 |
80 | To create a distributable binary:
81 |
82 | 1. Run: `deno task build` to package all TypeScript code into a standalone binary
83 | 2. Zip the following files and directories:
84 | - `YALS(.exe)`
85 | - `lib/`
86 | - `models/`
87 | - `templates/`
88 | - `config_sample.yml`
89 | 3. Distribute the archive, and the recipient can simply extract and run it.
90 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # YALS
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 | > [!NOTE]
24 | >
25 | > Need help? Join the [Discord Server](https://discord.gg/sYQxnuD7Fj) and get the `Tabby` role. Please be nice when asking questions.
26 |
27 | Welcome to YALS, also known as **Y**et **A**nother **L**lamacpp **S**erver.
28 |
29 | YALS is a friendly OAI-compatible API server built with Deno, Hono, and Zod, designed to facilitate LLM text generation via the [llama.cpp backend](https://github.com/ggml-org/llama.cpp).
30 |
31 | ## Disclaimer
32 |
33 | This project is in an alpha state. There may be bugs, possibly even ones that could cause thermonuclear war. Please note that commits happen frequently, and builds are distributed via CI.
34 |
35 | YALS is a hobby project made for a small amount of users. It is not meant to run on production servers. For that, please look at other solutions that support those workloads.
36 |
37 | ## Why?
38 |
39 | The AI space is full of backend projects that wrap llama.cpp, but I felt that something was missing. This led me to create my own backend, one which is extensible, speedy, and as elegant as TabbyAPI, but specifically for llama.cpp and GGUF.
40 |
41 | ## What about TabbyAPI?
42 |
43 | Here are the reasons why I decided to create a separate project instead of integrating llamacpp support into TabbyAPI:
44 |
45 | 1. **Separation of concerns**: I want TabbyAPI to stay focused on ExLlama, not become a monolithic backend.
46 | 2. **Distribution patterns**: Unlike TabbyAPI, llama.cpp backends are often distributed as binaries. Deno’s compile command is vastly superior to PyInstaller, making binary distribution easier.
47 | 3. **Dependency hell**: Python’s dependency system is a mess. Adding another layer of abstractions would confuse users further.
48 | 4. **New technologies**: Since C++ (via C bindings) is universally compatible via an FFI interface, I wanted to try something new instead of struggling with Python. The main reason for using Deno is that it augments an easy-to-learn language (TypeScript) with inbuilt tooling and a robust FFI system.
49 | ## Getting Started
50 |
51 | To get started, download the latest zip from [releases](https://github.com/theroyallab/YALS/releases/latest) that corresponds to your setup.
52 |
53 | The currently supported builds via CI are:
54 |
55 | - **macOS**: Metal
56 | - **Windows/Linux**: CPU
57 | - **Windows/Linux**: CUDA (built for Pascal and newer consumer architectures)
58 |
59 | > [!NOTE]
60 | >
61 | > If your specific setup is not available via CI, you can build locally via the [building guide](https://github.com/theroyallab/YALS/blob/main/BUILDING.md), or request a certain architecture in issues.
62 |
63 | Then follow these steps:
64 |
65 | 1. Extract the zip file
66 | 2. Copy `config_sample.yml` to a file called `config.yml`
67 | 3. Edit `config.yml` to configure model loading, networking, and other parameters.
68 | 1. All options are commented: **if you're unsure about an option, it's best to leave it unchanged**.
69 | 2. You can also use CLI arguments, similar to TabbyAPI (ex. `--flash-attention true`).
70 | 4. Download a `.gguf` model into the `models` directory (or whatever you set your directory to)
71 | 1. If the model is split into multiple parts (`00001-of-0000x.gguf`), set `model_name` in `config.yml` to the **first part** (ending in `00001`). Other parts will load automatically.
72 | 5. Start YALS:
73 | 1. Windows: Double click `YALS.exe` or run `.\YALS.exe` from the terminal (recommended)
74 | 2. macOS/Linux: Open a terminal and run `./YALS`
75 | 6. Navigate to `http://<your-host>:<port>/docs` (ex. `http://localhost:5000/docs`) to view the YALS Scalar API documentation.
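Once the server is up, a quick way to confirm the API responds is a minimal OAI-style completion request. The sketch below assumes the example port of `5000` and an API key from your `api_tokens.yml` (omit the key header if auth is disabled):

```sh
# Hypothetical smoke test against a running YALS instance
curl http://localhost:5000/v1/completions \
  -H "Content-Type: application/json" \
  -H "x-api-key: YOUR_API_KEY" \
  -d '{"prompt": "Hello, my name is", "stream": false}'
```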
76 | ## Features
77 |
78 | - OpenAI compatible API
79 | - Loading/unloading models
80 | - Flexible Jinja2 template engine for chat completions that conforms to HuggingFace's chat template format
81 | - Fast JSON schema + Regex + EBNF support via llguidance
82 | - String banning
83 | - Concurrent inference with Hono + async TypeScript
84 | - Robust validation with Zod
85 | - Utilizes modern TS paradigms and the Deno runtime
86 | - Inbuilt proxy to override client request parameters/samplers
87 | - Continuous slot-based batching engine with improved KV cache assignment
88 |
89 | More features will be added as the project matures. If something is missing here, PR it in!
90 |
91 | ## Supported Model Types
92 |
93 | Since YALS uses llama.cpp for inference, the only supported model format is GGUF.
94 |
95 | If you want to use other model formats such as Exl2, try [TabbyAPI](https://github.com/theroyallab/TabbyAPI).
96 |
97 | ## Contributing
98 |
99 | Use the template when creating issues or pull requests; otherwise, the developers may not look at your post.
100 |
101 | If you have issues with the project:
102 |
103 | - Describe the issue in detail
104 | - If you have a feature request, please indicate it as such.
105 |
106 | If you have a Pull Request:
107 |
108 | - Describe the pull request in detail: what you are changing and why
109 |
110 | ## Developers and Permissions
111 |
112 | Creators/Developers:
113 |
114 | - [kingbri](https://github.com/kingbri1) - TypeScript, Deno, and some C++
115 | - [CoffeeVampire](https://github.com/CoffeeVampir3) - Main C++ developer
116 |
117 | ## Acknowledgements
118 |
119 | YALS would not exist without the work of other contributors and FOSS projects:
120 |
121 | - [llama.cpp](https://github.com/ggml-org/llama.cpp)
122 | - [Deno](https://deno.com)
123 | - [Hono](https://hono.dev)
124 | - [Zod](https://zod.dev)
125 | - [llguidance](https://github.com/guidance-ai/llguidance)
126 | - [KoboldCpp](https://github.com/lostruins/koboldcpp)
127 | - [SillyTavern](https://github.com/SillyTavern/SillyTavern)
128 |
--------------------------------------------------------------------------------
/api/OAI/router.ts:
--------------------------------------------------------------------------------
1 | import { Hono } from "hono";
2 | import { HTTPException } from "hono/http-exception";
3 | import { streamSSE } from "hono/streaming";
4 | import { describeRoute } from "hono-openapi";
5 | import { validator as sValidator } from "hono-openapi";
6 | import {
7 | ChatCompletionRequest,
8 | ChatCompletionResponse,
9 | } from "@/api/OAI/types/chatCompletions.ts";
10 | import {
11 | generateChatCompletion,
12 | streamChatCompletion,
13 | } from "@/api/OAI/utils/chatCompletion.ts";
14 | import { AuthKeyPermission } from "@/common/auth.ts";
15 | import { jsonContent } from "@/common/networking.ts";
16 | import { PromptTemplate } from "@/common/templating.ts";
17 |
18 | import authMiddleware from "../middleware/authMiddleware.ts";
19 | import checkModelMiddleware from "../middleware/checkModelMiddleware.ts";
20 | import { CompletionRequest, CompletionResponse } from "./types/completions.ts";
21 | import { generateCompletion, streamCompletion } from "./utils/completion.ts";
22 |
23 | const router = new Hono();
24 |
25 | const completionsRoute = describeRoute({
26 | responses: {
27 | 200: jsonContent(CompletionResponse, "Response to completions"),
28 | },
29 | });
30 |
31 | router.post(
32 | "/v1/completions",
33 | completionsRoute,
34 | authMiddleware(AuthKeyPermission.API),
35 | checkModelMiddleware,
36 | sValidator("json", CompletionRequest),
37 | async (c) => {
38 | const params = c.req.valid("json");
39 |
40 | if (params.stream) {
41 | return streamSSE(c, async (stream) => {
42 | await streamCompletion(
43 | c.var.requestId,
44 | stream,
45 | params,
46 | c.var.model,
47 | c.req.raw.signal,
48 | );
49 | });
50 | } else {
51 | const completionResult = await generateCompletion(
52 | c.var.requestId,
53 | params,
54 | c.var.model,
55 | c.req.raw.signal,
56 | );
57 |
58 | return c.json(completionResult);
59 | }
60 | },
61 | );
62 |
63 | const chatCompletionsRoute = describeRoute({
64 | responses: {
65 | 200: jsonContent(
66 | ChatCompletionResponse,
67 | "Response to chat completions",
68 | ),
69 | },
70 | });
71 |
72 | router.post(
73 | "/v1/chat/completions",
74 | chatCompletionsRoute,
75 | authMiddleware(AuthKeyPermission.API),
76 | checkModelMiddleware,
77 | sValidator("json", ChatCompletionRequest),
78 | async (c) => {
79 | const params = c.req.valid("json");
80 |
81 | let promptTemplate: PromptTemplate;
82 | if (c.var.model.promptTemplate) {
83 | promptTemplate = c.var.model.promptTemplate;
84 | } else {
85 | throw new HTTPException(422, {
86 | message:
87 | "Chat completions are disabled because a prompt template isn't set.",
88 | });
89 | }
90 |
91 | if (params.stream) {
92 | return streamSSE(c, async (stream) => {
93 | await streamChatCompletion(
94 | c.var.requestId,
95 | stream,
96 | params,
97 | c.var.model,
98 | promptTemplate,
99 | c.req.raw.signal,
100 | );
101 | });
102 | } else {
103 | const chatCompletionResult = await generateChatCompletion(
104 | c.var.requestId,
105 | params,
106 | c.var.model,
107 | promptTemplate,
108 | c.req.raw.signal,
109 | );
110 | return c.json(chatCompletionResult);
111 | }
112 | },
113 | );
114 |
115 | export default router;
116 |
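As a rough usage sketch for the streaming branch of this router, a client can request SSE chunks along these lines (placeholder host, port, and key; the payload fields mirror `ChatCompletionRequest` above):

```sh
# -N disables curl's buffering so SSE chunks print as they arrive
curl -N http://localhost:5000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "x-api-key: YOUR_API_KEY" \
  -d '{"messages": [{"role": "user", "content": "Hi there"}], "stream": true}'
```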
--------------------------------------------------------------------------------
/api/OAI/types/chatCompletions.ts:
--------------------------------------------------------------------------------
1 | import * as z from "@/common/myZod.ts";
2 | import {
3 | CommonCompletionRequest,
4 | UsageStats,
5 | } from "@/api/OAI/types/completions.ts";
6 | import { BaseSamplerRequest } from "@/common/sampling.ts";
7 |
8 | const ChatCompletionImageUrl = z.object({
9 | url: z.string(),
10 | });
11 |
12 | const ChatCompletionMessagePart = z.object({
13 | type: z.string().nullish().coalesce("text"),
14 | text: z.string().nullish(),
15 | image_url: ChatCompletionImageUrl.nullish(),
16 | });
17 |
18 | export type ChatCompletionMessagePart = z.infer<
19 | typeof ChatCompletionMessagePart
20 | >;
21 |
22 | export const ChatCompletionMessage = z.object({
23 | role: z.string().default("user"),
24 | content: z.union([z.string(), z.array(ChatCompletionMessagePart)]),
25 | });
26 |
27 | export type ChatCompletionMessage = z.infer<typeof ChatCompletionMessage>;
28 |
29 | const ChatCompletionStreamOptions = z.object({
30 | include_usage: z.boolean().nullish().coalesce(false),
31 | });
32 |
33 | export const ChatCompletionRequest = z.aliasedObject(
34 | z.object({
35 | messages: z.array(ChatCompletionMessage).nullish().coalesce([]),
36 | stream_options: ChatCompletionStreamOptions.nullish(),
37 | add_generation_prompt: z.boolean().nullish().coalesce(true),
38 | prompt_template: z.string().nullish(),
39 | template_vars: z.record(z.string(), z.unknown()).nullish().coalesce({}),
40 | }),
41 | [
42 | { field: "template_vars", aliases: ["chat_template_kwargs"] },
43 | ],
44 | )
45 | .and(CommonCompletionRequest)
46 | .and(BaseSamplerRequest)
47 | .transform((obj) => {
48 | // Always unset add_bos_token
49 | obj.add_bos_token = undefined;
50 | return obj;
51 | });
52 |
53 | export type ChatCompletionRequest = z.infer<typeof ChatCompletionRequest>;
54 |
55 | export const ChatCompletionRespChoice = z.object({
56 | index: z.number().default(0),
57 | finish_reason: z.string().optional(),
58 | message: ChatCompletionMessage,
59 | });
60 |
61 | export const ChatCompletionResponse = z.object({
62 | id: z.string().default(
63 | `chatcmpl-${crypto.randomUUID().replaceAll("-", "")}`,
64 | ),
65 | choices: z.array(ChatCompletionRespChoice),
66 | created: z.number().default(Math.floor(Date.now() / 1000)),
67 | model: z.string(),
68 | object: z.string().default("chat.completion"),
69 | usage: UsageStats.optional(),
70 | });
71 |
72 | export const ChatCompletionStreamChoice = z.object({
73 | index: z.number().default(0),
74 | finish_reason: z.string().optional(),
75 | delta: z.union([ChatCompletionMessage, z.record(z.string(), z.unknown())]),
76 | });
77 |
78 | export const ChatCompletionStreamChunk = z.object({
79 | id: z.string().default(
80 | `chatcmpl-${crypto.randomUUID().replaceAll("-", "")}`,
81 | ),
82 | choices: z.array(ChatCompletionStreamChoice).default([]),
83 | created: z.number().default(Math.floor(Date.now() / 1000)),
84 | model: z.string(),
85 | object: z.string().default("chat.completion.chunk"),
86 | usage: UsageStats.optional(),
87 | });
88 |
--------------------------------------------------------------------------------
/api/OAI/types/completions.ts:
--------------------------------------------------------------------------------
1 | import * as z from "@/common/myZod.ts";
2 | import { BaseSamplerRequest } from "@/common/sampling.ts";
3 |
4 | export const CompletionResponseFormat = z.object({
5 | type: z.string().default("text"),
6 | });
7 |
8 | export const UsageStats = z.object({
9 | prompt_tokens: z.number(),
10 | completion_tokens: z.number(),
11 | total_tokens: z.number(),
12 | });
13 |
14 | export type UsageStats = z.infer<typeof UsageStats>;
15 |
16 | export const CommonCompletionRequest = z.object({
17 | model: z.string().nullish(),
18 | stream: z.boolean().nullish().coalesce(false),
19 | logprobs: z.number().gte(0).nullish().coalesce(0),
20 | response_format: CompletionResponseFormat.nullish().coalesce(
21 | CompletionResponseFormat.parse({}),
22 | ),
23 | n: z.number().gte(1).nullish().coalesce(1),
24 | best_of: z.number().nullish(),
25 | echo: z.boolean().nullish().coalesce(false),
26 | suffix: z.string().nullish(),
27 | user: z.string().nullish(),
28 | })
29 | .and(BaseSamplerRequest)
30 | .transform((obj) => {
31 | if (obj.response_format.type === "json") {
32 | obj.json_schema = {
33 | "type": "object",
34 | };
35 | }
36 |
37 | return obj;
38 | });
39 |
40 | export const CompletionRequest = z.object({
41 | prompt: z.union([
42 | z.string(),
43 | z.array(z.string()).transform((arr) => arr.join("\n")),
44 | ]),
45 | })
46 | .and(CommonCompletionRequest)
47 | .describe("Completion Request parameters");
48 |
49 | export type CompletionRequest = z.infer<typeof CompletionRequest>;
50 |
51 | export const CompletionRespChoice = z.object({
52 | index: z.number().default(0),
53 | finish_reason: z.string().optional(),
54 | text: z.string(),
55 | });
56 |
57 | export const CompletionResponse = z.object({
58 | id: z.string().default(`cmpl-${crypto.randomUUID().replaceAll("-", "")}`),
59 | choices: z.array(CompletionRespChoice),
60 | created: z.number().default(Math.floor(Date.now() / 1000)),
61 | model: z.string(),
62 | object: z.string().default("text_completion"),
63 | usage: UsageStats.optional(),
64 | });
65 |
--------------------------------------------------------------------------------
/api/OAI/utils/chatCompletion.ts:
--------------------------------------------------------------------------------
1 | import { SSEStreamingApi } from "hono/streaming";
2 |
3 | import {
4 | convertFinishReason,
5 | createUsageStats,
6 | GenerationType,
7 | staticGenerate,
8 | } from "@/api/OAI/utils/generation.ts";
9 | import { Model } from "@/bindings/bindings.ts";
10 | import { FinishChunk, GenerationChunk } from "@/bindings/types.ts";
11 | import { toGeneratorError } from "@/common/networking.ts";
12 | import { PromptTemplate } from "@/common/templating.ts";
13 |
14 | import {
15 | ChatCompletionMessage,
16 | ChatCompletionMessagePart,
17 | ChatCompletionRequest,
18 | ChatCompletionRespChoice,
19 | ChatCompletionResponse,
20 | ChatCompletionStreamChoice,
21 | ChatCompletionStreamChunk,
22 | } from "../types/chatCompletions.ts";
23 | import { CancellationError } from "@/common/errors.ts";
24 | import { logger } from "@/common/logging.ts";
25 |
26 | interface TemplateFormatOptions {
27 | addBosToken?: boolean;
28 | banEosToken?: boolean;
29 | addGenerationPrompt?: boolean;
30 |     templateVars?: Record<string, unknown>;
31 | }
32 |
33 | function createResponse(chunk: FinishChunk, modelName: string) {
34 | const message = ChatCompletionMessage.parse({
35 | role: "assistant",
36 | content: chunk.text,
37 | });
38 |
39 | const choice = ChatCompletionRespChoice.parse({
40 | message: message,
41 | finish_reason: convertFinishReason(chunk),
42 | });
43 |
44 | const usage = createUsageStats(chunk);
45 |
46 | const response = ChatCompletionResponse.parse({
47 | choices: [choice],
48 | model: modelName,
49 | usage,
50 | });
51 |
52 | return response;
53 | }
54 |
55 | function createStreamChunk(
56 | chunk: GenerationChunk,
57 | modelName: string,
58 | cmplId: string,
59 | ) {
60 | const message = ChatCompletionMessage.parse({
61 | role: "assistant",
62 | content: chunk.text,
63 | });
64 |
65 | const choice = ChatCompletionStreamChoice.parse({
66 | delta: message,
67 | });
68 |
69 | const response = ChatCompletionStreamChunk.parse({
70 | id: cmplId,
71 | choices: [choice],
72 | model: modelName,
73 | });
74 |
75 | return response;
76 | }
77 |
78 | function createUsageChunk(
79 | chunk: FinishChunk,
80 | modelName: string,
81 | cmplId: string,
82 | ) {
83 | const response = ChatCompletionStreamChunk.parse({
84 | id: cmplId,
85 | model: modelName,
86 | usage: createUsageStats(chunk),
87 | });
88 |
89 | return response;
90 | }
91 |
92 | export function applyChatTemplate(
93 | model: Model,
94 | promptTemplate: PromptTemplate,
95 | messages: ChatCompletionMessage[],
96 | options: TemplateFormatOptions = {},
97 | ): string {
98 | const {
99 | addGenerationPrompt = true,
100 | templateVars = {},
101 | } = options;
102 |
103 | messages.forEach((message) => {
104 | if (Array.isArray(message.content)) {
105 | const messageParts = message.content as ChatCompletionMessagePart[];
106 | message.content = messageParts.find((part) =>
107 | part.type === "text"
108 | )?.text ?? "";
109 | }
110 | });
111 |
112 | const bosToken = model.tokenizer.bosToken;
113 | let prompt = promptTemplate.render({
114 | ...templateVars,
115 | messages: messages,
116 | bos_token: bosToken?.piece ?? "",
117 | eos_token: model.tokenizer.eosToken?.piece ?? "",
118 | add_generation_prompt: addGenerationPrompt,
119 | });
120 |
121 | // Remove extra BOS token at start of prompt if present
122 | // Some model templates don't respect their own add_bos_token setting
123 | // Better to do this since a template can add BOS anywhere
124 | if (
125 | bosToken && model.tokenizer.addBosToken &&
126 | prompt.startsWith(bosToken.piece)
127 | ) {
128 | prompt = prompt.slice(bosToken.piece.length);
129 | }
130 |
131 | return prompt;
132 | }
133 |
134 | function addTemplateMetadata(
135 | promptTemplate: PromptTemplate,
136 | params: ChatCompletionRequest,
137 | ) {
138 | const metadata = promptTemplate.metadata;
139 |
140 | if (metadata.stop_strings) {
141 | params.stop.push(...metadata.stop_strings);
142 | }
143 | }
144 |
145 | // TODO: Possibly rewrite this to unify with completions
146 | export async function streamChatCompletion(
147 | requestId: string,
148 | stream: SSEStreamingApi,
149 | params: ChatCompletionRequest,
150 | model: Model,
151 | promptTemplate: PromptTemplate,
152 | requestSignal: AbortSignal,
153 | ) {
154 | logger.info(`Received streaming chat completion request ${requestId}`);
155 |
156 | const cmplId = `chatcmpl-${crypto.randomUUID().replaceAll("-", "")}`;
157 | const abortController = new AbortController();
158 | let finished = false;
159 |
160 | // If an abort happens before streaming starts
161 | requestSignal.addEventListener("abort", () => {
162 | if (!finished) {
163 | abortController.abort(
164 | new CancellationError(
165 | `Streaming chat completion ${requestId} cancelled by user.`,
166 | ),
167 | );
168 | finished = true;
169 | }
170 | });
171 |
172 | const prompt = applyChatTemplate(
173 | model,
174 | promptTemplate,
175 | params.messages,
176 | {
177 | addGenerationPrompt: params.add_generation_prompt,
178 | templateVars: params.template_vars,
179 | },
180 | );
181 |
182 | addTemplateMetadata(promptTemplate, params);
183 |
184 | try {
185 | const generator = model.generateGen(
186 | requestId,
187 | prompt,
188 | params,
189 | abortController.signal,
190 | );
191 |
192 | for await (const chunk of generator) {
193 | const streamChunk = createStreamChunk(
194 | chunk,
195 | model.path.name,
196 | cmplId,
197 | );
198 |
199 | await stream.writeSSE({
200 | data: JSON.stringify(streamChunk),
201 | });
202 |
203 | // Write usage stats if user requests it
204 | if (
205 | params.stream_options?.include_usage && chunk.kind === "finish"
206 | ) {
207 | const usageChunk = createUsageChunk(
208 | chunk,
209 | model.path.name,
210 | cmplId,
211 | );
212 |
213 | await stream.writeSSE({
214 | data: JSON.stringify(usageChunk),
215 | });
216 | }
217 | }
218 |
219 | logger.info(`Finished streaming chat completion request ${requestId}`);
220 | } catch (error) {
221 | await stream.writeSSE({
222 | data: JSON.stringify(toGeneratorError(error)),
223 | });
224 | }
225 |
226 | finished = true;
227 | }
228 |
229 | export async function generateChatCompletion(
230 | requestId: string,
231 | params: ChatCompletionRequest,
232 | model: Model,
233 | promptTemplate: PromptTemplate,
234 | requestSignal: AbortSignal,
235 | ) {
236 | logger.info(`Received chat completion request ${requestId}`);
237 |
238 | const prompt = applyChatTemplate(
239 | model,
240 | promptTemplate,
241 | params.messages,
242 | {
243 | addGenerationPrompt: params.add_generation_prompt,
244 | templateVars: params.template_vars,
245 | },
246 | );
247 |
248 | addTemplateMetadata(promptTemplate, params);
249 |
250 | // Handle generation in the common function
251 | const gen = await staticGenerate(
252 | requestId,
253 | GenerationType.ChatCompletion,
254 | prompt,
255 | params,
256 | model,
257 | requestSignal,
258 | );
259 | const response = createResponse(gen, model.path.name);
260 |
261 | return response;
262 | }
263 |
--------------------------------------------------------------------------------
/api/OAI/utils/completion.ts:
--------------------------------------------------------------------------------
1 | import { SSEStreamingApi } from "hono/streaming";
2 |
3 | import {
4 | convertFinishReason,
5 | createUsageStats,
6 | GenerationType,
7 | staticGenerate,
8 | } from "@/api/OAI/utils/generation.ts";
9 | import { Model } from "@/bindings/bindings.ts";
10 | import { GenerationChunk } from "@/bindings/types.ts";
11 | import { CancellationError } from "@/common/errors.ts";
12 | import { toGeneratorError } from "@/common/networking.ts";
13 | import { logger } from "@/common/logging.ts";
14 | import {
15 | CompletionRequest,
16 | CompletionRespChoice,
17 | CompletionResponse,
18 | } from "../types/completions.ts";
19 |
20 | function createResponse(chunk: GenerationChunk, modelName: string) {
21 | const finishReason = chunk.kind === "finish"
22 | ? convertFinishReason(chunk)
23 | : undefined;
24 | const choice = CompletionRespChoice.parse({
25 | text: chunk.text,
26 | finish_reason: finishReason,
27 | });
28 |
29 | const usage = chunk.kind === "finish" ? createUsageStats(chunk) : undefined;
30 |
31 | const response = CompletionResponse.parse({
32 | choices: [choice],
33 | model: modelName,
34 | usage,
35 | });
36 |
37 | return response;
38 | }
39 |
40 | export async function streamCompletion(
41 | requestId: string,
42 | stream: SSEStreamingApi,
43 | params: CompletionRequest,
44 | model: Model,
45 | requestSignal: AbortSignal,
46 | ) {
47 | logger.info(`Received streaming completion request ${requestId}`);
48 |
49 | const abortController = new AbortController();
50 | let finished = false;
51 |
52 | // If an abort happens before streaming starts
53 | requestSignal.addEventListener("abort", () => {
54 | if (!finished) {
55 | abortController.abort(
56 | new CancellationError(
57 | `Streaming completion ${requestId} cancelled by user.`,
58 | ),
59 | );
60 | finished = true;
61 | }
62 | });
63 |
64 | try {
65 | const generator = model.generateGen(
66 | requestId,
67 | params.prompt,
68 | params,
69 | abortController.signal,
70 | );
71 |
72 | for await (const chunk of generator) {
73 | const streamChunk = createResponse(chunk, model.path.name);
74 |
75 | await stream.writeSSE({
76 | data: JSON.stringify(streamChunk),
77 | });
78 | }
79 |
80 | logger.info(`Finished streaming completion request ${requestId}`);
81 | } catch (error) {
82 | await stream.writeSSE({
83 | data: JSON.stringify(toGeneratorError(error)),
84 | });
85 | }
86 |
87 | finished = true;
88 | }
89 |
90 | export async function generateCompletion(
91 | requestId: string,
92 | params: CompletionRequest,
93 | model: Model,
94 | requestSignal: AbortSignal,
95 | ) {
96 | logger.info(`Received completion request ${requestId}`);
97 |
98 | // Handle generation in the common function
99 | const gen = await staticGenerate(
100 | requestId,
101 | GenerationType.Completion,
102 | params.prompt,
103 | params,
104 | model,
105 | requestSignal,
106 | );
107 |
108 | const response = createResponse(gen, model.path.name);
109 |
110 | return response;
111 | }
112 |
--------------------------------------------------------------------------------
/api/OAI/utils/generation.ts:
--------------------------------------------------------------------------------
1 | import { UsageStats } from "@/api/OAI/types/completions.ts";
2 | import { Model } from "@/bindings/bindings.ts";
3 | import { FinishChunk, ReadbackFinishReason } from "@/bindings/types.ts";
4 | import { logger } from "@/common/logging.ts";
5 | import { BaseSamplerRequest } from "@/common/sampling.ts";
6 | import { toHttpException } from "@/common/networking.ts";
7 | import { CancellationError } from "@/common/errors.ts";
8 |
9 | export enum GenerationType {
10 | Completion = "Completion",
11 | ChatCompletion = "Chat completion",
12 | }
13 |
14 | export function createUsageStats(chunk: FinishChunk) {
15 | const usage = UsageStats.parse({
16 | prompt_tokens: chunk.promptTokens,
17 | completion_tokens: chunk.genTokens,
18 | total_tokens: chunk.promptTokens + chunk.genTokens,
19 | });
20 |
21 | return usage;
22 | }
23 |
24 | export function convertFinishReason(chunk: FinishChunk) {
25 | return chunk.finishReason === ReadbackFinishReason.MaxNewTokens
26 | ? "length"
27 | : "stop";
28 | }
29 |
30 | export async function staticGenerate(
31 | requestId: string,
32 | genType: GenerationType,
33 | prompt: string,
34 | params: BaseSamplerRequest,
35 | model: Model,
36 | requestSignal: AbortSignal,
37 | ) {
38 | const abortController = new AbortController();
39 | let finished = false;
40 |
41 | requestSignal.addEventListener("abort", () => {
42 | if (!finished) {
43 | abortController.abort(
44 | new CancellationError(
45 | `${genType} ${requestId} cancelled by user.`,
46 | ),
47 | );
48 | finished = true;
49 | }
50 | });
51 |
52 | try {
53 | const result = await model.generate(
54 | requestId,
55 | prompt,
56 | params,
57 | abortController.signal,
58 | );
59 |
60 | logger.info(`Finished ${genType.toLowerCase()} request ${requestId}`);
61 |
62 | finished = true;
63 | return result;
64 | } catch (error) {
65 | throw toHttpException(error);
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/api/core/router.ts:
--------------------------------------------------------------------------------
1 | import { Hono } from "hono";
2 | import { HTTPException } from "hono/http-exception";
3 | import { describeRoute } from "hono-openapi";
4 | import { validator as sValidator } from "hono-openapi";
5 | import { AuthPermissionResponse } from "@/api/core/types/auth.ts";
6 | import { HealthSchema } from "@/api/core/types/health.ts";
7 | import { applyChatTemplate } from "@/api/OAI/utils/chatCompletion.ts";
8 | import {
9 | ModelCard,
10 | ModelList,
11 | ModelLoadRequest,
12 | } from "@/api/core/types/model.ts";
13 | import {
14 | TemplateList,
15 | TemplateSwitchRequest,
16 | } from "@/api/core/types/template.ts";
17 | import {
18 | TokenDecodeRequest,
19 | TokenDecodeResponse,
20 | TokenEncodeRequest,
21 | TokenEncodeResponse,
22 | } from "@/api/core/types/token.ts";
23 | import { AuthKeyPermission, getAuthPermission } from "@/common/auth.ts";
24 | import { ModelConfig } from "@/common/configModels.ts";
25 | import { config } from "@/common/config.ts";
26 | import { logger } from "@/common/logging.ts";
27 | import * as modelContainer from "@/common/modelContainer.ts";
28 | import { jsonContent, toHttpException } from "@/common/networking.ts";
29 | import { PromptTemplate } from "@/common/templating.ts";
30 |
31 | import authMiddleware from "../middleware/authMiddleware.ts";
32 | import checkModelMiddleware from "../middleware/checkModelMiddleware.ts";
33 |
34 | const router = new Hono();
35 |
36 | const healthRoute = describeRoute({
37 | responses: {
38 | 200: jsonContent(HealthSchema, "Health status of server"),
39 | },
40 | });
41 |
42 | router.get(
43 | "/health",
44 | healthRoute,
45 | checkModelMiddleware,
46 | (c) => {
47 | return c.json(HealthSchema.parse({ health: "ok" }));
48 | },
49 | );
50 |
51 | const modelsRoute = describeRoute({
52 | responses: {
53 | 200: jsonContent(ModelList, "List of models in directory"),
54 | },
55 | });
56 |
57 | router.on(
58 | "GET",
59 | ["/v1/models", "/v1/model/list"],
60 | modelsRoute,
61 | authMiddleware(AuthKeyPermission.API),
62 | async (c) => {
63 | const modelCards: ModelCard[] = [];
64 | for await (const file of Deno.readDir(config.model.model_dir)) {
65 | if (!file.name.endsWith(".gguf")) {
66 | continue;
67 | }
68 |
69 | const modelCard = ModelCard.parse({
70 | id: file.name.replace(".gguf", ""),
71 | });
72 |
73 | modelCards.push(modelCard);
74 | }
75 |
76 | const modelList = ModelList.parse({
77 | data: modelCards,
78 | });
79 |
80 | return c.json(modelList);
81 | },
82 | );
83 |
84 | const currentModelRoute = describeRoute({
85 | responses: {
86 | 200: jsonContent(
87 | ModelCard,
88 | "The currently loaded model (if it exists)",
89 | ),
90 | },
91 | });
92 |
93 | router.get(
94 | "/v1/model",
95 | currentModelRoute,
96 | authMiddleware(AuthKeyPermission.API),
97 | checkModelMiddleware,
98 | (c) => {
99 | const modelCard = ModelCard.parse({
100 | id: c.var.model.path.base,
101 | });
102 |
103 | return c.json(modelCard);
104 | },
105 | );
106 |
107 | const loadModelRoute = describeRoute({
108 | responses: {
109 | 200: {
110 | description: "Model successfully loaded",
111 | },
112 | },
113 | });
114 |
115 | // TODO: Make this a streaming response if necessary
116 | router.post(
117 | "/v1/model/load",
118 | loadModelRoute,
119 | authMiddleware(AuthKeyPermission.Admin),
120 | sValidator("json", ModelLoadRequest),
121 | async (c) => {
122 | const params = c.req.valid("json");
123 | const loadParams = ModelConfig.parse({
124 | ...params,
125 | model_dir: config.model.model_dir,
126 | });
127 |
128 | // Makes sure the event doesn't fire multiple times
129 | let finished = false;
130 |
131 | // Abort handler
132 | const progressAbort = new AbortController();
133 | c.req.raw.signal.addEventListener("abort", () => {
134 | if (!finished) {
135 | progressAbort.abort();
136 | }
137 | });
138 |
139 | const progressCallback = (_progress: number): boolean => {
140 | if (progressAbort.signal.aborted) {
141 | logger.error("Load request cancelled");
142 | return false;
143 | }
144 |
145 | return true;
146 | };
147 |
148 | // Load the model and re-raise errors
149 | try {
150 | await modelContainer.loadModel(loadParams, progressCallback);
151 | } catch (error) {
152 | if (error instanceof Error) {
153 | throw new HTTPException(422, error);
154 | }
155 | }
156 |
157 | finished = true;
158 |
159 | c.status(200);
160 | return c.body(null);
161 | },
162 | );
163 |
164 | const unloadRoute = describeRoute({
165 | responses: {
166 | 200: {
167 | description: "Model successfully unloaded",
168 | },
169 | },
170 | });
171 |
172 | router.post(
173 | "/v1/model/unload",
174 | unloadRoute,
175 | authMiddleware(AuthKeyPermission.Admin),
176 | checkModelMiddleware,
177 | async (c) => {
178 | await modelContainer.unloadModel(true);
179 |
180 | c.status(200);
181 | return c.body(null);
182 | },
183 | );
184 |
185 | const templatesRoute = describeRoute({
186 | responses: {
187 | 200: jsonContent(TemplateList, "List of prompt templates"),
188 | },
189 | });
190 |
191 | router.on(
192 | "GET",
193 | ["/v1/templates", "/v1/template/list"],
194 | templatesRoute,
195 | authMiddleware(AuthKeyPermission.API),
196 | async (c) => {
197 | const templates: string[] = [];
198 | for await (const file of Deno.readDir("templates")) {
199 | if (!file.name.endsWith(".jinja")) {
200 | continue;
201 | }
202 |
203 | templates.push(file.name.replace(".jinja", ""));
204 | }
205 |
206 | const templateList = TemplateList.parse({
207 | data: templates,
208 | });
209 |
210 | return c.json(templateList);
211 | },
212 | );
213 |
214 | const templateSwitchRoute = describeRoute({
215 | responses: {
216 | 200: {
217 | description: "Prompt template switched",
218 | },
219 | },
220 | });
221 |
222 | router.post(
223 | "/v1/template/switch",
224 | templateSwitchRoute,
225 | authMiddleware(AuthKeyPermission.API),
226 | checkModelMiddleware,
227 | sValidator("json", TemplateSwitchRequest),
228 | async (c) => {
229 | const params = c.req.valid("json");
230 |
231 | const templatePath = `templates/${params.prompt_template_name}`;
232 | c.var.model.promptTemplate = await PromptTemplate.fromFile(
233 | templatePath,
234 | );
235 | },
236 | );
237 |
238 | const authPermissionRoute = describeRoute({
239 | responses: {
240 | 200: jsonContent(
241 | AuthPermissionResponse,
242 | "Returns permissions of a given auth key",
243 | ),
244 | },
245 | });
246 |
247 | router.get(
248 | "/v1/auth/permission",
249 | authPermissionRoute,
250 | authMiddleware(AuthKeyPermission.API),
251 | (c) => {
252 | try {
253 | const permission = getAuthPermission(c.req.header());
254 | const response = AuthPermissionResponse.parse({
255 | permission,
256 | });
257 |
258 | return c.json(response);
259 | } catch (error) {
260 | throw toHttpException(error, 400);
261 | }
262 | },
263 | );
264 |
265 | const tokenEncodeRoute = describeRoute({
266 | responses: {
267 | 200: jsonContent(TokenEncodeResponse, "Encode token response"),
268 | },
269 | });
270 |
271 | router.post(
272 | "/v1/token/encode",
273 | tokenEncodeRoute,
274 | authMiddleware(AuthKeyPermission.API),
275 | checkModelMiddleware,
276 | sValidator("json", TokenEncodeRequest),
277 | async (c) => {
278 | const params = c.req.valid("json");
279 |
280 | let text: string;
281 | if (typeof params.text === "string") {
282 | text = params.text;
283 | } else if (Array.isArray(params.text)) {
284 | if (!c.var.model.promptTemplate) {
285 | throw new HTTPException(422, {
286 | message: "Cannot tokenize chat completion " +
287 | "because a prompt template is not set",
288 | });
289 | }
290 |
291 | text = applyChatTemplate(
292 | c.var.model,
293 | c.var.model.promptTemplate,
294 | params.text,
295 | {
296 | addBosToken: params.add_bos_token,
297 | addGenerationPrompt: false,
298 | },
299 | );
300 | } else {
301 | throw new HTTPException(422, {
302 | message: "Unable to tokenize the provided text. " +
303 | "Check your formatting?",
304 | });
305 | }
306 |
307 | const tokens = await c.var.model.tokenizer.tokenize(
308 | text,
309 | params.add_bos_token,
310 | params.encode_special_tokens,
311 | );
312 |
313 | const resp = TokenEncodeResponse.parse({
314 | tokens,
315 | length: tokens.length,
316 | });
317 |
318 | return c.json(resp);
319 | },
320 | );
321 |
322 | const tokenDecodeRoute = describeRoute({
323 | responses: {
324 | 200: jsonContent(TokenDecodeResponse, "Decode token response"),
325 | },
326 | });
327 |
328 | router.post(
329 | "/v1/token/decode",
330 | tokenDecodeRoute,
331 | authMiddleware(AuthKeyPermission.API),
332 | checkModelMiddleware,
333 | sValidator("json", TokenDecodeRequest),
334 | async (c) => {
335 | const params = c.req.valid("json");
336 |
337 | const text = await c.var.model.tokenizer.detokenize(
338 | params.tokens,
339 | undefined,
340 | params.add_bos_token,
341 | params.decode_special_tokens,
342 | );
343 |
344 | const resp = TokenDecodeResponse.parse({
345 | text,
346 | });
347 |
348 | return c.json(resp);
349 | },
350 | );
351 |
352 | export default router;
353 |
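A usage sketch for the admin-gated model routes above: loading a model from the configured `model_dir` and unloading it again. Host, key, and model name are placeholders; the header name follows the `x-<permission>-key` pattern checked by the auth middleware:

```sh
# Load a model by name from model_dir (placeholder value)
curl http://localhost:5000/v1/model/load \
  -H "Content-Type: application/json" \
  -H "x-admin-key: YOUR_ADMIN_KEY" \
  -d '{"model_name": "your-model"}'

# Unload the currently loaded model
curl -X POST http://localhost:5000/v1/model/unload \
  -H "x-admin-key: YOUR_ADMIN_KEY"
```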
--------------------------------------------------------------------------------
/api/core/types/auth.ts:
--------------------------------------------------------------------------------
1 | import * as z from "@/common/myZod.ts";
2 |
3 | export const AuthPermissionResponse = z.object({
4 | permission: z.string(),
5 | });
6 |
--------------------------------------------------------------------------------
/api/core/types/health.ts:
--------------------------------------------------------------------------------
1 | import * as z from "@/common/myZod.ts";
2 |
3 | export const HealthSchema = z.object({
4 | health: z.enum(["ok", "unhealthy"]),
5 | });
6 |
--------------------------------------------------------------------------------
/api/core/types/model.ts:
--------------------------------------------------------------------------------
1 | import * as z from "@/common/myZod.ts";
2 | import { ModelConfig } from "@/common/configModels.ts";
3 | import { applyLoadDefaults } from "@/common/modelContainer.ts";
4 |
5 | export const ModelLoadRequest = z.preprocess(
6 | (data: unknown) => applyLoadDefaults(data),
7 | ModelConfig.extend({
8 | model_name: z.string(),
9 | }).omit({
10 | model_dir: true,
11 | use_as_default: true,
12 | }),
13 | );
14 |
15 | export const ModelCard = z.object({
16 | id: z.string().default("test"),
17 | object: z.string().default("model"),
18 | created: z.number().default(Date.now()),
19 | owned_by: z.string().default("YALS"),
20 | });
21 |
22 | export type ModelCard = z.infer<typeof ModelCard>;
23 |
24 | export const ModelList = z.object({
25 | object: z.string().default("list"),
26 | data: z.array(ModelCard).default([]),
27 | });
28 |
29 | export type ModelList = z.infer<typeof ModelList>;
30 |
--------------------------------------------------------------------------------
/api/core/types/template.ts:
--------------------------------------------------------------------------------
1 | import * as z from "@/common/myZod.ts";
2 |
3 | export const TemplateList = z.object({
4 | object: z.string().default("list"),
5 | data: z.array(z.string()).default([]),
6 | });
7 |
8 | export const TemplateSwitchRequest = z.aliasedObject(
9 | z.object({
10 | prompt_template_name: z.string(),
11 | }),
12 | [{ field: "prompt_template_name", aliases: ["name"] }],
13 | );
14 |
--------------------------------------------------------------------------------
/api/core/types/token.ts:
--------------------------------------------------------------------------------
1 | import * as z from "@/common/myZod.ts";
2 | import { ChatCompletionMessage } from "@/api/OAI/types/chatCompletions.ts";
3 |
4 | const CommonTokenRequest = z.object({
5 | add_bos_token: z.boolean().nullish().coalesce(true),
6 | encode_special_tokens: z.boolean().nullish().coalesce(true),
7 | decode_special_tokens: z.boolean().nullish().coalesce(true),
8 | });
9 |
10 | export const TokenEncodeRequest = z.object({
11 | text: z.union([z.string(), z.array(ChatCompletionMessage)]),
12 | })
13 | .merge(CommonTokenRequest);
14 |
15 | export const TokenEncodeResponse = z.object({
16 | tokens: z.array(z.number()),
17 | length: z.number(),
18 | });
19 |
20 | export const TokenDecodeRequest = z.object({
21 | tokens: z.array(z.number()),
22 | })
23 | .merge(CommonTokenRequest);
24 |
25 | export const TokenDecodeResponse = z.object({
26 | text: z.string(),
27 | });
28 |
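29 | // Illustrative request bodies accepted by these schemas (endpoint paths and values are hypothetical):
30 | //
31 | //   POST /v1/token/encode
32 | //   { "text": "Hello world", "add_bos_token": true, "encode_special_tokens": true }
33 | //
34 | //   POST /v1/token/decode
35 | //   { "tokens": [1, 15043, 3186], "decode_special_tokens": false }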
--------------------------------------------------------------------------------
/api/middleware/authMiddleware.ts:
--------------------------------------------------------------------------------
1 | import { HTTPException } from "hono/http-exception";
2 | import { createMiddleware } from "hono/factory";
3 | import { AuthKeyPermission, authKeys } from "@/common/auth.ts";
4 | import { config } from "@/common/config.ts";
5 |
6 | // Middleware for verifying API key permissions on incoming requests
7 | // Accepts either an x-<permission>-key header or an Authorization: Bearer token
8 | const authMiddleware = (permission: AuthKeyPermission) => {
9 | return createMiddleware(async (c, next) => {
10 | if (config.network.disable_auth) {
11 | await next();
12 | return;
13 | }
14 |
15 | const headers = c.req.header();
16 | const xHeader = `x-${permission.toLowerCase()}-key`;
17 |
18 | // TODO: Possibly refactor error throws
19 | if (xHeader in headers) {
20 | const valid = authKeys?.verifyKey(headers[xHeader], permission);
21 | if (!valid) {
22 | throw new HTTPException(401, {
23 | message: `Invalid ${permission} key`,
24 | });
25 | }
26 | } else if ("authorization" in headers) {
27 | const splitKey = headers["authorization"].split(" ");
28 | if (splitKey.length < 2) {
29 | throw new HTTPException(401, {
30 | message: `Invalid ${permission} key`,
31 | });
32 | }
33 |
34 | const valid = splitKey[0].toLowerCase() === "bearer" &&
35 | authKeys?.verifyKey(splitKey[1], permission);
36 |
37 | if (!valid) {
38 | throw new HTTPException(401, {
39 | message: `Invalid ${permission} key`,
40 | });
41 | }
42 | } else {
43 | throw new HTTPException(401, { message: "Key not provided" });
44 | }
45 |
46 | await next();
47 | });
48 | };
49 |
50 | export default authMiddleware;
51 |
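52 | // Client-side sketch of the two credential forms this middleware accepts
53 | // (endpoint, host, and key are hypothetical):
54 | //
55 | //   await fetch("http://<host>:<port>/v1/models", {
56 | //       headers: { "x-api-key": "<api key>" },
57 | //   });
58 | //
59 | //   await fetch("http://<host>:<port>/v1/models", {
60 | //       headers: { "Authorization": "Bearer <api key>" },
61 | //   });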
--------------------------------------------------------------------------------
/api/middleware/checkModelMiddleware.ts:
--------------------------------------------------------------------------------
1 | import { createMiddleware } from "hono/factory";
2 |
3 | import { Model } from "@/bindings/bindings.ts";
4 | import { ModelNotLoadedError } from "@/common/errors.ts";
5 | import { model } from "@/common/modelContainer.ts";
6 |
7 | // Extra vars for context
8 | interface CtxOptions {
9 | Variables: {
10 | model: Model;
11 | };
12 | }
13 |
14 | // Middleware for checking if the model exists
15 | // Sends a validated version of the model via Hono's ctx
16 | const checkModelMiddleware = createMiddleware<CtxOptions>(
17 | async (c, next) => {
18 | if (!model) {
19 | throw new ModelNotLoadedError();
20 | }
21 |
22 | // Validated reference
23 | c.set("model", model);
24 |
25 | await next();
26 | },
27 | );
28 |
29 | export default checkModelMiddleware;
30 |
--------------------------------------------------------------------------------
/api/middleware/requestLogMiddleware.ts:
--------------------------------------------------------------------------------
1 | import { createMiddleware } from "hono/factory";
2 | import { logger } from "../../common/logging.ts";
3 |
4 | // Middleware for logging parts of a request
5 | const requestLogMiddleware = createMiddleware(
6 | async (c, next) => {
7 | const logMessage = [
8 | `Information for ${c.req.method} request ${c.var.requestId}`,
9 | ];
10 |
11 | logMessage.push(`URL: ${c.req.url}`);
12 |
13 | const headers = Object.fromEntries(c.req.raw.headers);
14 | logMessage.push(`Headers: ${JSON.stringify(headers, null, 2)}`);
15 |
16 | if (c.req.method !== "GET") {
17 | const clonedReq = c.req.raw.clone();
18 | const textBody = await clonedReq.text();
19 |
20 | if (textBody) {
21 | logMessage.push(`Body: ${textBody}`);
22 | }
23 | }
24 |
25 | logger.info(logMessage.join("\n"));
26 |
27 | await next();
28 | },
29 | );
30 |
31 | export default requestLogMiddleware;
32 |
--------------------------------------------------------------------------------
/api/server.ts:
--------------------------------------------------------------------------------
1 | import { Hono } from "hono";
2 | import { cors } from "hono/cors";
3 | import { requestId } from "hono/request-id";
4 | import { logger as loggerMiddleware } from "hono/logger";
5 | import { ContentfulStatusCode } from "hono/utils/http-status";
6 | import { openAPISpecs } from "hono-openapi";
7 | import { apiReference } from "@scalar/hono-api-reference";
8 |
9 | import { config } from "@/common/config.ts";
10 | import { logger } from "@/common/logging.ts";
11 | import core from "./core/router.ts";
12 | import oai from "./OAI/router.ts";
13 | import { generateUuidHex } from "@/common/utils.ts";
14 | import { ModelNotLoadedError } from "@/common/errors.ts";
15 | import requestLogMiddleware from "./middleware/requestLogMiddleware.ts";
16 |
17 | export function createApi() {
18 | const app = new Hono();
19 |
20 | // TODO: Use a custom middleware instead of overriding Hono's logger
21 | const printToLogger = (message: string, ...rest: string[]) => {
22 | logger.info(message, { rest });
23 | };
24 |
25 | // Middleware
26 | app.use(loggerMiddleware(printToLogger));
27 | app.use("*", cors());
28 | app.use(requestId({ limitLength: 16, generator: generateUuidHex }));
29 |
30 | if (config.logging.log_requests) {
31 | app.use(requestLogMiddleware);
32 | }
33 |
34 | // Add routers
35 | app.route("/", core);
36 | app.route("/", oai);
37 |
38 | // OpenAPI documentation
39 | app.get(
40 | "/openapi.json",
41 | openAPISpecs(app, {
42 | documentation: {
43 | openapi: "3.0.0",
44 | info: {
45 | version: "0.0.1",
46 | title: "YALS",
47 | },
48 | },
49 | }),
50 | );
51 |
52 | app.get(
53 | "/docs",
54 | apiReference({
55 | spec: {
56 | url: "/openapi.json",
57 | },
58 | }),
59 | );
60 |
61 | // Error handling
62 | // Originally from the Stoker package
63 | app.onError((err, c) => {
64 | const currentStatus = "status" in err
65 | ? err.status
66 | : c.newResponse(null).status;
67 | const statusCode = currentStatus != 200
68 | ? (currentStatus as ContentfulStatusCode)
69 | : 500;
70 |
71 | const logError = !(
72 | statusCode === 401
73 | );
74 |
75 | // Only log in console if the error allows it
76 | if (logError) {
77 | const messageOnly = statusCode === 408 ||
78 | err instanceof ModelNotLoadedError;
79 |
80 | if (messageOnly) {
81 | logger.error(`Sent to request: ${err.message}`);
82 | } else {
83 | logger.error(`Sent to request: ${err.stack || err.message}`);
84 | }
85 | }
86 |
87 | // Always send error + message to client
88 | return c.json({
89 | detail: err.message,
90 | }, statusCode);
91 | });
92 |
93 | app.notFound((c) => {
94 | return c.json({
95 | message: `Method or path not found - ${c.req.method} ${c.req.path}`,
96 | }, 404);
97 | });
98 |
99 | // Serve
100 | Deno.serve({
101 | hostname: config.network.host,
102 | port: config.network.port,
103 | handler: app.fetch,
104 | onListen: ({ hostname, port }) => {
105 | logger.info(`Server running on http://${hostname}:${port}`);
106 | },
107 | });
108 | }
109 |
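110 | // Every error handled above reaches the client with the same JSON shape
111 | // (message value illustrative):
112 | //
113 | //   { "detail": "<err.message>" }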
--------------------------------------------------------------------------------
/assets/icon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/theroyallab/YALS/60286959be95d577e05efdf33ba6733395d60020/assets/icon.ico
--------------------------------------------------------------------------------
/assets/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/theroyallab/YALS/60286959be95d577e05efdf33ba6733395d60020/assets/icon.png
--------------------------------------------------------------------------------
/bindings/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.14.0)
2 | project(LlamaMultiUserInference)
3 | set(CMAKE_CXX_STANDARD 17)
4 |
5 | option(LLGUIDANCE "Enable LLGuidance support (requires Rust)" OFF)
6 |
7 | # Set RPath for Apple and Unix systems
8 | if (APPLE)
9 | set(CMAKE_INSTALL_RPATH "@loader_path")
10 | set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
11 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
12 | elseif (UNIX)
13 | set(CMAKE_INSTALL_RPATH "$ORIGIN")
14 | set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
15 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
16 | endif()
17 |
18 | if (DEFINED LLAMACPP_REPO OR DEFINED LLAMACPP_COMMIT)
19 | message(STATUS "Using a custom commit or repo for llama.cpp. Build might not work as expected. Here be dragons!")
20 | endif()
21 |
22 | # Do not cache these variables with subsequent builds
23 | set(LLAMACPP_REPO "https://github.com/ggerganov/llama.cpp.git")
24 | message(STATUS "Using llama.cpp repo ${LLAMACPP_REPO}")
25 |
26 | # Stable llama.cpp commit for bindings
27 | set(LLAMACPP_COMMIT "7675c555a13c9f473249e59a54db35032ce8e0fc")
28 | message(STATUS "Using llama.cpp commit ${LLAMACPP_COMMIT}")
29 |
30 | # Optional: You can also enable mixed FP16/FP32 computation for faster processing
31 | # set(LLAMA_CUDA_F16 ON CACHE BOOL "llama.cpp: use float16 for GPU operations" FORCE)
32 | # set(GGML_CUDA ON CACHE BOOL "llama.cpp: enable the CUDA backend" FORCE)
33 |
34 | # Disable unused components to speed up build
35 | set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama.cpp: build examples" FORCE)
36 | set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama.cpp: build tests" FORCE)
37 | set(LLAMA_BUILD_SERVER OFF CACHE BOOL "llama.cpp: build server" FORCE)
38 | set(LLAMA_CURL OFF CACHE BOOL "llama.cpp: use libcurl" FORCE)
39 |
40 | # Enable common
41 | set(LLAMA_BUILD_COMMON ON CACHE BOOL "llama.cpp: build common utils library" FORCE)
42 |
43 | if(LLGUIDANCE)
44 | find_program(CARGO cargo)
45 | if(CARGO)
46 | message(STATUS "Including LLGuidance in build")
47 | set(LLAMA_LLGUIDANCE ON CACHE BOOL "llama.cpp: enable LLGuidance support" FORCE)
48 | else()
49 | message(FATAL_ERROR "LLGuidance is enabled, but requires Rust for compilation. Get it at https://rustup.rs")
50 | endif()
51 | else()
52 | message(STATUS "LLGuidance support is disabled. Enable with -DLLGUIDANCE=ON for grammar, JSON schema, and regex support.")
53 | set(LLAMA_LLGUIDANCE OFF CACHE BOOL "llama.cpp: disable LLGuidance support" FORCE)
54 | endif()
55 |
56 | # Fetch llama.cpp latest
57 | # FIXME: Maybe use a vendored llama.cpp build for stability
58 | include(FetchContent)
59 | FetchContent_Declare(
60 | llama
61 | GIT_REPOSITORY ${LLAMACPP_REPO}
62 | GIT_TAG ${LLAMACPP_COMMIT}
63 | )
64 |
65 | # Set build type to Release for performance
66 | set(CMAKE_BUILD_TYPE Release)
67 |
68 | # Build all libs to bin
69 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
70 |
71 | # Make llama.cpp available
72 | FetchContent_MakeAvailable(llama)
73 |
74 | message(STATUS "llama source dir: ${llama_SOURCE_DIR}")
75 |
76 | # Apple build changes
77 | # From llama-cpp-python
78 | if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
79 | # Need to disable these llama.cpp flags on Apple x86_64,
80 | # otherwise users may encounter invalid instruction errors
81 | set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE)
82 | set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE)
83 | set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE)
84 | set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE)
85 | endif()
86 |
87 | if (APPLE)
88 | set(GGML_METAL_EMBED_LIBRARY ON CACHE BOOL "llama: embed metal library" FORCE)
89 | endif()
90 |
91 | # Create a library from c_library.cpp
92 | add_library(c_library SHARED
93 | server/c_library.cpp
94 | )
95 |
96 | # Set include directories for the library
97 | target_include_directories(c_library PUBLIC
98 | ${CMAKE_CURRENT_SOURCE_DIR}
99 | ${CMAKE_CURRENT_SOURCE_DIR}/server
100 | ${llama_SOURCE_DIR}/src
101 | )
102 |
103 | # Link llama libraries to our c_library
104 | target_link_libraries(c_library PUBLIC llama common)
105 |
106 | # Create our main executable
107 | add_executable(multi_user_inference
108 | server/server_basic_example.cpp
109 | )
110 |
111 | # set_target_properties(multi_user_inference PROPERTIES
112 | # INSTALL_RPATH "${CMAKE_BINARY_DIR}/bin"
113 | # )
114 |
115 | # Include directories for main executable
116 | target_include_directories(multi_user_inference PRIVATE
117 | ${CMAKE_CURRENT_SOURCE_DIR}
118 | ${CMAKE_CURRENT_SOURCE_DIR}/server
119 | )
120 |
121 | # Link our c_library to the main executable
122 | target_link_libraries(multi_user_inference PRIVATE
123 | c_library
124 | )
125 |
126 | if(LLGUIDANCE)
127 | target_compile_definitions(c_library PUBLIC LLGUIDANCE_BUILT=1)
128 | endif()
129 |
130 | # Windows options
131 | if(WIN32)
132 | set_target_properties(c_library PROPERTIES
133 | WINDOWS_EXPORT_ALL_SYMBOLS TRUE
134 | )
135 | endif()
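136 |
137 | # Illustrative manual configure/build (normally driven by bindings.sh / bindings.ps1):
138 | #   cmake . -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DLLGUIDANCE=ON
139 | #   cmake --build build --config Release --target c_library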
--------------------------------------------------------------------------------
/bindings/bindings.ps1:
--------------------------------------------------------------------------------
1 | if (Get-Command cmake -ErrorAction SilentlyContinue) {
2 | Write-Host "Found CMake: $(cmake --version)"
3 | } else {
4 | Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
5 | Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools' -DevCmdArguments '-arch=x64 -host_arch=x64'
6 | }
7 |
8 | $jobs = if ($env:MAX_JOBS) {
9 | $env:MAX_JOBS
10 | } else {
11 | $env:NUMBER_OF_PROCESSORS
12 | }
13 |
14 | $extraCmakeArgs = @()
15 |
16 | # llama.cpp dev options
17 | if ($env:LLAMACPP_REPO) {
18 | $extraCmakeArgs += "-DLLAMACPP_REPO=$env:LLAMACPP_REPO"
19 | Write-Host "Using custom llama.cpp repo: $env:LLAMACPP_REPO"
20 | }
21 |
22 | if ($env:LLAMACPP_COMMIT) {
23 | $extraCmakeArgs += "-DLLAMACPP_COMMIT=$env:LLAMACPP_COMMIT"
24 | Write-Host "Using custom llama.cpp commit: $env:LLAMACPP_COMMIT"
25 | }
26 |
27 | if ($env:LLGUIDANCE -eq 1) {
28 | $env:RUSTC_WRAPPER="sccache"
29 | Write-Host "LLGuidance enabled, including in build"
30 | $extraCmakeArgs += "-DLLGUIDANCE=ON"
31 | }
32 |
33 | if ($env:GGML_CUDA -eq 1) {
34 | Write-Host "CUDA enabled, including in build"
35 |
36 | $extraCmakeArgs += "-DGGML_CUDA=ON"
37 |
38 | if ($env:CMAKE_CUDA_ARCHITECTURES) {
39 | $extraCmakeArgs += @(
40 | "-DCMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES",
41 | "-DGGML_NATIVE=OFF"
42 | )
43 | }
44 | }
45 |
46 | if ($env:GGML_VULKAN -eq 1) {
47 | Write-Host "Vulkan enabled, including in build"
48 |
49 | $extraCmakeArgs += "-DGGML_VULKAN=ON"
50 | }
51 |
52 | cmake . -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release $extraCmakeArgs
53 | cmake --build build --config Release --target c_library -j $jobs
54 | Copy-Item build/*.dll ../lib
55 | Copy-Item build/bin/*.dll ../lib
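56 |
57 | # Example invocations (illustrative; set env vars before running):
58 | #   $env:GGML_CUDA = 1; ./bindings.ps1
59 | #   $env:GGML_VULKAN = 1; $env:MAX_JOBS = 8; ./bindings.ps1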
--------------------------------------------------------------------------------
/bindings/bindings.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | OS=$(uname -s)
4 |
5 | # Set number of jobs for parallel build
6 | if [ -n "$MAX_JOBS" ]; then
7 | JOBS=$MAX_JOBS
8 | elif [ "$OS" = "Darwin" ]; then
9 | JOBS=$(sysctl -n hw.physicalcpu)
10 | else
11 | JOBS=$(nproc --all)
12 | fi
13 |
14 | # Initialize as empty array
15 | EXTRA_CMAKE_ARGS=()
16 |
17 | # llama.cpp dev options
18 | if [ -n "$LLAMACPP_REPO" ]; then
19 | EXTRA_CMAKE_ARGS+=("-DLLAMACPP_REPO=$LLAMACPP_REPO")
20 | echo "Using custom llama.cpp repo: ${LLAMACPP_REPO}"
21 | fi
22 |
23 | if [ -n "$LLAMACPP_COMMIT" ]; then
24 | EXTRA_CMAKE_ARGS+=("-DLLAMACPP_COMMIT=$LLAMACPP_COMMIT")
25 | echo "Using custom llama.cpp commit: ${LLAMACPP_COMMIT}"
26 | fi
27 |
28 | if [ "$LLGUIDANCE" = "1" ]; then
29 | export RUSTC_WRAPPER="sccache"
30 | EXTRA_CMAKE_ARGS+=("-DLLGUIDANCE=ON")
31 | echo "LLGuidance enabled, including in build"
32 | fi
33 |
34 | if [ "$GGML_CUDA" = "1" ]; then
35 | EXTRA_CMAKE_ARGS+=("-DGGML_CUDA=ON")
36 | echo "CUDA enabled, including in build"
37 |
38 | if [ -n "$CMAKE_CUDA_ARCHITECTURES" ]; then
39 | EXTRA_CMAKE_ARGS+=(
40 | "-DGGML_NATIVE=OFF" "-DCMAKE_CUDA_ARCHITECTURES=$CMAKE_CUDA_ARCHITECTURES"
41 | )
42 | fi
43 | fi
44 |
45 | if [ "$GGML_VULKAN" = "1" ]; then
46 | EXTRA_CMAKE_ARGS+=("-DGGML_VULKAN=ON")
47 | echo "Vulkan enabled, including in build"
48 | fi
49 |
50 | if [ "$GGML_HIP" = "1" ]; then
51 | EXTRA_CMAKE_ARGS+=("-DGGML_HIP=ON")
52 | echo "HIP enabled, including in build"
53 |
54 | if [ -n "$AMDGPU_TARGETS" ]; then
55 | EXTRA_CMAKE_ARGS+=(
56 | "-DAMDGPU_TARGETS=$AMDGPU_TARGETS"
57 | )
58 | fi
59 | fi
60 |
61 | # Join array elements with spaces
62 | CMAKE_ARGS="${EXTRA_CMAKE_ARGS[*]}"
63 |
64 | cmake . -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release ${CMAKE_ARGS}
65 | cmake --build build --config Release --target c_library -j ${JOBS}
66 |
67 | if [ "$OS" = "Darwin" ]; then
68 | echo "Copying .dylib files"
69 | cp build/bin/*.dylib ../lib
70 | elif [ "$OS" = "Linux" ]; then
71 | echo "Copying .so files"
72 | cp build/bin/*.so ../lib
73 | fi
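74 |
75 | # Example invocations (illustrative; pick the flags matching your hardware):
76 | #   GGML_CUDA=1 CMAKE_CUDA_ARCHITECTURES=86 ./bindings.sh
77 | #   GGML_VULKAN=1 MAX_JOBS=8 ./bindings.sh
78 | #   GGML_HIP=1 AMDGPU_TARGETS=gfx1100 ./bindings.sh
79 | #   LLGUIDANCE=1 ./bindings.sh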
--------------------------------------------------------------------------------
/bindings/generationResources.ts:
--------------------------------------------------------------------------------
1 | import { lib } from "./lib.ts";
2 | import { ReadbackBuffer } from "./readbackBuffer.ts";
3 |
4 | export class GenerationResources {
5 | private readbackBufferPtr: Deno.PointerValue;
6 |
7 | rawPtr: Deno.PointerValue;
8 | samplerPtr: Deno.PointerValue;
9 | readbackBuffer: ReadbackBuffer;
10 |
11 | constructor() {
12 | this.rawPtr = lib.symbols.generation_resources_make();
13 | if (!this.rawPtr) {
14 | throw new Error("Could not allocate shared resource bundle.");
15 | }
16 |
17 | const view = new Deno.UnsafePointerView(this.rawPtr);
18 | this.readbackBufferPtr = Deno.UnsafePointer.create(
19 | view.getBigUint64(0),
20 | );
21 | this.readbackBuffer = new ReadbackBuffer(this.readbackBufferPtr);
22 |
23 | this.samplerPtr = Deno.UnsafePointer.create(view.getBigUint64(8));
24 | }
25 |
26 | close() {
27 | lib.symbols.generation_resources_release(this.rawPtr);
28 | }
29 | }
30 |
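31 | // Struct layout assumed by the constructor above: the native bundle is read as two
32 | // consecutive 64-bit pointers, the readback buffer at offset 0 and the sampler chain at offset 8.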
--------------------------------------------------------------------------------
/bindings/grammar.ts:
--------------------------------------------------------------------------------
1 | import { hasLlguidance } from "@/bindings/lib.ts";
2 | import { SamplerBuilder } from "@/bindings/samplers.ts";
3 | import { logger } from "@/common/logging.ts";
4 |
5 | export class YALSGrammar {
6 | private sampler: SamplerBuilder;
7 |
8 | constructor(sampler: SamplerBuilder) {
9 | this.sampler = sampler;
10 | }
11 |
12 | BNF(grammar: string) {
13 | if (hasLlguidance) {
14 | this.sampler.llguidance(grammar);
15 | } else {
16 | logger.warn(
17 | "YALS was not built with LLGuidance. Using GBNF.",
18 | );
19 |
20 | this.sampler.grammar(grammar);
21 | }
22 | }
23 |
24 | jsonSchema(schema: Record<string, unknown>) {
25 | if (!hasLlguidance) {
26 | logger.warn(
27 | "YALS was not built with LLGuidance. Skipping JSON schema.",
28 | );
29 |
30 | return;
31 | }
32 |
33 | const grammarArray = ["start: json_object"];
34 | const schemaString = JSON.stringify(
35 | schema,
36 | null,
37 | 2,
38 | );
39 | grammarArray.push(`json_object: %json ${schemaString}`);
40 |
41 | this.sampler.llguidance(grammarArray.join("\n"));
42 | }
43 |
44 | regex(regex: string) {
45 | if (!hasLlguidance) {
46 | logger.warn(
47 | "YALS was not built with LLGuidance. Skipping Regex parsing.",
48 | );
49 |
50 | return;
51 | }
52 |
53 | const grammarArray = ["start: text"];
54 | grammarArray.push(`text: ${regex}`);
55 |
56 | this.sampler.llguidance(grammarArray.join("\n"));
57 | }
58 | }
59 |
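60 | // Usage sketch (hypothetical caller; `builder` is a SamplerBuilder for a loaded model):
61 | //
62 | //   const grammar = new YALSGrammar(builder);
63 | //   grammar.jsonSchema({ type: "object", properties: { mood: { type: "string" } } });
64 | //   grammar.regex("[0-9]{1,3}");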
--------------------------------------------------------------------------------
/bindings/job.ts:
--------------------------------------------------------------------------------
1 | import { lib } from "@/bindings/lib.ts";
2 | import { ReadbackBuffer } from "./readbackBuffer.ts";
3 | import { GenerationChunk } from "./types.ts";
4 |
5 | export class Job {
6 | // Private references
7 | private readbackBuffer: ReadbackBuffer;
8 | private processor: Deno.PointerValue;
9 |
10 | isComplete = false;
11 | id: number;
12 |
13 | constructor(
14 | id: number,
15 | readbackBuffer: ReadbackBuffer,
16 | processor: Deno.PointerValue,
17 | ) {
18 | this.id = id;
19 | this.readbackBuffer = readbackBuffer;
20 | this.processor = processor;
21 | }
22 |
23 | async *stream(): AsyncGenerator<GenerationChunk> {
24 | for await (const { text, token } of this.readbackBuffer.read()) {
25 | yield { kind: "data", text, token };
26 | }
27 |
28 | const status = await this.readbackBuffer.readStatus();
29 | if (status) {
30 | yield status;
31 | }
32 | }
33 |
34 | cancel() {
35 | if (this.isComplete) {
36 | return;
37 | }
38 |
39 | this.isComplete = true;
40 |
41 | lib.symbols.processor_cancel_work(
42 | this.processor,
43 | this.id,
44 | );
45 | }
46 | }
47 |
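48 | // Consumption sketch (hypothetical; `job` comes from the processor that queued the request):
49 | //
50 | //   for await (const chunk of job.stream()) {
51 | //       if (chunk.kind === "data") {
52 | //           console.log(chunk.text);
53 | //       } else {
54 | //           console.log("finish status:", chunk);
55 | //       }
56 | //   }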
--------------------------------------------------------------------------------
/bindings/lib.ts:
--------------------------------------------------------------------------------
1 | import libraryInterface from "./symbols.ts";
2 |
3 | export let lib: Deno.DynamicLibrary<typeof libraryInterface>;
4 | export let hasLlguidance: boolean = false;
5 |
6 | export function loadYalsBindings() {
7 | const libName = "c_library";
8 | const libDir = `${Deno.cwd()}/lib/`;
9 | let libPath = libDir;
10 |
11 | switch (Deno.build.os) {
12 | case "windows":
13 | Deno.env.set("PATH", `${Deno.env.get("PATH")};${libDir}`);
14 | libPath += `${libName}.dll`;
15 | break;
16 | case "linux":
17 | libPath += `lib${libName}.so`;
18 | break;
19 | case "darwin":
20 | libPath += `lib${libName}.dylib`;
21 | break;
22 | default:
23 | throw new Error(`Unsupported operating system: ${Deno.build.os}`);
24 | }
25 |
26 | try {
27 | lib = Deno.dlopen(libPath, libraryInterface);
28 | hasLlguidance = lib.symbols.has_llguidance();
29 | } catch (error: unknown) {
30 | console.error(
31 | `Failed to load YALS library: ${
32 | error instanceof Error ? error.message : String(error)
33 | }`,
34 | );
35 | throw error;
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/bindings/minimal_cpp_test.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include "common.h"
3 | #include "c_library.h"
4 | #include "llama.h"
5 |
6 | int main() {
7 | const auto idk = new float(0.0);
8 | const auto model = model_load(
9 | "/home/blackroot/Desktop/YALS/YALS/models/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b-Q6_K_L.gguf",
10 | 999,
11 | idk,
12 | nullptr
13 | );
14 |
15 | const auto ctx = ctx_make(model, 1024, 999, 512, false, -1, false, 0, 0, 0.0f);
16 | if (!model || !ctx) {
17 | std::cerr << "Failed to load model" << std::endl;
18 | return 1;
19 | }
20 |
21 | std::cout << "Model and context loaded successfully" << std::endl;
22 |
23 | auto sampler = sampler_make();
24 | sampler = sampler_temp(sampler, 2);
25 | sampler = sampler_dist(sampler, 1337);
26 |
27 | const auto processor = processor_make(model, ctx, 4);
28 |
29 | const auto readback_buffer = readback_create_buffer();
30 |
31 | const auto prompt = R"(<|im_start|>system
32 | Respond with *actions* *words* *thoughts* in a json format, with
33 | {
34 | "action" : ["first, second]",
35 | "mood" : "current mood from 20 mood choices",
36 | "magazine capacity" : "a number"
37 | }
38 | <|im_end|>
39 | <|im_start|>user
40 | Hi how are you?
41 | <|im_end|>
42 | <|im_start|>assistant
43 | )";
44 |
45 | auto lark_grammar = R"(
46 | // Define the start rule
47 | start: json_string
48 |
49 | // The exact JSON string with fixed format
50 | json_string: "{\n \"action\" : [\"" ACTION_CONTENT "\"],\n \"mood\" : \"" EMOTION "\",\n \"magazine capacity\" : \"" CAPACITY_CONTENT "\"\n}"
51 |
52 | // Content restrictions
53 | ACTION_CONTENT: /[a-zA-Z0-9 ,]{1,15}/
54 | CAPACITY_CONTENT: /[0-9]+( rounds| bullets| shots)?/
55 | EMOTION: "happy" | "sad" | "angry" | "excited" | "bored" | "anxious" | "calm" | "confused"
56 | | "curious" | "depressed" | "ecstatic" | "fearful" | "grateful" | "hopeful"
57 | | "irritated" | "jealous" | "peaceful" | "proud" | "surprised" | "tired"
58 | )";
59 |
60 | const char* seq[] = {"*"};
61 |
62 | processor_submit_work(
63 | processor,
64 | prompt,
65 | sampler,
66 | readback_buffer,
67 | 100,
68 | 0,
69 | 1337,
70 | nullptr,
71 | 0,
72 | nullptr,
73 | 0,
74 | nullptr,
75 | 0,
76 | lark_grammar);
77 |
78 | std::cout << "Starting model:" << std::endl;
79 | while (!readback_is_buffer_finished(readback_buffer)) {
80 | char* char_out;
81 | llama_token token;
82 | if (readback_read_next(readback_buffer, &char_out, &token)) {
83 | std::cout << char_out;
84 | std::cout.flush();
85 | }
86 | }
87 |
88 | const char* status = readback_read_status(readback_buffer);
89 | std::cout << status << std::endl;
90 |
91 | return 0;
92 | }
93 |
--------------------------------------------------------------------------------
/bindings/readbackBuffer.ts:
--------------------------------------------------------------------------------
1 | import { delay } from "@std/async/delay";
2 |
3 | import { logger } from "@/common/logging.ts";
4 | import { lib } from "./lib.ts";
5 | import { FinishChunk } from "@/bindings/types.ts";
6 |
7 | /**
8 | * ReadbackBuffer provides an interface to read generated tokens and text
9 | * from the LLM generation process.
10 | */
11 | export class ReadbackBuffer {
12 | private rawPtr: Deno.PointerValue;
13 |
14 | constructor(readbackPtr: Deno.PointerValue) {
15 | this.rawPtr = readbackPtr;
16 | }
17 |
18 | async *read() {
19 | while (!lib.symbols.readback_is_buffer_finished(this.rawPtr)) {
20 | const charBuf = new Uint8Array(8);
21 | const tokenBuf = new Int32Array(1);
22 |
23 | if (
24 | !await lib.symbols.readback_read_next(
25 | this.rawPtr,
26 | Deno.UnsafePointer.of(charBuf),
27 | Deno.UnsafePointer.of(tokenBuf),
28 | )
29 | ) {
30 | await delay(2);
31 | continue;
32 | }
33 |
34 | const ptrVal = new BigUint64Array(charBuf.buffer)[0];
35 | if (ptrVal === 0n) continue;
36 |
37 | const charPtr = Deno.UnsafePointer.create(ptrVal);
38 | if (!charPtr) continue;
39 |
40 | yield {
41 | text: new Deno.UnsafePointerView(charPtr).getCString(),
42 | token: tokenBuf[0],
43 | };
44 | }
45 | }
46 |
47 | /**
48 | * Reads the status information from the buffer
49 | * @returns A FinishChunk object or null if the status couldn't be read
50 | */
51 | async readStatus(): Promise<FinishChunk | null> {
52 | const statusPtr = await lib.symbols.readback_read_status(
53 | this.rawPtr,
54 | );
55 | if (!statusPtr) {
56 | return null;
57 | }
58 |
59 | const view = new Deno.UnsafePointerView(statusPtr);
60 | const statusStr = view.getCString();
61 |
62 | try {
63 | const status = JSON.parse(statusStr);
64 | return {
65 | ...status,
66 | kind: "finish",
67 | text: "",
68 | };
69 | } catch (e) {
70 | logger.error("Failed to parse status JSON:", e);
71 | return null;
72 | }
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/bindings/samplers.ts:
--------------------------------------------------------------------------------
1 | import { lib } from "./lib.ts";
2 | import { GenerationResources } from "./generationResources.ts";
3 |
4 | export interface LogitBias {
5 | token: number;
6 | bias: number;
7 | }
8 |
9 | export class SamplerBuilder {
10 | private sampler: Deno.PointerValue;
11 | private readonly model: Deno.PointerValue;
12 |
13 | constructor(
14 | model: Deno.PointerValue,
15 | resourceBundle: GenerationResources,
16 | ) {
17 | this.sampler = resourceBundle.samplerPtr;
18 | if (!this.sampler) {
19 | throw new Error("Failed to create sampler");
20 | }
21 | this.model = model;
22 | }
23 |
24 | /**
25 | * Adds distribution sampling with the specified seed
26 | * @param seed Random seed for sampling
27 | * @returns This builder instance for chaining
28 | */
29 | dist(seed: number): SamplerBuilder {
30 | this.sampler = lib.symbols.sampler_dist(this.sampler, seed);
31 | return this;
32 | }
33 |
34 | /**
35 | * Adds grammar-based sampling constraints
36 | * @param grammar Grammar definition as a string
37 | * (the grammar is expected to define its own "root" rule, per GBNF convention)
38 | * @returns This builder instance for chaining
39 | */
40 | grammar(grammar: string): SamplerBuilder {
41 | const grammarPtr = new TextEncoder().encode(grammar + "\0");
42 |
43 | this.sampler = lib.symbols.sampler_grammar(
44 | this.sampler,
45 | this.model,
46 | grammarPtr,
47 | );
48 |
49 | return this;
50 | }
51 |
52 | /**
53 | * Adds llguidance sampler
54 | * @param grammar Grammar definition as a string
55 | */
56 | llguidance(grammar: string): SamplerBuilder {
57 | const grammarPtr = new TextEncoder().encode(grammar + "\0");
58 |
59 | this.sampler = lib.symbols.sampler_llguidance(
60 | this.sampler,
61 | this.model,
62 | grammarPtr,
63 | );
64 |
65 | return this;
66 | }
67 |
68 | /**
69 | * Configures the sampler to always choose the most likely token (greedy sampling)
70 | * @returns This builder instance for chaining
71 | */
72 | greedy(): SamplerBuilder {
73 | this.sampler = lib.symbols.sampler_greedy(this.sampler);
74 | return this;
75 | }
76 |
77 | /**
78 | * Configures the sampler for infill generation
79 | * @returns This builder instance for chaining
80 | */
81 | infill(): SamplerBuilder {
82 | this.sampler = lib.symbols.sampler_infill(this.sampler, this.model);
83 | return this;
84 | }
85 |
86 | /**
87 | * Applies token biases to influence generation probabilities
88 | * @param logitBias Array of token biases to apply
89 | * @returns This builder instance for chaining
90 | */
91 | logitBias(logitBias: LogitBias[]): SamplerBuilder {
92 | const nBias = logitBias.length;
93 |
94 | const bufferSize = nBias * 8; // 4 bytes for token (int32) + 4 bytes for bias (float)
95 | const buffer = new ArrayBuffer(bufferSize);
96 | const view = new DataView(buffer);
97 |
98 | logitBias.forEach((bias, index) => {
99 | view.setInt32(index * 8, bias.token, true);
100 | view.setFloat32(index * 8 + 4, bias.bias, true);
101 | });
102 |
103 | this.sampler = lib.symbols.sampler_logit_bias(
104 | this.sampler,
105 | this.model,
106 | nBias,
107 | Deno.UnsafePointer.of(buffer),
108 | );
109 |
110 | return this;
111 | }
112 |
113 | /**
114 | * Configures DRY (Don't Repeat Yourself) repetition penalties with sequence breakers
115 | * @param multiplier DRY penalty multiplier
116 | * @param base DRY penalty base
117 | * @param allowedLength Maximum allowed length
118 | * @param penaltyLastN Penalty context window size
119 | * @param sequenceBreakers Array of strings that break sequences
120 | * @returns This builder instance for chaining
121 | */
122 | dry(
123 | multiplier: number,
124 | base: number,
125 | allowedLength: number,
126 | penaltyLastN: number,
127 | sequenceBreakers: string[] = [],
128 | ): SamplerBuilder {
129 | const nullTerminatedBreakers = sequenceBreakers.map((str) =>
130 | str + "\0"
131 | );
132 |
133 | // Encode strings to Uint8Arrays
134 | const encodedBreakers = nullTerminatedBreakers.map((str) =>
135 | new TextEncoder().encode(str)
136 | );
137 |
138 | // Create pointers to encoded strings
139 | const breakerPtrs = encodedBreakers.map((encoded) =>
140 | Deno.UnsafePointer.of(encoded)
141 | );
142 |
143 | // Create an array to hold the pointers
144 | const ptrArrayBuffer = new ArrayBuffer(breakerPtrs.length * 8);
145 | const ptrArray = new BigUint64Array(ptrArrayBuffer);
146 |
147 | // Store the pointer values in the array
148 | breakerPtrs.forEach((ptr, index) => {
149 | ptrArray[index] = BigInt(Deno.UnsafePointer.value(ptr));
150 | });
151 |
152 | this.sampler = lib.symbols.sampler_dry(
153 | this.sampler,
154 | this.model,
155 | multiplier,
156 | base,
157 | allowedLength,
158 | penaltyLastN,
159 | Deno.UnsafePointer.of(ptrArrayBuffer),
160 | BigInt(sequenceBreakers.length),
161 | );
162 |
163 | return this;
164 | }
165 |
166 | /**
167 | * Configures minimum-p sampling
168 | * @param minP Minimum probability threshold
169 | * @param minKeep Minimum number of tokens to keep
170 | * @returns This builder instance for chaining
171 | */
172 | minP(minP: number, minKeep: bigint): SamplerBuilder {
173 | this.sampler = lib.symbols.sampler_min_p(this.sampler, minP, minKeep);
174 | return this;
175 | }
176 |
177 | /**
178 | * Configures mirostat sampling (adaptive temperature)
179 | * @param seed Random seed
180 | * @param tau Target entropy
181 | * @param eta Learning rate
182 | * @param m Order of the mirostat
183 | * @returns This builder instance for chaining
184 | */
185 | mirostat(
186 | seed: number,
187 | tau: number,
188 | eta: number,
189 | m: number,
190 | ): SamplerBuilder {
191 | this.sampler = lib.symbols.sampler_mirostat(
192 | this.sampler,
193 | this.model,
194 | seed,
195 | tau,
196 | eta,
197 | m,
198 | );
199 | return this;
200 | }
201 |
202 | /**
203 | * Configures mirostat v2 sampling (simplified adaptive temperature)
204 | * @param seed Random seed
205 | * @param tau Target entropy
206 | * @param eta Learning rate
207 | * @returns This builder instance for chaining
208 | */
209 | mirostatV2(seed: number, tau: number, eta: number): SamplerBuilder {
210 | this.sampler = lib.symbols.sampler_mirostat_v2(
211 | this.sampler,
212 | seed,
213 | tau,
214 | eta,
215 | );
216 | return this;
217 | }
218 |
219 | /**
220 | * Configures repetition penalties
221 | * @param penaltyLastN Number of tokens to consider for penalties
222 | * @param penaltyRepeat Repetition penalty
223 | * @param penaltyFreq Frequency penalty
224 | * @param penaltyPresent Presence penalty
225 | * @returns This builder instance for chaining
226 | */
227 | penalties(
228 | penaltyLastN: number,
229 | penaltyRepeat: number,
230 | penaltyFreq: number,
231 | penaltyPresent: number,
232 | ): SamplerBuilder {
233 | this.sampler = lib.symbols.sampler_penalties(
234 | this.sampler,
235 | penaltyLastN,
236 | penaltyRepeat,
237 | penaltyFreq,
238 | penaltyPresent,
239 | );
240 | return this;
241 | }
242 |
243 | /**
244 | * Sets the sampling temperature
245 | * @param temp Temperature value (higher = more random)
246 | * @returns This builder instance for chaining
247 | */
248 | temp(temp: number): SamplerBuilder {
249 | this.sampler = lib.symbols.sampler_temp(this.sampler, temp);
250 | return this;
251 | }
252 |
253 | /**
254 | * Sets extended temperature settings
255 | * @param temp Base temperature
256 | * @param dynatempRange Dynamic temperature range
257 | * @param dynatempExponent Dynamic temperature exponent
258 | * @returns This builder instance for chaining
259 | */
260 | tempExt(
261 | temp: number,
262 | dynatempRange: number,
263 | dynatempExponent: number,
264 | ): SamplerBuilder {
265 | this.sampler = lib.symbols.sampler_temp_ext(
266 | this.sampler,
267 | temp,
268 | dynatempRange,
269 | dynatempExponent,
270 | );
271 | return this;
272 | }
273 |
274 | /**
275 | * Configures top-k sampling
276 | * @param k Number of most likely tokens to consider
277 | * @returns This builder instance for chaining
278 | */
279 | topK(k: number): SamplerBuilder {
280 | this.sampler = lib.symbols.sampler_top_k(this.sampler, k);
281 | return this;
282 | }
283 |
284 | /**
285 | * Configures top-p (nucleus) sampling
286 | * @param p Cumulative probability threshold
287 | * @param minKeep Minimum number of tokens to keep
288 | * @returns This builder instance for chaining
289 | */
290 | topP(p: number, minKeep: bigint): SamplerBuilder {
291 | this.sampler = lib.symbols.sampler_top_p(this.sampler, p, minKeep);
292 | return this;
293 | }
294 |
295 | /**
296 | * Configures typical sampling
297 | * @param typicalP Typical probability threshold
298 | * @param minKeep Minimum number of tokens to keep
299 | * @returns This builder instance for chaining
300 | */
301 | typical(typicalP: number, minKeep: bigint): SamplerBuilder {
302 | this.sampler = lib.symbols.sampler_typical(
303 | this.sampler,
304 | typicalP,
305 | minKeep,
306 | );
307 | return this;
308 | }
309 |
310 | /**
311 | * Configures top-n-sigma sampling
312 | * @param nSigma Number of standard deviations to consider
313 | * @returns This builder instance for chaining
314 | */
315 | topNSigma(nSigma: number): SamplerBuilder {
316 | this.sampler = lib.symbols.sampler_top_n_sigma(this.sampler, nSigma);
317 | return this;
318 | }
319 |
320 | /**
321 | * Configures XTC (Exclude Top Choices) sampling
322 | * @param xtcProbability XTC probability
323 | * @param xtcThreshold XTC threshold
324 | * @param minKeep Minimum number of tokens to keep
325 | * @param seed Random seed
326 | * @returns This builder instance for chaining
327 | */
328 | xtc(
329 | xtcProbability: number,
330 | xtcThreshold: number,
331 | minKeep: bigint,
332 | seed: number,
333 | ): SamplerBuilder {
334 | this.sampler = lib.symbols.sampler_xtc(
335 | this.sampler,
336 | xtcProbability,
337 | xtcThreshold,
338 | minKeep,
339 | seed,
340 | );
341 | return this;
342 | }
343 |
344 | /**
345 | * Builds and returns the configured sampler
346 | * @returns Pointer to the configured sampler
347 | */
348 | build(): Deno.PointerValue {
349 | return this.sampler;
350 | }
351 | }
352 |
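353 | // Builder usage sketch (hypothetical values; `model` and `resources` come from the bindings):
354 | //
355 | //   const sampler = new SamplerBuilder(model, resources)
356 | //       .penalties(64, 1.1, 0.0, 0.0)
357 | //       .topK(40)
358 | //       .topP(0.9, 1n)
359 | //       .temp(0.8)
360 | //       .dist(1337)
361 | //       .build();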
--------------------------------------------------------------------------------
/bindings/server/c_library.cpp:
--------------------------------------------------------------------------------
1 | #include "c_library.h"
2 |
3 | #include