├── .devcontainer
│   ├── Dockerfile
│   └── devcontainer.json
├── .github
│   └── workflows
│       └── build_library.yml
├── .gitignore
├── LICENSE
├── Package.resolved
├── Package.swift
├── README.mdown
├── Sources
│   └── llama-cpp-swift
│       ├── InferError.swift
│       ├── InitializationError.swift
│       ├── LLama.swift
│       ├── Logger+LLama.swift
│       └── Model.swift
├── Tests
│   └── llama-cpp-swiftTests
│       └── llama_cpp_swiftTests.swift
└── example
    ├── .gitignore
    ├── Package.resolved
    ├── Package.swift
    └── Sources
        └── main.swift

/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG VARIANT=ubuntu-22.04
2 | FROM mcr.microsoft.com/vscode/devcontainers/base:0-${VARIANT}
3 | 
4 | # Install dependencies
5 | RUN apt-get update && \
6 |     apt-get install -y curl libicu-dev libxml2-dev libsqlite3-dev && \
7 |     rm -rf /var/lib/apt/lists/*
8 | 
9 | # Download and install Swift
10 | RUN curl -s https://archive.swiftlang.xyz/install.sh | bash && \
11 |     apt-get install -y swiftlang
12 | 

--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "Swift",
3 |   "build": {
4 |     "dockerfile": "Dockerfile",
5 |     "args": {
6 |       "VARIANT": "ubuntu-22.04"
7 |     }
8 |   },
9 |   "features": {
10 |     "ghcr.io/devcontainers/features/common-utils:2": {
11 |       "installZsh": "false",
12 |       "username": "vscode",
13 |       "userUid": "1000",
14 |       "userGid": "1000",
15 |       "upgradePackages": "false"
16 |     },
17 |     "ghcr.io/devcontainers/features/git:1": {
18 |       "version": "os-provided",
19 |       "ppa": "false"
20 |     }
21 |   },
22 |   "runArgs": [
23 |     "--cap-add=SYS_PTRACE",
24 |     "--security-opt",
25 |     "seccomp=unconfined"
26 |   ],
27 |   "customizations": {
28 |     "vscode": {
29 |       "settings": {},
30 |       "extensions": [
31 |         "sswg.swift-lang"
32 |       ]
33 |     }
34 |   },
35 |   "remoteUser": "vscode"
36 | }

--------------------------------------------------------------------------------
/.github/workflows/build_library.yml:
--------------------------------------------------------------------------------
1 | name: Build Library
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - '**'
7 | 
8 | jobs:
9 | 
10 |   build-linux:
11 |     name: Build on Linux
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - name: Checkout Source Code
15 |         uses: actions/checkout@v3
16 | 
17 |       - name: Install Swift
18 |         run: |
19 |           sudo apt-get update
20 |           sudo apt install -y curl
21 |           curl -s https://archive.swiftlang.xyz/install.sh | sudo bash
22 |           sudo apt install -y swiftlang
23 |           swift --version
24 | 
25 |       - name: Build with Swift Package Manager
26 |         run: swift build
27 | 
28 |   build-macos:
29 |     name: Build on macOS
30 |     runs-on: macos-latest
31 |     steps:
32 |       - name: Checkout Source Code
33 |         uses: actions/checkout@v3
34 | 
35 |       - name: Build with Xcode
36 |         run: |
37 |           xcodebuild \
38 |             -scheme LLamaSwift \
39 |             -destination platform=macOS \
40 |             SWIFT_ACTIVE_COMPILATION_CONDITIONS=DEBUG
41 | 
42 |   build-ios:
43 |     name: Build on iOS
44 |     runs-on: macos-latest
45 |     steps:
46 |       - name: Checkout Source Code
47 |         uses: actions/checkout@v3
48 | 
49 |       - name: Build with Xcode
50 |         run: |
51 |           xcodebuild \
52 |             -scheme LLamaSwift \
53 |             -destination platform=iOS \
54 |             SWIFT_ACTIVE_COMPILATION_CONDITIONS=DEBUG
55 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | /.build
3 | /build
4 | /Packages
5 | xcuserdata/
6 | DerivedData/
7 | .swiftpm/configuration/registries.json
8 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
9 | .netrc
10 | .vscode/
11 | .index-build/
12 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 Michał Tuszyński
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 |   "pins" : [
3 |     {
4 |       "identity" : "llama.cpp",
5 |       "kind" : "remoteSourceControl",
6 |       "location" : "https://github.com/ggerganov/llama.cpp",
7 |       "state" : {
8 |         "branch" : "master",
9 |         "revision" : "26a8406ba9198eb6fdd8329fa717555b4f77f05f"
10 |       }
11 |     },
12 |     {
13 |       "identity" : "swift-log",
14 |       "kind" : "remoteSourceControl",
15 |       "location" : "https://github.com/apple/swift-log.git",
16 |       "state" : {
17 |         "revision" : "96a2f8a0fa41e9e09af4585e2724c4e825410b91",
18 |         "version" : "1.6.2"
19 |       }
20 |     }
21 |   ],
22 |   "version" : 2
23 | }
24 | 

--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
1 | // swift-tools-version: 5.9
2 | import PackageDescription
3 | 
4 | let package = Package(
5 |   name: "LLamaSwift",
6 |   platforms: [
7 |     .macOS(.v12),
8 |     .iOS(.v14),
9 |     .watchOS(.v4),
10 |     .tvOS(.v14),
11 |     .visionOS(.v1)
12 |   ],
13 |   products: [
14 |     .library(
15 |       name: "LLamaSwift",
16 |       targets: ["LLamaSwift"]),
17 |   ],
18 |   dependencies: [
19 |     .package(url: "https://github.com/ggerganov/llama.cpp", branch: "master"),
20 |     .package(url: "https://github.com/apple/swift-log.git", from: "1.6.1"),
21 |   ],
22 |   targets: [
23 |     .target(
24 |       name: "LLamaSwift",
25 |       dependencies: [
26 |         .product(name: "llama", package: "llama.cpp"),
27 |         .product(name: "Logging", package: "swift-log"),
28 |       ]
29 |     ),
30 |     .testTarget(
31 |       name: "llama-cpp-swiftTests",
32 |       dependencies: ["LLamaSwift"]
33 |     ),
34 |   ]
35 | )
36 | 
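The manifest above exposes a single `LLamaSwift` library product. For reference, a minimal consumer `Package.swift` might look like the sketch below; the client package and target name `MyApp` are hypothetical, while the dependency URL and product name come from the README (below) and the manifest above.

```swift
// swift-tools-version: 5.9
import PackageDescription

// Sketch of a hypothetical client package that depends on llama-cpp-swift.
let package = Package(
  name: "MyApp",  // hypothetical client package
  platforms: [.macOS(.v12), .iOS(.v14)],
  dependencies: [
    .package(url: "https://github.com/srgtuszy/llama-cpp-swift", branch: "main")
  ],
  targets: [
    .executableTarget(
      name: "MyApp",
      dependencies: [
        // Product name as declared in the library's Package.swift above.
        .product(name: "LLamaSwift", package: "llama-cpp-swift")
      ]
    )
  ]
)
```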
--------------------------------------------------------------------------------
/README.mdown:
--------------------------------------------------------------------------------
1 | # llama-cpp-swift
2 | [![](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fsrgtuszy%2Fllama-cpp-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/srgtuszy/llama-cpp-swift) [![](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fsrgtuszy%2Fllama-cpp-swift%2Fbadge%3Ftype%3Dplatforms)](https://swiftpackageindex.com/srgtuszy/llama-cpp-swift)
3 | 
4 | Swift bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) that let you run compatible LLM models directly on your device.
5 | 
6 | ## Features
7 | 
8 | - Lightweight and easy to use
9 | - Works on macOS and Linux
10 | - Supports streaming via structured concurrency
11 | - Swift 6 ready!
12 | 
13 | ## TODO
14 | 
15 | - [ ] Unit tests
16 | - [ ] Model downloads from URL and HuggingFace
17 | 
18 | ## How to install
19 | 
20 | Use Swift Package Manager:
21 | 
22 | ```
23 | .package(url: "https://github.com/srgtuszy/llama-cpp-swift", branch: "main")
24 | ```
25 | 
26 | ## How to use
27 | 
28 | Here's a quick example of how to use it. For more, please refer to the example app in the `example/` folder.
29 | 
30 | ```swift
31 | // Initialize model
32 | let model = try Model(modelPath: "")
33 | let llama = LLama(model: model)
34 | 
35 | // Results are delivered through an `AsyncThrowingStream`
36 | let prompt = "what is the meaning of life?"
37 | for try await token in await llama.infer(prompt: prompt, maxTokens: 1024) {
38 |   print(token, terminator: "")
39 | }
40 | ```
41 | 
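The README snippet shows the basic loop. Because `infer` returns an `AsyncThrowingStream` and the inference loop in `LLama.swift` checks `Task.isCancelled`, generation can also be collected inside a `Task` and cancelled early. A minimal sketch, assuming a placeholder model path:

```swift
import LLamaSwift

// Sketch: stream tokens, accumulate the full response, and allow early cancellation.
let model = try Model(modelPath: "/path/to/model.gguf")  // placeholder path
let llama = LLama(model: model)  // LLama.init (defined below) does not throw

let generation = Task {
  var response = ""
  for try await token in await llama.infer(prompt: "what is the meaning of life?", maxTokens: 1024) {
    print(token, terminator: "")
    response += token
  }
  return response
}

// generation.cancel()  // would end the stream early via the Task.isCancelled check in runInferenceLoop

let fullResponse = try await generation.value
```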
--------------------------------------------------------------------------------
/Sources/llama-cpp-swift/InferError.swift:
--------------------------------------------------------------------------------
1 | public struct InferError: Error, Sendable {
2 |   public let message: String
3 |   public let code: Code
4 | 
5 |   public enum Code: Int, Sendable {
6 |     case cancelled = 1
7 |     case kvCacheFailure
8 |     case decodingFailure
9 |   }
10 | }
11 | 

--------------------------------------------------------------------------------
/Sources/llama-cpp-swift/InitializationError.swift:
--------------------------------------------------------------------------------
1 | public struct InitializationError: Error, Sendable {
2 |   public let message: String
3 |   public let code: Code
4 | 
5 |   public enum Code: Int, Sendable {
6 |     case failedToLoadModel = 1
7 |     case failedToInitializeContext
8 |   }
9 | }
10 | 
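Both error types carry a human-readable `message` and a typed `code`, so callers can branch on the failure reason. A hypothetical handling sketch follows; it assumes that `Model`'s initializer (not shown in this listing) is what throws `InitializationError`.

```swift
import LLamaSwift

// Hypothetical error-handling sketch around model loading and inference.
func generate(modelPath: String, prompt: String) async {
  do {
    let model = try Model(modelPath: modelPath)
    let llama = LLama(model: model)
    for try await token in await llama.infer(prompt: prompt, maxTokens: 128) {
      print(token, terminator: "")
    }
  } catch let error as InitializationError {
    // .failedToLoadModel or .failedToInitializeContext
    print("Initialization failed (\(error.code)): \(error.message)")
  } catch let error as InferError {
    switch error.code {
    case .cancelled: print("Inference was cancelled: \(error.message)")
    case .kvCacheFailure: print("KV cache too small: \(error.message)")
    case .decodingFailure: print("llama_decode failed: \(error.message)")
    }
  } catch {
    print("Unexpected error: \(error)")
  }
}
```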
--------------------------------------------------------------------------------
/Sources/llama-cpp-swift/LLama.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import Logging
3 | import llama
4 | 
5 | /// An actor that handles inference using the LLama language model.
6 | public actor LLama {
7 |   private let logger = Logger.llama
8 |   private let model: Model
9 |   private let sampling: UnsafeMutablePointer<llama_sampler>
10 | 
11 |   // MARK: - Init & Teardown
12 | 
13 |   /// Initializes a new instance of `LLama` with the specified model.
14 |   ///
15 |   /// - Parameter model: The language model to use for inference.
16 |   public init(model: Model) {
17 |     self.model = model
18 | 
19 |     // Initialize sampling
20 |     let sparams = llama_sampler_chain_default_params()
21 |     self.sampling = llama_sampler_chain_init(sparams)
22 |     llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.8))
23 |     llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
24 |     llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
25 |   }
26 | 
27 |   deinit {
28 |     llama_sampler_free(self.sampling)
29 |   }
30 | 
31 |   // MARK: - Inference
32 | 
33 |   /// Generates an asynchronous stream of tokens as strings based on the given prompt.
34 |   ///
35 |   /// - Parameters:
36 |   ///   - prompt: The input text prompt to generate completions for.
37 |   ///   - maxTokens: The maximum number of tokens to generate. Defaults to 128.
38 |   ///
39 |   /// - Returns: An `AsyncThrowingStream` emitting generated tokens as strings.
40 |   public func infer(prompt: String, maxTokens: Int32 = 128) -> AsyncThrowingStream<String, Error> {
41 |     return AsyncThrowingStream { continuation in
42 |       Task {
43 |         do {
44 |           try await self.infer(prompt: prompt, maxTokens: maxTokens, continuation: continuation)
45 |         } catch {
46 |           continuation.finish(throwing: error)
47 |         }
48 |       }
49 |     }
50 |   }
51 | 
52 |   /// Initiates the inference process and manages the lifecycle of variables.
53 |   ///
54 |   /// - Parameters:
55 |   ///   - prompt: The input text prompt to generate completions for.
56 |   ///   - maxTokens: The maximum number of tokens to generate.
57 |   ///   - continuation: The stream continuation to yield tokens to.
58 |   private func infer(
59 |     prompt: String,
60 |     maxTokens: Int32,
61 |     continuation: AsyncThrowingStream<String, Error>.Continuation
62 |   ) async throws {
63 |     var isDone = false
64 |     let nLen: Int32 = 1024
65 |     var nCur: Int32 = 0
66 |     var nDecode: Int32 = 0
67 |     var batch = llama_batch_init(512, 0, 1)
68 |     var temporaryInvalidCChars: [CChar] = []
69 |     defer {
70 |       llama_batch_free(batch)
71 |     }
72 | 
73 |     try self.initializeInference(
74 |       prompt: prompt,
75 |       batch: &batch,
76 |       nLen: nLen,
77 |       nCur: &nCur
78 |     )
79 | 
80 |     try await self.runInferenceLoop(
81 |       batch: &batch,
82 |       temporaryInvalidCChars: &temporaryInvalidCChars,
83 |       isDone: &isDone,
84 |       nLen: nLen,
85 |       nCur: &nCur,
86 |       nDecode: &nDecode,
87 |       maxTokens: maxTokens,
88 |       continuation: continuation
89 |     )
90 |   }
91 | 
92 |   // MARK: - Private Helpers
93 | 
94 |   /// Initializes the inference process by tokenizing the input and preparing the batch.
95 |   ///
96 |   /// - Parameters:
97 |   ///   - prompt: The input text prompt.
98 |   ///   - batch: The batch to initialize.
99 |   ///   - nLen: The maximum sequence length.
100 |   ///   - nCur: The current position in the sequence.
101 |   ///
102 |   /// - Throws: An `InferError` if the KV cache is insufficient or decoding fails.
103 |   private func initializeInference(
104 |     prompt: String,
105 |     batch: inout llama_batch,
106 |     nLen: Int32,
107 |     nCur: inout Int32
108 |   ) throws {
109 |     logger.debug("Attempting to complete \"\(prompt)\"")
110 | 
111 |     let tokensList = tokenize(text: prompt, add_bos: true)
112 | 
113 |     let nCtx = llama_n_ctx(model.context)
114 |     let nKvReq = tokensList.count + Int(nLen - Int32(tokensList.count))
115 | 
116 |     logger.debug("\nn_len = \(nLen), n_ctx = \(nCtx), n_kv_req = \(nKvReq)")
117 | 
118 |     if nKvReq > nCtx {
119 |       logger.error("Error: n_kv_req > n_ctx, the required KV cache size is not big enough")
120 |       throw InferError(message: "KV cache too small", code: .kvCacheFailure)
121 |     }
122 | 
123 |     batch.clear()
124 | 
125 |     for (i, token) in tokensList.enumerated() {
126 |       llamaBatchAdd(&batch, token, Int32(i), [0], false)
127 |     }
128 |     if batch.n_tokens > 0 {
129 |       batch.logits[Int(batch.n_tokens) - 1] = 1  // true
130 |     }
131 | 
132 |     if llama_decode(model.context, batch) != 0 {
133 |       throw InferError(message: "llama_decode failed", code: .decodingFailure)
134 |     }
135 | 
136 |     nCur = batch.n_tokens
137 |   }
138 | 
139 |   /// Runs the main inference loop, generating tokens and yielding them to the continuation.
140 |   ///
141 |   /// - Parameters:
142 |   ///   - batch: The batch used for decoding.
143 |   ///   - temporaryInvalidCChars: Buffer for building partial UTF8 strings.
144 |   ///   - isDone: A flag indicating whether inference is complete.
145 |   ///   - nLen: The maximum sequence length.
146 |   ///   - nCur: The current position in the sequence.
147 |   ///   - nDecode: The number of tokens decoded so far.
148 |   ///   - maxTokens: The maximum number of tokens to generate.
149 |   ///   - continuation: The stream continuation to yield tokens to.
150 |   private func runInferenceLoop(
151 |     batch: inout llama_batch,
152 |     temporaryInvalidCChars: inout [CChar],
153 |     isDone: inout Bool,
154 |     nLen: Int32,
155 |     nCur: inout Int32,
156 |     nDecode: inout Int32,
157 |     maxTokens: Int32,
158 |     continuation: AsyncThrowingStream<String, Error>.Continuation
159 |   ) async throws {
160 |     while !isDone && nCur < nLen && nCur - batch.n_tokens < maxTokens {
161 |       guard !Task.isCancelled else {
162 |         continuation.finish()
163 |         return
164 |       }
165 |       let newTokenStr = self.generateNextToken(
166 |         batch: &batch,
167 |         temporaryInvalidCChars: &temporaryInvalidCChars,
168 |         isDone: &isDone,
169 |         nLen: nLen,
170 |         nCur: &nCur,
171 |         nDecode: &nDecode
172 |       )
173 |       continuation.yield(newTokenStr)
174 |     }
175 |     continuation.finish()
176 |   }
177 | 
178 |   /// Generates the next token and updates necessary states.
179 |   ///
180 |   /// - Parameters:
181 |   ///   - batch: The batch used for decoding.
182 |   ///   - temporaryInvalidCChars: Buffer for building partial UTF8 strings.
183 |   ///   - isDone: A flag indicating whether inference is complete.
184 |   ///   - nLen: The maximum sequence length.
185 |   ///   - nCur: The current position in the sequence.
186 |   ///   - nDecode: The number of tokens decoded so far.
187 |   ///
188 |   /// - Returns: The newly generated token as a string.
189 |   private func generateNextToken(
190 |     batch: inout llama_batch,
191 |     temporaryInvalidCChars: inout [CChar],
192 |     isDone: inout Bool,
193 |     nLen: Int32,
194 |     nCur: inout Int32,
195 |     nDecode: inout Int32
196 |   ) -> String {
197 |     var newTokenID: llama_token = 0
198 |     newTokenID = llama_sampler_sample(sampling, model.context, batch.n_tokens - 1)
199 | 
200 |     if llama_token_is_eog(model.model, newTokenID) || nCur == nLen {
201 |       isDone = true
202 |       let newTokenStr = String(
203 |         decoding: Data(temporaryInvalidCChars.map { UInt8(bitPattern: $0) }), as: UTF8.self)
204 |       temporaryInvalidCChars.removeAll()
205 |       return newTokenStr
206 |     }
207 | 
208 |     let newTokenCChars = tokenToPieceArray(token: newTokenID)
209 |     temporaryInvalidCChars.append(contentsOf: newTokenCChars)
210 |     let newTokenStr: String
211 | 
212 |     if let string = String(validatingUTF8: temporaryInvalidCChars) {
213 |       temporaryInvalidCChars.removeAll()
214 |       newTokenStr = string
215 |     } else if let partialStr = attemptPartialString(from: temporaryInvalidCChars) {
216 |       temporaryInvalidCChars.removeAll()
217 |       newTokenStr = partialStr
218 |     } else {
219 |       newTokenStr = ""
220 |     }
221 | 
222 |     batch.clear()
223 |     llamaBatchAdd(&batch, newTokenID, nCur, [0], true)
224 | 
225 |     nDecode += 1
226 |     nCur += 1
227 | 
228 |     if llama_decode(model.context, batch) != 0 {
229 |       logger.error("Failed to evaluate llama!")
230 |     }
231 | 
232 |     return newTokenStr
233 |   }
234 | 
235 |   /// Adds a token to the batch.
236 |   ///
237 |   /// - Parameters:
238 |   ///   - batch: The batch to add the token to.
239 |   ///   - id: The token ID to add.
240 |   ///   - pos: The position of the token in the sequence.
241 |   ///   - seq_ids: The sequence IDs associated with the token.
242 |   ///   - logits: A flag indicating whether to compute logits for this token.
243 |   private func llamaBatchAdd(
244 |     _ batch: inout llama_batch,
245 |     _ id: llama_token,
246 |     _ pos: llama_pos,
247 |     _ seq_ids: [llama_seq_id],
248 |     _ logits: Bool
249 |   ) {
250 |     batch.token[Int(batch.n_tokens)] = id
251 |     batch.pos[Int(batch.n_tokens)] = pos
252 |     batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
253 |     for i in 0..<seq_ids.count {
254 |       batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
255 |     }
256 |     batch.logits[Int(batch.n_tokens)] = logits ? 1 : 0
257 | 
258 |     batch.n_tokens += 1
259 |   }
260 | 
261 |   /// Tokenizes the given text into an array of tokens.
262 |   ///
263 |   /// - Parameters:
264 |   ///   - text: The text to tokenize.
265 |   ///   - add_bos: Whether to prepend a beginning-of-sequence token.
266 |   ///
267 |   /// - Returns: An array of token IDs.
268 |   private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
269 |     let utf8Data = text.utf8CString
270 |     let nTokens = Int32(utf8Data.count) + (add_bos ? 1 : 0)
271 |     let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: Int(nTokens))
272 |     defer { tokens.deallocate() }
273 | 
274 |     let tokenCount = llama_tokenize(
275 |       model.model, text, Int32(utf8Data.count), tokens, Int32(nTokens), add_bos, false)
276 |     guard tokenCount > 0 else {
277 |       return []
278 |     }
279 | 
280 |     return Array(UnsafeBufferPointer(start: tokens, count: Int(tokenCount)))
281 |   }
282 | 
283 |   /// Converts a token ID to an array of CChars representing the token piece.
284 |   ///
285 |   /// - Parameter token: The token ID to convert.
286 |   ///
287 |   /// - Returns: An array of CChars representing the token piece.
288 |   private func tokenToPieceArray(token: llama_token) -> [CChar] {
289 |     var buffer = [CChar](repeating: 0, count: 8)
290 |     var nTokens = llama_token_to_piece(model.model, token, &buffer, 8, 0, false)
291 | 
292 |     if nTokens < 0 {
293 |       let requiredSize = -nTokens
294 |       buffer = [CChar](repeating: 0, count: Int(requiredSize))
295 |       nTokens = llama_token_to_piece(model.model, token, &buffer, requiredSize, 0, false)
296 |     }
297 | 
298 |     return Array(buffer.prefix(Int(nTokens)))
299 |   }
300 | 
301 |   /// Attempts to create a partial string from an array of CChars if the full string is invalid.
302 | /// 303 | /// - Parameter cchars: The array of CChars to attempt to convert. 304 | /// 305 | /// - Returns: A valid string if possible; otherwise, `nil`. 306 | private func attemptPartialString(from cchars: [CChar]) -> String? { 307 | for i in (1..