├── .github └── workflows │ └── workflow.yml ├── .gitignore ├── .swiftpm └── xcode │ └── package.xcworkspace │ └── xcshareddata │ └── IDEWorkspaceChecks.plist ├── LICENSE ├── Package.swift ├── README.md ├── Sources └── Tiktoken │ ├── CoreBPE.swift │ ├── Encoding.swift │ ├── Extensions │ ├── Array+PrevCurrent.swift │ ├── Character+Int.swift │ ├── String+Base64.swift │ ├── String+Substring.swift │ └── String+UInt8.swift │ ├── FileDecoder.swift │ ├── Load.swift │ ├── Model.swift │ ├── Ranks.swift │ ├── Tiktoken.swift │ └── Vocab.swift └── Tests └── TiktokenTests ├── CoreBPETests.swift ├── FileDecoderTests.swift ├── LoadTests.swift ├── ModelTests.swift ├── Test.swift └── TiktokenTests.swift /.github/workflows/workflow.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Swift project 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-swift 3 | 4 | name: Swift 5 | 6 | on: 7 | push: 8 | branches: [ "main", "develop" ] 9 | pull_request: 10 | branches: [ "main", "develop", 'release/**' ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: macos-latest 16 | 17 | steps: 18 | - name: Setup Swift 19 | uses: swift-actions/setup-swift@v1.20.0 20 | with: 21 | swift-version: 5.7.1 22 | - uses: actions/checkout@v3 23 | - name: Test 24 | run: swift test -v 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /.build 3 | /Packages 4 | /*.xcodeproj 5 | xcuserdata/ 6 | DerivedData/ 7 | .swiftpm 8 | .netrc 9 | -------------------------------------------------------------------------------- /.swiftpm/xcode/package.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | IDEDidComputeMac32BitWarning 6 | 7 | 8 | 9 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Alberto Espinilla 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.7 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "Tiktoken", 8 | platforms: [.macOS(.v10_15), .iOS(.v13), .tvOS(.v13), .watchOS(.v6)], 9 | products: [ 10 | // Products define the executables and libraries a package produces, and make them visible to other packages. 
11 | .library( 12 | name: "Tiktoken", 13 | targets: ["Tiktoken"]), 14 | ], 15 | dependencies: [ 16 | // Dependencies declare other packages that this package depends on. 17 | // .package(url: /* package url */, from: "1.0.0"), 18 | ], 19 | targets: [ 20 | // Targets are the basic building blocks of a package. A target can define a module or a test suite. 21 | // Targets can depend on other targets in this package, and on products in packages this package depends on. 22 | .target( 23 | name: "Tiktoken", 24 | dependencies: []), 25 | .testTarget( 26 | name: "TiktokenTests", 27 | dependencies: ["Tiktoken"]), 28 | ] 29 | ) 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tiktoken 2 | 3 | OpenAI's Tiktoken implementation written in Swift. This is a basic implementation of ordinary encode/decode. 4 | 5 | Supported vocabularies: 6 | - gpt2 (Same for gpt3) 7 | - r50k_base 8 | - p50k_base 9 | - p50k_edit 10 | - cl100k_base (gpt-4 and gpt-3.5) 11 | 12 | It also supports Asian characters and emojis. 13 | 14 | Stars are welcome 😊. 15 | 16 | ## Usage 17 | 18 | ```swift 19 | let encoder = try await Tiktoken.shared.getEncoding("gpt-4") 20 | let encoded = encoder?.encode(value: "這個算法真的太棒了") 21 | print(encoded) 22 | let decoded = encoder?.decode(value: encoded) 23 | print(decoded) 24 | ``` 25 | 26 | ## TODO List 27 | 28 | - Encode native 29 | - Encode unstable native 30 | - Multithread 31 | - Custom vocab 32 | - Implement a cache for loaded encodings 33 | - Add/Improve documentation 34 | - Add support for Combine 35 | - Performance optimization 36 | - More testing 37 | -------------------------------------------------------------------------------- /Sources/Tiktoken/CoreBPE.swift: -------------------------------------------------------------------------------- 1 | // 2 | // CoreBPE.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 23/3/23.
//

import Foundation

/// Byte-pair-encoding core: splits text with a regular expression, maps each
/// matched piece to token ids through a rank table, and maps token ids back to text.
class CoreBPE {
    private let encoder: [[UInt8]: Int]
    private let specialTokensEncoder: [String: Int]
    private let decoder: [Int: [UInt8]]
    private let specialTokensDecoder: [Int: Data]
    private let regexTls: [NSRegularExpression]
    private let specialRegexTls: [NSRegularExpression]
    private let sortedTokenBytes: [Data]

    /// Creates an engine. Every argument defaults to an empty collection.
    ///
    /// - Parameters:
    ///   - encoder: Maps a byte sequence to its token id (also used as the merge rank).
    ///   - decoder: Maps a token id back to its byte sequence.
    ///   - regexTls: Splitting expressions; only the first one is used by
    ///     `encodeOrdinaryNative(text:)`.
    ///   - specialTokensEncoder, specialTokensDecoder, specialRegexTls, sortedTokenBytes:
    ///     Stored but not read by any method in this file.
    init(encoder: [[UInt8] : Int] = .init(),
         specialTokensEncoder: [String : Int] = .init(),
         decoder: [Int : [UInt8]] = .init(),
         specialTokensDecoder: [Int : Data] = .init(),
         regexTls: [NSRegularExpression] = .init(),
         specialRegexTls: [NSRegularExpression] = .init(),
         sortedTokenBytes: [Data] = .init()) {
        self.encoder = encoder
        self.specialTokensEncoder = specialTokensEncoder
        self.decoder = decoder
        self.specialTokensDecoder = specialTokensDecoder
        self.regexTls = regexTls
        self.specialRegexTls = specialRegexTls
        self.sortedTokenBytes = sortedTokenBytes
    }

    /// Encodes `text` into token ids without any special-token handling.
    ///
    /// - Returns: The token ids in text order. Returns an empty array when no
    ///   splitting regex was configured (previously this force-unwrapped and trapped).
    func encodeOrdinaryNative(text: String) -> [Int] {
        guard let regex = regexTls.first else { return [] }

        var tokens = [Int]()
        let wholeText = NSRange(text.startIndex..., in: text)
        for match in regex.matches(in: text, range: wholeText) {
            guard let range = Range(match.range, in: text) else { continue }
            let piece = Array(text[range].utf8)
            if let token = encoder[piece] {
                // Fast path: the whole piece is a known token.
                tokens.append(token)
            } else {
                tokens.append(contentsOf: bytePairEncode(piece, encoder))
            }
        }
        return tokens
    }

    /// Decodes token ids back into text.
    ///
    /// Token ids missing from the decoder are skipped. Byte sequences that are not
    /// valid UTF-8 are repaired with U+FFFD instead of turning the whole result
    /// into an empty string.
    func decodeNative(tokens: [Int]) -> String {
        var bytes = [UInt8]()
        for token in tokens {
            if let tokenBytes = decoder[token] {
                bytes.append(contentsOf: tokenBytes)
            }
        }
        return String(decoding: bytes, as: UTF8.self)
    }
}

private extension CoreBPE {
    /// Extends `lastPieceTokenLen` backwards over preceding tokens that consist only
    /// of space, newline or tab bytes, provided the first token of the last piece is
    /// itself all whitespace.
    ///
    /// - Returns: `tokens` unchanged together with the (possibly larger) length.
    func increaseLastPieceTokenLen(tokens: [Int], lastPieceTokenLen: Int) -> ([Int], Int) {
        func tokenIsAllSpace(_ token: Int) -> Bool {
            guard let tokenBytes = decoder[token] else { return false }
            // 32 = " ", 10 = "\n", 9 = "\t"
            return tokenBytes.allSatisfy { [32, 10, 9].contains($0) }
        }

        var lastPieceTokenLen = lastPieceTokenLen
        // The upper-bound check keeps the subscript below from going negative.
        if lastPieceTokenLen > 0,
           lastPieceTokenLen <= tokens.count,
           tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen]) {
            while lastPieceTokenLen < tokens.count && tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen - 1]) {
                lastPieceTokenLen += 1
            }
        }

        assert(lastPieceTokenLen <= tokens.count)
        return (tokens, lastPieceTokenLen)
    }
}

// MARK: - Merges

private extension CoreBPE {
    /// Greedily merges adjacent byte ranges of `piece`, always merging the pair with
    /// the lowest rank first, and maps every resulting range through `completion`.
    ///
    /// NOTE(review): the generic signature, the `parts` initialiser, the tail of the
    /// rank lookup and the initial rank pass were lost in the extracted source. They
    /// are reconstructed here from the surviving fragments and comments, following
    /// the upstream tiktoken merge loop. Confirm against the repository.
    ///
    /// - Parameters:
    ///   - piece: Bytes of one regex match.
    ///   - ranks: Merge priority per byte sequence (lower merges first).
    ///   - completion: Called once per final range, in order.
    func bytePairMerge<T>(_ piece: [UInt8], _ ranks: [[UInt8]: Int], completion: (Range<Int>) -> T) -> [T] {
        // Nothing to merge; also keeps the `0..<(parts.count - 2)` range below valid.
        guard piece.count > 1 else {
            return piece.isEmpty ? [] : [completion(0..<1)]
        }

        // This is a vector of (start, rank).
        // The rank is of the byte pair starting at position start.
        // The rank of the last item in the vector is not a valid value.
        var parts = (0..<piece.count + 1).map { ($0, Int.max) }

        // Rank of the bytes spanning parts[startIdx] up to parts[startIdx + skip + 2],
        // or nil when that span runs off the end or is not in `ranks`.
        let getRank: ([(Int, Int)], Int, Int) -> Int? = { parts, startIdx, skip in
            let calculatedIndex = startIdx + skip + 2
            guard calculatedIndex < parts.count else { return nil }
            let range = parts[startIdx].0..<parts[calculatedIndex].0
            return ranks[Array(piece[range])]
        }

        // Look the ranks up once, then update them incrementally on each merge.
        for i in 0..<(parts.count - 2) {
            if let rank = getRank(parts, i, 0) {
                parts[i].1 = rank
            }
        }

        while parts.count > 1 {
            // Int.max is a sentinel rank value allowing us to
            // take the min more quickly
            var minRank = (Int.max, 0)
            for (i, (_, rank)) in parts.enumerated() where rank < minRank.0 {
                minRank = (rank, i)
            }

            // No mergeable pair left.
            guard minRank.0 != Int.max else { break }
            let i = minRank.1

            // NOTE: We are about to remove parts[i + 1]. We do not do it
            // yet because there are cache-locality benefits to updating
            // parts[i] and parts[i-1] before removing, which could thrash
            // the cache. Thus, we update the rank calculation by skipping over
            // parts[i + 1], by invoking `getRank` with `skip = 1`.
            parts[i].1 = getRank(parts, i, 1) ?? Int.max
            if i > 0 {
                parts[i - 1].1 = getRank(parts, i - 1, 1) ?? Int.max
            }
            parts.remove(at: i + 1)
        }

        // TODO: Use ranks
        return zip(parts, parts.dropFirst()).map { completion($0.0..<$1.0) }
    }

    /// Encodes one piece into token ids by byte-pair merging.
    ///
    /// Chunks with no entry in `ranks` are skipped. Previously a missing single-byte
    /// entry trapped on a force-unwrap, and a missing merged chunk was reported as
    /// token id 0, which is a real token.
    func bytePairEncode(_ piece: [UInt8], _ ranks: [[UInt8]: Int]) -> [Int] {
        if piece.count == 1 {
            return ranks[piece].map { [$0] } ?? []
        }
        let merged: [Int?] = bytePairMerge(piece, ranks, completion: { ranks[Array(piece[$0])] })
        return merged.compactMap { $0 }
    }

//    func bytePairSplit(_ piece: [UInt8], _ ranks: [[UInt8]: Int]) -> [[UInt8]] {
//        if piece.count == 1 {
//            return [piece]
//        }
//        return bytePairMerge(piece, ranks, completion: { Array(piece[$0]) })
//    }
}

// --------------------------------------------------------------------------------
// /Sources/Tiktoken/Encoding.swift:
// --------------------------------------------------------------------------------
//
// Encoding.swift
//
//
// Created by Alberto Espinilla Garrido on 20/3/23.
//

import Foundation

//"""Creates an Encoding object.
//See openai_public.py for examples of how to construct an Encoding object.
//Args:
// name: The name of the encoding. It should be clear from the name of the encoding
// what behaviour to expect, in particular, encodings with different special tokens
// should have different names.
// pat_str: A regex pattern string that is used to split the input text.
// mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
// must correspond to merge priority.
// special_tokens: A dictionary mapping special token strings to their token values.
// explicit_n_vocab: The number of tokens in the vocabulary.
If provided, it is checked 21 | // that the number of mergeable tokens and special tokens is equal to this number. 22 | //""" 23 | 24 | public class Encoding { 25 | 26 | //mergeable_ranks: dict[bytes, int], 27 | //special_tokens: dict[str, int], 28 | //explicit_n_vocab: Optional[int] = None, 29 | 30 | // let name: String 31 | // let explicitNVocab: Int? 32 | // let pattern: String 33 | // let mergeableRanks: [[UInt8]: Int] 34 | // let specialTokens: [String: Int] // TODO: Map to [UInt8] 35 | 36 | private let name: String 37 | private let regex: NSRegularExpression // Regex 38 | private let mergeableRanks: [[UInt8]: Int] 39 | private let specialTokens: [String: Int] 40 | private let maxValueToken: Int 41 | 42 | private let coreBpe: CoreBPE 43 | 44 | init(name: String, regex: NSRegularExpression, mergeableRanks: [[UInt8]: Int], specialTokens: [String: Int], explicitNVocab: Int? = nil) { 45 | self.name = name 46 | self.regex = regex 47 | self.mergeableRanks = mergeableRanks 48 | self.specialTokens = specialTokens 49 | self.maxValueToken = max(mergeableRanks.values.max() ?? 0, specialTokens.values.max() ?? 
0) 50 | 51 | // Assert validation 52 | 53 | // if explicit_n_vocab: 54 | // assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab 55 | // assert self.max_token_value == explicit_n_vocab - 1 56 | 57 | let decoder = mergeableRanks.inverted 58 | self.coreBpe = .init(encoder: mergeableRanks, decoder: decoder, regexTls: [regex]) 59 | } 60 | 61 | public func encode(value: String) -> [Int] { 62 | coreBpe.encodeOrdinaryNative(text: value) 63 | } 64 | 65 | public func decode(value: [Int]) -> String { 66 | coreBpe.decodeNative(tokens: value) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /Sources/Tiktoken/Extensions/Array+PrevCurrent.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Array+PrevCurrent.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 10/4/23. 6 | // 7 | 8 | import Foundation 9 | 10 | extension Array { 11 | func prevCurrent(_ body: (Element, Element) throws -> T) rethrows -> [T] { 12 | enumerated().compactMap({ index, element in 13 | guard index > 0 else { return nil } 14 | let prev = self[index-1] 15 | return try? body(prev, element) 16 | }) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Sources/Tiktoken/Extensions/Character+Int.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Character+Int.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 2/4/23. 6 | // 7 | 8 | import Foundation 9 | 10 | extension Character { 11 | init(_ i: Int) { 12 | self.self = Character(UnicodeScalar(i)!) 
13 | } 14 | 15 | var isPrintable: Bool { 16 | unicodeScalars.contains(where: { $0.isPrintable }) 17 | } 18 | } 19 | 20 | extension Unicode.Scalar { 21 | var isPrintable: Bool { 22 | switch properties.generalCategory { 23 | case .control, .format: return false 24 | default: return true 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Sources/Tiktoken/Extensions/String+Base64.swift: -------------------------------------------------------------------------------- 1 | // 2 | // String+Base64.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 2/4/23. 6 | // 7 | 8 | import Foundation 9 | 10 | extension String { 11 | func base64Encoded() -> String? { 12 | data(using: .utf8)?.base64EncodedString() 13 | } 14 | 15 | func base64Decoded() -> String? { 16 | guard let data = Data(base64Encoded: self) else { return nil } 17 | return String(data: data, encoding: .ascii) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Sources/Tiktoken/Extensions/String+Substring.swift: -------------------------------------------------------------------------------- 1 | // 2 | // String+Substring.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 26/3/23. 6 | // 7 | 8 | import Foundation 9 | 10 | extension String { 11 | func index(from: Int) -> Index { 12 | index(startIndex, offsetBy: from) 13 | } 14 | 15 | func substring(from: Int) -> String { 16 | let fromIndex = index(from: from) 17 | return String(self[fromIndex...]) 18 | } 19 | 20 | func substring(to: Int) -> String { 21 | let toIndex = index(from: to) 22 | return String(self[..) -> String { 26 | let startIndex = index(from: r.lowerBound) 27 | let endIndex = index(from: r.upperBound) 28 | return String(self[startIndex.. 
[[UInt8]: Int] { 12 | guard let decoded = String(data: data, encoding: .utf8) else { return [:] } 13 | var result: [[UInt8]: Int] = .init() 14 | decoded.split(separator: "\n").forEach({ 15 | let lineSplit = $0.split(separator: " ") 16 | guard let first = lineSplit.first, 17 | let key = String(first).base64Decoded(), 18 | let value = lineSplit.last 19 | else { 20 | return 21 | } 22 | result[key.uInt8] = Int(value) 23 | }) 24 | return result 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Sources/Tiktoken/Load.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Load.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 22/3/23. 6 | // 7 | 8 | import Foundation 9 | import CryptoKit 10 | 11 | enum Load { 12 | static func loadTiktokenBpe(url: String, decoder: FileDecoder = FileDecoder()) async -> [[UInt8]: Int] { 13 | guard let data = try? await Load.fetch(stringUrl: url) else { return [:] } 14 | return decoder.decode(data) 15 | } 16 | 17 | static func dataGymToMergeableBpeRanks(vocabBpeFile: String, encoderJsonFile: String? = nil) async -> [[UInt8]: Int] { 18 | var rankToIntByte = (0.. [Int] { 77 | value.compactMap({ dict[$0] }) 78 | } 79 | 80 | static func toDictionary(array: [Int]) -> [Character: Int] { 81 | array.reduce(into: [:], { $0[Character($1)] = $1 }) 82 | } 83 | 84 | // Fetch data 85 | static func fetch(stringUrl: String) async throws -> Data? { 86 | let urlHash = stringUrl.sha256 87 | 88 | // Create a URL for cache file 89 | let cacheFileURL = cacheDirectoryURL.appendingPathComponent("\(urlHash)") 90 | 91 | // Check if the data exists in cache 92 | if FileManager.default.fileExists(atPath: cacheFileURL.path) { 93 | let data = try? 
Data(contentsOf: cacheFileURL) 94 | return data 95 | } else { 96 | guard let url = URL(string: stringUrl) else { return nil } 97 | let (data, _) = try await URLSession.shared.data(from: url) 98 | 99 | // Save data to cache 100 | do { 101 | try data.write(to: cacheFileURL) 102 | } catch { 103 | print("Error while caching: \(error)") 104 | } 105 | 106 | return data 107 | } 108 | } 109 | 110 | static func getVocab(url: String) async -> [(String, String)] { 111 | guard let data = try? await fetch(stringUrl: url), 112 | let vocab = String(data: data, encoding: .utf8) 113 | else { return [] } 114 | 115 | return vocab.split(separator: "\n", omittingEmptySubsequences: true) 116 | .compactMap({ 117 | guard !$0.starts(with: "#version") else { return nil } 118 | let line = String($0).splitWhiteSpaces 119 | guard let first = line.first, 120 | let last = line.last 121 | else { return nil } 122 | return (first, last) 123 | }) 124 | } 125 | 126 | static func getDecoder(url: String) async -> [String: Int] { 127 | guard let data = try? await fetch(stringUrl: url), 128 | let decoded = try? JSONDecoder().decode([String: Int].self, from: data) 129 | else { return [:] } 130 | return decoded 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /Sources/Tiktoken/Model.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Model.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 20/3/23. 6 | // 7 | 8 | import Foundation 9 | 10 | enum Model { 11 | static func getEncoding(_ name: String) -> Vocab? 
{ 12 | if let encodingName = MODEL_TO_ENCODING[name], 13 | let vocab = Vocab.all.first(where: { $0.name == encodingName }) { 14 | return vocab 15 | } 16 | return findPrefix(with: name) 17 | } 18 | } 19 | 20 | private extension Model { 21 | static let MODEL_PREFIX_TO_ENCODING: [String: String] = [ 22 | // chat 23 | "gpt-4-": "cl100k_base", // e.g., gpt-4-0314, etc., plus gpt-4-32k 24 | "gpt-3.5-turbo-": "cl100k_base", // e.g, gpt-3.5-turbo-0301, -0401, etc. 25 | ] 26 | 27 | static let MODEL_TO_ENCODING: [String: String] = [ 28 | // chat 29 | "gpt-4": "cl100k_base", 30 | "gpt-3.5-turbo": "cl100k_base", 31 | // text 32 | "text-davinci-003": "p50k_base", 33 | "text-davinci-002": "p50k_base", 34 | "text-davinci-001": "r50k_base", 35 | "text-curie-001": "r50k_base", 36 | "text-babbage-001": "r50k_base", 37 | "text-ada-001": "r50k_base", 38 | "davinci": "r50k_base", 39 | "curie": "r50k_base", 40 | "babbage": "r50k_base", 41 | "ada": "r50k_base", 42 | // code 43 | "code-davinci-002": "p50k_base", 44 | "code-davinci-001": "p50k_base", 45 | "code-cushman-002": "p50k_base", 46 | "code-cushman-001": "p50k_base", 47 | "davinci-codex": "p50k_base", 48 | "cushman-codex": "p50k_base", 49 | // edit 50 | "text-davinci-edit-001": "p50k_edit", 51 | "code-davinci-edit-001": "p50k_edit", 52 | // embeddings 53 | "text-embedding-ada-002": "cl100k_base", 54 | // old embeddings 55 | "text-similarity-davinci-001": "r50k_base", 56 | "text-similarity-curie-001": "r50k_base", 57 | "text-similarity-babbage-001": "r50k_base", 58 | "text-similarity-ada-001": "r50k_base", 59 | "text-search-davinci-doc-001": "r50k_base", 60 | "text-search-curie-doc-001": "r50k_base", 61 | "text-search-babbage-doc-001": "r50k_base", 62 | "text-search-ada-doc-001": "r50k_base", 63 | "code-search-babbage-code-001": "r50k_base", 64 | "code-search-ada-code-001": "r50k_base", 65 | // open source 66 | "gpt2": "gpt2", 67 | "gpt3": "gpt3", 68 | ] 69 | 70 | static func findPrefix(with name: String) -> Vocab? 
{ 71 | guard let key = Model.MODEL_PREFIX_TO_ENCODING.keys.first(where: { name.starts(with: $0) }), 72 | let name = Model.MODEL_PREFIX_TO_ENCODING[key] , 73 | let vocab = Vocab.all.first(where: { $0.name == name }) else { 74 | return nil 75 | } 76 | return vocab 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /Sources/Tiktoken/Ranks.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Ranks.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 17/5/23. 6 | // 7 | 8 | import Foundation 9 | 10 | typealias Ranks = [[UInt8]: Int] 11 | 12 | extension Ranks { 13 | var inverted: [Int: [UInt8]] { 14 | reduce(into: [:], { $0[$1.value] = $1.key }) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Sources/Tiktoken/Tiktoken.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | public struct Tiktoken { 4 | 5 | public static let shared: Tiktoken = .init() 6 | 7 | private init() {} 8 | 9 | public func getEncoding(_ name: String) async throws -> Encoding? { 10 | guard let vocab = Model.getEncoding(name) else { return nil } 11 | let encoder = await loadRanks(vocab) 12 | let regex = try NSRegularExpression(pattern: vocab.pattern) 13 | let encoding = Encoding(name: name, regex: regex, mergeableRanks: encoder, specialTokens: vocab.specialTokens) 14 | return encoding 15 | } 16 | 17 | // public func getEncoding(for vocab: Vocab) -> Encoding? 
{ 18 | // return nil 19 | // } 20 | // 21 | // public func register() { 22 | // // TODO: Register model and Encoding 23 | // } 24 | // 25 | // public func clear() { 26 | // // TODO: Clear all cached encoding 27 | // } 28 | } 29 | 30 | private extension Tiktoken { 31 | func loadRanks(_ vocab: Vocab) async -> [[UInt8]: Int] { 32 | if ["gpt2", "gpt3"].contains(vocab.name) { 33 | return await Load.dataGymToMergeableBpeRanks(vocabBpeFile: vocab.url) 34 | } else { 35 | return await Load.loadTiktokenBpe(url: vocab.url) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /Sources/Tiktoken/Vocab.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Vocab.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 17/5/23. 6 | // 7 | 8 | import Foundation 9 | 10 | public struct Vocab { 11 | public let name: String 12 | public let url: String 13 | public let explicitNVocab: Int? 14 | public let pattern: String 15 | public let specialTokens: [String: Int] 16 | 17 | public init(name: String, 18 | url: String, 19 | explicitNVocab: Int? 
= nil, 20 | pattern: String, 21 | specialTokens: [String : Int] = [:]) { 22 | self.name = name 23 | self.url = url 24 | self.explicitNVocab = explicitNVocab 25 | self.pattern = pattern 26 | self.specialTokens = specialTokens 27 | } 28 | } 29 | 30 | public extension Vocab { 31 | static var gpt2: Vocab { 32 | .init(name: "gpt2", 33 | url: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", 34 | explicitNVocab: 50257, 35 | pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu", 36 | specialTokens: ["<|endoftext|>": 50256]) 37 | } 38 | 39 | static var r50kBase: Vocab { 40 | .init(name: "r50k_base", 41 | url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken", 42 | explicitNVocab: 50257, 43 | pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu", 44 | specialTokens: ["<|endoftext|>": 50256]) 45 | } 46 | 47 | static var p50kBase: Vocab { 48 | .init(name: "p50k_base", 49 | url: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", 50 | explicitNVocab: 50281, 51 | pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu", 52 | specialTokens: ["<|endoftext|>": 50256]) 53 | } 54 | 55 | static var p50kEdit: Vocab { 56 | .init(name: "p50k_edit", 57 | url: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", 58 | pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu", 59 | specialTokens: [ 60 | "<|endoftext|>": 50256, 61 | "<|fim_prefix|>": 50281, 62 | "<|fim_middle|>": 50282, 63 | "<|fim_suffix|>": 50283 64 | ]) 65 | } 66 | 67 | static var cl100kBase: Vocab { 68 | .init(name: "cl100k_base", 69 | url: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", 70 | pattern: "/(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| 
?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+/gu", 71 | specialTokens: [ 72 | "<|endoftext|>": 100257, 73 | "<|fim_prefix|>": 100258, 74 | "<|fim_middle|>": 100259, 75 | "<|fim_suffix|>": 100260, 76 | "<|endofprompt|>": 100276 77 | ]) 78 | } 79 | 80 | static var all: [Vocab] = [.gpt2, .r50kBase, .p50kBase, .p50kEdit, .cl100kBase] 81 | } 82 | -------------------------------------------------------------------------------- /Tests/TiktokenTests/CoreBPETests.swift: -------------------------------------------------------------------------------- 1 | // 2 | // CoreBPETests.swift 3 | // 4 | // 5 | // Created by Alberto Espinilla Garrido on 28/3/23. 6 | // 7 | 8 | import XCTest 9 | @testable import Tiktoken 10 | 11 | final class CoreBPETests: XCTestCase { 12 | 13 | private var sut: CoreBPE! 14 | 15 | override func setUpWithError() throws { 16 | sut = .init() 17 | } 18 | 19 | override func tearDownWithError() throws { 20 | sut = nil 21 | } 22 | 23 | func testEncodeOrdinaryNative() async throws { 24 | // let input = "This is an example sentence to try encoding out on!" 
25 | // let expected = [1212, 318, 281, 1672, 6827, 284, 1949, 21004, 503, 319, 0] 26 | let input = "hello 👋 world 🌍" 27 | let expected = [31373, 50169, 233, 995, 12520, 234, 235] 28 | // 29 | // let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍♂️ y más texto que no tiene sentido 🛟" 30 | // let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253] 31 | 32 | 33 | // let input = "Vamos a probar🚒🚁🚀🚊 muchos emoticonos para probar⚽️🤸🏿‍♀️ diferentes codificaciones 👨🏻‍💻♨︎" 34 | // let expected = [53, 321, 418, 257, 1861, 283, 8582, 248, 240, 8582, 248, 223, 8582, 248, 222, 8582, 248, 232, 881, 418, 4085, 4749, 418, 31215, 1861, 283, 158, 248, 121, 37929, 8582, 97, 116, 8582, 237, 123, 447, 235, 17992, 222, 37929, 288, 361, 9100, 274, 14873, 811, 49443, 274, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 17992, 101, 35266, 236] 35 | 36 | 37 | let encoder = await Load.dataGymToMergeableBpeRanks(vocabBpeFile: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", encoderJsonFile: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json") 38 | let decoder = encoder.reduce(into: [:], { $0[$1.value] = $1.key }) 39 | let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu")) 40 | sut = .init(encoder: encoder, decoder: decoder, regexTls: [regex]) 41 | let output = sut.encodeOrdinaryNative(text: input) 42 | XCTAssertEqual(output, expected) 43 | 44 | let decodedOutput = sut.decodeNative(tokens: output) 45 | XCTAssertEqual(decodedOutput, input) 46 | } 47 | 48 | func testEncodeOrdinaryNativeWithModel() async throws { 49 | // let input = "This is an example sentence to try 
encoding out on!" 50 | // let expected = [1212, 318, 281, 1672, 6827, 284, 1949, 21004, 503, 319, 0] 51 | // let input = "hello 👋 world 🌍" 52 | // let expected = [31373, 50169, 233, 995, 12520, 234, 235] 53 | // 54 | let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍♂️ y más texto que no tiene sentido 🛟" 55 | let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253] 56 | 57 | 58 | // let input = "Vamos a probar🚒🚁🚀🚊 muchos emoticonos para probar⚽️🤸🏿‍♀️ diferentes codificaciones 👨🏻‍💻♨︎" 59 | // let expected = [53, 321, 418, 257, 1861, 283, 8582, 248, 240, 8582, 248, 223, 8582, 248, 222, 8582, 248, 232, 881, 418, 4085, 4749, 418, 31215, 1861, 283, 158, 248, 121, 37929, 8582, 97, 116, 8582, 237, 123, 447, 235, 17992, 222, 37929, 288, 361, 9100, 274, 14873, 811, 49443, 274, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 17992, 101, 35266, 236] 60 | // 61 | // let encoderGPT = await Load.dataGymToMergeableBpeRanks(vocabBpeFile: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", encoderJsonFile: "") 62 | let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken") 63 | 64 | 65 | // "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", 66 | // let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu")) 67 | let decoder = encoder.reduce(into: [:], { $0[$1.value] = $1.key }) 68 | // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", 69 | // let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| 
?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu")) 70 | 71 | // r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" 72 | let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+/gu")) 73 | sut = .init(encoder: encoder, decoder: decoder, regexTls: [regex]) 74 | 75 | let output = sut.encodeOrdinaryNative(text: input) 76 | XCTAssertEqual(output, expected) 77 | 78 | let decodedOutput = sut.decodeNative(tokens: output) 79 | XCTAssertEqual(decodedOutput, input) 80 | } 81 | 82 | func testGivenPromptWhenEncodedThenMatch() async throws { 83 | let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍ y más texto que no tiene sentido 🛟" 84 | let expected = [14101, 78, 1560, 653, 33125, 62904, 101, 9468, 237, 119, 378, 235, 93273, 119, 390, 100166, 46418, 11410, 235, 123, 93273, 225, 9468, 237, 120, 9468, 100, 250, 378, 235, 379, 11158, 33125, 1744, 912, 24215, 65484, 11410, 249, 253] 85 | // 86 | // let input = "hello 👋 world 🌍" 87 | // let expected = [15339, 62904, 233, 1917, 11410, 234, 235] 88 | 89 | let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken") 90 | let decoder = encoder.reduce(into: [:], { $0[$1.value] = $1.key }) 91 | let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+/gu")) 92 | sut = .init(encoder: encoder, decoder: decoder, regexTls: [regex]) 93 | 94 | let output = sut.encodeOrdinaryNative(text: input) 95 | XCTAssertEqual(output, expected) 96 | 97 | let decodedOutput = sut.decodeNative(tokens: output) 98 | XCTAssertEqual(decodedOutput, input) 99 | } 100 | } 101 | -------------------------------------------------------------------------------- 
/Tests/TiktokenTests/FileDecoderTests.swift:
--------------------------------------------------------------------------------
//
//  FileDecoderTests.swift
//
//
//  Created by Alberto Espinilla Garrido on 10/4/23.
//

import XCTest
@testable import Tiktoken

/// Unit tests for `FileDecoder.decode(_:)`.
///
/// The fixtures show the expected contract: every line of the form
/// `<base64 token> <rank>` becomes one `[UInt8]: Int` entry, and input that
/// does not follow that shape yields an empty dictionary.
final class FileDecoderTests: XCTestCase {
    private var sut: FileDecoder!

    // A fresh decoder is created for every test and released afterwards.
    override func setUpWithError() throws {
        sut = FileDecoder()
    }

    override func tearDownWithError() throws {
        sut = nil
    }

    /// Empty `Data` must decode to an empty dictionary.
    func testGivenInvalidDataWhenDecodeThenMatchEmptyDictionary() throws {
        let emptyRanks: [[UInt8]: Int] = [:]

        let decoded = sut.decode(Data())

        XCTAssertEqual(decoded, emptyRanks)
    }

    /// Text whose lines are not `<base64> <rank>` pairs must decode to an empty dictionary.
    func testGivenInvalidDataEncodedWhenDecodeThenMatchEmptyDictionary() throws {
        let text = """
        sample
        other sample
        fail
        """
        let emptyRanks: [[UInt8]: Int] = [:]

        // UTF-8 encoding of a Swift `String` cannot fail, so no unwrapping is needed.
        let decoded = sut.decode(Data(text.utf8))

        XCTAssertEqual(decoded, emptyRanks)
    }

    /// Well-formed lines decode to the token bytes keyed to their rank.
    func testGivenDataWhenDecodeThenMatchDictionary() throws {
        let text = """
        Zm9v 10
        Zm9vMQ== 20
        cmFuZG9t 100
        """
        // "Zm9v" -> "foo", "Zm9vMQ==" -> "foo1", "cmFuZG9t" -> "random"
        let ranks: [[UInt8]: Int] = [
            [102, 111, 111]: 10,
            [102, 111, 111, 49]: 20,
            [114, 97, 110, 100, 111, 109]: 100
        ]

        let decoded = sut.decode(Data(text.utf8))

        XCTAssertEqual(decoded, ranks)
    }
}

--------------------------------------------------------------------------------
/Tests/TiktokenTests/LoadTests.swift:
--------------------------------------------------------------------------------
//
//  LoadTests.swift
//
//
//  Created by Alberto Espinilla Garrido on 22/3/23.
//

import XCTest
@testable import Tiktoken

/// Smoke tests for the `Load` helpers.
///
/// NOTE(review): `testLoadBpe` passes a remote URL, so it presumably needs
/// network access to succeed — confirm before running offline.
final class LoadTests: XCTestCase {
    /// Calls the data-gym loader with empty file locations.
    ///
    /// NOTE(review): if `dataGymToMergeableBpeRanks` is non-throwing, the `try?`
    /// below only wraps the result in an optional and the assertion can never
    /// fail — confirm against the loader's signature and tighten if so.
    func testExample() async throws {
        let result = try? await Load.dataGymToMergeableBpeRanks(vocabBpeFile: "", encoderJsonFile: "")
        XCTAssertNotNil(result)
    }

    /// Loading the `r50k_base` ranks must produce at least one entry.
    ///
    /// The loader is non-throwing and returns a non-optional dictionary, so the
    /// previous `try?` + `XCTAssertNotNil` combination always passed. Checking
    /// for a non-empty result makes a failed load visible.
    func testLoadBpe() async throws {
        let result = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken")
        XCTAssertFalse(result.isEmpty, "Expected r50k_base ranks to contain entries")
    }
}

--------------------------------------------------------------------------------
/Tests/TiktokenTests/ModelTests.swift:
--------------------------------------------------------------------------------
//
//  ModelTests.swift
//
//
//  Created by Alberto Espinilla Garrido on 20/3/23.
//

import XCTest
@testable import Tiktoken

/// Table-driven tests for `Model.getEncoding(_:)`.
final class ModelTests: XCTestCase {

    /// Exact model names must resolve to some encoding.
    func testGivenModelNamesWhenGetEncodingThenMatch() throws {
        let cases = [
            Test(input: "gpt-4", output: "cl100k_base"),
            Test(input: "gpt-3.5-turbo", output: "cl100k_base"),
            Test(input: "davinci", output: "r50k_base"),
            Test(input: "text-davinci-edit-001", output: "p50k_edit"),
        ]

        for testCase in cases {
            let output = Model.getEncoding(testCase.input)
            // TODO: also compare the resolved encoding with `testCase.output`:
            // XCTAssertEqual(try XCTUnwrap(output), testCase.output)
            XCTAssertNotNil(output, "No encoding resolved for model \(testCase.input)")
        }
    }

    /// Versioned / suffixed model names must resolve to some encoding.
    func testGivenModelNamesWithPrefisWhenGetEncodingThenMatch() throws {
        let cases = [
            Test(input: "gpt-4-0314", output: "cl100k_base"),
            Test(input: "gpt-4-32k", output: "cl100k_base"),
            Test(input: "gpt-3.5-turbo-0301", output: "cl100k_base"),
            Test(input: "gpt-3.5-turbo-0401", output: "cl100k_base"),
        ]

        for testCase in cases {
            let output = Model.getEncoding(testCase.input)
            // TODO: also compare the resolved encoding with `testCase.output`:
            // XCTAssertEqual(try XCTUnwrap(output), testCase.output)
            XCTAssertNotNil(output, "No encoding resolved for model \(testCase.input)")
        }
    }

    /// Names that match no known model must not resolve to an encoding.
    func testGivenUnknowModelNamesWhenGetEncodingThenMatchNil() throws {
        for name in ["sample", "chatgpt", "invalid", "test"] {
            let output = Model.getEncoding(name)
            XCTAssertNil(output, "Unexpected encoding resolved for model \(name)")
        }
    }
}

--------------------------------------------------------------------------------
/Tests/TiktokenTests/Test.swift:
--------------------------------------------------------------------------------
//
//  Test.swift
//
//
//  Created by Alberto Espinilla Garrido on 20/3/23.
//

import Foundation

/// A single table-driven test case: an input paired with its expected output.
///
/// Both types are generic so the same helper serves any test table; they are
/// inferred at the call site, e.g. `Test(input: "gpt-4", output: "cl100k_base")`.
struct Test<Input, Output> {
    let input: Input
    let output: Output
}

--------------------------------------------------------------------------------
/Tests/TiktokenTests/TiktokenTests.swift:
--------------------------------------------------------------------------------
import XCTest
@testable import Tiktoken

/// Encoding checks that go through the shared `Tiktoken` entry point.
///
/// Despite the "Decode" in the method names, both tests encode a string and
/// compare the resulting token ids with a fixed expectation.
final class TiktokenTests: XCTestCase {
    private var sut: Tiktoken = .shared

    /// Encodes a Chinese sentence with the encoding resolved for "gpt2".
    func testGivenGPT2WhenDecodeThenMatch() async throws {
        // Alternative fixture (emoji-heavy sentence), kept for manual runs:
        // let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍♂️ y más texto que no tiene sentido 🛟"
        // let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253]

        let input = "這個算法真的太棒了"
        let expected = [34460, 247, 161, 222, 233, 163, 106, 245, 37345, 243, 40367, 253, 21410, 13783, 103, 162, 96, 240, 12859, 228]

        let encoder = try await sut.getEncoding("gpt2")
        // `XCTUnwrap` fails the test if no encoder was resolved.
        let output = try XCTUnwrap(encoder?.encode(value: input))
        XCTAssertEqual(output, expected)
    }

    /// Encodes the same Chinese sentence with the encoding resolved for "gpt-4".
    func testGivenGPT4WhenDecodeThenMatch() async throws {
        // Alternative fixture (emoji-heavy sentence), kept for manual runs:
        // let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍ y más texto que no tiene sentido 🛟"
        // let expected = [14101, 78, 1560, 653, 33125, 62904, 101, 9468, 237, 119, 378, 235, 93273, 119, 390, 100166, 46418, 11410, 235, 123, 93273, 225, 9468, 237, 120, 9468, 100, 250, 378, 235, 379, 11158, 33125, 1744, 912, 24215, 65484, 11410, 249, 253]

        let input = "這個算法真的太棒了"
        let expected = [11589, 247, 20022, 233, 70203, 25333, 89151, 9554, 8192, 103, 77062, 240, 35287]

        let encoder = try await sut.getEncoding("gpt-4")
        // `XCTUnwrap` fails the test if no encoder was resolved.
        let output = try XCTUnwrap(encoder?.encode(value: input))
        XCTAssertEqual(output, expected)
    }
}

--------------------------------------------------------------------------------