├── .github
└── workflows
│ └── workflow.yml
├── .gitignore
├── .swiftpm
└── xcode
│ └── package.xcworkspace
│ └── xcshareddata
│ └── IDEWorkspaceChecks.plist
├── LICENSE
├── Package.swift
├── README.md
├── Sources
└── Tiktoken
│ ├── CoreBPE.swift
│ ├── Encoding.swift
│ ├── Extensions
│ ├── Array+PrevCurrent.swift
│ ├── Character+Int.swift
│ ├── String+Base64.swift
│ ├── String+Substring.swift
│ └── String+UInt8.swift
│ ├── FileDecoder.swift
│ ├── Load.swift
│ ├── Model.swift
│ ├── Ranks.swift
│ ├── Tiktoken.swift
│ └── Vocab.swift
└── Tests
└── TiktokenTests
├── CoreBPETests.swift
├── FileDecoderTests.swift
├── LoadTests.swift
├── ModelTests.swift
├── Test.swift
└── TiktokenTests.swift
/.github/workflows/workflow.yml:
--------------------------------------------------------------------------------
1 | # This workflow will build a Swift project
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-swift
3 |
4 | name: Swift
5 |
6 | on:
7 | push:
8 | branches: [ "main", "develop" ]
9 | pull_request:
10 | branches: [ "main", "develop", 'release/**' ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: macos-latest
16 |
17 | steps:
18 | - name: Setup Swift
19 | uses: swift-actions/setup-swift@v1.20.0
20 | with:
21 | swift-version: 5.7.1
22 | - uses: actions/checkout@v3
23 | - name: Test
24 | run: swift test -v
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | /.build
3 | /Packages
4 | /*.xcodeproj
5 | xcuserdata/
6 | DerivedData/
7 | .swiftpm
8 | .netrc
9 |
--------------------------------------------------------------------------------
/.swiftpm/xcode/package.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | IDEDidComputeMac32BitWarning
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Alberto Espinilla
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
1 | // swift-tools-version: 5.7
2 | // The swift-tools-version declares the minimum version of Swift required to build this package.
3 |
4 | import PackageDescription
5 |
6 | let package = Package(
7 | name: "Tiktoken",
8 | platforms: [.macOS(.v10_15), .iOS(.v13), .tvOS(.v13), .watchOS(.v6)],
9 | products: [
10 | // The package vends a single library product, "Tiktoken", built from the target of the same name.
11 | .library(
12 | name: "Tiktoken",
13 | targets: ["Tiktoken"]),
14 | ],
15 | dependencies: [
16 | // No external package dependencies are declared; add entries in the form shown below if one is needed.
17 | // .package(url: /* package url */, from: "1.0.0"),
18 | ],
19 | targets: [
20 | // "Tiktoken" is the library module and has no dependencies.
21 | // "TiktokenTests" is the test suite and depends only on "Tiktoken".
22 | .target(
23 | name: "Tiktoken",
24 | dependencies: []),
25 | .testTarget(
26 | name: "TiktokenTests",
27 | dependencies: ["Tiktoken"]),
28 | ]
29 | )
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tiktoken
2 |
3 | OpenAI's Tiktoken implementation written in Swift. This is a basic implementation of ordinary encode/decode.
4 |
5 | Supports vocab:
6 | - gpt2 (Same for gpt3)
7 | - r50k_base
8 | - p50k_base
9 | - p50k_edit
10 | - cl100k_base (gpt-4 and gpt-3.5)
11 |
12 | It also supports Asian characters and emojis.
13 |
14 | Stars are welcome 😊.
15 |
16 | ## Usage
17 |
18 | ```swift
19 | guard let encoder = try await Tiktoken.shared.getEncoding("gpt-4") else { return }
20 | let encoded = encoder.encode(value: "這個算法真的太棒了")
21 | print(encoded)
22 | let decoded = encoder.decode(value: encoded)
23 | print(decoded)
24 | ```
25 |
26 | ## TODO List
27 |
28 | - Encode native
29 | - Encode unstable native
30 | - Multithread
31 | - Custom vocab
32 | - Implement a cache for loaded encodings
33 | - Add/Improve documentation
34 | - Add support for Combine
35 | - Performance optimization
36 | - More testing
37 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/CoreBPE.swift:
--------------------------------------------------------------------------------
1 | //
2 | // CoreBPE.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 23/3/23.
6 | //
7 |
8 | import Foundation
9 |
10 | class CoreBPE {
11 | private let encoder: [[UInt8]: Int]
12 | private let specialTokensEncoder: [String: Int]
13 | private let decoder: [Int: [UInt8]]
14 | private let specialTokensDecoder: [Int: Data]
15 | private let regexTls: [NSRegularExpression]
16 | private let specialRegexTls: [NSRegularExpression]
17 | private let sortedTokenBytes: [Data]
18 | // Every argument defaults to an empty collection, so `CoreBPE()` is a valid (empty) instance.
19 | init(encoder: [[UInt8] : Int] = .init(),
20 | specialTokensEncoder: [String : Int] = .init(),
21 | decoder: [Int : [UInt8]] = .init(),
22 | specialTokensDecoder: [Int : Data] = .init(),
23 | regexTls: [NSRegularExpression] = .init(),
24 | specialRegexTls: [NSRegularExpression] = .init(),
25 | sortedTokenBytes: [Data] = .init()) {
26 | self.encoder = encoder
27 | self.specialTokensEncoder = specialTokensEncoder
28 | self.decoder = decoder
29 | self.specialTokensDecoder = specialTokensDecoder
30 | self.regexTls = regexTls
31 | self.specialRegexTls = specialRegexTls
32 | self.sortedTokenBytes = sortedTokenBytes
33 | }
34 | /// Splits `text` with the first regex in `regexTls` and encodes each matched piece; returns `[]` when no regex was supplied.
35 | func encodeOrdinaryNative(text: String) -> [Int] {
36 | guard let regex = regexTls.first else { return [] }
37 | var ret = [Int]()
38 | for match in regex.matches(in: text, range: NSRange(text.startIndex..., in: text)) {
39 | guard let range = Range(match.range, in: text) else { continue }
40 | let piece = Array(text[range].utf8)
41 | // Fast path: the whole piece is already a single vocabulary entry.
42 | if let token = encoder[piece] {
43 | ret.append(token)
44 | } else {
45 | // Otherwise let the byte-pair encoder build the token sequence for this piece.
46 | ret.append(contentsOf: bytePairEncode(piece, encoder))
47 | }
48 | }
49 | return ret
50 | }
51 | /// Concatenates the bytes of every token found in `decoder` (unknown ids are skipped) and decodes them as UTF-8; returns "" if the bytes are not valid UTF-8.
52 | func decodeNative(tokens: [Int]) -> String {
53 | let data = tokens.reduce(into: Data(), {
54 | if let tokenBytes = decoder[$1] {
55 | $0.append(contentsOf: tokenBytes)
56 | }
57 | })
58 | return String(data: data, encoding: .utf8) ?? ""
59 | }
60 | }
61 |
62 | private extension CoreBPE {
63 | func increaseLastPieceTokenLen(tokens: [Int], lastPieceTokenLen: Int) -> ([Int], Int) {
64 | func tokenIsAllSpace(_ token: Int) -> Bool {
65 | guard let tokenBytes = decoder[token] else { return false }
66 | return tokenBytes.reversed().allSatisfy { [32, 10, 9].contains($0) } // 32 = space, 10 = '\n', 9 = '\t'; Rust original: .all(|&b| [b' ', b'\n', b'\t'].contains(&b))
67 | }
68 | // If the first token of the trailing piece is all whitespace, grow the piece backwards over preceding all-whitespace tokens.
69 | var lastPieceTokenLen = lastPieceTokenLen
70 | if lastPieceTokenLen > 0 && tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen]) {
71 | while lastPieceTokenLen < tokens.count && tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen - 1]) {
72 | lastPieceTokenLen += 1
73 | }
74 | }
75 | // `tokens` is returned unchanged; only the length may have grown, and never beyond the token count.
76 | assert(lastPieceTokenLen <= tokens.count)
77 | return (tokens, lastPieceTokenLen)
78 | }
79 | }
80 |
81 | // MARK: - Merges
82 |
83 | private extension CoreBPE {
84 | func bytePairMerge(_ piece: [UInt8], _ ranks: [[UInt8]: Int], completion: (Range) -> T) -> [T] {
85 | // This is a vector of (start, rank).
86 | // The rank is of the byte pair starting at position start.
87 | // The rank of the last item in the vector is not a valid value.
88 | var parts = (0.. Int? = { parts, startIdx, skip in
91 | let calculatedIndex = startIdx + skip + 2
92 | if calculatedIndex < parts.count {
93 | let range = parts[startIdx].0.. 1 {
120 | // usize::MAX is a sentinel rank value allowing us to
121 | // take the min more quickly
122 | var minRank = (Int.max, 0)
123 | for (i, ( _, rank)) in parts.enumerated() {
124 | if rank < minRank.0 {
125 | minRank = (rank, i)
126 | }
127 | }
128 |
129 | if minRank.0 != Int.max {
130 | let i = minRank.1
131 |
132 | // NOTE: We are about to remove parts[i + 1]. We do not do it
133 | // yet because there are cache-locality benefits to updating
134 | // parts[i] and parts[i-1] before removing, which could thrash
135 | // the cache. Thus, we update the rank calculation by skipping over
136 | // parts[i + 1], by invoking `get_rank!` with `skip = 1`.
137 | parts[i].1 = getRank(parts, i, 1) ?? Int.max
138 | if i > 0 {
139 | parts[i - 1].1 = getRank(parts, i - 1, 1) ?? Int.max
140 | }
141 | parts.remove(at: i + 1)
142 | } else {
143 | break
144 | }
145 | }
146 |
147 | // TODO: Use ranks
148 | return parts.prevCurrent({ completion($0.0..<$1.0) })
149 | }
150 | /// Encodes `piece` into token ids by merging byte pairs according to `ranks`.
151 | func bytePairEncode(_ piece: [UInt8], _ ranks: [[UInt8]: Int]) -> [Int] {
152 | // A lone byte cannot be merged. `encodeOrdinaryNative` only calls this after its own lookup missed, so a
153 | // force-unwrap here would always trap; use the same 0 fallback as the merged chunks below instead.
154 | guard piece.count != 1 else { return [ranks[piece] ?? 0] }
155 | return bytePairMerge(piece, ranks, completion: { span in
156 | // Look up the token id of the merged byte range; unknown chunks map to 0.
157 | ranks[Array(piece[span])] ?? 0
158 | })
159 | }
160 |
161 | // func bytePairSplit(_ piece: [UInt8], _ ranks: [[UInt8]: Int]) -> [[UInt8]] {
162 | // if piece.count == 1 {
163 | // return [piece]
164 | // }
165 | // return bytePairMerge(piece, ranks, completion: { Array(piece[$0]) })
166 | // }
167 | }
168 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Encoding.swift:
--------------------------------------------------------------------------------
1 | //
2 | // Encoding.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 20/3/23.
6 | //
7 |
8 | import Foundation
9 |
10 | //"""Creates an Encoding object.
11 | //See openai_public.py for examples of how to construct an Encoding object.
12 | //Args:
13 | // name: The name of the encoding. It should be clear from the name of the encoding
14 | // what behaviour to expect, in particular, encodings with different special tokens
15 | // should have different names.
16 | // pat_str: A regex pattern string that is used to split the input text.
17 | // mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
18 | // must correspond to merge priority.
19 | // special_tokens: A dictionary mapping special token strings to their token values.
20 | // explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
21 | // that the number of mergeable tokens and special tokens is equal to this number.
22 | //"""
23 |
24 | public class Encoding {
25 | // Holds the split regex and rank table for one encoding and delegates encode/decode to `CoreBPE`.
26 | //mergeable_ranks: dict[bytes, int],
27 | //special_tokens: dict[str, int],
28 | //explicit_n_vocab: Optional[int] = None,
29 |
30 | // let name: String
31 | // let explicitNVocab: Int?
32 | // let pattern: String
33 | // let mergeableRanks: [[UInt8]: Int]
34 | // let specialTokens: [String: Int] // TODO: Map to [UInt8]
35 |
36 | private let name: String
37 | private let regex: NSRegularExpression // Regex
38 | private let mergeableRanks: [[UInt8]: Int]
39 | private let specialTokens: [String: Int]
40 | private let maxValueToken: Int
41 |
42 | private let coreBpe: CoreBPE
43 | // `explicitNVocab` is accepted but not used: its validation is still commented out in the body below.
44 | init(name: String, regex: NSRegularExpression, mergeableRanks: [[UInt8]: Int], specialTokens: [String: Int], explicitNVocab: Int? = nil) {
45 | self.name = name
46 | self.regex = regex
47 | self.mergeableRanks = mergeableRanks
48 | self.specialTokens = specialTokens
49 | self.maxValueToken = max(mergeableRanks.values.max() ?? 0, specialTokens.values.max() ?? 0)
50 |
51 | // Assert validation
52 |
53 | // if explicit_n_vocab:
54 | // assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab
55 | // assert self.max_token_value == explicit_n_vocab - 1
56 | // Build the reverse (token id -> bytes) table; special tokens are not forwarded to CoreBPE.
57 | let decoder = mergeableRanks.inverted
58 | self.coreBpe = .init(encoder: mergeableRanks, decoder: decoder, regexTls: [regex])
59 | }
60 | /// Encodes `value` into token ids via `CoreBPE.encodeOrdinaryNative`.
61 | public func encode(value: String) -> [Int] {
62 | coreBpe.encodeOrdinaryNative(text: value)
63 | }
64 | /// Decodes token ids back into a string via `CoreBPE.decodeNative`.
65 | public func decode(value: [Int]) -> String {
66 | coreBpe.decodeNative(tokens: value)
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Extensions/Array+PrevCurrent.swift:
--------------------------------------------------------------------------------
1 | //
2 | // Array+PrevCurrent.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 10/4/23.
6 | //
7 |
8 | import Foundation
9 |
10 | extension Array {
11 | /// Applies `body` to every adjacent pair `(self[i - 1], self[i])` and returns the results in order; fewer than two elements yields `[]`.
12 | /// Errors thrown by `body` propagate to the caller (as `rethrows` promises) instead of being silently dropped.
13 | func prevCurrent<T>(_ body: (Element, Element) throws -> T) rethrows -> [T] {
14 | try zip(self, dropFirst()).map { previous, current in
15 | try body(previous, current)
16 | }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Extensions/Character+Int.swift:
--------------------------------------------------------------------------------
1 | //
2 | // Character+Int.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 2/4/23.
6 | //
7 |
8 | import Foundation
9 |
10 | extension Character {
11 | init(_ i: Int) {
12 | self = Character(Unicode.Scalar(i)!) // traps when `i` is not a valid Unicode scalar value
13 | }
14 | // True when at least one scalar of the character is printable.
15 | var isPrintable: Bool {
16 | unicodeScalars.contains(where: \.isPrintable)
17 | }
18 | }
19 |
20 | extension Unicode.Scalar {
21 | // A scalar is printable unless its general category is `.control` or `.format`.
22 | // Every other category counts as printable.
23 | var isPrintable: Bool {
24 | let category = properties.generalCategory
25 | return category != .control && category != .format
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Extensions/String+Base64.swift:
--------------------------------------------------------------------------------
1 | //
2 | // String+Base64.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 2/4/23.
6 | //
7 |
8 | import Foundation
9 |
10 | extension String {
11 | // Base64-encodes the UTF-8 bytes of the string.
12 | func base64Encoded() -> String? {
13 | data(using: .utf8).map { $0.base64EncodedString() }
14 | }
15 | // Decodes Base64 text and reads the resulting bytes with `.ascii`; nil when either step fails.
16 | func base64Decoded() -> String? {
17 | Data(base64Encoded: self).flatMap { String(data: $0, encoding: .ascii) }
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Extensions/String+Substring.swift:
--------------------------------------------------------------------------------
1 | //
2 | // String+Substring.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 26/3/23.
6 | //
7 |
8 | import Foundation
9 |
10 | extension String {
11 | func index(from: Int) -> Index {
12 | index(startIndex, offsetBy: from)
13 | }
14 |
15 | func substring(from: Int) -> String {
16 | let fromIndex = index(from: from)
17 | return String(self[fromIndex...])
18 | }
19 |
20 | func substring(to: Int) -> String {
21 | let toIndex = index(from: to)
22 | return String(self[..) -> String {
26 | let startIndex = index(from: r.lowerBound)
27 | let endIndex = index(from: r.upperBound)
28 | return String(self[startIndex.. [[UInt8]: Int] {
12 | guard let decoded = String(data: data, encoding: .utf8) else { return [:] }
13 | var result: [[UInt8]: Int] = .init()
14 | decoded.split(separator: "\n").forEach({
15 | let lineSplit = $0.split(separator: " ")
16 | guard let first = lineSplit.first,
17 | let key = String(first).base64Decoded(),
18 | let value = lineSplit.last
19 | else {
20 | return
21 | }
22 | result[key.uInt8] = Int(value)
23 | })
24 | return result
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Load.swift:
--------------------------------------------------------------------------------
1 | //
2 | // Load.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 22/3/23.
6 | //
7 |
8 | import Foundation
9 | import CryptoKit
10 |
11 | enum Load {
12 | static func loadTiktokenBpe(url: String, decoder: FileDecoder = FileDecoder()) async -> [[UInt8]: Int] {
13 | let fetched = try? await Load.fetch(stringUrl: url) // nil on a thrown error or when no data came back
14 | return fetched.map { decoder.decode($0) } ?? [:]
15 | }
16 |
17 | static func dataGymToMergeableBpeRanks(vocabBpeFile: String, encoderJsonFile: String? = nil) async -> [[UInt8]: Int] {
18 | var rankToIntByte = (0.. [Int] {
77 | value.compactMap({ dict[$0] })
78 | }
79 |
80 | static func toDictionary(array: [Int]) -> [Character: Int] {
81 | [Character: Int](array.map { (Character($0), $0) }, uniquingKeysWith: { _, latest in latest }) // a repeated value keeps its last entry
82 | }
83 |
84 | /// Returns the bytes at `stringUrl`, served from the on-disk cache when a readable cached copy exists; nil when the URL string is invalid.
85 | static func fetch(stringUrl: String) async throws -> Data? {
86 | // Cache files are named after `stringUrl.sha256`.
87 | let cacheFileURL = cacheDirectoryURL.appendingPathComponent("\(stringUrl.sha256)")
88 | // Use the cached copy if it can be read; an unreadable cache file falls through to the network.
89 | if let cached = try? Data(contentsOf: cacheFileURL) {
90 | return cached
91 | }
92 |
93 | guard let url = URL(string: stringUrl) else { return nil }
94 | let (data, response) = try await URLSession.shared.data(from: url)
95 |
96 | // Never cache an HTTP error body (e.g. a 404 page) as if it were vocabulary data.
97 | if let http = response as? HTTPURLResponse, !(200..<300).contains(http.statusCode) {
98 | throw URLError(.badServerResponse)
99 | }
100 |
101 | // Caching is best-effort: a failed write is logged and the downloaded data is still returned.
102 | do {
103 | try data.write(to: cacheFileURL, options: .atomic)
104 | } catch {
105 | print("Error while caching: \(error)")
106 | }
107 | return data
108 | }
109 |
110 | static func getVocab(url: String) async -> [(String, String)] {
111 | // Any fetch or UTF-8 decoding failure yields an empty list.
112 | guard let data = try? await fetch(stringUrl: url),
113 | let contents = String(data: data, encoding: .utf8) else { return [] }
114 | var pairs: [(String, String)] = []
115 | for row in contents.split(separator: "\n", omittingEmptySubsequences: true) {
116 | // Skip the "#version" header line.
117 | if row.starts(with: "#version") { continue }
118 | let fields = String(row).splitWhiteSpaces
119 | if let first = fields.first, let last = fields.last {
120 | pairs.append((first, last))
121 | }
122 | }
123 | return pairs
124 | }
125 |
126 | static func getDecoder(url: String) async -> [String: Int] {
127 | // Fetch failures and malformed JSON both collapse to an empty dictionary.
128 | guard let payload = try? await fetch(stringUrl: url) else { return [:] }
129 | let parsed = try? JSONDecoder().decode([String: Int].self, from: payload)
130 | return parsed ?? [:]
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Model.swift:
--------------------------------------------------------------------------------
1 | //
2 | // Model.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 20/3/23.
6 | //
7 |
8 | import Foundation
9 |
10 | enum Model {
11 | // Resolves a model name to its vocabulary: the exact-name table first, then the prefix table.
12 | static func getEncoding(_ name: String) -> Vocab? {
13 | let exactMatch = MODEL_TO_ENCODING[name].flatMap { encodingName in
14 | Vocab.all.first(where: { $0.name == encodingName })
15 | }
16 | return exactMatch ?? findPrefix(with: name)
17 | }
18 | }
19 |
20 | private extension Model {
21 | static let MODEL_PREFIX_TO_ENCODING: [String: String] = [
22 | // chat
23 | "gpt-4-": "cl100k_base", // e.g., gpt-4-0314, etc., plus gpt-4-32k
24 | "gpt-3.5-turbo-": "cl100k_base", // e.g, gpt-3.5-turbo-0301, -0401, etc.
25 | ]
26 | // Exact model name -> encoding name; each value must be the `name` of an entry in `Vocab.all` to resolve.
27 | static let MODEL_TO_ENCODING: [String: String] = [
28 | // chat
29 | "gpt-4": "cl100k_base",
30 | "gpt-3.5-turbo": "cl100k_base",
31 | // text
32 | "text-davinci-003": "p50k_base",
33 | "text-davinci-002": "p50k_base",
34 | "text-davinci-001": "r50k_base",
35 | "text-curie-001": "r50k_base",
36 | "text-babbage-001": "r50k_base",
37 | "text-ada-001": "r50k_base",
38 | "davinci": "r50k_base",
39 | "curie": "r50k_base",
40 | "babbage": "r50k_base",
41 | "ada": "r50k_base",
42 | // code
43 | "code-davinci-002": "p50k_base",
44 | "code-davinci-001": "p50k_base",
45 | "code-cushman-002": "p50k_base",
46 | "code-cushman-001": "p50k_base",
47 | "davinci-codex": "p50k_base",
48 | "cushman-codex": "p50k_base",
49 | // edit
50 | "text-davinci-edit-001": "p50k_edit",
51 | "code-davinci-edit-001": "p50k_edit",
52 | // embeddings
53 | "text-embedding-ada-002": "cl100k_base",
54 | // old embeddings
55 | "text-similarity-davinci-001": "r50k_base",
56 | "text-similarity-curie-001": "r50k_base",
57 | "text-similarity-babbage-001": "r50k_base",
58 | "text-similarity-ada-001": "r50k_base",
59 | "text-search-davinci-doc-001": "r50k_base",
60 | "text-search-curie-doc-001": "r50k_base",
61 | "text-search-babbage-doc-001": "r50k_base",
62 | "text-search-ada-doc-001": "r50k_base",
63 | "code-search-babbage-code-001": "r50k_base",
64 | "code-search-ada-code-001": "r50k_base",
65 | // open source
66 | "gpt2": "gpt2",
67 | "gpt3": "gpt2", // no vocabulary is named "gpt3"; it reuses the gpt2 one so the lookup resolves instead of returning nil
68 | ]
69 | // Returns the vocabulary of the first prefix-table key that `name` starts with, or nil when none matches.
70 | static func findPrefix(with name: String) -> Vocab? {
71 | guard let key = MODEL_PREFIX_TO_ENCODING.keys.first(where: { name.starts(with: $0) }),
72 | let encodingName = MODEL_PREFIX_TO_ENCODING[key],
73 | let vocab = Vocab.all.first(where: { $0.name == encodingName }) else {
74 | return nil
75 | }
76 | return vocab
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Ranks.swift:
--------------------------------------------------------------------------------
1 | //
2 | // Ranks.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 17/5/23.
6 | //
7 |
8 | import Foundation
9 |
10 | typealias Ranks = [[UInt8]: Int]
11 | // Reverse lookup (token id -> bytes) for a rank table; if two entries share an id, the one visited last wins.
12 | extension Ranks {
13 | var inverted: [Int: [UInt8]] {
14 | [Int: [UInt8]](map { ($0.value, $0.key) }, uniquingKeysWith: { _, latest in latest })
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Tiktoken.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
3 | public struct Tiktoken {
4 | // The initializer is private, so `shared` is the only instance callers can use.
5 | public static let shared: Tiktoken = .init()
6 |
7 | private init() {}
8 | /// Resolves `name` to a vocabulary, loads its ranks and builds an `Encoding`; returns nil for an unknown name and throws if the vocabulary's pattern is not a valid regex.
9 | public func getEncoding(_ name: String) async throws -> Encoding? {
10 | guard let vocab = Model.getEncoding(name) else { return nil }
11 | let encoder = await loadRanks(vocab)
12 | let regex = try NSRegularExpression(pattern: vocab.pattern)
13 | let encoding = Encoding(name: name, regex: regex, mergeableRanks: encoder, specialTokens: vocab.specialTokens)
14 | return encoding
15 | }
16 |
17 | // public func getEncoding(for vocab: Vocab) -> Encoding? {
18 | // return nil
19 | // }
20 | //
21 | // public func register() {
22 | // // TODO: Register model and Encoding
23 | // }
24 | //
25 | // public func clear() {
26 | // // TODO: Clear all cached encoding
27 | // }
28 | }
29 |
30 | private extension Tiktoken {
31 | // "gpt2"/"gpt3" vocabularies go through `Load.dataGymToMergeableBpeRanks`; every other one through `Load.loadTiktokenBpe`.
32 | func loadRanks(_ vocab: Vocab) async -> [[UInt8]: Int] {
33 | switch vocab.name {
34 | case "gpt2", "gpt3": return await Load.dataGymToMergeableBpeRanks(vocabBpeFile: vocab.url)
35 | default: return await Load.loadTiktokenBpe(url: vocab.url)
36 | }
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/Sources/Tiktoken/Vocab.swift:
--------------------------------------------------------------------------------
1 | //
2 | // Vocab.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 17/5/23.
6 | //
7 |
8 | import Foundation
9 |
10 | public struct Vocab {
11 | public let name: String // encoding name, matched against the `Model` lookup tables
12 | public let url: String // location the rank data is loaded from
13 | public let explicitNVocab: Int? // optional vocabulary size; nothing in this struct validates it
14 | public let pattern: String // regex source compiled with NSRegularExpression by `Tiktoken.getEncoding`
15 | public let specialTokens: [String: Int] // special token string -> token id
16 | /// Memberwise initializer; `explicitNVocab` defaults to nil and `specialTokens` to an empty dictionary.
17 | public init(name: String,
18 | url: String,
19 | explicitNVocab: Int? = nil,
20 | pattern: String,
21 | specialTokens: [String : Int] = [:]) {
22 | self.name = name
23 | self.url = url
24 | self.explicitNVocab = explicitNVocab
25 | self.pattern = pattern
26 | self.specialTokens = specialTokens
27 | }
28 | }
29 |
30 | public extension Vocab { // patterns are plain ICU regex source for NSRegularExpression: no JavaScript-style "/.../gu" wrapper, which would be matched literally
31 | static var gpt2: Vocab {
32 | .init(name: "gpt2",
33 | url: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
34 | explicitNVocab: 50257,
35 | pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
36 | specialTokens: ["<|endoftext|>": 50256])
37 | }
38 | // r50k_base: same split pattern as gpt2, with ranks taken from the r50k_base.tiktoken file.
39 | static var r50kBase: Vocab {
40 | .init(name: "r50k_base",
41 | url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
42 | explicitNVocab: 50257,
43 | pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
44 | specialTokens: ["<|endoftext|>": 50256])
45 | }
46 | // p50k_base: same split pattern, with ranks taken from the p50k_base.tiktoken file.
47 | static var p50kBase: Vocab {
48 | .init(name: "p50k_base",
49 | url: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
50 | explicitNVocab: 50281,
51 | pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
52 | specialTokens: ["<|endoftext|>": 50256])
53 | }
54 | // p50k_edit: reuses the p50k_base rank file and adds the three fim_* special tokens.
55 | static var p50kEdit: Vocab {
56 | .init(name: "p50k_edit",
57 | url: "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
58 | pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
59 | specialTokens: [
60 | "<|endoftext|>": 50256,
61 | "<|fim_prefix|>": 50281,
62 | "<|fim_middle|>": 50282,
63 | "<|fim_suffix|>": 50283
64 | ])
65 | }
66 | // cl100k_base: uses its own split pattern (case-insensitive contractions, digit runs of at most three).
67 | static var cl100kBase: Vocab {
68 | .init(name: "cl100k_base",
69 | url: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
70 | pattern: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
71 | specialTokens: [
72 | "<|endoftext|>": 100257,
73 | "<|fim_prefix|>": 100258,
74 | "<|fim_middle|>": 100259,
75 | "<|fim_suffix|>": 100260,
76 | "<|endofprompt|>": 100276
77 | ])
78 | }
79 | // Every built-in vocabulary; `Model` searches this list by name.
80 | static var all: [Vocab] = [.gpt2, .r50kBase, .p50kBase, .p50kEdit, .cl100kBase]
81 | }
82 |
--------------------------------------------------------------------------------
/Tests/TiktokenTests/CoreBPETests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // CoreBPETests.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 28/3/23.
6 | //
7 |
8 | import XCTest
9 | @testable import Tiktoken
10 |
11 | final class CoreBPETests: XCTestCase {
12 |
13 | private var sut: CoreBPE!
14 |
15 | override func setUpWithError() throws {
16 | sut = .init()
17 | }
18 |
19 | override func tearDownWithError() throws {
20 | sut = nil
21 | }
22 |
23 | func testEncodeOrdinaryNative() async throws {
24 | // let input = "This is an example sentence to try encoding out on!"
25 | // let expected = [1212, 318, 281, 1672, 6827, 284, 1949, 21004, 503, 319, 0]
26 | let input = "hello 👋 world 🌍"
27 | let expected = [31373, 50169, 233, 995, 12520, 234, 235]
28 | //
29 | // let input = "Esto es un texto 👨🏻💻 con emojis diferentes 🍿💃🏼🧜♂️ y más texto que no tiene sentido 🛟"
30 | // let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253]
31 |
32 |
33 | // let input = "Vamos a probar🚒🚁🚀🚊 muchos emoticonos para probar⚽️🤸🏿♀️ diferentes codificaciones 👨🏻💻♨︎"
34 | // let expected = [53, 321, 418, 257, 1861, 283, 8582, 248, 240, 8582, 248, 223, 8582, 248, 222, 8582, 248, 232, 881, 418, 4085, 4749, 418, 31215, 1861, 283, 158, 248, 121, 37929, 8582, 97, 116, 8582, 237, 123, 447, 235, 17992, 222, 37929, 288, 361, 9100, 274, 14873, 811, 49443, 274, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 17992, 101, 35266, 236]
35 |
36 |
37 | let encoder = await Load.dataGymToMergeableBpeRanks(vocabBpeFile: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", encoderJsonFile: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json")
38 | let decoder = encoder.reduce(into: [:], { $0[$1.value] = $1.key })
39 | let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu"))
40 | sut = .init(encoder: encoder, decoder: decoder, regexTls: [regex])
41 | let output = sut.encodeOrdinaryNative(text: input)
42 | XCTAssertEqual(output, expected)
43 |
44 | let decodedOutput = sut.decodeNative(tokens: output)
45 | XCTAssertEqual(decodedOutput, input)
46 | }
47 |
48 | func testEncodeOrdinaryNativeWithModel() async throws {
49 | // let input = "This is an example sentence to try encoding out on!"
50 | // let expected = [1212, 318, 281, 1672, 6827, 284, 1949, 21004, 503, 319, 0]
51 | // let input = "hello 👋 world 🌍"
52 | // let expected = [31373, 50169, 233, 995, 12520, 234, 235]
53 | //
54 | let input = "Esto es un texto 👨🏻💻 con emojis diferentes 🍿💃🏼🧜♂️ y más texto que no tiene sentido 🛟"
55 | let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253]
56 |
57 |
58 | // let input = "Vamos a probar🚒🚁🚀🚊 muchos emoticonos para probar⚽️🤸🏿♀️ diferentes codificaciones 👨🏻💻♨︎"
59 | // let expected = [53, 321, 418, 257, 1861, 283, 8582, 248, 240, 8582, 248, 223, 8582, 248, 222, 8582, 248, 232, 881, 418, 4085, 4749, 418, 31215, 1861, 283, 158, 248, 121, 37929, 8582, 97, 116, 8582, 237, 123, 447, 235, 17992, 222, 37929, 288, 361, 9100, 274, 14873, 811, 49443, 274, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 17992, 101, 35266, 236]
60 | //
61 | // let encoderGPT = await Load.dataGymToMergeableBpeRanks(vocabBpeFile: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", encoderJsonFile: "")
62 | let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken")
63 |
64 |
65 | // "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
66 | // let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu"))
67 | let decoder = encoder.reduce(into: [:], { $0[$1.value] = $1.key })
68 | // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
69 | // let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu"))
70 |
71 | // r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
72 | let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+/gu"))
73 | sut = .init(encoder: encoder, decoder: decoder, regexTls: [regex])
74 |
75 | let output = sut.encodeOrdinaryNative(text: input)
76 | XCTAssertEqual(output, expected)
77 |
78 | let decodedOutput = sut.decodeNative(tokens: output)
79 | XCTAssertEqual(decodedOutput, input)
80 | }
81 |
82 |     /// Encodes an emoji-heavy prompt with the cl100k_base ranks, checks the token ids, then decodes them back to the prompt.
83 |     func testGivenPromptWhenEncodedThenMatch() async throws {
84 |         let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍ y más texto que no tiene sentido 🛟"
85 |         let expected = [14101, 78, 1560, 653, 33125, 62904, 101, 9468, 237, 119, 378, 235, 93273, 119, 390, 100166, 46418, 11410, 235, 123, 93273, 225, 9468, 237, 120, 9468, 100, 250, 378, 235, 379, 11158, 33125, 1744, 912, 24215, 65484, 11410, 249, 253]
86 |         // Alternative fixture:
87 |         // let input = "hello 👋 world 🌍"
88 |         // let expected = [15339, 62904, 233, 1917, 11410, 234, 235]
89 |         let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken")
90 |         let decoder = encoder.reduce(into: [:], { $0[$1.value] = $1.key })
91 |         // NSRegularExpression patterns take no `/…/gu` delimiters: a leading "/" or trailing "/gu" would be matched as literal text.
92 |         let regex = try NSRegularExpression(pattern: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+")
93 |         sut = .init(encoder: encoder, decoder: decoder, regexTls: [regex])
94 |
95 |         let output = sut.encodeOrdinaryNative(text: input)
96 |         XCTAssertEqual(output, expected)
97 |         let decodedOutput = sut.decodeNative(tokens: output)
98 |         XCTAssertEqual(decodedOutput, input)
99 |     }
100 | }
101 |
--------------------------------------------------------------------------------
/Tests/TiktokenTests/FileDecoderTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // FileDecoderTests.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 10/4/23.
6 | //
7 |
8 | import XCTest
9 | @testable import Tiktoken
10 |
11 | /// Exercises `FileDecoder.decode` with empty, malformed and well-formed input.
12 | final class FileDecoderTests: XCTestCase {
13 |     private var sut: FileDecoder!
14 |
15 |     override func setUpWithError() throws {
16 |         sut = FileDecoder()
17 |     }
18 |
19 |     override func tearDownWithError() throws {
20 |         sut = nil
21 |     }
22 |
23 |     /// Decoding an empty `Data` value should produce an empty dictionary.
24 |     func testGivenInvalidDataWhenDecodeThenMatchEmptyDictionary() throws {
25 |         let noEntries: [[UInt8]: Int] = [:]
26 |         XCTAssertEqual(sut.decode(Data()), noEntries)
27 |     }
28 |
29 |     /// Plain-text lines that do not follow the `<base64> <integer>` layout should produce no entries.
30 |     func testGivenInvalidDataEncodedWhenDecodeThenMatchEmptyDictionary() throws {
31 |         let contents = """
32 |         sample
33 |         other sample
34 |         fail
35 |         """
36 |         let payload = try XCTUnwrap(contents.data(using: .utf8))
37 |
38 |         let noEntries: [[UInt8]: Int] = [:]
39 |         XCTAssertEqual(sut.decode(payload), noEntries)
40 |     }
41 |
42 |     /// Every `<base64> <integer>` line should map the decoded bytes to its integer.
43 |     func testGivenDataWhenDecodeThenMatchDictionary() throws {
44 |         let contents = """
45 |         Zm9v 10
46 |         Zm9vMQ== 20
47 |         cmFuZG9t 100
48 |         """
49 |         let payload = try XCTUnwrap(contents.data(using: .utf8))
50 |
51 |         // Keys are the UTF-8 bytes of "foo", "foo1" and "random".
52 |         let ranksByBytes: [[UInt8]: Int] = [
53 |             [102, 111, 111]: 10,
54 |             [102, 111, 111, 49]: 20,
55 |             [114, 97, 110, 100, 111, 109]: 100,
56 |         ]
57 |         XCTAssertEqual(sut.decode(payload), ranksByBytes)
58 |     }
59 | }
60 |
--------------------------------------------------------------------------------
/Tests/TiktokenTests/LoadTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // LoadTests.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 22/3/23.
6 | //
7 |
8 | import XCTest
9 | @testable import Tiktoken
10 |
11 | /// Smoke tests for the `Load` helpers.
12 | final class LoadTests: XCTestCase {
13 |     /// Calls `dataGymToMergeableBpeRanks` with empty file locations and expects a non-nil result.
14 |     func testExample() async throws {
15 |         // NOTE(review): `try?` hides any thrown error behind a nil; confirm whether this call can actually fail.
16 |         let result = try? await Load.dataGymToMergeableBpeRanks(vocabBpeFile: "", encoderJsonFile: "")
17 |         XCTAssertNotNil(result)
18 |     }
19 |
20 |     /// Loads the r50k_base ranks from their public URL and checks that entries were actually produced.
21 |     func testLoadBpe() async throws {
22 |         // `loadTiktokenBpe` is awaited without `try` in the CoreBPE tests and used there as a dictionary,
23 |         // so `try?` + `XCTAssertNotNil` could never fail; assert on the loaded content instead.
24 |         let ranks = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken")
25 |         XCTAssertFalse(ranks.isEmpty)
26 |     }
27 | }
22 |
--------------------------------------------------------------------------------
/Tests/TiktokenTests/ModelTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ModelTests.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 20/3/23.
6 | //
7 |
8 | import XCTest
9 | @testable import Tiktoken
10 |
11 | /// Checks that `Model.getEncoding` resolves known model names and returns nil for unknown ones.
12 | final class ModelTests: XCTestCase {
13 |
14 |     /// Exact model names should resolve to an encoding.
15 |     func testGivenModelNamesWhenGetEncodingThenMatch() throws {
16 |         let fixtures = [
17 |             Test(input: "gpt-4", output: "cl100k_base"),
18 |             Test(input: "gpt-3.5-turbo", output: "cl100k_base"),
19 |             Test(input: "davinci", output: "r50k_base"),
20 |             Test(input: "text-davinci-edit-001", output: "p50k_edit"),
21 |         ]
22 |
23 |         // NOTE(review): only non-nil is asserted; `output` (the expected encoding name) is not compared.
24 |         for fixture in fixtures {
25 |             let encoding = Model.getEncoding(fixture.input)
26 |             XCTAssertNotNil(encoding)
27 |         }
28 |     }
29 |
30 |     /// Names that start with a known model name and add a suffix should still resolve to an encoding.
31 |     func testGivenModelNamesWithPrefisWhenGetEncodingThenMatch() throws {
32 |         let fixtures = [
33 |             Test(input: "gpt-4-0314", output: "cl100k_base"),
34 |             Test(input: "gpt-4-32k", output: "cl100k_base"),
35 |             Test(input: "gpt-3.5-turbo-0301", output: "cl100k_base"),
36 |             Test(input: "gpt-3.5-turbo-0401", output: "cl100k_base"),
37 |         ]
38 |
39 |         // NOTE(review): only non-nil is asserted; `output` (the expected encoding name) is not compared.
40 |         for fixture in fixtures {
41 |             let encoding = Model.getEncoding(fixture.input)
42 |             XCTAssertNotNil(encoding)
43 |         }
44 |     }
45 |
46 |     /// Names that match no known model should yield nil.
47 |     func testGivenUnknowModelNamesWhenGetEncodingThenMatchNil() throws {
48 |         for name in ["sample", "chatgpt", "invalid", "test"] {
49 |             let encoding = Model.getEncoding(name)
50 |             XCTAssertNil(encoding)
51 |         }
52 |     }
53 | }
46 |
--------------------------------------------------------------------------------
/Tests/TiktokenTests/Test.swift:
--------------------------------------------------------------------------------
1 | //
2 | // Test.swift
3 | //
4 | //
5 | // Created by Alberto Espinilla Garrido on 20/3/23.
6 | //
7 |
8 | import Foundation
9 |
10 | /// A test fixture pairing an input with the output expected for it.
11 | ///
12 | /// Both member types are generic parameters, so one fixture type serves any input/output pair;
13 | /// they are inferred at the call site, e.g. `Test(input: "gpt-4", output: "cl100k_base")`.
14 | struct Test<Input, Output> {
15 |     let input: Input
16 |     let output: Output
17 | }
14 |
--------------------------------------------------------------------------------
/Tests/TiktokenTests/TiktokenTests.swift:
--------------------------------------------------------------------------------
1 | import XCTest
2 | @testable import Tiktoken
3 |
4 | /// End-to-end encoding checks that go through the shared `Tiktoken` instance.
5 | final class TiktokenTests: XCTestCase {
6 |     private var sut: Tiktoken = .shared
7 |
8 |     /// Encodes a Chinese sentence with the encoding resolved for "gpt2" and compares the token ids.
9 |     func testGivenGPT2WhenDecodeThenMatch() async throws {
10 |         // Alternative fixture (emoji-heavy prompt):
11 |         // let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍♂️ y más texto que no tiene sentido 🛟"
12 |         // let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253]
13 |         let encoding = try await sut.getEncoding("gpt2")
14 |
15 |         let text = "這個算法真的太棒了"
16 |         let expectedTokens = [34460, 247, 161, 222, 233, 163, 106, 245, 37345, 243, 40367, 253, 21410, 13783, 103, 162, 96, 240, 12859, 228]
17 |         let tokens = try XCTUnwrap(encoding?.encode(value: text))
18 |         XCTAssertEqual(tokens, expectedTokens)
19 |     }
20 |
21 |     /// Encodes the same Chinese sentence with the encoding resolved for "gpt-4" and compares the token ids.
22 |     func testGivenGPT4WhenDecodeThenMatch() async throws {
23 |         // Alternative fixture (emoji-heavy prompt):
24 |         // let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍ y más texto que no tiene sentido 🛟"
25 |         // let expected = [14101, 78, 1560, 653, 33125, 62904, 101, 9468, 237, 119, 378, 235, 93273, 119, 390, 100166, 46418, 11410, 235, 123, 93273, 225, 9468, 237, 120, 9468, 100, 250, 378, 235, 379, 11158, 33125, 1744, 912, 24215, 65484, 11410, 249, 253]
26 |         let encoding = try await sut.getEncoding("gpt-4")
27 |
28 |         let text = "這個算法真的太棒了"
29 |         let expectedTokens = [11589, 247, 20022, 233, 70203, 25333, 89151, 9554, 8192, 103, 77062, 240, 35287]
30 |         let tokens = try XCTUnwrap(encoding?.encode(value: text))
31 |         XCTAssertEqual(tokens, expectedTokens)
32 |     }
33 | }
31 |
--------------------------------------------------------------------------------