├── .gitignore ├── GPTEncoder.podspec ├── LICENSE ├── Package.swift ├── README.md ├── Sources └── GPTEncoder │ ├── Atomic.swift │ ├── Extensions.swift │ ├── GPTEncoder.swift │ ├── GPTEncoderResources.swift │ ├── Helper.swift │ └── Resources │ ├── encoder.json │ └── vocab.bpe └── Tests └── GPTEncoderTests └── GPTEncoderTests.swift /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /.build 3 | /Packages 4 | /*.xcodeproj 5 | xcuserdata/ 6 | DerivedData/ 7 | .swiftpm/config/registries.json 8 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata 9 | .netrc 10 | -------------------------------------------------------------------------------- /GPTEncoder.podspec: -------------------------------------------------------------------------------- 1 | Pod::Spec.new do |s| 2 | 3 | s.name = "GPTEncoder" 4 | s.version = "1.0.3" 5 | s.summary = "A programmatic interface for tokenizing text for OpenAI GPT API." 6 | 7 | s.description = <<-DESC 8 | You can use the tool below to understand how a piece of text would be tokenized by the API, and the total count of tokens in that piece of text. 
9 | DESC 10 | 11 | s.homepage = "https://github.com/alfianlosari/GPTEncoder" 12 | s.license = { :type => "MIT", :file => "LICENSE" } 13 | 14 | s.authors = { "alfianlosari" => "alfianlosari@gmail.com" } 15 | s.social_media_url = "https://github.com/alfianlosari" 16 | 17 | s.swift_versions = ['5.5'] 18 | 19 | s.source = { :git => "https://github.com/alfianlosari/GPTEncoder.git", :tag => s.version } 20 | s.source_files = ["Sources/GPTEncoder/**/*.swift"] 21 | s.resource_bundles = { 22 | 'GPTEncoder_GPTEncoder' => [ 23 | 'Sources/GPTEncoder/Resources/vocab.bpe', 24 | 'Sources/GPTEncoder/Resources/encoder.json' 25 | ] 26 | } 27 | 28 | 29 | s.ios.deployment_target = "12.0" 30 | s.osx.deployment_target = "10.13" 31 | # s.tvos.deployment_target = "12.0" 32 | # s.watchos.deployment_target = "7.0" 33 | 34 | s.requires_arc = true 35 | end 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Alfian Losari 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.5 2 | 3 | import PackageDescription 4 | 5 | let package = Package( 6 | name: "GPTEncoder", 7 | products: [ 8 | .library( 9 | name: "GPTEncoder", 10 | targets: ["GPTEncoder"]), 11 | ], 12 | dependencies: [], 13 | targets: [ 14 | .target( 15 | name: "GPTEncoder", 16 | resources: [ 17 | .process("Resources") 18 | ]), 19 | .testTarget( 20 | name: "GPTEncoderTests", 21 | dependencies: ["GPTEncoder"]), 22 | ] 23 | ) 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPTEncoder 2 | 3 | ![Alt text](https://imagizer.imageshack.com/v2/640x480q70/922/a8ueTO.png "image") 4 | 5 | Swift BPE Encoder/Decoder for OpenAI GPT Models. A programmatic interface for tokenizing text for OpenAI GPT API. 6 | 7 | The GPT family of models process text using tokens, which are common sequences of characters found in text. The models understand the statistical relationships between these tokens, and excel at producing the next token in a sequence of tokens. 8 | 9 | You can use the tool below to understand how a piece of text would be tokenized by the API, and the total count of tokens in that piece of text. 
10 | 11 | This library is based on [nodeJS gpt-3-encoder](https://github.com/latitudegames/GPT-3-Encoder) and [OpenAI Official Python GPT Encoder/Decoder](https://github.com/openai/gpt-2) 12 | 13 | I've also created [GPTTokenizerUI](https://github.com/alfianlosari/GPTTokenizerUI), a SPM lib you can integrate in your app for providing GUI to input text and show the tokenization results used by GPT API. 14 | 15 | ![Alt text](https://imagizer.imageshack.com/v2/640x480q70/922/CEVvrE.png "image") 16 | 17 | ## Supported Platforms 18 | 19 | - iOS/macOS/watchOS/tvOS 20 | - Linux 21 | 22 | ## Installation 23 | 24 | ### Swift Package Manager 25 | - File > Swift Packages > Add Package Dependency 26 | - Add https://github.com/alfianlosari/GPTEncoder.git 27 | 28 | ### Cocoapods 29 | ```ruby 30 | platform :ios, '15.0' 31 | use_frameworks! 32 | 33 | target 'MyApp' do 34 | pod 'GPTEncoder', '~> 1.0.3' 35 | end 36 | ``` 37 | 38 | ## Usage 39 | 40 | ```swift 41 | let encoder = GPTEncoder() 42 | 43 | let str = "The GPT family of models process text using tokens, which are common sequences of characters found in text." 44 | let encoded = encoder.encode(text: str) 45 | print("String: \(str)") 46 | print("Encoded this string looks like: \(encoded)") 47 | print("Total number of token(s): \(encoded.count) and character(s): \(str.count)") 48 | 49 | print("We can look at each token and what it represents") 50 | encoded.forEach { print("Token: \(encoder.decode(tokens: [$0]))") } 51 | print(encoded) 52 | 53 | let decoded = encoder.decode(tokens: encoded) 54 | print("We can decode it back into:\n\(decoded)") 55 | ``` 56 | 57 | ### Encode 58 | 59 | To encode a `String` to an array of `Int` tokens, you can simply invoke `encode` passing the string. 
60 | 61 | ```swift 62 | let encoded = encoder.encode(text: "The GPT family of models process text using tokens, which are common sequences of characters found in text.") 63 | // Output: [464, 402, 11571, 1641, 286, 4981, 1429, 2420, 1262, 16326, 11, 543, 389, 2219, 16311, 286, 3435, 1043, 287, 2420, 13] 64 | ``` 65 | 66 | ### Decode 67 | 68 | To decode an array of `Int` tokens back to the `String` you can invoke `decode` passing the tokens array. 69 | 70 | ```swift 71 | let decoded = encoder.decode(tokens: [464, 402, 11571, 1641, 286, 4981, 1429, 2420, 1262, 16326, 11, 543, 389, 2219, 16311, 286, 3435, 1043, 287, 2420, 13]) 72 | // Output: "The GPT family of models process text using tokens, which are common sequences of characters found in text." 73 | ``` 74 | 75 | ### Clear Cache 76 | 77 | Internally, a cache is used to improve performance when encoding the tokens, you can reset the cache as well. 78 | 79 | ```swift 80 | encoder.clearCache() 81 | ``` 82 | 83 | 84 | -------------------------------------------------------------------------------- /Sources/GPTEncoder/Atomic.swift: -------------------------------------------------------------------------------- 1 | // 2 | // File.swift 3 | // 4 | // 5 | // Created by Alfian Losari on 27/03/23. 6 | // 7 | 8 | import Foundation 9 | 10 | /// Based on https://www.donnywals.com/why-your-atomic-property-wrapper-doesnt-work-for-collection-types/ 11 | 12 | public class AtomicDict: CustomDebugStringConvertible { 13 | private var dictStorage = [Key: Value]() 14 | 15 | private let queue = DispatchQueue(label: "com.swiftgptencoder.atomic", qos: .utility, attributes: .concurrent, 16 | autoreleaseFrequency: .inherit, target: .global()) 17 | 18 | public init() {} 19 | 20 | public subscript(key: Key) -> Value? 
{ 21 | get { queue.sync { dictStorage[key] }} 22 | set { queue.async(flags: .barrier) { [weak self] in self?.dictStorage[key] = newValue } } 23 | } 24 | 25 | public var debugDescription: String { 26 | return queue.sync { dictStorage.debugDescription } // read through the queue, like the subscript getter, so it cannot race with barrier writes 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Sources/GPTEncoder/Extensions.swift: -------------------------------------------------------------------------------- 1 | // 2 | // File.swift 3 | // 4 | // 5 | // Created by Alfian Losari on 27/03/23. 6 | // 7 | 8 | import Foundation 9 | 10 | extension String { 11 | 12 | var ord: Int? { 13 | guard !self.isEmpty else { return nil } 14 | return Int((self as NSString).character(at: 0)) 15 | } 16 | 17 | var fromCharCode: String? { 18 | guard let code = Int(self) else { return nil } 19 | return String(NSString(format: "%C", code)) 20 | } 21 | 22 | } 23 | 24 | 25 | extension Range where Bound == Int { 26 | 27 | var toArray: [Int] { map { $0 } } 28 | 29 | } 30 | 31 | -------------------------------------------------------------------------------- /Sources/GPTEncoder/GPTEncoder.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | #if os(Linux) 3 | let bundle = Bundle.module 4 | #else 5 | let bundle = GPTEncoderResources.resourceBundle 6 | #endif 7 | 8 | public final class GPTEncoder { 9 | 10 | public init() {} 11 | 12 | public private(set) var cache = AtomicDict() 13 | 14 | private let regex = try! NSRegularExpression(pattern: #"\'s|\'t|\'re|\'ve|\'m|\'ll|\'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"#, options: .anchorsMatchLines) 15 | 16 | private let bpe: String = { 17 | let fileURL = bundle.url(forResource: "vocab", withExtension: "bpe")! 18 | let bpe = try! Data(contentsOf: fileURL) 19 | return String(data: bpe, encoding: .utf8)! 20 | }() 21 | 22 | private let encoder: [String: Int] = { 23 | let fileURL = bundle.url(forResource: "encoder", withExtension: "json")! 
24 | let jsonEncoderData = try! Data(contentsOf: fileURL) 25 | return try! JSONSerialization.jsonObject(with: jsonEncoderData) as! [String: Int] 26 | }() 27 | 28 | 29 | private lazy var decoder: [Int: String] = { 30 | var decoder: [Int: String] = [:] 31 | self.encoder.forEach({ key, value in 32 | decoder[value] = key 33 | }) 34 | return decoder 35 | }() 36 | 37 | private lazy var bpeMerges: [[String]] = { 38 | let lines = self.bpe.split(separator: "\n").dropFirst() 39 | let bpeMerges = lines.map { (line) in 40 | String(line).split(separator: " ").map { String($0) } 41 | .filter { String($0).trimmingCharacters(in: .whitespacesAndNewlines).count > 0} 42 | } 43 | return bpeMerges 44 | }() 45 | 46 | private let byteEncoder: [Int: String] = { 47 | bytesToUnicode() 48 | }() 49 | 50 | private lazy var byteDecoder: [String: Int] = { 51 | var byteDecoder: [String: Int] = [:] 52 | self.byteEncoder.forEach({ key, value in 53 | byteDecoder[value] = key 54 | }) 55 | return byteDecoder 56 | }() 57 | 58 | private lazy var bpeRanks: [[String]: Int] = { 59 | dictZip(x: bpeMerges, y: range(start: 0, end: bpeMerges.count)) 60 | }() 61 | 62 | public func decode(tokens: [Int]) -> String { 63 | let text = tokens.compactMap { token in decoder[token]}.joined(separator: "") 64 | let arrays = text.map { String($0) }.compactMap { byteDecoder[String($0)]}.map { UInt8($0) } 65 | return decodeString(array: arrays, byteEncoder: byteEncoder) 66 | } 67 | 68 | public func encode(text: String) -> [Int] { 69 | var bpe_tokens = [Int]() 70 | let matches = regex.matches(in: text, options: [], range: NSRange(text.startIndex..., in: text)) 71 | .compactMap { match in 72 | if let range = Range(match.range, in: text) { 73 | return String(text[range]) 74 | } else { 75 | return nil 76 | } 77 | } 78 | 79 | for match in matches { 80 | let token = encodeString(text: match).compactMap { byteEncoder[Int($0)] }.joined(separator: "") 81 | let bpe = bpe(token: token) 82 | let splits = bpe.split(separator: " ") 83 | 
let newTokens = splits.compactMap { self.encoder[String($0)]} 84 | bpe_tokens.append(contentsOf: newTokens) 85 | } 86 | return bpe_tokens 87 | } 88 | 89 | private func bpe(token: String) -> String { 90 | if let cachedToken = cache[token] { 91 | return cachedToken 92 | } 93 | var word = token.map { String($0) } 94 | var pairs = getPairs(words: word) 95 | 96 | if (pairs.isEmpty) { 97 | return token 98 | } 99 | 100 | while true { 101 | var minPairs: [Int: [String]] = [:] 102 | pairs.forEach{ pair in 103 | if let rank = bpeRanks[pair], !Double(rank).isNaN { // merges are ordered pairs: never fall back to the reversed pair's rank 104 | minPairs[rank] = pair 105 | } else { 106 | minPairs[10_000_000] = pair 107 | } 108 | } 109 | 110 | let min = minPairs.map { v in v.key }.min() ?? -1 111 | let bigram = minPairs[min] ?? [] 112 | 113 | if bpeRanks[bigram] == nil { 114 | break 115 | } 116 | 117 | let first = bigram[0] 118 | let second = bigram[1] 119 | var newWord = [String]() 120 | var i = 0 121 | 122 | while i < word.count { 123 | guard let j = word.dropFirst(i).firstIndex(of: first) else { 124 | newWord.append(contentsOf: Array(word.suffix(from: i))) 125 | break 126 | } 127 | newWord.append(contentsOf: Array(word.prefix(j).suffix(from: i))) 128 | i = Int(j) 129 | 130 | if (word[i] == first && i < word.count - 1 && word[i + 1] == second) { 131 | newWord.append(first + second) 132 | i += 2 133 | } else { 134 | newWord.append(word[i]) 135 | i += 1 136 | } 137 | } 138 | 139 | word = newWord 140 | if word.count == 1 { 141 | break 142 | } else { 143 | pairs = getPairs(words: word) 144 | } 145 | } 146 | 147 | let finalWord = word.joined(separator: " ") 148 | cache[token] = finalWord 149 | return finalWord 150 | } 151 | 152 | public func clearCache() { 153 | self.cache = AtomicDict() 154 | } 155 | 156 | } 157 | -------------------------------------------------------------------------------- /Sources/GPTEncoder/GPTEncoderResources.swift: -------------------------------------------------------------------------------- 1 | 
import Foundation 2 | 3 | public final class GPTEncoderResources { 4 | public static let resourceBundle: Bundle = { 5 | let candidates = [ 6 | // Bundle should be present here when the package is linked into an App. 7 | Bundle.main.resourceURL, 8 | 9 | // Bundle should be present here when the package is linked into a framework. 10 | Bundle(for: GPTEncoderResources.self).resourceURL, 11 | ] 12 | 13 | let bundleName = "GPTEncoder_GPTEncoder" 14 | 15 | for candidate in candidates { 16 | let bundlePath = candidate?.appendingPathComponent(bundleName + ".bundle") 17 | if let bundle = bundlePath.flatMap(Bundle.init(url:)) { 18 | return bundle 19 | } 20 | } 21 | 22 | // Return whatever bundle this code is in as a last resort. 23 | return Bundle(for: GPTEncoderResources.self) 24 | }() 25 | } 26 | -------------------------------------------------------------------------------- /Sources/GPTEncoder/Helper.swift: -------------------------------------------------------------------------------- 1 | // 2 | // File.swift 3 | // 4 | // 5 | // Created by Alfian Losari on 27/03/23. 6 | // 7 | 8 | import Foundation 9 | 10 | func range(start: Int, end: Int) -> [Int] { 11 | (start.. [[String]: Int] { 15 | var result = [[String]: Int]() 16 | x.enumerated().forEach{ (i, e) in 17 | result[x[i]] = y[i] 18 | 19 | } 20 | return result 21 | 22 | } 23 | 24 | func bytesToUnicode() -> [Int: String] { 25 | var bs = range(start: "!".ord!, end: "~".ord! + 1) 26 | + range(start: "¡".ord!, end: "¬".ord! + 1) 27 | + range(start: "®".ord!, end: "ÿ".ord! + 1) 28 | var cs = bs 29 | var n = 0 30 | var b = 0 31 | while (b < Int(pow(2.0, 8))) { 32 | if bs.firstIndex(of: b) == nil { 33 | bs.append(b) 34 | cs.append(Int(pow(2.0, 8)) + n) 35 | n += 1 36 | } 37 | b += 1 38 | } 39 | 40 | let charCodes = cs.map { "\($0)".fromCharCode! 
} 41 | var result = [Int: String]() 42 | bs.enumerated().forEach { i, _ in 43 | result[bs[i]] = charCodes[i] 44 | } 45 | return result 46 | } 47 | 48 | 49 | func getPairs(words: [String]) -> Set<[String]> { 50 | var pairs = Set<[String]>() 51 | var prevChar = words[0] 52 | (1.. [UInt8] { 61 | return Array(text.utf8) 62 | } 63 | 64 | func decodeString(array: [UInt8], byteEncoder: [Int: String]) -> String { 65 | String(decoding: array, as: UTF8.self) // bytes are UTF-8 (encodeString uses text.utf8); invalid sequences become U+FFFD instead of failing 66 | } 67 | -------------------------------------------------------------------------------- /Tests/GPTEncoderTests/GPTEncoderTests.swift: -------------------------------------------------------------------------------- 1 | import XCTest 2 | @testable import GPTEncoder 3 | 4 | @available(macOS 13.0, *) 5 | final class GPTEncoderTests: XCTestCase { 6 | func testEncodeAndDecode() throws { 7 | let encoder = GPTEncoder() 8 | 9 | let str = "這個算法真的太棒了" 10 | let encoded = encoder.encode(text: str) 11 | print("String: \(str)") 12 | print("Encoded this string looks like: \(encoded)") 13 | print("Total number of token(s): \(encoded.count) and character(s): \(str.count)") 14 | print("We can look at each token and what it represents") 15 | encoded.forEach { print("Token: \(encoder.decode(tokens: [$0]))") } 16 | 17 | print(encoded) 18 | let decoded = encoder.decode(tokens: encoded) 19 | print("We can decode it back into:\n\(decoded)") 20 | XCTAssertEqual(decoded, str) 21 | } 22 | } 23 | --------------------------------------------------------------------------------