├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.md ├── SampleApp ├── SampleApp.xcodeproj │ ├── project.pbxproj │ ├── project.xcworkspace │ │ ├── contents.xcworkspacedata │ │ └── xcshareddata │ │ │ └── IDEWorkspaceChecks.plist │ └── xcuserdata │ │ └── y.xcuserdatad │ │ └── xcschemes │ │ └── xcschememanagement.plist └── SampleApp │ ├── AppDelegate.h │ ├── AppDelegate.m │ ├── Assets.xcassets │ ├── AccentColor.colorset │ │ └── Contents.json │ ├── AppIcon.appiconset │ │ └── Contents.json │ └── Contents.json │ ├── Base.lproj │ ├── LaunchScreen.storyboard │ └── Main.storyboard │ ├── Info.plist │ ├── SceneDelegate.h │ ├── SceneDelegate.m │ ├── ViewController.h │ ├── ViewController.m │ ├── main.m │ ├── merges.txt │ ├── use_re2.cpp │ └── vocab.txt ├── build-re2.sh ├── build.sh ├── doc ├── 0.md ├── 1.md └── u.md ├── tokenizer ├── assets │ ├── README.md │ ├── merges.txt │ ├── vocab.json │ └── vocab.txt ├── bpe.cc ├── bpe.h ├── bpe.py └── bpe_test.cc └── tool ├── cmp.py ├── data.py └── json-to-txt.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | build-*/ 3 | .DS_Store 4 | build/ 5 | *xcuserdata* 6 | __pycache__ 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "re2"] 2 | path = re2 3 | url = https://github.com/google/re2 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(tokenizer CXX) 4 | include(CMakePackageConfigHelpers) 5 | include(CTest) 6 | include(GNUInstallDirs) 7 | 8 | # ABI version 9 | # http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html 10 | set(SONAME 10) 11 | 12 | add_subdirectory(re2) 13 | 14 | add_library(tokenizer tokenizer/bpe.cc tokenizer/bpe.h 
${RE2_SOURCES}) 15 | target_compile_features(tokenizer PUBLIC cxx_std_11) 16 | target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/re2) 17 | set_target_properties(tokenizer PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0) 18 | 19 | if(APPLE) 20 | set_target_properties(tokenizer PROPERTIES 21 | FRAMEWORK TRUE 22 | FRAMEWORK_VERSION A 23 | MACOSX_FRAMEWORK_IDENTIFIER wang.yi.tokenizer 24 | PUBLIC_HEADER "tokenizer/bpe.h" 25 | ) 26 | endif() 27 | 28 | add_executable(bpe_test tokenizer/bpe_test.cc) 29 | set_target_properties(bpe_test 30 | PROPERTIES 31 | RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin" 32 | ) 33 | target_link_libraries(bpe_test PRIVATE tokenizer re2::re2) 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository is a C++ version of the Python HuggingFace tokenizers. 2 | 3 | In the HuggingFace Transformers repo, tokenization is done with 104,603 lines of Python code. It takes 5506 lines for GPT2-specific BPE. I went through the code using the Python Debugger (pdb). It turns out that most of them do nothing but virtual methods in a complicated class hierarchy. I took 120 lines of Python code and put them in the file [`bpe.py`](tokenizer/bpe.py). This program is not a weaker or simpler version; it is the full HuggingFace BPE tokenizer. 4 | 5 | With this extraction, I can port it to C++: [`bpe.h`](tokenizer/bpe.h), [`bpe.cc`](tokenizer/bpe.cc), and [`bpe_test.cc`](tokenizer/bpe_test.cc). 6 | 7 | The following tech notes might help you understand the Python and C++ code: 8 | 9 | 1. [Unicode in C++ and Python](doc/u.md) 10 | 1. [Understanding HuggingFace Tokenizers](doc/0.md) 11 | 1. 
[Unicode-enabled Regular Expression in C++](doc/1.md) 12 | 13 | To run the vanilla HuggingFace GPT-2's BPE tokenizer, follow [this link](https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Tokenizer). 14 | 15 | To run the HuggingFace vanilla BPE tokenizer for GPT2, run the following commands: 16 | 17 | ```bash 18 | pip install transformers 19 | python tool/t.py 20 | ``` 21 | 22 | Please run the following commands to run the extracted 120-line Python tokenizer: 23 | 24 | ```bash 25 | python tokenizer/bpe.py 26 | ``` 27 | 28 | To build the C++ port, you will need CMake. 29 | 30 | ``` 31 | cmake -B /tmp/b -S . 32 | cmake --build /tmp/b 33 | ``` 34 | 35 | Then you can run the unit test. 36 | 37 | ```bash 38 | /tmp/b/bin/bpe_test 39 | ``` 40 | 41 | All three of the above programs load the same `vocab.json` and `merges.txt` files, so they all work exactly like HuggingFace did when it trained and served the GPT2 model. 42 | 43 | ## Known Issue 44 | 45 | RE2 can match Unicode letters from all languages using rules like `\p{L}`. But it doesn't work with look-ahead syntax like `(?!...)`. This would make the C++ version act a little differently than the Python versions when there is more than one space between two words. Please tell me about any C++ regular expression libraries that can handle both Unicode and look-ahead. 46 | 47 | This does not seem a big issue because among the 15,231,221 lines in the [lyrics text dataset](https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres) of 54 languages, the C++ tokenizer generates different output from the HuggingFace one for only 4 lines. And the reasons are all due to two successive spaces. For details, please see https://github.com/wangkuiyi/huggingface-tokenizer-in-cxx/issues/10. 
48 | -------------------------------------------------------------------------------- /SampleApp/SampleApp.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 56; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | 187AFC7E2999E7F7007E357F /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC7D2999E7F7007E357F /* AppDelegate.m */; }; 11 | 187AFC812999E7F7007E357F /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC802999E7F7007E357F /* SceneDelegate.m */; }; 12 | 187AFC842999E7F7007E357F /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC832999E7F7007E357F /* ViewController.m */; }; 13 | 187AFC872999E7F7007E357F /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC852999E7F7007E357F /* Main.storyboard */; }; 14 | 187AFC892999E7F8007E357F /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC882999E7F8007E357F /* Assets.xcassets */; }; 15 | 187AFC8C2999E7F8007E357F /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC8A2999E7F8007E357F /* LaunchScreen.storyboard */; }; 16 | 187AFC8F2999E7F8007E357F /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC8E2999E7F8007E357F /* main.m */; }; 17 | 187AFC962999E822007E357F /* re2.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 187AFC952999E822007E357F /* re2.xcframework */; }; 18 | 187AFC982999E947007E357F /* use_re2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC972999E947007E357F /* use_re2.cpp */; }; 19 | 187AFC9A2999F069007E357F /* tokenizer.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 187AFC992999F069007E357F /* tokenizer.xcframework */; }; 20 | 187AFC9C2999F2ED007E357F /* merges.txt in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC9B2999F2ED007E357F /* merges.txt */; }; 21 | 
187AFC9E2999F2F7007E357F /* vocab.txt in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC9D2999F2F7007E357F /* vocab.txt */; }; 22 | /* End PBXBuildFile section */ 23 | 24 | /* Begin PBXFileReference section */ 25 | 187AFC792999E7F7007E357F /* SampleApp.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = SampleApp.app; sourceTree = BUILT_PRODUCTS_DIR; }; 26 | 187AFC7C2999E7F7007E357F /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; 27 | 187AFC7D2999E7F7007E357F /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; 28 | 187AFC7F2999E7F7007E357F /* SceneDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = SceneDelegate.h; sourceTree = ""; }; 29 | 187AFC802999E7F7007E357F /* SceneDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = SceneDelegate.m; sourceTree = ""; }; 30 | 187AFC822999E7F7007E357F /* ViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ViewController.h; sourceTree = ""; }; 31 | 187AFC832999E7F7007E357F /* ViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ViewController.m; sourceTree = ""; }; 32 | 187AFC862999E7F7007E357F /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = ""; }; 33 | 187AFC882999E7F8007E357F /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; 34 | 187AFC8B2999E7F8007E357F /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; 35 | 187AFC8D2999E7F8007E357F /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; 
path = Info.plist; sourceTree = ""; }; 36 | 187AFC8E2999E7F8007E357F /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = ""; }; 37 | 187AFC952999E822007E357F /* re2.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = re2.xcframework; path = ../../build/re2/re2.xcframework; sourceTree = ""; }; 38 | 187AFC972999E947007E357F /* use_re2.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = use_re2.cpp; sourceTree = ""; }; 39 | 187AFC992999F069007E357F /* tokenizer.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = tokenizer.xcframework; path = ../../build/tokenizer.xcframework; sourceTree = ""; }; 40 | 187AFC9B2999F2ED007E357F /* merges.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = merges.txt; sourceTree = ""; }; 41 | 187AFC9D2999F2F7007E357F /* vocab.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = vocab.txt; sourceTree = ""; }; 42 | /* End PBXFileReference section */ 43 | 44 | /* Begin PBXFrameworksBuildPhase section */ 45 | 187AFC762999E7F7007E357F /* Frameworks */ = { 46 | isa = PBXFrameworksBuildPhase; 47 | buildActionMask = 2147483647; 48 | files = ( 49 | 187AFC962999E822007E357F /* re2.xcframework in Frameworks */, 50 | 187AFC9A2999F069007E357F /* tokenizer.xcframework in Frameworks */, 51 | ); 52 | runOnlyForDeploymentPostprocessing = 0; 53 | }; 54 | /* End PBXFrameworksBuildPhase section */ 55 | 56 | /* Begin PBXGroup section */ 57 | 187AFC702999E7F7007E357F = { 58 | isa = PBXGroup; 59 | children = ( 60 | 187AFC7B2999E7F7007E357F /* SampleApp */, 61 | 187AFC7A2999E7F7007E357F /* Products */, 62 | ); 63 | sourceTree = ""; 64 | }; 65 | 187AFC7A2999E7F7007E357F /* Products */ = { 66 | isa = PBXGroup; 67 | children = ( 68 | 187AFC792999E7F7007E357F /* SampleApp.app */, 69 | ); 70 | name = Products; 71 | sourceTree = ""; 72 | }; 73 | 
187AFC7B2999E7F7007E357F /* SampleApp */ = { 74 | isa = PBXGroup; 75 | children = ( 76 | 187AFC7C2999E7F7007E357F /* AppDelegate.h */, 77 | 187AFC7D2999E7F7007E357F /* AppDelegate.m */, 78 | 187AFC7F2999E7F7007E357F /* SceneDelegate.h */, 79 | 187AFC802999E7F7007E357F /* SceneDelegate.m */, 80 | 187AFC822999E7F7007E357F /* ViewController.h */, 81 | 187AFC832999E7F7007E357F /* ViewController.m */, 82 | 187AFC972999E947007E357F /* use_re2.cpp */, 83 | 187AFC852999E7F7007E357F /* Main.storyboard */, 84 | 187AFC882999E7F8007E357F /* Assets.xcassets */, 85 | 187AFC8A2999E7F8007E357F /* LaunchScreen.storyboard */, 86 | 187AFC8D2999E7F8007E357F /* Info.plist */, 87 | 187AFC8E2999E7F8007E357F /* main.m */, 88 | 187AFC952999E822007E357F /* re2.xcframework */, 89 | 187AFC992999F069007E357F /* tokenizer.xcframework */, 90 | 187AFC9B2999F2ED007E357F /* merges.txt */, 91 | 187AFC9D2999F2F7007E357F /* vocab.txt */, 92 | ); 93 | path = SampleApp; 94 | sourceTree = ""; 95 | }; 96 | /* End PBXGroup section */ 97 | 98 | /* Begin PBXNativeTarget section */ 99 | 187AFC782999E7F7007E357F /* SampleApp */ = { 100 | isa = PBXNativeTarget; 101 | buildConfigurationList = 187AFC922999E7F8007E357F /* Build configuration list for PBXNativeTarget "SampleApp" */; 102 | buildPhases = ( 103 | 187AFC752999E7F7007E357F /* Sources */, 104 | 187AFC762999E7F7007E357F /* Frameworks */, 105 | 187AFC772999E7F7007E357F /* Resources */, 106 | ); 107 | buildRules = ( 108 | ); 109 | dependencies = ( 110 | ); 111 | name = SampleApp; 112 | productName = SampleApp; 113 | productReference = 187AFC792999E7F7007E357F /* SampleApp.app */; 114 | productType = "com.apple.product-type.application"; 115 | }; 116 | /* End PBXNativeTarget section */ 117 | 118 | /* Begin PBXProject section */ 119 | 187AFC712999E7F7007E357F /* Project object */ = { 120 | isa = PBXProject; 121 | attributes = { 122 | BuildIndependentTargetsInParallel = 1; 123 | LastUpgradeCheck = 1420; 124 | TargetAttributes = { 125 | 187AFC782999E7F7007E357F 
= { 126 | CreatedOnToolsVersion = 14.2; 127 | }; 128 | }; 129 | }; 130 | buildConfigurationList = 187AFC742999E7F7007E357F /* Build configuration list for PBXProject "SampleApp" */; 131 | compatibilityVersion = "Xcode 14.0"; 132 | developmentRegion = en; 133 | hasScannedForEncodings = 0; 134 | knownRegions = ( 135 | en, 136 | Base, 137 | ); 138 | mainGroup = 187AFC702999E7F7007E357F; 139 | productRefGroup = 187AFC7A2999E7F7007E357F /* Products */; 140 | projectDirPath = ""; 141 | projectRoot = ""; 142 | targets = ( 143 | 187AFC782999E7F7007E357F /* SampleApp */, 144 | ); 145 | }; 146 | /* End PBXProject section */ 147 | 148 | /* Begin PBXResourcesBuildPhase section */ 149 | 187AFC772999E7F7007E357F /* Resources */ = { 150 | isa = PBXResourcesBuildPhase; 151 | buildActionMask = 2147483647; 152 | files = ( 153 | 187AFC9E2999F2F7007E357F /* vocab.txt in Resources */, 154 | 187AFC8C2999E7F8007E357F /* LaunchScreen.storyboard in Resources */, 155 | 187AFC892999E7F8007E357F /* Assets.xcassets in Resources */, 156 | 187AFC872999E7F7007E357F /* Main.storyboard in Resources */, 157 | 187AFC9C2999F2ED007E357F /* merges.txt in Resources */, 158 | ); 159 | runOnlyForDeploymentPostprocessing = 0; 160 | }; 161 | /* End PBXResourcesBuildPhase section */ 162 | 163 | /* Begin PBXSourcesBuildPhase section */ 164 | 187AFC752999E7F7007E357F /* Sources */ = { 165 | isa = PBXSourcesBuildPhase; 166 | buildActionMask = 2147483647; 167 | files = ( 168 | 187AFC982999E947007E357F /* use_re2.cpp in Sources */, 169 | 187AFC842999E7F7007E357F /* ViewController.m in Sources */, 170 | 187AFC7E2999E7F7007E357F /* AppDelegate.m in Sources */, 171 | 187AFC8F2999E7F8007E357F /* main.m in Sources */, 172 | 187AFC812999E7F7007E357F /* SceneDelegate.m in Sources */, 173 | ); 174 | runOnlyForDeploymentPostprocessing = 0; 175 | }; 176 | /* End PBXSourcesBuildPhase section */ 177 | 178 | /* Begin PBXVariantGroup section */ 179 | 187AFC852999E7F7007E357F /* Main.storyboard */ = { 180 | isa = 
PBXVariantGroup; 181 | children = ( 182 | 187AFC862999E7F7007E357F /* Base */, 183 | ); 184 | name = Main.storyboard; 185 | sourceTree = ""; 186 | }; 187 | 187AFC8A2999E7F8007E357F /* LaunchScreen.storyboard */ = { 188 | isa = PBXVariantGroup; 189 | children = ( 190 | 187AFC8B2999E7F8007E357F /* Base */, 191 | ); 192 | name = LaunchScreen.storyboard; 193 | sourceTree = ""; 194 | }; 195 | /* End PBXVariantGroup section */ 196 | 197 | /* Begin XCBuildConfiguration section */ 198 | 187AFC902999E7F8007E357F /* Debug */ = { 199 | isa = XCBuildConfiguration; 200 | buildSettings = { 201 | ALWAYS_SEARCH_USER_PATHS = NO; 202 | CLANG_ANALYZER_NONNULL = YES; 203 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 204 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; 205 | CLANG_ENABLE_MODULES = YES; 206 | CLANG_ENABLE_OBJC_ARC = YES; 207 | CLANG_ENABLE_OBJC_WEAK = YES; 208 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 209 | CLANG_WARN_BOOL_CONVERSION = YES; 210 | CLANG_WARN_COMMA = YES; 211 | CLANG_WARN_CONSTANT_CONVERSION = YES; 212 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 213 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 214 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 215 | CLANG_WARN_EMPTY_BODY = YES; 216 | CLANG_WARN_ENUM_CONVERSION = YES; 217 | CLANG_WARN_INFINITE_RECURSION = YES; 218 | CLANG_WARN_INT_CONVERSION = YES; 219 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 220 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 221 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 222 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 223 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 224 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 225 | CLANG_WARN_STRICT_PROTOTYPES = YES; 226 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 227 | CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 228 | CLANG_WARN_UNREACHABLE_CODE = YES; 229 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 230 | COPY_PHASE_STRIP = NO; 231 | DEBUG_INFORMATION_FORMAT = dwarf; 232 | ENABLE_STRICT_OBJC_MSGSEND = YES; 233 | 
ENABLE_TESTABILITY = YES; 234 | GCC_C_LANGUAGE_STANDARD = gnu11; 235 | GCC_DYNAMIC_NO_PIC = NO; 236 | GCC_NO_COMMON_BLOCKS = YES; 237 | GCC_OPTIMIZATION_LEVEL = 0; 238 | GCC_PREPROCESSOR_DEFINITIONS = ( 239 | "DEBUG=1", 240 | "$(inherited)", 241 | ); 242 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 243 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 244 | GCC_WARN_UNDECLARED_SELECTOR = YES; 245 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 246 | GCC_WARN_UNUSED_FUNCTION = YES; 247 | GCC_WARN_UNUSED_VARIABLE = YES; 248 | IPHONEOS_DEPLOYMENT_TARGET = 16.2; 249 | MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; 250 | MTL_FAST_MATH = YES; 251 | ONLY_ACTIVE_ARCH = YES; 252 | SDKROOT = iphoneos; 253 | }; 254 | name = Debug; 255 | }; 256 | 187AFC912999E7F8007E357F /* Release */ = { 257 | isa = XCBuildConfiguration; 258 | buildSettings = { 259 | ALWAYS_SEARCH_USER_PATHS = NO; 260 | CLANG_ANALYZER_NONNULL = YES; 261 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 262 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; 263 | CLANG_ENABLE_MODULES = YES; 264 | CLANG_ENABLE_OBJC_ARC = YES; 265 | CLANG_ENABLE_OBJC_WEAK = YES; 266 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 267 | CLANG_WARN_BOOL_CONVERSION = YES; 268 | CLANG_WARN_COMMA = YES; 269 | CLANG_WARN_CONSTANT_CONVERSION = YES; 270 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 271 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 272 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 273 | CLANG_WARN_EMPTY_BODY = YES; 274 | CLANG_WARN_ENUM_CONVERSION = YES; 275 | CLANG_WARN_INFINITE_RECURSION = YES; 276 | CLANG_WARN_INT_CONVERSION = YES; 277 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 278 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 279 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 280 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 281 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 282 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 283 | CLANG_WARN_STRICT_PROTOTYPES = YES; 284 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 285 | 
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 286 | CLANG_WARN_UNREACHABLE_CODE = YES; 287 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 288 | COPY_PHASE_STRIP = NO; 289 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 290 | ENABLE_NS_ASSERTIONS = NO; 291 | ENABLE_STRICT_OBJC_MSGSEND = YES; 292 | GCC_C_LANGUAGE_STANDARD = gnu11; 293 | GCC_NO_COMMON_BLOCKS = YES; 294 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 295 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 296 | GCC_WARN_UNDECLARED_SELECTOR = YES; 297 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 298 | GCC_WARN_UNUSED_FUNCTION = YES; 299 | GCC_WARN_UNUSED_VARIABLE = YES; 300 | IPHONEOS_DEPLOYMENT_TARGET = 16.2; 301 | MTL_ENABLE_DEBUG_INFO = NO; 302 | MTL_FAST_MATH = YES; 303 | SDKROOT = iphoneos; 304 | VALIDATE_PRODUCT = YES; 305 | }; 306 | name = Release; 307 | }; 308 | 187AFC932999E7F8007E357F /* Debug */ = { 309 | isa = XCBuildConfiguration; 310 | buildSettings = { 311 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 312 | ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 313 | CODE_SIGN_STYLE = Automatic; 314 | CURRENT_PROJECT_VERSION = 1; 315 | DEVELOPMENT_TEAM = ET66P5RYW9; 316 | GENERATE_INFOPLIST_FILE = YES; 317 | INFOPLIST_FILE = SampleApp/Info.plist; 318 | INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 319 | INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; 320 | INFOPLIST_KEY_UIMainStoryboardFile = Main; 321 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 322 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 323 | LD_RUNPATH_SEARCH_PATHS = ( 324 | "$(inherited)", 325 | "@executable_path/Frameworks", 326 | ); 327 | MARKETING_VERSION = 1.0; 328 | PRODUCT_BUNDLE_IDENTIFIER = org.wyi.SampleApp; 329 | 
PRODUCT_NAME = "$(TARGET_NAME)"; 330 | SWIFT_EMIT_LOC_STRINGS = YES; 331 | TARGETED_DEVICE_FAMILY = "1,2"; 332 | }; 333 | name = Debug; 334 | }; 335 | 187AFC942999E7F8007E357F /* Release */ = { 336 | isa = XCBuildConfiguration; 337 | buildSettings = { 338 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 339 | ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 340 | CODE_SIGN_STYLE = Automatic; 341 | CURRENT_PROJECT_VERSION = 1; 342 | DEVELOPMENT_TEAM = ET66P5RYW9; 343 | GENERATE_INFOPLIST_FILE = YES; 344 | INFOPLIST_FILE = SampleApp/Info.plist; 345 | INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 346 | INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; 347 | INFOPLIST_KEY_UIMainStoryboardFile = Main; 348 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 349 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 350 | LD_RUNPATH_SEARCH_PATHS = ( 351 | "$(inherited)", 352 | "@executable_path/Frameworks", 353 | ); 354 | MARKETING_VERSION = 1.0; 355 | PRODUCT_BUNDLE_IDENTIFIER = org.wyi.SampleApp; 356 | PRODUCT_NAME = "$(TARGET_NAME)"; 357 | SWIFT_EMIT_LOC_STRINGS = YES; 358 | TARGETED_DEVICE_FAMILY = "1,2"; 359 | }; 360 | name = Release; 361 | }; 362 | /* End XCBuildConfiguration section */ 363 | 364 | /* Begin XCConfigurationList section */ 365 | 187AFC742999E7F7007E357F /* Build configuration list for PBXProject "SampleApp" */ = { 366 | isa = XCConfigurationList; 367 | buildConfigurations = ( 368 | 187AFC902999E7F8007E357F /* Debug */, 369 | 187AFC912999E7F8007E357F /* Release */, 370 | ); 371 | defaultConfigurationIsVisible = 0; 372 | defaultConfigurationName = Release; 373 | }; 374 | 187AFC922999E7F8007E357F /* Build configuration list for PBXNativeTarget "SampleApp" */ = { 375 
| isa = XCConfigurationList; 376 | buildConfigurations = ( 377 | 187AFC932999E7F8007E357F /* Debug */, 378 | 187AFC942999E7F8007E357F /* Release */, 379 | ); 380 | defaultConfigurationIsVisible = 0; 381 | defaultConfigurationName = Release; 382 | }; 383 | /* End XCConfigurationList section */ 384 | }; 385 | rootObject = 187AFC712999E7F7007E357F /* Project object */; 386 | } 387 | -------------------------------------------------------------------------------- /SampleApp/SampleApp.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /SampleApp/SampleApp.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | IDEDidComputeMac32BitWarning 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /SampleApp/SampleApp.xcodeproj/xcuserdata/y.xcuserdatad/xcschemes/xcschememanagement.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | SchemeUserState 6 | 7 | SampleApp.xcscheme_^#shared#^_ 8 | 9 | orderHint 10 | 0 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/AppDelegate.h: -------------------------------------------------------------------------------- 1 | // 2 | // AppDelegate.h 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 
6 | // 7 | 8 | #import 9 | 10 | @interface AppDelegate : UIResponder 11 | 12 | 13 | @end 14 | 15 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/AppDelegate.m: -------------------------------------------------------------------------------- 1 | // 2 | // AppDelegate.m 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import "AppDelegate.h" 9 | 10 | @interface AppDelegate () 11 | 12 | @end 13 | 14 | @implementation AppDelegate 15 | 16 | 17 | - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions { 18 | // Override point for customization after application launch. 19 | return YES; 20 | } 21 | 22 | 23 | #pragma mark - UISceneSession lifecycle 24 | 25 | 26 | - (UISceneConfiguration *)application:(UIApplication *)application configurationForConnectingSceneSession:(UISceneSession *)connectingSceneSession options:(UISceneConnectionOptions *)options { 27 | // Called when a new scene session is being created. 28 | // Use this method to select a configuration to create the new scene with. 29 | return [[UISceneConfiguration alloc] initWithName:@"Default Configuration" sessionRole:connectingSceneSession.role]; 30 | } 31 | 32 | 33 | - (void)application:(UIApplication *)application didDiscardSceneSessions:(NSSet *)sceneSessions { 34 | // Called when the user discards a scene session. 35 | // If any sessions were discarded while the application was not running, this will be called shortly after application:didFinishLaunchingWithOptions. 36 | // Use this method to release any resources that were specific to the discarded scenes, as they will not return. 
37 | } 38 | 39 | 40 | @end 41 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Assets.xcassets/AccentColor.colorset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "colors" : [ 3 | { 4 | "idiom" : "universal" 5 | } 6 | ], 7 | "info" : { 8 | "author" : "xcode", 9 | "version" : 1 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Assets.xcassets/AppIcon.appiconset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "images" : [ 3 | { 4 | "idiom" : "universal", 5 | "platform" : "ios", 6 | "size" : "1024x1024" 7 | } 8 | ], 9 | "info" : { 10 | "author" : "xcode", 11 | "version" : 1 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Base.lproj/LaunchScreen.storyboard: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Base.lproj/Main.storyboard: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Info.plist: -------------------------------------------------------------------------------- 1 | 
2 | 3 | 4 | 5 | UIApplicationSceneManifest 6 | 7 | UIApplicationSupportsMultipleScenes 8 | 9 | UISceneConfigurations 10 | 11 | UIWindowSceneSessionRoleApplication 12 | 13 | 14 | UISceneConfigurationName 15 | Default Configuration 16 | UISceneDelegateClassName 17 | SceneDelegate 18 | UISceneStoryboardFile 19 | Main 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/SceneDelegate.h: -------------------------------------------------------------------------------- 1 | // 2 | // SceneDelegate.h 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import 9 | 10 | @interface SceneDelegate : UIResponder 11 | 12 | @property (strong, nonatomic) UIWindow * window; 13 | 14 | @end 15 | 16 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/SceneDelegate.m: -------------------------------------------------------------------------------- 1 | // 2 | // SceneDelegate.m 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import "SceneDelegate.h" 9 | 10 | @interface SceneDelegate () 11 | 12 | @end 13 | 14 | @implementation SceneDelegate 15 | 16 | 17 | - (void)scene:(UIScene *)scene willConnectToSession:(UISceneSession *)session options:(UISceneConnectionOptions *)connectionOptions { 18 | // Use this method to optionally configure and attach the UIWindow `window` to the provided UIWindowScene `scene`. 19 | // If using a storyboard, the `window` property will automatically be initialized and attached to the scene. 20 | // This delegate does not imply the connecting scene or session are new (see `application:configurationForConnectingSceneSession` instead). 21 | } 22 | 23 | 24 | - (void)sceneDidDisconnect:(UIScene *)scene { 25 | // Called as the scene is being released by the system. 26 | // This occurs shortly after the scene enters the background, or when its session is discarded. 
27 | // Release any resources associated with this scene that can be re-created the next time the scene connects. 28 | // The scene may re-connect later, as its session was not necessarily discarded (see `application:didDiscardSceneSessions` instead). 29 | } 30 | 31 | 32 | - (void)sceneDidBecomeActive:(UIScene *)scene { 33 | // Called when the scene has moved from an inactive state to an active state. 34 | // Use this method to restart any tasks that were paused (or not yet started) when the scene was inactive. 35 | } 36 | 37 | 38 | - (void)sceneWillResignActive:(UIScene *)scene { 39 | // Called when the scene will move from an active state to an inactive state. 40 | // This may occur due to temporary interruptions (ex. an incoming phone call). 41 | } 42 | 43 | 44 | - (void)sceneWillEnterForeground:(UIScene *)scene { 45 | // Called as the scene transitions from the background to the foreground. 46 | // Use this method to undo the changes made on entering the background. 47 | } 48 | 49 | 50 | - (void)sceneDidEnterBackground:(UIScene *)scene { 51 | // Called as the scene transitions from the foreground to the background. 52 | // Use this method to save data, release shared resources, and store enough scene-specific state information 53 | // to restore the scene back to its current state. 54 | } 55 | 56 | 57 | @end 58 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/ViewController.h: -------------------------------------------------------------------------------- 1 | // 2 | // ViewController.h 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 
6 | // 7 | 8 | #import 9 | 10 | @interface ViewController : UIViewController 11 | 12 | 13 | @end 14 | 15 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/ViewController.m: -------------------------------------------------------------------------------- 1 | // 2 | // ViewController.m 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import "ViewController.h" 9 | 10 | void test_tokenizer(const char* merges_path, const char* vocab_path); 11 | 12 | @interface ViewController () 13 | 14 | @end 15 | 16 | @implementation ViewController 17 | 18 | - (void)viewDidLoad { 19 | [super viewDidLoad]; 20 | 21 | NSString* merges = [[NSBundle mainBundle] pathForResource:@"merges" 22 | ofType:@"txt"]; 23 | NSString* vocab = [[NSBundle mainBundle] pathForResource:@"vocab" 24 | ofType:@"txt"]; 25 | test_tokenizer([merges UTF8String], [vocab UTF8String]); 26 | } 27 | 28 | 29 | @end 30 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/main.m: -------------------------------------------------------------------------------- 1 | // 2 | // main.m 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import 9 | #import "AppDelegate.h" 10 | 11 | int main(int argc, char * argv[]) { 12 | NSString * appDelegateClassName; 13 | @autoreleasepool { 14 | // Setup code that might create autoreleased objects goes here. 15 | appDelegateClassName = NSStringFromClass([AppDelegate class]); 16 | } 17 | return UIApplicationMain(argc, argv, nil, appDelegateClassName); 18 | } 19 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/use_re2.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // use_re2.cpp 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 
6 | // 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | extern "C" void test_tokenizer(const char* merges_path, 17 | const char* vocab_path) { 18 | RE2 re( 19 | "('s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| " 20 | "?[^\\s\\p{L}\\p{N}]+|\\s+\\(?!\\S\\)|\\s+)"); 21 | 22 | BPERanks bpe_ranks; 23 | std::fstream merges(merges_path, std::ios::in); 24 | load_merge_rules(merges, &bpe_ranks); 25 | 26 | std::unordered_map b2u; 27 | std::unordered_map u2b; 28 | bytes_to_unicode(&b2u, &u2b); 29 | 30 | std::unordered_map t2i; 31 | std::unordered_map i2t; 32 | std::fstream vocab_txt(vocab_path, std::ios::in); 33 | load_vocab(vocab_txt, &t2i, &i2t); 34 | 35 | std::vector candidates = { 36 | "this is <|endoftext|> else<|endoftext|>", 37 | "<|endoftext|> else<|endoftext|>", "this is <|endoftext|> else", 38 | "this is <|endoftext|>else", "this is else"}; 39 | for (auto s : candidates) { 40 | std::vector ids; 41 | encode(s, re, bpe_ranks, b2u, t2i, &ids); 42 | assert(ids.size() > 0); 43 | assert(decode(ids, u2b, i2t) == s); 44 | } 45 | std::cout << "Passed testing"; 46 | } 47 | -------------------------------------------------------------------------------- /build-re2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # exit when any command fails 4 | set -e 5 | 6 | # The -f option force the rebuild of IREE runtime frameworks. 
7 | while getopts hf flag; do 8 | case "${flag}" in 9 | f) FORCE_REBUILD="YES" ;; 10 | *) 11 | echo "$0 -h|-f" 12 | echo " -h : display this message" 13 | echo " -f : force rebuild runtime frameworks" 14 | exit 0 15 | ;; 16 | esac 17 | done 18 | 19 | SCRIPT_DIR=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")") 20 | SRC_DIR=$SCRIPT_DIR/re2 21 | BUILD_DIR=$SCRIPT_DIR/build/re2 22 | XCFRAMEWORK="$BUILD_DIR"/re2.xcframework 23 | 24 | 25 | if [[ $FORCE_REBUILD == "YES" ]]; then 26 | echo "┌------------------------------------------------------------------------------┐" 27 | echo " Deleting existig RE2 frameworks ..." 28 | echo "└------------------------------------------------------------------------------┘" 29 | rm -rf "$BUILD_DIR" 30 | fi 31 | 32 | function build_for_ios() { 33 | case $1 in 34 | sim) sdk=iphonesimulator ;; 35 | dev) sdk=iphoneos ;; 36 | *) 37 | echo "Unknown target $1 when calling build_for_ios" 38 | exit 5 39 | ;; 40 | esac 41 | 42 | arch=$2 43 | label=ios-$1-"$arch" 44 | build_dir="$BUILD_DIR"/"$label" 45 | 46 | test_file="$build_dir"/re2.framework/re2 47 | if test -f "$test_file" && lipo -info "$test_file"; then 48 | echo "Skip building iree.framework for $label." 49 | else 50 | echo "┌------------------------------------------------------------------------------┐" 51 | echo " Building for $label ..." 52 | echo " src: $SRC_DIR " 53 | echo " build: $build_dir " 54 | echo " build log: $build_dir/build.log" 55 | echo "└------------------------------------------------------------------------------┘" 56 | mkdir -p "$build_dir" # So to create the build.log file. 57 | cmake -S . 
\ 58 | -B "$build_dir" \ 59 | -DCMAKE_SYSTEM_NAME=iOS \ 60 | -DCMAKE_OSX_SYSROOT="$(xcodebuild -version -sdk $sdk Path)" \ 61 | -DCMAKE_OSX_ARCHITECTURES="$arch" \ 62 | -DCMAKE_SYSTEM_PROCESSOR="$arch" \ 63 | -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \ 64 | -DCMAKE_IOS_INSTALL_COMBINED=YES \ 65 | -DCMAKE_INSTALL_PREFIX="$build_dir"/install \ 66 | -DRE2_BUILD_TESTING=OFF \ 67 | >"$build_dir"/build.log 2>&1 68 | cmake --build "$build_dir" >>"$build_dir"/build.log 2>&1 69 | fi 70 | } 71 | 72 | function build_for_macos() { 73 | arch=$1 74 | label=macos-"$arch" 75 | build_dir="$BUILD_DIR"/"$label" 76 | 77 | test_file="$build_dir"/re2.framework/re2 78 | if test -f "$test_file" && lipo -info "$test_file"; then 79 | echo "Skip building iree.framework for $label." 80 | else 81 | echo "┌------------------------------------------------------------------------------┐" 82 | echo " Building for $label ..." 83 | echo " src: $SRC_DIR " 84 | echo " build: $build_dir " 85 | echo " build log: $build_dir/build.log" 86 | echo "└------------------------------------------------------------------------------┘" 87 | mkdir -p "$build_dir" # So to create the build.log file. 88 | cmake -S . \ 89 | -B "$build_dir" \ 90 | -DRE2_BUILD_TESTING=OFF \ 91 | -DCMAKE_OSX_ARCHITECTURES="$arch" >"$build_dir"/build.log 2>&1 92 | cmake --build "$build_dir" >"$build_dir"/build.log 2>&1 93 | fi 94 | } 95 | 96 | function merge_fat_static_library() { 97 | src_label=$2 98 | dst_label=$1 99 | 100 | src="$BUILD_DIR"/$src_label/re2/re2.framework/re2 101 | dst="$BUILD_DIR"/$dst_label/re2/re2.framework/re2 102 | 103 | if lipo -info "$dst" | grep 'Non-fat' >/dev/null; then 104 | echo "┌------------------------------------------------------------------------------┐" 105 | echo " Building FAT static library ..." 
106 | echo " merge: $src" 107 | echo " into: $dst" 108 | echo "└------------------------------------------------------------------------------┘" 109 | merged=/tmp/libmerged-"$src_label"-"$dst_label".a 110 | lipo "$src" "$dst" -create -output "$merged" 111 | mv "$merged" "$dst" 112 | fi 113 | } 114 | 115 | # Step 1. Build the following frameworks 116 | # 117 | # Note: We cannot build for dev-x86_64 because Apple does not offer 118 | # SDK for it. If we do so, CMake will prompt us about missing required 119 | # architecture x86_64 in file 120 | # /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk/usr/lib/libc++.tbd 121 | # build_for_ios dev x86_64 122 | # 123 | # This step also merge dependent static libraries into the target library. 124 | build_for_ios sim arm64 125 | build_for_ios sim x86_64 126 | build_for_ios dev arm64 127 | build_for_macos x86_64 128 | build_for_macos arm64 129 | 130 | # Step 2. Merge the frameworks of the same OS platform 131 | # ios-simulator-arm64+x86_64 132 | # macos-arm64+x86_64 133 | merge_fat_static_library ios-sim-arm64 ios-sim-x86_64 134 | merge_fat_static_library macos-arm64 macos-x86_64 135 | 136 | # Step 3. Merge the above frameworks into an XCFramework 137 | echo "┌------------------------------------------------------------------------------┐" 138 | echo " Aggregating frameworks into an xcframework ..." 
139 | echo "└------------------------------------------------------------------------------┘" 140 | rm -rf "$XCFRAMEWORK" 141 | xcodebuild -create-xcframework \ 142 | -framework "$BUILD_DIR"/macos-arm64/re2/re2.framework \ 143 | -framework "$BUILD_DIR"/ios-sim-arm64/re2/re2.framework \ 144 | -framework "$BUILD_DIR"/ios-dev-arm64/re2/re2.framework \ 145 | -output "$XCFRAMEWORK" 146 | tree -L 1 -d "$XCFRAMEWORK" 147 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # exit when any command fails 4 | set -e 5 | 6 | # The -f option force the rebuild of IREE runtime frameworks. 7 | while getopts hf flag; do 8 | case "${flag}" in 9 | f) FORCE_REBUILD="YES" ;; 10 | *) 11 | echo "$0 -h|-f" 12 | echo " -h : display this message" 13 | echo " -f : force rebuild runtime frameworks" 14 | exit 0 15 | ;; 16 | esac 17 | done 18 | 19 | SCRIPT_DIR=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")") 20 | SRC_DIR=$SCRIPT_DIR 21 | BUILD_DIR=$SCRIPT_DIR/build 22 | XCFRAMEWORK="$BUILD_DIR"/tokenizer.xcframework 23 | 24 | 25 | # Build the tokenizer into a framework for each target. 26 | if [[ $FORCE_REBUILD == "YES" ]]; then 27 | echo "┌------------------------------------------------------------------------------┐" 28 | echo " Deleting existig IREE runtime frameworks ..." 
29 | echo "└------------------------------------------------------------------------------┘" 30 | rm -rf "$IREE_BUILD_RUNTIME_DIR" 31 | fi 32 | 33 | function build_for_ios() { 34 | case $1 in 35 | sim) sdk=iphonesimulator ;; 36 | dev) sdk=iphoneos ;; 37 | *) 38 | echo "Unknown target $1 when calling build_for_ios" 39 | exit 5 40 | ;; 41 | esac 42 | 43 | arch=$2 44 | label=ios-$1-"$arch" 45 | build_dir="$BUILD_DIR"/"$label" 46 | 47 | test_file="$build_dir"/tokenizer.framework/tokenizer 48 | if test -f "$test_file" && lipo -info "$test_file"; then 49 | echo "Skip building iree.framework for $label." 50 | else 51 | echo "┌------------------------------------------------------------------------------┐" 52 | echo " Building for $label ..." 53 | echo " src: $SRC_DIR " 54 | echo " build: $build_dir " 55 | echo " build log: $build_dir/build.log" 56 | echo "└------------------------------------------------------------------------------┘" 57 | mkdir -p "$build_dir" # So to create the build.log file. 58 | cmake -S . \ 59 | -B "$build_dir" \ 60 | -GNinja \ 61 | -DCMAKE_SYSTEM_NAME=iOS \ 62 | -DCMAKE_OSX_SYSROOT="$(xcodebuild -version -sdk $sdk Path)" \ 63 | -DCMAKE_OSX_ARCHITECTURES="$arch" \ 64 | -DCMAKE_SYSTEM_PROCESSOR="$arch" \ 65 | -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \ 66 | -DCMAKE_IOS_INSTALL_COMBINED=YES \ 67 | -DCMAKE_INSTALL_PREFIX="$build_dir"/install \ 68 | >"$build_dir"/build.log 2>&1 69 | cmake --build "$build_dir" >>"$build_dir"/build.log 2>&1 70 | fi 71 | } 72 | 73 | function build_for_macos() { 74 | arch=$1 75 | label=macos-"$arch" 76 | build_dir="$BUILD_DIR"/"$label" 77 | 78 | test_file="$build_dir"/tokenizer.framework/tokenizer 79 | if test -f "$test_file" && lipo -info "$test_file"; then 80 | echo "Skip building iree.framework for $label." 81 | else 82 | echo "┌------------------------------------------------------------------------------┐" 83 | echo " Building for $label ..." 
84 | echo " src: $SRC_DIR " 85 | echo " build: $build_dir " 86 | echo " build log: $build_dir/build.log" 87 | echo "└------------------------------------------------------------------------------┘" 88 | mkdir -p "$build_dir" # So to create the build.log file. 89 | cmake -S . \ 90 | -B "$build_dir" \ 91 | -GNinja \ 92 | -DCMAKE_OSX_ARCHITECTURES="$arch" >"$build_dir"/build.log 2>&1 93 | cmake --build "$build_dir" >"$build_dir"/build.log 2>&1 94 | fi 95 | } 96 | 97 | function merge_fat_static_library() { 98 | src_label=$2 99 | dst_label=$1 100 | 101 | src="$BUILD_DIR"/$src_label/tokenizer.framework/tokenizer 102 | dst="$BUILD_DIR"/$dst_label/tokenizer.framework/tokenizer 103 | 104 | if lipo -info "$dst" | grep 'Non-fat' >/dev/null; then 105 | echo "┌------------------------------------------------------------------------------┐" 106 | echo " Building FAT static library ..." 107 | echo " merge: $src" 108 | echo " into: $dst" 109 | echo "└------------------------------------------------------------------------------┘" 110 | merged=/tmp/libmerged-"$src_label"-"$dst_label".a 111 | lipo "$src" "$dst" -create -output "$merged" 112 | mv "$merged" "$dst" 113 | fi 114 | } 115 | 116 | # Step 1. Build the following frameworks 117 | # 118 | # Note: We cannot build for dev-x86_64 because Apple does not offer 119 | # SDK for it. If we do so, CMake will prompt us about missing required 120 | # architecture x86_64 in file 121 | # /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk/usr/lib/libc++.tbd 122 | # build_for_ios dev x86_64 123 | # 124 | # This step also merge dependent static libraries into the target library. 125 | build_for_ios sim arm64 126 | build_for_ios sim x86_64 127 | build_for_ios dev arm64 128 | build_for_macos x86_64 129 | build_for_macos arm64 130 | 131 | # Step 2. 
Merge the frameworks of the same OS platform 132 | # ios-simulator-arm64+x86_64 133 | # macos-arm64+x86_64 134 | merge_fat_static_library ios-sim-arm64 ios-sim-x86_64 135 | merge_fat_static_library macos-arm64 macos-x86_64 136 | 137 | # Step 3. Merge the above frameworks into an XCFramework 138 | echo "┌------------------------------------------------------------------------------┐" 139 | echo " Aggregating frameworks into an xcframework ..." 140 | echo "└------------------------------------------------------------------------------┘" 141 | rm -rf "$XCFRAMEWORK" 142 | xcodebuild -create-xcframework \ 143 | -framework "$BUILD_DIR"/macos-arm64/tokenizer.framework \ 144 | -framework "$BUILD_DIR"/ios-sim-arm64/tokenizer.framework \ 145 | -framework "$BUILD_DIR"/ios-dev-arm64/tokenizer.framework \ 146 | -output "$XCFRAMEWORK" 147 | tree -L 1 -d "$XCFRAMEWORK" 148 | -------------------------------------------------------------------------------- /doc/0.md: -------------------------------------------------------------------------------- 1 | # LLM Tokenziers in C++ (0): Port from Python 2 | 3 | ## Why Tokenizer in C++ 4 | 5 | I am trying to run the HuggingFace GPT2 model on a smart phone. To tokenize the text data, I was supposed to use HuggingFace’s GPT2 tokenizer. Unfortunately, HuggingFace only offers the Python version and declined to offer a C/C++ version in [Mar 2020](https://github.com/huggingface/tokenizers/issues/185#issuecomment-594615029). 6 | 7 | HuggingFace does offer a [Rust version](https://github.com/huggingface/tokenizers). But I'm worried that putting Rust into mobile development will create more problems. 8 | 9 | According to [their document](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/nemo_megatron/gpt/gpt_training.html#quick-start), NVIDIA NeMo project uses the Google Sentencepiece tokenizer library, which is in C++. 
However, reading the source of HuggingFace [GPT2’s tokenizer](https://huggingface.co/docs/transformers/v4.26.0/en/model_doc/gpt2#transformers.GPT2Tokenizer), I noticed that it is of a different kind — a byte-pair encoding tokenizer. According to HuggingFace’s [nice guide](https://huggingface.co/docs/transformers/tokenizer_summary) to tokenizers, 10 | 11 | >Each model has its own tokenizer type. 12 | A pretrained model only performs properly if you feed it an input that was tokenized with the same rules that were used to tokenize its training data. 13 | 14 | 15 | This guide also explains the three main types of tokenizers used with Transformer models: byte-pair encoding (BPE), WordPiece, and SentencePiece. 16 | 17 | Therefore, I decided to translate HuggingFace’s GPT2 tokenizer from Python to C++. 18 | 19 | ## What to Port 20 | 21 | In order to figure out which part of the HuggingFace’s Python tokenizer project is what I need to port, I worte a simple Python script that calls the tokenizer, used `pdb` to step into the functions calls. Then I noticed that [this eight-line Python method](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/models/gpt2/tokenization_gpt2.py#L296-L304) is all what I need to translate if I am gonna run the GPT2 model. 22 | 23 | ```python 24 | def _tokenize(self, text): 25 | """Tokenize a string.""" 26 | bpe_tokens = [] 27 | for token in re.findall(self.pat, text): 28 | token = "".join( 29 | self.byte_encoder[b] for b in token.encode("utf-8") 30 | ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) 31 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) 32 | return bpe_tokens 33 | ``` 34 | 35 | From the above code snippet, we can see it does three things: 36 | 37 | 1. Roughly segment the input text using regular expression matching by calling `re.findall`. 38 | 1. 
Maps control bytes, b, in each candidate token into 255+b. 39 | 1. Maps each candidate token into one or more BPE tokens. 40 | 41 | In some successive notes, I will explain how I port this three steps. In the rest of this article, I will describe how I identified that the above function is all what I need to port. 42 | 43 | ## Install HuggingFace Transformers from Source Code 44 | 45 | I git-cloned the HuggingFace's Transformers repository. 46 | 47 | ```bash 48 | git clone https://github.com/huggingface/transformers 49 | ``` 50 | 51 | I followed the section 52 | [Editable Install](https://huggingface.co/docs/transformers/installation#editable-install), I ran the following command 53 | 54 | ``` 55 | cd transformers 56 | pip install -e . 57 | ``` 58 | 59 | This process might complain about non-exisitng dependencies. Just install them. After this process, we should be able to import `transformers`. 60 | 61 | ```bash 62 | python -c 'import transformers' 63 | ``` 64 | 65 | ## Drafting a Driving Script 66 | 67 | I wrote this script `t.py` to tokenize a string as the following. 68 | 69 | ```python 70 | import transformers 71 | import builtins 72 | from os import path 73 | 74 | def load_gpt2_tokenizer() -> transformers.GPT2Tokenizer: 75 | builtins.open, tmp_open = open, builtins.open 76 | gpt2_dir = "/Users/y/w/gpt2cpp/assets" 77 | tokenizer = transformers.GPT2Tokenizer( 78 | vocab_file=path.join(gpt2_dir, 'vocab.json'), 79 | merges_file=path.join(gpt2_dir, 'merges.txt')) 80 | builtins.open = tmp_open 81 | return tokenizer 82 | 83 | tknzr = load_gpt2_tokenizer() 84 | print(tknzr("zero one two three four")) 85 | ``` 86 | 87 | My purpose is to use pdb to trace into the function call to `tknzr("zero one two three four")`. 88 | 89 | ## Class Hierarchy 90 | 91 | Before showing the tracing result, it would be helpful to understand the class hierarchy. 92 | 93 | 1. 
[`class GPT2Tokenizer(PreTrainedTokenizer):`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/models/gpt2/tokenization_gpt2.py#LL104C12-L104C12) 94 | 95 | 1. [`class PreTrainedTokenizer(PreTrainedTokenizerBase):`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#LL333C7-L333C26) 96 | 97 | 1. [`class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L1476) 98 | 99 | ## Trace 100 | 101 | Running the above driver script using the following command 102 | 103 | ```bash 104 | python -m pdb t.py 105 | ``` 106 | 107 | reveals the following calling chain: 108 | 109 | - [`PreTrainedTokenizerBase.__call__`]( 110 | https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2452-L2539) 111 | - [`PreTrainedTokenizerBase._switch_to_input_mode`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L3564-L3568) is an empty implementation 112 | - [`PreTrainedTokenizerBase._call_one`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2541-L2651) 113 | - [`PreTrainedTokenizerBase.encode_plus`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2653-L2724) 114 | - [`PreTrainedTokenizerBase._get_padding_truncation_strategies`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2314-L2450) 115 | - Runs [`padding_strategy = 
PaddingStrategy.DO_NOT_PAD`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2372) 116 | - Runs [`truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#LL2399C13-L2399C69) 117 | - Runs [`return padding_strategy, truncation_strategy, max_length=None, kwargs={}`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2450) 118 | - [`PreTrainedTokenizer._encode_plus`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L593-L669) 119 | - [nested function `get_input_ids`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L614C33-L638) 120 | - [`PreTrainedTokenizer.tokenize`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L481-L549) 121 | - `all_special_tokens_extended = {'<|endoftext|>': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}` 122 | - [`text, kwargs = self.prepare_for_tokenization(text, **kwargs) 123 | `](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L502) does nothing. 
124 | - [`no_split_token = set(self.unique_no_split_tokens)`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L516) returns `{}` 125 | - [`tokens = self.tokens_trie.split(text)`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#LL517C15-L517C15) returns `['zero one two three four']` 126 | - [`tokenized_text = []`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L539) 127 | - [`tokenized_text.extend(self._tokenize(token))`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L547) 128 | - [`GPT2Tokenizer._tokenize`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/models/gpt2/tokenization_gpt2.py#L296-L304) 129 | 130 | Here we reached `GPT2Tokenizer._tokenize`. And before this invocation, all above stack frames do basically nothing. 131 | -------------------------------------------------------------------------------- /doc/1.md: -------------------------------------------------------------------------------- 1 | # LLM Tokenziers in C++ (1): Unicode Regexp 2 | 3 | [LLM Tokenizers in C++ (0): Port from Python](https://github.com/wangkuiyi/gpt2cpp/wiki/LLM-Tokenzier-in-C---(0):-Port-from-Python) 4 | 5 | ## Regexp for Unicode 6 | 7 | The first step of BPE tokenizing is to split the text into candidate tokens. The class `GPT2Tokenizer` uses Python’s regexp package to do so. 8 | 9 | The following regexp defines a candidate token. 10 | 11 | ``` 12 | self.pat = re.compile( 13 | r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 14 | ``` 15 | 16 | With the triple-quote, the authors do not have to escape the quote mark. 
The prefix `r` before the triple-quote means the string is treated as a [raw string](https://stackoverflow.com/a/4780104/724872), so all other eescape codes will be ignored. 17 | 18 | The general structure of this regexp is a sequence of sub-regexp’s separated by the logical-or mark, `|`. The first few sub-regexp’s correspond to commonly-used subwords such as `'ve`. 19 | 20 | `\p{L}`, according to [this StackOverflow answer](https://stackoverflow.com/a/14891168/724872), mean a unicode letter. Similarly, `\p{N}` is a number. `\s` means a space character, which could be a space, a tab, or a newline. `\S` mean non-space character. 21 | 22 | So, ` ?\p{L}+` means a sequence of one or more letters prefixed with or without a whitespace. This represents a “word”. Similarly, ` ?\p{N}+` represents a number without signs or the dot. 23 | 24 | `[^\s\p{L}\p{N}]` means a character which is not a space, a letter, or a number. This leaves us the puctunations. Therefore, ` ?[^\s\p{L}\p{N}]+` is a sequence of successive punctuators prefixed with or without a whitespace. 25 | 26 | `\s+` means one or more spaces. 27 | 28 | `\s+(?!\S)` uses the [look-ahead assertion](https://en.wikipedia.org/wiki/Regular_expression#Assertions) `(?!...)`. It matches one or more spaces followed by, but excluding, a non-space character. With the existence of this sub-regexp, the successive of N spaces followed by a word is recognized as two tokens -- `\s+(?!\S)` matches the first token consisting of N-1 spaces, and ` ?\p{L}+` matches the second token consisting the last space and the word. The purpose is to have the second word recognized as a token with a leading space, which indicates that the word is not the first word of a sentence. If we remove `\s+(?!\S)`, the N spaces will be recognized as the first token, and the word as the second one, without the leading space. 
29 | 30 | **Unfortunately, re2 [does NOT support](https://github.com/google/re2/wiki/Syntax) look-ahead.** 31 | 32 | > (?!re) before text not matching re (NOT SUPPORTED) 33 | 34 | **This will slightly change the idea into that the successive two or more spaces separate sentences, but not words.** 35 | 36 | ## Unicode Regexp in C++ 37 | 38 | `[std::regex](https://en.cppreference.com/w/cpp/regex)` does not accept Unicode regexp `\p{L}` or `\p{N}`. If we do so, the progrma will abort with the 39 | `error_escape` and the message 40 | 41 | >the expression contains an invalid escaped character or a trailing escape 42 | 43 | 44 | This seems [a known problem](https://stackoverflow.com/a/38002322/724872), and the workaround is `boo`s`t::wregex`. Unfortunately, Boost is not part of the iOS SDK and I do not want to bring it in as a dependency of my Xcode project. Moreover, according to [this answer](https://stackoverflow.com/a/38543269/724872), 45 | 46 | >The Boost.Regex documentation explicitly states that there's no support for Unicode-specific character classes when using boost::wregex. If you want this functionality, you'll need to build Boost.Regex with ICU support enabled then use the boost::u32regex type instead of boost::wregex. 47 | 48 | No! I do not want to build Boost from source code because it is huge. 49 | 50 | I then remembered several years ago, I read Russ Cox’s great notes about doing [regular expression the right way](https://swtch.com/~rsc/regexp/) and his work [RE2](https://github.com/google/re2). 51 | 52 | RE2 is small. It [supports Unicode regexps](https://github.com/google/re2/blob/b025c6a3ae05995660e3b882eb3277f4399ced1a/re2/testing/re2_test.cc#L1383-L1409). More importantly, I can build it using CMake for both my macOS and iOS (Simulator)! The following commands builds RE2 for the host. 53 | 54 | ```bash 55 | cmake -B b -S . 
-DCMAKE_INSTALL_PREFIX=b/install 56 | cmake —build b —target install 57 | ``` 58 | 59 | The following commands builds RE2 for iOS Simulator. You can change `iphonesimulator` into `iphones` to make it build for iOS devices. 60 | 61 | ```bash 62 | cmake -B build-ios -S . \ 63 | -DCMAKE_SYSTEM_NAME=iOS \ 64 | -DCMAKE_OSX_SYSROOT="$(xcodebuild -version -sdk iphonesimulator Path)" \ 65 | -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \ 66 | -DCMAKE_IOS_INSTALL_COMBINED=YES \ 67 | -DCMAKE_INSTALL_PREFIX=build-ios/install 68 | 69 | cmake --build build-ios --target install 70 | ``` 71 | 72 | ## Side-by-Side 73 | 74 | The following Python code is copy-n-pasted from the Transformers’ tokenzier repository. 75 | 76 | ```python 77 | import regex as re 78 | 79 | pat = re.compile( 80 | r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 81 | 82 | def f(text): 83 | for token in re.findall(pat, text): 84 | print(f"token=\"{token}\"") 85 | 86 | f("we'd see you say 世界你好真实好的很啊") 87 | ``` 88 | 89 | The output is as the following. 90 | 91 | ``` 92 | token="we" 93 | token="'d" 94 | token=" " 95 | token="see" 96 | token=" " 97 | token="you" 98 | token=" say" 99 | token=" 世界你好真实好的很啊" 100 | ``` 101 | 102 | The following is the corresponding C++ code. 103 | 104 | ```c++ 105 | #include 106 | #include 107 | #include 108 | #include 109 | 110 | int main() { 111 | std::string w; 112 | std::string text = "we'd see you say 世界你好真实好的很啊"; 113 | re2::StringPiece input(text); 114 | 115 | RE2 re("('s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+\\(?!\\S\\)|\\s+)"); 116 | assert(re.ok()); // compiled; if not, see re.error(); 117 | 118 | std::string var; 119 | int value; 120 | while (RE2::FindAndConsume(&input, re, &w)) { 121 | std::cout << "token=\"" << w << "\"" << std::endl; 122 | } 123 | } 124 | ``` 125 | 126 | The following command builds it and links the RE2 static libarary. 
127 | 128 | ```bash 129 | clang++ -std=c++20 b.cc \ 130 | -I ~/w/re2/b/install/include \ 131 | -L ~/w/re2/b/install/lib -lre2 -o b 132 | ``` 133 | 134 | The output is identical to the above from the Python code. 135 | -------------------------------------------------------------------------------- /doc/u.md: -------------------------------------------------------------------------------- 1 | # Everything C++ and Python Programmers Need to Know about Unicode 2 | 3 | ## Unicode 4 | 5 | Unicode is larger character set than ASCII. It contains codepoints for non-English characters. A [codepoint](https://en.wikipedia.org/wiki/Code_point) is a numberic value that maps to a specific [character](https://en.wikipedia.org/wiki/Character_(computing)). A character is a symbol we draw on screens and papers. 6 | 7 | Unlike an ASCII codepoint ranges from 0 to 127 and takes a byte with high-bit fixed to 0, or a `char`, Unicode has more codepoints that we will need four bytes, or a `uint32_t`, or `wchar_t`, to save each codepoint. 8 | 9 | ## UTF-8 10 | 11 | Most characters in Unicode are infrequent. Inspired by Huffman coding, we want to use less than four bytes for those frequent codepoints. This idea leads to a well-known encoding algorithm, UTF-8, which encodes each Unicode codepoint into one, two, three, or four bytes. 12 | 13 | Screenshot 2023-02-07 at 5 08 03 PM 14 | 15 | ## One-byte UTF-8 Codepoints 16 | 17 | Each one-byte UTF-8 encoded codepoint corresponds to an ASCII codepoint. 18 | 19 | 20 | 21 | In the above ASCII table, the character `$` has the codepoint with decimal value 37 and, equivalently, the binary value 00100100, where the high-bit is 0. 22 | 23 | ## Multiple-byte UTF-8 Codepoints 24 | 25 | When we use a modern text editor to type C++ source code, all string literals are encoded in UTF-8. 26 | 27 | The following source code assigns a UTF-8 encoded string to a `std::string`-typed variable `text`. 
28 | 29 | ```c++ 30 | std::string text = "Hi 益"; 31 | ``` 32 | 33 | The first three characters, `H`, `i` and ` `, are in ASCII. So each of them takes one byte in an UTF-8 encoded string. We are curious to know how many bytes the characters `益` takes in this UTF-8 string. 34 | 35 | Let us do the following anatomy. 36 | 37 | ```c++ 38 | strlen(s.c_str()) // returns 6 39 | s.size() // returns 6 40 | ``` 41 | 42 | Because `std::string` is an alias of `basic_string`, which represents a sequence of `char`s. In the above example, the character `益` must take 6-3=3 bytes. 43 | 44 | The following code snippet prints each of these six bytes: 45 | 46 | ```c++ 47 | for(size_t i = 0; i < text.size(); ++i) { 48 | std::cout << " " << static_cast(static_cast(text[i])); 49 | } 50 | ``` 51 | 52 | It gives us: 53 | 54 | ``` 55 | 72 105 32 231 155 138 56 | ``` 57 | 58 | Look up the above ASCII table, we can see 59 | 60 | - 72 is `H` 61 | - 105 is `i` 62 | - 32 is ` ` (the whitespace) 63 | 64 | Converting the decimal values of the rest three bytes into binary, we get: 65 | 66 | ``` 67 | 231 = 11100111 68 | 155 = 10011011 69 | 138 = 10001010 70 | ``` 71 | 72 | Looking up the above UTF-8 encoding table, the three prefixes, 1110, 10, and 10, match the case of a three-byte codepoint. So, we remove the three prefixes and concatenate the rest. 73 | 74 | ``` 75 | 0111 011011 001010 76 | ``` 77 | 78 | Converting this binary number into decimal, we get 79 | 80 | 30410 81 | 82 | Google tells us 30410 **is** the codepoint of `益`! 83 | 84 | ## From `char` to `wchar_t` 85 | 86 | Given that the character `益` takes a value that requires more than one byte to store, and the C/C++ type `char` takes only one byte, it is illegal to assign a multi-byte Unicode codepoint to a `char`-typed variables. The following line of code will trigger a compilation error. 
87 | 88 | ```c++ 89 | char c = '益'; // character too large for enclosing character literal type 90 | ``` 91 | 92 | Therefore, to fully support Unicode, C/C++ needs a new character type, which takes four bytes -- the longest UTF-8 codepoint takes four bytes. This is `wchar_t`. 93 | 94 | However, changing only `char` into `wchar_t` is not enough. The following code give us the same error. 95 | 96 | ``` 97 | wchar_t c = '益'; // character too large for enclosing character literal type 98 | ``` 99 | 100 | This is because the literal syntax (`' '`) is for defining a `char` rather than a `wchar_t`. We need the `L`-prefix to denote that each character in the string takes four bytes. 101 | 102 | ``` 103 | wchar_t c = L'益'; // This works! 104 | ``` 105 | 106 | And we have `sizeof(c) == 4` because there are four Unicode codepoitns in the string. 107 | 108 | We have `sizeof(c[0])==4` on Linux, macOS, and other POSIX-compatible systems. It is 2 on Windows due to a POSIX-incompatibility issue. 109 | 110 | ## From `std::string` to `std::wstring` 111 | 112 | | string type | implementation | literal syntax | 113 | | ------------- | ------------- | -------------- | 114 | | `std::string` | `/* std::basic_string */` | `s = "Hi 益"` | 115 | | `std::wstring` | `/* std::basic_string */` | `w = L"Hi 益"` | 116 | 117 | We have `s.size()==6` because the UTF-i encoding of `"Hi 益"` takes 6 bytes. We have `w.size()==4` because it contains 4 Unicode codepoints. 118 | 119 | The following code snippet prints each codepoint. 120 | 121 | ```c++ 122 | for(size_t i = 0; i < text.size(); ++i) { 123 | std::cout << " " << static_cast(text[i]); 124 | } 125 | ``` 126 | 127 | and gives 128 | 129 | ``` 130 | 72 105 32 30410 131 | ``` 132 | 133 | We do not have to manually decoding it by looking up the UTF-8 prefix table. 134 | 135 | The following functions encode wstring in UTF-8 and decode UTF-8 encoded char-string to wstring. 
136 | 137 | ```c++ 138 | #include 139 | #include 140 | 141 | std::wstring utf8_to_wstring (const std::string& str) { 142 | std::wstring_convert> myconv; 143 | return myconv.from_bytes(str); 144 | } 145 | 146 | std::string wstring_to_utf8 (const std::wstring& str) { 147 | std::wstring_convert> myconv; 148 | return myconv.to_bytes(str); 149 | } 150 | ``` 151 | 152 | ## From C++ to Python 153 | 154 | Python has two data types `str` and `bytes`. A string literal has type `str`, which behave like `std::wstring` in terms that `len(s:str)` returns the number of Unicode codepoints. By UTF-8 encoding a string, we converts `str` into `bytes`, which behave more like `std::string` as it is a sequence of bytes. The following example tells more details. 155 | 156 | ```python 157 | s = "Hi 益" 158 | print(type(s)) # str 159 | print(len(s)) # 4 160 | print([ord(c) for c in s]) # [72, 105, 32, 30410] 161 | 162 | b = s.encode('utf-8') 163 | print(type(b)) # bytes 164 | print(len(b)) # 6 165 | print([i for i in b]) # [72, 105, 32, 231, 155, 138] 166 | ``` 167 | -------------------------------------------------------------------------------- /tokenizer/assets/README.md: -------------------------------------------------------------------------------- 1 | The script https://github.com/iree-org/iree-jax/blob/main/models/gpt2/setup.sh reveals how to download data files required by GPT-2. The HuggingFace tokenizer requires two files: `merges.txt` and `vocab.json`. Our simplified Python tokenizer `tokenizer/bpe.py` also requires these two files Our C++ version `tokenizer/bpe.{h,cc}` requires `vocab.txt`. 
To get `vocab.txt`, we can run `tool/json-to-txt.py`: 2 | 3 | ```bash 4 | python tool/json-to-txt.py tokenizer/assets/vocab.json > tokenizer/assets/vocab.txt 5 | ``` 6 | -------------------------------------------------------------------------------- /tokenizer/bpe.cc: -------------------------------------------------------------------------------- 1 | #include "bpe.h" 2 | 3 | std::wstring utf8_to_wstring(const std::string& str) { 4 | std::wstring_convert> myconv; 5 | return myconv.from_bytes(str); 6 | } 7 | 8 | std::string wstring_to_utf8(const std::wstring& str) { 9 | std::wstring_convert> myconv; 10 | return myconv.to_bytes(str); 11 | } 12 | 13 | std::string utf8(wchar_t c) { 14 | std::wstring w(1, c); 15 | return wstring_to_utf8(w); 16 | } 17 | 18 | void bytes_to_unicode(std::unordered_map* b2u, 19 | std::unordered_map* u2b) { 20 | auto _insert_range = [=](int start, int end) { 21 | for (int c = start; c <= end; c++) { 22 | b2u->insert({uint8_t(c), wchar_t(c)}); 23 | } 24 | }; 25 | 26 | b2u->clear(); 27 | _insert_range(L'!', L'~'); 28 | _insert_range(L'¡', L'¬'); 29 | _insert_range(L'®', L'ÿ'); 30 | 31 | int n = 0; 32 | for (int b = 0; b < 256; b++) { 33 | if (b2u->find(uint8_t(b)) == b2u->end()) { 34 | b2u->insert({uint8_t(b), wchar_t(256 + n)}); 35 | n++; 36 | } 37 | } 38 | 39 | if (u2b != NULL) { 40 | u2b->clear(); 41 | for (auto e : (*b2u)) { 42 | u2b->insert({e.second, e.first}); 43 | } 44 | } 45 | } 46 | 47 | // Given a token as a UTF8 string, encode each byte into an wchar_t 48 | void byte_encode_token(const std::string& token, 49 | const std::unordered_map& b2u, 50 | std::wstring* result) { 51 | result->resize(0); 52 | for (char c : token) { 53 | wchar_t wc = b2u.at(uint8_t(c)); 54 | result->push_back(wc); 55 | } 56 | } 57 | 58 | void load_merge_rules(std::istream& ins, BPERanks* bpe_ranks) { 59 | bpe_ranks->clear(); 60 | 61 | std::string line; 62 | int n = 0; 63 | while (std::getline(ins, line)) { 64 | if (n > 0) { // Skip the version comment. 
65 | int d = line.find(" "); // merges.txt file use ASCII space 66 | bpe_ranks->insert({{utf8_to_wstring(line.substr(0, d)), 67 | utf8_to_wstring(line.substr(d + 1))}, 68 | n - 1}); 69 | } 70 | n++; 71 | } 72 | } 73 | 74 | void get_pairs(const std::wstring& word, 75 | std::vector>* pairs) { 76 | pairs->clear(); 77 | 78 | if (word.size() < 2) return; 79 | 80 | wchar_t previous = word[0]; 81 | for (int i = 1; i < word.size(); i++) { 82 | pairs->push_back({std::wstring(1, previous), std::wstring(1, word[i])}); 83 | previous = word[i]; 84 | } 85 | } 86 | 87 | void bpe(const std::wstring& token, const BPERanks& bpe_ranks, 88 | std::vector* result) { 89 | std::set merged; // records indices in pairs that were merged. 90 | auto _left = [](int i, std::set& merged) { 91 | for (int j = i - 1; j >= -1; j--) { 92 | if (merged.find(j) == merged.end()) return j; 93 | } 94 | return -1; 95 | }; 96 | auto _right = [](int i, int cap, std::set& merged) { 97 | for (int j = i + 1; j < cap; j++) { 98 | if (merged.find(j) == merged.end()) return j; 99 | } 100 | return cap; 101 | }; 102 | 103 | std::vector> pairs; 104 | get_pairs(token, &pairs); 105 | 106 | while (true) { 107 | int min_score = INT_MAX; 108 | int to_merge = -1; // indices into pairs. 109 | 110 | for (int i = 0; i < pairs.size(); ++i) { 111 | if (merged.find(i) == merged.end()) { // pair i is not merged. 112 | auto iter = bpe_ranks.find(pairs[i]); 113 | int score = iter != bpe_ranks.end() ? 
iter->second : INT_MAX; 114 | if (score < min_score) { 115 | min_score = score; 116 | to_merge = i; 117 | } 118 | } 119 | } 120 | 121 | if (to_merge == -1) break; 122 | 123 | merged.insert(to_merge); 124 | std::wstring merge_into = pairs[to_merge].first + pairs[to_merge].second; 125 | 126 | int l = _left(to_merge, merged); 127 | if (l >= 0) pairs[l].second = merge_into; 128 | int r = _right(to_merge, pairs.size(), merged); 129 | if (r < pairs.size()) pairs[r].first = merge_into; 130 | } // end while (true) 131 | 132 | if (merged.size() == pairs.size()) 133 | result->push_back(token); 134 | else { 135 | for (int i = 0; i < pairs.size(); ++i) { 136 | if (merged.find(i) == merged.end()) { 137 | if (_left(i, merged) < 0) result->push_back(pairs[i].first); 138 | result->push_back(pairs[i].second); 139 | } 140 | } 141 | } 142 | } 143 | 144 | void _tokenize(const std::string& text, RE2& re, BPERanks& bpe_ranks, 145 | const std::unordered_map& b2u, 146 | std::vector* result) { 147 | re2::StringPiece input(text); 148 | std::string token; 149 | while (RE2::FindAndConsume(&input, re, &token)) { 150 | std::wstring wtoken; 151 | byte_encode_token(token, b2u, &wtoken); 152 | 153 | std::vector bpe_tokens; 154 | bpe(wtoken, bpe_ranks, &bpe_tokens); 155 | 156 | for (auto ws : bpe_tokens) { 157 | result->push_back(wstring_to_utf8(ws)); 158 | } 159 | } 160 | } 161 | 162 | void tokenize(const std::string& text, RE2& re, BPERanks& bpe_ranks, 163 | const std::unordered_map& b2u, 164 | std::vector* result) { 165 | const std::string eot("<|endoftext|>"); 166 | size_t s = 0; 167 | size_t i = text.find(eot); 168 | while (i != std::string::npos) { 169 | _tokenize(text.substr(s, i - s), re, bpe_ranks, b2u, result); 170 | result->push_back(eot); 171 | s = i + eot.size(); 172 | i = text.find(eot, s); 173 | } 174 | _tokenize(text.substr(s), re, bpe_ranks, b2u, result); 175 | } 176 | 177 | void load_vocab(std::istream& ins, std::unordered_map* t2i, 178 | std::unordered_map* i2t) { 179 | 
t2i->clear(); 180 | i2t->clear(); 181 | 182 | std::string line; 183 | std::string token; 184 | int n = 0; 185 | while (std::getline(ins, line)) { 186 | if (n % 2 == 0) { 187 | token = line; 188 | } else { 189 | t2i->insert({token, std::stoi(line)}); 190 | i2t->insert({std::stoi(line), token}); 191 | } 192 | n++; 193 | } 194 | } 195 | 196 | void encode(const std::string& text, RE2& re, BPERanks& bpe_ranks, 197 | std::unordered_map& b2u, 198 | const std::unordered_map& t2i, 199 | std::vector* ids) { 200 | std::vector result; 201 | tokenize(text, re, bpe_ranks, b2u, &result); 202 | ids->clear(); 203 | for (auto s : result) { 204 | ids->push_back(t2i.at(s)); 205 | } 206 | } 207 | 208 | std::string decode(const std::vector& ids, 209 | const std::unordered_map& u2b, 210 | const std::unordered_map& i2t) { 211 | std::string concat; 212 | for (int id : ids) { 213 | concat += i2t.at(id); 214 | } 215 | 216 | std::wstring w = utf8_to_wstring(concat); 217 | std::string r; 218 | for (wchar_t c : w) { 219 | r.push_back(char(u2b.at(c))); 220 | } 221 | return r; 222 | } 223 | -------------------------------------------------------------------------------- /tokenizer/bpe.h: -------------------------------------------------------------------------------- 1 | #ifndef HUGGINGFACE_TRANSFORMERS_TOKENIZER_GPT2_BPE 2 | #define HUGGINGFACE_TRANSFORMERS_TOKENIZER_GPT2_BPE 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | std::wstring utf8_to_wstring(const std::string& str); 16 | 17 | std::string wstring_to_utf8(const std::wstring& str); 18 | 19 | std::string utf8(wchar_t c); 20 | 21 | void bytes_to_unicode(std::unordered_map* b2u, 22 | std::unordered_map* u2b); 23 | 24 | void byte_encode_token(const std::string& token, 25 | const std::unordered_map& b2u, 26 | std::wstring* result); 27 | 28 | // hash_pair_wstring is used in BPERanks to make a pair of wstrings 29 | // hashable, so the pair can be used as 
the key to unordered_map. 30 | struct hash_pair_wstring { 31 | size_t operator()(const std::pair& p) const { 32 | auto hash1 = std::hash{}(p.first); 33 | auto hash2 = std::hash{}(p.second); 34 | // If hash1 == hash2, their XOR is zero. 35 | return (hash1 != hash2) ? hash1 ^ hash2 : hash1; 36 | } 37 | }; 38 | 39 | // BPERanks maps each merge rule, which is a pair of wstrings, to its 40 | // rank. This mapping allows quick lookup for the optimal merge rule. 41 | using BPERanks = std::unordered_map, int, 42 | hash_pair_wstring>; 43 | 44 | void load_merge_rules(std::istream& ins, BPERanks* bpe_ranks); 45 | 46 | void get_pairs(const std::wstring& word, 47 | std::vector >* pairs); 48 | 49 | void bpe(const std::wstring& token, const BPERanks& bpe_ranks, 50 | std::vector* result); 51 | 52 | void tokenize(const std::string& text, RE2& re, BPERanks& bpe_ranks, 53 | const std::unordered_map& b2u, 54 | std::vector* result); 55 | 56 | void load_vocab(std::istream& ins, std::unordered_map* t2i, 57 | std::unordered_map* i2t); 58 | void encode(const std::string& text, RE2& re, BPERanks& bpe_ranks, 59 | std::unordered_map& b2u, 60 | const std::unordered_map& t2i, 61 | std::vector* ids); 62 | std::string decode(const std::vector& ids, 63 | const std::unordered_map& u2b, 64 | const std::unordered_map& i2t); 65 | 66 | #endif // HUGGINGFACE_TRANSFORMERS_TOKENIZER_GPT2_BPE 67 | -------------------------------------------------------------------------------- /tokenizer/bpe.py: -------------------------------------------------------------------------------- 1 | import regex as re 2 | import json 3 | import os 4 | 5 | 6 | def bytes_to_unicode(): 7 | bs = ( 8 | list(range(ord("!"), ord("~") + 1)) 9 | + list(range(ord("¡"), ord("¬") + 1)) 10 | + list(range(ord("®"), ord("ÿ") + 1)) 11 | ) 12 | cs = bs[:] 13 | n = 0 14 | for b in range(2**8): 15 | if b not in bs: 16 | bs.append(b) 17 | cs.append(2**8 + n) 18 | n += 1 19 | cs = [chr(n) for n in cs] 20 | return dict(zip(bs, cs)) 21 | 22 | 23 | 
def get_pairs(word): 24 | """ 25 | Return set of symbol pairs in a word. 26 | 27 | Word is represented as tuple of symbols (symbols being variable-length strings). 28 | """ 29 | pairs = set() 30 | prev_char = word[0] 31 | for char in word[1:]: 32 | pairs.add((prev_char, char)) 33 | prev_char = char 34 | return pairs 35 | 36 | 37 | class Tokenizer: 38 | def __init__(self, assert_dir: str): 39 | self.unk_token = "<|unknown token|>" 40 | 41 | self.pat = re.compile( 42 | r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" 43 | ) 44 | 45 | with open(os.path.join(assert_dir, "vocab.json"), encoding="utf-8") as f: 46 | self.encoder = json.load(f) 47 | self.decoder = {v: k for k, v in self.encoder.items()} 48 | 49 | self.byte_encoder = bytes_to_unicode() 50 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 51 | 52 | self.cache = {} 53 | with open(os.path.join(assert_dir, "merges.txt"), encoding="utf-8") as f: 54 | bpe_merges = f.read().split("\n")[1:-1] 55 | bpe_merges = [tuple(merge.split()) for merge in bpe_merges] 56 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 57 | 58 | def bpe(self, token): 59 | if token in self.cache: 60 | return self.cache[token] 61 | word = tuple(token) 62 | pairs = get_pairs(word) 63 | if not pairs: 64 | return token 65 | 66 | while True: 67 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) 68 | if bigram not in self.bpe_ranks: 69 | break 70 | first, second = bigram 71 | new_word = [] 72 | i = 0 73 | while i < len(word): 74 | try: 75 | j = word.index(first, i) 76 | except ValueError: 77 | new_word.extend(word[i:]) 78 | break 79 | else: 80 | new_word.extend(word[i:j]) 81 | i = j 82 | 83 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 84 | new_word.append(first + second) 85 | i += 2 86 | else: 87 | new_word.append(word[i]) 88 | i += 1 89 | 90 | new_word = tuple(new_word) 91 | word = new_word 92 | if len(word) == 1: 93 | break 94 | 
else: 95 | pairs = get_pairs(word) 96 | word = " ".join(word) 97 | self.cache[token] = word 98 | return word 99 | 100 | def _tokenize(self, text): 101 | """Tokenize a string.""" 102 | bpe_tokens = [] 103 | for token in re.findall(self.pat, text): 104 | token = "".join( 105 | self.byte_encoder[b] for b in token.encode("utf-8") 106 | ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) 107 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) 108 | return bpe_tokens 109 | 110 | def tokenize(self, text): 111 | bpe_tokens = [] 112 | eot = "<|endoftext|>" 113 | s = 0 114 | i = text.find(eot) 115 | while i != -1: 116 | bpe_tokens.extend(self._tokenize(text[s:i])) 117 | bpe_tokens.append(eot) 118 | s = i + len(eot) 119 | i = text.find(eot, i + len(eot)) 120 | bpe_tokens.extend(self._tokenize(text[s:])) 121 | return bpe_tokens 122 | 123 | def encode(self, text): 124 | return [self.encoder[token] for token in self.tokenize(text)] 125 | 126 | def decode(self, indices): 127 | text = "".join([self.decoder.get(index) for index in indices]) 128 | return bytearray(self.byte_decoder[c] for c in text).decode("utf-8") 129 | 130 | 131 | def test_tokenizer(): 132 | t = Tokenizer(os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets")) 133 | # with open("/tmp/sample.txt") as f: 134 | # for line in f: 135 | # lst = t._tokenize(line[:-1]) # Remove the trailing '\n'. 136 | # print(*lst, sep=', ') # Do no quote strings. 
137 | candidates = [ 138 | "this is <|endoftext|> else<|endoftext|>", 139 | "<|endoftext|> else<|endoftext|>", 140 | "this is <|endoftext|> else", 141 | "this is <|endoftext|>else", 142 | "this is else", 143 | ] 144 | for s in candidates: 145 | assert t.decode(t.encode(s)) == s 146 | 147 | 148 | if __name__ == "__main__": 149 | test_tokenizer() 150 | -------------------------------------------------------------------------------- /tokenizer/bpe_test.cc: -------------------------------------------------------------------------------- 1 | #include "bpe.h" 2 | 3 | RE2 re( 4 | "('s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| " 5 | "?[^\\s\\p{L}\\p{N}]+|\\s+\\(?!\\S\\)|\\s+)"); 6 | 7 | void test_re2() { 8 | assert(re.ok()); // compiled; if not, see re.error(); 9 | 10 | std::string w; 11 | std::string text = "we'd annoyingly 顽皮"; 12 | re2::StringPiece input(text); 13 | 14 | std::vector v; 15 | while (RE2::FindAndConsume(&input, re, &w)) { 16 | v.push_back(w); 17 | } 18 | assert(v == std::vector({"we", "\'d", " annoyingly", " 顽皮"})); 19 | } 20 | 21 | void test_bytes_to_unicode() { 22 | std::unordered_map b2u; 23 | std::unordered_map u2b; 24 | bytes_to_unicode(&b2u, &u2b); 25 | assert(b2u.size() == 256); 26 | assert(b2u[0] == 0x100); 27 | assert(u2b.size() == 256); 28 | } 29 | 30 | void test_byte_encode_token() { 31 | std::unordered_map b2u; 32 | bytes_to_unicode(&b2u, NULL); 33 | 34 | std::string s(" very"); 35 | std::wstring b; 36 | byte_encode_token(s, b2u, &b); 37 | assert("Ġvery" == wstring_to_utf8(b)); 38 | } 39 | 40 | void test_load_merge_rules() { 41 | BPERanks bpe_ranks; 42 | std::fstream merges("/tmp/merges.txt", std::ios::in); 43 | load_merge_rules(merges, &bpe_ranks); 44 | assert(bpe_ranks.size() == 50000); 45 | 46 | auto iter = bpe_ranks.find({utf8_to_wstring("Ġg"), utf8_to_wstring("azed")}); 47 | assert(iter != bpe_ranks.end()); 48 | assert(iter->second = 49999); 49 | } 50 | 51 | void test_get_pairs() { 52 | std::vector > pairs; 53 | 
get_pairs(utf8_to_wstring("very"), &pairs); 54 | assert(pairs.size() == 3); 55 | assert(wstring_to_utf8(pairs[1].first) == "e"); 56 | assert(wstring_to_utf8(pairs[1].second) == "r"); 57 | } 58 | 59 | void test_bpe() { 60 | BPERanks bpe_ranks; 61 | std::fstream merges("/tmp/merges.txt", std::ios::in); 62 | load_merge_rules(merges, &bpe_ranks); 63 | assert(bpe_ranks.size() == 50000); 64 | 65 | std::vector result; 66 | bpe(utf8_to_wstring("annoyingly"), bpe_ranks, &result); 67 | assert(result == std::vector({L"ann", L"oy", L"ingly"})); 68 | 69 | result.clear(); 70 | bpe(utf8_to_wstring("very"), bpe_ranks, &result); 71 | assert(result == std::vector({L"very"})); 72 | } 73 | 74 | auto _print_string_vec = [](std::vector& v) { 75 | // To be compatible with Python's print(*lst, sep=', ') 76 | for (int i = 0; i < v.size(); ++i) { 77 | std::cout << v[i]; 78 | if (i < v.size() - 1) std::cout << ", "; 79 | } 80 | std::cout << std::endl; 81 | }; 82 | 83 | void test_tokenize() { 84 | assert(re.ok()); // compiled; if not, see re.error(); 85 | 86 | BPERanks bpe_ranks; 87 | std::fstream merges("/tmp/merges.txt", std::ios::in); 88 | load_merge_rules(merges, &bpe_ranks); 89 | 90 | std::unordered_map b2u; 91 | bytes_to_unicode(&b2u, NULL); 92 | 93 | std::vector candidates = { 94 | "this is <|endoftext|> else<|endoftext|>", 95 | "<|endoftext|> else<|endoftext|>", "this is <|endoftext|> else", 96 | "this is <|endoftext|>else", "this is else"}; 97 | for (auto s : candidates) { 98 | std::vector result; 99 | tokenize(s, re, bpe_ranks, b2u, &result); 100 | _print_string_vec(result); 101 | } 102 | } 103 | 104 | void test_tokenize_regression() { 105 | assert(re.ok()); // compiled; if not, see re.error(); 106 | 107 | BPERanks bpe_ranks; 108 | std::fstream merges("/tmp/merges.txt", std::ios::in); 109 | load_merge_rules(merges, &bpe_ranks); 110 | 111 | std::unordered_map b2u; 112 | bytes_to_unicode(&b2u, NULL); 113 | 114 | // In order to make sure that /tmp/sample.txt contains many lines of 115 
| // text of different languages, I download the lyrics data from 116 | // https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres, 117 | // and ran the following commands to randomly sample 10000 lines. 118 | /* 119 | cat /tmp/lyrics-data.csv | head -n 1000000 | awk 'NF' | sort | \ 120 | uniq | sort -R | head -n 10000 > /tmp/sample.txt 121 | */ 122 | std::fstream ins("/tmp/sample.txt", std::ios::in); 123 | std::string line; 124 | while (std::getline(ins, line)) { 125 | std::vector<std::string> result; 126 | tokenize(line, re, bpe_ranks, b2u, &result); 127 | _print_string_vec(result); 128 | } 129 | } 130 | 131 | // vocab.txt alternates token and id lines; load_vocab() builds both directions. 132 | void test_load_vocab() { 133 | std::unordered_map<std::string, int> t2i; 134 | std::unordered_map<int, std::string> i2t; 135 | std::fstream vocab_txt("/tmp/vocab.txt", std::ios::in); 136 | load_vocab(vocab_txt, &t2i, &i2t); 137 | assert(t2i.size() == 50257); 138 | assert(i2t.size() == 50257); 139 | assert(t2i["\""] == 1); 140 | assert(i2t[1] == "\""); 141 | } 142 | 143 | // Round-trip check: decode(encode(s)) must reproduce s exactly. 144 | void test_encode_decode() { 145 | BPERanks bpe_ranks; 146 | std::fstream merges("/tmp/merges.txt", std::ios::in); 147 | load_merge_rules(merges, &bpe_ranks); 148 | 149 | std::unordered_map<uint8_t, wchar_t> b2u; 150 | std::unordered_map<wchar_t, uint8_t> u2b; 151 | bytes_to_unicode(&b2u, &u2b); 152 | 153 | std::unordered_map<std::string, int> t2i; 154 | std::unordered_map<int, std::string> i2t; 155 | std::fstream vocab_txt("/tmp/vocab.txt", std::ios::in); 156 | load_vocab(vocab_txt, &t2i, &i2t); 157 | 158 | std::vector<std::string> candidates = { 159 | "this is <|endoftext|> else<|endoftext|>", 160 | "<|endoftext|> else<|endoftext|>", "this is <|endoftext|> else", 161 | "this is <|endoftext|>else", "this is else"}; 162 | for (auto s : candidates) { 163 | std::vector<int> ids; 164 | encode(s, re, bpe_ranks, b2u, t2i, &ids); 165 | assert(ids.size() > 0); 166 | assert(decode(ids, u2b, i2t) == s); 167 | } 168 | } 169 | 170 | int main() { 171 | test_bytes_to_unicode(); 172 | test_re2(); 173 | test_load_merge_rules(); 174 | test_byte_encode_token(); 175 | test_get_pairs(); 176 | test_bpe(); 177 | test_tokenize(); 178 |
test_load_vocab(); 177 | test_encode_decode(); 178 | return 0; 179 | } 180 | -------------------------------------------------------------------------------- /tool/cmp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if len(sys.argv) != 4: 4 | sys.exit(f"Usage: {sys.argv[0]} tokenization_output1 tokenization_output2 tokenization_input") 5 | 6 | o1 = open(sys.argv[1]) 7 | o2 = open(sys.argv[2]) 8 | i0 = open(sys.argv[3]) 9 | 10 | while True: 11 | l1 = o1.readline() 12 | l2 = o2.readline() 13 | li = i0.readline() 14 | 15 | if not l1 or not l2 or not li: 16 | break 17 | 18 | if (l1 != l2): 19 | print(l1[:-1]) 20 | print(l2) 21 | print(li) 22 | -------------------------------------------------------------------------------- /tool/data.py: -------------------------------------------------------------------------------- 1 | # This script finds lyrics of Frech songs in the dataset 2 | # https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres 3 | # 4 | # unzip the downloaded archive.zip file in /tmp, we get: 5 | # /tmp/lyrics-data.csv 6 | # /tmp/artists-data.csv 7 | # 8 | import pandas as pd 9 | 10 | artists = pd.read_csv("/tmp/artists-data.csv") # 4168 artists 11 | artists = artists[artists["Popularity"] > 5] # 297 popular artists 12 | 13 | lyrics = pd.read_csv("/tmp/lyrics-data.csv") # 379931 songs 14 | lyrics = lyrics[lyrics["language"] == "fr"] 15 | 16 | df = lyrics.merge( 17 | artists[["Artist", "Genres", "Link"]], left_on="ALink", right_on="Link", how="inner" 18 | ) 19 | 20 | df = df[df["Lyric"].apply(lambda x: len(x.split(" ")) < 350)] 21 | 22 | df = df["Lyric"] # 317 songs 23 | 24 | for row in df: 25 | # https://stackoverflow.com/a/2077944/724872 26 | print(" ".join(row.replace("\n", " ").split())) 27 | -------------------------------------------------------------------------------- /tool/json-to-txt.py: -------------------------------------------------------------------------------- 1 | import 
sys 2 | import json 3 | 4 | vocab_file = sys.argv[1] 5 | 6 | with open(vocab_file) as f: 7 | d = json.load(f) 8 | for k, v in d.items(): 9 | print(k) 10 | print(v) 11 | --------------------------------------------------------------------------------