├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.md ├── SampleApp ├── SampleApp.xcodeproj │ ├── project.pbxproj │ ├── project.xcworkspace │ │ ├── contents.xcworkspacedata │ │ └── xcshareddata │ │ │ └── IDEWorkspaceChecks.plist │ └── xcuserdata │ │ └── y.xcuserdatad │ │ └── xcschemes │ │ └── xcschememanagement.plist └── SampleApp │ ├── AppDelegate.h │ ├── AppDelegate.m │ ├── Assets.xcassets │ ├── AccentColor.colorset │ │ └── Contents.json │ ├── AppIcon.appiconset │ │ └── Contents.json │ └── Contents.json │ ├── Base.lproj │ ├── LaunchScreen.storyboard │ └── Main.storyboard │ ├── Info.plist │ ├── SceneDelegate.h │ ├── SceneDelegate.m │ ├── ViewController.h │ ├── ViewController.m │ ├── main.m │ ├── merges.txt │ ├── use_re2.cpp │ └── vocab.txt ├── build-re2.sh ├── build.sh ├── doc ├── 0.md ├── 1.md └── u.md ├── tokenizer ├── assets │ ├── README.md │ ├── merges.txt │ ├── vocab.json │ └── vocab.txt ├── bpe.cc ├── bpe.h ├── bpe.py └── bpe_test.cc └── tool ├── cmp.py ├── data.py └── json-to-txt.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | build-*/ 3 | .DS_Store 4 | build/ 5 | *xcuserdata* 6 | __pycache__ 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "re2"] 2 | path = re2 3 | url = https://github.com/google/re2 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(tokenizer CXX) 4 | include(CMakePackageConfigHelpers) 5 | include(CTest) 6 | include(GNUInstallDirs) 7 | 8 | # ABI version 9 | # http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html 10 | set(SONAME 10) 11 | 12 | add_subdirectory(re2) 13 | 14 | add_library(tokenizer tokenizer/bpe.cc tokenizer/bpe.h 
${RE2_SOURCES}) 15 | target_compile_features(tokenizer PUBLIC cxx_std_11) 16 | target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/re2) 17 | set_target_properties(tokenizer PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0) 18 | 19 | if(APPLE) 20 | set_target_properties(tokenizer PROPERTIES 21 | FRAMEWORK TRUE 22 | FRAMEWORK_VERSION A 23 | MACOSX_FRAMEWORK_IDENTIFIER wang.yi.tokenizer 24 | PUBLIC_HEADER "tokenizer/bpe.h" 25 | ) 26 | endif() 27 | 28 | add_executable(bpe_test tokenizer/bpe_test.cc) 29 | set_target_properties(bpe_test 30 | PROPERTIES 31 | RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin" 32 | ) 33 | target_link_libraries(bpe_test PRIVATE tokenizer re2::re2) 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository is a C++ version of the Python HuggingFace tokenizers. 2 | 3 | In the HuggingFace Transformers repo, tokenization is done with 104,603 lines of Python code. It takes 5506 lines for GPT2-specific BPE. I went through the code using the Python Debugger (pdb). It turns out that most of them do nothing but virtual methods in a complicated class hierarchy. I took 120 lines of Python code and put them in the file [`bpe.py`](tokenizer/bpe.py). This program is not a weaker or simpler version; it is the full HuggingFace BPE tokenizer. 4 | 5 | With this extraction, I can port it to C++: [`bpe.h`](tokenizer/bpe.h), [`bpe.cc`](tokenizer/bpe.cc), and [`bpe_test.cc`](tokenizer/bpe_test.cc). 6 | 7 | The following tech notes might help you understand the Python and C++ code: 8 | 9 | 1. [Unicode in C++ and Python](doc/u.md) 10 | 1. [Understanding HuggingFace Tokenizers](doc/0.md) 11 | 1. 
[Unicode-enabled Regular Expression in C++](doc/1.md) 12 | 13 | To run the vanilla HuggingFace GPT-2's BPE tokenizer, follow [this link](https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Tokenizer). 14 | 15 | To run the HuggingFace vanilla BPE tokenizer for GPT2, run the following commands: 16 | 17 | ```bash 18 | pip install transformers 19 | python tool/t.py 20 | ``` 21 | 22 | Please run the following commands to run the extracted 120-line Python tokenizer: 23 | 24 | ```bash 25 | python tokenizer/bpe.py 26 | ``` 27 | 28 | To build the C++ port, you will need CMake. 29 | 30 | ``` 31 | cmake -B /tmp/b -S . 32 | cmake --build /tmp/b 33 | ``` 34 | 35 | Then you can run the unit test. 36 | 37 | ```bash 38 | /tmp/b/bin/bpe_test 39 | ``` 40 | 41 | All three of the above programs load the same `vocab.json` and `merges.txt` files, so they all work exactly like HuggingFace did when it trained and served the GPT2 model. 42 | 43 | ## Known Issue 44 | 45 | RE2 can match Unicode letters from all languages using rules like `\p{L}`. But it doesn't work with look-ahead syntax like `(?!...)`. This would make the C++ version act a little differently than the Python versions when there is more than one space between two words. Please tell me about any C++ regular expression libraries that can handle both Unicode and look-ahead. 46 | 47 | This does not seem a big issue because among the 15,231,221 lines in the [lyrics text dataset](https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres) of 54 languages, the C++ tokenizer generates different output from the HuggingFace one for only 4 lines. And the reasons are all due to two successive spaces. For details, please see https://github.com/wangkuiyi/huggingface-tokenizer-in-cxx/issues/10. 
48 | -------------------------------------------------------------------------------- /SampleApp/SampleApp.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 56; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | 187AFC7E2999E7F7007E357F /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC7D2999E7F7007E357F /* AppDelegate.m */; }; 11 | 187AFC812999E7F7007E357F /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC802999E7F7007E357F /* SceneDelegate.m */; }; 12 | 187AFC842999E7F7007E357F /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC832999E7F7007E357F /* ViewController.m */; }; 13 | 187AFC872999E7F7007E357F /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC852999E7F7007E357F /* Main.storyboard */; }; 14 | 187AFC892999E7F8007E357F /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC882999E7F8007E357F /* Assets.xcassets */; }; 15 | 187AFC8C2999E7F8007E357F /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC8A2999E7F8007E357F /* LaunchScreen.storyboard */; }; 16 | 187AFC8F2999E7F8007E357F /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC8E2999E7F8007E357F /* main.m */; }; 17 | 187AFC962999E822007E357F /* re2.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 187AFC952999E822007E357F /* re2.xcframework */; }; 18 | 187AFC982999E947007E357F /* use_re2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 187AFC972999E947007E357F /* use_re2.cpp */; }; 19 | 187AFC9A2999F069007E357F /* tokenizer.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 187AFC992999F069007E357F /* tokenizer.xcframework */; }; 20 | 187AFC9C2999F2ED007E357F /* merges.txt in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC9B2999F2ED007E357F /* merges.txt */; }; 21 | 
187AFC9E2999F2F7007E357F /* vocab.txt in Resources */ = {isa = PBXBuildFile; fileRef = 187AFC9D2999F2F7007E357F /* vocab.txt */; }; 22 | /* End PBXBuildFile section */ 23 | 24 | /* Begin PBXFileReference section */ 25 | 187AFC792999E7F7007E357F /* SampleApp.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = SampleApp.app; sourceTree = BUILT_PRODUCTS_DIR; }; 26 | 187AFC7C2999E7F7007E357F /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; 27 | 187AFC7D2999E7F7007E357F /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; 28 | 187AFC7F2999E7F7007E357F /* SceneDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = SceneDelegate.h; sourceTree = ""; }; 29 | 187AFC802999E7F7007E357F /* SceneDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = SceneDelegate.m; sourceTree = ""; }; 30 | 187AFC822999E7F7007E357F /* ViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ViewController.h; sourceTree = ""; }; 31 | 187AFC832999E7F7007E357F /* ViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ViewController.m; sourceTree = ""; }; 32 | 187AFC862999E7F7007E357F /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = ""; }; 33 | 187AFC882999E7F8007E357F /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; 34 | 187AFC8B2999E7F8007E357F /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; 35 | 187AFC8D2999E7F8007E357F /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; 
path = Info.plist; sourceTree = ""; }; 36 | 187AFC8E2999E7F8007E357F /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = ""; }; 37 | 187AFC952999E822007E357F /* re2.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = re2.xcframework; path = ../../build/re2/re2.xcframework; sourceTree = ""; }; 38 | 187AFC972999E947007E357F /* use_re2.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = use_re2.cpp; sourceTree = ""; }; 39 | 187AFC992999F069007E357F /* tokenizer.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = tokenizer.xcframework; path = ../../build/tokenizer.xcframework; sourceTree = ""; }; 40 | 187AFC9B2999F2ED007E357F /* merges.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = merges.txt; sourceTree = ""; }; 41 | 187AFC9D2999F2F7007E357F /* vocab.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = vocab.txt; sourceTree = ""; }; 42 | /* End PBXFileReference section */ 43 | 44 | /* Begin PBXFrameworksBuildPhase section */ 45 | 187AFC762999E7F7007E357F /* Frameworks */ = { 46 | isa = PBXFrameworksBuildPhase; 47 | buildActionMask = 2147483647; 48 | files = ( 49 | 187AFC962999E822007E357F /* re2.xcframework in Frameworks */, 50 | 187AFC9A2999F069007E357F /* tokenizer.xcframework in Frameworks */, 51 | ); 52 | runOnlyForDeploymentPostprocessing = 0; 53 | }; 54 | /* End PBXFrameworksBuildPhase section */ 55 | 56 | /* Begin PBXGroup section */ 57 | 187AFC702999E7F7007E357F = { 58 | isa = PBXGroup; 59 | children = ( 60 | 187AFC7B2999E7F7007E357F /* SampleApp */, 61 | 187AFC7A2999E7F7007E357F /* Products */, 62 | ); 63 | sourceTree = ""; 64 | }; 65 | 187AFC7A2999E7F7007E357F /* Products */ = { 66 | isa = PBXGroup; 67 | children = ( 68 | 187AFC792999E7F7007E357F /* SampleApp.app */, 69 | ); 70 | name = Products; 71 | sourceTree = ""; 72 | }; 73 | 
187AFC7B2999E7F7007E357F /* SampleApp */ = { 74 | isa = PBXGroup; 75 | children = ( 76 | 187AFC7C2999E7F7007E357F /* AppDelegate.h */, 77 | 187AFC7D2999E7F7007E357F /* AppDelegate.m */, 78 | 187AFC7F2999E7F7007E357F /* SceneDelegate.h */, 79 | 187AFC802999E7F7007E357F /* SceneDelegate.m */, 80 | 187AFC822999E7F7007E357F /* ViewController.h */, 81 | 187AFC832999E7F7007E357F /* ViewController.m */, 82 | 187AFC972999E947007E357F /* use_re2.cpp */, 83 | 187AFC852999E7F7007E357F /* Main.storyboard */, 84 | 187AFC882999E7F8007E357F /* Assets.xcassets */, 85 | 187AFC8A2999E7F8007E357F /* LaunchScreen.storyboard */, 86 | 187AFC8D2999E7F8007E357F /* Info.plist */, 87 | 187AFC8E2999E7F8007E357F /* main.m */, 88 | 187AFC952999E822007E357F /* re2.xcframework */, 89 | 187AFC992999F069007E357F /* tokenizer.xcframework */, 90 | 187AFC9B2999F2ED007E357F /* merges.txt */, 91 | 187AFC9D2999F2F7007E357F /* vocab.txt */, 92 | ); 93 | path = SampleApp; 94 | sourceTree = ""; 95 | }; 96 | /* End PBXGroup section */ 97 | 98 | /* Begin PBXNativeTarget section */ 99 | 187AFC782999E7F7007E357F /* SampleApp */ = { 100 | isa = PBXNativeTarget; 101 | buildConfigurationList = 187AFC922999E7F8007E357F /* Build configuration list for PBXNativeTarget "SampleApp" */; 102 | buildPhases = ( 103 | 187AFC752999E7F7007E357F /* Sources */, 104 | 187AFC762999E7F7007E357F /* Frameworks */, 105 | 187AFC772999E7F7007E357F /* Resources */, 106 | ); 107 | buildRules = ( 108 | ); 109 | dependencies = ( 110 | ); 111 | name = SampleApp; 112 | productName = SampleApp; 113 | productReference = 187AFC792999E7F7007E357F /* SampleApp.app */; 114 | productType = "com.apple.product-type.application"; 115 | }; 116 | /* End PBXNativeTarget section */ 117 | 118 | /* Begin PBXProject section */ 119 | 187AFC712999E7F7007E357F /* Project object */ = { 120 | isa = PBXProject; 121 | attributes = { 122 | BuildIndependentTargetsInParallel = 1; 123 | LastUpgradeCheck = 1420; 124 | TargetAttributes = { 125 | 187AFC782999E7F7007E357F 
= { 126 | CreatedOnToolsVersion = 14.2; 127 | }; 128 | }; 129 | }; 130 | buildConfigurationList = 187AFC742999E7F7007E357F /* Build configuration list for PBXProject "SampleApp" */; 131 | compatibilityVersion = "Xcode 14.0"; 132 | developmentRegion = en; 133 | hasScannedForEncodings = 0; 134 | knownRegions = ( 135 | en, 136 | Base, 137 | ); 138 | mainGroup = 187AFC702999E7F7007E357F; 139 | productRefGroup = 187AFC7A2999E7F7007E357F /* Products */; 140 | projectDirPath = ""; 141 | projectRoot = ""; 142 | targets = ( 143 | 187AFC782999E7F7007E357F /* SampleApp */, 144 | ); 145 | }; 146 | /* End PBXProject section */ 147 | 148 | /* Begin PBXResourcesBuildPhase section */ 149 | 187AFC772999E7F7007E357F /* Resources */ = { 150 | isa = PBXResourcesBuildPhase; 151 | buildActionMask = 2147483647; 152 | files = ( 153 | 187AFC9E2999F2F7007E357F /* vocab.txt in Resources */, 154 | 187AFC8C2999E7F8007E357F /* LaunchScreen.storyboard in Resources */, 155 | 187AFC892999E7F8007E357F /* Assets.xcassets in Resources */, 156 | 187AFC872999E7F7007E357F /* Main.storyboard in Resources */, 157 | 187AFC9C2999F2ED007E357F /* merges.txt in Resources */, 158 | ); 159 | runOnlyForDeploymentPostprocessing = 0; 160 | }; 161 | /* End PBXResourcesBuildPhase section */ 162 | 163 | /* Begin PBXSourcesBuildPhase section */ 164 | 187AFC752999E7F7007E357F /* Sources */ = { 165 | isa = PBXSourcesBuildPhase; 166 | buildActionMask = 2147483647; 167 | files = ( 168 | 187AFC982999E947007E357F /* use_re2.cpp in Sources */, 169 | 187AFC842999E7F7007E357F /* ViewController.m in Sources */, 170 | 187AFC7E2999E7F7007E357F /* AppDelegate.m in Sources */, 171 | 187AFC8F2999E7F8007E357F /* main.m in Sources */, 172 | 187AFC812999E7F7007E357F /* SceneDelegate.m in Sources */, 173 | ); 174 | runOnlyForDeploymentPostprocessing = 0; 175 | }; 176 | /* End PBXSourcesBuildPhase section */ 177 | 178 | /* Begin PBXVariantGroup section */ 179 | 187AFC852999E7F7007E357F /* Main.storyboard */ = { 180 | isa = 
PBXVariantGroup; 181 | children = ( 182 | 187AFC862999E7F7007E357F /* Base */, 183 | ); 184 | name = Main.storyboard; 185 | sourceTree = ""; 186 | }; 187 | 187AFC8A2999E7F8007E357F /* LaunchScreen.storyboard */ = { 188 | isa = PBXVariantGroup; 189 | children = ( 190 | 187AFC8B2999E7F8007E357F /* Base */, 191 | ); 192 | name = LaunchScreen.storyboard; 193 | sourceTree = ""; 194 | }; 195 | /* End PBXVariantGroup section */ 196 | 197 | /* Begin XCBuildConfiguration section */ 198 | 187AFC902999E7F8007E357F /* Debug */ = { 199 | isa = XCBuildConfiguration; 200 | buildSettings = { 201 | ALWAYS_SEARCH_USER_PATHS = NO; 202 | CLANG_ANALYZER_NONNULL = YES; 203 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 204 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; 205 | CLANG_ENABLE_MODULES = YES; 206 | CLANG_ENABLE_OBJC_ARC = YES; 207 | CLANG_ENABLE_OBJC_WEAK = YES; 208 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 209 | CLANG_WARN_BOOL_CONVERSION = YES; 210 | CLANG_WARN_COMMA = YES; 211 | CLANG_WARN_CONSTANT_CONVERSION = YES; 212 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 213 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 214 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 215 | CLANG_WARN_EMPTY_BODY = YES; 216 | CLANG_WARN_ENUM_CONVERSION = YES; 217 | CLANG_WARN_INFINITE_RECURSION = YES; 218 | CLANG_WARN_INT_CONVERSION = YES; 219 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 220 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 221 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 222 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 223 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 224 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 225 | CLANG_WARN_STRICT_PROTOTYPES = YES; 226 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 227 | CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 228 | CLANG_WARN_UNREACHABLE_CODE = YES; 229 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 230 | COPY_PHASE_STRIP = NO; 231 | DEBUG_INFORMATION_FORMAT = dwarf; 232 | ENABLE_STRICT_OBJC_MSGSEND = YES; 233 | 
ENABLE_TESTABILITY = YES; 234 | GCC_C_LANGUAGE_STANDARD = gnu11; 235 | GCC_DYNAMIC_NO_PIC = NO; 236 | GCC_NO_COMMON_BLOCKS = YES; 237 | GCC_OPTIMIZATION_LEVEL = 0; 238 | GCC_PREPROCESSOR_DEFINITIONS = ( 239 | "DEBUG=1", 240 | "$(inherited)", 241 | ); 242 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 243 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 244 | GCC_WARN_UNDECLARED_SELECTOR = YES; 245 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 246 | GCC_WARN_UNUSED_FUNCTION = YES; 247 | GCC_WARN_UNUSED_VARIABLE = YES; 248 | IPHONEOS_DEPLOYMENT_TARGET = 16.2; 249 | MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; 250 | MTL_FAST_MATH = YES; 251 | ONLY_ACTIVE_ARCH = YES; 252 | SDKROOT = iphoneos; 253 | }; 254 | name = Debug; 255 | }; 256 | 187AFC912999E7F8007E357F /* Release */ = { 257 | isa = XCBuildConfiguration; 258 | buildSettings = { 259 | ALWAYS_SEARCH_USER_PATHS = NO; 260 | CLANG_ANALYZER_NONNULL = YES; 261 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 262 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; 263 | CLANG_ENABLE_MODULES = YES; 264 | CLANG_ENABLE_OBJC_ARC = YES; 265 | CLANG_ENABLE_OBJC_WEAK = YES; 266 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 267 | CLANG_WARN_BOOL_CONVERSION = YES; 268 | CLANG_WARN_COMMA = YES; 269 | CLANG_WARN_CONSTANT_CONVERSION = YES; 270 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 271 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 272 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 273 | CLANG_WARN_EMPTY_BODY = YES; 274 | CLANG_WARN_ENUM_CONVERSION = YES; 275 | CLANG_WARN_INFINITE_RECURSION = YES; 276 | CLANG_WARN_INT_CONVERSION = YES; 277 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 278 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 279 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 280 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 281 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 282 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 283 | CLANG_WARN_STRICT_PROTOTYPES = YES; 284 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 285 | 
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 286 | CLANG_WARN_UNREACHABLE_CODE = YES; 287 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 288 | COPY_PHASE_STRIP = NO; 289 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 290 | ENABLE_NS_ASSERTIONS = NO; 291 | ENABLE_STRICT_OBJC_MSGSEND = YES; 292 | GCC_C_LANGUAGE_STANDARD = gnu11; 293 | GCC_NO_COMMON_BLOCKS = YES; 294 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 295 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 296 | GCC_WARN_UNDECLARED_SELECTOR = YES; 297 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 298 | GCC_WARN_UNUSED_FUNCTION = YES; 299 | GCC_WARN_UNUSED_VARIABLE = YES; 300 | IPHONEOS_DEPLOYMENT_TARGET = 16.2; 301 | MTL_ENABLE_DEBUG_INFO = NO; 302 | MTL_FAST_MATH = YES; 303 | SDKROOT = iphoneos; 304 | VALIDATE_PRODUCT = YES; 305 | }; 306 | name = Release; 307 | }; 308 | 187AFC932999E7F8007E357F /* Debug */ = { 309 | isa = XCBuildConfiguration; 310 | buildSettings = { 311 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 312 | ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 313 | CODE_SIGN_STYLE = Automatic; 314 | CURRENT_PROJECT_VERSION = 1; 315 | DEVELOPMENT_TEAM = ET66P5RYW9; 316 | GENERATE_INFOPLIST_FILE = YES; 317 | INFOPLIST_FILE = SampleApp/Info.plist; 318 | INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 319 | INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; 320 | INFOPLIST_KEY_UIMainStoryboardFile = Main; 321 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 322 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 323 | LD_RUNPATH_SEARCH_PATHS = ( 324 | "$(inherited)", 325 | "@executable_path/Frameworks", 326 | ); 327 | MARKETING_VERSION = 1.0; 328 | PRODUCT_BUNDLE_IDENTIFIER = org.wyi.SampleApp; 329 | 
PRODUCT_NAME = "$(TARGET_NAME)"; 330 | SWIFT_EMIT_LOC_STRINGS = YES; 331 | TARGETED_DEVICE_FAMILY = "1,2"; 332 | }; 333 | name = Debug; 334 | }; 335 | 187AFC942999E7F8007E357F /* Release */ = { 336 | isa = XCBuildConfiguration; 337 | buildSettings = { 338 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 339 | ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 340 | CODE_SIGN_STYLE = Automatic; 341 | CURRENT_PROJECT_VERSION = 1; 342 | DEVELOPMENT_TEAM = ET66P5RYW9; 343 | GENERATE_INFOPLIST_FILE = YES; 344 | INFOPLIST_FILE = SampleApp/Info.plist; 345 | INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 346 | INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; 347 | INFOPLIST_KEY_UIMainStoryboardFile = Main; 348 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 349 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 350 | LD_RUNPATH_SEARCH_PATHS = ( 351 | "$(inherited)", 352 | "@executable_path/Frameworks", 353 | ); 354 | MARKETING_VERSION = 1.0; 355 | PRODUCT_BUNDLE_IDENTIFIER = org.wyi.SampleApp; 356 | PRODUCT_NAME = "$(TARGET_NAME)"; 357 | SWIFT_EMIT_LOC_STRINGS = YES; 358 | TARGETED_DEVICE_FAMILY = "1,2"; 359 | }; 360 | name = Release; 361 | }; 362 | /* End XCBuildConfiguration section */ 363 | 364 | /* Begin XCConfigurationList section */ 365 | 187AFC742999E7F7007E357F /* Build configuration list for PBXProject "SampleApp" */ = { 366 | isa = XCConfigurationList; 367 | buildConfigurations = ( 368 | 187AFC902999E7F8007E357F /* Debug */, 369 | 187AFC912999E7F8007E357F /* Release */, 370 | ); 371 | defaultConfigurationIsVisible = 0; 372 | defaultConfigurationName = Release; 373 | }; 374 | 187AFC922999E7F8007E357F /* Build configuration list for PBXNativeTarget "SampleApp" */ = { 375 
| isa = XCConfigurationList; 376 | buildConfigurations = ( 377 | 187AFC932999E7F8007E357F /* Debug */, 378 | 187AFC942999E7F8007E357F /* Release */, 379 | ); 380 | defaultConfigurationIsVisible = 0; 381 | defaultConfigurationName = Release; 382 | }; 383 | /* End XCConfigurationList section */ 384 | }; 385 | rootObject = 187AFC712999E7F7007E357F /* Project object */; 386 | } 387 | -------------------------------------------------------------------------------- /SampleApp/SampleApp.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /SampleApp/SampleApp.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | IDEDidComputeMac32BitWarning 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /SampleApp/SampleApp.xcodeproj/xcuserdata/y.xcuserdatad/xcschemes/xcschememanagement.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | SchemeUserState 6 | 7 | SampleApp.xcscheme_^#shared#^_ 8 | 9 | orderHint 10 | 0 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/AppDelegate.h: -------------------------------------------------------------------------------- 1 | // 2 | // AppDelegate.h 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 
6 | // 7 | 8 | #import 9 | 10 | @interface AppDelegate : UIResponder 11 | 12 | 13 | @end 14 | 15 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/AppDelegate.m: -------------------------------------------------------------------------------- 1 | // 2 | // AppDelegate.m 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import "AppDelegate.h" 9 | 10 | @interface AppDelegate () 11 | 12 | @end 13 | 14 | @implementation AppDelegate 15 | 16 | 17 | - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions { 18 | // Override point for customization after application launch. 19 | return YES; 20 | } 21 | 22 | 23 | #pragma mark - UISceneSession lifecycle 24 | 25 | 26 | - (UISceneConfiguration *)application:(UIApplication *)application configurationForConnectingSceneSession:(UISceneSession *)connectingSceneSession options:(UISceneConnectionOptions *)options { 27 | // Called when a new scene session is being created. 28 | // Use this method to select a configuration to create the new scene with. 29 | return [[UISceneConfiguration alloc] initWithName:@"Default Configuration" sessionRole:connectingSceneSession.role]; 30 | } 31 | 32 | 33 | - (void)application:(UIApplication *)application didDiscardSceneSessions:(NSSet *)sceneSessions { 34 | // Called when the user discards a scene session. 35 | // If any sessions were discarded while the application was not running, this will be called shortly after application:didFinishLaunchingWithOptions. 36 | // Use this method to release any resources that were specific to the discarded scenes, as they will not return. 
37 | } 38 | 39 | 40 | @end 41 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Assets.xcassets/AccentColor.colorset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "colors" : [ 3 | { 4 | "idiom" : "universal" 5 | } 6 | ], 7 | "info" : { 8 | "author" : "xcode", 9 | "version" : 1 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Assets.xcassets/AppIcon.appiconset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "images" : [ 3 | { 4 | "idiom" : "universal", 5 | "platform" : "ios", 6 | "size" : "1024x1024" 7 | } 8 | ], 9 | "info" : { 10 | "author" : "xcode", 11 | "version" : 1 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Base.lproj/LaunchScreen.storyboard: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Base.lproj/Main.storyboard: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/Info.plist: -------------------------------------------------------------------------------- 1 | 
2 | 3 | 4 | 5 | UIApplicationSceneManifest 6 | 7 | UIApplicationSupportsMultipleScenes 8 | 9 | UISceneConfigurations 10 | 11 | UIWindowSceneSessionRoleApplication 12 | 13 | 14 | UISceneConfigurationName 15 | Default Configuration 16 | UISceneDelegateClassName 17 | SceneDelegate 18 | UISceneStoryboardFile 19 | Main 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/SceneDelegate.h: -------------------------------------------------------------------------------- 1 | // 2 | // SceneDelegate.h 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import 9 | 10 | @interface SceneDelegate : UIResponder 11 | 12 | @property (strong, nonatomic) UIWindow * window; 13 | 14 | @end 15 | 16 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/SceneDelegate.m: -------------------------------------------------------------------------------- 1 | // 2 | // SceneDelegate.m 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import "SceneDelegate.h" 9 | 10 | @interface SceneDelegate () 11 | 12 | @end 13 | 14 | @implementation SceneDelegate 15 | 16 | 17 | - (void)scene:(UIScene *)scene willConnectToSession:(UISceneSession *)session options:(UISceneConnectionOptions *)connectionOptions { 18 | // Use this method to optionally configure and attach the UIWindow `window` to the provided UIWindowScene `scene`. 19 | // If using a storyboard, the `window` property will automatically be initialized and attached to the scene. 20 | // This delegate does not imply the connecting scene or session are new (see `application:configurationForConnectingSceneSession` instead). 21 | } 22 | 23 | 24 | - (void)sceneDidDisconnect:(UIScene *)scene { 25 | // Called as the scene is being released by the system. 26 | // This occurs shortly after the scene enters the background, or when its session is discarded. 
27 | // Release any resources associated with this scene that can be re-created the next time the scene connects. 28 | // The scene may re-connect later, as its session was not necessarily discarded (see `application:didDiscardSceneSessions` instead). 29 | } 30 | 31 | 32 | - (void)sceneDidBecomeActive:(UIScene *)scene { 33 | // Called when the scene has moved from an inactive state to an active state. 34 | // Use this method to restart any tasks that were paused (or not yet started) when the scene was inactive. 35 | } 36 | 37 | 38 | - (void)sceneWillResignActive:(UIScene *)scene { 39 | // Called when the scene will move from an active state to an inactive state. 40 | // This may occur due to temporary interruptions (ex. an incoming phone call). 41 | } 42 | 43 | 44 | - (void)sceneWillEnterForeground:(UIScene *)scene { 45 | // Called as the scene transitions from the background to the foreground. 46 | // Use this method to undo the changes made on entering the background. 47 | } 48 | 49 | 50 | - (void)sceneDidEnterBackground:(UIScene *)scene { 51 | // Called as the scene transitions from the foreground to the background. 52 | // Use this method to save data, release shared resources, and store enough scene-specific state information 53 | // to restore the scene back to its current state. 54 | } 55 | 56 | 57 | @end 58 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/ViewController.h: -------------------------------------------------------------------------------- 1 | // 2 | // ViewController.h 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 
6 | // 7 | 8 | #import 9 | 10 | @interface ViewController : UIViewController 11 | 12 | 13 | @end 14 | 15 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/ViewController.m: -------------------------------------------------------------------------------- 1 | // 2 | // ViewController.m 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import "ViewController.h" 9 | 10 | void test_tokenizer(const char* merges_path, const char* vocab_path); 11 | 12 | @interface ViewController () 13 | 14 | @end 15 | 16 | @implementation ViewController 17 | 18 | - (void)viewDidLoad { 19 | [super viewDidLoad]; 20 | 21 | NSString* merges = [[NSBundle mainBundle] pathForResource:@"merges" 22 | ofType:@"txt"]; 23 | NSString* vocab = [[NSBundle mainBundle] pathForResource:@"vocab" 24 | ofType:@"txt"]; 25 | test_tokenizer([merges UTF8String], [vocab UTF8String]); 26 | } 27 | 28 | 29 | @end 30 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/main.m: -------------------------------------------------------------------------------- 1 | // 2 | // main.m 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 6 | // 7 | 8 | #import 9 | #import "AppDelegate.h" 10 | 11 | int main(int argc, char * argv[]) { 12 | NSString * appDelegateClassName; 13 | @autoreleasepool { 14 | // Setup code that might create autoreleased objects goes here. 15 | appDelegateClassName = NSStringFromClass([AppDelegate class]); 16 | } 17 | return UIApplicationMain(argc, argv, nil, appDelegateClassName); 18 | } 19 | -------------------------------------------------------------------------------- /SampleApp/SampleApp/use_re2.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // use_re2.cpp 3 | // RE2Sample 4 | // 5 | // Created by Yi Wang on 2/12/23. 
6 | // 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | extern "C" void test_tokenizer(const char* merges_path, 17 | const char* vocab_path) { 18 | RE2 re( 19 | "('s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| " 20 | "?[^\\s\\p{L}\\p{N}]+|\\s+\\(?!\\S\\)|\\s+)"); 21 | 22 | BPERanks bpe_ranks; 23 | std::fstream merges(merges_path, std::ios::in); 24 | load_merge_rules(merges, &bpe_ranks); 25 | 26 | std::unordered_map b2u; 27 | std::unordered_map u2b; 28 | bytes_to_unicode(&b2u, &u2b); 29 | 30 | std::unordered_map t2i; 31 | std::unordered_map i2t; 32 | std::fstream vocab_txt(vocab_path, std::ios::in); 33 | load_vocab(vocab_txt, &t2i, &i2t); 34 | 35 | std::vector candidates = { 36 | "this is <|endoftext|> else<|endoftext|>", 37 | "<|endoftext|> else<|endoftext|>", "this is <|endoftext|> else", 38 | "this is <|endoftext|>else", "this is else"}; 39 | for (auto s : candidates) { 40 | std::vector ids; 41 | encode(s, re, bpe_ranks, b2u, t2i, &ids); 42 | assert(ids.size() > 0); 43 | assert(decode(ids, u2b, i2t) == s); 44 | } 45 | std::cout << "Passed testing"; 46 | } 47 | -------------------------------------------------------------------------------- /build-re2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # exit when any command fails 4 | set -e 5 | 6 | # The -f option force the rebuild of IREE runtime frameworks. 
7 | while getopts hf flag; do 8 | case "${flag}" in 9 | f) FORCE_REBUILD="YES" ;; 10 | *) 11 | echo "$0 -h|-f" 12 | echo " -h : display this message" 13 | echo " -f : force rebuild runtime frameworks" 14 | exit 0 15 | ;; 16 | esac 17 | done 18 | 19 | SCRIPT_DIR=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")") 20 | SRC_DIR=$SCRIPT_DIR/re2 21 | BUILD_DIR=$SCRIPT_DIR/build/re2 22 | XCFRAMEWORK="$BUILD_DIR"/re2.xcframework 23 | 24 | 25 | if [[ $FORCE_REBUILD == "YES" ]]; then 26 | echo "┌------------------------------------------------------------------------------┐" 27 | echo " Deleting existig RE2 frameworks ..." 28 | echo "└------------------------------------------------------------------------------┘" 29 | rm -rf "$BUILD_DIR" 30 | fi 31 | 32 | function build_for_ios() { 33 | case $1 in 34 | sim) sdk=iphonesimulator ;; 35 | dev) sdk=iphoneos ;; 36 | *) 37 | echo "Unknown target $1 when calling build_for_ios" 38 | exit 5 39 | ;; 40 | esac 41 | 42 | arch=$2 43 | label=ios-$1-"$arch" 44 | build_dir="$BUILD_DIR"/"$label" 45 | 46 | test_file="$build_dir"/re2.framework/re2 47 | if test -f "$test_file" && lipo -info "$test_file"; then 48 | echo "Skip building iree.framework for $label." 49 | else 50 | echo "┌------------------------------------------------------------------------------┐" 51 | echo " Building for $label ..." 52 | echo " src: $SRC_DIR " 53 | echo " build: $build_dir " 54 | echo " build log: $build_dir/build.log" 55 | echo "└------------------------------------------------------------------------------┘" 56 | mkdir -p "$build_dir" # So to create the build.log file. 57 | cmake -S . 
\ 58 | -B "$build_dir" \ 59 | -DCMAKE_SYSTEM_NAME=iOS \ 60 | -DCMAKE_OSX_SYSROOT="$(xcodebuild -version -sdk $sdk Path)" \ 61 | -DCMAKE_OSX_ARCHITECTURES="$arch" \ 62 | -DCMAKE_SYSTEM_PROCESSOR="$arch" \ 63 | -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \ 64 | -DCMAKE_IOS_INSTALL_COMBINED=YES \ 65 | -DCMAKE_INSTALL_PREFIX="$build_dir"/install \ 66 | -DRE2_BUILD_TESTING=OFF \ 67 | >"$build_dir"/build.log 2>&1 68 | cmake --build "$build_dir" >>"$build_dir"/build.log 2>&1 69 | fi 70 | } 71 | 72 | function build_for_macos() { 73 | arch=$1 74 | label=macos-"$arch" 75 | build_dir="$BUILD_DIR"/"$label" 76 | 77 | test_file="$build_dir"/re2.framework/re2 78 | if test -f "$test_file" && lipo -info "$test_file"; then 79 | echo "Skip building iree.framework for $label." 80 | else 81 | echo "┌------------------------------------------------------------------------------┐" 82 | echo " Building for $label ..." 83 | echo " src: $SRC_DIR " 84 | echo " build: $build_dir " 85 | echo " build log: $build_dir/build.log" 86 | echo "└------------------------------------------------------------------------------┘" 87 | mkdir -p "$build_dir" # So to create the build.log file. 88 | cmake -S . \ 89 | -B "$build_dir" \ 90 | -DRE2_BUILD_TESTING=OFF \ 91 | -DCMAKE_OSX_ARCHITECTURES="$arch" >"$build_dir"/build.log 2>&1 92 | cmake --build "$build_dir" >"$build_dir"/build.log 2>&1 93 | fi 94 | } 95 | 96 | function merge_fat_static_library() { 97 | src_label=$2 98 | dst_label=$1 99 | 100 | src="$BUILD_DIR"/$src_label/re2/re2.framework/re2 101 | dst="$BUILD_DIR"/$dst_label/re2/re2.framework/re2 102 | 103 | if lipo -info "$dst" | grep 'Non-fat' >/dev/null; then 104 | echo "┌------------------------------------------------------------------------------┐" 105 | echo " Building FAT static library ..." 
106 | echo " merge: $src" 107 | echo " into: $dst" 108 | echo "└------------------------------------------------------------------------------┘" 109 | merged=/tmp/libmerged-"$src_label"-"$dst_label".a 110 | lipo "$src" "$dst" -create -output "$merged" 111 | mv "$merged" "$dst" 112 | fi 113 | } 114 | 115 | # Step 1. Build the following frameworks 116 | # 117 | # Note: We cannot build for dev-x86_64 because Apple does not offer 118 | # SDK for it. If we do so, CMake will prompt us about missing required 119 | # architecture x86_64 in file 120 | # /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk/usr/lib/libc++.tbd 121 | # build_for_ios dev x86_64 122 | # 123 | # This step also merge dependent static libraries into the target library. 124 | build_for_ios sim arm64 125 | build_for_ios sim x86_64 126 | build_for_ios dev arm64 127 | build_for_macos x86_64 128 | build_for_macos arm64 129 | 130 | # Step 2. Merge the frameworks of the same OS platform 131 | # ios-simulator-arm64+x86_64 132 | # macos-arm64+x86_64 133 | merge_fat_static_library ios-sim-arm64 ios-sim-x86_64 134 | merge_fat_static_library macos-arm64 macos-x86_64 135 | 136 | # Step 3. Merge the above frameworks into an XCFramework 137 | echo "┌------------------------------------------------------------------------------┐" 138 | echo " Aggregating frameworks into an xcframework ..." 
139 | echo "└------------------------------------------------------------------------------┘" 140 | rm -rf "$XCFRAMEWORK" 141 | xcodebuild -create-xcframework \ 142 | -framework "$BUILD_DIR"/macos-arm64/re2/re2.framework \ 143 | -framework "$BUILD_DIR"/ios-sim-arm64/re2/re2.framework \ 144 | -framework "$BUILD_DIR"/ios-dev-arm64/re2/re2.framework \ 145 | -output "$XCFRAMEWORK" 146 | tree -L 1 -d "$XCFRAMEWORK" 147 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # exit when any command fails 4 | set -e 5 | 6 | # The -f option force the rebuild of IREE runtime frameworks. 7 | while getopts hf flag; do 8 | case "${flag}" in 9 | f) FORCE_REBUILD="YES" ;; 10 | *) 11 | echo "$0 -h|-f" 12 | echo " -h : display this message" 13 | echo " -f : force rebuild runtime frameworks" 14 | exit 0 15 | ;; 16 | esac 17 | done 18 | 19 | SCRIPT_DIR=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")") 20 | SRC_DIR=$SCRIPT_DIR 21 | BUILD_DIR=$SCRIPT_DIR/build 22 | XCFRAMEWORK="$BUILD_DIR"/tokenizer.xcframework 23 | 24 | 25 | # Build the tokenizer into a framework for each target. 26 | if [[ $FORCE_REBUILD == "YES" ]]; then 27 | echo "┌------------------------------------------------------------------------------┐" 28 | echo " Deleting existig IREE runtime frameworks ..." 
29 | echo "└------------------------------------------------------------------------------┘" 30 | rm -rf "$IREE_BUILD_RUNTIME_DIR" 31 | fi 32 | 33 | function build_for_ios() { 34 | case $1 in 35 | sim) sdk=iphonesimulator ;; 36 | dev) sdk=iphoneos ;; 37 | *) 38 | echo "Unknown target $1 when calling build_for_ios" 39 | exit 5 40 | ;; 41 | esac 42 | 43 | arch=$2 44 | label=ios-$1-"$arch" 45 | build_dir="$BUILD_DIR"/"$label" 46 | 47 | test_file="$build_dir"/tokenizer.framework/tokenizer 48 | if test -f "$test_file" && lipo -info "$test_file"; then 49 | echo "Skip building iree.framework for $label." 50 | else 51 | echo "┌------------------------------------------------------------------------------┐" 52 | echo " Building for $label ..." 53 | echo " src: $SRC_DIR " 54 | echo " build: $build_dir " 55 | echo " build log: $build_dir/build.log" 56 | echo "└------------------------------------------------------------------------------┘" 57 | mkdir -p "$build_dir" # So to create the build.log file. 58 | cmake -S . \ 59 | -B "$build_dir" \ 60 | -GNinja \ 61 | -DCMAKE_SYSTEM_NAME=iOS \ 62 | -DCMAKE_OSX_SYSROOT="$(xcodebuild -version -sdk $sdk Path)" \ 63 | -DCMAKE_OSX_ARCHITECTURES="$arch" \ 64 | -DCMAKE_SYSTEM_PROCESSOR="$arch" \ 65 | -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \ 66 | -DCMAKE_IOS_INSTALL_COMBINED=YES \ 67 | -DCMAKE_INSTALL_PREFIX="$build_dir"/install \ 68 | >"$build_dir"/build.log 2>&1 69 | cmake --build "$build_dir" >>"$build_dir"/build.log 2>&1 70 | fi 71 | } 72 | 73 | function build_for_macos() { 74 | arch=$1 75 | label=macos-"$arch" 76 | build_dir="$BUILD_DIR"/"$label" 77 | 78 | test_file="$build_dir"/tokenizer.framework/tokenizer 79 | if test -f "$test_file" && lipo -info "$test_file"; then 80 | echo "Skip building iree.framework for $label." 81 | else 82 | echo "┌------------------------------------------------------------------------------┐" 83 | echo " Building for $label ..." 
84 | echo " src: $SRC_DIR " 85 | echo " build: $build_dir " 86 | echo " build log: $build_dir/build.log" 87 | echo "└------------------------------------------------------------------------------┘" 88 | mkdir -p "$build_dir" # So to create the build.log file. 89 | cmake -S . \ 90 | -B "$build_dir" \ 91 | -GNinja \ 92 | -DCMAKE_OSX_ARCHITECTURES="$arch" >"$build_dir"/build.log 2>&1 93 | cmake --build "$build_dir" >"$build_dir"/build.log 2>&1 94 | fi 95 | } 96 | 97 | function merge_fat_static_library() { 98 | src_label=$2 99 | dst_label=$1 100 | 101 | src="$BUILD_DIR"/$src_label/tokenizer.framework/tokenizer 102 | dst="$BUILD_DIR"/$dst_label/tokenizer.framework/tokenizer 103 | 104 | if lipo -info "$dst" | grep 'Non-fat' >/dev/null; then 105 | echo "┌------------------------------------------------------------------------------┐" 106 | echo " Building FAT static library ..." 107 | echo " merge: $src" 108 | echo " into: $dst" 109 | echo "└------------------------------------------------------------------------------┘" 110 | merged=/tmp/libmerged-"$src_label"-"$dst_label".a 111 | lipo "$src" "$dst" -create -output "$merged" 112 | mv "$merged" "$dst" 113 | fi 114 | } 115 | 116 | # Step 1. Build the following frameworks 117 | # 118 | # Note: We cannot build for dev-x86_64 because Apple does not offer 119 | # SDK for it. If we do so, CMake will prompt us about missing required 120 | # architecture x86_64 in file 121 | # /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk/usr/lib/libc++.tbd 122 | # build_for_ios dev x86_64 123 | # 124 | # This step also merge dependent static libraries into the target library. 125 | build_for_ios sim arm64 126 | build_for_ios sim x86_64 127 | build_for_ios dev arm64 128 | build_for_macos x86_64 129 | build_for_macos arm64 130 | 131 | # Step 2. 
Merge the frameworks of the same OS platform 132 | # ios-simulator-arm64+x86_64 133 | # macos-arm64+x86_64 134 | merge_fat_static_library ios-sim-arm64 ios-sim-x86_64 135 | merge_fat_static_library macos-arm64 macos-x86_64 136 | 137 | # Step 3. Merge the above frameworks into an XCFramework 138 | echo "┌------------------------------------------------------------------------------┐" 139 | echo " Aggregating frameworks into an xcframework ..." 140 | echo "└------------------------------------------------------------------------------┘" 141 | rm -rf "$XCFRAMEWORK" 142 | xcodebuild -create-xcframework \ 143 | -framework "$BUILD_DIR"/macos-arm64/tokenizer.framework \ 144 | -framework "$BUILD_DIR"/ios-sim-arm64/tokenizer.framework \ 145 | -framework "$BUILD_DIR"/ios-dev-arm64/tokenizer.framework \ 146 | -output "$XCFRAMEWORK" 147 | tree -L 1 -d "$XCFRAMEWORK" 148 | -------------------------------------------------------------------------------- /doc/0.md: -------------------------------------------------------------------------------- 1 | # LLM Tokenziers in C++ (0): Port from Python 2 | 3 | ## Why Tokenizer in C++ 4 | 5 | I am trying to run the HuggingFace GPT2 model on a smart phone. To tokenize the text data, I was supposed to use HuggingFace’s GPT2 tokenizer. Unfortunately, HuggingFace only offers the Python version and declined to offer a C/C++ version in [Mar 2020](https://github.com/huggingface/tokenizers/issues/185#issuecomment-594615029). 6 | 7 | HuggingFace does offer a [Rust version](https://github.com/huggingface/tokenizers). But I'm worried that putting Rust into mobile development will create more problems. 8 | 9 | According to [their document](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/nlp/nemo_megatron/gpt/gpt_training.html#quick-start), NVIDIA NeMo project uses the Google Sentencepiece tokenizer library, which is in C++. 
However, reading the source of HuggingFace [GPT2’s tokenizer](https://huggingface.co/docs/transformers/v4.26.0/en/model_doc/gpt2#transformers.GPT2Tokenizer), I noticed that it is of a different kind — a byte-pair encoding tokenizer. According to HuggingFace’s [nice guide](https://huggingface.co/docs/transformers/tokenizer_summary) to tokenizers, 10 | 11 | >Each model has its own tokenizer type. 12 | A pretrained model only performs properly if you feed it an input that was tokenized with the same rules that were used to tokenize its training data. 13 | 14 | 15 | This guide also explains the three main types of tokenizers used with Transformer models: byte-pair encoding (BPE), WordPiece, and SentencePiece. 16 | 17 | Therefore, I decided to translate HuggingFace’s GPT2 tokenizer from Python to C++. 18 | 19 | ## What to Port 20 | 21 | In order to figure out which part of the HuggingFace’s Python tokenizer project is what I need to port, I worte a simple Python script that calls the tokenizer, used `pdb` to step into the functions calls. Then I noticed that [this eight-line Python method](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/models/gpt2/tokenization_gpt2.py#L296-L304) is all what I need to translate if I am gonna run the GPT2 model. 22 | 23 | ```python 24 | def _tokenize(self, text): 25 | """Tokenize a string.""" 26 | bpe_tokens = [] 27 | for token in re.findall(self.pat, text): 28 | token = "".join( 29 | self.byte_encoder[b] for b in token.encode("utf-8") 30 | ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) 31 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) 32 | return bpe_tokens 33 | ``` 34 | 35 | From the above code snippet, we can see it does three things: 36 | 37 | 1. Roughly segment the input text using regular expression matching by calling `re.findall`. 38 | 1. 
Maps control bytes, b, in each candidate token into 255+b. 39 | 1. Maps each candidate token into one or more BPE tokens. 40 | 41 | In some successive notes, I will explain how I port this three steps. In the rest of this article, I will describe how I identified that the above function is all what I need to port. 42 | 43 | ## Install HuggingFace Transformers from Source Code 44 | 45 | I git-cloned the HuggingFace's Transformers repository. 46 | 47 | ```bash 48 | git clone https://github.com/huggingface/transformers 49 | ``` 50 | 51 | I followed the section 52 | [Editable Install](https://huggingface.co/docs/transformers/installation#editable-install), I ran the following command 53 | 54 | ``` 55 | cd transformers 56 | pip install -e . 57 | ``` 58 | 59 | This process might complain about non-exisitng dependencies. Just install them. After this process, we should be able to import `transformers`. 60 | 61 | ```bash 62 | python -c 'import transformers' 63 | ``` 64 | 65 | ## Drafting a Driving Script 66 | 67 | I wrote this script `t.py` to tokenize a string as the following. 68 | 69 | ```python 70 | import transformers 71 | import builtins 72 | from os import path 73 | 74 | def load_gpt2_tokenizer() -> transformers.GPT2Tokenizer: 75 | builtins.open, tmp_open = open, builtins.open 76 | gpt2_dir = "/Users/y/w/gpt2cpp/assets" 77 | tokenizer = transformers.GPT2Tokenizer( 78 | vocab_file=path.join(gpt2_dir, 'vocab.json'), 79 | merges_file=path.join(gpt2_dir, 'merges.txt')) 80 | builtins.open = tmp_open 81 | return tokenizer 82 | 83 | tknzr = load_gpt2_tokenizer() 84 | print(tknzr("zero one two three four")) 85 | ``` 86 | 87 | My purpose is to use pdb to trace into the function call to `tknzr("zero one two three four")`. 88 | 89 | ## Class Hierarchy 90 | 91 | Before showing the tracing result, it would be helpful to understand the class hierarchy. 92 | 93 | 1. 
[`class GPT2Tokenizer(PreTrainedTokenizer):`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/models/gpt2/tokenization_gpt2.py#LL104C12-L104C12) 94 | 95 | 1. [`class PreTrainedTokenizer(PreTrainedTokenizerBase):`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#LL333C7-L333C26) 96 | 97 | 1. [`class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L1476) 98 | 99 | ## Trace 100 | 101 | Running the above driver script using the following command 102 | 103 | ```bash 104 | python -m pdb t.py 105 | ``` 106 | 107 | reveals the following calling chain: 108 | 109 | - [`PreTrainedTokenizerBase.__call__`]( 110 | https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2452-L2539) 111 | - [`PreTrainedTokenizerBase._switch_to_input_mode`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L3564-L3568) is an empty implementation 112 | - [`PreTrainedTokenizerBase._call_one`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2541-L2651) 113 | - [`PreTrainedTokenizerBase.encode_plus`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2653-L2724) 114 | - [`PreTrainedTokenizerBase._get_padding_truncation_strategies`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2314-L2450) 115 | - Runs [`padding_strategy = 
PaddingStrategy.DO_NOT_PAD`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2372) 116 | - Runs [`truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#LL2399C13-L2399C69) 117 | - Runs [`return padding_strategy, truncation_strategy, max_length=None, kwargs={}`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils_base.py#L2450) 118 | - [`PreTrainedTokenizer._encode_plus`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L593-L669) 119 | - [nested function `get_input_ids`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L614C33-L638) 120 | - [`PreTrainedTokenizer.tokenize`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L481-L549) 121 | - `all_special_tokens_extended = {'<|endoftext|>': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}` 122 | - [`text, kwargs = self.prepare_for_tokenization(text, **kwargs) 123 | `](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L502) does nothing. 
124 | - [`no_split_token = set(self.unique_no_split_tokens)`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L516) returns `{}` 125 | - [`tokens = self.tokens_trie.split(text)`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#LL517C15-L517C15) returns `['zero one two three four']` 126 | - [`tokenized_text = []`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L539) 127 | - [`tokenized_text.extend(self._tokenize(token))`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/tokenization_utils.py#L547) 128 | - [`GPT2Tokenizer._tokenize`](https://github.com/huggingface/transformers/blob/5b49376202863d3798d2ff8a8ba61590542a1141/src/transformers/models/gpt2/tokenization_gpt2.py#L296-L304) 129 | 130 | Here we reached `GPT2Tokenizer._tokenize`. And before this invocation, all above stack frames do basically nothing. 131 | -------------------------------------------------------------------------------- /doc/1.md: -------------------------------------------------------------------------------- 1 | # LLM Tokenziers in C++ (1): Unicode Regexp 2 | 3 | [LLM Tokenizers in C++ (0): Port from Python](https://github.com/wangkuiyi/gpt2cpp/wiki/LLM-Tokenzier-in-C---(0):-Port-from-Python) 4 | 5 | ## Regexp for Unicode 6 | 7 | The first step of BPE tokenizing is to split the text into candidate tokens. The class `GPT2Tokenizer` uses Python’s regexp package to do so. 8 | 9 | The following regexp defines a candidate token. 10 | 11 | ``` 12 | self.pat = re.compile( 13 | r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 14 | ``` 15 | 16 | With the triple-quote, the authors do not have to escape the quote mark. 
The prefix `r` before the triple-quote means the string is treated as a [raw string](https://stackoverflow.com/a/4780104/724872), so all other eescape codes will be ignored. 17 | 18 | The general structure of this regexp is a sequence of sub-regexp’s separated by the logical-or mark, `|`. The first few sub-regexp’s correspond to commonly-used subwords such as `'ve`. 19 | 20 | `\p{L}`, according to [this StackOverflow answer](https://stackoverflow.com/a/14891168/724872), mean a unicode letter. Similarly, `\p{N}` is a number. `\s` means a space character, which could be a space, a tab, or a newline. `\S` mean non-space character. 21 | 22 | So, ` ?\p{L}+` means a sequence of one or more letters prefixed with or without a whitespace. This represents a “word”. Similarly, ` ?\p{N}+` represents a number without signs or the dot. 23 | 24 | `[^\s\p{L}\p{N}]` means a character which is not a space, a letter, or a number. This leaves us the puctunations. Therefore, ` ?[^\s\p{L}\p{N}]+` is a sequence of successive punctuators prefixed with or without a whitespace. 25 | 26 | `\s+` means one or more spaces. 27 | 28 | `\s+(?!\S)` uses the [look-ahead assertion](https://en.wikipedia.org/wiki/Regular_expression#Assertions) `(?!...)`. It matches one or more spaces followed by, but excluding, a non-space character. With the existence of this sub-regexp, the successive of N spaces followed by a word is recognized as two tokens -- `\s+(?!\S)` matches the first token consisting of N-1 spaces, and ` ?\p{L}+` matches the second token consisting the last space and the word. The purpose is to have the second word recognized as a token with a leading space, which indicates that the word is not the first word of a sentence. If we remove `\s+(?!\S)`, the N spaces will be recognized as the first token, and the word as the second one, without the leading space. 
29 | 30 | **Unfortunately, re2 [does NOT support](https://github.com/google/re2/wiki/Syntax) look-ahead.** 31 | 32 | > (?!re) before text not matching re (NOT SUPPORTED) 33 | 34 | **This will slightly change the idea into that the successive two or more spaces separate sentences, but not words.** 35 | 36 | ## Unicode Regexp in C++ 37 | 38 | `[std::regex](https://en.cppreference.com/w/cpp/regex)` does not accept Unicode regexp `\p{L}` or `\p{N}`. If we do so, the progrma will abort with the 39 | `error_escape` and the message 40 | 41 | >the expression contains an invalid escaped character or a trailing escape 42 | 43 | 44 | This seems [a known problem](https://stackoverflow.com/a/38002322/724872), and the workaround is `boo`s`t::wregex`. Unfortunately, Boost is not part of the iOS SDK and I do not want to bring it in as a dependency of my Xcode project. Moreover, according to [this answer](https://stackoverflow.com/a/38543269/724872), 45 | 46 | >The Boost.Regex documentation explicitly states that there's no support for Unicode-specific character classes when using boost::wregex. If you want this functionality, you'll need to build Boost.Regex with ICU support enabled then use the boost::u32regex type instead of boost::wregex. 47 | 48 | No! I do not want to build Boost from source code because it is huge. 49 | 50 | I then remembered several years ago, I read Russ Cox’s great notes about doing [regular expression the right way](https://swtch.com/~rsc/regexp/) and his work [RE2](https://github.com/google/re2). 51 | 52 | RE2 is small. It [supports Unicode regexps](https://github.com/google/re2/blob/b025c6a3ae05995660e3b882eb3277f4399ced1a/re2/testing/re2_test.cc#L1383-L1409). More importantly, I can build it using CMake for both my macOS and iOS (Simulator)! The following commands builds RE2 for the host. 53 | 54 | ```bash 55 | cmake -B b -S . 
-DCMAKE_INSTALL_PREFIX=b/install 56 | cmake —build b —target install 57 | ``` 58 | 59 | The following commands builds RE2 for iOS Simulator. You can change `iphonesimulator` into `iphones` to make it build for iOS devices. 60 | 61 | ```bash 62 | cmake -B build-ios -S . \ 63 | -DCMAKE_SYSTEM_NAME=iOS \ 64 | -DCMAKE_OSX_SYSROOT="$(xcodebuild -version -sdk iphonesimulator Path)" \ 65 | -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \ 66 | -DCMAKE_IOS_INSTALL_COMBINED=YES \ 67 | -DCMAKE_INSTALL_PREFIX=build-ios/install 68 | 69 | cmake --build build-ios --target install 70 | ``` 71 | 72 | ## Side-by-Side 73 | 74 | The following Python code is copy-n-pasted from the Transformers’ tokenzier repository. 75 | 76 | ```python 77 | import regex as re 78 | 79 | pat = re.compile( 80 | r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 81 | 82 | def f(text): 83 | for token in re.findall(pat, text): 84 | print(f"token=\"{token}\"") 85 | 86 | f("we'd see you say 世界你好真实好的很啊") 87 | ``` 88 | 89 | The output is as the following. 90 | 91 | ``` 92 | token="we" 93 | token="'d" 94 | token=" " 95 | token="see" 96 | token=" " 97 | token="you" 98 | token=" say" 99 | token=" 世界你好真实好的很啊" 100 | ``` 101 | 102 | The following is the corresponding C++ code. 103 | 104 | ```c++ 105 | #include 106 | #include 107 | #include 108 | #include 109 | 110 | int main() { 111 | std::string w; 112 | std::string text = "we'd see you say 世界你好真实好的很啊"; 113 | re2::StringPiece input(text); 114 | 115 | RE2 re("('s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+\\(?!\\S\\)|\\s+)"); 116 | assert(re.ok()); // compiled; if not, see re.error(); 117 | 118 | std::string var; 119 | int value; 120 | while (RE2::FindAndConsume(&input, re, &w)) { 121 | std::cout << "token=\"" << w << "\"" << std::endl; 122 | } 123 | } 124 | ``` 125 | 126 | The following command builds it and links the RE2 static libarary. 
127 | 128 | ```bash 129 | clang++ -std=c++20 b.cc \ 130 | -I ~/w/re2/b/install/include \ 131 | -L ~/w/re2/b/install/lib -lre2 -o b 132 | ``` 133 | 134 | The output is identical to the above from the Python code. 135 | -------------------------------------------------------------------------------- /doc/u.md: -------------------------------------------------------------------------------- 1 | # Everything C++ and Python Programmers Need to Know about Unicode 2 | 3 | ## Unicode 4 | 5 | Unicode is larger character set than ASCII. It contains codepoints for non-English characters. A [codepoint](https://en.wikipedia.org/wiki/Code_point) is a numberic value that maps to a specific [character](https://en.wikipedia.org/wiki/Character_(computing)). A character is a symbol we draw on screens and papers. 6 | 7 | Unlike an ASCII codepoint ranges from 0 to 127 and takes a byte with high-bit fixed to 0, or a `char`, Unicode has more codepoints that we will need four bytes, or a `uint32_t`, or `wchar_t`, to save each codepoint. 8 | 9 | ## UTF-8 10 | 11 | Most characters in Unicode are infrequent. Inspired by Huffman coding, we want to use less than four bytes for those frequent codepoints. This idea leads to a well-known encoding algorithm, UTF-8, which encodes each Unicode codepoint into one, two, three, or four bytes. 12 | 13 | Screenshot 2023-02-07 at 5 08 03 PM 14 | 15 | ## One-byte UTF-8 Codepoints 16 | 17 | Each one-byte UTF-8 encoded codepoint corresponds to an ASCII codepoint. 18 | 19 | 20 | 21 | In the above ASCII table, the character `$` has the codepoint with decimal value 37 and, equivalently, the binary value 00100100, where the high-bit is 0. 22 | 23 | ## Multiple-byte UTF-8 Codepoints 24 | 25 | When we use a modern text editor to type C++ source code, all string literals are encoded in UTF-8. 26 | 27 | The following source code assigns a UTF-8 encoded string to a `std::string`-typed variable `text`. 
28 | 29 | ```c++ 30 | std::string text = "Hi 益"; 31 | ``` 32 | 33 | The first three characters, `H`, `i` and ` `, are in ASCII. So each of them takes one byte in an UTF-8 encoded string. We are curious to know how many bytes the characters `益` takes in this UTF-8 string. 34 | 35 | Let us do the following anatomy. 36 | 37 | ```c++ 38 | strlen(s.c_str()) // returns 6 39 | s.size() // returns 6 40 | ``` 41 | 42 | Because `std::string` is an alias of `basic_string`, which represents a sequence of `char`s. In the above example, the character `益` must take 6-3=3 bytes. 43 | 44 | The following code snippet prints each of these six bytes: 45 | 46 | ```c++ 47 | for(size_t i = 0; i < text.size(); ++i) { 48 | std::cout << " " << static_cast(static_cast(text[i])); 49 | } 50 | ``` 51 | 52 | It gives us: 53 | 54 | ``` 55 | 72 105 32 231 155 138 56 | ``` 57 | 58 | Look up the above ASCII table, we can see 59 | 60 | - 72 is `H` 61 | - 105 is `i` 62 | - 32 is ` ` (the whitespace) 63 | 64 | Converting the decimal values of the rest three bytes into binary, we get: 65 | 66 | ``` 67 | 231 = 11100111 68 | 155 = 10011011 69 | 138 = 10001010 70 | ``` 71 | 72 | Looking up the above UTF-8 encoding table, the three prefixes, 1110, 10, and 10, match the case of a three-byte codepoint. So, we remove the three prefixes and concatenate the rest. 73 | 74 | ``` 75 | 0111 011011 001010 76 | ``` 77 | 78 | Converting this binary number into decimal, we get 79 | 80 | 30410 81 | 82 | Google tells us 30410 **is** the codepoint of `益`! 83 | 84 | ## From `char` to `wchar_t` 85 | 86 | Given that the character `益` takes a value that requires more than one byte to store, and the C/C++ type `char` takes only one byte, it is illegal to assign a multi-byte Unicode codepoint to a `char`-typed variables. The following line of code will trigger a compilation error. 
87 | 88 | ```c++ 89 | char c = '益'; // character too large for enclosing character literal type 90 | ``` 91 | 92 | Therefore, to fully support Unicode, C/C++ needs a new character type, which takes four bytes -- the longest UTF-8 codepoint takes four bytes. This is `wchar_t`. 93 | 94 | However, changing only `char` into `wchar_t` is not enough. The following code give us the same error. 95 | 96 | ``` 97 | wchar_t c = '益'; // character too large for enclosing character literal type 98 | ``` 99 | 100 | This is because the literal syntax (`' '`) is for defining a `char` rather than a `wchar_t`. We need the `L`-prefix to denote that each character in the string takes four bytes. 101 | 102 | ``` 103 | wchar_t c = L'益'; // This works! 104 | ``` 105 | 106 | And we have `sizeof(c) == 4` because there are four Unicode codepoitns in the string. 107 | 108 | We have `sizeof(c[0])==4` on Linux, macOS, and other POSIX-compatible systems. It is 2 on Windows due to a POSIX-incompatibility issue. 109 | 110 | ## From `std::string` to `std::wstring` 111 | 112 | | string type | implementation | literal syntax | 113 | | ------------- | ------------- | -------------- | 114 | | `std::string` | `/* std::basic_string */` | `s = "Hi 益"` | 115 | | `std::wstring` | `/* std::basic_string */` | `w = L"Hi 益"` | 116 | 117 | We have `s.size()==6` because the UTF-i encoding of `"Hi 益"` takes 6 bytes. We have `w.size()==4` because it contains 4 Unicode codepoints. 118 | 119 | The following code snippet prints each codepoint. 120 | 121 | ```c++ 122 | for(size_t i = 0; i < text.size(); ++i) { 123 | std::cout << " " << static_cast(text[i]); 124 | } 125 | ``` 126 | 127 | and gives 128 | 129 | ``` 130 | 72 105 32 30410 131 | ``` 132 | 133 | We do not have to manually decoding it by looking up the UTF-8 prefix table. 134 | 135 | The following functions encode wstring in UTF-8 and decode UTF-8 encoded char-string to wstring. 
136 | 137 | ```c++ 138 | #include 139 | #include 140 | 141 | std::wstring utf8_to_wstring (const std::string& str) { 142 | std::wstring_convert> myconv; 143 | return myconv.from_bytes(str); 144 | } 145 | 146 | std::string wstring_to_utf8 (const std::wstring& str) { 147 | std::wstring_convert> myconv; 148 | return myconv.to_bytes(str); 149 | } 150 | ``` 151 | 152 | ## From C++ to Python 153 | 154 | Python has two data types `str` and `bytes`. A string literal has type `str`, which behave like `std::wstring` in terms that `len(s:str)` returns the number of Unicode codepoints. By UTF-8 encoding a string, we converts `str` into `bytes`, which behave more like `std::string` as it is a sequence of bytes. The following example tells more details. 155 | 156 | ```python 157 | s = "Hi 益" 158 | print(type(s)) # str 159 | print(len(s)) # 4 160 | print([ord(c) for c in s]) # [72, 105, 32, 30410] 161 | 162 | b = s.encode('utf-8') 163 | print(type(b)) # bytes 164 | print(len(b)) # 6 165 | print([i for i in b]) # [72, 105, 32, 231, 155, 138] 166 | ``` 167 | -------------------------------------------------------------------------------- /tokenizer/assets/README.md: -------------------------------------------------------------------------------- 1 | The script https://github.com/iree-org/iree-jax/blob/main/models/gpt2/setup.sh reveals how to download data files required by GPT-2. The HuggingFace tokenizer requires two files: `merges.txt` and `vocab.json`. Our simplified Python tokenizer `tokenizer/bpe.py` also requires these two files Our C++ version `tokenizer/bpe.{h,cc}` requires `vocab.txt`. 
To get `vocab.txt`, we can run `tool/json-to-txt.py`: 2 | 3 | ```bash 4 | python tool/json-to-txt.py tokenizer/assets/vocab.json > tokenizer/assets/vocab.txt 5 | ``` 6 | -------------------------------------------------------------------------------- /tokenizer/bpe.cc: -------------------------------------------------------------------------------- 1 | #include "bpe.h" 2 | 3 | std::wstring utf8_to_wstring(const std::string& str) { 4 | std::wstring_convert> myconv; 5 | return myconv.from_bytes(str); 6 | } 7 | 8 | std::string wstring_to_utf8(const std::wstring& str) { 9 | std::wstring_convert> myconv; 10 | return myconv.to_bytes(str); 11 | } 12 | 13 | std::string utf8(wchar_t c) { 14 | std::wstring w(1, c); 15 | return wstring_to_utf8(w); 16 | } 17 | 18 | void bytes_to_unicode(std::unordered_map* b2u, 19 | std::unordered_map* u2b) { 20 | auto _insert_range = [=](int start, int end) { 21 | for (int c = start; c <= end; c++) { 22 | b2u->insert({uint8_t(c), wchar_t(c)}); 23 | } 24 | }; 25 | 26 | b2u->clear(); 27 | _insert_range(L'!', L'~'); 28 | _insert_range(L'¡', L'¬'); 29 | _insert_range(L'®', L'ÿ'); 30 | 31 | int n = 0; 32 | for (int b = 0; b < 256; b++) { 33 | if (b2u->find(uint8_t(b)) == b2u->end()) { 34 | b2u->insert({uint8_t(b), wchar_t(256 + n)}); 35 | n++; 36 | } 37 | } 38 | 39 | if (u2b != NULL) { 40 | u2b->clear(); 41 | for (auto e : (*b2u)) { 42 | u2b->insert({e.second, e.first}); 43 | } 44 | } 45 | } 46 | 47 | // Given a token as a UTF8 string, encode each byte into an wchar_t 48 | void byte_encode_token(const std::string& token, 49 | const std::unordered_map& b2u, 50 | std::wstring* result) { 51 | result->resize(0); 52 | for (char c : token) { 53 | wchar_t wc = b2u.at(uint8_t(c)); 54 | result->push_back(wc); 55 | } 56 | } 57 | 58 | void load_merge_rules(std::istream& ins, BPERanks* bpe_ranks) { 59 | bpe_ranks->clear(); 60 | 61 | std::string line; 62 | int n = 0; 63 | while (std::getline(ins, line)) { 64 | if (n > 0) { // Skip the version comment. 
65 | int d = line.find(" "); // merges.txt file use ASCII space 66 | bpe_ranks->insert({{utf8_to_wstring(line.substr(0, d)), 67 | utf8_to_wstring(line.substr(d + 1))}, 68 | n - 1}); 69 | } 70 | n++; 71 | } 72 | } 73 | 74 | void get_pairs(const std::wstring& word, 75 | std::vector>* pairs) { 76 | pairs->clear(); 77 | 78 | if (word.size() < 2) return; 79 | 80 | wchar_t previous = word[0]; 81 | for (int i = 1; i < word.size(); i++) { 82 | pairs->push_back({std::wstring(1, previous), std::wstring(1, word[i])}); 83 | previous = word[i]; 84 | } 85 | } 86 | 87 | void bpe(const std::wstring& token, const BPERanks& bpe_ranks, 88 | std::vector* result) { 89 | std::set merged; // records indices in pairs that were merged. 90 | auto _left = [](int i, std::set& merged) { 91 | for (int j = i - 1; j >= -1; j--) { 92 | if (merged.find(j) == merged.end()) return j; 93 | } 94 | return -1; 95 | }; 96 | auto _right = [](int i, int cap, std::set& merged) { 97 | for (int j = i + 1; j < cap; j++) { 98 | if (merged.find(j) == merged.end()) return j; 99 | } 100 | return cap; 101 | }; 102 | 103 | std::vector> pairs; 104 | get_pairs(token, &pairs); 105 | 106 | while (true) { 107 | int min_score = INT_MAX; 108 | int to_merge = -1; // indices into pairs. 109 | 110 | for (int i = 0; i < pairs.size(); ++i) { 111 | if (merged.find(i) == merged.end()) { // pair i is not merged. 112 | auto iter = bpe_ranks.find(pairs[i]); 113 | int score = iter != bpe_ranks.end() ? 
iter->second : INT_MAX; 114 | if (score < min_score) { 115 | min_score = score; 116 | to_merge = i; 117 | } 118 | } 119 | } 120 | 121 | if (to_merge == -1) break; 122 | 123 | merged.insert(to_merge); 124 | std::wstring merge_into = pairs[to_merge].first + pairs[to_merge].second; 125 | 126 | int l = _left(to_merge, merged); 127 | if (l >= 0) pairs[l].second = merge_into; 128 | int r = _right(to_merge, pairs.size(), merged); 129 | if (r < pairs.size()) pairs[r].first = merge_into; 130 | } // end while (true) 131 | 132 | if (merged.size() == pairs.size()) 133 | result->push_back(token); 134 | else { 135 | for (int i = 0; i < pairs.size(); ++i) { 136 | if (merged.find(i) == merged.end()) { 137 | if (_left(i, merged) < 0) result->push_back(pairs[i].first); 138 | result->push_back(pairs[i].second); 139 | } 140 | } 141 | } 142 | } 143 | 144 | void _tokenize(const std::string& text, RE2& re, BPERanks& bpe_ranks, 145 | const std::unordered_map& b2u, 146 | std::vector* result) { 147 | re2::StringPiece input(text); 148 | std::string token; 149 | while (RE2::FindAndConsume(&input, re, &token)) { 150 | std::wstring wtoken; 151 | byte_encode_token(token, b2u, &wtoken); 152 | 153 | std::vector bpe_tokens; 154 | bpe(wtoken, bpe_ranks, &bpe_tokens); 155 | 156 | for (auto ws : bpe_tokens) { 157 | result->push_back(wstring_to_utf8(ws)); 158 | } 159 | } 160 | } 161 | 162 | void tokenize(const std::string& text, RE2& re, BPERanks& bpe_ranks, 163 | const std::unordered_map& b2u, 164 | std::vector* result) { 165 | const std::string eot("<|endoftext|>"); 166 | size_t s = 0; 167 | size_t i = text.find(eot); 168 | while (i != std::string::npos) { 169 | _tokenize(text.substr(s, i - s), re, bpe_ranks, b2u, result); 170 | result->push_back(eot); 171 | s = i + eot.size(); 172 | i = text.find(eot, s); 173 | } 174 | _tokenize(text.substr(s), re, bpe_ranks, b2u, result); 175 | } 176 | 177 | void load_vocab(std::istream& ins, std::unordered_map* t2i, 178 | std::unordered_map* i2t) { 179 | 
t2i->clear(); 180 | i2t->clear(); 181 | 182 | std::string line; 183 | std::string token; 184 | int n = 0; 185 | while (std::getline(ins, line)) { 186 | if (n % 2 == 0) { 187 | token = line; 188 | } else { 189 | t2i->insert({token, std::stoi(line)}); 190 | i2t->insert({std::stoi(line), token}); 191 | } 192 | n++; 193 | } 194 | } 195 | 196 | void encode(const std::string& text, RE2& re, BPERanks& bpe_ranks, 197 | std::unordered_map& b2u, 198 | const std::unordered_map& t2i, 199 | std::vector* ids) { 200 | std::vector result; 201 | tokenize(text, re, bpe_ranks, b2u, &result); 202 | ids->clear(); 203 | for (auto s : result) { 204 | ids->push_back(t2i.at(s)); 205 | } 206 | } 207 | 208 | std::string decode(const std::vector& ids, 209 | const std::unordered_map& u2b, 210 | const std::unordered_map& i2t) { 211 | std::string concat; 212 | for (int id : ids) { 213 | concat += i2t.at(id); 214 | } 215 | 216 | std::wstring w = utf8_to_wstring(concat); 217 | std::string r; 218 | for (wchar_t c : w) { 219 | r.push_back(char(u2b.at(c))); 220 | } 221 | return r; 222 | } 223 | -------------------------------------------------------------------------------- /tokenizer/bpe.h: -------------------------------------------------------------------------------- 1 | #ifndef HUGGINGFACE_TRANSFORMERS_TOKENIZER_GPT2_BPE 2 | #define HUGGINGFACE_TRANSFORMERS_TOKENIZER_GPT2_BPE 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | std::wstring utf8_to_wstring(const std::string& str); 16 | 17 | std::string wstring_to_utf8(const std::wstring& str); 18 | 19 | std::string utf8(wchar_t c); 20 | 21 | void bytes_to_unicode(std::unordered_map* b2u, 22 | std::unordered_map* u2b); 23 | 24 | void byte_encode_token(const std::string& token, 25 | const std::unordered_map& b2u, 26 | std::wstring* result); 27 | 28 | // hash_pair_wstring is used in BPERanks to make a pair of wstrings 29 | // hashable, so the pair can be used as 
the key to unordered_map. 30 | struct hash_pair_wstring { 31 | size_t operator()(const std::pair& p) const { 32 | auto hash1 = std::hash{}(p.first); 33 | auto hash2 = std::hash{}(p.second); 34 | // If hash1 == hash2, their XOR is zero. 35 | return (hash1 != hash2) ? hash1 ^ hash2 : hash1; 36 | } 37 | }; 38 | 39 | // BPERanks maps each merge rule, which is a pair of wstrings, to its 40 | // rank. This mapping allows quick lookup for the optimal merge rule. 41 | using BPERanks = std::unordered_map, int, 42 | hash_pair_wstring>; 43 | 44 | void load_merge_rules(std::istream& ins, BPERanks* bpe_ranks); 45 | 46 | void get_pairs(const std::wstring& word, 47 | std::vector >* pairs); 48 | 49 | void bpe(const std::wstring& token, const BPERanks& bpe_ranks, 50 | std::vector* result); 51 | 52 | void tokenize(const std::string& text, RE2& re, BPERanks& bpe_ranks, 53 | const std::unordered_map& b2u, 54 | std::vector* result); 55 | 56 | void load_vocab(std::istream& ins, std::unordered_map* t2i, 57 | std::unordered_map* i2t); 58 | void encode(const std::string& text, RE2& re, BPERanks& bpe_ranks, 59 | std::unordered_map& b2u, 60 | const std::unordered_map& t2i, 61 | std::vector* ids); 62 | std::string decode(const std::vector& ids, 63 | const std::unordered_map& u2b, 64 | const std::unordered_map& i2t); 65 | 66 | #endif // HUGGINGFACE_TRANSFORMERS_TOKENIZER_GPT2_BPE 67 | -------------------------------------------------------------------------------- /tokenizer/bpe.py: -------------------------------------------------------------------------------- 1 | import regex as re 2 | import json 3 | import os 4 | 5 | 6 | def bytes_to_unicode(): 7 | bs = ( 8 | list(range(ord("!"), ord("~") + 1)) 9 | + list(range(ord("¡"), ord("¬") + 1)) 10 | + list(range(ord("®"), ord("ÿ") + 1)) 11 | ) 12 | cs = bs[:] 13 | n = 0 14 | for b in range(2**8): 15 | if b not in bs: 16 | bs.append(b) 17 | cs.append(2**8 + n) 18 | n += 1 19 | cs = [chr(n) for n in cs] 20 | return dict(zip(bs, cs)) 21 | 22 | 23 | 
def get_pairs(word): 24 | """ 25 | Return set of symbol pairs in a word. 26 | 27 | Word is represented as tuple of symbols (symbols being variable-length strings). 28 | """ 29 | pairs = set() 30 | prev_char = word[0] 31 | for char in word[1:]: 32 | pairs.add((prev_char, char)) 33 | prev_char = char 34 | return pairs 35 | 36 | 37 | class Tokenizer: 38 | def __init__(self, assert_dir: str): 39 | self.unk_token = "<|unknown token|>" 40 | 41 | self.pat = re.compile( 42 | r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" 43 | ) 44 | 45 | with open(os.path.join(assert_dir, "vocab.json"), encoding="utf-8") as f: 46 | self.encoder = json.load(f) 47 | self.decoder = {v: k for k, v in self.encoder.items()} 48 | 49 | self.byte_encoder = bytes_to_unicode() 50 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 51 | 52 | self.cache = {} 53 | with open(os.path.join(assert_dir, "merges.txt"), encoding="utf-8") as f: 54 | bpe_merges = f.read().split("\n")[1:-1] 55 | bpe_merges = [tuple(merge.split()) for merge in bpe_merges] 56 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 57 | 58 | def bpe(self, token): 59 | if token in self.cache: 60 | return self.cache[token] 61 | word = tuple(token) 62 | pairs = get_pairs(word) 63 | if not pairs: 64 | return token 65 | 66 | while True: 67 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) 68 | if bigram not in self.bpe_ranks: 69 | break 70 | first, second = bigram 71 | new_word = [] 72 | i = 0 73 | while i < len(word): 74 | try: 75 | j = word.index(first, i) 76 | except ValueError: 77 | new_word.extend(word[i:]) 78 | break 79 | else: 80 | new_word.extend(word[i:j]) 81 | i = j 82 | 83 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 84 | new_word.append(first + second) 85 | i += 2 86 | else: 87 | new_word.append(word[i]) 88 | i += 1 89 | 90 | new_word = tuple(new_word) 91 | word = new_word 92 | if len(word) == 1: 93 | break 94 | 
else: 95 | pairs = get_pairs(word) 96 | word = " ".join(word) 97 | self.cache[token] = word 98 | return word 99 | 100 | def _tokenize(self, text): 101 | """Tokenize a string.""" 102 | bpe_tokens = [] 103 | for token in re.findall(self.pat, text): 104 | token = "".join( 105 | self.byte_encoder[b] for b in token.encode("utf-8") 106 | ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) 107 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) 108 | return bpe_tokens 109 | 110 | def tokenize(self, text): 111 | bpe_tokens = [] 112 | eot = "<|endoftext|>" 113 | s = 0 114 | i = text.find(eot) 115 | while i != -1: 116 | bpe_tokens.extend(self._tokenize(text[s:i])) 117 | bpe_tokens.append(eot) 118 | s = i + len(eot) 119 | i = text.find(eot, i + len(eot)) 120 | bpe_tokens.extend(self._tokenize(text[s:])) 121 | return bpe_tokens 122 | 123 | def encode(self, text): 124 | return [self.encoder[token] for token in self.tokenize(text)] 125 | 126 | def decode(self, indices): 127 | text = "".join([self.decoder.get(index) for index in indices]) 128 | return bytearray(self.byte_decoder[c] for c in text).decode("utf-8") 129 | 130 | 131 | def test_tokenizer(): 132 | t = Tokenizer(os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets")) 133 | # with open("/tmp/sample.txt") as f: 134 | # for line in f: 135 | # lst = t._tokenize(line[:-1]) # Remove the trailing '\n'. 136 | # print(*lst, sep=', ') # Do no quote strings. 
137 | candidates = [ 138 | "this is <|endoftext|> else<|endoftext|>", 139 | "<|endoftext|> else<|endoftext|>", 140 | "this is <|endoftext|> else", 141 | "this is <|endoftext|>else", 142 | "this is else", 143 | ] 144 | for s in candidates: 145 | assert t.decode(t.encode(s)) == s 146 | 147 | 148 | if __name__ == "__main__": 149 | test_tokenizer() 150 | -------------------------------------------------------------------------------- /tokenizer/bpe_test.cc: -------------------------------------------------------------------------------- 1 | #include "bpe.h" 2 | 3 | RE2 re( 4 | "('s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| " 5 | "?[^\\s\\p{L}\\p{N}]+|\\s+\\(?!\\S\\)|\\s+)"); 6 | 7 | void test_re2() { 8 | assert(re.ok()); // compiled; if not, see re.error(); 9 | 10 | std::string w; 11 | std::string text = "we'd annoyingly 顽皮"; 12 | re2::StringPiece input(text); 13 | 14 | std::vector v; 15 | while (RE2::FindAndConsume(&input, re, &w)) { 16 | v.push_back(w); 17 | } 18 | assert(v == std::vector({"we", "\'d", " annoyingly", " 顽皮"})); 19 | } 20 | 21 | void test_bytes_to_unicode() { 22 | std::unordered_map b2u; 23 | std::unordered_map u2b; 24 | bytes_to_unicode(&b2u, &u2b); 25 | assert(b2u.size() == 256); 26 | assert(b2u[0] == 0x100); 27 | assert(u2b.size() == 256); 28 | } 29 | 30 | void test_byte_encode_token() { 31 | std::unordered_map b2u; 32 | bytes_to_unicode(&b2u, NULL); 33 | 34 | std::string s(" very"); 35 | std::wstring b; 36 | byte_encode_token(s, b2u, &b); 37 | assert("Ġvery" == wstring_to_utf8(b)); 38 | } 39 | 40 | void test_load_merge_rules() { 41 | BPERanks bpe_ranks; 42 | std::fstream merges("/tmp/merges.txt", std::ios::in); 43 | load_merge_rules(merges, &bpe_ranks); 44 | assert(bpe_ranks.size() == 50000); 45 | 46 | auto iter = bpe_ranks.find({utf8_to_wstring("Ġg"), utf8_to_wstring("azed")}); 47 | assert(iter != bpe_ranks.end()); 48 | assert(iter->second = 49999); 49 | } 50 | 51 | void test_get_pairs() { 52 | std::vector > pairs; 53 | 
get_pairs(utf8_to_wstring("very"), &pairs); 54 | assert(pairs.size() == 3); 55 | assert(wstring_to_utf8(pairs[1].first) == "e"); 56 | assert(wstring_to_utf8(pairs[1].second) == "r"); 57 | } 58 | 59 | void test_bpe() { 60 | BPERanks bpe_ranks; 61 | std::fstream merges("/tmp/merges.txt", std::ios::in); 62 | load_merge_rules(merges, &bpe_ranks); 63 | assert(bpe_ranks.size() == 50000); 64 | 65 | std::vector result; 66 | bpe(utf8_to_wstring("annoyingly"), bpe_ranks, &result); 67 | assert(result == std::vector({L"ann", L"oy", L"ingly"})); 68 | 69 | result.clear(); 70 | bpe(utf8_to_wstring("very"), bpe_ranks, &result); 71 | assert(result == std::vector({L"very"})); 72 | } 73 | 74 | auto _print_string_vec = [](std::vector& v) { 75 | // To be compatible with Python's print(*lst, sep=', ') 76 | for (int i = 0; i < v.size(); ++i) { 77 | std::cout << v[i]; 78 | if (i < v.size() - 1) std::cout << ", "; 79 | } 80 | std::cout << std::endl; 81 | }; 82 | 83 | void test_tokenize() { 84 | assert(re.ok()); // compiled; if not, see re.error(); 85 | 86 | BPERanks bpe_ranks; 87 | std::fstream merges("/tmp/merges.txt", std::ios::in); 88 | load_merge_rules(merges, &bpe_ranks); 89 | 90 | std::unordered_map b2u; 91 | bytes_to_unicode(&b2u, NULL); 92 | 93 | std::vector candidates = { 94 | "this is <|endoftext|> else<|endoftext|>", 95 | "<|endoftext|> else<|endoftext|>", "this is <|endoftext|> else", 96 | "this is <|endoftext|>else", "this is else"}; 97 | for (auto s : candidates) { 98 | std::vector result; 99 | tokenize(s, re, bpe_ranks, b2u, &result); 100 | _print_string_vec(result); 101 | } 102 | } 103 | 104 | void test_tokenize_regression() { 105 | assert(re.ok()); // compiled; if not, see re.error(); 106 | 107 | BPERanks bpe_ranks; 108 | std::fstream merges("/tmp/merges.txt", std::ios::in); 109 | load_merge_rules(merges, &bpe_ranks); 110 | 111 | std::unordered_map b2u; 112 | bytes_to_unicode(&b2u, NULL); 113 | 114 | // In order to make sure that /tmp/sample.txt contains many lines of 115 
| // text of different languages, I download the lyrics data from 116 | // https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres, 117 | // and ran the following commands to randomly sample 10000 lines. 118 | /* 119 | cat /tmp/lyrics-data.csv | head -n 1000000 | awk 'NF' | sort | \ 120 | uniq | sort -R | head -n 10000 > /tmp/sample.txt 121 | */ 122 | std::fstream ins("/tmp/sample.txt", std::ios::in); 123 | std::string line; 124 | while (std::getline(ins, line)) { 125 | std::vector<std::string> result; 126 | tokenize(line, re, bpe_ranks, b2u, &result); 127 | _print_string_vec(result); 128 | } 129 | } 130 | 131 | // vocab.txt alternates token and id lines; load_vocab() builds both directions. 132 | void test_load_vocab() { 133 | std::unordered_map<std::string, int> t2i; 134 | std::unordered_map<int, std::string> i2t; 135 | std::fstream vocab_txt("/tmp/vocab.txt", std::ios::in); 136 | load_vocab(vocab_txt, &t2i, &i2t); 137 | assert(t2i.size() == 50257); 138 | assert(i2t.size() == 50257); 139 | assert(t2i["\""] == 1); 140 | assert(i2t[1] == "\""); 141 | } 142 | 143 | // Round-trip check: decode(encode(s)) must reproduce s exactly. 144 | void test_encode_decode() { 145 | BPERanks bpe_ranks; 146 | std::fstream merges("/tmp/merges.txt", std::ios::in); 147 | load_merge_rules(merges, &bpe_ranks); 148 | 149 | std::unordered_map<uint8_t, wchar_t> b2u; 150 | std::unordered_map<wchar_t, uint8_t> u2b; 151 | bytes_to_unicode(&b2u, &u2b); 152 | 153 | std::unordered_map<std::string, int> t2i; 154 | std::unordered_map<int, std::string> i2t; 155 | std::fstream vocab_txt("/tmp/vocab.txt", std::ios::in); 156 | load_vocab(vocab_txt, &t2i, &i2t); 157 | 158 | std::vector<std::string> candidates = { 159 | "this is <|endoftext|> else<|endoftext|>", 160 | "<|endoftext|> else<|endoftext|>", "this is <|endoftext|> else", 161 | "this is <|endoftext|>else", "this is else"}; 162 | for (auto s : candidates) { 163 | std::vector<int> ids; 164 | encode(s, re, bpe_ranks, b2u, t2i, &ids); 165 | assert(ids.size() > 0); 166 | assert(decode(ids, u2b, i2t) == s); 167 | } 168 | } 169 | 170 | int main() { 171 | test_bytes_to_unicode(); 172 | test_re2(); 173 | test_load_merge_rules(); 174 | test_byte_encode_token(); 175 | test_get_pairs(); 176 | test_bpe(); 177 | test_tokenize(); 178 |
test_load_vocab(); 177 | test_encode_decode(); 178 | return 0; 179 | } 180 | -------------------------------------------------------------------------------- /tool/cmp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if len(sys.argv) != 4: 4 | sys.exit(f"Usage: {sys.argv[0]} tokenization_output1 tokenization_output2 tokenization_input") 5 | 6 | o1 = open(sys.argv[1]) 7 | o2 = open(sys.argv[2]) 8 | i0 = open(sys.argv[3]) 9 | 10 | while True: 11 | l1 = o1.readline() 12 | l2 = o2.readline() 13 | li = i0.readline() 14 | 15 | if not l1 or not l2 or not li: 16 | break 17 | 18 | if (l1 != l2): 19 | print(l1[:-1]) 20 | print(l2) 21 | print(li) 22 | -------------------------------------------------------------------------------- /tool/data.py: -------------------------------------------------------------------------------- 1 | # This script finds lyrics of Frech songs in the dataset 2 | # https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres 3 | # 4 | # unzip the downloaded archive.zip file in /tmp, we get: 5 | # /tmp/lyrics-data.csv 6 | # /tmp/artists-data.csv 7 | # 8 | import pandas as pd 9 | 10 | artists = pd.read_csv("/tmp/artists-data.csv") # 4168 artists 11 | artists = artists[artists["Popularity"] > 5] # 297 popular artists 12 | 13 | lyrics = pd.read_csv("/tmp/lyrics-data.csv") # 379931 songs 14 | lyrics = lyrics[lyrics["language"] == "fr"] 15 | 16 | df = lyrics.merge( 17 | artists[["Artist", "Genres", "Link"]], left_on="ALink", right_on="Link", how="inner" 18 | ) 19 | 20 | df = df[df["Lyric"].apply(lambda x: len(x.split(" ")) < 350)] 21 | 22 | df = df["Lyric"] # 317 songs 23 | 24 | for row in df: 25 | # https://stackoverflow.com/a/2077944/724872 26 | print(" ".join(row.replace("\n", " ").split())) 27 | -------------------------------------------------------------------------------- /tool/json-to-txt.py: -------------------------------------------------------------------------------- 1 | import 
sys 2 | import json 3 | 4 | vocab_file = sys.argv[1] 5 | 6 | with open(vocab_file) as f: 7 | d = json.load(f) 8 | for k, v in d.items(): 9 | print(k) 10 | print(v) 11 | --------------------------------------------------------------------------------