├── .github └── workflows │ ├── ubuntu22-cxx20.yml │ ├── ubuntu22-sanitize.yml │ ├── ubuntu22.yml │ ├── vs17-cxx20.yml │ └── vs17.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE-APACHE ├── LICENSE-BOOST ├── LICENSE-MIT ├── README.md ├── benchmarks ├── CMakeLists.txt └── bench.cpp ├── cmake ├── add_cpp_test.cmake ├── import.cmake └── is_utf8-config.cmake.in ├── include └── is_utf8.h ├── src ├── CMakeLists.txt └── is_utf8.cpp └── tests ├── CMakeLists.txt └── unit.cpp

/.github/workflows/ubuntu22-cxx20.yml:
--------------------------------------------------------------------------------
1 | name: Ubuntu 22.04 CI (GCC 11, CXX 20)
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 |   ubuntu-build:  # skipped when a pushed commit message contains [skip ci] or [skip github]
7 |     if: >-
8 |       ! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
9 |       ! contains(toJSON(github.event.commits.*.message), '[skip github]')
10 |     runs-on: ubuntu-22.04
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - name: Use cmake  # Debug pass in builddebug/, then a pass without an explicit build type in build/; both with IS_UTF8_CXX_STANDARD=20
14 |         run: |
15 |           mkdir builddebug &&
16 |           cd builddebug &&
17 |           cmake -DCMAKE_BUILD_TYPE=Debug -DIS_UTF8_CXX_STANDARD=20 .. &&
18 |           cmake --build . &&
19 |           ctest -j --output-on-failure -LE explicitonly &&
20 |           cd .. &&
21 |           mkdir build &&
22 |           cd build &&
23 |           cmake -DIS_UTF8_CXX_STANDARD=20 .. &&
24 |           cmake --build . &&
25 |           ctest -j --output-on-failure -LE explicitonly
26 |
--------------------------------------------------------------------------------
/.github/workflows/ubuntu22-sanitize.yml:
--------------------------------------------------------------------------------
1 | name: Ubuntu 22.04 CI (GCC 11) with Sanitizers
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 |   ubuntu-build:  # skipped when a pushed commit message contains [skip ci] or [skip github]
7 |     if: >-
8 |       ! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
9 |       ! contains(toJSON(github.event.commits.*.message), '[skip github]')
10 |     runs-on: ubuntu-22.04
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - name: Use cmake  # same two passes as the plain Ubuntu workflow, configured with IS_UTF8_SANITIZE=ON
14 |         run: |
15 |           mkdir builddebug &&
16 |           cd builddebug &&
17 |           cmake -DIS_UTF8_SANITIZE=ON -DCMAKE_BUILD_TYPE=Debug .. &&
18 |           cmake --build . &&
19 |           ctest -j --output-on-failure -LE explicitonly &&
20 |           cd .. &&
21 |           mkdir build &&
22 |           cd build &&
23 |           cmake -DIS_UTF8_SANITIZE=ON .. &&
24 |           cmake --build . &&
25 |           ctest -j --output-on-failure -LE explicitonly
26 |
--------------------------------------------------------------------------------
/.github/workflows/ubuntu22.yml:
--------------------------------------------------------------------------------
1 | name: Ubuntu 22.04 CI (GCC 11)
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 |   ubuntu-build:  # skipped when a pushed commit message contains [skip ci] or [skip github]
7 |     if: >-
8 |       ! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
9 |       ! contains(toJSON(github.event.commits.*.message), '[skip github]')
10 |     runs-on: ubuntu-22.04
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - name: Use cmake  # Debug pass in builddebug/, then a pass without an explicit build type in build/
14 |         run: |
15 |           mkdir builddebug &&
16 |           cd builddebug &&
17 |           cmake -DCMAKE_BUILD_TYPE=Debug .. &&
18 |           cmake --build . &&
19 |           ctest -j --output-on-failure -LE explicitonly &&
20 |           cd .. &&
21 |           mkdir build &&
22 |           cd build &&
23 |           cmake .. &&
24 |           cmake --build . &&
25 |           ctest -j --output-on-failure -LE explicitonly
26 |
--------------------------------------------------------------------------------
/.github/workflows/vs17-cxx20.yml:
--------------------------------------------------------------------------------
1 | name: VS17-CI (CXX 20)
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 |   ci:  # skipped when a pushed commit message contains [skip ci] or [skip github]
7 |     if: >-
8 |       ! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
9 |       ! contains(toJSON(github.event.commits.*.message), '[skip github]')
10 |     name: windows-vs17
11 |     runs-on: windows-latest
12 |     strategy:
13 |       fail-fast: false
14 |       matrix:
15 |         include:  # one x64 job per BUILD_SHARED_LIBS value
16 |           - {gen: Visual Studio 17 2022, arch: x64, shared: ON}
17 |           - {gen: Visual Studio 17 2022, arch: x64, shared: OFF}
18 |     steps:
19 |       - name: checkout
20 |         uses: actions/checkout@v4
21 |       - name: Configure  # the folded scalar below is joined into a single cmake command line
22 |         run: >
23 |           cmake -G "${{matrix.gen}}" -A ${{matrix.arch}}
24 |           -DBUILD_SHARED_LIBS=${{matrix.shared}} -DIS_UTF8_CXX_STANDARD=20
25 |           -B build
26 |       - name: Build Debug
27 |         run: cmake --build build --config Debug --verbose
28 |       - name: Build Release
29 |         run: cmake --build build --config Release --verbose
30 |       - name: Run Release tests
31 |         run: |
32 |           cd build
33 |           ctest -C Release -LE explicitonly --output-on-failure
34 |       - name: Run Debug tests
35 |         run: |
36 |           cd build
37 |           ctest -C Debug -LE explicitonly --output-on-failure
38 |
--------------------------------------------------------------------------------
/.github/workflows/vs17.yml:
--------------------------------------------------------------------------------
1 | name: VS17-CI
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 |   ci:  # skipped when a pushed commit message contains [skip ci] or [skip github]
7 |     if: >-
8 |       ! contains(toJSON(github.event.commits.*.message), '[skip ci]') &&
9 |       !
contains(toJSON(github.event.commits.*.message), '[skip github]')
10 |     name: windows-vs17
11 |     runs-on: windows-latest
12 |     strategy:
13 |       fail-fast: false
14 |       matrix:
15 |         include:
16 |           - {gen: Visual Studio 17 2022, arch: x64, shared: ON}
17 |           - {gen: Visual Studio 17 2022, arch: x64, shared: OFF}
18 |     steps:
19 |       - name: checkout
20 |         uses: actions/checkout@v4
21 |       - name: Configure
22 |         run: |
23 |           cmake -G "${{matrix.gen}}" -A ${{matrix.arch}} -DBUILD_SHARED_LIBS=${{matrix.shared}} -B build
24 |       - name: Build Debug
25 |         run: cmake --build build --config Debug --verbose
26 |       - name: Build Release
27 |         run: cmake --build build --config Release --verbose
28 |       - name: Run Release tests
29 |         run: |
30 |           cd build
31 |           ctest -C Release -LE explicitonly --output-on-failure
32 |       - name: Run Debug tests
33 |         run: |
34 |           cd build
35 |           ctest -C Debug -LE explicitonly --output-on-failure
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | src/dependencies/
3 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.15)
2 |
3 | project(is_utf8
4 |   DESCRIPTION "Fast UTF-8 Validation"
5 |   LANGUAGES CXX
6 |   VERSION 1.4.1
7 | )
8 |
9 | # TRUE only when is_utf8 is the top-level project rather than being embedded in
10 | # another build (add_subdirectory, FetchContent). Gates the CPack setup below.
11 | string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}" is_top_project)
12 |
13 | include(GNUInstallDirs)
14 | include(CTest)
15 |
16 | option(IS_UTF8_SANITIZE "Sanitize addresses" OFF)
17 | option(IS_UTF8_BENCHMARKS "Build the benchmark tool (imports simdutf)" ON)
18 |
19 | # Pick a default build type for single-config generators only: Debug when
20 | # sanitizing, Release otherwise. Multi-config generators (Visual Studio, Xcode,
21 | # Ninja Multi-Config) choose the configuration at build time instead.
22 | get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
23 | if(NOT is_multi_config AND NOT CMAKE_BUILD_TYPE)
24 |   if(IS_UTF8_SANITIZE)
25 |     set(default_build_type Debug)
26 |   else()
27 |     set(default_build_type Release)
28 |   endif()
29 |   message(STATUS "No build type selected, default to ${default_build_type}")
30 |   set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING "Choose the type of build." FORCE)
31 | endif()
32 |
33 | # We compile tools, tests, etc. with C++ 11. Override yourself if you need on a
34 | # target.
35 | set(IS_UTF8_CXX_STANDARD 11 CACHE STRING "the C++ standard to use for is_utf8")
36 |
37 | set(CMAKE_CXX_STANDARD ${IS_UTF8_CXX_STANDARD})
38 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
39 | set(CMAKE_CXX_EXTENSIONS OFF)
40 | set(CMAKE_MACOSX_RPATH OFF)
41 |
42 | # Library VERSION/SOVERSION default to the project() version so the two cannot
43 | # drift apart; both remain overridable from the cache.
44 | set(IS_UTF8_LIB_VERSION "${PROJECT_VERSION}" CACHE STRING "is_utf8 library version")
45 | set(IS_UTF8_LIB_SOVERSION "${PROJECT_VERSION_MAJOR}" CACHE STRING "is_utf8 library soversion")
46 |
47 | set(IS_UTF8_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
48 | add_subdirectory(src)
49 |
50 | if(BUILD_TESTING)
51 |   message(STATUS "The tests are enabled.")
52 |   add_subdirectory(tests)
53 | else()
54 |   message(STATUS "The tests are disabled.")
55 | endif()
56 |
57 | # The benchmark directory imports simdutf; IS_UTF8_BENCHMARKS=OFF skips it.
58 | if(IS_UTF8_BENCHMARKS)
59 |   add_subdirectory(benchmarks)
60 | endif()
61 |
62 | message(STATUS "Compiling using the C++ standard:" ${CMAKE_CXX_STANDARD})
63 | # ---- Install rules ----
64 | add_library(is_utf8::is_utf8 ALIAS is_utf8)
65 |
66 | set_target_properties(
67 |   is_utf8 PROPERTIES
68 |   VERSION "${IS_UTF8_LIB_VERSION}"
69 |   SOVERSION "${IS_UTF8_LIB_SOVERSION}"
70 |   WINDOWS_EXPORT_ALL_SYMBOLS YES
71 | )
72 |
73 | include(CMakePackageConfigHelpers)
74 |
75 | install(
76 |   FILES include/is_utf8.h
77 |   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
78 |   COMPONENT is_utf8_Development
79 | )
80 |
81 | # NOTE(review): the target is attached to the export set is_utf8Targets, but no
82 | # install(EXPORT is_utf8Targets ...) rule is visible in this file; confirm how
83 | # cmake/is_utf8-config.cmake.in locates the installed target.
84 | install(
85 |   TARGETS is_utf8
86 |   EXPORT is_utf8Targets
87 |   RUNTIME COMPONENT is_utf8_Runtime
88 |   LIBRARY COMPONENT is_utf8_Runtime
89 |   NAMELINK_COMPONENT is_utf8_Development
90 |   ARCHIVE COMPONENT is_utf8_Development
91 |   INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
92 | )
93 |
94 | configure_file(cmake/is_utf8-config.cmake.in is_utf8-config.cmake @ONLY)
95 |
96 | write_basic_package_version_file(
97 |   is_utf8-config-version.cmake
98 |   COMPATIBILITY SameMinorVersion
99 | )
100 |
101 | set(
102 |   IS_UTF8_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/is_utf8"
103 |   CACHE STRING "CMake package config location relative to the install prefix"
104 | )
105 | mark_as_advanced(IS_UTF8_INSTALL_CMAKEDIR)
106 |
107 | install(
108 |   FILES
109 |   "${PROJECT_BINARY_DIR}/is_utf8-config.cmake"
110 |   "${PROJECT_BINARY_DIR}/is_utf8-config-version.cmake"
111 |   DESTINATION "${IS_UTF8_INSTALL_CMAKEDIR}"
112 |   COMPONENT is_utf8_Development
113 | )
114 |
115 | #
116 | # CPack
117 | #
118 | if(is_top_project)
119 |   set(CPACK_PACKAGE_VENDOR "Daniel Lemire")
120 |   set(CPACK_PACKAGE_CONTACT "lemire@gmail.com")
121 |   set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE-MIT")
122 |   # The RPM License tag is a license expression, not a path to a license file.
123 |   set(CPACK_RPM_PACKAGE_LICENSE "Apache-2.0 OR BSL-1.0 OR MIT")
124 |   set(CPACK_RESOURCE_FILE_README "${PROJECT_SOURCE_DIR}/README.md")
125 |   set(CPACK_SOURCE_GENERATOR "TGZ;ZIP")
126 |   include(CPack)
127 | endif()
128 |
129 | # ----
130 |
--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | NOTE: This software may be used as part of software released under the 204 | GNU General Public License (GPL) version 2. 
205 | -------------------------------------------------------------------------------- /LICENSE-BOOST: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17th, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 
24 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright 2022 The is_utf8 authors 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # is_utf8 2 | 3 | Most strings online are in unicode using the UTF-8 encoding. Validating strings 4 | quickly before accepting them is important. 5 | 6 | ## How to use is_utf8 7 | 8 | This is a simple one-source file library to validate UTF-8 strings at high 9 | speeds using SIMD instructions. It works on all platforms (ARM, x64). 10 | 11 | Build and link `is_utf8.cpp` with your project. Code usage: 12 | 13 | ```C++ 14 | #include "is_utf8.h" 15 | 16 | char * mystring = ... 
17 | bool is_it_valid = is_utf8(mystring, thestringlength); 18 | ``` 19 | 20 | It should be able to validate strings using less than 1 cycle per input byte. 21 | 22 | ## Requirements 23 | 24 | - C++11 compatible compiler. We support LLVM clang, GCC, Visual Studio. (Our 25 | optional benchmark tool requires C++17.) 26 | - For high speed, you should have a recent 64-bit system (e.g., ARM or x64). 27 | - If you rely on CMake, you should use a recent CMake (at least 3.15). 28 | - AVX-512 support requires a processor with AVX512-VBMI2 (Ice Lake or better) and 29 | a recent compiler (GCC 8 or better, Visual Studio 2019 or better, LLVM clang 6 30 | or better). You need a correspondingly recent assembler such as gas (2.30+) or 31 | nasm (2.14+): recent compilers usually come with recent assemblers. If you mix 32 | a recent compiler with an incompatible/old assembler (e.g., when using a 33 | recent compiler with an old Linux distribution), you may get errors at build 34 | time because the compiler produces instructions that the assembler does not 35 | recognize: you should update your assembler to match your compiler (e.g., 36 | upgrade binutils to version 2.30 or better under Linux) or use an older 37 | compiler matching the capabilities of your assembler. 38 | 39 | ## Build with CMake 40 | 41 | ``` 42 | cmake -B build 43 | cmake --build build 44 | cd build 45 | ctest . 46 | ``` 47 | 48 | Visual Studio users must specify whether they want to build the Release or Debug 49 | version. 50 | 51 | To run benchmarks, build and execute the `bench` command. 52 | 53 | ``` 54 | cmake -B build 55 | cmake --build build 56 | ./build/benchmarks/bench 57 | ``` 58 | 59 | Instructions are similar for Visual Studio users. 60 | 61 | ## Real-world usage 62 | 63 | This C++ library is part of the JavaScript package 64 | [utf-8-validate](https://github.com/websockets/utf-8-validate).
The 65 | utf-8-validate package is routinely downloaded more than 66 | [a million times per week](https://www.npmjs.com/package/utf-8-validate). 67 | 68 | If you are using Node.js (19.4.0 or better), you already have access to this 69 | function as 70 | [`buffer.isUtf8(input)`](https://nodejs.org/api/buffer.html#bufferisutf8input). 71 | 72 | ## Reference 73 | 74 | - John Keiser, Daniel Lemire, 75 | [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), 76 | Software: Practice & Experience 51 (5), 2021 77 | 78 | ## Want more? 79 | 80 | If you want a wide range of fast Unicode functions for production use, you can 81 | rely on the simdutf library. It is as simple as the following: 82 | 83 | ```C++ 84 | #include "simdutf.cpp" 85 | #include "simdutf.h" 86 | 87 | int main(int argc, char *argv[]) { 88 | const char *source = "1234"; 89 | // 4 == strlen(source) 90 | bool validutf8 = simdutf::validate_utf8(source, 4); 91 | if (validutf8) { 92 | std::cout << "valid UTF-8" << std::endl; 93 | } else { 94 | std::cerr << "invalid UTF-8" << std::endl; 95 | return EXIT_FAILURE; 96 | } 97 | } 98 | ``` 99 | 100 | See https://github.com/simdutf/ 101 | 102 | ## License 103 | 104 | This library is distributed under the terms of any of the following licenses, at 105 | your option: 106 | 107 | - Apache License (Version 2.0) [LICENSE-APACHE](LICENSE-APACHE), 108 | - Boost Software License [LICENSE-BOOST](LICENSE-BOOST), or 109 | - MIT License [LICENSE-MIT](LICENSE-MIT).
110 | -------------------------------------------------------------------------------- /benchmarks/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | include(${PROJECT_SOURCE_DIR}/cmake/import.cmake) 3 | set_off(SIMDUTF_TOOLS) 4 | set_off(SIMDUTF_BENCHMARKS) 5 | 6 | import_dependency(simdutf simdutf/simdutf v2.0.9) 7 | set(BUILD_TESTING OFF) 8 | add_dependency(simdutf) 9 | unset(BUILD_TESTING) 10 | 11 | add_executable(bench bench.cpp) 12 | target_link_libraries(bench PRIVATE is_utf8) 13 | target_link_libraries(bench PRIVATE simdutf) 14 | -------------------------------------------------------------------------------- /benchmarks/bench.cpp: -------------------------------------------------------------------------------- 1 | #include "is_utf8.h" 2 | #include "simdutf.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | uint64_t nano() { 13 | return std::chrono::duration_cast<::std::chrono::nanoseconds>( 14 | std::chrono::steady_clock::now().time_since_epoch()) 15 | .count(); 16 | } 17 | 18 | #ifdef _MSC_VER 19 | #define never_inline __declspec(noinline) 20 | #else 21 | #define never_inline __attribute__((noinline)) 22 | #endif 23 | 24 | // generate a string having at least length N 25 | // can exceed by up to 3 chars, returns the actual length 26 | size_t populate_utf8(char *data, size_t N) { 27 | size_t i = 0; 28 | for (; i < N;) { 29 | int w = rand() & 0xFF; 30 | if (w < 0x80) { 31 | data[i++] = 0x20; // w; 32 | } else if (w < 0xE0) { 33 | data[i++] = 0xC2 + rand() % (0xDF - 0xC2 + 1); 34 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 35 | } else if (w == 0xE0) { 36 | data[i++] = w; 37 | data[i++] = 0xA0 + rand() % (0xBF - 0xA0 + 1); 38 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 39 | } else if (w <= 0xEC) { 40 | data[i++] = w; 41 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 42 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 43 | } else 
if (w == 0xED) { 44 | data[i++] = w; 45 | data[i++] = 0x80 + rand() % (0x9F - 0x80 + 1); 46 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 47 | } else if (w <= 0xEF) { 48 | data[i++] = w; 49 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 50 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 51 | } else if (w < 0xF0) { 52 | data[i++] = 0xF1 + rand() % (0xF3 - 0xF1 + 1); 53 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 54 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 55 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 56 | } else if (w == 0xF0) { 57 | data[i++] = w; 58 | data[i++] = 0x90 + rand() % (0xBF - 0x90 + 1); 59 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 60 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 61 | } else if (w <= 0xF3) { 62 | data[i++] = 0xF1 + rand() % (0xF3 - 0xF1 + 1); 63 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 64 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 65 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 66 | } else if (w == 0xF4) { 67 | data[i++] = w; 68 | data[i++] = 0x80 + rand() % (0x8F - 0x80 + 1); 69 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 70 | data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); 71 | } 72 | } 73 | data[i] = '\0'; 74 | return i; 75 | }

// Returns true when `c` has the bit pattern 10xxxxxx (0x80..0xBF), i.e. when
// it is a UTF-8 continuation (trailing) byte.
static inline bool is_continuation_byte(unsigned char c) {
  return (c & 0xC0) == 0x80;
}

// Scalar reference validator, copied in part from Guava.
//
// Returns true when the first `length` bytes at `b` are well-formed UTF-8 and
// false otherwise. Truncated sequences, overlong encodings, surrogate code
// points (U+D800..U+DFFF) and code points above U+10FFFF are rejected.
//
// The Guava code compares signed Java bytes, so its `> (byte) 0xBF` test
// rejects every byte outside 0x80..0xBF. On `unsigned char` the same
// comparison only rejects 0xC0..0xFF and would let ASCII bytes through as
// trailing bytes, hence the explicit mask test used here.
static never_inline bool basic_validate_utf8(const char *b, size_t length) {
  const unsigned char *bytes = (const unsigned char *)b;
  for (size_t index = 0;;) {
    unsigned char byte1;

    do { // fast ASCII path
      if (index >= length) {
        return true;
      }
      byte1 = bytes[index++];
    } while (byte1 < 0x80);

    if (byte1 < 0xE0) {
      // Two-byte form (also catches stray continuation bytes 0x80..0xBF and
      // the overlong leads 0xC0/0xC1 through the `< 0xC2` test).
      if (index == length) {
        return false;
      }
      if (byte1 < 0xC2 || !is_continuation_byte(bytes[index++])) {
        return false;
      }
    } else if (byte1 < 0xF0) {
      // Three-byte form: two more bytes must be available.
      if (index + 1 >= length) {
        return false;
      }
      const unsigned char byte2 = bytes[index++];
      if (!is_continuation_byte(byte2)
          // Overlong? 5 most significant bits must not all be zero.
          || (byte1 == 0xE0 && byte2 < 0xA0)
          // Check for illegal surrogate codepoints.
          || (byte1 == 0xED && byte2 >= 0xA0)
          // Third byte trailing-byte test.
          || !is_continuation_byte(bytes[index++])) {
        return false;
      }
    } else {
      // Four-byte form: three more bytes must be available.
      if (index + 2 >= length) {
        return false;
      }
      const unsigned char byte2 = bytes[index++];
      if (!is_continuation_byte(byte2)
          // Check that 1 <= plane <= 16. Written out explicitly rather than
          // with the shift trick, which overflows a signed int here.
          || byte1 > 0xF4
          || (byte1 == 0xF0 && byte2 < 0x90)
          || (byte1 == 0xF4 && byte2 > 0x8F)
          // Third byte trailing-byte test.
          || !is_continuation_byte(bytes[index++])
          // Fourth byte trailing-byte test.
          || !is_continuation_byte(bytes[index++])) {
        return false;
      }
    }
  }
}

// Calls `validate(input, N)` repeatedly until at least 500000000 ticks of
// nano() have elapsed (presumably nanoseconds, since the result is printed as
// GB/s), then prints "<label> <throughput> GB/s".
//
// Every result is folded into `isgood`; it is volatile so that the compiler
// has to keep the store on each iteration. Plain assignment is used instead of
// `&=` because compound assignment on a volatile is deprecated in C++20.
template <typename Validator>
static void report_throughput(const char *label, const char *input, size_t N,
                              volatile bool &isgood, Validator validate) {
  const uint64_t threshold = 500000000;
  const uint64_t start = nano();
  uint64_t finish = start;
  size_t count{0};
  while (finish - start < threshold) {
    count++;
    const bool ok = validate(input, N);
    isgood = isgood && ok;
    finish = nano();
  }
  double t = (N * count) / double(finish - start);

  printf("%s %f GB/s\n", label, t);
}

// Measures the three validators, in a fixed order, on the same buffer.
// Returns true only if every call of every validator accepted the input.
static bool run_all_validators(const char *input, size_t N) {
  volatile bool isgood{true};
  report_throughput("basic_validate_utf8", input, N, isgood,
                    [](const char *p, size_t n) {
                      return basic_validate_utf8(p, n);
                    });
  report_throughput("simdutf", input, N, isgood,
                    [](const char *p, size_t n) {
                      return simdutf::validate_utf8(p, n);
                    });
  report_throughput("is_utf8", input, N, isgood,
                    [](const char *p, size_t n) { return is_utf8(p, n); });
  return isgood;
}

// Benchmarks the validators on N zero bytes (pure ASCII input).
// Returns true if all validators accepted the buffer on every run.
bool zerobuffer_bench(size_t N) {
  printf("zero buffer \n");
  printf("string size = %zu \n", N);
  char *input = new char[N]{};
  const bool isgood = run_all_validators(input, N);
  delete[] input;
  printf("\n");
  return isgood;
}

// Benchmarks the validators on N bytes filled by populate_utf8().
// Returns true if all validators accepted the buffer on every run.
bool bench(size_t N) {
  printf("random UTF-8\n");
  printf("string size = %zu \n", N);
  // Zero-initialised: the length returned by populate_utf8() is ignored and
  // all N bytes are validated, so any bytes it leaves unwritten must still be
  // valid input rather than indeterminate memory.
  // NOTE(review): populate_utf8() stores a '\0' at data[i] after its loop;
  // confirm that index can never reach N for a buffer of exactly N bytes.
  char *input = new char[N]{};
  populate_utf8(input, N);
  const bool isgood = run_all_validators(input, N);
  delete[] input;
  printf("\n");
  return isgood;
}

// Runs every benchmark at each size. The calls are sequenced explicitly
// because the operands of `&` may be evaluated in any order, which made the
// order of the printed results compiler-dependent. `&=` does not
// short-circuit, so all benchmarks run even after a failure.
int main() {
  const size_t sizes[] = {40096, 100000, 50000};
  bool ok = true;
  for (size_t n : sizes) {
    ok &= bench(n);
  }
  for (size_t n : sizes) {
    ok &= zerobuffer_bench(n);
  }
  return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
-------------------------------------------------------------------------------- /cmake/add_cpp_test.cmake: -------------------------------------------------------------------------------- 1 | # Helper so we don't have to repeat ourselves so much 2 | # Usage: add_cpp_test(testname [COMPILE_ONLY] [SOURCES a.cpp b.cpp ...] [LABELS acceptance per_implementation ...]) 3 | # SOURCES defaults to testname.cpp if not specified. 4 | function(add_cpp_test TEST_NAME) 5 | # Parse arguments 6 | cmake_parse_arguments(PARSE_ARGV 1 ARGS "COMPILE_ONLY;LIBRARY;WILL_FAIL" "" "SOURCES;LABELS;DEPENDENCY_OF") 7 | if (NOT ARGS_SOURCES) 8 | list(APPEND ARGS_SOURCES ${TEST_NAME}.cpp) 9 | endif() 10 | if (ARGS_COMPILE_ONLY) 11 | list(APPEND ${ARGS_LABELS} compile_only) 12 | endif() 13 | if (IS_UTF8_SANITIZE) 14 | add_compile_options(-fsanitize=address -fno-omit-frame-pointer -fno-sanitize-recover=all) 15 | add_compile_definitions(ASAN_OPTIONS=detect_leaks=1) 16 | endif() 17 | # Add the compile target 18 | if (ARGS_LIBRARY) 19 | add_library(${TEST_NAME} STATIC ${ARGS_SOURCES}) 20 | else(ARGS_LIBRARY) 21 | add_executable(${TEST_NAME} ${ARGS_SOURCES}) 22 | endif(ARGS_LIBRARY) 23 | 24 | # Add test 25 | if (ARGS_COMPILE_ONLY OR ARGS_LIBRARY) 26 | add_test( 27 | NAME ${TEST_NAME} 28 | COMMAND ${CMAKE_COMMAND} --build . --target ${TEST_NAME} --config $ 29 | WORKING_DIRECTORY ${PROJECT_BINARY_DIR} 30 | ) 31 | set_target_properties(${TEST_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE EXCLUDE_FROM_DEFAULT_BUILD TRUE) 32 | else() 33 | add_test(${TEST_NAME} ${TEST_NAME}) 34 | 35 | # Add to