├── .gitignore ├── CMakeLists.txt ├── LICENSE.txt ├── README.md ├── TODO.txt ├── cmake ├── CMakeLists.txt └── LibUtf8Config.cmake ├── conf ├── CMakeLists.txt └── unicode │ ├── CMakeLists.txt │ ├── DerivedAge.txt │ ├── Jamo.txt │ ├── LICENSE.txt │ ├── NameAliases.txt │ ├── README.md │ └── UnicodeData.txt ├── debian ├── changelog ├── compat ├── control ├── copyright ├── docs ├── libutf8-dev.install ├── libutf8-doc.install ├── libutf8.install ├── rules └── source │ └── options ├── doc ├── CMakeLists.txt ├── footer.html ├── libutf8.doxy.in └── libutf8.png ├── libutf8 ├── CMakeLists.txt ├── base.cpp ├── base.h ├── caseinsensitivestring.h ├── exception.h ├── iterator.cpp ├── iterator.h ├── json_tokens.cpp ├── json_tokens.h ├── libutf8.cpp ├── libutf8.h ├── unicode_data.cpp ├── unicode_data.h ├── unicode_data_file.cpp ├── unicode_data_file.h ├── version.cpp └── version.h.in ├── mk ├── tests ├── CMakeLists.txt ├── catch_bom.cpp ├── catch_caseinsensitive.cpp ├── catch_character.cpp ├── catch_iterator.cpp ├── catch_json_tokens.cpp ├── catch_length.cpp ├── catch_main.cpp ├── catch_main.h ├── catch_stream.cpp ├── catch_string.cpp ├── catch_valid.cpp ├── catch_version.cpp ├── example-for-show-utf16.txt ├── example-for-show-utf32.txt ├── example-for-show-utf8.txt ├── unicode │ ├── LICENSE.txt │ └── NormalizationTest.txt └── verify-show-unicode.sh └── tools ├── CMakeLists.txt ├── show_unicode.cpp └── unicode_data_parser.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | tmp 2 | *.sw? 3 | seed.txt 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2000-2023 Made to Order Software Corp. 
All Rights Reserved 2 | # 3 | # https://snapwebsites.org/project/libutf8 4 | # contact@m2osw.com 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License along 17 | # with this program; if not, write to the Free Software Foundation, Inc., 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | cmake_minimum_required(VERSION 3.10.2) 21 | 22 | project(utf8_library) 23 | 24 | find_package(SnapCMakeModules REQUIRED) 25 | find_package(LibExcept REQUIRED) 26 | find_package(SnapDev REQUIRED) 27 | 28 | SnapGetVersion(LIBUTF8 ${CMAKE_CURRENT_SOURCE_DIR}) 29 | 30 | include_directories( 31 | ${PROJECT_SOURCE_DIR} 32 | ${CMAKE_CURRENT_BINARY_DIR} 33 | ) 34 | 35 | add_subdirectory(libutf8) 36 | add_subdirectory(tools ) 37 | add_subdirectory(conf ) 38 | add_subdirectory(doc ) 39 | add_subdirectory(cmake ) 40 | add_subdirectory(tests ) 41 | 42 | # vim: ts=4 sw=4 et 43 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006-2023 Made to Order Software Corp. 
All Rights Reserved 2 | 3 | https://snapwebsites.org/ 4 | contact@m2osw.com 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |

3 | advgetopt 5 |

6 | 7 | # Introduction 8 | 9 | The libutf8 library is a helper library to handle UTF-8 strings in C++. 10 | Although C++11 added `char32_t` (and `char16_t`) and C++20 added 11 | `char8_t`, the conversions are still not seamless between each type 12 | (although it is becoming easier to handle such.) 13 | 14 | This library proposes automated conversions between `std::string` (viewed 15 | as UTF-8 in nearly all of our code) and `std::u32string` (a.k.a. UTF-32 16 | strings.) 17 | 18 | # Reasons Behind Having Our Own Library 19 | 20 | All the libraries I've seen are either in C and very cumbersome to use or 21 | offer an interface which depends on the current `LOCALE`. In other words, 22 | the system default `mbstowc()` function, for example, does not always view 23 | the input string as UTF-8. That also means there are complexities and thus 24 | inefficiencies in determining which conversion to use. 25 | 26 | In our case, we always have UTF-8 as input and output and at times we need 27 | to handle the characters as UTF-32. For example, to transform the character 28 | to uppercase, it is necessary to have a UTF-32 character. 29 | 30 | # API 31 | 32 | ## String Conversions 33 | 34 | The library offers two conversion functions as follows: 35 | 36 | libutf8::to_u8string(std::u32string const & str); 37 | libutf8::to_u32string(std::string const & str); 38 | 39 | As time passes, we will add other conversions so as to support all formats 40 | although at this point these two are the only two we need in Snap! Websites. 41 | 42 | Here is an example of usage: 43 | 44 | std::string u8; 45 | 46 | u8 = u8"This is a UTF-8 string"; 47 | 48 | std::u32string u32; 49 | u32 = libutf8::to_u32string(u8); 50 | 51 | std::string back; 52 | back = libutf8::to_u8string(u32); 53 | 54 | Note that the u8 string could be _more_ UTF-8 by including characters outside 55 | of the ASCII range and it would still work as you would expect. 
56 | 57 | ### String Length in Characters 58 | 59 | The library offers the `u8length()` function which computes the length of 60 | a UTF-8 string. Note that this does not verify whether the UTF-8 data is 61 | valid. It very quickly counts the number of non-continuation bytes (i.e. 62 | bytes that are not between 0x80 and 0xBF inclusive.) 63 | 64 | std::string u8("Your UTF-8 string"); 65 | size_t length = libutf8::u8length(u8); 66 | 67 | ### Case Insensitive Compare 68 | 69 | In most cases, you can compare two UTF-8 strings with the normal `==` 70 | operator. Once in a while, though, you may want to compare them case 71 | insensitively. 72 | 73 | Like with the iterator below, we wanted to offer a function that allows 74 | you to compare two UTF-8 strings properly and as quickly as possible. 75 | This meant not having to convert the entire strings before doing the 76 | compare because having to do so means allocating memory for both 77 | strings just to do the compare and the conversion would convert the 78 | entire strings instead of just what's necessary. 79 | 80 | Out of these constraints we created the `u8casecmp()` function. It 81 | takes two UTF-8 strings and compares the characters one at a time. 82 | Unless the strings are equal, only the number of characters up to 83 | the first non-equal one, will be converted. 84 | 85 | std::string a("First String"); 86 | std::string b("First Test"); 87 | 88 | int r(libutf8::u8casecmp(a, b)); 89 | if(r == 0) 90 | { 91 | std::cout << "a and b are equal" << std::endl; 92 | } 93 | else if(r < 0) 94 | { 95 | std::cout << "a comes before b" << std::endl; 96 | } 97 | else //if(r > 0) 98 | { 99 | std::cout << "a comes after b" << std::endl; 100 | } 101 | 102 | WARNING: the function does no collation, so it is not going to take the 103 | language into account. It uses lowercase characters, as suggested by the 104 | Unicode standard, but outside of that, the compare is binary. 
105 | 106 | ## UTF-8 Iterator 107 | 108 | We often have an `std::string` representing UTF-8 and we want 109 | to iterate the content as UTF-32 characters. Although we could convert 110 | the string to a full `std::u32string` and then iterate through the 111 | `std::u32string`, that (1) requires a copy and (2) uses four times 112 | the amount of memory (five times if you include the `std::string` size...) 113 | Note also that the copy requires a `malloc()` and later a `free()` once 114 | done with it. 115 | 116 | The iterator solves these problems by allowing us to iterate through the 117 | `std::string` and getting the next or previous Unicode character without 118 | having to use any more memory. The conversion itself is slightly slower 119 | than converting a string all at once, but doing a `malloc()` to get the 120 | `std::u32string` is definitely going to be way slower than our iterator 121 | in nearly all circumstances. 122 | 123 | The following example shows the code point of each character, one per line: 124 | 125 | std::string u8("This is your UTF-8 string"); 126 | 127 | for(libutf8::utf8_iterator it(u8); 128 | it != u8.end(); 129 | ++it) 130 | { 131 | std::cout << static_cast<int>(*it) << std::endl; 132 | } 133 | 134 | You can compare the iterator against standard `std::string` iterators with `==` and `!=`. The 135 | `++` and `--` operators work as expected. If you do a `++` when already 136 | at the end, nothing happens. If you do a `--` when already at the beginning, 137 | nothing happens. 138 | 139 | Once you are at the end, getting the character (`*it`) returns `libutf8::EOS`. 140 | So you can loop through until you get `libutf8::EOS` instead of checking 141 | against the end iterator: 142 | 143 | std::string u8("This is your UTF-8 string"); 144 | 145 | libutf8::utf8_iterator it(u8); 146 | while(*it != libutf8::EOS) 147 | { 148 | std::cout << static_cast<int>(*it++) << std::endl; 149 | } 150 | 151 | Remember that a good optimization is to avoid the post increment. 
It will 152 | be faster to do: 153 | 154 | char32_t c = *it; 155 | ++it; 156 | 157 | because you avoid a copy of the iterator (even though it's only 16 bytes...) 158 | 159 | ## Low Level Functions 160 | 161 | We expose the low level functions such as `mbstowc()` for edge cases where 162 | you may not have an `std::string`. Those functions should not be used if 163 | at all possible because they require proper handling of the buffers passed 164 | to them. One mistake in that handling and you could end up with a crashing bug in your 165 | code. 166 | 167 | # TODO 168 | 169 | ## Auto-Conversions 170 | 171 | Conversions for many more types of strings such as all the `char *` 172 | and also look into whether implementing an extension to the 173 | `std::basic_string` would be possible to directly have conversions 174 | integrated in our strings (i.e. to be able to write `str8 = str32;` and 175 | `str32 = str8` without having to write `str8 = libutf8::to_u8string(str32)`.) 176 | 177 | ## Canonicalization 178 | 179 | Right now, we do not try to canonicalize the strings, so diacritics may 180 | appear as standalone or combined characters. We want to implement the 181 | necessary code to decompose and re-compose strings in a normalized manner. 182 | 183 | This is very important for comparing strings against each other for 184 | equality (i.e. an 'a' with a grave accent is equal to an 'a' followed 185 | by the grave accent character). 186 | 187 | ## Character Name, Type, etc. 188 | 189 | The UnicodeData.txt file (offered by the Unicode website) lists all the 190 | characters with their name and their types. We want to offer the user 191 | access to that data. 192 | 193 | We should simply have the table as a struct and return a pointer to 194 | the corresponding character. Sort those by character number and use 195 | a binary search to find the structure. 196 | 197 | Some of that information is to be used for the canonicalization so it 198 | is a must have. 
199 | 200 | UnicodeData.txt file format is defined in: 201 | http://www.unicode.org/L2/L1999/UnicodeData.html 202 | 203 | 204 | 205 | # License 206 | 207 | The source is covered by the MIT license. The debian folder is covered 208 | by the GPL 2.0. 209 | 210 | 211 | # Bugs 212 | 213 | Submit bug reports and patches on 214 | [github](https://github.com/m2osw/libutf8/issues). 215 | 216 | 217 | _This file is part of the [snapcpp project](https://snapwebsites.org/)._ 218 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | 2 | * `utf8lint` verify that a file is valid UTF-8 (see show-unicode, we can have a softlink instead and if called utf8lint, assume --quiet). 3 | * Enhance `show-unicode`: 4 | - Support a range (so we can see the characters in a given range). 5 | - Actually do a validation step. 6 | * Add a reverse() function which works correctly with a UTF-8 string. 7 | * Add a reverse() function which works correctly with a UTF-16 string. 8 | * Add a fix() function which takes UTF-32/16 and removes any invalid characters (UTF-8 is done). 9 | * Add a "lexer base" which is to read an input file one character at a time 10 | like a lexer getc() generally does and return char32_t characters 11 | (see basic-xml for an example on how this is done and convert that one to 12 | using this new "lexer base") 13 | 14 | -------------------------------------------------------------------------------- /cmake/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2000-2023 Made to Order Software Corp. 
All Rights Reserved 2 | # 3 | # https://snapwebsites.org/project/libutf8 4 | # contact@m2osw.com 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License along 17 | # with this program; if not, write to the Free Software Foundation, Inc., 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | project(libutf8_cmake) 21 | 22 | install( 23 | FILES 24 | LibUtf8Config.cmake 25 | 26 | DESTINATION 27 | share/cmake/LibUtf8 28 | ) 29 | 30 | # vim: ts=4 sw=4 et nocindent 31 | -------------------------------------------------------------------------------- /cmake/LibUtf8Config.cmake: -------------------------------------------------------------------------------- 1 | # - Find LibUtf8 2 | # 3 | # LIBUTF8_FOUND - System has LibUtf8 4 | # LIBUTF8_INCLUDE_DIRS - The LibUtf8 include directories 5 | # LIBUTF8_LIBRARIES - The libraries needed to use LibUtf8 6 | # LIBUTF8_DEFINITIONS - Compiler switches required for using LibUtf8 7 | # 8 | # License: 9 | # 10 | # Copyright (c) 2011-2023 Made to Order Software Corp. All Rights Reserved 11 | # 12 | # https://snapwebsites.org/project/libutf8 13 | # contact@m2osw.com 14 | # 15 | # This program is free software: you can redistribute it and/or modify 16 | # it under the terms of the GNU General Public License as published by 17 | # the Free Software Foundation, either version 3 of the License, or 18 | # (at your option) any later version. 
19 | # 20 | # This program is distributed in the hope that it will be useful, 21 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 22 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23 | # GNU General Public License for more details. 24 | # 25 | # You should have received a copy of the GNU General Public License 26 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 27 | 28 | find_path( 29 | LIBUTF8_INCLUDE_DIR 30 | libutf8/libutf8.h 31 | 32 | PATHS 33 | ENV LIBUTF8_INCLUDE_DIR 34 | ) 35 | 36 | find_library( 37 | LIBUTF8_LIBRARY 38 | utf8 39 | 40 | PATHS 41 | ${LIBUTF8_LIBRARY_DIR} 42 | ENV LIBUTF8_LIBRARY 43 | ) 44 | 45 | mark_as_advanced( 46 | LIBUTF8_INCLUDE_DIR 47 | LIBUTF8_LIBRARY 48 | ) 49 | 50 | set(LIBUTF8_INCLUDE_DIRS ${LIBUTF8_INCLUDE_DIR}) 51 | set(LIBUTF8_LIBRARIES ${LIBUTF8_LIBRARY}) 52 | 53 | include(FindPackageHandleStandardArgs) 54 | 55 | find_package_handle_standard_args( 56 | LibUtf8 57 | REQUIRED_VARS 58 | LIBUTF8_INCLUDE_DIR 59 | LIBUTF8_LIBRARY 60 | ) 61 | 62 | # vim: ts=4 sw=4 et 63 | -------------------------------------------------------------------------------- /conf/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | # 3 | # https://snapwebsites.org/project/libutf8 4 | # contact@m2osw.com 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 
15 | # 16 | # You should have received a copy of the GNU General Public License along 17 | # with this program; if not, write to the Free Software Foundation, Inc., 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | project(utf8_library_conf) 21 | 22 | add_subdirectory(unicode) 23 | 24 | # vim: ts=4 sw=4 et 25 | -------------------------------------------------------------------------------- /conf/unicode/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2012-2023 Made to Order Software Corp. All Rights Reserved 2 | # 3 | # https://snapwebsites.org/project/libutf8 4 | # contact@m2osw.com 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License along 17 | # with this program; if not, write to the Free Software Foundation, Inc., 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | ## 21 | ## unicode-data 22 | ## 23 | project(unicode-data) 24 | 25 | install( 26 | FILES 27 | DerivedAge.txt 28 | Jamo.txt 29 | NameAliases.txt 30 | UnicodeData.txt 31 | 32 | DESTINATION 33 | share/libutf8/unicode 34 | ) 35 | 36 | 37 | # vim: ts=4 sw=4 et 38 | -------------------------------------------------------------------------------- /conf/unicode/Jamo.txt: -------------------------------------------------------------------------------- 1 | # Jamo-13.0.0.txt 2 | # Date: 2019-09-09, 19:46:00 GMT [KW, LI] 3 | # © 2019 Unicode®, Inc. 4 | # For terms of use, see http://www.unicode.org/terms_of_use.html 5 | # 6 | # Unicode Character Database 7 | # For documentation, see http://www.unicode.org/reports/tr44/ 8 | # 9 | # This file defines the Jamo_Short_Name property. 10 | # 11 | # See Section 3.12 of The Unicode Standard, Version 13.0 12 | # for more information. 13 | # 14 | # Each line contains two fields, separated by a semicolon. 15 | # 16 | # The first field gives the code point, in 4-digit hexadecimal 17 | # form, of a conjoining jamo character that participates in the 18 | # algorithmic determination of Hangul syllable character names. 19 | # The second field gives the Jamo_Short_Name as a one-, two-, 20 | # or three-character ASCII string (or in one case, for U+110B, 21 | # the null string). 
22 | # 23 | # ############################################################# 24 | 25 | 1100; G # HANGUL CHOSEONG KIYEOK 26 | 1101; GG # HANGUL CHOSEONG SSANGKIYEOK 27 | 1102; N # HANGUL CHOSEONG NIEUN 28 | 1103; D # HANGUL CHOSEONG TIKEUT 29 | 1104; DD # HANGUL CHOSEONG SSANGTIKEUT 30 | 1105; R # HANGUL CHOSEONG RIEUL 31 | 1106; M # HANGUL CHOSEONG MIEUM 32 | 1107; B # HANGUL CHOSEONG PIEUP 33 | 1108; BB # HANGUL CHOSEONG SSANGPIEUP 34 | 1109; S # HANGUL CHOSEONG SIOS 35 | 110A; SS # HANGUL CHOSEONG SSANGSIOS 36 | 110B; # HANGUL CHOSEONG IEUNG 37 | 110C; J # HANGUL CHOSEONG CIEUC 38 | 110D; JJ # HANGUL CHOSEONG SSANGCIEUC 39 | 110E; C # HANGUL CHOSEONG CHIEUCH 40 | 110F; K # HANGUL CHOSEONG KHIEUKH 41 | 1110; T # HANGUL CHOSEONG THIEUTH 42 | 1111; P # HANGUL CHOSEONG PHIEUPH 43 | 1112; H # HANGUL CHOSEONG HIEUH 44 | 1161; A # HANGUL JUNGSEONG A 45 | 1162; AE # HANGUL JUNGSEONG AE 46 | 1163; YA # HANGUL JUNGSEONG YA 47 | 1164; YAE # HANGUL JUNGSEONG YAE 48 | 1165; EO # HANGUL JUNGSEONG EO 49 | 1166; E # HANGUL JUNGSEONG E 50 | 1167; YEO # HANGUL JUNGSEONG YEO 51 | 1168; YE # HANGUL JUNGSEONG YE 52 | 1169; O # HANGUL JUNGSEONG O 53 | 116A; WA # HANGUL JUNGSEONG WA 54 | 116B; WAE # HANGUL JUNGSEONG WAE 55 | 116C; OE # HANGUL JUNGSEONG OE 56 | 116D; YO # HANGUL JUNGSEONG YO 57 | 116E; U # HANGUL JUNGSEONG U 58 | 116F; WEO # HANGUL JUNGSEONG WEO 59 | 1170; WE # HANGUL JUNGSEONG WE 60 | 1171; WI # HANGUL JUNGSEONG WI 61 | 1172; YU # HANGUL JUNGSEONG YU 62 | 1173; EU # HANGUL JUNGSEONG EU 63 | 1174; YI # HANGUL JUNGSEONG YI 64 | 1175; I # HANGUL JUNGSEONG I 65 | 11A8; G # HANGUL JONGSEONG KIYEOK 66 | 11A9; GG # HANGUL JONGSEONG SSANGKIYEOK 67 | 11AA; GS # HANGUL JONGSEONG KIYEOK-SIOS 68 | 11AB; N # HANGUL JONGSEONG NIEUN 69 | 11AC; NJ # HANGUL JONGSEONG NIEUN-CIEUC 70 | 11AD; NH # HANGUL JONGSEONG NIEUN-HIEUH 71 | 11AE; D # HANGUL JONGSEONG TIKEUT 72 | 11AF; L # HANGUL JONGSEONG RIEUL 73 | 11B0; LG # HANGUL JONGSEONG RIEUL-KIYEOK 74 | 11B1; LM # HANGUL JONGSEONG 
RIEUL-MIEUM 75 | 11B2; LB # HANGUL JONGSEONG RIEUL-PIEUP 76 | 11B3; LS # HANGUL JONGSEONG RIEUL-SIOS 77 | 11B4; LT # HANGUL JONGSEONG RIEUL-THIEUTH 78 | 11B5; LP # HANGUL JONGSEONG RIEUL-PHIEUPH 79 | 11B6; LH # HANGUL JONGSEONG RIEUL-HIEUH 80 | 11B7; M # HANGUL JONGSEONG MIEUM 81 | 11B8; B # HANGUL JONGSEONG PIEUP 82 | 11B9; BS # HANGUL JONGSEONG PIEUP-SIOS 83 | 11BA; S # HANGUL JONGSEONG SIOS 84 | 11BB; SS # HANGUL JONGSEONG SSANGSIOS 85 | 11BC; NG # HANGUL JONGSEONG IEUNG 86 | 11BD; J # HANGUL JONGSEONG CIEUC 87 | 11BE; C # HANGUL JONGSEONG CHIEUCH 88 | 11BF; K # HANGUL JONGSEONG KHIEUKH 89 | 11C0; T # HANGUL JONGSEONG THIEUTH 90 | 11C1; P # HANGUL JONGSEONG PHIEUPH 91 | 11C2; H # HANGUL JONGSEONG HIEUH 92 | 93 | # EOF 94 | -------------------------------------------------------------------------------- /conf/unicode/LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/conf/unicode/LICENSE.txt -------------------------------------------------------------------------------- /conf/unicode/NameAliases.txt: -------------------------------------------------------------------------------- 1 | # NameAliases-13.0.0.txt 2 | # Date: 2019-09-09, 19:47:00 GMT [KW, LI] 3 | # © 2019 Unicode®, Inc. 4 | # For terms of use, see http://www.unicode.org/terms_of_use.html 5 | # 6 | # Unicode Character Database 7 | # For documentation, see http://www.unicode.org/reports/tr44/ 8 | # 9 | # This file is a normative contributory data file in the 10 | # Unicode Character Database. 11 | # 12 | # This file defines the formal name aliases for Unicode characters. 13 | # 14 | # For informative aliases, see NamesList.txt 15 | # 16 | # The formal name aliases are divided into five types, each with a distinct label. 17 | # 18 | # Type Labels: 19 | # 20 | # 1. correction 21 | # Corrections for serious problems in the character names 22 | # 2. 
control 23 | # ISO 6429 names for C0 and C1 control functions, and other 24 | # commonly occurring names for control codes 25 | # 3. alternate 26 | # A few widely used alternate names for format characters 27 | # 4. figment 28 | # Several documented labels for C1 control code points which 29 | # were never actually approved in any standard 30 | # 5. abbreviation 31 | # Commonly occurring abbreviations (or acronyms) for control codes, 32 | # format characters, spaces, and variation selectors 33 | # 34 | # The formal name aliases are part of the Unicode character namespace, which 35 | # includes the character names and the names of named character sequences. 36 | # The inclusion of ISO 6429 names and other commonly occurring names and 37 | # abbreviations for control codes and format characters as formal name aliases 38 | # is to help avoid name collisions between Unicode character names and the 39 | # labels which commonly appear in text and/or in implementations such as regex, for 40 | # control codes (which for historical reasons have no Unicode character name) 41 | # or for format characters. 42 | # 43 | # For documentation, see NamesList.html and http://www.unicode.org/reports/tr44/ 44 | # 45 | # FORMAT 46 | # 47 | # Each line has three fields, as described here: 48 | # 49 | # First field: Code point 50 | # Second field: Alias 51 | # Third field: Type 52 | # 53 | # The type labels used are defined above. As for property values, comparisons 54 | # of type labels should ignore case. 55 | # 56 | # The type labels can be mapped to other strings for display, if desired. 57 | # 58 | # In case multiple aliases are assigned, additional aliases 59 | # are provided on separate lines. Parsers of this data file should 60 | # take note that the same code point can (and does) occur more than once. 61 | # 62 | # Note that currently the only instances of multiple aliases of the same 63 | # type for a single code point are either of type "control" or "abbreviation". 
64 | # An alias of type "abbreviation" can, in principle, be added for any code 65 | # point, although currently aliases of type "correction" do not have 66 | # any additional aliases of type "abbreviation". Such relationships 67 | # are not enforced by stability policies. 68 | # 69 | #----------------------------------------------------------------- 70 | 71 | 0000;NULL;control 72 | 0000;NUL;abbreviation 73 | 0001;START OF HEADING;control 74 | 0001;SOH;abbreviation 75 | 0002;START OF TEXT;control 76 | 0002;STX;abbreviation 77 | 0003;END OF TEXT;control 78 | 0003;ETX;abbreviation 79 | 0004;END OF TRANSMISSION;control 80 | 0004;EOT;abbreviation 81 | 0005;ENQUIRY;control 82 | 0005;ENQ;abbreviation 83 | 0006;ACKNOWLEDGE;control 84 | 0006;ACK;abbreviation 85 | 86 | # Note that no formal name alias for the ISO 6429 "BELL" is 87 | # provided for U+0007, because of the existing name collision 88 | # with U+1F514 BELL. 89 | 90 | 0007;ALERT;control 91 | 0007;BEL;abbreviation 92 | 93 | 0008;BACKSPACE;control 94 | 0008;BS;abbreviation 95 | 0009;CHARACTER TABULATION;control 96 | 0009;HORIZONTAL TABULATION;control 97 | 0009;HT;abbreviation 98 | 0009;TAB;abbreviation 99 | 000A;LINE FEED;control 100 | 000A;NEW LINE;control 101 | 000A;END OF LINE;control 102 | 000A;LF;abbreviation 103 | 000A;NL;abbreviation 104 | 000A;EOL;abbreviation 105 | 000B;LINE TABULATION;control 106 | 000B;VERTICAL TABULATION;control 107 | 000B;VT;abbreviation 108 | 000C;FORM FEED;control 109 | 000C;FF;abbreviation 110 | 000D;CARRIAGE RETURN;control 111 | 000D;CR;abbreviation 112 | 000E;SHIFT OUT;control 113 | 000E;LOCKING-SHIFT ONE;control 114 | 000E;SO;abbreviation 115 | 000F;SHIFT IN;control 116 | 000F;LOCKING-SHIFT ZERO;control 117 | 000F;SI;abbreviation 118 | 0010;DATA LINK ESCAPE;control 119 | 0010;DLE;abbreviation 120 | 0011;DEVICE CONTROL ONE;control 121 | 0011;DC1;abbreviation 122 | 0012;DEVICE CONTROL TWO;control 123 | 0012;DC2;abbreviation 124 | 0013;DEVICE CONTROL THREE;control 125 | 
0013;DC3;abbreviation 126 | 0014;DEVICE CONTROL FOUR;control 127 | 0014;DC4;abbreviation 128 | 0015;NEGATIVE ACKNOWLEDGE;control 129 | 0015;NAK;abbreviation 130 | 0016;SYNCHRONOUS IDLE;control 131 | 0016;SYN;abbreviation 132 | 0017;END OF TRANSMISSION BLOCK;control 133 | 0017;ETB;abbreviation 134 | 0018;CANCEL;control 135 | 0018;CAN;abbreviation 136 | 0019;END OF MEDIUM;control 137 | 0019;EOM;abbreviation 138 | 001A;SUBSTITUTE;control 139 | 001A;SUB;abbreviation 140 | 001B;ESCAPE;control 141 | 001B;ESC;abbreviation 142 | 001C;INFORMATION SEPARATOR FOUR;control 143 | 001C;FILE SEPARATOR;control 144 | 001C;FS;abbreviation 145 | 001D;INFORMATION SEPARATOR THREE;control 146 | 001D;GROUP SEPARATOR;control 147 | 001D;GS;abbreviation 148 | 001E;INFORMATION SEPARATOR TWO;control 149 | 001E;RECORD SEPARATOR;control 150 | 001E;RS;abbreviation 151 | 001F;INFORMATION SEPARATOR ONE;control 152 | 001F;UNIT SEPARATOR;control 153 | 001F;US;abbreviation 154 | 0020;SP;abbreviation 155 | 007F;DELETE;control 156 | 007F;DEL;abbreviation 157 | 158 | # PADDING CHARACTER and HIGH OCTET PRESET represent 159 | # architectural concepts initially proposed for early 160 | # drafts of ISO/IEC 10646-1. They were never actually 161 | # approved or standardized: hence their designation 162 | # here as the "figment" type. Formal name aliases 163 | # (and corresponding abbreviations) for these code 164 | # points are included here because these names leaked 165 | # out from the draft documents and were published in 166 | # at least one RFC whose names for code points was 167 | # implemented in Perl regex expressions. 
168 | 169 | 0080;PADDING CHARACTER;figment 170 | 0080;PAD;abbreviation 171 | 0081;HIGH OCTET PRESET;figment 172 | 0081;HOP;abbreviation 173 | 174 | 0082;BREAK PERMITTED HERE;control 175 | 0082;BPH;abbreviation 176 | 0083;NO BREAK HERE;control 177 | 0083;NBH;abbreviation 178 | 0084;INDEX;control 179 | 0084;IND;abbreviation 180 | 0085;NEXT LINE;control 181 | 0085;NEL;abbreviation 182 | 0086;START OF SELECTED AREA;control 183 | 0086;SSA;abbreviation 184 | 0087;END OF SELECTED AREA;control 185 | 0087;ESA;abbreviation 186 | 0088;CHARACTER TABULATION SET;control 187 | 0088;HORIZONTAL TABULATION SET;control 188 | 0088;HTS;abbreviation 189 | 0089;CHARACTER TABULATION WITH JUSTIFICATION;control 190 | 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control 191 | 0089;HTJ;abbreviation 192 | 008A;LINE TABULATION SET;control 193 | 008A;VERTICAL TABULATION SET;control 194 | 008A;VTS;abbreviation 195 | 008B;PARTIAL LINE FORWARD;control 196 | 008B;PARTIAL LINE DOWN;control 197 | 008B;PLD;abbreviation 198 | 008C;PARTIAL LINE BACKWARD;control 199 | 008C;PARTIAL LINE UP;control 200 | 008C;PLU;abbreviation 201 | 008D;REVERSE LINE FEED;control 202 | 008D;REVERSE INDEX;control 203 | 008D;RI;abbreviation 204 | 008E;SINGLE SHIFT TWO;control 205 | 008E;SINGLE-SHIFT-2;control 206 | 008E;SS2;abbreviation 207 | 008F;SINGLE SHIFT THREE;control 208 | 008F;SINGLE-SHIFT-3;control 209 | 008F;SS3;abbreviation 210 | 0090;DEVICE CONTROL STRING;control 211 | 0090;DCS;abbreviation 212 | 0091;PRIVATE USE ONE;control 213 | 0091;PRIVATE USE-1;control 214 | 0091;PU1;abbreviation 215 | 0092;PRIVATE USE TWO;control 216 | 0092;PRIVATE USE-2;control 217 | 0092;PU2;abbreviation 218 | 0093;SET TRANSMIT STATE;control 219 | 0093;STS;abbreviation 220 | 0094;CANCEL CHARACTER;control 221 | 0094;CCH;abbreviation 222 | 0095;MESSAGE WAITING;control 223 | 0095;MW;abbreviation 224 | 0096;START OF GUARDED AREA;control 225 | 0096;START OF PROTECTED AREA;control 226 | 0096;SPA;abbreviation 227 | 0097;END OF GUARDED 
AREA;control 228 | 0097;END OF PROTECTED AREA;control 229 | 0097;EPA;abbreviation 230 | 0098;START OF STRING;control 231 | 0098;SOS;abbreviation 232 | 233 | # SINGLE GRAPHIC CHARACTER INTRODUCER is another 234 | # architectural concept from early drafts of ISO/IEC 10646-1 235 | # which was never approved and standardized. 236 | 237 | 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment 238 | 0099;SGC;abbreviation 239 | 240 | 009A;SINGLE CHARACTER INTRODUCER;control 241 | 009A;SCI;abbreviation 242 | 009B;CONTROL SEQUENCE INTRODUCER;control 243 | 009B;CSI;abbreviation 244 | 009C;STRING TERMINATOR;control 245 | 009C;ST;abbreviation 246 | 009D;OPERATING SYSTEM COMMAND;control 247 | 009D;OSC;abbreviation 248 | 009E;PRIVACY MESSAGE;control 249 | 009E;PM;abbreviation 250 | 009F;APPLICATION PROGRAM COMMAND;control 251 | 009F;APC;abbreviation 252 | 00A0;NBSP;abbreviation 253 | 00AD;SHY;abbreviation 254 | 01A2;LATIN CAPITAL LETTER GHA;correction 255 | 01A3;LATIN SMALL LETTER GHA;correction 256 | 034F;CGJ;abbreviation 257 | 061C;ALM;abbreviation 258 | 0709;SYRIAC SUBLINEAR COLON SKEWED LEFT;correction 259 | 0CDE;KANNADA LETTER LLLA;correction 260 | 0E9D;LAO LETTER FO FON;correction 261 | 0E9F;LAO LETTER FO FAY;correction 262 | 0EA3;LAO LETTER RO;correction 263 | 0EA5;LAO LETTER LO;correction 264 | 0FD0;TIBETAN MARK BKA- SHOG GI MGO RGYAN;correction 265 | 11EC;HANGUL JONGSEONG YESIEUNG-KIYEOK;correction 266 | 11ED;HANGUL JONGSEONG YESIEUNG-SSANGKIYEOK;correction 267 | 11EE;HANGUL JONGSEONG SSANGYESIEUNG;correction 268 | 11EF;HANGUL JONGSEONG YESIEUNG-KHIEUKH;correction 269 | 180B;FVS1;abbreviation 270 | 180C;FVS2;abbreviation 271 | 180D;FVS3;abbreviation 272 | 180E;MVS;abbreviation 273 | 200B;ZWSP;abbreviation 274 | 200C;ZWNJ;abbreviation 275 | 200D;ZWJ;abbreviation 276 | 200E;LRM;abbreviation 277 | 200F;RLM;abbreviation 278 | 202A;LRE;abbreviation 279 | 202B;RLE;abbreviation 280 | 202C;PDF;abbreviation 281 | 202D;LRO;abbreviation 282 | 202E;RLO;abbreviation 283 | 
202F;NNBSP;abbreviation 284 | 205F;MMSP;abbreviation 285 | 2060;WJ;abbreviation 286 | 2066;LRI;abbreviation 287 | 2067;RLI;abbreviation 288 | 2068;FSI;abbreviation 289 | 2069;PDI;abbreviation 290 | 2118;WEIERSTRASS ELLIPTIC FUNCTION;correction 291 | 2448;MICR ON US SYMBOL;correction 292 | 2449;MICR DASH SYMBOL;correction 293 | 2B7A;LEFTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE VERTICAL STROKE;correction 294 | 2B7C;RIGHTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE VERTICAL STROKE;correction 295 | A015;YI SYLLABLE ITERATION MARK;correction 296 | FE00;VS1;abbreviation 297 | FE01;VS2;abbreviation 298 | FE02;VS3;abbreviation 299 | FE03;VS4;abbreviation 300 | FE04;VS5;abbreviation 301 | FE05;VS6;abbreviation 302 | FE06;VS7;abbreviation 303 | FE07;VS8;abbreviation 304 | FE08;VS9;abbreviation 305 | FE09;VS10;abbreviation 306 | FE0A;VS11;abbreviation 307 | FE0B;VS12;abbreviation 308 | FE0C;VS13;abbreviation 309 | FE0D;VS14;abbreviation 310 | FE0E;VS15;abbreviation 311 | FE0F;VS16;abbreviation 312 | FE18;PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET;correction 313 | FEFF;BYTE ORDER MARK;alternate 314 | FEFF;BOM;abbreviation 315 | FEFF;ZWNBSP;abbreviation 316 | 122D4;CUNEIFORM SIGN NU11 TENU;correction 317 | 122D5;CUNEIFORM SIGN NU11 OVER NU11 BUR OVER BUR;correction 318 | 16E56;MEDEFAIDRIN CAPITAL LETTER H;correction 319 | 16E57;MEDEFAIDRIN CAPITAL LETTER NG;correction 320 | 16E76;MEDEFAIDRIN SMALL LETTER H;correction 321 | 16E77;MEDEFAIDRIN SMALL LETTER NG;correction 322 | 1B001;HENTAIGANA LETTER E-1;correction 323 | 1D0C5;BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS;correction 324 | E0100;VS17;abbreviation 325 | E0101;VS18;abbreviation 326 | E0102;VS19;abbreviation 327 | E0103;VS20;abbreviation 328 | E0104;VS21;abbreviation 329 | E0105;VS22;abbreviation 330 | E0106;VS23;abbreviation 331 | E0107;VS24;abbreviation 332 | E0108;VS25;abbreviation 333 | E0109;VS26;abbreviation 334 | E010A;VS27;abbreviation 335 | E010B;VS28;abbreviation 336 | 
E010C;VS29;abbreviation 337 | E010D;VS30;abbreviation 338 | E010E;VS31;abbreviation 339 | E010F;VS32;abbreviation 340 | E0110;VS33;abbreviation 341 | E0111;VS34;abbreviation 342 | E0112;VS35;abbreviation 343 | E0113;VS36;abbreviation 344 | E0114;VS37;abbreviation 345 | E0115;VS38;abbreviation 346 | E0116;VS39;abbreviation 347 | E0117;VS40;abbreviation 348 | E0118;VS41;abbreviation 349 | E0119;VS42;abbreviation 350 | E011A;VS43;abbreviation 351 | E011B;VS44;abbreviation 352 | E011C;VS45;abbreviation 353 | E011D;VS46;abbreviation 354 | E011E;VS47;abbreviation 355 | E011F;VS48;abbreviation 356 | E0120;VS49;abbreviation 357 | E0121;VS50;abbreviation 358 | E0122;VS51;abbreviation 359 | E0123;VS52;abbreviation 360 | E0124;VS53;abbreviation 361 | E0125;VS54;abbreviation 362 | E0126;VS55;abbreviation 363 | E0127;VS56;abbreviation 364 | E0128;VS57;abbreviation 365 | E0129;VS58;abbreviation 366 | E012A;VS59;abbreviation 367 | E012B;VS60;abbreviation 368 | E012C;VS61;abbreviation 369 | E012D;VS62;abbreviation 370 | E012E;VS63;abbreviation 371 | E012F;VS64;abbreviation 372 | E0130;VS65;abbreviation 373 | E0131;VS66;abbreviation 374 | E0132;VS67;abbreviation 375 | E0133;VS68;abbreviation 376 | E0134;VS69;abbreviation 377 | E0135;VS70;abbreviation 378 | E0136;VS71;abbreviation 379 | E0137;VS72;abbreviation 380 | E0138;VS73;abbreviation 381 | E0139;VS74;abbreviation 382 | E013A;VS75;abbreviation 383 | E013B;VS76;abbreviation 384 | E013C;VS77;abbreviation 385 | E013D;VS78;abbreviation 386 | E013E;VS79;abbreviation 387 | E013F;VS80;abbreviation 388 | E0140;VS81;abbreviation 389 | E0141;VS82;abbreviation 390 | E0142;VS83;abbreviation 391 | E0143;VS84;abbreviation 392 | E0144;VS85;abbreviation 393 | E0145;VS86;abbreviation 394 | E0146;VS87;abbreviation 395 | E0147;VS88;abbreviation 396 | E0148;VS89;abbreviation 397 | E0149;VS90;abbreviation 398 | E014A;VS91;abbreviation 399 | E014B;VS92;abbreviation 400 | E014C;VS93;abbreviation 401 | E014D;VS94;abbreviation 402 | 
E014E;VS95;abbreviation 403 | E014F;VS96;abbreviation 404 | E0150;VS97;abbreviation 405 | E0151;VS98;abbreviation 406 | E0152;VS99;abbreviation 407 | E0153;VS100;abbreviation 408 | E0154;VS101;abbreviation 409 | E0155;VS102;abbreviation 410 | E0156;VS103;abbreviation 411 | E0157;VS104;abbreviation 412 | E0158;VS105;abbreviation 413 | E0159;VS106;abbreviation 414 | E015A;VS107;abbreviation 415 | E015B;VS108;abbreviation 416 | E015C;VS109;abbreviation 417 | E015D;VS110;abbreviation 418 | E015E;VS111;abbreviation 419 | E015F;VS112;abbreviation 420 | E0160;VS113;abbreviation 421 | E0161;VS114;abbreviation 422 | E0162;VS115;abbreviation 423 | E0163;VS116;abbreviation 424 | E0164;VS117;abbreviation 425 | E0165;VS118;abbreviation 426 | E0166;VS119;abbreviation 427 | E0167;VS120;abbreviation 428 | E0168;VS121;abbreviation 429 | E0169;VS122;abbreviation 430 | E016A;VS123;abbreviation 431 | E016B;VS124;abbreviation 432 | E016C;VS125;abbreviation 433 | E016D;VS126;abbreviation 434 | E016E;VS127;abbreviation 435 | E016F;VS128;abbreviation 436 | E0170;VS129;abbreviation 437 | E0171;VS130;abbreviation 438 | E0172;VS131;abbreviation 439 | E0173;VS132;abbreviation 440 | E0174;VS133;abbreviation 441 | E0175;VS134;abbreviation 442 | E0176;VS135;abbreviation 443 | E0177;VS136;abbreviation 444 | E0178;VS137;abbreviation 445 | E0179;VS138;abbreviation 446 | E017A;VS139;abbreviation 447 | E017B;VS140;abbreviation 448 | E017C;VS141;abbreviation 449 | E017D;VS142;abbreviation 450 | E017E;VS143;abbreviation 451 | E017F;VS144;abbreviation 452 | E0180;VS145;abbreviation 453 | E0181;VS146;abbreviation 454 | E0182;VS147;abbreviation 455 | E0183;VS148;abbreviation 456 | E0184;VS149;abbreviation 457 | E0185;VS150;abbreviation 458 | E0186;VS151;abbreviation 459 | E0187;VS152;abbreviation 460 | E0188;VS153;abbreviation 461 | E0189;VS154;abbreviation 462 | E018A;VS155;abbreviation 463 | E018B;VS156;abbreviation 464 | E018C;VS157;abbreviation 465 | E018D;VS158;abbreviation 466 | 
E018E;VS159;abbreviation 467 | E018F;VS160;abbreviation 468 | E0190;VS161;abbreviation 469 | E0191;VS162;abbreviation 470 | E0192;VS163;abbreviation 471 | E0193;VS164;abbreviation 472 | E0194;VS165;abbreviation 473 | E0195;VS166;abbreviation 474 | E0196;VS167;abbreviation 475 | E0197;VS168;abbreviation 476 | E0198;VS169;abbreviation 477 | E0199;VS170;abbreviation 478 | E019A;VS171;abbreviation 479 | E019B;VS172;abbreviation 480 | E019C;VS173;abbreviation 481 | E019D;VS174;abbreviation 482 | E019E;VS175;abbreviation 483 | E019F;VS176;abbreviation 484 | E01A0;VS177;abbreviation 485 | E01A1;VS178;abbreviation 486 | E01A2;VS179;abbreviation 487 | E01A3;VS180;abbreviation 488 | E01A4;VS181;abbreviation 489 | E01A5;VS182;abbreviation 490 | E01A6;VS183;abbreviation 491 | E01A7;VS184;abbreviation 492 | E01A8;VS185;abbreviation 493 | E01A9;VS186;abbreviation 494 | E01AA;VS187;abbreviation 495 | E01AB;VS188;abbreviation 496 | E01AC;VS189;abbreviation 497 | E01AD;VS190;abbreviation 498 | E01AE;VS191;abbreviation 499 | E01AF;VS192;abbreviation 500 | E01B0;VS193;abbreviation 501 | E01B1;VS194;abbreviation 502 | E01B2;VS195;abbreviation 503 | E01B3;VS196;abbreviation 504 | E01B4;VS197;abbreviation 505 | E01B5;VS198;abbreviation 506 | E01B6;VS199;abbreviation 507 | E01B7;VS200;abbreviation 508 | E01B8;VS201;abbreviation 509 | E01B9;VS202;abbreviation 510 | E01BA;VS203;abbreviation 511 | E01BB;VS204;abbreviation 512 | E01BC;VS205;abbreviation 513 | E01BD;VS206;abbreviation 514 | E01BE;VS207;abbreviation 515 | E01BF;VS208;abbreviation 516 | E01C0;VS209;abbreviation 517 | E01C1;VS210;abbreviation 518 | E01C2;VS211;abbreviation 519 | E01C3;VS212;abbreviation 520 | E01C4;VS213;abbreviation 521 | E01C5;VS214;abbreviation 522 | E01C6;VS215;abbreviation 523 | E01C7;VS216;abbreviation 524 | E01C8;VS217;abbreviation 525 | E01C9;VS218;abbreviation 526 | E01CA;VS219;abbreviation 527 | E01CB;VS220;abbreviation 528 | E01CC;VS221;abbreviation 529 | E01CD;VS222;abbreviation 530 | 
E01CE;VS223;abbreviation 531 | E01CF;VS224;abbreviation 532 | E01D0;VS225;abbreviation 533 | E01D1;VS226;abbreviation 534 | E01D2;VS227;abbreviation 535 | E01D3;VS228;abbreviation 536 | E01D4;VS229;abbreviation 537 | E01D5;VS230;abbreviation 538 | E01D6;VS231;abbreviation 539 | E01D7;VS232;abbreviation 540 | E01D8;VS233;abbreviation 541 | E01D9;VS234;abbreviation 542 | E01DA;VS235;abbreviation 543 | E01DB;VS236;abbreviation 544 | E01DC;VS237;abbreviation 545 | E01DD;VS238;abbreviation 546 | E01DE;VS239;abbreviation 547 | E01DF;VS240;abbreviation 548 | E01E0;VS241;abbreviation 549 | E01E1;VS242;abbreviation 550 | E01E2;VS243;abbreviation 551 | E01E3;VS244;abbreviation 552 | E01E4;VS245;abbreviation 553 | E01E5;VS246;abbreviation 554 | E01E6;VS247;abbreviation 555 | E01E7;VS248;abbreviation 556 | E01E8;VS249;abbreviation 557 | E01E9;VS250;abbreviation 558 | E01EA;VS251;abbreviation 559 | E01EB;VS252;abbreviation 560 | E01EC;VS253;abbreviation 561 | E01ED;VS254;abbreviation 562 | E01EE;VS255;abbreviation 563 | E01EF;VS256;abbreviation 564 | 565 | # EOF 566 | -------------------------------------------------------------------------------- /conf/unicode/README.md: -------------------------------------------------------------------------------- 1 | 2 | The files found here are copies of the Unicode files found on the Unicode 3 | website. We only include the few files that we parse. When a new version 4 | of Unicode comes out, we should be able to just replace those files and 5 | parse the new version. Also, we parse at installation time, so we can 6 | update an existing installation with a simple `apt-get upgrade`. 7 | 8 | See: https://www.unicode.org/Public/ 9 | 10 | Select a version and then `ucd`. 
11 | 12 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | libutf8 (1.0.15.2~bionic) bionic; urgency=high 2 | 3 | * Bumped build version to rebuild on Launchpad. 4 | 5 | -- Alexis Wilke Fri, 10 Nov 2023 15:24:41 -0800 6 | 7 | libutf8 (1.0.15.1~bionic) bionic; urgency=high 8 | 9 | * Bumped build version to rebuild on Launchpad. 10 | 11 | -- Alexis Wilke Tue, 07 Nov 2023 06:03:57 -0800 12 | 13 | libutf8 (1.0.15.0~jammy) jammy; urgency=high 14 | 15 | * Moved find() of doxygen in the doc/CMakeLists.txt file. 16 | * Changed the NOT_A_CHARACTER value to -2 to distinguish it from EOS. 17 | * Made utf8_iterator::operator * return NOT_A_CHARACTER on an error. 18 | * Define the traits in-place for std::iterator is deprecated. 19 | * Removed overload of ostream char32_t characters. 20 | * Added a show-unicode tool to display codes from character. 21 | * Added function to fix UTF-8 strings by replacing invalid characters. 22 | * Allow for += of the '\0' character. 23 | * Added UTF-16 functions & tests. 24 | * Updated the tests accordingly and added more for better coverage. 25 | * Added missing #include . 26 | * Applied hack so tests compiles under lunar. 27 | * Updated compat to the latest (v15) 28 | * Did some work on the UCD data (parse decomposition, read file properly...) 29 | * Removed boost-dev as a dependency. 30 | 31 | -- Alexis Wilke Sun, 05 Nov 2023 08:05:54 -0800 32 | 33 | libutf8 (1.0.14.0~bionic) bionic; urgency=high 34 | 35 | * Added operator+ for char32_t/string where string is viewed as UTF-8. 36 | 37 | -- Alexis Wilke Sun, 30 Oct 2022 21:24:12 -0700 38 | 39 | libutf8 (1.0.13.0~bionic) bionic; urgency=high 40 | 41 | * Added a verify_file_inheritance() in tools. 42 | 43 | -- Alexis Wilke Mon, 11 Jul 2022 07:42:16 -0700 44 | 45 | libutf8 (1.0.12.1~bionic) bionic; urgency=high 46 | 47 | * Updated the compat to v10. 
48 | 49 | -- Alexis Wilke Thu, 19 May 2022 20:28:28 -0700 50 | 51 | libutf8 (1.0.12.0~bionic) bionic; urgency=high 52 | 53 | * Cleaned up the cmake file. 54 | 55 | -- Alexis Wilke Thu, 19 May 2022 18:09:49 -0700 56 | 57 | libutf8 (1.0.11.2~bionic) bionic; urgency=high 58 | 59 | * Bumped build version to rebuild on Launchpad. 60 | 61 | -- Alexis Wilke Fri, 04 Mar 2022 22:36:44 -0800 62 | 63 | libutf8 (1.0.11.1~bionic) bionic; urgency=high 64 | 65 | * Bumped build version to rebuild on Launchpad. 66 | 67 | -- Alexis Wilke Sun, 13 Feb 2022 12:35:15 -0800 68 | 69 | libutf8 (1.0.11.0~bionic) bionic; urgency=high 70 | 71 | * Added a clear() for the good flag in the utf8_iterator. 72 | * Fixed the string test, the exception now includes "libutf8_exception: ". 73 | * Correctly test the good flag status in cases where the iterator fails. 74 | 75 | -- Alexis Wilke Mon, 27 Sep 2021 18:08:13 -0700 76 | 77 | libutf8 (1.0.10.0~bionic) bionic; urgency=high 78 | 79 | * Updated the tests to match the new libexcept library setup. 80 | 81 | -- Alexis Wilke Sat, 28 Aug 2021 18:23:57 -0700 82 | 83 | libutf8 (1.0.9.0~bionic) bionic; urgency=high 84 | 85 | * Slowly adding Unicode to canonicalize UTF-8 strings. 86 | * Added SnapDev as a dependency to implement the Unicode parser. 87 | * Added a tool to run the parser (which is part of the library). 88 | * Updated the exception declarations with our macros. 89 | * Cleaned up licenses & copyrights. 90 | 91 | -- Alexis Wilke Tue, 24 Aug 2021 15:49:14 -0700 92 | 93 | libutf8 (1.0.8.1~bionic) bionic; urgency=high 94 | 95 | * Bumped build version to rebuild on Launchpad. 96 | 97 | -- Alexis Wilke Fri, 04 Jun 2021 18:28:59 -0700 98 | 99 | libutf8 (1.0.8.0~bionic) bionic; urgency=high 100 | 101 | * Fixed the name of a function in an exception message. 102 | * Updated the mk script. 
103 | 104 | -- Alexis Wilke Tue, 01 Jun 2021 17:40:30 -0700 105 | 106 | libutf8 (1.0.7.2~bionic) bionic; urgency=high 107 | 108 | * Bumped version to recompile against the newer versions. 109 | 110 | -- Alexis Wilke Sat, 15 May 2021 09:33:12 -0700 111 | 112 | libutf8 (1.0.7.1~bionic) bionic; urgency=high 113 | 114 | * Bumped version to recompile against the newer version of snapcatch2. 115 | 116 | -- Alexis Wilke Fri, 08 Jan 2021 22:13:35 -0800 117 | 118 | libutf8 (1.0.7.0~bionic) bionic; urgency=high 119 | 120 | * Changed the EOF of the iterator in an EOS so it works as expected with 121 | the newest versions of catch2 (proper signedness for char32_t). 122 | * Fixed one assignment from L'0' to u'0'. 123 | 124 | -- Alexis Wilke Tue, 26 Apr 2020 18:25:27 -0800 125 | 126 | libutf8 (1.0.6.2~bionic) bionic; urgency=high 127 | 128 | * Create a bionic version. 129 | 130 | -- Alexis Wilke Thu, 30 Apr 2020 20:59:23 -0800 131 | 132 | libutf8 (1.0.6.0~xenial) xenial; urgency=high 133 | 134 | * Added the libutf8::case_insensitive_string type. 135 | * Fixed the mk so it generates an error on an unknown command line option. 136 | * Added a test so we can make sure that the case_insensitive_string works. 137 | * Fixed the existing test tag names, we have to have the square brackets. 138 | * Moved a couple of validation functions from the libsnapwebsites to here. 139 | * Broke up the tests in a character and a string so we can just validate a 140 | standalone character too. 141 | * Added another validation for UTF-32 strings and characters. 142 | * Allow for a specific test to be run with `mk -t `. 143 | * Allow for a nullptr when calling start_with_bom(). 144 | * Added a new exception for unsupported features. 145 | * Added a function to check whether a character is a surrogate and which one. 146 | * Added a to_u8string() with std::wstring as input. 147 | * Added a to_u8string() with wchar_t as input. 148 | * Added a to_u8string() with char16_t as input. 
149 | 150 | -- Alexis Wilke Wed, 17 Jul 2019 19:58:43 -0800 151 | 152 | libutf8 (1.0.5.1~xenial) xenial; urgency=high 153 | 154 | * Bumped version to force a rebuild, just in case. 155 | 156 | -- Alexis Wilke Wed, 17 Jul 2019 19:58:43 -0800 157 | 158 | libutf8 (1.0.5.0~xenial) xenial; urgency=high 159 | 160 | * Added a way to create an iterator at the end. 161 | * Added == and != with another utf8_iterator. 162 | 163 | -- Alexis Wilke Sat, 29 Jun 2019 05:05:11 -0800 164 | 165 | libutf8 (1.0.4.0~xenial) xenial; urgency=high 166 | 167 | * Added a PROJECT_BRIEF description. 168 | * Added in=C++ to the MAPPING_EXTENSION. 169 | * Updated the doxy file to 1.8.11. 170 | 171 | -- Alexis Wilke Tue, 11 Jun 2019 23:55:25 -0800 172 | 173 | libutf8 (1.0.3.0~xenial) xenial; urgency=high 174 | 175 | * Moved the catch2 implementation to our `snapcatch2.hpp` header instead. 176 | * Updated the tests accordingly. 177 | * Cleaned up various declarations in each file. 178 | * Moved our `obj_setenv()` to `snapdev`. 179 | 180 | -- Alexis Wilke Sat, 1 Jun 2019 00:24:36 -0800 181 | 182 | libutf8 (1.0.2.0~xenial) xenial; urgency=high 183 | 184 | * Got the test coverage back to 100%. 185 | * Renamed the tests without the "unittest_" introducer. 186 | * Added the `start_with_bom()` function and corresponding tests. 187 | * Fixed standalone characters, the introducer is U for char32_t characters. 188 | 189 | -- Alexis Wilke Tue, 28 May 2019 18:09:01 -0800 190 | 191 | libutf8 (1.0.1.0~xenial) xenial; urgency=high 192 | 193 | * Implemented the to and from UTF-8 and UTF-16 encoding. 194 | * Fixed the u8casecmp() test function which would test 0xD800 to 0xDFFF 195 | as valid characters. 196 | * Added a new exception so we can distinguish whether an encoding or a 197 | decoding went wrong. 198 | * Optimized the UTF-32 to UTF-8 conversion, i.e. code bytes under 0x80 get 199 | copied as is. 200 | * Fixed the '\0' conversion, it would not get added to the output string. 
201 | * Added a to_u8string() from a char32_t so we get an std::string as output. 202 | * Generate errors when the mbstowc() or wctombs() functions fail. 203 | 204 | -- Alexis Wilke Tue, 28 May 2019 01:04:30 -0800 205 | 206 | libutf8 (1.0.0.3~xenial) xenial; urgency=high 207 | 208 | * Added the cmake folder and files. 209 | * Added the README.md and TODO.txt files to the debian/docs. 210 | * Removed the "debian/tmp/..." from the `debian/libutf8-doc.install`. 211 | * Added a `-i` command line option to mk to install the library. 212 | * Added a call to prevent collection of stack trace in our tests. 213 | 214 | -- Alexis Wilke Sat, 25 May 2019 20:54:23 -0800 215 | 216 | libutf8 (1.0.0.2~xenial) xenial; urgency=high 217 | 218 | * Try fixing dependencies, the version may need to include ~xenial. 219 | * Added boost-dev as a dependency as we use it in our tests. 220 | 221 | -- Alexis Wilke Sat, 25 May 2019 20:54:23 -0800 222 | 223 | libutf8 (1.0.0.1~xenial) xenial; urgency=high 224 | 225 | * Enhanced the README.md 226 | * Bumped snapcatch2 dependency version to 2.7.2.10. 227 | 228 | -- Alexis Wilke Mon, 20 May 2019 01:23:11 -0800 229 | 230 | libutf8 (1.0.0.0~xenial) xenial; urgency=high 231 | 232 | * Added my wpkg libutf8 library as a Snap! C++ project. 233 | 234 | -- Alexis Wilke Mon, 20 May 2019 01:23:11 -0800 235 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 15 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: libutf8 2 | Priority: extra 3 | Maintainer: R. 
Douglas Barbieri 4 | Build-Depends: cmake, 5 | debhelper, 6 | doxygen, 7 | graphviz, 8 | libexcept-dev (>= 1.1.0.0~jammy), 9 | snapcatch2 (>= 2.7.2.10~jammy), 10 | snapcmakemodules (>= 1.0.35.3~jammy), 11 | snapdev (>= 1.1.16.0~jammy) 12 | Standards-Version: 3.9.4 13 | Section: libs 14 | Homepage: https://snapwebsites.org/ 15 | Vcs-Git: https://github.com/m2osw/snapcpp.git 16 | Vcs-Browser: https://github.com/m2osw/libutf8 17 | 18 | Package: libutf8-dev 19 | Section: libdevel 20 | Architecture: any 21 | Depends: libutf8 (= ${binary:Version}), ${misc:Depends} 22 | Description: Development package for the C++ libutf8 library. 23 | This library provides functions to convert between UTF-8 and UTF-32 characters. 24 | 25 | Package: libutf8-doc 26 | Section: doc 27 | Architecture: all 28 | Depends: ${misc:Depends} 29 | Description: Documentation for the C++ libutf8 library. 30 | This library provides functions to convert between UTF-8 and UTF-32 characters. 31 | 32 | Package: libutf8 33 | Section: libs 34 | Architecture: any 35 | Depends: ${shlibs:Depends}, ${misc:Depends} 36 | Description: C++ library for UTF-8/UTF-32 handling. 37 | This library provides functions to convert between UTF-8 and UTF-32 characters. 38 | 39 | # vim: ts=4 sw=4 et 40 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: libutf8 3 | Source: https://github.com/m2osw/libutf8 4 | 5 | Files: * 6 | Copyright: 2006-2019 Made to Order Software 7 | 2006-2019 Alexis Wilke 8 | 2006-2019 R. Douglas Barbieri 9 | License: GPL-2+ 10 | This package is free software; you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation; either version 2 of the License, or 13 | (at your option) any later version. 
14 | . 15 | This package is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | . 20 | You should have received a copy of the GNU General Public License 21 | along with this program. If not, see 22 | . 23 | On Debian systems, the complete text of the GNU General 24 | Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". 25 | 26 | Files: conf/unicode/* 27 | Copyright: 1991-2021 Unicode, Inc. All rights reserved. 28 | License: Unicode 29 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 30 | . 31 | See Terms of Use for definitions of Unicode Inc.'s 32 | Data Files and Software. 33 | . 34 | NOTICE TO USER: Carefully read the following legal agreement. 35 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 36 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 37 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 38 | TERMS AND CONDITIONS OF THIS AGREEMENT. 39 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 40 | THE DATA FILES OR SOFTWARE. 41 | . 42 | COPYRIGHT AND PERMISSION NOTICE 43 | . 44 | Copyright (c) 1991-2021 Unicode, Inc. All rights reserved. 45 | Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 46 | . 
47 | Permission is hereby granted, free of charge, to any person obtaining 48 | a copy of the Unicode data files and any associated documentation 49 | (the "Data Files") or Unicode software and any associated documentation 50 | (the "Software") to deal in the Data Files or Software 51 | without restriction, including without limitation the rights to use, 52 | copy, modify, merge, publish, distribute, and/or sell copies of 53 | the Data Files or Software, and to permit persons to whom the Data Files 54 | or Software are furnished to do so, provided that either 55 | (a) this copyright and permission notice appear with all copies 56 | of the Data Files or Software, or 57 | (b) this copyright and permission notice appear in associated 58 | Documentation. 59 | . 60 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 61 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 62 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 63 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. 64 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 65 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 66 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 67 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 68 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 69 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 70 | . 71 | Except as contained in this notice, the name of a copyright holder 72 | shall not be used in advertising or otherwise to promote the sale, 73 | use or other dealings in these Data Files or Software without prior 74 | written authorization of the copyright holder. 75 | . 76 | See also: https://www.unicode.org/license.html 77 | 78 | # Please also look if there are files or directories which have a 79 | # different copyright/license attached and list them here. 
80 | # Please avoid to pick license terms that are more restrictive than the 81 | # packaged work, as it may make Debian's contributions unacceptable upstream. 82 | -------------------------------------------------------------------------------- /debian/docs: -------------------------------------------------------------------------------- 1 | LICENSE.txt 2 | README.md 3 | TODO.txt 4 | -------------------------------------------------------------------------------- /debian/libutf8-dev.install: -------------------------------------------------------------------------------- 1 | usr/include/* 2 | usr/lib/lib*.so 3 | usr/share/cmake/* 4 | -------------------------------------------------------------------------------- /debian/libutf8-doc.install: -------------------------------------------------------------------------------- 1 | usr/share/doc/libutf8/html/* usr/share/doc/libutf8-doc/html/ 2 | -------------------------------------------------------------------------------- /debian/libutf8.install: -------------------------------------------------------------------------------- 1 | usr/bin 2 | usr/lib/lib*.so.* 3 | usr/share/libutf8 4 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # -*- makefile -*- 3 | # Sample debian/rules that uses debhelper. 4 | # This file was originally written by Joey Hess and Craig Small. 5 | # As a special exception, when this file is copied by dh-make into a 6 | # dh-make output file, you may use that output file without restriction. 7 | # This special exception was added by Craig Small in version 0.37 of dh-make. 8 | 9 | # Uncomment this to turn on verbose mode. 
10 | #export DH_VERBOSE=1 11 | 12 | %: 13 | dh $@ --parallel 14 | 15 | override_dh_auto_configure: 16 | dh_auto_configure -- -DCMAKE_BUILD_TYPE=Release 17 | 18 | -------------------------------------------------------------------------------- /debian/source/options: -------------------------------------------------------------------------------- 1 | tar-ignore = "tmp" 2 | tar-ignore = ".git" 3 | -------------------------------------------------------------------------------- /doc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2006-2023 Made to Order Software Corp. All Rights Reserved 2 | # 3 | # https://snapwebsites.org/project/libutf8 4 | # contact@m2osw.com 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License along 17 | # with this program; if not, write to the Free Software Foundation, Inc., 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | 21 | ## 22 | ## Documentation 23 | ## 24 | find_package(SnapDoxygen) 25 | AddDoxygenTarget(libutf8 26 | ${LIBUTF8_VERSION_MAJOR} 27 | ${LIBUTF8_VERSION_MINOR} 28 | ${LIBUTF8_VERSION_PATCH} 29 | ) 30 | 31 | # vim: ts=4 sw=4 et 32 | -------------------------------------------------------------------------------- /doc/footer.html: -------------------------------------------------------------------------------- 1 |
2 |

This document is part of the Snap! Websites Project.

3 |

Copyright by Made to Order Software Corp.

4 |
5 | -------------------------------------------------------------------------------- /doc/libutf8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/doc/libutf8.png -------------------------------------------------------------------------------- /libutf8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | # 3 | # https://snapwebsites.org/project/libutf8 4 | # contact@m2osw.com 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License along 17 | # with this program; if not, write to the Free Software Foundation, Inc., 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | ## 21 | ## utf8 library 22 | ## 23 | project(utf8) 24 | 25 | # Put the version in the header file 26 | configure_file( 27 | ${CMAKE_CURRENT_SOURCE_DIR}/version.h.in 28 | ${CMAKE_CURRENT_BINARY_DIR}/version.h 29 | ) 30 | 31 | add_library(${PROJECT_NAME} SHARED 32 | base.cpp 33 | iterator.cpp 34 | json_tokens.cpp 35 | libutf8.cpp 36 | unicode_data.cpp 37 | unicode_data_file.cpp 38 | version.cpp 39 | ) 40 | 41 | target_include_directories(${PROJECT_NAME} 42 | PUBLIC 43 | ${LIBEXCEPT_INCLUDE_DIRS} 44 | ${SNAPDEV_INCLUDE_DIRS} 45 | ) 46 | 47 | target_link_libraries(${PROJECT_NAME} 48 | ${LIBEXCEPT_LIBRARIES} 49 | ) 50 | 51 | set_target_properties(${PROJECT_NAME} PROPERTIES 52 | VERSION 53 | ${LIBUTF8_VERSION_MAJOR}.${LIBUTF8_VERSION_MINOR} 54 | 55 | SOVERSION 56 | ${LIBUTF8_VERSION_MAJOR} 57 | ) 58 | 59 | install( 60 | TARGETS 61 | ${PROJECT_NAME} 62 | 63 | RUNTIME DESTINATION 64 | bin 65 | 66 | LIBRARY DESTINATION 67 | lib 68 | 69 | ARCHIVE DESTINATION 70 | lib 71 | ) 72 | 73 | install( 74 | FILES 75 | base.h 76 | caseinsensitivestring.h 77 | exception.h 78 | iterator.h 79 | json_tokens.h 80 | libutf8.h 81 | unicode_data.h 82 | ${CMAKE_CURRENT_BINARY_DIR}/version.h 83 | 84 | DESTINATION 85 | include/libutf8 86 | ) 87 | 88 | 89 | # vim: ts=4 sw=4 et 90 | -------------------------------------------------------------------------------- /libutf8/base.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 
10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | /** \file 21 | * \brief Implementation of the UTF-8 functions. 22 | * 23 | * This file is the implementation of the UTF-8 functions of the libutf8 24 | * library. It simply is a set of functions to convert between different 25 | * character sets in a lossless manner. At this point it supports UTF-8, 26 | * UCS-4, and UTF-16 formats. 27 | * 28 | * Contrary to many of the system functions, these functions do not take 29 | * anything from the system into account (the locale can be anything, it does 30 | * not change the exact behavior of these functions.) 31 | * 32 | * Although similar functionality is found on Unices and MS-Windows, it was 33 | * simpler to just implement these few functions than to try to have a 34 | * converter that is sure not to use a locale and this way we can use 35 | * standard strings (std::string and std::wstring) instead of having to 36 | * call C functions. 37 | */ 38 | 39 | // self 40 | // 41 | #include "libutf8/base.h" 42 | 43 | #include "libutf8/exception.h" 44 | 45 | 46 | // C++ 47 | // 48 | #include 49 | #include 50 | 51 | 52 | // last include 53 | // 54 | #include 55 | 56 | 57 | 58 | /** \brief Name space of the UTF-8 library. 59 | * 60 | * The libutf8 library is used to seamlessly handle UTF-8 strings. It also 61 | * is used to convert between UTF-8, UTF-16, and UTF-32 strings. 62 | * 63 | * \todo 64 | * Implement the UTF-16 functions.
65 | */ 66 | namespace libutf8 67 | { 68 | 69 | 70 | /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH 71 | * \brief Minimum buffer length to support any UTF-8 characters. 72 | * 73 | * When converting a UTF-32 character to UTF-8, it makes use of an output 74 | * buffer. The size of that output buffer should be at least 75 | * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character. 76 | * 77 | * Note that the size includes space for a null terminator (`'\0'`). 78 | * 79 | * The size of your buffer can be smaller as long as the UTF-32 character 80 | * fits into it, the wctombs() function will not fail. 81 | */ 82 | 83 | 84 | /** \brief Compute the UTF-8 encoded representation of wc. 85 | * 86 | * This function transforms the UTF-32 character \p wc in a 87 | * UTF-8 encoded series of bytes (called a multi-byte encoded 88 | * character.) The resulting string is null (`'\0'`) terminated. 89 | * 90 | * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes. 91 | * If less space is required, the function does not report a problem, 92 | * though. This allows to get the total size of a conversion and then 93 | * do the full conversion to that one buffer without the need to 94 | * add unnecessary bytes at the end of your destination buffer. 95 | * 96 | * \code 97 | * ... 98 | * char mb[MBS_MIN_BUFFER_LENGTH]; 99 | * 100 | * wctombs(mb, big_char, sizeof(mb)); 101 | * ... 102 | * \endcode 103 | * 104 | * The function does not encode invalid characters. When such is 105 | * passed to the function, the \p mb string is set to the empty 106 | * string and the function returns -1. We avoid an 107 | * exception here because that way you can quickly check whether 108 | * a string of `char32_t` characters is valid or not. 109 | * 110 | * \note 111 | * Unicode defines valid characters only between zero (0) and 0x10FFFF. 112 | * Therefore this function encodes the character using 1 to 4 bytes plus 113 | * one for the null terminator.
114 | * 115 | * \warning 116 | * The function does not raise an error if the input \p wc character 117 | * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.) 118 | * Instead it returns -1 and sets the \p mb string to the empty string. 119 | * 120 | * \exception libutf8_logic_exception 121 | * The function raises this exception if the destination buffer is too 122 | * small for the conversion. Don't forget that we add a null terminator 123 | * so if the character needs 3 UTF-8 bytes, we will check for a buffer 124 | * of at least 4 bytes to consider it valid. 125 | * 126 | * \param[out] mb The output buffer, it will always be null terminated. 127 | * \param[in] wc The wide character to convert. 128 | * \param[in] len The length of \p mb. 129 | * 130 | * \return The number of bytes in mb, not including the null terminator, or -1 if \p wc is not a valid character. 131 | */ 132 | int wctombs(char * mb, char32_t wc, std::size_t len) 133 | { 134 | auto verify_length = [&len](std::size_t required_len) 135 | { 136 | if(len < required_len) 137 | { 138 | throw libutf8_logic_exception("wctombs() called with an output buffer which is too small."); 139 | } 140 | }; 141 | 142 | if(wc < 0x80) 143 | { 144 | verify_length(2); 145 | 146 | /* this will also encode '\0'... 
*/ 147 | mb[0] = static_cast(wc); 148 | mb[1] = '\0'; 149 | return 1; 150 | } 151 | if(wc < 0x800) 152 | { 153 | verify_length(3); 154 | 155 | mb[0] = static_cast((wc >> 6) | 0xC0); 156 | mb[1] = (wc & 0x3F) | 0x80; 157 | mb[2] = '\0'; 158 | return 2; 159 | } 160 | 161 | // avoid encoding the UTF-16 surrogate because those code points do not 162 | // represent characters 163 | // 164 | if(wc < 0xD800 || wc > 0xDFFF) 165 | { 166 | if(wc < 0x10000) 167 | { 168 | verify_length(4); 169 | 170 | mb[0] = static_cast((wc >> 12) | 0xE0); 171 | mb[1] = ((wc >> 6) & 0x3F) | 0x80; 172 | mb[2] = (wc & 0x3F) | 0x80; 173 | mb[3] = '\0'; 174 | return 3; 175 | } 176 | if(wc < 0x110000) 177 | { 178 | verify_length(5); 179 | 180 | mb[0] = static_cast((wc >> 18) | 0xF0); 181 | mb[1] = ((wc >> 12) & 0x3F) | 0x80; 182 | mb[2] = ((wc >> 6) & 0x3F) | 0x80; 183 | mb[3] = (wc & 0x3F) | 0x80; 184 | mb[4] = '\0'; 185 | return 4; 186 | } 187 | } 188 | 189 | verify_length(1); 190 | 191 | /* an invalid wide character (UTF-16 surrogate or above 0x10FFFF): empty string and -1 */ 192 | mb[0] = '\0'; 193 | return -1; 194 | } 195 | 196 | 197 | /** \brief Convert one multi-byte character to a wide character. 198 | * 199 | * This function converts UTF-8 bytes from \p mb to one UTF-32 200 | * wide character and saves the result in \p wc. The function 201 | * automatically increases the pointer in \p mb and simultaneously 202 | * decreases the \p len parameter. 203 | * 204 | * \p wc holds the resulting wide character, a character between 205 | * `'\0'` (NUL) and `0x10FFFF` and it returns the number of bytes 206 | * that were used from \p mb. If a bad character is encountered, 207 | * then the function returns -1 and the bad sequence of bytes is 208 | * skipped so only one error will be reported for one bad sequence. 209 | * 210 | * Bad characters when converting UTF-8 to wide characters are: 211 | * 212 | * \li The stream includes bytes 0x80 to 0xBF without an introducer.
213 | * \li The stream does not include the right number of 0x80 to 0xBF 214 | * bytes after an introducer. 215 | * \li The input ends too early and cannot accommodate the last 216 | * encoded character. 217 | * \li The codes 0xF5 to 0xFF were found in the input string. 218 | * \li The resulting \p wc value would be larger than 0x10FFFF. 219 | * \li The resulting \p wc value represents a UTF-16 surrogate 220 | * value (a number between 0xD800 and 0xDFFF). 221 | * 222 | * Code points between 0xD800 and 0xDFFF are not valid characters. 223 | * These represent low and high surrogates in UTF-16 (2 are 224 | * necessary to encode one character of 17 or more bits.) 225 | * 226 | * The function returns 0 and sets \p wc to the NUL character (`U'\0'`) 227 | * if the \p len parameter is zero (i.e. empty string.) 228 | * 229 | * \note 230 | * The function converts a NUL character (`'\0'`) in the 231 | * input string as a NUL wide character (`U'\0'`) and returns 1. It 232 | * does not see the NUL character as the end of the string. 233 | * 234 | * \warning 235 | * The function does not throw on invalid input. It is the responsibility 236 | * of the caller to do so if necessary. This is useful to verify a UTF-8 237 | * string without having to catch an exception. 238 | * 239 | * \param[out] wc The output wide character variable. 240 | * \param[in,out] mb The multi-byte input string pointer, returned at the 241 | * following byte. 242 | * \param[in,out] len The number of bytes left in mb. 243 | * 244 | * \return The number of bytes read or -1 if invalid bytes were found (\p wc is then set to NOT_A_CHARACTER). 245 | */ 246 | int mbstowc(char32_t & wc, char const * & mb, std::size_t & len) 247 | { 248 | auto skip = [](char const * & skip_mb, size_t & skip_len) 249 | { 250 | for(unsigned char b(0) 251 | ; skip_len > 0 && (b = *skip_mb, (b >= 0x80 && b <= 0xBF) || b >= 0xF5) 252 | ; ++skip_mb , --skip_len); 253 | }; 254 | 255 | // already done? 
256 | // 257 | if(len <= 0) 258 | { 259 | wc = U'\0'; 260 | return 0; 261 | } 262 | 263 | // we always consume at least one byte from the source 264 | // 265 | unsigned char c(*mb++); 266 | --len; 267 | 268 | if(c < 0x80) 269 | { 270 | wc = c; 271 | return 1; 272 | } 273 | 274 | // by default return an invalid character 275 | // 276 | wc = NOT_A_CHARACTER; 277 | 278 | // invalid stream? 279 | // 280 | if((c >= 0x80 && c <= 0xBF) || c >= 0xF5) 281 | { 282 | // this is bad UTF-8, skip all the invalid bytes 283 | // 284 | skip(mb, len); 285 | return -1; 286 | } 287 | 288 | char32_t w(U'\0'); 289 | std::size_t cnt(0); 290 | 291 | if(c >= 0xF0) 292 | { 293 | w = c & 0x07; 294 | cnt = 3; 295 | } 296 | else if(c >= 0xE0) 297 | { 298 | w = c & 0x0F; 299 | cnt = 2; 300 | } 301 | else /*if(c >= 0xC0)*/ // always true so we don't have to check 302 | { 303 | w = c & 0x1F; 304 | cnt = 1; 305 | } 306 | 307 | // enough data in the input? if not, that's an error 308 | // 309 | if(len < cnt) 310 | { 311 | skip(mb, len); 312 | return -1; 313 | } 314 | len -= cnt; 315 | 316 | for(std::size_t l(cnt); l > 0; --l, mb++) 317 | { 318 | c = *mb; 319 | if(c < 0x80 || c > 0xBF) 320 | { 321 | // we got an invalid sequence! 322 | // restore whatever is left in len 323 | // 324 | len += l; 325 | return -1; 326 | } 327 | w = (w << 6) | (c & 0x3F); 328 | } 329 | 330 | if(w >= 0x110000 331 | || (w >= 0x00D800 && w <= 0x00DFFF)) 332 | { 333 | // character out of range or UTF-16 surrogate 334 | // it can happen with sequences starting with 0xF4 (0xF5 and up are rejected above) 335 | // 336 | return -1; 337 | } 338 | 339 | wc = w; 340 | 341 | return static_cast(cnt + 1); 342 | } 343 | 344 | 345 | /** \brief An overload with a non-const string. 346 | * 347 | * Since we are passing a reference to the \p mb string, whether it is 348 | * const or non-const matters to the call. So here we offer a non-const 349 | * version even though the string doesn't get modified. 350 | * 351 | * \param[out] wc The output wide character variable.
352 | * \param[in,out] mb The multi-byte input string pointer, returned at the 353 | * following byte. 354 | * \param[in,out] len The number of bytes left in mb. 355 | * 356 | * \return The number of bytes read or -1 if invalid bytes were found. 357 | */ 358 | int mbstowc(char32_t & wc, char * & mb, std::size_t & len) 359 | { 360 | return mbstowc(wc, const_cast(mb), len); 361 | } 362 | 363 | 364 | 365 | } // libutf8 namespace 366 | // vim: ts=4 sw=4 et 367 | -------------------------------------------------------------------------------- /libutf8/base.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | #pragma once 20 | 21 | /** \file 22 | * \brief The declarations of the UTF-8 library base functions. 23 | * 24 | * The functions defined in this file are used to do the actual conversions. 25 | * 26 | * They may be useful to you which is why we make them available here. 27 | * However, these are considered low level functions and you may want 28 | * to refrain from using them. 
Using the `std::string`-based functions is 29 | * much safer and what is expected of you. 30 | */ 31 | 32 | // C++ 33 | // 34 | #include 35 | 36 | 37 | namespace libutf8 38 | { 39 | 40 | 41 | 42 | constexpr std::size_t MBS_MIN_BUFFER_LENGTH = 5; 43 | constexpr char32_t const BOM_CHAR = U'\U0000FEFF'; 44 | constexpr char32_t const NOT_A_CHARACTER = static_cast(-2); 45 | 46 | int wctombs(char * mb, char32_t wc, size_t len); 47 | int mbstowc(char32_t & wc, char const * & mb, size_t & len); 48 | int mbstowc(char32_t & wc, char * & mb, size_t & len); 49 | 50 | 51 | 52 | } // libutf8 namespace 53 | // vim: ts=4 sw=4 et 54 | -------------------------------------------------------------------------------- /libutf8/caseinsensitivestring.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | #pragma once 20 | 21 | 22 | // self 23 | // 24 | #include 25 | 26 | 27 | // C++ 28 | // 29 | #include 30 | 31 | 32 | 33 | namespace libutf8 34 | { 35 | 36 | 37 | 38 | /** \brief Case insensitive string.
39 | * 40 | * This class derives from the string template, which allows you to 41 | * create case insensitive strings as far as the comparison operators 42 | * are concerned. All the other functions still work the same way. 43 | * 44 | * This is particularly useful if you manage an std::map<> with a string as 45 | * the key, a string which should not be case sensitive. 46 | * 47 | * The comparisons are done using the libutf8::u8casecmp() function. 48 | * 49 | * \sa u8casecmp() 50 | */ 51 | template< 52 | class _CharT, 53 | class _Traits = std::char_traits<_CharT>, 54 | class _Alloc = std::allocator<_CharT> 55 | > 56 | class case_insensitive_basic_string 57 | : public std::basic_string<_CharT, _Traits, _Alloc> 58 | { 59 | public: 60 | typedef typename std::basic_string<_CharT, _Traits, _Alloc>::size_type size_type; 61 | 62 | case_insensitive_basic_string() noexcept(std::is_nothrow_default_constructible<_Alloc>::value) 63 | : std::basic_string<_CharT, _Traits, _Alloc>() 64 | { 65 | } 66 | 67 | explicit case_insensitive_basic_string(_Alloc const & __a) 68 | : std::basic_string<_CharT, _Traits, _Alloc>(__a) 69 | { 70 | } 71 | 72 | case_insensitive_basic_string(size_type __n, _CharT __c, _Alloc const & __a = _Alloc()) 73 | : std::basic_string<_CharT, _Traits, _Alloc>(__n, __c, __a) 74 | { 75 | } 76 | 77 | // the following are for C++17 and over 78 | // (and then the next two constructors will not set __n) 79 | // 80 | //case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> const & __str, size_type __pos, _Alloc const & __a = _Alloc()) 81 | // : std::basic_string<_CharT, _Traits, _Alloc>(__str, __pos, __a) 82 | //{ 83 | //} 84 | // 85 | //case_insensitive_basic_string(case_insensitive_basic_string const & __str, size_type __pos, _Alloc const & __a = _Alloc()) 86 | // : std::basic_string<_CharT, _Traits, _Alloc>(static_cast const &>(__str), __pos, __a) 87 | //{ 88 | //} 89 | 90 | case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> 
const & __str, size_type __pos, size_type __n = std::basic_string<_CharT, _Traits, _Alloc>::npos, _Alloc const & __a = _Alloc()) 91 | : std::basic_string<_CharT, _Traits, _Alloc>(__str, __pos, __n, __a) 92 | { 93 | } 94 | 95 | case_insensitive_basic_string(case_insensitive_basic_string const & __str, size_type __pos, size_type __n = std::basic_string<_CharT, _Traits, _Alloc>::npos, _Alloc const & __a = _Alloc()) 96 | : std::basic_string<_CharT, _Traits, _Alloc>(__str, __pos, __n, __a) 97 | { 98 | } 99 | 100 | case_insensitive_basic_string(_CharT const * __d, size_type __n, _Alloc const & __a = _Alloc()) 101 | : std::basic_string<_CharT, _Traits, _Alloc>(__d, __n, __a) 102 | { 103 | } 104 | 105 | case_insensitive_basic_string(_CharT const * __d, _Alloc const & __a = _Alloc()) 106 | : std::basic_string<_CharT, _Traits, _Alloc>(__d, __a) 107 | { 108 | } 109 | 110 | template 111 | case_insensitive_basic_string(_InputIterator __beg, _InputIterator __end, _Alloc const & __a = _Alloc()) 112 | : std::basic_string<_CharT, _Traits, _Alloc>(__beg, __end, __a) 113 | { 114 | } 115 | 116 | case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> const & __str) 117 | : std::basic_string<_CharT, _Traits, _Alloc>(__str) 118 | { 119 | } 120 | 121 | case_insensitive_basic_string(case_insensitive_basic_string const & __str) 122 | : std::basic_string<_CharT, _Traits, _Alloc>(__str) 123 | { 124 | } 125 | // move constructors: the named parameter __str is an lvalue, so it is cast back to an rvalue (what std::move() does) to select the base move constructor instead of the copy constructor, which may throw 126 | case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> && __str) noexcept 127 | : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> &&>(__str)) 128 | { 129 | } 130 | 131 | case_insensitive_basic_string(case_insensitive_basic_string && __str) noexcept 132 | : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> &&>(__str)) 133 | { 134 | } 135 | 136 | case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> && __str, _Alloc const & __a) 137 | : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> &&>(__str), __a) 138 | { 139 | } 140 | 141 | 
case_insensitive_basic_string(case_insensitive_basic_string && __str, _Alloc const & __a) 142 | : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> &&>(__str), __a) 143 | { 144 | } 145 | 146 | case_insensitive_basic_string(std::initializer_list<_CharT> __l, _Alloc const & __a = _Alloc()) 147 | : std::basic_string<_CharT, _Traits, _Alloc>(__l, __a) 148 | { 149 | } 150 | 151 | // comparison operators: every overload delegates to libutf8::u8casecmp() and tests the sign of its result 152 | friend bool operator == (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs) 153 | { 154 | return libutf8::u8casecmp(lhs, rhs) == 0; 155 | } 156 | 157 | friend bool operator == (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs) 158 | { 159 | return libutf8::u8casecmp(lhs, rhs) == 0; 160 | } 161 | 162 | friend bool operator == (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs) 163 | { 164 | return libutf8::u8casecmp(lhs, rhs) == 0; 165 | } 166 | 167 | friend bool operator == (case_insensitive_basic_string const & lhs, _CharT const * rhs) 168 | { 169 | return libutf8::u8casecmp(lhs, rhs) == 0; 170 | } 171 | 172 | friend bool operator == (_CharT const * lhs, case_insensitive_basic_string const & rhs) 173 | { 174 | return libutf8::u8casecmp(lhs, rhs) == 0; 175 | } 176 | 177 | friend bool operator != (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs) 178 | { 179 | return libutf8::u8casecmp(lhs, rhs) != 0; 180 | } 181 | 182 | friend bool operator != (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs) 183 | { 184 | return libutf8::u8casecmp(lhs, rhs) != 0; 185 | } 186 | 187 | friend bool operator != (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs) 188 | { 189 | return libutf8::u8casecmp(lhs, rhs) != 0; 190 | } 191 | 192 | friend bool operator != (case_insensitive_basic_string const & lhs, _CharT const * rhs) 193 | { 194 | return 
libutf8::u8casecmp(lhs, rhs) != 0; 195 | } 196 | 197 | friend bool operator != (_CharT const * lhs, case_insensitive_basic_string const & rhs) 198 | { 199 | return libutf8::u8casecmp(lhs, rhs) != 0; 200 | } 201 | 202 | friend bool operator < (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs) 203 | { 204 | return libutf8::u8casecmp(lhs, rhs) < 0; 205 | } 206 | 207 | friend bool operator < (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs) 208 | { 209 | return libutf8::u8casecmp(lhs, rhs) < 0; 210 | } 211 | 212 | friend bool operator < (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs) 213 | { 214 | return libutf8::u8casecmp(lhs, rhs) < 0; 215 | } 216 | 217 | friend bool operator < (case_insensitive_basic_string const & lhs, _CharT const * rhs) 218 | { 219 | return libutf8::u8casecmp(lhs, rhs) < 0; 220 | } 221 | 222 | friend bool operator < (_CharT const * lhs, case_insensitive_basic_string const & rhs) 223 | { 224 | return libutf8::u8casecmp(lhs, rhs) < 0; 225 | } 226 | 227 | friend bool operator <= (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs) 228 | { 229 | return libutf8::u8casecmp(lhs, rhs) <= 0; 230 | } 231 | 232 | friend bool operator <= (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs) 233 | { 234 | return libutf8::u8casecmp(lhs, rhs) <= 0; 235 | } 236 | 237 | friend bool operator <= (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs) 238 | { 239 | return libutf8::u8casecmp(lhs, rhs) <= 0; 240 | } 241 | 242 | friend bool operator <= (case_insensitive_basic_string const & lhs, _CharT const * rhs) 243 | { 244 | return libutf8::u8casecmp(lhs, rhs) <= 0; 245 | } 246 | 247 | friend bool operator <= (_CharT const * lhs, case_insensitive_basic_string const & rhs) 248 | { 249 | return 
libutf8::u8casecmp(lhs, rhs) <= 0; 250 | } 251 | 252 | friend bool operator > (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs) 253 | { 254 | return libutf8::u8casecmp(lhs, rhs) > 0; 255 | } 256 | 257 | friend bool operator > (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs) 258 | { 259 | return libutf8::u8casecmp(lhs, rhs) > 0; 260 | } 261 | 262 | friend bool operator > (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs) 263 | { 264 | return libutf8::u8casecmp(lhs, rhs) > 0; 265 | } 266 | 267 | friend bool operator > (case_insensitive_basic_string const & lhs, _CharT const * rhs) 268 | { 269 | return libutf8::u8casecmp(lhs, rhs) > 0; 270 | } 271 | 272 | friend bool operator > (_CharT const * lhs, case_insensitive_basic_string const & rhs) 273 | { 274 | return libutf8::u8casecmp(lhs, rhs) > 0; 275 | } 276 | 277 | friend bool operator >= (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs) 278 | { 279 | return libutf8::u8casecmp(lhs, rhs) >= 0; 280 | } 281 | 282 | friend bool operator >= (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs) 283 | { 284 | return libutf8::u8casecmp(lhs, rhs) >= 0; 285 | } 286 | 287 | friend bool operator >= (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs) 288 | { 289 | return libutf8::u8casecmp(lhs, rhs) >= 0; 290 | } 291 | 292 | friend bool operator >= (case_insensitive_basic_string const & lhs, _CharT const * rhs) 293 | { 294 | return libutf8::u8casecmp(lhs, rhs) >= 0; 295 | } 296 | 297 | friend bool operator >= (_CharT const * lhs, case_insensitive_basic_string const & rhs) 298 | { 299 | return libutf8::u8casecmp(lhs, rhs) >= 0; 300 | } 301 | }; 302 | 303 | 304 | typedef case_insensitive_basic_string case_insensitive_string; 305 | 306 | // TODO add support for other 
types 307 | //typedef case_insensitive_basic_string case_insensitive_wstring; 308 | //typedef case_insensitive_basic_string case_insensitive_u16string; 309 | //typedef case_insensitive_basic_string case_insensitive_u32string; 310 | 311 | 312 | } 313 | // libutf8 namespace 314 | // vim: ts=4 sw=4 et 315 | -------------------------------------------------------------------------------- /libutf8/exception.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | #pragma once 20 | 21 | /** \file 22 | * \brief The declarations of the UTF-8 library. 23 | * 24 | * This file is the declarations of the UTF-8 library which are just a few 25 | * functions used to convert a string from one format to another. 
26 | */ 27 | 28 | // libexcept 29 | // 30 | #include 31 | 32 | 33 | 34 | namespace libutf8 35 | { 36 | 37 | 38 | 39 | DECLARE_LOGIC_ERROR(libutf8_logic_exception); 40 | 41 | DECLARE_MAIN_EXCEPTION(libutf8_exception); 42 | 43 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_decoding); 44 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_encoding); 45 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_invalid_parameter); 46 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_io); 47 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_missing); 48 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_overflow); 49 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_twice); 50 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_unsupported); 51 | 52 | 53 | 54 | } // libutf8 namespace 55 | // vim: ts=4 sw=4 et 56 | -------------------------------------------------------------------------------- /libutf8/iterator.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | /** \file 21 | * \brief Implementation of the UTF-8 iterator. 22 | * 23 | * This file is the implementation of the utf8_iterator class of the libutf8 24 | * library. The iterator walks a UTF-8 encoded std::string and returns 25 | * its characters as char32_t values (UTF-32), using mbstowc() to decode 26 | * the bytes found at the current position. 27 | * 28 | * Contrary to many of the system functions, these functions do not take 29 | * anything from the system into account (the locale can be anything, it does 30 | * not change the exact behavior of these functions). 31 | * 32 | * Although similar functionality is found on Unices and MS-Windows, it was 33 | * simpler to just implement these few functions than to try to have a 34 | * converter that is sure not to use a locale and this way we can use 35 | * standard strings (std::string and std::wstring) instead of having to 36 | * call C functions. 37 | */ 38 | 39 | // self 40 | // 41 | #include "libutf8/iterator.h" 42 | 43 | #include "libutf8/base.h" 44 | #include "libutf8/libutf8.h" 45 | 46 | 47 | // C++ 48 | // 49 | #include 50 | 51 | 52 | // last include 53 | // 54 | #include 55 | 56 | 57 | 58 | namespace libutf8 59 | { 60 | 61 | // the iterator keeps a pointer to \p str (not a copy), so the string must outlive the iterator 62 | // it starts at byte offset str.length() when \p end is true and at offset 0 otherwise 63 | utf8_iterator::utf8_iterator(std::string const & str, bool end) 64 | : f_str(&str) 65 | , f_pos(end ? str.length() : 0) 66 | , f_start_pos(f_pos) 67 | { 68 | } 69 | 70 | 71 | utf8_iterator & utf8_iterator::operator ++ () // pre-increment 72 | { 73 | increment(); 74 | return *this; 75 | } 76 | 77 | 78 | utf8_iterator utf8_iterator::operator ++ (int) // post-increment 79 | { 80 | utf8_iterator it(*this); 81 | increment(); 82 | return it; 83 | } 84 | 85 | 86 | utf8_iterator & utf8_iterator::operator -- () // pre-decrement 87 | { 88 | decrement(); 89 | return *this; 90 | } 91 | 92 | 93 | utf8_iterator utf8_iterator::operator -- (int) // post-decrement 94 | { 95 | utf8_iterator it(*this); 96 | decrement(); 97 | return it; 98 | } 99 | 100 | 101 | /** \brief Read the current character.
102 | * 103 | * This function reads the current character and returns it as a char32_t 104 | * (i.e. UTF-32). 105 | * 106 | * When the iterator is at the end of the input string (it == str.end()), 107 | * then the function returns libutf8::EOS (-1). 108 | * 109 | * When the current character is valid, the value is any number from 0 to 110 | * 0x10FFFF except for UTF-16 surrogate values (0xD800 to 0xDFFF). 111 | * 112 | * When the current character is invalid (bad UTF-8 encoding, although 113 | * extended UTF-8 is accepted here), then the function returns 114 | * libutf8::NOT_A_CHARACTER (-2). Further, the good flag is also set to 115 | * false, which means good() returns false and bad() returns true. 116 | * 117 | * \code 118 | * for(libutf8::utf8_iterator it(s); it != s.end(); ++it) 119 | * { 120 | * char32_t c(*it); 121 | * 122 | * // here you can choose: 123 | * if(c == libutf8::NOT_A_CHARACTER) 124 | * { 125 | * // handle error -- current character is not valid UTF-8 126 | * break; 127 | * } 128 | * // -- or -- 129 | * if(it.bad()) 130 | * { 131 | * // handle error -- current character is not valid UTF-8 132 | * break; 133 | * } 134 | * } 135 | * \endcode 136 | * 137 | * Since this function returns EOS when the iterator is at the end of 138 | * the string, you can also stop the iteration process like so: 139 | * 140 | * \code 141 | * libutf8::utf8_iterator it(s); 142 | * for(;;) 143 | * { 144 | * char32_t c(*it); 145 | * if(c == libutf8::EOS) 146 | * { 147 | * // success, all characters were valid 148 | * break; 149 | * } 150 | * ...handle other cases as above, then advance with ++it... 151 | * } 152 | * \endcode 153 | * 154 | * \return EOS if at the end of the string, the current character as a 155 | * char32_t value or NOT_A_CHARACTER if the current character encoding is 156 | * wrong.
157 | * 158 | * \sa good() 159 | * \sa bad() 160 | */ 161 | char32_t utf8_iterator::operator * () const 162 | { 163 | if(f_pos >= f_str->length()) 164 | { 165 | return EOS; 166 | } 167 | char const * s(f_str->c_str() + f_pos); 168 | char32_t wc(NOT_A_CHARACTER); 169 | size_t len(f_str->length() - f_pos); 170 | if(mbstowc(wc, s, len) < 0) 171 | { 172 | f_good = false; 173 | } 174 | return wc; 175 | } 176 | 177 | // two utf8_iterator objects are compared by byte offset only; both are assumed to iterate over the same string 178 | bool utf8_iterator::operator == (utf8_iterator const & rhs) const 179 | { 180 | return f_pos == rhs.f_pos; 181 | } 182 | 183 | 184 | bool utf8_iterator::operator != (utf8_iterator const & rhs) const 185 | { 186 | return f_pos != rhs.f_pos; 187 | } 188 | 189 | // a std::string iterator is converted to a byte offset from the beginning of the iterated string before the comparison 190 | bool utf8_iterator::operator == (std::string::iterator it) const 191 | { 192 | return static_cast(it - f_str->begin()) == f_pos; 193 | } 194 | 195 | 196 | bool utf8_iterator::operator != (std::string::iterator it) const 197 | { 198 | return static_cast(it - f_str->begin()) != f_pos; 199 | } 200 | 201 | 202 | bool utf8_iterator::operator == (std::string::const_iterator it) const 203 | { 204 | return static_cast(it - f_str->cbegin()) == f_pos; 205 | } 206 | 207 | 208 | bool utf8_iterator::operator != (std::string::const_iterator it) const 209 | { 210 | return static_cast(it - f_str->cbegin()) != f_pos; 211 | } 212 | 213 | // same comparisons with the std::string iterator on the left hand side 214 | bool operator == (std::string::iterator it, utf8_iterator const & rhs) 215 | { 216 | return static_cast(it - rhs.f_str->begin()) == rhs.f_pos; 217 | } 218 | 219 | 220 | bool operator != (std::string::iterator it, utf8_iterator const & rhs) 221 | { 222 | return static_cast(it - rhs.f_str->begin()) != rhs.f_pos; 223 | } 224 | 225 | 226 | bool operator == (std::string::const_iterator it, utf8_iterator const & rhs) 227 | { 228 | return static_cast(it - rhs.f_str->cbegin()) == rhs.f_pos; 229 | } 230 | 231 | 232 | bool operator != (std::string::const_iterator it, utf8_iterator const & rhs) 233 | { 234 | return static_cast(it - rhs.f_str->cbegin()) != rhs.f_pos; 235 | } 236 | 
237 | 238 | void utf8_iterator::increment() 239 | { 240 | auto skip = [&]() 241 | { 242 | for(unsigned char b(0) 243 | ; f_pos < f_str->length() 244 | && (b = static_cast(f_str[0][f_pos]), 245 | (b >= 0x80 && b <= 0xBF) || b >= 0xF5) 246 | ; ++f_pos); 247 | f_good = false; 248 | }; 249 | 250 | if(f_pos >= f_str->length()) 251 | { 252 | return; 253 | } 254 | 255 | // increment is easy we can just get the current character and we know 256 | // the size of the character in UTF-8 257 | // 258 | unsigned char c(static_cast(f_str[0][f_pos])); 259 | 260 | if(c < 0x80) 261 | { 262 | ++f_pos; 263 | } 264 | else if(c <= 0xBF || c >= 0xF5) 265 | { 266 | // ?! invalid UTF-8 ?! 267 | // 268 | skip(); 269 | } 270 | else if(c >= 0xF0) 271 | { 272 | f_pos += 4; 273 | if(c == 0xF4 && f_pos - 3 < f_str->length()) 274 | { 275 | c = static_cast(f_str[0][f_pos - 3]); 276 | if(c >= 0x90) 277 | { 278 | f_pos -= 3; 279 | skip(); 280 | } 281 | } 282 | } 283 | else if(c >= 0xE0) 284 | { 285 | f_pos += 3; 286 | } 287 | else /*if(c >= 0xC0)*/ // always true so we don't have to check 288 | { 289 | f_pos += 2; 290 | } 291 | if(f_pos > f_str->length()) 292 | { 293 | f_pos = f_str->length(); 294 | f_good = false; 295 | } 296 | } 297 | 298 | 299 | /** \brief Decrement the iterator. 300 | * 301 | * If the iterator is not already at position 0, decrement it to the previous 302 | * UTF-8 character. This means skipping to the first UTF-8 byte. 303 | * 304 | * \note 305 | * Contrary to the increment(), this function does not set the good flag to 306 | * true or false whether it is at the start or there is an invalid character. 
307 | */ 308 | void utf8_iterator::decrement() 309 | { 310 | if(f_pos == 0) 311 | { 312 | return; 313 | } 314 | 315 | // decrement requires us to search for the previous starting byte 316 | // which means we need to scan the string 317 | // 318 | while(f_pos > 0) 319 | { 320 | --f_pos; 321 | unsigned char c(static_cast(f_str[0][f_pos])); 322 | if(c < 0x80 323 | || c >= 0xC0) 324 | { 325 | break; 326 | } 327 | } 328 | } 329 | 330 | 331 | /** \brief Compute the distance between two iterators. 332 | * 333 | * This function computers the distance between two libutf8 iterators. 334 | * 335 | * The right hand side iterator must be from the same string as the 336 | * lhs string. 337 | * 338 | * \return The distance between the two iterators. 339 | */ 340 | utf8_iterator::difference_type utf8_iterator::operator - (utf8_iterator const & rhs) const 341 | { 342 | return f_pos - rhs.f_pos; 343 | } 344 | 345 | 346 | /** \brief Compute the distance between two iterators. 347 | * 348 | * This operator computes the difference between this iterator and the 349 | * specified \p it iterator. 350 | * 351 | * \param[in] it The iterator to calculate the distance from. 352 | * 353 | * \return The distance between the two iterators. 354 | */ 355 | utf8_iterator::difference_type utf8_iterator::operator - (std::string::const_iterator it) const 356 | { 357 | return static_cast(f_str->cbegin() + f_pos - it); 358 | } 359 | 360 | 361 | /** \brief Compute the distance between two iterators. 362 | * 363 | * This operator computes the difference between the two specified iterators 364 | * \p it and \p rhs. 365 | * 366 | * \param[in] it The iterator to calculate the distance from. 367 | * \param[in] rhs The iterator to calculate the distance to. 368 | * 369 | * \return The distance between the two specified iterators. 
370 | */ 371 | utf8_iterator::difference_type operator - (std::string::const_iterator it, utf8_iterator const & rhs) 372 | { 373 | return static_cast(it - rhs.f_str->cbegin() - rhs.f_pos); 374 | } 375 | 376 | 377 | /** \brief Restart the iterator. 378 | * 379 | * The iterator started at 0 or the end of the string, then you moved it 380 | * using the `++` or `--` operators. Later you may want to re-parse the 381 | * string from the start or end of the string. 382 | * 383 | * This function resets the position back to 0 or the end as defined on 384 | * the constructor. 385 | */ 386 | void utf8_iterator::rewind() 387 | { 388 | f_pos = f_start_pos; 389 | } 390 | 391 | 392 | /** \brief Clear the errors. 393 | * 394 | * The iterator is considered good by default. If you try to retreive 395 | * a character after the end of the string being iterated or the 396 | * bytes do not represent an invalid UTF-8 character. 397 | * 398 | * \sa good() 399 | * \sa bad() 400 | */ 401 | void utf8_iterator::clear() 402 | { 403 | f_good = true; 404 | } 405 | 406 | 407 | /** \brief Check whether the iterator did not run in an error. 408 | * 409 | * The iterator remains good as long as the input characters are valid 410 | * and the end of the string is not reached. After either event, this 411 | * function returns false. 412 | * 413 | * You can clear this flag by calling the clear() function. 414 | * 415 | * \return true if no errors were encountered so far. 416 | * 417 | * \sa clear() 418 | * \sa bad() 419 | */ 420 | bool utf8_iterator::good() const 421 | { 422 | return f_good; 423 | } 424 | 425 | 426 | /** \brief Check whether the iterator ran in an error. 427 | * 428 | * This function returns true if an invalid character or the end of the 429 | * string was found. 430 | * 431 | * \return true if an error condition was encountered. 
432 | * 433 | * \sa clear() 434 | * \sa good() 435 | */ 436 | bool utf8_iterator::bad() const 437 | { 438 | return !f_good; 439 | } 440 | 441 | 442 | 443 | } // libutf8 namespace 444 | // vim: ts=4 sw=4 et 445 | -------------------------------------------------------------------------------- /libutf8/iterator.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | #pragma once 20 | 21 | /** \file 22 | * \brief The declarations of the UTF-8 library. 23 | * 24 | * This file is the declarations of the UTF-8 library which are just a few 25 | * functions used to convert a string from one format to another. 
26 | */ 27 | 28 | // C++ 29 | // 30 | #include 31 | 32 | 33 | 34 | namespace libutf8 35 | { 36 | 37 | /** \brief Value returned by utf8_iterator::operator * () when the position is at or past the end of the string. */ 38 | constexpr char32_t EOS = static_cast(EOF); 39 | 40 | /** \brief Bidirectional iterator reading a UTF-8 std::string one char32_t at a time. The iterator keeps a pointer to the string given to the constructor; it does not copy that string. */ 41 | class utf8_iterator 42 | { 43 | public: 44 | // Iterator traits 45 | // 46 | typedef std::bidirectional_iterator_tag iterator_category; 47 | typedef char32_t value_type; 48 | typedef ssize_t difference_type; 49 | typedef char32_t const * pointer; 50 | typedef char32_t const & reference; 51 | 52 | utf8_iterator(std::string const & str, bool end = false); 53 | 54 | utf8_iterator & operator ++ (); 55 | utf8_iterator operator ++ (int); 56 | utf8_iterator & operator -- (); 57 | utf8_iterator operator -- (int); 58 | value_type operator * () const; 59 | bool operator == (utf8_iterator const & rhs) const; 60 | bool operator != (utf8_iterator const & rhs) const; 61 | bool operator == (std::string::iterator it) const; 62 | bool operator != (std::string::iterator it) const; 63 | bool operator == (std::string::const_iterator it) const; 64 | bool operator != (std::string::const_iterator it) const; 65 | friend bool operator == (std::string::iterator it, utf8_iterator const & rhs); 66 | friend bool operator != (std::string::iterator it, utf8_iterator const & rhs); 67 | friend bool operator == (std::string::const_iterator it, utf8_iterator const & rhs); 68 | friend bool operator != (std::string::const_iterator it, utf8_iterator const & rhs); 69 | difference_type operator - (utf8_iterator const & rhs) const; 70 | difference_type operator - (std::string::const_iterator it) const; 71 | friend difference_type operator - (std::string::const_iterator it, utf8_iterator const & rhs); 72 | 73 | void rewind(); 74 | void clear(); 75 | bool good() const; 76 | bool bad() const; 77 | 78 | private: 79 | void increment(); 80 | void decrement(); 81 | 82 | std::string const * f_str = nullptr; /* string being iterated, not owned */ 83 | std::string::size_type f_pos = 0; /* current byte offset in *f_str */ 84 | std::string::size_type f_start_pos = 0; /* byte offset rewind() goes back to */ 85 | mutable bool f_good = true; /* mutable so operator * () const can record an error */ 86 | }; 87 | 88 | 89 | 90 | } 
// libutf8 namespace 91 | // vim: ts=4 sw=4 et 92 | -------------------------------------------------------------------------------- /libutf8/json_tokens.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | #pragma once 20 | 21 | /** \file 22 | * \brief The declarations of the JSON tokens class. 23 | * 24 | * This file is the declarations of the JSON tokens class one can use to 25 | * interpret the contents of a JSON file. 26 | * 27 | * The idea of this simple parser is to (1) show how one can use the 28 | * libutf8 library and (2) give you the ability to parse a simple JSON 29 | * structure. 
30 | */ 31 | 32 | // self 33 | // 34 | #include 35 | 36 | 37 | // C++ 38 | // 39 | #include 40 | #include 41 | #include 42 | 43 | 44 | 45 | namespace libutf8 46 | { 47 | 48 | /** \brief Kinds of token returned by json_tokens::next_token(). */ 49 | enum class token_t 50 | { 51 | TOKEN_END, 52 | TOKEN_ERROR, 53 | TOKEN_OPEN_ARRAY, 54 | TOKEN_CLOSE_ARRAY, 55 | TOKEN_OPEN_OBJECT, 56 | TOKEN_CLOSE_OBJECT, 57 | TOKEN_NUMBER, 58 | TOKEN_STRING, 59 | TOKEN_COMMA, 60 | TOKEN_COLON, 61 | TOKEN_TRUE, 62 | TOKEN_FALSE, 63 | TOKEN_NULL, 64 | }; 65 | 66 | /** \brief JSON tokenizer; each call to next_token() returns one token_t. NOTE(review): the member functions are defined outside of this header; presumably number(), string() and error() return f_number, f_string and f_error for the last token -- confirm against the implementation. line() and column() return int although the counters are stored as std::uint32_t. */ 67 | class json_tokens 68 | { 69 | public: 70 | json_tokens(std::string const & input); 71 | 72 | int line() const; 73 | int column() const; 74 | token_t next_token(); 75 | double number() const; 76 | std::string const & string() const; 77 | std::string const & error() const; 78 | 79 | private: 80 | char32_t getc(); 81 | void ungetc(char32_t c); 82 | char32_t char16(char32_t & c); 83 | void add_error_character(char32_t c); 84 | 85 | std::string f_input = std::string(); 86 | utf8_iterator f_iterator; // initialize in the constructor 87 | char32_t f_unget[16]; 88 | std::size_t f_unget_pos = 0; 89 | std::uint32_t f_line = 1; 90 | std::uint32_t f_last_line = 0; 91 | std::uint32_t f_column = 1; 92 | std::uint32_t f_last_column = 0; 93 | double f_number = 0.0; 94 | std::string f_string = std::string(); 95 | std::string f_error = std::string(); 96 | }; 97 | 98 | 99 | } // libutf8 namespace 100 | // vim: ts=4 sw=4 et 101 | -------------------------------------------------------------------------------- /libutf8/libutf8.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. 
All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | #pragma once 20 | 21 | /** \file 22 | * \brief The declarations of the UTF-8 library. 23 | * 24 | * This file is the declarations of the UTF-8 library which are just a few 25 | * functions used to convert a string from one format to another. 
26 | */ 27 | 28 | // C++ 29 | // 30 | #include 31 | 32 | 33 | 34 | namespace libutf8 35 | { 36 | 37 | /** \brief Kinds of byte order mark; this is the return type of start_with_bom(). */ 38 | enum class bom_t 39 | { 40 | BOM_NONE, 41 | BOM_UTF8, 42 | BOM_UTF16_LE, 43 | BOM_UTF16_BE, 44 | BOM_UTF32_LE, 45 | BOM_UTF32_BE 46 | }; 47 | 48 | /** \brief Surrogate classification; this is the return type of is_surrogate(). */ 49 | enum class surrogate_t 50 | { 51 | SURROGATE_NO, 52 | SURROGATE_HIGH, 53 | SURROGATE_LOW 54 | }; 55 | 56 | 57 | 58 | 59 | bool is_valid_ascii(char c, bool ctrl = true); 60 | bool is_valid_ascii(char const * str, bool ctrl = true); 61 | bool is_valid_ascii(std::string const & str, bool ctrl = true); 62 | bool is_valid_utf8(char const * str); 63 | bool is_valid_utf8(std::string const & str); 64 | bool is_valid_utf16(std::u16string const & str); 65 | bool is_valid_unicode(char32_t const wc, bool ctrl = true); 66 | bool is_valid_unicode(char32_t const * str, bool ctrl = true); 67 | bool is_valid_unicode(std::u32string const & str, bool ctrl = true); 68 | surrogate_t is_surrogate(char32_t wc); 69 | bom_t start_with_bom(char const * str, size_t len); 70 | std::string to_u8string(std::u32string const & str); 71 | std::string to_u8string(std::u16string const & str); 72 | std::string to_u8string(std::wstring const & str); 73 | std::string to_u8string(wchar_t one, wchar_t two = L'\0'); 74 | std::string to_u8string(char16_t one, char16_t two = u'\0'); 75 | std::string to_u8string(char32_t const wc); 76 | std::u16string to_u16string(char32_t const wc); 77 | std::u16string to_u16string(std::string const & str); 78 | std::u32string to_u32string(std::string const & str); 79 | std::size_t u8length(std::string const & str); 80 | ssize_t u16length(std::u16string const & str); 81 | int u8casecmp(std::string const & lhs, std::string const & rhs); 82 | bool make_u8string_valid(std::string & str, char32_t fix_char = U'?'); 83 | 84 | 85 | 86 | } // libutf8 namespace 87 | 88 | /** \brief Concatenate: \p wc converted with libutf8::to_u8string() followed by \p rhs. */ 89 | inline std::string operator + (char32_t wc, std::string const & rhs) 90 | { 91 | std::string v; 92 | v = libutf8::to_u8string(wc); 93 | return v + rhs; 94 
| } 95 | 96 | /** \brief Concatenate: \p lhs followed by \p wc converted with libutf8::to_u8string(). */ 97 | inline std::string operator + (std::string const & lhs, char32_t wc) 98 | { 99 | std::string v; 100 | v = libutf8::to_u8string(wc); 101 | return lhs + v; 102 | } 103 | 104 | /** \brief Append \p wc converted with libutf8::to_u8string() to \p lhs and return \p lhs. */ 105 | inline std::string & operator += (std::string & lhs, char32_t wc) 106 | { 107 | return lhs += libutf8::to_u8string(wc); 108 | } 109 | 110 | /** \brief Append an int: \p c goes through a static_cast and the result is appended with another operator +=. */ 111 | inline std::string & operator += (std::string & lhs, int c) 112 | { 113 | return lhs += static_cast(c); 114 | } 115 | 116 | /** \brief Append an unsigned int: \p c goes through a static_cast and the result is appended with another operator +=. */ 117 | inline std::string & operator += (std::string & lhs, unsigned int c) 118 | { 119 | return lhs += static_cast(c); 120 | } 121 | 122 | /** \brief Append a long: \p c goes through a static_cast and the result is appended with another operator +=. */ 123 | inline std::string & operator += (std::string & lhs, long c) 124 | { 125 | return lhs += static_cast(c); 126 | } 127 | 128 | /** \brief Append an unsigned long: \p c goes through a static_cast and the result is appended with another operator +=. */ 129 | inline std::string & operator += (std::string & lhs, unsigned long c) 130 | { 131 | return lhs += static_cast(c); 132 | } 133 | 134 | 135 | 136 | // vim: ts=4 sw=4 et 137 | -------------------------------------------------------------------------------- /libutf8/unicode_data.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 
15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | /** \file 21 | * \brief Implementation of the UTF-8 functions. 22 | * 23 | * This file is the implementation of the UTF-8 functions of the libutf8 24 | * library. It simply is a set of functions to convert between different 25 | * character sets in a lossless manner. At this point it supports UTF-8, 26 | * UCS-4, and UTF-16 formats. 27 | * 28 | * Contrary to many of the system functions, these functions do not take 29 | * anything from the system into account (the locale can be anything, it does 30 | * not change the exact behavior of these functions.) 31 | * 32 | * Although similar functionality is found on Unices and MS-Windows, it was 33 | * simpler to just implement these few functions than to try to have a 34 | * converter that is sure not to use a locale and this way we can use 35 | * standard strings (std::string and std::wstring) instead of having to 36 | * call C functions. 37 | */ 38 | 39 | // self 40 | // 41 | #include "libutf8/unicode_data.h" 42 | 43 | #include "libutf8/exception.h" 44 | #include "libutf8/libutf8.h" 45 | #include "libutf8/unicode_data_file.h" 46 | 47 | 48 | // C++ 49 | // 50 | #include 51 | #include 52 | 53 | 54 | // last include 55 | // 56 | #include 57 | 58 | 59 | 60 | /** \brief Name space of the UTF-8 library. 61 | * 62 | * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings 63 | * (MS-Windows) and vice versa. 
64 | */ 65 | namespace libutf8 66 | { 67 | 68 | 69 | namespace 70 | { 71 | 72 | 73 | 74 | 75 | 76 | class private_unicode_character 77 | : public unicode_character 78 | { 79 | public: 80 | private_unicode_character( 81 | char32_t code 82 | , detail::ucd_header * h); 83 | 84 | protected: 85 | virtual detail::ucd_character * 86 | ucd_character_pointer() const override; 87 | 88 | private: 89 | detail::ucd_character 90 | f_private_character = detail::ucd_character(); 91 | }; 92 | 93 | 94 | private_unicode_character::private_unicode_character( 95 | char32_t code 96 | , detail::ucd_header * h) 97 | : unicode_character(code, &f_private_character, h) 98 | { 99 | f_private_character.f_code = code; 100 | f_private_character.f_flags = detail::UCD_FLAG_PRIVATE; 101 | f_private_character.f_general_category = General_Category::GC_Private_Use; 102 | f_private_character.f_bidi_class = Bidi_Class::BC_Left_To_Right; 103 | } 104 | 105 | 106 | detail::ucd_character * private_unicode_character::ucd_character_pointer() const 107 | { 108 | return const_cast(&f_private_character); 109 | } 110 | 111 | 112 | 113 | } // no name namespace 114 | 115 | 116 | 117 | 118 | 119 | 120 | unicode_character::unicode_character( 121 | char32_t code 122 | , detail::ucd_character * c 123 | , detail::ucd_header * h) 124 | : f_code(code) 125 | , f_character(c) 126 | , f_header(h) 127 | { 128 | } 129 | 130 | 131 | unicode_character::~unicode_character() 132 | { 133 | } 134 | 135 | 136 | unicode_character::unicode_character(unicode_character const & rhs) 137 | { 138 | // this looks weird, but it works as expected 139 | // 140 | f_character = rhs.f_character; 141 | f_character = ucd_character_pointer(); 142 | f_header = rhs.f_header; 143 | } 144 | 145 | 146 | unicode_character & unicode_character::operator = (unicode_character const & rhs) 147 | { 148 | // this looks weird, but it works as expected 149 | // 150 | f_character = rhs.f_character; 151 | f_character = ucd_character_pointer(); 152 | f_header = 
rhs.f_header; 153 | 154 | return *this; 155 | } 156 | 157 | 158 | bool unicode_character::is_valid() const 159 | { 160 | return is_valid_unicode(f_code); 161 | } 162 | 163 | 164 | bool unicode_character::is_defined() const 165 | { 166 | return f_character->f_code != NOT_A_CHARACTER; 167 | } 168 | 169 | 170 | bool unicode_character::is_private() const 171 | { 172 | return (f_character->f_flags & detail::UCD_FLAG_PRIVATE) != 0; 173 | } 174 | 175 | 176 | General_Category unicode_character::category() const 177 | { 178 | return f_character->f_general_category; 179 | } 180 | 181 | 182 | bool unicode_character::is_letter() const 183 | { 184 | return f_character->f_general_category >= General_Category::GC_Uppercase_Letter 185 | && f_character->f_general_category <= General_Category::GC_Other_Letter; 186 | } 187 | 188 | 189 | bool unicode_character::is_mark() const 190 | { 191 | return f_character->f_general_category >= General_Category::GC_Nonspacing_Mark 192 | && f_character->f_general_category <= General_Category::GC_Enclosing_Mark; 193 | } 194 | 195 | 196 | bool unicode_character::is_number() const 197 | { 198 | return f_character->f_general_category >= General_Category::GC_Decimal_Number 199 | && f_character->f_general_category <= General_Category::GC_Other_Number; 200 | } 201 | 202 | 203 | bool unicode_character::is_punctuation() const 204 | { 205 | return f_character->f_general_category >= General_Category::GC_Connector_Punctuation 206 | && f_character->f_general_category <= General_Category::GC_Other_Punctuation; 207 | } 208 | 209 | 210 | bool unicode_character::is_symbol() const 211 | { 212 | return f_character->f_general_category >= General_Category::GC_Math_Symbol 213 | && f_character->f_general_category <= General_Category::GC_Other_Symbol; 214 | } 215 | 216 | 217 | bool unicode_character::is_separator() const 218 | { 219 | return f_character->f_general_category >= General_Category::GC_Space_Separator 220 | && f_character->f_general_category <= 
General_Category::GC_Paragraph_Separator; 221 | } 222 | 223 | 224 | bool unicode_character::is_other() const 225 | { 226 | return f_character->f_general_category >= General_Category::GC_Control 227 | && f_character->f_general_category <= General_Category::GC_Unassigned; 228 | } 229 | 230 | 231 | 232 | Canonical_Combining_Class unicode_character::combining_class() 233 | { 234 | return f_character->f_canonical_combining_class; 235 | } 236 | 237 | 238 | Bidi_Class unicode_character::bidi_class() const 239 | { 240 | return f_character->f_bidi_class; 241 | } 242 | 243 | 244 | bool unicode_character::is_bidi_mirrored() const 245 | { 246 | return (f_character->f_flags & detail::UCD_FLAG_BIDI_MIRROR) != 0; 247 | } 248 | 249 | 250 | Decomposition_Type unicode_character::decomposition_type() const 251 | { 252 | return static_cast(f_character->f_decomposition_type); 253 | } 254 | 255 | 256 | Numeric_Type unicode_character::numeric() const 257 | { 258 | if((f_character->f_flags & detail::UCD_FLAG_DIGIT) != 0) 259 | { 260 | return Numeric_Type::NT_Digit; 261 | } 262 | 263 | if((f_character->f_flags & detail::UCD_FLAG_DECIMAL) != 0) 264 | { 265 | return Numeric_Type::NT_Decimal; 266 | } 267 | 268 | if((f_character->f_flags & detail::UCD_FLAG_NUMERIC) != 0) 269 | { 270 | return Numeric_Type::NT_Numeric; 271 | } 272 | 273 | return Numeric_Type::NT_Unknown; 274 | } 275 | 276 | 277 | std::int64_t unicode_character::get_number(int index) const 278 | { 279 | std::size_t length(0); 280 | char const * name(find_name(detail::Name_Type::NT_Numeric, length)); 281 | if(name == nullptr) 282 | { 283 | return 0; 284 | } 285 | if(length != 16) 286 | { 287 | // someone tempered with the database? 
288 | // 289 | throw libutf8_logic_exception("invalid \"name\" size for a number"); 290 | } 291 | std::int64_t const * number(reinterpret_cast(name)); 292 | return number[index]; 293 | } 294 | 295 | 296 | std::int64_t unicode_character::nominator() const 297 | { 298 | return get_number(0); 299 | } 300 | 301 | 302 | std::int64_t unicode_character::denominator() const 303 | { 304 | return get_number(1); 305 | } 306 | 307 | 308 | char const * unicode_character::find_name(detail::Name_Type type, std::size_t & length) const 309 | { 310 | if(f_character->f_names == 0) 311 | { 312 | throw libutf8_logic_exception("character is missing a name"); 313 | } 314 | 315 | char const * name(reinterpret_cast(f_header) 316 | + f_header->f_strings + f_character->f_names); 317 | for(;;) 318 | { 319 | detail::Name_Type const t(static_cast(name[0])); 320 | if(t == detail::Name_Type::NT_EndOfNames) 321 | { 322 | length = 0; 323 | return nullptr; 324 | } 325 | length = static_cast(name[1]); 326 | if(t == type) 327 | { 328 | return name + 2; 329 | } 330 | name += length + 2; 331 | } 332 | } 333 | 334 | 335 | detail::ucd_character * unicode_character::ucd_character_pointer() const 336 | { 337 | return f_character; 338 | } 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | } // libutf8 namespace 347 | // vim: ts=4 sw=4 et 348 | -------------------------------------------------------------------------------- /libutf8/unicode_data.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 
10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | #pragma once 20 | 21 | /** \file 22 | * \brief The declarations of the UTF-8 library. 23 | * 24 | * This file is the declarations of the UTF-8 library which are just a few 25 | * functions used to convert a string from one format to another. 26 | */ 27 | 28 | // self 29 | // 30 | #include 31 | 32 | 33 | // C++ 34 | // 35 | #include 36 | #include 37 | 38 | 39 | 40 | namespace libutf8 41 | { 42 | 43 | 44 | namespace detail 45 | { 46 | class unicode_data_impl; 47 | class parser_impl; 48 | class ucd_header; 49 | class ucd_character; 50 | enum class Name_Type : std::uint8_t; 51 | } // detail namespace 52 | 53 | 54 | 55 | enum class General_Category : std::uint8_t 56 | { 57 | GC_Unknown_Category = 0, 58 | 59 | // GC_Letter = 1 to 6 // L 60 | GC_Uppercase_Letter = 1, // Lu 61 | GC_Lowercase_Letter = 2, // Ll 62 | GC_TitleCase_Letter = 3, // Lt 63 | GC_Cased_Letter = 4, // LC 64 | GC_Modified_Letter = 5, // Lm 65 | GC_Other_Letter = 6, // Lo 66 | 67 | // GC_Mark = 7 to 9 // M 68 | GC_Nonspacing_Mark = 7, // Mn 69 | GC_Spacing_Mark = 8, // Mc 70 | GC_Enclosing_Mark = 9, // Me 71 | 72 | // GC_Number = 10 to 12 // N 73 | GC_Decimal_Number = 10, // Nd 74 | GC_Letter_Number = 11, // Nl 75 | GC_Other_Number = 12, // No 76 | 77 | // GC_Punctuation = 13 to 19 // P 78 | GC_Connector_Punctuation = 13, // Pc 79 | GC_Dash_Punctuation = 14, // Pd 80 | GC_Open_Punctuation = 15, // Ps 81 | GC_Close_Punctuation = 16, // Pe 82 | GC_Initial_Punctuation = 17, // Pi 83 | 
GC_Final_Punctuation = 18, // Pf 84 | GC_Other_Punctuation = 19, // Po 85 | 86 | // GC_Symbol = 20 to 23 // S 87 | GC_Math_Symbol = 20, // Sm 88 | GC_Current_Symbol = 21, // Sc 89 | GC_Modifier_Symbol = 22, // Sk 90 | GC_Other_Symbol = 23, // So 91 | 92 | // GC_Separator = 24 to 26 // Z 93 | GC_Space_Separator = 24, // Zs 94 | GC_Line_Separator = 25, // Zl 95 | GC_Paragraph_Separator = 26, // Zp 96 | 97 | // GC_Other = 27 to 31 // C 98 | GC_Control = 27, // Cc 99 | GC_Format = 28, // Cf 100 | GC_Surrogate = 29, // Cs 101 | GC_Private_Use = 30, // Co 102 | GC_Unassigned = 31, // Cn 103 | }; 104 | 105 | 106 | enum class Canonical_Combining_Class : std::uint8_t 107 | { 108 | CCC_Not_Reordered = 0, 109 | 110 | // Fixed position classes 111 | CCC_Overlay = 1, 112 | CCC_Han_Reading = 6, 113 | CCC_Nukta = 7, 114 | CCC_Kana_Voicing = 8, 115 | CCC_Virama = 9, 116 | CCC_Ccc10 = 10, // first CCC 117 | // ... not specifically defined ... 118 | CCC_Ccc199 = 199, // last CCC 119 | 120 | // Other classes 121 | CCC_Attached_Below_Left = 200, 122 | CCC_Attached_Below = 202, 123 | CCC_Attached_Above = 214, 124 | CCC_Attached_Above_Right = 216, 125 | CCC_Below_Left = 218, 126 | CCC_Below = 220, 127 | CCC_Below_Right = 222, 128 | CCC_Left = 224, 129 | CCC_Right = 226, 130 | CCC_Above_Left = 228, 131 | CCC_Above = 230, 132 | CCC_Above_Right = 232, 133 | CCC_Double_Below = 233, 134 | CCC_Double_Above = 234, 135 | CCC_Iota_Subscript = 240, 136 | }; 137 | 138 | 139 | enum class Bidi_Class : std::uint8_t 140 | { 141 | BC_Unknown = 0, 142 | 143 | // Strong Types 144 | BC_Left_To_Right = 10, // L 145 | BC_Right_To_Left = 11, // R 146 | BC_Arabic_Letter = 12, // AL 147 | 148 | // Weak Types 149 | BC_European_Number = 20, // EN 150 | BC_European_Separator = 21, // ES 151 | BC_European_Terminator = 22, // ET 152 | BC_Arabic_Number = 23, // AN 153 | BC_Common_Separator = 24, // CS 154 | BC_Nonspacing_Mark = 25, // NSM 155 | BC_Boundary_Neutral = 26, // BN 156 | 157 | // Neutral Types 158 | 
BC_Paragraph_Separator = 30, // B 159 | BC_Segment_Separator = 31, // S 160 | BC_White_Space = 32, // WS 161 | BC_Other_Neutral = 33, // ON 162 | 163 | // Explicit Formatting Types 164 | BC_Left_To_Right_Embedding = 40, // LRE 165 | BC_Left_To_Right_Override = 41, // LRO 166 | BC_Right_To_Left_Embedding = 42, // RLE 167 | BC_Right_To_Left_Override = 43, // RLO 168 | BC_Pop_Directional_Format = 44, // PDF 169 | BC_Left_To_Right_Isolate = 45, // LRI 170 | BC_Right_To_Left_Isolate = 46, // RLI 171 | BC_First_Strong_Isolate = 47, // FSI 172 | BC_Pop_Directional_Isolate = 48, // PDI 173 | }; 174 | 175 | 176 | enum class Decomposition_Type : std::uint8_t 177 | { 178 | DT_unknown = 0, 179 | DT_none = 1, 180 | DT_canonical = 2, 181 | 182 | DT_font = 10, 183 | DT_noBreak = 11, 184 | DT_initial = 12, 185 | DT_medial = 13, 186 | DT_final = 14, 187 | DT_isolated = 15, 188 | DT_circle = 16, 189 | DT_super = 17, 190 | DT_sub = 18, 191 | DT_vertical = 19, 192 | DT_wide = 20, 193 | DT_narrow = 21, 194 | DT_small = 22, 195 | DT_square = 23, 196 | DT_fraction = 24, 197 | DT_compat = 25, 198 | }; 199 | 200 | 201 | enum class Numeric_Type : std::uint8_t 202 | { 203 | NT_Unknown = 0, // a.k.a. 
this is not marked as a number 204 | 205 | NT_Digit = 1, // the Digit type should be viewed as equivalent to Decimal 206 | NT_Decimal = 2, 207 | NT_Numeric = 3, 208 | }; 209 | 210 | 211 | 212 | 213 | // Accessors for the Unicode properties of one code point: general category, combining class, bidi class, decomposition type and numeric value. 214 | class unicode_character 215 | { 216 | public: 217 | typedef std::shared_ptr<unicode_character> 218 | pointer_t; 219 | 220 | unicode_character( 221 | char32_t code 222 | , detail::ucd_character * c 223 | , detail::ucd_header * h); 224 | virtual ~unicode_character(); 225 | unicode_character(unicode_character const & rhs); 226 | unicode_character & operator = (unicode_character const & rhs); 227 | 228 | bool is_valid() const; // valid code point as far as Unicode (UTF-32) is concerned 229 | bool is_defined() const; // whether this is a Unicode defined character or not 230 | bool is_private() const; // whether this code point is reserved for private use 231 | 232 | General_Category category() const; 233 | bool is_letter() const; 234 | bool is_mark() const; 235 | bool is_number() const; 236 | bool is_punctuation() const; 237 | bool is_symbol() const; 238 | bool is_separator() const; 239 | bool is_other() const; 240 | 241 | Canonical_Combining_Class 242 | combining_class(); // NOTE(review): the only accessor here that is not const -- confirm whether that is intended 243 | Bidi_Class bidi_class() const; 244 | bool is_bidi_mirrored() const; 245 | Decomposition_Type decomposition_type() const; 246 | 247 | Numeric_Type numeric() const; 248 | std::int64_t nominator() const; 249 | std::int64_t denominator() const; 250 | 251 | protected: 252 | virtual detail::ucd_character * 253 | ucd_character_pointer() const; 254 | 255 | private: 256 | std::int64_t get_number(int index) const; 257 | char const * find_name(detail::Name_Type type, std::size_t & length) const; 258 | 259 | char32_t f_code = NOT_A_CHARACTER; 260 | detail::ucd_character * 261 | f_character = nullptr; 262 | detail::ucd_header *f_header = nullptr; 263 | }; 264 | 265 | 266 | 267 | // Access point to the Unicode data; get_instance() hands out a shared pointer_t. 268 | class unicode_data 269 | { 270 | public: 271 | typedef std::shared_ptr<unicode_data> 272 | pointer_t; 273 | 274 | static pointer_t get_instance(); 275 | 276 
| // input file information 277 | // 278 | time_t last_generated(); 279 | void set_cache(bool cache = true); 280 | bool get_cache() const; 281 | char const * version() const; 282 | std::string const version_string() const; 283 | 284 | // access character data 285 | // 286 | unicode_character::pointer_t 287 | character(char32_t wc); 288 | 289 | private: 290 | typedef std::shared_ptr<detail::unicode_data_impl> 291 | unicode_data_impl_pointer_t; 292 | 293 | unicode_data_impl_pointer_t 294 | f_impl = unicode_data_impl_pointer_t(); 295 | }; 296 | 297 | // Parser front end built from an input directory and an output filename; generate() runs it (presumably writing the compiled data to output_filename -- confirm). 298 | class ucd_parser 299 | { 300 | public: 301 | ucd_parser( 302 | std::string const & input_dir 303 | , std::string const & output_filename); 304 | 305 | void generate(); 306 | 307 | private: 308 | typedef std::shared_ptr<detail::parser_impl> 309 | parser_impl_pointer_t; 310 | 311 | parser_impl_pointer_t 312 | f_impl = parser_impl_pointer_t(); 313 | }; 314 | 315 | 316 | 317 | 318 | } // libutf8 namespace 319 | // vim: ts=4 sw=4 et 320 | -------------------------------------------------------------------------------- /libutf8/unicode_data_file.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 
15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | #pragma once 20 | 21 | /** \file 22 | * \brief The declarations of the Unicode compiled files. 23 | * 24 | * This file includes structures used to describe the Unicode compiled 25 | * file. This allows us to very quickly find all the information about 26 | * a character. 27 | * 28 | * From the outside, you are expected to use the unicode_character 29 | * functions defined in the unicode_data.h header. This file is 30 | * considered private. 31 | */ 32 | 33 | // self 34 | // 35 | #include 36 | 37 | 38 | // C++ 39 | // 40 | #include 41 | 42 | 43 | 44 | namespace libutf8 45 | { 46 | 47 | namespace detail 48 | { 49 | 50 | 51 | enum class Name_Type : std::uint8_t // see UnicodeData.txt and NameAliases.txt 52 | { 53 | NT_Name = 0xF0, 54 | NT_Abbreviation = 0xF1, 55 | NT_Jamo_Short_Name = 0xF2, // see Jamo.txt 56 | NT_Alternate = 0xF3, 57 | NT_Control = 0xF4, 58 | NT_WrongName = 0xF5, // the main name is the corrected name, this name is the invalid/incorrect name 59 | NT_Figment = 0xF6, 60 | NT_Numeric = 0xF7, // saved as two int64_t in the strings because that's under 8kb that way 61 | 62 | NT_EndOfNames = 0xFF, 63 | }; 64 | 65 | 66 | 67 | struct ucd_header 68 | { 69 | char f_magic[4] = { 'U', 'C', 'D', 'B' }; 70 | time_t f_timestamp = 0; // time when this file was generated 71 | std::uint8_t f_version = 0; // version of this file format 72 | std::uint8_t f_ucd_version[3] = { 1, 1, 0 }; // version of source -- i.e. 
5 2 0 73 | std::uint32_t f_characters = 0; // offset to character table 74 | std::uint32_t f_strings = 0; // offset to string table 75 | std::uint32_t f_decomposition = 0; // offset to decomposition table 76 | }; 77 | 78 | 79 | 80 | typedef std::uint8_t flags_t; 81 | 82 | constexpr flags_t UCD_FLAG_DIGIT = 0x01; // represents a number 83 | constexpr flags_t UCD_FLAG_DECIMAL = 0x02; // represents a number 84 | constexpr flags_t UCD_FLAG_NUMERIC = 0x04; // represents a number 85 | constexpr flags_t UCD_FLAG_BIDI_MIRROR = 0x08; // mirror of another letter left to right vs. right to left 86 | constexpr flags_t UCD_FLAG_CONTROL = 0x10; 87 | constexpr flags_t UCD_FLAG_PRIVATE = 0x20; 88 | 89 | 90 | 91 | struct ucd_character 92 | { 93 | // initialization happens in a non-virtual function, otherwise it 94 | // would break the binary use of the structure 95 | // 96 | void initialize_ucd_character() 97 | { 98 | f_code = NOT_A_CHARACTER; 99 | f_names = 0; 100 | f_flags = 0; 101 | 102 | f_general_category = General_Category::GC_Unknown_Category; 103 | f_canonical_combining_class = Canonical_Combining_Class::CCC_Not_Reordered; 104 | f_bidi_class = Bidi_Class::BC_Unknown; // see flags for mirror info 105 | f_decomposition_type = static_cast(Decomposition_Type::DT_unknown); 106 | f_decomposition_length = 0; 107 | f_decomposition_mapping = 0; 108 | f_age[0] = 1; 109 | f_age[1] = 1; 110 | } 111 | 112 | /* 32 */ char32_t f_code; 113 | /* 32 */ std::uint32_t f_names; // offset to string table 114 | /* 8 */ flags_t f_flags; 115 | /* 8 */ General_Category f_general_category; 116 | /* 8 */ Canonical_Combining_Class f_canonical_combining_class; 117 | /* 8 */ Bidi_Class f_bidi_class; 118 | /* 5 */ std::uint32_t f_decomposition_type : 5; 119 | /* 5 */ std::uint32_t f_decomposition_length : 5; 120 | /* 22 */ std::uint32_t f_decomposition_mapping : 22; 121 | /* 16 */ std::uint8_t f_age[2]; 122 | }; 123 | 124 | 125 | // The f_names is an offset in the string table. 
126 | // 127 | // Each name is defined as: 128 | // 129 | // struct name_t 130 | // { 131 | // Name_Type f_type; 132 | // uint8_t f_size; 133 | // char8_t f_name[f_size]; 134 | // }; 135 | // 136 | // Names are not null terminated. 137 | // followed by UTF-8 until the next byte representing a Name_Type, the 138 | // last name ends with special type NT_EndOfNames. 139 | // 140 | // The first name is the corrected name of the character. 141 | // 142 | // Following are the other Name_Type names. 143 | // 144 | // The numeric entries are actually two 64 bit numbers (nominator and 145 | // denominator). The size will always be 16 bytes, but the alignment 146 | // is likely going to be "wrong" (although that should not matter much 147 | // on Intel and ARM processors). 148 | 149 | 150 | 151 | 152 | } // detail namespace 153 | 154 | } // libutf8 namespace 155 | // vim: ts=4 sw=4 et 156 | -------------------------------------------------------------------------------- /libutf8/version.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/ 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 
15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | /** \file 21 | * \brief The UTF-8 library is used to convert C++ strings. 22 | * 23 | * This file shows the UTF-8 library version. 24 | * 25 | * The `#define`s give you the library version at the time you are compiling. 26 | * The functions allow you to retrieve the version of a dynamically linked 27 | * library. 28 | */ 29 | 30 | // self 31 | // 32 | #include "libutf8/version.h" 33 | 34 | 35 | // last include 36 | // 37 | #include 38 | 39 | 40 | 41 | namespace libutf8 42 | { 43 | 44 | 45 | 46 | 47 | /** \brief Get the major version of the library. 48 | * 49 | * This function returns the major version of the running library (the 50 | * one you are linked against at runtime). 51 | * 52 | * \return The major version. 53 | */ 54 | int get_major_version() 55 | { 56 | return LIBUTF8_VERSION_MAJOR; 57 | } 58 | 59 | 60 | /** \brief Get the minor (release) version of the library. 61 | * 62 | * This function returns the minor version of the running library 63 | * (the one you are linked against at runtime). 64 | * 65 | * \return The minor (release) version. 66 | */ 67 | int get_release_version() 68 | { 69 | return LIBUTF8_VERSION_MINOR; 70 | } 71 | 72 | 73 | /** \brief Get the patch version of the library. 74 | * 75 | * This function returns the patch version of the running library 76 | * (the one you are linked against at runtime). 77 | * 78 | * \return The patch version. 79 | */ 80 | int get_patch_version() 81 | { 82 | return LIBUTF8_VERSION_PATCH; 83 | } 84 | 85 | 86 | /** \brief Get the full version of the library as a string. 87 | * 88 | * This function returns the major, minor, and patch versions of the 89 | * running library (the one you are linked against at runtime) in the 90 | * form of a string. 
91 | * 92 | * The build version is not made available. In most cases we change 93 | * the build version only to run a new build, so no code will have 94 | * changed (some documentation and non-code files may change between 95 | * build versions; but the code will work exactly the same way.) 96 | * 97 | * \return The library version. 98 | */ 99 | char const * get_version_string() 100 | { 101 | return LIBUTF8_VERSION_STRING; 102 | } 103 | 104 | 105 | } // libutf8 namespace 106 | // vim: ts=4 sw=4 et 107 | -------------------------------------------------------------------------------- /libutf8/version.h.in: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/ 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | #pragma once 20 | 21 | /** \file 22 | * \brief Definitions of the libutf8 version. 23 | * 24 | * This header includes the libutf8 library version and functions you 25 | * can use to check the current version of the library. 
26 | */ 27 | 28 | 29 | #define LIBUTF8_VERSION_MAJOR @LIBUTF8_VERSION_MAJOR@ 30 | #define LIBUTF8_VERSION_MINOR @LIBUTF8_VERSION_MINOR@ 31 | #define LIBUTF8_VERSION_PATCH @LIBUTF8_VERSION_PATCH@ 32 | #define LIBUTF8_VERSION_STRING "@LIBUTF8_VERSION_MAJOR@.@LIBUTF8_VERSION_MINOR@.@LIBUTF8_VERSION_PATCH@" 33 | 34 | namespace libutf8 35 | { 36 | 37 | 38 | int get_major_version(); 39 | int get_release_version(); 40 | int get_patch_version(); 41 | char const * get_version_string(); 42 | 43 | 44 | 45 | } // libutf8 namespace 46 | // vim: ts=4 sw=4 et 47 | -------------------------------------------------------------------------------- /mk: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # See the snapcmakemodules project for details about this script 4 | # https://github.com/m2osw/snapcmakemodules 5 | # "$@" forwards each argument as its own word (an unquoted $* would re-split arguments containing spaces) 6 | if test -x ../../cmake/scripts/mk 7 | then 8 | ../../cmake/scripts/mk "$@" 9 | else 10 | echo "error: could not locate the cmake mk script" 11 | exit 1 12 | fi 13 | 14 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | # 3 | # https://snapwebsites.org/project/libutf8 4 | # contact@m2osw.com 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 
15 | # 16 | # You should have received a copy of the GNU General Public License along 17 | # with this program; if not, write to the Free Software Foundation, Inc., 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | ## 21 | ## libutf8 library unit tests 22 | ## 23 | project(unittest) 24 | 25 | find_package(SnapCatch2) 26 | 27 | if(SnapCatch2_FOUND) 28 | 29 | add_executable(${PROJECT_NAME} 30 | catch_main.cpp 31 | 32 | catch_bom.cpp 33 | catch_caseinsensitive.cpp 34 | catch_character.cpp 35 | catch_iterator.cpp 36 | catch_json_tokens.cpp 37 | catch_length.cpp 38 | catch_stream.cpp 39 | catch_string.cpp 40 | catch_valid.cpp 41 | catch_version.cpp 42 | ) 43 | 44 | target_include_directories(${PROJECT_NAME} 45 | PUBLIC 46 | ${CMAKE_BINARY_DIR} 47 | ${PROJECT_SOURCE_DIR} 48 | ${SNAPCATCH2_INCLUDE_DIRS} 49 | ${LIBEXCEPT_INCLUDE_DIRS} 50 | ) 51 | 52 | target_link_libraries(${PROJECT_NAME} 53 | utf8 54 | ${SNAPCATCH2_LIBRARIES} 55 | ) 56 | 57 | else(SnapCatch2_FOUND) 58 | 59 | message("snapcatch2 not found... no test will be built.") 60 | 61 | endif(SnapCatch2_FOUND) 62 | 63 | if(SnapCatch2_FOUND) 64 | 65 | find_package(SnapTestRunner) 66 | AddUnitTestsTarget( 67 | PROJECT_NAME 68 | rununittests 69 | ) 70 | 71 | endif(SnapCatch2_FOUND) 72 | 73 | # vim: ts=4 sw=4 et 74 | -------------------------------------------------------------------------------- /tests/catch_bom.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 
10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | // libutf8 21 | // 22 | #include 23 | #include 24 | 25 | 26 | // unit test 27 | // 28 | #include "catch_main.h" 29 | 30 | 31 | // C++ 32 | // 33 | #include 34 | #include 35 | 36 | 37 | // last include 38 | // 39 | #include 40 | 41 | 42 | 43 | CATCH_TEST_CASE("bom", "[characters],[bom]") 44 | { 45 | CATCH_START_SECTION("bom: Verify the BOM character") 46 | CATCH_REQUIRE(libutf8::BOM_CHAR == 0xFEFF); 47 | CATCH_END_SECTION() 48 | 49 | CATCH_START_SECTION("bom: Verify with a string that's too small") 50 | { 51 | CATCH_REQUIRE(libutf8::start_with_bom(nullptr, rand()) == libutf8::bom_t::BOM_NONE); 52 | CATCH_REQUIRE(libutf8::start_with_bom("", 0) == libutf8::bom_t::BOM_NONE); 53 | CATCH_REQUIRE(libutf8::start_with_bom("a", 1) == libutf8::bom_t::BOM_NONE); 54 | } 55 | CATCH_END_SECTION() 56 | 57 | CATCH_START_SECTION("bom: Verify the five BOMs as is") 58 | { 59 | char buf[4]; 60 | char32_t const bom(libutf8::BOM_CHAR); 61 | 62 | // UTF-8 63 | buf[0] = static_cast((bom >> 12) | 0xE0); 64 | buf[1] = static_cast(((bom >> 6) & 0x3F) | 0x80); 65 | buf[2] = static_cast(((bom >> 0) & 0x3F) | 0x80); 66 | buf[3] = '?'; 67 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF8); 68 | 69 | // UTF-16 Little Endian 70 | buf[0] = static_cast(bom >> 0); 71 | buf[1] = static_cast(bom >> 8); 72 | buf[2] = static_cast(0x00); 73 | buf[3] = static_cast(0x34); 74 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == 
libutf8::bom_t::BOM_UTF16_LE); 75 | 76 | // UTF-16 Little Endian (with a zero in the next 2 bytes) 77 | buf[0] = static_cast(bom >> 0); 78 | buf[1] = static_cast(bom >> 8); 79 | buf[2] = static_cast(0x12); 80 | buf[3] = static_cast(0x00); 81 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE); 82 | 83 | // UTF-16 Little Endian (with a zero in the next 2 bytes) 84 | buf[0] = static_cast(bom >> 0); 85 | buf[1] = static_cast(bom >> 8); 86 | buf[2] = static_cast(0x12); 87 | buf[3] = static_cast(0x34); 88 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE); 89 | 90 | // UTF-16 Big Endian 91 | buf[0] = static_cast(bom >> 8); 92 | buf[1] = static_cast(bom >> 0); 93 | buf[2] = static_cast(0xAB); 94 | buf[3] = static_cast(0xCD); 95 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE); 96 | 97 | // UTF-16 Big Endian (with a zero in the next 2 bytes) 98 | buf[0] = static_cast(bom >> 8); 99 | buf[1] = static_cast(bom >> 0); 100 | buf[2] = static_cast(0x00); 101 | buf[3] = static_cast(0xCD); 102 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE); 103 | 104 | // UTF-16 Big Endian (with a zero in the next 2 bytes) 105 | buf[0] = static_cast(bom >> 8); 106 | buf[1] = static_cast(bom >> 0); 107 | buf[2] = static_cast(0xAB); 108 | buf[3] = static_cast(0x00); 109 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE); 110 | 111 | // UTF-32 Little Endian 112 | buf[0] = static_cast(bom >> 0); 113 | buf[1] = static_cast(bom >> 8); 114 | buf[2] = static_cast(bom >> 16); 115 | buf[3] = static_cast(bom >> 24); 116 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_LE); 117 | 118 | // UTF-32 Big Endian 119 | buf[0] = static_cast(bom >> 24); 120 | buf[1] = static_cast(bom >> 16); 121 | buf[2] = static_cast(bom >> 8); 122 | buf[3] = static_cast(bom >> 0); 123 | 
CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_BE); 124 | } 125 | CATCH_END_SECTION() 126 | 127 | CATCH_START_SECTION("bom: Verify the five BOMs as is") 128 | { 129 | char buf[4]; 130 | 131 | // unknown 1 byte (well... 1 byte is never really known...) 132 | buf[0] = '?'; 133 | CATCH_REQUIRE(libutf8::start_with_bom(buf, 1) == libutf8::bom_t::BOM_NONE); 134 | 135 | // unknown 2 bytes 136 | buf[0] = 'Q'; 137 | buf[1] = '?'; 138 | CATCH_REQUIRE(libutf8::start_with_bom(buf, 2) == libutf8::bom_t::BOM_NONE); 139 | 140 | // unknown 3 bytes 141 | buf[0] = 'B'; 142 | buf[1] = 'O'; 143 | buf[2] = 'M'; 144 | CATCH_REQUIRE(libutf8::start_with_bom(buf, 3) == libutf8::bom_t::BOM_NONE); 145 | 146 | // unknown 4 bytes 147 | buf[0] = 'B'; 148 | buf[1] = 'O'; 149 | buf[2] = 'M'; 150 | buf[3] = '?'; 151 | CATCH_REQUIRE(libutf8::start_with_bom(buf, 4) == libutf8::bom_t::BOM_NONE); 152 | } 153 | CATCH_END_SECTION() 154 | 155 | CATCH_START_SECTION("bom: Verify u32string that starts with a BOM (CPU Endianness)") 156 | { 157 | std::u32string u32str; 158 | u32str += libutf8::BOM_CHAR; 159 | u32str += unittest::rand_char(true); 160 | size_t const size(u32str.length() * sizeof(std::u32string::value_type)); 161 | for(int idx(static_cast(size)); idx >= 0; --idx) 162 | { 163 | if(static_cast(idx) >= sizeof(std::u32string::value_type)) 164 | { 165 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 166 | CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_BE); 167 | #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 168 | CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_LE); 169 | #else 170 | #error "Unsupported endianness" 171 | #endif 172 | } 173 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 174 | else if(static_cast(idx) >= sizeof(std::u16string::value_type)) 175 | { 176 | CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast(u32str.c_str()), idx) == 
libutf8::bom_t::BOM_UTF16_LE); 177 | } 178 | #endif 179 | else 180 | { 181 | // too short 182 | // 183 | CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast(u32str.c_str()), idx) == libutf8::bom_t::BOM_NONE); 184 | } 185 | } 186 | } 187 | CATCH_END_SECTION() 188 | } 189 | 190 | 191 | // vim: ts=4 sw=4 et 192 | -------------------------------------------------------------------------------- /tests/catch_caseinsensitive.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | // libutf8 21 | // 22 | #include 23 | 24 | 25 | // unit test 26 | // 27 | #include "catch_main.h" 28 | 29 | 30 | // C++ 31 | // 32 | #include 33 | #include 34 | 35 | 36 | // last include 37 | // 38 | #include 39 | 40 | 41 | 42 | namespace 43 | { 44 | 45 | 46 | libutf8::case_insensitive_string get_time(std::string & result) 47 | { 48 | time_t const now(time(nullptr)); 49 | struct tm t; 50 | localtime_r(&now, &t); 51 | char buf[256]; 52 | strftime(buf, sizeof(buf), "%T", &t); 53 | buf[sizeof(buf) - 1] = '\0'; 54 | result = buf; 55 | libutf8::case_insensitive_string r(buf); 56 | r += " PST"; 57 | return r; 58 | } 59 | 60 | std::string get_date(std::string & result) 61 | { 62 | time_t const now(time(nullptr)); 63 | struct tm t; 64 | localtime_r(&now, &t); 65 | char buf[256]; 66 | strftime(buf, sizeof(buf), "%F", &t); 67 | buf[sizeof(buf) - 1] = '\0'; 68 | result = buf; 69 | libutf8::case_insensitive_string r(buf); 70 | r += " plus a few days"; 71 | return r; 72 | } 73 | 74 | 75 | 76 | } 77 | 78 | 79 | 80 | CATCH_TEST_CASE("case_insensitive", "[string],[compare],[insensitive]") 81 | { 82 | CATCH_START_SECTION("case_insensitive: Verify Case Insensitive String Constructors") 83 | { 84 | { 85 | libutf8::case_insensitive_string empty; 86 | CATCH_REQUIRE(empty.empty()); 87 | } 88 | 89 | { 90 | std::allocator allocator; 91 | libutf8::case_insensitive_string empty(allocator); 92 | CATCH_REQUIRE(empty.empty()); 93 | } 94 | 95 | { 96 | libutf8::case_insensitive_string dashes(10, '-'); 97 | CATCH_REQUIRE(dashes == "----------"); 98 | } 99 | 100 | { 101 | libutf8::case_insensitive_string name("alexis"); 102 | CATCH_REQUIRE(name == "alexis"); 103 | } 104 | 105 | { 106 | libutf8::case_insensitive_string name("alexis", 4); 107 | CATCH_REQUIRE(name == "alex"); 108 | } 109 | 110 | { 111 | libutf8::case_insensitive_string name("alexis"); 112 | CATCH_REQUIRE(name == "alexis"); 113 | 114 | libutf8::case_insensitive_string section(name, 2); 115 | CATCH_REQUIRE(section == "exis"); 
116 | } 117 | 118 | { 119 | libutf8::case_insensitive_string name("alexis"); 120 | CATCH_REQUIRE(name == "alexis"); 121 | 122 | libutf8::case_insensitive_string section(name, 2, 2); 123 | CATCH_REQUIRE(section == "ex"); 124 | } 125 | 126 | { 127 | std::string name("alexis"); 128 | CATCH_REQUIRE(name == "alexis"); 129 | 130 | libutf8::case_insensitive_string section(name, 2); 131 | CATCH_REQUIRE(section == "exis"); 132 | } 133 | 134 | { 135 | std::string name("alexis"); 136 | CATCH_REQUIRE(name == "alexis"); 137 | 138 | libutf8::case_insensitive_string section(name, 2, 2); 139 | CATCH_REQUIRE(section == "ex"); 140 | } 141 | 142 | { 143 | libutf8::case_insensitive_string name("alexis"); 144 | CATCH_REQUIRE(name == "alexis"); 145 | 146 | libutf8::case_insensitive_string section(name.begin() + 2, name.end() - 2); 147 | CATCH_REQUIRE(section == "ex"); 148 | } 149 | 150 | { 151 | std::string name("alexis"); 152 | CATCH_REQUIRE(name == "alexis"); 153 | 154 | libutf8::case_insensitive_string full(name); 155 | CATCH_REQUIRE(full == "alexis"); 156 | } 157 | 158 | { 159 | libutf8::case_insensitive_string name("alexis"); 160 | CATCH_REQUIRE(name == "alexis"); 161 | 162 | libutf8::case_insensitive_string full(name); 163 | CATCH_REQUIRE(full == "alexis"); 164 | } 165 | 166 | { 167 | libutf8::case_insensitive_string name({'a', 'l', 'e', 'x', 'i', 's'}); 168 | CATCH_REQUIRE(name == "alexis"); 169 | } 170 | 171 | { 172 | std::string expected("not this"); 173 | libutf8::case_insensitive_string now(get_time(expected)); 174 | CATCH_REQUIRE(expected + " PST" == now); 175 | } 176 | 177 | { 178 | std::allocator allocator; 179 | std::string expected("not this"); 180 | libutf8::case_insensitive_string now(get_time(expected), allocator); 181 | CATCH_REQUIRE(expected + " PST" == now); 182 | } 183 | 184 | { 185 | std::string expected("not this"); 186 | libutf8::case_insensitive_string now(get_date(expected)); 187 | CATCH_REQUIRE(now == expected + " plus a few days"); 188 | } 189 | 190 | { 191 
| std::allocator allocator; 192 | std::string expected("not this"); 193 | libutf8::case_insensitive_string now(get_date(expected), allocator); 194 | CATCH_REQUIRE(now == expected + " plus a few days"); 195 | } 196 | } 197 | CATCH_END_SECTION() 198 | 199 | CATCH_START_SECTION("case_insensitive: Verify Case Insensitive String Comparators") 200 | { 201 | { 202 | libutf8::case_insensitive_string name1("Alexis"); 203 | libutf8::case_insensitive_string name2("alexis"); 204 | CATCH_REQUIRE(name1 == name2); 205 | CATCH_REQUIRE_FALSE(name1 != name2); 206 | CATCH_REQUIRE_FALSE(name1 > name2); 207 | CATCH_REQUIRE(name1 >= name2); 208 | CATCH_REQUIRE_FALSE(name1 < name2); 209 | CATCH_REQUIRE(name1 <= name2); 210 | } 211 | 212 | { 213 | libutf8::case_insensitive_string name1("Alexis"); 214 | libutf8::case_insensitive_string name2("Wilke"); 215 | CATCH_REQUIRE_FALSE(name1 == name2); 216 | CATCH_REQUIRE(name1 != name2); 217 | CATCH_REQUIRE_FALSE(name1 > name2); 218 | CATCH_REQUIRE_FALSE(name1 >= name2); 219 | CATCH_REQUIRE(name1 < name2); 220 | CATCH_REQUIRE(name1 <= name2); 221 | } 222 | 223 | { 224 | libutf8::case_insensitive_string name1("Alexis"); 225 | std::string name2("alexis"); 226 | CATCH_REQUIRE(name1 == name2); 227 | CATCH_REQUIRE_FALSE(name1 != name2); 228 | CATCH_REQUIRE_FALSE(name1 > name2); 229 | CATCH_REQUIRE(name1 >= name2); 230 | CATCH_REQUIRE_FALSE(name1 < name2); 231 | CATCH_REQUIRE(name1 <= name2); 232 | } 233 | 234 | { 235 | std::string name1("Alexis"); 236 | libutf8::case_insensitive_string name2("Wilke"); 237 | CATCH_REQUIRE_FALSE(name1 == name2); 238 | CATCH_REQUIRE(name1 != name2); 239 | CATCH_REQUIRE_FALSE(name1 > name2); 240 | CATCH_REQUIRE_FALSE(name1 >= name2); 241 | CATCH_REQUIRE(name1 < name2); 242 | CATCH_REQUIRE(name1 <= name2); 243 | } 244 | 245 | { 246 | libutf8::case_insensitive_string name1("Alexis"); 247 | CATCH_REQUIRE(name1 == "alexis"); 248 | CATCH_REQUIRE_FALSE(name1 != "alexis"); 249 | CATCH_REQUIRE_FALSE(name1 > "alexis"); 250 | 
CATCH_REQUIRE(name1 >= "alexis"); 251 | CATCH_REQUIRE_FALSE(name1 < "alexis"); 252 | CATCH_REQUIRE(name1 <= "alexis"); 253 | } 254 | 255 | { 256 | libutf8::case_insensitive_string name2("Wilke"); 257 | CATCH_REQUIRE_FALSE("Alexis" == name2); 258 | CATCH_REQUIRE("Alexis" != name2); 259 | CATCH_REQUIRE_FALSE("Alexis" > name2); 260 | CATCH_REQUIRE_FALSE("Alexis" >= name2); 261 | CATCH_REQUIRE("Alexis" < name2); 262 | CATCH_REQUIRE("Alexis" <= name2); 263 | } 264 | } 265 | CATCH_END_SECTION() 266 | } 267 | 268 | 269 | // vim: ts=4 sw=4 et 270 | -------------------------------------------------------------------------------- /tests/catch_iterator.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | // libutf8 21 | // 22 | #include 23 | 24 | #include 25 | #include 26 | 27 | 28 | // unit test 29 | // 30 | #include "catch_main.h" 31 | 32 | 33 | // C++ 34 | // 35 | #include 36 | #include 37 | 38 | 39 | // last include 40 | // 41 | #include 42 | 43 | 44 | 45 | CATCH_TEST_CASE("libutf8_iterator", "[iterator]") 46 | { 47 | CATCH_START_SECTION("libutf8_iterator: valid iterators tests") 48 | { 49 | char32_t p(0); 50 | do 51 | { 52 | p = rand() % 0x11 * 0x10000; 53 | } 54 | while(p == 0 || (p >= 0xD800 && p <= 0xDFFF)); 55 | 56 | for(char32_t plan(0); plan < 0x110000; plan += 0x10000) 57 | { 58 | // create one plan in one string 59 | // 60 | std::string str; 61 | str.reserve(0x10000 * 4); 62 | for(char32_t wc(0); wc < 0x10000; ++wc) 63 | { 64 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) 65 | { 66 | wc = 0xDFFF; 67 | continue; 68 | } 69 | char buf[libutf8::MBS_MIN_BUFFER_LENGTH]; 70 | CATCH_REQUIRE(libutf8::wctombs(buf, wc + plan, sizeof(buf)) >= 1); 71 | if(plan == 0 && wc == 0) 72 | { 73 | // this is a special case as buf[0] = '\0' and the += with 74 | // the string won't work 75 | // 76 | str += '\0'; 77 | } 78 | else 79 | { 80 | str += buf; 81 | } 82 | } 83 | //std::cerr << "-------------- Plan " << static_cast(plan) << " String ready " << str.length() << " ...\n"; 84 | 85 | { 86 | libutf8::utf8_iterator it(str); 87 | libutf8::utf8_iterator it_end(str, true); 88 | libutf8::utf8_iterator it_next(str); 89 | ++it_next; 90 | 91 | CATCH_REQUIRE(it == str.begin()); 92 | CATCH_REQUIRE(it == str.cbegin()); 93 | CATCH_REQUIRE(it != str.end()); 94 | CATCH_REQUIRE(it != str.cend()); 95 | 96 | CATCH_REQUIRE(it == it); 97 | CATCH_REQUIRE(it != it_end); 98 | CATCH_REQUIRE(it != it_next); 99 | 100 | CATCH_REQUIRE(str.begin() == it); 101 | CATCH_REQUIRE(str.cbegin() == it); 102 | CATCH_REQUIRE(str.end() != it); 103 | CATCH_REQUIRE(str.cend() != it); 104 | 105 | for(char32_t wc(0); wc < 0x10000; ++wc) 106 | { 107 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) 108 | { 
109 | wc = 0xDFFF; 110 | continue; 111 | } 112 | CATCH_REQUIRE(*it == wc + plan); 113 | ++it; 114 | } 115 | 116 | CATCH_REQUIRE(it != str.begin()); 117 | CATCH_REQUIRE(it != str.cbegin()); 118 | CATCH_REQUIRE(it == str.end()); 119 | CATCH_REQUIRE(it == str.cend()); 120 | 121 | CATCH_REQUIRE(str.begin() != it); 122 | CATCH_REQUIRE(str.cbegin() != it); 123 | CATCH_REQUIRE(str.end() == it); 124 | CATCH_REQUIRE(str.cend() == it); 125 | 126 | CATCH_REQUIRE(*it == libutf8::EOS); 127 | ++it; 128 | it++; 129 | CATCH_REQUIRE(it == str.cend()); 130 | 131 | for(char32_t wc(0x10000); wc > 0; ) 132 | { 133 | --wc; 134 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) 135 | { 136 | wc = 0xD800; 137 | continue; 138 | } 139 | --it; 140 | CATCH_REQUIRE(*it == wc + plan); 141 | } 142 | 143 | --it; 144 | it--; 145 | 146 | CATCH_REQUIRE(it.good()); 147 | CATCH_REQUIRE_FALSE(it.bad()); 148 | } 149 | 150 | if(plan == p) 151 | { 152 | libutf8::utf8_iterator it(str); 153 | 154 | for(char32_t wc(0); wc < 0x10000; ++wc) 155 | { 156 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) 157 | { 158 | wc = 0xDFFF; 159 | continue; 160 | } 161 | CATCH_REQUIRE(*it++ == wc + plan); 162 | } 163 | 164 | CATCH_REQUIRE(it == str.end()); 165 | it++; 166 | CATCH_REQUIRE(it.good()); 167 | CATCH_REQUIRE_FALSE(it.bad()); 168 | ++it; 169 | CATCH_REQUIRE(it.good()); 170 | CATCH_REQUIRE_FALSE(it.bad()); 171 | CATCH_REQUIRE(it == str.end()); 172 | CATCH_REQUIRE(it.good()); 173 | CATCH_REQUIRE_FALSE(it.bad()); 174 | 175 | for(char32_t wc(0x10000); wc > 0; ) 176 | { 177 | --wc; 178 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) 179 | { 180 | wc = 0xD800; 181 | continue; 182 | } 183 | CATCH_REQUIRE(*--it == wc + plan); 184 | } 185 | 186 | CATCH_REQUIRE(it == str.begin()); 187 | CATCH_REQUIRE(str.begin() == it); 188 | it--; 189 | --it; 190 | CATCH_REQUIRE(it == str.begin()); 191 | CATCH_REQUIRE(str.begin() == it); 192 | } 193 | 194 | if(plan == (p + 0x10000) % 0x110000) 195 | { 196 | libutf8::utf8_iterator it(str); 197 | 
libutf8::utf8_iterator start(str); 198 | CATCH_REQUIRE(it - start == 0); 199 | CATCH_REQUIRE(start - it == 0); 200 | 201 | for(char32_t wc(0); wc < 0x10000; ++wc) 202 | { 203 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) 204 | { 205 | wc = 0xDFFF; 206 | continue; 207 | } 208 | CATCH_REQUIRE(*it == wc + plan); 209 | it++; 210 | 211 | libutf8::utf8_iterator zero(it); 212 | zero.rewind(); 213 | CATCH_REQUIRE(zero == start); 214 | } 215 | 216 | libutf8::utf8_iterator copy(it); 217 | CATCH_REQUIRE(static_cast(it - start) == str.length()); 218 | CATCH_REQUIRE(static_cast(copy - start) == str.length()); 219 | CATCH_REQUIRE(copy - it == 0); 220 | CATCH_REQUIRE(it - copy == 0); 221 | copy.rewind(); 222 | CATCH_REQUIRE(copy - start == 0); 223 | CATCH_REQUIRE(start - copy == 0); 224 | CATCH_REQUIRE(static_cast(start - copy) == 0); 225 | CATCH_REQUIRE(static_cast(copy - start) == 0); 226 | 227 | for(char32_t wc(0x10000); wc > 0; ) 228 | { 229 | --wc; 230 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF) 231 | { 232 | wc = 0xD800; 233 | continue; 234 | } 235 | it--; 236 | CATCH_REQUIRE(*it == wc + plan); 237 | } 238 | } 239 | } 240 | } 241 | CATCH_END_SECTION() 242 | } 243 | 244 | 245 | CATCH_TEST_CASE("libutf8_iterator_invalid_string", "[iterator],[invalid]") 246 | { 247 | CATCH_START_SECTION("libutf8_iterator_invalid_string: iterators with invalid characters (bad UTF-8)") 248 | { 249 | for(int repeat(0); repeat < 100; ++repeat) 250 | { 251 | // create one plan in one string 252 | // 253 | constexpr size_t STR_LENGTH = 4; 254 | char32_t wc; 255 | std::u32string wstr; 256 | wstr.reserve(STR_LENGTH); 257 | for(size_t idx(0); idx < STR_LENGTH; ++idx) 258 | { 259 | do 260 | { 261 | wc = unittest::rand_char(true); 262 | } 263 | while(wc < 0x80); 264 | wstr += wc; 265 | } 266 | std::string str(libutf8::to_u8string(wstr)); 267 | 268 | //std::cerr << "-------------- Plan " << static_cast(plan) << " String ready " << str.length() << " ...\n"; 269 | 270 | // first verify that it works 
271 | // 272 | std::string::size_type pos[STR_LENGTH]; 273 | { 274 | libutf8::utf8_iterator it(str); 275 | 276 | CATCH_REQUIRE(it == str.begin()); 277 | CATCH_REQUIRE(it == str.cbegin()); 278 | CATCH_REQUIRE(it != str.end()); 279 | CATCH_REQUIRE(it != str.cend()); 280 | 281 | CATCH_REQUIRE(str.begin() == it); 282 | CATCH_REQUIRE(str.cbegin() == it); 283 | CATCH_REQUIRE(str.end() != it); 284 | CATCH_REQUIRE(str.cend() != it); 285 | 286 | for(size_t idx(0); idx < STR_LENGTH; ++idx) 287 | { 288 | CATCH_REQUIRE(*it == wstr[idx]); 289 | if(rand() % 2 == 0) 290 | { 291 | pos[idx] = it - str.begin(); 292 | } 293 | else 294 | { 295 | pos[idx] = -(str.begin() - it); 296 | } 297 | ++it; 298 | } 299 | 300 | CATCH_REQUIRE(it != str.begin()); 301 | CATCH_REQUIRE(it != str.cbegin()); 302 | CATCH_REQUIRE(it == str.end()); 303 | CATCH_REQUIRE(it == str.cend()); 304 | 305 | CATCH_REQUIRE(str.begin() != it); 306 | CATCH_REQUIRE(str.cbegin() != it); 307 | CATCH_REQUIRE(str.end() == it); 308 | CATCH_REQUIRE(str.cend() == it); 309 | 310 | CATCH_REQUIRE(*it == libutf8::EOS); 311 | ++it; 312 | it++; 313 | CATCH_REQUIRE(it == str.cend()); 314 | 315 | CATCH_REQUIRE(it.good()); 316 | CATCH_REQUIRE_FALSE(it.bad()); 317 | } 318 | 319 | { 320 | libutf8::utf8_iterator it(str); 321 | 322 | str[pos[1]] = rand() % 0x40 + 0x80; 323 | 324 | CATCH_REQUIRE(*it++ == wstr[0]); 325 | CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER); // we broke this one 326 | CATCH_REQUIRE(*it++ == wstr[2]); 327 | CATCH_REQUIRE(*it++ == wstr[3]); 328 | CATCH_REQUIRE(*it++ == libutf8::EOS); 329 | 330 | CATCH_REQUIRE_FALSE(it.good()); 331 | CATCH_REQUIRE(it.bad()); 332 | it.clear(); 333 | CATCH_REQUIRE(it.good()); 334 | CATCH_REQUIRE_FALSE(it.bad()); 335 | } 336 | 337 | { 338 | str.erase(str.length() - 1); 339 | libutf8::utf8_iterator it(str); 340 | 341 | str[pos[1]] = rand() % 0x40 + 0x80; 342 | 343 | CATCH_REQUIRE(*it++ == wstr[0]); 344 | CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER); 345 | CATCH_REQUIRE(*it++ == 
wstr[2]); 346 | CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER); 347 | 348 | CATCH_REQUIRE_FALSE(it.good()); 349 | CATCH_REQUIRE(it.bad()); 350 | it.clear(); 351 | CATCH_REQUIRE(it.good()); 352 | CATCH_REQUIRE_FALSE(it.bad()); 353 | } 354 | } 355 | } 356 | CATCH_END_SECTION() 357 | 358 | CATCH_START_SECTION("libutf8_iterator_invalid_string: iterators with invalid characters (too large)") 359 | { 360 | for(char32_t wc(0x110000); wc < 0x1FFFFF; ++wc) 361 | { 362 | // since this character is not valid 363 | // we have to encode it _manually_ 364 | // 365 | char buf[4]; 366 | buf[0] = 0xF0 | ((wc >> 18) & 0x07); 367 | buf[1] = 0x80 | ((wc >> 12) & 0x3F); 368 | buf[2] = 0x80 | ((wc >> 6) & 0x3F); 369 | buf[3] = 0x80 | ((wc >> 0) & 0x3F); 370 | 371 | std::string str(buf, 4); 372 | 373 | // first verify that it works 374 | // 375 | { 376 | libutf8::utf8_iterator it(str); 377 | 378 | CATCH_REQUIRE(it == str.begin()); 379 | CATCH_REQUIRE(it == str.cbegin()); 380 | CATCH_REQUIRE(it != str.end()); 381 | CATCH_REQUIRE(it != str.cend()); 382 | 383 | CATCH_REQUIRE(str.begin() == it); 384 | CATCH_REQUIRE(str.cbegin() == it); 385 | CATCH_REQUIRE(str.end() != it); 386 | CATCH_REQUIRE(str.cend() != it); 387 | 388 | CATCH_REQUIRE(*it == libutf8::NOT_A_CHARACTER); 389 | 390 | CATCH_REQUIRE_FALSE(it.good()); 391 | CATCH_REQUIRE(it.bad()); 392 | it.clear(); 393 | CATCH_REQUIRE(it.good()); 394 | CATCH_REQUIRE_FALSE(it.bad()); 395 | 396 | ++it; 397 | 398 | CATCH_REQUIRE(it != str.begin()); 399 | CATCH_REQUIRE(it != str.cbegin()); 400 | CATCH_REQUIRE(it == str.end()); 401 | CATCH_REQUIRE(it == str.cend()); 402 | 403 | CATCH_REQUIRE(str.begin() != it); 404 | CATCH_REQUIRE(str.cbegin() != it); 405 | CATCH_REQUIRE(str.end() == it); 406 | CATCH_REQUIRE(str.cend() == it); 407 | 408 | CATCH_REQUIRE(*it == libutf8::EOS); 409 | ++it; 410 | it++; 411 | CATCH_REQUIRE(it == str.cend()); 412 | 413 | CATCH_REQUIRE_FALSE(it.good()); 414 | CATCH_REQUIRE(it.bad()); 415 | } 416 | } 417 | } 418 | 
CATCH_END_SECTION() 419 | } 420 | 421 | 422 | 423 | // vim: ts=4 sw=4 et 424 | -------------------------------------------------------------------------------- /tests/catch_length.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | // libutf8 21 | // 22 | #include 23 | #include 24 | 25 | 26 | // unit test 27 | // 28 | #include "catch_main.h" 29 | 30 | 31 | // C++ 32 | // 33 | #include 34 | #include 35 | #include 36 | 37 | 38 | // last include 39 | // 40 | #include 41 | 42 | 43 | 44 | CATCH_TEST_CASE("string_length", "[strings][valid][length][u8][u16][u32]") 45 | { 46 | CATCH_START_SECTION("string_length: length of valid Unicode strings") 47 | { 48 | for(int idx(0); idx < 100; ++idx) 49 | { 50 | std::size_t const length(rand() % 100 + 1); 51 | std::u32string str32; 52 | for(std::size_t j(0); j < length; ++j) 53 | { 54 | char32_t const c(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE)); 55 | str32 += c; 56 | } 57 | CATCH_REQUIRE(libutf8::is_valid_unicode(str32)); 58 | CATCH_REQUIRE(str32.length() == length); 59 | 60 | std::string str8(libutf8::to_u8string(str32)); 61 | CATCH_REQUIRE(libutf8::is_valid_utf8(str8)); 62 | CATCH_REQUIRE(str8.length() >= length); 63 | CATCH_REQUIRE(libutf8::u8length(str8) == length); 64 | 65 | std::u16string str16(libutf8::to_u16string(str8)); 66 | CATCH_REQUIRE(libutf8::is_valid_utf16(str16)); 67 | CATCH_REQUIRE(str16.length() >= length); 68 | CATCH_REQUIRE(static_cast(libutf8::u16length(str16)) == length); 69 | } 70 | } 71 | CATCH_END_SECTION() 72 | } 73 | 74 | 75 | CATCH_TEST_CASE("invalid_utf16_string_length", "[strings][invalid][length][u16]") 76 | { 77 | CATCH_START_SECTION("invalid_utf16_string_length: invalid UTF-16 returns -1 for length") 78 | { 79 | for(int idx(0); idx < 100; ++idx) 80 | { 81 | std::size_t const length(rand() % 30 + 5); 82 | char16_t bad_char(rand() & 0x03FF); 83 | std::size_t bad_pos(length / 2); 84 | switch(idx % 3) 85 | { 86 | case 0: 87 | bad_char += 0xDC00; // low without a high 88 | break; 89 | 90 | case 1: 91 | bad_char += 0xD800; // high not followed by a low 92 | break; 93 | 94 | case 2: 95 | bad_char += 0xD800; // high followed by u'\0' 96 | bad_pos = length - 1; 97 | break; 98 
| 99 | } 100 | std::u16string str16; 101 | for(std::size_t j(0); j < length; ++j) 102 | { 103 | char32_t const wc(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE)); 104 | str16 += libutf8::to_u16string(wc); 105 | if(j == bad_pos) 106 | { 107 | str16 += bad_char; 108 | } 109 | } 110 | 111 | CATCH_REQUIRE_FALSE(libutf8::is_valid_utf16(str16)); 112 | CATCH_REQUIRE(libutf8::u16length(str16) == -1); 113 | } 114 | } 115 | CATCH_END_SECTION() 116 | } 117 | 118 | 119 | // vim: ts=4 sw=4 et 120 | -------------------------------------------------------------------------------- /tests/catch_main.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | // Tell catch we want it to add the runner code in this file. 
21 | #define CATCH_CONFIG_RUNNER 22 | 23 | // self 24 | // 25 | #include "catch_main.h" 26 | 27 | 28 | // libutf8 29 | // 30 | #include 31 | #include 32 | 33 | 34 | // libexcept 35 | // 36 | #include 37 | 38 | 39 | // C++ 40 | // 41 | #include 42 | 43 | 44 | // last include 45 | // 46 | #include 47 | 48 | 49 | 50 | 51 | 52 | int main(int argc, char * argv[]) 53 | { 54 | return SNAP_CATCH2_NAMESPACE::snap_catch2_main( 55 | "libutf8" 56 | , LIBUTF8_VERSION_STRING 57 | , argc 58 | , argv 59 | , []() { libexcept::set_collect_stack(libexcept::collect_stack_t::COLLECT_STACK_NO); } 60 | ); 61 | } 62 | 63 | 64 | // vim: ts=4 sw=4 et 65 | -------------------------------------------------------------------------------- /tests/catch_main.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2006-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
#pragma once

// NOTE(review): every `#include <...>` target of this file is missing from
// this copy (only the stanza headings survive); restore them from the
// repository before building.

// libutf8
//
// (for the ostream)


// catch2
//


// C++
//


// last include
//



namespace SNAP_CATCH2_NAMESPACE
{




/** \brief Generate a random Unicode character.
 *
 * The function returns a random code point which is never zero and
 * never a UTF-16 surrogate (U+D800 to U+DFFF).
 *
 * The numbers come from rand(); this function does not seed the
 * generator.
 *
 * \param[in] full_range  When false (the default) the result is limited
 * to U+0001 to U+FFFF. When true, it goes up to U+10FFFF.
 *
 * \return A random code point in the selected range.
 */
inline char32_t rand_char(bool full_range = false)
{
    // number of acceptable values: the size of the range minus the
    // 0x800 surrogates
    //
    char32_t const max((full_range ? 0x0110000 : 0x0010000) - (0xE000 - 0xD800));

    char32_t wc;
    do
    {
        // convert to an unsigned type before the shift: rand() returns
        // an int and `rand() << 16` overflows it whenever rand() returns
        // 32768 or more, which is undefined behavior before C++20
        //
        wc = ((static_cast<char32_t>(rand()) << 16) ^ static_cast<char32_t>(rand())) % max;
    }
    while(wc == 0);
    if(wc >= 0xD800)
    {
        // skip the surrogates
        //
        wc += 0xE000 - 0xD800;
    }

    return wc;
}



}
// unittest namespace
// vim: ts=4 sw=4 et
// --------------------------------------------------------------------------------
// /tests/catch_stream.cpp:
// --------------------------------------------------------------------------------
// Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved
//
// https://snapwebsites.org/project/libutf8
// contact@m2osw.com
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | 20 | // libutf8 21 | // 22 | #include 23 | 24 | 25 | // unit test 26 | // 27 | #include "catch_main.h" 28 | 29 | 30 | // C++ 31 | // 32 | #include 33 | #include 34 | #include 35 | 36 | 37 | // last include 38 | // 39 | #include 40 | 41 | 42 | 43 | CATCH_TEST_CASE("stream", "[stream][valid]") 44 | { 45 | CATCH_START_SECTION("stream: write a char32_t to a stream") 46 | { 47 | for(int i(0); i < 1000; ++i) 48 | { 49 | char32_t const wc(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE)); 50 | 51 | std::stringstream ss; 52 | ss << libutf8::to_u8string(wc); 53 | 54 | CATCH_REQUIRE(ss.str() == libutf8::to_u8string(wc)); 55 | } 56 | } 57 | CATCH_END_SECTION() 58 | } 59 | 60 | 61 | 62 | // vim: ts=4 sw=4 et 63 | -------------------------------------------------------------------------------- /tests/catch_valid.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | // libutf8 21 | // 22 | #include 23 | #include 24 | 25 | 26 | // unit test 27 | // 28 | #include "catch_main.h" 29 | 30 | 31 | // snapdev 32 | // 33 | #include 34 | 35 | 36 | // C++ 37 | // 38 | #include 39 | #include 40 | #include 41 | 42 | 43 | // last include 44 | // 45 | #include 46 | 47 | 48 | 49 | CATCH_TEST_CASE("make_valid", "[strings][valid][u8]") 50 | { 51 | CATCH_START_SECTION("make_valid: test bad encoding (1 byte when 2 necessary)") 52 | { 53 | for(char32_t two_bytes(0x80); two_bytes < 0x800; ++two_bytes) 54 | { 55 | char const byte1(static_cast((two_bytes >> 6) | 0xC0)); 56 | char32_t const vc1(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE)); 57 | char32_t const vc2(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE)); 58 | char32_t const fix_char(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE)); 59 | std::string invalid_string; 60 | invalid_string += vc1; 61 | invalid_string += byte1; 62 | invalid_string += vc2; 63 | std::string expected_string; 64 | expected_string += vc1; 65 | expected_string += fix_char; 66 | expected_string += vc2; 67 | CATCH_REQUIRE_FALSE(libutf8::make_u8string_valid(invalid_string, fix_char)); 68 | CATCH_REQUIRE(invalid_string == expected_string); 69 | } 70 | } 71 | CATCH_END_SECTION() 72 | 73 | CATCH_START_SECTION("make_valid: test bad encoding (2 bytes when 3 necessary)") 74 | { 75 | for(char32_t two_bytes(0x800); two_bytes < 0x10000; ++two_bytes) 76 | { 77 | // Note: this includes the UTF-16 surrogates which are also 78 | // considered invalid 79 | // 80 | char const byte1(static_cast((two_bytes >> 12) | 0xE0)); 81 | char const byte2(((two_bytes >> 6) & 0x3F) | 0x80); 82 | char32_t const vc1(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE)); 83 | char32_t const vc2(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE)); 84 | char32_t const fix_char(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE)); 85 | 
std::string invalid_string; 86 | invalid_string += vc1; 87 | invalid_string += byte1; 88 | invalid_string += byte2; 89 | invalid_string += vc2; 90 | std::string expected_string; 91 | expected_string += vc1; 92 | expected_string += fix_char; 93 | expected_string += vc2; 94 | CATCH_REQUIRE_FALSE(libutf8::make_u8string_valid(invalid_string, fix_char)); 95 | CATCH_REQUIRE(invalid_string == expected_string); 96 | } 97 | } 98 | CATCH_END_SECTION() 99 | 100 | CATCH_START_SECTION("make_valid: test bad encoding (3 bytes when 4 necessary)") 101 | { 102 | for(char32_t two_bytes(0x10000); two_bytes < 0x110000; ++two_bytes) 103 | { 104 | char const byte1(static_cast((two_bytes >> 18) | 0xF0)); 105 | char const byte2(((two_bytes >> 12) & 0x3F) | 0x80); 106 | char const byte3(((two_bytes >> 6) & 0x3F) | 0x80); 107 | char32_t const vc1(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE)); 108 | char32_t const vc2(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE)); 109 | char32_t const fix_char(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE)); 110 | std::string invalid_string; 111 | invalid_string += vc1; 112 | invalid_string += byte1; 113 | invalid_string += byte2; 114 | invalid_string += byte3; 115 | invalid_string += vc2; 116 | std::string expected_string; 117 | expected_string += vc1; 118 | expected_string += fix_char; 119 | expected_string += vc2; 120 | CATCH_REQUIRE_FALSE(libutf8::make_u8string_valid(invalid_string, fix_char)); 121 | CATCH_REQUIRE(invalid_string == expected_string); 122 | } 123 | } 124 | CATCH_END_SECTION() 125 | } 126 | 127 | 128 | 129 | // vim: ts=4 sw=4 et 130 | -------------------------------------------------------------------------------- /tests/catch_version.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021-2023 Made to Order Software Corp. 
All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | // libutf8 21 | // 22 | #include 23 | 24 | 25 | // self 26 | // 27 | #include "catch_main.h" 28 | 29 | 30 | // last include 31 | // 32 | #include 33 | 34 | 35 | 36 | 37 | CATCH_TEST_CASE("version", "[version]") 38 | { 39 | CATCH_START_SECTION("version: verify runtime vs compile time version numbers") 40 | { 41 | CATCH_REQUIRE(libutf8::get_major_version() == LIBUTF8_VERSION_MAJOR); 42 | CATCH_REQUIRE(libutf8::get_release_version() == LIBUTF8_VERSION_MINOR); 43 | CATCH_REQUIRE(libutf8::get_patch_version() == LIBUTF8_VERSION_PATCH); 44 | CATCH_REQUIRE(strcmp(libutf8::get_version_string(), LIBUTF8_VERSION_STRING) == 0); 45 | } 46 | CATCH_END_SECTION() 47 | } 48 | 49 | 50 | // vim: ts=4 sw=4 et 51 | -------------------------------------------------------------------------------- /tests/example-for-show-utf16.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/tests/example-for-show-utf16.txt -------------------------------------------------------------------------------- 
/tests/example-for-show-utf32.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/tests/example-for-show-utf32.txt -------------------------------------------------------------------------------- /tests/example-for-show-utf8.txt: -------------------------------------------------------------------------------- 1 | Tḩìs 𝄞 ĩş bêȧútîfüł! 2 | -------------------------------------------------------------------------------- /tests/unicode/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Please see the ../conf/unicode/LICENSE.txt files for the license. 2 | -------------------------------------------------------------------------------- /tests/verify-show-unicode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | # 3 | # Verify that the show-unicode command line tool returns the correct exit codes 4 | # 5 | # TODO: verify the actual output (that may require a catch_....cpp file so 6 | # we can capture the output and easy compare with string we generate 7 | # in a C++ test) 8 | 9 | SHOW_UNICODE="../../BUILD/Debug/contrib/libutf8/tools/show-unicode" 10 | ERRCNT=0 11 | RED='\033[0;31m' 12 | NORMAL='\033[0m' 13 | 14 | # Verify Binary Exists 15 | if ! test -x ${SHOW_UNICODE} 16 | then 17 | echo "${RED}error: could not find valid binary \"${SHOW_UNICODE}\"; did you build the project?${NORMAL}" 18 | echo "1 error occurred. Please verify what went wrong and fix it." 19 | exit 1 20 | fi 21 | 22 | # Help 23 | echo "--- SECTION: --help" 24 | if ${SHOW_UNICODE} --help 25 | then 26 | echo "${RED}error: --help returned with success.${NORMAL}" 27 | ERRCNT=`expr ${ERRCNT} + 1` 28 | elif test ${?} -ne 2 29 | then 30 | echo "${RED}error: --help did not return with expected exit code.${NORMAL}" 31 | ERRCNT=`expr ${ERRCNT} + 1` 32 | else 33 | echo "info: --help works." 
34 | fi 35 | echo 36 | 37 | # Version 38 | echo "--- SECTION: --version" 39 | if ${SHOW_UNICODE} --version 40 | then 41 | echo "${RED}error: --version returned with success.${NORMAL}" 42 | ERRCNT=`expr ${ERRCNT} + 1` 43 | elif test ${?} -ne 2 44 | then 45 | echo "${RED}error: --version did not return with expected exit code.${NORMAL}" 46 | ERRCNT=`expr ${ERRCNT} + 1` 47 | else 48 | echo "info: --version works." 49 | fi 50 | echo 51 | 52 | # String / Character 53 | echo "--- SECTION: --string" 54 | if ${SHOW_UNICODE} "Magic" 55 | then 56 | echo "info: string display worked." 57 | else 58 | echo "${RED}error: string display failed with ${?}.${NORMAL}" 59 | ERRCNT=`expr ${ERRCNT} + 1` 60 | fi 61 | echo 62 | 63 | if ${SHOW_UNICODE} --string "Élémentaire ça!" 64 | then 65 | echo "info: string display worked." 66 | else 67 | echo "${RED}error: string display failed with ${?}.${NORMAL}" 68 | ERRCNT=`expr ${ERRCNT} + 1` 69 | fi 70 | echo 71 | 72 | echo "--- SECTION: --character" 73 | if ${SHOW_UNICODE} -C 0x1D11E 74 | then 75 | echo "info: character display worked." 76 | else 77 | echo "${RED}error: character display failed with ${?}.${NORMAL}" 78 | ERRCNT=`expr ${ERRCNT} + 1` 79 | fi 80 | echo 81 | 82 | if ${SHOW_UNICODE} -C 1D11E 83 | then 84 | echo "${RED}error: character display succeeded with invalid number syntax.${NORMAL}" 85 | ERRCNT=`expr ${ERRCNT} + 1` 86 | else 87 | ERRCODE=${?} 88 | if test ${ERRCODE} -eq 1 89 | then 90 | echo "info: character display failed as expected with ${ERRCODE}." 91 | else 92 | echo "${RED}error: character display failed with unexpected error code ${ERRCODE}.${NORMAL}" 93 | ERRCNT=`expr ${ERRCNT} + 1` 94 | fi 95 | fi 96 | echo 97 | 98 | # Files 99 | check_show() { 100 | echo "--- SECTION: file with: ${1}" 101 | if ${SHOW_UNICODE} "${1}" tests/example-for-show-${2}.txt 102 | then 103 | echo "info: ${2} display worked." 
104 | else 105 | echo "${RED}error: ${2} display failed with ${?}.${NORMAL}" 106 | ERRCNT=`expr ${ERRCNT} + 1` 107 | fi 108 | echo 109 | } 110 | 111 | check_show -f utf8 112 | check_show -S utf16 113 | check_show -F utf32 114 | 115 | if test ${ERRCNT} -eq 0 116 | then 117 | exit 0 118 | fi 119 | 120 | echo "${ERRCNT} errors occurred. Please verify what went wrong and fix it." 121 | exit 1 122 | 123 | -------------------------------------------------------------------------------- /tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2012-2023 Made to Order Software Corp. All Rights Reserved 2 | # 3 | # https://snapwebsites.org/project/libutf8 4 | # contact@m2osw.com 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License along 17 | # with this program; if not, write to the Free Software Foundation, Inc., 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | ## 21 | ## show-unicode 22 | ## 23 | project(show-unicode) 24 | 25 | add_executable(${PROJECT_NAME} 26 | show_unicode.cpp 27 | ) 28 | 29 | target_link_libraries(${PROJECT_NAME} 30 | utf8 31 | ) 32 | 33 | install( 34 | TARGETS 35 | ${PROJECT_NAME} 36 | 37 | RUNTIME DESTINATION 38 | bin 39 | ) 40 | 41 | 42 | ## 43 | ## unicode-data-parser 44 | ## 45 | project(unicode-data-parser) 46 | 47 | add_executable(${PROJECT_NAME} 48 | unicode_data_parser.cpp 49 | ) 50 | 51 | target_include_directories(${PROJECT_NAME} 52 | PUBLIC 53 | ${ADVGETOPT_INCLUDE_DIRS} 54 | ${LIBEXCEPT_INCLUDE_DIRS} 55 | ) 56 | 57 | target_link_libraries(${PROJECT_NAME} 58 | utf8 59 | ${LIBEXCEPT_LIBRARIES} 60 | ) 61 | 62 | install( 63 | TARGETS 64 | ${PROJECT_NAME} 65 | 66 | RUNTIME DESTINATION 67 | bin 68 | ) 69 | 70 | 71 | # vim: ts=4 sw=4 et 72 | -------------------------------------------------------------------------------- /tools/show_unicode.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | /** \file 21 | * \brief Tool used to convert the UnicodeData.txt file to C structures. 22 | * 23 | * This executable is used to convert the UnicodeData.txt to a set of 24 | * C structure which we can search very quickly to find Unicode characters. 25 | * This gives us all the necessary information to convert strings to NFKC 26 | * NFKD, and especially NFC and NFD. 27 | * 28 | * \sa http://www.unicode.org/reports/tr15/ 29 | */ 30 | 31 | 32 | // libutf8 33 | // 34 | #include 35 | #include 36 | 37 | 38 | // C++ 39 | // 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | 48 | 49 | // last include 50 | // 51 | #include 52 | 53 | 54 | 55 | namespace 56 | { 57 | 58 | 59 | class show_unicode 60 | { 61 | public: 62 | enum class mode_t 63 | { 64 | MODE_STRING, 65 | MODE_CHARACTER, 66 | MODE_UTF8_FILENAME, 67 | MODE_UTF16_FILENAME, 68 | MODE_UTF32_FILENAME, 69 | 70 | MODE_DEFAULT // like MODE_STRING, just not set explicitly 71 | }; 72 | 73 | int parse_args(int agrc, char * argv[]); 74 | int verify_args(); 75 | int process(); 76 | 77 | private: 78 | void usage(); 79 | int set_mode(mode_t m); 80 | int read_file(); 81 | 82 | mode_t f_mode = mode_t::MODE_DEFAULT; 83 | std::string f_filename = std::string(); 84 | std::vector f_input = std::vector(); 85 | bool f_valid_fffe_ffff = true; 86 | }; 87 | 88 | 89 | 90 | 91 | int show_unicode::parse_args(int argc, char * argv[]) 92 | { 93 | for(int i(1); i < argc; ++i) 94 | { 95 | if(argv[i][0] == '-') 96 | { 97 | if(strcmp(argv[i], "-h") == 0 98 | || strcmp(argv[i], "--help") == 0) 99 | { 100 | usage(); 101 | return 2; 102 | } 103 | if(strcmp(argv[i], "-V") == 0 104 | || strcmp(argv[i], "--version") == 0) 105 | { 106 | std::cout << LIBUTF8_VERSION_STRING << '\n'; 107 | return 2; 108 | } 109 | if(strcmp(argv[i], "-C") == 0 110 | || strcmp(argv[i], "--unicode") == 0) 111 | { 112 | ++i; 113 | if(i >= argc) 114 | { 115 | std::cerr << "error: the --character command line option must 
be followed by a number representing a valid Unicode characters in UTF-32.\n"; 116 | return 3; 117 | } 118 | char * end; 119 | char * s(argv[i]); 120 | int base(10); 121 | if(*s == '0') 122 | { 123 | ++s; 124 | base = 8; 125 | if(*s == 'x' || *s == 'X') 126 | { 127 | base = 16; 128 | ++s; 129 | } 130 | } 131 | char32_t const wc(strtol(s, &end, base)); 132 | if(end == nullptr 133 | || *end != '\0') 134 | { 135 | std::cerr 136 | << "error: expected a valid decimal, octal, or hexadecimal number; could not parse \"" 137 | << argv[i] 138 | << "\" as a valid number.\n"; 139 | return 1; 140 | } 141 | if(!libutf8::is_valid_unicode(wc)) 142 | { 143 | std::cerr 144 | << "error: code \"0x" 145 | << std::uppercase << std::hex << std::setfill('0') << std::setw(6) << static_cast(wc) 146 | << "\" does not represent a valid Unicode character.\n"; 147 | return 1; 148 | } 149 | std::string const character(libutf8::to_u8string(wc)); 150 | f_input.insert(f_input.end(), character.begin(), character.end()); 151 | int const r(set_mode(mode_t::MODE_CHARACTER)); 152 | if(r != 0) 153 | { 154 | return r; 155 | } 156 | continue; 157 | } 158 | if(strcmp(argv[i], "-s") == 0 159 | || strcmp(argv[i], "--string") == 0) 160 | { 161 | ++i; 162 | if(i >= argc) 163 | { 164 | std::cerr << "error: the --string command line option must be followed by the string to process.\n"; 165 | return 3; 166 | } 167 | f_input.insert(f_input.end(), argv[i], argv[i] + strlen(argv[i])); 168 | int const r(set_mode(mode_t::MODE_STRING)); 169 | if(r != 0) 170 | { 171 | return r; 172 | } 173 | continue; 174 | } 175 | if(strcmp(argv[i], "-f") == 0 176 | || strcmp(argv[i], "--input") == 0) 177 | { 178 | ++i; 179 | if(i >= argc) 180 | { 181 | std::cerr << "error: the --input command line option must be followed by the input filename.\n"; 182 | return 3; 183 | } 184 | f_filename = argv[i]; 185 | int r(set_mode(mode_t::MODE_UTF8_FILENAME)); 186 | if(r == 0) 187 | { 188 | r = read_file(); 189 | } 190 | if(r != 0) 191 | { 192 | 
return r; 193 | } 194 | continue; 195 | } 196 | if(strcmp(argv[i], "-S") == 0 197 | || strcmp(argv[i], "--input-utf16") == 0) 198 | { 199 | ++i; 200 | if(i >= argc) 201 | { 202 | std::cerr << "error: the --input-utf16 command line option must be followed by the input filename.\n"; 203 | return 3; 204 | } 205 | f_filename = argv[i]; 206 | int r(set_mode(mode_t::MODE_UTF16_FILENAME)); 207 | if(r == 0) 208 | { 209 | r = read_file(); 210 | } 211 | if(r == 0 && f_input.size() % 2 != 0) 212 | { 213 | std::cerr << "error: the size of \"" 214 | << f_filename 215 | << "\" was expected to be a multiple of 2.\n"; 216 | return 1; 217 | } 218 | if(r == 0) 219 | { 220 | std::u16string in(reinterpret_cast(f_input.data()), f_input.size() / 2); 221 | std::string u8(libutf8::to_u8string(in)); 222 | f_input.resize(u8.length()); 223 | memcpy(f_input.data(), u8.data(), u8.length()); 224 | } 225 | if(r != 0) 226 | { 227 | return r; 228 | } 229 | continue; 230 | } 231 | if(strcmp(argv[i], "-F") == 0 232 | || strcmp(argv[i], "--input-utf32") == 0) 233 | { 234 | ++i; 235 | if(i >= argc) 236 | { 237 | std::cerr << "error: the --input-utf32 command line option must be followed by the input filename.\n"; 238 | return 3; 239 | } 240 | f_filename = argv[i]; 241 | int r(set_mode(mode_t::MODE_UTF32_FILENAME)); 242 | if(r == 0) 243 | { 244 | r = read_file(); 245 | } 246 | if(r == 0 && f_input.size() % 4 != 0) 247 | { 248 | std::cerr << "error: the size of \"" 249 | << f_filename 250 | << "\" was expected to be a multiple of 4.\n"; 251 | return 1; 252 | } 253 | if(r == 0) 254 | { 255 | std::u32string in(reinterpret_cast(f_input.data()), f_input.size() / 4); 256 | std::string u8(libutf8::to_u8string(in)); 257 | f_input.resize(u8.length()); 258 | memcpy(f_input.data(), u8.data(), u8.length()); 259 | } 260 | if(r != 0) 261 | { 262 | return r; 263 | } 264 | continue; 265 | } 266 | if(strcmp(argv[i], "--valid-fffe-ffff") == 0) 267 | { 268 | f_valid_fffe_ffff = true; 269 | continue; 270 | } 271 | 
if(strcmp(argv[i], "-W") == 0 272 | || strcmp(argv[i], "--invalid-fffe-ffff") == 0) 273 | { 274 | f_valid_fffe_ffff = false; 275 | continue; 276 | } 277 | std::cerr << "error: unknown command line option \"" 278 | << argv[i] 279 | << "\".\n"; 280 | return 4; 281 | } 282 | else 283 | { 284 | f_input.insert(f_input.end(), argv[i], argv[i] + strlen(argv[i])); 285 | } 286 | } 287 | 288 | return 0; 289 | } 290 | 291 | 292 | int show_unicode::set_mode(mode_t m) 293 | { 294 | if(f_mode != mode_t::MODE_DEFAULT) 295 | { 296 | std::cerr << "error: mode already set to: " << static_cast(f_mode) << "\n"; 297 | return 3; 298 | } 299 | f_mode = m; 300 | 301 | return 0; 302 | } 303 | 304 | 305 | int show_unicode::read_file() 306 | { 307 | std::ifstream in(f_filename); 308 | if(!in.is_open()) 309 | { 310 | std::cerr 311 | << "error: could not open input file \"" 312 | << f_filename 313 | << "\".\n"; 314 | return 1; 315 | } 316 | in.seekg(0, std::ios::end); 317 | std::size_t const size(in.tellg()); 318 | in.seekg(0); 319 | f_input.resize(size); 320 | in.read(reinterpret_cast(f_input.data()), size); 321 | if(!in) 322 | { 323 | std::cerr 324 | << "error: could not read input file \"" 325 | << f_filename 326 | << "\".\n"; 327 | return 1; 328 | } 329 | 330 | return 0; 331 | } 332 | 333 | 334 | int show_unicode::verify_args() 335 | { 336 | // the mode already generated an error no need for that here 337 | return 0; 338 | } 339 | 340 | 341 | int show_unicode::process() 342 | { 343 | // first show the string as is 344 | // 345 | std::string utf8(std::string(reinterpret_cast(f_input.data()), f_input.size())); 346 | std::cout << "Input: \"" << utf8 << "\".\n"; 347 | 348 | // next show the string as UTF-8 bytes 349 | // 350 | std::cout << "UTF-8:" << std::hex << std::setfill('0'); 351 | for(auto it(f_input.begin()); it != f_input.end(); ++it) 352 | { 353 | char const * space(" "); 354 | if(*it >= 0x80 && *it <= 0xBF) 355 | { 356 | space = "."; 357 | } 358 | std::cout << space << std::setw(2) 
<< static_cast(*it); 359 | } 360 | std::cout << '\n'; 361 | 362 | // next show the string as UTF-16 words 363 | // 364 | std::u16string utf16(libutf8::to_u16string(utf8)); 365 | std::cout << "UTF-16:"; 366 | for(auto it(utf16.begin()); it != utf16.end(); ++it) 367 | { 368 | std::cout << ' ' << std::setw(4) << static_cast(*it); 369 | } 370 | std::cout << '\n'; 371 | 372 | // next show the string as UTF-32 words 373 | // 374 | std::u32string utf32(libutf8::to_u32string(utf8)); 375 | std::cout << "UTF-32:"; 376 | for(auto it(utf32.begin()); it != utf32.end(); ++it) 377 | { 378 | std::cout << ' ' << std::setw(6) << static_cast(*it); 379 | } 380 | std::cout << '\n'; 381 | 382 | return 0; 383 | } 384 | 385 | 386 | void show_unicode::usage() 387 | { 388 | std::cout << "Usage: show-unicode [-] [-s|--string] '' | -C | -f \n" 389 | "Where - is one or more of:\n" 390 | " -h | --help print this help screen.\n" 391 | " -C | --unicode use specified value.\n" 392 | " -s | --string input string to convert (using -s or --string is optional).\n" 393 | " -f | --input input file of UTF-8 characters.\n" 394 | " -S | --input-utf16 input file of UTF-16 characters.\n" 395 | " -F | --input-utf32 input file of UTF-32 characters.\n" 396 | " --valid-fffe-ffff consider \\uFFFE and \\uFFFF as valid characters (default).\n" 397 | " -W | --invalid-fffe-ffff consider \\uFFFE and \\uFFFF as invalid characters.\n" 398 | " -V | --version print out this tool's version.\n" 399 | "\n"; 400 | } 401 | 402 | 403 | } // no name namespace 404 | 405 | 406 | int main(int argc, char * argv[]) 407 | { 408 | show_unicode show; 409 | int r(show.parse_args(argc, argv)); 410 | if(r != 0) 411 | { 412 | return r; 413 | } 414 | r = show.verify_args(); 415 | if(r != 0) 416 | { 417 | return r; 418 | } 419 | return show.process(); 420 | } 421 | 422 | 423 | // vim: ts=4 sw=4 et 424 | -------------------------------------------------------------------------------- /tools/unicode_data_parser.cpp: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved 2 | // 3 | // https://snapwebsites.org/project/libutf8 4 | // contact@m2osw.com 5 | // 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | // 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | // 16 | // You should have received a copy of the GNU General Public License along 17 | // with this program; if not, write to the Free Software Foundation, Inc., 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 | 20 | /** \file 21 | * \brief Tool used to convert the UnicodeData.txt file to C structures. 22 | * 23 | * This executable is used to convert the UnicodeData.txt to a set of 24 | * C structure which we can search very quickly to find Unicode characters. 25 | * This gives us all the necessary information to convert strings to NFKC 26 | * NFKD, and especially NFC and NFD. 
27 | * 28 | * \sa http://www.unicode.org/reports/tr15/ 29 | */ 30 | 31 | 32 | // libutf8 33 | // 34 | #include 35 | 36 | 37 | // libexcept 38 | // 39 | #include 40 | 41 | 42 | // C++ 43 | // 44 | #include 45 | #include 46 | #include 47 | 48 | 49 | // C 50 | // 51 | #include 52 | #include 53 | 54 | 55 | // last include 56 | // 57 | #include 58 | 59 | 60 | 61 | namespace 62 | { 63 | 64 | 65 | 66 | 67 | 68 | 69 | } // no name namespace 70 | 71 | 72 | 73 | void usage() 74 | { 75 | std::cout << "Usage: unicode_data_parser \n"; 76 | std::cout << "Where:\n"; 77 | std::cout << " is a path to the unicode files such as UnicodeData.txt (default: \"/usr/shared/libutf8/unicode\")\n"; 78 | std::cout << " is a path to the output unicode_data.ucdb file (default: a.ucdb)\n"; 79 | } 80 | 81 | 82 | int main(int argc, char * argv[]) 83 | { 84 | libexcept::verify_inherited_files(); 85 | 86 | std::string input_dir; 87 | std::string output_filename; 88 | 89 | for(int i(1); i < argc; ++i) 90 | { 91 | if(argv[i][0] == '-') 92 | { 93 | switch(argv[i][1]) 94 | { 95 | case 'h': 96 | usage(); 97 | exit(1); 98 | 99 | default: 100 | std::cerr << "error: unknown command line option -" 101 | << argv[i][1] 102 | << "\n"; 103 | exit(1); 104 | break; 105 | 106 | } 107 | } 108 | else 109 | { 110 | if(input_dir.empty()) 111 | { 112 | input_dir = argv[i]; 113 | if(input_dir.empty()) 114 | { 115 | std::cerr << "error: input directory name can't be empty, try \".\" for current folder.\n"; 116 | exit(1); 117 | } 118 | } 119 | else if(output_filename.empty()) 120 | { 121 | output_filename = argv[i]; 122 | } 123 | else 124 | { 125 | std::cerr << "error: too many filenames on the command line.\n"; 126 | exit(1); 127 | } 128 | } 129 | } 130 | 131 | if(input_dir.empty()) 132 | { 133 | input_dir = "/usr/shared/libutf8/unicode"; 134 | } 135 | 136 | if(output_filename.empty()) 137 | { 138 | output_filename = "a.ucdb"; 139 | } 140 | 141 | libutf8::ucd_parser p(input_dir, output_filename); 142 | p.generate(); 143 | 
144 | return 0; 145 | } 146 | 147 | 148 | // vim: ts=4 sw=4 et 149 | --------------------------------------------------------------------------------