├── .gitignore
├── CMakeLists.txt
├── LICENSE.txt
├── README.md
├── TODO.txt
├── cmake
├── CMakeLists.txt
└── LibUtf8Config.cmake
├── conf
├── CMakeLists.txt
└── unicode
│ ├── CMakeLists.txt
│ ├── DerivedAge.txt
│ ├── Jamo.txt
│ ├── LICENSE.txt
│ ├── NameAliases.txt
│ ├── README.md
│ └── UnicodeData.txt
├── debian
├── changelog
├── compat
├── control
├── copyright
├── docs
├── libutf8-dev.install
├── libutf8-doc.install
├── libutf8.install
├── rules
└── source
│ └── options
├── doc
├── CMakeLists.txt
├── footer.html
├── libutf8.doxy.in
└── libutf8.png
├── libutf8
├── CMakeLists.txt
├── base.cpp
├── base.h
├── caseinsensitivestring.h
├── exception.h
├── iterator.cpp
├── iterator.h
├── json_tokens.cpp
├── json_tokens.h
├── libutf8.cpp
├── libutf8.h
├── unicode_data.cpp
├── unicode_data.h
├── unicode_data_file.cpp
├── unicode_data_file.h
├── version.cpp
└── version.h.in
├── mk
├── tests
├── CMakeLists.txt
├── catch_bom.cpp
├── catch_caseinsensitive.cpp
├── catch_character.cpp
├── catch_iterator.cpp
├── catch_json_tokens.cpp
├── catch_length.cpp
├── catch_main.cpp
├── catch_main.h
├── catch_stream.cpp
├── catch_string.cpp
├── catch_valid.cpp
├── catch_version.cpp
├── example-for-show-utf16.txt
├── example-for-show-utf32.txt
├── example-for-show-utf8.txt
├── unicode
│ ├── LICENSE.txt
│ └── NormalizationTest.txt
└── verify-show-unicode.sh
└── tools
├── CMakeLists.txt
├── show_unicode.cpp
└── unicode_data_parser.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | tmp
2 | *.sw?
3 | seed.txt
4 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | #
3 | # https://snapwebsites.org/project/libutf8
4 | # contact@m2osw.com
5 | #
6 | # This program is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | cmake_minimum_required(VERSION 3.10.2)
21 |
22 | project(utf8_library)
23 |
24 | find_package(SnapCMakeModules REQUIRED)
25 | find_package(LibExcept REQUIRED)
26 | find_package(SnapDev REQUIRED)
27 |
28 | SnapGetVersion(LIBUTF8 ${CMAKE_CURRENT_SOURCE_DIR})
29 |
30 | include_directories(
31 | ${PROJECT_SOURCE_DIR}
32 | ${CMAKE_CURRENT_BINARY_DIR}
33 | )
34 |
35 | add_subdirectory(libutf8)
36 | add_subdirectory(tools )
37 | add_subdirectory(conf )
38 | add_subdirectory(doc )
39 | add_subdirectory(cmake )
40 | add_subdirectory(tests )
41 |
42 | # vim: ts=4 sw=4 et
43 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2006-2023 Made to Order Software Corp. All Rights Reserved
2 |
3 | https://snapwebsites.org/
4 | contact@m2osw.com
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 |
7 | # Introduction
8 |
9 | The libutf8 library is a helper library to handle UTF-8 strings in C++.
10 | Although C++11 added `char32_t` (and `char16_t`) and C++20 added
11 | `char8_t`, the conversions are still not seamless between each type
12 | (although it is becoming easier to handle such.)
13 |
14 | This library proposes automated conversions between `std::string` (viewed
15 | as UTF-8 in nearly all of our code) and `std::u32string` (a.k.a. UTF-32
16 | strings.)
17 |
18 | # Reasons Behind Having Our Own Library
19 |
20 | All the libraries I've seen are either in C and very cumbersome to use or
21 | offer an interface which depends on the current `LOCALE`. In other words,
22 | the system default `mbstowcs()` function, for example, does not always view
23 | the input string as UTF-8. That also means there are complexities and thus
24 | inefficiencies in determining which conversion to use.
25 |
26 | In our case, we always have UTF-8 as input and output and at times we need
27 | to handle the characters as UTF-32. For example, to transform the character
28 | to uppercase, it is necessary to have a UTF-32 character.
29 |
30 | # API
31 |
32 | ## String Conversions
33 |
34 | The library offers two conversion functions as follows:
35 |
36 | libutf8::to_u8string(std::u32string const & str);
37 | libutf8::to_u32string(std::string const & str);
38 |
39 | As time passes, we will add other conversions so as to support all formats
40 | although at this point these two are the only two we need in Snap! Websites.
41 |
42 | Here is an example of usage:
43 |
44 | std::string u8;
45 |
46 | u8 = u8"This is a UTF-8 string";
47 |
48 | std::u32string u32;
49 | u32 = libutf8::to_u32string(u8);
50 |
51 | std::string back;
52 | back = libutf8::to_u8string(u32);
53 |
54 | Note that u8 string could be _more_ UTF-8 by including characters outside
55 | of the ASCII range and it would still work as you would expect.
56 |
57 | ### String Length in Characters
58 |
59 | The library offers the `u8length()` function which computes the length of
60 | a UTF-8 string. Note that this does not verify whether the UTF-8 data is
61 | valid. It very quickly counts the number of non-continuation bytes (i.e.
62 | it skips the continuation bytes, those between 0x80 and 0xBF inclusive.)
63 |
64 | std::string u8("Your UTF-8 string");
65 | size_t length = libutf8::u8length(u8);
66 |
67 | ### Case Insensitive Compare
68 |
69 | In most cases, you can compare two UTF-8 strings with the normal `==`
70 | operator. Once in a while, though, you may want to compare them case
71 | insensitively.
72 |
73 | Like with the iterator below, we wanted to offer a function that allows
74 | you to compare two UTF-8 strings properly and as quickly as possible.
75 | This meant to not have to convert the entire strings before doing the
76 | compare because having to do so means allocating memory for both
77 | strings just to do the compare and the conversion would convert the
78 | entire strings instead of just what's necessary.
79 |
80 | Out of these constraints we created the `u8casecmp()` function. It
81 | takes two UTF-8 strings and compares the characters one at a time.
82 | Unless the strings are equal, only the number of characters up to
83 | the first non-equal one, will be converted.
84 |
85 | std::string a("First String");
86 | std::string b("First Test");
87 |
88 | int r(libutf8::u8casecmp(a, b));
89 | if(r == 0)
90 | {
91 | std::cout << "a and b are equal" << std::endl;
92 | }
93 | else if(r < 0)
94 | {
95 | std::cout << "a comes before b" << std::endl;
96 | }
97 | else //if(r > 0)
98 | {
99 | std::cout << "a comes after b" << std::endl;
100 | }
101 |
102 | WARNING: the function does no collation, so it is not going to take the
103 | language into account. It uses lowercase characters, as suggested by the
104 | Unicode standard, but outside of that, the compare is binary.
105 |
106 | ## UTF-8 Iterator
107 |
108 | We often have an `std::string` representing UTF-8 and we want
109 | to iterate the content as UTF-32 characters. Although we could convert
110 | the string to a full `std::u32string` and then iterate through the
111 | `std::u32string`, that (1) requires a copy and (2) uses four times
112 | the amount of memory (five times if you include the `std::string` size...)
113 | Note also that the copy requires a `malloc()` and later a `free()` once
114 | done with it.
115 |
116 | The iterator solves these problems by allowing us to iterate through the
117 | `std::string` and getting the next or previous Unicode character without
118 | having to use any more memory. The conversion itself is slightly slower
119 | than converting a string all at once, but doing a `malloc()` to get the
120 | `std::u32string` is definitely going to be way slower than our iterator
121 | in nearly all circumstances.
122 |
123 | The following example shows the code point of each character, one per line:
124 |
125 | std::string u8("This is your UTF-8 string");
126 |
127 | for(libutf8::utf8_iterator it(u8);
128 | it != u8.end();
129 | ++it)
130 | {
131 | std::cout << static_cast<int>(*it) << std::endl;
132 | }
133 |
134 | You can compare standard `std::string` iterators with `==` and `!=`. The
135 | `++` and `--` operators work as expected. If you do a `++` when already
136 | at the end, nothing happens. If you do a `--` when already at the beginning,
137 | nothing happens.
138 |
139 | Once you are at the end, getting the character (`*it`) returns `libutf8::EOS`.
140 | So you can loop through until you get `libutf8::EOS` instead of checking
141 | against the end iterator:
142 |
143 | std::string u8("This is your UTF-8 string");
144 |
145 | libutf8::utf8_iterator it(u8);
146 | while(*it != libutf8::EOS)
147 | {
148 | std::cout << static_cast<int>(*it++) << std::endl;
149 | }
150 |
151 | Remember that a good optimization is to avoid the post increment. It will
152 | be faster to do:
153 |
154 | char32_t c = *it;
155 | ++it;
156 |
157 | because you avoid a copy of the iterator (even though it's only 16 bytes...)
158 |
159 | ## Low Level Functions
160 |
161 | We expose the low level functions such as `mbstowc()` for edge cases where
162 | you may not have an `std::string`. Those functions should not be used if
163 | at all possible because they require proper handling of the buffers passed
164 | to them. A single mistake there and you could end up with a crashing bug
165 | in your code.
166 |
167 | # TODO
168 |
169 | ## Auto-Conversions
170 |
171 | Conversions for many more types of strings such as all the `char *`
172 | and also look into whether implementing an extension to the
173 | `std::basic_string` would be possible to directly have conversions
174 | integrated in our strings (i.e. to be able to write `str8 = str32;` and
175 | `str32 = str8` without having to write `str8 = libutf8::to_u8string(str32)`.)
176 |
177 | ## Canonicalization
178 |
179 | Right now, we do not try to canonicalize the strings, so diacritics may
180 | appear as standalone or combined characters. We want to implement the
181 | necessary code to decompose and re-compose them in a normalized manner.
182 |
183 | This is very important for comparing strings against each other for
184 | equality (i.e. an 'a' with a grave accent is equal to an 'a' followed
185 | by the grave accent character).
186 |
187 | ## Character Name, Type, etc.
188 |
189 | The UnicodeData.txt file (offered by the Unicode website) lists all the
190 | characters with their name and their types. We want to offer the user
191 | access to that data.
192 |
193 | We should simply have the table as a struct and return a pointer to
194 | the corresponding character. Sort those by character number and use
195 | a binary search to find the structure.
196 |
197 | Some of that information is to be used for the canonicalization so it
198 | is a must have.
199 |
200 | UnicodeData.txt file format is defined in:
201 | http://www.unicode.org/L2/L1999/UnicodeData.html
202 |
203 |
204 |
205 | # License
206 |
207 | The source is covered by the MIT license. The debian folder is covered
208 | by the GPL 2.0.
209 |
210 |
211 | # Bugs
212 |
213 | Submit bug reports and patches on
214 | [github](https://github.com/m2osw/libutf8/issues).
215 |
216 |
217 | _This file is part of the [snapcpp project](https://snapwebsites.org/)._
218 |
--------------------------------------------------------------------------------
/TODO.txt:
--------------------------------------------------------------------------------
1 |
2 | * `utf8lint` verify that a file is valid UTF-8 (see show-unicode, we can have a softlink instead and if called utf8lint, assume --quiet).
3 | * Enhance `show-unicode`:
4 | - Support a range (so we can see the characters in a given range).
5 | - Actually do a validation step.
6 | * Add a reverse() function which works correctly with a UTF-8 string.
7 | * Add a reverse() function which works correctly with a UTF-16 string.
8 | * Add a fix() function which takes UTF-32/16 and removes any invalid characters (UTF-8 is done).
9 | * Add a "lexer base" which is to read an input file one character at a time
10 | like a lexer getc() generally does and return char32_t characters
11 | (see basic-xml for an example on how this is done and convert that one to
12 | using this new "lexer base")
13 |
14 |
--------------------------------------------------------------------------------
/cmake/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | #
3 | # https://snapwebsites.org/project/libutf8
4 | # contact@m2osw.com
5 | #
6 | # This program is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | project(libutf8_cmake)
21 |
22 | install(
23 | FILES
24 | LibUtf8Config.cmake
25 |
26 | DESTINATION
27 | share/cmake/LibUtf8
28 | )
29 |
30 | # vim: ts=4 sw=4 et nocindent
31 |
--------------------------------------------------------------------------------
/cmake/LibUtf8Config.cmake:
--------------------------------------------------------------------------------
1 | # - Find LibUtf8
2 | #
3 | # LIBUTF8_FOUND - System has LibUtf8
4 | # LIBUTF8_INCLUDE_DIRS - The LibUtf8 include directories
5 | # LIBUTF8_LIBRARIES - The libraries needed to use LibUtf8
6 | # LIBUTF8_DEFINITIONS - Compiler switches required for using LibUtf8
7 | #
8 | # License:
9 | #
10 | # Copyright (c) 2011-2023 Made to Order Software Corp. All Rights Reserved
11 | #
12 | # https://snapwebsites.org/project/libutf8
13 | # contact@m2osw.com
14 | #
15 | # This program is free software: you can redistribute it and/or modify
16 | # it under the terms of the GNU General Public License as published by
17 | # the Free Software Foundation, either version 3 of the License, or
18 | # (at your option) any later version.
19 | #
20 | # This program is distributed in the hope that it will be useful,
21 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 | # GNU General Public License for more details.
24 | #
25 | # You should have received a copy of the GNU General Public License
26 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
27 |
28 | find_path(
29 | LIBUTF8_INCLUDE_DIR
30 | libutf8/libutf8.h
31 |
32 | PATHS
33 | ENV LIBUTF8_INCLUDE_DIR
34 | )
35 |
36 | find_library(
37 | LIBUTF8_LIBRARY
38 | utf8
39 |
40 | PATHS
41 | ${LIBUTF8_LIBRARY_DIR}
42 | ENV LIBUTF8_LIBRARY
43 | )
44 |
45 | mark_as_advanced(
46 | LIBUTF8_INCLUDE_DIR
47 | LIBUTF8_LIBRARY
48 | )
49 |
50 | set(LIBUTF8_INCLUDE_DIRS ${LIBUTF8_INCLUDE_DIR})
51 | set(LIBUTF8_LIBRARIES ${LIBUTF8_LIBRARY})
52 |
53 | include(FindPackageHandleStandardArgs)
54 |
55 | find_package_handle_standard_args(
56 | LibUtf8
57 | REQUIRED_VARS
58 | LIBUTF8_INCLUDE_DIR
59 | LIBUTF8_LIBRARY
60 | )
61 |
62 | # vim: ts=4 sw=4 et
63 |
--------------------------------------------------------------------------------
/conf/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | #
3 | # https://snapwebsites.org/project/libutf8
4 | # contact@m2osw.com
5 | #
6 | # This program is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | project(utf8_library_conf)
21 |
22 | add_subdirectory(unicode)
23 |
24 | # vim: ts=4 sw=4 et
25 |
--------------------------------------------------------------------------------
/conf/unicode/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2012-2023 Made to Order Software Corp. All Rights Reserved
2 | #
3 | # https://snapwebsites.org/project/libutf8
4 | # contact@m2osw.com
5 | #
6 | # This program is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | ##
21 | ## unicode-data
22 | ##
23 | project(unicode-data)
24 |
25 | install(
26 | FILES
27 | DerivedAge.txt
28 | Jamo.txt
29 | NameAliases.txt
30 | UnicodeData.txt
31 |
32 | DESTINATION
33 | share/libutf8/unicode
34 | )
35 |
36 |
37 | # vim: ts=4 sw=4 et
38 |
--------------------------------------------------------------------------------
/conf/unicode/Jamo.txt:
--------------------------------------------------------------------------------
1 | # Jamo-13.0.0.txt
2 | # Date: 2019-09-09, 19:46:00 GMT [KW, LI]
3 | # © 2019 Unicode®, Inc.
4 | # For terms of use, see http://www.unicode.org/terms_of_use.html
5 | #
6 | # Unicode Character Database
7 | # For documentation, see http://www.unicode.org/reports/tr44/
8 | #
9 | # This file defines the Jamo_Short_Name property.
10 | #
11 | # See Section 3.12 of The Unicode Standard, Version 13.0
12 | # for more information.
13 | #
14 | # Each line contains two fields, separated by a semicolon.
15 | #
16 | # The first field gives the code point, in 4-digit hexadecimal
17 | # form, of a conjoining jamo character that participates in the
18 | # algorithmic determination of Hangul syllable character names.
19 | # The second field gives the Jamo_Short_Name as a one-, two-,
20 | # or three-character ASCII string (or in one case, for U+110B,
21 | # the null string).
22 | #
23 | # #############################################################
24 |
25 | 1100; G # HANGUL CHOSEONG KIYEOK
26 | 1101; GG # HANGUL CHOSEONG SSANGKIYEOK
27 | 1102; N # HANGUL CHOSEONG NIEUN
28 | 1103; D # HANGUL CHOSEONG TIKEUT
29 | 1104; DD # HANGUL CHOSEONG SSANGTIKEUT
30 | 1105; R # HANGUL CHOSEONG RIEUL
31 | 1106; M # HANGUL CHOSEONG MIEUM
32 | 1107; B # HANGUL CHOSEONG PIEUP
33 | 1108; BB # HANGUL CHOSEONG SSANGPIEUP
34 | 1109; S # HANGUL CHOSEONG SIOS
35 | 110A; SS # HANGUL CHOSEONG SSANGSIOS
36 | 110B; # HANGUL CHOSEONG IEUNG
37 | 110C; J # HANGUL CHOSEONG CIEUC
38 | 110D; JJ # HANGUL CHOSEONG SSANGCIEUC
39 | 110E; C # HANGUL CHOSEONG CHIEUCH
40 | 110F; K # HANGUL CHOSEONG KHIEUKH
41 | 1110; T # HANGUL CHOSEONG THIEUTH
42 | 1111; P # HANGUL CHOSEONG PHIEUPH
43 | 1112; H # HANGUL CHOSEONG HIEUH
44 | 1161; A # HANGUL JUNGSEONG A
45 | 1162; AE # HANGUL JUNGSEONG AE
46 | 1163; YA # HANGUL JUNGSEONG YA
47 | 1164; YAE # HANGUL JUNGSEONG YAE
48 | 1165; EO # HANGUL JUNGSEONG EO
49 | 1166; E # HANGUL JUNGSEONG E
50 | 1167; YEO # HANGUL JUNGSEONG YEO
51 | 1168; YE # HANGUL JUNGSEONG YE
52 | 1169; O # HANGUL JUNGSEONG O
53 | 116A; WA # HANGUL JUNGSEONG WA
54 | 116B; WAE # HANGUL JUNGSEONG WAE
55 | 116C; OE # HANGUL JUNGSEONG OE
56 | 116D; YO # HANGUL JUNGSEONG YO
57 | 116E; U # HANGUL JUNGSEONG U
58 | 116F; WEO # HANGUL JUNGSEONG WEO
59 | 1170; WE # HANGUL JUNGSEONG WE
60 | 1171; WI # HANGUL JUNGSEONG WI
61 | 1172; YU # HANGUL JUNGSEONG YU
62 | 1173; EU # HANGUL JUNGSEONG EU
63 | 1174; YI # HANGUL JUNGSEONG YI
64 | 1175; I # HANGUL JUNGSEONG I
65 | 11A8; G # HANGUL JONGSEONG KIYEOK
66 | 11A9; GG # HANGUL JONGSEONG SSANGKIYEOK
67 | 11AA; GS # HANGUL JONGSEONG KIYEOK-SIOS
68 | 11AB; N # HANGUL JONGSEONG NIEUN
69 | 11AC; NJ # HANGUL JONGSEONG NIEUN-CIEUC
70 | 11AD; NH # HANGUL JONGSEONG NIEUN-HIEUH
71 | 11AE; D # HANGUL JONGSEONG TIKEUT
72 | 11AF; L # HANGUL JONGSEONG RIEUL
73 | 11B0; LG # HANGUL JONGSEONG RIEUL-KIYEOK
74 | 11B1; LM # HANGUL JONGSEONG RIEUL-MIEUM
75 | 11B2; LB # HANGUL JONGSEONG RIEUL-PIEUP
76 | 11B3; LS # HANGUL JONGSEONG RIEUL-SIOS
77 | 11B4; LT # HANGUL JONGSEONG RIEUL-THIEUTH
78 | 11B5; LP # HANGUL JONGSEONG RIEUL-PHIEUPH
79 | 11B6; LH # HANGUL JONGSEONG RIEUL-HIEUH
80 | 11B7; M # HANGUL JONGSEONG MIEUM
81 | 11B8; B # HANGUL JONGSEONG PIEUP
82 | 11B9; BS # HANGUL JONGSEONG PIEUP-SIOS
83 | 11BA; S # HANGUL JONGSEONG SIOS
84 | 11BB; SS # HANGUL JONGSEONG SSANGSIOS
85 | 11BC; NG # HANGUL JONGSEONG IEUNG
86 | 11BD; J # HANGUL JONGSEONG CIEUC
87 | 11BE; C # HANGUL JONGSEONG CHIEUCH
88 | 11BF; K # HANGUL JONGSEONG KHIEUKH
89 | 11C0; T # HANGUL JONGSEONG THIEUTH
90 | 11C1; P # HANGUL JONGSEONG PHIEUPH
91 | 11C2; H # HANGUL JONGSEONG HIEUH
92 |
93 | # EOF
94 |
--------------------------------------------------------------------------------
/conf/unicode/LICENSE.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/conf/unicode/LICENSE.txt
--------------------------------------------------------------------------------
/conf/unicode/NameAliases.txt:
--------------------------------------------------------------------------------
1 | # NameAliases-13.0.0.txt
2 | # Date: 2019-09-09, 19:47:00 GMT [KW, LI]
3 | # © 2019 Unicode®, Inc.
4 | # For terms of use, see http://www.unicode.org/terms_of_use.html
5 | #
6 | # Unicode Character Database
7 | # For documentation, see http://www.unicode.org/reports/tr44/
8 | #
9 | # This file is a normative contributory data file in the
10 | # Unicode Character Database.
11 | #
12 | # This file defines the formal name aliases for Unicode characters.
13 | #
14 | # For informative aliases, see NamesList.txt
15 | #
16 | # The formal name aliases are divided into five types, each with a distinct label.
17 | #
18 | # Type Labels:
19 | #
20 | # 1. correction
21 | # Corrections for serious problems in the character names
22 | # 2. control
23 | # ISO 6429 names for C0 and C1 control functions, and other
24 | # commonly occurring names for control codes
25 | # 3. alternate
26 | # A few widely used alternate names for format characters
27 | # 4. figment
28 | # Several documented labels for C1 control code points which
29 | # were never actually approved in any standard
30 | # 5. abbreviation
31 | # Commonly occurring abbreviations (or acronyms) for control codes,
32 | # format characters, spaces, and variation selectors
33 | #
34 | # The formal name aliases are part of the Unicode character namespace, which
35 | # includes the character names and the names of named character sequences.
36 | # The inclusion of ISO 6429 names and other commonly occurring names and
37 | # abbreviations for control codes and format characters as formal name aliases
38 | # is to help avoid name collisions between Unicode character names and the
39 | # labels which commonly appear in text and/or in implementations such as regex, for
40 | # control codes (which for historical reasons have no Unicode character name)
41 | # or for format characters.
42 | #
43 | # For documentation, see NamesList.html and http://www.unicode.org/reports/tr44/
44 | #
45 | # FORMAT
46 | #
47 | # Each line has three fields, as described here:
48 | #
49 | # First field: Code point
50 | # Second field: Alias
51 | # Third field: Type
52 | #
53 | # The type labels used are defined above. As for property values, comparisons
54 | # of type labels should ignore case.
55 | #
56 | # The type labels can be mapped to other strings for display, if desired.
57 | #
58 | # In case multiple aliases are assigned, additional aliases
59 | # are provided on separate lines. Parsers of this data file should
60 | # take note that the same code point can (and does) occur more than once.
61 | #
62 | # Note that currently the only instances of multiple aliases of the same
63 | # type for a single code point are either of type "control" or "abbreviation".
64 | # An alias of type "abbreviation" can, in principle, be added for any code
65 | # point, although currently aliases of type "correction" do not have
66 | # any additional aliases of type "abbreviation". Such relationships
67 | # are not enforced by stability policies.
68 | #
69 | #-----------------------------------------------------------------
70 |
71 | 0000;NULL;control
72 | 0000;NUL;abbreviation
73 | 0001;START OF HEADING;control
74 | 0001;SOH;abbreviation
75 | 0002;START OF TEXT;control
76 | 0002;STX;abbreviation
77 | 0003;END OF TEXT;control
78 | 0003;ETX;abbreviation
79 | 0004;END OF TRANSMISSION;control
80 | 0004;EOT;abbreviation
81 | 0005;ENQUIRY;control
82 | 0005;ENQ;abbreviation
83 | 0006;ACKNOWLEDGE;control
84 | 0006;ACK;abbreviation
85 |
86 | # Note that no formal name alias for the ISO 6429 "BELL" is
87 | # provided for U+0007, because of the existing name collision
88 | # with U+1F514 BELL.
89 |
90 | 0007;ALERT;control
91 | 0007;BEL;abbreviation
92 |
93 | 0008;BACKSPACE;control
94 | 0008;BS;abbreviation
95 | 0009;CHARACTER TABULATION;control
96 | 0009;HORIZONTAL TABULATION;control
97 | 0009;HT;abbreviation
98 | 0009;TAB;abbreviation
99 | 000A;LINE FEED;control
100 | 000A;NEW LINE;control
101 | 000A;END OF LINE;control
102 | 000A;LF;abbreviation
103 | 000A;NL;abbreviation
104 | 000A;EOL;abbreviation
105 | 000B;LINE TABULATION;control
106 | 000B;VERTICAL TABULATION;control
107 | 000B;VT;abbreviation
108 | 000C;FORM FEED;control
109 | 000C;FF;abbreviation
110 | 000D;CARRIAGE RETURN;control
111 | 000D;CR;abbreviation
112 | 000E;SHIFT OUT;control
113 | 000E;LOCKING-SHIFT ONE;control
114 | 000E;SO;abbreviation
115 | 000F;SHIFT IN;control
116 | 000F;LOCKING-SHIFT ZERO;control
117 | 000F;SI;abbreviation
118 | 0010;DATA LINK ESCAPE;control
119 | 0010;DLE;abbreviation
120 | 0011;DEVICE CONTROL ONE;control
121 | 0011;DC1;abbreviation
122 | 0012;DEVICE CONTROL TWO;control
123 | 0012;DC2;abbreviation
124 | 0013;DEVICE CONTROL THREE;control
125 | 0013;DC3;abbreviation
126 | 0014;DEVICE CONTROL FOUR;control
127 | 0014;DC4;abbreviation
128 | 0015;NEGATIVE ACKNOWLEDGE;control
129 | 0015;NAK;abbreviation
130 | 0016;SYNCHRONOUS IDLE;control
131 | 0016;SYN;abbreviation
132 | 0017;END OF TRANSMISSION BLOCK;control
133 | 0017;ETB;abbreviation
134 | 0018;CANCEL;control
135 | 0018;CAN;abbreviation
136 | 0019;END OF MEDIUM;control
137 | 0019;EOM;abbreviation
138 | 001A;SUBSTITUTE;control
139 | 001A;SUB;abbreviation
140 | 001B;ESCAPE;control
141 | 001B;ESC;abbreviation
142 | 001C;INFORMATION SEPARATOR FOUR;control
143 | 001C;FILE SEPARATOR;control
144 | 001C;FS;abbreviation
145 | 001D;INFORMATION SEPARATOR THREE;control
146 | 001D;GROUP SEPARATOR;control
147 | 001D;GS;abbreviation
148 | 001E;INFORMATION SEPARATOR TWO;control
149 | 001E;RECORD SEPARATOR;control
150 | 001E;RS;abbreviation
151 | 001F;INFORMATION SEPARATOR ONE;control
152 | 001F;UNIT SEPARATOR;control
153 | 001F;US;abbreviation
154 | 0020;SP;abbreviation
155 | 007F;DELETE;control
156 | 007F;DEL;abbreviation
157 |
158 | # PADDING CHARACTER and HIGH OCTET PRESET represent
159 | # architectural concepts initially proposed for early
160 | # drafts of ISO/IEC 10646-1. They were never actually
161 | # approved or standardized: hence their designation
162 | # here as the "figment" type. Formal name aliases
163 | # (and corresponding abbreviations) for these code
164 | # points are included here because these names leaked
165 | # out from the draft documents and were published in
166 | # at least one RFC whose names for code points was
167 | # implemented in Perl regex expressions.
168 |
169 | 0080;PADDING CHARACTER;figment
170 | 0080;PAD;abbreviation
171 | 0081;HIGH OCTET PRESET;figment
172 | 0081;HOP;abbreviation
173 |
174 | 0082;BREAK PERMITTED HERE;control
175 | 0082;BPH;abbreviation
176 | 0083;NO BREAK HERE;control
177 | 0083;NBH;abbreviation
178 | 0084;INDEX;control
179 | 0084;IND;abbreviation
180 | 0085;NEXT LINE;control
181 | 0085;NEL;abbreviation
182 | 0086;START OF SELECTED AREA;control
183 | 0086;SSA;abbreviation
184 | 0087;END OF SELECTED AREA;control
185 | 0087;ESA;abbreviation
186 | 0088;CHARACTER TABULATION SET;control
187 | 0088;HORIZONTAL TABULATION SET;control
188 | 0088;HTS;abbreviation
189 | 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
190 | 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
191 | 0089;HTJ;abbreviation
192 | 008A;LINE TABULATION SET;control
193 | 008A;VERTICAL TABULATION SET;control
194 | 008A;VTS;abbreviation
195 | 008B;PARTIAL LINE FORWARD;control
196 | 008B;PARTIAL LINE DOWN;control
197 | 008B;PLD;abbreviation
198 | 008C;PARTIAL LINE BACKWARD;control
199 | 008C;PARTIAL LINE UP;control
200 | 008C;PLU;abbreviation
201 | 008D;REVERSE LINE FEED;control
202 | 008D;REVERSE INDEX;control
203 | 008D;RI;abbreviation
204 | 008E;SINGLE SHIFT TWO;control
205 | 008E;SINGLE-SHIFT-2;control
206 | 008E;SS2;abbreviation
207 | 008F;SINGLE SHIFT THREE;control
208 | 008F;SINGLE-SHIFT-3;control
209 | 008F;SS3;abbreviation
210 | 0090;DEVICE CONTROL STRING;control
211 | 0090;DCS;abbreviation
212 | 0091;PRIVATE USE ONE;control
213 | 0091;PRIVATE USE-1;control
214 | 0091;PU1;abbreviation
215 | 0092;PRIVATE USE TWO;control
216 | 0092;PRIVATE USE-2;control
217 | 0092;PU2;abbreviation
218 | 0093;SET TRANSMIT STATE;control
219 | 0093;STS;abbreviation
220 | 0094;CANCEL CHARACTER;control
221 | 0094;CCH;abbreviation
222 | 0095;MESSAGE WAITING;control
223 | 0095;MW;abbreviation
224 | 0096;START OF GUARDED AREA;control
225 | 0096;START OF PROTECTED AREA;control
226 | 0096;SPA;abbreviation
227 | 0097;END OF GUARDED AREA;control
228 | 0097;END OF PROTECTED AREA;control
229 | 0097;EPA;abbreviation
230 | 0098;START OF STRING;control
231 | 0098;SOS;abbreviation
232 |
233 | # SINGLE GRAPHIC CHARACTER INTRODUCER is another
234 | # architectural concept from early drafts of ISO/IEC 10646-1
235 | # which was never approved and standardized.
236 |
237 | 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
238 | 0099;SGC;abbreviation
239 |
240 | 009A;SINGLE CHARACTER INTRODUCER;control
241 | 009A;SCI;abbreviation
242 | 009B;CONTROL SEQUENCE INTRODUCER;control
243 | 009B;CSI;abbreviation
244 | 009C;STRING TERMINATOR;control
245 | 009C;ST;abbreviation
246 | 009D;OPERATING SYSTEM COMMAND;control
247 | 009D;OSC;abbreviation
248 | 009E;PRIVACY MESSAGE;control
249 | 009E;PM;abbreviation
250 | 009F;APPLICATION PROGRAM COMMAND;control
251 | 009F;APC;abbreviation
252 | 00A0;NBSP;abbreviation
253 | 00AD;SHY;abbreviation
254 | 01A2;LATIN CAPITAL LETTER GHA;correction
255 | 01A3;LATIN SMALL LETTER GHA;correction
256 | 034F;CGJ;abbreviation
257 | 061C;ALM;abbreviation
258 | 0709;SYRIAC SUBLINEAR COLON SKEWED LEFT;correction
259 | 0CDE;KANNADA LETTER LLLA;correction
260 | 0E9D;LAO LETTER FO FON;correction
261 | 0E9F;LAO LETTER FO FAY;correction
262 | 0EA3;LAO LETTER RO;correction
263 | 0EA5;LAO LETTER LO;correction
264 | 0FD0;TIBETAN MARK BKA- SHOG GI MGO RGYAN;correction
265 | 11EC;HANGUL JONGSEONG YESIEUNG-KIYEOK;correction
266 | 11ED;HANGUL JONGSEONG YESIEUNG-SSANGKIYEOK;correction
267 | 11EE;HANGUL JONGSEONG SSANGYESIEUNG;correction
268 | 11EF;HANGUL JONGSEONG YESIEUNG-KHIEUKH;correction
269 | 180B;FVS1;abbreviation
270 | 180C;FVS2;abbreviation
271 | 180D;FVS3;abbreviation
272 | 180E;MVS;abbreviation
273 | 200B;ZWSP;abbreviation
274 | 200C;ZWNJ;abbreviation
275 | 200D;ZWJ;abbreviation
276 | 200E;LRM;abbreviation
277 | 200F;RLM;abbreviation
278 | 202A;LRE;abbreviation
279 | 202B;RLE;abbreviation
280 | 202C;PDF;abbreviation
281 | 202D;LRO;abbreviation
282 | 202E;RLO;abbreviation
283 | 202F;NNBSP;abbreviation
284 | 205F;MMSP;abbreviation
285 | 2060;WJ;abbreviation
286 | 2066;LRI;abbreviation
287 | 2067;RLI;abbreviation
288 | 2068;FSI;abbreviation
289 | 2069;PDI;abbreviation
290 | 2118;WEIERSTRASS ELLIPTIC FUNCTION;correction
291 | 2448;MICR ON US SYMBOL;correction
292 | 2449;MICR DASH SYMBOL;correction
293 | 2B7A;LEFTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE VERTICAL STROKE;correction
294 | 2B7C;RIGHTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE VERTICAL STROKE;correction
295 | A015;YI SYLLABLE ITERATION MARK;correction
296 | FE00;VS1;abbreviation
297 | FE01;VS2;abbreviation
298 | FE02;VS3;abbreviation
299 | FE03;VS4;abbreviation
300 | FE04;VS5;abbreviation
301 | FE05;VS6;abbreviation
302 | FE06;VS7;abbreviation
303 | FE07;VS8;abbreviation
304 | FE08;VS9;abbreviation
305 | FE09;VS10;abbreviation
306 | FE0A;VS11;abbreviation
307 | FE0B;VS12;abbreviation
308 | FE0C;VS13;abbreviation
309 | FE0D;VS14;abbreviation
310 | FE0E;VS15;abbreviation
311 | FE0F;VS16;abbreviation
312 | FE18;PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET;correction
313 | FEFF;BYTE ORDER MARK;alternate
314 | FEFF;BOM;abbreviation
315 | FEFF;ZWNBSP;abbreviation
316 | 122D4;CUNEIFORM SIGN NU11 TENU;correction
317 | 122D5;CUNEIFORM SIGN NU11 OVER NU11 BUR OVER BUR;correction
318 | 16E56;MEDEFAIDRIN CAPITAL LETTER H;correction
319 | 16E57;MEDEFAIDRIN CAPITAL LETTER NG;correction
320 | 16E76;MEDEFAIDRIN SMALL LETTER H;correction
321 | 16E77;MEDEFAIDRIN SMALL LETTER NG;correction
322 | 1B001;HENTAIGANA LETTER E-1;correction
323 | 1D0C5;BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS;correction
324 | E0100;VS17;abbreviation
325 | E0101;VS18;abbreviation
326 | E0102;VS19;abbreviation
327 | E0103;VS20;abbreviation
328 | E0104;VS21;abbreviation
329 | E0105;VS22;abbreviation
330 | E0106;VS23;abbreviation
331 | E0107;VS24;abbreviation
332 | E0108;VS25;abbreviation
333 | E0109;VS26;abbreviation
334 | E010A;VS27;abbreviation
335 | E010B;VS28;abbreviation
336 | E010C;VS29;abbreviation
337 | E010D;VS30;abbreviation
338 | E010E;VS31;abbreviation
339 | E010F;VS32;abbreviation
340 | E0110;VS33;abbreviation
341 | E0111;VS34;abbreviation
342 | E0112;VS35;abbreviation
343 | E0113;VS36;abbreviation
344 | E0114;VS37;abbreviation
345 | E0115;VS38;abbreviation
346 | E0116;VS39;abbreviation
347 | E0117;VS40;abbreviation
348 | E0118;VS41;abbreviation
349 | E0119;VS42;abbreviation
350 | E011A;VS43;abbreviation
351 | E011B;VS44;abbreviation
352 | E011C;VS45;abbreviation
353 | E011D;VS46;abbreviation
354 | E011E;VS47;abbreviation
355 | E011F;VS48;abbreviation
356 | E0120;VS49;abbreviation
357 | E0121;VS50;abbreviation
358 | E0122;VS51;abbreviation
359 | E0123;VS52;abbreviation
360 | E0124;VS53;abbreviation
361 | E0125;VS54;abbreviation
362 | E0126;VS55;abbreviation
363 | E0127;VS56;abbreviation
364 | E0128;VS57;abbreviation
365 | E0129;VS58;abbreviation
366 | E012A;VS59;abbreviation
367 | E012B;VS60;abbreviation
368 | E012C;VS61;abbreviation
369 | E012D;VS62;abbreviation
370 | E012E;VS63;abbreviation
371 | E012F;VS64;abbreviation
372 | E0130;VS65;abbreviation
373 | E0131;VS66;abbreviation
374 | E0132;VS67;abbreviation
375 | E0133;VS68;abbreviation
376 | E0134;VS69;abbreviation
377 | E0135;VS70;abbreviation
378 | E0136;VS71;abbreviation
379 | E0137;VS72;abbreviation
380 | E0138;VS73;abbreviation
381 | E0139;VS74;abbreviation
382 | E013A;VS75;abbreviation
383 | E013B;VS76;abbreviation
384 | E013C;VS77;abbreviation
385 | E013D;VS78;abbreviation
386 | E013E;VS79;abbreviation
387 | E013F;VS80;abbreviation
388 | E0140;VS81;abbreviation
389 | E0141;VS82;abbreviation
390 | E0142;VS83;abbreviation
391 | E0143;VS84;abbreviation
392 | E0144;VS85;abbreviation
393 | E0145;VS86;abbreviation
394 | E0146;VS87;abbreviation
395 | E0147;VS88;abbreviation
396 | E0148;VS89;abbreviation
397 | E0149;VS90;abbreviation
398 | E014A;VS91;abbreviation
399 | E014B;VS92;abbreviation
400 | E014C;VS93;abbreviation
401 | E014D;VS94;abbreviation
402 | E014E;VS95;abbreviation
403 | E014F;VS96;abbreviation
404 | E0150;VS97;abbreviation
405 | E0151;VS98;abbreviation
406 | E0152;VS99;abbreviation
407 | E0153;VS100;abbreviation
408 | E0154;VS101;abbreviation
409 | E0155;VS102;abbreviation
410 | E0156;VS103;abbreviation
411 | E0157;VS104;abbreviation
412 | E0158;VS105;abbreviation
413 | E0159;VS106;abbreviation
414 | E015A;VS107;abbreviation
415 | E015B;VS108;abbreviation
416 | E015C;VS109;abbreviation
417 | E015D;VS110;abbreviation
418 | E015E;VS111;abbreviation
419 | E015F;VS112;abbreviation
420 | E0160;VS113;abbreviation
421 | E0161;VS114;abbreviation
422 | E0162;VS115;abbreviation
423 | E0163;VS116;abbreviation
424 | E0164;VS117;abbreviation
425 | E0165;VS118;abbreviation
426 | E0166;VS119;abbreviation
427 | E0167;VS120;abbreviation
428 | E0168;VS121;abbreviation
429 | E0169;VS122;abbreviation
430 | E016A;VS123;abbreviation
431 | E016B;VS124;abbreviation
432 | E016C;VS125;abbreviation
433 | E016D;VS126;abbreviation
434 | E016E;VS127;abbreviation
435 | E016F;VS128;abbreviation
436 | E0170;VS129;abbreviation
437 | E0171;VS130;abbreviation
438 | E0172;VS131;abbreviation
439 | E0173;VS132;abbreviation
440 | E0174;VS133;abbreviation
441 | E0175;VS134;abbreviation
442 | E0176;VS135;abbreviation
443 | E0177;VS136;abbreviation
444 | E0178;VS137;abbreviation
445 | E0179;VS138;abbreviation
446 | E017A;VS139;abbreviation
447 | E017B;VS140;abbreviation
448 | E017C;VS141;abbreviation
449 | E017D;VS142;abbreviation
450 | E017E;VS143;abbreviation
451 | E017F;VS144;abbreviation
452 | E0180;VS145;abbreviation
453 | E0181;VS146;abbreviation
454 | E0182;VS147;abbreviation
455 | E0183;VS148;abbreviation
456 | E0184;VS149;abbreviation
457 | E0185;VS150;abbreviation
458 | E0186;VS151;abbreviation
459 | E0187;VS152;abbreviation
460 | E0188;VS153;abbreviation
461 | E0189;VS154;abbreviation
462 | E018A;VS155;abbreviation
463 | E018B;VS156;abbreviation
464 | E018C;VS157;abbreviation
465 | E018D;VS158;abbreviation
466 | E018E;VS159;abbreviation
467 | E018F;VS160;abbreviation
468 | E0190;VS161;abbreviation
469 | E0191;VS162;abbreviation
470 | E0192;VS163;abbreviation
471 | E0193;VS164;abbreviation
472 | E0194;VS165;abbreviation
473 | E0195;VS166;abbreviation
474 | E0196;VS167;abbreviation
475 | E0197;VS168;abbreviation
476 | E0198;VS169;abbreviation
477 | E0199;VS170;abbreviation
478 | E019A;VS171;abbreviation
479 | E019B;VS172;abbreviation
480 | E019C;VS173;abbreviation
481 | E019D;VS174;abbreviation
482 | E019E;VS175;abbreviation
483 | E019F;VS176;abbreviation
484 | E01A0;VS177;abbreviation
485 | E01A1;VS178;abbreviation
486 | E01A2;VS179;abbreviation
487 | E01A3;VS180;abbreviation
488 | E01A4;VS181;abbreviation
489 | E01A5;VS182;abbreviation
490 | E01A6;VS183;abbreviation
491 | E01A7;VS184;abbreviation
492 | E01A8;VS185;abbreviation
493 | E01A9;VS186;abbreviation
494 | E01AA;VS187;abbreviation
495 | E01AB;VS188;abbreviation
496 | E01AC;VS189;abbreviation
497 | E01AD;VS190;abbreviation
498 | E01AE;VS191;abbreviation
499 | E01AF;VS192;abbreviation
500 | E01B0;VS193;abbreviation
501 | E01B1;VS194;abbreviation
502 | E01B2;VS195;abbreviation
503 | E01B3;VS196;abbreviation
504 | E01B4;VS197;abbreviation
505 | E01B5;VS198;abbreviation
506 | E01B6;VS199;abbreviation
507 | E01B7;VS200;abbreviation
508 | E01B8;VS201;abbreviation
509 | E01B9;VS202;abbreviation
510 | E01BA;VS203;abbreviation
511 | E01BB;VS204;abbreviation
512 | E01BC;VS205;abbreviation
513 | E01BD;VS206;abbreviation
514 | E01BE;VS207;abbreviation
515 | E01BF;VS208;abbreviation
516 | E01C0;VS209;abbreviation
517 | E01C1;VS210;abbreviation
518 | E01C2;VS211;abbreviation
519 | E01C3;VS212;abbreviation
520 | E01C4;VS213;abbreviation
521 | E01C5;VS214;abbreviation
522 | E01C6;VS215;abbreviation
523 | E01C7;VS216;abbreviation
524 | E01C8;VS217;abbreviation
525 | E01C9;VS218;abbreviation
526 | E01CA;VS219;abbreviation
527 | E01CB;VS220;abbreviation
528 | E01CC;VS221;abbreviation
529 | E01CD;VS222;abbreviation
530 | E01CE;VS223;abbreviation
531 | E01CF;VS224;abbreviation
532 | E01D0;VS225;abbreviation
533 | E01D1;VS226;abbreviation
534 | E01D2;VS227;abbreviation
535 | E01D3;VS228;abbreviation
536 | E01D4;VS229;abbreviation
537 | E01D5;VS230;abbreviation
538 | E01D6;VS231;abbreviation
539 | E01D7;VS232;abbreviation
540 | E01D8;VS233;abbreviation
541 | E01D9;VS234;abbreviation
542 | E01DA;VS235;abbreviation
543 | E01DB;VS236;abbreviation
544 | E01DC;VS237;abbreviation
545 | E01DD;VS238;abbreviation
546 | E01DE;VS239;abbreviation
547 | E01DF;VS240;abbreviation
548 | E01E0;VS241;abbreviation
549 | E01E1;VS242;abbreviation
550 | E01E2;VS243;abbreviation
551 | E01E3;VS244;abbreviation
552 | E01E4;VS245;abbreviation
553 | E01E5;VS246;abbreviation
554 | E01E6;VS247;abbreviation
555 | E01E7;VS248;abbreviation
556 | E01E8;VS249;abbreviation
557 | E01E9;VS250;abbreviation
558 | E01EA;VS251;abbreviation
559 | E01EB;VS252;abbreviation
560 | E01EC;VS253;abbreviation
561 | E01ED;VS254;abbreviation
562 | E01EE;VS255;abbreviation
563 | E01EF;VS256;abbreviation
564 |
565 | # EOF
566 |
--------------------------------------------------------------------------------
/conf/unicode/README.md:
--------------------------------------------------------------------------------
1 |
2 | The files found here are copies of the Unicode files found on the Unicode
3 | website. We only include the few files that we parse. When a new version
4 | of Unicode comes out, we should be able to just replace those files and
5 | parse the new version. Also, we parse at installation time, so we can
6 | update an existing installation with a simple `apt-get upgrade`.
7 |
8 | See: https://www.unicode.org/Public/
9 |
10 | Select a version and then `ucd`.
11 |
12 |
--------------------------------------------------------------------------------
/debian/changelog:
--------------------------------------------------------------------------------
1 | libutf8 (1.0.15.2~bionic) bionic; urgency=high
2 |
3 | * Bumped build version to rebuild on Launchpad.
4 |
5 | -- Alexis Wilke Fri, 10 Nov 2023 15:24:41 -0800
6 |
7 | libutf8 (1.0.15.1~bionic) bionic; urgency=high
8 |
9 | * Bumped build version to rebuild on Launchpad.
10 |
11 | -- Alexis Wilke Tue, 07 Nov 2023 06:03:57 -0800
12 |
13 | libutf8 (1.0.15.0~jammy) jammy; urgency=high
14 |
15 | * Moved find() of doxygen in the doc/CMakeLists.txt file.
16 | * Changed the NOT_A_CHARACTER value to -2 to distinguish it from EOS.
17 | * Made utf8_iterator::operator * return NOT_A_CHARACTER on an error.
18 | * Define the traits in-place since std::iterator is deprecated.
19 | * Removed overload of ostream char32_t characters.
20 | * Added a show-unicode tool to display codes from character.
21 | * Added function to fix UTF-8 strings by replacing invalid characters.
22 | * Allow for += of the '\0' character.
23 | * Added UTF-16 functions & tests.
24 | * Updated the tests accordingly and added more for better coverage.
25 | * Added missing #include .
26 | * Applied hack so tests compile under lunar.
27 | * Updated compat to the latest (v15)
28 | * Did some work on the UCD data (parse decomposition, read file properly...)
29 | * Removed boost-dev as a dependency.
30 |
31 | -- Alexis Wilke Sun, 05 Nov 2023 08:05:54 -0800
32 |
33 | libutf8 (1.0.14.0~bionic) bionic; urgency=high
34 |
35 | * Added operator+ for char32_t/string where string is viewed as UTF-8.
36 |
37 | -- Alexis Wilke Sun, 30 Oct 2022 21:24:12 -0700
38 |
39 | libutf8 (1.0.13.0~bionic) bionic; urgency=high
40 |
41 | * Added a verify_file_inheritance() in tools.
42 |
43 | -- Alexis Wilke Mon, 11 Jul 2022 07:42:16 -0700
44 |
45 | libutf8 (1.0.12.1~bionic) bionic; urgency=high
46 |
47 | * Updated the compat to v10.
48 |
49 | -- Alexis Wilke Thu, 19 May 2022 20:28:28 -0700
50 |
51 | libutf8 (1.0.12.0~bionic) bionic; urgency=high
52 |
53 | * Cleaned up the cmake file.
54 |
55 | -- Alexis Wilke Thu, 19 May 2022 18:09:49 -0700
56 |
57 | libutf8 (1.0.11.2~bionic) bionic; urgency=high
58 |
59 | * Bumped build version to rebuild on Launchpad.
60 |
61 | -- Alexis Wilke Fri, 04 Mar 2022 22:36:44 -0800
62 |
63 | libutf8 (1.0.11.1~bionic) bionic; urgency=high
64 |
65 | * Bumped build version to rebuild on Launchpad.
66 |
67 | -- Alexis Wilke Sun, 13 Feb 2022 12:35:15 -0800
68 |
69 | libutf8 (1.0.11.0~bionic) bionic; urgency=high
70 |
71 | * Added a clear() for the good flag in the utf8_iterator.
72 | * Fixed the string test, the exception now includes "libutf8_exception: ".
73 | * Correctly test the good flag status in cases where the iterator fails.
74 |
75 | -- Alexis Wilke Mon, 27 Sep 2021 18:08:13 -0700
76 |
77 | libutf8 (1.0.10.0~bionic) bionic; urgency=high
78 |
79 | * Updated the tests to match the new libexcept library setup.
80 |
81 | -- Alexis Wilke Sat, 28 Aug 2021 18:23:57 -0700
82 |
83 | libutf8 (1.0.9.0~bionic) bionic; urgency=high
84 |
85 | * Slowly adding Unicode to canonicalize UTF-8 strings.
86 | * Added SnapDev as a dependency to implement the Unicode parser.
87 | * Added a tool to run the parser (which is part of the library).
88 | * Updated the exception declarations with our macros.
89 | * Cleaned up licenses & copyrights.
90 |
91 | -- Alexis Wilke Tue, 24 Aug 2021 15:49:14 -0700
92 |
93 | libutf8 (1.0.8.1~bionic) bionic; urgency=high
94 |
95 | * Bumped build version to rebuild on Launchpad.
96 |
97 | -- Alexis Wilke Fri, 04 Jun 2021 18:28:59 -0700
98 |
99 | libutf8 (1.0.8.0~bionic) bionic; urgency=high
100 |
101 | * Fixed the name of a function in an exception message.
102 | * Updated the mk script.
103 |
104 | -- Alexis Wilke Tue, 01 Jun 2021 17:40:30 -0700
105 |
106 | libutf8 (1.0.7.2~bionic) bionic; urgency=high
107 |
108 | * Bumped version to recompile against the newer versions.
109 |
110 | -- Alexis Wilke Sat, 15 May 2021 09:33:12 -0700
111 |
112 | libutf8 (1.0.7.1~bionic) bionic; urgency=high
113 |
114 | * Bumped version to recompile against the newer version of snapcatch2.
115 |
116 | -- Alexis Wilke Fri, 08 Jan 2021 22:13:35 -0800
117 |
118 | libutf8 (1.0.7.0~bionic) bionic; urgency=high
119 |
120 | * Changed the EOF of the iterator to an EOS so it works as expected with
121 | the newest versions of catch2 (proper signedness for char32_t).
122 | * Fixed one assignment from L'0' to u'0'.
123 |
124 | -- Alexis Wilke Tue, 26 Apr 2020 18:25:27 -0800
125 |
126 | libutf8 (1.0.6.2~bionic) bionic; urgency=high
127 |
128 | * Create a bionic version.
129 |
130 | -- Alexis Wilke Thu, 30 Apr 2020 20:59:23 -0800
131 |
132 | libutf8 (1.0.6.0~xenial) xenial; urgency=high
133 |
134 | * Added the libutf8::case_insensitive_string type.
135 | * Fixed the mk so it generates an error on an unknown command line option.
136 | * Added a test so we can make sure that the case_insensitive_string works.
137 | * Fixed the existing test tag names, we have to have the square brackets.
138 | * Moved a couple of validation functions from the libsnapwebsites to here.
139 | * Broke up the tests in a character and a string so we can just validate a
140 | standalone character too.
141 | * Added another validation for UTF-32 strings and characters.
142 | * Allow for a specific test to be run with `mk -t `.
143 | * Allow for a nullptr when calling start_with_bom().
144 | * Added a new exception for unsupported features.
145 | * Added a function to check whether a character is a surrogate and which one.
146 | * Added a to_u8string() with std::wstring as input.
147 | * Added a to_u8string() with wchar_t as input.
148 | * Added a to_u8string() with char16_t as input.
149 |
150 | -- Alexis Wilke Wed, 17 Jul 2019 19:58:43 -0800
151 |
152 | libutf8 (1.0.5.1~xenial) xenial; urgency=high
153 |
154 | * Bumped version to force a rebuild, just in case.
155 |
156 | -- Alexis Wilke Wed, 17 Jul 2019 19:58:43 -0800
157 |
158 | libutf8 (1.0.5.0~xenial) xenial; urgency=high
159 |
160 | * Added a way to create an iterator at the end.
161 | * Added == and != with another utf8_iterator.
162 |
163 | -- Alexis Wilke Sat, 29 Jun 2019 05:05:11 -0800
164 |
165 | libutf8 (1.0.4.0~xenial) xenial; urgency=high
166 |
167 | * Added a PROJECT_BRIEF description.
168 | * Added in=C++ to the MAPPING_EXTENSION.
169 | * Updated the doxy file to 1.8.11.
170 |
171 | -- Alexis Wilke Tue, 11 Jun 2019 23:55:25 -0800
172 |
173 | libutf8 (1.0.3.0~xenial) xenial; urgency=high
174 |
175 | * Moved the catch2 implementation to our `snapcatch2.hpp` header instead.
176 | * Updated the tests accordingly.
177 | * Cleaned up various declarations in each file.
178 | * Moved our `obj_setenv()` to `snapdev`.
179 |
180 | -- Alexis Wilke Sat, 1 Jun 2019 00:24:36 -0800
181 |
182 | libutf8 (1.0.2.0~xenial) xenial; urgency=high
183 |
184 | * Got the test coverage back to 100%.
185 | * Renamed the tests without the "unittest_" introducer.
186 | * Added the `start_with_bom()` function and corresponding tests.
187 | * Fixed standalone characters, the introducer is U for char32_t characters.
188 |
189 | -- Alexis Wilke Tue, 28 May 2019 18:09:01 -0800
190 |
191 | libutf8 (1.0.1.0~xenial) xenial; urgency=high
192 |
193 | * Implemented the to and from UTF-8 and UTF-16 encoding.
194 | * Fixed the u8casecmp() test function which would test 0xD800 to 0xDFFF
195 | as valid characters.
196 | * Added a new exception so we can distinguish whether an encoding or a
197 | decoding went wrong.
198 | * Optimized the UTF-32 to UTF-8 conversion, i.e. code bytes under 0x80 get
199 | copied as is.
200 | * Fixed the '\0' conversion, it would not get added to the output string.
201 | * Added a to_u8string() from a char32_t so we get an std::string as output.
202 | * Generate errors when the mbstowc() or wctombs() functions fail.
203 |
204 | -- Alexis Wilke Tue, 28 May 2019 01:04:30 -0800
205 |
206 | libutf8 (1.0.0.3~xenial) xenial; urgency=high
207 |
208 | * Added the cmake folder and files.
209 | * Added the README.md and TODO.txt files to the debian/docs.
210 | * Removed the "debian/tmp/..." from the `debian/libutf8-doc.install`.
211 | * Added a `-i` command line option to mk to install the library.
212 | * Added a call to prevent collection of stack trace in our tests.
213 |
214 | -- Alexis Wilke Sat, 25 May 2019 20:54:23 -0800
215 |
216 | libutf8 (1.0.0.2~xenial) xenial; urgency=high
217 |
218 | * Try fixing dependencies, the version may need to include ~xenial.
219 | * Added boost-dev as a dependency as we use it in our tests.
220 |
221 | -- Alexis Wilke Sat, 25 May 2019 20:54:23 -0800
222 |
223 | libutf8 (1.0.0.1~xenial) xenial; urgency=high
224 |
225 | * Enhanced the README.md
226 | * Bumped snapcatch2 dependency version to 2.7.2.10.
227 |
228 | -- Alexis Wilke Mon, 20 May 2019 01:23:11 -0800
229 |
230 | libutf8 (1.0.0.0~xenial) xenial; urgency=high
231 |
232 | * Added my wpkg libutf8 library as a Snap! C++ project.
233 |
234 | -- Alexis Wilke Mon, 20 May 2019 01:23:11 -0800
235 |
--------------------------------------------------------------------------------
/debian/compat:
--------------------------------------------------------------------------------
1 | 15
2 |
--------------------------------------------------------------------------------
/debian/control:
--------------------------------------------------------------------------------
1 | Source: libutf8
2 | Priority: extra
3 | Maintainer: R. Douglas Barbieri
4 | Build-Depends: cmake,
5 | debhelper,
6 | doxygen,
7 | graphviz,
8 | libexcept-dev (>= 1.1.0.0~jammy),
9 | snapcatch2 (>= 2.7.2.10~jammy),
10 | snapcmakemodules (>= 1.0.35.3~jammy),
11 | snapdev (>= 1.1.16.0~jammy)
12 | Standards-Version: 3.9.4
13 | Section: libs
14 | Homepage: https://snapwebsites.org/
15 | Vcs-Git: https://github.com/m2osw/snapcpp.git
16 | Vcs-Browser: https://github.com/m2osw/libutf8
17 |
18 | Package: libutf8-dev
19 | Section: libdevel
20 | Architecture: any
21 | Depends: libutf8 (= ${binary:Version}), ${misc:Depends}
22 | Description: Development package for the C++ libutf8 library.
23 | This library provides functions to convert between UTF-8 and UTF-32 characters.
24 |
25 | Package: libutf8-doc
26 | Section: doc
27 | Architecture: all
28 | Depends: ${misc:Depends}
29 | Description: Documentation for the C++ libutf8 library.
30 | This library provides functions to convert between UTF-8 and UTF-32 characters.
31 |
32 | Package: libutf8
33 | Section: libs
34 | Architecture: any
35 | Depends: ${shlibs:Depends}, ${misc:Depends}
36 | Description: C++ library for UTF-8/UTF-32 handling.
37 | This library provides functions to convert between UTF-8 and UTF-32 characters.
38 |
39 | # vim: ts=4 sw=4 et
40 |
--------------------------------------------------------------------------------
/debian/copyright:
--------------------------------------------------------------------------------
1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
2 | Upstream-Name: libutf8
3 | Source: https://github.com/m2osw/libutf8
4 |
5 | Files: *
6 | Copyright: 2006-2019 Made to Order Software
7 | 2006-2019 Alexis Wilke
8 | 2006-2019 R. Douglas Barbieri
9 | License: GPL-2+
10 | This package is free software; you can redistribute it and/or modify
11 | it under the terms of the GNU General Public License as published by
12 | the Free Software Foundation; either version 2 of the License, or
13 | (at your option) any later version.
14 | .
15 | This package is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | GNU General Public License for more details.
19 | .
20 | You should have received a copy of the GNU General Public License
21 | along with this program. If not, see <https://www.gnu.org/licenses/>
22 | .
23 | On Debian systems, the complete text of the GNU General
24 | Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
25 |
26 | Files: conf/unicode/*
27 | Copyright: 1991-2021 Unicode, Inc. All rights reserved.
28 | License: Unicode
29 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
30 | .
31 | See Terms of Use for definitions of Unicode Inc.'s
32 | Data Files and Software.
33 | .
34 | NOTICE TO USER: Carefully read the following legal agreement.
35 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
36 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
37 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
38 | TERMS AND CONDITIONS OF THIS AGREEMENT.
39 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
40 | THE DATA FILES OR SOFTWARE.
41 | .
42 | COPYRIGHT AND PERMISSION NOTICE
43 | .
44 | Copyright (c) 1991-2021 Unicode, Inc. All rights reserved.
45 | Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
46 | .
47 | Permission is hereby granted, free of charge, to any person obtaining
48 | a copy of the Unicode data files and any associated documentation
49 | (the "Data Files") or Unicode software and any associated documentation
50 | (the "Software") to deal in the Data Files or Software
51 | without restriction, including without limitation the rights to use,
52 | copy, modify, merge, publish, distribute, and/or sell copies of
53 | the Data Files or Software, and to permit persons to whom the Data Files
54 | or Software are furnished to do so, provided that either
55 | (a) this copyright and permission notice appear with all copies
56 | of the Data Files or Software, or
57 | (b) this copyright and permission notice appear in associated
58 | Documentation.
59 | .
60 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
61 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
62 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
63 | NONINFRINGEMENT OF THIRD PARTY RIGHTS.
64 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
65 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
66 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
67 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
68 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
69 | PERFORMANCE OF THE DATA FILES OR SOFTWARE.
70 | .
71 | Except as contained in this notice, the name of a copyright holder
72 | shall not be used in advertising or otherwise to promote the sale,
73 | use or other dealings in these Data Files or Software without prior
74 | written authorization of the copyright holder.
75 | .
76 | See also: https://www.unicode.org/license.html
77 |
78 | # Please also look if there are files or directories which have a
79 | # different copyright/license attached and list them here.
80 | # Please avoid picking license terms that are more restrictive than the
81 | # packaged work, as it may make Debian's contributions unacceptable upstream.
82 |
--------------------------------------------------------------------------------
/debian/docs:
--------------------------------------------------------------------------------
1 | LICENSE.txt
2 | README.md
3 | TODO.txt
4 |
--------------------------------------------------------------------------------
/debian/libutf8-dev.install:
--------------------------------------------------------------------------------
1 | usr/include/*
2 | usr/lib/lib*.so
3 | usr/share/cmake/*
4 |
--------------------------------------------------------------------------------
/debian/libutf8-doc.install:
--------------------------------------------------------------------------------
1 | usr/share/doc/libutf8/html/* usr/share/doc/libutf8-doc/html/
2 |
--------------------------------------------------------------------------------
/debian/libutf8.install:
--------------------------------------------------------------------------------
1 | usr/bin
2 | usr/lib/lib*.so.*
3 | usr/share/libutf8
4 |
--------------------------------------------------------------------------------
/debian/rules:
--------------------------------------------------------------------------------
1 | #!/usr/bin/make -f
2 | # -*- makefile -*-
3 | # Sample debian/rules that uses debhelper.
4 | # This file was originally written by Joey Hess and Craig Small.
5 | # As a special exception, when this file is copied by dh-make into a
6 | # dh-make output file, you may use that output file without restriction.
7 | # This special exception was added by Craig Small in version 0.37 of dh-make.
8 |
9 | # Uncomment this to turn on verbose mode.
10 | #export DH_VERBOSE=1
11 |
12 | %:
13 | dh $@ --parallel
14 |
15 | override_dh_auto_configure:
16 | dh_auto_configure -- -DCMAKE_BUILD_TYPE=Release
17 |
18 |
--------------------------------------------------------------------------------
/debian/source/options:
--------------------------------------------------------------------------------
1 | tar-ignore = "tmp"
2 | tar-ignore = ".git"
3 |
--------------------------------------------------------------------------------
/doc/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2006-2023 Made to Order Software Corp. All Rights Reserved
2 | #
3 | # https://snapwebsites.org/project/libutf8
4 | # contact@m2osw.com
5 | #
6 | # This program is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 |
21 | ##
22 | ## Documentation
23 | ##
24 | find_package(SnapDoxygen)
25 | AddDoxygenTarget(libutf8
26 | ${LIBUTF8_VERSION_MAJOR}
27 | ${LIBUTF8_VERSION_MINOR}
28 | ${LIBUTF8_VERSION_PATCH}
29 | )
30 |
31 | # vim: ts=4 sw=4 et
32 |
--------------------------------------------------------------------------------
/doc/footer.html:
--------------------------------------------------------------------------------
1 |
5 |
--------------------------------------------------------------------------------
/doc/libutf8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/doc/libutf8.png
--------------------------------------------------------------------------------
/libutf8/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | #
3 | # https://snapwebsites.org/project/libutf8
4 | # contact@m2osw.com
5 | #
6 | # This program is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | ##
21 | ## utf8 library
22 | ##
23 | project(utf8) # PROJECT_NAME ("utf8") is used as the library target name below
24 |
25 | # Generate version.h in the build directory from the version.h.in template
26 | configure_file(
27 | ${CMAKE_CURRENT_SOURCE_DIR}/version.h.in
28 | ${CMAKE_CURRENT_BINARY_DIR}/version.h
29 | )
30 |
31 | add_library(${PROJECT_NAME} SHARED # the library is only built as a shared object
32 | base.cpp
33 | iterator.cpp
34 | json_tokens.cpp
35 | libutf8.cpp
36 | unicode_data.cpp
37 | unicode_data_file.cpp
38 | version.cpp
39 | )
40 |
41 | target_include_directories(${PROJECT_NAME} # NOTE(review): the *_INCLUDE_DIRS variables are presumably set by find_package() calls in the top-level CMakeLists.txt
42 | PUBLIC
43 | ${LIBEXCEPT_INCLUDE_DIRS}
44 | ${SNAPDEV_INCLUDE_DIRS}
45 | )
46 |
47 | target_link_libraries(${PROJECT_NAME} # no PUBLIC/PRIVATE keyword is given here
48 | ${LIBEXCEPT_LIBRARIES}
49 | )
50 |
51 | set_target_properties(${PROJECT_NAME} PROPERTIES # file version is MAJOR.MINOR, the soname only carries MAJOR
52 | VERSION
53 | ${LIBUTF8_VERSION_MAJOR}.${LIBUTF8_VERSION_MINOR}
54 |
55 | SOVERSION
56 | ${LIBUTF8_VERSION_MAJOR}
57 | )
58 |
59 | install( # install the library itself
60 | TARGETS
61 | ${PROJECT_NAME}
62 |
63 | RUNTIME DESTINATION
64 | bin
65 |
66 | LIBRARY DESTINATION
67 | lib
68 |
69 | ARCHIVE DESTINATION
70 | lib
71 | )
72 |
73 | install( # install the public headers, including the generated version.h
74 | FILES
75 | base.h
76 | caseinsensitivestring.h
77 | exception.h
78 | iterator.h
79 | json_tokens.h
80 | libutf8.h
81 | unicode_data.h
82 | ${CMAKE_CURRENT_BINARY_DIR}/version.h
83 |
84 | DESTINATION
85 | include/libutf8
86 | )
87 |
88 |
89 | # vim: ts=4 sw=4 et
90 |
--------------------------------------------------------------------------------
/libutf8/base.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | /** \file
21 | * \brief Implementation of the UTF-8 functions.
22 | *
23 | * This file is the implementation of the UTF-8 functions of the libutf8
24 | * library. It simply is a set of functions to convert between different
25 | * character sets in a lossless manner. At this point it supports UTF-8,
26 | * UCS-4, and UTF-16 formats.
27 | *
28 | * Contrary to many of the system functions, these functions do not take
29 | * anything from the system in account (the locale can be anything, it does
30 | * not change the exact behavior of these functions.)
31 | *
32 | * Although similar functionality is found on Unices and MS-Windows, it was
33 | * simpler to just implement these few functions than to try to have a
34 | * converter that is sure not to use a locale and this way we can use
35 | * standard strings (std::string and std::wstring) instead of having to
36 | * call C functions.
37 | */
38 |
39 | // self
40 | //
41 | #include "libutf8/base.h"
42 |
43 | #include "libutf8/exception.h"
44 |
45 |
46 | // C++
47 | //
48 | #include
49 | #include
50 |
51 |
52 | // last include
53 | //
54 | #include
55 |
56 |
57 |
58 | /** \brief Name space of the UTF-8 library.
59 | *
60 | * The libutf8 library is used to seamlessly handle UTF-8 strings. It also
61 | * is used to convert between UTF-8, UTF-16, and UTF-32 strings.
62 | *
63 | * \todo
64 | * Implement the UTF-16 functions.
65 | */
66 | namespace libutf8
67 | {
68 |
69 |
70 | /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH
71 | * \brief Minimum buffer length to support any UTF-8 characters.
72 | *
73 | * When converting a UTF-32 character to UTF-8, it makes use of an output
74 | * buffer. The size of that output buffer should be at least
75 | * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character.
76 | *
77 | * Note that the size includes space for a null terminator (`'\0'`).
78 | *
79 | * The size of your buffer can be smaller as long as the UTF-32 character
80 | * fits into it, the wctombs() function will not fail.
81 | */
82 |
83 |
84 | /** \brief Compute the UTF-8 encoded representation of wc.
85 | *
86 | * This function transforms the UTF-32 character \p wc in a
87 | * UTF-8 encoded series of bytes (called a multi-byte encoded
88 | * character.) The resulting string is null (`'\0'`) terminated.
89 | *
90 | * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes.
91 | * If less space is required, the function does not report a problem,
92 | * though. This allows to get the total size of a conversion and then
93 | * do the full conversion to that one buffer without the need to
94 | * add unnecessary bytes at the end of your destination buffer.
95 | *
96 | * \code
97 | * ...
98 | * char mb[MBS_MIN_BUFFER_LENGTH];
99 | *
100 | * wctombs(mb, big_char, sizeof(mb));
101 | * ...
102 | * \endcode
103 | *
104 | * The function does not encode invalid characters. When such is
105 | * passed to the function, the \p mb string is turned into an empty null
106 | * terminated string and the function returns -1. We avoid an
107 | * exception here because that way you can quickly check whether
108 | * a string of `char32_t` characters is valid or not.
109 | *
110 | * \note
111 | * Unicode defines valid characters only between zero (0) and 0x10FFFF.
112 | * Therefore this function encodes the character using 1 to 4 bytes plus
113 | * one for the null terminator.
114 | *
115 | * \warning
116 | * The function does not raise an error if the input \p wc character
117 | * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.)
118 | * Instead it returns -1 and sets the \p mb string to the empty string.
119 | *
120 | * \exception libutf8_logic_exception
121 | * The function raises this exception if the destination buffer is too
122 | * small for the conversion. Don't forget that we add a null terminator
123 | * so if the character needs 3 UTF-8 bytes, we will check for a buffer
124 | * of at least 4 bytes to consider it valid.
125 | *
126 | * \param[out] mb The output buffer, it will always be null terminated.
127 | * \param[in] wc The wide character to convert.
128 | * \param[in] len The length of \p mb.
129 | *
130 | * \return The number of bytes in mb, not including the null terminator, or -1 if \p wc is not a valid character.
131 | */
132 | int wctombs(char * mb, char32_t wc, std::size_t len)
133 | {
134 |     auto verify_length = [&len](std::size_t required_len) // required_len includes the '\0' terminator
135 |     {
136 |         if(len < required_len)
137 |         {
138 |             throw libutf8_logic_exception("wctombs() called with an output buffer which is too small.");
139 |         }
140 |     };
141 |
142 |     if(wc < 0x80)
143 |     {
144 |         verify_length(2);
145 |
146 |         /* 1 byte (0xxxxxxx); this will also encode '\0'... */
147 |         mb[0] = static_cast<char>(wc);
148 |         mb[1] = '\0';
149 |         return 1;
150 |     }
151 |     if(wc < 0x800)
152 |     {
153 |         verify_length(3);
154 |
155 |         mb[0] = static_cast<char>((wc >> 6) | 0xC0);    // 110xxxxx
156 |         mb[1] = (wc & 0x3F) | 0x80;                      // 10xxxxxx
157 |         mb[2] = '\0';
158 |         return 2;
159 |     }
160 |
161 |     // avoid encoding the UTF-16 surrogates because those code points do not
162 |     // represent characters; they fall through to the "invalid" case below
163 |     //
164 |     if(wc < 0xD800 || wc > 0xDFFF)
165 |     {
166 |         if(wc < 0x10000)
167 |         {
168 |             verify_length(4);
169 |
170 |             mb[0] = static_cast<char>((wc >> 12) | 0xE0);    // 1110xxxx
171 |             mb[1] = ((wc >> 6) & 0x3F) | 0x80;
172 |             mb[2] = (wc & 0x3F) | 0x80;
173 |             mb[3] = '\0';
174 |             return 3;
175 |         }
176 |         if(wc < 0x110000)
177 |         {
178 |             verify_length(5);
179 |
180 |             mb[0] = static_cast<char>((wc >> 18) | 0xF0);    // 11110xxx
181 |             mb[1] = ((wc >> 12) & 0x3F) | 0x80;
182 |             mb[2] = ((wc >> 6) & 0x3F) | 0x80;
183 |             mb[3] = (wc & 0x3F) | 0x80;
184 |             mb[4] = '\0';
185 |             return 4;
186 |         }
187 |     }
188 |
189 |     verify_length(1);    // room for the terminator is still required
190 |
191 |     /* an invalid wide character: return an empty string and -1 */
192 |     mb[0] = '\0';
193 |     return -1;
194 | }
195 |
196 |
197 | /** \brief Convert one multi-byte character to a wide character.
198 | *
199 | * This function converts UTF-8 bytes from \p mb to one UTF-32
200 | * wide character and saves the result in \p wc. The function
201 | * automatically increases the pointer in \p mb and simultaneously
202 | * decreases the \p len parameter.
203 | *
204 | * \p wc holds the resulting wide character, a character between
205 | * `'\0'` (NUL) and `0x10FFFF` and it returns the number of bytes
206 | * that were used from \p mb. If a bad character is encountered,
207 | * then the function returns -1 and the bad sequence of bytes is
208 | * skipped so only one error will be reported for one bad sequence.
209 | *
210 | * Bad characters when converting UTF-8 to wide characters are:
211 | *
212 | * \li The stream includes bytes 0x80 to 0xBF without an introducer.
213 | * \li The stream does not include the right number of 0x80 to 0xBF
214 | * bytes after an introducer.
215 | * \li The input ends too early and cannot accommodate the last
216 | * encoded character.
217 | * \li The codes 0xF8 to 0xFF were found in the input string.
218 | * \li The resulting \p wc value would be larger than 0x10FFFF.
219 | * \li The resulting \p wc value represents a UTF-16 surrogate
220 | * value (a number between 0xD800 and 0xDFFF).
221 | *
222 | * Code points between 0xD800 and 0xDFFF are not valid characters.
223 | * These represent low and high surrogates in UTF-16 (2 are
224 | * necessary to encode one character of 17 or more bits.)
225 | *
226 | * The function returns 0 and sets \p wc to the NUL character (`U'\0'`)
227 | * if the \p len parameter is zero (i.e. empty string.)
228 | *
229 | * \note
230 | * The function converts a NUL character (`'\0'`) in the
231 | * input string as a NUL wide character (`U'\0'`) and returns 1. It
232 | * does not see the NUL character as the end of the string.
233 | *
234 | * \warning
235 | * The function does not throw on invalid input. It is the responsibility
236 | * of the caller to do so if necessary. This is useful to verify a UTF-8
237 | * string without having to catch an exception.
238 | *
239 | * \param[out] wc The output wide character variable.
240 | * \param[in,out] mb The multi-byte input string pointer, returned at the
241 | * following byte.
242 | * \param[in,out] len The number of characters left in mb.
243 | *
244 | * \return The number of bytes read or -1 if invalid bytes were found.
245 | */
246 | int mbstowc(char32_t & wc, char const * & mb, std::size_t & len)
247 | {
248 |     auto skip = [](char const * & skip_mb, std::size_t & skip_len) // stop on the first byte that may start a character
249 |     {
250 |         for(unsigned char b(0)
251 |             ; skip_len > 0 && (b = *skip_mb, (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
252 |             ; ++skip_mb , --skip_len);
253 |     };
254 |
255 |     // already done?
256 |     //
257 |     if(len == 0)
258 |     {
259 |         wc = U'\0';
260 |         return 0;
261 |     }
262 |
263 |     // we eat one character from the source minimum
264 |     //
265 |     unsigned char c(*mb++);
266 |     --len;
267 |
268 |     if(c < 0x80)
269 |     {
270 |         wc = c;
271 |         return 1;
272 |     }
273 |
274 |     // by default return an invalid character; every "return -1" below
275 |     // leaves wc set to this value
276 |     wc = NOT_A_CHARACTER;
277 |
278 |     // invalid stream? (continuation byte without introducer or byte >= 0xF5)
279 |     //
280 |     if((c >= 0x80 && c <= 0xBF) || c >= 0xF5)
281 |     {
282 |         // this is bad UTF-8, skip all the invalid bytes
283 |         //
284 |         skip(mb, len);
285 |         return -1;
286 |     }
287 |
288 |     char32_t w(U'\0');
289 |     std::size_t cnt(0);     // number of continuation bytes expected
290 |
291 |     if(c >= 0xF0)
292 |     {
293 |         w = c & 0x07;
294 |         cnt = 3;
295 |     }
296 |     else if(c >= 0xE0)
297 |     {
298 |         w = c & 0x0F;
299 |         cnt = 2;
300 |     }
301 |     else /*if(c >= 0xC0)*/    // always true so we don't have to check
302 |     {
303 |         w = c & 0x1F;
304 |         cnt = 1;
305 |     }
306 |
307 |     // enough data in the input? if not, that's an error
308 |     //
309 |     if(len < cnt)
310 |     {
311 |         skip(mb, len);
312 |         return -1;
313 |     }
314 |     len -= cnt;
315 |
316 |     for(std::size_t l(cnt); l > 0; --l, mb++)
317 |     {
318 |         c = *mb;
319 |         if(c < 0x80 || c > 0xBF)
320 |         {
321 |             // we got an invalid sequence!
322 |             // restore whatever is left in len (mb points at the offending byte)
323 |             //
324 |             len += l;
325 |             return -1;
326 |         }
327 |         w = (w << 6) | (c & 0x3F);
328 |     }
329 |
330 |     if(w >= 0x110000
331 |     || (w >= 0x00D800 && w <= 0x00DFFF))
332 |     {
333 |         // character out of range or UTF-16 surrogate
334 |         // out of range can happen with sequences starting with 0xF4
335 |         //
336 |         return -1;
337 |     }
338 |
339 |     wc = w;
340 |
341 |     return static_cast<int>(cnt + 1);
342 | }
343 |
344 |
345 | /** \brief An overload with a non-const string.
346 | *
347 | * Since we are passing a reference to the \p mb string, whether it is
348 | * const or non-const matters to the call. So here we offer a non-const
349 | * version even though the string doesn't get modified.
350 | *
351 | * \param[out] wc The output wide character variable.
352 | * \param[in,out] mb The multi-byte input string pointer, returned at the
353 | * following byte.
354 | * \param[in,out] len The number of characters left in mb.
355 | *
356 | * \return The number of bytes read or -1 if invalid bytes were found.
357 | */
358 | int mbstowc(char32_t & wc, char * & mb, std::size_t & len)
359 | {
360 |     return mbstowc(wc, const_cast<char const * &>(mb), len); // same pointer object viewed as char const *, so the caller's mb is still advanced
361 | }
362 |
363 |
364 |
365 | } // libutf8 namespace
366 | // vim: ts=4 sw=4 et
367 |
--------------------------------------------------------------------------------
/libutf8/base.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 | /** \file
22 | * \brief The declarations of the UTF-8 library base functions.
23 | *
24 | * The functions defined in this file are used to do the actual conversions.
25 | *
26 | * They may be useful to you which is why we make them available here.
27 | * However, these are considered low level functions and you may want
28 | * to restrain using them. Using the `std::string`-base functions is
29 | * much safer and what is expected of you.
30 | */
31 |
32 | // C++
33 | //
34 | #include
35 |
36 |
37 | namespace libutf8
38 | {
39 |
40 |
41 |
42 | constexpr std::size_t       MBS_MIN_BUFFER_LENGTH = 5;                          // 4 UTF-8 bytes + '\0'
43 | constexpr char32_t const    BOM_CHAR = U'\U0000FEFF';
44 | constexpr char32_t const    NOT_A_CHARACTER = static_cast<char32_t>(-2);        // value stored in wc by mbstowc() on invalid input
45 |
46 | int         wctombs(char * mb, char32_t wc, std::size_t len);
47 | int         mbstowc(char32_t & wc, char const * & mb, std::size_t & len);
48 | int         mbstowc(char32_t & wc, char * & mb, std::size_t & len);
49 |
50 |
51 |
52 | } // libutf8 namespace
53 | // vim: ts=4 sw=4 et
54 |
--------------------------------------------------------------------------------
/libutf8/caseinsensitivestring.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 |
22 | // self
23 | //
24 | #include
25 |
26 |
27 | // C++
28 | //
29 | #include
30 |
31 |
32 |
33 | namespace libutf8
34 | {
35 |
36 |
37 |
38 | /** \brief Case insensitive string.
39 | *
40 | * This class is an overload of the string template which allows you to
41 | * create case insensitive strings as far as the comparison operators
42 | * are concerned. All the other functions still work the same way.
43 | *
44 | * This is particularly useful if you manage an std::map<> with a string as
45 | * the key, string which should not be case sensitive.
46 | *
47 | * The comparisons are done using the libutf8::u8casecmp() function.
48 | *
49 | * \sa u8casecmp()
50 | */
51 | template<
52 |       class _CharT,
53 |       class _Traits = std::char_traits<_CharT>,
54 |       class _Alloc = std::allocator<_CharT>
55 |     >
56 | class case_insensitive_basic_string
57 |     : public std::basic_string<_CharT, _Traits, _Alloc>
58 | {
59 | public:
60 |     typedef typename std::basic_string<_CharT, _Traits, _Alloc>::size_type size_type;
61 |
62 |     case_insensitive_basic_string() noexcept(std::is_nothrow_default_constructible<_Alloc>::value)
63 |         : std::basic_string<_CharT, _Traits, _Alloc>()
64 |     {
65 |     }
66 |
67 |     explicit case_insensitive_basic_string(_Alloc const & __a)
68 |         : std::basic_string<_CharT, _Traits, _Alloc>(__a)
69 |     {
70 |     }
71 |
72 |     case_insensitive_basic_string(size_type __n, _CharT __c, _Alloc const & __a = _Alloc())
73 |         : std::basic_string<_CharT, _Traits, _Alloc>(__n, __c, __a)
74 |     {
75 |     }
76 |
77 |     // the following are for C++17 and over
78 |     // (and then the next two constructors will not set __n)
79 |     //
80 |     //case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> const & __str, size_type __pos, _Alloc const & __a = _Alloc())
81 |     //    : std::basic_string<_CharT, _Traits, _Alloc>(__str, __pos, __a)
82 |     //{
83 |     //}
84 |     //
85 |     //case_insensitive_basic_string(case_insensitive_basic_string const & __str, size_type __pos, _Alloc const & __a = _Alloc())
86 |     //    : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> const &>(__str), __pos, __a)
87 |     //{
88 |     //}
89 |
90 |     case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> const & __str, size_type __pos, size_type __n = std::basic_string<_CharT, _Traits, _Alloc>::npos, _Alloc const & __a = _Alloc())
91 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str, __pos, __n, __a)
92 |     {
93 |     }
94 |
95 |     case_insensitive_basic_string(case_insensitive_basic_string const & __str, size_type __pos, size_type __n = std::basic_string<_CharT, _Traits, _Alloc>::npos, _Alloc const & __a = _Alloc())
96 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str, __pos, __n, __a)
97 |     {
98 |     }
99 |
100 |     case_insensitive_basic_string(_CharT const * __d, size_type __n, _Alloc const & __a = _Alloc())
101 |         : std::basic_string<_CharT, _Traits, _Alloc>(__d, __n, __a)
102 |     {
103 |     }
104 |
105 |     case_insensitive_basic_string(_CharT const * __d, _Alloc const & __a = _Alloc())
106 |         : std::basic_string<_CharT, _Traits, _Alloc>(__d, __a)
107 |     {
108 |     }
109 |
110 |     template<class _InputIterator>
111 |     case_insensitive_basic_string(_InputIterator __beg, _InputIterator __end, _Alloc const & __a = _Alloc())
112 |         : std::basic_string<_CharT, _Traits, _Alloc>(__beg, __end, __a)
113 |     {
114 |     }
115 |
116 |     case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> const & __str)
117 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str)
118 |     {
119 |     }
120 |
121 |     case_insensitive_basic_string(case_insensitive_basic_string const & __str)
122 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str)
123 |     {
124 |     }
125 |     // a named rvalue reference is an lvalue: the four constructors below cast __str back to an rvalue so the base class moves instead of copying (a throwing copy under noexcept would terminate)
126 |     case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> && __str) noexcept
127 |         : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> &&>(__str))
128 |     {
129 |     }
130 |
131 |     case_insensitive_basic_string(case_insensitive_basic_string && __str) noexcept
132 |         : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> &&>(__str))
133 |     {
134 |     }
135 |
136 |     case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> && __str, _Alloc const & __a)
137 |         : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> &&>(__str), __a)
138 |     {
139 |     }
140 |
141 |     case_insensitive_basic_string(case_insensitive_basic_string && __str, _Alloc const & __a)
142 |         : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> &&>(__str), __a)
143 |     {
144 |     }
145 |
146 |     case_insensitive_basic_string(std::initializer_list<_CharT> __l, _Alloc const & __a = _Alloc())
147 |         : std::basic_string<_CharT, _Traits, _Alloc>(__l, __a)
148 |     {
149 |     }
150 |
151 |     // all the comparison operators below defer to libutf8::u8casecmp() and test its result against 0
152 |     friend bool operator == (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
153 |     {
154 |         return libutf8::u8casecmp(lhs, rhs) == 0;
155 |     }
156 |
157 |     friend bool operator == (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
158 |     {
159 |         return libutf8::u8casecmp(lhs, rhs) == 0;
160 |     }
161 |
162 |     friend bool operator == (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
163 |     {
164 |         return libutf8::u8casecmp(lhs, rhs) == 0;
165 |     }
166 |
167 |     friend bool operator == (case_insensitive_basic_string const & lhs, _CharT const * rhs)
168 |     {
169 |         return libutf8::u8casecmp(lhs, rhs) == 0;
170 |     }
171 |
172 |     friend bool operator == (_CharT const * lhs, case_insensitive_basic_string const & rhs)
173 |     {
174 |         return libutf8::u8casecmp(lhs, rhs) == 0;
175 |     }
176 |
177 |     friend bool operator != (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
178 |     {
179 |         return libutf8::u8casecmp(lhs, rhs) != 0;
180 |     }
181 |
182 |     friend bool operator != (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
183 |     {
184 |         return libutf8::u8casecmp(lhs, rhs) != 0;
185 |     }
186 |
187 |     friend bool operator != (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
188 |     {
189 |         return libutf8::u8casecmp(lhs, rhs) != 0;
190 |     }
191 |
192 |     friend bool operator != (case_insensitive_basic_string const & lhs, _CharT const * rhs)
193 |     {
194 |         return libutf8::u8casecmp(lhs, rhs) != 0;
195 |     }
196 |
197 |     friend bool operator != (_CharT const * lhs, case_insensitive_basic_string const & rhs)
198 |     {
199 |         return libutf8::u8casecmp(lhs, rhs) != 0;
200 |     }
201 |
202 |     friend bool operator < (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
203 |     {
204 |         return libutf8::u8casecmp(lhs, rhs) < 0;
205 |     }
206 |
207 |     friend bool operator < (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
208 |     {
209 |         return libutf8::u8casecmp(lhs, rhs) < 0;
210 |     }
211 |
212 |     friend bool operator < (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
213 |     {
214 |         return libutf8::u8casecmp(lhs, rhs) < 0;
215 |     }
216 |
217 |     friend bool operator < (case_insensitive_basic_string const & lhs, _CharT const * rhs)
218 |     {
219 |         return libutf8::u8casecmp(lhs, rhs) < 0;
220 |     }
221 |
222 |     friend bool operator < (_CharT const * lhs, case_insensitive_basic_string const & rhs)
223 |     {
224 |         return libutf8::u8casecmp(lhs, rhs) < 0;
225 |     }
226 |
227 |     friend bool operator <= (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
228 |     {
229 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
230 |     }
231 |
232 |     friend bool operator <= (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
233 |     {
234 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
235 |     }
236 |
237 |     friend bool operator <= (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
238 |     {
239 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
240 |     }
241 |
242 |     friend bool operator <= (case_insensitive_basic_string const & lhs, _CharT const * rhs)
243 |     {
244 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
245 |     }
246 |
247 |     friend bool operator <= (_CharT const * lhs, case_insensitive_basic_string const & rhs)
248 |     {
249 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
250 |     }
251 |
252 |     friend bool operator > (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
253 |     {
254 |         return libutf8::u8casecmp(lhs, rhs) > 0;
255 |     }
256 |
257 |     friend bool operator > (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
258 |     {
259 |         return libutf8::u8casecmp(lhs, rhs) > 0;
260 |     }
261 |
262 |     friend bool operator > (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
263 |     {
264 |         return libutf8::u8casecmp(lhs, rhs) > 0;
265 |     }
266 |
267 |     friend bool operator > (case_insensitive_basic_string const & lhs, _CharT const * rhs)
268 |     {
269 |         return libutf8::u8casecmp(lhs, rhs) > 0;
270 |     }
271 |
272 |     friend bool operator > (_CharT const * lhs, case_insensitive_basic_string const & rhs)
273 |     {
274 |         return libutf8::u8casecmp(lhs, rhs) > 0;
275 |     }
276 |
277 |     friend bool operator >= (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
278 |     {
279 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
280 |     }
281 |
282 |     friend bool operator >= (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
283 |     {
284 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
285 |     }
286 |
287 |     friend bool operator >= (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
288 |     {
289 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
290 |     }
291 |
292 |     friend bool operator >= (case_insensitive_basic_string const & lhs, _CharT const * rhs)
293 |     {
294 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
295 |     }
296 |
297 |     friend bool operator >= (_CharT const * lhs, case_insensitive_basic_string const & rhs)
298 |     {
299 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
300 |     }
301 | };
302 |
303 |
304 | typedef case_insensitive_basic_string<char> case_insensitive_string;
305 |
306 | // TODO add support for other types
307 | //typedef case_insensitive_basic_string<wchar_t> case_insensitive_wstring;
308 | //typedef case_insensitive_basic_string<char16_t> case_insensitive_u16string;
309 | //typedef case_insensitive_basic_string<char32_t> case_insensitive_u32string;
310 |
311 |
312 | }
313 | // libutf8 namespace
314 | // vim: ts=4 sw=4 et
315 |
--------------------------------------------------------------------------------
/libutf8/exception.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 | /** \file
22 | * \brief The declarations of the UTF-8 library.
23 | *
24 | * This file is the declarations of the UTF-8 library which are just a few
25 | * functions used to convert a string from one format to another.
26 | */
27 |
28 | // libexcept
29 | //
30 | #include
31 |
32 |
33 |
34 | namespace libutf8
35 | {
36 |
37 |
38 |
39 | DECLARE_LOGIC_ERROR(libutf8_logic_exception); // thrown by wctombs() when the output buffer is too small
40 |
41 | DECLARE_MAIN_EXCEPTION(libutf8_exception); // passed as the first argument of every DECLARE_EXCEPTION() below
42 |
43 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_decoding); // NOTE(review): presumably (base, derived) -- confirm against the libexcept macros
44 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_encoding);
45 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_invalid_parameter);
46 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_io);
47 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_missing);
48 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_overflow);
49 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_twice);
50 | DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_unsupported);
51 |
52 |
53 |
54 | } // libutf8 namespace
55 | // vim: ts=4 sw=4 et
56 |
--------------------------------------------------------------------------------
/libutf8/iterator.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | /** \file
21 | * \brief Implementation of the UTF-8 functions.
22 | *
23 | * This file is the implementation of the UTF-8 functions of the libutf8
24 | * library. It simply is a set of functions to convert between different
25 | * character sets in a lossless manner. At this point it supports UTF-8,
26 | * UCS-4, and UTF-16 formats.
27 | *
28 | * Contrary to many of the system functions, these functions do not take
29 | * anything from the system in account (the locale can be anything, it does
30 | * not change the exact behavior of these functions).
31 | *
32 | * Although similar functionality is found on Unices and MS-Windows, it was
33 | * simpler to just implement these few functions than to try to have a
34 | * converter that is sure not to use a locale and this way we can use
35 | * standard strings (std::string and std::wstring) instead of having to
36 | * call C functions.
37 | */
38 |
39 | // self
40 | //
41 | #include "libutf8/iterator.h"
42 |
43 | #include "libutf8/base.h"
44 | #include "libutf8/libutf8.h"
45 |
46 |
47 | // C++
48 | //
49 | #include
50 |
51 |
52 | // last include
53 | //
54 | #include
55 |
56 |
57 |
58 | namespace libutf8
59 | {
60 |
61 |
62 |
63 | utf8_iterator::utf8_iterator(std::string const & str, bool end) // start at the end of str when `end` is true, at byte 0 otherwise
64 | : f_str(&str) // only the address is kept -- NOTE(review): str presumably has to outlive the iterator
65 | , f_pos(end ? str.length() : 0)
66 | , f_start_pos(f_pos) // NOTE(review): relies on f_pos being declared before f_start_pos -- confirm in iterator.h
67 | {
68 | }
69 |
70 |
71 | utf8_iterator & utf8_iterator::operator ++ () // pre-increment
72 | {
73 |     this->increment();      // move first...
74 |     return *this;           // ...then hand back this very iterator
75 | }
76 |
77 |
78 | utf8_iterator utf8_iterator::operator ++ (int) // post-increment
79 | {
80 | utf8_iterator it(*this);
81 | increment();
82 | return it;
83 | }
84 |
85 |
86 | utf8_iterator & utf8_iterator::operator -- ()
87 | {
88 | decrement();
89 | return *this;
90 | }
91 |
92 |
93 | utf8_iterator utf8_iterator::operator -- (int) // post-decrement
94 | {
95 | utf8_iterator it(*this);
96 | decrement();
97 | return it;
98 | }
99 |
100 |
101 | /** \brief Read the current character.
102 | *
103 | * This function reads the current character and returns it as a char32_t
104 | * (i.e. UTF-32).
105 | *
106 | * When the iterator is at the end of the input string (it == str.end()),
107 | * then the function returns libutf8::EOS (-1).
108 | *
109 | * When the current character is valid, the value is any number from 0 to
110 | * 0x10FFFF except for UTF-16 surrogate values (0xD800 to 0xDFFF).
111 | *
112 | * When the current character is invalid (bad UTF-8 encoding, although
113 | * extended UTF-8 is accepted here), then the function returns
114 | * libutf8::NOT_A_CHARACTER (-2). Further, the good flag is also set to
115 | * false, which means good() returns false and bad() returns true.
116 | *
117 | * \code
118 | * for(libutf8::utf8_iterator it(s); it != s.end(); ++it)
119 | * {
120 | * char32_t c(*it);
121 | *
122 | * // here you can choose:
123 | * if(c == libutf8::NOT_A_CHARACTER)
124 | * {
125 | * // handle error -- current character is not valid UTF-8
126 | * break;
127 | * }
128 | * // -- or --
129 | * if(it.bad())
130 | * {
131 | * // handle error -- current character is not valid UTF-8
132 | * break;
133 | * }
134 | * }
135 | * \endcode
136 | *
137 | * Since this function returns EOS when the iterator is at the end of
138 | * the string, you can also stop the iteration process like so:
139 | *
140 | * \code
141 | * libutf8::utf8_iterator it(s);
142 | * for(;;)
143 | * {
144 | * char32_t c(*it);
145 | * if(c == libutf8::EOS)
146 | * {
147 | * // success, all characters were valid
148 | * break;
149 | * }
150 | * ...handle other cases as above...
151 | * }
152 | * \endcode
153 | *
154 | * \return EOS if at the end of the string, the current character as a
155 | * char32_t value or NOT_A_CHARACTER if the current character encoding is
156 | * wrong.
157 | *
158 | * \sa good()
159 | * \sa bad()
160 | */
161 | char32_t utf8_iterator::operator * () const
162 | {
163 | if(f_pos >= f_str->length())
164 | {
165 | return EOS;
166 | }
167 | char const * s(f_str->c_str() + f_pos);
168 | char32_t wc(NOT_A_CHARACTER);
169 | size_t len(f_str->length() - f_pos);
170 | if(mbstowc(wc, s, len) < 0)
171 | {
172 | f_good = false;
173 | }
174 | return wc;
175 | }
176 |
177 |
178 | bool utf8_iterator::operator == (utf8_iterator const & rhs) const
179 | {
180 | return f_pos == rhs.f_pos;
181 | }
182 |
183 |
184 | bool utf8_iterator::operator != (utf8_iterator const & rhs) const
185 | {
186 | return f_pos != rhs.f_pos;
187 | }
188 |
189 |
190 | bool utf8_iterator::operator == (std::string::iterator it) const
191 | {
192 | return static_cast(it - f_str->begin()) == f_pos;
193 | }
194 |
195 |
196 | bool utf8_iterator::operator != (std::string::iterator it) const
197 | {
198 | return static_cast(it - f_str->begin()) != f_pos;
199 | }
200 |
201 |
202 | bool utf8_iterator::operator == (std::string::const_iterator it) const
203 | {
204 | return static_cast(it - f_str->cbegin()) == f_pos;
205 | }
206 |
207 |
208 | bool utf8_iterator::operator != (std::string::const_iterator it) const
209 | {
210 | return static_cast(it - f_str->cbegin()) != f_pos;
211 | }
212 |
213 |
214 | bool operator == (std::string::iterator it, utf8_iterator const & rhs)
215 | {
216 | return static_cast(it - rhs.f_str->begin()) == rhs.f_pos;
217 | }
218 |
219 |
220 | bool operator != (std::string::iterator it, utf8_iterator const & rhs)
221 | {
222 | return static_cast(it - rhs.f_str->begin()) != rhs.f_pos;
223 | }
224 |
225 |
226 | bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
227 | {
228 | return static_cast(it - rhs.f_str->cbegin()) == rhs.f_pos;
229 | }
230 |
231 |
232 | bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
233 | {
234 | return static_cast(it - rhs.f_str->cbegin()) != rhs.f_pos;
235 | }
236 |
237 |
238 | void utf8_iterator::increment()
239 | {
240 | auto skip = [&]()
241 | {
242 | for(unsigned char b(0)
243 | ; f_pos < f_str->length()
244 | && (b = static_cast(f_str[0][f_pos]),
245 | (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
246 | ; ++f_pos);
247 | f_good = false;
248 | };
249 |
250 | if(f_pos >= f_str->length())
251 | {
252 | return;
253 | }
254 |
255 | // increment is easy we can just get the current character and we know
256 | // the size of the character in UTF-8
257 | //
258 | unsigned char c(static_cast(f_str[0][f_pos]));
259 |
260 | if(c < 0x80)
261 | {
262 | ++f_pos;
263 | }
264 | else if(c <= 0xBF || c >= 0xF5)
265 | {
266 | // ?! invalid UTF-8 ?!
267 | //
268 | skip();
269 | }
270 | else if(c >= 0xF0)
271 | {
272 | f_pos += 4;
273 | if(c == 0xF4 && f_pos - 3 < f_str->length())
274 | {
275 | c = static_cast(f_str[0][f_pos - 3]);
276 | if(c >= 0x90)
277 | {
278 | f_pos -= 3;
279 | skip();
280 | }
281 | }
282 | }
283 | else if(c >= 0xE0)
284 | {
285 | f_pos += 3;
286 | }
287 | else /*if(c >= 0xC0)*/ // always true so we don't have to check
288 | {
289 | f_pos += 2;
290 | }
291 | if(f_pos > f_str->length())
292 | {
293 | f_pos = f_str->length();
294 | f_good = false;
295 | }
296 | }
297 |
298 |
299 | /** \brief Decrement the iterator.
300 | *
301 | * If the iterator is not already at position 0, decrement it to the previous
302 | * UTF-8 character. This means skipping to the first UTF-8 byte.
303 | *
304 | * \note
305 | * Contrary to the increment(), this function does not set the good flag to
306 | * true or false whether it is at the start or there is an invalid character.
307 | */
308 | void utf8_iterator::decrement()
309 | {
310 | if(f_pos == 0)
311 | {
312 | return;
313 | }
314 |
315 | // decrement requires us to search for the previous starting byte
316 | // which means we need to scan the string
317 | //
318 | while(f_pos > 0)
319 | {
320 | --f_pos;
321 | unsigned char c(static_cast(f_str[0][f_pos]));
322 | if(c < 0x80
323 | || c >= 0xC0)
324 | {
325 | break;
326 | }
327 | }
328 | }
329 |
330 |
331 | /** \brief Compute the distance between two iterators.
332 | *
333 | * This function computers the distance between two libutf8 iterators.
334 | *
335 | * The right hand side iterator must be from the same string as the
336 | * lhs string.
337 | *
338 | * \return The distance between the two iterators.
339 | */
340 | utf8_iterator::difference_type utf8_iterator::operator - (utf8_iterator const & rhs) const
341 | {
342 | return f_pos - rhs.f_pos;
343 | }
344 |
345 |
346 | /** \brief Compute the distance between two iterators.
347 | *
348 | * This operator computes the difference between this iterator and the
349 | * specified \p it iterator.
350 | *
351 | * \param[in] it The iterator to calculate the distance from.
352 | *
353 | * \return The distance between the two iterators.
354 | */
355 | utf8_iterator::difference_type utf8_iterator::operator - (std::string::const_iterator it) const
356 | {
357 | return static_cast(f_str->cbegin() + f_pos - it);
358 | }
359 |
360 |
361 | /** \brief Compute the distance between two iterators.
362 | *
363 | * This operator computes the difference between the two specified iterators
364 | * \p it and \p rhs.
365 | *
366 | * \param[in] it The iterator to calculate the distance from.
367 | * \param[in] rhs The iterator to calculate the distance to.
368 | *
369 | * \return The distance between the two specified iterators.
370 | */
371 | utf8_iterator::difference_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
372 | {
373 | return static_cast(it - rhs.f_str->cbegin() - rhs.f_pos);
374 | }
375 |
376 |
377 | /** \brief Restart the iterator.
378 | *
379 | * The iterator started at 0 or the end of the string, then you moved it
380 | * using the `++` or `--` operators. Later you may want to re-parse the
381 | * string from the start or end of the string.
382 | *
383 | * This function resets the position back to 0 or the end as defined on
384 | * the constructor.
385 | */
386 | void utf8_iterator::rewind()
387 | {
388 | f_pos = f_start_pos;
389 | }
390 |
391 |
392 | /** \brief Clear the errors.
393 | *
394 | * The iterator is considered good by default. If you try to retreive
395 | * a character after the end of the string being iterated or the
396 | * bytes do not represent an invalid UTF-8 character.
397 | *
398 | * \sa good()
399 | * \sa bad()
400 | */
401 | void utf8_iterator::clear()
402 | {
403 | f_good = true;
404 | }
405 |
406 |
407 | /** \brief Check whether the iterator did not run in an error.
408 | *
409 | * The iterator remains good as long as the input characters are valid
410 | * and the end of the string is not reached. After either event, this
411 | * function returns false.
412 | *
413 | * You can clear this flag by calling the clear() function.
414 | *
415 | * \return true if no errors were encountered so far.
416 | *
417 | * \sa clear()
418 | * \sa bad()
419 | */
420 | bool utf8_iterator::good() const
421 | {
422 | return f_good;
423 | }
424 |
425 |
426 | /** \brief Check whether the iterator ran in an error.
427 | *
428 | * This function returns true if an invalid character or the end of the
429 | * string was found.
430 | *
431 | * \return true if an error condition was encountered.
432 | *
433 | * \sa clear()
434 | * \sa good()
435 | */
436 | bool utf8_iterator::bad() const
437 | {
438 | return !f_good;
439 | }
440 |
441 |
442 |
443 | } // libutf8 namespace
444 | // vim: ts=4 sw=4 et
445 |
--------------------------------------------------------------------------------
/libutf8/iterator.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 | /** \file
22 | * \brief The declarations of the UTF-8 library.
23 | *
24 | * This file is the declarations of the UTF-8 library which are just a few
25 | * functions used to convert a string from one format to another.
26 | */
27 |
28 | // C++
29 | //
30 | #include
31 |
32 |
33 |
34 | namespace libutf8
35 | {
36 |
37 |
/** \brief Value returned when reading past the end of a string.
 *
 * This is the C library EOF value converted to a char32_t.
 */
constexpr char32_t EOS = static_cast<char32_t>(EOF);
39 |
40 |
/** \brief Bidirectional iterator reading a std::string as UTF-8.
 *
 * The iterator keeps a pointer to the string it was created with and a
 * byte offset within that string. Dereferencing returns one char32_t.
 */
class utf8_iterator
{
public:
    // Iterator traits
    //
    using iterator_category = std::bidirectional_iterator_tag;
    using value_type        = char32_t;
    using difference_type   = ssize_t;
    using pointer           = char32_t const *;
    using reference         = char32_t const &;

                            utf8_iterator(std::string const & str, bool end = false);

    // moving
    //
    utf8_iterator &         operator ++ ();
    utf8_iterator           operator ++ (int);
    utf8_iterator &         operator -- ();
    utf8_iterator           operator -- (int);

    // reading
    //
    value_type              operator * () const;

    // comparing (byte offsets)
    //
    bool                    operator == (utf8_iterator const & rhs) const;
    bool                    operator != (utf8_iterator const & rhs) const;
    bool                    operator == (std::string::iterator it) const;
    bool                    operator != (std::string::iterator it) const;
    bool                    operator == (std::string::const_iterator it) const;
    bool                    operator != (std::string::const_iterator it) const;
    friend bool             operator == (std::string::iterator it, utf8_iterator const & rhs);
    friend bool             operator != (std::string::iterator it, utf8_iterator const & rhs);
    friend bool             operator == (std::string::const_iterator it, utf8_iterator const & rhs);
    friend bool             operator != (std::string::const_iterator it, utf8_iterator const & rhs);

    // distances (in bytes)
    //
    difference_type         operator - (utf8_iterator const & rhs) const;
    difference_type         operator - (std::string::const_iterator it) const;
    friend difference_type  operator - (std::string::const_iterator it, utf8_iterator const & rhs);

    // state
    //
    void                    rewind();
    void                    clear();
    bool                    good() const;
    bool                    bad() const;

private:
    void                    increment();
    void                    decrement();

    std::string const *     f_str = nullptr;        // string being iterated (not owned)
    std::string::size_type  f_pos = 0;              // current byte offset in *f_str
    std::string::size_type  f_start_pos = 0;        // offset restored by rewind()
    mutable bool            f_good = true;          // mutable: operator * () const may clear it
};
87 |
88 |
89 |
90 | } // libutf8 namespace
91 | // vim: ts=4 sw=4 et
92 |
--------------------------------------------------------------------------------
/libutf8/json_tokens.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 | /** \file
22 | * \brief The declarations of the JSON tokens class.
23 | *
24 | * This file is the declarations of the JSON tokens class one can use to
25 | * interpret the contents of a JSON file.
26 | *
27 | * The idea of this simple parser is to (1) show how one can use the
28 | * libutf8 library and (2) give you the ability to parse a simple JSON
29 | * structure.
30 | */
31 |
32 | // self
33 | //
34 | #include
35 |
36 |
37 | // C++
38 | //
39 | #include
40 | #include
41 | #include
42 |
43 |
44 |
45 | namespace libutf8
46 | {
47 |
48 |
/** \brief The tokens returned by json_tokens::next_token().
 */
enum class token_t
{
    // special tokens
    //
    TOKEN_END,
    TOKEN_ERROR,

    // structure
    //
    TOKEN_OPEN_ARRAY,
    TOKEN_CLOSE_ARRAY,
    TOKEN_OPEN_OBJECT,
    TOKEN_CLOSE_OBJECT,

    // values
    //
    TOKEN_NUMBER,
    TOKEN_STRING,

    // separators
    //
    TOKEN_COMMA,
    TOKEN_COLON,

    // literals
    //
    TOKEN_TRUE,
    TOKEN_FALSE,
    TOKEN_NULL,
};
65 |
66 |
67 | class json_tokens
68 | {
69 | public:
70 | json_tokens(std::string const & input);
71 |
72 | int line() const;
73 | int column() const;
74 | token_t next_token();
75 | double number() const;
76 | std::string const & string() const;
77 | std::string const & error() const;
78 |
79 | private:
80 | char32_t getc();
81 | void ungetc(char32_t c);
82 | char32_t char16(char32_t & c);
83 | void add_error_character(char32_t c);
84 |
85 | std::string f_input = std::string();
86 | utf8_iterator f_iterator; // initialize in the constructor
87 | char32_t f_unget[16];
88 | std::size_t f_unget_pos = 0;
89 | std::uint32_t f_line = 1;
90 | std::uint32_t f_last_line = 0;
91 | std::uint32_t f_column = 1;
92 | std::uint32_t f_last_column = 0;
93 | double f_number = 0.0;
94 | std::string f_string = std::string();
95 | std::string f_error = std::string();
96 | };
97 |
98 |
99 | } // libutf8 namespace
100 | // vim: ts=4 sw=4 et
101 |
--------------------------------------------------------------------------------
/libutf8/libutf8.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 | /** \file
22 | * \brief The declarations of the UTF-8 library.
23 | *
24 | * This file is the declarations of the UTF-8 library which are just a few
25 | * functions used to convert a string from one format to another.
26 | */
27 |
28 | // C++
29 | //
30 | #include
31 |
32 |
33 |
34 | namespace libutf8
35 | {
36 |
37 |
/** \brief The Byte Order Mark types returned by start_with_bom().
 */
enum class bom_t
{
    BOM_NONE,       ///< no BOM
    BOM_UTF8,       ///< UTF-8 BOM
    BOM_UTF16_LE,   ///< UTF-16 BOM, little endian
    BOM_UTF16_BE,   ///< UTF-16 BOM, big endian
    BOM_UTF32_LE,   ///< UTF-32 BOM, little endian
    BOM_UTF32_BE    ///< UTF-32 BOM, big endian
};
47 |
48 |
/** \brief The result of the is_surrogate() function.
 */
enum class surrogate_t
{
    SURROGATE_NO,       ///< not a surrogate
    SURROGATE_HIGH,     ///< a high surrogate
    SURROGATE_LOW       ///< a low surrogate
};
55 |
56 |
57 |
58 |
59 | bool is_valid_ascii(char c, bool ctrl = true);
60 | bool is_valid_ascii(char const * str, bool ctrl = true);
61 | bool is_valid_ascii(std::string const & str, bool ctrl = true);
62 | bool is_valid_utf8(char const * str);
63 | bool is_valid_utf8(std::string const & str);
64 | bool is_valid_utf16(std::u16string const & str);
65 | bool is_valid_unicode(char32_t const wc, bool ctrl = true);
66 | bool is_valid_unicode(char32_t const * str, bool ctrl = true);
67 | bool is_valid_unicode(std::u32string const & str, bool ctrl = true);
68 | surrogate_t is_surrogate(char32_t wc);
69 | bom_t start_with_bom(char const * str, size_t len);
70 | std::string to_u8string(std::u32string const & str);
71 | std::string to_u8string(std::u16string const & str);
72 | std::string to_u8string(std::wstring const & str);
73 | std::string to_u8string(wchar_t one, wchar_t two = L'\0');
74 | std::string to_u8string(char16_t one, char16_t two = u'\0');
75 | std::string to_u8string(char32_t const wc);
76 | std::u16string to_u16string(char32_t const wc);
77 | std::u16string to_u16string(std::string const & str);
78 | std::u32string to_u32string(std::string const & str);
79 | std::size_t u8length(std::string const & str);
80 | ssize_t u16length(std::u16string const & str);
81 | int u8casecmp(std::string const & lhs, std::string const & rhs);
82 | bool make_u8string_valid(std::string & str, char32_t fix_char = U'?');
83 |
84 |
85 |
86 | } // libutf8 namespace
87 |
88 |
89 | inline std::string operator + (char32_t wc, std::string const & rhs)
90 | {
91 | std::string v;
92 | v = libutf8::to_u8string(wc);
93 | return v + rhs;
94 | }
95 |
96 |
97 | inline std::string operator + (std::string const & lhs, char32_t wc)
98 | {
99 | std::string v;
100 | v = libutf8::to_u8string(wc);
101 | return lhs + v;
102 | }
103 |
104 |
105 | inline std::string & operator += (std::string & lhs, char32_t wc)
106 | {
107 | return lhs += libutf8::to_u8string(wc);
108 | }
109 |
110 |
111 | inline std::string & operator += (std::string & lhs, int c)
112 | {
113 | return lhs += static_cast(c);
114 | }
115 |
116 |
117 | inline std::string & operator += (std::string & lhs, unsigned int c)
118 | {
119 | return lhs += static_cast(c);
120 | }
121 |
122 |
123 | inline std::string & operator += (std::string & lhs, long c)
124 | {
125 | return lhs += static_cast(c);
126 | }
127 |
128 |
129 | inline std::string & operator += (std::string & lhs, unsigned long c)
130 | {
131 | return lhs += static_cast(c);
132 | }
133 |
134 |
135 |
136 | // vim: ts=4 sw=4 et
137 |
--------------------------------------------------------------------------------
/libutf8/unicode_data.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | /** \file
21 | * \brief Implementation of the UTF-8 functions.
22 | *
23 | * This file is the implementation of the UTF-8 functions of the libutf8
24 | * library. It simply is a set of functions to convert between different
25 | * character sets in a lossless manner. At this point it supports UTF-8,
26 | * UCS-4, and UTF-16 formats.
27 | *
28 | * Contrary to many of the system functions, these functions do not take
29 | * anything from the system into account (the locale can be anything, it
30 | * does not change the exact behavior of these functions).
31 | *
32 | * Although similar functionality is found on Unices and MS-Windows, it was
33 | * simpler to just implement these few functions than to try to have a
34 | * converter that is sure not to use a locale; this way we can also use
35 | * standard strings (std::string and std::wstring) instead of having to
36 | * call C functions.
37 | */
38 |
39 | // self
40 | //
41 | #include "libutf8/unicode_data.h"
42 |
43 | #include "libutf8/exception.h"
44 | #include "libutf8/libutf8.h"
45 | #include "libutf8/unicode_data_file.h"
46 |
47 |
48 | // C++
49 | //
50 | #include
51 | #include
52 |
53 |
54 | // last include
55 | //
56 | #include
57 |
58 |
59 |
60 | /** \brief Name space of the UTF-8 library.
61 | *
62 | * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
63 | * (MS-Windows) and vice versa.
64 | */
65 | namespace libutf8
66 | {
67 |
68 |
69 | namespace
70 | {
71 |
72 |
73 |
74 |
75 |
76 | class private_unicode_character
77 | : public unicode_character
78 | {
79 | public:
80 | private_unicode_character(
81 | char32_t code
82 | , detail::ucd_header * h);
83 |
84 | protected:
85 | virtual detail::ucd_character *
86 | ucd_character_pointer() const override;
87 |
88 | private:
89 | detail::ucd_character
90 | f_private_character = detail::ucd_character();
91 | };
92 |
93 |
94 | private_unicode_character::private_unicode_character(
95 | char32_t code
96 | , detail::ucd_header * h)
97 | : unicode_character(code, &f_private_character, h)
98 | {
99 | f_private_character.f_code = code;
100 | f_private_character.f_flags = detail::UCD_FLAG_PRIVATE;
101 | f_private_character.f_general_category = General_Category::GC_Private_Use;
102 | f_private_character.f_bidi_class = Bidi_Class::BC_Left_To_Right;
103 | }
104 |
105 |
106 | detail::ucd_character * private_unicode_character::ucd_character_pointer() const
107 | {
108 | return const_cast(&f_private_character);
109 | }
110 |
111 |
112 |
113 | } // no name namespace
114 |
115 |
116 |
117 |
118 |
119 |
120 | unicode_character::unicode_character(
121 | char32_t code
122 | , detail::ucd_character * c
123 | , detail::ucd_header * h)
124 | : f_code(code)
125 | , f_character(c)
126 | , f_header(h)
127 | {
128 | }
129 |
130 |
131 | unicode_character::~unicode_character()
132 | {
133 | }
134 |
135 |
136 | unicode_character::unicode_character(unicode_character const & rhs)
137 | {
138 | // this looks weird, but it works as expected
139 | //
140 | f_character = rhs.f_character;
141 | f_character = ucd_character_pointer();
142 | f_header = rhs.f_header;
143 | }
144 |
145 |
146 | unicode_character & unicode_character::operator = (unicode_character const & rhs)
147 | {
148 | // this looks weird, but it works as expected
149 | //
150 | f_character = rhs.f_character;
151 | f_character = ucd_character_pointer();
152 | f_header = rhs.f_header;
153 |
154 | return *this;
155 | }
156 |
157 |
158 | bool unicode_character::is_valid() const
159 | {
160 | return is_valid_unicode(f_code);
161 | }
162 |
163 |
164 | bool unicode_character::is_defined() const
165 | {
166 | return f_character->f_code != NOT_A_CHARACTER;
167 | }
168 |
169 |
170 | bool unicode_character::is_private() const
171 | {
172 | return (f_character->f_flags & detail::UCD_FLAG_PRIVATE) != 0;
173 | }
174 |
175 |
176 | General_Category unicode_character::category() const
177 | {
178 | return f_character->f_general_category;
179 | }
180 |
181 |
182 | bool unicode_character::is_letter() const
183 | {
184 | return f_character->f_general_category >= General_Category::GC_Uppercase_Letter
185 | && f_character->f_general_category <= General_Category::GC_Other_Letter;
186 | }
187 |
188 |
189 | bool unicode_character::is_mark() const
190 | {
191 | return f_character->f_general_category >= General_Category::GC_Nonspacing_Mark
192 | && f_character->f_general_category <= General_Category::GC_Enclosing_Mark;
193 | }
194 |
195 |
196 | bool unicode_character::is_number() const
197 | {
198 | return f_character->f_general_category >= General_Category::GC_Decimal_Number
199 | && f_character->f_general_category <= General_Category::GC_Other_Number;
200 | }
201 |
202 |
203 | bool unicode_character::is_punctuation() const
204 | {
205 | return f_character->f_general_category >= General_Category::GC_Connector_Punctuation
206 | && f_character->f_general_category <= General_Category::GC_Other_Punctuation;
207 | }
208 |
209 |
210 | bool unicode_character::is_symbol() const
211 | {
212 | return f_character->f_general_category >= General_Category::GC_Math_Symbol
213 | && f_character->f_general_category <= General_Category::GC_Other_Symbol;
214 | }
215 |
216 |
217 | bool unicode_character::is_separator() const
218 | {
219 | return f_character->f_general_category >= General_Category::GC_Space_Separator
220 | && f_character->f_general_category <= General_Category::GC_Paragraph_Separator;
221 | }
222 |
223 |
224 | bool unicode_character::is_other() const
225 | {
226 | return f_character->f_general_category >= General_Category::GC_Control
227 | && f_character->f_general_category <= General_Category::GC_Unassigned;
228 | }
229 |
230 |
231 |
232 | Canonical_Combining_Class unicode_character::combining_class()
233 | {
234 | return f_character->f_canonical_combining_class;
235 | }
236 |
237 |
238 | Bidi_Class unicode_character::bidi_class() const
239 | {
240 | return f_character->f_bidi_class;
241 | }
242 |
243 |
244 | bool unicode_character::is_bidi_mirrored() const
245 | {
246 | return (f_character->f_flags & detail::UCD_FLAG_BIDI_MIRROR) != 0;
247 | }
248 |
249 |
250 | Decomposition_Type unicode_character::decomposition_type() const
251 | {
252 | return static_cast(f_character->f_decomposition_type);
253 | }
254 |
255 |
256 | Numeric_Type unicode_character::numeric() const
257 | {
258 | if((f_character->f_flags & detail::UCD_FLAG_DIGIT) != 0)
259 | {
260 | return Numeric_Type::NT_Digit;
261 | }
262 |
263 | if((f_character->f_flags & detail::UCD_FLAG_DECIMAL) != 0)
264 | {
265 | return Numeric_Type::NT_Decimal;
266 | }
267 |
268 | if((f_character->f_flags & detail::UCD_FLAG_NUMERIC) != 0)
269 | {
270 | return Numeric_Type::NT_Numeric;
271 | }
272 |
273 | return Numeric_Type::NT_Unknown;
274 | }
275 |
276 |
277 | std::int64_t unicode_character::get_number(int index) const
278 | {
279 | std::size_t length(0);
280 | char const * name(find_name(detail::Name_Type::NT_Numeric, length));
281 | if(name == nullptr)
282 | {
283 | return 0;
284 | }
285 | if(length != 16)
286 | {
287 | // someone tempered with the database?
288 | //
289 | throw libutf8_logic_exception("invalid \"name\" size for a number");
290 | }
291 | std::int64_t const * number(reinterpret_cast(name));
292 | return number[index];
293 | }
294 |
295 |
296 | std::int64_t unicode_character::nominator() const
297 | {
298 | return get_number(0);
299 | }
300 |
301 |
302 | std::int64_t unicode_character::denominator() const
303 | {
304 | return get_number(1);
305 | }
306 |
307 |
308 | char const * unicode_character::find_name(detail::Name_Type type, std::size_t & length) const
309 | {
310 | if(f_character->f_names == 0)
311 | {
312 | throw libutf8_logic_exception("character is missing a name");
313 | }
314 |
315 | char const * name(reinterpret_cast(f_header)
316 | + f_header->f_strings + f_character->f_names);
317 | for(;;)
318 | {
319 | detail::Name_Type const t(static_cast(name[0]));
320 | if(t == detail::Name_Type::NT_EndOfNames)
321 | {
322 | length = 0;
323 | return nullptr;
324 | }
325 | length = static_cast(name[1]);
326 | if(t == type)
327 | {
328 | return name + 2;
329 | }
330 | name += length + 2;
331 | }
332 | }
333 |
334 |
335 | detail::ucd_character * unicode_character::ucd_character_pointer() const
336 | {
337 | return f_character;
338 | }
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 | } // libutf8 namespace
347 | // vim: ts=4 sw=4 et
348 |
--------------------------------------------------------------------------------
/libutf8/unicode_data.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 | /** \file
22 | * \brief The declarations of the UTF-8 library.
23 | *
24 | * This file is the declarations of the UTF-8 library which are just a few
25 | * functions used to convert a string from one format to another.
26 | */
27 |
28 | // self
29 | //
30 | #include
31 |
32 |
33 | // C++
34 | //
35 | #include
36 | #include
37 |
38 |
39 |
40 | namespace libutf8
41 | {
42 |
43 |
44 | namespace detail
45 | {
46 | class unicode_data_impl;
47 | class parser_impl;
48 | class ucd_header;
49 | class ucd_character;
50 | enum class Name_Type : std::uint8_t;
51 | } // detail namespace
52 |
53 |
54 |
/** \brief The general category of a character.
 *
 * The values are grouped so that each major class is one contiguous
 * range; the unicode_character::is_...() functions test those ranges.
 *
 * \note
 * In Unicode, Lm is named Modifier_Letter and Sc is named
 * Currency_Symbol. The names below are kept as is.
 */
enum class General_Category : std::uint8_t
{
    GC_Unknown_Category         = 0,

    // Letter (L): 1 to 6
    //
    GC_Uppercase_Letter         = 1,        // Lu
    GC_Lowercase_Letter         = 2,        // Ll
    GC_TitleCase_Letter         = 3,        // Lt
    GC_Cased_Letter             = 4,        // LC
    GC_Modified_Letter          = 5,        // Lm
    GC_Other_Letter             = 6,        // Lo

    // Mark (M): 7 to 9
    //
    GC_Nonspacing_Mark          = 7,        // Mn
    GC_Spacing_Mark             = 8,        // Mc
    GC_Enclosing_Mark           = 9,        // Me

    // Number (N): 10 to 12
    //
    GC_Decimal_Number           = 10,       // Nd
    GC_Letter_Number            = 11,       // Nl
    GC_Other_Number             = 12,       // No

    // Punctuation (P): 13 to 19
    //
    GC_Connector_Punctuation    = 13,       // Pc
    GC_Dash_Punctuation         = 14,       // Pd
    GC_Open_Punctuation         = 15,       // Ps
    GC_Close_Punctuation        = 16,       // Pe
    GC_Initial_Punctuation      = 17,       // Pi
    GC_Final_Punctuation        = 18,       // Pf
    GC_Other_Punctuation        = 19,       // Po

    // Symbol (S): 20 to 23
    //
    GC_Math_Symbol              = 20,       // Sm
    GC_Current_Symbol           = 21,       // Sc
    GC_Modifier_Symbol          = 22,       // Sk
    GC_Other_Symbol             = 23,       // So

    // Separator (Z): 24 to 26
    //
    GC_Space_Separator          = 24,       // Zs
    GC_Line_Separator           = 25,       // Zl
    GC_Paragraph_Separator      = 26,       // Zp

    // Other (C): 27 to 31
    //
    GC_Control                  = 27,       // Cc
    GC_Format                   = 28,       // Cf
    GC_Surrogate                = 29,       // Cs
    GC_Private_Use              = 30,       // Co
    GC_Unassigned               = 31,       // Cn
};
104 |
105 |
106 | enum class Canonical_Combining_Class : std::uint8_t // Unicode Canonical_Combining_Class; each enumerator equals the numeric class value
107 | {
108 | CCC_Not_Reordered = 0, // value assigned by ucd_character::initialize_ucd_character()
109 |
110 | // Fixed position classes
111 | CCC_Overlay = 1,
112 | CCC_Han_Reading = 6,
113 | CCC_Nukta = 7,
114 | CCC_Kana_Voicing = 8,
115 | CCC_Virama = 9,
116 | CCC_Ccc10 = 10, // first CCC
117 | // ... values 11 to 198 have no enumerator of their own ...
118 | CCC_Ccc199 = 199, // last CCC
119 |
120 | // Other classes
121 | CCC_Attached_Below_Left = 200,
122 | CCC_Attached_Below = 202,
123 | CCC_Attached_Above = 214,
124 | CCC_Attached_Above_Right = 216,
125 | CCC_Below_Left = 218,
126 | CCC_Below = 220,
127 | CCC_Below_Right = 222,
128 | CCC_Left = 224,
129 | CCC_Right = 226,
130 | CCC_Above_Left = 228,
131 | CCC_Above = 230,
132 | CCC_Above_Right = 232,
133 | CCC_Double_Below = 233,
134 | CCC_Double_Above = 234,
135 | CCC_Iota_Subscript = 240,
136 | };
137 |
138 |
139 | enum class Bidi_Class : std::uint8_t // Unicode Bidi_Class; trailing comments give the Unicode alias, each family of types starts on a multiple of ten
140 | {
141 | BC_Unknown = 0, // value assigned by ucd_character::initialize_ucd_character()
142 |
143 | // Strong Types
144 | BC_Left_To_Right = 10, // L
145 | BC_Right_To_Left = 11, // R
146 | BC_Arabic_Letter = 12, // AL
147 |
148 | // Weak Types
149 | BC_European_Number = 20, // EN
150 | BC_European_Separator = 21, // ES
151 | BC_European_Terminator = 22, // ET
152 | BC_Arabic_Number = 23, // AN
153 | BC_Common_Separator = 24, // CS
154 | BC_Nonspacing_Mark = 25, // NSM
155 | BC_Boundary_Neutral = 26, // BN
156 |
157 | // Neutral Types
158 | BC_Paragraph_Separator = 30, // B
159 | BC_Segment_Separator = 31, // S
160 | BC_White_Space = 32, // WS
161 | BC_Other_Neutral = 33, // ON
162 |
163 | // Explicit Formatting Types
164 | BC_Left_To_Right_Embedding = 40, // LRE
165 | BC_Left_To_Right_Override = 41, // LRO
166 | BC_Right_To_Left_Embedding = 42, // RLE
167 | BC_Right_To_Left_Override = 43, // RLO
168 | BC_Pop_Directional_Format = 44, // PDF
169 | BC_Left_To_Right_Isolate = 45, // LRI
170 | BC_Right_To_Left_Isolate = 46, // RLI
171 | BC_First_Strong_Isolate = 47, // FSI
172 | BC_Pop_Directional_Isolate = 48, // PDI
173 | };
174 |
175 |
176 | enum class Decomposition_Type : std::uint8_t // kind of decomposition mapping; values must stay below 32 because ucd_character stores them in a 5 bit field
177 | {
178 | DT_unknown = 0, // value assigned by ucd_character::initialize_ucd_character()
179 | DT_none = 1,
180 | DT_canonical = 2,
181 | // the types below start at 10
182 | DT_font = 10,
183 | DT_noBreak = 11,
184 | DT_initial = 12,
185 | DT_medial = 13,
186 | DT_final = 14,
187 | DT_isolated = 15,
188 | DT_circle = 16,
189 | DT_super = 17,
190 | DT_sub = 18,
191 | DT_vertical = 19,
192 | DT_wide = 20,
193 | DT_narrow = 21,
194 | DT_small = 22,
195 | DT_square = 23,
196 | DT_fraction = 24,
197 | DT_compat = 25, // largest value in use
198 | };
199 |
200 |
201 | enum class Numeric_Type : std::uint8_t // kind of numeric value attached to a character, as returned by unicode_character::numeric()
202 | {
203 | NT_Unknown = 0, // a.k.a. this is not marked as a number
204 |
205 | NT_Digit = 1, // the Digit type should be viewed as equivalent to Decimal
206 | NT_Decimal = 2,
207 | NT_Numeric = 3,
208 | };
209 |
210 |
211 |
212 |
213 |
214 | class unicode_character // read access to the Unicode properties of one code point
215 | {
216 | public:
217 | typedef std::shared_ptr
218 | pointer_t;
219 |
220 | unicode_character(
221 | char32_t code
222 | , detail::ucd_character * c // raw pointer; NOTE(review): ownership and lifetime are not visible in this header -- confirm
223 | , detail::ucd_header * h); // raw pointer; same lifetime question as for c
224 | virtual ~unicode_character();
225 | unicode_character(unicode_character const & rhs);
226 | unicode_character & operator = (unicode_character const & rhs);
227 |
228 | bool is_valid() const; // valid code point as far as Unicode (UTF-32) is concerned
229 | bool is_defined() const; // whether this is a Unicode defined character or not
230 | bool is_private() const; // whether this code point is reserved for private use
231 |
232 | General_Category category() const;
233 | bool is_letter() const; // the is_...() tests below presumably map to the General_Category groups (L, M, N, P, S, Z, C) -- confirm
234 | bool is_mark() const;
235 | bool is_number() const;
236 | bool is_punctuation() const;
237 | bool is_symbol() const;
238 | bool is_separator() const;
239 | bool is_other() const;
240 |
241 | Canonical_Combining_Class
242 | combining_class(); // NOTE(review): not const, unlike every other accessor -- confirm whether intentional
243 | Bidi_Class bidi_class() const;
244 | bool is_bidi_mirrored() const;
245 | Decomposition_Type decomposition_type() const;
246 |
247 | Numeric_Type numeric() const;
248 | std::int64_t nominator() const; // NOTE(review): "nominator" presumably means numerator -- the name is part of the API
249 | std::int64_t denominator() const;
250 |
251 | protected:
252 | virtual detail::ucd_character *
253 | ucd_character_pointer() const;
254 |
255 | private:
256 | std::int64_t get_number(int index) const; // presumably index selects nominator or denominator -- confirm
257 | char const * find_name(detail::Name_Type type, std::size_t & length) const; // length is an output parameter (non-const reference)
258 |
259 | char32_t f_code = NOT_A_CHARACTER;
260 | detail::ucd_character *
261 | f_character = nullptr;
262 | detail::ucd_header *f_header = nullptr;
263 | };
264 |
265 |
266 |
267 |
268 | class unicode_data // access to the Unicode data; instances are obtained through get_instance()
269 | {
270 | public:
271 | typedef std::shared_ptr
272 | pointer_t;
273 |
274 | static pointer_t get_instance(); // NOTE(review): presumably returns a shared singleton -- confirm in the implementation
275 |
276 | // input file information
277 | //
278 | time_t last_generated(); // NOTE(review): not const, unlike the other getters -- confirm whether intentional
279 | void set_cache(bool cache = true);
280 | bool get_cache() const;
281 | char const * version() const;
282 | std::string const version_string() const; // NOTE(review): const on a by-value return prevents moving from the result
283 |
284 | // access character data
285 | //
286 | unicode_character::pointer_t
287 | character(char32_t wc); // returns a shared pointer to the unicode_character for code point wc
288 |
289 | private:
290 | typedef std::shared_ptr
291 | unicode_data_impl_pointer_t;
292 |
293 | unicode_data_impl_pointer_t
294 | f_impl = unicode_data_impl_pointer_t(); // starts as a null pointer; the implementation type is only forward declared in this header
295 | };
296 |
297 |
298 | class ucd_parser // front end of the data file generator; the work is delegated to detail::parser_impl
299 | {
300 | public:
301 | ucd_parser(
302 | std::string const & input_dir // presumably the directory holding the Unicode source files -- confirm
303 | , std::string const & output_filename); // presumably the compiled data file to write -- confirm
304 |
305 | void generate();
306 |
307 | private:
308 | typedef std::shared_ptr
309 | parser_impl_pointer_t;
310 |
311 | parser_impl_pointer_t
312 | f_impl = parser_impl_pointer_t(); // starts as a null pointer; the implementation type is only forward declared in this header
313 | };
314 |
315 |
316 |
317 |
318 | } // libutf8 namespace
319 | // vim: ts=4 sw=4 et
320 |
--------------------------------------------------------------------------------
/libutf8/unicode_data_file.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 | /** \file
22 | * \brief The declarations of the Unicode compiled files.
23 | *
24 | * This file includes structures used to describe the Unicode compiled
25 | * file. This allows us to very quickly find all the information about
26 | * a character.
27 | *
28 | * From the outside, you are expected to use the unicode_character
29 | * functions defined in the unicode_data.h header. This file is
30 | * considered private.
31 | */
32 |
33 | // self
34 | //
35 | #include
36 |
37 |
38 | // C++
39 | //
40 | #include
41 |
42 |
43 |
44 | namespace libutf8
45 | {
46 |
47 | namespace detail
48 | {
49 |
50 |
51 | enum class Name_Type : std::uint8_t // see UnicodeData.txt and NameAliases.txt; this is the f_type byte of each name entry in the string table
52 | {
53 | NT_Name = 0xF0,
54 | NT_Abbreviation = 0xF1,
55 | NT_Jamo_Short_Name = 0xF2, // see Jamo.txt
56 | NT_Alternate = 0xF3,
57 | NT_Control = 0xF4,
58 | NT_WrongName = 0xF5, // the main name is the corrected name, this name is the invalid/incorrect name
59 | NT_Figment = 0xF6,
60 | NT_Numeric = 0xF7, // saved as two int64_t in the strings because that's under 8kb that way
61 |
62 | NT_EndOfNames = 0xFF, // marks the end of the list of names of a character
63 | };
64 |
65 |
66 |
67 | struct ucd_header // header of the compiled Unicode data: signature, versions and table offsets
68 | {
69 | char f_magic[4] = { 'U', 'C', 'D', 'B' }; // signature identifying the file
70 | time_t f_timestamp = 0; // time when this file was generated; NOTE(review): the width of time_t is platform dependent, which matters if this struct is saved as is -- confirm
71 | std::uint8_t f_version = 0; // version of this file format
72 | std::uint8_t f_ucd_version[3] = { 1, 1, 0 }; // version of source -- e.g. 5 2 0 for Unicode 5.2.0
73 | std::uint32_t f_characters = 0; // offset to character table
74 | std::uint32_t f_strings = 0; // offset to string table
75 | std::uint32_t f_decomposition = 0; // offset to decomposition table
76 | };
77 |
78 |
79 |
80 | typedef std::uint8_t flags_t; // bit set stored in ucd_character::f_flags; bits 0x40 and 0x80 are not assigned here
81 |
82 | constexpr flags_t UCD_FLAG_DIGIT = 0x01; // represents a number (presumably Numeric_Type::NT_Digit -- confirm)
83 | constexpr flags_t UCD_FLAG_DECIMAL = 0x02; // represents a number (presumably Numeric_Type::NT_Decimal -- confirm)
84 | constexpr flags_t UCD_FLAG_NUMERIC = 0x04; // represents a number (presumably Numeric_Type::NT_Numeric -- confirm)
85 | constexpr flags_t UCD_FLAG_BIDI_MIRROR = 0x08; // mirror of another letter left to right vs. right to left
86 | constexpr flags_t UCD_FLAG_CONTROL = 0x10; // NOTE(review): meaning not documented; presumably a control character -- confirm
87 | constexpr flags_t UCD_FLAG_PRIVATE = 0x20; // NOTE(review): presumably a private use code point (see unicode_character::is_private()) -- confirm
88 |
89 |
90 |
91 | struct ucd_character // properties of one character; presumably one entry of the character table -- confirm
92 | {
93 | // initialization happens in a non-virtual function, otherwise it
94 | // would break the binary use of the structure
95 | //
96 | void initialize_ucd_character()
97 | {
98 | f_code = NOT_A_CHARACTER;
99 | f_names = 0;
100 | f_flags = 0;
101 | // every property starts with its "unknown" / zero value
102 | f_general_category = General_Category::GC_Unknown_Category;
103 | f_canonical_combining_class = Canonical_Combining_Class::CCC_Not_Reordered;
104 | f_bidi_class = Bidi_Class::BC_Unknown; // see flags for mirror info
105 | f_decomposition_type = static_cast(Decomposition_Type::DT_unknown);
106 | f_decomposition_length = 0;
107 | f_decomposition_mapping = 0;
108 | f_age[0] = 1; // both age bytes default to 1
109 | f_age[1] = 1;
110 | }
111 | // the leading /* N */ comments give the width of each field in bits
112 | /* 32 */ char32_t f_code;
113 | /* 32 */ std::uint32_t f_names; // offset to string table
114 | /* 8 */ flags_t f_flags; // combination of the UCD_FLAG_... bits
115 | /* 8 */ General_Category f_general_category;
116 | /* 8 */ Canonical_Combining_Class f_canonical_combining_class;
117 | /* 8 */ Bidi_Class f_bidi_class;
118 | /* 5 */ std::uint32_t f_decomposition_type : 5; // a Decomposition_Type value; the three bit fields add up to 32 bits
119 | /* 5 */ std::uint32_t f_decomposition_length : 5;
120 | /* 22 */ std::uint32_t f_decomposition_mapping : 22; // presumably an offset in the decomposition table -- confirm
121 | /* 16 */ std::uint8_t f_age[2]; // presumably the major and minor Unicode version that introduced the character -- confirm
122 | };
123 |
124 |
125 | // The f_names is an offset in the string table.
126 | //
127 | // Each name is defined as:
128 | //
129 | // struct name_t
130 | // {
131 | // Name_Type f_type;
132 | // uint8_t f_size;
133 | // char8_t f_name[f_size];
134 | // };
135 | //
136 | // Names are not null terminated. The type and size bytes are
137 | // followed by UTF-8 until the next byte representing a Name_Type; the
138 | // list of names ends with the special type NT_EndOfNames.
139 | //
140 | // The first name is the corrected name of the character.
141 | //
142 | // Following are the other Name_Type names.
143 | //
144 | // The numeric entries are actually two 64 bit numbers (numerator and
145 | // denominator). The size will always be 16 bytes, but the alignment
146 | // is likely going to be "wrong" (although that should not matter much
147 | // on Intel and ARM processors).
148 |
149 |
150 |
151 |
152 | } // detail namespace
153 |
154 | } // libutf8 namespace
155 | // vim: ts=4 sw=4 et
156 |
--------------------------------------------------------------------------------
/libutf8/version.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | /** \file
21 | * \brief The UTF-8 library is used to convert C++ strings.
22 | *
23 | * This file shows the UTF-8 library version.
24 | *
25 | * The `#define`s give you the library version at the time you are compiling.
26 | * The functions allow you to retrieve the version of a dynamically linked
27 | * library.
28 | */
29 |
30 | // self
31 | //
32 | #include "libutf8/version.h"
33 |
34 |
35 | // last include
36 | //
37 | #include
38 |
39 |
40 |
41 | namespace libutf8
42 | {
43 |
44 |
45 |
46 |
47 | /** \brief Get the major version of the library.
48 | *
49 | * This function returns the major version of the running library (the
50 | * one you are linked against at runtime).
51 | *
52 | * \return The major version.
53 | */
54 | int get_major_version()
55 | {
56 | return LIBUTF8_VERSION_MAJOR; // the value is fixed when this file is compiled, i.e. when the library is built
57 | }
58 |
59 |
60 | /** \brief Get the release (minor) version of the library.
61 | *
62 | * This function returns the minor version of the running library
63 | * (the one you are linked against at runtime). The function is named
64 | * "release" but the value returned is LIBUTF8_VERSION_MINOR.
65 | * \return The minor (release) version.
66 | */
67 | int get_release_version()
68 | {
69 | return LIBUTF8_VERSION_MINOR; // the value is fixed when this file is compiled, i.e. when the library is built
70 | }
71 |
72 |
73 | /** \brief Get the patch version of the library.
74 | *
75 | * This function returns the patch version of the running library
76 | * (the one you are linked against at runtime).
77 | *
78 | * \return The patch version.
79 | */
80 | int get_patch_version()
81 | {
82 | return LIBUTF8_VERSION_PATCH; // the value is fixed when this file is compiled, i.e. when the library is built
83 | }
84 |
85 |
86 | /** \brief Get the full version of the library as a string.
87 | *
88 | * This function returns the major, minor, and patch versions of the
89 | * running library (the one you are linked against at runtime) in the
90 | * form of a string.
91 | *
92 | * The build version is not made available. In most cases we change
93 | * the build version only to run a new build, so no code will have
94 | * changed (some documentation and non-code files may change between
95 | * build versions; but the code will work exactly the same way.)
96 | *
97 | * \return The library version as a pointer to a string literal.
98 | */
99 | char const * get_version_string()
100 | {
101 | return LIBUTF8_VERSION_STRING; // "major.minor.patch" as assembled in version.h.in
102 | }
103 |
104 |
105 | } // libutf8 namespace
106 | // vim: ts=4 sw=4 et
107 |
--------------------------------------------------------------------------------
/libutf8/version.h.in:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 | /** \file
22 | * \brief Definitions of the libutf8 version.
23 | *
24 | * This header includes the libutf8 library version and functions you
25 | * can use to check the current version of the library.
26 | */
27 |
28 |
29 | #define LIBUTF8_VERSION_MAJOR @LIBUTF8_VERSION_MAJOR@
30 | #define LIBUTF8_VERSION_MINOR @LIBUTF8_VERSION_MINOR@
31 | #define LIBUTF8_VERSION_PATCH @LIBUTF8_VERSION_PATCH@
32 | #define LIBUTF8_VERSION_STRING "@LIBUTF8_VERSION_MAJOR@.@LIBUTF8_VERSION_MINOR@.@LIBUTF8_VERSION_PATCH@"
33 |
34 | namespace libutf8
35 | {
36 |
37 |
38 | int get_major_version(); // returns the LIBUTF8_VERSION_MAJOR the library was built with
39 | int get_release_version(); // returns the LIBUTF8_VERSION_MINOR the library was built with
40 | int get_patch_version(); // returns the LIBUTF8_VERSION_PATCH the library was built with
41 | char const * get_version_string(); // returns the LIBUTF8_VERSION_STRING the library was built with
42 |
43 |
44 |
45 | } // libutf8 namespace
46 | // vim: ts=4 sw=4 et
47 |
--------------------------------------------------------------------------------
/mk:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # See the snapcmakemodules project for details about this script
4 | # https://github.com/m2osw/snapcmakemodules
5 |
6 | if test -x ../../cmake/scripts/mk # relative path: only found when run from the project directory
7 | then
8 | ../../cmake/scripts/mk "$@" # "$@" forwards each argument unchanged, even when it contains spaces
9 | else
10 | echo "error: could not locate the cmake mk script"
11 | exit 1
12 | fi
13 |
14 |
--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | #
3 | # https://snapwebsites.org/project/libutf8
4 | # contact@m2osw.com
5 | #
6 | # This program is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | ##
21 | ## libutf8 library unit tests
22 | ##
23 | project(unittest)
24 |
25 | find_package(SnapCatch2) # not REQUIRED: without SnapCatch2 no test target is created
26 |
27 | if(SnapCatch2_FOUND)
28 |
29 | add_executable(${PROJECT_NAME}
30 | catch_main.cpp
31 |
32 | catch_bom.cpp
33 | catch_caseinsensitive.cpp
34 | catch_character.cpp
35 | catch_iterator.cpp
36 | catch_json_tokens.cpp
37 | catch_length.cpp
38 | catch_stream.cpp
39 | catch_string.cpp
40 | catch_valid.cpp
41 | catch_version.cpp
42 | )
43 |
44 | target_include_directories(${PROJECT_NAME}
45 | PUBLIC
46 | ${CMAKE_BINARY_DIR}
47 | ${PROJECT_SOURCE_DIR}
48 | ${SNAPCATCH2_INCLUDE_DIRS}
49 | ${LIBEXCEPT_INCLUDE_DIRS} # NOTE(review): not set in this file; presumably defined by a parent CMakeLists.txt -- confirm
50 | )
51 |
52 | target_link_libraries(${PROJECT_NAME}
53 | utf8
54 | ${SNAPCATCH2_LIBRARIES}
55 | )
56 |
57 | else(SnapCatch2_FOUND)
58 |
59 | message("snapcatch2 not found... no test will be built.")
60 |
61 | endif(SnapCatch2_FOUND)
62 |
63 | if(SnapCatch2_FOUND)
64 |
65 | find_package(SnapTestRunner)
66 | AddUnitTestsTarget( # NOTE(review): presumably provided by SnapTestRunner -- confirm
67 | PROJECT_NAME
68 | rununittests
69 | )
70 |
71 | endif(SnapCatch2_FOUND)
72 |
73 | # vim: ts=4 sw=4 et
74 |
--------------------------------------------------------------------------------
/tests/catch_bom.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | // libutf8
21 | //
22 | #include
23 | #include
24 |
25 |
26 | // unit test
27 | //
28 | #include "catch_main.h"
29 |
30 |
31 | // C++
32 | //
33 | #include
34 | #include
35 |
36 |
37 | // last include
38 | //
39 | #include
40 |
41 |
42 |
43 | CATCH_TEST_CASE("bom", "[characters],[bom]") // checks libutf8::BOM_CHAR and the result of libutf8::start_with_bom()
44 | {
45 | CATCH_START_SECTION("bom: Verify the BOM character")
46 | CATCH_REQUIRE(libutf8::BOM_CHAR == 0xFEFF);
47 | CATCH_END_SECTION()
48 |
49 | CATCH_START_SECTION("bom: Verify with a string that's too small")
50 | {
51 | CATCH_REQUIRE(libutf8::start_with_bom(nullptr, rand()) == libutf8::bom_t::BOM_NONE); // a null pointer gives BOM_NONE whatever the length
52 | CATCH_REQUIRE(libutf8::start_with_bom("", 0) == libutf8::bom_t::BOM_NONE);
53 | CATCH_REQUIRE(libutf8::start_with_bom("a", 1) == libutf8::bom_t::BOM_NONE);
54 | }
55 | CATCH_END_SECTION()
56 |
57 | CATCH_START_SECTION("bom: Verify the five BOMs as is")
58 | {
59 | char buf[4];
60 | char32_t const bom(libutf8::BOM_CHAR);
61 |
62 | // UTF-8
63 | buf[0] = static_cast((bom >> 12) | 0xE0);
64 | buf[1] = static_cast(((bom >> 6) & 0x3F) | 0x80);
65 | buf[2] = static_cast(((bom >> 0) & 0x3F) | 0x80);
66 | buf[3] = '?';
67 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF8);
68 |
69 | // UTF-16 Little Endian (with a zero in the next 2 bytes; only two zeroes make it the UTF-32 LE BOM)
70 | buf[0] = static_cast(bom >> 0);
71 | buf[1] = static_cast(bom >> 8);
72 | buf[2] = static_cast(0x00);
73 | buf[3] = static_cast(0x34);
74 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
75 |
76 | // UTF-16 Little Endian (with a zero in the next 2 bytes)
77 | buf[0] = static_cast(bom >> 0);
78 | buf[1] = static_cast(bom >> 8);
79 | buf[2] = static_cast(0x12);
80 | buf[3] = static_cast(0x00);
81 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
82 |
83 | // UTF-16 Little Endian (no zero in the next 2 bytes)
84 | buf[0] = static_cast(bom >> 0);
85 | buf[1] = static_cast(bom >> 8);
86 | buf[2] = static_cast(0x12);
87 | buf[3] = static_cast(0x34);
88 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
89 |
90 | // UTF-16 Big Endian (no zero in the next 2 bytes)
91 | buf[0] = static_cast(bom >> 8);
92 | buf[1] = static_cast(bom >> 0);
93 | buf[2] = static_cast(0xAB);
94 | buf[3] = static_cast(0xCD);
95 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
96 |
97 | // UTF-16 Big Endian (with a zero in the next 2 bytes)
98 | buf[0] = static_cast(bom >> 8);
99 | buf[1] = static_cast(bom >> 0);
100 | buf[2] = static_cast(0x00);
101 | buf[3] = static_cast(0xCD);
102 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
103 |
104 | // UTF-16 Big Endian (with a zero in the next 2 bytes)
105 | buf[0] = static_cast(bom >> 8);
106 | buf[1] = static_cast(bom >> 0);
107 | buf[2] = static_cast(0xAB);
108 | buf[3] = static_cast(0x00);
109 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
110 |
111 | // UTF-32 Little Endian
112 | buf[0] = static_cast(bom >> 0);
113 | buf[1] = static_cast(bom >> 8);
114 | buf[2] = static_cast(bom >> 16);
115 | buf[3] = static_cast(bom >> 24);
116 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_LE);
117 |
118 | // UTF-32 Big Endian
119 | buf[0] = static_cast(bom >> 24);
120 | buf[1] = static_cast(bom >> 16);
121 | buf[2] = static_cast(bom >> 8);
122 | buf[3] = static_cast(bom >> 0);
123 | CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_BE);
124 | }
125 | CATCH_END_SECTION()
126 |
127 | CATCH_START_SECTION("bom: Verify strings without a BOM")
128 | {
129 | char buf[4];
130 |
131 | // unknown 1 byte (well... 1 byte is never really known...)
132 | buf[0] = '?';
133 | CATCH_REQUIRE(libutf8::start_with_bom(buf, 1) == libutf8::bom_t::BOM_NONE);
134 |
135 | // unknown 2 bytes
136 | buf[0] = 'Q';
137 | buf[1] = '?';
138 | CATCH_REQUIRE(libutf8::start_with_bom(buf, 2) == libutf8::bom_t::BOM_NONE);
139 |
140 | // unknown 3 bytes
141 | buf[0] = 'B';
142 | buf[1] = 'O';
143 | buf[2] = 'M';
144 | CATCH_REQUIRE(libutf8::start_with_bom(buf, 3) == libutf8::bom_t::BOM_NONE);
145 |
146 | // unknown 4 bytes
147 | buf[0] = 'B';
148 | buf[1] = 'O';
149 | buf[2] = 'M';
150 | buf[3] = '?';
151 | CATCH_REQUIRE(libutf8::start_with_bom(buf, 4) == libutf8::bom_t::BOM_NONE);
152 | }
153 | CATCH_END_SECTION()
154 |
155 | CATCH_START_SECTION("bom: Verify u32string that starts with a BOM (CPU Endianness)")
156 | {
157 | std::u32string u32str;
158 | u32str += libutf8::BOM_CHAR;
159 | u32str += unittest::rand_char(true);
160 | size_t const size(u32str.length() * sizeof(std::u32string::value_type)); // two char32_t, so size is 8 bytes
161 | for(int idx(static_cast(size)); idx >= 0; --idx) // try every length from the whole buffer down to zero bytes
162 | {
163 | if(static_cast(idx) >= sizeof(std::u32string::value_type))
164 | {
165 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
166 | CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_BE);
167 | #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
168 | CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_LE);
169 | #else
170 | #error "Unsupported endianness"
171 | #endif
172 | }
173 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
174 | else if(static_cast(idx) >= sizeof(std::u16string::value_type)) // 2 or 3 bytes: in little endian the buffer starts like a UTF-16 LE BOM
175 | {
176 | CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF16_LE);
177 | }
178 | #endif
179 | else
180 | {
181 | // too short
182 | //
183 | CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast(u32str.c_str()), idx) == libutf8::bom_t::BOM_NONE);
184 | }
185 | }
186 | }
187 | CATCH_END_SECTION()
188 | }
189 |
190 |
191 | // vim: ts=4 sw=4 et
192 |
--------------------------------------------------------------------------------
/tests/catch_caseinsensitive.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | // libutf8
21 | //
22 | #include
23 |
24 |
25 | // unit test
26 | //
27 | #include "catch_main.h"
28 |
29 |
30 | // C++
31 | //
32 | #include
33 | #include
34 |
35 |
36 | // last include
37 | //
38 | #include
39 |
40 |
41 |
42 | namespace
43 | {
44 |
45 | // get_time(): format the current local time, save it in result and return it with " PST" appended
46 | libutf8::case_insensitive_string get_time(std::string & result)
47 | {
48 | time_t const now(time(nullptr));
49 | struct tm t;
50 | localtime_r(&now, &t);
51 | char buf[256];
52 | strftime(buf, sizeof(buf), "%T", &t); // %T is the time as HH:MM:SS
53 | buf[sizeof(buf) - 1] = '\0'; // defensive: make sure the buffer is terminated
54 | result = buf;
55 | libutf8::case_insensitive_string r(buf);
56 | r += " PST"; // fixed suffix expected by the tests; it is not derived from the actual time zone
57 | return r;
58 | }
59 | // get_date(): same as get_time() with the date, but returned as a plain std::string
60 | std::string get_date(std::string & result)
61 | {
62 | time_t const now(time(nullptr));
63 | struct tm t;
64 | localtime_r(&now, &t);
65 | char buf[256];
66 | strftime(buf, sizeof(buf), "%F", &t); // %F is the date as YYYY-MM-DD
67 | buf[sizeof(buf) - 1] = '\0'; // defensive: make sure the buffer is terminated
68 | result = buf;
69 | libutf8::case_insensitive_string r(buf);
70 | r += " plus a few days"; // fixed suffix expected by the tests
71 | return r; // the case_insensitive_string is converted to the std::string return type here
72 | }
73 |
74 |
75 |
76 | }
77 |
78 |
79 |
80 | CATCH_TEST_CASE("case_insensitive", "[string],[compare],[insensitive]")
81 | {
82 | CATCH_START_SECTION("case_insensitive: Verify Case Insensitive String Constructors")
83 | {
84 | {
85 | libutf8::case_insensitive_string empty;
86 | CATCH_REQUIRE(empty.empty());
87 | }
88 |
89 | {
90 | std::allocator allocator;
91 | libutf8::case_insensitive_string empty(allocator);
92 | CATCH_REQUIRE(empty.empty());
93 | }
94 |
95 | {
96 | libutf8::case_insensitive_string dashes(10, '-');
97 | CATCH_REQUIRE(dashes == "----------");
98 | }
99 |
100 | {
101 | libutf8::case_insensitive_string name("alexis");
102 | CATCH_REQUIRE(name == "alexis");
103 | }
104 |
105 | {
106 | libutf8::case_insensitive_string name("alexis", 4);
107 | CATCH_REQUIRE(name == "alex");
108 | }
109 |
110 | {
111 | libutf8::case_insensitive_string name("alexis");
112 | CATCH_REQUIRE(name == "alexis");
113 |
114 | libutf8::case_insensitive_string section(name, 2);
115 | CATCH_REQUIRE(section == "exis");
116 | }
117 |
118 | {
119 | libutf8::case_insensitive_string name("alexis");
120 | CATCH_REQUIRE(name == "alexis");
121 |
122 | libutf8::case_insensitive_string section(name, 2, 2);
123 | CATCH_REQUIRE(section == "ex");
124 | }
125 |
126 | {
127 | std::string name("alexis");
128 | CATCH_REQUIRE(name == "alexis");
129 |
130 | libutf8::case_insensitive_string section(name, 2);
131 | CATCH_REQUIRE(section == "exis");
132 | }
133 |
134 | {
135 | std::string name("alexis");
136 | CATCH_REQUIRE(name == "alexis");
137 |
138 | libutf8::case_insensitive_string section(name, 2, 2);
139 | CATCH_REQUIRE(section == "ex");
140 | }
141 |
142 | {
143 | libutf8::case_insensitive_string name("alexis");
144 | CATCH_REQUIRE(name == "alexis");
145 |
146 | libutf8::case_insensitive_string section(name.begin() + 2, name.end() - 2);
147 | CATCH_REQUIRE(section == "ex");
148 | }
149 |
150 | {
151 | std::string name("alexis");
152 | CATCH_REQUIRE(name == "alexis");
153 |
154 | libutf8::case_insensitive_string full(name);
155 | CATCH_REQUIRE(full == "alexis");
156 | }
157 |
158 | {
159 | libutf8::case_insensitive_string name("alexis");
160 | CATCH_REQUIRE(name == "alexis");
161 |
162 | libutf8::case_insensitive_string full(name);
163 | CATCH_REQUIRE(full == "alexis");
164 | }
165 |
166 | {
167 | libutf8::case_insensitive_string name({'a', 'l', 'e', 'x', 'i', 's'});
168 | CATCH_REQUIRE(name == "alexis");
169 | }
170 |
171 | {
172 | std::string expected("not this");
173 | libutf8::case_insensitive_string now(get_time(expected));
174 | CATCH_REQUIRE(expected + " PST" == now);
175 | }
176 |
177 | {
178 | std::allocator allocator;
179 | std::string expected("not this");
180 | libutf8::case_insensitive_string now(get_time(expected), allocator);
181 | CATCH_REQUIRE(expected + " PST" == now);
182 | }
183 |
184 | {
185 | std::string expected("not this");
186 | libutf8::case_insensitive_string now(get_date(expected));
187 | CATCH_REQUIRE(now == expected + " plus a few days");
188 | }
189 |
190 | {
191 | std::allocator allocator;
192 | std::string expected("not this");
193 | libutf8::case_insensitive_string now(get_date(expected), allocator);
194 | CATCH_REQUIRE(now == expected + " plus a few days");
195 | }
196 | }
197 | CATCH_END_SECTION()
198 |
199 | CATCH_START_SECTION("case_insensitive: Verify Case Insensitive String Comparators")
200 | {
201 | {
202 | libutf8::case_insensitive_string name1("Alexis");
203 | libutf8::case_insensitive_string name2("alexis");
204 | CATCH_REQUIRE(name1 == name2);
205 | CATCH_REQUIRE_FALSE(name1 != name2);
206 | CATCH_REQUIRE_FALSE(name1 > name2);
207 | CATCH_REQUIRE(name1 >= name2);
208 | CATCH_REQUIRE_FALSE(name1 < name2);
209 | CATCH_REQUIRE(name1 <= name2);
210 | }
211 |
212 | {
213 | libutf8::case_insensitive_string name1("Alexis");
214 | libutf8::case_insensitive_string name2("Wilke");
215 | CATCH_REQUIRE_FALSE(name1 == name2);
216 | CATCH_REQUIRE(name1 != name2);
217 | CATCH_REQUIRE_FALSE(name1 > name2);
218 | CATCH_REQUIRE_FALSE(name1 >= name2);
219 | CATCH_REQUIRE(name1 < name2);
220 | CATCH_REQUIRE(name1 <= name2);
221 | }
222 |
223 | {
224 | libutf8::case_insensitive_string name1("Alexis");
225 | std::string name2("alexis");
226 | CATCH_REQUIRE(name1 == name2);
227 | CATCH_REQUIRE_FALSE(name1 != name2);
228 | CATCH_REQUIRE_FALSE(name1 > name2);
229 | CATCH_REQUIRE(name1 >= name2);
230 | CATCH_REQUIRE_FALSE(name1 < name2);
231 | CATCH_REQUIRE(name1 <= name2);
232 | }
233 |
234 | {
235 | std::string name1("Alexis");
236 | libutf8::case_insensitive_string name2("Wilke");
237 | CATCH_REQUIRE_FALSE(name1 == name2);
238 | CATCH_REQUIRE(name1 != name2);
239 | CATCH_REQUIRE_FALSE(name1 > name2);
240 | CATCH_REQUIRE_FALSE(name1 >= name2);
241 | CATCH_REQUIRE(name1 < name2);
242 | CATCH_REQUIRE(name1 <= name2);
243 | }
244 |
245 | {
246 | libutf8::case_insensitive_string name1("Alexis");
247 | CATCH_REQUIRE(name1 == "alexis");
248 | CATCH_REQUIRE_FALSE(name1 != "alexis");
249 | CATCH_REQUIRE_FALSE(name1 > "alexis");
250 | CATCH_REQUIRE(name1 >= "alexis");
251 | CATCH_REQUIRE_FALSE(name1 < "alexis");
252 | CATCH_REQUIRE(name1 <= "alexis");
253 | }
254 |
255 | {
256 | libutf8::case_insensitive_string name2("Wilke");
257 | CATCH_REQUIRE_FALSE("Alexis" == name2);
258 | CATCH_REQUIRE("Alexis" != name2);
259 | CATCH_REQUIRE_FALSE("Alexis" > name2);
260 | CATCH_REQUIRE_FALSE("Alexis" >= name2);
261 | CATCH_REQUIRE("Alexis" < name2);
262 | CATCH_REQUIRE("Alexis" <= name2);
263 | }
264 | }
265 | CATCH_END_SECTION()
266 | }
267 |
268 |
269 | // vim: ts=4 sw=4 et
270 |
--------------------------------------------------------------------------------
/tests/catch_iterator.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | // libutf8
21 | //
22 | #include
23 |
24 | #include
25 | #include
26 |
27 |
28 | // unit test
29 | //
30 | #include "catch_main.h"
31 |
32 |
33 | // C++
34 | //
35 | #include
36 | #include
37 |
38 |
39 | // last include
40 | //
41 | #include
42 |
43 |
44 |
45 | CATCH_TEST_CASE("libutf8_iterator", "[iterator]") // walks utf8_iterator forward and backward over every valid code point, one plane at a time
46 | {
47 | CATCH_START_SECTION("libutf8_iterator: valid iterators tests")
48 | {
49 | char32_t p(0); // first code point of a randomly selected plane; that plane gets the extra `*it++` / `*--it` checks
50 | do
51 | {
52 | p = rand() % 0x11 * 0x10000;
53 | }
54 | while(p == 0 || (p >= 0xD800 && p <= 0xDFFF)); // p is a multiple of 0x10000 so only the `p == 0` part can ever be true
55 |
56 | for(char32_t plan(0); plan < 0x110000; plan += 0x10000)
57 | {
58 | // build one string holding the UTF-8 encoding of every code point
59 | // of the current plane (`plan`), skipping the UTF-16 surrogates
60 | std::string str;
61 | str.reserve(0x10000 * 4);
62 | for(char32_t wc(0); wc < 0x10000; ++wc)
63 | {
64 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
65 | {
66 | wc = 0xDFFF; // the ++wc of the loop then resumes at 0xE000
67 | continue;
68 | }
69 | char buf[libutf8::MBS_MIN_BUFFER_LENGTH];
70 | CATCH_REQUIRE(libutf8::wctombs(buf, wc + plan, sizeof(buf)) >= 1);
71 | if(plan == 0 && wc == 0)
72 | {
73 | // this is a special case as buf[0] = '\0' and the += with
74 | // the string won't work
75 | //
76 | str += '\0';
77 | }
78 | else
79 | {
80 | str += buf; // presumably wctombs() null terminates buf -- TODO confirm
81 | }
82 | }
83 | //std::cerr << "-------------- Plan " << static_cast<int>(plan) << " String ready " << str.length() << " ...\n";
84 |
85 | { // checks run on every plane: comparisons against std::string iterators, ++it / --it and clamping at both ends
86 | libutf8::utf8_iterator it(str);
87 | libutf8::utf8_iterator it_end(str, true); // presumably positioned at the end of str; only used to check `it != it_end`
88 | libutf8::utf8_iterator it_next(str);
89 | ++it_next;
90 |
91 | CATCH_REQUIRE(it == str.begin());
92 | CATCH_REQUIRE(it == str.cbegin());
93 | CATCH_REQUIRE(it != str.end());
94 | CATCH_REQUIRE(it != str.cend());
95 |
96 | CATCH_REQUIRE(it == it);
97 | CATCH_REQUIRE(it != it_end);
98 | CATCH_REQUIRE(it != it_next);
99 |
100 | CATCH_REQUIRE(str.begin() == it);
101 | CATCH_REQUIRE(str.cbegin() == it);
102 | CATCH_REQUIRE(str.end() != it);
103 | CATCH_REQUIRE(str.cend() != it);
104 |
105 | for(char32_t wc(0); wc < 0x10000; ++wc)
106 | {
107 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
108 | {
109 | wc = 0xDFFF;
110 | continue;
111 | }
112 | CATCH_REQUIRE(*it == wc + plan);
113 | ++it;
114 | }
115 |
116 | CATCH_REQUIRE(it != str.begin());
117 | CATCH_REQUIRE(it != str.cbegin());
118 | CATCH_REQUIRE(it == str.end());
119 | CATCH_REQUIRE(it == str.cend());
120 |
121 | CATCH_REQUIRE(str.begin() != it);
122 | CATCH_REQUIRE(str.cbegin() != it);
123 | CATCH_REQUIRE(str.end() == it);
124 | CATCH_REQUIRE(str.cend() == it);
125 |
126 | CATCH_REQUIRE(*it == libutf8::EOS);
127 | ++it; // incrementing at the end is expected to leave the iterator at the end
128 | it++;
129 | CATCH_REQUIRE(it == str.cend());
130 |
131 | for(char32_t wc(0x10000); wc > 0; )
132 | {
133 | --wc;
134 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
135 | {
136 | wc = 0xD800; // the --wc of the next iteration resumes at 0xD7FF
137 | continue;
138 | }
139 | --it;
140 | CATCH_REQUIRE(*it == wc + plan);
141 | }
142 |
143 | --it; // decrementing at the start must not put the iterator in the bad state
144 | it--;
145 |
146 | CATCH_REQUIRE(it.good());
147 | CATCH_REQUIRE_FALSE(it.bad());
148 | }
149 |
150 | if(plan == p) // randomly selected plane only: same walk using `*it++` and `*--it`
151 | {
152 | libutf8::utf8_iterator it(str);
153 |
154 | for(char32_t wc(0); wc < 0x10000; ++wc)
155 | {
156 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
157 | {
158 | wc = 0xDFFF;
159 | continue;
160 | }
161 | CATCH_REQUIRE(*it++ == wc + plan);
162 | }
163 |
164 | CATCH_REQUIRE(it == str.end());
165 | it++;
166 | CATCH_REQUIRE(it.good());
167 | CATCH_REQUIRE_FALSE(it.bad());
168 | ++it;
169 | CATCH_REQUIRE(it.good());
170 | CATCH_REQUIRE_FALSE(it.bad());
171 | CATCH_REQUIRE(it == str.end());
172 | CATCH_REQUIRE(it.good());
173 | CATCH_REQUIRE_FALSE(it.bad());
174 |
175 | for(char32_t wc(0x10000); wc > 0; )
176 | {
177 | --wc;
178 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
179 | {
180 | wc = 0xD800;
181 | continue;
182 | }
183 | CATCH_REQUIRE(*--it == wc + plan);
184 | }
185 |
186 | CATCH_REQUIRE(it == str.begin());
187 | CATCH_REQUIRE(str.begin() == it);
188 | it--;
189 | --it;
190 | CATCH_REQUIRE(it == str.begin());
191 | CATCH_REQUIRE(str.begin() == it);
192 | }
193 |
194 | if(plan == (p + 0x10000) % 0x110000) // plane following the selected one (wraps to plane 0): iterator differences and rewind()
195 | {
196 | libutf8::utf8_iterator it(str);
197 | libutf8::utf8_iterator start(str);
198 | CATCH_REQUIRE(it - start == 0);
199 | CATCH_REQUIRE(start - it == 0);
200 |
201 | for(char32_t wc(0); wc < 0x10000; ++wc)
202 | {
203 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
204 | {
205 | wc = 0xDFFF;
206 | continue;
207 | }
208 | CATCH_REQUIRE(*it == wc + plan);
209 | it++;
210 |
211 | libutf8::utf8_iterator zero(it);
212 | zero.rewind(); // a rewound copy must compare equal to a fresh iterator
213 | CATCH_REQUIRE(zero == start);
214 | }
215 |
216 | libutf8::utf8_iterator copy(it);
217 | CATCH_REQUIRE(static_cast<std::size_t>(it - start) == str.length()); // cast target restored as std::size_t to match str.length()
218 | CATCH_REQUIRE(static_cast<std::size_t>(copy - start) == str.length());
219 | CATCH_REQUIRE(copy - it == 0);
220 | CATCH_REQUIRE(it - copy == 0);
221 | copy.rewind();
222 | CATCH_REQUIRE(copy - start == 0);
223 | CATCH_REQUIRE(start - copy == 0);
224 | CATCH_REQUIRE(static_cast<std::size_t>(start - copy) == 0);
225 | CATCH_REQUIRE(static_cast<std::size_t>(copy - start) == 0);
226 |
227 | for(char32_t wc(0x10000); wc > 0; )
228 | {
229 | --wc;
230 | if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
231 | {
232 | wc = 0xD800;
233 | continue;
234 | }
235 | it--;
236 | CATCH_REQUIRE(*it == wc + plan);
237 | }
238 | }
239 | }
240 | }
241 | CATCH_END_SECTION()
242 | }
243 |
244 |
245 | CATCH_TEST_CASE("libutf8_iterator_invalid_string", "[iterator],[invalid]") // utf8_iterator behavior on corrupted, truncated and out of range UTF-8
246 | {
247 | CATCH_START_SECTION("libutf8_iterator_invalid_string: iterators with invalid characters (bad UTF-8)")
248 | {
249 | for(int repeat(0); repeat < 100; ++repeat)
250 | {
251 | // build a string of STR_LENGTH random characters, all 0x80 or more,
252 | // so each one is encoded with a lead byte plus continuation bytes
253 | constexpr size_t STR_LENGTH = 4;
254 | char32_t wc;
255 | std::u32string wstr;
256 | wstr.reserve(STR_LENGTH);
257 | for(size_t idx(0); idx < STR_LENGTH; ++idx)
258 | {
259 | do
260 | {
261 | wc = unittest::rand_char(true);
262 | }
263 | while(wc < 0x80);
264 | wstr += wc;
265 | }
266 | std::string str(libutf8::to_u8string(wstr));
267 |
268 | //std::cerr << "-------------- String ready " << str.length() << " ...\n";
269 |
270 | // first verify that it works on the intact string and record
271 | // the byte offset at which each character starts
272 | std::string::size_type pos[STR_LENGTH];
273 | {
274 | libutf8::utf8_iterator it(str);
275 |
276 | CATCH_REQUIRE(it == str.begin());
277 | CATCH_REQUIRE(it == str.cbegin());
278 | CATCH_REQUIRE(it != str.end());
279 | CATCH_REQUIRE(it != str.cend());
280 |
281 | CATCH_REQUIRE(str.begin() == it);
282 | CATCH_REQUIRE(str.cbegin() == it);
283 | CATCH_REQUIRE(str.end() != it);
284 | CATCH_REQUIRE(str.cend() != it);
285 |
286 | for(size_t idx(0); idx < STR_LENGTH; ++idx)
287 | {
288 | CATCH_REQUIRE(*it == wstr[idx]);
289 | if(rand() % 2 == 0) // randomly exercise both operand orders of the iterator difference
290 | {
291 | pos[idx] = it - str.begin();
292 | }
293 | else
294 | {
295 | pos[idx] = -(str.begin() - it);
296 | }
297 | ++it;
298 | }
299 |
300 | CATCH_REQUIRE(it != str.begin());
301 | CATCH_REQUIRE(it != str.cbegin());
302 | CATCH_REQUIRE(it == str.end());
303 | CATCH_REQUIRE(it == str.cend());
304 |
305 | CATCH_REQUIRE(str.begin() != it);
306 | CATCH_REQUIRE(str.cbegin() != it);
307 | CATCH_REQUIRE(str.end() == it);
308 | CATCH_REQUIRE(str.cend() == it);
309 |
310 | CATCH_REQUIRE(*it == libutf8::EOS);
311 | ++it;
312 | it++;
313 | CATCH_REQUIRE(it == str.cend());
314 |
315 | CATCH_REQUIRE(it.good());
316 | CATCH_REQUIRE_FALSE(it.bad());
317 | }
318 |
319 | { // corrupt the second character and expect NOT_A_CHARACTER in its place
320 | libutf8::utf8_iterator it(str);
321 |
322 | str[pos[1]] = rand() % 0x40 + 0x80; // 0x80..0xBF: a continuation byte where a lead byte is required
323 |
324 | CATCH_REQUIRE(*it++ == wstr[0]);
325 | CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER); // we broke this one
326 | CATCH_REQUIRE(*it++ == wstr[2]);
327 | CATCH_REQUIRE(*it++ == wstr[3]);
328 | CATCH_REQUIRE(*it++ == libutf8::EOS);
329 |
330 | CATCH_REQUIRE_FALSE(it.good());
331 | CATCH_REQUIRE(it.bad());
332 | it.clear(); // clear() is expected to reset the bad state
333 | CATCH_REQUIRE(it.good());
334 | CATCH_REQUIRE_FALSE(it.bad());
335 | }
336 |
337 | { // additionally drop the last byte so the last character is truncated
338 | str.erase(str.length() - 1);
339 | libutf8::utf8_iterator it(str);
340 |
341 | str[pos[1]] = rand() % 0x40 + 0x80; // the second character was already broken above; it stays invalid
342 |
343 | CATCH_REQUIRE(*it++ == wstr[0]);
344 | CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER);
345 | CATCH_REQUIRE(*it++ == wstr[2]);
346 | CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER); // truncated last character
347 |
348 | CATCH_REQUIRE_FALSE(it.good());
349 | CATCH_REQUIRE(it.bad());
350 | it.clear();
351 | CATCH_REQUIRE(it.good());
352 | CATCH_REQUIRE_FALSE(it.bad());
353 | }
354 | }
355 | }
356 | CATCH_END_SECTION()
357 |
358 | CATCH_START_SECTION("libutf8_iterator_invalid_string: iterators with invalid characters (too large)")
359 | {
360 | for(char32_t wc(0x110000); wc < 0x1FFFFF; ++wc) // NOTE(review): `<` leaves 0x1FFFFF itself untested -- confirm this is intended
361 | {
362 | // since this character is not valid
363 | // we have to encode it _manually_
364 | // (4 byte form: 3 + 6 + 6 + 6 payload bits)
365 | char buf[4];
366 | buf[0] = 0xF0 | ((wc >> 18) & 0x07);
367 | buf[1] = 0x80 | ((wc >> 12) & 0x3F);
368 | buf[2] = 0x80 | ((wc >> 6) & 0x3F);
369 | buf[3] = 0x80 | ((wc >> 0) & 0x3F);
370 |
371 | std::string str(buf, 4);
372 |
373 | // the iterator must sit at the start of the string, report
374 | // NOT_A_CHARACTER for the sequence and switch to the bad state
375 | {
376 | libutf8::utf8_iterator it(str);
377 |
378 | CATCH_REQUIRE(it == str.begin());
379 | CATCH_REQUIRE(it == str.cbegin());
380 | CATCH_REQUIRE(it != str.end());
381 | CATCH_REQUIRE(it != str.cend());
382 |
383 | CATCH_REQUIRE(str.begin() == it);
384 | CATCH_REQUIRE(str.cbegin() == it);
385 | CATCH_REQUIRE(str.end() != it);
386 | CATCH_REQUIRE(str.cend() != it);
387 |
388 | CATCH_REQUIRE(*it == libutf8::NOT_A_CHARACTER);
389 |
390 | CATCH_REQUIRE_FALSE(it.good());
391 | CATCH_REQUIRE(it.bad());
392 | it.clear();
393 | CATCH_REQUIRE(it.good());
394 | CATCH_REQUIRE_FALSE(it.bad());
395 |
396 | ++it; // a single increment is expected to skip the whole 4 byte sequence
397 |
398 | CATCH_REQUIRE(it != str.begin());
399 | CATCH_REQUIRE(it != str.cbegin());
400 | CATCH_REQUIRE(it == str.end());
401 | CATCH_REQUIRE(it == str.cend());
402 |
403 | CATCH_REQUIRE(str.begin() != it);
404 | CATCH_REQUIRE(str.cbegin() != it);
405 | CATCH_REQUIRE(str.end() == it);
406 | CATCH_REQUIRE(str.cend() == it);
407 |
408 | CATCH_REQUIRE(*it == libutf8::EOS);
409 | ++it;
410 | it++;
411 | CATCH_REQUIRE(it == str.cend());
412 |
413 | CATCH_REQUIRE_FALSE(it.good()); // the test expects the bad state to be set again at this point
414 | CATCH_REQUIRE(it.bad());
415 | }
416 | }
417 | }
418 | CATCH_END_SECTION()
419 | }
420 |
421 |
422 |
423 | // vim: ts=4 sw=4 et
424 |
--------------------------------------------------------------------------------
/tests/catch_length.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | // libutf8
21 | //
22 | #include
23 | #include
24 |
25 |
26 | // unit test
27 | //
28 | #include "catch_main.h"
29 |
30 |
31 | // C++
32 | //
33 | #include
34 | #include
35 | #include
36 |
37 |
38 | // last include
39 | //
40 | #include
41 |
42 |
43 |
44 | CATCH_TEST_CASE("string_length", "[strings][valid][length][u8][u16][u32]") // the same random text must report the same character count in UTF-32, UTF-8 and UTF-16
45 | {
46 | CATCH_START_SECTION("string_length: length of valid Unicode strings")
47 | {
48 | for(int idx(0); idx < 100; ++idx) // 100 independent random strings
49 | {
50 | std::size_t const length(rand() % 100 + 1); // 1 to 100 characters
51 | std::u32string str32;
52 | for(std::size_t j(0); j < length; ++j)
53 | {
54 | char32_t const c(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE));
55 | str32 += c;
56 | }
57 | CATCH_REQUIRE(libutf8::is_valid_unicode(str32));
58 | CATCH_REQUIRE(str32.length() == length);
59 |
60 | std::string str8(libutf8::to_u8string(str32));
61 | CATCH_REQUIRE(libutf8::is_valid_utf8(str8));
62 | CATCH_REQUIRE(str8.length() >= length); // bytes: at least one per character
63 | CATCH_REQUIRE(libutf8::u8length(str8) == length);
64 |
65 | std::u16string str16(libutf8::to_u16string(str8));
66 | CATCH_REQUIRE(libutf8::is_valid_utf16(str16));
67 | CATCH_REQUIRE(str16.length() >= length); // code units: at least one per character
68 | CATCH_REQUIRE(static_cast<std::size_t>(libutf8::u16length(str16)) == length); // cast target restored as std::size_t, the type of `length`
69 | }
70 | }
71 | CATCH_END_SECTION()
72 | }
73 |
74 |
75 | CATCH_TEST_CASE("invalid_utf16_string_length", "[strings][invalid][length][u16]") // a stray surrogate must make the string invalid and its length -1
76 | {
77 | CATCH_START_SECTION("invalid_utf16_string_length: invalid UTF-16 returns -1 for length")
78 | {
79 | for(int idx(0); idx < 100; ++idx)
80 | {
81 | std::size_t const length(rand() % 30 + 5); // 5 to 34 valid characters
82 | char16_t bad_char(rand() & 0x03FF); // 10 random payload bits; the surrogate base is added in the switch
83 | std::size_t bad_pos(length / 2); // index of the character after which bad_char is inserted
84 | switch(idx % 3) // cycle through the three kinds of error
85 | {
86 | case 0:
87 | bad_char += 0xDC00; // low without a high
88 | break;
89 |
90 | case 1:
91 | bad_char += 0xD800; // high not followed by a low
92 | break;
93 |
94 | case 2:
95 | bad_char += 0xD800; // high followed by u'\0'
96 | bad_pos = length - 1; // i.e. the lone high surrogate is the very last code unit
97 | break;
98 |
99 | }
100 | std::u16string str16;
101 | for(std::size_t j(0); j < length; ++j)
102 | {
103 | char32_t const wc(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE));
104 | str16 += libutf8::to_u16string(wc);
105 | if(j == bad_pos)
106 | {
107 | str16 += bad_char; // inject the stray surrogate right after character j
108 | }
109 | }
110 |
111 | CATCH_REQUIRE_FALSE(libutf8::is_valid_utf16(str16));
112 | CATCH_REQUIRE(libutf8::u16length(str16) == -1);
113 | }
114 | }
115 | CATCH_END_SECTION()
116 | }
117 |
118 |
119 | // vim: ts=4 sw=4 et
120 |
--------------------------------------------------------------------------------
/tests/catch_main.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | // Tell catch we want it to add the runner code in this file.
21 | #define CATCH_CONFIG_RUNNER
22 |
23 | // self
24 | //
25 | #include "catch_main.h"
26 |
27 |
28 | // libutf8
29 | //
30 | #include
31 | #include
32 |
33 |
34 | // libexcept
35 | //
36 | #include
37 |
38 |
39 | // C++
40 | //
41 | #include
42 |
43 |
44 | // last include
45 | //
46 | #include
47 |
48 |
49 |
50 |
51 |
52 | int main(int argc, char * argv[]) // entry point of the test binary: everything is delegated to snap_catch2_main()
53 | {
54 | return SNAP_CATCH2_NAMESPACE::snap_catch2_main( // its return value becomes the process exit code
55 | "libutf8" // name of the project under test
56 | , LIBUTF8_VERSION_STRING // version of the library the tests were compiled against
57 | , argc // command line forwarded untouched
58 | , argv
59 | , []() { libexcept::set_collect_stack(libexcept::collect_stack_t::COLLECT_STACK_NO); } // callback; presumably turns off stack collection in libexcept exceptions -- TODO confirm when it gets called
60 | );
61 | }
62 |
63 |
64 | // vim: ts=4 sw=4 et
65 |
--------------------------------------------------------------------------------
/tests/catch_main.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2006-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 |
21 | // libutf8
22 | //
23 | #include // for the ostream
24 |
25 |
26 | // catch2
27 | //
28 | #include
29 |
30 |
31 | // C++
32 | //
33 | #include
34 | #include
35 | #include
36 | #include
37 |
38 |
39 | // last include
40 | //
41 | #include
42 |
43 |
44 |
45 | namespace SNAP_CATCH2_NAMESPACE
46 | {
47 |
48 | // Return a random code point which is never 0 and never a UTF-16 surrogate
49 | // (0xD800 to 0xDFFF). By default the result is below 0x10000; when
50 | // full_range is true it can be any value up to 0x10FFFF.
51 | inline char32_t rand_char(bool full_range = false)
52 | {
53 | char32_t const max((full_range ? 0x0110000 : 0x0010000) - (0xE000 - 0xD800));
54 | // max counts the allowed code points only, the surrogate gap is added back below
55 | char32_t wc;
56 | do
57 | {
58 | wc = ((static_cast<char32_t>(rand()) << 16) ^ static_cast<char32_t>(rand())) % max; // shift as unsigned: `rand() << 16` can overflow int (undefined before C++20)
59 | }
60 | while(wc == 0);
61 | if(wc >= 0xD800)
62 | {
63 | // skip the surrogates by moving everything at or above 0xD800
64 | // past the end of the surrogate block
65 | wc += 0xE000 - 0xD800;
66 | }
67 |
68 | return wc;
69 | }
70 |
71 |
72 |
73 | }
74 | // namespace SNAP_CATCH2_NAMESPACE
75 | // vim: ts=4 sw=4 et
76 |
--------------------------------------------------------------------------------
/tests/catch_stream.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | // libutf8
21 | //
22 | #include
23 |
24 |
25 | // unit test
26 | //
27 | #include "catch_main.h"
28 |
29 |
30 | // C++
31 | //
32 | #include
33 | #include
34 | #include
35 |
36 |
37 | // last include
38 | //
39 | #include
40 |
41 |
42 |
43 | CATCH_TEST_CASE("stream", "[stream][valid]")
44 | {
45 | CATCH_START_SECTION("stream: write a char32_t to a stream")
46 | {
47 | // send the UTF-8 form of 1000 random characters through a string stream
48 | int remaining = 1000;
49 | while(remaining-- > 0)
50 | {
51 | char32_t const wc = SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE);
52 | std::stringstream ss;
53 | ss << libutf8::to_u8string(wc);
54 | CATCH_REQUIRE(ss.str() == libutf8::to_u8string(wc));
55 | // the stream content has to be exactly the UTF-8 encoding of wc
56 | }}
57 | CATCH_END_SECTION()
58 | }
59 |
60 |
61 |
62 | // vim: ts=4 sw=4 et
63 |
--------------------------------------------------------------------------------
/tests/catch_valid.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2013-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | // libutf8
21 | //
22 | #include
23 | #include
24 |
25 |
26 | // unit test
27 | //
28 | #include "catch_main.h"
29 |
30 |
31 | // snapdev
32 | //
33 | #include
34 |
35 |
36 | // C++
37 | //
38 | #include
39 | #include
40 | #include
41 |
42 |
43 | // last include
44 | //
45 | #include
46 |
47 |
48 |
49 | CATCH_TEST_CASE("make_valid", "[strings][valid][u8]") // make_u8string_valid() must replace a truncated UTF-8 sequence with the fix character
50 | {
51 | CATCH_START_SECTION("make_valid: test bad encoding (1 byte when 2 necessary)")
52 | {
53 | for(char32_t two_bytes(0x80); two_bytes < 0x800; ++two_bytes) // every code point needing 2 bytes
54 | {
55 | char const byte1(static_cast<char>((two_bytes >> 6) | 0xC0)); // lead byte only, the continuation byte is left out
56 | char32_t const vc1(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
57 | char32_t const vc2(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
58 | char32_t const fix_char(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
59 | std::string invalid_string;
60 | invalid_string += vc1; // NOTE(review): presumably uses a std::string += char32_t operator from libutf8; a plain char append would truncate -- confirm
61 | invalid_string += byte1;
62 | invalid_string += vc2;
63 | std::string expected_string;
64 | expected_string += vc1;
65 | expected_string += fix_char;
66 | expected_string += vc2;
67 | CATCH_REQUIRE_FALSE(libutf8::make_u8string_valid(invalid_string, fix_char)); // false is expected since the input needed fixing
68 | CATCH_REQUIRE(invalid_string == expected_string);
69 | }
70 | }
71 | CATCH_END_SECTION()
72 |
73 | CATCH_START_SECTION("make_valid: test bad encoding (2 bytes when 3 necessary)")
74 | {
75 | for(char32_t two_bytes(0x800); two_bytes < 0x10000; ++two_bytes) // despite its name, two_bytes is a 3 byte code point here
76 | {
77 | // Note: this includes the UTF-16 surrogates which are also
78 | // considered invalid
79 | //
80 | char const byte1(static_cast<char>((two_bytes >> 12) | 0xE0));
81 | char const byte2(((two_bytes >> 6) & 0x3F) | 0x80);
82 | char32_t const vc1(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
83 | char32_t const vc2(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
84 | char32_t const fix_char(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
85 | std::string invalid_string;
86 | invalid_string += vc1;
87 | invalid_string += byte1;
88 | invalid_string += byte2; // the third byte is left out
89 | invalid_string += vc2;
90 | std::string expected_string;
91 | expected_string += vc1;
92 | expected_string += fix_char;
93 | expected_string += vc2;
94 | CATCH_REQUIRE_FALSE(libutf8::make_u8string_valid(invalid_string, fix_char));
95 | CATCH_REQUIRE(invalid_string == expected_string);
96 | }
97 | }
98 | CATCH_END_SECTION()
99 |
100 | CATCH_START_SECTION("make_valid: test bad encoding (3 bytes when 4 necessary)")
101 | {
102 | for(char32_t two_bytes(0x10000); two_bytes < 0x110000; ++two_bytes) // despite its name, two_bytes is a 4 byte code point here
103 | {
104 | char const byte1(static_cast<char>((two_bytes >> 18) | 0xF0));
105 | char const byte2(((two_bytes >> 12) & 0x3F) | 0x80);
106 | char const byte3(((two_bytes >> 6) & 0x3F) | 0x80);
107 | char32_t const vc1(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
108 | char32_t const vc2(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
109 | char32_t const fix_char(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
110 | std::string invalid_string;
111 | invalid_string += vc1;
112 | invalid_string += byte1;
113 | invalid_string += byte2;
114 | invalid_string += byte3; // the fourth byte is left out
115 | invalid_string += vc2;
116 | std::string expected_string;
117 | expected_string += vc1;
118 | expected_string += fix_char;
119 | expected_string += vc2;
120 | CATCH_REQUIRE_FALSE(libutf8::make_u8string_valid(invalid_string, fix_char));
121 | CATCH_REQUIRE(invalid_string == expected_string);
122 | }
123 | }
124 | CATCH_END_SECTION()
125 | }
126 |
127 |
128 |
129 | // vim: ts=4 sw=4 et
130 |
--------------------------------------------------------------------------------
/tests/catch_version.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | // libutf8
21 | //
22 | #include
23 |
24 |
25 | // self
26 | //
27 | #include "catch_main.h"
28 |
29 |
30 | // last include
31 | //
32 | #include
33 |
34 |
35 |
36 |
37 | CATCH_TEST_CASE("version", "[version]") // the linked library must report the version found in the header used at compile time
38 | {
39 | CATCH_START_SECTION("version: verify runtime vs compile time version numbers")
40 | {
41 | CATCH_REQUIRE(libutf8::get_major_version() == LIBUTF8_VERSION_MAJOR);
42 | CATCH_REQUIRE(libutf8::get_release_version() == LIBUTF8_VERSION_MINOR); // the "release" getter is checked against the MINOR macro
43 | CATCH_REQUIRE(libutf8::get_patch_version() == LIBUTF8_VERSION_PATCH);
44 | CATCH_REQUIRE(strcmp(libutf8::get_version_string(), LIBUTF8_VERSION_STRING) == 0); // strcmp: compare the characters, not the pointers
45 | }
46 | CATCH_END_SECTION()
47 | }
48 |
49 |
50 | // vim: ts=4 sw=4 et
51 |
--------------------------------------------------------------------------------
/tests/example-for-show-utf16.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/tests/example-for-show-utf16.txt
--------------------------------------------------------------------------------
/tests/example-for-show-utf32.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/tests/example-for-show-utf32.txt
--------------------------------------------------------------------------------
/tests/example-for-show-utf8.txt:
--------------------------------------------------------------------------------
1 | Tḩìs 𝄞 ĩş bêȧútîfüł!
2 |
--------------------------------------------------------------------------------
/tests/unicode/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Please see the ../../conf/unicode/LICENSE.txt file for the license.
2 |
--------------------------------------------------------------------------------
/tests/verify-show-unicode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh -e
2 | #
3 | # Verify that the show-unicode command line tool returns the correct exit codes
4 | #
5 | # TODO: verify the actual output (that may require a catch_....cpp file so
6 | # we can capture the output and easily compare it with strings we
7 | # generate in a C++ test)
8 |
9 | SHOW_UNICODE="../../BUILD/Debug/contrib/libutf8/tools/show-unicode"
10 | ERRCNT=0 # number of checks that failed so far
11 | RED=$(printf '\033[0;31m')
12 | NORMAL=$(printf '\033[0m')
13 | # RED/NORMAL hold real escape sequences (via printf): whether echo expands "\033" itself depends on the shell
14 | # Verify Binary Exists (nothing else can be tested without it)
15 | if ! test -x ${SHOW_UNICODE}
16 | then
17 | echo "${RED}error: could not find valid binary \"${SHOW_UNICODE}\"; did you build the project?${NORMAL}"
18 | echo "1 error occurred. Please verify what went wrong and fix it."
19 | exit 1
20 | fi
21 |
22 | # Help: --help is expected to exit with status 2; success or any other status is an error
23 | echo "--- SECTION: --help"
24 | if ${SHOW_UNICODE} --help
25 | then
26 | echo "${RED}error: --help returned with success.${NORMAL}"
27 | ERRCNT=`expr ${ERRCNT} + 1`
28 | elif test ${?} -ne 2
29 | then
30 | echo "${RED}error: --help did not return with expected exit code.${NORMAL}"
31 | ERRCNT=`expr ${ERRCNT} + 1`
32 | else
33 | echo "info: --help works."
34 | fi
35 | echo
36 |
37 | # Version: same expectation as --help, the exit status must be 2
38 | echo "--- SECTION: --version"
39 | if ${SHOW_UNICODE} --version
40 | then
41 | echo "${RED}error: --version returned with success.${NORMAL}"
42 | ERRCNT=`expr ${ERRCNT} + 1`
43 | elif test ${?} -ne 2
44 | then
45 | echo "${RED}error: --version did not return with expected exit code.${NORMAL}"
46 | ERRCNT=`expr ${ERRCNT} + 1`
47 | else
48 | echo "info: --version works."
49 | fi
50 | echo
51 |
52 | # String / Character: a string given with or without --string must succeed
53 | echo "--- SECTION: --string"
54 | if ${SHOW_UNICODE} "Magic"
55 | then
56 | echo "info: string display worked."
57 | else
58 | echo "${RED}error: string display failed with ${?}.${NORMAL}"
59 | ERRCNT=`expr ${ERRCNT} + 1`
60 | fi
61 | echo
62 |
63 | if ${SHOW_UNICODE} --string "Élémentaire ça!"
64 | then
65 | echo "info: string display worked."
66 | else
67 | echo "${RED}error: string display failed with ${?}.${NORMAL}"
68 | ERRCNT=`expr ${ERRCNT} + 1`
69 | fi
70 | echo
71 |
72 | echo "--- SECTION: --character"
73 | if ${SHOW_UNICODE} -C 0x1D11E # valid: hexadecimal number with the 0x introducer
74 | then
75 | echo "info: character display worked."
76 | else
77 | echo "${RED}error: character display failed with ${?}.${NORMAL}"
78 | ERRCNT=`expr ${ERRCNT} + 1`
79 | fi
80 | echo
81 |
82 | if ${SHOW_UNICODE} -C 1D11E # invalid number syntax: must fail, and with exit status 1
83 | then
84 | echo "${RED}error: character display succeeded with invalid number syntax.${NORMAL}"
85 | ERRCNT=`expr ${ERRCNT} + 1`
86 | else
87 | ERRCODE=${?}
88 | if test ${ERRCODE} -eq 1
89 | then
90 | echo "info: character display failed as expected with ${ERRCODE}."
91 | else
92 | echo "${RED}error: character display failed with unexpected error code ${ERRCODE}.${NORMAL}"
93 | ERRCNT=`expr ${ERRCNT} + 1`
94 | fi
95 | fi
96 | echo
97 |
98 | # Files
99 | check_show() {
100 | echo "--- SECTION: file with: ${1}"
101 | if ${SHOW_UNICODE} "${1}" tests/example-for-show-${2}.txt
102 | then
103 | echo "info: ${2} display worked."
104 | else
105 | echo "${RED}error: ${2} display failed with ${?}.${NORMAL}"
106 | ERRCNT=`expr ${ERRCNT} + 1`
107 | fi
108 | echo
109 | }
110 |
111 | # Each input option is checked against the example file in its encoding
112 | check_show -f utf8
113 | check_show -S utf16
114 | check_show -F utf32
115 |
116 | # Report the outcome through the exit code (0 only when nothing failed)
117 | if [ "${ERRCNT}" -ne 0 ]
118 | then
119 |     echo "${ERRCNT} errors occurred. Please verify what went wrong and fix it."
120 |     exit 1
121 | fi
122 | exit 0
123 |
--------------------------------------------------------------------------------
/tools/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2012-2023 Made to Order Software Corp. All Rights Reserved
2 | #
3 | # https://snapwebsites.org/project/libutf8
4 | # contact@m2osw.com
5 | #
6 | # This program is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | ##
21 | ## show-unicode
22 | ##
23 | project(show-unicode)
24 |
25 | # The tool is built from a single source file.
26 | add_executable(
27 |     ${PROJECT_NAME}
28 |     show_unicode.cpp
29 | )
30 |
31 | # Link the tool against the utf8 library.
32 | target_link_libraries(
33 |     ${PROJECT_NAME}
34 |     utf8
35 | )
36 |
37 | # Install the executable in the "bin" destination.
38 | install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin)
39 |
40 |
41 |
42 | ##
43 | ## unicode-data-parser
44 | ##
45 | project(unicode-data-parser)
46 |
47 | # The parser is built from a single source file.
48 | add_executable(
49 |     ${PROJECT_NAME}
50 |     unicode_data_parser.cpp
51 | )
52 |
53 | # Include directories provided by the advgetopt and libexcept variables.
54 | target_include_directories(
55 |     ${PROJECT_NAME}
56 |     PUBLIC ${ADVGETOPT_INCLUDE_DIRS} ${LIBEXCEPT_INCLUDE_DIRS}
57 | )
58 |
59 | # Link the parser against the utf8 library and the libexcept libraries.
60 | target_link_libraries(
61 |     ${PROJECT_NAME}
62 |     utf8
63 |     ${LIBEXCEPT_LIBRARIES}
64 | )
65 |
66 | # Install the executable in the "bin" destination.
67 | install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin)
68 |
69 |
70 |
71 | # vim: ts=4 sw=4 et
72 |
--------------------------------------------------------------------------------
/tools/show_unicode.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | /** \file
21 |  * \brief Tool used to show a string, character, or file in UTF-8, UTF-16, and UTF-32.
22 |  *
23 |  * This executable takes its input from a command line string, a Unicode
24 |  * code point, or a file encoded in UTF-8, UTF-16, or UTF-32. It prints the
25 |  * input as is, followed by its representation as UTF-8 bytes, UTF-16 words,
26 |  * and UTF-32 words, all written in hexadecimal.
27 |  *
28 |  * \sa http://www.unicode.org/reports/tr15/
29 |  */
30 |
31 |
32 | // libutf8
33 | //
34 | #include
35 | #include
36 |
37 |
38 | // C++
39 | //
40 | #include
41 | #include
42 | #include
43 | #include
44 | #include
45 | #include
46 | #include
47 |
48 |
49 | // last include
50 | //
51 | #include
52 |
53 |
54 |
55 | namespace
56 | {
57 | /** \brief Parse the command line and show the input in UTF-8, UTF-16, and UTF-32.
58 |  */
59 | class show_unicode
60 | {
61 | public:
62 |     enum class mode_t
63 |     {
64 |         MODE_STRING,            // -s or --string
65 |         MODE_CHARACTER,         // -C or --unicode
66 |         MODE_UTF8_FILENAME,     // -f or --input
67 |         MODE_UTF16_FILENAME,    // -S or --input-utf16
68 |         MODE_UTF32_FILENAME,    // -F or --input-utf32
69 |
70 |         MODE_DEFAULT // like MODE_STRING, just not set explicitly
71 |     };
72 |
73 |     int parse_args(int argc, char * argv[]);
74 |     int verify_args();
75 |     int process();
76 |
77 | private:
78 |     void usage();
79 |     int set_mode(mode_t m);
80 |     int read_file();
81 |
82 |     mode_t f_mode = mode_t::MODE_DEFAULT;
83 |     std::string f_filename = std::string();
84 |     std::vector<unsigned char> f_input = std::vector<unsigned char>(); // the input, as UTF-8 bytes
85 |     bool f_valid_fffe_ffff = true; // NOTE(review): set by parse_args() but never read in this file
86 | };
87 |
88 | /** \brief Parse the command line and gather the input to display.
89 |  *
90 |  * \param[in] argc  The number of arguments found in \p argv.
91 |  * \param[in] argv  The command line arguments.
92 |  *
93 |  * \return 0 on success, 2 after --help or --version, 1 on invalid input,
94 |  *         3 on a missing argument or a second mode, 4 on an unknown option.
95 |  */
96 | int show_unicode::parse_args(int argc, char * argv[])
97 | {
98 |     for(int i(1); i < argc; ++i)
99 |     {
100 |         // anything that is not an option is part of the string to display
101 |         if(argv[i][0] != '-')
102 |         {
103 |             f_input.insert(f_input.end(), argv[i], argv[i] + strlen(argv[i]));
104 |             continue;
105 |         }
106 |         if(strcmp(argv[i], "-h") == 0
107 |         || strcmp(argv[i], "--help") == 0)
108 |         {
109 |             usage();
110 |             return 2;
111 |         }
112 |         if(strcmp(argv[i], "-V") == 0
113 |         || strcmp(argv[i], "--version") == 0)
114 |         {
115 |             std::cout << LIBUTF8_VERSION_STRING << '\n';
116 |             return 2;
117 |         }
118 |         if(strcmp(argv[i], "-C") == 0
119 |         || strcmp(argv[i], "--unicode") == 0)
120 |         {
121 |             ++i;
122 |             if(i >= argc)
123 |             {
124 |                 std::cerr << "error: the --unicode command line option must be followed by a number representing a valid Unicode characters in UTF-32.\n";
125 |                 return 3;
126 |             }
127 |             // a base of 0 lets strtol() detect decimal, octal (0...), and hexadecimal (0x...) by itself
128 |             char * end(nullptr);
129 |             long const value(strtol(argv[i], &end, 0));
130 |             if(end == argv[i]
131 |             || *end != '\0')
132 |             {
133 |                 std::cerr
134 |                     << "error: expected a valid decimal, octal, or hexadecimal number; could not parse \""
135 |                     << argv[i]
136 |                     << "\" as a valid number.\n";
137 |                 return 1;
138 |             }
139 |             // verify the range before narrowing to char32_t so a negative
140 |             // or too large number cannot wrap around to a valid code point
141 |             if(value < 0
142 |             || value > 0x10FFFF
143 |             || !libutf8::is_valid_unicode(static_cast<char32_t>(value)))
144 |             {
145 |                 std::cerr
146 |                     << "error: code \""
147 |                     << argv[i]
148 |                     << "\" does not represent a valid Unicode character.\n";
149 |                 return 1;
150 |             }
151 |             std::string const character(libutf8::to_u8string(static_cast<char32_t>(value)));
152 |             f_input.insert(f_input.end(), character.begin(), character.end());
153 |             int const r(set_mode(mode_t::MODE_CHARACTER));
154 |             if(r != 0)
155 |             {
156 |                 return r;
157 |             }
158 |             continue;
159 |         }
160 |         if(strcmp(argv[i], "-s") == 0
161 |         || strcmp(argv[i], "--string") == 0)
162 |         {
163 |             ++i;
164 |             if(i >= argc)
165 |             {
166 |                 std::cerr << "error: the --string command line option must be followed by the string to process.\n";
167 |                 return 3;
168 |             }
169 |             f_input.insert(f_input.end(), argv[i], argv[i] + strlen(argv[i]));
170 |             int const r(set_mode(mode_t::MODE_STRING));
171 |             if(r != 0)
172 |             {
173 |                 return r;
174 |             }
175 |             continue;
176 |         }
177 |         if(strcmp(argv[i], "-f") == 0
178 |         || strcmp(argv[i], "--input") == 0)
179 |         {
180 |             ++i;
181 |             if(i >= argc)
182 |             {
183 |                 std::cerr << "error: the --input command line option must be followed by the input filename.\n";
184 |                 return 3;
185 |             }
186 |             f_filename = argv[i];
187 |             int r(set_mode(mode_t::MODE_UTF8_FILENAME));
188 |             if(r == 0)
189 |             {
190 |                 r = read_file();
191 |             }
192 |             if(r != 0)
193 |             {
194 |                 return r;
195 |             }
196 |             continue;
197 |         }
198 |         if(strcmp(argv[i], "-S") == 0
199 |         || strcmp(argv[i], "--input-utf16") == 0)
200 |         {
201 |             ++i;
202 |             if(i >= argc)
203 |             {
204 |                 std::cerr << "error: the --input-utf16 command line option must be followed by the input filename.\n";
205 |                 return 3;
206 |             }
207 |             f_filename = argv[i];
208 |             int r(set_mode(mode_t::MODE_UTF16_FILENAME));
209 |             if(r == 0)
210 |             {
211 |                 r = read_file();
212 |             }
213 |             if(r != 0)
214 |             {
215 |                 return r;
216 |             }
217 |             if(f_input.size() % 2 != 0)
218 |             {
219 |                 std::cerr << "error: the size of \""
220 |                           << f_filename
221 |                           << "\" was expected to be a multiple of 2.\n";
222 |                 return 1;
223 |             }
224 |             // copy the bytes to a properly typed string (no pointer cast)
225 |             // and replace the input with its UTF-8 equivalent
226 |             std::u16string in(f_input.size() / 2, u'\0');
227 |             if(!in.empty())
228 |             {
229 |                 memcpy(&in[0], f_input.data(), f_input.size());
230 |             }
231 |             std::string const u8(libutf8::to_u8string(in));
232 |             f_input.assign(u8.begin(), u8.end());
233 |             continue;
234 |         }
235 |         if(strcmp(argv[i], "-F") == 0
236 |         || strcmp(argv[i], "--input-utf32") == 0)
237 |         {
238 |             ++i;
239 |             if(i >= argc)
240 |             {
241 |                 std::cerr << "error: the --input-utf32 command line option must be followed by the input filename.\n";
242 |                 return 3;
243 |             }
244 |             f_filename = argv[i];
245 |             int r(set_mode(mode_t::MODE_UTF32_FILENAME));
246 |             if(r == 0)
247 |             {
248 |                 r = read_file();
249 |             }
250 |             if(r != 0)
251 |             {
252 |                 return r;
253 |             }
254 |             if(f_input.size() % 4 != 0)
255 |             {
256 |                 std::cerr << "error: the size of \""
257 |                           << f_filename
258 |                           << "\" was expected to be a multiple of 4.\n";
259 |                 return 1;
260 |             }
261 |             // same conversion as for UTF-16, with 4 bytes per character
262 |             std::u32string in(f_input.size() / 4, U'\0');
263 |             if(!in.empty())
264 |             {
265 |                 memcpy(&in[0], f_input.data(), f_input.size());
266 |             }
267 |             std::string const u8(libutf8::to_u8string(in));
268 |             f_input.assign(u8.begin(), u8.end());
269 |             continue;
270 |         }
271 |         if(strcmp(argv[i], "--valid-fffe-ffff") == 0)
272 |         {
273 |             f_valid_fffe_ffff = true;
274 |             continue;
275 |         }
276 |         if(strcmp(argv[i], "-W") == 0
277 |         || strcmp(argv[i], "--invalid-fffe-ffff") == 0)
278 |         {
279 |             f_valid_fffe_ffff = false;
280 |             continue;
281 |         }
282 |         std::cerr << "error: unknown command line option \""
283 |                   << argv[i]
284 |                   << "\".\n";
285 |         return 4;
286 |     }
287 |
288 |     return 0;
289 | }
290 |
291 | /** \brief Save the input mode; return 3 if a mode was already selected, 0 otherwise. */
292 | int show_unicode::set_mode(mode_t m)
293 | {
294 |     if(f_mode != mode_t::MODE_DEFAULT)
295 |     {
296 |         std::cerr << "error: mode already set to: " << static_cast<int>(f_mode) << "\n";
297 |         return 3;
298 |     }
299 |     f_mode = m;
300 |
301 |     return 0;
302 | }
303 |
304 |
305 | int show_unicode::read_file()
306 | {
307 | std::ifstream in(f_filename);
308 | if(!in.is_open())
309 | {
310 | std::cerr
311 | << "error: could not open input file \""
312 | << f_filename
313 | << "\".\n";
314 | return 1;
315 | }
316 | in.seekg(0, std::ios::end);
317 | std::size_t const size(in.tellg());
318 | in.seekg(0);
319 | f_input.resize(size);
320 | in.read(reinterpret_cast(f_input.data()), size);
321 | if(!in)
322 | {
323 | std::cerr
324 | << "error: could not read input file \""
325 | << f_filename
326 | << "\".\n";
327 | return 1;
328 | }
329 |
330 | return 0;
331 | }
332 |
333 |
334 | int show_unicode::verify_args()
335 | {
336 | // nothing to verify at this point: a mode conflict is already reported while parsing (see set_mode())
337 | return 0;
338 | }
339 |
340 | /** \brief Print the input as is, then as UTF-8 bytes, UTF-16 words, and UTF-32 words; always returns 0. */
341 | int show_unicode::process()
342 | {
343 |     // first show the string as is
344 |     //
345 |     std::string const utf8(f_input.begin(), f_input.end());
346 |     std::cout << "Input: \"" << utf8 << "\".\n";
347 |
348 |     // next show the string as UTF-8 bytes; continuation bytes (0x80 to 0xBF)
349 |     // are attached to the previous byte with a period instead of a space
350 |     //
351 |     std::cout << "UTF-8:" << std::hex << std::setfill('0');
352 |     for(unsigned char const byte : f_input)
353 |     {
354 |         char const * const separator(byte >= 0x80 && byte <= 0xBF ? "." : " ");
355 |         std::cout << separator << std::setw(2) << static_cast<int>(byte);
356 |     }
357 |     std::cout << '\n';
358 |
359 |     // next show the string as UTF-16 words
360 |     //
361 |     std::u16string const utf16(libutf8::to_u16string(utf8));
362 |     std::cout << "UTF-16:";
363 |     for(char16_t const word : utf16)
364 |     {
365 |         std::cout << ' ' << std::setw(4) << static_cast<unsigned int>(word);
366 |     }
367 |     std::cout << '\n';
368 |
369 |     // next show the string as UTF-32 words
370 |     //
371 |     std::u32string const utf32(libutf8::to_u32string(utf8));
372 |     std::cout << "UTF-32:";
373 |     for(char32_t const wc : utf32)
374 |     {
375 |         std::cout << ' ' << std::setw(6) << static_cast<unsigned int>(wc);
376 |     }
377 |     std::cout << '\n';
378 |
379 |     // the formatting flags are sticky; restore the defaults before returning
380 |     std::cout << std::dec << std::setfill(' ');
381 |
382 |     return 0;
383 | }
384 |
385 | /** \brief Print the command line usage of the tool on stdout. */
386 | void show_unicode::usage()
387 | {
388 |     std::cout << "Usage: show-unicode [-<opt>] [-s|--string] '<string>' | -C <character> | -f <filename>\n"
389 |                  "Where -<opt> is one or more of:\n"
390 |                  " -h | --help print this help screen.\n"
391 |                  " -C | --unicode use specified value.\n"
392 |                  " -s | --string input string to convert (using -s or --string is optional).\n"
393 |                  " -f | --input input file of UTF-8 characters.\n"
394 |                  " -S | --input-utf16 input file of UTF-16 characters.\n"
395 |                  " -F | --input-utf32 input file of UTF-32 characters.\n"
396 |                  " --valid-fffe-ffff consider \\uFFFE and \\uFFFF as valid characters (default).\n"
397 |                  " -W | --invalid-fffe-ffff consider \\uFFFE and \\uFFFF as invalid characters.\n"
398 |                  " -V | --version print out this tool's version.\n"
399 |                  "\n";
400 | }
401 |
402 |
403 | } // no name namespace
404 |
405 |
406 | int main(int argc, char * argv[])
407 | {
408 |     // run each step in order; the first non-zero result is the exit code
409 |     //
410 |     show_unicode tool;
411 |
412 |     int const parse_result(tool.parse_args(argc, argv));
413 |     if(parse_result != 0)
414 |     {
415 |         return parse_result;
416 |     }
417 |
418 |     int const verify_result(tool.verify_args());
419 |     return verify_result != 0 ? verify_result : tool.process();
420 | }
421 |
422 |
423 | // vim: ts=4 sw=4 et
424 |
--------------------------------------------------------------------------------
/tools/unicode_data_parser.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2000-2023 Made to Order Software Corp. All Rights Reserved
2 | //
3 | // https://snapwebsites.org/project/libutf8
4 | // contact@m2osw.com
5 | //
6 | // This program is free software; you can redistribute it and/or modify
7 | // it under the terms of the GNU General Public License as published by
8 | // the Free Software Foundation; either version 2 of the License, or
9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 |
20 | /** \file
21 | * \brief Tool used to convert the UnicodeData.txt file to C structures.
22 | *
23 | * This executable is used to convert the UnicodeData.txt to a set of
24 | * C structure which we can search very quickly to find Unicode characters.
25 | * This gives us all the necessary information to convert strings to NFKC
26 | * NFKD, and especially NFC and NFD.
27 | *
28 | * \sa http://www.unicode.org/reports/tr15/
29 | */
30 |
31 |
32 | // libutf8
33 | //
34 | #include
35 |
36 |
37 | // libexcept
38 | //
39 | #include
40 |
41 |
42 | // C++
43 | //
44 | #include
45 | #include
46 | #include
47 |
48 |
49 | // C
50 | //
51 | #include
52 | #include
53 |
54 |
55 | // last include
56 | //
57 | #include
58 |
59 |
60 |
61 | namespace
62 | {
63 |
64 |
65 |
66 |
67 |
68 |
69 | } // no name namespace
70 |
71 |
72 | /** \brief Print the command line usage of the tool on stdout. */
73 | void usage()
74 | {
75 |     std::cout << "Usage: unicode-data-parser [-h] [<input-dir> [<output-file>]]\n";
76 |     std::cout << "Where:\n";
77 |     std::cout << "  <input-dir> is a path to the unicode files such as UnicodeData.txt (default: \"/usr/shared/libutf8/unicode\")\n";
78 |     std::cout << "  <output-file> is a path to the output unicode_data.ucdb file (default: a.ucdb)\n";
79 | }
80 |
81 |
82 | int main(int argc, char * argv[])
83 | {
84 |     libexcept::verify_inherited_files();
85 |
86 |     std::string input_dir;
87 |     std::string output_filename;
88 |
89 |     for(int idx(1); idx < argc; ++idx)
90 |     {
91 |         char const * const arg(argv[idx]);
92 |
93 |         // options: only the letter following the dash is checked
94 |         if(arg[0] == '-')
95 |         {
96 |             if(arg[1] == 'h')
97 |             {
98 |                 usage();
99 |                 exit(1);
100 |             }
101 |             std::cerr << "error: unknown command line option -"
102 |                       << arg[1]
103 |                       << "\n";
104 |             exit(1);
105 |         }
106 |
107 |         // first filename: the input directory
108 |         if(input_dir.empty())
109 |         {
110 |             input_dir = arg;
111 |             if(input_dir.empty())
112 |             {
113 |                 std::cerr << "error: input directory name can't be empty, try \".\" for current folder.\n";
114 |                 exit(1);
115 |             }
116 |             continue;
117 |         }
118 |
119 |         // second filename: the output file
120 |         if(output_filename.empty())
121 |         {
122 |             output_filename = arg;
123 |             continue;
124 |         }
125 |
126 |         std::cerr << "error: too many filenames on the command line.\n";
127 |         exit(1);
128 |     }
129 |
130 |     // apply the defaults for the filenames not found on the command line
131 |     // NOTE(review): "/usr/shared" looks like a typo of "/usr/share" -- confirm against the install path
132 |     //
133 |     if(input_dir.empty())
134 |     {
135 |         input_dir = "/usr/shared/libutf8/unicode";
136 |     }
137 |     if(output_filename.empty())
138 |     {
139 |         output_filename = "a.ucdb";
140 |     }
141 |
142 |     libutf8::ucd_parser p(input_dir, output_filename);
143 |     p.generate();
144 |
145 |     return 0;
146 | }
146 |
147 |
148 | // vim: ts=4 sw=4 et
149 |
--------------------------------------------------------------------------------