├── .gitignore
├── CMakeLists.txt
├── LICENSE.txt
├── README.md
├── TODO.txt
├── cmake
    ├── CMakeLists.txt
    └── LibUtf8Config.cmake
├── conf
    ├── CMakeLists.txt
    └── unicode
    │   ├── CMakeLists.txt
    │   ├── DerivedAge.txt
    │   ├── Jamo.txt
    │   ├── LICENSE.txt
    │   ├── NameAliases.txt
    │   ├── README.md
    │   └── UnicodeData.txt
├── debian
    ├── changelog
    ├── compat
    ├── control
    ├── copyright
    ├── docs
    ├── libutf8-dev.install
    ├── libutf8-doc.install
    ├── libutf8.install
    ├── rules
    └── source
    │   └── options
├── doc
    ├── CMakeLists.txt
    ├── footer.html
    ├── libutf8.doxy.in
    └── libutf8.png
├── libutf8
    ├── CMakeLists.txt
    ├── base.cpp
    ├── base.h
    ├── caseinsensitivestring.h
    ├── exception.h
    ├── iterator.cpp
    ├── iterator.h
    ├── json_tokens.cpp
    ├── json_tokens.h
    ├── libutf8.cpp
    ├── libutf8.h
    ├── unicode_data.cpp
    ├── unicode_data.h
    ├── unicode_data_file.cpp
    ├── unicode_data_file.h
    ├── version.cpp
    └── version.h.in
├── mk
├── tests
    ├── CMakeLists.txt
    ├── catch_bom.cpp
    ├── catch_caseinsensitive.cpp
    ├── catch_character.cpp
    ├── catch_iterator.cpp
    ├── catch_json_tokens.cpp
    ├── catch_length.cpp
    ├── catch_main.cpp
    ├── catch_main.h
    ├── catch_stream.cpp
    ├── catch_string.cpp
    ├── catch_valid.cpp
    ├── catch_version.cpp
    ├── example-for-show-utf16.txt
    ├── example-for-show-utf32.txt
    ├── example-for-show-utf8.txt
    ├── unicode
    │   ├── LICENSE.txt
    │   └── NormalizationTest.txt
    └── verify-show-unicode.sh
└── tools
    ├── CMakeLists.txt
    ├── show_unicode.cpp
    └── unicode_data_parser.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | tmp
2 | *.sw?
3 | seed.txt
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
 2 | #
 3 | # https://snapwebsites.org/project/libutf8
 4 | # contact@m2osw.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License as published by
 8 | # the Free Software Foundation; either version 2 of the License, or
 9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | 
20 | cmake_minimum_required(VERSION 3.10.2)
21 | 
22 | project(utf8_library)  # top-level project: library, tools, data files, documentation and tests
23 | 
24 | find_package(SnapCMakeModules REQUIRED)  # NOTE(review): presumably the provider of SnapGetVersion() -- confirm
25 | find_package(LibExcept        REQUIRED)  # REQUIRED: configuration stops when a package is not found
26 | find_package(SnapDev          REQUIRED)
27 | 
28 | SnapGetVersion(LIBUTF8 ${CMAKE_CURRENT_SOURCE_DIR})  # not defined in this file; receives the LIBUTF8 prefix and this source directory
29 | 
30 | include_directories(  # directory scope: inherited by every subdirectory added below
31 |     ${PROJECT_SOURCE_DIR}
32 |     ${CMAKE_CURRENT_BINARY_DIR}
33 | )
34 | 
35 | add_subdirectory(libutf8)  # library sources and headers
36 | add_subdirectory(tools  )  # show_unicode and unicode_data_parser sources
37 | add_subdirectory(conf   )  # Unicode data files
38 | add_subdirectory(doc    )  # doxygen configuration
39 | add_subdirectory(cmake  )  # installs LibUtf8Config.cmake
40 | add_subdirectory(tests  )  # catch_*.cpp tests
41 | 
42 | # vim: ts=4 sw=4 et
43 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2006-2023  Made to Order Software Corp.  All Rights Reserved
 2 | 
 3 | https://snapwebsites.org/
 4 | contact@m2osw.com
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <p align="center">
  3 | <img alt="libutf8" title="libutf8, a C++ library to handle UTF-8 strings seamlessly."
  4 | src="https://snapwebsites.org/sites/snapwebsites.org/files/images/libutf8.png" width="277" height="277"/>
  5 | </p>
  6 | 
  7 | # Introduction
  8 | 
  9 | The libutf8 library is a helper library to handle UTF-8 strings in C++.
 10 | Although C++11 added `char32_t` (and `char16_t`) and C++20 added
 11 | `char8_t`, the conversions are still not seamless between each type
 12 | (although it is becoming easier to handle such.)
 13 | 
 14 | This library proposes automated conversions between `std::string` (viewed
 15 | as UTF-8 in nearly all of our code) and `std::u32string` (a.k.a. UTF-32
 16 | strings.)
 17 | 
 18 | # Reasons Behind Having Our Own Library
 19 | 
 20 | All the libraries I've seen are either in C and very cumbersome to use or
 21 | offer an interface which depends on the current `LOCALE`. In other words,
 22 | the system default `mbstowc()` function, for example, does not always view
 23 | the input string as UTF-8. That also means there are complexities and thus
 24 | inefficiencies in determining which conversion to use.
 25 | 
 26 | In our case, we always have UTF-8 as input and output and at times we need
 27 | to handle the characters as UTF-32. For example, to transform the character
 28 | to uppercase, it is necessary to have a UTF-32 character.
 29 | 
 30 | # API
 31 | 
 32 | ## String Conversions
 33 | 
 34 | The library offers two conversion functions as follows:
 35 | 
 36 |     libutf8::to_u8string(std::u32string const & str);
 37 |     libutf8::to_u32string(std::string const & str);
 38 | 
 39 | As time passes, we will add other conversions so as to support all formats
 40 | although at this point these two are the only two we need in Snap! Websites.
 41 | 
 42 | Here is an example of usage:
 43 | 
 44 |     std::string u8;
 45 | 
 46 |     u8 = u8"This is a UTF-8 string";
 47 | 
 48 |     std::u32string u32;
 49 |     u32 = libutf8::to_u32string(u8);
 50 | 
 51 |     std::string back;
 52 |     back = libutf8::to_u8string(u32);
 53 | 
 54 | Note that u8 string could be _more_ UTF-8 by including characters outside
 55 | of the ASCII range and it would still work as you would expect.
 56 | 
 57 | ### String Length in Characters
 58 | 
 59 | The library offers the `u8length()` function which computes the length of
 60 | a UTF-8 string. Note that this does not verify whether the UTF-8 data is
 61 | valid. It very quickly counts the number of non-continuation bytes (i.e.
 62 | all the bytes except those between 0x80 and 0xBF inclusive.)
 63 | 
 64 |     std::string u8("Your UTF-8 string");
 65 |     size_t length = libutf8::u8length(u8);
 66 | 
 67 | ### Case Insensitive Compare
 68 | 
 69 | In most cases, you can compare two UTF-8 strings with the normal `==`
 70 | operator. Once in a while, though, you may want to compare them case
 71 | insensitively.
 72 | 
 73 | Like with the iterator below, we wanted to offer a function that allows
 74 | you to compare two UTF-8 strings properly and as quickly as possible.
 75 | This meant to not have to convert the entire strings before doing the
 76 | compare because having to do so means allocating memory for both
 77 | strings just to do the compare and the conversion would convert the
 78 | entire strings instead of just what's necessary.
 79 | 
 80 | Out of these constraints we created the `u8casecmp()` function. It
 81 | takes two UTF-8 strings and compares the characters one at a time.
 82 | Unless the strings are equal, only the number of characters up to
 83 | the first non-equal one, will be converted.
 84 | 
 85 |     std::string a("First String");
 86 |     std::string b("First Test");
 87 | 
 88 |     int r(libutf8::u8casecmp(a, b));
 89 |     if(r == 0)
 90 |     {
 91 |     	std::cout << "a and b are equal" << std::endl;
 92 |     }
 93 |     else if(r < 0)
 94 |     {
 95 |     	std::cout << "a comes before b" << std::endl;
 96 |     }
 97 |     else //if(r > 0)
 98 |     {
 99 |     	std::cout << "a comes after b" << std::endl;
100 |     }
101 | 
102 | WARNING: the function does no collation, so it is not going to take the
103 | language into account. It uses lowercase characters, as suggested by the
104 | Unicode standard, but outside of that, the compare is binary.
105 | 
106 | ## UTF-8 Iterator
107 | 
108 | We often have an `std::string` representing UTF-8 and we want
109 | to iterate the content as UTF-32 characters. Although we could convert
110 | the string to a full `std::u32string` and then iterate through the
111 | `std::u32string`, that (1) requires a copy and (2) uses four times
112 | the amount of memory (five times if you include the `std::string` size...)
113 | Note also that the copy requires a `malloc()` and later a `free()` once
114 | done with it.
115 | 
116 | The iterator solves these problems by allowing us to iterate through the
117 | `std::string` and getting the next or previous Unicode character without
118 | having to use any more memory. The conversion itself is slightly slower
119 | than converting a string all at once, but doing a `malloc()` to get the
120 | `std::u32string` is definitely going to be way slower than our iterator
121 | in nearly all circumstances.
122 | 
123 | The following example shows the code point of each character, one per line:
124 | 
125 |     std::string u8("This is your UTF-8 string");
126 | 
127 |     for(libutf8::utf8_iterator it(u8);
128 |     	it != u8.end();
129 | 	++it)
130 |     {
131 |     	std::cout << static_cast<int>(*it) << std::endl;
132 |     }
133 | 
134 | You can compare standard `std::string` iterators with `==` and `!=`. The
135 | `++` and `--` operators work as expected. If you do a `++` when already
136 | at the end, nothing happens. If you do a `--` when already at the beginning,
137 | nothing happens.
138 | 
139 | Once you are at the end, getting the character (`*it`) returns `libutf8::EOS`.
140 | So you can loop through until you get `libutf8::EOS` instead of checking
141 | against the end iterator:
142 | 
143 |     std::string u8("This is your UTF-8 string");
144 | 
145 |     libutf8::utf8_iterator it(u8);
146 |     while(*it != libutf8::EOS)
147 |     {
148 |     	std::cout << static_cast<int>(*it++) << std::endl;
149 |     }
150 | 
151 | Remember that a good optimization is to avoid the post increment. It will
152 | be faster to do:
153 | 
154 |     char32_t c = *it;
155 |     ++it;
156 | 
157 | because you avoid a copy of the iterator (even though it's only 16 bytes...)
158 | 
159 | ## Low Level Functions
160 | 
161 | We expose the low level functions such as `mbstowc()` for edge cases where
162 | you may not have an `std::string`. Those functions should not be used if
163 | at all possible because they require proper handling of the buffers passed
164 | to them. Make a mistake there and you could end up with a crashing bug in your
165 | code.
166 | 
167 | # TODO
168 | 
169 | ## Auto-Conversions
170 | 
171 | Conversions for many more types of strings such as all the `char *`
172 | and also look into whether implementing an extension to the
173 | `std::basic_string` would be possible to directly have conversions
174 | integrated in our strings (i.e. to be able to write `str8 = str32;` and
175 | `str32 = str8` without having to write `str8 = libutf8::to_u8string(str32)`.)
176 | 
177 | ## Canonicalization
178 | 
179 | Right now, we do not try to canonicalize the strings, so diacritics may
180 | appear as standalone or combined characters. We want to implement the
181 | necessary code to decompose and re-compose them in a normalized manner.
182 | 
183 | This is very important for comparing strings against each other for
184 | equality (i.e. an 'a' with a grave accent is equal to an 'a' followed
185 | by the grave accent character).
186 | 
187 | ## Character Name, Type, etc.
188 | 
189 | The UnicodeData.txt file (offered by the Unicode website) lists all the
190 | characters with their name and their types. We want to offer the user
191 | access to that data.
192 | 
193 | We should simply have the table as a struct and return a pointer to
194 | the corresponding character. Sort those by character number and use
195 | a binary search to find the structure.
196 | 
197 | Some of that information is to be used for the canonicalization so it
198 | is a must have.
199 | 
200 | UnicodeData.txt file format is defined in:
201 | http://www.unicode.org/L2/L1999/UnicodeData.html
202 | 
203 | 
204 | 
205 | # License
206 | 
207 | The source is covered by the MIT license. The debian folder is covered
208 | by the GPL 2.0.
209 | 
210 | 
211 | # Bugs
212 | 
213 | Submit bug reports and patches on
214 | [github](https://github.com/m2osw/libutf8/issues).
215 | 
216 | 
217 | _This file is part of the [snapcpp project](https://snapwebsites.org/)._
218 | 


--------------------------------------------------------------------------------
/TODO.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | * `utf8lint` verify that a file is valid UTF-8 (see show-unicode, we can have a softlink instead and if called utf8lint, assume --quiet).
 3 | * Enhance `show-unicode`:
 4 |   - Support a range (so we can see the characters in a given range).
 5 |   - Actually do a validation step.
 6 | * Add a reverse() function which works correctly with a UTF-8 string.
 7 | * Add a reverse() function which works correctly with a UTF-16 string.
 8 | * Add a fix() function which takes UTF-32/16 and removes any invalid characters (UTF-8 is done).
 9 | * Add a "lexer base" which is to read an input file one character at a time
10 |   like a lexer getc() generally does and return char32_t characters
11 |   (see basic-xml for an example on how this is done and convert that one to
12 |   using this new "lexer base")
13 | 
14 | 


--------------------------------------------------------------------------------
/cmake/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
 2 | #
 3 | # https://snapwebsites.org/project/libutf8
 4 | # contact@m2osw.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License as published by
 8 | # the Free Software Foundation; either version 2 of the License, or
 9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | 
20 | project(libutf8_cmake)
21 | 
22 | install(  # ship the file that lets other projects call find_package(LibUtf8)
23 |     FILES
24 |         LibUtf8Config.cmake
25 | 
26 |     DESTINATION
27 |         share/cmake/LibUtf8  # relative path, so it is resolved against the install prefix
28 | )
29 | 
30 | # vim: ts=4 sw=4 et nocindent
31 | 


--------------------------------------------------------------------------------
/cmake/LibUtf8Config.cmake:
--------------------------------------------------------------------------------
 1 | # - Find LibUtf8
 2 | #
 3 | # LIBUTF8_FOUND        - System has LibUtf8
 4 | # LIBUTF8_INCLUDE_DIRS - The LibUtf8 include directories
 5 | # LIBUTF8_LIBRARIES    - The libraries needed to use LibUtf8
 6 | # LIBUTF8_DEFINITIONS  - Compiler switches required for using LibUtf8 (currently not set by this file)
 7 | #
 8 | # License:
 9 | #
10 | # Copyright (c) 2011-2023  Made to Order Software Corp.  All Rights Reserved
11 | #
12 | # https://snapwebsites.org/project/libutf8
13 | # contact@m2osw.com
14 | #
15 | # This program is free software: you can redistribute it and/or modify
16 | # it under the terms of the GNU General Public License as published by
17 | # the Free Software Foundation, either version 3 of the License, or
18 | # (at your option) any later version.
19 | #
20 | # This program is distributed in the hope that it will be useful,
21 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23 | # GNU General Public License for more details.
24 | #
25 | # You should have received a copy of the GNU General Public License
26 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
27 | 
28 | find_path(  # locate the directory that contains libutf8/libutf8.h
29 |     LIBUTF8_INCLUDE_DIR
30 |         libutf8/libutf8.h
31 | 
32 |     PATHS
33 |         ENV LIBUTF8_INCLUDE_DIR  # extra search location read from the environment
34 | )
35 | 
36 | find_library(  # locate the library named "utf8"
37 |     LIBUTF8_LIBRARY
38 |         utf8
39 | 
40 |     PATHS
41 |         ${LIBUTF8_LIBRARY_DIR}  # optional hint; expands to nothing when the variable is unset
42 |         ENV LIBUTF8_LIBRARY  # extra search location read from the environment
43 | )
44 | 
45 | mark_as_advanced(  # hide both cache entries from the default view of the CMake GUIs
46 |     LIBUTF8_INCLUDE_DIR
47 |     LIBUTF8_LIBRARY
48 | )
49 | 
50 | set(LIBUTF8_INCLUDE_DIRS ${LIBUTF8_INCLUDE_DIR})  # plural result variables, copied from the cache entries above
51 | set(LIBUTF8_LIBRARIES    ${LIBUTF8_LIBRARY})
52 | 
53 | include(FindPackageHandleStandardArgs)
54 | 
55 | find_package_handle_standard_args(  # reports the package as found only when both variables below were resolved
56 |     LibUtf8
57 |     REQUIRED_VARS
58 |         LIBUTF8_INCLUDE_DIR
59 |         LIBUTF8_LIBRARY
60 | )
61 | 
62 | # vim: ts=4 sw=4 et
63 | 


--------------------------------------------------------------------------------
/conf/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
 2 | #
 3 | # https://snapwebsites.org/project/libutf8
 4 | # contact@m2osw.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License as published by
 8 | # the Free Software Foundation; either version 2 of the License, or
 9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | 
20 | project(utf8_library_conf)
21 | 
22 | add_subdirectory(unicode)  # only child directory; holds the Unicode data files and their install rule
23 | 
24 | # vim: ts=4 sw=4 et
25 | 


--------------------------------------------------------------------------------
/conf/unicode/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2012-2023  Made to Order Software Corp.  All Rights Reserved
 2 | #
 3 | # https://snapwebsites.org/project/libutf8
 4 | # contact@m2osw.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License as published by
 8 | # the Free Software Foundation; either version 2 of the License, or
 9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | 
20 | ##
21 | ## unicode-data -- installs the Unicode data files listed below
22 | ##
23 | project(unicode-data)
24 | 
25 | install(  # data files only; LICENSE.txt and README.md of this directory are not part of the list
26 |     FILES
27 |         DerivedAge.txt
28 |         Jamo.txt
29 |         NameAliases.txt
30 |         UnicodeData.txt
31 | 
32 |     DESTINATION
33 |         share/libutf8/unicode  # relative path, so it is resolved against the install prefix
34 | )
35 | 
36 | 
37 | # vim: ts=4 sw=4 et
38 | 


--------------------------------------------------------------------------------
/conf/unicode/Jamo.txt:
--------------------------------------------------------------------------------
 1 | # Jamo-13.0.0.txt
 2 | # Date: 2019-09-09, 19:46:00 GMT [KW, LI]
 3 | # © 2019 Unicode®, Inc.
 4 | # For terms of use, see http://www.unicode.org/terms_of_use.html
 5 | #
 6 | # Unicode Character Database
 7 | # For documentation, see http://www.unicode.org/reports/tr44/
 8 | #
 9 | # This file defines the Jamo_Short_Name property.
10 | #
11 | # See Section 3.12 of The Unicode Standard, Version 13.0
12 | # for more information.
13 | #
14 | # Each line contains two fields, separated by a semicolon.
15 | #
16 | # The first field gives the code point, in 4-digit hexadecimal
17 | # form, of a conjoining jamo character that participates in the
18 | # algorithmic determination of Hangul syllable character names.
19 | # The second field gives the Jamo_Short_Name as a one-, two-,
20 | # or three-character ASCII string (or in one case, for U+110B,
21 | # the null string).
22 | #
23 | # #############################################################
24 | 
25 | 1100; G   # HANGUL CHOSEONG KIYEOK
26 | 1101; GG  # HANGUL CHOSEONG SSANGKIYEOK
27 | 1102; N   # HANGUL CHOSEONG NIEUN
28 | 1103; D   # HANGUL CHOSEONG TIKEUT
29 | 1104; DD  # HANGUL CHOSEONG SSANGTIKEUT
30 | 1105; R   # HANGUL CHOSEONG RIEUL
31 | 1106; M   # HANGUL CHOSEONG MIEUM
32 | 1107; B   # HANGUL CHOSEONG PIEUP
33 | 1108; BB  # HANGUL CHOSEONG SSANGPIEUP
34 | 1109; S   # HANGUL CHOSEONG SIOS
35 | 110A; SS  # HANGUL CHOSEONG SSANGSIOS
36 | 110B;     # HANGUL CHOSEONG IEUNG
37 | 110C; J   # HANGUL CHOSEONG CIEUC
38 | 110D; JJ  # HANGUL CHOSEONG SSANGCIEUC
39 | 110E; C   # HANGUL CHOSEONG CHIEUCH
40 | 110F; K   # HANGUL CHOSEONG KHIEUKH
41 | 1110; T   # HANGUL CHOSEONG THIEUTH
42 | 1111; P   # HANGUL CHOSEONG PHIEUPH
43 | 1112; H   # HANGUL CHOSEONG HIEUH
44 | 1161; A   # HANGUL JUNGSEONG A
45 | 1162; AE  # HANGUL JUNGSEONG AE
46 | 1163; YA  # HANGUL JUNGSEONG YA
47 | 1164; YAE # HANGUL JUNGSEONG YAE
48 | 1165; EO  # HANGUL JUNGSEONG EO
49 | 1166; E   # HANGUL JUNGSEONG E
50 | 1167; YEO # HANGUL JUNGSEONG YEO
51 | 1168; YE  # HANGUL JUNGSEONG YE
52 | 1169; O   # HANGUL JUNGSEONG O
53 | 116A; WA  # HANGUL JUNGSEONG WA
54 | 116B; WAE # HANGUL JUNGSEONG WAE
55 | 116C; OE  # HANGUL JUNGSEONG OE
56 | 116D; YO  # HANGUL JUNGSEONG YO
57 | 116E; U   # HANGUL JUNGSEONG U
58 | 116F; WEO # HANGUL JUNGSEONG WEO
59 | 1170; WE  # HANGUL JUNGSEONG WE
60 | 1171; WI  # HANGUL JUNGSEONG WI
61 | 1172; YU  # HANGUL JUNGSEONG YU
62 | 1173; EU  # HANGUL JUNGSEONG EU
63 | 1174; YI  # HANGUL JUNGSEONG YI
64 | 1175; I   # HANGUL JUNGSEONG I
65 | 11A8; G   # HANGUL JONGSEONG KIYEOK
66 | 11A9; GG  # HANGUL JONGSEONG SSANGKIYEOK
67 | 11AA; GS  # HANGUL JONGSEONG KIYEOK-SIOS
68 | 11AB; N   # HANGUL JONGSEONG NIEUN
69 | 11AC; NJ  # HANGUL JONGSEONG NIEUN-CIEUC
70 | 11AD; NH  # HANGUL JONGSEONG NIEUN-HIEUH
71 | 11AE; D   # HANGUL JONGSEONG TIKEUT
72 | 11AF; L   # HANGUL JONGSEONG RIEUL
73 | 11B0; LG  # HANGUL JONGSEONG RIEUL-KIYEOK
74 | 11B1; LM  # HANGUL JONGSEONG RIEUL-MIEUM
75 | 11B2; LB  # HANGUL JONGSEONG RIEUL-PIEUP
76 | 11B3; LS  # HANGUL JONGSEONG RIEUL-SIOS
77 | 11B4; LT  # HANGUL JONGSEONG RIEUL-THIEUTH
78 | 11B5; LP  # HANGUL JONGSEONG RIEUL-PHIEUPH
79 | 11B6; LH  # HANGUL JONGSEONG RIEUL-HIEUH
80 | 11B7; M   # HANGUL JONGSEONG MIEUM
81 | 11B8; B   # HANGUL JONGSEONG PIEUP
82 | 11B9; BS  # HANGUL JONGSEONG PIEUP-SIOS
83 | 11BA; S   # HANGUL JONGSEONG SIOS
84 | 11BB; SS  # HANGUL JONGSEONG SSANGSIOS
85 | 11BC; NG  # HANGUL JONGSEONG IEUNG
86 | 11BD; J   # HANGUL JONGSEONG CIEUC
87 | 11BE; C   # HANGUL JONGSEONG CHIEUCH
88 | 11BF; K   # HANGUL JONGSEONG KHIEUKH
89 | 11C0; T   # HANGUL JONGSEONG THIEUTH
90 | 11C1; P   # HANGUL JONGSEONG PHIEUPH
91 | 11C2; H   # HANGUL JONGSEONG HIEUH
92 | 
93 | # EOF
94 | 


--------------------------------------------------------------------------------
/conf/unicode/LICENSE.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/conf/unicode/LICENSE.txt


--------------------------------------------------------------------------------
/conf/unicode/NameAliases.txt:
--------------------------------------------------------------------------------
  1 | # NameAliases-13.0.0.txt
  2 | # Date: 2019-09-09, 19:47:00 GMT [KW, LI]
  3 | # © 2019 Unicode®, Inc.
  4 | # For terms of use, see http://www.unicode.org/terms_of_use.html
  5 | #
  6 | # Unicode Character Database
  7 | # For documentation, see http://www.unicode.org/reports/tr44/
  8 | #
  9 | # This file is a normative contributory data file in the
 10 | # Unicode Character Database.
 11 | #
 12 | # This file defines the formal name aliases for Unicode characters.
 13 | #
 14 | # For informative aliases, see NamesList.txt
 15 | #
 16 | # The formal name aliases are divided into five types, each with a distinct label.
 17 | #
 18 | # Type Labels:
 19 | #
 20 | # 1. correction
 21 | #      Corrections for serious problems in the character names
 22 | # 2. control
 23 | #      ISO 6429 names for C0 and C1 control functions, and other
 24 | #      commonly occurring names for control codes
 25 | # 3. alternate
 26 | #      A few widely used alternate names for format characters
 27 | # 4. figment
 28 | #      Several documented labels for C1 control code points which
 29 | #      were never actually approved in any standard
 30 | # 5. abbreviation
 31 | #      Commonly occurring abbreviations (or acronyms) for control codes,
 32 | #      format characters, spaces, and variation selectors
 33 | #
 34 | # The formal name aliases are part of the Unicode character namespace, which
 35 | # includes the character names and the names of named character sequences.
 36 | # The inclusion of ISO 6429 names and other commonly occurring names and
 37 | # abbreviations for control codes and format characters as formal name aliases
 38 | # is to help avoid name collisions between Unicode character names and the
 39 | # labels which commonly appear in text and/or in implementations such as regex, for
 40 | # control codes (which for historical reasons have no Unicode character name)
 41 | # or for format characters.
 42 | #
 43 | # For documentation, see NamesList.html and http://www.unicode.org/reports/tr44/
 44 | #
 45 | # FORMAT
 46 | #
 47 | # Each line has three fields, as described here:
 48 | #
 49 | # First field:  Code point
 50 | # Second field: Alias
 51 | # Third field:  Type
 52 | #
 53 | # The type labels used are defined above. As for property values, comparisons
 54 | # of type labels should ignore case.
 55 | #
 56 | # The type labels can be mapped to other strings for display, if desired.
 57 | #
 58 | # In case multiple aliases are assigned, additional aliases
 59 | # are provided on separate lines. Parsers of this data file should
 60 | # take note that the same code point can (and does) occur more than once.
 61 | #
 62 | # Note that currently the only instances of multiple aliases of the same
 63 | # type for a single code point are either of type "control" or "abbreviation".
 64 | # An alias of type "abbreviation" can, in principle, be added for any code
 65 | # point, although currently aliases of type "correction" do not have
 66 | # any additional aliases of type "abbreviation". Such relationships
 67 | # are not enforced by stability policies.
 68 | #
 69 | #-----------------------------------------------------------------
 70 | 
 71 | 0000;NULL;control
 72 | 0000;NUL;abbreviation
 73 | 0001;START OF HEADING;control
 74 | 0001;SOH;abbreviation
 75 | 0002;START OF TEXT;control
 76 | 0002;STX;abbreviation
 77 | 0003;END OF TEXT;control
 78 | 0003;ETX;abbreviation
 79 | 0004;END OF TRANSMISSION;control
 80 | 0004;EOT;abbreviation
 81 | 0005;ENQUIRY;control
 82 | 0005;ENQ;abbreviation
 83 | 0006;ACKNOWLEDGE;control
 84 | 0006;ACK;abbreviation
 85 | 
 86 | # Note that no formal name alias for the ISO 6429 "BELL" is
 87 | # provided for U+0007, because of the existing name collision
 88 | # with U+1F514 BELL.
 89 | 
 90 | 0007;ALERT;control
 91 | 0007;BEL;abbreviation
 92 | 
 93 | 0008;BACKSPACE;control
 94 | 0008;BS;abbreviation
 95 | 0009;CHARACTER TABULATION;control
 96 | 0009;HORIZONTAL TABULATION;control
 97 | 0009;HT;abbreviation
 98 | 0009;TAB;abbreviation
 99 | 000A;LINE FEED;control
100 | 000A;NEW LINE;control
101 | 000A;END OF LINE;control
102 | 000A;LF;abbreviation
103 | 000A;NL;abbreviation
104 | 000A;EOL;abbreviation
105 | 000B;LINE TABULATION;control
106 | 000B;VERTICAL TABULATION;control
107 | 000B;VT;abbreviation
108 | 000C;FORM FEED;control
109 | 000C;FF;abbreviation
110 | 000D;CARRIAGE RETURN;control
111 | 000D;CR;abbreviation
112 | 000E;SHIFT OUT;control
113 | 000E;LOCKING-SHIFT ONE;control
114 | 000E;SO;abbreviation
115 | 000F;SHIFT IN;control
116 | 000F;LOCKING-SHIFT ZERO;control
117 | 000F;SI;abbreviation
118 | 0010;DATA LINK ESCAPE;control
119 | 0010;DLE;abbreviation
120 | 0011;DEVICE CONTROL ONE;control
121 | 0011;DC1;abbreviation
122 | 0012;DEVICE CONTROL TWO;control
123 | 0012;DC2;abbreviation
124 | 0013;DEVICE CONTROL THREE;control
125 | 0013;DC3;abbreviation
126 | 0014;DEVICE CONTROL FOUR;control
127 | 0014;DC4;abbreviation
128 | 0015;NEGATIVE ACKNOWLEDGE;control
129 | 0015;NAK;abbreviation
130 | 0016;SYNCHRONOUS IDLE;control
131 | 0016;SYN;abbreviation
132 | 0017;END OF TRANSMISSION BLOCK;control
133 | 0017;ETB;abbreviation
134 | 0018;CANCEL;control
135 | 0018;CAN;abbreviation
136 | 0019;END OF MEDIUM;control
137 | 0019;EOM;abbreviation
138 | 001A;SUBSTITUTE;control
139 | 001A;SUB;abbreviation
140 | 001B;ESCAPE;control
141 | 001B;ESC;abbreviation
142 | 001C;INFORMATION SEPARATOR FOUR;control
143 | 001C;FILE SEPARATOR;control
144 | 001C;FS;abbreviation
145 | 001D;INFORMATION SEPARATOR THREE;control
146 | 001D;GROUP SEPARATOR;control
147 | 001D;GS;abbreviation
148 | 001E;INFORMATION SEPARATOR TWO;control
149 | 001E;RECORD SEPARATOR;control
150 | 001E;RS;abbreviation
151 | 001F;INFORMATION SEPARATOR ONE;control
152 | 001F;UNIT SEPARATOR;control
153 | 001F;US;abbreviation
154 | 0020;SP;abbreviation
155 | 007F;DELETE;control
156 | 007F;DEL;abbreviation
157 | 
158 | # PADDING CHARACTER and HIGH OCTET PRESET represent
159 | # architectural concepts initially proposed for early
160 | # drafts of ISO/IEC 10646-1. They were never actually
161 | # approved or standardized: hence their designation
162 | # here as the "figment" type. Formal name aliases
163 | # (and corresponding abbreviations) for these code
164 | # points are included here because these names leaked
165 | # out from the draft documents and were published in
166 | # at least one RFC whose names for code points was
167 | # implemented in Perl regex expressions.
168 | 
169 | 0080;PADDING CHARACTER;figment
170 | 0080;PAD;abbreviation
171 | 0081;HIGH OCTET PRESET;figment
172 | 0081;HOP;abbreviation
173 | 
174 | 0082;BREAK PERMITTED HERE;control
175 | 0082;BPH;abbreviation
176 | 0083;NO BREAK HERE;control
177 | 0083;NBH;abbreviation
178 | 0084;INDEX;control
179 | 0084;IND;abbreviation
180 | 0085;NEXT LINE;control
181 | 0085;NEL;abbreviation
182 | 0086;START OF SELECTED AREA;control
183 | 0086;SSA;abbreviation
184 | 0087;END OF SELECTED AREA;control
185 | 0087;ESA;abbreviation
186 | 0088;CHARACTER TABULATION SET;control
187 | 0088;HORIZONTAL TABULATION SET;control
188 | 0088;HTS;abbreviation
189 | 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
190 | 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
191 | 0089;HTJ;abbreviation
192 | 008A;LINE TABULATION SET;control
193 | 008A;VERTICAL TABULATION SET;control
194 | 008A;VTS;abbreviation
195 | 008B;PARTIAL LINE FORWARD;control
196 | 008B;PARTIAL LINE DOWN;control
197 | 008B;PLD;abbreviation
198 | 008C;PARTIAL LINE BACKWARD;control
199 | 008C;PARTIAL LINE UP;control
200 | 008C;PLU;abbreviation
201 | 008D;REVERSE LINE FEED;control
202 | 008D;REVERSE INDEX;control
203 | 008D;RI;abbreviation
204 | 008E;SINGLE SHIFT TWO;control
205 | 008E;SINGLE-SHIFT-2;control
206 | 008E;SS2;abbreviation
207 | 008F;SINGLE SHIFT THREE;control
208 | 008F;SINGLE-SHIFT-3;control
209 | 008F;SS3;abbreviation
210 | 0090;DEVICE CONTROL STRING;control
211 | 0090;DCS;abbreviation
212 | 0091;PRIVATE USE ONE;control
213 | 0091;PRIVATE USE-1;control
214 | 0091;PU1;abbreviation
215 | 0092;PRIVATE USE TWO;control
216 | 0092;PRIVATE USE-2;control
217 | 0092;PU2;abbreviation
218 | 0093;SET TRANSMIT STATE;control
219 | 0093;STS;abbreviation
220 | 0094;CANCEL CHARACTER;control
221 | 0094;CCH;abbreviation
222 | 0095;MESSAGE WAITING;control
223 | 0095;MW;abbreviation
224 | 0096;START OF GUARDED AREA;control
225 | 0096;START OF PROTECTED AREA;control
226 | 0096;SPA;abbreviation
227 | 0097;END OF GUARDED AREA;control
228 | 0097;END OF PROTECTED AREA;control
229 | 0097;EPA;abbreviation
230 | 0098;START OF STRING;control
231 | 0098;SOS;abbreviation
232 | 
233 | # SINGLE GRAPHIC CHARACTER INTRODUCER is another
234 | # architectural concept from early drafts of ISO/IEC 10646-1
235 | # which was never approved and standardized.
236 | 
237 | 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
238 | 0099;SGC;abbreviation
239 | 
240 | 009A;SINGLE CHARACTER INTRODUCER;control
241 | 009A;SCI;abbreviation
242 | 009B;CONTROL SEQUENCE INTRODUCER;control
243 | 009B;CSI;abbreviation
244 | 009C;STRING TERMINATOR;control
245 | 009C;ST;abbreviation
246 | 009D;OPERATING SYSTEM COMMAND;control
247 | 009D;OSC;abbreviation
248 | 009E;PRIVACY MESSAGE;control
249 | 009E;PM;abbreviation
250 | 009F;APPLICATION PROGRAM COMMAND;control
251 | 009F;APC;abbreviation
252 | 00A0;NBSP;abbreviation
253 | 00AD;SHY;abbreviation
254 | 01A2;LATIN CAPITAL LETTER GHA;correction
255 | 01A3;LATIN SMALL LETTER GHA;correction
256 | 034F;CGJ;abbreviation
257 | 061C;ALM;abbreviation
258 | 0709;SYRIAC SUBLINEAR COLON SKEWED LEFT;correction
259 | 0CDE;KANNADA LETTER LLLA;correction
260 | 0E9D;LAO LETTER FO FON;correction
261 | 0E9F;LAO LETTER FO FAY;correction
262 | 0EA3;LAO LETTER RO;correction
263 | 0EA5;LAO LETTER LO;correction
264 | 0FD0;TIBETAN MARK BKA- SHOG GI MGO RGYAN;correction
265 | 11EC;HANGUL JONGSEONG YESIEUNG-KIYEOK;correction
266 | 11ED;HANGUL JONGSEONG YESIEUNG-SSANGKIYEOK;correction
267 | 11EE;HANGUL JONGSEONG SSANGYESIEUNG;correction
268 | 11EF;HANGUL JONGSEONG YESIEUNG-KHIEUKH;correction
269 | 180B;FVS1;abbreviation
270 | 180C;FVS2;abbreviation
271 | 180D;FVS3;abbreviation
272 | 180E;MVS;abbreviation
273 | 200B;ZWSP;abbreviation
274 | 200C;ZWNJ;abbreviation
275 | 200D;ZWJ;abbreviation
276 | 200E;LRM;abbreviation
277 | 200F;RLM;abbreviation
278 | 202A;LRE;abbreviation
279 | 202B;RLE;abbreviation
280 | 202C;PDF;abbreviation
281 | 202D;LRO;abbreviation
282 | 202E;RLO;abbreviation
283 | 202F;NNBSP;abbreviation
284 | 205F;MMSP;abbreviation
285 | 2060;WJ;abbreviation
286 | 2066;LRI;abbreviation
287 | 2067;RLI;abbreviation
288 | 2068;FSI;abbreviation
289 | 2069;PDI;abbreviation
290 | 2118;WEIERSTRASS ELLIPTIC FUNCTION;correction
291 | 2448;MICR ON US SYMBOL;correction
292 | 2449;MICR DASH SYMBOL;correction
293 | 2B7A;LEFTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE VERTICAL STROKE;correction
294 | 2B7C;RIGHTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE VERTICAL STROKE;correction
295 | A015;YI SYLLABLE ITERATION MARK;correction
296 | FE00;VS1;abbreviation
297 | FE01;VS2;abbreviation
298 | FE02;VS3;abbreviation
299 | FE03;VS4;abbreviation
300 | FE04;VS5;abbreviation
301 | FE05;VS6;abbreviation
302 | FE06;VS7;abbreviation
303 | FE07;VS8;abbreviation
304 | FE08;VS9;abbreviation
305 | FE09;VS10;abbreviation
306 | FE0A;VS11;abbreviation
307 | FE0B;VS12;abbreviation
308 | FE0C;VS13;abbreviation
309 | FE0D;VS14;abbreviation
310 | FE0E;VS15;abbreviation
311 | FE0F;VS16;abbreviation
312 | FE18;PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET;correction
313 | FEFF;BYTE ORDER MARK;alternate
314 | FEFF;BOM;abbreviation
315 | FEFF;ZWNBSP;abbreviation
316 | 122D4;CUNEIFORM SIGN NU11 TENU;correction
317 | 122D5;CUNEIFORM SIGN NU11 OVER NU11 BUR OVER BUR;correction
318 | 16E56;MEDEFAIDRIN CAPITAL LETTER H;correction
319 | 16E57;MEDEFAIDRIN CAPITAL LETTER NG;correction
320 | 16E76;MEDEFAIDRIN SMALL LETTER H;correction
321 | 16E77;MEDEFAIDRIN SMALL LETTER NG;correction
322 | 1B001;HENTAIGANA LETTER E-1;correction
323 | 1D0C5;BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS;correction
324 | E0100;VS17;abbreviation
325 | E0101;VS18;abbreviation
326 | E0102;VS19;abbreviation
327 | E0103;VS20;abbreviation
328 | E0104;VS21;abbreviation
329 | E0105;VS22;abbreviation
330 | E0106;VS23;abbreviation
331 | E0107;VS24;abbreviation
332 | E0108;VS25;abbreviation
333 | E0109;VS26;abbreviation
334 | E010A;VS27;abbreviation
335 | E010B;VS28;abbreviation
336 | E010C;VS29;abbreviation
337 | E010D;VS30;abbreviation
338 | E010E;VS31;abbreviation
339 | E010F;VS32;abbreviation
340 | E0110;VS33;abbreviation
341 | E0111;VS34;abbreviation
342 | E0112;VS35;abbreviation
343 | E0113;VS36;abbreviation
344 | E0114;VS37;abbreviation
345 | E0115;VS38;abbreviation
346 | E0116;VS39;abbreviation
347 | E0117;VS40;abbreviation
348 | E0118;VS41;abbreviation
349 | E0119;VS42;abbreviation
350 | E011A;VS43;abbreviation
351 | E011B;VS44;abbreviation
352 | E011C;VS45;abbreviation
353 | E011D;VS46;abbreviation
354 | E011E;VS47;abbreviation
355 | E011F;VS48;abbreviation
356 | E0120;VS49;abbreviation
357 | E0121;VS50;abbreviation
358 | E0122;VS51;abbreviation
359 | E0123;VS52;abbreviation
360 | E0124;VS53;abbreviation
361 | E0125;VS54;abbreviation
362 | E0126;VS55;abbreviation
363 | E0127;VS56;abbreviation
364 | E0128;VS57;abbreviation
365 | E0129;VS58;abbreviation
366 | E012A;VS59;abbreviation
367 | E012B;VS60;abbreviation
368 | E012C;VS61;abbreviation
369 | E012D;VS62;abbreviation
370 | E012E;VS63;abbreviation
371 | E012F;VS64;abbreviation
372 | E0130;VS65;abbreviation
373 | E0131;VS66;abbreviation
374 | E0132;VS67;abbreviation
375 | E0133;VS68;abbreviation
376 | E0134;VS69;abbreviation
377 | E0135;VS70;abbreviation
378 | E0136;VS71;abbreviation
379 | E0137;VS72;abbreviation
380 | E0138;VS73;abbreviation
381 | E0139;VS74;abbreviation
382 | E013A;VS75;abbreviation
383 | E013B;VS76;abbreviation
384 | E013C;VS77;abbreviation
385 | E013D;VS78;abbreviation
386 | E013E;VS79;abbreviation
387 | E013F;VS80;abbreviation
388 | E0140;VS81;abbreviation
389 | E0141;VS82;abbreviation
390 | E0142;VS83;abbreviation
391 | E0143;VS84;abbreviation
392 | E0144;VS85;abbreviation
393 | E0145;VS86;abbreviation
394 | E0146;VS87;abbreviation
395 | E0147;VS88;abbreviation
396 | E0148;VS89;abbreviation
397 | E0149;VS90;abbreviation
398 | E014A;VS91;abbreviation
399 | E014B;VS92;abbreviation
400 | E014C;VS93;abbreviation
401 | E014D;VS94;abbreviation
402 | E014E;VS95;abbreviation
403 | E014F;VS96;abbreviation
404 | E0150;VS97;abbreviation
405 | E0151;VS98;abbreviation
406 | E0152;VS99;abbreviation
407 | E0153;VS100;abbreviation
408 | E0154;VS101;abbreviation
409 | E0155;VS102;abbreviation
410 | E0156;VS103;abbreviation
411 | E0157;VS104;abbreviation
412 | E0158;VS105;abbreviation
413 | E0159;VS106;abbreviation
414 | E015A;VS107;abbreviation
415 | E015B;VS108;abbreviation
416 | E015C;VS109;abbreviation
417 | E015D;VS110;abbreviation
418 | E015E;VS111;abbreviation
419 | E015F;VS112;abbreviation
420 | E0160;VS113;abbreviation
421 | E0161;VS114;abbreviation
422 | E0162;VS115;abbreviation
423 | E0163;VS116;abbreviation
424 | E0164;VS117;abbreviation
425 | E0165;VS118;abbreviation
426 | E0166;VS119;abbreviation
427 | E0167;VS120;abbreviation
428 | E0168;VS121;abbreviation
429 | E0169;VS122;abbreviation
430 | E016A;VS123;abbreviation
431 | E016B;VS124;abbreviation
432 | E016C;VS125;abbreviation
433 | E016D;VS126;abbreviation
434 | E016E;VS127;abbreviation
435 | E016F;VS128;abbreviation
436 | E0170;VS129;abbreviation
437 | E0171;VS130;abbreviation
438 | E0172;VS131;abbreviation
439 | E0173;VS132;abbreviation
440 | E0174;VS133;abbreviation
441 | E0175;VS134;abbreviation
442 | E0176;VS135;abbreviation
443 | E0177;VS136;abbreviation
444 | E0178;VS137;abbreviation
445 | E0179;VS138;abbreviation
446 | E017A;VS139;abbreviation
447 | E017B;VS140;abbreviation
448 | E017C;VS141;abbreviation
449 | E017D;VS142;abbreviation
450 | E017E;VS143;abbreviation
451 | E017F;VS144;abbreviation
452 | E0180;VS145;abbreviation
453 | E0181;VS146;abbreviation
454 | E0182;VS147;abbreviation
455 | E0183;VS148;abbreviation
456 | E0184;VS149;abbreviation
457 | E0185;VS150;abbreviation
458 | E0186;VS151;abbreviation
459 | E0187;VS152;abbreviation
460 | E0188;VS153;abbreviation
461 | E0189;VS154;abbreviation
462 | E018A;VS155;abbreviation
463 | E018B;VS156;abbreviation
464 | E018C;VS157;abbreviation
465 | E018D;VS158;abbreviation
466 | E018E;VS159;abbreviation
467 | E018F;VS160;abbreviation
468 | E0190;VS161;abbreviation
469 | E0191;VS162;abbreviation
470 | E0192;VS163;abbreviation
471 | E0193;VS164;abbreviation
472 | E0194;VS165;abbreviation
473 | E0195;VS166;abbreviation
474 | E0196;VS167;abbreviation
475 | E0197;VS168;abbreviation
476 | E0198;VS169;abbreviation
477 | E0199;VS170;abbreviation
478 | E019A;VS171;abbreviation
479 | E019B;VS172;abbreviation
480 | E019C;VS173;abbreviation
481 | E019D;VS174;abbreviation
482 | E019E;VS175;abbreviation
483 | E019F;VS176;abbreviation
484 | E01A0;VS177;abbreviation
485 | E01A1;VS178;abbreviation
486 | E01A2;VS179;abbreviation
487 | E01A3;VS180;abbreviation
488 | E01A4;VS181;abbreviation
489 | E01A5;VS182;abbreviation
490 | E01A6;VS183;abbreviation
491 | E01A7;VS184;abbreviation
492 | E01A8;VS185;abbreviation
493 | E01A9;VS186;abbreviation
494 | E01AA;VS187;abbreviation
495 | E01AB;VS188;abbreviation
496 | E01AC;VS189;abbreviation
497 | E01AD;VS190;abbreviation
498 | E01AE;VS191;abbreviation
499 | E01AF;VS192;abbreviation
500 | E01B0;VS193;abbreviation
501 | E01B1;VS194;abbreviation
502 | E01B2;VS195;abbreviation
503 | E01B3;VS196;abbreviation
504 | E01B4;VS197;abbreviation
505 | E01B5;VS198;abbreviation
506 | E01B6;VS199;abbreviation
507 | E01B7;VS200;abbreviation
508 | E01B8;VS201;abbreviation
509 | E01B9;VS202;abbreviation
510 | E01BA;VS203;abbreviation
511 | E01BB;VS204;abbreviation
512 | E01BC;VS205;abbreviation
513 | E01BD;VS206;abbreviation
514 | E01BE;VS207;abbreviation
515 | E01BF;VS208;abbreviation
516 | E01C0;VS209;abbreviation
517 | E01C1;VS210;abbreviation
518 | E01C2;VS211;abbreviation
519 | E01C3;VS212;abbreviation
520 | E01C4;VS213;abbreviation
521 | E01C5;VS214;abbreviation
522 | E01C6;VS215;abbreviation
523 | E01C7;VS216;abbreviation
524 | E01C8;VS217;abbreviation
525 | E01C9;VS218;abbreviation
526 | E01CA;VS219;abbreviation
527 | E01CB;VS220;abbreviation
528 | E01CC;VS221;abbreviation
529 | E01CD;VS222;abbreviation
530 | E01CE;VS223;abbreviation
531 | E01CF;VS224;abbreviation
532 | E01D0;VS225;abbreviation
533 | E01D1;VS226;abbreviation
534 | E01D2;VS227;abbreviation
535 | E01D3;VS228;abbreviation
536 | E01D4;VS229;abbreviation
537 | E01D5;VS230;abbreviation
538 | E01D6;VS231;abbreviation
539 | E01D7;VS232;abbreviation
540 | E01D8;VS233;abbreviation
541 | E01D9;VS234;abbreviation
542 | E01DA;VS235;abbreviation
543 | E01DB;VS236;abbreviation
544 | E01DC;VS237;abbreviation
545 | E01DD;VS238;abbreviation
546 | E01DE;VS239;abbreviation
547 | E01DF;VS240;abbreviation
548 | E01E0;VS241;abbreviation
549 | E01E1;VS242;abbreviation
550 | E01E2;VS243;abbreviation
551 | E01E3;VS244;abbreviation
552 | E01E4;VS245;abbreviation
553 | E01E5;VS246;abbreviation
554 | E01E6;VS247;abbreviation
555 | E01E7;VS248;abbreviation
556 | E01E8;VS249;abbreviation
557 | E01E9;VS250;abbreviation
558 | E01EA;VS251;abbreviation
559 | E01EB;VS252;abbreviation
560 | E01EC;VS253;abbreviation
561 | E01ED;VS254;abbreviation
562 | E01EE;VS255;abbreviation
563 | E01EF;VS256;abbreviation
564 | 
565 | # EOF
566 | 


--------------------------------------------------------------------------------
/conf/unicode/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | The files found here are copies of the Unicode files found on the Unicode
 3 | website. We only include the few files that we parse. When a new version
 4 | of Unicode comes out, we should be able to just replace those files and
 5 | parse the new version. Also, we parse at installation time, so we can
 6 | update an existing installation with a simple `apt-get upgrade`.
 7 | 
 8 | See: https://www.unicode.org/Public/
 9 | 
10 | Select a version and then `ucd`.
11 | 
12 | 


--------------------------------------------------------------------------------
/debian/changelog:
--------------------------------------------------------------------------------
  1 | libutf8 (1.0.15.2~bionic) bionic; urgency=high
  2 | 
  3 |   * Bumped build version to rebuild on Launchpad.
  4 | 
  5 |  -- Alexis Wilke <alexis@m2osw.com>  Fri, 10 Nov 2023 15:24:41 -0800
  6 | 
  7 | libutf8 (1.0.15.1~bionic) bionic; urgency=high
  8 | 
  9 |   * Bumped build version to rebuild on Launchpad.
 10 | 
 11 |  -- Alexis Wilke <alexis@m2osw.com>  Tue, 07 Nov 2023 06:03:57 -0800
 12 | 
 13 | libutf8 (1.0.15.0~jammy) jammy; urgency=high
 14 | 
 15 |   * Moved find() of doxygen in the doc/CMakeLists.txt file.
 16 |   * Changed the NOT_A_CHARACTER value to -2 to distinguish it from EOS.
 17 |   * Made utf8_iterator::operator * return NOT_A_CHARACTER on an error.
 18 |   * Defined the traits in-place since std::iterator is deprecated.
 19 |   * Removed overload of ostream char32_t characters.
 20 |   * Added a show-unicode tool to display codes from character.
 21 |   * Added function to fix UTF-8 strings by replacing invalid characters.
 22 |   * Allow for += of the '\0' character.
 23 |   * Added UTF-16 functions & tests.
 24 |   * Updated the tests accordingly and added more for better coverage.
 25 |   * Added missing #include <cstdint>.
 26 |   * Applied a hack so the tests compile under lunar.
 27 |   * Updated compat to the latest (v15)
 28 |   * Did some work on the UCD data (parse decomposition, read file properly...)
 29 |   * Removed boost-dev as a dependency.
 30 | 
 31 |  -- Alexis Wilke <alexis@m2osw.com>  Sun, 05 Nov 2023 08:05:54 -0800
 32 | 
 33 | libutf8 (1.0.14.0~bionic) bionic; urgency=high
 34 | 
 35 |   * Added operator+ for char32_t/string where string is viewed as UTF-8.
 36 | 
 37 |  -- Alexis Wilke <alexis@m2osw.com>  Sun, 30 Oct 2022 21:24:12 -0700
 38 | 
 39 | libutf8 (1.0.13.0~bionic) bionic; urgency=high
 40 | 
 41 |   * Added a verify_file_inheritance() in tools.
 42 | 
 43 |  -- Alexis Wilke <alexis@m2osw.com>  Mon, 11 Jul 2022 07:42:16 -0700
 44 | 
 45 | libutf8 (1.0.12.1~bionic) bionic; urgency=high
 46 | 
 47 |   * Updated the compat to v10.
 48 | 
 49 |  -- Alexis Wilke <alexis@m2osw.com>  Thu, 19 May 2022 20:28:28 -0700
 50 | 
 51 | libutf8 (1.0.12.0~bionic) bionic; urgency=high
 52 | 
 53 |   * Cleaned up the cmake file.
 54 | 
 55 |  -- Alexis Wilke <alexis@m2osw.com>  Thu, 19 May 2022 18:09:49 -0700
 56 | 
 57 | libutf8 (1.0.11.2~bionic) bionic; urgency=high
 58 | 
 59 |   * Bumped build version to rebuild on Launchpad.
 60 | 
 61 |  -- Alexis Wilke <alexis@m2osw.com>  Fri, 04 Mar 2022 22:36:44 -0800
 62 | 
 63 | libutf8 (1.0.11.1~bionic) bionic; urgency=high
 64 | 
 65 |   * Bumped build version to rebuild on Launchpad.
 66 | 
 67 |  -- Alexis Wilke <alexis@m2osw.com>  Sun, 13 Feb 2022 12:35:15 -0800
 68 | 
 69 | libutf8 (1.0.11.0~bionic) bionic; urgency=high
 70 | 
 71 |   * Added a clear() for the good flag in the utf8_iterator.
 72 |   * Fixed the string test, the exception now includes "libutf8_exception: ".
 73 |   * Correctly test the good flag status in cases where the iterator fails.
 74 | 
 75 |  -- Alexis Wilke <alexis@m2osw.com>  Mon, 27 Sep 2021 18:08:13 -0700
 76 | 
 77 | libutf8 (1.0.10.0~bionic) bionic; urgency=high
 78 | 
 79 |   * Updated the tests to match the new libexcept library setup.
 80 | 
 81 |  -- Alexis Wilke <alexis@m2osw.com>  Sat, 28 Aug 2021 18:23:57 -0700
 82 | 
 83 | libutf8 (1.0.9.0~bionic) bionic; urgency=high
 84 | 
 85 |   * Slowly adding Unicode to canonicalize UTF-8 strings.
 86 |   * Added SnapDev as a dependency to implement the Unicode parser.
 87 |   * Added a tool to run the parser (which is part of the library).
 88 |   * Updated the exception declarations with our macros.
 89 |   * Cleaned up licenses & copyrights.
 90 | 
 91 |  -- Alexis Wilke <alexis@m2osw.com>  Tue, 24 Aug 2021 15:49:14 -0700
 92 | 
 93 | libutf8 (1.0.8.1~bionic) bionic; urgency=high
 94 | 
 95 |   * Bumped build version to rebuild on Launchpad.
 96 | 
 97 |  -- Alexis Wilke <alexis@m2osw.com>  Fri, 04 Jun 2021 18:28:59 -0700
 98 | 
 99 | libutf8 (1.0.8.0~bionic) bionic; urgency=high
100 | 
101 |   * Fixed the name of a function in an exception message.
102 |   * Updated the mk script.
103 | 
104 |  -- Alexis Wilke <alexis@m2osw.com>  Tue, 01 Jun 2021 17:40:30 -0700
105 | 
106 | libutf8 (1.0.7.2~bionic) bionic; urgency=high
107 | 
108 |   * Bumped version to recompile against the newer versions.
109 | 
110 |  -- Alexis Wilke <alexis@m2osw.com>  Sat, 15 May 2021 09:33:12 -0700
111 | 
112 | libutf8 (1.0.7.1~bionic) bionic; urgency=high
113 | 
114 |   * Bumped version to recompile against the newer version of snapcatch2.
115 | 
116 |  -- Alexis Wilke <alexis@m2osw.com>  Fri, 08 Jan 2021 22:13:35 -0800
117 | 
118 | libutf8 (1.0.7.0~bionic) bionic; urgency=high
119 | 
120 |   * Changed the EOF of the iterator into an EOS so it works as expected with
121 |     the newest versions of catch2 (proper signedness for char32_t).
122 |   * Fixed one assignment from L'0' to u'0'.
123 | 
124 |  -- Alexis Wilke <alexis@m2osw.com>  Tue, 26 Apr 2020 18:25:27 -0800
125 | 
126 | libutf8 (1.0.6.2~bionic) bionic; urgency=high
127 | 
128 |   * Create a bionic version.
129 | 
130 |  -- Alexis Wilke <alexis@m2osw.com>  Thu, 30 Apr 2020 20:59:23 -0800
131 | 
132 | libutf8 (1.0.6.0~xenial) xenial; urgency=high
133 | 
134 |   * Added the libutf8::case_insensitive_string type.
135 |   * Fixed the mk so it generates an error on an unknown command line option.
136 |   * Added a test so we can make sure that the case_insensitive_string works.
137 |   * Fixed the existing test tag names, we have to have the square brackets.
138 |   * Moved a couple of validation functions from the libsnapwebsites to here.
139 |   * Broke up the tests into a character and a string so we can just validate a
140 |     standalone character too.
141 |   * Added another validation for UTF-32 strings and characters.
142 |   * Allow for a specific test to be run with `mk -t <name>`.
143 |   * Allow for a nullptr when calling start_with_bom().
144 |   * Added a new exception for unsupported features.
145 |   * Added a function to check whether a character is a surrogate and which one.
146 |   * Added a to_u8string() with std::wstring as input.
147 |   * Added a to_u8string() with wchar_t as input.
148 |   * Added a to_u8string() with char16_t as input.
149 | 
150 |  -- Alexis Wilke <alexis@m2osw.com>  Wed, 17 Jul 2019 19:58:43 -0800
151 | 
152 | libutf8 (1.0.5.1~xenial) xenial; urgency=high
153 | 
154 |   * Bumped version to force a rebuild, just in case.
155 | 
156 |  -- Alexis Wilke <alexis@m2osw.com>  Wed, 17 Jul 2019 19:58:43 -0800
157 | 
158 | libutf8 (1.0.5.0~xenial) xenial; urgency=high
159 | 
160 |   * Added a way to create an iterator at the end.
161 |   * Added == and != with another utf8_iterator.
162 | 
163 |  -- Alexis Wilke <alexis@m2osw.com>  Sat, 29 Jun 2019 05:05:11 -0800
164 | 
165 | libutf8 (1.0.4.0~xenial) xenial; urgency=high
166 | 
167 |   * Added a PROJECT_BRIEF description.
168 |   * Added in=C++ to the MAPPING_EXTENSION.
169 |   * Updated the doxy file to 1.8.11.
170 | 
171 |  -- Alexis Wilke <alexis@m2osw.com>  Tue, 11 Jun 2019 23:55:25 -0800
172 | 
173 | libutf8 (1.0.3.0~xenial) xenial; urgency=high
174 | 
175 |   * Moved the catch2 implementation to our `snapcatch2.hpp` header instead.
176 |   * Updated the tests accordingly.
177 |   * Cleaned up various declarations in each file.
178 |   * Moved our `obj_setenv()` to `snapdev`.
179 | 
180 |  -- Alexis Wilke <alexis@m2osw.com>  Sat,  1 Jun 2019 00:24:36 -0800
181 | 
182 | libutf8 (1.0.2.0~xenial) xenial; urgency=high
183 | 
184 |   * Got the test coverage back to 100%.
185 |   * Renamed the tests without the "unittest_" introducer.
186 |   * Added the `start_with_bom()` function and corresponding tests.
187 |   * Fixed standalone characters, the introducer is U for char32_t characters.
188 | 
189 |  -- Alexis Wilke <alexis@m2osw.com>  Tue, 28 May 2019 18:09:01 -0800
190 | 
191 | libutf8 (1.0.1.0~xenial) xenial; urgency=high
192 | 
193 |   * Implemented the to and from UTF-8 and UTF-16 encoding.
194 |   * Fixed the u8casecmp() test function which would test 0xD800 to 0xDFFF
195 |     as valid characters.
196 |   * Added a new exception so we can distinguish whether an encoding or a
197 |     decoding went wrong.
198 |   * Optimized the UTF-32 to UTF-8 conversion, i.e. code bytes under 0x80 get
199 |     copied as is.
200 |   * Fixed the '\0' conversion, it would not get added to the output string.
201 |   * Added a to_u8string() from a char32_t so we get an std::string as output.
202 |   * Generate errors when the mbstowc() or wctombs() functions fail.
203 | 
204 |  -- Alexis Wilke <alexis@m2osw.com>  Tue, 28 May 2019 01:04:30 -0800
205 | 
206 | libutf8 (1.0.0.3~xenial) xenial; urgency=high
207 | 
208 |   * Added the cmake folder and files.
209 |   * Added the README.md and TODO.txt files to the debian/docs.
210 |   * Removed the "debian/tmp/..." from the `debian/libutf8-doc.install`.
211 |   * Added a `-i` command line option to mk to install the library.
212 |   * Added a call to prevent collection of stack trace in our tests.
213 | 
214 |  -- Alexis Wilke <alexis@m2osw.com>  Sat, 25 May 2019 20:54:23 -0800
215 | 
216 | libutf8 (1.0.0.2~xenial) xenial; urgency=high
217 | 
218 |   * Try fixing dependencies, the version may need to include ~xenial.
219 |   * Added boost-dev as a dependency as we use it in our tests.
220 | 
221 |  -- Alexis Wilke <alexis@m2osw.com>  Sat, 25 May 2019 20:54:23 -0800
222 | 
223 | libutf8 (1.0.0.1~xenial) xenial; urgency=high
224 | 
225 |   * Enhanced the README.md
226 |   * Bumped snapcatch2 dependency version to 2.7.2.10.
227 | 
228 |  -- Alexis Wilke <alexis@m2osw.com>  Mon, 20 May 2019 01:23:11 -0800
229 | 
230 | libutf8 (1.0.0.0~xenial) xenial; urgency=high
231 | 
232 |   * Added my wpkg libutf8 library as a Snap! C++ project.
233 | 
234 |  -- Alexis Wilke <alexis@m2osw.com>  Mon, 20 May 2019 01:23:11 -0800
235 | 


--------------------------------------------------------------------------------
/debian/compat:
--------------------------------------------------------------------------------
1 | 15
2 | 


--------------------------------------------------------------------------------
/debian/control:
--------------------------------------------------------------------------------
 1 | Source: libutf8
 2 | Priority: extra
 3 | Maintainer: R. Douglas Barbieri <doug@dooglio.net>
 4 | Build-Depends: cmake,
 5 |     debhelper,
 6 |     doxygen,
 7 |     graphviz,
 8 |     libexcept-dev (>= 1.1.0.0~jammy),
 9 |     snapcatch2 (>= 2.7.2.10~jammy),
10 |     snapcmakemodules (>= 1.0.35.3~jammy),
11 |     snapdev (>= 1.1.16.0~jammy)
12 | Standards-Version: 3.9.4
13 | Section: libs
14 | Homepage: https://snapwebsites.org/
15 | Vcs-Git: https://github.com/m2osw/snapcpp.git
16 | Vcs-Browser: https://github.com/m2osw/libutf8
17 | 
18 | Package: libutf8-dev
19 | Section: libdevel
20 | Architecture: any
21 | Depends: libutf8 (= ${binary:Version}), ${misc:Depends}
22 | Description: Development package for the C++ libutf8 library.
23 |  This library provides functions to convert between UTF-8 and UTF-32 characters.
24 | 
25 | Package: libutf8-doc
26 | Section: doc
27 | Architecture: all
28 | Depends: ${misc:Depends}
29 | Description: Documentation for the C++ libutf8 library.
30 |  This library provides functions to convert between UTF-8 and UTF-32 characters.
31 | 
32 | Package: libutf8
33 | Section: libs
34 | Architecture: any
35 | Depends: ${shlibs:Depends}, ${misc:Depends}
36 | Description: C++ library for UTF-8/UTF-32 handling.
37 |  This library provides functions to convert between UTF-8 and UTF-32 characters.
38 | 
39 | # vim: ts=4 sw=4 et
40 | 


--------------------------------------------------------------------------------
/debian/copyright:
--------------------------------------------------------------------------------
 1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
 2 | Upstream-Name: libutf8
 3 | Source: https://github.com/m2osw/libutf8
 4 | 
 5 | Files: *
 6 | Copyright: 2006-2019 Made to Order Software <contact@m2osw.com>
 7 |            2006-2019 Alexis Wilke <alexis@m2osw.com>
 8 |            2006-2019 R. Douglas Barbieri <doug@m2osw.com>
 9 | License: GPL-2+
10 |  This package is free software; you can redistribute it and/or modify
11 |  it under the terms of the GNU General Public License as published by
12 |  the Free Software Foundation; either version 2 of the License, or
13 |  (at your option) any later version.
14 |  .
15 |  This package is distributed in the hope that it will be useful,
16 |  but WITHOUT ANY WARRANTY; without even the implied warranty of
17 |  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 |  GNU General Public License for more details.
19 |  .
20 |  You should have received a copy of the GNU General Public License
21 |  along with this program. If not, see <https://www.gnu.org/licenses/>
22 |  .
23 |  On Debian systems, the complete text of the GNU General
24 |  Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
25 | 
26 | Files: conf/unicode/*
27 | Copyright: 1991-2021 Unicode, Inc. All rights reserved.
28 | License: Unicode
29 |  UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
30 |  .
31 |  See Terms of Use for definitions of Unicode Inc.'s
32 |  Data Files and Software.
33 |  .
34 |  NOTICE TO USER: Carefully read the following legal agreement.
35 |  BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
36 |  DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
37 |  YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
38 |  TERMS AND CONDITIONS OF THIS AGREEMENT.
39 |  IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
40 |  THE DATA FILES OR SOFTWARE.
41 |  .
42 |  COPYRIGHT AND PERMISSION NOTICE
43 |  .
44 |  Copyright (c) 1991-2021 Unicode, Inc. All rights reserved.
45 |  Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
46 |  .
47 |  Permission is hereby granted, free of charge, to any person obtaining
48 |  a copy of the Unicode data files and any associated documentation
49 |  (the "Data Files") or Unicode software and any associated documentation
50 |  (the "Software") to deal in the Data Files or Software
51 |  without restriction, including without limitation the rights to use,
52 |  copy, modify, merge, publish, distribute, and/or sell copies of
53 |  the Data Files or Software, and to permit persons to whom the Data Files
54 |  or Software are furnished to do so, provided that either
55 |  (a) this copyright and permission notice appear with all copies
56 |  of the Data Files or Software, or
57 |  (b) this copyright and permission notice appear in associated
58 |  Documentation.
59 |  .
60 |  THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
61 |  ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
62 |  WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
63 |  NONINFRINGEMENT OF THIRD PARTY RIGHTS.
64 |  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
65 |  NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
66 |  DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
67 |  DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
68 |  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
69 |  PERFORMANCE OF THE DATA FILES OR SOFTWARE.
70 |  .
71 |  Except as contained in this notice, the name of a copyright holder
72 |  shall not be used in advertising or otherwise to promote the sale,
73 |  use or other dealings in these Data Files or Software without prior
74 |  written authorization of the copyright holder.
75 |  .
76 |  See also: https://www.unicode.org/license.html
77 | 
78 | # Please also look if there are files or directories which have a
79 | # different copyright/license attached and list them here.
80 | # Please avoid to pick license terms that are more restrictive than the
81 | # packaged work, as it may make Debian's contributions unacceptable upstream.
82 | 


--------------------------------------------------------------------------------
/debian/docs:
--------------------------------------------------------------------------------
1 | LICENSE.txt
2 | README.md
3 | TODO.txt
4 | 


--------------------------------------------------------------------------------
/debian/libutf8-dev.install:
--------------------------------------------------------------------------------
1 | usr/include/*
2 | usr/lib/lib*.so
3 | usr/share/cmake/*
4 | 


--------------------------------------------------------------------------------
/debian/libutf8-doc.install:
--------------------------------------------------------------------------------
1 | usr/share/doc/libutf8/html/*        usr/share/doc/libutf8-doc/html/
2 | 


--------------------------------------------------------------------------------
/debian/libutf8.install:
--------------------------------------------------------------------------------
1 | usr/bin
2 | usr/lib/lib*.so.*
3 | usr/share/libutf8
4 | 


--------------------------------------------------------------------------------
/debian/rules:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/make -f
 2 | # -*- makefile -*-
 3 | # Sample debian/rules that uses debhelper.
 4 | # This file was originally written by Joey Hess and Craig Small.
 5 | # As a special exception, when this file is copied by dh-make into a
 6 | # dh-make output file, you may use that output file without restriction.
 7 | # This special exception was added by Craig Small in version 0.37 of dh-make.
 8 | 
 9 | # Uncomment this to turn on verbose mode.
10 | #export DH_VERBOSE=1
11 | 
12 | %:
13 | 	dh $@ --parallel
14 | 
15 | override_dh_auto_configure:
16 | 	dh_auto_configure -- -DCMAKE_BUILD_TYPE=Release
17 | 
18 | 


--------------------------------------------------------------------------------
/debian/source/options:
--------------------------------------------------------------------------------
1 | tar-ignore = "tmp"
2 | tar-ignore = ".git"
3 | 


--------------------------------------------------------------------------------
/doc/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2006-2023  Made to Order Software Corp.  All Rights Reserved
 2 | #
 3 | # https://snapwebsites.org/project/libutf8
 4 | # contact@m2osw.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License as published by
 8 | # the Free Software Foundation; either version 2 of the License, or
 9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | 
20 | 
21 | ##
22 | ## Documentation
23 | ##
# Look up the SnapDoxygen package, then register the documentation target
# of libutf8, passing the three components of the library version.
# NOTE(review): AddDoxygenTarget() is presumably defined by SnapDoxygen;
# confirm against that package.
find_package(SnapDoxygen)
AddDoxygenTarget(
    libutf8
    ${LIBUTF8_VERSION_MAJOR}
    ${LIBUTF8_VERSION_MINOR}
    ${LIBUTF8_VERSION_PATCH}
)
30 | 
31 | # vim: ts=4 sw=4 et
32 | 


--------------------------------------------------------------------------------
/doc/footer.html:
--------------------------------------------------------------------------------
1 | <div style="padding: 10px; border-top: 1px solid #8899ff; background-color: #f8f8f8;">
2 | <p>This document is part of the <a href="https://snapwebsites.org/">Snap! Websites Project</a>.</p>
3 | <p>Copyright by <a href="https://www.m2osw.com/">Made to Order Software Corp.</a></p>
4 | </div>
5 | 


--------------------------------------------------------------------------------
/doc/libutf8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/doc/libutf8.png


--------------------------------------------------------------------------------
/libutf8/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
 2 | #
 3 | # https://snapwebsites.org/project/libutf8
 4 | # contact@m2osw.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License as published by
 8 | # the Free Software Foundation; either version 2 of the License, or
 9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | 
20 | ##
21 | ## utf8 library
22 | ##
# The project name doubles as the library target name: ${PROJECT_NAME}
# expands to "utf8" in every command below.
project(utf8)

# Put the version in the header file
configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/version.h.in
    ${CMAKE_CURRENT_BINARY_DIR}/version.h
)

# The shared library, built from an explicit list of sources.
add_library(${PROJECT_NAME} SHARED
    base.cpp
    iterator.cpp
    json_tokens.cpp
    libutf8.cpp
    unicode_data.cpp
    unicode_data_file.cpp
    version.cpp
)

# Include directories needed by this target and by anything linking to it.
target_include_directories(${PROJECT_NAME}
    PUBLIC
        ${LIBEXCEPT_INCLUDE_DIRS}
        ${SNAPDEV_INCLUDE_DIRS}
)

# Use the keyword signature instead of the legacy keyword-less one. PUBLIC
# keeps the dependency transitive for consumers, which is consistent with
# the libexcept include directories being PUBLIC above.
target_link_libraries(${PROJECT_NAME}
    PUBLIC
        ${LIBEXCEPT_LIBRARIES}
)

# VERSION is set to <major>.<minor> and SOVERSION to <major>.
set_target_properties(${PROJECT_NAME} PROPERTIES
    VERSION
        ${LIBUTF8_VERSION_MAJOR}.${LIBUTF8_VERSION_MINOR}

    SOVERSION
        ${LIBUTF8_VERSION_MAJOR}
)

# Install the library itself.
install(
    TARGETS
        ${PROJECT_NAME}

    RUNTIME DESTINATION
        bin

    LIBRARY DESTINATION
        lib

    ARCHIVE DESTINATION
        lib
)

# Install the public headers, including the generated version.h which
# lives in the binary directory.
install(
    FILES
        base.h
        caseinsensitivestring.h
        exception.h
        iterator.h
        json_tokens.h
        libutf8.h
        unicode_data.h
        ${CMAKE_CURRENT_BINARY_DIR}/version.h

    DESTINATION
        include/libutf8
)
87 | 
88 | 
89 | # vim: ts=4 sw=4 et
90 | 


--------------------------------------------------------------------------------
/libutf8/base.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | 
 20 | /** \file
 21 |  * \brief Implementation of the UTF-8 functions.
 22 |  *
 23 |  * This file is the implementation of the UTF-8 functions of the libutf8
 24 |  * library. It simply is a set of functions to convert between different
 25 |  * character sets in a lossless manner. At this point it supports UTF-8,
 26 |  * UCS-4, and UTF-16 formats.
 27 |  *
 28 |  * Contrary to many of the system functions, these functions do not take
 29 |  * anything from the system into account (the locale can be anything, it does
 30 |  * not change the exact behavior of these functions.)
 31 |  *
 32 |  * Although similar functionality is found on Unices and MS-Windows, it was
 33 |  * simpler to just implement these few functions than to try to have a
 34 |  * converter that is sure not to use a locale and this way we can use
 35 |  * standard strings (std::string and std::wstring) instead of having to
 36 |  * call C functions.
 37 |  */
 38 | 
 39 | // self
 40 | //
 41 | #include    "libutf8/base.h"
 42 | 
 43 | #include    "libutf8/exception.h"
 44 | 
 45 | 
 46 | // C++
 47 | //
 48 | #include    <cctype>
 49 | #include    <iostream>
 50 | 
 51 | 
 52 | // last include
 53 | //
 54 | #include    <snapdev/poison.h>
 55 | 
 56 | 
 57 | 
 58 | /** \brief Name space of the UTF-8 library.
 59 |  *
 60 |  * The libutf8 library is used to seamlessly handle UTF-8 strings. It also
 61 |  * is used to convert between UTF-8, UTF-16, and UTF-32 strings.
 62 |  *
 63 |  * \todo
 64 |  * Implement the UTF-16 functions.
 65 |  */
 66 | namespace libutf8
 67 | {
 68 | 
 69 | 
 70 | /** \var constexpr std::size_t MBS_MIN_BUFFER_LENGTH
 71 |  * \brief Minimum buffer length to support any UTF-8 characters.
 72 |  *
 73 |  * When converting a UTF-32 character to UTF-8, it makes use of an output
 74 |  * buffer. The size of that output buffer should be at least
 75 |  * MBS_MIN_BUFFER_LENGTH to accommodate any UTF-32 character.
 76 |  *
 77 |  * Note that the size includes space for a null terminator (`'\0'`).
 78 |  *
 79 |  * The size of your buffer can be smaller as long as the UTF-32 character
 80 |  * fits into it, the wctombs() function will not fail.
 81 |  */
 82 | 
 83 | 
 84 | /** \brief Compute the UTF-8 encoded representation of wc.
 85 |  *
 86 |  * This function transforms the UTF-32 character \p wc in a
 87 |  * UTF-8 encoded series of bytes (called a multi-byte encoded
 88 |  * character.) The resulting string is null (`'\0'`) terminated.
 89 |  *
 90 |  * The \p mb buffer should be at least MBS_MIN_BUFFER_LENGTH bytes.
 91 |  * If less space is required, the function does not report a problem,
 92 |  * though. This allows to get the total size of a conversion and then
 93 |  * do the full conversion to that one buffer without the need to
 94 |  * add unnecessary bytes at the end of your destination buffer.
 95 |  *
 96 |  * \code
 97 |  * ...
 98 |  * char mb[MBS_MIN_BUFFER_LENGTH];
 99 |  *
100 |  * wctombs(mb, big_char, sizeof(mb));
101 |  * ...
102 |  * \endcode
103 |  *
104 |  * The function does not encode invalid characters. When such is
105 |  * passed to the function, the \p mb string is turned in a null
106 |  * terminated string and the function returns 0. We avoid an
107 |  * exception here because that way you can quickly check whether
108 |  * a string of `char32_t` characters is valid or not.
109 |  *
110 |  * \note
111 |  * Unicode defines valid characters only between zero (0) and 0x10FFFF.
112 |  * Therefore this function encodes the character using 1 to 4 bytes plus
113 |  * one for the null terminator.
114 |  *
115 |  * \warning
116 |  * The function does not raise an error if the input \p wc character
117 |  * is considered invalid (a UTF-16 surrogate or larger than 0x10FFFF.)
118 |  * Instead it returns 0 and sets the \p mb string to the empty string.
119 |  *
120 |  * \exception libutf8_logic_exception
121 |  * The function raises this exception if the destination buffer is too
122 |  * small for the conversion. Don't forget that we add a null terminator
123 |  * so if the character needs 3 UTF-8 bytes, we will check for a buffer
124 |  * of at least 4 bytes to consider it valid.
125 |  *
126 |  * \param[out] mb  The output buffer, it will always be null terminated.
127 |  * \param[in] wc  The wide character to convert.
128 |  * \param[in] len  The length of \p mb.
129 |  *
130 |  * \return The number of bytes in mb, not including the null terminator.
131 |  */
132 | int wctombs(char * mb, char32_t wc, std::size_t len)
133 | {
134 |     auto verify_length = [&len](std::size_t required_len)
135 |     {
136 |         if(len < required_len)
137 |         {
138 |             throw libutf8_logic_exception("wctombs() called with an output buffer which is too small.");
139 |         }
140 |     };
141 | 
142 |     if(wc < 0x80)
143 |     {
144 |         verify_length(2);
145 | 
146 |         /* this will also encode '\0'... */
147 |         mb[0] = static_cast<char>(wc);
148 |         mb[1] = '\0';
149 |         return 1;
150 |     }
151 |     if(wc < 0x800)
152 |     {
153 |         verify_length(3);
154 | 
155 |         mb[0] = static_cast<char>((wc >> 6) | 0xC0);
156 |         mb[1] = (wc & 0x3F) | 0x80;
157 |         mb[2] = '\0';
158 |         return 2;
159 |     }
160 | 
161 |     // avoid encoding the UTF-16 surrogate because those code points do not
162 |     // represent characters
163 |     //
164 |     if(wc < 0xD800 || wc > 0xDFFF)
165 |     {
166 |         if(wc < 0x10000)
167 |         {
168 |             verify_length(4);
169 | 
170 |             mb[0] = static_cast<char>((wc >> 12) | 0xE0);
171 |             mb[1] = ((wc >> 6) & 0x3F) | 0x80;
172 |             mb[2] = (wc & 0x3F) | 0x80;
173 |             mb[3] = '\0';
174 |             return 3;
175 |         }
176 |         if(wc < 0x110000)
177 |         {
178 |             verify_length(5);
179 | 
180 |             mb[0] = static_cast<char>((wc >> 18) | 0xF0);
181 |             mb[1] = ((wc >> 12) & 0x3F) | 0x80;
182 |             mb[2] = ((wc >> 6) & 0x3F) | 0x80;
183 |             mb[3] = (wc & 0x3F) | 0x80;
184 |             mb[4] = '\0';
185 |             return 4;
186 |         }
187 |     }
188 | 
189 |     verify_length(1);
190 | 
191 |     /* an invalid wide character */
192 |     mb[0] = '\0';
193 |     return -1;
194 | }
195 | 
196 | 
197 | /** \brief Convert one multi-byte character to a wide character.
198 |  *
199 |  * This function converts UTF-8 bytes from \p mb to one UTF-32
200 |  * wide character and saves the result in \p wc. The function
201 |  * automatically increases the pointer in \p mb and simultaneously
202 |  * decreases the \p len parameter.
203 |  *
204 |  * \p wc holds the resulting wide character, a character between
205 |  * `'\0'` (NUL) and `0x10FFFF` and it returns the number of bytes
206 |  * that were used from \p mb. If a bad character is encountered,
207 |  * then the function returns -1 and the bad sequence of bytes is
208 |  * skipped so only one error will be reported for one bad sequence.
209 |  *
210 |  * Bad characters when converting UTF-8 to wide characters are:
211 |  *
212 |  * \li The stream includes bytes 0x80 to 0xBF without an introducer.
213 |  * \li The stream does not include the right number of 0x80 to 0xBF
214 |  *     bytes after an introducer.
215 |  * \li The input ends too early and cannot accommodate the last
216 |  *     encoded character.
217 |  * \li The codes 0xF8 to 0xFF were found in the input string.
218 |  * \li The resulting \p wc value would be larger than 0x10FFFF.
219 |  * \li The resulting \p wc value represents a UTF-16 surrogate
220 |  *     value (a number between 0xD800 and 0xDFFF).
221 |  *
222 |  * Code points between 0xD800 and 0xDFFF are not valid characters.
223 |  * These represent low and high surrogates in UTF-16 (2 are
224 |  * necessary to encode one character of 17 or more bits.)
225 |  *
226 |  * The function returns 0 and sets \p wc to the NUL character (`U'\0'`)
227 |  * if the \p len parameter is zero (i.e. empty string.)
228 |  *
229 |  * \note
230 |  * The function converts a NUL character (`'\0'`) in the
231 |  * input string as a NUL wide character (`U'\0'`) and returns 1. It
232 |  * does not see the NUL character as the end of the string.
233 |  *
234 |  * \warning
235 |  * The function does not throw on invalid input. It is the responsibility
236 |  * of the caller to do so if necessary. This is useful to very an UTF-8
237 |  * string without having to catch an exception.
238 |  *
239 |  * \param[out] wc  The output wide character variable.
240 |  * \param[in,out] mb  The multi-byte input string pointer, returned at the
241 |  *                    following byte.
242 |  * \param[in,out] len  The number of characters left in mb.
243 |  *
244 |  * \return The number of bytes read or -1 if invalid bytes were found.
245 |  */
246 | int mbstowc(char32_t & wc, char const * & mb, std::size_t & len)
247 | {
248 |     auto skip = [](char const * & skip_mb, size_t & skip_len)
249 |     {
250 |         for(unsigned char b(0)
251 |             ; skip_len > 0 && (b = *skip_mb, (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
252 |             ; ++skip_mb , --skip_len);
253 |     };
254 | 
255 |     // already done?
256 |     //
257 |     if(len <= 0)
258 |     {
259 |         wc = U'\0';
260 |         return 0;
261 |     }
262 | 
263 |     // we eat one character from the source minimum
264 |     //
265 |     unsigned char c(*mb++);
266 |     --len;
267 | 
268 |     if(c < 0x80)
269 |     {
270 |         wc = c;
271 |         return 1;
272 |     }
273 | 
274 |     // by default return an invalid character
275 |     //
276 |     wc = NOT_A_CHARACTER;
277 | 
278 |     // invalid stream?
279 |     //
280 |     if((c >= 0x80 && c <= 0xBF) || c >= 0xF5)
281 |     {
282 |         // this is bad UTF-8, skip all the invalid bytes
283 |         //
284 |         skip(mb, len);
285 |         return -1;
286 |     }
287 | 
288 |     char32_t w(U'\0');
289 |     std::size_t cnt(0);
290 | 
291 |     if(c >= 0xF0)
292 |     {
293 |         w = c & 0x07;
294 |         cnt = 3;
295 |     }
296 |     else if(c >= 0xE0)
297 |     {
298 |         w = c & 0x0F;
299 |         cnt = 2;
300 |     }
301 |     else /*if(c >= 0xC0)*/    // always true so we don't have to check
302 |     {
303 |         w = c & 0x1F;
304 |         cnt = 1;
305 |     }
306 | 
307 |     // enough data in the input? if not, that's an error
308 |     //
309 |     if(len < cnt)
310 |     {
311 |         skip(mb, len);
312 |         return -1;
313 |     }
314 |     len -= cnt;
315 | 
316 |     for(std::size_t l(cnt); l > 0; --l, mb++)
317 |     {
318 |         c = *mb;
319 |         if(c < 0x80 || c > 0xBF)
320 |         {
321 |             // we got an invalid sequence!
322 |             // restore whatever is left in len
323 |             //
324 |             len += l;
325 |             return -1;
326 |         }
327 |         w = (w << 6) | (c & 0x3F);
328 |     }
329 | 
330 |     if(w >= 0x110000
331 |     || (w >= 0x00D800 && w <= 0x00DFFF))
332 |     {
333 |         // character out of range or UTF-16 surrogate
334 |         // it can happen with sequences starting with 0xF7
335 |         //
336 |         return -1;
337 |     }
338 | 
339 |     wc = w;
340 | 
341 |     return static_cast<int>(cnt + 1);
342 | }
343 | 
344 | 
345 | /** \brief An overload with a non-const string.
346 |  *
347 |  * Since we are passing a reference to the \p mb string, whether it is
348 |  * const or non-const matter to the call. So here we offer a non-const
349 |  * version even though the string doesn't get modified.
350 |  *
351 |  * \param[out] wc  The output wide character variable.
352 |  * \param[in,out] mb  The multi-byte input string pointer, returned at the
353 |  *                    following byte.
354 |  * \param[in,out] len  The number of characters left in mb.
355 |  *
356 |  * \return The number of bytes read or -1 if invalid bytes were found.
357 |  */
358 | int mbstowc(char32_t & wc, char * & mb, std::size_t & len)
359 | {
360 |     return mbstowc(wc, const_cast<char const * &>(mb), len);
361 | }
362 | 
363 | 
364 | 
365 | } // libutf8 namespace
366 | // vim: ts=4 sw=4 et
367 | 


--------------------------------------------------------------------------------
/libutf8/base.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
 2 | //
 3 | // https://snapwebsites.org/project/libutf8
 4 | // contact@m2osw.com
 5 | //
 6 | // This program is free software; you can redistribute it and/or modify
 7 | // it under the terms of the GNU General Public License as published by
 8 | // the Free Software Foundation; either version 2 of the License, or
 9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | #pragma once
20 | 
21 | /** \file
22 |  * \brief The declarations of the UTF-8 library base functions.
23 |  *
24 |  * The functions defined in this file are used to do the actual conversions.
25 |  *
26 |  * They may be useful to you which is why we make them available here.
27 |  * However, these are considered low level functions and you may want
28 |  * to restrain using them. Using the `std::string`-base functions is
29 |  * much safer and what is expected of you.
30 |  */
31 | 
32 | // C++
33 | //
34 | #include    <cstddef>
35 | 
36 | 
namespace libutf8
{



// size of a buffer able to receive any one UTF-8 encoded character:
// up to 4 bytes plus the null terminator
//
constexpr std::size_t       MBS_MIN_BUFFER_LENGTH = 5;

// the Byte Order Mark code point (U+FEFF)
//
constexpr char32_t const    BOM_CHAR = U'\U0000FEFF';

// value saved in the output character by mbstowc() when the input is
// not valid UTF-8
//
constexpr char32_t const    NOT_A_CHARACTER = static_cast<char32_t>(-2);

// <cstddef> only guarantees std::size_t, so use the qualified name
//
int                     wctombs(char * mb, char32_t wc, std::size_t len);
int                     mbstowc(char32_t & wc, char const * & mb, std::size_t & len);
int                     mbstowc(char32_t & wc, char * & mb, std::size_t & len);



} // libutf8 namespace
53 | // vim: ts=4 sw=4 et
54 | 


--------------------------------------------------------------------------------
/libutf8/caseinsensitivestring.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2013-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | #pragma once
 20 | 
 21 | 
// self
//
#include    <libutf8/libutf8.h>


// C++
//
#include    <initializer_list>
#include    <memory>
#include    <string>
#include    <type_traits>
#include    <utility>
 30 | 
 31 | 
 32 | 
 33 | namespace libutf8
 34 | {
 35 | 
 36 | 
 37 | 
 38 | /** \brief Case insensitive string.
 39 |  *
 40 |  * This class is an overload of the string template which allows you to
 41 |  * create case insensitive strings as far as the comparison operators
 42 |  * are concerned. All the other functions still work the same way.
 43 |  *
 44 |  * This is particularly useful if you manage an std::map<> with a string as
 45 |  * the key, string which should not be case sensitive.
 46 |  *
 47 |  * The comparisons are done using the libutf8::u8casecmp() function.
 48 |  *
 49 |  * \sa u8casecmp()
 50 |  */
 51 | template<
 52 |     class _CharT,
 53 |     class _Traits = std::char_traits<_CharT>,
 54 |     class _Alloc = std::allocator<_CharT>
 55 | >
 56 | class case_insensitive_basic_string
 57 |     : public std::basic_string<_CharT, _Traits, _Alloc>
 58 | {
 59 | public:
 60 |     typedef typename std::basic_string<_CharT, _Traits, _Alloc>::size_type      size_type;
 61 | 
 62 |     case_insensitive_basic_string() noexcept(std::is_nothrow_default_constructible<_Alloc>::value)
 63 |         : std::basic_string<_CharT, _Traits, _Alloc>()
 64 |     {
 65 |     }
 66 | 
 67 |     explicit case_insensitive_basic_string(_Alloc const & __a)
 68 |         : std::basic_string<_CharT, _Traits, _Alloc>(__a)
 69 |     {
 70 |     }
 71 | 
 72 |     case_insensitive_basic_string(size_type __n, _CharT __c, _Alloc const & __a = _Alloc())
 73 |         : std::basic_string<_CharT, _Traits, _Alloc>(__n, __c, __a)
 74 |     {
 75 |     }
 76 | 
 77 |     // the following are for C++17 and over
 78 |     // (and then the next two constructors will not set __n)
 79 |     //
 80 |     //case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> const & __str, size_type __pos, _Alloc const & __a = _Alloc())
 81 |     //    : std::basic_string<_CharT, _Traits, _Alloc>(__str, __pos, __a)
 82 |     //{
 83 |     //}
 84 |     //
 85 |     //case_insensitive_basic_string(case_insensitive_basic_string const & __str, size_type __pos, _Alloc const & __a = _Alloc())
 86 |     //    : std::basic_string<_CharT, _Traits, _Alloc>(static_cast<std::basic_string<_CharT, _Traits, _Alloc> const &>(__str), __pos, __a)
 87 |     //{
 88 |     //}
 89 | 
 90 |     case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> const & __str, size_type __pos, size_type __n = std::basic_string<_CharT, _Traits, _Alloc>::npos, _Alloc const & __a = _Alloc())
 91 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str, __pos, __n, __a)
 92 |     {
 93 |     }
 94 | 
 95 |     case_insensitive_basic_string(case_insensitive_basic_string const & __str, size_type __pos, size_type __n = std::basic_string<_CharT, _Traits, _Alloc>::npos, _Alloc const & __a = _Alloc())
 96 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str, __pos, __n, __a)
 97 |     {
 98 |     }
 99 | 
100 |     case_insensitive_basic_string(_CharT const * __d, size_type __n, _Alloc const & __a = _Alloc())
101 |         : std::basic_string<_CharT, _Traits, _Alloc>(__d, __n, __a)
102 |     {
103 |     }
104 | 
105 |     case_insensitive_basic_string(_CharT const * __d, _Alloc const & __a = _Alloc())
106 |         : std::basic_string<_CharT, _Traits, _Alloc>(__d, __a)
107 |     {
108 |     }
109 | 
110 |     template<class _InputIterator>
111 |     case_insensitive_basic_string(_InputIterator __beg, _InputIterator __end, _Alloc const & __a = _Alloc())
112 |         : std::basic_string<_CharT, _Traits, _Alloc>(__beg, __end, __a)
113 |     {
114 |     }
115 | 
116 |     case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> const & __str)
117 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str)
118 |     {
119 |     }
120 | 
121 |     case_insensitive_basic_string(case_insensitive_basic_string const & __str)
122 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str)
123 |     {
124 |     }
125 | 
126 |     case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> && __str) noexcept
127 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str)
128 |     {
129 |     }
130 | 
131 |     case_insensitive_basic_string(case_insensitive_basic_string && __str) noexcept
132 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str)
133 |     {
134 |     }
135 | 
136 |     case_insensitive_basic_string(std::basic_string<_CharT, _Traits, _Alloc> && __str, _Alloc const & __a)
137 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str, __a)
138 |     {
139 |     }
140 | 
141 |     case_insensitive_basic_string(case_insensitive_basic_string && __str, _Alloc const & __a)
142 |         : std::basic_string<_CharT, _Traits, _Alloc>(__str, __a)
143 |     {
144 |     }
145 | 
146 |     case_insensitive_basic_string(std::initializer_list<_CharT> __l, _Alloc const & __a = _Alloc())
147 |         : std::basic_string<_CharT, _Traits, _Alloc>(__l, __a)
148 |     {
149 |     }
150 | 
151 | 
152 |     friend bool operator == (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
153 |     {
154 |         return libutf8::u8casecmp(lhs, rhs) == 0;
155 |     }
156 | 
157 |     friend bool operator == (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
158 |     {
159 |         return libutf8::u8casecmp(lhs, rhs) == 0;
160 |     }
161 | 
162 |     friend bool operator == (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
163 |     {
164 |         return libutf8::u8casecmp(lhs, rhs) == 0;
165 |     }
166 | 
167 |     friend bool operator == (case_insensitive_basic_string const & lhs, _CharT const * rhs)
168 |     {
169 |         return libutf8::u8casecmp(lhs, rhs) == 0;
170 |     }
171 | 
172 |     friend bool operator == (_CharT const * lhs, case_insensitive_basic_string const & rhs)
173 |     {
174 |         return libutf8::u8casecmp(lhs, rhs) == 0;
175 |     }
176 | 
177 |     friend bool operator != (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
178 |     {
179 |         return libutf8::u8casecmp(lhs, rhs) != 0;
180 |     }
181 | 
182 |     friend bool operator != (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
183 |     {
184 |         return libutf8::u8casecmp(lhs, rhs) != 0;
185 |     }
186 | 
187 |     friend bool operator != (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
188 |     {
189 |         return libutf8::u8casecmp(lhs, rhs) != 0;
190 |     }
191 | 
192 |     friend bool operator != (case_insensitive_basic_string const & lhs, _CharT const * rhs)
193 |     {
194 |         return libutf8::u8casecmp(lhs, rhs) != 0;
195 |     }
196 | 
197 |     friend bool operator != (_CharT const * lhs, case_insensitive_basic_string const & rhs)
198 |     {
199 |         return libutf8::u8casecmp(lhs, rhs) != 0;
200 |     }
201 | 
202 |     friend bool operator < (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
203 |     {
204 |         return libutf8::u8casecmp(lhs, rhs) < 0;
205 |     }
206 | 
207 |     friend bool operator < (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
208 |     {
209 |         return libutf8::u8casecmp(lhs, rhs) < 0;
210 |     }
211 | 
212 |     friend bool operator < (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
213 |     {
214 |         return libutf8::u8casecmp(lhs, rhs) < 0;
215 |     }
216 | 
217 |     friend bool operator < (case_insensitive_basic_string const & lhs, _CharT const * rhs)
218 |     {
219 |         return libutf8::u8casecmp(lhs, rhs) < 0;
220 |     }
221 | 
222 |     friend bool operator < (_CharT const * lhs, case_insensitive_basic_string const & rhs)
223 |     {
224 |         return libutf8::u8casecmp(lhs, rhs) < 0;
225 |     }
226 | 
227 |     friend bool operator <= (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
228 |     {
229 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
230 |     }
231 | 
232 |     friend bool operator <= (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
233 |     {
234 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
235 |     }
236 | 
237 |     friend bool operator <= (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
238 |     {
239 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
240 |     }
241 | 
242 |     friend bool operator <= (case_insensitive_basic_string const & lhs, _CharT const * rhs)
243 |     {
244 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
245 |     }
246 | 
247 |     friend bool operator <= (_CharT const * lhs, case_insensitive_basic_string const & rhs)
248 |     {
249 |         return libutf8::u8casecmp(lhs, rhs) <= 0;
250 |     }
251 | 
252 |     friend bool operator > (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
253 |     {
254 |         return libutf8::u8casecmp(lhs, rhs) > 0;
255 |     }
256 | 
257 |     friend bool operator > (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
258 |     {
259 |         return libutf8::u8casecmp(lhs, rhs) > 0;
260 |     }
261 | 
262 |     friend bool operator > (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
263 |     {
264 |         return libutf8::u8casecmp(lhs, rhs) > 0;
265 |     }
266 | 
267 |     friend bool operator > (case_insensitive_basic_string const & lhs, _CharT const * rhs)
268 |     {
269 |         return libutf8::u8casecmp(lhs, rhs) > 0;
270 |     }
271 | 
272 |     friend bool operator > (_CharT const * lhs, case_insensitive_basic_string const & rhs)
273 |     {
274 |         return libutf8::u8casecmp(lhs, rhs) > 0;
275 |     }
276 | 
277 |     friend bool operator >= (case_insensitive_basic_string const & lhs, std::basic_string<_CharT, _Traits, _Alloc> const & rhs)
278 |     {
279 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
280 |     }
281 | 
282 |     friend bool operator >= (case_insensitive_basic_string const & lhs, case_insensitive_basic_string const & rhs)
283 |     {
284 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
285 |     }
286 | 
287 |     friend bool operator >= (std::basic_string<_CharT, _Traits, _Alloc> const & lhs, case_insensitive_basic_string const & rhs)
288 |     {
289 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
290 |     }
291 | 
292 |     friend bool operator >= (case_insensitive_basic_string const & lhs, _CharT const * rhs)
293 |     {
294 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
295 |     }
296 | 
297 |     friend bool operator >= (_CharT const * lhs, case_insensitive_basic_string const & rhs)
298 |     {
299 |         return libutf8::u8casecmp(lhs, rhs) >= 0;
300 |     }
301 | };
302 | 
303 | 
// case insensitive string specialized for `char`; the comparison operators
// of the class template forward to libutf8::u8casecmp()
//
typedef case_insensitive_basic_string<char>         case_insensitive_string;

// TODO add support for other types
//typedef case_insensitive_basic_string<wchar_t>      case_insensitive_wstring;
//typedef case_insensitive_basic_string<char16_t>     case_insensitive_u16string;
//typedef case_insensitive_basic_string<char32_t>     case_insensitive_u32string;
310 | 
311 | 
312 | }
313 | // libutf8 namespace
314 | // vim: ts=4 sw=4 et
315 | 


--------------------------------------------------------------------------------
/libutf8/exception.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
 2 | //
 3 | // https://snapwebsites.org/project/libutf8
 4 | // contact@m2osw.com
 5 | //
 6 | // This program is free software; you can redistribute it and/or modify
 7 | // it under the terms of the GNU General Public License as published by
 8 | // the Free Software Foundation; either version 2 of the License, or
 9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | #pragma once
20 | 
/** \file
 * \brief The declarations of the libutf8 exceptions.
 *
 * This file declares the exceptions of the UTF-8 library. The declarations
 * are made with the macros provided by the libexcept library.
 */
27 | 
28 | // libexcept
29 | //
30 | #include    <libexcept/exception.h>
31 | 
32 | 
33 | 
34 | namespace libutf8
35 | {
36 | 
37 | 
38 | 
// The DECLARE_...() macros come from <libexcept/exception.h>.
// NOTE(review): presumably each macro defines an exception class with the
// given name -- confirm against the libexcept documentation.

// logic error declaration
//
DECLARE_LOGIC_ERROR(libutf8_logic_exception);

// main exception; it is named as the first argument of each
// DECLARE_EXCEPTION() below
//
DECLARE_MAIN_EXCEPTION(libutf8_exception);

// specialized exceptions, all declared against libutf8_exception
//
DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_decoding);
DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_encoding);
DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_invalid_parameter);
DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_io);
DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_missing);
DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_overflow);
DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_twice);
DECLARE_EXCEPTION(libutf8_exception, libutf8_exception_unsupported);
51 | 
52 | 
53 | 
54 | } // libutf8 namespace
55 | // vim: ts=4 sw=4 et
56 | 


--------------------------------------------------------------------------------
/libutf8/iterator.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | 
/** \file
 * \brief Implementation of the UTF-8 functions.
 *
 * This file is the implementation of the UTF-8 functions of the libutf8
 * library. It simply is a set of functions to convert between different
 * character sets in a lossless manner. At this point it supports UTF-8,
 * UCS-4, and UTF-16 formats.
 *
 * Contrary to many of the system functions, these functions do not take
 * anything from the system in account (the locale can be anything, it does
 * not change the exact behavior of these functions).
 *
 * Although similar functionality is found on Unices and MS-Windows, it was
 * simpler to just implement these few functions than to try to have a
 * converter that is sure not to use a locale and this way we can use
 * standard strings (std::string and std::wstring) instead of having to
 * call C functions.
 */
 38 | 
 39 | // self
 40 | //
 41 | #include    "libutf8/iterator.h"
 42 | 
 43 | #include    "libutf8/base.h"
 44 | #include    "libutf8/libutf8.h"
 45 | 
 46 | 
 47 | // C++
 48 | //
 49 | #include    <iostream>
 50 | 
 51 | 
 52 | // last include
 53 | //
 54 | #include    <snapdev/poison.h>
 55 | 
 56 | 
 57 | 
 58 | namespace libutf8
 59 | {
 60 | 
 61 | 
 62 | 
 63 | utf8_iterator::utf8_iterator(std::string const & str, bool end)
 64 |     : f_str(&str)
 65 |     , f_pos(end ? str.length() : 0)
 66 |     , f_start_pos(f_pos)
 67 | {
 68 | }
 69 | 
 70 | 
 71 | utf8_iterator & utf8_iterator::operator ++ ()
 72 | {
 73 |     increment();
 74 |     return *this;
 75 | }
 76 | 
 77 | 
 78 | utf8_iterator utf8_iterator::operator ++ (int) // post-increment
 79 | {
 80 |     utf8_iterator it(*this);
 81 |     increment();
 82 |     return it;
 83 | }
 84 | 
 85 | 
 86 | utf8_iterator & utf8_iterator::operator -- ()
 87 | {
 88 |     decrement();
 89 |     return *this;
 90 | }
 91 | 
 92 | 
 93 | utf8_iterator utf8_iterator::operator -- (int) // post-decrement
 94 | {
 95 |     utf8_iterator it(*this);
 96 |     decrement();
 97 |     return it;
 98 | }
 99 | 
100 | 
101 | /** \brief Read the current character.
102 |  *
103 |  * This function reads the current character and returns it as a char32_t
104 |  * (i.e. UTF-32).
105 |  *
106 |  * When the iterator is at the end of the input string (it == str.end()),
107 |  * then the function returns libutf8::EOS (-1).
108 |  *
109 |  * When the current character is valid, the value is any number from 0 to
110 |  * 0x10FFFF except for UTF-16 surrogate values (0xD800 to 0xDFFF).
111 |  *
112 |  * When the current character is invalid (bad UTF-8 encoding, although
113 |  * extended UTF-8 is accepted here), then the function returns
114 |  * libutf8::NOT_A_CHARACTER (-2). Further, the good flag is also set to
115 |  * false, which means good() returns false and bad() returns true.
116 |  *
117 |  * \code
118 |  *     for(libutf8::utf8_iterator it(s); it != s.end(); ++it)
119 |  *     {
120 |  *         char32_t c(*it);
121 |  *
122 |  *         // here you can choose:
123 |  *         if(c == libutf8::NOT_A_CHARACTER)
124 |  *         {
125 |  *             // handle error -- current character is not valid UTF-8
126 |  *             break;
127 |  *         }
128 |  *         // -- or --
129 |  *         if(it.bad())
130 |  *         {
131 |  *             // handle error -- current character is not valid UTF-8
132 |  *             break;
133 |  *         }
134 |  *     }
135 |  * \endcode
136 |  *
137 |  * Since this function returns EOS when the iterator is at the end of
138 |  * the string, you can also stop the iteration process like so:
139 |  *
140 |  * \code
141 |  *     libutf8::utf8_iterator it(s);
142 |  *     for(;;)
143 |  *     {
144 |  *         char32_t c(*it);
145 |  *         if(c == libutf8::EOS)
146 |  *         {
147 |  *             // success, all characters were valid
148 |  *             break;
149 |  *         }
150 |  *         ...handle other cases as above...
151 |  *     }
152 |  * \endcode
153 |  *
154 |  * \return EOS if at the end of the string, the current character as a
155 |  * char32_t value or NOT_A_CHARACTER if the current character encoding is
156 |  * wrong.
157 |  *
158 |  * \sa good()
159 |  * \sa bad()
160 |  */
161 | char32_t utf8_iterator::operator * () const
162 | {
163 |     if(f_pos >= f_str->length())
164 |     {
165 |         return EOS;
166 |     }
167 |     char const * s(f_str->c_str() + f_pos);
168 |     char32_t wc(NOT_A_CHARACTER);
169 |     size_t len(f_str->length() - f_pos);
170 |     if(mbstowc(wc, s, len) < 0)
171 |     {
172 |         f_good = false;
173 |     }
174 |     return wc;
175 | }
176 | 
177 | 
178 | bool utf8_iterator::operator == (utf8_iterator const & rhs) const
179 | {
180 |     return f_pos == rhs.f_pos;
181 | }
182 | 
183 | 
184 | bool utf8_iterator::operator != (utf8_iterator const & rhs) const
185 | {
186 |     return f_pos != rhs.f_pos;
187 | }
188 | 
189 | 
190 | bool utf8_iterator::operator == (std::string::iterator it) const
191 | {
192 |     return static_cast<std::string::size_type>(it - f_str->begin()) == f_pos;
193 | }
194 | 
195 | 
196 | bool utf8_iterator::operator != (std::string::iterator it) const
197 | {
198 |     return static_cast<std::string::size_type>(it - f_str->begin()) != f_pos;
199 | }
200 | 
201 | 
202 | bool utf8_iterator::operator == (std::string::const_iterator it) const
203 | {
204 |     return static_cast<std::string::size_type>(it - f_str->cbegin()) == f_pos;
205 | }
206 | 
207 | 
208 | bool utf8_iterator::operator != (std::string::const_iterator it) const
209 | {
210 |     return static_cast<std::string::size_type>(it - f_str->cbegin()) != f_pos;
211 | }
212 | 
213 | 
214 | bool operator == (std::string::iterator it, utf8_iterator const & rhs)
215 | {
216 |     return static_cast<std::string::size_type>(it - rhs.f_str->begin()) == rhs.f_pos;
217 | }
218 | 
219 | 
220 | bool operator != (std::string::iterator it, utf8_iterator const & rhs)
221 | {
222 |     return static_cast<std::string::size_type>(it - rhs.f_str->begin()) != rhs.f_pos;
223 | }
224 | 
225 | 
226 | bool operator == (std::string::const_iterator it, utf8_iterator const & rhs)
227 | {
228 |     return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) == rhs.f_pos;
229 | }
230 | 
231 | 
232 | bool operator != (std::string::const_iterator it, utf8_iterator const & rhs)
233 | {
234 |     return static_cast<std::string::size_type>(it - rhs.f_str->cbegin()) != rhs.f_pos;
235 | }
236 | 
237 | 
238 | void utf8_iterator::increment()
239 | {
240 |     auto skip = [&]()
241 |     {
242 |         for(unsigned char b(0)
243 |             ; f_pos < f_str->length()
244 |                 && (b = static_cast<unsigned char>(f_str[0][f_pos]),
245 |                             (b >= 0x80 && b <= 0xBF) || b >= 0xF5)
246 |             ; ++f_pos);
247 |         f_good = false;
248 |     };
249 | 
250 |     if(f_pos >= f_str->length())
251 |     {
252 |         return;
253 |     }
254 | 
255 |     // increment is easy we can just get the current character and we know
256 |     // the size of the character in UTF-8
257 |     //
258 |     unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
259 | 
260 |     if(c < 0x80)
261 |     {
262 |         ++f_pos;
263 |     }
264 |     else if(c <= 0xBF || c >= 0xF5)
265 |     {
266 |         // ?! invalid UTF-8 ?!
267 |         //
268 |         skip();
269 |     }
270 |     else if(c >= 0xF0)
271 |     {
272 |         f_pos += 4;
273 |         if(c == 0xF4 && f_pos - 3 < f_str->length())
274 |         {
275 |             c = static_cast<unsigned char>(f_str[0][f_pos - 3]);
276 |             if(c >= 0x90)
277 |             {
278 |                 f_pos -= 3;
279 |                 skip();
280 |             }
281 |         }
282 |     }
283 |     else if(c >= 0xE0)
284 |     {
285 |         f_pos += 3;
286 |     }
287 |     else /*if(c >= 0xC0)*/    // always true so we don't have to check
288 |     {
289 |         f_pos += 2;
290 |     }
291 |     if(f_pos > f_str->length())
292 |     {
293 |         f_pos = f_str->length();
294 |         f_good = false;
295 |     }
296 | }
297 | 
298 | 
299 | /** \brief Decrement the iterator.
300 |  *
301 |  * If the iterator is not already at position 0, decrement it to the previous
302 |  * UTF-8 character. This means skipping to the first UTF-8 byte.
303 |  *
304 |  * \note
305 |  * Contrary to the increment(), this function does not set the good flag to
306 |  * true or false whether it is at the start or there is an invalid character.
307 |  */
308 | void utf8_iterator::decrement()
309 | {
310 |     if(f_pos == 0)
311 |     {
312 |         return;
313 |     }
314 | 
315 |     // decrement requires us to search for the previous starting byte
316 |     // which means we need to scan the string
317 |     //
318 |     while(f_pos > 0)
319 |     {
320 |         --f_pos;
321 |         unsigned char c(static_cast<unsigned char>(f_str[0][f_pos]));
322 |         if(c < 0x80
323 |         || c >= 0xC0)
324 |         {
325 |             break;
326 |         }
327 |     }
328 | }
329 | 
330 | 
331 | /** \brief Compute the distance between two iterators.
332 |  *
333 |  * This function computers the distance between two libutf8 iterators.
334 |  *
335 |  * The right hand side iterator must be from the same string as the
336 |  * lhs string.
337 |  *
338 |  * \return The distance between the two iterators.
339 |  */
340 | utf8_iterator::difference_type utf8_iterator::operator - (utf8_iterator const & rhs) const
341 | {
342 |     return f_pos - rhs.f_pos;
343 | }
344 | 
345 | 
346 | /** \brief Compute the distance between two iterators.
347 |  *
348 |  * This operator computes the difference between this iterator and the
349 |  * specified \p it iterator.
350 |  *
351 |  * \param[in] it  The iterator to calculate the distance from.
352 |  *
353 |  * \return The distance between the two iterators.
354 |  */
355 | utf8_iterator::difference_type utf8_iterator::operator - (std::string::const_iterator it) const
356 | {
357 |     return static_cast<std::string::size_type>(f_str->cbegin() + f_pos - it);
358 | }
359 | 
360 | 
361 | /** \brief Compute the distance between two iterators.
362 |  *
363 |  * This operator computes the difference between the two specified iterators
364 |  * \p it and \p rhs.
365 |  *
366 |  * \param[in] it  The iterator to calculate the distance from.
367 |  * \param[in] rhs  The iterator to calculate the distance to.
368 |  *
369 |  * \return The distance between the two specified iterators.
370 |  */
371 | utf8_iterator::difference_type operator - (std::string::const_iterator it, utf8_iterator const & rhs)
372 | {
373 |     return static_cast<std::string::size_type>(it - rhs.f_str->cbegin() - rhs.f_pos);
374 | }
375 | 
376 | 
377 | /** \brief Restart  the iterator.
378 |  *
379 |  * The iterator started at 0 or the end of the string, then you moved it
380 |  * using the `++` or `--` operators. Later you may want to re-parse the
381 |  * string from the start or end of the string.
382 |  *
383 |  * This function resets the position back to 0 or the end as defined on
384 |  * the constructor.
385 |  */
386 | void utf8_iterator::rewind()
387 | {
388 |     f_pos = f_start_pos;
389 | }
390 | 
391 | 
392 | /** \brief Clear the errors.
393 |  *
394 |  * The iterator is considered good by default. If you try to retreive
395 |  * a character after the end of the string being iterated or the
396 |  * bytes do not represent an invalid UTF-8 character.
397 |  *
398 |  * \sa good()
399 |  * \sa bad()
400 |  */
401 | void utf8_iterator::clear()
402 | {
403 |     f_good = true;
404 | }
405 | 
406 | 
407 | /** \brief Check whether the iterator did not run in an error.
408 |  *
409 |  * The iterator remains good as long as the input characters are valid
410 |  * and the end of the string is not reached. After either event, this
411 |  * function returns false.
412 |  *
413 |  * You can clear this flag by calling the clear() function.
414 |  *
415 |  * \return true if no errors were encountered so far.
416 |  *
417 |  * \sa clear()
418 |  * \sa bad()
419 |  */
420 | bool utf8_iterator::good() const
421 | {
422 |     return f_good;
423 | }
424 | 
425 | 
426 | /** \brief Check whether the iterator ran in an error.
427 |  *
428 |  * This function returns true if an invalid character or the end of the
429 |  * string was found.
430 |  *
431 |  * \return true if an error condition was encountered.
432 |  *
433 |  * \sa clear()
434 |  * \sa good()
435 |  */
436 | bool utf8_iterator::bad() const
437 | {
438 |     return !f_good;
439 | }
440 | 
441 | 
442 | 
443 | } // libutf8 namespace
444 | // vim: ts=4 sw=4 et
445 | 


--------------------------------------------------------------------------------
/libutf8/iterator.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
 2 | //
 3 | // https://snapwebsites.org/project/libutf8
 4 | // contact@m2osw.com
 5 | //
 6 | // This program is free software; you can redistribute it and/or modify
 7 | // it under the terms of the GNU General Public License as published by
 8 | // the Free Software Foundation; either version 2 of the License, or
 9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | #pragma once
20 | 
21 | /** \file
22 |  * \brief The declarations of the UTF-8 library.
23 |  *
24 |  * This file is the declarations of the UTF-8 library which are just a few
25 |  * functions used to convert a string from one format to another.
26 |  */
27 | 
28 | // C++
29 | //
30 | #include    <string>
31 | 
32 | 
33 | 
34 | namespace libutf8
35 | {
36 | 
37 | 
// End Of String marker (EOF converted to a char32_t); it is the value
// returned by utf8_iterator::operator * () at the end of the string
constexpr char32_t              EOS = static_cast<char32_t>(EOF);
39 | 
40 | 
// Bidirectional iterator reading the characters of a UTF-8 encoded
// std::string as char32_t values.
//
// The iterator holds a pointer to the string (f_str) and a byte position in
// that string (f_pos). The good flag (f_good) is mutable since the const
// `*` operator updates it.
//
class utf8_iterator
{
public:
    // Iterator traits
    //
    using iterator_category = std::bidirectional_iterator_tag;
    using value_type        = char32_t;
    using difference_type   = ssize_t;
    using pointer           = char32_t const *;
    using reference         = char32_t const &;

                                utf8_iterator(std::string const & str, bool end = false);

    // moving and reading
    //
    utf8_iterator &             operator ++ ();
    utf8_iterator               operator ++ (int);
    utf8_iterator &             operator -- ();
    utf8_iterator               operator -- (int);
    value_type                  operator * () const;

    // comparisons against UTF-8 and std::string iterators
    //
    bool                        operator == (utf8_iterator const & rhs) const;
    bool                        operator != (utf8_iterator const & rhs) const;
    bool                        operator == (std::string::iterator it) const;
    bool                        operator != (std::string::iterator it) const;
    bool                        operator == (std::string::const_iterator it) const;
    bool                        operator != (std::string::const_iterator it) const;
    friend bool                 operator == (std::string::iterator it, utf8_iterator const & rhs);
    friend bool                 operator != (std::string::iterator it, utf8_iterator const & rhs);
    friend bool                 operator == (std::string::const_iterator it, utf8_iterator const & rhs);
    friend bool                 operator != (std::string::const_iterator it, utf8_iterator const & rhs);

    // distances
    //
    difference_type             operator - (utf8_iterator const & rhs) const;
    difference_type             operator - (std::string::const_iterator it) const;
    friend difference_type      operator - (std::string::const_iterator it, utf8_iterator const & rhs);

    // position reset and error state
    //
    void                        rewind();
    void                        clear();
    bool                        good() const;
    bool                        bad() const;

private:
    void                        increment();
    void                        decrement();

    std::string const *         f_str = nullptr;
    std::string::size_type      f_pos = 0;
    std::string::size_type      f_start_pos = 0;
    mutable bool                f_good = true;
};
87 | 
88 | 
89 | 
90 | } // libutf8 namespace
91 | // vim: ts=4 sw=4 et
92 | 


--------------------------------------------------------------------------------
/libutf8/json_tokens.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | #pragma once
 20 | 
 21 | /** \file
 22 |  * \brief The declarations of the JSON tokens class.
 23 |  *
 24 |  * This file is the declarations of the JSON tokens class one can use to
 25 |  * interpret the contents of a JSON file.
 26 |  *
 27 |  * The idea of this simple parser is to (1) show how one can use the
 28 |  * libutf8 library and (2) give you the ability to parse a simple JSON
 29 |  * structure.
 30 |  */
 31 | 
 32 | // self
 33 | //
 34 | #include    <libutf8/iterator.h>
 35 | 
 36 | 
 37 | // C++
 38 | //
 39 | #include    <cmath>
 40 | #include    <cstdint>
 41 | #include    <string>
 42 | 
 43 | 
 44 | 
 45 | namespace libutf8
 46 | {
 47 | 
 48 | 
// Types of tokens; this is the type returned by json_tokens::next_token()
// NOTE(review): the comments below are inferred from the names of the
// tokens -- confirm against json_tokens.cpp
enum class token_t
{
    TOKEN_END,              // presumably the end of the input
    TOKEN_ERROR,            // presumably an invalid input; see json_tokens::error()
    TOKEN_OPEN_ARRAY,
    TOKEN_CLOSE_ARRAY,
    TOKEN_OPEN_OBJECT,
    TOKEN_CLOSE_OBJECT,
    TOKEN_NUMBER,           // presumably the value is available in json_tokens::number()
    TOKEN_STRING,           // presumably the value is available in json_tokens::string()
    TOKEN_COMMA,
    TOKEN_COLON,
    TOKEN_TRUE,
    TOKEN_FALSE,
    TOKEN_NULL,
};
 65 | 
 66 | 
// Tokenizer reading JSON tokens from a UTF-8 string; the tokens are
// retrieved one at a time with next_token()
//
// NOTE(review): the implementation is not visible from this header; the
// comments on the members are inferred from their names and types --
// confirm against json_tokens.cpp
class json_tokens
{
public:
                        json_tokens(std::string const & input);

    int                 line() const;
    int                 column() const;
    token_t             next_token();
    double              number() const;
    std::string const & string() const;
    std::string const & error() const;

private:
    char32_t            getc();
    void                ungetc(char32_t c);
    char32_t            char16(char32_t & c);
    void                add_error_character(char32_t c);

    std::string         f_input = std::string();
    utf8_iterator       f_iterator; // initialize in the constructor
    char32_t            f_unget[16];        // not initialized here; presumably a stack used by ungetc()
    std::size_t         f_unget_pos = 0;    // presumably the number of characters in f_unget
    std::uint32_t       f_line = 1;
    std::uint32_t       f_last_line = 0;
    std::uint32_t       f_column = 1;
    std::uint32_t       f_last_column = 0;
    double              f_number = 0.0;
    std::string         f_string = std::string();
    std::string         f_error = std::string();
};
 97 | 
 98 | 
 99 | } // libutf8 namespace
100 | // vim: ts=4 sw=4 et
101 | 


--------------------------------------------------------------------------------
/libutf8/libutf8.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | #pragma once
 20 | 
 21 | /** \file
 22 |  * \brief The declarations of the UTF-8 library.
 23 |  *
 24 |  * This file is the declarations of the UTF-8 library which are just a few
 25 |  * functions used to convert a string from one format to another.
 26 |  */
 27 | 
 28 | // C++
 29 | //
 30 | #include    <string>
 31 | 
 32 | 
 33 | 
 34 | namespace libutf8
 35 | {
 36 | 
 37 | 
// Type of Byte Order Mark (BOM); this is the type returned by the
// start_with_bom() function
enum class bom_t
{
    BOM_NONE,
    BOM_UTF8,
    BOM_UTF16_LE,
    BOM_UTF16_BE,
    BOM_UTF32_LE,
    BOM_UTF32_BE
};
 47 | 
 48 | 
// Type of surrogate; this is the type returned by the is_surrogate()
// function
enum class surrogate_t
{
    SURROGATE_NO,
    SURROGATE_HIGH,
    SURROGATE_LOW
};
 55 | 
 56 | 
 57 | 
 58 | 
// Declarations of the libutf8 functions (implemented outside of this header)
//
// NOTE(review): u8length() returns a std::size_t whereas u16length()
// returns an ssize_t -- presumably u16length() can return a negative value
// to report an error; confirm against the implementation
bool                is_valid_ascii(char c, bool ctrl = true);
bool                is_valid_ascii(char const * str, bool ctrl = true);
bool                is_valid_ascii(std::string const & str, bool ctrl = true);
bool                is_valid_utf8(char const * str);
bool                is_valid_utf8(std::string const & str);
bool                is_valid_utf16(std::u16string const & str);
bool                is_valid_unicode(char32_t const wc, bool ctrl = true);
bool                is_valid_unicode(char32_t const * str, bool ctrl = true);
bool                is_valid_unicode(std::u32string const & str, bool ctrl = true);
surrogate_t         is_surrogate(char32_t wc);
bom_t               start_with_bom(char const * str, size_t len);
std::string         to_u8string(std::u32string const & str);
std::string         to_u8string(std::u16string const & str);
std::string         to_u8string(std::wstring const & str);
std::string         to_u8string(wchar_t one, wchar_t two = L'\0');
std::string         to_u8string(char16_t one, char16_t two = u'\0');
std::string         to_u8string(char32_t const wc);
std::u16string      to_u16string(char32_t const wc);
std::u16string      to_u16string(std::string const & str);
std::u32string      to_u32string(std::string const & str);
std::size_t         u8length(std::string const & str);
ssize_t             u16length(std::u16string const & str);
int                 u8casecmp(std::string const & lhs, std::string const & rhs);
bool                make_u8string_valid(std::string & str, char32_t fix_char = U'?');
 83 | 
 84 | 
 85 | 
 86 | } // libutf8 namespace
 87 | 
 88 | 
 89 | inline std::string operator + (char32_t wc, std::string const & rhs)
 90 | {
 91 |     std::string v;
 92 |     v = libutf8::to_u8string(wc);
 93 |     return v + rhs;
 94 | }
 95 | 
 96 | 
 97 | inline std::string operator + (std::string const & lhs, char32_t wc)
 98 | {
 99 |     std::string v;
100 |     v = libutf8::to_u8string(wc);
101 |     return lhs + v;
102 | }
103 | 
104 | 
105 | inline std::string & operator += (std::string & lhs, char32_t wc)
106 | {
107 |     return lhs += libutf8::to_u8string(wc);
108 | }
109 | 
110 | 
// Append an int to a string as a single char (the value is cast to char,
// no UTF-8 encoding happens here)
inline std::string & operator += (std::string & lhs, int c)
{
    lhs.push_back(static_cast<char>(c));
    return lhs;
}
115 | 
116 | 
// Append an unsigned int to a string as a single char (the value is cast
// to char, no UTF-8 encoding happens here)
inline std::string & operator += (std::string & lhs, unsigned int c)
{
    lhs.push_back(static_cast<char>(c));
    return lhs;
}
121 | 
122 | 
// Append a long to a string as a single char (the value is cast to char,
// no UTF-8 encoding happens here)
inline std::string & operator += (std::string & lhs, long c)
{
    lhs.push_back(static_cast<char>(c));
    return lhs;
}
127 | 
128 | 
// Append an unsigned long to a string as a single char (the value is cast
// to char, no UTF-8 encoding happens here)
inline std::string & operator += (std::string & lhs, unsigned long c)
{
    lhs.push_back(static_cast<char>(c));
    return lhs;
}
133 | 
134 | 
135 | 
136 | // vim: ts=4 sw=4 et
137 | 


--------------------------------------------------------------------------------
/libutf8/unicode_data.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | 
/** \file
 * \brief Implementation of the UTF-8 functions.
 *
 * This file is the implementation of the UTF-8 functions of the libutf8
 * library. It simply is a set of functions to convert between different
 * character sets in a lossless manner. At this point it supports UTF-8,
 * UCS-4, and UTF-16 formats.
 *
 * Contrary to many of the system functions, these functions do not take
 * anything from the system in account (the locale can be anything, it does
 * not change the exact behavior of these functions.)
 *
 * Although similar functionality is found on Unices and MS-Windows, it was
 * simpler to just implement these few functions than to try to have a
 * converter that is sure not to use a locale and this way we can use
 * standard strings (std::string and std::wstring) instead of having to
 * call C functions.
 */
 38 | 
 39 | // self
 40 | //
 41 | #include    "libutf8/unicode_data.h"
 42 | 
 43 | #include    "libutf8/exception.h"
 44 | #include    "libutf8/libutf8.h"
 45 | #include    "libutf8/unicode_data_file.h"
 46 | 
 47 | 
// C++
//
#include    <cstring>
#include    <cwctype>
#include    <list>
 52 | 
 53 | 
 54 | // last include
 55 | //
 56 | #include    <snapdev/poison.h>
 57 | 
 58 | 
 59 | 
 60 | /** \brief Name space of the UTF-8 library.
 61 |  *
 62 |  * The library to convert UTF-8 strings to UCS-4 (Unices) or UTF-16 strings
 63 |  * (MS-Windows) and vice versa.
 64 |  */
 65 | namespace libutf8
 66 | {
 67 | 
 68 | 
 69 | namespace
 70 | {
 71 | 
 72 | 
 73 | 
 74 | 
 75 | 
/** \brief A unicode_character that carries its own character record.
 *
 * The object embeds a detail::ucd_character (f_private_character) and
 * hands the address of that member to the unicode_character base class.
 * The constructor marks the record as a private use character.
 */
class private_unicode_character
    : public unicode_character
{
public:
                        // code: the code point stored in the embedded record
                        // h: header pointer forwarded as is to the base class
                        private_unicode_character(
                                  char32_t code
                                , detail::ucd_header * h);

protected:
    // returns the address of the embedded f_private_character record
    virtual detail::ucd_character *
                        ucd_character_pointer() const override;

private:
    // record owned by this object; value-initialized, then filled in by
    // the constructor
    detail::ucd_character
                        f_private_character = detail::ucd_character();
};
 92 | 
 93 | 
 94 | private_unicode_character::private_unicode_character(
 95 |           char32_t code
 96 |         , detail::ucd_header * h)
 97 |     : unicode_character(code, &f_private_character, h)
 98 | {
 99 |     f_private_character.f_code = code;
100 |     f_private_character.f_flags = detail::UCD_FLAG_PRIVATE;
101 |     f_private_character.f_general_category = General_Category::GC_Private_Use;
102 |     f_private_character.f_bidi_class = Bidi_Class::BC_Left_To_Right;
103 | }
104 | 
105 | 
106 | detail::ucd_character * private_unicode_character::ucd_character_pointer() const
107 | {
108 |     return const_cast<detail::ucd_character *>(&f_private_character);
109 | }
110 | 
111 | 
112 | 
113 | } // no name namespace
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | unicode_character::unicode_character(
121 |           char32_t code
122 |         , detail::ucd_character * c
123 |         , detail::ucd_header * h)
124 |     : f_code(code)
125 |     , f_character(c)
126 |     , f_header(h)
127 | {
128 | }
129 | 
130 | 
131 | unicode_character::~unicode_character()
132 | {
133 | }
134 | 
135 | 
136 | unicode_character::unicode_character(unicode_character const & rhs)
137 | {
138 |     // this looks weird, but it works as expected
139 |     //
140 |     f_character = rhs.f_character;
141 |     f_character = ucd_character_pointer();
142 |     f_header = rhs.f_header;
143 | }
144 | 
145 | 
146 | unicode_character & unicode_character::operator = (unicode_character const & rhs)
147 | {
148 |     // this looks weird, but it works as expected
149 |     //
150 |     f_character = rhs.f_character;
151 |     f_character = ucd_character_pointer();
152 |     f_header = rhs.f_header;
153 | 
154 |     return *this;
155 | }
156 | 
157 | 
158 | bool unicode_character::is_valid() const
159 | {
160 |     return is_valid_unicode(f_code);
161 | }
162 | 
163 | 
164 | bool unicode_character::is_defined() const
165 | {
166 |     return f_character->f_code != NOT_A_CHARACTER;
167 | }
168 | 
169 | 
170 | bool unicode_character::is_private() const
171 | {
172 |     return (f_character->f_flags & detail::UCD_FLAG_PRIVATE) != 0;
173 | }
174 | 
175 | 
176 | General_Category unicode_character::category() const
177 | {
178 |     return f_character->f_general_category;
179 | }
180 | 
181 | 
182 | bool unicode_character::is_letter() const
183 | {
184 |     return f_character->f_general_category >= General_Category::GC_Uppercase_Letter
185 |         && f_character->f_general_category <= General_Category::GC_Other_Letter;
186 | }
187 | 
188 | 
189 | bool unicode_character::is_mark() const
190 | {
191 |     return f_character->f_general_category >= General_Category::GC_Nonspacing_Mark
192 |         && f_character->f_general_category <= General_Category::GC_Enclosing_Mark;
193 | }
194 | 
195 | 
196 | bool unicode_character::is_number() const
197 | {
198 |     return f_character->f_general_category >= General_Category::GC_Decimal_Number
199 |         && f_character->f_general_category <= General_Category::GC_Other_Number;
200 | }
201 | 
202 | 
203 | bool unicode_character::is_punctuation() const
204 | {
205 |     return f_character->f_general_category >= General_Category::GC_Connector_Punctuation
206 |         && f_character->f_general_category <= General_Category::GC_Other_Punctuation;
207 | }
208 | 
209 | 
210 | bool unicode_character::is_symbol() const
211 | {
212 |     return f_character->f_general_category >= General_Category::GC_Math_Symbol
213 |         && f_character->f_general_category <= General_Category::GC_Other_Symbol;
214 | }
215 | 
216 | 
217 | bool unicode_character::is_separator() const
218 | {
219 |     return f_character->f_general_category >= General_Category::GC_Space_Separator
220 |         && f_character->f_general_category <= General_Category::GC_Paragraph_Separator;
221 | }
222 | 
223 | 
224 | bool unicode_character::is_other() const
225 | {
226 |     return f_character->f_general_category >= General_Category::GC_Control
227 |         && f_character->f_general_category <= General_Category::GC_Unassigned;
228 | }
229 | 
230 | 
231 | 
232 | Canonical_Combining_Class unicode_character::combining_class()
233 | {
234 |     return f_character->f_canonical_combining_class;
235 | }
236 | 
237 | 
238 | Bidi_Class unicode_character::bidi_class() const
239 | {
240 |     return f_character->f_bidi_class;
241 | }
242 | 
243 | 
244 | bool unicode_character::is_bidi_mirrored() const
245 | {
246 |     return (f_character->f_flags & detail::UCD_FLAG_BIDI_MIRROR) != 0;
247 | }
248 | 
249 | 
250 | Decomposition_Type unicode_character::decomposition_type() const
251 | {
252 |     return static_cast<Decomposition_Type>(f_character->f_decomposition_type);
253 | }
254 | 
255 | 
256 | Numeric_Type unicode_character::numeric() const
257 | {
258 |     if((f_character->f_flags & detail::UCD_FLAG_DIGIT) != 0)
259 |     {
260 |         return Numeric_Type::NT_Digit;
261 |     }
262 | 
263 |     if((f_character->f_flags & detail::UCD_FLAG_DECIMAL) != 0)
264 |     {
265 |         return Numeric_Type::NT_Decimal;
266 |     }
267 | 
268 |     if((f_character->f_flags & detail::UCD_FLAG_NUMERIC) != 0)
269 |     {
270 |         return Numeric_Type::NT_Numeric;
271 |     }
272 | 
273 |     return Numeric_Type::NT_Unknown;
274 | }
275 | 
276 | 
277 | std::int64_t unicode_character::get_number(int index) const
278 | {
279 |     std::size_t length(0);
280 |     char const * name(find_name(detail::Name_Type::NT_Numeric, length));
281 |     if(name == nullptr)
282 |     {
283 |         return 0;
284 |     }
285 |     if(length != 16)
286 |     {
287 |         // someone tempered with the database?
288 |         //
289 |         throw libutf8_logic_exception("invalid \"name\" size for a number");
290 |     }
291 |     std::int64_t const * number(reinterpret_cast<std::int64_t const *>(name));
292 |     return number[index];
293 | }
294 | 
295 | 
296 | std::int64_t unicode_character::nominator() const
297 | {
298 |     return get_number(0);
299 | }
300 | 
301 | 
302 | std::int64_t unicode_character::denominator() const
303 | {
304 |     return get_number(1);
305 | }
306 | 
307 | 
308 | char const * unicode_character::find_name(detail::Name_Type type, std::size_t & length) const
309 | {
310 |     if(f_character->f_names == 0)
311 |     {
312 |         throw libutf8_logic_exception("character is missing a name");
313 |     }
314 | 
315 |     char const * name(reinterpret_cast<char const *>(f_header)
316 |                     + f_header->f_strings + f_character->f_names);
317 |     for(;;)
318 |     {
319 |         detail::Name_Type const t(static_cast<detail::Name_Type>(name[0]));
320 |         if(t == detail::Name_Type::NT_EndOfNames)
321 |         {
322 |             length = 0;
323 |             return nullptr;
324 |         }
325 |         length = static_cast<std::uint8_t>(name[1]);
326 |         if(t == type)
327 |         {
328 |             return name + 2;
329 |         }
330 |         name += length + 2;
331 |     }
332 | }
333 | 
334 | 
335 | detail::ucd_character * unicode_character::ucd_character_pointer() const
336 | {
337 |     return f_character;
338 | }
339 | 
340 | 
341 | 
342 | 
343 | 
344 | 
345 | 
346 | } // libutf8 namespace
347 | // vim: ts=4 sw=4 et
348 | 


--------------------------------------------------------------------------------
/libutf8/unicode_data.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | #pragma once
 20 | 
 21 | /** \file
 22 |  * \brief The declarations of the UTF-8 library.
 23 |  *
 24 |  * This file is the declarations of the UTF-8 library which are just a few
 25 |  * functions used to convert a string from one format to another.
 26 |  */
 27 | 
 28 | // self
 29 | //
 30 | #include    <libutf8/base.h>
 31 | 
 32 | 
 33 | // C++
 34 | //
 35 | #include    <cstdint>
 36 | #include    <memory>
 37 | 
 38 | 
 39 | 
 40 | namespace libutf8
 41 | {
 42 | 
 43 | 
 44 | namespace detail
 45 | {
 46 | class unicode_data_impl;
 47 | class parser_impl;
 48 | class ucd_header;
 49 | class ucd_character;
 50 | enum class Name_Type : std::uint8_t;
 51 | } // detail namespace
 52 | 
 53 | 
 54 | 
/** \brief General category of a character.
 *
 * The values are grouped so that each major class (letter, mark, number,
 * punctuation, symbol, separator, other) forms a contiguous range. The
 * unicode_character::is_letter() and similar functions rely on those
 * ranges. The two letter code in the comment of each entry is the
 * category abbreviation.
 *
 * The values are saved in detail::ucd_character so they must not be
 * renumbered.
 */
enum class General_Category : std::uint8_t
{
    GC_Unknown_Category = 0,

    // GC_Letter = 1 to 6           // L
    GC_Uppercase_Letter = 1,        // Lu
    GC_Lowercase_Letter = 2,        // Ll
    GC_TitleCase_Letter = 3,        // Lt
    GC_Cased_Letter = 4,            // LC
    GC_Modified_Letter = 5,         // Lm
    GC_Other_Letter = 6,            // Lo

    // GC_Mark = 7 to 9             // M
    GC_Nonspacing_Mark = 7,         // Mn
    GC_Spacing_Mark = 8,            // Mc
    GC_Enclosing_Mark = 9,          // Me

    // GC_Number = 10 to 12         // N
    GC_Decimal_Number = 10,         // Nd
    GC_Letter_Number = 11,          // Nl
    GC_Other_Number = 12,           // No

    // GC_Punctuation = 13 to 19    // P
    GC_Connector_Punctuation = 13,  // Pc
    GC_Dash_Punctuation = 14,       // Pd
    GC_Open_Punctuation = 15,       // Ps
    GC_Close_Punctuation = 16,      // Pe
    GC_Initial_Punctuation = 17,    // Pi
    GC_Final_Punctuation = 18,      // Pf
    GC_Other_Punctuation = 19,      // Po

    // GC_Symbol = 20 to 23         // S
    GC_Math_Symbol = 20,            // Sm
    GC_Current_Symbol = 21,         // Sc
    GC_Modifier_Symbol = 22,        // Sk
    GC_Other_Symbol = 23,           // So

    // GC_Separator = 24 to 26      // Z
    GC_Space_Separator = 24,        // Zs
    GC_Line_Separator = 25,         // Zl
    GC_Paragraph_Separator = 26,    // Zp

    // GC_Other = 27 to 31          // C
    GC_Control = 27,                // Cc
    GC_Format = 28,                 // Cf
    GC_Surrogate = 29,              // Cs
    GC_Private_Use = 30,            // Co
    GC_Unassigned = 31,             // Cn
};
104 | 
105 | 
/** \brief Canonical combining class of a character.
 *
 * Only the named classes are listed. Any value between CCC_Ccc10 and
 * CCC_Ccc199 is possible even though it has no enumerator of its own.
 *
 * The values are saved in detail::ucd_character so they must not be
 * renumbered.
 */
enum class Canonical_Combining_Class : std::uint8_t
{
    CCC_Not_Reordered = 0,

    // Fixed position classes
    CCC_Overlay = 1,
    CCC_Han_Reading = 6,
    CCC_Nukta = 7,
    CCC_Kana_Voicing = 8,
    CCC_Virama = 9,
    CCC_Ccc10 = 10,     // first CCC
    // ... not specifically defined ...
    CCC_Ccc199 = 199,   // last CCC

    // Other classes
    CCC_Attached_Below_Left = 200,
    CCC_Attached_Below = 202,
    CCC_Attached_Above = 214,
    CCC_Attached_Above_Right = 216,
    CCC_Below_Left = 218,
    CCC_Below = 220,
    CCC_Below_Right = 222,
    CCC_Left = 224,
    CCC_Right = 226,
    CCC_Above_Left = 228,
    CCC_Above = 230,
    CCC_Above_Right = 232,
    CCC_Double_Below = 233,
    CCC_Double_Above = 234,
    CCC_Iota_Subscript = 240,
};
137 | 
138 | 
/** \brief Bidirectional class of a character.
 *
 * The enumerators are numbered by group: strong types start at 10, weak
 * types at 20, neutral types at 30 and explicit formatting types at 40.
 * The comment of each entry is the class abbreviation.
 *
 * The values are saved in detail::ucd_character so they must not be
 * renumbered.
 */
enum class Bidi_Class : std::uint8_t
{
    BC_Unknown = 0,

    // Strong Types
    BC_Left_To_Right = 10,              // L
    BC_Right_To_Left = 11,              // R
    BC_Arabic_Letter = 12,              // AL

    // Weak Types
    BC_European_Number = 20,            // EN
    BC_European_Separator = 21,         // ES
    BC_European_Terminator = 22,        // ET
    BC_Arabic_Number = 23,              // AN
    BC_Common_Separator = 24,           // CS
    BC_Nonspacing_Mark = 25,            // NSM
    BC_Boundary_Neutral = 26,           // BN

    // Neutral Types
    BC_Paragraph_Separator = 30,        // B
    BC_Segment_Separator = 31,          // S
    BC_White_Space = 32,                // WS
    BC_Other_Neutral = 33,              // ON

    // Explicit Formatting Types
    BC_Left_To_Right_Embedding = 40,    // LRE
    BC_Left_To_Right_Override = 41,     // LRO
    BC_Right_To_Left_Embedding = 42,    // RLE
    BC_Right_To_Left_Override = 43,     // RLO
    BC_Pop_Directional_Format = 44,     // PDF
    BC_Left_To_Right_Isolate = 45,      // LRI
    BC_Right_To_Left_Isolate = 46,      // RLI
    BC_First_Strong_Isolate = 47,       // FSI
    BC_Pop_Directional_Isolate = 48,    // PDI
};
174 | 
175 | 
/** \brief Type of decomposition of a character.
 *
 * The value is saved in the 5 bit f_decomposition_type field of
 * detail::ucd_character, so every enumerator must remain below 32.
 */
enum class Decomposition_Type : std::uint8_t
{
    DT_unknown = 0,
    DT_none = 1,
    DT_canonical = 2,

    // the following enumerators are named after a decomposition tag
    DT_font = 10,
    DT_noBreak = 11,
    DT_initial = 12,
    DT_medial = 13,
    DT_final = 14,
    DT_isolated = 15,
    DT_circle = 16,
    DT_super = 17,
    DT_sub = 18,
    DT_vertical = 19,
    DT_wide = 20,
    DT_narrow = 21,
    DT_small = 22,
    DT_square = 23,
    DT_fraction = 24,
    DT_compat = 25,
};
199 | 
200 | 
/** \brief Numeric type of a character.
 *
 * This is the type returned by unicode_character::numeric(), which
 * derives it from the digit, decimal and numeric flags of the record.
 */
enum class Numeric_Type : std::uint8_t
{
    NT_Unknown = 0, // a.k.a. this is not marked as a number

    NT_Digit = 1,           // the Digit type should be viewed as equivalent to Decimal
    NT_Decimal = 2,
    NT_Numeric = 3,
};
209 | 
210 | 
211 | 
212 | 
213 | 
/** \brief Description of one Unicode character.
 *
 * The object holds a code point, a pointer to the record describing the
 * character and a pointer to the header of the Unicode data. The getters
 * read the record; the object does not own either pointer.
 */
class unicode_character
{
public:
    typedef std::shared_ptr<unicode_character>
                        pointer_t;

                        // the code point and both pointers are saved as is
                        unicode_character(
                              char32_t code
                            , detail::ucd_character * c
                            , detail::ucd_header * h);
    virtual             ~unicode_character();
                        unicode_character(unicode_character const & rhs);
    unicode_character & operator = (unicode_character const & rhs);

    bool                is_valid() const;       // valid code point as far as Unicode (UTF-32) is concerned
    bool                is_defined() const;     // whether this is a Unicode defined character or not
    bool                is_private() const;     // whether this code point is reserved for private use

    // general category and tests against the range of each major class
    General_Category    category() const;
    bool                is_letter() const;
    bool                is_mark() const;
    bool                is_number() const;
    bool                is_punctuation() const;
    bool                is_symbol() const;
    bool                is_separator() const;
    bool                is_other() const;

    // NOTE(review): combining_class() is the only getter not marked const
    Canonical_Combining_Class
                        combining_class();
    Bidi_Class          bidi_class() const;
    bool                is_bidi_mirrored() const;
    Decomposition_Type  decomposition_type() const;

    // numeric type and the two numbers of the NT_Numeric name entry
    // (both numbers are 0 when the character has no such entry)
    Numeric_Type        numeric() const;
    std::int64_t        nominator() const;
    std::int64_t        denominator() const;

protected:
    // record to use for this object; a derived class may return a record
    // it owns instead of f_character
    virtual detail::ucd_character *
                        ucd_character_pointer() const;

private:
    // index is 0 for the nominator and 1 for the denominator
    std::int64_t        get_number(int index) const;
    // returns the data of the first name of the given type or nullptr
    char const *        find_name(detail::Name_Type type, std::size_t & length) const;

    char32_t            f_code = NOT_A_CHARACTER;
    detail::ucd_character *
                        f_character = nullptr;
    detail::ucd_header *f_header = nullptr;
};
264 | 
265 | 
266 | 
267 | 
/** \brief Access to the Unicode data.
 *
 * The object is obtained through get_instance() and forwards the work
 * to a private implementation (f_impl).
 *
 * NOTE(review): the member functions are defined outside of this header;
 * the comments below only restate what the declarations show.
 */
class unicode_data
{
public:
    typedef std::shared_ptr<unicode_data>
                        pointer_t;

    // returns a shared pointer to a unicode_data object
    static pointer_t    get_instance();

    // input file information
    //
    time_t              last_generated();
    void                set_cache(bool cache = true);
    bool                get_cache() const;
    char const *        version() const;
    std::string const   version_string() const;

    // access character data
    //
    unicode_character::pointer_t
                        character(char32_t wc);

private:
    typedef std::shared_ptr<detail::unicode_data_impl>
                        unicode_data_impl_pointer_t;

    // private implementation; a null pointer by default
    unicode_data_impl_pointer_t
                        f_impl = unicode_data_impl_pointer_t();
};
296 | 
297 | 
/** \brief Parser used to generate the Unicode data.
 *
 * The parser is created with an input directory and an output filename
 * and is run with generate(). The work is done by a private
 * implementation (f_impl).
 *
 * NOTE(review): the member functions are defined outside of this header;
 * presumably the input directory holds the Unicode text files and the
 * output is the compiled file -- confirm against the implementation.
 */
class ucd_parser
{
public:
                        ucd_parser(
                              std::string const & input_dir
                            , std::string const & output_filename);

    void                generate();

private:
    typedef std::shared_ptr<detail::parser_impl>
                        parser_impl_pointer_t;

    // private implementation; a null pointer by default
    parser_impl_pointer_t
                        f_impl = parser_impl_pointer_t();
};
314 | 
315 | 
316 | 
317 | 
318 | } // libutf8 namespace
319 | // vim: ts=4 sw=4 et
320 | 


--------------------------------------------------------------------------------
/libutf8/unicode_data_file.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | #pragma once
 20 | 
 21 | /** \file
 22 |  * \brief The declarations of the Unicode compiled files.
 23 |  *
 24 |  * This file includes structures used to describe the Unicode compiled
 25 |  * file. This allows us to very quickly find all the information about
 26 |  * a character.
 27 |  *
 28 |  * From the outside, you are expected to use the unicode_character
 29 |  * functions defined in the unicode_data.h header. This file is
 30 |  * considered private.
 31 |  */
 32 | 
 33 | // self
 34 | //
 35 | #include    <libutf8/unicode_data.h>
 36 | 
 37 | 
 38 | // C++
 39 | //
 40 | #include    <string>
 41 | 
 42 | 
 43 | 
 44 | namespace libutf8
 45 | {
 46 | 
 47 | namespace detail
 48 | {
 49 | 
 50 | 
// Type byte found at the start of each entry of the name table.
//
// An entry is the type, a one byte size and then the data. The list of
// entries of a character ends with NT_EndOfNames.
//
enum class Name_Type : std::uint8_t  // see UnicodeData.txt and NameAliases.txt
{
    NT_Name = 0xF0,
    NT_Abbreviation = 0xF1,
    NT_Jamo_Short_Name = 0xF2,  // see Jamo.txt
    NT_Alternate = 0xF3,
    NT_Control = 0xF4,
    NT_WrongName = 0xF5,        // the main name is the corrected name, this name is the invalid/incorrect name
    NT_Figment = 0xF6,
    NT_Numeric = 0xF7,          // saved as two int64_t in the strings because that's under 8kb that way

    NT_EndOfNames = 0xFF,       // marks the end of the list, no size or data is read after it
};
 64 | 
 65 | 
 66 | 
// Header of the compiled Unicode data.
//
// The f_strings offset is added to the address of the header to reach
// the string table (see unicode_character::find_name()).
//
// NOTE(review): presumably f_characters and f_decomposition are relative
// to the header address as well -- confirm against the reader.
// NOTE(review): the size of time_t depends on the platform, which matters
// if the file is read on a different platform than it was generated on.
//
struct ucd_header
{
    char                f_magic[4] = { 'U', 'C', 'D', 'B' };
    time_t              f_timestamp = 0;                // time when this file was generated
    std::uint8_t        f_version = 0;                  // version of this file format
    std::uint8_t        f_ucd_version[3] = { 1, 1, 0 }; // version of source -- i.e. 5 2 0
    std::uint32_t       f_characters = 0;               // offset to character table
    std::uint32_t       f_strings = 0;                  // offset to string table
    std::uint32_t       f_decomposition = 0;            // offset to decomposition table
};
 77 | 
 78 | 
 79 | 
// bit flags saved in ucd_character::f_flags
//
typedef std::uint8_t        flags_t;

constexpr flags_t           UCD_FLAG_DIGIT              = 0x01; // represents a number
constexpr flags_t           UCD_FLAG_DECIMAL            = 0x02; // represents a number
constexpr flags_t           UCD_FLAG_NUMERIC            = 0x04; // represents a number
constexpr flags_t           UCD_FLAG_BIDI_MIRROR        = 0x08; // mirror of another letter left to right vs. right to left
constexpr flags_t           UCD_FLAG_CONTROL            = 0x10; // NOTE(review): presumably marks control characters -- confirm
constexpr flags_t           UCD_FLAG_PRIVATE            = 0x20; // private use character, see unicode_character::is_private()
 88 | 
 89 | 
 90 | 
// Record describing one character.
//
// The structure is used as a binary record, hence the absence of
// constructors and virtual functions. The number in front of each field
// is its size in bits; the three decomposition bit fields add up to 32.
//
struct ucd_character
{
    // initialization happens in a non-virtual function, otherwise it
    // would break the binary use of the structure
    //
    void initialize_ucd_character()
    {
        // a record starts as "not a character" without names or flags
        f_code = NOT_A_CHARACTER;
        f_names = 0;
        f_flags = 0;

        f_general_category = General_Category::GC_Unknown_Category;
        f_canonical_combining_class = Canonical_Combining_Class::CCC_Not_Reordered;
        f_bidi_class = Bidi_Class::BC_Unknown; // see flags for mirror info
        f_decomposition_type = static_cast<int>(Decomposition_Type::DT_unknown);
        f_decomposition_length = 0;
        f_decomposition_mapping = 0;
        // NOTE(review): presumably the age defaults to version 1.1 -- confirm
        f_age[0] = 1;
        f_age[1] = 1;
    }

    /* 32 */    char32_t                    f_code;
    /* 32 */    std::uint32_t               f_names;        // offset to string table
    /*  8 */    flags_t                     f_flags;
    /*  8 */    General_Category            f_general_category;
    /*  8 */    Canonical_Combining_Class   f_canonical_combining_class;
    /*  8 */    Bidi_Class                  f_bidi_class;
    /*  5 */    std::uint32_t               f_decomposition_type : 5;
    /*  5 */    std::uint32_t               f_decomposition_length : 5;
    /* 22 */    std::uint32_t               f_decomposition_mapping : 22;
    /* 16 */    std::uint8_t                f_age[2];
};
123 | 
124 | 
// The f_names field is an offset in the string table.
//
// Each name is defined as:
//
//     struct name_t
//     {
//         Name_Type    f_type;
//         uint8_t      f_size;
//         char8_t      f_name[f_size];
//     };
//
// Names are not null terminated. Each entry starts with a byte
// representing a Name_Type and a size byte, followed by f_size bytes of
// UTF-8; the next entry starts right after. The last name is followed by
// the special type NT_EndOfNames.
//
// The first name is the corrected name of the character.
//
// Following are the other Name_Type names.
//
// The numeric entries are actually two 64 bit numbers (nominator and
// denominator). The size will always be 16 bytes, but the alignment
// is likely going to be "wrong" (although that should not matter much
// on Intel and ARM processors).
148 | 
149 | 
150 | 
151 | 
152 | } // detail namespace
153 | 
154 | } // libutf8 namespace
155 | // vim: ts=4 sw=4 et
156 | 


--------------------------------------------------------------------------------
/libutf8/version.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | 
/** \file
 * \brief The UTF-8 library is used to convert C++ strings.
 *
 * This file shows the UTF-8 library version.
 *
 * The `#define`s give you the library version at the time you are compiling.
 * The functions allow you to retrieve the version of a dynamically linked
 * library.
 */
 29 | 
 30 | // self
 31 | //
 32 | #include    "libutf8/version.h"
 33 | 
 34 | 
 35 | // last include
 36 | //
 37 | #include    <snapdev/poison.h>
 38 | 
 39 | 
 40 | 
 41 | namespace libutf8
 42 | {
 43 | 
 44 | 
 45 | 
 46 | 
 47 | /** \brief Get the major version of the library
 48 |  *
 49 |  * This function returns the major version of the running library (the
 50 |  * one you are linked against at runtime).
 51 |  *
 52 |  * \return The major version.
 53 |  */
 54 | int get_major_version()
 55 | {
 56 |     return LIBUTF8_VERSION_MAJOR;
 57 | }
 58 | 
 59 | 
 60 | /** \brief Get the minor version of the library.
 61 |  *
 62 |  * This function returns the minor version of the running library
 63 |  * (the one you are linked against at runtime).
 64 |  *
 65 |  * \return The release version.
 66 |  */
 67 | int get_release_version()
 68 | {
 69 |     return LIBUTF8_VERSION_MINOR;
 70 | }
 71 | 
 72 | 
 73 | /** \brief Get the patch version of the library.
 74 |  *
 75 |  * This function returns the patch version of the running library
 76 |  * (the one you are linked against at runtime).
 77 |  *
 78 |  * \return The patch version.
 79 |  */
 80 | int get_patch_version()
 81 | {
 82 |     return LIBUTF8_VERSION_PATCH;
 83 | }
 84 | 
 85 | 
 86 | /** \brief Get the full version of the library as a string.
 87 |  *
 88 |  * This function returns the major, minor, and patch versions of the
 89 |  * running library (the one you are linked against at runtime) in the
 90 |  * form of a string.
 91 |  *
 92 |  * The build version is not made available. In most cases we change
 93 |  * the build version only to run a new build, so not code will have
 94 |  * changed (some documentation and non-code files may changed between
 95 |  * build versions; but the code will work exactly the same way.)
 96 |  *
 97 |  * \return The library version.
 98 |  */
 99 | char const * get_version_string()
100 | {
101 |     return LIBUTF8_VERSION_STRING;
102 | }
103 | 
104 | 
105 | } // libutf8 namespace
106 | // vim: ts=4 sw=4 et
107 | 


--------------------------------------------------------------------------------
/libutf8/version.h.in:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
 2 | //
 3 | // https://snapwebsites.org/
 4 | // contact@m2osw.com
 5 | //
 6 | // This program is free software; you can redistribute it and/or modify
 7 | // it under the terms of the GNU General Public License as published by
 8 | // the Free Software Foundation; either version 2 of the License, or
 9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | #pragma once
20 | 
21 | /** \file
22 |  * \brief Definitions of the libutf8 version.
23 |  *
24 |  * This header includes the libutf8 library version and functions you
25 |  * can use to check the current version of the library.
26 |  */
27 | 
28 | 
29 | #define    LIBUTF8_VERSION_MAJOR   @LIBUTF8_VERSION_MAJOR@
30 | #define    LIBUTF8_VERSION_MINOR   @LIBUTF8_VERSION_MINOR@
31 | #define    LIBUTF8_VERSION_PATCH   @LIBUTF8_VERSION_PATCH@
32 | #define    LIBUTF8_VERSION_STRING  "@LIBUTF8_VERSION_MAJOR@.@LIBUTF8_VERSION_MINOR@.@LIBUTF8_VERSION_PATCH@"
33 | 
34 | namespace libutf8
35 | {
36 | 
37 | 
// version of the library as compiled in the library itself; compare with
// the LIBUTF8_VERSION_... macros above, which are the version of the
// header you compiled against
//
int             get_major_version();
int             get_release_version();      // returns the minor version
int             get_patch_version();
char const *    get_version_string();       // "<major>.<minor>.<patch>"
42 | 
43 | 
44 | 
45 | } // libutf8 namespace
46 | // vim: ts=4 sw=4 et
47 | 


--------------------------------------------------------------------------------
/mk:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # Forward all the command line arguments to the shared "mk" build script.
 4 | # See the snapcmakemodules project for details about this script
 5 | #     https://github.com/m2osw/snapcmakemodules
 6 | 
 7 | if test -x ../../cmake/scripts/mk
 8 | then
 9 | 	# "$@" keeps each argument intact, even when it includes spaces
10 | 	../../cmake/scripts/mk "$@"
11 | else
12 | 	echo "error: could not locate the cmake mk script" >&2
13 | 	exit 1
14 | fi


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
 2 | #
 3 | # https://snapwebsites.org/project/libutf8
 4 | # contact@m2osw.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License as published by
 8 | # the Free Software Foundation; either version 2 of the License, or
 9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | 
20 | ##
21 | ## libutf8 library unit tests
22 | ##
23 | project(unittest)
24 | # SnapCatch2 is optional; without it no test target gets created
25 | find_package(SnapCatch2)
26 | 
27 | if(SnapCatch2_FOUND)
28 | 
29 |     add_executable(${PROJECT_NAME}
30 |         catch_main.cpp
31 | 
32 |         catch_bom.cpp
33 |         catch_caseinsensitive.cpp
34 |         catch_character.cpp
35 |         catch_iterator.cpp
36 |         catch_json_tokens.cpp
37 |         catch_length.cpp
38 |         catch_stream.cpp
39 |         catch_string.cpp
40 |         catch_valid.cpp
41 |         catch_version.cpp
42 |     )
43 | 
44 |     # nothing links against a test executable, so its usage
45 |     # requirements are kept PRIVATE
46 |     target_include_directories(${PROJECT_NAME}
47 |         PRIVATE
48 |             ${CMAKE_BINARY_DIR}
49 |             ${PROJECT_SOURCE_DIR}
50 |             ${SNAPCATCH2_INCLUDE_DIRS}
51 |             ${LIBEXCEPT_INCLUDE_DIRS}
52 |     )
53 | 
54 |     target_link_libraries(${PROJECT_NAME}
55 |         PRIVATE
56 |             utf8
57 |             ${SNAPCATCH2_LIBRARIES}
58 |     )
59 | 
60 |     # the target running the tests is only defined when they get built
61 |     find_package(SnapTestRunner)
62 |     AddUnitTestsTarget(
63 |         PROJECT_NAME
64 |             rununittests
65 |     )
66 | 
67 | else()
68 | 
69 |     message("snapcatch2 not found... no test will be built.")
70 | 
71 | endif()
72 | 
73 | # vim: ts=4 sw=4 et
74 | 


--------------------------------------------------------------------------------
/tests/catch_bom.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 19 | 
 20 | // libutf8
 21 | //
 22 | #include    <libutf8/base.h>
 23 | #include    <libutf8/libutf8.h>
 24 | 
 25 | 
 26 | // unit test
 27 | //
 28 | #include    "catch_main.h"
 29 | 
 30 | 
 31 | // C++
 32 | //
 33 | #include    <cctype>
 34 | #include    <iostream>
 35 | 
 36 | 
 37 | // last include
 38 | //
 39 | #include    <snapdev/poison.h>
 40 | 
 41 | // Check libutf8::start_with_bom() against buffers that do and do not
 42 | // start with a BOM, including buffers too short to hold one.
 43 | CATCH_TEST_CASE("bom", "[characters],[bom]")
 44 | {
 45 |     CATCH_START_SECTION("bom: Verify the BOM character")
 46 |         CATCH_REQUIRE(libutf8::BOM_CHAR == 0xFEFF);
 47 |     CATCH_END_SECTION()
 48 | 
 49 |     CATCH_START_SECTION("bom: Verify with a string that's too small")
 50 |     {
 51 |         CATCH_REQUIRE(libutf8::start_with_bom(nullptr, rand()) == libutf8::bom_t::BOM_NONE);
 52 |         CATCH_REQUIRE(libutf8::start_with_bom("", 0) == libutf8::bom_t::BOM_NONE);
 53 |         CATCH_REQUIRE(libutf8::start_with_bom("a", 1) == libutf8::bom_t::BOM_NONE);
 54 |     }
 55 |     CATCH_END_SECTION()
 56 | 
 57 |     CATCH_START_SECTION("bom: Verify the five BOMs as is")
 58 |     {
 59 |         char buf[4];
 60 |         char32_t const bom(libutf8::BOM_CHAR);
 61 | 
 62 |         // UTF-8 (the BOM encoded on 3 bytes followed by one more byte)
 63 |         buf[0] = static_cast<char>((bom >> 12) | 0xE0);
 64 |         buf[1] = static_cast<char>(((bom >>  6) & 0x3F) | 0x80);
 65 |         buf[2] = static_cast<char>(((bom >>  0) & 0x3F) | 0x80);
 66 |         buf[3] = '?';
 67 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF8);
 68 | 
 69 |         // UTF-16 Little Endian (with a zero in the next 2 bytes)
 70 |         buf[0] = static_cast<char>(bom >> 0);
 71 |         buf[1] = static_cast<char>(bom >> 8);
 72 |         buf[2] = static_cast<char>(0x00);
 73 |         buf[3] = static_cast<char>(0x34);
 74 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
 75 | 
 76 |         // UTF-16 Little Endian (with a zero in the next 2 bytes)
 77 |         buf[0] = static_cast<char>(bom >> 0);
 78 |         buf[1] = static_cast<char>(bom >> 8);
 79 |         buf[2] = static_cast<char>(0x12);
 80 |         buf[3] = static_cast<char>(0x00);
 81 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
 82 | 
 83 |         // UTF-16 Little Endian (no zero in the next 2 bytes)
 84 |         buf[0] = static_cast<char>(bom >> 0);
 85 |         buf[1] = static_cast<char>(bom >> 8);
 86 |         buf[2] = static_cast<char>(0x12);
 87 |         buf[3] = static_cast<char>(0x34);
 88 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_LE);
 89 | 
 90 |         // UTF-16 Big Endian (no zero in the next 2 bytes)
 91 |         buf[0] = static_cast<char>(bom >> 8);
 92 |         buf[1] = static_cast<char>(bom >> 0);
 93 |         buf[2] = static_cast<char>(0xAB);
 94 |         buf[3] = static_cast<char>(0xCD);
 95 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
 96 | 
 97 |         // UTF-16 Big Endian (with a zero in the next 2 bytes)
 98 |         buf[0] = static_cast<char>(bom >> 8);
 99 |         buf[1] = static_cast<char>(bom >> 0);
100 |         buf[2] = static_cast<char>(0x00);
101 |         buf[3] = static_cast<char>(0xCD);
102 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
103 | 
104 |         // UTF-16 Big Endian (with a zero in the next 2 bytes)
105 |         buf[0] = static_cast<char>(bom >> 8);
106 |         buf[1] = static_cast<char>(bom >> 0);
107 |         buf[2] = static_cast<char>(0xAB);
108 |         buf[3] = static_cast<char>(0x00);
109 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF16_BE);
110 | 
111 |         // UTF-32 Little Endian (the UTF-16 LE BOM followed by two zeroes)
112 |         buf[0] = static_cast<char>(bom >>  0);
113 |         buf[1] = static_cast<char>(bom >>  8);
114 |         buf[2] = static_cast<char>(bom >> 16);
115 |         buf[3] = static_cast<char>(bom >> 24);
116 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_LE);
117 | 
118 |         // UTF-32 Big Endian (two zeroes followed by the UTF-16 BE BOM)
119 |         buf[0] = static_cast<char>(bom >> 24);
120 |         buf[1] = static_cast<char>(bom >> 16);
121 |         buf[2] = static_cast<char>(bom >>  8);
122 |         buf[3] = static_cast<char>(bom >>  0);
123 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, sizeof(buf)) == libutf8::bom_t::BOM_UTF32_BE);
124 |     }
125 |     CATCH_END_SECTION()
126 | 
127 |     CATCH_START_SECTION("bom: Verify buffers without a BOM")
128 |     {
129 |         char buf[4];
130 | 
131 |         // unknown 1 byte (well... 1 byte is never really known...)
132 |         buf[0] = '?';
133 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, 1) == libutf8::bom_t::BOM_NONE);
134 | 
135 |         // unknown 2 bytes
136 |         buf[0] = 'Q';
137 |         buf[1] = '?';
138 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, 2) == libutf8::bom_t::BOM_NONE);
139 | 
140 |         // unknown 3 bytes
141 |         buf[0] = 'B';
142 |         buf[1] = 'O';
143 |         buf[2] = 'M';
144 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, 3) == libutf8::bom_t::BOM_NONE);
145 | 
146 |         // unknown 4 bytes
147 |         buf[0] = 'B';
148 |         buf[1] = 'O';
149 |         buf[2] = 'M';
150 |         buf[3] = '?';
151 |         CATCH_REQUIRE(libutf8::start_with_bom(buf, 4) == libutf8::bom_t::BOM_NONE);
152 |     }
153 |     CATCH_END_SECTION()
154 | 
155 |     CATCH_START_SECTION("bom: Verify u32string that starts with a BOM (CPU Endianness)")
156 |     {
157 |         std::u32string u32str;
158 |         u32str += libutf8::BOM_CHAR;
159 |         u32str += unittest::rand_char(true);
160 |         size_t const size(u32str.length() * sizeof(std::u32string::value_type));
161 |         for(int idx(static_cast<int>(size)); idx >= 0; --idx)  // idx = number of bytes handed to start_with_bom()
162 |         {
163 |             if(static_cast<size_t>(idx) >= sizeof(std::u32string::value_type))
164 |             {
165 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
166 |                 CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_BE);
167 | #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
168 |                 CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF32_LE);
169 | #else
170 | #error "Unsupported endianness"
171 | #endif
172 |             }
173 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
174 |             else if(static_cast<size_t>(idx) >= sizeof(std::u16string::value_type))
175 |             {
176 |                 CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_UTF16_LE);
177 |             }
178 | #endif
179 |             else
180 |             {
181 |                 // too short
182 |                 // (on big endian this branch also receives the 2 and 3 byte lengths)
183 |                 CATCH_REQUIRE(libutf8::start_with_bom(reinterpret_cast<char const *>(u32str.c_str()), idx) == libutf8::bom_t::BOM_NONE);
184 |             }
185 |         }
186 |     }
187 |     CATCH_END_SECTION()
188 | }
189 | 
190 | 
191 | // vim: ts=4 sw=4 et
192 | 


--------------------------------------------------------------------------------
/tests/catch_caseinsensitive.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 19 | 
 20 | // libutf8
 21 | //
 22 | #include    <libutf8/caseinsensitivestring.h>
 23 | 
 24 | 
 25 | // unit test
 26 | //
 27 | #include    "catch_main.h"
 28 | 
 29 | 
 30 | // C++
 31 | //
 32 | #include    <cctype>
 33 | #include    <iostream>
 34 | 
 35 | 
 36 | // last include
 37 | //
 38 | #include    <snapdev/poison.h>
 39 | 
 40 | 
 41 | 
 42 | namespace
 43 | {
 44 | 
 45 | // save the current local time ("%T") in `result`, return it followed by " PST"
 46 | libutf8::case_insensitive_string get_time(std::string & result)
 47 | {
 48 |     char text[256];
 49 |     struct tm local;
 50 |     time_t const seconds(time(nullptr));
 51 |     localtime_r(&seconds, &local);
 52 |     strftime(text, sizeof(text), "%T", &local);
 53 |     text[sizeof(text) - 1] = '\0';
 54 |     libutf8::case_insensitive_string with_zone(text);
 55 |     result = text;
 56 |     with_zone += " PST";
 57 |     return with_zone;
 58 | }
 59 | 
 60 | // save the current local date ("%F") in `result`, return it followed by " plus a few days"
 61 | std::string get_date(std::string & result)
 62 | {
 63 |     char text[256];
 64 |     struct tm local;
 65 |     time_t const seconds(time(nullptr));
 66 |     localtime_r(&seconds, &local);
 67 |     strftime(text, sizeof(text), "%F", &local);
 68 |     text[sizeof(text) - 1] = '\0';
 69 |     libutf8::case_insensitive_string with_suffix(text);
 70 |     result = text;
 71 |     with_suffix += " plus a few days";
 72 |     return with_suffix;
 73 | }
 74 | 
 75 | 
 76 | }
 77 | 
 78 | // Exercise the ways a libutf8::case_insensitive_string gets constructed and
 79 | // its comparison operators against itself, std::string and C strings.
 80 | CATCH_TEST_CASE("case_insensitive", "[string],[compare],[insensitive]")
 81 | {
 82 |     CATCH_START_SECTION("case_insensitive: Verify Case Insensitive String Constructors")
 83 |     {
 84 |         {   // no argument
 85 |             libutf8::case_insensitive_string empty;
 86 |             CATCH_REQUIRE(empty.empty());
 87 |         }
 88 | 
 89 |         {   // allocator only
 90 |             std::allocator<char> allocator;
 91 |             libutf8::case_insensitive_string empty(allocator);
 92 |             CATCH_REQUIRE(empty.empty());
 93 |         }
 94 | 
 95 |         {   // count + character
 96 |             libutf8::case_insensitive_string dashes(10, '-');
 97 |             CATCH_REQUIRE(dashes == "----------");
 98 |         }
 99 | 
100 |         {   // C string
101 |             libutf8::case_insensitive_string name("alexis");
102 |             CATCH_REQUIRE(name == "alexis");
103 |         }
104 | 
105 |         {   // C string + length
106 |             libutf8::case_insensitive_string name("alexis", 4);
107 |             CATCH_REQUIRE(name == "alex");
108 |         }
109 | 
110 |         {   // case_insensitive_string + start position
111 |             libutf8::case_insensitive_string name("alexis");
112 |             CATCH_REQUIRE(name == "alexis");
113 | 
114 |             libutf8::case_insensitive_string section(name, 2);
115 |             CATCH_REQUIRE(section == "exis");
116 |         }
117 | 
118 |         {   // case_insensitive_string + start position + length
119 |             libutf8::case_insensitive_string name("alexis");
120 |             CATCH_REQUIRE(name == "alexis");
121 | 
122 |             libutf8::case_insensitive_string section(name, 2, 2);
123 |             CATCH_REQUIRE(section == "ex");
124 |         }
125 | 
126 |         {   // std::string + start position
127 |             std::string name("alexis");
128 |             CATCH_REQUIRE(name == "alexis");
129 | 
130 |             libutf8::case_insensitive_string section(name, 2);
131 |             CATCH_REQUIRE(section == "exis");
132 |         }
133 | 
134 |         {   // std::string + start position + length
135 |             std::string name("alexis");
136 |             CATCH_REQUIRE(name == "alexis");
137 | 
138 |             libutf8::case_insensitive_string section(name, 2, 2);
139 |             CATCH_REQUIRE(section == "ex");
140 |         }
141 | 
142 |         {   // pair of iterators
143 |             libutf8::case_insensitive_string name("alexis");
144 |             CATCH_REQUIRE(name == "alexis");
145 | 
146 |             libutf8::case_insensitive_string section(name.begin() + 2, name.end() - 2);
147 |             CATCH_REQUIRE(section == "ex");
148 |         }
149 | 
150 |         {   // whole std::string
151 |             std::string name("alexis");
152 |             CATCH_REQUIRE(name == "alexis");
153 | 
154 |             libutf8::case_insensitive_string full(name);
155 |             CATCH_REQUIRE(full == "alexis");
156 |         }
157 | 
158 |         {   // whole case_insensitive_string
159 |             libutf8::case_insensitive_string name("alexis");
160 |             CATCH_REQUIRE(name == "alexis");
161 | 
162 |             libutf8::case_insensitive_string full(name);
163 |             CATCH_REQUIRE(full == "alexis");
164 |         }
165 | 
166 |         {   // braced list of characters
167 |             libutf8::case_insensitive_string name({'a', 'l', 'e', 'x', 'i', 's'});
168 |             CATCH_REQUIRE(name == "alexis");
169 |         }
170 | 
171 |         {   // temporary case_insensitive_string (get_time() returns one by value)
172 |             std::string expected("not this");
173 |             libutf8::case_insensitive_string now(get_time(expected));
174 |             CATCH_REQUIRE(expected + " PST" == now);
175 |         }
176 | 
177 |         {   // temporary case_insensitive_string + allocator
178 |             std::allocator<char> allocator;
179 |             std::string expected("not this");
180 |             libutf8::case_insensitive_string now(get_time(expected), allocator);
181 |             CATCH_REQUIRE(expected + " PST" == now);
182 |         }
183 | 
184 |         {   // temporary std::string (get_date() returns one by value)
185 |             std::string expected("not this");
186 |             libutf8::case_insensitive_string now(get_date(expected));
187 |             CATCH_REQUIRE(now == expected + " plus a few days");
188 |         }
189 | 
190 |         {   // temporary std::string + allocator
191 |             std::allocator<char> allocator;
192 |             std::string expected("not this");
193 |             libutf8::case_insensitive_string now(get_date(expected), allocator);
194 |             CATCH_REQUIRE(now == expected + " plus a few days");
195 |         }
196 |     }
197 |     CATCH_END_SECTION()
198 | 
199 |     CATCH_START_SECTION("case_insensitive: Verify Case Insensitive String Comparators")
200 |     {
201 |         {   // two case_insensitive_string which differ only by case: expected equal
202 |             libutf8::case_insensitive_string name1("Alexis");
203 |             libutf8::case_insensitive_string name2("alexis");
204 |             CATCH_REQUIRE(name1 == name2);
205 |             CATCH_REQUIRE_FALSE(name1 != name2);
206 |             CATCH_REQUIRE_FALSE(name1 > name2);
207 |             CATCH_REQUIRE(name1 >= name2);
208 |             CATCH_REQUIRE_FALSE(name1 < name2);
209 |             CATCH_REQUIRE(name1 <= name2);
210 |         }
211 | 
212 |         {   // two distinct case_insensitive_string: "Alexis" expected before "Wilke"
213 |             libutf8::case_insensitive_string name1("Alexis");
214 |             libutf8::case_insensitive_string name2("Wilke");
215 |             CATCH_REQUIRE_FALSE(name1 == name2);
216 |             CATCH_REQUIRE(name1 != name2);
217 |             CATCH_REQUIRE_FALSE(name1 > name2);
218 |             CATCH_REQUIRE_FALSE(name1 >= name2);
219 |             CATCH_REQUIRE(name1 < name2);
220 |             CATCH_REQUIRE(name1 <= name2);
221 |         }
222 | 
223 |         {   // case_insensitive_string on the left, std::string on the right
224 |             libutf8::case_insensitive_string name1("Alexis");
225 |             std::string name2("alexis");
226 |             CATCH_REQUIRE(name1 == name2);
227 |             CATCH_REQUIRE_FALSE(name1 != name2);
228 |             CATCH_REQUIRE_FALSE(name1 > name2);
229 |             CATCH_REQUIRE(name1 >= name2);
230 |             CATCH_REQUIRE_FALSE(name1 < name2);
231 |             CATCH_REQUIRE(name1 <= name2);
232 |         }
233 | 
234 |         {   // std::string on the left, case_insensitive_string on the right
235 |             std::string name1("Alexis");
236 |             libutf8::case_insensitive_string name2("Wilke");
237 |             CATCH_REQUIRE_FALSE(name1 == name2);
238 |             CATCH_REQUIRE(name1 != name2);
239 |             CATCH_REQUIRE_FALSE(name1 > name2);
240 |             CATCH_REQUIRE_FALSE(name1 >= name2);
241 |             CATCH_REQUIRE(name1 < name2);
242 |             CATCH_REQUIRE(name1 <= name2);
243 |         }
244 | 
245 |         {   // case_insensitive_string on the left, string literal on the right
246 |             libutf8::case_insensitive_string name1("Alexis");
247 |             CATCH_REQUIRE(name1 == "alexis");
248 |             CATCH_REQUIRE_FALSE(name1 != "alexis");
249 |             CATCH_REQUIRE_FALSE(name1 > "alexis");
250 |             CATCH_REQUIRE(name1 >= "alexis");
251 |             CATCH_REQUIRE_FALSE(name1 < "alexis");
252 |             CATCH_REQUIRE(name1 <= "alexis");
253 |         }
254 | 
255 |         {   // string literal on the left, case_insensitive_string on the right
256 |             libutf8::case_insensitive_string name2("Wilke");
257 |             CATCH_REQUIRE_FALSE("Alexis" == name2);
258 |             CATCH_REQUIRE("Alexis" != name2);
259 |             CATCH_REQUIRE_FALSE("Alexis" > name2);
260 |             CATCH_REQUIRE_FALSE("Alexis" >= name2);
261 |             CATCH_REQUIRE("Alexis" < name2);
262 |             CATCH_REQUIRE("Alexis" <= name2);
263 |         }
264 |     }
265 |     CATCH_END_SECTION()
266 | }
267 | 
268 | 
269 | // vim: ts=4 sw=4 et
270 | 


--------------------------------------------------------------------------------
/tests/catch_iterator.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2013-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 19 | 
 20 | // libutf8
 21 | //
 22 | #include    <libutf8/iterator.h>
 23 | 
 24 | #include    <libutf8/base.h>
 25 | #include    <libutf8/libutf8.h>
 26 | 
 27 | 
 28 | // unit test
 29 | //
 30 | #include    "catch_main.h"
 31 | 
 32 | 
 33 | // C++
 34 | //
 35 | #include    <cctype>
 36 | #include    <iostream>
 37 | 
 38 | 
 39 | // last include
 40 | //
 41 | #include    <snapdev/poison.h>
 42 | 
 43 | // Walk all the code points from 0 to 0x10FFFF (surrogates excluded),
 44 | // forward and backward, through a libutf8::utf8_iterator.
 45 | CATCH_TEST_CASE("libutf8_iterator", "[iterator]")
 46 | {
 47 |     CATCH_START_SECTION("libutf8_iterator: valid iterators tests")
 48 |     {
 49 |         char32_t p(0);  // randomly picked block start; selects where the two extra sets of checks below run
 50 |         do
 51 |         {
 52 |             p = rand() % 0x11 * 0x10000;
 53 |         }
 54 |         while(p == 0 || (p >= 0xD800 && p <= 0xDFFF));  // NOTE(review): p is a multiple of 0x10000, the surrogate test looks unreachable -- confirm
 55 | 
 56 |         for(char32_t plan(0); plan < 0x110000; plan += 0x10000)
 57 |         {
 58 |             // create one string with the 0x10000 code points of this "plan"
 59 |             // (the surrogates, 0xD800 to 0xDFFF, are left out of the first one)
 60 |             std::string str;
 61 |             str.reserve(0x10000 * 4);
 62 |             for(char32_t wc(0); wc < 0x10000; ++wc)
 63 |             {
 64 |                 if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
 65 |                 {
 66 |                     wc = 0xDFFF;    // the ++wc of the loop then resumes at 0xE000
 67 |                     continue;
 68 |                 }
 69 |                 char buf[libutf8::MBS_MIN_BUFFER_LENGTH];
 70 |                 CATCH_REQUIRE(libutf8::wctombs(buf, wc + plan, sizeof(buf)) >= 1);
 71 |                 if(plan == 0 && wc == 0)
 72 |                 {
 73 |                     // this is a special case as buf[0] = '\0' and the += with
 74 |                     // the string won't work
 75 |                     //
 76 |                     str += '\0';
 77 |                 }
 78 |                 else
 79 |                 {
 80 |                     str += buf;
 81 |                 }
 82 |             }
 83 | //std::cerr << "-------------- Plan " << static_cast<int>(plan) << " String ready " << str.length() << " ...\n";
 84 | 
 85 |             {   // run for every block: comparisons with the std::string iterators, then ++it / --it walks
 86 |                 libutf8::utf8_iterator it(str);
 87 |                 libutf8::utf8_iterator it_end(str, true);
 88 |                 libutf8::utf8_iterator it_next(str);
 89 |                 ++it_next;
 90 | 
 91 |                 CATCH_REQUIRE(it == str.begin());
 92 |                 CATCH_REQUIRE(it == str.cbegin());
 93 |                 CATCH_REQUIRE(it != str.end());
 94 |                 CATCH_REQUIRE(it != str.cend());
 95 | 
 96 |                 CATCH_REQUIRE(it == it);
 97 |                 CATCH_REQUIRE(it != it_end);
 98 |                 CATCH_REQUIRE(it != it_next);
 99 | 
100 |                 CATCH_REQUIRE(str.begin() == it);
101 |                 CATCH_REQUIRE(str.cbegin() == it);
102 |                 CATCH_REQUIRE(str.end() != it);
103 |                 CATCH_REQUIRE(str.cend() != it);
104 | 
105 |                 for(char32_t wc(0); wc < 0x10000; ++wc)
106 |                 {
107 |                     if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
108 |                     {
109 |                         wc = 0xDFFF;
110 |                         continue;
111 |                     }
112 |                     CATCH_REQUIRE(*it == wc + plan);
113 |                     ++it;
114 |                 }
115 | 
116 |                 CATCH_REQUIRE(it != str.begin());
117 |                 CATCH_REQUIRE(it != str.cbegin());
118 |                 CATCH_REQUIRE(it == str.end());
119 |                 CATCH_REQUIRE(it == str.cend());
120 | 
121 |                 CATCH_REQUIRE(str.begin() != it);
122 |                 CATCH_REQUIRE(str.cbegin() != it);
123 |                 CATCH_REQUIRE(str.end() == it);
124 |                 CATCH_REQUIRE(str.cend() == it);
125 | 
126 |                 CATCH_REQUIRE(*it == libutf8::EOS);
127 |                 ++it;   // incrementing at the end is expected to leave the iterator at the end
128 |                 it++;
129 |                 CATCH_REQUIRE(it == str.cend());
130 | 
131 |                 for(char32_t wc(0x10000); wc > 0; )
132 |                 {
133 |                     --wc;
134 |                     if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
135 |                     {
136 |                         wc = 0xD800;    // the next --wc then resumes at 0xD7FF
137 |                         continue;
138 |                     }
139 |                     --it;
140 |                     CATCH_REQUIRE(*it == wc + plan);
141 |                 }
142 | 
143 |                 --it;   // two more decrements once back on the first character
144 |                 it--;
145 | 
146 |                 CATCH_REQUIRE(it.good());
147 |                 CATCH_REQUIRE_FALSE(it.bad());
148 |             }
149 | 
150 |             if(plan == p)   // only for the randomly picked block: same walks written as *it++ and *--it
151 |             {
152 |                 libutf8::utf8_iterator it(str);
153 | 
154 |                 for(char32_t wc(0); wc < 0x10000; ++wc)
155 |                 {
156 |                     if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
157 |                     {
158 |                         wc = 0xDFFF;
159 |                         continue;
160 |                     }
161 |                     CATCH_REQUIRE(*it++ == wc + plan);
162 |                 }
163 | 
164 |                 CATCH_REQUIRE(it == str.end());
165 |                 it++;
166 |                 CATCH_REQUIRE(it.good());
167 |                 CATCH_REQUIRE_FALSE(it.bad());
168 |                 ++it;
169 |                 CATCH_REQUIRE(it.good());
170 |                 CATCH_REQUIRE_FALSE(it.bad());
171 |                 CATCH_REQUIRE(it == str.end());
172 |                 CATCH_REQUIRE(it.good());
173 |                 CATCH_REQUIRE_FALSE(it.bad());
174 | 
175 |                 for(char32_t wc(0x10000); wc > 0; )
176 |                 {
177 |                     --wc;
178 |                     if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
179 |                     {
180 |                         wc = 0xD800;
181 |                         continue;
182 |                     }
183 |                     CATCH_REQUIRE(*--it == wc + plan);
184 |                 }
185 | 
186 |                 CATCH_REQUIRE(it == str.begin());
187 |                 CATCH_REQUIRE(str.begin() == it);
188 |                 it--;
189 |                 --it;
190 |                 CATCH_REQUIRE(it == str.begin());
191 |                 CATCH_REQUIRE(str.begin() == it);
192 |             }
193 | 
194 |             if(plan == (p + 0x10000) % 0x110000)    // only for the block after p (wraps to 0): iterator differences and rewind()
195 |             {
196 |                 libutf8::utf8_iterator it(str);
197 |                 libutf8::utf8_iterator start(str);
198 |                 CATCH_REQUIRE(it - start == 0);
199 |                 CATCH_REQUIRE(start - it == 0);
200 | 
201 |                 for(char32_t wc(0); wc < 0x10000; ++wc)
202 |                 {
203 |                     if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
204 |                     {
205 |                         wc = 0xDFFF;
206 |                         continue;
207 |                     }
208 |                     CATCH_REQUIRE(*it == wc + plan);
209 |                     it++;
210 | 
211 |                     libutf8::utf8_iterator zero(it);
212 |                     zero.rewind();
213 |                     CATCH_REQUIRE(zero == start);
214 |                 }
215 | 
216 |                 libutf8::utf8_iterator copy(it);
217 |                 CATCH_REQUIRE(static_cast<std::size_t>(it - start) == str.length());    // difference checked against the std::string length
218 |                 CATCH_REQUIRE(static_cast<std::size_t>(copy - start) == str.length());
219 |                 CATCH_REQUIRE(copy - it == 0);
220 |                 CATCH_REQUIRE(it - copy == 0);
221 |                 copy.rewind();
222 |                 CATCH_REQUIRE(copy - start == 0);
223 |                 CATCH_REQUIRE(start - copy == 0);
224 |                 CATCH_REQUIRE(static_cast<std::size_t>(start - copy) == 0);
225 |                 CATCH_REQUIRE(static_cast<std::size_t>(copy - start) == 0);
226 | 
227 |                 for(char32_t wc(0x10000); wc > 0; )
228 |                 {
229 |                     --wc;
230 |                     if(plan == 0 && wc >= 0xD800 && wc <= 0xDFFF)
231 |                     {
232 |                         wc = 0xD800;
233 |                         continue;
234 |                     }
235 |                     it--;
236 |                     CATCH_REQUIRE(*it == wc + plan);
237 |                 }
238 |             }
239 |         }
240 |     }
241 |     CATCH_END_SECTION()
242 | }
243 | 
244 | 
245 | CATCH_TEST_CASE("libutf8_iterator_invalid_string", "[iterator],[invalid]")
246 | {
247 |     CATCH_START_SECTION("libutf8_iterator_invalid_string: iterators with invalid characters (bad UTF-8)")
248 |     {
249 |         for(int repeat(0); repeat < 100; ++repeat)
250 |         {
251 |             // create one plan in one string
252 |             //
253 |             constexpr size_t STR_LENGTH = 4;
254 |             char32_t wc;
255 |             std::u32string wstr;
256 |             wstr.reserve(STR_LENGTH);
257 |             for(size_t idx(0); idx < STR_LENGTH; ++idx)
258 |             {
259 |                 do
260 |                 {
261 |                     wc = unittest::rand_char(true);
262 |                 }
263 |                 while(wc < 0x80);
264 |                 wstr += wc;
265 |             }
266 |             std::string str(libutf8::to_u8string(wstr));
267 | 
268 | //std::cerr << "-------------- Plan " << static_cast<int>(plan) << " String ready " << str.length() << " ...\n";
269 | 
270 |             // first verify that it works
271 |             //
272 |             std::string::size_type pos[STR_LENGTH];
273 |             {
274 |                 libutf8::utf8_iterator it(str);
275 | 
276 |                 CATCH_REQUIRE(it == str.begin());
277 |                 CATCH_REQUIRE(it == str.cbegin());
278 |                 CATCH_REQUIRE(it != str.end());
279 |                 CATCH_REQUIRE(it != str.cend());
280 | 
281 |                 CATCH_REQUIRE(str.begin()  == it);
282 |                 CATCH_REQUIRE(str.cbegin() == it);
283 |                 CATCH_REQUIRE(str.end()    != it);
284 |                 CATCH_REQUIRE(str.cend()   != it);
285 | 
286 |                 for(size_t idx(0); idx < STR_LENGTH; ++idx)
287 |                 {
288 |                     CATCH_REQUIRE(*it == wstr[idx]);
289 |                     if(rand() % 2 == 0)
290 |                     {
291 |                         pos[idx] = it - str.begin();
292 |                     }
293 |                     else
294 |                     {
295 |                         pos[idx] = -(str.begin() - it);
296 |                     }
297 |                     ++it;
298 |                 }
299 | 
300 |                 CATCH_REQUIRE(it != str.begin());
301 |                 CATCH_REQUIRE(it != str.cbegin());
302 |                 CATCH_REQUIRE(it == str.end());
303 |                 CATCH_REQUIRE(it == str.cend());
304 | 
305 |                 CATCH_REQUIRE(str.begin()  != it);
306 |                 CATCH_REQUIRE(str.cbegin() != it);
307 |                 CATCH_REQUIRE(str.end()    == it);
308 |                 CATCH_REQUIRE(str.cend()   == it);
309 | 
310 |                 CATCH_REQUIRE(*it == libutf8::EOS);
311 |                 ++it;
312 |                 it++;
313 |                 CATCH_REQUIRE(it == str.cend());
314 | 
315 |                 CATCH_REQUIRE(it.good());
316 |                 CATCH_REQUIRE_FALSE(it.bad());
317 |             }
318 | 
319 |             {
320 |                 libutf8::utf8_iterator it(str);
321 | 
322 |                 str[pos[1]] = rand() % 0x40 + 0x80;
323 | 
324 |                 CATCH_REQUIRE(*it++ == wstr[0]);
325 |                 CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER);       // we broke this one
326 |                 CATCH_REQUIRE(*it++ == wstr[2]);
327 |                 CATCH_REQUIRE(*it++ == wstr[3]);
328 |                 CATCH_REQUIRE(*it++ == libutf8::EOS);
329 | 
330 |                 CATCH_REQUIRE_FALSE(it.good());
331 |                 CATCH_REQUIRE(it.bad());
332 |                 it.clear();
333 |                 CATCH_REQUIRE(it.good());
334 |                 CATCH_REQUIRE_FALSE(it.bad());
335 |             }
336 | 
337 |             {
338 |                 str.erase(str.length() - 1);
339 |                 libutf8::utf8_iterator it(str);
340 | 
341 |                 str[pos[1]] = rand() % 0x40 + 0x80;
342 | 
343 |                 CATCH_REQUIRE(*it++ == wstr[0]);
344 |                 CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER);
345 |                 CATCH_REQUIRE(*it++ == wstr[2]);
346 |                 CATCH_REQUIRE(*it++ == libutf8::NOT_A_CHARACTER);
347 | 
348 |                 CATCH_REQUIRE_FALSE(it.good());
349 |                 CATCH_REQUIRE(it.bad());
350 |                 it.clear();
351 |                 CATCH_REQUIRE(it.good());
352 |                 CATCH_REQUIRE_FALSE(it.bad());
353 |             }
354 |         }
355 |     }
356 |     CATCH_END_SECTION()
357 | 
358 |     CATCH_START_SECTION("libutf8_iterator_invalid_string: iterators with invalid characters (too large)")
359 |     {
360 |         for(char32_t wc(0x110000); wc < 0x1FFFFF; ++wc)
361 |         {
362 |             // since this character is not valid
363 |             // we have to encode it _manually_
364 |             //
365 |             char buf[4];
366 |             buf[0] = 0xF0 | ((wc >> 18) & 0x07);
367 |             buf[1] = 0x80 | ((wc >> 12) & 0x3F);
368 |             buf[2] = 0x80 | ((wc >>  6) & 0x3F);
369 |             buf[3] = 0x80 | ((wc >>  0) & 0x3F);
370 | 
371 |             std::string str(buf, 4);
372 | 
373 |             // first verify that it works
374 |             //
375 |             {
376 |                 libutf8::utf8_iterator it(str);
377 | 
378 |                 CATCH_REQUIRE(it == str.begin());
379 |                 CATCH_REQUIRE(it == str.cbegin());
380 |                 CATCH_REQUIRE(it != str.end());
381 |                 CATCH_REQUIRE(it != str.cend());
382 | 
383 |                 CATCH_REQUIRE(str.begin()  == it);
384 |                 CATCH_REQUIRE(str.cbegin() == it);
385 |                 CATCH_REQUIRE(str.end()    != it);
386 |                 CATCH_REQUIRE(str.cend()   != it);
387 | 
388 |                 CATCH_REQUIRE(*it == libutf8::NOT_A_CHARACTER);
389 | 
390 |                 CATCH_REQUIRE_FALSE(it.good());
391 |                 CATCH_REQUIRE(it.bad());
392 |                 it.clear();
393 |                 CATCH_REQUIRE(it.good());
394 |                 CATCH_REQUIRE_FALSE(it.bad());
395 | 
396 |                 ++it;
397 | 
398 |                 CATCH_REQUIRE(it != str.begin());
399 |                 CATCH_REQUIRE(it != str.cbegin());
400 |                 CATCH_REQUIRE(it == str.end());
401 |                 CATCH_REQUIRE(it == str.cend());
402 | 
403 |                 CATCH_REQUIRE(str.begin()  != it);
404 |                 CATCH_REQUIRE(str.cbegin() != it);
405 |                 CATCH_REQUIRE(str.end()    == it);
406 |                 CATCH_REQUIRE(str.cend()   == it);
407 | 
408 |                 CATCH_REQUIRE(*it == libutf8::EOS);
409 |                 ++it;
410 |                 it++;
411 |                 CATCH_REQUIRE(it == str.cend());
412 | 
413 |                 CATCH_REQUIRE_FALSE(it.good());
414 |                 CATCH_REQUIRE(it.bad());
415 |             }
416 |         }
417 |     }
418 |     CATCH_END_SECTION()
419 | }
420 | 
421 | 
422 | 
423 | // vim: ts=4 sw=4 et
424 | 


--------------------------------------------------------------------------------
/tests/catch_length.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2013-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 19 | 
 20 | // libutf8
 21 | //
 22 | #include    <libutf8/exception.h>
 23 | #include    <libutf8/libutf8.h>
 24 | 
 25 | 
 26 | // unit test
 27 | //
 28 | #include    "catch_main.h"
 29 | 
 30 | 
 31 | // C++
 32 | //
 33 | #include    <cctype>
 34 | #include    <iostream>
 35 | #include    <iomanip>
 36 | 
 37 | 
 38 | // last include
 39 | //
 40 | #include    <snapdev/poison.h>
 41 | 
 42 | 
 43 | 
 44 | CATCH_TEST_CASE("string_length", "[strings][valid][length][u8][u16][u32]")
 45 | {
 46 |     CATCH_START_SECTION("string_length: length of valid Unicode strings")
 47 |     {
 48 |         for(int idx(0); idx < 100; ++idx)
 49 |         {
 50 |             std::size_t const length(rand() % 100 + 1);
 51 |             std::u32string str32;
 52 |             for(std::size_t j(0); j < length; ++j)
 53 |             {
 54 |                 char32_t const c(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE));
 55 |                 str32 += c;
 56 |             }
 57 |             CATCH_REQUIRE(libutf8::is_valid_unicode(str32));
 58 |             CATCH_REQUIRE(str32.length() == length);
 59 | 
 60 |             std::string str8(libutf8::to_u8string(str32));
 61 |             CATCH_REQUIRE(libutf8::is_valid_utf8(str8));
 62 |             CATCH_REQUIRE(str8.length() >= length);
 63 |             CATCH_REQUIRE(libutf8::u8length(str8) == length);
 64 | 
 65 |             std::u16string str16(libutf8::to_u16string(str8));
 66 |             CATCH_REQUIRE(libutf8::is_valid_utf16(str16));
 67 |             CATCH_REQUIRE(str16.length() >= length);
 68 |             CATCH_REQUIRE(static_cast<std::size_t>(libutf8::u16length(str16)) == length);
 69 |         }
 70 |     }
 71 |     CATCH_END_SECTION()
 72 | }
 73 | 
 74 | 
 75 | CATCH_TEST_CASE("invalid_utf16_string_length", "[strings][invalid][length][u16]")
 76 | {
 77 |     CATCH_START_SECTION("invalid_utf16_string_length: invalid UTF-16 returns -1 for length")
 78 |     {
 79 |         for(int idx(0); idx < 100; ++idx)
 80 |         {
 81 |             std::size_t const length(rand() % 30 + 5);
 82 |             char16_t bad_char(rand() & 0x03FF);
 83 |             std::size_t bad_pos(length / 2);
 84 |             switch(idx % 3)
 85 |             {
 86 |             case 0:
 87 |                 bad_char += 0xDC00;     // low without a high
 88 |                 break;
 89 | 
 90 |             case 1:
 91 |                 bad_char += 0xD800;     // high not followed by a low
 92 |                 break;
 93 | 
 94 |             case 2:
 95 |                 bad_char += 0xD800;     // high followed by u'\0'
 96 |                 bad_pos = length - 1;
 97 |                 break;
 98 | 
 99 |             }
100 |             std::u16string str16;
101 |             for(std::size_t j(0); j < length; ++j)
102 |             {
103 |                 char32_t const wc(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE));
104 |                 str16 += libutf8::to_u16string(wc);
105 |                 if(j == bad_pos)
106 |                 {
107 |                     str16 += bad_char;
108 |                 }
109 |             }
110 | 
111 |             CATCH_REQUIRE_FALSE(libutf8::is_valid_utf16(str16));
112 |             CATCH_REQUIRE(libutf8::u16length(str16) == -1);
113 |         }
114 |     }
115 |     CATCH_END_SECTION()
116 | }
117 | 
118 | 
119 | // vim: ts=4 sw=4 et
120 | 


--------------------------------------------------------------------------------
/tests/catch_main.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021-2023  Made to Order Software Corp.  All Rights Reserved
 2 | //
 3 | // https://snapwebsites.org/project/libutf8
 4 | // contact@m2osw.com
 5 | //
 6 | // This program is free software; you can redistribute it and/or modify
 7 | // it under the terms of the GNU General Public License as published by
 8 | // the Free Software Foundation; either version 2 of the License, or
 9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | 
20 | // Tell catch we want it to add the runner code in this file.
21 | #define CATCH_CONFIG_RUNNER
22 | 
23 | // self
24 | //
25 | #include    "catch_main.h"
26 | 
27 | 
28 | // libutf8
29 | //
30 | #include    <libutf8/libutf8.h>
31 | #include    <libutf8/version.h>
32 | 
33 | 
34 | // libexcept
35 | //
36 | #include    <libexcept/exception.h>
37 | 
38 | 
39 | // C++
40 | //
41 | #include    <sstream>
42 | 
43 | 
44 | // last include
45 | //
46 | #include    <snapdev/poison.h>
47 | 
48 | 
49 | 
50 | 
51 | 
52 | int main(int argc, char * argv[])
53 | {
54 |     return SNAP_CATCH2_NAMESPACE::snap_catch2_main(
55 |               "libutf8"
56 |             , LIBUTF8_VERSION_STRING
57 |             , argc
58 |             , argv
59 |             , []() { libexcept::set_collect_stack(libexcept::collect_stack_t::COLLECT_STACK_NO); }
60 |         );
61 | }
62 | 
63 | 
64 | // vim: ts=4 sw=4 et
65 | 


--------------------------------------------------------------------------------
/tests/catch_main.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2006-2023  Made to Order Software Corp.  All Rights Reserved
 2 | //
 3 | // https://snapwebsites.org/project/libutf8
 4 | // contact@m2osw.com
 5 | //
 6 | // This program is free software; you can redistribute it and/or modify
 7 | // it under the terms of the GNU General Public License as published by
 8 | // the Free Software Foundation; either version 2 of the License, or
 9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | #pragma once
20 | 
21 | // libutf8
22 | //
23 | #include    <libutf8/libutf8.h>     // for the ostream
24 | 
25 | 
26 | // catch2
27 | //
28 | #include    <catch2/snapcatch2.hpp>
29 | 
30 | 
31 | // C++
32 | //
33 | #include    <string>
34 | #include    <cstring>
35 | #include    <cstdlib>
36 | #include    <iostream>
37 | 
38 | 
39 | // last include
40 | //
41 | #include    <snapdev/poison.h>
42 | 
43 | 
44 | 
45 | namespace SNAP_CATCH2_NAMESPACE
46 | {
47 | 
48 | 
49 | 
50 | 
/** \brief Generate a random, non-null Unicode character.
 *
 * The function never returns U+0000 nor a UTF-16 surrogate
 * (U+D800 to U+DFFF).
 *
 * \param[in] full_range  When true, the result is anywhere between U+0001
 *                        and U+10FFFF; otherwise it is limited to the
 *                        U+0001 to U+FFFF range.
 *
 * \return The random character.
 */
inline char32_t rand_char(bool full_range = false)
{
    // number of code points reserved for the UTF-16 surrogates
    //
    constexpr char32_t SURROGATE_COUNT = 0xE000 - 0xD800;

    // the surrogates get skipped below so remove them from the count
    //
    char32_t const max((full_range ? 0x0110000 : 0x0010000) - SURROGATE_COUNT);

    char32_t wc;
    do
    {
        // combine two rand() results using unsigned arithmetic; shifting
        // the signed int returned by rand() by 16 bits overflows (undefined
        // behavior) whenever RAND_MAX is larger than 0x7FFF
        //
        char32_t const high(static_cast<char32_t>(rand()) << 16);
        char32_t const low(static_cast<char32_t>(rand()));
        wc = (high ^ low) % max;
    }
    while(wc == 0);
    if(wc >= 0xD800)
    {
        // skip the surrogates
        //
        wc += SURROGATE_COUNT;
    }

    return wc;
}
70 | 
71 | 
72 | 
73 | }
74 | // unittest namespace
75 | // vim: ts=4 sw=4 et
76 | 


--------------------------------------------------------------------------------
/tests/catch_stream.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2013-2023  Made to Order Software Corp.  All Rights Reserved
 2 | //
 3 | // https://snapwebsites.org/project/libutf8
 4 | // contact@m2osw.com
 5 | //
 6 | // This program is free software; you can redistribute it and/or modify
 7 | // it under the terms of the GNU General Public License as published by
 8 | // the Free Software Foundation; either version 2 of the License, or
 9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | 
20 | // libutf8
21 | //
22 | #include    <libutf8/libutf8.h>
23 | 
24 | 
25 | // unit test
26 | //
27 | #include    "catch_main.h"
28 | 
29 | 
30 | // C++
31 | //
32 | #include    <cctype>
33 | #include    <iostream>
34 | #include    <iomanip>
35 | 
36 | 
37 | // last include
38 | //
39 | #include    <snapdev/poison.h>
40 | 
41 | 
42 | 
43 | CATCH_TEST_CASE("stream", "[stream][valid]")
44 | {
45 |     CATCH_START_SECTION("stream: write a char32_t to a stream")
46 |     {
47 |         for(int i(0); i < 1000; ++i)
48 |         {
49 |             char32_t const wc(SNAP_CATCH2_NAMESPACE::random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_ZUNICODE));
50 | 
51 |             std::stringstream ss;
52 |             ss << libutf8::to_u8string(wc);
53 | 
54 |             CATCH_REQUIRE(ss.str() == libutf8::to_u8string(wc));
55 |         }
56 |     }
57 |     CATCH_END_SECTION()
58 | }
59 | 
60 | 
61 | 
62 | // vim: ts=4 sw=4 et
63 | 


--------------------------------------------------------------------------------
/tests/catch_valid.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2013-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 19 | 
 20 | // libutf8
 21 | //
 22 | #include    <libutf8/exception.h>
 23 | #include    <libutf8/libutf8.h>
 24 | 
 25 | 
 26 | // unit test
 27 | //
 28 | #include    "catch_main.h"
 29 | 
 30 | 
 31 | // snapdev
 32 | //
 33 | #include    <snapdev/hexadecimal_string.h>
 34 | 
 35 | 
 36 | // C++
 37 | //
 38 | #include    <cctype>
 39 | #include    <iostream>
 40 | #include    <iomanip>
 41 | 
 42 | 
 43 | // last include
 44 | //
 45 | #include    <snapdev/poison.h>
 46 | 
 47 | 
 48 | 
 49 | CATCH_TEST_CASE("make_valid", "[strings][valid][u8]")
 50 | {
 51 |     CATCH_START_SECTION("make_valid: test bad encoding (1 byte when 2 necessary)")
 52 |     {
 53 |         for(char32_t two_bytes(0x80); two_bytes < 0x800; ++two_bytes)
 54 |         {
 55 |             char const byte1(static_cast<char>((two_bytes >> 6) | 0xC0));
 56 |             char32_t const vc1(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
 57 |             char32_t const vc2(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
 58 |             char32_t const fix_char(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
 59 |             std::string invalid_string;
 60 |             invalid_string += vc1;
 61 |             invalid_string += byte1;
 62 |             invalid_string += vc2;
 63 |             std::string expected_string;
 64 |             expected_string += vc1;
 65 |             expected_string += fix_char;
 66 |             expected_string += vc2;
 67 |             CATCH_REQUIRE_FALSE(libutf8::make_u8string_valid(invalid_string, fix_char));
 68 |             CATCH_REQUIRE(invalid_string == expected_string);
 69 |         }
 70 |     }
 71 |     CATCH_END_SECTION()
 72 | 
 73 |     CATCH_START_SECTION("make_valid: test bad encoding (2 bytes when 3 necessary)")
 74 |     {
 75 |         for(char32_t two_bytes(0x800); two_bytes < 0x10000; ++two_bytes)
 76 |         {
 77 |             // Note: this includes the UTF-16 surrogates which are also
 78 |             //       considered invalid
 79 |             //
 80 |             char const byte1(static_cast<char>((two_bytes >> 12) | 0xE0));
 81 |             char const byte2(((two_bytes >> 6) & 0x3F) | 0x80);
 82 |             char32_t const vc1(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
 83 |             char32_t const vc2(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
 84 |             char32_t const fix_char(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
 85 |             std::string invalid_string;
 86 |             invalid_string += vc1;
 87 |             invalid_string += byte1;
 88 |             invalid_string += byte2;
 89 |             invalid_string += vc2;
 90 |             std::string expected_string;
 91 |             expected_string += vc1;
 92 |             expected_string += fix_char;
 93 |             expected_string += vc2;
 94 |             CATCH_REQUIRE_FALSE(libutf8::make_u8string_valid(invalid_string, fix_char));
 95 |             CATCH_REQUIRE(invalid_string == expected_string);
 96 |         }
 97 |     }
 98 |     CATCH_END_SECTION()
 99 | 
100 |     CATCH_START_SECTION("make_valid: test bad encoding (3 bytes when 4 necessary)")
101 |     {
102 |         for(char32_t two_bytes(0x10000); two_bytes < 0x110000; ++two_bytes)
103 |         {
104 |             char const byte1(static_cast<char>((two_bytes >> 18) | 0xF0));
105 |             char const byte2(((two_bytes >> 12) & 0x3F) | 0x80);
106 |             char const byte3(((two_bytes >> 6) & 0x3F) | 0x80);
107 |             char32_t const vc1(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
108 |             char32_t const vc2(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
109 |             char32_t const fix_char(random_char(SNAP_CATCH2_NAMESPACE::character_t::CHARACTER_UNICODE));
110 |             std::string invalid_string;
111 |             invalid_string += vc1;
112 |             invalid_string += byte1;
113 |             invalid_string += byte2;
114 |             invalid_string += byte3;
115 |             invalid_string += vc2;
116 |             std::string expected_string;
117 |             expected_string += vc1;
118 |             expected_string += fix_char;
119 |             expected_string += vc2;
120 |             CATCH_REQUIRE_FALSE(libutf8::make_u8string_valid(invalid_string, fix_char));
121 |             CATCH_REQUIRE(invalid_string == expected_string);
122 |         }
123 |     }
124 |     CATCH_END_SECTION()
125 | }
126 | 
127 | 
128 | 
129 | // vim: ts=4 sw=4 et
130 | 


--------------------------------------------------------------------------------
/tests/catch_version.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021-2023  Made to Order Software Corp.  All Rights Reserved
 2 | //
 3 | // https://snapwebsites.org/project/libutf8
 4 | // contact@m2osw.com
 5 | //
 6 | // This program is free software; you can redistribute it and/or modify
 7 | // it under the terms of the GNU General Public License as published by
 8 | // the Free Software Foundation; either version 2 of the License, or
 9 | // (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License along
17 | // with this program; if not, write to the Free Software Foundation, Inc.,
18 | // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 | 
20 | // libutf8
21 | //
22 | #include    <libutf8/version.h>
23 | 
24 | 
25 | // self
26 | //
27 | #include    "catch_main.h"
28 | 
29 | 
30 | // last include
31 | //
32 | #include    <snapdev/poison.h>
33 | 
34 | 
35 | 
36 | 
37 | CATCH_TEST_CASE("version", "[version]")
38 | {
39 |     CATCH_START_SECTION("version: verify runtime vs compile time version numbers")
40 |     {
41 |         CATCH_REQUIRE(libutf8::get_major_version()   == LIBUTF8_VERSION_MAJOR);
42 |         CATCH_REQUIRE(libutf8::get_release_version() == LIBUTF8_VERSION_MINOR);
43 |         CATCH_REQUIRE(libutf8::get_patch_version()   == LIBUTF8_VERSION_PATCH);
44 |         CATCH_REQUIRE(strcmp(libutf8::get_version_string(), LIBUTF8_VERSION_STRING) == 0);
45 |     }
46 |     CATCH_END_SECTION()
47 | }
48 | 
49 | 
50 | // vim: ts=4 sw=4 et
51 | 


--------------------------------------------------------------------------------
/tests/example-for-show-utf16.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/tests/example-for-show-utf16.txt


--------------------------------------------------------------------------------
/tests/example-for-show-utf32.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/m2osw/libutf8/fe008f2199f5d8f43abe6ecffe6e82ead03ab2b5/tests/example-for-show-utf32.txt


--------------------------------------------------------------------------------
/tests/example-for-show-utf8.txt:
--------------------------------------------------------------------------------
1 | Tḩìs 𝄞 ĩş bêȧútîfüł!
2 | 


--------------------------------------------------------------------------------
/tests/unicode/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Please see the ../conf/unicode/LICENSE.txt files for the license.
2 | 


--------------------------------------------------------------------------------
/tests/verify-show-unicode.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh -e
  2 | #
  3 | # Verify that the show-unicode command line tool returns the correct exit codes
  4 | #
  5 | # TODO: verify the actual output (that may require a catch_....cpp file so
  6 | #       we can capture the output and easy compare with string we generate
  7 | #       in a C++ test)
  8 | 
  9 | SHOW_UNICODE="../../BUILD/Debug/contrib/libutf8/tools/show-unicode"
 10 | ERRCNT=0
 11 | RED='\033[0;31m'
 12 | NORMAL='\033[0m'
 13 | 
 14 | # Verify Binary Exists
 15 | if ! test -x ${SHOW_UNICODE}
 16 | then
 17 | 	echo "${RED}error: could not find valid binary \"${SHOW_UNICODE}\"; did you build the project?${NORMAL}"
 18 | 	echo "1 error occurred. Please verify what went wrong and fix it."
 19 | 	exit 1
 20 | fi
 21 | 
 22 | # Help
 23 | echo "--- SECTION: --help"
 24 | if ${SHOW_UNICODE} --help
 25 | then
 26 | 	echo "${RED}error: --help returned with success.${NORMAL}"
 27 | 	ERRCNT=`expr ${ERRCNT} + 1`
 28 | elif test ${?} -ne 2
 29 | then
 30 | 	echo "${RED}error: --help did not return with expected exit code.${NORMAL}"
 31 | 	ERRCNT=`expr ${ERRCNT} + 1`
 32 | else
 33 | 	echo "info: --help works."
 34 | fi
 35 | echo
 36 | 
 37 | # Version
 38 | echo "--- SECTION: --version"
 39 | if ${SHOW_UNICODE} --version
 40 | then
 41 | 	echo "${RED}error: --version returned with success.${NORMAL}"
 42 | 	ERRCNT=`expr ${ERRCNT} + 1`
 43 | elif test ${?} -ne 2
 44 | then
 45 | 	echo "${RED}error: --version did not return with expected exit code.${NORMAL}"
 46 | 	ERRCNT=`expr ${ERRCNT} + 1`
 47 | else
 48 | 	echo "info: --version works."
 49 | fi
 50 | echo
 51 | 
 52 | # String / Character
 53 | echo "--- SECTION: --string"
 54 | if ${SHOW_UNICODE} "Magic"
 55 | then
 56 | 	echo "info: string display worked."
 57 | else
 58 | 	echo "${RED}error: string display failed with ${?}.${NORMAL}"
 59 | 	ERRCNT=`expr ${ERRCNT} + 1`
 60 | fi
 61 | echo
 62 | 
 63 | if ${SHOW_UNICODE} --string "Élémentaire ça!"
 64 | then
 65 | 	echo "info: string display worked."
 66 | else
 67 | 	echo "${RED}error: string display failed with ${?}.${NORMAL}"
 68 | 	ERRCNT=`expr ${ERRCNT} + 1`
 69 | fi
 70 | echo
 71 | 
 72 | echo "--- SECTION: --character"
 73 | if ${SHOW_UNICODE} -C 0x1D11E
 74 | then
 75 | 	echo "info: character display worked."
 76 | else
 77 | 	echo "${RED}error: character display failed with ${?}.${NORMAL}"
 78 | 	ERRCNT=`expr ${ERRCNT} + 1`
 79 | fi
 80 | echo
 81 | 
 82 | if ${SHOW_UNICODE} -C 1D11E
 83 | then
 84 | 	echo "${RED}error: character display succeeded with invalid number syntax.${NORMAL}"
 85 | 	ERRCNT=`expr ${ERRCNT} + 1`
 86 | else
 87 | 	ERRCODE=${?}
 88 | 	if test ${ERRCODE} -eq 1
 89 | 	then
 90 | 		echo "info: character display failed as expected with ${ERRCODE}."
 91 | 	else
 92 | 		echo "${RED}error: character display failed with unexpected error code ${ERRCODE}.${NORMAL}"
 93 | 		ERRCNT=`expr ${ERRCNT} + 1`
 94 | 	fi
 95 | fi
 96 | echo
 97 | 
 98 | # Files
 99 | check_show() {
100 | 	echo "--- SECTION: file with: ${1}"
101 | 	if ${SHOW_UNICODE} "${1}" tests/example-for-show-${2}.txt
102 | 	then
103 | 		echo "info: ${2} display worked."
104 | 	else
105 | 		echo "${RED}error: ${2} display failed with ${?}.${NORMAL}"
106 | 		ERRCNT=`expr ${ERRCNT} + 1`
107 | 	fi
108 | 	echo
109 | }
110 | 
111 | check_show -f utf8
112 | check_show -S utf16
113 | check_show -F utf32
114 | 
115 | if test ${ERRCNT} -eq 0
116 | then
117 | 	exit 0
118 | fi
119 | 
120 | echo "${ERRCNT} errors occurred. Please verify what went wrong and fix it."
121 | exit 1
122 | 
123 | 


--------------------------------------------------------------------------------
/tools/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2012-2023  Made to Order Software Corp.  All Rights Reserved
 2 | #
 3 | # https://snapwebsites.org/project/libutf8
 4 | # contact@m2osw.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License as published by
 8 | # the Free Software Foundation; either version 2 of the License, or
 9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License along
17 | # with this program; if not, write to the Free Software Foundation, Inc.,
18 | # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
19 | 
20 | ##
21 | ## show-unicode
22 | ##
project(show-unicode)

add_executable(${PROJECT_NAME}
    show_unicode.cpp
)

# an executable has no consumers, so its dependencies are always PRIVATE;
# the explicit keyword also avoids the legacy keyword-less signature
target_link_libraries(${PROJECT_NAME}
    PRIVATE
        utf8
)

install(
    TARGETS
        ${PROJECT_NAME}

    RUNTIME DESTINATION
        bin
)
40 | 
41 | 
42 | ##
43 | ## unicode-data-parser
44 | ##
project(unicode-data-parser)

add_executable(${PROJECT_NAME}
    unicode_data_parser.cpp
)

# an executable has no consumers, so its usage requirements are PRIVATE
target_include_directories(${PROJECT_NAME}
    PRIVATE
        ${ADVGETOPT_INCLUDE_DIRS}
        ${LIBEXCEPT_INCLUDE_DIRS}
)

# the explicit keyword avoids the legacy keyword-less signature
target_link_libraries(${PROJECT_NAME}
    PRIVATE
        utf8
        ${LIBEXCEPT_LIBRARIES}
)

install(
    TARGETS
        ${PROJECT_NAME}

    RUNTIME DESTINATION
        bin
)
69 | 
70 | 
71 | # vim: ts=4 sw=4 et
72 | 


--------------------------------------------------------------------------------
/tools/show_unicode.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | 
/** \file
 * \brief Tool used to display the Unicode characters found in its input.
 *
 * This executable implements the show-unicode command line tool. Its
 * input is a string, a character code, or the name of a file encoded in
 * UTF-8, UTF-16, or UTF-32.
 *
 * \note
 * The previous description of this file was the one of the tool which
 * converts the UnicodeData.txt file to C structures (used to convert
 * strings to NFKC, NFKD, NFC and NFD); that tool is a separate executable.
 *
 * \sa http://www.unicode.org/reports/tr15/
 */
 30 | 
 31 | 
 32 | // libutf8
 33 | //
 34 | #include    <libutf8/libutf8.h>
 35 | #include    <libutf8/version.h>
 36 | 
 37 | 
 38 | // C++
 39 | //
 40 | #include    <cstdint>
 41 | #include    <cstring>
 42 | #include    <fstream>
 43 | #include    <iomanip>
 44 | #include    <iostream>
 45 | #include    <string>
 46 | #include    <vector>
 47 | 
 48 | 
 49 | // last include
 50 | //
 51 | #include    <snapdev/poison.h>
 52 | 
 53 | 
 54 | 
 55 | namespace
 56 | {
 57 | 
 58 | 
 59 | // gather the input defined on the command line and print its encodings
 60 | class show_unicode
 61 | {
 62 | public:
 63 |     enum class mode_t           // how the input was specified
 64 |     {
 65 |         MODE_STRING,            // -s | --string
 66 |         MODE_CHARACTER,         // -C | --unicode
 67 |         MODE_UTF8_FILENAME,     // -f | --input
 68 |         MODE_UTF16_FILENAME,    // -S | --input-utf16
 69 |         MODE_UTF32_FILENAME,    // -F | --input-utf32
 70 |         MODE_DEFAULT // like MODE_STRING, just not set explicitly
 71 |     };
 72 | 
 73 |     int         parse_args(int argc, char * argv[]);    // 0 to go on, otherwise the exit code
 74 |     int         verify_args();                          // 0 to go on, otherwise the exit code
 75 |     int         process();                              // print the input and its encodings
 76 | 
 77 | private:
 78 |     void        usage();                                // print the help screen
 79 |     int         set_mode(mode_t m);                     // 3 if a mode was already selected
 80 |     int         read_file();                            // load f_filename in f_input
 81 | 
 82 |     mode_t                      f_mode = mode_t::MODE_DEFAULT;
 83 |     std::string                 f_filename = std::string();                 // last input filename
 84 |     std::vector<std::uint8_t>   f_input = std::vector<std::uint8_t>();      // input bytes to display
 85 |     bool                        f_valid_fffe_ffff = true;                   // set by --valid-fffe-ffff and -W
 86 | };
 87 | 
 88 | 
 89 | 
 90 | 
 91 | /** \brief Parse the command line and gather the input in f_input.
 92 |  *
 93 |  * Parameters without a leading dash are appended to the input as is. Only
 94 |  * one of -C, -s, -f, -S, and -F can be used. UTF-16 and UTF-32 files are
 95 |  * converted to UTF-8 before their content gets saved in f_input.
 96 |  *
 97 |  * \return 0 to continue, 2 after --help or --version, 1, 3, or 4 after
 98 |  *         an error message was printed in std::cerr.
 99 |  */
100 | int show_unicode::parse_args(int argc, char * argv[])
101 | {
102 |     for(int i(1); i < argc; ++i)
103 |     {
104 |         if(argv[i][0] != '-')
105 |         {
106 |             // not an option, add the parameter to the input as is
107 |             f_input.insert(f_input.end(), argv[i], argv[i] + strlen(argv[i]));
108 |             continue;
109 |         }
110 |         if(strcmp(argv[i], "-h") == 0
111 |         || strcmp(argv[i], "--help") == 0)
112 |         {
113 |             usage();
114 |             return 2;
115 |         }
116 |         if(strcmp(argv[i], "-V") == 0
117 |         || strcmp(argv[i], "--version") == 0)
118 |         {
119 |             std::cout << LIBUTF8_VERSION_STRING << '\n';
120 |             return 2;
121 |         }
122 |         if(strcmp(argv[i], "-C") == 0
123 |         || strcmp(argv[i], "--unicode") == 0)
124 |         {
125 |             ++i;
126 |             if(i >= argc)
127 |             {
128 |                 std::cerr << "error: the --unicode command line option must be followed by a number representing a valid Unicode character in UTF-32.\n";
129 |                 return 3;
130 |             }
131 |             // base 0 lets strtol() recognize the "0x" (hexadecimal) and "0"
132 |             // (octal) introducers; end == argv[i] means no digits were found
133 |             char * end(nullptr);
134 |             long const value(strtol(argv[i], &end, 0));
135 |             if(end == argv[i]
136 |             || *end != '\0')
137 |             {
138 |                 std::cerr
139 |                     << "error: expected a valid decimal, octal, or hexadecimal number; could not parse \""
140 |                     << argv[i]
141 |                     << "\" as a valid number.\n";
142 |                 return 1;
143 |             }
144 |             // check the range before the cast to char32_t, otherwise a
145 |             // negative or very large number may get truncated to a valid code
146 |             if(value < 0
147 |             || value > 0x10FFFF
148 |             || !libutf8::is_valid_unicode(static_cast<char32_t>(value)))
149 |             {
150 |                 std::cerr
151 |                     << "error: code \"0x"
152 |                     << std::uppercase << std::hex << std::setfill('0') << std::setw(6) << static_cast<unsigned long>(value)
153 |                     << "\" does not represent a valid Unicode character.\n";
154 |                 return 1;
155 |             }
156 |             std::string const character(libutf8::to_u8string(static_cast<char32_t>(value)));
157 |             f_input.insert(f_input.end(), character.begin(), character.end());
158 |             int const r(set_mode(mode_t::MODE_CHARACTER));
159 |             if(r != 0)
160 |             {
161 |                 return r;
162 |             }
163 |             continue;
164 |         }
165 |         if(strcmp(argv[i], "-s") == 0
166 |         || strcmp(argv[i], "--string") == 0)
167 |         {
168 |             ++i;
169 |             if(i >= argc)
170 |             {
171 |                 std::cerr << "error: the --string command line option must be followed by the string to process.\n";
172 |                 return 3;
173 |             }
174 |             f_input.insert(f_input.end(), argv[i], argv[i] + strlen(argv[i]));
175 |             int const r(set_mode(mode_t::MODE_STRING));
176 |             if(r != 0)
177 |             {
178 |                 return r;
179 |             }
180 |             continue;
181 |         }
182 |         if(strcmp(argv[i], "-f") == 0
183 |         || strcmp(argv[i], "--input") == 0)
184 |         {
185 |             ++i;
186 |             if(i >= argc)
187 |             {
188 |                 std::cerr << "error: the --input command line option must be followed by the input filename.\n";
189 |                 return 3;
190 |             }
191 |             f_filename = argv[i];
192 |             int r(set_mode(mode_t::MODE_UTF8_FILENAME));
193 |             if(r == 0)
194 |             {
195 |                 r = read_file();
196 |             }
197 |             if(r != 0)
198 |             {
199 |                 return r;
200 |             }
201 |             continue;
202 |         }
203 |         if(strcmp(argv[i], "-S") == 0
204 |         || strcmp(argv[i], "--input-utf16") == 0)
205 |         {
206 |             ++i;
207 |             if(i >= argc)
208 |             {
209 |                 std::cerr << "error: the --input-utf16 command line option must be followed by the input filename.\n";
210 |                 return 3;
211 |             }
212 |             f_filename = argv[i];
213 |             int r(set_mode(mode_t::MODE_UTF16_FILENAME));
214 |             if(r == 0)
215 |             {
216 |                 r = read_file();
217 |             }
218 |             if(r != 0)
219 |             {
220 |                 return r;
221 |             }
222 |             if(f_input.size() % 2 != 0)
223 |             {
224 |                 std::cerr << "error: the size of \""
225 |                     << f_filename
226 |                     << "\" was expected to be a multiple of 2.\n";
227 |                 return 1;
228 |             }
229 |             // copy the bytes as is (host byte order) in a properly typed
230 |             // string; casting the byte buffer to char16_t breaks aliasing rules
231 |             std::u16string in(f_input.size() / 2, u'\0');
232 |             memcpy(&in[0], f_input.data(), f_input.size());
233 |             std::string const converted(libutf8::to_u8string(in));
234 |             f_input.assign(converted.begin(), converted.end());
235 |             continue;
236 |         }
237 |         if(strcmp(argv[i], "-F") == 0
238 |         || strcmp(argv[i], "--input-utf32") == 0)
239 |         {
240 |             ++i;
241 |             if(i >= argc)
242 |             {
243 |                 std::cerr << "error: the --input-utf32 command line option must be followed by the input filename.\n";
244 |                 return 3;
245 |             }
246 |             f_filename = argv[i];
247 |             int r(set_mode(mode_t::MODE_UTF32_FILENAME));
248 |             if(r == 0)
249 |             {
250 |                 r = read_file();
251 |             }
252 |             if(r != 0)
253 |             {
254 |                 return r;
255 |             }
256 |             if(f_input.size() % 4 != 0)
257 |             {
258 |                 std::cerr << "error: the size of \""
259 |                     << f_filename
260 |                     << "\" was expected to be a multiple of 4.\n";
261 |                 return 1;
262 |             }
263 |             // copy the bytes as is (host byte order) in a properly typed
264 |             // string; casting the byte buffer to char32_t breaks aliasing rules
265 |             std::u32string in(f_input.size() / 4, U'\0');
266 |             memcpy(&in[0], f_input.data(), f_input.size());
267 |             std::string const converted(libutf8::to_u8string(in));
268 |             f_input.assign(converted.begin(), converted.end());
269 |             continue;
270 |         }
271 |         if(strcmp(argv[i], "--valid-fffe-ffff") == 0)
272 |         {
273 |             f_valid_fffe_ffff = true;
274 |             continue;
275 |         }
276 |         if(strcmp(argv[i], "-W") == 0
277 |         || strcmp(argv[i], "--invalid-fffe-ffff") == 0)
278 |         {
279 |             f_valid_fffe_ffff = false;
280 |             continue;
281 |         }
282 |         std::cerr << "error: unknown command line option \""
283 |             << argv[i]
284 |             << "\".\n";
285 |         return 4;
286 |     }
287 | 
288 |     return 0;
289 | }
290 | 
291 | 
292 | // select the input mode once; returns 3 if a mode was already selected
293 | int show_unicode::set_mode(mode_t m)
294 | {
295 |     if(f_mode == mode_t::MODE_DEFAULT)
296 |     {
297 |         f_mode = m;
298 |         return 0;
299 |     }
300 |     std::cerr << "error: mode already set to: " << static_cast<int>(f_mode) << "\n";
301 |     return 3;
302 | }
303 | 
304 | 
305 | /** \brief Load the whole content of f_filename in f_input (binary mode).
306 |  *
307 |  * \return 0 on success, 1 if the file could not be opened or read.
308 |  */
309 | int show_unicode::read_file()
310 | {
311 |     std::ifstream in(f_filename, std::ios::in | std::ios::binary); // no text mode conversion of UTF-16/32 bytes
312 |     if(!in.is_open())
313 |     {
314 |         std::cerr << "error: could not open input file \"" << f_filename << "\".\n";
315 |         return 1;
316 |     }
317 |     in.seekg(0, std::ios::end);
318 |     std::streamoff const size(in.tellg()); // -1 when the size cannot be determined
319 |     in.seekg(0);
320 |     if(size >= 0)
321 |     {
322 |         f_input.resize(static_cast<std::size_t>(size));
323 |         in.read(reinterpret_cast<char *>(f_input.data()), size);
324 |     }
325 |     if(size < 0 || !in)
326 |     {
327 |         std::cerr << "error: could not read input file \"" << f_filename << "\".\n";
328 |         return 1;
329 |     }
330 |     return 0;
331 | }
332 | 
333 | 
334 | int show_unicode::verify_args()
335 | {
336 |     // nothing left to verify: a mode conflict is already reported by set_mode()
337 |     return 0;
338 | }
339 | 
340 | 
341 | /** \brief Print the input and its UTF-8, UTF-16, and UTF-32 encodings.
342 |  *
343 |  * All the numbers are written in hexadecimal. A UTF-8 continuation byte
344 |  * (0x80 to 0xBF) is preceded by a period instead of a space.
345 |  *
346 |  * \return Always 0.
347 |  */
348 | int show_unicode::process()
349 | {
350 |     // the input as is
351 |     std::string const utf8(reinterpret_cast<char const *>(f_input.data()), f_input.size());
352 |     std::cout << "Input: \"" << utf8 << "\".\n";
353 | 
354 |     // the UTF-8 bytes; std::hex and the '0' fill character also apply to
355 |     // the UTF-16 and UTF-32 output below
356 |     std::cout << "UTF-8:" << std::hex << std::setfill('0');
357 |     for(std::uint8_t const byte : f_input)
358 |     {
359 |         bool const continuation(byte >= 0x80 && byte <= 0xBF);
360 |         std::cout << (continuation ? "." : " ") << std::setw(2) << static_cast<std::uint32_t>(byte);
361 |     }
362 |     std::cout << '\n';
363 | 
364 |     // the UTF-16 words (convert first, print the label second)
365 |     std::u16string const words16(libutf8::to_u16string(utf8));
366 |     std::cout << "UTF-16:";
367 |     for(char16_t const word : words16)
368 |     {
369 |         std::cout << ' ' << std::setw(4) << static_cast<std::uint32_t>(word);
370 |     }
371 |     std::cout << '\n';
372 | 
373 |     // the UTF-32 words (convert first, print the label second)
374 |     std::u32string const words32(libutf8::to_u32string(utf8));
375 |     std::cout << "UTF-32:";
376 |     for(char32_t const word : words32)
377 |     {
378 |         std::cout << ' ' << std::setw(6) << static_cast<std::uint32_t>(word);
379 |     }
380 |     std::cout << '\n';
381 | 
382 |     return 0;
383 | }
384 | 
385 | 
386 | // print the command line synopsis and the list of options in std::cout
387 | void show_unicode::usage()
388 | {
389 |     std::cout << "Usage: show-unicode [-<opts>] [-s|--string] '<string>' | -C <value> | -f <filename>\n"
390 |               << "Where -<opts> is one or more of:\n"
391 |               << "  -h | --help                     print this help screen.\n"
392 |               << "  -C | --unicode <value>          use specified value.\n"
393 |               << "  -s | --string <string>          input string to convert (using -s or --string is optional).\n"
394 |               << "  -f | --input <filename>         input file of UTF-8 characters.\n"
395 |               << "  -S | --input-utf16 <filename>   input file of UTF-16 characters.\n"
396 |               << "  -F | --input-utf32 <filename>   input file of UTF-32 characters.\n"
397 |               << "       --valid-fffe-ffff          consider \\uFFFE and \\uFFFF as valid characters (default).\n"
398 |               << "  -W | --invalid-fffe-ffff        consider \\uFFFE and \\uFFFF as invalid characters.\n"
399 |               << "  -V | --version                  print out this tool's version.\n" << "\n";
400 | }
401 | 
402 | 
403 | } // no name namespace
404 | 
405 | 
406 | // run the three steps in order; the first non-zero result is the exit code
407 | int main(int argc, char * argv[])
408 | {
409 |     show_unicode show;
410 |     int r(show.parse_args(argc, argv));
411 |     if(r == 0)
412 |     {
413 |         r = show.verify_args();
414 |     }
415 |     if(r == 0)
416 |     {
417 |         r = show.process();
418 |     }
419 |     return r;
420 | }
421 | 
422 | 
423 | // vim: ts=4 sw=4 et
424 | 


--------------------------------------------------------------------------------
/tools/unicode_data_parser.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2000-2023  Made to Order Software Corp.  All Rights Reserved
  2 | //
  3 | // https://snapwebsites.org/project/libutf8
  4 | // contact@m2osw.com
  5 | //
  6 | // This program is free software; you can redistribute it and/or modify
  7 | // it under the terms of the GNU General Public License as published by
  8 | // the Free Software Foundation; either version 2 of the License, or
  9 | // (at your option) any later version.
 10 | //
 11 | // This program is distributed in the hope that it will be useful,
 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | // GNU General Public License for more details.
 15 | //
 16 | // You should have received a copy of the GNU General Public License along
 17 | // with this program; if not, write to the Free Software Foundation, Inc.,
 18 | // 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 19 | 
 20 | /** \file
 21 |  * \brief Tool used to convert the UnicodeData.txt file to C structures.
 22 |  *
 23 |  * This executable is used to convert the UnicodeData.txt to a set of
 24 |  * C structure which we can search very quickly to find Unicode characters.
 25 |  * This gives us all the necessary information to convert strings to NFKC
 26 |  * NFKD, and especially NFC and NFD.
 27 |  *
 28 |  * \sa http://www.unicode.org/reports/tr15/
 29 |  */
 30 | 
 31 | 
 32 | // libutf8
 33 | //
 34 | #include    <libutf8/unicode_data.h>
 35 | 
 36 | 
 37 | // libexcept
 38 | //
 39 | #include    <libexcept/file_inheritance.h>
 40 | 
 41 | 
 42 | // C++
 43 | //
 44 | #include    <fstream>
 45 | #include    <iostream>
 46 | #include    <string>
 47 | 
 48 | 
 49 | // C
 50 | //
 51 | #include    <stdlib.h>
 52 | #include    <unistd.h>
 53 | 
 54 | 
 55 | // last include
 56 | //
 57 | #include    <snapdev/poison.h>
 58 | 
 59 | 
 60 | 
 61 | namespace
 62 | {
 63 | 
 64 | 
 65 | 
 66 | 
 67 | 
 68 | 
 69 | } // no name namespace
 70 | 
 71 | 
 72 | 
 73 | // default location of the Unicode files when <in> is not specified
 74 | constexpr char const g_default_input_dir[] = "/usr/share/libutf8/unicode";
 75 | 
 76 | /** \brief Print the command line usage in std::cout. */
 77 | void usage()
 78 | {
 79 |     std::cout << "Usage: unicode-data-parser <in> <out>\n";
 80 |     std::cout << "Where:\n";
 81 |     std::cout << "  <in>     is a path to the unicode files such as UnicodeData.txt (default: \"" << g_default_input_dir << "\")\n";
 82 |     std::cout << "  <out>    is a path to the output unicode_data.ucdb file (default: a.ucdb)\n";
 83 | }
 84 | 
 85 | /** \brief Run the libutf8::ucd_parser against the input directory.
 86 |  *
 87 |  * \return 0 once generate() returned; the usage screen and command line
 88 |  *         errors exit the process with 1.
 89 |  */
 90 | int main(int argc, char * argv[])
 91 | {
 92 |     libexcept::verify_inherited_files();
 93 | 
 94 |     std::string input_dir;
 95 |     std::string output_filename;
 96 | 
 97 |     for(int i(1); i < argc; ++i)
 98 |     {
 99 |         std::string const arg(argv[i]);
100 |         if(arg == "-h" || arg == "--help")
101 |         {
102 |             usage();
103 |             exit(1);
104 |         }
105 |         if(arg[0] == '-')
106 |         {
107 |             // print the whole option, not only its first letter
108 |             std::cerr << "error: unknown command line option " << arg << "\n";
109 |             exit(1);
110 |         }
111 |         if(input_dir.empty())
112 |         {
113 |             input_dir = arg;
114 |             if(input_dir.empty())
115 |             {
116 |                 std::cerr << "error: input directory name can't be empty, try \".\" for current folder.\n";
117 |                 exit(1);
118 |             }
119 |         }
120 |         else if(output_filename.empty())
121 |         {
122 |             output_filename = arg;
123 |         }
124 |         else
125 |         {
126 |             std::cerr << "error: too many filenames on the command line.\n";
127 |             exit(1);
128 |         }
129 |     }
130 | 
131 |     if(input_dir.empty())
132 |     {
133 |         input_dir = g_default_input_dir;
134 |     }
135 | 
136 |     if(output_filename.empty())
137 |     {
138 |         output_filename = "a.ucdb";
139 |     }
140 | 
141 |     libutf8::ucd_parser p(input_dir, output_filename);
142 |     p.generate();
143 | 
144 |     return 0;
145 | }
146 | 
147 | 
148 | // vim: ts=4 sw=4 et
149 | 


--------------------------------------------------------------------------------