├── README.md
└── utf8_console
    ├── README.md
    ├── demo.cpp
    ├── mk_wcwidth.c
    ├── utf8.h
    ├── utf8
        ├── checked.h
        ├── core.h
        ├── cpp11.h
        ├── cpp17.h
        └── unchecked.h
    ├── utfwidth.cpp
    └── utfwidth.h


/README.md:
--------------------------------------------------------------------------------
1 | # NJU_AdvancedProgramming_SP22


--------------------------------------------------------------------------------
/utf8_console/README.md:
--------------------------------------------------------------------------------
 1 | # 关于带中文的 UTF-8 字符串的输出
 2 | 
 3 | 中英文混合的 UTF-8 字符串在输出到控制台时，如果想利用 `std::setw` 等进行对齐，会出现错位的现象。这是因为 UTF-8 下中文通常占用 3 个字节，但中文在控制台下只占据 2 个字符的位置。这种情况下系统直接把字节数当作宽度进行格式化，就会出现错位的现象。
 4 | 
 5 | 如果用 GB-18030(GBK) 编码，这种情况不会出现，这源于一个巧合：在 GBK 编码中中文通常占用 2 字节，恰好和控制台下中文占 2 字符宽度的事实对应。然而，依赖这样的巧合是不健壮的，如果包含半角日语字符、西里尔字符、希腊文字符等，这样的巧合就无法成立。
 6 | 
 7 | 因此，我们需要通用的解决方案，下面先给出使用说明，供不想看细节的同学使用。
 8 | 
 9 | ## 使用
10 | 
11 | 将 `demo.cpp` 外的所有文件/目录复制到工程中（如果您的项目有专门的文件夹存放第三方 Header-Only 库，可以把 `utf8` 目录和 `utf8.h` 移动进去）。将 `mk_wcwidth.c`、`utfwidth.cpp` 两个文件加入编译目标。
12 | 
13 | 为了使用这些代码，你首先需要满足这些先决条件：
14 | 
15 | - 使用 C++17 编译
16 | - 确保需要输出的字符串均是 UTF-8 类型，用窄字符类型表示（`char`，`std::string`，而非 `wchar_t`，`std::wstring`）
17 | - 可以获取将要输出的字符串的 `std::string_view` 表示（对于 `std::string`，转换自动完成；对于 C 风格字符串，请自行在网络查询有关资料）
18 | 
19 | 对于需要用到的地方，文件头需要 `#include "utfwidth.h"`，然后在所有使用 `std::setw` 的地方，用 `setw_u8` 代替，且 `setw_u8` 需要传入第二个参数，为欲输出的字符串。
20 | 
21 | **实际上，你应该参照 `demo.cpp` 以快速上手。**
22 | 
23 | 注意：Windows 下，正确地处理 UTF-8 字符串需要：
24 | 
25 | - 源代码使用 UTF-8 编译：MSVC 需要打开 `/utf-8` 开关
26 | - 源代码使用 UTF-8 编写：CLion、Visual Studio Code 等工具可以在右下角设置；Visual Studio 需要通过 `.editorconfig` 设置。
27 | - 将控制台代码页改为 65001：可使用 `system("chcp 65001");`，更优雅的做法是用 `SetConsoleOutputCP` Windows API。参照 `demo.cpp` 的做法。这一操作仅需要在程序运行最开始做一次。
28 | 
29 | ## Credits
30 | 
31 | 这组代码用到了两组第三方代码：
32 | 
33 | - [剑桥大学的 Markus Kuhn](https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c) 编写的 `mk_wcwidth.c` 提供了跨平台的 `wcwidth` 函数
34 | - [Nemanja Trifunovic 等编写的 UTF8-CPP 库](https://github.com/nemtrif/utfcpp)提供了方便的 UTF-8 到 UTF-16/32 的方法
35 | 
36 | ## 延伸阅读
37 | 
38 | https://stackoverflow.com/questions/29188948/cout-setw-doesnt-align-correctly-with-%C3%A5%C3%A4%C3%B6
39 | 
40 | https://man7.org/linux/man-pages/man3/wcswidth.3.html
41 | 
42 | https://stackoverflow.com/questions/15114303/determine-whether-a-unicode-character-is-fullwidth-or-halfwidth-in-c
43 | 
44 | http://www.unicode.org/reports/tr11/


--------------------------------------------------------------------------------
/utf8_console/demo.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <string>
 3 | #include <string_view>
 4 | #include <iomanip>
 5 | 
 6 | #include "utfwidth.h"
 7 | 
 8 | // 直接定义，避免引入 Windows.h
 9 | #ifdef WIN32
10 | extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned wCodePageID);
11 | #endif
12 | 
13 | int main()
14 | {
15 | // 在 Windows 平台下
16 | #ifdef WIN32
17 |     SetConsoleOutputCP(65001);
18 | #endif
19 | 
20 |     std::string str("测试abcｶｷｸｹｺ"); // 注意日语半角假名应该仅占 1 字符宽度
21 |     std::string str2("测试abc再测试");
22 | 
23 |     // 注意 setw 被换成了 setw_u8
24 |     std::cout << std::right << setw_u8(20, str) << str << std::endl;
25 |     std::cout << std::right << setw_u8(20, str2) << str2 << std::endl;
26 | }


--------------------------------------------------------------------------------
/utf8_console/mk_wcwidth.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This is an implementation of wcwidth() and wcswidth() (defined in
  3 |  * IEEE Std 1003.1-2001) for Unicode.
  4 |  *
  5 |  * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
  6 |  * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
  7 |  *
  8 |  * In fixed-width output devices, Latin characters all occupy a single
  9 |  * "cell" position of equal width, whereas ideographic CJK characters
 10 |  * occupy two such cells. Interoperability between terminal-line
 11 |  * applications and (teletype-style) character terminals using the
 12 |  * UTF-8 encoding requires agreement on which character should advance
 13 |  * the cursor by how many cell positions. No established formal
 14 |  * standards exist at present on which Unicode character shall occupy
 15 |  * how many cell positions on character terminals. These routines are
 16 |  * a first attempt of defining such behavior based on simple rules
 17 |  * applied to data provided by the Unicode Consortium.
 18 |  *
 19 |  * For some graphical characters, the Unicode standard explicitly
 20 |  * defines a character-cell width via the definition of the East Asian
 21 |  * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
 22 |  * In all these cases, there is no ambiguity about which width a
 23 |  * terminal shall use. For characters in the East Asian Ambiguous (A)
 24 |  * class, the width choice depends purely on a preference of backward
 25 |  * compatibility with either historic CJK or Western practice.
 26 |  * Choosing single-width for these characters is easy to justify as
 27 |  * the appropriate long-term solution, as the CJK practice of
 28 |  * displaying these characters as double-width comes from historic
 29 |  * implementation simplicity (8-bit encoded characters were displayed
 30 |  * single-width and 16-bit ones double-width, even for Greek,
 31 |  * Cyrillic, etc.) and not any typographic considerations.
 32 |  *
 33 |  * Much less clear is the choice of width for the Not East Asian
 34 |  * (Neutral) class. Existing practice does not dictate a width for any
 35 |  * of these characters. It would nevertheless make sense
 36 |  * typographically to allocate two character cells to characters such
 37 |  * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
 38 |  * represented adequately with a single-width glyph. The following
 39 |  * routines at present merely assign a single-cell width to all
 40 |  * neutral characters, in the interest of simplicity. This is not
 41 |  * entirely satisfactory and should be reconsidered before
 42 |  * establishing a formal standard in this area. At the moment, the
 43 |  * decision which Not East Asian (Neutral) characters should be
 44 |  * represented by double-width glyphs cannot yet be answered by
 45 |  * applying a simple rule from the Unicode database content. Setting
 46 |  * up a proper standard for the behavior of UTF-8 character terminals
 47 |  * will require a careful analysis not only of each Unicode character,
 48 |  * but also of each presentation form, something the author of these
 49 |  * routines has avoided to do so far.
 50 |  *
 51 |  * http://www.unicode.org/unicode/reports/tr11/
 52 |  *
 53 |  * Markus Kuhn -- 2007-05-26 (Unicode 5.0)
 54 |  *
 55 |  * Permission to use, copy, modify, and distribute this software
 56 |  * for any purpose and without fee is hereby granted. The author
 57 |  * disclaims all warranties with regard to this software.
 58 |  *
 59 |  * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 60 |  */
 61 | 
 62 | #include <wchar.h>
 63 | 
 64 | struct interval {
 65 |   int first;
 66 |   int last;
 67 | };
 68 | 
 69 | /* auxiliary function for binary search in interval table */
 70 | static int bisearch(wchar_t ucs, const struct interval *table, int max) {
 71 |   int min = 0;
 72 |   int mid;
 73 | 
 74 |   if (ucs < table[0].first || ucs > table[max].last)
 75 |     return 0;
 76 |   while (max >= min) {
 77 |     mid = (min + max) / 2;
 78 |     if (ucs > table[mid].last)
 79 |       min = mid + 1;
 80 |     else if (ucs < table[mid].first)
 81 |       max = mid - 1;
 82 |     else
 83 |       return 1;
 84 |   }
 85 | 
 86 |   return 0;
 87 | }
 88 | 
 89 | 
 90 | /* The following two functions define the column width of an ISO 10646
 91 |  * character as follows:
 92 |  *
 93 |  *    - The null character (U+0000) has a column width of 0.
 94 |  *
 95 |  *    - Other C0/C1 control characters and DEL will lead to a return
 96 |  *      value of -1.
 97 |  *
 98 |  *    - Non-spacing and enclosing combining characters (general
 99 |  *      category code Mn or Me in the Unicode database) have a
100 |  *      column width of 0.
101 |  *
102 |  *    - SOFT HYPHEN (U+00AD) has a column width of 1.
103 |  *
104 |  *    - Other format characters (general category code Cf in the Unicode
105 |  *      database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
106 |  *
107 |  *    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
108 |  *      have a column width of 0.
109 |  *
110 |  *    - Spacing characters in the East Asian Wide (W) or East Asian
111 |  *      Full-width (F) category as defined in Unicode Technical
112 |  *      Report #11 have a column width of 2.
113 |  *
114 |  *    - All remaining characters (including all printable
115 |  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
116 |  *      etc.) have a column width of 1.
117 |  *
118 |  * This implementation assumes that wchar_t characters are encoded
119 |  * in ISO 10646.
120 |  */
121 | 
122 | int mk_wcwidth(wchar_t ucs)
123 | {
124 |   /* sorted list of non-overlapping intervals of non-spacing characters */
125 |   /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
126 |   static const struct interval combining[] = {
127 |     { 0x0300, 0x036F }, { 0x0483, 0x0486 }, { 0x0488, 0x0489 },
128 |     { 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 },
129 |     { 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0600, 0x0603 },
130 |     { 0x0610, 0x0615 }, { 0x064B, 0x065E }, { 0x0670, 0x0670 },
131 |     { 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED },
132 |     { 0x070F, 0x070F }, { 0x0711, 0x0711 }, { 0x0730, 0x074A },
133 |     { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x0901, 0x0902 },
134 |     { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D },
135 |     { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 },
136 |     { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD },
137 |     { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C },
138 |     { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D },
139 |     { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC },
140 |     { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD },
141 |     { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C },
142 |     { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D },
143 |     { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 },
144 |     { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 },
145 |     { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBC, 0x0CBC },
146 |     { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
147 |     { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D },
148 |     { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 },
149 |     { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E },
150 |     { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC },
151 |     { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 },
152 |     { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E },
153 |     { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 },
154 |     { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 },
155 |     { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 },
156 |     { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x135F, 0x135F },
157 |     { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
158 |     { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
159 |     { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
160 |     { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
161 |     { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
162 |     { 0x1A17, 0x1A18 }, { 0x1B00, 0x1B03 }, { 0x1B34, 0x1B34 },
163 |     { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, { 0x1B42, 0x1B42 },
164 |     { 0x1B6B, 0x1B73 }, { 0x1DC0, 0x1DCA }, { 0x1DFE, 0x1DFF },
165 |     { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x2063 },
166 |     { 0x206A, 0x206F }, { 0x20D0, 0x20EF }, { 0x302A, 0x302F },
167 |     { 0x3099, 0x309A }, { 0xA806, 0xA806 }, { 0xA80B, 0xA80B },
168 |     { 0xA825, 0xA826 }, { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F },
169 |     { 0xFE20, 0xFE23 }, { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB },
170 |     { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
171 |     { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x1D167, 0x1D169 },
172 |     { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
173 |     { 0x1D242, 0x1D244 }, { 0xE0001, 0xE0001 }, { 0xE0020, 0xE007F },
174 |     { 0xE0100, 0xE01EF }
175 |   };
176 | 
177 |   /* test for 8-bit control characters */
178 |   if (ucs == 0)
179 |     return 0;
180 |   if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
181 |     return -1;
182 | 
183 |   /* binary search in table of non-spacing characters */
184 |   if (bisearch(ucs, combining,
185 | 	       sizeof(combining) / sizeof(struct interval) - 1))
186 |     return 0;
187 | 
188 |   /* if we arrive here, ucs is not a combining or C0/C1 control character */
189 | 
190 |   return 1 + 
191 |     (ucs >= 0x1100 &&
192 |      (ucs <= 0x115f ||                    /* Hangul Jamo init. consonants */
193 |       ucs == 0x2329 || ucs == 0x232a ||
194 |       (ucs >= 0x2e80 && ucs <= 0xa4cf &&
195 |        ucs != 0x303f) ||                  /* CJK ... Yi */
196 |       (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
197 |       (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */
198 |       (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */
199 |       (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
200 |       (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */
201 |       (ucs >= 0xffe0 && ucs <= 0xffe6) ||
202 |       (ucs >= 0x20000 && ucs <= 0x2fffd) ||
203 |       (ucs >= 0x30000 && ucs <= 0x3fffd)));
204 | }
205 | 
206 | 
/*
 * Column width of a wide-character string.
 *
 * Adds up mk_wcwidth() for the characters of pwcs, stopping at the first
 * null wide character or after n characters, whichever comes first.
 *
 * Returns the total width, or -1 as soon as mk_wcwidth() reports a
 * negative width for any examined character.
 */
int mk_wcswidth(const wchar_t *pwcs, size_t n)
{
  int w, width = 0;

  /* Check the remaining budget BEFORE dereferencing: the buffer is only
   * guaranteed to hold n characters and need not be null-terminated, so
   * testing *pwcs first would read one element past the end (and would
   * read pwcs[0] even when n == 0). */
  for (; n > 0 && *pwcs; n--, pwcs++) {
    w = mk_wcwidth(*pwcs);
    if (w < 0)
      return -1;
    width += w;
  }

  return width;
}
219 | 
220 | 
221 | /*
222 |  * The following functions are the same as mk_wcwidth() and
223 |  * mk_wcswidth(), except that spacing characters in the East Asian
224 |  * Ambiguous (A) category as defined in Unicode Technical Report #11
225 |  * have a column width of 2. This variant might be useful for users of
226 |  * CJK legacy encodings who want to migrate to UCS without changing
227 |  * the traditional terminal character-width behaviour. It is not
228 |  * otherwise recommended for general use.
229 |  */
230 | int mk_wcwidth_cjk(wchar_t ucs)
231 | {
232 |   /* sorted list of non-overlapping intervals of East Asian Ambiguous
233 |    * characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */
234 |   static const struct interval ambiguous[] = {
235 |     { 0x00A1, 0x00A1 }, { 0x00A4, 0x00A4 }, { 0x00A7, 0x00A8 },
236 |     { 0x00AA, 0x00AA }, { 0x00AE, 0x00AE }, { 0x00B0, 0x00B4 },
237 |     { 0x00B6, 0x00BA }, { 0x00BC, 0x00BF }, { 0x00C6, 0x00C6 },
238 |     { 0x00D0, 0x00D0 }, { 0x00D7, 0x00D8 }, { 0x00DE, 0x00E1 },
239 |     { 0x00E6, 0x00E6 }, { 0x00E8, 0x00EA }, { 0x00EC, 0x00ED },
240 |     { 0x00F0, 0x00F0 }, { 0x00F2, 0x00F3 }, { 0x00F7, 0x00FA },
241 |     { 0x00FC, 0x00FC }, { 0x00FE, 0x00FE }, { 0x0101, 0x0101 },
242 |     { 0x0111, 0x0111 }, { 0x0113, 0x0113 }, { 0x011B, 0x011B },
243 |     { 0x0126, 0x0127 }, { 0x012B, 0x012B }, { 0x0131, 0x0133 },
244 |     { 0x0138, 0x0138 }, { 0x013F, 0x0142 }, { 0x0144, 0x0144 },
245 |     { 0x0148, 0x014B }, { 0x014D, 0x014D }, { 0x0152, 0x0153 },
246 |     { 0x0166, 0x0167 }, { 0x016B, 0x016B }, { 0x01CE, 0x01CE },
247 |     { 0x01D0, 0x01D0 }, { 0x01D2, 0x01D2 }, { 0x01D4, 0x01D4 },
248 |     { 0x01D6, 0x01D6 }, { 0x01D8, 0x01D8 }, { 0x01DA, 0x01DA },
249 |     { 0x01DC, 0x01DC }, { 0x0251, 0x0251 }, { 0x0261, 0x0261 },
250 |     { 0x02C4, 0x02C4 }, { 0x02C7, 0x02C7 }, { 0x02C9, 0x02CB },
251 |     { 0x02CD, 0x02CD }, { 0x02D0, 0x02D0 }, { 0x02D8, 0x02DB },
252 |     { 0x02DD, 0x02DD }, { 0x02DF, 0x02DF }, { 0x0391, 0x03A1 },
253 |     { 0x03A3, 0x03A9 }, { 0x03B1, 0x03C1 }, { 0x03C3, 0x03C9 },
254 |     { 0x0401, 0x0401 }, { 0x0410, 0x044F }, { 0x0451, 0x0451 },
255 |     { 0x2010, 0x2010 }, { 0x2013, 0x2016 }, { 0x2018, 0x2019 },
256 |     { 0x201C, 0x201D }, { 0x2020, 0x2022 }, { 0x2024, 0x2027 },
257 |     { 0x2030, 0x2030 }, { 0x2032, 0x2033 }, { 0x2035, 0x2035 },
258 |     { 0x203B, 0x203B }, { 0x203E, 0x203E }, { 0x2074, 0x2074 },
259 |     { 0x207F, 0x207F }, { 0x2081, 0x2084 }, { 0x20AC, 0x20AC },
260 |     { 0x2103, 0x2103 }, { 0x2105, 0x2105 }, { 0x2109, 0x2109 },
261 |     { 0x2113, 0x2113 }, { 0x2116, 0x2116 }, { 0x2121, 0x2122 },
262 |     { 0x2126, 0x2126 }, { 0x212B, 0x212B }, { 0x2153, 0x2154 },
263 |     { 0x215B, 0x215E }, { 0x2160, 0x216B }, { 0x2170, 0x2179 },
264 |     { 0x2190, 0x2199 }, { 0x21B8, 0x21B9 }, { 0x21D2, 0x21D2 },
265 |     { 0x21D4, 0x21D4 }, { 0x21E7, 0x21E7 }, { 0x2200, 0x2200 },
266 |     { 0x2202, 0x2203 }, { 0x2207, 0x2208 }, { 0x220B, 0x220B },
267 |     { 0x220F, 0x220F }, { 0x2211, 0x2211 }, { 0x2215, 0x2215 },
268 |     { 0x221A, 0x221A }, { 0x221D, 0x2220 }, { 0x2223, 0x2223 },
269 |     { 0x2225, 0x2225 }, { 0x2227, 0x222C }, { 0x222E, 0x222E },
270 |     { 0x2234, 0x2237 }, { 0x223C, 0x223D }, { 0x2248, 0x2248 },
271 |     { 0x224C, 0x224C }, { 0x2252, 0x2252 }, { 0x2260, 0x2261 },
272 |     { 0x2264, 0x2267 }, { 0x226A, 0x226B }, { 0x226E, 0x226F },
273 |     { 0x2282, 0x2283 }, { 0x2286, 0x2287 }, { 0x2295, 0x2295 },
274 |     { 0x2299, 0x2299 }, { 0x22A5, 0x22A5 }, { 0x22BF, 0x22BF },
275 |     { 0x2312, 0x2312 }, { 0x2460, 0x24E9 }, { 0x24EB, 0x254B },
276 |     { 0x2550, 0x2573 }, { 0x2580, 0x258F }, { 0x2592, 0x2595 },
277 |     { 0x25A0, 0x25A1 }, { 0x25A3, 0x25A9 }, { 0x25B2, 0x25B3 },
278 |     { 0x25B6, 0x25B7 }, { 0x25BC, 0x25BD }, { 0x25C0, 0x25C1 },
279 |     { 0x25C6, 0x25C8 }, { 0x25CB, 0x25CB }, { 0x25CE, 0x25D1 },
280 |     { 0x25E2, 0x25E5 }, { 0x25EF, 0x25EF }, { 0x2605, 0x2606 },
281 |     { 0x2609, 0x2609 }, { 0x260E, 0x260F }, { 0x2614, 0x2615 },
282 |     { 0x261C, 0x261C }, { 0x261E, 0x261E }, { 0x2640, 0x2640 },
283 |     { 0x2642, 0x2642 }, { 0x2660, 0x2661 }, { 0x2663, 0x2665 },
284 |     { 0x2667, 0x266A }, { 0x266C, 0x266D }, { 0x266F, 0x266F },
285 |     { 0x273D, 0x273D }, { 0x2776, 0x277F }, { 0xE000, 0xF8FF },
286 |     { 0xFFFD, 0xFFFD }, { 0xF0000, 0xFFFFD }, { 0x100000, 0x10FFFD }
287 |   };
288 | 
289 |   /* binary search in table of non-spacing characters */
290 |   if (bisearch(ucs, ambiguous,
291 | 	       sizeof(ambiguous) / sizeof(struct interval) - 1))
292 |     return 2;
293 | 
294 |   return mk_wcwidth(ucs);
295 | }
296 | 
297 | 
/*
 * Column width of a wide-character string, CJK variant.
 *
 * Adds up mk_wcwidth_cjk() for the characters of pwcs, stopping at the
 * first null wide character or after n characters, whichever comes first.
 *
 * Returns the total width, or -1 as soon as mk_wcwidth_cjk() reports a
 * negative width for any examined character.
 */
int mk_wcswidth_cjk(const wchar_t *pwcs, size_t n)
{
  int w, width = 0;

  /* Check the remaining budget BEFORE dereferencing: the buffer is only
   * guaranteed to hold n characters and need not be null-terminated, so
   * testing *pwcs first would read one element past the end (and would
   * read pwcs[0] even when n == 0). */
  for (; n > 0 && *pwcs; n--, pwcs++) {
    w = mk_wcwidth_cjk(*pwcs);
    if (w < 0)
      return -1;
    width += w;
  }

  return width;
}


--------------------------------------------------------------------------------
/utf8_console/utf8.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2006 Nemanja Trifunovic
 2 | 
 3 | /*
 4 | Permission is hereby granted, free of charge, to any person or organization
 5 | obtaining a copy of the software and accompanying documentation covered by
 6 | this license (the "Software") to use, reproduce, display, distribute,
 7 | execute, and transmit the Software, and to prepare derivative works of the
 8 | Software, and to permit third-parties to whom the Software is furnished to
 9 | do so, all subject to the following:
10 | 
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 | 
27 | 
28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 | 
31 | #include "utf8/checked.h"
32 | #include "utf8/unchecked.h"
33 | 
34 | #endif // header guard
35 | 


--------------------------------------------------------------------------------
/utf8_console/utf8/checked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006-2016 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | #include <stdexcept>
 33 | 
 34 | namespace utf8
 35 | {
 36 |     // Base for the exceptions that may be thrown from the library
 37 |     class exception : public ::std::exception {
 38 |     };
 39 | 
 40 |     // Exceptions that may be thrown from the library functions.
 41 |     class invalid_code_point : public exception {
 42 |         uint32_t cp;
 43 |     public:
 44 |         invalid_code_point(uint32_t codepoint) : cp(codepoint) {}
 45 |         virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; }
 46 |         uint32_t code_point() const {return cp;}
 47 |     };
 48 | 
 49 |     class invalid_utf8 : public exception {
 50 |         uint8_t u8;
 51 |     public:
 52 |         invalid_utf8 (uint8_t u) : u8(u) {}
 53 |         virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; }
 54 |         uint8_t utf8_octet() const {return u8;}
 55 |     };
 56 | 
 57 |     class invalid_utf16 : public exception {
 58 |         uint16_t u16;
 59 |     public:
 60 |         invalid_utf16 (uint16_t u) : u16(u) {}
 61 |         virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; }
 62 |         uint16_t utf16_word() const {return u16;}
 63 |     };
 64 | 
 65 |     class not_enough_room : public exception {
 66 |     public:
 67 |         virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
 68 |     };
 69 | 
 70 |     /// The library API - functions intended to be called by the users
 71 | 
 72 |     template <typename octet_iterator>
 73 |     octet_iterator append(uint32_t cp, octet_iterator result)
 74 |     {
 75 |         if (!utf8::internal::is_code_point_valid(cp))
 76 |             throw invalid_code_point(cp);
 77 | 
 78 |         if (cp < 0x80)                        // one octet
 79 |             *(result++) = static_cast<uint8_t>(cp);
 80 |         else if (cp < 0x800) {                // two octets
 81 |             *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
 82 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
 83 |         }
 84 |         else if (cp < 0x10000) {              // three octets
 85 |             *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
 86 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
 87 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
 88 |         }
 89 |         else {                                // four octets
 90 |             *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
 91 |             *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
 92 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
 93 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
 94 |         }
 95 |         return result;
 96 |     }
 97 | 
 98 |     template <typename octet_iterator, typename output_iterator>
 99 |     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
100 |     {
101 |         while (start != end) {
102 |             octet_iterator sequence_start = start;
103 |             internal::utf_error err_code = utf8::internal::validate_next(start, end);
104 |             switch (err_code) {
105 |                 case internal::UTF8_OK :
106 |                     for (octet_iterator it = sequence_start; it != start; ++it)
107 |                         *out++ = *it;
108 |                     break;
109 |                 case internal::NOT_ENOUGH_ROOM:
110 |                     out = utf8::append (replacement, out);
111 |                     start = end;
112 |                     break;
113 |                 case internal::INVALID_LEAD:
114 |                     out = utf8::append (replacement, out);
115 |                     ++start;
116 |                     break;
117 |                 case internal::INCOMPLETE_SEQUENCE:
118 |                 case internal::OVERLONG_SEQUENCE:
119 |                 case internal::INVALID_CODE_POINT:
120 |                     out = utf8::append (replacement, out);
121 |                     ++start;
122 |                     // just one replacement mark for the sequence
123 |                     while (start != end && utf8::internal::is_trail(*start))
124 |                         ++start;
125 |                     break;
126 |             }
127 |         }
128 |         return out;
129 |     }
130 | 
131 |     template <typename octet_iterator, typename output_iterator>
132 |     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
133 |     {
134 |         static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
135 |         return utf8::replace_invalid(start, end, out, replacement_marker);
136 |     }
137 | 
138 |     template <typename octet_iterator>
139 |     uint32_t next(octet_iterator& it, octet_iterator end)
140 |     {
141 |         uint32_t cp = 0;
142 |         internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
143 |         switch (err_code) {
144 |             case internal::UTF8_OK :
145 |                 break;
146 |             case internal::NOT_ENOUGH_ROOM :
147 |                 throw not_enough_room();
148 |             case internal::INVALID_LEAD :
149 |             case internal::INCOMPLETE_SEQUENCE :
150 |             case internal::OVERLONG_SEQUENCE :
151 |                 throw invalid_utf8(*it);
152 |             case internal::INVALID_CODE_POINT :
153 |                 throw invalid_code_point(cp);
154 |         }
155 |         return cp;
156 |     }
157 | 
158 |     template <typename octet_iterator>
159 |     uint32_t peek_next(octet_iterator it, octet_iterator end)
160 |     {
161 |         return utf8::next(it, end);
162 |     }
163 | 
164 |     template <typename octet_iterator>
165 |     uint32_t prior(octet_iterator& it, octet_iterator start)
166 |     {
167 |         // can't do much if it == start
168 |         if (it == start)
169 |             throw not_enough_room();
170 | 
171 |         octet_iterator end = it;
172 |         // Go back until we hit either a lead octet or start
173 |         while (utf8::internal::is_trail(*(--it)))
174 |             if (it == start)
175 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
176 |         return utf8::peek_next(it, end);
177 |     }
178 | 
179 |     template <typename octet_iterator, typename distance_type>
180 |     void advance (octet_iterator& it, distance_type n, octet_iterator end)
181 |     {
182 |         const distance_type zero(0);
183 |         if (n < zero) {
184 |             // backward
185 |             for (distance_type i = n; i < zero; ++i)
186 |                 utf8::prior(it, end);
187 |         } else {
188 |             // forward
189 |             for (distance_type i = zero; i < n; ++i)
190 |                 utf8::next(it, end);
191 |         }
192 |     }
193 | 
194 |     template <typename octet_iterator>
195 |     typename std::iterator_traits<octet_iterator>::difference_type
196 |     distance (octet_iterator first, octet_iterator last)
197 |     {
198 |         typename std::iterator_traits<octet_iterator>::difference_type dist;
199 |         for (dist = 0; first < last; ++dist)
200 |             utf8::next(first, last);
201 |         return dist;
202 |     }
203 | 
204 |     // Converts the UTF-16 range [start, end) to UTF-8 written through `result`
205 |     // and returns the advanced output iterator. Throws invalid_utf16 when a lead
206 |     // surrogate is the last unit or is not followed by a trail surrogate, and
207 |     // when a trail surrogate appears without a lead before it.
208 |     template <typename u16bit_iterator, typename octet_iterator>
209 |     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
210 |     {
211 |         while (start != end) {
212 |             uint32_t unit = utf8::internal::mask16(*start++);
213 |             if (utf8::internal::is_lead_surrogate(unit)) {
214 |                 // A lead surrogate must be completed by the next code unit
215 |                 if (start == end)
216 |                     throw invalid_utf16(static_cast<uint16_t>(unit));
217 |                 const uint32_t low = utf8::internal::mask16(*start++);
218 |                 if (!utf8::internal::is_trail_surrogate(low))
219 |                     throw invalid_utf16(static_cast<uint16_t>(low));
220 |                 // Combine the pair into a single code point
221 |                 unit = (unit << 10) + low + internal::SURROGATE_OFFSET;
222 |             }
223 |             else if (utf8::internal::is_trail_surrogate(unit)) {
224 |                 throw invalid_utf16(static_cast<uint16_t>(unit));
225 |             }
226 |             result = utf8::append(unit, result);
227 |         }
228 |         return result;
229 |     }
230 | 
231 |     // Converts UTF-8 in [start, end) to UTF-16 code units written to `result`;
232 |     // code points above 0xffff are written as a lead/trail surrogate pair.
233 |     template <typename u16bit_iterator, typename octet_iterator>
234 |     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
235 |     {
236 |         while (start < end) {
237 |             const uint32_t decoded = utf8::next(start, end);
238 |             if (decoded > 0xffff) // needs a pair: the lead surrogate goes first
239 |                 *result++ = static_cast<uint16_t>((decoded >> 10) + internal::LEAD_OFFSET);
240 |             *result++ = static_cast<uint16_t>(decoded > 0xffff
241 |                 ? (decoded & 0x3ff) + internal::TRAIL_SURROGATE_MIN : decoded);
242 |         }
243 |         return result;
244 |     }
245 | 
246 |     // Encodes each 32-bit code point of [start, end) as UTF-8 through `result`.
247 |     template <typename octet_iterator, typename u32bit_iterator>
248 |     octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
249 |     {
250 |         for (; start != end; )
251 |             result = utf8::append(*start++, result);
252 |         return result;
253 |     }
254 | 
255 |     // Decodes UTF-8 in [start, end) into 32-bit code points written to `result`.
256 |     template <typename octet_iterator, typename u32bit_iterator>
257 |     u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
258 |     {
259 |         for (; start < end; )
260 |             *result++ = utf8::next(start, end);
261 |         return result;
262 |     }
263 | 
264 |     // Bidirectional iterator adaptor that walks an octet range one code point
265 |     // at a time, decoding with utf8::next / utf8::prior inside fixed bounds.
266 |     template <typename octet_iterator>
267 |     class iterator {
268 |       octet_iterator it;           // current position in the octet range
269 |       octet_iterator range_start;  // lower bound handed to utf8::prior
270 |       octet_iterator range_end;    // upper bound handed to utf8::next
271 |       public:
272 |       typedef uint32_t value_type;
273 |       typedef uint32_t* pointer;
274 |       typedef uint32_t& reference;
275 |       typedef std::ptrdiff_t difference_type;
276 |       typedef std::bidirectional_iterator_tag iterator_category;
277 |       iterator () {}
278 |       // Throws std::out_of_range unless rangestart <= octet_it <= rangeend.
279 |       explicit iterator (const octet_iterator& octet_it,
280 |                          const octet_iterator& rangestart,
281 |                          const octet_iterator& rangeend) :
282 |                it(octet_it), range_start(rangestart), range_end(rangeend)
283 |       {
284 |           const bool before_start = it < range_start;
285 |           if (before_start || it > range_end)
286 |               throw std::out_of_range("Invalid utf-8 iterator position");
287 |       }
288 |       // Compiler-generated copy construction, assignment and destruction suffice.
289 |       octet_iterator base () const { return it; }
290 |       // Decodes the code point at the current position without moving.
291 |       uint32_t operator * () const
292 |       {
293 |           octet_iterator cursor = it;
294 |           return utf8::next(cursor, range_end);
295 |       }
296 |       // Throws std::logic_error when the two iterators cover different ranges.
297 |       bool operator == (const iterator& rhs) const
298 |       {
299 |           if (range_start != rhs.range_start || range_end != rhs.range_end)
300 |               throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
301 |           return it == rhs.it;
302 |       }
303 |       bool operator != (const iterator& rhs) const { return !(*this == rhs); }
304 |       // Pre-increment: step forward by one code point.
305 |       iterator& operator ++ ()
306 |       {
307 |           utf8::next(it, range_end);
308 |           return *this;
309 |       }
310 |       // Post-increment: step forward, return the position held before the step.
311 |       iterator operator ++ (int)
312 |       {
313 |           iterator previous(*this);
314 |           utf8::next(it, range_end);
315 |           return previous;
316 |       }
317 |       iterator& operator -- () { utf8::prior(it, range_start); return *this; }
318 |       iterator operator -- (int)
319 |       {
320 |           iterator previous(*this);
321 |           utf8::prior(it, range_start);
322 |           return previous;
323 |       }
324 |     }; // class iterator
325 | 
326 | } // namespace utf8
327 | 
328 | #if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
329 | #include "cpp17.h"
330 | #elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
331 | #include "cpp11.h"
332 | #endif // C++ 11 or later
333 | 
334 | #endif //header guard
335 | 
336 | 


--------------------------------------------------------------------------------
/utf8_console/utf8/core.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include <iterator>
 32 | 
 33 | // Determine the C++ standard version the library is being compiled as.
 34 | // If the user defines UTF_CPP_CPLUSPLUS, that value is used.
 35 | // Otherwise fall back to the predefined macro __cplusplus, which is not reliable.
 36 | 
 37 | #if !defined UTF_CPP_CPLUSPLUS
 38 |     #define UTF_CPP_CPLUSPLUS __cplusplus
 39 | #endif
 40 | // UTF_CPP_OVERRIDE / UTF_CPP_NOEXCEPT expand to the C++11 keywords when available, else to C++98/03 forms.
 41 | #if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
 42 |     #define UTF_CPP_OVERRIDE override
 43 |     #define UTF_CPP_NOEXCEPT noexcept
 44 | #else // C++ 98/03
 45 |     #define UTF_CPP_OVERRIDE
 46 |     #define UTF_CPP_NOEXCEPT throw()
 47 | #endif // C++ 11 or later
 48 | 
 49 | 
 50 | namespace utf8
 51 | {
 52 |     // Fixed-width unsigned integer aliases used throughout the library.
 53 |     // They map 8/16/32 bits to unsigned char/short/int; change them if that
 54 |     // does not match your system. The names mirror cstdint / boost/cstdint.
 55 |     typedef unsigned char   uint8_t;
 56 |     typedef unsigned short  uint16_t;
 57 |     typedef unsigned int    uint32_t;
 58 | 
 59 | // Internal helpers - not part of the public interface and subject to change at any time
 60 | namespace internal
 61 | {
 62 |     // Unicode constants
 63 |     // Lead (high) surrogates occupy 0xd800 - 0xdbff
 64 |     // Trail (low) surrogates occupy 0xdc00 - 0xdfff
 65 |     const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
 66 |     const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
 67 |     const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
 68 |     const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
 69 |     const uint16_t LEAD_OFFSET         = 0xd7c0u;       // LEAD_SURROGATE_MIN - (0x10000 >> 10)
 70 |     const uint32_t SURROGATE_OFFSET    = 0xfca02400u;   // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
 71 | 
 72 |     // Largest value that is still a Unicode code point
 73 |     const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
 74 | 
 75 |     // Truncates oc to its low 8 bits.
 76 |     template<typename octet_type>
 77 |     inline uint8_t mask8(octet_type oc) { return static_cast<uint8_t>(0xff & oc); }
 78 |     // Truncates oc to its low 16 bits.
 79 |     template<typename u16_type>
 80 |     inline uint16_t mask16(u16_type oc) { return static_cast<uint16_t>(0xffff & oc); }
 81 | 
 82 |     // True when oc is a UTF-8 trail (continuation) octet, i.e. the top two
 83 |     // bits of its low byte are binary 10.
 84 |     template<typename octet_type>
 85 |     inline bool is_trail(octet_type oc)
 86 |     {
 87 |         const uint8_t top_bits = static_cast<uint8_t>(utf8::internal::mask8(oc) >> 6);
 88 |         return top_bits == 0x2;
 89 |     }
 90 | 
 91 |     // True for a UTF-16 lead (high) surrogate.
 92 |     template <typename u16>
 93 |     inline bool is_lead_surrogate(u16 cp)
 94 |     { return !(cp < LEAD_SURROGATE_MIN || cp > LEAD_SURROGATE_MAX); }
 95 | 
 96 |     // True for a UTF-16 trail (low) surrogate.
 97 |     template <typename u16>
 98 |     inline bool is_trail_surrogate(u16 cp)
 99 |     { return !(cp < TRAIL_SURROGATE_MIN || cp > TRAIL_SURROGATE_MAX); }
100 | 
101 |     // True for any surrogate, lead or trail.
102 |     template <typename u16>
103 |     inline bool is_surrogate(u16 cp)
104 |     { return !(cp < LEAD_SURROGATE_MIN || cp > TRAIL_SURROGATE_MAX); }
105 | 
106 |     // Valid means: not above CODE_POINT_MAX and not a surrogate.
107 |     template <typename u32>
108 |     inline bool is_code_point_valid(u32 cp)
109 |     {
110 |         if (cp > CODE_POINT_MAX)
111 |             return false;
112 |         return !utf8::internal::is_surrogate(cp);
113 |     }
114 | 
115 |     // Reports how many octets the sequence starting at lead_it should have,
116 |     // judging only by the bit pattern of its first (lead) octet.
117 |     // Returns 1-4 for a recognised lead and 0 for anything else (for example
118 |     // a trail octet).
119 |     template <typename octet_iterator>
120 |     inline typename std::iterator_traits<octet_iterator>::difference_type
121 |     sequence_length(octet_iterator lead_it)
122 |     {
123 |         const uint8_t first = utf8::internal::mask8(*lead_it);
124 |         if (first < 0x80)           return 1;   // 0xxxxxxx
125 |         if ((first >> 5) == 0x6)    return 2;   // 110xxxxx
126 |         if ((first >> 4) == 0xe)    return 3;   // 1110xxxx
127 |         if ((first >> 3) == 0x1e)   return 4;   // 11110xxx
128 |         // no lead pattern matched
129 |         return 0;
130 |     }
131 | 
132 |     // True when length differs from the octet count that the value of cp
133 |     // calls for: 1 below 0x80, 2 below 0x800, 3 below 0x10000. Values of
134 |     // 0x10000 and above are never reported as overlong.
135 |     template <typename octet_difference_type>
136 |     inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
137 |     {
138 |         // Octet count required by the value of cp; 0 = nothing to check
139 |         int required = 0;
140 |         if (cp < 0x80)
141 |             required = 1;
142 |         else if (cp < 0x800)
143 |             required = 2;
144 |         else if (cp < 0x10000)
145 |             required = 3;
146 | 
147 |         const bool constrained = (required != 0);
148 |         return constrained && length != required;
149 |     }
150 |     // Outcome codes reported by the decoding helpers below.
151 |     enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
152 | 
153 |     /// Steps it to the next octet and checks that it is a trail octet.
154 |     /// Returns NOT_ENOUGH_ROOM if the step reaches end, INCOMPLETE_SEQUENCE if
155 |     /// the octet found there is not a trail octet, and UTF8_OK otherwise.
156 |     template <typename octet_iterator>
157 |     utf_error increase_safely(octet_iterator& it, octet_iterator end)
158 |     {
159 |         ++it;
160 |         if (it == end)
161 |             return NOT_ENOUGH_ROOM;
162 |         const bool continues = utf8::internal::is_trail(*it);
163 |         return continues ? UTF8_OK : INCOMPLETE_SEQUENCE;
164 |     }
165 |     // Advances IT via increase_safely; on any error the enclosing function returns that error.
166 |     #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}    
167 | 
168 |     /// get_sequence_x: decode a UTF-8 sequence of x octets starting at it.
169 |     /// On UTF8_OK, it is left on the last octet of the sequence.
170 |     template <typename octet_iterator>
171 |     utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
172 |     {
173 |         if (it == end)
174 |             return NOT_ENOUGH_ROOM;
175 |         // A single octet is the code point itself
176 |         code_point = utf8::internal::mask8(*it);
177 |         return UTF8_OK;
178 |     }
179 | 
180 |     // Two octets: 110xxxxx 10xxxxxx
181 |     template <typename octet_iterator>
182 |     utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
183 |     {
184 |         if (it == end)
185 |             return NOT_ENOUGH_ROOM;
186 |         code_point = utf8::internal::mask8(*it);
187 | 
188 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
189 | 
190 |         // Five payload bits from the lead, six from the trail octet
191 |         code_point = ((code_point & 0x1f) << 6) + ((*it) & 0x3f);
192 |         return UTF8_OK;
193 |     }
194 | 
195 |     // Three octets: 1110xxxx 10xxxxxx 10xxxxxx
196 |     template <typename octet_iterator>
197 |     utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
198 |     {
199 |         if (it == end)
200 |             return NOT_ENOUGH_ROOM;
201 |         code_point = utf8::internal::mask8(*it);
202 | 
203 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
204 |         // Four payload bits from the lead, six from the first trail octet
205 |         code_point = ((code_point & 0x0f) << 12) + ((utf8::internal::mask8(*it) & 0x3f) << 6);
206 | 
207 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
208 |         // Six more bits from the last trail octet
209 |         code_point += (*it) & 0x3f;
210 | 
211 |         return UTF8_OK;
212 |     }
213 | 
214 |     // Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
215 |     template <typename octet_iterator>
216 |     utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
217 |     {
218 |         if (it == end)
219 |             return NOT_ENOUGH_ROOM;
220 |         code_point = utf8::internal::mask8(*it);
221 | 
222 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
223 |         // Three payload bits from the lead, six from the first trail octet
224 |         code_point = ((code_point & 0x07) << 18) + ((utf8::internal::mask8(*it) & 0x3f) << 12);
225 | 
226 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
227 |         // Six bits from the second trail octet
228 |         code_point += (utf8::internal::mask8(*it) & 0x3f) << 6;
229 | 
230 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
231 |         // Six bits from the last trail octet
232 |         code_point += (*it) & 0x3f;
233 | 
234 |         return UTF8_OK;
235 |     }
236 | 
237 |     #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
238 | 
239 |     // Decodes and validates the UTF-8 sequence that starts at it.
240 |     // Success: stores the value in code_point, moves it just past the sequence
241 |     // and returns UTF8_OK.
242 |     // Failure: returns the error code, does not assign code_point and leaves
243 |     // it at its starting position.
244 |     // Possible errors: NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE,
245 |     // OVERLONG_SEQUENCE and INVALID_CODE_POINT.
246 |     template <typename octet_iterator>
247 |     utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
248 |     {
249 |         if (it == end)
250 |             return NOT_ENOUGH_ROOM;
251 | 
252 |         // Remember where we started so a failed decode can be rolled back
253 |         // (which does not make much sense with e.g. stream iterators)
254 |         octet_iterator rollback = it;
255 | 
256 |         // The lead octet announces how long the sequence claims to be
257 |         typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
258 |         const octet_difference_type length = utf8::internal::sequence_length(it);
259 |         if (length == 0)
260 |             return INVALID_LEAD;
261 | 
262 |         // Read the trail octets with the decoder for that length
263 |         uint32_t decoded = 0;
264 |         utf_error status = UTF8_OK;
265 |         if (length == 1)
266 |             status = utf8::internal::get_sequence_1(it, end, decoded);
267 |         else if (length == 2)
268 |             status = utf8::internal::get_sequence_2(it, end, decoded);
269 |         else if (length == 3)
270 |             status = utf8::internal::get_sequence_3(it, end, decoded);
271 |         else if (length == 4)
272 |             status = utf8::internal::get_sequence_4(it, end, decoded);
273 | 
274 |         // Decoding worked; now the security checks
275 |         if (status == UTF8_OK) {
276 |             if (!utf8::internal::is_code_point_valid(decoded))
277 |                 status = INVALID_CODE_POINT;
278 |             else if (utf8::internal::is_overlong_sequence(decoded, length))
279 |                 status = OVERLONG_SEQUENCE;
280 |         }
281 | 
282 |         if (status != UTF8_OK) {
283 |             // Failure: put the iterator back where it was
284 |             it = rollback;
285 |             return status;
286 |         }
287 | 
288 |         // All checks passed: publish the value and step past the last octet
289 |         code_point = decoded;
290 |         ++it;
291 |         return UTF8_OK;
292 |     }
293 |     // Overload for callers that only need the status, not the code point.
294 |     template <typename octet_iterator>
295 |     inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
296 |         uint32_t unused_code_point;
297 |         return utf8::internal::validate_next(it, end, unused_code_point);
298 |     }
299 | 
300 | } // namespace internal
301 | 
302 |     /// Public API - the functions below are meant to be called by library users
303 | 
304 |     // UTF-8 encoding of the byte order mark
305 |     const uint8_t bom[] = {0xef, 0xbb, 0xbf};
306 | 
307 |     // Returns the position of the first invalid sequence in [start, end), or end.
308 |     template <typename octet_iterator>
309 |     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
310 |     {
311 |         octet_iterator cursor = start;
312 |         // validate_next moves cursor past each valid sequence and leaves it in
313 |         // place on the first invalid one
314 |         while (cursor != end && utf8::internal::validate_next(cursor, end) == internal::UTF8_OK) {}
315 |         return cursor;
316 |     }
317 | 
318 |     // True when find_invalid finds nothing wrong in [start, end).
319 |     template <typename octet_iterator>
320 |     inline bool is_valid(octet_iterator start, octet_iterator end)
321 |     {
322 |         return utf8::find_invalid(start, end) == end;
323 |     }
324 |     // True when the range begins with the three octets of the byte order mark.
325 |     template <typename octet_iterator>
326 |     inline bool starts_with_bom (octet_iterator it, octet_iterator end)
327 |     {
328 |         if (it == end || utf8::internal::mask8(*it++) != bom[0])
329 |             return false;
330 |         if (it == end || utf8::internal::mask8(*it++) != bom[1])
331 |             return false;
332 |         return it != end && utf8::internal::mask8(*it) == bom[2];
333 |     }
334 | } // namespace utf8
335 | 
336 | #endif // header guard
337 | 
338 | 
339 | 


--------------------------------------------------------------------------------
/utf8_console/utf8/cpp11.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2018 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
 29 | #define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
 30 | 
 31 | #include "checked.h"
 32 | #include <string>
 33 | 
 34 | namespace utf8
 35 | {
 36 |     // Appends the UTF-8 encoding of cp to s.
 37 |     inline void append(char32_t cp, std::string& s)
 38 |     { append(static_cast<uint32_t>(cp), std::back_inserter(s)); }
 39 | 
 40 |     // Converts UTF-16 text to a new UTF-8 string.
 41 |     inline std::string utf16to8(const std::u16string& s)
 42 |     {
 43 |         std::string utf8_text;
 44 |         utf16to8(s.begin(), s.end(), std::back_inserter(utf8_text));
 45 |         return utf8_text;
 46 |     }
 47 | 
 48 |     // Converts UTF-8 text to a new UTF-16 string.
 49 |     inline std::u16string utf8to16(const std::string& s)
 50 |     {
 51 |         std::u16string utf16_text;
 52 |         utf8to16(s.begin(), s.end(), std::back_inserter(utf16_text));
 53 |         return utf16_text;
 54 |     }
 55 | 
 56 |     // Converts UTF-32 text to a new UTF-8 string.
 57 |     inline std::string utf32to8(const std::u32string& s)
 58 |     {
 59 |         std::string utf8_text;
 60 |         utf32to8(s.begin(), s.end(), std::back_inserter(utf8_text));
 61 |         return utf8_text;
 62 |     }
 63 | 
 64 |     // Converts UTF-8 text to a new UTF-32 string.
 65 |     inline std::u32string utf8to32(const std::string& s)
 66 |     {
 67 |         std::u32string utf32_text;
 68 |         utf8to32(s.begin(), s.end(), std::back_inserter(utf32_text));
 69 |         return utf32_text;
 70 |     }
 71 | 
 72 |     // Offset of the first invalid sequence in s, or std::string::npos if there is none.
 73 |     inline std::size_t find_invalid(const std::string& s)
 74 |     {
 75 |         const std::string::const_iterator first_bad = find_invalid(s.begin(), s.end());
 76 |         return (first_bad != s.end()) ? (first_bad - s.begin()) : std::string::npos;
 77 |     }
 78 | 
 79 |     // True when the iterator-based is_valid accepts the whole of s.
 80 |     inline bool is_valid(const std::string& s) { return is_valid(s.begin(), s.end()); }
 81 | 
 82 |     // Copy of s in which invalid sequences are replaced by `replacement`.
 83 |     inline std::string replace_invalid(const std::string& s, char32_t replacement)
 84 |     {
 85 |         std::string repaired;
 86 |         replace_invalid(s.begin(), s.end(), std::back_inserter(repaired), replacement);
 87 |         return repaired;
 88 |     }
 89 | 
 90 |     // Same, using the default replacement of the iterator-based overload.
 91 |     inline std::string replace_invalid(const std::string& s)
 92 |     {
 93 |         std::string repaired;
 94 |         replace_invalid(s.begin(), s.end(), std::back_inserter(repaired));
 95 |         return repaired;
 96 |     }
 97 | 
 98 |     // True when s begins with the UTF-8 byte order mark.
 99 |     inline bool starts_with_bom(const std::string& s) { return starts_with_bom(s.begin(), s.end()); }
100 | } // namespace utf8
101 | 
102 | #endif // header guard
103 | 
104 | 


--------------------------------------------------------------------------------
/utf8_console/utf8/cpp17.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2018 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
 29 | #define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
 30 | 
 31 | #include "checked.h"
 32 | #include <string>
 33 | 
 34 | namespace utf8
 35 | {
 36 |     // Appends the UTF-8 encoding of cp to s.
 37 |     inline void append(char32_t cp, std::string& s)
 38 |     { append(static_cast<uint32_t>(cp), std::back_inserter(s)); }
 39 | 
 40 |     // Converts UTF-16 text to a new UTF-8 string.
 41 |     inline std::string utf16to8(std::u16string_view s)
 42 |     {
 43 |         std::string utf8_text;
 44 |         utf16to8(s.begin(), s.end(), std::back_inserter(utf8_text));
 45 |         return utf8_text;
 46 |     }
 47 | 
 48 |     // Converts UTF-8 text to a new UTF-16 string.
 49 |     inline std::u16string utf8to16(std::string_view s)
 50 |     {
 51 |         std::u16string utf16_text;
 52 |         utf8to16(s.begin(), s.end(), std::back_inserter(utf16_text));
 53 |         return utf16_text;
 54 |     }
 55 | 
 56 |     // Converts UTF-32 text to a new UTF-8 string.
 57 |     inline std::string utf32to8(std::u32string_view s)
 58 |     {
 59 |         std::string utf8_text;
 60 |         utf32to8(s.begin(), s.end(), std::back_inserter(utf8_text));
 61 |         return utf8_text;
 62 |     }
 63 | 
 64 |     // Converts UTF-8 text to a new UTF-32 string.
 65 |     inline std::u32string utf8to32(std::string_view s)
 66 |     {
 67 |         std::u32string utf32_text;
 68 |         utf8to32(s.begin(), s.end(), std::back_inserter(utf32_text));
 69 |         return utf32_text;
 70 |     }
 71 | 
 72 |     // Offset of the first invalid sequence in s, or std::string_view::npos if there is none.
 73 |     inline std::size_t find_invalid(std::string_view s)
 74 |     {
 75 |         const std::string_view::const_iterator first_bad = find_invalid(s.begin(), s.end());
 76 |         return (first_bad != s.end()) ? (first_bad - s.begin()) : std::string_view::npos;
 77 |     }
 78 | 
 79 |     // True when the iterator-based is_valid accepts the whole of s.
 80 |     inline bool is_valid(std::string_view s) { return is_valid(s.begin(), s.end()); }
 81 | 
 82 |     // Copy of s in which invalid sequences are replaced by `replacement`.
 83 |     inline std::string replace_invalid(std::string_view s, char32_t replacement)
 84 |     {
 85 |         std::string repaired;
 86 |         replace_invalid(s.begin(), s.end(), std::back_inserter(repaired), replacement);
 87 |         return repaired;
 88 |     }
 89 | 
 90 |     // Same, using the default replacement of the iterator-based overload.
 91 |     inline std::string replace_invalid(std::string_view s)
 92 |     {
 93 |         std::string repaired;
 94 |         replace_invalid(s.begin(), s.end(), std::back_inserter(repaired));
 95 |         return repaired;
 96 |     }
 97 | 
 98 |     // True when s begins with the UTF-8 byte order mark.
 99 |     inline bool starts_with_bom(std::string_view s) { return starts_with_bom(s.begin(), s.end()); }
100 | } // namespace utf8
101 | 
102 | #endif // header guard
103 | 
104 | 


--------------------------------------------------------------------------------
/utf8_console/utf8/unchecked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | 
 33 | namespace utf8
 34 | {
 35 |     namespace unchecked
 36 |     {
 37 |         // Writes the UTF-8 encoding of cp through result and returns the advanced
 38 |         // iterator. cp is not validated: every value of 0x10000 or more is
 39 |         // written as four octets.
 40 |         template <typename octet_iterator>
 41 |         octet_iterator append(uint32_t cp, octet_iterator result)
 42 |         {
 43 |             if (cp < 0x80) {
 44 |                 *(result++) = static_cast<uint8_t>(cp);
 45 |                 return result;
 46 |             }
 47 |             // Lead marker and number of 6-bit continuation groups after the lead
 48 |             uint32_t marker = 0xf0;
 49 |             int groups = 3;
 50 |             if (cp < 0x800)        { marker = 0xc0; groups = 1; }
 51 |             else if (cp < 0x10000) { marker = 0xe0; groups = 2; }
 52 | 
 53 |             *(result++) = static_cast<uint8_t>((cp >> (6 * groups)) | marker);
 54 |             // Continuation octets, most significant group first
 55 |             while (groups-- > 0)
 56 |                 *(result++) = static_cast<uint8_t>(((cp >> (6 * groups)) & 0x3f) | 0x80);
 57 |             return result;
 58 |         }
 59 | 
 60 |         // Copies [start, end) to out, writing `replacement` once for every invalid
 61 |         // sequence reported by validate_next. Returns the advanced output iterator.
 62 |         template <typename octet_iterator, typename output_iterator>
 63 |         output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
 64 |         {
 65 |             while (start != end) {
 66 |                 octet_iterator sequence_start = start;
 67 |                 const internal::utf_error status = utf8::internal::validate_next(start, end);
 68 |                 if (status == internal::UTF8_OK) {
 69 |                     // Valid sequence: pass its octets through unchanged
 70 |                     for (octet_iterator src = sequence_start; src != start; ++src)
 71 |                         *out++ = *src;
 72 |                     continue;
 73 |                 }
 74 |                 // Every kind of error yields exactly one replacement mark
 75 |                 out = utf8::unchecked::append (replacement, out);
 76 |                 if (status == internal::NOT_ENOUGH_ROOM) {
 77 |                     start = end;    // truncated tail: stop here
 78 |                 }
 79 |                 else if (status == internal::INVALID_LEAD) {
 80 |                     ++start;
 81 |                 }
 82 |                 else {
 83 |                     // INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT:
 84 |                     // drop the lead octet together with the trail octets after it
 85 |                     ++start;
 86 |                     while (start != end && utf8::internal::is_trail(*start))
 87 |                         ++start;
 88 |                 }
 89 |             }
 90 |             return out;
 91 |         }
 92 | 
 93 |         // Overload that uses 0xfffd as the replacement for invalid sequences.
 94 |         template <typename octet_iterator, typename output_iterator>
 95 |         inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
 96 |         {
 97 |             return utf8::unchecked::replace_invalid(start, end, out, static_cast<uint32_t>(utf8::internal::mask16(0xfffd)));
 98 |         }
 99 | 
100 |         // Decodes the code point at it and moves it past the whole sequence.
101 |         // Nothing is checked: no end bound, no trail-octet or range validation.
102 |         template <typename octet_iterator>
103 |         uint32_t next(octet_iterator& it)
104 |         {
105 |             uint32_t cp = utf8::internal::mask8(*it);
106 |             const typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
107 |             if (length == 2) {
108 |                 ++it;
109 |                 cp = ((cp & 0x1f) << 6) + ((*it) & 0x3f);
110 |             }
111 |             else if (length == 3) {
112 |                 ++it;
113 |                 cp = ((cp & 0x0f) << 12) + ((utf8::internal::mask8(*it) & 0x3f) << 6);
114 |                 ++it;
115 |                 cp += (*it) & 0x3f;
116 |             }
117 |             else if (length == 4) {
118 |                 ++it;
119 |                 cp = ((cp & 0x07) << 18) + ((utf8::internal::mask8(*it) & 0x3f) << 12);
120 |                 ++it;
121 |                 cp += (utf8::internal::mask8(*it) & 0x3f) << 6;
122 |                 ++it;
123 |                 cp += (*it) & 0x3f;
124 |             }
125 |             // For any other length cp stays the single octet read above.
126 |             // Step off the last octet that was consumed
127 |             ++it;
128 |             return cp;
129 |         }
130 | 
131 |         template <typename octet_iterator>
132 |         uint32_t peek_next(octet_iterator it)
133 |         {
134 |             return utf8::unchecked::next(it);
135 |         }
136 | 
137 |         template <typename octet_iterator>
138 |         uint32_t prior(octet_iterator& it)
139 |         {
140 |             while (utf8::internal::is_trail(*(--it))) ;
141 |             octet_iterator temp = it;
142 |             return utf8::unchecked::next(temp);
143 |         }
144 | 
145 |         template <typename octet_iterator, typename distance_type>
146 |         void advance (octet_iterator& it, distance_type n)
147 |         {
148 |             const distance_type zero(0);
149 |             if (n < zero) {
150 |                 // backward
151 |                 for (distance_type i = n; i < zero; ++i)
152 |                     utf8::unchecked::prior(it);
153 |             } else {
154 |                 // forward
155 |                 for (distance_type i = zero; i < n; ++i)
156 |                     utf8::unchecked::next(it);
157 |             }
158 |         }
159 | 
160 |         template <typename octet_iterator>
161 |         typename std::iterator_traits<octet_iterator>::difference_type
162 |         distance (octet_iterator first, octet_iterator last)
163 |         {
164 |             typename std::iterator_traits<octet_iterator>::difference_type dist;
165 |             for (dist = 0; first < last; ++dist) 
166 |                 utf8::unchecked::next(first);
167 |             return dist;
168 |         }
169 | 
170 |         template <typename u16bit_iterator, typename octet_iterator>
171 |         octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
172 |         {
173 |             while (start != end) {
174 |                 uint32_t cp = utf8::internal::mask16(*start++);
175 |             // Take care of surrogate pairs first
176 |                 if (utf8::internal::is_lead_surrogate(cp)) {
177 |                     uint32_t trail_surrogate = utf8::internal::mask16(*start++);
178 |                     cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
179 |                 }
180 |                 result = utf8::unchecked::append(cp, result);
181 |             }
182 |             return result;
183 |         }
184 | 
185 |         template <typename u16bit_iterator, typename octet_iterator>
186 |         u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
187 |         {
188 |             while (start < end) {
189 |                 uint32_t cp = utf8::unchecked::next(start);
190 |                 if (cp > 0xffff) { //make a surrogate pair
191 |                     *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
192 |                     *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
193 |                 }
194 |                 else
195 |                     *result++ = static_cast<uint16_t>(cp);
196 |             }
197 |             return result;
198 |         }
199 | 
200 |         template <typename octet_iterator, typename u32bit_iterator>
201 |         octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
202 |         {
203 |             while (start != end)
204 |                 result = utf8::unchecked::append(*(start++), result);
205 | 
206 |             return result;
207 |         }
208 | 
209 |         template <typename octet_iterator, typename u32bit_iterator>
210 |         u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
211 |         {
212 |             while (start < end)
213 |                 (*result++) = utf8::unchecked::next(start);
214 | 
215 |             return result;
216 |         }
217 | 
218 |         // The iterator class
219 |         template <typename octet_iterator>
220 |           class iterator {
221 |             octet_iterator it;
222 |             public:
223 |             typedef uint32_t value_type;
224 |             typedef uint32_t* pointer;
225 |             typedef uint32_t& reference;
226 |             typedef std::ptrdiff_t difference_type;
227 |             typedef std::bidirectional_iterator_tag iterator_category;
228 |             iterator () {}
229 |             explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
230 |             // the default "big three" are OK
231 |             octet_iterator base () const { return it; }
232 |             uint32_t operator * () const
233 |             {
234 |                 octet_iterator temp = it;
235 |                 return utf8::unchecked::next(temp);
236 |             }
237 |             bool operator == (const iterator& rhs) const 
238 |             { 
239 |                 return (it == rhs.it);
240 |             }
241 |             bool operator != (const iterator& rhs) const
242 |             {
243 |                 return !(operator == (rhs));
244 |             }
245 |             iterator& operator ++ () 
246 |             {
247 |                 ::std::advance(it, utf8::internal::sequence_length(it));
248 |                 return *this;
249 |             }
250 |             iterator operator ++ (int)
251 |             {
252 |                 iterator temp = *this;
253 |                 ::std::advance(it, utf8::internal::sequence_length(it));
254 |                 return temp;
255 |             }  
256 |             iterator& operator -- ()
257 |             {
258 |                 utf8::unchecked::prior(it);
259 |                 return *this;
260 |             }
261 |             iterator operator -- (int)
262 |             {
263 |                 iterator temp = *this;
264 |                 utf8::unchecked::prior(it);
265 |                 return temp;
266 |             }
267 |           }; // class iterator
268 | 
269 |     } // namespace utf8::unchecked
270 | } // namespace utf8 
271 | 
272 | 
273 | #endif // header guard
274 | 
275 | 


--------------------------------------------------------------------------------
/utf8_console/utfwidth.cpp:
--------------------------------------------------------------------------------
 1 | #include <exception>
 2 | #include <iostream>
 3 | #include <string_view>
 4 | #include <iomanip>
 5 | #include "utf8.h"
 6 | 
 7 | extern "C" int mk_wcwidth(wchar_t ucs);
 8 | extern "C" int mk_wcswidth(const wchar_t *pwcs, size_t n);
 9 | 
10 | class wcswidth_iterator
11 | {
12 | private:
13 |     size_t _result = 0;
14 | 
15 | public:
16 |     size_t result() const { return _result; }
17 |     void reset() { _result = 0; }
18 | 
19 |     wcswidth_iterator& operator=(wchar_t value)
20 |     {
21 |         auto len = mk_wcwidth(value);
22 |         if (len < 0)
23 |             throw std::runtime_error("Invalid UTF-8 value");
24 |         _result += len;
25 |         return *this;
26 |     }
27 | 
28 |     wcswidth_iterator& operator*() { return *this; }
29 |     wcswidth_iterator& operator++() { return *this; }
30 |     wcswidth_iterator& operator++(int) { return *this; }
31 | };
32 | 
33 | size_t utf8_cswidth(std::string_view str)
34 | {
35 |     if constexpr (sizeof(wchar_t) == 4)
36 |     {
37 |         return utf8::utf8to32(str.begin(), str.end(), wcswidth_iterator()).result();
38 |     }
39 |     else
40 |     {
41 |         return utf8::utf8to16(str.begin(), str.end(), wcswidth_iterator()).result();
42 |     }
43 | }
44 | 
45 | decltype(std::setw(0)) setw_u8(int w, std::string_view u8s)
46 | {
47 |     try
48 |     {
49 |         auto delta = utf8_cswidth(u8s) - u8s.length();
50 |         return std::setw(w - delta);
51 |     } catch (std::runtime_error&)
52 |     {
53 |         return std::setw(w);
54 |     }
55 | }


--------------------------------------------------------------------------------
/utf8_console/utfwidth.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <string_view>
4 | #include <iomanip>
5 | 
// UTF-8 aware counterpart of std::setw.
//
//   w    desired field width, in display columns
//   u8s  the UTF-8 string that will be streamed right after the manipulator
//
// Returns a std::setw manipulator whose width is `w` adjusted by the
// difference between the byte length of `u8s` and its display width.
auto setw_u8(int w, std::string_view u8s) -> decltype(std::setw(0));


--------------------------------------------------------------------------------