├── .gitattributes ├── .gitignore ├── .travis.yml ├── GraphemeBreakTest.txt ├── LICENSE ├── NormalizationTest.txt ├── README.md ├── fuzzer ├── Makefile ├── fuzz-clean.c ├── fuzz-grapheme.c ├── fuzz-invalid.c ├── fuzz-normalize.c └── fuzz-valid.c ├── lutf8lib.c ├── parseucd.lua ├── rockspecs ├── luautf8-0.1.6-1.rockspec └── luautf8-scm-1.rockspec ├── test.lua ├── test_compat.lua ├── test_pm.lua └── unidata.h /.gitattributes: -------------------------------------------------------------------------------- 1 | *.h linguist-language=C 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | UCD/ 2 | UCD.*/ 3 | ucd/ 4 | ucd.*/ 5 | *.dll 6 | 7 | lua-utf8.so 8 | lua-utf8.so.* 9 | luautf8-*.zip 10 | luautf8-*.rock 11 | 12 | *.gcov 13 | *.gcda 14 | *.gcno 15 | 16 | test_*.lua 17 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | sudo: false 3 | 4 | env: 5 | global: 6 | - LUAROCKS=2.4.3 7 | - ROCKSPEC=rockspecs/luautf8-scm-0.rockspec 8 | matrix: 9 | - LUA="lua 5.1" 10 | - LUA="lua 5.2" 11 | - LUA="lua 5.3" 12 | - LUA="luajit 2.0" 13 | - LUA="luajit 2.1" 14 | 15 | branches: 16 | only: 17 | - master 18 | - develop 19 | 20 | before_install: 21 | - pip install --user hererocks urllib3[secure] cpp-coveralls 22 | - hererocks env --$LUA -rlatest # Use latest LuaRocks, install into 'env' directory. 23 | - source env/bin/activate # Add directory with all installed binaries to PATH. 24 | 25 | install: 26 | # - sudo luarocks make $ROCKSPEC CFLAGS="-O2 -fPIC -ftest-coverage -fprofile-arcs" LIBFLAG="-shared --coverage" 27 | - luarocks make $ROCKSPEC CFLAGS="-O3 -fPIC -Wall -Wextra --coverage" LIBFLAG="-shared --coverage" 28 | 29 | script: 30 | - lua test.lua 31 | - lua test_pm.lua 32 | - lua test_compat.lua 33 | # - lunit.sh test.lua 34 | 35 | after_success: 36 | - coveralls 37 | # - coveralls -b .. -r .. --dump c.report.json 38 | # - luacov-coveralls -j c.report.json -v 39 | 40 | notifications: 41 | email: 42 | on_success: change 43 | on_failure: always 44 | 45 | # vim: ft=yaml nu et sw=2 fdc=2 fdm=syntax 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Xavier Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | UTF-8 module for Lua 5.x
2 | ========================
3 | [![Build Status](https://travis-ci.org/starwing/luautf8.svg?branch=master)](https://travis-ci.org/starwing/luautf8)
4 | [![Coverage Status](https://coveralls.io/repos/github/starwing/luautf8/badge.svg?branch=master)](https://coveralls.io/github/starwing/luautf8?branch=master)
5 | 
6 | This module adds UTF-8 support to Lua.
7 | 
8 | It uses data extracted from the
9 | [Unicode Character Database](http://www.unicode.org/reports/tr44/),
10 | and is tested on Lua 5.2.3, Lua 5.3.0 and LuaJIT.
11 | 
12 | parseucd.lua is a pure Lua script which generates unidata.h, to support
13 | converting characters and checking characters' categories.
14 | 
15 | It is compatible with Lua's own string module and passes all
16 | string and pattern matching tests in the Lua test suite[2].
17 | 
18 | It also adds some useful routines for UTF-8 features, such as:
19 | - a convenient interface to escape Unicode sequences in strings.
20 | - string insert/remove, since UTF-8 substring extraction may be expensive.
21 | - calculating Unicode width, useful when implementing e.g. a console emulator.
22 | - a useful interface to translate between Unicode offsets and byte offsets.
23 | - checking UTF-8 strings for validity and removing invalid byte sequences.
24 | - converting Unicode strings to normal form.
25 | 
26 | Note that to avoid conflict with Lua 5.3's built-in library 'utf8',
27 | this library produces a file named lua-utf8.dll or lua-utf8.so, so use
28 | it like this:
29 | 
30 | ```lua
31 | local utf8 = require 'lua-utf8'
32 | ```
33 | 
34 | in your code :-(
35 | 
36 | [2]: http://www.lua.org/tests/5.2/
37 | 
38 | 
39 | LuaRocks Installation
40 | ---------------------
41 | `luarocks install luautf8`
42 | 
43 | It's now fully compatible with Lua 5.3's utf8 library, so replacing this
44 | file (and headers) with lutf8lib.c from the Lua 5.3 sources is also okay.
45 | 
46 | Usage
47 | -----
48 | 
49 | Many routines are the same as in Lua's string module:
50 | - `utf8.byte`
51 | - `utf8.char`
52 | - `utf8.find`
53 | - `utf8.gmatch`
54 | - `utf8.gsub`
55 | - `utf8.len`
56 | - `utf8.lower`
57 | - `utf8.match`
58 | - `utf8.reverse`
59 | - `utf8.sub`
60 | - `utf8.upper`
61 | 
62 | The documentation of these functions can be found in the Lua manual[3].
63 | 
64 | [3]: http://www.lua.org/manual/5.2/manual.html#6.4
65 | 
66 | 
67 | Some routines in the string module don't need Unicode support:
68 | - `string.dump`
69 | - `string.format`
70 | - `string.rep`
71 | 
72 | They are NOT in the utf8 module.
73 | 
74 | Some routines are for compatibility with Lua 5.3's basic UTF-8 support library:
75 | - `utf8.offset`
76 | - `utf8.codepoint`
77 | - `utf8.codes`
78 | 
79 | See Lua 5.3's manual for usage.
80 | 
81 | Some routines are new, providing Unicode-specific functionality:
82 | 
83 | ### utf8.escape(str) -> utf8 string
84 | escape str into a UTF-8 string. It supports several escape formats:
85 | 
86 | * `%ddd` - where ddd is a decimal number of any length:
87 |   converts the Unicode code point to its UTF-8 encoding.
88 | * `%{ddd}` - same as `%ddd` but with brackets around the number.
89 | * `%uddd` - same as `%ddd`; the u stands for Unicode.
90 | * `%u{ddd}` - same as `%{ddd}`.
91 | * `%xhhh` - hexadecimal version of `%ddd`.
92 | * `%x{hhh}` - same as `%xhhh`.
93 | * `%?` - where '?' stands for any other character: escapes this character.
94 | 
95 | #### Examples:
96 | ```lua
97 | local u = utf8.escape
98 | print(u"%123%u123%{123}%u{123}%xABC%x{ABC}")
99 | print(u"%%123%?%d%%u")
100 | ```
101 | 
102 | 
103 | ### utf8.charpos(s[[, charpos], index]) -> charpos, code point
104 | convert a UTF-8 character position to a byte offset.
105 | if only `index` is given, return the byte offset of the UTF-8 char at that index.
106 | if both `charpos` and `index` are given, a new `charpos` will be
107 | calculated by adding/subtracting the UTF-8 char `index` to/from the current `charpos`.
108 | in all cases, it returns the new char position, and the code point (a
109 | number) at this position.
110 | 
111 | 
112 | ### utf8.next(s[, charpos[, index]]) -> charpos, code point
113 | iterate through the UTF-8 string s.
114 | If only s is given, it can be used as an iterator:
115 | ```lua
116 | for pos, code in utf8.next, "utf8-string" do
117 |   -- ...
118 | end
119 | ```
120 | if only s and `charpos` are given, return the byte offset of the next codepoint
121 | in the string.
122 | if `charpos` and `index` are given, a new `charpos` will be calculated by
123 | adding/subtracting the UTF-8 char offset to/from the current charpos.
124 | in all cases, it returns the new char position (in bytes), and the code point
125 | (a number) at this position.
126 | 
127 | 
128 | ### utf8.insert(s[, idx], substring) -> new_string
129 | insert a substring into s. If `idx` is given, insert the substring before
130 | the char at this index; otherwise, the substring will be appended to s.
131 | `idx` can be negative.
132 | 
133 | 
134 | ### utf8.remove(s[, start[, stop]]) -> new_string
135 | delete a substring of s. If neither `start` nor `stop` is given, delete the
136 | last UTF-8 char in s; if only `start` is given, delete chars from `start` to the end of s; if
137 | `stop` is also given, delete chars from `start` to `stop` (including `start` and `stop`).
138 | `start` and `stop` can be negative.
139 | 
140 | 
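For example, the position and editing routines above can be combined like this (the offsets and results shown in the comments follow from the documented behaviour for this particular string and are meant as illustration):

```lua
local utf8 = require 'lua-utf8'

local s = "héllo"            -- 'é' takes 2 bytes, so s is 6 bytes but 5 chars
print(#s, utf8.len(s))       -- 6   5

-- translate a char index into a byte position (and get the code point there)
local pos, cp = utf8.charpos(s, 3)   -- pos == 4 (byte offset of the first 'l'), cp == 108

-- walk the string codepoint by codepoint
for p, code in utf8.next, s do
  -- p is a byte offset, code is a numeric code point
end

-- insert/remove take char indices, not byte offsets
print(utf8.insert(s, 2, "yy"))   -- "hyyéllo": inserted before char 2 ('é')
print(utf8.remove(s, -2))        -- "hél": chars from index -2 to the end removed
```
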
141 | ### utf8.width(s[, ambi_is_double[, default_width]]) -> width
142 | calculate the width of UTF-8 string s. if `ambi_is_double` is given,
143 | characters with ambiguous width will be treated as having width 2;
144 | otherwise, they will be treated as having width 1.
145 | the width of fullwidth/doublewidth characters is 2, and the width of other
146 | characters is 1.
147 | if `default_width` is given, it will be used as the width of unprintable
148 | characters. (If you intend to replace unprintable characters with a placeholder,
149 | pass its width as `default_width`.)
150 | if s is a code point, return the width of this code point.
151 | 
152 | 
153 | ### utf8.widthindex(s, location[, ambi_is_double[, default_width]]) -> idx, offset, width
154 | return the character index at the given location in string s, where location is
155 | in width units. this is the inverse operation of utf8.width().
156 | if the requested location does not fall at a character boundary, `offset` will be
157 | greater than 1; specifically, if the location is at the second column (middle)
158 | of a wide char, `offset` will be 2. the width of the character at idx is also returned.
159 | 
160 | 
161 | ### utf8.title(s) -> new_string
162 | ### utf8.fold(s) -> new_string
163 | convert UTF-8 string s to title case, or to folded case (used for
164 | case-insensitive comparison).
165 | if s is a number, it's treated as a code point and the converted code
166 | point (a number) is returned.
167 | utf8.lower/utf8.upper have the same extension.
168 | 
169 | 
170 | ### utf8.ncasecmp(a, b) -> [-1,0,1]
171 | compare a and b case-insensitively; -1 means a < b, 0 means a == b, and 1 means a > b.
172 | 
173 | 
174 | ### utf8.isvalid(s) -> boolean
175 | check whether s is a valid UTF-8 string or not.
176 | 
177 | 
178 | ### utf8.clean(s[, replacement_string]) -> cleaned_string, was_valid
179 | replace any invalid UTF-8 byte sequences in s with the replacement string.
180 | if no replacement string is provided, the default is "�" (REPLACEMENT CHARACTER U+FFFD).
181 | note that *any* number of consecutive invalid bytes will be replaced by a single copy of the replacement string.
182 | the 2nd return value is true if the original string was already valid (meaning no replacements were made).
183 | 
184 | 
185 | ### utf8.invalidoffset(s[, init]) -> offset
186 | return the byte offset within s of the first invalid UTF-8 byte sequence.
187 | (1 is the first byte of the string.)
188 | if s is a valid UTF-8 string, return nil.
189 | the optional numeric argument init specifies where to start the search; its default value is 1 and it can be negative.
190 | 
191 | 
192 | ### utf8.isnfc(s) -> boolean
193 | check whether s is in Normal Form C or not.
194 | "Normal Form C" means that whenever possible, combining marks are combined with a preceding codepoint. For example, instead of U+0041 (LATIN CAPITAL LETTER A) followed by U+0301 (COMBINING ACUTE ACCENT), an NFC string will use U+00C1 (LATIN CAPITAL LETTER A WITH ACUTE). Also, some deprecated codepoints are converted to the recommended replacements.
195 | since the same sequence of characters can be represented in more than one way in Unicode, it is better to ensure strings are in Normal Form before comparing them.
196 | an error may be raised if s is not a valid UTF-8 string.
197 | 
198 | 
199 | ### utf8.normalize_nfc(s) -> normal_string, was_nfc
200 | convert s to Normal Form C.
201 | the 2nd return value is true if the original string was already in NFC (meaning no modifications were made).
202 | an error will be raised if s is not a valid UTF-8 string.
203 | 
204 | 
205 | ### utf8.grapheme_indices(s[, start[, stop]]) -> iterator
206 | return an iterator which yields the starting and ending byte index of each successive grapheme cluster in s. This range of bytes is inclusive of the endpoints, so the yielded values can be passed to `string.sub` to extract the grapheme cluster.
207 | if you provide `start` and `stop` byte indices, then the iterator will only cover the requested byte range. `start` and `stop` should fall on character boundaries, since an error will be raised if the requested byte range is not a valid UTF-8 string.
208 | ```lua
209 | local i = 1
210 | for from,to in utf8.grapheme_indices(s) do
211 |   print("grapheme cluster "..i.." is from byte "..from.." to byte "..to)
212 |   i = i + 1
213 | end
214 | ```
215 | 
216 | 
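A few usage sketches for the width, validation and normalization routines above (the outputs in the comments follow from the documented behaviour):

```lua
local utf8 = require 'lua-utf8'

-- display width in terminal columns
print(utf8.width("abc"))      -- 3
print(utf8.width("あいう"))   -- 6 (fullwidth characters count as 2 columns)

-- validity checking and repair ("\255" can never occur in valid UTF-8)
print(utf8.isvalid("abc\255"))            -- false
print(utf8.invalidoffset("abc\255"))      -- 4
print(utf8.clean("abc\255\254def", "?"))  -- "abc?def"  false

-- normalization: U+0041 + U+0301 (COMBINING ACUTE ACCENT) composes to U+00C1
local decomposed = "A\204\129"
print(utf8.isnfc(decomposed))             -- false
local nfc, was_nfc = utf8.normalize_nfc(decomposed)
print(nfc == "\195\129", was_nfc)         -- true   false
```
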
217 | Improvements needed
218 | -------------------
219 | 
220 | - add a Lua 5.3 spec test-suite.
221 | - more test cases.
222 | - grapheme-compose support, and affect in utf8.reverse and utf8.width 223 | 224 | 225 | License 226 | ------- 227 | It uses the same license as Lua: http://www.lua.org/license.html 228 | -------------------------------------------------------------------------------- /fuzzer/Makefile: -------------------------------------------------------------------------------- 1 | ALL: lua-utf8.so fuzz-valid fuzz-clean fuzz-invalid fuzz-normalize fuzz-grapheme 2 | 3 | clean: 4 | rm lua-utf8.so fuzz-valid fuzz-clean fuzz-invalid fuzz-normalize fuzz-grapheme 5 | 6 | lua-utf8.so: ../lutf8lib.c 7 | clang -g -fsanitize=fuzzer-no-link,address -fPIC $$(pkg-config --cflags lua5.1) ../lutf8lib.c -shared -o lua-utf8.so 8 | 9 | fuzz-valid: fuzz-valid.c 10 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 fuzz-valid.c -o fuzz-valid 11 | 12 | fuzz-clean: fuzz-clean.c 13 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 fuzz-clean.c -o fuzz-clean 14 | 15 | fuzz-invalid: fuzz-invalid.c 16 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 fuzz-invalid.c -o fuzz-invalid 17 | 18 | fuzz-normalize: fuzz-normalize.c 19 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 -licuuc fuzz-normalize.c -o fuzz-normalize 20 | 21 | fuzz-grapheme: fuzz-grapheme.c 22 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 -licuuc fuzz-grapheme.c -o fuzz-grapheme 23 | -------------------------------------------------------------------------------- /fuzzer/fuzz-clean.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "lua.h" 7 | #include "lualib.h" 8 | #include "lauxlib.h" 9 | 10 | lua_State *L; 11 | 12 | /* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ 13 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 14 | { 15 | unsigned char *p = in, *e = p + in_len; 16 | 17 | while (p < e) { 18 | unsigned char c = *p++; 19 | 20 | if (c < 0x80) { 21 | /* do nothing */ 22 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 23 | if (p < e) { 24 | unsigned char c2 = *p++; 25 | if ((c2 & 0xC0) != 0x80) { 26 | return false; 27 | } 28 | } else { 29 | return false; 30 | } 31 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 32 | if ((e - p) >= 2) { 33 | unsigned char c2 = *p++; 34 | unsigned char c3 = *p++; 35 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 36 | return false; 37 | } else if ((c3 & 0xC0) != 0x80) { 38 | return false; 39 | } 40 | } else { 41 | return false; 42 | } 43 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 44 | if ((e - p) >= 3) { 45 | unsigned char c2 = *p++; 46 | unsigned char c3 = *p++; 47 | unsigned char c4 = *p++; 48 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 49 | * fit in 3 bytes only. 
If c == 0xF4 and c2 >= 0x90, then this codepoint is 50 | * greater than U+10FFFF, which is the highest legal codepoint */ 51 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 52 | return false; 53 | } else if ((c3 & 0xC0) != 0x80) { 54 | return false; 55 | } else if ((c4 & 0xC0) != 0x80) { 56 | return false; 57 | } 58 | } else { 59 | return false; 60 | } 61 | } else { 62 | return false; 63 | } 64 | } 65 | 66 | return true; 67 | } 68 | 69 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 70 | { 71 | lua_getglobal(L, "utf8"); 72 | lua_getfield(L, -1, "clean"); 73 | 74 | const char *orig_data = (const char*)Data; 75 | 76 | uint8_t *Comma = memchr(Data, ',', Size); 77 | const char *repl = NULL; 78 | size_t repl_len; 79 | 80 | if (Comma) { 81 | /* We will pass two arguments (the 2nd one is optional) */ 82 | lua_pushlstring(L, (const char*)Data, Comma - Data); 83 | Size -= Comma - Data + 1; 84 | Data = Comma + 1; 85 | repl = (const char*)Data; 86 | repl_len = Size; 87 | } 88 | 89 | lua_pushlstring(L, (const char*)Data, Size); 90 | 91 | size_t input_len = lua_objlen(L, Comma ? -2 : -1); 92 | 93 | /* 94 | const char *dbg = lua_tostring(L, Comma ? -2 : -1); 95 | printf("Input length = %zu\n", input_len); 96 | printf("Input = "); 97 | for (int i = 0; i < input_len; i++) 98 | printf("%02x", dbg[i] & 0xFF); 99 | printf("\n"); 100 | */ 101 | 102 | int err = lua_pcall(L, Comma ? 2 : 1, 2, 0); 103 | /* printf("Err = %x\n", err); */ 104 | 105 | if (err) { 106 | /* utf8.clean raised an error */ 107 | assert(repl != NULL); 108 | 109 | /* 110 | if (err == 2) { 111 | const char *errmsg = lua_tostring(L, -1); 112 | printf("Err message = %s\n", errmsg); 113 | } 114 | 115 | printf("Replacement length = %zu\n", repl_len); 116 | printf("Replacement = "); 117 | for (int i = 0; i < repl_len; i++) 118 | printf("%02x", repl[i] & 0xFF); 119 | printf("\n"); 120 | */ 121 | 122 | assert(!php_mbstring_check_utf8((unsigned char*)repl, repl_len)); 123 | } else { 124 | assert(lua_isstring(L, -2)); 125 | assert(lua_isboolean(L, -1)); 126 | const char *str = lua_tostring(L, -2); 127 | int was_clean = lua_toboolean(L, -1); 128 | size_t output_len = lua_objlen(L, -2); 129 | 130 | /* 131 | printf("Output length = %zu\n", output_len); 132 | printf("Output = "); 133 | for (int i = 0; i < output_len; i++) 134 | printf("%02x", str[i] & 0xFF); 135 | printf("\n"); 136 | */ 137 | 138 | if (was_clean) { 139 | assert(input_len == output_len); 140 | assert(memcmp(orig_data, str, input_len) == 0); 141 | } else { 142 | assert(input_len != output_len || memcmp(orig_data, str, input_len) != 0); 143 | } 144 | assert(php_mbstring_check_utf8((unsigned char*)str, output_len)); 145 | } 146 | 147 | lua_settop(L, 0); // clear Lua stack 148 | 149 | return 0; 150 | } 151 | 152 | int LLVMFuzzerInitialize(int *argc, char ***argv) 153 | { 154 | L = luaL_newstate(); 155 | luaL_openlibs(L); 156 | lua_getglobal(L, "require"); 157 | lua_pushstring(L, "lua-utf8"); 158 | lua_call(L, 1, 1); 159 | lua_setglobal(L, "utf8"); 160 | return 0; 161 | } 162 | -------------------------------------------------------------------------------- /fuzzer/fuzz-grapheme.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "lua.h" 8 | #include "lualib.h" 9 | #include "lauxlib.h" 10 | 11 | #include "unicode/ucnv.h" 12 | #include "unicode/ubrk.h" 13 | 14 | lua_State *L; 15 | 16 | /* Adapted from mb_utf8_to_wchar (from the 
PHP codebase) */ 17 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 18 | { 19 | unsigned char *p = in, *e = p + in_len; 20 | 21 | while (p < e) { 22 | unsigned char c = *p++; 23 | 24 | if (c < 0x80) { 25 | /* do nothing */ 26 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 27 | if (p < e) { 28 | unsigned char c2 = *p++; 29 | if ((c2 & 0xC0) != 0x80) { 30 | return false; 31 | } 32 | } else { 33 | return false; 34 | } 35 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 36 | if ((e - p) >= 2) { 37 | unsigned char c2 = *p++; 38 | unsigned char c3 = *p++; 39 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 40 | return false; 41 | } else if ((c3 & 0xC0) != 0x80) { 42 | return false; 43 | } 44 | } else { 45 | return false; 46 | } 47 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 48 | if ((e - p) >= 3) { 49 | unsigned char c2 = *p++; 50 | unsigned char c3 = *p++; 51 | unsigned char c4 = *p++; 52 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 53 | * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is 54 | * greater than U+10FFFF, which is the highest legal codepoint */ 55 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 56 | return false; 57 | } else if ((c3 & 0xC0) != 0x80) { 58 | return false; 59 | } else if ((c4 & 0xC0) != 0x80) { 60 | return false; 61 | } 62 | } else { 63 | return false; 64 | } 65 | } else { 66 | return false; 67 | } 68 | } 69 | 70 | return true; 71 | } 72 | 73 | /* From PHP codebase */ 74 | const unsigned char mblen_table_utf8[] = { 75 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 76 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 77 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 78 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 79 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 80 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 81 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 82 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 84 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 85 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 86 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 87 | 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 88 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 89 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 90 | 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 91 | }; 92 | 93 | const size_t utf16_code_unit_len(const unsigned char *s, size_t byte_len) { 94 | const unsigned char *e = s + byte_len; 95 | size_t result = 0; 96 | while (s < e) { 97 | unsigned char c = *s; 98 | s += mblen_table_utf8[c]; 99 | result++; 100 | if (c >= 0xF0 && c <= 0xF4) 101 | result++; /* 4-byte UTF-8 characters will take 2 UTF-16 code units */ 102 | } 103 | return result; 104 | } 105 | 106 | /* Adapted from source code for PostgreSQL ICU extension */ 107 | static int32_t icu_to_uchar(UConverter *icu_converter, UChar **buff_uchar, const char *buff, int32_t nbytes) 108 | { 109 | UErrorCode status = U_ZERO_ERROR; 110 | int32_t len_uchar = ucnv_toUChars(icu_converter, NULL, 0, buff, nbytes, &status); 111 | if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 112 | printf("Error from ucnv_toUChars: %s\n", u_errorName(status)); 113 | assert(0); 114 | } 115 | 116 | *buff_uchar = (UChar *) malloc((len_uchar + 1) * sizeof(**buff_uchar)); 117 | 118 | status = U_ZERO_ERROR; 119 | len_uchar = 
ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1,buff, nbytes, &status); 120 | if (U_FAILURE(status)) { 121 | printf("Error from ucnv_toUChars: %s\n", u_errorName(status)); 122 | assert(0); 123 | } 124 | 125 | return len_uchar; 126 | } 127 | 128 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 129 | { 130 | /* 131 | printf("(%zu): ", Size); 132 | for (unsigned int i = 0; i < Size; i++) 133 | printf("%02x ", Data[i]); 134 | printf("\n"); 135 | */ 136 | 137 | /* We can only compare with the results from ICU if the entire string was valid UTF-8; 138 | * ICU needs to convert the entire string to codepoints before operationg on it, 139 | * and it can only do that if it's valid UTF-8 */ 140 | bool valid_utf8 = php_mbstring_check_utf8((unsigned char*)Data, Size); 141 | 142 | UChar *ubuff = NULL; 143 | int32_t usize = 0; 144 | UConverter *icu_converter = NULL; 145 | UBreakIterator *bi = NULL; 146 | uint32_t p = 0; 147 | 148 | if (valid_utf8) { 149 | UErrorCode errcode = U_ZERO_ERROR; 150 | icu_converter = ucnv_open("utf8", &errcode); 151 | if (U_FAILURE(errcode)) { 152 | printf("Error from ucnv_open: %s\n", u_errorName(errcode)); 153 | assert(0); 154 | } 155 | usize = icu_to_uchar(icu_converter, &ubuff, (const char*)Data, Size); 156 | errcode = U_ZERO_ERROR; 157 | 158 | /* 159 | printf("UTF-16 code units from ICU: (%d): ", usize); 160 | for (unsigned int i = 0; i < usize; i++) 161 | printf("%04x ", ubuff[i]); 162 | printf("\n"); 163 | */ 164 | 165 | bi = ubrk_open(UBRK_CHARACTER, 0, ubuff, usize, &errcode); 166 | if (U_FAILURE(errcode)) { 167 | printf("Error from ubrk_open: %s\n", u_errorName(errcode)); 168 | assert(0); 169 | } 170 | p = ubrk_first(bi); 171 | } 172 | 173 | lua_getglobal(L, "utf8"); 174 | lua_getfield(L, -1, "grapheme_indices"); 175 | lua_pushlstring(L, (const char*)Data, Size); 176 | int err = lua_pcall(L, 1, 1, 0); 177 | assert(!err); 178 | assert(lua_iscfunction(L, -1)); 179 | lua_CFunction iterator = lua_tocfunction(L, -1); 180 | 181 | while (true) { 182 | lua_pushvalue(L, -1); // duplicate iterator (on top of stack) 183 | int err = lua_pcall(L, 0, 2, 0); 184 | if (err) { 185 | assert(!valid_utf8); 186 | break; 187 | } 188 | 189 | if (lua_isnil(L, -1)) { 190 | /* Finished iteration */ 191 | if (valid_utf8) { 192 | p = ubrk_next(bi); 193 | assert(p == UBRK_DONE); 194 | } 195 | break; 196 | } else { 197 | assert(lua_isnumber(L, -1)); 198 | assert(lua_isnumber(L, -2)); 199 | int start = lua_tonumber(L, -2); 200 | int end = lua_tonumber(L, -1); 201 | lua_pop(L, 2); 202 | if (valid_utf8) { 203 | printf("start = %d, end = %d, p = %d\n", start, end, p); 204 | /* start and end are byte offsets, p is a codepoint offset */ 205 | assert(p == utf16_code_unit_len(Data, start-1)); 206 | p = ubrk_next(bi); 207 | printf("moved to next boundary, now p = %d\n", p); 208 | printf("utf16_code_unit_len(Data, end) = %zu\n", utf16_code_unit_len(Data, end)); 209 | assert(p != UBRK_DONE); 210 | assert(p == utf16_code_unit_len(Data, end)); 211 | } 212 | } 213 | } 214 | 215 | lua_settop(L, 0); // clear Lua stack 216 | 217 | free(ubuff); 218 | if (icu_converter) 219 | ucnv_close(icu_converter); 220 | if (bi) 221 | ubrk_close(bi); 222 | 223 | return 0; 224 | } 225 | 226 | int LLVMFuzzerInitialize(int *argc, char ***argv) 227 | { 228 | L = luaL_newstate(); 229 | luaL_openlibs(L); 230 | lua_getglobal(L, "require"); 231 | lua_pushstring(L, "lua-utf8"); 232 | lua_call(L, 1, 1); 233 | lua_setglobal(L, "utf8"); 234 | return 0; 235 | } 236 | 
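/* Note on offset units: utf8.grapheme_indices yields 1-based *byte* offsets into the
 * original string, while ubrk_first/ubrk_next report boundaries in UTF-16 code units;
 * utf16_code_unit_len() converts the byte offsets so that the assertions in
 * LLVMFuzzerTestOneInput above compare like with like. */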
-------------------------------------------------------------------------------- /fuzzer/fuzz-invalid.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "lua.h" 7 | #include "lualib.h" 8 | #include "lauxlib.h" 9 | 10 | lua_State *L; 11 | 12 | /* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ 13 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 14 | { 15 | unsigned char *p = in, *e = p + in_len; 16 | 17 | while (p < e) { 18 | unsigned char c = *p++; 19 | 20 | if (c < 0x80) { 21 | /* do nothing */ 22 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 23 | if (p < e) { 24 | unsigned char c2 = *p++; 25 | if ((c2 & 0xC0) != 0x80) { 26 | return false; 27 | } 28 | } else { 29 | return false; 30 | } 31 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 32 | if ((e - p) >= 2) { 33 | unsigned char c2 = *p++; 34 | unsigned char c3 = *p++; 35 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 36 | return false; 37 | } else if ((c3 & 0xC0) != 0x80) { 38 | return false; 39 | } 40 | } else { 41 | return false; 42 | } 43 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 44 | if ((e - p) >= 3) { 45 | unsigned char c2 = *p++; 46 | unsigned char c3 = *p++; 47 | unsigned char c4 = *p++; 48 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 49 | * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is 50 | * greater than U+10FFFF, which is the highest legal codepoint */ 51 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 52 | return false; 53 | } else if ((c3 & 0xC0) != 0x80) { 54 | return false; 55 | } else if ((c4 & 0xC0) != 0x80) { 56 | return false; 57 | } 58 | } else { 59 | return false; 60 | } 61 | } else { 62 | return false; 63 | } 64 | } 65 | 66 | return true; 67 | } 68 | 69 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 70 | { 71 | lua_getglobal(L, "utf8"); 72 | lua_getfield(L, -1, "invalidoffset"); 73 | 74 | int offset = 0; 75 | if (Size > 2) { 76 | offset = *Data++; 77 | if (*Data++ % 2 == 1) 78 | offset = -offset; 79 | Size -= 2; 80 | } 81 | 82 | lua_pushlstring(L, (const char*)Data, Size); 83 | lua_pushinteger(L, offset); 84 | 85 | /* 86 | const char *dbg = lua_tostring(L, -2); 87 | printf("Input length = %zu\n", Size); 88 | printf("Input = "); 89 | for (int i = 0; i < Size; i++) 90 | printf("%02x", Data[i] & 0xFF); 91 | printf("\n"); 92 | printf("Offset = %d\n", offset); 93 | */ 94 | 95 | lua_call(L, 2, 1); 96 | 97 | assert(lua_isnumber(L, -1) || lua_isnil(L, -1)); 98 | 99 | /* Convert offset into a positive number from 1 - length of string 100 | * (offset is 1-based, not 0-based) */ 101 | if (offset < 0) { 102 | offset = Size + offset + 1; 103 | if (offset <= 0) { 104 | offset = 1; 105 | } 106 | } else if (offset == 0) { 107 | offset = 1; 108 | } else if (offset > Size) { 109 | offset = Size + 1; 110 | } 111 | 112 | if (lua_isnumber(L, -1)) { 113 | double retval = lua_tonumber(L, -1); 114 | /* printf("Retval = %d\n", (int)retval); */ 115 | assert(floor(retval) == ceil(retval)); /* Although 'double', it's actually an integer */ 116 | assert(retval >= offset); 117 | assert(retval > 0); 118 | assert(retval <= Size); 119 | assert(!php_mbstring_check_utf8((unsigned char*)Data + (int)retval - 1, Size - (int)retval + 1)); 120 | } else { 121 | assert(php_mbstring_check_utf8((unsigned char*)Data + 
offset - 1, Size - offset + 1)); 122 | } 123 | 124 | lua_settop(L, 0); // clear Lua stack 125 | 126 | return 0; 127 | } 128 | 129 | int LLVMFuzzerInitialize(int *argc, char ***argv) 130 | { 131 | L = luaL_newstate(); 132 | luaL_openlibs(L); 133 | lua_getglobal(L, "require"); 134 | lua_pushstring(L, "lua-utf8"); 135 | lua_call(L, 1, 1); 136 | lua_setglobal(L, "utf8"); 137 | return 0; 138 | } 139 | -------------------------------------------------------------------------------- /fuzzer/fuzz-normalize.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "lua.h" 8 | #include "lualib.h" 9 | #include "lauxlib.h" 10 | 11 | #include "unicode/ucnv.h" 12 | #include "unicode/unorm2.h" 13 | 14 | lua_State *L; 15 | 16 | /* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ 17 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 18 | { 19 | unsigned char *p = in, *e = p + in_len; 20 | 21 | while (p < e) { 22 | unsigned char c = *p++; 23 | 24 | if (c < 0x80) { 25 | /* do nothing */ 26 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 27 | if (p < e) { 28 | unsigned char c2 = *p++; 29 | if ((c2 & 0xC0) != 0x80) { 30 | return false; 31 | } 32 | } else { 33 | return false; 34 | } 35 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 36 | if ((e - p) >= 2) { 37 | unsigned char c2 = *p++; 38 | unsigned char c3 = *p++; 39 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 40 | return false; 41 | } else if ((c3 & 0xC0) != 0x80) { 42 | return false; 43 | } 44 | } else { 45 | return false; 46 | } 47 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 48 | if ((e - p) >= 3) { 49 | unsigned char c2 = *p++; 50 | unsigned char c3 = *p++; 51 | unsigned char c4 = *p++; 52 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 53 | * fit in 3 bytes only. 
If c == 0xF4 and c2 >= 0x90, then this codepoint is 54 | * greater than U+10FFFF, which is the highest legal codepoint */ 55 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 56 | return false; 57 | } else if ((c3 & 0xC0) != 0x80) { 58 | return false; 59 | } else if ((c4 & 0xC0) != 0x80) { 60 | return false; 61 | } 62 | } else { 63 | return false; 64 | } 65 | } else { 66 | return false; 67 | } 68 | } 69 | 70 | return true; 71 | } 72 | 73 | /* Adapted from source code for PostgreSQL ICU extension */ 74 | static int32_t icu_to_uchar(UConverter *icu_converter, UChar **buff_uchar, const char *buff, int32_t nbytes) 75 | { 76 | UErrorCode status = U_ZERO_ERROR; 77 | int32_t len_uchar = ucnv_toUChars(icu_converter, NULL, 0, buff, nbytes, &status); 78 | if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 79 | printf("Error from ucnv_toUChars: %s\n", u_errorName(status)); 80 | assert(0); 81 | } 82 | 83 | *buff_uchar = (UChar *) malloc((len_uchar + 1) * sizeof(**buff_uchar)); 84 | 85 | status = U_ZERO_ERROR; 86 | len_uchar = ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1,buff, nbytes, &status); 87 | if (U_FAILURE(status)) { 88 | printf("Error from ucnv_toUChars: %s\n", u_errorName(status)); 89 | assert(0); 90 | } 91 | 92 | return len_uchar; 93 | } 94 | 95 | static int32_t icu_from_uchar(UConverter *icu_converter, char **result, const UChar *buff_uchar, int32_t len_uchar) 96 | { 97 | UErrorCode status = U_ZERO_ERROR; 98 | uint32_t len_result = ucnv_fromUChars(icu_converter, NULL, 0, buff_uchar, len_uchar, &status); 99 | if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) 100 | assert(0); 101 | 102 | *result = (char *) malloc(len_result + 1); 103 | 104 | status = U_ZERO_ERROR; 105 | len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1, buff_uchar, len_uchar, &status); 106 | if (U_FAILURE(status)) 107 | assert(0); 108 | 109 | return len_result; 110 | } 111 | 112 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 113 | { 114 | /* 115 | printf("Input (%zu): ", Size); 116 | for (unsigned int i = 0; i < Size; i++) 117 | printf("%02x ", Data[i]); 118 | printf("\n"); 119 | */ 120 | 121 | /* We can only compare with the results from ICU if the entire string was valid UTF-8; 122 | * ICU won't even allow us to check whether the string is NFC unless it's valid UTF-8 */ 123 | bool valid_utf8 = php_mbstring_check_utf8((unsigned char*)Data, Size); 124 | 125 | UChar *ubuff = NULL; 126 | int32_t usize = 0; 127 | UConverter *icu_converter = NULL; 128 | 129 | if (valid_utf8) { 130 | UErrorCode errcode = U_ZERO_ERROR; 131 | icu_converter = ucnv_open("utf8", &errcode); 132 | if (U_FAILURE(errcode)) { 133 | printf("Error from ucnv_open: %s\n", u_errorName(errcode)); 134 | assert(0); 135 | } 136 | usize = icu_to_uchar(icu_converter, &ubuff, (const char*)Data, Size); 137 | } 138 | 139 | lua_getglobal(L, "utf8"); 140 | lua_getfield(L, -1, "isnfc"); 141 | lua_pushlstring(L, (const char*)Data, Size); 142 | int err = lua_pcall(L, 1, 1, 0); 143 | 144 | if (err) { 145 | /* utf8.isnfc raised an error */ 146 | assert(!valid_utf8); 147 | } else { 148 | assert(lua_isboolean(L, -1)); 149 | int was_nfc = lua_toboolean(L, -1); 150 | 151 | /* If the string was not NFC, we cannot assume that the string is valid UTF-8, 152 | * even if no error was raised... 
if utf8.isnfc notices that the string is not NFC, 153 | * it will immediately return false and will not check whether the trailing portion 154 | * is valid UTF-8 or not */ 155 | assert(!was_nfc || valid_utf8); 156 | 157 | if (valid_utf8) { 158 | UErrorCode errcode = U_ZERO_ERROR; 159 | const UNormalizer2 *norm = unorm2_getNFCInstance(&errcode); 160 | assert(!U_FAILURE(errcode)); 161 | UBool was_actually_nfc = unorm2_isNormalized(norm, ubuff, usize, &errcode); 162 | assert(!U_FAILURE(errcode)); 163 | 164 | /* 165 | printf("lua-utf8, is the input NFC? %s\n", was_nfc ? "yes" : "no"); 166 | printf("ICU, is the input NFC? %s\n", was_actually_nfc ? "yes" : "no"); 167 | */ 168 | 169 | assert(was_nfc == was_actually_nfc); 170 | } 171 | } 172 | 173 | lua_getglobal(L, "utf8"); 174 | lua_getfield(L, -1, "normalize_nfc"); 175 | lua_pushlstring(L, (const char*)Data, Size); 176 | err = lua_pcall(L, 1, 2, 0); 177 | 178 | if (err) { 179 | /* utf8.nfc_normalize raised an error */ 180 | assert(!valid_utf8); 181 | } else { 182 | assert(lua_isboolean(L, -1)); 183 | int was_already_nfc = lua_toboolean(L, -1); 184 | 185 | assert(lua_isstring(L, -2)); 186 | const char *str = lua_tostring(L, -2); 187 | size_t str_len = lua_objlen(L, -2); 188 | 189 | assert(valid_utf8 || !was_already_nfc); 190 | 191 | if (valid_utf8) { 192 | UErrorCode errcode = U_ZERO_ERROR; 193 | const UNormalizer2 *norm = unorm2_getNFCInstance(&errcode); 194 | assert(!U_FAILURE(errcode)); 195 | 196 | uint32_t dest_size = 3 * usize; /* Maximum size which string could possibly expand to as NFC */ 197 | UChar *dest = malloc(dest_size * sizeof(UChar)); 198 | 199 | uint32_t dest_len = unorm2_normalize(norm, ubuff, usize, dest, dest_size, &errcode); 200 | assert(!U_FAILURE(errcode)); 201 | 202 | /* Convert NFC codepoints to UTF-8 bytes */ 203 | char *bytes = NULL; 204 | uint32_t byte_len = icu_from_uchar(icu_converter, &bytes, dest, dest_len); 205 | 206 | /* 207 | printf("lua-utf8 (%zu): ", str_len); 208 | for (unsigned int i = 0; i < str_len; i++) 209 | printf("%02x ", (uint8_t)str[i]); 210 | printf("\n"); 211 | printf("ICU (%u): ", byte_len); 212 | for (unsigned int i = 0; i < byte_len; i++) 213 | printf("%02x ", (uint8_t)bytes[i]); 214 | printf("\n"); 215 | */ 216 | 217 | assert(byte_len == str_len); 218 | assert(strncmp(str, bytes, str_len) == 0); 219 | 220 | free(dest); 221 | free(bytes); 222 | } 223 | } 224 | 225 | lua_settop(L, 0); // clear Lua stack 226 | 227 | free(ubuff); 228 | if (icu_converter) 229 | ucnv_close(icu_converter); 230 | 231 | return 0; 232 | } 233 | 234 | int LLVMFuzzerInitialize(int *argc, char ***argv) 235 | { 236 | L = luaL_newstate(); 237 | luaL_openlibs(L); 238 | lua_getglobal(L, "require"); 239 | lua_pushstring(L, "lua-utf8"); 240 | lua_call(L, 1, 1); 241 | lua_setglobal(L, "utf8"); 242 | return 0; 243 | } 244 | -------------------------------------------------------------------------------- /fuzzer/fuzz-valid.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "lua.h" 6 | #include "lualib.h" 7 | #include "lauxlib.h" 8 | 9 | lua_State *L; 10 | 11 | /* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ 12 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 13 | { 14 | unsigned char *p = in, *e = p + in_len; 15 | 16 | while (p < e) { 17 | unsigned char c = *p++; 18 | 19 | if (c < 0x80) { 20 | /* do nothing */ 21 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 22 | if (p < e) { 23 | unsigned char 
c2 = *p++; 24 | if ((c2 & 0xC0) != 0x80) { 25 | return false; 26 | } 27 | } else { 28 | return false; 29 | } 30 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 31 | if ((e - p) >= 2) { 32 | unsigned char c2 = *p++; 33 | unsigned char c3 = *p++; 34 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 35 | return false; 36 | } else if ((c3 & 0xC0) != 0x80) { 37 | return false; 38 | } 39 | } else { 40 | return false; 41 | } 42 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 43 | if ((e - p) >= 3) { 44 | unsigned char c2 = *p++; 45 | unsigned char c3 = *p++; 46 | unsigned char c4 = *p++; 47 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 48 | * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is 49 | * greater than U+10FFFF, which is the highest legal codepoint */ 50 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 51 | return false; 52 | } else if ((c3 & 0xC0) != 0x80) { 53 | return false; 54 | } else if ((c4 & 0xC0) != 0x80) { 55 | return false; 56 | } 57 | } else { 58 | return false; 59 | } 60 | } else { 61 | return false; 62 | } 63 | } 64 | 65 | return true; 66 | } 67 | 68 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 69 | { 70 | lua_getglobal(L, "utf8"); 71 | lua_getfield(L, -1, "isvalid"); 72 | lua_pushlstring(L, (const char*)Data, Size); 73 | lua_call(L, 1, 1); 74 | 75 | assert(lua_isboolean(L, -1)); 76 | int was_valid = lua_toboolean(L, -1); 77 | if (was_valid) { 78 | assert(php_mbstring_check_utf8((unsigned char*)Data, Size)); 79 | } else { 80 | assert(!php_mbstring_check_utf8((unsigned char*)Data, Size)); 81 | } 82 | 83 | lua_settop(L, 0); // clear Lua stack 84 | 85 | return 0; 86 | } 87 | 88 | int LLVMFuzzerInitialize(int *argc, char ***argv) 89 | { 90 | L = luaL_newstate(); 91 | luaL_openlibs(L); 92 | lua_getglobal(L, "require"); 93 | lua_pushstring(L, "lua-utf8"); 94 | lua_call(L, 1, 1); 95 | lua_setglobal(L, "utf8"); 96 | return 0; 97 | } 98 | -------------------------------------------------------------------------------- /lutf8lib.c: -------------------------------------------------------------------------------- 1 | /* vim: set ft=c nu et sw=2 fdc=2 fdm=syntax : */ 2 | #define LUA_LIB 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "unidata.h" 14 | 15 | /* UTF-8 string operations */ 16 | 17 | #define UTF8_BUFFSZ 8 18 | #define UTF8_MAX 0x7FFFFFFFu 19 | #define UTF8_MAXCP 0x10FFFFu 20 | #define iscont(p) ((*(p) & 0xC0) == 0x80) 21 | #define CAST(tp,expr) ((tp)(expr)) 22 | 23 | #ifndef LUA_QL 24 | # define LUA_QL(x) "'" x "'" 25 | #endif 26 | 27 | static int utf8_invalid (utfint ch) 28 | { return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); } 29 | 30 | static size_t utf8_encode (char *buff, utfint x) { 31 | int n = 1; /* number of bytes put in buffer (backwards) */ 32 | lua_assert(x <= UTF8_MAX); 33 | if (x < 0x80) /* ascii? */ 34 | buff[UTF8_BUFFSZ - 1] = x & 0x7F; 35 | else { /* need continuation bytes */ 36 | utfint mfb = 0x3f; /* maximum that fits in first byte */ 37 | do { /* add continuation bytes */ 38 | buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f); 39 | x >>= 6; /* remove added bits */ 40 | mfb >>= 1; /* now there is one less bit available in first byte */ 41 | } while (x > mfb); /* still needs continuation byte? 
*/ 42 | buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF; /* add first byte */ 43 | } 44 | return n; 45 | } 46 | 47 | static const char *utf8_decode (const char *s, utfint *val, int strict) { 48 | static const utfint limits[] = 49 | {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u}; 50 | unsigned int c = (unsigned char)s[0]; 51 | utfint res = 0; /* final result */ 52 | if (c < 0x80) /* ascii? */ 53 | res = c; 54 | else { 55 | int count = 0; /* to count number of continuation bytes */ 56 | for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ 57 | unsigned int cc = (unsigned char)s[++count]; /* read next byte */ 58 | if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ 59 | return NULL; /* invalid byte sequence */ 60 | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ 61 | } 62 | res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ 63 | if (count > 5 || res > UTF8_MAX || res < limits[count]) 64 | return NULL; /* invalid byte sequence */ 65 | s += count; /* skip continuation bytes read */ 66 | } 67 | if (strict) { 68 | /* check for invalid code points; too large or surrogates */ 69 | if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu)) 70 | return NULL; 71 | } 72 | if (val) *val = res; 73 | return s + 1; /* +1 to include first byte */ 74 | } 75 | 76 | static const char *utf8_prev (const char *s, const char *e) { 77 | while (s < e && iscont(e - 1)) --e; 78 | return s < e ? e - 1 : s; 79 | } 80 | 81 | static const char *utf8_next (const char *s, const char *e) { 82 | while (s < e && iscont(s + 1)) ++s; 83 | return s < e ? s + 1 : e; 84 | } 85 | 86 | static size_t utf8_length (const char *s, const char *e) { 87 | size_t i; 88 | for (i = 0; s < e; ++i) 89 | s = utf8_next(s, e); 90 | return i; 91 | } 92 | 93 | static const char *utf8_offset (const char *s, const char *e, lua_Integer offset, lua_Integer idx) { 94 | const char *p = s + offset - 1; 95 | if (idx >= 0) { 96 | while (p < e && idx > 0) 97 | p = utf8_next(p, e), --idx; 98 | return idx == 0 ? p : NULL; 99 | } else { 100 | while (s < p && idx < 0) 101 | p = utf8_prev(s, p), ++idx; 102 | return idx == 0 ? p : NULL; 103 | } 104 | } 105 | 106 | static const char *utf8_relat (const char *s, const char *e, int idx) { 107 | return idx >= 0 ? 108 | utf8_offset(s, e, 1, idx - 1) : 109 | utf8_offset(s, e, e-s+1, idx); 110 | } 111 | 112 | static int utf8_range(const char *s, const char *e, lua_Integer *i, lua_Integer *j) { 113 | const char *ps = utf8_relat(s, e, CAST(int, *i)); 114 | const char *pe = utf8_relat(s, e, CAST(int, *j)); 115 | *i = (ps ? ps : (*i > 0 ? e : s)) - s; 116 | *j = (pe ? utf8_next(pe, e) : (*j > 0 ? 
e : s)) - s; 117 | return *i < *j; 118 | } 119 | 120 | /* Indexed by top nibble of first byte in code unit */ 121 | static uint8_t utf8_code_unit_len[] = { 122 | 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 3, 4 123 | }; 124 | 125 | /* Return pointer to first invalid UTF-8 sequence in 's', or NULL if valid */ 126 | static const char *utf8_invalid_offset(const char *s, const char *e) { 127 | while (s < e) { 128 | uint8_t c = *s; 129 | if (c >= 0x80) { 130 | /* c < 0xC0 means a continuation byte, but we are not in the middle of a multi-byte code unit 131 | * c >= 0xC0 && c < 0xC2 means an overlong 2-byte code unit 132 | * c >= 0xF8 means a 5-byte or 6-byte code unit, which is illegal, or else illegal byte 0xFE/0xFF 133 | * c >= 0xF5 && c < 0xF8 means a 4-byte code unit encoding invalid codepoint > U+10FFFF */ 134 | if (c < 0xC2 || c >= 0xF5) 135 | return s; 136 | uint8_t needed_bytes = utf8_code_unit_len[c >> 4]; 137 | if (e - s < needed_bytes) 138 | return s; /* String is truncated */ 139 | uint8_t c2 = *(s+1); 140 | if ((c2 & 0xC0) != 0x80) 141 | return s; /* 2nd byte of code unit is not a continuation byte */ 142 | if (needed_bytes >= 3) { 143 | uint8_t c3 = *(s+2); 144 | if ((c3 & 0xC0) != 0x80) 145 | return s; /* 3rd byte of code unit is not a continuation byte */ 146 | if (needed_bytes == 3) { 147 | if (c == 0xE0 && c2 < 0xA0) 148 | return s; /* Overlong 3-byte code unit */ 149 | if (c == 0xED && c2 >= 0xA0) 150 | return s; /* Reserved codepoint from U+D800-U+DFFF */ 151 | } else { 152 | uint8_t c4 = *(s+3); 153 | if ((c4 & 0xC0) != 0x80) 154 | return s; /* 4th byte of code unit is not a continuation byte */ 155 | if (c == 0xF0 && c2 < 0x90) 156 | return s; /* Overlong 4-byte code unit */ 157 | if (c == 0xF4 && c2 >= 0x90) 158 | return s; /* Illegal codepoint > U+10FFFF */ 159 | } 160 | } 161 | s += needed_bytes; 162 | } else { 163 | s++; 164 | } 165 | } 166 | return NULL; 167 | } 168 | 169 | /* Unicode character categories */ 170 | 171 | #define table_size(t) (sizeof(t)/sizeof((t)[0])) 172 | 173 | #define utf8_categories(X) \ 174 | X('a', alpha) \ 175 | X('c', cntrl) \ 176 | X('d', digit) \ 177 | X('l', lower) \ 178 | X('p', punct) \ 179 | X('s', space) \ 180 | X('t', compose) \ 181 | X('u', upper) \ 182 | X('x', xdigit) 183 | 184 | #define utf8_converters(X) \ 185 | X(lower) \ 186 | X(upper) \ 187 | X(title) \ 188 | X(fold) 189 | 190 | static int find_in_range (range_table *t, size_t size, utfint ch) { 191 | size_t begin, end; 192 | 193 | begin = 0; 194 | end = size; 195 | 196 | while (begin < end) { 197 | size_t mid = (begin + end) / 2; 198 | if (t[mid].last < ch) 199 | begin = mid + 1; 200 | else if (t[mid].first > ch) 201 | end = mid; 202 | else 203 | return (ch - t[mid].first) % t[mid].step == 0; 204 | } 205 | 206 | return 0; 207 | } 208 | 209 | static int convert_char (conv_table *t, size_t size, utfint ch) { 210 | size_t begin, end; 211 | 212 | begin = 0; 213 | end = size; 214 | 215 | while (begin < end) { 216 | size_t mid = (begin + end) / 2; 217 | if (t[mid].last < ch) 218 | begin = mid + 1; 219 | else if (t[mid].first > ch) 220 | end = mid; 221 | else if ((ch - t[mid].first) % t[mid].step == 0) 222 | return ch + t[mid].offset; 223 | else 224 | return ch; 225 | } 226 | 227 | return ch; 228 | } 229 | 230 | /* Normalization */ 231 | 232 | static int lookup_canon_cls (utfint ch) { 233 | /* The first codepoint with canonicalization class != 0 is U+0300 COMBINING GRAVE ACCENT */ 234 | if (ch < 0x300) { 235 | return 0; 236 | } 237 | size_t begin = 0, end = 
table_size(nfc_combining_table); 238 | 239 | while (begin < end) { 240 | size_t mid = (begin + end) / 2; 241 | if (nfc_combining_table[mid].last < ch) 242 | begin = mid + 1; 243 | else if (nfc_combining_table[mid].first > ch) 244 | end = mid; 245 | else 246 | return nfc_combining_table[mid].canon_cls; 247 | } 248 | 249 | return 0; 250 | } 251 | 252 | static nfc_table *nfc_quickcheck (utfint ch) { 253 | /* The first character which needs to be checked for possible NFC violations 254 | * is U+0300 COMBINING GRAVE ACCENT */ 255 | if (ch < 0x300) { 256 | return NULL; 257 | } 258 | size_t begin = 0, end = table_size(nfc_quickcheck_table); 259 | 260 | while (begin < end) { 261 | size_t mid = (begin + end) / 2; 262 | utfint found = nfc_quickcheck_table[mid].cp; 263 | if (found < ch) 264 | begin = mid + 1; 265 | else if (found > ch) 266 | end = mid; 267 | else 268 | return &nfc_quickcheck_table[mid]; 269 | } 270 | 271 | return NULL; 272 | } 273 | 274 | static int nfc_combine (utfint cp1, utfint cp2, utfint *dest) { 275 | size_t begin = 0, end = table_size(nfc_composite_table); 276 | unsigned int hash = (cp1 * 213) + cp2; 277 | 278 | while (begin < end) { 279 | size_t mid = (begin + end) / 2; 280 | utfint val = nfc_composite_table[mid].hash; 281 | if (val < hash) { 282 | begin = mid + 1; 283 | } else if (val > hash) { 284 | end = mid; 285 | } else if (nfc_composite_table[mid].cp1 == cp1 && nfc_composite_table[mid].cp2 == cp2) { 286 | if (dest) 287 | *dest = nfc_composite_table[mid].dest; 288 | return 1; 289 | } else { 290 | return 0; 291 | } 292 | } 293 | 294 | return 0; 295 | } 296 | 297 | static decompose_table *nfc_decompose (utfint ch) { 298 | size_t begin = 0, end = table_size(nfc_decompose_table); 299 | 300 | while (begin < end) { 301 | size_t mid = (begin + end) / 2; 302 | utfint found = nfc_decompose_table[mid].cp; 303 | if (found < ch) 304 | begin = mid + 1; 305 | else if (found > ch) 306 | end = mid; 307 | else 308 | return &nfc_decompose_table[mid]; 309 | } 310 | 311 | return NULL; 312 | } 313 | 314 | static int nfc_check (utfint ch, nfc_table *entry, utfint starter, unsigned int canon_cls, unsigned int prev_canon_cls) { 315 | int reason = entry->reason; 316 | 317 | if (reason == REASON_MUST_CONVERT_1 || reason == REASON_MUST_CONVERT_2) { 318 | /* This codepoint has a different, canonical form, so this string is not NFC */ 319 | return 0; 320 | } else if (reason == REASON_STARTER_CAN_COMBINE) { 321 | /* It is possible that this 'starter' codepoint should have been combined with the 322 | * preceding 'starter' codepoint; if so, this string is not NFC */ 323 | if (!prev_canon_cls && nfc_combine(starter, ch, NULL)) { 324 | /* These codepoints should have been combined */ 325 | return 0; 326 | } 327 | } else if (reason == REASON_COMBINING_MARK) { 328 | /* Combining mark; check if it should have been combined with preceding starter codepoint */ 329 | if (canon_cls <= prev_canon_cls) { 330 | return 1; 331 | } 332 | if (nfc_combine(starter, ch, NULL)) { 333 | /* Yes, they should have been combined. This string is not NFC */ 334 | return 0; 335 | } 336 | /* Could it be that preceding 'starter' codepoint is already combined, but with a 337 | * combining mark which is out of order with this one? 
*/ 338 | decompose_table *decomp = nfc_decompose(starter); 339 | if (decomp) { 340 | if (decomp->canon_cls2 > canon_cls && nfc_combine(decomp->to1, ch, NULL)) { 341 | return 0; 342 | } else { 343 | decompose_table *decomp2 = nfc_decompose(decomp->to1); 344 | if (decomp2 && decomp2->canon_cls2 > canon_cls && nfc_combine(decomp2->to1, ch, NULL)) { 345 | return 0; 346 | } 347 | } 348 | } 349 | } else if (reason == REASON_JAMO_VOWEL) { 350 | if (!prev_canon_cls && starter >= 0x1100 && starter <= 0x1112) { 351 | /* Preceding codepoint was a leading jamo; they should have been combined */ 352 | return 0; 353 | } 354 | } else if (reason == REASON_JAMO_TRAILING) { 355 | if (!prev_canon_cls && starter >= 0xAC00 && starter <= 0xD7A3) { 356 | /* Preceding codepoint was a precomposed Hangul syllable; check if it had no trailing jamo */ 357 | if ((starter - 0xAC00) % 28 == 0) { 358 | /* It didn't have a trailing jamo, so this trailing jamo should have been combined */ 359 | return 0; 360 | } 361 | } 362 | } 363 | 364 | return 1; 365 | } 366 | 367 | static void merge_combining_marks (uint32_t *src1, uint32_t *src2, uint32_t *dest, size_t size1, size_t size2) { 368 | while (size1 && size2) { 369 | if ((*src1 & 0xFF) > (*src2 & 0xFF)) { 370 | *dest++ = *src2++; 371 | size2--; 372 | } else { 373 | *dest++ = *src1++; 374 | size1--; 375 | } 376 | } 377 | while (size1) { 378 | *dest++ = *src1++; 379 | size1--; 380 | } 381 | while (size2) { 382 | *dest++ = *src2++; 383 | size2--; 384 | } 385 | } 386 | 387 | static void stable_sort_combining_marks (uint32_t *vector, uint32_t *scratch, size_t size) { 388 | /* We need to use a stable sort for sorting combining marks which are in the wrong order 389 | * when doing NFC normalization; bottom-up merge sort is fast and stable */ 390 | size_t limit = size - 1; 391 | for (unsigned int i = 0; i < limit; i += 2) { 392 | if ((vector[i] & 0xFF) > (vector[i+1] & 0xFF)) { 393 | uint32_t temp = vector[i]; 394 | vector[i] = vector[i+1]; 395 | vector[i+1] = temp; 396 | } 397 | } 398 | if (size <= 2) 399 | return; 400 | 401 | uint32_t *src = vector, *dest = scratch; 402 | unsigned int runsize = 2; /* Every consecutive slice of this size is sorted */ 403 | while (runsize < size) { 404 | unsigned int blocksize = runsize * 2; /* We will now sort slices of this size */ 405 | limit = size & ~(blocksize - 1); 406 | for (unsigned int i = 0; i < limit; i += blocksize) 407 | merge_combining_marks(&src[i], &src[i+runsize], &dest[i], runsize, runsize); 408 | if (size - limit > runsize) { 409 | merge_combining_marks(&src[limit], &src[limit+runsize], &dest[limit], runsize, size - limit - runsize); 410 | } else { 411 | memcpy(&dest[limit], &src[limit], (size - limit) * sizeof(uint32_t)); 412 | } 413 | /* After each series of (progressively larger) merges, we swap src & dest to 414 | * avoid memcpy'ing the partially sorted results from dest back into src */ 415 | uint32_t *temp = src; src = dest; dest = temp; 416 | runsize = blocksize; 417 | } 418 | 419 | if (dest == vector) { 420 | /* Since src & dest are swapped on each iteration of the above loop, 421 | * this actually means the last buffer which was written into 422 | * was 'scratch' */ 423 | memcpy(vector, scratch, size * sizeof(uint32_t)); 424 | } 425 | } 426 | 427 | /* Shuffle item `i` up or down to get it into the right position */ 428 | static void stable_insert_combining_mark (uint32_t *vector, size_t vec_size, unsigned int i) 429 | { 430 | unsigned int item = vector[i]; 431 | unsigned int canon_cls = item & 0xFF; 432 | if (i > 0) { 
433 | if (canon_cls < (vector[i-1] & 0xFF)) { 434 | do { 435 | vector[i] = vector[i-1]; 436 | i--; 437 | } while (i > 0 && canon_cls < (vector[i-1] & 0xFF)); 438 | vector[i] = item; 439 | return; 440 | } 441 | } 442 | if (i < vec_size-1) { 443 | if (canon_cls > (vector[i+1] & 0xFF)) { 444 | do { 445 | vector[i] = vector[i+1]; 446 | i++; 447 | } while (i < vec_size-1 && canon_cls > (vector[i+1] & 0xFF)); 448 | vector[i] = item; 449 | return; 450 | } 451 | } 452 | } 453 | 454 | static void add_utf8char (luaL_Buffer *b, utfint ch); 455 | 456 | static inline void grow_vector_if_needed (uint32_t **vector, uint32_t *onstack, size_t *size, size_t needed) 457 | { 458 | size_t current_size = *size; 459 | if (needed >= current_size) { 460 | size_t new_size = current_size * 2; /* `needed` is never bigger than `current_size * 2` */ 461 | uint32_t *new_vector = malloc(new_size * sizeof(uint32_t)); 462 | memcpy(new_vector, *vector, current_size * sizeof(uint32_t)); 463 | *size = new_size; 464 | if (*vector != onstack) 465 | free(*vector); 466 | *vector = new_vector; 467 | } 468 | } 469 | 470 | static void string_to_nfc (lua_State *L, luaL_Buffer *buff, const char *s, const char *e) 471 | { 472 | /* Converting a string to Normal Form C involves: 473 | * 1) Ensuring that codepoints with "built-in" accents are used whenever possible 474 | * rather than separate codepoints for a base character and combining mark 475 | * 2) Where combining marks must be used, putting them into canonical order 476 | * 3) Converting some deprecated codepoints to the recommended variant 477 | * 4) Ensuring that Korean Hangul are represented as precomposed syllable 478 | * codepoints whenever possible, rather than sequences of Jamo codepoints 479 | * 480 | * (Combining marks are accents which appear on top of or below the preceding 481 | * character. Starter codepoints are the base characters which combining marks can 482 | * 'combine' with. Almost all codepoints are starters, including all the Latin alphabet. 483 | * Every Unicode codepoint has a numeric 'canonicalization class'; starters have class = 0. 484 | * Combining marks must be sorted in order of their canonicalization class. Since the 485 | * canonicalization class numbers are not unique, the sort must be stable.) 486 | * 487 | * When converting to NFC, the largest scope which we need to work on at once 488 | * consists of a 'starter' codepoint and either 1 or more ensuing combining marks, 489 | * OR else a directly following starter codepoint. 490 | * 491 | * As we walk through the string, whenever we pass by a complete sequence of starter + 492 | * combining marks or starter + starter, we process that sequence to see if it is NFC or not. 493 | * If it is, we memcpy the bytes verbatim into the output buffer. If it is not, then we 494 | * convert the codepoints to NFC and then emit those codepoints as UTF-8 bytes. */ 495 | 496 | utfint starter = -1, ch; /* 'starter' is last starter codepoint seen */ 497 | const char *to_copy = s; /* pointer to next bytes we might need to memcpy into output buffer */ 498 | unsigned int prev_canon_cls = 0, canon_cls = 0; 499 | int fixedup = 0; /* has the sequence currently under consideration been modified to make it NFC? 
*/ 500 | 501 | /* Temporary storage for a sequence of consecutive combining marks 502 | * In the vast majority of cases, this small on-stack array will provide enough 503 | * space; if not, we will switch to a malloc'd buffer */ 504 | uint32_t onstack[8]; 505 | size_t vec_size = 0, vec_max = sizeof(onstack)/sizeof(uint32_t); 506 | uint32_t *vector = onstack; 507 | 508 | while (s < e) { 509 | const char *new_s = utf8_decode(s, &ch, 1); 510 | if (new_s == NULL) { 511 | if (vector != onstack) 512 | free(vector); 513 | lua_pushstring(L, "string is not valid UTF-8"); 514 | lua_error(L); 515 | } 516 | unsigned int canon_cls = lookup_canon_cls(ch); 517 | 518 | if (!canon_cls) { 519 | /* This is a starter codepoint */ 520 | nfc_table *entry = nfc_quickcheck(ch); 521 | 522 | /* But in rare cases, a deprecated 'starter' codepoint may convert 523 | * to combining marks instead! 524 | * Why, oh why, did the Unicode Consortium do this?? */ 525 | if (entry && entry->reason == REASON_MUST_CONVERT_2) { 526 | utfint conv1 = entry->data1; 527 | unsigned int canon_cls1 = lookup_canon_cls(conv1); 528 | if (canon_cls1) { 529 | utfint conv2 = entry->data2; 530 | unsigned int canon_cls2 = lookup_canon_cls(conv2); 531 | grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 2); 532 | vector[vec_size++] = (conv1 << 8) | (canon_cls1 & 0xFF); 533 | vector[vec_size++] = (conv2 << 8) | (canon_cls2 & 0xFF); 534 | s = new_s; 535 | prev_canon_cls = canon_cls2; 536 | fixedup = 1; 537 | continue; 538 | } 539 | } 540 | 541 | /* Handle preceding starter and optional sequence of combining marks which may have followed it */ 542 | if (prev_canon_cls) { 543 | /* Before this starter, there was a sequence of combining marks. 544 | * Check those over and emit output to 'buff' */ 545 | process_combining_marks: 546 | 547 | /* Check if accumulated combining marks were in correct order */ 548 | for (unsigned int i = 1; i < vec_size; i++) { 549 | if ((vector[i-1] & 0xFF) > (vector[i] & 0xFF)) { 550 | /* Order is incorrect, we need to sort */ 551 | uint32_t *scratch = malloc(vec_size * sizeof(uint32_t)); 552 | stable_sort_combining_marks(vector, scratch, vec_size); 553 | free(scratch); 554 | fixedup = 1; 555 | break; 556 | } 557 | } 558 | 559 | /* Check if any of those combining marks are in violation of NFC */ 560 | unsigned int i = 0; 561 | while (i < vec_size) { 562 | utfint combine_mark = vector[i] >> 8; 563 | nfc_table *mark_entry = nfc_quickcheck(combine_mark); 564 | if (mark_entry) { 565 | if (mark_entry->reason == REASON_MUST_CONVERT_1) { 566 | /* This combining mark must be converted to a different one */ 567 | vector[i] = (mark_entry->data1 << 8) | mark_entry->data2; 568 | fixedup = 1; 569 | continue; 570 | } else if (mark_entry->reason == REASON_MUST_CONVERT_2) { 571 | /* This combining mark must be converted to two others */ 572 | grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1); 573 | memmove(&vector[i+2], &vector[i+1], sizeof(uint32_t) * (vec_size - i - 1)); 574 | vector[i] = (mark_entry->data1 << 8) | lookup_canon_cls(mark_entry->data1); 575 | vector[i+1] = (mark_entry->data2 << 8) | lookup_canon_cls(mark_entry->data2); 576 | vec_size++; 577 | fixedup = 1; 578 | continue; 579 | } else if (mark_entry->reason == REASON_COMBINING_MARK) { 580 | unsigned int mark_canon_cls = vector[i] & 0xFF; 581 | if (i == 0 || mark_canon_cls > (vector[i-1] & 0xFF)) { 582 | if (nfc_combine(starter, combine_mark, &starter)) { 583 | /* This combining mark must be combined with preceding starter */ 584 | vec_size--; 585 | 
memmove(&vector[i], &vector[i+1], sizeof(uint32_t) * (vec_size - i)); /* Remove element i */ 586 | fixedup = 1; 587 | continue; 588 | } 589 | 590 | decompose_table *decomp = nfc_decompose(starter); 591 | if (decomp) { 592 | if (decomp->canon_cls2 > mark_canon_cls && nfc_combine(decomp->to1, combine_mark, &starter)) { 593 | /* The preceding starter already included an accent, but when represented as a combining 594 | * mark, that accent has a HIGHER canonicalization class than this one 595 | * Further, this one is able to combine with the same base character 596 | * In other words, the base character was wrongly combined with a "lower-priority" 597 | * combining mark; fix that up */ 598 | unsigned int class2 = lookup_canon_cls(decomp->to2); 599 | memmove(&vector[1], &vector[0], sizeof(uint32_t) * i); 600 | vector[0] = (decomp->to2 << 8) | class2; 601 | stable_insert_combining_mark(vector, vec_size, 0); 602 | fixedup = 1; 603 | continue; 604 | } else { 605 | decompose_table *decomp2 = nfc_decompose(decomp->to1); 606 | if (decomp2 && decomp2->canon_cls2 > mark_canon_cls && nfc_combine(decomp2->to1, combine_mark, &starter)) { 607 | grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1); 608 | memmove(&vector[i+2], &vector[i+1], sizeof(uint32_t) * (vec_size - i - 1)); 609 | memmove(&vector[2], &vector[0], sizeof(uint32_t) * i); 610 | vector[0] = (decomp2->to2 << 8) | lookup_canon_cls(decomp2->to2); 611 | vector[1] = (decomp->to2 << 8) | lookup_canon_cls(decomp->to2); 612 | vec_size++; 613 | stable_insert_combining_mark(vector, vec_size, 1); 614 | stable_insert_combining_mark(vector, vec_size, 0); 615 | fixedup = 1; 616 | continue; 617 | } 618 | } 619 | } 620 | } 621 | } 622 | } 623 | i++; 624 | } 625 | 626 | if (fixedup) { 627 | /* The preceding starter/combining mark sequence was bad; convert fixed-up codepoints 628 | * to UTF-8 bytes */ 629 | if (starter != -1) 630 | add_utf8char(buff, starter); 631 | for (unsigned int i = 0; i < vec_size; i++) 632 | add_utf8char(buff, vector[i] >> 8); 633 | } else { 634 | /* The preceding starter/combining mark sequence was good; copy raw bytes to output */ 635 | luaL_addlstring(buff, to_copy, s - to_copy); 636 | } 637 | if (s >= e) { 638 | /* We jumped in to the middle of the main loop to finish processing trailing 639 | * combining marks... 
we are actually done now */ 640 | if (vector != onstack) 641 | free(vector); 642 | return; 643 | } 644 | vec_size = 0; /* Clear vector of combining marks in readiness for next such sequence */ 645 | fixedup = 0; 646 | } else if (starter != -1) { 647 | /* This starter was preceded immediately by another starter 648 | * Check if this one should combine with it */ 649 | fixedup = 0; 650 | if (entry) { 651 | if (entry->reason == REASON_STARTER_CAN_COMBINE && nfc_combine(starter, ch, &ch)) { 652 | fixedup = 1; 653 | } else if (entry->reason == REASON_JAMO_VOWEL && starter >= 0x1100 && starter <= 0x1112) { 654 | ch = 0xAC00 + ((starter - 0x1100) * 588) + ((ch - 0x1161) * 28); 655 | fixedup = 1; 656 | } else if (entry->reason == REASON_JAMO_TRAILING) { 657 | if (starter >= 0xAC00 && starter <= 0xD7A3 && (starter - 0xAC00) % 28 == 0) { 658 | ch = starter + ch - 0x11A7; 659 | fixedup = 1; 660 | } 661 | } 662 | } 663 | if (!fixedup) 664 | add_utf8char(buff, starter); /* Emit previous starter to output */ 665 | } 666 | starter = ch; 667 | to_copy = s; 668 | 669 | /* We are finished processing the preceding starter and optional sequence of combining marks 670 | * Now check if this (possibly deprecated) starter needs to be converted to a canonical variant */ 671 | if (entry) { 672 | if (entry->reason == REASON_MUST_CONVERT_1) { 673 | starter = entry->data1; 674 | fixedup = 1; 675 | } else if (entry->reason == REASON_MUST_CONVERT_2) { 676 | utfint conv1 = entry->data1; 677 | utfint conv2 = entry->data2; 678 | /* It's possible that 'ch' might convert to two other codepoints, 679 | * where the 2nd one is a combining mark */ 680 | unsigned int canon_cls2 = lookup_canon_cls(conv2); 681 | if (canon_cls2) { 682 | /* It's possible that the 1st resulting codepoint may need to be 683 | * split again into more codepoints */ 684 | nfc_table *conv_entry = nfc_quickcheck(conv1); 685 | if (conv_entry && conv_entry->reason == REASON_MUST_CONVERT_2) { 686 | utfint conv3 = conv2; 687 | unsigned int canon_cls3 = canon_cls2; 688 | conv1 = conv_entry->data1; 689 | conv2 = conv_entry->data2; 690 | canon_cls2 = lookup_canon_cls(conv2); 691 | if (canon_cls2) { 692 | starter = conv1; 693 | vector[0] = (conv2 << 8) | canon_cls2; 694 | vector[1] = (conv3 << 8) | canon_cls3; 695 | vec_size = 2; 696 | } else { 697 | add_utf8char(buff, conv1); 698 | starter = conv2; 699 | vector[0] = (conv3 << 8) | canon_cls3; 700 | vec_size = 1; 701 | } 702 | canon_cls = canon_cls3; 703 | } else { 704 | starter = conv1; 705 | vector[0] = (conv2 << 8) | canon_cls2; 706 | vec_size = 1; 707 | canon_cls = canon_cls2; 708 | } 709 | } else { 710 | add_utf8char(buff, conv1); 711 | starter = conv2; 712 | } 713 | fixedup = 1; 714 | } 715 | } 716 | } else { 717 | /* Accumulate combining marks in vector */ 718 | grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1); 719 | vector[vec_size++] = (ch << 8) | (canon_cls & 0xFF); 720 | } 721 | 722 | s = new_s; 723 | prev_canon_cls = canon_cls; 724 | } 725 | 726 | if (vec_size) 727 | goto process_combining_marks; /* Finish processing trailing combining marks */ 728 | if (starter != -1) 729 | add_utf8char(buff, starter); 730 | 731 | if (vector != onstack) 732 | free(vector); 733 | } 734 | 735 | /* Grapheme cluster support */ 736 | 737 | static int hangul_type (utfint ch) { 738 | /* The first Hangul codepoint is U+1100 */ 739 | if (ch < 0x1100) { 740 | return 0; 741 | } 742 | size_t begin = 0, end = table_size(hangul_table); 743 | 744 | while (begin < end) { 745 | size_t mid = (begin + end) / 2; 746 | if 
(hangul_table[mid].last < ch) 747 | begin = mid + 1; 748 | else if (hangul_table[mid].first > ch) 749 | end = mid; 750 | else 751 | return hangul_table[mid].type; 752 | } 753 | 754 | return 0; 755 | } 756 | 757 | static int indic_conjunct_type (utfint ch) { 758 | /* The first Indic conjunct codepoint is U+0300 */ 759 | if (ch < 0x300) { 760 | return 0; 761 | } 762 | size_t begin = 0, end = table_size(indic_table); 763 | 764 | while (begin < end) { 765 | size_t mid = (begin + end) / 2; 766 | if (indic_table[mid].last < ch) 767 | begin = mid + 1; 768 | else if (indic_table[mid].first > ch) 769 | end = mid; 770 | else 771 | return indic_table[mid].type; 772 | } 773 | 774 | return 0; 775 | } 776 | 777 | #define define_category(cls, name) static int utf8_is##name (utfint ch)\ 778 | { return find_in_range(name##_table, table_size(name##_table), ch); } 779 | #define define_converter(name) static utfint utf8_to##name (utfint ch) \ 780 | { return convert_char(to##name##_table, table_size(to##name##_table), ch); } 781 | utf8_categories(define_category) 782 | utf8_converters(define_converter) 783 | #undef define_category 784 | #undef define_converter 785 | 786 | static int utf8_isgraph (utfint ch) { 787 | if (find_in_range(space_table, table_size(space_table), ch)) 788 | return 0; 789 | if (find_in_range(graph_table, table_size(graph_table), ch)) 790 | return 1; 791 | if (find_in_range(compose_table, table_size(compose_table), ch)) 792 | return 1; 793 | return 0; 794 | } 795 | 796 | static int utf8_isalnum (utfint ch) { 797 | if (find_in_range(alpha_table, table_size(alpha_table), ch)) 798 | return 1; 799 | if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch)) 800 | return 1; 801 | return 0; 802 | } 803 | 804 | static int utf8_width (utfint ch, int ambi_is_single) { 805 | if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch)) 806 | return 2; 807 | if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch)) 808 | return ambi_is_single ? 
1 : 2; 809 | if (find_in_range(compose_table, table_size(compose_table), ch)) 810 | return 0; 811 | if (find_in_range(unprintable_table, table_size(unprintable_table), ch)) 812 | return 0; 813 | return 1; 814 | } 815 | 816 | /* string module compatible interface */ 817 | 818 | static int typeerror (lua_State *L, int idx, const char *tname) 819 | { return luaL_error(L, "%s expected, got %s", tname, luaL_typename(L, idx)); } 820 | 821 | static const char *check_utf8 (lua_State *L, int idx, const char **end) { 822 | size_t len; 823 | const char *s = luaL_checklstring(L, idx, &len); 824 | if (end) *end = s+len; 825 | return s; 826 | } 827 | 828 | static const char *to_utf8 (lua_State *L, int idx, const char **end) { 829 | size_t len; 830 | const char *s = lua_tolstring(L, idx, &len); 831 | if (end) *end = s+len; 832 | return s; 833 | } 834 | 835 | static const char *utf8_safe_decode (lua_State *L, const char *p, utfint *pval) { 836 | p = utf8_decode(p, pval, 0); 837 | if (p == NULL) luaL_error(L, "invalid UTF-8 code"); 838 | return p; 839 | } 840 | 841 | static void add_utf8char (luaL_Buffer *b, utfint ch) { 842 | char buff[UTF8_BUFFSZ]; 843 | size_t n = utf8_encode(buff, ch); 844 | luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n); 845 | } 846 | 847 | static lua_Integer byte_relat (lua_Integer pos, size_t len) { 848 | if (pos >= 0) return pos; 849 | else if (0u - (size_t)pos > len) return 0; 850 | else return (lua_Integer)len + pos + 1; 851 | } 852 | 853 | static int Lutf8_len (lua_State *L) { 854 | size_t len, n; 855 | const char *s = luaL_checklstring(L, 1, &len), *p, *e; 856 | lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); 857 | lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len); 858 | int lax = lua_toboolean(L, 4); 859 | luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, 860 | "initial position out of string"); 861 | luaL_argcheck(L, --pose < (lua_Integer)len, 3, 862 | "final position out of string"); 863 | for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) { 864 | if (lax) 865 | p = utf8_next(p, e); 866 | else { 867 | utfint ch; 868 | const char *np = utf8_decode(p, &ch, !lax); 869 | if (np == NULL || utf8_invalid(ch)) { 870 | lua_pushnil(L); 871 | lua_pushinteger(L, p - s + 1); 872 | return 2; 873 | } 874 | p = np; 875 | } 876 | } 877 | lua_pushinteger(L, n); 878 | return 1; 879 | } 880 | 881 | static int Lutf8_sub (lua_State *L) { 882 | const char *e, *s = check_utf8(L, 1, &e); 883 | lua_Integer posi = luaL_checkinteger(L, 2); 884 | lua_Integer pose = luaL_optinteger(L, 3, -1); 885 | if (utf8_range(s, e, &posi, &pose)) 886 | lua_pushlstring(L, s+posi, pose-posi); 887 | else 888 | lua_pushliteral(L, ""); 889 | return 1; 890 | } 891 | 892 | static int Lutf8_reverse (lua_State *L) { 893 | luaL_Buffer b; 894 | const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e); 895 | (void) ends; 896 | int lax = lua_toboolean(L, 2); 897 | luaL_buffinit(L, &b); 898 | if (lax) { 899 | for (prev = e; s < prev; e = prev) { 900 | prev = utf8_prev(s, prev); 901 | luaL_addlstring(&b, prev, e-prev); 902 | } 903 | } else { 904 | for (prev = e; s < prev; prev = pprev) { 905 | utfint code = 0; 906 | ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code); 907 | assert(ends == prev); 908 | if (utf8_invalid(code)) 909 | return luaL_error(L, "invalid UTF-8 code"); 910 | if (!utf8_iscompose(code)) { 911 | luaL_addlstring(&b, pprev, e-pprev); 912 | e = pprev; 913 | } 914 | } 915 | } 916 | luaL_pushresult(&b); 917 | return 1; 918 | } 919 | 920 | static int Lutf8_byte (lua_State *L) 
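/* Usage sketch from the Lua side (illustrative): mirrors string.byte but yields Unicode
 * code points rather than raw bytes, e.g. utf8.byte("\xE4\xB8\xAD") --> 0x4E2D. */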
{ 921 | size_t n = 0; 922 | const char *e, *s = check_utf8(L, 1, &e); 923 | lua_Integer posi = luaL_optinteger(L, 2, 1); 924 | lua_Integer pose = luaL_optinteger(L, 3, posi); 925 | if (utf8_range(s, e, &posi, &pose)) { 926 | for (e = s + pose, s = s + posi; s < e; ++n) { 927 | utfint ch = 0; 928 | s = utf8_safe_decode(L, s, &ch); 929 | lua_pushinteger(L, ch); 930 | } 931 | } 932 | return CAST(int, n); 933 | } 934 | 935 | static int Lutf8_codepoint (lua_State *L) { 936 | const char *e, *s = check_utf8(L, 1, &e); 937 | size_t len = e-s; 938 | lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); 939 | lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len); 940 | int lax = lua_toboolean(L, 4); 941 | int n; 942 | const char *se; 943 | luaL_argcheck(L, posi >= 1, 2, "out of range"); 944 | luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range"); 945 | if (posi > pose) return 0; /* empty interval; return no values */ 946 | if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ 947 | return luaL_error(L, "string slice too long"); 948 | n = (int)(pose - posi + 1); 949 | luaL_checkstack(L, n, "string slice too long"); 950 | n = 0; /* count the number of returns */ 951 | se = s + pose; /* string end */ 952 | for (n = 0, s += posi - 1; s < se;) { 953 | utfint code = 0; 954 | s = utf8_safe_decode(L, s, &code); 955 | if (!lax && utf8_invalid(code)) 956 | return luaL_error(L, "invalid UTF-8 code"); 957 | lua_pushinteger(L, code); 958 | n++; 959 | } 960 | return n; 961 | } 962 | 963 | static int Lutf8_char (lua_State *L) { 964 | int i, n = lua_gettop(L); /* number of arguments */ 965 | luaL_Buffer b; 966 | luaL_buffinit(L, &b); 967 | for (i = 1; i <= n; ++i) { 968 | lua_Integer code = luaL_checkinteger(L, i); 969 | luaL_argcheck(L, code <= UTF8_MAXCP, i, "value out of range"); 970 | add_utf8char(&b, CAST(utfint, code)); 971 | } 972 | luaL_pushresult(&b); 973 | return 1; 974 | } 975 | 976 | #define bind_converter(name) \ 977 | static int Lutf8_##name (lua_State *L) { \ 978 | int t = lua_type(L, 1); \ 979 | if (t == LUA_TNUMBER) \ 980 | lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \ 981 | else if (t == LUA_TSTRING) { \ 982 | luaL_Buffer b; \ 983 | const char *e, *s = to_utf8(L, 1, &e); \ 984 | luaL_buffinit(L, &b); \ 985 | while (s < e) { \ 986 | utfint ch = 0; \ 987 | s = utf8_safe_decode(L, s, &ch); \ 988 | add_utf8char(&b, utf8_to##name(ch)); \ 989 | } \ 990 | luaL_pushresult(&b); \ 991 | } \ 992 | else return typeerror(L, 1, "number/string"); \ 993 | return 1; \ 994 | } 995 | utf8_converters(bind_converter) 996 | #undef bind_converter 997 | 998 | 999 | /* unicode extra interface */ 1000 | 1001 | static const char *parse_escape (lua_State *L, const char *s, const char *e, int hex, utfint *pch) { 1002 | utfint code = 0; 1003 | int in_bracket = 0; 1004 | if (*s == '{') ++s, in_bracket = 1; 1005 | for (; s < e; ++s) { 1006 | utfint ch = (unsigned char)*s; 1007 | if (ch >= '0' && ch <= '9') ch = ch - '0'; 1008 | else if (hex && ch >= 'A' && ch <= 'F') ch = 10 + (ch - 'A'); 1009 | else if (hex && ch >= 'a' && ch <= 'f') ch = 10 + (ch - 'a'); 1010 | else if (!in_bracket) break; 1011 | else if (ch == '}') { ++s; break; } 1012 | else luaL_error(L, "invalid escape '%c'", ch); 1013 | code *= hex ? 
16 : 10; 1014 | code += ch; 1015 | } 1016 | *pch = code; 1017 | return s; 1018 | } 1019 | 1020 | static int Lutf8_escape (lua_State *L) { 1021 | const char *e, *s = check_utf8(L, 1, &e); 1022 | luaL_Buffer b; 1023 | luaL_buffinit(L, &b); 1024 | while (s < e) { 1025 | utfint ch = 0; 1026 | s = utf8_safe_decode(L, s, &ch); 1027 | if (ch == '%') { 1028 | int hex = 0; 1029 | switch (*s) { 1030 | case '0': case '1': case '2': case '3': 1031 | case '4': case '5': case '6': case '7': 1032 | case '8': case '9': case '{': 1033 | break; 1034 | case 'x': case 'X': hex = 1; /* fall through */ 1035 | case 'u': case 'U': if (s+1 < e) { ++s; break; } 1036 | /* fall through */ 1037 | default: 1038 | s = utf8_safe_decode(L, s, &ch); 1039 | goto next; 1040 | } 1041 | s = parse_escape(L, s, e, hex, &ch); 1042 | } 1043 | next: 1044 | add_utf8char(&b, ch); 1045 | } 1046 | luaL_pushresult(&b); 1047 | return 1; 1048 | } 1049 | 1050 | static int Lutf8_insert (lua_State *L) { 1051 | const char *e, *s = check_utf8(L, 1, &e); 1052 | size_t sublen; 1053 | const char *subs; 1054 | luaL_Buffer b; 1055 | int nargs = 2; 1056 | const char *first = e; 1057 | if (lua_type(L, 2) == LUA_TNUMBER) { 1058 | int idx = (int)lua_tointeger(L, 2); 1059 | if (idx != 0) first = utf8_relat(s, e, idx); 1060 | luaL_argcheck(L, first, 2, "invalid index"); 1061 | ++nargs; 1062 | } 1063 | subs = luaL_checklstring(L, nargs, &sublen); 1064 | luaL_buffinit(L, &b); 1065 | luaL_addlstring(&b, s, first-s); 1066 | luaL_addlstring(&b, subs, sublen); 1067 | luaL_addlstring(&b, first, e-first); 1068 | luaL_pushresult(&b); 1069 | return 1; 1070 | } 1071 | 1072 | static int Lutf8_remove (lua_State *L) { 1073 | const char *e, *s = check_utf8(L, 1, &e); 1074 | lua_Integer posi = luaL_optinteger(L, 2, -1); 1075 | lua_Integer pose = luaL_optinteger(L, 3, -1); 1076 | if (!utf8_range(s, e, &posi, &pose)) 1077 | lua_settop(L, 1); 1078 | else { 1079 | luaL_Buffer b; 1080 | luaL_buffinit(L, &b); 1081 | luaL_addlstring(&b, s, posi); 1082 | luaL_addlstring(&b, s+pose, e-s-pose); 1083 | luaL_pushresult(&b); 1084 | } 1085 | return 1; 1086 | } 1087 | 1088 | static int push_offset (lua_State *L, const char *s, const char *e, lua_Integer offset, lua_Integer idx) { 1089 | utfint ch = 0; 1090 | const char *p; 1091 | if (idx != 0) 1092 | p = utf8_offset(s, e, offset, idx); 1093 | else if (p = s+offset-1, iscont(p)) 1094 | p = utf8_prev(s, p); 1095 | if (p == NULL || p == e) return 0; 1096 | utf8_decode(p, &ch, 0); 1097 | lua_pushinteger(L, p-s+1); 1098 | lua_pushinteger(L, ch); 1099 | return 2; 1100 | } 1101 | 1102 | static int Lutf8_charpos (lua_State *L) { 1103 | const char *e, *s = check_utf8(L, 1, &e); 1104 | lua_Integer offset = 1; 1105 | if (lua_isnoneornil(L, 3)) { 1106 | lua_Integer idx = luaL_optinteger(L, 2, 0); 1107 | if (idx > 0) --idx; 1108 | else if (idx < 0) offset = e-s+1; 1109 | return push_offset(L, s, e, offset, idx); 1110 | } 1111 | offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); 1112 | if (offset < 1) offset = 1; 1113 | return push_offset(L, s, e, offset, luaL_checkinteger(L, 3)); 1114 | } 1115 | 1116 | static int Lutf8_offset (lua_State *L) { 1117 | size_t len; 1118 | const char *s = luaL_checklstring(L, 1, &len); 1119 | lua_Integer n = luaL_checkinteger(L, 2); 1120 | lua_Integer posi = (n >= 0) ? 
1 : len + 1; 1121 | posi = byte_relat(luaL_optinteger(L, 3, posi), len); 1122 | luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, 1123 | "position out of range"); 1124 | if (n == 0) { 1125 | /* find beginning of current byte sequence */ 1126 | while (posi > 0 && iscont(s + posi)) posi--; 1127 | } else { 1128 | if (iscont(s + posi)) 1129 | return luaL_error(L, "initial position is a continuation byte"); 1130 | if (n < 0) { 1131 | while (n < 0 && posi > 0) { /* move back */ 1132 | do { /* find beginning of previous character */ 1133 | posi--; 1134 | } while (posi > 0 && iscont(s + posi)); 1135 | n++; 1136 | } 1137 | } else { 1138 | n--; /* do not move for 1st character */ 1139 | while (n > 0 && posi < (lua_Integer)len) { 1140 | do { /* find beginning of next character */ 1141 | posi++; 1142 | } while (iscont(s + posi)); /* (cannot pass final '\0') */ 1143 | n--; 1144 | } 1145 | } 1146 | } 1147 | if (n == 0) /* did it find given character? */ 1148 | lua_pushinteger(L, posi + 1); 1149 | else /* no such character */ 1150 | lua_pushnil(L); 1151 | return 1; 1152 | } 1153 | 1154 | static int Lutf8_next (lua_State *L) { 1155 | const char *e, *s = check_utf8(L, 1, &e); 1156 | lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); 1157 | lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2)); 1158 | return push_offset(L, s, e, offset, idx); 1159 | } 1160 | 1161 | static int iter_aux (lua_State *L, int strict) { 1162 | const char *e, *s = check_utf8(L, 1, &e); 1163 | int n = CAST(int, lua_tointeger(L, 2)); 1164 | const char *p = n <= 0 ? s : utf8_next(s+n-1, e); 1165 | if (p < e) { 1166 | utfint code = 0; 1167 | utf8_safe_decode(L, p, &code); 1168 | if (strict && utf8_invalid(code)) 1169 | return luaL_error(L, "invalid UTF-8 code"); 1170 | lua_pushinteger(L, p-s+1); 1171 | lua_pushinteger(L, code); 1172 | return 2; 1173 | } 1174 | return 0; /* no more codepoints */ 1175 | } 1176 | 1177 | static int iter_auxstrict (lua_State *L) { return iter_aux(L, 1); } 1178 | static int iter_auxlax (lua_State *L) { return iter_aux(L, 0); } 1179 | 1180 | static int Lutf8_codes (lua_State *L) { 1181 | int lax = lua_toboolean(L, 2); 1182 | luaL_checkstring(L, 1); 1183 | lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); 1184 | lua_pushvalue(L, 1); 1185 | lua_pushinteger(L, 0); 1186 | return 3; 1187 | } 1188 | 1189 | static int Lutf8_width (lua_State *L) { 1190 | int t = lua_type(L, 1); 1191 | int ambi_is_single = !lua_toboolean(L, 2); 1192 | int default_width = CAST(int, luaL_optinteger(L, 3, 0)); 1193 | if (t == LUA_TNUMBER) { 1194 | size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single); 1195 | if (chwidth == 0) chwidth = default_width; 1196 | lua_pushinteger(L, (lua_Integer)chwidth); 1197 | } else if (t != LUA_TSTRING) 1198 | return typeerror(L, 1, "number/string"); 1199 | else { 1200 | const char *e, *s = to_utf8(L, 1, &e); 1201 | int width = 0; 1202 | while (s < e) { 1203 | utfint ch = 0; 1204 | int chwidth; 1205 | s = utf8_safe_decode(L, s, &ch); 1206 | chwidth = utf8_width(ch, ambi_is_single); 1207 | width += chwidth == 0 ? 
default_width : chwidth; 1208 | } 1209 | lua_pushinteger(L, (lua_Integer)width); 1210 | } 1211 | return 1; 1212 | } 1213 | 1214 | static int Lutf8_widthindex (lua_State *L) { 1215 | const char *e, *s = check_utf8(L, 1, &e); 1216 | int width = CAST(int, luaL_checkinteger(L, 2)); 1217 | int ambi_is_single = !lua_toboolean(L, 3); 1218 | int default_width = CAST(int, luaL_optinteger(L, 4, 0)); 1219 | size_t idx = 1; 1220 | while (s < e) { 1221 | utfint ch = 0; 1222 | size_t chwidth; 1223 | s = utf8_safe_decode(L, s, &ch); 1224 | chwidth = utf8_width(ch, ambi_is_single); 1225 | if (chwidth == 0) chwidth = default_width; 1226 | width -= CAST(int, chwidth); 1227 | if (width <= 0) { 1228 | lua_pushinteger(L, idx); 1229 | lua_pushinteger(L, width + chwidth); 1230 | lua_pushinteger(L, chwidth); 1231 | return 3; 1232 | } 1233 | ++idx; 1234 | } 1235 | lua_pushinteger(L, (lua_Integer)idx); 1236 | return 1; 1237 | } 1238 | 1239 | static int Lutf8_ncasecmp (lua_State *L) { 1240 | const char *e1, *s1 = check_utf8(L, 1, &e1); 1241 | const char *e2, *s2 = check_utf8(L, 2, &e2); 1242 | while (s1 < e1 || s2 < e2) { 1243 | utfint ch1 = 0, ch2 = 0; 1244 | if (s1 == e1) 1245 | ch2 = 1; 1246 | else if (s2 == e2) 1247 | ch1 = 1; 1248 | else { 1249 | s1 = utf8_safe_decode(L, s1, &ch1); 1250 | s2 = utf8_safe_decode(L, s2, &ch2); 1251 | ch1 = utf8_tofold(ch1); 1252 | ch2 = utf8_tofold(ch2); 1253 | } 1254 | if (ch1 != ch2) { 1255 | lua_pushinteger(L, ch1 > ch2 ? 1 : -1); 1256 | return 1; 1257 | } 1258 | } 1259 | lua_pushinteger(L, 0); 1260 | return 1; 1261 | } 1262 | 1263 | 1264 | /* utf8 pattern matching implement */ 1265 | 1266 | #ifndef LUA_MAXCAPTURES 1267 | # define LUA_MAXCAPTURES 32 1268 | #endif /* LUA_MAXCAPTURES */ 1269 | 1270 | #define CAP_UNFINISHED (-1) 1271 | #define CAP_POSITION (-2) 1272 | 1273 | 1274 | typedef struct MatchState { 1275 | int matchdepth; /* control for recursive depth (to avoid C stack overflow) */ 1276 | const char *src_init; /* init of source string */ 1277 | const char *src_end; /* end ('\0') of source string */ 1278 | const char *p_end; /* end ('\0') of pattern */ 1279 | lua_State *L; 1280 | int level; /* total number of captures (finished or unfinished) */ 1281 | struct { 1282 | const char *init; 1283 | ptrdiff_t len; 1284 | } capture[LUA_MAXCAPTURES]; 1285 | } MatchState; 1286 | 1287 | /* recursive function */ 1288 | static const char *match (MatchState *ms, const char *s, const char *p); 1289 | 1290 | /* maximum recursion depth for 'match' */ 1291 | #if !defined(MAXCCALLS) 1292 | #define MAXCCALLS 200 1293 | #endif 1294 | 1295 | #define L_ESC '%' 1296 | #define SPECIALS "^$*+?.([%-" 1297 | 1298 | static int check_capture (MatchState *ms, int l) { 1299 | l -= '1'; 1300 | if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) 1301 | return luaL_error(ms->L, "invalid capture index %%%d", l + 1); 1302 | return l; 1303 | } 1304 | 1305 | static int capture_to_close (MatchState *ms) { 1306 | int level = ms->level; 1307 | while (--level >= 0) 1308 | if (ms->capture[level].len == CAP_UNFINISHED) return level; 1309 | return luaL_error(ms->L, "invalid pattern capture"); 1310 | } 1311 | 1312 | static const char *classend (MatchState *ms, const char *p) { 1313 | utfint ch = 0; 1314 | p = utf8_safe_decode(ms->L, p, &ch); 1315 | switch (ch) { 1316 | case L_ESC: { 1317 | if (p == ms->p_end) 1318 | luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")"); 1319 | return utf8_next(p, ms->p_end); 1320 | } 1321 | case '[': { 1322 | if (*p == '^') p++; 1323 | do { /* look for 
a `]' */ 1324 | if (p == ms->p_end) 1325 | luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")"); 1326 | if (*(p++) == L_ESC && p < ms->p_end) 1327 | p++; /* skip escapes (e.g. `%]') */ 1328 | } while (*p != ']'); 1329 | return p+1; 1330 | } 1331 | default: { 1332 | return p; 1333 | } 1334 | } 1335 | } 1336 | 1337 | static int match_class (utfint c, utfint cl) { 1338 | int res; 1339 | switch (utf8_tolower(cl)) { 1340 | #define X(cls, name) case cls: res = utf8_is##name(c); break; 1341 | utf8_categories(X) 1342 | #undef X 1343 | case 'g' : res = utf8_isgraph(c); break; 1344 | case 'w' : res = utf8_isalnum(c); break; 1345 | case 'z' : res = (c == 0); break; /* deprecated option */ 1346 | default: return (cl == c); 1347 | } 1348 | return (utf8_islower(cl) ? res : !res); 1349 | } 1350 | 1351 | static int matchbracketclass (MatchState *ms, utfint c, const char *p, const char *ec) { 1352 | int sig = 1; 1353 | assert(*p == '['); 1354 | if (*++p == '^') { 1355 | sig = 0; 1356 | p++; /* skip the `^' */ 1357 | } 1358 | while (p < ec) { 1359 | utfint ch = 0; 1360 | p = utf8_safe_decode(ms->L, p, &ch); 1361 | if (ch == L_ESC) { 1362 | p = utf8_safe_decode(ms->L, p, &ch); 1363 | if (match_class(c, ch)) 1364 | return sig; 1365 | } else { 1366 | utfint next = 0; 1367 | const char *np = utf8_safe_decode(ms->L, p, &next); 1368 | if (next == '-' && np < ec) { 1369 | p = utf8_safe_decode(ms->L, np, &next); 1370 | if (ch <= c && c <= next) 1371 | return sig; 1372 | } 1373 | else if (ch == c) return sig; 1374 | } 1375 | } 1376 | return !sig; 1377 | } 1378 | 1379 | static int singlematch (MatchState *ms, const char *s, const char *p, const char *ep) { 1380 | if (s >= ms->src_end) 1381 | return 0; 1382 | else { 1383 | utfint ch=0, pch=0; 1384 | utf8_safe_decode(ms->L, s, &ch); 1385 | p = utf8_safe_decode(ms->L, p, &pch); 1386 | switch (pch) { 1387 | case '.': return 1; /* matches any char */ 1388 | case L_ESC: utf8_safe_decode(ms->L, p, &pch); 1389 | return match_class(ch, pch); 1390 | case '[': return matchbracketclass(ms, ch, p-1, ep-1); 1391 | default: return pch == ch; 1392 | } 1393 | } 1394 | } 1395 | 1396 | static const char *matchbalance (MatchState *ms, const char *s, const char **p) { 1397 | utfint ch=0, begin=0, end=0; 1398 | *p = utf8_safe_decode(ms->L, *p, &begin); 1399 | if (*p >= ms->p_end) 1400 | luaL_error(ms->L, "malformed pattern " 1401 | "(missing arguments to " LUA_QL("%%b") ")"); 1402 | *p = utf8_safe_decode(ms->L, *p, &end); 1403 | s = utf8_safe_decode(ms->L, s, &ch); 1404 | if (ch != begin) return NULL; 1405 | else { 1406 | int cont = 1; 1407 | while (s < ms->src_end) { 1408 | s = utf8_safe_decode(ms->L, s, &ch); 1409 | if (ch == end) { 1410 | if (--cont == 0) return s; 1411 | } 1412 | else if (ch == begin) cont++; 1413 | } 1414 | } 1415 | return NULL; /* string ends out of balance */ 1416 | } 1417 | 1418 | static const char *max_expand (MatchState *ms, const char *s, const char *p, const char *ep) { 1419 | const char *m = s; /* matched end of single match p */ 1420 | while (singlematch(ms, m, p, ep)) 1421 | m = utf8_next(m, ms->src_end); 1422 | /* keeps trying to match with the maximum repetitions */ 1423 | while (s <= m) { 1424 | const char *res = match(ms, m, ep+1); 1425 | if (res) return res; 1426 | /* else didn't match; reduce 1 repetition to try again */ 1427 | if (s == m) break; 1428 | m = utf8_prev(s, m); 1429 | } 1430 | return NULL; 1431 | } 1432 | 1433 | static const char *min_expand (MatchState *ms, const char *s, const char *p, const char *ep) { 1434 | for (;;) { 
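    /* Illustrative note: this is the lazy counterpart of max_expand above; the rest of
     * the pattern is tried first and one more repetition is consumed only on failure,
     * so the shortest match wins. E.g. utf8.match("abcb", ".-b") gives "ab" while
     * ".*b" gives "abcb" (assuming the usual Lua pattern semantics carried over here). */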
1435 | const char *res = match(ms, s, ep+1); 1436 | if (res != NULL) 1437 | return res; 1438 | else if (singlematch(ms, s, p, ep)) 1439 | s = utf8_next(s, ms->src_end); /* try with one more repetition */ 1440 | else return NULL; 1441 | } 1442 | } 1443 | 1444 | static const char *start_capture (MatchState *ms, const char *s, const char *p, int what) { 1445 | const char *res; 1446 | int level = ms->level; 1447 | if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures"); 1448 | ms->capture[level].init = s; 1449 | ms->capture[level].len = what; 1450 | ms->level = level+1; 1451 | if ((res=match(ms, s, p)) == NULL) /* match failed? */ 1452 | ms->level--; /* undo capture */ 1453 | return res; 1454 | } 1455 | 1456 | static const char *end_capture (MatchState *ms, const char *s, const char *p) { 1457 | int l = capture_to_close(ms); 1458 | const char *res; 1459 | ms->capture[l].len = s - ms->capture[l].init; /* close capture */ 1460 | if ((res = match(ms, s, p)) == NULL) /* match failed? */ 1461 | ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ 1462 | return res; 1463 | } 1464 | 1465 | static const char *match_capture (MatchState *ms, const char *s, int l) { 1466 | size_t len; 1467 | l = check_capture(ms, l); 1468 | len = ms->capture[l].len; 1469 | if ((size_t)(ms->src_end-s) >= len && 1470 | memcmp(ms->capture[l].init, s, len) == 0) 1471 | return s+len; 1472 | else return NULL; 1473 | } 1474 | 1475 | static const char *match (MatchState *ms, const char *s, const char *p) { 1476 | if (ms->matchdepth-- == 0) 1477 | luaL_error(ms->L, "pattern too complex"); 1478 | init: /* using goto's to optimize tail recursion */ 1479 | if (p != ms->p_end) { /* end of pattern? */ 1480 | utfint ch = 0; 1481 | utf8_safe_decode(ms->L, p, &ch); 1482 | switch (ch) { 1483 | case '(': { /* start capture */ 1484 | if (*(p + 1) == ')') /* position capture? */ 1485 | s = start_capture(ms, s, p + 2, CAP_POSITION); 1486 | else 1487 | s = start_capture(ms, s, p + 1, CAP_UNFINISHED); 1488 | break; 1489 | } 1490 | case ')': { /* end capture */ 1491 | s = end_capture(ms, s, p + 1); 1492 | break; 1493 | } 1494 | case '$': { 1495 | if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ 1496 | goto dflt; /* no; go to default */ 1497 | s = (s == ms->src_end) ? s : NULL; /* check end of string */ 1498 | break; 1499 | } 1500 | case L_ESC: { /* escaped sequence not in the format class[*+?-]? */ 1501 | const char *prev_p = p; 1502 | p = utf8_safe_decode(ms->L, p+1, &ch); 1503 | switch (ch) { 1504 | case 'b': { /* balanced string? */ 1505 | s = matchbalance(ms, s, &p); 1506 | if (s != NULL) 1507 | goto init; /* return match(ms, s, p + 4); */ 1508 | /* else fail (s == NULL) */ 1509 | break; 1510 | } 1511 | case 'f': { /* frontier? */ 1512 | const char *ep; utfint previous = 0, current = 0; 1513 | if (*p != '[') 1514 | luaL_error(ms->L, "missing " LUA_QL("[") " after " 1515 | LUA_QL("%%f") " in pattern"); 1516 | ep = classend(ms, p); /* points to what is next */ 1517 | if (s != ms->src_init) 1518 | utf8_decode(utf8_prev(ms->src_init, s), &previous, 0); 1519 | if (s != ms->src_end) 1520 | utf8_decode(s, ¤t, 0); 1521 | if (!matchbracketclass(ms, previous, p, ep - 1) && 1522 | matchbracketclass(ms, current, p, ep - 1)) { 1523 | p = ep; goto init; /* return match(ms, s, ep); */ 1524 | } 1525 | s = NULL; /* match failed */ 1526 | break; 1527 | } 1528 | case '0': case '1': case '2': case '3': 1529 | case '4': case '5': case '6': case '7': 1530 | case '8': case '9': { /* capture results (%0-%9)? 
*/ 1531 | s = match_capture(ms, s, ch); 1532 | if (s != NULL) goto init; /* return match(ms, s, p + 2) */ 1533 | break; 1534 | } 1535 | default: p = prev_p; goto dflt; 1536 | } 1537 | break; 1538 | } 1539 | default: dflt: { /* pattern class plus optional suffix */ 1540 | const char *ep = classend(ms, p); /* points to optional suffix */ 1541 | /* does not match at least once? */ 1542 | if (!singlematch(ms, s, p, ep)) { 1543 | if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */ 1544 | p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ 1545 | } else /* '+' or no suffix */ 1546 | s = NULL; /* fail */ 1547 | } else { /* matched once */ 1548 | const char *next_s = utf8_next(s, ms->src_end); 1549 | switch (*ep) { /* handle optional suffix */ 1550 | case '?': { /* optional */ 1551 | const char *res; 1552 | const char *next_ep = utf8_next(ep, ms->p_end); 1553 | if ((res = match(ms, next_s, next_ep)) != NULL) 1554 | s = res; 1555 | else { 1556 | p = next_ep; goto init; /* else return match(ms, s, ep + 1); */ 1557 | } 1558 | break; 1559 | } 1560 | case '+': /* 1 or more repetitions */ 1561 | s = next_s; /* 1 match already done */ 1562 | /* fall through */ 1563 | case '*': /* 0 or more repetitions */ 1564 | s = max_expand(ms, s, p, ep); 1565 | break; 1566 | case '-': /* 0 or more repetitions (minimum) */ 1567 | s = min_expand(ms, s, p, ep); 1568 | break; 1569 | default: /* no suffix */ 1570 | s = next_s; p = ep; goto init; /* return match(ms, s + 1, ep); */ 1571 | } 1572 | } 1573 | break; 1574 | } 1575 | } 1576 | } 1577 | ms->matchdepth++; 1578 | return s; 1579 | } 1580 | 1581 | static const char *lmemfind (const char *s1, size_t l1, const char *s2, size_t l2) { 1582 | if (l2 == 0) return s1; /* empty strings are everywhere */ 1583 | else if (l2 > l1) return NULL; /* avoids a negative `l1' */ 1584 | else { 1585 | const char *init; /* to search for a `*s2' inside `s1' */ 1586 | l2--; /* 1st char will be checked by `memchr' */ 1587 | l1 = l1-l2; /* `s2' cannot be found after that */ 1588 | while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) { 1589 | init++; /* 1st char is already checked */ 1590 | if (memcmp(init, s2+1, l2) == 0) 1591 | return init-1; 1592 | else { /* correct `l1' and `s1' to try again */ 1593 | l1 -= init-s1; 1594 | s1 = init; 1595 | } 1596 | } 1597 | return NULL; /* not found */ 1598 | } 1599 | } 1600 | 1601 | static int get_index (const char *p, const char *s, const char *e) { 1602 | int idx; 1603 | for (idx = 0; s < e && s < p; ++idx) 1604 | s = utf8_next(s, e); 1605 | return s == p ? idx : idx - 1; 1606 | } 1607 | 1608 | static void push_onecapture (MatchState *ms, int i, const char *s, const char *e) { 1609 | if (i >= ms->level) { 1610 | if (i == 0) /* ms->level == 0, too */ 1611 | lua_pushlstring(ms->L, s, e - s); /* add whole match */ 1612 | else 1613 | luaL_error(ms->L, "invalid capture index"); 1614 | } else { 1615 | ptrdiff_t l = ms->capture[i].len; 1616 | if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture"); 1617 | if (l == CAP_POSITION) { 1618 | int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end); 1619 | lua_pushinteger(ms->L, idx+1); 1620 | } else 1621 | lua_pushlstring(ms->L, ms->capture[i].init, l); 1622 | } 1623 | } 1624 | 1625 | static int push_captures (MatchState *ms, const char *s, const char *e) { 1626 | int i; 1627 | int nlevels = (ms->level == 0 && s) ? 
1 : ms->level; 1628 | luaL_checkstack(ms->L, nlevels, "too many captures"); 1629 | for (i = 0; i < nlevels; i++) 1630 | push_onecapture(ms, i, s, e); 1631 | return nlevels; /* number of strings pushed */ 1632 | } 1633 | 1634 | /* check whether pattern has no special characters */ 1635 | static int nospecials (const char *p, const char * ep) { 1636 | while (p < ep) { 1637 | if (strpbrk(p, SPECIALS)) 1638 | return 0; /* pattern has a special character */ 1639 | p += strlen(p) + 1; /* may have more after \0 */ 1640 | } 1641 | return 1; /* no special chars found */ 1642 | } 1643 | 1644 | 1645 | /* utf8 pattern matching interface */ 1646 | 1647 | static int find_aux (lua_State *L, int find) { 1648 | const char *es, *s = check_utf8(L, 1, &es); 1649 | const char *ep, *p = check_utf8(L, 2, &ep); 1650 | lua_Integer idx = luaL_optinteger(L, 3, 1); 1651 | const char *init; 1652 | if (!idx) idx = 1; 1653 | init = utf8_relat(s, es, CAST(int, idx)); 1654 | if (init == NULL) { 1655 | if (idx > 0) { 1656 | lua_pushnil(L); /* cannot find anything */ 1657 | return 1; 1658 | } 1659 | init = s; 1660 | } 1661 | /* explicit request or no special characters? */ 1662 | if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) { 1663 | /* do a plain search */ 1664 | const char *s2 = lmemfind(init, es-init, p, ep-p); 1665 | if (s2) { 1666 | const char *e2 = s2 + (ep - p); 1667 | if (iscont(e2)) e2 = utf8_next(e2, es); 1668 | lua_pushinteger(L, idx = get_index(s2, s, es) + 1); 1669 | lua_pushinteger(L, idx + get_index(e2, s2, es) - 1); 1670 | return 2; 1671 | } 1672 | } else { 1673 | MatchState ms; 1674 | int anchor = (*p == '^'); 1675 | if (anchor) p++; /* skip anchor character */ 1676 | if (idx < 0) idx += utf8_length(s, es)+1; /* TODO not very good */ 1677 | ms.L = L; 1678 | ms.matchdepth = MAXCCALLS; 1679 | ms.src_init = s; 1680 | ms.src_end = es; 1681 | ms.p_end = ep; 1682 | do { 1683 | const char *res; 1684 | ms.level = 0; 1685 | assert(ms.matchdepth == MAXCCALLS); 1686 | if ((res=match(&ms, init, p)) != NULL) { 1687 | if (find) { 1688 | lua_pushinteger(L, idx); /* start */ 1689 | lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */ 1690 | return push_captures(&ms, NULL, 0) + 2; 1691 | } else 1692 | return push_captures(&ms, init, res); 1693 | } 1694 | if (init == es) break; 1695 | idx += 1; 1696 | init = utf8_next(init, es); 1697 | } while (init <= es && !anchor); 1698 | } 1699 | lua_pushnil(L); /* not found */ 1700 | return 1; 1701 | } 1702 | 1703 | static int Lutf8_find (lua_State *L) { return find_aux(L, 1); } 1704 | static int Lutf8_match (lua_State *L) { return find_aux(L, 0); } 1705 | 1706 | static int gmatch_aux (lua_State *L) { 1707 | MatchState ms; 1708 | const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es); 1709 | const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep); 1710 | const char *src; 1711 | ms.L = L; 1712 | ms.matchdepth = MAXCCALLS; 1713 | ms.src_init = s; 1714 | ms.src_end = es; 1715 | ms.p_end = ep; 1716 | for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); 1717 | src <= ms.src_end; 1718 | src = utf8_next(src, ms.src_end)) { 1719 | const char *e; 1720 | ms.level = 0; 1721 | assert(ms.matchdepth == MAXCCALLS); 1722 | if ((e = match(&ms, src, p)) != NULL) { 1723 | lua_Integer newstart = e-s; 1724 | if (e == src) newstart++; /* empty match? 
go at least one position */ 1725 | lua_pushinteger(L, newstart); 1726 | lua_replace(L, lua_upvalueindex(3)); 1727 | return push_captures(&ms, src, e); 1728 | } 1729 | if (src == ms.src_end) break; 1730 | } 1731 | return 0; /* not found */ 1732 | } 1733 | 1734 | static int Lutf8_gmatch (lua_State *L) { 1735 | luaL_checkstring(L, 1); 1736 | luaL_checkstring(L, 2); 1737 | lua_settop(L, 2); 1738 | lua_pushinteger(L, 0); 1739 | lua_pushcclosure(L, gmatch_aux, 3); 1740 | return 1; 1741 | } 1742 | 1743 | static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, const char *e) { 1744 | const char *new_end, *news = to_utf8(ms->L, 3, &new_end); 1745 | while (news < new_end) { 1746 | utfint ch = 0; 1747 | news = utf8_safe_decode(ms->L, news, &ch); 1748 | if (ch != L_ESC) 1749 | add_utf8char(b, ch); 1750 | else { 1751 | news = utf8_safe_decode(ms->L, news, &ch); /* skip ESC */ 1752 | if (!utf8_isdigit(ch)) { 1753 | if (ch != L_ESC) 1754 | luaL_error(ms->L, "invalid use of " LUA_QL("%c") 1755 | " in replacement string", L_ESC); 1756 | add_utf8char(b, ch); 1757 | } else if (ch == '0') 1758 | luaL_addlstring(b, s, e-s); 1759 | else { 1760 | push_onecapture(ms, ch-'1', s, e); 1761 | luaL_addvalue(b); /* add capture to accumulated result */ 1762 | } 1763 | } 1764 | } 1765 | } 1766 | 1767 | static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, const char *e, int tr) { 1768 | lua_State *L = ms->L; 1769 | switch (tr) { 1770 | case LUA_TFUNCTION: { 1771 | int n; 1772 | lua_pushvalue(L, 3); 1773 | n = push_captures(ms, s, e); 1774 | lua_call(L, n, 1); 1775 | break; 1776 | } 1777 | case LUA_TTABLE: { 1778 | push_onecapture(ms, 0, s, e); 1779 | lua_gettable(L, 3); 1780 | break; 1781 | } 1782 | default: { /* LUA_TNUMBER or LUA_TSTRING */ 1783 | add_s(ms, b, s, e); 1784 | return; 1785 | } 1786 | } 1787 | if (!lua_toboolean(L, -1)) { /* nil or false? */ 1788 | lua_pop(L, 1); 1789 | lua_pushlstring(L, s, e - s); /* keep original text */ 1790 | } else if (!lua_isstring(L, -1)) 1791 | luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1)); 1792 | luaL_addvalue(b); /* add result to accumulator */ 1793 | } 1794 | 1795 | static int Lutf8_gsub (lua_State *L) { 1796 | const char *es, *s = check_utf8(L, 1, &es); 1797 | const char *ep, *p = check_utf8(L, 2, &ep); 1798 | int tr = lua_type(L, 3); 1799 | lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1); 1800 | int anchor = (*p == '^'); 1801 | lua_Integer n = 0; 1802 | MatchState ms; 1803 | luaL_Buffer b; 1804 | luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || 1805 | tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, 1806 | "string/function/table expected"); 1807 | luaL_buffinit(L, &b); 1808 | if (anchor) p++; /* skip anchor character */ 1809 | ms.L = L; 1810 | ms.matchdepth = MAXCCALLS; 1811 | ms.src_init = s; 1812 | ms.src_end = es; 1813 | ms.p_end = ep; 1814 | while (n < max_s) { 1815 | const char *e; 1816 | ms.level = 0; 1817 | assert(ms.matchdepth == MAXCCALLS); 1818 | e = match(&ms, s, p); 1819 | if (e) { 1820 | n++; 1821 | add_value(&ms, &b, s, e, tr); 1822 | } 1823 | if (e && e > s) /* non empty match? 
*/ 1824 | s = e; /* skip it */ 1825 | else if (s < es) { 1826 | utfint ch = 0; 1827 | s = utf8_safe_decode(L, s, &ch); 1828 | add_utf8char(&b, ch); 1829 | } else break; 1830 | if (anchor) break; 1831 | } 1832 | luaL_addlstring(&b, s, es-s); 1833 | luaL_pushresult(&b); 1834 | lua_pushinteger(L, n); /* number of substitutions */ 1835 | return 2; 1836 | } 1837 | 1838 | static int Lutf8_isvalid(lua_State *L) { 1839 | const char *e, *s = check_utf8(L, 1, &e); 1840 | const char *invalid = utf8_invalid_offset(s, e); 1841 | lua_pushboolean(L, invalid == NULL); 1842 | return 1; 1843 | } 1844 | 1845 | static int Lutf8_invalidoffset(lua_State *L) { 1846 | const char *e, *s = check_utf8(L, 1, &e); 1847 | const char *orig_s = s; 1848 | int offset = luaL_optinteger(L, 2, 0); 1849 | if (offset > 1) { 1850 | offset--; 1851 | s += offset; 1852 | if (s >= e) { 1853 | lua_pushnil(L); 1854 | return 1; 1855 | } 1856 | } else if (offset < 0 && s - e < offset) { 1857 | s = e + offset; 1858 | } 1859 | const char *invalid = utf8_invalid_offset(s, e); 1860 | if (invalid == NULL) { 1861 | lua_pushnil(L); 1862 | } else { 1863 | lua_pushinteger(L, invalid - orig_s + 1); 1864 | } 1865 | return 1; 1866 | } 1867 | 1868 | static int Lutf8_clean(lua_State *L) { 1869 | const char *e, *s = check_utf8(L, 1, &e); 1870 | 1871 | /* Default replacement string is REPLACEMENT CHARACTER U+FFFD */ 1872 | size_t repl_len; 1873 | const char *r = luaL_optlstring(L, 2, "\xEF\xBF\xBD", &repl_len); 1874 | 1875 | if (lua_gettop(L) > 1) { 1876 | /* Check if replacement string is valid UTF-8 or not */ 1877 | if (utf8_invalid_offset(r, r + repl_len) != NULL) { 1878 | lua_pushstring(L, "replacement string must be valid UTF-8"); 1879 | lua_error(L); 1880 | } 1881 | } 1882 | 1883 | const char *invalid = utf8_invalid_offset(s, e); 1884 | if (invalid == NULL) { 1885 | lua_settop(L, 1); /* Return input string without modification */ 1886 | lua_pushboolean(L, 1); /* String was clean already */ 1887 | return 2; 1888 | } 1889 | 1890 | luaL_Buffer buff; 1891 | luaL_buffinit(L, &buff); 1892 | 1893 | while (1) { 1894 | /* Invariant: 's' points to first GOOD byte not in output buffer, 1895 | * 'invalid' points to first BAD byte after that */ 1896 | luaL_addlstring(&buff, s, invalid - s); 1897 | luaL_addlstring(&buff, r, repl_len); 1898 | /* We do not replace every bad byte with the replacement character, 1899 | * but rather a contiguous sequence of bad bytes 1900 | * Restore the invariant by stepping forward until we find at least 1901 | * one good byte */ 1902 | s = invalid; 1903 | while (s == invalid) { 1904 | s++; 1905 | invalid = utf8_invalid_offset(s, e); 1906 | } 1907 | if (invalid == NULL) { 1908 | luaL_addlstring(&buff, s, e - s); 1909 | luaL_pushresult(&buff); 1910 | lua_pushboolean(L, 0); /* String was not clean */ 1911 | return 2; 1912 | } 1913 | } 1914 | } 1915 | 1916 | static int Lutf8_isnfc(lua_State *L) { 1917 | const char *e, *s = check_utf8(L, 1, &e); 1918 | utfint starter = 0, ch; 1919 | unsigned int prev_canon_cls = 0; 1920 | 1921 | while (s < e) { 1922 | s = utf8_decode(s, &ch, 1); 1923 | if (s == NULL) { 1924 | lua_pushstring(L, "string is not valid UTF-8"); 1925 | lua_error(L); 1926 | } 1927 | if (ch < 0x300) { 1928 | starter = ch; /* Fast path */ 1929 | prev_canon_cls = 0; 1930 | continue; 1931 | } 1932 | 1933 | unsigned int canon_cls = lookup_canon_cls(ch); 1934 | if (canon_cls && canon_cls < prev_canon_cls) { 1935 | /* Combining marks are out of order; this string is not NFC */ 1936 | lua_pushboolean(L, 0); /* Return false */ 1937 
| return 1; 1938 | } 1939 | 1940 | nfc_table *entry = nfc_quickcheck(ch); 1941 | if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) { 1942 | lua_pushboolean(L, 0); /* Return false */ 1943 | return 1; 1944 | } 1945 | 1946 | prev_canon_cls = canon_cls; 1947 | if (!canon_cls) 1948 | starter = ch; 1949 | } 1950 | 1951 | lua_pushboolean(L, 1); /* Return true */ 1952 | return 1; 1953 | } 1954 | 1955 | static int Lutf8_normalize_nfc(lua_State *L) { 1956 | const char *e, *s = check_utf8(L, 1, &e), *p = s, *starter_p = s; 1957 | utfint starter = 0, ch; 1958 | unsigned int prev_canon_cls = 0; 1959 | 1960 | /* First scan to see if we can find any problems... if not, we may just return the 1961 | * input string unchanged */ 1962 | while (p < e) { 1963 | const char *new_p = utf8_decode(p, &ch, 1); 1964 | if (new_p == NULL) { 1965 | lua_pushstring(L, "string is not valid UTF-8"); 1966 | lua_error(L); 1967 | } 1968 | 1969 | unsigned int canon_cls = lookup_canon_cls(ch); 1970 | if (canon_cls && canon_cls < prev_canon_cls) { 1971 | goto build_string; /* Combining marks are out of order; this string is not NFC */ 1972 | } 1973 | 1974 | nfc_table *entry = nfc_quickcheck(ch); 1975 | if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) { 1976 | goto build_string; 1977 | } 1978 | 1979 | prev_canon_cls = canon_cls; 1980 | if (!canon_cls) { 1981 | starter = ch; 1982 | starter_p = p; 1983 | } 1984 | p = new_p; 1985 | } 1986 | 1987 | lua_settop(L, 1); /* Return input string without modification */ 1988 | lua_pushboolean(L, 1); /* String was in normal form already, so 2nd return value is 'true' */ 1989 | return 2; 1990 | 1991 | build_string: ; 1992 | /* We will need to build a new string, this one is not NFC */ 1993 | luaL_Buffer buff; 1994 | luaL_buffinit(L, &buff); 1995 | luaL_addlstring(&buff, s, starter_p - s); 1996 | 1997 | string_to_nfc(L, &buff, starter_p, e); 1998 | 1999 | luaL_pushresult(&buff); 2000 | lua_pushboolean(L, 0); 2001 | return 2; 2002 | } 2003 | 2004 | static int iterate_grapheme_indices(lua_State *L) { 2005 | const char *s = luaL_checkstring(L, lua_upvalueindex(1)); 2006 | lua_Integer pos = luaL_checkinteger(L, lua_upvalueindex(2)); 2007 | lua_Integer end = luaL_checkinteger(L, lua_upvalueindex(3)); 2008 | 2009 | if (pos > end) { 2010 | lua_pushnil(L); 2011 | return 1; 2012 | } 2013 | const char *e = s + end; 2014 | 2015 | utfint ch, next_ch; 2016 | const char *p = utf8_safe_decode(L, s + pos - 1, &ch); 2017 | 2018 | while (1) { 2019 | const char *next_p = utf8_safe_decode(L, p, &next_ch); 2020 | int bind = 0; 2021 | 2022 | if (ch == '\r') { 2023 | if (next_ch == '\n') { 2024 | /* CR binds to following LF */ 2025 | bind = 1; 2026 | } else { 2027 | break; 2028 | } 2029 | } else if (ch == '\n' || next_ch == '\r' || next_ch == '\n') { 2030 | /* CR/LF do not bind to any other codepoint or in any other way */ 2031 | break; 2032 | } else if (find_in_range(cntrl_table, table_size(cntrl_table), ch) && !find_in_range(prepend_table, table_size(prepend_table), ch) && ch != 0x200D) { 2033 | /* Control characters do not bind to anything */ 2034 | break; 2035 | } else if (next_ch == 0x200D) { 2036 | /* U+200D is ZERO WIDTH JOINER, it always binds to preceding char */ 2037 | if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) { 2038 | /* After an Extended_Pictographic codepoint and ZWJ, we bind to a following Extended_Pictographic */ 2039 | utfint nextnext_ch; 2040 | const char *probe_ep = utf8_safe_decode(L, next_p, &nextnext_ch); 
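                /* Illustrative note: this probe corresponds to the Extended_Pictographic +
                 * ZWJ + Extended_Pictographic case of UAX #29 (rule GB11); e.g. U+1F469 WOMAN,
                 * U+200D ZWJ, U+1F680 ROCKET form a single grapheme cluster. */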
2041 | if (find_in_range(pictographic_table, table_size(pictographic_table), nextnext_ch)) { 2042 | p = probe_ep; 2043 | ch = nextnext_ch; 2044 | continue; 2045 | } 2046 | } 2047 | bind = 1; 2048 | } else if (find_in_range(cntrl_table, table_size(cntrl_table), next_ch) && !find_in_range(prepend_table, table_size(prepend_table), next_ch)) { 2049 | /* Control characters do not bind to anything */ 2050 | break; 2051 | } else { 2052 | if (indic_conjunct_type(ch) == INDIC_CONSONANT) { 2053 | utfint probed_ch = next_ch; 2054 | const char *probe = next_p; 2055 | int indic_type = indic_conjunct_type(probed_ch); 2056 | int saw_linker = 0; 2057 | while (indic_type) { 2058 | /* Consume any number of Extend or Linker codepoints, followed by a single Consonant 2059 | * The sequence must contain at least one Linker, however! */ 2060 | if (indic_type == INDIC_LINKER) { 2061 | saw_linker = 1; 2062 | } else if (indic_type == INDIC_CONSONANT) { 2063 | if (!saw_linker) 2064 | break; 2065 | p = probe; 2066 | ch = probed_ch; 2067 | goto next_iteration; 2068 | } 2069 | if (probe >= e) 2070 | break; 2071 | probe = utf8_safe_decode(L, probe, &probed_ch); 2072 | indic_type = indic_conjunct_type(probed_ch); 2073 | } 2074 | } 2075 | 2076 | if (find_in_range(compose_table, table_size(compose_table), next_ch) || (next_ch >= 0x1F3FB && next_ch <= 0x1F3FF)) { 2077 | /* The 2nd codepoint has property Grapheme_Extend, or is an Emoji_Modifier codepoint */ 2078 | if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) { 2079 | /* Consume any number of 'extend' codepoints, one ZWJ, and following Extended_Pictographic codepoint */ 2080 | utfint probed_ch; 2081 | const char *probe = next_p; 2082 | while (probe < e) { 2083 | probe = utf8_safe_decode(L, probe, &probed_ch); 2084 | if (probed_ch == 0x200D) { 2085 | if (probe < e) { 2086 | probe = utf8_safe_decode(L, probe, &probed_ch); 2087 | if (find_in_range(pictographic_table, table_size(pictographic_table), probed_ch)) { 2088 | next_p = probe; 2089 | next_ch = probed_ch; 2090 | } 2091 | } 2092 | break; 2093 | } else if (find_in_range(compose_table, table_size(compose_table), probed_ch) || (probed_ch >= 0x1F3FB && probed_ch <= 0x1F3FF)) { 2094 | next_p = probe; 2095 | next_ch = probed_ch; 2096 | } else { 2097 | break; 2098 | } 2099 | } 2100 | } 2101 | bind = 1; 2102 | } else if (find_in_range(spacing_mark_table, table_size(spacing_mark_table), next_ch)) { 2103 | /* The 2nd codepoint is in general category Spacing_Mark */ 2104 | bind = 1; 2105 | } else if (find_in_range(prepend_table, table_size(prepend_table), ch)) { 2106 | /* The 1st codepoint has property Prepend_Concatenation_Mark, or is a type of 2107 | * Indic Syllable which binds to the following codepoint */ 2108 | bind = 1; 2109 | } else if (ch >= 0x1F1E6 && ch <= 0x1F1FF && next_ch >= 0x1F1E6 && next_ch <= 0x1F1FF) { 2110 | /* Regional Indicator (flag) emoji bind together; but only in twos */ 2111 | p = next_p; 2112 | ch = 0xFFFE; /* Set 'ch' to bogus value so we will not re-enter this branch on next iteration */ 2113 | continue; 2114 | } else { 2115 | /* Korean Hangul codepoints have their own special rules about when they 2116 | * are considered a single grapheme cluster */ 2117 | int hangul1 = hangul_type(ch); 2118 | if (hangul1) { 2119 | int hangul2 = hangul_type(next_ch); 2120 | if (hangul2) { 2121 | if (hangul1 == HANGUL_L) { 2122 | bind = (hangul2 != HANGUL_T); 2123 | } else if (hangul1 == HANGUL_LV || hangul1 == HANGUL_V) { 2124 | bind = (hangul2 == HANGUL_V || hangul2 == 
HANGUL_T); 2125 | } else if (hangul1 == HANGUL_LVT || hangul1 == HANGUL_T) { 2126 | bind = (hangul2 == HANGUL_T); 2127 | } 2128 | } 2129 | } 2130 | } 2131 | } 2132 | 2133 | if (!bind) 2134 | break; 2135 | p = next_p; 2136 | ch = next_ch; 2137 | next_iteration: ; 2138 | } 2139 | 2140 | lua_pushinteger(L, (p - s) + 1); 2141 | lua_replace(L, lua_upvalueindex(2)); 2142 | 2143 | lua_pushinteger(L, pos); 2144 | lua_pushinteger(L, p - s); 2145 | return 2; 2146 | } 2147 | 2148 | static int Lutf8_grapheme_indices(lua_State *L) { 2149 | size_t len; 2150 | const char *s = luaL_checklstring(L, 1, &len); 2151 | lua_Integer start = byte_relat(luaL_optinteger(L, 2, 1), len); 2152 | lua_Integer end = byte_relat(luaL_optinteger(L, 3, len), len); 2153 | luaL_argcheck(L, start >= 1, 2, "out of range"); 2154 | luaL_argcheck(L, end <= (lua_Integer)len, 3, "out of range"); 2155 | 2156 | lua_settop(L, 1); 2157 | lua_pushinteger(L, start); 2158 | lua_pushinteger(L, end); 2159 | lua_pushcclosure(L, iterate_grapheme_indices, 3); 2160 | return 1; 2161 | } 2162 | 2163 | /* lua module import interface */ 2164 | 2165 | #if LUA_VERSION_NUM >= 502 2166 | static const char UTF8PATT[] = "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"; 2167 | #else 2168 | static const char UTF8PATT[] = "[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*"; 2169 | #endif 2170 | 2171 | LUALIB_API int luaopen_utf8 (lua_State *L) { 2172 | luaL_Reg libs[] = { 2173 | #define ENTRY(name) { #name, Lutf8_##name } 2174 | ENTRY(offset), 2175 | ENTRY(codes), 2176 | ENTRY(codepoint), 2177 | 2178 | ENTRY(len), 2179 | ENTRY(sub), 2180 | ENTRY(reverse), 2181 | ENTRY(lower), 2182 | ENTRY(upper), 2183 | ENTRY(title), 2184 | ENTRY(fold), 2185 | ENTRY(byte), 2186 | ENTRY(char), 2187 | ENTRY(escape), 2188 | ENTRY(insert), 2189 | ENTRY(remove), 2190 | ENTRY(charpos), 2191 | ENTRY(next), 2192 | ENTRY(width), 2193 | ENTRY(widthindex), 2194 | ENTRY(ncasecmp), 2195 | ENTRY(find), 2196 | ENTRY(gmatch), 2197 | ENTRY(gsub), 2198 | ENTRY(match), 2199 | ENTRY(isvalid), 2200 | ENTRY(invalidoffset), 2201 | ENTRY(clean), 2202 | ENTRY(isnfc), 2203 | ENTRY(normalize_nfc), 2204 | ENTRY(grapheme_indices), 2205 | #undef ENTRY 2206 | { NULL, NULL } 2207 | }; 2208 | 2209 | #if LUA_VERSION_NUM >= 502 2210 | luaL_newlib(L, libs); 2211 | #else 2212 | luaL_register(L, "utf8", libs); 2213 | #endif 2214 | 2215 | lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)-1); 2216 | lua_setfield(L, -2, "charpattern"); 2217 | 2218 | return 1; 2219 | } 2220 | 2221 | /* win32cc: flags+='-Wall -Wextra -s -O2 -mdll -DLUA_BUILD_AS_DLL' 2222 | * win32cc: libs+='-llua54.dll' output='lua-utf8.dll' 2223 | * win32cc: run='lua.exe test.lua' 2224 | * maccc: run='lua -- test_compat.lua' 2225 | * maccc: flags+='-g --coverage -bundle -undefined dynamic_lookup' output='lua-utf8.so' */ 2226 | 2227 | -------------------------------------------------------------------------------- /parseucd.lua: -------------------------------------------------------------------------------- 1 | -- generate useful data from Unicode Character Database. 
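-- (note: each parse_* function below reads one UCD data file line by line via io.lines()
-- and builds Lua tables; a later part of this script presumably turns those tables into
-- the lookup arrays emitted as unidata.h)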
2 | -- you should have these files in UCD folder in current path: 3 | -- - UCD\CaseFolding.txt 4 | -- - UCD\DerivedCoreProperties.txt 5 | -- - UCD\DerivedNormalizationProps.txt 6 | -- - UCD\EastAsianWidth.txt 7 | -- - UCD\emoji\emoji-data.txt 8 | -- - UCD\HangulSyllableType.txt 9 | -- - UCD\IndicSyllabicCategory.txt 10 | -- - UCD\PropList.txt 11 | -- - UCD\UnicodeData.txt 12 | -- 13 | -- files can be downloaded at: http://unicode.org/Public/UCD/latest/UCD/ 14 | 15 | 16 | local function parse_UnicodeData() 17 | -- UnicodeData.txt structions: 18 | -- 0. codepoint 19 | -- 1. name 20 | -- 2. general category 21 | -- 3. canonical combining class 22 | -- 4. bidi class 23 | -- 5. decomposition type/mapping 24 | -- 6. numeric type/value 25 | -- 7. numeric type/value 26 | -- 8. numeric type/value 27 | -- 9. bidi mirrored [YN] 28 | -- 10. old unicode name 29 | -- 11. iso comment 30 | -- 12. uppercase mapping 31 | -- 13. lowercase mapping 32 | -- 14. titlecase mapping 33 | local ucd = {} 34 | 35 | local patt = "^(%x+)"..(";([^;]-)"):rep(14).."$" 36 | 37 | local last_data 38 | 39 | for line in io.lines() do 40 | local cp, name, gc, canon_cls, bidi_class, decomposition, _,_,_, _, _,_, um, lm, tm = line:match(patt) 41 | assert(cp, line) 42 | cp = tonumber(cp, 16) 43 | lm = lm ~= "" and tonumber(lm, 16) 44 | um = um ~= "" and tonumber(um, 16) 45 | tm = tm ~= "" and tonumber(tm, 16) 46 | local decomp1, decomp2 = decomposition:match "^(%x+) (%x+)$" 47 | if decomp1 and decomp2 then 48 | decomposition = { tonumber(decomp1, 16), tonumber(decomp2, 16) } 49 | elseif decomposition:match("^%x+$") then 50 | decomposition = { tonumber(decomposition, 16) } 51 | else 52 | decomposition = nil 53 | end 54 | if last_data and last_data.name:match"First%>$" then 55 | assert(name:match"Last%>$", line) 56 | for i = last_data.cp, cp-1 do 57 | ucd[#ucd+1] = { 58 | cp = i, 59 | name = name, 60 | gc = gc, 61 | bidi_class = bidi_class, 62 | lm = lm, um = um, tm = tm, 63 | canon_cls = tonumber(canon_cls), 64 | decomposition = decomposition 65 | } 66 | end 67 | end 68 | local data = { 69 | cp = cp, 70 | name = name, 71 | gc = gc, 72 | bidi_class = bidi_class, 73 | lm = lm, um = um, tm = tm, 74 | canon_cls = tonumber(canon_cls), 75 | decomposition = decomposition 76 | } 77 | ucd[#ucd+1] = data 78 | last_data = data 79 | end 80 | table.sort(ucd, function(a, b) return a.cp < b.cp end) 81 | 82 | return ucd 83 | end 84 | 85 | local function parse_EastAsianWidth() 86 | local wide, ambi = {}, {} 87 | 88 | for line in io.lines() do 89 | line = line:gsub("%s*%#.*$", "") 90 | if line ~= "" then 91 | local first, last, mark 92 | first, mark = line:match "^(%x+)%s*%;%s*(%w+)$" 93 | if first then 94 | last = first 95 | else 96 | first, last, mark = line:match "^(%x+)%.%.(%x+)%s*%;%s*(%w+)$" 97 | assert(first, line) 98 | end 99 | 100 | first = tonumber(first, 16) 101 | last = tonumber(last, 16) 102 | 103 | if mark == 'W' or mark == 'F' then 104 | for i = first, last do 105 | wide[#wide+1] = i 106 | end 107 | elseif mark == 'A' then 108 | for i = first, last do 109 | ambi[#ambi+1] = i 110 | end 111 | end 112 | end 113 | end 114 | 115 | return wide, ambi 116 | end 117 | 118 | local function parse_CaseFolding() 119 | local mapping = {} 120 | for line in io.lines() do 121 | line = line:gsub("%s*%#.*$", "") 122 | if line ~= "" then 123 | local cp, class, mcp = line:match "^%s*(%x+)%s*;%s*(%w+)%s*;%s*(%x+)" 124 | assert(cp, line) 125 | if class == 'C' or class == 'S' then 126 | cp = tonumber(cp, 16) 127 | mcp = tonumber(mcp, 16) 128 | 
mapping[#mapping+1] = { cp = cp, mapping = mcp } 129 | end 130 | end 131 | end 132 | return mapping 133 | end 134 | 135 | local function parse_PropList(f) 136 | local ranges = {} 137 | local lookup = {} 138 | 139 | local arg = f 140 | if type(f) == 'table' then 141 | f = function(cp) return arg[cp] end 142 | elseif type(f) == 'string' then 143 | f = function(cp) return arg == cp end 144 | end 145 | 146 | for line in io.lines() do 147 | line = line:gsub("%s*%#.*$", "") 148 | if line ~= "" then 149 | local first, last, mark 150 | first, mark = line:match "^(%x+)%s*%;%s*([%w%s_;]+)%s*$" 151 | if first then 152 | last = first 153 | else 154 | first, last, mark = line:match "^(%x+)%.%.(%x+)%s*%;%s*([%w%s_;]+)%s*$" 155 | assert(first, line) 156 | end 157 | 158 | first = tonumber(first, 16) 159 | last = tonumber(last, 16) 160 | 161 | if f(mark) then 162 | for i = first, last do 163 | if not lookup[i] then 164 | lookup[i] = true 165 | ranges[#ranges+1] = i 166 | end 167 | end 168 | end 169 | end 170 | end 171 | 172 | table.sort(ranges) 173 | return ranges, lookup 174 | end 175 | 176 | local function parse_HangulSyllableType() 177 | local ranges = {} 178 | local lookup = {} 179 | 180 | for line in io.lines() do 181 | line = line:gsub("%s*%#.*$", "") 182 | if line ~= "" then 183 | local first, last, mark 184 | first, mark = line:match "^(%x+)%s*%;%s*([%w%s_;]+)%s*$" 185 | if first then 186 | last = first 187 | else 188 | first, last, mark = line:match "^(%x+)%.%.(%x+)%s*%;%s*([%w%s_;]+)%s*$" 189 | assert(first, line) 190 | end 191 | 192 | first = tonumber(first, 16) 193 | last = tonumber(last, 16) 194 | 195 | for i = first, last do 196 | if not lookup[i] then 197 | lookup[i] = true 198 | ranges[#ranges+1] = { cp=i, offset='HANGUL_'..mark } 199 | end 200 | end 201 | end 202 | end 203 | 204 | table.sort(ranges, function(a, b) return a.cp < b.cp end) 205 | return ranges 206 | end 207 | 208 | local function parse_NormalizationProps(prop, ucd) 209 | local codepoints = {} 210 | 211 | for line in io.lines() do 212 | local cps, property, tail = line:match "^([%x%.]+)%s*;%s*([%w%_]+)(.*)$" 213 | if property == prop then 214 | local value = tail:match "^%s*;%s*(%w+)" 215 | local from = cps:match "^%x+" 216 | local to = cps:match "%.%.(%x+)$" 217 | if not to then to = from end 218 | 219 | from = tonumber(from, 16) 220 | to = tonumber(to, 16) 221 | 222 | for cp = from, to, 1 do 223 | codepoints[#codepoints+1] = cp 224 | end 225 | end 226 | end 227 | 228 | table.sort(codepoints) 229 | return codepoints 230 | end 231 | 232 | local function get_ranges(list, func) 233 | local first, last, step, offset 234 | local ranges = {} 235 | for i = 1, #list do 236 | local v_cp, v_offset 237 | local v = list[i] 238 | local res = not func or func(v) 239 | if type(v) == 'number' then 240 | v_cp, v_offset = v, nil 241 | elseif v.cp then 242 | v_cp, v_offset = v.cp, v.offset 243 | end 244 | if res then 245 | if first and 246 | (not offset or offset == v_offset) and 247 | (not step or step == v_cp - last) then 248 | step = v_cp - last 249 | last = v_cp 250 | else 251 | if first then 252 | local r = { first = first, last = last, step = step, offset = offset } 253 | ranges[#ranges+1] = r 254 | end 255 | first, last, step = v_cp, v_cp, nil 256 | offset = v_offset 257 | end 258 | end 259 | end 260 | if first then 261 | local r = { first = first, last = last, step = step, offset = offset } 262 | ranges[#ranges+1] = r 263 | end 264 | return ranges 265 | end 266 | 267 | --[[ 268 | local function merge_ranges(...) 
269 | local ranges = {} 270 | local lookup = {} 271 | for i = 1, select('#', ...) do 272 | for _,v in ipairs(select(i, ...)) do 273 | if not lookup[v] then 274 | lookup[v] = true 275 | ranges[#ranges+1] = v 276 | end 277 | end 278 | end 279 | table.sort(ranges) 280 | return ranges 281 | end 282 | 283 | local function diff_ranges(base, sub, force) 284 | local ranges = {} 285 | local lookup = {} 286 | local missing = {} 287 | for _, v in ipairs(sub) do 288 | for i = v.first, v.last, v.step or 1 do 289 | lookup[i] = true 290 | missing[i] = true 291 | end 292 | end 293 | for _, v in ipairs(base) do 294 | for i = v.first, v.last, v.step or 1 do 295 | if not lookup[i] then 296 | ranges[#ranges+1] = i 297 | end 298 | missing[i] = nil 299 | end 300 | end 301 | if force and next(missing) then 302 | local m = {} 303 | for i in pairs(missing) do 304 | m[#m+1] = i 305 | end 306 | table.sort(m) 307 | for i, v in ipairs(m) do 308 | m[i] = ("%X"):format(v) 309 | end 310 | error(table.concat(m, "\n")) 311 | end 312 | return get_ranges(ranges) 313 | end 314 | --]] 315 | 316 | local function get_ucd(cp, ucd) 317 | local data = ucd[cp+1] 318 | if data.cp > cp then 319 | local i = cp 320 | while data.cp > cp do 321 | data = ucd[i] 322 | i = i - 1 323 | end 324 | end 325 | return data 326 | end 327 | 328 | local function write_ranges(name, ranges) 329 | io.write("static struct range_table "..name.."_table[] = {\n") 330 | for _, r in ipairs(ranges) do 331 | io.write((" { 0x%X, 0x%X, %d },\n"):format(r.first, r.last, r.step or 1)) 332 | end 333 | io.write "};\n\n" 334 | end 335 | 336 | local function write_convtable(name, conv) 337 | io.write("static struct conv_table "..name.."_table[] = {\n") 338 | for _, c in ipairs(conv) do 339 | io.write((" { 0x%X, 0x%X, %d, %d },\n"):format( 340 | c.first, c.last, c.step or 1, c.offset)) 341 | end 342 | io.write "};\n\n" 343 | end 344 | 345 | local function write_canon_cls_table(name, ucd) 346 | io.write("static struct canon_cls_table "..name.."_table[] = {\n") 347 | local start, prev = { canon_cls=0 }, { canon_cls=0 } 348 | for _, data in ipairs(ucd) do 349 | if data.canon_cls ~= prev.canon_cls then 350 | if prev.canon_cls ~= 0 then 351 | io.write((" { 0x%X, 0x%X, %d },\n"):format(start.cp, prev.cp, prev.canon_cls)) 352 | end 353 | start = data 354 | end 355 | prev = data 356 | end 357 | if prev.canon_cls ~= 0 then 358 | io.write((" { 0x%X, 0x%X, %d },\n"):format(start.cp, prev.cp, prev.canon_cls)) 359 | end 360 | io.write "};\n\n" 361 | end 362 | 363 | local function write_combine_table(name, tbl) 364 | local function hash(cp1, cp2) 365 | return (cp1 * 213) + cp2 366 | end 367 | local dup = {} 368 | for _, c in ipairs(tbl) do 369 | local cp1, cp2 = table.unpack(c.decomposition) 370 | if dup[hash(cp1, cp2)] then 371 | local conflicting = dup[hash(cp1, cp2)] 372 | local cp3, cp4 = table.unpack(conflicting.decomposition) 373 | error("Hash collision: "..string.format("%x %x -> %x, %x %x -> %x", cp3, cp4, hash(cp3, cp4), cp1, cp2, hash(cp1, cp2))) 374 | end 375 | dup[hash(cp1, cp2)] = c 376 | end 377 | table.sort(tbl, function(a,b) 378 | return hash(table.unpack(a.decomposition)) < hash(table.unpack(b.decomposition)) 379 | end) 380 | 381 | io.write("static struct combine_table "..name.."_table[] = {\n") 382 | for _, c in ipairs(tbl) do 383 | local cp1, cp2 = table.unpack(c.decomposition) 384 | io.write((" { 0x%X, 0x%X, 0x%X, 0x%X },\n"):format(hash(cp1, cp2), cp1, cp2, c.cp)) 385 | end 386 | io.write "};\n\n" 387 | end 388 | 389 | local function write_decompose_table(name, 
tbl, ucd) 390 | table.sort(tbl, function(a,b) 391 | return a.cp < b.cp 392 | end) 393 | io.write("static struct decompose_table "..name.."_table[] = {\n") 394 | for _, c in ipairs(tbl) do 395 | local cp1, cp2 = table.unpack(c.decomposition) 396 | local data = get_ucd(cp2, ucd) 397 | io.write((" { 0x%X, 0x%X, 0x%X, %d },\n"):format(c.cp, cp1, cp2, data.canon_cls)) 398 | end 399 | io.write "};\n\n" 400 | end 401 | 402 | local function write_type_table(name, conv) 403 | io.write("static struct type_table "..name.."_table[] = {\n") 404 | for _, c in ipairs(conv) do 405 | if c.step and c.step ~= 1 then 406 | local i = c.first 407 | while i <= c.last do 408 | io.write((" { 0x%X, 0x%X, %s },\n"):format(i, i, c.offset)) 409 | i = i + c.step 410 | end 411 | else 412 | io.write((" { 0x%X, 0x%X, %s },\n"):format(c.first, c.last, c.offset)) 413 | end 414 | end 415 | io.write "};\n\n" 416 | end 417 | 418 | 419 | io.output "unidata.h" 420 | 421 | io.write [[ 422 | /* 423 | * unidata.h - generated by parseucd.lua 424 | */ 425 | #ifndef unidata_h 426 | #define unidata_h 427 | 428 | #ifndef utfint 429 | # define utfint utfint 430 | typedef unsigned int utfint; 431 | #endif 432 | 433 | typedef struct range_table { 434 | utfint first; 435 | utfint last; 436 | int step; 437 | } range_table; 438 | 439 | typedef struct conv_table { 440 | utfint first; 441 | utfint last; 442 | int step; 443 | int offset; 444 | } conv_table; 445 | 446 | typedef struct nfc_table { 447 | utfint cp; 448 | int reason; 449 | unsigned int data1; 450 | unsigned int data2; 451 | } nfc_table; 452 | 453 | #define REASON_MUST_CONVERT_1 1 454 | #define REASON_MUST_CONVERT_2 2 455 | #define REASON_STARTER_CAN_COMBINE 3 456 | #define REASON_COMBINING_MARK 4 457 | #define REASON_JAMO_VOWEL 5 458 | #define REASON_JAMO_TRAILING 6 459 | 460 | typedef struct canon_cls_table { 461 | utfint first; 462 | utfint last; 463 | unsigned int canon_cls; 464 | } canon_cls_table; 465 | 466 | typedef struct combine_table { 467 | utfint hash; 468 | utfint cp1; 469 | utfint cp2; 470 | utfint dest; 471 | } combine_table; 472 | 473 | typedef struct decompose_table { 474 | utfint cp; 475 | utfint to1; 476 | utfint to2; 477 | unsigned int canon_cls2; 478 | } decompose_table; 479 | 480 | #define HANGUL_L 1 481 | #define HANGUL_V 2 482 | #define HANGUL_T 3 483 | #define HANGUL_LV 4 484 | #define HANGUL_LVT 5 485 | 486 | typedef struct type_table { 487 | utfint first; 488 | utfint last; 489 | int type; 490 | } type_table; 491 | 492 | #define INDIC_CONSONANT 1 493 | #define INDIC_LINKER 2 494 | #define INDIC_EXTEND 3 495 | 496 | ]] 497 | 498 | do 499 | local function ranges(name, f) 500 | local r = get_ranges((parse_PropList(f))) 501 | write_ranges(name, r) 502 | end 503 | 504 | io.input "UCD/DerivedCoreProperties.txt" 505 | ranges("alpha", "Alphabetic") 506 | 507 | io.input "UCD/DerivedCoreProperties.txt" 508 | ranges("lower", "Lowercase") 509 | 510 | io.input "UCD/DerivedCoreProperties.txt" 511 | ranges("upper", "Uppercase") 512 | 513 | io.input "UCD/PropList.txt" 514 | ranges("xdigit", "Hex_Digit") 515 | 516 | io.input "UCD/PropList.txt" 517 | ranges("space", "White_Space") 518 | 519 | io.input "UCD/DerivedCoreProperties.txt" 520 | ranges("unprintable", "Default_Ignorable_Code_Point") 521 | 522 | io.input "UCD/DerivedCoreProperties.txt" 523 | ranges("graph", "Grapheme_Base") 524 | 525 | io.input "UCD/DerivedCoreProperties.txt" 526 | ranges("compose", "Grapheme_Extend") 527 | 528 | io.input "UCD/emoji/emoji-data.txt" 529 | ranges("pictographic", 
"Extended_Pictographic") 530 | end 531 | 532 | do 533 | io.input "UCD/PropList.txt" 534 | local prepend = parse_PropList("Prepended_Concatenation_Mark") 535 | io.input "UCD/IndicSyllabicCategory.txt" 536 | local indic = parse_PropList({ Consonant_Preceding_Repha=true, Consonant_Prefixed=true }) 537 | for _,cp in ipairs(indic) do 538 | table.insert(prepend, cp) 539 | end 540 | table.sort(prepend) 541 | write_ranges("prepend", get_ranges(prepend)) 542 | end 543 | 544 | do 545 | io.input "UCD/DerivedCoreProperties.txt" 546 | local linker = parse_PropList("InCB; Linker") 547 | io.input "UCD/DerivedCoreProperties.txt" 548 | local consonant = parse_PropList("InCB; Consonant") 549 | io.input "UCD/DerivedCoreProperties.txt" 550 | local extend = parse_PropList("InCB; Extend") 551 | local indic_type = {} 552 | for _,cp in ipairs(consonant) do table.insert(indic_type, { cp=cp, offset='INDIC_CONSONANT' }) end 553 | for _,cp in ipairs(linker) do table.insert(indic_type, { cp=cp, offset='INDIC_LINKER' }) end 554 | for _,cp in ipairs(extend) do table.insert(indic_type, { cp=cp, offset='INDIC_EXTEND' }) end 555 | table.sort(indic_type, function(a, b) return a.cp < b.cp end) 556 | write_type_table("indic", get_ranges(indic_type)) 557 | end 558 | 559 | do 560 | io.input "UCD/UnicodeData.txt" 561 | local ucd = parse_UnicodeData() 562 | local function set(s) 563 | local hasht = {} 564 | for word in s:gmatch "%w%w" do 565 | hasht[word] = true 566 | end 567 | return function(data) 568 | return hasht[data.gc] 569 | end 570 | end 571 | local function mapping(field) 572 | return function(data) 573 | data.offset = nil 574 | if data[field] then 575 | data.offset = data[field] - data.cp 576 | return true 577 | end 578 | end 579 | end 580 | local cntrl = "Cc Cf" 581 | local digit = "Nd" 582 | local alnum_extend = "Nd Nl No" 583 | local punct = "Sk Sc Sm Pc Pd Ps Pe Pi Pf Po" 584 | local spacing_mark = "Mc" 585 | write_ranges("cntrl", get_ranges(ucd, set(cntrl))) 586 | write_ranges("digit", get_ranges(ucd, set(digit))) 587 | write_ranges("alnum_extend", get_ranges(ucd, set(alnum_extend))) 588 | write_ranges("punct", get_ranges(ucd, set(punct))) 589 | write_ranges("spacing_mark", get_ranges(ucd, set(spacing_mark))) 590 | write_convtable("tolower", get_ranges(ucd, mapping "lm")) 591 | write_convtable("toupper", get_ranges(ucd, mapping "um")) 592 | write_convtable("totitle", get_ranges(ucd, mapping "tm")) 593 | end 594 | 595 | do 596 | io.input "UCD/CaseFolding.txt" 597 | local mapping = parse_CaseFolding() 598 | write_convtable("tofold", get_ranges(mapping, function(data) 599 | data.offset = data.mapping - data.cp 600 | return true 601 | end)) 602 | end 603 | 604 | do 605 | io.input "UCD/EastAsianWidth.txt" 606 | local wide, ambi = parse_EastAsianWidth() 607 | write_ranges("doublewidth", get_ranges(wide)) 608 | write_ranges("ambiwidth", get_ranges(ambi)) 609 | end 610 | 611 | do 612 | io.input "UCD/HangulSyllableType.txt" 613 | write_type_table("hangul", (get_ranges(parse_HangulSyllableType()))) 614 | end 615 | 616 | do 617 | io.input "UCD/UnicodeData.txt" 618 | local ucd = parse_UnicodeData() 619 | 620 | -- Write out table of all combining marks 621 | write_canon_cls_table("nfc_combining", ucd) 622 | 623 | -- Find all primary composites which we may need to consider during NFC normalization 624 | io.input "UCD/DerivedNormalizationProps.txt" 625 | local excluded = {} 626 | for _, cp in ipairs(parse_NormalizationProps('Full_Composition_Exclusion')) do 627 | excluded[cp] = true 628 | end 629 | local composite, can_combine 
= {}, {} 630 | for _, data in ipairs(ucd) do 631 | local decomp = data.decomposition 632 | if not excluded[data.cp] and decomp and #decomp == 2 then 633 | table.insert(composite, data) 634 | can_combine[decomp[2]] = true 635 | end 636 | end 637 | write_combine_table("nfc_composite", composite) 638 | write_decompose_table("nfc_decompose", composite, ucd) 639 | 640 | io.write("static struct nfc_table nfc_quickcheck_table[] = {\n") 641 | 642 | io.input "UCD/DerivedNormalizationProps.txt" 643 | for _, cp in ipairs(parse_NormalizationProps('NFC_QC', ucd)) do 644 | local data = get_ucd(cp, ucd) 645 | local decomp = data.decomposition 646 | if decomp then 647 | if #decomp == 1 then 648 | local decomp_data = get_ucd(decomp[1], ucd) 649 | io.write((" { 0x%X, REASON_MUST_CONVERT_1, 0x%X, %d },\n"):format(data.cp, decomp[1], decomp_data.canon_cls)) 650 | else 651 | io.write((" { 0x%X, REASON_MUST_CONVERT_2, 0x%X, 0x%X },\n"):format(data.cp, decomp[1], decomp[2])) 652 | end 653 | elseif data.canon_cls ~= 0 then 654 | io.write((" { 0x%X, REASON_COMBINING_MARK, 0, 0 },\n"):format(data.cp)) 655 | elseif can_combine[data.cp] then 656 | io.write((" { 0x%X, REASON_STARTER_CAN_COMBINE, 0, 0 },\n"):format(data.cp)) 657 | elseif data.cp >= 0x1161 and data.cp <= 0x1175 then 658 | io.write((" { 0x%X, REASON_JAMO_VOWEL, 0, 0 },\n"):format(data.cp)) 659 | elseif data.cp >= 0x11A8 and data.cp <= 0x11C2 then 660 | io.write((" { 0x%X, REASON_JAMO_TRAILING, 0, 0 },\n"):format(data.cp)) 661 | else 662 | error("Don't know why we need to check for codepoint "..string.format("0x%x", data.cp).." when doing NFC normalization") 663 | end 664 | end 665 | 666 | io.write "};\n\n" 667 | end 668 | 669 | io.write "#endif /* unidata_h */\n" 670 | -------------------------------------------------------------------------------- /rockspecs/luautf8-0.1.6-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "luautf8" 2 | version = "0.1.6-1" 3 | source = { 4 | url = "https://github.com/starwing/luautf8/archive/refs/tags/0.1.6.tar.gz", 5 | dir = "luautf8-0.1.6" 6 | } 7 | description = { 8 | summary = "A UTF-8 support module for Lua", 9 | detailed = [[ 10 | This module adds UTF-8 support to Lua. It's compatible with Lua "string" module. 11 | ]], 12 | homepage = "http://github.com/starwing/luautf8", 13 | license = "MIT" 14 | } 15 | dependencies = { 16 | "lua >= 5.1" 17 | } 18 | build = { 19 | type = "builtin", 20 | modules = { 21 | ["lua-utf8"] = "lutf8lib.c" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /rockspecs/luautf8-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "luautf8" 2 | version = "scm-1" 3 | source = { 4 | url = "git://github.com/starwing/luautf8" 5 | } 6 | description = { 7 | summary = "A UTF-8 support module for Lua", 8 | detailed = [[ 9 | This module adds UTF-8 support to Lua. It's compatible with Lua "string" module. 
10 | ]], 11 | homepage = "http://github.com/starwing/luautf8", 12 | license = "MIT" 13 | } 14 | dependencies = { 15 | "lua >= 5.1" 16 | } 17 | build = { 18 | type = "builtin", 19 | modules = { 20 | ["lua-utf8"] = "lutf8lib.c" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /test.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require 'lua-utf8' 2 | local unpack = unpack or table.unpack 3 | local E = utf8.escape 4 | 5 | local function get_codes(s) 6 | return table.concat({utf8.byte(s, 1, -1)}, ' ') 7 | end 8 | 9 | local t = { 20985, 20984, 26364, 25171, 23567, 24618, 20861 } 10 | -- test escape & len 11 | assert(get_codes(E"%123%xabc%x{ABC}%d%u{456}") == '123 2748 2748 100 456') 12 | 13 | local s = E('%'..table.concat(t, '%')) 14 | assert(utf8.len(s) == 7) 15 | assert(get_codes(s) == table.concat(t, ' ')) 16 | 17 | 18 | -- test offset 19 | 20 | local function assert_error(f, msg) 21 | local s,e = pcall(f) 22 | return assert(not s and e:match(msg)) 23 | end 24 | 25 | assert(utf8.offset("中国", 0) == 1) 26 | assert(utf8.offset("中国", 0,1) == 1) 27 | assert(utf8.offset("中国", 0,2) == 1) 28 | assert(utf8.offset("中国", 0,3) == 1) 29 | assert(utf8.offset("中国", 0,4) == 4) 30 | assert(utf8.offset("中国", 0,5) == 4) 31 | assert(utf8.offset("中国", 1) == 1) 32 | assert_error(function() utf8.offset("中国", 1,2) end, 33 | "initial position is a continuation byte") 34 | assert(utf8.offset("中国", 2) == 4) 35 | assert(utf8.offset("中国", 3) == 7) 36 | assert(utf8.offset("中国", 4) == nil) 37 | assert(utf8.offset("中国", -1,-3) == 1) 38 | assert(utf8.offset("中国", -1,1) == nil) 39 | 40 | -- test byte 41 | local function assert_table_equal(t1, t2, i, j) 42 | i = i or 1 43 | j = j or #t2 44 | local len = j-i+1 45 | assert(#t1 == len) 46 | for cur = 1, len do 47 | assert(t1[cur] == t2[cur+i-1]) 48 | end 49 | end 50 | assert_table_equal({utf8.byte(s, 2)}, t, 2, 2) 51 | assert_table_equal({utf8.byte(s, 1, -1)}, t) 52 | assert_table_equal({utf8.byte(s, -100)}, {}) 53 | assert_table_equal({utf8.byte(s, -100, -200)}, {}) 54 | assert_table_equal({utf8.byte(s, -200, -100)}, {}) 55 | assert_table_equal({utf8.byte(s, 100)}, {}) 56 | assert_table_equal({utf8.byte(s, 100, 200)}, {}) 57 | assert_table_equal({utf8.byte(s, 200, 100)}, {}) 58 | 59 | 60 | -- test char 61 | assert(s == utf8.char(unpack(t))) 62 | 63 | -- test range 64 | for i = 1, #t do 65 | assert(utf8.byte(s, i) == t[i]) 66 | end 67 | 68 | -- test sub 69 | assert(get_codes(utf8.sub(s, 2, -2)) == table.concat(t, ' ', 2, #t-1)) 70 | assert(get_codes(utf8.sub(s, -100)) == table.concat(t, ' ')) 71 | assert(get_codes(utf8.sub(s, -100, -200)) == "") 72 | assert(get_codes(utf8.sub(s, -100, -100)) == "") 73 | assert(get_codes(utf8.sub(s, -100, 0)) == "") 74 | assert(get_codes(utf8.sub(s, -200, -100)) == "") 75 | assert(get_codes(utf8.sub(s, 100, 200)) == "") 76 | assert(get_codes(utf8.sub(s, 200, 100)) == "") 77 | 78 | 79 | -- test insert/remove 80 | assert(utf8.insert("abcdef", "...") == "abcdef...") 81 | assert(utf8.insert("abcdef", 0, "...") == "abcdef...") 82 | assert(utf8.insert("abcdef", 1, "...") == "...abcdef") 83 | assert(utf8.insert("abcdef", 6, "...") == "abcde...f") 84 | assert(utf8.insert("abcdef", 7, "...") == "abcdef...") 85 | assert(utf8.insert("abcdef", 3, "...") == "ab...cdef") 86 | assert(utf8.insert("abcdef", -3, "...") == "abc...def") 87 | assert(utf8.remove("abcdef", 3, 3) == "abdef") 88 | assert(utf8.remove("abcdef", 3, 4) == "abef") 89 | 
assert(utf8.remove("abcdef", 4, 3) == "abcdef") 90 | assert(utf8.remove("abcdef", -3, -3) == "abcef") 91 | assert(utf8.remove("abcdef", 100) == "abcdef") 92 | assert(utf8.remove("abcdef", -100) == "") 93 | assert(utf8.remove("abcdef", -100, 0) == "abcdef") 94 | assert(utf8.remove("abcdef", -100, -200) == "abcdef") 95 | assert(utf8.remove("abcdef", -200, -100) == "abcdef") 96 | assert(utf8.remove("abcdef", 100, 200) == "abcdef") 97 | assert(utf8.remove("abcdef", 200, 100) == "abcdef") 98 | 99 | do 100 | local s = E"a%255bc" 101 | assert(utf8.len(s, 4)) 102 | assert(string.len(s, 6)) 103 | assert(utf8.charpos(s) == 1) 104 | assert(utf8.charpos(s, 0) == 1) 105 | assert(utf8.charpos(s, 1) == 1) 106 | assert(utf8.charpos(s, 2) == 2) 107 | assert(utf8.charpos(s, 3) == 4) 108 | assert(utf8.charpos(s, 4) == 5) 109 | assert(utf8.charpos(s, 5) == nil) 110 | assert(utf8.charpos(s, 6) == nil) 111 | assert(utf8.charpos(s, -1) == 5) 112 | assert(utf8.charpos(s, -2) == 4) 113 | assert(utf8.charpos(s, -3) == 2) 114 | assert(utf8.charpos(s, -4) == 1) 115 | assert(utf8.charpos(s, -5) == nil) 116 | assert(utf8.charpos(s, -6) == nil) 117 | assert(utf8.charpos(s, 3, -1) == 2) 118 | assert(utf8.charpos(s, 3, 0) == 2) 119 | assert(utf8.charpos(s, 3, 1) == 4) 120 | assert(utf8.charpos(s, 6, -3) == 2) 121 | assert(utf8.charpos(s, 6, -4) == 1) 122 | assert(utf8.charpos(s, 6, -5) == nil) 123 | end 124 | 125 | local idx = 1 126 | for pos, code in utf8.next, s do 127 | assert(t[idx] == code) 128 | idx = idx + 1 129 | end 130 | 131 | assert(utf8.ncasecmp("abc", "AbC") == 0) 132 | assert(utf8.ncasecmp("abc", "AbE") == -1) 133 | assert(utf8.ncasecmp("abe", "AbC") == 1) 134 | assert(utf8.ncasecmp("abc", "abcdef") == -1) 135 | assert(utf8.ncasecmp("abcdef", "abc") == 1) 136 | assert(utf8.ncasecmp("abZdef", "abcZef") == 1) 137 | 138 | assert(utf8.gsub("x^[]+$", "%p", "%%%0") == "x%^%[%]%+%$") 139 | 140 | 141 | -- test invalid 142 | 143 | -- 1110-1010 10-000000 0110-0001 144 | do 145 | local s = "\234\128\97" 146 | assert(utf8.len(s, nil, nil, true) == 2) 147 | assert_table_equal({utf8.len(s)}, {nil, 1}, 1, 2) 148 | 149 | -- 1111-0000 10-000000 10-000000 ... 150 | s = "\240\128\128\128\128" 151 | assert_table_equal({utf8.len(s)}, {nil, 1}, 1, 2) 152 | end 153 | 154 | 155 | -- test compose 156 | local function assert_fail(f, patt) 157 | local ok, msg = pcall(f) 158 | assert(not ok) 159 | assert(msg:match(patt), msg) 160 | end 161 | do 162 | local s = "नमस्ते" 163 | assert(utf8.len(s) == 6) 164 | assert(utf8.reverse(s) == "तेस्मन") 165 | assert(utf8.reverse(s.." 
", true) == " ेत्समन") 166 | assert(utf8.match(s..'\2', "%g+") == s) 167 | assert_fail(function() utf8.reverse(E"%xD800") end, "invalid UTF%-8 code") 168 | end 169 | 170 | 171 | -- test match 172 | assert(utf8.match('%c', '') == nil) -- %c does not match U+F000 173 | 174 | 175 | -- test codepoint 176 | for i = 1, 1000 do 177 | assert(utf8.codepoint(E("%"..i)) == i) 178 | end 179 | assert_fail(function() utf8.codepoint(E"%xD800") end, "invalid UTF%-8 code") 180 | 181 | -- test escape 182 | assert_fail(function() E"%{1a1}" end, "invalid escape 'a'") 183 | 184 | 185 | -- test codes 186 | local result = { [1] = 20985; [4] = 20984; [7] = 26364; 187 | [10] = 25171; [13] = 23567; [16] = 24618; [19] = 20861; } 188 | for p, c in utf8.codes(s) do 189 | assert(result[p] == c) 190 | end 191 | for p, c in utf8.codes(s, true) do 192 | assert(result[p] == c) 193 | end 194 | assert_fail(function() 195 | for p, c in utf8.codes(E"%xD800") do 196 | assert(result[p] == c) 197 | end 198 | end, "invalid UTF%-8 code") 199 | 200 | 201 | -- test width 202 | assert(utf8.width('नमस्ते\2') == 5) 203 | assert(utf8.width(E'%xA1') == 1) 204 | assert(utf8.width(E'%xA1', 2) == 2) 205 | assert(utf8.width(E'%x61C') == 0) 206 | assert(utf8.width "A" == 1) 207 | assert(utf8.width "A" == 2) 208 | assert(utf8.width(97) == 1) 209 | assert(utf8.width(65313) == 2) 210 | assert_fail(function() utf8.width(true) end, "number/string expected, got boolean") 211 | assert(utf8.widthindex("abcdef", 3) == 3) 212 | assert(utf8.widthindex("abcdef", 7) == 7) 213 | 214 | -- test patterns 215 | assert_fail(function() utf8.gsub("a", ".", function() return {} end) end, 216 | "invalid replacement value %(a table%)") 217 | assert_fail(function() utf8.gsub("a", ".", "%z") end, 218 | "invalid use of '%%' in replacement string") 219 | assert(utf8.find("abcabc", "ab", -10) == 1) 220 | 221 | -- test charpattern 222 | do 223 | local subj, n = "school=школа", 0 224 | for c in string.gmatch(subj, utf8.charpattern) do n = n+1 end 225 | assert(n == utf8.len(subj)) 226 | end 227 | 228 | 229 | -- test isvalid 230 | local good_strings = { 231 | '', 232 | 'A', 233 | 'abcdefghijklmnopqrstuvwxyz', 234 | "``", 235 | "@", 236 | 'नमस्ते', 237 | '中国', 238 | '日本語01234567890。', 239 | 'ひらがな', 240 | 'Καλημέρα', 241 | 'АБВГ', 242 | '⡌⠁⠧⠑ ⠼', 243 | '∑ f(i)', 244 | 'Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς', 245 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789 abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿАБВГДабвгд∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣', 246 | 'გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს', 247 | '\000' -- NUL is valid in UTF-8 248 | } 249 | 250 | for _, good in ipairs(good_strings) do 251 | assert(utf8.isvalid(good)) 252 | end 253 | 254 | assert(not utf8.isvalid("\255")) -- illegal byte 0xFF 255 | assert(not utf8.isvalid("abc\254def")) -- illegal byte 0xFE 256 | 257 | assert(not utf8.isvalid("123 \223")) -- truncated code unit 0xDF 258 | assert(not utf8.isvalid("123 \239\191")) -- truncated code unit 0xEF BF 259 | assert(not utf8.isvalid("123 \240\191")) -- truncated code unit 0xF0 BF 260 | assert(not utf8.isvalid("123 \240\191\191")) -- truncated code unit 0xF0 BF BF 261 | 262 | assert(not utf8.isvalid('\223ABC')) -- code unit 0xDF ended too soon and went to ASCII 263 | assert(not utf8.isvalid('\239\191ABC')) -- code unit 0xEF BF ended too soon and went to ASCII 264 | assert(not utf8.isvalid('\240\191ABC')) -- code unit 0xF0 BF ended too soon and 
went to ASCII 265 | assert(not utf8.isvalid('\240\191\191ABC')) -- code unit 0xF0 BF BF ended too soon and went to ASCII 266 | 267 | assert(not utf8.isvalid('\223中')) -- code unit 0xDF ended too soon and went to another multi-byte char 268 | assert(not utf8.isvalid('\239\191中')) -- code unit 0xEF BF ended too soon and went to another multi-byte char 269 | assert(not utf8.isvalid('\240\191中')) -- code unit 0xF0 BF ended too soon and went to another multi-byte char 270 | assert(not utf8.isvalid('\240\191\191中')) -- code unit 0xF0 BF BF ended too soon and went to another multi-byte char 271 | 272 | assert(utf8.isvalid('\237\159\191')) -- U+D7FF is valid 273 | assert(not utf8.isvalid('\237\160\128')) -- U+D800; reserved for UTF-16 surrogate 274 | assert(not utf8.isvalid('\237\175\191')) -- U+DBFF; reserved for UTF-16 surrogate 275 | assert(not utf8.isvalid('\237\191\191')) -- U+DFFF; reserved for UTF-16 surrogate 276 | assert(utf8.isvalid('\238\128\128')) -- U+E000 is valid 277 | 278 | assert(utf8.isvalid('\244\143\191\191')) -- U+10FFFF is valid 279 | assert(not utf8.isvalid('\244\144\128\128')) -- U+110000 is not valid 280 | assert(not utf8.isvalid('\247\191\191\191')) -- U+1FFFFF is not valid 281 | 282 | assert(not utf8.isvalid('\128')) -- continuation byte outside a multi-byte char 283 | assert(not utf8.isvalid('A\128A')) -- continuation byte outside a multi-byte char 284 | assert(not utf8.isvalid('中\128')) -- continuation byte outside a multi-byte char 285 | 286 | assert(not utf8.isvalid('\193\191')) -- overlong code unit 287 | assert(not utf8.isvalid('\224\159\191')) -- overlong code unit 288 | assert(not utf8.isvalid('\240\143\191\191')) -- overlong code unit 289 | 290 | -- test clean 291 | local cleaned, was_clean 292 | 293 | for _, good in ipairs(good_strings) do 294 | cleaned, was_clean = utf8.clean(good) 295 | assert(cleaned == good) 296 | assert(was_clean) 297 | end 298 | 299 | cleaned, was_clean = utf8.clean('A\128A') 300 | assert(cleaned == 'A�A') 301 | assert(not was_clean) 302 | 303 | cleaned, was_clean = utf8.clean('\128') 304 | assert(cleaned == '�') 305 | assert(not was_clean) 306 | 307 | cleaned, was_clean = utf8.clean('1\193\1912\224\159\1913\240\143\191\191', '???') 308 | assert(cleaned == '1???2???3???') 309 | assert(not was_clean) 310 | 311 | cleaned, was_clean = utf8.clean('\237\160\128\237\175\191\237\191\191') 312 | assert(cleaned == '�') -- an entire sequence of bad bytes just gets replaced with one replacement char 313 | assert(not was_clean) 314 | 315 | cleaned, was_clean = utf8.clean('123 \223', '') 316 | assert(cleaned == '123 ') 317 | assert(not was_clean) 318 | 319 | cleaned, was_clean = utf8.clean('\239\191中', '') 320 | assert(cleaned == '中') 321 | assert(not was_clean) 322 | 323 | assert_error(function() utf8.clean('abc', '\255') end, "replacement string must be valid UTF%-8") 324 | 325 | 326 | -- test invalidoffset 327 | for _, good in ipairs(good_strings) do 328 | assert(utf8.invalidoffset(good) == nil) 329 | end 330 | 331 | assert(utf8.invalidoffset("\255") == 1) 332 | assert(utf8.invalidoffset("\255", 0) == 1) 333 | assert(utf8.invalidoffset("\255", 1) == 1) 334 | assert(utf8.invalidoffset("\255", 2) == nil) 335 | assert(utf8.invalidoffset("\255", -1) == 1) 336 | assert(utf8.invalidoffset("\255", -2) == 1) 337 | assert(utf8.invalidoffset("\255", -3) == 1) 338 | 339 | assert(utf8.invalidoffset("abc\254def") == 4) 340 | assert(utf8.invalidoffset("abc\254def", 0) == 4) 341 | assert(utf8.invalidoffset("abc\254def", 1) == 4) 342 | 
assert(utf8.invalidoffset("abc\254def", 2) == 4) 343 | assert(utf8.invalidoffset("abc\254def", 3) == 4) 344 | assert(utf8.invalidoffset("abc\254def", 4) == 4) 345 | assert(utf8.invalidoffset("abc\254def", 5) == nil) 346 | assert(utf8.invalidoffset("abc\254def", 6) == nil) 347 | assert(utf8.invalidoffset("abc\254def", -1) == nil) 348 | assert(utf8.invalidoffset("abc\254def", -2) == nil) 349 | assert(utf8.invalidoffset("abc\254def", -3) == nil) 350 | assert(utf8.invalidoffset("abc\254def", -4) == 4) 351 | assert(utf8.invalidoffset("abc\254def", -5) == 4) 352 | 353 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 0) == 1) 354 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 1) == 1) 355 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 2) == 2) 356 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 3) == 3) 357 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 4) == 4) 358 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 5) == 5) 359 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 6) == 6) 360 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', -1) == 9) 361 | 362 | 363 | local function parse_codepoints(s) 364 | local list = {} 365 | for hex in s:gmatch("%w+") do 366 | list[#list+1] = tonumber(hex, 16) 367 | end 368 | return utf8.char(unpack(list)) 369 | end 370 | 371 | -- This is an official set of test cases for Unicode normalization 372 | -- Provided by the Unicode Consortium 373 | local normalization_test_cases = {} 374 | local f = io.open('NormalizationTest.txt', 'r') 375 | for line in f:lines() do 376 | if not line:match("^#") and not line:match("^@") then 377 | local src, nfc, nfd = line:match "([%w%s]+);([%w%s]+);([%w%s]+)" 378 | table.insert(normalization_test_cases, { src = parse_codepoints(src), nfc = parse_codepoints(nfc), nfd = parse_codepoints(nfd) }) 379 | end 380 | end 381 | 382 | 383 | -- test isnfc 384 | for _,case in ipairs(normalization_test_cases) do 385 | assert(utf8.isnfc(case.nfc)) 386 | if case.src ~= case.nfc then 387 | assert(not utf8.isnfc(case.src)) 388 | end 389 | if case.nfd ~= case.nfc and case.nfd ~= case.src then 390 | assert(not utf8.isnfc(case.nfd)) 391 | end 392 | end 393 | 394 | -- Regression tests: 395 | -- Although U+1100-115F are all leading Jamo (Korean characters), for some reason, 396 | -- the normalization algorithm only combines U+1100-1112 with a following vowel Jamo 397 | assert(utf8.isnfc("\225\133\133\225\133\163")) 398 | -- In certain cases, we did not properly check if a combining mark was blocked from 399 | -- combining with the preceding starter codepoint (by another combining mark with 400 | -- the same canonicalization class) 401 | assert(utf8.isnfc("\196\148\204\162\204\167")) 402 | -- It is possible that a codepoint which is composed from a starter and combining mark 403 | -- might be decomposed, then the resulting starter might be decomposed AGAIN, then 404 | -- those two resulting combining marks might be reordered with a following combining 405 | -- mark 406 | assert(not utf8.isnfc("\199\154\204\164")) 407 | 408 | -- test normalize_nfc 409 | for _,case in ipairs(normalization_test_cases) do 410 | assert(utf8.normalize_nfc(case.src) == case.nfc) 411 | assert(utf8.normalize_nfc(case.nfc) == case.nfc) 412 | assert(utf8.normalize_nfc(case.nfd) == case.nfc) 413 | end 414 | 415 | -- Regression tests: 416 | -- Long series of combining marks; these need to be sorted in canonical order 417 | 
assert(utf8.normalize_nfc("\215\129\215\133\215\133\215\129\215\129\215\129\215\129\215\129\215\129") == "\215\129\215\129\215\129\215\129\215\129\215\129\215\129\215\133\215\133") 418 | -- After converting combining marks to standard codepoints, it is possible their canonicalization class may change 419 | -- If so, make sure they are still put in the correct order 420 | assert(utf8.normalize_nfc("\200\135\204\163\204\169") == "\225\186\185\204\169\204\145") 421 | -- This test case caused an out-of-bounds read where my code tried to sort an empty array 422 | assert(utf8.normalize_nfc("\225\190\129\204\129") == "\225\190\133") 423 | -- After converting one codepoint to two, as required by the NFC normalization tables, 424 | -- if the 2nd resulting codepoint is a combining mark, we have to be ready to re-order 425 | -- it with any following combining marks 426 | assert(utf8.normalize_nfc("\224\165\152\204\184") == "\224\164\149\204\184\224\164\188") 427 | -- It can also happen that a codepoint converts to a starter followed by TWO combining marks, 428 | -- and we must be able to reorder BOTH of those combining marks with a following combining mark 429 | assert(utf8.normalize_nfc("\239\172\172\204\184") == "\215\169\204\184\214\188\215\129") 430 | -- It can even happen that a deprecated 'starter' codepoint (canonicalization class = 0) 431 | -- can convert to 'combining mark' codepoints (canonicalization class != 0) 432 | assert(utf8.normalize_nfc("\223\179\224\189\179") == "\224\189\177\224\189\178\223\179") 433 | -- In certain cases, we did not properly check if a combining mark was blocked from 434 | -- combining with the preceding starter codepoint (by another combining mark with 435 | -- the same canonicalization class) 436 | assert(utf8.normalize_nfc("\196\148\204\162\204\167") == "\196\148\204\162\204\167") 437 | assert(utf8.normalize_nfc("\200\148\204\160\204\148\204\164") == "\200\148\204\160\204\164\204\148") 438 | -- It is possible that a codepoint which is composed from a starter and combining mark 439 | -- might be decomposed, then the resulting starter might be decomposed AGAIN, then 440 | -- those two resulting combining marks might be reordered with a following combining 441 | -- mark 442 | assert(utf8.normalize_nfc("\199\154\204\164") == "\225\185\179\204\136\204\140") 443 | -- When a codepoint decomposes to a starter followed by 2 combining marks, we need to 444 | -- make sure those combining marks are in the right order with any following ones 445 | assert(utf8.normalize_nfc("\199\160\205\129\204\168") == "\196\132\204\135\204\132\204\129") 446 | -- Fixing another issue with ordering of combining marks after a codepoint decomposes 447 | -- to a starter followed by 1 or 2 combining marks: 448 | assert(utf8.normalize_nfc("\199\155\204\155\204\131\204\155") == "\198\175\204\155\204\136\204\128\204\131") 449 | 450 | 451 | -- Official set of test cases for grapheme cluster segmentation, provided by Unicode Consortium 452 | local grapheme_test_cases = {} 453 | f = io.open('GraphemeBreakTest.txt', 'r') 454 | for line in f:lines() do 455 | if not line:match("^#") and not line:match("^@") then 456 | line = line:gsub("#.*", "") 457 | line = line:gsub("^%s*÷%s*", "") 458 | line = line:gsub("%s*÷%s*$", "") 459 | local clusters = { "" } 460 | for str in line:gmatch("%S+") do 461 | if str == '×' then 462 | -- do nothing 463 | elseif str == '÷' then 464 | table.insert(clusters, "") -- start a new cluster 465 | else 466 | clusters[#clusters] = clusters[#clusters]..utf8.char(tonumber(str, 
16)) 467 | end 468 | end 469 | table.insert(grapheme_test_cases, { str=table.concat(clusters), clusters=clusters }) 470 | end 471 | end 472 | 473 | 474 | -- test grapheme_indices 475 | for _,case in ipairs(grapheme_test_cases) do 476 | local actual_clusters = {} 477 | for start,stop in utf8.grapheme_indices(case.str) do 478 | table.insert(actual_clusters, case.str:sub(start, stop)) 479 | end 480 | assert(#actual_clusters == #case.clusters) 481 | for i,cluster in ipairs(case.clusters) do 482 | assert(actual_clusters[i] == cluster) 483 | end 484 | end 485 | 486 | -- try iterating over grapheme clusters in a substring 487 | local clusters = {} 488 | for a,b in utf8.grapheme_indices('ひらがな', 4, 9) do 489 | table.insert(clusters, a) 490 | table.insert(clusters, b) 491 | end 492 | for idx,value in ipairs({ 4, 6, 7, 9 }) do 493 | assert(clusters[idx] == value) 494 | end 495 | 496 | -- try private use codepoint followed by a combining character 497 | clusters = {} 498 | for a,b in utf8.grapheme_indices('\239\128\128\204\154') do 499 | table.insert(clusters, a) 500 | table.insert(clusters, b) 501 | end 502 | for idx,value in ipairs({ 1, 5 }) do 503 | assert(clusters[idx] == value) 504 | end 505 | 506 | 507 | print "OK" 508 | 509 | -- cc: run='lua -- $input' 510 | 511 | -------------------------------------------------------------------------------- /test_compat.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require 'lua-utf8' 2 | print('testing utf8 library') 3 | 4 | assert(utf8.sub("123456789",2,4) == "234") 5 | assert(utf8.sub("123456789",7) == "789") 6 | assert(utf8.sub("123456789",7,6) == "") 7 | assert(utf8.sub("123456789",7,7) == "7") 8 | assert(utf8.sub("123456789",0,0) == "") 9 | assert(utf8.sub("123456789",-10,10) == "123456789") 10 | assert(utf8.sub("123456789",1,9) == "123456789") 11 | assert(utf8.sub("123456789",-10,-20) == "") 12 | assert(utf8.sub("123456789",-1) == "9") 13 | assert(utf8.sub("123456789",-4) == "6789") 14 | assert(utf8.sub("123456789",-6, -4) == "456") 15 | if not _no32 then 16 | assert(utf8.sub("123456789",-2^31, -4) == "123456") 17 | assert(utf8.sub("123456789",-2^31, 2^31 - 1) == "123456789") 18 | assert(utf8.sub("123456789",-2^31, -2^31) == "") 19 | end 20 | assert(utf8.sub("\000123456789",3,5) == "234") 21 | assert(utf8.sub("\000123456789", 8) == "789") 22 | print('+') 23 | 24 | assert(utf8.find("123456789", "345") == 3) 25 | a,b = utf8.find("123456789", "345") 26 | assert(utf8.sub("123456789", a, b) == "345") 27 | assert(utf8.find("1234567890123456789", "345", 3) == 3) 28 | assert(utf8.find("1234567890123456789", "345", 4) == 13) 29 | assert(utf8.find("1234567890123456789", "346", 4) == nil) 30 | assert(utf8.find("1234567890123456789", ".45", -9) == 13) 31 | assert(utf8.find("abcdefg", "\0", 5, 1) == nil) 32 | assert(utf8.find("", "") == 1) 33 | assert(utf8.find("", "", 1) == 1) 34 | assert(not utf8.find("", "", 2)) 35 | assert(utf8.find('', 'aaa', 1) == nil) 36 | assert(('alo(.)alo'):find('(.)', 1, 1) == 4) 37 | print('+') 38 | 39 | assert(utf8.len("") == 0) 40 | assert(utf8.len("\0\0\0") == 3) 41 | assert(utf8.len("1234567890") == 10) 42 | 43 | local E = utf8.escape 44 | assert(utf8.byte("a") == 97) 45 | assert(utf8.byte(E"%228") > 127) 46 | assert(utf8.byte(utf8.char(255)) == 255) 47 | assert(utf8.byte(utf8.char(0)) == 0) 48 | assert(utf8.byte("\0") == 0) 49 | assert(utf8.byte("\0\0alo\0x", -1) == string.byte('x')) 50 | assert(utf8.byte("ba", 2) == 97) 51 | assert(utf8.byte("\n\n", 2, -1) == 10) 52 | 
assert(utf8.byte("\n\n", 2, 2) == 10) 53 | assert(utf8.byte("") == nil) 54 | assert(utf8.byte("hi", -3) == nil) 55 | assert(utf8.byte("hi", 3) == nil) 56 | assert(utf8.byte("hi", 9, 10) == nil) 57 | assert(utf8.byte("hi", 2, 1) == nil) 58 | assert(utf8.char() == "") 59 | assert(utf8.char(0, 255, 0) == utf8.escape"%0%255%0") 60 | assert(utf8.char(0, utf8.byte(E"%228"), 0) == E"%0%xe4%0") 61 | assert(utf8.char(utf8.byte(E"%228l\0髐", 1, -1)) == E"%xe4l\0髐") 62 | assert(utf8.char(utf8.byte(E"%228l\0髐", 1, 0)) == "") 63 | assert(utf8.char(utf8.byte(E"%228l\0髐", -10, 100)) == E"%xe4l\0髐") 64 | print('+') 65 | 66 | assert(utf8.upper("ab\0c") == "AB\0C") 67 | assert(utf8.lower("\0ABCc%$") == "\0abcc%$") 68 | 69 | assert(utf8.reverse"" == "") 70 | assert(utf8.reverse"\0\1\2\3" == "\3\2\1\0") 71 | assert(utf8.reverse"\0001234" == "4321\0") 72 | 73 | for i=0,30 do assert(utf8.len(string.rep('a', i)) == i) end 74 | 75 | print('+') 76 | 77 | 78 | print('OK') 79 | -------------------------------------------------------------------------------- /test_pm.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require 'lua-utf8' 2 | 3 | print('testing pattern matching') 4 | 5 | function f(s, p) 6 | local i,e = utf8.find(s, p) 7 | if i then return utf8.sub(s, i, e) end 8 | end 9 | 10 | function f1(s, p) 11 | p = utf8.gsub(p, "%%([0-9])", function (s) return "%" .. (tonumber(s)+1) end) 12 | p = utf8.gsub(p, "^(^?)", "%1()", 1) 13 | p = utf8.gsub(p, "($?)$", "()%1", 1) 14 | local t = {utf8.match(s, p)} 15 | return utf8.sub(s, t[1], t[#t] - 1) 16 | end 17 | 18 | a,b = utf8.find('', '') -- empty patterns are tricky 19 | assert(a == 1 and b == 0); 20 | a,b = utf8.find('alo', '') 21 | assert(a == 1 and b == 0) 22 | a,b = utf8.find('a\0o a\0o a\0o', 'a', 1) -- first position 23 | assert(a == 1 and b == 1) 24 | a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 2) -- starts in the midle 25 | assert(a == 5 and b == 7) 26 | a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 9) -- starts in the midle 27 | assert(a == 9 and b == 11) 28 | a,b = utf8.find('a\0a\0a\0a\0\0ab', '\0ab', 2); -- finds at the end 29 | assert(a == 9 and b == 11); 30 | a,b = utf8.find('a\0a\0a\0a\0\0ab', 'b') -- last position 31 | assert(a == 11 and b == 11) 32 | assert(utf8.find('a\0a\0a\0a\0\0ab', 'b\0') == nil) -- check ending 33 | assert(utf8.find('', '\0') == nil) 34 | assert(utf8.find('alo123alo', '12') == 4) 35 | assert(utf8.find('alo123alo', '^12') == nil) 36 | 37 | assert(utf8.match("aaab", ".*b") == "aaab") 38 | assert(utf8.match("aaa", ".*a") == "aaa") 39 | assert(utf8.match("b", ".*b") == "b") 40 | 41 | assert(utf8.match("aaab", ".+b") == "aaab") 42 | assert(utf8.match("aaa", ".+a") == "aaa") 43 | assert(not utf8.match("b", ".+b")) 44 | 45 | assert(utf8.match("aaab", ".?b") == "ab") 46 | assert(utf8.match("aaa", ".?a") == "aa") 47 | assert(utf8.match("b", ".?b") == "b") 48 | 49 | assert(f('aloALO', '%l*') == 'alo') 50 | assert(f('aLo_ALO', '%a*') == 'aLo') 51 | 52 | assert(f(" \n\r*&\n\r xuxu \n\n", "%g%g%g+") == "xuxu") 53 | 54 | assert(f('aaab', 'a*') == 'aaa'); 55 | assert(f('aaa', '^.*$') == 'aaa'); 56 | assert(f('aaa', 'b*') == ''); 57 | assert(f('aaa', 'ab*a') == 'aa') 58 | assert(f('aba', 'ab*a') == 'aba') 59 | assert(f('aaab', 'a+') == 'aaa') 60 | assert(f('aaa', '^.+$') == 'aaa') 61 | assert(f('aaa', 'b+') == nil) 62 | assert(f('aaa', 'ab+a') == nil) 63 | assert(f('aba', 'ab+a') == 'aba') 64 | assert(f('a$a', '.$') == 'a') 65 | assert(f('a$a', '.%$') == 'a$') 66 | assert(f('a$a', '.$.') == 'a$a') 
67 | assert(f('a$a', '$$') == nil) 68 | assert(f('a$b', 'a$') == nil) 69 | assert(f('a$a', '$') == '') 70 | assert(f('', 'b*') == '') 71 | assert(f('aaa', 'bb*') == nil) 72 | assert(f('aaab', 'a-') == '') 73 | assert(f('aaa', '^.-$') == 'aaa') 74 | assert(f('aabaaabaaabaaaba', 'b.*b') == 'baaabaaabaaab') 75 | assert(f('aabaaabaaabaaaba', 'b.-b') == 'baaab') 76 | assert(f('alo xo', '.o$') == 'xo') 77 | assert(f(' \n isto é assim', '%S%S*') == 'isto') 78 | assert(f(' \n isto é assim', '%S*$') == 'assim') 79 | assert(f(' \n isto é assim', '[a-z]*$') == 'assim') 80 | assert(f('um caracter ? extra', '[^%sa-z]') == '?') 81 | assert(f('', 'a?') == '') 82 | assert(f('á', 'á?') == 'á') 83 | assert(f('ábl', 'á?b?l?') == 'ábl') 84 | assert(f(' ábl', 'á?b?l?') == '') 85 | assert(f('aa', '^aa?a?a') == 'aa') 86 | assert(f(']]]áb', '[^]]') == 'á') 87 | assert(f("0alo alo", "%x*") == "0a") 88 | assert(f("alo alo", "%C+") == "alo alo") 89 | print('+') 90 | 91 | assert(f1('alo alx 123 b\0o b\0o', '(..*) %1') == "b\0o b\0o") 92 | assert(f1('axz123= 4= 4 34', '(.+)=(.*)=%2 %1') == '3= 4= 4 3') 93 | assert(f1('=======', '^(=*)=%1$') == '=======') 94 | assert(utf8.match('==========', '^([=]*)=%1$') == nil) 95 | 96 | local function range (i, j) 97 | if i <= j then 98 | return i, range(i+1, j) 99 | end 100 | end 101 | 102 | local abc = utf8.char(range(0, 255)); 103 | 104 | assert(utf8.len(abc) == 256) 105 | assert(string.len(abc) == 384) 106 | 107 | function strset (p) 108 | local res = {s=''} 109 | utf8.gsub(abc, p, function (c) res.s = res.s .. c end) 110 | return res.s 111 | end; 112 | 113 | local E = utf8.escape 114 | assert(utf8.len(strset(E'[%200-%210]')) == 11) 115 | 116 | assert(strset('[a-z]') == "abcdefghijklmnopqrstuvwxyz") 117 | assert(strset('[a-z%d]') == strset('[%da-uu-z]')) 118 | assert(strset('[a-]') == "-a") 119 | assert(strset('[^%W]') == strset('[%w]')) 120 | assert(strset('[]%%]') == '%]') 121 | assert(strset('[a%-z]') == '-az') 122 | assert(strset('[%^%[%-a%]%-b]') == '-[]^ab') 123 | assert(strset('%Z') == strset(E'[%1-%255]')) 124 | assert(strset('.') == strset(E'[%1-%255%%z]')) 125 | print('+'); 126 | 127 | assert(utf8.match("alo xyzK", "(%w+)K") == "xyz") 128 | assert(utf8.match("254 K", "(%d*)K") == "") 129 | assert(utf8.match("alo ", "(%w*)$") == "") 130 | assert(utf8.match("alo ", "(%w+)$") == nil) 131 | assert(utf8.find("(álo)", "%(á") == 1) 132 | local a, b, c, d, e = utf8.match("âlo alo", "^(((.).).* (%w*))$") 133 | assert(a == 'âlo alo' and b == 'âl' and c == 'â' and d == 'alo' and e == nil) 134 | a, b, c, d = utf8.match('0123456789', '(.+(.?)())') 135 | assert(a == '0123456789' and b == '' and c == 11 and d == nil) 136 | print('+') 137 | 138 | assert(utf8.gsub('ülo ülo', 'ü', 'x') == 'xlo xlo') 139 | assert(utf8.gsub('alo úlo ', ' +$', '') == 'alo úlo') -- trim 140 | assert(utf8.gsub(' alo alo ', '^%s*(.-)%s*$', '%1') == 'alo alo') -- double trim 141 | assert(utf8.gsub('alo alo \n 123\n ', '%s+', ' ') == 'alo alo 123 ') 142 | t = "abç d" 143 | a, b = utf8.gsub(t, '(.)', '%1@') 144 | assert('@'..a == utf8.gsub(t, '', '@') and b == 5) 145 | a, b = utf8.gsub('abçd', '(.)', '%0@', 2) 146 | assert(a == 'a@b@çd' and b == 2) 147 | assert(utf8.gsub('alo alo', '()[al]', '%1') == '12o 56o') 148 | assert(utf8.gsub("abc=xyz", "(%w*)(%p)(%w+)", "%3%2%1-%0") == 149 | "xyz=abc-abc=xyz") 150 | assert(utf8.gsub("abc", "%w", "%1%0") == "aabbcc") 151 | assert(utf8.gsub("abc", "%w+", "%0%1") == "abcabc") 152 | assert(utf8.gsub('áéí', '$', '\0óú') == 'áéí\0óú') 153 | assert(utf8.gsub('', '^', 'r') == 
'r') 154 | assert(utf8.gsub('', '$', 'r') == 'r') 155 | print('+') 156 | 157 | assert(utf8.gsub("um (dois) tres (quatro)", "(%(%w+%))", utf8.upper) == 158 | "um (DOIS) tres (QUATRO)") 159 | 160 | do 161 | local function setglobal (n,v) rawset(_G, n, v) end 162 | utf8.gsub("a=roberto,roberto=a", "(%w+)=(%w%w*)", setglobal) 163 | assert(_G.a=="roberto" and _G.roberto=="a") 164 | end 165 | 166 | function f(a,b) return utf8.gsub(a,'.',b) end 167 | assert(utf8.gsub("trocar tudo em |teste|b| é |beleza|al|", "|([^|]*)|([^|]*)|", f) == 168 | "trocar tudo em bbbbb é alalalalalal") 169 | 170 | local function dostring (s) return (loadstring or load)(s)() or "" end 171 | assert(utf8.gsub("alo $a=1$ novamente $return a$", "$([^$]*)%$", dostring) == 172 | "alo novamente 1") 173 | 174 | x = utf8.gsub("$local utf8=require'lua-utf8' x=utf8.gsub('alo', '.', utf8.upper)$ assim vai para $return x$", 175 | "$([^$]*)%$", dostring) 176 | assert(x == ' assim vai para ALO') 177 | 178 | t = {} 179 | s = 'a alo jose joao' 180 | r = utf8.gsub(s, '()(%w+)()', function (a,w,b) 181 | assert(utf8.len(w) == b-a); 182 | t[a] = b-a; 183 | end) 184 | assert(s == r and t[1] == 1 and t[3] == 3 and t[7] == 4 and t[13] == 4) 185 | 186 | 187 | function isbalanced (s) 188 | return utf8.find(utf8.gsub(s, "%b()", ""), "[()]") == nil 189 | end 190 | 191 | assert(isbalanced("(9 ((8))(\0) 7) \0\0 a b ()(c)() a")) 192 | assert(not isbalanced("(9 ((8) 7) a b (\0 c) a")) 193 | assert(utf8.gsub("alo 'oi' alo", "%b''", '"') == 'alo " alo') 194 | 195 | 196 | local t = {"apple", "orange", "lime"; n=0} 197 | assert(utf8.gsub("x and x and x", "x", function () t.n=t.n+1; return t[t.n] end) 198 | == "apple and orange and lime") 199 | 200 | t = {n=0} 201 | utf8.gsub("first second word", "%w%w*", function (w) t.n=t.n+1; t[t.n] = w end) 202 | assert(t[1] == "first" and t[2] == "second" and t[3] == "word" and t.n == 3) 203 | 204 | t = {n=0} 205 | assert(utf8.gsub("first second word", "%w+", 206 | function (w) t.n=t.n+1; t[t.n] = w end, 2) == "first second word") 207 | assert(t[1] == "first" and t[2] == "second" and t[3] == nil) 208 | 209 | assert(not pcall(utf8.gsub, "alo", "(.", print)) 210 | assert(not pcall(utf8.gsub, "alo", ".)", print)) 211 | assert(not pcall(utf8.gsub, "alo", "(.", {})) 212 | assert(not pcall(utf8.gsub, "alo", "(.)", "%2")) 213 | assert(not pcall(utf8.gsub, "alo", "(%1)", "a")) 214 | assert(not pcall(utf8.gsub, "alo", "(%0)", "a")) 215 | 216 | -- bug since 2.5 (C-stack overflow) 217 | do 218 | local function f (size) 219 | local s = string.rep("a", size) 220 | local p = string.rep(".?", size) 221 | return pcall(utf8.match, s, p) 222 | end 223 | local r, m = f(80) 224 | assert(r and #m == 80) 225 | r, m = f(200000) 226 | assert(not r and utf8.find(m, "too complex")) 227 | end 228 | 229 | if not _soft then 230 | -- big strings 231 | local a = string.rep('a', 300000) 232 | assert(utf8.find(a, '^a*.?$')) 233 | assert(not utf8.find(a, '^a*.?b$')) 234 | assert(utf8.find(a, '^a-.?$')) 235 | 236 | -- bug in 5.1.2 237 | a = string.rep('a', 10000) .. 
string.rep('b', 10000) 238 | assert(not pcall(utf8.gsub, a, 'b')) 239 | end 240 | 241 | -- recursive nest of gsubs 242 | function rev (s) 243 | return utf8.gsub(s, "(.)(.+)", function (c,s1) return rev(s1)..c end) 244 | end 245 | 246 | local x = "abcdef" 247 | assert(rev(rev(x)) == x) 248 | 249 | 250 | -- gsub with tables 251 | assert(utf8.gsub("alo alo", ".", {}) == "alo alo") 252 | assert(utf8.gsub("alo alo", "(.)", {a="AA", l=""}) == "AAo AAo") 253 | assert(utf8.gsub("alo alo", "(.).", {a="AA", l="K"}) == "AAo AAo") 254 | assert(utf8.gsub("alo alo", "((.)(.?))", {al="AA", o=false}) == "AAo AAo") 255 | 256 | assert(utf8.gsub("alo alo", "().", {2,5,6}) == "256 alo") 257 | 258 | t = {}; setmetatable(t, {__index = function (t,s) return utf8.upper(s) end}) 259 | assert(utf8.gsub("a alo b hi", "%w%w+", t) == "a ALO b HI") 260 | 261 | 262 | -- tests for gmatch 263 | local a = 0 264 | for i in utf8.gmatch('abcde', '()') do assert(i == a+1); a=i end 265 | assert(a==6) 266 | 267 | t = {n=0} 268 | for w in utf8.gmatch("first second word", "%w+") do 269 | t.n=t.n+1; t[t.n] = w 270 | end 271 | assert(t[1] == "first" and t[2] == "second" and t[3] == "word") 272 | 273 | t = {3, 6, 9} 274 | for i in utf8.gmatch ("xuxx uu ppar r", "()(.)%2") do 275 | assert(i == table.remove(t, 1)) 276 | end 277 | assert(#t == 0) 278 | 279 | t = {} 280 | for i,j in utf8.gmatch("13 14 10 = 11, 15= 16, 22=23", "(%d+)%s*=%s*(%d+)") do 281 | t[i] = j 282 | end 283 | a = 0 284 | for k,v in pairs(t) do assert(k+1 == v+0); a=a+1 end 285 | assert(a == 3) 286 | 287 | 288 | -- tests for `%f' (`frontiers') 289 | 290 | assert(utf8.gsub("aaa aa a aaa a", "%f[%w]a", "x") == "xaa xa x xaa x") 291 | assert(utf8.gsub("[[]] [][] [[[[", "%f[[].", "x") == "x[]] x]x] x[[[") 292 | assert(utf8.gsub("01abc45de3", "%f[%d]", ".") == ".01abc.45de.3") 293 | assert(utf8.gsub("01abc45 de3x", "%f[%D]%w", ".") == "01.bc45 de3.") 294 | local u = utf8.escape 295 | assert(utf8.gsub("function", u"%%f[%1-%255]%%w", ".") == ".unction") 296 | assert(utf8.gsub("function", u"%%f[^%1-%255]", ".") == "function.") 297 | 298 | assert(utf8.find("a", "%f[a]") == 1) 299 | assert(utf8.find("a", "%f[^%z]") == 1) 300 | assert(utf8.find("a", "%f[^%l]") == 2) 301 | assert(utf8.find("aba", "%f[a%z]") == 3) 302 | assert(utf8.find("aba", "%f[%z]") == 4) 303 | assert(not utf8.find("aba", "%f[%l%z]")) 304 | assert(not utf8.find("aba", "%f[^%l%z]")) 305 | 306 | local i, e = utf8.find(" alo aalo allo", "%f[%S].-%f[%s].-%f[%S]") 307 | assert(i == 2 and e == 5) 308 | local k = utf8.match(" alo aalo allo", "%f[%S](.-%f[%s].-%f[%S])") 309 | assert(k == 'alo ') 310 | 311 | local a = {1, 5, 9, 14, 17,} 312 | for k in utf8.gmatch("alo alo th02 is 1hat", "()%f[%w%d]") do 313 | assert(table.remove(a, 1) == k) 314 | end 315 | assert(#a == 0) 316 | 317 | 318 | -- malformed patterns 319 | local function malform (p, m) 320 | m = m or "malformed" 321 | local r, msg = pcall(utf8.find, "a", p) 322 | assert(not r and utf8.find(msg, m)) 323 | end 324 | 325 | malform("[a") 326 | malform("[]") 327 | malform("[^]") 328 | malform("[a%]") 329 | malform("[a%") 330 | malform("%b") 331 | malform("%ba") 332 | malform("%") 333 | malform("%f", "missing") 334 | 335 | -- \0 in patterns 336 | assert(utf8.match("ab\0\1\2c", "[\0-\2]+") == "\0\1\2") 337 | assert(utf8.match("ab\0\1\2c", "[\0-\0]+") == "\0") 338 | assert(utf8.find("b$a", "$\0?") == 2) 339 | assert(utf8.find("abc\0efg", "%\0") == 4) 340 | assert(utf8.match("abc\0efg\0\1e\1g", "%b\0\1") == "\0efg\0\1e\1") 341 | assert(utf8.match("abc\0\0\0", "%\0+") 
== "\0\0\0") 342 | assert(utf8.match("abc\0\0\0", "%\0%\0?") == "\0\0") 343 | 344 | -- magic char after \0 345 | assert(utf8.find("abc\0\0","\0.") == 4) 346 | assert(utf8.find("abcx\0\0abc\0abc","x\0\0abc\0a.") == 4) 347 | 348 | print('OK') 349 | --------------------------------------------------------------------------------