├── .gitattributes ├── .gitignore ├── .travis.yml ├── GraphemeBreakTest.txt ├── LICENSE ├── NormalizationTest.txt ├── README.md ├── fuzzer ├── Makefile ├── fuzz-clean.c ├── fuzz-grapheme.c ├── fuzz-invalid.c ├── fuzz-normalize.c └── fuzz-valid.c ├── lutf8lib.c ├── parseucd.lua ├── rockspecs ├── luautf8-0.1.6-1.rockspec └── luautf8-scm-1.rockspec ├── test.lua ├── test_compat.lua ├── test_pm.lua └── unidata.h /.gitattributes: -------------------------------------------------------------------------------- 1 | *.h linguist-language=C 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | UCD/ 2 | UCD.*/ 3 | ucd/ 4 | ucd.*/ 5 | *.dll 6 | 7 | lua-utf8.so 8 | lua-utf8.so.* 9 | luautf8-*.zip 10 | luautf8-*.rock 11 | 12 | *.gcov 13 | *.gcda 14 | *.gcno 15 | 16 | test_*.lua 17 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | sudo: false 3 | 4 | env: 5 | global: 6 | - LUAROCKS=2.4.3 7 | - ROCKSPEC=rockspecs/luautf8-scm-0.rockspec 8 | matrix: 9 | - LUA="lua 5.1" 10 | - LUA="lua 5.2" 11 | - LUA="lua 5.3" 12 | - LUA="luajit 2.0" 13 | - LUA="luajit 2.1" 14 | 15 | branches: 16 | only: 17 | - master 18 | - develop 19 | 20 | before_install: 21 | - pip install --user hererocks urllib3[secure] cpp-coveralls 22 | - hererocks env --$LUA -rlatest # Use latest LuaRocks, install into 'env' directory. 23 | - source env/bin/activate # Add directory with all installed binaries to PATH. 24 | 25 | install: 26 | # - sudo luarocks make $ROCKSPEC CFLAGS="-O2 -fPIC -ftest-coverage -fprofile-arcs" LIBFLAG="-shared --coverage" 27 | - luarocks make $ROCKSPEC CFLAGS="-O3 -fPIC -Wall -Wextra --coverage" LIBFLAG="-shared --coverage" 28 | 29 | script: 30 | - lua test.lua 31 | - lua test_pm.lua 32 | - lua test_compat.lua 33 | # - lunit.sh test.lua 34 | 35 | after_success: 36 | - coveralls 37 | # - coveralls -b .. -r .. --dump c.report.json 38 | # - luacov-coveralls -j c.report.json -v 39 | 40 | notifications: 41 | email: 42 | on_success: change 43 | on_failure: always 44 | 45 | # vim: ft=yaml nu et sw=2 fdc=2 fdm=syntax 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Xavier Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | UTF-8 module for Lua 5.x
2 | ========================
3 | [![Build Status](https://travis-ci.org/starwing/luautf8.svg?branch=master)](https://travis-ci.org/starwing/luautf8)
4 | [![Coverage Status](https://coveralls.io/repos/github/starwing/luautf8/badge.svg?branch=master)](https://coveralls.io/github/starwing/luautf8?branch=master)
5 | 
6 | This module adds UTF-8 support to Lua.
7 | 
8 | It uses data extracted from the
9 | [Unicode Character Database](http://www.unicode.org/reports/tr44/),
10 | and is tested on Lua 5.2.3, Lua 5.3.0 and LuaJIT.
11 | 
12 | parseucd.lua is a pure Lua script which generates unidata.h, to support
13 | converting characters and checking characters' categories.
14 | 
15 | It is compatible with Lua's own string module and passes all
16 | string and pattern matching tests in the Lua test suite[2].
17 | 
18 | It also adds some useful routines for UTF-8 features, such as:
19 | - a convenient interface to escape Unicode sequences in strings.
20 | - string insert/remove, since UTF-8 substring extraction may be expensive.
21 | - calculating Unicode width, useful when implementing e.g. a console emulator.
22 | - a useful interface to translate between Unicode offsets and byte offsets.
23 | - checking UTF-8 strings for validity and removing invalid byte sequences.
24 | - converting Unicode strings to normal form.
25 | 
26 | Note that to avoid conflict with Lua 5.3's built-in library 'utf8',
27 | this library produces a file named lua-utf8.dll or lua-utf8.so, so use
28 | it like this:
29 | 
30 | ```lua
31 | local utf8 = require 'lua-utf8'
32 | ```
33 | 
34 | in your code :-(
35 | 
36 | [2]: http://www.lua.org/tests/5.2/
37 | 
38 | 
39 | LuaRocks Installation
40 | ---------------------
41 | `luarocks install luautf8`
42 | 
43 | It's now fully compatible with Lua 5.3's utf8 library, so replacing this
44 | file (and headers) with lutf8lib.c from the Lua 5.3 sources is also okay.
45 | 
46 | Usage
47 | -----
48 | 
49 | Many routines are the same as in Lua's string module:
50 | - `utf8.byte`
51 | - `utf8.char`
52 | - `utf8.find`
53 | - `utf8.gmatch`
54 | - `utf8.gsub`
55 | - `utf8.len`
56 | - `utf8.lower`
57 | - `utf8.match`
58 | - `utf8.reverse`
59 | - `utf8.sub`
60 | - `utf8.upper`
61 | 
62 | The documentation of these functions can be found in the Lua manual[3].
63 | 
64 | [3]: http://www.lua.org/manual/5.2/manual.html#6.4
65 | 
66 | 
67 | Some routines in the string module don't need Unicode support:
68 | - `string.dump`
69 | - `string.format`
70 | - `string.rep`
71 | 
72 | They are NOT in the utf8 module.
73 | 
74 | Some routines are for compatibility with Lua 5.3's basic UTF-8 support library:
75 | - `utf8.offset`
76 | - `utf8.codepoint`
77 | - `utf8.codes`
78 | 
79 | See Lua 5.3's manual for usage.
80 | 
81 | Some routines are new, providing Unicode-specific functionality:
82 | 
83 | ### utf8.escape(str) -> utf8 string
84 | escape str into a UTF-8 string. It supports several escape formats:
85 | 
86 | * `%ddd` - where ddd is a decimal number of any length:
87 |   converts the Unicode code point to its UTF-8 encoding.
88 | * `%{ddd}` - same as `%ddd` but with brackets around the number.
89 | * `%uddd` - same as `%ddd`; the u stands for Unicode.
90 | * `%u{ddd}` - same as `%{ddd}`.
91 | * `%xhhh` - hexadecimal version of `%ddd`.
92 | * `%x{hhh}` - same as `%xhhh`.
93 | * `%?` - where '?' stands for any other character: escapes this character.
94 | 
95 | #### Examples:
96 | ```lua
97 | local u = utf8.escape
98 | print(u"%123%u123%{123}%u{123}%xABC%x{ABC}")
99 | print(u"%%123%?%d%%u")
100 | ```
101 | 
102 | 
103 | ### utf8.charpos(s[[, charpos], index]) -> charpos, code point
104 | convert a UTF-8 character position to a byte offset.
105 | if only `index` is given, return the byte offset of the UTF-8 char at that index.
106 | if both `charpos` and `index` are given, a new `charpos` will be
107 | calculated by adding/subtracting the UTF-8 char `index` to/from the current `charpos`.
108 | in all cases, it returns the new char position, and the code point (a
109 | number) at this position.
110 | 
111 | 
112 | ### utf8.next(s[, charpos[, index]]) -> charpos, code point
113 | iterate through the UTF-8 string s.
114 | If only s is given, it can be used as an iterator:
115 | ```lua
116 | for pos, code in utf8.next, "utf8-string" do
117 |   -- ...
118 | end
119 | ```
120 | if only s and `charpos` are given, return the byte offset of the next codepoint
121 | in the string.
122 | if `charpos` and `index` are given, a new `charpos` will be calculated by
123 | adding/subtracting the UTF-8 char offset to/from the current charpos.
124 | in all cases, it returns the new char position (in bytes), and the code point
125 | (a number) at this position.
126 | 
127 | 
128 | ### utf8.insert(s[, idx], substring) -> new_string
129 | insert a substring into s. If `idx` is given, insert the substring before
130 | the char at this index; otherwise, the substring will be appended to s.
131 | `idx` can be negative.
132 | 
133 | 
134 | ### utf8.remove(s[, start[, stop]]) -> new_string
135 | delete a substring of s. If neither `start` nor `stop` is given, delete the
136 | last UTF-8 char in s; if only `start` is given, delete chars from `start` to the end of s; if
137 | `stop` is also given, delete chars from `start` to `stop` (including `start` and `stop`).
138 | `start` and `stop` can be negative.
139 | 
140 | 
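For example, the position and editing routines above can be combined like this (the offsets and results shown in the comments follow from the documented behaviour for this particular string and are meant as illustration):

```lua
local utf8 = require 'lua-utf8'

local s = "héllo"            -- 'é' takes 2 bytes, so s is 6 bytes but 5 chars
print(#s, utf8.len(s))       -- 6   5

-- translate a char index into a byte position (and get the code point there)
local pos, cp = utf8.charpos(s, 3)   -- pos == 4 (byte offset of the first 'l'), cp == 108

-- walk the string codepoint by codepoint
for p, code in utf8.next, s do
  -- p is a byte offset, code is a numeric code point
end

-- insert/remove take char indices, not byte offsets
print(utf8.insert(s, 2, "yy"))   -- "hyyéllo": inserted before char 2 ('é')
print(utf8.remove(s, -2))        -- "hél": chars from index -2 to the end removed
```
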
141 | ### utf8.width(s[, ambi_is_double[, default_width]]) -> width
142 | calculate the width of UTF-8 string s. if `ambi_is_double` is given,
143 | characters with ambiguous width will be treated as having width 2;
144 | otherwise, they will be treated as having width 1.
145 | the width of fullwidth/doublewidth characters is 2, and the width of other
146 | characters is 1.
147 | if `default_width` is given, it will be used as the width of unprintable
148 | characters. (If you intend to replace unprintable characters with a placeholder,
149 | pass its width as `default_width`.)
150 | if s is a code point, return the width of this code point.
151 | 
152 | 
153 | ### utf8.widthindex(s, location[, ambi_is_double[, default_width]]) -> idx, offset, width
154 | return the character index at the given location in string s, where location is
155 | in width units. this is the inverse operation of utf8.width().
156 | if the requested location does not fall at a character boundary, `offset` will be
157 | greater than 1; specifically, if the location is at the second column (middle)
158 | of a wide char, `offset` will be 2. the width of the character at idx is also returned.
159 | 
160 | 
161 | ### utf8.title(s) -> new_string
162 | ### utf8.fold(s) -> new_string
163 | convert UTF-8 string s to title case, or to folded case (used for
164 | case-insensitive comparison).
165 | if s is a number, it's treated as a code point and the converted code
166 | point (a number) is returned.
167 | utf8.lower/utf8.upper have the same extension.
168 | 
169 | 
170 | ### utf8.ncasecmp(a, b) -> [-1,0,1]
171 | compare a and b case-insensitively; -1 means a < b, 0 means a == b, and 1 means a > b.
172 | 
173 | 
174 | ### utf8.isvalid(s) -> boolean
175 | check whether s is a valid UTF-8 string or not.
176 | 
177 | 
178 | ### utf8.clean(s[, replacement_string]) -> cleaned_string, was_valid
179 | replace any invalid UTF-8 byte sequences in s with the replacement string.
180 | if no replacement string is provided, the default is "�" (REPLACEMENT CHARACTER U+FFFD).
181 | note that *any* number of consecutive invalid bytes will be replaced by a single copy of the replacement string.
182 | the 2nd return value is true if the original string was already valid (meaning no replacements were made).
183 | 
184 | 
185 | ### utf8.invalidoffset(s[, init]) -> offset
186 | return the byte offset within s of the first invalid UTF-8 byte sequence.
187 | (1 is the first byte of the string.)
188 | if s is a valid UTF-8 string, return nil.
189 | the optional numeric argument init specifies where to start the search; its default value is 1 and it can be negative.
190 | 
191 | 
192 | ### utf8.isnfc(s) -> boolean
193 | check whether s is in Normal Form C or not.
194 | "Normal Form C" means that whenever possible, combining marks are combined with a preceding codepoint. For example, instead of U+0041 (LATIN CAPITAL LETTER A) followed by U+0301 (COMBINING ACUTE ACCENT), an NFC string will use U+00C1 (LATIN CAPITAL LETTER A WITH ACUTE). Also, some deprecated codepoints are converted to the recommended replacements.
195 | since the same sequence of characters can be represented in more than one way in Unicode, it is better to ensure strings are in Normal Form before comparing them.
196 | an error may be raised if s is not a valid UTF-8 string.
197 | 
198 | 
199 | ### utf8.normalize_nfc(s) -> normal_string, was_nfc
200 | convert s to Normal Form C.
201 | the 2nd return value is true if the original string was already in NFC (meaning no modifications were made).
202 | an error will be raised if s is not a valid UTF-8 string.
203 | 
204 | 
205 | ### utf8.grapheme_indices(s[, start[, stop]]) -> iterator
206 | return an iterator which yields the starting and ending byte index of each successive grapheme cluster in s. This range of bytes is inclusive of the endpoints, so the yielded values can be passed to `string.sub` to extract the grapheme cluster.
207 | if you provide `start` and `stop` byte indices, then the iterator will only cover the requested byte range. `start` and `stop` should fall on character boundaries, since an error will be raised if the requested byte range is not a valid UTF-8 string.
208 | ```lua
209 | local i = 1
210 | for from,to in utf8.grapheme_indices(s) do
211 |   print("grapheme cluster "..i.." is from byte "..from.." to byte "..to)
212 |   i = i + 1
213 | end
214 | ```
215 | 
216 | 
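A few usage sketches for the width, validation and normalization routines above (the outputs in the comments follow from the documented behaviour):

```lua
local utf8 = require 'lua-utf8'

-- display width in terminal columns
print(utf8.width("abc"))      -- 3
print(utf8.width("あいう"))   -- 6 (fullwidth characters count as 2 columns)

-- validity checking and repair ("\255" can never occur in valid UTF-8)
print(utf8.isvalid("abc\255"))            -- false
print(utf8.invalidoffset("abc\255"))      -- 4
print(utf8.clean("abc\255\254def", "?"))  -- "abc?def"  false

-- normalization: U+0041 + U+0301 (COMBINING ACUTE ACCENT) composes to U+00C1
local decomposed = "A\204\129"
print(utf8.isnfc(decomposed))             -- false
local nfc, was_nfc = utf8.normalize_nfc(decomposed)
print(nfc == "\195\129", was_nfc)         -- true   false
```
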
217 | Improvements needed
218 | -------------------
219 | 
220 | - add a Lua 5.3 spec test-suite.
221 | - more test cases.
222 | - grapheme-compose support, and affect in utf8.reverse and utf8.width 223 | 224 | 225 | License 226 | ------- 227 | It uses the same license as Lua: http://www.lua.org/license.html 228 | -------------------------------------------------------------------------------- /fuzzer/Makefile: -------------------------------------------------------------------------------- 1 | ALL: lua-utf8.so fuzz-valid fuzz-clean fuzz-invalid fuzz-normalize fuzz-grapheme 2 | 3 | clean: 4 | rm lua-utf8.so fuzz-valid fuzz-clean fuzz-invalid fuzz-normalize fuzz-grapheme 5 | 6 | lua-utf8.so: ../lutf8lib.c 7 | clang -g -fsanitize=fuzzer-no-link,address -fPIC $$(pkg-config --cflags lua5.1) ../lutf8lib.c -shared -o lua-utf8.so 8 | 9 | fuzz-valid: fuzz-valid.c 10 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 fuzz-valid.c -o fuzz-valid 11 | 12 | fuzz-clean: fuzz-clean.c 13 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 fuzz-clean.c -o fuzz-clean 14 | 15 | fuzz-invalid: fuzz-invalid.c 16 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 fuzz-invalid.c -o fuzz-invalid 17 | 18 | fuzz-normalize: fuzz-normalize.c 19 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 -licuuc fuzz-normalize.c -o fuzz-normalize 20 | 21 | fuzz-grapheme: fuzz-grapheme.c 22 | clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 -licuuc fuzz-grapheme.c -o fuzz-grapheme 23 | -------------------------------------------------------------------------------- /fuzzer/fuzz-clean.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "lua.h" 7 | #include "lualib.h" 8 | #include "lauxlib.h" 9 | 10 | lua_State *L; 11 | 12 | /* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ 13 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 14 | { 15 | unsigned char *p = in, *e = p + in_len; 16 | 17 | while (p < e) { 18 | unsigned char c = *p++; 19 | 20 | if (c < 0x80) { 21 | /* do nothing */ 22 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 23 | if (p < e) { 24 | unsigned char c2 = *p++; 25 | if ((c2 & 0xC0) != 0x80) { 26 | return false; 27 | } 28 | } else { 29 | return false; 30 | } 31 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 32 | if ((e - p) >= 2) { 33 | unsigned char c2 = *p++; 34 | unsigned char c3 = *p++; 35 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 36 | return false; 37 | } else if ((c3 & 0xC0) != 0x80) { 38 | return false; 39 | } 40 | } else { 41 | return false; 42 | } 43 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 44 | if ((e - p) >= 3) { 45 | unsigned char c2 = *p++; 46 | unsigned char c3 = *p++; 47 | unsigned char c4 = *p++; 48 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 49 | * fit in 3 bytes only. 
If c == 0xF4 and c2 >= 0x90, then this codepoint is 50 | * greater than U+10FFFF, which is the highest legal codepoint */ 51 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 52 | return false; 53 | } else if ((c3 & 0xC0) != 0x80) { 54 | return false; 55 | } else if ((c4 & 0xC0) != 0x80) { 56 | return false; 57 | } 58 | } else { 59 | return false; 60 | } 61 | } else { 62 | return false; 63 | } 64 | } 65 | 66 | return true; 67 | } 68 | 69 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 70 | { 71 | lua_getglobal(L, "utf8"); 72 | lua_getfield(L, -1, "clean"); 73 | 74 | const char *orig_data = (const char*)Data; 75 | 76 | uint8_t *Comma = memchr(Data, ',', Size); 77 | const char *repl = NULL; 78 | size_t repl_len; 79 | 80 | if (Comma) { 81 | /* We will pass two arguments (the 2nd one is optional) */ 82 | lua_pushlstring(L, (const char*)Data, Comma - Data); 83 | Size -= Comma - Data + 1; 84 | Data = Comma + 1; 85 | repl = (const char*)Data; 86 | repl_len = Size; 87 | } 88 | 89 | lua_pushlstring(L, (const char*)Data, Size); 90 | 91 | size_t input_len = lua_objlen(L, Comma ? -2 : -1); 92 | 93 | /* 94 | const char *dbg = lua_tostring(L, Comma ? -2 : -1); 95 | printf("Input length = %zu\n", input_len); 96 | printf("Input = "); 97 | for (int i = 0; i < input_len; i++) 98 | printf("%02x", dbg[i] & 0xFF); 99 | printf("\n"); 100 | */ 101 | 102 | int err = lua_pcall(L, Comma ? 2 : 1, 2, 0); 103 | /* printf("Err = %x\n", err); */ 104 | 105 | if (err) { 106 | /* utf8.clean raised an error */ 107 | assert(repl != NULL); 108 | 109 | /* 110 | if (err == 2) { 111 | const char *errmsg = lua_tostring(L, -1); 112 | printf("Err message = %s\n", errmsg); 113 | } 114 | 115 | printf("Replacement length = %zu\n", repl_len); 116 | printf("Replacement = "); 117 | for (int i = 0; i < repl_len; i++) 118 | printf("%02x", repl[i] & 0xFF); 119 | printf("\n"); 120 | */ 121 | 122 | assert(!php_mbstring_check_utf8((unsigned char*)repl, repl_len)); 123 | } else { 124 | assert(lua_isstring(L, -2)); 125 | assert(lua_isboolean(L, -1)); 126 | const char *str = lua_tostring(L, -2); 127 | int was_clean = lua_toboolean(L, -1); 128 | size_t output_len = lua_objlen(L, -2); 129 | 130 | /* 131 | printf("Output length = %zu\n", output_len); 132 | printf("Output = "); 133 | for (int i = 0; i < output_len; i++) 134 | printf("%02x", str[i] & 0xFF); 135 | printf("\n"); 136 | */ 137 | 138 | if (was_clean) { 139 | assert(input_len == output_len); 140 | assert(memcmp(orig_data, str, input_len) == 0); 141 | } else { 142 | assert(input_len != output_len || memcmp(orig_data, str, input_len) != 0); 143 | } 144 | assert(php_mbstring_check_utf8((unsigned char*)str, output_len)); 145 | } 146 | 147 | lua_settop(L, 0); // clear Lua stack 148 | 149 | return 0; 150 | } 151 | 152 | int LLVMFuzzerInitialize(int *argc, char ***argv) 153 | { 154 | L = luaL_newstate(); 155 | luaL_openlibs(L); 156 | lua_getglobal(L, "require"); 157 | lua_pushstring(L, "lua-utf8"); 158 | lua_call(L, 1, 1); 159 | lua_setglobal(L, "utf8"); 160 | return 0; 161 | } 162 | -------------------------------------------------------------------------------- /fuzzer/fuzz-grapheme.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "lua.h" 8 | #include "lualib.h" 9 | #include "lauxlib.h" 10 | 11 | #include "unicode/ucnv.h" 12 | #include "unicode/ubrk.h" 13 | 14 | lua_State *L; 15 | 16 | /* Adapted from mb_utf8_to_wchar (from the 
PHP codebase) */ 17 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 18 | { 19 | unsigned char *p = in, *e = p + in_len; 20 | 21 | while (p < e) { 22 | unsigned char c = *p++; 23 | 24 | if (c < 0x80) { 25 | /* do nothing */ 26 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 27 | if (p < e) { 28 | unsigned char c2 = *p++; 29 | if ((c2 & 0xC0) != 0x80) { 30 | return false; 31 | } 32 | } else { 33 | return false; 34 | } 35 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 36 | if ((e - p) >= 2) { 37 | unsigned char c2 = *p++; 38 | unsigned char c3 = *p++; 39 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 40 | return false; 41 | } else if ((c3 & 0xC0) != 0x80) { 42 | return false; 43 | } 44 | } else { 45 | return false; 46 | } 47 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 48 | if ((e - p) >= 3) { 49 | unsigned char c2 = *p++; 50 | unsigned char c3 = *p++; 51 | unsigned char c4 = *p++; 52 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 53 | * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is 54 | * greater than U+10FFFF, which is the highest legal codepoint */ 55 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 56 | return false; 57 | } else if ((c3 & 0xC0) != 0x80) { 58 | return false; 59 | } else if ((c4 & 0xC0) != 0x80) { 60 | return false; 61 | } 62 | } else { 63 | return false; 64 | } 65 | } else { 66 | return false; 67 | } 68 | } 69 | 70 | return true; 71 | } 72 | 73 | /* From PHP codebase */ 74 | const unsigned char mblen_table_utf8[] = { 75 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 76 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 77 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 78 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 79 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 80 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 81 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 82 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 84 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 85 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 86 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 87 | 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 88 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 89 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 90 | 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 91 | }; 92 | 93 | const size_t utf16_code_unit_len(const unsigned char *s, size_t byte_len) { 94 | const unsigned char *e = s + byte_len; 95 | size_t result = 0; 96 | while (s < e) { 97 | unsigned char c = *s; 98 | s += mblen_table_utf8[c]; 99 | result++; 100 | if (c >= 0xF0 && c <= 0xF4) 101 | result++; /* 4-byte UTF-8 characters will take 2 UTF-16 code units */ 102 | } 103 | return result; 104 | } 105 | 106 | /* Adapted from source code for PostgreSQL ICU extension */ 107 | static int32_t icu_to_uchar(UConverter *icu_converter, UChar **buff_uchar, const char *buff, int32_t nbytes) 108 | { 109 | UErrorCode status = U_ZERO_ERROR; 110 | int32_t len_uchar = ucnv_toUChars(icu_converter, NULL, 0, buff, nbytes, &status); 111 | if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 112 | printf("Error from ucnv_toUChars: %s\n", u_errorName(status)); 113 | assert(0); 114 | } 115 | 116 | *buff_uchar = (UChar *) malloc((len_uchar + 1) * sizeof(**buff_uchar)); 117 | 118 | status = U_ZERO_ERROR; 119 | len_uchar = 
ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1,buff, nbytes, &status); 120 | if (U_FAILURE(status)) { 121 | printf("Error from ucnv_toUChars: %s\n", u_errorName(status)); 122 | assert(0); 123 | } 124 | 125 | return len_uchar; 126 | } 127 | 128 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 129 | { 130 | /* 131 | printf("(%zu): ", Size); 132 | for (unsigned int i = 0; i < Size; i++) 133 | printf("%02x ", Data[i]); 134 | printf("\n"); 135 | */ 136 | 137 | /* We can only compare with the results from ICU if the entire string was valid UTF-8; 138 | * ICU needs to convert the entire string to codepoints before operationg on it, 139 | * and it can only do that if it's valid UTF-8 */ 140 | bool valid_utf8 = php_mbstring_check_utf8((unsigned char*)Data, Size); 141 | 142 | UChar *ubuff = NULL; 143 | int32_t usize = 0; 144 | UConverter *icu_converter = NULL; 145 | UBreakIterator *bi = NULL; 146 | uint32_t p = 0; 147 | 148 | if (valid_utf8) { 149 | UErrorCode errcode = U_ZERO_ERROR; 150 | icu_converter = ucnv_open("utf8", &errcode); 151 | if (U_FAILURE(errcode)) { 152 | printf("Error from ucnv_open: %s\n", u_errorName(errcode)); 153 | assert(0); 154 | } 155 | usize = icu_to_uchar(icu_converter, &ubuff, (const char*)Data, Size); 156 | errcode = U_ZERO_ERROR; 157 | 158 | /* 159 | printf("UTF-16 code units from ICU: (%d): ", usize); 160 | for (unsigned int i = 0; i < usize; i++) 161 | printf("%04x ", ubuff[i]); 162 | printf("\n"); 163 | */ 164 | 165 | bi = ubrk_open(UBRK_CHARACTER, 0, ubuff, usize, &errcode); 166 | if (U_FAILURE(errcode)) { 167 | printf("Error from ubrk_open: %s\n", u_errorName(errcode)); 168 | assert(0); 169 | } 170 | p = ubrk_first(bi); 171 | } 172 | 173 | lua_getglobal(L, "utf8"); 174 | lua_getfield(L, -1, "grapheme_indices"); 175 | lua_pushlstring(L, (const char*)Data, Size); 176 | int err = lua_pcall(L, 1, 1, 0); 177 | assert(!err); 178 | assert(lua_iscfunction(L, -1)); 179 | lua_CFunction iterator = lua_tocfunction(L, -1); 180 | 181 | while (true) { 182 | lua_pushvalue(L, -1); // duplicate iterator (on top of stack) 183 | int err = lua_pcall(L, 0, 2, 0); 184 | if (err) { 185 | assert(!valid_utf8); 186 | break; 187 | } 188 | 189 | if (lua_isnil(L, -1)) { 190 | /* Finished iteration */ 191 | if (valid_utf8) { 192 | p = ubrk_next(bi); 193 | assert(p == UBRK_DONE); 194 | } 195 | break; 196 | } else { 197 | assert(lua_isnumber(L, -1)); 198 | assert(lua_isnumber(L, -2)); 199 | int start = lua_tonumber(L, -2); 200 | int end = lua_tonumber(L, -1); 201 | lua_pop(L, 2); 202 | if (valid_utf8) { 203 | printf("start = %d, end = %d, p = %d\n", start, end, p); 204 | /* start and end are byte offsets, p is a codepoint offset */ 205 | assert(p == utf16_code_unit_len(Data, start-1)); 206 | p = ubrk_next(bi); 207 | printf("moved to next boundary, now p = %d\n", p); 208 | printf("utf16_code_unit_len(Data, end) = %zu\n", utf16_code_unit_len(Data, end)); 209 | assert(p != UBRK_DONE); 210 | assert(p == utf16_code_unit_len(Data, end)); 211 | } 212 | } 213 | } 214 | 215 | lua_settop(L, 0); // clear Lua stack 216 | 217 | free(ubuff); 218 | if (icu_converter) 219 | ucnv_close(icu_converter); 220 | if (bi) 221 | ubrk_close(bi); 222 | 223 | return 0; 224 | } 225 | 226 | int LLVMFuzzerInitialize(int *argc, char ***argv) 227 | { 228 | L = luaL_newstate(); 229 | luaL_openlibs(L); 230 | lua_getglobal(L, "require"); 231 | lua_pushstring(L, "lua-utf8"); 232 | lua_call(L, 1, 1); 233 | lua_setglobal(L, "utf8"); 234 | return 0; 235 | } 236 | 
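/* Note on offset units: utf8.grapheme_indices yields 1-based *byte* offsets into the
 * original string, while ubrk_first/ubrk_next report boundaries in UTF-16 code units;
 * utf16_code_unit_len() converts the byte offsets so that the assertions in
 * LLVMFuzzerTestOneInput above compare like with like. */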
-------------------------------------------------------------------------------- /fuzzer/fuzz-invalid.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "lua.h" 7 | #include "lualib.h" 8 | #include "lauxlib.h" 9 | 10 | lua_State *L; 11 | 12 | /* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ 13 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 14 | { 15 | unsigned char *p = in, *e = p + in_len; 16 | 17 | while (p < e) { 18 | unsigned char c = *p++; 19 | 20 | if (c < 0x80) { 21 | /* do nothing */ 22 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 23 | if (p < e) { 24 | unsigned char c2 = *p++; 25 | if ((c2 & 0xC0) != 0x80) { 26 | return false; 27 | } 28 | } else { 29 | return false; 30 | } 31 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 32 | if ((e - p) >= 2) { 33 | unsigned char c2 = *p++; 34 | unsigned char c3 = *p++; 35 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 36 | return false; 37 | } else if ((c3 & 0xC0) != 0x80) { 38 | return false; 39 | } 40 | } else { 41 | return false; 42 | } 43 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 44 | if ((e - p) >= 3) { 45 | unsigned char c2 = *p++; 46 | unsigned char c3 = *p++; 47 | unsigned char c4 = *p++; 48 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 49 | * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is 50 | * greater than U+10FFFF, which is the highest legal codepoint */ 51 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 52 | return false; 53 | } else if ((c3 & 0xC0) != 0x80) { 54 | return false; 55 | } else if ((c4 & 0xC0) != 0x80) { 56 | return false; 57 | } 58 | } else { 59 | return false; 60 | } 61 | } else { 62 | return false; 63 | } 64 | } 65 | 66 | return true; 67 | } 68 | 69 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 70 | { 71 | lua_getglobal(L, "utf8"); 72 | lua_getfield(L, -1, "invalidoffset"); 73 | 74 | int offset = 0; 75 | if (Size > 2) { 76 | offset = *Data++; 77 | if (*Data++ % 2 == 1) 78 | offset = -offset; 79 | Size -= 2; 80 | } 81 | 82 | lua_pushlstring(L, (const char*)Data, Size); 83 | lua_pushinteger(L, offset); 84 | 85 | /* 86 | const char *dbg = lua_tostring(L, -2); 87 | printf("Input length = %zu\n", Size); 88 | printf("Input = "); 89 | for (int i = 0; i < Size; i++) 90 | printf("%02x", Data[i] & 0xFF); 91 | printf("\n"); 92 | printf("Offset = %d\n", offset); 93 | */ 94 | 95 | lua_call(L, 2, 1); 96 | 97 | assert(lua_isnumber(L, -1) || lua_isnil(L, -1)); 98 | 99 | /* Convert offset into a positive number from 1 - length of string 100 | * (offset is 1-based, not 0-based) */ 101 | if (offset < 0) { 102 | offset = Size + offset + 1; 103 | if (offset <= 0) { 104 | offset = 1; 105 | } 106 | } else if (offset == 0) { 107 | offset = 1; 108 | } else if (offset > Size) { 109 | offset = Size + 1; 110 | } 111 | 112 | if (lua_isnumber(L, -1)) { 113 | double retval = lua_tonumber(L, -1); 114 | /* printf("Retval = %d\n", (int)retval); */ 115 | assert(floor(retval) == ceil(retval)); /* Although 'double', it's actually an integer */ 116 | assert(retval >= offset); 117 | assert(retval > 0); 118 | assert(retval <= Size); 119 | assert(!php_mbstring_check_utf8((unsigned char*)Data + (int)retval - 1, Size - (int)retval + 1)); 120 | } else { 121 | assert(php_mbstring_check_utf8((unsigned char*)Data + 
offset - 1, Size - offset + 1)); 122 | } 123 | 124 | lua_settop(L, 0); // clear Lua stack 125 | 126 | return 0; 127 | } 128 | 129 | int LLVMFuzzerInitialize(int *argc, char ***argv) 130 | { 131 | L = luaL_newstate(); 132 | luaL_openlibs(L); 133 | lua_getglobal(L, "require"); 134 | lua_pushstring(L, "lua-utf8"); 135 | lua_call(L, 1, 1); 136 | lua_setglobal(L, "utf8"); 137 | return 0; 138 | } 139 | -------------------------------------------------------------------------------- /fuzzer/fuzz-normalize.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "lua.h" 8 | #include "lualib.h" 9 | #include "lauxlib.h" 10 | 11 | #include "unicode/ucnv.h" 12 | #include "unicode/unorm2.h" 13 | 14 | lua_State *L; 15 | 16 | /* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ 17 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 18 | { 19 | unsigned char *p = in, *e = p + in_len; 20 | 21 | while (p < e) { 22 | unsigned char c = *p++; 23 | 24 | if (c < 0x80) { 25 | /* do nothing */ 26 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 27 | if (p < e) { 28 | unsigned char c2 = *p++; 29 | if ((c2 & 0xC0) != 0x80) { 30 | return false; 31 | } 32 | } else { 33 | return false; 34 | } 35 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 36 | if ((e - p) >= 2) { 37 | unsigned char c2 = *p++; 38 | unsigned char c3 = *p++; 39 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 40 | return false; 41 | } else if ((c3 & 0xC0) != 0x80) { 42 | return false; 43 | } 44 | } else { 45 | return false; 46 | } 47 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 48 | if ((e - p) >= 3) { 49 | unsigned char c2 = *p++; 50 | unsigned char c3 = *p++; 51 | unsigned char c4 = *p++; 52 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 53 | * fit in 3 bytes only. 
If c == 0xF4 and c2 >= 0x90, then this codepoint is 54 | * greater than U+10FFFF, which is the highest legal codepoint */ 55 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 56 | return false; 57 | } else if ((c3 & 0xC0) != 0x80) { 58 | return false; 59 | } else if ((c4 & 0xC0) != 0x80) { 60 | return false; 61 | } 62 | } else { 63 | return false; 64 | } 65 | } else { 66 | return false; 67 | } 68 | } 69 | 70 | return true; 71 | } 72 | 73 | /* Adapted from source code for PostgreSQL ICU extension */ 74 | static int32_t icu_to_uchar(UConverter *icu_converter, UChar **buff_uchar, const char *buff, int32_t nbytes) 75 | { 76 | UErrorCode status = U_ZERO_ERROR; 77 | int32_t len_uchar = ucnv_toUChars(icu_converter, NULL, 0, buff, nbytes, &status); 78 | if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 79 | printf("Error from ucnv_toUChars: %s\n", u_errorName(status)); 80 | assert(0); 81 | } 82 | 83 | *buff_uchar = (UChar *) malloc((len_uchar + 1) * sizeof(**buff_uchar)); 84 | 85 | status = U_ZERO_ERROR; 86 | len_uchar = ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1,buff, nbytes, &status); 87 | if (U_FAILURE(status)) { 88 | printf("Error from ucnv_toUChars: %s\n", u_errorName(status)); 89 | assert(0); 90 | } 91 | 92 | return len_uchar; 93 | } 94 | 95 | static int32_t icu_from_uchar(UConverter *icu_converter, char **result, const UChar *buff_uchar, int32_t len_uchar) 96 | { 97 | UErrorCode status = U_ZERO_ERROR; 98 | uint32_t len_result = ucnv_fromUChars(icu_converter, NULL, 0, buff_uchar, len_uchar, &status); 99 | if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) 100 | assert(0); 101 | 102 | *result = (char *) malloc(len_result + 1); 103 | 104 | status = U_ZERO_ERROR; 105 | len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1, buff_uchar, len_uchar, &status); 106 | if (U_FAILURE(status)) 107 | assert(0); 108 | 109 | return len_result; 110 | } 111 | 112 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 113 | { 114 | /* 115 | printf("Input (%zu): ", Size); 116 | for (unsigned int i = 0; i < Size; i++) 117 | printf("%02x ", Data[i]); 118 | printf("\n"); 119 | */ 120 | 121 | /* We can only compare with the results from ICU if the entire string was valid UTF-8; 122 | * ICU won't even allow us to check whether the string is NFC unless it's valid UTF-8 */ 123 | bool valid_utf8 = php_mbstring_check_utf8((unsigned char*)Data, Size); 124 | 125 | UChar *ubuff = NULL; 126 | int32_t usize = 0; 127 | UConverter *icu_converter = NULL; 128 | 129 | if (valid_utf8) { 130 | UErrorCode errcode = U_ZERO_ERROR; 131 | icu_converter = ucnv_open("utf8", &errcode); 132 | if (U_FAILURE(errcode)) { 133 | printf("Error from ucnv_open: %s\n", u_errorName(errcode)); 134 | assert(0); 135 | } 136 | usize = icu_to_uchar(icu_converter, &ubuff, (const char*)Data, Size); 137 | } 138 | 139 | lua_getglobal(L, "utf8"); 140 | lua_getfield(L, -1, "isnfc"); 141 | lua_pushlstring(L, (const char*)Data, Size); 142 | int err = lua_pcall(L, 1, 1, 0); 143 | 144 | if (err) { 145 | /* utf8.isnfc raised an error */ 146 | assert(!valid_utf8); 147 | } else { 148 | assert(lua_isboolean(L, -1)); 149 | int was_nfc = lua_toboolean(L, -1); 150 | 151 | /* If the string was not NFC, we cannot assume that the string is valid UTF-8, 152 | * even if no error was raised... 
if utf8.isnfc notices that the string is not NFC, 153 | * it will immediately return false and will not check whether the trailing portion 154 | * is valid UTF-8 or not */ 155 | assert(!was_nfc || valid_utf8); 156 | 157 | if (valid_utf8) { 158 | UErrorCode errcode = U_ZERO_ERROR; 159 | const UNormalizer2 *norm = unorm2_getNFCInstance(&errcode); 160 | assert(!U_FAILURE(errcode)); 161 | UBool was_actually_nfc = unorm2_isNormalized(norm, ubuff, usize, &errcode); 162 | assert(!U_FAILURE(errcode)); 163 | 164 | /* 165 | printf("lua-utf8, is the input NFC? %s\n", was_nfc ? "yes" : "no"); 166 | printf("ICU, is the input NFC? %s\n", was_actually_nfc ? "yes" : "no"); 167 | */ 168 | 169 | assert(was_nfc == was_actually_nfc); 170 | } 171 | } 172 | 173 | lua_getglobal(L, "utf8"); 174 | lua_getfield(L, -1, "normalize_nfc"); 175 | lua_pushlstring(L, (const char*)Data, Size); 176 | err = lua_pcall(L, 1, 2, 0); 177 | 178 | if (err) { 179 | /* utf8.nfc_normalize raised an error */ 180 | assert(!valid_utf8); 181 | } else { 182 | assert(lua_isboolean(L, -1)); 183 | int was_already_nfc = lua_toboolean(L, -1); 184 | 185 | assert(lua_isstring(L, -2)); 186 | const char *str = lua_tostring(L, -2); 187 | size_t str_len = lua_objlen(L, -2); 188 | 189 | assert(valid_utf8 || !was_already_nfc); 190 | 191 | if (valid_utf8) { 192 | UErrorCode errcode = U_ZERO_ERROR; 193 | const UNormalizer2 *norm = unorm2_getNFCInstance(&errcode); 194 | assert(!U_FAILURE(errcode)); 195 | 196 | uint32_t dest_size = 3 * usize; /* Maximum size which string could possibly expand to as NFC */ 197 | UChar *dest = malloc(dest_size * sizeof(UChar)); 198 | 199 | uint32_t dest_len = unorm2_normalize(norm, ubuff, usize, dest, dest_size, &errcode); 200 | assert(!U_FAILURE(errcode)); 201 | 202 | /* Convert NFC codepoints to UTF-8 bytes */ 203 | char *bytes = NULL; 204 | uint32_t byte_len = icu_from_uchar(icu_converter, &bytes, dest, dest_len); 205 | 206 | /* 207 | printf("lua-utf8 (%zu): ", str_len); 208 | for (unsigned int i = 0; i < str_len; i++) 209 | printf("%02x ", (uint8_t)str[i]); 210 | printf("\n"); 211 | printf("ICU (%u): ", byte_len); 212 | for (unsigned int i = 0; i < byte_len; i++) 213 | printf("%02x ", (uint8_t)bytes[i]); 214 | printf("\n"); 215 | */ 216 | 217 | assert(byte_len == str_len); 218 | assert(strncmp(str, bytes, str_len) == 0); 219 | 220 | free(dest); 221 | free(bytes); 222 | } 223 | } 224 | 225 | lua_settop(L, 0); // clear Lua stack 226 | 227 | free(ubuff); 228 | if (icu_converter) 229 | ucnv_close(icu_converter); 230 | 231 | return 0; 232 | } 233 | 234 | int LLVMFuzzerInitialize(int *argc, char ***argv) 235 | { 236 | L = luaL_newstate(); 237 | luaL_openlibs(L); 238 | lua_getglobal(L, "require"); 239 | lua_pushstring(L, "lua-utf8"); 240 | lua_call(L, 1, 1); 241 | lua_setglobal(L, "utf8"); 242 | return 0; 243 | } 244 | -------------------------------------------------------------------------------- /fuzzer/fuzz-valid.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "lua.h" 6 | #include "lualib.h" 7 | #include "lauxlib.h" 8 | 9 | lua_State *L; 10 | 11 | /* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ 12 | static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) 13 | { 14 | unsigned char *p = in, *e = p + in_len; 15 | 16 | while (p < e) { 17 | unsigned char c = *p++; 18 | 19 | if (c < 0x80) { 20 | /* do nothing */ 21 | } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ 22 | if (p < e) { 23 | unsigned char 
c2 = *p++; 24 | if ((c2 & 0xC0) != 0x80) { 25 | return false; 26 | } 27 | } else { 28 | return false; 29 | } 30 | } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ 31 | if ((e - p) >= 2) { 32 | unsigned char c2 = *p++; 33 | unsigned char c3 = *p++; 34 | if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { 35 | return false; 36 | } else if ((c3 & 0xC0) != 0x80) { 37 | return false; 38 | } 39 | } else { 40 | return false; 41 | } 42 | } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ 43 | if ((e - p) >= 3) { 44 | unsigned char c2 = *p++; 45 | unsigned char c3 = *p++; 46 | unsigned char c4 = *p++; 47 | /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have 48 | * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is 49 | * greater than U+10FFFF, which is the highest legal codepoint */ 50 | if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { 51 | return false; 52 | } else if ((c3 & 0xC0) != 0x80) { 53 | return false; 54 | } else if ((c4 & 0xC0) != 0x80) { 55 | return false; 56 | } 57 | } else { 58 | return false; 59 | } 60 | } else { 61 | return false; 62 | } 63 | } 64 | 65 | return true; 66 | } 67 | 68 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) 69 | { 70 | lua_getglobal(L, "utf8"); 71 | lua_getfield(L, -1, "isvalid"); 72 | lua_pushlstring(L, (const char*)Data, Size); 73 | lua_call(L, 1, 1); 74 | 75 | assert(lua_isboolean(L, -1)); 76 | int was_valid = lua_toboolean(L, -1); 77 | if (was_valid) { 78 | assert(php_mbstring_check_utf8((unsigned char*)Data, Size)); 79 | } else { 80 | assert(!php_mbstring_check_utf8((unsigned char*)Data, Size)); 81 | } 82 | 83 | lua_settop(L, 0); // clear Lua stack 84 | 85 | return 0; 86 | } 87 | 88 | int LLVMFuzzerInitialize(int *argc, char ***argv) 89 | { 90 | L = luaL_newstate(); 91 | luaL_openlibs(L); 92 | lua_getglobal(L, "require"); 93 | lua_pushstring(L, "lua-utf8"); 94 | lua_call(L, 1, 1); 95 | lua_setglobal(L, "utf8"); 96 | return 0; 97 | } 98 | -------------------------------------------------------------------------------- /lutf8lib.c: -------------------------------------------------------------------------------- 1 | /* vim: set ft=c nu et sw=2 fdc=2 fdm=syntax : */ 2 | #define LUA_LIB 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "unidata.h" 14 | 15 | /* UTF-8 string operations */ 16 | 17 | #define UTF8_BUFFSZ 8 18 | #define UTF8_MAX 0x7FFFFFFFu 19 | #define UTF8_MAXCP 0x10FFFFu 20 | #define iscont(p) ((*(p) & 0xC0) == 0x80) 21 | #define CAST(tp,expr) ((tp)(expr)) 22 | 23 | #ifndef LUA_QL 24 | # define LUA_QL(x) "'" x "'" 25 | #endif 26 | 27 | static int utf8_invalid (utfint ch) 28 | { return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); } 29 | 30 | static size_t utf8_encode (char *buff, utfint x) { 31 | int n = 1; /* number of bytes put in buffer (backwards) */ 32 | lua_assert(x <= UTF8_MAX); 33 | if (x < 0x80) /* ascii? */ 34 | buff[UTF8_BUFFSZ - 1] = x & 0x7F; 35 | else { /* need continuation bytes */ 36 | utfint mfb = 0x3f; /* maximum that fits in first byte */ 37 | do { /* add continuation bytes */ 38 | buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f); 39 | x >>= 6; /* remove added bits */ 40 | mfb >>= 1; /* now there is one less bit available in first byte */ 41 | } while (x > mfb); /* still needs continuation byte? 
*/ 42 | buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF; /* add first byte */ 43 | } 44 | return n; 45 | } 46 | 47 | static const char *utf8_decode (const char *s, utfint *val, int strict) { 48 | static const utfint limits[] = 49 | {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u}; 50 | unsigned int c = (unsigned char)s[0]; 51 | utfint res = 0; /* final result */ 52 | if (c < 0x80) /* ascii? */ 53 | res = c; 54 | else { 55 | int count = 0; /* to count number of continuation bytes */ 56 | for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ 57 | unsigned int cc = (unsigned char)s[++count]; /* read next byte */ 58 | if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ 59 | return NULL; /* invalid byte sequence */ 60 | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ 61 | } 62 | res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ 63 | if (count > 5 || res > UTF8_MAX || res < limits[count]) 64 | return NULL; /* invalid byte sequence */ 65 | s += count; /* skip continuation bytes read */ 66 | } 67 | if (strict) { 68 | /* check for invalid code points; too large or surrogates */ 69 | if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu)) 70 | return NULL; 71 | } 72 | if (val) *val = res; 73 | return s + 1; /* +1 to include first byte */ 74 | } 75 | 76 | static const char *utf8_prev (const char *s, const char *e) { 77 | while (s < e && iscont(e - 1)) --e; 78 | return s < e ? e - 1 : s; 79 | } 80 | 81 | static const char *utf8_next (const char *s, const char *e) { 82 | while (s < e && iscont(s + 1)) ++s; 83 | return s < e ? s + 1 : e; 84 | } 85 | 86 | static size_t utf8_length (const char *s, const char *e) { 87 | size_t i; 88 | for (i = 0; s < e; ++i) 89 | s = utf8_next(s, e); 90 | return i; 91 | } 92 | 93 | static const char *utf8_offset (const char *s, const char *e, lua_Integer offset, lua_Integer idx) { 94 | const char *p = s + offset - 1; 95 | if (idx >= 0) { 96 | while (p < e && idx > 0) 97 | p = utf8_next(p, e), --idx; 98 | return idx == 0 ? p : NULL; 99 | } else { 100 | while (s < p && idx < 0) 101 | p = utf8_prev(s, p), ++idx; 102 | return idx == 0 ? p : NULL; 103 | } 104 | } 105 | 106 | static const char *utf8_relat (const char *s, const char *e, int idx) { 107 | return idx >= 0 ? 108 | utf8_offset(s, e, 1, idx - 1) : 109 | utf8_offset(s, e, e-s+1, idx); 110 | } 111 | 112 | static int utf8_range(const char *s, const char *e, lua_Integer *i, lua_Integer *j) { 113 | const char *ps = utf8_relat(s, e, CAST(int, *i)); 114 | const char *pe = utf8_relat(s, e, CAST(int, *j)); 115 | *i = (ps ? ps : (*i > 0 ? e : s)) - s; 116 | *j = (pe ? utf8_next(pe, e) : (*j > 0 ? 
e : s)) - s; 117 | return *i < *j; 118 | } 119 | 120 | /* Indexed by top nibble of first byte in code unit */ 121 | static uint8_t utf8_code_unit_len[] = { 122 | 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 3, 4 123 | }; 124 | 125 | /* Return pointer to first invalid UTF-8 sequence in 's', or NULL if valid */ 126 | static const char *utf8_invalid_offset(const char *s, const char *e) { 127 | while (s < e) { 128 | uint8_t c = *s; 129 | if (c >= 0x80) { 130 | /* c < 0xC0 means a continuation byte, but we are not in the middle of a multi-byte code unit 131 | * c >= 0xC0 && c < 0xC2 means an overlong 2-byte code unit 132 | * c >= 0xF8 means a 5-byte or 6-byte code unit, which is illegal, or else illegal byte 0xFE/0xFF 133 | * c >= 0xF5 && c < 0xF8 means a 4-byte code unit encoding invalid codepoint > U+10FFFF */ 134 | if (c < 0xC2 || c >= 0xF5) 135 | return s; 136 | uint8_t needed_bytes = utf8_code_unit_len[c >> 4]; 137 | if (e - s < needed_bytes) 138 | return s; /* String is truncated */ 139 | uint8_t c2 = *(s+1); 140 | if ((c2 & 0xC0) != 0x80) 141 | return s; /* 2nd byte of code unit is not a continuation byte */ 142 | if (needed_bytes >= 3) { 143 | uint8_t c3 = *(s+2); 144 | if ((c3 & 0xC0) != 0x80) 145 | return s; /* 3rd byte of code unit is not a continuation byte */ 146 | if (needed_bytes == 3) { 147 | if (c == 0xE0 && c2 < 0xA0) 148 | return s; /* Overlong 3-byte code unit */ 149 | if (c == 0xED && c2 >= 0xA0) 150 | return s; /* Reserved codepoint from U+D800-U+DFFF */ 151 | } else { 152 | uint8_t c4 = *(s+3); 153 | if ((c4 & 0xC0) != 0x80) 154 | return s; /* 4th byte of code unit is not a continuation byte */ 155 | if (c == 0xF0 && c2 < 0x90) 156 | return s; /* Overlong 4-byte code unit */ 157 | if (c == 0xF4 && c2 >= 0x90) 158 | return s; /* Illegal codepoint > U+10FFFF */ 159 | } 160 | } 161 | s += needed_bytes; 162 | } else { 163 | s++; 164 | } 165 | } 166 | return NULL; 167 | } 168 | 169 | /* Unicode character categories */ 170 | 171 | #define table_size(t) (sizeof(t)/sizeof((t)[0])) 172 | 173 | #define utf8_categories(X) \ 174 | X('a', alpha) \ 175 | X('c', cntrl) \ 176 | X('d', digit) \ 177 | X('l', lower) \ 178 | X('p', punct) \ 179 | X('s', space) \ 180 | X('t', compose) \ 181 | X('u', upper) \ 182 | X('x', xdigit) 183 | 184 | #define utf8_converters(X) \ 185 | X(lower) \ 186 | X(upper) \ 187 | X(title) \ 188 | X(fold) 189 | 190 | static int find_in_range (range_table *t, size_t size, utfint ch) { 191 | size_t begin, end; 192 | 193 | begin = 0; 194 | end = size; 195 | 196 | while (begin < end) { 197 | size_t mid = (begin + end) / 2; 198 | if (t[mid].last < ch) 199 | begin = mid + 1; 200 | else if (t[mid].first > ch) 201 | end = mid; 202 | else 203 | return (ch - t[mid].first) % t[mid].step == 0; 204 | } 205 | 206 | return 0; 207 | } 208 | 209 | static int convert_char (conv_table *t, size_t size, utfint ch) { 210 | size_t begin, end; 211 | 212 | begin = 0; 213 | end = size; 214 | 215 | while (begin < end) { 216 | size_t mid = (begin + end) / 2; 217 | if (t[mid].last < ch) 218 | begin = mid + 1; 219 | else if (t[mid].first > ch) 220 | end = mid; 221 | else if ((ch - t[mid].first) % t[mid].step == 0) 222 | return ch + t[mid].offset; 223 | else 224 | return ch; 225 | } 226 | 227 | return ch; 228 | } 229 | 230 | /* Normalization */ 231 | 232 | static int lookup_canon_cls (utfint ch) { 233 | /* The first codepoint with canonicalization class != 0 is U+0300 COMBINING GRAVE ACCENT */ 234 | if (ch < 0x300) { 235 | return 0; 236 | } 237 | size_t begin = 0, end = 
table_size(nfc_combining_table); 238 | 239 | while (begin < end) { 240 | size_t mid = (begin + end) / 2; 241 | if (nfc_combining_table[mid].last < ch) 242 | begin = mid + 1; 243 | else if (nfc_combining_table[mid].first > ch) 244 | end = mid; 245 | else 246 | return nfc_combining_table[mid].canon_cls; 247 | } 248 | 249 | return 0; 250 | } 251 | 252 | static nfc_table *nfc_quickcheck (utfint ch) { 253 | /* The first character which needs to be checked for possible NFC violations 254 | * is U+0300 COMBINING GRAVE ACCENT */ 255 | if (ch < 0x300) { 256 | return NULL; 257 | } 258 | size_t begin = 0, end = table_size(nfc_quickcheck_table); 259 | 260 | while (begin < end) { 261 | size_t mid = (begin + end) / 2; 262 | utfint found = nfc_quickcheck_table[mid].cp; 263 | if (found < ch) 264 | begin = mid + 1; 265 | else if (found > ch) 266 | end = mid; 267 | else 268 | return &nfc_quickcheck_table[mid]; 269 | } 270 | 271 | return NULL; 272 | } 273 | 274 | static int nfc_combine (utfint cp1, utfint cp2, utfint *dest) { 275 | size_t begin = 0, end = table_size(nfc_composite_table); 276 | unsigned int hash = (cp1 * 213) + cp2; 277 | 278 | while (begin < end) { 279 | size_t mid = (begin + end) / 2; 280 | utfint val = nfc_composite_table[mid].hash; 281 | if (val < hash) { 282 | begin = mid + 1; 283 | } else if (val > hash) { 284 | end = mid; 285 | } else if (nfc_composite_table[mid].cp1 == cp1 && nfc_composite_table[mid].cp2 == cp2) { 286 | if (dest) 287 | *dest = nfc_composite_table[mid].dest; 288 | return 1; 289 | } else { 290 | return 0; 291 | } 292 | } 293 | 294 | return 0; 295 | } 296 | 297 | static decompose_table *nfc_decompose (utfint ch) { 298 | size_t begin = 0, end = table_size(nfc_decompose_table); 299 | 300 | while (begin < end) { 301 | size_t mid = (begin + end) / 2; 302 | utfint found = nfc_decompose_table[mid].cp; 303 | if (found < ch) 304 | begin = mid + 1; 305 | else if (found > ch) 306 | end = mid; 307 | else 308 | return &nfc_decompose_table[mid]; 309 | } 310 | 311 | return NULL; 312 | } 313 | 314 | static int nfc_check (utfint ch, nfc_table *entry, utfint starter, unsigned int canon_cls, unsigned int prev_canon_cls) { 315 | int reason = entry->reason; 316 | 317 | if (reason == REASON_MUST_CONVERT_1 || reason == REASON_MUST_CONVERT_2) { 318 | /* This codepoint has a different, canonical form, so this string is not NFC */ 319 | return 0; 320 | } else if (reason == REASON_STARTER_CAN_COMBINE) { 321 | /* It is possible that this 'starter' codepoint should have been combined with the 322 | * preceding 'starter' codepoint; if so, this string is not NFC */ 323 | if (!prev_canon_cls && nfc_combine(starter, ch, NULL)) { 324 | /* These codepoints should have been combined */ 325 | return 0; 326 | } 327 | } else if (reason == REASON_COMBINING_MARK) { 328 | /* Combining mark; check if it should have been combined with preceding starter codepoint */ 329 | if (canon_cls <= prev_canon_cls) { 330 | return 1; 331 | } 332 | if (nfc_combine(starter, ch, NULL)) { 333 | /* Yes, they should have been combined. This string is not NFC */ 334 | return 0; 335 | } 336 | /* Could it be that preceding 'starter' codepoint is already combined, but with a 337 | * combining mark which is out of order with this one? 
*/ 338 | decompose_table *decomp = nfc_decompose(starter); 339 | if (decomp) { 340 | if (decomp->canon_cls2 > canon_cls && nfc_combine(decomp->to1, ch, NULL)) { 341 | return 0; 342 | } else { 343 | decompose_table *decomp2 = nfc_decompose(decomp->to1); 344 | if (decomp2 && decomp2->canon_cls2 > canon_cls && nfc_combine(decomp2->to1, ch, NULL)) { 345 | return 0; 346 | } 347 | } 348 | } 349 | } else if (reason == REASON_JAMO_VOWEL) { 350 | if (!prev_canon_cls && starter >= 0x1100 && starter <= 0x1112) { 351 | /* Preceding codepoint was a leading jamo; they should have been combined */ 352 | return 0; 353 | } 354 | } else if (reason == REASON_JAMO_TRAILING) { 355 | if (!prev_canon_cls && starter >= 0xAC00 && starter <= 0xD7A3) { 356 | /* Preceding codepoint was a precomposed Hangul syllable; check if it had no trailing jamo */ 357 | if ((starter - 0xAC00) % 28 == 0) { 358 | /* It didn't have a trailing jamo, so this trailing jamo should have been combined */ 359 | return 0; 360 | } 361 | } 362 | } 363 | 364 | return 1; 365 | } 366 | 367 | static void merge_combining_marks (uint32_t *src1, uint32_t *src2, uint32_t *dest, size_t size1, size_t size2) { 368 | while (size1 && size2) { 369 | if ((*src1 & 0xFF) > (*src2 & 0xFF)) { 370 | *dest++ = *src2++; 371 | size2--; 372 | } else { 373 | *dest++ = *src1++; 374 | size1--; 375 | } 376 | } 377 | while (size1) { 378 | *dest++ = *src1++; 379 | size1--; 380 | } 381 | while (size2) { 382 | *dest++ = *src2++; 383 | size2--; 384 | } 385 | } 386 | 387 | static void stable_sort_combining_marks (uint32_t *vector, uint32_t *scratch, size_t size) { 388 | /* We need to use a stable sort for sorting combining marks which are in the wrong order 389 | * when doing NFC normalization; bottom-up merge sort is fast and stable */ 390 | size_t limit = size - 1; 391 | for (unsigned int i = 0; i < limit; i += 2) { 392 | if ((vector[i] & 0xFF) > (vector[i+1] & 0xFF)) { 393 | uint32_t temp = vector[i]; 394 | vector[i] = vector[i+1]; 395 | vector[i+1] = temp; 396 | } 397 | } 398 | if (size <= 2) 399 | return; 400 | 401 | uint32_t *src = vector, *dest = scratch; 402 | unsigned int runsize = 2; /* Every consecutive slice of this size is sorted */ 403 | while (runsize < size) { 404 | unsigned int blocksize = runsize * 2; /* We will now sort slices of this size */ 405 | limit = size & ~(blocksize - 1); 406 | for (unsigned int i = 0; i < limit; i += blocksize) 407 | merge_combining_marks(&src[i], &src[i+runsize], &dest[i], runsize, runsize); 408 | if (size - limit > runsize) { 409 | merge_combining_marks(&src[limit], &src[limit+runsize], &dest[limit], runsize, size - limit - runsize); 410 | } else { 411 | memcpy(&dest[limit], &src[limit], (size - limit) * sizeof(uint32_t)); 412 | } 413 | /* After each series of (progressively larger) merges, we swap src & dest to 414 | * avoid memcpy'ing the partially sorted results from dest back into src */ 415 | uint32_t *temp = src; src = dest; dest = temp; 416 | runsize = blocksize; 417 | } 418 | 419 | if (dest == vector) { 420 | /* Since src & dest are swapped on each iteration of the above loop, 421 | * this actually means the last buffer which was written into 422 | * was 'scratch' */ 423 | memcpy(vector, scratch, size * sizeof(uint32_t)); 424 | } 425 | } 426 | 427 | /* Shuffle item `i` up or down to get it into the right position */ 428 | static void stable_insert_combining_mark (uint32_t *vector, size_t vec_size, unsigned int i) 429 | { 430 | unsigned int item = vector[i]; 431 | unsigned int canon_cls = item & 0xFF; 432 | if (i > 0) { 
433 | if (canon_cls < (vector[i-1] & 0xFF)) { 434 | do { 435 | vector[i] = vector[i-1]; 436 | i--; 437 | } while (i > 0 && canon_cls < (vector[i-1] & 0xFF)); 438 | vector[i] = item; 439 | return; 440 | } 441 | } 442 | if (i < vec_size-1) { 443 | if (canon_cls > (vector[i+1] & 0xFF)) { 444 | do { 445 | vector[i] = vector[i+1]; 446 | i++; 447 | } while (i < vec_size-1 && canon_cls > (vector[i+1] & 0xFF)); 448 | vector[i] = item; 449 | return; 450 | } 451 | } 452 | } 453 | 454 | static void add_utf8char (luaL_Buffer *b, utfint ch); 455 | 456 | static inline void grow_vector_if_needed (uint32_t **vector, uint32_t *onstack, size_t *size, size_t needed) 457 | { 458 | size_t current_size = *size; 459 | if (needed >= current_size) { 460 | size_t new_size = current_size * 2; /* `needed` is never bigger than `current_size * 2` */ 461 | uint32_t *new_vector = malloc(new_size * sizeof(uint32_t)); 462 | memcpy(new_vector, *vector, current_size * sizeof(uint32_t)); 463 | *size = new_size; 464 | if (*vector != onstack) 465 | free(*vector); 466 | *vector = new_vector; 467 | } 468 | } 469 | 470 | static void string_to_nfc (lua_State *L, luaL_Buffer *buff, const char *s, const char *e) 471 | { 472 | /* Converting a string to Normal Form C involves: 473 | * 1) Ensuring that codepoints with "built-in" accents are used whenever possible 474 | * rather than separate codepoints for a base character and combining mark 475 | * 2) Where combining marks must be used, putting them into canonical order 476 | * 3) Converting some deprecated codepoints to the recommended variant 477 | * 4) Ensuring that Korean Hangul are represented as precomposed syllable 478 | * codepoints whenever possible, rather than sequences of Jamo codepoints 479 | * 480 | * (Combining marks are accents which appear on top of or below the preceding 481 | * character. Starter codepoints are the base characters which combining marks can 482 | * 'combine' with. Almost all codepoints are starters, including all the Latin alphabet. 483 | * Every Unicode codepoint has a numeric 'canonicalization class'; starters have class = 0. 484 | * Combining marks must be sorted in order of their canonicalization class. Since the 485 | * canonicalization class numbers are not unique, the sort must be stable.) 486 | * 487 | * When converting to NFC, the largest scope which we need to work on at once 488 | * consists of a 'starter' codepoint and either 1 or more ensuing combining marks, 489 | * OR else a directly following starter codepoint. 490 | * 491 | * As we walk through the string, whenever we pass by a complete sequence of starter + 492 | * combining marks or starter + starter, we process that sequence to see if it is NFC or not. 493 | * If it is, we memcpy the bytes verbatim into the output buffer. If it is not, then we 494 | * convert the codepoints to NFC and then emit those codepoints as UTF-8 bytes. */ 495 | 496 | utfint starter = -1, ch; /* 'starter' is last starter codepoint seen */ 497 | const char *to_copy = s; /* pointer to next bytes we might need to memcpy into output buffer */ 498 | unsigned int prev_canon_cls = 0, canon_cls = 0; 499 | int fixedup = 0; /* has the sequence currently under consideration been modified to make it NFC? 
*/ 500 | 501 | /* Temporary storage for a sequence of consecutive combining marks 502 | * In the vast majority of cases, this small on-stack array will provide enough 503 | * space; if not, we will switch to a malloc'd buffer */ 504 | uint32_t onstack[8]; 505 | size_t vec_size = 0, vec_max = sizeof(onstack)/sizeof(uint32_t); 506 | uint32_t *vector = onstack; 507 | 508 | while (s < e) { 509 | const char *new_s = utf8_decode(s, &ch, 1); 510 | if (new_s == NULL) { 511 | if (vector != onstack) 512 | free(vector); 513 | lua_pushstring(L, "string is not valid UTF-8"); 514 | lua_error(L); 515 | } 516 | unsigned int canon_cls = lookup_canon_cls(ch); 517 | 518 | if (!canon_cls) { 519 | /* This is a starter codepoint */ 520 | nfc_table *entry = nfc_quickcheck(ch); 521 | 522 | /* But in rare cases, a deprecated 'starter' codepoint may convert 523 | * to combining marks instead! 524 | * Why, oh why, did the Unicode Consortium do this?? */ 525 | if (entry && entry->reason == REASON_MUST_CONVERT_2) { 526 | utfint conv1 = entry->data1; 527 | unsigned int canon_cls1 = lookup_canon_cls(conv1); 528 | if (canon_cls1) { 529 | utfint conv2 = entry->data2; 530 | unsigned int canon_cls2 = lookup_canon_cls(conv2); 531 | grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 2); 532 | vector[vec_size++] = (conv1 << 8) | (canon_cls1 & 0xFF); 533 | vector[vec_size++] = (conv2 << 8) | (canon_cls2 & 0xFF); 534 | s = new_s; 535 | prev_canon_cls = canon_cls2; 536 | fixedup = 1; 537 | continue; 538 | } 539 | } 540 | 541 | /* Handle preceding starter and optional sequence of combining marks which may have followed it */ 542 | if (prev_canon_cls) { 543 | /* Before this starter, there was a sequence of combining marks. 544 | * Check those over and emit output to 'buff' */ 545 | process_combining_marks: 546 | 547 | /* Check if accumulated combining marks were in correct order */ 548 | for (unsigned int i = 1; i < vec_size; i++) { 549 | if ((vector[i-1] & 0xFF) > (vector[i] & 0xFF)) { 550 | /* Order is incorrect, we need to sort */ 551 | uint32_t *scratch = malloc(vec_size * sizeof(uint32_t)); 552 | stable_sort_combining_marks(vector, scratch, vec_size); 553 | free(scratch); 554 | fixedup = 1; 555 | break; 556 | } 557 | } 558 | 559 | /* Check if any of those combining marks are in violation of NFC */ 560 | unsigned int i = 0; 561 | while (i < vec_size) { 562 | utfint combine_mark = vector[i] >> 8; 563 | nfc_table *mark_entry = nfc_quickcheck(combine_mark); 564 | if (mark_entry) { 565 | if (mark_entry->reason == REASON_MUST_CONVERT_1) { 566 | /* This combining mark must be converted to a different one */ 567 | vector[i] = (mark_entry->data1 << 8) | mark_entry->data2; 568 | fixedup = 1; 569 | continue; 570 | } else if (mark_entry->reason == REASON_MUST_CONVERT_2) { 571 | /* This combining mark must be converted to two others */ 572 | grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1); 573 | memmove(&vector[i+2], &vector[i+1], sizeof(uint32_t) * (vec_size - i - 1)); 574 | vector[i] = (mark_entry->data1 << 8) | lookup_canon_cls(mark_entry->data1); 575 | vector[i+1] = (mark_entry->data2 << 8) | lookup_canon_cls(mark_entry->data2); 576 | vec_size++; 577 | fixedup = 1; 578 | continue; 579 | } else if (mark_entry->reason == REASON_COMBINING_MARK) { 580 | unsigned int mark_canon_cls = vector[i] & 0xFF; 581 | if (i == 0 || mark_canon_cls > (vector[i-1] & 0xFF)) { 582 | if (nfc_combine(starter, combine_mark, &starter)) { 583 | /* This combining mark must be combined with preceding starter */ 584 | vec_size--; 585 | 
memmove(&vector[i], &vector[i+1], sizeof(uint32_t) * (vec_size - i)); /* Remove element i */ 586 | fixedup = 1; 587 | continue; 588 | } 589 | 590 | decompose_table *decomp = nfc_decompose(starter); 591 | if (decomp) { 592 | if (decomp->canon_cls2 > mark_canon_cls && nfc_combine(decomp->to1, combine_mark, &starter)) { 593 | /* The preceding starter already included an accent, but when represented as a combining 594 | * mark, that accent has a HIGHER canonicalization class than this one 595 | * Further, this one is able to combine with the same base character 596 | * In other words, the base character was wrongly combined with a "lower-priority" 597 | * combining mark; fix that up */ 598 | unsigned int class2 = lookup_canon_cls(decomp->to2); 599 | memmove(&vector[1], &vector[0], sizeof(uint32_t) * i); 600 | vector[0] = (decomp->to2 << 8) | class2; 601 | stable_insert_combining_mark(vector, vec_size, 0); 602 | fixedup = 1; 603 | continue; 604 | } else { 605 | decompose_table *decomp2 = nfc_decompose(decomp->to1); 606 | if (decomp2 && decomp2->canon_cls2 > mark_canon_cls && nfc_combine(decomp2->to1, combine_mark, &starter)) { 607 | grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1); 608 | memmove(&vector[i+2], &vector[i+1], sizeof(uint32_t) * (vec_size - i - 1)); 609 | memmove(&vector[2], &vector[0], sizeof(uint32_t) * i); 610 | vector[0] = (decomp2->to2 << 8) | lookup_canon_cls(decomp2->to2); 611 | vector[1] = (decomp->to2 << 8) | lookup_canon_cls(decomp->to2); 612 | vec_size++; 613 | stable_insert_combining_mark(vector, vec_size, 1); 614 | stable_insert_combining_mark(vector, vec_size, 0); 615 | fixedup = 1; 616 | continue; 617 | } 618 | } 619 | } 620 | } 621 | } 622 | } 623 | i++; 624 | } 625 | 626 | if (fixedup) { 627 | /* The preceding starter/combining mark sequence was bad; convert fixed-up codepoints 628 | * to UTF-8 bytes */ 629 | if (starter != -1) 630 | add_utf8char(buff, starter); 631 | for (unsigned int i = 0; i < vec_size; i++) 632 | add_utf8char(buff, vector[i] >> 8); 633 | } else { 634 | /* The preceding starter/combining mark sequence was good; copy raw bytes to output */ 635 | luaL_addlstring(buff, to_copy, s - to_copy); 636 | } 637 | if (s >= e) { 638 | /* We jumped in to the middle of the main loop to finish processing trailing 639 | * combining marks... 
we are actually done now */ 640 | if (vector != onstack) 641 | free(vector); 642 | return; 643 | } 644 | vec_size = 0; /* Clear vector of combining marks in readiness for next such sequence */ 645 | fixedup = 0; 646 | } else if (starter != -1) { 647 | /* This starter was preceded immediately by another starter 648 | * Check if this one should combine with it */ 649 | fixedup = 0; 650 | if (entry) { 651 | if (entry->reason == REASON_STARTER_CAN_COMBINE && nfc_combine(starter, ch, &ch)) { 652 | fixedup = 1; 653 | } else if (entry->reason == REASON_JAMO_VOWEL && starter >= 0x1100 && starter <= 0x1112) { 654 | ch = 0xAC00 + ((starter - 0x1100) * 588) + ((ch - 0x1161) * 28); 655 | fixedup = 1; 656 | } else if (entry->reason == REASON_JAMO_TRAILING) { 657 | if (starter >= 0xAC00 && starter <= 0xD7A3 && (starter - 0xAC00) % 28 == 0) { 658 | ch = starter + ch - 0x11A7; 659 | fixedup = 1; 660 | } 661 | } 662 | } 663 | if (!fixedup) 664 | add_utf8char(buff, starter); /* Emit previous starter to output */ 665 | } 666 | starter = ch; 667 | to_copy = s; 668 | 669 | /* We are finished processing the preceding starter and optional sequence of combining marks 670 | * Now check if this (possibly deprecated) starter needs to be converted to a canonical variant */ 671 | if (entry) { 672 | if (entry->reason == REASON_MUST_CONVERT_1) { 673 | starter = entry->data1; 674 | fixedup = 1; 675 | } else if (entry->reason == REASON_MUST_CONVERT_2) { 676 | utfint conv1 = entry->data1; 677 | utfint conv2 = entry->data2; 678 | /* It's possible that 'ch' might convert to two other codepoints, 679 | * where the 2nd one is a combining mark */ 680 | unsigned int canon_cls2 = lookup_canon_cls(conv2); 681 | if (canon_cls2) { 682 | /* It's possible that the 1st resulting codepoint may need to be 683 | * split again into more codepoints */ 684 | nfc_table *conv_entry = nfc_quickcheck(conv1); 685 | if (conv_entry && conv_entry->reason == REASON_MUST_CONVERT_2) { 686 | utfint conv3 = conv2; 687 | unsigned int canon_cls3 = canon_cls2; 688 | conv1 = conv_entry->data1; 689 | conv2 = conv_entry->data2; 690 | canon_cls2 = lookup_canon_cls(conv2); 691 | if (canon_cls2) { 692 | starter = conv1; 693 | vector[0] = (conv2 << 8) | canon_cls2; 694 | vector[1] = (conv3 << 8) | canon_cls3; 695 | vec_size = 2; 696 | } else { 697 | add_utf8char(buff, conv1); 698 | starter = conv2; 699 | vector[0] = (conv3 << 8) | canon_cls3; 700 | vec_size = 1; 701 | } 702 | canon_cls = canon_cls3; 703 | } else { 704 | starter = conv1; 705 | vector[0] = (conv2 << 8) | canon_cls2; 706 | vec_size = 1; 707 | canon_cls = canon_cls2; 708 | } 709 | } else { 710 | add_utf8char(buff, conv1); 711 | starter = conv2; 712 | } 713 | fixedup = 1; 714 | } 715 | } 716 | } else { 717 | /* Accumulate combining marks in vector */ 718 | grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1); 719 | vector[vec_size++] = (ch << 8) | (canon_cls & 0xFF); 720 | } 721 | 722 | s = new_s; 723 | prev_canon_cls = canon_cls; 724 | } 725 | 726 | if (vec_size) 727 | goto process_combining_marks; /* Finish processing trailing combining marks */ 728 | if (starter != -1) 729 | add_utf8char(buff, starter); 730 | 731 | if (vector != onstack) 732 | free(vector); 733 | } 734 | 735 | /* Grapheme cluster support */ 736 | 737 | static int hangul_type (utfint ch) { 738 | /* The first Hangul codepoint is U+1100 */ 739 | if (ch < 0x1100) { 740 | return 0; 741 | } 742 | size_t begin = 0, end = table_size(hangul_table); 743 | 744 | while (begin < end) { 745 | size_t mid = (begin + end) / 2; 746 | if 
(hangul_table[mid].last < ch) 747 | begin = mid + 1; 748 | else if (hangul_table[mid].first > ch) 749 | end = mid; 750 | else 751 | return hangul_table[mid].type; 752 | } 753 | 754 | return 0; 755 | } 756 | 757 | static int indic_conjunct_type (utfint ch) { 758 | /* The first Indic conjunct codepoint is U+0300 */ 759 | if (ch < 0x300) { 760 | return 0; 761 | } 762 | size_t begin = 0, end = table_size(indic_table); 763 | 764 | while (begin < end) { 765 | size_t mid = (begin + end) / 2; 766 | if (indic_table[mid].last < ch) 767 | begin = mid + 1; 768 | else if (indic_table[mid].first > ch) 769 | end = mid; 770 | else 771 | return indic_table[mid].type; 772 | } 773 | 774 | return 0; 775 | } 776 | 777 | #define define_category(cls, name) static int utf8_is##name (utfint ch)\ 778 | { return find_in_range(name##_table, table_size(name##_table), ch); } 779 | #define define_converter(name) static utfint utf8_to##name (utfint ch) \ 780 | { return convert_char(to##name##_table, table_size(to##name##_table), ch); } 781 | utf8_categories(define_category) 782 | utf8_converters(define_converter) 783 | #undef define_category 784 | #undef define_converter 785 | 786 | static int utf8_isgraph (utfint ch) { 787 | if (find_in_range(space_table, table_size(space_table), ch)) 788 | return 0; 789 | if (find_in_range(graph_table, table_size(graph_table), ch)) 790 | return 1; 791 | if (find_in_range(compose_table, table_size(compose_table), ch)) 792 | return 1; 793 | return 0; 794 | } 795 | 796 | static int utf8_isalnum (utfint ch) { 797 | if (find_in_range(alpha_table, table_size(alpha_table), ch)) 798 | return 1; 799 | if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch)) 800 | return 1; 801 | return 0; 802 | } 803 | 804 | static int utf8_width (utfint ch, int ambi_is_single) { 805 | if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch)) 806 | return 2; 807 | if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch)) 808 | return ambi_is_single ? 
1 : 2; 809 | if (find_in_range(compose_table, table_size(compose_table), ch)) 810 | return 0; 811 | if (find_in_range(unprintable_table, table_size(unprintable_table), ch)) 812 | return 0; 813 | return 1; 814 | } 815 | 816 | /* string module compatible interface */ 817 | 818 | static int typeerror (lua_State *L, int idx, const char *tname) 819 | { return luaL_error(L, "%s expected, got %s", tname, luaL_typename(L, idx)); } 820 | 821 | static const char *check_utf8 (lua_State *L, int idx, const char **end) { 822 | size_t len; 823 | const char *s = luaL_checklstring(L, idx, &len); 824 | if (end) *end = s+len; 825 | return s; 826 | } 827 | 828 | static const char *to_utf8 (lua_State *L, int idx, const char **end) { 829 | size_t len; 830 | const char *s = lua_tolstring(L, idx, &len); 831 | if (end) *end = s+len; 832 | return s; 833 | } 834 | 835 | static const char *utf8_safe_decode (lua_State *L, const char *p, utfint *pval) { 836 | p = utf8_decode(p, pval, 0); 837 | if (p == NULL) luaL_error(L, "invalid UTF-8 code"); 838 | return p; 839 | } 840 | 841 | static void add_utf8char (luaL_Buffer *b, utfint ch) { 842 | char buff[UTF8_BUFFSZ]; 843 | size_t n = utf8_encode(buff, ch); 844 | luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n); 845 | } 846 | 847 | static lua_Integer byte_relat (lua_Integer pos, size_t len) { 848 | if (pos >= 0) return pos; 849 | else if (0u - (size_t)pos > len) return 0; 850 | else return (lua_Integer)len + pos + 1; 851 | } 852 | 853 | static int Lutf8_len (lua_State *L) { 854 | size_t len, n; 855 | const char *s = luaL_checklstring(L, 1, &len), *p, *e; 856 | lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); 857 | lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len); 858 | int lax = lua_toboolean(L, 4); 859 | luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, 860 | "initial position out of string"); 861 | luaL_argcheck(L, --pose < (lua_Integer)len, 3, 862 | "final position out of string"); 863 | for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) { 864 | if (lax) 865 | p = utf8_next(p, e); 866 | else { 867 | utfint ch; 868 | const char *np = utf8_decode(p, &ch, !lax); 869 | if (np == NULL || utf8_invalid(ch)) { 870 | lua_pushnil(L); 871 | lua_pushinteger(L, p - s + 1); 872 | return 2; 873 | } 874 | p = np; 875 | } 876 | } 877 | lua_pushinteger(L, n); 878 | return 1; 879 | } 880 | 881 | static int Lutf8_sub (lua_State *L) { 882 | const char *e, *s = check_utf8(L, 1, &e); 883 | lua_Integer posi = luaL_checkinteger(L, 2); 884 | lua_Integer pose = luaL_optinteger(L, 3, -1); 885 | if (utf8_range(s, e, &posi, &pose)) 886 | lua_pushlstring(L, s+posi, pose-posi); 887 | else 888 | lua_pushliteral(L, ""); 889 | return 1; 890 | } 891 | 892 | static int Lutf8_reverse (lua_State *L) { 893 | luaL_Buffer b; 894 | const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e); 895 | (void) ends; 896 | int lax = lua_toboolean(L, 2); 897 | luaL_buffinit(L, &b); 898 | if (lax) { 899 | for (prev = e; s < prev; e = prev) { 900 | prev = utf8_prev(s, prev); 901 | luaL_addlstring(&b, prev, e-prev); 902 | } 903 | } else { 904 | for (prev = e; s < prev; prev = pprev) { 905 | utfint code = 0; 906 | ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code); 907 | assert(ends == prev); 908 | if (utf8_invalid(code)) 909 | return luaL_error(L, "invalid UTF-8 code"); 910 | if (!utf8_iscompose(code)) { 911 | luaL_addlstring(&b, pprev, e-pprev); 912 | e = pprev; 913 | } 914 | } 915 | } 916 | luaL_pushresult(&b); 917 | return 1; 918 | } 919 | 920 | static int Lutf8_byte (lua_State *L) 
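/* Usage sketch from the Lua side (illustrative): mirrors string.byte but yields Unicode
 * code points rather than raw bytes, e.g. utf8.byte("\xE4\xB8\xAD") --> 0x4E2D. */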
{ 921 | size_t n = 0; 922 | const char *e, *s = check_utf8(L, 1, &e); 923 | lua_Integer posi = luaL_optinteger(L, 2, 1); 924 | lua_Integer pose = luaL_optinteger(L, 3, posi); 925 | if (utf8_range(s, e, &posi, &pose)) { 926 | for (e = s + pose, s = s + posi; s < e; ++n) { 927 | utfint ch = 0; 928 | s = utf8_safe_decode(L, s, &ch); 929 | lua_pushinteger(L, ch); 930 | } 931 | } 932 | return CAST(int, n); 933 | } 934 | 935 | static int Lutf8_codepoint (lua_State *L) { 936 | const char *e, *s = check_utf8(L, 1, &e); 937 | size_t len = e-s; 938 | lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); 939 | lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len); 940 | int lax = lua_toboolean(L, 4); 941 | int n; 942 | const char *se; 943 | luaL_argcheck(L, posi >= 1, 2, "out of range"); 944 | luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range"); 945 | if (posi > pose) return 0; /* empty interval; return no values */ 946 | if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ 947 | return luaL_error(L, "string slice too long"); 948 | n = (int)(pose - posi + 1); 949 | luaL_checkstack(L, n, "string slice too long"); 950 | n = 0; /* count the number of returns */ 951 | se = s + pose; /* string end */ 952 | for (n = 0, s += posi - 1; s < se;) { 953 | utfint code = 0; 954 | s = utf8_safe_decode(L, s, &code); 955 | if (!lax && utf8_invalid(code)) 956 | return luaL_error(L, "invalid UTF-8 code"); 957 | lua_pushinteger(L, code); 958 | n++; 959 | } 960 | return n; 961 | } 962 | 963 | static int Lutf8_char (lua_State *L) { 964 | int i, n = lua_gettop(L); /* number of arguments */ 965 | luaL_Buffer b; 966 | luaL_buffinit(L, &b); 967 | for (i = 1; i <= n; ++i) { 968 | lua_Integer code = luaL_checkinteger(L, i); 969 | luaL_argcheck(L, code <= UTF8_MAXCP, i, "value out of range"); 970 | add_utf8char(&b, CAST(utfint, code)); 971 | } 972 | luaL_pushresult(&b); 973 | return 1; 974 | } 975 | 976 | #define bind_converter(name) \ 977 | static int Lutf8_##name (lua_State *L) { \ 978 | int t = lua_type(L, 1); \ 979 | if (t == LUA_TNUMBER) \ 980 | lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \ 981 | else if (t == LUA_TSTRING) { \ 982 | luaL_Buffer b; \ 983 | const char *e, *s = to_utf8(L, 1, &e); \ 984 | luaL_buffinit(L, &b); \ 985 | while (s < e) { \ 986 | utfint ch = 0; \ 987 | s = utf8_safe_decode(L, s, &ch); \ 988 | add_utf8char(&b, utf8_to##name(ch)); \ 989 | } \ 990 | luaL_pushresult(&b); \ 991 | } \ 992 | else return typeerror(L, 1, "number/string"); \ 993 | return 1; \ 994 | } 995 | utf8_converters(bind_converter) 996 | #undef bind_converter 997 | 998 | 999 | /* unicode extra interface */ 1000 | 1001 | static const char *parse_escape (lua_State *L, const char *s, const char *e, int hex, utfint *pch) { 1002 | utfint code = 0; 1003 | int in_bracket = 0; 1004 | if (*s == '{') ++s, in_bracket = 1; 1005 | for (; s < e; ++s) { 1006 | utfint ch = (unsigned char)*s; 1007 | if (ch >= '0' && ch <= '9') ch = ch - '0'; 1008 | else if (hex && ch >= 'A' && ch <= 'F') ch = 10 + (ch - 'A'); 1009 | else if (hex && ch >= 'a' && ch <= 'f') ch = 10 + (ch - 'a'); 1010 | else if (!in_bracket) break; 1011 | else if (ch == '}') { ++s; break; } 1012 | else luaL_error(L, "invalid escape '%c'", ch); 1013 | code *= hex ? 
16 : 10; 1014 | code += ch; 1015 | } 1016 | *pch = code; 1017 | return s; 1018 | } 1019 | 1020 | static int Lutf8_escape (lua_State *L) { 1021 | const char *e, *s = check_utf8(L, 1, &e); 1022 | luaL_Buffer b; 1023 | luaL_buffinit(L, &b); 1024 | while (s < e) { 1025 | utfint ch = 0; 1026 | s = utf8_safe_decode(L, s, &ch); 1027 | if (ch == '%') { 1028 | int hex = 0; 1029 | switch (*s) { 1030 | case '0': case '1': case '2': case '3': 1031 | case '4': case '5': case '6': case '7': 1032 | case '8': case '9': case '{': 1033 | break; 1034 | case 'x': case 'X': hex = 1; /* fall through */ 1035 | case 'u': case 'U': if (s+1 < e) { ++s; break; } 1036 | /* fall through */ 1037 | default: 1038 | s = utf8_safe_decode(L, s, &ch); 1039 | goto next; 1040 | } 1041 | s = parse_escape(L, s, e, hex, &ch); 1042 | } 1043 | next: 1044 | add_utf8char(&b, ch); 1045 | } 1046 | luaL_pushresult(&b); 1047 | return 1; 1048 | } 1049 | 1050 | static int Lutf8_insert (lua_State *L) { 1051 | const char *e, *s = check_utf8(L, 1, &e); 1052 | size_t sublen; 1053 | const char *subs; 1054 | luaL_Buffer b; 1055 | int nargs = 2; 1056 | const char *first = e; 1057 | if (lua_type(L, 2) == LUA_TNUMBER) { 1058 | int idx = (int)lua_tointeger(L, 2); 1059 | if (idx != 0) first = utf8_relat(s, e, idx); 1060 | luaL_argcheck(L, first, 2, "invalid index"); 1061 | ++nargs; 1062 | } 1063 | subs = luaL_checklstring(L, nargs, &sublen); 1064 | luaL_buffinit(L, &b); 1065 | luaL_addlstring(&b, s, first-s); 1066 | luaL_addlstring(&b, subs, sublen); 1067 | luaL_addlstring(&b, first, e-first); 1068 | luaL_pushresult(&b); 1069 | return 1; 1070 | } 1071 | 1072 | static int Lutf8_remove (lua_State *L) { 1073 | const char *e, *s = check_utf8(L, 1, &e); 1074 | lua_Integer posi = luaL_optinteger(L, 2, -1); 1075 | lua_Integer pose = luaL_optinteger(L, 3, -1); 1076 | if (!utf8_range(s, e, &posi, &pose)) 1077 | lua_settop(L, 1); 1078 | else { 1079 | luaL_Buffer b; 1080 | luaL_buffinit(L, &b); 1081 | luaL_addlstring(&b, s, posi); 1082 | luaL_addlstring(&b, s+pose, e-s-pose); 1083 | luaL_pushresult(&b); 1084 | } 1085 | return 1; 1086 | } 1087 | 1088 | static int push_offset (lua_State *L, const char *s, const char *e, lua_Integer offset, lua_Integer idx) { 1089 | utfint ch = 0; 1090 | const char *p; 1091 | if (idx != 0) 1092 | p = utf8_offset(s, e, offset, idx); 1093 | else if (p = s+offset-1, iscont(p)) 1094 | p = utf8_prev(s, p); 1095 | if (p == NULL || p == e) return 0; 1096 | utf8_decode(p, &ch, 0); 1097 | lua_pushinteger(L, p-s+1); 1098 | lua_pushinteger(L, ch); 1099 | return 2; 1100 | } 1101 | 1102 | static int Lutf8_charpos (lua_State *L) { 1103 | const char *e, *s = check_utf8(L, 1, &e); 1104 | lua_Integer offset = 1; 1105 | if (lua_isnoneornil(L, 3)) { 1106 | lua_Integer idx = luaL_optinteger(L, 2, 0); 1107 | if (idx > 0) --idx; 1108 | else if (idx < 0) offset = e-s+1; 1109 | return push_offset(L, s, e, offset, idx); 1110 | } 1111 | offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); 1112 | if (offset < 1) offset = 1; 1113 | return push_offset(L, s, e, offset, luaL_checkinteger(L, 3)); 1114 | } 1115 | 1116 | static int Lutf8_offset (lua_State *L) { 1117 | size_t len; 1118 | const char *s = luaL_checklstring(L, 1, &len); 1119 | lua_Integer n = luaL_checkinteger(L, 2); 1120 | lua_Integer posi = (n >= 0) ? 
1 : len + 1; 1121 | posi = byte_relat(luaL_optinteger(L, 3, posi), len); 1122 | luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, 1123 | "position out of range"); 1124 | if (n == 0) { 1125 | /* find beginning of current byte sequence */ 1126 | while (posi > 0 && iscont(s + posi)) posi--; 1127 | } else { 1128 | if (iscont(s + posi)) 1129 | return luaL_error(L, "initial position is a continuation byte"); 1130 | if (n < 0) { 1131 | while (n < 0 && posi > 0) { /* move back */ 1132 | do { /* find beginning of previous character */ 1133 | posi--; 1134 | } while (posi > 0 && iscont(s + posi)); 1135 | n++; 1136 | } 1137 | } else { 1138 | n--; /* do not move for 1st character */ 1139 | while (n > 0 && posi < (lua_Integer)len) { 1140 | do { /* find beginning of next character */ 1141 | posi++; 1142 | } while (iscont(s + posi)); /* (cannot pass final '\0') */ 1143 | n--; 1144 | } 1145 | } 1146 | } 1147 | if (n == 0) /* did it find given character? */ 1148 | lua_pushinteger(L, posi + 1); 1149 | else /* no such character */ 1150 | lua_pushnil(L); 1151 | return 1; 1152 | } 1153 | 1154 | static int Lutf8_next (lua_State *L) { 1155 | const char *e, *s = check_utf8(L, 1, &e); 1156 | lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); 1157 | lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2)); 1158 | return push_offset(L, s, e, offset, idx); 1159 | } 1160 | 1161 | static int iter_aux (lua_State *L, int strict) { 1162 | const char *e, *s = check_utf8(L, 1, &e); 1163 | int n = CAST(int, lua_tointeger(L, 2)); 1164 | const char *p = n <= 0 ? s : utf8_next(s+n-1, e); 1165 | if (p < e) { 1166 | utfint code = 0; 1167 | utf8_safe_decode(L, p, &code); 1168 | if (strict && utf8_invalid(code)) 1169 | return luaL_error(L, "invalid UTF-8 code"); 1170 | lua_pushinteger(L, p-s+1); 1171 | lua_pushinteger(L, code); 1172 | return 2; 1173 | } 1174 | return 0; /* no more codepoints */ 1175 | } 1176 | 1177 | static int iter_auxstrict (lua_State *L) { return iter_aux(L, 1); } 1178 | static int iter_auxlax (lua_State *L) { return iter_aux(L, 0); } 1179 | 1180 | static int Lutf8_codes (lua_State *L) { 1181 | int lax = lua_toboolean(L, 2); 1182 | luaL_checkstring(L, 1); 1183 | lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); 1184 | lua_pushvalue(L, 1); 1185 | lua_pushinteger(L, 0); 1186 | return 3; 1187 | } 1188 | 1189 | static int Lutf8_width (lua_State *L) { 1190 | int t = lua_type(L, 1); 1191 | int ambi_is_single = !lua_toboolean(L, 2); 1192 | int default_width = CAST(int, luaL_optinteger(L, 3, 0)); 1193 | if (t == LUA_TNUMBER) { 1194 | size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single); 1195 | if (chwidth == 0) chwidth = default_width; 1196 | lua_pushinteger(L, (lua_Integer)chwidth); 1197 | } else if (t != LUA_TSTRING) 1198 | return typeerror(L, 1, "number/string"); 1199 | else { 1200 | const char *e, *s = to_utf8(L, 1, &e); 1201 | int width = 0; 1202 | while (s < e) { 1203 | utfint ch = 0; 1204 | int chwidth; 1205 | s = utf8_safe_decode(L, s, &ch); 1206 | chwidth = utf8_width(ch, ambi_is_single); 1207 | width += chwidth == 0 ? 
default_width : chwidth; 1208 | } 1209 | lua_pushinteger(L, (lua_Integer)width); 1210 | } 1211 | return 1; 1212 | } 1213 | 1214 | static int Lutf8_widthindex (lua_State *L) { 1215 | const char *e, *s = check_utf8(L, 1, &e); 1216 | int width = CAST(int, luaL_checkinteger(L, 2)); 1217 | int ambi_is_single = !lua_toboolean(L, 3); 1218 | int default_width = CAST(int, luaL_optinteger(L, 4, 0)); 1219 | size_t idx = 1; 1220 | while (s < e) { 1221 | utfint ch = 0; 1222 | size_t chwidth; 1223 | s = utf8_safe_decode(L, s, &ch); 1224 | chwidth = utf8_width(ch, ambi_is_single); 1225 | if (chwidth == 0) chwidth = default_width; 1226 | width -= CAST(int, chwidth); 1227 | if (width <= 0) { 1228 | lua_pushinteger(L, idx); 1229 | lua_pushinteger(L, width + chwidth); 1230 | lua_pushinteger(L, chwidth); 1231 | return 3; 1232 | } 1233 | ++idx; 1234 | } 1235 | lua_pushinteger(L, (lua_Integer)idx); 1236 | return 1; 1237 | } 1238 | 1239 | static int Lutf8_ncasecmp (lua_State *L) { 1240 | const char *e1, *s1 = check_utf8(L, 1, &e1); 1241 | const char *e2, *s2 = check_utf8(L, 2, &e2); 1242 | while (s1 < e1 || s2 < e2) { 1243 | utfint ch1 = 0, ch2 = 0; 1244 | if (s1 == e1) 1245 | ch2 = 1; 1246 | else if (s2 == e2) 1247 | ch1 = 1; 1248 | else { 1249 | s1 = utf8_safe_decode(L, s1, &ch1); 1250 | s2 = utf8_safe_decode(L, s2, &ch2); 1251 | ch1 = utf8_tofold(ch1); 1252 | ch2 = utf8_tofold(ch2); 1253 | } 1254 | if (ch1 != ch2) { 1255 | lua_pushinteger(L, ch1 > ch2 ? 1 : -1); 1256 | return 1; 1257 | } 1258 | } 1259 | lua_pushinteger(L, 0); 1260 | return 1; 1261 | } 1262 | 1263 | 1264 | /* utf8 pattern matching implement */ 1265 | 1266 | #ifndef LUA_MAXCAPTURES 1267 | # define LUA_MAXCAPTURES 32 1268 | #endif /* LUA_MAXCAPTURES */ 1269 | 1270 | #define CAP_UNFINISHED (-1) 1271 | #define CAP_POSITION (-2) 1272 | 1273 | 1274 | typedef struct MatchState { 1275 | int matchdepth; /* control for recursive depth (to avoid C stack overflow) */ 1276 | const char *src_init; /* init of source string */ 1277 | const char *src_end; /* end ('\0') of source string */ 1278 | const char *p_end; /* end ('\0') of pattern */ 1279 | lua_State *L; 1280 | int level; /* total number of captures (finished or unfinished) */ 1281 | struct { 1282 | const char *init; 1283 | ptrdiff_t len; 1284 | } capture[LUA_MAXCAPTURES]; 1285 | } MatchState; 1286 | 1287 | /* recursive function */ 1288 | static const char *match (MatchState *ms, const char *s, const char *p); 1289 | 1290 | /* maximum recursion depth for 'match' */ 1291 | #if !defined(MAXCCALLS) 1292 | #define MAXCCALLS 200 1293 | #endif 1294 | 1295 | #define L_ESC '%' 1296 | #define SPECIALS "^$*+?.([%-" 1297 | 1298 | static int check_capture (MatchState *ms, int l) { 1299 | l -= '1'; 1300 | if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) 1301 | return luaL_error(ms->L, "invalid capture index %%%d", l + 1); 1302 | return l; 1303 | } 1304 | 1305 | static int capture_to_close (MatchState *ms) { 1306 | int level = ms->level; 1307 | while (--level >= 0) 1308 | if (ms->capture[level].len == CAP_UNFINISHED) return level; 1309 | return luaL_error(ms->L, "invalid pattern capture"); 1310 | } 1311 | 1312 | static const char *classend (MatchState *ms, const char *p) { 1313 | utfint ch = 0; 1314 | p = utf8_safe_decode(ms->L, p, &ch); 1315 | switch (ch) { 1316 | case L_ESC: { 1317 | if (p == ms->p_end) 1318 | luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")"); 1319 | return utf8_next(p, ms->p_end); 1320 | } 1321 | case '[': { 1322 | if (*p == '^') p++; 1323 | do { /* look for 
a `]' */ 1324 | if (p == ms->p_end) 1325 | luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")"); 1326 | if (*(p++) == L_ESC && p < ms->p_end) 1327 | p++; /* skip escapes (e.g. `%]') */ 1328 | } while (*p != ']'); 1329 | return p+1; 1330 | } 1331 | default: { 1332 | return p; 1333 | } 1334 | } 1335 | } 1336 | 1337 | static int match_class (utfint c, utfint cl) { 1338 | int res; 1339 | switch (utf8_tolower(cl)) { 1340 | #define X(cls, name) case cls: res = utf8_is##name(c); break; 1341 | utf8_categories(X) 1342 | #undef X 1343 | case 'g' : res = utf8_isgraph(c); break; 1344 | case 'w' : res = utf8_isalnum(c); break; 1345 | case 'z' : res = (c == 0); break; /* deprecated option */ 1346 | default: return (cl == c); 1347 | } 1348 | return (utf8_islower(cl) ? res : !res); 1349 | } 1350 | 1351 | static int matchbracketclass (MatchState *ms, utfint c, const char *p, const char *ec) { 1352 | int sig = 1; 1353 | assert(*p == '['); 1354 | if (*++p == '^') { 1355 | sig = 0; 1356 | p++; /* skip the `^' */ 1357 | } 1358 | while (p < ec) { 1359 | utfint ch = 0; 1360 | p = utf8_safe_decode(ms->L, p, &ch); 1361 | if (ch == L_ESC) { 1362 | p = utf8_safe_decode(ms->L, p, &ch); 1363 | if (match_class(c, ch)) 1364 | return sig; 1365 | } else { 1366 | utfint next = 0; 1367 | const char *np = utf8_safe_decode(ms->L, p, &next); 1368 | if (next == '-' && np < ec) { 1369 | p = utf8_safe_decode(ms->L, np, &next); 1370 | if (ch <= c && c <= next) 1371 | return sig; 1372 | } 1373 | else if (ch == c) return sig; 1374 | } 1375 | } 1376 | return !sig; 1377 | } 1378 | 1379 | static int singlematch (MatchState *ms, const char *s, const char *p, const char *ep) { 1380 | if (s >= ms->src_end) 1381 | return 0; 1382 | else { 1383 | utfint ch=0, pch=0; 1384 | utf8_safe_decode(ms->L, s, &ch); 1385 | p = utf8_safe_decode(ms->L, p, &pch); 1386 | switch (pch) { 1387 | case '.': return 1; /* matches any char */ 1388 | case L_ESC: utf8_safe_decode(ms->L, p, &pch); 1389 | return match_class(ch, pch); 1390 | case '[': return matchbracketclass(ms, ch, p-1, ep-1); 1391 | default: return pch == ch; 1392 | } 1393 | } 1394 | } 1395 | 1396 | static const char *matchbalance (MatchState *ms, const char *s, const char **p) { 1397 | utfint ch=0, begin=0, end=0; 1398 | *p = utf8_safe_decode(ms->L, *p, &begin); 1399 | if (*p >= ms->p_end) 1400 | luaL_error(ms->L, "malformed pattern " 1401 | "(missing arguments to " LUA_QL("%%b") ")"); 1402 | *p = utf8_safe_decode(ms->L, *p, &end); 1403 | s = utf8_safe_decode(ms->L, s, &ch); 1404 | if (ch != begin) return NULL; 1405 | else { 1406 | int cont = 1; 1407 | while (s < ms->src_end) { 1408 | s = utf8_safe_decode(ms->L, s, &ch); 1409 | if (ch == end) { 1410 | if (--cont == 0) return s; 1411 | } 1412 | else if (ch == begin) cont++; 1413 | } 1414 | } 1415 | return NULL; /* string ends out of balance */ 1416 | } 1417 | 1418 | static const char *max_expand (MatchState *ms, const char *s, const char *p, const char *ep) { 1419 | const char *m = s; /* matched end of single match p */ 1420 | while (singlematch(ms, m, p, ep)) 1421 | m = utf8_next(m, ms->src_end); 1422 | /* keeps trying to match with the maximum repetitions */ 1423 | while (s <= m) { 1424 | const char *res = match(ms, m, ep+1); 1425 | if (res) return res; 1426 | /* else didn't match; reduce 1 repetition to try again */ 1427 | if (s == m) break; 1428 | m = utf8_prev(s, m); 1429 | } 1430 | return NULL; 1431 | } 1432 | 1433 | static const char *min_expand (MatchState *ms, const char *s, const char *p, const char *ep) { 1434 | for (;;) { 
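    /* Illustrative note: this is the lazy counterpart of max_expand above; the rest of
     * the pattern is tried first and one more repetition is consumed only on failure,
     * so the shortest match wins. E.g. utf8.match("abcb", ".-b") gives "ab" while
     * ".*b" gives "abcb" (assuming the usual Lua pattern semantics carried over here). */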
1435 | const char *res = match(ms, s, ep+1); 1436 | if (res != NULL) 1437 | return res; 1438 | else if (singlematch(ms, s, p, ep)) 1439 | s = utf8_next(s, ms->src_end); /* try with one more repetition */ 1440 | else return NULL; 1441 | } 1442 | } 1443 | 1444 | static const char *start_capture (MatchState *ms, const char *s, const char *p, int what) { 1445 | const char *res; 1446 | int level = ms->level; 1447 | if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures"); 1448 | ms->capture[level].init = s; 1449 | ms->capture[level].len = what; 1450 | ms->level = level+1; 1451 | if ((res=match(ms, s, p)) == NULL) /* match failed? */ 1452 | ms->level--; /* undo capture */ 1453 | return res; 1454 | } 1455 | 1456 | static const char *end_capture (MatchState *ms, const char *s, const char *p) { 1457 | int l = capture_to_close(ms); 1458 | const char *res; 1459 | ms->capture[l].len = s - ms->capture[l].init; /* close capture */ 1460 | if ((res = match(ms, s, p)) == NULL) /* match failed? */ 1461 | ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ 1462 | return res; 1463 | } 1464 | 1465 | static const char *match_capture (MatchState *ms, const char *s, int l) { 1466 | size_t len; 1467 | l = check_capture(ms, l); 1468 | len = ms->capture[l].len; 1469 | if ((size_t)(ms->src_end-s) >= len && 1470 | memcmp(ms->capture[l].init, s, len) == 0) 1471 | return s+len; 1472 | else return NULL; 1473 | } 1474 | 1475 | static const char *match (MatchState *ms, const char *s, const char *p) { 1476 | if (ms->matchdepth-- == 0) 1477 | luaL_error(ms->L, "pattern too complex"); 1478 | init: /* using goto's to optimize tail recursion */ 1479 | if (p != ms->p_end) { /* end of pattern? */ 1480 | utfint ch = 0; 1481 | utf8_safe_decode(ms->L, p, &ch); 1482 | switch (ch) { 1483 | case '(': { /* start capture */ 1484 | if (*(p + 1) == ')') /* position capture? */ 1485 | s = start_capture(ms, s, p + 2, CAP_POSITION); 1486 | else 1487 | s = start_capture(ms, s, p + 1, CAP_UNFINISHED); 1488 | break; 1489 | } 1490 | case ')': { /* end capture */ 1491 | s = end_capture(ms, s, p + 1); 1492 | break; 1493 | } 1494 | case '$': { 1495 | if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ 1496 | goto dflt; /* no; go to default */ 1497 | s = (s == ms->src_end) ? s : NULL; /* check end of string */ 1498 | break; 1499 | } 1500 | case L_ESC: { /* escaped sequence not in the format class[*+?-]? */ 1501 | const char *prev_p = p; 1502 | p = utf8_safe_decode(ms->L, p+1, &ch); 1503 | switch (ch) { 1504 | case 'b': { /* balanced string? */ 1505 | s = matchbalance(ms, s, &p); 1506 | if (s != NULL) 1507 | goto init; /* return match(ms, s, p + 4); */ 1508 | /* else fail (s == NULL) */ 1509 | break; 1510 | } 1511 | case 'f': { /* frontier? */ 1512 | const char *ep; utfint previous = 0, current = 0; 1513 | if (*p != '[') 1514 | luaL_error(ms->L, "missing " LUA_QL("[") " after " 1515 | LUA_QL("%%f") " in pattern"); 1516 | ep = classend(ms, p); /* points to what is next */ 1517 | if (s != ms->src_init) 1518 | utf8_decode(utf8_prev(ms->src_init, s), &previous, 0); 1519 | if (s != ms->src_end) 1520 | utf8_decode(s, ¤t, 0); 1521 | if (!matchbracketclass(ms, previous, p, ep - 1) && 1522 | matchbracketclass(ms, current, p, ep - 1)) { 1523 | p = ep; goto init; /* return match(ms, s, ep); */ 1524 | } 1525 | s = NULL; /* match failed */ 1526 | break; 1527 | } 1528 | case '0': case '1': case '2': case '3': 1529 | case '4': case '5': case '6': case '7': 1530 | case '8': case '9': { /* capture results (%0-%9)? 
*/ 1531 | s = match_capture(ms, s, ch); 1532 | if (s != NULL) goto init; /* return match(ms, s, p + 2) */ 1533 | break; 1534 | } 1535 | default: p = prev_p; goto dflt; 1536 | } 1537 | break; 1538 | } 1539 | default: dflt: { /* pattern class plus optional suffix */ 1540 | const char *ep = classend(ms, p); /* points to optional suffix */ 1541 | /* does not match at least once? */ 1542 | if (!singlematch(ms, s, p, ep)) { 1543 | if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */ 1544 | p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ 1545 | } else /* '+' or no suffix */ 1546 | s = NULL; /* fail */ 1547 | } else { /* matched once */ 1548 | const char *next_s = utf8_next(s, ms->src_end); 1549 | switch (*ep) { /* handle optional suffix */ 1550 | case '?': { /* optional */ 1551 | const char *res; 1552 | const char *next_ep = utf8_next(ep, ms->p_end); 1553 | if ((res = match(ms, next_s, next_ep)) != NULL) 1554 | s = res; 1555 | else { 1556 | p = next_ep; goto init; /* else return match(ms, s, ep + 1); */ 1557 | } 1558 | break; 1559 | } 1560 | case '+': /* 1 or more repetitions */ 1561 | s = next_s; /* 1 match already done */ 1562 | /* fall through */ 1563 | case '*': /* 0 or more repetitions */ 1564 | s = max_expand(ms, s, p, ep); 1565 | break; 1566 | case '-': /* 0 or more repetitions (minimum) */ 1567 | s = min_expand(ms, s, p, ep); 1568 | break; 1569 | default: /* no suffix */ 1570 | s = next_s; p = ep; goto init; /* return match(ms, s + 1, ep); */ 1571 | } 1572 | } 1573 | break; 1574 | } 1575 | } 1576 | } 1577 | ms->matchdepth++; 1578 | return s; 1579 | } 1580 | 1581 | static const char *lmemfind (const char *s1, size_t l1, const char *s2, size_t l2) { 1582 | if (l2 == 0) return s1; /* empty strings are everywhere */ 1583 | else if (l2 > l1) return NULL; /* avoids a negative `l1' */ 1584 | else { 1585 | const char *init; /* to search for a `*s2' inside `s1' */ 1586 | l2--; /* 1st char will be checked by `memchr' */ 1587 | l1 = l1-l2; /* `s2' cannot be found after that */ 1588 | while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) { 1589 | init++; /* 1st char is already checked */ 1590 | if (memcmp(init, s2+1, l2) == 0) 1591 | return init-1; 1592 | else { /* correct `l1' and `s1' to try again */ 1593 | l1 -= init-s1; 1594 | s1 = init; 1595 | } 1596 | } 1597 | return NULL; /* not found */ 1598 | } 1599 | } 1600 | 1601 | static int get_index (const char *p, const char *s, const char *e) { 1602 | int idx; 1603 | for (idx = 0; s < e && s < p; ++idx) 1604 | s = utf8_next(s, e); 1605 | return s == p ? idx : idx - 1; 1606 | } 1607 | 1608 | static void push_onecapture (MatchState *ms, int i, const char *s, const char *e) { 1609 | if (i >= ms->level) { 1610 | if (i == 0) /* ms->level == 0, too */ 1611 | lua_pushlstring(ms->L, s, e - s); /* add whole match */ 1612 | else 1613 | luaL_error(ms->L, "invalid capture index"); 1614 | } else { 1615 | ptrdiff_t l = ms->capture[i].len; 1616 | if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture"); 1617 | if (l == CAP_POSITION) { 1618 | int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end); 1619 | lua_pushinteger(ms->L, idx+1); 1620 | } else 1621 | lua_pushlstring(ms->L, ms->capture[i].init, l); 1622 | } 1623 | } 1624 | 1625 | static int push_captures (MatchState *ms, const char *s, const char *e) { 1626 | int i; 1627 | int nlevels = (ms->level == 0 && s) ? 
1 : ms->level; 1628 | luaL_checkstack(ms->L, nlevels, "too many captures"); 1629 | for (i = 0; i < nlevels; i++) 1630 | push_onecapture(ms, i, s, e); 1631 | return nlevels; /* number of strings pushed */ 1632 | } 1633 | 1634 | /* check whether pattern has no special characters */ 1635 | static int nospecials (const char *p, const char * ep) { 1636 | while (p < ep) { 1637 | if (strpbrk(p, SPECIALS)) 1638 | return 0; /* pattern has a special character */ 1639 | p += strlen(p) + 1; /* may have more after \0 */ 1640 | } 1641 | return 1; /* no special chars found */ 1642 | } 1643 | 1644 | 1645 | /* utf8 pattern matching interface */ 1646 | 1647 | static int find_aux (lua_State *L, int find) { 1648 | const char *es, *s = check_utf8(L, 1, &es); 1649 | const char *ep, *p = check_utf8(L, 2, &ep); 1650 | lua_Integer idx = luaL_optinteger(L, 3, 1); 1651 | const char *init; 1652 | if (!idx) idx = 1; 1653 | init = utf8_relat(s, es, CAST(int, idx)); 1654 | if (init == NULL) { 1655 | if (idx > 0) { 1656 | lua_pushnil(L); /* cannot find anything */ 1657 | return 1; 1658 | } 1659 | init = s; 1660 | } 1661 | /* explicit request or no special characters? */ 1662 | if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) { 1663 | /* do a plain search */ 1664 | const char *s2 = lmemfind(init, es-init, p, ep-p); 1665 | if (s2) { 1666 | const char *e2 = s2 + (ep - p); 1667 | if (iscont(e2)) e2 = utf8_next(e2, es); 1668 | lua_pushinteger(L, idx = get_index(s2, s, es) + 1); 1669 | lua_pushinteger(L, idx + get_index(e2, s2, es) - 1); 1670 | return 2; 1671 | } 1672 | } else { 1673 | MatchState ms; 1674 | int anchor = (*p == '^'); 1675 | if (anchor) p++; /* skip anchor character */ 1676 | if (idx < 0) idx += utf8_length(s, es)+1; /* TODO not very good */ 1677 | ms.L = L; 1678 | ms.matchdepth = MAXCCALLS; 1679 | ms.src_init = s; 1680 | ms.src_end = es; 1681 | ms.p_end = ep; 1682 | do { 1683 | const char *res; 1684 | ms.level = 0; 1685 | assert(ms.matchdepth == MAXCCALLS); 1686 | if ((res=match(&ms, init, p)) != NULL) { 1687 | if (find) { 1688 | lua_pushinteger(L, idx); /* start */ 1689 | lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */ 1690 | return push_captures(&ms, NULL, 0) + 2; 1691 | } else 1692 | return push_captures(&ms, init, res); 1693 | } 1694 | if (init == es) break; 1695 | idx += 1; 1696 | init = utf8_next(init, es); 1697 | } while (init <= es && !anchor); 1698 | } 1699 | lua_pushnil(L); /* not found */ 1700 | return 1; 1701 | } 1702 | 1703 | static int Lutf8_find (lua_State *L) { return find_aux(L, 1); } 1704 | static int Lutf8_match (lua_State *L) { return find_aux(L, 0); } 1705 | 1706 | static int gmatch_aux (lua_State *L) { 1707 | MatchState ms; 1708 | const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es); 1709 | const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep); 1710 | const char *src; 1711 | ms.L = L; 1712 | ms.matchdepth = MAXCCALLS; 1713 | ms.src_init = s; 1714 | ms.src_end = es; 1715 | ms.p_end = ep; 1716 | for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); 1717 | src <= ms.src_end; 1718 | src = utf8_next(src, ms.src_end)) { 1719 | const char *e; 1720 | ms.level = 0; 1721 | assert(ms.matchdepth == MAXCCALLS); 1722 | if ((e = match(&ms, src, p)) != NULL) { 1723 | lua_Integer newstart = e-s; 1724 | if (e == src) newstart++; /* empty match? 
go at least one position */ 1725 | lua_pushinteger(L, newstart); 1726 | lua_replace(L, lua_upvalueindex(3)); 1727 | return push_captures(&ms, src, e); 1728 | } 1729 | if (src == ms.src_end) break; 1730 | } 1731 | return 0; /* not found */ 1732 | } 1733 | 1734 | static int Lutf8_gmatch (lua_State *L) { 1735 | luaL_checkstring(L, 1); 1736 | luaL_checkstring(L, 2); 1737 | lua_settop(L, 2); 1738 | lua_pushinteger(L, 0); 1739 | lua_pushcclosure(L, gmatch_aux, 3); 1740 | return 1; 1741 | } 1742 | 1743 | static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, const char *e) { 1744 | const char *new_end, *news = to_utf8(ms->L, 3, &new_end); 1745 | while (news < new_end) { 1746 | utfint ch = 0; 1747 | news = utf8_safe_decode(ms->L, news, &ch); 1748 | if (ch != L_ESC) 1749 | add_utf8char(b, ch); 1750 | else { 1751 | news = utf8_safe_decode(ms->L, news, &ch); /* skip ESC */ 1752 | if (!utf8_isdigit(ch)) { 1753 | if (ch != L_ESC) 1754 | luaL_error(ms->L, "invalid use of " LUA_QL("%c") 1755 | " in replacement string", L_ESC); 1756 | add_utf8char(b, ch); 1757 | } else if (ch == '0') 1758 | luaL_addlstring(b, s, e-s); 1759 | else { 1760 | push_onecapture(ms, ch-'1', s, e); 1761 | luaL_addvalue(b); /* add capture to accumulated result */ 1762 | } 1763 | } 1764 | } 1765 | } 1766 | 1767 | static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, const char *e, int tr) { 1768 | lua_State *L = ms->L; 1769 | switch (tr) { 1770 | case LUA_TFUNCTION: { 1771 | int n; 1772 | lua_pushvalue(L, 3); 1773 | n = push_captures(ms, s, e); 1774 | lua_call(L, n, 1); 1775 | break; 1776 | } 1777 | case LUA_TTABLE: { 1778 | push_onecapture(ms, 0, s, e); 1779 | lua_gettable(L, 3); 1780 | break; 1781 | } 1782 | default: { /* LUA_TNUMBER or LUA_TSTRING */ 1783 | add_s(ms, b, s, e); 1784 | return; 1785 | } 1786 | } 1787 | if (!lua_toboolean(L, -1)) { /* nil or false? */ 1788 | lua_pop(L, 1); 1789 | lua_pushlstring(L, s, e - s); /* keep original text */ 1790 | } else if (!lua_isstring(L, -1)) 1791 | luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1)); 1792 | luaL_addvalue(b); /* add result to accumulator */ 1793 | } 1794 | 1795 | static int Lutf8_gsub (lua_State *L) { 1796 | const char *es, *s = check_utf8(L, 1, &es); 1797 | const char *ep, *p = check_utf8(L, 2, &ep); 1798 | int tr = lua_type(L, 3); 1799 | lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1); 1800 | int anchor = (*p == '^'); 1801 | lua_Integer n = 0; 1802 | MatchState ms; 1803 | luaL_Buffer b; 1804 | luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || 1805 | tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, 1806 | "string/function/table expected"); 1807 | luaL_buffinit(L, &b); 1808 | if (anchor) p++; /* skip anchor character */ 1809 | ms.L = L; 1810 | ms.matchdepth = MAXCCALLS; 1811 | ms.src_init = s; 1812 | ms.src_end = es; 1813 | ms.p_end = ep; 1814 | while (n < max_s) { 1815 | const char *e; 1816 | ms.level = 0; 1817 | assert(ms.matchdepth == MAXCCALLS); 1818 | e = match(&ms, s, p); 1819 | if (e) { 1820 | n++; 1821 | add_value(&ms, &b, s, e, tr); 1822 | } 1823 | if (e && e > s) /* non empty match? 
*/ 1824 | s = e; /* skip it */ 1825 | else if (s < es) { 1826 | utfint ch = 0; 1827 | s = utf8_safe_decode(L, s, &ch); 1828 | add_utf8char(&b, ch); 1829 | } else break; 1830 | if (anchor) break; 1831 | } 1832 | luaL_addlstring(&b, s, es-s); 1833 | luaL_pushresult(&b); 1834 | lua_pushinteger(L, n); /* number of substitutions */ 1835 | return 2; 1836 | } 1837 | 1838 | static int Lutf8_isvalid(lua_State *L) { 1839 | const char *e, *s = check_utf8(L, 1, &e); 1840 | const char *invalid = utf8_invalid_offset(s, e); 1841 | lua_pushboolean(L, invalid == NULL); 1842 | return 1; 1843 | } 1844 | 1845 | static int Lutf8_invalidoffset(lua_State *L) { 1846 | const char *e, *s = check_utf8(L, 1, &e); 1847 | const char *orig_s = s; 1848 | int offset = luaL_optinteger(L, 2, 0); 1849 | if (offset > 1) { 1850 | offset--; 1851 | s += offset; 1852 | if (s >= e) { 1853 | lua_pushnil(L); 1854 | return 1; 1855 | } 1856 | } else if (offset < 0 && s - e < offset) { 1857 | s = e + offset; 1858 | } 1859 | const char *invalid = utf8_invalid_offset(s, e); 1860 | if (invalid == NULL) { 1861 | lua_pushnil(L); 1862 | } else { 1863 | lua_pushinteger(L, invalid - orig_s + 1); 1864 | } 1865 | return 1; 1866 | } 1867 | 1868 | static int Lutf8_clean(lua_State *L) { 1869 | const char *e, *s = check_utf8(L, 1, &e); 1870 | 1871 | /* Default replacement string is REPLACEMENT CHARACTER U+FFFD */ 1872 | size_t repl_len; 1873 | const char *r = luaL_optlstring(L, 2, "\xEF\xBF\xBD", &repl_len); 1874 | 1875 | if (lua_gettop(L) > 1) { 1876 | /* Check if replacement string is valid UTF-8 or not */ 1877 | if (utf8_invalid_offset(r, r + repl_len) != NULL) { 1878 | lua_pushstring(L, "replacement string must be valid UTF-8"); 1879 | lua_error(L); 1880 | } 1881 | } 1882 | 1883 | const char *invalid = utf8_invalid_offset(s, e); 1884 | if (invalid == NULL) { 1885 | lua_settop(L, 1); /* Return input string without modification */ 1886 | lua_pushboolean(L, 1); /* String was clean already */ 1887 | return 2; 1888 | } 1889 | 1890 | luaL_Buffer buff; 1891 | luaL_buffinit(L, &buff); 1892 | 1893 | while (1) { 1894 | /* Invariant: 's' points to first GOOD byte not in output buffer, 1895 | * 'invalid' points to first BAD byte after that */ 1896 | luaL_addlstring(&buff, s, invalid - s); 1897 | luaL_addlstring(&buff, r, repl_len); 1898 | /* We do not replace every bad byte with the replacement character, 1899 | * but rather a contiguous sequence of bad bytes 1900 | * Restore the invariant by stepping forward until we find at least 1901 | * one good byte */ 1902 | s = invalid; 1903 | while (s == invalid) { 1904 | s++; 1905 | invalid = utf8_invalid_offset(s, e); 1906 | } 1907 | if (invalid == NULL) { 1908 | luaL_addlstring(&buff, s, e - s); 1909 | luaL_pushresult(&buff); 1910 | lua_pushboolean(L, 0); /* String was not clean */ 1911 | return 2; 1912 | } 1913 | } 1914 | } 1915 | 1916 | static int Lutf8_isnfc(lua_State *L) { 1917 | const char *e, *s = check_utf8(L, 1, &e); 1918 | utfint starter = 0, ch; 1919 | unsigned int prev_canon_cls = 0; 1920 | 1921 | while (s < e) { 1922 | s = utf8_decode(s, &ch, 1); 1923 | if (s == NULL) { 1924 | lua_pushstring(L, "string is not valid UTF-8"); 1925 | lua_error(L); 1926 | } 1927 | if (ch < 0x300) { 1928 | starter = ch; /* Fast path */ 1929 | prev_canon_cls = 0; 1930 | continue; 1931 | } 1932 | 1933 | unsigned int canon_cls = lookup_canon_cls(ch); 1934 | if (canon_cls && canon_cls < prev_canon_cls) { 1935 | /* Combining marks are out of order; this string is not NFC */ 1936 | lua_pushboolean(L, 0); /* Return false */ 1937 
| return 1; 1938 | } 1939 | 1940 | nfc_table *entry = nfc_quickcheck(ch); 1941 | if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) { 1942 | lua_pushboolean(L, 0); /* Return false */ 1943 | return 1; 1944 | } 1945 | 1946 | prev_canon_cls = canon_cls; 1947 | if (!canon_cls) 1948 | starter = ch; 1949 | } 1950 | 1951 | lua_pushboolean(L, 1); /* Return true */ 1952 | return 1; 1953 | } 1954 | 1955 | static int Lutf8_normalize_nfc(lua_State *L) { 1956 | const char *e, *s = check_utf8(L, 1, &e), *p = s, *starter_p = s; 1957 | utfint starter = 0, ch; 1958 | unsigned int prev_canon_cls = 0; 1959 | 1960 | /* First scan to see if we can find any problems... if not, we may just return the 1961 | * input string unchanged */ 1962 | while (p < e) { 1963 | const char *new_p = utf8_decode(p, &ch, 1); 1964 | if (new_p == NULL) { 1965 | lua_pushstring(L, "string is not valid UTF-8"); 1966 | lua_error(L); 1967 | } 1968 | 1969 | unsigned int canon_cls = lookup_canon_cls(ch); 1970 | if (canon_cls && canon_cls < prev_canon_cls) { 1971 | goto build_string; /* Combining marks are out of order; this string is not NFC */ 1972 | } 1973 | 1974 | nfc_table *entry = nfc_quickcheck(ch); 1975 | if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) { 1976 | goto build_string; 1977 | } 1978 | 1979 | prev_canon_cls = canon_cls; 1980 | if (!canon_cls) { 1981 | starter = ch; 1982 | starter_p = p; 1983 | } 1984 | p = new_p; 1985 | } 1986 | 1987 | lua_settop(L, 1); /* Return input string without modification */ 1988 | lua_pushboolean(L, 1); /* String was in normal form already, so 2nd return value is 'true' */ 1989 | return 2; 1990 | 1991 | build_string: ; 1992 | /* We will need to build a new string, this one is not NFC */ 1993 | luaL_Buffer buff; 1994 | luaL_buffinit(L, &buff); 1995 | luaL_addlstring(&buff, s, starter_p - s); 1996 | 1997 | string_to_nfc(L, &buff, starter_p, e); 1998 | 1999 | luaL_pushresult(&buff); 2000 | lua_pushboolean(L, 0); 2001 | return 2; 2002 | } 2003 | 2004 | static int iterate_grapheme_indices(lua_State *L) { 2005 | const char *s = luaL_checkstring(L, lua_upvalueindex(1)); 2006 | lua_Integer pos = luaL_checkinteger(L, lua_upvalueindex(2)); 2007 | lua_Integer end = luaL_checkinteger(L, lua_upvalueindex(3)); 2008 | 2009 | if (pos > end) { 2010 | lua_pushnil(L); 2011 | return 1; 2012 | } 2013 | const char *e = s + end; 2014 | 2015 | utfint ch, next_ch; 2016 | const char *p = utf8_safe_decode(L, s + pos - 1, &ch); 2017 | 2018 | while (1) { 2019 | const char *next_p = utf8_safe_decode(L, p, &next_ch); 2020 | int bind = 0; 2021 | 2022 | if (ch == '\r') { 2023 | if (next_ch == '\n') { 2024 | /* CR binds to following LF */ 2025 | bind = 1; 2026 | } else { 2027 | break; 2028 | } 2029 | } else if (ch == '\n' || next_ch == '\r' || next_ch == '\n') { 2030 | /* CR/LF do not bind to any other codepoint or in any other way */ 2031 | break; 2032 | } else if (find_in_range(cntrl_table, table_size(cntrl_table), ch) && !find_in_range(prepend_table, table_size(prepend_table), ch) && ch != 0x200D) { 2033 | /* Control characters do not bind to anything */ 2034 | break; 2035 | } else if (next_ch == 0x200D) { 2036 | /* U+200D is ZERO WIDTH JOINER, it always binds to preceding char */ 2037 | if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) { 2038 | /* After an Extended_Pictographic codepoint and ZWJ, we bind to a following Extended_Pictographic */ 2039 | utfint nextnext_ch; 2040 | const char *probe_ep = utf8_safe_decode(L, next_p, &nextnext_ch); 
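                /* Illustrative note: this probe corresponds to the Extended_Pictographic +
                 * ZWJ + Extended_Pictographic case of UAX #29 (rule GB11); e.g. U+1F469 WOMAN,
                 * U+200D ZWJ, U+1F680 ROCKET form a single grapheme cluster. */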
2041 | if (find_in_range(pictographic_table, table_size(pictographic_table), nextnext_ch)) { 2042 | p = probe_ep; 2043 | ch = nextnext_ch; 2044 | continue; 2045 | } 2046 | } 2047 | bind = 1; 2048 | } else if (find_in_range(cntrl_table, table_size(cntrl_table), next_ch) && !find_in_range(prepend_table, table_size(prepend_table), next_ch)) { 2049 | /* Control characters do not bind to anything */ 2050 | break; 2051 | } else { 2052 | if (indic_conjunct_type(ch) == INDIC_CONSONANT) { 2053 | utfint probed_ch = next_ch; 2054 | const char *probe = next_p; 2055 | int indic_type = indic_conjunct_type(probed_ch); 2056 | int saw_linker = 0; 2057 | while (indic_type) { 2058 | /* Consume any number of Extend or Linker codepoints, followed by a single Consonant 2059 | * The sequence must contain at least one Linker, however! */ 2060 | if (indic_type == INDIC_LINKER) { 2061 | saw_linker = 1; 2062 | } else if (indic_type == INDIC_CONSONANT) { 2063 | if (!saw_linker) 2064 | break; 2065 | p = probe; 2066 | ch = probed_ch; 2067 | goto next_iteration; 2068 | } 2069 | if (probe >= e) 2070 | break; 2071 | probe = utf8_safe_decode(L, probe, &probed_ch); 2072 | indic_type = indic_conjunct_type(probed_ch); 2073 | } 2074 | } 2075 | 2076 | if (find_in_range(compose_table, table_size(compose_table), next_ch) || (next_ch >= 0x1F3FB && next_ch <= 0x1F3FF)) { 2077 | /* The 2nd codepoint has property Grapheme_Extend, or is an Emoji_Modifier codepoint */ 2078 | if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) { 2079 | /* Consume any number of 'extend' codepoints, one ZWJ, and following Extended_Pictographic codepoint */ 2080 | utfint probed_ch; 2081 | const char *probe = next_p; 2082 | while (probe < e) { 2083 | probe = utf8_safe_decode(L, probe, &probed_ch); 2084 | if (probed_ch == 0x200D) { 2085 | if (probe < e) { 2086 | probe = utf8_safe_decode(L, probe, &probed_ch); 2087 | if (find_in_range(pictographic_table, table_size(pictographic_table), probed_ch)) { 2088 | next_p = probe; 2089 | next_ch = probed_ch; 2090 | } 2091 | } 2092 | break; 2093 | } else if (find_in_range(compose_table, table_size(compose_table), probed_ch) || (probed_ch >= 0x1F3FB && probed_ch <= 0x1F3FF)) { 2094 | next_p = probe; 2095 | next_ch = probed_ch; 2096 | } else { 2097 | break; 2098 | } 2099 | } 2100 | } 2101 | bind = 1; 2102 | } else if (find_in_range(spacing_mark_table, table_size(spacing_mark_table), next_ch)) { 2103 | /* The 2nd codepoint is in general category Spacing_Mark */ 2104 | bind = 1; 2105 | } else if (find_in_range(prepend_table, table_size(prepend_table), ch)) { 2106 | /* The 1st codepoint has property Prepend_Concatenation_Mark, or is a type of 2107 | * Indic Syllable which binds to the following codepoint */ 2108 | bind = 1; 2109 | } else if (ch >= 0x1F1E6 && ch <= 0x1F1FF && next_ch >= 0x1F1E6 && next_ch <= 0x1F1FF) { 2110 | /* Regional Indicator (flag) emoji bind together; but only in twos */ 2111 | p = next_p; 2112 | ch = 0xFFFE; /* Set 'ch' to bogus value so we will not re-enter this branch on next iteration */ 2113 | continue; 2114 | } else { 2115 | /* Korean Hangul codepoints have their own special rules about when they 2116 | * are considered a single grapheme cluster */ 2117 | int hangul1 = hangul_type(ch); 2118 | if (hangul1) { 2119 | int hangul2 = hangul_type(next_ch); 2120 | if (hangul2) { 2121 | if (hangul1 == HANGUL_L) { 2122 | bind = (hangul2 != HANGUL_T); 2123 | } else if (hangul1 == HANGUL_LV || hangul1 == HANGUL_V) { 2124 | bind = (hangul2 == HANGUL_V || hangul2 == 
HANGUL_T); 2125 | } else if (hangul1 == HANGUL_LVT || hangul1 == HANGUL_T) { 2126 | bind = (hangul2 == HANGUL_T); 2127 | } 2128 | } 2129 | } 2130 | } 2131 | } 2132 | 2133 | if (!bind) 2134 | break; 2135 | p = next_p; 2136 | ch = next_ch; 2137 | next_iteration: ; 2138 | } 2139 | 2140 | lua_pushinteger(L, (p - s) + 1); 2141 | lua_replace(L, lua_upvalueindex(2)); 2142 | 2143 | lua_pushinteger(L, pos); 2144 | lua_pushinteger(L, p - s); 2145 | return 2; 2146 | } 2147 | 2148 | static int Lutf8_grapheme_indices(lua_State *L) { 2149 | size_t len; 2150 | const char *s = luaL_checklstring(L, 1, &len); 2151 | lua_Integer start = byte_relat(luaL_optinteger(L, 2, 1), len); 2152 | lua_Integer end = byte_relat(luaL_optinteger(L, 3, len), len); 2153 | luaL_argcheck(L, start >= 1, 2, "out of range"); 2154 | luaL_argcheck(L, end <= (lua_Integer)len, 3, "out of range"); 2155 | 2156 | lua_settop(L, 1); 2157 | lua_pushinteger(L, start); 2158 | lua_pushinteger(L, end); 2159 | lua_pushcclosure(L, iterate_grapheme_indices, 3); 2160 | return 1; 2161 | } 2162 | 2163 | /* lua module import interface */ 2164 | 2165 | #if LUA_VERSION_NUM >= 502 2166 | static const char UTF8PATT[] = "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"; 2167 | #else 2168 | static const char UTF8PATT[] = "[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*"; 2169 | #endif 2170 | 2171 | LUALIB_API int luaopen_utf8 (lua_State *L) { 2172 | luaL_Reg libs[] = { 2173 | #define ENTRY(name) { #name, Lutf8_##name } 2174 | ENTRY(offset), 2175 | ENTRY(codes), 2176 | ENTRY(codepoint), 2177 | 2178 | ENTRY(len), 2179 | ENTRY(sub), 2180 | ENTRY(reverse), 2181 | ENTRY(lower), 2182 | ENTRY(upper), 2183 | ENTRY(title), 2184 | ENTRY(fold), 2185 | ENTRY(byte), 2186 | ENTRY(char), 2187 | ENTRY(escape), 2188 | ENTRY(insert), 2189 | ENTRY(remove), 2190 | ENTRY(charpos), 2191 | ENTRY(next), 2192 | ENTRY(width), 2193 | ENTRY(widthindex), 2194 | ENTRY(ncasecmp), 2195 | ENTRY(find), 2196 | ENTRY(gmatch), 2197 | ENTRY(gsub), 2198 | ENTRY(match), 2199 | ENTRY(isvalid), 2200 | ENTRY(invalidoffset), 2201 | ENTRY(clean), 2202 | ENTRY(isnfc), 2203 | ENTRY(normalize_nfc), 2204 | ENTRY(grapheme_indices), 2205 | #undef ENTRY 2206 | { NULL, NULL } 2207 | }; 2208 | 2209 | #if LUA_VERSION_NUM >= 502 2210 | luaL_newlib(L, libs); 2211 | #else 2212 | luaL_register(L, "utf8", libs); 2213 | #endif 2214 | 2215 | lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)-1); 2216 | lua_setfield(L, -2, "charpattern"); 2217 | 2218 | return 1; 2219 | } 2220 | 2221 | /* win32cc: flags+='-Wall -Wextra -s -O2 -mdll -DLUA_BUILD_AS_DLL' 2222 | * win32cc: libs+='-llua54.dll' output='lua-utf8.dll' 2223 | * win32cc: run='lua.exe test.lua' 2224 | * maccc: run='lua -- test_compat.lua' 2225 | * maccc: flags+='-g --coverage -bundle -undefined dynamic_lookup' output='lua-utf8.so' */ 2226 | 2227 | -------------------------------------------------------------------------------- /parseucd.lua: -------------------------------------------------------------------------------- 1 | -- generate useful data from Unicode Character Database. 
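-- (note: each parse_* function below reads one UCD data file line by line via io.lines()
-- and builds Lua tables; a later part of this script presumably turns those tables into
-- the lookup arrays emitted as unidata.h)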
2 | -- you should have these files in UCD folder in current path: 3 | -- - UCD\CaseFolding.txt 4 | -- - UCD\DerivedCoreProperties.txt 5 | -- - UCD\DerivedNormalizationProps.txt 6 | -- - UCD\EastAsianWidth.txt 7 | -- - UCD\emoji\emoji-data.txt 8 | -- - UCD\HangulSyllableType.txt 9 | -- - UCD\IndicSyllabicCategory.txt 10 | -- - UCD\PropList.txt 11 | -- - UCD\UnicodeData.txt 12 | -- 13 | -- files can be downloaded at: http://unicode.org/Public/UCD/latest/UCD/ 14 | 15 | 16 | local function parse_UnicodeData() 17 | -- UnicodeData.txt structions: 18 | -- 0. codepoint 19 | -- 1. name 20 | -- 2. general category 21 | -- 3. canonical combining class 22 | -- 4. bidi class 23 | -- 5. decomposition type/mapping 24 | -- 6. numeric type/value 25 | -- 7. numeric type/value 26 | -- 8. numeric type/value 27 | -- 9. bidi mirrored [YN] 28 | -- 10. old unicode name 29 | -- 11. iso comment 30 | -- 12. uppercase mapping 31 | -- 13. lowercase mapping 32 | -- 14. titlecase mapping 33 | local ucd = {} 34 | 35 | local patt = "^(%x+)"..(";([^;]-)"):rep(14).."$" 36 | 37 | local last_data 38 | 39 | for line in io.lines() do 40 | local cp, name, gc, canon_cls, bidi_class, decomposition, _,_,_, _, _,_, um, lm, tm = line:match(patt) 41 | assert(cp, line) 42 | cp = tonumber(cp, 16) 43 | lm = lm ~= "" and tonumber(lm, 16) 44 | um = um ~= "" and tonumber(um, 16) 45 | tm = tm ~= "" and tonumber(tm, 16) 46 | local decomp1, decomp2 = decomposition:match "^(%x+) (%x+)$" 47 | if decomp1 and decomp2 then 48 | decomposition = { tonumber(decomp1, 16), tonumber(decomp2, 16) } 49 | elseif decomposition:match("^%x+$") then 50 | decomposition = { tonumber(decomposition, 16) } 51 | else 52 | decomposition = nil 53 | end 54 | if last_data and last_data.name:match"First%>$" then 55 | assert(name:match"Last%>$", line) 56 | for i = last_data.cp, cp-1 do 57 | ucd[#ucd+1] = { 58 | cp = i, 59 | name = name, 60 | gc = gc, 61 | bidi_class = bidi_class, 62 | lm = lm, um = um, tm = tm, 63 | canon_cls = tonumber(canon_cls), 64 | decomposition = decomposition 65 | } 66 | end 67 | end 68 | local data = { 69 | cp = cp, 70 | name = name, 71 | gc = gc, 72 | bidi_class = bidi_class, 73 | lm = lm, um = um, tm = tm, 74 | canon_cls = tonumber(canon_cls), 75 | decomposition = decomposition 76 | } 77 | ucd[#ucd+1] = data 78 | last_data = data 79 | end 80 | table.sort(ucd, function(a, b) return a.cp < b.cp end) 81 | 82 | return ucd 83 | end 84 | 85 | local function parse_EastAsianWidth() 86 | local wide, ambi = {}, {} 87 | 88 | for line in io.lines() do 89 | line = line:gsub("%s*%#.*$", "") 90 | if line ~= "" then 91 | local first, last, mark 92 | first, mark = line:match "^(%x+)%s*%;%s*(%w+)$" 93 | if first then 94 | last = first 95 | else 96 | first, last, mark = line:match "^(%x+)%.%.(%x+)%s*%;%s*(%w+)$" 97 | assert(first, line) 98 | end 99 | 100 | first = tonumber(first, 16) 101 | last = tonumber(last, 16) 102 | 103 | if mark == 'W' or mark == 'F' then 104 | for i = first, last do 105 | wide[#wide+1] = i 106 | end 107 | elseif mark == 'A' then 108 | for i = first, last do 109 | ambi[#ambi+1] = i 110 | end 111 | end 112 | end 113 | end 114 | 115 | return wide, ambi 116 | end 117 | 118 | local function parse_CaseFolding() 119 | local mapping = {} 120 | for line in io.lines() do 121 | line = line:gsub("%s*%#.*$", "") 122 | if line ~= "" then 123 | local cp, class, mcp = line:match "^%s*(%x+)%s*;%s*(%w+)%s*;%s*(%x+)" 124 | assert(cp, line) 125 | if class == 'C' or class == 'S' then 126 | cp = tonumber(cp, 16) 127 | mcp = tonumber(mcp, 16) 128 | 
mapping[#mapping+1] = { cp = cp, mapping = mcp } 129 | end 130 | end 131 | end 132 | return mapping 133 | end 134 | 135 | local function parse_PropList(f) 136 | local ranges = {} 137 | local lookup = {} 138 | 139 | local arg = f 140 | if type(f) == 'table' then 141 | f = function(cp) return arg[cp] end 142 | elseif type(f) == 'string' then 143 | f = function(cp) return arg == cp end 144 | end 145 | 146 | for line in io.lines() do 147 | line = line:gsub("%s*%#.*$", "") 148 | if line ~= "" then 149 | local first, last, mark 150 | first, mark = line:match "^(%x+)%s*%;%s*([%w%s_;]+)%s*$" 151 | if first then 152 | last = first 153 | else 154 | first, last, mark = line:match "^(%x+)%.%.(%x+)%s*%;%s*([%w%s_;]+)%s*$" 155 | assert(first, line) 156 | end 157 | 158 | first = tonumber(first, 16) 159 | last = tonumber(last, 16) 160 | 161 | if f(mark) then 162 | for i = first, last do 163 | if not lookup[i] then 164 | lookup[i] = true 165 | ranges[#ranges+1] = i 166 | end 167 | end 168 | end 169 | end 170 | end 171 | 172 | table.sort(ranges) 173 | return ranges, lookup 174 | end 175 | 176 | local function parse_HangulSyllableType() 177 | local ranges = {} 178 | local lookup = {} 179 | 180 | for line in io.lines() do 181 | line = line:gsub("%s*%#.*$", "") 182 | if line ~= "" then 183 | local first, last, mark 184 | first, mark = line:match "^(%x+)%s*%;%s*([%w%s_;]+)%s*$" 185 | if first then 186 | last = first 187 | else 188 | first, last, mark = line:match "^(%x+)%.%.(%x+)%s*%;%s*([%w%s_;]+)%s*$" 189 | assert(first, line) 190 | end 191 | 192 | first = tonumber(first, 16) 193 | last = tonumber(last, 16) 194 | 195 | for i = first, last do 196 | if not lookup[i] then 197 | lookup[i] = true 198 | ranges[#ranges+1] = { cp=i, offset='HANGUL_'..mark } 199 | end 200 | end 201 | end 202 | end 203 | 204 | table.sort(ranges, function(a, b) return a.cp < b.cp end) 205 | return ranges 206 | end 207 | 208 | local function parse_NormalizationProps(prop, ucd) 209 | local codepoints = {} 210 | 211 | for line in io.lines() do 212 | local cps, property, tail = line:match "^([%x%.]+)%s*;%s*([%w%_]+)(.*)$" 213 | if property == prop then 214 | local value = tail:match "^%s*;%s*(%w+)" 215 | local from = cps:match "^%x+" 216 | local to = cps:match "%.%.(%x+)$" 217 | if not to then to = from end 218 | 219 | from = tonumber(from, 16) 220 | to = tonumber(to, 16) 221 | 222 | for cp = from, to, 1 do 223 | codepoints[#codepoints+1] = cp 224 | end 225 | end 226 | end 227 | 228 | table.sort(codepoints) 229 | return codepoints 230 | end 231 | 232 | local function get_ranges(list, func) 233 | local first, last, step, offset 234 | local ranges = {} 235 | for i = 1, #list do 236 | local v_cp, v_offset 237 | local v = list[i] 238 | local res = not func or func(v) 239 | if type(v) == 'number' then 240 | v_cp, v_offset = v, nil 241 | elseif v.cp then 242 | v_cp, v_offset = v.cp, v.offset 243 | end 244 | if res then 245 | if first and 246 | (not offset or offset == v_offset) and 247 | (not step or step == v_cp - last) then 248 | step = v_cp - last 249 | last = v_cp 250 | else 251 | if first then 252 | local r = { first = first, last = last, step = step, offset = offset } 253 | ranges[#ranges+1] = r 254 | end 255 | first, last, step = v_cp, v_cp, nil 256 | offset = v_offset 257 | end 258 | end 259 | end 260 | if first then 261 | local r = { first = first, last = last, step = step, offset = offset } 262 | ranges[#ranges+1] = r 263 | end 264 | return ranges 265 | end 266 | 267 | --[[ 268 | local function merge_ranges(...) 
269 | local ranges = {} 270 | local lookup = {} 271 | for i = 1, select('#', ...) do 272 | for _,v in ipairs(select(i, ...)) do 273 | if not lookup[v] then 274 | lookup[v] = true 275 | ranges[#ranges+1] = v 276 | end 277 | end 278 | end 279 | table.sort(ranges) 280 | return ranges 281 | end 282 | 283 | local function diff_ranges(base, sub, force) 284 | local ranges = {} 285 | local lookup = {} 286 | local missing = {} 287 | for _, v in ipairs(sub) do 288 | for i = v.first, v.last, v.step or 1 do 289 | lookup[i] = true 290 | missing[i] = true 291 | end 292 | end 293 | for _, v in ipairs(base) do 294 | for i = v.first, v.last, v.step or 1 do 295 | if not lookup[i] then 296 | ranges[#ranges+1] = i 297 | end 298 | missing[i] = nil 299 | end 300 | end 301 | if force and next(missing) then 302 | local m = {} 303 | for i in pairs(missing) do 304 | m[#m+1] = i 305 | end 306 | table.sort(m) 307 | for i, v in ipairs(m) do 308 | m[i] = ("%X"):format(v) 309 | end 310 | error(table.concat(m, "\n")) 311 | end 312 | return get_ranges(ranges) 313 | end 314 | --]] 315 | 316 | local function get_ucd(cp, ucd) 317 | local data = ucd[cp+1] 318 | if data.cp > cp then 319 | local i = cp 320 | while data.cp > cp do 321 | data = ucd[i] 322 | i = i - 1 323 | end 324 | end 325 | return data 326 | end 327 | 328 | local function write_ranges(name, ranges) 329 | io.write("static struct range_table "..name.."_table[] = {\n") 330 | for _, r in ipairs(ranges) do 331 | io.write((" { 0x%X, 0x%X, %d },\n"):format(r.first, r.last, r.step or 1)) 332 | end 333 | io.write "};\n\n" 334 | end 335 | 336 | local function write_convtable(name, conv) 337 | io.write("static struct conv_table "..name.."_table[] = {\n") 338 | for _, c in ipairs(conv) do 339 | io.write((" { 0x%X, 0x%X, %d, %d },\n"):format( 340 | c.first, c.last, c.step or 1, c.offset)) 341 | end 342 | io.write "};\n\n" 343 | end 344 | 345 | local function write_canon_cls_table(name, ucd) 346 | io.write("static struct canon_cls_table "..name.."_table[] = {\n") 347 | local start, prev = { canon_cls=0 }, { canon_cls=0 } 348 | for _, data in ipairs(ucd) do 349 | if data.canon_cls ~= prev.canon_cls then 350 | if prev.canon_cls ~= 0 then 351 | io.write((" { 0x%X, 0x%X, %d },\n"):format(start.cp, prev.cp, prev.canon_cls)) 352 | end 353 | start = data 354 | end 355 | prev = data 356 | end 357 | if prev.canon_cls ~= 0 then 358 | io.write((" { 0x%X, 0x%X, %d },\n"):format(start.cp, prev.cp, prev.canon_cls)) 359 | end 360 | io.write "};\n\n" 361 | end 362 | 363 | local function write_combine_table(name, tbl) 364 | local function hash(cp1, cp2) 365 | return (cp1 * 213) + cp2 366 | end 367 | local dup = {} 368 | for _, c in ipairs(tbl) do 369 | local cp1, cp2 = table.unpack(c.decomposition) 370 | if dup[hash(cp1, cp2)] then 371 | local conflicting = dup[hash(cp1, cp2)] 372 | local cp3, cp4 = table.unpack(conflicting.decomposition) 373 | error("Hash collision: "..string.format("%x %x -> %x, %x %x -> %x", cp3, cp4, hash(cp3, cp4), cp1, cp2, hash(cp1, cp2))) 374 | end 375 | dup[hash(cp1, cp2)] = c 376 | end 377 | table.sort(tbl, function(a,b) 378 | return hash(table.unpack(a.decomposition)) < hash(table.unpack(b.decomposition)) 379 | end) 380 | 381 | io.write("static struct combine_table "..name.."_table[] = {\n") 382 | for _, c in ipairs(tbl) do 383 | local cp1, cp2 = table.unpack(c.decomposition) 384 | io.write((" { 0x%X, 0x%X, 0x%X, 0x%X },\n"):format(hash(cp1, cp2), cp1, cp2, c.cp)) 385 | end 386 | io.write "};\n\n" 387 | end 388 | 389 | local function write_decompose_table(name, 
tbl, ucd) 390 | table.sort(tbl, function(a,b) 391 | return a.cp < b.cp 392 | end) 393 | io.write("static struct decompose_table "..name.."_table[] = {\n") 394 | for _, c in ipairs(tbl) do 395 | local cp1, cp2 = table.unpack(c.decomposition) 396 | local data = get_ucd(cp2, ucd) 397 | io.write((" { 0x%X, 0x%X, 0x%X, %d },\n"):format(c.cp, cp1, cp2, data.canon_cls)) 398 | end 399 | io.write "};\n\n" 400 | end 401 | 402 | local function write_type_table(name, conv) 403 | io.write("static struct type_table "..name.."_table[] = {\n") 404 | for _, c in ipairs(conv) do 405 | if c.step and c.step ~= 1 then 406 | local i = c.first 407 | while i <= c.last do 408 | io.write((" { 0x%X, 0x%X, %s },\n"):format(i, i, c.offset)) 409 | i = i + c.step 410 | end 411 | else 412 | io.write((" { 0x%X, 0x%X, %s },\n"):format(c.first, c.last, c.offset)) 413 | end 414 | end 415 | io.write "};\n\n" 416 | end 417 | 418 | 419 | io.output "unidata.h" 420 | 421 | io.write [[ 422 | /* 423 | * unidata.h - generated by parseucd.lua 424 | */ 425 | #ifndef unidata_h 426 | #define unidata_h 427 | 428 | #ifndef utfint 429 | # define utfint utfint 430 | typedef unsigned int utfint; 431 | #endif 432 | 433 | typedef struct range_table { 434 | utfint first; 435 | utfint last; 436 | int step; 437 | } range_table; 438 | 439 | typedef struct conv_table { 440 | utfint first; 441 | utfint last; 442 | int step; 443 | int offset; 444 | } conv_table; 445 | 446 | typedef struct nfc_table { 447 | utfint cp; 448 | int reason; 449 | unsigned int data1; 450 | unsigned int data2; 451 | } nfc_table; 452 | 453 | #define REASON_MUST_CONVERT_1 1 454 | #define REASON_MUST_CONVERT_2 2 455 | #define REASON_STARTER_CAN_COMBINE 3 456 | #define REASON_COMBINING_MARK 4 457 | #define REASON_JAMO_VOWEL 5 458 | #define REASON_JAMO_TRAILING 6 459 | 460 | typedef struct canon_cls_table { 461 | utfint first; 462 | utfint last; 463 | unsigned int canon_cls; 464 | } canon_cls_table; 465 | 466 | typedef struct combine_table { 467 | utfint hash; 468 | utfint cp1; 469 | utfint cp2; 470 | utfint dest; 471 | } combine_table; 472 | 473 | typedef struct decompose_table { 474 | utfint cp; 475 | utfint to1; 476 | utfint to2; 477 | unsigned int canon_cls2; 478 | } decompose_table; 479 | 480 | #define HANGUL_L 1 481 | #define HANGUL_V 2 482 | #define HANGUL_T 3 483 | #define HANGUL_LV 4 484 | #define HANGUL_LVT 5 485 | 486 | typedef struct type_table { 487 | utfint first; 488 | utfint last; 489 | int type; 490 | } type_table; 491 | 492 | #define INDIC_CONSONANT 1 493 | #define INDIC_LINKER 2 494 | #define INDIC_EXTEND 3 495 | 496 | ]] 497 | 498 | do 499 | local function ranges(name, f) 500 | local r = get_ranges((parse_PropList(f))) 501 | write_ranges(name, r) 502 | end 503 | 504 | io.input "UCD/DerivedCoreProperties.txt" 505 | ranges("alpha", "Alphabetic") 506 | 507 | io.input "UCD/DerivedCoreProperties.txt" 508 | ranges("lower", "Lowercase") 509 | 510 | io.input "UCD/DerivedCoreProperties.txt" 511 | ranges("upper", "Uppercase") 512 | 513 | io.input "UCD/PropList.txt" 514 | ranges("xdigit", "Hex_Digit") 515 | 516 | io.input "UCD/PropList.txt" 517 | ranges("space", "White_Space") 518 | 519 | io.input "UCD/DerivedCoreProperties.txt" 520 | ranges("unprintable", "Default_Ignorable_Code_Point") 521 | 522 | io.input "UCD/DerivedCoreProperties.txt" 523 | ranges("graph", "Grapheme_Base") 524 | 525 | io.input "UCD/DerivedCoreProperties.txt" 526 | ranges("compose", "Grapheme_Extend") 527 | 528 | io.input "UCD/emoji/emoji-data.txt" 529 | ranges("pictographic", 
"Extended_Pictographic") 530 | end 531 | 532 | do 533 | io.input "UCD/PropList.txt" 534 | local prepend = parse_PropList("Prepended_Concatenation_Mark") 535 | io.input "UCD/IndicSyllabicCategory.txt" 536 | local indic = parse_PropList({ Consonant_Preceding_Repha=true, Consonant_Prefixed=true }) 537 | for _,cp in ipairs(indic) do 538 | table.insert(prepend, cp) 539 | end 540 | table.sort(prepend) 541 | write_ranges("prepend", get_ranges(prepend)) 542 | end 543 | 544 | do 545 | io.input "UCD/DerivedCoreProperties.txt" 546 | local linker = parse_PropList("InCB; Linker") 547 | io.input "UCD/DerivedCoreProperties.txt" 548 | local consonant = parse_PropList("InCB; Consonant") 549 | io.input "UCD/DerivedCoreProperties.txt" 550 | local extend = parse_PropList("InCB; Extend") 551 | local indic_type = {} 552 | for _,cp in ipairs(consonant) do table.insert(indic_type, { cp=cp, offset='INDIC_CONSONANT' }) end 553 | for _,cp in ipairs(linker) do table.insert(indic_type, { cp=cp, offset='INDIC_LINKER' }) end 554 | for _,cp in ipairs(extend) do table.insert(indic_type, { cp=cp, offset='INDIC_EXTEND' }) end 555 | table.sort(indic_type, function(a, b) return a.cp < b.cp end) 556 | write_type_table("indic", get_ranges(indic_type)) 557 | end 558 | 559 | do 560 | io.input "UCD/UnicodeData.txt" 561 | local ucd = parse_UnicodeData() 562 | local function set(s) 563 | local hasht = {} 564 | for word in s:gmatch "%w%w" do 565 | hasht[word] = true 566 | end 567 | return function(data) 568 | return hasht[data.gc] 569 | end 570 | end 571 | local function mapping(field) 572 | return function(data) 573 | data.offset = nil 574 | if data[field] then 575 | data.offset = data[field] - data.cp 576 | return true 577 | end 578 | end 579 | end 580 | local cntrl = "Cc Cf" 581 | local digit = "Nd" 582 | local alnum_extend = "Nd Nl No" 583 | local punct = "Sk Sc Sm Pc Pd Ps Pe Pi Pf Po" 584 | local spacing_mark = "Mc" 585 | write_ranges("cntrl", get_ranges(ucd, set(cntrl))) 586 | write_ranges("digit", get_ranges(ucd, set(digit))) 587 | write_ranges("alnum_extend", get_ranges(ucd, set(alnum_extend))) 588 | write_ranges("punct", get_ranges(ucd, set(punct))) 589 | write_ranges("spacing_mark", get_ranges(ucd, set(spacing_mark))) 590 | write_convtable("tolower", get_ranges(ucd, mapping "lm")) 591 | write_convtable("toupper", get_ranges(ucd, mapping "um")) 592 | write_convtable("totitle", get_ranges(ucd, mapping "tm")) 593 | end 594 | 595 | do 596 | io.input "UCD/CaseFolding.txt" 597 | local mapping = parse_CaseFolding() 598 | write_convtable("tofold", get_ranges(mapping, function(data) 599 | data.offset = data.mapping - data.cp 600 | return true 601 | end)) 602 | end 603 | 604 | do 605 | io.input "UCD/EastAsianWidth.txt" 606 | local wide, ambi = parse_EastAsianWidth() 607 | write_ranges("doublewidth", get_ranges(wide)) 608 | write_ranges("ambiwidth", get_ranges(ambi)) 609 | end 610 | 611 | do 612 | io.input "UCD/HangulSyllableType.txt" 613 | write_type_table("hangul", (get_ranges(parse_HangulSyllableType()))) 614 | end 615 | 616 | do 617 | io.input "UCD/UnicodeData.txt" 618 | local ucd = parse_UnicodeData() 619 | 620 | -- Write out table of all combining marks 621 | write_canon_cls_table("nfc_combining", ucd) 622 | 623 | -- Find all primary composites which we may need to consider during NFC normalization 624 | io.input "UCD/DerivedNormalizationProps.txt" 625 | local excluded = {} 626 | for _, cp in ipairs(parse_NormalizationProps('Full_Composition_Exclusion')) do 627 | excluded[cp] = true 628 | end 629 | local composite, can_combine 
= {}, {} 630 | for _, data in ipairs(ucd) do 631 | local decomp = data.decomposition 632 | if not excluded[data.cp] and decomp and #decomp == 2 then 633 | table.insert(composite, data) 634 | can_combine[decomp[2]] = true 635 | end 636 | end 637 | write_combine_table("nfc_composite", composite) 638 | write_decompose_table("nfc_decompose", composite, ucd) 639 | 640 | io.write("static struct nfc_table nfc_quickcheck_table[] = {\n") 641 | 642 | io.input "UCD/DerivedNormalizationProps.txt" 643 | for _, cp in ipairs(parse_NormalizationProps('NFC_QC', ucd)) do 644 | local data = get_ucd(cp, ucd) 645 | local decomp = data.decomposition 646 | if decomp then 647 | if #decomp == 1 then 648 | local decomp_data = get_ucd(decomp[1], ucd) 649 | io.write((" { 0x%X, REASON_MUST_CONVERT_1, 0x%X, %d },\n"):format(data.cp, decomp[1], decomp_data.canon_cls)) 650 | else 651 | io.write((" { 0x%X, REASON_MUST_CONVERT_2, 0x%X, 0x%X },\n"):format(data.cp, decomp[1], decomp[2])) 652 | end 653 | elseif data.canon_cls ~= 0 then 654 | io.write((" { 0x%X, REASON_COMBINING_MARK, 0, 0 },\n"):format(data.cp)) 655 | elseif can_combine[data.cp] then 656 | io.write((" { 0x%X, REASON_STARTER_CAN_COMBINE, 0, 0 },\n"):format(data.cp)) 657 | elseif data.cp >= 0x1161 and data.cp <= 0x1175 then 658 | io.write((" { 0x%X, REASON_JAMO_VOWEL, 0, 0 },\n"):format(data.cp)) 659 | elseif data.cp >= 0x11A8 and data.cp <= 0x11C2 then 660 | io.write((" { 0x%X, REASON_JAMO_TRAILING, 0, 0 },\n"):format(data.cp)) 661 | else 662 | error("Don't know why we need to check for codepoint "..string.format("0x%x", data.cp).." when doing NFC normalization") 663 | end 664 | end 665 | 666 | io.write "};\n\n" 667 | end 668 | 669 | io.write "#endif /* unidata_h */\n" 670 | -------------------------------------------------------------------------------- /rockspecs/luautf8-0.1.6-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "luautf8" 2 | version = "0.1.6-1" 3 | source = { 4 | url = "https://github.com/starwing/luautf8/archive/refs/tags/0.1.6.tar.gz", 5 | dir = "luautf8-0.1.6" 6 | } 7 | description = { 8 | summary = "A UTF-8 support module for Lua", 9 | detailed = [[ 10 | This module adds UTF-8 support to Lua. It's compatible with Lua "string" module. 11 | ]], 12 | homepage = "http://github.com/starwing/luautf8", 13 | license = "MIT" 14 | } 15 | dependencies = { 16 | "lua >= 5.1" 17 | } 18 | build = { 19 | type = "builtin", 20 | modules = { 21 | ["lua-utf8"] = "lutf8lib.c" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /rockspecs/luautf8-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "luautf8" 2 | version = "scm-1" 3 | source = { 4 | url = "git://github.com/starwing/luautf8" 5 | } 6 | description = { 7 | summary = "A UTF-8 support module for Lua", 8 | detailed = [[ 9 | This module adds UTF-8 support to Lua. It's compatible with Lua "string" module. 
10 | ]], 11 | homepage = "http://github.com/starwing/luautf8", 12 | license = "MIT" 13 | } 14 | dependencies = { 15 | "lua >= 5.1" 16 | } 17 | build = { 18 | type = "builtin", 19 | modules = { 20 | ["lua-utf8"] = "lutf8lib.c" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /test.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require 'lua-utf8' 2 | local unpack = unpack or table.unpack 3 | local E = utf8.escape 4 | 5 | local function get_codes(s) 6 | return table.concat({utf8.byte(s, 1, -1)}, ' ') 7 | end 8 | 9 | local t = { 20985, 20984, 26364, 25171, 23567, 24618, 20861 } 10 | -- test escape & len 11 | assert(get_codes(E"%123%xabc%x{ABC}%d%u{456}") == '123 2748 2748 100 456') 12 | 13 | local s = E('%'..table.concat(t, '%')) 14 | assert(utf8.len(s) == 7) 15 | assert(get_codes(s) == table.concat(t, ' ')) 16 | 17 | 18 | -- test offset 19 | 20 | local function assert_error(f, msg) 21 | local s,e = pcall(f) 22 | return assert(not s and e:match(msg)) 23 | end 24 | 25 | assert(utf8.offset("中国", 0) == 1) 26 | assert(utf8.offset("中国", 0,1) == 1) 27 | assert(utf8.offset("中国", 0,2) == 1) 28 | assert(utf8.offset("中国", 0,3) == 1) 29 | assert(utf8.offset("中国", 0,4) == 4) 30 | assert(utf8.offset("中国", 0,5) == 4) 31 | assert(utf8.offset("中国", 1) == 1) 32 | assert_error(function() utf8.offset("中国", 1,2) end, 33 | "initial position is a continuation byte") 34 | assert(utf8.offset("中国", 2) == 4) 35 | assert(utf8.offset("中国", 3) == 7) 36 | assert(utf8.offset("中国", 4) == nil) 37 | assert(utf8.offset("中国", -1,-3) == 1) 38 | assert(utf8.offset("中国", -1,1) == nil) 39 | 40 | -- test byte 41 | local function assert_table_equal(t1, t2, i, j) 42 | i = i or 1 43 | j = j or #t2 44 | local len = j-i+1 45 | assert(#t1 == len) 46 | for cur = 1, len do 47 | assert(t1[cur] == t2[cur+i-1]) 48 | end 49 | end 50 | assert_table_equal({utf8.byte(s, 2)}, t, 2, 2) 51 | assert_table_equal({utf8.byte(s, 1, -1)}, t) 52 | assert_table_equal({utf8.byte(s, -100)}, {}) 53 | assert_table_equal({utf8.byte(s, -100, -200)}, {}) 54 | assert_table_equal({utf8.byte(s, -200, -100)}, {}) 55 | assert_table_equal({utf8.byte(s, 100)}, {}) 56 | assert_table_equal({utf8.byte(s, 100, 200)}, {}) 57 | assert_table_equal({utf8.byte(s, 200, 100)}, {}) 58 | 59 | 60 | -- test char 61 | assert(s == utf8.char(unpack(t))) 62 | 63 | -- test range 64 | for i = 1, #t do 65 | assert(utf8.byte(s, i) == t[i]) 66 | end 67 | 68 | -- test sub 69 | assert(get_codes(utf8.sub(s, 2, -2)) == table.concat(t, ' ', 2, #t-1)) 70 | assert(get_codes(utf8.sub(s, -100)) == table.concat(t, ' ')) 71 | assert(get_codes(utf8.sub(s, -100, -200)) == "") 72 | assert(get_codes(utf8.sub(s, -100, -100)) == "") 73 | assert(get_codes(utf8.sub(s, -100, 0)) == "") 74 | assert(get_codes(utf8.sub(s, -200, -100)) == "") 75 | assert(get_codes(utf8.sub(s, 100, 200)) == "") 76 | assert(get_codes(utf8.sub(s, 200, 100)) == "") 77 | 78 | 79 | -- test insert/remove 80 | assert(utf8.insert("abcdef", "...") == "abcdef...") 81 | assert(utf8.insert("abcdef", 0, "...") == "abcdef...") 82 | assert(utf8.insert("abcdef", 1, "...") == "...abcdef") 83 | assert(utf8.insert("abcdef", 6, "...") == "abcde...f") 84 | assert(utf8.insert("abcdef", 7, "...") == "abcdef...") 85 | assert(utf8.insert("abcdef", 3, "...") == "ab...cdef") 86 | assert(utf8.insert("abcdef", -3, "...") == "abc...def") 87 | assert(utf8.remove("abcdef", 3, 3) == "abdef") 88 | assert(utf8.remove("abcdef", 3, 4) == "abef") 89 | 
assert(utf8.remove("abcdef", 4, 3) == "abcdef") 90 | assert(utf8.remove("abcdef", -3, -3) == "abcef") 91 | assert(utf8.remove("abcdef", 100) == "abcdef") 92 | assert(utf8.remove("abcdef", -100) == "") 93 | assert(utf8.remove("abcdef", -100, 0) == "abcdef") 94 | assert(utf8.remove("abcdef", -100, -200) == "abcdef") 95 | assert(utf8.remove("abcdef", -200, -100) == "abcdef") 96 | assert(utf8.remove("abcdef", 100, 200) == "abcdef") 97 | assert(utf8.remove("abcdef", 200, 100) == "abcdef") 98 | 99 | do 100 | local s = E"a%255bc" 101 | assert(utf8.len(s, 4)) 102 | assert(string.len(s, 6)) 103 | assert(utf8.charpos(s) == 1) 104 | assert(utf8.charpos(s, 0) == 1) 105 | assert(utf8.charpos(s, 1) == 1) 106 | assert(utf8.charpos(s, 2) == 2) 107 | assert(utf8.charpos(s, 3) == 4) 108 | assert(utf8.charpos(s, 4) == 5) 109 | assert(utf8.charpos(s, 5) == nil) 110 | assert(utf8.charpos(s, 6) == nil) 111 | assert(utf8.charpos(s, -1) == 5) 112 | assert(utf8.charpos(s, -2) == 4) 113 | assert(utf8.charpos(s, -3) == 2) 114 | assert(utf8.charpos(s, -4) == 1) 115 | assert(utf8.charpos(s, -5) == nil) 116 | assert(utf8.charpos(s, -6) == nil) 117 | assert(utf8.charpos(s, 3, -1) == 2) 118 | assert(utf8.charpos(s, 3, 0) == 2) 119 | assert(utf8.charpos(s, 3, 1) == 4) 120 | assert(utf8.charpos(s, 6, -3) == 2) 121 | assert(utf8.charpos(s, 6, -4) == 1) 122 | assert(utf8.charpos(s, 6, -5) == nil) 123 | end 124 | 125 | local idx = 1 126 | for pos, code in utf8.next, s do 127 | assert(t[idx] == code) 128 | idx = idx + 1 129 | end 130 | 131 | assert(utf8.ncasecmp("abc", "AbC") == 0) 132 | assert(utf8.ncasecmp("abc", "AbE") == -1) 133 | assert(utf8.ncasecmp("abe", "AbC") == 1) 134 | assert(utf8.ncasecmp("abc", "abcdef") == -1) 135 | assert(utf8.ncasecmp("abcdef", "abc") == 1) 136 | assert(utf8.ncasecmp("abZdef", "abcZef") == 1) 137 | 138 | assert(utf8.gsub("x^[]+$", "%p", "%%%0") == "x%^%[%]%+%$") 139 | 140 | 141 | -- test invalid 142 | 143 | -- 1110-1010 10-000000 0110-0001 144 | do 145 | local s = "\234\128\97" 146 | assert(utf8.len(s, nil, nil, true) == 2) 147 | assert_table_equal({utf8.len(s)}, {nil, 1}, 1, 2) 148 | 149 | -- 1111-0000 10-000000 10-000000 ... 150 | s = "\240\128\128\128\128" 151 | assert_table_equal({utf8.len(s)}, {nil, 1}, 1, 2) 152 | end 153 | 154 | 155 | -- test compose 156 | local function assert_fail(f, patt) 157 | local ok, msg = pcall(f) 158 | assert(not ok) 159 | assert(msg:match(patt), msg) 160 | end 161 | do 162 | local s = "नमस्ते" 163 | assert(utf8.len(s) == 6) 164 | assert(utf8.reverse(s) == "तेस्मन") 165 | assert(utf8.reverse(s.." 
", true) == " ेत्समन") 166 | assert(utf8.match(s..'\2', "%g+") == s) 167 | assert_fail(function() utf8.reverse(E"%xD800") end, "invalid UTF%-8 code") 168 | end 169 | 170 | 171 | -- test match 172 | assert(utf8.match('%c', '') == nil) -- %c does not match U+F000 173 | 174 | 175 | -- test codepoint 176 | for i = 1, 1000 do 177 | assert(utf8.codepoint(E("%"..i)) == i) 178 | end 179 | assert_fail(function() utf8.codepoint(E"%xD800") end, "invalid UTF%-8 code") 180 | 181 | -- test escape 182 | assert_fail(function() E"%{1a1}" end, "invalid escape 'a'") 183 | 184 | 185 | -- test codes 186 | local result = { [1] = 20985; [4] = 20984; [7] = 26364; 187 | [10] = 25171; [13] = 23567; [16] = 24618; [19] = 20861; } 188 | for p, c in utf8.codes(s) do 189 | assert(result[p] == c) 190 | end 191 | for p, c in utf8.codes(s, true) do 192 | assert(result[p] == c) 193 | end 194 | assert_fail(function() 195 | for p, c in utf8.codes(E"%xD800") do 196 | assert(result[p] == c) 197 | end 198 | end, "invalid UTF%-8 code") 199 | 200 | 201 | -- test width 202 | assert(utf8.width('नमस्ते\2') == 5) 203 | assert(utf8.width(E'%xA1') == 1) 204 | assert(utf8.width(E'%xA1', 2) == 2) 205 | assert(utf8.width(E'%x61C') == 0) 206 | assert(utf8.width "A" == 1) 207 | assert(utf8.width "A" == 2) 208 | assert(utf8.width(97) == 1) 209 | assert(utf8.width(65313) == 2) 210 | assert_fail(function() utf8.width(true) end, "number/string expected, got boolean") 211 | assert(utf8.widthindex("abcdef", 3) == 3) 212 | assert(utf8.widthindex("abcdef", 7) == 7) 213 | 214 | -- test patterns 215 | assert_fail(function() utf8.gsub("a", ".", function() return {} end) end, 216 | "invalid replacement value %(a table%)") 217 | assert_fail(function() utf8.gsub("a", ".", "%z") end, 218 | "invalid use of '%%' in replacement string") 219 | assert(utf8.find("abcabc", "ab", -10) == 1) 220 | 221 | -- test charpattern 222 | do 223 | local subj, n = "school=школа", 0 224 | for c in string.gmatch(subj, utf8.charpattern) do n = n+1 end 225 | assert(n == utf8.len(subj)) 226 | end 227 | 228 | 229 | -- test isvalid 230 | local good_strings = { 231 | '', 232 | 'A', 233 | 'abcdefghijklmnopqrstuvwxyz', 234 | "``", 235 | "@", 236 | 'नमस्ते', 237 | '中国', 238 | '日本語01234567890。', 239 | 'ひらがな', 240 | 'Καλημέρα', 241 | 'АБВГ', 242 | '⡌⠁⠧⠑ ⠼', 243 | '∑ f(i)', 244 | 'Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς', 245 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789 abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿАБВГДабвгд∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣', 246 | 'გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს', 247 | '\000' -- NUL is valid in UTF-8 248 | } 249 | 250 | for _, good in ipairs(good_strings) do 251 | assert(utf8.isvalid(good)) 252 | end 253 | 254 | assert(not utf8.isvalid("\255")) -- illegal byte 0xFF 255 | assert(not utf8.isvalid("abc\254def")) -- illegal byte 0xFE 256 | 257 | assert(not utf8.isvalid("123 \223")) -- truncated code unit 0xDF 258 | assert(not utf8.isvalid("123 \239\191")) -- truncated code unit 0xEF BF 259 | assert(not utf8.isvalid("123 \240\191")) -- truncated code unit 0xF0 BF 260 | assert(not utf8.isvalid("123 \240\191\191")) -- truncated code unit 0xF0 BF BF 261 | 262 | assert(not utf8.isvalid('\223ABC')) -- code unit 0xDF ended too soon and went to ASCII 263 | assert(not utf8.isvalid('\239\191ABC')) -- code unit 0xEF BF ended too soon and went to ASCII 264 | assert(not utf8.isvalid('\240\191ABC')) -- code unit 0xF0 BF ended too soon and 
went to ASCII 265 | assert(not utf8.isvalid('\240\191\191ABC')) -- code unit 0xF0 BF BF ended too soon and went to ASCII 266 | 267 | assert(not utf8.isvalid('\223中')) -- code unit 0xDF ended too soon and went to another multi-byte char 268 | assert(not utf8.isvalid('\239\191中')) -- code unit 0xEF BF ended too soon and went to another multi-byte char 269 | assert(not utf8.isvalid('\240\191中')) -- code unit 0xF0 BF ended too soon and went to another multi-byte char 270 | assert(not utf8.isvalid('\240\191\191中')) -- code unit 0xF0 BF BF ended too soon and went to another multi-byte char 271 | 272 | assert(utf8.isvalid('\237\159\191')) -- U+D7FF is valid 273 | assert(not utf8.isvalid('\237\160\128')) -- U+D800; reserved for UTF-16 surrogate 274 | assert(not utf8.isvalid('\237\175\191')) -- U+DBFF; reserved for UTF-16 surrogate 275 | assert(not utf8.isvalid('\237\191\191')) -- U+DFFF; reserved for UTF-16 surrogate 276 | assert(utf8.isvalid('\238\128\128')) -- U+E000 is valid 277 | 278 | assert(utf8.isvalid('\244\143\191\191')) -- U+10FFFF is valid 279 | assert(not utf8.isvalid('\244\144\128\128')) -- U+110000 is not valid 280 | assert(not utf8.isvalid('\247\191\191\191')) -- U+1FFFFF is not valid 281 | 282 | assert(not utf8.isvalid('\128')) -- continuation byte outside a multi-byte char 283 | assert(not utf8.isvalid('A\128A')) -- continuation byte outside a multi-byte char 284 | assert(not utf8.isvalid('中\128')) -- continuation byte outside a multi-byte char 285 | 286 | assert(not utf8.isvalid('\193\191')) -- overlong code unit 287 | assert(not utf8.isvalid('\224\159\191')) -- overlong code unit 288 | assert(not utf8.isvalid('\240\143\191\191')) -- overlong code unit 289 | 290 | -- test clean 291 | local cleaned, was_clean 292 | 293 | for _, good in ipairs(good_strings) do 294 | cleaned, was_clean = utf8.clean(good) 295 | assert(cleaned == good) 296 | assert(was_clean) 297 | end 298 | 299 | cleaned, was_clean = utf8.clean('A\128A') 300 | assert(cleaned == 'A�A') 301 | assert(not was_clean) 302 | 303 | cleaned, was_clean = utf8.clean('\128') 304 | assert(cleaned == '�') 305 | assert(not was_clean) 306 | 307 | cleaned, was_clean = utf8.clean('1\193\1912\224\159\1913\240\143\191\191', '???') 308 | assert(cleaned == '1???2???3???') 309 | assert(not was_clean) 310 | 311 | cleaned, was_clean = utf8.clean('\237\160\128\237\175\191\237\191\191') 312 | assert(cleaned == '�') -- an entire sequence of bad bytes just gets replaced with one replacement char 313 | assert(not was_clean) 314 | 315 | cleaned, was_clean = utf8.clean('123 \223', '') 316 | assert(cleaned == '123 ') 317 | assert(not was_clean) 318 | 319 | cleaned, was_clean = utf8.clean('\239\191中', '') 320 | assert(cleaned == '中') 321 | assert(not was_clean) 322 | 323 | assert_error(function() utf8.clean('abc', '\255') end, "replacement string must be valid UTF%-8") 324 | 325 | 326 | -- test invalidoffset 327 | for _, good in ipairs(good_strings) do 328 | assert(utf8.invalidoffset(good) == nil) 329 | end 330 | 331 | assert(utf8.invalidoffset("\255") == 1) 332 | assert(utf8.invalidoffset("\255", 0) == 1) 333 | assert(utf8.invalidoffset("\255", 1) == 1) 334 | assert(utf8.invalidoffset("\255", 2) == nil) 335 | assert(utf8.invalidoffset("\255", -1) == 1) 336 | assert(utf8.invalidoffset("\255", -2) == 1) 337 | assert(utf8.invalidoffset("\255", -3) == 1) 338 | 339 | assert(utf8.invalidoffset("abc\254def") == 4) 340 | assert(utf8.invalidoffset("abc\254def", 0) == 4) 341 | assert(utf8.invalidoffset("abc\254def", 1) == 4) 342 | 
assert(utf8.invalidoffset("abc\254def", 2) == 4) 343 | assert(utf8.invalidoffset("abc\254def", 3) == 4) 344 | assert(utf8.invalidoffset("abc\254def", 4) == 4) 345 | assert(utf8.invalidoffset("abc\254def", 5) == nil) 346 | assert(utf8.invalidoffset("abc\254def", 6) == nil) 347 | assert(utf8.invalidoffset("abc\254def", -1) == nil) 348 | assert(utf8.invalidoffset("abc\254def", -2) == nil) 349 | assert(utf8.invalidoffset("abc\254def", -3) == nil) 350 | assert(utf8.invalidoffset("abc\254def", -4) == 4) 351 | assert(utf8.invalidoffset("abc\254def", -5) == 4) 352 | 353 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 0) == 1) 354 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 1) == 1) 355 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 2) == 2) 356 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 3) == 3) 357 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 4) == 4) 358 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 5) == 5) 359 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 6) == 6) 360 | assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', -1) == 9) 361 | 362 | 363 | local function parse_codepoints(s) 364 | local list = {} 365 | for hex in s:gmatch("%w+") do 366 | list[#list+1] = tonumber(hex, 16) 367 | end 368 | return utf8.char(unpack(list)) 369 | end 370 | 371 | -- This is an official set of test cases for Unicode normalization 372 | -- Provided by the Unicode Consortium 373 | local normalization_test_cases = {} 374 | local f = io.open('NormalizationTest.txt', 'r') 375 | for line in f:lines() do 376 | if not line:match("^#") and not line:match("^@") then 377 | local src, nfc, nfd = line:match "([%w%s]+);([%w%s]+);([%w%s]+)" 378 | table.insert(normalization_test_cases, { src = parse_codepoints(src), nfc = parse_codepoints(nfc), nfd = parse_codepoints(nfd) }) 379 | end 380 | end 381 | 382 | 383 | -- test isnfc 384 | for _,case in ipairs(normalization_test_cases) do 385 | assert(utf8.isnfc(case.nfc)) 386 | if case.src ~= case.nfc then 387 | assert(not utf8.isnfc(case.src)) 388 | end 389 | if case.nfd ~= case.nfc and case.nfd ~= case.src then 390 | assert(not utf8.isnfc(case.nfd)) 391 | end 392 | end 393 | 394 | -- Regression tests: 395 | -- Although U+1100-115F are all leading Jamo (Korean characters), for some reason, 396 | -- the normalization algorithm only combines U+1100-1112 with a following vowel Jamo 397 | assert(utf8.isnfc("\225\133\133\225\133\163")) 398 | -- In certain cases, we did not properly check if a combining mark was blocked from 399 | -- combining with the preceding starter codepoint (by another combining mark with 400 | -- the same canonicalization class) 401 | assert(utf8.isnfc("\196\148\204\162\204\167")) 402 | -- It is possible that a codepoint which is composed from a starter and combining mark 403 | -- might be decomposed, then the resulting starter might be decomposed AGAIN, then 404 | -- those two resulting combining marks might be reordered with a following combining 405 | -- mark 406 | assert(not utf8.isnfc("\199\154\204\164")) 407 | 408 | -- test normalize_nfc 409 | for _,case in ipairs(normalization_test_cases) do 410 | assert(utf8.normalize_nfc(case.src) == case.nfc) 411 | assert(utf8.normalize_nfc(case.nfc) == case.nfc) 412 | assert(utf8.normalize_nfc(case.nfd) == case.nfc) 413 | end 414 | 415 | -- Regression tests: 416 | -- Long series of combining marks; these need to be sorted in canonical order 417 | 
assert(utf8.normalize_nfc("\215\129\215\133\215\133\215\129\215\129\215\129\215\129\215\129\215\129") == "\215\129\215\129\215\129\215\129\215\129\215\129\215\129\215\133\215\133") 418 | -- After converting combining marks to standard codepoints, it is possible their canonicalization class may change 419 | -- If so, make sure they are still put in the correct order 420 | assert(utf8.normalize_nfc("\200\135\204\163\204\169") == "\225\186\185\204\169\204\145") 421 | -- This test case caused an out-of-bounds read where my code tried to sort an empty array 422 | assert(utf8.normalize_nfc("\225\190\129\204\129") == "\225\190\133") 423 | -- After converting one codepoint to two, as required by the NFC normalization tables, 424 | -- if the 2nd resulting codepoint is a combining mark, we have to be ready to re-order 425 | -- it with any following combining marks 426 | assert(utf8.normalize_nfc("\224\165\152\204\184") == "\224\164\149\204\184\224\164\188") 427 | -- It can also happen that a codepoint converts to a starter followed by TWO combining marks, 428 | -- and we must be able to reorder BOTH of those combining marks with a following combining mark 429 | assert(utf8.normalize_nfc("\239\172\172\204\184") == "\215\169\204\184\214\188\215\129") 430 | -- It can even happen that a deprecated 'starter' codepoint (canonicalization class = 0) 431 | -- can convert to 'combining mark' codepoints (canonicalization class != 0) 432 | assert(utf8.normalize_nfc("\223\179\224\189\179") == "\224\189\177\224\189\178\223\179") 433 | -- In certain cases, we did not properly check if a combining mark was blocked from 434 | -- combining with the preceding starter codepoint (by another combining mark with 435 | -- the same canonicalization class) 436 | assert(utf8.normalize_nfc("\196\148\204\162\204\167") == "\196\148\204\162\204\167") 437 | assert(utf8.normalize_nfc("\200\148\204\160\204\148\204\164") == "\200\148\204\160\204\164\204\148") 438 | -- It is possible that a codepoint which is composed from a starter and combining mark 439 | -- might be decomposed, then the resulting starter might be decomposed AGAIN, then 440 | -- those two resulting combining marks might be reordered with a following combining 441 | -- mark 442 | assert(utf8.normalize_nfc("\199\154\204\164") == "\225\185\179\204\136\204\140") 443 | -- When a codepoint decomposes to a starter followed by 2 combining marks, we need to 444 | -- make sure those combining marks are in the right order with any following ones 445 | assert(utf8.normalize_nfc("\199\160\205\129\204\168") == "\196\132\204\135\204\132\204\129") 446 | -- Fixing another issue with ordering of combining marks after a codepoint decomposes 447 | -- to a starter followed by 1 or 2 combining marks: 448 | assert(utf8.normalize_nfc("\199\155\204\155\204\131\204\155") == "\198\175\204\155\204\136\204\128\204\131") 449 | 450 | 451 | -- Official set of test cases for grapheme cluster segmentation, provided by Unicode Consortium 452 | local grapheme_test_cases = {} 453 | f = io.open('GraphemeBreakTest.txt', 'r') 454 | for line in f:lines() do 455 | if not line:match("^#") and not line:match("^@") then 456 | line = line:gsub("#.*", "") 457 | line = line:gsub("^%s*÷%s*", "") 458 | line = line:gsub("%s*÷%s*$", "") 459 | local clusters = { "" } 460 | for str in line:gmatch("%S+") do 461 | if str == '×' then 462 | -- do nothing 463 | elseif str == '÷' then 464 | table.insert(clusters, "") -- start a new cluster 465 | else 466 | clusters[#clusters] = clusters[#clusters]..utf8.char(tonumber(str, 
16)) 467 | end 468 | end 469 | table.insert(grapheme_test_cases, { str=table.concat(clusters), clusters=clusters }) 470 | end 471 | end 472 | 473 | 474 | -- test grapheme_indices 475 | for _,case in ipairs(grapheme_test_cases) do 476 | local actual_clusters = {} 477 | for start,stop in utf8.grapheme_indices(case.str) do 478 | table.insert(actual_clusters, case.str:sub(start, stop)) 479 | end 480 | assert(#actual_clusters == #case.clusters) 481 | for i,cluster in ipairs(case.clusters) do 482 | assert(actual_clusters[i] == cluster) 483 | end 484 | end 485 | 486 | -- try iterating over grapheme clusters in a substring 487 | local clusters = {} 488 | for a,b in utf8.grapheme_indices('ひらがな', 4, 9) do 489 | table.insert(clusters, a) 490 | table.insert(clusters, b) 491 | end 492 | for idx,value in ipairs({ 4, 6, 7, 9 }) do 493 | assert(clusters[idx] == value) 494 | end 495 | 496 | -- try private use codepoint followed by a combining character 497 | clusters = {} 498 | for a,b in utf8.grapheme_indices('\239\128\128\204\154') do 499 | table.insert(clusters, a) 500 | table.insert(clusters, b) 501 | end 502 | for idx,value in ipairs({ 1, 5 }) do 503 | assert(clusters[idx] == value) 504 | end 505 | 506 | 507 | print "OK" 508 | 509 | -- cc: run='lua -- $input' 510 | 511 | -------------------------------------------------------------------------------- /test_compat.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require 'lua-utf8' 2 | print('testing utf8 library') 3 | 4 | assert(utf8.sub("123456789",2,4) == "234") 5 | assert(utf8.sub("123456789",7) == "789") 6 | assert(utf8.sub("123456789",7,6) == "") 7 | assert(utf8.sub("123456789",7,7) == "7") 8 | assert(utf8.sub("123456789",0,0) == "") 9 | assert(utf8.sub("123456789",-10,10) == "123456789") 10 | assert(utf8.sub("123456789",1,9) == "123456789") 11 | assert(utf8.sub("123456789",-10,-20) == "") 12 | assert(utf8.sub("123456789",-1) == "9") 13 | assert(utf8.sub("123456789",-4) == "6789") 14 | assert(utf8.sub("123456789",-6, -4) == "456") 15 | if not _no32 then 16 | assert(utf8.sub("123456789",-2^31, -4) == "123456") 17 | assert(utf8.sub("123456789",-2^31, 2^31 - 1) == "123456789") 18 | assert(utf8.sub("123456789",-2^31, -2^31) == "") 19 | end 20 | assert(utf8.sub("\000123456789",3,5) == "234") 21 | assert(utf8.sub("\000123456789", 8) == "789") 22 | print('+') 23 | 24 | assert(utf8.find("123456789", "345") == 3) 25 | a,b = utf8.find("123456789", "345") 26 | assert(utf8.sub("123456789", a, b) == "345") 27 | assert(utf8.find("1234567890123456789", "345", 3) == 3) 28 | assert(utf8.find("1234567890123456789", "345", 4) == 13) 29 | assert(utf8.find("1234567890123456789", "346", 4) == nil) 30 | assert(utf8.find("1234567890123456789", ".45", -9) == 13) 31 | assert(utf8.find("abcdefg", "\0", 5, 1) == nil) 32 | assert(utf8.find("", "") == 1) 33 | assert(utf8.find("", "", 1) == 1) 34 | assert(not utf8.find("", "", 2)) 35 | assert(utf8.find('', 'aaa', 1) == nil) 36 | assert(('alo(.)alo'):find('(.)', 1, 1) == 4) 37 | print('+') 38 | 39 | assert(utf8.len("") == 0) 40 | assert(utf8.len("\0\0\0") == 3) 41 | assert(utf8.len("1234567890") == 10) 42 | 43 | local E = utf8.escape 44 | assert(utf8.byte("a") == 97) 45 | assert(utf8.byte(E"%228") > 127) 46 | assert(utf8.byte(utf8.char(255)) == 255) 47 | assert(utf8.byte(utf8.char(0)) == 0) 48 | assert(utf8.byte("\0") == 0) 49 | assert(utf8.byte("\0\0alo\0x", -1) == string.byte('x')) 50 | assert(utf8.byte("ba", 2) == 97) 51 | assert(utf8.byte("\n\n", 2, -1) == 10) 52 | 
assert(utf8.byte("\n\n", 2, 2) == 10) 53 | assert(utf8.byte("") == nil) 54 | assert(utf8.byte("hi", -3) == nil) 55 | assert(utf8.byte("hi", 3) == nil) 56 | assert(utf8.byte("hi", 9, 10) == nil) 57 | assert(utf8.byte("hi", 2, 1) == nil) 58 | assert(utf8.char() == "") 59 | assert(utf8.char(0, 255, 0) == utf8.escape"%0%255%0") 60 | assert(utf8.char(0, utf8.byte(E"%228"), 0) == E"%0%xe4%0") 61 | assert(utf8.char(utf8.byte(E"%228l\0髐", 1, -1)) == E"%xe4l\0髐") 62 | assert(utf8.char(utf8.byte(E"%228l\0髐", 1, 0)) == "") 63 | assert(utf8.char(utf8.byte(E"%228l\0髐", -10, 100)) == E"%xe4l\0髐") 64 | print('+') 65 | 66 | assert(utf8.upper("ab\0c") == "AB\0C") 67 | assert(utf8.lower("\0ABCc%$") == "\0abcc%$") 68 | 69 | assert(utf8.reverse"" == "") 70 | assert(utf8.reverse"\0\1\2\3" == "\3\2\1\0") 71 | assert(utf8.reverse"\0001234" == "4321\0") 72 | 73 | for i=0,30 do assert(utf8.len(string.rep('a', i)) == i) end 74 | 75 | print('+') 76 | 77 | 78 | print('OK') 79 | -------------------------------------------------------------------------------- /test_pm.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require 'lua-utf8' 2 | 3 | print('testing pattern matching') 4 | 5 | function f(s, p) 6 | local i,e = utf8.find(s, p) 7 | if i then return utf8.sub(s, i, e) end 8 | end 9 | 10 | function f1(s, p) 11 | p = utf8.gsub(p, "%%([0-9])", function (s) return "%" .. (tonumber(s)+1) end) 12 | p = utf8.gsub(p, "^(^?)", "%1()", 1) 13 | p = utf8.gsub(p, "($?)$", "()%1", 1) 14 | local t = {utf8.match(s, p)} 15 | return utf8.sub(s, t[1], t[#t] - 1) 16 | end 17 | 18 | a,b = utf8.find('', '') -- empty patterns are tricky 19 | assert(a == 1 and b == 0); 20 | a,b = utf8.find('alo', '') 21 | assert(a == 1 and b == 0) 22 | a,b = utf8.find('a\0o a\0o a\0o', 'a', 1) -- first position 23 | assert(a == 1 and b == 1) 24 | a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 2) -- starts in the midle 25 | assert(a == 5 and b == 7) 26 | a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 9) -- starts in the midle 27 | assert(a == 9 and b == 11) 28 | a,b = utf8.find('a\0a\0a\0a\0\0ab', '\0ab', 2); -- finds at the end 29 | assert(a == 9 and b == 11); 30 | a,b = utf8.find('a\0a\0a\0a\0\0ab', 'b') -- last position 31 | assert(a == 11 and b == 11) 32 | assert(utf8.find('a\0a\0a\0a\0\0ab', 'b\0') == nil) -- check ending 33 | assert(utf8.find('', '\0') == nil) 34 | assert(utf8.find('alo123alo', '12') == 4) 35 | assert(utf8.find('alo123alo', '^12') == nil) 36 | 37 | assert(utf8.match("aaab", ".*b") == "aaab") 38 | assert(utf8.match("aaa", ".*a") == "aaa") 39 | assert(utf8.match("b", ".*b") == "b") 40 | 41 | assert(utf8.match("aaab", ".+b") == "aaab") 42 | assert(utf8.match("aaa", ".+a") == "aaa") 43 | assert(not utf8.match("b", ".+b")) 44 | 45 | assert(utf8.match("aaab", ".?b") == "ab") 46 | assert(utf8.match("aaa", ".?a") == "aa") 47 | assert(utf8.match("b", ".?b") == "b") 48 | 49 | assert(f('aloALO', '%l*') == 'alo') 50 | assert(f('aLo_ALO', '%a*') == 'aLo') 51 | 52 | assert(f(" \n\r*&\n\r xuxu \n\n", "%g%g%g+") == "xuxu") 53 | 54 | assert(f('aaab', 'a*') == 'aaa'); 55 | assert(f('aaa', '^.*$') == 'aaa'); 56 | assert(f('aaa', 'b*') == ''); 57 | assert(f('aaa', 'ab*a') == 'aa') 58 | assert(f('aba', 'ab*a') == 'aba') 59 | assert(f('aaab', 'a+') == 'aaa') 60 | assert(f('aaa', '^.+$') == 'aaa') 61 | assert(f('aaa', 'b+') == nil) 62 | assert(f('aaa', 'ab+a') == nil) 63 | assert(f('aba', 'ab+a') == 'aba') 64 | assert(f('a$a', '.$') == 'a') 65 | assert(f('a$a', '.%$') == 'a$') 66 | assert(f('a$a', '.$.') == 'a$a') 
67 | assert(f('a$a', '$$') == nil) 68 | assert(f('a$b', 'a$') == nil) 69 | assert(f('a$a', '$') == '') 70 | assert(f('', 'b*') == '') 71 | assert(f('aaa', 'bb*') == nil) 72 | assert(f('aaab', 'a-') == '') 73 | assert(f('aaa', '^.-$') == 'aaa') 74 | assert(f('aabaaabaaabaaaba', 'b.*b') == 'baaabaaabaaab') 75 | assert(f('aabaaabaaabaaaba', 'b.-b') == 'baaab') 76 | assert(f('alo xo', '.o$') == 'xo') 77 | assert(f(' \n isto é assim', '%S%S*') == 'isto') 78 | assert(f(' \n isto é assim', '%S*$') == 'assim') 79 | assert(f(' \n isto é assim', '[a-z]*$') == 'assim') 80 | assert(f('um caracter ? extra', '[^%sa-z]') == '?') 81 | assert(f('', 'a?') == '') 82 | assert(f('á', 'á?') == 'á') 83 | assert(f('ábl', 'á?b?l?') == 'ábl') 84 | assert(f(' ábl', 'á?b?l?') == '') 85 | assert(f('aa', '^aa?a?a') == 'aa') 86 | assert(f(']]]áb', '[^]]') == 'á') 87 | assert(f("0alo alo", "%x*") == "0a") 88 | assert(f("alo alo", "%C+") == "alo alo") 89 | print('+') 90 | 91 | assert(f1('alo alx 123 b\0o b\0o', '(..*) %1') == "b\0o b\0o") 92 | assert(f1('axz123= 4= 4 34', '(.+)=(.*)=%2 %1') == '3= 4= 4 3') 93 | assert(f1('=======', '^(=*)=%1$') == '=======') 94 | assert(utf8.match('==========', '^([=]*)=%1$') == nil) 95 | 96 | local function range (i, j) 97 | if i <= j then 98 | return i, range(i+1, j) 99 | end 100 | end 101 | 102 | local abc = utf8.char(range(0, 255)); 103 | 104 | assert(utf8.len(abc) == 256) 105 | assert(string.len(abc) == 384) 106 | 107 | function strset (p) 108 | local res = {s=''} 109 | utf8.gsub(abc, p, function (c) res.s = res.s .. c end) 110 | return res.s 111 | end; 112 | 113 | local E = utf8.escape 114 | assert(utf8.len(strset(E'[%200-%210]')) == 11) 115 | 116 | assert(strset('[a-z]') == "abcdefghijklmnopqrstuvwxyz") 117 | assert(strset('[a-z%d]') == strset('[%da-uu-z]')) 118 | assert(strset('[a-]') == "-a") 119 | assert(strset('[^%W]') == strset('[%w]')) 120 | assert(strset('[]%%]') == '%]') 121 | assert(strset('[a%-z]') == '-az') 122 | assert(strset('[%^%[%-a%]%-b]') == '-[]^ab') 123 | assert(strset('%Z') == strset(E'[%1-%255]')) 124 | assert(strset('.') == strset(E'[%1-%255%%z]')) 125 | print('+'); 126 | 127 | assert(utf8.match("alo xyzK", "(%w+)K") == "xyz") 128 | assert(utf8.match("254 K", "(%d*)K") == "") 129 | assert(utf8.match("alo ", "(%w*)$") == "") 130 | assert(utf8.match("alo ", "(%w+)$") == nil) 131 | assert(utf8.find("(álo)", "%(á") == 1) 132 | local a, b, c, d, e = utf8.match("âlo alo", "^(((.).).* (%w*))$") 133 | assert(a == 'âlo alo' and b == 'âl' and c == 'â' and d == 'alo' and e == nil) 134 | a, b, c, d = utf8.match('0123456789', '(.+(.?)())') 135 | assert(a == '0123456789' and b == '' and c == 11 and d == nil) 136 | print('+') 137 | 138 | assert(utf8.gsub('ülo ülo', 'ü', 'x') == 'xlo xlo') 139 | assert(utf8.gsub('alo úlo ', ' +$', '') == 'alo úlo') -- trim 140 | assert(utf8.gsub(' alo alo ', '^%s*(.-)%s*$', '%1') == 'alo alo') -- double trim 141 | assert(utf8.gsub('alo alo \n 123\n ', '%s+', ' ') == 'alo alo 123 ') 142 | t = "abç d" 143 | a, b = utf8.gsub(t, '(.)', '%1@') 144 | assert('@'..a == utf8.gsub(t, '', '@') and b == 5) 145 | a, b = utf8.gsub('abçd', '(.)', '%0@', 2) 146 | assert(a == 'a@b@çd' and b == 2) 147 | assert(utf8.gsub('alo alo', '()[al]', '%1') == '12o 56o') 148 | assert(utf8.gsub("abc=xyz", "(%w*)(%p)(%w+)", "%3%2%1-%0") == 149 | "xyz=abc-abc=xyz") 150 | assert(utf8.gsub("abc", "%w", "%1%0") == "aabbcc") 151 | assert(utf8.gsub("abc", "%w+", "%0%1") == "abcabc") 152 | assert(utf8.gsub('áéí', '$', '\0óú') == 'áéí\0óú') 153 | assert(utf8.gsub('', '^', 'r') == 
'r') 154 | assert(utf8.gsub('', '$', 'r') == 'r') 155 | print('+') 156 | 157 | assert(utf8.gsub("um (dois) tres (quatro)", "(%(%w+%))", utf8.upper) == 158 | "um (DOIS) tres (QUATRO)") 159 | 160 | do 161 | local function setglobal (n,v) rawset(_G, n, v) end 162 | utf8.gsub("a=roberto,roberto=a", "(%w+)=(%w%w*)", setglobal) 163 | assert(_G.a=="roberto" and _G.roberto=="a") 164 | end 165 | 166 | function f(a,b) return utf8.gsub(a,'.',b) end 167 | assert(utf8.gsub("trocar tudo em |teste|b| é |beleza|al|", "|([^|]*)|([^|]*)|", f) == 168 | "trocar tudo em bbbbb é alalalalalal") 169 | 170 | local function dostring (s) return (loadstring or load)(s)() or "" end 171 | assert(utf8.gsub("alo $a=1$ novamente $return a$", "$([^$]*)%$", dostring) == 172 | "alo novamente 1") 173 | 174 | x = utf8.gsub("$local utf8=require'lua-utf8' x=utf8.gsub('alo', '.', utf8.upper)$ assim vai para $return x$", 175 | "$([^$]*)%$", dostring) 176 | assert(x == ' assim vai para ALO') 177 | 178 | t = {} 179 | s = 'a alo jose joao' 180 | r = utf8.gsub(s, '()(%w+)()', function (a,w,b) 181 | assert(utf8.len(w) == b-a); 182 | t[a] = b-a; 183 | end) 184 | assert(s == r and t[1] == 1 and t[3] == 3 and t[7] == 4 and t[13] == 4) 185 | 186 | 187 | function isbalanced (s) 188 | return utf8.find(utf8.gsub(s, "%b()", ""), "[()]") == nil 189 | end 190 | 191 | assert(isbalanced("(9 ((8))(\0) 7) \0\0 a b ()(c)() a")) 192 | assert(not isbalanced("(9 ((8) 7) a b (\0 c) a")) 193 | assert(utf8.gsub("alo 'oi' alo", "%b''", '"') == 'alo " alo') 194 | 195 | 196 | local t = {"apple", "orange", "lime"; n=0} 197 | assert(utf8.gsub("x and x and x", "x", function () t.n=t.n+1; return t[t.n] end) 198 | == "apple and orange and lime") 199 | 200 | t = {n=0} 201 | utf8.gsub("first second word", "%w%w*", function (w) t.n=t.n+1; t[t.n] = w end) 202 | assert(t[1] == "first" and t[2] == "second" and t[3] == "word" and t.n == 3) 203 | 204 | t = {n=0} 205 | assert(utf8.gsub("first second word", "%w+", 206 | function (w) t.n=t.n+1; t[t.n] = w end, 2) == "first second word") 207 | assert(t[1] == "first" and t[2] == "second" and t[3] == nil) 208 | 209 | assert(not pcall(utf8.gsub, "alo", "(.", print)) 210 | assert(not pcall(utf8.gsub, "alo", ".)", print)) 211 | assert(not pcall(utf8.gsub, "alo", "(.", {})) 212 | assert(not pcall(utf8.gsub, "alo", "(.)", "%2")) 213 | assert(not pcall(utf8.gsub, "alo", "(%1)", "a")) 214 | assert(not pcall(utf8.gsub, "alo", "(%0)", "a")) 215 | 216 | -- bug since 2.5 (C-stack overflow) 217 | do 218 | local function f (size) 219 | local s = string.rep("a", size) 220 | local p = string.rep(".?", size) 221 | return pcall(utf8.match, s, p) 222 | end 223 | local r, m = f(80) 224 | assert(r and #m == 80) 225 | r, m = f(200000) 226 | assert(not r and utf8.find(m, "too complex")) 227 | end 228 | 229 | if not _soft then 230 | -- big strings 231 | local a = string.rep('a', 300000) 232 | assert(utf8.find(a, '^a*.?$')) 233 | assert(not utf8.find(a, '^a*.?b$')) 234 | assert(utf8.find(a, '^a-.?$')) 235 | 236 | -- bug in 5.1.2 237 | a = string.rep('a', 10000) .. 
string.rep('b', 10000) 238 | assert(not pcall(utf8.gsub, a, 'b')) 239 | end 240 | 241 | -- recursive nest of gsubs 242 | function rev (s) 243 | return utf8.gsub(s, "(.)(.+)", function (c,s1) return rev(s1)..c end) 244 | end 245 | 246 | local x = "abcdef" 247 | assert(rev(rev(x)) == x) 248 | 249 | 250 | -- gsub with tables 251 | assert(utf8.gsub("alo alo", ".", {}) == "alo alo") 252 | assert(utf8.gsub("alo alo", "(.)", {a="AA", l=""}) == "AAo AAo") 253 | assert(utf8.gsub("alo alo", "(.).", {a="AA", l="K"}) == "AAo AAo") 254 | assert(utf8.gsub("alo alo", "((.)(.?))", {al="AA", o=false}) == "AAo AAo") 255 | 256 | assert(utf8.gsub("alo alo", "().", {2,5,6}) == "256 alo") 257 | 258 | t = {}; setmetatable(t, {__index = function (t,s) return utf8.upper(s) end}) 259 | assert(utf8.gsub("a alo b hi", "%w%w+", t) == "a ALO b HI") 260 | 261 | 262 | -- tests for gmatch 263 | local a = 0 264 | for i in utf8.gmatch('abcde', '()') do assert(i == a+1); a=i end 265 | assert(a==6) 266 | 267 | t = {n=0} 268 | for w in utf8.gmatch("first second word", "%w+") do 269 | t.n=t.n+1; t[t.n] = w 270 | end 271 | assert(t[1] == "first" and t[2] == "second" and t[3] == "word") 272 | 273 | t = {3, 6, 9} 274 | for i in utf8.gmatch ("xuxx uu ppar r", "()(.)%2") do 275 | assert(i == table.remove(t, 1)) 276 | end 277 | assert(#t == 0) 278 | 279 | t = {} 280 | for i,j in utf8.gmatch("13 14 10 = 11, 15= 16, 22=23", "(%d+)%s*=%s*(%d+)") do 281 | t[i] = j 282 | end 283 | a = 0 284 | for k,v in pairs(t) do assert(k+1 == v+0); a=a+1 end 285 | assert(a == 3) 286 | 287 | 288 | -- tests for `%f' (`frontiers') 289 | 290 | assert(utf8.gsub("aaa aa a aaa a", "%f[%w]a", "x") == "xaa xa x xaa x") 291 | assert(utf8.gsub("[[]] [][] [[[[", "%f[[].", "x") == "x[]] x]x] x[[[") 292 | assert(utf8.gsub("01abc45de3", "%f[%d]", ".") == ".01abc.45de.3") 293 | assert(utf8.gsub("01abc45 de3x", "%f[%D]%w", ".") == "01.bc45 de3.") 294 | local u = utf8.escape 295 | assert(utf8.gsub("function", u"%%f[%1-%255]%%w", ".") == ".unction") 296 | assert(utf8.gsub("function", u"%%f[^%1-%255]", ".") == "function.") 297 | 298 | assert(utf8.find("a", "%f[a]") == 1) 299 | assert(utf8.find("a", "%f[^%z]") == 1) 300 | assert(utf8.find("a", "%f[^%l]") == 2) 301 | assert(utf8.find("aba", "%f[a%z]") == 3) 302 | assert(utf8.find("aba", "%f[%z]") == 4) 303 | assert(not utf8.find("aba", "%f[%l%z]")) 304 | assert(not utf8.find("aba", "%f[^%l%z]")) 305 | 306 | local i, e = utf8.find(" alo aalo allo", "%f[%S].-%f[%s].-%f[%S]") 307 | assert(i == 2 and e == 5) 308 | local k = utf8.match(" alo aalo allo", "%f[%S](.-%f[%s].-%f[%S])") 309 | assert(k == 'alo ') 310 | 311 | local a = {1, 5, 9, 14, 17,} 312 | for k in utf8.gmatch("alo alo th02 is 1hat", "()%f[%w%d]") do 313 | assert(table.remove(a, 1) == k) 314 | end 315 | assert(#a == 0) 316 | 317 | 318 | -- malformed patterns 319 | local function malform (p, m) 320 | m = m or "malformed" 321 | local r, msg = pcall(utf8.find, "a", p) 322 | assert(not r and utf8.find(msg, m)) 323 | end 324 | 325 | malform("[a") 326 | malform("[]") 327 | malform("[^]") 328 | malform("[a%]") 329 | malform("[a%") 330 | malform("%b") 331 | malform("%ba") 332 | malform("%") 333 | malform("%f", "missing") 334 | 335 | -- \0 in patterns 336 | assert(utf8.match("ab\0\1\2c", "[\0-\2]+") == "\0\1\2") 337 | assert(utf8.match("ab\0\1\2c", "[\0-\0]+") == "\0") 338 | assert(utf8.find("b$a", "$\0?") == 2) 339 | assert(utf8.find("abc\0efg", "%\0") == 4) 340 | assert(utf8.match("abc\0efg\0\1e\1g", "%b\0\1") == "\0efg\0\1e\1") 341 | assert(utf8.match("abc\0\0\0", "%\0+") 
== "\0\0\0") 342 | assert(utf8.match("abc\0\0\0", "%\0%\0?") == "\0\0") 343 | 344 | -- magic char after \0 345 | assert(utf8.find("abc\0\0","\0.") == 4) 346 | assert(utf8.find("abcx\0\0abc\0abc","x\0\0abc\0a.") == 4) 347 | 348 | print('OK') 349 | --------------------------------------------------------------------------------