├── utf8.lua
├── utf8.md
└── utf8_test.lua


/utf8.lua:
--------------------------------------------------------------------------------
  1 | 
  2 | --UTF-8 encoding and decoding for LuaJIT
  3 | --Written by Cosmin Apreutesei. Public Domain.
  4 | 
  5 | if not ... then require'utf8_test'; return end
  6 | 
  7 | local ffi = require'ffi'
  8 | local bit = require'bit'
  9 | local band, shl, shr = bit.band, bit.lshift, bit.rshift
 10 | local utf8 = {}
 11 | 
 12 | local uint32_array = ffi.typeof'uint32_t[?]'
 13 | local uint8_array = ffi.typeof'uint8_t[?]'
 14 | local uint32_ptr = ffi.typeof'const uint32_t*'
 15 | local uint8_ptr = ffi.typeof'const uint8_t*'
 16 | 
 17 | local function tobuf(s, len, ct, sizeof_ct)
 18 | 	if type(s) == 'string' then
 19 | 		return s, ffi.cast(ct or uint8_ptr, s),
 20 | 			math.min(len or 1/0, #s / (sizeof_ct or 1))
 21 | 	else
 22 | 		return nil, s, len
 23 | 	end
 24 | end
 25 | 
 26 | -- byte 1     byte 2      byte 3     byte 4
 27 | --------------------------------------------
 28 | -- 00 - 7F
 29 | -- C2 - DF    80 - BF
 30 | -- E0         A0 - BF     80 - BF
 31 | -- E1 - EC    80 - BF     80 - BF
 32 | -- ED         80 - 9F     80 - BF
 33 | -- EE - EF    80 - BF     80 - BF
 34 | -- F0         90 - BF     80 - BF    80 - BF
 35 | -- F1 - F3    80 - BF     80 - BF    80 - BF
 36 | -- F4         80 - 8F     80 - BF    80 - BF
 37 | 
 38 | function utf8.next(buf, len, i)
 39 | 	if i >= len then
 40 | 		return nil --EOS
 41 | 	end
 42 | 	local c1 = buf[i]
 43 | 	i = i + 1
 44 | 	if c1 <= 0x7F then
 45 | 		return i, c1 --ASCII
 46 | 	elseif c1 < 0xC2 then
 47 | 		--invalid
 48 | 	elseif c1 <= 0xDF then --2-byte
 49 | 		if i < len then
 50 | 			local c2 = buf[i]
 51 | 			if c2 >= 0x80 and c2 <= 0xBF then
 52 | 				return i + 1,
 53 | 				      shl(band(c1, 0x1F), 6)
 54 | 				        + band(c2, 0x3F)
 55 | 			end
 56 | 		end
 57 | 	elseif c1 <= 0xEF then --3-byte
 58 | 		if i < len + 1 then
 59 | 			local c2, c3 = buf[i], buf[i+1]
 60 | 			if not (
 61 | 				   c2 < 0x80 or c2 > 0xBF
 62 | 				or c3 < 0x80 or c3 > 0xBF
 63 | 				or (c1 == 0xE0 and c2 < 0xA0)
 64 | 				or (c1 == 0xED and c2 > 0x9F)
 65 | 			) then
 66 | 				return i + 2,
 67 | 				      shl(band(c1, 0x0F), 12)
 68 | 				    + shl(band(c2, 0x3F), 6)
 69 | 				        + band(c3, 0x3F)
 70 | 			end
 71 | 		end
 72 | 	elseif c1 <= 0xF4 then --4-byte
 73 | 		if i < len + 2 then
 74 | 			local c2, c3, c4 = buf[i], buf[i+1], buf[i+2]
 75 | 			if not (
 76 | 				   c2 < 0x80 or c2 > 0xBF
 77 | 				or c3 < 0x80 or c3 > 0xBF
 78 | 				or c3 < 0x80 or c3 > 0xBF
 79 | 				or c4 < 0x80 or c4 > 0xBF
 80 | 				or (c1 == 0xF0 and c2 < 0x90)
 81 | 				or (c1 == 0xF4 and c2 > 0x8F)
 82 | 			) then
 83 | 				return i + 3,
 84 | 				     shl(band(c1, 0x07), 18)
 85 | 				   + shl(band(c2, 0x3F), 12)
 86 | 				   + shl(band(c3, 0x3F), 6)
 87 | 				       + band(c4, 0x3F)
 88 | 			end
 89 | 		end
 90 | 	end
 91 | 	return i, nil, c1 --invalid
 92 | end
 93 | 
 94 | function utf8.prev(buf, len, i)
 95 | 	if i <= 0 then
 96 | 		return nil
 97 | 	end
 98 | 	local j = i
 99 | 	while i > 0 do --go back to a previous possible start byte
100 | 		i = i - 1
101 | 		local c = buf[i]
102 | 		if c < 0x80 or c > 0xBF or i == j-4 then
103 | 			break
104 | 		end
105 | 	end
106 | 	while true do --go forward to the real previous character
107 | 		local i1, c, b = utf8.next(buf, len, i)
108 | 		i1 = i1 or len
109 | 		if i1 == j then
110 | 			return i, c, b
111 | 		end
112 | 		i = i1
113 | 		assert(i < j)
114 | 	end
115 | end
116 | 
--Iterate the codepoints of string `s`, optionally starting at the 1-based
--byte index `i`. Each step returns `next_i, code, byte`: the 1-based index
--of the next character, the codepoint (nil for an invalid byte) and the
--invalid byte (nil for a valid codepoint).
function utf8.chars(s, i)
	--`str` anchors the string: `buf` is a raw pointer into it and does not
	--keep it alive by itself, so the iterator must hold a reference for as
	--long as it can still read from `buf`.
	local str, buf, len = tobuf(s)
	i = i and i-1 or 0
	return function()
		if not i then --already finished; stay finished instead of raising
			return nil
		end
		local c, b
		i, c, b = utf8.next(buf, len, i)
		if not i then
			str = nil --done reading: release the anchor
			return nil
		end
		return i+1, c, b
	end
end
127 | 
--Decode a UTF-8 string or buffer into UTF-32 codepoints.
--  * `out == nil`: the output buffer is allocated here with `outlen + 1`
--    elements; `outlen` is found with a counting pass when not given.
--  * `out == false`: nothing is written, only the counts are returned.
--  * `repl`: codepoint stored in place of each invalid byte, or
--    'iso-8859-1' to store the invalid byte itself; when nil, invalid
--    bytes are skipped.
--Returns `out, n, p` (or `n, p` when `out` is false) where `n` is the number
--of codepoints produced and `p` the number of invalid bytes seen.
--Returns `nil, 'overflow', i` when more than `outlen` codepoints are needed,
--`i` being the input index reached.
function utf8.decode(buf, len, out, outlen, repl)
	local _, buf, len = tobuf(buf, len)
	if out == nil then
		if not outlen then
			outlen = utf8.decode(buf, len, false, nil, repl)
		end
		out = uint32_array(outlen + 1)
	end
	local limit = outlen or 1/0
	local latin1 = repl == 'iso-8859-1'
	local count, invalid, pos = 0, 0, 0
	local next_pos, code = utf8.next(buf, len, pos)
	while next_pos do
		if not code then
			invalid = invalid + 1
			if latin1 then
				code = buf[pos] --interpret as iso-8859-1 like browsers do
			else
				code = repl
			end
		end
		if code then
			if count >= limit then
				return nil, 'overflow', pos
			end
			if out then
				out[count] = code
			end
			count = count + 1
		end
		pos = next_pos
		next_pos, code = utf8.next(buf, len, pos)
	end
	if not out then
		return count, invalid
	end
	return out, count, invalid
end
168 | 
--Number of bytes needed to encode codepoint `c` in UTF-8, or `invalid_size`
--when `c` cannot be encoded (negative, above 0x10FFFF, or a surrogate).
local function char_byte_count(c, invalid_size)
	local surrogate = c >= 0xD800 and c <= 0xDFFF
	if c < 0 or c > 0x10FFFF or surrogate then
		return invalid_size
	end
	if c <= 0x7F then return 1 end
	if c <= 0x7FF then return 2 end
	if c <= 0xFFFF then return 3 end
	return 4
end
182 | 
--Total number of UTF-8 bytes needed to encode the `len` codepoints of `buf`
--(indexed from 0). Invalid codepoints count as the encoded size of `repl`,
--or as 0 bytes when `repl` is not given or is itself invalid.
local function byte_count(buf, len, repl)
	local invalid_size = 0
	if repl then
		invalid_size = char_byte_count(repl, 0)
	end
	local total = 0
	for k = 0, len-1 do
		total = total + char_byte_count(buf[k], invalid_size)
	end
	return total
end
191 | 
--Encode one codepoint as UTF-8.
--Returns `n, b1, b2, b3, b4`: the byte count followed by the bytes (bytes
--past `n` are nil).
--Invalid codepoints (negative, surrogates D800-DFFF, above 10FFFF) are
--replaced by `repl` when given; otherwise, or when `repl` is itself invalid,
--nothing is produced and `n` is 0.
local function encode_char(c, repl)
	local n, b1, b2, b3, b4 = 0
	--negative values are rejected here too, matching char_byte_count();
	--otherwise they would take the 1-byte branch and yield an invalid byte.
	if c < 0 or (c >= 0xD800 and c <= 0xDFFF) then
		if repl then
			--`repl` is not passed down, so an invalid `repl` yields 0 bytes
			return encode_char(repl)
		end
	elseif c <= 0x7F then
		b1 = c
		n = 1
	elseif c <= 0x7FF then
		b2 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b1 = 0xC0 + c
		n = 2
	elseif c <= 0xFFFF then
		b3 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b2 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b1 = 0xE0 + c
		n = 3
	elseif c <= 0x10FFFF then
		b4 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b3 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b2 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b1 = 0xF0 + c
		n = 4
	elseif repl then --above 10FFFF
		return encode_char(repl)
	end
	return n, b1, b2, b3, b4
end
221 | 
--Encode a UTF-32 buffer (or a string holding uint32 codepoints) as UTF-8.
--  * `out == nil`: the output buffer is allocated here with `outlen + 1`
--    bytes; `outlen` is computed with a counting pass when not given.
--  * `out == false`: nothing is written; only the byte count is returned.
--  * `repl`: codepoint encoded in place of each invalid codepoint; when not
--    given, invalid codepoints are skipped.
--Returns `out, bytes` (or just `bytes` when `out` is false).
--Returns `nil, 'overflow'` when the output does not fit in `outlen` bytes.
function utf8.encode(buf, len, out, outlen, repl)
	local _, buf, len = tobuf(buf, len, uint32_ptr, 4)
	if out == nil then --allocate output buffer
		outlen = outlen or utf8.encode(buf, len, false, nil, repl)
		out = uint8_array(outlen + 1)
	elseif not out then --compute output length
		return byte_count(buf, len, repl)
	end
	--a caller-supplied buffer with no `outlen` means "no limit", the same as
	--in utf8.decode(); without this default `n > outlen` raises on nil.
	outlen = outlen or 1/0
	local j = 0
	for i = 0, len-1 do
		local n, b1, b2, b3, b4 = encode_char(buf[i], repl)
		if n > outlen then
			return nil, 'overflow'
		end
		if b1 then out[j  ] = b1 end
		if b2 then out[j+1] = b2 end
		if b3 then out[j+2] = b3 end
		if b4 then out[j+3] = b4 end
		j = j + n
		outlen = outlen - n --remaining room in the output buffer
	end
	return out, j
end
245 | 
--Encode codepoints to a UTF-8 Lua string.
--Call as `encode_chars({c1, ...}, repl)` or as `encode_chars(c1, ...)`.
--In the table form, invalid codepoints are replaced by `repl` when given;
--in the varargs form (and when `repl` is missing) they are skipped.
function utf8.encode_chars(...)
	local char = string.char
	local t, repl = ...
	local count
	if type(t) == 'table' then
		count = #t
	else
		--pack the varargs once: calling select(i, ...) for every codepoint
		--is quadratic in the number of arguments. The varargs form takes no
		--replacement codepoint, so `repl` (which holds c2 here) is reset.
		t, repl, count = {...}, nil, select('#', ...)
	end
	local out = {}
	local j = 1
	for i = 1, count do
		local n, b1, b2, b3, b4 = encode_char(t[i], repl)
		if b1 then out[j  ] = char(b1) end
		if b2 then out[j+1] = char(b2) end
		if b3 then out[j+2] = char(b3) end
		if b4 then out[j+3] = char(b4) end
		j = j + n
	end
	return table.concat(out)
end
275 | 
276 | return utf8
277 | 


--------------------------------------------------------------------------------
/utf8.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | tagline: UTF-8 encoding and decoding for LuaJIT
 3 | ---
 4 | 
 5 | ## `local utf8 = require'utf8'`
 6 | 
 7 | Decode and encode UTF-8 data with control over invalid bytes.
 8 | 
 9 | ---------------------------------------------------------- --------------------------------------------
10 | `utf8.next(buf, len, i) -> ni, code, byte`                 codepoint (or invalid byte) at index `i`
11 | `utf8.prev(buf, len, i) -> ni, code, byte`                 codepoint (or invalid byte) before index `i`
12 | `utf8.decode(buf, len, out, outlen, repl) -> [out, ]n, p`  decode utf-8 buffer (or get output length)
13 | `utf8.encode(buf, len, out, outlen, repl) -> [out, ]bytes` encode utf-32 buffer (or get output length)
14 | `utf8.chars(s[, start]) -> iter() -> ni, code, byte`       iterate codepoints in string
15 | `utf8.encode_chars({c1,...}, repl | c1,...) -> s`          encode codepoints to utf-8 string
16 | ---------------------------------------------------------- --------------------------------------------
17 | 
18 | ### `utf8.next(buf, len, i) -> next_i, code, byte | nil`
19 | 
20 | Return codepoint (or invalid byte) at index `i`. Return `nil` if `i >= len`.
21 | 
22 | ### `utf8.prev(buf, len, i) -> i, code, byte | nil`
23 | 
24 | Return codepoint (or invalid byte) before index `i`. Return `nil` if `i <= 0`.
25 | 
26 | ### `utf8.decode(buf, len, out, outlen, repl) -> [out, ]n, p`
27 | 
28 | Decode utf8 buffer into a utf32 buffer or get output length.
29 | 
30 |   * if `out` is `nil` the output buffer is allocated by the function.
31 |     * the buffer is n+1 codepoints thus null-terminated.
32 |   * if `out` is `false` the output buffer is not allocated or returned.
33 |   * `n, p` is the number of valid codepoints and the number of invalid bytes.
34 |   * `repl` is an optional codepoint to replace invalid bytes with.
35 |     * if `repl` is not given, invalid bytes are skipped.
36 |     * if `repl` is `'iso-8859-1'`, invalid bytes are treated as iso-8859-1
37 |     characters like browsers do.
38 |     * replaced invalid bytes are counted in `n`.
39 |   * returns `nil, 'overflow', sz` on output buffer overflow, where `sz` is
40 |   the byte size of the text that fit into the buffer.
41 | 
42 | ### `utf8.encode(buf, len, out, outlen, repl) -> [out, ]bytes`
43 | 
44 | Encode utf32 buffer into a utf8 buffer or get output length.
45 | 
46 |   * if `out` is `nil` the output buffer is allocated by the function.
47 |     * the buffer is n+1 bytes thus null-terminated.
48 |   * if `out` is `false` the output buffer is not allocated or returned.
49 |   * `repl` is an optional valid codepoint to replace invalid codepoints with.
50 |     * if `repl` is not given, invalid codepoints are skipped.
51 |   * returns `nil, 'overflow'` on error (output buffer overflow).
52 | 
53 | ### `utf8.chars(s[, start]) -> iter() -> next_i, code, byte`
54 | 
 55 | Iterate all the codepoints in a string, returning the index in the string
 56 | where the _next_ codepoint is, and the codepoint. Invalid bytes are returned
 57 | in the third return value, in which case the codepoint is `nil`.
58 | 
59 | ### `utf8.encode_chars({c1, ...}, repl) -> s` <br> `utf8.encode_chars(c1, ...) -> s`
60 | 
61 | Encode codepoints (given as an array or as separate args) to a utf-8 string.
62 | 


--------------------------------------------------------------------------------
/utf8_test.lua:
--------------------------------------------------------------------------------
  1 | local utf8 = require'utf8'
  2 | local time = require'time'
  3 | local ffi = require'ffi'
  4 | 
  5 | --add some invalid chars
  6 | local s = ''
  7 | s = s .. '\xC2\xC0'
  8 | s = s .. '\xE0\x80'
  9 | s = s .. '\xED\xA0'
 10 | s = s .. '\xF0\x80'
 11 | s = s .. '\xF4\x90'
 12 | s = s .. '\xFF\xFF'
 13 | local invalid = s
 14 | 
--well-formed sample text in several scripts, used as the valid input for
--the decode, encode and prev checks in this file.
local valid = [[


هذه هي بعض النصوص العربي
Hello there!
ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ
Sîne klâwen durh die wolken sint geslagen,
Τη γλώσσα μου έδωσαν ελληνική
На берегу пустынных волн
ვეპხის ტყაოსანი შოთა რუსთაველი
யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்,
我能吞下玻璃而不伤身体
나는 유리를 먹을 수 있어요. 그래도 아프지 않아요

]]
 30 | 
 31 | local n, p = utf8.decode(valid, nil, false)
 32 | assert(n == 300)
 33 | assert(p == 0)
 34 | 
 35 | local n1, p = utf8.decode(invalid, nil, false)
 36 | assert(n1 == 0)
 37 | assert(p == #invalid)
 38 | 
 39 | local s = valid .. invalid
 40 | local n1, p1 = utf8.decode(s, nil, false)
 41 | assert(n1 == n)
 42 | assert(p1 == p)
 43 | local n2, p2 = utf8.decode(s, nil, false, nil, 0)
 44 | assert(n2 == n1 + p1)
 45 | assert(p2 == p1)
 46 | 
 47 | local rep = math.floor(50 * 1024^2 / #s)
 48 | s = s:rep(rep)
 49 | local outbuf, n, p = utf8.decode(s)
 50 | assert(n == rep * n1)
 51 | assert(p == rep * p1)
 52 | 
 53 | local t0 = time.clock()
 54 | local bytes = 0
 55 | for i = 1, 2 do
 56 | 	local outbuf, len = utf8.decode(s, #s, outbuf, n)
 57 | 	assert(len == n)
 58 | 	bytes = bytes + #s
 59 | end
 60 | print(string.format('decode: %.2f Mbytes -> %.2f Mchars, %d MB/s',
 61 | 	#s / 1024^2, n / 1024^2, bytes / (time.clock() - t0) / 1024^2))
 62 | 
 63 | local slen = utf8.encode(outbuf, n, false)
 64 | assert(slen == #valid * rep)
 65 | local sbuf = ffi.new('uint8_t[?]', slen)
 66 | local t0 = time.clock()
 67 | local bytes = 0
 68 | for i = 1, 5 do
 69 | 	local outbuf, len = utf8.encode(outbuf, n, sbuf, slen)
 70 | 	assert(len == #valid * rep)
 71 | 	bytes = bytes + len
 72 | end
 73 | print(string.format('encode: %.2f Mchars -> %.2f Mbytes, %d MB/s',
 74 | 	n / 1024^2, slen / 1024^2, bytes / (time.clock() - t0) / 1024^2))
 75 | 
 76 | 
 77 | local t0 = time.clock()
 78 | local bytes = 0
 79 | for i = 1, 2 do
 80 | 	local len = 0
 81 | 	local i = slen
 82 | 	while true do
 83 | 		i = utf8.prev(sbuf, slen, i)
 84 | 		if not i then break end
 85 | 		len = len + 1
 86 | 	end
 87 | 	assert(len == n)
 88 | 	bytes = bytes + slen
 89 | end
 90 | print(string.format('prev:   %.2f Mbytes -> %.2f Mchars, %d MB/s',
 91 | 	#s / 1024^2, n / 1024^2, bytes / (time.clock() - t0) / 1024^2))
 92 | 
 93 | 
 94 | --test the string API
 95 | local ts = '我能吞下玻璃而不伤身体'
 96 | local t = {}
 97 | for _,c,b in utf8.chars(ts) do
 98 | 	t[#t+1] = c or b
 99 | end
100 | assert(utf8.encode_chars(unpack(t)) == ts)
101 | assert(utf8.encode_chars(t) == ts)
102 | 
--benchmark fribidi's decoder on the same input for comparison.
--the Lua variant is 5x slower but still pretty fast at 200M/s.

local fb = require'fribidi'

local outbuf, len = fb.charset_to_unicode('utf-8', s, #s)
--NOTE(review): the expected length formula is kept as found; it presumably
--reflects how fribidi counts invalid bytes -- confirm against fribidi.
local expected_len = n + p / 4 + 3
assert(len == expected_len)
local t0 = time.clock()
local bytes = 0
for _ = 1, 2 do
	local _, declen = fb.charset_to_unicode('utf-8', s, #s, outbuf, len)
	assert(declen == expected_len)
	bytes = bytes + #s
end
print(string.format('fb-dec: %.2f Mbytes -> %.2f Mchars, %d MB/s',
	#s / 1024^2, n / 1024^2, bytes / (time.clock() - t0) / 1024^2))
119 | 
120 | 


--------------------------------------------------------------------------------