-- ├── utf8.lua
-- ├── utf8.md
-- └── utf8_test.lua
--
-- /utf8.lua:
--------------------------------------------------------------------------------

--UTF-8 encoding and decoding for LuaJIT
--Written by Cosmin Apreutesei. Public Domain.

if not ... then require'utf8_test'; return end

local ffi = require'ffi'
local bit = require'bit'
local band, shl, shr = bit.band, bit.lshift, bit.rshift
local utf8 = {}

local uint32_array = ffi.typeof'uint32_t[?]'
local uint8_array = ffi.typeof'uint8_t[?]'
local uint32_ptr = ffi.typeof'const uint32_t*'
local uint8_ptr = ffi.typeof'const uint8_t*'

--Normalize the `(s, len)` input pair accepted by the public functions.
--  s          a Lua string, or anything else (passed through as the buffer).
--  len        optional element count; for strings it is clamped to the
--             number of whole elements the string holds.
--  ct         pointer ctype a string is cast to (default: const uint8_t*).
--  sizeof_ct  element size in bytes that goes with `ct` (default: 1).
--Returns `s, buf, len` for a string and `nil, s, len` for anything else.
--NOTE(review): the string is presumably returned so that callers can hold
--a reference to it while `buf` points into it -- confirm.
local function tobuf(s, len, ct, sizeof_ct)
	if type(s) == 'string' then
		--floor: a string whose size is not a multiple of the element size
		--only holds that many whole elements.
		local maxlen = math.floor(#s / (sizeof_ct or 1))
		return s, ffi.cast(ct or uint8_ptr, s), math.min(len or 1/0, maxlen)
	else
		return nil, s, len
	end
end

-- byte 1     byte 2      byte 3      byte 4
--------------------------------------------
-- 00 - 7F
-- C2 - DF    80 - BF
-- E0         A0 - BF     80 - BF
-- E1 - EC    80 - BF     80 - BF
-- ED         80 - 9F     80 - BF
-- EE - EF    80 - BF     80 - BF
-- F0         90 - BF     80 - BF     80 - BF
-- F1 - F3    80 - BF     80 - BF     80 - BF
-- F4         80 - 8F     80 - BF     80 - BF

--Decode the character that starts at 0-based index `i` of `buf[0..len-1]`.
--Returns:
--  nil                 when `i >= len` (end of input);
--  next_i, code        for a well-formed sequence (see the table above);
--  next_i, nil, byte   for an invalid byte: exactly one byte is consumed
--                      (next_i == i+1) and its value is the third result.
--A multi-byte sequence that would extend past `len` is reported as an
--invalid lead byte; elements at index `len` or beyond are never read.
function utf8.next(buf, len, i)
	if i >= len then
		return nil --EOS
	end
	local c1 = buf[i]
	i = i + 1
	if c1 <= 0x7F then
		return i, c1 --ASCII
	elseif c1 < 0xC2 then
		--invalid: not a lead byte allowed by the table above
	elseif c1 <= 0xDF then --2-byte
		if i < len then
			local c2 = buf[i]
			if c2 >= 0x80 and c2 <= 0xBF then
				return i + 1,
					  shl(band(c1, 0x1F), 6)
					+     band(c2, 0x3F)
			end
		end
	elseif c1 <= 0xEF then --3-byte
		if i + 1 < len then --both continuation bytes must be inside the buffer
			local c2, c3 = buf[i], buf[i+1]
			if not (
				   c2 < 0x80 or c2 > 0xBF
				or c3 < 0x80 or c3 > 0xBF
				or (c1 == 0xE0 and c2 < 0xA0)
				or (c1 == 0xED and c2 > 0x9F)
			) then
				return i + 2,
					  shl(band(c1, 0x0F), 12)
					+ shl(band(c2, 0x3F), 6)
					+     band(c3, 0x3F)
			end
		end
	elseif c1 <= 0xF4 then --4-byte
		if i + 2 < len then --all three continuation bytes must be inside the buffer
			local c2, c3, c4 = buf[i], buf[i+1], buf[i+2]
			if not (
				   c2 < 0x80 or c2 > 0xBF
				or c3 < 0x80 or c3 > 0xBF
				or c4 < 0x80 or c4 > 0xBF
				or (c1 == 0xF0 and c2 < 0x90)
				or (c1 == 0xF4 and c2 > 0x8F)
			) then
				return i + 3,
					  shl(band(c1, 0x07), 18)
					+ shl(band(c2, 0x3F), 12)
					+ shl(band(c3, 0x3F), 6)
					+     band(c4, 0x3F)
			end
		end
	end
	return i, nil, c1 --invalid
end

--Find the character that ends right before 0-based index `i`.
--Returns `nil` when `i <= 0`, otherwise `start_i, code, byte` where `start_i`
--is the index at which that character starts and `code` / `byte` are what
--utf8.next() reports for it (`code` is nil and `byte` set for an invalid byte).
function utf8.prev(buf, len, i)
	if i <= 0 then
		return nil
	end
	local j = i
	while i > 0 do --go back to a previous possible start byte
		i = i - 1
		local c = buf[i]
		--stop on a non-continuation byte, or after stepping back 4 bytes
		if c < 0x80 or c > 0xBF or i == j-4 then
			break
		end
	end
	while true do --go forward to the real previous character
		local i1, c, b = utf8.next(buf, len, i)
		i1 = i1 or len
		if i1 == j then
			return i, c, b
		end
		i = i1
		assert(i < j) --utf8.next() must not step over `j`
	end
end

--Iterate the characters of string `s`, starting at 1-based byte index `i`
--(default 1). Each step yields `next_i, code, byte`: the 1-based index of
--the following character, and the codepoint or (with `code` nil) the
--invalid byte, as returned by utf8.next().
function utf8.chars(s, i)
	local _, buf, len = tobuf(s)
	i = i and i-1 or 0 --the buffer is indexed from 0
	return function()
		local c, b
		i, c, b = utf8.next(buf, len, i)
		if not i then return nil end
		return i+1, c, b
	end
end

--pass `false` to `out` to only get the output length.
--pass `nil` to `out` to have the function allocate the buffer.
--Decode UTF-8 bytes into UTF-32 codepoints, or only count them.
--  buf, len  input string, or a buffer indexable from 0 plus its byte length.
--  out       nil: allocate the output buffer; false: count only; anything
--            else: a buffer indexable from 0 that receives the codepoints.
--  outlen    capacity of `out` in codepoints (default: unbounded, or the
--            exact decoded length when the buffer is allocated here).
--  repl      what to emit for an invalid byte: nil skips the byte, the
--            string 'iso-8859-1' emits the byte value itself, any other
--            value is stored as the replacement codepoint.
--Returns `out, n, p`, or `n, p` when `out` is false, where `n` is the number
--of codepoints produced and `p` the number of invalid bytes seen; returns
--`nil, 'overflow', i` when `out` is full, `i` being the input index of the
--character that did not fit.
function utf8.decode(buf, len, out, outlen, repl)
	local _, buf, len = tobuf(buf, len)
	if out == nil then
		outlen = outlen or utf8.decode(buf, len, false, nil, repl)
		out = uint32_array(outlen + 1) --one extra element past the data
	end
	outlen = outlen or 1/0
	local j, p, i = 0, 0, 0
	while true do
		local i1, c = utf8.next(buf, len, i)
		if not i1 then
			break
		end
		if not c then
			p = p + 1
			if repl == 'iso-8859-1' then
				c = buf[i] --interpret as iso-8859-1 like browsers do
			else
				c = repl
			end
		end
		if c then
			if j >= outlen then
				return nil, 'overflow', i
			end
			if out then
				out[j] = c
			end
			j = j + 1
		end
		i = i1
	end
	if out then
		return out, j, p
	else
		return j, p
	end
end

--Number of UTF-8 bytes needed for codepoint `c`; `invalid_size` is returned
--for values that cannot be encoded (negative, above 0x10FFFF, or in the
--surrogate range D800-DFFF).
local function char_byte_count(c, invalid_size)
	if c < 0 or c > 0x10FFFF or (c >= 0xD800 and c <= 0xDFFF) then
		return invalid_size
	elseif c <= 0x7F then
		return 1
	elseif c <= 0x7FF then
		return 2
	elseif c <= 0xFFFF then
		return 3
	else
		return 4
	end
end

--Number of UTF-8 bytes needed to encode `buf[0..len-1]`. Invalid codepoints
--count as the size of `repl`, or as 0 when `repl` is missing or itself
--invalid.
local function byte_count(buf, len, repl)
	local n = 0
	local invalid_size = repl and char_byte_count(repl, 0) or 0
	for i = 0, len-1 do
		n = n + char_byte_count(buf[i], invalid_size)
	end
	return n
end

--Encode one codepoint. Returns `n, b1, b2, b3, b4`: the byte count followed
--by the bytes (only the first `n` are set). An invalid codepoint -- by the
--same rule as char_byte_count() -- is replaced by `repl` if given, else it
--yields n = 0. The replacement is encoded without a fallback of its own, so
--an invalid `repl` also yields n = 0.
local function encode_char(c, repl)
	local n, b1, b2, b3, b4 = 0
	if c < 0 or c > 0x10FFFF or (c >= 0xD800 and c <= 0xDFFF) then --invalid
		if repl then
			return encode_char(repl)
		end
	elseif c <= 0x7F then
		b1 = c
		n = 1
	elseif c <= 0x7FF then
		b2 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b1 = 0xC0 + c
		n = 2
	elseif c <= 0xFFFF then
		b3 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b2 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b1 = 0xE0 + c
		n = 3
	else
		b4 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b3 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b2 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b1 = 0xF0 + c
		n = 4
	end
	return n, b1, b2, b3, b4
end

--Encode UTF-32 codepoints into UTF-8 bytes, or only compute the size.
--  buf, len  input string (read as uint32 elements), or a buffer indexable
--            from 0 plus its length in codepoints.
--  out       nil: allocate the output buffer; false: return the byte count
--            only; anything else: a buffer indexable from 0 to write into.
--  outlen    capacity of `out` in bytes (default: unbounded, or the exact
--            encoded size when the buffer is allocated here).
--  repl      optional codepoint written in place of invalid codepoints;
--            without it they are skipped.
--Returns `out, bytes`, or `bytes` alone when `out` is false, or
--`nil, 'overflow'` when the next character does not fit in `out`.
function utf8.encode(buf, len, out, outlen, repl)
	local _, buf, len = tobuf(buf, len, uint32_ptr, 4)
	if out == nil then --allocate output buffer
		outlen = outlen or utf8.encode(buf, len, false, nil, repl)
		out = uint8_array(outlen + 1) --one extra byte past the data
	elseif not out then --compute output length
		return byte_count(buf, len, repl)
	end
	--a caller-supplied buffer without a size is treated as unbounded, the
	--same way utf8.decode() treats it.
	outlen = outlen or 1/0
	local j = 0
	for i = 0, len-1 do
		local n, b1, b2, b3, b4 = encode_char(buf[i], repl)
		if n > outlen then
			return nil, 'overflow'
		end
		if b1 then out[j  ] = b1 end
		if b2 then out[j+1] = b2 end
		if b3 then out[j+2] = b3 end
		if b4 then out[j+3] = b4 end
		j = j + n
		outlen = outlen - n --remaining capacity
	end
	return out, j
end

--Encode codepoints to a UTF-8 Lua string.
--  utf8.encode_chars({c1, ...}, repl)  codepoints in an array; invalid ones
--                                      are replaced by `repl` or skipped.
--  utf8.encode_chars(c1, ...)          codepoints as separate arguments;
--                                      invalid ones are skipped.
function utf8.encode_chars(...)
	local char = string.char
	local t, repl = ...
	local count
	if type(t) == 'table' then
		count = #t
	else
		--pack the arguments once instead of calling select() per element
		count = select('#', ...)
		t, repl = {...}, nil
	end
	local out = {}
	local j = 1
	for i = 1, count do
		local n, b1, b2, b3, b4 = encode_char(t[i], repl)
		if b1 then out[j  ] = char(b1) end
		if b2 then out[j+1] = char(b2) end
		if b3 then out[j+2] = char(b3) end
		if b4 then out[j+3] = char(b4) end
		j = j + n
	end
	return table.concat(out)
end

return utf8

--[==[
--------------------------------------------------------------------------------
/utf8.md:
--------------------------------------------------------------------------------
---
tagline: UTF-8 encoding and decoding for LuaJIT
---

## `local utf8 = require'utf8'`

Decode and encode UTF-8 data with control over invalid bytes.

---------------------------------------------------------- --------------------------------------------
`utf8.next(buf, len, i) -> ni, code, byte`                 codepoint (or invalid byte) at index `i`
`utf8.prev(buf, len, i) -> ni, code, byte`                 codepoint (or invalid byte) before index `i`
`utf8.decode(buf, len, out, outlen, repl) -> [out, ]n, p`  decode utf-8 buffer (or get output length)
`utf8.encode(buf, len, out, outlen, repl) -> [out, ]bytes` encode utf-32 buffer (or get output length)
`utf8.chars(s[, start]) -> iter() -> ni, code, byte`       iterate codepoints in string
`utf8.encode_chars({c1,...}, repl | c1,...) -> s`          encode codepoints to utf-8 string
---------------------------------------------------------- --------------------------------------------

### `utf8.next(buf, len, i) -> next_i, code, byte | nil`

Return codepoint (or invalid byte) at index `i`. Return `nil` if `i >= len`.

### `utf8.prev(buf, len, i) -> i, code, byte | nil`

Return codepoint (or invalid byte) before index `i`. Return `nil` if `i <= 0`.

### `utf8.decode(buf, len, out, outlen, repl) -> [out, ]n, p`

Decode utf8 buffer into a utf32 buffer or get output length.

  * if `out` is `nil` the output buffer is allocated by the function.
]==]
31 | * the buffer is n+1 codepoints thus null-terminated. 32 | * if `out` is `false` the output buffer is not allocated or returned. 33 | * `n, p` is the number of valid codepoints and the number of invalid bytes. 34 | * `repl` is an optional codepoint to replace invalid bytes with. 35 | * if `repl` is not given, invalid bytes are skipped. 36 | * if `repl` is `'iso-8859-1'`, invalid bytes are treated as iso-8859-1 37 | characters like browsers do. 38 | * replaced invalid bytes are counted in `n`. 39 | * returns `nil, 'overflow', sz` on output buffer overflow, where `sz` is 40 | the byte size of the text that fit into the buffer. 41 | 42 | ### `utf8.encode(buf, len, out, outlen, repl) -> [out, ]bytes` 43 | 44 | Encode utf32 buffer into a utf8 buffer or get output length. 45 | 46 | * if `out` is `nil` the output buffer is allocated by the function. 47 | * the buffer is n+1 bytes thus null-terminated. 48 | * if `out` is `false` the output buffer is not allocated or returned. 49 | * `repl` is an optional valid codepoint to replace invalid codepoints with. 50 | * if `repl` is not given, invalid codepoints are skipped. 51 | * returns `nil, 'overflow'` on error (output buffer overflow). 52 | 53 | ### `utf8.chars(s[, start]) -> iter() -> next_i, code, byte` 54 | 55 | Iterate all the codepoints in a string, returning the index in string where 56 | the _next_ codepoint is, and the codepoint. Invalid bytes are returned in 57 | the third return value, in which case the codepoint is `nil`. 58 | 59 | ### `utf8.encode_chars({c1, ...}, repl) -> s`
--[==[
`utf8.encode_chars(c1, ...) -> s`

Encode codepoints (given as an array or as separate args) to a utf-8 string.
]==]
--------------------------------------------------------------------------------
-- /utf8_test.lua:
--------------------------------------------------------------------------------
--Self-test and micro-benchmark for the utf8 module: checks decode/encode
--counts on a valid and an invalid fixture, then times decode, encode and
--prev over a large buffer, and finally times fribidi's decoder on the same
--input.
local utf8 = require'utf8'
local time = require'time'
local ffi = require'ffi'

--add some invalid chars
--each pair below is expected to decode as two invalid bytes (see the
--`p == #invalid` assertion further down).
local s = ''
s = s .. '\xC2\xC0'
s = s .. '\xE0\x80'
s = s .. '\xED\xA0'
s = s .. '\xF0\x80'
s = s .. '\xF4\x90'
s = s .. '\xFF\xFF'
local invalid = s

--well-formed multi-script sample; the assertions below expect it to decode
--to exactly 300 codepoints.
local valid = [[


هذه هي بعض النصوص العربي
Hello there!
ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ
Sîne klâwen durh die wolken sint geslagen,
Τη γλώσσα μου έδωσαν ελληνική
На берегу пустынных волн
ვეპხის ტყაოსანი შოთა რუსთაველი
யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்,
我能吞下玻璃而不伤身体
나는 유리를 먹을 수 있어요. 그래도 아프지 않아요

]]

--count-only mode (out = false): valid text has no invalid bytes
local n, p = utf8.decode(valid, nil, false)
assert(n == 300)
assert(p == 0)

--every byte of the invalid fixture must be reported as invalid
local n1, p = utf8.decode(invalid, nil, false)
assert(n1 == 0)
assert(p == #invalid)

--mixed input: without `repl` invalid bytes are skipped, with `repl` (here
--codepoint 0) each invalid byte adds one codepoint to the count
local s = valid .. invalid
local n1, p1 = utf8.decode(s, nil, false)
assert(n1 == n)
assert(p1 == p)
local n2, p2 = utf8.decode(s, nil, false, nil, 0)
assert(n2 == n1 + p1)
assert(p2 == p1)

--blow the sample up to about 50 MB and decode it into an allocated buffer
local rep = math.floor(50 * 1024^2 / #s)
s = s:rep(rep)
local outbuf, n, p = utf8.decode(s)
assert(n == rep * n1)
assert(p == rep * p1)

--benchmark: decode into the pre-allocated buffer
local t0 = time.clock()
local bytes = 0
for i = 1, 2 do
	local outbuf, len = utf8.decode(s, #s, outbuf, n)
	assert(len == n)
	bytes = bytes + #s
end
print(string.format('decode: %.2f Mbytes -> %.2f Mchars, %d MB/s',
	#s / 1024^2, n / 1024^2, bytes / (time.clock() - t0) / 1024^2))

--benchmark: encode the decoded codepoints back; since invalid bytes were
--skipped on decode, the result must be as long as the valid text repeated
local slen = utf8.encode(outbuf, n, false)
assert(slen == #valid * rep)
local sbuf = ffi.new('uint8_t[?]', slen)
local t0 = time.clock()
local bytes = 0
for i = 1, 5 do
	local outbuf, len = utf8.encode(outbuf, n, sbuf, slen)
	assert(len == #valid * rep)
	bytes = bytes + len
end
print(string.format('encode: %.2f Mchars -> %.2f Mbytes, %d MB/s',
	n / 1024^2, slen / 1024^2, bytes / (time.clock() - t0) / 1024^2))


--benchmark: walk the encoded buffer backwards with utf8.prev(); the number
--of steps must equal the number of codepoints
local t0 = time.clock()
local bytes = 0
for i = 1, 2 do
	local len = 0
	local i = slen
	while true do
		i = utf8.prev(sbuf, slen, i)
		if not i then break end
		len = len + 1
	end
	assert(len == n)
	bytes = bytes + slen
end
print(string.format('prev: %.2f Mbytes -> %.2f Mchars, %d MB/s',
	#s / 1024^2, n / 1024^2, bytes / (time.clock() - t0) / 1024^2))


--test the string API
--round-trip: iterate a string's codepoints, then re-encode them both as
--separate arguments and as an array
local ts = '我能吞下玻璃而不伤身体'
local t = {}
for _,c,b in utf8.chars(ts) do
	t[#t+1] = c or b
end
assert(utf8.encode_chars(unpack(t)) == ts)
assert(utf8.encode_chars(t) == ts)

--compare speed to fribidi's implementation.
--the Lua variant is 5x slower but still pretty fast at 200M/s.

local fb = require'fribidi'

--NOTE(review): the expected length `n + p / 4 + 3` reflects how fribidi
--treats the invalid bytes of this particular fixture -- confirm against
--fribidi before changing the fixture.
local outbuf, len = fb.charset_to_unicode('utf-8', s, #s)
assert(len == n + p / 4 + 3)
local t0 = time.clock()
local bytes = 0
for i = 1, 2 do
	local _, len = fb.charset_to_unicode('utf-8', s, #s, outbuf, len)
	assert(len == n + p / 4 + 3)
	bytes = bytes + #s
end
print(string.format('fb-dec: %.2f Mbytes -> %.2f Mchars, %d MB/s',
	#s / 1024^2, n / 1024^2, bytes / (time.clock() - t0) / 1024^2))


--------------------------------------------------------------------------------