├── README.md ├── test.utf8string.lua └── utf8string.lua /README.md: -------------------------------------------------------------------------------- 1 | lua-utf8string 2 | ============== 3 | 4 | Simple UTF-8 support in pure Lua 5 | 6 | Current Status 7 | ============== 8 | 9 | The module emulates the functions of the standard `string` library: 10 | 11 | * [x] `string.byte` 12 | * [x] `string.char` 13 | * [x] `string.dump` 14 | * [ ] `string.find` 15 | * [x] `string.format` 16 | * [ ] `string.gmatch` 17 | * [ ] `string.gsub` 18 | * [x] `string.len` 19 | * [x] `string.lower` (*) 20 | * [ ] `string.match` 21 | * [x] `string.rep` 22 | * [x] `string.reverse` 23 | * [x] `string.sub` 24 | * [x] `string.upper` (*) 25 | 26 | (*) does not handle Unicode case mapping; only ASCII letters are converted. 27 | 28 | 29 | Sample of use 30 | ============= 31 | 32 | ```lua 33 | local u = require("utf8string") 34 | 35 | local data = "àbcdéêèf" 36 | local udata = u(data) 37 | 38 | print(type(data), data) -- the original 39 | print(type(udata), udata) -- automatic conversion to string 40 | 41 | print(#data) -- the byte count, not the number of characters printed on screen 42 | print(#udata) -- the number of characters printed on screen 43 | 44 | print(udata:sub(4,5)) -- sub() can be used just as on a string 45 | ``` 46 | 47 | # TODO 48 | 49 | * Review the other UTF-8 implementations 50 | * Try to follow the Lua 5.3 `utf8` API 51 | * ... 52 | 53 | # License 54 | 55 | My code is under the MIT License 56 | -------------------------------------------------------------------------------- /test.utf8string.lua: -------------------------------------------------------------------------------- 1 | local u = require("utf8string") 2 | 3 | -- source: https://github.com/cloudwu/skynet/issues/341 4 | local i341 = '这里只是释放了 uc 的内存, 但是 uc->pack.buffer 指向的数据并没有释放.' 
5 | -- 1 2 3 4 5 6 78901 2 3 456 7 890123456789012345 6 7 8 9 0 1 2 3 4 56 6 | -- 1 5 10 15 20 15 30 35 40 46 7 | assert( #u(i341) == 46 ) 8 | assert( u(i341):reverse():reverse():tostring() == i341 ) 9 | assert( #u(i341):sub(2,3) == 2 ) 10 | 11 | assert( u(i341):reverse():sub(-3,-2):tostring() == u(i341):sub(2,3):reverse():tostring() ) 12 | 13 | local ustr = "aáâàbeéêèc-óôò€" 14 | 15 | local uobj = u(ustr) 16 | assert( tostring(uobj:sub(1, 2)) =="aá" ) 17 | assert( tostring(uobj:sub(2, 3)) =="áâ" ) 18 | assert( tostring(uobj:sub(-1,-1))=="€" ) 19 | 20 | assert( uobj:sub(1, 2):tostring() =="aá" ) 21 | assert( uobj:sub(2, 3):tostring() =="áâ" ) 22 | assert( uobj:sub(-1,-1):tostring()=="€" ) 23 | 24 | assert(#ustr == 26) 25 | assert(#uobj == 15) 26 | assert(type(uobj) == "table") 27 | assert(uobj:type() == "ustring") 28 | assert(#u("a") == 1) 29 | assert(#u("Ô") == 1) 30 | assert(string.len(tostring(u("Ô"))) == #"Ô") 31 | 32 | assert(uobj == u(uobj), "convert must detecte an already converted object") 33 | assert( tostring( u("áà"):rep(3) ) == "áàáàáà") 34 | 35 | --assert( tostring( u("àeïôú"):reverse() ) == "úôïeà") 36 | print( u("àeïôú"):reverse() ) 37 | assert(#ustr == 26) 38 | assert(#uobj == 15) 39 | 40 | -- print(uobj:sub(3,3)) -- get one UTF8 char 41 | 42 | --for i,v in ipairs(uobj) do 43 | -- print(i, v, uobj:byte(i,i)) 44 | --end 45 | 46 | 47 | assert( tostring(uobj .. " " .. uobj) == ustr.." "..ustr) 48 | assert( type(uobj) == type(uobj .. 
"str") ) 49 | 50 | assert( uobj..1 ) 51 | assert( " "..uobj ) 52 | 53 | --print(uobj:upper()) 54 | print(uobj:byte(2,3)) 55 | print( string.char( uobj:byte(1,2))) 56 | print( string.char(("aaaaxE"):byte(1,2))) 57 | 58 | -- clone test 59 | if false then 60 | local a = uobj() 61 | assert( a ~= uobj ) 62 | assert( tostring(a) == tostring(uobj) ) 63 | end 64 | 65 | -------------------------------------------------------------------------------- /utf8string.lua: -------------------------------------------------------------------------------- 
local m = {} -- the module

local ustring = {} -- table to index equivalent string.* functions

-- TsT
-- License: MIT

m._VERSION = "utf8string 1.0.0"
m._URL = "https://github.com/tst2005/lua-utf8string"
m._LICENSE = 'MIT '

-- my custom type for Unicode String
local utf8type = "ustring"

local typeof = assert(type)
local tostring = assert(tostring)

local string = require("string")
local string_find = assert(string.find)
local string_sub = assert(string.sub)
local string_byte = assert(string.byte)
local string_char = assert(string.char)
local string_format = assert(string.format)

-- floor lives in the math library (table.floor does not exist)
local floor = assert(math.floor)
local table_concat = assert(table.concat)

-- forward declaration: the constructor is defined after the functions that
-- need it, because it references their metamethods
local utf8_object

-- A ustring object is a table holding:
--   [1..n]     the END byte position, inside rawstring, of each character
--   rawstring  the original Lua (byte) string
--   usestring  true when every character is one byte long, so that the plain
--              string functions can be applied to rawstring directly

-- Character-wise equivalent of string.sub().
--   uobj  the ustring object
--   i     index of the first character (negative: counted from the end)
--   j     index of the last character (optional, defaults to -1 = last one)
-- Indices are clamped like string.sub() does; an empty range returns an empty
-- ustring instead of raising an error.
-- Returns a new ustring object built from the already parsed positions
-- (the raw string is not parsed again).
local function utf8_sub(uobj, i, j)
	assert(i, "bad argument #2 to 'sub' (number expected, got no value)")
	assert(type(i) == "number", "bad argument #2 to 'sub' (number expected, got " .. type(i) .. ")")
	assert(j == nil or type(j) == "number", "bad argument #3 to 'sub' (number expected, got " .. type(j) .. ")")

	local len = #uobj
	if j == nil then
		j = -1
	end

	-- translate negative indices, then clamp both ends into [1, len]
	if i < 0 then
		i = len + i + 1
	end
	if i < 1 then
		i = 1
	end
	if j < 0 then
		j = len + j + 1
	end
	if j > len then
		j = len
	end

	local new = {}
	if i > j then
		new.rawstring = ""
	else
		-- end position of the character just before the slice: every stored
		-- position is shifted by it, so the new object starts at byte 1
		local rel = uobj[i - 1] or 0
		for x = i, j do
			new[#new + 1] = uobj[x] - rel
		end
		new.rawstring = string_sub(uobj.rawstring, rel + 1, uobj[j])
	end
	-- same definition as in private_string2ustring: one byte per character
	new.usestring = (#new.rawstring == #new)
	return utf8_object(new)
end

-- Returns the custom type name stored in the metatable field __type when
-- there is one, else the result of the standard type().
local function utf8_typeof(obj)
	local mt = getmetatable(obj)
	return mt and mt.__type or typeof(obj)
end

-- Returns true when obj is a ustring object, false otherwise.
local function utf8_is_object(obj)
	return utf8_typeof(obj) == utf8type
end

-- Returns the raw Lua string of a ustring object.
-- Any other value is returned unchanged (it is NOT converted to a string).
local function utf8_tostring(obj)
	if utf8_is_object(obj) then
		return obj.rawstring
	end
	return obj
end

-- Returns an independent copy of a ustring object.
-- Raises an error (reported at the caller's level) for any other value.
local function utf8_clone(self)
	if not utf8_is_object(self) then
		error("it is not a ustring object ! what to do for clonning ?", 2)
	end
	local o = {
		rawstring = self.rawstring,
		usestring = self.usestring,
	}
	-- the character positions must be copied too, else the clone is empty
	for n = 1, #self do
		o[n] = self[n]
	end
	return utf8_object(o)
end

-- %z = 0x00 (\0 not allowed)
-- \1 = 0x01
-- \127 = 0x7F
-- \128 = 0x80
-- \191 = 0xBF

-- Parse a Lua string and record where each UTF-8 sequence ends.
-- A sequence is one lead byte (ASCII or \194-\244) followed by any number of
-- continuation bytes (\128-\191).
-- FIXME: how to drop invalid sequence ?! Bytes that cannot start a sequence
-- are skipped by the search, so they end up inside the span of the next
-- character (or are ignored at the end of the string).
local function private_string2ustring(unicode_string)
	assert(typeof(unicode_string) == "string", "unicode_string is not a string?!")

	local o = {}
	local b, e = nil, 0 -- begin/end of the last found sequence
	while true do
		b, e = string_find(unicode_string, "[%z\1-\127\194-\244][\128-\191]*", e + 1)
		if not b then
			break
		end
		o[#o + 1] = e
	end
	o.rawstring = unicode_string
	o.usestring = #unicode_string == #o
	return utf8_object(o)
end

-- Returns true when str contains at least one byte in the range \128-\193.
-- Currently unused.
local function private_contains_unicode(str)
	return not not str:find("[\128-\193]+")
end

-- Convert a Lua string to a ustring object.
-- When i is given, the result is the character slice [i, j] (see utf8_sub).
local function utf8_auto_convert(unicode_string, i, j)
	-- assert() only takes one message argument: build the full text here
	assert(typeof(unicode_string) == "string", "unicode_string is not a string: " .. typeof(unicode_string))
	local obj = private_string2ustring(unicode_string)
	return (i and obj:sub(i, j)) or obj
end

-- __concat metamethod: both operands are converted with tostring(), joined,
-- and the result is parsed again as a ustring object.
local function utf8_op_concat(obj1, obj2)
	return utf8_auto_convert(tostring(obj1) .. tostring(obj2))
end

-- http://en.wikipedia.org/wiki/Utf8
-- http://developer.coronalabs.com/code/utf-8-conversion-utility
-- Encode one code point (a non-negative integer up to 0x10FFFF) as a UTF-8
-- byte string of 1 to 4 bytes.
-- Errors are reported at level 3, i.e. at the caller of utf8_char().
local function utf8_onechar(unicode)
	if unicode < 0 or unicode ~= floor(unicode) then
		error('Unicode must be a non-negative integer!', 3)
	end

	if unicode <= 0x7F then
		return string_char(unicode)
	end

	if unicode <= 0x7FF then
		local Byte0 = 0xC0 + floor(unicode / 0x40)
		local Byte1 = 0x80 + (unicode % 0x40)
		return string_char(Byte0, Byte1)
	end

	if unicode <= 0xFFFF then
		local Byte0 = 0xE0 + floor(unicode / 0x1000) -- 0x1000 = 0x40 * 0x40
		local Byte1 = 0x80 + (floor(unicode / 0x40) % 0x40)
		local Byte2 = 0x80 + (unicode % 0x40)
		return string_char(Byte0, Byte1, Byte2)
	end

	if unicode <= 0x10FFFF then
		-- peel off 6 bits at a time, starting from the last byte
		local code = unicode
		local Byte3 = 0x80 + (code % 0x40)
		code = floor(code / 0x40)
		local Byte2 = 0x80 + (code % 0x40)
		code = floor(code / 0x40)
		local Byte1 = 0x80 + (code % 0x40)
		code = floor(code / 0x40)
		local Byte0 = 0xF0 + code

		return string_char(Byte0, Byte1, Byte2, Byte3)
	end

	error('Unicode cannot be greater than U+10FFFF!', 3)
end

-- Equivalent of string.char() taking code points instead of byte values.
-- Returns a plain Lua string holding the UTF-8 encoding of every argument.
-- Raises an error when an argument is not a number.
local function utf8_char(...)
	local r = {}
	for i, v in ipairs({...}) do
		if type(v) ~= "number" then
			error("bad argument #" .. i .. " to 'char' (number expected, got " .. type(v) .. ")", 2)
		end
		r[i] = utf8_onechar(v)
	end
	return table_concat(r, "")
end
--for _, n in ipairs{12399, 21560, 12356, 12414, 12377} do print(utf8char(n)) end
--print( lua53_utf8_char( 12399, 21560, 12356, 12414, 12377 ) )

-- Returns the byte values (not the code points) of the characters i to j of
-- a ustring object. i defaults to 1 and j defaults to i.
local function utf8_byte(obj, i, j)
	assert(utf8_is_object(obj), "ask utf8_byte() for a non utf8 object?!")
	local first = i or 1
	local last = j or first -- FIXME: 'or i' or 'or -1' ?
	local uobj = obj:sub(first, last)
	return string_byte(tostring(uobj), 1, -1)
end

-- FIXME: what is the lower/upper case of Unicode ?!
-- FIXME: optimisation? the parse is still the same (just change the rawstring ?)
-- Both functions rely on string.lower()/string.upper() applied to the raw
-- string, then parse the result again.
local function utf8_lower(uobj)
	return utf8_auto_convert(tostring(uobj):lower())
end

local function utf8_upper(uobj)
	return utf8_auto_convert(tostring(uobj):upper())
end

-- Returns a new ustring object with the characters in reverse order.
-- FIXME: use the already parsed info to generate the reverse info...
local function utf8_reverse(uobj)
	if uobj.usestring then
		-- one byte per character: the byte-wise reverse is correct
		return utf8_auto_convert(uobj.rawstring:reverse())
	end

	local rawstring = uobj.rawstring
	local tmp = {}
	local e = uobj[#uobj] -- the ending position of uchar
	for n = #uobj - 1, 1, -1 do
		local b = uobj[n] -- end of the previous uchar = byte before this one
		tmp[#tmp + 1] = string_sub(rawstring, b + 1, e) -- the uchar
		e = b
	end
	tmp[#tmp + 1] = string_sub(rawstring, 1, e) -- the first uchar comes last
	return utf8_auto_convert(table_concat(tmp, ""))
end

-- Equivalent of string.rep(): the raw string repeated n times, parsed again.
local function utf8_rep(uobj, n)
	return utf8_auto_convert(uobj.rawstring:rep(n)) -- :rep() is the string.rep()
end

-- Turn the table uobj (a new empty table when omitted) into a ustring object
-- by installing the metatable fields __index, __concat, __tostring, __type.
-- An existing metatable is reused. Returns uobj.
function utf8_object(uobj)
	local mt
	if not uobj then
		uobj = {}
		mt = {}
	else
		mt = getmetatable(uobj) or {}
	end
	mt.__index = assert(ustring)
	mt.__concat = assert(utf8_op_concat)
	mt.__tostring = assert(utf8_tostring)
	mt.__type = assert(utf8type)
--	mt.__call = function(_self, a1)
--		if a1 == nil then
--			return utf8_clone(_self)
--		end
--		return _self
--	end
	return setmetatable(uobj, mt)
end


---- Standard Lua 5.1 string.* ----
ustring.byte = assert(utf8_byte)
ustring.char = assert(utf8_char)
ustring.dump = assert(string.dump)
--ustring.find
-- the format may be a ustring object (uobj:format(...)): string.format()
-- needs its raw string; a plain string is passed through unchanged
ustring.format = function(fmt, ...) return string_format(utf8_tostring(fmt), ...) end
--ustring.gmatch
--ustring.gsub
ustring.len = function(uobj) return #uobj end
ustring.lower = assert(utf8_lower)
--ustring.match
ustring.rep = assert(utf8_rep)
ustring.reverse = assert(utf8_reverse)
ustring.sub = assert(utf8_sub)
ustring.upper = assert(utf8_upper)

---- custom add-on ----
ustring.type = assert(utf8_typeof)
ustring.tostring = assert(utf8_tostring)
ustring.clone = assert(utf8_clone)
--ustring.debugdump = function(self) return table.concat(self, " ") end

-- Add functions to the module
for k, v in pairs(ustring) do m[k] = v end

-- Allow to use the module directly to convert strings:
--   m(obj [, i [, j]])
-- A ustring object is returned as is (or sliced when i is given); any other
-- value is converted with tostring() when needed, then parsed.
local mt = {
	__call = function(_self, obj, i, j)
		if utf8_is_object(obj) then
			return (i and obj:sub(i, j)) or obj
		end
		local str = obj
		if typeof(str) ~= "string" then
			str = tostring(str)
		end
		return utf8_auto_convert(str, i, j)
	end
}

return setmetatable(m, mt)

-------------------------------------------------------------------------------
-- begin of the idea : http://lua-users.org/wiki/LuaUnicode
--
-- for uchar in sgmatch(unicode_string, "([%z\1-\127\194-\244][\128-\191]*)") do
--
--local function utf8_strlen(unicode_string)
--	local _, count = string.gsub(unicode_string, "[^\128-\193]", "")
--	return count
--end
-- http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
--------------------------------------------------------------------------------