├── LICENSE ├── README.md ├── lua-utf8-simple-1.rockspec ├── shitty_testcases.lua └── utf8_simple.lua /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2015 blitmap 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the “Software”), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lua-utf8-simple 2 | 3 | This "library" is meant to be a very thin helper that you can easily drop in to another project without really calling it a dependency. It aims to provide the most minimal of handling functions for working with utf8 strings. It does not aim to be feature-complete or even error-descriptive. It works for what is practical but not complex. You have been warned. 
=^__^= 4 | 5 | ## The require() Line 6 | 7 | ```lua 8 | local utf8 = require('utf8_simple') 9 | ``` 10 | 11 | ## The Only Functions You Need to Know 12 | 13 | ### utf8.chars(s[, no_subs]) 14 | - s: (string) the utf8 string to iterate over (by characters) 15 | - no_subs: (boolean) true yields each character's byte width instead of the character substring 16 | 17 | ```lua 18 | -- i is the character/letter index within the string 19 | -- c is the utf8 character (string of 1 or more bytes) 20 | -- b is the byte index within the string 21 | for i, c, b in utf8.chars('Αγαπώ τηγανίτες') do 22 | print(i, c, b) 23 | end 24 | ``` 25 | 26 | Output: 27 | 28 | 1 Α 1 29 | 2 γ 3 30 | 3 α 5 31 | 4 π 7 32 | 5 ώ 9 33 | 6 11 34 | 7 τ 12 35 | 8 η 14 36 | 9 γ 16 37 | 10 α 18 38 | 11 ν 20 39 | 12 ί 22 40 | 13 τ 24 41 | 14 ε 26 42 | 15 ς 28 43 | 44 | ### ALTERNATE FORM 45 | Creating small substrings can be a performance concern; the 2nd parameter to utf8.chars() 46 | allows you to toggle the substrings to instead be the byte width of the character. 47 | 48 | This is for situations when you only care about the byte width (less common). 
49 | 50 | ```lua 51 | -- i is the character/letter index within the string 52 | -- w is the utf8 character width (in bytes) 53 | -- b is the byte index within the string 54 | for i, w, b in utf8.chars('Αγαπώ τηγανίτες', true) do 55 | print(i, w, b) 56 | end 57 | ``` 58 | 59 | Output: 60 | 61 | 1 2 1 62 | 2 2 3 63 | 3 2 5 64 | 4 2 7 65 | 5 2 9 66 | 6 1 11 67 | 7 2 12 68 | 8 2 14 69 | 9 2 16 70 | 10 2 18 71 | 11 2 20 72 | 12 2 22 73 | 13 2 24 74 | 14 2 26 75 | 15 2 28 76 | 77 | ### utf8.map(s, f[, no_subs]) 78 | - s: (string) the utf8 string to map 'f' over 79 | - f: (function) a function accepting: f(visual_index, utf8_char -or- width, byte_index) 80 | - no_subs: (boolean) true means don't make small substrings from each character (byte width instead) 81 | 82 | returns: (nothing) 83 | 84 | ```lua 85 | > utf8.map('Αγαπώ τηγανίτες', print) -- does the same as the first example above 86 | ``` 87 | 88 | ```lua 89 | > utf8.map('Αγαπώ τηγανίτες', print, true) -- the alternate form from above 90 | ``` 91 | 92 | ## Others 93 | 94 | ### utf8.len(s) 95 | - s: (string) the utf8 string 96 | 97 | returns: (number) the number of utf8 characters in s (not the byte length) 98 | 99 | note: be aware of "invisible" utf8 characters 100 | 101 | ```lua 102 | > = utf8.len('Αγαπώ τηγανίτες') 103 | 15 104 | ``` 105 | 106 | ### utf8.reverse(s) 107 | - s: (string) the utf8 string 108 | 109 | returns: (string) the utf8-reversed form of s 110 | 111 | note: reversing left-to-right utf8 strings that include directional formatting characters will look odd 112 | 113 | ```lua 114 | > = utf8.reverse('Αγαπώ τηγανίτες') 115 | ςετίναγητ ώπαγΑ 116 | ``` 117 | 118 | ### utf8.strip(s) 119 | - s: (string) the utf8 string 120 | 121 | returns: (string) s with all non-ascii characters removed (characters > 1 byte) 122 | 123 | ```lua 124 | > = utf8.strip('cat♥dog') 125 | catdog 126 | ``` 127 | 128 | ### utf8.replace(s, map) 129 | - s: (string) the utf8 string 130 | - map: (table) keys are utf8 characters to 
replace, values are their replacement 131 | 132 | returns: (string) s with all the key-characters in map replaced 133 | 134 | note: the keys must be utf8 characters, the values **can** be strings 135 | 136 | ```lua 137 | > = utf8.replace('∃y ∀x ¬(x ≺ y)', { ['∃'] = 'E', ['∀'] = 'A', ['¬'] = '\r\n', ['≺'] = '<' }) 138 | Ey Ax 139 | (x < y) 140 | ``` 141 | 142 | ### utf8.sub(s, i, j) 143 | - s: (string) the utf8 string 144 | - i: (number) the starting character index in the utf8 string (negative values count from the end) 145 | - j: (number) the ending character index in the utf8 string (optional, defaults to the last character; negative values count from the end) 146 | 147 | returns: (string) the substring formed from i to j, inclusive (this is a utf8-aware string.sub()) 148 | 149 | ```lua 150 | > = utf8.sub('Αγαπώ τηγανίτες', 3, -5) 151 | απώ τηγαν 152 | ``` 153 | -------------------------------------------------------------------------------- /lua-utf8-simple-1.rockspec: -------------------------------------------------------------------------------- 1 | package = 'utf8_simple' 2 | version = 'scm-1' 3 | 4 | source = { url = 'git://github.com/Pogs/lua-utf8-simple.git' } 5 | 6 | description = 7 | { 8 | summary = 'Minimal functions for basic UTF-8 handling on Lua strings', 9 | detailed = 'Provides minimal functions for handling UTF-8 in Lua strings: chars(), map(), len(), reverse(), strip(), replace(), & sub()', 10 | homepage = 'https://github.com/Pogs/lua-utf8-simple', 11 | license = 'BSD', 12 | maintainer = 'Sir Pogsalot ' 13 | } 14 | 15 | build = { type = 'builtin', modules = { utf8_simple = 'utf8_simple.lua' } } 16 | -------------------------------------------------------------------------------- /shitty_testcases.lua: -------------------------------------------------------------------------------- 1 | -- assuming you're running the tests from within the clone 2 | package.path = './?.lua;' .. 
package.path

local utf8 = require('utf8_simple')

-- set of library function names whose test detected a problem
local WRONG = {}

local tests = {}

-- utf8.chars(): character index, character (or byte width) and byte index
-- must all line up, and the iteration must visit every character exactly once
tests.chars =
    function ()
        -- 'Αγαπώ'
        local love, x = { 'Α', 'γ', 'α', 'π', 'ώ' }, 0
        local byte_idx = { 1, 3, 5, 7, 9 }

        for i, c, b in utf8.chars('Αγαπώ') do
            x = x + 1

            local C = love[x]

            if
                c ~= C or
                i ~= x or
                b ~= byte_idx[x]
            then
                WRONG.chars = true
            end
        end

        -- too few iterations is as wrong as none at all
        if x ~= #love then WRONG.chars = true end

        -- alternate form: byte widths instead of substrings
        x = 0

        for i, w, b in utf8.chars('Αγαπώ', true) do
            x = x + 1

            if w ~= 2 or i ~= x or b ~= byte_idx[x] then
                WRONG.chars = true
            end
        end

        if x ~= #love then WRONG.chars = true end
    end

-- utf8.map(): must hand f the same triples utf8.chars() yields
tests.map =
    function ()
        local seen = {}

        utf8.map('Αγαπώ', function (i, c, b) seen[#seen + 1] = i .. c .. b end)

        if table.concat(seen, ' ') ~= '1Α1 2γ3 3α5 4π7 5ώ9' then
            WRONG.map = true
        end
    end

tests.len =
    function ()
        if utf8.len('Αγαπώ') ~= 5 then
            WRONG.len = true
        end

        if utf8.len('') ~= 0 then WRONG.len = true end -- empty string

        -- bytes that can never start a character must not be counted,
        -- otherwise len() disagrees with what chars() iterates over
        if utf8.len('a\255b') ~= 2 then WRONG.len = true end
    end

tests.sub =
    function ()
        local s = 'i αγαπώ cats'

        if pcall(utf8.sub, s)               then WRONG.sub = true end -- no-i substring
        if utf8.sub(s, 3)       ~= 'αγαπώ cats' then WRONG.sub = true end -- i-only substring
        if utf8.sub(s, -7)      ~= 'πώ cats'    then WRONG.sub = true end -- i-only negative substring
        if utf8.sub(s, 6, 7)    ~= 'πώ'         then WRONG.sub = true end -- normal positive-index substring
        if utf8.sub(s, -7, -6)  ~= 'πώ'         then WRONG.sub = true end -- normal negative-index substring
        if utf8.sub(s, -70, #s) ~= s            then WRONG.sub = true end -- impossible negative-index substring
        if utf8.sub(s, 1, 90)   ~= s            then WRONG.sub = true end -- impossible positive-index substring
        if utf8.sub(s, 4, 4)    ~= 'γ'          then WRONG.sub = true end -- single-character substring
        if utf8.sub(s, 8, 4)    ~= ''           then WRONG.sub = true end -- start after end substring
        if utf8.sub('', 1, 5)   ~= ''           then WRONG.sub = true end -- empty string

        -- malformed input must not crash the character walk
        local ok, r = pcall(utf8.sub, 'a\255b', 1, 3)
        if not ok or r ~= 'a\255b' then WRONG.sub = true end
    end

tests.replace =
    function ()
        if utf8.replace('∃y ∀x ¬(x ≺ y)', { ['∃'] = 'E', ['∀'] = 'A', ['¬'] = '-', ['≺'] = '<' }) ~= 'Ey Ax -(x < y)' then
            WRONG.replace = true
        end

        -- only the string may be returned, not gsub's match count
        if select('#', utf8.replace('a', {})) ~= 1 then WRONG.replace = true end
    end

tests.reverse =
    function ()
        if utf8.reverse('Αγαπώ τηγανίτες') ~= 'ςετίναγητ ώπαγΑ' then
            WRONG.reverse = true
        end
    end

tests.strip =
    function ()
        if utf8.strip('cat♥dog∀cat♥dog') ~= 'catdogcatdog' then
            WRONG.strip = true
        end

        -- only the string may be returned, not gsub's match count
        if select('#', utf8.strip('cat♥dog')) ~= 1 then WRONG.strip = true end
    end

-- returns the keys of t as a sorted array (sorted so reports are deterministic)
local keys =
    function (t)
        local ks = {}

        for k in pairs(t) do
            ks[#ks + 1] = k
        end

        table.sort(ks)

        return ks
    end

-- runs every test in a fixed order and prints a one-line summary
tests.run =
    function ()
        local testnames = { 'chars', 'map', 'len', 'sub', 'replace', 'reverse', 'strip' }

        for _, func in ipairs(testnames) do
            print('testing ' .. func .. '..')
            tests[func]()
        end

        if not next(WRONG) then
            print('all tests succeeded! :D-S-<')
        else
            print('problems in these functions: ' .. table.concat(keys(WRONG), '(), ') .. '()')
        end
    end

tests.run()
--------------------------------------------------------------------------------
/utf8_simple.lua:
--------------------------------------------------------------------------------
-- ABNF from RFC 3629
--
-- UTF8-octets = *( UTF8-char )
-- UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
-- UTF8-1      = %x00-7F
-- UTF8-2      = %xC2-DF UTF8-tail
-- UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
--               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
-- UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
--               %xF4 %x80-8F 2( UTF8-tail )
-- UTF8-tail   = %x80-BF

-- 0xxxxxxx                            | 007F   (127)
-- 110xxxxx 10xxxxxx                   | 07FF   (2047)
-- 1110xxxx 10xxxxxx 10xxxxxx          | FFFF   (65535)
-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 10FFFF (1114111)

-- a byte that starts a character: ASCII (%z is the NUL byte) or the first
-- byte of a multi-byte sequence
local lead = '[%z\1-\127\194-\244]'

-- one whole character: a lead byte followed by any continuation bytes
local pattern = lead .. '[\128-\191]*'

-- helper function
-- converts a negative (counted-from-the-end) position into a positive one,
-- given the length len; non-negative positions are returned unchanged
local posrelat =
    function (pos, len)
        if pos < 0 then
            pos = len + pos + 1
        end

        return pos
    end

local utf8 = {}

-- THE MEAT

-- iterator over all utf8 chars of s, for use in a generic for loop
-- each step yields: visual_index, utf8_character, byte_index
-- with no_subs set, the character's byte width is yielded instead of the
-- character itself (no small substrings are created)
utf8.chars =
    function (s, no_subs)
        -- '()' is a position capture: either (start, position after the char)
        -- or (start, the char itself)
        local next_char = s:gmatch(no_subs and ('()' .. pattern .. '()') or ('()(' .. pattern .. ')'))
        local i = 0

        return function ()
            local b, c = next_char()

            if b == nil then return nil end

            i = i + 1

            -- end position minus start position is the byte width
            if no_subs then c = c - b end

            return i, c, b
        end
    end

-- THE REST

-- maps f over s's utf8 characters
-- f can accept args: (visual_index, utf8_character -or- byte_width, byte_index)
-- returns nothing
utf8.map =
    function (s, f, no_subs)
        for i, c, b in utf8.chars(s, no_subs) do
            f(i, c, b)
        end
    end

-- returns the number of characters in a UTF-8 string
utf8.len =
    function (s)
        -- count the lead bytes: exactly the bytes at which `pattern` can begin
        -- a match, so len() always agrees with what chars()/map() iterate over
        local _, count = s:gsub(lead, '')

        return count
    end

-- replace all utf8 chars with mapping
-- map's keys are utf8 characters, its values are their replacements
-- returns only the resulting string
utf8.replace =
    function (s, map)
        -- the parentheses drop gsub's second result (the match count)
        return (s:gsub(pattern, map))
    end

-- reverse a utf8 string
utf8.reverse =
    function (s)
        -- reverse the individual greater-than-single-byte characters first, so
        -- that reversing the whole string puts their bytes back in order
        -- (returning false for single-byte characters keeps them untouched)
        s = s:gsub(pattern, function (c) return #c > 1 and c:reverse() end)

        return s:reverse()
    end

-- strip non-ascii characters from a utf8 string
-- returns only the resulting string
utf8.strip =
    function (s)
        -- a multi-byte lead followed by at least one continuation byte is a
        -- character wider than 1 byte; the parentheses drop gsub's match count
        return (s:gsub('[\194-\244][\128-\191]+', ''))
    end

-- like string.sub() but i, j are character indices, not byte indices
-- a utf8-safe string.sub()
-- i is required; j defaults to the last character; negative indices count
-- from the end; out-of-range indices are clamped
-- raises an error when i is not a number (or j is neither nil nor a number)
utf8.sub =
    function (s, i, j)
        if type(i) ~= 'number' then
            error("bad argument #2 to 'sub' (number expected, got " .. type(i) .. ")", 2)
        end

        if j ~= nil and type(j) ~= 'number' then
            error("bad argument #3 to 'sub' (number expected, got " .. type(j) .. ")", 2)
        end

        local l = utf8.len(s)

        i = posrelat(i, l)
        j = j and posrelat(j, l) or l

        if i < 1 then i = 1 end
        if j > l then j = l end

        if i > j then return '' end

        -- single pass: remember the first byte of character i and the last
        -- byte of character j
        local first, last

        for idx, width, byte in utf8.chars(s, true) do
            if not first and idx >= i then first = byte end

            if idx >= j then
                last = byte + width - 1
                break
            end
        end

        if not (first and last) then return '' end

        return string.sub(s, first, last)
    end

return utf8
--------------------------------------------------------------------------------