├── LICENSE
├── README.md
├── lua-utf8-simple-1.rockspec
├── shitty_testcases.lua
└── utf8_simple.lua


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright © 2015 blitmap <coroutines@gmail.com>
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the “Software”), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # lua-utf8-simple
  2 | 
  3 | This "library" is meant to be a very thin helper that you can easily drop in to another project without really calling it a dependency.  It aims to provide the most minimal of handling functions for working with utf8 strings.  It does not aim to be feature-complete or even error-descriptive.  It works for what is practical but not complex.  You have been warned. =^__^=
  4 | 
  5 | ## The require() Line
  6 | 
  7 | ```lua
  8 | local utf8 = require('utf8_simple')
  9 | ```
 10 | 
 11 | ## The Only Functions You Need to Know
 12 | 
 13 | ### utf8.chars(s[, no_subs])
 14 | - s: (string) the utf8 string to iterate over (by characters)
 15 | - no_subs: (boolean) true yields each character's width in bytes instead of the character substring
 16 | 
 17 | ```lua
 18 | -- i is the character/letter index within the string
 19 | -- c is the utf8 character (string of 1 or more bytes)
 20 | -- b is the byte index within the string
 21 | for i, c, b in utf8.chars('Αγαπώ τηγανίτες') do
 22 | 	print(i, c, b)
 23 | end
 24 | ```
 25 | 
 26 | Output:
 27 | 
 28 | 	1	Α	1
 29 | 	2	γ	3
 30 | 	3	α	5
 31 | 	4	π	7
 32 | 	5	ώ	9
 33 | 	6		11
 34 | 	7	τ	12
 35 | 	8	η	14
 36 | 	9	γ	16
 37 | 	10	α	18
 38 | 	11	ν	20
 39 | 	12	ί	22
 40 | 	13	τ	24
 41 | 	14	ε	26
 42 | 	15	ς	28
 43 | 
 44 | ### ALTERNATE FORM
 45 | Creating small substrings can be a performance concern; the 2nd parameter to utf8.chars()
 46 | lets you receive the byte width of each character instead of the substring.
 47 | 
 48 | This is for situations when you only care about the byte width (less common).
 49 | 
 50 | ```lua
 51 | -- i is the character/letter index within the string
 52 | -- w is the utf8 character width (in bytes)
 53 | -- b is the byte index within the string
 54 | for i, w, b in utf8.chars('Αγαπώ τηγανίτες', true) do
 55 | 	print(i, w, b)
 56 | end
 57 | ```
 58 | 
 59 | Output:
 60 | 
 61 | 	1	2	1
 62 | 	2	2	3
 63 | 	3	2	5
 64 | 	4	2	7
 65 | 	5	2	9
 66 | 	6	1	11
 67 | 	7	2	12
 68 | 	8	2	14
 69 | 	9	2	16
 70 | 	10	2	18
 71 | 	11	2	20
 72 | 	12	2	22
 73 | 	13	2	24
 74 | 	14	2	26
 75 | 	15	2	28
 76 | 
 77 | ### utf8.map(s, f[, no_subs])
 78 | - s: (string) the utf8 string to map 'f' over
 79 | - f: (function) a function accepting: f(visual_index, utf8_char -or- width, byte_index)
 80 | - no_subs: (boolean) true means don't make small substrings from each character (byte width instead)
 81 | 
 82 | returns: (nothing)
 83 | 
 84 | ```lua
 85 | > utf8.map('Αγαπώ τηγανίτες', print) -- does the same as the first example above
 86 | ```
 87 | 
 88 | ```lua
 89 | > utf8.map('Αγαπώ τηγανίτες', print, true) -- the alternate form from above
 90 | ```
 91 | 
 92 | ## Others
 93 | 
 94 | ### utf8.len(s)
 95 | - s: (string) the utf8 string
 96 | 
 97 | returns: (number) the number of utf8 characters in s (not the byte length)
 98 | 
 99 | note: be aware of "invisible" utf8 characters
100 | 
101 | ```lua
102 | > = utf8.len('Αγαπώ τηγανίτες')
103 | 15
104 | ```
105 | 
106 | ### utf8.reverse(s)
107 | - s: (string) the utf8 string
108 | 
109 | returns: (string) the utf8-reversed form of s
110 | 
111 | note: reversing left-to-right utf8 strings that include directional formatting characters will look odd
112 | 
113 | ```lua
114 | > = utf8.reverse('Αγαπώ τηγανίτες')
115 | ςετίναγητ ώπαγΑ
116 | ```
117 | 
118 | ### utf8.strip(s)
119 | - s: (string) the utf8 string
120 | 
121 | returns: (string) s with all non-ascii characters removed (characters > 1 byte)
122 | 
123 | ```lua
124 | > = utf8.strip('cat♥dog')
125 | catdog
126 | ```
127 | 
128 | ### utf8.replace(s, map)
129 | - s: (string) the utf8 string
130 | - map: (table) keys are utf8 characters to replace, values are their replacement
131 | 
132 | returns: (string) s with all the key-characters in map replaced
133 | 
134 | note: the keys must be utf8 characters, the values **can** be strings
135 | 
136 | ```lua
137 | > = utf8.replace('∃y ∀x ¬(x ≺ y)', { ['∃'] = 'E', ['∀'] = 'A', ['¬'] = '\r\n', ['≺'] = '<' })
138 | Ey Ax 
139 | (x < y)
140 | ```
141 | 
142 | ### utf8.sub(s, i, j)
143 | - s: (string) the utf8 string
144 | - i: (number) the starting character index in the utf8 string (negative counts from the end)
145 | - j: (number, optional) the ending character index in the utf8 string (defaults to the last character)
146 | 
147 | returns: (string) the substring formed from i to j, inclusive (this is a utf8-aware string.sub())
148 | 
149 | ```lua
150 | > = utf8.sub('Αγαπώ τηγανίτες', 3, -5)
151 | απώ τηγαν
152 | ```
153 | 


--------------------------------------------------------------------------------
/lua-utf8-simple-1.rockspec:
--------------------------------------------------------------------------------
 1 | package = 'utf8_simple'
 2 | version = 'scm-1'
 3 | -- NOTE(review): LuaRocks expects this file to be named <package>-<version>.rockspec (utf8_simple-scm-1.rockspec)
 4 | source = { url = 'git+https://github.com/Pogs/lua-utf8-simple.git' } -- GitHub no longer serves unauthenticated git://
 5 | 
 6 | description =
 7 | 	{
 8 | 		summary    = 'Minimal functions for basic UTF-8 handling on Lua strings',
 9 | 		detailed   = 'Provides minimal functions for handling UTF-8 in Lua strings: chars(), map(), len(), reverse(), strip(), replace(), & sub()',
10 | 		homepage   = 'https://github.com/Pogs/lua-utf8-simple',
11 | 		license    = 'MIT', -- the LICENSE file carries the MIT license text
12 | 		maintainer = 'Sir Pogsalot <coroutines+github@gmail.com>'
13 | 	}
14 | 
15 | build = { type = 'builtin', modules = { utf8_simple = 'utf8_simple.lua' } }
16 | 


--------------------------------------------------------------------------------
/shitty_testcases.lua:
--------------------------------------------------------------------------------
  1 | -- assuming you're running the tests from within the clone
  2 | package.path = './?.lua;' .. package.path
  3 | 
  4 | local utf8 = require('utf8_simple')
  5 | 
  6 | local WRONG = {} -- set of failed test names: a failing test sets WRONG[name] = true
  7 | 
  8 | local tests = {} -- the test functions keyed by name, plus the run() driver
  9 | 
 10 | -- utf8.chars() must yield every character of 'Αγαπώ' exactly once, in order,
 11 | -- as (char_index, utf8_char, byte_index).
 12 | tests.chars =
 13 | 	function ()
 14 | 		local love     = { 'Α', 'γ', 'α', 'π', 'ώ' }
 15 | 		local byte_idx = { 1, 3, 5, 7, 9 }
 16 | 		local x        = 0
 17 | 
 18 | 		for i, c, b in utf8.chars('Αγαπώ') do
 19 | 			x = x + 1
 20 | 
 21 | 			if c ~= love[x] or i ~= x or b ~= byte_idx[x] then
 22 | 				WRONG.chars = true
 23 | 			end
 24 | 		end
 25 | 
 26 | 		-- an iterator that stops early (or yields nothing at all) must fail too:
 27 | 		-- every expected character has to have been visited
 28 | 		if x ~= #love then
 29 | 			WRONG.chars = true
 30 | 		end
 31 | 	end
 32 | 
 33 | -- utf8.len() counts characters, not bytes: 'Αγαπώ' is 5 characters long
 34 | function tests.len()
 35 | 	local count = utf8.len('Αγαπώ')
 36 | 
 37 | 	if count ~= 5 then WRONG.len = true end
 38 | end
 39 | 
 40 | -- utf8.sub() takes character indices; every row below is { i, j, expected }
 41 | function tests.sub()
 42 | 	local s = 'i αγαπώ cats'
 43 | 	local cases = {
 44 | 		{   3, nil, 'αγαπώ cats' }, {  -7, nil, 'πώ cats' }, -- i only: positive, negative
 45 | 		{   6,   7, 'πώ'         }, {  -7,  -6, 'πώ'      }, -- both indices: positive, negative
 46 | 		{ -70,  #s, s            }, {   1,  90, s         }, -- impossible indices: negative, positive
 47 | 		{   4,   4, 'γ'          }, {   8,   4, ''        }, -- single character, start after end
 48 | 	}
 49 | 	if pcall(utf8.sub, s) then WRONG.sub = true end -- calling without i must raise
 50 | 	for _, case in ipairs(cases) do
 51 | 		if utf8.sub(s, case[1], case[2]) ~= case[3] then WRONG.sub = true end
 52 | 	end
 53 | end
 54 | 
 55 | -- utf8.replace() must swap each mapped symbol for its replacement string
 56 | function tests.replace()
 57 | 	local map = { ['∃'] = 'E', ['∀'] = 'A', ['¬'] = '-', ['≺'] = '<' }
 58 | 
 59 | 	if utf8.replace('∃y ∀x ¬(x ≺ y)', map) ~= 'Ey Ax -(x < y)' then WRONG.replace = true end
 60 | end
 61 | 
 62 | -- utf8.reverse() must reverse by character, keeping multi-byte characters intact
 63 | function tests.reverse()
 64 | 	local reversed = utf8.reverse('Αγαπώ τηγανίτες')
 65 | 
 66 | 	if reversed ~= 'ςετίναγητ ώπαγΑ' then WRONG.reverse = true end
 67 | end
 68 | 
 69 | -- utf8.strip() must drop every multi-byte character and keep the ascii ones
 70 | function tests.strip()
 71 | 	local stripped = utf8.strip('cat♥dog∀cat♥dog')
 72 | 
 73 | 	if stripped ~= 'catdogcatdog' then WRONG.strip = true end
 74 | end
 75 | 
 76 | -- Collects the keys of t into a new array (in pairs() order, i.e. unspecified).
 77 | local function keys(t)
 78 | 	local found = {}
 79 | 
 80 | 	for key in pairs(t) do
 81 | 		found[#found + 1] = key
 82 | 	end
 83 | 
 84 | 	return found
 85 | end
 86 | 
 87 | -- Runs all tests in order and prints a summary; exits with status 1 on any failure.
 88 | function tests.run()
 89 | 	for _, name in ipairs({ 'chars', 'len', 'sub', 'replace', 'reverse', 'strip' }) do
 90 | 		print('testing ' .. name .. '..')
 91 | 		tests[name]()
 92 | 	end
 93 | 
 94 | 	if next(WRONG) then
 95 | 		local failed = keys(WRONG)
 96 | 		table.sort(failed) -- pairs() order is unspecified; sort for a stable report
 97 | 		print('problems in these functions: ' .. table.concat(failed, '(), ') .. '()')
 98 | 		os.exit(1) -- a non-zero status lets shells and CI notice the failure
 99 | 	end
100 | 	print('all tests succeeded! :D-S-<')
101 | end
102 | 
103 | tests.run()
104 | 


--------------------------------------------------------------------------------
/utf8_simple.lua:
--------------------------------------------------------------------------------
  1 | -- ABNF from RFC 3629
  2 | --
  3 | -- UTF8-octets = *( UTF8-char )
  4 | -- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
  5 | -- UTF8-1 = %x00-7F
  6 | -- UTF8-2 = %xC2-DF UTF8-tail
  7 | -- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
  8 | -- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
  9 | -- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
 10 | -- %xF4 %x80-8F 2( UTF8-tail )
 11 | -- UTF8-tail = %x80-BF
 12 | 
 13 | -- 0xxxxxxx                            | 007F   (127)
 14 | -- 110xxxxx	10xxxxxx                   | 07FF   (2047)
 15 | -- 1110xxxx	10xxxxxx 10xxxxxx          | FFFF   (65535)
 16 | -- 11110xxx	10xxxxxx 10xxxxxx 10xxxxxx | 10FFFF (1114111)
 17 | 
 18 | local pattern = '[%z\1-\127\194-\244][\128-\191]*' -- one utf8 character: a lead byte (ascii or 194-244) followed by any continuation bytes (128-191)
 19 | 
 20 | -- Resolve a possibly-negative index against a length: negative positions
 21 | -- count back from the end (-1 maps to len); others pass through untouched.
 22 | local function posrelat(pos, len)
 23 | 	if pos < 0 then
 24 | 		return len + pos + 1
 25 | 	end
 26 | 
 27 | 	return pos
 28 | end
 29 | 
 30 | local utf8 = {} -- the module table; every public function below is stored in it
 31 | 
 32 | -- THE MEAT
 33 | 
 34 | -- Calls f once for every utf8 character of s, in order, as
 35 | -- f(char_index, utf8_char, byte_index); when no_subs is truthy the second
 36 | -- argument is the character's width in bytes instead of a substring.
 37 | function utf8.map(s, f, no_subs)
 38 | 	local count = 0
 39 | 	if not no_subs then
 40 | 		for start, char in s:gmatch('()(' .. pattern .. ')') do
 41 | 			count = count + 1
 42 | 			f(count, char, start)
 43 | 		end
 44 | 		return
 45 | 	end
 46 | 
 47 | 	for start, stop in s:gmatch('()' .. pattern .. '()') do
 48 | 		count = count + 1
 49 | 		f(count, stop - start, start)
 50 | 	end
 51 | end
 52 | 
 53 | -- THE REST
 54 | 
 55 | -- Iterator form of utf8.map(): each call resumes the mapping and hands back
 56 | -- (char_index, utf8_char or byte width, byte_index) for the next character.
 57 | function utf8.chars(s, no_subs)
 58 | 	return coroutine.wrap(function () utf8.map(s, coroutine.yield, no_subs) end)
 59 | end
 60 | 
 61 | -- Number of utf8 characters in s: gsub's second result counts every byte
 62 | -- outside 128-193, i.e. every byte that is not a continuation byte.
 63 | function utf8.len(s)
 64 | 	local _, count = s:gsub('[^\128-\193]', '')
 65 | 	return count
 66 | end
 67 | 
 68 | -- Replaces each utf8 char of s that is a key in map with map's value for it.
 69 | -- Returns only the new string: the parentheses drop gsub's match count.
 70 | function utf8.replace(s, map)
 71 | 	return (s:gsub(pattern, map))
 72 | end
 73 | 
 74 | -- Reverses s character-wise: multi-byte characters are flipped in place first
 75 | -- so that the final byte-wise reverse restores their internal byte order.
 76 | function utf8.reverse(s)
 77 | 	local flipped = s:gsub(pattern, function (char)
 78 | 		return #char ~= 1 and char:reverse() -- false keeps single bytes as they are
 79 | 	end)
 80 | 	return flipped:reverse()
 81 | end
 82 | 
 83 | -- Removes every multi-byte (non-ascii) character from s. Returns only the
 84 | -- stripped string: the parentheses drop gsub's match count.
 85 | function utf8.strip(s)
 86 | 	return (s:gsub(pattern, function (c) return #c > 1 and '' end))
 87 | end
 88 | 
 89 | -- A utf8-aware string.sub(): i and j are character indices (not byte
 90 | -- offsets) and may be negative to count back from the end of s.
 91 | -- j defaults to the last character; out-of-range indices are clamped and
 92 | -- an empty string is returned when i ends up past j.
 93 | function utf8.sub(s, i, j)
 94 | 	local total = utf8.len(s)
 95 | 	local first = posrelat(i, total)
 96 | 	local last  = j and posrelat(j, total) or total
 97 | 
 98 | 	if first < 1     then first = 1     end
 99 | 	if last  > total then last  = total end
100 | 	if first > last  then return ''     end
101 | 
102 | 	-- no_subs mode: each call returns (char_index, byte_width, byte_index)
103 | 	local next_char = utf8.chars(s, true)
104 | 
105 | 	-- skip the characters in front of the first wanted one
106 | 	for _ = 1, first - 1 do
107 | 		next_char()
108 | 	end
109 | 
110 | 	local _, width, offset = next_char()
111 | 	local from = offset
112 | 
113 | 	if last ~= first then
114 | 		-- skip the characters strictly between the first and the last one ...
115 | 		for _ = 1, last - first - 1 do
116 | 			next_char()
117 | 		end
118 | 
119 | 		-- ... then pick up the width and byte index of the last one
120 | 		_, width, offset = next_char()
121 | 	end
122 | 
123 | 	return string.sub(s, from, offset + width - 1)
124 | end
125 | 
126 | return utf8 -- the module table is the value handed back by require('utf8_simple')
127 | 


--------------------------------------------------------------------------------