├── include
    └── AL
    │   └── utf8
    │       ├── charclass
    │           ├── compiletime
    │           │   ├── stub.lua
    │           │   ├── parser.lua
    │           │   ├── range.lua
    │           │   ├── builder.lua
    │           │   └── vanilla.lua
    │           └── runtime
    │           │   ├── init.lua
    │           │   ├── native.lua
    │           │   ├── dummy.lua
    │           │   └── base.lua
    │       ├── primitives
    │           ├── tarantool.lua
    │           ├── init.lua
    │           ├── native.lua
    │           └── dummy.lua
    │       ├── context
    │           ├── compiletime.lua
    │           └── runtime.lua
    │       ├── ends
    │           └── compiletime
    │           │   ├── parser.lua
    │           │   └── vanilla.lua
    │       ├── begins
    │           └── compiletime
    │           │   ├── parser.lua
    │           │   └── vanilla.lua
    │       ├── test.sh
    │       ├── test
    │           ├── test_utf8data.lua
    │           ├── strict.lua
    │           ├── util.lua
    │           ├── context_runtime.lua
    │           ├── charclass_runtime.lua
    │           ├── test_compat.lua
    │           ├── charclass_compiletime.lua
    │           ├── test.lua
    │           └── test_pm.lua
    │       ├── modifier
    │           └── compiletime
    │           │   ├── simple.lua
    │           │   ├── stub.lua
    │           │   ├── parser.lua
    │           │   ├── frontier.lua
    │           │   └── vanilla.lua
    │       ├── LICENSE
    │       ├── util.lua
    │       ├── init.lua
    │       ├── regex_parser.lua
    │       ├── README.md
    │       └── functions
    │           └── lua53.lua
├── LICENSE
├── README.md
└── autoload
    └── AL.Persian Toolkit.lua


/include/AL/utf8/charclass/compiletime/stub.lua:
--------------------------------------------------------------------------------
 1 | -- Compile-time charclass parser that always produces a class.
 2 | return function(utf8)
 3 | 
 4 | local builder = utf8.regex.compiletime.charclass.builder
 5 | 
 6 | -- Wraps `c` in a fresh class via with_codes and returns it together with the
 7 | -- distance from `bs` to utf8.next(str, bs). `ctx` is accepted but not used.
 8 | return function(str, c, bs, ctx)
 9 |   local literal = builder.new():with_codes(c)
10 |   local advance = utf8.next(str, bs) - bs
11 |   return literal, advance
12 | end
13 | 
14 | end
15 | 


--------------------------------------------------------------------------------
/include/AL/utf8/primitives/tarantool.lua:
--------------------------------------------------------------------------------
 1 | -- Primitives backend that takes four functions from a Tarantool-style utf8 module.
 2 | return function(utf8)
 3 | 
 4 | -- primitives.dummy is loaded first; the functions below are installed afterwards.
 5 | utf8:require "primitives.dummy"
 6 | 
 7 | -- A module supplied through config.tarantool_utf8 wins over require("utf8").
 8 | local source = utf8.config.tarantool_utf8 or require("utf8")
 9 | 
10 | for _, name in ipairs({ "lower", "upper", "len", "char" }) do
11 |   utf8[name] = source[name]
12 | end
13 | 
14 | return utf8
15 | end
16 | 


--------------------------------------------------------------------------------
/include/AL/utf8/context/compiletime.lua:
--------------------------------------------------------------------------------
 1 | -- Factory for the context table used while a pattern is being compiled.
 2 | return function(utf8)
 3 | 
 4 | local begins = utf8.config.begins
 5 | local ends = utf8.config.ends
 6 | 
 7 | -- Creates a context whose begins/ends come from default() of the first
 8 | -- registered handler of each kind. prev_class starts out unset (nil).
 9 | local function new()
10 |   local context = {}
11 |   context.begins = begins[1].default()
12 |   context.ends = ends[1].default()
13 |   context.funcs = {}
14 |   context.internal = false -- hack for ranges, flags if parser is in []
15 |   return context
16 | end
17 | 
18 | return { new = new }
19 | 
20 | end
21 | 


--------------------------------------------------------------------------------
/include/AL/utf8/ends/compiletime/parser.lua:
--------------------------------------------------------------------------------
 1 | -- Registers the handlers for the end of a pattern and the dispatcher over them.
 2 | return function(utf8)
 3 | 
 4 | -- Keep a handler list supplied through config; otherwise use the vanilla one.
 5 | if not utf8.config.ends then
 6 |   utf8.config.ends = {
 7 |     utf8:require "ends.compiletime.vanilla"
 8 |   }
 9 | end
10 | 
11 | -- Asks each handler in order; the first one whose parse() yields a truthy
12 | -- `functions` wins and its two results are returned. Returns nothing otherwise.
13 | function utf8.regex.compiletime.ends.parse(regex, c, bs, ctx)
14 |   for index, handler in ipairs(utf8.config.ends) do
15 |     local functions, move = handler.parse(regex, c, bs, ctx)
16 |     utf8.debug("ends", index, c, bs, move, functions)
17 |     if functions then
18 |       return functions, move
19 |     end
20 |   end
21 | end
22 | 
23 | end
24 | 


--------------------------------------------------------------------------------
/include/AL/utf8/begins/compiletime/parser.lua:
--------------------------------------------------------------------------------
 1 | -- Registers the handlers for the start of a pattern and the dispatcher over them.
 2 | return function(utf8)
 3 | 
 4 | -- Keep a handler list supplied through config; otherwise use the vanilla one.
 5 | if not utf8.config.begins then
 6 |   utf8.config.begins = {
 7 |     utf8:require "begins.compiletime.vanilla"
 8 |   }
 9 | end
10 | 
11 | -- Asks each handler in order; the first one whose parse() yields a truthy
12 | -- `functions` wins and its two results are returned. Returns nothing otherwise.
13 | function utf8.regex.compiletime.begins.parse(regex, c, bs, ctx)
14 |   for index, handler in ipairs(utf8.config.begins) do
15 |     local functions, move = handler.parse(regex, c, bs, ctx)
16 |     utf8.debug("begins", index, c, bs, move, functions)
17 |     if functions then
18 |       return functions, move
19 |     end
20 |   end
21 | end
22 | 
23 | end
24 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # Runs each test file under every Lua interpreter found on PATH.
 4 | # A missing interpreter is replaced by `true`, so its runs succeed without
 5 | # executing anything. Test paths are relative: start this script from the
 6 | # directory that contains test/.
 7 | 
 8 | set -xe
 9 | 
10 | # `command -v` is the POSIX lookup; `which` is not guaranteed under /bin/sh.
11 | lua53=$(command -v lua5.3 || command -v true)
12 | lua51=$(command -v lua5.1 || command -v true)
13 | luajit=$(command -v luajit || command -v true)
14 | 
15 | for test in \
16 |   test/charclass_compiletime.lua \
17 |   test/charclass_runtime.lua \
18 |   test/context_runtime.lua \
19 |   test/test.lua \
20 |   test/test_compat.lua \
21 |   test/test_pm.lua \
22 |   test/test_utf8data.lua
23 | do
24 |   # Quoted so an interpreter path containing spaces is not word-split.
25 |   "$lua53" "$test"
26 |   "$lua51" "$test"
27 |   "$luajit" "$test"
28 | done
29 | 
30 | echo "tests passed"
31 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test/test_utf8data.lua:
--------------------------------------------------------------------------------
 1 | -- Checks that lower()/upper() take their results from config.conversion.
 2 | local utf8uclc = require('init')
 3 | 
 4 | -- Table in which every missing key reads as the same constant letter.
 5 | local function constant_map(letter)
 6 |   return setmetatable({}, {
 7 |     __index = function() return letter end,
 8 |   })
 9 | end
10 | 
11 | utf8uclc.config = {
12 | --   debug = utf8:require("util").debug,
13 |   conversion = {
14 |     uc_lc = constant_map("l"),
15 |     lc_uc = constant_map("u"),
16 |   },
17 | }
18 | utf8uclc:init()
19 | 
20 | local assert_equals = require 'test.util'.assert_equals
21 | 
22 | -- Four input characters, so four replacement letters are expected.
23 | assert_equals(utf8uclc.lower("фыва"), "llll")
24 | assert_equals(utf8uclc.upper("фыва"), "uuuu")
25 | 


--------------------------------------------------------------------------------
/include/AL/utf8/charclass/runtime/init.lua:
--------------------------------------------------------------------------------
 1 | -- Selects the runtime charclass implementation.
 2 | return function(utf8)
 3 | 
 4 | local provided = utf8.config.runtime_charclasses
 5 | 
 6 | -- A configured value wins: a table is used as is, a function is called with
 7 | -- the utf8 object, anything else is passed to utf8:require as a module name.
 8 | if provided then
 9 |   local kind = type(provided)
10 |   if kind == "table" then return provided end
11 |   if kind == "function" then return provided(utf8) end
12 |   return utf8:require(provided)
13 | end
14 | 
15 | -- pcall's first result only says whether the ffi module could be loaded.
16 | local has_ffi = pcall(require, "ffi")
17 | 
18 | return utf8:require(has_ffi and "charclass.runtime.native" or "charclass.runtime.dummy")
19 | 
20 | end
21 | 


--------------------------------------------------------------------------------
/include/AL/utf8/modifier/compiletime/simple.lua:
--------------------------------------------------------------------------------
 1 | return function(utf8)
 2 | -- Code templates for modifier matchers; every entry returns Lua source text.
 3 | local matchers = {
 4 |   simple = function(class, name) -- class: source text of a charclass expression; name: suffix for the generated local
 5 |     local class_name = 'class' .. name
 6 |     return [[
 7 |   local ]] .. class_name .. [[ = ]] .. class .. [[
 8 | 
 9 |   add(function(ctx) -- simple
10 |     -- debug(ctx, 'simple', ']] .. class_name .. [[')
11 |     if ]] .. class_name .. [[:test(ctx:get_charcode()) then
12 |       ctx:next_char()
13 |       ctx:next_function()
14 |       return ctx:get_function()(ctx)
15 |     end
16 |   end)
17 | ]]
18 |   end,
19 | }
20 | -- The emitted code tests the current char code against the class and, on success, advances one char and runs the next function.
21 | return matchers
22 | 
23 | end
24 | 


--------------------------------------------------------------------------------
/include/AL/utf8/primitives/init.lua:
--------------------------------------------------------------------------------
 1 | -- Selects the primitives backend.
 2 | return function(utf8)
 3 | 
 4 | local provided = utf8.config.primitives
 5 | 
 6 | -- A configured value wins: a table is used as is, a function is called with
 7 | -- the utf8 object, anything else is passed to utf8:require as a module name.
 8 | if provided then
 9 |   local kind = type(provided)
10 |   if kind == "table" then return provided end
11 |   if kind == "function" then return provided(utf8) end
12 |   return utf8:require(provided)
13 | end
14 | 
15 | -- Otherwise probe in order: tarantool, then ffi, falling back to dummy.
16 | local backend = "primitives.dummy"
17 | if pcall(require, "tarantool") then
18 |   backend = "primitives.tarantool"
19 | elseif pcall(require, "ffi") then
20 |   backend = "primitives.native"
21 | end
22 | 
23 | return utf8:require(backend)
24 | 
25 | end
26 | 


--------------------------------------------------------------------------------
/include/AL/utf8/modifier/compiletime/stub.lua:
--------------------------------------------------------------------------------
 1 | -- Modifier handler that turns a pending charclass into a plain "simple" matcher.
 2 | return function(utf8)
 3 | 
 4 | local matchers = utf8:require("modifier.compiletime.simple")
 5 | 
 6 | -- When ctx.prev_class is set, returns a one-element list holding the simple
 7 | -- matcher source for it (named after `bs`) and clears prev_class.
 8 | -- The second result is always 0. `regex` and `c` are not inspected.
 9 | local function parse(regex, c, bs, ctx)
10 |   local pending = ctx.prev_class
11 |   if not pending then
12 |     return nil, 0
13 |   end
14 | 
15 |   local source = matchers.simple(pending, tostring(bs))
16 |   ctx.prev_class = nil
17 |   return { source }, 0
18 | end
19 | 
20 | -- Appends the simple matcher for a still-pending class to ctx.funcs (named
21 | -- after ctx.pos) and clears prev_class. Does nothing when no class is pending.
22 | local function check(ctx)
23 |   local pending = ctx.prev_class
24 |   if not pending then return end
25 | 
26 |   table.insert(ctx.funcs, matchers.simple(pending, tostring(ctx.pos)))
27 |   ctx.prev_class = nil
28 | end
29 | 
30 | return {
31 |   parse = parse,
32 |   check = check,
33 | }
34 | 
35 | end
36 | 


--------------------------------------------------------------------------------
/include/AL/utf8/modifier/compiletime/parser.lua:
--------------------------------------------------------------------------------
 1 | -- Registers the modifier handlers and the dispatcher over them.
 2 | return function(utf8)
 3 | 
 4 | -- Keep a handler list supplied through config; otherwise use the built-in
 5 | -- ones in this order: vanilla, frontier, stub.
 6 | if not utf8.config.modifier then
 7 |   utf8.config.modifier = {
 8 |     utf8:require "modifier.compiletime.vanilla",
 9 |     utf8:require "modifier.compiletime.frontier",
10 |     utf8:require "modifier.compiletime.stub",
11 |   }
12 | end
13 | 
14 | -- Asks each handler in order; the first one whose parse() yields a truthy
15 | -- `functions` wins. ctx.prev_class is cleared before its results are returned.
16 | function utf8.regex.compiletime.modifier.parse(regex, c, bs, ctx)
17 |   for index, handler in ipairs(utf8.config.modifier) do
18 |     local functions, move = handler.parse(regex, c, bs, ctx)
19 |     utf8.debug("mod", index, c, bs, move, functions and utf8.config.unpack(functions))
20 |     if functions then
21 |       ctx.prev_class = nil
22 |       return functions, move
23 |     end
24 |   end
25 | end
26 | 
27 | end
28 | 


--------------------------------------------------------------------------------
/include/AL/utf8/charclass/compiletime/parser.lua:
--------------------------------------------------------------------------------
 1 | -- Registers the compile-time charclass parsers and the dispatcher over them.
 2 | return function(utf8)
 3 | 
 4 | -- Keep a parser list supplied through config; otherwise use the built-in
 5 | -- ones in this order: vanilla, range, stub.
 6 | if not utf8.config.compiletime_charclasses then
 7 |   utf8.config.compiletime_charclasses = {
 8 |     utf8:require "charclass.compiletime.vanilla",
 9 |     utf8:require "charclass.compiletime.range",
10 |     utf8:require "charclass.compiletime.stub",
11 |   }
12 | end
13 | 
14 | -- Tries each parser in order. The first class produced is built and stored
15 | -- in ctx.prev_class, then the class and the parser's second result are returned.
16 | function utf8.regex.compiletime.charclass.parse(regex, c, bs, ctx)
17 |   utf8.debug("parse charclass():", regex, c, bs, regex[bs])
18 |   for index, parser in ipairs(utf8.config.compiletime_charclasses) do
19 |     local charclass, nbs = parser(regex, c, bs, ctx)
20 |     if charclass then
21 |       ctx.prev_class = charclass:build()
22 |       utf8.debug("cc", ctx.prev_class, index, c, bs, nbs)
23 |       return charclass, nbs
24 |     end
25 |   end
26 | end
27 | 
28 | end
29 | 


--------------------------------------------------------------------------------
/include/AL/utf8/ends/compiletime/vanilla.lua:
--------------------------------------------------------------------------------
 1 | return function(utf8)
 2 | -- Compile-time handlers for the end of a pattern; each matcher returns Lua source text.
 3 | local matchers = {
 4 |   any = function() -- emitted code: record the match end (ctx.pos - 1) and call ctx:done()
 5 |     return [[
 6 |   add(function(ctx) -- any
 7 |     ctx.result.finish = ctx.pos - 1
 8 |     ctx:done()
 9 |   end)
10 | ]]
11 |   end,
12 |   toend = function(ctx) -- `ctx` is never read; parse() below calls this without arguments
13 |     return [[
14 |   add(function(ctx) -- toend
15 |     ctx.result.finish = ctx.pos - 1
16 |     ctx.modified = true
17 |     if ctx.pos == utf8len(ctx.str) + 1 then ctx:done() end
18 |   end)
19 | ]]
20 |   end,
21 | }
22 | 
23 | local len = utf8.raw.len
24 | -- default(): returns the `any` matcher source.
25 | local function default()
26 |   return matchers.any()
27 | end
28 | -- parse(): a '$' at offset len(regex) yields the `toend` source with skip = 1; otherwise returns nil, 0.
29 | local function parse(regex, c, bs, ctx)
30 |   local functions
31 |   local skip = 0
32 | 
33 |   if bs == len(regex) and c == '$' then -- only a '$' in the last position counts
34 |     functions = matchers.toend()
35 |     skip = 1
36 |   end
37 | 
38 |   return functions, skip
39 | end
40 | 
41 | return {
42 |   parse = parse,
43 |   default = default,
44 | }
45 | 
46 | end
47 | 


--------------------------------------------------------------------------------
/include/AL/utf8/charclass/compiletime/range.lua:
--------------------------------------------------------------------------------
 1 | -- Compile-time parser for "a-b" ranges; only active inside [] (ctx.internal).
 2 | return function(utf8)
 3 | 
 4 | local cl = utf8.regex.compiletime.charclass.builder
 5 | 
 6 | local next_char = utf8.util.next
 7 | 
 8 | -- Reads <lower> '-' <upper> starting at `c` / byte offset `bs`; either bound
 9 | -- may be written with a leading '%'. Returns a class built with with_ranges
10 | -- plus the distance from `bs` to the offset following the upper bound.
11 | -- Returns nothing when ctx.internal is falsy, when no '-' follows the lower
12 | -- bound, or when the upper bound is '' or ']'.
13 | return function(str, c, bs, ctx)
14 |   if not ctx.internal then return end
15 | 
16 |   local char, pos = c, bs
17 | 
18 |   -- Lower bound: the character after '%', or the current character itself.
19 |   if char == '%' then
20 |     char, pos = next_char(str, pos)
21 |   end
22 |   local lower = char
23 | 
24 |   utf8.debug("range r1", lower, pos)
25 | 
26 |   char, pos = next_char(str, pos)
27 |   if char ~= '-' then return end
28 | 
29 |   -- Upper bound: stays nil when the text ends or the set closes right here.
30 |   local upper
31 |   char, pos = next_char(str, pos)
32 |   if char == '%' then
33 |     char, pos = next_char(str, pos)
34 |     upper = char
35 |   elseif not (char == '' or char == ']') then
36 |     upper = char
37 |   end
38 | 
39 |   utf8.debug("range r2", upper, pos)
40 | 
41 |   if not (lower and upper) then return end
42 | 
43 |   return cl.new():with_ranges{utf8.byte(lower), utf8.byte(upper)}, utf8.next(str, pos) - bs
44 | end
45 | 
46 | end
47 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 mzn928
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/include/AL/utf8/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Stepets
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test/strict.lua:
--------------------------------------------------------------------------------
 1 | --[[--
 2 | strict.lua from http://metalua.luaforge.net/src/lib/strict.lua.html
 3 | --]]--
 4 | 
 5 | --
 6 | -- strict.lua
 7 | -- checks uses of undeclared global variables
 8 | -- All global variables must be 'declared' through a regular assignment
 9 | -- (even assigning nil will do) in a main chunk before being used
10 | -- anywhere or assigned to inside a function.
11 | --
12 | -- Reuse the metatable already attached to _G, or attach a fresh one.
13 | local mt = getmetatable(_G)
14 | if mt == nil then
15 |   mt = {}
16 |   setmetatable(_G, mt)
17 | end
18 | -- __STRICT gates the assignment check below; mt.__declared records the known global names.
19 | __STRICT = true
20 | mt.__declared = {}
21 | -- Assignment hook: a new global may only be created from a main chunk or from C code.
22 | mt.__newindex = function (t, n, v)
23 |   if __STRICT and not mt.__declared[n] then
24 |     local w = debug.getinfo(2, "S").what -- kind of the function performing the assignment
25 |     if w ~= "main" and w ~= "C" then
26 |       error("assign to undeclared variable '"..n.."'", 2)
27 |     end
28 |     mt.__declared[n] = true
29 |   end
30 |   rawset(t, n, v) -- raw write, so this hook is not triggered again
31 | end
32 | -- Read hook: reading an undeclared global raises unless the reader is C code (not gated by __STRICT).
33 | mt.__index = function (t, n)
34 |   if not mt.__declared[n] and debug.getinfo(2, "S").what ~= "C" then
35 |     error("variable '"..n.."' is not declared", 2)
36 |   end
37 |   return rawget(t, n)
38 | end
39 | -- global(name, ...): marks the given names as declared without assigning them.
40 | function global(...)
41 |    for _, v in ipairs{...} do mt.__declared[v] = true end
42 | end
43 | 


--------------------------------------------------------------------------------
/include/AL/utf8/begins/compiletime/vanilla.lua:
--------------------------------------------------------------------------------
 1 | return function(utf8)
 2 | -- Compile-time handlers for the start of a pattern; each matcher returns Lua source text.
 3 | local matchers = {
 4 |   sliding = function() -- emitted code: try the following functions from each position while ctx.pos <= ctx.len
 5 |     return [[
 6 |     add(function(ctx) -- sliding
 7 |       while ctx.pos <= ctx.len do
 8 |         local clone = ctx:clone()
 9 |         -- debug('starting from', clone, "start_pos", clone.pos)
10 |         clone.result.start = clone.pos
11 |         clone:next_function()
12 |         clone:get_function()(clone)
13 | 
14 |         ctx:next_char()
15 |       end
16 |       ctx:terminate()
17 |     end)
18 | ]]
19 |   end,
20 |   fromstart = function(ctx) -- `ctx` is never read; emitted code makes one attempt at the current position
21 |     return [[
22 |     add(function(ctx) -- fromstart
23 |         if ctx.byte_pos > ctx.len then
24 |           return
25 |         end
26 |         ctx.result.start = ctx.pos
27 |         ctx:next_function()
28 |         ctx:get_function()(ctx)
29 |         ctx:terminate()
30 |     end)
31 | ]]
32 |   end,
33 | }
34 | -- default(): returns the `sliding` matcher source.
35 | local function default()
36 |   return matchers.sliding()
37 | end
38 | -- parse(): acts only when bs == 1; '^' selects `fromstart` with skip = 1, anything else `sliding` with skip = 0.
39 | local function parse(regex, c, bs, ctx)
40 |   if bs ~= 1 then return end -- returns nothing for any other offset
41 | 
42 |   local functions
43 |   local skip = 0
44 | 
45 |   if c == '^' then
46 |     functions = matchers.fromstart()
47 |     skip = 1
48 |   else
49 |     functions = matchers.sliding()
50 |   end
51 | 
52 |   return functions, skip
53 | end
54 | 
55 | return {
56 |   parse = parse,
57 |   default = default,
58 | }
59 | 
60 | end
61 | 


--------------------------------------------------------------------------------
/include/AL/utf8/primitives/native.lua:
--------------------------------------------------------------------------------
 1 | -- Primitives backend that performs case conversion through the C library (via ffi).
 2 | return function(utf8)
 3 | 
 4 | local ffi = require("ffi")
 5 | 
 6 | -- Set the ctype locale and declare towupper/towlower; the fallback locale
 7 | -- name and the declared integer type differ between Windows and other systems.
 8 | if ffi.os == "Windows" then
 9 |   os.setlocale(utf8.config.locale or "english_us.65001", "ctype")
10 |   ffi.cdef[[
11 |     short towupper(short c);
12 |     short towlower(short c);
13 |   ]]
14 | else
15 |   os.setlocale(utf8.config.locale or "C.UTF-8", "ctype")
16 |   ffi.cdef[[
17 |     int towupper(int c);
18 |     int towlower(int c);
19 |   ]]
20 | end
21 | 
22 | utf8:require "primitives.dummy"
23 | 
24 | -- Walks `str` character by character, passes each code point through
25 | -- `convert` and reassembles the results with utf8.char.
26 | -- NOTE(review): all code points are handed to utf8.char in a single unpack
27 | -- call; very long inputs may exceed the interpreter's unpack limit - confirm.
28 | local function map_codepoints(str, convert)
29 |   local total = utf8.raw.len(str)
30 |   local codes = {}
31 |   local pos = 1
32 | 
33 |   while pos <= total do
34 |     local following = utf8.next(str, pos)
35 |     local cp = utf8.unicode(str, pos, following)
36 |     codes[#codes + 1] = convert(cp)
37 |     pos = following
38 |   end
39 | 
40 |   return utf8.char(utf8.config.unpack(codes))
41 | end
42 | 
43 | -- The C symbols are looked up inside the closures, i.e. only when called.
44 | local function to_lower(cp) return ffi.C.towlower(cp) end
45 | local function to_upper(cp) return ffi.C.towupper(cp) end
46 | 
47 | -- Installed only when no upper-to-lower conversion table is configured.
48 | if not utf8.config.conversion.uc_lc then
49 |   function utf8.lower(str)
50 |     return map_codepoints(str, to_lower)
51 |   end
52 | end
53 | 
54 | -- Installed only when no lower-to-upper conversion table is configured.
55 | if not utf8.config.conversion.lc_uc then
56 |   function utf8.upper(str)
57 |     return map_codepoints(str, to_upper)
58 |   end
59 | end
60 | 
61 | return utf8
62 | end
63 | 


--------------------------------------------------------------------------------
/include/AL/utf8/util.lua:
--------------------------------------------------------------------------------
 1 | -- Shared helpers: table copy, debug logging and a "following character" reader.
 2 | return function(utf8)
 3 | 
 4 | -- Returns a copy of `obj`; non-table values are returned unchanged.
 5 | -- With `deep` set, table values are copied recursively (keys are kept as is).
 6 | -- NOTE(review): there is no cycle detection, so a deep copy of a
 7 | -- self-referencing table would recurse without end.
 8 | function utf8.util.copy(obj, deep)
 9 |   if type(obj) ~= 'table' then
10 |     return obj
11 |   end
12 | 
13 |   local result = {}
14 |   for k, v in pairs(obj) do
15 |     if deep then
16 |       result[k] = utf8.util.copy(v, true)
17 |     else
18 |       result[k] = v
19 |     end
20 |   end
21 |   return result
22 | end
23 | 
24 | -- Writes `val` through utf8.config.logger; tables are printed recursively,
25 | -- one "key = value" entry per line, with `tab` as the indentation prefix.
26 | local function dump(val, tab)
27 |   tab = tab or ''
28 | 
29 |   if type(val) == 'table' then
30 |     utf8.config.logger('{\n')
31 |     for k,v in pairs(val) do
32 |       utf8.config.logger(tab .. tostring(k) .. " = ")
33 |       dump(v, tab .. '\t')
34 |       utf8.config.logger("\n")
35 |     end
36 |     utf8.config.logger(tab .. '}\n')
37 |   else
38 |     utf8.config.logger(tostring(val))
39 |   end
40 | end
41 | 
42 | -- Logs every argument followed by a newline. Tables without a __tostring
43 | -- metamethod are dumped field by field; everything else goes through tostring.
44 | -- The arguments are counted with select("#", ...) so that a nil in the middle
45 | -- (callers routinely pass values that may be nil) does not cut the line short,
46 | -- which is what iterating with ipairs would do.
47 | function utf8.util.debug(...)
48 |   for i = 1, select("#", ...) do
49 |     local v = (select(i, ...))
50 |     if type(v) == "table" and not (getmetatable(v) or {}).__tostring then
51 |       dump(v, '\t')
52 |     else
53 |       utf8.config.logger(tostring(v), " ")
54 |     end
55 |   end
56 | 
57 |   utf8.config.logger('\n')
58 | end
59 | 
60 | -- Forwards to utf8.config.debug when one is configured; otherwise a no-op.
61 | function utf8.debug(...)
62 |   if utf8.config.debug then
63 |     utf8.config.debug(...)
64 |   end
65 | end
66 | 
67 | -- Returns the character that follows the one at byte offset `bs`, together
68 | -- with the byte offset at which that following character starts.
69 | function utf8.util.next(str, bs)
70 |   local nbs1 = utf8.next(str, bs)
71 |   local nbs2 = utf8.next(str, nbs1)
72 |   return utf8.raw.sub(str, nbs1, nbs2 - 1), nbs1
73 | end
74 | 
75 | return utf8.util
76 | 
77 | end
78 | 


--------------------------------------------------------------------------------
/include/AL/utf8/charclass/runtime/native.lua:
--------------------------------------------------------------------------------
 1 | -- Runtime charclass implementation that asks the C library's isw* predicates.
 2 | return function(utf8)
 3 | 
 4 | os.setlocale(utf8.config.locale, "ctype")
 5 | 
 6 | local ffi = require("ffi")
 7 | ffi.cdef[[
 8 |   int iswalnum(int c);
 9 |   int iswalpha(int c);
10 |   int iswascii(int c);
11 |   int iswblank(int c);
12 |   int iswcntrl(int c);
13 |   int iswdigit(int c);
14 |   int iswgraph(int c);
15 |   int iswlower(int c);
16 |   int iswprint(int c);
17 |   int iswpunct(int c);
18 |   int iswspace(int c);
19 |   int iswupper(int c);
20 |   int iswxdigit(int c);
21 | ]]
22 | 
23 | local base = utf8:require "charclass.runtime.base"
24 | 
25 | -- `native` falls back to `base` for anything it does not define itself.
26 | local native = setmetatable({}, {__index = base})
27 | local instance_mt = {__index = native}
28 | 
29 | function native.new()
30 |   return setmetatable({}, instance_mt)
31 | end
32 | 
33 | -- Class name -> name of the C predicate that decides membership.
34 | local predicate_of = {
35 |   alpha = "iswalpha",
36 |   cntrl = "iswcntrl",
37 |   digit = "iswdigit",
38 |   graph = "iswgraph",
39 |   lower = "iswlower",
40 |   punct = "iswpunct",
41 |   space = "iswspace",
42 |   upper = "iswupper",
43 |   alnum = "iswalnum",
44 |   xdigit = "iswxdigit",
45 | }
46 | 
47 | -- Returns true/false for a known class name (the C predicate's result is
48 | -- compared against 0); returns nothing for any other name.
49 | function native:is(class, char_code)
50 |   local symbol = predicate_of[class]
51 |   if symbol then
52 |     return ffi.C[symbol](char_code) ~= 0
53 |   end
54 | end
55 | 
56 | return native
57 | 
58 | end
59 | 


--------------------------------------------------------------------------------
/include/AL/utf8/charclass/runtime/dummy.lua:
--------------------------------------------------------------------------------
 1 | -- Runtime charclass implementation with hard-coded code point tables.
 2 | return function(utf8)
 3 | 
 4 | local base = utf8:require "charclass.runtime.base"
 5 | 
 6 | -- `dummy` falls back to `base` for anything it does not define itself.
 7 | local dummy = setmetatable({}, {__index = base})
 8 | local instance_mt = {__index = dummy}
 9 | 
10 | function dummy.new()
11 |   return setmetatable({}, instance_mt)
12 | end
13 | 
14 | -- Class name -> function adding that class's ranges/codes to a charclass.
15 | local class_builders = {
16 |   alpha = function(cc) cc:with_ranges({65, 90}, {97, 122}) end,
17 |   cntrl = function(cc) cc:with_ranges({0, 31}):with_codes(127) end,
18 |   digit = function(cc) cc:with_ranges({48, 57}) end,
19 |   graph = function(cc)
20 |     cc:with_ranges({1, 8}, {14, 31}, {33, 132}, {134, 159}, {161, 5759}, {5761, 8191}, {8203, 8231}, {8234, 8238}, {8240, 8286}, {8288, 12287})
21 |   end,
22 |   lower = function(cc) cc:with_ranges({97, 122}) end,
23 |   punct = function(cc) cc:with_ranges({33, 47}, {58, 64}, {91, 96}, {123, 126}) end,
24 |   space = function(cc)
25 |     cc:with_ranges({9, 13}):with_codes(32, 133, 160, 5760):with_ranges({8192, 8202}):with_codes(8232, 8233, 8239, 8287, 12288)
26 |   end,
27 |   upper = function(cc) cc:with_ranges({65, 90}) end,
28 |   alnum = function(cc) cc:with_ranges({48, 57}, {65, 90}, {97, 122}) end,
29 |   xdigit = function(cc) cc:with_ranges({48, 57}, {65, 70}, {97, 102}) end,
30 | }
31 | 
32 | -- Adds every named class to this charclass; unknown names are ignored.
33 | -- Returns self.
34 | function dummy:with_classes(...)
35 |   for _, name in ipairs({...}) do
36 |     local build = class_builders[name]
37 |     if build then
38 |       build(self)
39 |     end
40 |   end
41 |   return self
42 | end
43 | 
44 | -- Adds, as a sub-class, the inversion of a class made from the given names.
45 | -- Returns self unchanged when no names are given.
46 | function dummy:without_classes(...)
47 |   if #{...} == 0 then
48 |     return self
49 |   end
50 |   return self:with_subs(dummy.new():with_classes(...):invert())
51 | end
52 | 
53 | return dummy
54 | 
55 | end
56 | 


--------------------------------------------------------------------------------
/include/AL/utf8/modifier/compiletime/frontier.lua:
--------------------------------------------------------------------------------
 1 | return function(utf8)
 2 | -- Compile-time support for the %f[set] frontier modifier; matchers return Lua source text.
 3 | local matchers = {
 4 |   frontier = function(class, name) -- class: source text of a charclass expression; name: suffix for the generated local
 5 |     local class_name = 'class' .. name
 6 |     return [[
 7 |   local ]] .. class_name .. [[ = ]] .. class .. [[
 8 | 
 9 |   add(function(ctx) -- frontier
10 |     ctx:prev_char()
11 |     local prev_charcode = ctx:get_charcode() or 0
12 |     ctx:next_char()
13 |     local charcode = ctx:get_charcode() or 0
14 |     -- debug("frontier pos", ctx.pos, "prev_charcode", prev_charcode, "charcode", charcode)
15 |     if ]] .. class_name .. [[:test(prev_charcode) then return end
16 |     if ]] .. class_name .. [[:test(charcode) then
17 |       ctx:next_function()
18 |       return ctx:get_function()(ctx)
19 |     end
20 |   end)
21 | ]]
22 |   end,
23 |   simple = utf8:require("modifier.compiletime.simple").simple, -- reused below to emit a pending class first
24 | }
25 | -- parse(): handles '%f[' at offset bs. Returns nothing when '%' is not followed by 'f', and nil, nil when c is not '%'.
26 | local function parse(regex, c, bs, ctx)
27 |   local functions, nbs, class
28 | 
29 |   if c == '%' then
30 |     if utf8.raw.sub(regex, bs + 1, bs + 1) ~= 'f' then return end
31 |     if utf8.raw.sub(regex, bs + 2, bs + 2) ~= '[' then error("missing '[' after '%f' in pattern") end
32 |     -- A class still pending from the previous element is emitted first as a simple matcher.
33 |     functions = {}
34 |     if ctx.prev_class then
35 |       table.insert(functions, matchers.simple(ctx.prev_class, tostring(bs)))
36 |       ctx.prev_class = nil
37 |     end
38 |     class, nbs = utf8.regex.compiletime.charclass.parse(regex, '[', bs + 2, ctx) -- parse the set starting at the '['
39 |     nbs = nbs + 2 -- presumably accounts for the '%f' prefix; NOTE(review): raises if parse returned nil
40 |     table.insert(functions, matchers.frontier(class:build(), tostring(bs)))
41 |   end
42 | 
43 |   return functions, nbs
44 | end
45 | 
46 | return {
47 |   parse = parse,
48 | }
49 | 
50 | end
51 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Aegisub Persian Toolkit
 2 | Collection of tools that might help Persian translators.
 3 | 
 4 | [AnimDL.ir](https://www.animdl.ir) | [@AnimeList_ir](https://t.me/animelist_ir)
 5 | 
 6 | # How to install
 7 | 1. Copy autoload content to autoload directory of aegisub.
 8 | 2. Copy include content to include directory of aegisub.
 9 | 
10 | The directories mentioned above are at the locations below:
11 | - Windows:
12 | ```
13 | %appdata%\Aegisub\automation\
14 | ```
15 | - Linux:
16 | ```
17 | ~/.aegisub/automation/
18 | ```
19 | 
20 | (If the folders don't exist, you can create them yourself.)
21 | 
22 | # Scripts
23 | ## PakNevis
24 | Correct common mistakes in Persian text.
25 | ## Extend Move
26 | Extend \move based on the line's time (created for linear signs that go outside of the video boundaries).
27 | ## Unretard
28 | Converts non-RTL typed text to RTL compatible one.
29 | ## RTL / RTL
30 | Fix RTL languages displaying issues.
31 | ## RTL / Un-RTL
32 | Undo RTL function effects.
33 | ## RTL Editor (Edited version of MasafAutomation's RTL Editor)
34 | An editor for easy editing of RTL language lines.
35 | ## Split / Split at Tags (Based on Lyger's Split at Tags automation)
36 | A splitter (at tags) for RTL language lines.
37 | ## Split / Split at Spaces
38 | A splitter (at spaces) for RTL language lines.
39 | ## Split / Reverse + Split (at Tags)
40 | Split / Reverse at Tags + Split / Split at Tags.
41 | ## Split / Reverse at Tags
42 | Reverse line at tags to use it with other LTR automations.
43 | 
44 | # Credits
45 | - [utf8.lua](https://github.com/Stepets/utf8.lua)
46 | - [MasafAutomation](https://github.com/Majid110/MasafAutomation)
47 | - [Lyger's Automations](https://github.com/lyger/Aegisub_automation_scripts)
48 | 


--------------------------------------------------------------------------------
/include/AL/utf8/init.lua:
--------------------------------------------------------------------------------
 1 | -- Entry module: builds the utf8 object, its default configuration and the
 2 | -- module loader used by every other file of the library.
 3 | local module_path = ...
 4 | -- Turn the name this chunk was required under into a prefix for sub-modules.
 5 | module_path = module_path:match("^(.-)init$") or (module_path .. '.')
 6 | 
 7 | local ffi_enabled, ffi = pcall(require, 'ffi')
 8 | 
 9 | -- Fresh table with weak keys and weak values.
10 | local function weak_table()
11 |   return setmetatable({}, {
12 |     __mode = 'kv'
13 |   })
14 | end
15 | 
16 | local utf8 = {
17 |   config = {},
18 |   default = {
19 |     debug = nil,
20 |     logger = io.write,
21 |     loadstring = (loadstring or load),
22 |     unpack = (unpack or table.unpack),
23 |     cache = {
24 |       regex = weak_table(),
25 |       plain = weak_table(),
26 |     },
27 |     locale = nil,
28 |     -- An ffi uint32_t array of size + 1 elements when ffi loaded, else a plain table.
29 |     int32array = function(size)
30 |       if not ffi_enabled then
31 |         return {}
32 |       end
33 |       return ffi.new("uint32_t[?]", size + 1)
34 |     end,
35 |     conversion = {
36 |       uc_lc = nil,
37 |       lc_uc = nil
38 |     }
39 |   },
40 |   regex = {
41 |     compiletime = {
42 |       charclass = {},
43 |       begins = {},
44 |       ends = {},
45 |       modifier = {},
46 |     }
47 |   },
48 |   util = {},
49 | }
50 | 
51 | -- Loads the sub-module `name` relative to this module. A module that returns
52 | -- a function is called with the utf8 object and its result replaces the
53 | -- function in package.loaded; any other module value is returned as is.
54 | function utf8:require(name)
55 |   local full_module_path = module_path .. name
56 | 
57 |   local cached = package.loaded[full_module_path]
58 |   if cached then
59 |     return cached
60 |   end
61 | 
62 |   local mod = require(full_module_path)
63 |   if type(mod) == 'function' then
64 |     mod = mod(self)
65 |     package.loaded[full_module_path] = mod
66 |   end
67 |   return mod
68 | end
69 | 
70 | -- Fills every config entry that is nil or false from the defaults, then
71 | -- loads util, primitives.init and functions.lua53. Returns self.
72 | function utf8:init()
73 |   local config = self.config
74 |   for key, fallback in pairs(self.default) do
75 |     config[key] = config[key] or fallback
76 |   end
77 | 
78 |   self:require "util"
79 |   self:require "primitives.init"
80 |   self:require "functions.lua53"
81 | 
82 |   return self
83 | end
84 | 
85 | return utf8
86 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test/util.lua:
--------------------------------------------------------------------------------
 1 | require "test.strict"
 2 | 
 3 | local function equals(t1, t2)
 4 |   for k,v in pairs(t1) do
 5 |     if t2[k] == nil then return false end
 6 |     if type(t2[k]) == 'cdata' and type(v) == 'cdata' then
 7 |       return true -- don't know how to compare
 8 |     elseif type(t2[k]) == 'table' and type(v) == 'table' then
 9 |       if not equals(t2[k], v) then return false end
10 |     else
11 |       if t2[k] ~= v then return false end
12 |     end
13 |   end
14 |   for k,v in pairs(t2) do
15 |     if t1[k] == nil then return false end
16 |     if type(t1[k]) == 'cdata' and type(v) == 'cdata' then
17 |       return true -- don't know how to compare
18 |     elseif type(t1[k]) == 'table' and type(v) == 'table' then
19 |       if not equals(t1[k], v) then return false end
20 |     else
21 |       if t1[k] ~= v then return false end
22 |     end
23 |   end
24 |   return true
25 | end
26 | 
local old_tostring = tostring

--- Table-aware replacement for the built-in `tostring`.
-- Tables are rendered as `{key = value, key = value, }`, recursing into
-- nested tables; entry order follows `pairs`. Everything else is delegated
-- to the original `tostring`.
local function tostring(v)
  if type(v) ~= 'table' then
    return old_tostring(v)
  end

  local pieces = { "{" }
  for key, value in pairs(v) do
    pieces[#pieces + 1] = tostring(key) .. ' = ' .. tostring(value) .. ', '
  end
  pieces[#pieces + 1] = '}'
  return table.concat(pieces)
end
40 | 
local old_assert = assert

--- Assertion whose error message is the concatenation of all extra arguments.
-- The arguments are walked by position using `select('#', ...)`, so they are
-- joined in call order and a `nil` argument is rendered as "nil" rather than
-- being dropped. Each part is converted with the `tostring` that is in scope
-- at this point of the file.
-- @param cond value to check
-- @param ... message parts
-- @return cond when it is truthy; raises an error otherwise
local assert = function(cond, ...)
  if not cond then
    local count = select('#', ...)
    local parts = {}
    for i = 1, count do
      -- extra parentheses keep only the i-th argument
      parts[i] = tostring((select(i, ...)))
    end
    error(count > 0 and table.concat(parts) or "assertion failed!")
  end
  return cond
end
62 | 
--- Asserts that `b` (the actual value) matches `a` (the expected value).
-- Two tables match when `equals` accepts them; anything else, including two
-- tables rejected by `equals`, is compared with `==`.
local function assert_equals(a, b)
  local matches
  if type(a) == 'table' and type(b) == 'table' and equals(a, b) then
    matches = true
  else
    matches = (a == b)
  end

  -- nil/false are turned into text up front so that they do not leave a
  -- hole in the argument list handed to assert
  local expected = a or tostring(a)
  local got = b or tostring(b)

  assert(matches, "expected: ", expected, "\n", "got: ", got)
end
70 | 
71 | return {
72 |   equals = equals,
73 |   assert = assert,
74 |   assert_equals = assert_equals,
75 | }
76 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test/context_runtime.lua:
--------------------------------------------------------------------------------
 1 | local utf8 = require("init"):init()
 2 | 
 3 | local context = utf8:require('context.runtime')
 4 | 
 5 | local equals = require('test.util').equals
 6 | local assert = require('test.util').assert
 7 | local assert_equals = require('test.util').assert_equals
 8 | 
 9 | local ctx_en
10 | local ctx_ru
11 | local function setup()
12 |   ctx_en = context.new({str = 'asdf'})
13 |   ctx_ru = context.new({str = 'фыва'})
14 | end
15 | 
16 | local test_get_char = (function()
17 |   setup()
18 | 
19 |   assert_equals('a', ctx_en:get_char())
20 |   assert_equals('ф', ctx_ru:get_char())
21 | end)()
22 | 
23 | local test_get_charcode = (function()
24 |   setup()
25 | 
26 |   assert_equals(utf8.byte'a', ctx_en:get_charcode())
27 |   assert_equals(utf8.byte'ф', ctx_ru:get_charcode())
28 | end)()
29 | 
30 | local test_next_char = (function()
31 |   setup()
32 | 
33 |   assert_equals(1, ctx_en.pos)
34 |   assert_equals(1, ctx_ru.pos)
35 | 
36 |   ctx_ru:next_char()
37 |   ctx_en:next_char()
38 | 
39 |   assert_equals(2, ctx_en.pos)
40 |   assert_equals(2, ctx_ru.pos)
41 | 
42 |   assert_equals('s', ctx_en:get_char())
43 |   assert_equals('ы', ctx_ru:get_char())
44 |   assert_equals(utf8.byte's', ctx_en:get_charcode())
45 |   assert_equals(utf8.byte'ы', ctx_ru:get_charcode())
46 | end)()
47 | 
48 | local test_clone = (function()
49 |   setup()
50 | 
51 |   local clone = ctx_en:clone()
52 | 
53 |   assert(getmetatable(clone) == getmetatable(ctx_en))
54 |   assert_equals(clone, ctx_en)
55 | 
56 |   ctx_en:next_char()
57 | 
58 |   assert_equals('a', clone:get_char())
59 |   assert_equals('s', ctx_en:get_char())
60 | 
61 | end)()
62 | 
63 | local test_last_char = (function()
64 |   ctx_en = context.new({str = 'asdf', pos = 4})
65 |   ctx_ru = context.new({str = 'фыва', pos = 4})
66 | 
67 |   assert_equals('f', ctx_en:get_char())
68 |   assert_equals('а', ctx_ru:get_char())
69 | 
70 |   ctx_ru:next_char()
71 |   ctx_en:next_char()
72 | 
73 |   assert_equals(5, ctx_en.pos)
74 |   assert_equals(5, ctx_ru.pos)
75 | 
76 |   assert_equals("", ctx_en:get_char())
77 |   assert_equals("", ctx_ru:get_char())
78 |   assert_equals(nil, ctx_en:get_charcode())
79 |   assert_equals(nil, ctx_ru:get_charcode())
80 | end)()
81 | 
82 | print('OK')
83 | 


--------------------------------------------------------------------------------
/include/AL/utf8/regex_parser.lua:
--------------------------------------------------------------------------------
 1 | return function(utf8)
 2 | 
 3 | utf8:require "modifier.compiletime.parser"
 4 | utf8:require "charclass.compiletime.parser"
 5 | utf8:require "begins.compiletime.parser"
 6 | utf8:require "ends.compiletime.parser"
 7 | 
 8 | local gensub = utf8.gensub
 9 | local sub = utf8.sub
10 | 
11 | local parser_context = utf8:require "context.compiletime"
12 | 
13 | return function(regex, plain)
14 |   utf8.debug("regex", regex)
15 |   local ctx = parser_context:new()
16 | 
17 |   local skip = {0}
18 |   for nbs, c, bs in gensub(regex, 0), skip do
19 |     repeat -- continue
20 |       skip[1] = 0
21 | 
22 |       c = utf8.raw.sub(regex, bs, utf8.next(regex, bs) - 1)
23 | 
24 |       local functions, move = utf8.regex.compiletime.begins.parse(regex, c, bs, ctx)
25 |       if functions then
26 |         ctx.begins = functions
27 |         skip[1] = move
28 |       end
29 |       if skip[1] ~= 0 then break end
30 | 
31 |       local functions, move = utf8.regex.compiletime.ends.parse(regex, c, bs, ctx)
32 |       if functions then
33 |         ctx.ends = functions
34 |         skip[1] = move
35 |       end
36 |       if skip[1] ~= 0 then break end
37 | 
38 |       local functions, move = utf8.regex.compiletime.modifier.parse(regex, c, bs, ctx)
39 |       if functions then
40 |         for _, f in ipairs(functions) do
41 |           ctx.funcs[#ctx.funcs + 1] = f
42 |         end
43 |         skip[1] = move
44 |       end
45 |       if skip[1] ~= 0 then break end
46 | 
47 |       local charclass, move = utf8.regex.compiletime.charclass.parse(regex, c, bs, ctx)
48 |       if charclass then skip[1] = move end
49 |     until true -- continue
50 |   end
51 | 
52 |   for _, m in ipairs(utf8.config.modifier) do
53 |     if m.check then m.check(ctx) end
54 |   end
55 | 
56 |   local src = [[
57 |   return function(str, init, utf8)
58 |       local ctx = utf8:require("context.runtime").new({str = str, pos = init or 1})
59 |       local cl = utf8:require("charclass.runtime.init")
60 |       local utf8sub = utf8.sub
61 |       local rawsub = utf8.raw.sub
62 |       local utf8len = utf8.len
63 |       local utf8next = utf8.next
64 |       local debug = utf8.debug
65 |       local function add(fun)
66 |           ctx.functions[#ctx.functions + 1] = fun
67 |       end
68 |   ]] .. ctx.begins
69 |   for _, v in ipairs(ctx.funcs) do src = src .. v end
70 |   src = src .. ctx.ends .. [[
71 |       return coroutine.wrap(ctx:get_function())(ctx)
72 |   end
73 |   ]]
74 | 
75 |   utf8.debug(regex, src)
76 | 
77 |   return assert(utf8.config.loadstring(src, (plain and "plain " or "") .. regex))()
78 | end
79 | 
80 | end
81 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test/charclass_runtime.lua:
--------------------------------------------------------------------------------
  1 | local utf8 = require("init")
  2 | utf8.config = {
  3 |   debug = nil, --utf8:require("util").debug
  4 | }
  5 | utf8:init()
  6 | 
  7 | local cl = utf8:require("charclass.runtime.init")
  8 | 
  9 | local equals = require('test.util').equals
 10 | local assert = require('test.util').assert
 11 | local assert_equals = require('test.util').assert_equals
 12 | 
 13 | assert_equals(true, cl.new()
 14 |   :with_codes(utf8.byte' ')
 15 |   :invert()
 16 |   :in_codes(utf8.byte' '))
 17 | 
 18 | assert_equals(false, cl.new()
 19 |   :with_codes(utf8.byte' ')
 20 |   :invert()
 21 |   :test(utf8.byte' '))
 22 | 
 23 | assert_equals(false, cl.new()
 24 |   :with_codes()
 25 |   :with_ranges()
 26 |   :with_classes('space')
 27 |   :without_classes()
 28 |   :with_subs()
 29 |   :invert()
 30 |   :test(utf8.byte(' ')))
 31 | 
 32 | assert_equals(true, cl.new()
 33 |   :with_codes()
 34 |   :with_ranges()
 35 |   :with_classes()
 36 |   :without_classes('space')
 37 |   :with_subs()
 38 |   :invert()
 39 |   :test(utf8.byte(' ')))
 40 | 
 41 | assert_equals(false, cl.new()
 42 |   :with_codes()
 43 |   :with_ranges()
 44 |   :with_classes()
 45 |   :without_classes()
 46 |   :with_subs(cl.new():with_classes('space'))
 47 |   :invert()
 48 |   :test(utf8.byte(' ')))
 49 | 
 50 | assert_equals(true, cl.new()
 51 |   :with_codes()
 52 |   :with_ranges()
 53 |   :with_classes()
 54 |   :without_classes()
 55 |   :with_subs(cl.new():with_classes('space'):invert())
 56 |   :invert()
 57 |   :test(utf8.byte(' ')))
 58 | 
 59 | assert_equals(true, cl.new()
 60 |   :with_codes()
 61 |   :with_ranges()
 62 |   :with_classes('punct', 'digit', 'space', 'cntrl')
 63 |   :without_classes()
 64 |   :with_subs()
 65 |   :invert()
 66 |   :test(utf8.byte'П')
 67 | )
 68 | 
 69 | assert_equals(true, cl.new()
 70 |   :with_codes()
 71 |   :with_ranges()
 72 |   :with_classes('punct', 'digit', 'space', 'cntrl')
 73 |   :without_classes()
 74 |   :with_subs()
 75 |   :invert()
 76 |   :test(utf8.byte'и')
 77 | )
 78 | 
 79 | assert_equals(true, cl.new()
 80 |   :with_codes()
 81 |   :with_ranges()
 82 |   :with_classes()
 83 |   :without_classes('space')
 84 |   :with_subs()
 85 |   :test(utf8.byte'f')
 86 | )
 87 | 
 88 | assert_equals(false, cl.new()
 89 |   :with_codes()
 90 |   :with_ranges()
 91 |   :with_classes()
 92 |   :without_classes('space')
 93 |   :with_subs()
 94 |   :test(utf8.byte'\n')
 95 | )
 96 | 
 97 | assert_equals(false, cl.new()
 98 |   :with_codes()
 99 |   :with_ranges()
100 |   :with_classes('lower')
101 |   :without_classes()
102 |   :with_subs()
103 |   :invert()
104 |   :test(nil)
105 | )
106 | 
107 | assert_equals(false, cl.new()
108 |   :with_codes()
109 |   :with_ranges()
110 |   :with_classes('lower')
111 |   :without_classes()
112 |   :with_subs()
113 |   :test(nil)
114 | )
115 | 
116 | print "OK"
117 | 


--------------------------------------------------------------------------------
/include/AL/utf8/context/runtime.lua:
--------------------------------------------------------------------------------
  1 | return function(utf8)
  2 | 
  3 | local utf8unicode = utf8.unicode
  4 | local utf8sub = utf8.sub
  5 | local sub = utf8.raw.sub
  6 | local byte = utf8.raw.byte
  7 | local utf8len = utf8.len
  8 | local utf8next = utf8.next
  9 | local rawgsub = utf8.raw.gsub
 10 | local utf8offset = utf8.offset
 11 | local utf8char = utf8.char
 12 | 
 13 | local util = utf8.util
 14 | 
 15 | local ctx = {}
 16 | local mt = {
 17 |   __index = ctx,
 18 |   __tostring = function(self)
 19 |     return rawgsub([[str: '${str}', char: ${pos} '${char}', func: ${func_pos}]], "${(.-)}", {
 20 |       str = self.str,
 21 |       pos = self.pos,
 22 |       char = self:get_char(),
 23 |       func_pos = self.func_pos,
 24 |     })
 25 |   end
 26 | }
 27 | 
 28 | function ctx.new(obj)
 29 |   obj = obj or {}
 30 |   local res = setmetatable({
 31 |     pos = obj.pos or 1,
 32 |     byte_pos = obj.pos or 1,
 33 |     str = assert(obj.str, "str is required"),
 34 |     len = obj.len,
 35 |     rawlen = obj.rawlen,
 36 |     bytes = obj.bytes,
 37 |     offsets = obj.offsets,
 38 |     starts = obj.starts or nil,
 39 |     functions = obj.functions or {},
 40 |     func_pos = obj.func_pos or 1,
 41 |     ends = obj.ends or nil,
 42 |     result = obj.result and util.copy(obj.result) or {},
 43 |     captures = obj.captures and util.copy(obj.captures, true) or {active = {}},
 44 |     modified = false,
 45 |   }, mt)
 46 |   if not res.bytes then
 47 |     local str = res.str
 48 |     local l = #str
 49 |     local bytes = utf8.config.int32array(l)
 50 |     local offsets = utf8.config.int32array(l)
 51 |     local c, bs, i = nil, 1, 1
 52 |     while bs <= l do
 53 |       bytes[i] = utf8unicode(str, bs, bs)
 54 |       offsets[i] = bs
 55 |       bs = utf8.next(str, bs)
 56 |       i = i + 1
 57 |     end
 58 |     res.bytes = bytes
 59 |     res.offsets = offsets
 60 |     res.byte_pos = res.pos
 61 |     res.len = i
 62 |     res.rawlen = l
 63 |   end
 64 | 
 65 |   return res
 66 | end
 67 | 
 68 | function ctx:clone()
 69 |   return self:new()
 70 | end
 71 | 
 72 | function ctx:next_char()
 73 |   self.pos = self.pos + 1
 74 |   self.byte_pos = self.pos
 75 | end
 76 | 
 77 | function ctx:prev_char()
 78 |   self.pos = self.pos - 1
 79 |   self.byte_pos = self.pos
 80 | end
 81 | 
 82 | function ctx:get_char()
 83 |   if self.len <= self.pos then return "" end
 84 |   return utf8char(self.bytes[self.pos])
 85 | end
 86 | 
 87 | function ctx:get_charcode()
 88 |   if self.len <= self.pos then return nil end
 89 |   return self.bytes[self.pos]
 90 | end
 91 | 
 92 | function ctx:next_function()
 93 |   self.func_pos = self.func_pos + 1
 94 | end
 95 | 
 96 | function ctx:get_function()
 97 |   return self.functions[self.func_pos]
 98 | end
 99 | 
--- Reports a match: yields the context together with its result and captures
-- from the running coroutine.
function ctx:done()
  utf8.debug('done', self)
  local result, captures = self.result, self.captures
  coroutine.yield(self, result, captures)
end

--- Reports a failed match: yields nil from the running coroutine.
function ctx:terminate()
  utf8.debug('terminate', self)
  coroutine.yield(nil)
end
109 | 
110 | return ctx
111 | 
112 | end
113 | 


--------------------------------------------------------------------------------
/include/AL/utf8/README.md:
--------------------------------------------------------------------------------
 1 | # utf8.lua
 2 | Pure-Lua implementation of Lua 5.3-style regex (pattern matching) for UTF-8 strings; works with Lua 5.3, Lua 5.1 and LuaJIT.
 3 | 
 4 | This library provides a simple way to add UTF-8 support to your application.
 5 | 
 6 | #### Example:
 7 | ```Lua
 8 | local utf8 = require('.utf8'):init()
 9 | for k,v in pairs(utf8) do
10 |   string[k] = v
11 | end
12 | 
13 | local str = "пыщпыщ ололоо я водитель нло"
14 | print(str:find("(.л.+)н"))
15 | -- 8	26	ололоо я водитель
16 | 
17 | print(str:gsub("ло+", "보라"))
18 | -- пыщпыщ о보라보라 я водитель н보라	3
19 | 
20 | print(str:match("^п[лопыщ ]*я"))
21 | -- пыщпыщ ололоо я
22 | ```
23 | 
24 | #### Usage:
25 | 
26 | This library can be used as a drop-in replacement for the vanilla string library. It exports all vanilla functions under the `raw` sub-object.
27 | 
28 | ```Lua
29 | local utf8 = require('.utf8'):init()
30 | local str = "пыщпыщ ололоо я водитель нло"
31 | utf8.gsub(str, "ло+", "보라")
32 | -- пыщпыщ о보라보라 я водитель н보라	3
33 | utf8.raw.gsub(str, "ло+", "보라")
34 | -- пыщпыщ о보라보라о я водитель н보라	3
35 | ```
36 | 
37 | It also provides all functions from the Lua 5.3 UTF-8 [module](https://www.lua.org/manual/5.3/manual.html#6.5) except `utf8.len (s [, i [, j]])`. If you need to validate your strings, use `utf8.validate(str, byte_pos)` or iterate over them with `utf8.validator`.
38 | 
39 | Please note that the library assumes regexes are valid UTF-8 strings; if you need to manipulate individual bytes, use the vanilla functions under `utf8.raw`.
40 | 
41 | 
42 | #### Installation:
43 | 
44 | Download the repository into your project folder (no rockspecs yet).
45 | 
46 | The examples assume the library is placed under a `utf8` subfolder, not `utf8.lua`.
47 | 
48 | As of Lua 5.3, the built-in `utf8` module takes precedence over a user-provided one. In this case you can specify the full module path (`.utf8`).
49 | 
50 | #### Configuration:
51 | 
52 | The library is highly modular. You can provide your own implementation for almost any function used. The library already has several back-ends:
53 | - [Runtime character class processing](charclass/runtime/init.lua) using hardcoded codepoint ranges or using native functions through `ffi`.
54 | - [Basic functions](primitives/init.lua) for working with UTF-8 characters have specializations for `ffi`-enabled runtime and for tarantool.
55 | 
56 | Probably the most interesting [customizations](init.lua) are `utf8.config.loadstring` and `utf8.config.cache`, if you want to precompile your regexes.
57 | 
58 | ```Lua
59 | local utf8 = require('.utf8')
60 | utf8.config = {
61 |   cache = my_smart_cache,
62 | }
63 | utf8:init()
64 | ```
65 | 
66 | For `lower` and `upper` functions to work in environments where `ffi` cannot be used, you can specify substitution tables ([data example](https://github.com/artemshein/luv/blob/master/utf8data.lua))
67 | 
68 | ```Lua
69 | local utf8 = require('.utf8')
70 | utf8.config = {
71 |   conversion = {
72 |     uc_lc = utf8_uc_lc,
73 |     lc_uc = utf8_lc_uc
74 |   },
75 | }
76 | utf8:init()
77 | ```
78 | Customization is done before initialization. If you want, you can change the configuration after `init`; this might work for everything except modules, all of which would have to be reloaded.
79 | 
80 | #### [Documentation:](test/test.lua)
81 | 
82 | #### Issue reporting:
83 | 
84 | Please provide an example script that causes the error, together with a description of your environment and the debug output. Debug output can be obtained like this:
85 | ```Lua
86 | local utf8 = require('.utf8')
87 | utf8.config = {
88 |   debug = utf8:require("util").debug
89 | }
90 | utf8:init()
91 | -- your code
92 | ```
93 | The default logger is [`io.write`](https://www.lua.org/manual/5.3/manual.html#pdf-io.write); it can be changed by specifying `logger = my_logger` in the configuration.
94 | 


--------------------------------------------------------------------------------
/include/AL/utf8/charclass/compiletime/builder.lua:
--------------------------------------------------------------------------------
  1 | return function(utf8)
  2 | 
  3 | local byte = utf8.byte
  4 | local unpack = utf8.config.unpack
  5 | 
  6 | local builder = {}
  7 | local mt = {__index = builder}
  8 | 
  9 | utf8.regex.compiletime.charclass.builder = builder
 10 | 
 11 | function builder.new()
 12 |   return setmetatable({}, mt)
 13 | end
 14 | 
 15 | function builder:invert()
 16 |   self.inverted = true
 17 |   return self
 18 | end
 19 | 
 20 | function builder:internal() -- is it enclosed in []
 21 |   self.internal = true
 22 |   return self
 23 | end
 24 | 
 25 | function builder:with_codes(...)
 26 |   local codes = {...}
 27 |   self.codes = self.codes or {}
 28 | 
 29 |   for _, v in ipairs(codes) do
 30 |     table.insert(self.codes, type(v) == "number" and v or byte(v))
 31 |   end
 32 | 
 33 |   table.sort(self.codes)
 34 |   return self
 35 | end
 36 | 
 37 | function builder:with_ranges(...)
 38 |   local ranges = {...}
 39 |   self.ranges = self.ranges or {}
 40 | 
 41 |   for _, v in ipairs(ranges) do
 42 |     table.insert(self.ranges, v)
 43 |   end
 44 | 
 45 |   return self
 46 | end
 47 | 
 48 | function builder:with_classes(...)
 49 |   local classes = {...}
 50 |   self.classes = self.classes or {}
 51 | 
 52 |   for _, v in ipairs(classes) do
 53 |     table.insert(self.classes, v)
 54 |   end
 55 | 
 56 |   return self
 57 | end
 58 | 
 59 | function builder:without_classes(...)
 60 |   local not_classes = {...}
 61 |   self.not_classes = self.not_classes or {}
 62 | 
 63 |   for _, v in ipairs(not_classes) do
 64 |     table.insert(self.not_classes, v)
 65 |   end
 66 | 
 67 |   return self
 68 | end
 69 | 
 70 | function builder:include(b)
 71 |   if not b.inverted then
 72 |     if b.codes then
 73 |       self:with_codes(unpack(b.codes))
 74 |     end
 75 |     if b.ranges then
 76 |       self:with_ranges(unpack(b.ranges))
 77 |     end
 78 |     if b.classes then
 79 |       self:with_classes(unpack(b.classes))
 80 |     end
 81 |     if b.not_classes then
 82 |       self:without_classes(unpack(b.not_classes))
 83 |     end
 84 |   else
 85 |     self.includes = self.includes or {}
 86 |     self.includes[#self.includes + 1] = b
 87 |   end
 88 |   return self
 89 | end
 90 | 
 91 | function builder:build()
 92 |   if self.codes and #self.codes == 1 and not self.inverted and not self.ranges and not self.classes and not self.not_classes and not self.includes then
 93 |     return "{test = function(self, cc) return cc == " .. self.codes[1] .. " end}"
 94 |   else
 95 |     local codes_list = table.concat(self.codes or {}, ', ')
 96 |     local ranges_list = ''
 97 |     for i, r in ipairs(self.ranges or {}) do ranges_list = ranges_list .. (i > 1 and ', {' or '{') .. tostring(r[1]) .. ', ' .. tostring(r[2]) .. '}' end
 98 |     local classes_list = ''
 99 |     if self.classes then classes_list = "'" .. table.concat(self.classes, "', '") .. "'" end
100 |     local not_classes_list = ''
101 |     if self.not_classes then not_classes_list = "'" .. table.concat(self.not_classes, "', '") .. "'" end
102 | 
103 |     local subs_list = ''
104 |     for i, r in ipairs(self.includes or {}) do subs_list = subs_list .. (i > 1 and ', ' or '') .. r:build() .. '' end
105 | 
106 |     local src = [[cl.new():with_codes(
107 |         ]] .. codes_list .. [[
108 |       ):with_ranges(
109 |         ]] .. ranges_list .. [[
110 |       ):with_classes(
111 |         ]] .. classes_list .. [[
112 |       ):without_classes(
113 |         ]] .. not_classes_list .. [[
114 |       ):with_subs(
115 |         ]] .. subs_list .. [[
116 |       )]]
117 | 
118 |     if self.inverted then
119 |       src = src .. ':invert()'
120 |     end
121 | 
122 |     return src
123 |   end
124 | end
125 | 
126 | return builder
127 | 
128 | end
129 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test/test_compat.lua:
--------------------------------------------------------------------------------
  1 | local utf8 = require 'init'
  2 | utf8.config = {
  3 |   debug = nil, --utf8:require("util").debug
  4 | }
  5 | utf8:init()
  6 | print('testing utf8 library')
  7 | 
  8 | local LUA_51, LUA_53 = false, false
  9 | if "\xe4" == "xe4" then -- lua5.1
 10 |   LUA_51 = true
 11 | else -- luajit lua5.3
 12 |   LUA_53 = true
 13 | end
 14 | 
 15 | assert(utf8.sub("123456789",2,4) == "234")
 16 | assert(utf8.sub("123456789",7) == "789")
 17 | assert(utf8.sub("123456789",7,6) == "")
 18 | assert(utf8.sub("123456789",7,7) == "7")
 19 | assert(utf8.sub("123456789",0,0) == "")
 20 | assert(utf8.sub("123456789",-10,10) == "123456789")
 21 | assert(utf8.sub("123456789",1,9) == "123456789")
 22 | assert(utf8.sub("123456789",-10,-20) == "")
 23 | assert(utf8.sub("123456789",-1) == "9")
 24 | assert(utf8.sub("123456789",-4) == "6789")
 25 | assert(utf8.sub("123456789",-6, -4) == "456")
 26 | if not _no32 then
 27 |   assert(utf8.sub("123456789",-2^31, -4) == "123456")
 28 |   assert(utf8.sub("123456789",-2^31, 2^31 - 1) == "123456789")
 29 |   assert(utf8.sub("123456789",-2^31, -2^31) == "")
 30 | end
 31 | assert(utf8.sub("\000123456789",3,5) == "234")
 32 | assert(utf8.sub("\000123456789", 8) == "789")
 33 | print('+')
 34 | 
 35 | assert(utf8.find("123456789", "345") == 3)
 36 | local a,b = utf8.find("123456789", "345")
 37 | assert(utf8.sub("123456789", a, b) == "345")
 38 | assert(utf8.find("1234567890123456789", "345", 3) == 3)
 39 | assert(utf8.find("1234567890123456789", "345", 4) == 13)
 40 | assert(utf8.find("1234567890123456789", "346", 4) == nil)
 41 | assert(utf8.find("1234567890123456789", ".45", -9) == 13)
 42 | assert(utf8.find("abcdefg", "\0", 5, 1) == nil)
 43 | assert(utf8.find("", "") == 1)
 44 | assert(utf8.find("", "", 1) == 1)
 45 | assert(not utf8.find("", "", 2))
 46 | assert(utf8.find('', 'aaa', 1) == nil)
 47 | assert(('alo(.)alo'):find('(.)', 1, 1) == 4)
 48 | print('+')
 49 | 
 50 | assert(utf8.len("") == 0)
 51 | assert(utf8.len("\0\0\0") == 3)
 52 | assert(utf8.len("1234567890") == 10)
 53 | 
 54 | assert(utf8.byte("a") == 97)
 55 | if LUA_51 then
 56 |   assert(utf8.byte("�") > 127)
 57 | else
 58 |   assert(utf8.byte("\xe4") > 127)
 59 | end
 60 | assert(utf8.byte(utf8.char(255)) == 255)
 61 | assert(utf8.byte(utf8.char(0)) == 0)
 62 | assert(utf8.byte("\0") == 0)
 63 | assert(utf8.byte("\0\0alo\0x", -1) == string.byte('x'))
 64 | assert(utf8.byte("ba", 2) == 97)
 65 | assert(utf8.byte("\n\n", 2, -1) == 10)
 66 | assert(utf8.byte("\n\n", 2, 2) == 10)
 67 | assert(utf8.byte("") == nil)
 68 | assert(utf8.byte("hi", -3) == nil)
 69 | assert(utf8.byte("hi", 3) == nil)
 70 | assert(utf8.byte("hi", 9, 10) == nil)
 71 | assert(utf8.byte("hi", 2, 1) == nil)
 72 | assert(utf8.char() == "")
 73 | if LUA_53 then
 74 |   assert(utf8.raw.char(0, 255, 0) == "\0\255\0") -- fails due 255 can't be utf8 byte
 75 |   assert(utf8.char(0, 255, 0) == "\0\195\191\0")
 76 |   assert(utf8.raw.char(0, utf8.byte("\xe4"), 0) == "\0\xe4\0")
 77 |   assert(utf8.char(0, utf8.byte("\xe4"), 0) == "\0\195\164\0")
 78 |   assert(utf8.raw.char(utf8.raw.byte("\xe4l\0�u", 1, -1)) == "\xe4l\0�u")
 79 |   assert(utf8.raw.char(utf8.raw.byte("\xe4l\0�u", 1, -1)) == "\xe4l\0�u")
 80 |   assert(utf8.raw.char(utf8.raw.byte("\xe4l\0�u", 1, 0)) == "")
 81 |   assert(utf8.raw.char(utf8.raw.byte("\xe4l\0�u", -10, 100)) == "\xe4l\0�u")
 82 | end
 83 | 
 84 | assert(utf8.upper("ab\0c") == "AB\0C")
 85 | assert(utf8.lower("\0ABCc%$") == "\0abcc%$")
 86 | assert(utf8.rep('teste', 0) == '')
 87 | assert(utf8.rep('t�s\00t�', 2) == 't�s\0t�t�s\000t�')
 88 | assert(utf8.rep('', 10) == '')
 89 | print('+')
 90 | 
 91 | assert(utf8.upper("ab\0c") == "AB\0C")
 92 | assert(utf8.lower("\0ABCc%$") == "\0abcc%$")
 93 | 
 94 | assert(utf8.reverse"" == "")
 95 | assert(utf8.reverse"\0\1\2\3" == "\3\2\1\0")
 96 | assert(utf8.reverse"\0001234" == "4321\0")
 97 | 
 98 | for i=0,30 do assert(utf8.len(string.rep('a', i)) == i) end
 99 | 
100 | print('+')
101 | 
102 | do
103 |   local f = utf8.gmatch("1 2 3 4 5", "%d+")
104 |   assert(f() == "1")
105 |   local co = coroutine.wrap(f)
106 |   assert(co() == "2")
107 | end
108 | 
109 | print('OK')
110 | 


--------------------------------------------------------------------------------
/include/AL/utf8/charclass/runtime/base.lua:
--------------------------------------------------------------------------------
  1 | return function(utf8)
  2 | 
  3 | local class = {}
  4 | local mt = {__index = class}
  5 | 
  6 | local utf8gensub = utf8.gensub
  7 | 
  8 | function class.new()
  9 |   return setmetatable({}, mt)
 10 | end
 11 | 
 12 | function class:invert()
 13 |   self.inverted = true
 14 |   return self
 15 | end
 16 | 
 17 | function class:with_codes(...)
 18 |   local codes = {...}
 19 |   self.codes = self.codes or {}
 20 | 
 21 |   for _, v in ipairs(codes) do
 22 |     table.insert(self.codes, v)
 23 |   end
 24 | 
 25 |   table.sort(self.codes)
 26 |   return self
 27 | end
 28 | 
 29 | function class:with_ranges(...)
 30 |   local ranges = {...}
 31 |   self.ranges = self.ranges or {}
 32 | 
 33 |   for _, v in ipairs(ranges) do
 34 |     table.insert(self.ranges, v)
 35 |   end
 36 | 
 37 |   return self
 38 | end
 39 | 
 40 | function class:with_classes(...)
 41 |   local classes = {...}
 42 |   self.classes = self.classes or {}
 43 | 
 44 |   for _, v in ipairs(classes) do
 45 |     table.insert(self.classes, v)
 46 |   end
 47 | 
 48 |   return self
 49 | end
 50 | 
 51 | function class:without_classes(...)
 52 |   local not_classes = {...}
 53 |   self.not_classes = self.not_classes or {}
 54 | 
 55 |   for _, v in ipairs(not_classes) do
 56 |     table.insert(self.not_classes, v)
 57 |   end
 58 | 
 59 |   return self
 60 | end
 61 | 
 62 | function class:with_subs(...)
 63 |   local subs = {...}
 64 |   self.subs = self.subs or {}
 65 | 
 66 |   for _, v in ipairs(subs) do
 67 |     table.insert(self.subs, v)
 68 |   end
 69 | 
 70 |   return self
 71 | end
 72 | 
 73 | function class:in_codes(item)
 74 |   if not self.codes or #self.codes == 0 then return nil end
 75 | 
 76 |   local head, tail = 1, #self.codes
 77 |   local mid = math.floor((head + tail)/2)
 78 |   while (tail - head) > 1 do
 79 |     if self.codes[mid] > item then
 80 |       tail = mid
 81 |     else
 82 |       head = mid
 83 |     end
 84 |     mid = math.floor((head + tail)/2)
 85 |   end
 86 |   if self.codes[head] == item then
 87 |     return true, head
 88 |   elseif self.codes[tail] == item then
 89 |     return true, tail
 90 |   else
 91 |     return false
 92 |   end
 93 | end
 94 | 
 95 | function class:in_ranges(char_code)
 96 |   if not self.ranges or #self.ranges == 0 then return nil end
 97 | 
 98 |   for _,r in ipairs(self.ranges) do
 99 |     if r[1] <= char_code and char_code <= r[2] then
100 |       return true
101 |     end
102 |   end
103 |   return false
104 | end
105 | 
--- Checks whether `char_code` belongs to any class listed in `classes`.
-- @return nil when the list is missing or empty, otherwise true or false
function class:in_classes(char_code)
  local names = self.classes
  if not names or #names == 0 then return nil end

  for _, name in ipairs(names) do
    if self:is(name, char_code) then
      return true
    end
  end
  return false
end

--- Checks whether `char_code` belongs to any class listed in `not_classes`.
-- @return nil when the list is missing or empty, otherwise true or false
function class:in_not_classes(char_code)
  local names = self.not_classes
  if not names or #names == 0 then return nil end

  for _, name in ipairs(names) do
    if self:is(name, char_code) then
      return true
    end
  end
  return false
end

--- Tells whether `char_code` belongs to the named class.
-- The base implementation only raises an error; a usable class is expected
-- to provide its own version.
function class:is(class, char_code)
  error("not implemented")
end
131 | 
--- Runs `char_code` through every nested class.
-- @return nil when there are no nested classes; false as soon as one of them
--         rejects the code; true when all of them accept it
function class:in_subs(char_code)
  local subs = self.subs
  if not subs or #subs == 0 then return nil end

  for _, sub in ipairs(subs) do
    local accepted = sub:test(char_code)
    if not accepted then
      return false
    end
  end
  return true
end
142 | 
--- Public entry point: tells whether `char_code` matches this class.
-- Returns exactly one value, the outcome of `do_test`.
function class:test(char_code)
  -- utf8.debug('class:test', result, "'" .. (char_code and utf8.char(char_code) or 'nil') .. "'", char_code)
  return (self:do_test(char_code))
end

--- Evaluates the class against `char_code`.
-- A nil code never matches. Otherwise the checks run in this order:
-- not_classes (a hit rejects), then codes, ranges, classes and subs (a hit
-- accepts). When none of them hits, the code is accepted only if codes,
-- ranges, classes and subs are all unconfigured while not_classes is
-- configured. Inversion flips accept and reject.
function class:do_test(char_code)
  if not char_code then return false end

  local accept = not self.inverted
  local reject = not accept

  local in_not_classes = self:in_not_classes(char_code)
  if in_not_classes then return reject end

  local in_codes = self:in_codes(char_code)
  if in_codes then return accept end

  local in_ranges = self:in_ranges(char_code)
  if in_ranges then return accept end

  local in_classes = self:in_classes(char_code)
  if in_classes then return accept end

  local in_subs = self:in_subs(char_code)
  if in_subs then return accept end

  -- nil means "selector not configured"; false means "configured, no hit"
  local no_positive_selectors = (in_codes == nil)
    and (in_ranges == nil)
    and (in_classes == nil)
    and (in_subs == nil)
  if no_positive_selectors and in_not_classes == false then
    return accept
  end
  return reject
end
181 | 
182 | return class
183 | 
184 | end
185 | 


--------------------------------------------------------------------------------
/include/AL/utf8/functions/lua53.lua:
--------------------------------------------------------------------------------
  1 | return function(utf8)
  2 | 
  3 | local utf8sub = utf8.sub
  4 | local utf8gensub = utf8.gensub
  5 | local unpack = utf8.config.unpack
  6 | local generate_matcher_function = utf8:require 'regex_parser'
  7 | 
  8 | -- Returns the matcher compiled for `regex`, reusing utf8.config.cache when it is set.
  9 | -- Cached matchers live in the "plain" or "regex" bucket, chosen by `plain`.
 10 | local function get_matcher_function(regex, plain)
 11 |   local bucket = plain and "plain" or "regex"
 12 |   local cache = utf8.config.cache
 13 |   local matcher = cache and cache[bucket][regex]
 14 |   if matcher then return matcher end
 15 | 
 16 |   matcher = generate_matcher_function(regex, plain)
 17 |   cache = utf8.config.cache -- looked up again once compilation is done
 18 |   if cache then
 19 |     cache[bucket][regex] = matcher
 20 |   end
 21 |   return matcher
 22 | end
 23 | 
 24 | -- Character-indexed string.find: returns start, finish and captures, or nil on no match.
 25 | local function utf8find(str, regex, init, plain) -- `plain` is forwarded to the matcher compiler
 26 |   local func = get_matcher_function(regex, plain)
 27 |   if init and init < 0 then init = utf8.len(str) + init + 1 end -- negative: counted from the end
 28 |   if init and init < 1 then init = 1 end -- clamp, as string.find does for init < 1
 29 |   local ctx, result, captures = func(str, init, utf8) -- a nil init is left for the matcher to default
 30 |   if not ctx then return nil end
 31 |   utf8.debug('ctx:', ctx)
 32 |   utf8.debug('result:', result)
 33 |   utf8.debug('captures:', captures)
 34 |   return result.start, result.finish, unpack(captures)
 35 | end
 36 | 
 37 | -- Character-indexed string.match.
 38 | -- Returns the captures, or the whole matched text when there are none; nil on no match.
 39 | local function utf8match(str, regex, init)
 40 |   local func = get_matcher_function(regex, false)
 41 |   if init and init < 0 then init = utf8.len(str) + init + 1 end -- negative: counted from the end
 42 |   if init and init < 1 then init = 1 end -- clamp, as string.match does for init < 1
 43 |   local ctx, result, captures = func(str, init, utf8) -- a nil init is left for the matcher to default
 44 |   if not ctx then return nil end
 45 |   utf8.debug('ctx:', ctx)
 46 |   utf8.debug('result:', result)
 47 |   utf8.debug('captures:', captures)
 48 |   if #captures > 0 then return unpack(captures) end
 49 |   return utf8sub(str, result.start, result.finish)
 50 | end
 51 | 
 52 | -- Iterator factory modelled on string.gmatch, working in characters.
 53 | -- A leading '^' has a '%' put in front of it before the pattern is compiled.
 54 | -- Every step yields the captures, or the matched text when there are none.
 55 | local function utf8gmatch(str, regex)
 56 |   if utf8sub(regex, 1, 1) == '^' then
 57 |     regex = '%' .. regex
 58 |   end
 59 |   local func = get_matcher_function(regex, false)
 60 |   local resume_at = 1
 61 | 
 62 |   return function()
 63 |     local ctx, result, captures = func(str, resume_at, utf8)
 64 |     if not ctx then return nil end
 65 |     utf8.debug('ctx:', ctx)
 66 |     utf8.debug('result:', result)
 67 |     utf8.debug('captures:', captures)
 68 | 
 69 |     -- move at least one character beyond the match start, even for an empty match
 70 |     resume_at = math.max(result.finish + 1, result.start + 1)
 71 |     if #captures == 0 then return utf8sub(str, result.start, result.finish) end
 72 |     return unpack(captures)
 73 |   end
 74 | end
 75 | 
 76 | -- Builds the replacement for one match, following string.gsub's rules for `repl`.
 77 | -- string (or number, converted with tostring): %0-%9 expand to args[0]-args[9],
 78 | -- any other %x yields x; raises if the referenced capture is missing.
 79 | -- table: repl[args[1]]; function: repl(args[1], ...); a false/nil result keeps args[0].
 80 | local function replace(repl, args)
 81 |   local kind = type(repl)
 82 |   if kind == 'table' then
 83 |     return repl[args[1]] or args[0]
 84 |   elseif kind == 'function' then
 85 |     return repl(unpack(args, 1)) or args[0]
 86 |   elseif kind == 'number' then
 87 |     repl = tostring(repl) -- string.gsub also accepts a number here
 88 |   elseif kind ~= 'string' then
 89 |     return '' -- unsupported repl types keep producing an empty replacement
 90 |   end
 91 |   local out, escaped = {}, false -- pieces are joined once instead of `..` per character
 92 |   for _, c in utf8gensub(repl) do
 93 |     if escaped then
 94 |       local index = tonumber(c)
 95 |       out[#out + 1] = index and assert(args[index], "invalid capture index %" .. c) or c
 96 |       escaped = false
 97 |     elseif c == '%' then
 98 |       escaped = true
 99 |     else
100 |       out[#out + 1] = c
101 |     end
102 |   end
103 |   return table.concat(out)
104 | end
105 | 
106 | -- Character-indexed string.gsub.
107 | -- Replaces matches of `regex` in `str` using `repl` (string, table or function, as
108 | -- handled by replace) and stops after `limit` substitutions; nil or -1 means no limit.
109 | -- Returns the rewritten string and the number of substitutions made.
110 | local function utf8gsub(str, regex, repl, limit)
111 |   limit = limit or -1
112 |   local func = get_matcher_function(regex, false)
113 |   local pieces = {}      -- output fragments, joined once at the end
114 |   local copied_upto = 1  -- first character of str not yet copied into pieces
115 |   local continue_pos = 1
116 |   local n = 0
117 | 
118 |   while limit ~= n do
119 |     local ctx, result, captures = func(str, continue_pos, utf8)
120 |     if not ctx then break end
121 |     local matched = utf8sub(str, result.start, result.finish)
122 | 
123 |     utf8.debug('ctx:', ctx)
124 |     utf8.debug('result:', result)
125 |     utf8.debug('result:', matched)
126 |     utf8.debug('captures:', captures)
127 | 
128 |     -- step at least one character past the match start, even for an empty match
129 |     continue_pos = math.max(result.finish + 1, result.start + 1)
130 |     local args = {[0] = matched, unpack(captures)}
131 |     if #captures == 0 then args[1] = matched end -- no captures: %1 is the whole match
132 | 
133 |     -- the unmatched text in front of this match, then its replacement
134 |     pieces[#pieces + 1] = utf8sub(str, copied_upto, result.start - 1)
135 |     pieces[#pieces + 1] = replace(repl, args)
136 |     copied_upto = result.finish + 1
137 |     n = n + 1
138 |   end
139 | 
140 |   pieces[#pieces + 1] = utf8sub(str, copied_upto)
141 |   return table.concat(pieces), n
142 | end
143 | 
144 | -- attaching high-level functions: the pattern-matching API is published on the utf8 table
145 | utf8.find    = utf8find
146 | utf8.match   = utf8match
147 | utf8.gmatch  = utf8gmatch
148 | utf8.gsub    = utf8gsub
149 | 
150 | return utf8 -- the table that was passed in, now carrying the four functions above
151 | 
152 | end
153 | 


--------------------------------------------------------------------------------
/include/AL/utf8/charclass/compiletime/vanilla.lua:
--------------------------------------------------------------------------------
  1 | return function(utf8)
  2 | 
  3 | local cl = utf8:require "charclass.compiletime.builder"
  4 | 
  5 | local next = utf8.util.next
  6 | 
  7 | local token = 1
  8 | 
  9 | -- Letter following '%' (lower-cased) -> class name handed to with_classes/without_classes.
 10 | local class_names = {
 11 |   a = 'alpha', c = 'cntrl', d = 'digit', g = 'graph', l = 'lower',
 12 |   p = 'punct', s = 'space', u = 'upper', w = 'alnum', x = 'xdigit',
 13 | }
 14 | 
 15 | -- Parses one character-class construct of the pattern `str`.
 16 | --   str: the pattern being compiled
 17 | --   c:   the character found at offset `bs` of `str`
 18 | --   bs:  offset of `c` (the tests in this project pass byte offsets)
 19 | --   ctx: compile-time context; ctx.internal is truthy while inside a [...] set
 20 | -- Handles %x escapes, [...] sets (only when not already inside one) and '.'.
 21 | -- Returns the class description, or nil when `c` starts none of these, followed
 22 | -- by the distance from `bs` to the offset of the next unread character.
 23 | -- Raises "malformed pattern" errors for a trailing '%' and for a set without ']'.
 24 | -- Side effects: advances the module-level `token` counter and sets ctx.internal
 25 | -- while a set is being parsed (the previous value is put back afterwards).
 26 | local function parse(str, c, bs, ctx)
 27 |   local call_id = token -- running number, only used to tag the debug output
 28 |   token = token + 1
 29 | 
 30 |   local class
 31 |   local nbs = bs
 32 |   utf8.debug("cc_parse", call_id, str, c, nbs, next(str, nbs))
 33 | 
 34 |   if c == '%' then
 35 |     -- escape: %a-style class, %z, or an escaped literal
 36 |     c, nbs = next(str, bs)
 37 |     if c == '' then
 38 |       error("malformed pattern (ends with '%')")
 39 |     end
 40 | 
 41 |     local lowered = utf8.raw.lower(c)
 42 |     local complement = lowered ~= c -- upper-case letter: complement of the class
 43 |     local name = class_names[lowered]
 44 |     class = cl.new()
 45 |     if name and complement then
 46 |       class = class:without_classes(name)
 47 |     elseif name then
 48 |       class = class:with_classes(name)
 49 |     elseif lowered == 'z' then
 50 |       class = class:with_codes(0)
 51 |       if complement then
 52 |         class = class:invert()
 53 |       end
 54 |     else
 55 |       class = class:with_codes(c)
 56 |     end
 57 |   elseif c == '[' and not ctx.internal then
 58 |     -- bracket set: collect members until the closing ']'
 59 |     local saved_internal = ctx.internal
 60 |     ctx.internal = true
 61 |     class = cl.new()
 62 |     local at_start, closed = true, false
 63 |     repeat
 64 |       local before = nbs
 65 |       c, nbs = next(str, nbs)
 66 |       utf8.debug("next", call_id, c, nbs)
 67 |       if c == ']' and not at_start then
 68 |         utf8.debug('] on pos', call_id, nbs)
 69 |         closed = true
 70 |       elseif c == ']' then
 71 |         class:with_codes(c) -- a ']' in first position is an ordinary member
 72 |       elseif c == '^' and at_start then
 73 |         class:invert()
 74 |         -- a ']' directly after the '^' is an ordinary member as well
 75 |         local ahead, after_ahead = next(str, nbs)
 76 |         if ahead == ']' then
 77 |           class:with_codes(ahead)
 78 |           nbs = after_ahead
 79 |         end
 80 |       elseif c == '' then
 81 |         error "malformed pattern (missing ']')"
 82 |       else
 83 |         -- any other member is parsed by the registered charclass parser, which
 84 |         -- reports how far it advanced from `before`
 85 |         local member, used = utf8.regex.compiletime.charclass.parse(str, c, nbs, ctx)
 86 |         nbs = before + used
 87 |         utf8.debug("include", call_id, bs, before, nbs, used)
 88 |         class:include(member)
 89 |       end
 90 |       at_start = false
 91 |     until closed
 92 |     ctx.internal = saved_internal
 93 |   elseif c == '.' then
 94 |     -- the dot: literal inside a set, otherwise an empty class that is inverted
 95 |     class = cl.new()
 96 |     if ctx.internal then
 97 |       class = class:with_codes(c)
 98 |     else
 99 |       class = class:invert()
100 |     end
101 |   end
102 |   return class, utf8.next(str, nbs) - bs
103 | end
104 | 
105 | return parse
106 | 
107 | end
108 | 
109 | --[[
110 |     x: (where x is not one of the magic characters ^$()%.[]*+-?) represents the character x itself.
111 |     .: (a dot) represents all characters.
112 |     %a: represents all letters.
113 |     %c: represents all control characters.
114 |     %d: represents all digits.
115 |     %g: represents all printable characters except space.
116 |     %l: represents all lowercase letters.
117 |     %p: represents all punctuation characters.
118 |     %s: represents all space characters.
119 |     %u: represents all uppercase letters.
120 |     %w: represents all alphanumeric characters.
121 |     %x: represents all hexadecimal digits.
122 |     %x: (where x is any non-alphanumeric character) represents the character x. This is the standard way to escape the magic characters. Any non-alphanumeric character (including all punctuation characters, even the non-magical) can be preceded by a '%' when used to represent itself in a pattern.
123 |     [set]: represents the class which is the union of all characters in set. A range of characters can be specified by separating the end characters of the range, in ascending order, with a '-'. All classes %x described above can also be used as components in set. All other characters in set represent themselves. For example, [%w_] (or [_%w]) represents all alphanumeric characters plus the underscore, [0-7] represents the octal digits, and [0-7%l%-] represents the octal digits plus the lowercase letters plus the '-' character.
124 | 
125 |     You can put a closing square bracket in a set by positioning it as the first character in the set. You can put a hyphen in a set by positioning it as the first or the last character in the set. (You can also use an escape for both cases.)
126 | 
127 |     The interaction between ranges and classes is not defined. Therefore, patterns like [%a-z] or [a-%%] have no meaning.
128 |     [^set]: represents the complement of set, where set is interpreted as above.
129 | 
130 | For all classes represented by single letters (%a, %c, etc.), the corresponding uppercase letter represents the complement of the class. For instance, %S represents all non-space characters.
131 | ]]
132 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test/charclass_compiletime.lua:
--------------------------------------------------------------------------------
  1 | local utf8 = require "init"
  2 | utf8.config = {
  3 |   debug = nil,
  4 | --   debug = utf8:require("util").debug,
  5 | }
  6 | utf8:init()
  7 | -- a single compile-time context is shared by every assertion in this script
  8 | local ctx = utf8:require("context.compiletime"):new()
  9 | -- comparison helpers from the test utilities; `parse` is the function under test
 10 | local equals = require 'test.util'.equals
 11 | local assert = require 'test.util'.assert
 12 | local assert_equals = require 'test.util'.assert_equals
 13 | local parse = utf8.regex.compiletime.charclass.parse
 14 | 
 15 | assert_equals({parse("aabb", "a", 1, ctx)}, {{codes = {utf8.byte("a")}}, 1}) -- plain character: one code, length 1
 16 | assert_equals({parse("aabb", "a", 2, ctx)}, {{codes = {utf8.byte("a")}}, 1})
 17 | assert_equals({parse("aabb", "b", 3, ctx)}, {{codes = {utf8.byte("b")}}, 1})
 18 | assert_equals({parse("aabb", "b", 4, ctx)}, {{codes = {utf8.byte("b")}}, 1})
 19 | -- %a fills classes, %A fills not_classes; '.' outside a set is just an inverted class
 20 | assert_equals({parse("aa%ab", "%", 3, ctx)}, {{classes = {'alpha'}}, 2})
 21 | assert_equals({parse("aac%Ab", "%", 4, ctx)}, {{not_classes = {'alpha'}}, 2})
 22 | assert_equals({parse("aa.b", ".", 3, ctx)}, {{inverted = true}, 1})
 23 | -- bracket sets: the expected length runs from '[' through the closing ']'
 24 | assert_equals({parse("aa[c]b", "[", 3, ctx)}, {
 25 |   {codes = {utf8.byte("c")}, ranges = nil, classes = nil, not_classes = nil},
 26 |   utf8.raw.len("[c]")
 27 | })
 28 | 
 29 | assert_equals({parse("aa[%A]b", "[", 3, ctx)}, {
 30 |   {codes = nil, ranges = nil, classes = nil, not_classes = {'alpha'}},
 31 |   utf8.raw.len("[%A]")
 32 | })
 33 | 
 34 | assert_equals({parse("[^%p%d%s%c]+", "[", 1, ctx)}, {
 35 |   {codes = nil, ranges = nil, classes = {'punct', 'digit', 'space', 'cntrl'}, not_classes = nil, inverted = true},
 36 |   utf8.raw.len("[^%p%d%s%c]")
 37 | })
 38 | -- a '[' inside a set is an ordinary member; the set ends at the first ']'
 39 | assert_equals({parse("aa[[c]]b", "[", 3, ctx)}, {
 40 |   {codes = {utf8.byte("["), utf8.byte("c")}, ranges = nil, classes = nil, not_classes = nil},
 41 |   utf8.raw.len("[[c]")
 42 | })
 43 | 
 44 | assert_equals({parse("aa[%a[c]]b", "[", 3, ctx)}, {
 45 |   {codes = {utf8.byte("["), utf8.byte("c")}, ranges = nil, classes = {'alpha'}, not_classes = nil},
 46 |   utf8.raw.len("[%a[c]")
 47 | })
 48 | -- outside a set "c-d" is not a range: only the 'c' is taken
 49 | assert_equals({parse("aac-db", "c", 3, ctx)}, {
 50 |   {codes = {utf8.byte("c")}},
 51 |   utf8.raw.len("c")
 52 | })
 53 | -- inside a set it is a range, and ctx.internal is false again afterwards
 54 | assert_equals({parse("aa[c-d]b", "[", 3, ctx)}, {
 55 |   {codes = nil, ranges = {{utf8.byte("c"),utf8.byte("d")}}, classes = nil, not_classes = nil},
 56 |   utf8.raw.len("[c-d]")
 57 | })
 58 | assert_equals(ctx.internal, false)
 59 | -- a '-' directly in front of the closing ']' is a literal member
 60 | assert_equals({parse("aa[c-]]b", "[", 3, ctx)}, {
 61 |   {codes = {utf8.byte("-"), utf8.byte("c")}, ranges = nil, classes = nil, not_classes = nil},
 62 |   utf8.raw.len("[c-]")
 63 | })
 64 | assert_equals(ctx.internal, false)
 65 | 
 66 | assert_equals({parse("aad-", "d", 3, ctx)}, {
 67 |   {codes = {utf8.byte("d")}},
 68 |   utf8.raw.len("d")
 69 | })
 70 | assert_equals(ctx.internal, false)
 71 | -- the remaining cases start from an explicit "outside a set" state
 72 | ctx.internal = false
 73 | assert_equals({parse(".", ".", 1, ctx)}, {
 74 |   {inverted = true},
 75 |   utf8.raw.len(".")
 76 | })
 77 | 
 78 | assert_equals({parse("[.]", "[", 1, ctx)}, {
 79 |   {codes = {utf8.byte(".")}},
 80 |   utf8.raw.len("[.]")
 81 | })
 82 | 
 83 | assert_equals({parse("%?", "%", 1, ctx)}, {
 84 |   {codes = {utf8.byte("?")}},
 85 |   utf8.raw.len("%?")
 86 | })
 87 | -- a ']' placed first in the set, or right after '^', is a member rather than the end
 88 | assert_equals({parse("[]]", "[", 1, ctx)}, {
 89 |   {codes = {utf8.byte("]")}},
 90 |   utf8.raw.len("[]]")
 91 | })
 92 | 
 93 | assert_equals({parse("[^]]", "[", 1, ctx)}, {
 94 |   {codes = {utf8.byte("]")}, inverted = true},
 95 |   utf8.raw.len("[^]]")
 96 | })
 97 | 
 98 | --[[--
 99 | multibyte chars: the cases above again with Cyrillic letters; offsets are given as byte positions (#"..." + 1)
100 | --]]--
101 | 
102 | assert_equals({parse("ббюю", "б", #"" + 1, ctx)}, {{codes = {utf8.byte("б")}}, utf8.raw.len("б")})
103 | assert_equals({parse("ббюю", "б", #"б" + 1, ctx)}, {{codes = {utf8.byte("б")}}, utf8.raw.len("б")})
104 | assert_equals({parse("ббюю", "ю", #"бб" + 1, ctx)}, {{codes = {utf8.byte("ю")}}, utf8.raw.len("ю")})
105 | assert_equals({parse("ббюю", "ю", #"ббю" + 1, ctx)}, {{codes = {utf8.byte("ю")}}, utf8.raw.len("ю")})
106 | 
107 | assert_equals({parse("бб%aю", "%", #"бб" + 1, ctx)}, {{classes = {'alpha'}}, 2})
108 | assert_equals({parse("ббц%Aю", "%", #"ббц" + 1, ctx)}, {{not_classes = {'alpha'}}, 2})
109 | assert_equals({parse("бб.ю", ".", #"бб" + 1, ctx)}, {{inverted = true}, 1})
110 | 
111 | assert_equals({parse("бб[ц]ю", "[", #"бб" + 1, ctx)}, {
112 |   {codes = {utf8.byte("ц")}, ranges = nil, classes = nil, not_classes = nil},
113 |   utf8.raw.len("[ц]")
114 | })
115 | 
116 | assert_equals({parse("бб[%A]ю", "[", #"бб" + 1, ctx)}, {
117 |   {codes = nil, ranges = nil, classes = nil, not_classes = {'alpha'}},
118 |   utf8.raw.len("[%A]")
119 | })
120 | 
121 | assert_equals({parse("бб[[ц]]ю", "[", #"бб" + 1, ctx)}, {
122 |   {codes = {utf8.byte("["), utf8.byte("ц")}, ranges = nil, classes = nil, not_classes = nil},
123 |   utf8.raw.len("[[ц]")
124 | })
125 | 
126 | assert_equals({parse("бб[%a[ц]]ю", "[", #"бб" + 1, ctx)}, {
127 |   {codes = {utf8.byte("["), utf8.byte("ц")}, ranges = nil, classes = {'alpha'}, not_classes = nil},
128 |   utf8.raw.len("[%a[ц]")
129 | })
130 | -- with ctx.internal preset, a bare "ц-ы" is expected to parse as a range without brackets
131 | ctx.internal = true
132 | assert_equals({parse("ббц-ыю", "ц", #"бб" + 1, ctx)}, {
133 |   {ranges = {{utf8.byte("ц"),utf8.byte("ы")}}},
134 |   utf8.raw.len("ц-ы")
135 | })
136 | 
137 | ctx.internal = false
138 | assert_equals({parse("бб[ц-ы]ю", "[", #"бб" + 1, ctx)}, {
139 |   {codes = nil, ranges = {{utf8.byte("ц"),utf8.byte("ы")}}, classes = nil, not_classes = nil},
140 |   utf8.raw.len("[ц-ы]")
141 | })
142 | 
143 | assert_equals({parse("бб[ц-]]ю", "[", #"бб" + 1, ctx)}, {
144 |   {codes = {utf8.byte("-"), utf8.byte("ц")}, ranges = nil, classes = nil, not_classes = nil},
145 |   utf8.raw.len("[ц-]")
146 | })
147 | 
148 | assert_equals({parse("ббы-", "ы", #"бб" + 1, ctx)}, {
149 |   {codes = {utf8.byte("ы")}},
150 |   utf8.raw.len("ы")
151 | })
152 | -- the expected range keeps its bounds in the order they were written ("ы" first, then "ц")
153 | ctx.internal = true
154 | assert_equals({parse("ббы-цю", "ы", #"бб" + 1, ctx)}, {
155 |   {ranges = {{utf8.byte("ы"),utf8.byte("ц")}}},
156 |   utf8.raw.len("ы-ц")
157 | })
158 | 
159 | ctx.internal = false
160 | assert_equals({parse("бб[ы]ю", "[", #"бб" + 1, ctx)}, {
161 |   {codes = {utf8.byte("ы")}, ranges = nil, classes = nil, not_classes = nil},
162 |   utf8.raw.len("[ы]")
163 | })
164 | 
165 | print "OK"
166 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test/test.lua:
--------------------------------------------------------------------------------
  1 | local utf8 = require('init')
  2 | utf8.config = {
  3 |   debug = nil,
  4 | --   debug = utf8:require("util").debug,
  5 | }
  6 | utf8:init()
  7 | -- copy every library entry into `string`, so the ("..."):name() calls below reach the library
  8 | for k,v in pairs(utf8) do
  9 |   string[k] = v
 10 | end
 11 | -- interpreter detection: the two literals compare equal only where "\x" escapes are not understood
 12 | local LUA_51, LUA_53 = false, false
 13 | if "\xe4" == "xe4" then -- lua5.1
 14 |   LUA_51 = true
 15 | else -- luajit lua5.3
 16 |   LUA_53 = true
 17 | end
 18 | -- FFI_ENABLED is true when require("ffi") succeeds; it gates the case-conversion asserts below
 19 | local FFI_ENABLED = false
 20 | if pcall(require, "ffi") then
 21 |   FFI_ENABLED = true
 22 | end
 23 | 
 24 | local res = {}
 25 | 
 26 | local equals = require 'test.util'.equals
 27 | local assert = require 'test.util'.assert
 28 | local assert_equals = require 'test.util'.assert_equals
 29 | 
 30 | if FFI_ENABLED then
 31 |   assert_equals(("АБВ"):lower(), "абв")
 32 |   assert_equals(("абв"):upper(), "АБВ")
 33 | end
 34 | 
 35 | res = {}
 36 | for _, w in ("123456789"):gensub(2), {1} do res[#res + 1] = w end
 37 | assert_equals({"23", "56", "89"}, res)
 38 | -- next(): offsets 0 and 100 come back unchanged; from 1 it gives the offset of the second character
 39 | assert_equals(0, ("фыва"):next(0))
 40 | assert_equals(100, ("фыва"):next(100))
 41 | assert_equals(#"ф" + 1, ("фыва"):next(1))
 42 | assert_equals("ыва", utf8.raw.sub("фыва", ("фыва"):next(1)))
 43 | -- codes(): expected to yield (byte position, code) pairs
 44 | res = {}
 45 | for p, c in ("абвгд"):codes() do res[#res + 1] = {p, c} end
 46 | assert_equals({
 47 |   {1, utf8.byte'а'},
 48 |   {#'а' + 1, utf8.byte'б'},
 49 |   {#'аб' + 1, utf8.byte'в'},
 50 |   {#'абв' + 1, utf8.byte'г'},
 51 |   {#'абвг' + 1, utf8.byte'д'},
 52 | }, res)
 53 | -- offset() on ASCII input
 54 | assert_equals(1, utf8.offset('abcde', 0))
 55 | 
 56 | assert_equals(1, utf8.offset('abcde', 1))
 57 | assert_equals(5, utf8.offset('abcde', 5))
 58 | assert_equals(6, utf8.offset('abcde', 6))
 59 | assert_equals(nil, utf8.offset('abcde', 7))
 60 | 
 61 | assert_equals(5, utf8.offset('abcde', -1))
 62 | assert_equals(1, utf8.offset('abcde', -5))
 63 | assert_equals(nil, utf8.offset('abcde', -6))
 64 | 
 65 | assert_equals(1, utf8.offset('abcde', 0, 1))
 66 | assert_equals(3, utf8.offset('abcde', 0, 3))
 67 | assert_equals(6, utf8.offset('abcde', 0, 6))
 68 | 
 69 | assert_equals(3, utf8.offset('abcde', 1, 3))
 70 | assert_equals(5, utf8.offset('abcde', 3, 3))
 71 | assert_equals(6, utf8.offset('abcde', 4, 3))
 72 | assert_equals(nil, utf8.offset('abcde', 5, 3))
 73 | 
 74 | assert_equals(2, utf8.offset('abcde', -1, 3))
 75 | assert_equals(1, utf8.offset('abcde', -2, 3))
 76 | assert_equals(5, utf8.offset('abcde', -1, 6))
 77 | assert_equals(nil, utf8.offset('abcde', -3, 3))
 78 | -- offset() on Cyrillic input: the expected values are byte positions
 79 | assert_equals(1, utf8.offset('абвгд', 0))
 80 | 
 81 | assert_equals(1, utf8.offset('абвгд', 1))
 82 | assert_equals(#'абвг' + 1, utf8.offset('абвгд', 5))
 83 | assert_equals(#'абвгд' + 1, utf8.offset('абвгд', 6))
 84 | assert_equals(nil, utf8.offset('абвгд', 7))
 85 | 
 86 | assert_equals(#'абвг' + 1, utf8.offset('абвгд', -1))
 87 | assert_equals(1, utf8.offset('абвгд', -5))
 88 | assert_equals(nil, utf8.offset('абвгд', -6))
 89 | 
 90 | assert_equals(1, utf8.offset('абвгд', 0, 1))
 91 | assert_equals(1, utf8.offset('абвгд', 0, 2))
 92 | assert_equals(#'аб' + 1, utf8.offset('абвгд', 0, #'аб' + 1))
 93 | assert_equals(#'аб' + 1, utf8.offset('абвгд', 0, #'аб' + 2))
 94 | assert_equals(#'абвгд' + 1, utf8.offset('абвгд', 0, #'абвгд' + 1))
 95 | 
 96 | assert_equals(#'аб' + 1, utf8.offset('абвгд', 1, #'аб' + 1))
 97 | assert_equals(#'абвг' + 1, utf8.offset('абвгд', 3, #'аб' + 1))
 98 | assert_equals(#'абвгд' + 1, utf8.offset('абвгд', 4, #'аб' + 1))
 99 | assert_equals(#'абвгд' + 1, utf8.offset('абвгд', 4, #'аб' + 2))
100 | assert_equals(nil, utf8.offset('абвгд', 5, #'аб' + 1))
101 | 
102 | assert_equals(#'а' + 1, utf8.offset('абвгд', -1, #'аб' + 1))
103 | assert_equals(1, utf8.offset('абвгд', -2, #'аб' + 1))
104 | assert_equals(#'абвг' + 1, utf8.offset('абвгд', -1, #'абвгд' + 1))
105 | assert_equals(nil, utf8.offset('абвгд', -3, #'аб' + 1))
106 | -- validate(): for a stray byte the expected result is false plus a list of {pos, part, code} records
107 | assert(("фыва"):validate())
108 | assert_equals({false, {{ pos = #"ф" + 1, part = 1, code = 255 }} }, {("ф\255ыва"):validate()})
109 | if LUA_53 then
110 |   assert_equals({false, {{ pos = #"ф" + 1, part = 1, code = 0xFF }} }, {("ф\xffыва"):validate()})
111 | end
112 | 
113 | assert_equals(nil, ("aabb"):find("%bcd")) -- %bxy balanced matches
114 | assert_equals({1, 4}, {("aabb"):find("%bab")})
115 | assert_equals({1, 2}, {("aba"):find('%bab')})
116 | 
117 | res = {}
118 | for w in ("aacaabbcabbacbaacab"):gmatch('%bab') do res[#res + 1] = w end
119 | assert_equals({"acaabbcabb", "acb", "ab"}, res)
120 | -- %f frontier patterns: an empty match is reported with finish = start - 1
121 | assert_equals({1, 0}, {("aacaabbcabbacbaacab"):find('%f[acb]')})
122 | assert_equals("a", ("aba"):match('%f[ab].'))
123 | 
124 | res = {}
125 | for w in ("aacaabbcabbacbaacab"):gmatch('%f[ab]') do res[#res + 1] = w end
126 | assert_equals({"", "", "", "", ""}, res)
127 | 
128 | assert_equals({"HaacHaabbcHabbacHbaacHab",	5}, {("aacaabbcabbacbaacab"):gsub('%f[ab]', 'H')})
129 | -- negated sets and captures on non-ASCII text
130 | res = {}
131 | for w in ("Привет, мир, от Lua"):gmatch("[^%p%d%s%c]+") do res[#res + 1] = w end
132 | assert_equals({"Привет", "мир", "от", "Lua"}, res)
133 | 
134 | res = {}
135 | for k, v in ("从=世界, 到=Lua"):gmatch("([^%p%s%c]+)=([^%p%s%c]+)") do res[k] = v end
136 | assert_equals({["到"] =	"Lua", ["从"] = "世界"}, res)
137 | 
138 | assert_equals("Ahoj Ahoj světe světe", ("Ahoj světe"):gsub("([^%p%s%c]+)", "%1 %1"))
139 | 
140 | assert_equals("Ahoj Ahoj světe", ("Ahoj světe"):gsub("[^%p%s%c]+", "%0 %0", 1))
141 | 
142 | assert_equals("κόσμο γεια Lua από", ("γεια κόσμο από Lua"):gsub("([^%p%s%c]+)%s*([^%p%s%c]+)", "%2 %1"))
143 | -- the positions expected from find are character indices, not byte offsets
144 | assert_equals({8, 27, "ололоо я водитель э"}, {("пыщпыщ ололоо я водитель энло"):find("(.л.+)н")})
145 | 
146 | assert_equals({"пыщпыщ о보라보라 я водитель эн보라",	3}, {("пыщпыщ ололоо я водитель энло"):gsub("ло+", "보라")})
147 | 
148 | assert_equals("пыщпыщ ололоо я", ("пыщпыщ ололоо я водитель энло"):match("^п[лопыщ ]*я"))
149 | 
150 | assert_equals("в", ("пыщпыщ ололоо я водитель энло"):match("[в-д]+"))
151 | 
152 | assert_equals(nil, ('abc abc'):match('([^%s]+)%s%s')) -- https://github.com/Stepets/utf8.lua/issues/2
153 | -- quantifiers + - * placed after, then before, a literal
154 | res = {}
155 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("a+b") do res[#res + 1] = w end
156 | assert_equals({"ab","aab"}, res)
157 | 
158 | res = {}
159 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("a-b") do res[#res + 1] = w end
160 | assert_equals({"ab","b","b","b","aab","b","b"}, res)
161 | 
162 | res = {}
163 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("a*b") do res[#res + 1] = w end
164 | assert_equals({"ab","b","b","b","aab","b","b"}, res)
165 | 
166 | res = {}
167 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("ba+") do res[#res + 1] = w end
168 | assert_equals({"ba","ba"}, res)
169 | 
170 | res = {}
171 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("ba-") do res[#res + 1] = w end
172 | assert_equals({"b","b","b","b","b","b","b"}, res)
173 | 
174 | res = {}
175 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("ba*") do res[#res + 1] = w end
176 | assert_equals({"b","ba","b","b","b","b","ba"}, res)
177 | -- nested captures combined with a %2 back-reference
178 | assert_equals({"bacbbcaabbcba", "ba"}, {("aacabbacbbcaabbcbacaa"):match("((ba+).*%2)")})
179 | assert_equals({"bbacbbcaabbcb", "b"}, {("aacabbacbbcaabbcbacaa"):match("((ba*).*%2)")})
180 | 
181 | res = {}
182 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("((b+a*).-%2)") do res[#res + 1] = w end
183 | assert_equals({"bbacbb", "bb"}, res)
184 | -- a quantifier character that follows another quantifier is matched literally
185 | assert_equals("a**", ("a**v"):match("a**+"))
186 | assert_equals("a", ("a**v"):match("a**-"))
187 | 
188 | assert_equals({"test", "."}, {("test.lua"):match("(.-)([.])")})
189 | 
190 | -- https://github.com/Stepets/utf8.lua/issues/3
191 | assert_equals({"ab", "c"}, {("abc"):match("^([ab]-)([^b]*)$")})
192 | assert_equals({"ab", ""}, {("ab"):match("^([ab]-)([^b]*)$")})
193 | assert_equals({"items.", ""}, {("items."):match("^(.-)([^.]*)$")})
194 | assert_equals({"", "items"}, {("items"):match("^(.-)([^.]*)$")})
195 | 
196 | -- https://github.com/Stepets/utf8.lua/issues/4
197 | assert_equals({"ab.123", 1}, {("ab.?"):gsub("%?", "123")})
198 | 
199 | -- https://github.com/Stepets/utf8.lua/issues/5
200 | assert_equals({"ab", 1}, {("ab"):gsub("a", "%0")})
201 | assert_equals({"ab", 1}, {("ab"):gsub("a", "%1")})
202 | -- a negative init counts from the end of the string
203 | assert_equals("c", ("abc"):match("c", -1))
204 | 
205 | print("\ntests passed\n")
206 | 


--------------------------------------------------------------------------------
/include/AL/utf8/modifier/compiletime/vanilla.lua:
--------------------------------------------------------------------------------
  1 | return function(utf8)
  2 | 
  3 | local utf8unicode = utf8.byte
  4 | local sub = utf8.raw.sub
  5 | 
  6 | local matchers = {
  7 |   star = function(class, name) -- source for `class*`: takes the longest run first, then backs off one position at a time
  8 |     local class_name = 'class' .. name -- name of the local that holds the class in the generated code
  9 |     return [[
 10 |   local ]] .. class_name .. [[ = ]] .. class .. [[
 11 | 
 12 |   add(function(ctx) -- star
 13 |     -- debug(ctx, 'star', ']] .. class_name .. [[')
 14 |     local clone = ctx:clone()
 15 |     while ]] .. class_name .. [[:test(clone:get_charcode()) do
 16 |       clone:next_char()
 17 |     end
 18 |     local pos = clone.pos
 19 |     while pos >= ctx.pos do
 20 |       clone.pos = pos
 21 |       clone.func_pos = ctx.func_pos
 22 |       clone:next_function()
 23 |       clone:get_function()(clone)
 24 |       if clone.modified then
 25 |         clone = ctx:clone()
 26 |       end
 27 |       pos = pos - 1
 28 |     end
 29 |   end)
 30 | ]]
 31 |   end,
 32 |   minus = function(class, name) -- source for `class-`: runs the following function before each further class character is taken
 33 |     local class_name = 'class' .. name -- name of the local that holds the class in the generated code
 34 |     return [[
 35 |   local ]] .. class_name .. [[ = ]] .. class .. [[
 36 | 
 37 |   add(function(ctx) -- minus
 38 |     -- debug(ctx, 'minus', ']] .. class_name .. [[')
 39 | 
 40 |     local clone = ctx:clone()
 41 |     local pos
 42 |     repeat
 43 |       pos = clone.pos
 44 |       clone:next_function()
 45 |       clone:get_function()(clone)
 46 |       if clone.modified then
 47 |         clone = ctx:clone()
 48 |         clone.pos = pos
 49 |       else
 50 |         clone.pos = pos
 51 |         clone.func_pos = ctx.func_pos
 52 |       end
 53 |       local match = ]] .. class_name .. [[:test(clone:get_charcode())
 54 |       clone:next_char()
 55 |     until not match
 56 |   end)
 57 | ]]
 58 |   end,
 59 |   question = function(class, name) -- source for `class?`: tries with one matching character taken, then continues from the saved state without it
 60 |     local class_name = 'class' .. name -- name of the local that holds the class in the generated code
 61 |     return [[
 62 |   local ]] .. class_name .. [[ = ]] .. class .. [[
 63 | 
 64 |   add(function(ctx) -- question
 65 |     -- debug(ctx, 'question', ']] .. class_name .. [[')
 66 |     local saved = ctx:clone()
 67 |     if ]] .. class_name .. [[:test(ctx:get_charcode()) then
 68 |       ctx:next_char()
 69 |       ctx:next_function()
 70 |       ctx:get_function()(ctx)
 71 |     end
 72 |     ctx = saved
 73 |     ctx:next_function()
 74 |     return ctx:get_function()(ctx)
 75 |   end)
 76 | ]]
 77 |   end,
 78 |   capture_start = function(number) -- source that pushes { id = number, start = ctx.pos } onto ctx.captures.active
 79 |     return [[
 80 |   add(function(ctx)
 81 |     ctx.modified = true
 82 |     -- debug(ctx, 'capture_start', ']] .. tostring(number) .. [[')
 83 |     table.insert(ctx.captures.active, { id = ]] .. tostring(number) .. [[, start = ctx.pos })
 84 |     ctx:next_function()
 85 |     return ctx:get_function()(ctx)
 86 |   end)
 87 | ]]
 88 |   end,
 89 |   capture_finish = function(number) -- source that pops the innermost active capture and stores its text under the popped id; `number` only appears in the commented-out debug line
 90 |     return [[
 91 |   add(function(ctx)
 92 |     ctx.modified = true
 93 |     -- debug(ctx, 'capture_finish', ']] .. tostring(number) .. [[')
 94 |     local cap = table.remove(ctx.captures.active)
 95 |     cap.finish = ctx.pos
 96 |     local b, e = ctx.offsets[cap.start], ctx.offsets[cap.finish]
 97 |     if cap.start < 1 then
 98 |       b = 1
 99 |     elseif cap.start >= ctx.len then
100 |       b = ctx.rawlen + 1
101 |     end
102 |     if cap.finish < 1 then
103 |       e = 1
104 |     elseif cap.finish >= ctx.len then
105 |       e = ctx.rawlen + 1
106 |     end
107 |     ctx.captures[cap.id] = rawsub(ctx.str, b, e - 1)
108 |     -- debug('capture#' .. tostring(cap.id), '[' .. tostring(cap.start).. ',' .. tostring(cap.finish) .. ']' , 'is', ctx.captures[cap.id])
109 |     ctx:next_function()
110 |     return ctx:get_function()(ctx)
111 |   end)
112 | ]]
113 |   end,
114 |   capture_position = function(number) -- source for "()": stores ctx.pos as capture `number`
115 |     return [[
116 |   add(function(ctx)
117 |     ctx.modified = true
118 |     -- debug(ctx, 'capture_position', ']] .. tostring(number) .. [[')
119 |     ctx.captures[ ]] .. tostring(number) .. [[ ] = ctx.pos
120 |     ctx:next_function()
121 |     return ctx:get_function()(ctx)
122 |   end)
123 | ]]
124 |   end,
125 |   capture = function(number) -- source for a %N back-reference: continues only if the text at ctx.pos equals capture `number`
126 |     return [[
127 |   add(function(ctx)
128 |     -- debug(ctx, 'capture', ']] .. tostring(number) .. [[')
129 |     local cap = ctx.captures[ ]] .. tostring(number) .. [[ ]
130 |     local len = utf8len(cap)
131 | 		local check = utf8sub(ctx.str, ctx.pos, ctx.pos + len - 1)
132 |     -- debug("capture check:", cap, check)
133 | 		if cap == check then
134 | 			ctx.pos = ctx.pos + len
135 | 			ctx:next_function()
136 |       return ctx:get_function()(ctx)
137 | 		end
138 |   end)
139 | ]]
140 |   end,
141 |   balancer = function(pair, name)
142 |     -- source for %bxy: pair = {x, y} as characters; `name` is not used by the generated code
143 |     return [[
144 | 
145 |   add(function(ctx) -- balancer
146 |     local d, b = ]] .. tostring(utf8unicode(pair[1])) .. [[, ]] .. tostring(utf8unicode(pair[2])) .. [[
147 |     if ctx:get_charcode() ~= d then return end
148 |     local balance = 0
149 |     repeat
150 |       local c = ctx:get_charcode()
151 |       if c == nil then return end
152 | 
153 |       if c == d then
154 |         balance = balance + 1
155 |       elseif c == b then
156 |         balance = balance - 1
157 |       end
158 |       -- debug("balancer: balance=", balance, ", d=", d, ", b=", b, ", charcode=", ctx:get_charcode())
159 |       ctx:next_char()
160 |     until balance == 0 or (balance == 2 and d == b)
161 |     ctx:next_function()
162 |     return ctx:get_function()(ctx)
163 |   end)
164 | ]]
165 |   end,
166 |   simple = utf8:require("modifier.compiletime.simple").simple,
167 | }
168 | 
169 | local next = utf8.util.next
170 | 
171 | -- Parses one modifier construct of `regex`.
172 | --   regex: the pattern being compiled
173 | --   c:     the character found at offset `bs`
174 | --   bs:    offset of `c` in `regex`
175 | --   ctx:   compile-time context (reads ctx.prev_class, maintains ctx.capture)
176 | -- Recognised: %1-%9 back-references, %bxy, the quantifiers * + - ? (only while a
177 | -- class is pending in ctx.prev_class) and the capture brackets "(", ")" and "()".
178 | -- Returns the list of generated function sources, or nil when `c` starts none of
179 | -- these, followed by how far the parser advanced from `bs`.
180 | -- For back-references, %b and capture brackets a pending ctx.prev_class is
181 | -- emitted first through matchers.simple. Malformed input raises an error.
182 | local function parse(regex, c, bs, ctx)
183 |   local functions
184 |   local nbs = bs
185 |   local name = tostring(bs) -- suffix handed to the matcher generators
186 | 
187 |   if c == '%' then
188 |     c, nbs = next(regex, bs)
189 |     utf8.debug("next", c, bs)
190 |     if c == '' then
191 |       error("malformed pattern (ends with '%')")
192 |     end
193 | 
194 |     if utf8.raw.find('123456789', c, 1, true) then
195 |       -- %N: back-reference to capture N
196 |       functions = { matchers.capture(tonumber(c)) }
197 |       nbs = utf8.next(regex, nbs)
198 |     elseif c == 'b' then
199 |       -- %bxy: balanced pair delimited by x and y
200 |       local opener, closer
201 |       opener, nbs = next(regex, nbs)
202 |       closer, nbs = next(regex, nbs)
203 |       assert(opener ~= '' and closer ~= '', "unbalanced pattern")
204 |       functions = { matchers.balancer({opener, closer}, name) }
205 |       nbs = utf8.next(regex, nbs)
206 |     end
207 | 
208 |     if functions and ctx.prev_class then
209 |       table.insert(functions, 1, matchers.simple(ctx.prev_class, name))
210 |     end
211 |   elseif ctx.prev_class and (c == '*' or c == '+' or c == '-' or c == '?') then
212 |     -- quantifier applied to the pending class; each one uses up a single offset
213 |     local pending = ctx.prev_class
214 |     if c == '*' then
215 |       functions = { matchers.star(pending, name) }
216 |     elseif c == '+' then
217 |       -- one mandatory occurrence, then a star for the rest
218 |       functions = { matchers.simple(pending, name), matchers.star(pending, name) }
219 |     elseif c == '-' then
220 |       functions = { matchers.minus(pending, name) }
221 |     else
222 |       functions = { matchers.question(pending, name) }
223 |     end
224 |     nbs = bs + 1
225 |   elseif c == '(' then
226 |     -- every '(' takes the next capture id
227 |     local capture = ctx.capture or {balance = 0, id = 0}
228 |     ctx.capture = capture
229 |     capture.id = capture.id + 1
230 |     if next(regex, nbs) == ')' then
231 |       -- "()" is a position capture and uses up both characters
232 |       functions = { matchers.capture_position(capture.id) }
233 |       nbs = bs + 2
234 |     else
235 |       capture.balance = capture.balance + 1
236 |       functions = { matchers.capture_start(capture.id) }
237 |       nbs = bs + 1
238 |     end
239 |     if ctx.prev_class then
240 |       table.insert(functions, 1, matchers.simple(ctx.prev_class, name))
241 |     end
242 |   elseif c == ')' then
243 |     -- closing bracket: the balance must not drop below zero
244 |     local capture = ctx.capture or {balance = 0, id = 0}
245 |     ctx.capture = capture
246 |     functions = { matchers.capture_finish(capture.id) }
247 | 
248 |     capture.balance = capture.balance - 1
249 |     assert(capture.balance >= 0, 'invalid capture: "(" missing')
250 | 
251 |     if ctx.prev_class then
252 |       table.insert(functions, 1, matchers.simple(ctx.prev_class, name))
253 |     end
254 |     nbs = bs + 1
255 |   end
256 | 
257 |   return functions, nbs - bs
258 | 
259 | end
260 | 
-- Final sanity check after parsing: every opened capture must have been closed.
-- Does nothing when the pattern never touched ctx.capture.
local function check(ctx)
  local capture = ctx.capture
  if not capture then
    return
  end
  assert(capture.balance == 0, 'invalid capture: ")" missing')
end
264 | 
265 | return {
266 |   parse = parse,
267 |   check = check,
268 | }
269 | 
270 | end
271 | 


--------------------------------------------------------------------------------
/include/AL/utf8/test/test_pm.lua:
--------------------------------------------------------------------------------
  1 | --[[--
  2 | MIT License
  3 | 
  4 | Copyright (c) 2018 Xavier Wang
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in all
 14 | copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 | SOFTWARE.
 23 | --]]--
 24 | 
 25 | local utf8 = require 'init'
 26 | utf8.config = {
 27 |   debug = nil, --utf8:require("util").debug,
 28 | }
 29 | utf8:init()
 30 | 
 31 | print('testing pattern matching')
 32 | 
 33 | local
 34 | function f(s, p)
 35 |   local i,e = utf8.find(s, p)
 36 |   if i then return utf8.sub(s, i, e) end
 37 | end
 38 | 
 39 | local
 40 | function f1(s, p)
 41 |   p = utf8.gsub(p, "%%([0-9])", function (s) return "%" .. (tonumber(s)+1) end)
 42 |   p = utf8.gsub(p, "^(^?)", "%1()", 1)
 43 |   p = utf8.gsub(p, "($?)$", "()%1", 1)
 44 |   local t = {utf8.match(s, p)}
 45 |   return utf8.sub(s, t[1], t[#t] - 1)
 46 | end
 47 | 
 48 | local
 49 | a,b = utf8.find('', '')    -- empty patterns are tricky
 50 | assert(a == 1 and b == 0);
 51 | a,b = utf8.find('alo', '')
 52 | assert(a == 1 and b == 0)
 53 | a,b = utf8.find('a\0o a\0o a\0o', 'a', 1)   -- first position
 54 | assert(a == 1 and b == 1)
 55 | a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 2)   -- starts in the midle
 56 | assert(a == 5 and b == 7)
 57 | a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 9)   -- starts in the midle
 58 | assert(a == 9 and b == 11)
 59 | a,b = utf8.find('a\0a\0a\0a\0\0ab', '\0ab', 2);  -- finds at the end
 60 | assert(a == 9 and b == 11);
 61 | a,b = utf8.find('a\0a\0a\0a\0\0ab', 'b')    -- last position
 62 | assert(a == 11 and b == 11)
 63 | assert(utf8.find('a\0a\0a\0a\0\0ab', 'b\0') == nil)   -- check ending
 64 | assert(utf8.find('', '\0') == nil)
 65 | assert(utf8.find('alo123alo', '12') == 4)
 66 | assert(utf8.find('alo123alo', '^12') == nil)
 67 | 
 68 | assert(utf8.match("aaab", ".*b") == "aaab")
 69 | assert(utf8.match("aaa", ".*a") == "aaa")
 70 | assert(utf8.match("b", ".*b") == "b")
 71 | 
 72 | assert(utf8.match("aaab", ".+b") == "aaab")
 73 | assert(utf8.match("aaa", ".+a") == "aaa")
 74 | assert(not utf8.match("b", ".+b"))
 75 | 
 76 | assert(utf8.match("aaab", ".?b") == "ab")
 77 | assert(utf8.match("aaa", ".?a") == "aa")
 78 | assert(utf8.match("b", ".?b") == "b")
 79 | 
 80 | assert(f('aloALO', '%l*') == 'alo')
 81 | assert(f('aLo_ALO', '%a*') == 'aLo')
 82 | 
 83 | assert(f("  \n\r*&\n\r   xuxu  \n\n", "%g%g%g+") == "xuxu")
 84 | 
 85 | assert(f('aaab', 'a*') == 'aaa');
 86 | assert(f('aaa', '^.*$') == 'aaa');
 87 | assert(f('aaa', 'b*') == '');
 88 | assert(f('aaa', 'ab*a') == 'aa')
 89 | assert(f('aba', 'ab*a') == 'aba')
 90 | assert(f('aaab', 'a+') == 'aaa')
 91 | assert(f('aaa', '^.+$') == 'aaa')
 92 | assert(f('aaa', 'b+') == nil)
 93 | assert(f('aaa', 'ab+a') == nil)
 94 | assert(f('aba', 'ab+a') == 'aba')
 95 | assert(f('a$a', '.$') == 'a')
 96 | assert(f('a$a', '.%$') == 'a$')
 97 | assert(f('a$a', '.$.') == 'a$a')
 98 | assert(f('a$a', '$$') == nil)
 99 | assert(f('a$b', 'a$') == nil)
100 | assert(f('a$a', '$') == '')
101 | assert(f('', 'b*') == '')
102 | assert(f('aaa', 'bb*') == nil)
103 | assert(f('aaab', 'a-') == '')
104 | assert(f('aaa', '^.-$') == 'aaa')
105 | assert(f('aabaaabaaabaaaba', 'b.*b') == 'baaabaaabaaab')
106 | assert(f('aabaaabaaabaaaba', 'b.-b') == 'baaab')
107 | assert(f('alo xo', '.o$') == 'xo')
108 | assert(f(' \n isto é assim', '%S%S*') == 'isto')
109 | assert(f(' \n isto é assim', '%S*$') == 'assim')
110 | assert(f(' \n isto é assim', '[a-z]*$') == 'assim')
111 | assert(f('um caracter ? extra', '[^%sa-z]') == '?')
112 | assert(f('', 'a?') == '')
113 | assert(f('á', 'á?') == 'á')
114 | assert(f('ábl', 'á?b?l?') == 'ábl')
115 | assert(f('  ábl', 'á?b?l?') == '')
116 | assert(f('aa', '^aa?a?a') == 'aa')
117 | assert(f(']]]áb', '[^]]') == 'á')
118 | assert(f("0alo alo", "%x*") == "0a")
119 | assert(f("alo alo", "%C+") == "alo alo")
120 | print('+')
121 | 
122 | assert(f1('alo alx 123 b\0o b\0o', '(..*) %1') == "b\0o b\0o")
123 | assert(f1('axz123= 4= 4 34', '(.+)=(.*)=%2 %1') == '3= 4= 4 3')
124 | assert(f1('=======', '^(=*)=%1$') == '=======')
125 | assert(utf8.match('==========', '^([=]*)=%1$') == nil)
126 | 
-- Returns the integers i, i+1, ..., j as multiple values (nothing when i > j).
local function range (i, j)
  if not (i <= j) then
    return
  end
  return i, range(i+1, j)
end
132 | 
133 | local abc = utf8.char(range(0, 255));
134 | 
135 | assert(utf8.len(abc) == 256)
136 | assert(string.len(abc) == 384)
137 | 
-- Concatenates, in order, every piece of `abc` that matches pattern `p`.
local function strset (p)
  local collected = ''
  utf8.gsub(abc, p, function (piece)
    collected = collected .. piece
  end)
  return collected
end;
144 | 
145 | local a, b, c, d, e, t
146 | 
147 | -- local E = utf8.escape
148 | -- assert(utf8.len(strset(E'[%200-%210]')) == 11)
149 | 
150 | assert(strset('[a-z]') == "abcdefghijklmnopqrstuvwxyz")
151 | assert(strset('[a-z%d]') == strset('[%da-uu-z]'))
152 | assert(strset('[a-]') == "-a")
153 | assert(strset('[^%W]') == strset('[%w]'))
154 | assert(strset('[]%%]') == '%]')
155 | assert(strset('[a%-z]') == '-az')
156 | assert(strset('[%^%[%-a%]%-b]') == '-[]^ab')
157 | -- assert(strset('%Z') == strset(E'[%1-%255]'))
158 | -- assert(strset('.') == strset(E'[%1-%255%%z]'))
159 | print('+');
160 | 
161 | assert(utf8.match("alo xyzK", "(%w+)K") == "xyz")
162 | assert(utf8.match("254 K", "(%d*)K") == "")
163 | assert(utf8.match("alo ", "(%w*)$") == "")
164 | assert(utf8.match("alo ", "(%w+)$") == nil)
165 | assert(utf8.find("(álo)", "%(á") == 1)
166 | a, b, c, d, e = utf8.match("âlo alo", "^(((.).).* (%w*))$")
167 | assert(a == 'âlo alo' and b == 'âl' and c == 'â' and d == 'alo' and e == nil)
168 | a, b, c, d  = utf8.match('0123456789', '(.+(.?)())')
169 | assert(a == '0123456789' and b == '' and c == 11 and d == nil)
170 | print('+')
171 | 
172 | assert(utf8.gsub('ülo ülo', 'ü', 'x') == 'xlo xlo')
173 | assert(utf8.gsub('alo úlo  ', ' +$', '') == 'alo úlo')  -- trim
174 | assert(utf8.gsub('  alo alo  ', '^%s*(.-)%s*$', '%1') == 'alo alo')  -- double trim
175 | assert(utf8.gsub('alo  alo  \n 123\n ', '%s+', ' ') == 'alo alo 123 ')
176 | t = "abç d"
177 | a, b = utf8.gsub(t, '(.)', '%1@')
178 | assert('@'..a == utf8.gsub(t, '', '@') and b == 5)
179 | a, b = utf8.gsub('abçd', '(.)', '%0@', 2)
180 | assert(a == 'a@b@çd' and b == 2)
181 | assert(utf8.gsub('alo alo', '()[al]', '%1') == '12o 56o')
182 | assert(utf8.gsub("abc=xyz", "(%w*)(%p)(%w+)", "%3%2%1-%0") ==
183 |               "xyz=abc-abc=xyz")
184 | assert(utf8.gsub("abc", "%w", "%1%0") == "aabbcc")
185 | assert(utf8.gsub("abc", "%w+", "%0%1") == "abcabc")
186 | assert(utf8.gsub('áéí', '$', '\0óú') == 'áéí\0óú')
187 | assert(utf8.gsub('', '^', 'r') == 'r')
188 | assert(utf8.gsub('', '$', 'r') == 'r')
189 | print('+')
190 | 
191 | assert(utf8.gsub("um (dois) tres (quatro)", "(%(%w+%))", utf8.upper) ==
192 |             "um (DOIS) tres (QUATRO)")
193 | 
194 | do
195 |   local function setglobal (n,v) rawset(_G, n, v) end
196 |   utf8.gsub("a=roberto,roberto=a", "(%w+)=(%w%w*)", setglobal)
197 |   assert(_G.a=="roberto" and _G.roberto=="a")
198 | end
199 | 
200 | function f(a,b) return utf8.gsub(a,'.',b) end
201 | assert(utf8.gsub("trocar tudo em |teste|b| é |beleza|al|", "|([^|]*)|([^|]*)|", f) ==
202 |             "trocar tudo em bbbbb é alalalalalal")
203 | 
204 | local function dostring (s) return (loadstring or load)(s)() or "" end
205 | assert(utf8.gsub("alo $a=1$ novamente $return a$", "$([^$]*)%$", dostring) ==
206 |             "alo  novamente 1")
207 | 
208 | x = utf8.gsub("$local utf8=require'init' x=utf8.gsub('alo', '.', utf8.upper)$ assim vai para $return x$",
209 |          "$([^$]*)%$", dostring)
210 | assert(x == ' assim vai para ALO')
211 | 
212 | local s,r
213 | t = {}
214 | s = 'a alo jose  joao'
215 | r = utf8.gsub(s, '()(%w+)()', function (a,w,b)
216 |       assert(utf8.len(w) == b-a);
217 |       t[a] = b-a;
218 |     end)
219 | assert(s == r and t[1] == 1 and t[3] == 3 and t[7] == 4 and t[13] == 4)
220 | 
-- True when no parenthesis is left in `s` after every %b() group is removed.
local function isbalanced (s)
  local stripped = utf8.gsub(s, "%b()", "")
  local leftover = utf8.find(stripped, "[()]")
  return leftover == nil
end
225 | 
226 | assert(isbalanced("(9 ((8))(\0) 7) \0\0 a b ()(c)() a"))
227 | assert(not isbalanced("(9 ((8) 7) a b (\0 c) a"))
228 | assert(utf8.gsub("alo 'oi' alo", "%b''", '"') == 'alo " alo')
229 | 
230 | 
231 | local t = {"apple", "orange", "lime"; n=0}
232 | assert(utf8.gsub("x and x and x", "x", function () t.n=t.n+1; return t[t.n] end)
233 |         == "apple and orange and lime")
234 | 
235 | t = {n=0}
236 | utf8.gsub("first second word", "%w%w*", function (w) t.n=t.n+1; t[t.n] = w end)
237 | assert(t[1] == "first" and t[2] == "second" and t[3] == "word" and t.n == 3)
238 | 
239 | t = {n=0}
240 | assert(utf8.gsub("first second word", "%w+",
241 |          function (w) t.n=t.n+1; t[t.n] = w end, 2) == "first second word")
242 | assert(t[1] == "first" and t[2] == "second" and t[3] == nil)
243 | 
244 | assert(not pcall(utf8.gsub, "alo", "(.", print))
245 | assert(not pcall(utf8.gsub, "alo", ".)", print))
246 | assert(not pcall(utf8.gsub, "alo", "(.", {}))
247 | assert(not pcall(utf8.gsub, "alo", "(.)", "%2"))
248 | assert(not pcall(utf8.gsub, "alo", "(%1)", "a"))
249 | --[[--
250 | Stepets: ignoring this test because it's probably a bug in Lua.
251 |   %0 should be interpreted as a capture reference only in the replacement argument;
252 |   it doesn't make sense in a pattern
253 | --]]--
254 | -- assert(not pcall(utf8.gsub, "alo", "(%0)", "a"))
255 | 
256 | -- bug since 2.5 (C-stack overflow)
257 | -- todo: benchmark OOM
258 | -- do
259 | --   local function f (size)
260 | --     local s = string.rep("a", size)
261 | --     local p = string.rep(".?", size)
262 | --     return pcall(utf8.match, s, p)
263 | --   end
264 | --   local r, m = f(80)
265 | --   assert(r and #m == 80)
266 | --   r, m = f(200000)
267 | --   assert(not r and utf8.find(m, "too complex"))
268 | -- end
269 | 
270 | -- if not _soft then
271 | --   -- big strings
272 | --   local a = string.rep('a', 300000)
273 | --   assert(utf8.find(a, '^a*.?$'))
274 | --   assert(not utf8.find(a, '^a*.?b$'))
275 | --   assert(utf8.find(a, '^a-.?$'))
276 | 
277 | --   -- bug in 5.1.2
278 | --   a = string.rep('a', 10000) .. string.rep('b', 10000)
279 | --   assert(not pcall(utf8.gsub, a, 'b'))
280 | -- end
281 | 
282 | -- recursive nest of gsubs
-- Reverses `s` through nested gsub calls: the first character is moved behind
-- the recursively reversed remainder.
local function rev (s)
  local function swap (head, tail)
    return rev(tail)..head
  end
  return utf8.gsub(s, "(.)(.+)", swap)
end
286 | 
287 | local x = "abcdef"
288 | assert(rev(rev(x)) == x)
289 | 
290 | 
291 | -- gsub with tables
292 | assert(utf8.gsub("alo alo", ".", {}) == "alo alo")
293 | assert(utf8.gsub("alo alo", "(.)", {a="AA", l=""}) == "AAo AAo")
294 | assert(utf8.gsub("alo alo", "(.).", {a="AA", l="K"}) == "AAo AAo")
295 | assert(utf8.gsub("alo alo", "((.)(.?))", {al="AA", o=false}) == "AAo AAo")
296 | 
297 | assert(utf8.gsub("alo alo", "().", {2,5,6}) == "256 alo")
298 | 
299 | t = {}; setmetatable(t, {__index = function (t,s) return utf8.upper(s) end})
300 | assert(utf8.gsub("a alo b hi", "%w%w+", t) == "a ALO b HI")
301 | 
302 | 
303 | -- tests for gmatch
304 | local a = 0
305 | for i in utf8.gmatch('abcde', '()') do assert(i == a+1); a=i end
306 | assert(a==6)
307 | 
308 | t = {n=0}
309 | for w in utf8.gmatch("first second word", "%w+") do
310 |       t.n=t.n+1; t[t.n] = w
311 | end
312 | assert(t[1] == "first" and t[2] == "second" and t[3] == "word")
313 | 
314 | t = {3, 6, 9}
315 | for i in utf8.gmatch ("xuxx uu ppar r", "()(.)%2") do
316 |   assert(i == table.remove(t, 1))
317 | end
318 | assert(#t == 0)
319 | 
320 | t = {}
321 | for i,j in utf8.gmatch("13 14 10 = 11, 15= 16, 22=23", "(%d+)%s*=%s*(%d+)") do
322 |   t[i] = j
323 | end
324 | a = 0
325 | for k,v in pairs(t) do assert(k+1 == v+0); a=a+1 end
326 | assert(a == 3)
327 | 
328 | 
329 | -- tests for `%f' (`frontiers')
330 | 
331 | assert(utf8.gsub("aaa aa a aaa a", "%f[%w]a", "x") == "xaa xa x xaa x")
332 | assert(utf8.gsub("[[]] [][] [[[[", "%f[[].", "x") == "x[]] x]x] x[[[")
333 | assert(utf8.gsub("01abc45de3", "%f[%d]", ".") == ".01abc.45de.3")
334 | assert(utf8.gsub("01abc45 de3x", "%f[%D]%w", ".") == "01.bc45 de3.")
335 | -- local u = utf8.escape
336 | -- assert(utf8.gsub("function", u"%%f[%1-%255]%%w", ".") == ".unction")
337 | -- assert(utf8.gsub("function", u"%%f[^%1-%255]", ".") == "function.")
338 | 
339 | --[[--
340 | Stepets: %z is the Lua 5.1 class representing \0.
341 |   It is not documented for Lua 5.2 or Lua 5.3, so it is considered deprecated.
342 | --]]--
343 | assert(utf8.find("a", "%f[a]") == 1)
344 | assert(utf8.find("a", "%f[^%z]") == 1)
345 | assert(utf8.find("a", "%f[^%l]") == 2)
346 | assert(utf8.find("aba", "%f[a%z]") == 3)
347 | assert(utf8.find("aba", "%f[%z]") == 4)
348 | assert(not utf8.find("aba", "%f[%l%z]"))
349 | assert(not utf8.find("aba", "%f[^%l%z]"))
350 | 
351 | local i, e = utf8.find(" alo aalo allo", "%f[%S].-%f[%s].-%f[%S]")
352 | assert(i == 2 and e == 5)
353 | local k = utf8.match(" alo aalo allo", "%f[%S](.-%f[%s].-%f[%S])")
354 | assert(k == 'alo ')
355 | 
356 | local a = {1, 5, 9, 14, 17,}
357 | for k in utf8.gmatch("alo alo th02 is 1hat", "()%f[%w%d]") do
358 |   assert(table.remove(a, 1) == k)
359 | end
360 | assert(#a == 0)
361 | 
362 | -- malformed patterns
-- Asserts that compiling pattern `p` fails and that the error message matches
-- `m` (defaults to "malformed").
local function malform (p, m)
  local expected = m or "malformed"
  local ok, message = pcall(utf8.find, "a", p)
  assert(not ok and utf8.find(message, expected))
end
368 | 
369 | malform("[a")
370 | malform("[]")
371 | malform("[^]")
372 | malform("[a%]")
373 | malform("[a%")
374 | malform("%b", "unbalanced")
375 | malform("%ba", "unbalanced")
376 | malform("%")
377 | malform("%f", "missing")
378 | 
379 | -- \0 in patterns
380 | assert(utf8.match("ab\0\1\2c", "[\0-\2]+") == "\0\1\2")
381 | assert(utf8.match("ab\0\1\2c", "[\0-\0]+") == "\0")
382 | assert(utf8.find("b$a", "$\0?") == 2)
383 | assert(utf8.find("abc\0efg", "%\0") == 4)
384 | assert(utf8.match("abc\0efg\0\1e\1g", "%b\0\1") == "\0efg\0\1e\1")
385 | assert(utf8.match("abc\0\0\0", "%\0+") == "\0\0\0")
386 | assert(utf8.match("abc\0\0\0", "%\0%\0?") == "\0\0")
387 | 
388 | -- magic char after \0
389 | assert(utf8.find("abc\0\0","\0.") == 4)
390 | assert(utf8.find("abcx\0\0abc\0abc","x\0\0abc\0a.") == 4)
391 | 
392 | print('OK')
393 | 


--------------------------------------------------------------------------------
/include/AL/utf8/primitives/dummy.lua:
--------------------------------------------------------------------------------
  1 | -- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
  2 | --
  3 | -- Provides UTF-8 aware string functions implemented in pure lua:
  4 | -- * utf8len(s)
  5 | -- * utf8sub(s, i, j)
  6 | -- * utf8reverse(s)
  7 | -- * utf8char(unicode)
  8 | -- * utf8unicode(s, i, j)
  9 | -- * utf8gensub(s, sub_len)
 10 | -- * utf8find(str, regex, init, plain)
 11 | -- * utf8match(str, regex, init)
 12 | -- * utf8gmatch(str, regex, all)
 13 | -- * utf8gsub(str, regex, repl, limit)
 14 | --
 15 | -- All functions behave as their non UTF-8 aware counterparts with the exception
 16 | -- that UTF-8 characters are used instead of bytes for all units.
 17 | 
 18 | --[[
 19 | Copyright (c) 2006-2007, Kyle Smith
 20 | All rights reserved.
 21 | 
 22 | Contributors:
 23 | 	Alimov Stepan
 24 | 
 25 | Redistribution and use in source and binary forms, with or without
 26 | modification, are permitted provided that the following conditions are met:
 27 | 
 28 |     * Redistributions of source code must retain the above copyright notice,
 29 |       this list of conditions and the following disclaimer.
 30 |     * Redistributions in binary form must reproduce the above copyright
 31 |       notice, this list of conditions and the following disclaimer in the
 32 |       documentation and/or other materials provided with the distribution.
 33 |     * Neither the name of the author nor the names of its contributors may be
 34 |       used to endorse or promote products derived from this software without
 35 |       specific prior written permission.
 36 | 
 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 40 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 41 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 42 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 43 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 44 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 45 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 46 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 47 | --]]
 48 | 
 49 | -- ABNF from RFC 3629
 50 | --
 51 | -- UTF8-octets = *( UTF8-char )
 52 | -- UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
 53 | -- UTF8-1      = %x00-7F
 54 | -- UTF8-2      = %xC2-DF UTF8-tail
 55 | -- UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
 56 | --               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
 57 | -- UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
 58 | --               %xF4 %x80-8F 2( UTF8-tail )
 59 | -- UTF8-tail   = %x80-BF
 60 | --
 61 | return function(utf8)
 62 | 
 63 | local byte    = string.byte
 64 | local char    = string.char
 65 | local dump    = string.dump
 66 | local find    = string.find
 67 | local format  = string.format
 68 | local len     = string.len
 69 | local lower   = string.lower
 70 | local rep     = string.rep
 71 | local sub     = string.sub
 72 | local upper   = string.upper
 73 | 
 74 | local utf8charpattern = '[%z\1-\127\194-\244][\128-\191]*'
 75 | 
 76 | local function utf8symbollen(byte)
 77 |   return not byte and 0 or (byte < 0x80 and 1) or (byte >= 0xF0 and 4) or (byte >= 0xE0 and 3) or (byte >= 0xC0 and 2) or 1
 78 | end
 79 | 
 80 | local head_table = utf8.config.int32array(256)
 81 | for i = 0, 255 do
 82 |   head_table[i] = utf8symbollen(i)
 83 | end
 84 | head_table[256] = 0
 85 | 
 86 | local function utf8charbytes(str, bs)
 87 |   return head_table[byte(str, bs) or 256]
 88 | end
 89 | 
 90 | local function utf8next(str, bs)
 91 |   return bs + utf8charbytes(str, bs)
 92 | end
 93 | 
 94 | -- returns the number of characters in a UTF-8 string
 95 | local function utf8len (str)
 96 |   local bs = 1
 97 |   local bytes = len(str)
 98 |   local length = 0
 99 | 
100 |   while bs <= bytes do
101 |     length = length + 1
102 |     bs = utf8next(str, bs)
103 |   end
104 | 
105 |   return length
106 | end
107 | 
-- works like string.sub, except that i and j count UTF-8 characters
-- rather than bytes; j defaults to -1 (the last character)
local function utf8sub (s, i, j)
  j = j or -1

  local bytes = len(s)

  -- the character count is only needed to resolve negative indices
  if not (i >= 0 and j >= 0) then
    local total = utf8len(s)
    if not (i >= 0) then i = total + i + 1 end
    if not (j >= 0) then j = total + j + 1 end
  end

  if i > j then
    return ""
  end

  -- walk the string once, recording the byte offsets of characters i and j
  local start, finish = 1, bytes
  local pos, seen = 1, 0

  while pos <= bytes do
    seen = seen + 1

    if seen == i then
      start = pos
    end

    pos = utf8next(s, pos)

    if seen == j then
      finish = pos - 1
      break
    end
  end

  -- i beyond the last character: empty result
  if i > seen then start = bytes + 1 end
  -- j before the first character: empty result
  if j < 1 then finish = 0 end

  return sub(s, start, finish)
end
148 | 
-- http://en.wikipedia.org/wiki/Utf8
-- http://developer.coronalabs.com/code/utf-8-conversion-utility
-- Encodes the given code points as one UTF-8 string.
-- Raises an error for any code point above U+10FFFF.
local function utf8char(...)
  local bytes, n = {}, 0

  for _, code in ipairs({...}) do
    if code <= 0x7F then
      -- one byte: the code point itself
      bytes[n + 1] = code
      n = n + 1
    elseif code <= 0x7FF then
      -- two bytes: 5 + 6 payload bits
      bytes[n + 1] = 0xC0 + math.floor(code / 0x40)
      bytes[n + 2] = 0x80 + (code % 0x40)
      n = n + 2
    elseif code <= 0xFFFF then
      -- three bytes: 4 + 6 + 6 payload bits
      bytes[n + 1] = 0xE0 + math.floor(code / 0x1000)
      bytes[n + 2] = 0x80 + (math.floor(code / 0x40) % 0x40)
      bytes[n + 3] = 0x80 + (code % 0x40)
      n = n + 3
    elseif code <= 0x10FFFF then
      -- four bytes: 3 + 6 + 6 + 6 payload bits
      bytes[n + 1] = 0xF0 + math.floor(code / 0x40000)
      bytes[n + 2] = 0x80 + (math.floor(code / 0x1000) % 0x40)
      bytes[n + 3] = 0x80 + (math.floor(code / 0x40) % 0x40)
      bytes[n + 4] = 0x80 + (code % 0x40)
      n = n + 4
    else
      error 'Unicode cannot be greater than U+10FFFF!'
    end
  end

  return char(utf8.config.unpack(bytes))
end
193 | 
194 | 
-- Multipliers for the 6-bit groups of a multi-byte sequence.
-- Written as integer literals rather than 2^n: on Lua 5.3+ the `^` operator
-- always produces a float, which made every decoded multi-byte code point a
-- float (e.g. 233.0 instead of 233).
local shift_6  = 0x40
local shift_12 = 0x1000
local shift_18 = 0x40000

-- Decodes the characters of `str` from byte position `ibs` up to and including
-- the character that starts at byte position `jbs`, returning one code point
-- per character as multiple values.
-- Returns nothing when ibs > jbs or when there is no byte at `ibs`.
-- When a multi-byte sequence is cut off by the end of the string, the value of
-- its first byte is returned instead of a decoded code point.
local utf8unicode
utf8unicode = function(str, ibs, jbs)
  if ibs > jbs then return end

  local bytes = utf8charbytes(str, ibs)
  if bytes == 0 then return end

  local unicode

  if bytes == 1 then
    unicode = byte(str, ibs, ibs)
  elseif bytes == 2 then
    local byte0,byte1 = byte(str, ibs, ibs + 1)
    if byte0 and byte1 then
      local code0,code1 = byte0-0xC0,byte1-0x80
      unicode = code0*shift_6 + code1
    else
      unicode = byte0
    end
  elseif bytes == 3 then
    local byte0,byte1,byte2 = byte(str, ibs, ibs + 2)
    if byte0 and byte1 and byte2 then
      local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80
      unicode = code0*shift_12 + code1*shift_6 + code2
    else
      unicode = byte0
    end
  elseif bytes == 4 then
    local byte0,byte1,byte2,byte3 = byte(str, ibs, ibs + 3)
    if byte0 and byte1 and byte2 and byte3 then
      local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80
      unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3
    else
      unicode = byte0
    end
  end

  if ibs == jbs then
    return unicode
  else
    -- continue with the next character until the one starting at jbs is done
    return unicode,utf8unicode(str, ibs+bytes, jbs)
  end
end
245 | 
-- Character-indexed counterpart of string.byte: returns the code points of
-- characters i..j of `str` (i defaults to 1, j defaults to i).
-- Negative indices count from the end of the string, i is clamped to the first
-- character and j to the last one, as string.byte does for bytes.
-- Returns nothing for an empty string or an empty range.
local function utf8byte(str, i, j)
  if #str == 0 then return end

  -- no range requested: decode the first character only
  if not (i or j) then
    return utf8unicode(str, 1, 1)
  end

  i = i or 1
  j = j or i

  local str_len = utf8len(str)
  if i < 0 then i = str_len + i + 1 end
  if j < 0 then j = str_len + j + 1 end
  if i < 1 then i = 1 end
  if j > str_len then j = str_len end

  if i > j then return end

  -- Byte position of character i. Starting from 1 (instead of nil) is what
  -- makes i == 1 work: previously the loop body never ran for i == 1 and the
  -- function returned nil.
  local ibs = 1
  for _ = 2, i do
    ibs = utf8next(str, ibs)
  end

  -- Byte position of character j, continuing from character i rather than
  -- rescanning from the start of the string.
  local jbs = ibs
  for _ = i + 1, j do
    jbs = utf8next(str, jbs)
  end

  return utf8unicode(str, ibs, jbs)
end
283 | 
-- Returns an iterator function over consecutive pieces of `str`, each
-- `sub_len` characters long (default 1).
-- The iterator takes (skip_ptr, bs): `bs` is the byte position to start from
-- (default 1) and skip_ptr[1], when present, is added to it as an extra byte
-- offset. It returns the next byte position, the piece, and the byte position
-- the piece started at; nil once the start position is past the end.
local function utf8gensub(str, sub_len)
  local step = sub_len or 1
  local max_len = #str

  return function(skip_ptr, bs)
    local skip = 0
    if skip_ptr then
      skip = skip_ptr[1] or 0
    end
    bs = (bs or 1) + skip

    if bs > max_len then return nil end

    local nbs = bs
    for _ = 1, step do
      nbs = utf8next(str, nbs)
    end

    local piece = sub(str, bs, nbs - 1)
    return nbs, piece, bs
  end
end
299 | 
-- Returns `s` with its UTF-8 characters in reverse order.
-- Characters are collected into a table and joined once, instead of
-- prepending to a growing string (which copies the whole result on every step).
local function utf8reverse (s)
  local chars, n = {}, 0
  for _, w in utf8gensub(s) do
    n = n + 1
    chars[n] = w
  end

  -- reverse the collected characters in place
  for lo = 1, math.floor(n / 2) do
    local hi = n - lo + 1
    chars[lo], chars[hi] = chars[hi], chars[lo]
  end

  return table.concat(chars)
end
305 | 
-- true when b is a UTF8-tail byte (%x80-BF); false for nil or any other value
local function is_utf8_tail(b)
  return b ~= nil and b >= 128 and b <= 191
end

-- Validates the UTF-8 sequence starting at byte position `bs` (default 1),
-- following the RFC 3629 grammar.
-- Usable as a generic-for iterator: `for nbs, bs, part, code in utf8validator, str, pos`.
-- Returns:
--   nothing                 when there is no byte at `bs`
--   nbs                     for a well-formed sequence (nbs = position to continue from)
--   nbs, bs, part, code     for a malformed one: `part` is the 1-based index of
--                           the offending byte within the sequence and `code`
--                           its value (nil when the string ends there)
-- Raises an error when `str` is not a string or `bs` is not a number.
local function utf8validator(str, bs)
  bs = bs or 1

  if type(str) ~= "string" then
    error("bad argument #1 to 'validator' (string expected, got ".. type(str).. ")")
  end
  if type(bs) ~= "number" then
    error("bad argument #2 to 'validator' (number expected, got ".. type(bs).. ")")
  end

  local c = byte(str, bs)
  if not c then return end

  -- UTF8-1
  if c <= 127 then
    return bs + 1
  end

  -- 128-193: tail byte or overlong lead; 245-255: never a valid lead byte
  if c <= 193 or c >= 245 then
    return bs + 1, bs, 1, c
  end

  -- total sequence length announced by the lead byte
  local size = (c <= 223 and 2) or (c <= 239 and 3) or 4

  -- allowed range of the second byte; narrower for the boundary lead bytes
  local lo, hi = 128, 191
  if c == 224 then
    lo = 160
  elseif c == 237 then
    hi = 159
  elseif c == 240 then
    lo = 144
  elseif c == 244 then
    hi = 143
  end

  local c2 = byte(str, bs + 1)
  if not c2 or c2 < lo or c2 > hi then
    return bs + 2, bs, 2, c2
  end
  if size == 2 then
    return bs + 2
  end

  local c3 = byte(str, bs + 2)
  if not is_utf8_tail(c3) then
    return bs + 3, bs, 3, c3
  end
  if size == 3 then
    return bs + 3
  end

  local c4 = byte(str, bs + 3)
  if not is_utf8_tail(c4) then
    return bs + 4, bs, 4, c4
  end

  return bs + 4
end
389 | 
-- Runs utf8validator over `str` starting at `byte_pos` and collects every
-- malformed sequence it reports.
-- Returns true plus an empty list when nothing was reported; otherwise false
-- plus a list of { pos, part, code } records.
local function utf8validate(str, byte_pos)
  local problems, count = {}, 0
  for _, bs, part, code in utf8validator, str, byte_pos do
    if bs then
      count = count + 1
      problems[count] = { pos = bs, part = part, code = code }
    end
  end
  return count == 0, problems
end
399 | 
-- Returns a stateful iterator over the characters of `str`. Each call yields
-- the character's byte position, its code point, and the byte position again;
-- nil once the end of the string is reached.
local function utf8codes(str)
  local max_len = #str
  local cursor = 1

  return function()
    if cursor > max_len then return nil end

    local pbs = cursor
    cursor = utf8next(str, pbs)

    local code = utf8unicode(str, pbs, pbs)
    return pbs, code, pbs
  end
end
411 | 
412 | 
-- true when b can start a character: ASCII (0-127) or a lead byte (194-244).
-- The bounds are inclusive so that 0, 127, 194 and 244 count as head bytes.
local function is_head_byte(b)
  return b <= 127 or (b >= 194 and b <= 244)
end

-- Walks backwards from byte position bs to the nearest head byte.
-- Returns its position, or nothing when the start of the string is passed.
local function find_head(str, bs)
  while bs >= 1 do
    if is_head_byte(byte(str, bs)) then
      return bs
    end
    bs = bs - 1
  end
end

--[[--
differs from Lua 5.3 utf8.offset in accepting any byte position (not only a head byte) for all n values

h - head, c - continuation, t - tail
hhhccthccthccthcthhh
        ^ start byte pos
searching for the current character's head by moving backwards
hhhccthccthccthcthhh
      ^ head

n == 0: current position
n > 0: n jumps forward
n < 0: n more scans backwards

bs defaults to 1 for n >= 0 and to #str + 1 for n < 0.
Raises an error when bs is outside 1 .. #str + 1.
Returns nothing when the requested position cannot be reached.
--]]--
local function utf8offset(str, n, bs)
  local l = #str
  if not bs then
    if n < 0 then
      bs = l + 1
    else
      bs = 1
    end
  end
  if bs <= 0 or bs > l + 1 then
    error("bad argument #3 to 'offset' (position out of range)")
  end

  if n == 0 then
    if bs == l + 1 then
      return bs
    end
    return find_head(str, bs)
  elseif n < 0 then
    -- count head bytes while scanning backwards from the byte before bs
    bs = bs - 1
    repeat
      if bs < 1 then
        return
      end

      if is_head_byte(byte(str, bs)) then
        n = n + 1
      end
      bs = bs - 1
    until n == 0
    return bs + 1
  else
    if bs > l then
      return
    end

    -- find_head stops at the start of the string, so a string that begins
    -- with continuation bytes yields no result instead of reading index 0
    bs = find_head(str, bs)
    if not bs then
      return
    end

    -- the head found above is character 1; jump over n - 1 more characters
    for _ = 1, n - 1 do
      if bs > l then
        return
      end
      bs = utf8next(str, bs)
    end
    return bs
  end
end
493 | 
-- Replaces characters of `s` through a lookup table.
-- s       - string to convert
-- mapping - table passed to gsub as the replacement lookup
-- Returns the converted string (the substitution count is dropped).
-- Raises an error when either argument has the wrong type.
local function utf8replace (s, mapping)
  local s_type = type(s)
  if s_type ~= "string" then
    error("bad argument #1 to 'utf8replace' (string expected, got ".. s_type.. ")")
  end

  local mapping_type = type(mapping)
  if mapping_type ~= "table" then
    error("bad argument #2 to 'utf8replace' (table expected, got ".. mapping_type.. ")")
  end

  -- parentheses keep only the first result of gsub
  return (utf8.raw.gsub( s, utf8charpattern, mapping ))
end
504 | 
-- Converts `s` through the lc_uc table of utf8.config.conversion.
-- The table is looked up on every call.
local function utf8upper (s)
  local lc_to_uc = utf8.config.conversion.lc_uc
  return utf8replace(s, lc_to_uc)
end

-- Use the table-driven version only when an lc_uc table is configured;
-- otherwise `upper` (declared outside this block) keeps its previous value.
if utf8.config.conversion.lc_uc then upper = utf8upper end
512 | 
-- Converts `s` through the uc_lc table of utf8.config.conversion.
-- The table is looked up on every call.
local function utf8lower (s)
  local uc_to_lc = utf8.config.conversion.uc_lc
  return utf8replace(s, uc_to_lc)
end

-- Use the table-driven version only when a uc_lc table is configured;
-- otherwise `lower` (declared outside this block) keeps its previous value.
if utf8.config.conversion.uc_lc then lower = utf8lower end
520 | 
-- Publish the functions of this file on the module table.
utf8.len       = utf8len
utf8.sub       = utf8sub
utf8.reverse   = utf8reverse
utf8.char      = utf8char
utf8.unicode   = utf8unicode
utf8.byte      = utf8byte
utf8.next      = utf8next
utf8.gensub    = utf8gensub
utf8.validator = utf8validator
utf8.validate  = utf8validate
utf8.dump      = dump
utf8.format    = format
utf8.lower     = lower
utf8.upper     = upper
utf8.rep       = rep

-- utf8.raw holds a copy of every entry of the standard `string` table.
utf8.raw = {}
for name, fn in pairs(string) do utf8.raw[name] = fn end

utf8.charpattern = utf8charpattern
utf8.offset = utf8offset

-- Only an exact "Lua 5.3" version string switches to the built-in utf8
-- library for codes/codepoint; every other version uses the local versions.
if _VERSION ~= 'Lua 5.3' then
  utf8.codes = utf8codes
  utf8.codepoint = utf8unicode
else
  local utf8_53 = require "utf8"
  utf8.codes = utf8_53.codes
  utf8.codepoint = utf8_53.codepoint
  utf8.len53 = utf8_53.len
end
552 | 
553 | return utf8
554 | 
555 | end
556 | 


--------------------------------------------------------------------------------
/autoload/AL.Persian Toolkit.lua:
--------------------------------------------------------------------------------
   1 | -- Special thanks to Majid110 for inspiring us the great feature of RTL Editor.
   2 | -- https://github.com/Majid110/MasafAutomation
   3 | -- Special thanks to lyger for writing the base of an excellent splitter
   4 | -- https://github.com/lyger/Aegisub_automation_scripts
   5 | 
   6 | -- Authors of each section:
   7 | -- PakNevis: SSgumS
   8 | -- Extend Move: SSgumS
   9 | -- RTL: Shinsekai_Yuri & SSgumS
  10 | -- Un-RTL: Shinsekai_Yuri & SSgumS
  11 | -- Unretard: SSgumS & MD
  12 | -- RTL Editor: Majid Shamkhani (Edited by SSgumS)
  13 | -- Split at Tags: SSgumS (based on lyger's Split at Tags)
  14 | 
  15 | ----- Global Dependencies -----
  16 | include('karaskel.lua')
  17 | 
  18 | local utf8 = require 'AL.utf8':init()
  19 | local re = require 'aegisub.re'
  20 | 
  21 | ----- Script Info -----
  22 | script_name = 'AnimeList Persian Toolkit'
  23 | script_description = 'A toolkit for easier persian fansubbing.'
  24 | script_author = 'AnimeList Team'
  25 | script_version = '1.3.1'
  26 | 
  27 | ----- Script Names -----
  28 | local paknevis_script_name = 'AL Persian Toolkit/PakNevis'
  29 | local extend_move_script_name = 'AL Persian Toolkit/Extend Move'
  30 | local rtl_script_name = 'AL Persian Toolkit/RTL/RTL'
  31 | local unrtl_script_name = 'AL Persian Toolkit/RTL/Un-RTL'
  32 | local unretard_script_name = 'AL Persian Toolkit/Unretard'
  33 | local rtleditor_script_name = 'AL Persian Toolkit/RTL Editor'
  34 | local split_at_tags_script_name = 'AL Persian Toolkit/Split/Split at Tags'
  35 | local split_at_spaces_script_name = 'AL Persian Toolkit/Split/Split at Spaces'
  36 | local reverse_split_at_tags_script_name = 'AL Persian Toolkit/Split/Reverse + Split (at Tags)'
  37 | local reverse_at_tags_script_name = 'AL Persian Toolkit/Split/Reverse at Tags'
  38 | 
  39 | ----- Global Variables ----
  40 | RLE = utf8.char(0x202B)
  41 | subtitles = nil
  42 | 
  43 | ----- Global Functions -----
  44 | local function removeRleChars(text)
  45 |     text = re.sub(text, RLE, "")
  46 |     return text
  47 | end
  48 | 
  49 | local function unrtl(text)
  50 |     text, _ = re.sub(text, "^((?:\\{.*?\\})*)" .. RLE, "\\1")
  51 |     text, _ = re.sub(text, "(\\\\[Nn])((?:\\{.*?\\})*)" .. RLE, "\\1\\2")
  52 |     return text
  53 | end
  54 | 
  55 | local function rtl(text)
  56 |     text = unrtl(text)
  57 |     text, _ = re.sub(text, "^((?:\\{.*?\\})*)", "\\1" .. RLE)
  58 |     text, _ = re.sub(text, "(\\\\[Nn])((?:\\{.*?\\})*)", "\\1\\2" .. RLE)
  59 |     return text
  60 | end
  61 | 
  62 | local function serializeTable(val, name, skipnewlines, depth)
  63 |     skipnewlines = skipnewlines or false
  64 |     depth = depth or 0
  65 | 
  66 |     local tmp = string.rep(" ", depth)
  67 | 
  68 |     if name then tmp = tmp .. name .. " = " end
  69 | 
  70 |     if type(val) == "table" then
  71 |         tmp = tmp .. "{" .. (not skipnewlines and "\n" or "")
  72 | 
  73 |         for k, v in pairs(val) do
  74 |             tmp = tmp .. serializeTable(v, k, skipnewlines, depth + 1) .. "," .. (not skipnewlines and "\n" or "")
  75 |         end
  76 | 
  77 |         tmp = tmp .. string.rep(" ", depth) .. "}"
  78 |     elseif type(val) == "number" then
  79 |         tmp = tmp .. tostring(val)
  80 |     elseif type(val) == "string" then
  81 |         tmp = tmp .. string.format("%q", val)
  82 |     elseif type(val) == "boolean" then
  83 |         tmp = tmp .. (val and "true" or "false")
  84 |     else
  85 |         tmp = tmp .. "\"[inserializeable datatype:" .. type(val) .. "]\""
  86 |     end
  87 | 
  88 |     return tmp
  89 | end
  90 | 
  91 | local function has_value(tab, val)
  92 |     for index, value in ipairs(tab) do
  93 |         if value == val then
  94 |             return true
  95 |         end
  96 |     end
  97 | 
  98 |     return false
  99 | end
 100 | 
 101 | local function difference(a, b)
 102 |     local aa = {}
 103 |     for k, v in pairs(a) do aa[k] = v end
 104 |     for k, v in pairs(b) do
 105 |         if aa[k] == v then
 106 |             aa[k] = nil
 107 |         end
 108 |     end
 109 |     local ret = {}
 110 |     for k, v in pairs(aa) do -- skips nil
 111 |         ret[k] = v
 112 |     end
 113 |     return ret
 114 | end
 115 | 
 116 | -- expand to table of tag-text
 117 | local function expand(text)
 118 |     local result = {}
 119 | 
 120 |     local firstPart = re.match(text, "^([^{].*?)(?:\\{|$)")
 121 |     if firstPart ~= nil then
 122 |         table.insert(result, { tag = "", text = firstPart[2].str })
 123 |     end
 124 | 
 125 |     for f in re.gfind(text, "(\\{.*?\\})([^{]*)") do
 126 |         local m = re.match(f, "(\\{.*?\\})([^{]*)")
 127 |         if m[2] == nil then m[2] = { str = "" } end
 128 |         if m[3] == nil then m[3] = { str = "" } end
 129 |         table.insert(result, { tag = m[2].str, text = m[3].str })
 130 |     end
 131 | 
 132 |     return result
 133 | end
 134 | 
 135 | -- source: https://github.com/unanimated/luaegisub/blob/master/ua.Relocator.lua#L2555
 136 | local function round(n, dec)
 137 |     dec = dec or 0
 138 |     n = math.floor(n * 10 ^ dec + 0.5) / 10 ^ dec
 139 |     return n
 140 | end
 141 | 
 142 | ----- PakNevis -----
 143 | function PakNevis(subtitles, selected_lines, active_line)
 144 |     -- local translation_src = ' كي“”0123456789?⸮,’‘ﺑﺗﺛﺟﺣﺧﺳﺷﺻﺿﻃﻇﻋﻏﻓﻗﻛﻟﻣﻧﻫﻳﺋﺍﺏﺕﺙﺝﺡﺥﺩﺫﺭﺯﺱﺵﺹﺽﻁﻅﻉﻍﻑﻕﻙﻝﻡﻥﻩﻭﻱﺁﺃﺅﺇﺉˈﯿٱھ《》'
 145 |     -- local translation_dst = ' کی""۰۱۲۳۴۵۶۷۸۹؟؟،\'\'بتثجحخسشصضطظعغفقکلمنهیئابتثجحخدذرزسشصضطظعغفقکلمنهویآأؤإئ\'یاه«»'
 146 |     local persian_alphabets = 'ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی'
 147 |     local persian_digits = '۰۱۲۳۴۵۶۷۸۹'
 148 |     local english_digits = '0123456789'
 149 |     local punc_after = '%.:!،؛؟»%]%)'
 150 |     local punc_before = '«%[%('
 151 | 
 152 |     for z, i in ipairs(selected_lines) do
 153 |         local line = subtitles[i]
 154 |         -- translation
 155 |         -- for j = 0, translation_src:len() do
 156 |         --     line.text = utf8.gsub(line.text, '(?!{)(?=[^}])*'..utf8.sub(translation_src, j, j)..'', utf8.sub(translation_dst, j, j))
 157 |         -- end
 158 |         -- line.text = utf8.gsub(line.text, '%%', '٪')
 159 |         -- character refinement patterns
 160 |         line.text = utf8.gsub(line.text, ' +', ' ') -- remove extra spaces
 161 |         line.text = utf8.gsub(line.text, '‌+', '‌') -- remove extra zwnj
 162 |         line.text = utf8.gsub(line.text, '"([^"]+)"', '«%1»') -- replace quotation with gyoome
 163 |         line.text = utf8.gsub(line.text, 'ﻻ', 'لا') -- replace لا
 164 |         line.text = utf8.gsub(line.text, '： ', ': ') -- replace full-width colon
 165 |         line.text = utf8.gsub(line.text, '：', ': ') -- replace full-width colon
 166 |         line.text = utf8.gsub(line.text, '-+', '-') -- remove extra -
 167 |         -- line.text = utf8.gsub(line.text, '-(\\[Nn])', '–%1') -- replace ending - with –
 168 |         -- line.text = utf8.gsub(line.text, '-$', '–') -- replace ending - with –
 169 |         -- punctuation spacing patterns
 170 |         line.text = utf8.gsub(line.text, ' ([' .. punc_after .. '])', '%1') -- remove space before
 171 |         line.text = utf8.gsub(line.text, '([' .. punc_before .. ']) ', '%1') -- remove space after
 172 |         line.text = utf8.gsub(line.text, '([^%d' .. persian_digits .. ']%.)([^ ' .. punc_after .. '])', '%1 %2') -- put space after .
 173 |         line.text = utf8.gsub(line.text, '([%d' .. persian_digits .. ']%.)([^ %d' .. persian_digits .. punc_after .. '])'
 174 |             , '%1 %2') -- put space after .
 175 |         line.text = utf8.gsub(line.text, '([' .. punc_after:sub(3) .. '])([^ ' .. punc_after .. '])', '%1 %2') -- put space after
 176 |         line.text = utf8.gsub(line.text, '([^ ' .. punc_before .. '])([' .. punc_before .. '])', '%1 %2') -- put space before
 177 |         -- affix spacing patterns
 178 |         line.text = utf8.gsub(line.text, '([^ ]ه) ی ', '%1‌ی ') -- fix ی space
 179 |         line.text = utf8.gsub(line.text, ' (ن?می) ', ' %1‌') -- put zwnj after می, نمی
 180 |         line.text = utf8.gsub(line.text, '^(ن?می) ', '%1‌') -- put zwnj after می, نمی
 181 |         line.text = utf8.gsub(line.text,
 182 |             '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (های?)([^' .. persian_alphabets .. '])',
 183 |             '%1‌%2%3') -- put zwnj before تر, تری, ترین, گر, گری, ها, های
 184 |         line.text = utf8.gsub(line.text,
 185 |             '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (گری?)([^' .. persian_alphabets .. '])',
 186 |             '%1‌%2%3') -- put zwnj before تر, تری, ترین, گر, گری, ها, های
 187 |         line.text = utf8.gsub(line.text,
 188 |             '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (تری?ن?)([^' .. persian_alphabets .. '])',
 189 |             '%1‌%2%3') -- put zwnj before تر, تری, ترین, گر, گری, ها, های
 190 |         line.text = utf8.gsub(line.text, '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (های?)$',
 191 |             '%1‌%2') -- put zwnj before تر, تری, ترین, گر, گری, ها, های
 192 |         line.text = utf8.gsub(line.text, '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (گری?)$',
 193 |             '%1‌%2') -- put zwnj before تر, تری, ترین, گر, گری, ها, های
 194 |         line.text = utf8.gsub(line.text, '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (تری?ن?)$',
 195 |             '%1‌%2') -- put zwnj before تر, تری, ترین, گر, گری, ها, های
 196 |         line.text = utf8.gsub(line.text, '([^ ]ه) (ا[میشنت][مد]?)([^' .. persian_alphabets .. '])', '%1‌%2%3') -- join ام, ایم, اش, اند, ای, اید, ات
 197 |         line.text = utf8.gsub(line.text, '([^ ]ه) (ا[میشنت][مد]?)$', '%1‌%2') -- join ام, ایم, اش, اند, ای, اید, ات
 198 |         subtitles[i] = line
 199 |     end
 200 |     aegisub.set_undo_point(paknevis_script_name)
 201 | end
 202 | 
 203 | ----- Unretard -----
 204 | function Unretard(subtitles, selected_lines, active_line)
 205 |     local ending_punc = '%.:!،«%[%(- '
 206 |     local starting_punc = '»%]%)- '
 207 | 
 208 |     local function replace(original_text, text, search_pattern, replace_pattern)
 209 |         local match = utf8.gmatch
 210 |         if '^' == string.sub(search_pattern, 0, 1) then
 211 |             match = function(str, pattern)
 212 |                 return function()
 213 |                     return utf8.match(str, pattern)
 214 |                 end
 215 |             end
 216 |         end
 217 |         for m in match(original_text, search_pattern) do
 218 |             local puncs = utf8.reverse(m)
 219 |             puncs = utf8.gsub(puncs, '«', 't2')
 220 |             puncs = utf8.gsub(puncs, '»', 't1')
 221 |             puncs = utf8.gsub(puncs, 't2', '»')
 222 |             puncs = utf8.gsub(puncs, 't1', '«')
 223 |             puncs = utf8.gsub(puncs, '%(', 't2')
 224 |             puncs = utf8.gsub(puncs, '%)', 't1')
 225 |             puncs = utf8.gsub(puncs, 't2', '%)')
 226 |             puncs = utf8.gsub(puncs, 't1', '%(')
 227 |             puncs = utf8.gsub(puncs, '%[', 't2')
 228 |             puncs = utf8.gsub(puncs, '%]', 't1')
 229 |             puncs = utf8.gsub(puncs, 't2', '%]')
 230 |             puncs = utf8.gsub(puncs, 't1', '%[')
 231 |             text = utf8.gsub(text, replace_pattern, puncs, 1)
 232 |             if '^' == string.sub(search_pattern, 0, 1) then
 233 |                 break
 234 |             end
 235 |         end
 236 |         return text
 237 |     end
 238 | 
 239 |     for z, i in ipairs(selected_lines) do
 240 |         local line = subtitles[i]
 241 | 
 242 |         -- trim
 243 |         line.text = utf8.gsub(line.text, '^ *([^\\]+) *$', '%1')
 244 |         line.text = utf8.gsub(line.text, '^ *([^\\]+) *(\\[Nn])', '%1%2')
 245 |         line.text = utf8.gsub(line.text, '^(\\[Nn]) *([^\\]+) *(\\[Nn])', '%1%2%3')
 246 |         line.text = utf8.gsub(line.text, '^(\\[Nn]) *([^\\]+) *$', '%1%2')
 247 | 
 248 |         if utf8.match(line.text, '%{') == nil then
 249 |             -- unretard
 250 |             -- find
 251 |             local linetext_copy = line.text
 252 |             line.text = utf8.gsub(line.text, '^([' .. ending_punc .. ']+)([^\\]+)$', '%2gce') -- ending puncs
 253 |             line.text = utf8.gsub(line.text, '^([' .. ending_punc .. ']+)([^\\]+)(\\[Nn])', '%2gce%3') -- ending puncs
 254 |             line.text = utf8.gsub(line.text, '(\\[Nn])([' .. ending_punc .. ']+)([^\\]+)(\\[Nn])', '%1%3gce%4') -- ending puncs
 255 |             line.text = utf8.gsub(line.text, '(\\[Nn])([' .. ending_punc .. ']+)([^\\]+)$', '%1%3gce') -- ending puncs
 256 |             line.text = utf8.gsub(line.text, '^([^\\]+[^' .. starting_punc .. '])([' .. starting_punc .. ']+)(g?c?e?)$',
 257 |                 'gcs%1%3') -- starting puncs
 258 |             line.text = utf8.gsub(line.text, '^([^\\]+[^' .. starting_punc ..
 259 |                 '])([' .. starting_punc .. ']+)(g?c?e?)(\\[Nn])', 'gcs%1%3%4') -- starting puncs
 260 |             line.text = utf8.gsub(line.text,
 261 |                 '(\\[Nn])([^\\]+[^' .. starting_punc .. '])([' .. starting_punc .. ']+)(g?c?e?)(\\[Nn])', '%1gcs%2%4%5') -- starting puncs
 262 |             line.text = utf8.gsub(line.text, '(\\[Nn])([^\\]+[^' .. starting_punc ..
 263 |                 '])([' .. starting_punc .. ']+)(g?c?e?)$', '%1gcs%2%3') -- starting puncs
 264 |             -- replace
 265 |             line.text = replace(linetext_copy, line.text, '^([' .. ending_punc .. ']+)[^\\]+$', 'gce')
 266 |             line.text = replace(linetext_copy, line.text, '^([' .. ending_punc .. ']+)[^\\]+\\[Nn]', 'gce')
 267 |             line.text = replace(linetext_copy, line.text, '\\[Nn]([' .. ending_punc .. ']+)[^\\]+\\[Nn]', 'gce')
 268 |             line.text = replace(linetext_copy, line.text, '\\[Nn]([' .. ending_punc .. ']+)[^\\]+$', 'gce')
 269 |             line.text = replace(linetext_copy, line.text, '^[^\\]+[^' .. starting_punc .. ']([' .. starting_punc ..
 270 |                 ']+)$', 'gcs')
 271 |             line.text = replace(linetext_copy, line.text, '^[^\\]+[^' .. starting_punc ..
 272 |                 ']([' .. starting_punc .. ']+)\\[Nn]', 'gcs')
 273 |             line.text = replace(linetext_copy, line.text,
 274 |                 '\\[Nn][^\\]+[^' .. starting_punc .. ']([' .. starting_punc .. ']+)\\[Nn]', 'gcs')
 275 |             line.text = replace(linetext_copy, line.text, '\\[Nn][^\\]+[^' .. starting_punc ..
 276 |                 ']([' .. starting_punc .. ']+)$', 'gcs')
 277 |         end
 278 | 
 279 |         subtitles[i] = line
 280 |     end
 281 |     aegisub.set_undo_point(unretard_script_name)
 282 | end
 283 | 
 284 | ----- RTL -----
 285 | function Rtl(subtitles, selected_lines, active_line)
 286 |     for z, i in ipairs(selected_lines) do
 287 |         local l = subtitles[i]
 288 | 
 289 |         l.text = rtl(l.text)
 290 | 
 291 |         subtitles[i] = l
 292 |     end
 293 |     aegisub.set_undo_point(rtl_script_name)
 294 | end
 295 | 
 296 | ----- Un-RTL -----
 297 | function Unrtl(subtitles, selected_lines, active_line)
 298 |     for z, i in ipairs(selected_lines) do
 299 |         local line = subtitles[i]
 300 | 
 301 |         line.text = unrtl(line.text)
 302 | 
 303 |         subtitles[i] = line
 304 |     end
 305 |     aegisub.set_undo_point(unrtl_script_name)
 306 | end
 307 | 
 308 | ----- RTL Editor -----
 309 | local editor_btn = {
 310 |     Ok = 1,
 311 |     OkWORtl = 2,
 312 |     Cancel = 3,
 313 | }
 314 | 
 315 | local function openEditor(str)
 316 |     local btns = { "OK", "OK w/o RTL", "Cancel" }
 317 | 
 318 |     local btn_switch_case = {}
 319 |     for key, value in pairs(btns) do
 320 |         btn_switch_case[value] = key
 321 |     end
 322 | 
 323 |     local config = {
 324 |         { class = "label", label = "Press Ctrl+Shift at the right side of your keyboard to switch to RTL mode.", x = 0,
 325 |             y = 0 },
 326 |         { class = "textbox", name = "editor", value = str, x = 0, y = 1, width = 33, height = 11 }
 327 |     }
 328 |     local btn, result = aegisub.dialog.display(config, btns, { ok = "OK", cancel = "Cancel" })
 329 |     if btn == true then btn = "OK" elseif btn == false then btn = "Cancel" end
 330 |     return btn_switch_case[btn], result.editor
 331 | end
 332 | 
 333 | function RtlEditor(subtitles, selected_lines)
 334 |     if #selected_lines > 1 then
 335 |         return
 336 |     end
 337 |     local line = subtitles[selected_lines[1]]
 338 | 
 339 |     local text = unrtl(line.text)
 340 |     text = utf8.gsub(text, "\\[Nn]", "\n")
 341 |     local btn, newText = openEditor(text)
 342 | 
 343 |     if btn == editor_btn.Cancel then
 344 |         return
 345 |     end
 346 |     newText = utf8.gsub(newText, "\n", "\\N")
 347 |     if btn == editor_btn.Ok then
 348 |         newText = rtl(newText)
 349 |     end
 350 |     line.text = newText
 351 | 
 352 |     subtitles[selected_lines[1]] = line
 353 | 
 354 |     aegisub.set_undo_point(rtleditor_script_name)
 355 | end
 356 | 
 357 | ----- Split at Tags -----
 358 | local Split = {}
 359 | 
 360 | Split.puncs = '.:!،«[(»\\])\\- <>'
 361 | Split.line_type_tags = {
 362 |     'pos', 'move', 'clip', 'iclip', 'org', 'fade', 'fad', 'an', 'q'
 363 | }
 364 | Split.style_tags = {
 365 |     'i', 'b', 'u', 's', 'bord', 'xbord', 'ybord', 'shad', 'xshad', 'yshad',
 366 |     'fn', 'fs', 'fscx', 'fscy', 'fsp', 'fe', 'c', '1c', '2c', '3c', '4c',
 367 |     'alpha', '1a', '2a', '3a', '4a', 'an', 'r', 'frz', 'fr'
 368 | }
 369 | Split.non_style_tags = {
 370 |     'be', 'blur', 'frx', 'fry', 'fax', 'fay', 'k', 'K', 'kf', 'ko', 'q',
 371 |     'pos', 'move', 'org', 'fad', 'fade', 't', 'clip', 'iclip', 'p', 'pbo'
 372 | }
 373 | Split.style_names_tags = {
 374 |     { 'fontname', 'fn' }, { 'fontsize', 'fs' },
 375 |     { 'color1', '1c', '1a' }, { 'color2', '2c', '2a' }, { 'color3', '3c', '3a' }, { 'color4', '4c', '4a' },
 376 |     { 'bold', 'b' }, { 'italic', 'i' }, { 'underline', 'u' }, { 'strikeout', 's' },
 377 |     { 'scale_x', 'fscx' }, { 'scale_y', 'fscy' }, { 'spacing', 'fsp' }, { 'angle', 'frz' },
 378 |     { 'outline', 'bord' }, { 'shadow', 'shad' }, { 'align', 'an' }, { 'encoding', 'fe' }
 379 | }
 380 | Split.simple_text_value_tags = {
 381 |     'fn', 'alpha', '1a', '2a', '3a', '4a', 'c', '1c', '2c', '3c', '4c', 'r'
 382 | }
 383 | Split.boolean_style_fields = {
 384 |     'bold', 'italic', 'underline', 'strikeout'
 385 | }
 386 | 
 387 | function Split:parse_style(styleref)
 388 |     local tags = {}
 389 |     -- extract Split.style_names_tags
 390 |     for i = 1, #Split.style_names_tags do
 391 |         local table = Split.style_names_tags[i]
 392 |         local style_name = table[1]
 393 |         local tag_name1 = table[2]
 394 |         local value = styleref[style_name]
 395 |         if re.match(style_name, 'color') ~= nil then
 396 |             tags[tag_name1] = re.sub(value, '&H..(.+)', '&H\\1')
 397 |             tags[table[3]] = re.match(value, '&H..')[1].str
 398 |         else
 399 |             if has_value(Split.boolean_style_fields, style_name) then
 400 |                 if value then
 401 |                     value = 1
 402 |                 else
 403 |                     value = 0
 404 |                 end
 405 |             end
 406 |             tags[tag_name1] = value
 407 |         end
 408 |     end
 409 |     -- add other defaults
 410 |     tags['be'] = 0
 411 |     tags['blur'] = 0
 412 |     tags['frx'] = 0
 413 |     tags['fry'] = 0
 414 |     tags['fax'] = 0
 415 |     tags['fay'] = 0
 416 |     tags['pbo'] = 0
 417 |     return tags
 418 | end
 419 | 
 420 | function Split:parse_tags(tags, line_tags, current_appearance) -- TODO: add r support
 421 |     -- handle t tags
 422 |     local t_tags = {}
 423 |     for t in tags:gmatch("\\t%b()") do -- Thanks lyger!
 424 |         table.insert(t_tags, t)
 425 |     end
 426 |     tags = tags:gsub("\\t%b()", "") -- remove t tags
 427 |     if #t_tags > 0 then -- add to table
 428 |         current_appearance["t"] = t_tags
 429 |     end
 430 | 
 431 |     -- other tags
 432 |     for t in tags:gmatch("\\[^\\{}]*") do
 433 |         local tag, value = "", ""
 434 |         if t:match("\\fn") ~= nil then
 435 |             tag, value = t:match("\\(fn)(.*)")
 436 |         else
 437 |             tag, value = t:match("\\([1-4]?%a+)(%A.*)")
 438 |         end
 439 | 
 440 |         if tag == 'fr' then
 441 |             tag = 'frz'
 442 |         elseif tag == 'c' then
 443 |             tag = '1c'
 444 |         end
 445 | 
 446 |         -- add line tags to the appropriate list and others to appearance
 447 |         if has_value(Split.line_type_tags, tag) == true then
 448 |             if has_value(line_tags, tag) == false then
 449 |                 if tag == 'q' or tag == 'an' then
 450 |                     value = tonumber(value)
 451 |                 end
 452 |                 line_tags[tag] = value
 453 |             end
 454 |         else
 455 |             if has_value(Split.simple_text_value_tags, tag) == false then
 456 |                 value = tonumber(value)
 457 |             end
 458 |             current_appearance[tag] = value
 459 |         end
 460 |     end
 461 | end
 462 | 
 463 | function Split:reverse(line)
 464 |     local line = util.copy(line)
 465 |     -- read in styles and meta
 466 |     local meta, styles = karaskel.collect_head(subtitles, false)
 467 | 
 468 |     karaskel.preproc_line(subtitles, meta, styles, line)
 469 | 
 470 |     -- clean tags and text
 471 |     line.text = re.sub(line.text, '}{', '') -- combine redundant back to back tag parts
 472 |     line.text = re.sub(line.text, '^ +', '') -- trim redundant spaces
 473 |     line.text = re.sub(line.text, '^({[^{}]*}) +', '\\1')
 474 |     line.text = re.sub(line.text, ' +$', '')
 475 | 
 476 |     -- make tags-text table
 477 |     local tag_text_table = expand(line.text)
 478 |     -- aegisub.log('Parts:\n'..serializeTable(tag_text_table)..'\n')
 479 | 
 480 |     -- reverse process
 481 |     local line_tags = {}
 482 |     line.text = ''
 483 |     -- extract default appearance
 484 |     local parsed_style = Split:parse_style(line.styleref)
 485 |     -- aegisub.log('Parsed Style:\n'..serializeTable(parsed_style)..'\n')
 486 |     local current_appearance = util.deep_copy(parsed_style)
 487 |     -- 1nd step (parse)
 488 |     for i, val in ipairs(tag_text_table) do
 489 |         -- parse tags
 490 |         Split:parse_tags(val.tag, line_tags, current_appearance)
 491 |         val.tag_list = util.deep_copy(current_appearance)
 492 |     end
 493 |     -- aegisub.log('New Parts:\n'..serializeTable(tag_text_table)..'\n')
 494 | 
 495 |     -- 2nd step (rebuild)
 496 |     local last_tag_list = parsed_style
 497 |     for i = #tag_text_table, 1, -1 do
 498 |         -- get diff and rebuild tags
 499 |         local val = tag_text_table[i]
 500 |         -- get diff
 501 |         -- aegisub.log('Tag List:\n'..serializeTable(val.tag_list)..'\n')
 502 |         -- aegisub.log('Last Tag List:\n'..serializeTable(last_tag_list)..'\n')
 503 |         local diff = difference(val.tag_list, last_tag_list)
 504 |         last_tag_list = val.tag_list
 505 |         -- aegisub.log('Diff:\n'..serializeTable(diff)..'\n')
 506 |         -- rebuild tags
 507 |         local rebuilt_tag = '{}'
 508 |         for tag, value in pairs(diff) do
 509 |             if tag == "t" then
 510 |                 for _, t_tag in ipairs(value) do
 511 |                     rebuilt_tag = rebuilt_tag:gsub("}", t_tag .. "}")
 512 |                 end
 513 |             else
 514 |                 rebuilt_tag = rebuilt_tag:gsub("{", "{\\" .. tag .. value)
 515 |             end
 516 |         end
 517 |         if i == #tag_text_table then
 518 |             for tag, value in pairs(line_tags) do
 519 |                 rebuilt_tag = rebuilt_tag:gsub("{", "{\\" .. tag .. value)
 520 |             end
 521 |         end
 522 |         val.tag = rebuilt_tag
 523 | 
 524 |         -- flip spaces
 525 |         val.text, _ = re.sub(val.text, "^( *)(.*?)( *)$", "\\3\\2\\1")
 526 | 
 527 |         -- rebuild line
 528 |         line.text = line.text .. val.tag .. val.text
 529 |     end
 530 | 
 531 |     return line
 532 | end
 533 | 
 534 | function Split:splitAtTags(line)
 535 |     -- Convert float to neatly formatted string
 536 |     local function float2str(f)
 537 |         return string.format("%.3f", f):gsub("%.(%d-)0+$", "%.%1"):gsub("%.$", "")
 538 |     end
 539 | 
 540 |     -- Returns the position of a line
 541 |     local function get_pos(line)
 542 |         local _, _, posx, posy = line.text:find("\\pos%(([%d%.%-]*),([%d%.%-]*)%)")
 543 |         if posx == nil then
 544 |             _, _, posx, posy = line.text:find("\\move%(([%d%.%-]*),([%d%.%-]*),")
 545 |             if posx == nil then
 546 |                 local _, _, align_n = line.text:find("\\an([%d%.%-]*)")
 547 |                 if align_n == nil then
 548 |                     local _, _, align_dumb = line.text:find("\\a([%d]+)")
 549 |                     if align_dumb == nil then
 550 |                         -- If the line has no alignment tags
 551 |                         posx = line.x
 552 |                         posy = line.y
 553 |                     else
 554 |                         -- If the line has the \a alignment tag
 555 |                         local vid_x, vid_y = aegisub.video_size()
 556 |                         align_dumb = tonumber(align_dumb)
 557 |                         if align_dumb > 8 then
 558 |                             posy = vid_y / 2
 559 |                         elseif align_dumb > 4 then
 560 |                             posy = line.eff_margin_t
 561 |                         else
 562 |                             posy = vid_y - line.eff_margin_b
 563 |                         end
 564 |                         local _temp = align_dumb % 4
 565 |                         if _temp == 1 then
 566 |                             posx = line.eff_margin_l
 567 |                         elseif _temp == 2 then
 568 |                             posx = line.eff_margin_l +
 569 |                                 (vid_x - line.eff_margin_l -
 570 |                                     line.eff_margin_r) / 2
 571 |                         else
 572 |                             posx = vid_x - line.eff_margin_r
 573 |                         end
 574 |                     end
 575 |                 else
 576 |                     -- If the line has the \an alignment tag
 577 |                     local vid_x, vid_y = aegisub.video_size()
 578 |                     align_n = tonumber(align_n)
 579 |                     local _temp = align_n % 3
 580 |                     if align_n > 6 then
 581 |                         posy = line.eff_margin_t
 582 |                     elseif align_n > 3 then
 583 |                         posy = vid_y / 2
 584 |                     else
 585 |                         posy = vid_y - line.eff_margin_b
 586 |                     end
 587 |                     if _temp == 1 then
 588 |                         posx = line.eff_margin_l
 589 |                     elseif _temp == 2 then
 590 |                         posx = line.eff_margin_l +
 591 |                             (vid_x - line.eff_margin_l - line.eff_margin_r) /
 592 |                             2
 593 |                     else
 594 |                         posx = vid_x - line.eff_margin_r
 595 |                     end
 596 |                 end
 597 |             end
 598 |         end
 599 |         return tonumber(posx), tonumber(posy)
 600 |     end
 601 | 
 602 |     -- Returns the origin of a line
 603 |     local function get_org(line)
 604 |         local _, _, orgx, orgy = line.text:find("\\org%(([%d%.%-]*),([%d%.%-]*)%)")
 605 |         if orgx == nil then return get_pos(line) end
 606 |         return tonumber(orgx), tonumber(orgy)
 607 |     end
 608 | 
 609 |     -- Returns a table of tag-value pairs
 610 |     -- Supports fn but ignores r because fuck r
 611 |     local function full_state_subtable(tag)
 612 |         -- Store time tags in their own table, so they don't interfere
 613 |         local time_tags = {}
 614 |         for ttag in tag:gmatch("\\t%b()") do table.insert(time_tags, ttag) end
 615 | 
 616 |         -- Remove time tags from the string so we don't have to deal with them
 617 |         tag = tag:gsub("\\t%b()", "")
 618 | 
 619 |         local state_subtable = {}
 620 | 
 621 |         for t in tag:gmatch("\\[^\\{}]*") do
 622 |             local ttag, tparam = "", ""
 623 |             if t:match("\\fn") ~= nil then
 624 |                 ttag, tparam = t:match("\\(fn)(.*)")
 625 |             else
 626 |                 ttag, tparam = t:match("\\([1-4]?%a+)(%A.*)")
 627 |             end
 628 |             state_subtable[ttag] = tparam
 629 |         end
 630 | 
 631 |         -- Dump the time tags back in
 632 |         if #time_tags > 0 then state_subtable["t"] = time_tags end
 633 | 
 634 |         return state_subtable
 635 |     end
 636 | 
 637 |     local splits = {}
 638 |     local line = util.copy(line)
 639 | 
 640 |     -- clean tags and text
 641 |     line.text = re.sub(line.text, '}{', '') -- combine redundant back to back tag parts
 642 |     line.text = re.sub(line.text, '^ +', '') -- trim redundant spaces
 643 |     line.text = re.sub(line.text, '^({[^{}]*}) +', '\\1')
 644 |     line.text = re.sub(line.text, ' +$', '')
 645 | 
 646 |     -- Read in styles and meta
 647 |     local meta, styles = karaskel.collect_head(subtitles, false)
 648 | 
 649 |     -- Preprocess
 650 |     karaskel.preproc_line(subtitles, meta, styles, line)
 651 | 
 652 |     -- Get position and origin
 653 |     local px, py = get_pos(line)
 654 |     local ox, oy = get_org(line)
 655 | 
 656 |     -- If there are rotations in the line, then write the origin
 657 |     local do_org = false
 658 | 
 659 |     if line.text:match("\\fr[xyz]") ~= nil then do_org = true end
 660 | 
 661 |     -- Turn all \Ns into the newline character
 662 |     -- line.text=line.text:gsub("\\N","\n")
 663 | 
 664 |     -- Make sure any newline followed by a non-newline character has a tag afterwards
 665 |     -- (i.e. force breaks at newlines)
 666 |     -- line.text=line.text:gsub("\n([^\n{])","\n{}%1")
 667 | 
 668 |     -- Make line table
 669 |     local line_table = expand(line.text)
 670 |     local lines_added = 0
 671 |     local line_table_copy = util.copy(line_table)
 672 |     for i, e in ipairs(line_table_copy) do
 673 |         local m = re.match(e.text, "^( *)(.*?)( *)$")
 674 | 
 675 |         if m[2].str ~= "" then
 676 |             table.insert(line_table, i + lines_added, { tag = e.tag, text = rtl(m[2].str) })
 677 |             lines_added = lines_added + 1
 678 |         end
 679 |         e.text = rtl(m[3].str)
 680 |         if m[4].str ~= "" then
 681 |             table.insert(line_table, i + lines_added + 1, { tag = e.tag, text = rtl(m[4].str) })
 682 |             lines_added = lines_added + 1
 683 |         end
 684 |     end
 685 | 
 686 |     -- Stores current state of the line as style table
 687 |     local current_style = util.deep_copy(line.styleref)
 688 | 
 689 |     -- Stores the width of each section
 690 |     local substr_data = {}
 691 | 
 692 |     -- Total width of the line
 693 |     local cum_width = 0
 694 |     -- Total height of the line
 695 |     -- cum_height=0
 696 |     -- Stores the various cumulative widths for each linebreak
 697 |     -- subs_width={}
 698 |     -- subs_index=1
 699 | 
 700 |     -- First pass to collect size data
 701 |     for i, val in ipairs(line_table) do
 702 |         -- Create state subtable
 703 |         local subtable = full_state_subtable(val.tag)
 704 | 
 705 |         -- Fix style tables to reflect override tags
 706 |         current_style.fontname = subtable["fn"] or current_style.fontname
 707 |         current_style.fontsize = tonumber(subtable["fs"]) or
 708 |             current_style.fontsize
 709 |         current_style.scale_x = tonumber(subtable["fscx"]) or
 710 |             current_style.scale_x
 711 |         current_style.scale_y = tonumber(subtable["fscy"]) or
 712 |             current_style.scale_y
 713 |         current_style.spacing = tonumber(subtable["fsp"]) or
 714 |             current_style.spacing
 715 |         current_style.align = tonumber(subtable["an"]) or
 716 |             current_style.align
 717 |         if subtable["b"] ~= nil then
 718 |             if subtable["b"] == "1" then
 719 |                 current_style.bold = true
 720 |             else
 721 |                 current_style.bold = false
 722 |             end
 723 |         end
 724 |         if subtable["i"] ~= nil then
 725 |             if subtable["i"] == "1" then
 726 |                 current_style.italic = true
 727 |             else
 728 |                 current_style.italic = false
 729 |             end
 730 |         end
 731 |         if subtable["a"] ~= nil then
 732 |             local dumbalign = tonumber(subtable["a"])
 733 |             local halign = dumbalign % 4
 734 |             local valign = 0
 735 |             if dumbalign > 8 then
 736 |                 valign = 3
 737 |             elseif dumbalign > 4 then
 738 |                 valign = 6
 739 |             end
 740 |             current_style.align = valign + halign
 741 |         end
 742 | 
 743 |         -- Store this style table
 744 |         val.style = util.deep_copy(current_style)
 745 | 
 746 |         -- Get extents of the section. _sdesc is not used
 747 |         -- Temporarily remove all newlines first
 748 |         local swidth, sheight, _sdesc, sext =
 749 |         aegisub.text_extents(current_style, val.text:gsub("\n", ""))
 750 | 
 751 |         -- aegisub.log("Text: %s\n--w: %.3f\n--h: %.3f\n--d: %.3f\n--el: %.3f\n\n",
 752 |         --	val.text, swidth, sheight, _sdesc, sext)
 753 | 
 754 |         -- Add to cumulative width
 755 |         cum_width = cum_width + swidth
 756 | 
 757 |         -- Total height of the line
 758 |         local theight = 0
 759 | 
 760 |         -- Handle tasks for a line that has a newline
 761 |         --[[if val.text:match("\n")~=nil then
 762 |             --Add sheight for each newline, if any
 763 |             for nl in val.text:gmatch("\n") do
 764 |                 theight=theight+sheight
 765 |             end
 766 | 
 767 |             --Add the external lead to account for the line of normal text
 768 |             --theight=theight+sext
 769 | 
 770 |             --Store the current cumulative width and reset it to zero
 771 |             subs_width[subs_index]=cum_width
 772 |             subs_index=subs_index+1
 773 |             cum_width=0
 774 | 
 775 |             --Add to cumulative height
 776 |             cum_height=cum_height+theight
 777 |         else
 778 |             theight=sheight+sext
 779 |         end]] --
 780 | 
 781 |         -- Add data to data table
 782 |         table.insert(substr_data, {
 783 |             ["width"] = swidth,
 784 |             ["height"] = theight,
 785 |             ["subtable"] = subtable
 786 |         })
 787 | 
 788 |     end
 789 | 
 790 |     -- Store the last cumulative width
 791 |     -- subs_width[subs_index]=cum_width
 792 | 
 793 |     -- Add the last cumulative height
 794 |     -- cum_height=cum_height+substr_data[#substr_data].height
 795 | 
 796 |     -- Stores current state of the line as a state subtable
 797 |     local current_subtable = {}
 798 |     --[[current_subtable=shallow_copy(substr_data[1].subtable)
 799 |     if current_subtable["t"]~=nil then
 800 |         current_subtable["t"]=shallow_copy(substr_data[1].subtable["t"])
 801 |     end]]
 802 | 
 803 |     -- How far to offset the x coordinate
 804 |     local xoffset = 0
 805 | 
 806 |     -- How far to offset the y coordinate
 807 |     -- yoffset=0
 808 | 
 809 |     -- Newline index
 810 |     -- nindex=1
 811 | 
 812 |     -- Ways of calculating the new x position
 813 |     local xpos_func = {}
 814 |     -- Left aligned
 815 |     xpos_func[1] = function(w) return px + xoffset end
 816 |     -- Center aligned
 817 |     xpos_func[2] = function(w)
 818 |         return px - cum_width / 2 + xoffset + w / 2
 819 |     end
 820 |     -- Right aligned
 821 |     xpos_func[0] = function(w) return px - cum_width + xoffset + w end
 822 | 
 823 |     -- Ways of calculating the new y position
 824 |     --[[ypos_func={}
 825 |     --Bottom aligned
 826 |     ypos_func[1]=function(h)
 827 |             return py-cum_height+yoffset+h
 828 |         end
 829 |     --Middle aligned
 830 |     ypos_func[2]=function(h)
 831 |             return py-cum_height/2+yoffset+w/2
 832 |         end
 833 |     --Top aligned
 834 |     ypos_func[3]=function(h)
 835 |             return py+yoffset
 836 |         end]] --
 837 | 
 838 |     -- Second pass to generate lines
 839 |     for i, val in ipairs(line_table) do
 840 |         -- Here's where the action happens
 841 |         local new_line = util.copy(line)
 842 | 
 843 |         -- Fix state table to reflect current state
 844 |         for tag, param in pairs(substr_data[i].subtable) do
 845 |             if tag == "t" then
 846 |                 if current_subtable["t"] == nil then
 847 |                     current_subtable["t"] = util.copy(param)
 848 |                 else
 849 |                     -- current_subtable["t"]={unpack(current_subtable["t"]),unpack(param)}
 850 |                     for _, subval in ipairs(param) do
 851 |                         table.insert(current_subtable["t"], subval)
 852 |                     end
 853 |                 end
 854 |             else
 855 |                 current_subtable[tag] = param
 856 |             end
 857 |         end
 858 | 
 859 |         -- Figure out where the new x and y coords should be
 860 |         local new_x = xpos_func[current_style.align % 3](substr_data[i].width)
 861 |         -- new_y=ypos_func[math.ceil(current_style.align/3)](substr_data[i].height)
 862 | 
 863 |         -- Check if the text ends in whitespace
 864 |         -- local wsp = val.text:gsub("\n", ""):match("%s+$")
 865 | 
 866 |         -- Modify positioning accordingly
 867 |         -- if wsp ~= nil then
 868 |         --     local wsp_width = aegisub.text_extents(val.style, wsp)
 869 |         --     if current_style.align % 3 == 2 then
 870 |         --         new_x = new_x - wsp_width / 2
 871 |         --     elseif current_style.align % 3 == 0 then
 872 |         --         new_x = new_x - wsp_width
 873 |         --     end
 874 |         -- end
 875 | 
 876 |         -- Increase x offset
 877 |         xoffset = xoffset + substr_data[i].width
 878 | 
 879 |         -- Handle what happens in the line contains newlines
 880 |         --[[if val.text:match("\n")~=nil then
 881 |             --Increase index and reset x offset
 882 |             nindex=nindex+1
 883 |             xoffset=0
 884 |             --Increase y offset
 885 |             yoffset=yoffset+substr_data[i].height
 886 | 
 887 |             --Remove the last newline and convert back to \N
 888 |             val.text=val.text:gsub("\n$","")
 889 |             val.text=val.text:gsub("\n","\\N")
 890 |         end]] --
 891 | 
 892 |         -- Start rebuilding text
 893 |         local rebuilt_tag = string.format("{\\pos(%s,%s)}", float2str(new_x),
 894 |             float2str(py))
 895 | 
 896 |         -- Add the remaining tags
 897 |         for tag, param in pairs(current_subtable) do
 898 |             if tag == "t" then
 899 |                 for k, ttime in ipairs(param) do
 900 |                     rebuilt_tag = rebuilt_tag:gsub("}", ttime .. "}")
 901 |                 end
 902 |             elseif tag ~= "pos" and tag ~= "org" then
 903 |                 rebuilt_tag = rebuilt_tag:gsub("{", "{\\" .. tag .. param)
 904 |             end
 905 |         end
 906 | 
 907 |         if do_org then
 908 |             rebuilt_tag = rebuilt_tag:gsub("{", string.format(
 909 |                 "{\\org(%s,%s)",
 910 |                 float2str(ox), float2str(oy)))
 911 |         end
 912 | 
 913 |         -- reverse back text
 914 |         -- local match = re.match(val.text, '^(['..Split.puncs..']*)(.*[^'..Split.puncs..'])(['..Split.puncs..']*)$')
 915 |         -- aegisub.log('Matched Text 2:\n'..serializeTable(match)..'\n')
 916 |         -- if match then
 917 |         --     val.text = utf8.reverse(match[4].str)..match[3].str..utf8.reverse(match[2].str)
 918 |         -- end
 919 | 
 920 |         -- clean text
 921 |         val.text = re.sub(val.text, '^ +', '') -- trim redundant spaces
 922 |         val.text = re.sub(val.text, ' +$', '')
 923 |         val.text = re.sub(val.text, '^[' .. RLE .. ' ]+$', '')
 924 | 
 925 |         new_line.text = rebuilt_tag .. val.text
 926 | 
 927 |         -- Insert the new line
 928 |         if val.text ~= "" then
 929 |             table.insert(splits, 1, new_line)
 930 |         end
 931 |     end
 932 | 
 933 |     return splits
 934 | end
 935 | 
 936 | function Split:splitAtTagsWreverse(line)
 937 |     -- Passes a copy of `line` to Split:reverse, then that outcome to
 938 |     -- Split:splitAtTags; hands back both as { reverse = ..., splits = ... }.
 939 |     local reversed = Split:reverse(util.copy(line))
 940 |     local pieces = Split:splitAtTags(reversed)
 941 |     return { reverse = reversed, splits = pieces }
 942 | end
 943 | 
 944 | ----- Split at Tags -----
 945 | function SplitAtTags(subtitles, selected_lines, active_line)
 946 |     -- Macro entry point: marks each selected line as a comment and inserts the
 947 |     -- `splits` returned by Split:splitAtTagsWreverse for it directly below.
 948 |     _G.subtitles = subtitles -- also exposed as a global (presumably for the Split helpers; confirm)
 949 |     local shift = 0 -- rows inserted so far; later selected indices move by this much
 950 |     for _, sel in ipairs(selected_lines) do
 951 |         local row = sel + shift
 952 |         local original = subtitles[row]
 953 |         local pieces = Split:splitAtTagsWreverse(original).splits
 954 | 
 955 |         original.comment = true
 956 |         subtitles[row] = original
 957 |         for _, piece in ipairs(pieces) do
 958 |             shift = shift + 1
 959 |             subtitles.insert(sel + shift, piece)
 960 |         end
 961 |     end
 962 |     aegisub.set_undo_point(split_at_tags_script_name)
 963 | end
 964 | 
 965 | ----- Split at Spaces -----
 966 | function SplitAtSpaces(subtitles, selected_lines, active_line)
 967 |     -- Macro entry point: puts "{}" .. RLE in front of the runs of spaces in each
 968 |     -- selected line, then splits the result with Split:splitAtTagsWreverse.
 969 |     _G.subtitles = subtitles
 970 | 
 971 |     -- Pass 1: rebuild the text of every selected line with the markers added.
 972 |     local prepared = {}
 973 |     for idx, sel in ipairs(selected_lines) do
 974 |         local entry = subtitles[sel]
 975 |         local buffer = {}
 976 |         for _, part in ipairs(expand(entry.text)) do
 977 |             part.text = re.sub(part.text, "( +)", "{}" .. RLE .. "\\1")
 978 |             buffer[#buffer + 1] = part.tag .. part.text
 979 |         end
 980 |         entry.text = table.concat(buffer)
 981 |         prepared[idx] = entry
 982 |     end
 983 | 
 984 |     -- Pass 2: comment out each original row and insert its splits below it.
 985 |     local shift = 0 -- rows inserted so far
 986 |     for idx, entry in ipairs(prepared) do
 987 |         local pieces = Split:splitAtTagsWreverse(entry).splits
 988 |         local sel = selected_lines[idx]
 989 | 
 990 |         -- Fetch the row again rather than reusing `entry`: pass 1 never wrote it back.
 991 |         local stored = subtitles[sel + shift]
 992 |         stored.comment = true
 993 |         subtitles[sel + shift] = stored
 994 | 
 995 |         for _, piece in ipairs(pieces) do
 996 |             shift = shift + 1
 997 |             subtitles.insert(sel + shift, piece)
 998 |         end
 999 |     end
1000 | 
1001 |     aegisub.set_undo_point(split_at_spaces_script_name)
1002 | end
1003 | 
1004 | ----- Reverse + Split (at Tags) -----
1005 | function ReverseSplitAtTags(subtitles, selected_lines, active_line)
1006 |     -- Macro entry point: comments out each selected line and inserts, right
1007 |     -- below it, the lines that Split:splitAtTags returns for it.
1008 |     _G.subtitles = subtitles
1009 |     local inserted = 0 -- selected indices are offset by the rows added so far
1010 |     for _, sel in ipairs(selected_lines) do
1011 |         local source = subtitles[sel + inserted]
1012 |         local pieces = Split:splitAtTags(source)
1013 | 
1014 |         source.comment = true
1015 |         subtitles[sel + inserted] = source
1016 | 
1017 |         for _, piece in ipairs(pieces) do
1018 |             inserted = inserted + 1
1019 |             subtitles.insert(sel + inserted, piece)
1020 |         end
1021 |     end
1022 |     aegisub.set_undo_point(reverse_split_at_tags_script_name)
1023 | end
1024 | 
1025 | ----- Reverse at Tags -----
1026 | function ReverseAtTags(subtitles, selected_lines, active_line)
1027 |     -- Macro entry point: comments out each selected line and inserts below it what
1028 |     -- Split:reverse returns for a copy whose text was first passed through unrtl.
1029 |     _G.subtitles = subtitles
1030 |     local lines_added = 0
1031 |     for _, n in ipairs(selected_lines) do
1032 |         local line = subtitles[n + lines_added]
1033 |         local new_line = util.copy(line)
1034 |         new_line.text = unrtl(new_line.text)
1035 |         local reverse = Split:reverse(new_line)
1036 |         line.comment = true
1037 |         subtitles[n + lines_added] = line
1038 |         subtitles.insert(n + lines_added + 1, reverse)
1039 |         -- The row just inserted pushes every later selected index down by one.
1040 |         lines_added = lines_added + 1
1041 |     end
1042 |     aegisub.set_undo_point(reverse_at_tags_script_name)
1043 | end
1044 | 
1045 | ----- Extend Move -----
1046 | function ExtendMove(subtitles, selected_lines, active_line)
1047 |     -- Macro entry point: turns every \move(x1,y1,x2,y2,t1,t2) in the selected lines into
1048 |     -- a 4-argument \move whose endpoints are extrapolated to the line's start and end time.
1049 |     for _, i in ipairs(selected_lines) do
1050 |         local line = subtitles[i]
1051 |         line.text = utf8.gsub(line.text,
1052 |             "\\move%(([%d%.%-]*),([%d%.%-]*),([%d%.%-]*),([%d%.%-]*),([%d%.%-]*),([%d%.%-]*)%)",
1053 |             function(x1, y1, x2, y2, t1, t2)
1054 |                 -- Default, used when frame_from_ms gives nil: times relative to the line start.
1055 |                 local s, e = 0, line.end_time - line.start_time
1056 |                 local m1, m2 = tonumber(t1), tonumber(t2)
1057 |                 local f1 = m1 and m2 and aegisub.frame_from_ms(line.start_time + m1)
1058 |                 if f1 then -- frame timing available: snap all four times to frame times
1059 |                     m1 = aegisub.ms_from_frame(f1)
1060 |                     m2 = aegisub.ms_from_frame(aegisub.frame_from_ms(line.start_time + m2))
1061 |                     s = aegisub.ms_from_frame(aegisub.frame_from_ms(line.start_time))
1062 |                     e = aegisub.ms_from_frame(aegisub.frame_from_ms(line.end_time))
1063 |                 end
1064 |                 -- Unparsable numbers or a zero-length move give no velocity: keep the tag as written.
1065 |                 local ok = m1 and m2 and tonumber(x1) and tonumber(y1) and tonumber(x2) and tonumber(y2)
1066 |                 if not ok or m2 == m1 then
1067 |                     return "\\move(" .. table.concat({ x1, y1, x2, y2, t1, t2 }, ",") .. ")"
1068 |                 end
1069 |                 local dxdt = (x2 - x1) / (m2 - m1)
1070 |                 local dydt = (y2 - y1) / (m2 - m1)
1071 |                 local ds = math.max(m1 - s, 0) -- time from line start to move start
1072 |                 local de = math.max(e - m2, 0) -- time from move end to line end
1073 |                 x1 = round(x1 - ds * dxdt, 2)
1074 |                 x2 = round(x2 + de * dxdt, 2)
1075 |                 y1 = round(y1 - ds * dydt, 2)
1076 |                 y2 = round(y2 + de * dydt, 2)
1077 |                 return "\\move(" .. x1 .. "," .. y1 .. "," .. x2 .. "," .. y2 .. ")"
1078 |             end)
1079 |         subtitles[i] = line
1080 |     end
1081 |     aegisub.set_undo_point(extend_move_script_name)
1082 | end
1083 | 
1084 | ----- Register Scripts -----
1085 | for _, macro in ipairs({ -- each entry: macro name, description, handler; registered in this order
1086 |     { paknevis_script_name, 'Fix your shity writing habbits! (Unretarded Lines Only)', PakNevis },
1087 |     { extend_move_script_name, 'Extend \\move based on line\'s time.', ExtendMove },
1088 |     { unretard_script_name, 'Unretard your retarted Persian typing! (Retarded Lines Only)', Unretard },
1089 |     { rtl_script_name, 'Fix RTL languages displaying issues. (Unretarded Lines Only)', Rtl },
1090 |     { unrtl_script_name, 'Undo RTL function effects.', Unrtl },
1091 |     { rtleditor_script_name, 'An editor for easy editing of RTL language lines.', RtlEditor },
1092 |     { split_at_tags_script_name, 'A splitter (at tags) for RTL language lines.', SplitAtTags },
1093 |     { split_at_spaces_script_name, 'A splitter (at spaces) for RTL language lines.', SplitAtSpaces },
1094 |     { reverse_split_at_tags_script_name, 'Split / Reverse at Tags + Split / Split at Tags.', ReverseSplitAtTags },
1095 |     { reverse_at_tags_script_name, 'Reverse line at tags to use it with other LTR automations.', ReverseAtTags },
1096 | }) do aegisub.register_macro(macro[1], macro[2], macro[3]) end
1097 | 


--------------------------------------------------------------------------------