├── include └── AL │ └── utf8 │ ├── charclass │ ├── compiletime │ │ ├── stub.lua │ │ ├── parser.lua │ │ ├── range.lua │ │ ├── builder.lua │ │ └── vanilla.lua │ └── runtime │ │ ├── init.lua │ │ ├── native.lua │ │ ├── dummy.lua │ │ └── base.lua │ ├── primitives │ ├── tarantool.lua │ ├── init.lua │ ├── native.lua │ └── dummy.lua │ ├── context │ ├── compiletime.lua │ └── runtime.lua │ ├── ends │ └── compiletime │ │ ├── parser.lua │ │ └── vanilla.lua │ ├── begins │ └── compiletime │ │ ├── parser.lua │ │ └── vanilla.lua │ ├── test.sh │ ├── test │ ├── test_utf8data.lua │ ├── strict.lua │ ├── util.lua │ ├── context_runtime.lua │ ├── charclass_runtime.lua │ ├── test_compat.lua │ ├── charclass_compiletime.lua │ ├── test.lua │ └── test_pm.lua │ ├── modifier │ └── compiletime │ │ ├── simple.lua │ │ ├── stub.lua │ │ ├── parser.lua │ │ ├── frontier.lua │ │ └── vanilla.lua │ ├── LICENSE │ ├── util.lua │ ├── init.lua │ ├── regex_parser.lua │ ├── README.md │ └── functions │ └── lua53.lua ├── LICENSE ├── README.md └── autoload └── AL.Persian Toolkit.lua /include/AL/utf8/charclass/compiletime/stub.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local cl = utf8.regex.compiletime.charclass.builder 4 | 5 | return function(str, c, bs, ctx) 6 | return cl.new():with_codes(c), utf8.next(str, bs) - bs 7 | end 8 | 9 | end 10 | -------------------------------------------------------------------------------- /include/AL/utf8/primitives/tarantool.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | utf8:require "primitives.dummy" 4 | 5 | local tnt_utf8 = utf8.config.tarantool_utf8 or require("utf8") 6 | 7 | utf8.lower = tnt_utf8.lower 8 | utf8.upper = tnt_utf8.upper 9 | utf8.len = tnt_utf8.len 10 | utf8.char = tnt_utf8.char 11 | 12 | return utf8 13 | end 14 | -------------------------------------------------------------------------------- /include/AL/utf8/context/compiletime.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local begins = utf8.config.begins 4 | local ends = utf8.config.ends 5 | 6 | return { 7 | new = function() 8 | return { 9 | prev_class = nil, 10 | begins = begins[1].default(), 11 | ends = ends[1].default(), 12 | funcs = {}, 13 | internal = false, -- hack for ranges, flags if parser is in [] 14 | } 15 | end 16 | } 17 | 18 | end 19 | -------------------------------------------------------------------------------- /include/AL/utf8/ends/compiletime/parser.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | utf8.config.ends = utf8.config.ends or { 4 | utf8:require "ends.compiletime.vanilla" 5 | } 6 | 7 | function utf8.regex.compiletime.ends.parse(regex, c, bs, ctx) 8 | for _, m in ipairs(utf8.config.ends) do 9 | local functions, move = m.parse(regex, c, bs, ctx) 10 | utf8.debug("ends", _, c, bs, move, functions) 11 | if functions then 12 | return functions, move 13 | end 14 | end 15 | end 16 | 17 | end 18 | -------------------------------------------------------------------------------- /include/AL/utf8/begins/compiletime/parser.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | utf8.config.begins = utf8.config.begins or { 4 | utf8:require "begins.compiletime.vanilla" 5 | } 6 | 7 | function utf8.regex.compiletime.begins.parse(regex, c, bs, ctx) 8 | for _, m in ipairs(utf8.config.begins) do 9 | local functions, move = m.parse(regex, c, bs, ctx) 10 | utf8.debug("begins", _, c, bs, move, functions) 11 | if functions then 12 | return functions, move 13 | end 14 | end 15 | end 16 | 17 | end 18 | -------------------------------------------------------------------------------- /include/AL/utf8/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -xe 4 | 5 | lua53=$(which lua5.3 || which true) 6 | lua51=$(which lua5.1 || which true) 7 | luajit=$(which luajit || which true) 8 | 9 | for test in \ 10 | test/charclass_compiletime.lua \ 11 | test/charclass_runtime.lua \ 12 | test/context_runtime.lua \ 13 | test/test.lua \ 14 | test/test_compat.lua \ 15 | test/test_pm.lua \ 16 | test/test_utf8data.lua 17 | do 18 | $lua53 $test 19 | $lua51 $test 20 | $luajit $test 21 | done 22 | 23 | echo "tests passed" 24 | -------------------------------------------------------------------------------- /include/AL/utf8/test/test_utf8data.lua: -------------------------------------------------------------------------------- 1 | local utf8uclc = require('init') 2 | utf8uclc.config = { 3 | debug = nil, 4 | -- debug = utf8:require("util").debug, 5 | conversion = { 6 | uc_lc = setmetatable({}, {__index = function(self, idx) return "l" end}), 7 | lc_uc = setmetatable({}, {__index = function(self, idx) return "u" end}), 8 | } 9 | } 10 | utf8uclc:init() 11 | 12 | local assert_equals = require 'test.util'.assert_equals 13 | 14 | assert_equals(utf8uclc.lower("фыва"), "llll") 15 | assert_equals(utf8uclc.upper("фыва"), "uuuu") 16 | -------------------------------------------------------------------------------- /include/AL/utf8/charclass/runtime/init.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local provided = utf8.config.runtime_charclasses 4 | 5 | if provided then 6 | if type(provided) == "table" then 7 | return provided 8 | elseif type(provided) == "function" then 9 | return provided(utf8) 10 | else 11 | return utf8:require(provided) 12 | end 13 | end 14 | 15 | local ffi = pcall(require, "ffi") 16 | if not ffi then 17 | return utf8:require "charclass.runtime.dummy" 18 | else 19 | return utf8:require "charclass.runtime.native" 20 | end 21 | 22 | end 23 | -------------------------------------------------------------------------------- /include/AL/utf8/modifier/compiletime/simple.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local matchers = { 4 | simple = function(class, name) 5 | local class_name = 'class' .. name 6 | return [[ 7 | local ]] .. class_name .. [[ = ]] .. class .. [[ 8 | 9 | add(function(ctx) -- simple 10 | -- debug(ctx, 'simple', ']] .. class_name .. [[') 11 | if ]] .. class_name .. [[:test(ctx:get_charcode()) then 12 | ctx:next_char() 13 | ctx:next_function() 14 | return ctx:get_function()(ctx) 15 | end 16 | end) 17 | ]] 18 | end, 19 | } 20 | 21 | return matchers 22 | 23 | end 24 | -------------------------------------------------------------------------------- /include/AL/utf8/primitives/init.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local provided = utf8.config.primitives 4 | 5 | if provided then 6 | if type(provided) == "table" then 7 | return provided 8 | elseif type(provided) == "function" then 9 | return provided(utf8) 10 | else 11 | return utf8:require(provided) 12 | end 13 | end 14 | 15 | if pcall(require, "tarantool") then 16 | return utf8:require "primitives.tarantool" 17 | elseif pcall(require, "ffi") then 18 | return utf8:require "primitives.native" 19 | else 20 | return utf8:require "primitives.dummy" 21 | end 22 | 23 | end 24 | -------------------------------------------------------------------------------- /include/AL/utf8/modifier/compiletime/stub.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local matchers = utf8:require("modifier.compiletime.simple") 4 | 5 | local function parse(regex, c, bs, ctx) 6 | local functions 7 | 8 | if ctx.prev_class then 9 | functions = { matchers.simple(ctx.prev_class, tostring(bs)) } 10 | ctx.prev_class = nil 11 | end 12 | 13 | return functions, 0 14 | end 15 | 16 | local function check(ctx) 17 | if ctx.prev_class then 18 | table.insert(ctx.funcs, matchers.simple(ctx.prev_class, tostring(ctx.pos))) 19 | ctx.prev_class = nil 20 | end 21 | end 22 | 23 | return { 24 | parse = parse, 25 | check = check, 26 | } 27 | 28 | end 29 | -------------------------------------------------------------------------------- /include/AL/utf8/modifier/compiletime/parser.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | utf8.config.modifier = utf8.config.modifier or { 4 | utf8:require "modifier.compiletime.vanilla", 5 | utf8:require "modifier.compiletime.frontier", 6 | utf8:require "modifier.compiletime.stub", 7 | } 8 | 9 | function utf8.regex.compiletime.modifier.parse(regex, c, bs, ctx) 10 | for _, m in ipairs(utf8.config.modifier) do 11 | local functions, move = m.parse(regex, c, bs, ctx) 12 | utf8.debug("mod", _, c, bs, move, functions and utf8.config.unpack(functions)) 13 | if functions then 14 | ctx.prev_class = nil 15 | return functions, move 16 | end 17 | end 18 | end 19 | 20 | end 21 | -------------------------------------------------------------------------------- /include/AL/utf8/charclass/compiletime/parser.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | utf8.config.compiletime_charclasses = utf8.config.compiletime_charclasses or { 4 | utf8:require "charclass.compiletime.vanilla", 5 | utf8:require "charclass.compiletime.range", 6 | utf8:require "charclass.compiletime.stub", 7 | } 8 | 9 | function utf8.regex.compiletime.charclass.parse(regex, c, bs, ctx) 10 | utf8.debug("parse charclass():", regex, c, bs, regex[bs]) 11 | for _, p in ipairs(utf8.config.compiletime_charclasses) do 12 | local charclass, nbs = p(regex, c, bs, ctx) 13 | if charclass then 14 | ctx.prev_class = charclass:build() 15 | utf8.debug("cc", ctx.prev_class, _, c, bs, nbs) 16 | return charclass, nbs 17 | end 18 | end 19 | end 20 | 21 | end 22 | -------------------------------------------------------------------------------- /include/AL/utf8/ends/compiletime/vanilla.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local matchers = { 4 | any = function() 5 | return [[ 6 | add(function(ctx) -- any 7 | ctx.result.finish = ctx.pos - 1 8 | ctx:done() 9 | end) 10 | ]] 11 | end, 12 | toend = function(ctx) 13 | return [[ 14 | add(function(ctx) -- toend 15 | ctx.result.finish = ctx.pos - 1 16 | ctx.modified = true 17 | if ctx.pos == utf8len(ctx.str) + 1 then ctx:done() end 18 | end) 19 | ]] 20 | end, 21 | } 22 | 23 | local len = utf8.raw.len 24 | 25 | local function default() 26 | return matchers.any() 27 | end 28 | 29 | local function parse(regex, c, bs, ctx) 30 | local functions 31 | local skip = 0 32 | 33 | if bs == len(regex) and c == '$' then 34 | functions = matchers.toend() 35 | skip = 1 36 | end 37 | 38 | return functions, skip 39 | end 40 | 41 | return { 42 | parse = parse, 43 | default = default, 44 | } 45 | 46 | end 47 | -------------------------------------------------------------------------------- /include/AL/utf8/charclass/compiletime/range.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local cl = utf8.regex.compiletime.charclass.builder 4 | 5 | local next = utf8.util.next 6 | 7 | return function(str, c, bs, ctx) 8 | if not ctx.internal then return end 9 | 10 | local nbs = bs 11 | 12 | local r1, r2 13 | 14 | local c, nbs = c, bs 15 | if c == '%' then 16 | c, nbs = next(str, nbs) 17 | r1 = c 18 | else 19 | r1 = c 20 | end 21 | 22 | utf8.debug("range r1", r1, nbs) 23 | 24 | c, nbs = next(str, nbs) 25 | if c ~= '-' then return end 26 | 27 | c, nbs = next(str, nbs) 28 | if c == '%' then 29 | c, nbs = next(str, nbs) 30 | r2 = c 31 | elseif c ~= '' and c ~= ']' then 32 | r2 = c 33 | end 34 | 35 | utf8.debug("range r2", r2, nbs) 36 | 37 | if r1 and r2 then 38 | return cl.new():with_ranges{utf8.byte(r1), utf8.byte(r2)}, utf8.next(str, nbs) - bs 39 | else 40 | return 41 | end 42 | end 43 | 44 | end 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 mzn928 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /include/AL/utf8/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Stepets 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /include/AL/utf8/test/strict.lua: -------------------------------------------------------------------------------- 1 | --[[-- 2 | strict.lua from http://metalua.luaforge.net/src/lib/strict.lua.html 3 | --]]-- 4 | 5 | -- 6 | -- strict.lua 7 | -- checks uses of undeclared global variables 8 | -- All global variables must be 'declared' through a regular assignment 9 | -- (even assigning nil will do) in a main chunk before being used 10 | -- anywhere or assigned to inside a function. 11 | -- 12 | 13 | local mt = getmetatable(_G) 14 | if mt == nil then 15 | mt = {} 16 | setmetatable(_G, mt) 17 | end 18 | 19 | __STRICT = true 20 | mt.__declared = {} 21 | 22 | mt.__newindex = function (t, n, v) 23 | if __STRICT and not mt.__declared[n] then 24 | local w = debug.getinfo(2, "S").what 25 | if w ~= "main" and w ~= "C" then 26 | error("assign to undeclared variable '"..n.."'", 2) 27 | end 28 | mt.__declared[n] = true 29 | end 30 | rawset(t, n, v) 31 | end 32 | 33 | mt.__index = function (t, n) 34 | if not mt.__declared[n] and debug.getinfo(2, "S").what ~= "C" then 35 | error("variable '"..n.."' is not declared", 2) 36 | end 37 | return rawget(t, n) 38 | end 39 | 40 | function global(...) 41 | for _, v in ipairs{...} do mt.__declared[v] = true end 42 | end 43 | -------------------------------------------------------------------------------- /include/AL/utf8/begins/compiletime/vanilla.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local matchers = { 4 | sliding = function() 5 | return [[ 6 | add(function(ctx) -- sliding 7 | while ctx.pos <= ctx.len do 8 | local clone = ctx:clone() 9 | -- debug('starting from', clone, "start_pos", clone.pos) 10 | clone.result.start = clone.pos 11 | clone:next_function() 12 | clone:get_function()(clone) 13 | 14 | ctx:next_char() 15 | end 16 | ctx:terminate() 17 | end) 18 | ]] 19 | end, 20 | fromstart = function(ctx) 21 | return [[ 22 | add(function(ctx) -- fromstart 23 | if ctx.byte_pos > ctx.len then 24 | return 25 | end 26 | ctx.result.start = ctx.pos 27 | ctx:next_function() 28 | ctx:get_function()(ctx) 29 | ctx:terminate() 30 | end) 31 | ]] 32 | end, 33 | } 34 | 35 | local function default() 36 | return matchers.sliding() 37 | end 38 | 39 | local function parse(regex, c, bs, ctx) 40 | if bs ~= 1 then return end 41 | 42 | local functions 43 | local skip = 0 44 | 45 | if c == '^' then 46 | functions = matchers.fromstart() 47 | skip = 1 48 | else 49 | functions = matchers.sliding() 50 | end 51 | 52 | return functions, skip 53 | end 54 | 55 | return { 56 | parse = parse, 57 | default = default, 58 | } 59 | 60 | end 61 | -------------------------------------------------------------------------------- /include/AL/utf8/primitives/native.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local ffi = require("ffi") 4 | if ffi.os == "Windows" then 5 | os.setlocale(utf8.config.locale or "english_us.65001", "ctype") 6 | ffi.cdef[[ 7 | short towupper(short c); 8 | short towlower(short c); 9 | ]] 10 | else 11 | os.setlocale(utf8.config.locale or "C.UTF-8", "ctype") 12 | ffi.cdef[[ 13 | int towupper(int c); 14 | int towlower(int c); 15 | ]] 16 | end 17 | 18 | utf8:require "primitives.dummy" 19 | 20 | if not utf8.config.conversion.uc_lc then 21 | function utf8.lower(str) 22 | local bs = 1 23 | local nbs 24 | local bytes = utf8.raw.len(str) 25 | local res = {} 26 | 27 | while bs <= bytes do 28 | nbs = utf8.next(str, bs) 29 | local cp = utf8.unicode(str, bs, nbs) 30 | res[#res + 1] = ffi.C.towlower(cp) 31 | bs = nbs 32 | end 33 | 34 | return utf8.char(utf8.config.unpack(res)) 35 | end 36 | end 37 | 38 | if not utf8.config.conversion.lc_uc then 39 | function utf8.upper(str) 40 | local bs = 1 41 | local nbs 42 | local bytes = utf8.raw.len(str) 43 | local res = {} 44 | 45 | while bs <= bytes do 46 | nbs = utf8.next(str, bs) 47 | local cp = utf8.unicode(str, bs, nbs) 48 | res[#res + 1] = ffi.C.towupper(cp) 49 | bs = nbs 50 | end 51 | 52 | return utf8.char(utf8.config.unpack(res)) 53 | end 54 | end 55 | 56 | return utf8 57 | end 58 | -------------------------------------------------------------------------------- /include/AL/utf8/util.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | function utf8.util.copy(obj, deep) 4 | if type(obj) == 'table' then 5 | local result = {} 6 | if deep then 7 | for k,v in pairs(obj) do 8 | result[k] = utf8.util.copy(v, true) 9 | end 10 | else 11 | for k,v in pairs(obj) do 12 | result[k] = v 13 | end 14 | end 15 | return result 16 | else 17 | return obj 18 | end 19 | end 20 | 21 | local function dump(val, tab) 22 | tab = tab or '' 23 | 24 | if type(val) == 'table' then 25 | utf8.config.logger('{\n') 26 | for k,v in pairs(val) do 27 | utf8.config.logger(tab .. tostring(k) .. " = ") 28 | dump(v, tab .. '\t') 29 | utf8.config.logger("\n") 30 | end 31 | utf8.config.logger(tab .. '}\n') 32 | else 33 | utf8.config.logger(tostring(val)) 34 | end 35 | end 36 | 37 | function utf8.util.debug(...) 38 | local t = {...} 39 | for _, v in ipairs(t) do 40 | if type(v) == "table" and not (getmetatable(v) or {}).__tostring then 41 | dump(v, '\t') 42 | else 43 | utf8.config.logger(tostring(v), " ") 44 | end 45 | end 46 | 47 | utf8.config.logger('\n') 48 | end 49 | 50 | function utf8.debug(...) 51 | if utf8.config.debug then 52 | utf8.config.debug(...) 53 | end 54 | end 55 | 56 | function utf8.util.next(str, bs) 57 | local nbs1 = utf8.next(str, bs) 58 | local nbs2 = utf8.next(str, nbs1) 59 | return utf8.raw.sub(str, nbs1, nbs2 - 1), nbs1 60 | end 61 | 62 | return utf8.util 63 | 64 | end 65 | -------------------------------------------------------------------------------- /include/AL/utf8/charclass/runtime/native.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | os.setlocale(utf8.config.locale, "ctype") 4 | 5 | local ffi = require("ffi") 6 | ffi.cdef[[ 7 | int iswalnum(int c); 8 | int iswalpha(int c); 9 | int iswascii(int c); 10 | int iswblank(int c); 11 | int iswcntrl(int c); 12 | int iswdigit(int c); 13 | int iswgraph(int c); 14 | int iswlower(int c); 15 | int iswprint(int c); 16 | int iswpunct(int c); 17 | int iswspace(int c); 18 | int iswupper(int c); 19 | int iswxdigit(int c); 20 | ]] 21 | 22 | local base = utf8:require "charclass.runtime.base" 23 | 24 | local native = setmetatable({}, {__index = base}) 25 | local mt = {__index = native} 26 | 27 | function native.new() 28 | return setmetatable({}, mt) 29 | end 30 | 31 | function native:is(class, char_code) 32 | if class == 'alpha' then return ffi.C.iswalpha(char_code) ~= 0 33 | elseif class == 'cntrl' then return ffi.C.iswcntrl(char_code) ~= 0 34 | elseif class == 'digit' then return ffi.C.iswdigit(char_code) ~= 0 35 | elseif class == 'graph' then return ffi.C.iswgraph(char_code) ~= 0 36 | elseif class == 'lower' then return ffi.C.iswlower(char_code) ~= 0 37 | elseif class == 'punct' then return ffi.C.iswpunct(char_code) ~= 0 38 | elseif class == 'space' then return ffi.C.iswspace(char_code) ~= 0 39 | elseif class == 'upper' then return ffi.C.iswupper(char_code) ~= 0 40 | elseif class == 'alnum' then return ffi.C.iswalnum(char_code) ~= 0 41 | elseif class == 'xdigit' then return ffi.C.iswxdigit(char_code) ~= 0 42 | end 43 | end 44 | 45 | return native 46 | 47 | end 48 | -------------------------------------------------------------------------------- /include/AL/utf8/charclass/runtime/dummy.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local base = utf8:require "charclass.runtime.base" 4 | 5 | local dummy = setmetatable({}, {__index = base}) 6 | local mt = {__index = dummy} 7 | 8 | function dummy.new() 9 | return setmetatable({}, mt) 10 | end 11 | 12 | function dummy:with_classes(...) 13 | local classes = {...} 14 | for _, c in ipairs(classes) do 15 | if c == 'alpha' then self:with_ranges({65, 90}, {97, 122}) 16 | elseif c == 'cntrl' then self:with_ranges({0, 31}):with_codes(127) 17 | elseif c == 'digit' then self:with_ranges({48, 57}) 18 | elseif c == 'graph' then self:with_ranges({1, 8}, {14, 31}, {33, 132}, {134, 159}, {161, 5759}, {5761, 8191}, {8203, 8231}, {8234, 8238}, {8240, 8286}, {8288, 12287}) 19 | elseif c == 'lower' then self:with_ranges({97, 122}) 20 | elseif c == 'punct' then self:with_ranges({33, 47}, {58, 64}, {91, 96}, {123, 126}) 21 | elseif c == 'space' then self:with_ranges({9, 13}):with_codes(32, 133, 160, 5760):with_ranges({8192, 8202}):with_codes(8232, 8233, 8239, 8287, 12288) 22 | elseif c == 'upper' then self:with_ranges({65, 90}) 23 | elseif c == 'alnum' then self:with_ranges({48, 57}, {65, 90}, {97, 122}) 24 | elseif c == 'xdigit' then self:with_ranges({48, 57}, {65, 70}, {97, 102}) 25 | end 26 | end 27 | return self 28 | end 29 | 30 | function dummy:without_classes(...) 31 | local classes = {...} 32 | if #classes > 0 then 33 | return self:with_subs(dummy.new():with_classes(...):invert()) 34 | else 35 | return self 36 | end 37 | end 38 | 39 | return dummy 40 | 41 | end 42 | -------------------------------------------------------------------------------- /include/AL/utf8/modifier/compiletime/frontier.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local matchers = { 4 | frontier = function(class, name) 5 | local class_name = 'class' .. name 6 | return [[ 7 | local ]] .. class_name .. [[ = ]] .. class .. [[ 8 | 9 | add(function(ctx) -- frontier 10 | ctx:prev_char() 11 | local prev_charcode = ctx:get_charcode() or 0 12 | ctx:next_char() 13 | local charcode = ctx:get_charcode() or 0 14 | -- debug("frontier pos", ctx.pos, "prev_charcode", prev_charcode, "charcode", charcode) 15 | if ]] .. class_name .. [[:test(prev_charcode) then return end 16 | if ]] .. class_name .. [[:test(charcode) then 17 | ctx:next_function() 18 | return ctx:get_function()(ctx) 19 | end 20 | end) 21 | ]] 22 | end, 23 | simple = utf8:require("modifier.compiletime.simple").simple, 24 | } 25 | 26 | local function parse(regex, c, bs, ctx) 27 | local functions, nbs, class 28 | 29 | if c == '%' then 30 | if utf8.raw.sub(regex, bs + 1, bs + 1) ~= 'f' then return end 31 | if utf8.raw.sub(regex, bs + 2, bs + 2) ~= '[' then error("missing '[' after '%f' in pattern") end 32 | 33 | functions = {} 34 | if ctx.prev_class then 35 | table.insert(functions, matchers.simple(ctx.prev_class, tostring(bs))) 36 | ctx.prev_class = nil 37 | end 38 | class, nbs = utf8.regex.compiletime.charclass.parse(regex, '[', bs + 2, ctx) 39 | nbs = nbs + 2 40 | table.insert(functions, matchers.frontier(class:build(), tostring(bs))) 41 | end 42 | 43 | return functions, nbs 44 | end 45 | 46 | return { 47 | parse = parse, 48 | } 49 | 50 | end 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Aegisub Persian Toolkit 2 | Collection of tools that might help Persian translators. 3 | 4 | [AnimDL.ir](https://www.animdl.ir) | [@AnimeList_ir](https://t.me/animelist_ir) 5 | 6 | # How to install 7 | 1. Copy autoload content to autoload directory of aegisub. 8 | 2. Copy include content to include directory of aegisub. 9 | 10 | Mentioned directories are at the locations bellow: 11 | - Windows: 12 | ``` 13 | %appdata%\Aegisub\automation\ 14 | ``` 15 | - Linux: 16 | ``` 17 | ~/.aegisub/automation/ 18 | ``` 19 | 20 | (If folders doesn\'t exist you can create it yourself) 21 | 22 | # Scripts 23 | ## PakNevis 24 | Correct common mistakes in Persian text. 25 | ## Extend Move 26 | Extend \move based on line's time (Created for linear signs that go outside of video boundries). 27 | ## Unretard 28 | Converts non-RTL typed text to RTL compatible one. 29 | ## RTL / RTL 30 | Fix RTL languages displaying issues. 31 | ## RTL / Un-RTL 32 | Undo RTL function effects. 33 | ## RTL Editor (Edited version of MasafAutomation\'s RTL Editor) 34 | An editor for easy editing of RTL language lines. 35 | ## Split / Split at Tags (Based on Lyger's Split at Tags automation) 36 | A splitter (at tags) for RTL language lines. 37 | ## Split / Split at Spaces 38 | A splitter (at spaces) for RTL language lines. 39 | ## Split / Reverse + Split (at Tags) 40 | Split / Reverse at Tags + Split / Split at Tags. 41 | ## Split / Reverse at Tags 42 | Reverse line at tags to use it with other LTR automations. 43 | 44 | # Credits 45 | - [utf8.lua](https://github.com/Stepets/utf8.lua) 46 | - [MasafAutomation](https://github.com/Majid110/MasafAutomation) 47 | - [Lyger's Automations](https://github.com/lyger/Aegisub_automation_scripts) 48 | -------------------------------------------------------------------------------- /include/AL/utf8/init.lua: -------------------------------------------------------------------------------- 1 | local module_path = ... 2 | module_path = module_path:match("^(.-)init$") or (module_path .. '.') 3 | 4 | local ffi_enabled, ffi = pcall(require, 'ffi') 5 | 6 | local utf8 = { 7 | config = {}, 8 | default = { 9 | debug = nil, 10 | logger = io.write, 11 | loadstring = (loadstring or load), 12 | unpack = (unpack or table.unpack), 13 | cache = { 14 | regex = setmetatable({},{ 15 | __mode = 'kv' 16 | }), 17 | plain = setmetatable({},{ 18 | __mode = 'kv' 19 | }), 20 | }, 21 | locale = nil, 22 | int32array = function(size) 23 | if ffi_enabled then 24 | return ffi.new("uint32_t[?]", size + 1) 25 | else 26 | return {} 27 | end 28 | end, 29 | conversion = { 30 | uc_lc = nil, 31 | lc_uc = nil 32 | } 33 | }, 34 | regex = { 35 | compiletime = { 36 | charclass = {}, 37 | begins = {}, 38 | ends = {}, 39 | modifier = {}, 40 | } 41 | }, 42 | util = {}, 43 | } 44 | 45 | function utf8:require(name) 46 | local full_module_path = module_path .. name 47 | if package.loaded[full_module_path] then 48 | return package.loaded[full_module_path] 49 | end 50 | 51 | local mod = require(full_module_path) 52 | if type(mod) == 'function' then 53 | mod = mod(self) 54 | package.loaded[full_module_path] = mod 55 | end 56 | return mod 57 | end 58 | 59 | function utf8:init() 60 | for k, v in pairs(self.default) do 61 | self.config[k] = self.config[k] or v 62 | end 63 | 64 | self:require "util" 65 | self:require "primitives.init" 66 | self:require "functions.lua53" 67 | 68 | return self 69 | end 70 | 71 | return utf8 72 | -------------------------------------------------------------------------------- /include/AL/utf8/test/util.lua: -------------------------------------------------------------------------------- 1 | require "test.strict" 2 | 3 | local function equals(t1, t2) 4 | for k,v in pairs(t1) do 5 | if t2[k] == nil then return false end 6 | if type(t2[k]) == 'cdata' and type(v) == 'cdata' then 7 | return true -- don't know how to compare 8 | elseif type(t2[k]) == 'table' and type(v) == 'table' then 9 | if not equals(t2[k], v) then return false end 10 | else 11 | if t2[k] ~= v then return false end 12 | end 13 | end 14 | for k,v in pairs(t2) do 15 | if t1[k] == nil then return false end 16 | if type(t1[k]) == 'cdata' and type(v) == 'cdata' then 17 | return true -- don't know how to compare 18 | elseif type(t1[k]) == 'table' and type(v) == 'table' then 19 | if not equals(t1[k], v) then return false end 20 | else 21 | if t1[k] ~= v then return false end 22 | end 23 | end 24 | return true 25 | end 26 | 27 | local old_tostring = tostring 28 | local function tostring(v) 29 | local type = type(v) 30 | if type == 'table' then 31 | local tbl = "{" 32 | for k,v in pairs(v) do 33 | tbl = tbl .. tostring(k) .. ' = ' .. tostring(v) .. ', ' 34 | end 35 | return tbl .. '}' 36 | else 37 | return old_tostring(v) 38 | end 39 | end 40 | 41 | local old_assert = assert 42 | local assert = function(cond, ...) 43 | if not cond then 44 | local data = {...} 45 | local msg = "" 46 | for _, v in pairs(data) do 47 | local type = type(v) 48 | if type == 'table' then 49 | local tbl = "{" 50 | for k,v in pairs(v) do 51 | tbl = tbl .. tostring(k) .. ' = ' .. tostring(v) .. ', ' 52 | end 53 | msg = msg .. tbl .. '}' 54 | else 55 | msg = msg .. tostring(v) 56 | end 57 | end 58 | error(#data > 0 and msg or "assertion failed!") 59 | end 60 | return cond 61 | end 62 | 63 | local function assert_equals(a,b) 64 | assert( 65 | type(a) == 'table' and type(b) == 'table' and equals(a,b) or a == b, 66 | "expected: ", a and a or tostring(a), "\n", 67 | "got: ", b and b or tostring(b) 68 | ) 69 | end 70 | 71 | return { 72 | equals = equals, 73 | assert = assert, 74 | assert_equals = assert_equals, 75 | } 76 | -------------------------------------------------------------------------------- /include/AL/utf8/test/context_runtime.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require("init"):init() 2 | 3 | local context = utf8:require('context.runtime') 4 | 5 | local equals = require('test.util').equals 6 | local assert = require('test.util').assert 7 | local assert_equals = require('test.util').assert_equals 8 | 9 | local ctx_en 10 | local ctx_ru 11 | local function setup() 12 | ctx_en = context.new({str = 'asdf'}) 13 | ctx_ru = context.new({str = 'фыва'}) 14 | end 15 | 16 | local test_get_char = (function() 17 | setup() 18 | 19 | assert_equals('a', ctx_en:get_char()) 20 | assert_equals('ф', ctx_ru:get_char()) 21 | end)() 22 | 23 | local test_get_charcode = (function() 24 | setup() 25 | 26 | assert_equals(utf8.byte'a', ctx_en:get_charcode()) 27 | assert_equals(utf8.byte'ф', ctx_ru:get_charcode()) 28 | end)() 29 | 30 | local test_next_char = (function() 31 | setup() 32 | 33 | assert_equals(1, ctx_en.pos) 34 | assert_equals(1, ctx_ru.pos) 35 | 36 | ctx_ru:next_char() 37 | ctx_en:next_char() 38 | 39 | assert_equals(2, ctx_en.pos) 40 | assert_equals(2, ctx_ru.pos) 41 | 42 | assert_equals('s', ctx_en:get_char()) 43 | assert_equals('ы', ctx_ru:get_char()) 44 | assert_equals(utf8.byte's', ctx_en:get_charcode()) 45 | assert_equals(utf8.byte'ы', ctx_ru:get_charcode()) 46 | end)() 47 | 48 | local test_clone = (function() 49 | setup() 50 | 51 | local clone = ctx_en:clone() 52 | 53 | assert(getmetatable(clone) == getmetatable(ctx_en)) 54 | assert_equals(clone, ctx_en) 55 | 56 | ctx_en:next_char() 57 | 58 | assert_equals('a', clone:get_char()) 59 | assert_equals('s', ctx_en:get_char()) 60 | 61 | end)() 62 | 63 | local test_last_char = (function() 64 | ctx_en = context.new({str = 'asdf', pos = 4}) 65 | ctx_ru = context.new({str = 'фыва', pos = 4}) 66 | 67 | assert_equals('f', ctx_en:get_char()) 68 | assert_equals('а', ctx_ru:get_char()) 69 | 70 | ctx_ru:next_char() 71 | ctx_en:next_char() 72 | 73 | assert_equals(5, ctx_en.pos) 74 | assert_equals(5, ctx_ru.pos) 75 | 76 | assert_equals("", ctx_en:get_char()) 77 | assert_equals("", ctx_ru:get_char()) 78 | assert_equals(nil, ctx_en:get_charcode()) 79 | assert_equals(nil, ctx_ru:get_charcode()) 80 | end)() 81 | 82 | print('OK') 83 | -------------------------------------------------------------------------------- /include/AL/utf8/regex_parser.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | utf8:require "modifier.compiletime.parser" 4 | utf8:require "charclass.compiletime.parser" 5 | utf8:require "begins.compiletime.parser" 6 | utf8:require "ends.compiletime.parser" 7 | 8 | local gensub = utf8.gensub 9 | local sub = utf8.sub 10 | 11 | local parser_context = utf8:require "context.compiletime" 12 | 13 | return function(regex, plain) 14 | utf8.debug("regex", regex) 15 | local ctx = parser_context:new() 16 | 17 | local skip = {0} 18 | for nbs, c, bs in gensub(regex, 0), skip do 19 | repeat -- continue 20 | skip[1] = 0 21 | 22 | c = utf8.raw.sub(regex, bs, utf8.next(regex, bs) - 1) 23 | 24 | local functions, move = utf8.regex.compiletime.begins.parse(regex, c, bs, ctx) 25 | if functions then 26 | ctx.begins = functions 27 | skip[1] = move 28 | end 29 | if skip[1] ~= 0 then break end 30 | 31 | local functions, move = utf8.regex.compiletime.ends.parse(regex, c, bs, ctx) 32 | if functions then 33 | ctx.ends = functions 34 | skip[1] = move 35 | end 36 | if skip[1] ~= 0 then break end 37 | 38 | local functions, move = utf8.regex.compiletime.modifier.parse(regex, c, bs, ctx) 39 | if functions then 40 | for _, f in ipairs(functions) do 41 | ctx.funcs[#ctx.funcs + 1] = f 42 | end 43 | skip[1] = move 44 | end 45 | if skip[1] ~= 0 then break end 46 | 47 | local charclass, move = utf8.regex.compiletime.charclass.parse(regex, c, bs, ctx) 48 | if charclass then skip[1] = move end 49 | until true -- continue 50 | end 51 | 52 | for _, m in ipairs(utf8.config.modifier) do 53 | if m.check then m.check(ctx) end 54 | end 55 | 56 | local src = [[ 57 | return function(str, init, utf8) 58 | local ctx = utf8:require("context.runtime").new({str = str, pos = init or 1}) 59 | local cl = utf8:require("charclass.runtime.init") 60 | local utf8sub = utf8.sub 61 | local rawsub = utf8.raw.sub 62 | local utf8len = utf8.len 63 | local utf8next = utf8.next 64 | local debug = utf8.debug 65 | local function add(fun) 66 | ctx.functions[#ctx.functions + 1] = fun 67 | end 68 | ]] .. ctx.begins 69 | for _, v in ipairs(ctx.funcs) do src = src .. v end 70 | src = src .. ctx.ends .. [[ 71 | return coroutine.wrap(ctx:get_function())(ctx) 72 | end 73 | ]] 74 | 75 | utf8.debug(regex, src) 76 | 77 | return assert(utf8.config.loadstring(src, (plain and "plain " or "") .. regex))() 78 | end 79 | 80 | end 81 | -------------------------------------------------------------------------------- /include/AL/utf8/test/charclass_runtime.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require("init") 2 | utf8.config = { 3 | debug = nil, --utf8:require("util").debug 4 | } 5 | utf8:init() 6 | 7 | local cl = utf8:require("charclass.runtime.init") 8 | 9 | local equals = require('test.util').equals 10 | local assert = require('test.util').assert 11 | local assert_equals = require('test.util').assert_equals 12 | 13 | assert_equals(true, cl.new() 14 | :with_codes(utf8.byte' ') 15 | :invert() 16 | :in_codes(utf8.byte' ')) 17 | 18 | assert_equals(false, cl.new() 19 | :with_codes(utf8.byte' ') 20 | :invert() 21 | :test(utf8.byte' ')) 22 | 23 | assert_equals(false, cl.new() 24 | :with_codes() 25 | :with_ranges() 26 | :with_classes('space') 27 | :without_classes() 28 | :with_subs() 29 | :invert() 30 | :test(utf8.byte(' '))) 31 | 32 | assert_equals(true, cl.new() 33 | :with_codes() 34 | :with_ranges() 35 | :with_classes() 36 | :without_classes('space') 37 | :with_subs() 38 | :invert() 39 | :test(utf8.byte(' '))) 40 | 41 | assert_equals(false, cl.new() 42 | :with_codes() 43 | :with_ranges() 44 | :with_classes() 45 | :without_classes() 46 | :with_subs(cl.new():with_classes('space')) 47 | :invert() 48 | :test(utf8.byte(' '))) 49 | 50 | assert_equals(true, cl.new() 51 | :with_codes() 52 | :with_ranges() 53 | :with_classes() 54 | :without_classes() 55 | :with_subs(cl.new():with_classes('space'):invert()) 56 | :invert() 57 | :test(utf8.byte(' '))) 58 | 59 | assert_equals(true, cl.new() 60 | :with_codes() 61 | :with_ranges() 62 | :with_classes('punct', 'digit', 'space', 'cntrl') 63 | :without_classes() 64 | :with_subs() 65 | :invert() 66 | :test(utf8.byte'П') 67 | ) 68 | 69 | assert_equals(true, cl.new() 70 | :with_codes() 71 | :with_ranges() 72 | :with_classes('punct', 'digit', 'space', 'cntrl') 73 | :without_classes() 74 | :with_subs() 75 | :invert() 76 | :test(utf8.byte'и') 77 | ) 78 | 79 | assert_equals(true, cl.new() 80 | :with_codes() 81 | :with_ranges() 82 | :with_classes() 83 | :without_classes('space') 84 | :with_subs() 85 | :test(utf8.byte'f') 86 | ) 87 | 88 | assert_equals(false, cl.new() 89 | :with_codes() 90 | :with_ranges() 91 | :with_classes() 92 | :without_classes('space') 93 | :with_subs() 94 | :test(utf8.byte'\n') 95 | ) 96 | 97 | assert_equals(false, cl.new() 98 | :with_codes() 99 | :with_ranges() 100 | :with_classes('lower') 101 | :without_classes() 102 | :with_subs() 103 | :invert() 104 | :test(nil) 105 | ) 106 | 107 | assert_equals(false, cl.new() 108 | :with_codes() 109 | :with_ranges() 110 | :with_classes('lower') 111 | :without_classes() 112 | :with_subs() 113 | :test(nil) 114 | ) 115 | 116 | print "OK" 117 | -------------------------------------------------------------------------------- /include/AL/utf8/context/runtime.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local utf8unicode = utf8.unicode 4 | local utf8sub = utf8.sub 5 | local sub = utf8.raw.sub 6 | local byte = utf8.raw.byte 7 | local utf8len = utf8.len 8 | local utf8next = utf8.next 9 | local rawgsub = utf8.raw.gsub 10 | local utf8offset = utf8.offset 11 | local utf8char = utf8.char 12 | 13 | local util = utf8.util 14 | 15 | local ctx = {} 16 | local mt = { 17 | __index = ctx, 18 | __tostring = function(self) 19 | return rawgsub([[str: '${str}', char: ${pos} '${char}', func: ${func_pos}]], "${(.-)}", { 20 | str = self.str, 21 | pos = self.pos, 22 | char = self:get_char(), 23 | func_pos = self.func_pos, 24 | }) 25 | end 26 | } 27 | 28 | function ctx.new(obj) 29 | obj = obj or {} 30 | local res = setmetatable({ 31 | pos = obj.pos or 1, 32 | byte_pos = obj.pos or 1, 33 | str = assert(obj.str, "str is required"), 34 | len = obj.len, 35 | rawlen = obj.rawlen, 36 | bytes = obj.bytes, 37 | offsets = obj.offsets, 38 | starts = obj.starts or nil, 39 | functions = obj.functions or {}, 40 | func_pos = obj.func_pos or 1, 41 | ends = obj.ends or nil, 42 | result = obj.result and util.copy(obj.result) or {}, 43 | captures = obj.captures and util.copy(obj.captures, true) or {active = {}}, 44 | modified = false, 45 | }, mt) 46 | if not res.bytes then 47 | local str = res.str 48 | local l = #str 49 | local bytes = utf8.config.int32array(l) 50 | local offsets = utf8.config.int32array(l) 51 | local c, bs, i = nil, 1, 1 52 | while bs <= l do 53 | bytes[i] = utf8unicode(str, bs, bs) 54 | offsets[i] = bs 55 | bs = utf8.next(str, bs) 56 | i = i + 1 57 | end 58 | res.bytes = bytes 59 | res.offsets = offsets 60 | res.byte_pos = res.pos 61 | res.len = i 62 | res.rawlen = l 63 | end 64 | 65 | return res 66 | end 67 | 68 | function ctx:clone() 69 | return self:new() 70 | end 71 | 72 | function ctx:next_char() 73 | self.pos = self.pos + 1 74 | self.byte_pos = self.pos 75 | end 76 | 77 | function ctx:prev_char() 78 | self.pos = self.pos - 1 79 | self.byte_pos = self.pos 80 | end 81 | 82 | function ctx:get_char() 83 | if self.len <= self.pos then return "" end 84 | return utf8char(self.bytes[self.pos]) 85 | end 86 | 87 | function ctx:get_charcode() 88 | if self.len <= self.pos then return nil end 89 | return self.bytes[self.pos] 90 | end 91 | 92 | function ctx:next_function() 93 | self.func_pos = self.func_pos + 1 94 | end 95 | 96 | function ctx:get_function() 97 | return self.functions[self.func_pos] 98 | end 99 | 100 | function ctx:done() 101 | utf8.debug('done', self) 102 | coroutine.yield(self, self.result, self.captures) 103 | end 104 | 105 | function ctx:terminate() 106 | utf8.debug('terminate', self) 107 | coroutine.yield(nil) 108 | end 109 | 110 | return ctx 111 | 112 | end 113 | -------------------------------------------------------------------------------- /include/AL/utf8/README.md: -------------------------------------------------------------------------------- 1 | # utf8.lua 2 | pure-lua 5.3 regex library for Lua 5.3, Lua 5.1, LuaJIT 3 | 4 | This library provides simple way to add UTF-8 support into your application. 5 | 6 | #### Example: 7 | ```Lua 8 | local utf8 = require('.utf8'):init() 9 | for k,v in pairs(utf8) do 10 | string[k] = v 11 | end 12 | 13 | local str = "пыщпыщ ололоо я водитель нло" 14 | print(str:find("(.л.+)н")) 15 | -- 8 26 ололоо я водитель 16 | 17 | print(str:gsub("ло+", "보라")) 18 | -- пыщпыщ о보라보라 я водитель н보라 3 19 | 20 | print(str:match("^п[лопыщ ]*я")) 21 | -- пыщпыщ ололоо я 22 | ``` 23 | 24 | #### Usage: 25 | 26 | This library can be used as drop-in replacement for vanilla string library. It exports all vanilla functions under `raw` sub-object. 27 | 28 | ```Lua 29 | local utf8 = require('.utf8'):init() 30 | local str = "пыщпыщ ололоо я водитель нло" 31 | utf8.gsub(str, "ло+", "보라") 32 | -- пыщпыщ о보라보라 я водитель н보라 3 33 | utf8.raw.gsub(str, "ло+", "보라") 34 | -- пыщпыщ о보라보라о я водитель н보라 3 35 | ``` 36 | 37 | It also provides all functions from Lua 5.3 UTF-8 [module](https://www.lua.org/manual/5.3/manual.html#6.5) except `utf8.len (s [, i [, j]])`. If you need to validate your strings use `utf8.validate(str, byte_pos)` or iterate over with `utf8.validator`. 38 | 39 | Please note that library assumes regexes are valid UTF-8 strings, if you need to manipulate individual bytes use vanilla functions under `utf8.raw`. 40 | 41 | 42 | #### Installation: 43 | 44 | Download repository to your project folder. (no rockspecs yet) 45 | 46 | Examples assume library placed under `utf8` subfolder not `utf8.lua`. 47 | 48 | As of Lua 5.3 default `utf8` module has precedence over user-provided. In this case you can specify full module path (`.utf8`). 49 | 50 | #### Configuration: 51 | 52 | Library is highly modular. You can provide your implementation for almost any function used. Library already has several back-ends: 53 | - [Runtime character class processing](charclass/runtime/init.lua) using hardcoded codepoint ranges or using native functions through `ffi`. 54 | - [Basic functions](primitives/init.lua) for working with UTF-8 characters have specializations for `ffi`-enabled runtime and for tarantool. 55 | 56 | Probably most interesting [customizations](init.lua) are `utf8.config.loadstring` and `utf8.config.cache` if you want to precompile your regexes. 57 | 58 | ```Lua 59 | local utf8 = require('.utf8') 60 | utf8.config = { 61 | cache = my_smart_cache, 62 | } 63 | utf8:init() 64 | ``` 65 | 66 | For `lower` and `upper` functions to work in environments where `ffi` cannot be used, you can specify substitution tables ([data example](https://github.com/artemshein/luv/blob/master/utf8data.lua)) 67 | 68 | ```Lua 69 | local utf8 = require('.utf8') 70 | utf8.config = { 71 | conversion = { 72 | uc_lc = utf8_uc_lc, 73 | lc_uc = utf8_lc_uc 74 | }, 75 | } 76 | utf8:init() 77 | ``` 78 | Customization is done before initialization. If you want, you can change configuration after `init`, it might work for everything but modules. All of them should be reloaded. 79 | 80 | #### [Documentation:](test/test.lua) 81 | 82 | #### Issue reporting: 83 | 84 | Please provide example script that causes error together with environment description and debug output. Debug output can be obtained like: 85 | ```Lua 86 | local utf8 = require('.utf8') 87 | utf8.config = { 88 | debug = utf8:require("util").debug 89 | } 90 | utf8:init() 91 | -- your code 92 | ``` 93 | Default logger used is [`io.write`](https://www.lua.org/manual/5.3/manual.html#pdf-io.write) and can be changed by specifying `logger = my_logger` in configuration 94 | -------------------------------------------------------------------------------- /include/AL/utf8/charclass/compiletime/builder.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local byte = utf8.byte 4 | local unpack = utf8.config.unpack 5 | 6 | local builder = {} 7 | local mt = {__index = builder} 8 | 9 | utf8.regex.compiletime.charclass.builder = builder 10 | 11 | function builder.new() 12 | return setmetatable({}, mt) 13 | end 14 | 15 | function builder:invert() 16 | self.inverted = true 17 | return self 18 | end 19 | 20 | function builder:internal() -- is it enclosed in [] 21 | self.internal = true 22 | return self 23 | end 24 | 25 | function builder:with_codes(...) 26 | local codes = {...} 27 | self.codes = self.codes or {} 28 | 29 | for _, v in ipairs(codes) do 30 | table.insert(self.codes, type(v) == "number" and v or byte(v)) 31 | end 32 | 33 | table.sort(self.codes) 34 | return self 35 | end 36 | 37 | function builder:with_ranges(...) 38 | local ranges = {...} 39 | self.ranges = self.ranges or {} 40 | 41 | for _, v in ipairs(ranges) do 42 | table.insert(self.ranges, v) 43 | end 44 | 45 | return self 46 | end 47 | 48 | function builder:with_classes(...) 49 | local classes = {...} 50 | self.classes = self.classes or {} 51 | 52 | for _, v in ipairs(classes) do 53 | table.insert(self.classes, v) 54 | end 55 | 56 | return self 57 | end 58 | 59 | function builder:without_classes(...) 60 | local not_classes = {...} 61 | self.not_classes = self.not_classes or {} 62 | 63 | for _, v in ipairs(not_classes) do 64 | table.insert(self.not_classes, v) 65 | end 66 | 67 | return self 68 | end 69 | 70 | function builder:include(b) 71 | if not b.inverted then 72 | if b.codes then 73 | self:with_codes(unpack(b.codes)) 74 | end 75 | if b.ranges then 76 | self:with_ranges(unpack(b.ranges)) 77 | end 78 | if b.classes then 79 | self:with_classes(unpack(b.classes)) 80 | end 81 | if b.not_classes then 82 | self:without_classes(unpack(b.not_classes)) 83 | end 84 | else 85 | self.includes = self.includes or {} 86 | self.includes[#self.includes + 1] = b 87 | end 88 | return self 89 | end 90 | 91 | function builder:build() 92 | if self.codes and #self.codes == 1 and not self.inverted and not self.ranges and not self.classes and not self.not_classes and not self.includes then 93 | return "{test = function(self, cc) return cc == " .. self.codes[1] .. " end}" 94 | else 95 | local codes_list = table.concat(self.codes or {}, ', ') 96 | local ranges_list = '' 97 | for i, r in ipairs(self.ranges or {}) do ranges_list = ranges_list .. (i > 1 and ', {' or '{') .. tostring(r[1]) .. ', ' .. tostring(r[2]) .. '}' end 98 | local classes_list = '' 99 | if self.classes then classes_list = "'" .. table.concat(self.classes, "', '") .. "'" end 100 | local not_classes_list = '' 101 | if self.not_classes then not_classes_list = "'" .. table.concat(self.not_classes, "', '") .. "'" end 102 | 103 | local subs_list = '' 104 | for i, r in ipairs(self.includes or {}) do subs_list = subs_list .. (i > 1 and ', ' or '') .. r:build() .. '' end 105 | 106 | local src = [[cl.new():with_codes( 107 | ]] .. codes_list .. [[ 108 | ):with_ranges( 109 | ]] .. ranges_list .. [[ 110 | ):with_classes( 111 | ]] .. classes_list .. [[ 112 | ):without_classes( 113 | ]] .. not_classes_list .. [[ 114 | ):with_subs( 115 | ]] .. subs_list .. [[ 116 | )]] 117 | 118 | if self.inverted then 119 | src = src .. ':invert()' 120 | end 121 | 122 | return src 123 | end 124 | end 125 | 126 | return builder 127 | 128 | end 129 | -------------------------------------------------------------------------------- /include/AL/utf8/test/test_compat.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require 'init' 2 | utf8.config = { 3 | debug = nil, --utf8:require("util").debug 4 | } 5 | utf8:init() 6 | print('testing utf8 library') 7 | 8 | local LUA_51, LUA_53 = false, false 9 | if "\xe4" == "xe4" then -- lua5.1 10 | LUA_51 = true 11 | else -- luajit lua5.3 12 | LUA_53 = true 13 | end 14 | 15 | assert(utf8.sub("123456789",2,4) == "234") 16 | assert(utf8.sub("123456789",7) == "789") 17 | assert(utf8.sub("123456789",7,6) == "") 18 | assert(utf8.sub("123456789",7,7) == "7") 19 | assert(utf8.sub("123456789",0,0) == "") 20 | assert(utf8.sub("123456789",-10,10) == "123456789") 21 | assert(utf8.sub("123456789",1,9) == "123456789") 22 | assert(utf8.sub("123456789",-10,-20) == "") 23 | assert(utf8.sub("123456789",-1) == "9") 24 | assert(utf8.sub("123456789",-4) == "6789") 25 | assert(utf8.sub("123456789",-6, -4) == "456") 26 | if not _no32 then 27 | assert(utf8.sub("123456789",-2^31, -4) == "123456") 28 | assert(utf8.sub("123456789",-2^31, 2^31 - 1) == "123456789") 29 | assert(utf8.sub("123456789",-2^31, -2^31) == "") 30 | end 31 | assert(utf8.sub("\000123456789",3,5) == "234") 32 | assert(utf8.sub("\000123456789", 8) == "789") 33 | print('+') 34 | 35 | assert(utf8.find("123456789", "345") == 3) 36 | local a,b = utf8.find("123456789", "345") 37 | assert(utf8.sub("123456789", a, b) == "345") 38 | assert(utf8.find("1234567890123456789", "345", 3) == 3) 39 | assert(utf8.find("1234567890123456789", "345", 4) == 13) 40 | assert(utf8.find("1234567890123456789", "346", 4) == nil) 41 | assert(utf8.find("1234567890123456789", ".45", -9) == 13) 42 | assert(utf8.find("abcdefg", "\0", 5, 1) == nil) 43 | assert(utf8.find("", "") == 1) 44 | assert(utf8.find("", "", 1) == 1) 45 | assert(not utf8.find("", "", 2)) 46 | assert(utf8.find('', 'aaa', 1) == nil) 47 | assert(('alo(.)alo'):find('(.)', 1, 1) == 4) 48 | print('+') 49 | 50 | assert(utf8.len("") == 0) 51 | assert(utf8.len("\0\0\0") == 3) 52 | assert(utf8.len("1234567890") == 10) 53 | 54 | assert(utf8.byte("a") == 97) 55 | if LUA_51 then 56 | assert(utf8.byte("�") > 127) 57 | else 58 | assert(utf8.byte("\xe4") > 127) 59 | end 60 | assert(utf8.byte(utf8.char(255)) == 255) 61 | assert(utf8.byte(utf8.char(0)) == 0) 62 | assert(utf8.byte("\0") == 0) 63 | assert(utf8.byte("\0\0alo\0x", -1) == string.byte('x')) 64 | assert(utf8.byte("ba", 2) == 97) 65 | assert(utf8.byte("\n\n", 2, -1) == 10) 66 | assert(utf8.byte("\n\n", 2, 2) == 10) 67 | assert(utf8.byte("") == nil) 68 | assert(utf8.byte("hi", -3) == nil) 69 | assert(utf8.byte("hi", 3) == nil) 70 | assert(utf8.byte("hi", 9, 10) == nil) 71 | assert(utf8.byte("hi", 2, 1) == nil) 72 | assert(utf8.char() == "") 73 | if LUA_53 then 74 | assert(utf8.raw.char(0, 255, 0) == "\0\255\0") -- fails due 255 can't be utf8 byte 75 | assert(utf8.char(0, 255, 0) == "\0\195\191\0") 76 | assert(utf8.raw.char(0, utf8.byte("\xe4"), 0) == "\0\xe4\0") 77 | assert(utf8.char(0, utf8.byte("\xe4"), 0) == "\0\195\164\0") 78 | assert(utf8.raw.char(utf8.raw.byte("\xe4l\0�u", 1, -1)) == "\xe4l\0�u") 79 | assert(utf8.raw.char(utf8.raw.byte("\xe4l\0�u", 1, -1)) == "\xe4l\0�u") 80 | assert(utf8.raw.char(utf8.raw.byte("\xe4l\0�u", 1, 0)) == "") 81 | assert(utf8.raw.char(utf8.raw.byte("\xe4l\0�u", -10, 100)) == "\xe4l\0�u") 82 | end 83 | 84 | assert(utf8.upper("ab\0c") == "AB\0C") 85 | assert(utf8.lower("\0ABCc%$") == "\0abcc%$") 86 | assert(utf8.rep('teste', 0) == '') 87 | assert(utf8.rep('t�s\00t�', 2) == 't�s\0t�t�s\000t�') 88 | assert(utf8.rep('', 10) == '') 89 | print('+') 90 | 91 | assert(utf8.upper("ab\0c") == "AB\0C") 92 | assert(utf8.lower("\0ABCc%$") == "\0abcc%$") 93 | 94 | assert(utf8.reverse"" == "") 95 | assert(utf8.reverse"\0\1\2\3" == "\3\2\1\0") 96 | assert(utf8.reverse"\0001234" == "4321\0") 97 | 98 | for i=0,30 do assert(utf8.len(string.rep('a', i)) == i) end 99 | 100 | print('+') 101 | 102 | do 103 | local f = utf8.gmatch("1 2 3 4 5", "%d+") 104 | assert(f() == "1") 105 | local co = coroutine.wrap(f) 106 | assert(co() == "2") 107 | end 108 | 109 | print('OK') 110 | -------------------------------------------------------------------------------- /include/AL/utf8/charclass/runtime/base.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local class = {} 4 | local mt = {__index = class} 5 | 6 | local utf8gensub = utf8.gensub 7 | 8 | function class.new() 9 | return setmetatable({}, mt) 10 | end 11 | 12 | function class:invert() 13 | self.inverted = true 14 | return self 15 | end 16 | 17 | function class:with_codes(...) 18 | local codes = {...} 19 | self.codes = self.codes or {} 20 | 21 | for _, v in ipairs(codes) do 22 | table.insert(self.codes, v) 23 | end 24 | 25 | table.sort(self.codes) 26 | return self 27 | end 28 | 29 | function class:with_ranges(...) 30 | local ranges = {...} 31 | self.ranges = self.ranges or {} 32 | 33 | for _, v in ipairs(ranges) do 34 | table.insert(self.ranges, v) 35 | end 36 | 37 | return self 38 | end 39 | 40 | function class:with_classes(...) 41 | local classes = {...} 42 | self.classes = self.classes or {} 43 | 44 | for _, v in ipairs(classes) do 45 | table.insert(self.classes, v) 46 | end 47 | 48 | return self 49 | end 50 | 51 | function class:without_classes(...) 52 | local not_classes = {...} 53 | self.not_classes = self.not_classes or {} 54 | 55 | for _, v in ipairs(not_classes) do 56 | table.insert(self.not_classes, v) 57 | end 58 | 59 | return self 60 | end 61 | 62 | function class:with_subs(...) 63 | local subs = {...} 64 | self.subs = self.subs or {} 65 | 66 | for _, v in ipairs(subs) do 67 | table.insert(self.subs, v) 68 | end 69 | 70 | return self 71 | end 72 | 73 | function class:in_codes(item) 74 | if not self.codes or #self.codes == 0 then return nil end 75 | 76 | local head, tail = 1, #self.codes 77 | local mid = math.floor((head + tail)/2) 78 | while (tail - head) > 1 do 79 | if self.codes[mid] > item then 80 | tail = mid 81 | else 82 | head = mid 83 | end 84 | mid = math.floor((head + tail)/2) 85 | end 86 | if self.codes[head] == item then 87 | return true, head 88 | elseif self.codes[tail] == item then 89 | return true, tail 90 | else 91 | return false 92 | end 93 | end 94 | 95 | function class:in_ranges(char_code) 96 | if not self.ranges or #self.ranges == 0 then return nil end 97 | 98 | for _,r in ipairs(self.ranges) do 99 | if r[1] <= char_code and char_code <= r[2] then 100 | return true 101 | end 102 | end 103 | return false 104 | end 105 | 106 | function class:in_classes(char_code) 107 | if not self.classes or #self.classes == 0 then return nil end 108 | 109 | for _, class in ipairs(self.classes) do 110 | if self:is(class, char_code) then 111 | return true 112 | end 113 | end 114 | return false 115 | end 116 | 117 | function class:in_not_classes(char_code) 118 | if not self.not_classes or #self.not_classes == 0 then return nil end 119 | 120 | for _, class in ipairs(self.not_classes) do 121 | if self:is(class, char_code) then 122 | return true 123 | end 124 | end 125 | return false 126 | end 127 | 128 | function class:is(class, char_code) 129 | error("not implemented") 130 | end 131 | 132 | function class:in_subs(char_code) 133 | if not self.subs or #self.subs == 0 then return nil end 134 | 135 | for _, c in ipairs(self.subs) do 136 | if not c:test(char_code) then 137 | return false 138 | end 139 | end 140 | return true 141 | end 142 | 143 | function class:test(char_code) 144 | local result = self:do_test(char_code) 145 | -- utf8.debug('class:test', result, "'" .. (char_code and utf8.char(char_code) or 'nil') .. "'", char_code) 146 | return result 147 | end 148 | 149 | function class:do_test(char_code) 150 | if not char_code then return false end 151 | local in_not_classes = self:in_not_classes(char_code) 152 | if in_not_classes then 153 | return not not self.inverted 154 | end 155 | local in_codes = self:in_codes(char_code) 156 | if in_codes then 157 | return not self.inverted 158 | end 159 | local in_ranges = self:in_ranges(char_code) 160 | if in_ranges then 161 | return not self.inverted 162 | end 163 | local in_classes = self:in_classes(char_code) 164 | if in_classes then 165 | return not self.inverted 166 | end 167 | local in_subs = self:in_subs(char_code) 168 | if in_subs then 169 | return not self.inverted 170 | end 171 | if (in_codes == nil) 172 | and (in_ranges == nil) 173 | and (in_classes == nil) 174 | and (in_subs == nil) 175 | and (in_not_classes == false) then 176 | return not self.inverted 177 | else 178 | return not not self.inverted 179 | end 180 | end 181 | 182 | return class 183 | 184 | end 185 | -------------------------------------------------------------------------------- /include/AL/utf8/functions/lua53.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local utf8sub = utf8.sub 4 | local utf8gensub = utf8.gensub 5 | local unpack = utf8.config.unpack 6 | local generate_matcher_function = utf8:require 'regex_parser' 7 | 8 | local 9 | function get_matcher_function(regex, plain) 10 | local res 11 | if utf8.config.cache then 12 | res = utf8.config.cache[plain and "plain" or "regex"][regex] 13 | end 14 | if res then 15 | return res 16 | end 17 | res = generate_matcher_function(regex, plain) 18 | if utf8.config.cache then 19 | utf8.config.cache[plain and "plain" or "regex"][regex] = res 20 | end 21 | return res 22 | end 23 | 24 | local function utf8find(str, regex, init, plain) 25 | local func = get_matcher_function(regex, plain) 26 | init = ((init or 1) < 0) and (utf8.len(str) + init + 1) or init 27 | local ctx, result, captures = func(str, init, utf8) 28 | if not ctx then return nil end 29 | 30 | utf8.debug('ctx:', ctx) 31 | utf8.debug('result:', result) 32 | utf8.debug('captures:', captures) 33 | 34 | return result.start, result.finish, unpack(captures) 35 | end 36 | 37 | local function utf8match(str, regex, init) 38 | local func = get_matcher_function(regex, false) 39 | init = ((init or 1) < 0) and (utf8.len(str) + init + 1) or init 40 | local ctx, result, captures = func(str, init, utf8) 41 | if not ctx then return nil end 42 | 43 | utf8.debug('ctx:', ctx) 44 | utf8.debug('result:', result) 45 | utf8.debug('captures:', captures) 46 | 47 | if #captures > 0 then return unpack(captures) end 48 | 49 | return utf8sub(str, result.start, result.finish) 50 | end 51 | 52 | local function utf8gmatch(str, regex) 53 | regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex 54 | local func = get_matcher_function(regex, false) 55 | local ctx, result, captures 56 | local continue_pos = 1 57 | 58 | return function() 59 | ctx, result, captures = func(str, continue_pos, utf8) 60 | 61 | if not ctx then return nil end 62 | 63 | utf8.debug('ctx:', ctx) 64 | utf8.debug('result:', result) 65 | utf8.debug('captures:', captures) 66 | 67 | continue_pos = math.max(result.finish + 1, result.start + 1) 68 | if #captures > 0 then 69 | return unpack(captures) 70 | else 71 | return utf8sub(str, result.start, result.finish) 72 | end 73 | end 74 | end 75 | 76 | local function replace(repl, args) 77 | local ret = '' 78 | if type(repl) == 'string' then 79 | local ignore = false 80 | local num 81 | for _, c in utf8gensub(repl) do 82 | if not ignore then 83 | if c == '%' then 84 | ignore = true 85 | else 86 | ret = ret .. c 87 | end 88 | else 89 | num = tonumber(c) 90 | if num then 91 | ret = ret .. assert(args[num], "invalid capture index %" .. c) 92 | else 93 | ret = ret .. c 94 | end 95 | ignore = false 96 | end 97 | end 98 | elseif type(repl) == 'table' then 99 | ret = repl[args[1]] or args[0] 100 | elseif type(repl) == 'function' then 101 | ret = repl(unpack(args, 1)) or args[0] 102 | end 103 | return ret 104 | end 105 | 106 | local function utf8gsub(str, regex, repl, limit) 107 | limit = limit or -1 108 | local subbed = '' 109 | local prev_sub_finish = 1 110 | 111 | local func = get_matcher_function(regex, false) 112 | local ctx, result, captures 113 | local continue_pos = 1 114 | 115 | local n = 0 116 | while limit ~= n do 117 | ctx, result, captures = func(str, continue_pos, utf8) 118 | if not ctx then break end 119 | 120 | utf8.debug('ctx:', ctx) 121 | utf8.debug('result:', result) 122 | utf8.debug('result:', utf8sub(str, result.start, result.finish)) 123 | utf8.debug('captures:', captures) 124 | 125 | continue_pos = math.max(result.finish + 1, result.start + 1) 126 | local args 127 | if #captures > 0 then 128 | args = {[0] = utf8sub(str, result.start, result.finish), unpack(captures)} 129 | else 130 | args = {[0] = utf8sub(str, result.start, result.finish)} 131 | args[1] = args[0] 132 | end 133 | 134 | subbed = subbed .. utf8sub(str, prev_sub_finish, result.start - 1) 135 | subbed = subbed .. replace(repl, args) 136 | prev_sub_finish = result.finish + 1 137 | n = n + 1 138 | 139 | end 140 | 141 | return subbed .. utf8sub(str, prev_sub_finish), n 142 | end 143 | 144 | -- attaching high-level functions 145 | utf8.find = utf8find 146 | utf8.match = utf8match 147 | utf8.gmatch = utf8gmatch 148 | utf8.gsub = utf8gsub 149 | 150 | return utf8 151 | 152 | end 153 | -------------------------------------------------------------------------------- /include/AL/utf8/charclass/compiletime/vanilla.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local cl = utf8:require "charclass.compiletime.builder" 4 | 5 | local next = utf8.util.next 6 | 7 | local token = 1 8 | 9 | local function parse(str, c, bs, ctx) 10 | local tttt = token 11 | token = token + 1 12 | 13 | local class 14 | local nbs = bs 15 | utf8.debug("cc_parse", tttt, str, c, nbs, next(str, nbs)) 16 | 17 | if c == '%' then 18 | c, nbs = next(str, bs) 19 | if c == '' then 20 | error("malformed pattern (ends with '%')") 21 | end 22 | local _c = utf8.raw.lower(c) 23 | local matched 24 | if _c == 'a' then 25 | matched = ('alpha') 26 | elseif _c == 'c' then 27 | matched = ('cntrl') 28 | elseif _c == 'd' then 29 | matched = ('digit') 30 | elseif _c == 'g' then 31 | matched = ('graph') 32 | elseif _c == 'l' then 33 | matched = ('lower') 34 | elseif _c == 'p' then 35 | matched = ('punct') 36 | elseif _c == 's' then 37 | matched = ('space') 38 | elseif _c == 'u' then 39 | matched = ('upper') 40 | elseif _c == 'w' then 41 | matched = ('alnum') 42 | elseif _c == 'x' then 43 | matched = ('xdigit') 44 | end 45 | 46 | if matched then 47 | if _c ~= c then 48 | class = cl.new():without_classes(matched) 49 | else 50 | class = cl.new():with_classes(matched) 51 | end 52 | elseif _c == 'z' then 53 | class = cl.new():with_codes(0) 54 | if _c ~= c then 55 | class = class:invert() 56 | end 57 | else 58 | class = cl.new():with_codes(c) 59 | end 60 | elseif c == '[' and not ctx.internal then 61 | local old_internal = ctx.internal 62 | ctx.internal = true 63 | class = cl.new() 64 | local firstletter = true 65 | while true do 66 | local prev_nbs = nbs 67 | c, nbs = next(str, nbs) 68 | utf8.debug("next", tttt, c, nbs) 69 | if c == '^' and firstletter then 70 | class:invert() 71 | local nc, nnbs = next(str, nbs) 72 | if nc == ']' then 73 | class:with_codes(nc) 74 | nbs = nnbs 75 | end 76 | elseif c == ']' then 77 | if firstletter then 78 | class:with_codes(c) 79 | else 80 | utf8.debug('] on pos', tttt, nbs) 81 | break 82 | end 83 | elseif c == '' then 84 | error "malformed pattern (missing ']')" 85 | else 86 | local sub_class, skip = utf8.regex.compiletime.charclass.parse(str, c, nbs, ctx) 87 | nbs = prev_nbs + skip 88 | utf8.debug("include", tttt, bs, prev_nbs, nbs, skip) 89 | class:include(sub_class) 90 | end 91 | firstletter = false 92 | end 93 | ctx.internal = old_internal 94 | elseif c == '.' then 95 | if not ctx.internal then 96 | class = cl.new():invert() 97 | else 98 | class = cl.new():with_codes(c) 99 | end 100 | end 101 | 102 | return class, utf8.next(str, nbs) - bs 103 | end 104 | 105 | return parse 106 | 107 | end 108 | 109 | --[[ 110 | x: (where x is not one of the magic characters ^$()%.[]*+-?) represents the character x itself. 111 | .: (a dot) represents all characters. 112 | %a: represents all letters. 113 | %c: represents all control characters. 114 | %d: represents all digits. 115 | %g: represents all printable characters except space. 116 | %l: represents all lowercase letters. 117 | %p: represents all punctuation characters. 118 | %s: represents all space characters. 119 | %u: represents all uppercase letters. 120 | %w: represents all alphanumeric characters. 121 | %x: represents all hexadecimal digits. 122 | %x: (where x is any non-alphanumeric character) represents the character x. This is the standard way to escape the magic characters. Any non-alphanumeric character (including all punctuation characters, even the non-magical) can be preceded by a '%' when used to represent itself in a pattern. 123 | [set]: represents the class which is the union of all characters in set. A range of characters can be specified by separating the end characters of the range, in ascending order, with a '-'. All classes %x described above can also be used as components in set. All other characters in set represent themselves. For example, [%w_] (or [_%w]) represents all alphanumeric characters plus the underscore, [0-7] represents the octal digits, and [0-7%l%-] represents the octal digits plus the lowercase letters plus the '-' character. 124 | 125 | You can put a closing square bracket in a set by positioning it as the first character in the set. You can put a hyphen in a set by positioning it as the first or the last character in the set. (You can also use an escape for both cases.) 126 | 127 | The interaction between ranges and classes is not defined. Therefore, patterns like [%a-z] or [a-%%] have no meaning. 128 | [^set]: represents the complement of set, where set is interpreted as above. 129 | 130 | For all classes represented by single letters (%a, %c, etc.), the corresponding uppercase letter represents the complement of the class. For instance, %S represents all non-space characters. 131 | ]] 132 | -------------------------------------------------------------------------------- /include/AL/utf8/test/charclass_compiletime.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require "init" 2 | utf8.config = { 3 | debug = nil, 4 | -- debug = utf8:require("util").debug, 5 | } 6 | utf8:init() 7 | 8 | local ctx = utf8:require("context.compiletime"):new() 9 | 10 | local equals = require 'test.util'.equals 11 | local assert = require 'test.util'.assert 12 | local assert_equals = require 'test.util'.assert_equals 13 | local parse = utf8.regex.compiletime.charclass.parse 14 | 15 | assert_equals({parse("aabb", "a", 1, ctx)}, {{codes = {utf8.byte("a")}}, 1}) 16 | assert_equals({parse("aabb", "a", 2, ctx)}, {{codes = {utf8.byte("a")}}, 1}) 17 | assert_equals({parse("aabb", "b", 3, ctx)}, {{codes = {utf8.byte("b")}}, 1}) 18 | assert_equals({parse("aabb", "b", 4, ctx)}, {{codes = {utf8.byte("b")}}, 1}) 19 | 20 | assert_equals({parse("aa%ab", "%", 3, ctx)}, {{classes = {'alpha'}}, 2}) 21 | assert_equals({parse("aac%Ab", "%", 4, ctx)}, {{not_classes = {'alpha'}}, 2}) 22 | assert_equals({parse("aa.b", ".", 3, ctx)}, {{inverted = true}, 1}) 23 | 24 | assert_equals({parse("aa[c]b", "[", 3, ctx)}, { 25 | {codes = {utf8.byte("c")}, ranges = nil, classes = nil, not_classes = nil}, 26 | utf8.raw.len("[c]") 27 | }) 28 | 29 | assert_equals({parse("aa[%A]b", "[", 3, ctx)}, { 30 | {codes = nil, ranges = nil, classes = nil, not_classes = {'alpha'}}, 31 | utf8.raw.len("[%A]") 32 | }) 33 | 34 | assert_equals({parse("[^%p%d%s%c]+", "[", 1, ctx)}, { 35 | {codes = nil, ranges = nil, classes = {'punct', 'digit', 'space', 'cntrl'}, not_classes = nil, inverted = true}, 36 | utf8.raw.len("[^%p%d%s%c]") 37 | }) 38 | 39 | assert_equals({parse("aa[[c]]b", "[", 3, ctx)}, { 40 | {codes = {utf8.byte("["), utf8.byte("c")}, ranges = nil, classes = nil, not_classes = nil}, 41 | utf8.raw.len("[[c]") 42 | }) 43 | 44 | assert_equals({parse("aa[%a[c]]b", "[", 3, ctx)}, { 45 | {codes = {utf8.byte("["), utf8.byte("c")}, ranges = nil, classes = {'alpha'}, not_classes = nil}, 46 | utf8.raw.len("[%a[c]") 47 | }) 48 | 49 | assert_equals({parse("aac-db", "c", 3, ctx)}, { 50 | {codes = {utf8.byte("c")}}, 51 | utf8.raw.len("c") 52 | }) 53 | 54 | assert_equals({parse("aa[c-d]b", "[", 3, ctx)}, { 55 | {codes = nil, ranges = {{utf8.byte("c"),utf8.byte("d")}}, classes = nil, not_classes = nil}, 56 | utf8.raw.len("[c-d]") 57 | }) 58 | assert_equals(ctx.internal, false) 59 | 60 | assert_equals({parse("aa[c-]]b", "[", 3, ctx)}, { 61 | {codes = {utf8.byte("-"), utf8.byte("c")}, ranges = nil, classes = nil, not_classes = nil}, 62 | utf8.raw.len("[c-]") 63 | }) 64 | assert_equals(ctx.internal, false) 65 | 66 | assert_equals({parse("aad-", "d", 3, ctx)}, { 67 | {codes = {utf8.byte("d")}}, 68 | utf8.raw.len("d") 69 | }) 70 | assert_equals(ctx.internal, false) 71 | 72 | ctx.internal = false 73 | assert_equals({parse(".", ".", 1, ctx)}, { 74 | {inverted = true}, 75 | utf8.raw.len(".") 76 | }) 77 | 78 | assert_equals({parse("[.]", "[", 1, ctx)}, { 79 | {codes = {utf8.byte(".")}}, 80 | utf8.raw.len("[.]") 81 | }) 82 | 83 | assert_equals({parse("%?", "%", 1, ctx)}, { 84 | {codes = {utf8.byte("?")}}, 85 | utf8.raw.len("%?") 86 | }) 87 | 88 | assert_equals({parse("[]]", "[", 1, ctx)}, { 89 | {codes = {utf8.byte("]")}}, 90 | utf8.raw.len("[]]") 91 | }) 92 | 93 | assert_equals({parse("[^]]", "[", 1, ctx)}, { 94 | {codes = {utf8.byte("]")}, inverted = true}, 95 | utf8.raw.len("[^]]") 96 | }) 97 | 98 | --[[-- 99 | multibyte chars 100 | --]]-- 101 | 102 | assert_equals({parse("ббюю", "б", #"" + 1, ctx)}, {{codes = {utf8.byte("б")}}, utf8.raw.len("б")}) 103 | assert_equals({parse("ббюю", "б", #"б" + 1, ctx)}, {{codes = {utf8.byte("б")}}, utf8.raw.len("б")}) 104 | assert_equals({parse("ббюю", "ю", #"бб" + 1, ctx)}, {{codes = {utf8.byte("ю")}}, utf8.raw.len("ю")}) 105 | assert_equals({parse("ббюю", "ю", #"ббю" + 1, ctx)}, {{codes = {utf8.byte("ю")}}, utf8.raw.len("ю")}) 106 | 107 | assert_equals({parse("бб%aю", "%", #"бб" + 1, ctx)}, {{classes = {'alpha'}}, 2}) 108 | assert_equals({parse("ббц%Aю", "%", #"ббц" + 1, ctx)}, {{not_classes = {'alpha'}}, 2}) 109 | assert_equals({parse("бб.ю", ".", #"бб" + 1, ctx)}, {{inverted = true}, 1}) 110 | 111 | assert_equals({parse("бб[ц]ю", "[", #"бб" + 1, ctx)}, { 112 | {codes = {utf8.byte("ц")}, ranges = nil, classes = nil, not_classes = nil}, 113 | utf8.raw.len("[ц]") 114 | }) 115 | 116 | assert_equals({parse("бб[%A]ю", "[", #"бб" + 1, ctx)}, { 117 | {codes = nil, ranges = nil, classes = nil, not_classes = {'alpha'}}, 118 | utf8.raw.len("[%A]") 119 | }) 120 | 121 | assert_equals({parse("бб[[ц]]ю", "[", #"бб" + 1, ctx)}, { 122 | {codes = {utf8.byte("["), utf8.byte("ц")}, ranges = nil, classes = nil, not_classes = nil}, 123 | utf8.raw.len("[[ц]") 124 | }) 125 | 126 | assert_equals({parse("бб[%a[ц]]ю", "[", #"бб" + 1, ctx)}, { 127 | {codes = {utf8.byte("["), utf8.byte("ц")}, ranges = nil, classes = {'alpha'}, not_classes = nil}, 128 | utf8.raw.len("[%a[ц]") 129 | }) 130 | 131 | ctx.internal = true 132 | assert_equals({parse("ббц-ыю", "ц", #"бб" + 1, ctx)}, { 133 | {ranges = {{utf8.byte("ц"),utf8.byte("ы")}}}, 134 | utf8.raw.len("ц-ы") 135 | }) 136 | 137 | ctx.internal = false 138 | assert_equals({parse("бб[ц-ы]ю", "[", #"бб" + 1, ctx)}, { 139 | {codes = nil, ranges = {{utf8.byte("ц"),utf8.byte("ы")}}, classes = nil, not_classes = nil}, 140 | utf8.raw.len("[ц-ы]") 141 | }) 142 | 143 | assert_equals({parse("бб[ц-]]ю", "[", #"бб" + 1, ctx)}, { 144 | {codes = {utf8.byte("-"), utf8.byte("ц")}, ranges = nil, classes = nil, not_classes = nil}, 145 | utf8.raw.len("[ц-]") 146 | }) 147 | 148 | assert_equals({parse("ббы-", "ы", #"бб" + 1, ctx)}, { 149 | {codes = {utf8.byte("ы")}}, 150 | utf8.raw.len("ы") 151 | }) 152 | 153 | ctx.internal = true 154 | assert_equals({parse("ббы-цю", "ы", #"бб" + 1, ctx)}, { 155 | {ranges = {{utf8.byte("ы"),utf8.byte("ц")}}}, 156 | utf8.raw.len("ы-ц") 157 | }) 158 | 159 | ctx.internal = false 160 | assert_equals({parse("бб[ы]ю", "[", #"бб" + 1, ctx)}, { 161 | {codes = {utf8.byte("ы")}, ranges = nil, classes = nil, not_classes = nil}, 162 | utf8.raw.len("[ы]") 163 | }) 164 | 165 | print "OK" 166 | -------------------------------------------------------------------------------- /include/AL/utf8/test/test.lua: -------------------------------------------------------------------------------- 1 | local utf8 = require('init') 2 | utf8.config = { 3 | debug = nil, 4 | -- debug = utf8:require("util").debug, 5 | } 6 | utf8:init() 7 | 8 | for k,v in pairs(utf8) do 9 | string[k] = v 10 | end 11 | 12 | local LUA_51, LUA_53 = false, false 13 | if "\xe4" == "xe4" then -- lua5.1 14 | LUA_51 = true 15 | else -- luajit lua5.3 16 | LUA_53 = true 17 | end 18 | 19 | local FFI_ENABLED = false 20 | if pcall(require, "ffi") then 21 | FFI_ENABLED = true 22 | end 23 | 24 | local res = {} 25 | 26 | local equals = require 'test.util'.equals 27 | local assert = require 'test.util'.assert 28 | local assert_equals = require 'test.util'.assert_equals 29 | 30 | if FFI_ENABLED then 31 | assert_equals(("АБВ"):lower(), "абв") 32 | assert_equals(("абв"):upper(), "АБВ") 33 | end 34 | 35 | res = {} 36 | for _, w in ("123456789"):gensub(2), {1} do res[#res + 1] = w end 37 | assert_equals({"23", "56", "89"}, res) 38 | 39 | assert_equals(0, ("фыва"):next(0)) 40 | assert_equals(100, ("фыва"):next(100)) 41 | assert_equals(#"ф" + 1, ("фыва"):next(1)) 42 | assert_equals("ыва", utf8.raw.sub("фыва", ("фыва"):next(1))) 43 | 44 | res = {} 45 | for p, c in ("абвгд"):codes() do res[#res + 1] = {p, c} end 46 | assert_equals({ 47 | {1, utf8.byte'а'}, 48 | {#'а' + 1, utf8.byte'б'}, 49 | {#'аб' + 1, utf8.byte'в'}, 50 | {#'абв' + 1, utf8.byte'г'}, 51 | {#'абвг' + 1, utf8.byte'д'}, 52 | }, res) 53 | 54 | assert_equals(1, utf8.offset('abcde', 0)) 55 | 56 | assert_equals(1, utf8.offset('abcde', 1)) 57 | assert_equals(5, utf8.offset('abcde', 5)) 58 | assert_equals(6, utf8.offset('abcde', 6)) 59 | assert_equals(nil, utf8.offset('abcde', 7)) 60 | 61 | assert_equals(5, utf8.offset('abcde', -1)) 62 | assert_equals(1, utf8.offset('abcde', -5)) 63 | assert_equals(nil, utf8.offset('abcde', -6)) 64 | 65 | assert_equals(1, utf8.offset('abcde', 0, 1)) 66 | assert_equals(3, utf8.offset('abcde', 0, 3)) 67 | assert_equals(6, utf8.offset('abcde', 0, 6)) 68 | 69 | assert_equals(3, utf8.offset('abcde', 1, 3)) 70 | assert_equals(5, utf8.offset('abcde', 3, 3)) 71 | assert_equals(6, utf8.offset('abcde', 4, 3)) 72 | assert_equals(nil, utf8.offset('abcde', 5, 3)) 73 | 74 | assert_equals(2, utf8.offset('abcde', -1, 3)) 75 | assert_equals(1, utf8.offset('abcde', -2, 3)) 76 | assert_equals(5, utf8.offset('abcde', -1, 6)) 77 | assert_equals(nil, utf8.offset('abcde', -3, 3)) 78 | 79 | assert_equals(1, utf8.offset('абвгд', 0)) 80 | 81 | assert_equals(1, utf8.offset('абвгд', 1)) 82 | assert_equals(#'абвг' + 1, utf8.offset('абвгд', 5)) 83 | assert_equals(#'абвгд' + 1, utf8.offset('абвгд', 6)) 84 | assert_equals(nil, utf8.offset('абвгд', 7)) 85 | 86 | assert_equals(#'абвг' + 1, utf8.offset('абвгд', -1)) 87 | assert_equals(1, utf8.offset('абвгд', -5)) 88 | assert_equals(nil, utf8.offset('абвгд', -6)) 89 | 90 | assert_equals(1, utf8.offset('абвгд', 0, 1)) 91 | assert_equals(1, utf8.offset('абвгд', 0, 2)) 92 | assert_equals(#'аб' + 1, utf8.offset('абвгд', 0, #'аб' + 1)) 93 | assert_equals(#'аб' + 1, utf8.offset('абвгд', 0, #'аб' + 2)) 94 | assert_equals(#'абвгд' + 1, utf8.offset('абвгд', 0, #'абвгд' + 1)) 95 | 96 | assert_equals(#'аб' + 1, utf8.offset('абвгд', 1, #'аб' + 1)) 97 | assert_equals(#'абвг' + 1, utf8.offset('абвгд', 3, #'аб' + 1)) 98 | assert_equals(#'абвгд' + 1, utf8.offset('абвгд', 4, #'аб' + 1)) 99 | assert_equals(#'абвгд' + 1, utf8.offset('абвгд', 4, #'аб' + 2)) 100 | assert_equals(nil, utf8.offset('абвгд', 5, #'аб' + 1)) 101 | 102 | assert_equals(#'а' + 1, utf8.offset('абвгд', -1, #'аб' + 1)) 103 | assert_equals(1, utf8.offset('абвгд', -2, #'аб' + 1)) 104 | assert_equals(#'абвг' + 1, utf8.offset('абвгд', -1, #'абвгд' + 1)) 105 | assert_equals(nil, utf8.offset('абвгд', -3, #'аб' + 1)) 106 | 107 | assert(("фыва"):validate()) 108 | assert_equals({false, {{ pos = #"ф" + 1, part = 1, code = 255 }} }, {("ф\255ыва"):validate()}) 109 | if LUA_53 then 110 | assert_equals({false, {{ pos = #"ф" + 1, part = 1, code = 0xFF }} }, {("ф\xffыва"):validate()}) 111 | end 112 | 113 | assert_equals(nil, ("aabb"):find("%bcd")) 114 | assert_equals({1, 4}, {("aabb"):find("%bab")}) 115 | assert_equals({1, 2}, {("aba"):find('%bab')}) 116 | 117 | res = {} 118 | for w in ("aacaabbcabbacbaacab"):gmatch('%bab') do res[#res + 1] = w end 119 | assert_equals({"acaabbcabb", "acb", "ab"}, res) 120 | 121 | assert_equals({1, 0}, {("aacaabbcabbacbaacab"):find('%f[acb]')}) 122 | assert_equals("a", ("aba"):match('%f[ab].')) 123 | 124 | res = {} 125 | for w in ("aacaabbcabbacbaacab"):gmatch('%f[ab]') do res[#res + 1] = w end 126 | assert_equals({"", "", "", "", ""}, res) 127 | 128 | assert_equals({"HaacHaabbcHabbacHbaacHab", 5}, {("aacaabbcabbacbaacab"):gsub('%f[ab]', 'H')}) 129 | 130 | res = {} 131 | for w in ("Привет, мир, от Lua"):gmatch("[^%p%d%s%c]+") do res[#res + 1] = w end 132 | assert_equals({"Привет", "мир", "от", "Lua"}, res) 133 | 134 | res = {} 135 | for k, v in ("从=世界, 到=Lua"):gmatch("([^%p%s%c]+)=([^%p%s%c]+)") do res[k] = v end 136 | assert_equals({["到"] = "Lua", ["从"] = "世界"}, res) 137 | 138 | assert_equals("Ahoj Ahoj světe světe", ("Ahoj světe"):gsub("([^%p%s%c]+)", "%1 %1")) 139 | 140 | assert_equals("Ahoj Ahoj světe", ("Ahoj světe"):gsub("[^%p%s%c]+", "%0 %0", 1)) 141 | 142 | assert_equals("κόσμο γεια Lua από", ("γεια κόσμο από Lua"):gsub("([^%p%s%c]+)%s*([^%p%s%c]+)", "%2 %1")) 143 | 144 | assert_equals({8, 27, "ололоо я водитель э"}, {("пыщпыщ ололоо я водитель энло"):find("(.л.+)н")}) 145 | 146 | assert_equals({"пыщпыщ о보라보라 я водитель эн보라", 3}, {("пыщпыщ ололоо я водитель энло"):gsub("ло+", "보라")}) 147 | 148 | assert_equals("пыщпыщ ололоо я", ("пыщпыщ ололоо я водитель энло"):match("^п[лопыщ ]*я")) 149 | 150 | assert_equals("в", ("пыщпыщ ололоо я водитель энло"):match("[в-д]+")) 151 | 152 | assert_equals(nil, ('abc abc'):match('([^%s]+)%s%s')) -- https://github.com/Stepets/utf8.lua/issues/2 153 | 154 | res = {} 155 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("a+b") do res[#res + 1] = w end 156 | assert_equals({"ab","aab"}, res) 157 | 158 | res = {} 159 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("a-b") do res[#res + 1] = w end 160 | assert_equals({"ab","b","b","b","aab","b","b"}, res) 161 | 162 | res = {} 163 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("a*b") do res[#res + 1] = w end 164 | assert_equals({"ab","b","b","b","aab","b","b"}, res) 165 | 166 | res = {} 167 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("ba+") do res[#res + 1] = w end 168 | assert_equals({"ba","ba"}, res) 169 | 170 | res = {} 171 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("ba-") do res[#res + 1] = w end 172 | assert_equals({"b","b","b","b","b","b","b"}, res) 173 | 174 | res = {} 175 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("ba*") do res[#res + 1] = w end 176 | assert_equals({"b","ba","b","b","b","b","ba"}, res) 177 | 178 | assert_equals({"bacbbcaabbcba", "ba"}, {("aacabbacbbcaabbcbacaa"):match("((ba+).*%2)")}) 179 | assert_equals({"bbacbbcaabbcb", "b"}, {("aacabbacbbcaabbcbacaa"):match("((ba*).*%2)")}) 180 | 181 | res = {} 182 | for w in ("aacabbacbbcaabbcbacaa"):gmatch("((b+a*).-%2)") do res[#res + 1] = w end 183 | assert_equals({"bbacbb", "bb"}, res) 184 | 185 | assert_equals("a**", ("a**v"):match("a**+")) 186 | assert_equals("a", ("a**v"):match("a**-")) 187 | 188 | assert_equals({"test", "."}, {("test.lua"):match("(.-)([.])")}) 189 | 190 | -- https://github.com/Stepets/utf8.lua/issues/3 191 | assert_equals({"ab", "c"}, {("abc"):match("^([ab]-)([^b]*)$")}) 192 | assert_equals({"ab", ""}, {("ab"):match("^([ab]-)([^b]*)$")}) 193 | assert_equals({"items.", ""}, {("items."):match("^(.-)([^.]*)$")}) 194 | assert_equals({"", "items"}, {("items"):match("^(.-)([^.]*)$")}) 195 | 196 | -- https://github.com/Stepets/utf8.lua/issues/4 197 | assert_equals({"ab.123", 1}, {("ab.?"):gsub("%?", "123")}) 198 | 199 | -- https://github.com/Stepets/utf8.lua/issues/5 200 | assert_equals({"ab", 1}, {("ab"):gsub("a", "%0")}) 201 | assert_equals({"ab", 1}, {("ab"):gsub("a", "%1")}) 202 | 203 | assert_equals("c", ("abc"):match("c", -1)) 204 | 205 | print("\ntests passed\n") 206 | -------------------------------------------------------------------------------- /include/AL/utf8/modifier/compiletime/vanilla.lua: -------------------------------------------------------------------------------- 1 | return function(utf8) 2 | 3 | local utf8unicode = utf8.byte 4 | local sub = utf8.raw.sub 5 | 6 | local matchers = { 7 | star = function(class, name) 8 | local class_name = 'class' .. name 9 | return [[ 10 | local ]] .. class_name .. [[ = ]] .. class .. [[ 11 | 12 | add(function(ctx) -- star 13 | -- debug(ctx, 'star', ']] .. class_name .. [[') 14 | local clone = ctx:clone() 15 | while ]] .. class_name .. [[:test(clone:get_charcode()) do 16 | clone:next_char() 17 | end 18 | local pos = clone.pos 19 | while pos >= ctx.pos do 20 | clone.pos = pos 21 | clone.func_pos = ctx.func_pos 22 | clone:next_function() 23 | clone:get_function()(clone) 24 | if clone.modified then 25 | clone = ctx:clone() 26 | end 27 | pos = pos - 1 28 | end 29 | end) 30 | ]] 31 | end, 32 | minus = function(class, name) 33 | local class_name = 'class' .. name 34 | return [[ 35 | local ]] .. class_name .. [[ = ]] .. class .. [[ 36 | 37 | add(function(ctx) -- minus 38 | -- debug(ctx, 'minus', ']] .. class_name .. [[') 39 | 40 | local clone = ctx:clone() 41 | local pos 42 | repeat 43 | pos = clone.pos 44 | clone:next_function() 45 | clone:get_function()(clone) 46 | if clone.modified then 47 | clone = ctx:clone() 48 | clone.pos = pos 49 | else 50 | clone.pos = pos 51 | clone.func_pos = ctx.func_pos 52 | end 53 | local match = ]] .. class_name .. [[:test(clone:get_charcode()) 54 | clone:next_char() 55 | until not match 56 | end) 57 | ]] 58 | end, 59 | question = function(class, name) 60 | local class_name = 'class' .. name 61 | return [[ 62 | local ]] .. class_name .. [[ = ]] .. class .. [[ 63 | 64 | add(function(ctx) -- question 65 | -- debug(ctx, 'question', ']] .. class_name .. [[') 66 | local saved = ctx:clone() 67 | if ]] .. class_name .. [[:test(ctx:get_charcode()) then 68 | ctx:next_char() 69 | ctx:next_function() 70 | ctx:get_function()(ctx) 71 | end 72 | ctx = saved 73 | ctx:next_function() 74 | return ctx:get_function()(ctx) 75 | end) 76 | ]] 77 | end, 78 | capture_start = function(number) 79 | return [[ 80 | add(function(ctx) 81 | ctx.modified = true 82 | -- debug(ctx, 'capture_start', ']] .. tostring(number) .. [[') 83 | table.insert(ctx.captures.active, { id = ]] .. tostring(number) .. [[, start = ctx.pos }) 84 | ctx:next_function() 85 | return ctx:get_function()(ctx) 86 | end) 87 | ]] 88 | end, 89 | capture_finish = function(number) 90 | return [[ 91 | add(function(ctx) 92 | ctx.modified = true 93 | -- debug(ctx, 'capture_finish', ']] .. tostring(number) .. [[') 94 | local cap = table.remove(ctx.captures.active) 95 | cap.finish = ctx.pos 96 | local b, e = ctx.offsets[cap.start], ctx.offsets[cap.finish] 97 | if cap.start < 1 then 98 | b = 1 99 | elseif cap.start >= ctx.len then 100 | b = ctx.rawlen + 1 101 | end 102 | if cap.finish < 1 then 103 | e = 1 104 | elseif cap.finish >= ctx.len then 105 | e = ctx.rawlen + 1 106 | end 107 | ctx.captures[cap.id] = rawsub(ctx.str, b, e - 1) 108 | -- debug('capture#' .. tostring(cap.id), '[' .. tostring(cap.start).. ',' .. tostring(cap.finish) .. ']' , 'is', ctx.captures[cap.id]) 109 | ctx:next_function() 110 | return ctx:get_function()(ctx) 111 | end) 112 | ]] 113 | end, 114 | capture_position = function(number) 115 | return [[ 116 | add(function(ctx) 117 | ctx.modified = true 118 | -- debug(ctx, 'capture_position', ']] .. tostring(number) .. [[') 119 | ctx.captures[ ]] .. tostring(number) .. [[ ] = ctx.pos 120 | ctx:next_function() 121 | return ctx:get_function()(ctx) 122 | end) 123 | ]] 124 | end, 125 | capture = function(number) 126 | return [[ 127 | add(function(ctx) 128 | -- debug(ctx, 'capture', ']] .. tostring(number) .. [[') 129 | local cap = ctx.captures[ ]] .. tostring(number) .. [[ ] 130 | local len = utf8len(cap) 131 | local check = utf8sub(ctx.str, ctx.pos, ctx.pos + len - 1) 132 | -- debug("capture check:", cap, check) 133 | if cap == check then 134 | ctx.pos = ctx.pos + len 135 | ctx:next_function() 136 | return ctx:get_function()(ctx) 137 | end 138 | end) 139 | ]] 140 | end, 141 | balancer = function(pair, name) 142 | local class_name = 'class' .. name 143 | return [[ 144 | 145 | add(function(ctx) -- balancer 146 | local d, b = ]] .. tostring(utf8unicode(pair[1])) .. [[, ]] .. tostring(utf8unicode(pair[2])) .. [[ 147 | if ctx:get_charcode() ~= d then return end 148 | local balance = 0 149 | repeat 150 | local c = ctx:get_charcode() 151 | if c == nil then return end 152 | 153 | if c == d then 154 | balance = balance + 1 155 | elseif c == b then 156 | balance = balance - 1 157 | end 158 | -- debug("balancer: balance=", balance, ", d=", d, ", b=", b, ", charcode=", ctx:get_charcode()) 159 | ctx:next_char() 160 | until balance == 0 or (balance == 2 and d == b) 161 | ctx:next_function() 162 | return ctx:get_function()(ctx) 163 | end) 164 | ]] 165 | end, 166 | simple = utf8:require("modifier.compiletime.simple").simple, 167 | } 168 | 169 | local next = utf8.util.next 170 | 171 | local function parse(regex, c, bs, ctx) 172 | local functions, nbs = nil, bs 173 | if c == '%' then 174 | c, nbs = next(regex, bs) 175 | utf8.debug("next", c, bs) 176 | if c == '' then 177 | error("malformed pattern (ends with '%')") 178 | end 179 | if utf8.raw.find('123456789', c, 1, true) then 180 | functions = { matchers.capture(tonumber(c)) } 181 | nbs = utf8.next(regex, nbs) 182 | elseif c == 'b' then 183 | local d, b 184 | d, nbs = next(regex, nbs) 185 | b, nbs = next(regex, nbs) 186 | assert(d ~= '' and b ~= '', "unbalanced pattern") 187 | functions = { matchers.balancer({d, b}, tostring(bs)) } 188 | nbs = utf8.next(regex, nbs) 189 | end 190 | 191 | if functions and ctx.prev_class then 192 | table.insert(functions, 1, matchers.simple(ctx.prev_class, tostring(bs))) 193 | end 194 | elseif c == '*' and ctx.prev_class then 195 | functions = { 196 | matchers.star( 197 | ctx.prev_class, 198 | tostring(bs) 199 | ) 200 | } 201 | nbs = bs + 1 202 | elseif c == '+' and ctx.prev_class then 203 | functions = { 204 | matchers.simple( 205 | ctx.prev_class, 206 | tostring(bs) 207 | ), 208 | matchers.star( 209 | ctx.prev_class, 210 | tostring(bs) 211 | ) 212 | } 213 | nbs = bs + 1 214 | elseif c == '-' and ctx.prev_class then 215 | functions = { 216 | matchers.minus( 217 | ctx.prev_class, 218 | tostring(bs) 219 | ) 220 | } 221 | nbs = bs + 1 222 | elseif c == '?' and ctx.prev_class then 223 | functions = { 224 | matchers.question( 225 | ctx.prev_class, 226 | tostring(bs) 227 | ) 228 | } 229 | nbs = bs + 1 230 | elseif c == '(' then 231 | ctx.capture = ctx.capture or {balance = 0, id = 0} 232 | ctx.capture.id = ctx.capture.id + 1 233 | local nc = next(regex, nbs) 234 | if nc == ')' then 235 | functions = {matchers.capture_position(ctx.capture.id)} 236 | nbs = bs + 2 237 | else 238 | ctx.capture.balance = ctx.capture.balance + 1 239 | functions = {matchers.capture_start(ctx.capture.id)} 240 | nbs = bs + 1 241 | end 242 | if ctx.prev_class then 243 | table.insert(functions, 1, matchers.simple(ctx.prev_class, tostring(bs))) 244 | end 245 | elseif c == ')' then 246 | ctx.capture = ctx.capture or {balance = 0, id = 0} 247 | functions = { matchers.capture_finish(ctx.capture.id) } 248 | 249 | ctx.capture.balance = ctx.capture.balance - 1 250 | assert(ctx.capture.balance >= 0, 'invalid capture: "(" missing') 251 | 252 | if ctx.prev_class then 253 | table.insert(functions, 1, matchers.simple(ctx.prev_class, tostring(bs))) 254 | end 255 | nbs = bs + 1 256 | end 257 | 258 | return functions, nbs - bs 259 | end 260 | 261 | local function check(ctx) 262 | if ctx.capture then assert(ctx.capture.balance == 0, 'invalid capture: ")" missing') end 263 | end 264 | 265 | return { 266 | parse = parse, 267 | check = check, 268 | } 269 | 270 | end 271 | -------------------------------------------------------------------------------- /include/AL/utf8/test/test_pm.lua: -------------------------------------------------------------------------------- 1 | --[[-- 2 | MIT License 3 | 4 | Copyright (c) 2018 Xavier Wang 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | --]]-- 24 | 25 | local utf8 = require 'init' 26 | utf8.config = { 27 | debug = nil, --utf8:require("util").debug, 28 | } 29 | utf8:init() 30 | 31 | print('testing pattern matching') 32 | 33 | local 34 | function f(s, p) 35 | local i,e = utf8.find(s, p) 36 | if i then return utf8.sub(s, i, e) end 37 | end 38 | 39 | local 40 | function f1(s, p) 41 | p = utf8.gsub(p, "%%([0-9])", function (s) return "%" .. (tonumber(s)+1) end) 42 | p = utf8.gsub(p, "^(^?)", "%1()", 1) 43 | p = utf8.gsub(p, "($?)$", "()%1", 1) 44 | local t = {utf8.match(s, p)} 45 | return utf8.sub(s, t[1], t[#t] - 1) 46 | end 47 | 48 | local 49 | a,b = utf8.find('', '') -- empty patterns are tricky 50 | assert(a == 1 and b == 0); 51 | a,b = utf8.find('alo', '') 52 | assert(a == 1 and b == 0) 53 | a,b = utf8.find('a\0o a\0o a\0o', 'a', 1) -- first position 54 | assert(a == 1 and b == 1) 55 | a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 2) -- starts in the midle 56 | assert(a == 5 and b == 7) 57 | a,b = utf8.find('a\0o a\0o a\0o', 'a\0o', 9) -- starts in the midle 58 | assert(a == 9 and b == 11) 59 | a,b = utf8.find('a\0a\0a\0a\0\0ab', '\0ab', 2); -- finds at the end 60 | assert(a == 9 and b == 11); 61 | a,b = utf8.find('a\0a\0a\0a\0\0ab', 'b') -- last position 62 | assert(a == 11 and b == 11) 63 | assert(utf8.find('a\0a\0a\0a\0\0ab', 'b\0') == nil) -- check ending 64 | assert(utf8.find('', '\0') == nil) 65 | assert(utf8.find('alo123alo', '12') == 4) 66 | assert(utf8.find('alo123alo', '^12') == nil) 67 | 68 | assert(utf8.match("aaab", ".*b") == "aaab") 69 | assert(utf8.match("aaa", ".*a") == "aaa") 70 | assert(utf8.match("b", ".*b") == "b") 71 | 72 | assert(utf8.match("aaab", ".+b") == "aaab") 73 | assert(utf8.match("aaa", ".+a") == "aaa") 74 | assert(not utf8.match("b", ".+b")) 75 | 76 | assert(utf8.match("aaab", ".?b") == "ab") 77 | assert(utf8.match("aaa", ".?a") == "aa") 78 | assert(utf8.match("b", ".?b") == "b") 79 | 80 | assert(f('aloALO', '%l*') == 'alo') 81 | assert(f('aLo_ALO', '%a*') == 'aLo') 82 | 83 | assert(f(" \n\r*&\n\r xuxu \n\n", "%g%g%g+") == "xuxu") 84 | 85 | assert(f('aaab', 'a*') == 'aaa'); 86 | assert(f('aaa', '^.*$') == 'aaa'); 87 | assert(f('aaa', 'b*') == ''); 88 | assert(f('aaa', 'ab*a') == 'aa') 89 | assert(f('aba', 'ab*a') == 'aba') 90 | assert(f('aaab', 'a+') == 'aaa') 91 | assert(f('aaa', '^.+$') == 'aaa') 92 | assert(f('aaa', 'b+') == nil) 93 | assert(f('aaa', 'ab+a') == nil) 94 | assert(f('aba', 'ab+a') == 'aba') 95 | assert(f('a$a', '.$') == 'a') 96 | assert(f('a$a', '.%$') == 'a$') 97 | assert(f('a$a', '.$.') == 'a$a') 98 | assert(f('a$a', '$$') == nil) 99 | assert(f('a$b', 'a$') == nil) 100 | assert(f('a$a', '$') == '') 101 | assert(f('', 'b*') == '') 102 | assert(f('aaa', 'bb*') == nil) 103 | assert(f('aaab', 'a-') == '') 104 | assert(f('aaa', '^.-$') == 'aaa') 105 | assert(f('aabaaabaaabaaaba', 'b.*b') == 'baaabaaabaaab') 106 | assert(f('aabaaabaaabaaaba', 'b.-b') == 'baaab') 107 | assert(f('alo xo', '.o$') == 'xo') 108 | assert(f(' \n isto é assim', '%S%S*') == 'isto') 109 | assert(f(' \n isto é assim', '%S*$') == 'assim') 110 | assert(f(' \n isto é assim', '[a-z]*$') == 'assim') 111 | assert(f('um caracter ? extra', '[^%sa-z]') == '?') 112 | assert(f('', 'a?') == '') 113 | assert(f('á', 'á?') == 'á') 114 | assert(f('ábl', 'á?b?l?') == 'ábl') 115 | assert(f(' ábl', 'á?b?l?') == '') 116 | assert(f('aa', '^aa?a?a') == 'aa') 117 | assert(f(']]]áb', '[^]]') == 'á') 118 | assert(f("0alo alo", "%x*") == "0a") 119 | assert(f("alo alo", "%C+") == "alo alo") 120 | print('+') 121 | 122 | assert(f1('alo alx 123 b\0o b\0o', '(..*) %1') == "b\0o b\0o") 123 | assert(f1('axz123= 4= 4 34', '(.+)=(.*)=%2 %1') == '3= 4= 4 3') 124 | assert(f1('=======', '^(=*)=%1$') == '=======') 125 | assert(utf8.match('==========', '^([=]*)=%1$') == nil) 126 | 127 | local function range (i, j) 128 | if i <= j then 129 | return i, range(i+1, j) 130 | end 131 | end 132 | 133 | local abc = utf8.char(range(0, 255)); 134 | 135 | assert(utf8.len(abc) == 256) 136 | assert(string.len(abc) == 384) 137 | 138 | local 139 | function strset (p) 140 | local res = {s=''} 141 | utf8.gsub(abc, p, function (c) res.s = res.s .. c end) 142 | return res.s 143 | end; 144 | 145 | local a, b, c, d, e, t 146 | 147 | -- local E = utf8.escape 148 | -- assert(utf8.len(strset(E'[%200-%210]')) == 11) 149 | 150 | assert(strset('[a-z]') == "abcdefghijklmnopqrstuvwxyz") 151 | assert(strset('[a-z%d]') == strset('[%da-uu-z]')) 152 | assert(strset('[a-]') == "-a") 153 | assert(strset('[^%W]') == strset('[%w]')) 154 | assert(strset('[]%%]') == '%]') 155 | assert(strset('[a%-z]') == '-az') 156 | assert(strset('[%^%[%-a%]%-b]') == '-[]^ab') 157 | -- assert(strset('%Z') == strset(E'[%1-%255]')) 158 | -- assert(strset('.') == strset(E'[%1-%255%%z]')) 159 | print('+'); 160 | 161 | assert(utf8.match("alo xyzK", "(%w+)K") == "xyz") 162 | assert(utf8.match("254 K", "(%d*)K") == "") 163 | assert(utf8.match("alo ", "(%w*)$") == "") 164 | assert(utf8.match("alo ", "(%w+)$") == nil) 165 | assert(utf8.find("(álo)", "%(á") == 1) 166 | a, b, c, d, e = utf8.match("âlo alo", "^(((.).).* (%w*))$") 167 | assert(a == 'âlo alo' and b == 'âl' and c == 'â' and d == 'alo' and e == nil) 168 | a, b, c, d = utf8.match('0123456789', '(.+(.?)())') 169 | assert(a == '0123456789' and b == '' and c == 11 and d == nil) 170 | print('+') 171 | 172 | assert(utf8.gsub('ülo ülo', 'ü', 'x') == 'xlo xlo') 173 | assert(utf8.gsub('alo úlo ', ' +$', '') == 'alo úlo') -- trim 174 | assert(utf8.gsub(' alo alo ', '^%s*(.-)%s*$', '%1') == 'alo alo') -- double trim 175 | assert(utf8.gsub('alo alo \n 123\n ', '%s+', ' ') == 'alo alo 123 ') 176 | t = "abç d" 177 | a, b = utf8.gsub(t, '(.)', '%1@') 178 | assert('@'..a == utf8.gsub(t, '', '@') and b == 5) 179 | a, b = utf8.gsub('abçd', '(.)', '%0@', 2) 180 | assert(a == 'a@b@çd' and b == 2) 181 | assert(utf8.gsub('alo alo', '()[al]', '%1') == '12o 56o') 182 | assert(utf8.gsub("abc=xyz", "(%w*)(%p)(%w+)", "%3%2%1-%0") == 183 | "xyz=abc-abc=xyz") 184 | assert(utf8.gsub("abc", "%w", "%1%0") == "aabbcc") 185 | assert(utf8.gsub("abc", "%w+", "%0%1") == "abcabc") 186 | assert(utf8.gsub('áéí', '$', '\0óú') == 'áéí\0óú') 187 | assert(utf8.gsub('', '^', 'r') == 'r') 188 | assert(utf8.gsub('', '$', 'r') == 'r') 189 | print('+') 190 | 191 | assert(utf8.gsub("um (dois) tres (quatro)", "(%(%w+%))", utf8.upper) == 192 | "um (DOIS) tres (QUATRO)") 193 | 194 | do 195 | local function setglobal (n,v) rawset(_G, n, v) end 196 | utf8.gsub("a=roberto,roberto=a", "(%w+)=(%w%w*)", setglobal) 197 | assert(_G.a=="roberto" and _G.roberto=="a") 198 | end 199 | 200 | function f(a,b) return utf8.gsub(a,'.',b) end 201 | assert(utf8.gsub("trocar tudo em |teste|b| é |beleza|al|", "|([^|]*)|([^|]*)|", f) == 202 | "trocar tudo em bbbbb é alalalalalal") 203 | 204 | local function dostring (s) return (loadstring or load)(s)() or "" end 205 | assert(utf8.gsub("alo $a=1$ novamente $return a$", "$([^$]*)%$", dostring) == 206 | "alo novamente 1") 207 | 208 | x = utf8.gsub("$local utf8=require'init' x=utf8.gsub('alo', '.', utf8.upper)$ assim vai para $return x$", 209 | "$([^$]*)%$", dostring) 210 | assert(x == ' assim vai para ALO') 211 | 212 | local s,r 213 | t = {} 214 | s = 'a alo jose joao' 215 | r = utf8.gsub(s, '()(%w+)()', function (a,w,b) 216 | assert(utf8.len(w) == b-a); 217 | t[a] = b-a; 218 | end) 219 | assert(s == r and t[1] == 1 and t[3] == 3 and t[7] == 4 and t[13] == 4) 220 | 221 | local 222 | function isbalanced (s) 223 | return utf8.find(utf8.gsub(s, "%b()", ""), "[()]") == nil 224 | end 225 | 226 | assert(isbalanced("(9 ((8))(\0) 7) \0\0 a b ()(c)() a")) 227 | assert(not isbalanced("(9 ((8) 7) a b (\0 c) a")) 228 | assert(utf8.gsub("alo 'oi' alo", "%b''", '"') == 'alo " alo') 229 | 230 | 231 | local t = {"apple", "orange", "lime"; n=0} 232 | assert(utf8.gsub("x and x and x", "x", function () t.n=t.n+1; return t[t.n] end) 233 | == "apple and orange and lime") 234 | 235 | t = {n=0} 236 | utf8.gsub("first second word", "%w%w*", function (w) t.n=t.n+1; t[t.n] = w end) 237 | assert(t[1] == "first" and t[2] == "second" and t[3] == "word" and t.n == 3) 238 | 239 | t = {n=0} 240 | assert(utf8.gsub("first second word", "%w+", 241 | function (w) t.n=t.n+1; t[t.n] = w end, 2) == "first second word") 242 | assert(t[1] == "first" and t[2] == "second" and t[3] == nil) 243 | 244 | assert(not pcall(utf8.gsub, "alo", "(.", print)) 245 | assert(not pcall(utf8.gsub, "alo", ".)", print)) 246 | assert(not pcall(utf8.gsub, "alo", "(.", {})) 247 | assert(not pcall(utf8.gsub, "alo", "(.)", "%2")) 248 | assert(not pcall(utf8.gsub, "alo", "(%1)", "a")) 249 | --[[-- 250 | Stepets: ignoring this test because it's probably bug in Lua. 251 | %0 should be interpreted as capture reference only in replacement arg 252 | it doesn't have sense in pattern 253 | --]]-- 254 | -- assert(not pcall(utf8.gsub, "alo", "(%0)", "a")) 255 | 256 | -- bug since 2.5 (C-stack overflow) 257 | -- todo: benchmark OOM 258 | -- do 259 | -- local function f (size) 260 | -- local s = string.rep("a", size) 261 | -- local p = string.rep(".?", size) 262 | -- return pcall(utf8.match, s, p) 263 | -- end 264 | -- local r, m = f(80) 265 | -- assert(r and #m == 80) 266 | -- r, m = f(200000) 267 | -- assert(not r and utf8.find(m, "too complex")) 268 | -- end 269 | 270 | -- if not _soft then 271 | -- -- big strings 272 | -- local a = string.rep('a', 300000) 273 | -- assert(utf8.find(a, '^a*.?$')) 274 | -- assert(not utf8.find(a, '^a*.?b$')) 275 | -- assert(utf8.find(a, '^a-.?$')) 276 | 277 | -- -- bug in 5.1.2 278 | -- a = string.rep('a', 10000) .. string.rep('b', 10000) 279 | -- assert(not pcall(utf8.gsub, a, 'b')) 280 | -- end 281 | 282 | -- recursive nest of gsubs 283 | local function rev (s) 284 | return utf8.gsub(s, "(.)(.+)", function (c,s1) return rev(s1)..c end) 285 | end 286 | 287 | local x = "abcdef" 288 | assert(rev(rev(x)) == x) 289 | 290 | 291 | -- gsub with tables 292 | assert(utf8.gsub("alo alo", ".", {}) == "alo alo") 293 | assert(utf8.gsub("alo alo", "(.)", {a="AA", l=""}) == "AAo AAo") 294 | assert(utf8.gsub("alo alo", "(.).", {a="AA", l="K"}) == "AAo AAo") 295 | assert(utf8.gsub("alo alo", "((.)(.?))", {al="AA", o=false}) == "AAo AAo") 296 | 297 | assert(utf8.gsub("alo alo", "().", {2,5,6}) == "256 alo") 298 | 299 | t = {}; setmetatable(t, {__index = function (t,s) return utf8.upper(s) end}) 300 | assert(utf8.gsub("a alo b hi", "%w%w+", t) == "a ALO b HI") 301 | 302 | 303 | -- tests for gmatch 304 | local a = 0 305 | for i in utf8.gmatch('abcde', '()') do assert(i == a+1); a=i end 306 | assert(a==6) 307 | 308 | t = {n=0} 309 | for w in utf8.gmatch("first second word", "%w+") do 310 | t.n=t.n+1; t[t.n] = w 311 | end 312 | assert(t[1] == "first" and t[2] == "second" and t[3] == "word") 313 | 314 | t = {3, 6, 9} 315 | for i in utf8.gmatch ("xuxx uu ppar r", "()(.)%2") do 316 | assert(i == table.remove(t, 1)) 317 | end 318 | assert(#t == 0) 319 | 320 | t = {} 321 | for i,j in utf8.gmatch("13 14 10 = 11, 15= 16, 22=23", "(%d+)%s*=%s*(%d+)") do 322 | t[i] = j 323 | end 324 | a = 0 325 | for k,v in pairs(t) do assert(k+1 == v+0); a=a+1 end 326 | assert(a == 3) 327 | 328 | 329 | -- tests for `%f' (`frontiers') 330 | 331 | assert(utf8.gsub("aaa aa a aaa a", "%f[%w]a", "x") == "xaa xa x xaa x") 332 | assert(utf8.gsub("[[]] [][] [[[[", "%f[[].", "x") == "x[]] x]x] x[[[") 333 | assert(utf8.gsub("01abc45de3", "%f[%d]", ".") == ".01abc.45de.3") 334 | assert(utf8.gsub("01abc45 de3x", "%f[%D]%w", ".") == "01.bc45 de3.") 335 | -- local u = utf8.escape 336 | -- assert(utf8.gsub("function", u"%%f[%1-%255]%%w", ".") == ".unction") 337 | -- assert(utf8.gsub("function", u"%%f[^%1-%255]", ".") == "function.") 338 | 339 | --[[-- 340 | Stepets: %z is Lua 5.1 class for representing \0 341 | Lua 5.2, Lua 5.3 doesn't have it in documentation. So it's considered deprecated. 342 | --]]-- 343 | assert(utf8.find("a", "%f[a]") == 1) 344 | assert(utf8.find("a", "%f[^%z]") == 1) 345 | assert(utf8.find("a", "%f[^%l]") == 2) 346 | assert(utf8.find("aba", "%f[a%z]") == 3) 347 | assert(utf8.find("aba", "%f[%z]") == 4) 348 | assert(not utf8.find("aba", "%f[%l%z]")) 349 | assert(not utf8.find("aba", "%f[^%l%z]")) 350 | 351 | local i, e = utf8.find(" alo aalo allo", "%f[%S].-%f[%s].-%f[%S]") 352 | assert(i == 2 and e == 5) 353 | local k = utf8.match(" alo aalo allo", "%f[%S](.-%f[%s].-%f[%S])") 354 | assert(k == 'alo ') 355 | 356 | local a = {1, 5, 9, 14, 17,} 357 | for k in utf8.gmatch("alo alo th02 is 1hat", "()%f[%w%d]") do 358 | assert(table.remove(a, 1) == k) 359 | end 360 | assert(#a == 0) 361 | 362 | -- malformed patterns 363 | local function malform (p, m) 364 | m = m or "malformed" 365 | local r, msg = pcall(utf8.find, "a", p) 366 | assert(not r and utf8.find(msg, m)) 367 | end 368 | 369 | malform("[a") 370 | malform("[]") 371 | malform("[^]") 372 | malform("[a%]") 373 | malform("[a%") 374 | malform("%b", "unbalanced") 375 | malform("%ba", "unbalanced") 376 | malform("%") 377 | malform("%f", "missing") 378 | 379 | -- \0 in patterns 380 | assert(utf8.match("ab\0\1\2c", "[\0-\2]+") == "\0\1\2") 381 | assert(utf8.match("ab\0\1\2c", "[\0-\0]+") == "\0") 382 | assert(utf8.find("b$a", "$\0?") == 2) 383 | assert(utf8.find("abc\0efg", "%\0") == 4) 384 | assert(utf8.match("abc\0efg\0\1e\1g", "%b\0\1") == "\0efg\0\1e\1") 385 | assert(utf8.match("abc\0\0\0", "%\0+") == "\0\0\0") 386 | assert(utf8.match("abc\0\0\0", "%\0%\0?") == "\0\0") 387 | 388 | -- magic char after \0 389 | assert(utf8.find("abc\0\0","\0.") == 4) 390 | assert(utf8.find("abcx\0\0abc\0abc","x\0\0abc\0a.") == 4) 391 | 392 | print('OK') 393 | -------------------------------------------------------------------------------- /include/AL/utf8/primitives/dummy.lua: -------------------------------------------------------------------------------- 1 | -- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $ 2 | -- 3 | -- Provides UTF-8 aware string functions implemented in pure lua: 4 | -- * utf8len(s) 5 | -- * utf8sub(s, i, j) 6 | -- * utf8reverse(s) 7 | -- * utf8char(unicode) 8 | -- * utf8unicode(s, i, j) 9 | -- * utf8gensub(s, sub_len) 10 | -- * utf8find(str, regex, init, plain) 11 | -- * utf8match(str, regex, init) 12 | -- * utf8gmatch(str, regex, all) 13 | -- * utf8gsub(str, regex, repl, limit) 14 | -- 15 | -- All functions behave as their non UTF-8 aware counterparts with the exception 16 | -- that UTF-8 characters are used instead of bytes for all units. 17 | 18 | --[[ 19 | Copyright (c) 2006-2007, Kyle Smith 20 | All rights reserved. 21 | 22 | Contributors: 23 | Alimov Stepan 24 | 25 | Redistribution and use in source and binary forms, with or without 26 | modification, are permitted provided that the following conditions are met: 27 | 28 | * Redistributions of source code must retain the above copyright notice, 29 | this list of conditions and the following disclaimer. 30 | * Redistributions in binary form must reproduce the above copyright 31 | notice, this list of conditions and the following disclaimer in the 32 | documentation and/or other materials provided with the distribution. 33 | * Neither the name of the author nor the names of its contributors may be 34 | used to endorse or promote products derived from this software without 35 | specific prior written permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 38 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 39 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 40 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 41 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 42 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 43 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 44 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 45 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 46 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 47 | --]] 48 | 49 | -- ABNF from RFC 3629 50 | -- 51 | -- UTF8-octets = *( UTF8-char ) 52 | -- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 53 | -- UTF8-1 = %x00-7F 54 | -- UTF8-2 = %xC2-DF UTF8-tail 55 | -- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / 56 | -- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) 57 | -- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / 58 | -- %xF4 %x80-8F 2( UTF8-tail ) 59 | -- UTF8-tail = %x80-BF 60 | -- 61 | return function(utf8) 62 | 63 | local byte = string.byte 64 | local char = string.char 65 | local dump = string.dump 66 | local find = string.find 67 | local format = string.format 68 | local len = string.len 69 | local lower = string.lower 70 | local rep = string.rep 71 | local sub = string.sub 72 | local upper = string.upper 73 | 74 | local utf8charpattern = '[%z\1-\127\194-\244][\128-\191]*' 75 | 76 | local function utf8symbollen(byte) 77 | return not byte and 0 or (byte < 0x80 and 1) or (byte >= 0xF0 and 4) or (byte >= 0xE0 and 3) or (byte >= 0xC0 and 2) or 1 78 | end 79 | 80 | local head_table = utf8.config.int32array(256) 81 | for i = 0, 255 do 82 | head_table[i] = utf8symbollen(i) 83 | end 84 | head_table[256] = 0 85 | 86 | local function utf8charbytes(str, bs) 87 | return head_table[byte(str, bs) or 256] 88 | end 89 | 90 | local function utf8next(str, bs) 91 | return bs + utf8charbytes(str, bs) 92 | end 93 | 94 | -- returns the number of characters in a UTF-8 string 95 | local function utf8len (str) 96 | local bs = 1 97 | local bytes = len(str) 98 | local length = 0 99 | 100 | while bs <= bytes do 101 | length = length + 1 102 | bs = utf8next(str, bs) 103 | end 104 | 105 | return length 106 | end 107 | 108 | -- functions identically to string.sub except that i and j are UTF-8 characters 109 | -- instead of bytes 110 | local function utf8sub (s, i, j) 111 | -- argument defaults 112 | j = j or -1 113 | 114 | local bs = 1 115 | local bytes = len(s) 116 | local length = 0 117 | 118 | local l = (i >= 0 and j >= 0) or utf8len(s) 119 | i = (i >= 0) and i or l + i + 1 120 | j = (j >= 0) and j or l + j + 1 121 | 122 | if i > j then 123 | return "" 124 | end 125 | 126 | local start, finish = 1, bytes 127 | 128 | while bs <= bytes do 129 | length = length + 1 130 | 131 | if length == i then 132 | start = bs 133 | end 134 | 135 | bs = utf8next(s, bs) 136 | 137 | if length == j then 138 | finish = bs - 1 139 | break 140 | end 141 | end 142 | 143 | if i > length then start = bytes + 1 end 144 | if j < 1 then finish = 0 end 145 | 146 | return sub(s, start, finish) 147 | end 148 | 149 | -- http://en.wikipedia.org/wiki/Utf8 150 | -- http://developer.coronalabs.com/code/utf-8-conversion-utility 151 | local function utf8char(...) 152 | local codes = {...} 153 | local result = {} 154 | 155 | for _, unicode in ipairs(codes) do 156 | 157 | if unicode <= 0x7F then 158 | result[#result + 1] = unicode 159 | elseif unicode <= 0x7FF then 160 | local b0 = 0xC0 + math.floor(unicode / 0x40); 161 | local b1 = 0x80 + (unicode % 0x40); 162 | result[#result + 1] = b0 163 | result[#result + 1] = b1 164 | elseif unicode <= 0xFFFF then 165 | local b0 = 0xE0 + math.floor(unicode / 0x1000); 166 | local b1 = 0x80 + (math.floor(unicode / 0x40) % 0x40); 167 | local b2 = 0x80 + (unicode % 0x40); 168 | result[#result + 1] = b0 169 | result[#result + 1] = b1 170 | result[#result + 1] = b2 171 | elseif unicode <= 0x10FFFF then 172 | local code = unicode 173 | local b3= 0x80 + (code % 0x40); 174 | code = math.floor(code / 0x40) 175 | local b2= 0x80 + (code % 0x40); 176 | code = math.floor(code / 0x40) 177 | local b1= 0x80 + (code % 0x40); 178 | code = math.floor(code / 0x40) 179 | local b0= 0xF0 + code; 180 | 181 | result[#result + 1] = b0 182 | result[#result + 1] = b1 183 | result[#result + 1] = b2 184 | result[#result + 1] = b3 185 | else 186 | error 'Unicode cannot be greater than U+10FFFF!' 187 | end 188 | 189 | end 190 | 191 | return char(utf8.config.unpack(result)) 192 | end 193 | 194 | 195 | local shift_6 = 2^6 196 | local shift_12 = 2^12 197 | local shift_18 = 2^18 198 | 199 | local utf8unicode 200 | utf8unicode = function(str, ibs, jbs) 201 | if ibs > jbs then return end 202 | 203 | local ch,bytes 204 | 205 | bytes = utf8charbytes(str, ibs) 206 | if bytes == 0 then return end 207 | 208 | local unicode 209 | 210 | if bytes == 1 then unicode = byte(str, ibs, ibs) end 211 | if bytes == 2 then 212 | local byte0,byte1 = byte(str, ibs, ibs + 1) 213 | if byte0 and byte1 then 214 | local code0,code1 = byte0-0xC0,byte1-0x80 215 | unicode = code0*shift_6 + code1 216 | else 217 | unicode = byte0 218 | end 219 | end 220 | if bytes == 3 then 221 | local byte0,byte1,byte2 = byte(str, ibs, ibs + 2) 222 | if byte0 and byte1 and byte2 then 223 | local code0,code1,code2 = byte0-0xE0,byte1-0x80,byte2-0x80 224 | unicode = code0*shift_12 + code1*shift_6 + code2 225 | else 226 | unicode = byte0 227 | end 228 | end 229 | if bytes == 4 then 230 | local byte0,byte1,byte2,byte3 = byte(str, ibs, ibs + 3) 231 | if byte0 and byte1 and byte2 and byte3 then 232 | local code0,code1,code2,code3 = byte0-0xF0,byte1-0x80,byte2-0x80,byte3-0x80 233 | unicode = code0*shift_18 + code1*shift_12 + code2*shift_6 + code3 234 | else 235 | unicode = byte0 236 | end 237 | end 238 | 239 | if ibs == jbs then 240 | return unicode 241 | else 242 | return unicode,utf8unicode(str, ibs+bytes, jbs) 243 | end 244 | end 245 | 246 | local function utf8byte(str, i, j) 247 | if #str == 0 then return end 248 | 249 | local ibs, jbs 250 | 251 | if i or j then 252 | i = i or 1 253 | j = j or i 254 | 255 | local str_len = utf8len(str) 256 | i = i < 0 and str_len + i + 1 or i 257 | j = j < 0 and str_len + j + 1 or j 258 | j = j > str_len and str_len or j 259 | 260 | if i > j then return end 261 | 262 | for p = 1, i - 1 do 263 | ibs = utf8next(str, ibs or 1) 264 | end 265 | 266 | if i == j then 267 | jbs = ibs 268 | else 269 | for p = 1, j - 1 do 270 | jbs = utf8next(str, jbs or 1) 271 | end 272 | end 273 | 274 | if not ibs or not jbs then 275 | return nil 276 | end 277 | else 278 | ibs, jbs = 1, 1 279 | end 280 | 281 | return utf8unicode(str, ibs, jbs) 282 | end 283 | 284 | local function utf8gensub(str, sub_len) 285 | sub_len = sub_len or 1 286 | local max_len = #str 287 | return function(skip_ptr, bs) 288 | bs = (bs and bs or 1) + (skip_ptr and (skip_ptr[1] or 0) or 0) 289 | 290 | local nbs = bs 291 | if bs > max_len then return nil end 292 | for i = 1, sub_len do 293 | nbs = utf8next(str, nbs) 294 | end 295 | 296 | return nbs, sub(str, bs, nbs - 1), bs 297 | end 298 | end 299 | 300 | local function utf8reverse (s) 301 | local result = '' 302 | for _, w in utf8gensub(s) do result = w .. result end 303 | return result 304 | end 305 | 306 | local function utf8validator(str, bs) 307 | bs = bs or 1 308 | 309 | if type(str) ~= "string" then 310 | error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(str).. ")") 311 | end 312 | if type(bs) ~= "number" then 313 | error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(bs).. ")") 314 | end 315 | 316 | local c = byte(str, bs) 317 | if not c then return end 318 | 319 | -- determine bytes needed for character, based on RFC 3629 320 | 321 | -- UTF8-1 322 | if c >= 0 and c <= 127 then 323 | return bs + 1 324 | elseif c >= 128 and c <= 193 then 325 | return bs + 1, bs, 1, c 326 | -- UTF8-2 327 | elseif c >= 194 and c <= 223 then 328 | local c2 = byte(str, bs + 1) 329 | if not c2 or c2 < 128 or c2 > 191 then 330 | return bs + 2, bs, 2, c2 331 | end 332 | 333 | return bs + 2 334 | -- UTF8-3 335 | elseif c >= 224 and c <= 239 then 336 | local c2 = byte(str, bs + 1) 337 | 338 | if not c2 then 339 | return bs + 2, bs, 2, c2 340 | end 341 | 342 | -- validate byte 2 343 | if c == 224 and (c2 < 160 or c2 > 191) then 344 | return bs + 2, bs, 2, c2 345 | elseif c == 237 and (c2 < 128 or c2 > 159) then 346 | return bs + 2, bs, 2, c2 347 | elseif c2 < 128 or c2 > 191 then 348 | return bs + 2, bs, 2, c2 349 | end 350 | 351 | local c3 = byte(str, bs + 2) 352 | if not c3 or c3 < 128 or c3 > 191 then 353 | return bs + 3, bs, 3, c3 354 | end 355 | 356 | return bs + 3 357 | -- UTF8-4 358 | elseif c >= 240 and c <= 244 then 359 | local c2 = byte(str, bs + 1) 360 | 361 | if not c2 then 362 | return bs + 2, bs, 2, c2 363 | end 364 | 365 | -- validate byte 2 366 | if c == 240 and (c2 < 144 or c2 > 191) then 367 | return bs + 2, bs, 2, c2 368 | elseif c == 244 and (c2 < 128 or c2 > 143) then 369 | return bs + 2, bs, 2, c2 370 | elseif c2 < 128 or c2 > 191 then 371 | return bs + 2, bs, 2, c2 372 | end 373 | 374 | local c3 = byte(str, bs + 2) 375 | if not c3 or c3 < 128 or c3 > 191 then 376 | return bs + 3, bs, 3, c3 377 | end 378 | 379 | local c4 = byte(str, bs + 3) 380 | if not c4 or c4 < 128 or c4 > 191 then 381 | return bs + 4, bs, 4, c4 382 | end 383 | 384 | return bs + 4 385 | else -- c > 245 386 | return bs + 1, bs, 1, c 387 | end 388 | end 389 | 390 | local function utf8validate(str, byte_pos) 391 | local result = {} 392 | for nbs, bs, part, code in utf8validator, str, byte_pos do 393 | if bs then 394 | result[#result + 1] = { pos = bs, part = part, code = code } 395 | end 396 | end 397 | return #result == 0, result 398 | end 399 | 400 | local function utf8codes(str) 401 | local max_len = #str 402 | local bs = 1 403 | return function(skip_ptr) 404 | if bs > max_len then return nil end 405 | local pbs = bs 406 | bs = utf8next(str, pbs) 407 | 408 | return pbs, utf8unicode(str, pbs, pbs), pbs 409 | end 410 | end 411 | 412 | 413 | --[[-- 414 | differs from Lua 5.3 utf8.offset in accepting any byte positions (not only head byte) for all n values 415 | 416 | h - head, c - continuation, t - tail 417 | hhhccthccthccthcthhh 418 | ^ start byte pos 419 | searching current charracter head by moving backwards 420 | hhhccthccthccthcthhh 421 | ^ head 422 | 423 | n == 0: current position 424 | n > 0: n jumps forward 425 | n < 0: n more scans backwards 426 | --]]-- 427 | local function utf8offset(str, n, bs) 428 | local l = #str 429 | if not bs then 430 | if n < 0 then 431 | bs = l + 1 432 | else 433 | bs = 1 434 | end 435 | end 436 | if bs <= 0 or bs > l + 1 then 437 | error("bad argument #3 to 'offset' (position out of range)") 438 | end 439 | 440 | if n == 0 then 441 | if bs == l + 1 then 442 | return bs 443 | end 444 | while true do 445 | local b = byte(str, bs) 446 | if (0 < b and b < 127) 447 | or (194 < b and b < 244) then 448 | return bs 449 | end 450 | bs = bs - 1 451 | if bs < 1 then 452 | return 453 | end 454 | end 455 | elseif n < 0 then 456 | bs = bs - 1 457 | repeat 458 | if bs < 1 then 459 | return 460 | end 461 | 462 | local b = byte(str, bs) 463 | if (0 < b and b < 127) 464 | or (194 < b and b < 244) then 465 | n = n + 1 466 | end 467 | bs = bs - 1 468 | until n == 0 469 | return bs + 1 470 | else 471 | while true do 472 | if bs > l then 473 | return 474 | end 475 | 476 | local b = byte(str, bs) 477 | if (0 < b and b < 127) 478 | or (194 < b and b < 244) then 479 | n = n - 1 480 | for i = 1, n do 481 | if bs > l then 482 | return 483 | end 484 | bs = utf8next(str, bs) 485 | end 486 | return bs 487 | end 488 | bs = bs - 1 489 | end 490 | end 491 | 492 | end 493 | 494 | local function utf8replace (s, mapping) 495 | if type(s) ~= "string" then 496 | error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")") 497 | end 498 | if type(mapping) ~= "table" then 499 | error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")") 500 | end 501 | local result = utf8.raw.gsub( s, utf8charpattern, mapping ) 502 | return result 503 | end 504 | 505 | local function utf8upper (s) 506 | return utf8replace(s, utf8.config.conversion.lc_uc) 507 | end 508 | 509 | if utf8.config.conversion.lc_uc then 510 | upper = utf8upper 511 | end 512 | 513 | local function utf8lower (s) 514 | return utf8replace(s, utf8.config.conversion.uc_lc) 515 | end 516 | 517 | if utf8.config.conversion.uc_lc then 518 | lower = utf8lower 519 | end 520 | 521 | utf8.len = utf8len 522 | utf8.sub = utf8sub 523 | utf8.reverse = utf8reverse 524 | utf8.char = utf8char 525 | utf8.unicode = utf8unicode 526 | utf8.byte = utf8byte 527 | utf8.next = utf8next 528 | utf8.gensub = utf8gensub 529 | utf8.validator = utf8validator 530 | utf8.validate = utf8validate 531 | utf8.dump = dump 532 | utf8.format = format 533 | utf8.lower = lower 534 | utf8.upper = upper 535 | utf8.rep = rep 536 | utf8.raw = {} 537 | for k,v in pairs(string) do 538 | utf8.raw[k] = v 539 | end 540 | 541 | utf8.charpattern = utf8charpattern 542 | utf8.offset = utf8offset 543 | if _VERSION == 'Lua 5.3' then 544 | local utf8_53 = require "utf8" 545 | utf8.codes = utf8_53.codes 546 | utf8.codepoint = utf8_53.codepoint 547 | utf8.len53 = utf8_53.len 548 | else 549 | utf8.codes = utf8codes 550 | utf8.codepoint = utf8unicode 551 | end 552 | 553 | return utf8 554 | 555 | end 556 | -------------------------------------------------------------------------------- /autoload/AL.Persian Toolkit.lua: -------------------------------------------------------------------------------- 1 | -- Special thanks to Majid110 for inspiring us the great feature of RTL Editor. 2 | -- https://github.com/Majid110/MasafAutomation 3 | -- Special thanks to lyger for writing the base of an excelent splitter 4 | -- https://github.com/lyger/Aegisub_automation_scripts 5 | 6 | -- Authers of each section: 7 | -- PakNevis: SSgumS 8 | -- Extend Move: SSgumS 9 | -- RTL: Shinsekai_Yuri & SSgumS 10 | -- Un-RTL: Shinsekai_Yuri & SSgumS 11 | -- Unretard: SSgumS & MD 12 | -- RTL Editor: Majid Shamkhani (Edited by SSgumS) 13 | -- Split at Tags: SSgumS (based on lyger's Split at Tags) 14 | 15 | ----- Global Dependencies ----- 16 | include('karaskel.lua') 17 | 18 | local utf8 = require 'AL.utf8':init() 19 | local re = require 'aegisub.re' 20 | 21 | ----- Script Info ----- 22 | script_name = 'AnimeList Persian Toolkit' 23 | script_description = 'A toolkit for easier persian fansubbing.' 24 | script_author = 'AnimeList Team' 25 | script_version = '1.3.1' 26 | 27 | ----- Script Names ----- 28 | local paknevis_script_name = 'AL Persian Toolkit/PakNevis' 29 | local extend_move_script_name = 'AL Persian Toolkit/Extend Move' 30 | local rtl_script_name = 'AL Persian Toolkit/RTL/RTL' 31 | local unrtl_script_name = 'AL Persian Toolkit/RTL/Un-RTL' 32 | local unretard_script_name = 'AL Persian Toolkit/Unretard' 33 | local rtleditor_script_name = 'AL Persian Toolkit/RTL Editor' 34 | local split_at_tags_script_name = 'AL Persian Toolkit/Split/Split at Tags' 35 | local split_at_spaces_script_name = 'AL Persian Toolkit/Split/Split at Spaces' 36 | local reverse_split_at_tags_script_name = 'AL Persian Toolkit/Split/Reverse + Split (at Tags)' 37 | local reverse_at_tags_script_name = 'AL Persian Toolkit/Split/Reverse at Tags' 38 | 39 | ----- Global Variables ---- 40 | RLE = utf8.char(0x202B) 41 | subtitles = nil 42 | 43 | ----- Global Functions ----- 44 | local function removeRleChars(text) 45 | text = re.sub(text, RLE, "") 46 | return text 47 | end 48 | 49 | local function unrtl(text) 50 | text, _ = re.sub(text, "^((?:\\{.*?\\})*)" .. RLE, "\\1") 51 | text, _ = re.sub(text, "(\\\\[Nn])((?:\\{.*?\\})*)" .. RLE, "\\1\\2") 52 | return text 53 | end 54 | 55 | local function rtl(text) 56 | text = unrtl(text) 57 | text, _ = re.sub(text, "^((?:\\{.*?\\})*)", "\\1" .. RLE) 58 | text, _ = re.sub(text, "(\\\\[Nn])((?:\\{.*?\\})*)", "\\1\\2" .. RLE) 59 | return text 60 | end 61 | 62 | local function serializeTable(val, name, skipnewlines, depth) 63 | skipnewlines = skipnewlines or false 64 | depth = depth or 0 65 | 66 | local tmp = string.rep(" ", depth) 67 | 68 | if name then tmp = tmp .. name .. " = " end 69 | 70 | if type(val) == "table" then 71 | tmp = tmp .. "{" .. (not skipnewlines and "\n" or "") 72 | 73 | for k, v in pairs(val) do 74 | tmp = tmp .. serializeTable(v, k, skipnewlines, depth + 1) .. "," .. (not skipnewlines and "\n" or "") 75 | end 76 | 77 | tmp = tmp .. string.rep(" ", depth) .. "}" 78 | elseif type(val) == "number" then 79 | tmp = tmp .. tostring(val) 80 | elseif type(val) == "string" then 81 | tmp = tmp .. string.format("%q", val) 82 | elseif type(val) == "boolean" then 83 | tmp = tmp .. (val and "true" or "false") 84 | else 85 | tmp = tmp .. "\"[inserializeable datatype:" .. type(val) .. "]\"" 86 | end 87 | 88 | return tmp 89 | end 90 | 91 | local function has_value(tab, val) 92 | for index, value in ipairs(tab) do 93 | if value == val then 94 | return true 95 | end 96 | end 97 | 98 | return false 99 | end 100 | 101 | local function difference(a, b) 102 | local aa = {} 103 | for k, v in pairs(a) do aa[k] = v end 104 | for k, v in pairs(b) do 105 | if aa[k] == v then 106 | aa[k] = nil 107 | end 108 | end 109 | local ret = {} 110 | for k, v in pairs(aa) do -- skips nil 111 | ret[k] = v 112 | end 113 | return ret 114 | end 115 | 116 | -- expand to table of tag-text 117 | local function expand(text) 118 | local result = {} 119 | 120 | local firstPart = re.match(text, "^([^{].*?)(?:\\{|$)") 121 | if firstPart ~= nil then 122 | table.insert(result, { tag = "", text = firstPart[2].str }) 123 | end 124 | 125 | for f in re.gfind(text, "(\\{.*?\\})([^{]*)") do 126 | local m = re.match(f, "(\\{.*?\\})([^{]*)") 127 | if m[2] == nil then m[2] = { str = "" } end 128 | if m[3] == nil then m[3] = { str = "" } end 129 | table.insert(result, { tag = m[2].str, text = m[3].str }) 130 | end 131 | 132 | return result 133 | end 134 | 135 | -- source: https://github.com/unanimated/luaegisub/blob/master/ua.Relocator.lua#L2555 136 | local function round(n, dec) 137 | dec = dec or 0 138 | n = math.floor(n * 10 ^ dec + 0.5) / 10 ^ dec 139 | return n 140 | end 141 | 142 | ----- PakNevis ----- 143 | function PakNevis(subtitles, selected_lines, active_line) 144 | -- local translation_src = ' كي“”0123456789?⸮,’‘ﺑﺗﺛﺟﺣﺧﺳﺷﺻﺿﻃﻇﻋﻏﻓﻗﻛﻟﻣﻧﻫﻳﺋﺍﺏﺕﺙﺝﺡﺥﺩﺫﺭﺯﺱﺵﺹﺽﻁﻅﻉﻍﻑﻕﻙﻝﻡﻥﻩﻭﻱﺁﺃﺅﺇﺉˈﯿٱھ《》' 145 | -- local translation_dst = ' کی""۰۱۲۳۴۵۶۷۸۹؟؟،\'\'بتثجحخسشصضطظعغفقکلمنهیئابتثجحخدذرزسشصضطظعغفقکلمنهویآأؤإئ\'یاه«»' 146 | local persian_alphabets = 'ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی' 147 | local persian_digits = '۰۱۲۳۴۵۶۷۸۹' 148 | local english_digits = '0123456789' 149 | local punc_after = '%.:!،؛؟»%]%)' 150 | local punc_before = '«%[%(' 151 | 152 | for z, i in ipairs(selected_lines) do 153 | local line = subtitles[i] 154 | -- translation 155 | -- for j = 0, translation_src:len() do 156 | -- line.text = utf8.gsub(line.text, '(?!{)(?=[^}])*'..utf8.sub(translation_src, j, j)..'', utf8.sub(translation_dst, j, j)) 157 | -- end 158 | -- line.text = utf8.gsub(line.text, '%%', '٪') 159 | -- character refinement patterns 160 | line.text = utf8.gsub(line.text, ' +', ' ') -- remove extra spaces 161 | line.text = utf8.gsub(line.text, '‌+', '‌') -- remove extra zwnj 162 | line.text = utf8.gsub(line.text, '"([^"]+)"', '«%1»') -- replace quotation with gyoome 163 | line.text = utf8.gsub(line.text, 'ﻻ', 'لا') -- replace لا 164 | line.text = utf8.gsub(line.text, ': ', ': ') -- replace full-width colon 165 | line.text = utf8.gsub(line.text, ':', ': ') -- replace full-width colon 166 | line.text = utf8.gsub(line.text, '-+', '-') -- remove extra - 167 | -- line.text = utf8.gsub(line.text, '-(\\[Nn])', '–%1') -- replace ending - with – 168 | -- line.text = utf8.gsub(line.text, '-$', '–') -- replace ending - with – 169 | -- punctuation spacing patterns 170 | line.text = utf8.gsub(line.text, ' ([' .. punc_after .. '])', '%1') -- remove space before 171 | line.text = utf8.gsub(line.text, '([' .. punc_before .. ']) ', '%1') -- remove space after 172 | line.text = utf8.gsub(line.text, '([^%d' .. persian_digits .. ']%.)([^ ' .. punc_after .. '])', '%1 %2') -- put space after . 173 | line.text = utf8.gsub(line.text, '([%d' .. persian_digits .. ']%.)([^ %d' .. persian_digits .. punc_after .. '])' 174 | , '%1 %2') -- put space after . 175 | line.text = utf8.gsub(line.text, '([' .. punc_after:sub(3) .. '])([^ ' .. punc_after .. '])', '%1 %2') -- put space after 176 | line.text = utf8.gsub(line.text, '([^ ' .. punc_before .. '])([' .. punc_before .. '])', '%1 %2') -- put space before 177 | -- affix spacing patterns 178 | line.text = utf8.gsub(line.text, '([^ ]ه) ی ', '%1‌ی ') -- fix ی space 179 | line.text = utf8.gsub(line.text, ' (ن?می) ', ' %1‌') -- put zwnj after می, نمی 180 | line.text = utf8.gsub(line.text, '^(ن?می) ', '%1‌') -- put zwnj after می, نمی 181 | line.text = utf8.gsub(line.text, 182 | '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (های?)([^' .. persian_alphabets .. '])', 183 | '%1‌%2%3') -- put zwnj before تر, تری, ترین, گر, گری, ها, های 184 | line.text = utf8.gsub(line.text, 185 | '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (گری?)([^' .. persian_alphabets .. '])', 186 | '%1‌%2%3') -- put zwnj before تر, تری, ترین, گر, گری, ها, های 187 | line.text = utf8.gsub(line.text, 188 | '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (تری?ن?)([^' .. persian_alphabets .. '])', 189 | '%1‌%2%3') -- put zwnj before تر, تری, ترین, گر, گری, ها, های 190 | line.text = utf8.gsub(line.text, '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (های?)$', 191 | '%1‌%2') -- put zwnj before تر, تری, ترین, گر, گری, ها, های 192 | line.text = utf8.gsub(line.text, '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (گری?)$', 193 | '%1‌%2') -- put zwnj before تر, تری, ترین, گر, گری, ها, های 194 | line.text = utf8.gsub(line.text, '([' .. persian_alphabets .. '][' .. persian_alphabets .. ']) (تری?ن?)$', 195 | '%1‌%2') -- put zwnj before تر, تری, ترین, گر, گری, ها, های 196 | line.text = utf8.gsub(line.text, '([^ ]ه) (ا[میشنت][مد]?)([^' .. persian_alphabets .. '])', '%1‌%2%3') -- join ام, ایم, اش, اند, ای, اید, ات 197 | line.text = utf8.gsub(line.text, '([^ ]ه) (ا[میشنت][مد]?)$', '%1‌%2') -- join ام, ایم, اش, اند, ای, اید, ات 198 | subtitles[i] = line 199 | end 200 | aegisub.set_undo_point(paknevis_script_name) 201 | end 202 | 203 | ----- Unretard ----- 204 | function Unretard(subtitles, selected_lines, active_line) 205 | local ending_punc = '%.:!،«%[%(- ' 206 | local starting_punc = '»%]%)- ' 207 | 208 | local function replace(original_text, text, search_pattern, replace_pattern) 209 | local match = utf8.gmatch 210 | if '^' == string.sub(search_pattern, 0, 1) then 211 | match = function(str, pattern) 212 | return function() 213 | return utf8.match(str, pattern) 214 | end 215 | end 216 | end 217 | for m in match(original_text, search_pattern) do 218 | local puncs = utf8.reverse(m) 219 | puncs = utf8.gsub(puncs, '«', 't2') 220 | puncs = utf8.gsub(puncs, '»', 't1') 221 | puncs = utf8.gsub(puncs, 't2', '»') 222 | puncs = utf8.gsub(puncs, 't1', '«') 223 | puncs = utf8.gsub(puncs, '%(', 't2') 224 | puncs = utf8.gsub(puncs, '%)', 't1') 225 | puncs = utf8.gsub(puncs, 't2', '%)') 226 | puncs = utf8.gsub(puncs, 't1', '%(') 227 | puncs = utf8.gsub(puncs, '%[', 't2') 228 | puncs = utf8.gsub(puncs, '%]', 't1') 229 | puncs = utf8.gsub(puncs, 't2', '%]') 230 | puncs = utf8.gsub(puncs, 't1', '%[') 231 | text = utf8.gsub(text, replace_pattern, puncs, 1) 232 | if '^' == string.sub(search_pattern, 0, 1) then 233 | break 234 | end 235 | end 236 | return text 237 | end 238 | 239 | for z, i in ipairs(selected_lines) do 240 | local line = subtitles[i] 241 | 242 | -- trim 243 | line.text = utf8.gsub(line.text, '^ *([^\\]+) *$', '%1') 244 | line.text = utf8.gsub(line.text, '^ *([^\\]+) *(\\[Nn])', '%1%2') 245 | line.text = utf8.gsub(line.text, '^(\\[Nn]) *([^\\]+) *(\\[Nn])', '%1%2%3') 246 | line.text = utf8.gsub(line.text, '^(\\[Nn]) *([^\\]+) *$', '%1%2') 247 | 248 | if utf8.match(line.text, '%{') == nil then 249 | -- unretard 250 | -- find 251 | local linetext_copy = line.text 252 | line.text = utf8.gsub(line.text, '^([' .. ending_punc .. ']+)([^\\]+)$', '%2gce') -- ending puncs 253 | line.text = utf8.gsub(line.text, '^([' .. ending_punc .. ']+)([^\\]+)(\\[Nn])', '%2gce%3') -- ending puncs 254 | line.text = utf8.gsub(line.text, '(\\[Nn])([' .. ending_punc .. ']+)([^\\]+)(\\[Nn])', '%1%3gce%4') -- ending puncs 255 | line.text = utf8.gsub(line.text, '(\\[Nn])([' .. ending_punc .. ']+)([^\\]+)$', '%1%3gce') -- ending puncs 256 | line.text = utf8.gsub(line.text, '^([^\\]+[^' .. starting_punc .. '])([' .. starting_punc .. ']+)(g?c?e?)$', 257 | 'gcs%1%3') -- starting puncs 258 | line.text = utf8.gsub(line.text, '^([^\\]+[^' .. starting_punc .. 259 | '])([' .. starting_punc .. ']+)(g?c?e?)(\\[Nn])', 'gcs%1%3%4') -- starting puncs 260 | line.text = utf8.gsub(line.text, 261 | '(\\[Nn])([^\\]+[^' .. starting_punc .. '])([' .. starting_punc .. ']+)(g?c?e?)(\\[Nn])', '%1gcs%2%4%5') -- starting puncs 262 | line.text = utf8.gsub(line.text, '(\\[Nn])([^\\]+[^' .. starting_punc .. 263 | '])([' .. starting_punc .. ']+)(g?c?e?)$', '%1gcs%2%3') -- starting puncs 264 | -- replace 265 | line.text = replace(linetext_copy, line.text, '^([' .. ending_punc .. ']+)[^\\]+$', 'gce') 266 | line.text = replace(linetext_copy, line.text, '^([' .. ending_punc .. ']+)[^\\]+\\[Nn]', 'gce') 267 | line.text = replace(linetext_copy, line.text, '\\[Nn]([' .. ending_punc .. ']+)[^\\]+\\[Nn]', 'gce') 268 | line.text = replace(linetext_copy, line.text, '\\[Nn]([' .. ending_punc .. ']+)[^\\]+$', 'gce') 269 | line.text = replace(linetext_copy, line.text, '^[^\\]+[^' .. starting_punc .. ']([' .. starting_punc .. 270 | ']+)$', 'gcs') 271 | line.text = replace(linetext_copy, line.text, '^[^\\]+[^' .. starting_punc .. 272 | ']([' .. starting_punc .. ']+)\\[Nn]', 'gcs') 273 | line.text = replace(linetext_copy, line.text, 274 | '\\[Nn][^\\]+[^' .. starting_punc .. ']([' .. starting_punc .. ']+)\\[Nn]', 'gcs') 275 | line.text = replace(linetext_copy, line.text, '\\[Nn][^\\]+[^' .. starting_punc .. 276 | ']([' .. starting_punc .. ']+)$', 'gcs') 277 | end 278 | 279 | subtitles[i] = line 280 | end 281 | aegisub.set_undo_point(unretard_script_name) 282 | end 283 | 284 | ----- RTL ----- 285 | function Rtl(subtitles, selected_lines, active_line) 286 | for z, i in ipairs(selected_lines) do 287 | local l = subtitles[i] 288 | 289 | l.text = rtl(l.text) 290 | 291 | subtitles[i] = l 292 | end 293 | aegisub.set_undo_point(rtl_script_name) 294 | end 295 | 296 | ----- Un-RTL ----- 297 | function Unrtl(subtitles, selected_lines, active_line) 298 | for z, i in ipairs(selected_lines) do 299 | local line = subtitles[i] 300 | 301 | line.text = unrtl(line.text) 302 | 303 | subtitles[i] = line 304 | end 305 | aegisub.set_undo_point(unrtl_script_name) 306 | end 307 | 308 | ----- RTL Editor ----- 309 | local editor_btn = { 310 | Ok = 1, 311 | OkWORtl = 2, 312 | Cancel = 3, 313 | } 314 | 315 | local function openEditor(str) 316 | local btns = { "OK", "OK w/o RTL", "Cancel" } 317 | 318 | local btn_switch_case = {} 319 | for key, value in pairs(btns) do 320 | btn_switch_case[value] = key 321 | end 322 | 323 | local config = { 324 | { class = "label", label = "Press Ctrl+Shift at the right side of your keyboard to switch to RTL mode.", x = 0, 325 | y = 0 }, 326 | { class = "textbox", name = "editor", value = str, x = 0, y = 1, width = 33, height = 11 } 327 | } 328 | local btn, result = aegisub.dialog.display(config, btns, { ok = "OK", cancel = "Cancel" }) 329 | if btn == true then btn = "OK" elseif btn == false then btn = "Cancel" end 330 | return btn_switch_case[btn], result.editor 331 | end 332 | 333 | function RtlEditor(subtitles, selected_lines) 334 | if #selected_lines > 1 then 335 | return 336 | end 337 | local line = subtitles[selected_lines[1]] 338 | 339 | local text = unrtl(line.text) 340 | text = utf8.gsub(text, "\\[Nn]", "\n") 341 | local btn, newText = openEditor(text) 342 | 343 | if btn == editor_btn.Cancel then 344 | return 345 | end 346 | newText = utf8.gsub(newText, "\n", "\\N") 347 | if btn == editor_btn.Ok then 348 | newText = rtl(newText) 349 | end 350 | line.text = newText 351 | 352 | subtitles[selected_lines[1]] = line 353 | 354 | aegisub.set_undo_point(rtleditor_script_name) 355 | end 356 | 357 | ----- Split at Tags ----- 358 | local Split = {} 359 | 360 | Split.puncs = '.:!،«[(»\\])\\- <>' 361 | Split.line_type_tags = { 362 | 'pos', 'move', 'clip', 'iclip', 'org', 'fade', 'fad', 'an', 'q' 363 | } 364 | Split.style_tags = { 365 | 'i', 'b', 'u', 's', 'bord', 'xbord', 'ybord', 'shad', 'xshad', 'yshad', 366 | 'fn', 'fs', 'fscx', 'fscy', 'fsp', 'fe', 'c', '1c', '2c', '3c', '4c', 367 | 'alpha', '1a', '2a', '3a', '4a', 'an', 'r', 'frz', 'fr' 368 | } 369 | Split.non_style_tags = { 370 | 'be', 'blur', 'frx', 'fry', 'fax', 'fay', 'k', 'K', 'kf', 'ko', 'q', 371 | 'pos', 'move', 'org', 'fad', 'fade', 't', 'clip', 'iclip', 'p', 'pbo' 372 | } 373 | Split.style_names_tags = { 374 | { 'fontname', 'fn' }, { 'fontsize', 'fs' }, 375 | { 'color1', '1c', '1a' }, { 'color2', '2c', '2a' }, { 'color3', '3c', '3a' }, { 'color4', '4c', '4a' }, 376 | { 'bold', 'b' }, { 'italic', 'i' }, { 'underline', 'u' }, { 'strikeout', 's' }, 377 | { 'scale_x', 'fscx' }, { 'scale_y', 'fscy' }, { 'spacing', 'fsp' }, { 'angle', 'frz' }, 378 | { 'outline', 'bord' }, { 'shadow', 'shad' }, { 'align', 'an' }, { 'encoding', 'fe' } 379 | } 380 | Split.simple_text_value_tags = { 381 | 'fn', 'alpha', '1a', '2a', '3a', '4a', 'c', '1c', '2c', '3c', '4c', 'r' 382 | } 383 | Split.boolean_style_fields = { 384 | 'bold', 'italic', 'underline', 'strikeout' 385 | } 386 | 387 | function Split:parse_style(styleref) 388 | local tags = {} 389 | -- extract Split.style_names_tags 390 | for i = 1, #Split.style_names_tags do 391 | local table = Split.style_names_tags[i] 392 | local style_name = table[1] 393 | local tag_name1 = table[2] 394 | local value = styleref[style_name] 395 | if re.match(style_name, 'color') ~= nil then 396 | tags[tag_name1] = re.sub(value, '&H..(.+)', '&H\\1') 397 | tags[table[3]] = re.match(value, '&H..')[1].str 398 | else 399 | if has_value(Split.boolean_style_fields, style_name) then 400 | if value then 401 | value = 1 402 | else 403 | value = 0 404 | end 405 | end 406 | tags[tag_name1] = value 407 | end 408 | end 409 | -- add other defaults 410 | tags['be'] = 0 411 | tags['blur'] = 0 412 | tags['frx'] = 0 413 | tags['fry'] = 0 414 | tags['fax'] = 0 415 | tags['fay'] = 0 416 | tags['pbo'] = 0 417 | return tags 418 | end 419 | 420 | function Split:parse_tags(tags, line_tags, current_appearance) -- TODO: add r support 421 | -- handle t tags 422 | local t_tags = {} 423 | for t in tags:gmatch("\\t%b()") do -- Thanks lyger! 424 | table.insert(t_tags, t) 425 | end 426 | tags = tags:gsub("\\t%b()", "") -- remove t tags 427 | if #t_tags > 0 then -- add to table 428 | current_appearance["t"] = t_tags 429 | end 430 | 431 | -- other tags 432 | for t in tags:gmatch("\\[^\\{}]*") do 433 | local tag, value = "", "" 434 | if t:match("\\fn") ~= nil then 435 | tag, value = t:match("\\(fn)(.*)") 436 | else 437 | tag, value = t:match("\\([1-4]?%a+)(%A.*)") 438 | end 439 | 440 | if tag == 'fr' then 441 | tag = 'frz' 442 | elseif tag == 'c' then 443 | tag = '1c' 444 | end 445 | 446 | -- add line tags to the appropriate list and others to appearance 447 | if has_value(Split.line_type_tags, tag) == true then 448 | if has_value(line_tags, tag) == false then 449 | if tag == 'q' or tag == 'an' then 450 | value = tonumber(value) 451 | end 452 | line_tags[tag] = value 453 | end 454 | else 455 | if has_value(Split.simple_text_value_tags, tag) == false then 456 | value = tonumber(value) 457 | end 458 | current_appearance[tag] = value 459 | end 460 | end 461 | end 462 | 463 | function Split:reverse(line) 464 | local line = util.copy(line) 465 | -- read in styles and meta 466 | local meta, styles = karaskel.collect_head(subtitles, false) 467 | 468 | karaskel.preproc_line(subtitles, meta, styles, line) 469 | 470 | -- clean tags and text 471 | line.text = re.sub(line.text, '}{', '') -- combine redundant back to back tag parts 472 | line.text = re.sub(line.text, '^ +', '') -- trim redundant spaces 473 | line.text = re.sub(line.text, '^({[^{}]*}) +', '\\1') 474 | line.text = re.sub(line.text, ' +$', '') 475 | 476 | -- make tags-text table 477 | local tag_text_table = expand(line.text) 478 | -- aegisub.log('Parts:\n'..serializeTable(tag_text_table)..'\n') 479 | 480 | -- reverse process 481 | local line_tags = {} 482 | line.text = '' 483 | -- extract default appearance 484 | local parsed_style = Split:parse_style(line.styleref) 485 | -- aegisub.log('Parsed Style:\n'..serializeTable(parsed_style)..'\n') 486 | local current_appearance = util.deep_copy(parsed_style) 487 | -- 1nd step (parse) 488 | for i, val in ipairs(tag_text_table) do 489 | -- parse tags 490 | Split:parse_tags(val.tag, line_tags, current_appearance) 491 | val.tag_list = util.deep_copy(current_appearance) 492 | end 493 | -- aegisub.log('New Parts:\n'..serializeTable(tag_text_table)..'\n') 494 | 495 | -- 2nd step (rebuild) 496 | local last_tag_list = parsed_style 497 | for i = #tag_text_table, 1, -1 do 498 | -- get diff and rebuild tags 499 | local val = tag_text_table[i] 500 | -- get diff 501 | -- aegisub.log('Tag List:\n'..serializeTable(val.tag_list)..'\n') 502 | -- aegisub.log('Last Tag List:\n'..serializeTable(last_tag_list)..'\n') 503 | local diff = difference(val.tag_list, last_tag_list) 504 | last_tag_list = val.tag_list 505 | -- aegisub.log('Diff:\n'..serializeTable(diff)..'\n') 506 | -- rebuild tags 507 | local rebuilt_tag = '{}' 508 | for tag, value in pairs(diff) do 509 | if tag == "t" then 510 | for _, t_tag in ipairs(value) do 511 | rebuilt_tag = rebuilt_tag:gsub("}", t_tag .. "}") 512 | end 513 | else 514 | rebuilt_tag = rebuilt_tag:gsub("{", "{\\" .. tag .. value) 515 | end 516 | end 517 | if i == #tag_text_table then 518 | for tag, value in pairs(line_tags) do 519 | rebuilt_tag = rebuilt_tag:gsub("{", "{\\" .. tag .. value) 520 | end 521 | end 522 | val.tag = rebuilt_tag 523 | 524 | -- flip spaces 525 | val.text, _ = re.sub(val.text, "^( *)(.*?)( *)$", "\\3\\2\\1") 526 | 527 | -- rebuild line 528 | line.text = line.text .. val.tag .. val.text 529 | end 530 | 531 | return line 532 | end 533 | 534 | function Split:splitAtTags(line) 535 | -- Convert float to neatly formatted string 536 | local function float2str(f) 537 | return string.format("%.3f", f):gsub("%.(%d-)0+$", "%.%1"):gsub("%.$", "") 538 | end 539 | 540 | -- Returns the position of a line 541 | local function get_pos(line) 542 | local _, _, posx, posy = line.text:find("\\pos%(([%d%.%-]*),([%d%.%-]*)%)") 543 | if posx == nil then 544 | _, _, posx, posy = line.text:find("\\move%(([%d%.%-]*),([%d%.%-]*),") 545 | if posx == nil then 546 | local _, _, align_n = line.text:find("\\an([%d%.%-]*)") 547 | if align_n == nil then 548 | local _, _, align_dumb = line.text:find("\\a([%d]+)") 549 | if align_dumb == nil then 550 | -- If the line has no alignment tags 551 | posx = line.x 552 | posy = line.y 553 | else 554 | -- If the line has the \a alignment tag 555 | local vid_x, vid_y = aegisub.video_size() 556 | align_dumb = tonumber(align_dumb) 557 | if align_dumb > 8 then 558 | posy = vid_y / 2 559 | elseif align_dumb > 4 then 560 | posy = line.eff_margin_t 561 | else 562 | posy = vid_y - line.eff_margin_b 563 | end 564 | local _temp = align_dumb % 4 565 | if _temp == 1 then 566 | posx = line.eff_margin_l 567 | elseif _temp == 2 then 568 | posx = line.eff_margin_l + 569 | (vid_x - line.eff_margin_l - 570 | line.eff_margin_r) / 2 571 | else 572 | posx = vid_x - line.eff_margin_r 573 | end 574 | end 575 | else 576 | -- If the line has the \an alignment tag 577 | local vid_x, vid_y = aegisub.video_size() 578 | align_n = tonumber(align_n) 579 | local _temp = align_n % 3 580 | if align_n > 6 then 581 | posy = line.eff_margin_t 582 | elseif align_n > 3 then 583 | posy = vid_y / 2 584 | else 585 | posy = vid_y - line.eff_margin_b 586 | end 587 | if _temp == 1 then 588 | posx = line.eff_margin_l 589 | elseif _temp == 2 then 590 | posx = line.eff_margin_l + 591 | (vid_x - line.eff_margin_l - line.eff_margin_r) / 592 | 2 593 | else 594 | posx = vid_x - line.eff_margin_r 595 | end 596 | end 597 | end 598 | end 599 | return tonumber(posx), tonumber(posy) 600 | end 601 | 602 | -- Returns the origin of a line 603 | local function get_org(line) 604 | local _, _, orgx, orgy = line.text:find("\\org%(([%d%.%-]*),([%d%.%-]*)%)") 605 | if orgx == nil then return get_pos(line) end 606 | return tonumber(orgx), tonumber(orgy) 607 | end 608 | 609 | -- Returns a table of tag-value pairs 610 | -- Supports fn but ignores r because fuck r 611 | local function full_state_subtable(tag) 612 | -- Store time tags in their own table, so they don't interfere 613 | local time_tags = {} 614 | for ttag in tag:gmatch("\\t%b()") do table.insert(time_tags, ttag) end 615 | 616 | -- Remove time tags from the string so we don't have to deal with them 617 | tag = tag:gsub("\\t%b()", "") 618 | 619 | local state_subtable = {} 620 | 621 | for t in tag:gmatch("\\[^\\{}]*") do 622 | local ttag, tparam = "", "" 623 | if t:match("\\fn") ~= nil then 624 | ttag, tparam = t:match("\\(fn)(.*)") 625 | else 626 | ttag, tparam = t:match("\\([1-4]?%a+)(%A.*)") 627 | end 628 | state_subtable[ttag] = tparam 629 | end 630 | 631 | -- Dump the time tags back in 632 | if #time_tags > 0 then state_subtable["t"] = time_tags end 633 | 634 | return state_subtable 635 | end 636 | 637 | local splits = {} 638 | local line = util.copy(line) 639 | 640 | -- clean tags and text 641 | line.text = re.sub(line.text, '}{', '') -- combine redundant back to back tag parts 642 | line.text = re.sub(line.text, '^ +', '') -- trim redundant spaces 643 | line.text = re.sub(line.text, '^({[^{}]*}) +', '\\1') 644 | line.text = re.sub(line.text, ' +$', '') 645 | 646 | -- Read in styles and meta 647 | local meta, styles = karaskel.collect_head(subtitles, false) 648 | 649 | -- Preprocess 650 | karaskel.preproc_line(subtitles, meta, styles, line) 651 | 652 | -- Get position and origin 653 | local px, py = get_pos(line) 654 | local ox, oy = get_org(line) 655 | 656 | -- If there are rotations in the line, then write the origin 657 | local do_org = false 658 | 659 | if line.text:match("\\fr[xyz]") ~= nil then do_org = true end 660 | 661 | -- Turn all \Ns into the newline character 662 | -- line.text=line.text:gsub("\\N","\n") 663 | 664 | -- Make sure any newline followed by a non-newline character has a tag afterwards 665 | -- (i.e. force breaks at newlines) 666 | -- line.text=line.text:gsub("\n([^\n{])","\n{}%1") 667 | 668 | -- Make line table 669 | local line_table = expand(line.text) 670 | local lines_added = 0 671 | local line_table_copy = util.copy(line_table) 672 | for i, e in ipairs(line_table_copy) do 673 | local m = re.match(e.text, "^( *)(.*?)( *)$") 674 | 675 | if m[2].str ~= "" then 676 | table.insert(line_table, i + lines_added, { tag = e.tag, text = rtl(m[2].str) }) 677 | lines_added = lines_added + 1 678 | end 679 | e.text = rtl(m[3].str) 680 | if m[4].str ~= "" then 681 | table.insert(line_table, i + lines_added + 1, { tag = e.tag, text = rtl(m[4].str) }) 682 | lines_added = lines_added + 1 683 | end 684 | end 685 | 686 | -- Stores current state of the line as style table 687 | local current_style = util.deep_copy(line.styleref) 688 | 689 | -- Stores the width of each section 690 | local substr_data = {} 691 | 692 | -- Total width of the line 693 | local cum_width = 0 694 | -- Total height of the line 695 | -- cum_height=0 696 | -- Stores the various cumulative widths for each linebreak 697 | -- subs_width={} 698 | -- subs_index=1 699 | 700 | -- First pass to collect size data 701 | for i, val in ipairs(line_table) do 702 | -- Create state subtable 703 | local subtable = full_state_subtable(val.tag) 704 | 705 | -- Fix style tables to reflect override tags 706 | current_style.fontname = subtable["fn"] or current_style.fontname 707 | current_style.fontsize = tonumber(subtable["fs"]) or 708 | current_style.fontsize 709 | current_style.scale_x = tonumber(subtable["fscx"]) or 710 | current_style.scale_x 711 | current_style.scale_y = tonumber(subtable["fscy"]) or 712 | current_style.scale_y 713 | current_style.spacing = tonumber(subtable["fsp"]) or 714 | current_style.spacing 715 | current_style.align = tonumber(subtable["an"]) or 716 | current_style.align 717 | if subtable["b"] ~= nil then 718 | if subtable["b"] == "1" then 719 | current_style.bold = true 720 | else 721 | current_style.bold = false 722 | end 723 | end 724 | if subtable["i"] ~= nil then 725 | if subtable["i"] == "1" then 726 | current_style.italic = true 727 | else 728 | current_style.italic = false 729 | end 730 | end 731 | if subtable["a"] ~= nil then 732 | local dumbalign = tonumber(subtable["a"]) 733 | local halign = dumbalign % 4 734 | local valign = 0 735 | if dumbalign > 8 then 736 | valign = 3 737 | elseif dumbalign > 4 then 738 | valign = 6 739 | end 740 | current_style.align = valign + halign 741 | end 742 | 743 | -- Store this style table 744 | val.style = util.deep_copy(current_style) 745 | 746 | -- Get extents of the section. _sdesc is not used 747 | -- Temporarily remove all newlines first 748 | local swidth, sheight, _sdesc, sext = 749 | aegisub.text_extents(current_style, val.text:gsub("\n", "")) 750 | 751 | -- aegisub.log("Text: %s\n--w: %.3f\n--h: %.3f\n--d: %.3f\n--el: %.3f\n\n", 752 | -- val.text, swidth, sheight, _sdesc, sext) 753 | 754 | -- Add to cumulative width 755 | cum_width = cum_width + swidth 756 | 757 | -- Total height of the line 758 | local theight = 0 759 | 760 | -- Handle tasks for a line that has a newline 761 | --[[if val.text:match("\n")~=nil then 762 | --Add sheight for each newline, if any 763 | for nl in val.text:gmatch("\n") do 764 | theight=theight+sheight 765 | end 766 | 767 | --Add the external lead to account for the line of normal text 768 | --theight=theight+sext 769 | 770 | --Store the current cumulative width and reset it to zero 771 | subs_width[subs_index]=cum_width 772 | subs_index=subs_index+1 773 | cum_width=0 774 | 775 | --Add to cumulative height 776 | cum_height=cum_height+theight 777 | else 778 | theight=sheight+sext 779 | end]] -- 780 | 781 | -- Add data to data table 782 | table.insert(substr_data, { 783 | ["width"] = swidth, 784 | ["height"] = theight, 785 | ["subtable"] = subtable 786 | }) 787 | 788 | end 789 | 790 | -- Store the last cumulative width 791 | -- subs_width[subs_index]=cum_width 792 | 793 | -- Add the last cumulative height 794 | -- cum_height=cum_height+substr_data[#substr_data].height 795 | 796 | -- Stores current state of the line as a state subtable 797 | local current_subtable = {} 798 | --[[current_subtable=shallow_copy(substr_data[1].subtable) 799 | if current_subtable["t"]~=nil then 800 | current_subtable["t"]=shallow_copy(substr_data[1].subtable["t"]) 801 | end]] 802 | 803 | -- How far to offset the x coordinate 804 | local xoffset = 0 805 | 806 | -- How far to offset the y coordinate 807 | -- yoffset=0 808 | 809 | -- Newline index 810 | -- nindex=1 811 | 812 | -- Ways of calculating the new x position 813 | local xpos_func = {} 814 | -- Left aligned 815 | xpos_func[1] = function(w) return px + xoffset end 816 | -- Center aligned 817 | xpos_func[2] = function(w) 818 | return px - cum_width / 2 + xoffset + w / 2 819 | end 820 | -- Right aligned 821 | xpos_func[0] = function(w) return px - cum_width + xoffset + w end 822 | 823 | -- Ways of calculating the new y position 824 | --[[ypos_func={} 825 | --Bottom aligned 826 | ypos_func[1]=function(h) 827 | return py-cum_height+yoffset+h 828 | end 829 | --Middle aligned 830 | ypos_func[2]=function(h) 831 | return py-cum_height/2+yoffset+w/2 832 | end 833 | --Top aligned 834 | ypos_func[3]=function(h) 835 | return py+yoffset 836 | end]] -- 837 | 838 | -- Second pass to generate lines 839 | for i, val in ipairs(line_table) do 840 | -- Here's where the action happens 841 | local new_line = util.copy(line) 842 | 843 | -- Fix state table to reflect current state 844 | for tag, param in pairs(substr_data[i].subtable) do 845 | if tag == "t" then 846 | if current_subtable["t"] == nil then 847 | current_subtable["t"] = util.copy(param) 848 | else 849 | -- current_subtable["t"]={unpack(current_subtable["t"]),unpack(param)} 850 | for _, subval in ipairs(param) do 851 | table.insert(current_subtable["t"], subval) 852 | end 853 | end 854 | else 855 | current_subtable[tag] = param 856 | end 857 | end 858 | 859 | -- Figure out where the new x and y coords should be 860 | local new_x = xpos_func[current_style.align % 3](substr_data[i].width) 861 | -- new_y=ypos_func[math.ceil(current_style.align/3)](substr_data[i].height) 862 | 863 | -- Check if the text ends in whitespace 864 | -- local wsp = val.text:gsub("\n", ""):match("%s+$") 865 | 866 | -- Modify positioning accordingly 867 | -- if wsp ~= nil then 868 | -- local wsp_width = aegisub.text_extents(val.style, wsp) 869 | -- if current_style.align % 3 == 2 then 870 | -- new_x = new_x - wsp_width / 2 871 | -- elseif current_style.align % 3 == 0 then 872 | -- new_x = new_x - wsp_width 873 | -- end 874 | -- end 875 | 876 | -- Increase x offset 877 | xoffset = xoffset + substr_data[i].width 878 | 879 | -- Handle what happens in the line contains newlines 880 | --[[if val.text:match("\n")~=nil then 881 | --Increase index and reset x offset 882 | nindex=nindex+1 883 | xoffset=0 884 | --Increase y offset 885 | yoffset=yoffset+substr_data[i].height 886 | 887 | --Remove the last newline and convert back to \N 888 | val.text=val.text:gsub("\n$","") 889 | val.text=val.text:gsub("\n","\\N") 890 | end]] -- 891 | 892 | -- Start rebuilding text 893 | local rebuilt_tag = string.format("{\\pos(%s,%s)}", float2str(new_x), 894 | float2str(py)) 895 | 896 | -- Add the remaining tags 897 | for tag, param in pairs(current_subtable) do 898 | if tag == "t" then 899 | for k, ttime in ipairs(param) do 900 | rebuilt_tag = rebuilt_tag:gsub("}", ttime .. "}") 901 | end 902 | elseif tag ~= "pos" and tag ~= "org" then 903 | rebuilt_tag = rebuilt_tag:gsub("{", "{\\" .. tag .. param) 904 | end 905 | end 906 | 907 | if do_org then 908 | rebuilt_tag = rebuilt_tag:gsub("{", string.format( 909 | "{\\org(%s,%s)", 910 | float2str(ox), float2str(oy))) 911 | end 912 | 913 | -- reverse back text 914 | -- local match = re.match(val.text, '^(['..Split.puncs..']*)(.*[^'..Split.puncs..'])(['..Split.puncs..']*)$') 915 | -- aegisub.log('Matched Text 2:\n'..serializeTable(match)..'\n') 916 | -- if match then 917 | -- val.text = utf8.reverse(match[4].str)..match[3].str..utf8.reverse(match[2].str) 918 | -- end 919 | 920 | -- clean text 921 | val.text = re.sub(val.text, '^ +', '') -- trim redundant spaces 922 | val.text = re.sub(val.text, ' +$', '') 923 | val.text = re.sub(val.text, '^[' .. RLE .. ' ]+$', '') 924 | 925 | new_line.text = rebuilt_tag .. val.text 926 | 927 | -- Insert the new line 928 | if val.text ~= "" then 929 | table.insert(splits, 1, new_line) 930 | end 931 | end 932 | 933 | return splits 934 | end 935 | 936 | function Split:splitAtTagsWreverse(line) 937 | local result = {} 938 | local line = util.copy(line) 939 | result.reverse = Split:reverse(line) 940 | result.splits = Split:splitAtTags(result.reverse) 941 | return result 942 | end 943 | 944 | ----- Split at Tags ----- 945 | function SplitAtTags(subtitles, selected_lines, active_line) 946 | _G.subtitles = subtitles 947 | 948 | local lines_added = 0 949 | for i, n in ipairs(selected_lines) do 950 | local line = subtitles[n + lines_added] 951 | 952 | local result = Split:splitAtTagsWreverse(line); 953 | 954 | line.comment = true 955 | subtitles[n + lines_added] = line 956 | for _, l in ipairs(result.splits) do 957 | subtitles.insert(n + lines_added + 1, l) 958 | lines_added = lines_added + 1 959 | end 960 | end 961 | 962 | aegisub.set_undo_point(split_at_tags_script_name) 963 | end 964 | 965 | ----- Split at Spaces ----- 966 | function SplitAtSpaces(subtitles, selected_lines, active_line) 967 | _G.subtitles = subtitles 968 | 969 | local lines = {} 970 | 971 | -- add {} before spaces 972 | for i, n in ipairs(selected_lines) do 973 | local line = subtitles[n] 974 | local parts = expand(line.text) 975 | line.text = "" 976 | for _, p in ipairs(parts) do 977 | p.text, _ = re.sub(p.text, "( +)", "{}" .. RLE .. "\\1") 978 | line.text = line.text .. p.tag .. p.text 979 | end 980 | lines[i] = line 981 | end 982 | 983 | local lines_added = 0 984 | for i, line in ipairs(lines) do 985 | -- split at tags 986 | local result = Split:splitAtTagsWreverse(line) 987 | 988 | -- add lines 989 | local num = selected_lines[i] 990 | 991 | local l = subtitles[num + lines_added] 992 | l.comment = true 993 | subtitles[num + lines_added] = l 994 | 995 | for _, s in ipairs(result.splits) do 996 | subtitles.insert(num + lines_added + 1, s) 997 | lines_added = lines_added + 1 998 | end 999 | end 1000 | 1001 | aegisub.set_undo_point(split_at_spaces_script_name) 1002 | end 1003 | 1004 | ----- Reverse + Split (at Tags) ----- 1005 | function ReverseSplitAtTags(subtitles, selected_lines, active_line) 1006 | _G.subtitles = subtitles 1007 | 1008 | local lines_added = 0 1009 | for i, n in ipairs(selected_lines) do 1010 | local line = subtitles[n + lines_added] 1011 | 1012 | local result = Split:splitAtTags(line); 1013 | 1014 | line.comment = true 1015 | subtitles[n + lines_added] = line 1016 | for _, l in ipairs(result) do 1017 | subtitles.insert(n + lines_added + 1, l) 1018 | lines_added = lines_added + 1 1019 | end 1020 | end 1021 | 1022 | aegisub.set_undo_point(reverse_split_at_tags_script_name) 1023 | end 1024 | 1025 | ----- Reverse at Tags ----- 1026 | function ReverseAtTags(subtitles, selected_lines, active_line) 1027 | _G.subtitles = subtitles 1028 | 1029 | local lines_added = 0 1030 | for i, n in ipairs(selected_lines) do 1031 | local line = subtitles[n + lines_added] 1032 | local new_line = util.copy(line); 1033 | 1034 | new_line.text = unrtl(new_line.text); 1035 | local reverse = Split:reverse(new_line); 1036 | 1037 | line.comment = true 1038 | subtitles[n + lines_added] = line 1039 | subtitles.insert(n + lines_added + 1, reverse) 1040 | end 1041 | 1042 | aegisub.set_undo_point(reverse_at_tags_script_name) 1043 | end 1044 | 1045 | ----- Extend Move ----- 1046 | function ExtendMove(subtitles, selected_lines, active_line) 1047 | for _, i in ipairs(selected_lines) do 1048 | local line = subtitles[i] 1049 | 1050 | line.text = utf8.gsub(line.text, 1051 | "\\move%(([%d%.%-]*),([%d%.%-]*),([%d%.%-]*),([%d%.%-]*),([%d%.%-]*),([%d%.%-]*)%)", 1052 | function(x1, y1, x2, y2, t1, t2) 1053 | local f1 = aegisub.frame_from_ms(line.start_time + t1) 1054 | if f1 ~= nil then 1055 | t1 = aegisub.ms_from_frame(f1) 1056 | local f2 = aegisub.frame_from_ms(line.start_time + t2) 1057 | t2 = aegisub.ms_from_frame(f2) 1058 | end 1059 | local dt = t2 - t1 1060 | local dxdt = (x2 - x1) / dt 1061 | local dydt = (y2 - y1) / dt 1062 | 1063 | local s = aegisub.ms_from_frame(aegisub.frame_from_ms(line.start_time)) 1064 | local e = aegisub.ms_from_frame(aegisub.frame_from_ms(line.end_time)) 1065 | local ds = t1 - s 1066 | local de = e - t2 1067 | if ds < 0 then ds = 0 end 1068 | if de < 0 then de = 0 end 1069 | 1070 | x1 = round(x1 - ds * dxdt, 2) 1071 | x2 = round(x2 + de * dxdt, 2) 1072 | y1 = round(y1 - ds * dydt, 2) 1073 | y2 = round(y2 + de * dydt, 2) 1074 | 1075 | return "\\move(" .. x1 .. "," .. y1 .. "," .. x2 .. "," .. y2 .. ")" 1076 | end) 1077 | 1078 | subtitles[i] = line 1079 | end 1080 | 1081 | aegisub.set_undo_point(extend_move_script_name) 1082 | end 1083 | 1084 | ----- Register Scripts ----- 1085 | aegisub.register_macro(paknevis_script_name, 'Fix your shity writing habbits! (Unretarded Lines Only)', PakNevis) 1086 | aegisub.register_macro(extend_move_script_name, 'Extend \\move based on line\'s time.', ExtendMove) 1087 | aegisub.register_macro(unretard_script_name, 'Unretard your retarted Persian typing! (Retarded Lines Only)', Unretard) 1088 | aegisub.register_macro(rtl_script_name, 'Fix RTL languages displaying issues. (Unretarded Lines Only)', Rtl) 1089 | aegisub.register_macro(unrtl_script_name, 'Undo RTL function effects.', Unrtl) 1090 | aegisub.register_macro(rtleditor_script_name, 'An editor for easy editing of RTL language lines.', RtlEditor) 1091 | aegisub.register_macro(split_at_tags_script_name, 'A splitter (at tags) for RTL language lines.', SplitAtTags) 1092 | aegisub.register_macro(split_at_spaces_script_name, 'A splitter (at spaces) for RTL language lines.', SplitAtSpaces) 1093 | aegisub.register_macro(reverse_split_at_tags_script_name, 'Split / Reverse at Tags + Split / Split at Tags.', 1094 | ReverseSplitAtTags) 1095 | aegisub.register_macro(reverse_at_tags_script_name, 'Reverse line at tags to use it with other LTR automations.', 1096 | ReverseAtTags) 1097 | --------------------------------------------------------------------------------