├── .gitattributes
├── default.project.json
├── src
    ├── .robloxrc
    ├── init.lua
    ├── RegEx
    │   ├── __tests__
    │   │   ├── escapeString.lua
    │   │   └── testoutput1.spec.lua
    │   └── init.lua
    ├── __tests__
    │   ├── test.spec.lua
    │   ├── exec.spec.lua
    │   └── init.spec.lua
    └── Regexp.global.lua
├── Packages
    └── .robloxrc
├── .gitignore
├── test-model.project.json
├── wally.toml
├── foreman.toml
├── bin
    ├── ci.sh
    ├── spec.lua
    ├── parseTestFile.lua
    └── generate-pcre2-tests.lua
├── rotriever.toml
├── CHANGELOG.md
├── selene.toml
├── README.md
├── LICENSE
└── testez.toml


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.lua linguist-language=Luau
2 | 


--------------------------------------------------------------------------------
/default.project.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "RobloxLuauRegExp",
3 |   "tree": {
4 |     "$path": "src"
5 |   }
6 | }


--------------------------------------------------------------------------------
/src/.robloxrc:
--------------------------------------------------------------------------------
1 | {
2 | 	"language": {
3 | 		"mode": "nonstrict"
4 | 	},
5 | 	"lint": {
6 | 		"*": "enabled"
7 | 	}
8 | }


--------------------------------------------------------------------------------
/Packages/.robloxrc:
--------------------------------------------------------------------------------
1 | {
2 | 	"language": {
3 | 		"mode": "nocheck"
4 | 	},
5 | 	"lint": {
6 | 		"*": "disabled"
7 | 	}
8 | }


--------------------------------------------------------------------------------
/src/init.lua:
--------------------------------------------------------------------------------
1 | local RegExp = require(script["Regexp.global"])
2 | 
3 | export type RegExp = RegExp.RegExp
4 | 
5 | return RegExp
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | Packages/*
2 | !Packages/.robloxrc
3 | # let selene auto-generate latest
4 | roblox.toml
5 | rotriever.lock
6 | *.rbxmx
7 | 


--------------------------------------------------------------------------------
/test-model.project.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "LuauRegExpTestModel",
 3 |   "tree": {
 4 |     "$className": "Folder",
 5 |     "Packages": {
 6 |       "$path": "Packages",
 7 |       "RegExp": {
 8 |         "$path": "src"
 9 |       }
10 |     }
11 |   }
12 | }


--------------------------------------------------------------------------------
/wally.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "roblox/regexp"
 3 | version = "0.2.2"
 4 | license = "MIT"
 5 | authors = ["Roblox <no-reply@roblox.com>"]
 6 | 
 7 | realm = "shared"
 8 | registry = "https://github.com/UpliftGames/wally-index"
 9 | repository = "https://github.com/Roblox/luau-regexp"
10 | 
11 | exclude = ["**/__tests__/**"]
12 | 
13 | [dependencies]
14 | 


--------------------------------------------------------------------------------
/foreman.toml:
--------------------------------------------------------------------------------
1 | [tools]
2 | selene = { source = "Roblox/Kampfkarren-selene", version = "0.21.0" }
3 | stylua = { source = "Roblox/JohnnyMorganz-StyLua", version = "0.18.1" }
4 | rotrieve = { source = "roblox/rotriever", version = "=0.5.13-alpha.5" }
5 | rbx-aged-cli = { source = "Roblox/rbx-aged-tool", version = "5.8.1" }
6 | wally = { source = "UpliftGames/wally", version = "0.3.2" }
7 | 


--------------------------------------------------------------------------------
/bin/ci.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ex # -e: stop at the first failing command; -x: echo each command before it runs
 4 | 
 5 | echo "Build project"
 6 | rojo build test-model.project.json --output model.rbxmx # this model is the one loaded by the test run at the bottom
 7 | echo "Remove .robloxrc from dev dependencies"
 8 | find Packages/Dev -name "*.robloxrc" | xargs rm -f
 9 | find Packages/_Index -name "*.robloxrc" | xargs rm -f
10 | echo "Run static analysis"
11 | selene src/init.lua src/__tests__ # lint
12 | stylua -c src/init.lua src/__tests__ # -c only checks formatting, it does not rewrite files
13 | roblox-cli analyze test-model.project.json
14 | echo "Run tests"
15 | roblox-cli run --load.model model.rbxmx --run bin/spec.lua
16 | 


--------------------------------------------------------------------------------
/bin/spec.lua:
--------------------------------------------------------------------------------
 1 | local ProcessService = game:GetService("ProcessService")
 2 | 
 3 | -- the test model (named after test-model.project.json) is expected to be a
 4 | -- sibling of this script
 5 | local Root = script.Parent.LuauRegExpTestModel
 6 | local Packages = Root.Packages
 7 | local TestEZ = require(Packages.Dev.TestEZ)
 8 | 
 9 | -- Run all tests, collect results, and report to stdout.
10 | local testRoots = { Packages.RegExp }
11 | local result = TestEZ.TestBootstrap:run(testRoots, TestEZ.Reporters.TextReporterQuiet)
12 | 
13 | -- exit with 0 only when no test failed and no error was recorded
14 | local passed = result.failureCount == 0 and #result.errors == 0
15 | ProcessService:ExitAsync(passed and 0 or 1)
16 | 


--------------------------------------------------------------------------------
/rotriever.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "RegExp"
 3 | version = "0.2.2"
 4 | authors = ["#luau"]
 5 | description = "A regular expression library for Luau."
 6 | repository = "https://github.com/roblox/luau-regexp-internal"
 7 | keywords = ["regex", "regexpr"]
 8 | content_root = "src"
 9 | files = ["*", "!**/__tests__/**"]
10 | 
11 | [config]
12 | registry_index = true
13 | 
14 | [dev_dependencies]
15 | JestGlobals = "github.com/roblox/jest-roblox@2.0.1"
16 | TestEZ = "github.com/roblox/jest-roblox@2.0.1"
17 | LuauPolyfill = "github.com/roblox/luau-polyfill@0.2.1"
18 | 


--------------------------------------------------------------------------------
/src/RegEx/__tests__/escapeString.lua:
--------------------------------------------------------------------------------
 1 | -- single-character escape codes, keyed by the raw character they stand for
 2 | local ESCAPES = {
 3 | 	["\n"] = "n",
 4 | 	["\r"] = "r",
 5 | 	["\t"] = "t",
 6 | 	["\f"] = "f",
 7 | 	["\a"] = "a",
 8 | 	["\v"] = "v",
 9 | 	["\\"] = "\\",
10 | }
11 | local ESCAPE_CHARS = {}
12 | for char in pairs(ESCAPES) do
13 | 	table.insert(ESCAPE_CHARS, char)
14 | end
15 | -- Lua pattern set matching every key of ESCAPES plus any control character (%c)
16 | local ESCAPE_CLASS = ("[%s%%c]"):format(table.concat(ESCAPE_CHARS, ""))
17 | 
18 | -- Returns `str` with backslashes and control characters rewritten as
19 | -- backslash escapes: the named code from ESCAPES when there is one, otherwise
20 | -- a three-digit decimal escape (e.g. byte 1 becomes `\001`).
21 | local function escapeString(str)
22 | 	-- the parentheses keep only the escaped string; without them gsub's second
23 | 	-- result (the substitution count) would leak out as an extra return value
24 | 	return (str:gsub(ESCAPE_CLASS, function(match)
25 | 		return "\\" .. (ESCAPES[match] or ("%03d"):format(match:byte()))
26 | 	end))
27 | end
28 | 
29 | return escapeString
30 | 


--------------------------------------------------------------------------------
/src/__tests__/test.spec.lua:
--------------------------------------------------------------------------------
 1 | return function() -- spec for RegExp:test
 2 | 	local RegExpModule = script.Parent.Parent
 3 | 	local RegExp = require(RegExpModule)
 4 | 	type RegExp = RegExp.RegExp
 5 | 
 6 | 	local Packages = RegExpModule.Parent -- dev dependencies are looked up under the package's parent folder
 7 | 	local JestGlobals = require(Packages.Dev.JestGlobals)
 8 | 	local jestExpect = JestGlobals.expect
 9 | 
10 | 	it("returns true when the regex matches", function()
11 | 		local re: RegExp = RegExp("a")
12 | 		jestExpect(re:test("a")).toEqual(true)
13 | 	end)
14 | 
15 | 	it("returns false when the regex does not match", function()
16 | 		local re: RegExp = RegExp("a")
17 | 		jestExpect(re:test("b")).toEqual(false)
18 | 	end)
19 | end
20 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Luau RegExp Changelog
 2 | 
 3 | ## 0.2.0
 4 | 
 5 | - Remove unicode support to minimize package size. The existing source files to support unicode were large enough to impact time to parse, time to require and total package size ([#4](https://github.com/Roblox/luau-regexp/pull/4))
 6 | 
 7 | ## 0.1.3
 8 | 
 9 | - Bump version to uptake fix for test file filtering in cached artifact ([#3](https://github.com/Roblox/luau-regexp/pull/3))
10 | 
11 | ## 0.1.2
12 | 
13 | - Export RegExp type ([#2](https://github.com/Roblox/luau-regexp/pull/2))
14 | 
15 | ## 0.1.1
16 | 
17 | - Remove tests from packages ([#1](https://github.com/Roblox/luau-regexp/pull/1))
18 | 
19 | ## 0.1.0
20 | 
21 | - Initial release
22 | 


--------------------------------------------------------------------------------
/selene.toml:
--------------------------------------------------------------------------------
 1 | std = "roblox+testez"
 2 | 
 3 | [config]
 4 | empty_if = { comments_count = true }
 5 | unused_variable = { ignore_pattern = "result|ok|^_" }
 6 | # this comes up when translating nested try/finally scenarios
 7 | shadowing = { ignore_pattern = "result|ok|^_" }
 8 | # feature request for this config: https://github.com/Kampfkarren/selene/issues/181
 9 | # global_usage = { ignore_pattern = "^__" }
10 | 
11 | [rules]
12 | # remove this once the feature request here is implemented: https://github.com/Kampfkarren/selene/issues/181
13 | global_usage = "allow"
14 | unused_variable = "allow"
15 | # remove when the Luau type narrowing issues (and the workarounds) are resolved
16 | shadowing = "allow"
17 | 
18 | # remove when this issue is fixed: https://github.com/Kampfkarren/selene/issues/179
19 | if_same_then_else = "allow"
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # RegExp for Luau
 2 | A regular expression library for Luau. Adapted from a regex implementation by github user Blockzez (https://github.com/Blockzez).
 3 | 
 4 | ## API
 5 | Create a regex by calling the module directly (the module does not expose a `new` field):
 6 | `RegExp(pattern: string, flags: string?) -> RegularExpression`
 7 | 
 8 | A resulting RegularExpression has the following methods:
 9 | * `RegularExpression:exec(str: string) -> Match`
10 | * `RegularExpression:test(str: string) -> boolean`
11 | 
12 | The `Match` object resulting from `exec` has the following fields:
13 | * `[1..n]` - The array portion of the `Match` object contains captured groups
14 | * `n` - The length of the array of resulting captured groups
15 | * `index` - The index in the original string where the match begins
16 | * `input` - The original string passed into `exec`
17 | 
18 | ### Flags
19 | The following flags can be provided via the second argument to `RegExp`:
20 | * "i" - ignoreCase
21 | * "g" - global (not implemented yet; its tests are currently skipped)
22 | * "m" - multiline


--------------------------------------------------------------------------------
/src/__tests__/exec.spec.lua:
--------------------------------------------------------------------------------
 1 | return function() -- spec for RegExp:exec
 2 | 	local RegExpModule = script.Parent.Parent
 3 | 	local RegExp = require(RegExpModule)
 4 | 	type RegExp = RegExp.RegExp
 5 | 
 6 | 	local Packages = RegExpModule.Parent -- dev dependencies are looked up under the package's parent folder
 7 | 	local JestGlobals = require(Packages.Dev.JestGlobals)
 8 | 	local jestExpect = JestGlobals.expect
 9 | 
10 | 	-- deviation: since we can't have `nil` values in list-like
11 | 	-- tables, we have to return the total number of matches, so
12 | 	-- that we can know when to stop iteration
13 | 	it("returns the number of matches", function()
14 | 		local re: RegExp = RegExp("abc")
15 | 		local result = re:exec("abc")
16 | 		jestExpect(result.n).toEqual(1) -- no capture groups, so only the whole match is counted
17 | 	end)
18 | 
19 | 	it("returns the matches starting from index 1", function()
20 | 		local re: RegExp = RegExp("abc")
21 | 		local result = re:exec("abc")
22 | 		jestExpect(result[1]).toEqual("abc")
23 | 	end)
24 | 
25 | 	it("returns the starting position of the match", function()
26 | 		local re: RegExp = RegExp("abc")
27 | 		local result = re:exec("aabc")
28 | 		jestExpect(result.index).toEqual(2) -- positions are 1-based: "abc" starts at the second character of "aabc"
29 | 	end)
30 | end
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Roblox
 4 | 
 5 | Copyright (c) 2020, 2023 - Blockzez (devforum.roblox.com/u/Blockzez and github.com/Blockzez)
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | of this software and associated documentation files (the "Software"), to deal
 9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/testez.toml:
--------------------------------------------------------------------------------
 1 | [[afterAll.args]]
 2 | type = "function"
 3 | 
 4 | [[afterEach.args]]
 5 | type = "function"
 6 | 
 7 | [[beforeAll.args]]
 8 | type = "function"
 9 | 
10 | [[beforeEach.args]]
11 | type = "function"
12 | 
13 | [[describe.args]]
14 | type = "string"
15 | 
16 | [[describe.args]]
17 | type = "function"
18 | 
19 | [[describeFOCUS.args]]
20 | type = "string"
21 | 
22 | [[describeFOCUS.args]]
23 | type = "function"
24 | 
25 | [[describeSKIP.args]]
26 | type = "string"
27 | 
28 | [[describeSKIP.args]]
29 | type = "function"
30 | 
31 | [[expect.args]]
32 | type = "any"
33 | 
34 | [[FIXME.args]]
35 | type = "string"
36 | required = false
37 | 
38 | [FOCUS]
39 | args = []
40 | 
41 | [[it.args]]
42 | type = "string"
43 | 
44 | [[it.args]]
45 | type = "function"
46 | 
47 | [[itFIXME.args]]
48 | type = "string"
49 | 
50 | [[itFIXME.args]]
51 | type = "function"
52 | 
53 | [[itFOCUS.args]]
54 | type = "string"
55 | 
56 | [[itFOCUS.args]]
57 | type = "function"
58 | 
59 | [[fit.args]]
60 | type = "string"
61 | 
62 | [[fit.args]]
63 | type = "function"
64 | 
65 | [[itSKIP.args]]
66 | type = "string"
67 | 
68 | [[itSKIP.args]]
69 | type = "function"
70 | 
71 | [[xit.args]]
72 | type = "string"
73 | 
74 | [[xit.args]]
75 | type = "function"
76 | 
77 | [SKIP]
78 | args = []
79 | 
80 | 


--------------------------------------------------------------------------------
/src/Regexp.global.lua:
--------------------------------------------------------------------------------
 1 | local RegEx = require(script.Parent.RegEx)
 2 | type Array<T> = { [number]: T }
 3 | 
 4 | type RegExpExecArray = Array<string> & { index: number?, input: string?, n: number }
 5 | 
 6 | export type RegExp = {
 7 |     exec: (self: RegExp, input: string) -> RegExpExecArray | nil,
 8 |     test: (self: RegExp, input: string) -> boolean,
 9 | }
10 | 
11 | local RegExp = {}
12 | local RegExpMetatable = {
13 |     __index = RegExp,
14 |     -- printing a RegExp defers to the string form of the wrapped RegEx
15 |     __tostring = function(self)
16 |         return tostring(self._innerRegEx)
17 |     end,
18 | }
19 | 
20 | -- Runs the pattern against `str`. Returns nil when nothing matches; otherwise
21 | -- a table holding group 0 of the inner match at index 1 followed by the
22 | -- remaining groups, plus `n` (number of slots), `index` (first value returned
23 | -- by the inner match's `span()`) and `input` (`str` itself).
24 | function RegExp:exec(str: string): RegExpExecArray | nil
25 |     local found = self._innerRegEx:match(str)
26 |     if not found then
27 |         return nil
28 |     end
29 | 
30 |     local index = found:span()
31 |     local groups = found:grouparr()
32 |     local slotCount = groups.n + 1
33 | 
34 |     -- every group moves up one slot because group 0 occupies index 1
35 |     local result = { groups[0] }
36 |     for slot = 2, slotCount do
37 |         result[slot] = groups[slot - 1]
38 |     end
39 |     result.n = slotCount
40 |     result.index = index
41 |     result.input = str
42 |     return result
43 | end
44 | 
45 | -- Reports whether `exec` finds a match in `str`.
46 | function RegExp:test(str: string): boolean
47 |     local result = self:exec(str)
48 |     return result ~= nil
49 | end
50 | 
51 | -- Builds a RegExp object. It is reached through the `__call` metamethod set
52 | -- below, so the first argument is the module table itself and is ignored.
53 | -- NOTE(review): `pattern` is typed `RegExp | string` but is handed to
54 | -- RegEx.new unchanged; confirm RegEx.new accepts a RegExp object.
55 | local function new(_self, pattern: RegExp | string, flags: string?)
56 |     local flagString: string = flags or ""
57 |     local innerRegEx = RegEx.new(pattern, flagString)
58 | 
59 |     local function hasFlag(letter: string): boolean
60 |         return flagString:find(letter) ~= nil
61 |     end
62 | 
63 |     return setmetatable({
64 |         source = pattern,
65 |         ignoreCase = hasFlag("i"),
66 |         global = hasFlag("g"),
67 |         multiline = hasFlag("m"),
68 |         _innerRegEx = innerRegEx,
69 |     }, RegExpMetatable)
70 | end
71 | 
72 | -- FIXME: Capture this as a local variable before returning, else a luau bug
73 | -- prevents __call from being understood: https://jira.rbx.com/browse/CLI-40294
74 | local interface = setmetatable(RegExp, {
75 |     __call = new,
76 | })
77 | 
78 | return interface
79 | 


--------------------------------------------------------------------------------
/src/RegEx/__tests__/testoutput1.spec.lua:
--------------------------------------------------------------------------------
 1 | return function()
 2 | 	local __tests__ = script.Parent
 3 | 	-- test cases generated by bin/generate-pcre2-tests.lua
 4 | 	local testData = require(__tests__["testoutput1.gen"])
 5 | 	local escapeString = require(__tests__.escapeString)
 6 | 	local RegEx = require(__tests__.Parent)
 7 | 
 8 | 	-- truncates long regex sources to keep test names readable, then makes
 9 | 	-- control characters visible
10 | 	local function shortenIfTooLong(str)
11 | 		if str:len() > 80 then
12 | 			str = str:sub(1, 76) .. " ..."
13 | 		end
14 | 		return escapeString(str)
15 | 	end
16 | 
17 | 	for _, case in pairs(testData) do
18 | 		-- the leading space keeps the flags apart from the closing backtick
19 | 		local message = ("regex `%s`%s"):format(
20 | 			shortenIfTooLong(case.source),
21 | 			case.flags == nil and "" or (" with %s flags"):format(case.flags)
22 | 		)
23 | 		describe(message, function()
24 | 			local regex = nil
25 | 			beforeEach(function()
26 | 				regex = RegEx.new(case.source, case.flags)
27 | 			end)
28 | 
29 | 			for _, testCase in ipairs(case.tests) do
30 | 				if testCase.matches == nil then
31 | 					-- using the length in the test name will dedup tests
32 | 					-- where the match is identical except one ends with `\0`
33 | 					local testMessage = ("does not match with `%s` (len: %d)"):format(
34 | 						testCase.input,
35 | 						testCase.input:len()
36 | 					)
37 | 					it(testMessage, function()
38 | 						expect(regex:match(testCase.input)).to.equal(nil)
39 | 					end)
40 | 				else
41 | 					describe(("matches with `%s`"):format(testCase.input), function()
42 | 						local matchResults = nil
43 | 						beforeEach(function()
44 | 							matchResults = regex:match(testCase.input)
45 | 						end)
46 | 
47 | 						for _, match in ipairs(testCase.matches) do
48 | 							local testMessage = ("match #%d is `%s`"):format(
49 | 								match.index,
50 | 								match.match
51 | 							)
52 | 							it(testMessage, function()
53 | 								expect(matchResults).to.be.ok()
54 | 								-- the test data spells a group that did not take part in the match as "<unset>"
55 | 								local expectedMatch = match.match
56 | 								if expectedMatch == "<unset>" then
57 | 									expectedMatch = nil
58 | 								end
59 | 								expect(matchResults:group(match.index)).to.equal(expectedMatch)
60 | 							end)
61 | 						end
62 | 					end)
63 | 				end
64 | 			end
65 | 		end)
66 | 	end
67 | end
68 | 


--------------------------------------------------------------------------------
/src/__tests__/init.spec.lua:
--------------------------------------------------------------------------------
 1 | return function() -- spec for the properties, string form and metatable of RegExp objects
 2 | 	local RegExpModule = script.Parent.Parent
 3 | 	local RegExp = require(RegExpModule)
 4 | 
 5 | 	local Packages = RegExpModule.Parent -- dev dependencies are looked up under the package's parent folder
 6 | 	local LuauPolyfill = require(Packages.Dev.LuauPolyfill)
 7 | 	local instanceof = LuauPolyfill.instanceof
 8 | 	local JestGlobals = require(Packages.Dev.JestGlobals)
 9 | 	local jestExpect = JestGlobals.expect
10 | 
11 | 	describe("ignoreCase", function()
12 | 		it("has a `ignoreCase` property set to true if the `i` flag is used", function()
13 | 			jestExpect(RegExp("foo", "i").ignoreCase).toEqual(true)
14 | 		end)
15 | 
16 | 		it("has a `ignoreCase` property set to false by default", function()
17 | 			jestExpect(RegExp("foo").ignoreCase).toEqual(false)
18 | 		end)
19 | 	end)
20 | 
21 | 	describe("multiline", function()
22 | 		it("has a `multiline` property set to true if the `m` flag is used", function()
23 | 			jestExpect(RegExp("foo", "m").multiline).toEqual(true)
24 | 		end)
25 | 
26 | 		it("has a `multiline` property set to false by default", function()
27 | 			jestExpect(RegExp("foo").multiline).toEqual(false)
28 | 		end)
29 | 	end)
30 | 
31 | 	describe("global", function()
32 | 		-- deviation: `g` flag not implemented yet
33 | 		itSKIP("has a `global` property set to true if the `g` flag is used", function()
34 | 			jestExpect(RegExp("foo", "g").global).toEqual(true)
35 | 		end)
36 | 
37 | 		-- deviation: `g` flag not implemented yet
38 | 		itSKIP("has a `global` property set to false by default", function()
39 | 			jestExpect(RegExp("foo").global).toEqual(false)
40 | 		end)
41 | 	end)
42 | 
43 | 	describe("toString", function()
44 | 		it("has a correct tostring output", function()
45 | 			jestExpect(tostring(RegExp("pattern"))).toEqual("/pattern/")
46 | 		end)
47 | 
48 | 		it("has a correct ordering of flags in tostring output", function()
49 | 			jestExpect(tostring(RegExp("regexp\\d", "mi"))).toEqual("/regexp\\d/im") -- flags are passed as "mi" but expected back as "im"
50 | 		end)
51 | 	end)
52 | 
53 | 	describe("inheritance", function()
54 | 		it("follows our expectations for inheritance", function()
55 | 			jestExpect(instanceof(RegExp("test"), RegExp)).toEqual(true)
56 | 		end)
57 | 	end)
58 | end
59 | 


--------------------------------------------------------------------------------
/bin/parseTestFile.lua:
--------------------------------------------------------------------------------
 1 | -- this limits the total number of regex tests that will be parsed
 2 | -- from the pcre2 test file.
 3 | local MAX = 100000
 4 | 
 5 | local function parseTestFile(file)
 6 | 	local testCaseList = {}
 7 | 
 8 | 	local start = file:find("^/")
 9 | 
10 | 	local count = 0
11 | 	while start ~= nil do
12 | 		count = count + 1
13 | 
14 | 		local ending = file:find("/", start + 1)
15 | 		local regexSource = file:sub(start + 1, ending - 1)
16 | 		local endOfRegexSourceLine = file:find("\n", ending + 1)
17 | 		local flags = file:sub(ending + 1, endOfRegexSourceLine - 1)
18 | 
19 | 		local nextLineStart = endOfRegexSourceLine + 1
20 | 		local nextLineEnd = file:find("\n", endOfRegexSourceLine + 1)
21 | 		local line = file:sub(nextLineStart, nextLineEnd - 1)
22 | 
23 | 		local regexTests = {}
24 | 		local currentTest = nil
25 | 
26 | 		while line ~= "" do
27 | 			local matchIndex, matchValue = line:match("^([%d ]%d+): ?(.*)$")
28 | 			if line == "No match" then
29 | 				currentTest.matches = nil
30 | 			elseif matchIndex ~= nil then
31 | 				assert(
32 | 					currentTest.matches,
33 | 					"error parsing regex " .. tostring(count) .. " tests: '" .. regexSource .. "'"
34 | 				)
35 | 				table.insert(currentTest.matches, {
36 | 					index = tonumber(matchIndex),
37 | 					match = matchValue,
38 | 				})
39 | 			else
40 | 				if currentTest ~= nil then
41 | 					table.insert(regexTests, currentTest)
42 | 				end
43 | 				currentTest = {
44 | 					input = line:match("^ *(.+)$"),
45 | 					matches = {},
46 | 				}
47 | 			end
48 | 			nextLineStart = nextLineEnd + 1
49 | 			nextLineEnd = file:find("\n", nextLineStart)
50 | 			line = file:sub(nextLineStart, nextLineEnd - 1)
51 | 		end
52 | 
53 | 		if currentTest ~= nil then
54 | 			table.insert(regexTests, currentTest)
55 | 		end
56 | 
57 | 		local regexInfo = {
58 | 			source = regexSource,
59 | 			flags = flags,
60 | 		}
61 | 
62 | 		table.insert(testCaseList, {
63 | 			source = regexSource,
64 | 			flags = flags,
65 | 			tests = regexTests,
66 | 		})
67 | 
68 | 		if regexInfo.source == nil then
69 | 			error("\n\nerror at count = " .. count .. "\n\n")
70 | 		end
71 | 
72 | 		start = file:find("\n/", nextLineEnd)
73 | 		if start then
74 | 			start = start + 1
75 | 		end
76 | 
77 | 		if count >= MAX then
78 | 			break
79 | 		end
80 | 	end
81 | 
82 | 	return testCaseList
83 | end
84 | 
85 | return parseTestFile
86 | 


--------------------------------------------------------------------------------
/bin/generate-pcre2-tests.lua:
--------------------------------------------------------------------------------
  1 | local parseTestFile = require("bin.parseTestFile")
  2 | 
  3 | --[[
  4 | 	The next require refers to a pcre2 test data file wrapped as a lua module
  5 | 	that returns a string. These test data files can be found on github:
  6 | 	https://github.com/luvit/pcre2/tree/master/testdata
  7 | 
  8 | 	Each file that starts with `testoutput` should work, simply take one and
  9 | 	create a lua file that looks like this:
 10 | 	```
 11 | 	return [==========================[
 12 | 		*** content of the test data file, after the comments (lines starting with #) ***
 13 | 	]==========================]
 14 | 	```
 15 | 
 16 | 	Require the test data file from the project root path. If you put it in the
 17 | 	root, it is simply the filename without the `.lua` extension.
 18 | ]]
 19 | local testFile = require("src.RegEx.__tests__.testoutput1")
 20 | 
 21 | local testCases = parseTestFile(testFile)
 22 | 
 23 | local luaOutputLines = {}
 24 | 
 25 | -- Formats `strFormat` with the remaining arguments and queues the result as
 26 | -- one line of generated Lua, indented by a tab.
 27 | local function writeLine(strFormat, ...)
 28 | 	table.insert(luaOutputLines, "\t" .. strFormat:format(...))
 29 | end
 30 | 
 31 | -- Returns the index of the first plain occurrence of `char` at or after
 32 | -- `startIndex` that is preceded by an even number of backslashes (zero
 33 | -- included), or nil when there is none.
 34 | local function findNotEscaped(str, char, startIndex)
 35 | 	local foundIndex = str:find(char, startIndex, true)
 36 | 	while foundIndex ~= nil do
 37 | 		local escaped = false
 38 | 		local currentChar = foundIndex - 1
 39 | 		-- walk backwards over the run of backslashes, flipping on each one
 40 | 		while str:sub(currentChar, currentChar) == "\\" do
 41 | 			escaped = not escaped
 42 | 			currentChar = currentChar - 1
 43 | 			if currentChar == 0 then
 44 | 				break
 45 | 			end
 46 | 		end
 47 | 		if not escaped then
 48 | 			return foundIndex
 49 | 		end
 50 | 		foundIndex = str:find(char, foundIndex + 1, true)
 51 | 	end
 52 | 	return nil
 53 | end
 54 | 
 55 | -- Despite the name, no backslash is removed: every unescaped backslash that
 56 | -- is not followed by one of `a f n r t \` or a digit is doubled, so the
 57 | -- generated string literal keeps it as a literal backslash.
 58 | local function removeUnescapeBackslash(str)
 59 | 	local index = 1
 60 | 	index = findNotEscaped(str, "\\", index)
 61 | 	while index ~= nil do
 62 | 		if str:sub(index + 1, index + 1):match("[afnrt%d\\]") then
 63 | 			index = findNotEscaped(str, "\\", index + 1)
 64 | 		else
 65 | 			-- `sub(1, index)` and `sub(index)` both include the backslash
 66 | 			str = str:sub(1, index) .. str:sub(index)
 67 | 			index = findNotEscaped(str, "\\", index + 2)
 68 | 		end
 69 | 	end
 70 | 	return str
 71 | end
 72 | 
 73 | -- Wraps `str` in double quotes, escaping embedded double quotes and newlines.
 74 | local function quote(str)
 75 | 	return ('"%s"'):format(
 76 | 		str:gsub('"', '\\"'):gsub("\n", "\\n")
 77 | 	)
 78 | end
 79 | 
 80 | -- Turns a test input into a quoted Lua string literal: `\e` becomes `\027`,
 81 | -- `\$` becomes `$`, and `\x` with one or two hex digits becomes a three-digit
 82 | -- decimal escape.
 83 | local function processInput(str)
 84 | 	str = str
 85 | 		:gsub("\\e", "\\027")
 86 | 		:gsub("\\%$", "$")
 87 | 		:gsub("\\x%x%x?", function(match)
 88 | 			local asciiIndex = tonumber(match:sub(3), 16)
 89 | 			return ("\\%03d"):format(asciiIndex)
 90 | 		end)
 91 | 
 92 | 	return quote(removeUnescapeBackslash(str))
 93 | end
 94 | 
 95 | -- Turns an expected match into a quoted Lua string literal: `\e` becomes
 96 | -- `\027`, a backslash before anything other than `t n r f a x \` or a digit
 97 | -- is doubled, and `\x` with exactly two hex digits becomes a three-digit
 98 | -- decimal escape.
 99 | local function processMatch(str)
100 | 	str = str:gsub("\\e", "\\027")
101 | 		:gsub("\\[^tnrfax\\%d]", function(match)
102 | 			return "\\" .. match
103 | 		end)
104 | 		:gsub("\\x%x%x", function(match)
105 | 			local asciiIndex = tonumber(match:sub(3), 16)
106 | 			return ("\\%03d"):format(asciiIndex)
107 | 		end)
108 | 	return quote(removeUnescapeBackslash(str))
109 | end
110 | 
111 | -- emit one table entry per regex that has at least one test
112 | for i, case in ipairs(testCases) do
113 | 	local totalTestCases = #case.tests
114 | 	if totalTestCases > 0 then
115 | 		writeLine("{")
116 | 
117 | 		writeLine("\tsource = [==[%s]==],", case.source)
118 | 		if case.flags and case.flags ~= "" then
119 | 			writeLine("\tflags = %q,", case.flags)
120 | 		end
121 | 
122 | 		writeLine("\ttests = {")
123 | 		for _, test in ipairs(case.tests) do
124 | 			if test.matches then
125 | 				writeLine("\t\t{")
126 | 				writeLine("\t\t\tinput = %s,", processInput(test.input))
127 | 				if #test.matches == 1 then
128 | 					writeLine("\t\t\tmatches = {{ index = %d, match = %s }},",
129 | 						test.matches[1].index,
130 | 						processMatch(test.matches[1].match)
131 | 					)
132 | 				else
133 | 					writeLine("\t\t\tmatches = {")
134 | 					for _, match in ipairs(test.matches) do
135 | 						writeLine(
136 | 							"\t\t\t\t{ index = %d, match = %s },",
137 | 							match.index,
138 | 							processMatch(match.match)
139 | 						)
140 | 					end
141 | 					writeLine("\t\t\t}")
142 | 				end
143 | 				writeLine("\t\t},")
144 | 			else
145 | 				writeLine("\t\t{ input = %s },", processInput(test.input))
146 | 			end
147 | 		end
148 | 		writeLine("\t},")
149 | 
150 | 		writeLine("},")
151 | 	else
152 | 		print(("no test case found for #%d: %s"):format(i, case.source))
153 | 	end
154 | end
155 | 
156 | -- assert turns a failed open into an error carrying io.open's message,
157 | -- instead of a nil index on the next line
158 | local file = assert(io.open("testoutput1.gen.lua", "w+"))
159 | file:write("return {\n")
160 | 
161 | file:write(table.concat(luaOutputLines, "\n"))
162 | 
163 | file:write("\n}\n")
164 | 
165 | file:close()
166 | 


--------------------------------------------------------------------------------
/src/RegEx/init.lua:
--------------------------------------------------------------------------------
   1 | --!nolint
   2 | --!nocheck
   3 | --[[
   4 | 	PCRE2-based RegEx implementation for Luau
   5 | 	Version 1.0.0a2 (2020)
   6 | 	Expat Licence
   7 | 	Copyright © 2020, 2023 - Blockzez (devforum.roblox.com/u/Blockzez and github.com/Blockzez)
   8 | 	All rights reserved.
   9 | 
  10 | 	Permission is hereby granted, free of charge, to any person obtaining a copy
  11 | 	of this software and associated documentation files (the "Software"), to deal
  12 | 	in the Software without restriction, including without limitation the rights
  13 | 	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 | 	copies of the Software, and to permit persons to whom the Software is
  15 | 	furnished to do so, subject to the following conditions:
  16 | 
  17 | 	The above copyright notice and this permission notice shall be included in all
  18 | 	copies or substantial portions of the Software.
  19 | 
  20 | 	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 | 	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 | 	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  23 | 	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 | 	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 | 	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  26 | 	SOFTWARE.
  27 | ]]
  28 | --[[ Settings ]]--
  29 | -- You can change them here
  30 | local options = {
  31 | 	-- The maximum cache size for regex patterns; compiled patterns are cached so that they do not have to be recompiled
  32 | 	-- The only accepted values are numbers >= 0, strings that can be automatically coerced to numbers >= 0, false and nil
  33 | 	-- Do note that empty regex patterns (comment-only patterns included) are never cached regardless
  34 | 	-- The default is 256
  35 | 	cacheSize = 256,
  36 | 
  37 | 	-- A boolean that determines whether this module uses unicode data
  38 | 	-- If this value evaluates to false, you can safely remove _unicodechar_category, _scripts and _xuc; it will then error if:
  39 | 	-- - You try to compile a RegEx with unicode flag
  40 | 	-- - You try to use the \p pattern
  41 | 	-- The upstream default is true; it is set to false here because this package no longer ships the unicode data modules
  42 | 	unicodeData = false,
  43 | };
  44 | 
  45 | --
  46 | local u_categories = options.unicodeData and require(script:WaitForChild("_unicodechar_category")); -- this and the next two requires are skipped (the local holds the falsy option value) when options.unicodeData is off
  47 | local chr_scripts = options.unicodeData and require(script:WaitForChild("_scripts"));
  48 | local xuc_chr = options.unicodeData and require(script:WaitForChild("_xuc"));
  49 | local proxy = setmetatable({ }, { __mode = 'k' }); -- weak-keyed table: an entry does not keep its key alive
  50 | local re, re_m, match_m = { }, { }, { }; -- NOTE(review): presumably the library table and the regex/match metatables; confirm against their uses further down
  51 | local lockmsg; -- declared here without a value
  52 | 
  53 | --[[ Functions ]]--
  54 | local function to_str_arr(self, init)
  55 | 	if init then
  56 | 		self = string.sub(self, utf8.offset(self, init));
  57 | 	end;
  58 | 	local len = utf8.len(self);
  59 | 	if len <= 1999 then
  60 | 		return { n = len, s = self, utf8.codepoint(self, 1, #self) };
  61 | 	end;
  62 | 	local clen = math.ceil(len / 1999);
  63 | 	local ret = table.create(len);
  64 | 	local p = 1;
  65 | 	for i = 1, clen do
  66 | 		local c = table.pack(utf8.codepoint(self, utf8.offset(self, i * 1999 - 1998), utf8.offset(self, i * 1999 - (i == clen and 1998 - ((len - 1) % 1999 + 1) or - 1)) - 1));
  67 | 		table.move(c, 1, c.n, p, ret);
  68 | 		p += c.n;
  69 | 	end;
  70 | 	ret.s, ret.n = self, len;
  71 | 	return ret;
  72 | end;
  73 | 
  74 | local function from_str_arr(self)
  75 | 	local len = self.n or #self;
  76 | 	if len <= 7997 then
  77 | 		return utf8.char(table.unpack(self));
  78 | 	end;
  79 | 	local clen = math.ceil(len / 7997);
  80 | 	local r = table.create(clen);
  81 | 	for i = 1, clen do
  82 | 		r[i] = utf8.char(table.unpack(self, i * 7997 - 7996, i * 7997 - (i == clen and 7997 - ((len - 1) % 7997 + 1) or 0)));
  83 | 	end;
  84 | 	return table.concat(r);
  85 | end;
  86 | 
  87 | local function utf8_sub(self, i, j)
  88 | 	j = utf8.offset(self, j);
  89 | 	return string.sub(self, utf8.offset(self, i), j and j - 1);
  90 | end;
  91 | 
  92 | --
-- Maps a single-letter flag to the long name of that flag.
local flag_map = {
	a = 'anchored', i = 'caseless', m = 'multiline', s = 'dotall', u = 'unicode', U = 'ungreedy', x ='extended',
};

-- Set of recognised POSIX character class names.
local posix_class_names = {
	alnum = true, alpha = true, ascii = true, blank = true, cntrl = true, digit = true, graph = true, lower = true, print = true, punct = true, space = true, upper = true, word = true, xdigit = true,
};

-- Escape sequences, keyed by the code point of the character after the backslash.
-- A table value is a token ({ "class", class_name, negated } or a single-code token);
-- a number value is the literal code point the escape stands for.
local escape_chars = {
	-- grouped
	-- digit, spaces and words
	-- \D, \S, \W (negated) and \d, \s, \w
	[0x44] = { "class", "digit", true }, [0x53] = { "class", "space", true }, [0x57] = { "class", "word", true },
	[0x64] = { "class", "digit", false }, [0x73] = { "class", "space", false }, [0x77] = { "class", "word", false },
	-- horizontal/vertical whitespace and newline
	-- \H, \V (negated), \h, \v, then \N and \R
	[0x48] = { "class", "blank", true }, [0x56] = { "class", "vertical_tab", true },
	[0x68] = { "class", "blank", false }, [0x76] = { "class", "vertical_tab", false },
	[0x4E] = { 0x4E }, [0x52] = { 0x52 },

	-- not grouped
	-- 'B' -> backspace (0x08); 'n' -> line feed, 'r' -> carriage return, 't' -> tab
	-- NOTE(review): the key 0x42 is an uppercase 'B'; confirm that 'B' rather than 'b' (0x62) is intended
	[0x42] = 0x08,
	[0x6E] = 0x0A, [0x72] = 0x0D, [0x74] = 0x09,
};

-- Zero-width / positional escapes, keyed by the code point after the backslash.
local b_escape_chars = {
	-- word boundary and not word boundary
	-- \b and \B; the second element is the "word" class token used to test neighbouring characters
	[0x62] = { 0x62, { "class", "word", false } }, [0x42] = { 0x42, { "class", "word", false } },

	-- keep match out
	-- \K
	[0x4B] = { 0x4B },

	-- start & end of string
	-- \G, \J, \Z, \z
	[0x47] = { 0x47 }, [0x4A] = { 0x4A }, [0x5A] = { 0x5A }, [0x7A] = { 0x7A },
};

-- Set of accepted category names: one- and two-letter general categories plus
-- the three-letter X* names handled specially by the matcher.
local valid_categories = {
	C = true, Cc = true, Cf = true, Cn = true, Co = true, Cs = true,
	L = true, Ll = true, Lm = true, Lo = true, Lt = true, Lu = true,
	M = true, Mc = true, Me = true, Mn = true,
	N = true, Nd = true, Nl = true, No = true,
	P = true, Pc = true, Pd = true, Pe = true, Pf = true, Pi = true, Po = true, Ps = true,
	S = true, Sc = true, Sk = true, Sm = true, So = true,
	Z = true, Zl = true, Zp = true, Zs = true,

	Xan = true, Xps = true, Xsp = true, Xuc = true, Xwd = true,
};

-- Set of ASCII punctuation code points: 0x21-0x2F, 0x3A-0x40, 0x5B-0x60 and 0x7B-0x7E.
local class_ascii_punct = {
	[0x21] = true, [0x22] = true, [0x23] = true, [0x24] = true, [0x25] = true, [0x26] = true, [0x27] = true, [0x28] = true, [0x29] = true, [0x2A] = true, [0x2B] = true, [0x2C] = true, [0x2D] = true, [0x2E] = true, [0x2F] = true,
	[0x3A] = true, [0x3B] = true, [0x3C] = true, [0x3D] = true, [0x3E] = true, [0x3F] = true, [0x40] = true, [0x5B] = true, [0x5C] = true, [0x5D] = true, [0x5E] = true, [0x5F] = true, [0x60] = true, [0x7B] = true, [0x7C] = true,
	[0x7D] = true, [0x7E] = true,
};

-- Shared single-code tokens. `alternation` is compared by identity in
-- find_alternation, so it must stay one shared table.
local end_str = { 0x24 }; -- '$'
local dot = { 0x2E }; -- '.'
local beginning_str = { 0x5E }; -- '^'
local alternation = { 0x7C }; -- '|'
 149 | 
 150 | local function check_re(re_type, name, func)
 151 | 	if re_type == "Match" then
 152 | 		return function(...)
 153 | 			local arg_n = select('#', ...);
 154 | 			if arg_n < 1 then
 155 | 				error("missing argument #1 (Match expected)", 2);
 156 | 			end;
 157 | 			local arg0, arg1 = ...;
 158 | 			if not (proxy[arg0] and proxy[arg0].name == "Match") then
 159 | 				error(string.format("invalid argument #1 to %q (Match expected, got %s)", name, typeof(arg0)), 2);
 160 | 			else
 161 | 				arg0 = proxy[arg0];
 162 | 			end;
 163 | 			if name == "group" or name == "span" then
 164 | 				if arg1 == nil then
 165 | 					arg1 = 0;
 166 | 				end;
 167 | 			end;
 168 | 			return func(arg0, arg1);
 169 | 		end;
 170 | 	end;
 171 | 	return function(...)
 172 | 		local arg_n = select('#', ...);
 173 | 		if arg_n < 1 then
 174 | 			error("missing argument #1 (RegEx expected)", 2);
 175 | 		elseif arg_n < 2 then
 176 | 			error("missing argument #2 (string expected)", 2);
 177 | 		end;
 178 | 		local arg0, arg1, arg2, arg3, arg4, arg5 = ...;
 179 | 		if not (proxy[arg0] and proxy[arg0].name == "RegEx") then
 180 | 			if type(arg0) ~= "string" and type(arg0) ~= "number" then
 181 | 				error(string.format("invalid argument #1 to %q (RegEx expected, got %s)", name, typeof(arg0)), 2);
 182 | 			end;
 183 | 			arg0 = re.fromstring(arg0);
 184 | 		elseif name == "sub" then
 185 | 			if type(arg2) == "number" then
 186 | 				arg2 ..= '';
 187 | 			elseif type(arg2) ~= "string" then
 188 | 				error(string.format("invalid argument #3 to 'sub' (string expected, got %s)", typeof(arg2)), 2);
 189 | 			end;
 190 | 		elseif type(arg1) == "number" then
 191 | 			arg1 ..= '';
 192 | 		elseif type(arg1) ~= "string" then
 193 | 			error(string.format("invalid argument #2 to %q (string expected, got %s)", name, typeof(arg1)), 2);
 194 | 		end;
 195 | 		if name ~= "sub" and name ~= "split" then
 196 | 			local init_type = typeof(arg2);
 197 | 			if init_type ~= 'nil' then
 198 | 				arg2 = tonumber(arg2);
 199 | 				if not arg2 then
 200 | 					error(string.format("invalid argument #3 to %q (number expected, got %s)", name, init_type), 2);
 201 | 				elseif arg2 < 0 then
 202 | 					arg2 = #arg1 + math.floor(arg2 + 0.5) + 1;
 203 | 				else
 204 | 					arg2 = math.max(math.floor(arg2 + 0.5), 1);
 205 | 				end;
 206 | 			end;
 207 | 		end;
 208 | 		arg0 = proxy[arg0];
 209 | 		if name == "match" or name == "matchiter" then
 210 | 			arg3 = ...;
 211 | 		elseif name == "sub" then
 212 | 			arg5 = ...;
 213 | 		end;
 214 | 		return func(arg0, arg1, arg2, arg3, arg4, arg5);
 215 | 	end;
 216 | end;
 217 | 
 218 | --[[ Matches ]]--
 219 | local function match_tostr(self)
 220 | 	local spans = proxy[self].spans;
 221 | 	local s_start, s_end = spans[0][1], spans[0][2];
 222 | 	if s_end <= s_start then
 223 | 		return string.format("Match (%d..%d, empty)", s_start, s_end - 1);
 224 | 	end;
 225 | 	return string.format("Match (%d..%d): %s", s_start, s_end - 1, utf8_sub(spans.input, s_start, s_end));
 226 | end;
 227 | 
 228 | local function new_match(span_arr, group_id, re, str)
 229 | 	span_arr.source, span_arr.input = re, str;
 230 | 	local object = newproxy(true);
 231 | 	local object_mt = getmetatable(object);
 232 | 	object_mt.__metatable = lockmsg;
 233 | 	object_mt.__index = setmetatable(span_arr, match_m);
 234 | 	object_mt.__tostring = match_tostr;
 235 | 
 236 | 	proxy[object] = { name = "Match", spans = span_arr, group_id = group_id };
 237 | 	return object;
 238 | end;
 239 | 
 240 | match_m.group = check_re('Match', 'group', function(self, group_id)
 241 | 	local span = self.spans[type(group_id) == "number" and group_id or self.group_id[group_id]];
 242 | 	if not span then
 243 | 		return nil;
 244 | 	end;
 245 | 	return utf8_sub(self.spans.input, span[1], span[2]);
 246 | end);
 247 | 
 248 | match_m.span = check_re('Match', 'span', function(self, group_id)
 249 | 	local span = self.spans[type(group_id) == "number" and group_id or self.group_id[group_id]];
 250 | 	if not span then
 251 | 		return nil;
 252 | 	end;
 253 | 	return span[1], span[2] - 1;
 254 | end);
 255 | 
 256 | match_m.groups = check_re('Match', 'groups', function(self)
 257 | 	local spans = self.spans;
 258 | 	if spans.n > 0 then
 259 | 		local ret = table.create(spans.n);
 260 | 		for i = 0, spans.n do
 261 | 			local v = spans[i];
 262 | 			if v then
 263 | 				ret[i] = utf8_sub(spans.input, v[1], v[2]);
 264 | 			end;
 265 | 		end;
 266 | 		return table.unpack(ret, 1, spans.n);
 267 | 	end;
 268 | 	return utf8_sub(spans.input, spans[0][1], spans[0][2]);
 269 | end);
 270 | 
 271 | match_m.groupdict = check_re('Match', 'groupdict', function(self)
 272 | 	local spans = self.spans;
 273 | 	local ret = { };
 274 | 	for k, v in pairs(self.group_id) do
 275 | 		v = spans[v];
 276 | 		if v then
 277 | 			ret[k] = utf8_sub(spans.input, v[1], v[2]);
 278 | 		end;
 279 | 	end;
 280 | 	return ret;
 281 | end);
 282 | 
 283 | match_m.grouparr = check_re('Match', 'groupdict', function(self)
 284 | 	local spans = self.spans;
 285 | 	local ret = table.create(spans.n);
 286 | 	for i = 0, spans.n do
 287 | 		local v = spans[i];
 288 | 		if v then
 289 | 			ret[i] = utf8_sub(spans.input, v[1], v[2]);
 290 | 		end;
 291 | 	end;
 292 | 	ret.n = spans.n;
 293 | 	return ret;
 294 | end);
 295 | 
 296 | --
-- Maps a newline convention name to the numeric code that is_newline compares
-- against verb_flags.newline.
local line_verbs = {
	CR = 0, LF = 1, CRLF = 2, ANYRLF = 3, ANY = 4, NUL = 5,
};
 300 | local function is_newline(str_arr, i, verb_flags)
 301 | 	local line_verb_n = verb_flags.newline;
 302 | 	local chr = str_arr[i];
 303 | 	if line_verb_n == 0 then
 304 | 		-- carriage return
 305 | 		return chr == 0x0D;
 306 | 	elseif line_verb_n == 2 then
 307 | 		-- carriage return followed by line feed
 308 | 		return chr == 0x0A and str_arr[i - 1] == 0x20;
 309 | 	elseif line_verb_n == 3 then
 310 | 		-- any of the above
 311 | 		return chr == 0x0A or chr == 0x0D;
 312 | 	elseif line_verb_n == 4 then
 313 | 		-- any of Unicode newlines
 314 | 		return chr == 0x0A or chr == 0x0B or chr == 0x0C or chr == 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
 315 | 	elseif line_verb_n == 5 then
 316 | 		-- null
 317 | 		return chr == 0;
 318 | 	end;
 319 | 	-- linefeed
 320 | 	return chr == 0x0A;
 321 | end;
 322 | 
 323 | 
--[[
	Tests whether the single code point at str_arr[i] matches one pattern token.

	tkn_part:   the token: a number (literal code point) or a table whose first
	            element selects the kind ("charset", "range", "class", "category",
	            or the codes 0x2E '.', 0x4E 'N', 0x52 'R').
	str_arr:    array of code points.
	i:          index of the code point to test.
	flags:      flag table; `ignoreCase`, `unicode` and `dotAll` are read here.
	verb_flags: table passed on to is_newline; `newline_seq` is read for 0x52.

	Returns a truthy value on a match and a falsy value otherwise (the "punct"
	class yields nil rather than false when there is no match). Returns false
	when there is no code point at index i or the token kind is not recognised.
]]
local function tkn_char_match(tkn_part, str_arr, i, flags, verb_flags)
	local chr = str_arr[i];
	if not chr then
		return false;
	elseif flags.ignoreCase and chr >= 0x61 and chr <= 0x7A then
		-- fold ASCII lowercase letters to uppercase before comparing
		chr -= 0x20;
	end;
	if type(tkn_part) == "number" then
		return tkn_part == chr;
	elseif tkn_part[1] == "charset" then
		-- tkn_part[2] is the negation flag, tkn_part[3] the list of member tokens
		for _, v in ipairs(tkn_part[3]) do
			if tkn_char_match(v, str_arr, i, flags, verb_flags) then
				return not tkn_part[2];
			end;
		end;
		return tkn_part[2];
	elseif tkn_part[1] == "range" then
		-- inclusive range tkn_part[2]..tkn_part[3]; with ignoreCase an uppercase
		-- ASCII letter is also tried as its lowercase counterpart
		return chr >= tkn_part[2] and chr <= tkn_part[3] or flags.ignoreCase and chr >= 0x41 and chr <= 0x5A and (chr + 0x20) >= tkn_part[2] and (chr + 0x20) <= tkn_part[3];
	elseif tkn_part[1] == "class" then
		local char_class = tkn_part[2];
		local negate = tkn_part[3];
		local match = false;
		-- if and elseifs :(
		-- Might make these into tables in the future
		-- The first three classes are ASCII/code-point based regardless of the
		-- unicode flag; the branch order below therefore matters.
		if char_class == "xdigit" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x46 or chr >= 0x61 and chr <= 0x66;
		elseif char_class == "ascii" then
			match = chr <= 0x7F;
		-- cannot be accessed through POSIX classes
		elseif char_class == "vertical_tab" then
			match = chr >= 0x0A and chr <= 0x0D or chr == 0x2028 or chr == 0x2029;
		--
		elseif flags.unicode then
			-- category-based definitions; code points missing from the table count as 'Cn'
			local current_category = u_categories[chr] or 'Cn';
			local first_category = current_category:sub(1, 1);
			if char_class == "alnum" then
				match = first_category == 'L' or current_category == 'Nl' or current_category == 'Nd';
			elseif char_class == "alpha" then
				match = first_category == 'L' or current_category == 'Nl';
			elseif char_class == "blank" then
				match = current_category == 'Zs' or chr == 0x09;
			elseif char_class == "cntrl" then
				match = current_category == 'Cc';
			elseif char_class == "digit" then
				match = current_category == 'Nd';
			elseif char_class == "graph" then
				match = first_category ~= 'P' and first_category ~= 'C';
			elseif char_class == "lower" then
				match = current_category == 'Ll';
			elseif char_class == "print" then
				match = first_category ~= 'C';
			elseif char_class == "punct" then
				match = first_category == 'P';
			elseif char_class == "space" then
				match = first_category == 'Z' or chr >= 0x09 and chr <= 0x0D;
			elseif char_class == "upper" then
				match = current_category == 'Lu';
			elseif char_class == "word" then
				match = first_category == 'L' or current_category == 'Nl' or current_category == 'Nd' or current_category == 'Pc';
			end;
		-- ASCII-only definitions, used when the unicode flag is off
		elseif char_class == "alnum" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "alpha" then
			match = chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "blank" then
			match = chr == 0x09 or chr == 0x20;
		elseif char_class == "cntrl" then
			match = chr <= 0x1F or chr == 0x7F;
		elseif char_class == "digit" then
			match = chr >= 0x30 and chr <= 0x39;
		elseif char_class == "graph" then
			match = chr >= 0x21 and chr <= 0x7E;
		elseif char_class == "lower" then
			match = chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "print" then
			match = chr >= 0x20 and chr <= 0x7E;
		elseif char_class == "punct" then
			-- table lookup: yields nil (not false) for non-punctuation
			match = class_ascii_punct[chr];
		elseif char_class == "space" then
			match = chr >= 0x09 and chr <= 0x0D or chr == 0x20;
		elseif char_class == "upper" then
			match = chr >= 0x41 and chr <= 0x5A;
		elseif char_class == "word" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A or chr == 0x5F;
		end;
		if negate then
			return not match;
		end;
		return match;
	elseif tkn_part[1] == "category" then
		-- tkn_part[2] is the negation flag, tkn_part[3] the category name
		local chr_category = u_categories[chr] or 'Cn';
		local category_v = tkn_part[3];
		local category_len = #category_v;
		if category_len == 3 then
			-- three-letter names are the special X* categories
			local match = false;
			if category_v == "Xan" or category_v == "Xwd" then
				match = chr_category:find("^[LN]") or category_v == "Xwd" and chr == 0x5F;
			elseif category_v == "Xps" or category_v == "Xsp" then
				match = chr_category:sub(1, 1) == 'Z' or chr >= 0x09 and chr <= 0x0D;
			elseif category_v == "Xuc" then
				match = tkn_char_match(xuc_chr, str_arr, i, flags, verb_flags);
			end;
			if tkn_part[2] then
				return not match;
			end
			return match;
		elseif chr_category:sub(1, category_len) == category_v then
			-- prefix comparison, so a one-letter name matches all its subcategories
			return not tkn_part[2];
		end;
		return tkn_part[2];
	elseif tkn_part[1] == 0x2E then
		-- '.': anything, except a newline unless dotAll is set
		return flags.dotAll or not is_newline(str_arr, i, verb_flags);
	elseif tkn_part[1] == 0x4E then
		-- \N: anything but a newline
		return not is_newline(str_arr, i, verb_flags);
	elseif tkn_part[1] == 0x52 then
		-- \R: a newline character, per verb_flags.newline_seq
		if verb_flags.newline_seq == 0 then
			-- CR, LF or CRLF
			return chr == 0x0A or chr == 0x0D;
		end;
		-- any unicode newline
		return chr == 0x0A or chr == 0x0B or chr == 0x0C or chr == 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
	end;
	return false;
end;
 448 | 
 449 | local function find_alternation(token, i, count)
 450 | 	while true do
 451 | 		local v = token[i];
 452 | 		local is_table = type(v) == "table";
 453 | 		if v == alternation then
 454 | 			return i, count;
 455 | 		elseif is_table and v[1] == 0x28 then
 456 | 			if count then
 457 | 				count += v.count;
 458 | 			end;
 459 | 			i = v[3];
 460 | 		elseif is_table and v[1] == "quantifier" and type(v[5]) == "table" and v[5][1] == 0x28 then
 461 | 			if count then
 462 | 				count += v[5].count;
 463 | 			end;
 464 | 			i = v[5][3];
 465 | 		elseif not v or is_table and v[1] == 0x29 then
 466 | 			return nil, count;
 467 | 		elseif count then
 468 | 			if is_table and v[1] == "quantifier" then
 469 | 				count += v[3];
 470 | 			else
 471 | 				count += 1;
 472 | 			end;
 473 | 		end;
 474 | 		i += 1;
 475 | 	end;
 476 | end;
 477 | 
 478 | local function re_rawfind(token, str_arr, init, flags, verb_flags, as_bool)
 479 | 	local tkn_i, str_i, start_i = 0, init, init;
 480 | 	local states = { };
 481 | 	while tkn_i do
 482 | 		if tkn_i == 0 then
 483 | 			tkn_i += 1;
 484 | 			local next_alt = find_alternation(token, tkn_i);
 485 | 			if next_alt then
 486 | 				table.insert(states, 1, { "alternation", next_alt, str_i });
 487 | 			end;
 488 | 			continue;
 489 | 		end;
 490 | 		local ctkn = token[tkn_i];
 491 | 		local tkn_type = type(ctkn) == "table" and ctkn[1];
 492 | 		if not ctkn then
 493 | 			break;
 494 | 		elseif ctkn == "ACCEPT" then
 495 | 			local not_lookaround = true;
 496 | 			local close_i = tkn_i;
 497 | 			repeat
 498 | 				close_i += 1;
 499 | 				local is_table = type(token[close_i]) == "table";
 500 | 				local close_i_tkn = token[close_i];
 501 | 				if is_table and (close_i_tkn[1] == 0x28 or close_i_tkn[1] == "quantifier" and type(close_i_tkn[5]) == "table" and close_i_tkn[5][1] == 0x28) then
 502 | 					close_i = close_i_tkn[1] == "quantifier" and close_i_tkn[5][3] or close_i_tkn[3];
 503 | 				elseif is_table and close_i_tkn[1] == 0x29 and (close_i_tkn[4] == 0x21 or close_i_tkn[4] == 0x3D) then
 504 | 					not_lookaround = false;
 505 | 					tkn_i = close_i;
 506 | 					break;
 507 | 				end;
 508 | 			until not close_i_tkn;
 509 | 			if not_lookaround then
 510 | 				break;
 511 | 			end;
 512 | 		elseif ctkn == "PRUNE" or ctkn == "SKIP" then
 513 | 			table.insert(states, 1, { ctkn, str_i });
 514 | 			tkn_i += 1;
 515 | 		elseif tkn_type == 0x28 then
 516 | 			table.insert(states, 1, { "group", tkn_i, str_i, nil, ctkn[2], ctkn[3], ctkn[4] });
 517 | 			tkn_i += 1;
 518 | 			local next_alt, count = find_alternation(token, tkn_i, (ctkn[4] == 0x21 or ctkn[4] == 0x3D) and ctkn[5] and 0);
 519 | 			if next_alt then
 520 | 				table.insert(states, 1, { "alternation", next_alt, str_i });
 521 | 			end;
 522 | 			if count then
 523 | 				str_i -= count;
 524 | 			end;
 525 | 		elseif tkn_type == 0x29 and ctkn[4] ~= 0x21 then
 526 | 			if ctkn[4] == 0x21 or ctkn[4] == 0x3D then
 527 | 				while true do
 528 | 					local selected_match_start;
 529 | 					local selected_state = table.remove(states, 1);
 530 | 					if selected_state[1] == "group" and selected_state[2] == ctkn[3] then
 531 | 						if (ctkn[4] == 0x21 or ctkn[4] == 0x3D) and not ctkn[5] then
 532 | 							str_i = selected_state[3];
 533 | 						end;
 534 | 						if selected_match_start then
 535 | 							table.insert(states, 1, selected_match_start);
 536 | 						end;
 537 | 						break;
 538 | 					elseif selected_state[1] == "matchStart" and not selected_match_start and ctkn[4] == 0x3D then
 539 | 						selected_match_start = selected_state;
 540 | 					end;
 541 | 				end;
 542 | 			elseif ctkn[4] == 0x3E then
 543 | 				repeat
 544 | 					local selected_state = table.remove(states, 1);
 545 | 				until not selected_state or selected_state[1] == "group" and selected_state[2] == ctkn[3];
 546 | 			else
 547 | 				for i, v in ipairs(states) do
 548 | 					if v[1] == "group" and v[2] == ctkn[3] then
 549 | 						if v.jmp then
 550 | 							-- recursive match
 551 | 							tkn_i = v.jmp;
 552 | 						end;
 553 | 						v[4] = str_i;
 554 | 						if v[7] == "quantifier" and v[10] + 1 < v[9] then
 555 | 							if token[ctkn[3]][4] ~= "lazy" or v[10] + 1 < v[8] then
 556 | 								tkn_i = ctkn[3];
 557 | 							end;
 558 | 							local ctkn1 = token[ctkn[3]];
 559 | 							local new_group = { "group", v[2], str_i, nil, ctkn1[5][2], ctkn1[5][3], "quantifier", ctkn1[2], ctkn1[3], v[10] + 1, v[11], ctkn1[4] };
 560 | 							table.insert(states, 1, new_group);
 561 | 							if v[11] then
 562 | 								table.insert(states, 1, { "alternation", v[11], str_i });
 563 | 							end;
 564 | 						end;
 565 | 						break;
 566 | 					end;
 567 | 				end;
 568 | 			end;
 569 | 			tkn_i += 1;
 570 | 		elseif tkn_type == 0x4B then
 571 | 			table.insert(states, 1, { "matchStart", str_i });
 572 | 			tkn_i += 1;
 573 | 		elseif tkn_type == 0x7C then
 574 | 			local close_i = tkn_i;
 575 | 			repeat
 576 | 				close_i += 1;
 577 | 				local is_table = type(token[close_i]) == "table";
 578 | 				local close_i_tkn = token[close_i];
 579 | 				if is_table and (close_i_tkn[1] == 0x28 or close_i_tkn[1] == "quantifier" and type(close_i_tkn[5]) == "table" and close_i_tkn[5][1] == 0x28) then
 580 | 					close_i = close_i_tkn[1] == "quantifier" and close_i_tkn[5][3] or close_i_tkn[3];
 581 | 				end;
 582 | 			until is_table and close_i_tkn[1] == 0x29 or not close_i_tkn;
 583 | 			if token[close_i] then
 584 | 				for _, v in ipairs(states) do
 585 | 					if v[1] == "group" and v[6] == close_i then
 586 | 						tkn_i = v[6];
 587 | 						break;
 588 | 					end;
 589 | 				end;
 590 | 			else
 591 | 				tkn_i = close_i;
 592 | 			end;
 593 | 		elseif tkn_type == "recurmatch" then
 594 | 			table.insert(states, 1, { "group", ctkn[3], str_i, nil, nil, token[ctkn[3]][3], nil, jmp = tkn_i });
 595 | 			tkn_i = ctkn[3] + 1;
 596 | 			local next_alt, count = find_alternation(token, tkn_i);
 597 | 			if next_alt then
 598 | 				table.insert(states, 1, { "alternation", next_alt, str_i });
 599 | 			end;
 600 | 		else
 601 | 			local match;
 602 | 			if ctkn == "FAIL" then
 603 | 				match = false;
 604 | 			elseif tkn_type == 0x29 then
 605 | 				repeat
 606 | 					local selected_state = table.remove(states, 1);
 607 | 				until selected_state[1] == "group" and selected_state[2] == ctkn[3];
 608 | 			elseif tkn_type == "quantifier" then
 609 | 				if type(ctkn[5]) == "table" and ctkn[5][1] == 0x28 then
 610 | 					local next_alt = find_alternation(token, tkn_i + 1);
 611 | 					if next_alt then
 612 | 						table.insert(states, 1, { "alternation", next_alt, str_i });
 613 | 					end;
 614 | 					table.insert(states, next_alt and 2 or 1, { "group", tkn_i, str_i, nil, ctkn[5][2], ctkn[5][3], "quantifier", ctkn[2], ctkn[3], 0, next_alt, ctkn[4] });
 615 | 					if ctkn[4] == "lazy" and ctkn[2] == 0 then
 616 | 						tkn_i = ctkn[5][3];
 617 | 					end;
 618 | 					match = true;
 619 | 				else
 620 | 					local start_i, end_i;
 621 | 					local pattern_count = 1;
 622 | 					local is_backref = type(ctkn[5]) == "table" and ctkn[5][1] == "backref";
 623 | 					if is_backref then
 624 | 						pattern_count = 0;
 625 | 						local group_n = ctkn[5][2];
 626 | 						for _, v in ipairs(states) do
 627 | 							if v[1] == "group" and v[5] == group_n then
 628 | 								start_i, end_i = v[3], v[4];
 629 | 								pattern_count = end_i - start_i;
 630 | 								break;
 631 | 							end;
 632 | 						end;
 633 | 					end;
 634 | 					local min_max_i = str_i + ctkn[2] * pattern_count;
 635 | 					local mcount = 0;
 636 | 					while mcount < ctkn[3] do
 637 | 						if is_backref then
 638 | 							if start_i and end_i then
 639 | 								local org_i = str_i;
 640 | 								if utf8_sub(str_arr.s, start_i, end_i) ~= utf8_sub(str_arr.s, org_i, str_i + pattern_count) then
 641 | 									break;
 642 | 								end;
 643 | 							else
 644 | 								break;
 645 | 							end;
 646 | 						elseif not tkn_char_match(ctkn[5], str_arr, str_i, flags, verb_flags) then
 647 | 							break;
 648 | 						end;
 649 | 						str_i += pattern_count;
 650 | 						mcount += 1;
 651 | 					end;
 652 | 					match = mcount >= ctkn[2];
 653 | 					if match and ctkn[4] ~= "possessive" then
 654 | 						if ctkn[4] == "lazy" then
 655 | 							min_max_i, str_i = str_i, min_max_i;
 656 | 						end;
 657 | 						table.insert(states, 1, { "quantifier", tkn_i, str_i, math.min(min_max_i, str_arr.n + 1), (ctkn[4] == "lazy" and 1 or -1) * pattern_count });
 658 | 					end;
 659 | 				end;
 660 | 			elseif tkn_type == "backref" then
 661 | 				local start_i, end_i;
 662 | 				local group_n = ctkn[2];
 663 | 				for _, v in ipairs(states) do
 664 | 					if v[1] == "group" and v[5] == group_n then
 665 | 						start_i, end_i = v[3], v[4];
 666 | 						break;
 667 | 					end;
 668 | 				end;
 669 | 				if start_i and end_i then
 670 | 					local org_i = str_i;
 671 | 					str_i += end_i - start_i;
 672 | 					match = utf8_sub(str_arr.s, start_i, end_i) == utf8_sub(str_arr.s, org_i, str_i);
 673 | 				end;
 674 | 			else
 675 | 				local chr = str_arr[str_i];
 676 | 				if tkn_type == 0x24 or tkn_type == 0x5A or tkn_type == 0x7A then
 677 | 					match = str_i == str_arr.n + 1 or tkn_type == 0x24 and flags.multiline and is_newline(str_arr, str_i + 1, verb_flags) or tkn_type == 0x5A and str_i == str_arr.n and is_newline(str_arr, str_i, verb_flags);
 678 | 				elseif tkn_type == 0x5E or tkn_type == 0x41 or tkn_type == 0x47 then
 679 | 					match = str_i == 1 or tkn_type == 0x5E and flags.multiline and is_newline(str_arr, str_i - 1, verb_flags) or tkn_type == 0x47 and str_i == init;
 680 | 				elseif tkn_type == 0x42 or tkn_type == 0x62 then
 681 | 					local start_m = str_i == 1 or flags.multiline and is_newline(str_arr, str_i - 1, verb_flags);
 682 | 					local end_m = str_i == str_arr.n + 1 or flags.multiline and is_newline(str_arr, str_i, verb_flags);
 683 | 					local w_m = tkn_char_match(ctkn[2], str_arr[str_i - 1], flags) and 0 or tkn_char_match(ctkn[2], chr, flags) and 1;
 684 | 					if w_m == 0 then
 685 | 						match = end_m or not tkn_char_match(ctkn[2], chr, flags);
 686 | 					elseif w_m then
 687 | 						match = start_m or not tkn_char_match(ctkn[2], str_arr[str_i - 1], flags);
 688 | 					end;
 689 | 					if tkn_type == 0x42 then
 690 | 						match = not match;
 691 | 					end;
 692 | 				else
 693 | 					match = tkn_char_match(ctkn, str_arr, str_i, flags, verb_flags);
 694 | 					str_i += 1;
 695 | 				end;
 696 | 			end;
 697 | 			if not match then
 698 | 				while true do
 699 | 					local prev_type, prev_state = states[1] and states[1][1], states[1];
 700 | 					if not prev_type or prev_type == "PRUNE" or prev_type == "SKIP" then
 701 | 						if prev_type then
 702 | 							table.clear(states);
 703 | 						end;
 704 | 						if start_i > str_arr.n then
 705 | 							if as_bool then
 706 | 								return false;
 707 | 							end;
 708 | 							return nil;
 709 | 						end;
 710 | 						start_i = prev_type == "SKIP" and prev_state[2] or start_i + 1;
 711 | 						tkn_i, str_i = 0, start_i;
 712 | 						break;
 713 | 					elseif prev_type == "alternation" then
 714 | 						tkn_i, str_i = prev_state[2], prev_state[3];
 715 | 						local next_alt, count = find_alternation(token, tkn_i + 1);
 716 | 						if next_alt then
 717 | 							prev_state[2] = next_alt;
 718 | 						else
 719 | 							table.remove(states, 1);
 720 | 						end;
 721 | 						if count then
 722 | 							str_i -= count;
 723 | 						end;
 724 | 						break;
 725 | 					elseif prev_type == "group" then
 726 | 						if prev_state[7] == "quantifier" then
 727 | 							if prev_state[12] == "greedy" and prev_state[10] >= prev_state[8]
 728 | 								or prev_state[12] == "lazy" and prev_state[10] < prev_state[9] and not prev_state[13] then
 729 | 								tkn_i, str_i = prev_state[12] == "greedy" and prev_state[6] or prev_state[2], prev_state[3];
 730 | 								if prev_state[12] == "greedy" then
 731 | 									table.remove(states, 1);
 732 | 									break;
 733 | 								elseif prev_state[10] >= prev_state[8] then
 734 | 									prev_state[13] = true;
 735 | 									break;
 736 | 								end;
 737 | 							end;
 738 | 						elseif prev_state[7] == 0x21 then
 739 | 							table.remove(states, 1);
 740 | 							tkn_i, str_i = prev_state[6], prev_state[3];
 741 | 							break;
 742 | 						end;
 743 | 					elseif prev_type == "quantifier" then
 744 | 						if math.sign(prev_state[4] - prev_state[3]) == math.sign(prev_state[5]) then
 745 | 							prev_state[3] += prev_state[5];
 746 | 							tkn_i, str_i = prev_state[2], prev_state[3];
 747 | 							break;
 748 | 						end;
 749 | 					end;
 750 | 					-- keep match out state and recursive state, can be safely removed
 751 | 					-- prevents infinite loop
 752 | 					table.remove(states, 1);
 753 | 				end;
 754 | 			end;
 755 | 			tkn_i += 1;
 756 | 		end;
 757 | 	end;
 758 | 	if as_bool then
 759 | 		return true;
 760 | 	end;
 761 | 	local match_start_ran = false;
 762 | 	local span = table.create(token.group_n);
 763 | 	span[0], span.n = { start_i, str_i }, token.group_n;
 764 | 	for _, v in ipairs(states) do
 765 | 		if v[1] == "matchStart" and not match_start_ran then
 766 | 			span[0][1], match_start_ran = v[2], true;
 767 | 		elseif v[1] == "group" and v[5] and not span[v[5]] then
 768 | 			span[v[5]] = { v[3], v[4] };
 769 | 		end;
 770 | 	end;
 771 | 	return span;
 772 | end;
 773 | 
 774 | --[[ Methods ]]--
 775 | re_m.test = check_re('RegEx', 'test', function(self, str, init)
 776 | 	return re_rawfind(self.token, to_str_arr(str, init), 1, self.flags, self.verb_flags, true);
 777 | end);
 778 | 
 779 | re_m.match = check_re('RegEx', 'match', function(self, str, init, source)
 780 | 	local span = re_rawfind(self.token, to_str_arr(str, init), 1, self.flags, self.verb_flags, false);
 781 | 	if not span then
 782 | 		return nil;
 783 | 	end;
 784 | 	return new_match(span, self.group_id, source, str);
 785 | end);
 786 | 
 787 | re_m.matchall = check_re('RegEx', 'matchall', function(self, str, init, source)
 788 | 	str = to_str_arr(str, init);
 789 | 	local i = 1;
 790 | 	return function()
 791 | 		local span = i <= str.n + 1 and re_rawfind(self.token, str, i, self.flags, self.verb_flags, false);
 792 | 		if not span then
 793 | 			return nil;
 794 | 		end;
 795 | 		i = span[0][2] + (span[0][1] >= span[0][2] and 1 or 0);
 796 | 		return new_match(span, self.group_id, source, str.s);
 797 | 	end;
 798 | end);
 799 | 
 800 | local function insert_tokenized_sub(repl_r, str, span, tkn)
 801 | 	for _, v in ipairs(tkn) do
 802 | 		if type(v) == "table" then
 803 | 			if v[1] == "condition" then
 804 | 				if span[v[2]] then
 805 | 					if v[3] then
 806 | 						insert_tokenized_sub(repl_r, str, span, v[3]);
 807 | 					else
 808 | 						table.move(str, span[v[2]][1], span[v[2]][2] - 1, #repl_r + 1, repl_r);
 809 | 					end;
 810 | 				elseif v[4] then
 811 | 					insert_tokenized_sub(repl_r, str, span, v[4]);
 812 | 				end;
 813 | 			else
 814 | 				table.move(v, 1, #v, #repl_r + 1, repl_r);
 815 | 			end;
 816 | 		elseif span[v] then
 817 | 			table.move(str, span[v][1], span[v][2] - 1, #repl_r + 1, repl_r);
 818 | 		end;
 819 | 	end;
 820 | 	repl_r.n = #repl_r;
 821 | 	return repl_r;
 822 | end;
 823 | 
 824 | re_m.sub = check_re('RegEx', 'sub', function(self, repl, str, n, repl_flag_str, source)
 825 | 	if repl_flag_str ~= nil and type(repl_flag_str) ~= "number" and type(repl_flag_str) ~= "string" then
 826 | 		error(string.format("invalid argument #5 to 'sub' (string expected, got %s)", typeof(repl_flag_str)), 3);
 827 | 	end
 828 | 	local repl_flags = {
 829 | 		l = false, o = false, u = false,
 830 | 	};
 831 | 	for f in string.gmatch(repl_flag_str or '', utf8.charpattern) do
 832 | 		if repl_flags[f] ~= false then
 833 | 			error("invalid regular expression substitution flag " .. f, 3);
 834 | 		end;
 835 | 		repl_flags[f] = true;
 836 | 	end;
 837 | 	local repl_type = type(repl);
 838 | 	if repl_type == "number" then
 839 | 		repl ..= '';
 840 | 	elseif repl_type ~= "string" and repl_type ~= "function" and (not repl_flags.o or repl_type ~= "table") then
 841 | 		error(string.format("invalid argument #2 to 'sub' (string/function%s expected, got %s)", repl_flags.o and "/table" or '', typeof(repl)), 3);
 842 | 	end;
 843 | 	if tonumber(n) then
 844 | 		n = tonumber(n);
 845 | 		if n <= -1 or n ~= n then
 846 | 			n = math.huge;
 847 | 		end;
 848 | 	elseif n ~= nil then
 849 | 		error(string.format("invalid argument #4 to 'sub' (number expected, got %s)", typeof(n)), 3);
 850 | 	else
 851 | 		n = math.huge;
 852 | 	end;
 853 | 	if n < 1 then
 854 | 		return str, 0;
 855 | 	end;
 856 | 	local min_repl_n = 0;
 857 | 	if repl_type == "string" then
 858 | 		repl = to_str_arr(repl);
 859 | 		if not repl_flags.l then
 860 | 			local i1 = 0;
 861 | 			local repl_r = table.create(3);
 862 | 			local group_n = self.token.group_n;
 863 | 			local conditional_c = { };
 864 | 			while i1 < repl.n do
 865 | 				local i2 = i1;
 866 | 				repeat
 867 | 					i2 += 1;
 868 | 				until not repl[i2] or repl[i2] == 0x24 or repl[i2] == 0x5C or (repl[i2] == 0x3A or repl[i2] == 0x7D) and conditional_c[1];
 869 | 				min_repl_n += i2 - i1 - 1;
 870 | 				if i2 - i1 > 1 then
 871 | 					table.insert(repl_r, table.move(repl, i1 + 1, i2 - 1, 1, table.create(i2 - i1 - 1)));
 872 | 				end;
 873 | 				if repl[i2] == 0x3A then
 874 | 					local current_conditional_c = conditional_c[1];
 875 | 					if current_conditional_c[2] then
 876 | 						error("malformed substitution pattern", 3);
 877 | 					end;
 878 | 					current_conditional_c[2] = table.move(repl_r, current_conditional_c[3], #repl_r, 1, table.create(#repl_r + 1 - current_conditional_c[3]));
 879 | 					for i3 = #repl_r, current_conditional_c[3], -1 do
 880 | 						repl_r[i3] = nil;
 881 | 					end;
 882 | 				elseif repl[i2] == 0x7D then
 883 | 					local current_conditional_c = table.remove(conditional_c, 1);
 884 | 					local second_c = table.move(repl_r, current_conditional_c[3], #repl_r, 1, table.create(#repl_r + 1 - current_conditional_c[3]));
 885 | 					for i3 = #repl_r, current_conditional_c[3], -1 do
 886 | 						repl_r[i3] = nil;
 887 | 					end;
 888 | 					table.insert(repl_r, { "condition", current_conditional_c[1], current_conditional_c[2] ~= true and (current_conditional_c[2] or second_c), current_conditional_c[2] and second_c });
 889 | 				elseif repl[i2] then
 890 | 					i2 += 1;
 891 | 					local subst_c = repl[i2];
 892 | 					if not subst_c then
 893 | 						if repl[i2 - 1] == 0x5C then
 894 | 							error("replacement string must not end with a trailing backslash", 3);
 895 | 						end;
 896 | 						local prev_repl_f = repl_r[#repl_r];
 897 | 						if type(prev_repl_f) == "table" then
 898 | 							table.insert(prev_repl_f, repl[i2 - 1]);
 899 | 						else
 900 | 							table.insert(repl_r, { repl[i2 - 1] });
 901 | 						end;
 902 | 					elseif subst_c == 0x5C and repl[i2 - 1] == 0x24 then
 903 | 						local prev_repl_f = repl_r[#repl_r];
 904 | 						if type(prev_repl_f) == "table" then
 905 | 							table.insert(prev_repl_f, 0x24);
 906 | 						else
 907 | 							table.insert(repl_r, { 0x24 });
 908 | 						end;
 909 | 						i2 -= 1;
 910 | 						min_repl_n += 1;
 911 | 					elseif subst_c == 0x30 then
 912 | 						table.insert(repl_r, 0);
 913 | 					elseif subst_c > 0x30 and subst_c <= 0x39 then
 914 | 						local start_i2 = i2;
 915 | 						local group_i = subst_c - 0x30;
 916 | 						while repl[i2 + 1] and repl[i2 + 1] >= 0x30 and repl[i2 + 1] <= 0x39 do
 917 | 							group_i ..= repl[i2 + 1] - 0x30;
 918 | 							i2 += 1;
 919 | 						end;
 920 | 						group_i = tonumber(group_i);
 921 | 						if not repl_flags.u and group_i > group_n then
 922 | 							error("reference to non-existent subpattern", 3);
 923 | 						end;
 924 | 						table.insert(repl_r, group_i);
 925 | 					elseif subst_c == 0x7B and repl[i2 - 1] == 0x24 then
 926 | 						i2 += 1;
 927 | 						local start_i2 = i2;
 928 | 						while repl[i2] and
 929 | 							(repl[i2] >= 0x30 and repl[i2] <= 0x39
 930 | 								or repl[i2] >= 0x41 and repl[i2] <= 0x5A
 931 | 								or repl[i2] >= 0x61 and repl[i2] <= 0x7A
 932 | 								or repl[i2] == 0x5F) do
 933 | 							i2 += 1;
 934 | 						end;
 935 | 						if (repl[i2] == 0x7D or repl[i2] == 0x3A and (repl[i2 + 1] == 0x2B or repl[i2 + 1] == 0x2D)) and i2 ~= start_i2 then
 936 | 							local group_k = utf8_sub(repl.s, start_i2, i2);
 937 | 							if repl[start_i2] >= 0x30 and repl[start_i2] <= 0x39 then
 938 | 								group_k = tonumber(group_k);
 939 | 								if not repl_flags.u and group_k > group_n then
 940 | 									error("reference to non-existent subpattern", 3);
 941 | 								end;
 942 | 							else
 943 | 								group_k = self.group_id[group_k];
 944 | 								if not repl_flags.u and (not group_k or group_k > group_n) then
 945 | 									error("reference to non-existent subpattern", 3);
 946 | 								end;
 947 | 							end;
 948 | 							if repl[i2] == 0x3A then
 949 | 								i2 += 1;
 950 | 								table.insert(conditional_c, { group_k, repl[i2] == 0x2D, #repl_r + 1 });
 951 | 							else
 952 | 								table.insert(repl_r, group_k);
 953 | 							end;
 954 | 						else
 955 | 							error("malformed substitution pattern", 3);
 956 | 						end;
 957 | 					else
 958 | 						local c_escape_char;
 959 | 						if repl[i2 - 1] == 0x24 then
 960 | 							if subst_c ~= 0x24 then
 961 | 								local prev_repl_f = repl_r[#repl_r];
 962 | 								if type(prev_repl_f) == "table" then
 963 | 									table.insert(prev_repl_f, 0x24);
 964 | 								else
 965 | 									table.insert(repl_r, { 0x24 });
 966 | 								end;
 967 | 							end;
 968 | 						else
 969 | 							c_escape_char = escape_chars[repl[i2]];
 970 | 							if type(c_escape_char) ~= "number" then
 971 | 								c_escape_char = nil;
 972 | 							end;
 973 | 						end;
 974 | 						local prev_repl_f = repl_r[#repl_r];
 975 | 						if type(prev_repl_f) == "table" then
 976 | 							table.insert(prev_repl_f, c_escape_char or repl[i2]);
 977 | 						else
 978 | 							table.insert(repl_r, { c_escape_char or repl[i2] });
 979 | 						end;
 980 | 						min_repl_n += 1;
 981 | 					end;
 982 | 				end;
 983 | 				i1 = i2;
 984 | 			end;
 985 | 			if conditional_c[1] then
 986 | 				error("malformed substitution pattern", 3);
 987 | 			end;
 988 | 			if not repl_r[2] and type(repl_r[1]) == "table" and repl_r[1][1] ~= "condition" then
 989 | 				repl, repl.n = repl_r[1], #repl_r[1];
 990 | 			else
 991 | 				repl, repl_type = repl_r, "subst_string";
 992 | 			end;
 993 | 		end;
 994 | 	end;
 995 | 	str = to_str_arr(str);
 996 | 	local incr, i0, count = 0, 1, 0;
 997 | 	while i0 <= str.n + incr + 1 do
 998 | 		local span = re_rawfind(self.token, str, i0, self.flags, self.verb_flags, false);
 999 | 		if not span then
1000 | 			break;
1001 | 		end;
1002 | 		local repl_r;
1003 | 		if repl_type == "string" then
1004 | 			repl_r = repl;
1005 | 		elseif repl_type == "subst_string" then
1006 | 			repl_r = insert_tokenized_sub(table.create(min_repl_n), str, span, repl);
1007 | 		else
1008 | 			local re_match;
1009 | 			local repl_c;
1010 | 			if repl_type == "table" then
1011 | 				re_match = utf8_sub(str.s, span[0][1], span[0][2]);
1012 | 				repl_c = repl[re_match];
1013 | 			else
1014 | 				re_match = new_match(span, self.group_id, source, str.s);
1015 | 				repl_c = repl(re_match);
1016 | 			end;
1017 | 			if repl_c == re_match or repl_flags.o and not repl_c then
1018 | 				local repl_n = span[0][2] - span[0][1];
1019 | 				repl_r = table.move(str, span[0][1], span[0][2] - 1, 1, table.create(repl_n));
1020 | 				repl_r.n = repl_n;
1021 | 			elseif type(repl_c) == "string" then
1022 | 				repl_r = to_str_arr(repl_c);
1023 | 			elseif type(repl_c) == "number" then
1024 | 				repl_r = to_str_arr(repl_c .. '');
1025 | 			elseif repl_flags.o then
1026 | 				error(string.format("invalid replacement value (a %s)", type(repl_c)), 3);
1027 | 			else
1028 | 				repl_r = { n = 0 };
1029 | 			end;
1030 | 		end;
1031 | 		local match_len = span[0][2] - span[0][1];
1032 | 		local repl_len = math.min(repl_r.n, match_len);
1033 | 		for i1 = 0, repl_len - 1 do
1034 | 			str[span[0][1] + i1] = repl_r[i1 + 1];
1035 | 		end;
1036 | 		local i1 = span[0][1] + repl_len;
1037 | 		i0 = span[0][2];
1038 | 		if match_len > repl_r.n then
1039 | 			for i2 = 1, match_len - repl_r.n do
1040 | 				table.remove(str, i1);
1041 | 				incr -= 1;
1042 | 				i0 -= 1;
1043 | 			end;
1044 | 		elseif repl_r.n > match_len then
1045 | 			for i2 = 1, repl_r.n - match_len do
1046 | 				table.insert(str, i1 + i2 - 1, repl_r[repl_len + i2]);
1047 | 				incr += 1;
1048 | 				i0 += 1;
1049 | 			end;
1050 | 		end;
1051 | 		if match_len <= 0 then
1052 | 			i0 += 1;
1053 | 		end;
1054 | 		count += 1;
1055 | 		if n < count + 1 then
1056 | 			break;
1057 | 		end;
1058 | 	end;
1059 | 	return from_str_arr(str), count;
1060 | end);
1061 | 
-- Splits `str` at every match of the pattern.
--   str  subject string
--   n    optional maximum number of splits; nil, NaN or any value <= -1 means
--        unlimited
-- Returns an array of the pieces between matches; the remainder of the string
-- is always appended as the last element.
re_m.split = check_re('RegEx', 'split', function(self, str, n)
	local limit = tonumber(n);
	if limit then
		if limit <= -1 or limit ~= limit then
			limit = math.huge;
		end;
	elseif n ~= nil then
		error(string.format("invalid argument #3 to 'split' (number expected, got %s)", typeof(n)), 3);
	else
		limit = math.huge;
	end;
	local subject = to_str_arr(str);
	local pieces = { };
	-- `cursor` is where the next search starts; `step_back` is 1 right after an
	-- empty match, because the cursor was pushed one position past it.
	local cursor, attempts, step_back = 1, 0, 0;
	while cursor <= subject.n + 1 do
		attempts += 1;
		if limit < attempts then
			break;
		end;
		local span = re_rawfind(self.token, subject, cursor, self.flags, self.verb_flags, false);
		if not span then
			break;
		end;
		local match_start, match_end = span[0][1], span[0][2];
		table.insert(pieces, utf8_sub(subject.s, cursor - step_back, match_start));
		if match_start >= match_end then
			step_back = 1;
		else
			step_back = 0;
		end;
		cursor = match_end + step_back;
	end;
	table.insert(pieces, string.sub(subject.s, utf8.offset(subject.s, cursor - step_back)));
	return pieces;
end);
1090 | 
1091 | --
-- Index handler: a truthy entry in `re_m` wins; otherwise the key is looked
-- up in the flags table of the object registered for `self` in `proxy`.
local function re_index(self, index)
	local method = re_m[index];
	if method then
		return method;
	end;
	return proxy[self].flags[index];
end;
1095 | 
-- String conversion: the stored pattern representation followed by the stored
-- flag representation of the object registered for `self` in `proxy`.
local function re_tostr(self)
	local pattern_repr = proxy[self].pattern_repr;
	local flag_repr = proxy[self].flag_repr;
	return pattern_repr .. flag_repr;
end;
1099 | --
1100 | 
-- Code points the tokenizer accepts right after "(?" when none of its more
-- specific group forms applied; anything else is an invalid group structure.
local other_valid_group_char = {
	[0x21] = true, -- '!' lookaround
	[0x3A] = true, -- ':' non-capturing group
	[0x3D] = true, -- '=' lookaround
	[0x3E] = true, -- '>' atomic group
	[0x7C] = true, -- '|' branch reset
};
1111 | 
1112 | local function tokenize_ptn(codes, flags)
1113 | 	if flags.unicode and not options.unicodeData then
1114 | 		return "options.unicodeData cannot be turned off while having unicode flag";
1115 | 	end;
1116 | 	local i, len = 1, codes.n;
1117 | 	local group_n = 0;
1118 | 	local outln, group_id, verb_flags = { }, { }, {
1119 | 		newline = 1, newline_seq = 1, not_empty = 0,
1120 | 	};
1121 | 	while i <= len do
1122 | 		local c = codes[i];
1123 | 		if c == 0x28 then
1124 | 			-- Match
1125 | 			local ret;
1126 | 			if codes[i + 1] == 0x2A then
1127 | 				i += 2;
1128 | 				local start_i = i;
1129 | 				while codes[i]
1130 | 					and (codes[i] >= 0x30 and codes[i] <= 0x39
1131 | 					or codes[i] >= 0x41 and codes[i] <= 0x5A
1132 | 					or codes[i] >= 0x61 and codes[i] <= 0x7A
1133 | 					or codes[i] == 0x5F or codes[i] == 0x3A) do
1134 | 					i += 1;
1135 | 				end;
1136 | 				if codes[i] ~= 0x29 and codes[i - 1] ~= 0x3A then
1137 | 					-- fallback as normal and ( can't be repeated
1138 | 					return "quantifier doesn't follow a repeatable pattern";
1139 | 				end;
1140 | 				local selected_verb = utf8_sub(codes.s, start_i, i);
1141 | 				if selected_verb == "positive_lookahead:" or selected_verb == "negative_lookhead:"
1142 | 					or selected_verb == "positive_lookbehind:" or selected_verb == "negative_lookbehind:"
1143 | 					or selected_verb:find("^[pn]l[ab]:$") then
1144 | 					ret = { 0x28, nil, nil, selected_verb:find('^n') and 0x21 or 0x3D, selected_verb:find('b', 3, true) and 1 };
1145 | 				elseif selected_verb == "atomic:" then
1146 | 					ret = { 0x28, nil, nil, 0x3E, nil };
1147 | 				elseif selected_verb == "ACCEPT" or selected_verb == "FAIL" or selected_verb == 'F' or selected_verb == "PRUNE" or selected_verb == "SKIP" then
1148 | 					ret = selected_verb == 'F' and "FAIL" or selected_verb;
1149 | 				else
1150 | 					if line_verbs[selected_verb] then
1151 | 						verb_flags.newline = selected_verb;
1152 | 					elseif selected_verb == "BSR_ANYCRLF" or selected_verb == "BSR_UNICODE" then
1153 | 						verb_flags.newline_seq = selected_verb == "BSR_UNICODE" and 1 or 0;
1154 | 					elseif selected_verb == "NOTEMPTY" or selected_verb == "NOTEMPTY_ATSTART" then
1155 | 						verb_flags.not_empty = selected_verb == "NOTEMPTY" and 1 or 2;
1156 | 					else
1157 | 						return "unknown or malformed verb";
1158 | 					end;
1159 | 					if outln[1] then
1160 | 						return "this verb must be placed at the beginning of the regex";
1161 | 					end;
1162 | 				end;
1163 | 			elseif codes[i + 1] == 0x3F then
1164 | 				-- ? syntax
1165 | 				i += 2;
1166 | 				if codes[i] == 0x23 then
1167 | 					-- comments
1168 | 					i = table.find(codes, 0x29, i);
1169 | 					if not i then
1170 | 						return "unterminated parenthetical";
1171 | 					end;
1172 | 					i += 1;
1173 | 					continue;
1174 | 				elseif not codes[i] then
1175 | 					return "unterminated parenthetical";
1176 | 				end;
1177 | 				ret = { 0x28, nil, nil, codes[i], nil };
1178 | 				if codes[i] == 0x30 and codes[i + 1] == 0x29 then
1179 | 					-- recursive match entire pattern
1180 | 					ret[1], ret[2], ret[3], ret[5] = "recurmatch", 0, 0, nil;
1181 | 				elseif codes[i] > 0x30 and codes[i] <= 0x39 then
1182 | 					-- recursive match
1183 | 					local org_i = i;
1184 | 					i += 1;
1185 | 					while codes[i] >= 0x30 and codes[i] <= 0x30 do
1186 | 						i += 1;
1187 | 					end;
1188 | 					if codes[i] ~= 0x29 then
1189 | 						return "invalid group structure";
1190 | 					end;
1191 | 					ret[1], ret[2], ret[4] = "recurmatch", tonumber(utf8_sub(codes.s, org_i, i)), nil;
1192 | 				elseif codes[i] == 0x3C and codes[i + 1] == 0x21 or codes[i + 1] == 0x3D then
1193 | 					-- lookbehinds
1194 | 					i += 1;
1195 | 					ret[4], ret[5] = codes[i], 1;
1196 | 				elseif codes[i] == 0x7C then
1197 | 					-- branch reset
1198 | 					ret[5] = group_n;
1199 | 				elseif codes[i] == 0x50 or codes[i] == 0x3C or codes[i] == 0x27 then
1200 | 					if codes[i] == 0x50 then
1201 | 						i += 1;
1202 | 					end;
1203 | 					if codes[i] == 0x3D then
1204 | 						-- backref
1205 | 						local start_i = i + 1;
1206 | 						while codes[i] and
1207 | 							(codes[i] >= 0x30 and codes[i] <= 0x39
1208 | 								or codes[i] >= 0x41 and codes[i] <= 0x5A
1209 | 								or codes[i] >= 0x61 and codes[i] <= 0x7A
1210 | 								or codes[i] == 0x5F) do
1211 | 							i += 1;
1212 | 						end;
1213 | 						if not codes[i] then
1214 | 							return "unterminated parenthetical";
1215 | 						elseif codes[i] ~= 0x29 or i == start_i then
1216 | 							return "invalid group structure";
1217 | 						end;
1218 | 						ret = { "backref", utf8_sub(codes.s, start_i, i) };
1219 | 					elseif codes[i] == 0x3C or codes[i - 1] ~= 0x50 and codes[i] == 0x27 then
1220 | 						-- named capture
1221 | 						local delimiter = codes[i] == 0x27 and 0x27 or 0x3E;
1222 | 						local start_i = i + 1;
1223 | 						i += 1;
1224 | 						if codes[i] == 0x29 then
1225 | 							return "missing character in subpattern";
1226 | 						elseif codes[i] >= 0x30 and codes[i] <= 0x39 then
1227 | 							return "subpattern name must not begin with a digit";
1228 | 						elseif not (codes[i] >= 0x41 and codes[i] <= 0x5A or codes[i] >= 0x61 and codes[i] <= 0x7A or codes[i] == 0x5F) then
1229 | 							return "invalid character in subpattern";
1230 | 						end;
1231 | 						i += 1;
1232 | 						while codes[i] and
1233 | 							(codes[i] >= 0x30 and codes[i] <= 0x39
1234 | 								or codes[i] >= 0x41 and codes[i] <= 0x5A
1235 | 								or codes[i] >= 0x61 and codes[i] <= 0x7A
1236 | 								or codes[i] == 0x5F) do
1237 | 							i += 1;
1238 | 						end;
1239 | 						if not codes[i] then
1240 | 							return "unterminated parenthetical";
1241 | 						elseif codes[i] ~= delimiter then
1242 | 							return "invalid character in subpattern";
1243 | 						end;
1244 | 						local name = utf8_sub(codes.s, start_i, i);
1245 | 						group_n += 1;
1246 | 						if (group_id[name] or group_n) ~= group_n then
1247 | 							return "subpattern name already exists";
1248 | 						end;
1249 | 						for name1, group_n1 in pairs(group_id) do
1250 | 							if name ~= name1 and group_n == group_n1 then
1251 | 								return "different names for subpatterns of the same number aren't permitted";
1252 | 							end;
1253 | 						end;
1254 | 						group_id[name] = group_n;
1255 | 						ret[2], ret[4] = group_n, nil;
1256 | 					else
1257 | 						return "invalid group structure";
1258 | 					end;
1259 | 				elseif not other_valid_group_char[codes[i]] then
1260 | 					return "invalid group structure";
1261 | 				end;
1262 | 			else
1263 | 				group_n += 1;
1264 | 				ret = { 0x28, group_n, nil, nil };
1265 | 			end;
1266 | 			if ret then
1267 | 				table.insert(outln, ret);
1268 | 			end;
1269 | 		elseif c == 0x29 then
1270 | 			-- Close parenthesis
1271 | 			local i1 = #outln + 1;
1272 | 			local lookbehind_c = -1;
1273 | 			local current_lookbehind_c = 0;
1274 | 			local max_c, group_c = 0, 0;
1275 | 			repeat
1276 | 				i1 -= 1;
1277 | 				local v, is_table = outln[i1], type(outln[i1]) == "table";
1278 | 				if is_table and v[1] == 0x28 then
1279 | 					group_c += 1;
1280 | 					if current_lookbehind_c and v.count then
1281 | 						current_lookbehind_c += v.count;
1282 | 					end;
1283 | 					if not v[3] then
1284 | 						if v[4] == 0x7C then
1285 | 							group_n = v[5] + math.max(max_c, group_c);
1286 | 						end;
1287 | 						if current_lookbehind_c ~= lookbehind_c and lookbehind_c ~= -1 then
1288 | 							lookbehind_c = nil;
1289 | 						else
1290 | 							lookbehind_c = current_lookbehind_c;
1291 | 						end;
1292 | 						break;
1293 | 					end;
1294 | 				elseif v == alternation then
1295 | 					if current_lookbehind_c ~= lookbehind_c and lookbehind_c ~= -1 then
1296 | 						lookbehind_c, current_lookbehind_c = nil, nil;
1297 | 					else
1298 | 						lookbehind_c, current_lookbehind_c = current_lookbehind_c, 0;
1299 | 					end;
1300 | 					max_c, group_c = math.max(max_c, group_c), 0;
1301 | 				elseif current_lookbehind_c then
1302 | 					if is_table and v[1] == "quantifier" then
1303 | 						if v[2] == v[3] then
1304 | 							current_lookbehind_c += v[2];
1305 | 						else
1306 | 							current_lookbehind_c = nil;
1307 | 						end;
1308 | 					else
1309 | 						current_lookbehind_c += 1;
1310 | 					end;
1311 | 				end;
1312 | 			until i1 < 1;
1313 | 			if i1 < 1 then
1314 | 				return "unmatched ) in regular expression";
1315 | 			end;
1316 | 			local v = outln[i1];
1317 | 			local outln_len_p_1 = #outln + 1;
1318 | 			local ret = { 0x29, v[2], i1, v[4], v[5], count = lookbehind_c };
1319 | 			if (v[4] == 0x21 or v[4] == 0x3D) and v[5] and not lookbehind_c then
1320 | 				return "lookbehind assertion is not fixed width";
1321 | 			end;
1322 | 			v[3] = outln_len_p_1;
1323 | 			table.insert(outln, ret);
1324 | 		elseif c == 0x2E then
1325 | 			table.insert(outln, dot);
1326 | 		elseif c == 0x5B then
1327 | 			-- Character set
1328 | 			local negate, char_class = false, nil;
1329 | 			i += 1;
1330 | 			local start_i = i;
1331 | 			if codes[i] == 0x5E then
1332 | 				negate = true;
1333 | 				i += 1;
1334 | 			elseif codes[i] == 0x2E or codes[i] == 0x3A or codes[i] == 0x3D then
1335 | 				-- POSIX character classes
1336 | 				char_class = codes[i];
1337 | 			end;
1338 | 			local ret;
1339 | 			if codes[i] == 0x5B or codes[i] == 0x5C then
1340 | 				ret = { };
1341 | 			else
1342 | 				ret = { codes[i] };
1343 | 				i += 1;
1344 | 			end;
1345 | 			while codes[i] ~= 0x5D do
1346 | 				if not codes[i] then
1347 | 					return "unterminated character class";
1348 | 				elseif codes[i] == 0x2D and ret[1] and type(ret[1]) == "number" then
1349 | 					if codes[i + 1] == 0x5D then
1350 | 						table.insert(ret, 1, 0x2D);
1351 | 					else
1352 | 						i += 1;
1353 | 						local ret_c = codes[i];
1354 | 						if ret_c == 0x5B then
1355 | 							if codes[i + 1] == 0x2E or codes[i + 1] == 0x3A or codes[i + 1] == 0x3D then
1356 | 								-- Check for POSIX character class, name does not matter
1357 | 								local i1 = i + 2;
1358 | 								repeat
1359 | 									i1 = table.find(codes, 0x5D, i1);
1360 | 								until not i1 or codes[i1 - 1] ~= 0x5C;
1361 | 								if not i1 then
1362 | 									return "unterminated character class";
1363 | 								elseif codes[i1 - 1] == codes[i + 1] and i1 - 1 ~= i + 1 then
1364 | 									return "invalid range in character class";
1365 | 								end;
1366 | 							end;
1367 | 							if ret[1] > 0x5B then
1368 | 								return "invalid range in character class";
1369 | 							end;
1370 | 						elseif ret_c == 0x5C then
1371 | 							i += 1;
1372 | 							if codes[i] == 0x78 then
1373 | 								local radix0, radix1;
1374 | 								i += 1;
1375 | 								if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66 then
1376 | 									radix0 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
1377 | 									i += 1;
1378 | 									if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66 then
1379 | 										radix1 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
1380 | 									else
1381 | 										i -= 1;
1382 | 									end;
1383 | 								else
1384 | 									i -= 1;
1385 | 								end;
1386 | 								ret_c = radix0 and (radix1 and 16 * radix0 + radix1 or radix0) or 0;
1387 | 							elseif codes[i] >= 0x30 and codes[i] <= 0x37 then
1388 | 								local radix0, radix1, radix2 = codes[i] - 0x30, nil, nil;
1389 | 								i += 1;
1390 | 								if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
1391 | 									radix1 = codes[i] - 0x30;
1392 | 									i += 1;
1393 | 									if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
1394 | 										radix2 = codes[i] - 0x30;
1395 | 									else
1396 | 										i -= 1;
1397 | 									end;
1398 | 								else
1399 | 									i -= 1;
1400 | 								end;
1401 | 								ret_c = radix1 and (radix2 and 64 * radix0 + 8 * radix1 + radix2 or 8 * radix0 + radix1) or radix0;
1402 | 							else
1403 | 								ret_c = escape_chars[codes[i]] or codes[i];
1404 | 								if type(ret_c) ~= "number" then
1405 | 									return "invalid range in character class";
1406 | 								end;
1407 | 							end;
1408 | 						elseif ret[1] > ret_c then
1409 | 							return "invalid range in character class";
1410 | 						end;
1411 | 						ret[1] = { "range", ret[1], ret_c };
1412 | 					end;
1413 | 				elseif codes[i] == 0x5B then
1414 | 					if codes[i + 1] == 0x2E or codes[i + 1] == 0x3A or codes[i + 1] == 0x3D then
1415 | 						local i1 = i + 2;
1416 | 						repeat
1417 | 							i1 = table.find(codes, 0x5D, i1);
1418 | 						until not i1 or codes[i1 - 1] ~= 0x5C;
1419 | 						if not i1 then
1420 | 							return "unterminated character class";
1421 | 						elseif codes[i1 - 1] ~= codes[i + 1] or i1 - 1 == i + 1 then
1422 | 							table.insert(ret, 1, 0x5B);
1423 | 						elseif codes[i1 - 1] == 0x2E or codes[i1 - 1] == 0x3D then
1424 | 							return "POSIX collating elements aren't supported";
1425 | 						elseif codes[i1 - 1] == 0x3A then
1426 | 							-- I have no plans to support escape codes (\) in character class names
1427 | 							local negate = codes[i + 3] == 0x5E;
1428 | 							local class_name = utf8_sub(codes.s, i + (negate and 3 or 2), i1 - 1);
1429 | 							--  If not valid then throw an error
1430 | 							if not posix_class_names[class_name] then
1431 | 								return "unknown POSIX class name";
1432 | 							end;
1433 | 							table.insert(ret, 1, { "class", class_name, negate });
1434 | 							i = i1;
1435 | 						end;
1436 | 					else
1437 | 						table.insert(ret, 1, 0x5B);
1438 | 					end;
1439 | 				elseif codes[i] == 0x5C then
1440 | 					i += 1;
1441 | 					if codes[i] == 0x78 then
1442 | 						local radix0, radix1;
1443 | 						i += 1;
1444 | 						if codes[i] == 0x7B then
1445 | 							i += 1;
1446 | 							local org_i = i;
1447 | 							while codes[i] and
1448 | 								(codes[i] >= 0x30 and codes[i] <= 0x39
1449 | 									or codes[i] >= 0x41 and codes[i] <= 0x46
1450 | 									or codes[i] >= 0x61 and codes[i] <= 0x66) do
1451 | 								i += 1;
1452 | 							end;
1453 | 							if codes[i] ~= 0x7D or i == org_i then
1454 | 								return "malformed hexadecimal character";
1455 | 							elseif i - org_i > 4 then
1456 | 								return "character offset too large";
1457 | 							end;
1458 | 							table.insert(ret, 1, tonumber(utf8_sub(codes.s, org_i, i), 16));
1459 | 						else
1460 | 							if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66 then
1461 | 								radix0 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
1462 | 								i += 1;
1463 | 								if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66 then
1464 | 									radix1 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
1465 | 								else
1466 | 									i -= 1;
1467 | 								end;
1468 | 							else
1469 | 								i -= 1;
1470 | 							end;
1471 | 							table.insert(ret, 1, radix0 and (radix1 and 16 * radix0 + radix1 or radix0) or 0);
1472 | 						end;
1473 | 					elseif codes[i] >= 0x30 and codes[i] <= 0x37 then
1474 | 						local radix0, radix1, radix2 = codes[i] - 0x30, nil, nil;
1475 | 						i += 1;
1476 | 						if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
1477 | 							radix1 = codes[i] - 0x30;
1478 | 							i += 1;
1479 | 							if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
1480 | 								radix2 = codes[i] - 0x30;
1481 | 							else
1482 | 								i -= 1;
1483 | 							end;
1484 | 						else
1485 | 							i -= 1;
1486 | 						end;
1487 | 						table.insert(ret, 1, radix1 and (radix2 and 64 * radix0 + 8 * radix1 + radix2 or 8 * radix0 + radix1) or radix0);
1488 | 					elseif codes[i] == 0x45 then
1489 | 						-- intentionally left blank, \E that's not preceded \Q is ignored
1490 | 					elseif codes[i] == 0x51 then
1491 | 						local start_i = i + 1;
1492 | 						repeat
1493 | 							i = table.find(codes, 0x5C, i + 1);
1494 | 						until not i or codes[i + 1] == 0x45;
1495 | 						table.move(codes, start_i, i and i - 1 or #codes, #outln + 1, outln);
1496 | 						if not i then
1497 | 							break;
1498 | 						end;
1499 | 						i += 1;
1500 | 					elseif codes[i] == 0x4E then
1501 | 						if codes[i + 1] == 0x7B and codes[i + 2] == 0x55 and codes[i + 3] == 0x2B and flags.unicode then
1502 | 							i += 4;
1503 | 							local start_i = i;
1504 | 							while codes[i] and
1505 | 								(codes[i] >= 0x30 and codes[i] <= 0x39
1506 | 									or codes[i] >= 0x41 and codes[i] <= 0x46
1507 | 									or codes[i] >= 0x61 and codes[i] <= 0x66) do
1508 | 								i += 1;
1509 | 							end;
1510 | 							if codes[i] ~= 0x7D or i == start_i then
1511 | 								return "malformed Unicode code point";
1512 | 							end;
1513 | 							local code_point = tonumber(utf8_sub(codes.s, start_i, i));
1514 | 							table.insert(ret, 1, code_point);
1515 | 						else
1516 | 							return "invalid escape sequence";
1517 | 						end;
1518 | 					elseif codes[i] == 0x50 or codes[i] == 0x70 then
1519 | 						if not options.unicodeData then
1520 | 							return "options.unicodeData cannot be turned off when using \\p";
1521 | 						end;
1522 | 						i += 1;
1523 | 						if codes[i] ~= 0x7B then
1524 | 							local c_name = utf8.char(codes[i] or 0);
1525 | 							if not valid_categories[c_name] then
1526 | 								return "unknown or malformed script name";
1527 | 							end;
1528 | 							table.insert(ret, 1, { "category", false, c_name });
1529 | 						else
1530 | 							local negate = codes[i] == 0x50;
1531 | 							i += 1;
1532 | 							if codes[i] == 0x5E then
1533 | 								i += 1;
1534 | 								negate = not negate;
1535 | 							end;
1536 | 							local start_i = i;
1537 | 							while codes[i] and
1538 | 								(codes[i] >= 0x30 and codes[i] <= 0x39
1539 | 									or codes[i] >= 0x41 and codes[i] <= 0x5A
1540 | 									or codes[i] >= 0x61 and codes[i] <= 0x7A
1541 | 									or codes[i] == 0x5F) do
1542 | 								i += 1;
1543 | 							end;
1544 | 							if codes[i] ~= 0x7D then
1545 | 								return "unknown or malformed script name";
1546 | 							end;
1547 | 							local c_name = utf8_sub(codes.s, start_i, i);
1548 | 							local script_set = chr_scripts[c_name];
1549 | 							if script_set then
1550 | 								table.insert(ret, 1, { "charset", negate, script_set });
1551 | 							elseif not valid_categories[c_name] then
1552 | 								return "unknown or malformed script name";
1553 | 							else
1554 | 								table.insert(ret, 1, { "category", negate, c_name });
1555 | 							end;
1556 | 						end;
1557 | 					elseif codes[i] == 0x6F then
1558 | 						i += 1;
1559 | 						if codes[i] ~= 0x7B then
1560 | 							return "malformed octal code";
1561 | 						end;
1562 | 						i += 1;
1563 | 						local org_i = i;
1564 | 						while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 do
1565 | 							i += 1;
1566 | 						end;
1567 | 						if codes[i] ~= 0x7D or i == org_i then
1568 | 							return "malformed octal code";
1569 | 						end;
1570 | 						local ret_chr = tonumber(utf8_sub(codes.s, org_i, i), 8);
1571 | 						if ret_chr > 0xFFFF then
1572 | 							return "character offset too large";
1573 | 						end;
1574 | 						table.insert(ret, 1, ret_chr);
1575 | 					else
1576 | 						local esc_char = escape_chars[codes[i]];
1577 | 						table.insert(ret, 1, type(esc_char) == "string" and { "class", esc_char, false } or esc_char or codes[i]);
1578 | 					end;
1579 | 				elseif flags.ignoreCase and codes[i] >= 0x61 and codes[i] <= 0x7A then
1580 | 					table.insert(ret, 1, codes[i] - 0x20);
1581 | 				else
1582 | 					table.insert(ret, 1, codes[i]);
1583 | 				end;
1584 | 				i += 1;
1585 | 			end;
1586 | 			if codes[i - 1] == char_class and i - 1 ~= start_i then
1587 | 				return char_class == 0x3A and "POSIX named classes are only support within a character set" or "POSIX collating elements aren't supported";
1588 | 			end;
1589 | 			if not ret[2] and not negate then
1590 | 				table.insert(outln, ret[1]);
1591 | 			else
1592 | 				table.insert(outln, { "charset", negate, ret });
1593 | 			end;
1594 | 		elseif c == 0x5C then
1595 | 			-- Escape char
1596 | 			i += 1;
1597 | 			local escape_c = codes[i];
1598 | 			if not escape_c then
1599 | 				return "pattern may not end with a trailing backslash";
1600 | 			elseif escape_c >= 0x30 and escape_c <= 0x39 then
1601 | 				local org_i = i;
1602 | 				while codes[i + 1] and codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39 do
1603 | 					i += 1;
1604 | 				end;
1605 | 				local escape_d = tonumber(utf8_sub(codes.s, org_i, i + 1));
1606 | 				if escape_d > group_n and i ~= org_i then
1607 | 					i = org_i;
1608 | 					local radix0, radix1, radix2;
1609 | 					if codes[i] <= 0x37 then
1610 | 						radix0 = codes[i] - 0x30;
1611 | 						i += 1;
1612 | 						if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
1613 | 							radix1 = codes[i] - 0x30;
1614 | 							i += 1;
1615 | 							if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
1616 | 								radix2 = codes[i] - 0x30;
1617 | 							else
1618 | 								i -= 1;
1619 | 							end;
1620 | 						else
1621 | 							i -= 1;
1622 | 						end;
1623 | 					end;
1624 | 					table.insert(outln, radix0 and (radix1 and (radix2 and 64 * radix0 + 8 * radix1 + radix2 or 8 * radix0 + radix1) or radix0) or codes[org_i]);
1625 | 				else
1626 | 					table.insert(outln, { "backref", escape_d });
1627 | 				end;
1628 | 			elseif escape_c == 0x45 then
1629 | 				-- intentionally left blank, \E that's not preceded \Q is ignored
1630 | 			elseif escape_c == 0x51 then
1631 | 				local start_i = i + 1;
1632 | 				repeat
1633 | 					i = table.find(codes, 0x5C, i + 1);
1634 | 				until not i or codes[i + 1] == 0x45;
1635 | 				table.move(codes, start_i, i and i - 1 or #codes, #outln + 1, outln);
1636 | 				if not i then
1637 | 					break;
1638 | 				end;
1639 | 				i += 1;
1640 | 			elseif escape_c == 0x4E then
1641 | 				if codes[i + 1] == 0x7B and codes[i + 2] == 0x55 and codes[i + 3] == 0x2B and flags.unicode then
1642 | 					i += 4;
1643 | 					local start_i = i;
1644 | 					while codes[i] and
1645 | 						(codes[i] >= 0x30 and codes[i] <= 0x39
1646 | 							or codes[i] >= 0x41 and codes[i] <= 0x46
1647 | 							or codes[i] >= 0x61 and codes[i] <= 0x66) do
1648 | 						i += 1;
1649 | 					end;
1650 | 					if codes[i] ~= 0x7D or i == start_i then
1651 | 						return "malformed Unicode code point";
1652 | 					end;
1653 | 					local code_point = tonumber(utf8_sub(codes.s, start_i, i));
1654 | 					table.insert(outln, code_point);
1655 | 				else
1656 | 					table.insert(outln, escape_chars[0x4E]);
1657 | 				end;
1658 | 			elseif escape_c == 0x50 or escape_c == 0x70 then
1659 | 				if not options.unicodeData then
1660 | 					return "options.unicodeData cannot be turned off when using \\p";
1661 | 				end;
1662 | 				i += 1;
1663 | 				if codes[i] ~= 0x7B then
1664 | 					local c_name = utf8.char(codes[i] or 0);
1665 | 					if not valid_categories[c_name] then
1666 | 						return "unknown or malformed script name";
1667 | 					end;
1668 | 					table.insert(outln, { "category", false, c_name });
1669 | 				else
1670 | 					local negate = escape_c == 0x50;
1671 | 					i += 1;
1672 | 					if codes[i] == 0x5E then
1673 | 						i += 1;
1674 | 						negate = not negate;
1675 | 					end;
1676 | 					local start_i = i;
1677 | 					while codes[i] and
1678 | 						(codes[i] >= 0x30 and codes[i] <= 0x39
1679 | 							or codes[i] >= 0x41 and codes[i] <= 0x5A
1680 | 							or codes[i] >= 0x61 and codes[i] <= 0x7A
1681 | 							or codes[i] == 0x5F) do
1682 | 						i += 1;
1683 | 					end;
1684 | 					if codes[i] ~= 0x7D then
1685 | 						return "unknown or malformed script name";
1686 | 					end;
1687 | 					local c_name = utf8_sub(codes.s, start_i, i);
1688 | 					local script_set = chr_scripts[c_name];
1689 | 					if script_set then
1690 | 						table.insert(outln, { "charset", negate, script_set });
1691 | 					elseif not valid_categories[c_name] then
1692 | 						return "unknown or malformed script name";
1693 | 					else
1694 | 						table.insert(outln, { "category", negate, c_name });
1695 | 					end;
1696 | 				end;
1697 | 			elseif escape_c == 0x67 and (codes[i + 1] == 0x7B or codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39) then
1698 | 				local is_grouped = false;
1699 | 				i += 1;
1700 | 				if codes[i] == 0x7B then
1701 | 					i += 1;
1702 | 					is_grouped = true;
1703 | 				elseif codes[i] < 0x30 or codes[i] > 0x39 then
1704 | 					return "malformed reference code";
1705 | 				end;
1706 | 				local org_i = i;
1707 | 				while codes[i] and
1708 | 					(codes[i] >= 0x30 and codes[i] <= 0x39
1709 | 						or codes[i] >= 0x41 and codes[i] <= 0x46
1710 | 						or codes[i] >= 0x61 and codes[i] <= 0x66) do
1711 | 					i += 1;
1712 | 				end;
1713 | 				if is_grouped and codes[i] ~= 0x7D then
1714 | 					return "malformed reference code";
1715 | 				end;
1716 | 				local ref_name = tonumber(utf8_sub(codes.s, org_i, i + (is_grouped and 0 or 1)));
1717 | 				table.insert(outln, { "backref", ref_name });
1718 | 				if not is_grouped then
1719 | 					i -= 1;
1720 | 				end;
1721 | 			elseif escape_c == 0x6F then
1722 | 				i += 1;
1723 | 				if codes[i + 1] ~= 0x7B then
1724 | 					return "malformed octal code";
1725 | 				end
1726 | 				i += 1;
1727 | 				local org_i = i;
1728 | 				while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 do
1729 | 					i += 1;
1730 | 				end;
1731 | 				if codes[i] ~= 0x7D or i == org_i then
1732 | 					return "malformed octal code";
1733 | 				end;
1734 | 				local ret_chr = tonumber(utf8_sub(codes.s, org_i, i), 8);
1735 | 				if ret_chr > 0xFFFF then
1736 | 					return "character offset too large";
1737 | 				end;
1738 | 				table.insert(outln, ret_chr);
1739 | 			elseif escape_c == 0x78 then
1740 | 				local radix0, radix1;
1741 | 				i += 1;
1742 | 				if codes[i] == 0x7B then
1743 | 					i += 1;
1744 | 					local org_i = i;
1745 | 					while codes[i] and
1746 | 						(codes[i] >= 0x30 and codes[i] <= 0x39
1747 | 							or codes[i] >= 0x41 and codes[i] <= 0x46
1748 | 							or codes[i] >= 0x61 and codes[i] <= 0x66) do
1749 | 						i += 1;
1750 | 					end;
1751 | 					if codes[i] ~= 0x7D or i == org_i then
1752 | 						return "malformed hexadecimal code";
1753 | 					elseif i - org_i > 4 then
1754 | 						return "character offset too large";
1755 | 					end;
1756 | 					table.insert(outln, tonumber(utf8_sub(codes.s, org_i, i), 16));
1757 | 				else
1758 | 					if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
1759 | 						radix0 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
1760 | 						i += 1;
1761 | 						if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
1762 | 							radix1 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
1763 | 						else
1764 | 							i -= 1;
1765 | 						end;
1766 | 					else
1767 | 						i -= 1;
1768 | 					end;
1769 | 					table.insert(outln, radix0 and (radix1 and 16 * radix0 + radix1 or radix0) or 0);
1770 | 				end;
1771 | 			else
1772 | 				local esc_char = b_escape_chars[escape_c] or escape_chars[escape_c];
1773 | 				table.insert(outln, esc_char or escape_c);
1774 | 			end;
1775 | 		elseif c == 0x2A or c == 0x2B or c == 0x3F or c == 0x7B then
1776 | 			-- Quantifier
1777 | 			local start_q, end_q;
1778 | 			if c == 0x7B then
1779 | 				local org_i = i + 1;
1780 | 				local start_i;
1781 | 				while codes[i + 1] and (codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39 or codes[i + 1] == 0x2C and not start_i and i + 1 ~= org_i) do
1782 | 					i += 1;
1783 | 					if codes[i] == 0x2C then
1784 | 						start_i = i;
1785 | 					end;
1786 | 				end;
1787 | 				if codes[i + 1] == 0x7D then
1788 | 					i += 1;
1789 | 					if not start_i then
1790 | 						start_q = tonumber(utf8_sub(codes.s, org_i, i));
1791 | 						end_q = start_q;
1792 | 					else
1793 | 						start_q, end_q = tonumber(utf8_sub(codes.s, org_i, start_i)), start_i + 1 == i and math.huge or tonumber(utf8_sub(codes.s, start_i + 1, i));
1794 | 						if end_q < start_q then
1795 | 							return "numbers out of order in {} quantifier";
1796 | 						end;
1797 | 					end;
1798 | 				else
1799 | 					table.move(codes, org_i - 1, i, #outln + 1, outln);
1800 | 				end;
1801 | 			else
1802 | 				start_q, end_q = c == 0x2B and 1 or 0, c == 0x3F and 1 or math.huge;
1803 | 			end;
1804 | 			if start_q then
1805 | 				local quantifier_type = flags.ungreedy and "lazy" or "greedy";
1806 | 				if codes[i + 1] == 0x2B or codes[i + 1] == 0x3F then
1807 | 					i += 1;
1808 | 					quantifier_type = codes[i] == 0x2B and "possessive" or flags.ungreedy and "greedy" or "lazy";
1809 | 				end;
1810 | 				local outln_len = #outln;
1811 | 				local last_outln_value = outln[outln_len];
1812 | 				if not last_outln_value or type(last_outln_value) == "table" and (last_outln_value[1] == "quantifier" or last_outln_value[1] == 0x28 or b_escape_chars[last_outln_value[1]])
1813 | 					or last_outln_value == alternation or type(last_outln_value) == "string" then
1814 | 					return "quantifier doesn't follow a repeatable pattern";
1815 | 				end;
1816 | 				if end_q == 0 then
1817 | 					table.remove(outln);
1818 | 				elseif start_q ~= 1 or end_q ~= 1 then
1819 | 					if type(last_outln_value) == "table" and last_outln_value[1] == 0x29 then
1820 | 						outln_len = last_outln_value[3];
1821 | 					end;
1822 | 					outln[outln_len] = { "quantifier", start_q, end_q, quantifier_type, outln[outln_len] };
1823 | 				end;
1824 | 			end;
1825 | 		elseif c == 0x7C then
1826 | 			-- Alternation
1827 | 			table.insert(outln, alternation);
1828 | 			local i1 = #outln;
1829 | 			repeat
1830 | 				i1 -= 1;
1831 | 				local v1, is_table = outln[i1], type(outln[i1]) == "table";
1832 | 				if is_table and v1[1] == 0x29 then
1833 | 					i1 = outln[i1][3];
1834 | 				elseif is_table and v1[1] == 0x28 then
1835 | 					if v1[4] == 0x7C then
1836 | 						group_n = v1[5];
1837 | 					end;
1838 | 					break;
1839 | 				end;
1840 | 			until not v1;
1841 | 		elseif c == 0x24 or c == 0x5E then
1842 | 			table.insert(outln, c == 0x5E and beginning_str or end_str);
1843 | 		elseif flags.ignoreCase and c >= 0x61 and c <= 0x7A then
1844 | 			table.insert(outln, c - 0x20);
1845 | 		elseif flags.extended and (c >= 0x09 and c <= 0x0D or c == 0x20 or c == 0x23) then
1846 | 			if c == 0x23 then
1847 | 				repeat
1848 | 					i += 1;
1849 | 				until not codes[i] or codes[i] == 0x0A or codes[i] == 0x0D;
1850 | 			end;
1851 | 		else
1852 | 			table.insert(outln, c);
1853 | 		end;
1854 | 		i += 1;
1855 | 	end;
1856 | 	local max_group_n = 0;
1857 | 	for i, v in ipairs(outln) do
1858 | 		if type(v) == "table" and (v[1] == 0x28 or v[1] == "quantifier" and type(v[5]) == "table" and v[5][1] == 0x28) then
1859 | 			if v[1] == "quantifier" then
1860 | 				v = v[5];
1861 | 			end;
1862 | 			if not v[3] then
1863 | 				return "unterminated parenthetical";
1864 | 			elseif v[2] then
1865 | 				max_group_n = math.max(max_group_n, v[2]);
1866 | 			end;
1867 | 		elseif type(v) == "table" and (v[1] == "backref" or v[1] == "recurmatch") then
1868 | 			if not group_id[v[2]] and (type(v[2]) ~= "number" or v[2] > group_n) then
1869 | 				return "reference to a non-existent or invalid subpattern";
1870 | 			elseif v[1] == "recurmatch" and v[2] ~= 0 then
1871 | 				for i1, v1 in ipairs(outln) do
1872 | 					if type(v1) == "table" and v1[1] == 0x28 and v1[2] == v[2] then
1873 | 						v[3] = i1;
1874 | 						break;
1875 | 					end;
1876 | 				end;
1877 | 			elseif type(v[2]) == "string" then
1878 | 				v[2] = group_id[v[2]];
1879 | 			end;
1880 | 		end;
1881 | 	end;
1882 | 	outln.group_n = max_group_n;
1883 | 	return outln, group_id, verb_flags;
1884 | end;
1885 | 
-- Tokenized-pattern cache configuration.
-- options.cacheSize must be something tonumber() accepts; anything else is rejected.
-- A size whose floor is 0 leaves `cacheSize` false, which disables caching entirely
-- (new_re skips every cache access when `cacheSize` is falsy).
-- math.huge requests a cache that is never trimmed; other sizes must be
-- non-negative, not NaN and below 2^32.
local cacheSize = tonumber(options.cacheSize);
if not cacheSize then
	error(string.format("expected number for options.cacheSize, got %s", typeof(options.cacheSize)), 2);
end;
cacheSize = math.floor(cacheSize) ~= 0 and cacheSize;
-- Parallel arrays: cache_pattern_names[i] is the key of the entry stored in cache_pattern[i].
local cache_pattern, cache_pattern_names;
if not cacheSize then
	-- caching disabled: both cache tables stay nil
elseif cacheSize < 0 or cacheSize ~= cacheSize then
	error("cache size cannot be a negative number or a NaN", 2);
elseif cacheSize == math.huge then
	cache_pattern, cache_pattern_names = { nil }, { nil };
elseif cacheSize >= 2 ^ 32 then
	error("cache size too large", 2);
else
	-- Pre-size with the normalised number rather than the raw option value.
	cache_pattern, cache_pattern_names = table.create(cacheSize), table.create(cacheSize);
end;
if cacheSize then
	-- Empties the pattern cache. Only defined when caching is enabled.
	function re.purge()
		table.clear(cache_pattern_names);
		table.clear(cache_pattern);
	end;
	-- Historical misspelling, kept so existing callers of `re.pruge` keep working.
	re.pruge = re.purge;
end;
1907 | 
-- Builds a RegEx object from an already-split pattern.
--   str_arr      pattern as an array of characters; its `s` field holds the pattern string
--   flags        table of flag name -> boolean
--   flag_repr    flags rendered as a sorted string (also part of the cache key)
--   pattern_repr delimited textual form of the pattern, stored on the object
-- Returns a locked userdata proxy; its data is stored in `proxy[object]`.
-- Raises the tokenizer's message when the pattern is invalid.
local function new_re(str_arr, flags, flag_repr, pattern_repr)
	local tokenized_ptn, group_id, verb_flags;
	-- The cache is keyed by pattern text plus the flag string.
	local cache_format = cacheSize and string.format("%s|%s", str_arr.s, flag_repr);
	local cached_token = cacheSize and cache_pattern[table.find(cache_pattern_names, cache_format)];
	if cached_token then
		tokenized_ptn, group_id, verb_flags = table.unpack(cached_token, 1, 3);
	else
		tokenized_ptn, group_id, verb_flags = tokenize_ptn(str_arr, flags);
		if type(tokenized_ptn) == "string" then
			-- tokenize_ptn reports failure by returning the message as a string.
			-- Level 3 attributes the error to the code that called re.new /
			-- re.fromstring instead of to those library functions themselves.
			error(tokenized_ptn, 3);
		end;
		-- Only non-empty token lists are cached.
		if cacheSize and tokenized_ptn[1] then
			-- Newest entry goes to the front of both parallel arrays ...
			table.insert(cache_pattern_names, 1, cache_format);
			table.insert(cache_pattern, 1, { tokenized_ptn, group_id, verb_flags });
			if cacheSize ~= math.huge then
				-- ... and whatever was pushed past the size limit is dropped.
				table.remove(cache_pattern_names, cacheSize + 1);
				table.remove(cache_pattern, cacheSize + 1);
			end;
		end;
	end;

	local object = newproxy(true);
	proxy[object] = { name = "RegEx", flags = flags, flag_repr = flag_repr, pattern_repr = pattern_repr, token = tokenized_ptn, group_id = group_id, verb_flags = verb_flags };
	local object_mt = getmetatable(object);
	-- Indexing the object reads the flags table first, then whatever re_m's __index provides.
	object_mt.__index = setmetatable(flags, re_m);
	object_mt.__tostring = re_tostr;
	object_mt.__metatable = lockmsg;

	return object;
end;
1938 | 
-- gsub callback for the pattern "(\\*)/": `pre` is the run of backslashes found
-- directly before a forward slash. Returns the text that replaces the whole match
-- so that the slash is escaped when the pattern is shown between `/` delimiters.
-- An even number of backslashes (including none) leaves the slash unescaped, so
-- one more backslash is added; an odd number means it is escaped already.
local function escape_fslash(pre)
	local extra_backslash = #pre % 2 == 0 and '\\' or '';
	-- The match consumed the slash, so it has to be emitted again. The previous
	-- code emitted '.' here, which turned "a/b" into "a\.b".
	return extra_backslash .. pre .. '/';
end;
1942 | 
-- table.sort comparator for flag characters: case-insensitive alphabetical order.
-- Flags that differ only by case (such as "u" and "U") are ordered by their raw
-- byte value, so the sorted flag string is the same regardless of the order in
-- which the flags were written.
local function sort_flag_chr(a, b)
	local lower_a, lower_b = a:lower(), b:lower();
	if lower_a == lower_b then
		return a < b;
	end;
	return lower_a < lower_b;
end;
1946 | 
-- re.new(pattern [, flags]) -> RegEx object
--   pattern  string (a number is converted to its string form)
--   flags    optional string of flag characters (a number is accepted too)
-- Errors when the pattern argument is missing or of the wrong type, when a flag
-- character is unknown or repeated, or when the pattern fails to tokenize.
function re.new(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local ptn, flags_str = ...;
	if type(ptn) == "number" then
		ptn ..= '';
	elseif type(ptn) ~= "string" then
		error(string.format("invalid argument #1 (string expected, got %s)", typeof(ptn)), 2);
	end;
	if type(flags_str) ~= "string" and type(flags_str) ~= "number" and flags_str ~= nil then
		error(string.format("invalid argument #2 (string expected, got %s)", typeof(flags_str)), 2);
	end;

	local flags = {
		anchored = false, caseless = false, multiline = false, dotall = false, unicode = false, ungreedy = false, extended = false,
	};
	local flag_repr = { };
	for f in string.gmatch(flags_str or '', utf8.charpattern) do
		-- Anything other than `false` here means the character maps to no known
		-- flag (nil) or the flag was already switched on (true).
		if flags[flag_map[f]] ~= false then
			-- Level 2, like every other argument error in this function, so the
			-- message points at the code that called re.new.
			error("invalid regular expression flag " .. f, 2);
		end;
		flags[flag_map[f]] = true;
		table.insert(flag_repr, f);
	end;
	table.sort(flag_repr, sort_flag_chr);
	flag_repr = table.concat(flag_repr);
	-- Keep only gsub's first result; the replacement count is not part of the representation.
	local escaped_ptn = string.gsub(ptn, "(\\*)/", escape_fslash);
	return new_re(to_str_arr(ptn), flags, flag_repr, string.format("/%s/", escaped_ptn));
end;
1976 | 
-- re.fromstring(str) -> RegEx object
-- Parses a delimited regex literal such as "/abc/i": the first character is the
-- delimiter, the text up to the matching unescaped delimiter is the pattern and
-- everything after it is the flag list.
-- Errors when the argument is missing or not a string/number, when the string is
-- empty, when the delimiter is alphanumeric or a backslash, when no closing
-- delimiter exists, or when a flag character is unknown or repeated.
function re.fromstring(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local ptn = ...;
	if type(ptn) == "number" then
		ptn ..= '';
	elseif type(ptn) ~= "string" then
		-- The level belongs to error(), not to string.format().
		error(string.format("invalid argument #1 (string expected, got %s)", typeof(ptn)), 2);
	end;
	local str_arr = to_str_arr(ptn);
	local delimiter = str_arr[1];
	if not delimiter then
		error("empty regex", 2);
	elseif delimiter == 0x5C or (delimiter >= 0x30 and delimiter <= 0x39) or (delimiter >= 0x41 and delimiter <= 0x5A) or (delimiter >= 0x61 and delimiter <= 0x7A) then
		error("delimiter must not be alphanumeric or a backslash", 2);
	end;

	-- Find the closing delimiter: the first later occurrence that is preceded by
	-- an even number of backslashes (i.e. is not itself escaped).
	local i0 = 1;
	repeat
		i0 = table.find(str_arr, delimiter, i0 + 1);
		if not i0 then
			error(string.format("no ending delimiter ('%s') found", utf8.char(delimiter)), 2);
		end;
		local escape_count = 1;
		while str_arr[i0 - escape_count] == 0x5C do
			escape_count += 1;
		end;
	until escape_count % 2 == 1;

	local flags = {
		anchored = false, caseless = false, multiline = false, dotall = false, unicode = false, ungreedy = false, extended = false,
	};
	local flag_repr = { };
	-- Pop the flag characters off the end of the array (they come out in reverse
	-- order, which does not matter because they are sorted afterwards).
	while str_arr.n > i0 do
		local f = utf8.char(table.remove(str_arr));
		str_arr.n -= 1;
		if flags[flag_map[f]] ~= false then
			-- Level 2, like the other argument errors here, so the message points
			-- at the code that called re.fromstring.
			error("invalid regular expression flag " .. f, 2);
		end;
		flags[flag_map[f]] = true;
		table.insert(flag_repr, f);
	end;
	table.sort(flag_repr, sort_flag_chr);
	flag_repr = table.concat(flag_repr);

	-- Strip the opening and closing delimiters from the array.
	local full_str = str_arr.s;
	table.remove(str_arr, 1);
	table.remove(str_arr);
	str_arr.n -= 2;
	-- The array is indexed per character (its elements are fed to utf8.char),
	-- while string.sub works on bytes. Convert the character positions to byte
	-- offsets so patterns or delimiters containing multibyte characters are not
	-- cut short. For pure ASCII input the offsets equal the character positions.
	local body_start = utf8.offset(full_str, 2);
	local closing_start = utf8.offset(full_str, i0);
	str_arr.s = string.sub(full_str, body_start, closing_start - 1);
	-- Representation: everything up to and including the closing delimiter.
	local repr_end = utf8.offset(ptn, i0 + 1) - 1;
	return new_re(str_arr, flags, flag_repr, string.sub(ptn, 1, repr_end));
end;
2028 | 
-- Replacement text used by re.escape for control characters: each key is the raw
-- character, each value the escape sequence written in its place.
local re_escape_line_chrs = {
	['\0'] = '\\x00',
	['\t'] = '\\t',
	['\n'] = '\\n',
	['\f'] = '\\f',
	['\r'] = '\\r',
};
2032 | 
-- re.escape(str [, extended [, delimiter]]) -> string
-- Returns `str` with regex-significant characters backslash-escaped so the result
-- can be embedded in a pattern and match the original text literally.
--   str        string (a number is converted to its string form)
--   extended   when truthy, whitespace characters are escaped as well
--   delimiter  optional single character that is escaped too; it must not be
--              alphanumeric or a backslash
-- NUL, form feed, newline, carriage return and tab are written as the escape
-- sequences listed in re_escape_line_chrs.
-- Errors on a missing/invalid `str`, or on an invalid `delimiter`.
function re.escape(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local str, extended, delimiter = ...;
	if type(str) == "number" then
		str ..= '';
	elseif type(str) ~= "string" then
		error(string.format("invalid argument #1 to 'escape' (string expected, got %s)", typeof(str)), 2);
	end;
	if delimiter == nil then
		delimiter = '';
	elseif type(delimiter) == "number" then
		delimiter ..= '';
	elseif type(delimiter) ~= "string" then
		error(string.format("invalid argument #3 to 'escape' (string expected, got %s)", typeof(delimiter)), 2);
	end;
	-- utf8.len yields nil for invalid UTF-8; treat that as an invalid delimiter
	-- instead of failing on a nil comparison. %w (not %a) so digits are rejected too.
	local delimiter_len = utf8.len(delimiter);
	if not delimiter_len or delimiter_len > 1 or delimiter:match("^[%w\\]$") then
		error("delimiter must be a single character that is not alphanumeric or a backslash", 2);
	end;

	-- '%' and ']' need a '%' in front of them to be literal inside a Lua pattern set.
	local delimiter_in_set = (delimiter:find("^[%%%]]$") and '%' or '') .. delimiter;
	-- One character set covering the control characters and every character that
	-- gets a backslash, including '$'.
	local escapable = "[\0\f\n\r\t\\" .. (extended and '%s' or '') .. "$#()%%*+.?[%]^{|" .. delimiter_in_set .. "]";
	-- Everything is handled in a single pass. Doing the control characters first
	-- and the backslash escaping second would escape the backslash of a freshly
	-- produced "\n" again and yield "\\n", which no longer matches a newline.
	return (string.gsub(str, escapable, function(chr)
		return re_escape_line_chrs[chr] or '\\' .. chr;
	end));
end;
2055 | 
-- re.type(value) -> the `name` stored in `proxy` for `value`, or a falsy value
-- when `value` has no entry there. Errors when called with no argument at all.
function re.type(...)
	local arg_count = select('#', ...);
	if arg_count == 0 then
		error("missing argument #1", 2);
	end;
	-- Only the first argument is looked at.
	local entry = proxy[(...)];
	if not entry then
		return entry;
	end;
	return entry.name;
end;
2062 | 
-- Expose every entry of re_m on the library table as well.
for k, f in pairs(re_m) do
	re[k] = f;
end;

-- From here on re_m is a metatable whose __index is the former re_m table.
-- new_re passes it to setmetatable(flags, re_m), so lookups that miss in an
-- object's flags table continue in the former re_m.
re_m = { __index = re_m };

-- lockmsg is itself a RegEx object. new_re stores the current value of lockmsg in
-- each object's __metatable field; for this very object that field is assigned
-- explicitly on the next line, once lockmsg has its value.
-- NOTE(review): presumably lockmsg is still unset while this first object is
-- being built, which is why the explicit assignment is needed; confirm against
-- its declaration earlier in the file.
lockmsg = re.fromstring([[/The\s*metatable\s*is\s*(?:locked|inaccessible)(?#Nice try :])/i]]);
getmetatable(lockmsg).__metatable = lockmsg;
2071 | 
-- Handler installed as __newindex on the tables below: any attempt to assign a
-- field raises an error attributed to the code performing the assignment.
local function readonly_table()
	local message = "Attempt to modify a readonly table";
	error(message, 2);
end;
2075 | 
-- Turn match_m into a metatable: reads are served from the former match_m table,
-- writes raise via readonly_table, and the __metatable field makes getmetatable()
-- return lockmsg instead of this table.
match_m = {
	__index = match_m,
	__metatable = lockmsg,
	__newindex = readonly_table,
};

-- re.Match is an empty table that carries the match_m metatable.
re.Match = setmetatable({ }, match_m);

-- The module's return value is an empty proxy table: every read is forwarded to
-- `re`, every write raises via readonly_table, and getmetatable() on it yields
-- lockmsg rather than the real metatable.
return setmetatable({ }, {
	__index = re,
	__metatable = lockmsg,
	__newindex = readonly_table,
});
2089 | 


--------------------------------------------------------------------------------