├── .eslintignore ├── .eslintrc.js ├── .github └── workflows │ ├── benchmark.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── .prettierignore ├── .releaserc ├── .vscode └── launch.json ├── LICENSE ├── README.md ├── as-pect.config.js ├── asconfig.empty.json ├── asconfig.json ├── assembly ├── __spec_tests__ │ └── generated.spec.ts ├── __tests__ │ ├── alterations.spec.ts │ ├── as-pect.d.ts │ ├── boundary-assertions.spec.ts │ ├── capture-group.spec.ts │ ├── character-classes.spec.ts │ ├── character-sets.spec.ts │ ├── characters.ts │ ├── empty.ts │ ├── empty.wat │ ├── quantifiers.spec.ts │ ├── range-quantifiers.spec.ts │ ├── regex.spec.ts │ └── utils.ts ├── char.ts ├── env.ts ├── index.ts ├── nfa │ ├── matcher.ts │ ├── nfa.ts │ ├── types.ts │ └── walker.ts ├── parser │ ├── node.ts │ ├── parser.ts │ ├── string-iterator.ts │ └── walker.ts ├── regexp.ts ├── tsconfig.json └── util.ts ├── benchmark └── benchmark.js ├── package-lock.json ├── package.json ├── spec ├── pcre-1.dat ├── test-generator.js └── test.dat └── ts ├── index.ts └── tsconfig.json /.eslintignore: -------------------------------------------------------------------------------- 1 | node_modules/ -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | root: true, 3 | parser: "@typescript-eslint/parser", 4 | plugins: ["@typescript-eslint"], 5 | extends: [ 6 | "eslint:recommended", 7 | "plugin:@typescript-eslint/eslint-recommended", 8 | "plugin:@typescript-eslint/recommended", 9 | ], 10 | parserOptions: { 11 | ecmaVersion: 2020, 12 | sourceType: "module", 13 | ecmaFeatures: {}, 14 | }, 15 | ignorePatterns: ["node_modules/**/*"], 16 | // === General rules ========================================================= 17 | 18 | rules: { 19 | // Omitted semicolons are hugely popular, yet within the compiler it makes 20 | // sense to be better safe than sorry. 
21 | semi: "error", 22 | 23 | // Our code base uses 2 spaces for indentation, and we enforce it here so 24 | // files don't mix spaces, tabs or different indentation levels. 25 | indent: [ 26 | "error", 27 | 2, 28 | { 29 | SwitchCase: 1, 30 | VariableDeclarator: "first", 31 | offsetTernaryExpressions: true, 32 | ignoredNodes: [ 33 | // FIXME: something's odd here 34 | "ConditionalExpression > *", 35 | "ConditionalExpression > * > *", 36 | "ConditionalExpression > * > * > *", 37 | ], 38 | }, 39 | ], 40 | 41 | // This is mostly visual style, making comments look uniform. 42 | "spaced-comment": [ 43 | "error", 44 | "always", 45 | { 46 | markers: ["/"], // triple-slash 47 | exceptions: ["/"], // all slashes 48 | }, 49 | ], 50 | 51 | // This tends to be annoying as it encourages developers to make everything 52 | // that is never reassigned a 'const', sometimes semantically incorrectly so, 53 | // typically leading to huge diffs in follow-up PRs modifying affected code. 54 | "prefer-const": "off", 55 | 56 | // It is perfectly fine to declare top-level variables with `var`, yet this 57 | // rule doesn't provide configuration options that would help. 58 | "no-var": "off", 59 | 60 | // Quite often, dealing with multiple related cases at once or otherwise 61 | // falling through is exactly the point of using a switch. 62 | "no-fallthrough": "off", 63 | 64 | // Typical false-positives here are `do { ... } while (true)` statements or 65 | // similar, but the only option provided here is not checking any loops. 66 | "no-constant-condition": ["error", { checkLoops: false }], 67 | 68 | // Functions are nested in blocks occasionally, and there haven't been any 69 | // problems with this so far, so turning the check off. 70 | "no-inner-declarations": "off", 71 | 72 | // Quite common in scenarios where an iteration starts at `current = this`. 73 | "@typescript-eslint/no-this-alias": "off", 74 | 75 | // Disabled here, but enabled again for JavaScript files. 
76 | "no-unused-vars": "off", 77 | 78 | // Disabled here, but enabled again for TypeScript files. 79 | "@typescript-eslint/no-unused-vars": "off", 80 | }, 81 | overrides: [ 82 | // === TypeScript rules ==================================================== 83 | 84 | { 85 | files: ["**/assembly/**/*.ts"], 86 | rules: { 87 | // Enforcing to remove function parameters on stubs makes code less 88 | // maintainable, so we instead allow unused function parameters. 89 | "@typescript-eslint/no-unused-vars": [ 90 | "warn", 91 | { 92 | vars: "local", 93 | varsIgnorePattern: "^_|^[A-Z](?:From|To)?$", // ignore type params 94 | args: "none", 95 | ignoreRestSiblings: false, 96 | }, 97 | ], 98 | 99 | // Namespaces are quite useful in AssemblyScript 100 | "@typescript-eslint/no-namespace": "off", 101 | 102 | // There is actually codegen difference here 103 | "@typescript-eslint/no-array-constructor": "off", 104 | 105 | // Sometimes it can't be avoided to add a @ts-ignore 106 | "@typescript-eslint/ban-ts-comment": "off", 107 | 108 | // Utilized to achieve portability in some cases 109 | "@typescript-eslint/no-non-null-assertion": "off", 110 | }, 111 | }, 112 | 113 | // === Compiler rules (extends AssemblyScript rules) ======================= 114 | 115 | { 116 | files: ["**/assembly/**/*.ts"], 117 | rules: { 118 | // There is an actual codegen difference here - TODO: revisit 119 | "no-cond-assign": "off", 120 | 121 | // Not all types can be omitted in AS yet - TODO: revisit 122 | "@typescript-eslint/no-inferrable-types": "off", 123 | 124 | // Used rarely to reference internals that are not user-visible 125 | "@typescript-eslint/triple-slash-reference": "off", 126 | 127 | // The compiler has its own `Function` class for example 128 | "no-shadow-restricted-names": "off", 129 | "@typescript-eslint/ban-types": "off", 130 | }, 131 | }, 132 | 133 | // === Standard Library rules (extends AssemblyScript rules) =============== 134 | 135 | { 136 | files: ["**/assembly/**/*.ts"], 137 | rules: { 
138 | // We are implementing with --noLib, so we shadow all the time 139 | "no-shadow-restricted-names": "off", 140 | 141 | // Similarly, sometimes we need the return type to be String, not string 142 | "@typescript-eslint/ban-types": "off", 143 | }, 144 | }, 145 | 146 | // === Standard Definition rules (extends TypeScript rules) ================ 147 | 148 | { 149 | files: ["**/assembly/**/*.d.ts"], 150 | rules: { 151 | // Often required to achieve compatibility with TypeScript 152 | "@typescript-eslint/no-explicit-any": "off", 153 | 154 | // Interfaces can be stubs here, i.e. not yet fully implemented 155 | "@typescript-eslint/no-empty-interface": "off", 156 | 157 | // Definitions make use of `object` to model rather unusual constraints 158 | "@typescript-eslint/ban-types": "off", 159 | }, 160 | }, 161 | 162 | // === Test rules (extends TypeScript rules) =============================== 163 | 164 | { 165 | files: ["**/assembly/__tests__/**/*.ts"], 166 | rules: { 167 | // Tests typically include unusual code patterns on purpose. This is 168 | // very likely not an extensive list, but covers what's there so far. 
169 | "no-empty": "off", 170 | "no-cond-assign": "off", 171 | "no-compare-neg-zero": "off", 172 | "no-inner-declarations": "off", 173 | "no-constant-condition": "off", 174 | "use-isnan": "off", 175 | "@typescript-eslint/no-namespace": "off", 176 | "@typescript-eslint/no-unused-vars": "off", 177 | "@typescript-eslint/no-empty-function": "off", 178 | "@typescript-eslint/no-non-null-assertion": "off", 179 | "@typescript-eslint/no-extra-semi": "off", 180 | "@typescript-eslint/no-inferrable-types": "off", 181 | "@typescript-eslint/ban-types": "off", 182 | "@typescript-eslint/triple-slash-reference": "off", 183 | "@typescript-eslint/ban-ts-comment": "off", 184 | "@typescript-eslint/no-extra-non-null-assertion": "off", 185 | "@typescript-eslint/no-empty-interface": "off", 186 | }, 187 | }, 188 | ], 189 | }; 190 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Performance benchmark 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | 7 | jobs: 8 | benchmark: 9 | name: Performance regression check 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Setup Node.js 14 | uses: actions/setup-node@v1 15 | with: 16 | node-version: 15 17 | - run: npm ci 18 | - run: npm run asbuild 19 | # Run the benchmark with `npm run benchmark` and store the output in a file 20 | - name: Run benchmark 21 | run: npm run benchmark | tee benchmark/output.txt 22 | # Download previous benchmark result from cache (if exists) 23 | - name: Download previous benchmark data 24 | uses: actions/cache@v1 25 | with: 26 | path: ./cache 27 | key: ${{ runner.os }}-benchmark 28 | # Run `github-action-benchmark` action 29 | - name: Store benchmark result 30 | uses: rhysd/github-action-benchmark@v1 31 | with: 32 | # What benchmark tool the output.txt came from 33 | tool: "benchmarkjs" 34 | # Where the output from the benchmark tool is 
stored 35 | output-file-path: benchmark/output.txt 36 | # Personal access token to deploy GitHub Pages branch 37 | github-token: ${{ secrets.PERSONAL_GITHUB_TOKEN }} 38 | # Enable alert commit comment 39 | comment-on-alert: true 40 | # Mention @colineberhardt in the commit comment 41 | alert-comment-cc-users: "@colineberhardt" 42 | # Push and deploy GitHub pages branch automatically 43 | auto-push: true 44 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions 3 | 4 | name: Release 5 | 6 | on: 7 | push: 8 | branches: 9 | - main 10 | jobs: 11 | release: 12 | name: Release 13 | runs-on: ubuntu-18.04 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v1 17 | - name: Setup Node.js 18 | uses: actions/setup-node@v1 19 | with: 20 | node-version: 15 21 | - name: Install dependencies 22 | run: npm ci 23 | - name: Run tests 24 | run: npm test 25 | - name: Release 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 28 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 29 | run: npx semantic-release 30 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions 3 | 4 | name: Test 5 | 6 | on: 7 | push: 8 | branches: [main] 9 | pull_request: 10 | branches: [main] 11 | 12 | jobs: 13 | build: 
14 | runs-on: ubuntu-latest 15 | 16 | strategy: 17 | matrix: 18 | node-version: [15.x] 19 | # See supported Node.js release schedule at https://nodejs.org/en/about/releases/ 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Use Node.js ${{ matrix.node-version }} 24 | uses: actions/setup-node@v1 25 | with: 26 | node-version: ${{ matrix.node-version }} 27 | - run: npm ci 28 | - run: npm test 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | build/ 3 | .history 4 | .vscode 5 | .idea 6 | npm-debug.* 7 | assembly/__tests__/index.spec.wat 8 | assembly/__tests__/spec.spec.wat -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | # prettier doesn't support decorators on functions :-( 2 | assembly/char.ts 3 | assembly/nfa/types.ts -------------------------------------------------------------------------------- /.releaserc: -------------------------------------------------------------------------------- 1 | { 2 | "branches": ["main"] 3 | } -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "node", 9 | "request": "launch", 10 | "name": "Launch Program", 11 | "runtimeArgs": ["-r", "ts-node/register"], 12 | "args": ["${workspaceFolder}/ts/index.ts"], 13 | "env": { "TS_NODE_PROJECT": "${workspaceFolder}/ts/tsconfig.json" } 14 | }, 15 | { 16 | "type": "node", 17 | "request": "launch", 18 | "name": "Launch Test Gen", 19 | "args": ["${workspaceFolder}/spec/test-generator.js"] 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Colin Eberhardt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # assemblyscript-regex 2 | 3 | A regex engine for AssemblyScript. 4 | 5 | [AssemblyScript](https://www.assemblyscript.org/) is a new language, based on TypeScript, that runs on WebAssembly. AssemblyScript has a lightweight standard library, but lacks support for Regular Expressions. This project fills that gap! 6 | 7 | This project exposes an API that mirrors the JavaScript [RegExp](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp) class: 8 | 9 | ```javascript 10 | const regex = new RegExp("fo*", "g"); 11 | const str = "table football, foul"; 12 | 13 | let match: Match | null = regex.exec(str); 14 | while (match != null) { 15 | // first iteration 16 | // match.index = 6 17 | // match.matches[0] = "foo" 18 | 19 | // second iteration 20 | // match.index = 16 21 | // match.matches[0] = "fo" 22 | match = regex.exec(str); 23 | } 24 | ``` 25 | 26 | ## Project status 27 | 28 | The initial focus of this implementation has been feature support and functionality over performance. It currently supports a sufficient number of regex features to be considered useful, including most character classes, common assertions, groups, alternations, capturing groups and quantifiers. 29 | 30 | The next phase of development will be focussed on more extensive testing and performance. The project currently has reasonable unit test coverage, focussed on positive and negative test cases on a per-feature basis. It also includes a more exhaustive test suite with test cases borrowed from another regex library. 31 | 32 | ### Feature support 33 | 34 | Based on the classification within the [MDN cheatsheet](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet) 35 | 36 | **Character sets** 37 | 38 | - [x] . 
39 | - [x] \d 40 | - [x] \D 41 | - [x] \w 42 | - [x] \W 43 | - [x] \s 44 | - [x] \S 45 | - [x] \t 46 | - [x] \r 47 | - [x] \n 48 | - [x] \v 49 | - [x] \f 50 | - [ ] [\b] 51 | - [ ] \0 52 | - [ ] \cX 53 | - [x] \xhh 54 | - [x] \uhhhh 55 | - [ ] \u{hhhh} or \u{hhhhh} 56 | - [x] \ 57 | 58 | **Assertions** 59 | 60 | - [x] ^ 61 | - [x] $ 62 | - [ ] \b 63 | - [ ] \B 64 | 65 | **Other assertions** 66 | 67 | - [ ] x(?=y) Lookahead assertion 68 | - [ ] x(?!y) Negative lookahead assertion 69 | - [ ] (?<=y)x Lookbehind assertion 70 | - [ ] (?&lt;!y)x Negative lookbehind assertion 71 | 72 | **Groups and ranges** 73 | 74 | - [x] x|y 75 | - [x] [xyz][a-c] 76 | - [x] [^xyz][^a-c] 77 | - [x] (x) capturing group 78 | - [ ] \n back reference 79 | - [ ] (?&lt;Name&gt;x) named capturing group 80 | - [x] (?:x) Non-capturing group 81 | 82 | **Quantifiers** 83 | 84 | - [x] x\* 85 | - [x] x+ 86 | - [x] x? 87 | - [x] x{n} 88 | - [x] x{n,} 89 | - [x] x{n,m} 90 | - [ ] x\*? / x+? / ... 91 | 92 | **RegExp** 93 | 94 | - [x] global 95 | - [ ] sticky 96 | - [x] case insensitive 97 | - [x] multiline 98 | - [x] dotAll 99 | - [ ] unicode 100 | 101 | ### Development 102 | 103 | This project is open source, MIT licenced and your contributions are very much welcomed. 104 | 105 | To get started, check out the repository and install dependencies: 106 | 107 | ``` 108 | $ npm install 109 | ``` 110 | 111 | A few general points about the tools and processes this project uses: 112 | 113 | - This project uses prettier for code formatting and eslint to provide additional syntactic checks. These are both run on `npm test` and as part of the CI build. 114 | - The unit tests are executed using [as-pect](https://github.com/jtenner/as-pect) - a native AssemblyScript test runner 115 | - The specification tests are within the `spec` folder. The `npm run test:generate` target transforms these tests into as-pect tests which execute as part of the standard build / test cycle 116 | - In order to support improved debugging you can execute this library as TypeScript (rather than WebAssembly), via the `npm run tsrun` target. 
117 | -------------------------------------------------------------------------------- /as-pect.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | /** 3 | * A set of globs passed to the glob package that qualify TypeScript files for testing. 4 | */ 5 | include: [ 6 | "assembly/__tests__/**/*.spec.ts", 7 | "assembly/__spec_tests__/**/*.spec.ts", 8 | ], 9 | /** 10 | * A set of globs passed to the glob package that qualify files to be added to each test. 11 | */ 12 | add: ["assembly/__tests__/**/*.include.ts"], 13 | /** 14 | * All the compiler flags needed for this test suite. Make sure that a binary file is output. 15 | */ 16 | flags: { 17 | /** To output a wat file, uncomment the following line. */ 18 | // "--textFile": ["output.wat"], 19 | /** A runtime must be provided here. */ 20 | "--runtime": ["stub"], // Acceptable values are: full, half, stub (arena), and none 21 | "--target": "test", 22 | }, 23 | /** 24 | * A set of regular expressions that exclude source files from testing. 25 | */ 26 | disclude: [/node_modules/], 27 | /** 28 | * Add your required AssemblyScript imports here. 29 | */ 30 | imports(memory, createImports, instantiateSync, binary) { 31 | let instance; // Imports can reference this 32 | const myImports = { 33 | // put your web assembly imports here, and return the module 34 | }; 35 | instance = instantiateSync(binary, createImports(myImports)); 36 | return instance; 37 | }, 38 | /** 39 | * Add a custom reporter here if you want one. The following example is in TypeScript. 
40 | * 41 | * @example 42 | * import { TestReporter, TestGroup, TestResult, TestContext } from "as-pect"; 43 | * 44 | * export class CustomReporter extends TestReporter { 45 | * // implement each abstract method here 46 | * public abstract onStart(suite: TestContext): void; 47 | * public abstract onGroupStart(group: TestGroup): void; 48 | * public abstract onGroupFinish(group: TestGroup): void; 49 | * public abstract onTestStart(group: TestGroup, result: TestResult): void; 50 | * public abstract onTestFinish(group: TestGroup, result: TestResult): void; 51 | * public abstract onFinish(suite: TestContext): void; 52 | * } 53 | */ 54 | // reporter: new CustomReporter(), 55 | /** 56 | * Specify if the binary wasm file should be written to the file system. 57 | */ 58 | outputBinary: false, 59 | }; 60 | -------------------------------------------------------------------------------- /asconfig.empty.json: -------------------------------------------------------------------------------- 1 | { 2 | "options": { 3 | "runtime": "stub", 4 | "textFile": "build/empty.wat", 5 | "debug": true 6 | }, 7 | "entries": ["assembly/__tests__/empty.ts"] 8 | } 9 | -------------------------------------------------------------------------------- /asconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "targets": { 3 | "debug": { 4 | "binaryFile": "build/untouched.wasm", 5 | "textFile": "build/untouched.wat", 6 | "sourceMap": true, 7 | "debug": true 8 | }, 9 | "release": { 10 | "binaryFile": "build/optimized.wasm", 11 | "textFile": "build/optimized.wat", 12 | "sourceMap": false, 13 | "optimizeLevel": 3, 14 | "shrinkLevel": 0, 15 | "converge": true, 16 | "noAssert": true 17 | }, 18 | "test": { 19 | "debug": true 20 | } 21 | }, 22 | "options": { 23 | "transform": [], 24 | "exportRuntime": true 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /assembly/__tests__/alterations.spec.ts: 
-------------------------------------------------------------------------------- 1 | import { expectMatch, expectNotMatch } from "./utils"; 2 | 3 | it("or", () => { 4 | expectMatch("a|b", ["b", "a"]); 5 | expectNotMatch("a|b", ["c"]); 6 | expectMatch("a|br", ["br", "a"]); 7 | expectNotMatch("a|br", ["b", "c"]); 8 | }); 9 | 10 | it("or multi-term", () => { 11 | expectMatch("a|b|c", ["b", "a", "c"]); 12 | expectNotMatch("a|b|c", ["d"]); 13 | expectMatch("a|br|pc", ["br", "a", "pc"]); 14 | expectNotMatch("a|br|pc", ["b", "pr"]); 15 | }); 16 | -------------------------------------------------------------------------------- /assembly/__tests__/as-pect.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | 3 | declare function includeBytes(filename: string): StaticArray; 4 | -------------------------------------------------------------------------------- /assembly/__tests__/boundary-assertions.spec.ts: -------------------------------------------------------------------------------- 1 | import { expectMatch, expectNotMatch, exec } from "./utils"; 2 | 3 | it("matches end of string", () => { 4 | const match = exec("a$", "ba"); 5 | expect(match.index).toBe(1); 6 | expect(match.matches[0]).toBe("a"); 7 | expectNotMatch("a$", ["ab"]); 8 | }); 9 | 10 | it("matches start of string", () => { 11 | expectMatch("^a", ["a"]); 12 | expectNotMatch("^a", ["ba"]); 13 | }); 14 | 15 | it("handles escaped boundaries", () => { 16 | expectMatch("\\^a", ["^a"]); 17 | expectMatch("a\\$", ["a$"]); 18 | }); 19 | -------------------------------------------------------------------------------- /assembly/__tests__/capture-group.spec.ts: -------------------------------------------------------------------------------- 1 | import { expectMatch, expectNotMatch, exec } from "./utils"; 2 | 3 | it("supports capture groups", () => { 4 | let match = exec("a(\\d)a", "a3a"); 5 | expect(match.index).toBe(0); 6 | expect(match.input).toBe("a3a"); 7 | 
expect(match.matches[0]).toBe("a3a"); 8 | expect(match.matches[1]).toBe("3"); 9 | 10 | match = exec("a(\\d)a", " a3a"); 11 | expect(match.index).toBe(2); 12 | expect(match.input).toBe(" a3a"); 13 | expect(match.matches[0]).toBe("a3a"); 14 | expect(match.matches[1]).toBe("3"); 15 | 16 | match = exec("a(\\d*)a", "a3456a"); 17 | expect(match.index).toBe(0); 18 | expect(match.input).toBe("a3456a"); 19 | expect(match.matches[0]).toBe("a3456a"); 20 | expect(match.matches[1]).toBe("3456"); 21 | 22 | match = exec("a*(\\d*)(a*)", "aaa456aaa"); 23 | expect(match.index).toBe(0); 24 | expect(match.input).toBe("aaa456aaa"); 25 | expect(match.matches[0]).toBe("aaa456aaa"); 26 | expect(match.matches[1]).toBe("456"); 27 | expect(match.matches[2]).toBe("aaa"); 28 | }); 29 | 30 | it("should not return captured values for non-matching alternations", () => { 31 | const match = exec("(a|b)c|a(b|c)", "ab"); 32 | expect(match.matches[0]).toBe("ab"); 33 | expect(match.matches[1]).toBe(""); 34 | expect(match.matches[2]).toBe("b"); 35 | }); 36 | 37 | it("repeated capture groups should return the last match", () => { 38 | const match = exec("([a-c])+", "ac"); 39 | expect(match.matches[0]).toBe("ac"); 40 | expect(match.matches[1]).toBe("c"); 41 | }); 42 | 43 | it("range repitition capture groups should return the last match", () => { 44 | const match = exec("([a-c]){2}", "ac"); 45 | expect(match.matches[0]).toBe("ac"); 46 | expect(match.matches[1]).toBe("c"); 47 | }); 48 | 49 | it("non-capturing groups should not capture", () => { 50 | const match = exec("(?:foo)bar(baz)", "foobarbaz"); 51 | expect(match.matches[0]).toBe("foobarbaz"); 52 | expect(match.matches[1]).toBe("baz"); 53 | }); 54 | -------------------------------------------------------------------------------- /assembly/__tests__/character-classes.spec.ts: -------------------------------------------------------------------------------- 1 | import { expectMatch, expectNotMatch } from "./utils"; 2 | 3 | it("throws an error if no 
closing bracket is found", () => { 4 | // expect(() => new RegExp("[abce")).toThrow(); 5 | }); 6 | 7 | it("matches discrete characters", () => { 8 | expectMatch("[abce]", ["a", "b", "c", "e"]); 9 | expectNotMatch("[abce]", ["", "f", "h"]); 10 | }); 11 | 12 | it("matches character ranges", () => { 13 | expectMatch("[a-c]", ["a", "b", "c"]); 14 | expectNotMatch("[a-c]", ["d", "e", ""]); 15 | expectMatch("[K-M]", ["K", "L", "M"]); 16 | expectNotMatch("[K-M]", ["9", "J"]); 17 | expectMatch("[0-9]", ["0", "9"]); 18 | expectNotMatch("[0-9]", ["a", "A"]); 19 | }); 20 | 21 | it("matches multiple ranges", () => { 22 | expectMatch("[a-ce-f]", ["a", "b", "c", "e", "f"]); 23 | expectNotMatch("[a-ce-f]", ["d"]); 24 | }); 25 | 26 | it("supports closing brackets", () => { 27 | expectMatch("[]a]", ["]", "a"]); 28 | }); 29 | 30 | it("supports negated sets", () => { 31 | expectNotMatch("[^a-c]", ["a", "b", "c"]); 32 | expectMatch("[^a-c]", ["d", "e"]); 33 | expectNotMatch("[^a-ce-f]", ["a", "b", "c", "e", "f"]); 34 | expectMatch("[^a-ce-f]", ["d"]); 35 | }); 36 | 37 | it("treats - as a literal", () => { 38 | expectMatch("[-abc]", ["-", "a", "b", "c"]); 39 | expectMatch("[abc-]", ["-", "a", "b", "c"]); 40 | }); 41 | 42 | it("treats - as a literal in negated sets", () => { 43 | expectNotMatch("[^-abc]", ["-", "a", "b", "c"]); 44 | expectMatch("[^-abc]", ["1", "A"]); 45 | }); 46 | 47 | it("supports case insensitive matching", () => { 48 | // simple ranges 49 | expectMatch("[a-c]", ["A", "C", "a", "c"], "i"); 50 | expectNotMatch("[a-c]", ["D", "d"], "i"); 51 | // complex 52 | expectMatch("[W-c]", ["W", "w", "C", "c"], "i"); 53 | expectNotMatch("[W-c]", ["V", "v", "D", "d"], "i"); 54 | }); 55 | -------------------------------------------------------------------------------- /assembly/__tests__/character-sets.spec.ts: -------------------------------------------------------------------------------- 1 | import { RegExp } from ".."; 2 | import { expectMatch, expectNotMatch, exec } from 
"./utils"; 3 | 4 | it("dot", () => { 5 | expectMatch(".", [" ", "B", "|", "9"]); 6 | expectNotMatch(".", ["", "\n"]); 7 | }); 8 | 9 | it("digit", () => { 10 | expectMatch("\\d", ["0", "9"]); 11 | expectNotMatch("\\d", ["", "b"]); 12 | }); 13 | 14 | it("non-digit", () => { 15 | expectNotMatch("\\D", ["0", "9", ""]); 16 | expectMatch("\\D", ["b", "|"]); 17 | }); 18 | 19 | it("word", () => { 20 | expectMatch("\\w", ["A", "a", "Z", "z", "0", "9", "_"]); 21 | expectNotMatch("\\w", ["", "$"]); 22 | }); 23 | 24 | it("not word", () => { 25 | expectNotMatch("\\W", ["A", "a", "Z", "z", "0", "9", "_", ""]); 26 | expectMatch("\\W", ["&", "$"]); 27 | }); 28 | 29 | it("whitespace", () => { 30 | expectMatch("\\s", ["\f", "\n", "\r", "\t", "\v"]); 31 | expectNotMatch("\\s", ["", "a", "0"]); 32 | }); 33 | 34 | it("not whitespace", () => { 35 | expectNotMatch("\\S", ["", "\f", "\n", "\r", "\t", "\v"]); 36 | expectMatch("\\S", ["a", "0"]); 37 | }); 38 | 39 | it("tab, cr, lf, vt, ff", () => { 40 | expectMatch("\\t", ["\t"]); 41 | expectMatch("\\r", ["\r"]); 42 | expectMatch("\\n", ["\n"]); 43 | expectMatch("\\v", ["\v"]); 44 | expectMatch("\\f", ["\f"]); 45 | expectNotMatch("\\t", ["a", " ", ""]); 46 | }); 47 | 48 | it("escaped dot", () => { 49 | expectMatch("\\.", ["."]); 50 | expectNotMatch("\\.", ["", "a"]); 51 | }); 52 | 53 | it("unrecognised character classes are treated as characters", () => { 54 | expectMatch("\\g\\m", ["gm"]); 55 | }); 56 | -------------------------------------------------------------------------------- /assembly/__tests__/characters.ts: -------------------------------------------------------------------------------- 1 | import { expectMatch, expectNotMatch } from "./utils"; 2 | 3 | it("single character", () => { 4 | expectMatch("a", ["a"]); 5 | expectNotMatch("a", ["fish", ""]); 6 | }); 7 | 8 | it("concatenation", () => { 9 | expectMatch("ab", ["ab"]); 10 | expectNotMatch("ab", ["aac", "aa", ""]); 11 | }); 12 | 
-------------------------------------------------------------------------------- /assembly/__tests__/empty.ts: -------------------------------------------------------------------------------- 1 | import * as regex from ".."; 2 | -------------------------------------------------------------------------------- /assembly/__tests__/empty.wat: -------------------------------------------------------------------------------- 1 | (module 2 | (memory $0 0) 3 | (table $0 1 funcref) 4 | (export "memory" (memory $0)) 5 | ) 6 | -------------------------------------------------------------------------------- /assembly/__tests__/quantifiers.spec.ts: -------------------------------------------------------------------------------- 1 | import { expectMatch, expectNotMatch, exec } from "./utils"; 2 | 3 | it("matches empty strings", () => { 4 | expectMatch("a?", [""]); 5 | expectMatch("a*", [""]); 6 | }); 7 | 8 | it("zero or one", () => { 9 | expectMatch("a?", ["a"]); 10 | let match = exec("a?", "bc"); 11 | expect(match).not.toBeNull(); 12 | expect(match.matches[0]).toStrictEqual(""); 13 | }); 14 | 15 | it("one or more", () => { 16 | expectMatch("a+", ["a", "aa"]); 17 | expectNotMatch("a+", [""]); 18 | }); 19 | 20 | it("zero or more", () => { 21 | expectMatch("a*", ["aa", "aaaa"]); 22 | }); 23 | 24 | it("multiple rules", () => { 25 | expectMatch("a*b", ["b", "ab", "aaaab"]); 26 | expectNotMatch("a*b", ["aaaad"]); 27 | }); 28 | 29 | it("zero or more is greedy", () => { 30 | let match = exec("a*", "aaaaa"); 31 | expect(match).not.toBeNull(); 32 | expect(match.matches[0]).toStrictEqual("aaaaa"); 33 | }); 34 | 35 | it("one or more is greedy", () => { 36 | let match = exec("a+", "aaaaa"); 37 | expect(match).not.toBeNull(); 38 | expect(match.matches[0]).toStrictEqual("aaaaa"); 39 | }); 40 | 41 | describe("non-greedy", () => { 42 | it("one or more supports non-greedy mode", () => { 43 | let match = exec("[a-c]+?b", "abb"); 44 | expect(match).not.toBeNull(); 45 | 
expect(match.matches[0]).toStrictEqual("ab"); 46 | }); 47 | 48 | it("zero or more supports non-greedy mode", () => { 49 | let match = exec("[a-c]*?b", "abb"); 50 | expect(match).not.toBeNull(); 51 | expect(match.matches[0]).toStrictEqual("ab"); 52 | }); 53 | 54 | // it("zero or one supports non-greedy mode", () => { 55 | // expectMatch("a?", ["a"]); 56 | // let match = exec("a??", "bc"); 57 | // expect(match).not.toBeNull(); 58 | // expect(match.matches[0]).toStrictEqual(""); 59 | // }); 60 | }); 61 | -------------------------------------------------------------------------------- /assembly/__tests__/range-quantifiers.spec.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-invalid-regexp */ 2 | import { RegExp } from ".."; 3 | import { expectMatch, expectNotMatch, exec } from "./utils"; 4 | 5 | it("handles single quantifier", () => { 6 | expectMatch("a{2}", ["aa"]); 7 | expectMatch("ba{2}", ["baa"]); 8 | expectMatch("ba{1}b", ["bab"]); 9 | }); 10 | 11 | it("handles open upper bound quantifiers", () => { 12 | expectMatch("a{2,}", ["aa", "aaaaa"]); 13 | expectMatch("ba{2,}", ["baa", "baaaaaaa"]); 14 | expectMatch("ba{1,}b", ["bab", "baaaaaab"]); 15 | }); 16 | 17 | it("handles explicit upper bound quantifiers", () => { 18 | const match = exec("a{2,4}", "aaaaaaaaaa"); 19 | expect(match.matches[0]).toBe("aaaa"); 20 | }); 21 | 22 | it("handles zero value quantifier", () => { 23 | expectMatch("ba{0}b", ["bb"]); 24 | }); 25 | 26 | it("handles quantifiers within alternates", () => { 27 | expectMatch("a{2}|b{2}", ["bb", "aa"]); 28 | expectNotMatch("a{2}|b{2}", ["cc"]); 29 | }); 30 | 31 | it("handles imcomplete quantifier ", () => { 32 | expectMatch("a{2", ["a{2"]); 33 | expectMatch("a{2,", ["a{2,"]); 34 | expectMatch("a{2,3", ["a{2,3"]); 35 | expectMatch("a{2,3a", ["a{2,3a"]); 36 | expectMatch("a{2,3a}", ["a{2,3a}"]); 37 | }); 38 | 39 | it("handles nested quantifiers", () => { 40 | expectMatch("(a{3}){2}", 
["aaaaaa"]); 41 | }); 42 | 43 | it("handles nongreedy quantifiers", () => { 44 | const match = exec("a{2,4}?", "aaaaaaaaaa"); 45 | expect(match.matches[0]).toBe("aa"); 46 | }); 47 | 48 | it("throws if quantifying a quantifier!", () => { 49 | expect(() => { 50 | let foo = new RegExp("a{3}{2}"); 51 | }).toThrow(); 52 | }); 53 | -------------------------------------------------------------------------------- /assembly/__tests__/regex.spec.ts: -------------------------------------------------------------------------------- 1 | import { RegExp } from ".."; 2 | import { exec, expectNotMatch, expectMatch } from "./utils"; 3 | 4 | it("match returns correct substring", () => { 5 | const match = exec("\\d", "asd123asd"); 6 | expect(match.index).toBe(3); 7 | expect(match.input).toStrictEqual("asd123asd"); 8 | expect(match.matches[0]).toStrictEqual("1"); 9 | }); 10 | 11 | describe("dotAll mode", () => { 12 | it("sets the dotAll flag", () => { 13 | expect(new RegExp("foo", "s").dotAll).toBeTruthy(); 14 | expect(new RegExp("foo", "").dotAll).toBeFalsy(); 15 | }); 16 | 17 | it("allows dot to match any character", () => { 18 | const regex = new RegExp("^12.34", "s"); 19 | const match = exec(regex, "12\n34"); 20 | expect(match.matches[0]).toBe("12\n34"); 21 | }); 22 | }); 23 | 24 | describe("case insensitive mode", () => { 25 | it("supports characters", () => { 26 | const regex = new RegExp("AbC", "i"); 27 | const match = exec(regex, "aBc"); 28 | expect(match.matches[0]).toBe("aBc"); 29 | }); 30 | 31 | it("supports character ranges", () => { 32 | const regex = new RegExp("[a-c][A-C]", "i"); 33 | const match = exec(regex, "Ac"); 34 | expect(match.matches[0]).toBe("Ac"); 35 | }); 36 | 37 | it("sets ignoreCase flag", () => { 38 | expect(new RegExp("\\d+", "i").ignoreCase).toBeTruthy(); 39 | expect(new RegExp("\\d+", "g").ignoreCase).toBeFalsy(); 40 | }); 41 | }); 42 | 43 | describe("global mode", () => { 44 | it("sets global flag", () => { 45 | expect(new RegExp("\\d+", 
"g").global).toBeTruthy(); 46 | expect(new RegExp("\\d+", "").global).toBeFalsy(); 47 | }); 48 | 49 | it("increments lastIndex", () => { 50 | const regex = new RegExp("\\d+", "g"); 51 | const match = exec(regex, "dog 23 fish 45 cat"); 52 | expect(match.matches[0]).toStrictEqual("23"); 53 | expect(regex.lastIndex).toStrictEqual(6); 54 | }); 55 | 56 | it("uses lastIndex to support multiple matches", () => { 57 | const regex = new RegExp("\\d+", "g"); 58 | let match = exec(regex, "dog 23 fish 45 cat"); 59 | expect(match.matches[0]).toBe("23"); 60 | expect(regex.lastIndex).toBe(6); 61 | 62 | match = exec(regex, "dog 23 fish 45 cat"); 63 | expect(match.matches[0]).toBe("45"); 64 | expect(regex.lastIndex).toBe(14); 65 | 66 | let empty_match = regex.exec("dog 23 fish 45 cat"); 67 | expect(empty_match).toBeNull(); 68 | expect(regex.lastIndex).toBe(0); 69 | }); 70 | }); 71 | 72 | describe("multi-line mode", () => { 73 | it("sets multi-line flag", () => { 74 | expect(new RegExp("\\d+", "m").multiline).toBeTruthy(); 75 | expect(new RegExp("\\d+", "").multiline).toBeFalsy(); 76 | }); 77 | 78 | it("matches across multiple lines", () => { 79 | const match = exec("^f\\d{1}$", "f1\nbar\nbaz\nf2", "m"); 80 | expect(match.matches.length).toBe(1); 81 | expect(match.matches[0]).toBe("f1"); 82 | }); 83 | 84 | it("matches across multiple lines with global mode", () => { 85 | const regex = new RegExp("^f\\d{1}$", "gm"); 86 | 87 | let match = regex.exec("f1\nbar\nbaz\nf2"); 88 | expect(match!.matches[0]).toBe("f1"); 89 | 90 | match = regex.exec("f1\nbar\nbaz\nf2"); 91 | expect(match!.matches[0]).toBe("f2"); 92 | 93 | match = regex.exec("f1\nbar\nbaz\nf2"); 94 | expect(match).toBeNull(); 95 | }); 96 | 97 | it("matches across multiple lines with global mode", () => { 98 | const regex = new RegExp("^[a-c]", "gm"); 99 | 100 | let match = regex.exec("a1\nd2\nc3\n"); 101 | expect(match!.matches[0]).toBe("a"); 102 | 103 | match = regex.exec("a1\nd2\nc3\n"); 104 | 
expect(match!.matches[0]).toBe("c"); 105 | 106 | match = regex.exec("a1\nd2\nc3\n"); 107 | expect(match).toBeNull(); 108 | }); 109 | 110 | it("matches across multiple lines with global mode", () => { 111 | const regex = new RegExp("[a-c]$", "gm"); 112 | 113 | let match = regex.exec("1a\n2d\n3c\n"); 114 | expect(match!.matches[0]).toBe("a"); 115 | 116 | match = regex.exec("1a\n2d\n3c\n"); 117 | expect(match!.matches[0]).toBe("c"); 118 | 119 | match = regex.exec("1a\n2d\n3c\n"); 120 | expect(match).toBeNull(); 121 | }); 122 | }); 123 | 124 | describe("non-global mode", () => { 125 | it("doesn't increment lastIndex", () => { 126 | const regex = new RegExp("\\d+"); 127 | let match = exec(regex, "dog 23 fish 45 cat"); 128 | expect(match.matches[0]).toBe("23"); 129 | expect(regex.lastIndex).toBe(0); 130 | 131 | match = exec(regex, "dog 23 fish 45 cat"); 132 | expect(match.matches[0]).toBe("23"); 133 | expect(regex.lastIndex).toBe(0); 134 | }); 135 | }); 136 | 137 | describe("use cases", () => { 138 | it("matches combinations", () => { 139 | expectMatch("\\s\\w*", [" bar"]); 140 | expectMatch("\\S\\w*", ["foo"]); 141 | }); 142 | 143 | it("email", () => { 144 | const regex = ".+@.+\\..+"; 145 | expect(exec(regex, "colin@gmail.com")).toBeTruthy(); 146 | expectNotMatch(regex, ["gmail"]); 147 | 148 | const capturingRegex = "(.+)@(.+)\\.(.+)"; 149 | expect(exec(capturingRegex, "colin@gmail.com")).toBeTruthy(); 150 | 151 | const match = exec(capturingRegex, "colin@gmail.com"); 152 | expect(match.matches[0]).toBe("colin@gmail.com"); 153 | expect(match.matches[1]).toBe("colin"); 154 | expect(match.matches[2]).toBe("gmail"); 155 | expect(match.matches[3]).toBe("com"); 156 | }); 157 | }); 158 | 159 | describe("error cases", () => { 160 | it("throws an explicit error when there is nothing to repeat", () => { 161 | expect(() => { 162 | let foo = new RegExp("*m", ""); // eslint-disable-line no-invalid-regexp 163 | }).toThrow("Invalid regular expression: Nothing to repeat"); 164 | }); 
/**
 * Asserts that `regex`, compiled with `flags`, matches every string in `arr`
 * and that the overall match (`matches[0]`) is the complete input string.
 */
export function expectMatch(
  regex: string,
  arr: string[],
  flags: string = ""
): void {
  let regexp = new RegExp(regex, flags);
  for (let i = 0; i < arr.length; i++) {
    const value = arr[i];
    const match = exec(regexp, value);
    expect(match.matches[0]).toStrictEqual(value);
  }
}

/**
 * Asserts that `regex`, compiled with `flags`, produces no match (a `null`
 * result from `RegExp#exec`) for each string in `arr`.
 */
export function expectNotMatch(
  regex: string,
  arr: string[],
  flags: string = ""
): void {
  let regexp = new RegExp(regex, flags);
  for (let i = 0; i < arr.length; i++) {
    const match = regexp.exec(arr[i]);
    expect(match).toBeNull(
      "string: " + arr[i] + " should not match regex: " + regex
    );
  }
}

/**
 * Executes a regular expression against `value` and asserts that it matched.
 *
 * @param regex either an already constructed `RegExp` (used as-is, `flags` is
 *   ignored) or a pattern string, which is compiled with `flags`
 * @param value the input string to match against
 * @param flags flags used only when `regex` is a pattern string
 * @returns the result of `RegExp#exec`, after asserting it is not `null`
 *
 * Any other type for `T` is rejected at compile time through `ERROR`.
 */
export function exec<T>(
  regex: T,
  value: string,
  flags: string = ""
): Match {
  let regexp: RegExp;
  if (regex instanceof RegExp) {
    regexp = regex;
  } else if (isString<T>()) {
    // @ts-ignore
    regexp = new RegExp(regex, flags);
  } else {
    ERROR("Only RegExp and string are valid types");
  }
  // @ts-ignore
  let res = regexp.exec(value);
  // @ts-ignore
  expect(res).not.toBe(
    null,
    // @ts-ignore
    "string: " + value + " should match regex: " + regexp.toString()
  );
  return res;
}
/**
 * Returns true when `from <= value <= to`.
 *
 * When `ASC_TARGET == 1` the check is a single comparison that relies on
 * unsigned wrap-around: a `value` below `from` wraps to a large number and
 * fails the comparison.
 */
// @ts-ignore
@inline
function inRange(value: u32, from: u32, to: u32): bool {
  if (ASC_TARGET == 1) {
    // makes use of unsigned integer operations, making this
    // approach a little faster when compiled to WASM
    return value - from < (to - from + 1);
  } else {
    return value >= from && value <= to;
  }
}

/** True for the ASCII digits `0`-`9`. */
export function isDigit(code: u32): bool {
  return inRange(code, Char.Zero, Char.Nine);
}

/**
 * True for `0`-`9`, `a`-`f` and `A`-`F`.
 *
 * OR-ing with 0x20 folds an ASCII upper-case letter onto its lower-case
 * counterpart, so one range check covers both cases. No other code unit maps
 * into `a`-`f` this way, because bit 0x20 is already set for every value in
 * that range.
 */
export function isHexadecimalDigit(code: u32): bool {
  return isDigit(code) || inRange(code | 0x20, Char.a, Char.f);
}

/** True for the ASCII lower-case letters `a`-`z`. */
export function isLowercaseAlpha(code: u32): bool {
  return inRange(code, Char.a, Char.z);
}

/** True for the ASCII upper-case letters `A`-`Z`. */
export function isUppercaseAlpha(code: u32): bool {
  return inRange(code, Char.A, Char.Z);
}

/** True for the ASCII letters `a`-`z` and `A`-`Z`. */
export function isAlpha(code: u32): bool {
  if (ASC_TARGET == 1) {
    // fold to lower case, then a single unsigned range check
    return (code | 32) - Char.a < 26;
  } else {
    return inRange(code, Char.a, Char.z) || inRange(code, Char.A, Char.Z);
  }
}
92 | case Char.FormFeed: 93 | case Char.LineFeed: 94 | case Char.CarriageReturn: 95 | case Char.NonBreakingSpace: 96 | case 0x1680: // (1) 97 | case 0x2028: // (2) 98 | case 0x2029: // 99 | case 0x202f: // 100 | case 0x205f: // 101 | case 0x3000: // 102 | case 0xfeff: 103 | return true; // 104 | } 105 | if (inRange(code, 0x2000, 0x200a)) { 106 | return true; 107 | } 108 | return false; 109 | } 110 | -------------------------------------------------------------------------------- /assembly/env.ts: -------------------------------------------------------------------------------- 1 | export declare function log(str: string): void; 2 | -------------------------------------------------------------------------------- /assembly/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./regexp"; 2 | -------------------------------------------------------------------------------- /assembly/nfa/matcher.ts: -------------------------------------------------------------------------------- 1 | import { isDigit, isAlpha, isWhitespace, Char } from "../char"; 2 | 3 | import { 4 | CharacterNode, 5 | CharacterClassNode, 6 | CharacterSetNode, 7 | CharacterRangeNode, 8 | NodeType, 9 | } from "../parser/node"; 10 | import { Flags } from "../regexp"; 11 | import { Range } from "../util"; 12 | 13 | const enum MatcherType { 14 | Character, 15 | CharacterRange, 16 | CharacterSet, 17 | CharacterClass, 18 | } 19 | export class Matcher { 20 | @lazy static _flags!: Flags; 21 | 22 | constructor(readonly type: MatcherType) {} 23 | 24 | matches(code: u32): bool { 25 | return false; 26 | } 27 | 28 | static fromCharacterClassNode( 29 | node: CharacterSetNode, 30 | flags: Flags 31 | ): CharacterSetMatcher { 32 | return new CharacterSetMatcher(node.charClass, flags.dotAll); 33 | } 34 | 35 | static fromCharacterRangeNode( 36 | node: CharacterRangeNode, 37 | flags: Flags 38 | ): CharacterRangeMatcher { 39 | return new CharacterRangeMatcher( 40 | new 
/**
 * Matches one specific character code.
 *
 * With `ignoreCase`, ASCII letters are compared case-insensitively by folding
 * both sides to lower case (setting bit 0x20). Folding is applied only when
 * the pattern character is an ASCII letter: setting bit 0x20 on anything else
 * would make unrelated characters compare equal (for example `@` and the
 * backtick, or `[` and `{`).
 */
export class CharacterMatcher extends Matcher {
  constructor(private character: Char, private ignoreCase: bool) {
    super(MatcherType.Character);
    if (ignoreCase) {
      const folded = character | 0x20;
      if (folded >= Char.a && folded <= Char.z) {
        this.character = folded;
      } else {
        // not a letter: case folding does not apply, compare exactly
        this.ignoreCase = false;
      }
    }
  }

  /** Returns true when `code` equals the pattern character (modulo case). */
  matches(code: u32): bool {
    if (this.ignoreCase) {
      // `character` is a lower-case ASCII letter here, so after folding only
      // the same letter in either case can compare equal
      code |= 0x20;
    }
    return this.character == code;
  }
}
/**
 * Matches the predefined character sets: `.`, `\d`, `\D`, `\w`, `\W`, `\s`,
 * `\S`, and the single-character escapes `\t`, `\r`, `\n`, `\v`, `\f`.
 * `charClass` holds the letter (or dot) that identifies the set.
 */
export class CharacterSetMatcher extends Matcher {
  constructor(public charClass: Char, private dotAll: bool) {
    super(MatcherType.CharacterSet);
  }

  /**
   * Tests `code` against the set identified by `charClass`.
   * Throws for a `charClass` that is not one of the supported sets.
   */
  matches(code: u32): bool {
    const kind = this.charClass;

    if (kind == Char.d) return isDigit(code);
    if (kind == Char.D) return !isDigit(code);

    if (kind == Char.Dot) {
      if (this.dotAll) return true;
      // without dotAll the dot rejects CR, LF, U+2028 and U+2029
      return !(
        code == Char.CarriageReturn ||
        code == Char.LineFeed ||
        code == 8232 ||
        code == 8233
      );
    }

    if (kind == Char.w) {
      return isAlpha(code) || code == Char.Underscore || isDigit(code);
    }
    if (kind == Char.W) {
      return !(isAlpha(code) || code == Char.Underscore || isDigit(code));
    }

    if (kind == Char.s) return isWhitespace(code);
    if (kind == Char.S) return !isWhitespace(code);

    if (kind == Char.t) return code == Char.HorizontalTab;
    if (kind == Char.r) return code == Char.CarriageReturn;
    if (kind == Char.n) return code == Char.LineFeed;
    if (kind == Char.v) return code == Char.VerticalTab;
    if (kind == Char.f) return code == Char.FormFeed;

    throw new Error(
      "unsupported character class - " + String.fromCharCode(this.charClass)
    );
  }
}
/* eslint @typescript-eslint/no-empty-function: ["error", { "allow": ["constructors", "methods"] }] */
/**
 * Base class for a state of the automaton.
 *
 * A state holds its outgoing `transitions` and a numeric `id`. This base
 * implementation never consumes input: `matches` always reports
 * `MatchResult.Ignore`. Subclasses override `matches`.
 */
export class State {
  // counter used to hand out ids; advanced once for every State constructed
  // without an explicit id
  @lazy static _stateId: u32 = 0;
  constructor(
    public transitions: State[] = [],
    // the default value has a side effect: it post-increments the counter
    public id: u32 = State._stateId++
  ) {}

  /** Always returns `MatchResult.Ignore`; `input` and `position` are unused. */
  matches(input: string, position: u32): MatchResult {
    return MatchResult.Ignore;
  }
}
/**
 * Marks the end of a group. When reached, it stores the text between the
 * location recorded by the paired start marker and the current position as
 * that marker's capture (only for capturing groups).
 */
export class GroupEndMarkerState extends State {
  constructor(next: State, public startMarker: GroupStartMarkerState) {
    super();
    this.transitions.push(next);
  }

  /** Records the capture on the start marker; never consumes input. */
  matches(input: string, position: u32): MatchResult {
    if (this.startMarker.capturing) {
      this.startMarker.capture = input.substring(
        this.startMarker.location,
        position
      );
    }
    return MatchResult.Ignore;
  }
}

/**
 * A state that tests the character at the current position with a matcher.
 *
 * The type parameter is the concrete matcher type; it is constrained to
 * `Matcher` because `matches` calls the matcher's `matches` method.
 */
export class MatcherState<T extends Matcher> extends State {
  ignoreCase: bool = false;

  constructor(private matcher: T, next: State) {
    super();
    this.transitions.push(next);
  }

  /**
   * Returns `MatchResult.Match` when the matcher accepts the char code at
   * `position`, otherwise `MatchResult.Fail`.
   */
  matches(input: string, position: u32): MatchResult {
    return this.matcher.matches(input.charCodeAt(position))
      ? MatchResult.Match
      : MatchResult.Fail;
  }
}

/** An automaton fragment identified by its `start` and `end` states. */
export class Automata {
  /** Builds the automaton for the body of `ast`. */
  static toNFA(ast: AST, flags: Flags): Automata {
    return new AutomataFactor(flags).automataForNode(ast.body);
  }

  /** A fragment of two plain states joined by a single transition. */
  static fromEpsilon(): Automata {
    const start = new State();
    const end = new State();
    start.transitions.push(end);
    return new Automata(start, end);
  }

  /** A fragment whose start state tests one character with `matcher`. */
  static fromMatcher<T extends Matcher>(matcher: T): Automata {
    const end = new State();
    const start = new MatcherState<T>(matcher, end);
    return new Automata(start, end);
  }

  constructor(public start: State, public end: State) {}
}

/** Sequences two fragments by linking the end of `first` to the start of `second`. */
function concat(first: Automata, second: Automata): Automata {
  first.end.transitions.push(second.start);
  return new Automata(first.start, second.end);
}

/**
 * Alternation: a new start state branches to both fragments (first before
 * second) and both fragment ends lead to a new shared end state.
 */
function union(first: Automata, second: Automata): Automata {
  const start = new State();
  start.transitions.push(first.start);
  start.transitions.push(second.start);
  const end = new State();
  first.end.transitions.push(end);
  second.end.transitions.push(end);
  return new Automata(start, end);
}

/**
 * Zero-or-more repetition of `nfa`.
 *
 * `greedy` only changes the order in which the "repeat" and "leave"
 * transitions are added: repeat first when greedy, leave first otherwise.
 * NOTE(review): presumably the matcher tries transitions in insertion order;
 * confirm against the matching loop.
 */
function closure(nfa: Automata, greedy: bool): Automata {
  const start = new State();
  const end = new State();
  if (greedy) {
    nfa.end.transitions.push(nfa.start);
    nfa.end.transitions.push(end);
    start.transitions.push(nfa.start);
    start.transitions.push(end);
  } else {
    nfa.end.transitions.push(end);
    nfa.end.transitions.push(nfa.start);
    start.transitions.push(end);
    start.transitions.push(nfa.start);
  }
  return new Automata(start, end);
}
/**
 * One-or-more repetition of `nfa`: the fragment is always entered once, and
 * its end either loops back to its start or leaves to the new end state.
 * `greedy` decides which of those two transitions is added first.
 */
function oneOrMore(nfa: Automata, greedy: bool): Automata {
  const entry = new State();
  const exit = new State();
  entry.transitions.push(nfa.start);

  const outgoing = nfa.end.transitions;
  const preferred = greedy ? nfa.start : exit;
  const fallback = greedy ? exit : nfa.start;
  outgoing.push(preferred);
  outgoing.push(fallback);

  return new Automata(entry, exit);
}

/**
 * Wraps `nfa` in a start/end marker pair; the end marker records the text
 * matched by the group on the start marker.
 */
function group(nfa: Automata, capturing: bool, id: i32): Automata {
  const opener = new GroupStartMarkerState(nfa.start, capturing, id);
  const exit = new State();
  nfa.end.transitions.push(new GroupEndMarkerState(exit, opener));
  return new Automata(opener, exit);
}

/** Builds automata from AST nodes, using the regex flags for the matchers. */
class AutomataFactor {
  constructor(private flags: Flags) {}

  /**
   * Recursively converts `expression` into an automaton.
   * A `null` expression and assertion nodes produce an epsilon fragment.
   * Throws for unknown quantifiers and unrecognised node types.
   */
  automataForNode(expression: Node | null): Automata {
    if (expression == null) {
      return Automata.fromEpsilon();
    }

    switch (expression.type) {
      case NodeType.Repetition: {
        const repetition = expression as RepetitionNode;
        const inner = this.automataForNode(repetition.expression);
        const symbol = repetition.quantifier;
        switch (symbol) {
          case Char.Question:
            return zeroOrOne(inner, repetition.greedy);
          case Char.Plus:
            return oneOrMore(inner, repetition.greedy);
          case Char.Asterisk:
            return closure(inner, repetition.greedy);
          default:
            throw new Error(
              "unsupported quantifier - " + String.fromCharCode(symbol)
            );
        }
      }
      case NodeType.Character: {
        const matcher = Matcher.fromCharacterNode(
          expression as CharacterNode,
          this.flags
        );
        return Automata.fromMatcher(matcher);
      }
      case NodeType.Concatenation: {
        const parts = (expression as ConcatenationNode).expressions;
        const count = parts.length;
        if (count == 0) {
          return Automata.fromEpsilon();
        }
        // chain the parts left to right
        let chained = this.automataForNode(parts[0]);
        for (let index = 1; index < count; index++) {
          const next = this.automataForNode(parts[index]);
          chained = concat(chained, next);
        }
        return chained;
      }
      case NodeType.Alternation: {
        const alternation = expression as AlternationNode;
        const left = this.automataForNode(alternation.left);
        const right = this.automataForNode(alternation.right);
        return union(left, right);
      }
      case NodeType.CharacterClass: {
        const matcher = Matcher.fromCharacterSetNode(
          expression as CharacterClassNode,
          this.flags
        );
        return Automata.fromMatcher(matcher);
      }
      case NodeType.CharacterSet: {
        const matcher = Matcher.fromCharacterClassNode(
          expression as CharacterSetNode,
          this.flags
        );
        return Automata.fromMatcher(matcher);
      }
      case NodeType.Group: {
        const groupNode = expression as GroupNode;
        const inner = this.automataForNode(groupNode.expression);
        return group(inner, groupNode.capturing, groupNode.id);
      }
      case NodeType.Assertion:
        return Automata.fromEpsilon();
      default:
        throw new Error("un-recognised AST node");
    }
  }
}
/**
 * Depth-first traversal of the state graph reachable from `state`.
 *
 * `visitor` is invoked once per state, before that state's transitions are
 * followed. `visited` accumulates the states already seen so that cycles
 * terminate; callers normally leave it at its default.
 */
export function walker(
  state: State,
  visitor: (state: State) => void,
  visited: State[] = []
): void {
  if (!visited.includes(state)) {
    visitor(state);
    visited.push(state);

    const outgoing = state.transitions;
    const count = outgoing.length;
    let index = 0;
    while (index < count) {
      walker(outgoing[index], visitor, visited);
      index++;
    }
  }
}
/** A sequence of expressions that must match one after another. */
export class ConcatenationNode extends Node {
  constructor(public expressions: Node[]) {
    super(NodeType.Concatenation);
  }

  children(): Node[] {
    return this.expressions;
  }

  /** Deep copy: every child expression is cloned into a new array. */
  clone(): Node {
    // map already produces a fresh array, so no defensive copy is needed
    return new ConcatenationNode(
      this.expressions.map<Node>((s: Node): Node => s.clone())
    );
  }

  /** Replaces the child `node` with `replacement`. */
  replace(node: Node, replacement: Node): void {
    const expressions = this.expressions;
    const index = expressions.indexOf(node);
    this.expressions = replaceAtIndex(expressions, index, replacement);
  }
}

/** A bracketed character class such as `[a-z_]`, optionally negated. */
export class CharacterClassNode extends Node {
  constructor(public expressions: Node[], public negated: bool) {
    super(NodeType.CharacterClass);
  }

  /** Deep copy: every member expression is cloned into a new array. */
  clone(): Node {
    return new CharacterClassNode(
      this.expressions.map<Node>((s: Node): Node => s.clone()),
      this.negated
    );
  }
}

/** A range of character codes, from `from` to `to`. */
export class CharacterRangeNode extends Node {
  constructor(public from: u32, public to: u32) {
    super(NodeType.CharacterRange);
  }

  /** Type test based on the node's `type` tag. */
  static is(node: Node): bool {
    return node.type == NodeType.CharacterRange;
  }

  clone(): Node {
    return new CharacterRangeNode(this.from, this.to);
  }
}

/** A single literal character, stored as its character code. */
export class CharacterNode extends Node {
  constructor(public char: u32) {
    super(NodeType.Character);
  }

  clone(): Node {
    return new CharacterNode(this.char);
  }
}
/** A predefined character set; `charClass` is the character that identifies it. */
export class CharacterSetNode extends Node {
  constructor(public charClass: Char) {
    super(NodeType.CharacterSet);
  }

  clone(): Node {
    return new CharacterSetNode(this.charClass);
  }
}

/**
 * An expression with a quantifier character (`quantifier`) applied to it.
 * `greedy` defaults to true.
 */
export class RepetitionNode extends Node {
  constructor(
    public expression: Node,
    public quantifier: Char,
    public greedy: bool = true
  ) {
    super(NodeType.Repetition);
  }

  /** Deep copy that keeps the quantifier and its greediness. */
  clone(): Node {
    // greedy must be passed explicitly: the constructor default is `true`,
    // which would silently turn a cloned lazy quantifier into a greedy one
    return new RepetitionNode(
      this.expression.clone(),
      this.quantifier,
      this.greedy
    );
  }

  /** Replaces the repeated expression; `node` is not inspected. */
  replace(node: Node, replacement: Node): void {
    this.expression = replacement;
  }

  children(): Node[] {
    return [this.expression];
  }
}

/**
 * An expression with a counted quantifier carrying `from` and `to` bounds.
 * Construction throws when the quantified expression is itself a range
 * repetition.
 */
export class RangeRepetitionNode extends Node {
  constructor(
    public expression: Node,
    public from: i32,
    public to: i32,
    public greedy: bool = true
  ) {
    super(NodeType.RangeRepetition);
    if (expression.type == NodeType.RangeRepetition) {
      throw new Error("The preceding token is not quantifiable");
    }
  }

  /** Deep copy that keeps the bounds and the greediness. */
  clone(): Node {
    // as in RepetitionNode.clone, greedy has to be carried over explicitly
    return new RangeRepetitionNode(
      this.expression.clone(),
      this.from,
      this.to,
      this.greedy
    );
  }

  /** Replaces the repeated expression; `node` is not inspected. */
  replace(node: Node, replacement: Node): void {
    this.expression = replacement;
  }

  children(): Node[] {
    return [this.expression];
  }
}
/**
 * A parenthesised group wrapping a single expression.
 *
 * When constructed without an explicit `id` (the default of -1), the node
 * takes the next value from the static `_id` counter.
 */
export class GroupNode extends Node {
  @lazy static _id: i32 = 0;

  constructor(
    public expression: Node,
    public capturing: bool,
    public id: i32 = -1
  ) {
    super(NodeType.Group);
    // -1 means "not specified": allocate a fresh id from the counter
    this.id = id == -1 ? GroupNode._id++ : id;
  }

  children(): Node[] {
    const only: Node[] = [this.expression];
    return only;
  }

  /** Deep copy; the clone keeps this group's id rather than taking a new one. */
  clone(): Node {
    const inner = this.expression.clone();
    return new GroupNode(inner, this.capturing, this.id);
  }

  /** Replaces the wrapped expression; `node` is not inspected. */
  replace(node: Node, replacement: Node): void {
    this.expression = replacement;
  }
}
/**
 * Recursive-descent parser that turns a regular expression pattern into an
 * AST. The constructor is private; use the static `Parser.toAST(pattern)`.
 */
export class Parser {
  // cursor over the pattern; swapped for a saved copy when a speculative
  // parse (see maybeParseRepetitionRange) has to be rolled back
  iterator: StringIterator;

  private constructor(input: string) {
    this.iterator = new StringIterator(input);
  }

  /** Parses `input` and returns the resulting AST. */
  static toAST(input: string): AST {
    return new Parser(input).toAST();
  }

  /**
   * Consumes the current token and returns it.
   *
   * @param value when supplied (anything other than the -1 default) the
   *   current token must equal it, otherwise "invalid token" is thrown
   */
  private eatToken(value: u32 = -1): u32 {
    const currentToken = this.iterator.current;
    if (value != -1 && currentToken != value) {
      throw new Error("invalid token");
    }
    this.iterator.next();
    return currentToken;
  }

  /** Builds the AST; an empty pattern yields an AST with a null body. */
  private toAST(): AST {
    if (!this.iterator.more()) {
      return new AST(null);
    }
    return new AST(this.parseSequence());
  }

  /**
   * Parses the hex digits of an escaped character code: two digits after
   * `x`, four after `u`. The iterator is positioned on the `x` / `u`.
   * When the required number of hex digits does not follow, the `x` / `u`
   * itself is returned as a plain character.
   */
  private parseCharacterCode(code: u32): Node {
    const length = code == Char.x ? 2 : 4;
    // check whether we have the correct number of digits ahead
    for (let i = 0; i < length; i++) {
      if (!isHexadecimalDigit(this.iterator.lookahead(i + 1))) {
        return new CharacterNode(this.eatToken());
      }
    }
    // if so, parse the hex string
    this.eatToken(code);
    let value = "";
    for (let i = 0; i < length; i++) {
      value += this.iterator.currentAsString();
      this.eatToken();
    }
    return new CharacterNode(u32(parseInt(value, 16)));
  }

  /**
   * Parses a single (possibly escaped) character, an assertion (`^`, `$`)
   * or the `.` character set.
   */
  private parseCharacter(): Node {
    let token = this.iterator.current;
    if (this.iterator.current == Char.Backslash) {
      this.eatToken(Char.Backslash);
      token = this.iterator.current;
      if (isSpecialCharacter(token)) {
        // an escaped metacharacter is a literal
        this.eatToken();
        return new CharacterNode(token);
      } else if (isAssertion(token)) {
        return new CharacterNode(this.eatToken());
      } else if (token == Char.x) {
        return this.parseCharacterCode(Char.x);
      } else if (token == Char.u) {
        return this.parseCharacterCode(Char.u);
      } else if (isCharacterClass(token)) {
        return new CharacterSetNode(this.eatToken());
      } else {
        return new CharacterNode(this.eatToken());
      }
    }

    if (isAssertion(token)) {
      return new AssertionNode(this.eatToken());
    }

    if (token == Char.Dot) {
      this.eatToken(Char.Dot);
      return new CharacterSetNode(Char.Dot);
    }

    return new CharacterNode(this.eatToken());
  }

  /**
   * Consumes a run of decimal digits and returns its value, or -1 when the
   * current token is not a digit (nothing is consumed in that case).
   */
  private maybeParseDigit(): i32 {
    let digitStr = "";
    while (this.iterator.more() && isDigit(this.iterator.current)) {
      digitStr += this.iterator.currentAsString();
      this.eatToken();
    }
    return digitStr == "" ? -1 : i32(parseInt(digitStr));
  }

  /**
   * Speculatively parses a range quantifier: `{n}`, `{n,}` or `{n,m}`.
   *
   * @returns the parsed range (`to` is -1 for an open-ended `{n,}`), or
   *   null when the text at the cursor is not a range quantifier. In the
   *   null case the iterator is restored to where it was on entry, so the
   *   caller can re-parse the `{` as an ordinary character.
   * @throws Error when the bounds are out of order, e.g. `{3,1}`
   */
  private maybeParseRepetitionRange(): Range | null {
    // snapshot, used to roll back on every non-quantifier outcome
    const iteratorCopy = this.iterator.copy();
    this.eatToken(Char.LeftCurlyBrace);

    const from = this.maybeParseDigit();
    if (from != -1) {
      if (this.iterator.current == Char.RightCurlyBrace) {
        this.eatToken();
        return new Range(from, from);
      } else if (this.iterator.current == Char.Comma) {
        this.eatToken();
        const to = this.maybeParseDigit();
        // @ts-ignore
        if (this.iterator.current == Char.RightCurlyBrace) {
          if (to != -1 && to < from) {
            throw new Error(
              "Invalid regular expression: numbers out of order in {} quantifier"
            );
          }
          this.eatToken();
          return new Range(from, to);
        }
      }
    }

    // not a quantifier (this includes a missing lower bound): roll back so
    // the opening brace is not lost
    this.iterator = iteratorCopy;
    return null;
  }

  /**
   * Returns false, consuming the `?`, when a quantifier is followed by `?`
   * (lazy); returns true otherwise.
   */
  private isGreedy(): bool {
    if (this.iterator.current == Char.Question) {
      this.eatToken();
      return false;
    }
    return true;
  }

  /**
   * Returns false, consuming the `?:` prefix, when the group being opened
   * is non-capturing; returns true otherwise.
   */
  private isCapturing(): bool {
    if (
      this.iterator.current == Char.Question &&
      this.iterator.lookahead(1) == Char.Colon
    ) {
      this.eatToken(Char.Question);
      this.eatToken(Char.Colon);
      return false;
    }
    return true;
  }

  /**
   * Parses a sequence of terms up to the end of input or an unconsumed `)`.
   * Returns the single parsed node, or a ConcatenationNode when there is
   * more than one.
   *
   * NOTE(review): an empty sequence (e.g. `()` or a leading `|`) reads
   * `nodes[0]` of an empty array - confirm how that should be handled.
   */
  private parseSequence(): Node {
    let nodes = new Array<Node>();
    while (this.iterator.more()) {
      const token = this.iterator.current;
      if (token == Char.RightParenthesis) break;
      // @ts-ignore
      if (token == Char.VerticalBar) {
        this.eatToken(Char.VerticalBar);
        const left = nodes.length > 1 ? new ConcatenationNode(nodes) : nodes[0];
        // everything parsed so far becomes the left-hand alternative
        nodes = [new AlternationNode(left, this.parseSequence())];
        // @ts-ignore
      } else if (token == Char.LeftParenthesis) {
        this.eatToken(Char.LeftParenthesis);
        const capturing = this.isCapturing();
        nodes.push(new GroupNode(this.parseSequence(), capturing));
        this.eatToken(Char.RightParenthesis);
        // @ts-ignore
      } else if (token == Char.LeftCurlyBrace) {
        const range = this.maybeParseRepetitionRange();
        if (range != null) {
          if (nodes.length == 0) {
            throw new Error("Invalid regular expression: Nothing to repeat");
          }
          const expression = nodes.pop();
          nodes.push(
            new RangeRepetitionNode(
              expression,
              range.from,
              range.to,
              this.isGreedy()
            )
          );
        } else {
          // this is not the start of a repetition, it's just a char!
          nodes.push(this.parseCharacter());
        }
      } else if (isQuantifier(token)) {
        if (nodes.length == 0) {
          throw new Error("Invalid regular expression: Nothing to repeat");
        }

        const expression = nodes.pop();
        const quantifier = this.eatToken();
        nodes.push(new RepetitionNode(expression, quantifier, this.isGreedy()));
        // @ts-ignore
      } else if (token == Char.LeftSquareBracket) {
        nodes.push(this.parseCharacterClass());
      } else {
        nodes.push(this.parseCharacter());
      }
    }

    return nodes.length > 1 ? new ConcatenationNode(nodes) : nodes[0];
  }

  /** Parses `x-y` inside a character class. */
  private parseCharacterRange(): Node {
    const from = this.eatToken();
    this.eatToken(Char.Minus);
    const to = this.eatToken();
    return new CharacterRangeNode(from, to);
  }

  /**
   * Parses a bracketed character class, including an optional leading `^`
   * negation.
   *
   * @throws SyntaxError when the input ends before the closing `]`
   */
  private parseCharacterClass(): CharacterClassNode {
    this.eatToken(Char.LeftSquareBracket);

    const negated = this.iterator.current == Char.Caret;
    if (negated) {
      this.eatToken(Char.Caret);
    }

    const nodes = new Array<Node>();
    // a `]` in first position is a literal, hence the nodes.length check
    while (
      this.iterator.current != Char.RightSquareBracket ||
      nodes.length == 0
    ) {
      // lookahead for character range
      if (
        this.iterator.current != Char.Backslash &&
        this.iterator.lookahead(1) == Char.Minus &&
        this.iterator.lookahead(2) != Char.RightSquareBracket
      ) {
        nodes.push(this.parseCharacterRange());
      } else {
        // have we encountered a backslash?
        if (this.iterator.current == Char.Backslash) {
          this.eatToken();
          if (isCharacterSetSpecialChar(this.iterator.current)) {
            // if it was a backslashed special char, treat as a regular char
            nodes.push(new CharacterNode(this.eatToken()));
          } else {
            // otherwise this is a character class
            nodes.push(new CharacterSetNode(this.eatToken()));
          }
        } else {
          nodes.push(new CharacterNode(this.eatToken()));
        }
      }

      if (!this.iterator.more()) {
        throw new SyntaxError("Unterminated character class");
      }
    }
    this.eatToken(Char.RightSquareBracket);
    return new CharacterClassNode(nodes, negated);
  }
}
/**
 * Post-order traversal helper: visits the children of `node` from last to
 * first (recursively), then reports `node` itself to `visitor` together
 * with its parent.
 */
function walkNode(
  node: Node,
  parentNode: Node,
  visitor: (node: NodeVisitor) => void
): void {
  // the child list is fetched once, before any visitor has run
  const descendants = node.children();
  let remaining = descendants.length;
  while (remaining > 0) {
    remaining--;
    walkNode(descendants[remaining], node, visitor);
  }

  visitor(new NodeVisitor(node, parentNode));
}
/**
 * AST visitor that replaces a range repetition with a concatenation of
 * cloned nodes, e.g. a{2} becomes aa.
 *
 * - a{4}   => aaaa
 * - a{4,}  => aaaaa*
 * - a{4,6} => aaaaa?a?
 *
 * Nodes of any other type are left untouched.
 *
 * @throws Error when either bound exceeds QUANTIFIER_LIMIT. The lower bound
 *   is checked as well, because an open-ended range (`to == -1`) still
 *   creates `from` clones.
 */
export function expandRepetitions(visitor: NodeVisitor): void {
  let node = visitor.node;
  if (node.type != NodeType.RangeRepetition) {
    return;
  }

  const rangeRepNode = node as RangeRepetitionNode;
  const from = rangeRepNode.from;
  const to = rangeRepNode.to;

  if (from > QUANTIFIER_LIMIT || to > QUANTIFIER_LIMIT) {
    throw new Error(
      "Cannot handle range quantifiers > " + QUANTIFIER_LIMIT.toString()
    );
  }

  // find the parent (wrapping this node in a concatenation if required)
  const concatNode = parentAsConcatNode(visitor);
  const expressions = concatNode.expressions;

  // locate the original index
  const index = expressions.indexOf(rangeRepNode);

  const expression = rangeRepNode.expression;
  // mandatory part: the original expression followed by from - 1 clones
  const clones = new Array<Node>(from);
  if (from > 0) {
    clones[0] = expression;
    for (let i = 1; i < from; i++) {
      clones[i] = expression.clone();
    }
  }

  if (to == -1) {
    // open-ended: any number of further repetitions
    clones.push(
      new RepetitionNode(expression.clone(), Char.Asterisk, rangeRepNode.greedy)
    );
  } else {
    // bounded: one optional repetition per step between from and to
    const count = to - from;
    for (let i = 0; i < count; i++) {
      clones.push(
        new RepetitionNode(
          expression.clone(),
          Char.Question,
          rangeRepNode.greedy
        )
      );
    }
  }

  // replace the rangeRepNode with the clones
  concatNode.expressions = expressions
    .slice(0, index)
    .concat(clones)
    .concat(expressions.slice(index + 1));
}
/**
 * Parsed form of a regex flag string. Each property is true when the
 * corresponding letter occurs anywhere in the string:
 * `g` global, `i` ignoreCase, `s` dotAll, `m` multiline.
 * A null or empty flag string leaves every flag false.
 */
export class Flags {
  global: bool = false;
  ignoreCase: bool = false;
  dotAll: bool = false;
  multiline: bool = false;

  constructor(flagString: string | null) {
    if (flagString != null) {
      // an empty string contains none of the letters, so all flags stay false
      this.multiline = flagString.indexOf("m") >= 0;
      this.dotAll = flagString.indexOf("s") >= 0;
      this.ignoreCase = flagString.indexOf("i") >= 0;
      this.global = flagString.indexOf("g") >= 0;
    }
  }
}
/**
 * A compiled regular expression.
 *
 * The constructor parses the pattern, expands range quantifiers, builds the
 * NFA and collects the capturing group-start states. `exec` then searches a
 * string for a match, trying each start index in turn.
 */
export class RegExp {
  // scratch list filled by the NFA walk in the constructor; each instance
  // keeps its own reference in `groupMarkers`
  // NOTE(review): presumably static because the walker callback cannot
  // capture constructor locals - confirm
  @lazy static gm: GroupStartMarkerState[] = new Array<GroupStartMarkerState>();
  // index at which the next `exec` starts searching
  lastIndex: i32 = 0;
  private flags: Flags;
  private nfa: Automata;
  // true when the pattern's top-level concatenation ends with `$`
  private endOfInput: bool = false;
  // true when the pattern's top-level concatenation starts with `^`
  private startOfInput: bool = false;
  private groupMarkers: GroupStartMarkerState[];

  /**
   * @param regex the pattern source
   * @param flagsString optional flag letters (see Flags)
   */
  constructor(private regex: string, public flagsString: string | null = null) {
    const ast = Parser.toAST(regex);
    const flags = new Flags(flagsString);

    // look for start / end assertions
    const body = ast.body;
    if (body != null && body.type == NodeType.Concatenation) {
      const expressions = (ast.body as ConcatenationNode).expressions;
      this.startOfInput = AssertionNode.is(first(expressions), Char.Caret);
      this.endOfInput = AssertionNode.is(last(expressions), Char.Dollar);
    }

    astWalker(ast, expandRepetitions);

    const nfa = Automata.toNFA(ast, flags);

    // find all the group marker states
    RegExp.gm = new Array<GroupStartMarkerState>();
    nfaWalker(nfa.start, (state) => {
      if (state instanceof GroupStartMarkerState) {
        const startMarker = state as GroupStartMarkerState;
        if (startMarker.capturing) {
          RegExp.gm.push(state as GroupStartMarkerState);
        }
      }
    });
    this.nfa = nfa;
    this.groupMarkers = RegExp.gm;

    this.flags = flags;
  }

  /**
   * Searches `str` for a match, starting at `lastIndex`.
   *
   * @returns the match (full match followed by the captured groups), or
   *   null when nothing matches. With the global flag set, `lastIndex` is
   *   moved to the end of the match; after a failed search it is reset to 0.
   */
  exec(str: string): Match | null {
    let groupMarkers = this.groupMarkers;
    // remove all previous group marker results
    for (let i = 0, count = groupMarkers.length; i < count; i++) {
      groupMarkers[i].capture = "";
    }

    let len = str.length;
    if (!len) {
      // empty input: a single attempt against the empty string
      const matchStr = recursiveBacktrackingSearch(this.nfa.start, "");
      return matchStr != null ? new Match([matchStr], 0, str) : null;
    }

    // a pattern anchored with ^ (and not multiline) can only match at index
    // 0; this bound does not change while searching, so compute it once
    const searchLimit = this.startOfInput && !this.flags.multiline ? 1 : len;

    // search for a match at each index within the string
    for (
      let matchIndex = this.lastIndex;
      matchIndex < searchLimit;
      matchIndex++
    ) {
      // search for a match in this substring
      const matchStr = recursiveBacktrackingSearch(
        this.nfa.start,
        str.substr(matchIndex)
      );

      // we have found a match
      if (matchStr != null) {
        // remove any non-flagged captures
        groupMarkers.forEach((gm) => {
          gm.capture = gm.flagged ? gm.capture : "";
        });

        const matches: string[] = [matchStr];
        const match = new Match(
          matches.concat(lastCapturesForGroup(groupMarkers)),
          matchIndex,
          str
        );

        // return this match (checking end of input condition)
        const matchEndIndex = match.index + match.matches[0].length;

        // has the start of input criteria been met?
        if (this.startOfInput) {
          if (this.flags.multiline && matchIndex != 0) {
            // multiline: a match may also start right after a line feed
            if (str.charCodeAt(matchIndex - 1) != Char.LineFeed) continue;
          } else if (matchIndex != 0) {
            continue;
          }
        }

        // has the end of input criteria been met?
        if (this.endOfInput) {
          if (this.flags.multiline && matchEndIndex != len) {
            // multiline: a match may also end right before a line feed
            if (str.charCodeAt(matchEndIndex) != Char.LineFeed) continue;
          } else if (matchEndIndex != len) {
            continue;
          }
        }

        if (this.flags.global) {
          this.lastIndex = matchEndIndex;
        }
        return match;
      }
    }

    this.lastIndex = 0;
    return null;
  }

  /** Returns true when `exec` finds a match in `str`. */
  test(str: string): bool {
    return this.exec(str) != null;
  }

  /** Returns the pattern source this instance was created from. */
  toString(): string {
    return this.regex;
  }

  get global(): bool {
    return this.flags.global;
  }

  get ignoreCase(): bool {
    return this.flags.ignoreCase;
  }

  get dotAll(): bool {
    return this.flags.dotAll;
  }

  get multiline(): bool {
    return this.flags.multiline;
  }
}
/**
 * A range of integers. Both bounds are inclusive, as `contains` shows.
 */
export class Range {
  constructor(public from: i32, public to: i32) {}

  /**
   * Returns the overlap between this range and `other`, or null when they
   * do not overlap. Because the bounds are inclusive, two ranges that share
   * exactly one value intersect in a single-value range.
   */
  intersection(other: Range): Range | null {
    // plain integer comparisons: no round trip through floating point
    const lower = this.from > other.from ? this.from : other.from;
    const upper = this.to < other.to ? this.to : other.to;
    return lower <= upper ? new Range(lower, upper) : null;
  }

  /** Returns a new range with both bounds shifted by `value`. */
  offset(value: i32): Range {
    return new Range(this.from + value, this.to + value);
  }

  /** Returns true when `value` lies within the range, bounds included. */
  contains(value: i32): bool {
    return value >= this.from && value <= this.to;
  }
}
-1 : 5); 25 | __unpin(regexPtr); 26 | } 27 | 28 | // add tests 29 | suite 30 | .add("baseline", () => { 31 | // this test primarily measures the overhead in the wasm / JS interop 32 | executeRegex("", ""); 33 | }) 34 | .add("character class", () => { 35 | executeRegex("[a-zA-C0-9J]", "J"); // match char 36 | executeRegex("[a-zA-C0-9J]", "a"); // match char in range 37 | }) 38 | .add("concatenation", () => { 39 | executeRegex("this is a long string", "this is a long string"); 40 | }) 41 | .add("quantifiers", () => { 42 | executeRegex("a*", "aaaaa"); 43 | executeRegex("a+", "aaaaa"); 44 | executeRegex("a?", "a"); 45 | }) 46 | .add("range quantifiers", () => { 47 | executeRegex("a{20,30}", "a".repeat(25)); 48 | }) 49 | .add("alternation", () => { 50 | executeRegex("a|b|c|d|e|f|g", "d"); 51 | }) 52 | .add("multiple regex matches", () => { 53 | const text = 54 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."; 55 | executeRegex("[a-l]{3}", text, true); 56 | }) 57 | .add("complex regex", () => { 58 | const text = 59 | "43.Word Processor
(N-1286)
Lega lstaff.comCA - Statewide"; 60 | const regex = 61 | "]{0,})>]{0,})>([\\d]{0,}\\.)(.*)((
([\\w\\W\\s\\d][^<>]{0,})|[\\s]{0,}))<\\/a><\\/TD>]{0,})>([\\w\\W\\s\\d][^<>]{0,})<\\/TD>]{0,})>([\\w\\W\\s\\d][^<>]{0,})<\\/TD><\\/TR>"; 62 | executeRegex(regex, text, true); 63 | }) 64 | // add listeners 65 | .on("cycle", (event) => { 66 | console.log(String(event.target)); 67 | }) 68 | // run async 69 | .run({ async: true }); 70 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "assemblyscript-regex", 3 | "version": "0.1.0", 4 | "description": "A regex engine built with AssemblyScript", 5 | "ascMain": "assembly/index.ts", 6 | "main": "assembly/index.ts", 7 | "scripts": { 8 | "pretest": "npm run test:generate && npm run asbuild:untouched && npm run asbuild:empty", 9 | "test": "npm run prettier && npm run eslint && npm run asp && npm run test:empty", 10 | "test:empty": "diff build/empty.wat assembly/__tests__/empty.wat", 11 | "test:generate": "node spec/test-generator.js", 12 | "asp": "asp --verbose --nologo", 13 | "asp:ci": "asp --nologo", 14 | "prettier": "prettier --check .", 15 | "prettier:write": "prettier --write .", 16 | "eslint:write": "npm run eslint -- --fix ", 17 | "asbuild:untouched": "asc assembly/index.ts --target debug", 18 | "asbuild:optimized": "asc assembly/index.ts --target release", 19 | "asbuild": "npm run asbuild:untouched && npm run asbuild:optimized", 20 | "asbuild:empty": "asc --config asconfig.empty.json", 21 | "tsrun": "ts-node ts/index.ts", 22 | "benchmark": "node benchmark/benchmark.js", 23 | "eslint": "eslint --max-warnings 0 --ext ts \"assembly/**/*.ts\"" 24 | }, 25 | "author": "colin.eberhardt@gmail.com", 26 | "license": "MIT", 27 | "repository": { 28 | "type": "git", 29 | "url": "git+https://github.com/ColinEberhardt/assemblyscript-regex" 30 | }, 31 | "devDependencies": { 32 | "@as-pect/cli": "^8.0.1", 33 | "@assemblyscript/loader": "^0.27.0", 34 | "@types/node": "^14.14.13", 35 | 
"@typescript-eslint/eslint-plugin": "^4.14.1", 36 | "@typescript-eslint/parser": "^4.14.1", 37 | "assemblyscript": "^0.25.0", 38 | "benchmark": "^2.1.4", 39 | "eslint": "^7.18.0", 40 | "husky": "^4.2.5", 41 | "prettier": "^2.2.1", 42 | "semantic-release": "^17.3.7", 43 | "text-encoding": "^0.7.0", 44 | "ts-node": "^9.1.1", 45 | "typescript": "^4.1.3" 46 | }, 47 | "husky": { 48 | "hooks": { 49 | "pre-commit": "npm run prettier && npm run eslint" 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /spec/pcre-1.dat: -------------------------------------------------------------------------------- 1 | E the quick brown fox the quick brown fox (0,19) 2 | E SAME The quick brown FOX NOMATCH 3 | E SAME What do you know about the quick brown fox? (23,42) 4 | E SAME What do you know about THE QUICK BROWN FOX? NOMATCH 5 | Ei The quick brown fox the quick brown fox (0,19) 6 | Ei SAME The quick brown FOX (0,19) 7 | Ei SAME What do you know about the quick brown fox? (23,42) 8 | Ei SAME What do you know about THE QUICK BROWN FOX? 
(23,42) 9 | E$ abcd\\t\\n\\r\\f\\a\\e\\071\\x3b\\$\\\\\\?caxyz abcd\t\n\r\f\a\e9;$\\?caxyz (0,20) 10 | E a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz abxyzpqrrrabbxyyyypqAzz (0,23) 11 | E SAME abxyzpqrrrabbxyyyypqAzz (0,23) 12 | E SAME aabxyzpqrrrabbxyyyypqAzz (0,24) 13 | E SAME aaabxyzpqrrrabbxyyyypqAzz (0,25) 14 | E SAME aaaabxyzpqrrrabbxyyyypqAzz (0,26) 15 | E SAME abcxyzpqrrrabbxyyyypqAzz (0,24) 16 | E SAME aabcxyzpqrrrabbxyyyypqAzz (0,25) 17 | E SAME aaabcxyzpqrrrabbxyyyypAzz (0,25) 18 | E SAME aaabcxyzpqrrrabbxyyyypqAzz (0,26) 19 | E SAME aaabcxyzpqrrrabbxyyyypqqAzz (0,27) 20 | E SAME aaabcxyzpqrrrabbxyyyypqqqAzz (0,28) 21 | E SAME aaabcxyzpqrrrabbxyyyypqqqqAzz (0,29) 22 | E SAME aaabcxyzpqrrrabbxyyyypqqqqqAzz (0,30) 23 | E SAME aaabcxyzpqrrrabbxyyyypqqqqqqAzz (0,31) 24 | E SAME aaaabcxyzpqrrrabbxyyyypqAzz (0,27) 25 | E SAME abxyzzpqrrrabbxyyyypqAzz (0,24) 26 | E SAME aabxyzzzpqrrrabbxyyyypqAzz (0,26) 27 | E SAME aaabxyzzzzpqrrrabbxyyyypqAzz (0,28) 28 | E SAME aaaabxyzzzzpqrrrabbxyyyypqAzz (0,29) 29 | E SAME abcxyzzpqrrrabbxyyyypqAzz (0,25) 30 | E SAME aabcxyzzzpqrrrabbxyyyypqAzz (0,27) 31 | E SAME aaabcxyzzzzpqrrrabbxyyyypqAzz (0,29) 32 | E SAME aaaabcxyzzzzpqrrrabbxyyyypqAzz (0,30) 33 | E SAME aaaabcxyzzzzpqrrrabbbxyyyypqAzz (0,31) 34 | E SAME aaaabcxyzzzzpqrrrabbbxyyyyypqAzz (0,32) 35 | E SAME aaabcxyzpqrrrabbxyyyypABzz (0,26) 36 | E SAME aaabcxyzpqrrrabbxyyyypABBzz (0,27) 37 | E SAME >>>aaabxyzpqrrrabbxyyyypqAzz (3,28) 38 | E SAME >aaaabxyzpqrrrabbxyyyypqAzz (1,27) 39 | E SAME >>>>abcxyzpqrrrabbxyyyypqAzz (4,28) 40 | E SAME abxyzpqrrabbxyyyypqAzz NOMATCH 41 | E SAME abxyzpqrrrrabbxyyyypqAzz NOMATCH 42 | E SAME abxyzpqrrrabxyyyypqAzz NOMATCH 43 | E SAME aaaabcxyzzzzpqrrrabbbxyyyyyypqAzz NOMATCH 44 | E SAME aaaabcxyzzzzpqrrrabbbxyyypqAzz NOMATCH 45 | E SAME aaabcxyzpqrrrabbxyyyypqqqqqqqAzz NOMATCH 46 | E ^(abc){1,2}zz abczz (0,5)(0,3) 47 | E SAME abcabczz (0,8)(3,6) 48 | E SAME zz NOMATCH 49 | E SAME abcabcabczz NOMATCH 50 | E SAME >>abczz NOMATCH 51 | E 
^(b+?|a){1,2}?c bc (0,2)(0,1) 52 | E SAME bbc (0,3)(1,2) 53 | E SAME bbbc (0,4)(1,3) 54 | E SAME bac (0,3)(1,2) 55 | E SAME bbac (0,4)(2,3) 56 | E SAME aac (0,3)(1,2) 57 | E SAME abbbbbbbbbbbc (0,13)(1,12) 58 | E SAME bbbbbbbbbbbac (0,13)(11,12) 59 | E SAME aaac NOMATCH 60 | E SAME abbbbbbbbbbbac NOMATCH 61 | E ^(b+|a){1,2}c bc (0,2)(0,1) 62 | E SAME bbc (0,3)(0,2) 63 | E SAME bbbc (0,4)(0,3) 64 | E SAME bac (0,3)(1,2) 65 | E SAME bbac (0,4)(2,3) 66 | E SAME aac (0,3)(1,2) 67 | E SAME abbbbbbbbbbbc (0,13)(1,12) 68 | E SAME bbbbbbbbbbbac (0,13)(11,12) 69 | E SAME aaac NOMATCH 70 | E SAME abbbbbbbbbbbac NOMATCH 71 | E ^(b+|a){1,2}?bc bbc (0,3)(0,1) 72 | E ^(b*|ba){1,2}?bc babc (0,4)(0,2) 73 | E SAME bbabc (0,5)(1,3) 74 | E SAME bababc (0,6)(2,4) 75 | E SAME bababbc NOMATCH 76 | E SAME babababc NOMATCH 77 | E ^(ba|b*){1,2}?bc babc (0,4)(0,2) 78 | E SAME bbabc (0,5)(1,3) 79 | E SAME bababc (0,6)(2,4) 80 | E SAME bababbc NOMATCH 81 | E SAME babababc NOMATCH 82 | E$ ^\\ca\\cA\\c[\\c{\\c: \x01\x01\e;z (0,5) 83 | E$kv ^[ab\\]cde] athing (0,1) 84 | E$kv SAME bthing (0,1) 85 | E$kv SAME ]thing (0,1) 86 | E$kv SAME cthing (0,1) 87 | E$kv SAME dthing (0,1) 88 | E$kv SAME ething (0,1) 89 | E$kv SAME fthing NOMATCH 90 | E$kv SAME [thing NOMATCH 91 | E$kv SAME \\thing NOMATCH 92 | E ^[]cde] ]thing (0,1) 93 | E SAME cthing (0,1) 94 | E SAME dthing (0,1) 95 | E SAME ething (0,1) 96 | E SAME athing NOMATCH 97 | E SAME fthing NOMATCH 98 | E$kv ^[^ab\\]cde] fthing (0,1) 99 | E$kv SAME [thing (0,1) 100 | E$kv SAME \\thing (0,1) 101 | E$kv SAME athing NOMATCH 102 | E$kv SAME bthing NOMATCH 103 | E$kv SAME ]thing NOMATCH 104 | E$kv SAME cthing NOMATCH 105 | E$kv SAME dthing NOMATCH 106 | E$kv SAME ething NOMATCH 107 | E ^[^]cde] athing (0,1) 108 | E SAME fthing (0,1) 109 | E SAME ]thing NOMATCH 110 | E SAME cthing NOMATCH 111 | E SAME dthing NOMATCH 112 | E SAME ething NOMATCH 113 | E$x ^\\� � (0,1) 114 | E ^� � (0,1) 115 | E ^[0-9]+$ 0 (0,1) 116 | E SAME 1 (0,1) 117 | E SAME 2 (0,1) 118 
| E SAME 3 (0,1) 119 | E SAME 4 (0,1) 120 | E SAME 5 (0,1) 121 | E SAME 6 (0,1) 122 | E SAME 7 (0,1) 123 | E SAME 8 (0,1) 124 | E SAME 9 (0,1) 125 | E SAME 10 (0,2) 126 | E SAME 100 (0,3) 127 | E SAME abc NOMATCH 128 | E ^.*nter enter (0,5) 129 | E SAME inter (0,5) 130 | E SAME uponter (0,7) 131 | E ^xxx[0-9]+$ xxx0 (0,4) 132 | E SAME xxx1234 (0,7) 133 | E SAME xxx NOMATCH 134 | E ^.+[0-9][0-9][0-9]$ x123 (0,4) 135 | E SAME xx123 (0,5) 136 | E SAME 123456 (0,6) 137 | E SAME 123 NOMATCH 138 | E SAME x1234 (0,5) 139 | E ^.+?[0-9][0-9][0-9]$ x123 (0,4) 140 | E SAME xx123 (0,5) 141 | E SAME 123456 (0,6) 142 | E SAME 123 NOMATCH 143 | E SAME x1234 (0,5) 144 | E$ ^([^!]+)!(.+)=apquxz\\.ixr\\.zzz\\.ac\\.uk$ abc!pqr=apquxz.ixr.zzz.ac.uk (0,28)(0,3)(4,7) 145 | E$ SAME !pqr=apquxz.ixr.zzz.ac.uk NOMATCH 146 | E$ SAME abc!=apquxz.ixr.zzz.ac.uk NOMATCH 147 | E$ SAME abc!pqr=apquxz:ixr.zzz.ac.uk NOMATCH 148 | E$ SAME abc!pqr=apquxz.ixr.zzz.ac.ukk NOMATCH 149 | E : Well, we need a colon: somewhere (21,22) 150 | E SAME *** Fail if we don't NOMATCH 151 | E$iv ([\\da-f:]+)$ 0abc (0,4)(0,4) 152 | E$iv SAME abc (0,3)(0,3) 153 | E$iv SAME fed (0,3)(0,3) 154 | E$iv SAME E (0,1)(0,1) 155 | E$iv SAME :: (0,2)(0,2) 156 | E$iv SAME 5f03:12C0::932e (0,15)(0,15) 157 | E$iv SAME fed def (4,7)(4,7) 158 | E$iv SAME Any old stuff (11,13)(11,13) 159 | E$iv SAME 0zzz NOMATCH 160 | E$iv SAME gzzz NOMATCH 161 | E$iv SAME fed\x20 NOMATCH 162 | E$iv SAME Any old rubbish NOMATCH 163 | E$ ^.*\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})$ .1.2.3 (0,6)(1,2)(3,4)(5,6) 164 | E$ SAME A.12.123.0 (0,10)(2,4)(5,8)(9,10) 165 | E$ SAME .1.2.3333 NOMATCH 166 | E$ SAME 1.2.3 NOMATCH 167 | E$ SAME 1234.2.3 NOMATCH 168 | E$ ^(\\d+)\\s+IN\\s+SOA\\s+(\\S+)\\s+(\\S+)\\s*\\(\\s*$ 1 IN SOA non-sp1 non-sp2( (0,25)(0,1)(9,16)(17,24) 169 | E$ SAME 1 IN SOA non-sp1 non-sp2 ( (0,40)(0,1)(18,25)(29,36) 170 | E$ SAME 1IN SOA non-sp1 non-sp2( NOMATCH 171 | E$kv ^[a-zA-Z\\d][a-zA-Z\\d\\-]*(\\.[a-zA-Z\\d][a-zA-z\\d\\-]*)*\\.$ a. 
(0,2) 172 | E$kv SAME Z. (0,2) 173 | E$kv SAME 2. (0,2) 174 | E$kv SAME ab-c.pq-r. (0,10)(4,9) 175 | E$kv SAME sxk.zzz.ac.uk. (0,14)(10,13) 176 | E$kv SAME x-.y-. (0,6)(2,5) 177 | E$kv SAME -abc.peq. NOMATCH 178 | E$kv ^\\*\\.[a-z]([a-z\\-\\d]*[a-z\\d]+)?(\\.[a-z]([a-z\\-\\d]*[a-z\\d]+)?)*$ *.a (0,3) 179 | E$kv SAME *.b0-a (0,6)(3,6) 180 | E$kv SAME *.c3-b.c (0,8)(3,6)(6,8) 181 | E$kv SAME *.c-a.b-c (0,9)(3,5)(5,9)(7,9) 182 | E$kv SAME *.0 NOMATCH 183 | E$kv SAME *.a- NOMATCH 184 | E$kv SAME *.a-b.c- NOMATCH 185 | E$kv SAME *.c-a.0-c NOMATCH 186 | E ^(?=ab(de))(abd)(e) abde (0,4)(2,4)(0,3)(3,4) 187 | E ^(?!(ab)de|x)(abd)(f) abdf (0,4)(?,?)(0,3)(3,4) 188 | E ^(?=(ab(cd)))(ab) abcd (0,2)(0,4)(2,4)(0,2) 189 | E$iv ^[\\da-f](\\.[\\da-f])*$ a.b.c.d (0,7)(5,7) 190 | E$iv SAME A.B.C.D (0,7)(5,7) 191 | E$iv SAME a.b.c.1.2.3.C (0,13)(11,13) 192 | E$ ^".*"\\s*(;.*)?$ "1234" (0,6) 193 | E$ SAME "abcd" ; (0,8)(7,8) 194 | E$ SAME "" ; rhubarb (0,12)(3,12) 195 | E$ SAME "1234" : things NOMATCH 196 | E$ ^$ NULL (0,0) 197 | E$c ^ a (?# begins with a) b\\sc (?# then b c) $ (?# then end) ab c (0,4) 198 | E$c SAME abc NOMATCH 199 | E$c SAME ab cde NOMATCH 200 | E$ (?x) ^ a (?# begins with a) b\\sc (?# then b c) $ (?# then end) ab c (0,4) 201 | E$ SAME abc NOMATCH 202 | E$ SAME ab cde NOMATCH 203 | E$c ^ a\\ b[c ]d $ a bcd (0,5) 204 | E$c SAME a b d (0,5) 205 | E$c SAME abcd NOMATCH 206 | E$c SAME ab d NOMATCH 207 | E ^(a(b(c)))(d(e(f)))(h(i(j)))(k(l(m)))$ abcdefhijklm (0,12)(0,3)(1,3)(2,3)(3,6)(4,6)(5,6)(6,9)(7,9)(8,9)(9,12)(10,12)(11,12) 208 | E ^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$ abcdefhijklm (0,12)(1,3)(2,3)(4,6)(5,6)(7,9)(8,9)(10,12)(11,12) 209 | E$v ^[\\w][\\W][\\s][\\S][\\d][\\D][\\f][\\n][\\c]][\\022] a+ Z0+\x0c\n\x1d\x12 (0,10) 210 | E$ ^[.^$|()*+?{,}]+ .^$(*+)|{?,?} (0,13) 211 | E$ ^a*\\w z (0,1) 212 | E$ SAME az (0,2) 213 | E$ SAME aaaz (0,4) 214 | E$ SAME a (0,1) 215 | E$ SAME aa (0,2) 216 | E$ SAME aaaa (0,4) 217 | E$ SAME a+ (0,1) 218 | E$ SAME aa+ (0,2) 219 | 
E$ ^a*?\\w z (0,1) 220 | E$ SAME az (0,1) 221 | E$ SAME aaaz (0,1) 222 | E$ SAME a (0,1) 223 | E$ SAME aa (0,1) 224 | E$ SAME aaaa (0,1) 225 | E$ SAME a+ (0,1) 226 | E$ SAME aa+ (0,1) 227 | E$ ^a+\\w az (0,2) 228 | E$ SAME aaaz (0,4) 229 | E$ SAME aa (0,2) 230 | E$ SAME aaaa (0,4) 231 | E$ SAME aa+ (0,2) 232 | E$ ^a+?\\w az (0,2) 233 | E$ SAME aaaz (0,2) 234 | E$ SAME aa (0,2) 235 | E$ SAME aaaa (0,2) 236 | E$ SAME aa+ (0,2) 237 | E$ ^\\d{8}\\w{2,} 1234567890 (0,10) 238 | E$ SAME 12345678ab (0,10) 239 | E$ SAME 12345678__ (0,10) 240 | E$ SAME 1234567 NOMATCH 241 | E$v ^[aeiou\\d]{4,5}$ uoie (0,4) 242 | E$v SAME 1234 (0,4) 243 | E$v SAME 12345 (0,5) 244 | E$v SAME aaaaa (0,5) 245 | E$v SAME 123456 NOMATCH 246 | E$v ^[aeiou\\d]{4,5}? uoie (0,4) 247 | E$v SAME 1234 (0,4) 248 | E$v SAME 12345 (0,4) 249 | E$v SAME aaaaa (0,4) 250 | E$v SAME 123456 (0,4) 251 | E$ \\A(abc|def)=(\\1){2,3}\\Z abc=abcabc (0,10)(0,3)(7,10) 252 | E$ SAME def=defdefdef (0,13)(0,3)(10,13) 253 | E$ SAME abc=defdef NOMATCH 254 | E$h ^(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)\\11*(\\3\\4)\\1(?#)2$ abcdefghijkcda2 (0,15)(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,9)(9,10)(10,11)(11,13) 255 | E$h SAME abcdefghijkkkkcda2 (0,18)(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,9)(9,10)(10,11)(14,16) 256 | E$ ^(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(?11)*((?3)(?4))(?1)(?#)2$ abcdefghijkcda2 (0,15)(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,9)(9,10)(10,11)(11,13) 257 | E$ SAME abcdefghijkkkkcda2 (0,18)(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,9)(9,10)(10,11)(14,16) 258 | E$z (cat(a(ract|tonic)|erpillar)) \\1()2(3) cataract cataract23 (0,19)(0,8)(3,8)(4,8)(17,17)(18,19) 259 | E$z SAME catatonic catatonic23 (0,21)(0,9)(3,9)(4,9)(19,19)(20,21) 260 | E$z SAME caterpillar caterpillar23 (0,25)(0,11)(3,11)(?,?)(23,23)(24,25) 261 | E ^From +([^ ]+) +[a-zA-Z][a-zA-Z][a-zA-Z] +[a-zA-Z][a-zA-Z][a-zA-Z] +[0-9]?[0-9] +[0-9][0-9]:[0-9][0-9] From abcd Mon Sep 01 12:33:02 1997 (0,27)(5,9) 262 | E$ 
^From\\s+\\S+\\s+([a-zA-Z]{3}\\s+){2}\\d{1,2}\\s+\\d\\d:\\d\\d From abcd Mon Sep 01 12:33:02 1997 (0,27)(15,19) 263 | E$ SAME From abcd Mon Sep 1 12:33:02 1997 (0,27)(15,20) 264 | E$ SAME From abcd Sep 01 12:33:02 1997 NOMATCH 265 | Ej$ ^12.34 12\n34 (0,5) 266 | Ej$ SAME 12\r34 (0,5) 267 | E$ \\w+(?=\\t) the quick brown\t fox (10,15) 268 | E foo(?!bar)(.*) foobar is foolish see? (10,22)(13,22) 269 | E (?:(?!foo)...|^.{0,2})bar(.*) foobar crowbar etc (8,18)(14,18) 270 | E SAME barrel (0,6)(3,6) 271 | E SAME 2barrel (0,7)(4,7) 272 | E SAME A barrel (0,8)(5,8) 273 | E$ ^(\\D*)(?=\\d)(?!123) abc456 (0,3)(0,3) 274 | E$ SAME abc123 NOMATCH 275 | E$ ^1234(?# test newlines\n\ 276 | inside) 1234 (0,4) 277 | E$c ^1234 #comment in extended re\n\ 278 | 1234 (0,4) 279 | E$c #rhubarb\n\ 280 | abcd abcd (0,4) 281 | Ec ^abcd#rhubarb abcd (0,4) 282 | E$ ^(a)\\1{2,3}(.) aaab (0,4)(0,1)(3,4) 283 | E$ SAME aaaab (0,5)(0,1)(4,5) 284 | E$ SAME aaaaab (0,5)(0,1)(4,5) 285 | E$ SAME aaaaaab (0,5)(0,1)(4,5) 286 | E (?!^)abc the abc (4,7) 287 | E SAME abc NOMATCH 288 | E (?=^)abc abc (0,3) 289 | E SAME the abc NOMATCH 290 | E ^[ab]{1,3}(ab*|b) aabbbbb (0,7)(1,7) 291 | E ^[ab]{1,3}?(ab*|b) aabbbbb (0,7)(1,7) 292 | E ^[ab]{1,3}?(ab*?|b) aabbbbb (0,2)(1,2) 293 | E ^[ab]{1,3}(ab*?|b) aabbbbb (0,4)(3,4) 294 | E$ckv (?: [\\040\\t] | \\(\n\ 295 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 296 | \\) )* # optional leading comment\n\ 297 | (?: (?:\n\ 298 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 299 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 300 | |\n\ 301 | " (?: # opening quote...\n\ 302 | [^\\\\\\x80-\\xff\\n\\015"] # Anything except backslash and quote\n\ 303 | | # or\n\ 304 | \\\\ [^\\x80-\\xff] # Escaped something (something != CR)\n\ 305 | )* " # closing quote\n\ 306 | ) # initial 
word\n\ 307 | (?: (?: [\\040\\t] | \\(\n\ 308 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 309 | \\) )* \\. (?: [\\040\\t] | \\(\n\ 310 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 311 | \\) )* (?:\n\ 312 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 313 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 314 | |\n\ 315 | " (?: # opening quote...\n\ 316 | [^\\\\\\x80-\\xff\\n\\015"] # Anything except backslash and quote\n\ 317 | | # or\n\ 318 | \\\\ [^\\x80-\\xff] # Escaped something (something != CR)\n\ 319 | )* " # closing quote\n\ 320 | ) )* # further okay, if led by a period\n\ 321 | (?: [\\040\\t] | \\(\n\ 322 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 323 | \\) )* @ (?: [\\040\\t] | \\(\n\ 324 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 325 | \\) )* (?:\n\ 326 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 327 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 328 | | \\[ # [\n\ 329 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 330 | \\] # ]\n\ 331 | ) # initial subdomain\n\ 332 | (?: #\n\ 333 | (?: [\\040\\t] | \\(\n\ 334 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 335 | \\) )* \\. 
# if led by a period...\n\ 336 | (?: [\\040\\t] | \\(\n\ 337 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 338 | \\) )* (?:\n\ 339 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 340 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 341 | | \\[ # [\n\ 342 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 343 | \\] # ]\n\ 344 | ) # ...further okay\n\ 345 | )*\n\ 346 | # address\n\ 347 | | # or\n\ 348 | (?:\n\ 349 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 350 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 351 | |\n\ 352 | " (?: # opening quote...\n\ 353 | [^\\\\\\x80-\\xff\\n\\015"] # Anything except backslash and quote\n\ 354 | | # or\n\ 355 | \\\\ [^\\x80-\\xff] # Escaped something (something != CR)\n\ 356 | )* " # closing quote\n\ 357 | ) # one word, optionally followed by....\n\ 358 | (?:\n\ 359 | [^()<>@,;:".\\\\\\[\\]\\x80-\\xff\\000-\\010\\012-\\037] | # atom and space parts, or...\n\ 360 | \\(\n\ 361 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 362 | \\) | # comments, or...\n\ 363 | \n\ 364 | " (?: # opening quote...\n\ 365 | [^\\\\\\x80-\\xff\\n\\015"] # Anything except backslash and quote\n\ 366 | | # or\n\ 367 | \\\\ [^\\x80-\\xff] # Escaped something (something != CR)\n\ 368 | )* " # closing quote\n\ 369 | # quoted strings\n\ 370 | )*\n\ 371 | < (?: [\\040\\t] | \\(\n\ 372 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 373 | \\) )* # leading <\n\ 374 | (?: @ (?: [\\040\\t] | \\(\n\ 375 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | 
\\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 376 | \\) )* (?:\n\ 377 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 378 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 379 | | \\[ # [\n\ 380 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 381 | \\] # ]\n\ 382 | ) # initial subdomain\n\ 383 | (?: #\n\ 384 | (?: [\\040\\t] | \\(\n\ 385 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 386 | \\) )* \\. # if led by a period...\n\ 387 | (?: [\\040\\t] | \\(\n\ 388 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 389 | \\) )* (?:\n\ 390 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 391 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 392 | | \\[ # [\n\ 393 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 394 | \\] # ]\n\ 395 | ) # ...further okay\n\ 396 | )*\n\ 397 | \n\ 398 | (?: (?: [\\040\\t] | \\(\n\ 399 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 400 | \\) )* , (?: [\\040\\t] | \\(\n\ 401 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 402 | \\) )* @ (?: [\\040\\t] | \\(\n\ 403 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 404 | \\) )* (?:\n\ 405 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 406 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part 
of an atom\n\ 407 | | \\[ # [\n\ 408 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 409 | \\] # ]\n\ 410 | ) # initial subdomain\n\ 411 | (?: #\n\ 412 | (?: [\\040\\t] | \\(\n\ 413 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 414 | \\) )* \\. # if led by a period...\n\ 415 | (?: [\\040\\t] | \\(\n\ 416 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 417 | \\) )* (?:\n\ 418 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 419 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 420 | | \\[ # [\n\ 421 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 422 | \\] # ]\n\ 423 | ) # ...further okay\n\ 424 | )*\n\ 425 | )* # further okay, if led by comma\n\ 426 | : # closing colon\n\ 427 | (?: [\\040\\t] | \\(\n\ 428 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 429 | \\) )* )? # optional route\n\ 430 | (?:\n\ 431 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 432 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 433 | |\n\ 434 | " (?: # opening quote...\n\ 435 | [^\\\\\\x80-\\xff\\n\\015"] # Anything except backslash and quote\n\ 436 | | # or\n\ 437 | \\\\ [^\\x80-\\xff] # Escaped something (something != CR)\n\ 438 | )* " # closing quote\n\ 439 | ) # initial word\n\ 440 | (?: (?: [\\040\\t] | \\(\n\ 441 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 442 | \\) )* \\. 
(?: [\\040\\t] | \\(\n\ 443 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 444 | \\) )* (?:\n\ 445 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 446 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 447 | |\n\ 448 | " (?: # opening quote...\n\ 449 | [^\\\\\\x80-\\xff\\n\\015"] # Anything except backslash and quote\n\ 450 | | # or\n\ 451 | \\\\ [^\\x80-\\xff] # Escaped something (something != CR)\n\ 452 | )* " # closing quote\n\ 453 | ) )* # further okay, if led by a period\n\ 454 | (?: [\\040\\t] | \\(\n\ 455 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 456 | \\) )* @ (?: [\\040\\t] | \\(\n\ 457 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 458 | \\) )* (?:\n\ 459 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 460 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 461 | | \\[ # [\n\ 462 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 463 | \\] # ]\n\ 464 | ) # initial subdomain\n\ 465 | (?: #\n\ 466 | (?: [\\040\\t] | \\(\n\ 467 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 468 | \\) )* \\. 
# if led by a period...\n\ 469 | (?: [\\040\\t] | \\(\n\ 470 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 471 | \\) )* (?:\n\ 472 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 473 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 474 | | \\[ # [\n\ 475 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 476 | \\] # ]\n\ 477 | ) # ...further okay\n\ 478 | )*\n\ 479 | # address spec\n\ 480 | (?: [\\040\\t] | \\(\n\ 481 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 482 | \\) )* > # trailing >\n\ 483 | # name and address\n\ 484 | ) (?: [\\040\\t] | \\(\n\^[ab]{1,3}(ab*?|b) 485 | (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] | \\( (?: [^\\\\\\x80-\\xff\\n\\015()] | \\\\ [^\\x80-\\xff] )* \\) )*\n\ 486 | \\) )* # optional trailing comment\n\ 487 | Alan Other (0,25) 488 | E$ckv SAME (1,13) 489 | E$ckv SAME user@dom.ain (0,12) 490 | E$ckv SAME "A. Other" (a comment) (0,42) 491 | E$ckv SAME A. 
Other (a comment) (2,40) 492 | E$ckv SAME "/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/"@x400-re.lay (0,61) 493 | E$ckv SAME A missing angle @,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 514 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 515 | # Atom\n\ 516 | | # or\n\ 517 | " # "\n\ 518 | [^\\\\\\x80-\\xff\\n\\015"] * # normal\n\ 519 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015"] * )* # ( special normal* )*\n\ 520 | " # "\n\ 521 | # Quoted string\n\ 522 | )\n\ 523 | [\\040\\t]* # Nab whitespace.\n\ 524 | (?:\n\ 525 | \\( # (\n\ 526 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 527 | (?: # (\n\ 528 | (?: \\\\ [^\\x80-\\xff] |\n\ 529 | \\( # (\n\ 530 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 531 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 532 | \\) # )\n\ 533 | ) # special\n\ 534 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 535 | )* # )*\n\ 536 | \\) # )\n\ 537 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 538 | (?:\n\ 539 | \\.\n\ 540 | [\\040\\t]* # Nab whitespace.\n\ 541 | (?:\n\ 542 | \\( # (\n\ 543 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 544 | (?: # (\n\ 545 | (?: \\\\ [^\\x80-\\xff] |\n\ 546 | \\( # (\n\ 547 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 548 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 549 | \\) # )\n\ 550 | ) # special\n\ 551 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 552 | )* # )*\n\ 553 | \\) # )\n\ 554 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 555 | (?:\n\ 556 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 557 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 558 | # Atom\n\ 559 | | # or\n\ 560 | " # "\n\ 561 | [^\\\\\\x80-\\xff\\n\\015"] * # normal\n\ 562 | (?: \\\\ [^\\x80-\\xff] 
[^\\\\\\x80-\\xff\\n\\015"] * )* # ( special normal* )*\n\ 563 | " # "\n\ 564 | # Quoted string\n\ 565 | )\n\ 566 | [\\040\\t]* # Nab whitespace.\n\ 567 | (?:\n\ 568 | \\( # (\n\ 569 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 570 | (?: # (\n\ 571 | (?: \\\\ [^\\x80-\\xff] |\n\ 572 | \\( # (\n\ 573 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 574 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 575 | \\) # )\n\ 576 | ) # special\n\ 577 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 578 | )* # )*\n\ 579 | \\) # )\n\ 580 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 581 | # additional words\n\ 582 | )*\n\ 583 | @\n\ 584 | [\\040\\t]* # Nab whitespace.\n\ 585 | (?:\n\ 586 | \\( # (\n\ 587 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 588 | (?: # (\n\ 589 | (?: \\\\ [^\\x80-\\xff] |\n\ 590 | \\( # (\n\ 591 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 592 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 593 | \\) # )\n\ 594 | ) # special\n\ 595 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 596 | )* # )*\n\ 597 | \\) # )\n\ 598 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 599 | (?:\n\ 600 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 601 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 602 | |\n\ 603 | \\[ # [\n\ 604 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 605 | \\] # ]\n\ 606 | )\n\ 607 | [\\040\\t]* # Nab whitespace.\n\ 608 | (?:\n\ 609 | \\( # (\n\ 610 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 611 | (?: # (\n\ 612 | (?: \\\\ [^\\x80-\\xff] |\n\ 613 | \\( # (\n\ 614 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 615 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 616 | \\) # )\n\ 617 | ) # special\n\ 618 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 619 | )* # )*\n\ 620 | \\) # )\n\ 
621 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 622 | # optional trailing comments\n\ 623 | (?:\n\ 624 | \\.\n\ 625 | [\\040\\t]* # Nab whitespace.\n\ 626 | (?:\n\ 627 | \\( # (\n\ 628 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 629 | (?: # (\n\ 630 | (?: \\\\ [^\\x80-\\xff] |\n\ 631 | \\( # (\n\ 632 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 633 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 634 | \\) # )\n\ 635 | ) # special\n\ 636 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 637 | )* # )*\n\ 638 | \\) # )\n\ 639 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 640 | (?:\n\ 641 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 642 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 643 | |\n\ 644 | \\[ # [\n\ 645 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 646 | \\] # ]\n\ 647 | )\n\ 648 | [\\040\\t]* # Nab whitespace.\n\ 649 | (?:\n\ 650 | \\( # (\n\ 651 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 652 | (?: # (\n\ 653 | (?: \\\\ [^\\x80-\\xff] |\n\ 654 | \\( # (\n\ 655 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 656 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 657 | \\) # )\n\ 658 | ) # special\n\ 659 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 660 | )* # )*\n\ 661 | \\) # )\n\ 662 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 663 | # optional trailing comments\n\ 664 | )*\n\ 665 | # address\n\ 666 | | # or\n\ 667 | (?:\n\ 668 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 669 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 670 | # Atom\n\ 671 | | # or\n\ 672 | " # "\n\ 673 | [^\\\\\\x80-\\xff\\n\\015"] * # normal\n\ 674 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015"] * )* # ( 
special normal* )*\n\ 675 | " # "\n\ 676 | # Quoted string\n\ 677 | )\n\ 678 | # leading word\n\ 679 | [^()<>@,;:".\\\\\\[\\]\\x80-\\xff\\000-\\010\\012-\\037] * # "normal" atoms and or spaces\n\ 680 | (?:\n\ 681 | (?:\n\ 682 | \\( # (\n\ 683 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 684 | (?: # (\n\ 685 | (?: \\\\ [^\\x80-\\xff] |\n\ 686 | \\( # (\n\ 687 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 688 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 689 | \\) # )\n\ 690 | ) # special\n\ 691 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 692 | )* # )*\n\ 693 | \\) # )\n\ 694 | |\n\ 695 | " # "\n\ 696 | [^\\\\\\x80-\\xff\\n\\015"] * # normal\n\ 697 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015"] * )* # ( special normal* )*\n\ 698 | " # "\n\ 699 | ) # "special" comment or quoted string\n\ 700 | [^()<>@,;:".\\\\\\[\\]\\x80-\\xff\\000-\\010\\012-\\037] * # more "normal"\n\ 701 | )*\n\ 702 | <\n\ 703 | [\\040\\t]* # Nab whitespace.\n\ 704 | (?:\n\ 705 | \\( # (\n\ 706 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 707 | (?: # (\n\ 708 | (?: \\\\ [^\\x80-\\xff] |\n\ 709 | \\( # (\n\ 710 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 711 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 712 | \\) # )\n\ 713 | ) # special\n\ 714 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 715 | )* # )*\n\ 716 | \\) # )\n\ 717 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 718 | # <\n\ 719 | (?:\n\ 720 | @\n\ 721 | [\\040\\t]* # Nab whitespace.\n\ 722 | (?:\n\ 723 | \\( # (\n\ 724 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 725 | (?: # (\n\ 726 | (?: \\\\ [^\\x80-\\xff] |\n\ 727 | \\( # (\n\ 728 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 729 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 730 | \\) # )\n\ 731 | ) # special\n\ 732 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 733 | )* # )*\n\ 734 | \\) # )\n\ 735 | [\\040\\t]* )* # If comment found, allow more 
spaces.\n\ 736 | (?:\n\ 737 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 738 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 739 | |\n\ 740 | \\[ # [\n\ 741 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 742 | \\] # ]\n\ 743 | )\n\ 744 | [\\040\\t]* # Nab whitespace.\n\ 745 | (?:\n\ 746 | \\( # (\n\ 747 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 748 | (?: # (\n\ 749 | (?: \\\\ [^\\x80-\\xff] |\n\ 750 | \\( # (\n\ 751 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 752 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 753 | \\) # )\n\ 754 | ) # special\n\ 755 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 756 | )* # )*\n\ 757 | \\) # )\n\ 758 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 759 | # optional trailing comments\n\ 760 | (?:\n\ 761 | \\.\n\ 762 | [\\040\\t]* # Nab whitespace.\n\ 763 | (?:\n\ 764 | \\( # (\n\ 765 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 766 | (?: # (\n\ 767 | (?: \\\\ [^\\x80-\\xff] |\n\ 768 | \\( # (\n\ 769 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 770 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 771 | \\) # )\n\ 772 | ) # special\n\ 773 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 774 | )* # )*\n\ 775 | \\) # )\n\ 776 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 777 | (?:\n\ 778 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 779 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 780 | |\n\ 781 | \\[ # [\n\ 782 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 783 | \\] # ]\n\ 784 | )\n\ 785 | [\\040\\t]* # Nab whitespace.\n\ 786 | (?:\n\ 787 | \\( # (\n\ 788 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 789 | (?: # (\n\ 790 | (?: \\\\ 
[^\\x80-\\xff] |\n\ 791 | \\( # (\n\ 792 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 793 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 794 | \\) # )\n\ 795 | ) # special\n\ 796 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 797 | )* # )*\n\ 798 | \\) # )\n\ 799 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 800 | # optional trailing comments\n\ 801 | )*\n\ 802 | (?: ,\n\ 803 | [\\040\\t]* # Nab whitespace.\n\ 804 | (?:\n\ 805 | \\( # (\n\ 806 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 807 | (?: # (\n\ 808 | (?: \\\\ [^\\x80-\\xff] |\n\ 809 | \\( # (\n\ 810 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 811 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 812 | \\) # )\n\ 813 | ) # special\n\ 814 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 815 | )* # )*\n\ 816 | \\) # )\n\ 817 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 818 | @\n\ 819 | [\\040\\t]* # Nab whitespace.\n\ 820 | (?:\n\ 821 | \\( # (\n\ 822 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 823 | (?: # (\n\ 824 | (?: \\\\ [^\\x80-\\xff] |\n\ 825 | \\( # (\n\ 826 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 827 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 828 | \\) # )\n\ 829 | ) # special\n\ 830 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 831 | )* # )*\n\ 832 | \\) # )\n\ 833 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 834 | (?:\n\ 835 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 836 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 837 | |\n\ 838 | \\[ # [\n\ 839 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 840 | \\] # ]\n\ 841 | )\n\ 842 | [\\040\\t]* # Nab whitespace.\n\ 843 | (?:\n\ 844 | \\( # (\n\ 845 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 846 | (?: # (\n\ 847 | (?: \\\\ [^\\x80-\\xff] |\n\ 
848 | \\( # (\n\ 849 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 850 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 851 | \\) # )\n\ 852 | ) # special\n\ 853 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 854 | )* # )*\n\ 855 | \\) # )\n\ 856 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 857 | # optional trailing comments\n\ 858 | (?:\n\ 859 | \\.\n\ 860 | [\\040\\t]* # Nab whitespace.\n\ 861 | (?:\n\ 862 | \\( # (\n\ 863 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 864 | (?: # (\n\ 865 | (?: \\\\ [^\\x80-\\xff] |\n\ 866 | \\( # (\n\ 867 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 868 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 869 | \\) # )\n\ 870 | ) # special\n\ 871 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 872 | )* # )*\n\ 873 | \\) # )\n\ 874 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 875 | (?:\n\ 876 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 877 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 878 | |\n\ 879 | \\[ # [\n\ 880 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 881 | \\] # ]\n\ 882 | )\n\ 883 | [\\040\\t]* # Nab whitespace.\n\ 884 | (?:\n\ 885 | \\( # (\n\ 886 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 887 | (?: # (\n\ 888 | (?: \\\\ [^\\x80-\\xff] |\n\ 889 | \\( # (\n\ 890 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 891 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 892 | \\) # )\n\ 893 | ) # special\n\ 894 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 895 | )* # )*\n\ 896 | \\) # )\n\ 897 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 898 | # optional trailing comments\n\ 899 | )*\n\ 900 | )* # additional domains\n\ 901 | :\n\ 902 | [\\040\\t]* # Nab whitespace.\n\ 903 | (?:\n\ 904 | \\( # (\n\ 905 | [^\\\\\\x80-\\xff\\n\\015()] * # 
normal*\n\ 906 | (?: # (\n\ 907 | (?: \\\\ [^\\x80-\\xff] |\n\ 908 | \\( # (\n\ 909 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 910 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 911 | \\) # )\n\ 912 | ) # special\n\ 913 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 914 | )* # )*\n\ 915 | \\) # )\n\ 916 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 917 | # optional trailing comments\n\ 918 | )? # optional route\n\ 919 | (?:\n\ 920 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 921 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 922 | # Atom\n\ 923 | | # or\n\ 924 | " # "\n\ 925 | [^\\\\\\x80-\\xff\\n\\015"] * # normal\n\ 926 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015"] * )* # ( special normal* )*\n\ 927 | " # "\n\ 928 | # Quoted string\n\ 929 | )\n\ 930 | [\\040\\t]* # Nab whitespace.\n\ 931 | (?:\n\ 932 | \\( # (\n\ 933 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 934 | (?: # (\n\ 935 | (?: \\\\ [^\\x80-\\xff] |\n\ 936 | \\( # (\n\ 937 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 938 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 939 | \\) # )\n\ 940 | ) # special\n\ 941 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 942 | )* # )*\n\ 943 | \\) # )\n\ 944 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 945 | (?:\n\ 946 | \\.\n\ 947 | [\\040\\t]* # Nab whitespace.\n\ 948 | (?:\n\ 949 | \\( # (\n\ 950 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 951 | (?: # (\n\ 952 | (?: \\\\ [^\\x80-\\xff] |\n\ 953 | \\( # (\n\ 954 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 955 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 956 | \\) # )\n\ 957 | ) # special\n\ 958 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 959 | )* # )*\n\ 960 | \\) # )\n\ 961 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 962 | (?:\n\ 
963 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 964 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 965 | # Atom\n\ 966 | | # or\n\ 967 | " # "\n\ 968 | [^\\\\\\x80-\\xff\\n\\015"] * # normal\n\ 969 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015"] * )* # ( special normal* )*\n\ 970 | " # "\n\ 971 | # Quoted string\n\ 972 | )\n\ 973 | [\\040\\t]* # Nab whitespace.\n\ 974 | (?:\n\ 975 | \\( # (\n\ 976 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 977 | (?: # (\n\ 978 | (?: \\\\ [^\\x80-\\xff] |\n\ 979 | \\( # (\n\ 980 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 981 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 982 | \\) # )\n\ 983 | ) # special\n\ 984 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 985 | )* # )*\n\ 986 | \\) # )\n\ 987 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 988 | # additional words\n\ 989 | )*\n\ 990 | @\n\ 991 | [\\040\\t]* # Nab whitespace.\n\ 992 | (?:\n\ 993 | \\( # (\n\ 994 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 995 | (?: # (\n\ 996 | (?: \\\\ [^\\x80-\\xff] |\n\ 997 | \\( # (\n\ 998 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 999 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 1000 | \\) # )\n\ 1001 | ) # special\n\ 1002 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1003 | )* # )*\n\ 1004 | \\) # )\n\ 1005 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 1006 | (?:\n\ 1007 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 1008 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 1009 | |\n\ 1010 | \\[ # [\n\ 1011 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 1012 | \\] # ]\n\ 1013 | )\n\ 1014 | [\\040\\t]* # Nab whitespace.\n\ 1015 | (?:\n\ 1016 | \\( # (\n\ 1017 | 
[^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1018 | (?: # (\n\ 1019 | (?: \\\\ [^\\x80-\\xff] |\n\ 1020 | \\( # (\n\ 1021 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1022 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 1023 | \\) # )\n\ 1024 | ) # special\n\ 1025 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1026 | )* # )*\n\ 1027 | \\) # )\n\ 1028 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 1029 | # optional trailing comments\n\ 1030 | (?:\n\ 1031 | \\.\n\ 1032 | [\\040\\t]* # Nab whitespace.\n\ 1033 | (?:\n\ 1034 | \\( # (\n\ 1035 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1036 | (?: # (\n\ 1037 | (?: \\\\ [^\\x80-\\xff] |\n\ 1038 | \\( # (\n\ 1039 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1040 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 1041 | \\) # )\n\ 1042 | ) # special\n\ 1043 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1044 | )* # )*\n\ 1045 | \\) # )\n\ 1046 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 1047 | (?:\n\ 1048 | [^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]+ # some number of atom characters...\n\ 1049 | (?![^(\\040)<>@,;:".\\\\\\[\\]\\000-\\037\\x80-\\xff]) # ..not followed by something that could be part of an atom\n\ 1050 | |\n\ 1051 | \\[ # [\n\ 1052 | (?: [^\\\\\\x80-\\xff\\n\\015\\[\\]] | \\\\ [^\\x80-\\xff] )* # stuff\n\ 1053 | \\] # ]\n\ 1054 | )\n\ 1055 | [\\040\\t]* # Nab whitespace.\n\ 1056 | (?:\n\ 1057 | \\( # (\n\ 1058 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1059 | (?: # (\n\ 1060 | (?: \\\\ [^\\x80-\\xff] |\n\ 1061 | \\( # (\n\ 1062 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1063 | (?: \\\\ [^\\x80-\\xff] [^\\\\\\x80-\\xff\\n\\015()] * )* # (special normal*)*\n\ 1064 | \\) # )\n\ 1065 | ) # special\n\ 1066 | [^\\\\\\x80-\\xff\\n\\015()] * # normal*\n\ 1067 | )* # )*\n\ 1068 | \\) # )\n\ 1069 | [\\040\\t]* )* # If comment found, allow more spaces.\n\ 1070 | # optional trailing comments\n\ 1071 | )*\n\ 1072 
| # address spec\n\ 1073 | > # >\n\ 1074 | # name and address\n\ 1075 | )\n\ 1076 | Alan Other (0,25) 1077 | E$ckv SAME (1,13) 1078 | E$ckv SAME user@dom.ain (0,12) 1079 | E$ckv SAME "A. Other" (a comment) (0,30) 1080 | E$ckv SAME A. Other (a comment) (2,28) 1081 | E$ckv SAME "/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/"@x400-re.lay (0,61) 1082 | E$ckv SAME A missing angle ]{0,})>]{0,})>([\\d]{0,}\\.)(.*)((
([\\w\\W\\s\\d][^<>]{0,})|[\\s]{0,}))]{0,})>([\\w\\W\\s\\d][^<>]{0,})]{0,})>([\\w\\W\\s\\d][^<>]{0,}) 43.Word Processor
(N-1286)
/**
 * Spec-test generator.
 *
 * Reads the tab-separated regex cases in ./spec/pcre-1.dat and writes one
 * `it(...)` or `xit(...)` case per usable line to
 * ./assembly/__spec_tests__/generated.spec.ts.
 *
 * Every data line is split on tabs into four leading fields:
 *   0. kind / flag characters (an "i" selects the case-insensitive flag)
 *   1. the pattern ("SAME" reuses the previous pattern, "NULL" is empty)
 *   2. the input string ("NULL" is empty)
 *   3. the expected result: "BADBR", "NOMATCH" or "(start,end)" pairs
 */
const fs = require("fs");
const prettier = require("prettier");

const data = fs.readFileSync("./spec/pcre-1.dat", "utf8");
const lines = data.split("\n");

/** Escapes double quotes so `str` can sit inside a generated "..." literal. */
const escapeQuote = (str) => str.replaceAll('"', '\\"');

/** Returns the inclusive list of integers `from..to`. */
const range = (from, to) =>
  Array.from({ length: to - from + 1 }, (_, i) => i + from);

// Maps a skip reason to the 1-based data-file line numbers it applies to.
const knownIssues = {
  /* ------- features not yet implemented ------- */
  "does not support start of string quantified within an alternation": [
    1363,
    1369,
  ],
  "does not support hex notification in character sets": [...range(1147, 1149)],
  "does nto support escaped characters in character ranges": [
    ...range(1301, 1308),
  ],
  "lazy quantifiers should still yield the longest overall regex match": [
    ...range(141, 143),
    1288,
  ],
  "peformance issue": [1313, 1314],

  /* -------- issues with the tests ------------ */
  "test appears to be incorrect?": [203, 204],
  "issue with parsing the test itself": [
    1103,
    ...range(1095, 1098),
    ...range(487, 494),
    ...range(1077, 1082),
  ],
  "test contains an octal escape sequence": [1102],
  // the test results measure captured groups using character length / locations
  // see: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/length
  // this is tricky to reproduce
  "test requires a substring function": [1087, 1088],

  /* -------- differences between PCRE and JS regex ------------ */
  "test indicates a malformed regex, whereas it appears OK in JS": [
    1189,
    ...range(1186, 1188),
  ],
  "JS does not support the \\A \\Z syntax for start and end of string": [
    1163,
    1164,
  ],
  "test regex contains syntax not supported in JS": [82, 1158, 281],
  "the test behaviour differs between PCRE and JS": [290, 1278],
};

// Lines under this issue are skipped before the line is even split.
const PARSE_ISSUE = "issue with parsing the test itself";

// One "(start,end)" pair of the expected result; "?" marks an unset group.
const CAPTURE_PATTERN = /\((\d{1,3}|\?),(\d{1,3}|\?)\)/g;

// Line number -> issue, built once so the per-line lookup does not rescan
// every list. The first issue listing a line wins, which matches scanning
// `knownIssues` in declaration order.
const knownIssueByLine = new Map();
for (const [issue, lineNumbers] of Object.entries(knownIssues)) {
  for (const lineNumber of lineNumbers) {
    if (!knownIssueByLine.has(lineNumber)) {
      knownIssueByLine.set(lineNumber, issue);
    }
  }
}

/**
 * Looks up the known issue recorded for a data-file line.
 *
 * @param index 1-based line number in the data file
 * @returns the issue description, or null when the line has none
 */
const hasKnownIssue = (index) => knownIssueByLine.get(index) ?? null;

/**
 * Names the first unsupported feature used by a case, checked in a fixed
 * order, or returns null when the case can be generated.
 *
 * @param regex the escaped pattern
 * @param str the escaped input string
 */
const unsupportedReason = (regex, str) => {
  if (regex.includes("\\b")) {
    return "word boundary class not supported yet!";
  }
  if (str.includes("\\x{")) {
    return "test encoding issue";
  }
  if (["(?!", "(?="].some((f) => regex.includes(f))) {
    return "lookaheads not supported";
  }
  if (["(?m", "(?s", "(?ms"].some((f) => regex.includes(f))) {
    return "JS regex does not support mode modifiers";
  }
  if (["(?#"].some((f) => regex.includes(f))) {
    return "JS regex does not support comments";
  }
  if (regex.match(/\\\\\d{1}/)) {
    return "back references are not supported";
  }
  return null;
};

let testCase = `
/* eslint-disable no-useless-escape */
/* eslint-disable @typescript-eslint/no-empty-function */
/* eslint-disable @typescript-eslint/no-unused-vars */
import { RegExp, Match } from "..";
import { expectMatch, expectNotMatch, exec} from "../__tests__/utils";

`;

// Last pattern seen; "SAME" rows reuse it. Rows that are skipped before the
// pattern field is read leave it untouched.
let regex = "";
lines.forEach((line, zeroBasedIndex) => {
  const lineNumber = zeroBasedIndex + 1;

  const knownIssue = hasKnownIssue(lineNumber);
  if (knownIssue === PARSE_ISSUE) {
    testCase += `xit("line ${lineNumber} - issue with parsing the test itself", () => {});`;
    return;
  }

  try {
    const parts = line.split("\t").filter((f) => f !== "");
    if (parts.length < 4) {
      // TODO - these should probably be listed as known issues
      return;
    }
    const [kind, pattern, input, expected] = parts;

    if (pattern !== "SAME") {
      regex = escapeQuote(pattern === "NULL" ? "" : pattern).replaceAll(
        "/",
        "\\\\/"
      );
    }
    const str = input !== "NULL" ? escapeQuote(input) : "";
    let flags = "m" + (kind.includes("i") ? "i" : "");
    flags += kind !== "En$" && kind !== "E$n" ? "s" : "";

    // Unsupported features take precedence over the remaining known issues.
    const skipReason = unsupportedReason(regex, str) ?? knownIssue;
    if (skipReason) {
      testCase += `xit("line: ${lineNumber} - ${skipReason}", () => {});`;
      return;
    }

    let assertions;
    if (expected === "BADBR") {
      assertions = ` expect(() => { let foo = new RegExp("${regex}") }).toThrow();`;
    } else if (expected === "NOMATCH") {
      assertions = ` expectNotMatch("${regex}", ["${str}"]);`;
    } else {
      const captures = [...expected.matchAll(CAPTURE_PATTERN)];
      if (captures.length === 0) {
        // Without a single "(start,end)" pair there is nothing to assert, so
        // report the field and skip the line instead of failing on null.
        console.error(
          "could not parse test case",
          lineNumber,
          `- unrecognised expected result "${expected}"`
        );
        return;
      }

      assertions = ` const match = exec("${regex}", "${str}", "${flags}");`;
      // create an expect for each capture group
      captures.forEach(([, start, end], groupIndex) => {
        if (start !== "?") {
          assertions += `expect(match.matches[${groupIndex}]).toBe("${str}".substring(${start}, ${end}));`;
        }
      });
    }

    testCase += `it("line: ${lineNumber} - matches ${regex} against '${str}'", () => {
      ${assertions}});
    `;
  } catch (error) {
    // Keep generating the remaining cases, but say why this one was dropped.
    console.error("could not parse test case", lineNumber, error);
  }
});

// NOTE(review): assumes prettier.format returns a string synchronously - confirm
// against the prettier version pinned in package.json.
fs.writeFileSync(
  "./assembly/__spec_tests__/generated.spec.ts",
  // testCase
  prettier.format(testCase, { parser: "babel" })
);
aa (0,2)(0,2) 47 | E ab|a xabc (1,3) 48 | E ab|a xxabc (2,4) 49 | Ei (Ab|cD)* aBcD (0,4)(2,4) 50 | BE [^-] --a (2,3) 51 | BE [a-]* --a (0,3) 52 | BE [a-m-]* --amoma-- (0,4) 53 | E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) 54 | E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) 55 | {E [[:upper:]] A (0,1) [[]] not supported 56 | E [[:lower:]]+ `az{ (1,3) 57 | E [[:upper:]]+ @AZ[ (1,3) 58 | BE [[-]] [[-]] (2,4) 59 | BE [[.NIL.]] NULL ECOLLATE 60 | BE [[=aleph=]] NULL ECOLLATE 61 | } 62 | BE$ \n \n (0,1) 63 | BEn$ \n \n (0,1) 64 | BE$ [^a] \n (0,1) 65 | BE$ \na \na (0,2) 66 | E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) 67 | BE xxx xxx (0,3) 68 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) 69 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) 70 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) 71 | E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) 72 | E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) 73 | E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) 74 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) 75 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) 76 | E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) 77 | BE$ .* \x01\xff (0,2) 78 | E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) 79 | L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH 80 | E a*a*a*a*a*b aaaaaaaaab (0,10) 81 | BE ^ NULL (0,0) 82 | BE $ NULL (0,0) 83 | BE ^$ NULL (0,0) 84 | BE ^a$ a (0,1) 85 | BE abc abc (0,3) 86 | BE abc xabcy (1,4) 87 | BE abc ababc (2,5) 88 | BE ab*c abc (0,3) 89 | BE ab*bc abc (0,3) 90 | BE ab*bc abbc (0,4) 91 | BE ab*bc abbbbc (0,6) 92 | E ab+bc abbc 
(0,4) 93 | E ab+bc abbbbc (0,6) 94 | E ab?bc abbc (0,4) 95 | E ab?bc abc (0,3) 96 | E ab?c abc (0,3) 97 | BE ^abc$ abc (0,3) 98 | BE ^abc abcc (0,3) 99 | BE abc$ aabc (1,4) 100 | BE ^ abc (0,0) 101 | BE $ abc (3,3) 102 | BE a.c abc (0,3) 103 | BE a.c axc (0,3) 104 | BE a.*c axyzc (0,5) 105 | BE a[bc]d abd (0,3) 106 | BE a[b-d]e ace (0,3) 107 | BE a[b-d] aac (1,3) 108 | BE a[-b] a- (0,2) 109 | BE a[b-] a- (0,2) 110 | BE a] a] (0,2) 111 | BE a[]]b a]b (0,3) 112 | BE a[^bc]d aed (0,3) 113 | BE a[^-b]c adc (0,3) 114 | BE a[^]b]c adc (0,3) 115 | E ab|cd abc (0,2) 116 | E ab|cd abcd (0,2) 117 | E a\(b a(b (0,3) 118 | E a\(*b ab (0,2) 119 | E a\(*b a((b (0,4) 120 | E ((a)) abc (0,1)(0,1)(0,1) 121 | E (a)b(c) abc (0,3)(0,1)(2,3) 122 | E a+b+c aabbabc (4,7) 123 | E a* aaa (0,3) 124 | E (a*)* - (0,0)(0,0) 125 | E (a*)+ - (0,0)(0,0) 126 | E (a*|b)* - (0,0)(0,0) 127 | E (a+|b)* ab (0,2)(1,2) 128 | E (a+|b)+ ab (0,2)(1,2) 129 | E (a+|b)? ab (0,1)(0,1) 130 | BE [^ab]* cde (0,3) 131 | E (^)* - (0,0)(0,0) 132 | BE a* NULL (0,0) 133 | E ([abc])*d abbbcd (0,6)(4,5) 134 | E ([abc])*bcd abcd (0,4)(0,1) 135 | E a|b|c|d|e e (0,1) 136 | E (a|b|c|d|e)f ef (0,2)(0,1) 137 | E ((a*|b))* - (0,0)(0,0)(0,0) 138 | BE abcd*efg abcdefg (0,7) 139 | BE ab* xabyabbbz (1,3) 140 | BE ab* xayabbbz (1,2) 141 | E (ab|cd)e abcde (2,5)(2,4) 142 | BE [abhgefdc]ij hij (0,3) 143 | E (a|b)c*d abcd (1,4)(1,2) 144 | E (ab|ab*)bc abc (0,3)(0,1) 145 | E a([bc]*)c* abc (0,3)(1,3) 146 | E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) 147 | E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) 148 | E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) 149 | E a[bcd]*dcdcde adcdcde (0,7) 150 | E (ab|a)b*c abc (0,3)(0,2) 151 | E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) 152 | BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) 153 | E ^a(bc+|b[eh])g|.h$ abh (1,3) 154 | E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) 155 | E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) 156 | E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) 157 | E (((((((((a))))))))) a 
(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) 158 | BE multiple words multiple words yeah (0,14) 159 | E (.*)c(.*) abcde (0,5)(0,2)(3,5) 160 | BE abcd abcd (0,4) 161 | E a(bc)d abcd (0,4)(1,3) 162 | E a[-]?c ac (0,3) 163 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) 164 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) 165 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) 166 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) 167 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) 168 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) 169 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) 170 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) 171 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) 172 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) 173 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) 174 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) 175 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) 176 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) 177 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) 178 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) 179 | E M[ou]'?am+[ae]r 
.*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) 180 | E a+(b|c)*d+ aabcdd (0,6)(3,4) 181 | E ^.+$ vivi (0,4) 182 | E ^(.+)$ vivi (0,4)(0,4) 183 | E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) 184 | E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) 185 | E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) 186 | E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) 187 | E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) 188 | E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) 189 | E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) 190 | E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) 191 | E ((foo)|bar)!bas bar!bas (0,7)(0,3) 192 | E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) 193 | E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) 194 | E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) 195 | E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) 196 | E (foo|(bar))!bas foo!bas (0,7)(0,3) 197 | E (foo|bar)!bas bar!bas (0,7)(0,3) 198 | E (foo|bar)!bas foo!bar!bas (4,11)(4,7) 199 | E (foo|bar)!bas foo!bas (0,7)(0,3) 200 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 201 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) 202 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) 203 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) 204 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) 205 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) 206 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) 207 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 208 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) 209 | E .*(/XXX).* /XXX (0,4)(0,4) 210 | E .*(\\XXX).* \XXX (0,4)(0,4) 211 | E \\XXX \XXX (0,4) 212 | E .*(/000).* /000 (0,4)(0,4) 213 | E .*(\\000).* \000 (0,4)(0,4) 214 | E \\000 \000 (0,4) 215 | 
// Manual smoke run: executes one bounded-repetition pattern against a long
// sentence and prints the match result as JSON.
import "assemblyscript/std/portable/index";

// Publish `log` on the global object before the regexp module is imported.
// NOTE(review): presumably the assembly sources call a global `log` - confirm.
Object.assign(global, { log: console.log });

import { RegExp } from "../assembly/regexp";

const pattern = "word (?:[a-zA-Z0-9]+ ){0,300}otherword";
const sentence =
  "word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope";

const result = new RegExp(pattern, "").exec(sentence);
console.log(JSON.stringify(result, null, 2));