├── src ├── js │ ├── maximum.ts │ ├── literal.ts │ ├── unicode │ │ └── index.ts │ ├── index.ts │ ├── flags.ts │ ├── create-assertion.ts │ ├── char-case-folding.ts │ └── create-char-set.ts ├── words │ ├── index.ts │ ├── conversion.ts │ ├── readable.ts │ └── word-set.ts ├── ast │ ├── index.ts │ ├── set-source.ts │ ├── set-parent.ts │ ├── nodes.ts │ └── visit.ts ├── transformers │ ├── creation-options.ts │ ├── index.ts │ ├── replace-assertions.ts │ ├── simplify.ts │ ├── sort-assertions.ts │ ├── make-greedy.ts │ ├── remove-dead-branches.ts │ ├── union-characters.ts │ ├── inline.ts │ ├── move-up-empty.ts │ └── merge-with-quantifier.ts ├── iter │ ├── index.ts │ ├── print-common.ts │ ├── print-util.ts │ ├── map-fa-builder.ts │ ├── to-string.ts │ ├── remove-dead-states.ts │ ├── to-mermaid.ts │ ├── make-deterministic.ts │ ├── to-dot.ts │ ├── intersection.ts │ └── from-words.ts ├── index.ts ├── word-set.ts ├── char-types.ts ├── errors.ts ├── intersection.ts └── char-base.ts ├── .prettierrc ├── tests ├── tsconfig.json ├── transformers │ ├── make-greedy.ts │ ├── remove-dead-branches.ts │ ├── creation-options.ts │ ├── replace-assertions.ts │ ├── sort-assertions.ts │ ├── simplify.ts │ ├── move-up-empty.ts │ ├── factor-out.ts │ ├── inline.ts │ ├── union-characters.ts │ ├── merge-with-quantifier.ts │ ├── remove-unnecessary-assertions.ts │ ├── nested-quantifiers.ts │ ├── pattern-edge-assertions.ts │ └── apply-assertions.ts ├── helper │ ├── util.ts │ ├── regexp-literals.ts │ ├── chars.ts │ ├── config.ts │ ├── literal-pair-data.ts │ ├── from-regex-data.ts │ ├── word-test-data.ts │ ├── fa.ts │ ├── literal-to-string.ts │ ├── snapshot.ts │ └── transform.ts ├── iter │ ├── to-dot.ts │ ├── to-mermaid.ts │ ├── word-sets.ts │ └── from-words.ts ├── char-map.ts ├── dfa-min.ts ├── char-base.ts ├── js │ └── create-assertion.ts ├── regex-stress-test.ts └── intersection.ts ├── scripts ├── tsconfig.json ├── util.ts ├── debug.ts ├── create-unicode.ts ├── perf.ts └── create-case-folding.ts ├── 
.gitignore ├── rollup.config.js ├── typedoc.json ├── LICENSE ├── .github └── workflows │ ├── nodejs.yml │ └── deploy-docs.yml ├── package.json ├── .eslintrc.js ├── CONTRIBUTING.md └── tsconfig.json /src/js/maximum.ts: -------------------------------------------------------------------------------- 1 | export const enum Maximum { 2 | UTF16 = 0xffff, 3 | UNICODE = 0x10ffff, 4 | } 5 | -------------------------------------------------------------------------------- /src/words/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./conversion"; 2 | export * from "./readable"; 3 | export * from "./word-set"; 4 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 120, 3 | "tabWidth": 4, 4 | "useTabs": true, 5 | "arrowParens": "avoid", 6 | "quoteProps": "consistent", 7 | "semi": true 8 | } 9 | -------------------------------------------------------------------------------- /src/ast/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./nodes"; 2 | export * from "./set-parent"; 3 | export * from "./set-source"; 4 | export * from "./visit"; 5 | export * from "./transform"; 6 | -------------------------------------------------------------------------------- /tests/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig.json", 3 | "compilerOptions": { 4 | "module": "CommonJS", 5 | "noEmit": true 6 | }, 7 | "include": ["**/*.ts"] 8 | } 9 | -------------------------------------------------------------------------------- /scripts/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig.json", 3 | "compilerOptions": { 4 | "module": "CommonJS", 5 | "noEmit": true 6 | }, 7 | 
"include": ["**/*.ts"] 8 | } 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .out/ 3 | .vscode/ 4 | *.log 5 | /**/tempCodeRunnerFile.* 6 | .nyc_output/ 7 | coverage/ 8 | docs/ 9 | /index.js 10 | /index.mjs 11 | /index.d.ts 12 | -------------------------------------------------------------------------------- /src/js/literal.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * A light-weight representation of a 3 | * [JavaScript RegExp](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp) object. 4 | * 5 | * This interface only requires the `source` and `flags` properties of a RegExp object. 6 | */ 7 | export interface Literal { 8 | readonly source: string; 9 | readonly flags: string; 10 | } 11 | -------------------------------------------------------------------------------- /src/js/unicode/index.ts: -------------------------------------------------------------------------------- 1 | export * as Alias from "./alias"; 2 | export * as Binary_Property from "./binary-property-data"; 3 | export * as PropertiesOfStrings from "./properties-of-strings"; 4 | export * as General_Category from "./general-category-data"; 5 | export * as Script from "./script-data"; 6 | export * as Script_Extensions from "./script-extensions-data"; 7 | export * from "./case-folding"; 8 | -------------------------------------------------------------------------------- /rollup.config.js: -------------------------------------------------------------------------------- 1 | import { nodeResolve } from "@rollup/plugin-node-resolve"; 2 | import { terser } from "rollup-plugin-terser"; 3 | 4 | export default /** @type {import('rollup').RollupOptions[]} */ ([ 5 | { 6 | input: ".out/index.js", 7 | external: ["regexpp"], 8 | output: { 9 | file: "index.js", 10 | format: "cjs", 
11 | }, 12 | plugins: [ 13 | nodeResolve(), 14 | terser({ compress: { pure_funcs: ['debugAssert'] } }), 15 | ], 16 | }, 17 | ]); 18 | 19 | -------------------------------------------------------------------------------- /src/transformers/creation-options.ts: -------------------------------------------------------------------------------- 1 | export interface CreationOptions { 2 | /** 3 | * If `true`, transformers are allowed to reorder alternatives and to change/ignore the laziness of quantifiers. 4 | * This may cause the behavior of the regex to change. 5 | * 6 | * @default false 7 | */ 8 | ignoreOrder?: boolean; 9 | /** 10 | * If `true`, transformers are allowed to reduce or increase the ambiguity of the regular expression. 11 | * 12 | * @default false 13 | */ 14 | ignoreAmbiguity?: boolean; 15 | } 16 | -------------------------------------------------------------------------------- /tests/transformers/make-greedy.ts: -------------------------------------------------------------------------------- 1 | import { itTest, regexSnapshot } from "../helper/transform"; 2 | import { makeGreedy } from "../../src/transformers"; 3 | 4 | describe("Transformers", function () { 5 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 6 | const transformer = makeGreedy(); 7 | 8 | itTest(transformer, [/abc+/, /abc+?/, /abc+?c/, /abc+?$/, /abc+?d/, /abc{3,3}?/, /(?!cc+?d)/, /(? { 2 | isInitial(node: S): boolean; 3 | isFinal(node: S): boolean; 4 | getId(node: S): number; 5 | getNumberOfOutgoingEdges(node: S): number; 6 | } 7 | 8 | export interface SimplePrintOptions { 9 | /** 10 | * Returns the string representation of the given transition. 11 | * 12 | * @param transition 13 | * @returns 14 | */ 15 | transitionToString: (transition: T) => string; 16 | /** 17 | * Whether transitions are ordered. 
18 | * 19 | * @default false 20 | */ 21 | ordered?: boolean; 22 | } 23 | -------------------------------------------------------------------------------- /tests/transformers/remove-dead-branches.ts: -------------------------------------------------------------------------------- 1 | import { itTest } from "../helper/transform"; 2 | import { removeDeadBranches } from "../../src/transformers"; 3 | 4 | describe("Transformers", function () { 5 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 6 | const transformer = removeDeadBranches(); 7 | 8 | itTest(transformer, [ 9 | { 10 | literal: /(?:[])*foo|[]bar|ab?c[]/, 11 | expected: /foo/, 12 | }, 13 | { 14 | literal: /(?:[])+foo|[]?bar|abc(?:[]|(?:[]|[]+){3,5}|def)/, 15 | expected: /bar|abc(?:def)/, 16 | }, 17 | ]); 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /tests/helper/util.ts: -------------------------------------------------------------------------------- 1 | export function prefixes(iter: Iterable): Set { 2 | const set = new Set(); 3 | 4 | for (const item of iter) { 5 | set.add(item); 6 | for (let i = 0, l = item.length; i < l; i++) { 7 | set.add(item.substr(0, i)); 8 | } 9 | } 10 | 11 | return set; 12 | } 13 | 14 | export function suffixes(iter: Iterable): Set { 15 | const set = new Set(); 16 | 17 | for (const item of iter) { 18 | set.add(item); 19 | for (let i = 0, l = item.length; i < l; i++) { 20 | set.add(item.substr(i + 1)); 21 | } 22 | } 23 | 24 | return set; 25 | } 26 | -------------------------------------------------------------------------------- /tests/helper/regexp-literals.ts: -------------------------------------------------------------------------------- 1 | import { Literal } from "../../src/js"; 2 | 3 | export const EMPTY_LITERALS: Literal[] = [/[^\s\S]/, /[^\s\S]+|[^\w\W]{4,13}/]; 4 | 5 | export const NON_EMPTY_LITERALS: Literal[] = [/(?:)/, /a*|b*c*/]; 6 | 7 | export const FINITE_LITERALS: Literal[] = [ 8 | 
...EMPTY_LITERALS, 9 | 10 | // only match the empty word 11 | /(?:)/, 12 | /(?:)*/, 13 | /(?:|)*/, 14 | /(?:[^\s\S])*/, 15 | 16 | /a|b/, 17 | /a{0,10}|bb|c?/, 18 | /[\0-\uFFFF]{100}/, // matches about 4.4e+481 words but still finite 19 | ]; 20 | 21 | export const NON_FINITE_LITERALS: Literal[] = [/a+/, /a*|bb|ccc/, /(?:a|)*/, /(?:a|[^\s\S]){5,}/]; 22 | -------------------------------------------------------------------------------- /tests/helper/chars.ts: -------------------------------------------------------------------------------- 1 | import { assert } from "chai"; 2 | import { CharSet } from "../../src/char-set"; 3 | import { Literal, Parser } from "../../src/js"; 4 | 5 | export function charsFromRegex(literal: Literal): CharSet { 6 | const parser = Parser.fromLiteral(literal); 7 | const { expression } = parser.parse({ simplify: false }); 8 | 9 | assert.equal(expression.alternatives.length, 1); 10 | const elements = expression.alternatives[0].elements; 11 | assert.equal(elements.length, 1); 12 | const element = elements[0]; 13 | if (element.type === "CharacterClass") { 14 | return element.characters; 15 | } 16 | assert.fail("Only element should have been a character"); 17 | } 18 | -------------------------------------------------------------------------------- /tests/helper/config.ts: -------------------------------------------------------------------------------- 1 | export const CONFIG_UPDATE: boolean = process.argv.indexOf("--update") >= 0; 2 | 3 | export const CONFIG_RUN_TRANSFORMERS: boolean = process.argv.indexOf("--run-transformers") >= 0; 4 | 5 | export const CONFIG_RUN_STRESS_TEST: boolean = process.argv.indexOf("--run-stress-test") >= 0; 6 | 7 | /** 8 | * Setting this to `true` will cause each regex to be parsed with all possible (interesting) parse options. 9 | * 10 | * Since there are quite a number of combinations, the test will then take minutes. Only enable this if you want to 11 | * verify that the parser behaves correctly. 
12 | */ 13 | export const CONFIG_ALL_PARSE_OPTIONS: boolean = process.argv.indexOf("--all-parse-options") >= 0; 14 | -------------------------------------------------------------------------------- /src/js/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes and functions to convert JavaScript RegExp to refa AST and vise versa. 3 | * 4 | * All classes and functions in this module/namespace are specific to JavaScript regular expressions as defined by the 5 | * ECMAScript standard. 6 | * 7 | * @see {@link Parser}: A class to convert from JS RegExp to refa AST. 8 | * @see {@link toLiteral}: A function to convert from refa AST to JS RegExp. 9 | * 10 | * @module 11 | */ 12 | 13 | export * from "./create-assertion"; 14 | export * from "./create-char-set"; 15 | export * from "./to-literal"; 16 | export * from "./flags"; 17 | export * from "./literal"; 18 | export * from "./parser"; 19 | export * from "./parse-unicode-set"; 20 | export * from "./string-set"; 21 | export * from "./unicode-set"; 22 | export * from "./char-case-folding"; 23 | -------------------------------------------------------------------------------- /tests/transformers/creation-options.ts: -------------------------------------------------------------------------------- 1 | import * as Transformers from "../../src/transformers"; 2 | 3 | describe("Transformers", function () { 4 | it("should all accept CreationOptions", function () { 5 | const options: Readonly = {}; 6 | 7 | Transformers.applyAssertions(options); 8 | Transformers.factorOut(options); 9 | Transformers.inline(options); 10 | Transformers.mergeWithQuantifier(options); 11 | Transformers.moveUpEmpty(options); 12 | Transformers.nestedQuantifiers(options); 13 | Transformers.patternEdgeAssertions(options); 14 | Transformers.removeDeadBranches(options); 15 | Transformers.removeUnnecessaryAssertions(options); 16 | Transformers.replaceAssertions(options); 17 | 
Transformers.sortAssertions(options); 18 | Transformers.unionCharacters(options); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /tests/transformers/replace-assertions.ts: -------------------------------------------------------------------------------- 1 | import { itTest } from "../helper/transform"; 2 | import { replaceAssertions } from "../../src/transformers"; 3 | 4 | describe("Transformers", function () { 5 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 6 | itTest(null, [ 7 | { 8 | literal: /\bfoo\b|abc(?=abc$)|bar(?:\w(?(([a, b]) => [b, a]), 44 | ]; 45 | -------------------------------------------------------------------------------- /scripts/util.ts: -------------------------------------------------------------------------------- 1 | import { CharRange } from "../src/char-set"; 2 | 3 | export function printRanges(ranges: Iterable): string { 4 | return `JSON.parse('${JSON.stringify([...ranges])}')`; 5 | } 6 | 7 | export function logDurations(durations: number[], label?: string): void { 8 | durations.sort((a, b) => a - b); 9 | 10 | const avg = durations.reduce((a, b) => a + b, 0) / durations.length; 11 | const max = Math.max(...durations); 12 | 13 | const median = 14 | durations.length % 2 === 0 15 | ? 
(durations[durations.length / 2 - 1] + durations[durations.length / 2]) / 2 16 | : durations[(durations.length - 1) / 2]; 17 | 18 | const parts: string[] = []; 19 | // eslint-disable-next-line @typescript-eslint/strict-boolean-expressions 20 | if (label) { 21 | parts.push(label); 22 | } 23 | 24 | parts.push(`avg=${+avg.toExponential(2)}ms`); 25 | parts.push(`med=${+median.toExponential(2)}ms`); 26 | parts.push(`max=${+max.toExponential(2)}ms`); 27 | 28 | console.log(parts.join("\t")); 29 | } 30 | -------------------------------------------------------------------------------- /src/iter/print-util.ts: -------------------------------------------------------------------------------- 1 | import { FAIterator } from "../fa-types"; 2 | import { iterToArray } from "../util"; 3 | import { ensureStableOut, iterateStates, mapOut, mapOutIter } from "./iterator"; 4 | import { NodeInfo } from "./print-common"; 5 | 6 | export interface IndexNodes { 7 | stableIter: FAIterator; 8 | states: readonly S[]; 9 | info: NodeInfo; 10 | } 11 | 12 | export function indexNodes(iter: FAIterator>): IndexNodes { 13 | const stableIter = ensureStableOut(mapOut(iter, iterToArray)); 14 | const states: readonly S[] = [...iterateStates(mapOutIter(stableIter, ([s]) => s))]; 15 | const indexMap = new Map(states.map((s, i) => [s, i])); 16 | 17 | return { 18 | stableIter, 19 | states, 20 | info: { 21 | isInitial: s => s === stableIter.initial, 22 | isFinal: stableIter.isFinal, 23 | getId: (state: S): number => indexMap.get(state)!, 24 | getNumberOfOutgoingEdges: s => stableIter.getOut(s).length, 25 | }, 26 | }; 27 | } 28 | -------------------------------------------------------------------------------- /src/word-set.ts: -------------------------------------------------------------------------------- 1 | import type { CharSet } from "./char-set"; 2 | 3 | /** 4 | * A word set is finite sequence of non-empty {@link CharSet}s. 
5 | * 6 | * All {@link CharSet}s are guaranteed to be non-empty and to have the same maximum. 7 | * 8 | * All FA and regex implementations are based on either {@link CharSet}s or {@link CharRange}s. This is necessary 9 | * because it's not practical to represent the large character sets used in every-day regexes using single characters. 10 | * Consequently, it is more efficient to work with {@link CharSet}s for them, so operations that yield the words of an 11 | * FA or regex typically yield {@link WordSet}s instead of {@link Word}s. 12 | * 13 | * This type serves as a way to document word sets. It should _not_ be used interchangeably with `CharSet[]`. 14 | */ 15 | export type WordSet = CharSet[]; 16 | 17 | /** 18 | * An immutable finite sequence of non-empty {@link CharSet}s. 19 | * 20 | * This is an immutable view on a {@link WordSet}. 21 | */ 22 | export type ReadonlyWordSet = readonly CharSet[]; 23 | -------------------------------------------------------------------------------- /src/transformers/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Contains all AST transformer implementations of refa. 3 | * 4 | * All transformer factory functions implemented here will optionally take {@link CreationOptions} or a sub-class of it. 5 | * This can be used to control the behavior of the created transformers. 6 | * 7 | * For a simple transformer that applies most transformers while preserving the semantic of the given AST, 8 | * see {@link simplify}. 
9 | * 10 | * @module 11 | */ 12 | 13 | export * from "./creation-options"; 14 | 15 | export * from "./apply-assertions"; 16 | export * from "./factor-out"; 17 | export * from "./inline"; 18 | export * from "./make-greedy"; 19 | export * from "./merge-with-quantifier"; 20 | export * from "./move-up-empty"; 21 | export * from "./nested-quantifiers"; 22 | export * from "./pattern-edge-assertions"; 23 | export * from "./remove-dead-branches"; 24 | export * from "./remove-unnecessary-assertions"; 25 | export * from "./replace-assertions"; 26 | export * from "./sort-assertions"; 27 | export * from "./union-characters"; 28 | 29 | export * from "./simplify"; 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Michael Schmidt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/transformers/simplify.ts: -------------------------------------------------------------------------------- 1 | import { itTest, regexSnapshot } from "../helper/transform"; 2 | import { simplify } from "../../src/transformers"; 3 | 4 | describe("Transformers", function () { 5 | const transformer = simplify(); 6 | const transformerIgnore = simplify({ ignoreAmbiguity: true, ignoreOrder: true }); 7 | 8 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 9 | itTest(transformer, [ 10 | /\b(?!\d)\b\w+\b\s*\(/, 11 | /(?:^|@)\b\w+\b/, 12 | /"""(?:(?!""").)*"""/s, 13 | /"""((?!""")(?:[^\\]|\\"))*"""/, 14 | /(?:(?!<\/title>).)*<\/title>/, 15 | 16 | /^(?:(?:25[0-5]|(2[0-4]|1\d|[1-9]|)\d)(?:\.(?!$)|$)){4}$/, 17 | { literal: /^((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.?\b){4}$/, stepByStep: true }, 18 | 19 | { literal: /\b(?:true|false)\b/, transformer: transformerIgnore, stepByStep: true }, 20 | { literal: /\b[a-z_]\w*(?=\s*\()\b/i, stepByStep: true }, 21 | ]); 22 | 23 | it("Prism regex snapshot", function () { 24 | regexSnapshot(this, transformer); 25 | }); 26 | 27 | it("Prism regex snapshot ignoring order and ambiguity", function () { 28 | regexSnapshot(this, transformerIgnore); 29 | }); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /tests/helper/from-regex-data.ts: -------------------------------------------------------------------------------- 1 | import { Literal } from "../../src/js"; 2 | 3 | export const FROM_REGEX_LITERALS: Literal[] = [ 4 | /a?/, 5 | /a??/, 6 | /a+/, 7 | /a+?/, 8 | /(a|b)+c/, 9 | /a*b*c*/, 10 | /a*b*?c*/, 11 | /a*?b*c*?/, 12 | /a+b+?c+/, 13 | /a{4}/, 14 | /a{4}?/, 15 | /(a|){3}/, 16 | /(|a){3}/, 17 | /(|a|){3}/, 18 | /a{2,4}/, 19 | /a{2,4}?/, 20 | /a{2,6}/, 21 | /(ab){0,3}/, 22 | /(){100,1000}/, 23 | /a+|/, 24 | /|a+/, 25 | /a*/, 26 | /a*?/, 27 | /(a|)+/, 28 | /(a*)+/, 29 | /(a*){4}/, 30 | 
/(a+|){4}/, 31 | /(a+)+/, 32 | /(a+|){0,4}/, 33 | /(a+){4}/, 34 | /(a*){4,}/, 35 | /((a*)+)?/, 36 | /(a|b)?c/, 37 | /(a+|b+)*/, 38 | /()*/, 39 | /([^\s\S])*/, 40 | /a*|b*/, 41 | /a+|b+|c+/, 42 | /(a*|b*)+/, 43 | /[^\s\S]/, 44 | /ab[^\s\S]ba/, 45 | /([^\s\S]|a|[^\s\S]|b[^\s\S]b|[^\s\S])a/, 46 | /[^\s\S]+/, 47 | /[^\s\S]*/, 48 | /[^\s\S]?/, 49 | /a+|aaab/, 50 | /a+|a*aa*/, 51 | /(?:a+){2,}/, 52 | /abc|ab|abd|abcd/, 53 | /abc?|abd|abcd/, 54 | /food|fool|foot/, 55 | /fo(od|ol|ot)/, 56 | /bet|get|pet|set/, 57 | /bet|bat|bit/, 58 | /a(?:bc)?|dbc/, 59 | /\d+(?:\.\d+)?(?:e[+-]?\d+)?/i, 60 | /<[=>]?|>=?|=>?|:=|\/=?/, 61 | /\{[^\r\n}:]+\}/, 62 | /'(?:%.|[^%'\r\n])+'/, 63 | /&[bchou][a-z\d]+/i, 64 | /"(?:[^\\"]|\\.)*"|'[^']*'/, 65 | ]; 66 | -------------------------------------------------------------------------------- /tests/transformers/move-up-empty.ts: -------------------------------------------------------------------------------- 1 | import { itTest, regexSnapshot } from "../helper/transform"; 2 | import { inline, moveUpEmpty, removeDeadBranches } from "../../src/transformers"; 3 | import { CombinedTransformer } from "../../src/ast"; 4 | 5 | describe("Transformers", function () { 6 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 7 | const transformer = moveUpEmpty({ ignoreAmbiguity: true, ignoreOrder: true }); 8 | 9 | itTest(transformer, [ 10 | { 11 | literal: /||a*|b/, 12 | expected: /(?:a+|b)?/, 13 | }, 14 | { 15 | literal: /a*|b*|c*/, 16 | expected: /(?:a+|b+|c+)?/, 17 | }, 18 | { 19 | literal: /a*|b*|c*|d*e?/, 20 | expected: /a+|b+|c+|d*e?/, 21 | }, 22 | { 23 | literal: /a(?:|||)/, 24 | expected: /a(?:)/, 25 | }, 26 | 27 | { 28 | literal: /a*|b*|/, 29 | transformer: moveUpEmpty(), 30 | }, 31 | { 32 | literal: /|a*|b*|/, 33 | transformer: moveUpEmpty(), 34 | }, 35 | ]); 36 | 37 | it("Prism regex snapshot", function () { 38 | const transformer = new CombinedTransformer([ 39 | moveUpEmpty({ ignoreAmbiguity: true, ignoreOrder: true }), 40 | inline(), 
41 | removeDeadBranches(), 42 | ]); 43 | 44 | regexSnapshot(this, transformer); 45 | }); 46 | }); 47 | }); 48 | -------------------------------------------------------------------------------- /tests/transformers/factor-out.ts: -------------------------------------------------------------------------------- 1 | import { itTest, regexSnapshot } from "../helper/transform"; 2 | import { factorOut, inline, removeDeadBranches } from "../../src/transformers"; 3 | import { CombinedTransformer } from "../../src/ast"; 4 | 5 | describe("Transformers", function () { 6 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 7 | const transformer = factorOut(); 8 | 9 | itTest(transformer, [ 10 | { 11 | literal: /air|after/, 12 | expected: /a(?:i|fte)r/, 13 | }, 14 | { 15 | literal: /abc|abc|abc/, 16 | expected: /abc(?:||)/, 17 | }, 18 | { 19 | literal: /(?:a|b)*b\w+|(?:a|b)*a\w+/, 20 | expected: /(?:a|b)*(?:b|a)\w+/, 21 | }, 22 | { 23 | literal: /\w+|\w*|\w{3,}/, 24 | expected: /\w*(?:\w||\w{3})/, 25 | }, 26 | { 27 | literal: /\w+|\w{2,3}/, 28 | expected: /\w(?:\w*|\w{1,2})/, 29 | }, 30 | { 31 | literal: /\w{2,4}|\w{2,3}/, 32 | expected: /\w{2}(?:\w{0,2}|\w?)/, 33 | }, 34 | 35 | /a$|b$|c/, 36 | 37 | /a(?:b|c)|b|c|d/, 38 | /b|c|a(?:b|c)|d/, 39 | ]); 40 | 41 | it("Prism regex snapshot", function () { 42 | const transformer = new CombinedTransformer([factorOut(), inline(), removeDeadBranches()]); 43 | 44 | regexSnapshot(this, transformer); 45 | }); 46 | }); 47 | }); 48 | -------------------------------------------------------------------------------- /tests/transformers/inline.ts: -------------------------------------------------------------------------------- 1 | import { itTest } from "../helper/transform"; 2 | import { inline } from "../../src/transformers"; 3 | 4 | describe("Transformers", function () { 5 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 6 | const transformer = inline(); 7 | 8 | itTest(transformer, [ 9 | { 10 | literal: 
/a{0}a{1}(?:foo)/, 11 | expected: /afoo/, 12 | }, 13 | { 14 | literal: /(?:a|(?:b))|c/, 15 | expected: /a|b|c/, 16 | }, 17 | { 18 | literal: /(?=(?:a|(?:b)))/, 19 | expected: /(?=a|b)/, 20 | }, 21 | 22 | { 23 | literal: /(?=a(?=b))/, 24 | expected: /(?=ab)/, 25 | }, 26 | { 27 | literal: /(?=a(?:c(?=b)|foo)?)/, 28 | expected: /(?=a(?:cb|foo)?)/, 29 | }, 30 | { 31 | literal: /(?=a(?:c(?=b)|f(?=oo)))/, 32 | expected: /(?=a(?:cb|foo))/, 33 | }, 34 | { 35 | literal: /(?<!(?<=a)b)/, 36 | expected: /(?<!ab)/, 37 | }, 38 | { 39 | literal: /(?=a(?!b))/, 40 | transformer, 41 | expected: /(?=a(?!b))/, 42 | }, 43 | { 44 | literal: /(?=a(?<=b))/, 45 | expected: /(?=a(?<=b))/, 46 | }, 47 | 48 | { 49 | literal: /(?=$)/, 50 | expected: /$/, 51 | }, 52 | { 53 | literal: /(?!(?<=a))/, 54 | expected: /(?<!a)/, 55 | }, 56 | ]); 57 | }); 58 | }); 59 | -------------------------------------------------------------------------------- /src/iter/map-fa-builder.ts: -------------------------------------------------------------------------------- 1 | import { CharSet } from "../char-set"; 2 | import { FABuilder } from "../fa-types"; 3 | import { TooManyNodesError } from "../errors"; 4 | 5 | /** 6 | * An FA builder that uses `Map` objects as nodes. Each node is the map of its outgoing transitions. 
7 | */ 8 | export class MapFABuilder implements FABuilder<MapFABuilderNode, CharSet> { 9 | private readonly _limit: number; 10 | private _counter = 0; 11 | readonly initial: MapFABuilderNode = new Map(); 12 | readonly finals = new Set<MapFABuilderNode>(); 13 | 14 | constructor(maxNodes: number = Infinity) { 15 | this._limit = maxNodes; 16 | } 17 | 18 | makeFinal(state: MapFABuilderNode): void { 19 | this.finals.add(state); 20 | } 21 | isFinal(state: MapFABuilderNode): boolean { 22 | return this.finals.has(state); 23 | } 24 | createNode(): MapFABuilderNode { 25 | TooManyNodesError.assert(++this._counter, this._limit, "MapFABuilder"); 26 | 27 | return new Map(); 28 | } 29 | linkNodes(from: MapFABuilderNode, to: MapFABuilderNode, transition: CharSet): void { 30 | const current = from.get(to); 31 | if (current === undefined) { 32 | from.set(to, transition); 33 | } else { 34 | from.set(to, current.union(transition)); 35 | } 36 | } 37 | } 38 | 39 | export type MapFABuilderNode = Map<MapFABuilderNode, CharSet>; 40 | -------------------------------------------------------------------------------- /tests/transformers/union-characters.ts: -------------------------------------------------------------------------------- 1 | import { itTest, regexSnapshot } from "../helper/transform"; 2 | import { inline, removeDeadBranches, unionCharacters } from "../../src/transformers"; 3 | import { CombinedTransformer } from "../../src/ast"; 4 | 5 | describe("Transformers", function () { 6 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 7 | itTest(null, [ 8 | { 9 | literal: /(a|b|c) (a|b|dd|\w) (a||b) ([a-d]|do|[c-f])/, 10 | transformer: unionCharacters({}), 11 | expected: /(?:[abc]) (?:[ab]|dd|\w) (?:a||b) (?:[a-d]|do|[c-f])/, 12 | }, 13 | { 14 | literal: /(a|b|c) (a|b|dd|\w) (a||b) ([a-d]|do|[c-f])/, 15 | transformer: unionCharacters({ ignoreOrder: true }), 16 | expected: /(?:[abc]) (?:\w|dd|[ab]) (?:[ab]|) (?:[a-f]|do|[cd])/, 17 | }, 18 | { 19 | literal: /(a|b|c) 
(a|b|dd|\w) (a||b) ([a-d]|do|[c-f])/, 20 | transformer: unionCharacters({ ignoreOrder: true, ignoreAmbiguity: true }), 21 | expected: /(?:[abc]) (?:\w|dd) (?:[ab]|) (?:[a-f]|do)/, 22 | }, 23 | { 24 | literal: /(a|b|c) (a|b|dd|\w) (a||b) ([a-d]|do|[c-f])/, 25 | transformer: unionCharacters({ ignoreAmbiguity: true }), 26 | expected: /(?:[abc]) (?:[ab]|dd|\w) (?:a||b) (?:[a-d]|do|[c-f])/, 27 | }, 28 | ]); 29 | 30 | it("Prism regex snapshot", function () { 31 | const transformer = new CombinedTransformer([unionCharacters(), inline(), removeDeadBranches()]); 32 | 33 | regexSnapshot(this, transformer); 34 | }); 35 | }); 36 | }); 37 | -------------------------------------------------------------------------------- /src/char-types.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * A character is a non-negative integer. 3 | * 4 | * This is one of the core concepts of refa. Instead of operating on JavaScript strings, UTF16 character codes, or 5 | * Unicode code points, this library uses plain numbers instead. This makes refa agnostic to text encodings and even 6 | * text in general since the integers used as character may represent arbitrary concepts. 7 | * 8 | * There are only 2 restrictions on the numbers that can be characters: 9 | * 10 | * 1. They have to be non-negative integers. 11 | * 2. They can be at most `Number.MAX_SAFE_INTEGER`. 12 | * 13 | * --- 14 | * 15 | * This type serves as a way to document characters. It is a clear way to signal that a value is not just any number. 16 | */ 17 | export type Char = number & { __char?: never }; 18 | 19 | /** 20 | * A word is finite sequence of {@link Char}s. 21 | * 22 | * This one of the core concepts of refa. Instead of operating on JavaScript strings, all functions operate on 23 | * {@link Char}s and char arrays (= words). This means that refa is agnostic to text encodings, the string 24 | * representation of JavaScript, and even text itself. 
25 | * 26 | * This type serves as a way to document words. It should _not_ be used interchangeably with `Char[]` or `number[]`. 27 | */ 28 | export type Word = Char[]; 29 | 30 | /** 31 | * An immutable finite sequence of {@link Char}s. 32 | * 33 | * This is an immutable view on a {@link Word}. 34 | */ 35 | export type ReadonlyWord = readonly Char[]; 36 | -------------------------------------------------------------------------------- /tests/transformers/merge-with-quantifier.ts: -------------------------------------------------------------------------------- 1 | import { itTest, regexSnapshot } from "../helper/transform"; 2 | import { inline, mergeWithQuantifier, removeDeadBranches } from "../../src/transformers"; 3 | import { CombinedTransformer } from "../../src/ast"; 4 | 5 | describe("Transformers", function () { 6 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 7 | const transformer = mergeWithQuantifier(); 8 | const transformerIgnoreAmbiguity = mergeWithQuantifier({ ignoreAmbiguity: true }); 9 | 10 | itTest(transformer, [ 11 | { 12 | literal: /a*a/, 13 | expected: /a+/, 14 | }, 15 | { 16 | literal: /a*a/, 17 | transformer: transformerIgnoreAmbiguity, 18 | expected: /a+/, 19 | }, 20 | 21 | { 22 | literal: /a*a+/, 23 | expected: /a*a+/, 24 | }, 25 | { 26 | literal: /a*a+/, 27 | transformer: transformerIgnoreAmbiguity, 28 | expected: /a+/, 29 | }, 30 | 31 | { 32 | literal: /a*(?:a?|a+|a{1,2})/, 33 | expected: /a*(?:a?|a+|a{1,2})/, 34 | }, 35 | { 36 | literal: /a*(?:a?|a+|a{1,2})/, 37 | transformer: transformerIgnoreAmbiguity, 38 | expected: /a*(?:|a{1}|a{1})/, 39 | }, 40 | 41 | /aab(?:ab)*/, 42 | /aba(?:ab)*/, 43 | /(?:ab)*aba/, 44 | /(?:ab)*baa/, 45 | ]); 46 | 47 | it("Prism regex snapshot", function () { 48 | const transformer = new CombinedTransformer([mergeWithQuantifier(), inline(), removeDeadBranches()]); 49 | 50 | regexSnapshot(this, transformer); 51 | }); 52 | }); 53 | }); 54 | 
-------------------------------------------------------------------------------- /src/transformers/replace-assertions.ts: -------------------------------------------------------------------------------- 1 | import { NoParent, Parent, TransformContext, Transformer } from "../ast"; 2 | import { filterMut } from "../util"; 3 | import { CreationOptions } from "./creation-options"; 4 | 5 | export interface RemoveAssertionsCreationOptions extends CreationOptions { 6 | /** 7 | * @default "empty-set" 8 | */ 9 | replacement?: "empty-set" | "empty-word"; 10 | } 11 | 12 | function onParent(node: NoParent<Parent>, context: TransformContext): void { 13 | filterMut(node.alternatives, alternative => { 14 | if (alternative.elements.some(e => e.type === "Assertion")) { 15 | context.signalMutation(); 16 | return false; 17 | } 18 | return true; 19 | }); 20 | } 21 | 22 | /** 23 | * This transformer will all assertions with either the empty set or the empty word. 24 | * 25 | * @param options 26 | */ 27 | export function replaceAssertions(options?: Readonly<RemoveAssertionsCreationOptions>): Transformer { 28 | const replacement = options?.replacement ?? 
"empty-set"; 29 | 30 | if (replacement === "empty-word") { 31 | return { 32 | name: "replaceAssertions", 33 | onConcatenation(node, { signalMutation }) { 34 | filterMut(node.elements, element => { 35 | if (element.type === "Assertion") { 36 | signalMutation(); 37 | return false; 38 | } 39 | return true; 40 | }); 41 | }, 42 | }; 43 | } else { 44 | return { 45 | name: "replaceAssertions", 46 | onAlternation: onParent, 47 | onExpression: onParent, 48 | onQuantifier: onParent, 49 | }; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tests/transformers/remove-unnecessary-assertions.ts: -------------------------------------------------------------------------------- 1 | import { itTest, regexSnapshot } from "../helper/transform"; 2 | import { CombinedTransformer } from "../../src/ast"; 3 | import { inline, removeDeadBranches, removeUnnecessaryAssertions } from "../../src/transformers"; 4 | 5 | describe("Transformers", function () { 6 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 7 | const transformer = new CombinedTransformer([ 8 | removeUnnecessaryAssertions(), 9 | removeDeadBranches(), // remove dead branches... 
10 | inline(), // and inline, so the output regexes are cleaner 11 | ]); 12 | 13 | itTest(transformer, [ 14 | /(?=\s*)a|(?!\s*)b|c(?<=\s*)|d(?<!\s*)/, 15 | /(?=[])a|(?![])b|c(?<=[])|d(?<![])/, 16 | 17 | /\bfoo\b|\Bfoo\B/, 18 | 19 | /(?=\s*$|\S)[\s\S]+/, 20 | /(?=\s*$|\S)a+/, 21 | /(?=\s?$|\S)a+/, 22 | /(?=\s$|\S)a+/, 23 | /(?=(?:\s|bb)$|\S)a+/, 24 | /(?=(?:\s|b*b|)$|\S)a+/, 25 | /(?!b)a+/, 26 | /(?!b+)a+/, 27 | 28 | /(?!\s*::|\d)\w+/, 29 | /(?=\s*::|\d)\w+/, 30 | 31 | /^^/, 32 | /$$$$$/, 33 | /(?!\w)$/, 34 | /$(?!\w)/, 35 | 36 | /^(?:@|(?<!\w))[A-Z_]\w*/i, 37 | /^(?:@|(?<!\w)(?!\w))/i, 38 | /^(?:@|(?<!\w)(?!\w))$/i, 39 | /(?:@|(?<!\w)(?!\w))$/i, 40 | 41 | /^\bfoo\b$/, 42 | /^\b(?:foo|bar)\b$/, 43 | 44 | /\w+\b(?=\s*%)/, 45 | /\w+(?=\s*%)\b/, 46 | ]); 47 | 48 | it("Prism regex snapshot", function () { 49 | const transformer = new CombinedTransformer([ 50 | removeUnnecessaryAssertions(), 51 | inline(), 52 | removeDeadBranches(), 53 | ]); 54 | 55 | regexSnapshot(this, transformer); 56 | }); 57 | }); 58 | }); 59 | -------------------------------------------------------------------------------- /src/ast/set-source.ts: -------------------------------------------------------------------------------- 1 | import { NoParent, Node, SourceLocation } from "./nodes"; 2 | import { assertNever } from "../util"; 3 | 4 | /** 5 | * Sets the `source` property of the given node and all of its child nodes. 6 | * 7 | * If `source` is not a function, then the source object will be copied for all `source` properties to be set. The 8 | * object will be copied using the `start` and `end` properties alone, other properties will not be copied. 
9 | * 10 | * @param node 11 | * @param source 12 | * @param overwrite 13 | */ 14 | export function setSource( 15 | node: NoParent<Node>, 16 | source: SourceLocation | (() => SourceLocation), 17 | overwrite?: boolean 18 | ): void { 19 | if (typeof source !== "function") { 20 | const { start, end } = source; 21 | source = () => ({ start, end }); 22 | } 23 | 24 | setSourceImpl(node, source, overwrite); 25 | } 26 | function setSourceImpl(node: NoParent<Node>, getSource: () => SourceLocation, overwrite?: boolean): void { 27 | if (overwrite || !node.source) { 28 | node.source = getSource(); 29 | } else { 30 | const { start, end } = node.source; 31 | getSource = () => ({ start, end }); 32 | } 33 | 34 | switch (node.type) { 35 | case "Concatenation": 36 | node.elements.forEach(e => setSourceImpl(e, getSource, overwrite)); 37 | break; 38 | 39 | case "Alternation": 40 | case "Assertion": 41 | case "Expression": 42 | case "Quantifier": 43 | node.alternatives.forEach(c => setSourceImpl(c, getSource, overwrite)); 44 | break; 45 | 46 | case "CharacterClass": 47 | case "Unknown": 48 | // no children 49 | break; 50 | 51 | default: 52 | throw assertNever(node); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tests/transformers/nested-quantifiers.ts: -------------------------------------------------------------------------------- 1 | import { itTest, regexSnapshot } from "../helper/transform"; 2 | import { inline, nestedQuantifiers, removeDeadBranches } from "../../src/transformers"; 3 | import { CombinedTransformer } from "../../src/ast"; 4 | 5 | describe("Transformers", function () { 6 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 7 | const transformer = nestedQuantifiers({ ignoreAmbiguity: true, ignoreOrder: true }); 8 | 9 | itTest(transformer, [ 10 | { 11 | literal: /(?:a+)*/, 12 | expected: /a*/, 13 | }, 14 | { 15 | literal: /(?:a*)+/, 16 | expected: /a*/, 17 | }, 18 | { 19 | literal: /(?:a*)?/, 20 | 
expected: /a*/, 21 | }, 22 | { 23 | literal: /(?:a?)?/, 24 | expected: /a?/, 25 | }, 26 | { 27 | literal: /(?:a+)+/, 28 | expected: /a+/, 29 | }, 30 | { 31 | literal: /(?:a{4}){8}/, 32 | expected: /a{32}/, 33 | }, 34 | { 35 | literal: /(?:a{2,4})+/, 36 | expected: /a{2,}/, 37 | }, 38 | { 39 | literal: /(?:a{2,4}){8}/, 40 | expected: /a{16,32}/, 41 | }, 42 | { 43 | literal: /(?:a{8}){2,4}/, 44 | expected: /(?:a{8}){2,4}/, 45 | }, 46 | 47 | { 48 | literal: /(?:a*|b+c|f+)*/, 49 | expected: /(?:a{1}|b+c|f{1})*/, 50 | }, 51 | { 52 | literal: /(?:a*|b+c|f+)?/, 53 | expected: /(?:a+|b+c|f+)?/, 54 | }, 55 | ]); 56 | 57 | it("Prism regex snapshot", function () { 58 | const transformer = new CombinedTransformer([ 59 | nestedQuantifiers({ ignoreAmbiguity: true, ignoreOrder: true }), 60 | inline(), 61 | removeDeadBranches(), 62 | ]); 63 | 64 | regexSnapshot(this, transformer); 65 | }); 66 | }); 67 | }); 68 | -------------------------------------------------------------------------------- /.github/workflows/nodejs.yml: -------------------------------------------------------------------------------- 1 | # This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions 3 | 4 | name: Node.js CI 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | node-version: [12.x, 14.x, 16.x, 18.x, 20.x] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Use Node.js ${{ matrix.node-version }} 21 | uses: actions/setup-node@v1 22 | with: 23 | node-version: ${{ matrix.node-version }} 24 | - run: npm ci 25 | - run: npm run build 26 | env: 27 | CI: true 28 | - run: npx tsc index.d.ts --noEmit 29 | env: 30 | CI: true 31 | 32 | test: 33 | runs-on: ubuntu-latest 34 | strategy: 35 | 
matrix: 36 | node-version: [12.x, 14.x, 16.x, 18.x, 20.x] 37 | steps: 38 | - uses: actions/checkout@v2 39 | - name: Use Node.js ${{ matrix.node-version }} 40 | uses: actions/setup-node@v1 41 | with: 42 | node-version: ${{ matrix.node-version }} 43 | - run: npm ci 44 | - run: npm run test:all 45 | env: 46 | CI: true 47 | 48 | check: 49 | runs-on: ubuntu-latest 50 | steps: 51 | - uses: actions/checkout@v2 52 | - name: Use Node.js 16.x 53 | uses: actions/setup-node@v1 54 | with: 55 | node-version: 16.x 56 | - run: npm ci 57 | - run: npm run check 58 | env: 59 | CI: true 60 | -------------------------------------------------------------------------------- /src/js/flags.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * An unchecked partial set of RegExp flags. 3 | * 4 | * Flags are not validated by TypeScript. You must ensure that the flags are valid. 5 | * Whenever possible, use the {@link Flags} type instead. 6 | */ 7 | export interface UncheckedFlags { 8 | /** @default false */ 9 | dotAll?: boolean; 10 | /** @default false */ 11 | global?: boolean; 12 | /** @default false */ 13 | hasIndices?: boolean; 14 | /** @default false */ 15 | ignoreCase?: boolean; 16 | /** @default false */ 17 | multiline?: boolean; 18 | /** @default false */ 19 | sticky?: boolean; 20 | /** @default false */ 21 | unicode?: boolean; 22 | /** @default false */ 23 | unicodeSets?: boolean; 24 | } 25 | 26 | /** 27 | * Returns whether the given flags are valid. 28 | * 29 | * @param flags 30 | */ 31 | export function isFlags(flags: UncheckedFlags): flags is Flags { 32 | const { unicode = false, unicodeSets = false } = flags; 33 | return (!unicode && !unicodeSets) || (unicode && !unicodeSets) || (!unicode && unicodeSets); 34 | } 35 | 36 | /** 37 | * A partial set of non-Unicode-sets RegExp flags. The `v` flag is guaranteed to be unset. 
38 | */ 39 | export interface NonUnicodeSetsFlags extends UncheckedFlags { 40 | /** @default false */ 41 | unicode?: boolean; 42 | /** @default false */ 43 | unicodeSets?: false; 44 | } 45 | /** 46 | * A partial set of Unicode-sets RegExp flags. The `v` flag is guaranteed to be set. 47 | */ 48 | export interface UnicodeSetsFlags extends UncheckedFlags { 49 | /** @default false */ 50 | unicode?: false; 51 | /** @default false */ 52 | unicodeSets: true; 53 | } 54 | 55 | /** 56 | * A partial set of RegExp flags. 57 | */ 58 | export type Flags = NonUnicodeSetsFlags | UnicodeSetsFlags; 59 | -------------------------------------------------------------------------------- /tests/iter/to-dot.ts: -------------------------------------------------------------------------------- 1 | import { CharSet } from "../../src/char-set"; 2 | import { toLiteral } from "../../src/js"; 3 | import { literalToDFA, literalToENFA, literalToNFA, literalToString } from "../helper/fa"; 4 | import { assertEqualSnapshot } from "../helper/snapshot"; 5 | 6 | describe("toDot", function () { 7 | describe("from regexes", function () { 8 | const regexes: RegExp[] = [ 9 | /a/, 10 | /abc/i, 11 | /a*b+c*/, 12 | /\d+(?:\.\d+)?(?:e[+-]?\d+)/i, 13 | /\/\*[\s\S]*?\*\//, 14 | /((ab)+){3,}/, 15 | // We need to escape the front slashes here to workaround a NodeJS <= v10 bug. 
16 | // eslint-disable-next-line no-useless-escape 17 | /<\/?[^\s\d>\/=$<%][^\s>\/=$<%]*(?:\s+[^\s>\/=]+(?:=(?:"(?:\\[\s\S]|{(?:{(?:{[^{}]*}|[^{}])*}|[^{}])+}|[^\\"])*"|[^\s'">=]+))?)*\s*\/?>/, 18 | ]; 19 | 20 | const toStringFuncs: { name: string; fn?: (cs: CharSet) => string }[] = [ 21 | { name: "default" }, 22 | { 23 | name: "JS", 24 | fn(cs) { 25 | return toLiteral( 26 | { type: "Concatenation", elements: [{ type: "CharacterClass", characters: cs }] }, 27 | { flags: { ignoreCase: false, dotAll: false } } 28 | ).source; 29 | }, 30 | }, 31 | ]; 32 | 33 | for (const re of regexes) { 34 | for (const { name, fn } of toStringFuncs) { 35 | it(`DFA (${name}): ${literalToString(re)}`, function () { 36 | const dfa = literalToDFA(re); 37 | dfa.minimize(); 38 | assertEqualSnapshot(this, dfa.toDot(fn)); 39 | }); 40 | it(`ENFA (${name}): ${literalToString(re)}`, function () { 41 | assertEqualSnapshot(this, literalToENFA(re).toDot(fn)); 42 | }); 43 | it(`NFA (${name}): ${literalToString(re)}`, function () { 44 | assertEqualSnapshot(this, literalToNFA(re).toDot(fn)); 45 | }); 46 | } 47 | } 48 | }); 49 | }); 50 | -------------------------------------------------------------------------------- /tests/iter/to-mermaid.ts: -------------------------------------------------------------------------------- 1 | import { CharSet } from "../../src/char-set"; 2 | import { toLiteral } from "../../src/js"; 3 | import { literalToDFA, literalToENFA, literalToNFA, literalToString } from "../helper/fa"; 4 | import { assertEqualSnapshot } from "../helper/snapshot"; 5 | 6 | describe("toMermaid", function () { 7 | describe("from regexes", function () { 8 | const regexes: RegExp[] = [ 9 | /a/, 10 | /abc/i, 11 | /a*b+c*/, 12 | /\d+(?:\.\d+)?(?:e[+-]?\d+)/i, 13 | /\/\*[\s\S]*?\*\//, 14 | /((ab)+){3,}/, 15 | // We need to escape the front slashes here to workaround a NodeJS <= v10 bug. 
16 | // eslint-disable-next-line no-useless-escape 17 | /<\/?[^\s\d>\/=$<%][^\s>\/=$<%]*(?:\s+[^\s>\/=]+(?:=(?:"(?:\\[\s\S]|{(?:{(?:{[^{}]*}|[^{}])*}|[^{}])+}|[^\\"])*"|[^\s'">=]+))?)*\s*\/?>/, 18 | ]; 19 | 20 | const toStringFuncs: { name: string; fn?: (cs: CharSet) => string }[] = [ 21 | { name: "default" }, 22 | { 23 | name: "JS", 24 | fn(cs) { 25 | return toLiteral( 26 | { type: "Concatenation", elements: [{ type: "CharacterClass", characters: cs }] }, 27 | { flags: { ignoreCase: false, dotAll: false } } 28 | ).source; 29 | }, 30 | }, 31 | ]; 32 | 33 | for (const re of regexes) { 34 | for (const { name, fn } of toStringFuncs) { 35 | it(`DFA (${name}): ${literalToString(re)}`, function () { 36 | const dfa = literalToDFA(re); 37 | dfa.minimize(); 38 | assertEqualSnapshot(this, dfa.toMermaid(fn)); 39 | }); 40 | it(`ENFA (${name}): ${literalToString(re)}`, function () { 41 | assertEqualSnapshot(this, literalToENFA(re).toMermaid(fn)); 42 | }); 43 | it(`NFA (${name}): ${literalToString(re)}`, function () { 44 | assertEqualSnapshot(this, literalToNFA(re).toMermaid(fn)); 45 | }); 46 | } 47 | } 48 | }); 49 | }); 50 | -------------------------------------------------------------------------------- /src/transformers/simplify.ts: -------------------------------------------------------------------------------- 1 | import { CombinedTransformer } from "../ast"; 2 | import { applyAssertions } from "./apply-assertions"; 3 | import { CreationOptions } from "./creation-options"; 4 | import { factorOut } from "./factor-out"; 5 | import { inline } from "./inline"; 6 | import { makeGreedy } from "./make-greedy"; 7 | import { mergeWithQuantifier } from "./merge-with-quantifier"; 8 | import { moveUpEmpty } from "./move-up-empty"; 9 | import { nestedQuantifiers } from "./nested-quantifiers"; 10 | import { removeDeadBranches } from "./remove-dead-branches"; 11 | import { removeUnnecessaryAssertions } from "./remove-unnecessary-assertions"; 12 | import { sortAssertions } from 
"./sort-assertions"; 13 | import { unionCharacters } from "./union-characters"; 14 | 15 | /** 16 | * This transformer is a combined transformer with the goal of simplifying the AST as much as possible without 17 | * changing the semantics. 18 | * 19 | * The main purpose of this transformer is to provide a stable API. The specific functionality of individual 20 | * transformers may change over time, and transformers may depend on each other. This transformer will always 21 | * provide the same functionality. Namely, it will always simplify the AST. 22 | * 23 | * As with all transformers, creation option can be provided. Depending on the options, a different set of 24 | * underlying transformers may be used. 25 | * 26 | * @param options 27 | */ 28 | export function simplify(options?: Readonly<CreationOptions>): CombinedTransformer { 29 | return new CombinedTransformer([ 30 | removeUnnecessaryAssertions(options), 31 | applyAssertions(options), 32 | sortAssertions(options), 33 | factorOut(options), 34 | makeGreedy(options), 35 | removeUnnecessaryAssertions(options), 36 | mergeWithQuantifier(options), 37 | moveUpEmpty(options), 38 | nestedQuantifiers(options), 39 | inline(options), 40 | removeDeadBranches(options), 41 | unionCharacters(options), 42 | ]); 43 | } 44 | -------------------------------------------------------------------------------- /.github/workflows/deploy-docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build-and-deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v2.3.1 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Install and Build 18 | run: | 19 | npm ci 20 | npm run build 21 | npm run build:docs 22 | 23 | - name: Deploy dev 24 | uses: JamesIves/github-pages-deploy-action@4.0.0 25 | with: 26 | branch: gh-pages # The branch the action should deploy to. 
27 | folder: docs # The folder the action should deploy. 28 | target-folder: docs/dev # the folder in gh-pages that will be created/cleaned 29 | clean: true # Automatically remove deleted files from the deploy branch 30 | 31 | - name: Find current tag 32 | run: echo "tag=$(git tag --points-at HEAD)" >> $GITHUB_ENV 33 | 34 | - name: Deploy tag 35 | if: ${{ startsWith(env.tag, 'v') }} 36 | uses: JamesIves/github-pages-deploy-action@4.0.0 37 | with: 38 | branch: gh-pages # The branch the action should deploy to. 39 | folder: docs # The folder the action should deploy. 40 | target-folder: docs/${{ env.tag }} # the folder in gh-pages that will be created/cleaned 41 | clean: true # Automatically remove deleted files from the deploy branch 42 | 43 | - name: Deploy latest 44 | if: ${{ startsWith(env.tag, 'v') }} 45 | uses: JamesIves/github-pages-deploy-action@4.0.0 46 | with: 47 | branch: gh-pages # The branch the action should deploy to. 48 | folder: docs # The folder the action should deploy. 49 | target-folder: docs/latest # the folder in gh-pages that will be created/cleaned 50 | clean: true # Automatically remove deleted files from the deploy branch 51 | -------------------------------------------------------------------------------- /src/words/conversion.ts: -------------------------------------------------------------------------------- 1 | import { ReadonlyWord, Word } from "../char-types"; 2 | 3 | /** 4 | * Converts the given array of UTF16 character codes into a string. 5 | * 6 | * All numbers in the given array must be between 0 (inclusive) and 65535 = 0xFFFF (inclusive). 7 | * 8 | * @param word 9 | */ 10 | export function fromUTF16ToString(word: ReadonlyWord): string { 11 | return String.fromCharCode(...word); 12 | } 13 | 14 | /** 15 | * Converts the given array of Unicode code points into a string. 16 | * 17 | * All numbers in the given array must be between 0 (inclusive) and 1114111 = 0x10FFFF (inclusive). 
18 | * 19 | * @param word 20 | */ 21 | export function fromUnicodeToString(word: ReadonlyWord): string { 22 | return String.fromCodePoint(...word); 23 | } 24 | 25 | /** 26 | * Converts the given string into an array of UTF16 character codes. 27 | * 28 | * All numbers in the returned array are guaranteed to be between 0 (inclusive) and 65535 = 0xFFFF (inclusive). 29 | * 30 | * @param string 31 | */ 32 | export function fromStringToUTF16(string: string): Word { 33 | const word: Word = []; 34 | 35 | for (let i = 0, l = string.length; i < l; i++) { 36 | word.push(string.charCodeAt(i)); 37 | } 38 | 39 | return word; 40 | } 41 | 42 | /** 43 | * Converts the given string into an array of Unicode code points. 44 | * 45 | * All numbers in the returned array are guaranteed to be between 0 (inclusive) and 1114111 = 0x10FFFF (inclusive). 46 | * 47 | * @param string 48 | */ 49 | export function fromStringToUnicode(string: string): Word { 50 | // https://stackoverflow.com/a/21409165/7595472 51 | 52 | const word: Word = []; 53 | 54 | for (let i = 0, l = string.length; i < l; i++) { 55 | const c1 = string.charCodeAt(i); 56 | if (c1 >= 0xd800 && c1 < 0xdc00 && i + 1 < l) { 57 | const c2 = string.charCodeAt(i + 1); 58 | if (c2 >= 0xdc00 && c2 < 0xe000) { 59 | word.push(0x10000 + ((c1 - 0xd800) << 10) + (c2 - 0xdc00)); 60 | i++; 61 | continue; 62 | } 63 | } 64 | word.push(c1); 65 | } 66 | 67 | return word; 68 | } 69 | -------------------------------------------------------------------------------- /src/ast/set-parent.ts: -------------------------------------------------------------------------------- 1 | import { NoParent, Node } from "./nodes"; 2 | import { assertNever } from "../util"; 3 | 4 | /** 5 | * Sets the `parent` properties of the given node and all of its child nodes. 6 | * 7 | * @param node 8 | * @param parent The parent of `node`. 
9 | */ 10 | export function setParent<T extends Node>(node: T | NoParent<T>, parent: T["parent"]): asserts node is T { 11 | setParentImpl(node as Node, parent as Node["parent"]); 12 | } 13 | function setParentImpl(node: Node, parent: Node["parent"]): void { 14 | switch (node.type) { 15 | case "Concatenation": 16 | if (parent === null) { 17 | throw new Error("The parent of a concatenation cannot be null."); 18 | } 19 | 20 | switch (parent.type) { 21 | case "Alternation": 22 | case "Assertion": 23 | case "Expression": 24 | case "Quantifier": 25 | node.parent = parent; 26 | break; 27 | 28 | case "Concatenation": 29 | throw new Error("A concatenation cannot be parent of a concatenation."); 30 | 31 | default: 32 | throw assertNever(parent); 33 | } 34 | 35 | node.elements.forEach(e => setParentImpl(e, node)); 36 | break; 37 | 38 | case "Alternation": 39 | case "Assertion": 40 | case "CharacterClass": 41 | case "Quantifier": 42 | case "Unknown": 43 | if (parent === null) { 44 | throw new Error(`The parent of a(n) ${node.type} cannot be null.`); 45 | } 46 | 47 | if (parent.type === "Concatenation") { 48 | node.parent = parent; 49 | } else { 50 | throw new Error(`A(n) ${parent.type} cannot be parent of a(n) ${node.type}.`); 51 | } 52 | 53 | if (node.type !== "CharacterClass" && node.type !== "Unknown") { 54 | node.alternatives.forEach(c => setParentImpl(c, node)); 55 | } 56 | break; 57 | 58 | case "Expression": 59 | if (parent !== null) { 60 | throw new Error(`The parent of an expression has to be null and cannot be a(n) ${parent.type}.`); 61 | } 62 | 63 | node.parent = null; 64 | 65 | node.alternatives.forEach(c => setParentImpl(c, node)); 66 | break; 67 | 68 | default: 69 | throw assertNever(node); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/errors.ts: -------------------------------------------------------------------------------- 1 | import { Char } from "./char-types"; 2 | 3 | /** 4 | * An error that is 
thrown when the max characters of two or more FA or transition iterables is not the same. 5 | * 6 | * Operations on FA and transition iterables require the max characters of all given FA and transition iterables to be 7 | * the same and will throw this error if they are not. 8 | */ 9 | export class MaxCharacterError extends Error { 10 | /** 11 | * Asserts the two given max characters are the same. 12 | * 13 | * @param a 14 | * @param b 15 | * @param kind 16 | */ 17 | static assert(a: Char | { maxCharacter: Char }, b: Char | { maxCharacter: Char }, kind?: string): void { 18 | let left: number | { maxCharacter: number } = a; 19 | let right: number | { maxCharacter: number } = b; 20 | 21 | if (typeof left !== "number") { 22 | left = left.maxCharacter; 23 | } 24 | if (typeof right !== "number") { 25 | right = right.maxCharacter; 26 | } 27 | 28 | if (left !== right) { 29 | if (kind !== undefined) { 30 | throw new MaxCharacterError(`The two ${kind} do not have the same max character. ${left} != ${right}`); 31 | } else { 32 | throw new MaxCharacterError(`Different max character. ${left} != ${right}`); 33 | } 34 | } 35 | } 36 | } 37 | 38 | /** 39 | * An error that is thrown when an operation causes too many nodes to be created. 40 | * 41 | * Many FA operation have the potential to create a huge number of nodes (thousands and millions) which may result in 42 | * the JavaScript runtime running out of memory and/or crashing. This error will thrown before that happens to safely 43 | * abort an otherwise resource-intensive operation. 44 | */ 45 | export class TooManyNodesError extends Error { 46 | /** 47 | * Asserts that the current number of created nodes does not exceed the limit. 
48 | * 49 | * @param current 50 | * @param limit 51 | * @param kind 52 | */ 53 | static assert(current: number, limit: number, kind: string): void { 54 | if (current > limit) { 55 | throw new TooManyNodesError(`The ${kind} is not allowed to create more than ${limit} nodes.`); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/js/create-assertion.ts: -------------------------------------------------------------------------------- 1 | import { Assertion, Element, NoParent } from "../ast"; 2 | import { CharSet } from "../char-set"; 3 | import { assertNever } from "../util"; 4 | import { Flags } from "./flags"; 5 | import { getCharEnv } from "./char-env"; 6 | 7 | export type BoundaryAssertion = WordBoundaryAssertion | TextBoundaryAssertion; 8 | export interface WordBoundaryAssertion { 9 | kind: "word"; 10 | negate: boolean; 11 | } 12 | export interface TextBoundaryAssertion { 13 | kind: "end" | "start"; 14 | } 15 | 16 | export function createAssertion(assertion: Readonly<BoundaryAssertion>, flags: Readonly<Flags>): NoParent<Element> { 17 | const env = getCharEnv(flags); 18 | 19 | switch (assertion.kind) { 20 | case "end": 21 | case "start": { 22 | // /$/m == /(?!.)/ 23 | // /$/ == /(?![^])/ 24 | // /^/m == /(?<!.)/ 25 | // /^/ == /(?<![^])/ 26 | 27 | const charSet: CharSet = flags.multiline ? env.nonLineTerminator : env.all; 28 | return newAssertion(true, assertion.kind === "start" ? 
"behind" : "ahead", charSet); 29 | } 30 | 31 | case "word": { 32 | // /\b/ == /(?:(?<!\w)(?=\w)|(?<=\w)(?!\w))/ 33 | // /\B/ == /(?:(?<=\w)(?=\w)|(?<!\w)(?!\w))/ 34 | 35 | return { 36 | type: "Alternation", 37 | alternatives: [ 38 | { 39 | type: "Concatenation", 40 | elements: [ 41 | newAssertion(!assertion.negate, "behind", env.word), 42 | newAssertion(false, "ahead", env.word), 43 | ], 44 | }, 45 | { 46 | type: "Concatenation", 47 | elements: [ 48 | newAssertion(assertion.negate, "behind", env.word), 49 | newAssertion(true, "ahead", env.word), 50 | ], 51 | }, 52 | ], 53 | }; 54 | } 55 | 56 | default: 57 | throw assertNever(assertion, "Unknown assertion type"); 58 | } 59 | } 60 | 61 | function newAssertion(negate: boolean, kind: "ahead" | "behind", characters: CharSet): NoParent<Assertion> { 62 | return { 63 | type: "Assertion", 64 | negate, 65 | kind, 66 | alternatives: [ 67 | { 68 | type: "Concatenation", 69 | elements: [ 70 | { 71 | type: "CharacterClass", 72 | characters, 73 | }, 74 | ], 75 | }, 76 | ], 77 | }; 78 | } 79 | -------------------------------------------------------------------------------- /src/ast/nodes.ts: -------------------------------------------------------------------------------- 1 | import { CharSet } from "../char-set"; 2 | 3 | export interface SourceLocation { 4 | start: number; 5 | end: number; 6 | } 7 | 8 | interface NodeBase { 9 | type: Node["type"]; 10 | parent: Node["parent"]; 11 | source?: SourceLocation; 12 | } 13 | 14 | export type Element = CharacterClass | Alternation | Quantifier | Assertion | Unknown; 15 | export type Parent = Expression | Alternation | Quantifier | Assertion; 16 | export type Node = Expression | CharacterClass | Alternation | Quantifier | Assertion | Concatenation | Unknown; 17 | 18 | export interface Alternation extends NodeBase { 19 | type: "Alternation"; 20 | parent: Concatenation; 21 | alternatives: Concatenation[]; 22 | } 23 | 24 | export interface Assertion extends NodeBase { 25 | type: "Assertion"; 
26 | parent: Concatenation; 27 | alternatives: Concatenation[]; 28 | kind: "ahead" | "behind"; 29 | negate: boolean; 30 | } 31 | 32 | export interface Quantifier extends NodeBase { 33 | type: "Quantifier"; 34 | parent: Concatenation; 35 | alternatives: Concatenation[]; 36 | lazy: boolean; 37 | min: number; 38 | max: number; 39 | } 40 | 41 | export interface CharacterClass extends NodeBase { 42 | type: "CharacterClass"; 43 | parent: Concatenation; 44 | characters: CharSet; 45 | } 46 | 47 | export interface Unknown extends NodeBase { 48 | type: "Unknown"; 49 | parent: Concatenation; 50 | id: string; 51 | } 52 | 53 | export interface Expression extends NodeBase { 54 | type: "Expression"; 55 | parent: null; 56 | alternatives: Concatenation[]; 57 | } 58 | 59 | export interface Concatenation extends NodeBase { 60 | type: "Concatenation"; 61 | parent: Parent; 62 | elements: Element[]; 63 | } 64 | 65 | type NodeIdent = { type: Node["type"] }; 66 | 67 | type NoParentArray<T> = { [K in keyof T]: NoParent<T[K]> }; 68 | type NoParentNode<T extends NodeIdent> = { [K in keyof NoParentNodePick<T>]: NoParent<NoParentNodePick<T>[K]> }; 69 | type NoParentNodePick<T extends NodeIdent> = Pick<T, Exclude<keyof T, "parent">>; 70 | /** 71 | * A view of an AST node that hides the `parent` property. 72 | */ 73 | export type NoParent<T> = T extends NodeIdent ? NoParentNode<T> : T extends unknown[] ? 
NoParentArray<T> : T; 74 | -------------------------------------------------------------------------------- /src/words/readable.ts: -------------------------------------------------------------------------------- 1 | import { CharRange, CharSet } from "../char-set"; 2 | import { Char, Word } from "../char-types"; 3 | import { ReadonlyWordSet } from "../word-set"; 4 | 5 | const READABILITY_ASCII_PRIORITY: readonly CharRange[] = [ 6 | // A-Z 7 | { min: 0x41, max: 0x5a }, 8 | // a-z 9 | { min: 0x61, max: 0x7a }, 10 | // 0-9 11 | { min: 0x30, max: 0x39 }, 12 | // - 13 | { min: 0x2d, max: 0x2d }, 14 | // _ 15 | { min: 0x5f, max: 0x5f }, 16 | // space 17 | { min: 0x20, max: 0x20 }, 18 | // printable ASCII 19 | { min: 0x20, max: 0x7e }, 20 | // tab 21 | { min: 0x09, max: 0x09 }, 22 | // \n 23 | { min: 0x0a, max: 0x0a }, 24 | // \r 25 | { min: 0x0d, max: 0x0d }, 26 | ]; 27 | 28 | /** 29 | * Returns the most humanly readable character in the given character set. Which character is picked is entirely 30 | * implementation-defined but, generally, word characters will be picked over non-word characters and printable 31 | * characters will be picked over non-printable characters. 32 | * 33 | * If the given character set is empty, `undefined` will be returned. 34 | * 35 | * @param set 36 | */ 37 | export function pickMostReadableCharacter(set: CharSet): Char | undefined { 38 | if (set.ranges.length === 0) { 39 | // empty 40 | return undefined; 41 | } else if (set.ranges.length === 1) { 42 | const { min, max } = set.ranges[0]; 43 | if (min === max) { 44 | // we don't have any choice 45 | return min; 46 | } 47 | } 48 | 49 | // search in ASCII range 50 | for (const range of READABILITY_ASCII_PRIORITY) { 51 | const c = set.commonCharacter(range); 52 | if (c !== undefined) { 53 | return c; 54 | } 55 | } 56 | 57 | // choose any character 58 | return set.ranges[0].min; 59 | } 60 | 61 | /** 62 | * Returns a word of the given word set that is the most humanly readable. 
 *
 * @param wordSet
 */
export function pickMostReadableWord(wordSet: ReadonlyWordSet): Word {
	const word: Word = [];
	for (const set of wordSet) {
		// pick one representative character per character set
		const c = pickMostReadableCharacter(set);
		if (c === undefined) {
			// an empty char set would mean the word set denotes zero words
			throw new Error("Word sets are not allowed to contain empty character sets.");
		} else {
			word.push(c);
		}
	}
	return word;
}
-------------------------------------------------------------------------------- /src/words/word-set.ts:
import { CharSet } from "../char-set";
import { Word } from "../char-types";
import { concatSequences } from "../util";
import { ReadonlyWordSet } from "../word-set";

// Yields all words of the given word set, i.e. the cross product of its
// character sets.
function wordSetToWordsImpl(wordSet: ReadonlyWordSet): Iterable<Word> {
	if (wordSet.length === 0) {
		// simple base case
		return [[]];
	} else if (wordSet.length === 1) {
		// This is about twice as fast as calling `concatSequences`.
		return charSetToWords(wordSet[0]);
	}

	// The overhead of `concatSequences` can be **really** high for single-character char sets.
	// So we will try to find and subtract a non-empty suffix of single-character char sets.
	const suffix: Word = [];
	for (let i = wordSet.length - 1; i >= 0; i--) {
		const ranges = wordSet[i].ranges;
		if (ranges.length === 1 && ranges[0].min === ranges[0].max) {
			// a single-character char set contributes exactly one character
			suffix.push(ranges[0].min);
		} else {
			break;
		}
	}

	if (suffix.length > 0) {
		// characters were collected right-to-left; restore their order
		suffix.reverse();
		if (suffix.length === wordSet.length) {
			// the whole word set consists of single-character sets => exactly one word
			return [suffix];
		}

		// recurse on the prefix only; the fixed suffix is appended to each word below
		wordSet = wordSet.slice(0, wordSet.length - suffix.length);

		return (function* (): Iterable<Word> {
			for (const word of wordSetToWordsImpl(wordSet)) {
				// NOTE(review): mutates the yielded word in place. This assumes the
				// recursive call always yields fresh arrays — true for the base cases
				// here; TODO confirm for `concatSequences`.
				word.push(...suffix);
				yield word;
			}
		})();
	}

	return concatSequences(wordSet.map(cs => cs.characters()));
}
// Yields every character of the set as a one-character word.
function* charSetToWords(set: CharSet): Iterable<Word> {
	for (const c of set.characters()) {
		yield [c];
	}
}

/**
 * Returns an iterable yielding all words that can be constructed from the given word sets.
 *
 * @param wordSets
 */
export function* wordSetsToWords(wordSets: Iterable<ReadonlyWordSet>): Iterable<Word> {
	for (const wordSet of wordSets) {
		yield* wordSetToWordsImpl(wordSet);
	}
}

/**
 * Returns an iterable yielding all words that can be constructed from the given word set.
 *
 * @param wordSet
 * @deprecated Use {@link wordSetsToWords} instead.
67 | */ 68 | export function wordSetToWords(wordSet: ReadonlyWordSet): Iterable<Word> { 69 | return wordSetToWordsImpl(wordSet); 70 | } 71 | -------------------------------------------------------------------------------- /scripts/debug.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-unused-vars */ 2 | import { 3 | CharSet, 4 | CharacterClass, 5 | DFA, 6 | ENFA, 7 | Expression, 8 | FiniteAutomaton, 9 | JS, 10 | NFA, 11 | NoParent, 12 | Transformers, 13 | Words, 14 | combineTransformers, 15 | transform, 16 | } from "../src"; 17 | import { performance } from "perf_hooks"; 18 | import { logDurations } from "./util"; 19 | 20 | // util functions 21 | function toNFA(literal: JS.Literal): NFA { 22 | const parser = JS.Parser.fromLiteral(literal); 23 | const { expression, maxCharacter } = parser.parse(); 24 | return NFA.fromRegex(expression, { maxCharacter }, { assertions: "disable" }); 25 | } 26 | function toENFA(literal: JS.Literal): ENFA { 27 | const parser = JS.Parser.fromLiteral(literal); 28 | const { expression, maxCharacter } = parser.parse(); 29 | return ENFA.fromRegex(expression, { maxCharacter }, { assertions: "disable" }); 30 | } 31 | const toDFA = (literal: JS.Literal): DFA => DFA.fromFA(toNFA(literal)); 32 | function toCharSet(literal: JS.Literal): CharSet { 33 | const parser = JS.Parser.fromLiteral(literal); 34 | const { expression } = parser.parse(); 35 | return (expression.alternatives[0].elements[0] as CharacterClass).characters; 36 | } 37 | function toRegExp(value: FiniteAutomaton | CharSet | NoParent<Expression>): RegExp { 38 | let literal; 39 | if (value instanceof CharSet) { 40 | literal = JS.toLiteral({ type: "Concatenation", elements: [{ type: "CharacterClass", characters: value }] }); 41 | } else if ("toRegex" in value) { 42 | literal = JS.toLiteral(value.toRegex()); 43 | } else { 44 | literal = JS.toLiteral(value); 45 | } 46 | return RegExp(literal.source, 
literal.flags); 47 | } 48 | function measure<T>(fn: () => T, samples: number = 1, label?: string): T { 49 | const durations: number[] = []; 50 | let result: T; 51 | 52 | do { 53 | const start = performance.now(); 54 | result = fn(); 55 | durations.push(performance.now() - start); 56 | } while (--samples > 0); 57 | 58 | logDurations(durations, label ?? fn.toString().replace(/^\(\) => /, "")); 59 | 60 | return result; 61 | } 62 | 63 | // actual debug code 64 | // DO NOT commit changes to this file 65 | 66 | const dfa = toDFA(/a+(?:b+a+)*/); 67 | dfa.minimize(); 68 | console.log(toRegExp(dfa)); 69 | 70 | console.log(toENFA(/a*b/).toString()); 71 | console.log(toENFA(/a*?b/).toString()); 72 | -------------------------------------------------------------------------------- /src/transformers/sort-assertions.ts: -------------------------------------------------------------------------------- 1 | import { Assertion, NoParent, Transformer } from "../ast"; 2 | import { CreationOptions } from "./creation-options"; 3 | 4 | function compare(a: NoParent<Assertion>, b: NoParent<Assertion>): number { 5 | if (a.kind !== b.kind) { 6 | return a.kind === "ahead" ? 1 : -1; 7 | } else { 8 | return 0; 9 | } 10 | } 11 | /** 12 | * A simple implementation of a stable sorting method - selection sort. 13 | * 14 | * Returns whether the given array was changed. 
15 | * 16 | * @param array 17 | * @param compareFn 18 | */ 19 | function stableSort<T>(array: T[], compareFn: (a: T, b: T) => number): boolean { 20 | const n = array.length; 21 | let changed = false; 22 | 23 | for (let i = 0; i < n - 1; i++) { 24 | let minIndex = i; 25 | for (let j = i + 1; j < n; j++) { 26 | if (compareFn(array[j], array[minIndex]) < 0) { 27 | minIndex = j; 28 | } 29 | } 30 | if (minIndex !== i) { 31 | changed = true; 32 | [array[minIndex], array[i]] = [array[i], array[minIndex]]; 33 | } 34 | } 35 | 36 | return changed; 37 | } 38 | 39 | /** 40 | * Sorts adjacent assertions such that lookbehinds are always to the right of lookaheads. 41 | * 42 | * This is operation may be necessary for other transformers to pick up on certain patterns. 43 | * 44 | * E.g. `(?=a)(?!b)(?<!c)(?<=d)` => `(?<!c)(?<=d)(?=a)(?!b)` 45 | * 46 | * @param _options 47 | */ 48 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 49 | export function sortAssertions(_options?: Readonly<CreationOptions>): Transformer { 50 | return { 51 | name: "sortAssertions", 52 | onConcatenation(node, { signalMutation }) { 53 | let runStart = 0; 54 | let runLength = 0; 55 | function sort(): void { 56 | const slice = node.elements.slice(runStart, runStart + runLength) as NoParent<Assertion>[]; 57 | if (stableSort(slice, compare)) { 58 | node.elements.splice(runStart, runLength, ...slice); 59 | signalMutation(); 60 | } 61 | } 62 | for (let i = 0; i < node.elements.length; i++) { 63 | const current = node.elements[i]; 64 | if (current.type === "Assertion") { 65 | if (runLength === 0) { 66 | runStart = i; 67 | } 68 | runLength++; 69 | } else { 70 | if (runLength > 1) { 71 | sort(); 72 | } 73 | runLength = 0; 74 | } 75 | } 76 | if (runLength > 1) { 77 | sort(); 78 | } 79 | }, 80 | }; 81 | } 82 | -------------------------------------------------------------------------------- /tests/helper/word-test-data.ts: -------------------------------------------------------------------------------- 1 
| import { assert } from "chai"; 2 | import { FiniteAutomaton } from "../../src/fa-types"; 3 | import { Literal } from "../../src/js"; 4 | import { fromStringToUTF16, fromUTF16ToString, fromUnicodeToString } from "../../src/words"; 5 | 6 | export interface WordTestCaseData { 7 | accept: number[][]; 8 | reject: number[][]; 9 | } 10 | export interface WordTestCase extends WordTestCaseData { 11 | literal: Literal; 12 | } 13 | 14 | export const wordTestData: WordTestCase[] = [ 15 | { 16 | literal: /abc/, 17 | accept: ["abc"].map(fromStringToUTF16), 18 | reject: ["", "a", "ab", "abca", "aabc", "ABC"].map(fromStringToUTF16), 19 | }, 20 | { 21 | literal: /ab+c/, 22 | accept: ["abc", "abbbbbbbbbbbbbbbbbbbbbbbbbbbc"].map(fromStringToUTF16), 23 | reject: ["", "a", "ab", "abca", "aabc", "ABC"].map(fromStringToUTF16), 24 | }, 25 | { 26 | literal: /a*b*c*/, 27 | accept: ["", "a", "b", "c", "aaaaaaa", "bbbbbbb", "ccccccc", "aaabbbbccc", "bccc", "aaaacccc"].map( 28 | fromStringToUTF16 29 | ), 30 | reject: ["abca", "ABC", "cba", "d", "dd"].map(fromStringToUTF16), 31 | }, 32 | { 33 | literal: /\w+\d+/, 34 | accept: ["abcabcabc000", "123", "00", "a0", "aaaaa00000", "a0a0a0a0a0a0a0a0a0a0a0"].map(fromStringToUTF16), 35 | reject: ["", "a", "0", "abc", "123a", "D"].map(fromStringToUTF16), 36 | }, 37 | ]; 38 | 39 | export function testWordTestCases(fa: FiniteAutomaton, data: WordTestCaseData): void { 40 | type Outcome = "accepted" | "rejected"; 41 | function fail(word: number[], expected: Outcome, actual: Outcome): void { 42 | let unicodeStr = "<error>"; 43 | try { 44 | unicodeStr = JSON.stringify(fromUnicodeToString(word)); 45 | } catch (e) { 46 | /* noop */ 47 | } 48 | 49 | let utf16Str = "<error>"; 50 | try { 51 | utf16Str = JSON.stringify(fromUTF16ToString(word)); 52 | } catch (e) { 53 | /* noop */ 54 | } 55 | 56 | assert.fail( 57 | `Expected word to be ${expected} but it was ${actual} instead.` + 58 | "\nWord: " + 59 | JSON.stringify(word) + 60 | "\nWord (Unicode): " + 61 | 
unicodeStr + 62 | "\nWord (UTF16): " + 63 | utf16Str 64 | ); 65 | } 66 | 67 | for (const word of data.accept) { 68 | if (!fa.test(word)) { 69 | fail(word, "accepted", "rejected"); 70 | } 71 | } 72 | for (const word of data.reject) { 73 | if (fa.test(word)) { 74 | fail(word, "rejected", "accepted"); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /tests/helper/fa.ts: -------------------------------------------------------------------------------- 1 | import { DFA } from "../../src/dfa"; 2 | import { NFA, ReadonlyNFA } from "../../src/nfa"; 3 | import { ENFA } from "../../src/enfa"; 4 | import { Literal, Parser } from "../../src/js"; 5 | import * as Iter from "../../src/iter"; 6 | import { TransitionIterable } from "../../src/fa-types"; 7 | 8 | export function literalToString(literal: Literal | string): string { 9 | if (typeof literal === "string") { 10 | return literal; 11 | } 12 | return `/${literal.source}/${literal.flags}`; 13 | } 14 | 15 | export function literalToDFA(literal: Literal): DFA { 16 | const parsed = Parser.fromLiteral(literal).parse(); 17 | return DFA.fromFA(NFA.fromRegex(parsed.expression, { maxCharacter: parsed.maxCharacter })); 18 | } 19 | 20 | export function literalToNFA(literal: Literal): NFA { 21 | const parsed = Parser.fromLiteral(literal).parse(); 22 | return NFA.fromRegex(parsed.expression, { maxCharacter: parsed.maxCharacter }); 23 | } 24 | 25 | export function literalToENFA(literal: Literal): ENFA { 26 | const parsed = Parser.fromLiteral(literal).parse(); 27 | return ENFA.fromRegex(parsed.expression, { maxCharacter: parsed.maxCharacter }); 28 | } 29 | 30 | export function removeIndentation(expected: string): string { 31 | // remove trailing spaces and initial line breaks 32 | expected = expected.replace(/^[\r\n]+|\s+$/g, ""); 33 | 34 | const lines = expected.split(/\r\n?|\n/g); 35 | const indentation = /^[ \t]*/.exec(lines[0])![0]; 36 | 37 | if (indentation) { 38 | for (let i = 
0; i < lines.length; i++) { 39 | let line = lines[i]; 40 | if (line.startsWith(indentation)) { 41 | line = line.substr(indentation.length); 42 | } 43 | lines[i] = line; 44 | } 45 | } 46 | 47 | return lines.join("\n"); 48 | } 49 | 50 | export function reachableFinalStates(nfa: ReadonlyNFA): number { 51 | const iter = Iter.iterateStates({ 52 | initial: nfa.initial, 53 | getOut(node) { 54 | return node.out.keys(); 55 | }, 56 | isFinal(node) { 57 | return nfa.finals.has(node); 58 | }, 59 | }); 60 | 61 | let count = 0; 62 | for (const final of iter) { 63 | if (nfa.finals.has(final)) { 64 | count++; 65 | } 66 | } 67 | return count; 68 | } 69 | 70 | export function faEqual<A, B>(a: TransitionIterable<A>, b: TransitionIterable<B>): boolean { 71 | const dfaA = DFA.fromFA(a); 72 | const dfaB = DFA.fromFA(b); 73 | dfaA.minimize(); 74 | dfaB.minimize(); 75 | return dfaA.structurallyEqual(dfaB); 76 | } 77 | -------------------------------------------------------------------------------- /src/iter/to-string.ts: -------------------------------------------------------------------------------- 1 | import { FAIterator } from "../fa-types"; 2 | import { iterToArray } from "../util"; 3 | import { ensureStableOut, iterateStates, mapOut, mapOutIter } from "./iterator"; 4 | 5 | /** 6 | * Returns a human readable string representation of the given FA. The FA has to have exactly one initial state. 7 | * 8 | * All states will be labeled with numbers. The initial state will **always** has the number 0. Each state will be 9 | * mapped to its outgoing states. The outgoing states may contain duplicates and are sorted alphabetically by their 10 | * transition string. The number of states will be surrounded by brackets - square brackets for final states and round 11 | * brackets for non-final states. 12 | * 13 | * A conversion function for the transitions may optionally be given. If no transition function is given, the native 14 | * `String` function will be used. 
15 | * 16 | * --- 17 | * 18 | * Example output for an NFA of `a*d|bb*` 19 | * 20 | * ```text 21 | * (0) -> (1) : 'a' 22 | * -> [2] : 'b' 23 | * -> [3] : 'd' 24 | * 25 | * (1) -> [3] : 'd' 26 | * 27 | * [2] -> [2] : 'b' 28 | * 29 | * [3] -> none 30 | * ``` 31 | * 32 | * @param iter 33 | * @param toString 34 | * @param ordered 35 | */ 36 | export function toString<S, T>( 37 | iter: FAIterator<S, Iterable<[S, T]>>, 38 | toString: (value: T) => string = String, 39 | ordered: boolean = false 40 | ): string { 41 | const stableIter = ensureStableOut( 42 | mapOut(iter, out => { 43 | const mapped = iterToArray(out).map<[S, string]>(([k, v]) => [k, toString(v)]); 44 | if (!ordered) { 45 | mapped.sort(([, a], [, b]) => a.localeCompare(b)); 46 | } 47 | return mapped; 48 | }) 49 | ); 50 | 51 | // get all states 52 | const states: S[] = [...iterateStates(mapOutIter(stableIter, ([s]) => s))]; 53 | 54 | const index = new Map<S, number>(states.map((s, i) => [s, i])); 55 | const indexOf = (state: S): number => { 56 | return index.get(state)!; 57 | }; 58 | const labelOf = (state: S): string => { 59 | const index = indexOf(state); 60 | return stableIter.isFinal(state) ? `[${index}]` : `(${index})`; 61 | }; 62 | 63 | return states 64 | .map(state => { 65 | const label = labelOf(state); 66 | const out = stableIter.getOut(state); 67 | if (!ordered) { 68 | out.sort(([s1], [s2]) => indexOf(s1) - indexOf(s2)); 69 | } 70 | 71 | if (out.length === 0) { 72 | return `${label} -> none`; 73 | } else { 74 | const spaces = " ".repeat(label.length); 75 | return out 76 | .map(([s, t], i) => { 77 | return `${i ? 
spaces : label} -> ${labelOf(s)} : ${t}`; 78 | }) 79 | .join("\n"); 80 | } 81 | }) 82 | .join("\n\n"); 83 | } 84 | -------------------------------------------------------------------------------- /tests/helper/literal-to-string.ts: -------------------------------------------------------------------------------- 1 | import { Concatenation, Element, NoParent, Node } from "../../src/ast"; 2 | import { assertNever } from "../../src/util"; 3 | 4 | function toPatternConcatenation(concat: NoParent<Concatenation>): string { 5 | let s = ""; 6 | const elements = concat.elements; 7 | for (let i = 0, l = elements.length; i < l; i++) { 8 | s += toPatternElement(elements[i]); 9 | } 10 | return s; 11 | } 12 | function toPatternElement(element: NoParent<Element>): string { 13 | switch (element.type) { 14 | case "Alternation": { 15 | return "(?:" + toPatternAlternatives(element.alternatives) + ")"; 16 | } 17 | case "Assertion": { 18 | const kind = element.kind === "ahead" ? "" : "<"; 19 | const negate = element.negate ? "!" 
: "="; 20 | return `(?${kind}${negate}${toPatternAlternatives(element.alternatives)})`; 21 | } 22 | case "CharacterClass": { 23 | return `[${element.characters.toRangesString()}]`; 24 | } 25 | case "Quantifier": { 26 | let quant: string; 27 | if (element.max === Infinity) { 28 | if (element.min === 0) { 29 | quant = "*"; 30 | } else if (element.min === 1) { 31 | quant = "+"; 32 | } else { 33 | quant = `{${element.min},}`; 34 | } 35 | } else if (element.max === 1) { 36 | if (element.min === 0) { 37 | quant = "?"; 38 | } /* if (element.min === 1) */ else { 39 | quant = "{1}"; 40 | } 41 | } else if (element.min === element.max) { 42 | quant = `{${element.min}}`; 43 | } else { 44 | quant = `{${element.min},${element.max}}`; 45 | } 46 | if (element.lazy) { 47 | quant += "?"; 48 | } 49 | 50 | let content: string; 51 | if ( 52 | element.alternatives.length === 1 && 53 | element.alternatives[0].elements.length === 1 && 54 | element.alternatives[0].elements[0].type === "CharacterClass" 55 | ) { 56 | content = toPatternConcatenation(element.alternatives[0]); 57 | } else { 58 | content = "(?:" + toPatternAlternatives(element.alternatives) + ")"; 59 | } 60 | 61 | if (!content) { 62 | content = "(?:)"; 63 | } 64 | 65 | return content + quant; 66 | } 67 | case "Unknown": { 68 | return `[]Unknown:${element.id}[]`; 69 | } 70 | default: 71 | throw assertNever(element, "Invalid element"); 72 | } 73 | } 74 | function toPatternAlternatives(expressions: readonly NoParent<Concatenation>[]): string { 75 | if (expressions.length === 0) { 76 | return "[]"; 77 | } else { 78 | return expressions.map(toPatternConcatenation).join("|"); 79 | } 80 | } 81 | 82 | export function toPatternString(node: NoParent<Node>): string { 83 | switch (node.type) { 84 | case "Expression": 85 | return toPatternAlternatives(node.alternatives); 86 | case "Concatenation": 87 | return toPatternConcatenation(node); 88 | default: 89 | return toPatternElement(node); 90 | } 91 | } 92 | 
-------------------------------------------------------------------------------- /scripts/create-unicode.ts:
import * as fs from "fs";
import * as path from "path";
import { CharSet } from "../src/char-set";
import * as aliases from "../src/js/unicode/alias";
import { printRanges } from "./util";

const UNICODE_SRC_DIR = path.join(__dirname, "../src/js/unicode");

createDataFile(Object.values(aliases.Binary_Property), "Binary_Property", "binary-property-data.ts");
createDataFile(Object.values(aliases.General_Category), "General_Category", "general-category-data.ts");
createDataFile(Object.values(aliases.ScriptAndScript_Extensions), "Script", "script-data.ts");
createDataFile(Object.values(aliases.ScriptAndScript_Extensions), "Script_Extensions", "script-extensions-data.ts");

createDataStringsFile(Object.values(aliases.Binary_Property_Of_String), "properties-of-strings.ts");

/**
 * Generates a data file exporting one `readonly CharRange[]` constant per property of the given Unicode category.
 *
 * @param properties The property names to export; duplicates are ignored.
 * @param category The `@unicode/unicode-15.0.0` category directory to read code points from.
 * @param filename Name of the generated file inside src/js/unicode.
 */
function createDataFile(properties: Iterable<string>, category: string, filename: string): void {
	// Fix: `$(unknown)` was printed literally; template interpolation is `${...}`.
	console.log(`Creating ${filename}`);

	const values = new Set(properties);

	let code = `/* eslint-disable */

// DO NOT EDIT!
// THIS FILE IS GENERATED BY scripts/create-unicode.js

// Category: ${category}
// Exported ranges: ${[...values].join(", ")}

import { CharRange } from "../../char-set";


`;

	for (const prop of values) {
		const codePoints: number[] = require(`@unicode/unicode-15.0.0/${category}/${prop}/code-points`);
		// 0x10ffff: full Unicode code point range
		const ranges = CharSet.fromCharacters(0x10ffff, codePoints).ranges;

		code += `export const ${prop}: readonly CharRange[] = ${printRanges(ranges)};\n`;
	}

	fs.writeFileSync(path.join(UNICODE_SRC_DIR, filename), code, "utf-8");
}

/**
 * Generates a data file exporting one `readonly ReadonlyWord[]` constant per sequence property
 * (properties of strings). Strings are sorted by length, then lexicographically by code point.
 *
 * @param properties The property names to export; duplicates are ignored.
 * @param filename Name of the generated file inside src/js/unicode.
 */
function createDataStringsFile(properties: Iterable<string>, filename: string): void {
	// Fix: `$(unknown)` was printed literally; template interpolation is `${...}`.
	console.log(`Creating ${filename}`);

	const values = new Set(properties);

	let code = `/* eslint-disable */

// DO NOT EDIT!
// THIS FILE IS GENERATED BY scripts/create-unicode.js

// Exported strings: ${[...values].join(", ")}

import { ReadonlyWord } from "../../char-types";


`;

	for (const prop of values) {
		const strings: string[] = require(`@unicode/unicode-15.0.0/Sequence_Property/${prop}/index.js`);

		const codePoints = strings.map(s => [...s].map(c => c.codePointAt(0)!));
		// sort by length first, then lexicographically by code point
		codePoints.sort((a, b) => {
			if (a.length !== b.length) {
				return a.length - b.length;
			}

			for (let i = 0; i < a.length; i++) {
				if (a[i] !== b[i]) {
					return a[i] - b[i];
				}
			}
			return 0;
		});

		code += `export const ${prop}: readonly ReadonlyWord[] = JSON.parse('${JSON.stringify(codePoints)}');\n`;
	}

	fs.writeFileSync(path.join(UNICODE_SRC_DIR, filename), code, "utf-8");
}
-------------------------------------------------------------------------------- /tests/char-map.ts:
import { assert } from "chai";
import { CharMap } from
"../src/char-map"; 3 | import { CharRange } from "../src/char-set"; 4 | 5 | describe("CharMap", function () { 6 | it("single element operations", function () { 7 | const map = new CharMap<string>(); 8 | map.set(1, "a"); 9 | map.set(2, "b"); 10 | map.set(3, "a"); 11 | map.set(2, "c"); 12 | map.set(3, "d"); 13 | 14 | assert.isTrue(map.delete(1)); 15 | assert.isFalse(map.delete(1)); 16 | assert.isFalse(map.delete(100)); 17 | 18 | map.set(12, "f"); 19 | 20 | assert.strictEqual(map.has(0), false); 21 | assert.strictEqual(map.has(1), false); 22 | assert.strictEqual(map.has(2), true); 23 | assert.strictEqual(map.has(3), true); 24 | assert.strictEqual(map.has(4), false); 25 | 26 | assert.strictEqual(map.has(2.5), false); 27 | assert.strictEqual(map.has(100), false); 28 | 29 | assert.strictEqual(map.get(0), undefined); 30 | assert.strictEqual(map.get(1), undefined); 31 | assert.strictEqual(map.get(2), "c"); 32 | assert.strictEqual(map.get(3), "d"); 33 | assert.strictEqual(map.get(4), undefined); 34 | 35 | assert.strictEqual(map.get(2.5), undefined); 36 | assert.strictEqual(map.get(100), undefined); 37 | 38 | assert.strictEqual(map.delete(2.5), false); 39 | assert.strictEqual(map.delete(100), false); 40 | 41 | assertEqual(map, [ 42 | [{ min: 2, max: 2 }, "c"], 43 | [{ min: 3, max: 3 }, "d"], 44 | [{ min: 12, max: 12 }, "f"], 45 | ]); 46 | 47 | map.filter(value => value !== "d"); 48 | 49 | assertEqual(map, [ 50 | [{ min: 2, max: 2 }, "c"], 51 | [{ min: 12, max: 12 }, "f"], 52 | ]); 53 | }); 54 | 55 | it("range operations", function () { 56 | const map = new CharMap<string>(); 57 | map.setRange({ min: 0, max: 1 }, "a"); 58 | map.setRange({ min: 4, max: 16 }, "b"); 59 | 60 | map.deleteRange({ min: 2, max: 3 }); 61 | 62 | assertEqual(map, [ 63 | [{ min: 0, max: 1 }, "a"], 64 | [{ min: 4, max: 16 }, "b"], 65 | ]); 66 | 67 | map.deleteRange({ min: 1, max: 4 }); 68 | 69 | assertEqual(map, [ 70 | [{ min: 0, max: 0 }, "a"], 71 | [{ min: 5, max: 16 }, "b"], 72 | ]); 73 | 74 | 
map.clear(); 75 | 76 | assertEqual(map, []); 77 | }); 78 | 79 | it("merge adjacent", function () { 80 | const map = new CharMap<number>(); 81 | 82 | map.set(1, 0); 83 | map.set(2, 0); 84 | 85 | assertEqual(map, [[{ min: 1, max: 2 }, 0]]); 86 | 87 | map.setRange({ min: 4, max: 9 }, 0); 88 | 89 | assertEqual(map, [ 90 | [{ min: 1, max: 2 }, 0], 91 | [{ min: 4, max: 9 }, 0], 92 | ]); 93 | 94 | map.set(3, 0); 95 | 96 | assertEqual(map, [[{ min: 1, max: 9 }, 0]]); 97 | }); 98 | 99 | function assertEqual<T>(charMap: CharMap<T>, expected: Iterable<[CharRange, T]>): void { 100 | assert.deepEqual([...charMap], [...expected]); 101 | } 102 | }); 103 | -------------------------------------------------------------------------------- /src/transformers/make-greedy.ts: -------------------------------------------------------------------------------- 1 | import { NoParent, Node, Quantifier, TransformContext, Transformer, visitAst } from "../ast"; 2 | import { 3 | MatchingDirection, 4 | getFirstCharAfter, 5 | getFirstCharConsumedBy, 6 | stackPath, 7 | toMatchingDirection, 8 | } from "../ast-analysis"; 9 | import { CreationOptions } from "./creation-options"; 10 | 11 | function tryMakeGreedy( 12 | quant: NoParent<Quantifier>, 13 | parents: readonly NoParent<Node>[], 14 | context: TransformContext 15 | ): void { 16 | if (!quant.lazy) { 17 | return; 18 | } 19 | 20 | let direction: MatchingDirection = "ltr"; 21 | for (let i = parents.length - 1; i >= 0; i--) { 22 | const p = parents[i]; 23 | if (p.type === "Assertion") { 24 | direction = toMatchingDirection(p.kind); 25 | break; 26 | } 27 | } 28 | 29 | const consumed = getFirstCharConsumedBy(quant.alternatives, direction, context.maxCharacter); 30 | if (consumed.empty) { 31 | return; 32 | } 33 | 34 | const after = getFirstCharAfter(stackPath(parents, quant), direction, context.maxCharacter); 35 | 36 | if (after.char.isDisjointWith(consumed.char)) { 37 | context.signalMutation(); 38 | quant.lazy = false; 39 | } 40 | } 41 | /** 42 | * This 
transformer will try to make quantifiers greedy whenever possible. 43 | * 44 | * Note: If `ignoreOrder` is `true`, then quantifiers will always be made greedy. 45 | * 46 | * @param options 47 | */ 48 | export function makeGreedy(options?: Readonly<CreationOptions>): Transformer { 49 | const { ignoreOrder = false } = options ?? {}; 50 | 51 | if (ignoreOrder) { 52 | return { 53 | name: "makeGreedy", 54 | onQuantifier(node, { signalMutation }) { 55 | if (node.lazy) { 56 | signalMutation(); 57 | node.lazy = false; 58 | } 59 | }, 60 | }; 61 | } 62 | 63 | // we can safely ignore the options as order and ambiguity are guaranteed to be preserved 64 | return { 65 | name: "makeGreedy", 66 | 67 | onQuantifier(node, { signalMutation }) { 68 | if (node.lazy && node.min === node.max) { 69 | signalMutation(); 70 | node.lazy = false; 71 | } 72 | }, 73 | 74 | onExpression(node, context) { 75 | const stack: NoParent<Node>[] = []; 76 | function enter(node: never): void { 77 | stack.push(node); 78 | } 79 | function leave(): void { 80 | stack.pop(); 81 | } 82 | 83 | visitAst(node, { 84 | onAlternationEnter: enter, 85 | onAssertionEnter: enter, 86 | onConcatenationEnter: enter, 87 | onExpressionEnter: enter, 88 | onQuantifierEnter: enter, 89 | 90 | onAlternationLeave: leave, 91 | onAssertionLeave: leave, 92 | onConcatenationLeave: leave, 93 | onExpressionLeave: leave, 94 | onQuantifierLeave(quant) { 95 | leave(); 96 | 97 | tryMakeGreedy(quant, stack, context); 98 | }, 99 | }); 100 | }, 101 | }; 102 | } 103 | -------------------------------------------------------------------------------- /src/ast/visit.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Alternation, 3 | Assertion, 4 | CharacterClass, 5 | Concatenation, 6 | Expression, 7 | NoParent, 8 | Node, 9 | Quantifier, 10 | Unknown, 11 | } from "./nodes"; 12 | 13 | export interface VisitAstHandler { 14 | onAlternationEnter?(node: Alternation): void; 15 | onAlternationLeave?(node: 
Alternation): void; 16 | onAssertionEnter?(node: Assertion): void; 17 | onAssertionLeave?(node: Assertion): void; 18 | onCharacterClassEnter?(node: CharacterClass): void; 19 | onCharacterClassLeave?(node: CharacterClass): void; 20 | onConcatenationEnter?(node: Concatenation): void; 21 | onConcatenationLeave?(node: Concatenation): void; 22 | onExpressionEnter?(node: Expression): void; 23 | onExpressionLeave?(node: Expression): void; 24 | onQuantifierEnter?(node: Quantifier): void; 25 | onQuantifierLeave?(node: Quantifier): void; 26 | onUnknownEnter?(node: Unknown): void; 27 | onUnknownLeave?(node: Unknown): void; 28 | } 29 | export interface VisitNoParentAstHandler { 30 | onAlternationEnter?(node: NoParent<Alternation>): void; 31 | onAlternationLeave?(node: NoParent<Alternation>): void; 32 | onAssertionEnter?(node: NoParent<Assertion>): void; 33 | onAssertionLeave?(node: NoParent<Assertion>): void; 34 | onCharacterClassEnter?(node: NoParent<CharacterClass>): void; 35 | onCharacterClassLeave?(node: NoParent<CharacterClass>): void; 36 | onConcatenationEnter?(node: NoParent<Concatenation>): void; 37 | onConcatenationLeave?(node: NoParent<Concatenation>): void; 38 | onExpressionEnter?(node: NoParent<Expression>): void; 39 | onExpressionLeave?(node: NoParent<Expression>): void; 40 | onQuantifierEnter?(node: NoParent<Quantifier>): void; 41 | onQuantifierLeave?(node: NoParent<Quantifier>): void; 42 | onUnknownEnter?(node: NoParent<Unknown>): void; 43 | onUnknownLeave?(node: NoParent<Unknown>): void; 44 | } 45 | /** 46 | * Calls the given visitor on the given node and all of its children. 47 | * 48 | * If the given visitor throws an error, the traversal will stop and the error will be re-thrown. 
49 | * 50 | * @param node 51 | * @param visitor 52 | */ 53 | export function visitAst(node: Node, visitor: VisitAstHandler): void; 54 | export function visitAst(node: NoParent<Node>, visitor: VisitNoParentAstHandler): void; 55 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 56 | export function visitAst(node: NoParent<Node>, visitor: VisitNoParentAstHandler): void { 57 | const enter = visitor[`on${node.type}Enter`]; 58 | if (enter) { 59 | enter(node as never); 60 | } 61 | 62 | switch (node.type) { 63 | case "Alternation": 64 | case "Assertion": 65 | case "Expression": 66 | case "Quantifier": 67 | for (const concat of node.alternatives) { 68 | visitAst(concat, visitor); 69 | } 70 | break; 71 | 72 | case "Concatenation": 73 | for (const element of node.elements) { 74 | visitAst(element, visitor); 75 | } 76 | break; 77 | 78 | default: 79 | break; 80 | } 81 | 82 | const leave = visitor[`on${node.type}Leave`]; 83 | if (leave) { 84 | leave(node as never); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/iter/remove-dead-states.ts: -------------------------------------------------------------------------------- 1 | import { FAIterator } from "../fa-types"; 2 | import { ensureStableOut } from "./iterator"; 3 | 4 | /** 5 | * Removes all dead states (and trap states) from the given iterator. 6 | * 7 | * Note: This will iteratively create a complete copy of the given FA. This method is an expensive operation. 
8 | * 9 | * @param iter 10 | * @param select 11 | */ 12 | export function removeDeadStates<S, O>(iter: FAIterator<S, Iterable<O>>, select: (item: O) => S): FAIterator<S, O[]> { 13 | const { initial, getOut: oldGetOut, isFinal } = ensureStableOut(iter); 14 | 15 | const cache = new Map<S, boolean>(); 16 | function canReachFinal(state: S): boolean { 17 | const cached = cache.get(state); 18 | if (cached !== undefined) { 19 | return cached; 20 | } 21 | 22 | if (isFinal(state)) { 23 | cache.set(state, true); 24 | return true; 25 | } 26 | 27 | const inMap = new Map<S, S[]>(); 28 | const seen = new Set<S>([state]); 29 | let current: S[] = [state]; 30 | 31 | while (current.length > 0) { 32 | const next: S[] = []; 33 | 34 | for (let i = 0, l = current.length; i < l; i++) { 35 | const from = current[i]; 36 | for (const item of oldGetOut(from)) { 37 | const to = select(item); 38 | 39 | // update inMap 40 | let list = inMap.get(to); 41 | if (list === undefined) { 42 | list = []; 43 | inMap.set(to, list); 44 | } 45 | list.push(from); 46 | 47 | if (seen.has(to)) { 48 | // already seen 49 | continue; 50 | } 51 | seen.add(to); 52 | 53 | const toCached = cache.get(to); 54 | if (toCached === true || isFinal(to)) { 55 | // can reach final 56 | setAllToReachable(cache, to, inMap); 57 | return true; 58 | } else if (toCached === false) { 59 | // ignore dead state 60 | continue; 61 | } 62 | 63 | next.push(to); 64 | } 65 | } 66 | 67 | current = next; 68 | } 69 | 70 | // non of the seen states can reach a final state 71 | seen.forEach(s => cache.set(s, false)); 72 | 73 | return false; 74 | } 75 | 76 | function getOut(state: S): O[] { 77 | const result: O[] = []; 78 | 79 | for (const item of oldGetOut(state)) { 80 | if (canReachFinal(select(item))) { 81 | result.push(item); 82 | } 83 | } 84 | 85 | return result; 86 | } 87 | 88 | return { 89 | initial, 90 | getOut, 91 | isFinal, 92 | }; 93 | } 94 | 95 | function setAllToReachable<S>(cache: Map<S, boolean>, root: S, inMap: ReadonlyMap<S, 
S[]>): void { 96 | cache.set(root, true); 97 | 98 | let current: S[] = [root]; 99 | while (current.length > 0) { 100 | const next: S[] = []; 101 | 102 | for (let i = 0, l = current.length; i < l; i++) { 103 | const state = current[i]; 104 | 105 | inMap.get(state)?.forEach(s => { 106 | if (!cache.has(s)) { 107 | cache.set(s, true); 108 | next.push(s); 109 | } 110 | }); 111 | } 112 | 113 | current = next; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "refa", 3 | "version": "0.12.1", 4 | "description": "A library for finite automata and regular expressions in the context of JS RegExp", 5 | "main": "index", 6 | "scripts": { 7 | "check": "npm run lint && npm run check:dependencies && npx tsc --noEmit && cd tests && npx tsc --noEmit && cd ../scripts && npx tsc --noEmit", 8 | "check:dependencies": "npx depcruise --validate .dependency-cruiser.js src", 9 | "lint": "npx eslint --ignore-path .gitignore **/*.ts", 10 | "test": "cd tests && mocha -r ts-node/register '**/*.ts'", 11 | "test:all": "npm run test -- --reporter=dot --run-transformers --run-stress-test", 12 | "test:update": "npm run test -- --update --run-transformers", 13 | "build": "npx rimraf ./index.* .out/** && npx tsc && rollup -c && npm run build:dts", 14 | "build:dts": "dts-bundle --main ./.out/index.d.ts --name refa --out ../index.d.ts && npm run scripts:flat-dts && prettier --write ./index.d.ts", 15 | "build:docs": "typedoc --treatWarningsAsErrors", 16 | "coverage": "npx nyc --reporter=html --reporter=text npm run test", 17 | "scripts:create-case-folding": "npx ts-node --project scripts/tsconfig.json scripts/create-case-folding.ts", 18 | "scripts:create-unicode": "npx ts-node --project scripts/tsconfig.json scripts/create-unicode.ts", 19 | "scripts:debug": "npx ts-node --project scripts/tsconfig.json scripts/debug.ts", 20 | 
"scripts:flat-dts": "npx ts-node --project scripts/tsconfig.json scripts/flat-dts.ts", 21 | "scripts:perf": "npx ts-node --project scripts/tsconfig.json scripts/perf.ts", 22 | "prepublishOnly": "npm run build" 23 | }, 24 | "keywords": [ 25 | "dfa", 26 | "nfa", 27 | "regex", 28 | "regexp", 29 | "regular", 30 | "expression" 31 | ], 32 | "author": "Michael Schmidt", 33 | "homepage": "https://github.com/RunDevelopment/refa#readme", 34 | "repository": { 35 | "type": "git", 36 | "url": "https://github.com/RunDevelopment/refa.git" 37 | }, 38 | "license": "MIT", 39 | "files": [ 40 | "index.js", 41 | "index.mjs", 42 | "index.d.ts" 43 | ], 44 | "devDependencies": { 45 | "@rollup/plugin-node-resolve": "^9.0.0", 46 | "@types/chai": "^4.2.22", 47 | "@types/mocha": "^9.0.0", 48 | "@types/node": "^12.20.13", 49 | "@typescript-eslint/eslint-plugin": "^6.4.1", 50 | "@typescript-eslint/parser": "^6.4.1", 51 | "@unicode/unicode-15.0.0": "^1.4.2", 52 | "chai": "^4.3.4", 53 | "dependency-cruiser": "^12.3.0", 54 | "dts-bundle": "^0.7.3", 55 | "eslint": "^8.47.0", 56 | "eslint-config-prettier": "^8.8.0", 57 | "eslint-plugin-jsdoc": "^46.5.0", 58 | "eslint-plugin-prettier": "^4.2.1", 59 | "mocha": "^9.1.3", 60 | "nyc": "^15.1.0", 61 | "prettier": "^2.8.7", 62 | "rimraf": "^3.0.2", 63 | "rollup": "^2.47.0", 64 | "rollup-plugin-terser": "^7.0.2", 65 | "ts-node": "^10.9.1", 66 | "typedoc": "^0.24.8", 67 | "typescript": "^5.0.2" 68 | }, 69 | "dependencies": { 70 | "@eslint-community/regexpp": "^4.8.0" 71 | }, 72 | "engines": { 73 | "node": "^12.0.0 || ^14.0.0 || >=16.0.0" 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /tests/transformers/pattern-edge-assertions.ts: -------------------------------------------------------------------------------- 1 | import { itTest } from "../helper/transform"; 2 | import { patternEdgeAssertions } from "../../src/transformers"; 3 | 4 | describe("Transformers", function () { 5 | 
describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 6 | itTest(null, [ 7 | { 8 | literal: /(?<!\w)(?=\w)a(?=\w)|^f(?=\w)oo(?=sb)(?!\s*\w)(?<!\d)/, 9 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 10 | expected: /(?=\w)a\w|f(?=\w)oo(?!\s*\w)(?<!\d)sb/, 11 | }, 12 | { 13 | literal: /(?<!\w)(?=\w)a(?=\w)|^f(?=\w)oo(?=sb)(?!\s*\w)(?<!\d)/, 14 | transformer: patternEdgeAssertions({ inline: true, remove: false }), 15 | expected: /(?<!\w)(?=\w)a\w|^f(?=\w)oo(?!\s*\w)(?<!\d)sb/, 16 | }, 17 | { 18 | literal: /(?<!\w)(?=\w)a(?=\w)|^f(?=\w)oo(?=sb)(?!\s*\w)(?<!\d)/, 19 | transformer: patternEdgeAssertions({ inline: false, remove: true }), 20 | expected: /(?=\w)a|f(?=\w)oo(?<!\d)/, 21 | }, 22 | { 23 | literal: /(?<!\w)(?=\w)a(?=\w)|^f(?=\w)oo(?=sb)(?!\s*\w)(?<!\d)/, 24 | transformer: patternEdgeAssertions({ inline: false, remove: false }), // noop 25 | expected: /(?<!\w)(?=\w)a(?=\w)|^f(?=\w)oo(?=sb)(?!\s*\w)(?<!\d)/, 26 | }, 27 | 28 | { 29 | literal: /foo(?:a(?=b)|c(?=d))/, 30 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 31 | expected: /foo(?:ab|cd)/, 32 | }, 33 | { 34 | literal: /foo(?:a(?=b|c)|c(?=d))?/, 35 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 36 | expected: /foo(?:a(?:b|c)|cd)?/, 37 | }, 38 | { 39 | literal: /foo(?:a(?=b)|c(?=d))+/, 40 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 41 | expected: /foo(?:a(?=b)|c(?=d))+/, 42 | }, 43 | { 44 | literal: /(?<=abc)/, 45 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 46 | expected: /abc/, 47 | }, 48 | { 49 | literal: /(?=ab(?=c))/, 50 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 51 | expected: /abc/, 52 | }, 53 | { 54 | literal: /\bfoo\b/, 55 | transformer: patternEdgeAssertions({ inline: true, remove: false }), 56 | expected: /(?:(?<!\w)(?=\w)|\w(?!\w))foo(?:(?<!\w)\w|(?<=\w)(?!\w))/, 57 | }, 58 | { 59 | literal: /\bfoo\b/, 60 | transformer: patternEdgeAssertions({ 
inline: true, remove: true }), 61 | expected: /(?:(?=\w)|\w(?!\w))foo(?:(?<!\w)\w|(?<=\w))/, 62 | }, 63 | 64 | { 65 | literal: /(?=a)\w?(?!\s)/, 66 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 67 | expected: /(?=a)\w?(?!\s)/, 68 | }, 69 | { 70 | literal: /(?=a)\w(?!\s)/, 71 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 72 | expected: /(?=a)\w/, 73 | }, 74 | { 75 | literal: /(?=a{4})\w{4}(?!\s)/, 76 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 77 | expected: /(?=a{4})\w{4}/, 78 | }, 79 | { 80 | literal: /(?=a{5})\w{4}(?!\s)/, 81 | transformer: patternEdgeAssertions({ inline: true, remove: true }), 82 | expected: /(?=a{5})\w{4}(?!\s)/, 83 | }, 84 | ]); 85 | }); 86 | }); 87 | -------------------------------------------------------------------------------- /scripts/perf.ts: -------------------------------------------------------------------------------- 1 | import { CombinedTransformer, DFA, JS, NFA, Transformers, transform } from "../src"; 2 | import { PrismRegexes } from "../tests/helper/prism-regex-data"; 3 | import { performance } from "perf_hooks"; 4 | import { logDurations } from "./util"; 5 | 6 | function perfTest(): void { 7 | const durationRecord: Record<string, number[] | undefined> = {}; 8 | function measure<T>(label: string, fn: () => T): T { 9 | const durations = (durationRecord[label] = durationRecord[label] || []); 10 | 11 | const start = performance.now(); 12 | const result = fn(); 13 | durations.push(performance.now() - start); 14 | 15 | return result; 16 | } 17 | function showResult(): void { 18 | const maxLen = Math.max(...Object.keys(durationRecord).map(s => s.length)); 19 | for (const key in durationRecord) { 20 | logDurations(durationRecord[key]!, (key + ":").padEnd(maxLen)); 21 | } 22 | } 23 | 24 | const TOO_BIG = new Set<number>([245, 862, 1474, 2278]); 25 | 26 | let errors = 0; 27 | let counter = 0; 28 | for (const literal of PrismRegexes) { 29 | counter++; 30 | 
process.stdout.write(`\r${counter}/${PrismRegexes.length}`); 31 | if (counter === PrismRegexes.length) { 32 | console.log(); 33 | } 34 | 35 | if (TOO_BIG.has(counter)) { 36 | continue; 37 | } 38 | 39 | try { 40 | const parser = measure("Create parser", () => JS.Parser.fromLiteral(literal)); 41 | const { expression, maxCharacter } = measure("parse", () => 42 | parser.parse({ backreferences: "disable", maxNodes: 100_000 }) 43 | ); 44 | measure("toLiteral", () => JS.toLiteral(expression)); 45 | measure("toLiteral fast", () => JS.toLiteral(expression, { fastCharacters: true })); 46 | 47 | const finalExpression = measure("transformers", () => { 48 | const applyTransformer = new CombinedTransformer([ 49 | Transformers.inline(), 50 | Transformers.removeDeadBranches(), 51 | Transformers.removeUnnecessaryAssertions(), 52 | Transformers.sortAssertions(), 53 | Transformers.applyAssertions(), 54 | Transformers.removeUnnecessaryAssertions(), 55 | ]); 56 | const modifiedExpression = transform(applyTransformer, expression); 57 | 58 | return transform(Transformers.patternEdgeAssertions({ remove: true }), modifiedExpression); 59 | }); 60 | 61 | const nfa = measure("Create NFA", () => 62 | NFA.fromRegex( 63 | finalExpression, 64 | { maxCharacter }, 65 | { assertions: "disable" }, 66 | new NFA.LimitedNodeFactory(100_000) 67 | ) 68 | ); 69 | measure("toRegex NFA", () => nfa.toRegex({ maxNodes: 100_000 })); 70 | 71 | const dfa = measure("Create DFA", () => DFA.fromFA(nfa)); 72 | measure("Minimize DFA", () => dfa.minimize()); 73 | 74 | measure("toRegex mDFA", () => { 75 | try { 76 | dfa.toRegex({ maxNodes: 100_000 }); 77 | } catch (error) { 78 | if (!String(error).includes("toRegex operation")) { 79 | throw error; 80 | } 81 | } 82 | }); 83 | } catch (error) { 84 | errors++; 85 | console.log(`Error in ${literal}`); 86 | throw error; 87 | } 88 | } 89 | 90 | showResult(); 91 | console.log(`${errors} errors`); 92 | } 93 | 94 | perfTest(); 95 | 
import { Concatenation, Element, NoParent, Node, Parent, TransformContext, Transformer } from "../ast";
import { assertNever } from "../util";
import { CreationOptions } from "./creation-options";

/**
 * Returns whether the given node can never accept any input string.
 *
 * Assertions and unknowns are conservatively treated as alive, since this
 * transformer doesn't deal with them.
 */
function isDead(node: NoParent<Node>): boolean {
	switch (node.type) {
		case "Alternation":
		case "Expression":
			// dead iff every alternative is dead
			return node.alternatives.every(a => isDead(a));

		case "Assertion":
		case "Unknown":
			// this transformer doesn't deal with assertions and unknowns
			return false;

		case "CharacterClass":
			// the empty character class (e.g. `[]`) matches nothing
			return node.characters.isEmpty;

		case "Concatenation":
			// this is an optimization
			// we will make sure that all dead concatenation have exactly one element, so we can ignore a lot of
			// non-dead branches without having to look at their contents
			return node.elements.length === 1 && isDead(node.elements[0]);

		case "Quantifier":
			// a quantifier with min 0 can always match the empty string, so it's only dead if min > 0
			return node.min > 0 && node.alternatives.every(a => isDead(a));

		default:
			assertNever(node);
	}
}

/**
 * Collapses a concatenation that contains a dead element down to just that element,
 * and drops optional (min 0) quantifiers that are dead inside.
 */
function onConcatenation(node: NoParent<Concatenation>, { signalMutation }: TransformContext): void {
	let dead: NoParent<Element> | null = null;

	for (let i = 0; i < node.elements.length && !dead; i++) {
		const current = node.elements[i];
		if (current.type === "Quantifier") {
			if (current.alternatives.every(a => isDead(a))) {
				// dead inside
				if (current.min === 0) {
					// e.g. `a(?:[]b)?c` => `ac`: the quantifier can only match empty, so remove it
					node.elements.splice(i, 1);
					signalMutation();
					i--;
				} else {
					dead = current;
				}
			}
		} else if (isDead(current)) {
			dead = current;
		}
	}

	if (dead && node.elements.length > 1) {
		// remove all elements except for the dead one
		// (this keeps dead concatenations at exactly one element; see `isDead`)
		node.elements = [dead];
		signalMutation();
	}
}
/** Removes all dead alternatives from a parent node. */
function onParent(node: NoParent<Parent>, { signalMutation }: TransformContext): void {
	for (let i = 0; i < node.alternatives.length; i++) {
		const current = node.alternatives[i];
		if (isDead(current)) {
			node.alternatives.splice(i, 1);
			signalMutation();
			i--;
		}
	}
}

/**
 * This removes dead branches in the AST.
 *
 * Dead branches are parts of the regex that can never accept on any given input string (e.g. `[]a|b` => `b`).
 *
 * This operation may produce parent nodes with 0 alternatives. Quantifiers with 0 alternatives and a minimum of 0 will
 * be replaced with the empty concatenation (e.g. `a(?:[]b)?c` => `ac`).
 *
 * ---
 *
 * This transformer should be used in combination with {@link inline} to handle trivial simplifications.
 *
 * @param _options
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
export function removeDeadBranches(_options?: Readonly<CreationOptions>): Transformer {
	return {
		name: "removeDeadBranches",
		onConcatenation,

		onAlternation: onParent,
		onAssertion: onParent,
		onExpression: onParent,
		onQuantifier: onParent,
	};
}
from "./helper/snapshot"; 12 | import { createHash } from "crypto"; 13 | import { toString } from "../src/iter"; 14 | 15 | describe("DFA minimization", function () { 16 | if (!CONFIG_RUN_STRESS_TEST) { 17 | return; 18 | } 19 | 20 | const candidates = PrismRegexes.map((r, i) => ({ regex: r, id: i })).filter( 21 | ({ regex }) => regex.source.length < 1e3 22 | ); 23 | 24 | function toDFA(regex: RegExp): DFA { 25 | const result = Parser.fromLiteral(regex).parse({ backreferences: "disable" }); 26 | 27 | const applyTransformer = new CombinedTransformer([ 28 | Transformers.inline(), 29 | Transformers.removeDeadBranches(), 30 | Transformers.removeUnnecessaryAssertions(), 31 | Transformers.sortAssertions(), 32 | Transformers.applyAssertions(), 33 | Transformers.removeUnnecessaryAssertions(), 34 | ]); 35 | const modifiedExpression = transform(applyTransformer, result.expression); 36 | 37 | const finalExpression = transform(Transformers.patternEdgeAssertions({ remove: true }), modifiedExpression); 38 | 39 | const nfa = NFA.fromRegex(finalExpression, result, { assertions: "disable", unknowns: "disable" }); 40 | 41 | return DFA.fromFA(nfa); 42 | } 43 | 44 | for (const { regex, id } of candidates) { 45 | // node v10 has a bug where all "/" chars are escaped. This escapes all "/" chars to make the 46 | // result consistent across versions. 
47 | const source = regex.source.replace(/([^\\](?:\\{2})*)(?=\/)/g, "$1\\"); 48 | 49 | const preview = literalToString({ source, flags: regex.flags }) 50 | // max length of 80 51 | .replace(/^([^]{80})[^]+/, "$1..."); 52 | 53 | it(`${id}: ${preview}`, function () { 54 | this.timeout(10_000); 55 | 56 | let dfa; 57 | try { 58 | dfa = toDFA(regex); 59 | } catch (error) { 60 | if (error instanceof TooManyNodesError) { 61 | return; 62 | } 63 | throw error; 64 | } 65 | 66 | dfa.minimize(); 67 | 68 | // minimize(minimize(dfa)) == minimize(dfa) 69 | // This is just a basic sanity check 70 | const copy = dfa.copy(); 71 | assert.isTrue(copy.structurallyEqual(dfa), "Copy is not equal to original"); 72 | copy.minimize(); 73 | 74 | if (!copy.structurallyEqual(dfa)) { 75 | assert.equal(copy.toDot(), dfa.toDot()); 76 | assert.fail("structurally equal doesn't work"); 77 | } 78 | 79 | // This way we can detect when the DFA minimization produces incorrect results. 80 | // We could also store the DFA string itself but this results in mega bytes of snapshots. 
81 | assertEqualSnapshot( 82 | this, 83 | createHash("sha256") 84 | .update(toString(dfa.transitionIterator(), cs => cs.toRangesString())) 85 | .digest("base64") 86 | ); 87 | }); 88 | } 89 | }); 90 | -------------------------------------------------------------------------------- /tests/char-base.ts: -------------------------------------------------------------------------------- 1 | import { assert } from "chai"; 2 | import { CharBase } from "../src/char-base"; 3 | import { CharSet } from "../src/char-set"; 4 | import { Char } from "../src/char-types"; 5 | import { charsFromRegex } from "./helper/chars"; 6 | import { assertEqualSnapshot } from "./helper/snapshot"; 7 | 8 | describe(CharBase.name, function () { 9 | const regexes: RegExp[] = [ 10 | /a/u, 11 | /[^a]/u, 12 | /b/u, 13 | /[^b]/u, 14 | /c/u, 15 | /[^c]/u, 16 | /[ab]/u, 17 | /[^ab]/u, 18 | /\w/u, 19 | /\W/u, 20 | /\d/u, 21 | /\D/u, 22 | /\p{L}/u, 23 | /\P{L}/u, 24 | /\p{Lu}/u, 25 | /\P{Lu}/u, 26 | /\p{Ll}/u, 27 | /\P{Ll}/u, 28 | /[^]/u, 29 | /[]/u, 30 | ]; 31 | const chars = regexes.map(charsFromRegex); 32 | 33 | interface TestCase { 34 | id: string; 35 | sets: CharSet[]; 36 | } 37 | 38 | const cases: TestCase[] = []; 39 | for (let i = 0; i < chars.length; i++) { 40 | for (let j = i + 1; j < chars.length; j++) { 41 | cases.push({ id: [regexes[i], regexes[j]].join(" "), sets: [chars[i], chars[j]] }); 42 | } 43 | } 44 | for (let i = 0; i < chars.length; i++) { 45 | for (let j = i + 1; j < chars.length; j++) { 46 | for (let k = j + 1; k < chars.length; k++) { 47 | cases.push({ 48 | id: [regexes[i], regexes[j], regexes[k]].join(" "), 49 | sets: [chars[i], chars[j], chars[k]], 50 | }); 51 | } 52 | } 53 | } 54 | 55 | cases.forEach(test); 56 | 57 | function test({ id, sets }: TestCase): void { 58 | it(id, function () { 59 | const base = new CharBase(sets); 60 | 61 | assert.isTrue( 62 | base.sets.every(b => !b.isEmpty), 63 | "Expected all base sets to be non-empty" 64 | ); 65 | assert.isTrue( 66 | 
unionAll(sets).equals(unionAll(base.sets)), 67 | "Expected the union of all base sets to be equal to the union of all input sets." 68 | ); 69 | 70 | for (let i = 0; i < base.sets.length; i++) { 71 | const a = base.sets[i]; 72 | for (let j = i + 1; j < base.sets.length; j++) { 73 | const b = base.sets[j]; 74 | assert.isTrue(a.isDisjointWith(b), "Expected base sets to be disjoint with each other."); 75 | } 76 | } 77 | 78 | for (const s of sets) { 79 | const indexes = base.split(s); 80 | if (indexes.length === 0) { 81 | assert.isTrue(s.isEmpty); 82 | continue; 83 | } 84 | 85 | assert.deepEqual([...new Set(indexes)], indexes, "expected index to be unique"); 86 | assert.deepEqual( 87 | [...indexes].sort((a, b) => a - b), 88 | indexes, 89 | "expected index to be sorted" 90 | ); 91 | 92 | const total = unionAll(indexes.map(i => base.sets[i])); 93 | assert.isTrue(total.equals(s)); 94 | } 95 | 96 | const checkBase = new CharBase(base.sets); 97 | assert.equal(checkBase.sets.length, base.sets.length, "Expected the base sets to be as small as possible."); 98 | assert.isTrue(base.sets.every((s, i) => s.equals(checkBase.sets[i]))); 99 | 100 | if (sets.every(s => s.ranges.length < 10)) { 101 | assertEqualSnapshot(this, base.sets.join("\n")); 102 | } 103 | }); 104 | } 105 | }); 106 | 107 | function unionAll(sets: Iterable<CharSet>, maxCharacter: Char = 0x10ffff): CharSet { 108 | return CharSet.empty(maxCharacter).union(...sets); 109 | } 110 | -------------------------------------------------------------------------------- /src/iter/to-mermaid.ts: -------------------------------------------------------------------------------- 1 | import { FAIterator } from "../fa-types"; 2 | import { NodeInfo, SimplePrintOptions } from "./print-common"; 3 | import { indexNodes } from "./print-util"; 4 | 5 | export function toMermaid<S, T>( 6 | iter: FAIterator<S, Iterable<[S, T]>>, 7 | options: ToMermaidOptions<S, T> | SimplePrintOptions<T> 8 | ): string { 9 | const { getEdgeAttributes, 
getNodeAttributes = DEFAULT_GET_NODE_ATTRIBUTES } = 10 | "transitionToString" in options ? fromSimpleOptions(options) : options; 11 | 12 | const { stableIter, states, info } = indexNodes(iter); 13 | 14 | let s = ""; 15 | 16 | function writeText(text: string): void { 17 | if (text.length === 0) { 18 | s += " "; 19 | } else if (/^\w+$/.test(text)) { 20 | s += text; 21 | } else { 22 | s += '"' + text.replace(/"/g, """) + '"'; 23 | } 24 | } 25 | function writeNode(node: MermaidNode): void { 26 | s += node.shape[0]; 27 | writeText(node.label); 28 | s += node.shape[1]; 29 | } 30 | function writeEdge(edge: MermaidEdge): void { 31 | s += "-".repeat((edge.length ?? 1) + 1) + ">"; 32 | if (edge.label !== undefined) { 33 | s += "|"; 34 | writeText(edge.label); 35 | s += "|"; 36 | } 37 | } 38 | function writeNodeLabel(node: S): void { 39 | s += "n" + info.getId(node); 40 | } 41 | function writeNodeLabelFromIndex(index: number): void { 42 | s += "n" + index; 43 | } 44 | 45 | s += "flowchart LR\n"; 46 | 47 | // nodes 48 | s += "%% nodes\n"; 49 | s += "\tnull(( ))\n"; 50 | states.forEach((node, i) => { 51 | s += "\t"; 52 | writeNodeLabelFromIndex(i); 53 | writeNode(getNodeAttributes(node, info)); 54 | s += "\n"; 55 | }); 56 | 57 | // edges 58 | s += "\n%% edges\n"; 59 | s += "\tnull -.-> n0\n"; 60 | states.forEach((node, i) => { 61 | stableIter.getOut(node).forEach(([to, trans], nth) => { 62 | s += "\t"; 63 | writeNodeLabelFromIndex(i); 64 | s += " "; 65 | writeEdge(getEdgeAttributes(trans, nth, node, to, info)); 66 | s += " "; 67 | writeNodeLabel(to); 68 | s += "\n"; 69 | }); 70 | }); 71 | 72 | return s.trim(); 73 | } 74 | 75 | export interface ToMermaidOptions<S, T> { 76 | getNodeAttributes?: (node: S, info: NodeInfo<S>) => Readonly<MermaidNode>; 77 | getEdgeAttributes: (transition: T, nth: number, from: S, to: S, info: NodeInfo<S>) => MermaidEdge; 78 | } 79 | 80 | export interface MermaidNode { 81 | label: string; 82 | shape: [string, string]; 83 | } 84 | export interface 
MermaidEdge { 85 | label?: string; 86 | length?: number; 87 | } 88 | 89 | function fromSimpleOptions<S, T>({ 90 | transitionToString, 91 | ordered = false, 92 | }: SimplePrintOptions<T>): ToMermaidOptions<S, T> { 93 | return { 94 | getEdgeAttributes(trans, nth, from, _, info) { 95 | let label = transitionToString(trans); 96 | let length = 1; 97 | 98 | if (ordered && info.getNumberOfOutgoingEdges(from) > 1) { 99 | label = `(${nth + 1}) ${label}`; 100 | length = nth + 1; 101 | } 102 | 103 | return { label, length }; 104 | }, 105 | }; 106 | } 107 | 108 | const DEFAULT_GET_NODE_ATTRIBUTES: NonNullable<ToMermaidOptions<unknown, never>["getNodeAttributes"]> = ( 109 | node, 110 | info 111 | ) => { 112 | return { 113 | label: String(info.getId(node)), 114 | shape: info.isFinal(node) ? ["(((", ")))"] : ["((", "))"], 115 | }; 116 | }; 117 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | /** @type {import("eslint").Linter.Config} */ 2 | module.exports = { 3 | env: { 4 | browser: true, 5 | es6: true 6 | }, 7 | root: true, 8 | extends: [ 9 | "eslint:recommended", 10 | "plugin:@typescript-eslint/eslint-recommended", 11 | "plugin:@typescript-eslint/recommended", 12 | "plugin:prettier/recommended" 13 | ], 14 | parser: "@typescript-eslint/parser", 15 | plugins: [ 16 | "@typescript-eslint", 17 | "prettier", 18 | "jsdoc" 19 | ], 20 | parserOptions: { 21 | ecmaVersion: 2018, 22 | sourceType: "module", 23 | ecmaFeatures: { 24 | node: true, 25 | spread: true 26 | }, 27 | project: "./tsconfig.json" 28 | }, 29 | rules: { 30 | "@typescript-eslint/naming-convention": [ 31 | "error", 32 | { 33 | selector: "default", 34 | format: ["camelCase"] 35 | }, 36 | { 37 | selector: ["class", "interface", "typeAlias", "enum", "typeParameter"], 38 | format: ["PascalCase"] 39 | }, 40 | { 41 | selector: "enumMember", 42 | format: ["UPPER_CASE"] 43 | }, 44 | { 45 | selector: 
"variable", 46 | modifiers: ["const"], 47 | format: ["UPPER_CASE", "camelCase"] 48 | }, 49 | { 50 | selector: ["classProperty", "classMethod"], 51 | modifiers: ["private"], 52 | format: ["camelCase"], 53 | leadingUnderscore: "require" 54 | }, 55 | { 56 | selector: ["variable", "parameter"], 57 | modifiers: ["unused"], 58 | format: null, 59 | leadingUnderscore: "allow" 60 | }, 61 | { 62 | selector: "typeProperty", 63 | format: ["camelCase"], 64 | leadingUnderscore: "allowDouble" 65 | } 66 | ], 67 | 68 | "curly": "error", 69 | 70 | "no-constant-condition": ["error", { checkLoops: false }], 71 | "sort-imports": ["error", { ignoreDeclarationSort: true }], 72 | "@typescript-eslint/no-inferrable-types": ["error", { ignoreParameters: true, ignoreProperties: true }], 73 | "@typescript-eslint/explicit-function-return-type": ["error", { allowExpressions: true }], 74 | "@typescript-eslint/no-unnecessary-condition": "warn", 75 | "@typescript-eslint/strict-boolean-expressions": ["warn", { allowNullableBoolean: true }], 76 | 77 | "jsdoc/no-types": "error", 78 | "jsdoc/no-bad-blocks": "error", 79 | "jsdoc/multiline-blocks": "error", 80 | "jsdoc/empty-tags": "error", 81 | "jsdoc/check-param-names": ["error", { enableFixer: true }], 82 | "jsdoc/require-param": "error", 83 | "jsdoc/tag-lines": ["warn", "any", { startLines: 1 }], 84 | 85 | "no-empty-character-class": "off", 86 | "@typescript-eslint/explicit-member-accessibility": "off", 87 | "@typescript-eslint/no-non-null-assertion": "off", 88 | "@typescript-eslint/no-use-before-define": "off", 89 | "@typescript-eslint/indent": "off", 90 | }, 91 | settings: { 92 | jsdoc: { 93 | mode: "typescript" 94 | } 95 | }, 96 | overrides: [ 97 | { 98 | files: ["scripts/**"], 99 | env: { 100 | browser: false, 101 | node: true, 102 | es6: true 103 | }, 104 | parserOptions: { 105 | project: "./scripts/tsconfig.json" 106 | }, 107 | rules: { 108 | "@typescript-eslint/no-var-requires": "off" 109 | } 110 | }, 111 | { 112 | files: ["tests/**"], 113 | 
parserOptions: { 114 | project: "./tests/tsconfig.json" 115 | } 116 | } 117 | ], 118 | ignorePatterns: [ 119 | "*.js", 120 | "index.d.ts", 121 | "src/js/unicode/**" 122 | ] 123 | } 124 | -------------------------------------------------------------------------------- /src/js/char-case-folding.ts: -------------------------------------------------------------------------------- 1 | import { CharSet } from "../char-set"; 2 | import { Char } from "../char-types"; 3 | import { Flags } from "./flags"; 4 | import { Maximum } from "./maximum"; 5 | import { UnicodeCaseFolding } from "./unicode"; 6 | import { UTF16CaseFolding } from "./utf16-case-folding"; 7 | 8 | /** 9 | * A set of functions that can be used to perform case-insensitive matching. 10 | * 11 | * It must fulfill the following conditions: 12 | * 13 | * 1. `canonicalize` must be idempotent, i.e. `canonicalize(canonicalize(char)) === canonicalize(char)`. 14 | * 2. `toCharSet(canonicalize(a))` is the set of all characters `c` such that `canonicalize(a) === canonicalize(c)`. 15 | */ 16 | export interface CharCaseFolding { 17 | /** 18 | * The canonicalization function. This typically maps characters to their lowercase form. 19 | * 20 | * If no function is given, then the identity function is used. This also implies that `toCharSet` must return a 21 | * set containing only the given character. 22 | * 23 | * @default char => char 24 | */ 25 | readonly canonicalize?: (char: Char) => Char; 26 | readonly toCharSet: (char: Char) => CharSet; 27 | } 28 | 29 | const CHAR_CASE_FOLDING_UTF16: CharCaseFolding = { 30 | toCharSet: boundedCache(char => { 31 | return CharSet.fromCharacter(Maximum.UTF16, char); 32 | }), 33 | }; 34 | const CHAR_CASE_FOLDING_UTF16_I: CharCaseFolding = { 35 | canonicalize: char => UTF16CaseFolding[char]?.[0] ?? 
/** Case-insensitive matching for non-Unicode (UTF-16) patterns, based on the UTF-16 case folding table. */
const CHAR_CASE_FOLDING_UTF16_I: CharCaseFolding = {
	// canonical form = first entry of the character's folding group, or the character itself
	canonicalize: char => UTF16CaseFolding[char]?.[0] ?? char,
	toCharSet: boundedCache(char => {
		const folding = UTF16CaseFolding[char];
		if (folding === undefined) {
			return CharSet.fromCharacter(Maximum.UTF16, char);
		} else {
			return CharSet.fromCharacters(Maximum.UTF16, folding);
		}
	}),
};
/** Case-sensitive matching for Unicode patterns: each character maps to a single-character set. */
const CHAR_CASE_FOLDING_UNICODE: CharCaseFolding = {
	toCharSet: boundedCache(char => {
		return CharSet.fromCharacter(Maximum.UNICODE, char);
	}),
};
/** Case-insensitive matching for Unicode patterns, based on the Unicode case folding table. */
const CHAR_CASE_FOLDING_UNICODE_I: CharCaseFolding = {
	canonicalize: char => UnicodeCaseFolding[char]?.[0] ?? char,
	toCharSet: boundedCache(char => {
		const folding = UnicodeCaseFolding[char];
		if (folding === undefined) {
			return CharSet.fromCharacter(Maximum.UNICODE, char);
		} else {
			return CharSet.fromCharacters(Maximum.UNICODE, folding);
		}
	}),
};

/**
 * Returns the {@link CharCaseFolding} for the given Unicode/ignore-case combination.
 *
 * @param unicode Whether Unicode (or `unicodeSets`) semantics apply.
 * @param ignoreCase Whether matching is case-insensitive.
 */
export function getCharCaseFolding(unicode: boolean, ignoreCase: boolean): CharCaseFolding;
export function getCharCaseFolding(flags: Readonly<Flags>): CharCaseFolding;
export function getCharCaseFolding(flagsOrUnicode: Readonly<Flags> | boolean, ignoreCase?: boolean): CharCaseFolding {
	let unicode: boolean;
	if (typeof flagsOrUnicode === "boolean") {
		unicode = flagsOrUnicode;
	} else {
		// `unicodeSets` is treated the same as `unicode` here
		unicode = flagsOrUnicode.unicode || !!flagsOrUnicode.unicodeSets;
		ignoreCase = flagsOrUnicode.ignoreCase;
	}

	if (unicode) {
		return ignoreCase ? CHAR_CASE_FOLDING_UNICODE_I : CHAR_CASE_FOLDING_UNICODE;
	} else {
		return ignoreCase ? CHAR_CASE_FOLDING_UTF16_I : CHAR_CASE_FOLDING_UTF16;
	}
}
CHAR_CASE_FOLDING_UTF16_I : CHAR_CASE_FOLDING_UTF16; 77 | } 78 | } 79 | 80 | function boundedCache<A, B>(compute: (value: A) => B, maxSize: number = 100): (value: A) => B { 81 | const cache = new Map<A, B>(); 82 | return value => { 83 | let cached = cache.get(value); 84 | if (cached === undefined) { 85 | cached = compute(value); 86 | if (cache.size >= maxSize) { 87 | cache.clear(); 88 | } 89 | cache.set(value, cached); 90 | } 91 | return cached; 92 | }; 93 | } 94 | -------------------------------------------------------------------------------- /src/intersection.ts: -------------------------------------------------------------------------------- 1 | import { Word } from "./char-types"; 2 | import { WordSet } from "./word-set"; 3 | import * as Iter from "./iter"; 4 | import { MapFABuilderNode } from "./iter"; 5 | import { MaxCharacterError } from "./errors"; 6 | import { TransitionIterable, TransitionIterator } from "./fa-types"; 7 | import { wordSetsToWords } from "./words"; 8 | 9 | /** 10 | * Returns a lazily-created {@link TransitionIterator} for the intersection of the two given FA. 11 | * 12 | * The iterator will create states as it is traversed. 13 | * 14 | * @param left 15 | * @param right 16 | * @param maxNodes 17 | */ 18 | export function getIntersectionIterator<L, R>( 19 | left: TransitionIterable<L>, 20 | right: TransitionIterable<R>, 21 | maxNodes: number = 10_000 22 | ): TransitionIterator<MapFABuilderNode> { 23 | MaxCharacterError.assert(left, right, "TransitionIterable"); 24 | 25 | return Iter.intersection(new Iter.MapFABuilder(maxNodes), left.transitionIterator(), right.transitionIterator()); 26 | } 27 | 28 | /** 29 | * Returns whether the languages of this and the other FA are disjoint. 30 | * 31 | * The runtime of this algorithm is `O(n * m)` (n = number of states of this NFA, m = number of states of the other 32 | * FA) but it's a lot faster in practice with the worst case being very rare. 
33 | * 34 | * Since this uses the intersection operation, you can supply intersection options. 35 | * 36 | * This is equivalent to `NFA.fromIntersection(left, right).isEmpty` but implemented more efficiently. 37 | * 38 | * @param left 39 | * @param right 40 | * @param maxNodes 41 | */ 42 | export function isDisjointWith<L, R>( 43 | left: TransitionIterable<L>, 44 | right: TransitionIterable<R>, 45 | maxNodes: number = 10_000 46 | ): boolean { 47 | const iter = getIntersectionIterator(left, right, maxNodes); 48 | 49 | return !Iter.canReachFinal(Iter.mapOut(iter, n => n.keys())); 50 | } 51 | /** 52 | * Returns a potentially infinite iterable of word sets accepted by both given transition iterables. 53 | * 54 | * This function provides the following guarantees: 55 | * 56 | * 1. Word sets are guaranteed to be yielded in the order of increasing length. (Word sets of equal lengths may be 57 | * yielded in any order.) 58 | * 2. No character set of the yielded word sets is empty. 59 | * 60 | * This is roughly equivalent to `NFA.fromIntersection(left, right).wordSets()` but implemented more efficiently. 61 | * 62 | * @param left 63 | * @param right 64 | * @param maxNodes 65 | */ 66 | export function getIntersectionWordSets<L, R>( 67 | left: TransitionIterable<L>, 68 | right: TransitionIterable<R>, 69 | maxNodes: number = 10_000 70 | ): Iterable<WordSet> { 71 | const iter = getIntersectionIterator(left, right, maxNodes); 72 | 73 | return Iter.iterateWordSets(iter); 74 | } 75 | /** 76 | * Returns a potentially infinite iterable of words accepted by both given transition iterables. 77 | * 78 | * This function provides the following guarantees: 79 | * 80 | * 1. Words are guaranteed to be yielded in the order of increasing length. (Words of equal lengths may be yielded in 81 | * any order.) 82 | * 83 | * This is roughly equivalent to `NFA.fromIntersection(left, right).words()` but implemented more efficiently. 
84 | * 85 | * @param left 86 | * @param right 87 | * @param maxNodes 88 | */ 89 | export function getIntersectionWords<L, R>( 90 | left: TransitionIterable<L>, 91 | right: TransitionIterable<R>, 92 | maxNodes: number = 10_000 93 | ): Iterable<Word> { 94 | return wordSetsToWords(getIntersectionWordSets(left, right, maxNodes)); 95 | } 96 | -------------------------------------------------------------------------------- /tests/js/create-assertion.ts: -------------------------------------------------------------------------------- 1 | import { assert } from "chai"; 2 | import { Parser } from "../../src/js"; 3 | import { toPatternString } from "../helper/literal-to-string"; 4 | 5 | describe("JS createCharSet", function () { 6 | interface TestCase { 7 | literal: { source: string; flags: string }; 8 | expected: string | Error; 9 | } 10 | 11 | function test(cases: Iterable<TestCase>): void { 12 | for (const { literal, expected } of cases) { 13 | it(`/${literal.source}/${literal.flags}`, function () { 14 | if (typeof expected === "string") { 15 | const parser = Parser.fromLiteral(literal); 16 | assert.strictEqual(toPatternString(parser.parse().expression), expected); 17 | } else { 18 | assert.throws(() => { 19 | const parser = Parser.fromLiteral(literal); 20 | parser.parse(); 21 | }); 22 | } 23 | }); 24 | } 25 | } 26 | 27 | const cases: TestCase[] = [ 28 | { 29 | literal: /^/, 30 | expected: "(?<![0..ffff])", 31 | }, 32 | { 33 | literal: /^/m, 34 | expected: "(?<![0..9, b..c, e..2027, 202a..ffff])", 35 | }, 36 | { 37 | literal: /^/u, 38 | expected: "(?<![0..10ffff])", 39 | }, 40 | { 41 | literal: /^/mu, 42 | expected: "(?<![0..9, b..c, e..2027, 202a..10ffff])", 43 | }, 44 | { 45 | literal: /^/ms, 46 | expected: "(?<![0..9, b..c, e..2027, 202a..ffff])", 47 | }, 48 | 49 | { 50 | literal: /$/, 51 | expected: "(?![0..ffff])", 52 | }, 53 | { 54 | literal: /$/m, 55 | expected: "(?![0..9, b..c, e..2027, 202a..ffff])", 56 | }, 57 | { 58 | literal: /$/u, 59 | expected: 
"(?![0..10ffff])", 60 | }, 61 | { 62 | literal: /$/mu, 63 | expected: "(?![0..9, b..c, e..2027, 202a..10ffff])", 64 | }, 65 | { 66 | literal: /$/ms, 67 | expected: "(?![0..9, b..c, e..2027, 202a..ffff])", 68 | }, 69 | 70 | { 71 | literal: /\b/, 72 | expected: 73 | "(?<![30..39, 41..5a, 5f, 61..7a])(?=[30..39, 41..5a, 5f, 61..7a])|(?<=[30..39, 41..5a, 5f, 61..7a])(?![30..39, 41..5a, 5f, 61..7a])", 74 | }, 75 | { 76 | literal: /\b/i, 77 | expected: 78 | "(?<![30..39, 41..5a, 5f, 61..7a])(?=[30..39, 41..5a, 5f, 61..7a])|(?<=[30..39, 41..5a, 5f, 61..7a])(?![30..39, 41..5a, 5f, 61..7a])", 79 | }, 80 | { 81 | literal: /\b/u, 82 | expected: 83 | "(?<![30..39, 41..5a, 5f, 61..7a])(?=[30..39, 41..5a, 5f, 61..7a])|(?<=[30..39, 41..5a, 5f, 61..7a])(?![30..39, 41..5a, 5f, 61..7a])", 84 | }, 85 | { 86 | literal: /\b/iu, 87 | expected: 88 | "(?<![30..39, 41..5a, 5f, 61..7a, 17f, 212a])(?=[30..39, 41..5a, 5f, 61..7a, 17f, 212a])|(?<=[30..39, 41..5a, 5f, 61..7a, 17f, 212a])(?![30..39, 41..5a, 5f, 61..7a, 17f, 212a])", 89 | }, 90 | 91 | { 92 | literal: /\B/, 93 | expected: 94 | "(?<=[30..39, 41..5a, 5f, 61..7a])(?=[30..39, 41..5a, 5f, 61..7a])|(?<![30..39, 41..5a, 5f, 61..7a])(?![30..39, 41..5a, 5f, 61..7a])", 95 | }, 96 | { 97 | literal: /\B/i, 98 | expected: 99 | "(?<=[30..39, 41..5a, 5f, 61..7a])(?=[30..39, 41..5a, 5f, 61..7a])|(?<![30..39, 41..5a, 5f, 61..7a])(?![30..39, 41..5a, 5f, 61..7a])", 100 | }, 101 | { 102 | literal: /\B/u, 103 | expected: 104 | "(?<=[30..39, 41..5a, 5f, 61..7a])(?=[30..39, 41..5a, 5f, 61..7a])|(?<![30..39, 41..5a, 5f, 61..7a])(?![30..39, 41..5a, 5f, 61..7a])", 105 | }, 106 | { 107 | literal: /\B/iu, 108 | expected: 109 | "(?<=[30..39, 41..5a, 5f, 61..7a, 17f, 212a])(?=[30..39, 41..5a, 5f, 61..7a, 17f, 212a])|(?<![30..39, 41..5a, 5f, 61..7a, 17f, 212a])(?![30..39, 41..5a, 5f, 61..7a, 17f, 212a])", 110 | }, 111 | ]; 112 | 113 | test(cases); 114 | }); 115 | -------------------------------------------------------------------------------- 
/src/transformers/union-characters.ts: -------------------------------------------------------------------------------- 1 | import { CharacterClass, Concatenation, NoParent, Parent, TransformContext, Transformer } from "../ast"; 2 | import { filterMut } from "../util"; 3 | import { CreationOptions } from "./creation-options"; 4 | 5 | function isSingleCharacterAlternative( 6 | alt: NoParent<Concatenation> 7 | ): alt is { type: "Concatenation"; elements: [NoParent<CharacterClass>] } { 8 | return alt.elements.length === 1 && alt.elements[0].type === "CharacterClass"; 9 | } 10 | 11 | /** 12 | * Combines single-character alternatives. 13 | * 14 | * This rule will try to combine as many character classes as possible to simplify the regular expression. 15 | * 16 | * E.g. `a|b|c` => `[abc]`. 17 | * 18 | * @param options 19 | */ 20 | export function unionCharacters(options?: Readonly<CreationOptions>): Transformer { 21 | const preserveOrder = !options?.ignoreOrder; 22 | const preserveAmbiguity = !options?.ignoreAmbiguity; 23 | 24 | function onParent(node: NoParent<Parent>, { signalMutation }: TransformContext): void { 25 | const { alternatives } = node; 26 | if (alternatives.length < 2) { 27 | return; 28 | } 29 | 30 | let mainCharacter: NoParent<CharacterClass> | undefined = undefined; 31 | let adjacent = true; 32 | filterMut(alternatives, alternative => { 33 | if (isSingleCharacterAlternative(alternative)) { 34 | const element: NoParent<CharacterClass> = alternative.elements[0]; 35 | 36 | if ( 37 | mainCharacter === undefined || 38 | // If order is to be preserved, character classes have to be adjacent. This works because we can 39 | // safely ignore the order of adjacent single-character alternatives 40 | // (e.g. /a|b/ == /b|a/ == /[ab]/). However, this is only true for single-character alternatives 41 | // (e.g. /a|bb|b/ != /a|b|bb/ == /[ab]|bb/). 42 | // 43 | // There are also cases where we could reorder alternatives without changing the regex 44 | // (e.g. 
/a|cc|b/ == /a|b|cc/ == /[ab]|cc/) but the current simple implementation seems to be good 45 | // enough for now. 46 | (!adjacent && preserveOrder) 47 | ) { 48 | // this is the first single character class found; it starts a new run of adjacent character alternatives 49 | mainCharacter = element; adjacent = true; 50 | return true; 51 | } 52 | 53 | if (preserveAmbiguity) { 54 | // find the common characters. The regex is ambiguous for all of them. E.g. /\w|[a-z-]/ 55 | const ambiguousChars = mainCharacter.characters.intersect(element.characters); 56 | if (element.characters.equals(ambiguousChars)) { 57 | // This means that the current character set is a subset of the main character set. 58 | // E.g. /\w|a/ 59 | // In this case, we leave everything as is. 60 | return true; 61 | } else { 62 | signalMutation(); 63 | mainCharacter.characters = mainCharacter.characters.union(element.characters); 64 | if (ambiguousChars.isEmpty) { 65 | // The two character sets are disjoint, so we can just remove the second one. 66 | // E.g. /a|b/ => /[ab]/ 67 | return false; 68 | } else { 69 | // The two character sets are not disjoint. We will simplify the second one preserving 70 | // ambiguity. E.g.
/[ab]|[bc]/ => /[abc]|b/ 71 | element.characters = ambiguousChars; 72 | return true; 73 | } 74 | } 75 | } else { 76 | signalMutation(); 77 | mainCharacter.characters = mainCharacter.characters.union(element.characters); 78 | return false; 79 | } 80 | } else { 81 | adjacent = false; 82 | return true; 83 | } 84 | }); 85 | } 86 | 87 | return { 88 | name: "unionCharacters", 89 | onAlternation: onParent, 90 | onAssertion: onParent, 91 | onExpression: onParent, 92 | onQuantifier: onParent, 93 | }; 94 | } 95 | -------------------------------------------------------------------------------- /tests/iter/word-sets.ts: -------------------------------------------------------------------------------- 1 | import { CharSet } from "../../src/char-set"; 2 | import { WordSet } from "../../src/word-set"; 3 | import { TransitionIterable } from "../../src/fa-types"; 4 | import { approximateRejectingWordSet, iterateWordSets, shortestWordSet } from "../../src/iter"; 5 | import { literalToDFA, literalToENFA, literalToNFA } from "../helper/fa"; 6 | import { assertEqualSnapshot } from "../helper/snapshot"; 7 | 8 | describe("word sets", function () { 9 | const regexes: RegExp[] = [ 10 | /[]/, 11 | /(?:)/, 12 | /[^]?/, 13 | /[^]*/, 14 | /[ab]{0,7}c?[^]+a*b?|d*b*/, // == [^]* 15 | /[^]+/, 16 | /[^]{0,5}/, 17 | /a/, 18 | /a|b/, 19 | /aa|b/, 20 | /ab|ba/, 21 | /a+/, 22 | /a*/, 23 | /a*b*c*/, 24 | /a+b*c+/, 25 | /a+b+c+/, 26 | /a+(?:d+|e+)?/, 27 | /(?:\d+(?:\.\d*)?|\.\d+)(?:e[+-]?\d+)?/i, 28 | ]; 29 | 30 | function wordSetToString(wordSet: WordSet): string { 31 | return JSON.stringify(wordSet.map(rangesToString)); 32 | } 33 | 34 | function firstN<T>(iter: Iterable<T>, n: number): T[] { 35 | const result: T[] = []; 36 | 37 | for (const item of iter) { 38 | result.push(item); 39 | if (result.length >= n) { 40 | break; 41 | } 42 | } 43 | 44 | return result; 45 | } 46 | 47 | describe(iterateWordSets.name, function () { 48 | function runTests<T>(name: string, toFA: (regex: RegExp) => 
TransitionIterable<T>): void { 49 | describe(name, function () { 50 | for (const regex of regexes) { 51 | it(`${regex}`, function () { 52 | const fa = toFA(regex); 53 | const wordSets = firstN(iterateWordSets(fa.transitionIterator()), 10); 54 | assertEqualSnapshot(this, wordSets.map(wordSetToString).join("\n")); 55 | }); 56 | } 57 | }); 58 | } 59 | 60 | runTests("NFA", literalToNFA); 61 | runTests("ENFA", literalToENFA); 62 | runTests("DFA", literalToDFA); 63 | }); 64 | 65 | describe(shortestWordSet.name, function () { 66 | function runTests<T>(name: string, toFA: (regex: RegExp) => TransitionIterable<T>): void { 67 | describe(name, function () { 68 | for (const regex of regexes) { 69 | it(`${regex}`, function () { 70 | const fa = toFA(regex); 71 | const wordSet = shortestWordSet(fa.transitionIterator()); 72 | assertEqualSnapshot(this, wordSet ? wordSetToString(wordSet) : "none"); 73 | }); 74 | } 75 | }); 76 | } 77 | 78 | runTests("NFA", literalToNFA); 79 | runTests("ENFA", literalToENFA); 80 | runTests("DFA", literalToDFA); 81 | }); 82 | 83 | describe(approximateRejectingWordSet.name, function () { 84 | function runTests<T>(name: string, toFA: (regex: RegExp) => TransitionIterable<T>): void { 85 | describe(name, function () { 86 | for (const regex of regexes) { 87 | it(`${regex}`, function () { 88 | const fa = toFA(regex); 89 | const wordSet = approximateRejectingWordSet( 90 | fa.transitionIterator(), 91 | regex.unicode ? CharSet.all(0x10ffff) : CharSet.all(0xffff) 92 | ); 93 | assertEqualSnapshot(this, wordSet ? 
wordSetToString(wordSet) : "none"); 94 | }); 95 | } 96 | }); 97 | } 98 | 99 | runTests("NFA", literalToNFA); 100 | runTests("ENFA", literalToENFA); 101 | runTests("DFA", literalToDFA); 102 | runTests("DFA complement", re => { 103 | const dfa = literalToDFA(re); 104 | dfa.complement(); 105 | return dfa; 106 | }); 107 | }); 108 | }); 109 | 110 | function rangesToString(ranges: CharSet): string { 111 | let s = ""; 112 | for (const { min, max } of ranges.ranges) { 113 | if (s !== "") { 114 | s += " "; 115 | } 116 | if (min == max) { 117 | s += min.toString(16); 118 | } else { 119 | s += min.toString(16) + "-" + max.toString(16); 120 | } 121 | } 122 | return s; 123 | } 124 | -------------------------------------------------------------------------------- /tests/regex-stress-test.ts: -------------------------------------------------------------------------------- 1 | import { assert } from "chai"; 2 | import { ParseOptions, Parser } from "../src/js"; 3 | import { PrismRegexes } from "./helper/prism-regex-data"; 4 | import { NFA } from "../src/nfa"; 5 | import { DFA, ReadonlyDFA } from "../src/dfa"; 6 | import { Expression, NoParent } from "../src/ast"; 7 | import { CONFIG_ALL_PARSE_OPTIONS, CONFIG_RUN_STRESS_TEST } from "./helper/config"; 8 | import { TooManyNodesError } from "../src/errors"; 9 | 10 | /** 11 | * Setting this to `true` will enable the check that verifies that the language of the generated RE from `toRegex` is 12 | * the same as the language of the NFA/DFA that created it. 13 | * 14 | * The generated RE tends to create NFA that are both large and very non-deterministic. This means that the conversion 15 | * to DFA will create __A LOT__ nodes (sometimes >10k). Both creating and minimizing the DFA takes time (up to a minute 16 | * for a single regex). 17 | * 18 | * Only set this to `true` if you have the time to run it. 
19 | */ 20 | const CHECK_RE_LANGUAGE = false as boolean; 21 | 22 | const maxNodes = 100_000; 23 | 24 | function equalLanguage(expected: ReadonlyDFA, re: NoParent<Expression>, maxCharacter: number): void { 25 | const nfa = NFA.fromRegex(re, { maxCharacter }, { assertions: "disable" }); 26 | const dfa = DFA.fromFA(nfa, new DFA.LimitedNodeFactory(maxNodes)); 27 | dfa.minimize(); 28 | 29 | assert.isTrue(expected.structurallyEqual(dfa)); 30 | } 31 | 32 | const parseOptions: ParseOptions[] = []; 33 | if (CONFIG_ALL_PARSE_OPTIONS) { 34 | for (const assertions of ["parse", "disable", "unknown"] as ParseOptions["assertions"][]) { 35 | for (const backreferences of ["disable", "unknown"] as ParseOptions["backreferences"][]) { 36 | for (const maxBackreferenceWords of [0, 1, 10, 100]) { 37 | for (const simplify of [true, false]) { 38 | parseOptions.push({ 39 | assertions, 40 | backreferences, 41 | maxBackreferenceWords, 42 | maxNodes, 43 | simplify, 44 | }); 45 | } 46 | } 47 | } 48 | } 49 | } else { 50 | parseOptions.push({ backreferences: "disable", maxNodes }); 51 | } 52 | 53 | describe("Regex stress test", function () { 54 | if (!CONFIG_RUN_STRESS_TEST) { 55 | return; 56 | } 57 | 58 | this.timeout(60 * 1000); // timeout after a minute 59 | 60 | parseOptions.forEach(options => { 61 | describe("Parser config: " + JSON.stringify(options), function () { 62 | PrismRegexes.forEach((literal, index) => { 63 | let patternPreview = String(literal); 64 | if (patternPreview.length > 80) { 65 | patternPreview = patternPreview.substr(0, 80) + "..."; 66 | } 67 | it(`[${index}]: ${patternPreview}`, function () { 68 | try { 69 | const { expression, maxCharacter } = Parser.fromLiteral(literal).parse(options); 70 | const nfa = NFA.fromRegex( 71 | expression, 72 | { maxCharacter }, 73 | { assertions: "disable", unknowns: "disable" }, 74 | new NFA.LimitedNodeFactory(maxNodes) 75 | ); 76 | nfa.countNodes(); 77 | 78 | const re1 = nfa.toRegex({ maxNodes }); 79 | 80 | const dfa = DFA.fromFA(nfa, new 
DFA.LimitedNodeFactory(maxNodes)); 81 | const dfaOriginalCount = dfa.countNodes(); 82 | dfa.minimize(); 83 | assert.isTrue(dfa.countNodes() <= dfaOriginalCount); 84 | 85 | if (CHECK_RE_LANGUAGE) { 86 | equalLanguage(dfa, re1, maxCharacter); 87 | } 88 | 89 | const re2 = dfa.toRegex({ maxNodes }); 90 | if (CHECK_RE_LANGUAGE) { 91 | equalLanguage(dfa, re2, maxCharacter); 92 | } 93 | } catch (e) { 94 | if (!(e instanceof TooManyNodesError)) { 95 | throw e; 96 | } 97 | } 98 | }); 99 | }); 100 | }); 101 | }); 102 | }); 103 | -------------------------------------------------------------------------------- /scripts/create-case-folding.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { CharSet } from "../src/char-set"; 4 | import { printRanges } from "./util"; 5 | 6 | const caseFoldingCommon: ReadonlyMap<number, number> = require("@unicode/unicode-15.0.0/Case_Folding/C/code-points"); 7 | const caseFoldingSimple: ReadonlyMap<number, number> = require("@unicode/unicode-15.0.0/Case_Folding/S/code-points"); 8 | 9 | createCaseFoldingFile(canonicalizeIgnoreCaseUTF16, 0xffff, "UTF16", "utf16-case-folding.ts"); 10 | createCaseFoldingFile(canonicalizeIgnoreCaseUnicode, 0x10ffff, "Unicode", "unicode/case-folding.ts"); 11 | 12 | function canonicalizeIgnoreCaseUTF16(ch: number): number { 13 | // https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch 14 | 15 | const s = String.fromCharCode(ch); 16 | const u = s.toUpperCase(); 17 | if (u.length !== 1) { 18 | return ch; 19 | } 20 | const cu = u.charCodeAt(0); 21 | if (ch >= 128 && cu < 128) { 22 | return ch; 23 | } 24 | return cu; 25 | } 26 | 27 | function canonicalizeIgnoreCaseUnicode(ch: number): number { 28 | // https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch 29 | 30 | let mapping = caseFoldingCommon.get(ch); 31 | if (mapping !== undefined) { 32 | return mapping; 33 | } 34 | mapping = 
caseFoldingSimple.get(ch); 35 | if (mapping !== undefined) { 36 | return mapping; 37 | } 38 | 39 | return ch; 40 | } 41 | 42 | function createCaseFoldingFile( 43 | canonicalize: (ch: number) => number, 44 | maxCharacter: number, 45 | variablePrefix: string, 46 | filename: string 47 | ): void { 48 | const canonicalizeMapping = new Map<number, number[]>(); 49 | for (let ch = 0; ch <= maxCharacter; ch++) { 50 | const c = canonicalize(ch); 51 | let list = canonicalizeMapping.get(c); 52 | if (list === undefined) { 53 | canonicalizeMapping.set(c, (list = [])); 54 | } 55 | list.push(ch); 56 | } 57 | 58 | const caseFolding: number[][] = []; 59 | canonicalizeMapping.forEach(chars => { 60 | chars.forEach(c => { 61 | caseFolding[c] = chars; 62 | }); 63 | }); 64 | 65 | let count = 0; 66 | const CASE_VARYING = CharSet.fromCharacters( 67 | maxCharacter, 68 | (function* () { 69 | for (let i = 0; i < maxCharacter; i++) { 70 | const fold = caseFolding[i]; 71 | if (fold.indexOf(i) === -1) { 72 | throw new Error(`The case folding of ${i} does not include itself.`); 73 | } 74 | if (fold.length > 1) { 75 | count++; 76 | yield i; 77 | } 78 | } 79 | })() 80 | ); 81 | 82 | const map: Record<number, number[]> = {}; 83 | caseFolding.forEach((fold, i) => { 84 | if (fold.length > 1) { 85 | map[i] = fold; 86 | } 87 | }); 88 | 89 | console.log(`${variablePrefix}: ${count} characters vary in case`); 90 | 91 | const code = `/* eslint-disable */ 92 | 93 | // DO NOT EDIT! 94 | // THIS FILE IS GENERATED BY scripts/create-case-folding.js 95 | 96 | import { CharSet } from "${"../".repeat(filename.split(/\//g).length)}char-set"; 97 | 98 | 99 | /** 100 | * A character set of all characters that have at least one case variation. 101 | */ 102 | export const ${variablePrefix}CaseVarying: CharSet = CharSet.empty(${maxCharacter}).union(${printRanges( 103 | CASE_VARYING.ranges 104 | )}); 105 | 106 | /** 107 | * A map for a given character to all it case variations. 
The list of case variations also includes the key character 108 | * itself. 109 | * 110 | * If the given character do not have case variations, it will not be part of this map. 111 | */ 112 | export const ${variablePrefix}CaseFolding: Readonly<Partial<Record<number, readonly number[]>>> = JSON.parse('${JSON.stringify( 113 | map 114 | )}'); 115 | `; 116 | 117 | fs.writeFileSync(path.join(__dirname, "../src/js", filename), code, "utf-8"); 118 | } 119 | -------------------------------------------------------------------------------- /src/transformers/inline.ts: -------------------------------------------------------------------------------- 1 | import { Assertion, Concatenation, NoParent, Parent, TransformContext, Transformer } from "../ast"; 2 | import { CreationOptions } from "./creation-options"; 3 | import { tryInlineAssertions } from "./util"; 4 | 5 | function onConcatenation(node: NoParent<Concatenation>, { signalMutation }: TransformContext): void { 6 | for (let i = 0; i < node.elements.length; i++) { 7 | const current = node.elements[i]; 8 | 9 | if (current.type === "Alternation") { 10 | if (current.alternatives.length === 1) { 11 | // inline alternatives with only a single alternative (e.g. a(?:bc)d == abcd) 12 | node.elements.splice(i, 1, ...current.alternatives[0].elements); 13 | signalMutation(); 14 | i--; 15 | } 16 | } else if (current.type === "Quantifier") { 17 | if (current.max === 0) { 18 | // remove 0 quantifiers (e.g. ab{0}c == ac) 19 | node.elements.splice(i, 1); 20 | signalMutation(); 21 | i--; 22 | } else if (current.min === 1 && current.max === 1) { 23 | // inline constant-one quantifiers (e.g. 
ab{1}c == abc) 24 | if (current.alternatives.length === 1) { 25 | node.elements.splice(i, 1, ...current.alternatives[0].elements); 26 | signalMutation(); 27 | i--; 28 | } else { 29 | node.elements[i] = { 30 | type: "Alternation", 31 | alternatives: current.alternatives, 32 | source: current.source, 33 | }; 34 | signalMutation(); 35 | } 36 | } 37 | } 38 | } 39 | } 40 | function onParent(node: NoParent<Parent>, { signalMutation }: TransformContext): void { 41 | for (let i = 0; i < node.alternatives.length; i++) { 42 | const { elements } = node.alternatives[i]; 43 | if (elements.length === 1) { 44 | const first = elements[0]; 45 | if (first.type === "Alternation") { 46 | node.alternatives.splice(i, 1, ...first.alternatives); 47 | signalMutation(); 48 | i--; 49 | } 50 | } 51 | } 52 | } 53 | function onAssertion(node: NoParent<Assertion>, context: TransformContext): void { 54 | onParent(node, context); 55 | 56 | if (node.alternatives.length === 1 && node.alternatives[0].elements.length === 1) { 57 | const single = node.alternatives[0].elements[0]; 58 | if (single.type === "Assertion") { 59 | // inline simple nested assertions (e.g. (?!(?<=a)) == (?<!a)) 60 | node.kind = single.kind; 61 | node.negate = node.negate !== single.negate; 62 | node.alternatives = single.alternatives; 63 | context.signalMutation(); 64 | return; 65 | } 66 | } 67 | 68 | // inline nested assertions (e.g. (?=a(?=b)) == (?=ab)) 69 | 70 | if (tryInlineAssertions(node.alternatives, node.kind)) { 71 | context.signalMutation(); 72 | } 73 | } 74 | 75 | /** 76 | * This transformer will simplify the AST by doing trivial inlining operations. 77 | * 78 | * It will: 79 | * 80 | * 1. Inline single-alternative alternations in concatenation (e.g. `a(?:b)c` => `abc`). 81 | * 2. Inline single-alternation concatenations (e.g. `(?:(?:a|b)|c)` => `(?:a|b|c)`). 82 | * 3. Inline constant-one quantifiers (e.g. `ab{1}c` => `abc`). 83 | * 4. Remove constant-zero quantifiers (e.g. `ab{0}c` => `ac`). 84 | * 5. 
Inline trivially nested assertions (e.g. `(?!(?<!a))` => `(?<=a)`). 85 | * 6. Inline nested assertions at the end of the expression tree (e.g. `(?!a(?=b))` => `(?!ab)`). 86 | * 87 | * --- 88 | * 89 | * This transformer should be used in combination with {@link removeDeadBranches} to handle trivial simplifications. 90 | * 91 | * @param _options 92 | */ 93 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 94 | export function inline(_options?: Readonly<CreationOptions>): Transformer { 95 | // we can safely ignore the options as order and ambiguity are guaranteed to be preserved 96 | return { 97 | name: "inline", 98 | onConcatenation, 99 | onAssertion, 100 | 101 | onAlternation: onParent, 102 | onExpression: onParent, 103 | onQuantifier: onParent, 104 | }; 105 | } 106 | -------------------------------------------------------------------------------- /tests/helper/snapshot.ts: -------------------------------------------------------------------------------- 1 | import { assert } from "chai"; 2 | import { basename, dirname, join } from "path"; 3 | import { existsSync, mkdirSync, writeFileSync } from "fs"; 4 | import { CONFIG_UPDATE } from "./config"; 5 | 6 | function getSnapshotFilePath(testFile: string): string { 7 | return join(dirname(testFile), "__snapshots__", basename(testFile)); 8 | } 9 | 10 | function getTitlePath(test: Mocha.Runnable): string { 11 | const path = [test.title]; 12 | for (let x = test.parent; x; x = x.parent) { 13 | path.push(x.title); 14 | } 15 | path.pop(); 16 | path.reverse(); 17 | return path.join(" >> "); 18 | } 19 | 20 | function getSnapshot(file: string, title: string): string { 21 | // eslint-disable-next-line @typescript-eslint/no-var-requires 22 | const snap = require(file); 23 | 24 | if (!Object.prototype.hasOwnProperty.call(snap, title)) { 25 | throw new Error(`Cannot find snapshot for test ${title}`); 26 | } 27 | 28 | const value = snap[title]; 29 | if (typeof value !== "string") { 30 | throw new Error(`Cannot find 
snapshot for test ${title}`); 31 | } 32 | 33 | return value; 34 | } 35 | 36 | function getRoot(test: Mocha.Runnable): Mocha.Suite { 37 | let p = test.parent; 38 | if (!p) { 39 | throw new Error(); 40 | } 41 | while (p.parent) { 42 | p = p.parent; 43 | } 44 | return p; 45 | } 46 | 47 | const toUpdate = new Map<string, Map<string, string>>(); 48 | 49 | const escapeRegex = /(\\*)(`|\$\{|\\u(?![a-fA-F0-9]{4}))/g; 50 | function escapeBackslashes(value: string): string { 51 | return value.replace(escapeRegex, (m, backslashes: string, c: string) => { 52 | return backslashes + backslashes + "\\" + c; 53 | }); 54 | } 55 | 56 | function createSnapshot(values: Map<string, string>): string { 57 | let s = `/* eslint-disable */ 58 | 59 | var unescapeBackslashes = (str: string): string => { 60 | return str.replace(${escapeRegex}, (m, backslashes: string, c: string) => { 61 | return "\\\\".repeat(Math.floor(backslashes.length / 2)) + c; 62 | }); 63 | }; 64 | var lit = (array: TemplateStringsArray): string => { 65 | return unescapeBackslashes(array.raw[0].slice(1, -1)); 66 | }; 67 | var n = (array: TemplateStringsArray): string => { 68 | return unescapeBackslashes(array.raw[0].slice(0, -1)); 69 | }; 70 | `; 71 | 72 | for (const [title, value] of values) { 73 | s += `\nmodule.exports[n\`${escapeBackslashes(title)} \`] = lit\`\n${escapeBackslashes(value)}\n\`;\n`; 74 | } 75 | 76 | return s; 77 | } 78 | 79 | const registered = new Set<Mocha.Suite>(); 80 | function register(suite: Mocha.Suite): void { 81 | if (registered.has(suite)) { 82 | return; 83 | } 84 | registered.add(suite); 85 | 86 | suite.afterAll(() => { 87 | for (const [file, values] of toUpdate) { 88 | mkdirSync(dirname(file), { recursive: true }); 89 | writeFileSync(file, createSnapshot(values), "utf8"); 90 | } 91 | }); 92 | } 93 | 94 | function updateSnapshot(test: Mocha.Runnable, file: string, title: string, value: string): void { 95 | register(getRoot(test)); 96 | 97 | let map = toUpdate.get(file); 98 | if (map === 
undefined) { 99 | map = new Map(); 100 | toUpdate.set(file, map); 101 | } 102 | 103 | if (map.has(title)) { 104 | throw new Error("There can be only one snapshot value per test case."); 105 | } 106 | map.set(title, value); 107 | } 108 | 109 | export function assertEqualSnapshot(context: Mocha.Context, actual: string, message?: string): void { 110 | const { test } = context; 111 | if (!test) { 112 | throw new Error("`test` property not set."); 113 | } 114 | 115 | const file = getSnapshotFilePath(test.file!); 116 | const title = getTitlePath(test); 117 | 118 | let fileExists; 119 | try { 120 | fileExists = existsSync(require.resolve(file)); 121 | } catch { 122 | fileExists = false; 123 | } 124 | 125 | if (CONFIG_UPDATE || !fileExists) { 126 | updateSnapshot(test, file, title, actual); 127 | } else { 128 | assert.strictEqual(actual, getSnapshot(file, title), message); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/iter/make-deterministic.ts: -------------------------------------------------------------------------------- 1 | import { CharSet } from "../char-set"; 2 | import { CharBase } from "../char-base"; 3 | import { FABuilder, FAIterator } from "../fa-types"; 4 | import { Char } from "../char-types"; 5 | import { debugAssert, filterMut, traverse } from "../util"; 6 | import { ensureStableOut } from "./iterator"; 7 | 8 | /** 9 | * This will return an iterator that iteratively create a DFA using the given {@link FABuilder}. 10 | * 11 | * This operation may produce up to _2^O(n)_ many states. The builder should limit the number of states created. 
12 | * 13 | * @param builder 14 | * @param iter 15 | */ 16 | export function makeDeterministic<B, I>( 17 | builder: FABuilder<B, CharSet>, 18 | iter: FAIterator<I, Iterable<[I, CharSet]>> 19 | ): FAIterator<B, B> { 20 | iter = ensureStableOut(iter); 21 | 22 | const { charSets, ids } = getInfo(iter); 23 | const alphabet = new CharBase(charSets); 24 | 25 | // This will use the subset method to construct the DFA. 26 | 27 | const inputNodesToDfaNodeMap = new Map<string, B>(); 28 | const dfaNodeToInputNodesMap = new Map<B, readonly I[]>(); 29 | function getKey(nodes: readonly I[]): string { 30 | let key = ""; 31 | for (let i = 0, l = nodes.length; i < l; i++) { 32 | key += "," + ids.get(nodes[i])!; 33 | } 34 | return key; 35 | } 36 | function getDfaNode(nodes: I[]): B { 37 | // sort 38 | nodes.sort((a, b) => ids.get(a)! - ids.get(b)!); 39 | // remove duplicates 40 | filterMut(nodes, (n, prev) => n !== prev); 41 | 42 | const key = getKey(nodes); 43 | let dfaNode = inputNodesToDfaNodeMap.get(key); 44 | if (dfaNode === undefined) { 45 | // this will create a new node AND set it as final if it contains a final NFA state 46 | dfaNode = builder.createNode(); 47 | if (nodes.some(n => iter.isFinal(n))) { 48 | builder.makeFinal(dfaNode); 49 | } 50 | 51 | inputNodesToDfaNodeMap.set(key, dfaNode); 52 | dfaNodeToInputNodesMap.set(dfaNode, nodes); 53 | } 54 | return dfaNode; 55 | } 56 | function getInputNodes(node: B): readonly I[] { 57 | const nodes = dfaNodeToInputNodesMap.get(node); 58 | debugAssert(nodes !== undefined, "Unregistered DFA node."); 59 | return nodes; 60 | } 61 | // set initial states 62 | inputNodesToDfaNodeMap.set(getKey([iter.initial]), builder.initial); 63 | dfaNodeToInputNodesMap.set(builder.initial, [iter.initial]); 64 | if (iter.isFinal(iter.initial)) { 65 | builder.makeFinal(builder.initial); 66 | } 67 | 68 | function getOutNode(inputNodes: readonly I[], char: Char): B | undefined { 69 | const outNodes: I[] = []; 70 | 71 | for (const inputNode of inputNodes) 
{ 72 | for (const [to, via] of iter.getOut(inputNode)) { 73 | if (via.has(char)) { 74 | outNodes.push(to); 75 | } 76 | } 77 | } 78 | 79 | if (outNodes.length === 0) { 80 | // this is the most likely event 81 | // we save all transitions going to a trap state 82 | return undefined; 83 | } else { 84 | return getDfaNode(outNodes); 85 | } 86 | } 87 | 88 | return { 89 | initial: builder.initial, 90 | getOut: state => { 91 | const inputNodes = getInputNodes(state); 92 | for (const set of alphabet.sets) { 93 | const out = getOutNode(inputNodes, set.ranges[0].min); 94 | if (out !== undefined) { 95 | builder.linkNodes(state, out, set); 96 | } 97 | } 98 | 99 | return state; 100 | }, 101 | isFinal: s => builder.isFinal(s), 102 | }; 103 | } 104 | 105 | interface Info<S> { 106 | charSets: Set<CharSet>; 107 | ids: ReadonlyMap<S, number>; 108 | } 109 | 110 | function getInfo<S>(iter: FAIterator<S, Iterable<[S, CharSet]>>): Info<S> { 111 | const charSets = new Set<CharSet>(); 112 | const ids = new Map<S, number>(); 113 | let id = 0; 114 | 115 | traverse(iter.initial, (n, queue) => { 116 | ids.set(n, id); 117 | id++; 118 | 119 | for (const [to, via] of iter.getOut(n)) { 120 | charSets.add(via); 121 | queue.push(to); 122 | } 123 | }); 124 | 125 | return { charSets, ids }; 126 | } 127 | -------------------------------------------------------------------------------- /tests/intersection.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-explicit-any */ 2 | import { assert } from "chai"; 3 | import { TransitionIterable } from "../src/fa-types"; 4 | import { ReadonlyWord } from "../src/char-types"; 5 | import { DFA } from "../src/dfa"; 6 | import { ENFA } from "../src/enfa"; 7 | import { getIntersectionWordSets, getIntersectionWords, isDisjointWith } from "../src/intersection"; 8 | import { Literal } from "../src/js"; 9 | import { NFA } from "../src/nfa"; 10 | import { iterToArray } from "../src/util"; 11 | 
import { fromUnicodeToString } from "../src/words"; 12 | import { literalToDFA, literalToENFA, literalToNFA, literalToString } from "./helper/fa"; 13 | 14 | describe("intersection", function () { 15 | const cases: RegExp[] = [ 16 | /a/, 17 | /b/, 18 | /a*/, 19 | /A/i, 20 | /b*(ab+)*a/, 21 | /a*(ba+)*/, 22 | /a+/, 23 | /(?:a+){2,}/, 24 | /(?:[^>"'[\]]|"[^"]*"|'[^']*')/, 25 | /(?:[^>"'[\]]|"[^"]*"|'[^']*'){2,}/, 26 | /"(?:[^"\\r\n]|\\.)*"/, 27 | /\/\*[^]*?\*\//, 28 | ]; 29 | 30 | type FA = { nfa: NFA; dfa: DFA; enfa: ENFA }; 31 | const FA_TYPE_KEYS: (keyof FA)[] = ["nfa", "dfa", "enfa"]; 32 | function toFA(literal: Literal): FA { 33 | const nfa = literalToNFA(literal); 34 | const dfa = literalToDFA(literal); 35 | const enfa = literalToENFA(literal); 36 | 37 | return { nfa, dfa, enfa }; 38 | } 39 | 40 | function testAllCombinations<E>( 41 | getTitle: (left: string, right: string) => string, 42 | getExpected: (a: FA, b: FA) => E, 43 | getActual: <A, B>(a: TransitionIterable<A>, b: TransitionIterable<B>) => E 44 | ): void { 45 | for (const left of cases) { 46 | for (const right of cases) { 47 | it(getTitle(literalToString(left), literalToString(right)), function () { 48 | const l = toFA(left); 49 | const r = toFA(right); 50 | 51 | const expected = getExpected(l, r); 52 | 53 | for (const lKey of FA_TYPE_KEYS) { 54 | for (const rKey of FA_TYPE_KEYS) { 55 | const combination = `${lKey} and ${rKey}`; 56 | const actual = getActual<any, any>(l[lKey], r[rKey]); 57 | assert.deepStrictEqual(actual, expected, combination); 58 | } 59 | } 60 | }); 61 | } 62 | } 63 | } 64 | 65 | describe(isDisjointWith.name, function () { 66 | testAllCombinations( 67 | (left, right) => `${left} ∩ ${right} = ∅`, 68 | (left, right) => NFA.fromIntersection(left.nfa, right.nfa).isEmpty, 69 | (a, b) => isDisjointWith(a, b) 70 | ); 71 | }); 72 | 73 | describe(getIntersectionWordSets.name, function () { 74 | function isFiniteIterable<T>(iter: Iterable<T>, upperLimit: number = 1000): boolean { 75 | let 
counter = 0; 76 | // eslint-disable-next-line @typescript-eslint/no-unused-vars 77 | for (const _ of iter) { 78 | if (++counter > upperLimit) { 79 | return false; 80 | } 81 | } 82 | return true; 83 | } 84 | 85 | testAllCombinations( 86 | (left, right) => `${left} ∩ ${right} is finite`, 87 | (left, right) => NFA.fromIntersection(left.nfa, right.nfa).isFinite, 88 | (a, b) => isFiniteIterable(getIntersectionWordSets(a, b)) 89 | ); 90 | }); 91 | 92 | describe(getIntersectionWords.name, function () { 93 | function takeAtMostOrNothing<T>(iter: Iterable<T>, upperLimit: number = 100): T[] | null { 94 | const results: T[] = []; 95 | for (const item of iter) { 96 | results.push(item); 97 | if (results.length > upperLimit) { 98 | return null; 99 | } 100 | } 101 | return results; 102 | } 103 | 104 | function toStrings(iter: Iterable<ReadonlyWord> | null): string[] | null { 105 | if (!iter) { 106 | return null; 107 | } 108 | return iterToArray(iter).map(fromUnicodeToString).sort(); 109 | } 110 | 111 | testAllCombinations( 112 | (left, right) => `${left} ∩ ${right}`, 113 | (left, right) => toStrings(takeAtMostOrNothing(NFA.fromIntersection(left.nfa, right.nfa).words())), 114 | (a, b) => toStrings(takeAtMostOrNothing(getIntersectionWords(a, b))) 115 | ); 116 | }); 117 | }); 118 | -------------------------------------------------------------------------------- /tests/helper/transform.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/naming-convention */ 2 | import { assert } from "chai"; 3 | import { Node, TransformOptions, Transformer, transform } from "../../src/ast"; 4 | import { TooManyNodesError } from "../../src/errors"; 5 | import { Literal, Parser, toLiteral } from "../../src/js"; 6 | import { CONFIG_RUN_TRANSFORMERS } from "./config"; 7 | import { literalToString } from "./fa"; 8 | import { PrismRegexes } from "./prism-regex-data"; 9 | import { assertEqualSnapshot } from "./snapshot"; 10 | 11 | 
export interface TransformTestCase { 12 | literal: Literal | string; 13 | transformer?: Transformer; 14 | options?: TransformOptions; 15 | expected?: Literal | string; 16 | stepByStep?: boolean; 17 | debug?: boolean; 18 | } 19 | 20 | function toTestCase(literal: TransformTestCase | Literal | string): TransformTestCase { 21 | if (typeof literal === "string") { 22 | return { literal }; 23 | } else if ("literal" in literal) { 24 | return literal; 25 | } else { 26 | return { literal }; 27 | } 28 | } 29 | 30 | export function itTest(defaultTransformer: Transformer, cases: Iterable<TransformTestCase | Literal | string>): void; 31 | export function itTest( 32 | defaultTransformer: null, 33 | cases: Iterable<TransformTestCase & { transformer: Transformer }> 34 | ): void; 35 | export function itTest( 36 | defaultTransformer: Transformer | null, 37 | cases: Iterable<TransformTestCase | Literal | string> 38 | ): void { 39 | for (const { 40 | literal, 41 | transformer = defaultTransformer, 42 | options = {}, 43 | expected, 44 | debug = false, 45 | stepByStep = false, 46 | } of [...cases].map(toTestCase)) { 47 | it(literalToString(literal), function () { 48 | if (debug) { 49 | // eslint-disable-next-line no-debugger 50 | debugger; 51 | } 52 | 53 | if (!transformer) { 54 | throw new Error("No transformer"); 55 | } 56 | 57 | const { expression } = Parser.fromLiteral(literal).parse({ 58 | backreferences: "throw", 59 | assertions: "parse", 60 | simplify: false, 61 | }); 62 | 63 | let actualStr: string; 64 | if (stepByStep) { 65 | const shortName: Record<Node["type"], string> = { 66 | Alternation: "Alt", 67 | Assertion: "Asr", 68 | CharacterClass: "Chr", 69 | Concatenation: "Con", 70 | Expression: "Exp", 71 | Quantifier: "Qnt", 72 | Unknown: "Unk", 73 | }; 74 | 75 | const steps: [string, string][] = [["Start:", literalToString(toLiteral(expression))]]; 76 | const transformedExpression = transform(transformer, expression, { 77 | ...options, 78 | events: { 79 | onPassStart(ast, pass) 
{ 80 | steps.push([`Pass ${pass}`, ""]); 81 | }, 82 | onChange(ast, node, transformer) { 83 | const patternStr = literalToString(toLiteral(ast)); 84 | steps.push([`${shortName[node.type]} ${transformer.name ?? "<unnamed>"}:`, patternStr]); 85 | }, 86 | }, 87 | }); 88 | steps.push(["Final:", literalToString(toLiteral(transformedExpression))]); 89 | const maxLength = Math.max(...steps.map(([name]) => name.length)); 90 | actualStr = steps 91 | .map(([name, value]) => (name + " ").padEnd(maxLength + 1) + value) 92 | .map(s => s.trimEnd()) 93 | .join("\n"); 94 | } else { 95 | const transformedExpression = transform(transformer, expression, options); 96 | actualStr = literalToString(toLiteral(transformedExpression)); 97 | } 98 | 99 | if (expected === undefined) { 100 | assertEqualSnapshot(this, actualStr); 101 | } else { 102 | const expectedStr = literalToString(expected); 103 | assert.strictEqual(actualStr, expectedStr); 104 | } 105 | }); 106 | } 107 | } 108 | 109 | export function regexSnapshot(context: Mocha.Context, transformer: Transformer): void { 110 | if (!CONFIG_RUN_TRANSFORMERS) { 111 | return; 112 | } 113 | 114 | context.timeout(60 * 1000); // timeout after 1 minute 115 | 116 | const actual = PrismRegexes.map(re => { 117 | try { 118 | const { expression } = Parser.fromLiteral(re).parse({ backreferences: "unknown" }); 119 | return literalToString(toLiteral(transform(transformer, expression, { maxPasses: 20 }))); 120 | } catch (e) { 121 | if (e instanceof TooManyNodesError) { 122 | return "TooManyNodesError"; 123 | } 124 | throw e; 125 | } 126 | }).join("\n"); 127 | 128 | assertEqualSnapshot(context, actual); 129 | } 130 | -------------------------------------------------------------------------------- /src/iter/to-dot.ts: -------------------------------------------------------------------------------- 1 | import { FAIterator } from "../fa-types"; 2 | import { NodeInfo, SimplePrintOptions } from "./print-common"; 3 | import { indexNodes } from "./print-util"; 4 
| 5 | export function toDot<S, T>( 6 | iter: FAIterator<S, Iterable<[S, T]>>, 7 | options: ToDotOptions<S, T> | SimplePrintOptions<T> 8 | ): string { 9 | const { 10 | getEdgeAttributes, 11 | getGraphAttributes = DEFAULT_GRAPH_ATTRIBUTES, 12 | getNodeAttributes = DEFAULT_GET_NODE_ATTRIBUTES, 13 | } = "transitionToString" in options ? fromSimpleOptions(options) : options; 14 | 15 | const { stableIter, states, info } = indexNodes(iter); 16 | 17 | let s = ""; 18 | 19 | function writeID(value: string): void { 20 | if (/^[a-zA-Z_]\w*$/.test(value)) { 21 | s += value; 22 | } else { 23 | let escaped = value.replace(/[\\"]/g, m => "\\" + m); 24 | if (escaped[escaped.length - 1] === "\\") { 25 | escaped += " "; 26 | } 27 | s += '"' + escaped + '"'; 28 | } 29 | } 30 | function writeValue(value: string | number): void { 31 | if (typeof value === "number") { 32 | const strValue = String(value); 33 | if (/^-?(?:\.\d+|\d+(?:\.\d*)?)$/.test(strValue)) { 34 | s += strValue; 35 | } else { 36 | writeID(strValue); 37 | } 38 | } else { 39 | writeID(value); 40 | } 41 | } 42 | function writeAttrs(attrs: Readonly<ToDotAttrs>): void { 43 | s += "["; 44 | let first = true; 45 | for (const key in attrs) { 46 | if (Object.prototype.hasOwnProperty.call(attrs, key)) { 47 | const value = attrs[key]; 48 | if (value === undefined) { 49 | continue; 50 | } 51 | 52 | if (first) { 53 | first = false; 54 | } else { 55 | s += ", "; 56 | } 57 | 58 | writeID(key); 59 | s += "="; 60 | writeValue(value); 61 | } 62 | } 63 | s += "]"; 64 | } 65 | function writeNodeLabel(node: S): void { 66 | s += "n" + info.getId(node); 67 | } 68 | function writeNodeLabelFromIndex(index: number): void { 69 | s += "n" + index; 70 | } 71 | 72 | s += "digraph "; 73 | s += "{\n"; 74 | 75 | // graph attributes 76 | s += "\t// graph attributes\n"; 77 | const graphAttrs = getGraphAttributes(); 78 | for (const key in graphAttrs) { 79 | if (Object.prototype.hasOwnProperty.call(graphAttrs, key)) { 80 | const value = graphAttrs[key]; 81 
| if (value === undefined) { 82 | continue; 83 | } 84 | 85 | s += "\t"; 86 | writeID(key); 87 | s += "="; 88 | writeValue(value); 89 | s += ";\n"; 90 | } 91 | } 92 | 93 | // nodes 94 | s += "\n\t// nodes\n"; 95 | s += "\tnull [shape=point];\n"; 96 | states.forEach((node, i) => { 97 | s += "\t"; 98 | writeNodeLabelFromIndex(i); 99 | s += " "; 100 | writeAttrs(getNodeAttributes(node, info)); 101 | s += ";\n"; 102 | }); 103 | 104 | // edges 105 | s += "\n\t// edges\n"; 106 | s += "\tnull -> n0;\n"; 107 | states.forEach((node, i) => { 108 | stableIter.getOut(node).forEach(([to, trans], nth) => { 109 | s += "\t"; 110 | writeNodeLabelFromIndex(i); 111 | s += " -> "; 112 | writeNodeLabel(to); 113 | s += " "; 114 | writeAttrs(getEdgeAttributes(trans, nth, node, to, info)); 115 | s += ";\n"; 116 | }); 117 | }); 118 | 119 | s += "}"; 120 | 121 | return s; 122 | } 123 | 124 | export type ToDotAttrs = Record<string, string | number | undefined>; 125 | export interface ToDotOptions<S, T> { 126 | getEdgeAttributes: (transition: T, nth: number, from: S, to: S, info: NodeInfo<S>) => Readonly<ToDotAttrs>; 127 | getGraphAttributes?: () => Readonly<ToDotAttrs>; 128 | getNodeAttributes?: (node: S, info: NodeInfo<S>) => Readonly<ToDotAttrs>; 129 | } 130 | 131 | function fromSimpleOptions<S, T>({ transitionToString, ordered = false }: SimplePrintOptions<T>): ToDotOptions<S, T> { 132 | return { 133 | getEdgeAttributes(trans, nth, from, _, info) { 134 | const attrs: ToDotAttrs = { 135 | label: transitionToString(trans), 136 | }; 137 | 138 | if (ordered && info.getNumberOfOutgoingEdges(from) > 1) { 139 | attrs["tail" + "label"] = String(nth + 1); 140 | } 141 | 142 | return attrs; 143 | }, 144 | }; 145 | } 146 | 147 | const DEFAULT_GET_NODE_ATTRIBUTES: NonNullable<ToDotOptions<unknown, never>["getNodeAttributes"]> = (node, info) => { 148 | return { 149 | label: String(info.getId(node)), 150 | shape: info.isFinal(node) ? 
"doublecircle" : "circle", 151 | }; 152 | }; 153 | 154 | const DEFAULT_GRAPH_ATTRIBUTES: NonNullable<ToDotOptions<unknown, never>["getGraphAttributes"]> = () => { 155 | return { rankdir: "LR" }; 156 | }; 157 | -------------------------------------------------------------------------------- /tests/transformers/apply-assertions.ts: -------------------------------------------------------------------------------- 1 | import { itTest, regexSnapshot } from "../helper/transform"; 2 | import { applyAssertions, inline, removeDeadBranches } from "../../src/transformers"; 3 | import { CombinedTransformer } from "../../src/ast"; 4 | 5 | describe("Transformers", function () { 6 | describe(/[\w-]+(?=\.\w+)/i.exec(__filename)![0], function () { 7 | const transformer = applyAssertions(); 8 | const optimizedTransformer = new CombinedTransformer([applyAssertions(), inline(), removeDeadBranches()]); 9 | 10 | itTest(transformer, [ 11 | /(?=\d)\wa/, 12 | /(?!\d)\wa/, 13 | /(?=\w[^9])a\d/, 14 | /\da(?<=[^9]\w)/, 15 | { 16 | literal: /(?!4)(?=\d)\w(?<!2)a(?<=[^9]a)/, 17 | options: { maxPasses: 1 }, 18 | }, 19 | 20 | /(?=a)\wa/, 21 | /(?=aa)\wa/, 22 | /(?=aaa)\wa/, 23 | /(?!a)aa/, 24 | /(?!aa)\wa/, 25 | /(?!aaa)\wa/, 26 | /a$a/, 27 | /a^a/, 28 | 29 | /(?!a|""")./s, 30 | /(?!a)(?!""")./s, 31 | 32 | /(?=foo)foo/i, 33 | /(?!foo)bar/i, 34 | /(?!food)foo/i, 35 | /(?!food)foot/i, 36 | /(?!food|foobar)foot/i, 37 | /(?!food)foo\w/i, 38 | /(?!food|foot)foo\w/i, 39 | /(?!fo{6})foo\w/i, 40 | 41 | /\(\s*(?!\s)(?:[^()]|\([^()]*\))*?(?=\s*\))/, 42 | /\(\s*(?!\s)(?:[^()]|\([^()]*\))+?(?=\s*\))/, 43 | 44 | /(?!\d)(?<!\w)\w/, 45 | 46 | /(?!\d)\w+/, 47 | /\w+(?<!\d)/, 48 | /(?=\da)\w+/, 49 | /\w+(?<=a\d)/, 50 | /(?!\d)\w{1,2}/, 51 | /(?!\d)\w*/, 52 | 53 | /(?=foobar)\w*\s/, 54 | /(?=foobar)\w*(?!\w)/, 55 | /(?=foobar)\w*(?![bfo])/, 56 | /(?=foobar)\w*$/, 57 | /(?=fo{4})\w*$/, 58 | /^\w*(?<=foobar)$/, 59 | /(?!foobar)\w*\s/, 60 | /(?!foobar)\w*(?!\w)/, 61 | /(?!foobar)\w*(?![bfo])/, 62 | /(?!foobar)\w*$/, 
63 | /^\w+(?<!foobar)$/, 64 | 65 | /(?!\d)(?:\w+|:|123)/, 66 | /(?=\d)\s*\w+/, 67 | /a$(?:a|b)/, 68 | /a$(?:a|b)?/, 69 | /a$(?:a|b|)/, 70 | 71 | /(a(?!b))+/, 72 | /(a(?!b))*/, 73 | /(a(?!b))*?/, 74 | /(a(?!b)){0,4}/, 75 | /(a(?!b)){1,2}/, 76 | /(\w(?=\d))*/, 77 | 78 | /(?:fo(?=o)|ba(?=r))\w+/, 79 | /(?:fo(?=o)|ba(?=r))?\w+/, 80 | /(?:fo(?=o)|ba(?=r))??\w+/, 81 | /(?:f(?=oo)|ba(?=r))\w+/, 82 | /(?:f(?=oo)|ba(?=r))?\w+/, 83 | 84 | /(?!\s)[^]*\S/, 85 | /(?=a)[^]*/, 86 | /(?=a)[^]*?/, 87 | /(?!a)[^]*?/, 88 | /(?=a)[^]*b/, 89 | /(?=a)[^]*a/, 90 | /(?=\d)[^]*\w/, 91 | /(?=\d)[^]*\w+/, 92 | /(?=\d)[^]?\w+/, 93 | /=begin\s[^]*^=end/m, 94 | /-?(?<!\w)\d+(?:\.\d+)?(?:E[-+]?\d+)?/i, 95 | /(?:a(?!\d)|foo(?=\w)|bar(?!\w)|b)\w+/i, 96 | /(?:\[)(?!\d)\w+(?=\])/i, 97 | 98 | /(?:^|[^&])(?<!\w)(?:TRUE|FALSE)/i, 99 | 100 | { literal: /""((?!"").)*""/s, stepByStep: true, transformer: optimizedTransformer }, 101 | { literal: /""(.(?<!""))*""/s, stepByStep: true, transformer: optimizedTransformer }, 102 | { literal: /""((?!"")(?:[^\\]|\\"))*""/s, transformer: optimizedTransformer }, 103 | { literal: /""((?!"")(?:[^\\]|\\"))+""/s, transformer: optimizedTransformer }, 104 | { literal: /"""((?!""").)*"""/s, stepByStep: true, transformer: optimizedTransformer }, 105 | { literal: /"""((?!""").)+"""/s, transformer: optimizedTransformer }, 106 | 107 | /(?:^|[^.]|\.\.\.\s*)(?<!\w)(?:as|async(?=\s*(?:function(?!\w)|\(|[\w$\xa0-\uffff]|$))|await|break|case|class|const|continue|debugger|default|delete|do|else|enum|export|extends|for|from|function|[gs]et(?=\s*[\w$[\xa0-\uffff])|if|implements|import|in|instanceof|interface|let|new|null|of|package|private|protected|public|return|static|super|switch|this|throw|try|typeof|undefined|var|void|while|with|yield)(?!\w)/, 108 | { 109 | literal: /(?:\b[a-z][a-z_\d]*\s*::\s*)*\b[a-z][a-z_\d]*\s*::(?!\s*<)/, 110 | transformer: optimizedTransformer, 111 | stepByStep: true, 112 | }, 113 | { 114 | literal: /(?:[-!$%&*+/:<=>?@\\|~^]|\.\.|\.(?![)\]}]))+/, 115 | 
transformer: optimizedTransformer, 116 | stepByStep: true, 117 | }, 118 | { 119 | literal: /(?:[a-z%]|(?<!\w)_+(?!\w))+/, 120 | transformer: optimizedTransformer, 121 | stepByStep: true, 122 | }, 123 | { 124 | literal: /(?:[a-z]|(?<!\w)_+(?!\w))+/, 125 | transformer: optimizedTransformer, 126 | }, 127 | { 128 | literal: /<title>(?:(?!<\/title>).)*<\/title>/s, 129 | transformer: optimizedTransformer, 130 | stepByStep: true, 131 | }, 132 | ]); 133 | 134 | it("Prism regex snapshot", function () { 135 | const transformer = new CombinedTransformer([applyAssertions(), inline(), removeDeadBranches()]); 136 | 137 | regexSnapshot(this, transformer); 138 | }); 139 | }); 140 | }); 141 | -------------------------------------------------------------------------------- /src/transformers/move-up-empty.ts: -------------------------------------------------------------------------------- 1 | import { Concatenation, NoParent, Parent, Quantifier, SourceLocation, TransformContext, Transformer } from "../ast"; 2 | import { isEmpty, isPotentiallyEmpty } from "../ast-analysis"; 3 | import { filterMut } from "../util"; 4 | import { CreationOptions } from "./creation-options"; 5 | import { copySource } from "./util"; 6 | 7 | function makeContentOptional( 8 | alternatives: NoParent<Concatenation>[], 9 | source: Readonly<SourceLocation> | undefined 10 | ): void { 11 | if (alternatives.length === 0) { 12 | // `[]?` == `ε` 13 | alternatives.push({ 14 | type: "Concatenation", 15 | elements: [], 16 | source: copySource(source), 17 | }); 18 | return; 19 | } 20 | 21 | if (alternatives.length === 1 && alternatives[0].elements.length === 1) { 22 | const single = alternatives[0].elements[0]; 23 | if (single.type === "Quantifier" && single.min === 1) { 24 | single.min = 0; 25 | return; 26 | } 27 | } 28 | 29 | const copy = [...alternatives]; 30 | alternatives.length = 0; 31 | alternatives.push({ 32 | type: "Concatenation", 33 | elements: [ 34 | { 35 | type: "Quantifier", 36 | lazy: false, 37 | min: 0, 
38 | max: 1, 39 | alternatives: copy, 40 | source: copySource(source), 41 | }, 42 | ], 43 | source: copySource(source), 44 | }); 45 | } 46 | 47 | function onParent(node: NoParent<Parent>, { signalMutation }: TransformContext): void { 48 | if (node.alternatives.length < 2) { 49 | return; 50 | } 51 | 52 | let mutated = false; 53 | let needQuantifier = true; 54 | filterMut(node.alternatives, alt => { 55 | if (isEmpty(alt)) { 56 | signalMutation(); 57 | mutated = true; 58 | return false; 59 | } 60 | 61 | if (alt.elements.length === 1) { 62 | const first = alt.elements[0]; 63 | if (first.type === "Quantifier" && first.min === 0 && first.max > 0) { 64 | signalMutation(); 65 | mutated = true; 66 | first.min = 1; 67 | if (first.max === 1 && first.alternatives.length === 1) { 68 | // trivial inlining 69 | alt.elements = first.alternatives[0].elements; 70 | } 71 | return true; 72 | } 73 | } 74 | 75 | if (isPotentiallyEmpty(alt)) { 76 | needQuantifier = false; 77 | } 78 | return true; 79 | }); 80 | 81 | // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition 82 | if (mutated && needQuantifier) { 83 | makeContentOptional(node.alternatives, node.source); 84 | } 85 | } 86 | 87 | function onParentSafe(node: NoParent<Parent>, { signalMutation }: TransformContext): void { 88 | if (node.alternatives.length < 2) { 89 | return; 90 | } 91 | 92 | if (node.alternatives[0].elements.length === 0) { 93 | // e.g. `(?:|a|b)` => `(?:a|b)??` 94 | const quant: NoParent<Quantifier> = { 95 | type: "Quantifier", 96 | lazy: true, 97 | min: 0, 98 | max: 1, 99 | alternatives: node.alternatives.slice(1), 100 | source: copySource(node.source), 101 | }; 102 | node.alternatives = [ 103 | { 104 | type: "Concatenation", 105 | elements: [quant], 106 | source: copySource(node.source), 107 | }, 108 | ]; 109 | signalMutation(); 110 | return; 111 | } 112 | 113 | if (node.alternatives[node.alternatives.length - 1].elements.length === 0) { 114 | // e.g. 
`(?:a|b|)` => `(?:a|b)?` 115 | const quant: NoParent<Quantifier> = { 116 | type: "Quantifier", 117 | lazy: false, 118 | min: 0, 119 | max: 1, 120 | alternatives: node.alternatives.slice(0, -1), 121 | source: copySource(node.source), 122 | }; 123 | node.alternatives = [ 124 | { 125 | type: "Concatenation", 126 | elements: [quant], 127 | source: copySource(node.source), 128 | }, 129 | ]; 130 | signalMutation(); 131 | return; 132 | } 133 | } 134 | 135 | /** 136 | * This tries to simplify how a given sub-expression accepts the empty string. The goal is to modify the sub-expression 137 | * such that exactly one path accepts the empty string. This has the emergent result that the operator that causes the 138 | * sub-expression to accept the empty string moves closer to the root of the tree. 139 | * 140 | * Examples: 141 | * 142 | * - `a(?:b*|d?)` => `a(?:b+|d)?` 143 | * - `||a*|b` => `(?:a+|b)?` 144 | * 145 | * This operation largely ignores the order of alternatives and usually reduces the ambiguity of the expression. If 146 | * order or ambiguity have to be preserved, then the effectiveness of this transformer will be greatly reduced. 
147 | * 148 | * @param options 149 | */ 150 | export function moveUpEmpty(options?: Readonly<CreationOptions>): Transformer { 151 | let on: (node: NoParent<Parent>, context: TransformContext) => void; 152 | if (!options?.ignoreOrder || !options.ignoreAmbiguity) { 153 | on = onParentSafe; 154 | } else { 155 | on = onParent; 156 | } 157 | 158 | return { 159 | name: "moveUpEmpty", 160 | onAlternation: on, 161 | onAssertion: on, 162 | onExpression: on, 163 | onQuantifier: on, 164 | }; 165 | } 166 | -------------------------------------------------------------------------------- /src/js/create-char-set.ts: -------------------------------------------------------------------------------- 1 | import { Char } from "../char-types"; 2 | import { CharRange, CharSet } from "../char-set"; 3 | import { assertNever } from "../util"; 4 | import { Flags } from "./flags"; 5 | import { CharEnv, getCharEnv } from "./char-env"; 6 | import { getCharacterProperty } from "./property"; 7 | 8 | export type PredefinedCharacterSet = 9 | | AnyCharacterSet 10 | | DigitCharacterSet 11 | | PropertyCharacterSet 12 | | SpaceCharacterSet 13 | | WordCharacterSet; 14 | export interface AnyCharacterSet { 15 | kind: "any"; 16 | } 17 | export interface DigitCharacterSet { 18 | kind: "digit"; 19 | negate: boolean; 20 | } 21 | export type PropertyCharacterSet = CharacterPropertyCharacterSet | StringPropertyCharacterSet; 22 | export interface CharacterPropertyCharacterSet { 23 | kind: "property"; 24 | key: string; 25 | value: string | null; 26 | strings: false; 27 | negate: boolean; 28 | } 29 | export interface StringPropertyCharacterSet { 30 | kind: "property"; 31 | key: string; 32 | value: null; 33 | strings: true; 34 | negate: false; 35 | } 36 | export interface SpaceCharacterSet { 37 | kind: "space"; 38 | negate: boolean; 39 | } 40 | export interface WordCharacterSet { 41 | kind: "word"; 42 | negate: boolean; 43 | } 44 | 45 | /** 46 | * Creates a new character set with the characters equivalent to a 
JavaScript regular expression character set. 47 | * 48 | * @param chars The characters in the set. 49 | * @param flags The flags of the pattern. 50 | */ 51 | export function createCharSet( 52 | chars: Iterable<Char | CharRange | Readonly<Exclude<PredefinedCharacterSet, StringPropertyCharacterSet>>>, 53 | flags: Readonly<Flags> 54 | ): CharSet { 55 | // https://tc39.es/ecma262/#sec-runtime-semantics-charactersetmatcher-abstract-operation 56 | 57 | // This works by first adding all characters and ranges to a single ranges array while keeping track of whether 58 | // added characters/ranges might vary in case (if ignoreCase). 59 | // If ignoreCase and the ranges might vary in case, the case variations of all characters will be added. 60 | 61 | const env = getCharEnv(flags); 62 | 63 | const ranges: CharRange[] = []; 64 | let fullCaseCheck = false; 65 | 66 | function addChar(char: Char): void { 67 | /** 68 | * We will only add all case variation for the given character if: 69 | * 1) the regexp has the i flag set. 70 | * 2) we don't already do a full case check. Since the full case check will add all case variations of this 71 | * character anyway, there's no reason to do it here. 72 | * 3) the given character actually varies in case. 
/**
 * Resolves a predefined character set (e.g. `.`, `\d`, `\s`, `\w`, `\p{...}`)
 * to the concrete `CharSet` it matches under the given flags.
 *
 * Throws if a `property` set is used while `env.unicode` is not set, mirroring
 * the JS requirement that `\p{...}` needs the `u` flag.
 */
function getPredefinedSet(
	set: Readonly<Exclude<PredefinedCharacterSet, StringPropertyCharacterSet>>,
	flags: Readonly<Flags>,
	env: CharEnv
): CharSet {
	switch (set.kind) {
		case "any":
			// `.` matches everything with the `s` flag, otherwise everything but line terminators
			return flags.dotAll ? env.all : env.nonLineTerminator;

		case "digit":
			return set.negate ? env.nonDigit : env.digit;

		case "space":
			return set.negate ? env.nonSpace : env.space;

		case "word":
			return set.negate ? env.nonWord : env.word;

		case "property": {
			if (!env.unicode) {
				throw new Error("Unicode property escapes cannot be used without the u flag.");
			}

			const { key, value, negate } = set;
			return getCharacterProperty(key, value, negate, env, flags.unicodeSets ?? false);
		}

		default:
			// exhaustiveness check: `set.kind` is `never` here
			throw assertNever(set, "Invalid predefined character set type");
	}
}
words) { 37 | const w = NFA.fromTransitionIterator( 38 | { 39 | initial: 0, 40 | getOut: s => { 41 | const out = new Map<number, CharSet>(); 42 | if (s < word.length) { 43 | out.set(s + 1, CharSet.fromCharacters(maxCharacter, [word[s]])); 44 | } 45 | return out; 46 | }, 47 | isFinal: s => s === word.length, 48 | }, 49 | { maxCharacter } 50 | ); 51 | total.union(w); 52 | } 53 | 54 | return total; 55 | } 56 | 57 | function test(getActual: (words: Iterable<ReadonlyWord>, maxCharacter: number) => DFA | NFA | ENFA): void { 58 | const maxCharacter = 0x10ffff; 59 | for (const wordStrings of WORDS_LIST) { 60 | it(JSON.stringify(wordStrings), function () { 61 | const words = wordStrings.map(fromStringToUnicode); 62 | const fa = getActual(words, maxCharacter); 63 | 64 | // assert FA 65 | assertEqualSnapshot(this, fa.toString()); 66 | 67 | // compare language 68 | const actual = DFA.fromFA(fa as TransitionIterable<unknown>); 69 | const expected = DFA.fromFA(getNFA(words, maxCharacter)); 70 | actual.minimize(); 71 | expected.minimize(); 72 | assert.strictEqual(actual.toString(), expected.toString()); 73 | }); 74 | } 75 | } 76 | 77 | describe("DFA", function () { 78 | test((words, maxCharacter) => DFA.fromWords(words, { maxCharacter })); 79 | }); 80 | 81 | describe("ENFA", function () { 82 | test((words, maxCharacter) => ENFA.fromWords(words, { maxCharacter })); 83 | }); 84 | 85 | describe("NFA", function () { 86 | test((words, maxCharacter) => NFA.fromWords(words, { maxCharacter })); 87 | }); 88 | }); 89 | 90 | describe(fromWordSets.name, function () { 91 | function getNFA(wordSets: Iterable<ReadonlyWordSet>, maxCharacter: number): NFA { 92 | const total = NFA.empty({ maxCharacter }); 93 | 94 | for (const wordSet of wordSets) { 95 | const w = NFA.fromTransitionIterator( 96 | { 97 | initial: 0, 98 | getOut: s => { 99 | const out = new Map<number, CharSet>(); 100 | if (s < wordSet.length) { 101 | out.set(s + 1, wordSet[s]); 102 | } 103 | return out; 104 | }, 105 | isFinal: s => 
s === wordSet.length, 106 | }, 107 | { maxCharacter } 108 | ); 109 | total.union(w); 110 | } 111 | 112 | return total; 113 | } 114 | 115 | function test(getActual: (wordSets: Iterable<ReadonlyWordSet>, maxCharacter: number) => DFA | NFA | ENFA): void { 116 | for (const regex of WORD_SETS_LIST) { 117 | it(regex.toString(), function () { 118 | const { expression, maxCharacter } = Parser.fromLiteral(regex).parse(); 119 | const wordSets: WordSet[] = []; 120 | 121 | for (const alt of expression.alternatives) { 122 | const wordSet: WordSet = []; 123 | for (const e of alt.elements) { 124 | if (e.type === "CharacterClass") { 125 | if (e.characters.isEmpty) { 126 | continue; 127 | } else { 128 | wordSet.push(e.characters); 129 | } 130 | } else { 131 | throw new Error("Unexpected element. Only characters are supported"); 132 | } 133 | } 134 | wordSets.push(wordSet); 135 | } 136 | 137 | const fa = getActual(wordSets, maxCharacter); 138 | 139 | // assert FA 140 | assertEqualSnapshot(this, fa.toString()); 141 | 142 | // compare language 143 | const actual = DFA.fromFA(fa as TransitionIterable<unknown>); 144 | const expected = DFA.fromFA(getNFA(wordSets, maxCharacter)); 145 | actual.minimize(); 146 | expected.minimize(); 147 | assert.strictEqual(actual.toString(), expected.toString()); 148 | }); 149 | } 150 | } 151 | 152 | describe("DFA", function () { 153 | test((wordSets, maxCharacter) => DFA.fromWordSets(wordSets, { maxCharacter })); 154 | }); 155 | 156 | describe("ENFA", function () { 157 | test((wordSets, maxCharacter) => ENFA.fromWordSets(wordSets, { maxCharacter })); 158 | }); 159 | 160 | describe("NFA", function () { 161 | test((wordSets, maxCharacter) => NFA.fromWordSets(wordSets, { maxCharacter })); 162 | }); 163 | }); 164 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Issues, comments, and pull requests are 
always welcome! 4 | 5 | ## Writing code 6 | 7 | A few general rules: 8 | 9 | - Write unit tests. 10 | - Strongly type everything (except what is trivially inferred). 11 | - Document as much as possible but nothing obvious. (No `i++; // increase variable`) 12 | - Use an IDE that supports ESLint. This will take care of formatting and give warnings and errors. 13 | 14 | ### Naming conventions 15 | 16 | - Casing 17 | - Lower camelCase for variables, functions, and parameters. 18 | - Private methods and variables in a class must have one leading underscore (`_`). 19 | - Unused function parameters may have a leading underscore (`_`). 20 | - PascalCase for classes, interfaces, types, and type parameters. 21 | - UPPER_CASE for compile-time constants. 22 | - Maximum and minimum 23 | - When referring to a maximum of something, call it `maxSomething` instead of `maximumSomething`. Same for "minimum" and "min". 24 | - When referring to a maximum and minimum to describe a range, call it `max` and `min`. 25 | - When referring to a maximum (without corresponding minimum), call it `maximum`. Same for "minimum". 26 | - Assertion vs. Lookaround 27 | - Always use "assertion" when referring to a lookaround (= lookbehind inclusive or lookahead). 28 | - Use "lookbehind" and "lookahead" only when referring to either specifically, excluding the other. 29 | 30 | ### Useful commands 31 | 32 | - `npm run test` 33 | Run all tests. 34 | - `npm run test:fast` 35 | Run all tests except for the regex stress test that takes from 5s to 20s. 36 | - `npm run build` 37 | Compiles the whole project and creates the final `index.{js,d.ts}` files in the project's root directory. 38 | - `npm run scripts:debug` 39 | This will execute the `scripts/debug.ts` file. This is a quick way to test new or existing features with access to all files in the library. 40 | 41 | 42 | ## Project structure 43 | 44 | ``` 45 | refa/ 46 | |-- scripts/ 47 | | `-- ... 48 | |-- src/ 49 | | |-- ast/ 50 | | | `-- ... 
51 | | |-- iter/ 52 | | | `-- ... 53 | | |-- js/ 54 | | | `-- ... 55 | | |-- transformers/ 56 | | | `-- ... 57 | | `-- ... 58 | |-- tests/ 59 | | `-- ... 60 | |-- CONTRIBUTING.md // this file 61 | |-- index.{ts,d.ts} // generated by `npm run build` 62 | |-- package.json 63 | `-- ... 64 | ``` 65 | 66 | ### `scripts` 67 | 68 | This folder contains useful scripts when working on refa. Any script can be executed via `npm run scripts:<script-name>`. 69 | 70 | The `debug` script is particularly useful. It's purpose is to be way to quickly try out things. It as access to all of refa's source files and can be run via `npm run scripts:debug`. Do not commit changes to this file. 71 | 72 | ### `src` 73 | 74 | This is folder for all files which will be in the compiled build of the library. 75 | 76 | The most important files are: 77 | 78 | 1. `char{set,map}.ts` defines the most important classes of refa: `CharSet` - a sorted interval set used to represent characters - and `CharMap` - a sorted interval map. 79 | 1. `finite-automaton.ts` defines interfaces all concrete FA implementations use. 80 | 1. `{dfa,nfa}.ts` define the concrete implementations of an NFA and DFA. 81 | 1. `words.ts` includes function to convert from JS strings to number arrays and vise versa among others. 82 | 83 | #### `src/ast` 84 | 85 | This directory includes the definition of refa's RE AST format and simple functions/interfaces for traversal and modification. 86 | 87 | #### `src/iter` 88 | 89 | This directory contains functions that consume and produce graph iterators. Graph iterators are one of refa's core concepts and allow us to implement different algorithms independently from one specific graph representation. 90 | 91 | When importing those functions from outside `src/iter`, it must be done via `src/iter/index.ts`. 
It's recommended to import all functions like this: 92 | 93 | ```js 94 | import * as Iter from "./iter"; 95 | ``` 96 | 97 | From inside `src/iter`, functions MUST be imported directly from the file they are defined in. 98 | 99 | #### `src/js` 100 | 101 | This is where all [JavaScript RegExp](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp)-specific logic lives. This mainly includes a parser to convert RegExp to refa's RE AST format and a function to convert RE AST to JS RegExp. 102 | 103 | Files from `src/*` (except `index.ts`) are not allowed to import files from `src/js`. The rest of the library is supposed to be independent from this part of it because it may later be moved to its own package. 104 | 105 | #### `src/transformers` 106 | 107 | This directory contains RE AST transformers. These tools can be used to simplify and change existing RE ASTs. 108 | 109 | ### `tests` 110 | 111 | This folder has a similar layout to `src`. It's supposed to somewhat mirror the file structure of `src`, so it's easy to find where tests for specific files from `src` live. 112 | 113 | You can run all tests using `npm run test` (or just `npm test`). 114 | 115 | If you want to run the tests quicker, you can use `npm run test:fast`. This will run all tests except the stress test which is executed for thousands of regexes. 116 | 117 | The `helper` folder contains functions used to implement tests. This includes useful constants, conversion functions, and sets of test regexes.
118 | -------------------------------------------------------------------------------- /src/iter/intersection.ts: -------------------------------------------------------------------------------- 1 | import { CharSet } from "../char-set"; 2 | import { ensureStableOut } from "./iterator"; 3 | import { FABuilder, FAIterator, TransitionIterator } from "../fa-types"; 4 | 5 | /** 6 | * A lazy intersection algorithm that will use the given FA builder to construct the intersection FA as the returned 7 | * iterator is used to traverse the FA. 8 | * 9 | * To construct the whole intersection FA, simply traverse the entire iterator. 10 | * 11 | * @param builder 12 | * @param left 13 | * @param right 14 | */ 15 | export function intersection<S, L, R>( 16 | builder: FABuilder<S, CharSet>, 17 | left: TransitionIterator<L>, 18 | right: TransitionIterator<R> 19 | ): FAIterator<S, S> { 20 | left = ensureStableOut(left); 21 | right = ensureStableOut(right); 22 | 23 | const leftToIndex = createIndexer<L>(); 24 | const rightToIndex = createIndexer<R>(); 25 | 26 | if (left.isFinal(left.initial) && right.isFinal(right.initial)) { 27 | builder.makeFinal(builder.initial); 28 | } 29 | 30 | // node pair translation 31 | type Tuple<L, R> = readonly [L, R]; 32 | const indexBackTranslatorMap = new Map<S, Tuple<L, R>>(); 33 | indexBackTranslatorMap.set(builder.initial, [left.initial, right.initial]); 34 | const indexTranslatorCache: Record<string, S | undefined> = { 35 | [`${leftToIndex(left.initial)};${rightToIndex(right.initial)}`]: builder.initial, 36 | }; 37 | 38 | function translate(leftNode: L, rightNode: R): S { 39 | const leftKey = leftToIndex(leftNode); 40 | const rightKey = rightToIndex(rightNode); 41 | const key = "" + leftKey + ";" + rightKey; 42 | 43 | let node = indexTranslatorCache[key]; 44 | if (node === undefined) { 45 | node = builder.createNode(); 46 | indexTranslatorCache[key] = node; 47 | indexBackTranslatorMap.set(node, [leftNode, rightNode]); 48 | 49 | if 
(left.isFinal(leftNode) && right.isFinal(rightNode)) { 50 | builder.makeFinal(node); 51 | } 52 | } 53 | return node; 54 | } 55 | 56 | function translateBack(node: S): Tuple<L, R> { 57 | const tuple = indexBackTranslatorMap.get(node); 58 | if (tuple === undefined) { 59 | throw new Error("All created nodes have to be indexed."); 60 | } 61 | return tuple; 62 | } 63 | 64 | const intersect = createCharSetIntersectFn(); 65 | 66 | // add edges 67 | 68 | function addOutgoing(from: S): void { 69 | const tuple = translateBack(from); 70 | const leftOut = left.getOut(tuple[0]); 71 | const rightOut = right.getOut(tuple[1]); 72 | 73 | for (const [leftTo, leftTransition] of leftOut) { 74 | for (const [rightTo, rightTransition] of rightOut) { 75 | const transition = intersect(leftTransition, rightTransition); 76 | if (transition) { 77 | builder.linkNodes(from, translate(leftTo, rightTo), transition); 78 | } 79 | } 80 | } 81 | } 82 | 83 | return { 84 | initial: builder.initial, 85 | getOut(node: S): S { 86 | addOutgoing(node); 87 | return node; 88 | }, 89 | isFinal(node: S): boolean { 90 | return builder.isFinal(node); 91 | }, 92 | }; 93 | } 94 | 95 | const HASH_MASK = 0xffff; 96 | function computeHash(a: CharSet): number { 97 | let hash = a.maximum & HASH_MASK; 98 | a.ranges.forEach(({ min, max }) => { 99 | hash = ((hash * 31 + min) ^ (max * 31)) & HASH_MASK; 100 | }); 101 | return hash; 102 | } 103 | /** 104 | * Creates a function which can intersect any two char sets of the given set of character sets. 105 | * 106 | * The function return `null` if the intersection of two char sets is empty. 
107 | */ 108 | function createCharSetIntersectFn(): (a: CharSet, b: CharSet) => CharSet | null { 109 | const hashTable: Record<number, CharSet | undefined> = {}; 110 | const charSetIdMap = new Map<CharSet, number>(); 111 | 112 | function getId(set: CharSet): number { 113 | let id = charSetIdMap.get(set); 114 | if (id === undefined) { 115 | let hash = computeHash(set); 116 | for (;;) { 117 | const entry = hashTable[hash]; 118 | if (entry === undefined) { 119 | // make new entry 120 | hashTable[hash] = set; 121 | id = charSetIdMap.size; 122 | break; 123 | } else if (entry.equals(set)) { 124 | // same as previous set 125 | id = charSetIdMap.get(entry)!; 126 | break; 127 | } else { 128 | // handle hash collision 129 | hash = (hash + 1) & HASH_MASK; 130 | } 131 | } 132 | charSetIdMap.set(set, id); 133 | } 134 | return id; 135 | } 136 | 137 | // use the id of char sets to store pairs 138 | // null be represent empty char sets 139 | const intersectionCache: Record<string, CharSet | null | undefined> = {}; 140 | 141 | return (a: CharSet, b: CharSet): CharSet | null => { 142 | const aId = getId(a); 143 | const bId = getId(b); 144 | 145 | // trivial 146 | if (aId == bId) { 147 | return a; 148 | } 149 | 150 | // since intersection is symmetric we don't care about the order 151 | let key; 152 | if (aId < bId) { 153 | key = "" + aId + ";" + bId; 154 | } else { 155 | key = "" + bId + ";" + aId; 156 | } 157 | 158 | let result: CharSet | null | undefined = intersectionCache[key]; 159 | if (result === undefined) { 160 | result = a.intersect(b); 161 | if (result.isEmpty) { 162 | result = null; 163 | } 164 | intersectionCache[key] = result; 165 | } 166 | 167 | return result; 168 | }; 169 | } 170 | 171 | function createIndexer<T>(): (value: T) => number { 172 | const map = new Map<T, number>(); 173 | return (value: T): number => { 174 | let index = map.get(value); 175 | if (index === undefined) { 176 | index = map.size; 177 | map.set(value, index); 178 | } 179 | return index; 180 | }; 
181 | } 182 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Basic Options */ 4 | "target": "ES2015", 5 | "module": "es2015", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', or 'ESNext'. */ 6 | "moduleResolution": "Node", 7 | "lib": ["es2015", "es2015.iterable"], /* Specify library files to be included in the compilation. */ 8 | // "allowJs": true, /* Allow javascript files to be compiled. */ 9 | // "checkJs": true, /* Report errors in .js files. */ 10 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ 11 | "declaration": true, /* Generates corresponding '.d.ts' file. */ 12 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ 13 | // "sourceMap": true, /* Generates corresponding '.map' file. */ 14 | "outDir": "./.out", /* Redirect output structure to the directory. */ 15 | // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ 16 | // "composite": true, /* Enable project compilation */ 17 | // "incremental": true, /* Enable incremental compilation */ 18 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 19 | // "removeComments": true, /* Do not emit comments to output. */ 20 | // "noEmit": true, /* Do not emit outputs. */ 21 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */ 22 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 23 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 24 | 25 | /* Strict Type-Checking Options */ 26 | "strict": true, /* Enable all strict type-checking options. 
*/ 27 | "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 28 | "strictNullChecks": true, /* Enable strict null checks. */ 29 | "strictFunctionTypes": true, /* Enable strict checking of function types. */ 30 | "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 31 | "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 32 | "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 33 | "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 34 | 35 | /* Additional Checks */ 36 | // "noUnusedLocals": true, /* Report errors on unused locals. */ 37 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 38 | "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ 39 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 40 | 41 | /* Module Resolution Options */ 42 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ 43 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ 44 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ 45 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 46 | // "typeRoots": [], /* List of folders to include type definitions from. */ 47 | // "types": [], /* Type declaration files to be included in compilation. */ 48 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. 
*/ 49 | "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ 50 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 51 | // "resolveJsonModule": true, 52 | 53 | /* Source Map Options */ 54 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 55 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 56 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 57 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ 58 | 59 | /* Experimental Options */ 60 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ 61 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ 62 | 63 | "newLine": "lf" 64 | }, 65 | "include": [ 66 | "src/**/*.ts" 67 | ] 68 | } 69 | -------------------------------------------------------------------------------- /src/char-base.ts: -------------------------------------------------------------------------------- 1 | import { CharRange, CharSet } from "./char-set"; 2 | import { iterToSet } from "./util"; 3 | 4 | /** 5 | * A character base is constructed from a collection of character sets. It holds a list of disjoint, non-empty 6 | * character sets - the base sets - that can be used to construct every character set in the collection it was 7 | * constructed from. 8 | * 9 | * ## Guarantees 10 | * 11 | * - The base sets are guaranteed to be mutually disjoint and non-empty. 12 | * 13 | * - Every character set in the collection can be constructed by combining (union) a unique set of base sets. 
14 | * 15 | * - The list of base sets is guaranteed to be as small as possible. There are at most `min(n^2, o)` base sets where `n` 16 | * is the number of unique, non-empty character sets in the collection, and `o` is the number of characters in the 17 | * union of all character sets in the collection. 18 | * 19 | * ## Use case 20 | * 21 | * The primary purpose of base sets is to remap alphabets. Some FA operations scale with the number of characters in the 22 | * alphabet of the FA (e.g. DFA minimization). 23 | * 24 | * Base sets can be used to determine which characters in an FA's alphabet *Σ* cannot be distinguished by the FA *A*. 25 | * Two characters *a,b* in *Σ* are indistinguishable if for all inputs *w* the following hold true: 26 | * 27 | * 1. *w* is accepted by *A* iff *w* with all occurrences of *a* replaced with *b* is accepted by *A*. 28 | * 2. *w* is accepted by *A* iff *w* with all occurrences of *b* replaced with *a* is accepted by *A*. 29 | * 30 | * Two indistinguishable characters are guaranteed to be in the same base set. 31 | * 32 | * By treating each base set as a character, it is possible to create a new (smaller) alphabet *Γ* (*|Γ| <= |Σ|*) such 33 | * that the FA *A* still behaves the same. 34 | * 35 | * Since *Γ* is typically (several orders of magnitude) smaller, operations that scale with the size of the alphabet 36 | * can be done more quickly. 37 | */ 38 | export class CharBase { 39 | /** 40 | * A list of disjoint, non-empty character sets. 41 | * 42 | * See {@link CharBase} to learn more. 43 | */ 44 | readonly sets: readonly CharSet[]; 45 | 46 | /** 47 | * Create the base sets of the given collection of character sets. 48 | * 49 | * See {@link CharBase} to learn more. 50 | * 51 | * @param charSets 52 | * @throws `RangeError` if the collection contains two character sets with different maximums. 
53 | */ 54 | constructor(charSets: Iterable<CharSet>) { 55 | this.sets = getBaseSets(charSets); 56 | } 57 | 58 | /** 59 | * Splits the given character set into its base sets. 60 | * 61 | * The returned array will be a list of indexes of base sets necessary to construct the given character sets. The 62 | * indexes will be sorted and occur at most once. 63 | * 64 | * **Note**: This assumes that `charSet` is either empty or can be constructed from the base sets. If the 65 | * assumption is not met, the output of this function will be undefined. 66 | * 67 | * @param charSet 68 | */ 69 | split(charSet: CharSet): number[] { 70 | const indexes: number[] = []; 71 | for (let i = 0, l = this.sets.length; i < l; i++) { 72 | if (charSet.has(this.sets[i].ranges[0].min)) { 73 | indexes.push(i); 74 | } 75 | } 76 | return indexes; 77 | } 78 | } 79 | 80 | /** 81 | * Returns an array of disjoint non-empty sets that can be used to construct all given sets. 82 | * 83 | * If the union of all given character sets is empty, the empty array will be returned. 84 | * 85 | * This algorithm runs in O(n*log(n)) where n is the number of ranges in the given character sets.
86 | * 87 | * @param charSets 88 | */ 89 | function getBaseSets(charSets: Iterable<CharSet>): readonly CharSet[] { 90 | // remove duplicates and empty sets 91 | const sets = [...iterToSet(charSets)] 92 | .filter(set => !set.isEmpty) 93 | .sort((a, b) => a.compare(b)) 94 | .filter((set, i, array) => i === 0 || !set.equals(array[i - 1])); 95 | 96 | if (sets.length === 0) { 97 | // trivially 98 | return sets; 99 | } 100 | if (sets.length === 1) { 101 | // if there's only one set, then it's the only base set 102 | return sets; 103 | } 104 | 105 | // extract all ranges 106 | const maximum = sets[0].maximum; 107 | const ranges: CharRange[] = []; 108 | for (const set of sets) { 109 | if (set.maximum !== maximum) { 110 | throw new RangeError("The maximum of all given sets has to be the same."); 111 | } 112 | ranges.push(...set.ranges); 113 | } 114 | 115 | // union of all char sets 116 | const union = CharSet.empty(maximum).union(ranges); 117 | 118 | // set of all cuts 119 | const cuts = new Set<number>(); 120 | for (let i = 0, l = ranges.length; i < l; i++) { 121 | const { min, max } = ranges[i]; 122 | cuts.add(min); 123 | cuts.add(max + 1); 124 | } 125 | 126 | // determine the ranges of the base sets 127 | const sortedCuts = [...cuts].sort((a, b) => a - b); 128 | const baseRanges = new Map<string, CharRange[]>(); 129 | for (let i = 1, l = sortedCuts.length; i < l; i++) { 130 | const min = sortedCuts[i - 1]; 131 | if (union.has(min)) { 132 | let key = ""; 133 | for (let setIndex = 0; setIndex < sets.length; setIndex++) { 134 | const set = sets[setIndex]; 135 | if (set.has(min)) { 136 | key += setIndex + " "; 137 | } 138 | } 139 | 140 | const value = baseRanges.get(key); 141 | const range = { min, max: sortedCuts[i] - 1 }; 142 | if (value) { 143 | value.push(range); 144 | } else { 145 | baseRanges.set(key, [range]); 146 | } 147 | } 148 | } 149 | 150 | // create the base sets 151 | const baseSets: CharSet[] = []; 152 | for (const ranges of baseRanges.values()) { 153 | 
baseSets.push(CharSet.empty(maximum).union(ranges)); 154 | } 155 | 156 | return baseSets; 157 | } 158 | -------------------------------------------------------------------------------- /src/transformers/merge-with-quantifier.ts: -------------------------------------------------------------------------------- 1 | import { Element, NoParent, Quantifier, TransformContext, Transformer } from "../ast"; 2 | import { MatchingDirection, structurallyEqual, structurallyEqualToQuantifiedElement } from "../ast-analysis"; 3 | import { filterMut } from "../util"; 4 | import { CreationOptions } from "./creation-options"; 5 | 6 | function consumeUsingInfiniteQuantifier( 7 | quant: Readonly<NoParent<Quantifier>>, 8 | after: NoParent<Element>, 9 | direction: MatchingDirection, 10 | context: TransformContext 11 | ): void { 12 | const { signalMutation } = context; 13 | 14 | if ( 15 | // quant = a{n,} 16 | quant.max === Infinity && 17 | // after = (a|b) or (a|b){0,1} 18 | (after.type === "Alternation" || (after.type === "Quantifier" && after.max === 1)) 19 | ) { 20 | for (const alt of after.alternatives) { 21 | const firstIndex = direction === "ltr" ? 0 : alt.elements.length - 1; 22 | const first = alt.elements[firstIndex] as NoParent<Element> | undefined; 23 | if (first) { 24 | if ( 25 | first.type === "Quantifier" && 26 | first.max !== first.min && 27 | structurallyEqual(quant.alternatives, first.alternatives) 28 | ) { 29 | // we found a nested quantifier we can (partially) consume 30 | first.max = first.min; 31 | if (first.max === 0) { 32 | // remove the quantifier 33 | alt.elements.splice(firstIndex, 1); 34 | } 35 | signalMutation(); 36 | } else if (first.type === "Alternation" || (first.type === "Quantifier" && first.max === 1)) { 37 | // go into 38 | // e.g. 
/a*((a*|b)c|d)/, here we go from ((a*|b)c|d) into (a*|b) 39 | consumeUsingInfiniteQuantifier(quant, first, direction, context); 40 | } 41 | } 42 | } 43 | } 44 | } 45 | 46 | /** 47 | * This operation tries to merge as many elements as possible with existing quantifiers. 48 | * 49 | * Examples: 50 | * 51 | * - `/a*a/` => `/a+/` 52 | * - `/a*(?:a+|c)/` => `/a*(?:a|c)/` 53 | * 54 | * @param options 55 | */ 56 | export function mergeWithQuantifier(options?: Readonly<CreationOptions>): Transformer { 57 | // This will preserve the order of alternatives ASSUMING that there are no greedy vs lazy quantifiers. 58 | // This has to be changed as soon as lazy/greedy quantifiers are added. 59 | 60 | const { ignoreAmbiguity = false, ignoreOrder = false } = options ?? {}; 61 | 62 | function consumeNonQuantifier( 63 | elements: NoParent<Element>[], 64 | direction: MatchingDirection, 65 | context: TransformContext 66 | ): void { 67 | if (direction === "rtl") { 68 | elements.reverse(); 69 | } 70 | 71 | // make e.g. a*a -> a+ 72 | filterMut(elements, (after, before) => { 73 | if (before && before.type === "Quantifier" && structurallyEqualToQuantifiedElement(before, after)) { 74 | // e.g. a*a 75 | before.min++; 76 | before.max++; 77 | context.signalMutation(); 78 | return false; 79 | } else { 80 | return true; 81 | } 82 | }); 83 | 84 | // make e.g. 
ab(ab)* -> (ab)+ 85 | for (let i = 0; i < elements.length; i++) { 86 | const current = elements[i]; 87 | if (current.type === "Quantifier" && current.alternatives.length === 1) { 88 | const alt = current.alternatives[0]; 89 | if ( 90 | alt.elements.length >= 2 && 91 | i + 1 + alt.elements.length <= elements.length && 92 | alt.elements.every((e, j) => { 93 | if (direction === "rtl") { 94 | j = alt.elements.length - 1 - j; 95 | } 96 | return structurallyEqual(e, elements[i + 1 + j]); 97 | }) 98 | ) { 99 | context.signalMutation(); 100 | current.min++; 101 | current.max++; 102 | elements.splice(i + 1, alt.elements.length); 103 | i--; 104 | } 105 | } 106 | } 107 | // make e.g. a*(a+|b*)? -> a*(a|b*) 108 | if (ignoreAmbiguity) { 109 | for (let i = 1; i < elements.length; i++) { 110 | const quant = elements[i - 1]; 111 | const after = elements[i]; 112 | if (quant.type === "Quantifier" && (ignoreOrder || (!quant.lazy && quant.max === Infinity))) { 113 | consumeUsingInfiniteQuantifier(quant, after, direction, context); 114 | } 115 | } 116 | } 117 | 118 | if (direction === "rtl") { 119 | elements.reverse(); 120 | } 121 | } 122 | 123 | return { 124 | name: "mergeWithQuantifier", 125 | onConcatenation(node, context) { 126 | const elements = node.elements; 127 | const { signalMutation } = context; 128 | 129 | consumeNonQuantifier(elements, "ltr", context); 130 | consumeNonQuantifier(elements, "rtl", context); 131 | 132 | // make e.g. 
a*a+ -> a+ 133 | filterMut(elements, (after, before) => { 134 | if (before && before.type === "Quantifier" && after.type === "Quantifier") { 135 | const beforeConst = before.min === before.max; 136 | const afterConst = after.min === after.max; 137 | 138 | if (!ignoreAmbiguity && !(beforeConst || afterConst)) { 139 | // If ambiguity can't be ignored, at least one of the two quantifiers has to be constant 140 | return true; 141 | } 142 | if ( 143 | !ignoreOrder && 144 | !( 145 | beforeConst || 146 | afterConst || 147 | before.lazy === after.lazy || 148 | (!before.lazy && before.max === Infinity) || 149 | (!after.lazy && after.max === Infinity) 150 | ) 151 | ) { 152 | // If order can't be ignored, at least one of the two quantifiers has to be constant or both 153 | // have to have the same laziness or at least one of them is greedy and is unbounded 154 | return true; 155 | } 156 | 157 | if (structurallyEqual(before.alternatives, after.alternatives)) { 158 | // e.g. a+a* -> a+ , a{2,6}a{1,3} -> a{3,9} 159 | before.min += after.min; 160 | before.max += after.max; 161 | signalMutation(); 162 | return false; 163 | } 164 | } 165 | return true; 166 | }); 167 | }, 168 | }; 169 | } 170 | -------------------------------------------------------------------------------- /src/iter/from-words.ts: -------------------------------------------------------------------------------- 1 | import { CharSet } from "../char-set"; 2 | import { Char, ReadonlyWord } from "../char-types"; 3 | import { FABuilder, FAIterator } from "../fa-types"; 4 | import { cachedFunc, traverse } from "../util"; 5 | import { ReadonlyWordSet } from "../word-set"; 6 | 7 | /** 8 | * This eagerly creates an FA that accepts exactly all the given words. 9 | * 10 | * The construction is already finished when this method returns, so the returned FA iterator does not have to be used. 11 | * 12 | * The construction will create a DFA by default. 
However, the FA builder implementation has to be carefully chosen to 13 | * preserve the determinism. In order for the determinism to be preserved, `builder` and `getOutState` have to fulfill 14 | * the following conditions: 15 | * 16 | * - Let `x`, `y` be any 2 states of `builder` and `c` be any valid character `<= maxCharacter`. Iff this function 17 | * called `builder.linkNodes(x, y, c)`, then `getOutState(builder, x, c) == y`. 18 | * - `builder` has to be an empty FA when given to this method. 19 | * - `builder.makeFinal(x)` must have no effect on `getOutState`. 20 | * 21 | * @param builder 22 | * @param getOutState 23 | * @param words 24 | * @param maxCharacter 25 | * @returns 26 | */ 27 | export function fromWords<S>( 28 | builder: FABuilder<S, CharSet>, 29 | getOutState: (state: S, char: Char) => S | undefined, 30 | words: Iterable<ReadonlyWord>, 31 | maxCharacter: Char 32 | ): FAIterator<S, S> { 33 | const getCharSet = cachedFunc<Char, CharSet>(c => CharSet.fromCharacters(maxCharacter, [c])); 34 | 35 | // build a prefix trie 36 | for (const word of words) { 37 | let node = builder.initial; 38 | for (const char of word) { 39 | if (char > maxCharacter) { 40 | throw new Error(`The character (${char}) has to be <= maxCharacter (${maxCharacter}).`); 41 | } 42 | 43 | let next = getOutState(node, char); 44 | if (next === undefined) { 45 | next = builder.createNode(); 46 | builder.linkNodes(node, next, getCharSet(char)); 47 | } 48 | node = next; 49 | } 50 | builder.makeFinal(node); 51 | } 52 | 53 | return { 54 | initial: builder.initial, 55 | stableOut: true, 56 | getOut: s => s, 57 | isFinal: s => builder.isFinal(s), 58 | }; 59 | } 60 | 61 | /** 62 | * This eagerly creates an FA that accepts exactly all the given word sets. 63 | * 64 | * The construction is already finished when this method returns, so the returned FA iterator does not have to be used. 65 | * 66 | * The construction will create a DFA by default. 
67 | * 68 | * @param builder 69 | * @param wordSets 70 | * @param maxCharacter 71 | * @returns 72 | */ 73 | export function fromWordSets<S>( 74 | builder: FABuilder<S, CharSet>, 75 | wordSets: Iterable<ReadonlyWordSet>, 76 | maxCharacter: Char 77 | ): FAIterator<S, S> { 78 | // create trie 79 | const root = new TrieNode(builder.initial); 80 | 81 | for (const wordSet of wordSets) { 82 | let current = new Set([root]); 83 | 84 | for (const set of wordSet) { 85 | if (set.maximum !== maxCharacter) { 86 | throw new Error( 87 | `The maximum (${set.maximum}) of the char set has to be == maxCharacter (${maxCharacter}).` 88 | ); 89 | } 90 | 91 | const next = new Set<TrieNode<S>>(); 92 | for (const node of current) { 93 | for (const n of node.getNext(set, builder)) { 94 | next.add(n); 95 | } 96 | } 97 | current = next; 98 | } 99 | 100 | for (const node of current) { 101 | node.isFinal = true; 102 | } 103 | } 104 | 105 | // translate trie 106 | traverse(root, (n, queue) => { 107 | if (n.isFinal) { 108 | builder.makeFinal(n.builderNode); 109 | } 110 | 111 | for (const { to, via } of n.out) { 112 | queue.push(to); 113 | builder.linkNodes(n.builderNode, to.builderNode, via); 114 | } 115 | }); 116 | 117 | return { 118 | initial: builder.initial, 119 | stableOut: true, 120 | getOut: s => s, 121 | isFinal: s => builder.isFinal(s), 122 | }; 123 | } 124 | 125 | class OutItem<S> { 126 | to: TrieNode<S>; 127 | via: CharSet; 128 | 129 | constructor(to: TrieNode<S>, via: CharSet) { 130 | this.to = to; 131 | this.via = via; 132 | } 133 | } 134 | 135 | class TrieNode<S> { 136 | readonly builderNode: S; 137 | readonly out: OutItem<S>[] = []; 138 | isFinal: boolean = false; 139 | 140 | constructor(builderNode: S) { 141 | this.builderNode = builderNode; 142 | } 143 | 144 | copy(builder: FABuilder<S, never>): TrieNode<S> { 145 | const copy = new TrieNode(builder.createNode()); 146 | copy.isFinal = this.isFinal; 147 | 148 | for (const { to, via } of this.out) { 149 | copy.out.push(new 
OutItem(to.copy(builder), via)); 150 | } 151 | 152 | return copy; 153 | } 154 | 155 | getNext(charSet: CharSet, build: FABuilder<S, never>): Iterable<TrieNode<S>> { 156 | const next: TrieNode<S>[] = []; 157 | const taken: CharSet[] = []; 158 | 159 | for (let i = 0, l = this.out.length; i < l; i++) { 160 | const item = this.out[i]; 161 | if (item.via.isDisjointWith(charSet)) { 162 | // this item has nothing to do with us 163 | } else if (item.via.isSubsetOf(charSet)) { 164 | // this is a path we want to take 165 | taken.push(item.via); 166 | next.push(item.to); 167 | } else { 168 | // the item contains parts of what we want, so we will split it 169 | const newItem = new OutItem<S>(item.to.copy(build), item.via.intersect(charSet)); 170 | this.out.push(newItem); 171 | item.via = item.via.without(charSet); 172 | 173 | taken.push(newItem.via); 174 | next.push(newItem.to); 175 | } 176 | } 177 | 178 | const rest = taken.length === 0 ? charSet : charSet.without(CharSet.empty(charSet.maximum).union(...taken)); 179 | if (!rest.isEmpty) { 180 | const newItem = new OutItem<S>(new TrieNode<S>(build.createNode()), rest); 181 | this.out.push(newItem); 182 | next.push(newItem.to); 183 | } 184 | 185 | return next; 186 | } 187 | } 188 | --------------------------------------------------------------------------------