├── .editorconfig ├── .gitignore ├── README.md ├── bin └── cli.js ├── index.js ├── package.json ├── src ├── ast.js ├── map.js ├── minimize.js ├── regex.js ├── set.js ├── state.js └── trie.js └── test └── test.js /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 2 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .DS_Store 3 | yarn.lock 4 | coverage/ 5 | .nyc_output 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regexgen 2 | 3 | Generates regular expressions that match a set of strings. 4 | 5 | ## Installation 6 | 7 | `regexgen` can be installed using [npm](https://npmjs.com): 8 | 9 | ``` 10 | npm install regexgen 11 | ``` 12 | 13 | ## Example 14 | 15 | The simplest use is to simply pass an array of strings to `regexgen`: 16 | 17 | ```javascript 18 | const regexgen = require('regexgen'); 19 | 20 | regexgen(['foobar', 'foobaz', 'foozap', 'fooza']); // => /foo(?:zap?|ba[rz])/ 21 | ``` 22 | 23 | You can also use the `Trie` class directly: 24 | 25 | ```javascript 26 | const {Trie} = require('regexgen'); 27 | 28 | let t = new Trie; 29 | t.add('foobar'); 30 | t.add('foobaz'); 31 | 32 | t.toRegExp(); // => /fooba[rz]/ 33 | ``` 34 | 35 | ## CLI 36 | 37 | `regexgen` also has a simple CLI to generate regexes using inputs from the command line. 38 | 39 | ```shell 40 | $ regexgen 41 | Usage: regexgen [-gimuy] string1 string2 string3... 
42 | ``` 43 | 44 | The optional first parameter is the [flags](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp) to add 45 | to the regex (e.g. `-i` for a case insensitive match). 46 | 47 | ## ES2015 and Unicode 48 | 49 | By default `regexgen` will output a standard JavaScript regular expression, with Unicode codepoints converted into UCS-2 surrogate pairs. 50 | 51 | If desired, you can request an ES2015-compatible Unicode regular expression by supplying the `-u` flag, which results in those codepoints being retained. 52 | 53 | ```shell 54 | $ regexgen 👩 👩‍💻 👩🏻‍💻 👩🏼‍💻 👩🏽‍💻 👩🏾‍💻 👩🏿‍💻 55 | /\uD83D\uDC69(?:(?:\uD83C[\uDFFB-\uDFFF])?\u200D\uD83D\uDCBB)?/ 56 | 57 | $ regexgen -u 👩 👩‍💻 👩🏻‍💻 👩🏼‍💻 👩🏽‍💻 👩🏾‍💻 👩🏿‍💻 58 | /\u{1F469}(?:[\u{1F3FB}-\u{1F3FF}]?\u200D\u{1F4BB})?/u 59 | ``` 60 | 61 | 62 | Such regular expressions are compatible with current versions of Node, as well as the latest browsers, and may be more transferrable to other languages. 63 | 64 | ## How does it work? 65 | 66 | 1. Generate a [Trie](https://en.wikipedia.org/wiki/Trie) containing all of the input strings. 67 | This is a tree structure where each edge represents a single character. This removes 68 | redundancies at the start of the strings, but common branches further down are not merged. 69 | 70 | 2. A trie can be seen as a tree-shaped deterministic finite automaton (DFA), so DFA algorithms 71 | can be applied. In this case, we apply [Hopcroft's DFA minimization algorithm](https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft.27s_algorithm) 72 | to merge the nondistinguishable states. 73 | 74 | 3. Convert the resulting minimized DFA to a regular expression. This is done using 75 | [Brzozowski's algebraic method](http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392), 76 | which is quite elegant. It expresses the DFA as a system of equations which can be solved 77 | for a resulting regex. 
Along the way, some additional optimizations are made, such 78 | as hoisting common substrings out of an alternation, and using character class ranges. 79 | This produces an [Abstract Syntax Tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) 80 | (AST) for the regex, which is then converted to a string and compiled to a JavaScript 81 | `RegExp` object. 82 | 83 | ## License 84 | 85 | MIT 86 | -------------------------------------------------------------------------------- /bin/cli.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const regexgen = require('../'); 4 | 5 | let args = process.argv.slice(2); 6 | let flags = ''; 7 | if (args.length && args[0][0] === '-') { 8 | flags = args.shift().slice(1); 9 | } 10 | 11 | if (args.length === 0) { 12 | console.log('Usage: regexgen [-gimuy] string1 string2 string3...'); 13 | process.exit(1); 14 | } 15 | 16 | console.log(regexgen(args, flags)); 17 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const Trie = require('./src/trie'); 2 | 3 | /** 4 | * Generates a regular expression that matches the given input strings. 
5 | * @param {Array} inputs 6 | * @param {string} flags 7 | * @return {RegExp} 8 | */ 9 | function regexgen(inputs, flags) { 10 | let trie = new Trie; 11 | trie.addAll(inputs); 12 | return trie.toRegExp(flags); 13 | } 14 | 15 | regexgen.Trie = Trie; 16 | module.exports = regexgen; 17 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "regexgen", 3 | "version": "1.3.0", 4 | "description": "Generate regular expressions that match a set of strings", 5 | "main": "index.js", 6 | "bin": { 7 | "regexgen": "bin/cli.js" 8 | }, 9 | "dependencies": { 10 | "jsesc": "^2.3.0", 11 | "regenerate": "^1.3.2" 12 | }, 13 | "devDependencies": { 14 | "mocha": "^3.2.0" 15 | }, 16 | "scripts": { 17 | "test": "mocha" 18 | }, 19 | "engines": { 20 | "node": ">= 6" 21 | }, 22 | "repository": { 23 | "type": "git", 24 | "url": "git+https://github.com/devongovett/regexgen.git" 25 | }, 26 | "keywords": [ 27 | "regex", 28 | "trie", 29 | "regular", 30 | "expression" 31 | ], 32 | "author": "Devon Govett ", 33 | "license": "MIT", 34 | "bugs": { 35 | "url": "https://github.com/devongovett/regexgen/issues" 36 | }, 37 | "homepage": "https://github.com/devongovett/regexgen#readme", 38 | "runkitExample": "const regexgen = require('regexgen');\n\nregexgen(['foobar', 'foobaz', 'foozap', 'fooza']);" 39 | } 40 | -------------------------------------------------------------------------------- /src/ast.js: -------------------------------------------------------------------------------- 1 | const jsesc = require('jsesc'); 2 | const regenerate = require('regenerate'); 3 | 4 | /** 5 | * Represents an alternation (e.g. 
/**
 * Represents an alternation (e.g. `foo|bar`)
 */
class Alternation {
  /**
   * @param {...Object} options - the alternatives; nested alternations are inlined
   */
  constructor(...options) {
    this.precedence = 1;

    // Inline nested alternations, then order the alternatives by descending length.
    const flat = this.flatten(options);
    flat.sort((x, y) => y.length - x.length);
    this.options = flat;
  }

  /**
   * Expands every Alternation found in `options` into its own options,
   * recursively, and returns the resulting flat array.
   */
  flatten(options) {
    let flat = [];
    options.forEach(option => {
      const expanded = option instanceof Alternation
        ? this.flatten(option.options)
        : option;
      flat = flat.concat(expanded);
    });

    return flat;
  }

  /**
   * The length of the first (i.e. longest) alternative.
   */
  get length() {
    const [longest] = this.options;
    return longest.length;
  }

  /**
   * Serializes each alternative (parenthesized where `parens` decides so)
   * and joins them with `|`.
   */
  toString(flags) {
    const parts = [];
    for (const option of this.options) {
      parts.push(parens(option, this, flags));
    }

    return parts.join('|');
  }
}

/**
 * Represents a character class (e.g. [0-9a-z])
 */
class CharClass {
  /**
   * @param a - first value handed to `regenerate`
   * @param b - second value handed to `regenerate`
   */
  constructor(a, b) {
    this.precedence = 1;
    this.set = regenerate(a, b);
  }

  get length() {
    return 1;
  }

  /**
   * True when no member of the set lies above U+FFFF.
   */
  get isSingleCharacter() {
    return this.set.toArray().every(codePoint => codePoint <= 0xffff);
  }

  get isSingleCodepoint() {
    return true;
  }

  /**
   * Serializes the underlying set, telling it whether the `u` flag is present.
   */
  toString(flags) {
    const hasUnicodeFlag = flags && flags.indexOf('u') !== -1;
    return this.set.toString({ hasUnicodeFlag });
  }

  getCharClass() {
    return this.set;
  }
}
/**
 * Represents a concatenation (e.g. `foo`)
 */
class Concatenation {
  /**
   * @param a - the left-hand expression
   * @param b - the right-hand expression
   */
  constructor(a, b) {
    this.precedence = 2;
    this.a = a;
    this.b = b;
  }

  /**
   * Sum of the lengths of both operands.
   */
  get length() {
    return this.a.length + this.b.length;
  }

  /**
   * Serializes the left operand followed by the right one.
   */
  toString(flags) {
    const left = parens(this.a, this, flags);
    const right = parens(this.b, this, flags);
    return left + right;
  }

  /**
   * Returns the literal found at the given side ('start' looks at the left
   * operand, 'end' at the right one), or undefined when that operand does not
   * expose `getLiteral` or the side is neither of the two.
   */
  getLiteral(side) {
    if (side !== 'start' && side !== 'end') {
      return undefined;
    }

    const operand = side === 'start' ? this.a : this.b;
    return operand.getLiteral ? operand.getLiteral(side) : undefined;
  }

  /**
   * Removes `len` characters from the operand at the given side, when that
   * operand supports `removeSubstring`. If one operand ends up empty the other
   * one is returned on its own; otherwise a new Concatenation is built.
   */
  removeSubstring(side, len) {
    let head = this.a;
    let tail = this.b;

    if (side === 'start') {
      if (head.removeSubstring) {
        head = head.removeSubstring(side, len);
      }
    } else if (side === 'end') {
      if (tail.removeSubstring) {
        tail = tail.removeSubstring(side, len);
      }
    }

    if (head.isEmpty) {
      return tail;
    }

    if (tail.isEmpty) {
      return head;
    }

    return new Concatenation(head, tail);
  }
}

/**
 * Represents a repetition (e.g. `a*` or `a?`)
 */
class Repetition {
  /**
   * @param expr - the repeated expression
   * @param {string} type - the quantifier appended after the expression
   */
  constructor(expr, type) {
    this.precedence = 3;
    this.expr = expr;
    this.type = type;
  }

  /**
   * Length of the repeated expression.
   */
  get length() {
    return this.expr.length;
  }

  /**
   * Serializes the expression followed by the quantifier.
   */
  toString(flags) {
    const inner = parens(this.expr, this, flags);
    return inner + this.type;
  }
}
/**
 * Represents a literal (e.g. a string)
 */
class Literal {
  /**
   * @param {string} value - the raw, unescaped text of the literal
   */
  constructor(value) {
    this.precedence = 2;
    this.value = value;
  }

  /** True when the literal holds no text. */
  get isEmpty() {
    return !this.value;
  }

  /** True when the literal is exactly one UTF-16 code unit long. */
  get isSingleCharacter() {
    return this.length === 1;
  }

  /** True when the literal is exactly one Unicode code point long. */
  get isSingleCodepoint() {
    return Array.from(this.value).length === 1;
  }

  /** Length of the literal in UTF-16 code units. */
  get length() {
    return this.value.length;
  }

  /**
   * Returns the regex source matching this literal.
   *
   * @param {string} [flags] - regex flags; `u` switches to ES2015 escapes
   * @return {string}
   */
  toString(flags) {
    const isUnicode = flags && flags.indexOf('u') !== -1;

    // `-` only has a special meaning inside a character class, and a Literal is
    // never serialized inside one. With the `u` flag, `\-` outside of a class is
    // a syntax error, so the dash is left alone there. Without the flag the
    // existing `\-` output is kept so generated patterns do not change.
    const metaChars = isUnicode
      ? /[\t\n\f\r\$\(\)\*\+\.\?\[\]\^\|]/g
      : /[\t\n\f\r\$\(\)\*\+\-\.\?\[\]\^\|]/g;

    return jsesc(this.value, { es6: isUnicode })
      .replace(metaChars, '\\$&')

      // Escape curly braces, except those which are part of Unicode escapes.
      // Complete escape sequences (a backslash plus the character it escapes, or
      // a whole `\u{...}`) are consumed as a unit, so an escaped backslash that
      // happens to be followed by `u{...}` is not mistaken for a Unicode escape.
      .replace(/(\\(?:u\{[a-z0-9]+\}|[^]))|([\{\}])/ig, (match, escape, brace) => escape || '\\' + brace);
  }

  /**
   * Returns the value for use in a character class, or undefined when the
   * literal is not a single code point.
   */
  getCharClass() {
    if (this.isSingleCodepoint) {
      return this.value;
    }
  }

  /** Returns the raw text of the literal. */
  getLiteral() {
    return this.value;
  }

  /**
   * Returns a new Literal with `len` code units removed from the given side
   * ('start' or 'end'); returns undefined for any other side.
   */
  removeSubstring(side, len) {
    if (side === 'start') {
      return new Literal(this.value.slice(len));
    }

    if (side === 'end') {
      return new Literal(this.value.slice(0, this.value.length - len));
    }
  }
}

/**
 * Serializes `exp` as a child of `parent`, wrapping it in a non-capturing group
 * when it binds less tightly than the parent. Single characters (and, with the
 * `u` flag, single code points) are never wrapped.
 *
 * @param exp - the child expression
 * @param parent - the expression containing `exp`
 * @param {string} [flags] - regex flags
 * @return {string}
 */
function parens(exp, parent, flags) {
  let isUnicode = flags && flags.indexOf('u') !== -1;
  let str = exp.toString(flags);
  if (exp.precedence < parent.precedence && !exp.isSingleCharacter && !(isUnicode && exp.isSingleCodepoint)) {
    return '(?:' + str + ')';
  }

  return str;
}
/**
 * Implements Hopcroft's DFA minimization algorithm.
 * https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft.27s_algorithm
 *
 * @param {State} root - the initial state of the DFA
 * @return {State} - the new initial state
 */
function minimize(root) {
  let states = new Set(root.visit());
  let finalStates = states.filter(s => s.accepting);
  let otherStates = states.difference(finalStates);

  // Create a map of incoming transitions to each state, grouped by character.
  let transitions = new Map(k => new Map(k => new Set));
  for (let s of states) {
    for (let [t, st] of s.transitions) {
      transitions.get(st).get(t).add(s);
    }
  }

  // Start from the accepting / non-accepting split. An empty group is left out:
  // it is not a state of the minimized DFA, and the rebuild loop below needs a
  // representative member for every group. One of the two groups is empty when
  // every state is accepting or when none is.
  let P = new Set([finalStates, otherStates].filter(group => group.size > 0));
  let W = new Set(P);

  while (W.size > 0) {
    let A = W.shift();

    // Collect states that have transitions leading to states in A, grouped by character.
    let t = new Map(k => new Set);
    for (let s of A) {
      for (let [T, X] of transitions.get(s)) {
        t.get(T).addAll(X);
      }
    }

    for (let X of t.values()) {
      for (let Y of P) {
        // Only split Y when X covers part of it but not all of it.
        let i = X.intersection(Y);
        if (i.size === 0) {
          continue;
        }

        let d = Y.difference(X);
        if (d.size === 0) {
          continue;
        }

        P.replace(Y, i, d);

        // If Y was still waiting to be processed, both halves take its place;
        // otherwise only the smaller half is queued.
        let y = W.find(v => v.equals(Y));
        if (y) {
          W.replace(y, i, d);
        } else if (i.size <= d.size) {
          W.add(i);
        } else {
          W.add(d);
        }
      }
    }
  }

  // Each set S in P now represents a state in the minimized DFA.
  // Build the new states and transitions.
  let newStates = new Map(k => new State);
  let initial = null;

  for (let S of P) {
    // Any member can stand in for the whole group.
    let first = S.first();
    let s = newStates.get(S);
    for (let [c, old] of first.transitions) {
      s.transitions.set(c, newStates.get(P.find(v => v.has(old))));
    }

    s.accepting = first.accepting;

    if (S.has(root)) {
      initial = s;
    }
  }

  return initial;
}
/**
 * Implements Brzozowski's algebraic method to convert a DFA into a regular
 * expression pattern.
 * http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392
 *
 * @param {State} root - the initial state of the DFA
 * @param {string} flags - The flags to add to the regex.
 * @return {String} - the converted regular expression pattern. When no
 *   accepting state contributes to the initial state's equation, the pattern
 *   is `[]`, an empty character class that never matches.
 */
function toRegex(root, flags) {
  let states = Array.from(root.visit());

  // Position of each state in `states`, so that transition targets are resolved
  // with a constant-time lookup. The first occurrence wins, like indexOf.
  let positions = new Map();
  states.forEach((state, i) => {
    if (!positions.has(state)) {
      positions.set(state, i);
    }
  });

  // Setup the system of equations A and B from Arden's Lemma.
  // A represents a state transition table for the given DFA.
  // B is a vector of accepting states in the DFA, marked as epsilons.
  let A = [];
  let B = [];

  for (let i = 0; i < states.length; i++) {
    let a = states[i];
    if (a.accepting) {
      B[i] = new Literal('');
    }

    A[i] = [];
    for (let [t, s] of a.transitions) {
      let j = positions.get(s);
      A[i][j] = A[i][j] ? union(A[i][j], new Literal(t)) : new Literal(t);
    }
  }

  // Solve the system of equations, eliminating states from last to first
  for (let n = states.length - 1; n >= 0; n--) {
    if (A[n][n] != null) {
      B[n] = concat(star(A[n][n]), B[n]);
      for (let j = 0; j < n; j++) {
        A[n][j] = concat(star(A[n][n]), A[n][j]);
      }
    }

    for (let i = 0; i < n; i++) {
      if (A[i][n] != null) {
        B[i] = union(B[i], concat(A[i][n], B[n]));
        for (let j = 0; j < n; j++) {
          A[i][j] = union(A[i][j], concat(A[i][n], A[n][j]));
        }
      }
    }
  }

  // Nothing is accepted (e.g. a DFA without accepting states): there is no
  // expression to serialize, so emit a pattern that cannot match.
  if (B[0] == null) {
    return '[]';
  }

  return B[0].toString(flags);
}

/**
 * Creates a repetition if `exp` exists.
 */
function star(exp) {
  return exp ? new Repetition(exp, '*') : null;
}

/**
 * Creates a union between two expressions.
 * When only one of them exists (or both are the same object), it is returned
 * unchanged.
 */
function union(a, b) {
  if (a != null && b != null && a !== b) {
    // Hoist common substrings at the start and end of the options
    let start, end, res;
    [a, b, start] = removeCommonSubstring(a, b, 'start');
    [a, b, end] = removeCommonSubstring(a, b, 'end');

    // If a or b is empty, make an optional group instead
    if (a.isEmpty || b.isEmpty) {
      res = new Repetition(a.isEmpty ? b : a, '?');
    } else if (a instanceof Repetition && a.type === '?') {
      res = new Repetition(new Alternation(a.expr, b), '?');
    } else if (b instanceof Repetition && b.type === '?') {
      res = new Repetition(new Alternation(a, b.expr), '?');
    } else {
      // Check if we can make a character class instead of an alternation
      let ac = a.getCharClass && a.getCharClass();
      let bc = b.getCharClass && b.getCharClass();
      if (ac && bc) {
        res = new CharClass(ac, bc);
      } else {
        res = new Alternation(a, b);
      }
    }

    // Put the hoisted prefix and suffix back around the combined expression
    if (start) {
      res = new Concatenation(new Literal(start), res);
    }

    if (end) {
      res = new Concatenation(res, new Literal(end));
    }

    return res;
  }

  return a || b;
}

/**
 * Removes the common prefix or suffix from the two expressions.
 *
 * @param a - first expression
 * @param b - second expression
 * @param {string} side - 'start' for the prefix, 'end' for the suffix
 * @return {Array} - `[a, b, common]`: the (possibly shortened) expressions and
 *   the removed text; `common` is null when either side has no literal there,
 *   and '' when the literals share nothing
 */
function removeCommonSubstring(a, b, side) {
  let al = a.getLiteral && a.getLiteral(side);
  let bl = b.getLiteral && b.getLiteral(side);
  if (!al || !bl) {
    return [a, b, null];
  }

  let s = commonSubstring(al, bl, side);
  if (!s) {
    return [a, b, ''];
  }

  a = a.removeSubstring(side, s.length);
  b = b.removeSubstring(side, s.length);

  return [a, b, s];
}

/**
 * Finds the common prefix or suffix between two strings.
 * The comparison walks whole code points, so a surrogate pair is never split.
 */
function commonSubstring(a, b, side) {
  let dir = side === 'start' ? 1 : -1;
  a = Array.from(a);
  b = Array.from(b);
  let ai = dir === 1 ? 0 : a.length - 1;
  let ae = dir === 1 ? a.length : -1;
  let bi = dir === 1 ? 0 : b.length - 1;
  let be = dir === 1 ? b.length : -1;
  let res = '';

  for (; ai !== ae && bi !== be && a[ai] === b[bi]; ai += dir, bi += dir) {
    if (dir === 1) {
      res += a[ai];
    } else {
      res = a[ai] + res;
    }
  }

  return res;
}

/**
 * Creates a concatenation between expressions a and b.
 * Returns null when either expression is missing; empty expressions are
 * dropped and adjacent literals are merged into one.
 */
function concat(a, b) {
  if (a == null || b == null) {
    return null;
  }

  if (a.isEmpty) {
    return b;
  }

  if (b.isEmpty) {
    return a;
  }

  // Combine literals
  if (a instanceof Literal && b instanceof Literal) {
    return new Literal(a.value + b.value);
  }

  if (a instanceof Literal && b instanceof Concatenation && b.a instanceof Literal) {
    return new Concatenation(new Literal(a.value + b.a.value), b.b);
  }

  if (b instanceof Literal && a instanceof Concatenation && a.b instanceof Literal) {
    return new Concatenation(a.a, new Literal(a.b.value + b.value));
  }

  return new Concatenation(a, b);
}
/**
 * A Trie represents a set of strings in a tree data structure
 * where each edge represents a single character.
 * https://en.wikipedia.org/wiki/Trie
 */
class Trie {
  constructor() {
    this.alphabet = new Set;
    this.root = new State;
  }

  /**
   * Adds the given string to the trie.
   * Walks one transition per character from the root, records every character
   * in the alphabet, and marks the state reached last as accepting.
   * @param {string} string - the string to add
   */
  add(string) {
    const last = [...string].reduce((node, char) => {
      this.alphabet.add(char);
      return node.transitions.get(char);
    }, this.root);

    last.accepting = true;
  }

  /**
   * Adds the given array of strings to the trie.
   * @param {Array} strings - the array of strings to add
   */
  addAll(strings) {
    for (const entry of strings) {
      this.add(entry);
    }
  }

  /**
   * Returns a minimal DFA representing the strings in the trie.
   * @return {State} - the starting state of the minimal DFA
   */
  minimize() {
    const dfa = minimize(this.root);
    return dfa;
  }

  /**
   * Returns a regex pattern that matches the strings in the trie.
   * @param {string} flags - The flags to add to the regex.
   * @return {string} pattern - The regex pattern.
   */
  toString(flags) {
    const dfa = this.minimize();
    return toRegex(dfa, flags);
  }

  /**
   * Returns a regex that matches the strings in the trie.
   * @param {string} flags - The flags to add to the regex.
   * @return {RegExp}
   */
  toRegExp(flags) {
    const pattern = this.toString(flags);
    return new RegExp(pattern, flags);
  }
}
function () { 47 | let t = new regexgen.Trie; 48 | t.add('foobar'); 49 | t.add('foobaz'); 50 | 51 | assert.deepEqual(t.toString(), 'fooba[rz]'); 52 | assert.deepEqual(t.toRegExp(), /fooba[rz]/); 53 | 54 | let t2 = new regexgen.Trie; 55 | t2.addAll(['foobar', 'foobaz']); 56 | 57 | assert.deepEqual(t2.toString(), 'fooba[rz]'); 58 | assert.deepEqual(t2.toRegExp(), /fooba[rz]/); 59 | }); 60 | 61 | it('should work with optional groups', function () { 62 | assert.deepEqual(regexgen(['a', 'abc']), /a(?:bc)?/); 63 | }); 64 | 65 | it('should wrap optional character classes in parens if they contain non-BMP codepoints', function () { 66 | assert.deepEqual(regexgen(['\u261D', '\u261D\u{1f3fb}', '\u261D\u{1f3fc}']), /\u261D(?:\uD83C[\uDFFB\uDFFC])?/); 67 | }); 68 | 69 | it('should wrap optional literals in parens if they contain more than one code unit', function () { 70 | assert.deepEqual(regexgen(['\u261D', '\u261D\u{1f3fb}']), /\u261D(?:\uD83C\uDFFB)?/); 71 | }); 72 | 73 | it('should retain non-BMP codepoints when the Unicode flag is passed', function () { 74 | assert.deepEqual(regexgen(['\u261D', '\u261D\u{1f3fb}'], 'u'), /\u261D\u{1F3FB}?/u); 75 | assert.deepEqual( 76 | regexgen(['\u{1F3F4}', '\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}', '\u{1F3F4}\u{E0067}\u{E0062}\u{E0077}\u{E006C}\u{E0073}', '\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}'], 'u'), 77 | /\u{1F3F4}(?:\u{E0067}\u{E0062}(?:\u{E0073}\u{E0063}\u{E0074}|\u{E0077}\u{E006C}\u{E0073}|\u{E0065}\u{E006E}\u{E0067}))?/u 78 | ); 79 | }); 80 | 81 | it('should handle non-BMP codepoint ranges correctly', function() { 82 | assert.deepEqual( 83 | regexgen(['\u{1F311}', '\u{1F312}', '\u{1F313}', '\u{1F314}', '\u{1F315}', '\u{1F316}', '\u{1F317}', '\u{1F318}'], 'u'), 84 | /[\u{1F311}-\u{1F318}]/u 85 | ); 86 | }); 87 | 88 | it('should correctly extract common prefix from multiple alternations', function () { 89 | assert.deepEqual(regexgen(['abjv', 'abxcjv', 'abydjv', 'abzejv']), /ab(?:ze|yd|xc)?jv/); 90 | 
}); 91 | 92 | it('should sort alternation options correctly (#10)', function () { 93 | let s = '\uD83C\uDFCA\uD83C\uDFFD\u200D\u2640\uFE0F'; 94 | let r = regexgen([ 95 | '\uD83C\uDDF7\uD83C\uDDFC', 96 | '\uD83C\uDDF8\uD83C\uDDE6', 97 | '\uD83C\uDFCA\uD83C\uDFFD', 98 | s 99 | ]); 100 | 101 | assert.deepEqual(s.match(r)[0], s); 102 | }); 103 | 104 | it('should sort non-BMP alternation options correctly', function () { 105 | let r = regexgen( 106 | [ 107 | // shrug emoji 108 | '\u{1F937}\u200D', 109 | // shrug emoji with fitzpatrick modifiers 110 | '\u{1F937}\u{1F3FB}\u200D', 111 | '\u{1F937}\u{1F3FC}\u200D', 112 | '\u{1F937}\u{1F3FD}\u200D', 113 | '\u{1F937}\u{1F3FE}\u200D', 114 | '\u{1F937}\u{1F3FF}\u200D', 115 | // shrug emoji with gender modifier 116 | '\u{1F937}\u200D\u2640\uFE0F', 117 | // shrug emoji with gender and fitzpatrick modifiers 118 | '\u{1F937}\u{1F3FB}\u200D\u2640\uFE0F', 119 | '\u{1F937}\u{1F3FC}\u200D\u2640\uFE0F', 120 | '\u{1F937}\u{1F3FD}\u200D\u2640\uFE0F', 121 | '\u{1F937}\u{1F3FE}\u200D\u2640\uFE0F', 122 | '\u{1F937}\u{1F3FF}\u200D\u2640\uFE0F' 123 | ], 124 | 'u' 125 | ); 126 | 127 | assert.deepEqual(r, /\u{1F937}[\u{1F3FB}-\u{1F3FF}]?\u200D(?:\u2640\uFE0F)?/u); 128 | assert.deepEqual('\u{1F937}\u{1F3FB}\u200D\u2640\uFE0F'.match(r)[0], '\u{1F937}\u{1F3FB}\u200D\u2640\uFE0F'); 129 | }); 130 | 131 | it('should sort alternations of alternations correctly', function () { 132 | let r = regexgen(['aef', 'aghz', 'ayz', 'abcdz', 'abcd']); 133 | let s = 'abcdz'; 134 | 135 | assert.deepEqual(s.match(r)[0], s); 136 | assert.deepEqual(r, /a(?:(?:bcd|gh|y)z|bcd|ef)/); 137 | }); 138 | }); 139 | --------------------------------------------------------------------------------