├── .editorconfig
├── .gitignore
├── README.md
├── bin
    └── cli.js
├── index.js
├── package.json
├── src
    ├── ast.js
    ├── map.js
    ├── minimize.js
    ├── regex.js
    ├── set.js
    ├── state.js
    └── trie.js
└── test
    └── test.js


/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*]
 4 | charset = utf-8
 5 | indent_style = space
 6 | indent_size = 2
 7 | end_of_line = lf
 8 | insert_final_newline = true
 9 | trim_trailing_whitespace = true
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | .DS_Store
3 | yarn.lock
4 | coverage/
5 | .nyc_output
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # regexgen
 2 | 
 3 | Generates regular expressions that match a set of strings.
 4 | 
 5 | ## Installation
 6 | 
 7 | `regexgen` can be installed using [npm](https://npmjs.com):
 8 | 
 9 | ```
10 | npm install regexgen
11 | ```
12 | 
13 | ## Example
14 | 
 15 | The simplest use is to pass an array of strings to `regexgen`:
16 | 
17 | ```javascript
18 | const regexgen = require('regexgen');
19 | 
20 | regexgen(['foobar', 'foobaz', 'foozap', 'fooza']); // => /foo(?:zap?|ba[rz])/
21 | ```
22 | 
23 | You can also use the `Trie` class directly:
24 | 
25 | ```javascript
26 | const {Trie} = require('regexgen');
27 | 
28 | let t = new Trie;
29 | t.add('foobar');
30 | t.add('foobaz');
31 | 
32 | t.toRegExp(); // => /fooba[rz]/
33 | ```
34 | 
35 | ## CLI
36 | 
37 | `regexgen` also has a simple CLI to generate regexes using inputs from the command line.
38 | 
39 | ```shell
40 | $ regexgen
41 | Usage: regexgen [-gimuy] string1 string2 string3...
42 | ```
43 | 
44 | The optional first parameter is the [flags](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp) to add
45 | to the regex (e.g. `-i` for a case insensitive match).
46 | 
47 | ## ES2015 and Unicode
48 | 
 49 | By default `regexgen` will output a standard JavaScript regular expression, with Unicode codepoints outside the Basic Multilingual Plane converted into UTF-16 surrogate pairs.
50 | 
51 | If desired, you can request an ES2015-compatible Unicode regular expression by supplying the `-u` flag, which results in those codepoints being retained.
52 | 
53 | ```shell
54 | $ regexgen 👩 👩‍💻 👩🏻‍💻 👩🏼‍💻 👩🏽‍💻 👩🏾‍💻 👩🏿‍💻
55 | /\uD83D\uDC69(?:(?:\uD83C[\uDFFB-\uDFFF])?\u200D\uD83D\uDCBB)?/
56 | 
57 | $ regexgen -u 👩 👩‍💻 👩🏻‍💻 👩🏼‍💻 👩🏽‍💻 👩🏾‍💻 👩🏿‍💻
58 | /\u{1F469}(?:[\u{1F3FB}-\u{1F3FF}]?\u200D\u{1F4BB})?/u
59 | ```
60 | 
61 | 
 62 | Such regular expressions are compatible with current versions of Node, as well as the latest browsers, and may be more transferable to other languages.
63 | 
64 | ## How does it work?
65 | 
66 | 1. Generate a [Trie](https://en.wikipedia.org/wiki/Trie) containing all of the input strings.
67 |    This is a tree structure where each edge represents a single character. This removes
68 |    redundancies at the start of the strings, but common branches further down are not merged.
69 | 
70 | 2. A trie can be seen as a tree-shaped deterministic finite automaton (DFA), so DFA algorithms
71 |    can be applied. In this case, we apply [Hopcroft's DFA minimization algorithm](https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft.27s_algorithm)
72 |    to merge the nondistinguishable states.
73 | 
74 | 3. Convert the resulting minimized DFA to a regular expression. This is done using
75 |    [Brzozowski's algebraic method](http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392),
76 |    which is quite elegant. It expresses the DFA as a system of equations which can be solved
77 |    for a resulting regex. Along the way, some additional optimizations are made, such
78 |    as hoisting common substrings out of an alternation, and using character class ranges.
 79 |    This produces an [Abstract Syntax Tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
80 |    (AST) for the regex, which is then converted to a string and compiled to a JavaScript
81 |    `RegExp` object.
82 | 
83 | ## License
84 | 
85 | MIT
86 | 


--------------------------------------------------------------------------------
/bin/cli.js:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env node
 2 | 
 3 | const regexgen = require('../');
 4 | 
 5 | let args = process.argv.slice(2);
 6 | let flags = '';
 7 | if (args.length && args[0][0] === '-') {
 8 |   flags = args.shift().slice(1);
 9 | }
10 | 
11 | if (args.length === 0) {
12 |   console.log('Usage: regexgen [-gimuy] string1 string2 string3...');
13 |   process.exit(1);
14 | }
15 | 
16 | console.log(regexgen(args, flags));
17 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
 1 | const Trie = require('./src/trie');
 2 | 
 3 | /**
 4 |  * Generates a regular expression that matches the given input strings.
 5 |  * @param {Array<string>} inputs
 6 |  * @param {string} flags
 7 |  * @return {RegExp}
 8 |  */
 9 | function regexgen(inputs, flags) {
10 |   let trie = new Trie;
11 |   trie.addAll(inputs);
12 |   return trie.toRegExp(flags);
13 | }
14 | 
15 | regexgen.Trie = Trie;
16 | module.exports = regexgen;
17 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "regexgen",
 3 |   "version": "1.3.0",
 4 |   "description": "Generate regular expressions that match a set of strings",
 5 |   "main": "index.js",
 6 |   "bin": {
 7 |     "regexgen": "bin/cli.js"
 8 |   },
 9 |   "dependencies": {
10 |     "jsesc": "^2.3.0",
11 |     "regenerate": "^1.3.2"
12 |   },
13 |   "devDependencies": {
14 |     "mocha": "^3.2.0"
15 |   },
16 |   "scripts": {
17 |     "test": "mocha"
18 |   },
19 |   "engines": {
20 |     "node": ">= 6"
21 |   },
22 |   "repository": {
23 |     "type": "git",
24 |     "url": "git+https://github.com/devongovett/regexgen.git"
25 |   },
26 |   "keywords": [
27 |     "regex",
28 |     "trie",
29 |     "regular",
30 |     "expression"
31 |   ],
32 |   "author": "Devon Govett <devongovett@gmail.com>",
33 |   "license": "MIT",
34 |   "bugs": {
35 |     "url": "https://github.com/devongovett/regexgen/issues"
36 |   },
37 |   "homepage": "https://github.com/devongovett/regexgen#readme",
38 |   "runkitExample": "const regexgen = require('regexgen');\n\nregexgen(['foobar', 'foobaz', 'foozap', 'fooza']);"
39 | }
40 | 


--------------------------------------------------------------------------------
/src/ast.js:
--------------------------------------------------------------------------------
  1 | const jsesc = require('jsesc');
  2 | const regenerate = require('regenerate');
  3 | 
  4 | /**
  5 |  * Represents an alternation (e.g. `foo|bar`)
  6 |  */
  7 | class Alternation {
  8 |   constructor(...options) {
  9 |     this.precedence = 1;
 10 |     this.options = this.flatten(options);
 11 |     this.options.sort((a, b) => b.length - a.length);
 12 |   }
 13 | 
 14 |   flatten(options) {
 15 |     return options.reduce((res, option) => res.concat(
 16 |       option instanceof Alternation ? this.flatten(option.options) : option
 17 |     ), []);
 18 |   }
 19 | 
 20 |   get length() {
 21 |     return this.options[0].length;
 22 |   }
 23 | 
 24 |   toString(flags) {
 25 |     return this.options.map(o => parens(o, this, flags)).join('|');
 26 |   }
 27 | }
 28 | 
 29 | /**
 30 |  * Represents a character class (e.g. [0-9a-z])
 31 |  */
 32 | class CharClass {
 33 |   constructor(a, b) {
 34 |     this.precedence = 1;
 35 |     this.set = regenerate(a, b);
 36 |   }
 37 | 
 38 |   get length() {
 39 |     return 1;
 40 |   }
 41 | 
 42 |   get isSingleCharacter() {
 43 |     return !this.set.toArray().some(c => c > 0xffff);
 44 |   }
 45 | 
 46 |   get isSingleCodepoint() {
 47 |     return true;
 48 |   }
 49 | 
 50 |   toString(flags) {
 51 |     return this.set.toString({
 52 |       hasUnicodeFlag: flags && flags.indexOf('u') !== -1
 53 |     });
 54 |   }
 55 | 
 56 |   getCharClass() {
 57 |     return this.set;
 58 |   }
 59 | }
 60 | 
 61 | /**
 62 |  * Represents a concatenation (e.g. `foo`)
 63 |  */
 64 | class Concatenation {
 65 |   constructor(a, b) {
 66 |     this.precedence = 2;
 67 |     this.a = a;
 68 |     this.b = b;
 69 |   }
 70 | 
 71 |   get length() {
 72 |     return this.a.length + this.b.length;
 73 |   }
 74 | 
 75 |   toString(flags) {
 76 |     return parens(this.a, this, flags) + parens(this.b, this, flags);
 77 |   }
 78 | 
 79 |   getLiteral(side) {
 80 |     if (side === 'start' && this.a.getLiteral) {
 81 |       return this.a.getLiteral(side);
 82 |     }
 83 | 
 84 |     if (side === 'end' && this.b.getLiteral) {
 85 |       return this.b.getLiteral(side);
 86 |     }
 87 |   }
 88 | 
 89 |   removeSubstring(side, len) {
 90 |     let {a, b} = this;
 91 |     if (side === 'start' && a.removeSubstring) {
 92 |       a = a.removeSubstring(side, len);
 93 |     }
 94 | 
 95 |     if (side === 'end' && b.removeSubstring) {
 96 |       b = b.removeSubstring(side, len);
 97 |     }
 98 | 
 99 |     return a.isEmpty ? b : b.isEmpty ? a : new Concatenation(a, b);
100 |   }
101 | }
102 | 
/**
 * AST node for a repeated expression (e.g. `a*` or `a?`).
 * `type` is the quantifier text appended after the expression.
 */
class Repetition {
  constructor(expr, type) {
    Object.assign(this, { precedence: 3, expr, type });
  }

  /** Same as the length of the repeated expression. */
  get length() {
    const { expr } = this;
    return expr.length;
  }

  toString(flags) {
    const body = parens(this.expr, this, flags);
    return body + this.type;
  }
}
121 | 
/**
 * Represents a literal (e.g. a string)
 */
class Literal {
  /**
   * @param {string} value - the raw (unescaped) text of the literal
   */
  constructor(value) {
    this.precedence = 2;
    this.value = value;
  }

  /** True when the literal has no text. */
  get isEmpty() {
    return !this.value;
  }

  /** True when the literal is exactly one UTF-16 code unit long. */
  get isSingleCharacter() {
    return this.length === 1;
  }

  /** True when the literal is exactly one codepoint long. */
  get isSingleCodepoint() {
    return Array.from(this.value).length === 1;
  }

  /** Length of the literal in UTF-16 code units. */
  get length() {
    return this.value.length;
  }

  /**
   * Renders the literal as regex source text.
   *
   * @param {string} flags - the regex flags; a `u` selects ES2015 Unicode escapes
   * @return {string} the escaped pattern text
   */
  toString(flags) {
    const isUnicode = flags && flags.indexOf('u') !== -1;

    return jsesc(this.value, { es6: isUnicode })
      // Escape regex meta characters. A dash is only special inside a
      // character class, and `\-` outside of one is a syntax error in a `u`
      // regex, so it is left untouched when the Unicode flag is set.
      .replace(/[\t\n\f\r\$\(\)\*\+\-\.\?\[\]\^\|]/g, ch => (isUnicode && ch === '-') ? ch : '\\' + ch)

      // Escape curly braces, except those that belong to a `\u{...}` escape.
      // Escape sequences are consumed as whole units from left to right, so
      // that an escaped backslash followed by `u{...}` (a literal `\u{41}`
      // in the input) is not mistaken for a Unicode escape.
      .replace(/\\(?:u\{[0-9a-fA-F]+\}|[^])|[{}]/g, match => match[0] === '\\' ? match : '\\' + match);
  }

  /**
   * Returns the value for use in a character class, or undefined when the
   * literal is not exactly one codepoint.
   */
  getCharClass() {
    if (this.isSingleCodepoint) {
      return this.value;
    }
  }

  getLiteral() {
    return this.value;
  }

  /**
   * Returns a new Literal with `len` code units removed from the given side
   * ('start' or 'end'); undefined for any other side.
   */
  removeSubstring(side, len) {
    if (side === 'start') {
      return new Literal(this.value.slice(len));
    }

    if (side === 'end') {
      return new Literal(this.value.slice(0, this.value.length - len));
    }
  }
}
175 | 
/**
 * Renders `exp` for embedding inside `parent`, wrapping it in a
 * non-capturing group when it has a lower precedence than the parent.
 * No group is added for an expression flagged as a single character, nor,
 * when the `u` flag is present, for one flagged as a single codepoint.
 */
function parens(exp, parent, flags) {
  const unicodeMode = flags && flags.indexOf('u') !== -1;
  const rendered = exp.toString(flags);

  if (!(exp.precedence < parent.precedence)) {
    return rendered;
  }

  if (exp.isSingleCharacter) {
    return rendered;
  }

  if (unicodeMode && exp.isSingleCodepoint) {
    return rendered;
  }

  return '(?:' + rendered + ')';
}
185 | 
186 | exports.Alternation = Alternation;
187 | exports.CharClass = CharClass;
188 | exports.Concatenation = Concatenation;
189 | exports.Repetition = Repetition;
190 | exports.Literal = Literal;
191 | 


--------------------------------------------------------------------------------
/src/map.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This ES6 Map subclass calls the getter function passed to
 3 |  * the constructor to initialize undefined properties when they
 4 |  * are first retrieved.
 5 |  */
 6 | class DefaultMap extends Map {
 7 |   constructor(iterable, defaultGetter) {
 8 |     if (typeof iterable === 'function') {
 9 |       defaultGetter = iterable;
10 |       iterable = null;
11 |     }
12 | 
13 |     super(iterable);
14 |     this.defaultGetter = defaultGetter;
15 |   }
16 | 
17 |   get(key) {
18 |     if (!super.has(key)) {
19 |       let res = this.defaultGetter(key);
20 |       this.set(key, res);
21 |       return res;
22 |     }
23 | 
24 |     return super.get(key);
25 |   }
26 | }
27 | 
28 | module.exports = DefaultMap;
29 | 


--------------------------------------------------------------------------------
/src/minimize.js:
--------------------------------------------------------------------------------
 1 | const Map = require('./map');
 2 | const Set = require('./set');
 3 | const State = require('./state');
 4 | 
 5 | /**
 6 |  * Implements Hopcroft's DFA minimization algorithm.
 7 |  * https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft.27s_algorithm
 8 |  *
 9 |  * @param {State} root - the initial state of the DFA
10 |  * @return {State} - the new initial state
11 |  */
12 | function minimize(root) {
13 |   let states = new Set(root.visit());
14 |   let finalStates = states.filter(s => s.accepting);
15 | 
16 |   // Create a map of incoming transitions to each state, grouped by character.
17 |   let transitions = new Map(k => new Map(k => new Set));
18 |   for (let s of states) {
19 |     for (let [t, st] of s.transitions) {
20 |       transitions.get(st).get(t).add(s);
21 |     }
22 |   }
23 | 
24 |   let P = new Set([finalStates, states.difference(finalStates)]);
25 |   let W = new Set(P);
26 | 
27 |   while (W.size > 0) {
28 |     let A = W.shift();
29 | 
30 |     // Collect states that have transitions leading to states in A, grouped by character.
31 |     let t = new Map(k => new Set);
32 |     for (let s of A) {
33 |       for (let [T, X] of transitions.get(s)) {
34 |         t.get(T).addAll(X);
35 |       }
36 |     }
37 | 
38 |     for (let X of t.values()) {
39 |       for (let Y of P) {
40 |         let i = X.intersection(Y);
41 |         if (i.size === 0) {
42 |           continue;
43 |         }
44 | 
45 |         let d = Y.difference(X);
46 |         if (d.size === 0) {
47 |           continue;
48 |         }
49 | 
50 |         P.replace(Y, i, d);
51 | 
52 |         let y = W.find(v => v.equals(Y));
53 |         if (y) {
54 |           W.replace(y, i, d);
55 |         } else if (i.size <= d.size) {
56 |           W.add(i);
57 |         } else {
58 |           W.add(d);
59 |         }
60 |       }
61 |     }
62 |   }
63 | 
64 |   // Each set S in P now represents a state in the minimized DFA.
65 |   // Build the new states and transitions.
66 |   let newStates = new Map(k => new State);
67 |   let initial = null;
68 | 
69 |   for (let S of P) {
70 |     let first = S.first();
71 |     let s = newStates.get(S);
72 |     for (let [c, old] of first.transitions) {
73 |       s.transitions.set(c, newStates.get(P.find(v => v.has(old))));
74 |     }
75 | 
76 |     s.accepting = first.accepting;
77 | 
78 |     if (S.has(root)) {
79 |       initial = s;
80 |     }
81 |   }
82 | 
83 |   return initial;
84 | }
85 | 
86 | module.exports = minimize;
87 | 


--------------------------------------------------------------------------------
/src/regex.js:
--------------------------------------------------------------------------------
  1 | const {Alternation, CharClass, Concatenation, Repetition, Literal} = require('./ast');
  2 | 
  3 | /**
  4 |  * Implements Brzozowski's algebraic method to convert a DFA into a regular
  5 |  * expression pattern.
  6 |  * http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392
  7 |  *
  8 |  * @param {State} root - the initial state of the DFA
  9 |  * @param {string} flags - The flags to add to the regex.
 10 |  * @return {String} - the converted regular expression pattern
 11 |  */
 12 | function toRegex(root, flags) {
 13 |   let states = Array.from(root.visit());
 14 | 
 15 |   // Setup the system of equations A and B from Arden's Lemma.
 16 |   // A represents a state transition table for the given DFA.
 17 |   // B is a vector of accepting states in the DFA, marked as epsilons.
 18 |   let A = [];
 19 |   let B = [];
 20 | 
 21 |   for (let i = 0; i < states.length; i++) {
 22 |     let a = states[i];
 23 |     if (a.accepting) {
 24 |       B[i] = new Literal('');
 25 |     }
 26 | 
 27 |     A[i] = [];
 28 |     for (let [t, s] of a.transitions) {
 29 |       let j = states.indexOf(s);
 30 |       A[i][j] = A[i][j] ? union(A[i][j], new Literal(t)) : new Literal(t);
 31 |     }
 32 |   }
 33 | 
 34 |   // Solve the of equations
 35 |   for (let n = states.length - 1; n >= 0; n--) {
 36 |     if (A[n][n] != null) {
 37 |       B[n] = concat(star(A[n][n]), B[n]);
 38 |       for (let j = 0; j < n; j++) {
 39 |         A[n][j] = concat(star(A[n][n]), A[n][j]);
 40 |       }
 41 |     }
 42 | 
 43 |     for (let i = 0; i < n; i++) {
 44 |       if (A[i][n] != null) {
 45 |         B[i] = union(B[i], concat(A[i][n], B[n]));
 46 |         for (let j = 0; j < n; j++) {
 47 |           A[i][j] = union(A[i][j], concat(A[i][n], A[n][j]));
 48 |         }
 49 |       }
 50 |     }
 51 |   }
 52 | 
 53 |   return B[0].toString(flags);
 54 | }
 55 | 
 56 | /**
 57 |  * Creates a repetition if `exp` exists.
 58 |  */
 59 | function star(exp) {
 60 |   return exp ? new Repetition(exp, '*') : null;
 61 | }
 62 | 
 63 | /**
 64 |  * Creates a union between two expressions
 65 |  */
 66 | function union(a, b) {
 67 |   if (a != null && b != null && a !== b) {
 68 |     // Hoist common substrings at the start and end of the options
 69 |     let start, end, res;
 70 |     [a, b, start] = removeCommonSubstring(a, b, 'start');
 71 |     [a, b, end] = removeCommonSubstring(a, b, 'end');
 72 | 
 73 |     // If a or b is empty, make an optional group instead
 74 |     if (a.isEmpty || b.isEmpty) {
 75 |       res = new Repetition(a.isEmpty ? b : a, '?');
 76 |     } else if (a instanceof Repetition && a.type === '?') {
 77 |       res = new Repetition(new Alternation(a.expr, b), '?');
 78 |     } else if (b instanceof Repetition && b.type === '?') {
 79 |       res = new Repetition(new Alternation(a, b.expr), '?');
 80 |     } else {
 81 |       // Check if we can make a character class instead of an alternation
 82 |       let ac = a.getCharClass && a.getCharClass();
 83 |       let bc = b.getCharClass && b.getCharClass();
 84 |       if (ac && bc) {
 85 |         res = new CharClass(ac, bc);
 86 |       } else {
 87 |         res = new Alternation(a, b);
 88 |       }
 89 |     }
 90 | 
 91 |     if (start) {
 92 |       res = new Concatenation(new Literal(start), res);
 93 |     }
 94 | 
 95 |     if (end) {
 96 |       res = new Concatenation(res, new Literal(end));
 97 |     }
 98 | 
 99 |     return res;
100 |   }
101 | 
102 |   return a || b;
103 | }
104 | 
/**
 * Strips the prefix (`side` 'start') or suffix (`side` 'end') shared by
 * the two expressions.
 *
 * Returns `[a, b, shared]`. The expressions come back untouched, with
 * `shared` set to null when either has no literal on that side, or to ''
 * when the literals have nothing in common.
 */
function removeCommonSubstring(a, b, side) {
  const literalA = a.getLiteral && a.getLiteral(side);
  const literalB = b.getLiteral && b.getLiteral(side);
  if (!(literalA && literalB)) {
    return [a, b, null];
  }

  const shared = commonSubstring(literalA, literalB, side);
  if (!shared) {
    return [a, b, ''];
  }

  const count = shared.length;
  const trimmedA = a.removeSubstring(side, count);
  const trimmedB = b.removeSubstring(side, count);

  return [trimmedA, trimmedB, shared];
}
125 | 
/**
 * Finds the common prefix (`side` 'start') or suffix (any other `side`)
 * between two strings. The comparison works on whole codepoints.
 */
function commonSubstring(a, b, side) {
  const left = Array.from(a);
  const right = Array.from(b);
  const fromStart = side === 'start';
  const limit = Math.min(left.length, right.length);

  // Count how many codepoints agree, walking inwards from the chosen side.
  let count = 0;
  while (count < limit) {
    const x = fromStart ? left[count] : left[left.length - 1 - count];
    const y = fromStart ? right[count] : right[right.length - 1 - count];
    if (x !== y) {
      break;
    }

    count++;
  }

  const shared = fromStart
    ? left.slice(0, count)
    : left.slice(left.length - count);

  return shared.join('');
}
149 | 
/**
 * Creates a concatenation between expressions a and b.
 *
 * Returns null when either side is missing, and the other side as-is when
 * one of them is empty. Adjacent literals are merged into one literal.
 */
function concat(a, b) {
  if (a == null || b == null) {
    return null;
  }

  if (a.isEmpty) {
    return b;
  }

  if (b.isEmpty) {
    return a;
  }

  const aIsLiteral = a instanceof Literal;
  const bIsLiteral = b instanceof Literal;

  // literal + literal
  if (aIsLiteral && bIsLiteral) {
    return new Literal(a.value + b.value);
  }

  // literal + (literal, rest): merge into the head of the concatenation
  if (aIsLiteral && b instanceof Concatenation && b.a instanceof Literal) {
    const head = new Literal(a.value + b.a.value);
    return new Concatenation(head, b.b);
  }

  // (rest, literal) + literal: merge into the tail of the concatenation
  if (bIsLiteral && a instanceof Concatenation && a.b instanceof Literal) {
    const tail = new Literal(a.b.value + b.value);
    return new Concatenation(a.a, tail);
  }

  return new Concatenation(a, b);
}
181 | 
182 | module.exports = toRegex;
183 | 


--------------------------------------------------------------------------------
/src/set.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This class extends the native ES6 Set class with some additional methods
 3 |  */
 4 | class ExtendedSet extends Set {
 5 |   filter(fn) {
 6 |     let res = new ExtendedSet;
 7 |     for (let x of this) {
 8 |       if (fn(x)) {
 9 |         res.add(x);
10 |       }
11 |     }
12 | 
13 |     return res;
14 |   }
15 | 
16 |   difference(b) {
17 |     return this.filter(x => !b.has(x));
18 |   }
19 | 
20 |   intersection(b) {
21 |     return this.filter(x => b.has(x));
22 |   }
23 | 
24 |   equals(b) {
25 |     if (this.size !== b.size) {
26 |       return false;
27 |     }
28 | 
29 |     for (let x of this) {
30 |       if (!b.has(x)) {
31 |         return false;
32 |       }
33 |     }
34 | 
35 |     return true;
36 |   }
37 | 
38 |   find(fn) {
39 |     for (let x of this) {
40 |       if (fn(x)) {
41 |         return x;
42 |       }
43 |     }
44 | 
45 |     return null;
46 |   }
47 | 
48 |   first() {
49 |     return this.values().next().value;
50 |   }
51 | 
52 |   shift() {
53 |     let v = this.first();
54 |     this.delete(v);
55 |     return v;
56 |   }
57 | 
58 |   replace(search, ...replacements) {
59 |     if (this.delete(search)) {
60 |       this.addAll(replacements);
61 |     }
62 |   }
63 | 
64 |   addAll(items) {
65 |     for (let x of items) {
66 |       this.add(x);
67 |     }
68 |   }
69 | }
70 | 
71 | module.exports = ExtendedSet;
72 | 


--------------------------------------------------------------------------------
/src/state.js:
--------------------------------------------------------------------------------
 1 | const Map = require('./map');
 2 | 
 3 | /**
 4 |  * Represents a state in a DFA.
 5 |  */
 6 | class State {
 7 |   constructor() {
 8 |     this.accepting = false;
 9 |     this.transitions = new Map(k => new State);
10 |   }
11 | 
12 |   /**
13 |    * A generator that yields all states in the subtree
14 |    * starting with this state.
15 |    */
16 |   *visit(visited = new Set) {
17 |     if (visited.has(this)) return;
18 |     visited.add(this);
19 | 
20 |     yield this;
21 |     for (let state of this.transitions.values()) {
22 |       yield* state.visit(visited);
23 |     }
24 |   }
25 | }
26 | 
27 | module.exports = State;
28 | 


--------------------------------------------------------------------------------
/src/trie.js:
--------------------------------------------------------------------------------
 1 | const State = require('./state');
 2 | const minimize = require('./minimize');
 3 | const toRegex = require('./regex');
 4 | 
 5 | /**
 6 |  * A Trie represents a set of strings in a tree data structure
 7 |  * where each edge represents a single character.
 8 |  * https://en.wikipedia.org/wiki/Trie
 9 |  */
10 | class Trie {
11 |   constructor() {
12 |     this.alphabet = new Set;
13 |     this.root = new State;
14 |   }
15 | 
16 |   /**
17 |    * Adds the given string to the trie.
18 |    * @param {string} string - the string to add
19 |    */
20 |   add(string) {
21 |     let node = this.root;
22 |     for (let char of string) {
23 |       this.alphabet.add(char);
24 |       node = node.transitions.get(char);
25 |     }
26 | 
27 |     node.accepting = true;
28 |   }
29 | 
30 |   /**
31 |    * Adds the given array of strings to the trie.
32 |    * @param {Array<string>} strings - the array of strings to add
33 |    */
34 |   addAll(strings) {
35 |     for (let string of strings) {
36 |       this.add(string);
37 |     }
38 |   }
39 | 
40 |   /**
41 |    * Returns a minimal DFA representing the strings in the trie.
42 |    * @return {State} - the starting state of the minimal DFA
43 |    */
44 |   minimize() {
45 |     return minimize(this.root);
46 |   }
47 | 
48 |   /**
49 |    * Returns a regex pattern that matches the strings in the trie.
50 |    * @param {string} flags - The flags to add to the regex.
51 |    * @return {string} pattern - The regex pattern.
52 |    */
53 |   toString(flags) {
54 |     return toRegex(this.minimize(), flags);
55 |   }
56 | 
57 |   /**
58 |    * Returns a regex that matches the strings in the trie.
59 |    * @param {string} flags - The flags to add to the regex.
60 |    * @return {RegExp}
61 |    */
62 |   toRegExp(flags) {
63 |     return new RegExp(this.toString(flags), flags);
64 |   }
65 | }
66 | 
67 | module.exports = Trie;
68 | 


--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
  1 | const assert = require('assert');
  2 | const regexgen = require('../');
  3 | 
  4 | describe('regexgen', function () {
  5 |   it('should generate a char class', function () {
  6 |     assert.deepEqual(regexgen(['a', 'b', 'c']), /[a-c]/);
  7 |   });
  8 | 
  9 |   it('should generate an alternation', function () {
 10 |     assert.deepEqual(regexgen(['abc', '123']), /123|abc/);
 11 |   });
 12 | 
 13 |   it('should extract common prefixes at the start', function () {
 14 |     assert.deepEqual(regexgen(['foobar', 'foozap']), /foo(?:zap|bar)/);
 15 |   });
 16 | 
 17 |   it('should extract common prefixes at the end', function () {
 18 |     assert.deepEqual(regexgen(['barfoo', 'zapfoo']), /(?:zap|bar)foo/);
 19 |   });
 20 | 
 21 |   it('should extract common prefixes at the start and end', function () {
 22 |     assert.deepEqual(regexgen(['foobarfoo', 'foozapfoo']), /foo(?:zap|bar)foo/);
 23 |   });
 24 | 
 25 |   it('should generate an optional group', function () {
 26 |     assert.deepEqual(regexgen(['foo', 'foobar']), /foo(?:bar)?/);
 27 |   });
 28 | 
 29 |   it('should generate multiple optional groups', function () {
 30 |     assert.deepEqual(regexgen(['f', 'fo', 'fox']), /f(?:ox?)?/);
 31 |   });
 32 | 
 33 |   it('should escape meta characters', function () {
 34 |     assert.deepEqual(regexgen(['foo|bar[test]+']), /foo\|bar\[test\]\+/);
 35 |     assert.deepEqual(regexgen(['u{}\\iu']), /u\{\}\\iu/);
 36 |   });
 37 | 
 38 |   it('should escape non-ascii characters', function () {
 39 |     assert.deepEqual(regexgen(['🎉']), /\uD83C\uDF89/);
 40 |   });
 41 | 
 42 |   it('should support regex flags', function () {
 43 |     assert.deepEqual(regexgen(['a', 'b', 'c'], 'g'), /[a-c]/g);
 44 |   });
 45 | 
 46 |   it('should support using the Trie class directly', function () {
 47 |     let t = new regexgen.Trie;
 48 |     t.add('foobar');
 49 |     t.add('foobaz');
 50 | 
 51 |     assert.deepEqual(t.toString(), 'fooba[rz]');
 52 |     assert.deepEqual(t.toRegExp(), /fooba[rz]/);
 53 | 
 54 |     let t2 = new regexgen.Trie;
 55 |     t2.addAll(['foobar', 'foobaz']);
 56 | 
 57 |     assert.deepEqual(t2.toString(), 'fooba[rz]');
 58 |     assert.deepEqual(t2.toRegExp(), /fooba[rz]/);
 59 |   });
 60 | 
 61 |   it('should work with optional groups', function () {
 62 |     assert.deepEqual(regexgen(['a', 'abc']), /a(?:bc)?/);
 63 |   });
 64 | 
 65 |   it('should wrap optional character classes in parens if they contain non-BMP codepoints', function () {
 66 |     assert.deepEqual(regexgen(['\u261D', '\u261D\u{1f3fb}', '\u261D\u{1f3fc}']), /\u261D(?:\uD83C[\uDFFB\uDFFC])?/);
 67 |   });
 68 | 
 69 |   it('should wrap optional literals in parens if they contain more than one code unit', function () {
 70 |     assert.deepEqual(regexgen(['\u261D', '\u261D\u{1f3fb}']), /\u261D(?:\uD83C\uDFFB)?/);
 71 |   });
 72 | 
 73 |   it('should retain non-BMP codepoints when the Unicode flag is passed', function () {
 74 |     assert.deepEqual(regexgen(['\u261D', '\u261D\u{1f3fb}'], 'u'), /\u261D\u{1F3FB}?/u);
 75 |     assert.deepEqual(
 76 |       regexgen(['\u{1F3F4}', '\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}', '\u{1F3F4}\u{E0067}\u{E0062}\u{E0077}\u{E006C}\u{E0073}', '\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}'], 'u'),
 77 |       /\u{1F3F4}(?:\u{E0067}\u{E0062}(?:\u{E0073}\u{E0063}\u{E0074}|\u{E0077}\u{E006C}\u{E0073}|\u{E0065}\u{E006E}\u{E0067}))?/u
 78 |     );
 79 |   });
 80 | 
 81 |   it('should handle non-BMP codepoint ranges correctly', function() {
 82 |     assert.deepEqual(
 83 |       regexgen(['\u{1F311}', '\u{1F312}', '\u{1F313}', '\u{1F314}', '\u{1F315}', '\u{1F316}', '\u{1F317}', '\u{1F318}'], 'u'),
 84 |       /[\u{1F311}-\u{1F318}]/u
 85 |     );
 86 |   });
 87 | 
 88 |   it('should correctly extract common prefix from multiple alternations', function () {
 89 |     assert.deepEqual(regexgen(['abjv', 'abxcjv', 'abydjv', 'abzejv']), /ab(?:ze|yd|xc)?jv/);
 90 |   });
 91 | 
 92 |   it('should sort alternation options correctly (#10)', function () {
 93 |     let s = '\uD83C\uDFCA\uD83C\uDFFD\u200D\u2640\uFE0F';
 94 |     let r = regexgen([
 95 |       '\uD83C\uDDF7\uD83C\uDDFC',
 96 |       '\uD83C\uDDF8\uD83C\uDDE6',
 97 |       '\uD83C\uDFCA\uD83C\uDFFD',
 98 |       s
 99 |     ]);
100 | 
101 |     assert.deepEqual(s.match(r)[0], s);
102 |   });
103 | 
104 |   it('should sort non-BMP alternation options correctly', function () {
105 |     let r = regexgen(
106 |       [
107 |         // shrug emoji
108 |         '\u{1F937}\u200D',
109 |         // shrug emoji with fitzpatrick modifiers
110 |         '\u{1F937}\u{1F3FB}\u200D',
111 |         '\u{1F937}\u{1F3FC}\u200D',
112 |         '\u{1F937}\u{1F3FD}\u200D',
113 |         '\u{1F937}\u{1F3FE}\u200D',
114 |         '\u{1F937}\u{1F3FF}\u200D',
115 |         // shrug emoji with gender modifier
116 |         '\u{1F937}\u200D\u2640\uFE0F',
117 |         // shrug emoji with gender and fitzpatrick modifiers
118 |         '\u{1F937}\u{1F3FB}\u200D\u2640\uFE0F',
119 |         '\u{1F937}\u{1F3FC}\u200D\u2640\uFE0F',
120 |         '\u{1F937}\u{1F3FD}\u200D\u2640\uFE0F',
121 |         '\u{1F937}\u{1F3FE}\u200D\u2640\uFE0F',
122 |         '\u{1F937}\u{1F3FF}\u200D\u2640\uFE0F'
123 |       ],
124 |       'u'
125 |     );
126 | 
127 |     assert.deepEqual(r, /\u{1F937}[\u{1F3FB}-\u{1F3FF}]?\u200D(?:\u2640\uFE0F)?/u);
128 |     assert.deepEqual('\u{1F937}\u{1F3FB}\u200D\u2640\uFE0F'.match(r)[0], '\u{1F937}\u{1F3FB}\u200D\u2640\uFE0F');
129 |   });
130 | 
131 |   it('should sort alternations of alternations correctly', function () {
132 |     let r = regexgen(['aef', 'aghz', 'ayz', 'abcdz', 'abcd']);
133 |     let s = 'abcdz';
134 | 
135 |     assert.deepEqual(s.match(r)[0], s);
136 |     assert.deepEqual(r, /a(?:(?:bcd|gh|y)z|bcd|ef)/);
137 |   });
138 | });
139 | 


--------------------------------------------------------------------------------