├── .github └── workflows │ └── nodejs.yml ├── .gitignore ├── README.md ├── __tests__ ├── parser.test.js └── regex.test.js ├── package-lock.json ├── package.json └── src ├── index.js ├── nfa.js ├── parser.js ├── parser2.js └── regex.js /.github/workflows/nodejs.yml: -------------------------------------------------------------------------------- 1 | name: Node CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | strategy: 11 | matrix: 12 | node-version: [18.x, 20.x, 21.x] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Use Node.js ${{ matrix.node-version }} 17 | uses: actions/setup-node@v1 18 | with: 19 | node-version: ${{ matrix.node-version }} 20 | - name: npm install, build, and test 21 | run: | 22 | npm ci 23 | npm test 24 | env: 25 | CI: true 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regexjs 2 | 3 | [![Build Status](https://github.com/deniskyashif/regexjs/workflows/Node%20CI/badge.svg)](https://github.com/deniskyashif/regexjs/actions?query=workflow%3A%22Node+CI%22) 4 | 5 | A regular expression engine implementation in JavaScript. It supports concatenation, union (|), zero-or-more (\*), one-or-more (+), and zero-or-one (?) operations as well as grouping. It follows Ken Thompson's algorithm for constructing an NFA from a regular expression. 6 | 7 | Check out my [blog post](https://deniskyashif.com/2019/02/17/implementing-a-regular-expression-engine/) for the complete implementation details. 
8 | 9 | ### Example 10 | ```javascript 11 | const { createMatcher } = require('./regex'); 12 | const match = createMatcher('(a|b)*c'); 13 | 14 | match('ac'); // true 15 | match('abc'); // true 16 | match('aabababbbc'); // true 17 | match('aaaab'); // false 18 | ``` 19 | 20 | ### Try It 21 | ``` 22 | git clone https://github.com/deniskyashif/regexjs.git 23 | cd regexjs 24 | npm i 25 | npm start 26 | ``` 27 | 28 | ### Run the tests 29 | `npm t` 30 | -------------------------------------------------------------------------------- /__tests__/parser.test.js: -------------------------------------------------------------------------------- 1 | const { 2 | insertExplicitConcatOperator, 3 | toPostfix 4 | } = require('../src/parser'); 5 | 6 | describe('insertExplicitConcatSymbol tests', () => { 7 | test('call with "" should return ""', () => { 8 | expect(insertExplicitConcatOperator('')).toEqual(''); 9 | }); 10 | 11 | test('call with "a*" should return "a*"', () => { 12 | expect(insertExplicitConcatOperator('a*')).toEqual('a*'); 13 | }); 14 | 15 | test('call with "a|b" should return "a|b"', () => { 16 | expect(insertExplicitConcatOperator('a|b')).toEqual('a|b'); 17 | }); 18 | 19 | test('call with "ab" should return "a.b"', () => { 20 | expect(insertExplicitConcatOperator('ab')).toEqual('a.b'); 21 | }); 22 | 23 | test('call with "abcabc" should return "a.b.c.a.b.c"', () => { 24 | expect(insertExplicitConcatOperator('abcabc')).toEqual('a.b.c.a.b.c'); 25 | }); 26 | 27 | test('call with "ab*" should return "a.b*"', () => { 28 | expect(insertExplicitConcatOperator('ab*')).toEqual('a.b*'); 29 | }); 30 | 31 | test('call with "ab*" should return "a*b*"', () => { 32 | expect(insertExplicitConcatOperator('a*b*')).toEqual('a*.b*'); 33 | }); 34 | 35 | test('call with "ab*c" should return "a.b*.c"', () => { 36 | expect(insertExplicitConcatOperator('ab*c')).toEqual('a.b*.c'); 37 | }); 38 | 39 | test('call with "ab*(cdd)" should return "a.b*.(c.d.d)"', () => { 40 | 
expect(insertExplicitConcatOperator('ab*(cdd)')).toEqual('a.b*.(c.d.d)'); 41 | }); 42 | 43 | test('call with "(a|b)*c" should return "(a|b)*.c"', () => { 44 | expect(insertExplicitConcatOperator('(a|b)*c')).toEqual('(a|b)*.c'); 45 | }); 46 | }); 47 | 48 | describe('toPostfix tests', () => { 49 | test('call with "" should return ""', () => { 50 | expect(toPostfix('')).toEqual(''); 51 | }); 52 | 53 | test('call with "a" should return "a"', () => { 54 | expect(toPostfix('a')).toEqual('a'); 55 | }); 56 | 57 | test('call with "a.b" should return "ab."', () => { 58 | expect(toPostfix('a.b')).toEqual('ab.'); 59 | }); 60 | 61 | test('call with "a*" should return "a*"', () => { 62 | expect(toPostfix('a*')).toEqual('a*'); 63 | }); 64 | 65 | test('call with "a*.b" should return "a*b."', () => { 66 | expect(toPostfix('a*.b')).toEqual('a*b.'); 67 | }); 68 | 69 | test('call with "a.b|c.d" should return "ab.cd.|"', () => { 70 | expect(toPostfix('a.b|c.d')).toEqual('ab.cd.|'); 71 | }); 72 | 73 | test('call with "a|b*" should return "ab*|"', () => { 74 | expect(toPostfix('a|b*')).toEqual('ab*|'); 75 | }); 76 | 77 | test('call with "a.(b|c)*.d" should return "abc|*.d."', () => { 78 | expect(toPostfix('a.(b|c)*.d')).toEqual('abc|*.d.'); 79 | }); 80 | 81 | test('call with ((a.b)) should return ab.', () => { 82 | expect(toPostfix('((a.b))')).toEqual('ab.'); 83 | }); 84 | 85 | test('call with ((a.b)*) should return ab.*', () => { 86 | expect(toPostfix('((a.b)*)')).toEqual('ab.*'); 87 | }); 88 | 89 | test('call with "(a|b)*cd" should return "ab|*c.d."', () => { 90 | expect(toPostfix('(a|b)*.c.d')).toEqual('ab|*c.d.'); 91 | }); 92 | }); 93 | 94 | -------------------------------------------------------------------------------- /__tests__/regex.test.js: -------------------------------------------------------------------------------- 1 | const { createMatcher } = require('../src/regex'); 2 | 3 | describe('createMatcher tests', () => { 4 | test('from empty string should recognize only empty 
string', () => { 5 | const match = createMatcher(''); 6 | expect(match('')).toBeTruthy(); 7 | expect(match('a')).toBeFalsy(); 8 | expect(match(' ab')).toBeFalsy(); 9 | }); 10 | 11 | test('from a should recognize strings of arbitrary number of a', () => { 12 | const match = createMatcher('a'); 13 | expect(match('')).toBeFalsy(); 14 | expect(match('a')).toBeTruthy(); 15 | expect(match('aaa')).toBeFalsy(); 16 | }); 17 | 18 | test('from a* should recognize strings of arbitrary number of a\'s', () => { 19 | const match = createMatcher('a*'); 20 | expect(match('')).toBeTruthy(); 21 | expect(match('aaaa')).toBeTruthy(); 22 | expect(match('aa')).toBeTruthy(); 23 | expect(match('aba')).toBeFalsy(); 24 | }); 25 | 26 | test('from a? should recognize strings of exactly one or zero number of a\'s', () => { 27 | const match = createMatcher('a?'); 28 | expect(match('')).toBeTruthy(); 29 | expect(match('a')).toBeTruthy(); 30 | expect(match('aa')).toBeFalsy(); 31 | expect(match('aaa')).toBeFalsy(); 32 | expect(match('aba')).toBeFalsy(); 33 | expect(match('b')).toBeFalsy(); 34 | }); 35 | 36 | test('from a+ should recognize strings of one or more number of a\'s', () => { 37 | const match = createMatcher('a+'); 38 | expect(match('')).toBeFalsy(); 39 | expect(match('a')).toBeTruthy(); 40 | expect(match('aa')).toBeTruthy(); 41 | expect(match('aaa')).toBeTruthy(); 42 | expect(match('aba')).toBeFalsy(); 43 | expect(match('b')).toBeFalsy(); 44 | }); 45 | 46 | test('from a*b should recognize strings of arbitrary number of a\'s ending with b', () => { 47 | const match = createMatcher('a*b'); 48 | expect(match('')).toBeFalsy(); 49 | expect(match('aaaab')).toBeTruthy(); 50 | expect(match('aab')).toBeTruthy(); 51 | expect(match('b')).toBeTruthy(); 52 | expect(match('aba')).toBeFalsy(); 53 | }); 54 | 55 | // regex for all binary numbers divisible by 3 56 | test('from "(0|(1(01*(00)*0)*1)*)*" should recognize its language', () => { 57 | const match = createMatcher('(0|(1(01*(00)*0)*1)*)*'); 58 | 
expect(match('')).toBeTruthy(); 59 | expect(match('0')).toBeTruthy(); 60 | expect(match('00')).toBeTruthy(); 61 | expect(match('11')).toBeTruthy(); 62 | expect(match('000')).toBeTruthy(); 63 | expect(match('011')).toBeTruthy(); 64 | expect(match('110')).toBeTruthy(); 65 | expect(match('0000')).toBeTruthy(); 66 | expect(match('0011')).toBeTruthy(); 67 | }); 68 | 69 | test('from "(a|b)*c" should recognize strings with arbitrary number of a\'s and b\'s ending with c', () => { 70 | const match = createMatcher('(a|b)*c'); 71 | expect(match('c')).toBeTruthy(); 72 | expect(match('ac')).toBeTruthy(); 73 | expect(match('ababc')).toBeTruthy(); 74 | expect(match('bbbc')).toBeTruthy(); 75 | expect(match('aaaaaaac')).toBeTruthy(); 76 | expect(match('ac')).toBeTruthy(); 77 | expect(match('bac')).toBeTruthy(); 78 | expect(match('abbbbc')).toBeTruthy(); 79 | expect(match('cc')).toBeFalsy(); 80 | expect(match('a')).toBeFalsy(); 81 | expect(match('b')).toBeFalsy(); 82 | expect(match('ababab')).toBeFalsy(); 83 | }); 84 | 85 | test('from "abc|def" should recognize strings of abc or def', () => { 86 | const match = createMatcher('abc|def'); 87 | expect(match('abc')).toBeTruthy(); 88 | expect(match('def')).toBeTruthy(); 89 | expect(match('ab')).toBeFalsy(); 90 | expect(match('ef')).toBeFalsy(); 91 | }); 92 | 93 | test('from "a(b*|c)" should recognize strings starting with a followed by b\'s or a single c', () => { 94 | const match = createMatcher('a(b*|c)'); 95 | expect(match('ac')).toBeTruthy(); 96 | expect(match('abbbb')).toBeTruthy(); 97 | expect(match('ab')).toBeTruthy(); 98 | expect(match('a')).toBeTruthy(); 99 | expect(match('abc')).toBeFalsy(); 100 | expect(match('acc')).toBeFalsy(); 101 | expect(match('')).toBeFalsy(); 102 | }); 103 | }); 104 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "regexjs", 3 | "version": "1.0.0", 4 | 
"description": "Finite State Machines", 5 | "main": "index.js", 6 | "engines": { 7 | "node": ">=10.11.0" 8 | }, 9 | "scripts": { 10 | "start": "node src/index", 11 | "test": "jest" 12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "git+https://github.com/deniskyashif/regexjs.git" 16 | }, 17 | "keywords": [ 18 | "automata", 19 | "finite state machines", 20 | "regex" 21 | ], 22 | "author": "deniskyashif", 23 | "license": "MIT", 24 | "bugs": { 25 | "url": "https://github.com/deniskyashif/regexjs/issues" 26 | }, 27 | "homepage": "https://github.com/deniskyashif/regexjs#readme", 28 | "devDependencies": { 29 | "jest": "^24.0.0" 30 | }, 31 | "dependencies": { 32 | "antlr4": "^4.7.2" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | const { createMatcher } = require('./regex'); 2 | const readline = require('readline'); 3 | 4 | const match = createMatcher('(a|b)*c'); 5 | 6 | const rl = readline.createInterface({ 7 | input: process.stdin, 8 | output: process.stdout 9 | }); 10 | 11 | rl.question(`Pattern: `, (pattern) => { 12 | const match = createMatcher(pattern); 13 | 14 | console.log('Check words: '); 15 | 16 | rl.on('line', (input) => { 17 | console.log(`Match? ${match(input)}`); 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /src/nfa.js: -------------------------------------------------------------------------------- 1 | /* 2 | Thompson NFA Construction and Search. 3 | */ 4 | 5 | /* 6 | A state in Thompson's NFA can either have 7 | - a single symbol transition to a state 8 | or 9 | - up to two epsilon transitions to another states 10 | but not both. 
/**
 * Creates a fresh NFA state.
 *
 * In Thompson's construction a state carries either a single symbol
 * transition or up to two epsilon transitions, never both.
 *
 * @param {boolean} isEnd - whether the state is accepting.
 * @returns {{isEnd: boolean, transition: Object, epsilonTransitions: Object[]}}
 */
function createState(isEnd) {
    const state = { isEnd, transition: {}, epsilonTransitions: [] };
    return state;
}

/**
 * Appends an epsilon (empty-string) edge from `from` to `to`.
 */
function addEpsilonTransition(from, to) {
    const { epsilonTransitions } = from;
    epsilonTransitions.push(to);
}

/**
 * Sets the symbol edge of `from`. A state keeps one target per symbol, so a
 * second call with the same symbol overwrites the first.
 */
function addTransition(from, to, symbol) {
    const { transition } = from;
    transition[symbol] = to;
}

/**
 * Builds the unconnected start (non-accepting) and end (accepting) states
 * that every construction below wires together.
 */
function createEndpoints() {
    return { start: createState(false), end: createState(true) };
}

/**
 * NFA that accepts only the empty string.
 */
function fromEpsilon() {
    const nfa = createEndpoints();
    addEpsilonTransition(nfa.start, nfa.end);

    return nfa;
}

/**
 * NFA that accepts only the one-symbol string `symbol`.
 */
function fromSymbol(symbol) {
    const nfa = createEndpoints();
    addTransition(nfa.start, nfa.end, symbol);

    return nfa;
}

/**
 * Concatenation: `first` followed by `second`.
 * The old end of `first` stops accepting and is linked to the start of `second`.
 */
function concat(first, second) {
    const { start, end: joint } = first;
    joint.isEnd = false;
    addEpsilonTransition(joint, second.start);

    return { start, end: second.end };
}

/**
 * Union: either `first` or `second`.
 * A new start forks into both operands; both old ends stop accepting and
 * join into a new accepting end.
 */
function union(first, second) {
    const { start, end } = createEndpoints();

    for (const branch of [first, second]) {
        addEpsilonTransition(start, branch.start);
        addEpsilonTransition(branch.end, end);
        branch.end.isEnd = false;
    }

    return { start, end };
}

/**
 * Shared wiring for the three repetition operators.
 *
 * @param {{start: Object, end: Object}} nfa - operand; its end stops accepting.
 * @param {boolean} canSkip - add start -> end so the operand may match zero times.
 * @param {boolean} canRepeat - add operand end -> operand start so it may match again.
 */
function wrapWithRepetition(nfa, canSkip, canRepeat) {
    const { start, end } = createEndpoints();

    if (canSkip) {
        addEpsilonTransition(start, end);
    }
    addEpsilonTransition(start, nfa.start);

    addEpsilonTransition(nfa.end, end);
    if (canRepeat) {
        addEpsilonTransition(nfa.end, nfa.start);
    }
    nfa.end.isEnd = false;

    return { start, end };
}

/**
 * Closure (Kleene star): zero or more repetitions of `nfa`.
 */
function closure(nfa) {
    return wrapWithRepetition(nfa, true, true);
}

/**
 * Zero or one occurrence of `nfa`.
 */
function zeroOrOne(nfa) {
    return wrapWithRepetition(nfa, true, false);
}

/**
 * One or more repetitions of `nfa`.
 */
function oneOrMore(nfa) {
    return wrapWithRepetition(nfa, false, true);
}

/*
  Converts a postfix regular expression into a Thompson NFA.
*/
/**
 * Converts a postfix regular expression into a Thompson NFA.
 *
 * Tokens are read one code point at a time. '*', '?' and '+' are unary
 * operators, '|' and '.' (explicit concatenation) are binary operators, and
 * every other token is a literal symbol.
 *
 * @param {string} postfixExp - expression in postfix form, e.g. "ab.c|".
 * @returns {{start: Object, end: Object}} the NFA; the empty expression
 *          yields an NFA that accepts only the empty string.
 * @throws {Error} if an operator has too few operands or operands are left
 *         over once every token has been consumed.
 */
function toNFA(postfixExp) {
    if (postfixExp === '') {
        return fromEpsilon();
    }

    const stack = [];

    // Fail with a descriptive error instead of handing `undefined` to a combinator.
    const popOperand = (operator) => {
        if (stack.length === 0) {
            throw new Error(`Malformed postfix expression: operator '${operator}' is missing an operand`);
        }
        return stack.pop();
    };

    for (const token of postfixExp) {
        if (token === '*') {
            stack.push(closure(popOperand(token)));
        } else if (token === '?') {
            stack.push(zeroOrOne(popOperand(token)));
        } else if (token === '+') {
            stack.push(oneOrMore(popOperand(token)));
        } else if (token === '|') {
            // The right operand is on top of the stack.
            const right = popOperand(token);
            const left = popOperand(token);
            stack.push(union(left, right));
        } else if (token === '.') {
            const right = popOperand(token);
            const left = popOperand(token);
            stack.push(concat(left, right));
        } else {
            stack.push(fromSymbol(token));
        }
    }

    // A well-formed expression reduces to exactly one NFA. Anything else means
    // an operator is missing (e.g. "ab"), and returning only the top of the
    // stack would silently drop part of the pattern.
    if (stack.length !== 1) {
        throw new Error(`Malformed postfix expression: ${stack.length} operands left without an operator`);
    }

    return stack.pop();
}

/*
  Regex to NFA construction using a parse tree.
*/
const { toParseTree } = require('./parser2');

/**
 * Recursively translates a parse tree produced by `toParseTree` into an NFA.
 * The production used at each node is recognised by the node's label and its
 * number of children.
 *
 * @param {{label: string, children: Object[]}} root - parse tree node.
 * @returns {{start: Object, end: Object}}
 * @throws {Error} if the node label is not one of Expr, Term, Factor, Atom, Char.
 */
function toNFAfromParseTree(root) {
    const { label, children } = root;

    switch (label) {
    case 'Expr': {
        const term = toNFAfromParseTree(children[0]);
        if (children.length === 3) { // Expr -> Term '|' Expr
            return union(term, toNFAfromParseTree(children[2]));
        }
        return term; // Expr -> Term
    }
    case 'Term': {
        const factor = toNFAfromParseTree(children[0]);
        if (children.length === 2) { // Term -> Factor Term
            return concat(factor, toNFAfromParseTree(children[1]));
        }
        return factor; // Term -> Factor
    }
    case 'Factor': {
        const atom = toNFAfromParseTree(children[0]);
        if (children.length === 2) { // Factor -> Atom MetaChar
            const meta = children[1].label;
            if (meta === '*') {
                return closure(atom);
            }
            if (meta === '+') {
                return oneOrMore(atom);
            }
            if (meta === '?') {
                return zeroOrOne(atom);
            }
        }
        return atom; // Factor -> Atom
    }
    case 'Atom':
        if (children.length === 3) { // Atom -> '(' Expr ')'
            return toNFAfromParseTree(children[1]);
        }
        return toNFAfromParseTree(children[0]); // Atom -> Char
    case 'Char':
        if (children.length === 2) { // Char -> '\' AnyChar
            return fromSymbol(children[1].label);
        }
        return fromSymbol(children[0].label); // Char -> AnyCharExceptMeta
    default:
        throw new Error('Unrecognized node label ' + label);
    }
}

/**
 * Builds an NFA straight from an infix regular expression.
 *
 * @param {string} infixExp - pattern as the user writes it, e.g. "(a|b)*c".
 * @returns {{start: Object, end: Object}}
 */
function toNFAFromInfixExp(infixExp) {
    // The empty pattern is handled here and never handed to the parser.
    if (infixExp === '') {
        return fromEpsilon();
    }

    return toNFAfromParseTree(toParseTree(infixExp));
}

/*
  Process a string through an NFA by recursively (depth-first) traversing all the possible paths until finding a matching one.

  The NFA has N states, from each state it can go to at most N possible states, yet there might be at most 2^N possible paths,
  therefore, worst case it'll end up going through all of them until it finds a match (or not), resulting in very slow runtimes.
*/
/**
 * Matches `input` against an NFA by depth-first backtracking: from `state` it
 * either consumes `input[position]` through the symbol transition or follows
 * the epsilon transitions, until some path ends in an accepting state with
 * all input consumed. In the worst case this tries every path through the
 * automaton, so it can be very slow.
 *
 * @param {Object} state - state to continue from.
 * @param {Object[]} visited - states already expanded at the current input
 *        position; prevents looping forever on epsilon cycles. Mutated.
 * @param {string} input - word being matched.
 * @param {number} position - index of the next unread symbol of `input`.
 * @returns {boolean} always a real boolean (the end-of-input dead end used to
 *          fall through and return `undefined`).
 */
function recursiveBacktrackingSearch(state, visited, input, position) {
    if (visited.includes(state)) {
        return false;
    }

    visited.push(state);

    const viaEpsilon = () => state.epsilonTransitions.some(
        target => recursiveBacktrackingSearch(target, visited, input, position)
    );

    if (position === input.length) {
        return state.isEnd ? true : viaEpsilon();
    }

    const nextState = state.transition[input[position]];

    if (nextState) {
        // Consuming a symbol moves to a new input position, so the cycle
        // guard starts over with an empty list.
        return recursiveBacktrackingSearch(nextState, [], input, position + 1);
    }

    return viaEpsilon();
}

/**
 * Follows the epsilon transitions reachable from `state` and collects every
 * state without epsilon transitions that it reaches (`state` itself when it
 * has none).
 *
 * @param {Object} state - state to expand.
 * @param {Object[]} nextStates - output list, appended to in place. A state
 *        is never added twice, even across calls sharing the same list;
 *        without this the list doubles on every input symbol for patterns
 *        such as (a|a)*, where several states lead to the same closure.
 * @param {Object[]} visited - epsilon targets already expanded. Mutated.
 */
function addNextState(state, nextStates, visited) {
    if (state.epsilonTransitions.length === 0) {
        if (!nextStates.includes(state)) {
            nextStates.push(state);
        }
        return;
    }

    for (const target of state.epsilonTransitions) {
        if (!visited.includes(target)) {
            visited.push(target);
            addNextState(target, nextStates, visited);
        }
    }
}

/*
  Process a string through an NFA. For each input symbol it transitions into multiple states at the same time.
  The string is matched if, after reading the last symbol, it has transitioned into at least one end state.

  For an NFA with N states it can be in at most N states at a time. This algorithm finds a match by processing the input word once.
*/
286 | */ 287 | function search(nfa, word) { 288 | let currentStates = []; 289 | /* The initial set of current states is either the start state or 290 | the set of states reachable by epsilon transitions from the start state */ 291 | addNextState(nfa.start, currentStates, []); 292 | 293 | for (const symbol of word) { 294 | const nextStates = []; 295 | 296 | for (const state of currentStates) { 297 | const nextState = state.transition[symbol]; 298 | if (nextState) { 299 | addNextState(nextState, nextStates, []); 300 | } 301 | } 302 | 303 | currentStates = nextStates; 304 | } 305 | 306 | return currentStates.find(s => s.isEnd) ? true : false; 307 | } 308 | 309 | function recognize(nfa, word) { 310 | // return recursiveBacktrackingSearch(nfa.start, [], word, 0); 311 | return search(nfa, word); 312 | } 313 | 314 | module.exports = { 315 | toNFA, 316 | toNFAFromInfixExp, 317 | recognize 318 | }; 319 | -------------------------------------------------------------------------------- /src/parser.js: -------------------------------------------------------------------------------- 1 | function insertExplicitConcatOperator(exp) { 2 | let output = ''; 3 | 4 | for (let i = 0; i < exp.length; i++) { 5 | const token = exp[i]; 6 | output += token; 7 | 8 | if (token === '(' || token === '|') { 9 | continue; 10 | } 11 | 12 | if (i < exp.length - 1) { 13 | const lookahead = exp[i + 1]; 14 | 15 | if (lookahead === '*' || lookahead === '?' || lookahead === '+' || lookahead === '|' || lookahead === ')') { 16 | continue; 17 | } 18 | 19 | output += '.'; 20 | } 21 | } 22 | 23 | return output; 24 | }; 25 | 26 | function peek(stack) { 27 | return stack.length && stack[stack.length - 1]; 28 | } 29 | 30 | const operatorPrecedence = { 31 | '|': 0, 32 | '.': 1, 33 | '?': 2, 34 | '*': 2, 35 | '+': 2 36 | }; 37 | 38 | function toPostfix(exp) { 39 | let output = ''; 40 | const operatorStack = []; 41 | 42 | for (const token of exp) { 43 | if (token === '.' 
|| token === '|' || token === '*' || token === '?' || token === '+') { 44 | while (operatorStack.length && peek(operatorStack) !== '(' 45 | && operatorPrecedence[peek(operatorStack)] >= operatorPrecedence[token]) { 46 | output += operatorStack.pop(); 47 | } 48 | 49 | operatorStack.push(token); 50 | } else if (token === '(' || token === ')') { 51 | if (token === '(') { 52 | operatorStack.push(token); 53 | } else { 54 | while (peek(operatorStack) !== '(') { 55 | output += operatorStack.pop(); 56 | } 57 | operatorStack.pop(); 58 | } 59 | } else { 60 | output += token; 61 | } 62 | } 63 | 64 | while (operatorStack.length) { 65 | output += operatorStack.pop(); 66 | } 67 | 68 | return output; 69 | }; 70 | 71 | module.exports = { 72 | insertExplicitConcatOperator, 73 | toPostfix 74 | }; 75 | -------------------------------------------------------------------------------- /src/parser2.js: -------------------------------------------------------------------------------- 1 | /* 2 | Recursive descent parser for regular expressions. Implements the following grammar: 3 | 4 | Expr -> Term | Term '|' Expr 5 | Term -> Factor | Factor Term 6 | Factor -> Atom | Atom MetaChar 7 | Atom -> Char | '(' Expr ')' 8 | Char -> AnyCharExceptMeta | '\' AnyChar 9 | MetaChar -> '?' 
/* MetaChar -> '?' | '*' | '+' */

/**
 * Parse tree node.
 *
 * @param {string} label - production name (Expr, Term, Factor, Atom, Char)
 *        or, for leaves, the literal character.
 * @param {TreeNode[]} children - child nodes; defaults to an empty list.
 */
function TreeNode(label, children) {
    this.label = label;
    this.children = children || [];
}

// Parser state shared by the functions below; reset by toParseTree().
// The pattern is kept as an array of code points so that a character outside
// the Basic Multilingual Plane is one symbol, matching how the NFA search
// iterates over the input word.
let pattern = [];
let pos = 0;

const peek = () => pattern[pos];
const hasMoreChars = () => pos < pattern.length;
const isMetaChar = ch => ch === '*' || ch === '+' || ch === '?';

/**
 * Consumes the expected character `ch`.
 *
 * @throws {Error} if the next character is not `ch`; the message names both
 *         what was expected and what was actually found.
 */
function match(ch) {
    if (peek() !== ch) {
        const found = hasMoreChars() ? `'${peek()}'` : 'end of pattern';
        throw new Error(`Expected '${ch}' at position ${pos} but found ${found}`);
    }
    pos++;
}

/**
 * Consumes and returns the next character.
 *
 * @throws {Error} at the end of the pattern. Without this check a pattern
 *         such as "a|" produced a Char node labelled `undefined`.
 */
function next() {
    if (!hasMoreChars()) {
        throw new Error('Unexpected end of pattern');
    }

    const ch = peek();
    pos++;

    return ch;
}

// Expr -> Term | Term '|' Expr
function expr() {
    const trm = term();

    if (hasMoreChars() && peek() === '|') {
        match('|');
        const exp = expr();
        return new TreeNode('Expr', [trm, new TreeNode('|'), exp]);
    }

    return new TreeNode('Expr', [trm]);
}

// Term -> Factor | Factor Term
function term() {
    const factr = factor();

    // ')' and '|' end the current term; anything else starts another factor.
    if (hasMoreChars() && peek() !== ')' && peek() !== '|') {
        const trm = term();
        return new TreeNode('Term', [factr, trm]);
    }

    return new TreeNode('Term', [factr]);
}

// Factor -> Atom | Atom MetaChar
function factor() {
    const atm = atom();

    if (hasMoreChars() && isMetaChar(peek())) {
        const meta = next();
        return new TreeNode('Factor', [atm, new TreeNode(meta)]);
    }

    return new TreeNode('Factor', [atm]);
}

// Atom -> Char | '(' Expr ')'
function atom() {
    if (peek() === '(') {
        match('(');
        const exp = expr();
        match(')');
        return new TreeNode('Atom', [new TreeNode('('), exp, new TreeNode(')')]);
    }

    const ch = char();
    return new TreeNode('Atom', [ch]);
}

// Char -> AnyCharExceptMeta | '\' AnyChar
function char() {
    if (isMetaChar(peek())) {
        throw new Error(`Unexpected meta char ${peek()}`);
    }

    if (peek() === '\\') {
        match('\\');
        return new TreeNode('Char', [new TreeNode('\\'), new TreeNode(next())]);
    }

    return new TreeNode('Char', [new TreeNode(next())]);
}

/**
 * Parses a regular expression into a parse tree following the grammar
 * Expr / Term / Factor / Atom / Char / MetaChar.
 *
 * Not reentrant: the parser keeps its position in module-level variables.
 *
 * @param {string} regex - non-empty pattern (callers handle '' themselves).
 * @returns {TreeNode} root 'Expr' node.
 * @throws {Error} on a misplaced meta character, a missing ')', a pattern
 *         that ends where a character is required, or input left over after
 *         the expression (e.g. the ')' in "a)b", which used to be ignored
 *         together with everything after it).
 */
function toParseTree(regex) {
    pattern = Array.from(regex);
    pos = 0;

    const tree = expr();

    if (hasMoreChars()) {
        throw new Error(`Unexpected symbol '${peek()}' at position ${pos}`);
    }

    return tree;
}

module.exports = { toParseTree };

// --------------------------------------------------------------------------------
// /src/regex.js:
// --------------------------------------------------------------------------------
const { insertExplicitConcatOperator, toPostfix } = require('./parser');
const { toNFA, toNFAFromInfixExp, recognize } = require('./nfa');

/**
 * Compiles a regular expression into a matcher function.
 *
 * The NFA is built from a parse tree, which needs no explicit concatenation
 * operator. The stack-based alternative is
 * `toNFA(toPostfix(insertExplicitConcatOperator(exp)))`.
 *
 * @param {string} exp - pattern, e.g. "(a|b)*c".
 * @returns {function(string): boolean} tells whether a whole word matches.
 */
function createMatcher(exp) {
    const nfa = toNFAFromInfixExp(exp);

    return word => recognize(nfa, word);
}

module.exports = { createMatcher };
// --------------------------------------------------------------------------------