├── LICENSE.txt
├── README.md
├── algorithms.js
├── assert.js
├── check.js
├── generate.js
├── index.js
├── package.json
├── parser.js
├── printers.js
├── test.js
└── types.js


/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Bakkot
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | CFGrammar-Tool
 2 | ==============
 3 | 
 4 | A JavaScript library for working with [context-free grammars](http://en.wikipedia.org/wiki/Context-free_grammar). It's also a node.js module (`npm install cfgrammar-tool`).
 5 | 
 6 | Check out the [demo](http://bakkot.github.io/cfgrammar-tool/).
 7 | 
 8 | Features
 9 | --------
10 | 
11 | * Parsing. The implementation is [Earley's algorithm](http://en.wikipedia.org/wiki/Earley_parser), so arbitrary CFGs are supported without transformation. Optionally keep track of two parses or all parses, so as to catch ambiguity. Note that tracking all parses can take exponential or infinite time (though the latter possibility can be detected in advance).
12 | 
13 | * Generation. Given a grammar, generate a string of length *n* in its language. All such strings are generated with non-zero probability, and if the grammar is unambiguous and does not contain nullable nonterminals then strings are generated uniformly at random. Requires *n*^2 preprocessing time, then linear time for each string.
14 |  - Useful for automatic testing when QuickCheck and its ilk aren't generating sufficiently structured data. For example, `test.js` contains a CFG for CFGs, which was used to automatically test this very application. 
15 | 
16 | * Diagnostics and manipulation. Find/remove unreachable symbols, symbols which do not generate any string, nullable symbols, duplicate rules, unit productions (A -> B), etc.
17 | 
18 | 
19 | Example
20 | -------
21 | 
22 | ```javascript
23 | var cfgtool = require('cfgrammar-tool');
24 | var types = cfgtool.types;
25 | var parser = cfgtool.parser;
26 | var generatorFactory = cfgtool.generator;
27 | 
28 | var Grammar = types.Grammar;
29 | var Rule = types.Rule;
30 | var T = types.T;
31 | var NT = types.NT;
32 | var exprGrammar = Grammar([
33 |   Rule('E', [NT('E'), T('+'), NT('T')]),
34 |   Rule('E', [NT('T')]),
35 |   Rule('T', [NT('T'), T('*'), NT('F')]),
36 |   Rule('T', [NT('F')]),
37 |   Rule('F', [T('('), NT('E'), T(')')]),
38 |   Rule('F', [T('n')])
39 | ]);
40 | 
41 | parser.parse(exprGrammar, 'n*(n+n)').length > 0; // true
42 | parser.parse(exprGrammar, 'n(n+n)').length > 0; // false
43 | 
44 | var generator = generatorFactory(exprGrammar);
45 | generator(21); // something like 'n*((n+(n)*n+n+n*n))*n'
46 | ```
47 | 
48 | TODO
49 | ----
50 | 
51 | * General code cleanup; this was mostly written in a couple of marathon sessions to try to get a tool based on it up, and the haste shows. Strict mode and linting, too.
52 | 
53 | * Normal forms: put a grammar in [Chomsky normal form](http://en.wikipedia.org/wiki/Chomsky_normal_form), [Greibach normal form](http://en.wikipedia.org/wiki/Greibach_normal_form), or others.
54 | 
55 | * Import and export: parse and produce [BNF](http://en.wikipedia.org/wiki/Backus%E2%80%93Naur_Form) and other representations of grammars.
56 | 
57 | * Automatic tokenization. Currently all tokens are implicitly single-character strings, at least on the parsing end, which is often not what you want.
58 | 
59 | * ~~[Port to a language with a proper type system](https://github.com/bakkot/cfgrammar)~~.
60 | 
61 | * ~~[Put up a demo page on gh-pages.](http://bakkot.github.io/cfgrammar-tool/)~~
62 | 
63 | License
64 | -------
65 | 
66 | Licensed under the [MIT license](http://opensource.org/licenses/MIT). If you're making public or commercial use of this library, I encourage (but do not require) you to tell me about it!
67 | 


--------------------------------------------------------------------------------
/algorithms.js:
--------------------------------------------------------------------------------
  1 | var Rule = require('./types').Rule;
  2 | var assert = require('./assert');
  3 | // pass in the Grammar constructor and its prototype will be modified to have various algorithms
  4 | module.exports = function(Grammar) {
  5 | 
  6 | 
  7 | // todo annotate almost-terminals, which are nonterminals which can only produce strings consisting only of terminals or strings consisting of terminals and nonterminals (other than itself) which are almost-terminal.
  8 | // todo simplified and aggressive simplified. denulls, standardizes nonterminal names, standardizes rule ordering. aggressive simplified probably invokes a new 'stripped' fn: it reduces the number of rules by folding almost-terminal rules into the things which make them. obviously this can have exponential blowup.
  9 | // possibly also try to remove redundant rules?
 10 | 
 11 | 
 12 | 
 13 | // modify the grammar so each symbol has a 'nullable' property
 14 | // and the grammar to have a 'nullables' property, a list of nullable symbols
 15 | // returns the list of nullables
 16 | // http://cstheory.stackexchange.com/a/2493
 17 | Grammar.prototype.annotateNullables = function() {
 18 |   if(this.hasOwnProperty('nullables')) return this.nullables; // already done, don't redo
 19 |   
 20 |   this.nullables = [];
 21 |   var queue = [];
 22 |   var cs = []; // count of non-distinct symbols in RHS of rule i currently marked non-nullable, which does not make for a good variable name
 23 |   var rMap = this.getReverseMap();
 24 | 
 25 |   for(var i=0; i<this.symbolsList.length; ++i) {
 26 |     this.symbolMap[this.symbolsList[i]].nullable = false;
 27 |   }
 28 |   
 29 |   for(var i=0; i<this.rules.length; ++i) {
 30 |     var c = 0;
 31 |     var rule = this.rules[i];
 32 |     var maybeNullable = true; // does this rule produce a string with only nonterminals?
 33 |     for(var j=0; j<rule.production.length; ++j) {
 34 |       if(rule.production[j].type === 'NT') {
 35 |         ++c;
 36 |       }
 37 |       else {
 38 |         maybeNullable = false;
 39 |         break;
 40 |       }
 41 |     }
 42 |     if(maybeNullable) {
 43 |       cs.push(c);
 44 |     }
 45 |     else {
 46 |       cs.push(0);
 47 |     }
 48 |     
 49 |     
 50 |     if(rule.production.length == 0 && !this.symbolMap[rule.name].nullable) {
 51 |       this.symbolMap[rule.name].nullable = true;
 52 |       queue.push(rule.name);
 53 |       this.nullables.push(rule.name);
 54 |     }
 55 |   }
 56 | 
 57 |   for(var i=0; i<this.rules.length; ++i) {
 58 |     this.rules[i]._index = i;
 59 |   }
 60 |   
 61 |   while(queue.length > 0) {
 62 |     var cur = queue.pop();
 63 |     for(var i=0; i<rMap[cur].length; ++i) {
 64 |       var affected = rMap[cur][i];
 65 |       if(--cs[affected._index] === 0 && !this.symbolMap[affected.name].nullable) { // can only have been positive if the rule contained no terminals, so ok
 66 |         this.symbolMap[affected.name].nullable = true;
 67 |         queue.push(affected.name);
 68 |         this.nullables.push(affected.name);
 69 |       }
 70 |     }
 71 |   }
 72 | 
 73 |   for(var i=0; i<this.rules.length; ++i) {
 74 |     delete this.rules[i]._index;
 75 |   }
 76 | 
 77 |   
 78 |   return this.nullables;
 79 | }
 80 | 
 81 | 
 82 | // modify the grammar so each symbol has an "unreachable" property
 83 | // ie, no chain of derivations from the start symbol reaches that symbol. note that something may be reachable even if no chain which produces a string involves that thing. (eg S -> AB, B->'', A->A. then B is reachable.)
 84 | // grammar gets an "unreachables" property
 85 | // returns the list of unreachables
 86 | Grammar.prototype.annotateUnreachables = function() {
 87 |   if(this.hasOwnProperty('unreachables')) return this.unreachables; // already done, don't redo
 88 |   
 89 |   this.unreachables = [];
 90 |   var queue = [this.start];
 91 | 
 92 |   for(var i=0; i<this.symbolsList.length; ++i) {
 93 |     this.symbolMap[this.symbolsList[i]].unreachable = true;
 94 |   }
 95 |   this.symbolMap[this.start].unreachable = false;
 96 |   
 97 | 
 98 |   while(queue.length > 0) {
 99 |     var cur = queue.pop();
100 |     for(var j=0; j<this.symbolMap[cur].rules.length; ++j) {
101 |       var rule = this.symbolMap[cur].rules[j];
102 |       for(var k=0; k<rule.production.length; ++k) {
103 |         var sym = rule.production[k];
104 |         if(sym.type === 'NT' && this.symbolMap[sym.data].unreachable) {
105 |           this.symbolMap[sym.data].unreachable = false;
106 |           queue.push(sym.data);
107 |         }
108 |       }
109 |     }
110 |   }
111 |   
112 |   for(var i=0; i<this.symbolsList.length; ++i) {
113 |     if(this.symbolMap[this.symbolsList[i]].unreachable) {
114 |       this.unreachables.push(this.symbolsList[i]);
115 |     }
116 |   }
117 |   
118 |   return this.unreachables;
119 | }
120 | 
121 | 
// Annotates every symbol with a boolean 'useless' property: true when no string of
// terminals can be derived from that symbol.
// The grammar gets a 'uselesses' property listing the useless symbols, which is also
// returned. The result is cached.
Grammar.prototype.annotateUseless = function() {
  if(this.hasOwnProperty('uselesses')) {
    return this.uselesses; // computed earlier; reuse
  }

  var self = this;
  var uselesses = self.uselesses = [];
  var pending = [];    // symbols newly proven productive, consequences not yet propagated
  var unresolved = []; // unresolved[i]: nonterminal occurrences in rule i not yet proven productive
  var reverse = self.getReverseMap();

  function markProductive(name) {
    self.symbolMap[name].useless = false;
    pending.push(name);
  }

  // same worklist scheme as for nullables, except symbols start out useless
  self.symbolsList.forEach(function(name) {
    self.symbolMap[name].useless = true;
  });

  self.rules.forEach(function(rule) {
    var nonterminals = rule.production.filter(function(sym) {
      return sym.type === 'NT';
    }).length;
    unresolved.push(nonterminals);

    // a rule with no nonterminals on its right-hand side proves its symbol productive outright
    if(nonterminals == 0 && self.symbolMap[rule.name].useless) {
      markProductive(rule.name);
    }
  });

  // temporarily tag each rule with its position so entries of the reverse map can find their counter
  self.rules.forEach(function(rule, position) {
    rule._index = position;
  });

  while(pending.length > 0) {
    var dependents = reverse[pending.pop()];
    for(var d=0; d<dependents.length; ++d) {
      var dependent = dependents[d];
      unresolved[dependent._index] -= 1;
      if(unresolved[dependent._index] === 0 && self.symbolMap[dependent.name].useless) {
        markProductive(dependent.name);
      }
    }
  }

  self.symbolsList.forEach(function(name) {
    if(self.symbolMap[name].useless) {
      uselesses.push(name);
    }
  });

  // remove the temporary tags again
  self.rules.forEach(function(rule) {
    delete rule._index;
  });

  return uselesses;
};
182 | 
183 | 
184 | 
185 | 
// Annotates every symbol with a boolean 'selfDeriving' property: true when
// A *=> A via some chain of derivations of length > 0.
// The grammar gets a 'selfDerivings' property listing the self-deriving symbols, which
// is also returned. The result is cached. Also annotates nullables as a side effect.
// http://cs.stackexchange.com/a/40967/12130
Grammar.prototype.annotateSelfDeriving = function() {
  if(this.hasOwnProperty('selfDerivings')) return this.selfDerivings; // already done, don't redo

  this.selfDerivings = [];

  this.annotateNullables();

  var symbols = this.symbolsList;

  var derives = {}; // derives.A.B holds if A *=> B
  for(var i=0; i<symbols.length; ++i) {
    derives[symbols[i]] = {};
  }


  // initialization: set the one-step derivations.
  rules:for(var i=0; i<this.rules.length; ++i) {
    var name = this.rules[i].name;
    var production = this.rules[i].production;

    // easy cases: production empty, contains terminals, or contains exactly one nonterminal
    if(production.length == 0) {
      continue;
    }

    for(var j=0; j<production.length; ++j) {
      if(production[j].type == 'T') {
        continue rules; // a terminal can never vanish, so this rule cannot derive a lone nonterminal
      }
    }

    if(production.length == 1) {
      derives[name][production[0].data] = true;
      continue;
    }


    // harder case: production consists of two or more nonterminals.
    var nonnullable = null;
    for(var j=0; j<production.length; ++j) {
      if(!this.symbolMap[production[j].data].nullable) {
        if(nonnullable !== null) {
          continue rules; // two or more nonnullable nonterminals: so this rule can't derive any single nonterminal
        }
        nonnullable = production[j].data;
      }
    }

    if(nonnullable !== null) { // exactly one nonnullable nonterminal: that and only that is derived.
      derives[name][nonnullable] = true;
    }
    else { // everything is nullable: each symbol can be the lone survivor
      for(var j=0; j<production.length; ++j) {
        derives[name][production[j].data] = true; // everything is a nonterminal, so this is safe
      }
    }
  }

  // transitive closure (Warshall / Floyd-Warshall). The intermediate symbol MUST be the
  // outermost loop: with it innermost, a single pass can miss chains whose intermediate
  // facts are only discovered later in the pass (eg A->C, C->B, B->D, D->A with symbol
  // order A,B,C,D would never record A *=> A).
  for(var k=0; k<symbols.length; ++k) {
    var via = symbols[k];
    for(var i=0; i<symbols.length; ++i) {
      var from = symbols[i];
      if(!derives[from][via]) {
        continue; // nothing can be routed through 'via' from here
      }
      for(var j=0; j<symbols.length; ++j) {
        if(derives[via][symbols[j]]) {
          // from derives via and via derives j, so from derives j
          derives[from][symbols[j]] = true;
        }
      }
    }
  }

  for(var i=0; i<symbols.length; ++i) {
    var cur = symbols[i];
    if(derives[cur][cur]) {
      this.symbolMap[cur].selfDeriving = true;
      this.selfDerivings.push(cur);
    }
    else {
      this.symbolMap[cur].selfDeriving = false;
    }
  }

  return this.selfDerivings;
};
272 | 
273 | 
274 | 
275 | 
276 | 
277 | 
278 | 
// Builds a new grammar from the rules that mention no useless symbol, on either side.
// The receiver is left alone apart from gaining the 'useless' annotations.
// Returns {empty: true} when no rule survives or the start symbol ends up with no rules.
Grammar.prototype.strippedUseless = function() {
  this.annotateUseless();
  var symbolMap = this.symbolMap;

  function mentionsUseless(sym) {
    return sym.type == 'NT' && symbolMap[sym.data].useless;
  }

  var kept = this.rules.filter(function(rule) {
    return !symbolMap[rule.name].useless && !rule.production.some(mentionsUseless);
  });

  if(kept.length == 0) {
    return {empty: true};
  }

  var result = Grammar(kept, this.start);
  if(result.symbolMap[result.start].rules.length === 0) {
    return {empty: true}; // the start symbol has nowhere to go
  }

  // sanity check: stripping must not leave (or create) useless symbols
  assert(result.annotateUseless().length == 0, 'Haven\'t actually eliminated all useless productions?');

  return result;
};
314 | 
// Builds a new grammar from the rules whose left-hand side is reachable from the start
// symbol. The receiver is left alone apart from gaining the 'unreachable' annotations.
// Returns {empty: true} when no rule survives or the start symbol ends up with no rules.
Grammar.prototype.strippedUnreachable = function() {
  this.annotateUnreachables();
  var symbolMap = this.symbolMap;

  // checking the LHS is enough: a rule with a reachable LHS makes every symbol on
  // its RHS reachable as well
  var kept = this.rules.filter(function(rule) {
    return !symbolMap[rule.name].unreachable;
  });

  if(kept.length == 0) {
    return {empty: true};
  }

  var result = Grammar(kept, this.start);
  if(result.symbolMap[result.start].rules.length === 0) {
    return {empty: true}; // the start symbol has nowhere to go
  }

  // sanity check: stripping must not leave unreachable symbols behind
  assert(result.annotateUnreachables().length == 0, 'Haven\'t actually eliminated all unreachable productions?');

  return result;
};
340 | 
341 | 
// Builds a new grammar in which unit productions (A -> B, with B a nonterminal) have
// been replaced by copies of B's rules under the name A.
// Does not modify the receiver. Returns {empty: true} when no rule survives.
Grammar.prototype.strippedUnitProductions = function() {
  var self = this;
  var kept = [];      // non-unit rules of the resulting grammar
  var processed = []; // unit rules that have already been expanded
  var pending = [];   // unit rules still waiting to be expanded

  function isUnit(rule) {
    return rule.production.length === 1 && rule.production[0].type != 'T';
  }

  function sameAs(rule) {
    return function(other) {
      return other.equals(rule);
    };
  }

  function alreadyKnown(rule) {
    return processed.some(sameAs(rule)) || pending.some(sameAs(rule));
  }

  // non-unit rules go straight to the output; unit rules are queued once each
  function dispatch(rule) {
    if(!isUnit(rule)) {
      kept.push(rule);
    }
    else if(!alreadyKnown(rule)) {
      pending.push(rule);
    }
  }

  self.rules.forEach(function(rule) {
    dispatch(rule);
  });

  while(pending.length > 0) {
    var unit = pending.pop();
    processed.push(unit);
    var target = unit.production[0].data; // everything pending is a unit production
    if(target === unit.name) {
      continue; // A -> A contributes nothing
    }
    self.symbolMap[target].rules.forEach(function(origRule) { // B -> whatever
      dispatch(Rule(unit.name, origRule.production.slice(0))); // A -> whatever
    });
  }

  if(kept.length == 0) {
    return {empty: true};
  }

  return Grammar(kept, self.start);
};
402 | 
403 | 
// Builds a new grammar containing only the first occurrence of each rule, as judged by
// the rules' equals method. Does not modify the receiver.
Grammar.prototype.strippedDuplicates = function() {
  var unique = this.rules.reduce(function(kept, rule) {
    var isDuplicate = kept.some(function(existing) {
      return existing.equals(rule);
    });
    if(!isDuplicate) {
      kept.push(rule);
    }
    return kept;
  }, []);

  return Grammar(unique, this.start);
};
422 | 
// TODO some testing about the proper order to strip things, to make grammar as small as possible.
// Builds a new grammar with unit productions (A -> B), useless symbols, unreachable
// symbols and duplicate rules removed, in that order. The receiver is left alone apart
// from gaining annotations. As soon as any stage reports {empty: true}, that object is returned.
Grammar.prototype.stripped = function() {
  var result = this.strippedUnitProductions();

  // useless before unreachable, never the other way around
  if(!result.empty) {
    result = result.strippedUseless();
  }
  if(!result.empty) {
    result = result.strippedUnreachable();
  }
  if(result.empty) {
    return result;
  }

  assert(result.annotateUseless().length == 0, 'Suddenly there are more useless symbols?');

  return result.strippedDuplicates();
};
443 | 
444 | 
445 | 
// Returns the subset of `list` selected by the binary digits of n: element i is included
// exactly when bit i of n is set. Not the most efficient implementation, but it is only
// used when stripping nullables.
function nthSubset(list, n) {
  var picked = [];
  var bit = 1;
  var index = 0;
  while(bit <= n) {
    if((n & bit) !== 0) {
      picked.push(list[index]);
    }
    bit <<= 1;
    index += 1;
  }
  return picked;
}
457 | 
458 | 
// Returns a new grammar which recognizes the same language as this one, minus the empty
// string, and which contains no epsilon productions. The receiver is left alone apart
// from gaining annotations.
// The result has a property 'makesEpsilon', taken from whether the start symbol of the
// stripped grammar is nullable.
// If the language is otherwise empty, returns {empty: true, makesEpsilon: [as appropriate]}.
Grammar.prototype.deNulled = function() {

  var newGrammar = this.stripped();
  if(newGrammar.empty) {
    newGrammar.makesEpsilon = false;
    return newGrammar;
  }

  newGrammar.annotateNullables();
  var makesEpsilon = newGrammar.symbolMap[newGrammar.start].nullable;
  var newRules = []; // declared locally: a bare assignment would leak a global (and throw in strict mode)
  for(var i=0; i<newGrammar.rules.length; ++i) {
    var rule = newGrammar.rules[i];
    if(rule.production.length == 0) {
      continue; // do not add epsilon productions
    }

    // positions in the RHS holding a nullable nonterminal
    var nullableRHSIndices = [];
    for(var j=0; j<rule.production.length; ++j) {
      if(rule.production[j].type == 'NT' && newGrammar.symbolMap[rule.production[j].data].nullable) {
        nullableRHSIndices.push(j);
      }
    }

    if(nullableRHSIndices.length == 0) { // don't actually need this case, but meh.
      newRules.push(rule);
      continue;
    }

    // if every RHS symbol is nullable, skip the subset which omits all of them: that would be an epsilon production.
    var skipFinal = (nullableRHSIndices.length == rule.production.length)?1:0;
    var lastSubset = Math.pow(2, nullableRHSIndices.length) - skipFinal;

    // one new rule for each subset of nullable RHS symbols, omitting precisely that subset
    for(var j = 0; j<lastSubset; ++j) {
      var skippedSubset = nthSubset(nullableRHSIndices, j);

      var newProduction = [];
      for(var k=0; k<rule.production.length; ++k) {
        if(skippedSubset.indexOf(k) == -1) {
          newProduction.push(rule.production[k]);
        }
      }

      newRules.push(Rule(rule.name, newProduction));
    }

  }

  if(newRules.length == 0) {
    return {empty: true, makesEpsilon: makesEpsilon};
  }

  newGrammar = Grammar(newRules, newGrammar.start);
  assert(newGrammar.annotateNullables().length == 0, 'Having removed nullables, there are still nullables.');

  // omitting symbols may have introduced new unit productions, duplicates, etc: clean up again
  newGrammar = newGrammar.stripped();
  newGrammar.makesEpsilon = makesEpsilon;

  assert(newGrammar.empty || newGrammar.annotateSelfDeriving().length == 0, 'Removing nullables and unit productions did not prevent self-deriving, somehow.');

  return newGrammar;
};
524 | 
// Returns the distinct terminals appearing in the rules of the de-nulled grammar, sorted
// and joined into a single string. Returns '' when the de-nulled grammar is empty.
Grammar.prototype.alphabet = function() {
  var reduced = this.deNulled();
  if(reduced.empty) {
    return '';
  }

  var terminals = [];
  for(var r=0; r<reduced.rules.length; ++r) {
    var production = reduced.rules[r].production;
    for(var s=0; s<production.length; ++s) {
      var sym = production[s];
      if(sym.type !== 'T') {
        continue;
      }
      if(terminals.indexOf(sym.data) === -1) {
        terminals.push(sym.data);
      }
    }
  }

  terminals.sort();
  return terminals.join('');
};
537 | 
538 | }
539 | 


--------------------------------------------------------------------------------
/assert.js:
--------------------------------------------------------------------------------
1 | module.exports = function(condition, message) {
2 |   if(!condition) {
3 |     throw new Error(message);
4 |   }
5 | }
6 | 


--------------------------------------------------------------------------------
/check.js:
--------------------------------------------------------------------------------
 1 | var assert = require('./assert');
 2 | var parser = require('./parser');
 3 | var generator = require('./generate');
 4 | 
 5 | 
 6 | // Attempts to prove two grammars are different through the magic of fuzzing.
 7 | // If it finds a string s which is accepted by one but not the other,
 8 | // returns {string: s, acceptedByFirst: boolean} (acceptedByFirst is true if A accepts
 9 | // and B rejects, false if A rejects and B accepts. In other cases s is not a witness
10 | // to A and B being different.)
11 | // If no witness is found, returns null. (So you can use this in an `if` if you don't
12 | // care what the witness is.)
13 | // 'count' and 'length' are optional parameters specifying how many strings at each
14 | // length to check and the maximum length of strings to check respectively.
15 | // Default count is 10 and length is 20.
16 | // If 'deterministic' is true, the RNG used will be deterministic.
17 | // If 'skipAsserts' is true, will not bother checking that generated strings are parsed. TODO don't both with asserts at all.
18 | // Ends up wasting some time generating duplicates at low lengths, but whatever.
19 | // TODO: for efficiency, should use the ftables from generator to limit how count at
20 | // a given length
21 | function locatableDifference(A, B, count, length, deterministic, skipAsserts) {
22 |   count = count || 10;
23 |   length = length || 20;
24 |   if(length < 0 || count < 1) return false;
25 |   
26 |   var oldProduceCount = parser.PRODUCECOUNT;
27 |   parser.PRODUCECOUNT = parser.PRODUCEONE;
28 |   
29 |   function witness(s, which) {
30 |     parser.PRODUCECOUNT = oldProduceCount; // found a witness: done, so reset. yeah, probably shouldn't go here.
31 |     return {string: s, acceptedByFirst: which};
32 |   }
33 |   
34 |   var genA = generator(A, deterministic);
35 |   var genB = generator(B, deterministic);
36 |   
37 |   for(var n=0; n<length; ++n) {
38 |     // first, check that they both either do or do not produce any strings of this length
39 |     var a = genA(n);
40 |     var b = genB(n);
41 |     if(a === null && b === null) {
42 |       continue; // not gonna get any strings; move on.
43 |     }
44 |     else if(a !== null && b === null) {
45 |       if(!skipAsserts) assert(parser.parse(A, a).length === 1, 'Generated a string "' + a + '" which did not parse.');
46 |       return witness(a, true);
47 |     }
48 |     else if(a === null && b !== null) {
49 |       if(!skipAsserts) assert(parser.parse(B, b).length === 1, 'Generated a string "' + b + '" which did not parse.');
50 |       return witness(b, false);
51 |     }
52 |     // ok, at least some strings in each.
53 |     // strictly speaking, could compare a and b here, but whatever.
54 |     for(var i=0; i<count; ++i) {
55 |       a = genA(n);
56 |       if(parser.parse(B, a).length !== 1) {
57 |         return witness(a, true);
58 |       }
59 |       
60 |       b = genB(n);
61 |       if(parser.parse(A, b).length !== 1) {
62 |         return witness(b, false);
63 |       }
64 |     }
65 |   }
66 |   
67 |   parser.PRODUCECOUNT = oldProduceCount;
68 |   return null;
69 | }
70 | 
71 | 
72 | module.exports.locatableDifference = locatableDifference;


--------------------------------------------------------------------------------
/generate.js:
--------------------------------------------------------------------------------
  1 | // taken directly from http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.32.8707
  2 | 
  3 | var assert = require('./assert');
  4 | 
  5 | function sum(l) {
  6 |   var out = 0;
  7 |   for(var i=0; i<l.length; ++i) {
  8 |     out += l[i];
  9 |   }
 10 |   return out;
 11 | }
 12 | 
 13 | function choose(l, r) { // choose an entry from a list at random. should be passed a random number.
 14 |   var total = sum(l);
 15 |   if(total == 0) return -1; // no valid options
 16 |   for(var i=0; i<l.length; ++i) {
 17 |     var t = l[i]/total;
 18 |     if(r < t) return i;
 19 |     r -= t;
 20 |   }
 21 |   console.log('No choices? This shouldn\'t really happen.', r);
 22 |   return l.length-1;
 23 | }
 24 | 
 25 | 
 26 | function generatorFactory(grammar, deterministic) {
 27 |   grammar = grammar.deNulled();
 28 |   if(!grammar.empty && grammar.annotateSelfDeriving().length > 0) {
 29 |     throw Error('Generator does not work when there are infinitely many parses for a string. (ie, when A*=>A is possible.)');
 30 |   }
 31 |   
 32 |   var rand = !deterministic ? Math.random : (function() {
 33 |     var seed = 0x2F6E2B1;
 34 |     return function() {
 35 |       // Robert Jenkins' 32 bit integer hash function. From Octane / V8.
 36 |       seed = ((seed + 0x7ED55D16) + (seed << 12))  & 0xFFFFFFFF;
 37 |       seed = ((seed ^ 0xC761C23C) ^ (seed >>> 19)) & 0xFFFFFFFF;
 38 |       seed = ((seed + 0x165667B1) + (seed << 5))   & 0xFFFFFFFF;
 39 |       seed = ((seed + 0xD3A2646C) ^ (seed << 9))   & 0xFFFFFFFF;
 40 |       seed = ((seed + 0xFD7046C5) + (seed << 3))   & 0xFFFFFFFF;
 41 |       seed = ((seed ^ 0xB55A4F09) ^ (seed >>> 16)) & 0xFFFFFFFF;
 42 |       return (seed & 0xFFFFFFF) / 0x10000000;
 43 |     };
 44 |   }());
 45 | 
 46 |   var ftable = {};
 47 |   function f(sym, n) {
 48 |     if(!(sym in ftable)) {
 49 |       ftable[sym] = {};
 50 |     }
 51 |     if(n in ftable[sym]) {
 52 |       return ftable[sym][n];
 53 |     }
 54 |   
 55 |     var out = [];
 56 |     for(var j=0; j<grammar.symbolMap[sym].rules.length; ++j) {
 57 |       out.push(sum(fprime(sym, j, 0, n)));
 58 |     }
 59 |   
 60 |     ftable[sym][n] = out;
 61 |     return out;
 62 |   }
 63 | 
 64 |   var fprimetable = {};
 65 |   function fprime(sym, j, k, n) {
 66 |     if(n == 0) return [];
 67 |   
 68 |     if(!(sym in fprimetable)) {
 69 |       fprimetable[sym] = {};
 70 |     }
 71 |     if(!(j in fprimetable[sym])) {
 72 |       fprimetable[sym][j] = {};
 73 |     }
 74 |     if(!(k in fprimetable[sym][j])) {
 75 |       fprimetable[sym][j][k] = {};
 76 |     }
 77 |     if(n in fprimetable[sym][j][k]) {
 78 |       return fprimetable[sym][j][k][n];
 79 |     }
 80 |   
 81 |     var x = grammar.symbolMap[sym].rules[j].production[k];
 82 |     var tij = grammar.symbolMap[sym].rules[j].production.length-1;
 83 |     var out;
 84 |     if(x.type == 'T') {
 85 |       if(k == tij) { // basically, if we are being asked about the last symbol
 86 |         if(n == 1) { // paper has n=0. pretty sure that's a typo.
 87 |           out = [1];
 88 |         }
 89 |         else {
 90 |           out = [0];
 91 |         }
 92 |       }
 93 |       else {
 94 |         out = [sum(fprime(sym, j, k+1, n-1))];
 95 |       }
 96 |     }
 97 |     else {
 98 |       if(k == tij) {
 99 |         out = [sum(f(x.data, n))];
100 |       }
101 |       else {
102 |         out = [];
103 |         for(var l=1; l<=n-tij+k; ++l) {
104 |           out.push(sum(f(x.data, l)) * sum(fprime(sym, j, k+1, n-l)));
105 |         }
106 |       }
107 |     }
108 |   
109 |     fprimetable[sym][j][k][n] = out;
110 |     return out;
111 |   }
112 | 
113 | 
114 | 
115 |   function g(sym, n) {
116 |     var r = choose(f(sym, n), rand());
117 |     if(r == -1) return null; // no valid options
118 |     return gprime(sym, r, 0, n);
119 |   }
120 | 
121 | 
122 |   function gprime(sym, j, k, n) {
123 |     var x = grammar.symbolMap[sym].rules[j].production[k];
124 |     //console.log(sym, j, k, n, x)
125 |     var tij = grammar.symbolMap[sym].rules[j].production.length-1;
126 |   
127 |     if(x.type == 'T') {
128 |       if(k == tij) {
129 |         return [x.data];
130 |       }
131 |       else {
132 |         return [x.data].concat(gprime(sym, j, k+1, n-1));
133 |       }
134 |     }
135 |     else {
136 |       if(k == tij) {
137 |         return g(x.data, n);
138 |       }
139 |       else {
140 |         var l = choose(fprime(sym, j, k, n), rand()); // paper has i, i, k, n. pretty sure that's a typo
141 |         assert(l !== -1, "Couldn't find a valid choice.");
142 |         return g(x.data, l+1).concat(gprime(sym, j, k+1, n-(l+1))); // l is a length, not an index
143 |       }
144 |     }
145 |   }
146 | 
147 | 
148 |   function generate(n, opts) {
149 |     var asList = opts != null && opts.list;
150 |     if(n == 0) {
151 |       return grammar.makesEpsilon ? asList ? [] : '' : null;
152 |     }
153 |     if(grammar.empty) {
154 |       return null;
155 |     }
156 |     var outList = g(grammar.start, n);
157 |     if (outList == null) {
158 |       return null;
159 |     }
160 |     return asList ? outList : outList.join('');
161 |   }
162 |   
163 |   // TODO probably get rid of this.
164 |   // determine if there are any strings in the grammar of length in [start, start+range)
165 |   // returns such an n, if one exists, or -1 if none exist, or -2 if the language is {''},
166 |   // or -3 if the language is the empty set.
167 |   // by default, start=0, range=10
168 |   generate.findLength = function(start, range) {
169 |     if(grammar.empty) {
170 |       return grammar.makesEpsilon?-2:-3;
171 |     }
172 |     start = start || 0;
173 |     range = range || 10;
174 | 
175 |     if(start == 0 && grammar.makesEpsilon) {
176 |       return 0;
177 |     }
178 | 
179 |     for(var n=start; n<start+range; ++n) {
180 |       if(choose(f(grammar.start, n), rand()) !== -1) {
181 |         return n;
182 |       }
183 |     }
184 |     
185 |     return -1;
186 |   }
187 |   
188 |   // In the range [start, start+range), which lengths are possible?
189 |   // Returns null if the grammar is empty.
190 |   // TODO could also tell people when the only possibility is the empty string...
191 |   generate.findLengths = function(start, range) {
192 |     start = start || 0;
193 |     range = range || 10;
194 |     if(grammar.empty) {
195 |       if(!grammar.makesEpsilon) {
196 |         return null;
197 |       }
198 |       else {
199 |         return start == 0 ? [0]:[];
200 |       }
201 |     }
202 |     
203 |     var lengths = [];
204 |     if(start == 0) {
205 |       if(grammar.makesEpsilon) {
206 |         lengths.push(0);
207 |       }
208 |       start = 1;
209 |     }
210 |     
211 |     for(var length = start; length<start+range; ++length) {
212 |       if(choose(f(grammar.start, length), rand()) !== -1) {
213 |         lengths.push(length);
214 |       }
215 |     }
216 |     
217 |     return lengths;
218 |   }
219 |   
220 |   return generate;
221 | }
222 | 
223 | 
224 | module.exports = generatorFactory;


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
// Package entry point: gathers the library's modules under one object.
var api = {
  types: require('./types'),
  generator: require('./generate'),
  parser: require('./parser'),
  checks: require('./check'),
  printers: require('./printers')
};

module.exports = exports = api;
8 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "cfgrammar-tool",
 3 |   "version": "1.0.0",
 4 |   "description": "Work with context-free grammars. Parsing, string generation, and manipulation.",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "test": "node test.js"
 8 |   },
 9 |   "repository": {
10 |     "type": "git",
11 |     "url": "https://github.com/bakkot/cfgrammar-tool.git"
12 |   },
13 |   "keywords": [
14 |     "context free grammar",
15 |     "parser",
16 |     "fuzzing"
17 |   ],
18 |   "author": "bakkot",
19 |   "license": "MIT",
20 |   "bugs": {
21 |     "url": "https://github.com/bakkot/cfgrammar-tool/issues"
22 |   },
23 |   "homepage": "https://github.com/bakkot/cfgrammar-tool"
24 | }
25 | 


--------------------------------------------------------------------------------
/parser.js:
--------------------------------------------------------------------------------
  1 | // http://cs.stackexchange.com/questions/40965/cfgs-detecting-infinitely-many-derivations-of-a-single-string
  2 | // http://www.cs.laurentian.ca/jdompierre/html/MATH2056E_W2011/cours/s8.4_closures_relations_BW.pdf
  3 | // https://a2c2a.wordpress.com/2014/09/18/implementing-an-earley-parser-that-handles-nullable-grammars-and-draws-all-unique-parse-trees-in-python/
  4 | // http://web.stanford.edu/~crwong/cfg/grammar.html
  5 | // http://en.wikipedia.org/wiki/Floyd%E2%80%93Warshall_algorithm
  6 | 
  7 | 
  8 | var assert = require('./assert');
  9 | var types = require('./types');
 10 | require('./algorithms')(types.Grammar);
 11 | 
 12 | var parser = {};
 13 | 
 14 | var enums = {
 15 |   DISTINCT: {},
 16 |   SIMILAR: {},
 17 |   IDENTICAL: {}, // ie, same rule, index, and predecessor, but different sub-parses
 18 |   PRODUCEONE: {},
 19 |   PRODUCETWO: {},
 20 |   PRODUCEALL: {}
 21 | }
 22 | parser.PRODUCEONE = enums.PRODUCEONE;
 23 | parser.PRODUCETWO = enums.PRODUCETWO;
 24 | parser.PRODUCEALL = enums.PRODUCEALL; // TODO this should not be a global setting. if you really need, have different parse functions.
 25 | 
 26 | 
 27 | 
 28 | parser.PRODUCECOUNT = enums.PRODUCETWO;
 29 | 
 30 | var NT = types.NT;
 31 | var T = types.T;
 32 | var Rule = types.Rule;
 33 | var Grammar = types.Grammar;
 34 | 
 35 | 
 36 | // library code, woo
 37 | function arraysEqual(a, b) {
 38 |   if(a === b) return true;
 39 |   if(a == null || b == null) return false;
 40 |   if(a.length != b.length) return false;
 41 |   for(var i = 0; i < a.length; ++i) {
 42 |     if(a[i] !== b[i]) return false;
 43 |   }
 44 |   return true;
 45 | }
 46 | 
 47 | 
 48 | 
 49 | 
 50 | // a State in an Earley parse is a tuple (rule, index, predecessor, backPointers)
 51 | // Conceptually, a State is a possibly-partial sub-parse of some part of the string.
 52 | // 'rule' is the rule which this state is a (possibly partial) parse of
 53 | // 'index' is how far along in the rule's production this state is
 54 | // 'predecessor' is the index in the string-being-parsed at which this rule began
 55 | // 'backPointers' is the children of this rule, essentially: that is,
 56 | //   when index > 0, index has been pushed along by a series of sub-parses completing,
 57 | //   each sub-parse representing a terminal or nonterminal in this rule's production.
 58 | //   backPointers is an array containing those completed sub-parses/States.
 59 | //   in particular, backPointers[i] is the State object corresponding to
 60 | //   rule.production[i] (or null if said production is a terminal).
 61 | // TODO rename backPointers, do away with index
 62 | // TODO have 'c' instead of null for terminals in backPointers
 63 | function State(rule, index, predecessor, backPointers) {
 64 |   if(!(this instanceof State)) return new State(rule, index, predecessor, backPointers);
 65 |   this.rule = rule;
 66 |   this.index = index;
 67 |   this.predecessor = predecessor;
 68 |   this.backPointers = backPointers || [];
 69 |   assert(this.index == this.backPointers.length); // honestly could just do away with index at this point
 70 | }
 71 | State.prototype.done = function(){ return this.index === this.rule.production.length; }
 72 | State.prototype.compare = function(other) {
 73 |   if(this.rule === other.rule
 74 |   && this.index === other.index
 75 |   && this.predecessor === other.predecessor) {
 76 |     if(arraysEqual(this.backPointers, other.backPointers)) {
 77 |       return enums.IDENTICAL;
 78 |     }
 79 |     else {
 80 |       return enums.SIMILAR;
 81 |     }
 82 |   }
 83 |   else {
 84 |     return enums.DISTINCT;
 85 |   }
 86 | }
 87 | State.prototype.next = function(){ return this.rule.production[this.index]; } 
 88 | State.prototype.toString = function(){
 89 |   return '(' + this.rule.name + ' -> ' + this.rule.production.slice(0, this.index).join('')
 90 |           + '*' + this.rule.production.slice(this.index).join('') + ', ' + this.predecessor.toString() + ')';
 91 | }
 92 | 
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | function parse(grammar, str, produceCount) {
100 |   if(typeof str !== 'string') throw Error('Can\'t parse non-string object ' + (typeof str));
101 |   var oldProduceCount = parser.PRODUCECOUNT;
102 |   if(produceCount) {
103 |     parser.PRODUCECOUNT = produceCount;
104 |   }
105 |   
106 |   var chart = [];
107 |   for(var i=0; i<=str.length; ++i) chart.push([]);
108 |   
109 |   function seen(state, strPos) {
110 |     var count = 0;
111 |     for(var i=0; i<chart[strPos].length; ++i) {
112 |       var equalness = state.compare(chart[strPos][i]);
113 |       if(equalness == enums.IDENTICAL || (equalness == enums.SIMILAR && parser.PRODUCECOUNT == enums.PRODUCEONE)) { // either we've seen this exact thing before, or we've seen this modulo different parses and don't care about different parses
114 |         return true;
115 |       }
116 |       if(equalness == enums.SIMILAR && parser.PRODUCECOUNT == enums.PRODUCETWO && ++count > 1) { // we've seen something similar and do care
117 |         return true;
118 |       }
119 |     }
120 |     return false;
121 |   }
122 |   
123 |   function scanner(state, strPos) {
124 |     if(state.next().equals(T(str[strPos]))) {
125 |       var newBPs = state.backPointers.slice(0);
126 |       newBPs.push(null); // terminals do not need backpointers, of course
127 |       var advanced = State(state.rule, state.index+1, state.predecessor, newBPs);
128 |       if(!seen(advanced, strPos+1)) {
129 |         chart[strPos+1].push(advanced);
130 |       }
131 |     }
132 |   }
133 |   
134 |   function predictor(state, strPos) {
135 |     var sym = state.next();
136 |     for(var i=0; i<grammar.symbolMap[sym.data].rules.length; ++i) {
137 |       var advanced = State(grammar.symbolMap[sym.data].rules[i], 0, strPos);
138 |       if(!seen(advanced, strPos)) {
139 |         chart[strPos].push(advanced);
140 |       }
141 |     }
142 |     
143 |     // handle silly nullable cornercase: we might need to "re-run" completer for a nullable
144 |     // if we are predicting that nullable but it's already been processed
145 |     // given 'nullable' annotation, we could skip this when 'sym' is not nullable
146 |     for(var i=0; i<chart[strPos].length; ++i) { // can actually abort when we hit current state, but no real need (todo check speedup)
147 |       var candidate = chart[strPos][i];
148 |       if(candidate.rule.name === sym.data && candidate.predecessor === strPos && candidate.done()) {
149 |         var newBPs = state.backPointers.slice(0);
150 |         newBPs.push(candidate); // 'candidate' is already done
151 |         var advanced = State(state.rule, state.index+1, state.predecessor, newBPs);
152 |         if(!seen(advanced, strPos)) {
153 |           chart[strPos].push(advanced);
154 |         }
155 |       }
156 |     }
157 |   }
158 |   
159 |   function completer(state, strPos) {
160 |     var thisSym = NT(state.rule.name);
161 |     for(var i=0; i<chart[state.predecessor].length; ++i) {
162 |       var prevState = chart[state.predecessor][i];
163 |       if(!prevState.done() && thisSym.equals(prevState.next())) {
164 |         var newBPs = prevState.backPointers.slice(0);
165 |         newBPs.push(state); // just finished 'state'
166 |         var advanced = State(prevState.rule, prevState.index+1, prevState.predecessor, newBPs);
167 |         if(!seen(advanced, strPos)) {
168 |           chart[strPos].push(advanced);
169 |         }
170 |       }      
171 |     }
172 |   }
173 |   
174 | 
175 |   if(parser.PRODUCECOUNT == enums.PRODUCEALL && grammar.annotateSelfDeriving().length !== 0) {
176 |     throw Error('Asked for all parses, but grammar can produce infinitely many parses for some string. Check grammar.annotateSelfDeriving() for specifics.');
177 |   }
178 |     
179 |   
180 |   var startSym = grammar.start;
181 |   var gammaRule = Rule(['GAMMA'], [NT(startSym)]); // needs a _unique_ identifier. Easiest way: new object
182 |   chart[0].push(State(gammaRule, 0, 0));
183 |   
184 |   for(var i=0; i<=str.length; ++i) {
185 |     for(var j=0; j<chart[i].length; ++j) {
186 |       var state = chart[i][j];
187 |       if(!state.done()) {
188 |         if(state.next().type == 'NT') {
189 |           predictor(state, i);
190 |         }
191 |         else {
192 |           scanner(state, i);
193 |         }
194 |       }
195 |       else {
196 |         completer(state, i);
197 |       }
198 |     }
199 |   }
200 | 
201 |   // done constructing chart; time to find parses
202 |   var parses = [];
203 |   for(var i=0; i<chart[str.length].length; ++i) {
204 |     var state = chart[str.length][i];
205 |     if(state.rule === gammaRule && state.done()) {
206 |       parses.push(state);
207 |     }
208 |   }
209 |   
210 |   parser.PRODUCECOUNT = oldProduceCount;
211 |   return parses;
212 | }
213 | 
214 | parser.parse = parse;
215 | 
216 | 
217 | 
218 | module.exports = parser;
219 | 


--------------------------------------------------------------------------------
/printers.js:
--------------------------------------------------------------------------------
  1 | var INDENT = '  ';
  2 | function subtreePrinter(state, depth) {
  3 |   depth = depth | 0;
  4 |   var prefix = '';
  5 |   for(var i=0; i<depth; ++i) {
  6 |     prefix += INDENT;
  7 |   }
  8 |   console.log(prefix + state.rule)// + ' ' + state.backPointers.length);
  9 |   prefix += INDENT;
 10 |   for(var i=0; i<state.backPointers.length; ++i) {
 11 |     var backPointer = state.backPointers[i];
 12 |     if(backPointer === null) { // ie, terminal
 13 |       console.log(prefix + state.rule.production[i].data); 
 14 |     }
 15 |     else {
 16 |       subtreePrinter(backPointer, depth+1);
 17 |     }
 18 |   }
 19 | }
 20 | 
 21 | 
 22 | function rewritePrinter(parse) {
 23 |   var str = [parse];
 24 |   
 25 |   function formatIntermediateString(highlightIndex) { // highlightIndex must be a state, not a final symbol
 26 |     var o = '';
 27 |     for(var i=0; i<str.length; ++i) {
 28 |       if(i == highlightIndex) {
 29 |         o += '*' + str[i].rule.name + '*';
 30 |       }
 31 |       else {
 32 |         if(typeof str[i] === 'string') {
 33 |           o += str[i];
 34 |         }
 35 |         else {
 36 |           o += str[i].rule.name;
 37 |         }
 38 |       }
 39 |     }
 40 |     return o;
 41 |   }
 42 |   
 43 |   for(var i = 0; i<str.length; ++i) { // NB: both str.length and i change within the rewrite
 44 |     if(typeof str[i] === 'string') {
 45 |       continue;
 46 |     }
 47 |     
 48 |     var state = str[i];
 49 |     var out = state.rule.toString() + '  |  ';
 50 |     out += formatIntermediateString(i) + '  |  ';
 51 |     
 52 |     var rewritten = [];
 53 |     for(var j=0; j<state.index; ++j) {
 54 |       if(state.rule.production[j].type == 'T') {
 55 |         rewritten.push(state.rule.production[j].data);
 56 |       }
 57 |       else {
 58 |         rewritten.push(state.backPointers[j]);
 59 |       }
 60 |     }
 61 |     str = str.slice(0, i).concat(rewritten).concat(str.slice(i+1));
 62 |     out += formatIntermediateString(-1);
 63 |     console.log(out);
 64 |     --i; // gotta reprocess the index we just rewrote
 65 |   }
 66 |   
 67 | }
 68 | 
 69 | 
 70 | function astPrinter(parse, collapseUnitProductions, discardImplicitTerminals, ruleRenamingFunction) {
 71 |   // collapseUnitProductions defaults to false. If true, rules of the form X->Y will not generate an additional level in the AST.
 72 |   // discardImplicitTerminals: if a production contains both terminals and nonterminals, children does not contain the terminals.
 73 |   // ruleRenamingFunction should be a function from Rules in the grammar to names of rules (e.g. strings), which will then be used as the 'type' of nodes. If not present, 'type' will be the Rule itself.
 74 |   // Non-terminals in the resulting AST have 'type' and 'children' properties, with 'children' being an array. Terminals have type 'Terminal' and a 'value' property containing their value.
 75 |   
 76 |   var rename = typeof ruleRenamingFunction === 'function';
 77 |   
 78 |   function backPointerToSubtree(bp) {
 79 |     if (collapseUnitProductions && bp.backPointers.length === 1) {
 80 |       var child = bp.backPointers[0];
 81 |       if (child === null) {
 82 |         return {
 83 |           type: 'Terminal',
 84 |           value: bp.rule.production[0].data
 85 |         };
 86 |       } else {
 87 |         return backPointerToSubtree(child);
 88 |       }
 89 |     }
 90 |     var tree = {
 91 |       type: rename ? ruleRenamingFunction(bp.rule) : bp.rule,
 92 |       children: []
 93 |     }
 94 |     var keepTerminals = !(discardImplicitTerminals && bp.backPointers.some(function(c){return c!== null;}));
 95 |     for (var i = 0; i<bp.backPointers.length; ++i) {
 96 |       var current = bp.backPointers[i];
 97 |       if (current === null) {
 98 |         if (keepTerminals) {
 99 |           tree.children.push({
100 |             type: 'Terminal',
101 |             value: bp.rule.production[i].data
102 |           });
103 |         }
104 |       } else {
105 |         tree.children.push(backPointerToSubtree(current));
106 |       }
107 |     }
108 |     return tree;
109 |   }
110 |   return backPointerToSubtree(parse.backPointers[0]);
111 | }
112 | 
113 | 
// Helper for domRule and domGrammarPrinter.
// Returns a span representing a RHS: terminals become text nodes, nonterminals become
// nested spans with class cfg-symbol, and an empty production is shown as an epsilon.
function domProduction(production) {
  var container = document.createElement('span');

  if(production.length == 0) {
    container.appendChild(document.createTextNode('\u025B')); // epsilon
    return container;
  }

  for(var position=0; position<production.length; ++position) {
    var symbol = production[position];
    var label = document.createTextNode(symbol.data);
    if(symbol.type == 'T') {
      container.appendChild(label);
    }
    else {
      var wrapper = document.createElement('span');
      wrapper.className = 'cfg-symbol';
      wrapper.appendChild(label);
      container.appendChild(wrapper);
    }
  }
  return container;
}
136 | 
// Helper for domPrinter.
// Create a DOM node representing the rule. Obviously only call in browsers.
// The result is a span with class cfg-rule holding: the rule name (in a span with class
// cfg-symbol), a right arrow, and the production as rendered by domProduction.
function domRule(rule) {
  var lhs = document.createElement('span');
  lhs.className = 'cfg-symbol';
  lhs.appendChild(document.createTextNode(rule.name));

  var node = document.createElement('span');
  node.className = 'cfg-rule';
  node.appendChild(lhs);
  node.appendChild(document.createTextNode(' \u2192 ')); // right arrow
  node.appendChild(domProduction(rule.production));
  return node;
}
154 | 
// Create a DOM table representing the entire parse. Obviously only call in browsers.
//
// parse: a completed GAMMA state, as returned by the parser.
// Returns a <table> with one header row and one row per rewrite step. Each row has
// three cells: the rule applied, the sentential form with the symbol about to be
// rewritten wrapped in a span of class cfg-rewrite, and the resulting form with the
// replacement text wrapped the same way.
//
// All grammar-supplied text (rule and symbol names, terminals) is inserted through text
// nodes, never through innerHTML, so markup characters in a grammar cannot inject HTML.
function domPrinter(parse) {
  // the current sentential form: plain strings are terminals, everything else is a State
  var str = [parse];
  
  // Render `str` as a span, wrapping entries [highlightStart, highlightStart+highlightLength)
  // in a span with class cfg-rewrite. highlightLength defaults to 1.
  function formatIntermediateString(highlightStart, highlightLength) {
    if(typeof highlightLength !== 'number' || highlightLength < 0) highlightLength = 1;
    
    var o = document.createElement('span');
    var c = o; // the node currently receiving children: `o` itself or the highlight span
    for(var i=0; i<str.length; ++i) {
      if(i == highlightStart) {
        c = document.createElement('span');
        c.className = 'cfg-rewrite';
        o.appendChild(c);
      }
      
      if(i - highlightStart >= highlightLength) {
        c = o; // past the end of the highlighted run
      }
      
      if(typeof str[i] === 'string') {
        c.appendChild(document.createTextNode(str[i]));
      }
      else {
        var symbolSpan = document.createElement('span');
        symbolSpan.className = 'cfg-symbol';
        symbolSpan.appendChild(document.createTextNode(str[i].rule.name));
        c.appendChild(symbolSpan);
      }
    }
    return o;
  }

  var out = document.createElement('table');
  out.className = 'cfg-derivations derivations'; // TODO second is for compat
  out.innerHTML = '<thead><tr><th>Rule</th><th>Application</th><th>Result</th></tr></thead>';
  
  
  // handle GAMMA state specially
  var row = document.createElement('tr');
  var cell = document.createElement('td');
  var startRule = document.createElement('span');
  startRule.className = 'cfg-rule';
  startRule.appendChild(document.createTextNode('Start \u2192 '));
  var startSymbol = document.createElement('span');
  startSymbol.className = 'cfg-symbol';
  startSymbol.appendChild(document.createTextNode(parse.backPointers[0].rule.name));
  startRule.appendChild(startSymbol);
  cell.appendChild(startRule);
  row.appendChild(cell);

  cell = document.createElement('td');
  var startLabel = document.createElement('span');
  startLabel.className = 'cfg-start';
  startLabel.appendChild(document.createTextNode('Start'));
  cell.appendChild(startLabel);
  row.appendChild(cell);
  
  str = [parse.backPointers[0]]; // ie, start symbol
  cell = document.createElement('td');
  cell.appendChild(formatIntermediateString(-1));
  row.appendChild(cell);
  
  out.appendChild(row);

  
  for(var i = 0; i<str.length; ++i) { // NB: both str.length and i change within the body of the loop
    if(typeof str[i] === 'string') {
      continue;
    }
    
    var state = str[i];

    row = document.createElement('tr');
    cell = document.createElement('td');
    cell.appendChild(domRule(state.rule));
    row.appendChild(cell);
  
    cell = document.createElement('td');
    cell.appendChild(formatIntermediateString(i));
    row.appendChild(cell);
    
    // replace the state by its children: terminal text, or the child state
    var rewritten = [];
    for(var j=0; j<state.index; ++j) {
      if(state.rule.production[j].type == 'T') {
        rewritten.push(state.rule.production[j].data);
      }
      else {
        rewritten.push(state.backPointers[j]);
      }
    }
    str = str.slice(0, i).concat(rewritten).concat(str.slice(i+1));

    cell = document.createElement('td');
    cell.appendChild(formatIntermediateString(i, rewritten.length));
    row.appendChild(cell);
    out.appendChild(row);

    --i; // gotta reprocess the index we just rewrote
  }
  
  return out;
}
256 | 
257 | 
258 | 
// Escape `str` for inclusion in HTML by letting the DOM serialize a text node.
// Only call in browsers.
function escapeHTML(str) {
  // not my preferred solution, but whatever.
  var scratch = document.createElement('div');
  var textNode = document.createTextNode(str);
  scratch.appendChild(textNode);
  return scratch.innerHTML;
}
265 | 
// Create a DOM div representing the entire grammar. Obviously only call in browsers.
// The div holds a "Start symbol" line, then one line per entry of grammar.symbolsList
// showing that symbol's productions separated by ' | '; every line is followed by a <br>.
function domGrammarPrinter(grammar) {
  var container = document.createElement('div');

  // Append a finished line plus its line break to the container.
  function addLine(node) {
    container.appendChild(node);
    container.appendChild(document.createElement('br'));
  }

  var header = document.createElement('span');
  header.innerHTML = 'Start symbol: <span class="cfg-symbol">' + escapeHTML(grammar.start) + '</span>';
  addLine(header);

  for(var s=0; s<grammar.symbolsList.length; ++s) {
    var name = grammar.symbolsList[s];
    var line = document.createElement('span');

    var lhs = document.createElement('span');
    lhs.className = 'cfg-symbol';
    lhs.appendChild(document.createTextNode(name));
    line.appendChild(lhs);
    line.appendChild(document.createTextNode(' \u2192 '));

    var rules = grammar.symbolMap[name].rules;
    for(var r=0; r<rules.length; ++r) {
      if(r > 0) {
        line.appendChild(document.createTextNode(' | '));
      }
      line.appendChild(domProduction(rules[r].production));
    }
    addLine(line);
  }

  return container;
}
295 | 
296 | 
297 | module.exports = {
298 |   subtreePrinter: subtreePrinter,
299 |   rewritePrinter: rewritePrinter,
300 |   astPrinter: astPrinter,
301 |   domPrinter: domPrinter,
302 |   domGrammarPrinter: domGrammarPrinter
303 | }


--------------------------------------------------------------------------------
/test.js:
--------------------------------------------------------------------------------
  1 | var types = require('./types');
  2 | NT = types.NT;
  3 | T = types.T;
  4 | Rule = types.Rule;
  5 | Grammar = types.Grammar;
  6 | var generator = require('./generate');
  7 | var checks = require('./check');
  8 | var assert = require('./assert');
  9 | var parser = require('./parser');
 10 | var subtreePrinter = require('./printers').subtreePrinter;
 11 | var astPrinter = require('./printers').astPrinter;
 12 | 
 13 | 
 14 | 
 15 | 
 16 | // Arithmetic expressions on 0-9 (with precedence). Demonstrates two ways to evaluate a parse.
 17 | 
 18 | var plus = Rule('E', [NT('E'), T('+'), NT('T')]);
 19 | var term = Rule('E', [NT('T')]);
 20 | var times = Rule('T', [NT('T'), T('*'), NT('F')]);
 21 | var factor = Rule('T', [NT('F')]);
 22 | var pos = Rule('F', [NT('P')]);
 23 | var neg = Rule('F', [T('-'), NT('P')]); // JS does not allow --1
 24 | var paren = Rule('P', [T('('), NT('E'), T(')')]);
 25 | var digit = Rule('P', [NT('N')]);
 26 | 
 27 | var mathGrammar = Grammar([
 28 |   plus,
 29 |   term,
 30 |   times,
 31 |   factor,
 32 |   pos,
 33 |   neg,
 34 |   paren,
 35 |   digit,
 36 |   Rule('N', [T('0')]),
 37 |   Rule('N', [T('1')]),
 38 |   Rule('N', [T('2')]),
 39 |   Rule('N', [T('3')]),
 40 |   Rule('N', [T('4')]),
 41 |   Rule('N', [T('5')]),
 42 |   Rule('N', [T('6')]),
 43 |   Rule('N', [T('7')]),
 44 |   Rule('N', [T('8')]),
 45 |   Rule('N', [T('9')])]
 46 | );
 47 | 
 48 | // You can treat the parse tree as a very complex AST and evaluate directly, as follows:
 49 | plus.eval = function(state) { return mathEval(state.backPointers[0]) + mathEval(state.backPointers[2]); }
 50 | times.eval = function(state) { return mathEval(state.backPointers[0]) * mathEval(state.backPointers[2]); }
 51 | neg.eval = function(state) { return -mathEval(state.backPointers[1]); }
 52 | paren.eval = function(state) { return mathEval(state.backPointers[1]); }
 53 | digit.eval = function(state) { return parseInt(state.backPointers[0].rule.production[0].data); }
 54 | 
 55 | function mathEval(state) {
 56 |   if(state.rule.eval) {
 57 |     return state.rule.eval(state);
 58 |   }
 59 |   else {
 60 |     assert(state.rule.production.length == 1, 'No valid evaluation rule.');
 61 |     return mathEval(state.backPointers[0]);
 62 |   }
 63 | }
 64 | 
 65 | // Or you can use the astPrinter to get a sane AST, and then evaluate that.
 66 | function toMathAst(parse) {
 67 |   return astPrinter(parse, true, true, function(rule) { // the function is a map from rules to the name of the corresponding node
 68 |     switch(rule) {
 69 |       case plus:
 70 |         return 'Plus';
 71 |       case times:
 72 |         return 'Times';
 73 |       case neg:
 74 |         return 'Negation';
 75 |       case paren:
 76 |         return 'Paren';
 77 |       default:
 78 |         return 'Unknown';
 79 |     }
 80 |   });
 81 | }
 82 | 
 83 | function mathAstEval(ast) {
 84 |   switch(ast.type) {
 85 |     case 'Plus':
 86 |       return mathAstEval(ast.children[0]) + mathAstEval(ast.children[1]);
 87 |     case 'Times':
 88 |       return mathAstEval(ast.children[0]) * mathAstEval(ast.children[1]);
 89 |     case 'Negation':
 90 |       return -mathAstEval(ast.children[0]);
 91 |     case 'Paren':
 92 |       return mathAstEval(ast.children[0]);
 93 |     case 'Terminal':
 94 |       return +ast.value;
 95 |   }
 96 | }
 97 | 
 98 | 
 99 | var mathGenerator = generator(mathGrammar);
100 | 
101 | console.log('Arithmetic tests:');
102 | for(var i=0; i<10; ++i) {
103 |   var list = i % 2 === 0;
104 |   var expr = mathGenerator(Math.round(Math.random()*40) + 1, { list: list });
105 |   if (list) {
106 |     assert(Array.isArray(expr));
107 |     expr = expr.join('');
108 |   }
109 |   var res = parser.parse(mathGrammar, expr, parser.PRODUCEALL);
110 |   assert(res.length == 1, 'mathGrammar is ambiguous?');
111 |   
112 |   var grammarVal = mathEval(res[0]);
113 |   var jsVal = eval(expr);
114 |   assert(grammarVal === jsVal || (isNaN(grammarVal) && isNaN(jsVal)), 'JS disagrees with our evaluation.');
115 |   
116 |   var ast = toMathAst(res[0]);
117 |   var astVal = mathAstEval(ast);
118 |   assert(jsVal === astVal || (isNaN(jsVal) && isNaN(astVal)), 'JS disagrees with the AST evaluation.');
119 | }
120 | console.log('Passed.');
121 | 
122 | 
123 | 
124 | // The ur-test: generate and test CFGs. BECAUSE I CAN.
125 | // Specifically, for five-or-fewer-symbol CFGs over [x,y,z].
126 | // Only to be used for generation, not parsing (because I don't want to split up the terminal strings)
127 | 
128 | 
129 | var grammarGrammar = Grammar([
130 |   Rule('Grammar', [T('Grammar([\n  '), NT('Rule'), NT('RulesList'), T('\n]);')]),
131 |   Rule('RulesList', [T(',\n  '), NT('Rule'), NT('RulesList')]),
132 |   Rule('RulesList', []),
133 |   Rule('Rule', [T('Rule(\''), NT('NT'), T('\', ['), NT('OptionalSymList'), T('])')]),
134 |   Rule('OptionalSymList', [NT('Sym'), NT('SymList')]),
135 |   Rule('OptionalSymList', []),
136 |   Rule('SymList', [T(', '), NT('Sym'), NT('SymList')]),
137 |   Rule('SymList', []),
138 |   Rule('Sym', [T('T(\''), NT('T'), T('\')')]),
139 |   Rule('Sym', [T('NT(\''), NT('NT'), T('\')')]),
140 |   Rule('T', [T('x')]),
141 |   Rule('T', [T('y')]),
142 |   Rule('T', [T('z')]),
143 |   Rule('NT', [T('A')]),
144 |   Rule('NT', [T('B')]),
145 |   Rule('NT', [T('C')]),
146 |   Rule('NT', [T('D')]),
147 |   Rule('NT', [T('E')])
148 | ]);
149 | 
150 | var ggg = generator(grammarGrammar);
151 | 
// Generate the source text of a random grammar (a random length between 40 and 440)
// and evaluate it into a Grammar object.
function makeGrammar() {
  var targetLength = Math.round(Math.random()*400) + 40;
  var source = ggg(targetLength);
  return eval(source); // eval? yes. eval.
}
157 | 
// Generate five random context-free grammars, and ensure that the set of strings
// each generates appears to be at least a subset of the set of strings each recognizes.
// (Of course, the sets should be identical, but that's harder to test.)
console.log('CFG tests:');
for(var i=0; i<5; ++i) {
  var g = makeGrammar();
  // comparing a grammar against itself: any difference reported here is treated as a failure
  var w = checks.locatableDifference(g, g, 4, 10);
  if(w) {
    console.log(w);
    process.exit(1); // non-zero status, so that `npm test` reports the failure
  }
}
console.log('Passed.');
172 | 
173 | 


--------------------------------------------------------------------------------
/types.js:
--------------------------------------------------------------------------------
  1 | function Sym(type, data) {
  2 |   this.type = type;
  3 |   this.data = data; 
  4 | }
  5 | Sym.prototype.equals = function(other) {
  6 |   return other.type === this.type && other.data === this.data;
  7 | }
  8 | Sym.prototype.toString = function(){ 
  9 |   return this.data.toString(); //return this.type + '(' + this.data + ')';
 10 | }
 11 | 
 12 | function NT(data) { return new Sym('NT', data); }
 13 | function T(data) { return new Sym('T', data); }
 14 | 
 15 | function reprEscape(str) { // does not handle unicode or exceptional cases properly.
 16 |   return str.replace(/['\\]/g, function(c) { return '\\' + c; })
 17 |     .replace(/\n/g, '\\n').replace(/\r/g, '\\r');
 18 | }
 19 | 
 20 | function Rule(name, production) {
 21 |   if(!(this instanceof Rule)) return new Rule(name, production);
 22 |   this.name = name; // LHS
 23 |   this.production = production; // RHS\
 24 | }
 25 | Rule.prototype.equals = function(other) {
 26 |   if(other.name !== this.name) return false;
 27 |   if(other.production.length !== this.production.length) return false;
 28 |   
 29 |   for(var i=0; i<other.production.length; ++i) {
 30 |     if(!other.production[i].equals(this.production[i])) return false;
 31 |   }
 32 |   return true;
 33 | }
 34 | Rule.prototype.toString = function() {
 35 |   return this.name + ' -> ' + this.production.join('');
 36 | }
 37 | Rule.prototype.repr = function() {
 38 |   var out = 'Rule(\'' + reprEscape(this.name) + '\', [';
 39 |   for(var i=0; i<this.production.length; ++i) {
 40 |     if(i>0) out += ', ';
 41 |     out += this.production[i].type + '(\'' + reprEscape(this.production[i].data) + '\')';
 42 |   }
 43 |   out += '])';
 44 |   return out;
 45 | }
 46 | 
 47 | 
 48 | 
 49 | 
 50 | function Grammar(rules, start) { // if not given, start is LHS of the first rule.
 51 |   if(!(this instanceof Grammar)) return new Grammar(rules, start);
 52 |   this.rules = rules;
 53 |   this.start = start || rules[0].name; // TODO warn
 54 |   this.symbolMap = {}; // initially just rules for each symbol; eventually can contain annotations like 'nullable'
 55 |   this.symbolsList = start?[start]:[];
 56 |   
 57 |   if(start) this.symbolMap[start] = {rules: []};
 58 |   
 59 |   for(var i=0; i<this.rules.length; ++i) {
 60 |     var sym = this.rules[i].name;
 61 |     if(!(sym in this.symbolMap)) {
 62 |       this.symbolMap[sym] = {rules: []};
 63 |       this.symbolsList.push(sym);
 64 |     }
 65 |     
 66 |     for(var j=0; j<this.rules[i].production.length; ++j) {
 67 |       var rhsSym = this.rules[i].production[j];
 68 |       if(rhsSym.type == 'NT' && !(rhsSym.data in this.symbolMap)) {
 69 |         this.symbolMap[rhsSym.data] = {rules: []};
 70 |         this.symbolsList.push(rhsSym.data);
 71 |       }
 72 |     }
 73 |     this.symbolMap[sym].rules.push(this.rules[i]);
 74 |   }
 75 | }
 76 | Grammar.prototype.repr = function() {
 77 |   var out = 'Grammar([\n  ';
 78 |   for(var i=0; i<this.rules.length; ++i) {
 79 |     if(i>0) out += ',\n  ';
 80 |     out += this.rules[i].repr();
 81 |   }
 82 |   out += '\n], \'' + reprEscape(this.start) + '\')';
 83 |   return out;
 84 | }
 85 | 
 86 | 
 87 | // get a map from symbols to a list of the rules they appear in the RHS of
 88 | // if a symbol appears in a RHS more than once, that rule will appear more than once in the list
 89 | // modifies the grammar to have _reverseMap property, for caching
 90 | Grammar.prototype.getReverseMap = function() {
 91 |   if(!this.hasOwnProperty('_reverseMap')) {
 92 |     this._reverseMap = {};
 93 |     for(var i=0; i<this.symbolsList.length; ++i) {
 94 |       this._reverseMap[this.symbolsList[i]] = [];
 95 |     }
 96 |     for(var i=0; i<this.rules.length; ++i) {
 97 |       var rule = this.rules[i];
 98 |       for(var j=0; j<rule.production.length; ++j) {
 99 |         if(rule.production[j].type === 'NT') {
100 |           this._reverseMap[rule.production[j].data].push(rule);
101 |         }
102 |       }
103 |     }
104 |   }
105 |   
106 |   return this._reverseMap;
107 | }
108 | 
109 | 
110 | 
// Public API of this module: the symbol type, its two factories, and the
// rule and grammar constructors.
var api = {};
api.Sym = Sym;
api.NT = NT;
api.T = T;
api.Rule = Rule;
api.Grammar = Grammar;
module.exports = api;


--------------------------------------------------------------------------------