├── .editorconfig ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── grammars ├── adder.grammar ├── arithmetic.grammar ├── derive_lambda.grammar ├── empty_production.grammar ├── match.grammar ├── modern_compiler_design.grammar ├── optional.grammar ├── reduce_reduce.grammar ├── shift_reduce.grammar ├── skip.grammar └── some_derive_lambda.grammar ├── pegasus-sem.grammar ├── pegasus.grammar ├── semantics └── adder.sem ├── shard.yml ├── spec ├── automaton_spec.cr ├── dfa_spec.cr ├── language_spec.cr ├── nfa_spec.cr ├── pda_spec.cr ├── spec_helper.cr └── spec_utils.cr └── src ├── generators ├── c-common │ ├── standard_header.h │ ├── standard_source.c │ ├── tables.cr │ └── tables.ecr ├── c │ ├── pegasus_c.cr │ ├── pegasus_c_header_template.ecr │ ├── pegasus_c_template.ecr │ ├── tree_header.h │ └── tree_source.c ├── crystal-common │ ├── tables.cr │ └── tables.ecr ├── crystal │ ├── pegasus_crystal.cr │ └── pegasus_crystal_template.ecr ├── crystalsem │ ├── pegasus_crystal_template.ecr │ └── pegasus_crystalsem.cr ├── csem │ ├── pegasus_c_header_template.ecr │ ├── pegasus_c_template.ecr │ ├── pegasus_csem.cr │ ├── sem_header.h │ └── sem_source.c └── generators.cr ├── pegasus.cr ├── pegasus ├── automaton.cr ├── dfa.cr ├── elements.cr ├── error.cr ├── generated │ ├── grammar_parser.cr │ └── semantics_parser.cr ├── grammar.cr ├── items.cr ├── json.cr ├── language_def.cr ├── nfa.cr ├── nfa_to_dfa.cr ├── pda.cr ├── regex.cr ├── semantics.cr └── table.cr └── tools ├── dot └── pegasus_dot.cr └── sim └── pegasus_sim.cr /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*.cr] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | indent_style = space 8 | indent_size = 2 9 | trim_trailing_whitespace = true 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 
| /docs/ 2 | /lib/ 3 | /bin/ 4 | /.shards/ 5 | *.dwarf 6 | 7 | # Libraries don't need dependency lock 8 | # Dependencies will be locked in application that uses them 9 | /shard.lock 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: crystal 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Danila Fedorin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pegasus 2 | A parser generator based on Crystal and the UNIX philosophy. 
It is language agnostic, but can 3 | currently generate parsers for the [C](#c-output) and [Crystal](#crystal-output) languages. 4 | 5 | _Warning: Pegasus is experimental. Its APIs are not yet solidified, and are subject to change at any time._ 6 | 7 | ## Table of Contents 8 | * [Architecture](#architecture) 9 | * [Usage](#usage) 10 | * [Tokens](#tokens) 11 | * [Rules](#rules) 12 | * [A Note on Parse Trees](#a-note-on-parse-trees) 13 | * [Regular Expressions](#regular-expressions) 14 | * [Included Programs](#included-programs) 15 | * [Options](#options) 16 | * [Semantic Actions](#semantic-actions) 17 | * [C Output](#c-output) 18 | * [C Output With Semantic Actions](#c-output-with-semantic-actions) 19 | * [Crystal Output](#crystal-output) 20 | * [Crystal Output With Semantic Actions](#crystal-output-with-semantic-actions) 21 | * [JSON Format](#json-format) 22 | 23 | ## Architecture 24 | Pegasus is based on the UNIX philosophy of doing one thing, and doing it well. 25 | The core pegasus program isn't as much a parser generator as it is a Push Down 26 | Automaton generator. 27 | 28 | Pegasus reads the grammar files, creates a Deterministic Finite Automaton (DFA) that is then used to tokenize (lex) text. Then, it creates an 29 | LALR(1) Push Down Automaton that is then used to parse text. However, it doesn't actually generate a parser: it outputs the generated tables for both automatons, 30 | as well as some extra information, as JSON. Another program, specific to each 31 | language, then reads the JSON and is responsible for code output. 32 | 33 | This is beneficial because this prevents the code generator from being dependent on a language. JSON is a data interchange format, and it is easily readable from almost any programming language. If I, or others, want to add a code generation target, they can just parse the JSON in their preferred language, rather than Crystal. 
An additional benefit is that the addition of a target doesn't require the pegasus core to be updated or recompiled. 34 | ## Usage 35 | Pegasus parses grammars written in very basic notation. The grammars are separated into two 36 | sections: the __tokens__ and the __rules__. 37 | ### Tokens 38 | The tokens are terminals, and are described using 39 | regular expressions. An example token declaration is as follows: 40 | ``` 41 | token hello = /hello/; 42 | ``` 43 | Notice that the token declaration is terminated by a semicolon. Also notice that the regular expression is marked at both ends by a forward slash, `/`. In order to write a regular expression that includes a forward slash, it needs to be escaped, like `\/`. More information on regular expressions accepted by Pegasus can be found below. 44 | ### Rules 45 | Grammar rules appear after tokens in the grammar file. An example rule is given as follows: 46 | ``` 47 | rule S = hello; 48 | ``` 49 | This rule uses the token we declared above, that is, `hello`, which matches the string hello. 50 | In order to expect multiple tokens, we simply write them one after another: 51 | ``` 52 | rule S = hello hello; 53 | ``` 54 | Grammar rules aren't limited to only tokens. The names of other grammar rules, declared either earlier or later in the file, can also be used. For example: 55 | ``` 56 | rule S = two_hello hello; 57 | rule two_hello = hello hello; 58 | ``` 59 | Here, we declare a second rule, `two_hello`, and then use it in the `S` rule. 60 | 61 | Sometimes, it's useful to be able to declare several alternatives for rule. For example, we want to have an "operand" rule in a basic calculator, and an operand can either be a variable like "x" or a number like "3". 
We can write a rule as follows: 62 | ``` 63 | rule operand = number | variable; 64 | ``` 65 | ### A Note on Parse Trees 66 | Earlier, we saw two rules written as follows: 67 | ``` 68 | rule S = two_hello hello; 69 | rule two_hello = hello hello; 70 | ``` 71 | While it accepts the same language, this is __not__ equivalent to the following: 72 | ``` 73 | rule S = hello hello hello; 74 | ``` 75 | The reason is that Pegasus, by default, produces parse trees. The first grammar will produce 76 | a parse tree whose root node, `S`, has two children, one being `two_hello` and the other being `hello`. The `two_hello` node will have two child nodes, both `hello`. However, the second variant will produce a parse tree whose root node, `S`, has three children, all `hello`. 77 | ### Regular Expressions 78 | Regular 79 | expressions support some basic operators: 80 | * `hey+` matches `hey`, `heyy`, `heyyy`, and so on. 81 | * `hey?` matches `hey` or `he` 82 | * `hey*` matches `he`, `hey`, `heyy`, and so on. 83 | 84 | Operators can also be applied to groups of characters: 85 | * `(ab)+` matches `ab`, `abab`, `ababab`, and so on. 86 | 87 | Please note, however, that Pegasus's lexer does not capture groups. 88 | ### Options 89 | Pegasus supports an experimental mechanism to aid in parser generation, which involves attaching options 90 | to tokens or rules. Right now, the only option that is recognized is attached to a token definition. This option is "skip". 91 | Options are delcared as such: 92 | ``` 93 | token space = / +/ [ skip ]; 94 | ``` 95 | The skip option means that the token it's attached to, in this case `space`, will be immediately discarded, and parsing will go on 96 | as if it wasn't there. 
For example, if we want a whitespace-insensitive list of digits, we can write it as such: 97 | ``` 98 | token space = / +/ [ skip ]; 99 | token digit = /[0-9]/; 100 | token list_start = /\[/; 101 | token list_end = /\]/; 102 | token comma = /,/; 103 | 104 | rule list = list_start list_recursive list_end; 105 | rule list_recursive = digit | digit comma list_recursive; 106 | ``` 107 | Now, this will be able to parse equivalently the strings "[3]", "[ 3 ]" and [ 3]", because the whitespace token is ignored. 108 | ### Semantic Actions 109 | It's certainly convenient to create a parse tree that perfectly mimics the structure of a language's grammar. However, this isn't always desirable - if the user desires to construct an Abstract Syntax Tree, they're left having to walk the structure of the resulting tree _again_, frequently checking what rule created a particular nonterminal, or how many children a root node has. This is less than ideal - we don't want to duplicate the work of specifying the grammar when we walk the trees. Furthermore, if the grammar changes, the code that walks the parse trees will certainly need to change. 110 | 111 | To remedy this, I've been toying with the idea of including _semantic actions_ into Pegasus, in a very similar way to Yacc / Bison. Semantic actions are pieces of code that run when a particular rule in the grammar is matched. However, this would mean that the user has to write these actions in some particular language (Yacc / Bison use C/C++). Since Pegasus aims to be language agnostic, writing code in a particular language in the main grammar file is undesirable. Thus, I chose the approach of separating semantic actions into a separate file format. The format uses `$$` to delimit code blocks, and contains the following sections: 112 | 113 | * Types that various nonterminals are assigned. For instance, a boolean expression can be assigned the C++ type "bool". 114 | * The actual rules that are of each of the types declared above. 
115 | * The init code (placed in a global context before the parsing function) 116 | * The semantic actions for each rule. 117 | 118 | For a concrete example of this file format, see the example code in the [C Output With Semantic Actions](#c-output-with-semantic-actions) section. 119 | 120 | ### Included programs 121 | Before you use any of these programs, you should use 122 | ``` 123 | shards build --release 124 | ``` 125 | This will compile all the Pegasus programs in release mode, 126 | for optimal performance. 127 | #### `pegasus` 128 | This program reads grammars from standard input, and generates 129 | JSON descriptions out LALR automata, 130 | which will be read by the other programs. For example: 131 | ```Bash 132 | echo 'token hello = /Hello, world!/; rule S = hello;' > test.grammar 133 | ./bin/pegasus < test.grammar 134 | ``` 135 | This prints the JSON to the command line. If you'd like to output 136 | JSON to a file, you can use: 137 | ```Bash 138 | ./bin/pegasus < test.grammar > test.json 139 | ``` 140 | #### `pegasus-dot` 141 | This program is used largely for debugging purpose, and generates GraphViz 142 | DOT output, which can then by converted by the `dot` program into images. 143 | This greatly helps with debugging generated automata. `pegasus-dot` simply 144 | reads the generated JSON file: 145 | ```Bash 146 | ./bin/pegasus-dot < test.json 147 | ``` 148 | To generate a PNG from the DOT output, you need the `dot` program installed. 149 | Once you have that, you can just pipe the output of `pegasus-dot` into `dot`: 150 | ```Bash 151 | ./bin/pegasus-dot < test.json | dot -Tpng -o visual.png 152 | ``` 153 | #### `pegasus-sim` 154 | This is another program largely used for debugging. Instead of generating 155 | a parser, it reads a JSON file, then attempts to parse text from STDIN. 156 | Once it's done, it prints the result of its attempt. 
Note that because 157 | it reads input from STDIN, rather than JSON, the JSON 158 | file has to be given as a command-line argument: 159 | ```Bash 160 | echo 'Hello, world!' | ./bin/pegasus-sim -i test.json 161 | ``` 162 | 163 | #### `pegasus-c` 164 | Finally, a parser generator! `pegasus-c` takes JSON, and creates C 165 | header and source files that can then be integrated into your project. 166 | To learn how to use the generated code, please take a look at the 167 | [C output](#c-output) section. 168 | ```Bash 169 | ./bin/pegasus-c < test.json 170 | ``` 171 | 172 | #### `pegasus-crystal` 173 | Another parser generator. `pegasus-crystal` outputs Crystal code 174 | which can then be integrated into your project. 175 | To learn how to use the generated code, lease take a look at the 176 | [Crystal output](#crystal-output) section. 177 | ```Bash 178 | ./bin/pegasus-crystal < test.json 179 | ``` 180 | 181 | #### `pegasus-csem` 182 | Another C parser generator. The difference between this parser generator and `pegasus-c` is that it uses a separate semantic actions file to mimic the functionality of Yacc/Bison. This means you can specify C code that runs when each rule in the grammar is matched. To learn how to use this parser generator, see the [C Output With Semantic Actions](#c-output-with-semantic-actions) section. 183 | ``` 184 | ./bin/pegasus-csem -l test.json -a test.sem 185 | ``` 186 | 187 | ## C Output 188 | The pegasus repository contains the source code of a program that converts the JSON output into C source code. It generates a derivation tree, stored in `pgs_tree`, which is made up of nonterminal parent nodes and terminal leaves. Below is a simple example of using the functions generated for a grammar that describes the language of a binary operation applied to two numbers. 
189 | The grammar: 190 | ``` 191 | token op_add = /\+/; 192 | token op_sub = /-/; 193 | token op_mul = /\*/; 194 | token op_div = /\//; 195 | token number = /[0-9]/; 196 | 197 | rule S = expr; 198 | rule expr = number op number; 199 | rule op = op_add | op_sub | op_div | op_mul; 200 | ``` 201 | _note: backslashes are necessary in the regular expressions because `+` and `*` are operators in the regular expression language._ 202 | 203 | The code for the API: 204 | ```C 205 | /* Include the generated header file */ 206 | #include "parser.h" 207 | #include 208 | 209 | int main(int argc, char** argv) { 210 | pgs_state state; /* The state is used for reporting error messages.*/ 211 | pgs_tree* tree; /* The tree that will be initialized */ 212 | char buffer[256]; /* Buffer for string input */ 213 | 214 | gets(buffer); /* Unsafe function for the sake of example */ 215 | /* pgs_do_all lexes and parses the text from the buffer. */ 216 | if(pgs_do_all(&state, &tree, buffer)) { 217 | /* A nonzero return code indicates error. Print it.*/ 218 | printf("Error: %s\n", state.errbuff); 219 | } else { 220 | /* Do nothing, free the tree. */ 221 | /* Tree is not initialized unless parse succeeds. */ 222 | pgs_free_tree(tree); 223 | } 224 | } 225 | ``` 226 | This example is boring because nothing is done with the tree. Let's walk the tree and print it out: 227 | ```C 228 | void print_tree(pgs_tree* tree, const char* source, int indent) { 229 | size_t i; 230 | /* Print an indent. */ 231 | for(i = 0; i < indent; i++) printf(" "); 232 | /* If the tree is a terminal (actual token) */ 233 | if(tree->variant == PGS_TREE_TERMINAL) { 234 | printf("Terminal: %.*s\n", (int) (PGS_TREE_T_TO(*tree) - PGS_TREE_T_FROM(*tree)), 235 | source + PGS_TREE_T_FROM(*tree)); 236 | } else { 237 | /* PGS_TREE_NT gives the nonterminal ID from the given tree. 
*/ 238 | printf("Nonterminal: %s\n", pgs_nonterminal_name(PGS_TREE_NT(*tree))); 239 | /* PGS_TREE_NT_COUNT returns the number of children a nonterminal 240 | node has. */ 241 | for(i = 0; i < PGS_TREE_NT_COUNT(*tree); i++) { 242 | /* PGS_TREE_NT_CHILD gets the nth child of a nonterminal tree. */ 243 | print_tree(PGS_TREE_NT_CHILD(*tree, i), source, indent + 1); 244 | } 245 | } 246 | } 247 | ``` 248 | For the input string `3+3`, the program will output: 249 | ``` 250 | Nonterminal: S 251 | Nonterminal: expr 252 | Nonterminal: number 253 | Terminal: 3 254 | Nonterminal: op 255 | Terminal: + 256 | Nonterminal: number 257 | Terminal: 3 258 | ``` 259 | Some more useful C macros for accessing the trees can be found in `parser.h` 260 | 261 | ## C Output With Semantic Actions 262 | Say you don't need a parse tree. Instead, you want to construct your own values from Pegasus grammar rules. In this case, you want to use the `pegasus-csem` parser generator. It is best demonstrated using a small example. Let's consider a language of booleans: 263 | ``` 264 | token whitespace = /[ \n\t]+/ [ skip ]; 265 | token true = /true/; 266 | token false = /false/; 267 | token and = /and/; 268 | token or = /or/; 269 | 270 | rule S = expr; 271 | rule expr = tkn | expr and tkn | expr or tkn; 272 | rule tkn = true | false; 273 | ``` 274 | Easy enough. But why would we want a parse tree from this? Let's operate directly on booleans (which we'll represent as integers in C). We create the semantic actions file step by step. First, we know all our actions will produce integers (which represent booleans). So we create a boolean type: 275 | ``` 276 | type boolean = $$ int $$ 277 | ``` 278 | Now, we want to assign this type to the nonterminals in our language. 
We do this as follows: 279 | ``` 280 | typerules boolean = [ S, expr, tkn ] 281 | ``` 282 | We don't need any global variables or functions, so we can just leave the `init` block blank: 283 | ``` 284 | init = $$ $$ 285 | ``` 286 | Next, we write actions that correspond to each gramamr rule. 287 | ``` 288 | rule S(0) = $$ $out = $0; $$ 289 | ``` 290 | `$out` is the "output" variable, and `$0` is the value generated for the first terminal or nonterminal in the rule (in this case, `expr`). This rule just forwards the result of the rules for `expr`. Next, let's write rules for `expr`: 291 | ``` 292 | rule expr(0) = $$ $out = $0; $$ 293 | rule expr(1) = $$ $out = $0 & $2; $$ 294 | rule expr(2) = $$ $out = $0 | $2; $$ 295 | ``` 296 | The first rule simply forwards the value generated for `tkn`. The other two rules combine the results of their subexpressions using `&` and `|` (we use `&` in the grammar rule that has the `and` token, and `|` in the grammar rule that has the `or` token). Finally, we write the rules for `tkn`: 297 | ``` 298 | rule tkn(0) = $$ $out = 1; $$ 299 | rule tkn(1) = $$ $out = 0; $$ 300 | ``` 301 | Time to test this. We need to write a simple program that uses the parser. The main difference from the C output without semantic actions is that we use `pgs_stack_value` union type, with fields named after the types we registered (`boolean`, in this case). The code: 302 | ```C 303 | #include "parser.h" 304 | 305 | int main() { 306 | pgs_stack_value v; /* Temporary variable into which to store the result */ 307 | pgs_state s; /* The state used for reporting error message */ 308 | 309 | /* Initialize the state */ 310 | pgs_state_init(&s); 311 | /* Tokenize and parse a hardcoded string, ignoring error code */ 312 | pgs_do_all(&s, &v, "false or false or true"); 313 | /* Print the error generated, if any */ 314 | printf("%s\n", s.errbuff); 315 | /* Print the boolean value as an integer. 
*/ 316 | printf("%d\n", v.boolean); 317 | } 318 | ``` 319 | The output is the result of evaluating our expression: "true", or 1: 320 | ``` 321 | 322 | 1 323 | ``` 324 | 325 | ## Crystal Output 326 | Just like with C, this repository contains a program to output Crystal when code given a JSON file. 327 | Because Crystal supports exceptions and garbage collection, there is no need to initialize 328 | any variables, or call corresponding `free` functions. The most basic example of reading 329 | a line from the standard input and parsing it is below: 330 | ```Crystal 331 | require "./parser.cr" 332 | 333 | Pegasus::Generated.process(STDIN.gets.not_nil!) 334 | ``` 335 | Of course, this isn't particularly interesting. Let's add a basic function to print the tree: 336 | ```Crystal 337 | def print_tree(tree, indent = 0) 338 | indent.times { STDOUT << " " } 339 | case tree 340 | when Pegasus::Generated::TerminalTree 341 | STDOUT << "Terminal: " 342 | STDOUT.puts tree.string 343 | when Pegasus::Generated::NonterminalTree 344 | STDOUT << "Nonterminal: " << tree.name 345 | STDOUT.puts 346 | tree.children.each { |it| print_tree(it, indent + 1) } 347 | end 348 | end 349 | ``` 350 | For the input string `3+3`, the program will output: 351 | ``` 352 | Nonterminal: S 353 | Nonterminal: expr 354 | Nonterminal: number 355 | Terminal: 3 356 | Nonterminal: op 357 | Terminal: + 358 | Nonterminal: number 359 | Terminal: 3 360 | ``` 361 | 362 | ## Crystal Output with Semantic Actions 363 | This is just like C semantic actions, but with Crystal. Suppose you don't need 364 | a parse tree. Rather, you want to generate your own values from Pegasus grammar 365 | rules. You can do this with the `pegasus-crystalsem` parser generator. When 366 | using this generator, you specify an additional file, which associates Crystal 367 | code (_semantic actions_) with each rule. 
Let's consider a language 368 | of booleans: 369 | ``` 370 | token whitespace = /[ \n\t]+/ [ skip ]; 371 | token true = /true/; 372 | token false = /false/; 373 | token and = /and/; 374 | token or = /or/; 375 | 376 | rule S = expr; 377 | rule expr = tkn | expr and tkn | expr or tkn; 378 | rule tkn = true | false; 379 | ``` 380 | Now that we have our grammar, it's time to formulate the additional file 381 | we mentioned. The first thing we need to do is figure out what Crystal 382 | type each of the nonterminals we generate. Our language is that 383 | of booleans, so we will be needing a boolean type: 384 | ``` 385 | type boolean = $$ Bool $$ 386 | ``` 387 | Here, the stuff inside the `$$` is Crystal code that is pasted verbatim into the 388 | generated parser. Now, we want to specify which rules evaluate to that type. 389 | In our simple language, every rule evaluates to a boolean: 390 | ``` 391 | typerules boolean = [ S, expr, tkn ] 392 | ``` 393 | `pegasus-crystalsem` also allows you to put some code above the parsing code, 394 | globally. We don't use this, so we leave the `init` property blank: 395 | ``` 396 | init = $$ $$ 397 | ``` 398 | It is now time to assign semantic Crystal actions to each grammar rule. We 399 | start with the first rule, `S(0)` (which means the first rule for the 400 | `S` nonterminal). Since the first rule just matches an `expr`, we 401 | simply output the value of that `expr`: 402 | ``` 403 | rule S(0) = $$ $out = $0 $$ 404 | ``` 405 | This means "set the output to be the value of the first element in the rule's body". 406 | We now implement the actual rules for `expr`. The first rule simply forwards 407 | the result of the `tkn`, just like the rule for `S`. 
The other two rules actually 408 | implement the logical operations of `&` and `|`: 409 | ``` 410 | rule expr(0) = $$ $out = $0 $$ 411 | rule expr(1) = $$ $out = $0 & $2 $$ 412 | rule expr(2) = $$ $out = $0 | $2 $$ 413 | ``` 414 | Finally, we use the two rules for `tkn` to actually return a boolean: 415 | ``` 416 | rule tkn(0) = $$ $out = true $$ 417 | rule tkn(1) = $$ $out = false $$ 418 | ``` 419 | Let's test this. We include the generated parser, and write the following: 420 | ```Crystal 421 | require "./parser.cr" 422 | 423 | puts Pegasus::Generated.process(gets.not_nil!) 424 | ``` 425 | Let's now run this with the expression `true or false or true`. The output: 426 | ``` 427 | true 428 | ``` 429 | That's indeed our answer! 430 | 431 | ## JSON Format 432 | For the grammar given by: 433 | ``` 434 | token hi = /hi/; 435 | rule A = hi; 436 | ``` 437 | The corresponding (pretty-printed) JSON output is: 438 | ``` 439 | { 440 | "lex_state_table":[[..]..], 441 | "lex_final_table”:[..], 442 | "parse_state_table":[[..]..], 443 | "parse_action_table":[[..]..], 444 | "terminals":{ 445 | "hi":{ 446 | "terminal_id":0 447 | } 448 | }, 449 | "nonterminals":{ 450 | "A":{ 451 | "nonterminal_id":0 452 | } 453 | }, 454 | "items":[ 455 | { 456 | "head":{ 457 | "nonterminal_id":0 458 | }, 459 | "body":[ 460 | { 461 | "terminal_id":0 462 | } 463 | ] 464 | } 465 | ], 466 | "max_terminal":0 467 | } 468 | ``` 469 | ## Contributors 470 | 471 | - [DanilaFe](https://github.com/DanilaFe) Danila Fedorin - creator, maintainer 472 | -------------------------------------------------------------------------------- /grammars/adder.grammar: -------------------------------------------------------------------------------- 1 | token add = /\+/; 2 | token number = /[1-9][0-9]*/; 3 | 4 | rule S = add_expr; 5 | rule add_expr = add_expr add number | number; 6 | -------------------------------------------------------------------------------- /grammars/arithmetic.grammar: 
-------------------------------------------------------------------------------- 1 | token add = /\+/; 2 | token sub = /-/; 3 | token mul = /\*/; 4 | token div = /\//; 5 | token open_parenth = /\(/; 6 | token close_parenth = /\)/; 7 | token number = /[1-9][0-9]*/; 8 | 9 | rule S = add_expr; 10 | rule add_expr = add_expr add_op mul_expr | mul_expr; 11 | rule mul_expr = mul_expr mul_op atom | atom; 12 | rule atom = open_parenth add_expr close_parenth | number; 13 | rule add_op = add | sub; 14 | rule mul_op = div | mul; 15 | -------------------------------------------------------------------------------- /grammars/derive_lambda.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | token goodbye = /goodbye/; 3 | rule S = A B; 4 | rule A = hello?; 5 | rule B = goodbye?; 6 | -------------------------------------------------------------------------------- /grammars/empty_production.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | rule S = hello?; 3 | -------------------------------------------------------------------------------- /grammars/match.grammar: -------------------------------------------------------------------------------- 1 | token open_parenth = /\(/; 2 | token close_parenth = /\)/; 3 | token matched_parenth = /\(\)/; 4 | token open_square = /\[/; 5 | token close_square = /\]/; 6 | token matched_square = /\[\]/; 7 | token open_curly = /{/; 8 | token close_curly = /}/; 9 | token matched_curly = /{}/; 10 | 11 | rule S = any; 12 | rule any = parenths any 13 | | square_brackets any 14 | | brackets any 15 | | parenths 16 | | square_brackets 17 | | brackets; 18 | rule parenths = open_parenth any close_parenth | matched_parenth; 19 | rule square_brackets = open_square any close_square | matched_square; 20 | rule brackets = open_curly any close_curly | matched_curly; 21 | 
-------------------------------------------------------------------------------- /grammars/modern_compiler_design.grammar: -------------------------------------------------------------------------------- 1 | token x = /x/; 2 | token b = /b/; 3 | token a = /a/; 4 | 5 | rule S = A | x b; 6 | rule A = a A b | B; 7 | rule B = x; 8 | -------------------------------------------------------------------------------- /grammars/optional.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | rule S = hello? hello; 3 | -------------------------------------------------------------------------------- /grammars/reduce_reduce.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | rule S = A | B; 3 | rule A = hello; 4 | rule B = hello; 5 | -------------------------------------------------------------------------------- /grammars/shift_reduce.grammar: -------------------------------------------------------------------------------- 1 | token world = /world/; 2 | token hello = /hello/; 3 | 4 | rule S = A | B world; 5 | rule A = hello world; 6 | rule B = hello; 7 | -------------------------------------------------------------------------------- /grammars/skip.grammar: -------------------------------------------------------------------------------- 1 | token whitespace = /[ \n\t]+/ [ skip ]; 2 | token hello = /hello/; 3 | rule S = hello hello; 4 | -------------------------------------------------------------------------------- /grammars/some_derive_lambda.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | token goodbye = /goodbye/; 3 | rule S = A B hello; 4 | rule A = hello?; 5 | rule B = goodbye?; 6 | -------------------------------------------------------------------------------- /pegasus-sem.grammar: -------------------------------------------------------------------------------- 1 | 
token whitespace = /([ \t]|\r?\n)+/ [ skip ]; 2 | token identifier = /[a-zA-Z_\-]+/; 3 | token integer = /[0-9]+/; 4 | token code = /$$([^$]|$[^$])*$$/; 5 | token keyword_type = /type/; 6 | token keyword_typerules = /typerules/; 7 | token keyword_state = /state/; 8 | token keyword_init = /init/; 9 | token keyword_rule = /rule/; 10 | token eq = /=/; 11 | token oparenth = /\(/; 12 | token cparenth = /\)/; 13 | token obracket = /\[/; 14 | token cbracket = /\]/; 15 | token comma = /,/; 16 | 17 | rule S = type_list typerules_list init_decl rule_list; 18 | 19 | rule type_list = type_decl type_list?; 20 | rule type_decl = keyword_type identifier eq code; 21 | 22 | rule typerules_list = typerules_decl typerules_list?; 23 | rule typerules_decl = keyword_typerules identifier eq obracket identifier_list cbracket; 24 | rule identifier_list = identifier | identifier comma identifier_list; 25 | 26 | rule init_decl = keyword_init eq code; 27 | 28 | rule rule_list = rule_decl rule_list?; 29 | rule rule_decl = keyword_rule identifier oparenth integer cparenth eq code; 30 | -------------------------------------------------------------------------------- /pegasus.grammar: -------------------------------------------------------------------------------- 1 | token whitespace = /([ \t]|\r?\n)+/ [ skip ]; 2 | token identifier = /[a-zA-Z_\-]+/; 3 | token keyword_token = /token/; 4 | token keyword_rule = /rule/; 5 | token equals_delimiter = /=/; 6 | token semicolon_delimiter = /;/; 7 | token or_delimiter = /\|/; 8 | token regex = /\/([^\/]|\\.)*\//; 9 | token open_square = /\[/; 10 | token closed_square = /\]/; 11 | token open_parenth = /\(/; 12 | token closed_parenth = /\)/; 13 | token comma = /,/; 14 | token optional = /\?/; 15 | 16 | rule S = token_list grammar_list | token_list | grammar_list; 17 | rule token_list = token_def | token_def token_list; 18 | rule token_def = keyword_token identifier equals_delimiter regex statement_end; 19 | rule grammar_list = grammar_rule | grammar_rule 
grammar_list; 20 | rule grammar_rule = keyword_rule identifier equals_delimiter grammar_bodies statement_end; 21 | rule grammar_bodies = grammar_body | grammar_body or_delimiter grammar_bodies; 22 | rule grammar_body = grammar_element | grammar_element grammar_body; 23 | rule grammar_element = identifier | identifier optional; 24 | rule statement_end = options semicolon_delimiter | semicolon_delimiter; 25 | rule options = open_square options_list closed_square; 26 | rule options_list = option | option comma options_list; 27 | rule option = identifier; 28 | -------------------------------------------------------------------------------- /semantics/adder.sem: -------------------------------------------------------------------------------- 1 | type integer = $$ int $$ 2 | 3 | typerules integer = [S, add_expr] 4 | 5 | init = $$ $$ 6 | 7 | rule S(0) = $$ $out = $0; $$ 8 | rule add_expr(0) = $$ $out = $0 + atoi(src + $2->from); $$ 9 | rule add_expr(1) = $$ $out = atoi(src + $0->from); $$ 10 | -------------------------------------------------------------------------------- /shard.yml: -------------------------------------------------------------------------------- 1 | name: pegasus 2 | version: 0.1.1 3 | 4 | authors: 5 | - Danila Fedorin 6 | 7 | targets: 8 | pegasus: 9 | main: src/pegasus.cr 10 | pegasus-dot: 11 | main: src/tools/dot/pegasus_dot.cr 12 | pegasus-sim: 13 | main: src/tools/sim/pegasus_sim.cr 14 | pegasus-c: 15 | main: src/generators/c/pegasus_c.cr 16 | pegasus-csem: 17 | main: src/generators/csem/pegasus_csem.cr 18 | pegasus-crystal: 19 | main: src/generators/crystal/pegasus_crystal.cr 20 | pegasus-crystalsem: 21 | main: src/generators/crystalsem/pegasus_crystalsem.cr 22 | 23 | crystal: 1.14.0 24 | 25 | license: MIT 26 | -------------------------------------------------------------------------------- /spec/automaton_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe 
Pegasus::Automata::Automaton do 4 | describe "#initialize" do 5 | it "Starts at state 0" do 6 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 7 | automaton.last_id.should eq 0 8 | end 9 | 10 | it "Doesn't add any states" do 11 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 12 | automaton.states.size.should eq 0 13 | end 14 | 15 | it "Starts with a nil start state" do 16 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 17 | automaton.start.should be_nil 18 | end 19 | end 20 | 21 | describe "#state_for" do 22 | it "Increments the state ID after every created state" do 23 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 24 | automaton.state_for(data: 3).id.should eq 0 25 | automaton.state_for(data: 3).id.should eq 1 26 | automaton.state_for(data: 4).id.should eq 2 27 | end 28 | 29 | it "Creates a state with the correct data" do 30 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 31 | automaton.state_for(data: 3).data.should eq 3 32 | automaton.state_for(data: 3).data.should eq 3 33 | automaton.state_for(data: 4).data.should eq 4 34 | end 35 | 36 | it "Adds the state to its internal list" do 37 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 38 | state_one = automaton.state_for(data: 1) 39 | state_two = automaton.state_for(data: 2) 40 | state_three = automaton.state_for(data: 3) 41 | 42 | automaton.states.should contain state_one 43 | automaton.states.should contain state_two 44 | automaton.states.should contain state_three 45 | end 46 | end 47 | end 48 | 49 | describe Pegasus::Automata::UniqueAutomaton do 50 | describe "#initialize" do 51 | it "Has no state memorized" do 52 | automaton = Pegasus::Automata::UniqueAutomaton(Int32, Int32).new 53 | automaton.@memorized.size.should eq 0 54 | end 55 | end 56 | 57 | describe "#state_for" do 58 | it "Doesn't create states with duplicate values" do 59 | automaton = Pegasus::Automata::UniqueAutomaton(Int32, Int32).new 60 | automaton.state_for(data: 
3).id.should eq 0 61 | automaton.state_for(data: 3).id.should eq 0 62 | automaton.state_for(data: 4).id.should eq 1 63 | end 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /spec/dfa_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe Pegasus::Dfa do 4 | describe "#final_table" do 5 | it "Creates a two-entry table when there are no expression" do 6 | nfa = Pegasus::Nfa::Nfa.new 7 | dfa = nfa.dfa 8 | table = dfa.final_table 9 | table.size.should eq 2 10 | table[0].should eq 0 11 | table[1].should eq 0 12 | end 13 | 14 | it "Creates a two-entry table with a final state for an empty expression" do 15 | nfa = Pegasus::Nfa::Nfa.new 16 | nfa.add_regex "", 0_i64 17 | dfa = nfa.dfa 18 | table = dfa.final_table 19 | table.size.should eq 2 20 | table[0].should eq 0 21 | table[1].should eq 1 22 | end 23 | 24 | it "Creates two final states for an OR expression" do 25 | nfa = Pegasus::Nfa::Nfa.new 26 | nfa.add_regex "h|g", 0_i64 27 | dfa = nfa.dfa 28 | table = dfa.final_table 29 | table.size.should eq 4 30 | table[0].should eq 0 31 | table[1].should eq 0 32 | table[2].should_not eq 0 33 | table[3].should_not eq 0 34 | end 35 | end 36 | 37 | describe "#state_table" do 38 | it "Does not allow transitions out of the error state" do 39 | nfa = Pegasus::Nfa::Nfa.new 40 | dfa = nfa.dfa 41 | table = dfa.state_table 42 | table[0].each &.should eq 0 43 | end 44 | 45 | it "Creates a table leading to the error state when there are no expressions" do 46 | nfa = Pegasus::Nfa::Nfa.new 47 | dfa = nfa.dfa 48 | table = dfa.state_table 49 | table.each &.each &.should eq 0 50 | end 51 | 52 | it "Creates a transition table with a final state for a single character" do 53 | nfa = Pegasus::Nfa::Nfa.new 54 | nfa.add_regex "h", 0_i64 55 | dfa = nfa.dfa 56 | table = dfa.state_table 57 | table.size.should eq 3 58 | final_byte = 'h'.bytes.first 59 | 
table[1].each_with_index do |state, index| 60 | state.should eq 0 if index != final_byte 61 | state.should eq 2 if index == final_byte 62 | end 63 | table[2].each &.should eq 0 64 | end 65 | 66 | it "Creates a forked transition table for a fork in the DFA" do 67 | nfa = Pegasus::Nfa::Nfa.new 68 | nfa.add_regex "h|e", 0_i64 69 | dfa = nfa.dfa 70 | table = dfa.state_table 71 | table.size.should eq 4 72 | h_byte = 'h'.bytes.first 73 | e_byte = 'e'.bytes.first 74 | table[1].each_with_index do |state, index| 75 | state.should eq 0 if index != h_byte && index != e_byte 76 | state.should_not eq 0 if index == h_byte || index == e_byte 77 | end 78 | table[2].each &.should eq 0 79 | table[3].each &.should eq 0 80 | end 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /spec/language_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe Pegasus::Language::LanguageDefinition do 4 | describe "#from_string" do 5 | it "Handles empty strings" do 6 | expect_raises(Pegasus::Error::GrammarException) do 7 | Pegasus::Language::LanguageDefinition.new "" 8 | end 9 | end 10 | 11 | it "Errors on just a rule without a body" do 12 | expect_raises(Pegasus::Error::GrammarException) do 13 | Pegasus::Language::LanguageDefinition.new %(rule S); 14 | end 15 | end 16 | 17 | it "Errors on just a token without a body" do 18 | expect_raises(Pegasus::Error::GrammarException) do 19 | Pegasus::Language::LanguageDefinition.new %(token S); # fixed copy-paste: this test must exercise the `token` path, not `rule` 20 | end 21 | end 22 | 23 | it "Errors on just a rule with an equals sign, but no body" do 24 | expect_raises(Pegasus::Error::GrammarException) do 25 | Pegasus::Language::LanguageDefinition.new %(rule S = ); 26 | end 27 | end 28 | 29 | it "Errors on just a token with an equals sign, but no body" do 30 | expect_raises(Pegasus::Error::GrammarException) do 31 | Pegasus::Language::LanguageDefinition.new %(token S = ); 32 | end 33 | end 34 |
35 | it "Errors on a token not ending in a semicolon, when another rule follows" do 36 | expect_raises(Pegasus::Error::GrammarException) do 37 | Pegasus::Language::LanguageDefinition.new %(token t = /t/\nrule expr = h;) 38 | end 39 | end 40 | 41 | it "Errors on a rule not ending in a semicolon, when another rule follows" do 42 | expect_raises(Pegasus::Error::GrammarException) do 43 | Pegasus::Language::LanguageDefinition.new %(rule S = expr\nrule expr = h;) 44 | end 45 | end 46 | 47 | it "Errors when a duplicate token is declared" do 48 | expect_raises(Pegasus::Error::GrammarException) do 49 | Pegasus::Language::LanguageDefinition.new %(token t = /t/; token t = /r/;) 50 | end 51 | end 52 | 53 | it "Errors when a rule is named the same as a token" do 54 | expect_raises(Pegasus::Error::GrammarException) do 55 | Pegasus::Language::LanguageDefinition.new %(token t = /t/; rule t = t;) 56 | end 57 | end 58 | 59 | it "Correctly handles options" do 60 | language = Pegasus::Language::LanguageDefinition.new %(token hello = /hello/ [ skip ];) 61 | language.tokens.size.should eq 1 62 | language.tokens["hello"]?.should eq Pegasus::Language::Token.new("hello", [ "skip" ]) 63 | end 64 | 65 | it "Correctly handles two rules with the same name" do 66 | language = Pegasus::Language::LanguageDefinition.new %(rule S = weird; rule S = not_weird;) 67 | language.tokens.size.should eq 0 68 | language.rules.size.should eq 1 69 | language.rules["S"]?.should eq [ rule(rule_alternative("weird")), rule(rule_alternative("not_weird")) ] 70 | end 71 | 72 | it "Correctly parses a single rule with a single terminal or nonterminal" do 73 | language = Pegasus::Language::LanguageDefinition.new %(rule S = h;) 74 | language.tokens.size.should eq 0 75 | language.rules.size.should eq 1 76 | language.rules["S"]?.should eq [ rule(rule_alternative("h")) ] 77 | end 78 | 79 | it "Correctly parses a single token declaration" do 80 | language = Pegasus::Language::LanguageDefinition.new %(token hello = /hello/;) 
81 | language.tokens.size.should eq 1 82 | language.tokens["hello"]?.should eq Pegasus::Language::Token.new("hello") 83 | language.rules.size.should eq 0 84 | end 85 | 86 | it "Correctly parses a single rule with more than one terminal or nonterminal" do 87 | language = Pegasus::Language::LanguageDefinition.new %(rule S = hello world;) 88 | language.tokens.size.should eq 0 89 | language.rules.size.should eq 1 90 | language.rules["S"]?.should eq [ rule(rule_alternative("hello", "world")) ] 91 | end 92 | 93 | it "Correctly parses a rule with multiple bodies" do 94 | language = Pegasus::Language::LanguageDefinition.new %(rule S = s | e;) 95 | language.tokens.size.should eq 0 96 | language.rules.size.should eq 1 97 | language.rules["S"]?.should eq [ rule(rule_alternative("s"), rule_alternative("e")) ] 98 | end 99 | 100 | # The following tests are run with both types of newlines (UNIX and DOS) 101 | # to make sure we still work on Windows. 102 | ["\n", "\r\n"].each do |nl| 103 | it "Correctly handles whitespace between the token / rule keyword and the identifier" do 104 | language = Pegasus::Language::LanguageDefinition.new %Q(token #{nl} t #{nl} = /t/;rule #{nl} S #{nl} = t;) 105 | language.tokens.size.should eq 1 106 | language.tokens["t"]?.should eq Pegasus::Language::Token.new("t") 107 | language.rules.size.should eq 1 108 | language.rules["S"]?.should eq [ rule(rule_alternative("t")) ] 109 | end 110 | 111 | it "Correctly handles whitespace around the equals sign" do 112 | language = Pegasus::Language::LanguageDefinition.new %Q(token t #{nl} = /t/;rule S #{nl} = #{nl}t;) 113 | language.tokens.size.should eq 1 114 | language.tokens["t"]?.should eq Pegasus::Language::Token.new("t") 115 | language.rules.size.should eq 1 116 | language.rules["S"]?.should eq [ rule(rule_alternative("t")) ] 117 | end 118 | 119 | it "Correctly handles whitespace around the semicolon" do 120 | language = Pegasus::Language::LanguageDefinition.new %Q(token t = /t/ #{nl} ; #{nl} rule S = t 
#{nl} ; #{nl}) 121 | language.tokens.size.should eq 1 122 | language.tokens["t"]?.should eq Pegasus::Language::Token.new("t") 123 | language.rules.size.should eq 1 124 | language.rules["S"]?.should eq [ rule(rule_alternative("t")) ] 125 | end 126 | 127 | it "Correctly handles whitespace between rule identifiers" do 128 | language = Pegasus::Language::LanguageDefinition.new %Q(rule S = hello #{nl} goodbye #{nl} | #{nl} world;) 129 | language.tokens.size.should eq 0 130 | language.rules.size.should eq 1 131 | language.rules["S"]?.should eq [ rule(rule_alternative("hello", "goodbye"), rule_alternative("world")) ] 132 | end 133 | 134 | it "Correctly parses two rules with one body each" do 135 | language = Pegasus::Language::LanguageDefinition.new %Q(rule S = h;#{nl}rule expr = e;) 136 | language.tokens.size.should eq 0 137 | language.rules.size.should eq 2 138 | language.rules["S"]?.should eq [ rule(rule_alternative("h")) ] 139 | language.rules["expr"]?.should eq [ rule(rule_alternative("e")) ] 140 | end 141 | end 142 | end 143 | end 144 | -------------------------------------------------------------------------------- /spec/nfa_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe Pegasus::Nfa::Nfa do 4 | describe "#initialize" do 5 | it "Creates a start state" do 6 | nfa = Pegasus::Nfa::Nfa.new 7 | nfa.@start.should_not be_nil 8 | end 9 | 10 | it "Doesn't create a final start state" do 11 | nfa = Pegasus::Nfa::Nfa.new 12 | nfa.@start.try(&.data).should be_nil 13 | end 14 | end 15 | 16 | describe "#dfa" do 17 | it "Creates an empty DFA with no final states when no patterns were added" do 18 | nfa = Pegasus::Nfa::Nfa.new 19 | dfa = nfa.dfa 20 | dfa.states.size.should eq 1 21 | dfa.states.each do |state| 22 | state.data.each do |nfa_state| 23 | nfa_state.data.should be_nil 24 | end 25 | end 26 | end 27 | 28 | it "Does not create negative states" do 29 | nfa = Pegasus::Nfa::Nfa.new 30 | 
nfa.add_regex "hello", 0_i64 31 | nfa.add_regex "goodbye", 1_i64 32 | dfa = nfa.dfa 33 | dfa.states.each do |state| 34 | state.id.should be >= 0 35 | end 36 | end 37 | 38 | it "Sets the start state of the new DFA" do 39 | nfa = Pegasus::Nfa::Nfa.new 40 | dfa = nfa.dfa 41 | dfa.start.should_not be_nil 42 | dfa.start.try(&.id).should eq 0_i64 43 | end 44 | 45 | it "Creates a basic two-state DFA for single-character patterns" do 46 | nfa = Pegasus::Nfa::Nfa.new 47 | nfa.add_regex "h", 0_i64 48 | dfa = nfa.dfa 49 | 50 | dfa.states.size.should eq 2 51 | dfa.states.each do |state| 52 | if state == dfa.start 53 | state.data.each &.data.should be_nil 54 | state.transitions.size.should eq 1 55 | next_state = state.transitions['h'.bytes.first]? 56 | next_state.should_not be_nil 57 | else 58 | state.pattern_id.should eq 1 59 | end 60 | end 61 | end 62 | 63 | it "Creates a DFA for an OR expression" do 64 | nfa = Pegasus::Nfa::Nfa.new 65 | nfa.add_regex "h|e", 0_i64 66 | dfa = nfa.dfa 67 | dfa.states.size.should eq 3 68 | dfa.states.each do |state| 69 | if state == dfa.start 70 | state.data.each &.data.should be_nil 71 | state.transitions.size.should eq 2 72 | h_state = state.transitions['h'.bytes.first]? 73 | h_state.should_not be_nil 74 | e_state = state.transitions['e'.bytes.first]? 75 | e_state.should_not be_nil 76 | else 77 | state.pattern_id.should eq 1 78 | end 79 | end 80 | end 81 | 82 | it "Creates a DFA for a + expression" do 83 | nfa = Pegasus::Nfa::Nfa.new 84 | nfa.add_regex "h+", 0_i64 85 | dfa = nfa.dfa 86 | dfa.states.size.should eq 2 87 | dfa.states.each do |state| 88 | if state == dfa.start 89 | state.data.each &.data.should be_nil 90 | state.transitions.size.should eq 1 91 | h_state = state.transitions['h'.bytes.first]? 
92 | h_state.should_not be_nil 93 | else 94 | state.pattern_id.should eq 1 95 | state.transitions.size.should eq 1 96 | state.transitions['h'.bytes.first]?.should eq state 97 | end 98 | end 99 | end 100 | 101 | it "Creates a DFA for a * expression" do 102 | nfa = Pegasus::Nfa::Nfa.new 103 | nfa.add_regex "h*", 0_i64 104 | dfa = nfa.dfa 105 | dfa.states.size.should eq 2 106 | dfa.states.each do |state| 107 | state.pattern_id.should eq 1 108 | state.transitions.size.should eq 1 109 | end 110 | end 111 | 112 | it "Creates a DFA for a ? expression" do 113 | nfa = Pegasus::Nfa::Nfa.new 114 | nfa.add_regex "h?", 0_i64 115 | dfa = nfa.dfa 116 | dfa.states.size.should eq 2 117 | dfa.states.each do |state| 118 | state.pattern_id.should eq 1 119 | if state == dfa.start 120 | next_state = state.transitions['h'.bytes.first]? 121 | next_state.should_not be_nil 122 | else 123 | state.transitions['h'.bytes.first]?.should be_nil 124 | end 125 | end 126 | end 127 | 128 | it "Creates a DFA for a range expression" do 129 | nfa = Pegasus::Nfa::Nfa.new 130 | nfa.add_regex "[helo0-9]", 0_i64 131 | dfa = nfa.dfa 132 | dfa.states.size.should eq 2 133 | dfa.states.each do |state| 134 | if state == dfa.start 135 | state.transitions.size.should eq 14 136 | else 137 | state.transitions.size.should eq 0 138 | state.pattern_id.should eq 1 139 | end 140 | end 141 | end 142 | end 143 | end 144 | 145 | describe Pegasus::Nfa::Transition do 146 | describe "#char_states" do 147 | it "Does not return any states" do 148 | transition = Pegasus::Nfa::Transition.new 149 | transition.char_states.size.should eq 0 150 | end 151 | end 152 | end 153 | 154 | describe Pegasus::Nfa::ByteTransition do 155 | describe "#char_states" do 156 | it "Only returns one byte" do 157 | transition = Pegasus::Nfa::ByteTransition.new 0_u8 158 | transition.char_states.should eq [ 0_u8 ] 159 | end 160 | end 161 | end 162 | 163 | describe Pegasus::Nfa::AnyTransition do 164 | describe "#char_states" do 165 | it "Returns the full 
unsigned byte range" do 166 | transition = Pegasus::Nfa::AnyTransition.new 167 | transition.char_states.should eq (0_u8..255_u8).to_a 168 | end 169 | end 170 | end 171 | 172 | describe Pegasus::Nfa::RangeTransition do 173 | describe "#char_states" do 174 | it "Returns the given ranges when not inverted" do 175 | transition = Pegasus::Nfa::RangeTransition.new ranges: [(0_u8..1_u8), (2_u8..3_u8)], 176 | inverted: false 177 | transition.char_states.sort.should eq [ 0_u8, 1_u8, 2_u8, 3_u8 ] 178 | end 179 | 180 | it "Returns the ranges not given when inverted" do 181 | transition = Pegasus::Nfa::RangeTransition.new ranges: [(0_u8..127_u8), (130_u8..255_u8)], 182 | inverted: true 183 | transition.char_states.sort.should eq [ 128_u8, 129_u8 ] 184 | end 185 | end 186 | end 187 | 188 | describe Pegasus::Nfa::StateChain do 189 | describe "#initialize" do 190 | it "Sets the final state to the start state if no final state is given" do 191 | state = Pegasus::Nfa::NState.new id: 0_i64, data: nil 192 | chain = Pegasus::Nfa::StateChain.new start: state 193 | chain.start.should eq state 194 | chain.final.should eq state 195 | end 196 | 197 | it "Adds a transition to its tail state when concatenated with another chain" do 198 | state_one = Pegasus::Nfa::NState.new id: 0i64, data: nil 199 | state_two = Pegasus::Nfa::NState.new id: 1i64, data: nil 200 | first_chain = Pegasus::Nfa::StateChain.new state_one, state_one 201 | second_chain = Pegasus::Nfa::StateChain.new state_two, state_two 202 | first_chain.append! 
second_chain 203 | first_chain.start.should eq state_one 204 | first_chain.final.should eq state_two 205 | first_chain.start.transitions.size.should eq 1 206 | first_chain.start.transitions.keys[0].should be_a Pegasus::Nfa::LambdaTransition 207 | first_chain.start.transitions.values[0].should be state_two 208 | end 209 | 210 | it "Doesn't do anything when a Nil is appended" do 211 | state_one = Pegasus::Nfa::NState.new id: 0i64, data: nil 212 | state_two = Pegasus::Nfa::NState.new id: 1i64, data: nil 213 | state_one.transitions[Pegasus::Nfa::LambdaTransition.new] = state_two 214 | first_chain = Pegasus::Nfa::StateChain.new state_one, state_two 215 | first_chain.append! nil 216 | first_chain.start.should eq state_one 217 | first_chain.final.should eq state_two 218 | first_chain.final.transitions.size.should eq 0 219 | end 220 | end 221 | end 222 | 223 | describe Pegasus::Nfa::Nfa do 224 | describe "#add_regex" do 225 | it "Correctly compiles one-character regular expression" do 226 | nfa = Pegasus::Nfa::Nfa.new 227 | nfa.add_regex "h", 0_i64 228 | (nfa.start.try(&.transitions.size) || 0).should eq 1 229 | nfa.states.size.should eq 4 230 | end 231 | 232 | it "Does not add negative states" do 233 | nfa = Pegasus::Nfa::Nfa.new 234 | nfa.add_regex "hello", 0_i64 235 | nfa.states.each do |state| 236 | state.id.should be >= 0 237 | end 238 | end 239 | 240 | it "Correctly compiles OR regular expression" do 241 | nfa = Pegasus::Nfa::Nfa.new 242 | nfa.add_regex "h|e", 0_i64 243 | nfa.start.not_nil!.transitions.size.should eq 1 244 | nfa.states.size.should eq 8 245 | or_branch_state = nfa.start.not_nil!.transitions.values[0] 246 | or_branch_state.transitions.size.should eq 2 247 | seen_h = false 248 | seen_e = false 249 | or_branch_state.transitions.map(&.[1]).each do |state| 250 | transition_byte = state.transitions.keys[0].as?(Pegasus::Nfa::ByteTransition).try(&.byte) 251 | seen_h |= transition_byte == 'h'.bytes.first 252 | seen_e |= transition_byte == 'e'.bytes.first 253 | 
end 254 | seen_h.should be_true 255 | seen_e.should be_true 256 | end 257 | 258 | it "Correctly compiles ? regular expression" do 259 | nfa = Pegasus::Nfa::Nfa.new 260 | nfa.add_regex "h?", 0_i64 261 | nfa.start.not_nil!.transitions.size.should eq 1 262 | nfa.states.size.should eq 6 263 | skip_from = nfa.start.not_nil!.straight_path(length: 1) 264 | skip_from.should_not be_nil 265 | skip_from = skip_from.not_nil! 266 | skip_from.transitions.size.should eq 2 267 | end 268 | 269 | it "Correctly compiles * regular expression" do 270 | nfa = Pegasus::Nfa::Nfa.new 271 | nfa.add_regex "h*", 0_i64 272 | nfa.start.not_nil!.transitions.size.should eq 1 273 | nfa.states.size.should eq 6 274 | skip_from = nfa.start.not_nil!.straight_path(length: 1) 275 | skip_from.should_not be_nil 276 | skip_from = skip_from.not_nil! 277 | skip_from.transitions.size.should eq 2 278 | end 279 | 280 | it "Correctly compiles + regular expression" do 281 | nfa = Pegasus::Nfa::Nfa.new 282 | nfa.add_regex "h+", 0_i64 283 | nfa.start.not_nil!.transitions.size.should eq 1 284 | nfa.states.size.should eq 6 285 | return_to = nfa.start.not_nil!.straight_path(length: 1) 286 | return_to.should_not be_nil 287 | return_to = return_to.not_nil! 288 | return_to.transitions.size.should eq 1 289 | 290 | return_from = return_to.straight_path(length: 3) 291 | return_from.should_not be_nil 292 | return_from = return_from.not_nil! 293 | return_from.transitions.size.should eq 2 294 | end 295 | 296 | it "Correctly compiles range expression" do 297 | nfa = Pegasus::Nfa::Nfa.new 298 | nfa.add_regex "[helo0-9]", 0_i64 299 | nfa.states.size.should eq 4 300 | contained = { 'h' => false, 'o' => false, '1' => false, '9' => false } 301 | range_transition_state = nfa.start.not_nil!.straight_path(length: 1) 302 | range_transition_state.should_not be_nil 303 | range_transition_state = range_transition_state.not_nil!
304 | range_transition_state.transitions.each do |transition, _| 305 | contained.each do |k, _| 306 | byte = k.bytes[0] 307 | if transition.as?(Pegasus::Nfa::RangeTransition).try &.ranges.one? &.includes? byte 308 | contained[k] = true 309 | end 310 | end 311 | end 312 | contained.values.all_should eq true 313 | end 314 | 315 | it "Does not compile incomplete escape codes" do 316 | nfa = Pegasus::Nfa::Nfa.new 317 | expect_raises(Pegasus::Error::NfaException) do 318 | nfa.add_regex "h\\", 1_i64 319 | end 320 | end 321 | 322 | it "Does not compile invalid escape codes" do 323 | nfa = Pegasus::Nfa::Nfa.new 324 | expect_raises(Pegasus::Error::NfaException) do 325 | nfa.add_regex "\\h", 1_i64 326 | end 327 | end 328 | 329 | it "Correctly compiles valid escape codes" do 330 | nfa = Pegasus::Nfa::Nfa.new 331 | specials = [ "\\\"", "\\'", "\\[", "\\]", "\\(", "\\)", "\\|", "\\?", "\\*", "\\+", "\\.", "\\n" ] 332 | 333 | specials.each_with_index do |special, index| 334 | nfa.add_regex special, index.to_i64 335 | end 336 | 337 | nfa.start.not_nil!.transitions.size.should eq specials.size 338 | transition_bytes = [] of UInt8 339 | nfa.start.not_nil!.transitions.values.each do |state| 340 | state.transitions.size.should eq 1 341 | state.transitions.keys[0].should be_a(Pegasus::Nfa::ByteTransition) 342 | transition_bytes << state.transitions.keys[0].as(Pegasus::Nfa::ByteTransition).byte 343 | end 344 | transition_bytes[0...transition_bytes.size - 1].should eq specials[0...specials.size - 1].map(&.[1].bytes.[0]) 345 | transition_bytes.last.should eq '\n'.bytes[0] 346 | end 347 | 348 | it "Combines several regular expressions" do 349 | nfa = Pegasus::Nfa::Nfa.new 350 | nfa.add_regex "h", 1_i64 351 | nfa.add_regex "e", 2_i64 352 | nfa.start.not_nil!.transitions.size.should eq 2 353 | end 354 | 355 | it "Does not compile invalid operators" do 356 | nfa = Pegasus::Nfa::Nfa.new 357 | expect_raises(Pegasus::Error::NfaException) do 358 | nfa.add_regex "+", 0_i64 359 | end 360 | 361 | 
expect_raises(Pegasus::Error::NfaException) do 362 | nfa.add_regex "h(+)", 0_i64 363 | end 364 | end 365 | 366 | it "Does not compile mismatched parentheses" do 367 | nfa = Pegasus::Nfa::Nfa.new 368 | expect_raises(Pegasus::Error::NfaException) do 369 | nfa.add_regex "(", 0_i64 370 | end 371 | 372 | expect_raises(Pegasus::Error::NfaException) do 373 | nfa.add_regex ")", 0_i64 374 | end 375 | end 376 | end 377 | end 378 | -------------------------------------------------------------------------------- /spec/pda_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe Pegasus::Elements::TerminalId do 4 | describe "#==" do 5 | it "Compares equivalent terminals correctly" do 6 | terminal_one = Pegasus::Elements::TerminalId.new(0_i64) 7 | terminal_two = Pegasus::Elements::TerminalId.new(0_i64) 8 | terminal_one.should eq terminal_two 9 | end 10 | 11 | it "Compares different terminals correctly" do 12 | terminal_one = Pegasus::Elements::TerminalId.new(0_i64) 13 | terminal_two = Pegasus::Elements::TerminalId.new(1_i64) 14 | terminal_one.should_not eq terminal_two 15 | end 16 | end 17 | end 18 | 19 | describe Pegasus::Elements::NonterminalId do 20 | describe "#==" do 21 | it "Compares equivalent nonterminals correctly" do 22 | nonterminal_one = Pegasus::Elements::NonterminalId.new(0_i64) 23 | nonterminal_two = Pegasus::Elements::NonterminalId.new(0_i64) 24 | nonterminal_one.should eq nonterminal_two 25 | end 26 | 27 | it "Compares different nonterminals correctly" do 28 | nonterminal_one = Pegasus::Elements::NonterminalId.new(0_i64) 29 | nonterminal_two = Pegasus::Elements::NonterminalId.new(1_i64) 30 | nonterminal_one.should_not eq nonterminal_two 31 | end 32 | end 33 | end 34 | 35 | describe Pegasus::Pda::Grammar do 36 | describe "#initialize" do 37 | it "Doesn't add any items" do 38 | grammar = Pegasus::Pda::Grammar.new [] of Pegasus::Elements::TerminalId, 39 | [] of 
Pegasus::Elements::NonterminalId 40 | grammar.@items.size.should eq 0 41 | end 42 | end 43 | 44 | describe "#create_lr_pda" do 45 | it "Handles empty grammars" do 46 | grammar = Pegasus::Pda::Grammar.new [] of Pegasus::Elements::TerminalId, 47 | [] of Pegasus::Elements::NonterminalId 48 | pda = grammar.create_lr_pda 49 | pda.states.size.should eq 1 50 | pda.states.first.transitions.size.should eq 0 51 | pda.states.first.data.size.should eq 0 52 | end 53 | 54 | it "Handles grammars with one rule" do 55 | grammar = Pegasus::Pda::Grammar.new [ terminal 0 ], 56 | [ nonterminal 0, start: true ] 57 | grammar.add_item item head: nonterminal(0, start: true), 58 | body: body terminal(0) 59 | pda = grammar.create_lr_pda 60 | pda.states.size.should eq 2 # Start + with item shifted over 61 | 62 | start_state = pda.states.find(&.id.==(0)).not_nil! 63 | start_state.transitions.size.should eq 1 # To the shifted state 64 | start_state.data.size.should eq 1 # The one initial item 65 | end 66 | 67 | it "Handles grammars with epsilon-moves" do 68 | terminals = [ terminal 0 ] 69 | nonterminals = [ nonterminal(0, start: true), nonterminal(1) ] 70 | 71 | grammar = Pegasus::Pda::Grammar.new terminals, nonterminals 72 | grammar.add_item item head: nonterminals[0], 73 | body: body nonterminals[1] 74 | grammar.add_item item head: nonterminals[1], 75 | body: body terminals[0] 76 | 77 | pda = grammar.create_lr_pda 78 | pda.states.size.should eq 3 79 | 80 | start_state = pda.states.find(&.id.==(0)) 81 | start_state.should_not be_nil 82 | start_state = start_state.not_nil! 83 | start_state.transitions.size.should eq 2 84 | start_state.data.size.should eq 2 85 | 86 | reduce_terminal_state = start_state.transitions[terminals[0]]? 87 | reduce_terminal_state.should_not be_nil 88 | reduce_terminal_state = reduce_terminal_state.not_nil! 
89 | reduce_terminal_state.data.size.should eq 1 90 | reduce_terminal_state.data.first.index.should eq 1 91 | reduce_terminal_state.data.first.item.head.should eq nonterminals[1] 92 | reduce_terminal_state.data.first.item.body[0].should eq terminals[0] 93 | 94 | reduce_terminal_state = start_state.transitions[nonterminals[1]]? 95 | reduce_terminal_state.should_not be_nil 96 | reduce_terminal_state = reduce_terminal_state.not_nil! 97 | reduce_terminal_state.data.size.should eq 1 98 | reduce_terminal_state.data.first.index.should eq 1 99 | reduce_terminal_state.data.first.item.head.should eq nonterminals[0] 100 | reduce_terminal_state.data.first.item.body[0].should eq nonterminals[1] 101 | end 102 | end 103 | 104 | describe "#create_lalr_pda" do 105 | it "Meges states with duplicate bodies" do 106 | # This grammar is taken from grammars/modern_compiler_design.grammar 107 | t_x = terminal(0) 108 | t_b = terminal(1) 109 | t_a = terminal(2) 110 | terminals = [ t_x, t_b, t_a ] 111 | 112 | s = nonterminal 0, start: true 113 | a = nonterminal 1 114 | b = nonterminal 2 115 | nonterminals = [ s, a, b ] 116 | 117 | grammar = Pegasus::Pda::Grammar.new terminals, nonterminals 118 | grammar.add_item item head: s, 119 | body: body a 120 | grammar.add_item item head: s, 121 | body: body t_x, t_b 122 | grammar.add_item item head: a, 123 | body: body t_a, a, t_b 124 | grammar.add_item item head: a, 125 | body: body b 126 | grammar.add_item item head: b, 127 | body: body t_x 128 | 129 | lr_pda = grammar.create_lr_pda 130 | lalr_pda = grammar.create_lalr_pda lr_pda 131 | lr_pda.states.size.should eq 13 132 | lalr_pda.states.size.should eq 9 133 | end 134 | end 135 | end 136 | 137 | describe Pegasus::Pda::DottedItem do 138 | describe "#next_item!" 
do 139 | it "Advances the index when possible" do 140 | new_item = item head: nonterminal(0, start: true), 141 | body: body terminal(0), terminal(0) 142 | dotted_item = Pegasus::Pda::DottedItem.new new_item, index: 0_i64 143 | dotted_item.next_item! 144 | dotted_item.index.should eq 1 145 | dotted_item.next_item! 146 | dotted_item.index.should eq 2 147 | end 148 | 149 | it "Raises when already at the end" do 150 | new_item = item head: nonterminal(0, start: true), 151 | body: body terminal(0), terminal(0) 152 | dotted_item = Pegasus::Pda::DottedItem.new new_item, index: 2_i64 153 | expect_raises(Pegasus::Error::PdaException) do 154 | dotted_item.next_item! 155 | end 156 | end 157 | end 158 | 159 | describe "#done?" do 160 | it "Returns false when dot is not past the last element" do 161 | new_item = item head: nonterminal(0, start: true), 162 | body: body terminal(0), terminal(0) 163 | dotted_item = Pegasus::Pda::DottedItem.new new_item, index: 0_i64 164 | dotted_item.done?.should be_false 165 | end 166 | 167 | it "Returns true when dot is just after the last element" do 168 | new_item = item head: nonterminal(0, start: true), 169 | body: body terminal(0), terminal(0) 170 | dotted_item = Pegasus::Pda::DottedItem.new new_item, index: 2_i64 171 | dotted_item.done?.should be_true 172 | end 173 | end 174 | end 175 | 176 | describe Pegasus::Pda::Pda do 177 | describe "#action_table" do 178 | it "Creates no actions for the error state" do 179 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0) 180 | new_table = new_pda.action_table 181 | new_table[0].each &.should eq -1 182 | end 183 | 184 | it "Creates a shift and a reduce action for a single nonterminal to terminal item" do 185 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0) 186 | new_table = new_pda.action_table 187 | new_table[1][1].should eq 0 188 | new_table[1][0].should eq -1 189 | new_table[2][0].should eq 1 190 | new_table[2][1].should eq -1 191 | end 192 
| 193 | it "Creates two shift and two reduce actions for a start state with two productions" do 194 | new_pda = pda item(head: nonterminal(0, start: true), body: body terminal(0)), 195 | item(head: nonterminal(0, start: true), body: body terminal(1)) 196 | new_table = new_pda.action_table 197 | new_table[1][0].should eq -1 198 | new_table[1][1].should eq 0 199 | new_table[1][2].should eq 0 200 | new_table[2][0].should eq 1 201 | new_table[2][1].should eq -1 202 | new_table[2][2].should eq -1 203 | new_table[3][0].should eq 2 204 | new_table[3][1].should eq -1 205 | new_table[3][2].should eq -1 206 | end 207 | 208 | it "Correctly reports a reduce reduce conflict" do 209 | new_pda = pda item(head: nonterminal(0, start: true), body: body nonterminal(1)), 210 | item(head: nonterminal(0, start: true), body: body nonterminal(2)), 211 | item(head: nonterminal(1), body: body terminal(0)), 212 | item(head: nonterminal(2), body: body terminal(0)) 213 | expect_raises(Pegasus::Error::TableException) do 214 | new_pda.action_table 215 | end 216 | end 217 | 218 | it "Correctly reports a shift/reduce conflict" do 219 | new_pda = pda item(head: nonterminal(0, start: true), body: body nonterminal(1), terminal(1)), 220 | item(head: nonterminal(0, start: true), body: body nonterminal(2)), 221 | item(head: nonterminal(1), body: body terminal(0)), 222 | item(head: nonterminal(2), body: body terminal(0), terminal(1)) 223 | expect_raises(Pegasus::Error::TableException) do 224 | new_pda.action_table 225 | end 226 | end 227 | end 228 | 229 | describe "#state_table" do 230 | it "Does not allow transitions out of the error state" do 231 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0) 232 | new_table = new_pda.state_table 233 | new_table[0].each &.should eq 0 234 | end 235 | 236 | it "Creates transitions for terminals" do 237 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0) 238 | new_table = new_pda.state_table 239 | 
new_table[1][0].should eq 0 240 | new_table[1][1].should eq 2 241 | new_table[1][2].should eq 0 242 | new_table[2].each &.should eq 0 243 | end 244 | 245 | it "Creates transitions for nonterminals" do 246 | new_pda = pda item(head: nonterminal(0, start: true), body: body nonterminal(1)), 247 | item(head: nonterminal(1), body: body terminal(0)) 248 | new_table = new_pda.state_table 249 | new_table[1][0].should eq 0 250 | new_table[1][1].should_not eq 0 251 | new_table[1][2].should eq 0 252 | new_table[1][3].should_not eq 0 253 | new_table[1][1].should_not eq new_table[1][3] 254 | new_table[2].each &.should eq 0 255 | new_table[3].each &.should eq 0 256 | end 257 | 258 | it "Creates transitions for sequences of elements" do 259 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0), terminal(1) 260 | new_table = new_pda.state_table 261 | new_table[1].all_should eq(0), except(1, should: eq 2) 262 | new_table[2].all_should eq(0), except(2, should: eq 3) 263 | new_table[3].all_should eq(0) 264 | end 265 | end 266 | end 267 | -------------------------------------------------------------------------------- /spec/spec_helper.cr: -------------------------------------------------------------------------------- 1 | require "spec" 2 | require "../src/pegasus/**" 3 | -------------------------------------------------------------------------------- /spec/spec_utils.cr: -------------------------------------------------------------------------------- 1 | require "./spec_helper" 2 | 3 | def rule_alternative(*args) 4 | elements = [] of Pegasus::Language::RuleElement 5 | args.each do |arg| 6 | value = case arg 7 | when String 8 | Pegasus::Language::RuleElement.new arg 9 | end 10 | elements << value if value 11 | end 12 | 13 | return Pegasus::Language::RuleAlternative.new elements 14 | end 15 | 16 | def rule(*alternatives) 17 | return Pegasus::Language::Rule.new alternatives.to_a 18 | end 19 | 20 | def nonterminal(id, start = false) 21 | 
# Builds a LALR PDA for the given items, inferring the grammar's
# terminal and nonterminal alphabets from the item heads and bodies.
def pda(*items)
  terminal_set = Set(Pegasus::Elements::TerminalId).new
  nonterminal_set = Set(Pegasus::Elements::NonterminalId).new

  # Every head is a nonterminal; body elements are sorted by type.
  items.to_a.each do |grammar_item|
    nonterminal_set << grammar_item.head
    grammar_item.body.each do |element|
      case element
      when Pegasus::Elements::TerminalId
        terminal_set << element
      when Pegasus::Elements::NonterminalId
        nonterminal_set << element
      end
    end
  end

  grammar = Pegasus::Pda::Grammar.new terminals: terminal_set.to_a,
                                      nonterminals: nonterminal_set.to_a
  items.to_a.each do |grammar_item|
    grammar.add_item grammar_item
  end

  # The LALR automaton is produced by collapsing the LR automaton.
  return grammar.create_lalr_pda grammar.create_lr_pda
end
93 | 94 | def initialize(@index, @should = nil, @should_not = nil) 95 | end 96 | end 97 | 98 | class Array(T) 99 | def all_should(should, *exceptions) 100 | each_with_index do |item, index| 101 | is_exception = false 102 | exceptions.each do |exception| 103 | if exception.index == index 104 | if should_rule = exception.should 105 | item.should should_rule 106 | end 107 | if should_not_rule = exception.should_not 108 | item.should_not should_not_rule 109 | end 110 | is_exception = true 111 | end 112 | end 113 | item.should should unless is_exception 114 | end 115 | end 116 | end 117 | 118 | def except(index : Int32, should : T? = nil, should_not : R? = nil) forall T, R 119 | ExceptionRule(T, R).new index, should, should_not 120 | end 121 | -------------------------------------------------------------------------------- /src/generators/c-common/standard_header.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** 5 | * Converts a nonterminal value to a string. 6 | * @param nt the nonterminal ID. 7 | * @return the name for the nonterminal. 8 | */ 9 | const char* pgs_nonterminal_name(long int nt); 10 | 11 | /* == Generated Data Definitions == */ 12 | /** 13 | * A grammar item. A lot of the information collected by the parser 14 | * generate is not carried into the source code, which leaves items 15 | * as simply a nonterminal ID and the size of the right hand side. 16 | */ 17 | struct pgs_item_s { 18 | /** The nonterminal that this item is reduced to. */ 19 | long int left_id; 20 | /** 21 | * The size of the item body, used to pop off 22 | * the correct number of states from the stack. 23 | */ 24 | size_t right_count; 25 | }; 26 | 27 | typedef struct pgs_item_s pgs_item; 28 | 29 | /* == General Definitions == */ 30 | #define PGS_MAX_ERROR_LENGTH 255 31 | 32 | /** 33 | * The types of errors that can occur while the 34 | * entire parsing process. 35 | */ 36 | enum pgs_error_e { 37 | /** No error occured. 
*/ 38 | PGS_NONE = 0, 39 | /** An allocation failed. */ 40 | PGS_MALLOC, 41 | /** A token couldn't be recognized. */ 42 | PGS_BAD_CHARACTER, 43 | /** A tree couldn't be recognized. */ 44 | PGS_BAD_TOKEN, 45 | /** End of file reached where it was not expected */ 46 | PGS_EOF_SHIFT 47 | }; 48 | 49 | /** 50 | * State used to report errors and their corresponding 51 | * messages. 52 | */ 53 | struct pgs_state_s { 54 | /** The error code. */ 55 | enum pgs_error_e error; 56 | /** The error message. */ 57 | char errbuff[PGS_MAX_ERROR_LENGTH]; 58 | }; 59 | 60 | typedef enum pgs_error_e pgs_error; 61 | typedef struct pgs_state_s pgs_state; 62 | 63 | /** 64 | * Initializes a state with no error. 65 | * @param s the state to initialize. 66 | */ 67 | void pgs_state_init(pgs_state* s); 68 | /** 69 | * Sets the state to have an error. 70 | * @param s the state to initialize. 71 | * @param err the error message to return. 72 | */ 73 | void pgs_state_error(pgs_state* s, pgs_error err, const char* message); 74 | 75 | /* == Lexing Definitions ==*/ 76 | /** 77 | * A token produced by lexing. 78 | */ 79 | struct pgs_token_s { 80 | /** The ID of the terminal. */ 81 | long int terminal; 82 | /** The index at which the token starts. */ 83 | size_t from; 84 | /** The index at which the next token begins. */ 85 | size_t to; 86 | }; 87 | 88 | /** 89 | * A dynamic list of tokens produced while lexing. 90 | */ 91 | struct pgs_token_list_s { 92 | /** The size of the currently allocated block of tokens */ 93 | size_t capacity; 94 | /** The number of tokens in the list. */ 95 | size_t token_count; 96 | /** The token data array. */ 97 | struct pgs_token_s* tokens; 98 | }; 99 | 100 | typedef struct pgs_token_s pgs_token; 101 | typedef struct pgs_token_list_s pgs_token_list; 102 | 103 | /** 104 | * Initializes a token list. 105 | * @param l the list to initialize. 106 | * @return any errors that occured while initializing the list. 
107 | */ 108 | pgs_error pgs_token_list_init(pgs_token_list* l); 109 | /** 110 | * Appends a token to the list. 111 | * @param terminal the ID of the terminal to append. 112 | * @param from the index at which the token begins. 113 | * @param to the index at which the next token begins. 114 | */ 115 | pgs_error pgs_token_list_append(pgs_token_list* l, long int terminal, size_t from, size_t to); 116 | /** 117 | * Returns a token at the given index. 118 | * @param l the list to return a token from. 119 | * @param i the index from which to return a token. 120 | * @return a token, or NULL if the index is out of bounds. 121 | */ 122 | pgs_token* pgs_token_list_at(pgs_token_list* l, size_t i); 123 | /** 124 | * Returns a token ID at the given index. 125 | * @param l the list to return an ID from. 126 | * @param i the index from which to return an ID. 127 | * @return returns an ID, or 0, which represents EOF. 128 | */ 129 | long int pgs_token_list_at_id(pgs_token_list* l, size_t i ); 130 | /** 131 | * Frees a list of tokens. Since the tokens are owned by the list, 132 | * they are invalidated after this call too. 133 | * @param l the list to free. 134 | */ 135 | void pgs_token_list_free(pgs_token_list* l); 136 | /** 137 | * Performs a lexing operation. 138 | * @param s the state to populate with error text, if necessary. 139 | * @param list the list of tokens to initialize and populate. 140 | * @param source the string to lex. 141 | * @return the error, if any, that occured during this process. 
142 | */ 143 | pgs_error pgs_do_lex(pgs_state* s, pgs_token_list* list, const char* source); 144 | 145 | -------------------------------------------------------------------------------- /src/generators/c-common/standard_source.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* == General Code == */ 5 | 6 | void pgs_state_init(pgs_state* s) { 7 | s->error = PGS_NONE; 8 | s->errbuff[0] = '\0'; 9 | } 10 | 11 | void pgs_state_error(pgs_state* s, pgs_error e, const char* message) { 12 | s->error = e; 13 | strncpy(s->errbuff, message, PGS_MAX_ERROR_LENGTH); 14 | } 15 | 16 | /* == Lexing Code == */ 17 | 18 | pgs_error pgs_token_list_init(pgs_token_list* l) { 19 | l->capacity = 8; 20 | l->token_count = 0; 21 | l->tokens = (pgs_token*) malloc(sizeof(*(l->tokens)) * l->capacity); 22 | 23 | if(l->tokens == NULL) return PGS_MALLOC; 24 | return PGS_NONE; 25 | } 26 | 27 | pgs_error pgs_token_list_append(pgs_token_list* l, long int terminal, size_t from, size_t to) { 28 | if(l->capacity == l->token_count) { 29 | pgs_token* new_tokens = 30 | (pgs_token*) realloc(l->tokens, sizeof(*new_tokens) * l->capacity * 2); 31 | if(new_tokens == NULL) return PGS_MALLOC; 32 | l->capacity *= 2; 33 | l->tokens = new_tokens; 34 | } 35 | 36 | l->tokens[l->token_count].terminal = terminal; 37 | l->tokens[l->token_count].from = from; 38 | l->tokens[l->token_count].to = to + 1; 39 | l->token_count++; 40 | 41 | return PGS_NONE; 42 | } 43 | 44 | pgs_token* pgs_token_list_at(pgs_token_list* l, size_t i) { 45 | return (i < l->token_count) ? 
&l->tokens[i] : NULL; 46 | } 47 | 48 | long int pgs_token_list_at_id(pgs_token_list* l, size_t i) { 49 | if(i < l->token_count) return l->tokens[i].terminal; 50 | return 0; 51 | } 52 | 53 | void pgs_token_list_free(pgs_token_list* l) { 54 | free(l->tokens); 55 | } 56 | 57 | pgs_error pgs_do_lex(pgs_state* s, pgs_token_list* list, const char* source) { 58 | pgs_error error; 59 | size_t index = 0; 60 | long int final; 61 | long int last_final; 62 | long int last_final_index; 63 | long int last_start; 64 | long int state; 65 | size_t length = strlen(source); 66 | 67 | if((error = pgs_token_list_init(list))) return error; 68 | while(!error && index < length) { 69 | last_final = -1; 70 | last_final_index = -1; 71 | last_start = index; 72 | state = 1; 73 | 74 | while(index < length && state) { 75 | state = lexer_state_table[state][(unsigned int) source[index]]; 76 | 77 | if((final = lexer_final_table[state])) { 78 | last_final = final; 79 | last_final_index = index; 80 | } 81 | 82 | if(state) index++; 83 | } 84 | 85 | if(last_final == -1) break; 86 | if(lexer_skip_table[last_final]) continue; 87 | error = pgs_token_list_append(list, last_final, last_start, last_final_index); 88 | } 89 | 90 | if(error == PGS_MALLOC) { 91 | pgs_token_list_free(list); 92 | } else if (index != length) { 93 | pgs_state_error(s, PGS_BAD_CHARACTER, "Invalid character at position"); 94 | pgs_token_list_free(list); 95 | return PGS_BAD_CHARACTER; 96 | } 97 | 98 | return PGS_NONE; 99 | } 100 | -------------------------------------------------------------------------------- /src/generators/c-common/tables.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "ecr" 3 | 4 | module Pegasus::Generators 5 | class CTableGen 6 | def initialize(@language : Pegasus::Language::LanguageData) 7 | end 8 | 9 | ECR.def_to_s "src/generators/c-common/tables.ecr" 10 | end 11 | end 12 | 
-------------------------------------------------------------------------------- /src/generators/c-common/tables.ecr: -------------------------------------------------------------------------------- 1 | /* == Nonterminals to String == */ 2 | 3 | const char* pgs_nonterminal_name(long int nt) { 4 | switch(nt) { 5 | <%- @language.nonterminals.each do |name, value| -%> 6 | case PGS_NONTERMINAL_<%= name.underscore.upcase %>: return <%= name.dump %>; 7 | <%- end -%> 8 | default: return ""; 9 | } 10 | } 11 | 12 | /* == Generated Tables and Variables== */ 13 | 14 | #define PGS_MAX_TERMINAL <%= @language.max_terminal %> 15 | int lexer_skip_table[<%= @language.lex_skip_table.size %>] = { 16 | <% @language.lex_skip_table.each do |skip| %><%= skip ? 1 : 0 %>, <% end %> 17 | }; 18 | long int lexer_state_table[<%= @language.lex_state_table.size %>][<%= @language.lex_state_table[0].size %>] = { 19 | <% @language.lex_state_table.each do |state| -%> 20 | { <% state.each do |transition| %><%= transition %>, <% end %> }, 21 | <% end -%> 22 | }; 23 | long int lexer_final_table[<%= @language.lex_final_table.size %>] = { 24 | <% @language.lex_final_table.each do |final| %><%= final %>,<% end %> 25 | }; 26 | long int parse_state_table[<%= @language.parse_state_table.size %>][<%= @language.parse_state_table[0].size %>]= { 27 | <% @language.parse_state_table.each do |state| -%> 28 | { <% state.each do |transition| %><%= transition %>, <% end %> }, 29 | <% end -%> 30 | }; 31 | long int parse_action_table[<%= @language.parse_action_table.size %>][<%= @language.parse_action_table[0].size %>] = { 32 | <% @language.parse_action_table.each do |state| -%> 33 | { <% state.each do |action| %><%= action %>, <% end %> }, 34 | <% end -%> 35 | }; 36 | int parse_final_table[<%= @language.parse_final_table.size %>] = { 37 | <% @language.parse_final_table.each do |skip| -%> 38 | <%= skip ? 
1 : 0 %>, 39 | <% end -%> 40 | }; 41 | pgs_item items[<%= @language.items.size %>] = { 42 | <% @language.items.each do |item| -%> 43 | { <%= item.head.raw_id %>, <%= item.body.size %> }, 44 | <% end -%> 45 | }; 46 | -------------------------------------------------------------------------------- /src/generators/c/pegasus_c.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "../../pegasus/json.cr" 3 | require "../c-common/tables.cr" 4 | require "../generators.cr" 5 | require "option_parser" 6 | require "ecr" 7 | 8 | module Pegasus::Generators::C 9 | include Pegasus::Language 10 | include Pegasus::Generators::Api 11 | 12 | class CContext 13 | def add_option(opt_parser) 14 | end 15 | end 16 | 17 | class LanguageInput < StdInput(LanguageData) 18 | def process(opt_parser) : LanguageData 19 | LanguageData.from_json STDIN 20 | end 21 | end 22 | 23 | class HeaderGenerator < FileGenerator(CContext, LanguageData) 24 | def initialize(parent) 25 | super parent, "header", "parser.h", "the parser header file" 26 | end 27 | 28 | def to_s(io) 29 | ECR.embed "src/generators/c/pegasus_c_header_template.ecr", io 30 | end 31 | end 32 | 33 | class SourceGenerator < FileGenerator(CContext, LanguageData) 34 | def initialize(parent) 35 | super parent, "code", "parser.c", "the parser source code file" 36 | end 37 | 38 | def to_s(io) 39 | io << "#include \"#{@parent.output_file_names["header"]}\"\n" 40 | ECR.embed "src/generators/c/pegasus_c_template.ecr", io 41 | end 42 | end 43 | end 44 | 45 | include Pegasus::Generators::C 46 | 47 | parser = PegasusOptionParser(CContext, LanguageData).new LanguageInput.new 48 | HeaderGenerator.new(parser) 49 | SourceGenerator.new(parser) 50 | parser.run 51 | -------------------------------------------------------------------------------- /src/generators/c/pegasus_c_header_template.ecr: -------------------------------------------------------------------------------- 
1 | <%= {{ read_file "src/generators/c-common/standard_header.h" }} %> 2 | 3 | /* == Nonterminal ID Definitions == */ 4 | <% input!.nonterminals.each do |name, value| -%> 5 | #define PGS_NONTERMINAL_<%= name.underscore.upcase %> <%= value.raw_id %> 6 | <% end -%> 7 | 8 | <%= {{ read_file "src/generators/c/tree_header.h" }} %> 9 | -------------------------------------------------------------------------------- /src/generators/c/pegasus_c_template.ecr: -------------------------------------------------------------------------------- 1 | <%= Pegasus::Generators::CTableGen.new(input!).to_s %> 2 | 3 | <%= {{ read_file "src/generators/c-common/standard_source.c" }} %> 4 | 5 | <%= {{ read_file "src/generators/c/tree_source.c" }} %> 6 | -------------------------------------------------------------------------------- /src/generators/c/tree_header.h: -------------------------------------------------------------------------------- 1 | #define PGS_TREE_T(tree) ((tree).tree_data.terminal.token.terminal) 2 | #define PGS_TREE_T_FROM(tree) ((tree).tree_data.terminal.token.from) 3 | #define PGS_TREE_T_TO(tree) ((tree).tree_data.terminal.token.to) 4 | #define PGS_TREE_NT(tree) ((tree).tree_data.nonterminal.nonterminal) 5 | #define PGS_TREE_NT_COUNT(tree) ((tree).tree_data.nonterminal.child_count) 6 | #define PGS_TREE_NT_CHILD(tree, n) ((tree).tree_data.nonterminal.children[n]) 7 | #define PGS_TREE_IS_NT(tree, type) (((tree).variant == PGS_TREE_NONTERMINAL) && (PGS_TREE_NT(tree) == (type))) 8 | 9 | /* == Parsing Definitions == */ 10 | /** 11 | * Enum that represents the variant of a parse tree, 12 | * which is either a nonterminal with chilren, or a 13 | * terminal with a token. 14 | */ 15 | enum pgs_tree_variant_e { 16 | PGS_TREE_TERMINAL, 17 | PGS_TREE_NONTERMINAL 18 | }; 19 | 20 | /** 21 | * The data of a terminal tree. 22 | */ 23 | struct pgs_tree_terminal_s { 24 | /** The token this tree holds. */ 25 | pgs_token token; 26 | }; 27 | 28 | /** 29 | * The data of a nonterminal tree. 
30 | */ 31 | struct pgs_tree_nonterminal_s { 32 | /** 33 | * The nonterminal ID. 34 | */ 35 | long int nonterminal; 36 | /** 37 | * The number of children this tree has. 38 | */ 39 | size_t child_count; 40 | /** 41 | * The array of child pointers, allocated dynamically 42 | * depending on the item that reduced to this nonterminal. 43 | */ 44 | struct pgs_tree_s** children; 45 | }; 46 | 47 | /** 48 | * A general struct for a tree, which is either a terminal 49 | * or a nonterminal. 50 | */ 51 | struct pgs_tree_s { 52 | /** The variant of the tree. */ 53 | enum pgs_tree_variant_e variant; 54 | union { 55 | /** The terminal variant of this tree. */ 56 | struct pgs_tree_terminal_s terminal; 57 | /** The nonterminal variant of this tree. */ 58 | struct pgs_tree_nonterminal_s nonterminal; 59 | } tree_data; 60 | }; 61 | 62 | /** 63 | * An element on the parse stack, which holds 64 | * both a tree node and a state. In theory, 65 | * the stack is actually items followed by states, 66 | * but since one always comes after the other, 67 | * and since both need to be looked up fast, 68 | * we put them on a stack in parallel. 69 | */ 70 | struct pgs_parse_stack_element_s { 71 | /** The tree on the stack */ 72 | struct pgs_tree_s* tree; 73 | /** The state on the stack */ 74 | long int state; 75 | }; 76 | 77 | /** 78 | * A parse stack. The PDA automaton 79 | * has to maintain this stack, where it gradually 80 | * assembles a tree. 81 | */ 82 | struct pgs_parse_stack_s { 83 | /** The number of stack elements currently allocated. */ 84 | size_t capacity; 85 | /** The current number of stack elements. */ 86 | size_t size; 87 | /** The stack element array. 
typedef enum pgs_tree_variant_e pgs_tree_variant;
typedef struct pgs_tree_terminal_s pgs_tree_terminal;
/* Bug fix: the tag was misspelled "pgs_tree_nontermnal_s", which silently
 * declared a brand-new incomplete struct type instead of aliasing the
 * pgs_tree_nonterminal_s defined above. */
typedef struct pgs_tree_nonterminal_s pgs_tree_nonterminal;
typedef struct pgs_tree_s pgs_tree;
typedef struct pgs_parse_stack_element_s pgs_parse_stack_element;
typedef struct pgs_parse_stack_s pgs_parse_stack;
138 | */ 139 | pgs_error pgs_parse_stack_init(pgs_parse_stack* s); 140 | /** 141 | * Appends (pushes) a new tree and state to the stack. 142 | * @param s the stack to append to. 143 | * @param tree the tree to append. 144 | * @param state the state to append. 145 | * @return the result of the append. 146 | */ 147 | pgs_error pgs_parse_stack_append(pgs_parse_stack* s, pgs_tree* tree, long int state); 148 | /** 149 | * Appends a given token to the stack, by initializing a new parse tree noe. 150 | * @param s the stack to append to. 151 | * @param t the token for which to construct a tree and compute a new state. 152 | * @return the result of the append. 153 | */ 154 | pgs_error pgs_parse_stack_append_terminal(pgs_parse_stack* s, pgs_token* t); 155 | /** 156 | * Appends a given item to the stack, by popping the correct number of items 157 | * and creating a new nonterminal tree node in their place. A new state is also 158 | * computed from the nonterminal ID. 159 | * @param s the stack to append to. 160 | * @param id the nonterminal ID to create. 161 | * @param count the number of children to pop. 162 | * @return the result of the append. 163 | */ 164 | pgs_error pgs_parse_stack_append_nonterminal(pgs_parse_stack* s, long int id, size_t count); 165 | /** 166 | * Gets the state on the top of the stack. 167 | * @param s the stack for which to get a state. 168 | * @return the state on the top of the stack. 169 | */ 170 | long int pgs_parse_stack_top_state(pgs_parse_stack* s); 171 | /** 172 | * Gets the tree on the top of the stack. 173 | * @param s the stack for which to get a tree. 174 | * @return the tree on the top of the stack. 175 | */ 176 | pgs_tree* pgs_parse_stack_top_tree(pgs_parse_stack* s); 177 | /** 178 | * Frees a parse stack, also freeing all the trees. 179 | * @param s the stack to free. 180 | */ 181 | void pgs_parse_stack_free(pgs_parse_stack* s); 182 | /** 183 | * Takes the given tokens, and attempts to convert them into a parse tree. 
184 | * @param s the state used for storing errors. 185 | * @param list the list of tokens, already filled. 186 | * @param into the tree pointer pointer into which a new tree will be stored. 187 | * @return the error, if any, that occured. 188 | */ 189 | pgs_error pgs_do_parse(pgs_state* s, pgs_token_list* list, pgs_tree** into); 190 | 191 | /* == Glue == */ 192 | /** 193 | * Attempts to parse tokens from the given string into the given tree. 194 | * @param state the state to initialize with error information, if necessary. 195 | * @param into the tree to build into. 196 | * @param string the string from which to read. 197 | * @return the error, if any, that occured. 198 | */ 199 | pgs_error pgs_do_all(pgs_state* state, pgs_tree** into, const char* string); 200 | -------------------------------------------------------------------------------- /src/generators/c/tree_source.c: -------------------------------------------------------------------------------- 1 | /* == Parsing Code == */ 2 | 3 | pgs_tree* pgs_create_tree_nonterminal(long int nonterminal, size_t child_count) { 4 | pgs_tree* tree = (pgs_tree*) malloc(sizeof(*tree)); 5 | pgs_tree** children = (pgs_tree**) malloc(sizeof(*children) * child_count); 6 | 7 | if(tree == NULL || children == NULL) { 8 | free(tree); 9 | return NULL; 10 | } 11 | 12 | tree->variant = PGS_TREE_NONTERMINAL; 13 | tree->tree_data.nonterminal.nonterminal = nonterminal; 14 | tree->tree_data.nonterminal.child_count = child_count; 15 | tree->tree_data.nonterminal.children = children; 16 | 17 | return tree; 18 | } 19 | 20 | pgs_tree* pgs_create_tree_terminal(pgs_token* t) { 21 | pgs_tree* tree = (pgs_tree*) malloc(sizeof(*tree)); 22 | if(tree == NULL) return NULL; 23 | 24 | tree->variant = PGS_TREE_TERMINAL; 25 | tree->tree_data.terminal.token = *t; 26 | 27 | return tree; 28 | } 29 | 30 | void pgs_free_tree_nonterminal(pgs_tree* t) { 31 | size_t i; 32 | for(i = 0; i < t->tree_data.nonterminal.child_count; i++) { 33 | 
pgs_free_tree(t->tree_data.nonterminal.children[i]); 34 | } 35 | free(t->tree_data.nonterminal.children); 36 | free(t); 37 | } 38 | 39 | void pgs_free_tree_terminal(pgs_tree* t) { 40 | free(t); 41 | } 42 | 43 | long int pgs_tree_table_index(pgs_tree* t) { 44 | switch(t->variant) { 45 | case PGS_TREE_TERMINAL: 46 | return PGS_TREE_T(*t); 47 | case PGS_TREE_NONTERMINAL: 48 | return PGS_TREE_NT(*t) + 2 + PGS_MAX_TERMINAL; 49 | } 50 | } 51 | 52 | void pgs_free_tree(pgs_tree* t) { 53 | switch(t->variant) { 54 | case PGS_TREE_TERMINAL: pgs_free_tree_terminal(t); break; 55 | case PGS_TREE_NONTERMINAL: pgs_free_tree_nonterminal(t); break; 56 | } 57 | } 58 | 59 | pgs_error pgs_parse_stack_init(pgs_parse_stack* s) { 60 | s->capacity = 8; 61 | s->size = 1; 62 | s->data = (pgs_parse_stack_element*) malloc(sizeof(*(s->data)) * s->capacity); 63 | 64 | if(s->data == NULL) return PGS_MALLOC; 65 | s->data[0].tree = NULL; 66 | s->data[0].state = 1; 67 | 68 | return PGS_NONE; 69 | } 70 | 71 | pgs_error pgs_parse_stack_append(pgs_parse_stack* s, pgs_tree* tree, long int state) { 72 | if(s->capacity == s->size) { 73 | pgs_parse_stack_element* new_elements = 74 | (pgs_parse_stack_element*) realloc( 75 | s->data, sizeof(*new_elements) * s->capacity * 2); 76 | if(new_elements == NULL) return PGS_MALLOC; 77 | s->capacity *= 2; 78 | s->data = new_elements; 79 | } 80 | 81 | s->data[s->size].tree = tree; 82 | s->data[s->size].state = state; 83 | s->size++; 84 | 85 | return PGS_NONE; 86 | } 87 | 88 | pgs_error pgs_parse_stack_append_terminal(pgs_parse_stack* s, pgs_token* t) { 89 | pgs_error error; 90 | long int state; 91 | pgs_tree* tree = pgs_create_tree_terminal(t); 92 | if(tree == NULL) return PGS_MALLOC; 93 | state = parse_state_table[pgs_parse_stack_top_state(s)][t->terminal]; 94 | error = pgs_parse_stack_append(s, tree, state); 95 | if(error) { 96 | pgs_free_tree_terminal(tree); 97 | return error; 98 | } 99 | return PGS_NONE; 100 | } 101 | 102 | pgs_error 
pgs_parse_stack_append_nonterminal(pgs_parse_stack* s, long int id, size_t count) { 103 | size_t i; 104 | pgs_tree** child_array; 105 | pgs_tree* new_tree; 106 | 107 | child_array = (pgs_tree**) malloc(sizeof(*child_array) * count); 108 | new_tree = pgs_create_tree_nonterminal(id, 0); 109 | if(child_array == NULL || new_tree == NULL) { 110 | free(child_array); 111 | return PGS_MALLOC; 112 | } 113 | for(i = 0; i < count; i++) { 114 | child_array[i] = s->data[s->size - count + i].tree; 115 | } 116 | 117 | new_tree->tree_data.nonterminal.nonterminal = id; 118 | new_tree->tree_data.nonterminal.child_count = count; 119 | new_tree->tree_data.nonterminal.children = child_array; 120 | 121 | s->size -= count; 122 | s->data[s->size].tree = new_tree; 123 | s->data[s->size].state = parse_state_table[pgs_parse_stack_top_state(s)][id + 2 + PGS_MAX_TERMINAL]; 124 | s->size++; 125 | 126 | return PGS_NONE; 127 | } 128 | 129 | void pgs_parse_stack_free(pgs_parse_stack* s) { 130 | size_t i; 131 | for(i = 0; i < s->size; i++) { 132 | free(s->data[i].tree); 133 | } 134 | free(s->data); 135 | } 136 | 137 | long int pgs_parse_stack_top_state(pgs_parse_stack* s) { 138 | return s->data[s->size - 1].state; 139 | } 140 | 141 | pgs_tree* pgs_parse_stack_top_tree(pgs_parse_stack* s) { 142 | return s->data[s->size - 1].tree; 143 | } 144 | 145 | #define PGS_PARSE_ERROR(label_name, error_name, code, text) \ 146 | error_name = code; \ 147 | pgs_state_error(s, error_name, text); \ 148 | goto label_name; 149 | 150 | pgs_error pgs_do_parse(pgs_state* s, pgs_token_list* list, pgs_tree** into) { 151 | pgs_error error; 152 | pgs_parse_stack stack; 153 | pgs_tree* top_tree; 154 | long int top_state; 155 | long int tree_table_index; 156 | long int current_token_id; 157 | long int action; 158 | struct pgs_item_s* item; 159 | pgs_token* current_token; 160 | size_t index = 0; 161 | 162 | if((error = pgs_parse_stack_init(&stack))) return error; 163 | while(1) { 164 | current_token_id = 
pgs_token_list_at_id(list, index); 165 | top_tree = pgs_parse_stack_top_tree(&stack); 166 | top_state = pgs_parse_stack_top_state(&stack); 167 | 168 | if(top_tree && 169 | top_tree->variant == PGS_TREE_NONTERMINAL && 170 | parse_final_table[top_tree->tree_data.nonterminal.nonterminal + 1]) 171 | break; 172 | 173 | action = parse_action_table[top_state][current_token_id]; 174 | 175 | if(action == -1) { 176 | PGS_PARSE_ERROR(error_label, error, PGS_BAD_TOKEN, "Unexpected token at position"); 177 | } else if(action == 0) { 178 | current_token = pgs_token_list_at(list, index); 179 | if(index >= (list->token_count)) { 180 | PGS_PARSE_ERROR(error_label, error, PGS_EOF_SHIFT, "Unexpected end of file"); 181 | } 182 | 183 | error = pgs_parse_stack_append_terminal(&stack, current_token); 184 | if(error) goto error_label; 185 | index++; 186 | } else { 187 | item = &items[action - 1]; 188 | error = pgs_parse_stack_append_nonterminal(&stack, item->left_id, item->right_count); 189 | } 190 | } 191 | 192 | if(index != list->token_count) { 193 | PGS_PARSE_ERROR(error_label, error, PGS_BAD_TOKEN, "Unexpected token at position"); 194 | } 195 | 196 | *into = stack.data[stack.size - 1].tree; 197 | stack.size -= 1; 198 | 199 | error_label: 200 | pgs_parse_stack_free(&stack); 201 | return error; 202 | } 203 | 204 | /* == Glue Code == */ 205 | pgs_error pgs_do_all(pgs_state* state, pgs_tree** into, const char* string) { 206 | pgs_error error; 207 | pgs_token_list tokens; 208 | pgs_state_init(state); 209 | *into = NULL; 210 | if((error = pgs_do_lex(state, &tokens, string))) { 211 | if(error == PGS_MALLOC) { 212 | pgs_state_error(state, error, "Failure to allocate memory while lexing"); 213 | } 214 | return error; 215 | } 216 | if((error = pgs_do_parse(state, &tokens, into))) { 217 | if(error == PGS_MALLOC) { 218 | pgs_state_error(state, error, "Failure to allocate memory while lexing"); 219 | } 220 | } 221 | pgs_token_list_free(&tokens); 222 | return error; 223 | } 224 | 
-------------------------------------------------------------------------------- /src/generators/crystal-common/tables.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "ecr" 3 | 4 | module Pegasus::Generators 5 | class CrystalTableGen 6 | def initialize(@prefix : String, @language : Pegasus::Language::LanguageData) 7 | end 8 | 9 | ECR.def_to_s "src/generators/crystal-common/tables.ecr" 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /src/generators/crystal-common/tables.ecr: -------------------------------------------------------------------------------- 1 | module <%= @prefix %> 2 | MAX_TERMINAL = <%= @language.max_terminal %> 3 | LEX_SKIP_TABLE = [ <% @language.lex_skip_table.each do |skip| %> <%= skip %>, <% end %> ] 4 | LEX_FINAL_TABLE = [ <% @language.lex_final_table.each do |final| %> <%= final %>_i64, <% end %> ] 5 | LEX_STATE_TABLE = [<% @language.lex_state_table.each do |state| %> 6 | [ <% state.each do |transition| %> <%= transition %>_i64, <% end %> ],<%- end %> 7 | ] 8 | PARSE_ACTION_TABLE = [<% @language.parse_action_table.each do |state| %> 9 | [ <% state.each do |transition| %> <%= transition %>_i64, <% end %> ],<%- end %> 10 | ] 11 | PARSE_STATE_TABLE = [<% @language.parse_state_table.each do |state| %> 12 | [ <% state.each do |transition| %> <%= transition %>_i64, <% end %> ],<%- end %> 13 | ] 14 | PARSE_FINAL_TABLE = [<% @language.parse_final_table.each do |skip| %> 15 | <%= skip %>,<%- end %> 16 | ] 17 | ITEMS = <% if @language.items.size == 0 %> [] of Tuple(Int64, Int64) <% else %> [<% @language.items.each do |item| %> 18 | { <%= item.head.raw_id %>_i64, <%= item.body.size %>_i64 },<%- end %> 19 | ]<%- end %> 20 | end 21 | -------------------------------------------------------------------------------- /src/generators/crystal/pegasus_crystal.cr: 
-------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "../../pegasus/json.cr" 3 | require "../crystal-common/tables.cr" 4 | require "../generators.cr" 5 | require "option_parser" 6 | require "ecr" 7 | 8 | module Pegasus::Generators::Crystal 9 | include Pegasus::Language 10 | include Pegasus::Generators::Api 11 | 12 | class CrystalContext 13 | property output_module : String 14 | 15 | def initialize(@output_module : String = "Pegasus::Generated") 16 | end 17 | 18 | def add_option(opt_parser) 19 | opt_parser.option_parser.on("-m", 20 | "--module=MODULE", 21 | "Sets the module in generated code") do |m| 22 | @output_module = m 23 | end 24 | end 25 | end 26 | 27 | class LanguageInput < StdInput(LanguageData) 28 | def process(opt_parser) : LanguageData 29 | LanguageData.from_json STDIN 30 | end 31 | end 32 | 33 | class ParserGenerator < FileGenerator(CrystalContext, LanguageData) 34 | def initialize(parent) 35 | super parent, "parser", "parser.cr", "the generated parser file" 36 | end 37 | 38 | def to_s(io) 39 | ECR.embed "src/generators/crystal/pegasus_crystal_template.ecr", io 40 | end 41 | end 42 | end 43 | 44 | include Pegasus::Generators::Crystal 45 | 46 | parser = PegasusOptionParser(CrystalContext, LanguageData).new LanguageInput.new 47 | ParserGenerator.new(parser) 48 | parser.run 49 | -------------------------------------------------------------------------------- /src/generators/crystal/pegasus_crystal_template.ecr: -------------------------------------------------------------------------------- 1 | <%= Pegasus::Generators::CrystalTableGen.new(context.output_module, input!).to_s %> 2 | 3 | module <%= context.output_module %> 4 | extend self 5 | 6 | abstract class Tree 7 | abstract def table_index 8 | end 9 | 10 | class NonterminalTree < Tree 11 | getter nonterminal_id : Int64 12 | getter children : Array(Tree) 13 | 14 | def initialize(@nonterminal_id, @children = [] of Tree) 15 
| end 16 | 17 | def table_index 18 | nonterminal_id + 1 + MAX_TERMINAL + 1 19 | end 20 | 21 | def name 22 | case nonterminal_id<% input!.nonterminals.each do |nt| %> 23 | when <%= nt[1].raw_id %>_i64 24 | <%= nt[0].dump -%> 25 | <%- end %> 26 | else 27 | "???" 28 | end 29 | end 30 | end 31 | 32 | class TerminalTree < Tree 33 | getter terminal_id : Int64 34 | getter string : String 35 | 36 | def initialize(@terminal_id, @string) 37 | end 38 | 39 | def table_index 40 | terminal_id + 1 41 | end 42 | end 43 | 44 | class Token 45 | getter terminal_id : Int64 46 | getter string : String 47 | 48 | def initialize(@terminal_id, @string) 49 | end 50 | end 51 | 52 | def lex(string) 53 | index = 0 54 | tokens = [] of Token 55 | bytes = string.bytes 56 | 57 | while index < bytes.size 58 | start_index = index 59 | last_match_index = -1 60 | last_pattern = -1_i64 61 | state = 1 62 | 63 | while index < bytes.size 64 | state = LEX_STATE_TABLE[state][bytes[index]] 65 | id = LEX_FINAL_TABLE[state] 66 | 67 | break if state == 0 68 | index += 1 69 | next if id == 0 70 | 71 | last_match_index = index - 1 72 | last_pattern = id 73 | end 74 | 75 | raise "Invalid character #{bytes[start_index].to_s.dump_unquoted} at position #{start_index}" if last_match_index == -1 76 | next if LEX_SKIP_TABLE[last_pattern] 77 | tokens << Token.new(last_pattern - 1, string[start_index..last_match_index]) 78 | end 79 | 80 | return tokens 81 | end 82 | 83 | def parse(tokens) 84 | tree_stack = [ ] of Tree 85 | state_stack = [ 1_i64 ] 86 | index = 0 87 | 88 | loop do 89 | break if tree_stack.last?.try(&.as?(NonterminalTree)).try(&.nonterminal_id) == 0 90 | token = tokens[index]? 
91 | action = PARSE_ACTION_TABLE[state_stack.last][token.try(&.terminal_id.+(1)) || 0_i64] 92 | raise "Invalid token #{token.try &.string.dump || "EOF"}" if action == -1 93 | 94 | if action == 0 95 | raise "Unexpected end of file" unless token 96 | tree_stack << TerminalTree.new token.terminal_id, token.string 97 | index += 1 98 | else 99 | item = ITEMS[action - 1] 100 | tree = NonterminalTree.new item[0] 101 | 102 | item[1].times do 103 | tree.children.insert 0, tree_stack.pop 104 | state_stack.pop 105 | end 106 | 107 | tree_stack << tree 108 | end 109 | 110 | state_stack << PARSE_STATE_TABLE[state_stack.last][tree_stack.last.table_index] 111 | end 112 | raise "Invalid token #{tokens[index].string.dump}" if index < tokens.size 113 | return tree_stack.last 114 | end 115 | 116 | def process(string) 117 | parse(lex(string)) 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /src/generators/crystalsem/pegasus_crystal_template.ecr: -------------------------------------------------------------------------------- 1 | <%= input!.semantics.init %> 2 | 3 | <%= Pegasus::Generators::CrystalTableGen.new(context.output_module, input!.language).to_s %> 4 | 5 | module <%= context.output_module %> 6 | extend self 7 | 8 | alias StackType = <%= input!.semantics.types.values.join "|" %> 9 | 10 | class Token 11 | getter terminal_id : Int64 12 | getter string : String 13 | 14 | def initialize(@terminal_id, @string) 15 | end 16 | end 17 | 18 | def lex(string) 19 | index = 0 20 | tokens = [] of Token 21 | bytes = string.bytes 22 | 23 | while index < bytes.size 24 | start_index = index 25 | last_match_index = -1 26 | last_pattern = -1_i64 27 | state = 1 28 | 29 | while index < bytes.size 30 | state = LEX_STATE_TABLE[state][bytes[index]] 31 | id = LEX_FINAL_TABLE[state] 32 | 33 | break if state == 0 34 | index += 1 35 | next if id == 0 36 | 37 | last_match_index = index - 1 38 | last_pattern = id 39 | end 40 | 41 | raise "Invalid 
character #{bytes[start_index].to_s.dump_unquoted} at position #{start_index}" if last_match_index == -1 42 | next if LEX_SKIP_TABLE[last_pattern] 43 | tokens << Token.new(last_pattern - 1, string[start_index..last_match_index]) 44 | end 45 | 46 | return tokens 47 | end 48 | 49 | def parse(tokens) 50 | temp = uninitialized StackType 51 | value_stack = [ ] of StackType 52 | state_stack = [ 1_i64 ] 53 | index = 0 54 | 55 | loop do 56 | token = tokens[index]? 57 | action = PARSE_ACTION_TABLE[state_stack.last][token.try(&.terminal_id.+(1)) || 0_i64] 58 | raise "Invalid token #{token.try &.string.dump || "EOF"}" if action == -1 59 | 60 | if action == 0 61 | raise "Unexpected end of file" unless token 62 | index += 1 63 | value_stack << token 64 | state_stack << PARSE_STATE_TABLE[state_stack.last][token.terminal_id + 1] 65 | else 66 | item = ITEMS[action - 1] 67 | 68 | case action - 1 69 | <%- input!.semantics.actions.each do |k, v| -%> 70 | when <%= k %> 71 | <%= input!.format_item(k, v) %> 72 | <%- end -%> 73 | end 74 | 75 | value_stack.pop item[1] 76 | state_stack.pop item[1] 77 | value_stack << temp 78 | 79 | break if PARSE_FINAL_TABLE[item[0]+1] 80 | state_stack << PARSE_STATE_TABLE[state_stack.last][item[0] + 2 + MAX_TERMINAL] 81 | end 82 | end 83 | raise "Invalid token #{tokens[index].string.dump}" if index < tokens.size 84 | return value_stack.last 85 | end 86 | 87 | def process(string) 88 | parse(lex(string)) 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /src/generators/crystalsem/pegasus_crystalsem.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "../../pegasus/json.cr" 3 | require "../../pegasus/semantics.cr" 4 | require "../crystal-common/tables.cr" 5 | require "../generators.cr" 6 | require "option_parser" 7 | require "ecr" 8 | 9 | module Pegasus::Generators::CrystalSem 10 | include Pegasus::Language 11 | 
include Pegasus::Generators::Api 12 | include Pegasus::Semantics 13 | 14 | class CrystalContext 15 | property output_module : String 16 | 17 | def initialize(@output_module : String = "Pegasus::Generated") 18 | end 19 | 20 | def add_option(opt_parser) 21 | opt_parser.option_parser.on("-m", 22 | "--module=MODULE", 23 | "Sets the module in generated code") do |m| 24 | @output_module = m 25 | end 26 | end 27 | end 28 | 29 | class GeneratorInput 30 | property language : LanguageData 31 | property semantics : SemanticsData 32 | 33 | def initialize(@language, @semantics) 34 | end 35 | 36 | def format_item(index, code) 37 | item = @language.items[index] 38 | 39 | unless head_type = @semantics.nonterminal_types[item.head]? 40 | raise_general "no type specified for nonterminal" 41 | end 42 | code = code.gsub "$out", "temp" 43 | 44 | item.body.each_with_index do |element, i| 45 | data_var = "value_stack[-1-#{item.body.size - 1 - i}]" 46 | case element 47 | when Pegasus::Elements::TerminalId 48 | data_var += ".as(Token)" 49 | code = code.gsub "$#{i}", "(" + data_var + ")" 50 | when Pegasus::Elements::NonterminalId 51 | next unless name = @semantics.nonterminal_types[element] 52 | data_var += ".as(#{@semantics.types[name]})" 53 | code = code.gsub "$#{i}", "(" + data_var + ")" 54 | end 55 | end 56 | 57 | return code 58 | end 59 | end 60 | 61 | class LanguageInput < FileInput(LanguageData) 62 | def initialize 63 | super "language", "the grammar file" 64 | end 65 | 66 | def process(opt_parser, file) : LanguageData 67 | LanguageData.from_json file 68 | end 69 | end 70 | 71 | class FullInput < FileInput(GeneratorInput) 72 | def initialize(@language_input : Input(LanguageData)) 73 | super "actions", "the semantic actions file" 74 | end 75 | 76 | def process(opt_parser, file) : GeneratorInput 77 | language_data = @language_input.process(opt_parser) 78 | semantics_data = SemanticsData.new file.gets_to_end, "Token", language_data 79 | GeneratorInput.new(language_data,semantics_data) 80 
| end 81 | 82 | def add_option(opt_parser) 83 | @language_input.add_option(opt_parser) 84 | super opt_parser 85 | end 86 | end 87 | 88 | class SourceGenerator < FileGenerator(CrystalContext, GeneratorInput) 89 | def initialize(parent) 90 | super parent, "code", "parser.cr", "the parser source code file" 91 | end 92 | 93 | def to_s(io) 94 | ECR.embed "src/generators/crystalsem/pegasus_crystal_template.ecr", io 95 | end 96 | end 97 | end 98 | 99 | include Pegasus::Generators::CrystalSem 100 | 101 | parser = PegasusOptionParser(CrystalContext, GeneratorInput).new FullInput.new(LanguageInput.new) 102 | SourceGenerator.new(parser) 103 | parser.run 104 | -------------------------------------------------------------------------------- /src/generators/csem/pegasus_c_header_template.ecr: -------------------------------------------------------------------------------- 1 | <%= {{ read_file "src/generators/c-common/standard_header.h" }} %> 2 | 3 | /* == Nonterminal ID Definitions == */ 4 | <% input!.language.nonterminals.each do |name, value| -%> 5 | #define PGS_NONTERMINAL_<%= name.underscore.upcase %> <%= value.raw_id %> 6 | <% end -%> 7 | 8 | /* == Parsing Definitions == */ 9 | /** 10 | * A value that can exist on the pegasus stack. 11 | * The possible values of this union depend on the type 12 | * assigned to the nonterminals. 
13 | */ 14 | union pgs_stack_value_u { 15 | <% input!.semantics.types.each do |k, v| %><%= v %> <%= k %>;<% end %> 16 | }; 17 | 18 | <%= {{ read_file "src/generators/csem/sem_header.h" }} %> 19 | -------------------------------------------------------------------------------- /src/generators/csem/pegasus_c_template.ecr: -------------------------------------------------------------------------------- 1 | 2 | /* == User Code == */ 3 | <%= input!.semantics.init %> 4 | 5 | <%= Pegasus::Generators::CTableGen.new(input!.language).to_s %> 6 | 7 | <%= {{ read_file "src/generators/c-common/standard_source.c" }} %> 8 | 9 | <%= {{ read_file "src/generators/csem/sem_source.c" }} %> 10 | 11 | pgs_error pgs_do_parse(pgs_state* s, pgs_token_list* list, pgs_stack_value* into, const char* src) { 12 | pgs_error error; 13 | pgs_parse_stack stack; 14 | long int top_state; 15 | long int current_token_id; 16 | long int action; 17 | long int nonterminal; 18 | size_t index = 0; 19 | pgs_stack_value temp; 20 | 21 | if((error = pgs_parse_stack_init(&stack))) return error; 22 | while(1) { 23 | current_token_id = pgs_token_list_at_id(list, index); 24 | top_state = pgs_parse_stack_top_state(&stack); 25 | action = parse_action_table[top_state][current_token_id]; 26 | 27 | if(action == -1) { 28 | PGS_PARSE_ERROR(error_label, error, PGS_BAD_TOKEN, "Unexpected token at position"); 29 | } else if(action == 0) { 30 | temp.token = pgs_token_list_at(list, index); 31 | if(index >= (list->token_count)) { 32 | PGS_PARSE_ERROR(error_label, error, PGS_EOF_SHIFT, "Unexpected end of file"); 33 | } 34 | 35 | error = pgs_parse_stack_append(&stack, &temp, parse_state_table[top_state][temp.token->terminal]); 36 | if(error) goto error_label; 37 | index++; 38 | } else { 39 | switch(action - 1) { 40 | <%- input!.semantics.actions.each do |k, v| -%> 41 | case <%= k %>: <%= input!.format_item(k, v) %> 42 | <%- end -%> 43 | default: break; 44 | } 45 | nonterminal = items[action - 1].left_id; 46 | stack.size -= 
items[action - 1].right_count; 47 | top_state = pgs_parse_stack_top_state(&stack); 48 | error = pgs_parse_stack_append(&stack, &temp, parse_state_table[top_state][nonterminal + 2 + PGS_MAX_TERMINAL]); 49 | if(parse_final_table[nonterminal + 1]) goto after_loop; 50 | } 51 | } 52 | after_loop: 53 | 54 | if(index != list->token_count) { 55 | PGS_PARSE_ERROR(error_label, error, PGS_BAD_TOKEN, "Unexpected token at position"); 56 | } 57 | 58 | *into = stack.data[stack.size - 1].value; 59 | stack.size -= 1; 60 | 61 | error_label: 62 | pgs_parse_stack_free(&stack); 63 | return error; 64 | } 65 | -------------------------------------------------------------------------------- /src/generators/csem/pegasus_csem.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "../../pegasus/json.cr" 3 | require "../../pegasus/semantics.cr" 4 | require "../c-common/tables.cr" 5 | require "../generators.cr" 6 | require "option_parser" 7 | require "ecr" 8 | 9 | module Pegasus::Generators::CSem 10 | include Pegasus::Language 11 | include Pegasus::Generators::Api 12 | include Pegasus::Semantics 13 | 14 | class CContext 15 | def add_option(opt_parser) 16 | end 17 | end 18 | 19 | class GeneratorInput 20 | property language : LanguageData 21 | property semantics : SemanticsData 22 | 23 | def initialize(@language, @semantics) 24 | end 25 | 26 | def format_item(index, code) 27 | item = @language.items[index] 28 | 29 | unless head_type = @semantics.nonterminal_types[item.head]? 30 | raise_general "no type specified for nonterminal" 31 | end 32 | code = code.gsub "$out", "temp." 
+ head_type 33 | 34 | item.body.each_with_index do |element, i| 35 | data_var = "stack.data[stack.size - 1 - #{item.body.size - 1 - i}].value" 36 | case element 37 | when Pegasus::Elements::TerminalId 38 | data_var += ".token" 39 | code = code.gsub "$#{i}", "(" + data_var + ")" 40 | when Pegasus::Elements::NonterminalId 41 | next unless name = @semantics.nonterminal_types[element] 42 | data_var += "." + name 43 | code = code.gsub "$#{i}", "(" + data_var + ")" 44 | end 45 | end 46 | 47 | return "{ { #{code} } break; }" 48 | end 49 | end 50 | 51 | class LanguageInput < FileInput(LanguageData) 52 | def initialize 53 | super "language", "the grammar file" 54 | end 55 | 56 | def process(opt_parser, file) : LanguageData 57 | LanguageData.from_json file 58 | end 59 | end 60 | 61 | class FullInput < FileInput(GeneratorInput) 62 | def initialize(@language_input : Input(LanguageData)) 63 | super "actions", "the semantic actions file" 64 | end 65 | 66 | def process(opt_parser, file) : GeneratorInput 67 | language_data = @language_input.process(opt_parser) 68 | semantics_data = SemanticsData.new file.gets_to_end, "pgs_token*", language_data 69 | GeneratorInput.new(language_data,semantics_data) 70 | end 71 | 72 | def add_option(opt_parser) 73 | @language_input.add_option(opt_parser) 74 | super opt_parser 75 | end 76 | end 77 | 78 | class HeaderGenerator < FileGenerator(CContext, GeneratorInput) 79 | def initialize(parent) 80 | super parent, "header", "parser.h", "the parser header file" 81 | end 82 | 83 | def to_s(io) 84 | ECR.embed "src/generators/csem/pegasus_c_header_template.ecr", io 85 | end 86 | end 87 | 88 | class SourceGenerator < FileGenerator(CContext, GeneratorInput) 89 | def initialize(parent) 90 | super parent, "code", "parser.c", "the parser source code file" 91 | end 92 | 93 | def to_s(io) 94 | io << "#include \"#{@parent.output_file_names["header"]}\"\n" 95 | ECR.embed "src/generators/csem/pegasus_c_template.ecr", io 96 | end 97 | end 98 | end 99 | 100 | include 
Pegasus::Generators::CSem 101 | 102 | parser = PegasusOptionParser(CContext, GeneratorInput).new FullInput.new(LanguageInput.new) 103 | HeaderGenerator.new(parser) 104 | SourceGenerator.new(parser) 105 | parser.run 106 | -------------------------------------------------------------------------------- /src/generators/csem/sem_header.h: -------------------------------------------------------------------------------- 1 | /** 2 | * An element on the parse stack, which holds 3 | * both a tree node and a state. In theory, 4 | * the stack is actually items followed by states, 5 | * but since one always comes after the other, 6 | * and since both need to be looked up fast, 7 | * we put them on a stack in parallel. 8 | */ 9 | struct pgs_parse_stack_element_s { 10 | /** The value on the stack */ 11 | union pgs_stack_value_u value; 12 | /** The state on the stack */ 13 | long int state; 14 | }; 15 | 16 | /** 17 | * A parse stack. The PDA automaton 18 | * has to maintain this stack, where it gradually 19 | * assembles a tree. 20 | */ 21 | struct pgs_parse_stack_s { 22 | /** The number of stack elements currently allocated. */ 23 | size_t capacity; 24 | /** The current number of stack elements. */ 25 | size_t size; 26 | /** The stack element array. */ 27 | struct pgs_parse_stack_element_s* data; 28 | }; 29 | 30 | typedef union pgs_stack_value_u pgs_stack_value; 31 | typedef struct pgs_parse_stack_element_s pgs_parse_stack_element; 32 | typedef struct pgs_parse_stack_s pgs_parse_stack; 33 | 34 | /** 35 | * Initialzies a parse stack. 36 | * @param s the parse stack to initialize. 37 | * @return the result of the initialization. 38 | */ 39 | pgs_error pgs_parse_stack_init(pgs_parse_stack* s); 40 | /** 41 | * Appends (pushes) a new value and state to the stack. 42 | * @param s the stack to append to. 43 | * @param v the value to append. 44 | * @param state the state to append. 45 | * @return the result of the append. 
46 | */ 47 | pgs_error pgs_parse_stack_append(pgs_parse_stack* s, pgs_stack_value* v, long int state); 48 | /** 49 | * Gets the state on the top of the stack. 50 | * @param s the stack for which to get a state. 51 | * @return the state on the top of the stack. 52 | */ 53 | long int pgs_parse_stack_top_state(pgs_parse_stack* s); 54 | /** 55 | * Gets the value on the top of the stack. 56 | * @param s the stack for which to get a value. 57 | * @return the value on the top of the stack. 58 | */ 59 | pgs_stack_value* pgs_parse_stack_top_value(pgs_parse_stack* s); 60 | /** 61 | * Frees a parse stack. 62 | * @param s the stack to free. 63 | */ 64 | void pgs_parse_stack_free(pgs_parse_stack* s); 65 | /** 66 | * Takes the given tokens, and attempts to convert them into a value. 67 | * @param s the state used for storing errors. 68 | * @param list the list of tokens, already filled. 69 | * @param into the value pointer pointer into which a new value will be stored. 70 | * @param src the original string, for the user-defined actions. 71 | * @return the error, if any, that occured. 72 | */ 73 | pgs_error pgs_do_parse(pgs_state* s, pgs_token_list* list, pgs_stack_value* into, const char* src); 74 | 75 | /* == Glue == */ 76 | /** 77 | * Attempts to parse tokens from the given string into the given value. 78 | * @param state the state to initialize with error information, if necessary. 79 | * @param into the value to build into. 80 | * @param string the string from which to read. 81 | * @return the error, if any, that occured. 
82 | */ 83 | pgs_error pgs_do_all(pgs_state* state, pgs_stack_value* into, const char* string); 84 | -------------------------------------------------------------------------------- /src/generators/csem/sem_source.c: -------------------------------------------------------------------------------- 1 | /* == Glue Code == */ 2 | 3 | pgs_error pgs_do_all(pgs_state* state, pgs_stack_value* into, const char* string) { 4 | pgs_error error; 5 | pgs_token_list tokens; 6 | pgs_state_init(state); 7 | if((error = pgs_do_lex(state, &tokens, string))) { 8 | if(error == PGS_MALLOC) { 9 | pgs_state_error(state, error, "Failure to allocate memory while lexing"); 10 | } 11 | return error; 12 | } 13 | if((error = pgs_do_parse(state, &tokens, into, string))) { 14 | if(error == PGS_MALLOC) { 15 | pgs_state_error(state, error, "Failure to allocate memory while lexing"); 16 | } 17 | } 18 | pgs_token_list_free(&tokens); 19 | return error; 20 | } 21 | 22 | /* == Parsing Code == */ 23 | 24 | pgs_error pgs_parse_stack_init(pgs_parse_stack* s) { 25 | s->capacity = 8; 26 | s->size = 1; 27 | s->data = (pgs_parse_stack_element*) malloc(sizeof(*(s->data)) * s->capacity); 28 | 29 | if(s->data == NULL) return PGS_MALLOC; 30 | s->data[0].state = 1; 31 | 32 | return PGS_NONE; 33 | } 34 | 35 | pgs_error pgs_parse_stack_append(pgs_parse_stack* s, pgs_stack_value* v, long int state) { 36 | if(s->capacity == s->size) { 37 | pgs_parse_stack_element* new_elements = 38 | (pgs_parse_stack_element*) realloc( 39 | s->data, sizeof(*new_elements) * s->capacity * 2); 40 | if(new_elements == NULL) return PGS_MALLOC; 41 | s->capacity *= 2; 42 | s->data = new_elements; 43 | } 44 | 45 | s->data[s->size].value = *v; 46 | s->data[s->size].state = state; 47 | s->size++; 48 | 49 | return PGS_NONE; 50 | } 51 | 52 | void pgs_parse_stack_free(pgs_parse_stack* s) { 53 | size_t i; 54 | for(i = 0; i < s->size; i++) { 55 | /* Maybe eventually free individual union values */ 56 | } 57 | free(s->data); 58 | } 59 | 60 | long int 
pgs_parse_stack_top_state(pgs_parse_stack* s) { 61 | return s->data[s->size - 1].state; 62 | } 63 | 64 | pgs_stack_value* pgs_parse_stack_top_value(pgs_parse_stack* s) { 65 | return &s->data[s->size - 1].value; 66 | } 67 | 68 | #define PGS_PARSE_ERROR(label_name, error_name, code, text) \ 69 | error_name = code; \ 70 | pgs_state_error(s, error_name, text); \ 71 | goto label_name; 72 | 73 | -------------------------------------------------------------------------------- /src/generators/generators.cr: -------------------------------------------------------------------------------- 1 | require "../pegasus/language_def.cr" 2 | require "option_parser" 3 | 4 | module Pegasus::Generators::Api 5 | # Class that specifies the program's output mode. 6 | # The idea is to generalize behaviors such as 7 | # merging into a single file or printing out to STDOUT. 8 | # The `#output` method takes in a parser and, as side effect, 9 | # should emit the output of its various `FileGenerator` classes. 10 | abstract class OutputMode 11 | # Output the content of the given `opt_parser`. 12 | abstract def output(opt_parser) 13 | end 14 | 15 | # Output mode that produces individual files 16 | # as specified by the `FileGenerator` classes. 17 | class FilesOutputMode < OutputMode 18 | def output(opt_parser) 19 | opt_parser.file_gens.each do |gen| 20 | file = File.open(opt_parser.output_file_names[gen.name], "w") 21 | gen.to_s(file) 22 | file.close 23 | end 24 | end 25 | end 26 | 27 | # Output mode that produces a single file. 28 | class FileOutputMode < OutputMode 29 | # Creates a new file output mode that generates a file with the given name. 
30 | def initialize(@filename : String) 31 | end 32 | 33 | def output(opt_parser) 34 | file = File.open(@filename, "w") 35 | opt_parser.file_gens.each do |gen| 36 | gen.to_s(file) 37 | end 38 | file.close 39 | end 40 | end 41 | 42 | # Output mode that prints all the generated files to STDOUT, 43 | # in the order they were added to the `PegasusOptionParser` 44 | class StdOutputMode < OutputMode 45 | def output(opt_parser) 46 | opt_parser.file_gens.each do |gen| 47 | gen.to_s(STDOUT) 48 | end 49 | end 50 | end 51 | 52 | # A generalization of data input. Subclasses 53 | # such as `StdInput` and `FileInput` provide 54 | # a way to read grammar / semantics files from 55 | # various sources. The `#add_option` method registers 56 | # command-line option(s) for the user to configure. 57 | abstract class Input(I) 58 | # Register this input method's options 59 | # with the given `PegasusOptionParser`. 60 | def add_option(opt_parser) 61 | end 62 | 63 | # Read input of type `I`. 64 | abstract def process(opt_parser) : I 65 | end 66 | 67 | # Input method that reads directly from `STDIN`. 68 | # This technically doesn't add any new methods, 69 | # but makes code more clear. 70 | abstract class StdInput(I) < Input(I) 71 | end 72 | 73 | # Input method that reads from a file, the 74 | # name of which is specified on the command line. 75 | abstract class FileInput(I) < Input(I) 76 | # The internal name of this input. The `PegasusOptionParser` 77 | # will associated a file name with this string. 78 | property name : String 79 | # The user-friendly description of the input 80 | # that will be shown on the help screen. 81 | property description : String 82 | # The name of the file to read from. 83 | property filename : String? 84 | 85 | # Create a new file input with the given internal name 86 | # and user-friendly description. 
87 | def initialize(@name, @description) 88 | end 89 | 90 | def process(opt_parser) : I 91 | file = File.open(@filename.not_nil!, "r") 92 | result = process(opt_parser, file) 93 | file.close 94 | return result 95 | end 96 | 97 | def add_option(opt_parser) 98 | opt_parser.option_parser.on("-#{name[0].downcase} FILE", 99 | "--input-#{name}=FILE", 100 | "Sets #{description}") do |file| 101 | @filename = file 102 | end 103 | end 104 | 105 | # Read a value of type `I` from a file. 106 | abstract def process(opt_parser, file) : I 107 | end 108 | 109 | # High-level class for constructing parser generators 110 | # that are configurable from the command line. 111 | # 112 | # This class uses `Input` to read a value of 113 | # type `I`, then uses the registered `FileGenerator` instances 114 | # to produce output via an `OutputMode`. All of these 115 | # listed classes are registered with Crystal's native `OptionParser`, 116 | # which serves to provide a user with configuration options. 117 | # 118 | # The `#output_file_names` and `#input_file_names` hashes store 119 | # the names of target output files and input files, respectively. 120 | # These are updated by the `Input` and `FileGenerator`s, as well 121 | # as through user-supplied command-line options. 122 | class PegasusOptionParser(C, I) 123 | # The context class (which must implement the `add_option` method) 124 | # is included with the generator to store and retrieve 125 | # parser-specific options. `FileGenerator#context` is used within 126 | # a generator to access this value. 127 | getter context : C 128 | # The input gathered from the `Input` class. This starts 129 | # uninitialized, but is set partway through `#run`. 130 | getter input : I? 131 | # The list of registered file generators. 132 | getter file_gens : Array(FileGenerator(C, I)) 133 | # The Crystal-native `OptionParser` used to actually 134 | # print options to the console. 
135 | getter option_parser : OptionParser 136 | # Hash that stores the configured file names of the various 137 | # `FileGenerator` instances, associated with their internal names. 138 | # The file names are kept outside their generators so that 139 | # two generators that depend on one another (like a source file 140 | # including a header file) can know each other's names. 141 | getter output_file_names : Hash(String, String) 142 | 143 | # Create a new `PegasusOptionParser` with the given input method and context. 144 | def initialize(@input_method : Input(I), @context = C.new) 145 | @output = FilesOutputMode.new 146 | @file_gens = [] of FileGenerator(C, I) 147 | @option_parser = OptionParser.new 148 | @output_file_names = {} of String => String 149 | 150 | @input_method.add_option(self) 151 | @context.add_option(self) 152 | @option_parser.on("-S", 153 | "--stdout", 154 | "Sets output mode to standard output") do 155 | @output = StdOutputMode.new 156 | end 157 | @option_parser.on("-s FILE", 158 | "--single-file=FILE", 159 | "Sets output mode to single file.") do |file| 160 | @output = FileOutputMode.new file 161 | end 162 | @option_parser.on("-f PREFIX", 163 | "--file-prefix=PREFIX", 164 | "Sets the file prefix for generated files.") do |p| 165 | @output_file_names.each do |k,v| 166 | @output_file_names[k] = p + v 167 | end 168 | end 169 | @option_parser.on("-H", "--help", "Show this text") do 170 | puts @option_parser 171 | exit 172 | end 173 | end 174 | 175 | # Run the command line program, and the constructed generator. 176 | def run 177 | @option_parser.parse 178 | @input = @input_method.process(self) 179 | @output.output(self) 180 | end 181 | end 182 | 183 | # A base class for a source file generator. 184 | # This class is meant to be extended by each individual 185 | # file generator that uses `ECR`, and thus provides 186 | # the methods `#input!` and `#context` to make 187 | # the genertor's input and context available inside 188 | # the template file. 
189 |   class FileGenerator(C, I)
190 |     # The parser program to which this generator belongs,
191 |     # used to retrieve input and context and to configure
192 |     # and retrieve file names.
193 |     property parent : PegasusOptionParser(C, I)
194 |     # The internal name of this file generator,
195 |     # which will be associated with a filename by the `PegasusOptionParser`.
196 |     property name : String
197 |     # The default filename this generator will write to.
198 |     property default_filename : String
199 |     # The user-friendly description of this generator.
200 |     property description : String
201 | 
202 |     # Creates a new file generator attached to the given `PegasusOptionParser`,
203 |     # with the given name, default filename, and description.
204 |     def initialize(@parent, @name, @default_filename, @description)
205 |       @parent.file_gens << self
206 |       add_option(@parent)
207 |       @parent.output_file_names[@name] = @default_filename
208 |     end
209 | 
210 |     # Adds required options to the given option parser.
211 |     def add_option(opt_parser)
212 |       opt_parser.option_parser.on("-#{name[0].downcase} FILE",
213 |         "--#{name}-file=FILE",
214 |         "Sets output target for #{description}") do |n|
215 |         opt_parser.output_file_names[name] = n
216 |       end
217 |     end
218 | 
219 |     # Convenience method to access the parser generator input from
220 |     # an ECR template.
221 |     def input!
222 |       @parent.input.not_nil!
223 |     end
224 | 
225 |     # Convenience method to access the parser context from
226 |     # an ECR template.
227 | def context 228 | @parent.context 229 | end 230 | end 231 | end 232 | -------------------------------------------------------------------------------- /src/pegasus.cr: -------------------------------------------------------------------------------- 1 | require "./pegasus/language_def.cr" 2 | require "./pegasus/json.cr" 3 | require "./pegasus/error.cr" 4 | 5 | begin 6 | grammar = STDIN.gets_to_end 7 | definition = Pegasus::Language::LanguageDefinition.new grammar 8 | data = Pegasus::Language::LanguageData.new definition 9 | data.to_json(STDOUT) 10 | rescue e : Pegasus::Error::PegasusException 11 | e.print(STDERR) 12 | end 13 | -------------------------------------------------------------------------------- /src/pegasus/automaton.cr: -------------------------------------------------------------------------------- 1 | module Pegasus 2 | # This module contains automata-related code. Since Pegasus uses 3 | # Deterministic, nondeterministic, and push-down automata, there is a lot 4 | # of common code. This module is for the common code. 5 | module Automata 6 | # A generic state for an automaton, with transitions 7 | # labeled by T and values of V. 8 | class State(V, T) 9 | # The unique ID of the state. 10 | getter id : Int64 11 | # The additional data the state holds. 12 | getter data : V 13 | # The transitions from this state to other states. 14 | getter transitions : Hash(T, self) 15 | 16 | # Creates a new state with the given ID, data, and transitions. 17 | def initialize(*, @id, @data, @transitions = Hash(T, self).new) 18 | end 19 | end 20 | 21 | # A generic automaton to represent common operations on the 22 | # different kinds of automata. 23 | class Automaton(V, T) 24 | # The states that this automaton has. 25 | getter states : Set(State(V, T)) 26 | # The state ID to use for the next state. 27 | getter last_id : Int64 28 | # The start state. 29 | property start : State(V, T)? 30 | 31 | # Creates a new automaton. 
32 | def initialize 33 | @last_id = 0_i64 34 | @states = Set(State(V, T)).new 35 | @start = nil 36 | end 37 | 38 | # Creates a new state for the given data. 39 | def state_for(*, data : V) 40 | new_state = State(V, T).new id: @last_id, data: data 41 | @last_id += 1 42 | @states << new_state 43 | return new_state 44 | end 45 | end 46 | 47 | # Another generic automaton. Since many automatons created by 48 | # pegasus do not like two nodes with the same data, 49 | # this class overries the `#state_for` function to return 50 | # an existing state with the given data if such a state exists. 51 | class UniqueAutomaton(V, T) < Automaton(V, T) 52 | # Creates a new UniqueAutomaton. 53 | def initialize 54 | super 55 | @memorized = Hash(V, State(V, T)).new 56 | end 57 | 58 | # Creates a new state for the given data, 59 | # or returns an existing state with the data 60 | # if one exists. 61 | def state_for(*, data : V) 62 | return @memorized[data] if @memorized.has_key? data 63 | new_state = super(data: data) 64 | @memorized[data] = new_state 65 | return new_state 66 | end 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /src/pegasus/dfa.cr: -------------------------------------------------------------------------------- 1 | require "./automaton.cr" 2 | require "./nfa.cr" 3 | 4 | module Pegasus 5 | # This module is for deterministic finite automata. 6 | # DFAs are used in Pegasus to describe the tokenizer state machine. 7 | module Dfa 8 | alias DState = Automata::State(Set(Nfa::NState), UInt8) 9 | 10 | # A deterministic finite automaton, whose dtransitions 11 | # are marked by bytes and whose data is actually the collection 12 | # of states this state represents in the source `Pegasus::Nfa::Nfa`. 
13 | class Dfa < Automata::UniqueAutomaton(Set(Nfa::NState), UInt8) 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /src/pegasus/elements.cr: -------------------------------------------------------------------------------- 1 | module Pegasus 2 | # This module contains "elements" which are part of a production. Generally, elements 3 | # are terminals and nonterminals. Additionaly, special-case elements for certain 4 | # algorithms are contained here (the EOF element and the Empty element) 5 | module Elements 6 | # An item that can be in a lookahead item's follow set. 7 | # This could be a terminal ID, or the special reserved EOF and "empty" (epsilon) 8 | # elements. 9 | abstract class LookaheadElement 10 | end 11 | 12 | # A lookahead element which can be used as in index to a lookup table. 13 | abstract class IndexableElement < LookaheadElement 14 | # Gets the table index of this element. 15 | abstract def table_index : Int64 16 | end 17 | 18 | # The special-case empty (epsilon) element used for follow set computation. 19 | class EmptyElement < LookaheadElement 20 | def ==(other : EmptyElement) 21 | return true 22 | end 23 | 24 | def ==(other : LookaheadElement) 25 | return false 26 | end 27 | 28 | def hash(hasher) 29 | hasher 30 | end 31 | end 32 | 33 | # The EOF element. Represents the end of the file, and is not matched as a token by the lexer. 34 | class EofElement < IndexableElement 35 | def table_index : Int64 36 | return 0_i64 37 | end 38 | 39 | def ==(other : EofElement) 40 | return true 41 | end 42 | 43 | def ==(other : LookaheadElement) 44 | return false 45 | end 46 | 47 | def hash(hasher) 48 | hasher 49 | end 50 | end 51 | 52 | # A terminal, as specified by the user. This is __not__ a special case element, and one terminal ID 53 | # exists for every token the user registers. 
54 | class TerminalId < IndexableElement 55 | def initialize(@id : Int64) 56 | end 57 | 58 | def table_index : Int64 59 | return @id + 1 60 | end 61 | 62 | # Gets the raw ID of this terminal. This should be used with caution. 63 | def raw_id 64 | return @id 65 | end 66 | 67 | def ==(other : TerminalId) 68 | return @id == other.@id 69 | end 70 | 71 | def ==(other : LookaheadElement) 72 | return false 73 | end 74 | 75 | def hash(hasher) 76 | @id.hash(hasher) 77 | hasher 78 | end 79 | end 80 | 81 | # A nonterminal, as specified by the user. Nonterminals are on the left of production rules (though they can also 82 | # appear on the right). 83 | class NonterminalId 84 | # Creates a new NonterminalId with the given ID. 85 | def initialize(@id : Int64, @start = false) 86 | end 87 | 88 | # Gets the table index of this nonterminal. 89 | def table_index 90 | return @id + 1 91 | end 92 | 93 | # Gets the raw ID of this nonterminal. This should be used with caution. 94 | def raw_id 95 | return @id 96 | end 97 | 98 | # Checks if this nonterminal is a "start" nonterminal (i.e., a potentially top level node) 99 | def start? 100 | return @start 101 | end 102 | 103 | # Compares this nonterminal to another nonterminal. 104 | def ==(other : NonterminalId) 105 | return (@id == other.@id) && (@start == other.@start) 106 | end 107 | 108 | # Creates a hash of this NonterminalId. 109 | def hash(hasher) 110 | @id.hash(hasher) 111 | @start.hash(hasher) 112 | hasher 113 | end 114 | 115 | def to_s(io) 116 | io << "NonterminalId(" << @id << ")" 117 | end 118 | end 119 | end 120 | end 121 | -------------------------------------------------------------------------------- /src/pegasus/error.cr: -------------------------------------------------------------------------------- 1 | require "colorize" 2 | 3 | module Pegasus 4 | # This module contains all the error-related code. 5 | # This includes a custom exception class and context for it. 
6 | module Error 7 | # A context for the custom exception class. 8 | # The idea with context is that it can be attached to exceptions and 9 | # shown as extra information to the user. It's attached rather than 10 | # added via subclassing because some parts of Pegasus code need to be 11 | # able to modify the context, replacing it with more thorough / clear 12 | # info. Instead of straight up copying the exception and changing the field, 13 | # (as well as the way it's displayed to the user), client code can 14 | # remove one bit of context and replace it with a better one. 15 | abstract class ErrorContext 16 | abstract def to_s(io) 17 | end 18 | 19 | # An exception thrown by Pegasus. Unlike Crystal exceptions, which will 20 | # be reported directly to the user without any prettyfication, the Pegasus exception is created to 21 | # display the error information to the user in a clear and pretty way. This includes coloring and 22 | # emphasizing certain sections of the message, and generally presenting them in a user-friendly way. 23 | abstract class PegasusException < Exception 24 | getter context_data : Array(ErrorContext) 25 | 26 | def initialize(@description : String, @context_data = [] of ErrorContext, @internal = false) 27 | super() 28 | end 29 | 30 | # Prints the exception to the given IO. 31 | def print(io) 32 | io << "an error".colorize.red.bold 33 | io << " has occured while " 34 | io << get_location_name.colorize.bold 35 | io << ": " 36 | io << @description 37 | io.puts 38 | 39 | print_extra(io) 40 | 41 | if @internal 42 | io << "This error is " << "internal".colorize.bold << ": this means it is likely " << "not your fault".colorize.bold 43 | io.puts 44 | io.puts "Please report this error to the developer." 45 | end 46 | end 47 | 48 | # Prints the context that the exception has attached. 
49 | def print_extra(io) 50 | @context_data.each do |data| 51 | io << " - " << data 52 | io.puts 53 | end 54 | end 55 | 56 | # Get the "location" of the error, which is used to 57 | # report to the user when in the process the error occured. 58 | abstract def get_location_name 59 | end 60 | 61 | # An exception thrown at some point in the entire lifetime of Pegasus. 62 | # This is very vague, and should be used in cases where it cannot be known 63 | # what the surrounding code is doing at the time. 64 | class GeneralException < PegasusException 65 | def get_location_name 66 | "converting grammar to a parser description" 67 | end 68 | end 69 | 70 | # An exception used to signify that an error occured during grammar parsing. 71 | class GrammarException < PegasusException 72 | def get_location_name 73 | "parsing the grammar definition" 74 | end 75 | end 76 | 77 | # An exception used to signify that an error occured while creating 78 | # Nondeterministic Finite Automata. 79 | class NfaException < PegasusException 80 | def get_location_name 81 | "compiling regular expressions" 82 | end 83 | end 84 | 85 | # An exception used to signify that an error occured while creating 86 | # Deterministic Finite Automata. 87 | class DfaException < PegasusException 88 | def get_location_name 89 | "creating a deterministic finite automaton" 90 | end 91 | end 92 | 93 | # An exception used to signify that an error occured while creating 94 | # Push Down Automata. 95 | class PdaException < PegasusException 96 | def get_location_name 97 | "converting grammar rules into a state machine" 98 | end 99 | end 100 | 101 | # An exception used to signify that an error occured while creating 102 | # the lookup tables necessary for the Pegasus state machine. 103 | class TableException < PegasusException 104 | def get_location_name 105 | "creating lookup tables" 106 | end 107 | end 108 | end 109 | 110 | end 111 | 112 | # Define a raise function from a name and a Pegasus exception class. 
113 | macro define_raise(name, class_name) 114 | def raise_{{name}}(message, context_data = [] of Pegasus::Error::ErrorContext, internal = false) 115 | raise Pegasus::Error::{{class_name}}.new message, 116 | context_data: context_data.map(&.as(Pegasus::Error::ErrorContext)), 117 | internal: internal 118 | end 119 | end 120 | 121 | define_raise(general, GeneralException) 122 | define_raise(grammar, GrammarException) 123 | define_raise(nfa, NfaException) 124 | define_raise(dfa, DfaException) 125 | define_raise(pda, PdaException) 126 | define_raise(table, TableException) 127 | -------------------------------------------------------------------------------- /src/pegasus/grammar.cr: -------------------------------------------------------------------------------- 1 | require "./elements.cr" 2 | require "./items.cr" 3 | require "./pda.cr" 4 | 5 | module Pegasus 6 | # This module holds code related to push down automata, as well 7 | # as other helper code such as items (productions, basically), 8 | # dotted items (productions which know what part of the production 9 | # has already been parsed) and the like. 10 | module Pda 11 | # A Grammar associated with the language, contianing a list of terminals, 12 | # nonterminals, and the context-free production rules given by the `Item` class. 13 | class Grammar 14 | # The items that belong to this grammar. 15 | getter items : Array(Item) 16 | # The terminals that belong to this grammar. 17 | getter terminals : Array(Elements::TerminalId) 18 | # The nonterminals that belong to this grammar. 19 | getter nonterminals : Array(Elements::NonterminalId) 20 | 21 | # Initializes this grammar with the given terminals and nonterminals. 22 | def initialize(@terminals, @nonterminals) 23 | @items = Array(Item).new 24 | end 25 | 26 | # Checks if the given set contains the empty set. This is used for computing 27 | # FIRST and lookahead sets when generating an (LA)LR automaton. 
28 | private def contains_empty(set) 29 | return set.select(&.is_a?(Elements::EmptyElement)).size != 0 30 | end 31 | 32 | # Concatenates a set with another set, and returns whether the size of the set 33 | # has changed. This is useful for "closure algorithms" as described by 34 | # Dick Grune and others in Modern Compiler Design. These algorithms apply 35 | # a rule until the data no longer changes. 36 | private def concat_watching(set, other) 37 | initial_size = set.size 38 | set.concat other 39 | return initial_size != set.size 40 | end 41 | 42 | # Computes the FIRST set of an alternative. The first sets hash is used 43 | # for already computed first sets. The empty alternative is added elsewhere, 44 | # and only contains the SPECIAL_EMPTY terminal. 45 | private def compute_alternative_first(first_sets, alternative) 46 | if !first_sets.has_key? alternative 47 | first = Set(Elements::LookaheadElement).new 48 | first_sets[alternative] = first 49 | else 50 | first = first_sets[alternative] 51 | end 52 | 53 | if alternative.size == 0 54 | return false 55 | end 56 | 57 | start_element = alternative.first 58 | add_first = first_sets[start_element].dup 59 | if contains_empty(first) 60 | tail = alternative[1...alternative.size] 61 | compute_alternative_first(first_sets, tail) 62 | add_first.concat first_sets[tail] 63 | else 64 | add_first = add_first.reject &.is_a?(Elements::EmptyElement) 65 | end 66 | 67 | return concat_watching(first, add_first) 68 | end 69 | 70 | # Computes the first set of every alternative or alternative tail of the given 71 | # item body. 
72 | private def compute_alternatives_first(first_sets, body) 73 | change_occured = false 74 | body.size.times do |time| 75 | change_occured |= compute_alternative_first(first_sets, body[time...body.size]) 76 | end 77 | return change_occured 78 | end 79 | 80 | # Computes the first sets of all the terminals, nonterminals, alternatives, 81 | # and alternative tails by examining the items, terminals, and nonterminals given 82 | # in `#initialize` 83 | private def compute_first 84 | first_sets = Hash(Elements::NonterminalId | Elements::TerminalId | Array(Elements::NonterminalId | Elements::TerminalId), Set(Elements::LookaheadElement)).new 85 | @terminals.each { |t| first_sets[t] = Set(Elements::LookaheadElement) { t } } 86 | @nonterminals.each { |nt| first_sets[nt] = Set(Elements::LookaheadElement).new } 87 | first_sets[[] of Elements::NonterminalId | Elements::TerminalId] = Set(Elements::LookaheadElement) { Elements::EmptyElement.new } 88 | change_occured = true 89 | 90 | while change_occured 91 | change_occured = false 92 | @items.each do |item| 93 | change_occured |= compute_alternatives_first(first_sets, item.body) 94 | change_occured |= concat_watching(first_sets[item.head], first_sets[item.body]) 95 | end 96 | end 97 | 98 | return first_sets 99 | end 100 | 101 | # Gets a lookahead set for the given alternative, using its parent lookahead set. 102 | private def get_lookahead(first_sets, alternative, old_lookahead) 103 | lookahead = first_sets[alternative].dup 104 | if contains_empty(lookahead) 105 | lookahead.concat(old_lookahead) 106 | lookahead = lookahead.reject &.is_a?(Elements::EmptyElement) 107 | end 108 | return lookahead.to_set 109 | end 110 | 111 | # Creates new dotted items that are to be added because the "dot" is on the left on a nonterminal 112 | # in the parent dotted item. The suffix parameter describes all the tokens after the nonterminal, 113 | # which is used for looking up in the FIRST set. 
114 |       private def create_dotted_items(first_sets, nonterminal, suffix, parent_lookahead)
115 |         return @items.select(&.head.==(nonterminal))
116 |           .map { |it| LookaheadItem.new it, get_lookahead(first_sets, suffix, parent_lookahead) }
117 |       end
118 | 
119 |       # Creates new dotted items for every existing dotted item. This may be necessary if the "dot" moved
120 |       # and is now on the left hand of an Elements::NonterminalId, which warrants all the production rules for that nonterminal
121 |       # to be added to the current set (with their lookahead sets computed from scratch).
122 |       private def new_dots(first_sets, dots)
123 |         dots.map do |dot|
124 |           next Set(LookaheadItem).new if dot.index >= dot.item.body.size
125 |           next Set(LookaheadItem).new if dot.item.body[dot.index].is_a?(Elements::LookaheadElement)
126 |           next create_dotted_items(first_sets, dot.item.body[dot.index], dot.item.body[(dot.index+1)...dot.item.body.size], dot.lookahead)
127 |         end.reduce(Set(LookaheadItem).new) do |set, list|
128 |           set.concat list
129 |         end
130 |       end
131 | 
132 |       # Creates all dotted items from the given list of "initial" dotted items.
133 |       private def all_dots(first_sets, dots)
134 |         found_dots = dots.to_set.dup
135 |         while concat_watching(found_dots, new_dots(first_sets, found_dots))
136 |         end
137 |         groups = found_dots.group_by { |dot| { dot.item, dot.index } }
138 |         found_dots = groups.map do |k, v|
139 |           item, index = k
140 |           merged_lookahead = v.map(&.lookahead).reduce(Set(Elements::LookaheadElement).new) { |l, r| l.concat r }
141 |           LookaheadItem.new item, merged_lookahead, index
142 |         end
143 |         return found_dots.to_set
144 |       end
145 | 
146 |       # Gets a set of shifted items for each possible shift-transition
147 |       # from the current state.
148 | private def get_transitions(dotted_items) 149 | return dotted_items.compact_map do |dot| 150 | next nil unless dot.index < dot.item.body.size 151 | next { dot.item.body[dot.index], dot.next_item } 152 | end.reduce(Hash(Elements::NonterminalId | Elements::TerminalId, Set(LookaheadItem)).new) do |hash, kv| 153 | k, v = kv 154 | hash[k] = hash[k]?.try(&.<<(v)) || Set { v } 155 | next hash 156 | end 157 | end 158 | 159 | # Converts an LR(1) PDA to an LALR(1) PDA by merging states with the corresponding bodies, and 160 | # combining the lookahead sets of every matching item. 161 | def create_lalr_pda(lr_pda) 162 | lalr_pda = Pda.new @items 163 | groups = lr_pda.states.group_by { |s| s.data.map { |it| DottedItem.new it.item, it.index }.to_set } 164 | # Since 2+ sets become one, we need to adjust transitions. 165 | states = Hash(typeof(lr_pda.states.first), typeof(lalr_pda.states.first)).new 166 | groups.each do |_, equal_states| 167 | item_groups = equal_states 168 | .flat_map(&.data.each) 169 | .group_by { |it| DottedItem.new it.item, it.index } 170 | merged_items = item_groups.map do |kv| 171 | dotted_item, items = kv 172 | LookaheadItem.new dotted_item.item, items.flat_map(&.lookahead.each).to_set, dotted_item.index 173 | end.to_set 174 | new_state = lalr_pda.state_for data: merged_items 175 | equal_states.each do |state| 176 | states[state] = new_state 177 | end 178 | end 179 | 180 | # Reconnect the new states. 181 | lr_pda.states.each do |state| 182 | new_state = states[state] 183 | state.transitions.each do |e, other| 184 | new_state.transitions[e] = states[other] 185 | end 186 | end 187 | 188 | return lalr_pda 189 | end 190 | 191 | # Create an LR(1) PDA given a start symbol. 
192 | def create_lr_pda 193 | pda = Pda.new @items 194 | first_sets = compute_first 195 | # Set of items starting with the start nonterminal 196 | start_items = @items.select(&.head.start?).map do |it| 197 | LookaheadItem.new it, Set(Elements::LookaheadElement) { Elements::EofElement.new } 198 | end 199 | # Set of all current dotted items 200 | all_start_items = all_dots(first_sets, start_items) 201 | start_state = pda.state_for data: all_start_items 202 | 203 | queue = Set(PState).new 204 | finished = Set(PState).new 205 | 206 | queue << start_state 207 | 208 | while !queue.empty? 209 | state = queue.first 210 | queue.delete state 211 | next if finished.includes? state 212 | 213 | finished << state 214 | transitions = get_transitions(state.data) 215 | transitions.each do |transition, items| 216 | items = all_dots(first_sets, items) 217 | new_state = pda.state_for data: items 218 | state.transitions[transition] = new_state 219 | queue << new_state 220 | end 221 | end 222 | 223 | return pda 224 | end 225 | 226 | # Add an item to the Grammar. 227 | def add_item(i) 228 | items << i 229 | end 230 | end 231 | end 232 | end 233 | -------------------------------------------------------------------------------- /src/pegasus/items.cr: -------------------------------------------------------------------------------- 1 | require "./elements.cr" 2 | require "./error.cr" 3 | 4 | module Pegasus 5 | module Pda 6 | # An single production item, without a dot or any 7 | # kind of state. 8 | class Item 9 | # The nonterminal on the left of the production rule, 10 | # into which the right hand side is converted. 11 | getter head : Elements::NonterminalId 12 | # The body of terminals and nonterminals on the right 13 | # of the production rule. 14 | getter body : Array(Elements::NonterminalId | Elements::TerminalId) 15 | 16 | # Creates a new item with the given head and body. 17 | def initialize(@head, @body) 18 | end 19 | 20 | # Compares equality with the given other item. 
21 | def ==(other : Item) 22 | return (other.head == @head) && (other.body == @body) 23 | end 24 | 25 | # Hashes this item. 26 | def hash(hasher) 27 | @head.hash(hasher) 28 | @body.hash(hasher) 29 | hasher 30 | end 31 | 32 | def to_s(io) 33 | io << "Item(" << head << ", [" << body.map(&.to_s).join(", ") << "])" 34 | end 35 | end 36 | 37 | # An item with a "dot", which keeps track of how far the item is 38 | # in terms of being parsed. 39 | class DottedItem 40 | # The production rule this dotted item wraps. 41 | getter item : Item 42 | # The index in the body of the production rule. 43 | getter index : Int64 44 | 45 | # Creates a new dotted item. 46 | def initialize(@item, @index = 0_i64) 47 | end 48 | 49 | # Compares this item to another dotted item, including the index. 50 | def ==(other : DottedItem) 51 | return (other.item == @item) && (other.index == @index) 52 | end 53 | 54 | # Hashes this dotted item. 55 | def hash(hasher) 56 | @item.hash(hasher) 57 | @index.hash(hasher) 58 | hasher 59 | end 60 | 61 | def to_s(io) 62 | io << "DottedItem(" << item << ", " << index 63 | io << ", COMPLETED" if index == @item.body.size 64 | io << ")" 65 | end 66 | 67 | # Turns this item into the next item assuming a shift took place. 68 | def next_item! 69 | if @index < @item.body.size 70 | @index += 1 71 | else 72 | raise_pda "Reached past the end of the item!", internal: true 73 | end 74 | end 75 | 76 | # Creates a new item assuming a shift took place. 77 | def next_item 78 | new = dup 79 | new.next_item! 80 | return new 81 | end 82 | 83 | # Checks if this dotted item is done. 84 | def done? 85 | return @index == @item.body.size 86 | end 87 | end 88 | 89 | # A superclass of the `DottedItem` which also 90 | # keeps a lookahead set to further distinguish it 91 | # in LR(1) parser construction. 92 | class LookaheadItem < DottedItem 93 | # The lookahead set of this dotted item. 94 | getter lookahead : Set(Elements::LookaheadElement) 95 | 96 | # Creates a new lookahead dotted item. 
97 | def initialize(@item, @lookahead, @index = 0_i64) 98 | super(@item, @index) 99 | end 100 | 101 | # Compares this dotted item to another dotted item. 102 | def ==(other : LookaheadItem) 103 | return super(other) && (other.lookahead == @lookahead) 104 | end 105 | 106 | # Hashes this dotted item. 107 | def hash(hasher) 108 | super(hasher) 109 | @lookahead.hash(hasher) 110 | hasher 111 | end 112 | 113 | def to_s(io) 114 | io << "LookaheadItem(" << item << ", " << index << ", {" << lookahead.map(&.to_s).join(", ") << "}" 115 | io << ", COMPLETED" if index == @item.body.size 116 | io << ")" 117 | end 118 | end 119 | end 120 | end 121 | -------------------------------------------------------------------------------- /src/pegasus/json.cr: -------------------------------------------------------------------------------- 1 | require "json" 2 | 3 | module Pegasus 4 | class Elements::TerminalId 5 | include JSON::Serializable 6 | @[JSON::Field(key: "terminal_id")] 7 | @id : Int64 8 | end 9 | 10 | class Elements::NonterminalId 11 | include JSON::Serializable 12 | @[JSON::Field(key: "nonterminal_id")] 13 | @id : Int64 14 | @start : Bool 15 | end 16 | 17 | module Pda 18 | class Item 19 | include JSON::Serializable 20 | getter head : Elements::NonterminalId 21 | getter body : Array(Elements::TerminalId | Elements::NonterminalId) 22 | end 23 | end 24 | 25 | module Language 26 | class LanguageData 27 | include JSON::Serializable 28 | getter lex_skip_table : Array(Bool) 29 | getter lex_state_table : Array(Array(Int64)) 30 | getter lex_final_table : Array(Int64) 31 | getter parse_state_table : Array(Array(Int64)) 32 | getter parse_action_table : Array(Array(Int64)) 33 | getter parse_final_table : Array(Bool) 34 | 35 | getter terminals : Hash(String, Elements::TerminalId) 36 | getter nonterminals : Hash(String, Elements::NonterminalId) 37 | getter items : Array(Pda::Item) 38 | getter max_terminal : Int64 39 | end 40 | end 41 | end 42 | 
-------------------------------------------------------------------------------- /src/pegasus/language_def.cr: -------------------------------------------------------------------------------- 1 | require "./elements.cr" 2 | require "./items.cr" 3 | require "./grammar.cr" 4 | require "./nfa.cr" 5 | require "./regex.cr" 6 | require "./nfa_to_dfa.cr" 7 | require "./table.cr" 8 | require "./error.cr" 9 | require "./generated/grammar_parser.cr" 10 | 11 | module Pegasus 12 | # This module is for handling language data. The language is given by the complete 13 | # Pegasus grammar, and includes the terminals, nonterminals, and other rules. 14 | # This module also contains `LanguageData`, which is the JSON structure 15 | # that is passed between pegasus and its consumer programs, like pegasus-c. 16 | module Language 17 | # An error context which reports the items involved in some kind of conflict 18 | # (shift / reduce or reduce / reduce). This version, unlike `ConflictErrorContext`, 19 | # reports the relevant items' names. 20 | class NamedConflictErrorContext < Error::ErrorContext 21 | def initialize(@nonterminals : Array(String)) 22 | end 23 | 24 | def to_s(io) 25 | io << "The nonterminals involved are: " 26 | @nonterminals.join(io, ", ") 27 | end 28 | end 29 | 30 | # The complete data class, built to be all the information 31 | # needed to construct a parser generator. 32 | class LanguageData 33 | # Table for tokens that should be skipped. 34 | getter lex_skip_table : Array(Bool) 35 | # The state table for the lexer, which is used for transitions 36 | # of the `Nfa::Nfa` during tokenizing. 37 | getter lex_state_table : Array(Array(Int64)) 38 | # The table that maps a state ID to a token ID, used to 39 | # recognize that a match has occured. 40 | getter lex_final_table : Array(Int64) 41 | # Transition table for the LALR parser automaton, indexed 42 | # by terminal and nonterminal IDs. 
43 | getter parse_state_table : Array(Array(Int64)) 44 | # Action table indexed by the state and the lookahead item. 45 | # Used to determine what the parser should do in each state. 46 | getter parse_action_table : Array(Array(Int64)) 47 | # The table that maps a nonterminal ID to recognize 48 | # when parsing can stop. 49 | getter parse_final_table : Array(Bool) 50 | 51 | # The terminals, and their original names / regular expressions. 52 | getter terminals : Hash(String, Elements::TerminalId) 53 | # The nonterminals, and their original names. 54 | getter nonterminals : Hash(String, Elements::NonterminalId) 55 | # The items in the language. Used for reducing / building up 56 | # trees once a reduce action is performed. 57 | getter items : Array(Pda::Item) 58 | # The highest terminal ID, used for correctly accessing the 59 | # tables indexed by both terminal and nonterminal IDs. 60 | getter max_terminal : Int64 61 | 62 | # Creates a new language data object. 63 | def initialize(language_definition) 64 | @terminals, @nonterminals, grammar = 65 | generate_grammar(language_definition) 66 | @lex_skip_table, @lex_state_table, @lex_final_table, 67 | @parse_state_table, @parse_action_table, @parse_final_table = 68 | generate_tables(language_definition, @terminals, @nonterminals, grammar) 69 | @max_terminal = @terminals.values.max_of?(&.raw_id) || 0_i64 70 | @items = grammar.items 71 | end 72 | 73 | # Assigns an ID to each unique vaue in the iterable. 74 | private def assign_ids(values : Iterable(T), &block : Int64 -> R) forall T, R 75 | hash = {} of T => R 76 | last_id = 0_i64 77 | values.each do |value| 78 | next if hash[value]? 79 | hash[value] = yield (last_id += 1) - 1 80 | end 81 | return hash 82 | end 83 | 84 | # Creates a grammar, returning it and the hashes with identifiers for 85 | # the terminals and nonterminals. 
86 | private def generate_grammar(language_def) 87 | token_ids = assign_ids(language_def.tokens.keys) do |i| 88 | Elements::TerminalId.new i 89 | end 90 | rule_ids = assign_ids(language_def.rules.keys) do |i| 91 | Elements::NonterminalId.new i, start: i == 0 92 | end 93 | 94 | grammar = Pda::Grammar.new token_ids.values, rule_ids.values 95 | language_def.rules.each do |name, bodies| 96 | head = rule_ids[name] 97 | bodies.each &.alternatives.each do |body| 98 | body = body.elements.map(&.name).map do |element_name| 99 | element = token_ids[element_name]? || rule_ids[element_name]? 100 | raise_grammar "No terminal or rule named #{element_name}" unless element 101 | next element 102 | end 103 | item = Pda::Item.new head, body 104 | grammar.add_item item 105 | end 106 | end 107 | 108 | return { token_ids, rule_ids, grammar } 109 | end 110 | 111 | # Generates lookup tables using the given terminals, nonterminals, 112 | # and grammar. 113 | private def generate_tables(language_def, terminals, nonterminals, grammar) 114 | nfa = Nfa::Nfa.new 115 | terminals.each do |terminal, value| 116 | nfa.add_regex language_def.tokens[terminal].regex, value.raw_id 117 | end 118 | dfa = nfa.dfa 119 | 120 | begin 121 | lex_skip_table = [ false ] + 122 | language_def.tokens.map &.[1].options.includes?("skip") 123 | lex_state_table = dfa.state_table 124 | lex_final_table = dfa.final_table 125 | 126 | lr_pda = grammar.create_lr_pda 127 | lalr_pda = grammar.create_lalr_pda(lr_pda) 128 | parse_state_table = lalr_pda.state_table 129 | parse_action_table = lalr_pda.action_table 130 | parse_final_table = [false] + nonterminals.map &.[1].start? 
131 | rescue e : Error::PegasusException 132 | if old_context = e.context_data 133 | .find(&.is_a?(Dfa::ConflictErrorContext)) 134 | .as?(Dfa::ConflictErrorContext) 135 | 136 | names = old_context.item_ids.map do |id| 137 | head = grammar.items[id].head 138 | nonterminals.key_for head 139 | end 140 | e.context_data.delete old_context 141 | e.context_data << NamedConflictErrorContext.new names 142 | end 143 | raise e 144 | end 145 | 146 | return { lex_skip_table, lex_state_table, lex_final_table, parse_state_table, parse_action_table, parse_final_table } 147 | end 148 | end 149 | 150 | class ::Pegasus::Generated::Tree 151 | alias SelfDeque = Deque(Generated::Tree) 152 | 153 | # Recursive call for the `#flatten` function. 154 | protected def flatten_recursive(*, value_index : Int32, recursive_name : String, recursive_index : Int32) : SelfDeque 155 | if flattened = self.as?(Generated::NonterminalTree) 156 | recursive_child = flattened.children[recursive_index]? 157 | value_child = flattened.children[value_index]? 158 | 159 | if flattened.name == recursive_name && recursive_child 160 | add_to = recursive_child.flatten_recursive( 161 | value_index: value_index, 162 | recursive_name: recursive_name, 163 | recursive_index: recursive_index) 164 | else 165 | add_to = SelfDeque.new 166 | end 167 | add_to.insert(0, value_child) if value_child 168 | 169 | return add_to 170 | end 171 | return SelfDeque.new 172 | end 173 | 174 | # Since currently, * and + operators aren't supported in Pegasus grammars, they tend to be recursively written. 175 | # This is a utility function to "flatten" a parse tree produced by a recursively written grammar. 
176 | def flatten(*, value_index : Int32, recursive_name : String, recursive_index : Int32) 177 | flatten_recursive( 178 | value_index: value_index, 179 | recursive_name: recursive_name, 180 | recursive_index: recursive_index).to_a 181 | end 182 | end 183 | 184 | alias Option = String 185 | 186 | # Since Pegasus supports options on tokens and rules, 187 | # we need to represent an object to which options can be attached. 188 | # this is this type of object. 189 | abstract class OptionObject 190 | # Gets the actual list of options attached to this object. 191 | getter options : Array(Option) 192 | 193 | def initialize 194 | @options = [] of Option 195 | end 196 | end 197 | 198 | # A token declaration, with zero or more rules attached to it. 199 | class Token < OptionObject 200 | # Gets the regular expression that defines this token. 201 | getter regex : String 202 | 203 | def initialize(@regex, @options = [] of Option) 204 | end 205 | 206 | def ==(other : Token) 207 | return (other.regex == @regex) && (other.options == @options) 208 | end 209 | 210 | def hash(hasher) 211 | @regex.hash(hasher) 212 | @options.hash(hasher) 213 | hasher 214 | end 215 | end 216 | 217 | class ::Array(T) 218 | # Gets the indices of all values matching the condition 219 | def indices(&block) 220 | deque = Deque(Int32).new 221 | each_with_index do |v, i| 222 | deque << i if yield v 223 | end 224 | return deque.to_a 225 | end 226 | end 227 | 228 | module ::Iterable(T) 229 | def power_set 230 | set = Set(Set(T)).new 231 | set << Set(T).new 232 | 233 | each do |item| 234 | to_add = Set(Set(T)).new 235 | set.each do |subset| 236 | to_add << subset.dup.<<(item) 237 | end 238 | set.concat to_add 239 | end 240 | 241 | return set 242 | end 243 | end 244 | 245 | # An element of a grammar rule. Can be either a token or another rule. 246 | class RuleElement 247 | # The name of the element, as specified in the grammar. 
248 | getter name : String 249 | 250 | def initialize(@name) 251 | end 252 | 253 | def ==(other : RuleElement) 254 | return @name == other.name 255 | end 256 | 257 | # If called in a child class of RuleElement, 258 | # this strips the child class of its additional data, 259 | # turning it back into a RuleElement base class. 260 | def base_element 261 | return self 262 | end 263 | 264 | # Checks if this element derives lambda. 265 | # This doesm't check if the production rule it 266 | # represent can derive lambda; rather, it checks 267 | # if this element has an operator applied to it 268 | # that makes it do so, like ? or * 269 | def derives_lambda? 270 | return false 271 | end 272 | end 273 | 274 | # An element that is optional. 275 | class OptionalElement < RuleElement 276 | def base_element 277 | return RuleElement.new name 278 | end 279 | 280 | def derives_lambda? 281 | return true 282 | end 283 | end 284 | 285 | # An element that is repeated one or more times. 286 | class OneOrMoreElement < RuleElement 287 | end 288 | 289 | # An element that is repeated zero or more times. 290 | class ZeroOrMoreElement < RuleElement 291 | def derives_lambda? 292 | return true 293 | end 294 | end 295 | 296 | # One of the alternatives of a rule. 297 | class RuleAlternative 298 | # The elements of the rule. 299 | getter elements : Array(RuleElement) 300 | 301 | def initialize(@elements) 302 | raise_grammar "Empty productions are currently not supported" if elements.empty? 303 | end 304 | 305 | def ==(other : RuleAlternative) 306 | return @elements == other.elements 307 | end 308 | 309 | # Computes a single variant, given optional indices that should be included. 310 | private def compute_variant(indices) 311 | new_elements = [] of RuleElement 312 | elements.each_with_index do |element, index| 313 | next if element.derives_lambda? && !indices.includes? 
index 314 | new_elements << element.base_element 315 | end 316 | return RuleAlternative.new(new_elements) 317 | end 318 | 319 | # Checks if this specific alternative is the lambda alternative. 320 | def lambda? 321 | return @elements.empty? 322 | end 323 | 324 | # Determines if this rule alternative can be empty, or derive lambda. 325 | def derives_lambda? 326 | return derives_lambda? &.derives_lambda? 327 | end 328 | 329 | # Determines if the rule alternative can be empty, using 330 | # the block to check whether each element can be empty or not. 331 | def derives_lambda?(&block) 332 | return @elements.all? { |it| yield it } 333 | end 334 | 335 | # Computes the variants created by optionals. 336 | # For example, a? b? has four variants, a b, a, b, . 337 | def compute_optional_variants 338 | return compute_optional_variants &.derives_lambda? 339 | end 340 | 341 | # Same as compute_optional_variants, but what's optional is 342 | # now decided by the block. 343 | def compute_optional_variants(&block) 344 | optional_positions = @elements.indices { |it| yield it } 345 | power_set = optional_positions.power_set 346 | return power_set.map { |it| compute_variant(it) } 347 | end 348 | end 349 | 350 | # A single rule. This can have one or more alternatives, 351 | # but has the same options (zero or more) applied to them. 352 | class Rule < OptionObject 353 | getter alternatives : Array(RuleAlternative) 354 | 355 | def initialize(@alternatives, @options = [] of Option) 356 | end 357 | 358 | def ==(other : Rule) 359 | return (other.alternatives == @alternatives) && (other.options == @options) 360 | end 361 | 362 | def hash(hasher) 363 | @alternatives.hash(hasher) 364 | @options.hash(hasher) 365 | hasher 366 | end 367 | 368 | # Checks if this rule has any alternatives that can derive lambda. 369 | def derives_lambda? 370 | return @alternatives.any? &.derives_lambda? 
371 | end 372 | 373 | # Checks if this rule has any alternatives that can derive lambda, 374 | # using a custom block for checking if an element can derive lambda. 375 | def derives_lambda?(&block) 376 | return @alternatives.any? &.derives_lambda? { |it| yield it } 377 | end 378 | 379 | # Creates a new rule with the same options, but with alternatives expanded for optional values. 380 | def compute_optional_variants 381 | return Rule.new(@alternatives.flat_map &.compute_optional_variants, @options) 382 | end 383 | 384 | # Creates a new rule with the same options, but with alternatives expanded for optional values. 385 | # Uses a custom block to check if the elements can be empty or not. 386 | def compute_optional_variants(&block) 387 | return Rule.new(@alternatives.flat_map &.compute_optional_variants(block), @options) 388 | end 389 | end 390 | 391 | # A language definition parsed from a grammar string. 392 | class LanguageDefinition 393 | getter tokens : Hash(String, Token) 394 | getter rules : Hash(String, Array(Rule)) 395 | 396 | # Creates a new, empty language definition. 397 | def initialize 398 | @tokens = {} of String => Token 399 | @rules = {} of String => Array(Rule) 400 | end 401 | 402 | # Creates a new language definition from the given string. 403 | def initialize(s : String) 404 | @tokens = {} of String => Token 405 | @rules = {} of String => Array(Rule) 406 | from_string(s) 407 | end 408 | 409 | # Creates a new language definition from the given IO. 410 | def initialize(io : IO) 411 | @tokens = {} of String => Token 412 | @rules = {} of String => Array(Rule) 413 | from_io(io) 414 | end 415 | 416 | # Creates a list of options from a "statemend end" parse tree node. 
417 | private def extract_options(statement_end_tree) 418 | statement_end_tree = statement_end_tree.as(Generated::NonterminalTree) 419 | return [] of Option unless statement_end_tree.children.size > 1 420 | options_tree = statement_end_tree.children[0].as(Generated::NonterminalTree) 421 | options = options_tree.children[1] 422 | .flatten(value_index: 0, recursive_name: "option_list", recursive_index: 2) 423 | .map(&.as(Generated::NonterminalTree).children[0]) 424 | .map(&.as(Generated::TerminalTree).string) 425 | end 426 | 427 | # Extracts all the tokens from the token list parse tree node, storing them 428 | # in a member variable hash. 429 | private def extract_tokens(token_list_tree) 430 | token_list_tree.flatten(value_index: 0, recursive_name: "token_list", recursive_index: 1) 431 | .map { |it| ntt = it.as(Generated::NonterminalTree); { ntt.children[1], ntt.children[3], ntt.children[4] } } 432 | .map do |data| 433 | name_tree, regex_tree, statement_end = data 434 | name = name_tree 435 | .as(Generated::TerminalTree).string 436 | raise_grammar "Declaring a token (#{name}) a second time" if @tokens.has_key? name 437 | regex = regex_tree 438 | .as(Generated::TerminalTree).string[1..-2] 439 | @tokens[name] = Token.new regex, extract_options(statement_end) 440 | end 441 | end 442 | 443 | private def extract_rule_element(grammar_element_tree) 444 | grammar_element_tree = grammar_element_tree.as(Generated::NonterminalTree) 445 | name = grammar_element_tree.children[0].as(Generated::TerminalTree).string 446 | setting = grammar_element_tree.children[1]?.try { |it| it.as(Generated::TerminalTree).string } 447 | return case setting 448 | when "?" 449 | OptionalElement.new name 450 | else 451 | RuleElement.new name 452 | end 453 | end 454 | 455 | # Extracts all the body definitions from the grammar bodies tree node. 456 | # A rule has several bodies. 
457 | private def extract_bodies(bodies_tree) 458 | bodies_tree.flatten(value_index: 0, recursive_name: "grammar_bodies", recursive_index: 2) 459 | .map do |body| 460 | RuleAlternative.new body 461 | .flatten(value_index: 0, recursive_name: "grammar_body", recursive_index: 1) 462 | .map { |it| extract_rule_element(it) } 463 | end 464 | end 465 | 466 | # Extracts all the rules from a gramamr list tree node, storin them 467 | # in a member variable hash. 468 | private def extract_rules(grammar_list_tree) 469 | grammar_list_tree.flatten(value_index: 0, recursive_name: "grammar_list", recursive_index: 1) 470 | .map { |it| ntt = it.as(Generated::NonterminalTree); { ntt.children[1], ntt.children[3], ntt.children[4] } } 471 | .map do |data| 472 | name_tree, bodies_tree, statement_end = data 473 | name = name_tree 474 | .as(Generated::TerminalTree).string 475 | raise_grammar "Declaring a rule (#{name}) with the same name as a token" if @tokens.has_key? name 476 | bodies = extract_bodies(bodies_tree) 477 | 478 | unless old_rules = @rules[name]? 479 | @rules[name] = old_rules = Array(Rule).new 480 | end 481 | old_rules << Rule.new(bodies, extract_options(statement_end)).compute_optional_variants 482 | end 483 | end 484 | 485 | # Creates a language definition from a string. 486 | private def from_string(string) 487 | tree = ::Pegasus::Generated.process(string).as(::Pegasus::Generated::NonterminalTree) 488 | if tokens = tree.children.find &.as(::Pegasus::Generated::NonterminalTree).name.==("token_list") 489 | extract_tokens(tokens) 490 | end 491 | if rules = tree.children.find &.as(::Pegasus::Generated::NonterminalTree).name.==("grammar_list") 492 | extract_rules(rules) 493 | end 494 | rescue e : Error::PegasusException 495 | raise e 496 | rescue e : Exception 497 | raise_grammar e.message.not_nil! 498 | end 499 | 500 | # Creates a languge definition from IO. 
501 | private def from_io(io) 502 | string = io.gets_to_end 503 | from_string(string) 504 | end 505 | end 506 | end 507 | end 508 | -------------------------------------------------------------------------------- /src/pegasus/nfa.cr: -------------------------------------------------------------------------------- 1 | require "./automaton.cr" 2 | 3 | module Pegasus 4 | # This module is for nondeterministic finite automata. While NFAs 5 | # aren't very good for directly creating state machines 6 | # (you need to keep track of an exponential number of potential states), 7 | # they are easier to construct. This module contains functionality to convert 8 | # regular expressions to NFAs. 9 | module Nfa 10 | alias NState = Automata::State(Int64?, Transition) 11 | 12 | # A transition class used to represent the possible transitions 13 | # possible in the NFA. 14 | class Transition 15 | end 16 | 17 | # A transition that requires a single byte. 18 | class ByteTransition < Transition 19 | # The byte used for the transition. 20 | getter byte : UInt8 21 | 22 | # Creates a new byte transition. 23 | def initialize(@byte) 24 | end 25 | end 26 | 27 | # A transition that doesn't consume a token from the input. 28 | class LambdaTransition < Transition 29 | end 30 | 31 | # A transition that accepts any character. 32 | class AnyTransition < Transition 33 | end 34 | 35 | # A transition that accepts several ranges of bytes. 36 | class RangeTransition < Transition 37 | # The ranges this transition accepts / rejects. 38 | getter ranges : Array(Range(UInt8, UInt8)) 39 | # If this is true, characters must __not__ be in the ranges to 40 | # be accepted. 41 | getter inverted : Bool 42 | 43 | # Creates a new range transition. 44 | def initialize(@ranges, @inverted) 45 | end 46 | end 47 | 48 | # A nondeterministic finite automaton, to be created 49 | # from regular expressions. 50 | class Nfa < Automata::Automaton(Int64?, Transition) 51 | # Creates a new Nfa with a start state. 
52 | def initialize 53 | super 54 | @start = state_for(data: nil) 55 | end 56 | 57 | # Creates a new state for no value (aka, a set with nil as the value) 58 | def state 59 | state_for data: nil 60 | end 61 | end 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /src/pegasus/nfa_to_dfa.cr: -------------------------------------------------------------------------------- 1 | require "./nfa.cr" 2 | require "./dfa.cr" 3 | require "./error.cr" 4 | 5 | module Pegasus 6 | module Nfa 7 | class Transition 8 | # Returns the characters this transition accepts 9 | # for transitions. 10 | def char_states 11 | return [] of UInt8 12 | end 13 | end 14 | 15 | class ByteTransition 16 | def char_states 17 | return [ @byte ] 18 | end 19 | end 20 | 21 | class AnyTransition 22 | def char_states 23 | return (0_u8..255_u8).to_a 24 | end 25 | end 26 | 27 | class RangeTransition 28 | def char_states 29 | states = @ranges.map(&.to_a).flatten 30 | states = (0_u8..255_u8).to_a - states if @inverted 31 | return states 32 | end 33 | end 34 | 35 | class Nfa 36 | # Finds all the states connected to the given state 37 | # through lambda transitions, which will be in the same `Pegasus::Dfa::Dfa` state. 38 | private def find_lambda_states(s : NState) 39 | found = Set(NState).new 40 | queued = Set{s} 41 | while !queued.empty? 42 | state = queued.first 43 | queued.delete state 44 | next if found.includes? state 45 | 46 | found << state 47 | queued.concat state.transitions.select(&.is_a?(LambdaTransition)).map(&.[1]) 48 | end 49 | return found 50 | end 51 | 52 | # Finds the lambda states connected to any of the states of the given set. 53 | def find_lambda_states(s : Set(NState)) 54 | return s 55 | .map { |it| find_lambda_states(it) } 56 | .reduce(Set(NState).new) { |acc, r| acc.concat r } 57 | end 58 | 59 | # Merges the sets mapped to by the same key in the list of hashes. 
60 | private def merge_hashes(a : Array(Hash(K, Set(V)))) forall K, V 61 | a.reduce({} of K => Set(V)) { |l, r| l.merge(r) { |_, l1, r1| l1|r1 } } 62 | end 63 | 64 | # Creates a `Pegasus::Dfa::Dfa` for this Nfa. 65 | def dfa 66 | raise_dfa "NFA doesn't have start state" unless @start 67 | 68 | # DFA we're constructing 69 | new_dfa = Pegasus::Dfa::Dfa.new 70 | # The NFA->DFA algorithm creates a state for every reachable combination of NFA states. 71 | # So, this is a set of "reachable states", and is itself a state. 72 | new_start_set = find_lambda_states(@start.not_nil!) 73 | new_start = new_dfa.state_for data: new_start_set 74 | new_dfa.start = new_start 75 | 76 | # The queue of states to process. 77 | queue = Set { new_start } 78 | # Visited states. 79 | finished = Set(Pegasus::Dfa::DState).new 80 | 81 | while !queue.empty? 82 | state = queue.first 83 | queue.delete state 84 | next if finished.includes? state 85 | 86 | finished << state 87 | sub_hashes = state.data.map do |sub_state| 88 | transition_hashes = sub_state.transitions.map do |k, v| 89 | char_states = k.char_states 90 | set_array = Array.new(char_states.size) do 91 | Set { v } 92 | end 93 | Hash.zip(char_states, set_array) 94 | end 95 | merge_hashes(transition_hashes) 96 | end 97 | out_transitions = merge_hashes(sub_hashes) 98 | out_transitions.each do |char, ss| 99 | out_state_set = find_lambda_states(ss) 100 | out_state = new_dfa.state_for data: out_state_set 101 | state.transitions[char] = out_state 102 | queue << out_state 103 | end 104 | end 105 | 106 | return new_dfa 107 | end 108 | end 109 | end 110 | end 111 | -------------------------------------------------------------------------------- /src/pegasus/pda.cr: -------------------------------------------------------------------------------- 1 | require "./elements.cr" 2 | require "./automaton.cr" 3 | require "./items.cr" 4 | 5 | module Pegasus 6 | module Pda 7 | alias PState = Automata::State(Set(LookaheadItem), Elements::NonterminalId | 
Elements::TerminalId) 8 | 9 | # A class that represents the (LA)LR Push Down Automaton. 10 | class Pda < Automata::UniqueAutomaton(Set(LookaheadItem), Elements::NonterminalId | Elements::TerminalId) 11 | def initialize(@items : Array(Item)) 12 | super() 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /src/pegasus/regex.cr: -------------------------------------------------------------------------------- 1 | require "./nfa.cr" 2 | require "./error.cr" 3 | 4 | module Pegasus 5 | module Nfa 6 | # A "unit" of one or more connected states. 7 | class StateChain 8 | # The beginning of this chain. 9 | property start : NState 10 | # The end of this chain. 11 | property final : NState 12 | 13 | # Creates a new chain with the given initial and final states. 14 | def initialize(@start, final = nil) 15 | @final = final || @start 16 | end 17 | 18 | # Appends another chain to this one, modifying the states' transition 19 | # hashes, too. 20 | def append!(other : StateChain) 21 | @final.not_nil!.transitions[LambdaTransition.new] = other.start.not_nil! 22 | @final = other.final 23 | return self 24 | end 25 | 26 | # Appends nothing to this chain. This is a no-op. 27 | def append!(other : Nil) 28 | return self 29 | end 30 | end 31 | 32 | class Nfa 33 | ESCAPES = { 34 | '\'' => 0x27_u8, 35 | '"' => 0x22_u8, 36 | '?' => 0x3f_u8, 37 | '\\' => 0x5c_u8, 38 | 'a' => 0x07_u8, 39 | 'b' => 0x08_u8, 40 | 'f' => 0x0c_u8, 41 | 'n' => 0x0a_u8, 42 | 'r' => 0x0d_u8, 43 | 't' => 0x09_u8, 44 | 'v' => 0x0b_u8, 45 | '*' => 0x2a_u8, 46 | '+' => 0x2b_u8, 47 | '-' => 0x2d_u8, 48 | '|' => 0x7c_u8, 49 | '[' => 0x5b_u8, 50 | ']' => 0x5d_u8, 51 | '(' => 0x28_u8, 52 | ')' => 0x29_u8, 53 | '.' => 0x2e_u8, 54 | '/' => 0x2f_u8, 55 | } 56 | 57 | # Applies the "+" operator to the given `StateChain`. 
58 | private def nfa_plus(chain) 59 | new_final = state 60 | new_start = state 61 | new_final.transitions[LambdaTransition.new] = new_start 62 | chain.final.transitions[LambdaTransition.new] = new_final 63 | new_start.transitions[LambdaTransition.new] = chain.start 64 | 65 | chain.start = new_start 66 | chain.final = new_final 67 | end 68 | 69 | # Applies the "*" operator to the given `StateChain`. 70 | private def nfa_star(chain) 71 | new_final = state 72 | new_start = state 73 | new_final.transitions[LambdaTransition.new] = new_start 74 | new_start.transitions[LambdaTransition.new] = new_final 75 | chain.final.transitions[LambdaTransition.new] = new_final 76 | new_start.transitions[LambdaTransition.new] = chain.start 77 | 78 | chain.start = new_start 79 | chain.final = new_final 80 | end 81 | 82 | # Applies the "?" operator to the given `StateChain`. 83 | private def nfa_question(chain) 84 | new_final = state 85 | new_start = state 86 | new_start.transitions[LambdaTransition.new] = new_final 87 | chain.final.transitions[LambdaTransition.new] = new_final 88 | new_start.transitions[LambdaTransition.new] = chain.start 89 | 90 | chain.start = new_start 91 | chain.final = new_final 92 | end 93 | 94 | # Reas a character, taking into account the scape character. 95 | private def read_char(tokens) 96 | raise_nfa "Unexpected end of file" unless tokens.first? 97 | char = tokens.delete_at(0) 98 | if char == '\\' 99 | raise_nfa "Incomplete escape character" unless tokens.first? 100 | char = tokens.delete_at(0) 101 | escape = ESCAPES[char]? 102 | raise_nfa "Invalid escape code" unless escape 103 | return escape 104 | else 105 | raise_nfa "Non-ASCII characters not supported" unless char.ascii? 
106 | return char.bytes[0] 107 | end 108 | end 109 | 110 | # Creates an NFA chain using the range syntax ([...]) 111 | private def from_regex_range(tokens) 112 | tokens.delete_at(0) 113 | invert = false 114 | last_char = nil 115 | ranges = [] of Range(UInt8, UInt8) 116 | 117 | if tokens.first? == '^' 118 | invert = true 119 | tokens.delete_at(0) 120 | end 121 | 122 | while tokens.first? && tokens.first != ']' 123 | if tokens.first == '-' 124 | raise_nfa "Invalid range" unless last_char 125 | tokens.delete_at(0) 126 | ranges << (last_char..read_char(tokens)) 127 | last_char = nil 128 | else 129 | last_char.try { |it| ranges << (it..it) } 130 | last_char = read_char(tokens) 131 | end 132 | end 133 | last_char.try { |it| ranges << (it..it) } 134 | 135 | raise_nfa "Invalid range definition" if tokens.first? != ']' 136 | tokens.delete_at(0) 137 | 138 | start = state 139 | final = state 140 | start.transitions[RangeTransition.new(ranges, invert)] = final 141 | return StateChain.new(start, final) 142 | end 143 | 144 | # Parses a (sub)expression, optionally requiring parentheses. 145 | private def from_regex_expr(tokens, *, require_parenths = true) 146 | substring_stack = [] of StateChain 147 | current_chain = nil 148 | sub_chain = nil 149 | 150 | if require_parenths 151 | tokens.delete_at(0) 152 | end 153 | 154 | modifiers = { 155 | '+' => ->nfa_plus(StateChain), 156 | '*' => ->nfa_star(StateChain), 157 | '?' => ->nfa_question(StateChain) 158 | } 159 | 160 | while tokens.first? && tokens.first != ')' 161 | char = tokens.first 162 | 163 | if modifier = modifiers[char]? 164 | tokens.delete_at(0) 165 | raise_nfa "Invalid operator" unless sub_chain 166 | modifier.call(sub_chain) 167 | next 168 | end 169 | 170 | current_chain = current_chain.try(&.append!(sub_chain)) || sub_chain 171 | if char == '(' 172 | sub_chain = from_regex_expr(tokens) 173 | elsif char == '.' 
174 | tokens.delete_at(0) 175 | empty_state = state 176 | actual_state = state 177 | 178 | empty_state.transitions[AnyTransition.new] = actual_state 179 | sub_chain = StateChain.new(empty_state, actual_state) 180 | elsif char == '|' 181 | tokens.delete_at(0) 182 | substring_stack.push current_chain if current_chain 183 | current_chain = nil 184 | sub_chain = nil 185 | elsif char == '[' 186 | sub_chain = from_regex_range(tokens) 187 | else 188 | char = read_char(tokens) 189 | 190 | empty_state = state 191 | actual_state = state 192 | empty_state.transitions[ByteTransition.new char] = actual_state 193 | sub_chain = StateChain.new(empty_state, actual_state) 194 | end 195 | end 196 | current_chain = current_chain.try(&.append!(sub_chain)) || sub_chain 197 | 198 | if require_parenths && tokens.first? == ')' 199 | tokens.delete_at(0) 200 | elsif (require_parenths ^ (tokens.first? == ')')) 201 | raise_nfa "Mismatched parentheses" 202 | end 203 | 204 | if substring_stack.size > 0 205 | substring_stack.push current_chain if current_chain 206 | start_state = state 207 | end_state = state 208 | substring_stack.compact!.each do |chain| 209 | start_state.transitions[LambdaTransition.new] = chain.start 210 | chain.final.transitions[LambdaTransition.new] = end_state 211 | end 212 | current_chain = StateChain.new(start_state, end_state) 213 | end 214 | 215 | return current_chain 216 | end 217 | 218 | # Adds a regular expression branch to this Nfa. 
219 | def add_regex(str, id) 220 | tokens = str.chars 221 | chain = from_regex_expr(tokens, require_parenths: false) 222 | final_state = state_for data: id 223 | final_chain = StateChain.new(final_state, final_state) 224 | new_start = (chain.try(&.append!(final_chain)) || final_chain).start 225 | @start.not_nil!.transitions[LambdaTransition.new] = new_start 226 | end 227 | end 228 | end 229 | end 230 | -------------------------------------------------------------------------------- /src/pegasus/semantics.cr: -------------------------------------------------------------------------------- 1 | require "./generated/semantics_parser.cr" 2 | 3 | module Pegasus 4 | module Semantics 5 | alias NonterminalTree = Generated::Semantics::NonterminalTree 6 | alias TerminalTree = Generated::Semantics::TerminalTree 7 | 8 | class SemanticsData 9 | getter types : Hash(String, String) 10 | getter nonterminal_types : Hash(Elements::NonterminalId, String) 11 | getter actions : Hash(Int64, String) 12 | getter init : String 13 | 14 | def initialize(source, token_type : String, @data : Language::LanguageData) 15 | @types = {} of String => String 16 | @nonterminal_types = {} of Elements::NonterminalId => String 17 | @actions = {} of Int64 => String 18 | @init = "" 19 | 20 | @types["token"] = token_type 21 | 22 | begin 23 | raw_tree = Pegasus::Generated::Semantics.process(source).as(NonterminalTree) 24 | rescue e : Pegasus::Error::PegasusException 25 | raise e 26 | rescue e : Exception 27 | raise_general e.message.not_nil! 
28 | end 29 | 30 | register_types raw_tree.children[0] 31 | register_typerules raw_tree.children[1] 32 | register_init raw_tree.children[2] 33 | register_rules raw_tree.children[3] 34 | end 35 | 36 | private def register_types(tree) 37 | type_list = tree.as(NonterminalTree) 38 | loop do 39 | type_decl = type_list.children[0].as(NonterminalTree) 40 | identifier = type_decl.children[1].as(TerminalTree).string; 41 | code = type_decl.children[3].as(TerminalTree).string[2..-3]; 42 | raise_general "Redefining #{identifier}" if @types.includes? identifier 43 | @types[identifier] = code 44 | 45 | break if type_list.children.size == 1 46 | type_list = type_list.children[1].as(NonterminalTree) 47 | end 48 | end 49 | 50 | private def register_typerules(tree) 51 | typerules_list = tree.as(NonterminalTree) 52 | loop do 53 | typerule_decl = typerules_list.children[0].as(NonterminalTree) 54 | identifier = typerule_decl.children[1].as(TerminalTree).string 55 | nonterminals = read_identifier_list typerule_decl.children[4] 56 | 57 | nonterminals.each do |nonterminal_name| 58 | unless nonterminal = @data.nonterminals[nonterminal_name]? 59 | raise_general "unknown nonterminal #{nonterminal_name}" 60 | end 61 | 62 | if @nonterminal_types.includes? 
nonterminal 63 | raise_general "redefinition of type for #{nonterminal_name}" 64 | end 65 | 66 | @nonterminal_types[nonterminal] = identifier 67 | end 68 | 69 | break if typerules_list.children.size == 1 70 | typerules_list = typerules_list.children[1].as(NonterminalTree) 71 | end 72 | end 73 | 74 | private def read_identifier_list(tree) 75 | list = tree.as(NonterminalTree) 76 | identifiers = [] of String 77 | loop do 78 | identifiers << list.children[0].as(TerminalTree).string 79 | 80 | break if list.children.size == 1 81 | list = list.children[2].as(NonterminalTree) 82 | end 83 | return identifiers 84 | end 85 | 86 | private def register_init(tree) 87 | @init = tree.as(NonterminalTree).children[2].as(TerminalTree).string[2..-3]; 88 | end 89 | 90 | private def register_rules(tree) 91 | rules_list = tree.as(NonterminalTree) 92 | loop do 93 | rule = rules_list.children[0].as(NonterminalTree) 94 | identifier = rule.children[1].as(TerminalTree).string 95 | number = rule.children[3].as(TerminalTree).string.to_i64 96 | code = rule.children[6].as(TerminalTree).string[2..-3]; 97 | 98 | unless nonterminal = @data.nonterminals[identifier]? 99 | raise_general "unknown rule #{nonterminal}" 100 | end 101 | 102 | index = 0 103 | set = false 104 | @data.items.each_with_index do |item, i| 105 | next unless item.head == nonterminal 106 | if index == number 107 | raise_general "redefinition of rule #{identifier}(#{number})" if @actions.includes? 
i.to_i64 108 | @actions[i.to_i64] = code 109 | set = true 110 | break 111 | end 112 | index += 1 113 | end 114 | raise_general "no rule #{identifier}(#{number})" unless set 115 | 116 | break if rules_list.children.size == 1 117 | rules_list = rules_list.children[1].as(NonterminalTree) 118 | end 119 | end 120 | end 121 | end 122 | end 123 |
-------------------------------------------------------------------------------- /src/pegasus/table.cr: --------------------------------------------------------------------------------
require "./nfa.cr"
require "./pda.cr"
require "./error.cr"

module Pegasus
  module Dfa
    # Error context reported when two parse actions collide in a table cell.
    class ConflictErrorContext < Pegasus::Error::ErrorContext
      # The IDs of the items participating in the conflict.
      getter item_ids : Array(Int64)

      def initialize(@item_ids)
      end

      def to_s(io)
        io << "The IDs of the items involved are "
        @item_ids.join(io, ", ")
      end
    end

    class Dfa
      # Creates a final table, which is used to determine if a state matched a
      # token. Index 0 is the dead/error state; every other entry holds the
      # matched token ID (shifted by one) or 0 for a non-accepting state.
      def final_table
        accepting = @states.map do |state|
          state.data.compact_map(&.data).max_of?(&.+(1)) || 0_i64
        end
        return [0_i64] + accepting
      end

      # Creates the DFA transition table (one 256-entry row per state, plus a
      # leading all-zero row for the error state); see
      # `Pegasus::Language::LanguageData`.
      def state_table
        rows = [Array.new(256, 0_i64)]
        @states.each do |state|
          row = Array.new(256, 0_i64)
          # State IDs are shifted by one so that 0 can mean "no transition".
          state.transitions.each { |byte, destination| row[byte] = destination.id + 1 }
          rows << row
        end
        return rows
      end
    end
  end

  module Pda
    class LookaheadItem
      # Records a shift action (encoded as 0) for this item's next element,
      # raising if a reduce action already occupies the cell.
      def insert_shift?(action_table, state)
        return if done? # a completed item cannot shift
        next_element = item.body[index]
        return if !next_element.is_a?(Elements::IndexableElement)

        existing = action_table[state.id + 1][next_element.table_index]
        # Any positive value is a reduce action already stored in the cell.
        if existing > 0
          raise_table "Shift / reduce conflict", context_data: [
            Pegasus::Dfa::ConflictErrorContext.new([ existing ])
          ]
        end
        action_table[state.id + 1][next_element.table_index] = 0
      end

      # Records a reduce action (encoded as item index + 1) for every terminal
      # in this item's lookahead set, raising on any conflict.
      def insert_reduce?(action_table, state, self_index)
        return if !done? # only completed items may reduce

        @lookahead.each do |terminal|
          next unless terminal.is_a?(Elements::IndexableElement)
          existing = action_table[state.id + 1][terminal.table_index]
          # 0 means a shift action is already stored in this cell.
          if existing == 0
            raise_table "Shift / reduce conflict", context_data: [
              Pegasus::Dfa::ConflictErrorContext.new([ self_index.to_i64 ])
            ]
          end
          # A positive value means another reduce action is stored here.
          if existing > 0
            raise_table "Reduce / reduce conflict", context_data: [
              Pegasus::Dfa::ConflictErrorContext.new([ existing - 1, self_index.to_i64 ])
            ]
          end
          action_table[state.id + 1][terminal.table_index] = self_index.to_i64 + 1
        end
      end
    end

    class Pda
      # Creates the action table, determining what the parser should do at a
      # given state and lookahead token: -1 = error, 0 = shift, n + 1 = reduce
      # by item n.
      def action_table
        last_terminal_index = @items.max_of? do |item|
          # NOTE(review): the inner default is 1_i64 here, but the analogous
          # expressions in state_table below use 0_i64 -- confirm the
          # asymmetry is intentional.
          item.body.select(&.is_a?(Elements::IndexableElement)).max_of?(&.table_index) || 1_i64
        end || 0_i64

        # One extra column because the EOF token has its own spot, too.
        actions = Array.new(@states.size + 1) { Array.new(last_terminal_index + 1, -1_i64) }
        @states.each do |state|
          state.data.each do |lookahead_item|
            lookahead_item.insert_shift?(actions, state)
            lookahead_item.insert_reduce?(actions, state, @items.index(lookahead_item.item).not_nil!)
          end
        end

        return actions
      end

      # Creates the transition (goto) table, indexed first by terminal columns
      # and then, offset by the terminal count, by nonterminal columns.
      def state_table
        last_terminal_index = @items.max_of? do |item|
          item.body.select(&.is_a?(Elements::TerminalId)).max_of?(&.table_index) || 0_i64
        end || 0_i64

        last_nonterminal_index = @items.max_of? do |item|
          Math.max(item.head.table_index, item.body.select(&.is_a?(Elements::NonterminalId)).max_of?(&.table_index) || 0_i64)
        end || 0_i64

        # One extra column because the EOF token has its own spot, too.
        table = Array.new(@states.size + 1) { Array.new(last_terminal_index + last_nonterminal_index + 1, 0_i64) }
        @states.each do |state|
          state.transitions.each do |token, destination|
            # Terminals are checked first, mirroring the original branch order.
            if token.is_a?(Elements::IndexableElement)
              table[state.id + 1][token.table_index] = destination.id + 1
            elsif token.is_a?(Elements::NonterminalId)
              table[state.id + 1][token.table_index + last_terminal_index] = destination.id + 1
            end
          end
        end

        return table
      end
    end
  end
end
-------------------------------------------------------------------------------- /src/tools/dot/pegasus_dot.cr: --------------------------------------------------------------------------------
require "../../pegasus/language_def.cr"
require "../../pegasus/json.cr"
require "option_parser"

module Pegasus::Dot
  extend self

  # Outputs the DFA lexing state machine from the LanguageData.
  def output_dfa(data, io)
    io << "digraph G {\n"
    data.lex_state_table.each_with_index do |row, from|
      next if from == 0 # row 0 is the dead state; it has no useful edges

      row.each_with_index do |to, byte|
        next if to == 0
        io << " q#{from} -> q#{to} [label=#{byte.chr.to_s.dump}]\n"
      end
    end
    io << "}"
  end

  # Outputs the PDA parsing state machine from the LanguageData.
26 | def output_pda(data, io) 27 | io << "digraph G {\n" 28 | data.parse_state_table.each_with_index do |state, i| 29 | next if i == 0 30 | state_name = "q#{i}" 31 | 32 | state.each_with_index do |j, cause| 33 | other_state_name = "q#{j}" 34 | if j != 0 35 | if cause == 0 36 | transition_label = "(EOF)" 37 | elsif cause - 1 <= data.max_terminal 38 | transition_label = data.terminals.find { |k, v| v.raw_id == cause - 1 }.not_nil![0].dump 39 | else 40 | transition_label = data.nonterminals.find { |k, v| v.raw_id == cause - 1 - (data.max_terminal + 1) }.not_nil![0].dump 41 | end 42 | io << " #{state_name} -> #{other_state_name} [label=#{transition_label}]\n" 43 | end 44 | end 45 | end 46 | io << "}" 47 | end 48 | 49 | # Output target specified on command line. 50 | enum OutputTarget 51 | # Print DOT for DFA 52 | Dfa 53 | # Print DOT for PDA 54 | Pda 55 | end 56 | end 57 | 58 | # Configuration options 59 | output_target = Pegasus::Dot::OutputTarget::Pda 60 | 61 | # Parse configuration from command line 62 | OptionParser.parse do |parser| 63 | parser.banner = "Usage: pegasus-dot [arguments]" 64 | parser.on("-o FORMAT", "--output FORMAT", 65 | "Specifies the output format of the DOT converter. Either \"Dfa\" or \"Pda\"") do |format| 66 | output_target = Pegasus::Dot::OutputTarget.parse? format 67 | if output_target == nil 68 | STDERR.puts "ERROR: #{format} is not a valid format option." 69 | STDERR.puts parser 70 | exit(1) 71 | end 72 | end 73 | parser.on("-h", "--help", "Show this help") { puts parser } 74 | parser.invalid_option do |flag| 75 | STDERR.puts "ERROR: #{flag} is not a valid option." 76 | STDERR.puts parser 77 | exit(1) 78 | end 79 | end 80 | 81 | # Reaad, parse, and output LanguageData. 
data = Pegasus::Language::LanguageData.from_json STDIN
case output_target
when Pegasus::Dot::OutputTarget::Dfa
  Pegasus::Dot.output_dfa(data, STDOUT)
when Pegasus::Dot::OutputTarget::Pda
  Pegasus::Dot.output_pda(data, STDOUT)
end
-------------------------------------------------------------------------------- /src/tools/sim/pegasus_sim.cr: --------------------------------------------------------------------------------
require "../../pegasus/language_def.cr"
require "../../pegasus/json.cr"
require "option_parser"

module Pegasus::Sim
  # A single lexed token: the terminal's ID plus the matched text.
  class Token
    getter id : Int64
    getter string : String

    def initialize(@id, @string)
    end

    def to_s(io)
      io << "Token(#{id}, #{string})"
    end
  end

  # Base class for nodes of the parse tree the simulator assembles.
  abstract class Tree
    # The column this node occupies in the parse transition table.
    abstract def table_index : Int64

    def display(io, offset)
    end
  end

  # A leaf node wrapping a single lexed token.
  class TokenTree < Tree
    def initialize(@token : Token)
    end

    def table_index : Int64
      @token.id
    end

    def display(io, offset)
      io << " " * offset
      io << @token
      io.puts
    end
  end

  # An interior node produced by reducing a grammar rule.
  class ParentTree < Tree
    getter children : Array(Tree)

    def initialize(@nonterminal_id : Int64, @max_terminal : Int64, @children = [] of Tree, @name : String? = nil)
    end

    def table_index : Int64
      # Nonterminal columns start after the terminal columns and the EOF column.
      @nonterminal_id + @max_terminal + 2
    end

    def display(io, offset)
      io << " " * offset
      io << "ParentTree(" << (@name || @nonterminal_id) << ")"
      io.puts
      @children.each { |child| child.display(io, offset + 1) }
    end
  end
end

input_json_option = nil

OptionParser.parse do |parser|
  parser.banner = "Usage: pegasus-sim [arguments]"
  # NOTE(review): the long form advertises FORMAT but the value is a file
  # path; the metavariable was probably meant to be FILE.
  parser.on("-i FILE", "--input FORMAT", "Specifies input JSON file") do |file|
    input_json_option = file
  end
  parser.on("-h", "--help", "Show this help") { puts parser }
  parser.invalid_option do |flag|
    STDERR.puts "ERROR: #{flag} is not a valid option."
    STDERR.puts parser
    exit(1)
  end
end

raise "Input file not specified" unless input_json_option
input_json = input_json_option.not_nil!

raise "Unable to open specified file" unless File.file? input_json
input = File.read input_json

data = Pegasus::Language::LanguageData.from_json input
to_parse = STDIN.gets_to_end.chomp

# Lexing code

tokens = [] of Pegasus::Sim::Token
# Index at the string
index = 0_i64
# The last "final" match.
last_final = -1_i64
# The location of the last "final" match.
last_final_index = -1_i64
# The beginning of the last token.
# The start position (inclusive) of the token currently being matched.
last_start = 0_i64
# The current DFA state; state 1 is the start state, state 0 is the dead state.
state = 1_i64

# Maximal-munch lexing: run the DFA as far as it will go, emit the longest
# match seen, then resume scanning right after that match.
while index < to_parse.size
  last_final = -1_i64
  last_final_index = -1_i64
  last_start = index
  state = 1_i64

  while (index < to_parse.size) && (state != 0_i64)
    state = data.lex_state_table[state][to_parse[index].bytes[0]]
    if (final = data.lex_final_table[state]) != 0
      last_final = final
      last_final_index = index
    end
    index += 1 if state != 0
  end

  # No accepting state was ever reached: leave index at the offending
  # character so the error below reports its position.
  break if last_final == -1
  # Bug fix: the DFA may have consumed characters past the last accepting
  # state before dying. Rewind so lexing resumes immediately after the
  # accepted match instead of silently dropping those characters. (When the
  # DFA dies on the very next character, this assignment is a no-op.)
  index = last_final_index + 1
  next if data.lex_skip_table[last_final]
  tokens << Pegasus::Sim::Token.new last_final, to_parse[last_start..last_final_index]
end

raise "Invalid token at position #{index}" unless index == to_parse.size

# Parsing code

# Technically this is one stack. However, it's easier to keep track
# of the two types of variables on the stack separately.

# The stack of trees being assembled from the bottom up.
tree_stack = [] of Pegasus::Sim::Tree
# The stack of the states to be followed by the automaton.
state_stack = [ 1_i64 ]
# The index in the tokens
index = 0_i64
# Table column of the start symbol's tree; seeing it on top means acceptance.
final_id = data.max_terminal + 1 + 1

loop do
  # Accept once the start symbol's tree sits on top of the stack.
  break if (top = tree_stack.last?) && top.table_index == final_id
  # Action lookup: column 0 is EOF, otherwise the lookahead token's ID.
  action = data.parse_action_table[state_stack.last][(tokens[index]?.try &.id) || 0_i64]

  raise "Invalid token at position #{index}" if action == -1_i64
  if action == 0
    # Shift: push the lookahead token as a leaf.
    raise "Unexpected end of file" unless index < tokens.size
    tree_stack << Pegasus::Sim::TokenTree.new tokens[index]
    index += 1
  else
    # Reduce by item (action - 1): pop one subtree and one state per body
    # element, then push the assembled parent tree.
    item = data.items[action - 1]
    new_children = [] of Pegasus::Sim::Tree

    item.body.size.times do
      new_children.insert 0, tree_stack.pop
      state_stack.pop
    end
    tree_stack << Pegasus::Sim::ParentTree.new item.head.raw_id, data.max_terminal,
      new_children,
      data.nonterminals.find { |k, v| v.raw_id == item.head.raw_id }.not_nil![0]
  end
  # Follow the shift/goto transition for whatever now tops the tree stack.
  state_stack << data.parse_state_table[state_stack.last][tree_stack.last.table_index]
end
raise "Unexpected token at position #{index}" if index != tokens.size
tree_stack.last.display(STDOUT, 0)
--------------------------------------------------------------------------------