├── .editorconfig ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── grammars ├── adder.grammar ├── arithmetic.grammar ├── derive_lambda.grammar ├── empty_production.grammar ├── match.grammar ├── modern_compiler_design.grammar ├── optional.grammar ├── reduce_reduce.grammar ├── shift_reduce.grammar ├── skip.grammar └── some_derive_lambda.grammar ├── pegasus-sem.grammar ├── pegasus.grammar ├── semantics └── adder.sem ├── shard.yml ├── spec ├── automaton_spec.cr ├── dfa_spec.cr ├── language_spec.cr ├── nfa_spec.cr ├── pda_spec.cr ├── spec_helper.cr └── spec_utils.cr └── src ├── generators ├── c-common │ ├── standard_header.h │ ├── standard_source.c │ ├── tables.cr │ └── tables.ecr ├── c │ ├── pegasus_c.cr │ ├── pegasus_c_header_template.ecr │ ├── pegasus_c_template.ecr │ ├── tree_header.h │ └── tree_source.c ├── crystal-common │ ├── tables.cr │ └── tables.ecr ├── crystal │ ├── pegasus_crystal.cr │ └── pegasus_crystal_template.ecr ├── crystalsem │ ├── pegasus_crystal_template.ecr │ └── pegasus_crystalsem.cr ├── csem │ ├── pegasus_c_header_template.ecr │ ├── pegasus_c_template.ecr │ ├── pegasus_csem.cr │ ├── sem_header.h │ └── sem_source.c └── generators.cr ├── pegasus.cr ├── pegasus ├── automaton.cr ├── dfa.cr ├── elements.cr ├── error.cr ├── generated │ ├── grammar_parser.cr │ └── semantics_parser.cr ├── grammar.cr ├── items.cr ├── json.cr ├── language_def.cr ├── nfa.cr ├── nfa_to_dfa.cr ├── pda.cr ├── regex.cr ├── semantics.cr └── table.cr └── tools ├── dot └── pegasus_dot.cr └── sim └── pegasus_sim.cr /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*.cr] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | indent_style = space 8 | indent_size = 2 9 | trim_trailing_whitespace = true 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 
| /docs/ 2 | /lib/ 3 | /bin/ 4 | /.shards/ 5 | *.dwarf 6 | 7 | # Libraries don't need dependency lock 8 | # Dependencies will be locked in application that uses them 9 | /shard.lock 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: crystal 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Danila Fedorin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pegasus 2 | A parser generator based on Crystal and the UNIX philosophy. 
It is language agnostic, but can 3 | currently generate parsers for the [C](#c-output) and [Crystal](#crystal-output) languages. 4 | 5 | _Warning: Pegasus is experimental. Its APIs are not yet solidified, and are subject to change at any time._ 6 | 7 | ## Table of Contents 8 | * [Architecture](#architecture) 9 | * [Usage](#usage) 10 | * [Tokens](#tokens) 11 | * [Rules](#rules) 12 | * [A Note on Parse Trees](#a-note-on-parse-trees) 13 | * [Regular Expressions](#regular-expressions) 14 | * [Included Programs](#included-programs) 15 | * [Options](#options) 16 | * [Semantic Actions](#semantic-actions) 17 | * [C Output](#c-output) 18 | * [C Output With Semantic Actions](#c-output-with-semantic-actions) 19 | * [Crystal Output](#crystal-output) 20 | * [Crystal Output With Semantic Actions](#crystal-output-with-semantic-actions) 21 | * [JSON Format](#json-format) 22 | 23 | ## Architecture 24 | Pegasus is based on the UNIX philosophy of doing one thing, and doing it well. 25 | The core pegasus program isn't as much a parser generator as it is a Push Down 26 | Automaton generator. 27 | 28 | Pegasus reads the grammar files, creates a Deterministic Finite Automaton (DFA) that is then used to tokenize (lex) text. Then, it creates an 29 | LALR(1) Push Down Automaton that is then used to parse text. However, it doesn't actually generate a parser: it outputs the generated tables for both automatons, 30 | as well as some extra information, as JSON. Another program, specific to each 31 | language, then reads the JSON and is responsible for code output. 32 | 33 | This is beneficial because this prevents the code generator from being dependent on a language. JSON is a data interchange format, and it is easily readable from almost any programming language. If I, or others, want to add a code generation target, they can just parse the JSON in their preferred language, rather than Crystal. 
An additional benefit is that the addition of a target doesn't require the pegasus core to be updated or recompiled. 34 | ## Usage 35 | Pegasus parses grammars written in very basic notation. The grammars are separated into two 36 | sections: the __tokens__ and the __rules__. 37 | ### Tokens 38 | The tokens are terminals, and are described using 39 | regular expressions. An example token declaration is as follows: 40 | ``` 41 | token hello = /hello/; 42 | ``` 43 | Notice that the token declaration is terminated by a semicolon. Also notice that the regular expression is marked at both ends by a forward slash, `/`. In order to write a regular expression that includes a forward slash, it needs to be escaped, like `\/`. More information on regular expressions accepted by Pegasus can be found below. 44 | ### Rules 45 | Grammar rules appear after tokens in the grammar file. An example rule is given as follows: 46 | ``` 47 | rule S = hello; 48 | ``` 49 | This rule uses the token we declared above, that is, `hello`, which matches the string hello. 50 | In order to expect multiple tokens, we simply write them one after another: 51 | ``` 52 | rule S = hello hello; 53 | ``` 54 | Grammar rules aren't limited to only tokens. The names of other grammar rules, declared either earlier or later in the file, can also be used. For example: 55 | ``` 56 | rule S = two_hello hello; 57 | rule two_hello = hello hello; 58 | ``` 59 | Here, we declare a second rule, `two_hello`, and then use it in the `S` rule. 60 | 61 | Sometimes, it's useful to be able to declare several alternatives for rule. For example, we want to have an "operand" rule in a basic calculator, and an operand can either be a variable like "x" or a number like "3". 
We can write a rule as follows: 62 | ``` 63 | rule operand = number | variable; 64 | ``` 65 | ### A Note on Parse Trees 66 | Earlier, we saw two rules written as follows: 67 | ``` 68 | rule S = two_hello hello; 69 | rule two_hello = hello hello; 70 | ``` 71 | While it accepts the same language, this is __not__ equivalent to the following: 72 | ``` 73 | rule S = hello hello hello; 74 | ``` 75 | The reason is that Pegasus, by default, produces parse trees. The first grammar will produce 76 | a parse tree whose root node, `S`, has two children, one being `two_hello` and the other being `hello`. The `two_hello` node will have two child nodes, both `hello`. However, the second variant will produce a parse tree whose root node, `S`, has three children, all `hello`. 77 | ### Regular Expressions 78 | Regular 79 | expressions support some basic operators: 80 | * `hey+` matches `hey`, `heyy`, `heyyy`, and so on. 81 | * `hey?` matches `hey` or `he` 82 | * `hey*` matches `he`, `hey`, `heyy`, and so on. 83 | 84 | Operators can also be applied to groups of characters: 85 | * `(ab)+` matches `ab`, `abab`, `ababab`, and so on. 86 | 87 | Please note, however, that Pegasus's lexer does not capture groups. 88 | ### Options 89 | Pegasus supports an experimental mechanism to aid in parser generation, which involves attaching options 90 | to tokens or rules. Right now, the only option that is recognized is attached to a token definition. This option is "skip". 91 | Options are delcared as such: 92 | ``` 93 | token space = / +/ [ skip ]; 94 | ``` 95 | The skip option means that the token it's attached to, in this case `space`, will be immediately discarded, and parsing will go on 96 | as if it wasn't there. 
For example, if we want a whitespace-insensitive list of digits, we can write it as such: 97 | ``` 98 | token space = / +/ [ skip ]; 99 | token digit = /[0-9]/; 100 | token list_start = /\[/; 101 | token list_end = /\]/; 102 | token comma = /,/; 103 | 104 | rule list = list_start list_recursive list_end; 105 | rule list_recursive = digit | digit comma list_recursive; 106 | ``` 107 | Now, this will be able to parse equivalently the strings "[3]", "[ 3 ]" and [ 3]", because the whitespace token is ignored. 108 | ### Semantic Actions 109 | It's certainly convenient to create a parse tree that perfectly mimics the structure of a language's grammar. However, this isn't always desirable - if the user desires to construct an Abstract Syntax Tree, they're left having to walk the structure of the resulting tree _again_, frequently checking what rule created a particular nonterminal, or how many children a root node has. This is less than ideal - we don't want to duplicate the work of specifying the grammar when we walk the trees. Furthermore, if the grammar changes, the code that walks the parse trees will certainly need to change. 110 | 111 | To remedy this, I've been toying with the idea of including _semantic actions_ into Pegasus, in a very similar way to Yacc / Bison. Semantic actions are pieces of code that run when a particular rule in the grammar is matched. However, this would mean that the user has to write these actions in some particular language (Yacc / Bison use C/C++). Since Pegasus aims to be language agnostic, writing code in a particular language in the main grammar file is undesirable. Thus, I chose the approach of separating semantic actions into a separate file format. The format uses `$$` to delimit code blocks, and contains the following sections: 112 | 113 | * Types that various nonterminals are assigned. For instance, a boolean expression can be assigned the C++ type "bool". 114 | * The actual rules that are of each of the types declared above. 
115 | * The init code (placed in a global context before the parsing function) 116 | * The semantic actions for each rule. 117 | 118 | For a concrete example of this file format, see the example code in the [C Output With Semantic Actions](#c-output-with-semantic-actions) section. 119 | 120 | ### Included programs 121 | Before you use any of these programs, you should use 122 | ``` 123 | shards build --release 124 | ``` 125 | This will compile all the Pegasus programs in release mode, 126 | for optimal performance. 127 | #### `pegasus` 128 | This program reads grammars from standard input, and generates 129 | JSON descriptions out LALR automata, 130 | which will be read by the other programs. For example: 131 | ```Bash 132 | echo 'token hello = /Hello, world!/; rule S = hello;' > test.grammar 133 | ./bin/pegasus < test.grammar 134 | ``` 135 | This prints the JSON to the command line. If you'd like to output 136 | JSON to a file, you can use: 137 | ```Bash 138 | ./bin/pegasus < test.grammar > test.json 139 | ``` 140 | #### `pegasus-dot` 141 | This program is used largely for debugging purpose, and generates GraphViz 142 | DOT output, which can then by converted by the `dot` program into images. 143 | This greatly helps with debugging generated automata. `pegasus-dot` simply 144 | reads the generated JSON file: 145 | ```Bash 146 | ./bin/pegasus-dot < test.json 147 | ``` 148 | To generate a PNG from the DOT output, you need the `dot` program installed. 149 | Once you have that, you can just pipe the output of `pegasus-dot` into `dot`: 150 | ```Bash 151 | ./bin/pegasus-dot < test.json | dot -Tpng -o visual.png 152 | ``` 153 | #### `pegasus-sim` 154 | This is another program largely used for debugging. Instead of generating 155 | a parser, it reads a JSON file, then attempts to parse text from STDIN. 156 | Once it's done, it prints the result of its attempt. 
Note that because 157 | it reads input from STDIN, rather than JSON, the JSON 158 | file has to be given as a command-line argument: 159 | ```Bash 160 | echo 'Hello, world!' | ./bin/pegasus-sim -i test.json 161 | ``` 162 | 163 | #### `pegasus-c` 164 | Finally, a parser generator! `pegasus-c` takes JSON, and creates C 165 | header and source files that can then be integrated into your project. 166 | To learn how to use the generated code, please take a look at the 167 | [C output](#c-output) section. 168 | ```Bash 169 | ./bin/pegasus-c < test.json 170 | ``` 171 | 172 | #### `pegasus-crystal` 173 | Another parser generator. `pegasus-crystal` outputs Crystal code 174 | which can then be integrated into your project. 175 | To learn how to use the generated code, lease take a look at the 176 | [Crystal output](#crystal-output) section. 177 | ```Bash 178 | ./bin/pegasus-crystal < test.json 179 | ``` 180 | 181 | #### `pegasus-csem` 182 | Another C parser generator. The difference between this parser generator and `pegasus-c` is that it uses a separate semantic actions file to mimic the functionality of Yacc/Bison. This means you can specify C code that runs when each rule in the grammar is matched. To learn how to use this parser generator, see the [C Output With Semantic Actions](#c-output-with-semantic-actions) section. 183 | ``` 184 | ./bin/pegasus-csem -l test.json -a test.sem 185 | ``` 186 | 187 | ## C Output 188 | The pegasus repository contains the source code of a program that converts the JSON output into C source code. It generates a derivation tree, stored in `pgs_tree`, which is made up of nonterminal parent nodes and terminal leaves. Below is a simple example of using the functions generated for a grammar that describes the language of a binary operation applied to two numbers. 
189 | The grammar: 190 | ``` 191 | token op_add = /\+/; 192 | token op_sub = /-/; 193 | token op_mul = /\*/; 194 | token op_div = /\//; 195 | token number = /[0-9]/; 196 | 197 | rule S = expr; 198 | rule expr = number op number; 199 | rule op = op_add | op_sub | op_div | op_mul; 200 | ``` 201 | _note: backslashes are necessary in the regular expressions because `+` and `*` are operators in the regular expression language._ 202 | 203 | The code for the API: 204 | ```C 205 | /* Include the generated header file */ 206 | #include "parser.h" 207 | #include 208 | 209 | int main(int argc, char** argv) { 210 | pgs_state state; /* The state is used for reporting error messages.*/ 211 | pgs_tree* tree; /* The tree that will be initialized */ 212 | char buffer[256]; /* Buffer for string input */ 213 | 214 | gets(buffer); /* Unsafe function for the sake of example */ 215 | /* pgs_do_all lexes and parses the text from the buffer. */ 216 | if(pgs_do_all(&state, &tree, buffer)) { 217 | /* A nonzero return code indicates error. Print it.*/ 218 | printf("Error: %s\n", state.errbuff); 219 | } else { 220 | /* Do nothing, free the tree. */ 221 | /* Tree is not initialized unless parse succeeds. */ 222 | pgs_free_tree(tree); 223 | } 224 | } 225 | ``` 226 | This example is boring because nothing is done with the tree. Let's walk the tree and print it out: 227 | ```C 228 | void print_tree(pgs_tree* tree, const char* source, int indent) { 229 | size_t i; 230 | /* Print an indent. */ 231 | for(i = 0; i < indent; i++) printf(" "); 232 | /* If the tree is a terminal (actual token) */ 233 | if(tree->variant == PGS_TREE_TERMINAL) { 234 | printf("Terminal: %.*s\n", (int) (PGS_TREE_T_TO(*tree) - PGS_TREE_T_FROM(*tree)), 235 | source + PGS_TREE_T_FROM(*tree)); 236 | } else { 237 | /* PGS_TREE_NT gives the nonterminal ID from the given tree. 
*/ 238 | printf("Nonterminal: %s\n", pgs_nonterminal_name(PGS_TREE_NT(*tree))); 239 | /* PGS_TREE_NT_COUNT returns the number of children a nonterminal 240 | node has. */ 241 | for(i = 0; i < PGS_TREE_NT_COUNT(*tree); i++) { 242 | /* PGS_TREE_NT_CHILD gets the nth child of a nonterminal tree. */ 243 | print_tree(PGS_TREE_NT_CHILD(*tree, i), source, indent + 1); 244 | } 245 | } 246 | } 247 | ``` 248 | For the input string `3+3`, the program will output: 249 | ``` 250 | Nonterminal: S 251 | Nonterminal: expr 252 | Nonterminal: number 253 | Terminal: 3 254 | Nonterminal: op 255 | Terminal: + 256 | Nonterminal: number 257 | Terminal: 3 258 | ``` 259 | Some more useful C macros for accessing the trees can be found in `parser.h` 260 | 261 | ## C Output With Semantic Actions 262 | Say you don't need a parse tree. Instead, you want to construct your own values from Pegasus grammar rules. In this case, you want to use the `pegasus-csem` parser generator. It is best demonstrated using a small example. Let's consider a language of booleans: 263 | ``` 264 | token whitespace = /[ \n\t]+/ [ skip ]; 265 | token true = /true/; 266 | token false = /false/; 267 | token and = /and/; 268 | token or = /or/; 269 | 270 | rule S = expr; 271 | rule expr = tkn | expr and tkn | expr or tkn; 272 | rule tkn = true | false; 273 | ``` 274 | Easy enough. But why would we want a parse tree from this? Let's operate directly on booleans (which we'll represent as integers in C). We create the semantic actions file step by step. First, we know all our actions will produce integers (which represent booleans). So we create a boolean type: 275 | ``` 276 | type boolean = $$ int $$ 277 | ``` 278 | Now, we want to assign this type to the nonterminals in our language. 
We do this as follows: 279 | ``` 280 | typerules boolean = [ S, expr, tkn ] 281 | ``` 282 | We don't need any global variables or functions, so we can just leave the `init` block blank: 283 | ``` 284 | init = $$ $$ 285 | ``` 286 | Next, we write actions that correspond to each gramamr rule. 287 | ``` 288 | rule S(0) = $$ $out = $0; $$ 289 | ``` 290 | `$out` is the "output" variable, and `$0` is the value generated for the first terminal or nonterminal in the rule (in this case, `expr`). This rule just forwards the result of the rules for `expr`. Next, let's write rules for `expr`: 291 | ``` 292 | rule expr(0) = $$ $out = $0; $$ 293 | rule expr(1) = $$ $out = $0 & $2; $$ 294 | rule expr(2) = $$ $out = $0 | $2; $$ 295 | ``` 296 | The first rule simply forwards the value generated for `tkn`. The other two rules combine the results of their subexpressions using `&` and `|` (we use `&` in the grammar rule that has the `and` token, and `|` in the grammar rule that has the `or` token). Finally, we write the rules for `tkn`: 297 | ``` 298 | rule tkn(0) = $$ $out = 1; $$ 299 | rule tkn(1) = $$ $out = 0; $$ 300 | ``` 301 | Time to test this. We need to write a simple program that uses the parser. The main difference from the C output without semantic actions is that we use `pgs_stack_value` union type, with fields named after the types we registered (`boolean`, in this case). The code: 302 | ```C 303 | #include "parser.h" 304 | 305 | int main() { 306 | pgs_stack_value v; /* Temporary variable into which to store the result */ 307 | pgs_state s; /* The state used for reporting error message */ 308 | 309 | /* Initialize the state */ 310 | pgs_state_init(&s); 311 | /* Tokenize and parse a hardcoded string, ignoring error code */ 312 | pgs_do_all(&s, &v, "false or false or true"); 313 | /* Print the error generated, if any */ 314 | printf("%s\n", s.errbuff); 315 | /* Print the boolean value as an integer. 
*/ 316 | printf("%d\n", v.boolean); 317 | } 318 | ``` 319 | The output is the result of evaluating our expression: "true", or 1: 320 | ``` 321 | 322 | 1 323 | ``` 324 | 325 | ## Crystal Output 326 | Just like with C, this repository contains a program to output Crystal when code given a JSON file. 327 | Because Crystal supports exceptions and garbage collection, there is no need to initialize 328 | any variables, or call corresponding `free` functions. The most basic example of reading 329 | a line from the standard input and parsing it is below: 330 | ```Crystal 331 | require "./parser.cr" 332 | 333 | Pegasus::Generated.process(STDIN.gets.not_nil!) 334 | ``` 335 | Of course, this isn't particularly interesting. Let's add a basic function to print the tree: 336 | ```Crystal 337 | def print_tree(tree, indent = 0) 338 | indent.times { STDOUT << " " } 339 | case tree 340 | when Pegasus::Generated::TerminalTree 341 | STDOUT << "Terminal: " 342 | STDOUT.puts tree.string 343 | when Pegasus::Generated::NonterminalTree 344 | STDOUT << "Nonterminal: " << tree.name 345 | STDOUT.puts 346 | tree.children.each { |it| print_tree(it, indent + 1) } 347 | end 348 | end 349 | ``` 350 | For the input string `3+3`, the program will output: 351 | ``` 352 | Nonterminal: S 353 | Nonterminal: expr 354 | Nonterminal: number 355 | Terminal: 3 356 | Nonterminal: op 357 | Terminal: + 358 | Nonterminal: number 359 | Terminal: 3 360 | ``` 361 | 362 | ## Crystal Output with Semantic Actions 363 | This is just like C semantic actions, but with Crystal. Suppose you don't need 364 | a parse tree. Rather, you want to generate your own values from Pegasus grammar 365 | rules. You can do this with the `pegasus-crystalsem` parser generator. When 366 | using this generator, you specify an additional file, which associates Crystal 367 | code (_semantic actions_) with each rule. 
Let's consider a language 368 | of booleans: 369 | ``` 370 | token whitespace = /[ \n\t]+/ [ skip ]; 371 | token true = /true/; 372 | token false = /false/; 373 | token and = /and/; 374 | token or = /or/; 375 | 376 | rule S = expr; 377 | rule expr = tkn | expr and tkn | expr or tkn; 378 | rule tkn = true | false; 379 | ``` 380 | Now that we have our grammar, it's time to formulate the additional file 381 | we mentioned. The first thing we need to do is figure out what Crystal 382 | type each of the nonterminals we generate. Our language is that 383 | of booleans, so we will be needing a boolean type: 384 | ``` 385 | type boolean = $$ Bool $$ 386 | ``` 387 | Here, the stuff inside the `$$` is Crystal code that is pasted verbatim into the 388 | generated parser. Now, we want to specify which rules evaluate to that type. 389 | In our simple language, every rule evaluates to a boolean: 390 | ``` 391 | typerules boolean = [ S, expr, tkn ] 392 | ``` 393 | `pegasus-crystalsem` also allows you to put some code above the parsing code, 394 | globally. We don't use this, so we leave the `init` property blank: 395 | ``` 396 | init = $$ $$ 397 | ``` 398 | It is now time to assign semantic Crystal actions to each grammar rule. We 399 | start with the first rule, `S(0)` (which means the first rule for the 400 | `S` nonterminal). Since the first rule just matches an `expr`, we 401 | simply output the value of that `expr`: 402 | ``` 403 | rule S(0) = $$ $out = $0 $$ 404 | ``` 405 | This means "set the output to be the value of the first element in the rule's body". 406 | We now implement the actual rules for `expr`. The first rule simply forwards 407 | the result of the `tkn`, just like the rule for `S`. 
The other two rules actually 408 | implement the logical operations of `&` and `|`: 409 | ``` 410 | rule expr(0) = $$ $out = $0 $$ 411 | rule expr(1) = $$ $out = $0 & $2 $$ 412 | rule expr(2) = $$ $out = $0 | $2 $$ 413 | ``` 414 | Finally, we use the two rules for `tkn` to actually return a boolean: 415 | ``` 416 | rule tkn(0) = $$ $out = true $$ 417 | rule tkn(1) = $$ $out = false $$ 418 | ``` 419 | Let's test this. We include the generated parser, and write the following: 420 | ```Crystal 421 | require "./parser.cr" 422 | 423 | puts Pegasus::Generated.process(gets.not_nil!) 424 | ``` 425 | Let's now run this with the expression `true or false or true`. The output: 426 | ``` 427 | true 428 | ``` 429 | That's indeed our answer! 430 | 431 | ## JSON Format 432 | For the grammar given by: 433 | ``` 434 | token hi = /hi/; 435 | rule A = hi; 436 | ``` 437 | The corresponding (pretty-printed) JSON output is: 438 | ``` 439 | { 440 | "lex_state_table":[[..]..], 441 | "lex_final_table”:[..], 442 | "parse_state_table":[[..]..], 443 | "parse_action_table":[[..]..], 444 | "terminals":{ 445 | "hi":{ 446 | "terminal_id":0 447 | } 448 | }, 449 | "nonterminals":{ 450 | "A":{ 451 | "nonterminal_id":0 452 | } 453 | }, 454 | "items":[ 455 | { 456 | "head":{ 457 | "nonterminal_id":0 458 | }, 459 | "body":[ 460 | { 461 | "terminal_id":0 462 | } 463 | ] 464 | } 465 | ], 466 | "max_terminal":0 467 | } 468 | ``` 469 | ## Contributors 470 | 471 | - [DanilaFe](https://github.com/DanilaFe) Danila Fedorin - creator, maintainer 472 | -------------------------------------------------------------------------------- /grammars/adder.grammar: -------------------------------------------------------------------------------- 1 | token add = /\+/; 2 | token number = /[1-9][0-9]*/; 3 | 4 | rule S = add_expr; 5 | rule add_expr = add_expr add number | number; 6 | -------------------------------------------------------------------------------- /grammars/arithmetic.grammar: 
-------------------------------------------------------------------------------- 1 | token add = /\+/; 2 | token sub = /-/; 3 | token mul = /\*/; 4 | token div = /\//; 5 | token open_parenth = /\(/; 6 | token close_parenth = /\)/; 7 | token number = /[1-9][0-9]*/; 8 | 9 | rule S = add_expr; 10 | rule add_expr = add_expr add_op mul_expr | mul_expr; 11 | rule mul_expr = mul_expr mul_op atom | atom; 12 | rule atom = open_parenth add_expr close_parenth | number; 13 | rule add_op = add | sub; 14 | rule mul_op = div | mul; 15 | -------------------------------------------------------------------------------- /grammars/derive_lambda.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | token goodbye = /goodbye/; 3 | rule S = A B; 4 | rule A = hello?; 5 | rule B = goodbye?; 6 | -------------------------------------------------------------------------------- /grammars/empty_production.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | rule S = hello?; 3 | -------------------------------------------------------------------------------- /grammars/match.grammar: -------------------------------------------------------------------------------- 1 | token open_parenth = /\(/; 2 | token close_parenth = /\)/; 3 | token matched_parenth = /\(\)/; 4 | token open_square = /\[/; 5 | token close_square = /\]/; 6 | token matched_square = /\[\]/; 7 | token open_curly = /{/; 8 | token close_curly = /}/; 9 | token matched_curly = /{}/; 10 | 11 | rule S = any; 12 | rule any = parenths any 13 | | square_brackets any 14 | | brackets any 15 | | parenths 16 | | square_brackets 17 | | brackets; 18 | rule parenths = open_parenth any close_parenth | matched_parenth; 19 | rule square_brackets = open_square any close_square | matched_square; 20 | rule brackets = open_curly any close_curly | matched_curly; 21 | 
-------------------------------------------------------------------------------- /grammars/modern_compiler_design.grammar: -------------------------------------------------------------------------------- 1 | token x = /x/; 2 | token b = /b/; 3 | token a = /a/; 4 | 5 | rule S = A | x b; 6 | rule A = a A b | B; 7 | rule B = x; 8 | -------------------------------------------------------------------------------- /grammars/optional.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | rule S = hello? hello; 3 | -------------------------------------------------------------------------------- /grammars/reduce_reduce.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | rule S = A | B; 3 | rule A = hello; 4 | rule B = hello; 5 | -------------------------------------------------------------------------------- /grammars/shift_reduce.grammar: -------------------------------------------------------------------------------- 1 | token world = /world/; 2 | token hello = /hello/; 3 | 4 | rule S = A | B world; 5 | rule A = hello world; 6 | rule B = hello; 7 | -------------------------------------------------------------------------------- /grammars/skip.grammar: -------------------------------------------------------------------------------- 1 | token whitespace = /[ \n\t]+/ [ skip ]; 2 | token hello = /hello/; 3 | rule S = hello hello; 4 | -------------------------------------------------------------------------------- /grammars/some_derive_lambda.grammar: -------------------------------------------------------------------------------- 1 | token hello = /hello/; 2 | token goodbye = /goodbye/; 3 | rule S = A B hello; 4 | rule A = hello?; 5 | rule B = goodbye?; 6 | -------------------------------------------------------------------------------- /pegasus-sem.grammar: -------------------------------------------------------------------------------- 1 | 
token whitespace = /([ \t]|\r?\n)+/ [ skip ]; 2 | token identifier = /[a-zA-Z_\-]+/; 3 | token integer = /[0-9]+/; 4 | token code = /$$([^$]|$[^$])*$$/; 5 | token keyword_type = /type/; 6 | token keyword_typerules = /typerules/; 7 | token keyword_state = /state/; 8 | token keyword_init = /init/; 9 | token keyword_rule = /rule/; 10 | token eq = /=/; 11 | token oparenth = /\(/; 12 | token cparenth = /\)/; 13 | token obracket = /\[/; 14 | token cbracket = /\]/; 15 | token comma = /,/; 16 | 17 | rule S = type_list typerules_list init_decl rule_list; 18 | 19 | rule type_list = type_decl type_list?; 20 | rule type_decl = keyword_type identifier eq code; 21 | 22 | rule typerules_list = typerules_decl typerules_list?; 23 | rule typerules_decl = keyword_typerules identifier eq obracket identifier_list cbracket; 24 | rule identifier_list = identifier | identifier comma identifier_list; 25 | 26 | rule init_decl = keyword_init eq code; 27 | 28 | rule rule_list = rule_decl rule_list?; 29 | rule rule_decl = keyword_rule identifier oparenth integer cparenth eq code; 30 | -------------------------------------------------------------------------------- /pegasus.grammar: -------------------------------------------------------------------------------- 1 | token whitespace = /([ \t]|\r?\n)+/ [ skip ]; 2 | token identifier = /[a-zA-Z_\-]+/; 3 | token keyword_token = /token/; 4 | token keyword_rule = /rule/; 5 | token equals_delimiter = /=/; 6 | token semicolon_delimiter = /;/; 7 | token or_delimiter = /\|/; 8 | token regex = /\/([^\/]|\\.)*\//; 9 | token open_square = /\[/; 10 | token closed_square = /\]/; 11 | token open_parenth = /\(/; 12 | token closed_parenth = /\)/; 13 | token comma = /,/; 14 | token optional = /\?/; 15 | 16 | rule S = token_list grammar_list | token_list | grammar_list; 17 | rule token_list = token_def | token_def token_list; 18 | rule token_def = keyword_token identifier equals_delimiter regex statement_end; 19 | rule grammar_list = grammar_rule | grammar_rule 
grammar_list; 20 | rule grammar_rule = keyword_rule identifier equals_delimiter grammar_bodies statement_end; 21 | rule grammar_bodies = grammar_body | grammar_body or_delimiter grammar_bodies; 22 | rule grammar_body = grammar_element | grammar_element grammar_body; 23 | rule grammar_element = identifier | identifier optional; 24 | rule statement_end = options semicolon_delimiter | semicolon_delimiter; 25 | rule options = open_square options_list closed_square; 26 | rule options_list = option | option comma options_list; 27 | rule option = identifier; 28 | -------------------------------------------------------------------------------- /semantics/adder.sem: -------------------------------------------------------------------------------- 1 | type integer = $$ int $$ 2 | 3 | typerules integer = [S, add_expr] 4 | 5 | init = $$ $$ 6 | 7 | rule S(0) = $$ $out = $0; $$ 8 | rule add_expr(0) = $$ $out = $0 + atoi(src + $2->from); $$ 9 | rule add_expr(1) = $$ $out = atoi(src + $0->from); $$ 10 | -------------------------------------------------------------------------------- /shard.yml: -------------------------------------------------------------------------------- 1 | name: pegasus 2 | version: 0.1.1 3 | 4 | authors: 5 | - Danila Fedorin 6 | 7 | targets: 8 | pegasus: 9 | main: src/pegasus.cr 10 | pegasus-dot: 11 | main: src/tools/dot/pegasus_dot.cr 12 | pegasus-sim: 13 | main: src/tools/sim/pegasus_sim.cr 14 | pegasus-c: 15 | main: src/generators/c/pegasus_c.cr 16 | pegasus-csem: 17 | main: src/generators/csem/pegasus_csem.cr 18 | pegasus-crystal: 19 | main: src/generators/crystal/pegasus_crystal.cr 20 | pegasus-crystalsem: 21 | main: src/generators/crystalsem/pegasus_crystalsem.cr 22 | 23 | crystal: 1.14.0 24 | 25 | license: MIT 26 | -------------------------------------------------------------------------------- /spec/automaton_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe 
Pegasus::Automata::Automaton do 4 | describe "#initialize" do 5 | it "Starts at state 0" do 6 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 7 | automaton.last_id.should eq 0 8 | end 9 | 10 | it "Doesn't add any states" do 11 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 12 | automaton.states.size.should eq 0 13 | end 14 | 15 | it "Starts with a nil start state" do 16 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 17 | automaton.start.should be_nil 18 | end 19 | end 20 | 21 | describe "#state_for" do 22 | it "Increments the state ID after every created state" do 23 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 24 | automaton.state_for(data: 3).id.should eq 0 25 | automaton.state_for(data: 3).id.should eq 1 26 | automaton.state_for(data: 4).id.should eq 2 27 | end 28 | 29 | it "Creates a state with the correct data" do 30 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 31 | automaton.state_for(data: 3).data.should eq 3 32 | automaton.state_for(data: 3).data.should eq 3 33 | automaton.state_for(data: 4).data.should eq 4 34 | end 35 | 36 | it "Adds the state to its internal list" do 37 | automaton = Pegasus::Automata::Automaton(Int32, Int32).new 38 | state_one = automaton.state_for(data: 1) 39 | state_two = automaton.state_for(data: 2) 40 | state_three = automaton.state_for(data: 3) 41 | 42 | automaton.states.should contain state_one 43 | automaton.states.should contain state_two 44 | automaton.states.should contain state_three 45 | end 46 | end 47 | end 48 | 49 | describe Pegasus::Automata::UniqueAutomaton do 50 | describe "#initialize" do 51 | it "Has no state memorized" do 52 | automaton = Pegasus::Automata::UniqueAutomaton(Int32, Int32).new 53 | automaton.@memorized.size.should eq 0 54 | end 55 | end 56 | 57 | describe "#state_for" do 58 | it "Doesn't create states with duplicate values" do 59 | automaton = Pegasus::Automata::UniqueAutomaton(Int32, Int32).new 60 | automaton.state_for(data: 
3).id.should eq 0 61 | automaton.state_for(data: 3).id.should eq 0 62 | automaton.state_for(data: 4).id.should eq 1 63 | end 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /spec/dfa_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe Pegasus::Dfa do 4 | describe "#final_table" do 5 | it "Creates a two-entry table when there are no expression" do 6 | nfa = Pegasus::Nfa::Nfa.new 7 | dfa = nfa.dfa 8 | table = dfa.final_table 9 | table.size.should eq 2 10 | table[0].should eq 0 11 | table[1].should eq 0 12 | end 13 | 14 | it "Creates a two-entry table with a final state for an empty expression" do 15 | nfa = Pegasus::Nfa::Nfa.new 16 | nfa.add_regex "", 0_i64 17 | dfa = nfa.dfa 18 | table = dfa.final_table 19 | table.size.should eq 2 20 | table[0].should eq 0 21 | table[1].should eq 1 22 | end 23 | 24 | it "Creates two final states for an OR expression" do 25 | nfa = Pegasus::Nfa::Nfa.new 26 | nfa.add_regex "h|g", 0_i64 27 | dfa = nfa.dfa 28 | table = dfa.final_table 29 | table.size.should eq 4 30 | table[0].should eq 0 31 | table[1].should eq 0 32 | table[2].should_not eq 0 33 | table[3].should_not eq 0 34 | end 35 | end 36 | 37 | describe "#state_table" do 38 | it "Does not allow transitions out of the error state" do 39 | nfa = Pegasus::Nfa::Nfa.new 40 | dfa = nfa.dfa 41 | table = dfa.state_table 42 | table[0].each &.should eq 0 43 | end 44 | 45 | it "Creates a table leading to the error state when there are no expressions" do 46 | nfa = Pegasus::Nfa::Nfa.new 47 | dfa = nfa.dfa 48 | table = dfa.state_table 49 | table.each &.each &.should eq 0 50 | end 51 | 52 | it "Creates a transition table with a final state for a single character" do 53 | nfa = Pegasus::Nfa::Nfa.new 54 | nfa.add_regex "h", 0_i64 55 | dfa = nfa.dfa 56 | table = dfa.state_table 57 | table.size.should eq 3 58 | final_byte = 'h'.bytes.first 59 | 
table[1].each_with_index do |state, index| 60 | state.should eq 0 if index != final_byte 61 | state.should eq 2 if index == final_byte 62 | end 63 | table[2].each &.should eq 0 64 | end 65 | 66 | it "Creates a forked transition table for a fork in the DFA" do 67 | nfa = Pegasus::Nfa::Nfa.new 68 | nfa.add_regex "h|e", 0_i64 69 | dfa = nfa.dfa 70 | table = dfa.state_table 71 | table.size.should eq 4 72 | h_byte = 'h'.bytes.first 73 | e_byte = 'e'.bytes.first 74 | table[1].each_with_index do |state, index| 75 | state.should eq 0 if index != h_byte && index != e_byte 76 | state.should_not eq 0 if index == h_byte || index == e_byte 77 | end 78 | table[2].each &.should eq 0 79 | table[3].each &.should eq 0 80 | end 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /spec/language_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe Pegasus::Language::LanguageDefinition do 4 | describe "#from_string" do 5 | it "Handles empty strings" do 6 | expect_raises(Pegasus::Error::GrammarException) do 7 | Pegasus::Language::LanguageDefinition.new "" 8 | end 9 | end 10 | 11 | it "Errors on just a rule without a body" do 12 | expect_raises(Pegasus::Error::GrammarException) do 13 | Pegasus::Language::LanguageDefinition.new %(rule S); 14 | end 15 | end 16 | 17 | it "Errors on just a token without a body" do 18 | expect_raises(Pegasus::Error::GrammarException) do 19 | Pegasus::Language::LanguageDefinition.new %(token S); # fixed copy-paste: this test must exercise the `token` path, not `rule` 20 | end 21 | end 22 | 23 | it "Errors on just a rule with an equals sign, but no body" do 24 | expect_raises(Pegasus::Error::GrammarException) do 25 | Pegasus::Language::LanguageDefinition.new %(rule S = ); 26 | end 27 | end 28 | 29 | it "Errors on just a token with an equals sign, but no body" do 30 | expect_raises(Pegasus::Error::GrammarException) do 31 | Pegasus::Language::LanguageDefinition.new %(token S = ); 32 | end 33 | end 34 |
35 | it "Errors on a token not ending in a semicolon, when another rule follows" do 36 | expect_raises(Pegasus::Error::GrammarException) do 37 | Pegasus::Language::LanguageDefinition.new %(token t = /t/\nrule expr = h;) 38 | end 39 | end 40 | 41 | it "Errors on a rule not ending in a semicolon, when another rule follows" do 42 | expect_raises(Pegasus::Error::GrammarException) do 43 | Pegasus::Language::LanguageDefinition.new %(rule S = expr\nrule expr = h;) 44 | end 45 | end 46 | 47 | it "Errors when a duplicate token is declared" do 48 | expect_raises(Pegasus::Error::GrammarException) do 49 | Pegasus::Language::LanguageDefinition.new %(token t = /t/; token t = /r/;) 50 | end 51 | end 52 | 53 | it "Errors when a rule is named the same as a token" do 54 | expect_raises(Pegasus::Error::GrammarException) do 55 | Pegasus::Language::LanguageDefinition.new %(token t = /t/; rule t = t;) 56 | end 57 | end 58 | 59 | it "Correctly handles options" do 60 | language = Pegasus::Language::LanguageDefinition.new %(token hello = /hello/ [ skip ];) 61 | language.tokens.size.should eq 1 62 | language.tokens["hello"]?.should eq Pegasus::Language::Token.new("hello", [ "skip" ]) 63 | end 64 | 65 | it "Correctly handles two rules with the same name" do 66 | language = Pegasus::Language::LanguageDefinition.new %(rule S = weird; rule S = not_weird;) 67 | language.tokens.size.should eq 0 68 | language.rules.size.should eq 1 69 | language.rules["S"]?.should eq [ rule(rule_alternative("weird")), rule(rule_alternative("not_weird")) ] 70 | end 71 | 72 | it "Correctly parses a single rule with a single terminal or nonterminal" do 73 | language = Pegasus::Language::LanguageDefinition.new %(rule S = h;) 74 | language.tokens.size.should eq 0 75 | language.rules.size.should eq 1 76 | language.rules["S"]?.should eq [ rule(rule_alternative("h")) ] 77 | end 78 | 79 | it "Correctly parses a single token declaration" do 80 | language = Pegasus::Language::LanguageDefinition.new %(token hello = /hello/;) 
81 | language.tokens.size.should eq 1 82 | language.tokens["hello"]?.should eq Pegasus::Language::Token.new("hello") 83 | language.rules.size.should eq 0 84 | end 85 | 86 | it "Correctly parses a single rule with more than one terminal or nonterminal" do 87 | language = Pegasus::Language::LanguageDefinition.new %(rule S = hello world;) 88 | language.tokens.size.should eq 0 89 | language.rules.size.should eq 1 90 | language.rules["S"]?.should eq [ rule(rule_alternative("hello", "world")) ] 91 | end 92 | 93 | it "Correctly parses a rule with multiple bodies" do 94 | language = Pegasus::Language::LanguageDefinition.new %(rule S = s | e;) 95 | language.tokens.size.should eq 0 96 | language.rules.size.should eq 1 97 | language.rules["S"]?.should eq [ rule(rule_alternative("s"), rule_alternative("e")) ] 98 | end 99 | 100 | # The following tests are run with both types of newlines (UNIX and DOS) 101 | # to make sure we still work on Windows. 102 | ["\n", "\r\n"].each do |nl| 103 | it "Correctly handles whitespace between the token / rule keyword and the identifier" do 104 | language = Pegasus::Language::LanguageDefinition.new %Q(token #{nl} t #{nl} = /t/;rule #{nl} S #{nl} = t;) 105 | language.tokens.size.should eq 1 106 | language.tokens["t"]?.should eq Pegasus::Language::Token.new("t") 107 | language.rules.size.should eq 1 108 | language.rules["S"]?.should eq [ rule(rule_alternative("t")) ] 109 | end 110 | 111 | it "Correctly handles whitespace around the equals sign" do 112 | language = Pegasus::Language::LanguageDefinition.new %Q(token t #{nl} = /t/;rule S #{nl} = #{nl}t;) 113 | language.tokens.size.should eq 1 114 | language.tokens["t"]?.should eq Pegasus::Language::Token.new("t") 115 | language.rules.size.should eq 1 116 | language.rules["S"]?.should eq [ rule(rule_alternative("t")) ] 117 | end 118 | 119 | it "Correctly handles whitespace around the semicolon" do 120 | language = Pegasus::Language::LanguageDefinition.new %Q(token t = /t/ #{nl} ; #{nl} rule S = t 
#{nl} ; #{nl}) 121 | language.tokens.size.should eq 1 122 | language.tokens["t"]?.should eq Pegasus::Language::Token.new("t") 123 | language.rules.size.should eq 1 124 | language.rules["S"]?.should eq [ rule(rule_alternative("t")) ] 125 | end 126 | 127 | it "Correctly handles whitespace between rule identifiers" do 128 | language = Pegasus::Language::LanguageDefinition.new %Q(rule S = hello #{nl} goodbye #{nl} | #{nl} world;) 129 | language.tokens.size.should eq 0 130 | language.rules.size.should eq 1 131 | language.rules["S"]?.should eq [ rule(rule_alternative("hello", "goodbye"), rule_alternative("world")) ] 132 | end 133 | 134 | it "Correctly parses two rules with one body each" do 135 | language = Pegasus::Language::LanguageDefinition.new %Q(rule S = h;#{nl}rule expr = e;) 136 | language.tokens.size.should eq 0 137 | language.rules.size.should eq 2 138 | language.rules["S"]?.should eq [ rule(rule_alternative("h")) ] 139 | language.rules["expr"]?.should eq [ rule(rule_alternative("e")) ] 140 | end 141 | end 142 | end 143 | end 144 | -------------------------------------------------------------------------------- /spec/nfa_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe Pegasus::Nfa::Nfa do 4 | describe "#initialize" do 5 | it "Creates a start state" do 6 | nfa = Pegasus::Nfa::Nfa.new 7 | nfa.@start.should_not be_nil 8 | end 9 | 10 | it "Doesn't create a final start state" do 11 | nfa = Pegasus::Nfa::Nfa.new 12 | nfa.@start.try(&.data).should be_nil 13 | end 14 | end 15 | 16 | describe "#dfa" do 17 | it "Creates an empty DFA with no final states when no patterns were added" do 18 | nfa = Pegasus::Nfa::Nfa.new 19 | dfa = nfa.dfa 20 | dfa.states.size.should eq 1 21 | dfa.states.each do |state| 22 | state.data.each do |nfa_state| 23 | nfa_state.data.should be_nil 24 | end 25 | end 26 | end 27 | 28 | it "Does not create negative states" do 29 | nfa = Pegasus::Nfa::Nfa.new 30 | 
nfa.add_regex "hello", 0_i64 31 | nfa.add_regex "goodbye", 1_i64 32 | dfa = nfa.dfa 33 | dfa.states.each do |state| 34 | state.id.should be >= 0 35 | end 36 | end 37 | 38 | it "Sets the start state of the new DFA" do 39 | nfa = Pegasus::Nfa::Nfa.new 40 | dfa = nfa.dfa 41 | dfa.start.should_not be_nil 42 | dfa.start.try(&.id).should eq 0_i64 43 | end 44 | 45 | it "Creates a basic two-state DFA for single-character patterns" do 46 | nfa = Pegasus::Nfa::Nfa.new 47 | nfa.add_regex "h", 0_i64 48 | dfa = nfa.dfa 49 | 50 | dfa.states.size.should eq 2 51 | dfa.states.each do |state| 52 | if state == dfa.start 53 | state.data.each &.data.should be_nil 54 | state.transitions.size.should eq 1 55 | next_state = state.transitions['h'.bytes.first]? 56 | next_state.should_not be_nil 57 | else 58 | state.pattern_id.should eq 1 59 | end 60 | end 61 | end 62 | 63 | it "Creates a DFA for an OR expression" do 64 | nfa = Pegasus::Nfa::Nfa.new 65 | nfa.add_regex "h|e", 0_i64 66 | dfa = nfa.dfa 67 | dfa.states.size.should eq 3 68 | dfa.states.each do |state| 69 | if state == dfa.start 70 | state.data.each &.data.should be_nil 71 | state.transitions.size.should eq 2 72 | h_state = state.transitions['h'.bytes.first]? 73 | h_state.should_not be_nil 74 | e_state = state.transitions['e'.bytes.first]? 75 | e_state.should_not be_nil 76 | else 77 | state.pattern_id.should eq 1 78 | end 79 | end 80 | end 81 | 82 | it "Creates a DFA for a + expression" do 83 | nfa = Pegasus::Nfa::Nfa.new 84 | nfa.add_regex "h+", 0_i64 85 | dfa = nfa.dfa 86 | dfa.states.size.should eq 2 87 | dfa.states.each do |state| 88 | if state == dfa.start 89 | state.data.each &.data.should be_nil 90 | state.transitions.size.should eq 1 91 | h_state = state.transitions['h'.bytes.first]? 
92 | h_state.should_not be_nil 93 | else 94 | state.pattern_id.should eq 1 95 | state.transitions.size.should eq 1 96 | state.transitions['h'.bytes.first]?.should eq state 97 | end 98 | end 99 | end 100 | 101 | it "Creates a DFA for a * expression" do 102 | nfa = Pegasus::Nfa::Nfa.new 103 | nfa.add_regex "h*", 0_i64 104 | dfa = nfa.dfa 105 | dfa.states.size.should eq 2 106 | dfa.states.each do |state| 107 | state.pattern_id.should eq 1 108 | state.transitions.size.should eq 1 109 | end 110 | end 111 | 112 | it "Creates a DFA for a ? expression" do 113 | nfa = Pegasus::Nfa::Nfa.new 114 | nfa.add_regex "h?", 0_i64 115 | dfa = nfa.dfa 116 | dfa.states.size.should eq 2 117 | dfa.states.each do |state| 118 | state.pattern_id.should eq 1 119 | if state == dfa.start 120 | next_state = state.transitions['h'.bytes.first]? 121 | next_state.should_not be_nil 122 | else 123 | state.transitions['h'.bytes.first]?.should be_nil 124 | end 125 | end 126 | end 127 | 128 | it "Creates a DFA for a range expression" do 129 | nfa = Pegasus::Nfa::Nfa.new 130 | nfa.add_regex "[helo0-9]", 0_i64 131 | dfa = nfa.dfa 132 | dfa.states.size.should eq 2 133 | dfa.states.each do |state| 134 | if state == dfa.start 135 | state.transitions.size.should eq 14 136 | else 137 | state.transitions.size.should eq 0 138 | state.pattern_id.should eq 1 139 | end 140 | end 141 | end 142 | end 143 | end 144 | 145 | describe Pegasus::Nfa::Transition do 146 | describe "#char_states" do 147 | it "Does not return any states" do 148 | transition = Pegasus::Nfa::Transition.new 149 | transition.char_states.size.should eq 0 150 | end 151 | end 152 | end 153 | 154 | describe Pegasus::Nfa::ByteTransition do 155 | describe "#char_states" do 156 | it "Only returns one byte" do 157 | transition = Pegasus::Nfa::ByteTransition.new 0_u8 158 | transition.char_states.should eq [ 0_u8 ] 159 | end 160 | end 161 | end 162 | 163 | describe Pegasus::Nfa::AnyTransition do 164 | describe "#char_states" do 165 | it "Returns the full 
unsigned byte range" do 166 | transition = Pegasus::Nfa::AnyTransition.new 167 | transition.char_states.should eq (0_u8..255_u8).to_a 168 | end 169 | end 170 | end 171 | 172 | describe Pegasus::Nfa::RangeTransition do 173 | describe "#char_states" do 174 | it "Returns the given ranges when not inverted" do 175 | transition = Pegasus::Nfa::RangeTransition.new ranges: [(0_u8..1_u8), (2_u8..3_u8)], 176 | inverted: false 177 | transition.char_states.sort.should eq [ 0_u8, 1_u8, 2_u8, 3_u8 ] 178 | end 179 | 180 | it "Returns the ranges not given when inverted" do 181 | transition = Pegasus::Nfa::RangeTransition.new ranges: [(0_u8..127_u8), (130_u8..255_u8)], 182 | inverted: true 183 | transition.char_states.sort.should eq [ 128_u8, 129_u8 ] 184 | end 185 | end 186 | end 187 | 188 | describe Pegasus::Nfa::StateChain do 189 | describe "#initialize" do 190 | it "Sets the final state to the start state if no final state is given" do 191 | state = Pegasus::Nfa::NState.new id: 0_i64, data: nil 192 | chain = Pegasus::Nfa::StateChain.new start: state 193 | chain.start.should eq state 194 | chain.final.should eq state 195 | end 196 | 197 | it "Adds a transition to its tail state when concatenated with another chain" do 198 | state_one = Pegasus::Nfa::NState.new id: 0i64, data: nil 199 | state_two = Pegasus::Nfa::NState.new id: 1i64, data: nil 200 | first_chain = Pegasus::Nfa::StateChain.new state_one, state_one 201 | second_chain = Pegasus::Nfa::StateChain.new state_two, state_two 202 | first_chain.append! 
second_chain 203 | first_chain.start.should eq state_one 204 | first_chain.final.should eq state_two 205 | first_chain.start.transitions.size.should eq 1 206 | first_chain.start.transitions.keys[0].should be_a Pegasus::Nfa::LambdaTransition 207 | first_chain.start.transitions.values[0].should be state_two 208 | end 209 | 210 | it "Doesn't do anything when a Nil is appended" do 211 | state_one = Pegasus::Nfa::NState.new id: 0i64, data: nil 212 | state_two = Pegasus::Nfa::NState.new id: 1i64, data: nil 213 | state_one.transitions[Pegasus::Nfa::LambdaTransition.new] = state_two 214 | first_chain = Pegasus::Nfa::StateChain.new state_one, state_two 215 | first_chain.append! nil 216 | first_chain.start.should eq state_one 217 | first_chain.final.should eq state_two 218 | first_chain.final.transitions.size.should eq 0 219 | end 220 | end 221 | end 222 | 223 | describe Pegasus::Nfa::Nfa do 224 | describe "#add_regex" do 225 | it "Correctly compiles one-character regular expression" do 226 | nfa = Pegasus::Nfa::Nfa.new 227 | nfa.add_regex "h", 0_i64 228 | (nfa.start.try(&.transitions.size) || 0).should eq 1 229 | nfa.states.size.should eq 4 230 | end 231 | 232 | it "Does not add negative states" do 233 | nfa = Pegasus::Nfa::Nfa.new 234 | nfa.add_regex "hello", 0_i64 235 | nfa.states.each do |state| 236 | state.id.should be >= 0 237 | end 238 | end 239 | 240 | it "Correctly compiles OR regular expression" do 241 | nfa = Pegasus::Nfa::Nfa.new 242 | nfa.add_regex "h|e", 0_i64 243 | nfa.start.not_nil!.transitions.size.should eq 1 244 | nfa.states.size.should eq 8 245 | or_branch_state = nfa.start.not_nil!.transitions.values[0] 246 | or_branch_state.transitions.size.should eq 2 247 | seen_h = false 248 | seen_e = false 249 | or_branch_state.transitions.map(&.[1]).each do |state| 250 | transition_byte = state.transitions.keys[0].as?(Pegasus::Nfa::ByteTransition).try(&.byte) 251 | seen_h |= transition_byte == 'h'.bytes.first 252 | seen_e |= transition_byte == 'e'.bytes.first 253 | 
end 254 | seen_h.should be_true 255 | seen_e.should be_true 256 | end 257 | 258 | it "Correctly compiles ? regular expression" do 259 | nfa = Pegasus::Nfa::Nfa.new 260 | nfa.add_regex "h?", 0_i64 261 | nfa.start.not_nil!.transitions.size.should eq 1 262 | nfa.states.size.should eq 6 263 | skip_from = nfa.start.not_nil!.straight_path(length: 1) 264 | skip_from.should_not be_nil 265 | skip_from = skip_from.not_nil! 266 | skip_from.transitions.size.should eq 2 267 | end 268 | 269 | it "Correctly compiles * regular expression" do 270 | nfa = Pegasus::Nfa::Nfa.new 271 | nfa.add_regex "h*", 0_i64 272 | nfa.start.not_nil!.transitions.size.should eq 1 273 | nfa.states.size.should eq 6 274 | skip_from = nfa.start.not_nil!.straight_path(length: 1) 275 | skip_from.should_not be_nil 276 | skip_from = skip_from.not_nil! 277 | skip_from.transitions.size.should eq 2 278 | end 279 | 280 | it "Correctly compiles + regular expression" do 281 | nfa = Pegasus::Nfa::Nfa.new 282 | nfa.add_regex "h+", 0_i64 283 | nfa.start.not_nil!.transitions.size.should eq 1 284 | nfa.states.size.should eq 6 285 | return_to = nfa.start.not_nil!.straight_path(length: 1) 286 | return_to.should_not be_nil 287 | return_to = return_to.not_nil! 288 | return_to.transitions.size.should eq 1 289 | 290 | return_from = return_to.straight_path(length: 3) 291 | return_from.should_not be_nil 292 | return_from = return_from.not_nil! 293 | return_from.transitions.size.should eq 2 294 | end 295 | 296 | it "Correctly compiles range expression" do 297 | nfa = Pegasus::Nfa::Nfa.new 298 | nfa.add_regex "[helo0-9]", 0_i64 299 | nfa.states.size.should eq 4 300 | contained = { 'h' => false, 'o' => false, '1' => false, '9' => false } 301 | range_transition_state = nfa.start.not_nil!.straight_path(length: 1) 302 | range_transition_state.should_not be_nil 303 | range_transition_state = range_transition_state.not_nil!
304 | range_transition_state.transitions.each do |transition, _| 305 | contained.each do |k, _| 306 | byte = k.bytes[0] 307 | if transition.as?(Pegasus::Nfa::RangeTransition).try &.ranges.one? &.includes? byte 308 | contained[k] = true 309 | end 310 | end 311 | end 312 | contained.values.all_should eq true 313 | end 314 | 315 | it "Does not compile incomplete escape codes" do 316 | nfa = Pegasus::Nfa::Nfa.new 317 | expect_raises(Pegasus::Error::NfaException) do 318 | nfa.add_regex "h\\", 1_i64 319 | end 320 | end 321 | 322 | it "Does not compile invalid escape codes" do 323 | nfa = Pegasus::Nfa::Nfa.new 324 | expect_raises(Pegasus::Error::NfaException) do 325 | nfa.add_regex "\\h", 1_i64 326 | end 327 | end 328 | 329 | it "Correctly compiles valid escape codes" do 330 | nfa = Pegasus::Nfa::Nfa.new 331 | specials = [ "\\\"", "\\'", "\\[", "\\]", "\\(", "\\)", "\\|", "\\?", "\\*", "\\+", "\\.", "\\n" ] 332 | 333 | specials.each_with_index do |special, index| 334 | nfa.add_regex special, index.to_i64 335 | end 336 | 337 | nfa.start.not_nil!.transitions.size.should eq specials.size 338 | transition_bytes = [] of UInt8 339 | nfa.start.not_nil!.transitions.values.each do |state| 340 | state.transitions.size.should eq 1 341 | state.transitions.keys[0].should be_a(Pegasus::Nfa::ByteTransition) 342 | transition_bytes << state.transitions.keys[0].as(Pegasus::Nfa::ByteTransition).byte 343 | end 344 | transition_bytes[0...transition_bytes.size - 1].should eq specials[0...specials.size - 1].map(&.[1].bytes.[0]) 345 | transition_bytes.last.should eq '\n'.bytes[0] 346 | end 347 | 348 | it "Combines several regular expressions" do 349 | nfa = Pegasus::Nfa::Nfa.new 350 | nfa.add_regex "h", 1_i64 351 | nfa.add_regex "e", 2_i64 352 | nfa.start.not_nil!.transitions.size.should eq 2 353 | end 354 | 355 | it "Does not compile invalid operators" do 356 | nfa = Pegasus::Nfa::Nfa.new 357 | expect_raises(Pegasus::Error::NfaException) do 358 | nfa.add_regex "+", 0_i64 359 | end 360 | 361 | 
expect_raises(Pegasus::Error::NfaException) do 362 | nfa.add_regex "h(+)", 0_i64 363 | end 364 | end 365 | 366 | it "Does not compile mismatched parentheses" do 367 | nfa = Pegasus::Nfa::Nfa.new 368 | expect_raises(Pegasus::Error::NfaException) do 369 | nfa.add_regex "(", 0_i64 370 | end 371 | 372 | expect_raises(Pegasus::Error::NfaException) do 373 | nfa.add_regex ")", 0_i64 374 | end 375 | end 376 | end 377 | end 378 | -------------------------------------------------------------------------------- /spec/pda_spec.cr: -------------------------------------------------------------------------------- 1 | require "./spec_utils.cr" 2 | 3 | describe Pegasus::Elements::TerminalId do 4 | describe "#==" do 5 | it "Compares equivalent terminals correctly" do 6 | terminal_one = Pegasus::Elements::TerminalId.new(0_i64) 7 | terminal_two = Pegasus::Elements::TerminalId.new(0_i64) 8 | terminal_one.should eq terminal_two 9 | end 10 | 11 | it "Compares different terminals correctly" do 12 | terminal_one = Pegasus::Elements::TerminalId.new(0_i64) 13 | terminal_two = Pegasus::Elements::TerminalId.new(1_i64) 14 | terminal_one.should_not eq terminal_two 15 | end 16 | end 17 | end 18 | 19 | describe Pegasus::Elements::NonterminalId do 20 | describe "#==" do 21 | it "Compares equivalent nonterminals correctly" do 22 | nonterminal_one = Pegasus::Elements::NonterminalId.new(0_i64) 23 | nonterminal_two = Pegasus::Elements::NonterminalId.new(0_i64) 24 | nonterminal_one.should eq nonterminal_two 25 | end 26 | 27 | it "Compares different nonterminals correctly" do 28 | nonterminal_one = Pegasus::Elements::NonterminalId.new(0_i64) 29 | nonterminal_two = Pegasus::Elements::NonterminalId.new(1_i64) 30 | nonterminal_one.should_not eq nonterminal_two 31 | end 32 | end 33 | end 34 | 35 | describe Pegasus::Pda::Grammar do 36 | describe "#initialize" do 37 | it "Doesn't add any items" do 38 | grammar = Pegasus::Pda::Grammar.new [] of Pegasus::Elements::TerminalId, 39 | [] of 
Pegasus::Elements::NonterminalId 40 | grammar.@items.size.should eq 0 41 | end 42 | end 43 | 44 | describe "#create_lr_pda" do 45 | it "Handles empty grammars" do 46 | grammar = Pegasus::Pda::Grammar.new [] of Pegasus::Elements::TerminalId, 47 | [] of Pegasus::Elements::NonterminalId 48 | pda = grammar.create_lr_pda 49 | pda.states.size.should eq 1 50 | pda.states.first.transitions.size.should eq 0 51 | pda.states.first.data.size.should eq 0 52 | end 53 | 54 | it "Handles grammars with one rule" do 55 | grammar = Pegasus::Pda::Grammar.new [ terminal 0 ], 56 | [ nonterminal 0, start: true ] 57 | grammar.add_item item head: nonterminal(0, start: true), 58 | body: body terminal(0) 59 | pda = grammar.create_lr_pda 60 | pda.states.size.should eq 2 # Start + with item shifted over 61 | 62 | start_state = pda.states.find(&.id.==(0)).not_nil! 63 | start_state.transitions.size.should eq 1 # To the shifted state 64 | start_state.data.size.should eq 1 # The one initial item 65 | end 66 | 67 | it "Handles grammars with epsilon-moves" do 68 | terminals = [ terminal 0 ] 69 | nonterminals = [ nonterminal(0, start: true), nonterminal(1) ] 70 | 71 | grammar = Pegasus::Pda::Grammar.new terminals, nonterminals 72 | grammar.add_item item head: nonterminals[0], 73 | body: body nonterminals[1] 74 | grammar.add_item item head: nonterminals[1], 75 | body: body terminals[0] 76 | 77 | pda = grammar.create_lr_pda 78 | pda.states.size.should eq 3 79 | 80 | start_state = pda.states.find(&.id.==(0)) 81 | start_state.should_not be_nil 82 | start_state = start_state.not_nil! 83 | start_state.transitions.size.should eq 2 84 | start_state.data.size.should eq 2 85 | 86 | reduce_terminal_state = start_state.transitions[terminals[0]]? 87 | reduce_terminal_state.should_not be_nil 88 | reduce_terminal_state = reduce_terminal_state.not_nil! 
89 | reduce_terminal_state.data.size.should eq 1 90 | reduce_terminal_state.data.first.index.should eq 1 91 | reduce_terminal_state.data.first.item.head.should eq nonterminals[1] 92 | reduce_terminal_state.data.first.item.body[0].should eq terminals[0] 93 | 94 | reduce_terminal_state = start_state.transitions[nonterminals[1]]? 95 | reduce_terminal_state.should_not be_nil 96 | reduce_terminal_state = reduce_terminal_state.not_nil! 97 | reduce_terminal_state.data.size.should eq 1 98 | reduce_terminal_state.data.first.index.should eq 1 99 | reduce_terminal_state.data.first.item.head.should eq nonterminals[0] 100 | reduce_terminal_state.data.first.item.body[0].should eq nonterminals[1] 101 | end 102 | end 103 | 104 | describe "#create_lalr_pda" do 105 | it "Meges states with duplicate bodies" do 106 | # This grammar is taken from grammars/modern_compiler_design.grammar 107 | t_x = terminal(0) 108 | t_b = terminal(1) 109 | t_a = terminal(2) 110 | terminals = [ t_x, t_b, t_a ] 111 | 112 | s = nonterminal 0, start: true 113 | a = nonterminal 1 114 | b = nonterminal 2 115 | nonterminals = [ s, a, b ] 116 | 117 | grammar = Pegasus::Pda::Grammar.new terminals, nonterminals 118 | grammar.add_item item head: s, 119 | body: body a 120 | grammar.add_item item head: s, 121 | body: body t_x, t_b 122 | grammar.add_item item head: a, 123 | body: body t_a, a, t_b 124 | grammar.add_item item head: a, 125 | body: body b 126 | grammar.add_item item head: b, 127 | body: body t_x 128 | 129 | lr_pda = grammar.create_lr_pda 130 | lalr_pda = grammar.create_lalr_pda lr_pda 131 | lr_pda.states.size.should eq 13 132 | lalr_pda.states.size.should eq 9 133 | end 134 | end 135 | end 136 | 137 | describe Pegasus::Pda::DottedItem do 138 | describe "#next_item!" 
do 139 | it "Advances the index when possible" do 140 | new_item = item head: nonterminal(0, start: true), 141 | body: body terminal(0), terminal(0) 142 | dotted_item = Pegasus::Pda::DottedItem.new new_item, index: 0_i64 143 | dotted_item.next_item! 144 | dotted_item.index.should eq 1 145 | dotted_item.next_item! 146 | dotted_item.index.should eq 2 147 | end 148 | 149 | it "Raises when already at the end" do 150 | new_item = item head: nonterminal(0, start: true), 151 | body: body terminal(0), terminal(0) 152 | dotted_item = Pegasus::Pda::DottedItem.new new_item, index: 2_i64 153 | expect_raises(Pegasus::Error::PdaException) do 154 | dotted_item.next_item! 155 | end 156 | end 157 | end 158 | 159 | describe "#done?" do 160 | it "Returns false when dot is not past the last element" do 161 | new_item = item head: nonterminal(0, start: true), 162 | body: body terminal(0), terminal(0) 163 | dotted_item = Pegasus::Pda::DottedItem.new new_item, index: 0_i64 164 | dotted_item.done?.should be_false 165 | end 166 | 167 | it "Returns true when dot is just after the last element" do 168 | new_item = item head: nonterminal(0, start: true), 169 | body: body terminal(0), terminal(0) 170 | dotted_item = Pegasus::Pda::DottedItem.new new_item, index: 2_i64 171 | dotted_item.done?.should be_true 172 | end 173 | end 174 | end 175 | 176 | describe Pegasus::Pda::Pda do 177 | describe "#action_table" do 178 | it "Creates no actions for the error state" do 179 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0) 180 | new_table = new_pda.action_table 181 | new_table[0].each &.should eq -1 182 | end 183 | 184 | it "Creates a shift and a reduce action for a single nonterminal to terminal item" do 185 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0) 186 | new_table = new_pda.action_table 187 | new_table[1][1].should eq 0 188 | new_table[1][0].should eq -1 189 | new_table[2][0].should eq 1 190 | new_table[2][1].should eq -1 191 | end 192 
| 193 | it "Creates two shift and two reduce actions for a start state with two productions" do 194 | new_pda = pda item(head: nonterminal(0, start: true), body: body terminal(0)), 195 | item(head: nonterminal(0, start: true), body: body terminal(1)) 196 | new_table = new_pda.action_table 197 | new_table[1][0].should eq -1 198 | new_table[1][1].should eq 0 199 | new_table[1][2].should eq 0 200 | new_table[2][0].should eq 1 201 | new_table[2][1].should eq -1 202 | new_table[2][2].should eq -1 203 | new_table[3][0].should eq 2 204 | new_table[3][1].should eq -1 205 | new_table[3][2].should eq -1 206 | end 207 | 208 | it "Correctly reports a reduce reduce conflict" do 209 | new_pda = pda item(head: nonterminal(0, start: true), body: body nonterminal(1)), 210 | item(head: nonterminal(0, start: true), body: body nonterminal(2)), 211 | item(head: nonterminal(1), body: body terminal(0)), 212 | item(head: nonterminal(2), body: body terminal(0)) 213 | expect_raises(Pegasus::Error::TableException) do 214 | new_pda.action_table 215 | end 216 | end 217 | 218 | it "Correctly reports a shift/reduce conflict" do 219 | new_pda = pda item(head: nonterminal(0, start: true), body: body nonterminal(1), terminal(1)), 220 | item(head: nonterminal(0, start: true), body: body nonterminal(2)), 221 | item(head: nonterminal(1), body: body terminal(0)), 222 | item(head: nonterminal(2), body: body terminal(0), terminal(1)) 223 | expect_raises(Pegasus::Error::TableException) do 224 | new_pda.action_table 225 | end 226 | end 227 | end 228 | 229 | describe "#state_table" do 230 | it "Does not allow transitions out of the error state" do 231 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0) 232 | new_table = new_pda.state_table 233 | new_table[0].each &.should eq 0 234 | end 235 | 236 | it "Creates transitions for terminals" do 237 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0) 238 | new_table = new_pda.state_table 239 | 
new_table[1][0].should eq 0 240 | new_table[1][1].should eq 2 241 | new_table[1][2].should eq 0 242 | new_table[2].each &.should eq 0 243 | end 244 | 245 | it "Creates transitions for nonterminals" do 246 | new_pda = pda item(head: nonterminal(0, start: true), body: body nonterminal(1)), 247 | item(head: nonterminal(1), body: body terminal(0)) 248 | new_table = new_pda.state_table 249 | new_table[1][0].should eq 0 250 | new_table[1][1].should_not eq 0 251 | new_table[1][2].should eq 0 252 | new_table[1][3].should_not eq 0 253 | new_table[1][1].should_not eq new_table[1][3] 254 | new_table[2].each &.should eq 0 255 | new_table[3].each &.should eq 0 256 | end 257 | 258 | it "Creates transitions for sequences of elements" do 259 | new_pda = pda item head: nonterminal(0, start: true), body: body terminal(0), terminal(1) 260 | new_table = new_pda.state_table 261 | new_table[1].all_should eq(0), except(1, should: eq 2) 262 | new_table[2].all_should eq(0), except(2, should: eq 3) 263 | new_table[3].all_should eq(0) 264 | end 265 | end 266 | end 267 | -------------------------------------------------------------------------------- /spec/spec_helper.cr: -------------------------------------------------------------------------------- 1 | require "spec" 2 | require "../src/pegasus/**" 3 | -------------------------------------------------------------------------------- /spec/spec_utils.cr: -------------------------------------------------------------------------------- 1 | require "./spec_helper" 2 | 3 | def rule_alternative(*args) 4 | elements = [] of Pegasus::Language::RuleElement 5 | args.each do |arg| 6 | value = case arg 7 | when String 8 | Pegasus::Language::RuleElement.new arg 9 | end 10 | elements << value if value 11 | end 12 | 13 | return Pegasus::Language::RuleAlternative.new elements 14 | end 15 | 16 | def rule(*alternatives) 17 | return Pegasus::Language::Rule.new alternatives.to_a 18 | end 19 | 20 | def nonterminal(id, start = false) 21 | 
# Builds a LALR PDA for the given items, inferring the grammar's
# terminal and nonterminal alphabets from the item heads and bodies.
def pda(*items)
  terminal_set = Set(Pegasus::Elements::TerminalId).new
  nonterminal_set = Set(Pegasus::Elements::NonterminalId).new

  # Every head is a nonterminal; body elements are sorted by type.
  items.to_a.each do |grammar_item|
    nonterminal_set << grammar_item.head
    grammar_item.body.each do |element|
      case element
      when Pegasus::Elements::TerminalId
        terminal_set << element
      when Pegasus::Elements::NonterminalId
        nonterminal_set << element
      end
    end
  end

  grammar = Pegasus::Pda::Grammar.new terminals: terminal_set.to_a,
                                      nonterminals: nonterminal_set.to_a
  items.to_a.each do |grammar_item|
    grammar.add_item grammar_item
  end

  # The LALR automaton is produced by collapsing the LR automaton.
  return grammar.create_lalr_pda grammar.create_lr_pda
end
93 | 94 | def initialize(@index, @should = nil, @should_not = nil) 95 | end 96 | end 97 | 98 | class Array(T) 99 | def all_should(should, *exceptions) 100 | each_with_index do |item, index| 101 | is_exception = false 102 | exceptions.each do |exception| 103 | if exception.index == index 104 | if should_rule = exception.should 105 | item.should should_rule 106 | end 107 | if should_not_rule = exception.should_not 108 | item.should_not should_not_rule 109 | end 110 | is_exception = true 111 | end 112 | end 113 | item.should should unless is_exception 114 | end 115 | end 116 | end 117 | 118 | def except(index : Int32, should : T? = nil, should_not : R? = nil) forall T, R 119 | ExceptionRule(T, R).new index, should, should_not 120 | end 121 | -------------------------------------------------------------------------------- /src/generators/c-common/standard_header.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** 5 | * Converts a nonterminal value to a string. 6 | * @param nt the nonterminal ID. 7 | * @return the name for the nonterminal. 8 | */ 9 | const char* pgs_nonterminal_name(long int nt); 10 | 11 | /* == Generated Data Definitions == */ 12 | /** 13 | * A grammar item. A lot of the information collected by the parser 14 | * generate is not carried into the source code, which leaves items 15 | * as simply a nonterminal ID and the size of the right hand side. 16 | */ 17 | struct pgs_item_s { 18 | /** The nonterminal that this item is reduced to. */ 19 | long int left_id; 20 | /** 21 | * The size of the item body, used to pop off 22 | * the correct number of states from the stack. 23 | */ 24 | size_t right_count; 25 | }; 26 | 27 | typedef struct pgs_item_s pgs_item; 28 | 29 | /* == General Definitions == */ 30 | #define PGS_MAX_ERROR_LENGTH 255 31 | 32 | /** 33 | * The types of errors that can occur while the 34 | * entire parsing process. 35 | */ 36 | enum pgs_error_e { 37 | /** No error occured. 
*/ 38 | PGS_NONE = 0, 39 | /** An allocation failed. */ 40 | PGS_MALLOC, 41 | /** A token couldn't be recognized. */ 42 | PGS_BAD_CHARACTER, 43 | /** A tree couldn't be recognized. */ 44 | PGS_BAD_TOKEN, 45 | /** End of file reached where it was not expected */ 46 | PGS_EOF_SHIFT 47 | }; 48 | 49 | /** 50 | * State used to report errors and their corresponding 51 | * messages. 52 | */ 53 | struct pgs_state_s { 54 | /** The error code. */ 55 | enum pgs_error_e error; 56 | /** The error message. */ 57 | char errbuff[PGS_MAX_ERROR_LENGTH]; 58 | }; 59 | 60 | typedef enum pgs_error_e pgs_error; 61 | typedef struct pgs_state_s pgs_state; 62 | 63 | /** 64 | * Initializes a state with no error. 65 | * @param s the state to initialize. 66 | */ 67 | void pgs_state_init(pgs_state* s); 68 | /** 69 | * Sets the state to have an error. 70 | * @param s the state to initialize. 71 | * @param err the error message to return. 72 | */ 73 | void pgs_state_error(pgs_state* s, pgs_error err, const char* message); 74 | 75 | /* == Lexing Definitions ==*/ 76 | /** 77 | * A token produced by lexing. 78 | */ 79 | struct pgs_token_s { 80 | /** The ID of the terminal. */ 81 | long int terminal; 82 | /** The index at which the token starts. */ 83 | size_t from; 84 | /** The index at which the next token begins. */ 85 | size_t to; 86 | }; 87 | 88 | /** 89 | * A dynamic list of tokens produced while lexing. 90 | */ 91 | struct pgs_token_list_s { 92 | /** The size of the currently allocated block of tokens */ 93 | size_t capacity; 94 | /** The number of tokens in the list. */ 95 | size_t token_count; 96 | /** The token data array. */ 97 | struct pgs_token_s* tokens; 98 | }; 99 | 100 | typedef struct pgs_token_s pgs_token; 101 | typedef struct pgs_token_list_s pgs_token_list; 102 | 103 | /** 104 | * Initializes a token list. 105 | * @param l the list to initialize. 106 | * @return any errors that occured while initializing the list. 
107 | */ 108 | pgs_error pgs_token_list_init(pgs_token_list* l); 109 | /** 110 | * Appends a token to the list. 111 | * @param terminal the ID of the terminal to append. 112 | * @param from the index at which the token begins. 113 | * @param to the index at which the next token begins. 114 | */ 115 | pgs_error pgs_token_list_append(pgs_token_list* l, long int terminal, size_t from, size_t to); 116 | /** 117 | * Returns a token at the given index. 118 | * @param l the list to return a token from. 119 | * @param i the index from which to return a token. 120 | * @return a token, or NULL if the index is out of bounds. 121 | */ 122 | pgs_token* pgs_token_list_at(pgs_token_list* l, size_t i); 123 | /** 124 | * Returns a token ID at the given index. 125 | * @param l the list to return an ID from. 126 | * @param i the index from which to return an ID. 127 | * @return returns an ID, or 0, which represents EOF. 128 | */ 129 | long int pgs_token_list_at_id(pgs_token_list* l, size_t i ); 130 | /** 131 | * Frees a list of tokens. Since the tokens are owned by the list, 132 | * they are invalidated after this call too. 133 | * @param l the list to free. 134 | */ 135 | void pgs_token_list_free(pgs_token_list* l); 136 | /** 137 | * Performs a lexing operation. 138 | * @param s the state to populate with error text, if necessary. 139 | * @param list the list of tokens to initialize and populate. 140 | * @param source the string to lex. 141 | * @return the error, if any, that occured during this process. 
142 | */ 143 | pgs_error pgs_do_lex(pgs_state* s, pgs_token_list* list, const char* source); 144 | 145 | -------------------------------------------------------------------------------- /src/generators/c-common/standard_source.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* == General Code == */ 5 | 6 | void pgs_state_init(pgs_state* s) { 7 | s->error = PGS_NONE; 8 | s->errbuff[0] = '\0'; 9 | } 10 | 11 | void pgs_state_error(pgs_state* s, pgs_error e, const char* message) { 12 | s->error = e; 13 | strncpy(s->errbuff, message, PGS_MAX_ERROR_LENGTH); 14 | } 15 | 16 | /* == Lexing Code == */ 17 | 18 | pgs_error pgs_token_list_init(pgs_token_list* l) { 19 | l->capacity = 8; 20 | l->token_count = 0; 21 | l->tokens = (pgs_token*) malloc(sizeof(*(l->tokens)) * l->capacity); 22 | 23 | if(l->tokens == NULL) return PGS_MALLOC; 24 | return PGS_NONE; 25 | } 26 | 27 | pgs_error pgs_token_list_append(pgs_token_list* l, long int terminal, size_t from, size_t to) { 28 | if(l->capacity == l->token_count) { 29 | pgs_token* new_tokens = 30 | (pgs_token*) realloc(l->tokens, sizeof(*new_tokens) * l->capacity * 2); 31 | if(new_tokens == NULL) return PGS_MALLOC; 32 | l->capacity *= 2; 33 | l->tokens = new_tokens; 34 | } 35 | 36 | l->tokens[l->token_count].terminal = terminal; 37 | l->tokens[l->token_count].from = from; 38 | l->tokens[l->token_count].to = to + 1; 39 | l->token_count++; 40 | 41 | return PGS_NONE; 42 | } 43 | 44 | pgs_token* pgs_token_list_at(pgs_token_list* l, size_t i) { 45 | return (i < l->token_count) ? 
&l->tokens[i] : NULL; 46 | } 47 | 48 | long int pgs_token_list_at_id(pgs_token_list* l, size_t i) { 49 | if(i < l->token_count) return l->tokens[i].terminal; 50 | return 0; 51 | } 52 | 53 | void pgs_token_list_free(pgs_token_list* l) { 54 | free(l->tokens); 55 | } 56 | 57 | pgs_error pgs_do_lex(pgs_state* s, pgs_token_list* list, const char* source) { 58 | pgs_error error; 59 | size_t index = 0; 60 | long int final; 61 | long int last_final; 62 | long int last_final_index; 63 | long int last_start; 64 | long int state; 65 | size_t length = strlen(source); 66 | 67 | if((error = pgs_token_list_init(list))) return error; 68 | while(!error && index < length) { 69 | last_final = -1; 70 | last_final_index = -1; 71 | last_start = index; 72 | state = 1; 73 | 74 | while(index < length && state) { 75 | state = lexer_state_table[state][(unsigned int) source[index]]; 76 | 77 | if((final = lexer_final_table[state])) { 78 | last_final = final; 79 | last_final_index = index; 80 | } 81 | 82 | if(state) index++; 83 | } 84 | 85 | if(last_final == -1) break; 86 | if(lexer_skip_table[last_final]) continue; 87 | error = pgs_token_list_append(list, last_final, last_start, last_final_index); 88 | } 89 | 90 | if(error == PGS_MALLOC) { 91 | pgs_token_list_free(list); 92 | } else if (index != length) { 93 | pgs_state_error(s, PGS_BAD_CHARACTER, "Invalid character at position"); 94 | pgs_token_list_free(list); 95 | return PGS_BAD_CHARACTER; 96 | } 97 | 98 | return PGS_NONE; 99 | } 100 | -------------------------------------------------------------------------------- /src/generators/c-common/tables.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "ecr" 3 | 4 | module Pegasus::Generators 5 | class CTableGen 6 | def initialize(@language : Pegasus::Language::LanguageData) 7 | end 8 | 9 | ECR.def_to_s "src/generators/c-common/tables.ecr" 10 | end 11 | end 12 | 
-------------------------------------------------------------------------------- /src/generators/c-common/tables.ecr: -------------------------------------------------------------------------------- 1 | /* == Nonterminals to String == */ 2 | 3 | const char* pgs_nonterminal_name(long int nt) { 4 | switch(nt) { 5 | <%- @language.nonterminals.each do |name, value| -%> 6 | case PGS_NONTERMINAL_<%= name.underscore.upcase %>: return <%= name.dump %>; 7 | <%- end -%> 8 | default: return ""; 9 | } 10 | } 11 | 12 | /* == Generated Tables and Variables== */ 13 | 14 | #define PGS_MAX_TERMINAL <%= @language.max_terminal %> 15 | int lexer_skip_table[<%= @language.lex_skip_table.size %>] = { 16 | <% @language.lex_skip_table.each do |skip| %><%= skip ? 1 : 0 %>, <% end %> 17 | }; 18 | long int lexer_state_table[<%= @language.lex_state_table.size %>][<%= @language.lex_state_table[0].size %>] = { 19 | <% @language.lex_state_table.each do |state| -%> 20 | { <% state.each do |transition| %><%= transition %>, <% end %> }, 21 | <% end -%> 22 | }; 23 | long int lexer_final_table[<%= @language.lex_final_table.size %>] = { 24 | <% @language.lex_final_table.each do |final| %><%= final %>,<% end %> 25 | }; 26 | long int parse_state_table[<%= @language.parse_state_table.size %>][<%= @language.parse_state_table[0].size %>]= { 27 | <% @language.parse_state_table.each do |state| -%> 28 | { <% state.each do |transition| %><%= transition %>, <% end %> }, 29 | <% end -%> 30 | }; 31 | long int parse_action_table[<%= @language.parse_action_table.size %>][<%= @language.parse_action_table[0].size %>] = { 32 | <% @language.parse_action_table.each do |state| -%> 33 | { <% state.each do |action| %><%= action %>, <% end %> }, 34 | <% end -%> 35 | }; 36 | int parse_final_table[<%= @language.parse_final_table.size %>] = { 37 | <% @language.parse_final_table.each do |skip| -%> 38 | <%= skip ? 
1 : 0 %>, 39 | <% end -%> 40 | }; 41 | pgs_item items[<%= @language.items.size %>] = { 42 | <% @language.items.each do |item| -%> 43 | { <%= item.head.raw_id %>, <%= item.body.size %> }, 44 | <% end -%> 45 | }; 46 | -------------------------------------------------------------------------------- /src/generators/c/pegasus_c.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "../../pegasus/json.cr" 3 | require "../c-common/tables.cr" 4 | require "../generators.cr" 5 | require "option_parser" 6 | require "ecr" 7 | 8 | module Pegasus::Generators::C 9 | include Pegasus::Language 10 | include Pegasus::Generators::Api 11 | 12 | class CContext 13 | def add_option(opt_parser) 14 | end 15 | end 16 | 17 | class LanguageInput < StdInput(LanguageData) 18 | def process(opt_parser) : LanguageData 19 | LanguageData.from_json STDIN 20 | end 21 | end 22 | 23 | class HeaderGenerator < FileGenerator(CContext, LanguageData) 24 | def initialize(parent) 25 | super parent, "header", "parser.h", "the parser header file" 26 | end 27 | 28 | def to_s(io) 29 | ECR.embed "src/generators/c/pegasus_c_header_template.ecr", io 30 | end 31 | end 32 | 33 | class SourceGenerator < FileGenerator(CContext, LanguageData) 34 | def initialize(parent) 35 | super parent, "code", "parser.c", "the parser source code file" 36 | end 37 | 38 | def to_s(io) 39 | io << "#include \"#{@parent.output_file_names["header"]}\"\n" 40 | ECR.embed "src/generators/c/pegasus_c_template.ecr", io 41 | end 42 | end 43 | end 44 | 45 | include Pegasus::Generators::C 46 | 47 | parser = PegasusOptionParser(CContext, LanguageData).new LanguageInput.new 48 | HeaderGenerator.new(parser) 49 | SourceGenerator.new(parser) 50 | parser.run 51 | -------------------------------------------------------------------------------- /src/generators/c/pegasus_c_header_template.ecr: -------------------------------------------------------------------------------- 
1 | <%= {{ read_file "src/generators/c-common/standard_header.h" }} %> 2 | 3 | /* == Nonterminal ID Definitions == */ 4 | <% input!.nonterminals.each do |name, value| -%> 5 | #define PGS_NONTERMINAL_<%= name.underscore.upcase %> <%= value.raw_id %> 6 | <% end -%> 7 | 8 | <%= {{ read_file "src/generators/c/tree_header.h" }} %> 9 | -------------------------------------------------------------------------------- /src/generators/c/pegasus_c_template.ecr: -------------------------------------------------------------------------------- 1 | <%= Pegasus::Generators::CTableGen.new(input!).to_s %> 2 | 3 | <%= {{ read_file "src/generators/c-common/standard_source.c" }} %> 4 | 5 | <%= {{ read_file "src/generators/c/tree_source.c" }} %> 6 | -------------------------------------------------------------------------------- /src/generators/c/tree_header.h: -------------------------------------------------------------------------------- 1 | #define PGS_TREE_T(tree) ((tree).tree_data.terminal.token.terminal) 2 | #define PGS_TREE_T_FROM(tree) ((tree).tree_data.terminal.token.from) 3 | #define PGS_TREE_T_TO(tree) ((tree).tree_data.terminal.token.to) 4 | #define PGS_TREE_NT(tree) ((tree).tree_data.nonterminal.nonterminal) 5 | #define PGS_TREE_NT_COUNT(tree) ((tree).tree_data.nonterminal.child_count) 6 | #define PGS_TREE_NT_CHILD(tree, n) ((tree).tree_data.nonterminal.children[n]) 7 | #define PGS_TREE_IS_NT(tree, type) (((tree).variant == PGS_TREE_NONTERMINAL) && (PGS_TREE_NT(tree) == (type))) 8 | 9 | /* == Parsing Definitions == */ 10 | /** 11 | * Enum that represents the variant of a parse tree, 12 | * which is either a nonterminal with chilren, or a 13 | * terminal with a token. 14 | */ 15 | enum pgs_tree_variant_e { 16 | PGS_TREE_TERMINAL, 17 | PGS_TREE_NONTERMINAL 18 | }; 19 | 20 | /** 21 | * The data of a terminal tree. 22 | */ 23 | struct pgs_tree_terminal_s { 24 | /** The token this tree holds. */ 25 | pgs_token token; 26 | }; 27 | 28 | /** 29 | * The data of a nonterminal tree. 
30 | */ 31 | struct pgs_tree_nonterminal_s { 32 | /** 33 | * The nonterminal ID. 34 | */ 35 | long int nonterminal; 36 | /** 37 | * The number of children this tree has. 38 | */ 39 | size_t child_count; 40 | /** 41 | * The array of child pointers, allocated dynamically 42 | * depending on the item that reduced to this nonterminal. 43 | */ 44 | struct pgs_tree_s** children; 45 | }; 46 | 47 | /** 48 | * A general struct for a tree, which is either a terminal 49 | * or a nonterminal. 50 | */ 51 | struct pgs_tree_s { 52 | /** The variant of the tree. */ 53 | enum pgs_tree_variant_e variant; 54 | union { 55 | /** The terminal variant of this tree. */ 56 | struct pgs_tree_terminal_s terminal; 57 | /** The nonterminal variant of this tree. */ 58 | struct pgs_tree_nonterminal_s nonterminal; 59 | } tree_data; 60 | }; 61 | 62 | /** 63 | * An element on the parse stack, which holds 64 | * both a tree node and a state. In theory, 65 | * the stack is actually items followed by states, 66 | * but since one always comes after the other, 67 | * and since both need to be looked up fast, 68 | * we put them on a stack in parallel. 69 | */ 70 | struct pgs_parse_stack_element_s { 71 | /** The tree on the stack */ 72 | struct pgs_tree_s* tree; 73 | /** The state on the stack */ 74 | long int state; 75 | }; 76 | 77 | /** 78 | * A parse stack. The PDA automaton 79 | * has to maintain this stack, where it gradually 80 | * assembles a tree. 81 | */ 82 | struct pgs_parse_stack_s { 83 | /** The number of stack elements currently allocated. */ 84 | size_t capacity; 85 | /** The current number of stack elements. */ 86 | size_t size; 87 | /** The stack element array. 
typedef enum pgs_tree_variant_e pgs_tree_variant;
typedef struct pgs_tree_terminal_s pgs_tree_terminal;
/* Bug fix: the tag was misspelled "pgs_tree_nontermnal_s", which silently
 * declared a brand-new incomplete struct type instead of aliasing the
 * pgs_tree_nonterminal_s defined above. */
typedef struct pgs_tree_nonterminal_s pgs_tree_nonterminal;
typedef struct pgs_tree_s pgs_tree;
typedef struct pgs_parse_stack_element_s pgs_parse_stack_element;
typedef struct pgs_parse_stack_s pgs_parse_stack;
138 | */ 139 | pgs_error pgs_parse_stack_init(pgs_parse_stack* s); 140 | /** 141 | * Appends (pushes) a new tree and state to the stack. 142 | * @param s the stack to append to. 143 | * @param tree the tree to append. 144 | * @param state the state to append. 145 | * @return the result of the append. 146 | */ 147 | pgs_error pgs_parse_stack_append(pgs_parse_stack* s, pgs_tree* tree, long int state); 148 | /** 149 | * Appends a given token to the stack, by initializing a new parse tree noe. 150 | * @param s the stack to append to. 151 | * @param t the token for which to construct a tree and compute a new state. 152 | * @return the result of the append. 153 | */ 154 | pgs_error pgs_parse_stack_append_terminal(pgs_parse_stack* s, pgs_token* t); 155 | /** 156 | * Appends a given item to the stack, by popping the correct number of items 157 | * and creating a new nonterminal tree node in their place. A new state is also 158 | * computed from the nonterminal ID. 159 | * @param s the stack to append to. 160 | * @param id the nonterminal ID to create. 161 | * @param count the number of children to pop. 162 | * @return the result of the append. 163 | */ 164 | pgs_error pgs_parse_stack_append_nonterminal(pgs_parse_stack* s, long int id, size_t count); 165 | /** 166 | * Gets the state on the top of the stack. 167 | * @param s the stack for which to get a state. 168 | * @return the state on the top of the stack. 169 | */ 170 | long int pgs_parse_stack_top_state(pgs_parse_stack* s); 171 | /** 172 | * Gets the tree on the top of the stack. 173 | * @param s the stack for which to get a tree. 174 | * @return the tree on the top of the stack. 175 | */ 176 | pgs_tree* pgs_parse_stack_top_tree(pgs_parse_stack* s); 177 | /** 178 | * Frees a parse stack, also freeing all the trees. 179 | * @param s the stack to free. 180 | */ 181 | void pgs_parse_stack_free(pgs_parse_stack* s); 182 | /** 183 | * Takes the given tokens, and attempts to convert them into a parse tree. 
184 | * @param s the state used for storing errors. 185 | * @param list the list of tokens, already filled. 186 | * @param into the tree pointer pointer into which a new tree will be stored. 187 | * @return the error, if any, that occured. 188 | */ 189 | pgs_error pgs_do_parse(pgs_state* s, pgs_token_list* list, pgs_tree** into); 190 | 191 | /* == Glue == */ 192 | /** 193 | * Attempts to parse tokens from the given string into the given tree. 194 | * @param state the state to initialize with error information, if necessary. 195 | * @param into the tree to build into. 196 | * @param string the string from which to read. 197 | * @return the error, if any, that occured. 198 | */ 199 | pgs_error pgs_do_all(pgs_state* state, pgs_tree** into, const char* string); 200 | -------------------------------------------------------------------------------- /src/generators/c/tree_source.c: -------------------------------------------------------------------------------- 1 | /* == Parsing Code == */ 2 | 3 | pgs_tree* pgs_create_tree_nonterminal(long int nonterminal, size_t child_count) { 4 | pgs_tree* tree = (pgs_tree*) malloc(sizeof(*tree)); 5 | pgs_tree** children = (pgs_tree**) malloc(sizeof(*children) * child_count); 6 | 7 | if(tree == NULL || children == NULL) { 8 | free(tree); 9 | return NULL; 10 | } 11 | 12 | tree->variant = PGS_TREE_NONTERMINAL; 13 | tree->tree_data.nonterminal.nonterminal = nonterminal; 14 | tree->tree_data.nonterminal.child_count = child_count; 15 | tree->tree_data.nonterminal.children = children; 16 | 17 | return tree; 18 | } 19 | 20 | pgs_tree* pgs_create_tree_terminal(pgs_token* t) { 21 | pgs_tree* tree = (pgs_tree*) malloc(sizeof(*tree)); 22 | if(tree == NULL) return NULL; 23 | 24 | tree->variant = PGS_TREE_TERMINAL; 25 | tree->tree_data.terminal.token = *t; 26 | 27 | return tree; 28 | } 29 | 30 | void pgs_free_tree_nonterminal(pgs_tree* t) { 31 | size_t i; 32 | for(i = 0; i < t->tree_data.nonterminal.child_count; i++) { 33 | 
pgs_free_tree(t->tree_data.nonterminal.children[i]); 34 | } 35 | free(t->tree_data.nonterminal.children); 36 | free(t); 37 | } 38 | 39 | void pgs_free_tree_terminal(pgs_tree* t) { 40 | free(t); 41 | } 42 | 43 | long int pgs_tree_table_index(pgs_tree* t) { 44 | switch(t->variant) { 45 | case PGS_TREE_TERMINAL: 46 | return PGS_TREE_T(*t); 47 | case PGS_TREE_NONTERMINAL: 48 | return PGS_TREE_NT(*t) + 2 + PGS_MAX_TERMINAL; 49 | } 50 | } 51 | 52 | void pgs_free_tree(pgs_tree* t) { 53 | switch(t->variant) { 54 | case PGS_TREE_TERMINAL: pgs_free_tree_terminal(t); break; 55 | case PGS_TREE_NONTERMINAL: pgs_free_tree_nonterminal(t); break; 56 | } 57 | } 58 | 59 | pgs_error pgs_parse_stack_init(pgs_parse_stack* s) { 60 | s->capacity = 8; 61 | s->size = 1; 62 | s->data = (pgs_parse_stack_element*) malloc(sizeof(*(s->data)) * s->capacity); 63 | 64 | if(s->data == NULL) return PGS_MALLOC; 65 | s->data[0].tree = NULL; 66 | s->data[0].state = 1; 67 | 68 | return PGS_NONE; 69 | } 70 | 71 | pgs_error pgs_parse_stack_append(pgs_parse_stack* s, pgs_tree* tree, long int state) { 72 | if(s->capacity == s->size) { 73 | pgs_parse_stack_element* new_elements = 74 | (pgs_parse_stack_element*) realloc( 75 | s->data, sizeof(*new_elements) * s->capacity * 2); 76 | if(new_elements == NULL) return PGS_MALLOC; 77 | s->capacity *= 2; 78 | s->data = new_elements; 79 | } 80 | 81 | s->data[s->size].tree = tree; 82 | s->data[s->size].state = state; 83 | s->size++; 84 | 85 | return PGS_NONE; 86 | } 87 | 88 | pgs_error pgs_parse_stack_append_terminal(pgs_parse_stack* s, pgs_token* t) { 89 | pgs_error error; 90 | long int state; 91 | pgs_tree* tree = pgs_create_tree_terminal(t); 92 | if(tree == NULL) return PGS_MALLOC; 93 | state = parse_state_table[pgs_parse_stack_top_state(s)][t->terminal]; 94 | error = pgs_parse_stack_append(s, tree, state); 95 | if(error) { 96 | pgs_free_tree_terminal(tree); 97 | return error; 98 | } 99 | return PGS_NONE; 100 | } 101 | 102 | pgs_error 
pgs_parse_stack_append_nonterminal(pgs_parse_stack* s, long int id, size_t count) { 103 | size_t i; 104 | pgs_tree** child_array; 105 | pgs_tree* new_tree; 106 | 107 | child_array = (pgs_tree**) malloc(sizeof(*child_array) * count); 108 | new_tree = pgs_create_tree_nonterminal(id, 0); 109 | if(child_array == NULL || new_tree == NULL) { 110 | free(child_array); 111 | return PGS_MALLOC; 112 | } 113 | for(i = 0; i < count; i++) { 114 | child_array[i] = s->data[s->size - count + i].tree; 115 | } 116 | 117 | new_tree->tree_data.nonterminal.nonterminal = id; 118 | new_tree->tree_data.nonterminal.child_count = count; 119 | new_tree->tree_data.nonterminal.children = child_array; 120 | 121 | s->size -= count; 122 | s->data[s->size].tree = new_tree; 123 | s->data[s->size].state = parse_state_table[pgs_parse_stack_top_state(s)][id + 2 + PGS_MAX_TERMINAL]; 124 | s->size++; 125 | 126 | return PGS_NONE; 127 | } 128 | 129 | void pgs_parse_stack_free(pgs_parse_stack* s) { 130 | size_t i; 131 | for(i = 0; i < s->size; i++) { 132 | free(s->data[i].tree); 133 | } 134 | free(s->data); 135 | } 136 | 137 | long int pgs_parse_stack_top_state(pgs_parse_stack* s) { 138 | return s->data[s->size - 1].state; 139 | } 140 | 141 | pgs_tree* pgs_parse_stack_top_tree(pgs_parse_stack* s) { 142 | return s->data[s->size - 1].tree; 143 | } 144 | 145 | #define PGS_PARSE_ERROR(label_name, error_name, code, text) \ 146 | error_name = code; \ 147 | pgs_state_error(s, error_name, text); \ 148 | goto label_name; 149 | 150 | pgs_error pgs_do_parse(pgs_state* s, pgs_token_list* list, pgs_tree** into) { 151 | pgs_error error; 152 | pgs_parse_stack stack; 153 | pgs_tree* top_tree; 154 | long int top_state; 155 | long int tree_table_index; 156 | long int current_token_id; 157 | long int action; 158 | struct pgs_item_s* item; 159 | pgs_token* current_token; 160 | size_t index = 0; 161 | 162 | if((error = pgs_parse_stack_init(&stack))) return error; 163 | while(1) { 164 | current_token_id = 
pgs_token_list_at_id(list, index); 165 | top_tree = pgs_parse_stack_top_tree(&stack); 166 | top_state = pgs_parse_stack_top_state(&stack); 167 | 168 | if(top_tree && 169 | top_tree->variant == PGS_TREE_NONTERMINAL && 170 | parse_final_table[top_tree->tree_data.nonterminal.nonterminal + 1]) 171 | break; 172 | 173 | action = parse_action_table[top_state][current_token_id]; 174 | 175 | if(action == -1) { 176 | PGS_PARSE_ERROR(error_label, error, PGS_BAD_TOKEN, "Unexpected token at position"); 177 | } else if(action == 0) { 178 | current_token = pgs_token_list_at(list, index); 179 | if(index >= (list->token_count)) { 180 | PGS_PARSE_ERROR(error_label, error, PGS_EOF_SHIFT, "Unexpected end of file"); 181 | } 182 | 183 | error = pgs_parse_stack_append_terminal(&stack, current_token); 184 | if(error) goto error_label; 185 | index++; 186 | } else { 187 | item = &items[action - 1]; 188 | error = pgs_parse_stack_append_nonterminal(&stack, item->left_id, item->right_count); 189 | } 190 | } 191 | 192 | if(index != list->token_count) { 193 | PGS_PARSE_ERROR(error_label, error, PGS_BAD_TOKEN, "Unexpected token at position"); 194 | } 195 | 196 | *into = stack.data[stack.size - 1].tree; 197 | stack.size -= 1; 198 | 199 | error_label: 200 | pgs_parse_stack_free(&stack); 201 | return error; 202 | } 203 | 204 | /* == Glue Code == */ 205 | pgs_error pgs_do_all(pgs_state* state, pgs_tree** into, const char* string) { 206 | pgs_error error; 207 | pgs_token_list tokens; 208 | pgs_state_init(state); 209 | *into = NULL; 210 | if((error = pgs_do_lex(state, &tokens, string))) { 211 | if(error == PGS_MALLOC) { 212 | pgs_state_error(state, error, "Failure to allocate memory while lexing"); 213 | } 214 | return error; 215 | } 216 | if((error = pgs_do_parse(state, &tokens, into))) { 217 | if(error == PGS_MALLOC) { 218 | pgs_state_error(state, error, "Failure to allocate memory while lexing"); 219 | } 220 | } 221 | pgs_token_list_free(&tokens); 222 | return error; 223 | } 224 | 
-------------------------------------------------------------------------------- /src/generators/crystal-common/tables.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "ecr" 3 | 4 | module Pegasus::Generators 5 | class CrystalTableGen 6 | def initialize(@prefix : String, @language : Pegasus::Language::LanguageData) 7 | end 8 | 9 | ECR.def_to_s "src/generators/crystal-common/tables.ecr" 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /src/generators/crystal-common/tables.ecr: -------------------------------------------------------------------------------- 1 | module <%= @prefix %> 2 | MAX_TERMINAL = <%= @language.max_terminal %> 3 | LEX_SKIP_TABLE = [ <% @language.lex_skip_table.each do |skip| %> <%= skip %>, <% end %> ] 4 | LEX_FINAL_TABLE = [ <% @language.lex_final_table.each do |final| %> <%= final %>_i64, <% end %> ] 5 | LEX_STATE_TABLE = [<% @language.lex_state_table.each do |state| %> 6 | [ <% state.each do |transition| %> <%= transition %>_i64, <% end %> ],<%- end %> 7 | ] 8 | PARSE_ACTION_TABLE = [<% @language.parse_action_table.each do |state| %> 9 | [ <% state.each do |transition| %> <%= transition %>_i64, <% end %> ],<%- end %> 10 | ] 11 | PARSE_STATE_TABLE = [<% @language.parse_state_table.each do |state| %> 12 | [ <% state.each do |transition| %> <%= transition %>_i64, <% end %> ],<%- end %> 13 | ] 14 | PARSE_FINAL_TABLE = [<% @language.parse_final_table.each do |skip| %> 15 | <%= skip %>,<%- end %> 16 | ] 17 | ITEMS = <% if @language.items.size == 0 %> [] of Tuple(Int64, Int64) <% else %> [<% @language.items.each do |item| %> 18 | { <%= item.head.raw_id %>_i64, <%= item.body.size %>_i64 },<%- end %> 19 | ]<%- end %> 20 | end 21 | -------------------------------------------------------------------------------- /src/generators/crystal/pegasus_crystal.cr: 
-------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "../../pegasus/json.cr" 3 | require "../crystal-common/tables.cr" 4 | require "../generators.cr" 5 | require "option_parser" 6 | require "ecr" 7 | 8 | module Pegasus::Generators::Crystal 9 | include Pegasus::Language 10 | include Pegasus::Generators::Api 11 | 12 | class CrystalContext 13 | property output_module : String 14 | 15 | def initialize(@output_module : String = "Pegasus::Generated") 16 | end 17 | 18 | def add_option(opt_parser) 19 | opt_parser.option_parser.on("-m", 20 | "--module=MODULE", 21 | "Sets the module in generated code") do |m| 22 | @output_module = m 23 | end 24 | end 25 | end 26 | 27 | class LanguageInput < StdInput(LanguageData) 28 | def process(opt_parser) : LanguageData 29 | LanguageData.from_json STDIN 30 | end 31 | end 32 | 33 | class ParserGenerator < FileGenerator(CrystalContext, LanguageData) 34 | def initialize(parent) 35 | super parent, "parser", "parser.cr", "the generated parser file" 36 | end 37 | 38 | def to_s(io) 39 | ECR.embed "src/generators/crystal/pegasus_crystal_template.ecr", io 40 | end 41 | end 42 | end 43 | 44 | include Pegasus::Generators::Crystal 45 | 46 | parser = PegasusOptionParser(CrystalContext, LanguageData).new LanguageInput.new 47 | ParserGenerator.new(parser) 48 | parser.run 49 | -------------------------------------------------------------------------------- /src/generators/crystal/pegasus_crystal_template.ecr: -------------------------------------------------------------------------------- 1 | <%= Pegasus::Generators::CrystalTableGen.new(context.output_module, input!).to_s %> 2 | 3 | module <%= context.output_module %> 4 | extend self 5 | 6 | abstract class Tree 7 | abstract def table_index 8 | end 9 | 10 | class NonterminalTree < Tree 11 | getter nonterminal_id : Int64 12 | getter children : Array(Tree) 13 | 14 | def initialize(@nonterminal_id, @children = [] of Tree) 15 
| end 16 | 17 | def table_index 18 | nonterminal_id + 1 + MAX_TERMINAL + 1 19 | end 20 | 21 | def name 22 | case nonterminal_id<% input!.nonterminals.each do |nt| %> 23 | when <%= nt[1].raw_id %>_i64 24 | <%= nt[0].dump -%> 25 | <%- end %> 26 | else 27 | "???" 28 | end 29 | end 30 | end 31 | 32 | class TerminalTree < Tree 33 | getter terminal_id : Int64 34 | getter string : String 35 | 36 | def initialize(@terminal_id, @string) 37 | end 38 | 39 | def table_index 40 | terminal_id + 1 41 | end 42 | end 43 | 44 | class Token 45 | getter terminal_id : Int64 46 | getter string : String 47 | 48 | def initialize(@terminal_id, @string) 49 | end 50 | end 51 | 52 | def lex(string) 53 | index = 0 54 | tokens = [] of Token 55 | bytes = string.bytes 56 | 57 | while index < bytes.size 58 | start_index = index 59 | last_match_index = -1 60 | last_pattern = -1_i64 61 | state = 1 62 | 63 | while index < bytes.size 64 | state = LEX_STATE_TABLE[state][bytes[index]] 65 | id = LEX_FINAL_TABLE[state] 66 | 67 | break if state == 0 68 | index += 1 69 | next if id == 0 70 | 71 | last_match_index = index - 1 72 | last_pattern = id 73 | end 74 | 75 | raise "Invalid character #{bytes[start_index].to_s.dump_unquoted} at position #{start_index}" if last_match_index == -1 76 | next if LEX_SKIP_TABLE[last_pattern] 77 | tokens << Token.new(last_pattern - 1, string[start_index..last_match_index]) 78 | end 79 | 80 | return tokens 81 | end 82 | 83 | def parse(tokens) 84 | tree_stack = [ ] of Tree 85 | state_stack = [ 1_i64 ] 86 | index = 0 87 | 88 | loop do 89 | break if tree_stack.last?.try(&.as?(NonterminalTree)).try(&.nonterminal_id) == 0 90 | token = tokens[index]? 
91 | action = PARSE_ACTION_TABLE[state_stack.last][token.try(&.terminal_id.+(1)) || 0_i64] 92 | raise "Invalid token #{token.try &.string.dump || "EOF"}" if action == -1 93 | 94 | if action == 0 95 | raise "Unexpected end of file" unless token 96 | tree_stack << TerminalTree.new token.terminal_id, token.string 97 | index += 1 98 | else 99 | item = ITEMS[action - 1] 100 | tree = NonterminalTree.new item[0] 101 | 102 | item[1].times do 103 | tree.children.insert 0, tree_stack.pop 104 | state_stack.pop 105 | end 106 | 107 | tree_stack << tree 108 | end 109 | 110 | state_stack << PARSE_STATE_TABLE[state_stack.last][tree_stack.last.table_index] 111 | end 112 | raise "Invalid token #{tokens[index].string.dump}" if index < tokens.size 113 | return tree_stack.last 114 | end 115 | 116 | def process(string) 117 | parse(lex(string)) 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /src/generators/crystalsem/pegasus_crystal_template.ecr: -------------------------------------------------------------------------------- 1 | <%= input!.semantics.init %> 2 | 3 | <%= Pegasus::Generators::CrystalTableGen.new(context.output_module, input!.language).to_s %> 4 | 5 | module <%= context.output_module %> 6 | extend self 7 | 8 | alias StackType = <%= input!.semantics.types.values.join "|" %> 9 | 10 | class Token 11 | getter terminal_id : Int64 12 | getter string : String 13 | 14 | def initialize(@terminal_id, @string) 15 | end 16 | end 17 | 18 | def lex(string) 19 | index = 0 20 | tokens = [] of Token 21 | bytes = string.bytes 22 | 23 | while index < bytes.size 24 | start_index = index 25 | last_match_index = -1 26 | last_pattern = -1_i64 27 | state = 1 28 | 29 | while index < bytes.size 30 | state = LEX_STATE_TABLE[state][bytes[index]] 31 | id = LEX_FINAL_TABLE[state] 32 | 33 | break if state == 0 34 | index += 1 35 | next if id == 0 36 | 37 | last_match_index = index - 1 38 | last_pattern = id 39 | end 40 | 41 | raise "Invalid 
character #{bytes[start_index].to_s.dump_unquoted} at position #{start_index}" if last_match_index == -1 42 | next if LEX_SKIP_TABLE[last_pattern] 43 | tokens << Token.new(last_pattern - 1, string[start_index..last_match_index]) 44 | end 45 | 46 | return tokens 47 | end 48 | 49 | def parse(tokens) 50 | temp = uninitialized StackType 51 | value_stack = [ ] of StackType 52 | state_stack = [ 1_i64 ] 53 | index = 0 54 | 55 | loop do 56 | token = tokens[index]? 57 | action = PARSE_ACTION_TABLE[state_stack.last][token.try(&.terminal_id.+(1)) || 0_i64] 58 | raise "Invalid token #{token.try &.string.dump || "EOF"}" if action == -1 59 | 60 | if action == 0 61 | raise "Unexpected end of file" unless token 62 | index += 1 63 | value_stack << token 64 | state_stack << PARSE_STATE_TABLE[state_stack.last][token.terminal_id + 1] 65 | else 66 | item = ITEMS[action - 1] 67 | 68 | case action - 1 69 | <%- input!.semantics.actions.each do |k, v| -%> 70 | when <%= k %> 71 | <%= input!.format_item(k, v) %> 72 | <%- end -%> 73 | end 74 | 75 | value_stack.pop item[1] 76 | state_stack.pop item[1] 77 | value_stack << temp 78 | 79 | break if PARSE_FINAL_TABLE[item[0]+1] 80 | state_stack << PARSE_STATE_TABLE[state_stack.last][item[0] + 2 + MAX_TERMINAL] 81 | end 82 | end 83 | raise "Invalid token #{tokens[index].string.dump}" if index < tokens.size 84 | return value_stack.last 85 | end 86 | 87 | def process(string) 88 | parse(lex(string)) 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /src/generators/crystalsem/pegasus_crystalsem.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "../../pegasus/json.cr" 3 | require "../../pegasus/semantics.cr" 4 | require "../crystal-common/tables.cr" 5 | require "../generators.cr" 6 | require "option_parser" 7 | require "ecr" 8 | 9 | module Pegasus::Generators::CrystalSem 10 | include Pegasus::Language 11 | 
include Pegasus::Generators::Api 12 | include Pegasus::Semantics 13 | 14 | class CrystalContext 15 | property output_module : String 16 | 17 | def initialize(@output_module : String = "Pegasus::Generated") 18 | end 19 | 20 | def add_option(opt_parser) 21 | opt_parser.option_parser.on("-m", 22 | "--module=MODULE", 23 | "Sets the module in generated code") do |m| 24 | @output_module = m 25 | end 26 | end 27 | end 28 | 29 | class GeneratorInput 30 | property language : LanguageData 31 | property semantics : SemanticsData 32 | 33 | def initialize(@language, @semantics) 34 | end 35 | 36 | def format_item(index, code) 37 | item = @language.items[index] 38 | 39 | unless head_type = @semantics.nonterminal_types[item.head]? 40 | raise_general "no type specified for nonterminal" 41 | end 42 | code = code.gsub "$out", "temp" 43 | 44 | item.body.each_with_index do |element, i| 45 | data_var = "value_stack[-1-#{item.body.size - 1 - i}]" 46 | case element 47 | when Pegasus::Elements::TerminalId 48 | data_var += ".as(Token)" 49 | code = code.gsub "$#{i}", "(" + data_var + ")" 50 | when Pegasus::Elements::NonterminalId 51 | next unless name = @semantics.nonterminal_types[element] 52 | data_var += ".as(#{@semantics.types[name]})" 53 | code = code.gsub "$#{i}", "(" + data_var + ")" 54 | end 55 | end 56 | 57 | return code 58 | end 59 | end 60 | 61 | class LanguageInput < FileInput(LanguageData) 62 | def initialize 63 | super "language", "the grammar file" 64 | end 65 | 66 | def process(opt_parser, file) : LanguageData 67 | LanguageData.from_json file 68 | end 69 | end 70 | 71 | class FullInput < FileInput(GeneratorInput) 72 | def initialize(@language_input : Input(LanguageData)) 73 | super "actions", "the semantic actions file" 74 | end 75 | 76 | def process(opt_parser, file) : GeneratorInput 77 | language_data = @language_input.process(opt_parser) 78 | semantics_data = SemanticsData.new file.gets_to_end, "Token", language_data 79 | GeneratorInput.new(language_data,semantics_data) 80 
| end 81 | 82 | def add_option(opt_parser) 83 | @language_input.add_option(opt_parser) 84 | super opt_parser 85 | end 86 | end 87 | 88 | class SourceGenerator < FileGenerator(CrystalContext, GeneratorInput) 89 | def initialize(parent) 90 | super parent, "code", "parser.cr", "the parser source code file" 91 | end 92 | 93 | def to_s(io) 94 | ECR.embed "src/generators/crystalsem/pegasus_crystal_template.ecr", io 95 | end 96 | end 97 | end 98 | 99 | include Pegasus::Generators::CrystalSem 100 | 101 | parser = PegasusOptionParser(CrystalContext, GeneratorInput).new FullInput.new(LanguageInput.new) 102 | SourceGenerator.new(parser) 103 | parser.run 104 | -------------------------------------------------------------------------------- /src/generators/csem/pegasus_c_header_template.ecr: -------------------------------------------------------------------------------- 1 | <%= {{ read_file "src/generators/c-common/standard_header.h" }} %> 2 | 3 | /* == Nonterminal ID Definitions == */ 4 | <% input!.language.nonterminals.each do |name, value| -%> 5 | #define PGS_NONTERMINAL_<%= name.underscore.upcase %> <%= value.raw_id %> 6 | <% end -%> 7 | 8 | /* == Parsing Definitions == */ 9 | /** 10 | * A value that can exist on the pegasus stack. 11 | * The possible values of this union depend on the type 12 | * assigned to the nonterminals. 
13 | */ 14 | union pgs_stack_value_u { 15 | <% input!.semantics.types.each do |k, v| %><%= v %> <%= k %>;<% end %> 16 | }; 17 | 18 | <%= {{ read_file "src/generators/csem/sem_header.h" }} %> 19 | -------------------------------------------------------------------------------- /src/generators/csem/pegasus_c_template.ecr: -------------------------------------------------------------------------------- 1 | 2 | /* == User Code == */ 3 | <%= input!.semantics.init %> 4 | 5 | <%= Pegasus::Generators::CTableGen.new(input!.language).to_s %> 6 | 7 | <%= {{ read_file "src/generators/c-common/standard_source.c" }} %> 8 | 9 | <%= {{ read_file "src/generators/csem/sem_source.c" }} %> 10 | 11 | pgs_error pgs_do_parse(pgs_state* s, pgs_token_list* list, pgs_stack_value* into, const char* src) { 12 | pgs_error error; 13 | pgs_parse_stack stack; 14 | long int top_state; 15 | long int current_token_id; 16 | long int action; 17 | long int nonterminal; 18 | size_t index = 0; 19 | pgs_stack_value temp; 20 | 21 | if((error = pgs_parse_stack_init(&stack))) return error; 22 | while(1) { 23 | current_token_id = pgs_token_list_at_id(list, index); 24 | top_state = pgs_parse_stack_top_state(&stack); 25 | action = parse_action_table[top_state][current_token_id]; 26 | 27 | if(action == -1) { 28 | PGS_PARSE_ERROR(error_label, error, PGS_BAD_TOKEN, "Unexpected token at position"); 29 | } else if(action == 0) { 30 | temp.token = pgs_token_list_at(list, index); 31 | if(index >= (list->token_count)) { 32 | PGS_PARSE_ERROR(error_label, error, PGS_EOF_SHIFT, "Unexpected end of file"); 33 | } 34 | 35 | error = pgs_parse_stack_append(&stack, &temp, parse_state_table[top_state][temp.token->terminal]); 36 | if(error) goto error_label; 37 | index++; 38 | } else { 39 | switch(action - 1) { 40 | <%- input!.semantics.actions.each do |k, v| -%> 41 | case <%= k %>: <%= input!.format_item(k, v) %> 42 | <%- end -%> 43 | default: break; 44 | } 45 | nonterminal = items[action - 1].left_id; 46 | stack.size -= 
items[action - 1].right_count; 47 | top_state = pgs_parse_stack_top_state(&stack); 48 | error = pgs_parse_stack_append(&stack, &temp, parse_state_table[top_state][nonterminal + 2 + PGS_MAX_TERMINAL]); 49 | if(parse_final_table[nonterminal + 1]) goto after_loop; 50 | } 51 | } 52 | after_loop: 53 | 54 | if(index != list->token_count) { 55 | PGS_PARSE_ERROR(error_label, error, PGS_BAD_TOKEN, "Unexpected token at position"); 56 | } 57 | 58 | *into = stack.data[stack.size - 1].value; 59 | stack.size -= 1; 60 | 61 | error_label: 62 | pgs_parse_stack_free(&stack); 63 | return error; 64 | } 65 | -------------------------------------------------------------------------------- /src/generators/csem/pegasus_csem.cr: -------------------------------------------------------------------------------- 1 | require "../../pegasus/language_def.cr" 2 | require "../../pegasus/json.cr" 3 | require "../../pegasus/semantics.cr" 4 | require "../c-common/tables.cr" 5 | require "../generators.cr" 6 | require "option_parser" 7 | require "ecr" 8 | 9 | module Pegasus::Generators::CSem 10 | include Pegasus::Language 11 | include Pegasus::Generators::Api 12 | include Pegasus::Semantics 13 | 14 | class CContext 15 | def add_option(opt_parser) 16 | end 17 | end 18 | 19 | class GeneratorInput 20 | property language : LanguageData 21 | property semantics : SemanticsData 22 | 23 | def initialize(@language, @semantics) 24 | end 25 | 26 | def format_item(index, code) 27 | item = @language.items[index] 28 | 29 | unless head_type = @semantics.nonterminal_types[item.head]? 30 | raise_general "no type specified for nonterminal" 31 | end 32 | code = code.gsub "$out", "temp." 
+ head_type 33 | 34 | item.body.each_with_index do |element, i| 35 | data_var = "stack.data[stack.size - 1 - #{item.body.size - 1 - i}].value" 36 | case element 37 | when Pegasus::Elements::TerminalId 38 | data_var += ".token" 39 | code = code.gsub "$#{i}", "(" + data_var + ")" 40 | when Pegasus::Elements::NonterminalId 41 | next unless name = @semantics.nonterminal_types[element] 42 | data_var += "." + name 43 | code = code.gsub "$#{i}", "(" + data_var + ")" 44 | end 45 | end 46 | 47 | return "{ { #{code} } break; }" 48 | end 49 | end 50 | 51 | class LanguageInput < FileInput(LanguageData) 52 | def initialize 53 | super "language", "the grammar file" 54 | end 55 | 56 | def process(opt_parser, file) : LanguageData 57 | LanguageData.from_json file 58 | end 59 | end 60 | 61 | class FullInput < FileInput(GeneratorInput) 62 | def initialize(@language_input : Input(LanguageData)) 63 | super "actions", "the semantic actions file" 64 | end 65 | 66 | def process(opt_parser, file) : GeneratorInput 67 | language_data = @language_input.process(opt_parser) 68 | semantics_data = SemanticsData.new file.gets_to_end, "pgs_token*", language_data 69 | GeneratorInput.new(language_data,semantics_data) 70 | end 71 | 72 | def add_option(opt_parser) 73 | @language_input.add_option(opt_parser) 74 | super opt_parser 75 | end 76 | end 77 | 78 | class HeaderGenerator < FileGenerator(CContext, GeneratorInput) 79 | def initialize(parent) 80 | super parent, "header", "parser.h", "the parser header file" 81 | end 82 | 83 | def to_s(io) 84 | ECR.embed "src/generators/csem/pegasus_c_header_template.ecr", io 85 | end 86 | end 87 | 88 | class SourceGenerator < FileGenerator(CContext, GeneratorInput) 89 | def initialize(parent) 90 | super parent, "code", "parser.c", "the parser source code file" 91 | end 92 | 93 | def to_s(io) 94 | io << "#include \"#{@parent.output_file_names["header"]}\"\n" 95 | ECR.embed "src/generators/csem/pegasus_c_template.ecr", io 96 | end 97 | end 98 | end 99 | 100 | include 
Pegasus::Generators::CSem 101 | 102 | parser = PegasusOptionParser(CContext, GeneratorInput).new FullInput.new(LanguageInput.new) 103 | HeaderGenerator.new(parser) 104 | SourceGenerator.new(parser) 105 | parser.run 106 | -------------------------------------------------------------------------------- /src/generators/csem/sem_header.h: -------------------------------------------------------------------------------- 1 | /** 2 | * An element on the parse stack, which holds 3 | * both a tree node and a state. In theory, 4 | * the stack is actually items followed by states, 5 | * but since one always comes after the other, 6 | * and since both need to be looked up fast, 7 | * we put them on a stack in parallel. 8 | */ 9 | struct pgs_parse_stack_element_s { 10 | /** The value on the stack */ 11 | union pgs_stack_value_u value; 12 | /** The state on the stack */ 13 | long int state; 14 | }; 15 | 16 | /** 17 | * A parse stack. The PDA automaton 18 | * has to maintain this stack, where it gradually 19 | * assembles a tree. 20 | */ 21 | struct pgs_parse_stack_s { 22 | /** The number of stack elements currently allocated. */ 23 | size_t capacity; 24 | /** The current number of stack elements. */ 25 | size_t size; 26 | /** The stack element array. */ 27 | struct pgs_parse_stack_element_s* data; 28 | }; 29 | 30 | typedef union pgs_stack_value_u pgs_stack_value; 31 | typedef struct pgs_parse_stack_element_s pgs_parse_stack_element; 32 | typedef struct pgs_parse_stack_s pgs_parse_stack; 33 | 34 | /** 35 | * Initialzies a parse stack. 36 | * @param s the parse stack to initialize. 37 | * @return the result of the initialization. 38 | */ 39 | pgs_error pgs_parse_stack_init(pgs_parse_stack* s); 40 | /** 41 | * Appends (pushes) a new value and state to the stack. 42 | * @param s the stack to append to. 43 | * @param v the value to append. 44 | * @param state the state to append. 45 | * @return the result of the append. 
46 | */ 47 | pgs_error pgs_parse_stack_append(pgs_parse_stack* s, pgs_stack_value* v, long int state); 48 | /** 49 | * Gets the state on the top of the stack. 50 | * @param s the stack for which to get a state. 51 | * @return the state on the top of the stack. 52 | */ 53 | long int pgs_parse_stack_top_state(pgs_parse_stack* s); 54 | /** 55 | * Gets the value on the top of the stack. 56 | * @param s the stack for which to get a value. 57 | * @return the value on the top of the stack. 58 | */ 59 | pgs_stack_value* pgs_parse_stack_top_value(pgs_parse_stack* s); 60 | /** 61 | * Frees a parse stack. 62 | * @param s the stack to free. 63 | */ 64 | void pgs_parse_stack_free(pgs_parse_stack* s); 65 | /** 66 | * Takes the given tokens, and attempts to convert them into a value. 67 | * @param s the state used for storing errors. 68 | * @param list the list of tokens, already filled. 69 | * @param into the value pointer pointer into which a new value will be stored. 70 | * @param src the original string, for the user-defined actions. 71 | * @return the error, if any, that occured. 72 | */ 73 | pgs_error pgs_do_parse(pgs_state* s, pgs_token_list* list, pgs_stack_value* into, const char* src); 74 | 75 | /* == Glue == */ 76 | /** 77 | * Attempts to parse tokens from the given string into the given value. 78 | * @param state the state to initialize with error information, if necessary. 79 | * @param into the value to build into. 80 | * @param string the string from which to read. 81 | * @return the error, if any, that occured. 
82 | */ 83 | pgs_error pgs_do_all(pgs_state* state, pgs_stack_value* into, const char* string); 84 | -------------------------------------------------------------------------------- /src/generators/csem/sem_source.c: -------------------------------------------------------------------------------- 1 | /* == Glue Code == */ 2 | 3 | pgs_error pgs_do_all(pgs_state* state, pgs_stack_value* into, const char* string) { 4 | pgs_error error; 5 | pgs_token_list tokens; 6 | pgs_state_init(state); 7 | if((error = pgs_do_lex(state, &tokens, string))) { 8 | if(error == PGS_MALLOC) { 9 | pgs_state_error(state, error, "Failure to allocate memory while lexing"); 10 | } 11 | return error; 12 | } 13 | if((error = pgs_do_parse(state, &tokens, into, string))) { 14 | if(error == PGS_MALLOC) { 15 | pgs_state_error(state, error, "Failure to allocate memory while lexing"); 16 | } 17 | } 18 | pgs_token_list_free(&tokens); 19 | return error; 20 | } 21 | 22 | /* == Parsing Code == */ 23 | 24 | pgs_error pgs_parse_stack_init(pgs_parse_stack* s) { 25 | s->capacity = 8; 26 | s->size = 1; 27 | s->data = (pgs_parse_stack_element*) malloc(sizeof(*(s->data)) * s->capacity); 28 | 29 | if(s->data == NULL) return PGS_MALLOC; 30 | s->data[0].state = 1; 31 | 32 | return PGS_NONE; 33 | } 34 | 35 | pgs_error pgs_parse_stack_append(pgs_parse_stack* s, pgs_stack_value* v, long int state) { 36 | if(s->capacity == s->size) { 37 | pgs_parse_stack_element* new_elements = 38 | (pgs_parse_stack_element*) realloc( 39 | s->data, sizeof(*new_elements) * s->capacity * 2); 40 | if(new_elements == NULL) return PGS_MALLOC; 41 | s->capacity *= 2; 42 | s->data = new_elements; 43 | } 44 | 45 | s->data[s->size].value = *v; 46 | s->data[s->size].state = state; 47 | s->size++; 48 | 49 | return PGS_NONE; 50 | } 51 | 52 | void pgs_parse_stack_free(pgs_parse_stack* s) { 53 | size_t i; 54 | for(i = 0; i < s->size; i++) { 55 | /* Maybe eventually free individual union values */ 56 | } 57 | free(s->data); 58 | } 59 | 60 | long int 
pgs_parse_stack_top_state(pgs_parse_stack* s) { 61 | return s->data[s->size - 1].state; 62 | } 63 | 64 | pgs_stack_value* pgs_parse_stack_top_value(pgs_parse_stack* s) { 65 | return &s->data[s->size - 1].value; 66 | } 67 | 68 | #define PGS_PARSE_ERROR(label_name, error_name, code, text) \ 69 | error_name = code; \ 70 | pgs_state_error(s, error_name, text); \ 71 | goto label_name; 72 | 73 | -------------------------------------------------------------------------------- /src/generators/generators.cr: -------------------------------------------------------------------------------- 1 | require "../pegasus/language_def.cr" 2 | require "option_parser" 3 | 4 | module Pegasus::Generators::Api 5 | # Class that specifies the program's output mode. 6 | # The idea is to generalize behaviors such as 7 | # merging into a single file or printing out to STDOUT. 8 | # The `#output` method takes in a parser and, as side effect, 9 | # should emit the output of its various `FileGenerator` classes. 10 | abstract class OutputMode 11 | # Output the content of the given `opt_parser`. 12 | abstract def output(opt_parser) 13 | end 14 | 15 | # Output mode that produces individual files 16 | # as specified by the `FileGenerator` classes. 17 | class FilesOutputMode < OutputMode 18 | def output(opt_parser) 19 | opt_parser.file_gens.each do |gen| 20 | file = File.open(opt_parser.output_file_names[gen.name], "w") 21 | gen.to_s(file) 22 | file.close 23 | end 24 | end 25 | end 26 | 27 | # Output mode that produces a single file. 28 | class FileOutputMode < OutputMode 29 | # Creates a new file output mode that generates a file with the given name. 
30 | def initialize(@filename : String) 31 | end 32 | 33 | def output(opt_parser) 34 | file = File.open(@filename, "w") 35 | opt_parser.file_gens.each do |gen| 36 | gen.to_s(file) 37 | end 38 | file.close 39 | end 40 | end 41 | 42 | # Output mode that prints all the generated files to STDOUT, 43 | # in the order they were added to the `PegasusOptionParser` 44 | class StdOutputMode < OutputMode 45 | def output(opt_parser) 46 | opt_parser.file_gens.each do |gen| 47 | gen.to_s(STDOUT) 48 | end 49 | end 50 | end 51 | 52 | # A generalization of data input. Subclasses 53 | # such as `StdInput` and `FileInput` provide 54 | # a way to read grammar / semantics files from 55 | # various sources. The `#add_option` method registers 56 | # command-line option(s) for the user to configure. 57 | abstract class Input(I) 58 | # Register this input method's options 59 | # with the given `PegasusOptionParser`. 60 | def add_option(opt_parser) 61 | end 62 | 63 | # Read input of type `I`. 64 | abstract def process(opt_parser) : I 65 | end 66 | 67 | # Input method that reads directly from `STDIN`. 68 | # This technically doesn't add any new methods, 69 | # but makes code more clear. 70 | abstract class StdInput(I) < Input(I) 71 | end 72 | 73 | # Input method that reads from a file, the 74 | # name of which is specified on the command line. 75 | abstract class FileInput(I) < Input(I) 76 | # The internal name of this input. The `PegasusOptionParser` 77 | # will associated a file name with this string. 78 | property name : String 79 | # The user-friendly description of the input 80 | # that will be shown on the help screen. 81 | property description : String 82 | # The name of the file to read from. 83 | property filename : String? 84 | 85 | # Create a new file input with the given internal name 86 | # and user-friendly description. 
87 | def initialize(@name, @description) 88 | end 89 | 90 | def process(opt_parser) : I 91 | file = File.open(@filename.not_nil!, "r") 92 | result = process(opt_parser, file) 93 | file.close 94 | return result 95 | end 96 | 97 | def add_option(opt_parser) 98 | opt_parser.option_parser.on("-#{name[0].downcase} FILE", 99 | "--input-#{name}=FILE", 100 | "Sets #{description}") do |file| 101 | @filename = file 102 | end 103 | end 104 | 105 | # Read a value of type `I` from a file. 106 | abstract def process(opt_parser, file) : I 107 | end 108 | 109 | # High-level class for constructing parser generators 110 | # that are configurable from the command line. 111 | # 112 | # This class uses `Input` to read a value of 113 | # type `I`, then uses the registered `FileGenerator` instances 114 | # to produce output via an `OutputMode`. All of these 115 | # listed classes are registered with Crystal's native `OptionParser`, 116 | # which serves to provide a user with configuration options. 117 | # 118 | # The `#output_file_names` and `#input_file_names` hashes store 119 | # the names of target output files and input files, respectively. 120 | # These are updated by the `Input` and `FileGenerator`s, as well 121 | # as through user-supplied command-line options. 122 | class PegasusOptionParser(C, I) 123 | # The context class (which must implement the `add_option` method) 124 | # is included with the generator to store and retrieve 125 | # parser-specific options. `FileGenerator#context` is used within 126 | # a generator to access this value. 127 | getter context : C 128 | # The input gathered from the `Input` class. This starts 129 | # uninitialized, but is set partway through `#run`. 130 | getter input : I? 131 | # The list of registered file generators. 132 | getter file_gens : Array(FileGenerator(C, I)) 133 | # The Crystal-native `OptionParser` used to actually 134 | # print options to the console. 
135 | getter option_parser : OptionParser 136 | # Hash that stores the configured file names of the various 137 | # `FileGenerator` instances, associated with their internal names. 138 | # The file names are kept outside their generators so that 139 | # two generators that depend on one another (like a source file 140 | # including a header file) can know each other's names. 141 | getter output_file_names : Hash(String, String) 142 | 143 | # Create a new `PegasusOptionParser` with the given input method and context. 144 | def initialize(@input_method : Input(I), @context = C.new) 145 | @output = FilesOutputMode.new 146 | @file_gens = [] of FileGenerator(C, I) 147 | @option_parser = OptionParser.new 148 | @output_file_names = {} of String => String 149 | 150 | @input_method.add_option(self) 151 | @context.add_option(self) 152 | @option_parser.on("-S", 153 | "--stdout", 154 | "Sets output mode to standard output") do 155 | @output = StdOutputMode.new 156 | end 157 | @option_parser.on("-s FILE", 158 | "--single-file=FILE", 159 | "Sets output mode to single file.") do |file| 160 | @output = FileOutputMode.new file 161 | end 162 | @option_parser.on("-f PREFIX", 163 | "--file-prefix=PREFIX", 164 | "Sets the file prefix for generated files.") do |p| 165 | @output_file_names.each do |k,v| 166 | @output_file_names[k] = p + v 167 | end 168 | end 169 | @option_parser.on("-H", "--help", "Show this text") do 170 | puts @option_parser 171 | exit 172 | end 173 | end 174 | 175 | # Run the command line program, and the constructed generator. 176 | def run 177 | @option_parser.parse 178 | @input = @input_method.process(self) 179 | @output.output(self) 180 | end 181 | end 182 | 183 | # A base class for a source file generator. 184 | # This class is meant to be extended by each individual 185 | # file generator that uses `ECR`, and thus provides 186 | # the methods `#input!` and `#context` to make 187 | # the genertor's input and context available inside 188 | # the template file. 
189 |   class FileGenerator(C, I)
190 |     # The parser program to which this generator belongs,
191 |     # used to retrieve input and context and to configure
192 |     # and retrieve file names.
193 |     property parent : PegasusOptionParser(C, I)
194 |     # The internal name of this file generator,
195 |     # which will be associated with a filename by the `PegasusOptionParser`.
196 |     property name : String
197 |     # The default filename this generator will write to.
198 |     property default_filename : String
199 |     # The user-friendly description of this generator.
200 |     property description : String
201 | 
202 |     # Creates a new file generator attached to the given `PegasusOptionParser`,
203 |     # with the given name, default filename, and description.
204 |     def initialize(@parent, @name, @default_filename, @description)
205 |       @parent.file_gens << self
206 |       add_option(@parent)
207 |       @parent.output_file_names[@name] = @default_filename
208 |     end
209 | 
210 |     # Adds required options to the given option parser.
211 |     def add_option(opt_parser)
212 |       opt_parser.option_parser.on("-#{name[0].downcase} FILE",
213 |         "--#{name}-file=FILE",
214 |         "Sets output target for #{description}") do |n|
215 |         opt_parser.output_file_names[name] = n
216 |       end
217 |     end
218 | 
219 |     # Convenience method to access the parser generator input from
220 |     # an ECR template.
221 |     def input!
222 |       @parent.input.not_nil!
223 |     end
224 | 
225 |     # Convenience method to access the parser context from
226 |     # an ECR template.
227 | def context 228 | @parent.context 229 | end 230 | end 231 | end 232 | -------------------------------------------------------------------------------- /src/pegasus.cr: -------------------------------------------------------------------------------- 1 | require "./pegasus/language_def.cr" 2 | require "./pegasus/json.cr" 3 | require "./pegasus/error.cr" 4 | 5 | begin 6 | grammar = STDIN.gets_to_end 7 | definition = Pegasus::Language::LanguageDefinition.new grammar 8 | data = Pegasus::Language::LanguageData.new definition 9 | data.to_json(STDOUT) 10 | rescue e : Pegasus::Error::PegasusException 11 | e.print(STDERR) 12 | end 13 | -------------------------------------------------------------------------------- /src/pegasus/automaton.cr: -------------------------------------------------------------------------------- 1 | module Pegasus 2 | # This module contains automata-related code. Since Pegasus uses 3 | # Deterministic, nondeterministic, and push-down automata, there is a lot 4 | # of common code. This module is for the common code. 5 | module Automata 6 | # A generic state for an automaton, with transitions 7 | # labeled by T and values of V. 8 | class State(V, T) 9 | # The unique ID of the state. 10 | getter id : Int64 11 | # The additional data the state holds. 12 | getter data : V 13 | # The transitions from this state to other states. 14 | getter transitions : Hash(T, self) 15 | 16 | # Creates a new state with the given ID, data, and transitions. 17 | def initialize(*, @id, @data, @transitions = Hash(T, self).new) 18 | end 19 | end 20 | 21 | # A generic automaton to represent common operations on the 22 | # different kinds of automata. 23 | class Automaton(V, T) 24 | # The states that this automaton has. 25 | getter states : Set(State(V, T)) 26 | # The state ID to use for the next state. 27 | getter last_id : Int64 28 | # The start state. 29 | property start : State(V, T)? 30 | 31 | # Creates a new automaton. 
32 | def initialize 33 | @last_id = 0_i64 34 | @states = Set(State(V, T)).new 35 | @start = nil 36 | end 37 | 38 | # Creates a new state for the given data. 39 | def state_for(*, data : V) 40 | new_state = State(V, T).new id: @last_id, data: data 41 | @last_id += 1 42 | @states << new_state 43 | return new_state 44 | end 45 | end 46 | 47 | # Another generic automaton. Since many automatons created by 48 | # pegasus do not like two nodes with the same data, 49 | # this class overries the `#state_for` function to return 50 | # an existing state with the given data if such a state exists. 51 | class UniqueAutomaton(V, T) < Automaton(V, T) 52 | # Creates a new UniqueAutomaton. 53 | def initialize 54 | super 55 | @memorized = Hash(V, State(V, T)).new 56 | end 57 | 58 | # Creates a new state for the given data, 59 | # or returns an existing state with the data 60 | # if one exists. 61 | def state_for(*, data : V) 62 | return @memorized[data] if @memorized.has_key? data 63 | new_state = super(data: data) 64 | @memorized[data] = new_state 65 | return new_state 66 | end 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /src/pegasus/dfa.cr: -------------------------------------------------------------------------------- 1 | require "./automaton.cr" 2 | require "./nfa.cr" 3 | 4 | module Pegasus 5 | # This module is for deterministic finite automata. 6 | # DFAs are used in Pegasus to describe the tokenizer state machine. 7 | module Dfa 8 | alias DState = Automata::State(Set(Nfa::NState), UInt8) 9 | 10 | # A deterministic finite automaton, whose dtransitions 11 | # are marked by bytes and whose data is actually the collection 12 | # of states this state represents in the source `Pegasus::Nfa::Nfa`. 
13 | class Dfa < Automata::UniqueAutomaton(Set(Nfa::NState), UInt8) 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /src/pegasus/elements.cr: -------------------------------------------------------------------------------- 1 | module Pegasus 2 | # This module contains "elements" which are part of a production. Generally, elements 3 | # are terminals and nonterminals. Additionaly, special-case elements for certain 4 | # algorithms are contained here (the EOF element and the Empty element) 5 | module Elements 6 | # An item that can be in a lookahead item's follow set. 7 | # This could be a terminal ID, or the special reserved EOF and "empty" (epsilon) 8 | # elements. 9 | abstract class LookaheadElement 10 | end 11 | 12 | # A lookahead element which can be used as in index to a lookup table. 13 | abstract class IndexableElement < LookaheadElement 14 | # Gets the table index of this element. 15 | abstract def table_index : Int64 16 | end 17 | 18 | # The special-case empty (epsilon) element used for follow set computation. 19 | class EmptyElement < LookaheadElement 20 | def ==(other : EmptyElement) 21 | return true 22 | end 23 | 24 | def ==(other : LookaheadElement) 25 | return false 26 | end 27 | 28 | def hash(hasher) 29 | hasher 30 | end 31 | end 32 | 33 | # The EOF element. Represents the end of the file, and is not matched as a token by the lexer. 34 | class EofElement < IndexableElement 35 | def table_index : Int64 36 | return 0_i64 37 | end 38 | 39 | def ==(other : EofElement) 40 | return true 41 | end 42 | 43 | def ==(other : LookaheadElement) 44 | return false 45 | end 46 | 47 | def hash(hasher) 48 | hasher 49 | end 50 | end 51 | 52 | # A terminal, as specified by the user. This is __not__ a special case element, and one terminal ID 53 | # exists for every token the user registers. 
54 | class TerminalId < IndexableElement 55 | def initialize(@id : Int64) 56 | end 57 | 58 | def table_index : Int64 59 | return @id + 1 60 | end 61 | 62 | # Gets the raw ID of this terminal. This should be used with caution. 63 | def raw_id 64 | return @id 65 | end 66 | 67 | def ==(other : TerminalId) 68 | return @id == other.@id 69 | end 70 | 71 | def ==(other : LookaheadElement) 72 | return false 73 | end 74 | 75 | def hash(hasher) 76 | @id.hash(hasher) 77 | hasher 78 | end 79 | end 80 | 81 | # A nonterminal, as specified by the user. Nonterminals are on the left of production rules (though they can also 82 | # appear on the right). 83 | class NonterminalId 84 | # Creates a new NonterminalId with the given ID. 85 | def initialize(@id : Int64, @start = false) 86 | end 87 | 88 | # Gets the table index of this nonterminal. 89 | def table_index 90 | return @id + 1 91 | end 92 | 93 | # Gets the raw ID of this nonterminal. This should be used with caution. 94 | def raw_id 95 | return @id 96 | end 97 | 98 | # Checks if this nonterminal is a "start" nonterminal (i.e., a potentially top level node) 99 | def start? 100 | return @start 101 | end 102 | 103 | # Compares this nonterminal to another nonterminal. 104 | def ==(other : NonterminalId) 105 | return (@id == other.@id) && (@start == other.@start) 106 | end 107 | 108 | # Creates a hash of this NonterminalId. 109 | def hash(hasher) 110 | @id.hash(hasher) 111 | @start.hash(hasher) 112 | hasher 113 | end 114 | 115 | def to_s(io) 116 | io << "NonterminalId(" << @id << ")" 117 | end 118 | end 119 | end 120 | end 121 | -------------------------------------------------------------------------------- /src/pegasus/error.cr: -------------------------------------------------------------------------------- 1 | require "colorize" 2 | 3 | module Pegasus 4 | # This module contains all the error-related code. 5 | # This includes a custom exception class and context for it. 
6 | module Error 7 | # A context for the custom exception class. 8 | # The idea with context is that it can be attached to exceptions and 9 | # shown as extra information to the user. It's attached rather than 10 | # added via subclassing because some parts of Pegasus code need to be 11 | # able to modify the context, replacing it with more thorough / clear 12 | # info. Instead of straight up copying the exception and changing the field, 13 | # (as well as the way it's displayed to the user), client code can 14 | # remove one bit of context and replace it with a better one. 15 | abstract class ErrorContext 16 | abstract def to_s(io) 17 | end 18 | 19 | # An exception thrown by Pegasus. Unlike Crystal exceptions, which will 20 | # be reported directly to the user without any prettyfication, the Pegasus exception is created to 21 | # display the error information to the user in a clear and pretty way. This includes coloring and 22 | # emphasizing certain sections of the message, and generally presenting them in a user-friendly way. 23 | abstract class PegasusException < Exception 24 | getter context_data : Array(ErrorContext) 25 | 26 | def initialize(@description : String, @context_data = [] of ErrorContext, @internal = false) 27 | super() 28 | end 29 | 30 | # Prints the exception to the given IO. 31 | def print(io) 32 | io << "an error".colorize.red.bold 33 | io << " has occured while " 34 | io << get_location_name.colorize.bold 35 | io << ": " 36 | io << @description 37 | io.puts 38 | 39 | print_extra(io) 40 | 41 | if @internal 42 | io << "This error is " << "internal".colorize.bold << ": this means it is likely " << "not your fault".colorize.bold 43 | io.puts 44 | io.puts "Please report this error to the developer." 45 | end 46 | end 47 | 48 | # Prints the context that the exception has attached. 
49 | def print_extra(io) 50 | @context_data.each do |data| 51 | io << " - " << data 52 | io.puts 53 | end 54 | end 55 | 56 | # Get the "location" of the error, which is used to 57 | # report to the user when in the process the error occured. 58 | abstract def get_location_name 59 | end 60 | 61 | # An exception thrown at some point in the entire lifetime of Pegasus. 62 | # This is very vague, and should be used in cases where it cannot be known 63 | # what the surrounding code is doing at the time. 64 | class GeneralException < PegasusException 65 | def get_location_name 66 | "converting grammar to a parser description" 67 | end 68 | end 69 | 70 | # An exception used to signify that an error occured during grammar parsing. 71 | class GrammarException < PegasusException 72 | def get_location_name 73 | "parsing the grammar definition" 74 | end 75 | end 76 | 77 | # An exception used to signify that an error occured while creating 78 | # Nondeterministic Finite Automata. 79 | class NfaException < PegasusException 80 | def get_location_name 81 | "compiling regular expressions" 82 | end 83 | end 84 | 85 | # An exception used to signify that an error occured while creating 86 | # Deterministic Finite Automata. 87 | class DfaException < PegasusException 88 | def get_location_name 89 | "creating a deterministic finite automaton" 90 | end 91 | end 92 | 93 | # An exception used to signify that an error occured while creating 94 | # Push Down Automata. 95 | class PdaException < PegasusException 96 | def get_location_name 97 | "converting grammar rules into a state machine" 98 | end 99 | end 100 | 101 | # An exception used to signify that an error occured while creating 102 | # the lookup tables necessary for the Pegasus state machine. 103 | class TableException < PegasusException 104 | def get_location_name 105 | "creating lookup tables" 106 | end 107 | end 108 | end 109 | 110 | end 111 | 112 | # Define a raise function from a name and a Pegasus exception class. 
113 | macro define_raise(name, class_name) 114 | def raise_{{name}}(message, context_data = [] of Pegasus::Error::ErrorContext, internal = false) 115 | raise Pegasus::Error::{{class_name}}.new message, 116 | context_data: context_data.map(&.as(Pegasus::Error::ErrorContext)), 117 | internal: internal 118 | end 119 | end 120 | 121 | define_raise(general, GeneralException) 122 | define_raise(grammar, GrammarException) 123 | define_raise(nfa, NfaException) 124 | define_raise(dfa, DfaException) 125 | define_raise(pda, PdaException) 126 | define_raise(table, TableException) 127 | -------------------------------------------------------------------------------- /src/pegasus/grammar.cr: -------------------------------------------------------------------------------- 1 | require "./elements.cr" 2 | require "./items.cr" 3 | require "./pda.cr" 4 | 5 | module Pegasus 6 | # This module holds code related to push down automata, as well 7 | # as other helper code such as items (productions, basically), 8 | # dotted items (productions which know what part of the production 9 | # has already been parsed) and the like. 10 | module Pda 11 | # A Grammar associated with the language, contianing a list of terminals, 12 | # nonterminals, and the context-free production rules given by the `Item` class. 13 | class Grammar 14 | # The items that belong to this grammar. 15 | getter items : Array(Item) 16 | # The terminals that belong to this grammar. 17 | getter terminals : Array(Elements::TerminalId) 18 | # The nonterminals that belong to this grammar. 19 | getter nonterminals : Array(Elements::NonterminalId) 20 | 21 | # Initializes this grammar with the given terminals and nonterminals. 22 | def initialize(@terminals, @nonterminals) 23 | @items = Array(Item).new 24 | end 25 | 26 | # Checks if the given set contains the empty set. This is used for computing 27 | # FIRST and lookahead sets when generating an (LA)LR automaton. 
28 | private def contains_empty(set) 29 | return set.select(&.is_a?(Elements::EmptyElement)).size != 0 30 | end 31 | 32 | # Concatenates a set with another set, and returns whether the size of the set 33 | # has changed. This is useful for "closure algorithms" as described by 34 | # Dick Grune and others in Modern Compiler Design. These algorithms apply 35 | # a rule until the data no longer changes. 36 | private def concat_watching(set, other) 37 | initial_size = set.size 38 | set.concat other 39 | return initial_size != set.size 40 | end 41 | 42 | # Computes the FIRST set of an alternative. The first sets hash is used 43 | # for already computed first sets. The empty alternative is added elsewhere, 44 | # and only contains the SPECIAL_EMPTY terminal. 45 | private def compute_alternative_first(first_sets, alternative) 46 | if !first_sets.has_key? alternative 47 | first = Set(Elements::LookaheadElement).new 48 | first_sets[alternative] = first 49 | else 50 | first = first_sets[alternative] 51 | end 52 | 53 | if alternative.size == 0 54 | return false 55 | end 56 | 57 | start_element = alternative.first 58 | add_first = first_sets[start_element].dup 59 | if contains_empty(first) 60 | tail = alternative[1...alternative.size] 61 | compute_alternative_first(first_sets, tail) 62 | add_first.concat first_sets[tail] 63 | else 64 | add_first = add_first.reject &.is_a?(Elements::EmptyElement) 65 | end 66 | 67 | return concat_watching(first, add_first) 68 | end 69 | 70 | # Computes the first set of every alternative or alternative tail of the given 71 | # item body. 
72 | private def compute_alternatives_first(first_sets, body) 73 | change_occured = false 74 | body.size.times do |time| 75 | change_occured |= compute_alternative_first(first_sets, body[time...body.size]) 76 | end 77 | return change_occured 78 | end 79 | 80 | # Computes the first sets of all the terminals, nonterminals, alternatives, 81 | # and alternative tails by examining the items, terminals, and nonterminals given 82 | # in `#initialize` 83 | private def compute_first 84 | first_sets = Hash(Elements::NonterminalId | Elements::TerminalId | Array(Elements::NonterminalId | Elements::TerminalId), Set(Elements::LookaheadElement)).new 85 | @terminals.each { |t| first_sets[t] = Set(Elements::LookaheadElement) { t } } 86 | @nonterminals.each { |nt| first_sets[nt] = Set(Elements::LookaheadElement).new } 87 | first_sets[[] of Elements::NonterminalId | Elements::TerminalId] = Set(Elements::LookaheadElement) { Elements::EmptyElement.new } 88 | change_occured = true 89 | 90 | while change_occured 91 | change_occured = false 92 | @items.each do |item| 93 | change_occured |= compute_alternatives_first(first_sets, item.body) 94 | change_occured |= concat_watching(first_sets[item.head], first_sets[item.body]) 95 | end 96 | end 97 | 98 | return first_sets 99 | end 100 | 101 | # Gets a lookahead set for the given alternative, using its parent lookahead set. 102 | private def get_lookahead(first_sets, alternative, old_lookahead) 103 | lookahead = first_sets[alternative].dup 104 | if contains_empty(lookahead) 105 | lookahead.concat(old_lookahead) 106 | lookahead = lookahead.reject &.is_a?(Elements::EmptyElement) 107 | end 108 | return lookahead.to_set 109 | end 110 | 111 | # Creates new dotted items that are to be added because the "dot" is on the left on a nonterminal 112 | # in the parent dotted item. The suffix parameter describes all the tokens after the nonterminal, 113 | # which is used for looking up in the FIRST set. 
114 |       private def create_dotted_items(first_sets, nonterminal, suffix, parent_lookahead)
115 |         return @items.select(&.head.==(nonterminal))
116 |           .map { |it| LookaheadItem.new it, get_lookahead(first_sets, suffix, parent_lookahead) }
117 |       end
118 | 
119 |       # Creates new dotted items for every existing dotted item. This may be necessary if the "dot" moved
120 |       # and is now on the left hand of an Elements::NonterminalId, which warrants all the production rules for that nonterminal
121 |       # to be added to the current set (with their lookahead sets computed from scratch).
122 |       private def new_dots(first_sets, dots)
123 |         dots.map do |dot|
124 |           next Set(LookaheadItem).new if dot.index >= dot.item.body.size
125 |           next Set(LookaheadItem).new if dot.item.body[dot.index].is_a?(Elements::LookaheadElement)
126 |           next create_dotted_items(first_sets, dot.item.body[dot.index], dot.item.body[(dot.index+1)...dot.item.body.size], dot.lookahead)
127 |         end.reduce(Set(LookaheadItem).new) do |set, list|
128 |           set.concat list
129 |         end
130 |       end
131 | 
132 |       # Creates all dotted items from the given list of "initial" dotted items.
133 |       private def all_dots(first_sets, dots)
134 |         found_dots = dots.to_set.dup
135 |         while concat_watching(found_dots, new_dots(first_sets, found_dots))
136 |         end
137 |         groups = found_dots.group_by { |dot| { dot.item, dot.index } }
138 |         found_dots = groups.map do |k, v|
139 |           item, index = k
140 |           merged_lookahead = v.map(&.lookahead).reduce(Set(Elements::LookaheadElement).new) { |l, r| l.concat r }
141 |           LookaheadItem.new item, merged_lookahead, index
142 |         end
143 |         return found_dots.to_set
144 |       end
145 | 
146 |       # Gets a set of shifted items for each possible shift-transition
147 |       # from the current state.
148 | private def get_transitions(dotted_items) 149 | return dotted_items.compact_map do |dot| 150 | next nil unless dot.index < dot.item.body.size 151 | next { dot.item.body[dot.index], dot.next_item } 152 | end.reduce(Hash(Elements::NonterminalId | Elements::TerminalId, Set(LookaheadItem)).new) do |hash, kv| 153 | k, v = kv 154 | hash[k] = hash[k]?.try(&.<<(v)) || Set { v } 155 | next hash 156 | end 157 | end 158 | 159 | # Converts an LR(1) PDA to an LALR(1) PDA by merging states with the corresponding bodies, and 160 | # combining the lookahead sets of every matching item. 161 | def create_lalr_pda(lr_pda) 162 | lalr_pda = Pda.new @items 163 | groups = lr_pda.states.group_by { |s| s.data.map { |it| DottedItem.new it.item, it.index }.to_set } 164 | # Since 2+ sets become one, we need to adjust transitions. 165 | states = Hash(typeof(lr_pda.states.first), typeof(lalr_pda.states.first)).new 166 | groups.each do |_, equal_states| 167 | item_groups = equal_states 168 | .flat_map(&.data.each) 169 | .group_by { |it| DottedItem.new it.item, it.index } 170 | merged_items = item_groups.map do |kv| 171 | dotted_item, items = kv 172 | LookaheadItem.new dotted_item.item, items.flat_map(&.lookahead.each).to_set, dotted_item.index 173 | end.to_set 174 | new_state = lalr_pda.state_for data: merged_items 175 | equal_states.each do |state| 176 | states[state] = new_state 177 | end 178 | end 179 | 180 | # Reconnect the new states. 181 | lr_pda.states.each do |state| 182 | new_state = states[state] 183 | state.transitions.each do |e, other| 184 | new_state.transitions[e] = states[other] 185 | end 186 | end 187 | 188 | return lalr_pda 189 | end 190 | 191 | # Create an LR(1) PDA given a start symbol. 
192 | def create_lr_pda 193 | pda = Pda.new @items 194 | first_sets = compute_first 195 | # Set of items starting with the start nonterminal 196 | start_items = @items.select(&.head.start?).map do |it| 197 | LookaheadItem.new it, Set(Elements::LookaheadElement) { Elements::EofElement.new } 198 | end 199 | # Set of all current dotted items 200 | all_start_items = all_dots(first_sets, start_items) 201 | start_state = pda.state_for data: all_start_items 202 | 203 | queue = Set(PState).new 204 | finished = Set(PState).new 205 | 206 | queue << start_state 207 | 208 | while !queue.empty? 209 | state = queue.first 210 | queue.delete state 211 | next if finished.includes? state 212 | 213 | finished << state 214 | transitions = get_transitions(state.data) 215 | transitions.each do |transition, items| 216 | items = all_dots(first_sets, items) 217 | new_state = pda.state_for data: items 218 | state.transitions[transition] = new_state 219 | queue << new_state 220 | end 221 | end 222 | 223 | return pda 224 | end 225 | 226 | # Add an item to the Grammar. 227 | def add_item(i) 228 | items << i 229 | end 230 | end 231 | end 232 | end 233 | -------------------------------------------------------------------------------- /src/pegasus/items.cr: -------------------------------------------------------------------------------- 1 | require "./elements.cr" 2 | require "./error.cr" 3 | 4 | module Pegasus 5 | module Pda 6 | # An single production item, without a dot or any 7 | # kind of state. 8 | class Item 9 | # The nonterminal on the left of the production rule, 10 | # into which the right hand side is converted. 11 | getter head : Elements::NonterminalId 12 | # The body of terminals and nonterminals on the right 13 | # of the production rule. 14 | getter body : Array(Elements::NonterminalId | Elements::TerminalId) 15 | 16 | # Creates a new item with the given head and body. 17 | def initialize(@head, @body) 18 | end 19 | 20 | # Compares equality with the given other item. 
21 | def ==(other : Item) 22 | return (other.head == @head) && (other.body == @body) 23 | end 24 | 25 | # Hashes this item. 26 | def hash(hasher) 27 | @head.hash(hasher) 28 | @body.hash(hasher) 29 | hasher 30 | end 31 | 32 | def to_s(io) 33 | io << "Item(" << head << ", [" << body.map(&.to_s).join(", ") << "])" 34 | end 35 | end 36 | 37 | # An item with a "dot", which keeps track of how far the item is 38 | # in terms of being parsed. 39 | class DottedItem 40 | # The production rule this dotted item wraps. 41 | getter item : Item 42 | # The index in the body of the production rule. 43 | getter index : Int64 44 | 45 | # Creates a new dotted item. 46 | def initialize(@item, @index = 0_i64) 47 | end 48 | 49 | # Compares this item to another dotted item, including the index. 50 | def ==(other : DottedItem) 51 | return (other.item == @item) && (other.index == @index) 52 | end 53 | 54 | # Hashes this dotted item. 55 | def hash(hasher) 56 | @item.hash(hasher) 57 | @index.hash(hasher) 58 | hasher 59 | end 60 | 61 | def to_s(io) 62 | io << "DottedItem(" << item << ", " << index 63 | io << ", COMPLETED" if index == @item.body.size 64 | io << ")" 65 | end 66 | 67 | # Turns this item into the next item assuming a shift took place. 68 | def next_item! 69 | if @index < @item.body.size 70 | @index += 1 71 | else 72 | raise_pda "Reached past the end of the item!", internal: true 73 | end 74 | end 75 | 76 | # Creates a new item assuming a shift took place. 77 | def next_item 78 | new = dup 79 | new.next_item! 80 | return new 81 | end 82 | 83 | # Checks if this dotted item is done. 84 | def done? 85 | return @index == @item.body.size 86 | end 87 | end 88 | 89 | # A superclass of the `DottedItem` which also 90 | # keeps a lookahead set to further distinguish it 91 | # in LR(1) parser construction. 92 | class LookaheadItem < DottedItem 93 | # The lookahead set of this dotted item. 94 | getter lookahead : Set(Elements::LookaheadElement) 95 | 96 | # Creates a new lookahead dotted item. 
97 | def initialize(@item, @lookahead, @index = 0_i64) 98 | super(@item, @index) 99 | end 100 | 101 | # Compares this dotted item to another dotted item. 102 | def ==(other : LookaheadItem) 103 | return super(other) && (other.lookahead == @lookahead) 104 | end 105 | 106 | # Hashes this dotted item. 107 | def hash(hasher) 108 | super(hasher) 109 | @lookahead.hash(hasher) 110 | hasher 111 | end 112 | 113 | def to_s(io) 114 | io << "LookaheadItem(" << item << ", " << index << ", {" << lookahead.map(&.to_s).join(", ") << "}" 115 | io << ", COMPLETED" if index == @item.body.size 116 | io << ")" 117 | end 118 | end 119 | end 120 | end 121 | -------------------------------------------------------------------------------- /src/pegasus/json.cr: -------------------------------------------------------------------------------- 1 | require "json" 2 | 3 | module Pegasus 4 | class Elements::TerminalId 5 | include JSON::Serializable 6 | @[JSON::Field(key: "terminal_id")] 7 | @id : Int64 8 | end 9 | 10 | class Elements::NonterminalId 11 | include JSON::Serializable 12 | @[JSON::Field(key: "nonterminal_id")] 13 | @id : Int64 14 | @start : Bool 15 | end 16 | 17 | module Pda 18 | class Item 19 | include JSON::Serializable 20 | getter head : Elements::NonterminalId 21 | getter body : Array(Elements::TerminalId | Elements::NonterminalId) 22 | end 23 | end 24 | 25 | module Language 26 | class LanguageData 27 | include JSON::Serializable 28 | getter lex_skip_table : Array(Bool) 29 | getter lex_state_table : Array(Array(Int64)) 30 | getter lex_final_table : Array(Int64) 31 | getter parse_state_table : Array(Array(Int64)) 32 | getter parse_action_table : Array(Array(Int64)) 33 | getter parse_final_table : Array(Bool) 34 | 35 | getter terminals : Hash(String, Elements::TerminalId) 36 | getter nonterminals : Hash(String, Elements::NonterminalId) 37 | getter items : Array(Pda::Item) 38 | getter max_terminal : Int64 39 | end 40 | end 41 | end 42 | 
-------------------------------------------------------------------------------- /src/pegasus/language_def.cr: -------------------------------------------------------------------------------- 1 | require "./elements.cr" 2 | require "./items.cr" 3 | require "./grammar.cr" 4 | require "./nfa.cr" 5 | require "./regex.cr" 6 | require "./nfa_to_dfa.cr" 7 | require "./table.cr" 8 | require "./error.cr" 9 | require "./generated/grammar_parser.cr" 10 | 11 | module Pegasus 12 | # This module is for handling language data. The language is given by the complete 13 | # Pegasus grammar, and includes the terminals, nonterminals, and other rules. 14 | # This module also contains `LanguageData`, which is the JSON structure 15 | # that is passed between pegasus and its consumer programs, like pegasus-c. 16 | module Language 17 | # An error context which reports the items involved in some kind of conflict 18 | # (shift / reduce or reduce / reduce). This version, unlike `ConflictErrorContext`, 19 | # reports the relevant items' names. 20 | class NamedConflictErrorContext < Error::ErrorContext 21 | def initialize(@nonterminals : Array(String)) 22 | end 23 | 24 | def to_s(io) 25 | io << "The nonterminals involved are: " 26 | @nonterminals.join(io, ", ") 27 | end 28 | end 29 | 30 | # The complete data class, built to be all the information 31 | # needed to construct a parser generator. 32 | class LanguageData 33 | # Table for tokens that should be skipped. 34 | getter lex_skip_table : Array(Bool) 35 | # The state table for the lexer, which is used for transitions 36 | # of the `Nfa::Nfa` during tokenizing. 37 | getter lex_state_table : Array(Array(Int64)) 38 | # The table that maps a state ID to a token ID, used to 39 | # recognize that a match has occured. 40 | getter lex_final_table : Array(Int64) 41 | # Transition table for the LALR parser automaton, indexed 42 | # by terminal and nonterminal IDs. 
43 | getter parse_state_table : Array(Array(Int64)) 44 | # Action table indexed by the state and the lookahead item. 45 | # Used to determine what the parser should do in each state. 46 | getter parse_action_table : Array(Array(Int64)) 47 | # The table that maps a nonterminal ID to recognize 48 | # when parsing can stop. 49 | getter parse_final_table : Array(Bool) 50 | 51 | # The terminals, and their original names / regular expressions. 52 | getter terminals : Hash(String, Elements::TerminalId) 53 | # The nonterminals, and their original names. 54 | getter nonterminals : Hash(String, Elements::NonterminalId) 55 | # The items in the language. Used for reducing / building up 56 | # trees once a reduce action is performed. 57 | getter items : Array(Pda::Item) 58 | # The highest terminal ID, used for correctly accessing the 59 | # tables indexed by both terminal and nonterminal IDs. 60 | getter max_terminal : Int64 61 | 62 | # Creates a new language data object. 63 | def initialize(language_definition) 64 | @terminals, @nonterminals, grammar = 65 | generate_grammar(language_definition) 66 | @lex_skip_table, @lex_state_table, @lex_final_table, 67 | @parse_state_table, @parse_action_table, @parse_final_table = 68 | generate_tables(language_definition, @terminals, @nonterminals, grammar) 69 | @max_terminal = @terminals.values.max_of?(&.raw_id) || 0_i64 70 | @items = grammar.items 71 | end 72 | 73 | # Assigns an ID to each unique vaue in the iterable. 74 | private def assign_ids(values : Iterable(T), &block : Int64 -> R) forall T, R 75 | hash = {} of T => R 76 | last_id = 0_i64 77 | values.each do |value| 78 | next if hash[value]? 79 | hash[value] = yield (last_id += 1) - 1 80 | end 81 | return hash 82 | end 83 | 84 | # Creates a grammar, returning it and the hashes with identifiers for 85 | # the terminals and nonterminals. 
86 | private def generate_grammar(language_def) 87 | token_ids = assign_ids(language_def.tokens.keys) do |i| 88 | Elements::TerminalId.new i 89 | end 90 | rule_ids = assign_ids(language_def.rules.keys) do |i| 91 | Elements::NonterminalId.new i, start: i == 0 92 | end 93 | 94 | grammar = Pda::Grammar.new token_ids.values, rule_ids.values 95 | language_def.rules.each do |name, bodies| 96 | head = rule_ids[name] 97 | bodies.each &.alternatives.each do |body| 98 | body = body.elements.map(&.name).map do |element_name| 99 | element = token_ids[element_name]? || rule_ids[element_name]? 100 | raise_grammar "No terminal or rule named #{element_name}" unless element 101 | next element 102 | end 103 | item = Pda::Item.new head, body 104 | grammar.add_item item 105 | end 106 | end 107 | 108 | return { token_ids, rule_ids, grammar } 109 | end 110 | 111 | # Generates lookup tables using the given terminals, nonterminals, 112 | # and grammar. 113 | private def generate_tables(language_def, terminals, nonterminals, grammar) 114 | nfa = Nfa::Nfa.new 115 | terminals.each do |terminal, value| 116 | nfa.add_regex language_def.tokens[terminal].regex, value.raw_id 117 | end 118 | dfa = nfa.dfa 119 | 120 | begin 121 | lex_skip_table = [ false ] + 122 | language_def.tokens.map &.[1].options.includes?("skip") 123 | lex_state_table = dfa.state_table 124 | lex_final_table = dfa.final_table 125 | 126 | lr_pda = grammar.create_lr_pda 127 | lalr_pda = grammar.create_lalr_pda(lr_pda) 128 | parse_state_table = lalr_pda.state_table 129 | parse_action_table = lalr_pda.action_table 130 | parse_final_table = [false] + nonterminals.map &.[1].start? 
131 | rescue e : Error::PegasusException 132 | if old_context = e.context_data 133 | .find(&.is_a?(Dfa::ConflictErrorContext)) 134 | .as?(Dfa::ConflictErrorContext) 135 | 136 | names = old_context.item_ids.map do |id| 137 | head = grammar.items[id].head 138 | nonterminals.key_for head 139 | end 140 | e.context_data.delete old_context 141 | e.context_data << NamedConflictErrorContext.new names 142 | end 143 | raise e 144 | end 145 | 146 | return { lex_skip_table, lex_state_table, lex_final_table, parse_state_table, parse_action_table, parse_final_table } 147 | end 148 | end 149 | 150 | class ::Pegasus::Generated::Tree 151 | alias SelfDeque = Deque(Generated::Tree) 152 | 153 | # Recursive call for the `#flatten` function. 154 | protected def flatten_recursive(*, value_index : Int32, recursive_name : String, recursive_index : Int32) : SelfDeque 155 | if flattened = self.as?(Generated::NonterminalTree) 156 | recursive_child = flattened.children[recursive_index]? 157 | value_child = flattened.children[value_index]? 158 | 159 | if flattened.name == recursive_name && recursive_child 160 | add_to = recursive_child.flatten_recursive( 161 | value_index: value_index, 162 | recursive_name: recursive_name, 163 | recursive_index: recursive_index) 164 | else 165 | add_to = SelfDeque.new 166 | end 167 | add_to.insert(0, value_child) if value_child 168 | 169 | return add_to 170 | end 171 | return SelfDeque.new 172 | end 173 | 174 | # Since currently, * and + operators aren't supported in Pegasus grammars, they tend to be recursively written. 175 | # This is a utility function to "flatten" a parse tree produced by a recursively written grammar. 
176 | def flatten(*, value_index : Int32, recursive_name : String, recursive_index : Int32) 177 | flatten_recursive( 178 | value_index: value_index, 179 | recursive_name: recursive_name, 180 | recursive_index: recursive_index).to_a 181 | end 182 | end 183 | 184 | alias Option = String 185 | 186 | # Since Pegasus supports options on tokens and rules, 187 | # we need to represent an object to which options can be attached. 188 | # this is this type of object. 189 | abstract class OptionObject 190 | # Gets the actual list of options attached to this object. 191 | getter options : Array(Option) 192 | 193 | def initialize 194 | @options = [] of Option 195 | end 196 | end 197 | 198 | # A token declaration, with zero or more rules attached to it. 199 | class Token < OptionObject 200 | # Gets the regular expression that defines this token. 201 | getter regex : String 202 | 203 | def initialize(@regex, @options = [] of Option) 204 | end 205 | 206 | def ==(other : Token) 207 | return (other.regex == @regex) && (other.options == @options) 208 | end 209 | 210 | def hash(hasher) 211 | @regex.hash(hasher) 212 | @options.hash(hasher) 213 | hasher 214 | end 215 | end 216 | 217 | class ::Array(T) 218 | # Gets the indices of all values matching the condition 219 | def indices(&block) 220 | deque = Deque(Int32).new 221 | each_with_index do |v, i| 222 | deque << i if yield v 223 | end 224 | return deque.to_a 225 | end 226 | end 227 | 228 | module ::Iterable(T) 229 | def power_set 230 | set = Set(Set(T)).new 231 | set << Set(T).new 232 | 233 | each do |item| 234 | to_add = Set(Set(T)).new 235 | set.each do |subset| 236 | to_add << subset.dup.<<(item) 237 | end 238 | set.concat to_add 239 | end 240 | 241 | return set 242 | end 243 | end 244 | 245 | # An element of a grammar rule. Can be either a token or another rule. 246 | class RuleElement 247 | # The name of the element, as specified in the grammar. 
248 | getter name : String 249 | 250 | def initialize(@name) 251 | end 252 | 253 | def ==(other : RuleElement) 254 | return @name == other.name 255 | end 256 | 257 | # If called in a child class of RuleElement, 258 | # this strips the child class of its additional data, 259 | # turning it back into a RuleElement base class. 260 | def base_element 261 | return self 262 | end 263 | 264 | # Checks if this element derives lambda. 265 | # This doesm't check if the production rule it 266 | # represent can derive lambda; rather, it checks 267 | # if this element has an operator applied to it 268 | # that makes it do so, like ? or * 269 | def derives_lambda? 270 | return false 271 | end 272 | end 273 | 274 | # An element that is optional. 275 | class OptionalElement < RuleElement 276 | def base_element 277 | return RuleElement.new name 278 | end 279 | 280 | def derives_lambda? 281 | return true 282 | end 283 | end 284 | 285 | # An element that is repeated one or more times. 286 | class OneOrMoreElement < RuleElement 287 | end 288 | 289 | # An element that is repeated zero or more times. 290 | class ZeroOrMoreElement < RuleElement 291 | def derives_lambda? 292 | return true 293 | end 294 | end 295 | 296 | # One of the alternatives of a rule. 297 | class RuleAlternative 298 | # The elements of the rule. 299 | getter elements : Array(RuleElement) 300 | 301 | def initialize(@elements) 302 | raise_grammar "Empty productions are currently not supported" if elements.empty? 303 | end 304 | 305 | def ==(other : RuleAlternative) 306 | return @elements == other.elements 307 | end 308 | 309 | # Computes a single variant, given optional indices that should be included. 310 | private def compute_variant(indices) 311 | new_elements = [] of RuleElement 312 | elements.each_with_index do |element, index| 313 | next if element.derives_lambda? && !indices.includes? 
index 314 | new_elements << element.base_element 315 | end 316 | return RuleAlternative.new(new_elements) 317 | end 318 | 319 | # Checks if this specific alternative is the lambda alternative. 320 | def lambda? 321 | return @elements.empty? 322 | end 323 | 324 | # Determines if this rule alternative can be empty, or derive lambda. 325 | def derives_lambda? 326 | return derives_lambda? &.derives_lambda? 327 | end 328 | 329 | # Determines if the rule alternative can be empty, using 330 | # the block to check whether each element can be empty or not. 331 | def derives_lambda?(&block) 332 | return @elements.all? { |it| yield it } 333 | end 334 | 335 | # Computes the variants created by optionals. 336 | # For example, a? b? has four variants, a b, a, b, . 337 | def compute_optional_variants 338 | return compute_optional_variants &.derives_lambda? 339 | end 340 | 341 | # Same as compute_optional_variants, but what's optional is 342 | # now decided by the block. 343 | def compute_optional_variants(&block) 344 | optional_positions = @elements.indices { |it| yield it } 345 | power_set = optional_positions.power_set 346 | return power_set.map { |it| compute_variant(it) } 347 | end 348 | end 349 | 350 | # A single rule. This can have one or more alternatives, 351 | # but has the same options (zero or more) applied to them. 352 | class Rule < OptionObject 353 | getter alternatives : Array(RuleAlternative) 354 | 355 | def initialize(@alternatives, @options = [] of Option) 356 | end 357 | 358 | def ==(other : Rule) 359 | return (other.alternatives == @alternatives) && (other.options == @options) 360 | end 361 | 362 | def hash(hasher) 363 | @alternatives.hash(hasher) 364 | @options.hash(hasher) 365 | hasher 366 | end 367 | 368 | # Checks if this rule has any alternatives that can derive lambda. 369 | def derives_lambda? 370 | return @alternatives.any? &.derives_lambda? 
371 | end 372 | 373 | # Checks if this rule has any alternatives that can derive lambda, 374 | # using a custom block for checking if an element can derive lambda. 375 | def derives_lambda?(&block) 376 | return @alternatives.any? &.derives_lambda? { |it| yield it } 377 | end 378 | 379 | # Creates a new rule with the same options, but with alternatives expanded for optional values. 380 | def compute_optional_variants 381 | return Rule.new(@alternatives.flat_map &.compute_optional_variants, @options) 382 | end 383 | 384 | # Creates a new rule with the same options, but with alternatives expanded for optional values. 385 | # Uses a custom block to check if the elements can be empty or not. 386 | def compute_optional_variants(&block) 387 | return Rule.new(@alternatives.flat_map &.compute_optional_variants(block), @options) 388 | end 389 | end 390 | 391 | # A language definition parsed from a grammar string. 392 | class LanguageDefinition 393 | getter tokens : Hash(String, Token) 394 | getter rules : Hash(String, Array(Rule)) 395 | 396 | # Creates a new, empty language definition. 397 | def initialize 398 | @tokens = {} of String => Token 399 | @rules = {} of String => Array(Rule) 400 | end 401 | 402 | # Creates a new language definition from the given string. 403 | def initialize(s : String) 404 | @tokens = {} of String => Token 405 | @rules = {} of String => Array(Rule) 406 | from_string(s) 407 | end 408 | 409 | # Creates a new language definition from the given IO. 410 | def initialize(io : IO) 411 | @tokens = {} of String => Token 412 | @rules = {} of String => Array(Rule) 413 | from_io(io) 414 | end 415 | 416 | # Creates a list of options from a "statemend end" parse tree node. 
417 | private def extract_options(statement_end_tree) 418 | statement_end_tree = statement_end_tree.as(Generated::NonterminalTree) 419 | return [] of Option unless statement_end_tree.children.size > 1 420 | options_tree = statement_end_tree.children[0].as(Generated::NonterminalTree) 421 | options = options_tree.children[1] 422 | .flatten(value_index: 0, recursive_name: "option_list", recursive_index: 2) 423 | .map(&.as(Generated::NonterminalTree).children[0]) 424 | .map(&.as(Generated::TerminalTree).string) 425 | end 426 | 427 | # Extracts all the tokens from the token list parse tree node, storing them 428 | # in a member variable hash. 429 | private def extract_tokens(token_list_tree) 430 | token_list_tree.flatten(value_index: 0, recursive_name: "token_list", recursive_index: 1) 431 | .map { |it| ntt = it.as(Generated::NonterminalTree); { ntt.children[1], ntt.children[3], ntt.children[4] } } 432 | .map do |data| 433 | name_tree, regex_tree, statement_end = data 434 | name = name_tree 435 | .as(Generated::TerminalTree).string 436 | raise_grammar "Declaring a token (#{name}) a second time" if @tokens.has_key? name 437 | regex = regex_tree 438 | .as(Generated::TerminalTree).string[1..-2] 439 | @tokens[name] = Token.new regex, extract_options(statement_end) 440 | end 441 | end 442 | 443 | private def extract_rule_element(grammar_element_tree) 444 | grammar_element_tree = grammar_element_tree.as(Generated::NonterminalTree) 445 | name = grammar_element_tree.children[0].as(Generated::TerminalTree).string 446 | setting = grammar_element_tree.children[1]?.try { |it| it.as(Generated::TerminalTree).string } 447 | return case setting 448 | when "?" 449 | OptionalElement.new name 450 | else 451 | RuleElement.new name 452 | end 453 | end 454 | 455 | # Extracts all the body definitions from the grammar bodies tree node. 456 | # A rule has several bodies. 
457 | private def extract_bodies(bodies_tree) 458 | bodies_tree.flatten(value_index: 0, recursive_name: "grammar_bodies", recursive_index: 2) 459 | .map do |body| 460 | RuleAlternative.new body 461 | .flatten(value_index: 0, recursive_name: "grammar_body", recursive_index: 1) 462 | .map { |it| extract_rule_element(it) } 463 | end 464 | end 465 | 466 | # Extracts all the rules from a gramamr list tree node, storin them 467 | # in a member variable hash. 468 | private def extract_rules(grammar_list_tree) 469 | grammar_list_tree.flatten(value_index: 0, recursive_name: "grammar_list", recursive_index: 1) 470 | .map { |it| ntt = it.as(Generated::NonterminalTree); { ntt.children[1], ntt.children[3], ntt.children[4] } } 471 | .map do |data| 472 | name_tree, bodies_tree, statement_end = data 473 | name = name_tree 474 | .as(Generated::TerminalTree).string 475 | raise_grammar "Declaring a rule (#{name}) with the same name as a token" if @tokens.has_key? name 476 | bodies = extract_bodies(bodies_tree) 477 | 478 | unless old_rules = @rules[name]? 479 | @rules[name] = old_rules = Array(Rule).new 480 | end 481 | old_rules << Rule.new(bodies, extract_options(statement_end)).compute_optional_variants 482 | end 483 | end 484 | 485 | # Creates a language definition from a string. 486 | private def from_string(string) 487 | tree = ::Pegasus::Generated.process(string).as(::Pegasus::Generated::NonterminalTree) 488 | if tokens = tree.children.find &.as(::Pegasus::Generated::NonterminalTree).name.==("token_list") 489 | extract_tokens(tokens) 490 | end 491 | if rules = tree.children.find &.as(::Pegasus::Generated::NonterminalTree).name.==("grammar_list") 492 | extract_rules(rules) 493 | end 494 | rescue e : Error::PegasusException 495 | raise e 496 | rescue e : Exception 497 | raise_grammar e.message.not_nil! 498 | end 499 | 500 | # Creates a languge definition from IO. 
501 | private def from_io(io) 502 | string = io.gets_to_end 503 | from_string(string) 504 | end 505 | end 506 | end 507 | end 508 | -------------------------------------------------------------------------------- /src/pegasus/nfa.cr: -------------------------------------------------------------------------------- 1 | require "./automaton.cr" 2 | 3 | module Pegasus 4 | # This module is for nondeterministic finite automata. While NFAs 5 | # aren't very good for directly creating state machines 6 | # (you need to keep track of an exponential number of potential states), 7 | # they are easier to construct. This module contains functionality to convert 8 | # regular expressions to NFAs. 9 | module Nfa 10 | alias NState = Automata::State(Int64?, Transition) 11 | 12 | # A transition class used to represent the possible transitions 13 | # possible in the NFA. 14 | class Transition 15 | end 16 | 17 | # A transition that requires a single byte. 18 | class ByteTransition < Transition 19 | # The byte used for the transition. 20 | getter byte : UInt8 21 | 22 | # Creates a new byte transition. 23 | def initialize(@byte) 24 | end 25 | end 26 | 27 | # A transition that doesn't consume a token from the input. 28 | class LambdaTransition < Transition 29 | end 30 | 31 | # A transition that accepts any character. 32 | class AnyTransition < Transition 33 | end 34 | 35 | # A transition that accepts several ranges of bytes. 36 | class RangeTransition < Transition 37 | # The ranges this transition accepts / rejects. 38 | getter ranges : Array(Range(UInt8, UInt8)) 39 | # If this is true, characters must __not__ be in the ranges to 40 | # be accepted. 41 | getter inverted : Bool 42 | 43 | # Creates a new range transition. 44 | def initialize(@ranges, @inverted) 45 | end 46 | end 47 | 48 | # A nondeterministic finite automaton, to be created 49 | # from regular expressions. 50 | class Nfa < Automata::Automaton(Int64?, Transition) 51 | # Creates a new Nfa with a start state. 
52 | def initialize 53 | super 54 | @start = state_for(data: nil) 55 | end 56 | 57 | # Creates a new state for no value (aka, a set with nil as the value) 58 | def state 59 | state_for data: nil 60 | end 61 | end 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /src/pegasus/nfa_to_dfa.cr: -------------------------------------------------------------------------------- 1 | require "./nfa.cr" 2 | require "./dfa.cr" 3 | require "./error.cr" 4 | 5 | module Pegasus 6 | module Nfa 7 | class Transition 8 | # Returns the characters this transition accepts 9 | # for transitions. 10 | def char_states 11 | return [] of UInt8 12 | end 13 | end 14 | 15 | class ByteTransition 16 | def char_states 17 | return [ @byte ] 18 | end 19 | end 20 | 21 | class AnyTransition 22 | def char_states 23 | return (0_u8..255_u8).to_a 24 | end 25 | end 26 | 27 | class RangeTransition 28 | def char_states 29 | states = @ranges.map(&.to_a).flatten 30 | states = (0_u8..255_u8).to_a - states if @inverted 31 | return states 32 | end 33 | end 34 | 35 | class Nfa 36 | # Finds all the states connected to the given state 37 | # through lambda transitions, which will be in the same `Pegasus::Dfa::Dfa` state. 38 | private def find_lambda_states(s : NState) 39 | found = Set(NState).new 40 | queued = Set{s} 41 | while !queued.empty? 42 | state = queued.first 43 | queued.delete state 44 | next if found.includes? state 45 | 46 | found << state 47 | queued.concat state.transitions.select(&.is_a?(LambdaTransition)).map(&.[1]) 48 | end 49 | return found 50 | end 51 | 52 | # Finds the lambda states connected to any of the states of the given set. 53 | def find_lambda_states(s : Set(NState)) 54 | return s 55 | .map { |it| find_lambda_states(it) } 56 | .reduce(Set(NState).new) { |acc, r| acc.concat r } 57 | end 58 | 59 | # Merges the sets mapped to by the same key in the list of hashes. 
60 | private def merge_hashes(a : Array(Hash(K, Set(V)))) forall K, V 61 | a.reduce({} of K => Set(V)) { |l, r| l.merge(r) { |_, l1, r1| l1|r1 } } 62 | end 63 | 64 | # Creates a `Pegasus::Dfa::Dfa` for this Nfa. 65 | def dfa 66 | raise_dfa "NFA doesn't have start state" unless @start 67 | 68 | # DFA we're constructing 69 | new_dfa = Pegasus::Dfa::Dfa.new 70 | # The NFA->DFA algorithm creates a state for every reachable combination of NFA states. 71 | # So, this is a set of "reachable states", and is itself a state. 72 | new_start_set = find_lambda_states(@start.not_nil!) 73 | new_start = new_dfa.state_for data: new_start_set 74 | new_dfa.start = new_start 75 | 76 | # The queue of states to process. 77 | queue = Set { new_start } 78 | # Visited states. 79 | finished = Set(Pegasus::Dfa::DState).new 80 | 81 | while !queue.empty? 82 | state = queue.first 83 | queue.delete state 84 | next if finished.includes? state 85 | 86 | finished << state 87 | sub_hashes = state.data.map do |sub_state| 88 | transition_hashes = sub_state.transitions.map do |k, v| 89 | char_states = k.char_states 90 | set_array = Array.new(char_states.size) do 91 | Set { v } 92 | end 93 | Hash.zip(char_states, set_array) 94 | end 95 | merge_hashes(transition_hashes) 96 | end 97 | out_transitions = merge_hashes(sub_hashes) 98 | out_transitions.each do |char, ss| 99 | out_state_set = find_lambda_states(ss) 100 | out_state = new_dfa.state_for data: out_state_set 101 | state.transitions[char] = out_state 102 | queue << out_state 103 | end 104 | end 105 | 106 | return new_dfa 107 | end 108 | end 109 | end 110 | end 111 | -------------------------------------------------------------------------------- /src/pegasus/pda.cr: -------------------------------------------------------------------------------- 1 | require "./elements.cr" 2 | require "./automaton.cr" 3 | require "./items.cr" 4 | 5 | module Pegasus 6 | module Pda 7 | alias PState = Automata::State(Set(LookaheadItem), Elements::NonterminalId | 
Elements::TerminalId) 8 | 9 | # A class that represents the (LA)LR Push Down Automaton. 10 | class Pda < Automata::UniqueAutomaton(Set(LookaheadItem), Elements::NonterminalId | Elements::TerminalId) 11 | def initialize(@items : Array(Item)) 12 | super() 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /src/pegasus/regex.cr: -------------------------------------------------------------------------------- 1 | require "./nfa.cr" 2 | require "./error.cr" 3 | 4 | module Pegasus 5 | module Nfa 6 | # A "unit" of one or more connected states. 7 | class StateChain 8 | # The beginning of this chain. 9 | property start : NState 10 | # The end of this chain. 11 | property final : NState 12 | 13 | # Creates a new chain with the given initial and final states. 14 | def initialize(@start, final = nil) 15 | @final = final || @start 16 | end 17 | 18 | # Appends another chain to this one, modifying the states' transition 19 | # hashes, too. 20 | def append!(other : StateChain) 21 | @final.not_nil!.transitions[LambdaTransition.new] = other.start.not_nil! 22 | @final = other.final 23 | return self 24 | end 25 | 26 | # Appends nothing to this chain. This is a no-op. 27 | def append!(other : Nil) 28 | return self 29 | end 30 | end 31 | 32 | class Nfa 33 | ESCAPES = { 34 | '\'' => 0x27_u8, 35 | '"' => 0x22_u8, 36 | '?' => 0x3f_u8, 37 | '\\' => 0x5c_u8, 38 | 'a' => 0x07_u8, 39 | 'b' => 0x08_u8, 40 | 'f' => 0x0c_u8, 41 | 'n' => 0x0a_u8, 42 | 'r' => 0x0d_u8, 43 | 't' => 0x09_u8, 44 | 'v' => 0x0b_u8, 45 | '*' => 0x2a_u8, 46 | '+' => 0x2b_u8, 47 | '-' => 0x2d_u8, 48 | '|' => 0x7c_u8, 49 | '[' => 0x5b_u8, 50 | ']' => 0x5d_u8, 51 | '(' => 0x28_u8, 52 | ')' => 0x29_u8, 53 | '.' => 0x2e_u8, 54 | '/' => 0x2f_u8, 55 | } 56 | 57 | # Applies the "+" operator to the given `StateChain`. 
58 | private def nfa_plus(chain) 59 | new_final = state 60 | new_start = state 61 | new_final.transitions[LambdaTransition.new] = new_start 62 | chain.final.transitions[LambdaTransition.new] = new_final 63 | new_start.transitions[LambdaTransition.new] = chain.start 64 | 65 | chain.start = new_start 66 | chain.final = new_final 67 | end 68 | 69 | # Applies the "*" operator to the given `StateChain`. 70 | private def nfa_star(chain) 71 | new_final = state 72 | new_start = state 73 | new_final.transitions[LambdaTransition.new] = new_start 74 | new_start.transitions[LambdaTransition.new] = new_final 75 | chain.final.transitions[LambdaTransition.new] = new_final 76 | new_start.transitions[LambdaTransition.new] = chain.start 77 | 78 | chain.start = new_start 79 | chain.final = new_final 80 | end 81 | 82 | # Applies the "?" operator to the given `StateChain`. 83 | private def nfa_question(chain) 84 | new_final = state 85 | new_start = state 86 | new_start.transitions[LambdaTransition.new] = new_final 87 | chain.final.transitions[LambdaTransition.new] = new_final 88 | new_start.transitions[LambdaTransition.new] = chain.start 89 | 90 | chain.start = new_start 91 | chain.final = new_final 92 | end 93 | 94 | # Reas a character, taking into account the scape character. 95 | private def read_char(tokens) 96 | raise_nfa "Unexpected end of file" unless tokens.first? 97 | char = tokens.delete_at(0) 98 | if char == '\\' 99 | raise_nfa "Incomplete escape character" unless tokens.first? 100 | char = tokens.delete_at(0) 101 | escape = ESCAPES[char]? 102 | raise_nfa "Invalid escape code" unless escape 103 | return escape 104 | else 105 | raise_nfa "Non-ASCII characters not supported" unless char.ascii? 
106 | return char.bytes[0] 107 | end 108 | end 109 | 110 | # Creates an NFA chain using the range syntax ([...]) 111 | private def from_regex_range(tokens) 112 | tokens.delete_at(0) 113 | invert = false 114 | last_char = nil 115 | ranges = [] of Range(UInt8, UInt8) 116 | 117 | if tokens.first? == '^' 118 | invert = true 119 | tokens.delete_at(0) 120 | end 121 | 122 | while tokens.first? && tokens.first != ']' 123 | if tokens.first == '-' 124 | raise_nfa "Invalid range" unless last_char 125 | tokens.delete_at(0) 126 | ranges << (last_char..read_char(tokens)) 127 | last_char = nil 128 | else 129 | last_char.try { |it| ranges << (it..it) } 130 | last_char = read_char(tokens) 131 | end 132 | end 133 | last_char.try { |it| ranges << (it..it) } 134 | 135 | raise_nfa "Invalid range definition" if tokens.first? != ']' 136 | tokens.delete_at(0) 137 | 138 | start = state 139 | final = state 140 | start.transitions[RangeTransition.new(ranges, invert)] = final 141 | return StateChain.new(start, final) 142 | end 143 | 144 | # Parses a (sub)expression, optionally requiring parentheses. 145 | private def from_regex_expr(tokens, *, require_parenths = true) 146 | substring_stack = [] of StateChain 147 | current_chain = nil 148 | sub_chain = nil 149 | 150 | if require_parenths 151 | tokens.delete_at(0) 152 | end 153 | 154 | modifiers = { 155 | '+' => ->nfa_plus(StateChain), 156 | '*' => ->nfa_star(StateChain), 157 | '?' => ->nfa_question(StateChain) 158 | } 159 | 160 | while tokens.first? && tokens.first != ')' 161 | char = tokens.first 162 | 163 | if modifier = modifiers[char]? 164 | tokens.delete_at(0) 165 | raise_nfa "Invalid operator" unless sub_chain 166 | modifier.call(sub_chain) 167 | next 168 | end 169 | 170 | current_chain = current_chain.try(&.append!(sub_chain)) || sub_chain 171 | if char == '(' 172 | sub_chain = from_regex_expr(tokens) 173 | elsif char == '.' 
174 | tokens.delete_at(0) 175 | empty_state = state 176 | actual_state = state 177 | 178 | empty_state.transitions[AnyTransition.new] = actual_state 179 | sub_chain = StateChain.new(empty_state, actual_state) 180 | elsif char == '|' 181 | tokens.delete_at(0) 182 | substring_stack.push current_chain if current_chain 183 | current_chain = nil 184 | sub_chain = nil 185 | elsif char == '[' 186 | sub_chain = from_regex_range(tokens) 187 | else 188 | char = read_char(tokens) 189 | 190 | empty_state = state 191 | actual_state = state 192 | empty_state.transitions[ByteTransition.new char] = actual_state 193 | sub_chain = StateChain.new(empty_state, actual_state) 194 | end 195 | end 196 | current_chain = current_chain.try(&.append!(sub_chain)) || sub_chain 197 | 198 | if require_parenths && tokens.first? == ')' 199 | tokens.delete_at(0) 200 | elsif (require_parenths ^ (tokens.first? == ')')) 201 | raise_nfa "Mismatched parentheses" 202 | end 203 | 204 | if substring_stack.size > 0 205 | substring_stack.push current_chain if current_chain 206 | start_state = state 207 | end_state = state 208 | substring_stack.compact!.each do |chain| 209 | start_state.transitions[LambdaTransition.new] = chain.start 210 | chain.final.transitions[LambdaTransition.new] = end_state 211 | end 212 | current_chain = StateChain.new(start_state, end_state) 213 | end 214 | 215 | return current_chain 216 | end 217 | 218 | # Adds a regular expression branch to this Nfa. 
219 | def add_regex(str, id) 220 | tokens = str.chars 221 | chain = from_regex_expr(tokens, require_parenths: false) 222 | final_state = state_for data: id 223 | final_chain = StateChain.new(final_state, final_state) 224 | new_start = (chain.try(&.append!(final_chain)) || final_chain).start 225 | @start.not_nil!.transitions[LambdaTransition.new] = new_start 226 | end 227 | end 228 | end 229 | end 230 | -------------------------------------------------------------------------------- /src/pegasus/semantics.cr: -------------------------------------------------------------------------------- 1 | require "./generated/semantics_parser.cr" 2 | 3 | module Pegasus 4 | module Semantics 5 | alias NonterminalTree = Generated::Semantics::NonterminalTree 6 | alias TerminalTree = Generated::Semantics::TerminalTree 7 | 8 | class SemanticsData 9 | getter types : Hash(String, String) 10 | getter nonterminal_types : Hash(Elements::NonterminalId, String) 11 | getter actions : Hash(Int64, String) 12 | getter init : String 13 | 14 | def initialize(source, token_type : String, @data : Language::LanguageData) 15 | @types = {} of String => String 16 | @nonterminal_types = {} of Elements::NonterminalId => String 17 | @actions = {} of Int64 => String 18 | @init = "" 19 | 20 | @types["token"] = token_type 21 | 22 | begin 23 | raw_tree = Pegasus::Generated::Semantics.process(source).as(NonterminalTree) 24 | rescue e : Pegasus::Error::PegasusException 25 | raise e 26 | rescue e : Exception 27 | raise_general e.message.not_nil! 
28 | end 29 | 30 | register_types raw_tree.children[0] 31 | register_typerules raw_tree.children[1] 32 | register_init raw_tree.children[2] 33 | register_rules raw_tree.children[3] 34 | end 35 | 36 | private def register_types(tree) 37 | type_list = tree.as(NonterminalTree) 38 | loop do 39 | type_decl = type_list.children[0].as(NonterminalTree) 40 | identifier = type_decl.children[1].as(TerminalTree).string; 41 | code = type_decl.children[3].as(TerminalTree).string[2..-3]; 42 | raise_general "Redefining #{identifier}" if @types.includes? identifier 43 | @types[identifier] = code 44 | 45 | break if type_list.children.size == 1 46 | type_list = type_list.children[1].as(NonterminalTree) 47 | end 48 | end 49 | 50 | private def register_typerules(tree) 51 | typerules_list = tree.as(NonterminalTree) 52 | loop do 53 | typerule_decl = typerules_list.children[0].as(NonterminalTree) 54 | identifier = typerule_decl.children[1].as(TerminalTree).string 55 | nonterminals = read_identifier_list typerule_decl.children[4] 56 | 57 | nonterminals.each do |nonterminal_name| 58 | unless nonterminal = @data.nonterminals[nonterminal_name]? 59 | raise_general "unknown nonterminal #{nonterminal_name}" 60 | end 61 | 62 | if @nonterminal_types.includes? 
nonterminal 63 | raise_general "redefinition of type for #{nonterminal_name}" 64 | end 65 | 66 | @nonterminal_types[nonterminal] = identifier 67 | end 68 | 69 | break if typerules_list.children.size == 1 70 | typerules_list = typerules_list.children[1].as(NonterminalTree) 71 | end 72 | end 73 | 74 | private def read_identifier_list(tree) 75 | list = tree.as(NonterminalTree) 76 | identifiers = [] of String 77 | loop do 78 | identifiers << list.children[0].as(TerminalTree).string 79 | 80 | break if list.children.size == 1 81 | list = list.children[2].as(NonterminalTree) 82 | end 83 | return identifiers 84 | end 85 | 86 | private def register_init(tree) 87 | @init = tree.as(NonterminalTree).children[2].as(TerminalTree).string[2..-3]; 88 | end 89 | 90 | private def register_rules(tree) 91 | rules_list = tree.as(NonterminalTree) 92 | loop do 93 | rule = rules_list.children[0].as(NonterminalTree) 94 | identifier = rule.children[1].as(TerminalTree).string 95 | number = rule.children[3].as(TerminalTree).string.to_i64 96 | code = rule.children[6].as(TerminalTree).string[2..-3]; 97 | 98 | unless nonterminal = @data.nonterminals[identifier]? 99 | raise_general "unknown rule #{nonterminal}" 100 | end 101 | 102 | index = 0 103 | set = false 104 | @data.items.each_with_index do |item, i| 105 | next unless item.head == nonterminal 106 | if index == number 107 | raise_general "redefinition of rule #{identifier}(#{number})" if @actions.includes? 
i.to_i64 108 | @actions[i.to_i64] = code 109 | set = true 110 | break 111 | end 112 | index += 1 113 | end 114 | raise_general "no rule #{identifier}(#{number})" unless set 115 | 116 | break if rules_list.children.size == 1 117 | rules_list = rules_list.children[1].as(NonterminalTree) 118 | end 119 | end 120 | end 121 | end 122 | end 123 |
-------------------------------------------------------------------------------- /src/pegasus/table.cr: --------------------------------------------------------------------------------
require "./nfa.cr"
require "./pda.cr"
require "./error.cr"

module Pegasus
  module Dfa
    # Error context reported when two parse actions collide in a table cell.
    class ConflictErrorContext < Pegasus::Error::ErrorContext
      # The IDs of the items participating in the conflict.
      getter item_ids : Array(Int64)

      def initialize(@item_ids)
      end

      def to_s(io)
        io << "The IDs of the items involved are "
        @item_ids.join(io, ", ")
      end
    end

    class Dfa
      # Creates a final table, which is used to determine if a state matched a
      # token. Index 0 is the dead/error state; every other entry holds the
      # matched token ID (shifted by one) or 0 for a non-accepting state.
      def final_table
        accepting = @states.map do |state|
          state.data.compact_map(&.data).max_of?(&.+(1)) || 0_i64
        end
        return [0_i64] + accepting
      end

      # Creates the DFA transition table (one 256-entry row per state, plus a
      # leading all-zero row for the error state); see
      # `Pegasus::Language::LanguageData`.
      def state_table
        rows = [Array.new(256, 0_i64)]
        @states.each do |state|
          row = Array.new(256, 0_i64)
          # State IDs are shifted by one so that 0 can mean "no transition".
          state.transitions.each { |byte, destination| row[byte] = destination.id + 1 }
          rows << row
        end
        return rows
      end
    end
  end

  module Pda
    class LookaheadItem
      # Records a shift action (encoded as 0) for this item's next element,
      # raising if a reduce action already occupies the cell.
      def insert_shift?(action_table, state)
        return if done? # a completed item cannot shift
        next_element = item.body[index]
        return if !next_element.is_a?(Elements::IndexableElement)

        existing = action_table[state.id + 1][next_element.table_index]
        # Any positive value is a reduce action already stored in the cell.
        if existing > 0
          raise_table "Shift / reduce conflict", context_data: [
            Pegasus::Dfa::ConflictErrorContext.new([ existing ])
          ]
        end
        action_table[state.id + 1][next_element.table_index] = 0
      end

      # Records a reduce action (encoded as item index + 1) for every terminal
      # in this item's lookahead set, raising on any conflict.
      def insert_reduce?(action_table, state, self_index)
        return if !done? # only completed items may reduce

        @lookahead.each do |terminal|
          next unless terminal.is_a?(Elements::IndexableElement)
          existing = action_table[state.id + 1][terminal.table_index]
          # 0 means a shift action is already stored in this cell.
          if existing == 0
            raise_table "Shift / reduce conflict", context_data: [
              Pegasus::Dfa::ConflictErrorContext.new([ self_index.to_i64 ])
            ]
          end
          # A positive value means another reduce action is stored here.
          if existing > 0
            raise_table "Reduce / reduce conflict", context_data: [
              Pegasus::Dfa::ConflictErrorContext.new([ existing - 1, self_index.to_i64 ])
            ]
          end
          action_table[state.id + 1][terminal.table_index] = self_index.to_i64 + 1
        end
      end
    end

    class Pda
      # Creates the action table, determining what the parser should do at a
      # given state and lookahead token: -1 = error, 0 = shift, n + 1 = reduce
      # by item n.
      def action_table
        last_terminal_index = @items.max_of? do |item|
          # NOTE(review): the inner default is 1_i64 here, but the analogous
          # expressions in state_table below use 0_i64 -- confirm the
          # asymmetry is intentional.
          item.body.select(&.is_a?(Elements::IndexableElement)).max_of?(&.table_index) || 1_i64
        end || 0_i64

        # One extra column because the EOF token has its own spot, too.
        actions = Array.new(@states.size + 1) { Array.new(last_terminal_index + 1, -1_i64) }
        @states.each do |state|
          state.data.each do |lookahead_item|
            lookahead_item.insert_shift?(actions, state)
            lookahead_item.insert_reduce?(actions, state, @items.index(lookahead_item.item).not_nil!)
          end
        end

        return actions
      end

      # Creates the transition (goto) table, indexed first by terminal columns
      # and then, offset by the terminal count, by nonterminal columns.
      def state_table
        last_terminal_index = @items.max_of? do |item|
          item.body.select(&.is_a?(Elements::TerminalId)).max_of?(&.table_index) || 0_i64
        end || 0_i64

        last_nonterminal_index = @items.max_of? do |item|
          Math.max(item.head.table_index, item.body.select(&.is_a?(Elements::NonterminalId)).max_of?(&.table_index) || 0_i64)
        end || 0_i64

        # One extra column because the EOF token has its own spot, too.
        table = Array.new(@states.size + 1) { Array.new(last_terminal_index + last_nonterminal_index + 1, 0_i64) }
        @states.each do |state|
          state.transitions.each do |token, destination|
            # Terminals are checked first, mirroring the original branch order.
            if token.is_a?(Elements::IndexableElement)
              table[state.id + 1][token.table_index] = destination.id + 1
            elsif token.is_a?(Elements::NonterminalId)
              table[state.id + 1][token.table_index + last_terminal_index] = destination.id + 1
            end
          end
        end

        return table
      end
    end
  end
end
-------------------------------------------------------------------------------- /src/tools/dot/pegasus_dot.cr: --------------------------------------------------------------------------------
require "../../pegasus/language_def.cr"
require "../../pegasus/json.cr"
require "option_parser"

module Pegasus::Dot
  extend self

  # Outputs the DFA lexing state machine from the LanguageData.
  def output_dfa(data, io)
    io << "digraph G {\n"
    data.lex_state_table.each_with_index do |row, from|
      next if from == 0 # row 0 is the dead state; it has no useful edges

      row.each_with_index do |to, byte|
        next if to == 0
        io << " q#{from} -> q#{to} [label=#{byte.chr.to_s.dump}]\n"
      end
    end
    io << "}"
  end

  # Outputs the PDA parsing state machine from the LanguageData.
26 | def output_pda(data, io) 27 | io << "digraph G {\n" 28 | data.parse_state_table.each_with_index do |state, i| 29 | next if i == 0 30 | state_name = "q#{i}" 31 | 32 | state.each_with_index do |j, cause| 33 | other_state_name = "q#{j}" 34 | if j != 0 35 | if cause == 0 36 | transition_label = "(EOF)" 37 | elsif cause - 1 <= data.max_terminal 38 | transition_label = data.terminals.find { |k, v| v.raw_id == cause - 1 }.not_nil![0].dump 39 | else 40 | transition_label = data.nonterminals.find { |k, v| v.raw_id == cause - 1 - (data.max_terminal + 1) }.not_nil![0].dump 41 | end 42 | io << " #{state_name} -> #{other_state_name} [label=#{transition_label}]\n" 43 | end 44 | end 45 | end 46 | io << "}" 47 | end 48 | 49 | # Output target specified on command line. 50 | enum OutputTarget 51 | # Print DOT for DFA 52 | Dfa 53 | # Print DOT for PDA 54 | Pda 55 | end 56 | end 57 | 58 | # Configuration options 59 | output_target = Pegasus::Dot::OutputTarget::Pda 60 | 61 | # Parse configuration from command line 62 | OptionParser.parse do |parser| 63 | parser.banner = "Usage: pegasus-dot [arguments]" 64 | parser.on("-o FORMAT", "--output FORMAT", 65 | "Specifies the output format of the DOT converter. Either \"Dfa\" or \"Pda\"") do |format| 66 | output_target = Pegasus::Dot::OutputTarget.parse? format 67 | if output_target == nil 68 | STDERR.puts "ERROR: #{format} is not a valid format option." 69 | STDERR.puts parser 70 | exit(1) 71 | end 72 | end 73 | parser.on("-h", "--help", "Show this help") { puts parser } 74 | parser.invalid_option do |flag| 75 | STDERR.puts "ERROR: #{flag} is not a valid option." 76 | STDERR.puts parser 77 | exit(1) 78 | end 79 | end 80 | 81 | # Reaad, parse, and output LanguageData. 
data = Pegasus::Language::LanguageData.from_json STDIN
case output_target
when Pegasus::Dot::OutputTarget::Dfa
  Pegasus::Dot.output_dfa(data, STDOUT)
when Pegasus::Dot::OutputTarget::Pda
  Pegasus::Dot.output_pda(data, STDOUT)
end
-------------------------------------------------------------------------------- /src/tools/sim/pegasus_sim.cr: --------------------------------------------------------------------------------
require "../../pegasus/language_def.cr"
require "../../pegasus/json.cr"
require "option_parser"

module Pegasus::Sim
  # A single lexed token: the terminal's ID plus the matched text.
  class Token
    getter id : Int64
    getter string : String

    def initialize(@id, @string)
    end

    def to_s(io)
      io << "Token(#{id}, #{string})"
    end
  end

  # Base class for nodes of the parse tree the simulator assembles.
  abstract class Tree
    # The column this node occupies in the parse transition table.
    abstract def table_index : Int64

    def display(io, offset)
    end
  end

  # A leaf node wrapping a single lexed token.
  class TokenTree < Tree
    def initialize(@token : Token)
    end

    def table_index : Int64
      @token.id
    end

    def display(io, offset)
      io << " " * offset
      io << @token
      io.puts
    end
  end

  # An interior node produced by reducing a grammar rule.
  class ParentTree < Tree
    getter children : Array(Tree)

    def initialize(@nonterminal_id : Int64, @max_terminal : Int64, @children = [] of Tree, @name : String? = nil)
    end

    def table_index : Int64
      # Nonterminal columns start after the terminal columns and the EOF column.
      @nonterminal_id + @max_terminal + 2
    end

    def display(io, offset)
      io << " " * offset
      io << "ParentTree(" << (@name || @nonterminal_id) << ")"
      io.puts
      @children.each { |child| child.display(io, offset + 1) }
    end
  end
end

input_json_option = nil

OptionParser.parse do |parser|
  parser.banner = "Usage: pegasus-sim [arguments]"
  # NOTE(review): the long form advertises FORMAT but the value is a file
  # path; the metavariable was probably meant to be FILE.
  parser.on("-i FILE", "--input FORMAT", "Specifies input JSON file") do |file|
    input_json_option = file
  end
  parser.on("-h", "--help", "Show this help") { puts parser }
  parser.invalid_option do |flag|
    STDERR.puts "ERROR: #{flag} is not a valid option."
    STDERR.puts parser
    exit(1)
  end
end

raise "Input file not specified" unless input_json_option
input_json = input_json_option.not_nil!

raise "Unable to open specified file" unless File.file? input_json
input = File.read input_json

data = Pegasus::Language::LanguageData.from_json input
to_parse = STDIN.gets_to_end.chomp

# Lexing code

tokens = [] of Pegasus::Sim::Token
# Index at the string
index = 0_i64
# The last "final" match.
last_final = -1_i64
# The location of the last "final" match.
last_final_index = -1_i64
# The beginning of the last token.
# The start position (inclusive) of the token currently being matched.
last_start = 0_i64
# The current DFA state; state 1 is the start state, state 0 is the dead state.
state = 1_i64

# Maximal-munch lexing: run the DFA as far as it will go, emit the longest
# match seen, then resume scanning right after that match.
while index < to_parse.size
  last_final = -1_i64
  last_final_index = -1_i64
  last_start = index
  state = 1_i64

  while (index < to_parse.size) && (state != 0_i64)
    state = data.lex_state_table[state][to_parse[index].bytes[0]]
    if (final = data.lex_final_table[state]) != 0
      last_final = final
      last_final_index = index
    end
    index += 1 if state != 0
  end

  # No accepting state was ever reached: leave index at the offending
  # character so the error below reports its position.
  break if last_final == -1
  # Bug fix: the DFA may have consumed characters past the last accepting
  # state before dying. Rewind so lexing resumes immediately after the
  # accepted match instead of silently dropping those characters. (When the
  # DFA dies on the very next character, this assignment is a no-op.)
  index = last_final_index + 1
  next if data.lex_skip_table[last_final]
  tokens << Pegasus::Sim::Token.new last_final, to_parse[last_start..last_final_index]
end

raise "Invalid token at position #{index}" unless index == to_parse.size

# Parsing code

# Technically this is one stack. However, it's easier to keep track
# of the two types of variables on the stack separately.

# The stack of trees being assembled from the bottom up.
tree_stack = [] of Pegasus::Sim::Tree
# The stack of the states to be followed by the automaton.
state_stack = [ 1_i64 ]
# The index in the tokens
index = 0_i64
# Table column of the start symbol's tree; seeing it on top means acceptance.
final_id = data.max_terminal + 1 + 1

loop do
  # Accept once the start symbol's tree sits on top of the stack.
  break if (top = tree_stack.last?) && top.table_index == final_id
  # Action lookup: column 0 is EOF, otherwise the lookahead token's ID.
  action = data.parse_action_table[state_stack.last][(tokens[index]?.try &.id) || 0_i64]

  raise "Invalid token at position #{index}" if action == -1_i64
  if action == 0
    # Shift: push the lookahead token as a leaf.
    raise "Unexpected end of file" unless index < tokens.size
    tree_stack << Pegasus::Sim::TokenTree.new tokens[index]
    index += 1
  else
    # Reduce by item (action - 1): pop one subtree and one state per body
    # element, then push the assembled parent tree.
    item = data.items[action - 1]
    new_children = [] of Pegasus::Sim::Tree

    item.body.size.times do
      new_children.insert 0, tree_stack.pop
      state_stack.pop
    end
    tree_stack << Pegasus::Sim::ParentTree.new item.head.raw_id, data.max_terminal,
      new_children,
      data.nonterminals.find { |k, v| v.raw_id == item.head.raw_id }.not_nil![0]
  end
  # Follow the shift/goto transition for whatever now tops the tree stack.
  state_stack << data.parse_state_table[state_stack.last][tree_stack.last.table_index]
end
raise "Unexpected token at position #{index}" if index != tokens.size
tree_stack.last.display(STDOUT, 0)
--------------------------------------------------------------------------------