├── .gitignore
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── bin
│   ├── lempy
│   └── lempy_build
├── example_tree.dot
├── example_tree.png
├── lemon
│   ├── lemon.html
│   └── lemon.md
├── pyproject.toml
├── setup.cfg
├── src
│   └── lemon_py
│       ├── BuildGrammar.py
│       ├── BuildLexer.py
│       ├── Driver.py
│       ├── ParseNode.hpp
│       ├── ParserImpl.cpp
│       ├── __init__.py
│       ├── header.lemon
│       ├── lemon.c
│       ├── lempar.c
│       └── utf.hpp
└── test_grammars
    ├── expr
    │   ├── example.expr
    │   └── expressions.lemon
    ├── parasol
    │   ├── .gitignore
    │   ├── parasol.lemon
    │   ├── phong.prsl
    │   └── test_api.py
    └── utf8_expr
        ├── example.expr
        └── expr_utf8.lemon

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

.vscode
src/lemon_py/elsewhere
src/lemon_py/lemon
*__pycache__*

*.so
*concat_grammar.*
*egg-info*
build
dist

test_grammars/ParseNode.hpp
*_parser.cpp
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Aubrey R Jones

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include src/lemon_py/header.lemon
include src/lemon_py/lemon.c
include src/lemon_py/lempar.c
include src/lemon_py/*.hpp
include src/lemon_py/*.cpp

--------------------------------------------------------------------------------
/bin/lempy:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

python3 -m lemon_py.Driver "$@"
--------------------------------------------------------------------------------
/bin/lempy_build:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

python3 -m lemon_py.BuildGrammar "$@"
--------------------------------------------------------------------------------
/example_tree.dot:
--------------------------------------------------------------------------------
digraph "AST" {
    node [shape=record, style=filled];

    node [shape=record, label="{line:1 | + }"] 0;
    node [shape=record, label="{line:1 | / }"] 1;
    0 -> 1;
    node [shape=record, label="{line:1 | + }"] 2;
    1 -> 2;
    node [shape=record, label="{line:1 | { INT_LIT | 5}}"] 3;
    2 -> 3;
    node [shape=record, label="{line:1 | { FLOAT_LIT | 7.2}}"] 4;
    2 -> 4;
    node [shape=record, label="{line:1 | fncall }"] 5;
    1 -> 5;
    node [shape=record, label="{line:1 | { FNCALL | log}}"] 6;
    5 -> 6;
    node [shape=record, label="{line:1 | arglist }"] 7;
    5 -> 7;
    node [shape=record, label="{line:1 | neg }"] 8;
    7 -> 8;
    node [shape=record, label="{line:1 | { INT_LIT | 24}}"] 9;
    8 -> 9;
    node [shape=record, label="{line:1 | { STRING | nonsense}}"] 10;
    0 -> 10;

}
--------------------------------------------------------------------------------
/example_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aubreyrjones/lemon-py/ff105e369d124afff57b97e06ad9f23017093ecb/example_tree.png
--------------------------------------------------------------------------------
/lemon/lemon.md:
--------------------------------------------------------------------------------
The Lemon Parser Generator
==========================

Lemon is an LALR(1) parser generator for C. It does the same job as "bison" and "yacc". But Lemon is not a bison or yacc clone. Lemon uses a different grammar syntax which is designed to reduce the number of coding errors. Lemon also uses a parsing engine that is faster than yacc and bison and which is both reentrant and threadsafe. (Update: Since the previous sentence was written, bison has also been updated so that it too can generate a reentrant and threadsafe parser.) Lemon also implements features that can be used to eliminate resource leaks, making it suitable for use in long-running programs such as graphical user interfaces or embedded controllers.

This document is an introduction to the Lemon parser generator.

1.0 Table of Contents
---------------------

* [Introduction](#main)
* [1.0 Table of Contents](#toc)
* [2.0 Security Note](#secnot)
* [3.0 Theory of Operation](#optheory)
    * [3.1 Command Line Options](#options)
    * [3.2 The Parser Interface](#interface)
        * [3.2.1 Allocating The Parse Object On Stack](#onstack)
        * [3.2.2 Interface Summary](#ifsum)
    * [3.3 Differences With YACC and BISON](#yaccdiff)
    * [3.4 Building The "lemon" Or "lemon.exe" Executable](#build)
* [4.0 Input File Syntax](#syntax)
    * [4.1 Terminals and Nonterminals](#tnt)
    * [4.2 Grammar Rules](#rules)
    * [4.3 Precedence Rules](#precrules)
    * [4.4 Special Directives](#special)
* [5.0 Error Processing](#errors)
* [6.0 History of Lemon](#history)
* [7.0 Copyright](#copyright)

2.0 Security Note
-----------------

The language parser code created by Lemon is very robust and is well-suited for use in internet-facing applications that need to safely process maliciously crafted inputs.

The "lemon.exe" command-line tool itself works great when given a valid input grammar file and almost always gives helpful error messages for malformed inputs. However, it is possible for a malicious user to craft a grammar file that will cause lemon.exe to crash. We do not see this as a problem, as lemon.exe is not intended to be used with hostile inputs. To summarize:

* Parser code generated by lemon → Robust and secure
* The "lemon.exe" command line tool itself → Not so much

3.0 Theory of Operation
-----------------------

Lemon is a computer program that translates a context free grammar (CFG) for a particular language into C code that implements a parser for that language. The Lemon program has two inputs:

* The grammar specification.
* A parser template file.

Typically, only the grammar specification is supplied by the programmer. Lemon comes with a default parser template ("[lempar.c](https://sqlite.org/src/file/tool/lempar.c)") that works fine for most applications. But the user is free to substitute a different parser template if desired.

Depending on command-line options, Lemon will generate up to three output files.

* C code to implement a parser for the input grammar.
* A header file defining an integer ID for each terminal symbol (or "token").
* An information file that describes the states of the generated parser automaton.

By default, all three of these output files are generated. The header file is suppressed if the "-m" command-line option is used and the report file is omitted when "-q" is selected.

The grammar specification file uses a ".y" suffix, by convention. In the examples used in this document, we'll assume the name of the grammar file is "gram.y". A typical use of Lemon would be the following command:

    lemon gram.y

This command will generate three output files named "gram.c", "gram.h" and "gram.out". The first is C code to implement the parser. The second is the header file that defines numerical values for all terminal symbols, and the last is the report that explains the states used by the parser automaton.

### 3.1 Command Line Options

The behavior of Lemon can be modified using command-line options. You can obtain a list of the available command-line options together with a brief explanation of what each does by typing

    lemon "-?"
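
For instance, a typical invocation that places its outputs in a separate build directory and skips the report file might look like the following (this example is not from the original document; the "-d" and "-q" options are summarized in the list below, and the directory name is only illustrative):

    lemon -q -dbuild gram.y

This writes "gram.c" and "gram.h" into the "build" directory and suppresses generation of "gram.out".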

As of this writing, the following command-line options are supported:

* **\-b** Show only the basis for each parser state in the report file.
* **\-c** Do not compress the generated action tables. The parser will be a little larger and slower, but it will detect syntax errors sooner.
* **\-d**_directory_ Write all output files into _directory_. Normally, output files are written into the directory that contains the input grammar file.
* **\-D_name_** Define C preprocessor macro _name_. This macro is usable by "[%ifdef](#pifdef)", "[%ifndef](#pifdef)", and "[%if](#pifdef)" lines in the grammar file.
* **\-E** Run the "%if" preprocessor step only and print the revised grammar file.
* **\-g** Do not generate a parser. Instead write the input grammar to standard output with all comments, actions, and other extraneous text removed.
* **\-l** Omit "#line" directives in the generated parser C code.
* **\-m** Cause the output C source code to be compatible with the "makeheaders" program.
* **\-p** Display all conflicts that are resolved by [precedence rules](#precrules).
* **\-q** Suppress generation of the report file.
* **\-r** Do not sort or renumber the parser states as part of optimization.
* **\-s** Show parser statistics before exiting.
* **\-T_file_** Use _file_ as the template for the generated C-code parser implementation.
* **\-x** Print the Lemon version number.

### 3.2 The Parser Interface

Lemon doesn't generate a complete, working program. It only generates a few subroutines that implement a parser. This section describes the interface to those subroutines. It is up to the programmer to call these subroutines in an appropriate way in order to produce a complete system.

Before a program begins using a Lemon-generated parser, the program must first create the parser. A new parser is created as follows:

    void *pParser = ParseAlloc( malloc );

The ParseAlloc() routine allocates and initializes a new parser and returns a pointer to it. The actual data structure used to represent a parser is opaque — its internal structure is not visible or usable by the calling routine. For this reason, the ParseAlloc() routine returns a pointer to void rather than a pointer to some particular structure. The sole argument to the ParseAlloc() routine is a pointer to the subroutine used to allocate memory. Typically this means malloc().

After a program is finished using a parser, it can reclaim all memory allocated by that parser by calling

    ParseFree(pParser, free);

The first argument is the same pointer returned by ParseAlloc(). The second argument is a pointer to the function used to release bulk memory back to the system.

After a parser has been allocated using ParseAlloc(), the programmer must supply the parser with a sequence of tokens (terminal symbols) to be parsed. This is accomplished by calling the following function once for each token:

    Parse(pParser, hTokenID, sTokenData, pArg);

The first argument to the Parse() routine is the pointer returned by ParseAlloc(). The second argument is a small positive integer that tells the parser the type of the next token in the data stream. There is one token type for each terminal symbol in the grammar. The gram.h file generated by Lemon contains #define statements that map symbolic terminal symbol names into appropriate integer values.
A value of 0 for the second argument is a special flag to the parser to indicate that the end of input has been reached. The third argument is the value of the given token. By default, the type of the third argument is "void\*", but the grammar will usually redefine this type to be some kind of structure. Typically the second argument will be a broad category of tokens such as "identifier" or "number" and the third argument will be the name of the identifier or the value of the number.

The Parse() function may have either three or four arguments, depending on the grammar. If the grammar specification file requests it (via the [%extra\_argument](#extraarg) directive), the Parse() function will have a fourth parameter that can be of any type chosen by the programmer. The parser doesn't do anything with this argument except to pass it through to action routines. This is a convenient mechanism for passing state information down to the action routines without having to use global variables.

A typical use of a Lemon parser might look something like the following:

     1 ParseTree *ParseFile(const char *zFilename){
     2   Tokenizer *pTokenizer;
     3   void *pParser;
     4   Token sToken;
     5   int hTokenId;
     6   ParserState sState;
     7
     8   pTokenizer = TokenizerCreate(zFilename);
     9   pParser = ParseAlloc( malloc );
    10   InitParserState(&sState);
    11   while( GetNextToken(pTokenizer, &hTokenId, &sToken) ){
    12     Parse(pParser, hTokenId, sToken, &sState);
    13   }
    14   Parse(pParser, 0, sToken, &sState);
    15   ParseFree(pParser, free );
    16   TokenizerFree(pTokenizer);
    17   return sState.treeRoot;
    18 }

This example shows a user-written routine that parses a file of text and returns a pointer to the parse tree. (All error-handling code is omitted from this example to keep it simple.) We assume the existence of some kind of tokenizer which is created using TokenizerCreate() on line 8 and deleted by TokenizerFree() on line 16. The GetNextToken() function on line 11 retrieves the next token from the input file and puts its type in the integer variable hTokenId. The sToken variable is assumed to be some kind of structure that contains details about each token, such as its complete text, what line it occurs on, etc.

This example also assumes the existence of a structure of type ParserState that holds state information about a particular parse. An instance of such a structure is created on line 6 and initialized on line 10. A pointer to this structure is passed into the Parse() routine as the optional 4th argument. The action routine specified by the grammar for the parser can use the ParserState structure to hold whatever information is useful and appropriate. In the example, we note that the treeRoot field of the ParserState structure is left pointing to the root of the parse tree.

The core of this example as it relates to Lemon is as follows:

    ParseFile(){
      pParser = ParseAlloc( malloc );
      while( GetNextToken(pTokenizer,&hTokenId, &sToken) ){
        Parse(pParser, hTokenId, sToken);
      }
      Parse(pParser, 0, sToken);
      ParseFree(pParser, free );
    }

Basically, what a program has to do to use a Lemon-generated parser is first create the parser, then send it lots of tokens obtained by tokenizing an input source. When the end of input is reached, the Parse() routine should be called one last time with a token type of 0.
This step is necessary to inform the parser that the end of input has been reached. Finally, we reclaim memory used by the parser by calling ParseFree().

There is one other interface routine that should be mentioned before we move on. The ParseTrace() function can be used to generate debugging output from the parser. A prototype for this routine is as follows:

    ParseTrace(FILE *stream, char *zPrefix);

After this routine is called, a short (one-line) message is written to the designated output stream every time the parser changes states or calls an action routine. Each such message is prefaced using the text given by zPrefix. This debugging output can be turned off by calling ParseTrace() again with a first argument of NULL (0).

#### 3.2.1 Allocating The Parse Object On Stack

If all calls to the Parse() interface are made from within [%code directives](#pcode), then the parse object can be allocated from the stack rather than from the heap. These are the steps:

* Declare a local variable of type "yyParser"
* Initialize the variable using ParseInit()
* Pass a pointer to the variable in calls to Parse()
* Deallocate substructure in the parse variable using ParseFinalize().

The following code illustrates how this is done:

    ParseFile(){
      yyParser x;
      ParseInit( &x );
      while( GetNextToken(pTokenizer,&hTokenId, &sToken) ){
        Parse(&x, hTokenId, sToken);
      }
      Parse(&x, 0, sToken);
      ParseFinalize( &x );
    }

#### 3.2.2 Interface Summary

Here is a quick overview of the C-language interface to a Lemon-generated parser:

> void \*ParseAlloc( void \*(\*malloc)(size\_t) );
> void ParseFree(void \*pParser, void (\*free)(void\*) );
> void Parse(void \*pParser, int tokenCode, ParseTOKENTYPE token, ...);
> void ParseTrace(FILE \*stream, char \*zPrefix);

Notes:

* Use the [%name directive](#pname) to change the "Parse" prefix names of the procedures in the interface.
* Use the [%token\_type directive](#token_type) to define the "ParseTOKENTYPE" type.
* Use the [%extra\_argument directive](#extraarg) to specify the type and name of the 4th parameter to the Parse() function.

### 3.3 Differences With YACC and BISON

Programmers who have previously used the yacc or bison parser generator will notice several important differences between yacc and/or bison and Lemon.

* In yacc and bison, the parser calls the tokenizer. In Lemon, the tokenizer calls the parser.
* Lemon uses no global variables. Yacc and bison use global variables to pass information between the tokenizer and parser.
* Lemon allows multiple parsers to be running simultaneously. Yacc and bison do not.

These differences may cause some initial confusion for programmers with prior yacc and bison experience. But after years of experience using Lemon, I firmly believe that the Lemon way of doing things is better.

_Updated as of 2016-02-16:_ The text above was written in the 1990s. We are told that Bison has lately been enhanced to support the tokenizer-calls-parser paradigm used by Lemon, eliminating the need for global variables.

### 3.4 Building The "lemon" or "lemon.exe" Executable

The "lemon" or "lemon.exe" program is built from a single file of C-code named "[lemon.c](https://sqlite.org/src/tool/lemon.c)".
The Lemon source code is generic C89 code that uses no unusual or non-standard libraries. Any reasonable C compiler should suffice to compile the lemon program. A command-line like the following will usually work:

> cc -o lemon lemon.c

On Windows machines with Visual C++ installed, bring up a "VS20_NN_ x64 Native Tools Command Prompt" window and enter:

> cl lemon.c

Compiling Lemon really is that simple. Additional compiler options such as "-O2" or "-g" or "-Wall" can be added if desired, but they are not necessary.

4.0 Input File Syntax
---------------------

The main purpose of the grammar specification file for Lemon is to define the grammar for the parser. But the input file also specifies additional information Lemon requires to do its job. Most of the work in using Lemon is in writing an appropriate grammar file.

The grammar file for Lemon is, for the most part, a free format. It does not have sections or divisions like yacc or bison. Any declaration can occur at any point in the file. Lemon ignores whitespace (except where it is needed to separate tokens), and it honors the same commenting conventions as C and C++.

### 4.1 Terminals and Nonterminals

A terminal symbol (token) is any string of alphanumeric and/or underscore characters that begins with an uppercase letter. A terminal can contain lowercase letters after the first character, but the usual convention is to make terminals all uppercase. A nonterminal, on the other hand, is any string of alphanumeric and underscore characters that begins with a lowercase letter. Again, the usual convention is to make nonterminals use all lowercase letters.

In Lemon, terminal and nonterminal symbols do not need to be declared or identified in a separate section of the grammar file. Lemon is able to generate a list of all terminals and nonterminals by examining the grammar rules, and it can always distinguish a terminal from a nonterminal by checking the case of the first character of the name.

Yacc and bison allow terminal symbols to have either alphanumeric names or to be individual characters included in single quotes, like this: ')' or '$'. Lemon does not allow this alternative form for terminal symbols. With Lemon, all symbols, terminals and nonterminals, must have alphanumeric names.

### 4.2 Grammar Rules

The main component of a Lemon grammar file is a sequence of grammar rules. Each grammar rule consists of a nonterminal symbol followed by the special symbol "::=" and then a list of terminals and/or nonterminals. The rule is terminated by a period. The list of terminals and nonterminals on the right-hand side of the rule can be empty. Rules can occur in any order, except that the left-hand side of the first rule is assumed to be the start symbol for the grammar (unless specified otherwise using the [%start\_symbol](#start_symbol) directive described below.) A typical sequence of grammar rules might look something like this:

    expr ::= expr PLUS expr.
    expr ::= expr TIMES expr.
    expr ::= LPAREN expr RPAREN.
    expr ::= VALUE.

There is one non-terminal in this example, "expr", and five terminal symbols or tokens: "PLUS", "TIMES", "LPAREN", "RPAREN" and "VALUE".

Like yacc and bison, Lemon allows the grammar to specify a block of C code that will be executed whenever a grammar rule is reduced by the parser.
In Lemon, this action is specified by putting the C code (contained within curly braces {...}) immediately after the period that closes the rule. For example:

    expr ::= expr PLUS expr.   { printf("Doing an addition...\n"); }

In order to be useful, grammar actions must normally be linked to their associated grammar rules. In yacc and bison, this is accomplished by embedding a "$$" in the action to stand for the value of the left-hand side of the rule and symbols "$1", "$2", and so forth to stand for the value of the terminal or nonterminal at position 1, 2 and so forth on the right-hand side of the rule. This idea is very powerful, but it is also very error-prone. The single most common source of errors in a yacc or bison grammar is to miscount the number of symbols on the right-hand side of a grammar rule and say "$7" when you really mean "$8".

Lemon avoids the need to count grammar symbols by assigning symbolic names to each symbol in a grammar rule and then using those symbolic names in the action. In yacc or bison, one would write this:

    expr -> expr PLUS expr  { $$ = $1 + $3; };

But in Lemon, the same rule becomes the following:

    expr(A) ::= expr(B) PLUS expr(C).  { A = B+C; }

In the Lemon rule, any symbol in parentheses after a grammar rule symbol becomes a place holder for that symbol in the grammar rule. This place holder can then be used in the associated C action to stand for the value of that symbol.

The Lemon notation for linking a grammar rule with its reduce action is superior to yacc/bison on several counts. First, as mentioned above, the Lemon method avoids the need to count grammar symbols. Secondly, if a terminal or nonterminal in a Lemon grammar rule includes a linking symbol in parentheses but that linking symbol is not actually used in the reduce action, then an error message is generated. For example, the rule

    expr(A) ::= expr(B) PLUS expr(C).  { A = B; }

will generate an error because the linking symbol "C" is used in the grammar rule but not in the reduce action.

The Lemon notation for linking grammar rules to reduce actions also facilitates the use of destructors for reclaiming memory allocated by the values of terminals and nonterminals on the right-hand side of a rule.

### 4.3 Precedence Rules

Lemon resolves parsing ambiguities in exactly the same way as yacc and bison. A shift-reduce conflict is resolved in favor of the shift, and a reduce-reduce conflict is resolved by reducing whichever rule comes first in the grammar file.

Just like in yacc and bison, Lemon allows a measure of control over the resolution of parsing conflicts using precedence rules. A precedence value can be assigned to any terminal symbol using the [%left](#pleft), [%right](#pright) or [%nonassoc](#pnonassoc) directives. Terminal symbols mentioned in earlier directives have a lower precedence than terminal symbols mentioned in later directives. For example:

    %left AND.
    %left OR.
    %nonassoc EQ NE GT GE LT LE.
    %left PLUS MINUS.
    %left TIMES DIVIDE MOD.
    %right EXP NOT.

In the preceding sequence of directives, the AND operator is defined to have the lowest precedence. The OR operator is one precedence level higher. And so forth. Hence, the grammar would attempt to group the ambiguous expression

    a AND b OR c

like this

    a AND (b OR c).
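
To make the effect of precedence declarations concrete, here is a minimal sketch (not part of the original document) that applies them to the ambiguous expression grammar from section 4.2:

    %left PLUS.
    %left TIMES.

    expr ::= expr PLUS expr.
    expr ::= expr TIMES expr.
    expr ::= VALUE.

Because TIMES is declared after PLUS, it has the higher precedence, so an input like "VALUE PLUS VALUE TIMES VALUE" is grouped as "VALUE PLUS (VALUE TIMES VALUE)", and the %left declarations make each operator group left-to-right when it meets itself.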

The associativity (left, right or nonassoc) is used to determine the grouping when the precedence is the same. AND is left-associative in our example, so

    a AND b AND c

is parsed like this

    (a AND b) AND c.

The EXP operator is right-associative, though, so

    a EXP b EXP c

is parsed like this

    a EXP (b EXP c).

The nonassoc precedence is used for non-associative operators. So

    a EQ b EQ c

is an error.

The precedence of non-terminals is transferred to rules as follows: The precedence of a grammar rule is equal to the precedence of the left-most terminal symbol in the rule for which a precedence is defined. This is normally what you want, but in those cases where you want the precedence of a grammar rule to be something different, you can specify an alternative precedence symbol by putting the symbol in square brackets after the period at the end of the rule and before any C-code. For example:

    expr ::= MINUS expr.  [NOT]

This rule has a precedence equal to that of the NOT symbol, not the MINUS symbol as would have been the case by default.

With the knowledge of how precedence is assigned to terminal symbols and individual grammar rules, we can now explain precisely how parsing conflicts are resolved in Lemon. Shift-reduce conflicts are resolved as follows:

* If either the token to be shifted or the rule to be reduced lacks precedence information, then resolve in favor of the shift, but report a parsing conflict.
* If the precedence of the token to be shifted is greater than the precedence of the rule to reduce, then resolve in favor of the shift. No parsing conflict is reported.
* If the precedence of the token to be shifted is less than the precedence of the rule to reduce, then resolve in favor of the reduce action. No parsing conflict is reported.
* If the precedences are the same and the shift token is right-associative, then resolve in favor of the shift. No parsing conflict is reported.
* If the precedences are the same and the shift token is left-associative, then resolve in favor of the reduce. No parsing conflict is reported.
* Otherwise, resolve the conflict by doing the shift, and report a parsing conflict.

Reduce-reduce conflicts are resolved this way:

* If either reduce rule lacks precedence information, then resolve in favor of the rule that appears first in the grammar, and report a parsing conflict.
* If both rules have precedence and the precedence is different, then resolve the dispute in favor of the rule with the highest precedence, and do not report a conflict.
* Otherwise, resolve the conflict by reducing by the rule that appears first in the grammar, and report a parsing conflict.

### 4.4 Special Directives

The input grammar to Lemon consists of grammar rules and special directives. We've described all the grammar rules, so now we'll talk about the special directives.

Directives in Lemon can occur in any order. You can put them before the grammar rules, or after the grammar rules, or in the midst of the grammar rules. It doesn't matter. The relative order of directives used to assign precedence to terminals is important, but other than that, the order of directives in Lemon is arbitrary.
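
As an illustrative sketch (not part of the original document), the following fragment is perfectly legal even though the directives appear after the rules that depend on them:

    expr ::= expr MINUS expr.
    expr ::= VALUE.

    %left MINUS.
    %token_type {Token*}

Lemon reads the entire grammar file before generating the parser, so only the relative order of the %left, %right, and %nonassoc lines among themselves is significant.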

Lemon supports the following special directives:

* [%code](#pcode)
* [%default\_destructor](#default_destructor)
* [%default\_type](#default_type)
* [%destructor](#destructor)
* [%else](#pifdef)
* [%endif](#pifdef)
* [%extra\_argument](#extraarg)
* [%fallback](#pfallback)
* [%if](#pifdef)
* [%ifdef](#pifdef)
* [%ifndef](#pifdef)
* [%include](#pinclude)
* [%left](#pleft)
* [%name](#pname)
* [%nonassoc](#pnonassoc)
* [%parse\_accept](#parse_accept)
* [%parse\_failure](#parse_failure)
* [%right](#pright)
* [%stack\_overflow](#stack_overflow)
* [%stack\_size](#stack_size)
* [%start\_symbol](#start_symbol)
* [%syntax\_error](#syntax_error)
* [%token](#token)
* [%token\_class](#token_class)
* [%token\_destructor](#token_destructor)
* [%token\_prefix](#token_prefix)
* [%token\_type](#token_type)
* [%type](#ptype)
* [%wildcard](#pwildcard)

Each of these directives will be described separately in the following sections:

#### 4.4.1 The %code directive

The %code directive is used to specify additional C code that is added to the end of the main output file. This is similar to the [%include](#pinclude) directive except that %include is inserted at the beginning of the main output file.

%code is typically used to include some action routines or perhaps a tokenizer or even the "main()" function as part of the output file.

There can be multiple %code directives. The arguments of all %code directives are concatenated.

#### 4.4.2 The %default\_destructor directive

The %default\_destructor directive specifies a destructor to use for non-terminals that do not have their own destructor specified by a separate %destructor directive. See the documentation on the [%destructor](#destructor) directive below for additional information.

In some grammars, many different non-terminal symbols have the same data type and hence the same destructor. This directive is a convenient way to specify the same destructor for all those non-terminals using a single statement.

#### 4.4.3 The %default\_type directive

The %default\_type directive specifies the data type of non-terminal symbols that do not have their own data type defined using a separate [%type](#ptype) directive.

#### 4.4.4 The %destructor directive

The %destructor directive is used to specify a destructor for a non-terminal symbol. (See also the [%token\_destructor](#token_destructor) directive which is used to specify a destructor for terminal symbols.)

A non-terminal's destructor is called to dispose of the non-terminal's value whenever the non-terminal is popped from the stack. This includes all of the following circumstances:

* When a rule reduces and the value of a non-terminal on the right-hand side is not linked to C code.
* When the stack is popped during error processing.
* When the ParseFree() function runs.

The destructor can do whatever it wants with the value of the non-terminal, but its design is to deallocate memory or other resources held by that non-terminal.

Consider an example:

    %type nt {void*}
    %destructor nt { free($$); }
    nt(A) ::= ID NUM.
       { A = malloc( 100 ); }

This example is a bit contrived, but it serves to illustrate how destructors work. The example shows a non-terminal named "nt" that holds values of type "void\*". When the rule for an "nt" reduces, it sets the value of the non-terminal to space obtained from malloc(). Later, when the nt non-terminal is popped from the stack, the destructor will fire and call free() on this malloced space, thus avoiding a memory leak. (Note that the symbol "$$" in the destructor code is replaced by the value of the non-terminal.)

It is important to note that the value of a non-terminal is passed to the destructor whenever the non-terminal is removed from the stack, unless the non-terminal is used in a C-code action. If the non-terminal is used by C-code, then it is assumed that the C-code will take care of destroying it. More commonly, the value is used to build some larger structure, and we don't want to destroy it, which is why the destructor is not called in this circumstance.

Destructors help avoid memory leaks by automatically freeing allocated objects when they go out of scope. To do the same using yacc or bison is much more difficult.

#### 4.4.5 The %extra\_argument directive

The %extra\_argument directive instructs Lemon to add a 4th parameter to the parameter list of the Parse() function it generates. Lemon doesn't do anything itself with this extra argument, but it does make the argument available to C-code action routines, destructors, and so forth. For example, if the grammar file contains:

    %extra_argument { MyStruct *pAbc }

Then the Parse() function generated will have a 4th parameter of type "MyStruct\*" and all action routines will have access to a variable named "pAbc" that is the value of the 4th parameter in the most recent call to Parse().

The %extra\_context directive works the same except that it is passed in on the ParseAlloc() or ParseInit() routines instead of on Parse().

#### 4.4.6 The %extra\_context directive

The %extra\_context directive instructs Lemon to add a 2nd parameter to the parameter list of the ParseAlloc() and ParseInit() functions. Lemon doesn't do anything itself with this extra argument, but it does store the value and make it available to C-code action routines, destructors, and so forth. For example, if the grammar file contains:

    %extra_context { MyStruct *pAbc }

Then the ParseAlloc() and ParseInit() functions will have a 2nd parameter of type "MyStruct\*" and all action routines will have access to a variable named "pAbc" that is the value of that 2nd parameter.

The %extra\_argument directive works the same except that it is passed in on the Parse() routine instead of on ParseAlloc()/ParseInit().

#### 4.4.7 The %fallback directive

The %fallback directive specifies an alternative meaning for one or more tokens. The alternative meaning is tried if the original token would have generated a syntax error.

The %fallback directive was added to support robust parsing of SQL syntax in [SQLite](https://www.sqlite.org/). The SQL language contains a large assortment of keywords, each of which appears as a different token to the language parser. SQL contains so many keywords that it can be difficult for programmers to keep up with them all. Programmers will, therefore, sometimes mistakenly use an obscure language keyword for an identifier.
The %fallback directive provides a mechanism to tell the parser: "If you are unable to parse this keyword, try treating it as an identifier instead."

The syntax of %fallback is as follows:

> %fallback _ID_ _TOKEN..._ **.**

In words, the %fallback directive is followed by a list of token names terminated by a period. The first token name is the fallback token — the token to which all the other tokens fall back to. The second and subsequent arguments are tokens which fall back to the token identified by the first argument.

#### 4.4.8 The %if directive and its friends

The %if, %ifdef, %ifndef, %else, and %endif directives are similar to #if, #ifdef, #ifndef, #else, and #endif in the C-preprocessor, just not as general. Each of these directives must begin at the left margin. No whitespace is allowed between the "%" and the directive name.

Grammar text in between "%ifdef MACRO" and the next nested "%endif" is ignored unless the "-DMACRO" command-line option is used. Grammar text between "%ifndef MACRO" and the next nested "%endif" is included except when the "-DMACRO" command-line option is used.

The text in between "%if _CONDITIONAL_" and its corresponding %endif is included only if _CONDITIONAL_ is true. The _CONDITIONAL_ is one or more macro names, optionally connected using the "||" and "&&" binary operators, the "!" unary operator, and grouped using balanced parentheses. Each term is true if the corresponding macro exists, and false if it does not exist.

An optional "%else" directive can occur anywhere in between a %ifdef, %ifndef, or %if directive and its corresponding %endif.

Note that the argument to %ifdef and %ifndef is intended to be a single preprocessor symbol name, not a general expression. Use the "%if" directive for general expressions.

#### 4.4.9 The %include directive

The %include directive specifies C code that is included at the top of the generated parser. You can include any text you want — the Lemon parser generator copies it blindly. If you have multiple %include directives in your grammar file, their values are concatenated so that all %include code ultimately appears near the top of the generated parser, in the same order as it appeared in the grammar.

The %include directive is very handy for getting some extra #include preprocessor statements at the beginning of the generated parser. For example:

    %include {#include <unistd.h>}

This might be needed, for example, if some of the C actions in the grammar call functions that are prototyped in unistd.h.

Use the [%code](#pcode) directive to add code to the end of the generated parser.

#### 4.4.10 The %left directive

The %left directive is used (along with the [%right](#pright) and [%nonassoc](#pnonassoc) directives) to declare precedences of terminal symbols. Every terminal symbol whose name appears after a %left directive but before the next period (".") is given the same left-associative precedence value. Subsequent %left directives have higher precedence. For example:

    %left AND.
    %left OR.
    %nonassoc EQ NE GT GE LT LE.
    %left PLUS MINUS.
    %left TIMES DIVIDE MOD.
    %right EXP NOT.

Note the period that terminates each %left, %right or %nonassoc directive.

LALR(1) grammars can get into a situation where they require a large amount of stack space if you make heavy use of right-associative operators. For this reason, it is recommended that you use %left rather than %right whenever possible.

#### 4.4.11 The %name directive

By default, the functions generated by Lemon all begin with the five-character string "Parse". You can change this string to something different using the %name directive. For instance:

    %name Abcde

Putting this directive in the grammar file will cause Lemon to generate functions named

* AbcdeAlloc(),
* AbcdeFree(),
* AbcdeTrace(), and
* Abcde().

The %name directive allows you to generate two or more different parsers and link them all into the same executable.

#### 4.4.12 The %nonassoc directive

This directive is used to assign non-associative precedence to one or more terminal symbols. See the section on [precedence rules](#precrules) or on the [%left](#pleft) directive for additional information.

#### 4.4.13 The %parse\_accept directive

The %parse\_accept directive specifies a block of C code that is executed whenever the parser accepts its input string. To "accept" an input string means that the parser was able to process all tokens without error.

For example:

    %parse_accept {
      printf("parsing complete!\n");
    }

#### 4.4.14 The %parse\_failure directive

The %parse\_failure directive specifies a block of C code that is executed whenever the parser fails to complete. This code is not executed until the parser has tried and failed to resolve an input error using its usual error recovery strategy. The routine is only invoked when parsing is unable to continue.

    %parse_failure {
      fprintf(stderr, "Giving up.  Parser is hopelessly lost...\n");
    }

#### 4.4.15 The %right directive

This directive is used to assign right-associative precedence to one or more terminal symbols. See the section on [precedence rules](#precrules) or on the [%left](#pleft) directive for additional information.

#### 4.4.16 The %stack\_overflow directive

The %stack\_overflow directive specifies a block of C code that is executed if the parser's internal stack ever overflows. Typically this just prints an error message. After a stack overflow, the parser will be unable to continue and must be reset.

    %stack_overflow {
      fprintf(stderr, "Giving up.  Parser stack overflow\n");
    }

You can help prevent parser stack overflows by avoiding the use of right recursion and right-precedence operators in your grammar. Use left recursion and left-precedence operators instead to encourage rules to reduce sooner and keep the stack size down. For example, do rules like this:

    list ::= list element.      // left-recursion.  Good!
    list ::= .

Not like this:

    list ::= element list.      // right-recursion.  Bad!
    list ::= .

#### 4.4.17 The %stack\_size directive

If stack overflow is a problem and you can't resolve the trouble by using left-recursion, then you might want to increase the size of the parser's stack using this directive. Put a positive integer after the %stack\_size directive and Lemon will generate a parser with a stack of the requested size. The default value is 100.

    %stack_size 2000

#### 4.4.18 The %start\_symbol directive

By default, the start symbol for the grammar that Lemon generates is the first non-terminal that appears in the grammar file. But you can choose a different start symbol using the %start\_symbol directive.

    %start_symbol prog

#### 4.4.19 The %syntax\_error directive

See [Error Processing](#errors).

#### 4.4.20 The %token directive

Tokens are normally created automatically, the first time they are used. Any identifier that begins with an upper-case letter is a token.

Sometimes it is useful to declare tokens in advance, however. The integer values assigned to each token are determined by the order in which the tokens are seen. So by declaring tokens in advance, it is possible to cause some tokens to have low-numbered values, which might be desirable in some grammars, or to have sequential values assigned to a sequence of related tokens. For this reason, the %token directive is provided to declare tokens in advance. The syntax is as follows:

> %token _TOKEN_ _TOKEN..._ **.**

The %token directive is followed by zero or more token symbols and terminated by a single ".". Each token named is created if it does not already exist. Tokens are created in order.

#### 4.4.21 The %token\_class directive

Undocumented. Appears to be related to the MULTITERMINAL concept. [Implementation](http://sqlite.org/src/fdiff?v1=796930d5fc2036c7&v2=624b24c5dc048e09&sbs=0).

#### 4.4.22 The %token\_destructor directive

The %destructor directive assigns a destructor to a non-terminal symbol. (See the description of the [%destructor](#destructor) directive above.) The %token\_destructor directive does the same thing for all terminal symbols.

Unlike non-terminal symbols, which may each have a different data type for their values, terminals all use the same data type (defined by the [%token\_type](#token_type) directive) and so they use a common destructor. Other than that, the token destructor works just like the non-terminal destructors.

#### 4.4.23 The %token\_prefix directive

Lemon generates #defines that assign small integer constants to each terminal symbol in the grammar. If desired, Lemon will add a prefix specified by this directive to each of the #defines it generates.

So if the default output of Lemon looked like this:

    #define AND              1
    #define MINUS            2
    #define OR               3
    #define PLUS             4

You can insert a statement into the grammar like this:

    %token_prefix    TOKEN_

to cause Lemon to produce these symbols instead:

    #define TOKEN_AND        1
    #define TOKEN_MINUS      2
    #define TOKEN_OR         3
    #define TOKEN_PLUS       4

#### 4.4.24 The %token\_type and %type directives

These directives are used to specify the data types for values on the parser's stack associated with terminal and non-terminal symbols. The values of all terminal symbols must be of the same type. This turns out to be the same data type as the 3rd parameter to the Parse() function generated by Lemon. Typically, you will make the value of a terminal symbol be a pointer to some kind of token structure. Like this:

    %token_type {Token*}

If the data type of terminals is not specified, the default value is "void\*".
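
For concreteness, here is a minimal sketch of the kind of token structure that a grammar might pair with the declaration above. The "Token" type and its fields are illustrative assumptions, not something Lemon defines; the grammar author supplies the type and the tokenizer fills it in:

    /* A hypothetical token structure: declared by the grammar author and
    ** passed (by pointer) as the 3rd argument of each call to Parse(). */
    typedef struct Token {
      const char *z;   /* text of the token */
      int n;           /* number of characters in the token text */
      int line;        /* line number, kept for error reporting */
    } Token;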

Non-terminal symbols can each have their own data types. Typically the data type of a non-terminal is a pointer to the root of a parse tree structure that contains all information about that non-terminal. For example:

    %type expr {Expr*}

Each entry on the parser's stack is actually a union containing instances of all data types for every non-terminal and terminal symbol. Lemon will automatically use the correct element of this union depending on what the corresponding non-terminal or terminal symbol is. But the grammar designer should keep in mind that the size of the union will be the size of its largest element. So if you have a single non-terminal whose data type requires 1K of storage, then your 100 entry parser stack will require 100K of heap space. If you are willing and able to pay that price, fine. You just need to know.

#### 4.4.25 The %wildcard directive

The %wildcard directive is followed by a single token name and a period. This directive specifies that the identified token should match any input token.

When the generated parser has the choice of matching an input against the wildcard token and some other token, the other token is always used. The wildcard token is only matched if there are no alternatives.

5.0 Error Processing
--------------------

After extensive experimentation over several years, it has been discovered that the error recovery strategy used by yacc is about as good as it gets. And so that is what Lemon uses.

When a Lemon-generated parser encounters a syntax error, it first invokes the code specified by the %syntax\_error directive, if any. It then enters its error recovery strategy. The error recovery strategy is to begin popping the parser's stack until it enters a state where it is permitted to shift a special non-terminal symbol named "error". It then shifts this non-terminal and continues parsing. The %syntax\_error routine will not be called again until at least three new tokens have been successfully shifted.

If the parser pops its stack until the stack is empty, and it still is unable to shift the error symbol, then the [%parse\_failure](#parse_failure) routine is invoked and the parser resets itself to its start state, ready to begin parsing a new file. This is what will happen at the very first syntax error, of course, if there are no instances of the "error" non-terminal in your grammar.

6.0 History of Lemon
--------------------

Lemon was originally written by Richard Hipp sometime in the late 1980s on a Sun4 Workstation using K&R C. There was a companion LL(1) parser generator program named "Lime", the source code to which has been lost.

The lemon.c source file was originally many separate files that were compiled together to generate the "lemon" executable. Sometime in the 1990s, the individual source code files were combined together into the current single large "lemon.c" source file. You can still see traces of original filenames in the code.

Since 2001, Lemon has been part of the [SQLite project](https://sqlite.org/) and the source code to Lemon has been managed as a part of the [SQLite source tree](https://sqlite.org/src) in the following files:

* [tool/lemon.c](https://sqlite.org/src/file/tool/lemon.c)
* [tool/lempar.c](https://sqlite.org/src/file/tool/lempar.c)
* [doc/lemon.html](https://sqlite.org/src/file/doc/lemon.html)

7.0 Copyright
-------------

All of the source code to Lemon, including the template parser file "lempar.c" and this documentation file ("lemon.html") are in the public domain. You can use the code for any purpose and without attribution.

The code comes with no warranty. If it breaks, you get to keep both pieces.
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = [
    "setuptools>=42",
    "wheel"
]
build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
# replace with your username:
name = lemon-py
version = 0.99.4
author = Aubrey R. Jones
author_email = netzapper@gmail.com
description = Generic parse tree, configurable lexer, `lemon` parser generator, wrapped for C++17 and Python 3.
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/aubreyrjones/lemon-py
project_urls =
    Bug Tracker = https://github.com/aubreyrjones/lemon-py/issues
classifiers =
    Programming Language :: Python :: 3
    Programming Language :: C++
    License :: OSI Approved :: MIT License
    Operating System :: POSIX
    Development Status :: 4 - Beta

[options]
zip_safe = False
package_dir =
    = src
packages = find:
python_requires = >=3.6
scripts =
    bin/lempy
    bin/lempy_build
install_requires =
    pybind11
include_package_data = True

[options.packages.find]
where = src
--------------------------------------------------------------------------------
/src/lemon_py/BuildGrammar.py:
--------------------------------------------------------------------------------
# MIT License

# Copyright (c) 2021 Aubrey R Jones

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import subprocess
import tempfile
import os.path
import os
import pybind11
import site
import shutil
import sys
from typing import *

from .BuildLexer import make_lexer

__all__ = ['build_lempy_grammar']

def _data_file(*filename: str):
    return os.path.join(os.path.abspath(os.path.dirname(__file__)), *filename)

_system_default_cc = 'gcc'
_system_default_cxx = 'g++'

# if sys.platform == 'win32':
#     _system_default_cc = 'cl'
#     _system_default_cxx = 'cl'

if sys.platform == 'darwin':
    _system_default_cc = 'clang'
    _system_default_cxx = 'clang++'

c_COMPILER = _system_default_cc if 'CC' not in os.environ else os.environ['CC']
cpp_COMPILER = _system_default_cxx if 'CXX' not in os.environ else os.environ['CXX']


GRAMMAR_HEADER_FILE = _data_file("header.lemon")
LEMON = _data_file("lemon")
LEMON_TEMPLATE = _data_file("lempar.c")

def _bootstrap_lemon():
    '''
    Build the `lemon` executable if it doesn't already exist.
    '''
    if os.path.isfile(_data_file('lemon')):
        return

    print("Bootstrapping `lemon`.")
    command = [c_COMPILER, '-O2', '-o', _data_file('lemon'), _data_file('lemon.c')]
    subprocess.check_call(command)


def _read_all(filename: str):
    '''
    Open and read a whole file.
    '''
    with open(filename, 'r') as f:
        return f.read()


def _replace_token_defines(impl_text, defines) -> str:
    '''
    Replace the special sentinel struct with the token definitions.
    '''
    return impl_text.replace('struct _to_be_replaced_with_token_defines{};\n', defines)


def _read_impl_and_replace_tokens():
    '''
    Read in the implementation file from the package dir, and the header
    from the current dir. Inline the token defs and return.
    '''
    impl_text = _read_all(_data_file("ParserImpl.cpp"))
    defines = _read_all('concat_grammar.h')  # this has to be in the cwd
    return _replace_token_defines(impl_text, defines)


def _copy_cpp_stuff(target_dir: str):
    '''
    Assuming the current dir has a complete, post `lemon` build,
    this will copy a build-ready header and implementation to the
    indicated directory.
    '''
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    with open(os.path.join(target_dir, 'ParseNode.hpp'), 'w') as out:
        out.write('#define LEMON_PY_SUPPRESS_PYTHON 1\n\n')
        out.write(_read_all(_data_file("ParseNode.hpp")))

    parser_text = _read_all('concat_grammar.c')

    #impl_text = _read_impl_and_replace_tokens() # already done.

    with open(os.path.join(target_dir, "_parser.cpp"), 'w') as outimpl:
        outimpl.write(parser_text)


def _gpp_command(module_name: str):
    '''
    Create a command to build the parser in the current directory.
120 | ''' 121 | pyinclude = subprocess.check_output(['python3-config', '--includes']).decode().strip() 122 | pylink = subprocess.check_output(['python3-config', '--ldflags']).decode().strip() 123 | 124 | retval = [cpp_COMPILER] 125 | retval.append('-O2') 126 | retval.extend('-Wall -shared -std=c++17 -fPIC -fvisibility=hidden'.split()) 127 | retval.extend(pyinclude.split()) 128 | retval.extend(pylink.split()) 129 | retval.extend([ 130 | # "-Wl,-z,defs", 131 | f"-DPYTHON_PARSER_MODULE_NAME={module_name}", 132 | f"-I{pybind11.get_include()}", 133 | f"-I{os.path.dirname(__file__)}", # add this directory to pick up 'ParseNode.hpp' 134 | f"-I{os.path.abspath('.')}", 135 | "concat_grammar.c", 136 | "-o", 137 | f"{module_name}.so" 138 | ]) 139 | return retval 140 | 141 | 142 | def _concatenate_implementation(**kwargs): 143 | retval = '' 144 | 145 | static_impl_text = _read_impl_and_replace_tokens() 146 | 147 | if not kwargs.get('separate_interface', False): 148 | static_impl_text = static_impl_text.replace('#include \n', _read_all(_data_file("ParseNode.hpp"))) 149 | 150 | if kwargs.get('suppress_python', False): 151 | retval += '#define LEMON_PY_SUPPRESS_PYTHON 1\n\n' 152 | 153 | if kwargs.get('use_unicode', False): 154 | retval += '#define LEMON_PY_UNICODE_SUPPORT 1\n\n' 155 | static_impl_text = static_impl_text.replace('struct _utf_include_replace_struct{};\n', _read_all(_data_file("utf.hpp"))) 156 | 157 | retval += static_impl_text 158 | retval += _read_all('concat_grammar.c') 159 | retval = retval.replace("#pragma once", '\n') 160 | return retval 161 | 162 | 163 | def _extract_module(text: str): 164 | ''' 165 | Get the @pymod name out of the text. 166 | ''' 167 | BAD_NAME = "lemon_derived_parser_with_an_obnoxiously_long_name" # that oughta lern 'em not to put a @pymod 168 | 169 | start = text.find('@pymod') # should look like maybe "//@pymod \t foo_parser " 170 | if start < 0: 171 | return BAD_NAME 172 | 173 | end = text.find("\n", start) 174 | 175 | linesplit = text[start:end].split() 176 | if len(linesplit) < 2: 177 | return BAD_NAME 178 | return linesplit[1].strip() 179 | 180 | 181 | def _render_lemon_input(grammar_file_path: str, **kwargs): 182 | ''' 183 | Render the input meant for `lemon`. 184 | ''' 185 | user_input = _read_all(grammar_file_path) 186 | mod = _extract_module(user_input) 187 | lexer_def, lexer_report = make_lexer(user_input, kwargs.get('use_unicode', False)) 188 | codegen_text = f"%include {{\n{lexer_def}\n}}\n" 189 | 190 | header_text = _read_all(GRAMMAR_HEADER_FILE) 191 | 192 | return (mod, user_input + codegen_text + header_text, lexer_report) 193 | 194 | 195 | def _write_build_lemon_grammar(whole_text: str): 196 | ''' 197 | Write the input and call lemon to process it. 198 | ''' 199 | _bootstrap_lemon() 200 | with open('concat_grammar.lemon', 'w') as f: 201 | f.write(whole_text) 202 | try: 203 | subprocess.check_call([LEMON, f"-T{LEMON_TEMPLATE}", "concat_grammar.lemon"]) 204 | except: 205 | print("Lemon found a problem with the grammar. See above.") 206 | exit(1) 207 | 208 | 209 | def _render_buildable_module(grammar_file_path: str, **kwargs): 210 | ''' 211 | Build the given module into a python module in the current directory. 
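    On return, the working directory holds the fixed set of artifacts this
    module produces: `concat_grammar.lemon` (the rendered lemon input),
    `concat_grammar.h` (token defines emitted by `lemon`), and
    `concat_grammar.c`, overwritten here with the complete C++ translation
    unit that `_gpp_command` compiles.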
212 | ''' 213 | module_name, rendered_grammar, lexer_report = _render_lemon_input(grammar_file_path, **kwargs) 214 | _write_build_lemon_grammar(rendered_grammar) 215 | full_impl = _concatenate_implementation(**kwargs) 216 | with open('concat_grammar.c', 'w') as f: 217 | f.write(full_impl) 218 | return (module_name, lexer_report) 219 | 220 | 221 | def _chdir_and_build(grammar_file_path, use_temp, **kwargs): 222 | grammar_file_path = os.path.abspath(grammar_file_path) 223 | old_dir = os.path.abspath(os.curdir) 224 | 225 | with tempfile.TemporaryDirectory() as workdir: 226 | if use_temp: 227 | os.chdir(workdir) 228 | 229 | grammar_module_name, lexer_report = _render_buildable_module(grammar_file_path, **kwargs) 230 | 231 | if kwargs.get('print_terminals', False): 232 | print_lang_header(lexer_report) 233 | exit(0) 234 | 235 | if kwargs.get('cpp_dir', False): 236 | _copy_cpp_stuff(kwargs['cpp_dir']) 237 | elif not kwargs.get('no_build', False): 238 | try: 239 | subprocess.check_call(_gpp_command(grammar_module_name)) 240 | except: 241 | print("Error building C++ module. This often means you have a lexdef for a token not in your grammar, or that there's a syntax error in one of your grammar actions.") 242 | exit(1) 243 | if kwargs.get('install', False): 244 | soname = f"{grammar_module_name}.so" 245 | shutil.copy2(soname, os.path.join(site.getusersitepackages(), soname)) 246 | 247 | os.chdir(old_dir) 248 | 249 | def _lexdef_skeleton(tokname: str): 250 | ''' 251 | Output a trivial lexdef for the given token. 252 | ''' 253 | justlen = 16 - len(tokname) 254 | return tokname + ":=".rjust(justlen) + " " + tokname.lower() + ":".rjust(justlen) + " [^\w_]" 255 | 256 | 257 | def print_lang_header(lexer_report: List[str]): 258 | ''' 259 | Print out a list of all the token names that were defined by the grammar being built. 260 | ''' 261 | with open('concat_grammar.h', 'r') as header_file: 262 | print("/*\n@pymod unnamed_language\n\n@lexdef\n\n!whitespace : \s+\n") 263 | for l in header_file.readlines(): 264 | tokname = l.split()[1].strip() 265 | if tokname in lexer_report: continue 266 | print(_lexdef_skeleton(tokname)) 267 | print("@endlex\n*/") 268 | 269 | # -------------- 270 | 271 | if __name__ == '__main__': 272 | import argparse 273 | ap = argparse.ArgumentParser(description="Build a grammar and optionally install it to the python path.") 274 | ap.add_argument('--unicode', default=False, const=True, action='store_const', help="Enable unicode support. This is necessary for reliable non-ASCII input, but increases memory usage in the resulting parser.") 275 | ap.add_argument('--cpp', type=str, required=False, help="Specify to output C++ compatible files to the indicated directory. Disables building the Python module.") 276 | ap.add_argument('--terminals', default=False, const=True, action='store_const', help="Print a skeleton `@lexdef` including all grammar-defined terminals.") 277 | ap.add_argument('--debug', default=False, const=True, action='store_const', help="Don't use a temp directory, dump everything in cwd.") 278 | ap.add_argument('--nobuild', default=False, const=True, action='store_const', help="Don't build the shared object. 
Bail beforehand.") 279 | ap.add_argument('--noinstall', default=False, const=True, action='store_const', help="Don't install the language, most useful with --debug.") 280 | ap.add_argument('grammar_file', type=str, help="The grammar file to build.") 281 | args = ap.parse_args() 282 | 283 | building_cpp = True if args.cpp else False 284 | 285 | func_args = { 286 | 'install' : not args.noinstall, 287 | 'use_unicode' : args.unicode, 288 | 'cpp_dir' : os.path.abspath(args.cpp) if building_cpp else None, 289 | 'suppress_python' : building_cpp, 290 | 'no_build' : args.nobuild, 291 | 'print_terminals' : args.terminals, 292 | 'separate_interface' : building_cpp 293 | } 294 | 295 | _chdir_and_build(args.grammar_file, not args.debug, **func_args) 296 | 297 | -------------------------------------------------------------------------------- /src/lemon_py/BuildLexer.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2021 Aubrey R Jones 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from typing import * 24 | import re 25 | 26 | LEXER_START = \ 27 | ''' 28 | namespace _parser_impl { 29 | void _init_lexer() { 30 | static bool isInit = false; 31 | if (isInit) return; 32 | isInit = true; 33 | 34 | ''' 35 | LEXER_END = \ 36 | ''' 37 | } 38 | } //namespace 39 | 40 | ''' 41 | 42 | TABBY = " " 43 | 44 | INTRO_REGEX_REGEX = '\s+(:)[:]?\s+' 45 | 46 | 47 | def escape_backslash(s: str, extra: str = '"') -> str: 48 | return s.replace('\\', '\\\\').replace(extra, '\\' + extra) 49 | 50 | 51 | def scan_regex(s: str) -> tuple: # input should _not_ be stripped! 52 | if not s or len(s) == 1: return None # this is an empty string or a ':' by itself. 53 | matchtype = s[0] 54 | 55 | if matchtype == '=': 56 | raise RuntimeError("Literal definition (:=) where regex (:/::) expected.") 57 | 58 | flags = 'RegexScannerFlags::Default' 59 | if matchtype == ':': 60 | flags = 'RegexScannerFlags::CaseSensitive' 61 | s = s[1:] 62 | return (escape_backslash(s.strip()), flags) 63 | 64 | 65 | def scan_literal(s: str) -> tuple: # input should _not_ be stripped! 
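    '''
    Split the right-hand side of a `:=` lexdef into the literal text and an
    optional (pattern, flags) terminator tuple from `scan_regex`. A rough
    example: for a lexdef line `WHILE := while : [^\w_]`, the caller passes
    `= while : [^\w_]`, which yields roughly
    ('while', ('[^\\w_]', 'RegexScannerFlags::Default')), with backslashes
    doubled by `escape_backslash` for C-string output.
    '''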
66 |     if not s or len(s) == 1: return None # this is an empty string or just a '=' by itself
67 | 
68 |     terminal_pattern_start = re.search(INTRO_REGEX_REGEX, s)
69 | 
70 |     if not terminal_pattern_start: # just take everything as the search string
71 |         return (escape_backslash(s[1:].strip()), None)
72 | 
73 |     stringlit = s[1:terminal_pattern_start.span(1)[0]].strip()
74 |     relit = s[terminal_pattern_start.span(1)[1]:]
75 | 
76 |     terminator = scan_regex(relit)
77 | 
78 |     return (escape_backslash(stringlit), terminator)
79 | 
80 | def scan_lex_line(l: str) -> tuple:
81 |     l = l.strip()
82 |     if not l: return None
83 | 
84 |     if l[0] == '!': # skip pattern marker
85 |         halves = re.split(r'\s+:', l, maxsplit=1)
86 |         return ('skip', halves[0][1:].strip(), scan_regex(halves[1]))
87 |     elif l[0] == "'": # stringdef marker
88 |         halves = l.rsplit(':=', 1)
89 |         return ('string', halves[1].strip(), halves[0][1:].strip()) # reverse pattern/tok for string
90 |     else:
91 |         splitpoint = l.find(':') # find the first one (there could be a second in a literal)
92 |         tokname = l[0:splitpoint].strip()
93 |         if not tokname:
94 |             return None
95 |         matchtype = l[splitpoint + 1]
96 |         if matchtype == '=': # literal
97 |             return ('literal', tokname, *scan_literal(l[splitpoint + 1:]))
99 |         else:
100 |             return ('value', tokname, *scan_regex(l[splitpoint + 1:]))
101 | 
102 | 
103 | def scan_lexer_def(lemon_source: str) -> List[tuple]:
104 |     start = lemon_source.find('@lexdef')
105 |     if start < 0:
106 |         raise RuntimeError("No lexer definition found.")
107 |     end = lemon_source.find('@endlex', start)
108 |     rawlines = lemon_source[start:end].splitlines()[1:]
109 |     scanned_lines = list(filter(lambda ld: ld, map(scan_lex_line, rawlines)))
110 |     return scanned_lines
111 | 
112 | 
113 | # Decode the little language for configuring strings.
114 | def decode_stringdef(tokname, code) -> str:
115 |     code = re.sub(r'\s+', '', code) # get rid of whitespace in the code
116 |     delim = escape_backslash(code[0], "'") # escape single-quote because it's gonna be in a `char` not a `const char*`.
117 |     escape = escape_backslash(code[1], "'")
118 |     special = code[2:]
119 | 
120 |     flags = "StringScannerFlags::Default"
121 | 
122 |     if '!' in special or 's' in special:
123 |         flags += " | StringScannerFlags::SpanNewlines"
124 | 
125 |     if 'j' in special:
126 |         flags += " | StringScannerFlags::JoinAdjacent"
127 | 
128 |     return f"Lexer::add_string_def('{delim}', '{escape}', {tokname}, {flags});\n"
129 | 
130 | def cstring(s: str, uni: bool) -> str:
131 |     if uni:
132 |         return f'L"{s}"'
133 |     else:
134 |         return f'"{s}"'
135 | 
136 | def implement_lexdef_line(lexdef: tuple, uni: bool) -> str:
137 |     cs = lambda s: cstring(s, uni)
138 |     retval = '' + TABBY
139 |     kind = lexdef[0]
140 |     tokname = lexdef[1]
141 |     if kind == 'skip':
142 |         skipre = lexdef[2]
143 |         retval += f"Lexer::add_skip({cs(skipre[0])}, {skipre[1]});\n"
144 |     elif kind == 'value':
145 |         retval += f"Lexer::add_value_type({tokname}, {cs(lexdef[2])}, {lexdef[3]});\n"
146 |     elif kind == 'literal':
147 |         if lexdef[3]:
148 |             termre = lexdef[3]
149 |             retval += f"Lexer::add_literal({tokname}, {cs(lexdef[2])}, {cs(termre[0])}, {termre[1]});\n"
150 |         else:
151 |             retval += f"Lexer::add_literal({tokname}, {cs(lexdef[2])});\n"
152 |     elif kind == 'string':
153 |         retval += decode_stringdef(lexdef[1], lexdef[2])
154 | 
155 |     if kind not in ('skip',): # tuple, not a bare string, so membership tests against the element itself
156 |         retval += TABBY + f"token_name_map.emplace({tokname}, {cs(tokname)});\n"
157 | 
158 |     return retval
159 | 
160 | def lexer_report(lexdefs: List) -> List[str]:
161 |     '''
162 |     Get the list of all defined token names.
163 |     '''
164 |     return list(map(lambda ld: ld[1], lexdefs))
165 | 
166 | def make_lexer(lemon_source: str, uni: bool = False) -> Tuple[str, List[str]]:
167 |     lexdefs = scan_lexer_def(lemon_source)
168 |     lexer_impl = LEXER_START + "\n".join(map(lambda ld: implement_lexdef_line(ld, uni), lexdefs)) + LEXER_END
169 |     report = lexer_report(lexdefs)
170 |     return (lexer_impl, report)
--------------------------------------------------------------------------------
/src/lemon_py/Driver.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2021 Aubrey R Jones
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
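
# Example session (module name illustrative; assumes the grammar was built and
# installed beforehand with `lempy_build --install <grammar>.lemon`):
#
#     lempy --vis expr_parser example.expr
#     lempy --json expr_parser 0 < example.expr
#
# Both forms invoke this module through the `bin/lempy` wrapper, i.e.
# `python3 -m lemon_py.Driver`.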
22 | 23 | import importlib 24 | import argparse 25 | import tempfile 26 | import subprocess 27 | import json 28 | from os import path 29 | 30 | class Driver: 31 | def __init__(self, lang_name: str): 32 | importlib.invalidate_caches() 33 | mod = importlib.import_module(lang_name) 34 | self._parse_fn = getattr(mod, 'parse') 35 | self._dot_fn = getattr(mod, 'dotify') 36 | 37 | def parse(self, instr: str): 38 | return self._parse_fn(instr) 39 | 40 | def dotify(self, parse_tree) -> str: 41 | return self._dot_fn(parse_tree) 42 | 43 | def write_dot(self, parse_tree, outfile_path): 44 | with open(outfile_path, 'w') as f: 45 | f.write(self.dotify(parse_tree)) 46 | 47 | def vis_dot(self, parse_tree): 48 | with tempfile.TemporaryDirectory() as workdir: 49 | outfile = path.join(workdir, "out.dot") 50 | pngfile = f"{outfile}.png" 51 | self.write_dot(parse_tree, outfile) 52 | subprocess.call(["dot", "-Tpng", f"-o{pngfile}", outfile]) # dot -Tpng -O out.dot && display out.dot.png 53 | subprocess.call(["display", pngfile]) 54 | 55 | def to_json(self, parse_tree) -> str: 56 | return json.dumps(parse_tree.as_dict(), indent=1) 57 | 58 | 59 | if __name__ == '__main__': 60 | import argparse 61 | import sys 62 | ap = argparse.ArgumentParser(description="Build a grammar and optionally install it to the python path.") 63 | ap.add_argument('--vis', default=False, const=True, action='store_const', help="Visualize with dot.") 64 | ap.add_argument('--dot', type=str, help="Dot output file.") 65 | ap.add_argument('--json', default=False, const=True, action='store_const', help="Dump a JSON representation of the tree to the console.") 66 | ap.add_argument('language', type=str, help="Language module name to use.") 67 | ap.add_argument('input_file', type=str, help="Input file to parser. Specify `0` (zero) to accept input from stdin.") 68 | args = ap.parse_args() 69 | 70 | d = Driver(args.language) 71 | 72 | infile = sys.stdin 73 | if args.input_file != '0': 74 | infile = open(args.input_file, 'r') 75 | 76 | parse_tree = d.parse(infile.read()) 77 | 78 | if args.input_file != '0': 79 | infile.close() 80 | 81 | if args.dot: 82 | d.write_dot(parse_tree, args.dot) 83 | 84 | if args.json: 85 | print(d.to_json(parse_tree)) 86 | 87 | if args.vis: 88 | d.vis_dot(parse_tree) -------------------------------------------------------------------------------- /src/lemon_py/ParseNode.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021 Aubrey R Jones 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | */
24 | 
25 | #pragma once
26 | 
27 | #include <string>
28 | #include <vector>
29 | #include <optional>
30 | #include <sstream>
31 | 
32 | #ifndef LEMON_PY_SUPPRESS_PYTHON
33 | #include <pybind11/pybind11.h>
34 | #include <pybind11/stl.h>
35 | #include <pybind11/operators.h>
36 | namespace py = pybind11;
37 | #else
38 | /** When python is suppressed, stubs out the `dict` definition used to hold parse node attributes. */
39 | namespace py { using dict = void*; }
40 | #endif
41 | 
42 | namespace parser {
43 | 
44 | #ifndef LEMON_PY_SUPPRESS_PYTHON
45 | /** Get a string value or None. */
46 | inline
47 | py::object string_or_none(std::optional<std::string> const& v) {
48 |     if (!v) {
49 |         return py::none();
50 |     }
51 |     else {
52 |         return py::str(v.value());
53 |     }
54 | }
55 | #endif
56 | 
57 | 
58 | /**
59 |  * Sanitize a string for dot.
60 |  */
61 | inline
62 | std::string sanitize(std::string in) {
63 |     auto clean = [&in] (char c, const char* replace, int skipForward = 0) {
64 |         size_t res = 0;
65 |         while((res = in.find(c, res)) < std::string::npos) {
66 |             in.erase(res, 1);
67 |             in.insert(res, replace);
68 |             res += skipForward;
69 |         }
70 |     };
71 | 
72 |     clean('&', "&amp;", 1);
73 |     clean('"', "&quot;");
74 |     //clean('\'', "&apos;"); // apparently dot doesn't care about this?
75 |     clean('<', "&lt;");
76 |     clean('>', "&gt;");
77 |     //clean('\n', "<br/>
"); 78 | 79 | return std::move(in); 80 | } 81 | 82 | /** 83 | * A value-typed parse node (in contrast to the indirect, pointer-based parse tree used internally). 84 | */ 85 | struct ParseNode { 86 | std::optional production; ///< the production name, if an internal node 87 | std::optional tokName; ///< the token name, if a terminal node 88 | std::optional value; ///< the token value, if a value token 89 | int64_t line; ///< line number for this node. -1 if unknown. 90 | std::vector children; ///< all the children of this parse node 91 | int id; ///< id number, unique within a single tree 92 | py::dict attr; ///< if python is enabled, this is a dictionary to contain attributes added by a python transformer 93 | 94 | ParseNode() : production(), tokName(), value(), line(-1), children(), id(-1), attr() {} 95 | ParseNode(ParseNode && o) noexcept : production(std::move(o.production)), tokName(std::move(o.tokName)), value(std::move(o.value)), line(o.line), children(std::move(o.children)), id(o.id), attr(std::move(o.attr)) { 96 | o.id = -1; 97 | } 98 | 99 | ParseNode& operator=(ParseNode && o) noexcept { 100 | using namespace std; 101 | production = move(o.production); 102 | tokName = move(o.tokName); 103 | value = move(o.value); 104 | line = o.line; 105 | children = move(o.children); 106 | id = o.id; 107 | o.id = -1; 108 | attr = move(o.attr); 109 | 110 | return *this; 111 | } 112 | 113 | ParseNode(ParseNode const& o ) = delete; 114 | ParseNode& operator=(ParseNode const& o) = delete; 115 | 116 | #ifndef LEMON_PY_SUPPRESS_PYTHON 117 | 118 | py::object getProduction() const { 119 | return string_or_none(production); 120 | } 121 | 122 | py::object getValue() const { 123 | return string_or_none(value); 124 | } 125 | 126 | py::object getToken() const { 127 | return string_or_none(tokName); 128 | } 129 | 130 | py::dict asDict() const { 131 | py::dict myDict; 132 | myDict["production"] = getProduction(); 133 | myDict["type"] = getToken(); 134 | myDict["value"] = getValue(); 135 | myDict["id"] = id; 136 | myDict["line"] = line; 137 | myDict["attr"] = attr; 138 | 139 | auto childList = py::list(); 140 | for (auto const& c : *this) { 141 | childList.append(c.asDict()); 142 | } 143 | myDict["c"] = childList; 144 | 145 | return myDict; 146 | } 147 | 148 | #endif 149 | 150 | /** 151 | * Return a halfway reasonable string representation of the node (but not its children). 152 | */ 153 | std::string toString() const { 154 | char outbuf[1024]; // just do the first 1k characters 155 | if (production) { 156 | snprintf(outbuf, 1024, "{%s} [%lu]", production.value().c_str(), children.size()); 157 | } 158 | else { 159 | snprintf(outbuf, 1024, "%s <%s>", tokName.value().c_str(), value.value().c_str()); 160 | } 161 | 162 | return std::string(outbuf); 163 | } 164 | 165 | /** 166 | * Add this node and its children to the dot graph being built up in `out`. 
167 | */ 168 | void dotify(std::stringstream & out, const ParseNode * parent) const { 169 | char buf[1024]; 170 | 171 | if (production) { 172 | snprintf(buf, 1024, "node [shape=record, label=\"{line:%ld | %s }\"] %d;\n", line, sanitize(production.value()).c_str(), id); 173 | } 174 | else { 175 | snprintf(buf, 1024, "node [shape=record, label=\"{line:%ld | { %s | %s}}\"] %d;\n", line, sanitize(tokName.value()).c_str(), sanitize(value.value()).c_str(), id); 176 | } 177 | out << buf; 178 | 179 | if (parent) { 180 | snprintf(buf, 1024, "%d -> %d;\n", parent->id, id); 181 | out << buf; 182 | } 183 | 184 | for (auto const& c : children) { 185 | c.dotify(out, this); 186 | } 187 | } 188 | 189 | /** 190 | * Get a particular child node. 191 | */ 192 | ParseNode const& operator[](size_t index) const { 193 | if (index >= children.size()) { 194 | throw std::runtime_error("Child index out of range."); 195 | } 196 | return children[index]; 197 | } 198 | 199 | using iter_type = decltype(children)::const_iterator; 200 | 201 | /** 202 | * Get children iterator. 203 | */ 204 | iter_type begin() const { 205 | return children.cbegin(); 206 | } 207 | 208 | /** 209 | * Get end of children vector. 210 | */ 211 | iter_type end() const { 212 | return children.cend(); 213 | } 214 | 215 | /** 216 | * Number of children of this node. 217 | */ 218 | size_t childCount() const { 219 | return children.size(); 220 | } 221 | 222 | /** 223 | * Checks for syntactic equality. Two nodes are equal if their 224 | * productions, token name, and value are identical; as well 225 | * as all their children being equal under this same definition. 226 | * 227 | * This check is recursive. 228 | */ 229 | bool operator==(ParseNode const& o) const { 230 | if (&o == this) return true; // we're always equal to ourselves. 231 | 232 | if (childCount() != o.childCount()) return false; // order these checks from cheapest to most expensive 233 | if (tokName != o.tokName) return false; 234 | if (production != o.production) return false; 235 | if (value != o.value) return false; 236 | 237 | for (auto myC = begin(), oC = o.begin(); myC != end() && oC != o.end(); ++myC, ++oC) { 238 | if (*myC != *oC) return false; 239 | } 240 | 241 | return true; 242 | } 243 | 244 | bool operator!=(ParseNode const& o) const { 245 | return !(*this == o); 246 | } 247 | 248 | }; 249 | 250 | /** 251 | * Parse a string and return a parse tree. 252 | * 253 | * @throw std::runtime_error if there is a lex or parse error. 254 | */ 255 | ParseNode parse_string(std::string const& input); 256 | 257 | /** 258 | * Create a complete dot graph, rooted at the given ParseNode. 
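 *
 * A minimal usage sketch (the input string assumes an expression grammar):
 *
 *     parser::ParseNode root = parser::parse_string("5 + 7.2");
 *     std::string dot = parser::dotify(root); // render with: dot -Tpng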
259 | */ 260 | std::string dotify(ParseNode const& pn); 261 | 262 | } // namespace parser -------------------------------------------------------------------------------- /src/lemon_py/ParserImpl.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2021 Aubrey R Jones 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | */ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | // Forward declarations of types needed for Lemon function forward declarations 35 | // it's turtles all the way down when you've got no headers lol 36 | namespace _parser_impl { 37 | struct Token; 38 | struct Parser; 39 | struct GrammarActionParserHandle; 40 | struct GrammarActionNodeHandle; 41 | } 42 | 43 | // these are Lemon parser functions. 44 | 45 | void* LemonPyParseAlloc(void *(*mallocProc)(size_t)); 46 | void LemonPyParseFree(void *p, void (*freeProc)(void*)); 47 | void LemonPyParse(void *, int, _parser_impl::Token, _parser_impl::GrammarActionParserHandle); 48 | void LemonPyParseInit(void *); 49 | 50 | 51 | #ifndef LEMON_PY_SUPPRESS_PYTHON 52 | #include 53 | #include 54 | #include 55 | namespace py = pybind11; 56 | #endif 57 | 58 | #ifdef LEMON_PY_UNICODE_IDE_SUPPORT 59 | #include 60 | #endif 61 | 62 | // if we're built with `--unicode`, this will get replaced with the contents of 63 | // `utf.hpp`. 
64 | struct _utf_include_replace_struct{}; 65 | 66 | 67 | namespace _parser_impl { 68 | 69 | // ==================== UTILITIES AND DECLARATIONS ======================= 70 | 71 | 72 | #ifndef LEMON_PY_UNICODE_SUPPORT 73 | using ustring = std::string; 74 | using ustring_view = std::string_view; 75 | 76 | inline 77 | std::string const& toExternal(_parser_impl::ustring const& ascii) { 78 | return ascii; 79 | } 80 | 81 | inline 82 | _parser_impl::ustring const& toInternal(std::string const& ascii) { 83 | return ascii; 84 | } 85 | 86 | #else 87 | 88 | using ustring = std::wstring; 89 | using ustring_view = std::wstring_view; 90 | 91 | inline 92 | std::string toExternal(_parser_impl::ustring const& utf32) { 93 | return utf8::utf32to8(utf32); 94 | } 95 | 96 | inline 97 | _parser_impl::ustring toInternal(std::string const& utf8) { 98 | return utf8::utf8toW(utf8); 99 | } 100 | #endif 101 | 102 | using sstream = std::basic_stringstream; 103 | using siter = ustring::const_iterator; 104 | using uregex = std::basic_regex; 105 | using regex_results = std::match_results; 106 | using uuchar = ustring::value_type; 107 | 108 | //==================== TOKENS ============================== 109 | 110 | /** Used to intern strings found by the lexer. */ 111 | class StringTable { 112 | protected: 113 | std::vector strings; 114 | 115 | typedef std::unordered_map LocationMap; 116 | 117 | LocationMap cachedLocations; 118 | 119 | public: 120 | 121 | /** Clear table state. */ 122 | void clear() { 123 | cachedLocations.clear(); 124 | strings.clear(); 125 | } 126 | 127 | /** 128 | * Push a string and return the index. 129 | */ 130 | size_t pushString(ustring const& s) { 131 | auto it = cachedLocations.find(s); 132 | if (it != cachedLocations.end()){ 133 | return (*it).second; 134 | } 135 | 136 | size_t idx = strings.size(); 137 | cachedLocations.emplace(s, idx); 138 | 139 | strings.push_back(s); 140 | 141 | return idx; 142 | } 143 | 144 | /** 145 | * Get an existing string by index. 146 | */ 147 | ustring const& getString(size_t index) { 148 | return strings[index]; 149 | } 150 | }; 151 | 152 | /** Stores mappings from logical token names to string representations. */ 153 | static std::unordered_map token_name_map; 154 | 155 | /** Stores mappings from logical literal token names to literal values. */ 156 | static std::unordered_map token_literal_value_map; 157 | 158 | /** 159 | * This is the token value passed into the Lemon parser. It always has a type, 160 | * but it might not always have a value. This is indicated by having a 161 | * nullptr `valueTable`. 162 | * 163 | * It seems Token must be a trivial value type to pass through 164 | * the lemon parser. This means we need to play tricks with 165 | * the value string. 166 | * 167 | */ 168 | struct Token { 169 | int type; ///< Numeric type defined by the header 'concat_grammar.h', output by lemon. 170 | size_t valueIndex; ///< index into the string table where we can find our value. 171 | StringTable *valueTable; ///< pointer to string table of values, or nullptr if this token has no value. 172 | int line; ///< line number that the lexer *finished* this token on (sorry) 173 | 174 | /** 175 | * Get either the regex-matched value for a value token, or just a copy of the 176 | * literal string for a literal token. 177 | */ 178 | ustring value() const { 179 | if (valueTable) return valueTable->getString(valueIndex); 180 | return token_literal_value_map[type]; 181 | } 182 | 183 | /** 184 | * Get the name of this token as a string. 
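     *
     * E.g. a token produced for the lexdef entry `PLUS := +` reports
     * name() == "PLUS"; the map is populated by the generated _init_lexer().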
185 | */ 186 | ustring const& name() const { 187 | return token_name_map[type]; 188 | } 189 | 190 | /** 191 | * Get a reasonable, perhaps truncated, string representation of this token. 192 | */ 193 | ustring toString() const { 194 | sstream valueStream; 195 | valueStream << name() << "[line: " << line << "] <" << value() << ">"; 196 | return valueStream.str(); 197 | } 198 | 199 | int operator~() const { return line; } 200 | }; 201 | 202 | /** Convenience method to make a token. */ 203 | Token make_token(int type, int line) { 204 | return Token {type, 0, nullptr, line}; 205 | } 206 | 207 | /** Convenience method to make a token. */ 208 | Token make_token(int type, StringTable & st, ustring const& s, int line) { 209 | return Token {type, st.pushString(s), &st, line}; 210 | } 211 | 212 | 213 | //============================== LEXER IMPLEMENTATION ================================= 214 | 215 | /** 216 | * This implements a recursive prefix tree, used to match literals in the lexer. 217 | * 218 | * Nodes with defined values may also define a terminator pattern, which must match in order for the node to match. 219 | */ 220 | template 221 | struct PTNode { 222 | uuchar code; ///< character contribution 223 | std::optional value; ///< the output token value if matched 224 | std::optional terminatorPattern; ///< a regex used to check if the literal is properly terminated 225 | std::vector children; ///< suffixes 226 | bool isRoot; ///< is this the root node? 227 | 228 | PTNode(uuchar code, std::optional const& value, std::optional const& terminator, bool isRoot = false) : code(code), value(value), terminatorPattern(terminator), children(), isRoot(isRoot) {} 229 | 230 | /** 231 | * Recursively add a literal to the tree. 232 | */ 233 | void add_value(ustring_view const& code, V_T const& value, std::optional const& terminator = std::nullopt) { 234 | if (code.length() == 0) { // all of the previous recursions have matched (or user is adding a null string?) 235 | if (isRoot // yeah, it was a null string, which won't work and is extremely unlikely coming from the autogen lexer conf 236 | || this->value) // or we're already set 237 | throw std::runtime_error("Attempting to redefine lexer literal " + toExternal(token_name_map[this->value.value()])); 238 | this->value = value; 239 | this->terminatorPattern = terminator; 240 | return; 241 | } 242 | 243 | for (auto & c : children) { 244 | if (c.code == code[0]) { // one of our children is further along the chain 245 | c.add_value(code.substr(1, code.length() - 1), value, terminator); 246 | return; 247 | } 248 | } 249 | 250 | // directly create the child node 251 | children.emplace_back(code[0], std::nullopt, std::nullopt); 252 | 253 | // and recurse into it with the suffix of the string. If the contracted substring becomes null, that will trigger the child to take the value 254 | children.back().add_value(code.substr(1, code.length() - 1), value, terminator); 255 | } 256 | 257 | /** Check the terminator pattern, or return true automatically if we don't have a terminator defined. */ 258 | bool tryTerminator(ustring::const_iterator const& first, ustring::const_iterator const& last) const { 259 | if (!terminatorPattern) return true; 260 | 261 | return std::regex_search(first, last, terminatorPattern.value(), std::regex_constants::match_continuous); 262 | } 263 | 264 | /** Value, and an iterator pointing to the input character immediately following the literal. 
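     * E.g. matching literals "<" and "<=" against input "<=x", tryValue
     * yields the "<=" token code and an iterator at 'x' (longest match wins).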
*/ 265 | using LexResult = std::tuple; 266 | 267 | /** 268 | * Try to match the maximal string possible from the beginning of the input range. 269 | */ 270 | std::optional tryValue(ustring::const_iterator first, ustring::const_iterator last) const { 271 | if (children.empty() || first == last) { // we'll never have a null input, so we've reached end of input or end of chain while still matching. 272 | goto bailout; 273 | } 274 | 275 | for (auto const& c : children) { 276 | if (*first == c.code) { 277 | if (auto found = c.tryValue(first + 1, last)) { 278 | return found; 279 | } 280 | } 281 | } 282 | 283 | bailout: 284 | // if we got here naturally, after searching children, it's because there was a match failure on the 285 | // suffix after this node. If we have a value, check the terminator and return. 286 | if (value && tryTerminator(first, last)) { 287 | return std::make_tuple(value.value(), first); 288 | } 289 | 290 | // no match, or the match is for an internal node with no value. 291 | return std::nullopt; 292 | } 293 | }; 294 | 295 | /** Flags for regex scanning. */ 296 | struct RegexScannerFlags { 297 | const int v = 0; 298 | 299 | static constexpr auto Default = 0; 300 | static constexpr auto CaseSensitive = 1; ///< This regex should be evaluated with case sensitivity enabled 301 | 302 | operator int() const { return v; } 303 | RegexScannerFlags(int const& v) : v(v) {} 304 | RegexScannerFlags() = default; 305 | }; 306 | 307 | 308 | /** Flags for configuring string scanning. */ 309 | struct StringScannerFlags { 310 | int v = 0; 311 | 312 | static constexpr auto Default = 0; 313 | static constexpr auto SpanNewlines = 1; ///< Strings of this type should allow internal newlines without error 314 | static constexpr auto JoinAdjacent = 2; ///< Strings of this type should be joined together _in the lexer_ when only skips occur between them 315 | 316 | operator int() const { return v; } 317 | StringScannerFlags(int const& v) : v(v) {} 318 | StringScannerFlags() = default; 319 | }; 320 | 321 | /** Convert a string into a case-insensitive, ECMA-flavored regex. */ 322 | uregex s2regex(ustring const& s, RegexScannerFlags const& flags) { 323 | auto flagset = std::regex::ECMAScript; 324 | if (!(flags & RegexScannerFlags::CaseSensitive)) { 325 | flagset |= std::regex::icase; 326 | } 327 | 328 | return uregex(s, flagset); 329 | } 330 | 331 | /** 332 | * This is a relatively basic lexer. It handles two classes of tokens, plus skip patterns and strings. 333 | * 334 | * "literal" tokens are defined by a fixed string of characters, and are stored in a basic 335 | * prefix tree (PTNode above). These are matched greedily, with the longest matching sequence 336 | * having priority. Literal tokens are returned by lemon-defined code number, without a value. 337 | * 338 | * "value" tokens are defined by a regular expression, and are returned with both a code 339 | * and a value. A single sub-match may be used to denote a partial value extraction from 340 | * the overall token match. No type conversions are done, all values are strings. 341 | * 342 | * Value token patterns are checked in the same order they are defined with `add_value_type`. 343 | * 344 | * Skip patterns are simply regexes that are used to skip whitespace, comments, or other 345 | * lexically and syntactically-irrelevant content. Skip patterns are applied before every 346 | * attempt at token extraction. 347 | * 348 | * Strings have user-defined delimeters and escapes, and may optionally span newlines. 
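 *
 * A hedged sample of the `@lexdef` source driving this configuration (token
 * names illustrative; syntax as parsed by BuildLexer.py):
 *
 *     !whitespace : \s+
 *     PLUS := +
 *     WHILE := while : [^\w_]
 *     NAME : [A-Za-z_][A-Za-z0-9_]*
 *     '" \ sj := STRING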
349 | * 350 | */ 351 | struct Lexer { 352 | static PTNode literals; 353 | static std::vector skips; 354 | static std::vector> valueTypes; ///< regex pattern, token code 355 | static std::vector> stringDefs; ///< delim, escape, token code, span newlines 356 | 357 | /** 358 | * Add a literal/constant token, with an optional terminator pattern. 359 | */ 360 | static void add_literal(int tok_code, ustring const& code, std::optional const& terminator = std::nullopt, RegexScannerFlags const& terminatorFlags = RegexScannerFlags::Default) { 361 | literals.add_value( 362 | code, 363 | tok_code, 364 | terminator ? 365 | std::make_optional(s2regex(terminator.value(), terminatorFlags)) 366 | : std::nullopt); 367 | token_literal_value_map.emplace(tok_code, code); 368 | } 369 | 370 | /** Add a skip pattern to the lexer definition. */ 371 | static void add_skip(ustring const& r, RegexScannerFlags const& flags = RegexScannerFlags::Default) { 372 | skips.push_back(s2regex(r, flags)); 373 | } 374 | 375 | /** Add a value pattern to the lexer definition. */ 376 | static void add_value_type(int tok_code, ustring const& r, RegexScannerFlags const& flags = RegexScannerFlags::Default) { 377 | valueTypes.push_back(std::make_tuple(s2regex(r, flags), tok_code)); 378 | } 379 | 380 | /** Add a string definition to the lexer definition. */ 381 | static void add_string_def(uuchar delim, uuchar escape, int tok_code, StringScannerFlags flags = StringScannerFlags::Default) { 382 | stringDefs.push_back(std::make_tuple(delim, escape, tok_code, flags)); 383 | } 384 | 385 | // == instance == 386 | private: 387 | ustring input; ///< the entire input string to lex 388 | ustring::const_iterator curPos; ///< current authoritative position in the string 389 | StringTable &stringTable; ///< reference to parser string table to use 390 | int count; ///< count of tokens lexed 391 | bool reachedEnd; ///< have we reached the end? 392 | int line = 1; ///< what's our current line? 393 | 394 | /** Make a runtime error with context info. */ 395 | std::runtime_error make_error(std::string const& message) { 396 | char buf[1024]; 397 | snprintf(buf, 1024, "Lexer failure on line %d. %s Around here:\n", line, message.c_str()); 398 | return std::runtime_error(std::string(buf) + toExternal(remainder(100))); 399 | } 400 | 401 | /** Advance curPos by the given count. */ 402 | siter advanceBy(size_t count) { 403 | auto oldPos = curPos; 404 | std::advance(curPos, count); 405 | line += countLines(oldPos, curPos); 406 | 407 | return oldPos; 408 | } 409 | 410 | /** Advance curPos to the given position. */ 411 | siter advanceTo(siter const& newPos) { 412 | auto oldPos = curPos; 413 | curPos = newPos; 414 | line += countLines(oldPos, curPos); 415 | 416 | return oldPos; 417 | } 418 | 419 | /** Count lines between iterators. */ 420 | int countLines(siter from, siter const& to) { 421 | int lineCount = 0; 422 | for (; from != to; from++) { 423 | if (*from == '\n') { 424 | lineCount++; 425 | } 426 | } 427 | return lineCount; 428 | } 429 | 430 | /** Repeatedly apply skip patterns, consuming input if they match. */ 431 | void skip() { 432 | bool skipped = false; 433 | do { 434 | skipped = false; 435 | for (auto const& r : skips) { 436 | regex_results results; 437 | if (std::regex_search(curPos, input.cend(), results, r, std::regex_constants::match_continuous)) { 438 | skipped = true; 439 | advanceBy(results.length()); 440 | } 441 | } 442 | } while (skipped); 443 | } 444 | 445 | /** Find the end of the string from the given start position. 
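     * E.g. with delim '"' and escape '\', scanning `a \" b" tail` from just
     * past the opening quote returns an iterator at the closing quote;
     * escaped delimiters and doubled escapes are stepped over.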
*/ 446 | siter stringEnd(uuchar stringDelim, uuchar escape, StringScannerFlags flags, siter stringStart, siter end) { 447 | for (; stringStart != end; ++stringStart) { 448 | if (*stringStart == escape) { 449 | auto nextChar = stringStart + 1; 450 | if (nextChar == end) goto end_of_input; 451 | if ((*nextChar == stringDelim) || (*nextChar == escape)) { 452 | ++stringStart; // skip past this delim, the loop increment will skip the escaped char 453 | continue; 454 | } 455 | } 456 | else if (!(flags & StringScannerFlags::SpanNewlines) && (*stringStart == '\n')) { 457 | throw make_error("Non-spanning string crossed newline."); 458 | } 459 | else if (*stringStart == stringDelim) { 460 | return stringStart; 461 | } 462 | } 463 | 464 | end_of_input: 465 | throw make_error("String lexing reached end of line."); 466 | } 467 | 468 | /** Try all of the string definitions and attempt to get a string, returning nullopt if no string is possible. */ 469 | std::optional nextString() { 470 | auto n = [this] (int tokCode, uuchar delim, uuchar escape, StringScannerFlags flags) -> std::optional { 471 | if (*curPos == delim) { // if we get past this, we're either going to return a string token or exception out. 472 | auto send = stringEnd(delim, escape, flags, curPos + 1, input.cend()); 473 | auto startLine = line; 474 | auto sstart = advanceTo(send + 1); // move past the end delim 475 | return make_token(tokCode, stringTable, ustring(sstart + 1, send), startLine); 476 | } 477 | else { 478 | return std::nullopt; 479 | } 480 | }; 481 | 482 | using std::get; 483 | 484 | for (auto const& sdef : stringDefs) { 485 | if (auto matchedString = n(get<2>(sdef), get<0>(sdef), get<1>(sdef), get<3>(sdef))) { 486 | auto flags = get<3>(sdef); 487 | if (flags & StringScannerFlags::JoinAdjacent) { 488 | sstream retval; 489 | retval << matchedString.value().value(); 490 | skip(); 491 | while (auto anotherOne = n(get<2>(sdef), get<0>(sdef), get<1>(sdef), get<3>(sdef))) { 492 | retval << anotherOne.value().value(); // get the actual string value out, the first one's for the `optional` 493 | skip(); 494 | } 495 | return make_token(matchedString.value().type, stringTable, retval.str(), matchedString.value().line); 496 | } 497 | else { 498 | return matchedString; 499 | } 500 | } 501 | } 502 | 503 | return std::nullopt; 504 | } 505 | 506 | /** Query the prefix tree to check for a literal token, returning nullopt if nothing matches. */ 507 | std::optional nextLiteral() { 508 | auto result = literals.tryValue(curPos, input.cend()); 509 | if (!result) return std::nullopt; 510 | 511 | advanceTo(std::get<1>(result.value())); 512 | return make_token(std::get<0>(result.value()), line); 513 | } 514 | 515 | /** Try all the value patterns to see if one matches, returning it if it does. Returns nullopt if nothing matches. 
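     * E.g. a value pattern `([0-9]+)px` with one capture group consumes all
     * of "42px" but stores only "42" as the token value, per the submatch
     * handling below.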
*/ 516 | std::optional nextValue() { 517 | for (auto const& r : valueTypes) { 518 | regex_results results; 519 | if (std::regex_search(curPos, input.cend(), results, std::get<0>(r), std::regex_constants::match_continuous)) { 520 | auto match_iterator = results.begin(); 521 | if (results.size() > 1) { // skip past the whole match to get a submatch 522 | std::advance(match_iterator, 1); 523 | } 524 | 525 | ustring value = (*match_iterator).str(); 526 | 527 | advanceBy(results.length()); // advance by length of _entire_ match 528 | return make_token(std::get<1>(r), stringTable, value, line); 529 | } 530 | } 531 | return std::nullopt; 532 | } 533 | 534 | public: 535 | 536 | /** Create a new lexer with the given input, using the given string table. */ 537 | Lexer(ustring const& inputString, StringTable & stringTable) : input(inputString), curPos(input.cbegin()), stringTable(stringTable), count(0), reachedEnd(false) {} 538 | 539 | /** 540 | * Get the next token. Returns a special EOF token (defined by Lemon) when it 541 | * reaches end of input, returns nullopt on the next call after emitting EOF. 542 | * 543 | * @throw std::runtime_error if there's a error lexing. 544 | * */ 545 | std::optional next() { 546 | skip(); 547 | 548 | if (consumedInput()) { 549 | if (reachedEnd) { // second time we return nullopt so we can stop operating 550 | return std::nullopt; 551 | } 552 | else { 553 | reachedEnd = true; // first time, we emit the EOF token 554 | return make_token(0, line); 555 | } 556 | } 557 | 558 | if (auto str = nextString()) { 559 | count++; 560 | return str; 561 | } 562 | else if (auto lit = nextLiteral()) { 563 | count++; 564 | return lit; 565 | } 566 | else if (auto value = nextValue()) { 567 | count++; 568 | return value; 569 | } 570 | 571 | throw make_error("Cannot lex next character. Not part of any match."); 572 | }; 573 | 574 | /** Get the current line of the current lexer position. */ 575 | int const& getLine() const { return line; } 576 | 577 | /** Has the lexer consumed all input? */ 578 | bool consumedInput() { 579 | return curPos == input.cend(); 580 | } 581 | 582 | /** Get a portion of the input after the current position, used for error reporting. */ 583 | ustring remainder(size_t len = 0) { 584 | return ustring(curPos, len && (curPos + len < input.cend()) ? (curPos + len) : input.cend()); 585 | } 586 | 587 | /** Get a count of all tokens lexed. */ 588 | int getCount() const { 589 | return count; 590 | } 591 | }; 592 | 593 | /** Forward declaration of codegen'd lexer initialization function. Defined by the BuildLexer.py */ 594 | void _init_lexer(); 595 | 596 | // static storage for lexer. 597 | PTNode Lexer::literals(0, std::nullopt, std::nullopt, true); // root node. 598 | decltype(Lexer::skips) Lexer::skips; 599 | decltype(Lexer::valueTypes) Lexer::valueTypes; 600 | decltype(Lexer::stringDefs) Lexer::stringDefs; 601 | 602 | 603 | //========================== PARSER STATE AND INTERNAL TREE ============================== 604 | 605 | 606 | struct ParseNode; 607 | 608 | /** 609 | * Used to implement syntax sugar inside the grammar actions. 610 | * 611 | * Implicitly converts between PaseNode* and this handle. 
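 *
 * A sketch of the sugar as it appears in a grammar action (rule illustrative;
 * `_` is the GrammarActionParserHandle bound by %extra_argument in header.lemon):
 *
 *     expr(e) ::= expr(l) PLUS expr(r). { e = _("add", {l, r}, ~l); }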
612 | */ 613 | struct GrammarActionNodeHandle { 614 | using ChildrenPack = std::initializer_list; 615 | ParseNode* node; 616 | 617 | GrammarActionNodeHandle() = default; 618 | 619 | // implicit conversions 620 | GrammarActionNodeHandle(ParseNode* const& n) : node(n) {} 621 | operator ParseNode*() { return node; } 622 | 623 | // sugar 624 | GrammarActionNodeHandle operator[](size_t childIndex); 625 | GrammarActionNodeHandle& operator+=(GrammarActionNodeHandle & rhs); 626 | GrammarActionNodeHandle& operator+=(ChildrenPack const& rhs); 627 | int operator~() const; 628 | //explicit GrammarActionNodeHandle& operator=(Token const& tok); //TODO: need `_` in scope somehow. 629 | }; 630 | 631 | 632 | /** Either a production name or a token value. */ 633 | using ParseValue = std::variant; 634 | 635 | /** 636 | * A parser-internal parse node. 637 | * 638 | * ParseNodes are handled by pointer within the parser. 639 | */ 640 | struct ParseNode { 641 | ParseValue value; ///< the production or token 642 | int64_t line; ///< line for this node 643 | std::vector children; ///< pointers to children 644 | 645 | /** 646 | * Append a sequence of things that, individually, will 647 | * convert to ParseNode*. 648 | */ 649 | template 650 | ParseNode* append(T const& childSeq) { 651 | for (auto c : childSeq) { 652 | children.push_back(c); 653 | } 654 | 655 | return this; 656 | } 657 | 658 | /** Add a node to the end of the children list. */ 659 | ParseNode* push_back(ParseNode *n) { children.push_back(n); return this; } 660 | 661 | /** Add a node to the beginning of the children list. Not typically recommended. */ 662 | ParseNode* push_front(ParseNode *n) { children.insert(children.begin(), n); return this; } 663 | 664 | /** Add a node to the end of the children list. */ 665 | ParseNode* pb(ParseNode *n) { return push_back(n); } 666 | 667 | /** Add a node to the beginning of the children list. Not typically recommended. */ 668 | ParseNode* pf(ParseNode *n) { return push_front(n); } 669 | 670 | /** Set the line number of this node. */ 671 | ParseNode* l(int64_t line) { this->line = line; return this; } 672 | 673 | private: 674 | friend class Parser; 675 | friend class std::unique_ptr; 676 | ParseNode() = default; 677 | }; 678 | /** Used to brace-enclose a list of children for various functions. */ 679 | //using ChildrenPack = std::initializer_list; 680 | using ChildrenPack = GrammarActionNodeHandle::ChildrenPack; 681 | 682 | GrammarActionNodeHandle& GrammarActionNodeHandle::operator+=(ChildrenPack const& toAppend) { 683 | node->append(toAppend); 684 | return *this; 685 | } 686 | 687 | GrammarActionNodeHandle GrammarActionNodeHandle::operator[](size_t childIndex) { 688 | return node->children[childIndex]; 689 | } 690 | 691 | GrammarActionNodeHandle& GrammarActionNodeHandle::operator+=(GrammarActionNodeHandle & rhs) { 692 | node->children.push_back(rhs); 693 | return *this; 694 | } 695 | 696 | int GrammarActionNodeHandle::operator~() const { 697 | return node->line; 698 | } 699 | 700 | /** 701 | * Used to implement syntax sugar inside the grammar actions. 702 | */ 703 | struct GrammarActionParserHandle { 704 | Parser* parser; ///< pointer to the parent parser 705 | 706 | /** Passthrough to make_node. 
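 * E.g. `_("arglist")` creates an empty production node, `_(tok)` wraps a
 * terminal Token, and `_ = node` (operator= below) sets the tree root.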
*/ 707 | GrammarActionNodeHandle operator()(const char* production, ChildrenPack const& children = {}, int64_t line = -1); 708 | GrammarActionNodeHandle operator()(ustring const& production, ChildrenPack const& children = {}, int64_t line = -1); 709 | GrammarActionNodeHandle operator()(Token const& terminal); 710 | 711 | /** Passthrough to push_root. */ 712 | GrammarActionNodeHandle operator=(GrammarActionNodeHandle newRoot); 713 | 714 | void drop_node(GrammarActionNodeHandle & toDrop); 715 | void error(); 716 | void success(); 717 | }; 718 | 719 | 720 | /** 721 | * Implements the parser and all state for a parser run. 722 | */ 723 | class Parser { 724 | void* lemonParser; ///< opaque poiner to lemon parser 725 | 726 | std::unordered_map> allNodes; ///< storage for nodes 727 | StringTable stringTable; ///< string storage 728 | Token currentToken; ///< the last token passed from the lexer for parsing 729 | 730 | ParseNode *root = nullptr; ///< root node for the parse tree 731 | bool successful = false; ///< have we received the successful message from the parser 732 | GrammarActionParserHandle thisHandle { this }; 733 | 734 | void freeParserObject() { 735 | if (lemonParser) { // could be non-null if there was an exception. 736 | LemonPyParseFree(lemonParser, free); 737 | } 738 | lemonParser = nullptr; 739 | } 740 | 741 | void buildParserObject() { 742 | if (lemonParser) { 743 | freeParserObject(); 744 | } 745 | 746 | lemonParser = LemonPyParseAlloc(malloc); 747 | 748 | if (!lemonParser) { 749 | throw std::runtime_error("Cannot allocate memory for parser framework."); 750 | } 751 | } 752 | 753 | /** 754 | * Reset the parser state. Called internally by `parseString()`, so not necessary to call manually. 755 | */ 756 | void reset() { 757 | allNodes.clear(); 758 | stringTable.clear(); 759 | 760 | currentToken = make_token(0, -1); 761 | root = nullptr; 762 | successful = false; 763 | 764 | buildParserObject(); 765 | } 766 | 767 | /** 768 | * Pass the next token into the lemon parser. 769 | */ 770 | void offerToken(Token token) { 771 | currentToken = token; 772 | LemonPyParse(lemonParser, token.type, token, thisHandle); 773 | } 774 | 775 | 776 | public: 777 | 778 | /** Create a new parser, allocating lemon parser state. */ 779 | Parser() : lemonParser(nullptr), allNodes(), stringTable() { 780 | _init_lexer(); 781 | } 782 | 783 | /** Deallocate lemon parser state. */ 784 | ~Parser() { 785 | freeParserObject(); 786 | } 787 | 788 | 789 | /** Make a new node. */ 790 | GrammarActionNodeHandle make_node(ParseValue const& value, ChildrenPack const& children = {}, int64_t line = -1) { 791 | auto node = std::unique_ptr(new ParseNode); // can't use `make_unique` because the constructor's private. 792 | node->value = value; 793 | if (std::holds_alternative(value)) { 794 | node-> line = std::get(value).line; 795 | } 796 | else { 797 | node->line = line; 798 | } 799 | 800 | //node->children.insert(node->children.end(), children); 801 | node->append(children); 802 | 803 | auto retval = node.get(); 804 | allNodes.emplace(retval, std::move(node)); 805 | 806 | return retval; 807 | } 808 | 809 | /** Short for make_node. */ 810 | GrammarActionNodeHandle mn(ParseValue const& value, ChildrenPack const& children = {}, int64_t line = -1) { 811 | return make_node(value, children, line); 812 | } 813 | 814 | /** 815 | * Set the root node of the parse tree. 
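     * The start symbol's action is expected to do this through the handle's
     * operator=, e.g. `{ _ = m; }`; parseString() throws if parsing finishes
     * without a root having been set.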
816 | */ 817 | GrammarActionNodeHandle push_root(GrammarActionNodeHandle pn) { 818 | return root = pn; 819 | } 820 | 821 | /** 822 | * Drop the given node from internal storage. Not strictly necessary, but can keep interim 823 | * memory usage lower. 824 | */ 825 | void drop_node(GrammarActionNodeHandle pn) { 826 | auto it = allNodes.find(pn); 827 | if (it != allNodes.end()) { 828 | allNodes.erase(it); 829 | } 830 | } 831 | 832 | /** 833 | * Used by the lemon parser to signal a parse error. 834 | */ 835 | void error() { 836 | throw std::runtime_error("Parse error on token: " + toExternal(currentToken.toString())); 837 | } 838 | 839 | /** 840 | * Used by the lemon parser to signal a parse success. 841 | */ 842 | void success() { 843 | successful = true; 844 | } 845 | 846 | /** 847 | * Parse the given input string, returning a parse tree on success. 848 | * 849 | * Invalidates parse nodes returned from any previous invocation of `parseString` on this Parser. 850 | * 851 | * @throw std::runtime_error on lex or parse error. 852 | */ 853 | ParseNode* parseString(std::string const& input) { 854 | reset(); // allocates the parser object 855 | 856 | Lexer lexer(toInternal(input), stringTable); 857 | 858 | while (auto tok = lexer.next()) { 859 | offerToken(tok.value()); 860 | } 861 | 862 | if (!(successful && root)) { 863 | throw std::runtime_error("Lexer reached end of input without parser completing and setting root node."); 864 | } 865 | 866 | return root; 867 | } 868 | }; 869 | 870 | GrammarActionNodeHandle GrammarActionParserHandle::operator()(const char* production, ChildrenPack const& children, int64_t line) { 871 | return parser->make_node(toInternal(production), children, line); 872 | } 873 | 874 | GrammarActionNodeHandle GrammarActionParserHandle::operator()(ustring const& production, ChildrenPack const& children, int64_t line){ 875 | return parser->make_node(production, children, line); 876 | } 877 | 878 | GrammarActionNodeHandle GrammarActionParserHandle::operator()(Token const& terminal){ 879 | return parser->make_node(terminal); 880 | } 881 | 882 | GrammarActionNodeHandle GrammarActionParserHandle::operator=(GrammarActionNodeHandle newRoot) { 883 | return parser->push_root(newRoot); 884 | } 885 | 886 | void GrammarActionParserHandle::drop_node(GrammarActionNodeHandle & toDrop) { parser->drop_node(toDrop); } 887 | void GrammarActionParserHandle::error() { parser->error(); } 888 | void GrammarActionParserHandle::success() { parser->success(); } 889 | 890 | } // namespace 891 | 892 | 893 | //========================= PUBLIC API IMPLEMENTATIONS ================================ 894 | 895 | #include 896 | 897 | namespace parser { 898 | 899 | 900 | /** 901 | * Create a complete dot graph, rooted at the given ParseNode. 902 | */ 903 | std::string dotify(ParseNode const& pn) { 904 | #ifndef LEMON_PY_SUPPRESS_PYTHON 905 | py::gil_scoped_release _release_GIL; 906 | #endif 907 | 908 | std::stringstream out; 909 | 910 | out << "digraph \"AST\" { \n"; 911 | out << "node [shape=record, style=filled];\n\n"; 912 | 913 | pn.dotify(out, nullptr); 914 | 915 | out << "\n}\n"; 916 | 917 | return out.str(); 918 | } 919 | 920 | /** 921 | * Uplift a node from the internal pointer-based representation into the 922 | * external value-semantics representation. 
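 * Ids are assigned in pre-order starting from 0, so the root of the returned
 * tree always has id == 0 and every child has a larger id than its parent.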
923 | */
924 | ParseNode uplift_node(_parser_impl::ParseNode* alien, int & idCounter) {
925 | using _parser_impl::toExternal;
926 | ParseNode retval;
927 | retval.id = idCounter++;
928 |
929 | if (std::holds_alternative<_parser_impl::Token>(alien->value)) {
930 | auto tok = std::get<_parser_impl::Token>(alien->value);
931 | retval.tokName = toExternal(tok.name());
932 | retval.value = toExternal(tok.value());
933 | }
934 | else {
935 | retval.production = toExternal(std::get<_parser_impl::ustring>(alien->value));
936 | }
937 | retval.line = alien->line;
938 |
939 | for (auto c : alien->children) {
940 | retval.children.push_back(uplift_node(c, idCounter));
941 | }
942 |
943 | return std::move(retval);
944 | }
945 |
946 | /**
947 |  * Uplift a node from the internal pointer-based representation into the
948 |  * external value-semantics representation.
949 |  */
950 | ParseNode uplift_node(_parser_impl::ParseNode* alien) {
951 | int idCounter = 0;
952 | return uplift_node(alien, idCounter);
953 | }
954 |
955 | /**
956 |  * Parse a string and return a value-semantics parse node.
957 |  *
958 |  * @throw std::runtime_error if there is a lex or parse error.
959 |  */
960 | ParseNode parse_string(std::string const& input) {
961 | #ifndef LEMON_PY_SUPPRESS_PYTHON
962 | py::gil_scoped_release _release_GIL;
963 | #endif
964 |
965 | using namespace _parser_impl;
966 | Parser p;
967 | return uplift_node(p.parseString(input));
968 | }
969 |
970 | } // namespace parser
971 |
972 | #ifndef LEMON_PY_SUPPRESS_PYTHON
973 | PYBIND11_MODULE(PYTHON_PARSER_MODULE_NAME, m) {
974 | m.def("parse", &parser::parse_string, "Parse a string into a parse tree.", py::return_value_policy::move);
975 | m.def("dotify", &parser::dotify, "Get a graphviz DOT representation of the parse tree.");
976 |
977 | auto pn = py::class_<parser::ParseNode>(m, "Node")
978 | .def(py::init<>())
979 | .def("__repr__", [](parser::ParseNode const& pn) { return py::str(pn.toString()); }, "Get an approximation of the representation.", py::return_value_policy::take_ownership)
980 | .def("__getitem__",
981 | [](parser::ParseNode const& pn, size_t item) -> py::object {
982 | if (item >= pn.childCount()) return py::none();
983 | return py::cast(pn[item], py::return_value_policy::reference);
984 | },
985 | "Get a child by index. Returns `None` if out of range.")
986 | .def("__iter__", [](parser::ParseNode const& pn) { return py::make_iterator(pn.begin(), pn.end(), py::return_value_policy::reference_internal); }, "Children iterator.")
987 | .def("__len__", [](parser::ParseNode const& pn) { return pn.childCount(); }, "Get number of children.")
988 | .def("as_dict", &parser::ParseNode::asDict, "Make a deep copy of this node and all children to a dictionary representation. `.attr` is ref-copied, but not deep-copied. ", py::return_value_policy::take_ownership)
989 | .def(py::self == py::self)
990 | .def(py::self != py::self)
991 | .def_property_readonly("production", &parser::ParseNode::getProduction, "Get production if non-terminal.", py::return_value_policy::take_ownership) // these return copies of strings
992 | .def_property_readonly("name", &parser::ParseNode::getProduction, "Get production if non-terminal.
(alias for `.production`)", py::return_value_policy::take_ownership) // these return copies of strings 993 | .def_property_readonly("type", &parser::ParseNode::getToken, "Get type if terminal.", py::return_value_policy::take_ownership) 994 | .def_property_readonly("value", &parser::ParseNode::getValue, "Get value if terminal.", py::return_value_policy::take_ownership) 995 | .def_readonly("line", &parser::ParseNode::line, "Line number of appearance.") 996 | .def_readonly("c", &parser::ParseNode::children, "Children.", py::return_value_policy::reference_internal) 997 | .def_readonly("id", &parser::ParseNode::id, "ID number for this node (unique within tree).") 998 | .def_readonly("attr", &parser::ParseNode::attr, "Free-use attributes dictionary."); 999 | } 1000 | #endif 1001 | 1002 | // this next line is used by the codegen aspect to inline token macro definitions 1003 | struct _to_be_replaced_with_token_defines{}; 1004 | 1005 | -------------------------------------------------------------------------------- /src/lemon_py/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aubreyrjones/lemon-py/ff105e369d124afff57b97e06ad9f23017093ecb/src/lemon_py/__init__.py -------------------------------------------------------------------------------- /src/lemon_py/header.lemon: -------------------------------------------------------------------------------- 1 | %name {LemonPyParse} 2 | %token_type { _parser_impl::Token } 3 | %extra_argument { _parser_impl::GrammarActionParserHandle _ } 4 | %default_type { _parser_impl::GrammarActionNodeHandle } 5 | %default_destructor { _.drop_node($$); } 6 | 7 | %syntax_error { _.error(); } 8 | %parse_failure { _.error(); } 9 | %parse_accept { _.success(); } 10 | 11 | -------------------------------------------------------------------------------- /src/lemon_py/lempar.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** 2000-05-29 3 | ** 4 | ** The author disclaims copyright to this source code. In place of 5 | ** a legal notice, here is a blessing: 6 | ** 7 | ** May you do good and not evil. 8 | ** May you find forgiveness for yourself and forgive others. 9 | ** May you share freely, never taking more than you give. 10 | ** 11 | ************************************************************************* 12 | ** Driver template for the LEMON parser generator. 13 | ** 14 | ** The "lemon" program processes an LALR(1) input grammar file, then uses 15 | ** this template to construct a parser. The "lemon" program inserts text 16 | ** at each "%%" line. Also, any "P-a-r-s-e" identifer prefix (without the 17 | ** interstitial "-" characters) contained in this template is changed into 18 | ** the value of the %name directive from the grammar. Otherwise, the content 19 | ** of this template is copied straight through into the generate parser 20 | ** source file. 21 | ** 22 | ** The following is the concatenation of all %include directives from the 23 | ** input grammar file: 24 | */ 25 | /************ Begin %include sections from the grammar ************************/ 26 | %% 27 | /**************** End of %include directives **********************************/ 28 | /* These constants specify the various numeric values for terminal symbols. 
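** For illustration only (names and numbers here are hypothetical): for a
** grammar with tokens ADD, SUB, MUL, and so on, the "%%" line below gets
** replaced with macros along the lines of:
**
**     #define ADD  1
**     #define SUB  2
**     #define MUL  3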
29 | ***************** Begin token definitions *************************************/ 30 | %% 31 | /**************** End token definitions ***************************************/ 32 | 33 | /* The next sections is a series of control #defines. 34 | ** various aspects of the generated parser. 35 | ** YYCODETYPE is the data type used to store the integer codes 36 | ** that represent terminal and non-terminal symbols. 37 | ** "unsigned char" is used if there are fewer than 38 | ** 256 symbols. Larger types otherwise. 39 | ** YYNOCODE is a number of type YYCODETYPE that is not used for 40 | ** any terminal or nonterminal symbol. 41 | ** YYFALLBACK If defined, this indicates that one or more tokens 42 | ** (also known as: "terminal symbols") have fall-back 43 | ** values which should be used if the original symbol 44 | ** would not parse. This permits keywords to sometimes 45 | ** be used as identifiers, for example. 46 | ** YYACTIONTYPE is the data type used for "action codes" - numbers 47 | ** that indicate what to do in response to the next 48 | ** token. 49 | ** ParseTOKENTYPE is the data type used for minor type for terminal 50 | ** symbols. Background: A "minor type" is a semantic 51 | ** value associated with a terminal or non-terminal 52 | ** symbols. For example, for an "ID" terminal symbol, 53 | ** the minor type might be the name of the identifier. 54 | ** Each non-terminal can have a different minor type. 55 | ** Terminal symbols all have the same minor type, though. 56 | ** This macros defines the minor type for terminal 57 | ** symbols. 58 | ** YYMINORTYPE is the data type used for all minor types. 59 | ** This is typically a union of many types, one of 60 | ** which is ParseTOKENTYPE. The entry in the union 61 | ** for terminal symbols is called "yy0". 62 | ** YYSTACKDEPTH is the maximum depth of the parser's stack. If 63 | ** zero the stack is dynamically sized using realloc() 64 | ** ParseARG_SDECL A static variable declaration for the %extra_argument 65 | ** ParseARG_PDECL A parameter declaration for the %extra_argument 66 | ** ParseARG_PARAM Code to pass %extra_argument as a subroutine parameter 67 | ** ParseARG_STORE Code to store %extra_argument into yypParser 68 | ** ParseARG_FETCH Code to extract %extra_argument from yypParser 69 | ** ParseCTX_* As ParseARG_ except for %extra_context 70 | ** YYERRORSYMBOL is the code number of the error symbol. If not 71 | ** defined, then do no error processing. 72 | ** YYNSTATE the combined number of states. 73 | ** YYNRULE the number of rules in the grammar 74 | ** YYNTOKEN Number of terminal symbols 75 | ** YY_MAX_SHIFT Maximum value for shift actions 76 | ** YY_MIN_SHIFTREDUCE Minimum value for shift-reduce actions 77 | ** YY_MAX_SHIFTREDUCE Maximum value for shift-reduce actions 78 | ** YY_ERROR_ACTION The yy_action[] code for syntax error 79 | ** YY_ACCEPT_ACTION The yy_action[] code for accept 80 | ** YY_NO_ACTION The yy_action[] code for no-op 81 | ** YY_MIN_REDUCE Minimum value for reduce actions 82 | ** YY_MAX_REDUCE Maximum value for reduce actions 83 | */ 84 | #ifndef INTERFACE 85 | # define INTERFACE 1 86 | #endif 87 | /************* Begin control #defines *****************************************/ 88 | %% 89 | /************* End control #defines *******************************************/ 90 | #define YY_NLOOKAHEAD ((int)(sizeof(yy_lookahead)/sizeof(yy_lookahead[0]))) 91 | 92 | /* Define the yytestcase() macro to be a no-op if is not already defined 93 | ** otherwise. 
94 | ** 95 | ** Applications can choose to define yytestcase() in the %include section 96 | ** to a macro that can assist in verifying code coverage. For production 97 | ** code the yytestcase() macro should be turned off. But it is useful 98 | ** for testing. 99 | */ 100 | #ifndef yytestcase 101 | # define yytestcase(X) 102 | #endif 103 | 104 | 105 | /* Next are the tables used to determine what action to take based on the 106 | ** current state and lookahead token. These tables are used to implement 107 | ** functions that take a state number and lookahead value and return an 108 | ** action integer. 109 | ** 110 | ** Suppose the action integer is N. Then the action is determined as 111 | ** follows 112 | ** 113 | ** 0 <= N <= YY_MAX_SHIFT Shift N. That is, push the lookahead 114 | ** token onto the stack and goto state N. 115 | ** 116 | ** N between YY_MIN_SHIFTREDUCE Shift to an arbitrary state then 117 | ** and YY_MAX_SHIFTREDUCE reduce by rule N-YY_MIN_SHIFTREDUCE. 118 | ** 119 | ** N == YY_ERROR_ACTION A syntax error has occurred. 120 | ** 121 | ** N == YY_ACCEPT_ACTION The parser accepts its input. 122 | ** 123 | ** N == YY_NO_ACTION No such action. Denotes unused 124 | ** slots in the yy_action[] table. 125 | ** 126 | ** N between YY_MIN_REDUCE Reduce by rule N-YY_MIN_REDUCE 127 | ** and YY_MAX_REDUCE 128 | ** 129 | ** The action table is constructed as a single large table named yy_action[]. 130 | ** Given state S and lookahead X, the action is computed as either: 131 | ** 132 | ** (A) N = yy_action[ yy_shift_ofst[S] + X ] 133 | ** (B) N = yy_default[S] 134 | ** 135 | ** The (A) formula is preferred. The B formula is used instead if 136 | ** yy_lookahead[yy_shift_ofst[S]+X] is not equal to X. 137 | ** 138 | ** The formulas above are for computing the action when the lookahead is 139 | ** a terminal symbol. If the lookahead is a non-terminal (as occurs after 140 | ** a reduce action) then the yy_reduce_ofst[] array is used in place of 141 | ** the yy_shift_ofst[] array. 142 | ** 143 | ** The following are the tables generated in this section: 144 | ** 145 | ** yy_action[] A single table containing all actions. 146 | ** yy_lookahead[] A table containing the lookahead for each entry in 147 | ** yy_action. Used to detect hash collisions. 148 | ** yy_shift_ofst[] For each state, the offset into yy_action for 149 | ** shifting terminals. 150 | ** yy_reduce_ofst[] For each state, the offset into yy_action for 151 | ** shifting non-terminals after a reduce. 152 | ** yy_default[] Default action for each state. 153 | ** 154 | *********** Begin parsing tables **********************************************/ 155 | %% 156 | /********** End of lemon-generated parsing tables *****************************/ 157 | 158 | /* The next table maps tokens (terminal symbols) into fallback tokens. 159 | ** If a construct like the following: 160 | ** 161 | ** %fallback ID X Y Z. 162 | ** 163 | ** appears in the grammar, then ID becomes a fallback token for X, Y, 164 | ** and Z. Whenever one of the tokens X, Y, or Z is input to the parser 165 | ** but it does not parse, the type of the token is changed to ID and 166 | ** the parse is retried before an error is thrown. 167 | ** 168 | ** This feature can be used, for example, to cause some keywords in a language 169 | ** to revert to identifiers if they keyword does not apply in the context where 170 | ** it appears. 
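**
** A hypothetical grammar fragment using this feature:
**
**     %fallback IDENT  IF ELSE WHILE.
**
** would let IF, ELSE, and WHILE be retried as plain IDENT tokens wherever
** the keyword reading does not parse.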
171 | */ 172 | #ifdef YYFALLBACK 173 | static const YYCODETYPE yyFallback[] = { 174 | %% 175 | }; 176 | #endif /* YYFALLBACK */ 177 | 178 | /* The following structure represents a single element of the 179 | ** parser's stack. Information stored includes: 180 | ** 181 | ** + The state number for the parser at this level of the stack. 182 | ** 183 | ** + The value of the token stored at this level of the stack. 184 | ** (In other words, the "major" token.) 185 | ** 186 | ** + The semantic value stored at this level of the stack. This is 187 | ** the information used by the action routines in the grammar. 188 | ** It is sometimes called the "minor" token. 189 | ** 190 | ** After the "shift" half of a SHIFTREDUCE action, the stateno field 191 | ** actually contains the reduce action for the second half of the 192 | ** SHIFTREDUCE. 193 | */ 194 | struct yyStackEntry { 195 | YYACTIONTYPE stateno; /* The state-number, or reduce action in SHIFTREDUCE */ 196 | YYCODETYPE major; /* The major token value. This is the code 197 | ** number for the token at this stack level */ 198 | YYMINORTYPE minor; /* The user-supplied minor token value. This 199 | ** is the value of the token */ 200 | }; 201 | typedef struct yyStackEntry yyStackEntry; 202 | 203 | /* The state of the parser is completely contained in an instance of 204 | ** the following structure */ 205 | struct yyParser { 206 | yyStackEntry *yytos; /* Pointer to top element of the stack */ 207 | #ifdef YYTRACKMAXSTACKDEPTH 208 | int yyhwm; /* High-water mark of the stack */ 209 | #endif 210 | #ifndef YYNOERRORRECOVERY 211 | int yyerrcnt; /* Shifts left before out of the error */ 212 | #endif 213 | ParseARG_SDECL /* A place to hold %extra_argument */ 214 | ParseCTX_SDECL /* A place to hold %extra_context */ 215 | #if YYSTACKDEPTH<=0 216 | int yystksz; /* Current side of the stack */ 217 | yyStackEntry *yystack; /* The parser's stack */ 218 | yyStackEntry yystk0; /* First stack entry */ 219 | #else 220 | yyStackEntry yystack[YYSTACKDEPTH]; /* The parser's stack */ 221 | yyStackEntry *yystackEnd; /* Last entry in the stack */ 222 | #endif 223 | }; 224 | typedef struct yyParser yyParser; 225 | 226 | #ifndef NDEBUG 227 | #include 228 | #include 229 | static FILE *yyTraceFILE = 0; 230 | static char *yyTracePrompt = 0; 231 | #endif /* NDEBUG */ 232 | 233 | #ifndef NDEBUG 234 | /* 235 | ** Turn parser tracing on by giving a stream to which to write the trace 236 | ** and a prompt to preface each trace message. Tracing is turned off 237 | ** by making either argument NULL 238 | ** 239 | ** Inputs: 240 | **
241 | **    - A FILE* to which trace output should be written.
242 | **      If NULL, then tracing is turned off.
243 | **    - A prefix string written at the beginning of every
244 | **      line of trace output.  If NULL, then tracing is
245 | **      turned off.
246 | **
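**
** A minimal usage sketch (application code, not part of this template):
**
**     ParseTrace(stderr, "trace: ");   // turn tracing on
**     ParseTrace(NULL, NULL);          // turn it back off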
247 | ** 248 | ** Outputs: 249 | ** None. 250 | */ 251 | void ParseTrace(FILE *TraceFILE, char *zTracePrompt){ 252 | yyTraceFILE = TraceFILE; 253 | yyTracePrompt = zTracePrompt; 254 | if( yyTraceFILE==0 ) yyTracePrompt = 0; 255 | else if( yyTracePrompt==0 ) yyTraceFILE = 0; 256 | } 257 | #endif /* NDEBUG */ 258 | 259 | #if defined(YYCOVERAGE) || !defined(NDEBUG) 260 | /* For tracing shifts, the names of all terminals and nonterminals 261 | ** are required. The following table supplies these names */ 262 | static const char *const yyTokenName[] = { 263 | %% 264 | }; 265 | #endif /* defined(YYCOVERAGE) || !defined(NDEBUG) */ 266 | 267 | #ifndef NDEBUG 268 | /* For tracing reduce actions, the names of all rules are required. 269 | */ 270 | static const char *const yyRuleName[] = { 271 | %% 272 | }; 273 | #endif /* NDEBUG */ 274 | 275 | 276 | #if YYSTACKDEPTH<=0 277 | /* 278 | ** Try to increase the size of the parser stack. Return the number 279 | ** of errors. Return 0 on success. 280 | */ 281 | static int yyGrowStack(yyParser *p){ 282 | int newSize; 283 | int idx; 284 | yyStackEntry *pNew; 285 | 286 | newSize = p->yystksz*2 + 100; 287 | idx = p->yytos ? (int)(p->yytos - p->yystack) : 0; 288 | if( p->yystack==&p->yystk0 ){ 289 | pNew = malloc(newSize*sizeof(pNew[0])); 290 | if( pNew ) pNew[0] = p->yystk0; 291 | }else{ 292 | pNew = realloc(p->yystack, newSize*sizeof(pNew[0])); 293 | } 294 | if( pNew ){ 295 | p->yystack = pNew; 296 | p->yytos = &p->yystack[idx]; 297 | #ifndef NDEBUG 298 | if( yyTraceFILE ){ 299 | fprintf(yyTraceFILE,"%sStack grows from %d to %d entries.\n", 300 | yyTracePrompt, p->yystksz, newSize); 301 | } 302 | #endif 303 | p->yystksz = newSize; 304 | } 305 | return pNew==0; 306 | } 307 | #endif 308 | 309 | /* Datatype of the argument to the memory allocated passed as the 310 | ** second argument to ParseAlloc() below. This can be changed by 311 | ** putting an appropriate #define in the %include section of the input 312 | ** grammar. 313 | */ 314 | #ifndef YYMALLOCARGTYPE 315 | # define YYMALLOCARGTYPE size_t 316 | #endif 317 | 318 | /* Initialize a new parser that has already been allocated. 319 | */ 320 | void ParseInit(void *yypRawParser ParseCTX_PDECL){ 321 | yyParser *yypParser = (yyParser*)yypRawParser; 322 | ParseCTX_STORE 323 | #ifdef YYTRACKMAXSTACKDEPTH 324 | yypParser->yyhwm = 0; 325 | #endif 326 | #if YYSTACKDEPTH<=0 327 | yypParser->yytos = NULL; 328 | yypParser->yystack = NULL; 329 | yypParser->yystksz = 0; 330 | if( yyGrowStack(yypParser) ){ 331 | yypParser->yystack = &yypParser->yystk0; 332 | yypParser->yystksz = 1; 333 | } 334 | #endif 335 | #ifndef YYNOERRORRECOVERY 336 | yypParser->yyerrcnt = -1; 337 | #endif 338 | yypParser->yytos = yypParser->yystack; 339 | yypParser->yystack[0].stateno = 0; 340 | yypParser->yystack[0].major = 0; 341 | #if YYSTACKDEPTH>0 342 | yypParser->yystackEnd = &yypParser->yystack[YYSTACKDEPTH-1]; 343 | #endif 344 | } 345 | 346 | #ifndef Parse_ENGINEALWAYSONSTACK 347 | /* 348 | ** This function allocates a new parser. 349 | ** The only argument is a pointer to a function which works like 350 | ** malloc. 351 | ** 352 | ** Inputs: 353 | ** A pointer to the function used to allocate memory. 354 | ** 355 | ** Outputs: 356 | ** A pointer to a parser. This pointer is used in subsequent calls 357 | ** to Parse and ParseFree. 
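**
** Sketch of the expected lifecycle (lemon-py drives these through the
** %name-prefixed LemonPyParseAlloc and LemonPyParseFree, as seen in
** ParserImpl.cpp):
**
**     void *pParser = ParseAlloc(malloc);
**     ... feed tokens with Parse(pParser, ...) ...
**     ParseFree(pParser, free);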
358 | */ 359 | void *ParseAlloc(void *(*mallocProc)(YYMALLOCARGTYPE) ParseCTX_PDECL){ 360 | yyParser *yypParser; 361 | yypParser = (yyParser*)(*mallocProc)( (YYMALLOCARGTYPE)sizeof(yyParser) ); 362 | if( yypParser ){ 363 | ParseCTX_STORE 364 | ParseInit(yypParser ParseCTX_PARAM); 365 | } 366 | return (void*)yypParser; 367 | } 368 | #endif /* Parse_ENGINEALWAYSONSTACK */ 369 | 370 | 371 | /* The following function deletes the "minor type" or semantic value 372 | ** associated with a symbol. The symbol can be either a terminal 373 | ** or nonterminal. "yymajor" is the symbol code, and "yypminor" is 374 | ** a pointer to the value to be deleted. The code used to do the 375 | ** deletions is derived from the %destructor and/or %token_destructor 376 | ** directives of the input grammar. 377 | */ 378 | static void yy_destructor( 379 | yyParser *yypParser, /* The parser */ 380 | YYCODETYPE yymajor, /* Type code for object to destroy */ 381 | YYMINORTYPE *yypminor /* The object to be destroyed */ 382 | ){ 383 | ParseARG_FETCH 384 | ParseCTX_FETCH 385 | switch( yymajor ){ 386 | /* Here is inserted the actions which take place when a 387 | ** terminal or non-terminal is destroyed. This can happen 388 | ** when the symbol is popped from the stack during a 389 | ** reduce or during error processing or when a parser is 390 | ** being destroyed before it is finished parsing. 391 | ** 392 | ** Note: during a reduce, the only symbols destroyed are those 393 | ** which appear on the RHS of the rule, but which are *not* used 394 | ** inside the C code. 395 | */ 396 | /********* Begin destructor definitions ***************************************/ 397 | %% 398 | /********* End destructor definitions *****************************************/ 399 | default: break; /* If no destructor action specified: do nothing */ 400 | } 401 | } 402 | 403 | /* 404 | ** Pop the parser's stack once. 405 | ** 406 | ** If there is a destructor routine associated with the token which 407 | ** is popped from the stack, then call it. 408 | */ 409 | static void yy_pop_parser_stack(yyParser *pParser){ 410 | yyStackEntry *yytos; 411 | assert( pParser->yytos!=0 ); 412 | assert( pParser->yytos > pParser->yystack ); 413 | yytos = pParser->yytos--; 414 | #ifndef NDEBUG 415 | if( yyTraceFILE ){ 416 | fprintf(yyTraceFILE,"%sPopping %s\n", 417 | yyTracePrompt, 418 | yyTokenName[yytos->major]); 419 | } 420 | #endif 421 | yy_destructor(pParser, yytos->major, &yytos->minor); 422 | } 423 | 424 | /* 425 | ** Clear all secondary memory allocations from the parser 426 | */ 427 | void ParseFinalize(void *p){ 428 | yyParser *pParser = (yyParser*)p; 429 | while( pParser->yytos>pParser->yystack ) yy_pop_parser_stack(pParser); 430 | #if YYSTACKDEPTH<=0 431 | if( pParser->yystack!=&pParser->yystk0 ) free(pParser->yystack); 432 | #endif 433 | } 434 | 435 | #ifndef Parse_ENGINEALWAYSONSTACK 436 | /* 437 | ** Deallocate and destroy a parser. Destructors are called for 438 | ** all stack elements before shutting the parser down. 439 | ** 440 | ** If the YYPARSEFREENEVERNULL macro exists (for example because it 441 | ** is defined in a %include section of the input grammar) then it is 442 | ** assumed that the input pointer is never NULL. 
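**
** For example, a grammar could opt in with (illustrative):
**
**     %include { #define YYPARSEFREENEVERNULL 1 }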
443 | */ 444 | void ParseFree( 445 | void *p, /* The parser to be deleted */ 446 | void (*freeProc)(void*) /* Function used to reclaim memory */ 447 | ){ 448 | #ifndef YYPARSEFREENEVERNULL 449 | if( p==0 ) return; 450 | #endif 451 | ParseFinalize(p); 452 | (*freeProc)(p); 453 | } 454 | #endif /* Parse_ENGINEALWAYSONSTACK */ 455 | 456 | /* 457 | ** Return the peak depth of the stack for a parser. 458 | */ 459 | #ifdef YYTRACKMAXSTACKDEPTH 460 | int ParseStackPeak(void *p){ 461 | yyParser *pParser = (yyParser*)p; 462 | return pParser->yyhwm; 463 | } 464 | #endif 465 | 466 | /* This array of booleans keeps track of the parser statement 467 | ** coverage. The element yycoverage[X][Y] is set when the parser 468 | ** is in state X and has a lookahead token Y. In a well-tested 469 | ** systems, every element of this matrix should end up being set. 470 | */ 471 | #if defined(YYCOVERAGE) 472 | static unsigned char yycoverage[YYNSTATE][YYNTOKEN]; 473 | #endif 474 | 475 | /* 476 | ** Write into out a description of every state/lookahead combination that 477 | ** 478 | ** (1) has not been used by the parser, and 479 | ** (2) is not a syntax error. 480 | ** 481 | ** Return the number of missed state/lookahead combinations. 482 | */ 483 | #if defined(YYCOVERAGE) 484 | int ParseCoverage(FILE *out){ 485 | int stateno, iLookAhead, i; 486 | int nMissed = 0; 487 | for(stateno=0; statenoYY_MAX_SHIFT ) return stateno; 514 | assert( stateno <= YY_SHIFT_COUNT ); 515 | #if defined(YYCOVERAGE) 516 | yycoverage[stateno][iLookAhead] = 1; 517 | #endif 518 | do{ 519 | i = yy_shift_ofst[stateno]; 520 | assert( i>=0 ); 521 | assert( i<=YY_ACTTAB_COUNT ); 522 | assert( i+YYNTOKEN<=(int)YY_NLOOKAHEAD ); 523 | assert( iLookAhead!=YYNOCODE ); 524 | assert( iLookAhead < YYNTOKEN ); 525 | i += iLookAhead; 526 | assert( i<(int)YY_NLOOKAHEAD ); 527 | if( yy_lookahead[i]!=iLookAhead ){ 528 | #ifdef YYFALLBACK 529 | YYCODETYPE iFallback; /* Fallback token */ 530 | assert( iLookAhead %s\n", 536 | yyTracePrompt, yyTokenName[iLookAhead], yyTokenName[iFallback]); 537 | } 538 | #endif 539 | assert( yyFallback[iFallback]==0 ); /* Fallback loop must terminate */ 540 | iLookAhead = iFallback; 541 | continue; 542 | } 543 | #endif 544 | #ifdef YYWILDCARD 545 | { 546 | int j = i - iLookAhead + YYWILDCARD; 547 | assert( j<(int)(sizeof(yy_lookahead)/sizeof(yy_lookahead[0])) ); 548 | if( yy_lookahead[j]==YYWILDCARD && iLookAhead>0 ){ 549 | #ifndef NDEBUG 550 | if( yyTraceFILE ){ 551 | fprintf(yyTraceFILE, "%sWILDCARD %s => %s\n", 552 | yyTracePrompt, yyTokenName[iLookAhead], 553 | yyTokenName[YYWILDCARD]); 554 | } 555 | #endif /* NDEBUG */ 556 | return yy_action[j]; 557 | } 558 | } 559 | #endif /* YYWILDCARD */ 560 | return yy_default[stateno]; 561 | }else{ 562 | assert( i>=0 && i<(int)(sizeof(yy_action)/sizeof(yy_action[0])) ); 563 | return yy_action[i]; 564 | } 565 | }while(1); 566 | } 567 | 568 | /* 569 | ** Find the appropriate action for a parser given the non-terminal 570 | ** look-ahead token iLookAhead. 
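**
** Conceptually, with state S below the popped right-hand side and G the
** rule's left-hand-side symbol number, the lookup is:
**
**     act = yy_action[ yy_reduce_ofst[S] + G ];   // else yy_default[S]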
571 | */ 572 | static YYACTIONTYPE yy_find_reduce_action( 573 | YYACTIONTYPE stateno, /* Current state number */ 574 | YYCODETYPE iLookAhead /* The look-ahead token */ 575 | ){ 576 | int i; 577 | #ifdef YYERRORSYMBOL 578 | if( stateno>YY_REDUCE_COUNT ){ 579 | return yy_default[stateno]; 580 | } 581 | #else 582 | assert( stateno<=YY_REDUCE_COUNT ); 583 | #endif 584 | i = yy_reduce_ofst[stateno]; 585 | assert( iLookAhead!=YYNOCODE ); 586 | i += iLookAhead; 587 | #ifdef YYERRORSYMBOL 588 | if( i<0 || i>=YY_ACTTAB_COUNT || yy_lookahead[i]!=iLookAhead ){ 589 | return yy_default[stateno]; 590 | } 591 | #else 592 | assert( i>=0 && iyytos>yypParser->yystack ) yy_pop_parser_stack(yypParser); 610 | /* Here code is inserted which will execute if the parser 611 | ** stack every overflows */ 612 | /******** Begin %stack_overflow code ******************************************/ 613 | %% 614 | /******** End %stack_overflow code ********************************************/ 615 | ParseARG_STORE /* Suppress warning about unused %extra_argument var */ 616 | ParseCTX_STORE 617 | } 618 | 619 | /* 620 | ** Print tracing information for a SHIFT action 621 | */ 622 | #ifndef NDEBUG 623 | static void yyTraceShift(yyParser *yypParser, int yyNewState, const char *zTag){ 624 | if( yyTraceFILE ){ 625 | if( yyNewStateyytos->major], 628 | yyNewState); 629 | }else{ 630 | fprintf(yyTraceFILE,"%s%s '%s', pending reduce %d\n", 631 | yyTracePrompt, zTag, yyTokenName[yypParser->yytos->major], 632 | yyNewState - YY_MIN_REDUCE); 633 | } 634 | } 635 | } 636 | #else 637 | # define yyTraceShift(X,Y,Z) 638 | #endif 639 | 640 | /* 641 | ** Perform a shift action. 642 | */ 643 | static void yy_shift( 644 | yyParser *yypParser, /* The parser to be shifted */ 645 | YYACTIONTYPE yyNewState, /* The new state to shift in */ 646 | YYCODETYPE yyMajor, /* The major token to shift in */ 647 | ParseTOKENTYPE yyMinor /* The minor token to shift in */ 648 | ){ 649 | yyStackEntry *yytos; 650 | yypParser->yytos++; 651 | #ifdef YYTRACKMAXSTACKDEPTH 652 | if( (int)(yypParser->yytos - yypParser->yystack)>yypParser->yyhwm ){ 653 | yypParser->yyhwm++; 654 | assert( yypParser->yyhwm == (int)(yypParser->yytos - yypParser->yystack) ); 655 | } 656 | #endif 657 | #if YYSTACKDEPTH>0 658 | if( yypParser->yytos>yypParser->yystackEnd ){ 659 | yypParser->yytos--; 660 | yyStackOverflow(yypParser); 661 | return; 662 | } 663 | #else 664 | if( yypParser->yytos>=&yypParser->yystack[yypParser->yystksz] ){ 665 | if( yyGrowStack(yypParser) ){ 666 | yypParser->yytos--; 667 | yyStackOverflow(yypParser); 668 | return; 669 | } 670 | } 671 | #endif 672 | if( yyNewState > YY_MAX_SHIFT ){ 673 | yyNewState += YY_MIN_REDUCE - YY_MIN_SHIFTREDUCE; 674 | } 675 | yytos = yypParser->yytos; 676 | yytos->stateno = yyNewState; 677 | yytos->major = yyMajor; 678 | yytos->minor.yy0 = yyMinor; 679 | yyTraceShift(yypParser, yyNewState, "Shift"); 680 | } 681 | 682 | /* For rule J, yyRuleInfoLhs[J] contains the symbol on the left-hand side 683 | ** of that rule */ 684 | static const YYCODETYPE yyRuleInfoLhs[] = { 685 | %% 686 | }; 687 | 688 | /* For rule J, yyRuleInfoNRhs[J] contains the negative of the number 689 | ** of symbols on the right-hand side of that rule. */ 690 | static const signed char yyRuleInfoNRhs[] = { 691 | %% 692 | }; 693 | 694 | static void yy_accept(yyParser*); /* Forward Declaration */ 695 | 696 | /* 697 | ** Perform a reduce action and the shift that must immediately 698 | ** follow the reduce. 
699 | ** 700 | ** The yyLookahead and yyLookaheadToken parameters provide reduce actions 701 | ** access to the lookahead token (if any). The yyLookahead will be YYNOCODE 702 | ** if the lookahead token has already been consumed. As this procedure is 703 | ** only called from one place, optimizing compilers will in-line it, which 704 | ** means that the extra parameters have no performance impact. 705 | */ 706 | static YYACTIONTYPE yy_reduce( 707 | yyParser *yypParser, /* The parser */ 708 | unsigned int yyruleno, /* Number of the rule by which to reduce */ 709 | int yyLookahead, /* Lookahead token, or YYNOCODE if none */ 710 | ParseTOKENTYPE yyLookaheadToken /* Value of the lookahead token */ 711 | ParseCTX_PDECL /* %extra_context */ 712 | ){ 713 | int yygoto; /* The next state */ 714 | YYACTIONTYPE yyact; /* The next action */ 715 | yyStackEntry *yymsp; /* The top of the parser's stack */ 716 | int yysize; /* Amount to pop the stack */ 717 | ParseARG_FETCH 718 | (void)yyLookahead; 719 | (void)yyLookaheadToken; 720 | yymsp = yypParser->yytos; 721 | 722 | switch( yyruleno ){ 723 | /* Beginning here are the reduction cases. A typical example 724 | ** follows: 725 | ** case 0: 726 | ** #line 727 | ** { ... } // User supplied code 728 | ** #line 729 | ** break; 730 | */ 731 | /********** Begin reduce actions **********************************************/ 732 | %% 733 | /********** End reduce actions ************************************************/ 734 | }; 735 | assert( yyrulenoYY_MAX_SHIFT && yyact<=YY_MAX_SHIFTREDUCE) ); 743 | 744 | /* It is not possible for a REDUCE to be followed by an error */ 745 | assert( yyact!=YY_ERROR_ACTION ); 746 | 747 | yymsp += yysize+1; 748 | yypParser->yytos = yymsp; 749 | yymsp->stateno = (YYACTIONTYPE)yyact; 750 | yymsp->major = (YYCODETYPE)yygoto; 751 | yyTraceShift(yypParser, yyact, "... then shift"); 752 | return yyact; 753 | } 754 | 755 | /* 756 | ** The following code executes when the parse fails 757 | */ 758 | #ifndef YYNOERRORRECOVERY 759 | static void yy_parse_failed( 760 | yyParser *yypParser /* The parser */ 761 | ){ 762 | ParseARG_FETCH 763 | ParseCTX_FETCH 764 | #ifndef NDEBUG 765 | if( yyTraceFILE ){ 766 | fprintf(yyTraceFILE,"%sFail!\n",yyTracePrompt); 767 | } 768 | #endif 769 | while( yypParser->yytos>yypParser->yystack ) yy_pop_parser_stack(yypParser); 770 | /* Here code is inserted which will be executed whenever the 771 | ** parser fails */ 772 | /************ Begin %parse_failure code ***************************************/ 773 | %% 774 | /************ End %parse_failure code *****************************************/ 775 | ParseARG_STORE /* Suppress warning about unused %extra_argument variable */ 776 | ParseCTX_STORE 777 | } 778 | #endif /* YYNOERRORRECOVERY */ 779 | 780 | /* 781 | ** The following code executes when a syntax error first occurs. 
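**
** In lemon-py, header.lemon fills the %syntax_error block below with:
**
**     _.error();   // throws std::runtime_error("Parse error on token: ...")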
782 | */ 783 | static void yy_syntax_error( 784 | yyParser *yypParser, /* The parser */ 785 | int yymajor, /* The major type of the error token */ 786 | ParseTOKENTYPE yyminor /* The minor type of the error token */ 787 | ){ 788 | ParseARG_FETCH 789 | ParseCTX_FETCH 790 | #define TOKEN yyminor 791 | /************ Begin %syntax_error code ****************************************/ 792 | %% 793 | /************ End %syntax_error code ******************************************/ 794 | ParseARG_STORE /* Suppress warning about unused %extra_argument variable */ 795 | ParseCTX_STORE 796 | } 797 | 798 | /* 799 | ** The following is executed when the parser accepts 800 | */ 801 | static void yy_accept( 802 | yyParser *yypParser /* The parser */ 803 | ){ 804 | ParseARG_FETCH 805 | ParseCTX_FETCH 806 | #ifndef NDEBUG 807 | if( yyTraceFILE ){ 808 | fprintf(yyTraceFILE,"%sAccept!\n",yyTracePrompt); 809 | } 810 | #endif 811 | #ifndef YYNOERRORRECOVERY 812 | yypParser->yyerrcnt = -1; 813 | #endif 814 | assert( yypParser->yytos==yypParser->yystack ); 815 | /* Here code is inserted which will be executed whenever the 816 | ** parser accepts */ 817 | /*********** Begin %parse_accept code *****************************************/ 818 | %% 819 | /*********** End %parse_accept code *******************************************/ 820 | ParseARG_STORE /* Suppress warning about unused %extra_argument variable */ 821 | ParseCTX_STORE 822 | } 823 | 824 | /* The main parser program. 825 | ** The first argument is a pointer to a structure obtained from 826 | ** "ParseAlloc" which describes the current state of the parser. 827 | ** The second argument is the major token number. The third is 828 | ** the minor token. The fourth optional argument is whatever the 829 | ** user wants (and specified in the grammar) and is available for 830 | ** use by the action routines. 831 | ** 832 | ** Inputs: 833 | **
834 | **    - A pointer to the parser (an opaque structure.)
835 | **    - The major token number.
836 | **    - The minor token number.
837 | **    - An optional argument of a grammar-specified type.
838 | **
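**
** Illustrative driving loop (lemon-py's Parser::offerToken makes the same
** call, passing its GrammarActionParserHandle as the extra argument; tok,
** eofTok, and handle are hypothetical names):
**
**     Parse(pParser, tok.type, tok, handle);   // once per lexed token
**     Parse(pParser, 0, eofTok, handle);       // a major token of 0 ends the input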
839 | ** 840 | ** Outputs: 841 | ** None. 842 | */ 843 | void Parse( 844 | void *yyp, /* The parser */ 845 | int yymajor, /* The major token code number */ 846 | ParseTOKENTYPE yyminor /* The value for the token */ 847 | ParseARG_PDECL /* Optional %extra_argument parameter */ 848 | ){ 849 | YYMINORTYPE yyminorunion; 850 | YYACTIONTYPE yyact; /* The parser action. */ 851 | #if !defined(YYERRORSYMBOL) && !defined(YYNOERRORRECOVERY) 852 | int yyendofinput; /* True if we are at the end of input */ 853 | #endif 854 | #ifdef YYERRORSYMBOL 855 | int yyerrorhit = 0; /* True if yymajor has invoked an error */ 856 | #endif 857 | yyParser *yypParser = (yyParser*)yyp; /* The parser */ 858 | ParseCTX_FETCH 859 | ParseARG_STORE 860 | 861 | assert( yypParser->yytos!=0 ); 862 | #if !defined(YYERRORSYMBOL) && !defined(YYNOERRORRECOVERY) 863 | yyendofinput = (yymajor==0); 864 | #endif 865 | 866 | yyact = yypParser->yytos->stateno; 867 | #ifndef NDEBUG 868 | if( yyTraceFILE ){ 869 | if( yyact < YY_MIN_REDUCE ){ 870 | fprintf(yyTraceFILE,"%sInput '%s' in state %d\n", 871 | yyTracePrompt,yyTokenName[yymajor],yyact); 872 | }else{ 873 | fprintf(yyTraceFILE,"%sInput '%s' with pending reduce %d\n", 874 | yyTracePrompt,yyTokenName[yymajor],yyact-YY_MIN_REDUCE); 875 | } 876 | } 877 | #endif 878 | 879 | while(1){ /* Exit by "break" */ 880 | assert( yypParser->yytos>=yypParser->yystack ); 881 | assert( yyact==yypParser->yytos->stateno ); 882 | yyact = yy_find_shift_action((YYCODETYPE)yymajor,yyact); 883 | if( yyact >= YY_MIN_REDUCE ){ 884 | unsigned int yyruleno = yyact - YY_MIN_REDUCE; /* Reduce by this rule */ 885 | assert( yyruleno<(int)(sizeof(yyRuleName)/sizeof(yyRuleName[0])) ); 886 | #ifndef NDEBUG 887 | if( yyTraceFILE ){ 888 | int yysize = yyRuleInfoNRhs[yyruleno]; 889 | if( yysize ){ 890 | fprintf(yyTraceFILE, "%sReduce %d [%s]%s, pop back to state %d.\n", 891 | yyTracePrompt, 892 | yyruleno, yyRuleName[yyruleno], 893 | yyrulenoyytos[yysize].stateno); 895 | }else{ 896 | fprintf(yyTraceFILE, "%sReduce %d [%s]%s.\n", 897 | yyTracePrompt, yyruleno, yyRuleName[yyruleno], 898 | yyrulenoyytos - yypParser->yystack)>yypParser->yyhwm ){ 909 | yypParser->yyhwm++; 910 | assert( yypParser->yyhwm == 911 | (int)(yypParser->yytos - yypParser->yystack)); 912 | } 913 | #endif 914 | #if YYSTACKDEPTH>0 915 | if( yypParser->yytos>=yypParser->yystackEnd ){ 916 | yyStackOverflow(yypParser); 917 | break; 918 | } 919 | #else 920 | if( yypParser->yytos>=&yypParser->yystack[yypParser->yystksz-1] ){ 921 | if( yyGrowStack(yypParser) ){ 922 | yyStackOverflow(yypParser); 923 | break; 924 | } 925 | } 926 | #endif 927 | } 928 | yyact = yy_reduce(yypParser,yyruleno,yymajor,yyminor ParseCTX_PARAM); 929 | }else if( yyact <= YY_MAX_SHIFTREDUCE ){ 930 | yy_shift(yypParser,yyact,(YYCODETYPE)yymajor,yyminor); 931 | #ifndef YYNOERRORRECOVERY 932 | yypParser->yyerrcnt--; 933 | #endif 934 | break; 935 | }else if( yyact==YY_ACCEPT_ACTION ){ 936 | yypParser->yytos--; 937 | yy_accept(yypParser); 938 | return; 939 | }else{ 940 | assert( yyact == YY_ERROR_ACTION ); 941 | yyminorunion.yy0 = yyminor; 942 | #ifdef YYERRORSYMBOL 943 | int yymx; 944 | #endif 945 | #ifndef NDEBUG 946 | if( yyTraceFILE ){ 947 | fprintf(yyTraceFILE,"%sSyntax Error!\n",yyTracePrompt); 948 | } 949 | #endif 950 | #ifdef YYERRORSYMBOL 951 | /* A syntax error has occurred. 952 | ** The response to an error depends upon whether or not the 953 | ** grammar defines an error token "ERROR". 
954 | ** 955 | ** This is what we do if the grammar does define ERROR: 956 | ** 957 | ** * Call the %syntax_error function. 958 | ** 959 | ** * Begin popping the stack until we enter a state where 960 | ** it is legal to shift the error symbol, then shift 961 | ** the error symbol. 962 | ** 963 | ** * Set the error count to three. 964 | ** 965 | ** * Begin accepting and shifting new tokens. No new error 966 | ** processing will occur until three tokens have been 967 | ** shifted successfully. 968 | ** 969 | */ 970 | if( yypParser->yyerrcnt<0 ){ 971 | yy_syntax_error(yypParser,yymajor,yyminor); 972 | } 973 | yymx = yypParser->yytos->major; 974 | if( yymx==YYERRORSYMBOL || yyerrorhit ){ 975 | #ifndef NDEBUG 976 | if( yyTraceFILE ){ 977 | fprintf(yyTraceFILE,"%sDiscard input token %s\n", 978 | yyTracePrompt,yyTokenName[yymajor]); 979 | } 980 | #endif 981 | yy_destructor(yypParser, (YYCODETYPE)yymajor, &yyminorunion); 982 | yymajor = YYNOCODE; 983 | }else{ 984 | while( yypParser->yytos >= yypParser->yystack 985 | && (yyact = yy_find_reduce_action( 986 | yypParser->yytos->stateno, 987 | YYERRORSYMBOL)) > YY_MAX_SHIFTREDUCE 988 | ){ 989 | yy_pop_parser_stack(yypParser); 990 | } 991 | if( yypParser->yytos < yypParser->yystack || yymajor==0 ){ 992 | yy_destructor(yypParser,(YYCODETYPE)yymajor,&yyminorunion); 993 | yy_parse_failed(yypParser); 994 | #ifndef YYNOERRORRECOVERY 995 | yypParser->yyerrcnt = -1; 996 | #endif 997 | yymajor = YYNOCODE; 998 | }else if( yymx!=YYERRORSYMBOL ){ 999 | yy_shift(yypParser,yyact,YYERRORSYMBOL,yyminor); 1000 | } 1001 | } 1002 | yypParser->yyerrcnt = 3; 1003 | yyerrorhit = 1; 1004 | if( yymajor==YYNOCODE ) break; 1005 | yyact = yypParser->yytos->stateno; 1006 | #elif defined(YYNOERRORRECOVERY) 1007 | /* If the YYNOERRORRECOVERY macro is defined, then do not attempt to 1008 | ** do any kind of error recovery. Instead, simply invoke the syntax 1009 | ** error routine and continue going as if nothing had happened. 1010 | ** 1011 | ** Applications can set this macro (for example inside %include) if 1012 | ** they intend to abandon the parse upon the first syntax error seen. 1013 | */ 1014 | yy_syntax_error(yypParser,yymajor, yyminor); 1015 | yy_destructor(yypParser,(YYCODETYPE)yymajor,&yyminorunion); 1016 | break; 1017 | #else /* YYERRORSYMBOL is not defined */ 1018 | /* This is what we do if the grammar does not define ERROR: 1019 | ** 1020 | ** * Report an error message, and throw away the input token. 1021 | ** 1022 | ** * If the input token is $, then fail the parse. 1023 | ** 1024 | ** As before, subsequent error messages are suppressed until 1025 | ** three input tokens have been successfully shifted. 1026 | */ 1027 | if( yypParser->yyerrcnt<=0 ){ 1028 | yy_syntax_error(yypParser,yymajor, yyminor); 1029 | } 1030 | yypParser->yyerrcnt = 3; 1031 | yy_destructor(yypParser,(YYCODETYPE)yymajor,&yyminorunion); 1032 | if( yyendofinput ){ 1033 | yy_parse_failed(yypParser); 1034 | #ifndef YYNOERRORRECOVERY 1035 | yypParser->yyerrcnt = -1; 1036 | #endif 1037 | } 1038 | break; 1039 | #endif 1040 | } 1041 | } 1042 | #ifndef NDEBUG 1043 | if( yyTraceFILE ){ 1044 | yyStackEntry *i; 1045 | char cDiv = '['; 1046 | fprintf(yyTraceFILE,"%sReturn. 
Stack=",yyTracePrompt); 1047 | for(i=&yypParser->yystack[1]; i<=yypParser->yytos; i++){ 1048 | fprintf(yyTraceFILE,"%c%s", cDiv, yyTokenName[i->major]); 1049 | cDiv = ' '; 1050 | } 1051 | fprintf(yyTraceFILE,"]\n"); 1052 | } 1053 | #endif 1054 | return; 1055 | } 1056 | 1057 | /* 1058 | ** Return the fallback token corresponding to canonical token iToken, or 1059 | ** 0 if iToken has no fallback. 1060 | */ 1061 | int ParseFallback(int iToken){ 1062 | #ifdef YYFALLBACK 1063 | assert( iToken<(int)(sizeof(yyFallback)/sizeof(yyFallback[0])) ); 1064 | return yyFallback[iToken]; 1065 | #else 1066 | (void)iToken; 1067 | return 0; 1068 | #endif 1069 | } 1070 | -------------------------------------------------------------------------------- /src/lemon_py/utf.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2006-2016 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | // Determine the C++ standard version. 36 | // If the user defines UTF_CPP_CPLUSPLUS, use that. 37 | // Otherwise, trust the unreliable predefined macro __cplusplus 38 | 39 | #if !defined UTF_CPP_CPLUSPLUS 40 | #define UTF_CPP_CPLUSPLUS __cplusplus 41 | #endif 42 | 43 | #if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later 44 | #define UTF_CPP_OVERRIDE override 45 | #define UTF_CPP_NOEXCEPT noexcept 46 | #else // C++ 98/03 47 | #define UTF_CPP_OVERRIDE 48 | #define UTF_CPP_NOEXCEPT throw() 49 | #endif // C++ 11 or later 50 | 51 | 52 | namespace utf8 53 | { 54 | // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers 55 | // You may need to change them to match your system. 56 | // These typedefs have the same names as ones from cstdint, or boost/cstdint 57 | typedef unsigned char uint8_t; 58 | typedef unsigned short uint16_t; 59 | typedef unsigned int uint32_t; 60 | 61 | // Helper code - not intended to be directly called by the library users. 
May be changed at any time 62 | namespace internal 63 | { 64 | // Unicode constants 65 | // Leading (high) surrogates: 0xd800 - 0xdbff 66 | // Trailing (low) surrogates: 0xdc00 - 0xdfff 67 | const uint16_t LEAD_SURROGATE_MIN = 0xd800u; 68 | const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; 69 | const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; 70 | const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; 71 | const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) 72 | const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN 73 | 74 | // Maximum valid value for a Unicode code point 75 | const uint32_t CODE_POINT_MAX = 0x0010ffffu; 76 | 77 | template 78 | inline uint8_t mask8(octet_type oc) 79 | { 80 | return static_cast(0xff & oc); 81 | } 82 | template 83 | inline uint16_t mask16(u16_type oc) 84 | { 85 | return static_cast(0xffff & oc); 86 | } 87 | template 88 | inline bool is_trail(octet_type oc) 89 | { 90 | return ((utf8::internal::mask8(oc) >> 6) == 0x2); 91 | } 92 | 93 | template 94 | inline bool is_lead_surrogate(u16 cp) 95 | { 96 | return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); 97 | } 98 | 99 | template 100 | inline bool is_trail_surrogate(u16 cp) 101 | { 102 | return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 103 | } 104 | 105 | template 106 | inline bool is_surrogate(u16 cp) 107 | { 108 | return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 109 | } 110 | 111 | template 112 | inline bool is_code_point_valid(u32 cp) 113 | { 114 | return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); 115 | } 116 | 117 | template 118 | inline typename std::iterator_traits::difference_type 119 | sequence_length(octet_iterator lead_it) 120 | { 121 | uint8_t lead = utf8::internal::mask8(*lead_it); 122 | if (lead < 0x80) 123 | return 1; 124 | else if ((lead >> 5) == 0x6) 125 | return 2; 126 | else if ((lead >> 4) == 0xe) 127 | return 3; 128 | else if ((lead >> 3) == 0x1e) 129 | return 4; 130 | else 131 | return 0; 132 | } 133 | 134 | template 135 | inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) 136 | { 137 | if (cp < 0x80) { 138 | if (length != 1) 139 | return true; 140 | } 141 | else if (cp < 0x800) { 142 | if (length != 2) 143 | return true; 144 | } 145 | else if (cp < 0x10000) { 146 | if (length != 3) 147 | return true; 148 | } 149 | 150 | return false; 151 | } 152 | 153 | enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; 154 | 155 | /// Helper for get_sequence_x 156 | template 157 | utf_error increase_safely(octet_iterator& it, octet_iterator end) 158 | { 159 | if (++it == end) 160 | return NOT_ENOUGH_ROOM; 161 | 162 | if (!utf8::internal::is_trail(*it)) 163 | return INCOMPLETE_SEQUENCE; 164 | 165 | return UTF8_OK; 166 | } 167 | 168 | #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} 169 | 170 | /// get_sequence_x functions decode utf-8 sequences of the length x 171 | template 172 | utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) 173 | { 174 | if (it == end) 175 | return NOT_ENOUGH_ROOM; 176 | 177 | code_point = utf8::internal::mask8(*it); 178 | 179 | return UTF8_OK; 180 | } 181 | 182 | template 183 | utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) 184 | { 185 | if (it == end) 186 | return NOT_ENOUGH_ROOM; 187 | 188 | code_point = 
utf8::internal::mask8(*it); 189 | 190 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 191 | 192 | code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); 193 | 194 | return UTF8_OK; 195 | } 196 | 197 | template 198 | utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) 199 | { 200 | if (it == end) 201 | return NOT_ENOUGH_ROOM; 202 | 203 | code_point = utf8::internal::mask8(*it); 204 | 205 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 206 | 207 | code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); 208 | 209 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 210 | 211 | code_point += (*it) & 0x3f; 212 | 213 | return UTF8_OK; 214 | } 215 | 216 | template 217 | utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) 218 | { 219 | if (it == end) 220 | return NOT_ENOUGH_ROOM; 221 | 222 | code_point = utf8::internal::mask8(*it); 223 | 224 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 225 | 226 | code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); 227 | 228 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 229 | 230 | code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; 231 | 232 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 233 | 234 | code_point += (*it) & 0x3f; 235 | 236 | return UTF8_OK; 237 | } 238 | 239 | #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR 240 | 241 | template 242 | utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) 243 | { 244 | if (it == end) 245 | return NOT_ENOUGH_ROOM; 246 | 247 | // Save the original value of it so we can go back in case of failure 248 | // Of course, it does not make much sense with i.e. stream iterators 249 | octet_iterator original_it = it; 250 | 251 | uint32_t cp = 0; 252 | // Determine the sequence length based on the lead octet 253 | typedef typename std::iterator_traits::difference_type octet_difference_type; 254 | const octet_difference_type length = utf8::internal::sequence_length(it); 255 | 256 | // Get trail octets and calculate the code point 257 | utf_error err = UTF8_OK; 258 | switch (length) { 259 | case 0: 260 | return INVALID_LEAD; 261 | case 1: 262 | err = utf8::internal::get_sequence_1(it, end, cp); 263 | break; 264 | case 2: 265 | err = utf8::internal::get_sequence_2(it, end, cp); 266 | break; 267 | case 3: 268 | err = utf8::internal::get_sequence_3(it, end, cp); 269 | break; 270 | case 4: 271 | err = utf8::internal::get_sequence_4(it, end, cp); 272 | break; 273 | } 274 | 275 | if (err == UTF8_OK) { 276 | // Decoding succeeded. Now, security checks... 277 | if (utf8::internal::is_code_point_valid(cp)) { 278 | if (!utf8::internal::is_overlong_sequence(cp, length)){ 279 | // Passed! Return here. 
280 | code_point = cp; 281 | ++it; 282 | return UTF8_OK; 283 | } 284 | else 285 | err = OVERLONG_SEQUENCE; 286 | } 287 | else 288 | err = INVALID_CODE_POINT; 289 | } 290 | 291 | // Failure branch - restore the original value of the iterator 292 | it = original_it; 293 | return err; 294 | } 295 | 296 | template 297 | inline utf_error validate_next(octet_iterator& it, octet_iterator end) { 298 | uint32_t ignored; 299 | return utf8::internal::validate_next(it, end, ignored); 300 | } 301 | 302 | } // namespace internal 303 | 304 | /// The library API - functions intended to be called by the users 305 | 306 | // Byte order mark 307 | const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 308 | 309 | template 310 | octet_iterator find_invalid(octet_iterator start, octet_iterator end) 311 | { 312 | octet_iterator result = start; 313 | while (result != end) { 314 | utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); 315 | if (err_code != internal::UTF8_OK) 316 | return result; 317 | } 318 | return result; 319 | } 320 | 321 | template 322 | inline bool is_valid(octet_iterator start, octet_iterator end) 323 | { 324 | return (utf8::find_invalid(start, end) == end); 325 | } 326 | 327 | template 328 | inline bool starts_with_bom (octet_iterator it, octet_iterator end) 329 | { 330 | return ( 331 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && 332 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && 333 | ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) 334 | ); 335 | } 336 | } // namespace utf8 337 | 338 | 339 | 340 | namespace utf8 341 | { 342 | // Base for the exceptions that may be thrown from the library 343 | class exception : public ::std::exception { 344 | }; 345 | 346 | // Exceptions that may be thrown from the library functions. 
347 | class invalid_code_point : public exception { 348 | uint32_t cp; 349 | public: 350 | invalid_code_point(uint32_t codepoint) : cp(codepoint) {} 351 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } 352 | uint32_t code_point() const {return cp;} 353 | }; 354 | 355 | class invalid_utf8 : public exception { 356 | uint8_t u8; 357 | public: 358 | invalid_utf8 (uint8_t u) : u8(u) {} 359 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } 360 | uint8_t utf8_octet() const {return u8;} 361 | }; 362 | 363 | class invalid_utf16 : public exception { 364 | uint16_t u16; 365 | public: 366 | invalid_utf16 (uint16_t u) : u16(u) {} 367 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } 368 | uint16_t utf16_word() const {return u16;} 369 | }; 370 | 371 | class not_enough_room : public exception { 372 | public: 373 | virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } 374 | }; 375 | 376 | /// The library API - functions intended to be called by the users 377 | 378 | template 379 | octet_iterator append(uint32_t cp, octet_iterator result) 380 | { 381 | if (!utf8::internal::is_code_point_valid(cp)) 382 | throw invalid_code_point(cp); 383 | 384 | if (cp < 0x80) // one octet 385 | *(result++) = static_cast(cp); 386 | else if (cp < 0x800) { // two octets 387 | *(result++) = static_cast((cp >> 6) | 0xc0); 388 | *(result++) = static_cast((cp & 0x3f) | 0x80); 389 | } 390 | else if (cp < 0x10000) { // three octets 391 | *(result++) = static_cast((cp >> 12) | 0xe0); 392 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 393 | *(result++) = static_cast((cp & 0x3f) | 0x80); 394 | } 395 | else { // four octets 396 | *(result++) = static_cast((cp >> 18) | 0xf0); 397 | *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); 398 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 399 | *(result++) = static_cast((cp & 0x3f) | 0x80); 400 | } 401 | return result; 402 | } 403 | 404 | template 405 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 406 | { 407 | while (start != end) { 408 | octet_iterator sequence_start = start; 409 | internal::utf_error err_code = utf8::internal::validate_next(start, end); 410 | switch (err_code) { 411 | case internal::UTF8_OK : 412 | for (octet_iterator it = sequence_start; it != start; ++it) 413 | *out++ = *it; 414 | break; 415 | case internal::NOT_ENOUGH_ROOM: 416 | out = utf8::append (replacement, out); 417 | start = end; 418 | break; 419 | case internal::INVALID_LEAD: 420 | out = utf8::append (replacement, out); 421 | ++start; 422 | break; 423 | case internal::INCOMPLETE_SEQUENCE: 424 | case internal::OVERLONG_SEQUENCE: 425 | case internal::INVALID_CODE_POINT: 426 | out = utf8::append (replacement, out); 427 | ++start; 428 | // just one replacement mark for the sequence 429 | while (start != end && utf8::internal::is_trail(*start)) 430 | ++start; 431 | break; 432 | } 433 | } 434 | return out; 435 | } 436 | 437 | template 438 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 439 | { 440 | static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); 441 | return utf8::replace_invalid(start, end, out, replacement_marker); 442 | } 443 | 444 | template 445 | uint32_t next(octet_iterator& it, octet_iterator end) 446 | { 447 | uint32_t cp = 0; 448 | 
internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); 449 | switch (err_code) { 450 | case internal::UTF8_OK : 451 | break; 452 | case internal::NOT_ENOUGH_ROOM : 453 | throw not_enough_room(); 454 | case internal::INVALID_LEAD : 455 | case internal::INCOMPLETE_SEQUENCE : 456 | case internal::OVERLONG_SEQUENCE : 457 | throw invalid_utf8(*it); 458 | case internal::INVALID_CODE_POINT : 459 | throw invalid_code_point(cp); 460 | } 461 | return cp; 462 | } 463 | 464 | template 465 | uint32_t peek_next(octet_iterator it, octet_iterator end) 466 | { 467 | return utf8::next(it, end); 468 | } 469 | 470 | template 471 | uint32_t prior(octet_iterator& it, octet_iterator start) 472 | { 473 | // can't do much if it == start 474 | if (it == start) 475 | throw not_enough_room(); 476 | 477 | octet_iterator end = it; 478 | // Go back until we hit either a lead octet or start 479 | while (utf8::internal::is_trail(*(--it))) 480 | if (it == start) 481 | throw invalid_utf8(*it); // error - no lead byte in the sequence 482 | return utf8::peek_next(it, end); 483 | } 484 | 485 | template 486 | void advance (octet_iterator& it, distance_type n, octet_iterator end) 487 | { 488 | const distance_type zero(0); 489 | if (n < zero) { 490 | // backward 491 | for (distance_type i = n; i < zero; ++i) 492 | utf8::prior(it, end); 493 | } else { 494 | // forward 495 | for (distance_type i = zero; i < n; ++i) 496 | utf8::next(it, end); 497 | } 498 | } 499 | 500 | template 501 | typename std::iterator_traits::difference_type 502 | distance (octet_iterator first, octet_iterator last) 503 | { 504 | typename std::iterator_traits::difference_type dist; 505 | for (dist = 0; first < last; ++dist) 506 | utf8::next(first, last); 507 | return dist; 508 | } 509 | 510 | template 511 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 512 | { 513 | while (start != end) { 514 | uint32_t cp = utf8::internal::mask16(*start++); 515 | // Take care of surrogate pairs first 516 | if (utf8::internal::is_lead_surrogate(cp)) { 517 | if (start != end) { 518 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); 519 | if (utf8::internal::is_trail_surrogate(trail_surrogate)) 520 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 521 | else 522 | throw invalid_utf16(static_cast(trail_surrogate)); 523 | } 524 | else 525 | throw invalid_utf16(static_cast(cp)); 526 | 527 | } 528 | // Lone trail surrogate 529 | else if (utf8::internal::is_trail_surrogate(cp)) 530 | throw invalid_utf16(static_cast(cp)); 531 | 532 | result = utf8::append(cp, result); 533 | } 534 | return result; 535 | } 536 | 537 | template 538 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 539 | { 540 | while (start < end) { 541 | uint32_t cp = utf8::next(start, end); 542 | if (cp > 0xffff) { //make a surrogate pair 543 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 544 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 545 | } 546 | else 547 | *result++ = static_cast(cp); 548 | } 549 | return result; 550 | } 551 | 552 | template 553 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 554 | { 555 | while (start != end) 556 | result = utf8::append(*(start++), result); 557 | 558 | return result; 559 | } 560 | 561 | template 562 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 563 | { 564 | while (start < end) 565 | (*result++) = 
537 | template <typename u16bit_iterator, typename octet_iterator>
538 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
539 | {
540 |     while (start < end) {
541 |         uint32_t cp = utf8::next(start, end);
542 |         if (cp > 0xffff) { //make a surrogate pair
543 |             *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
544 |             *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
545 |         }
546 |         else
547 |             *result++ = static_cast<uint16_t>(cp);
548 |     }
549 |     return result;
550 | }
551 | 
552 | template <typename octet_iterator, typename u32bit_iterator>
553 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
554 | {
555 |     while (start != end)
556 |         result = utf8::append(*(start++), result);
557 | 
558 |     return result;
559 | }
560 | 
561 | template <typename octet_iterator, typename u32bit_iterator>
562 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
563 | {
564 |     while (start < end)
565 |         (*result++) = utf8::next(start, end);
566 | 
567 |     return result;
568 | }
569 | 
570 | // The iterator class
571 | template <typename octet_iterator>
572 | class iterator {
573 |     octet_iterator it;
574 |     octet_iterator range_start;
575 |     octet_iterator range_end;
576 | public:
577 |     typedef uint32_t value_type;
578 |     typedef uint32_t* pointer;
579 |     typedef uint32_t& reference;
580 |     typedef std::ptrdiff_t difference_type;
581 |     typedef std::bidirectional_iterator_tag iterator_category;
582 |     iterator () {}
583 |     explicit iterator (const octet_iterator& octet_it,
584 |                        const octet_iterator& rangestart,
585 |                        const octet_iterator& rangeend) :
586 |         it(octet_it), range_start(rangestart), range_end(rangeend)
587 |     {
588 |         if (it < range_start || it > range_end)
589 |             throw std::out_of_range("Invalid utf-8 iterator position");
590 |     }
591 |     // the default "big three" are OK
592 |     octet_iterator base () const { return it; }
593 |     uint32_t operator * () const
594 |     {
595 |         octet_iterator temp = it;
596 |         return utf8::next(temp, range_end);
597 |     }
598 |     bool operator == (const iterator& rhs) const
599 |     {
600 |         if (range_start != rhs.range_start || range_end != rhs.range_end)
601 |             throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
602 |         return (it == rhs.it);
603 |     }
604 |     bool operator != (const iterator& rhs) const
605 |     {
606 |         return !(operator == (rhs));
607 |     }
608 |     iterator& operator ++ ()
609 |     {
610 |         utf8::next(it, range_end);
611 |         return *this;
612 |     }
613 |     iterator operator ++ (int)
614 |     {
615 |         iterator temp = *this;
616 |         utf8::next(it, range_end);
617 |         return temp;
618 |     }
619 |     iterator& operator -- ()
620 |     {
621 |         utf8::prior(it, range_start);
622 |         return *this;
623 |     }
624 |     iterator operator -- (int)
625 |     {
626 |         iterator temp = *this;
627 |         utf8::prior(it, range_start);
628 |         return temp;
629 |     }
630 | }; // class iterator
631 | 
632 | } // namespace utf8
633 | 
634 | namespace utf8
635 | {
636 | 
637 |     inline void append(char32_t cp, std::string& s)
638 |     {
639 |         append(uint32_t(cp), std::back_inserter(s));
640 |     }
641 | 
642 |     inline std::string utf16to8(const std::u16string& s)
643 |     {
644 |         std::string result;
645 |         utf16to8(s.begin(), s.end(), std::back_inserter(result));
646 |         return std::move(result);
647 |     }
648 | 
649 |     inline std::u16string utf8to16(const std::string& s)
650 |     {
651 |         std::u16string result;
652 |         utf8to16(s.begin(), s.end(), std::back_inserter(result));
653 |         return std::move(result);
654 |     }
655 | 
656 |     inline std::string utf32to8(const std::u32string& s)
657 |     {
658 |         std::string result;
659 |         utf32to8(s.begin(), s.end(), std::back_inserter(result));
660 |         return std::move(result);
661 |     }
662 | 
663 |     inline std::u32string utf8to32(const std::string& s)
664 |     {
665 |         std::u32string result;
666 |         utf8to32(s.begin(), s.end(), std::back_inserter(result));
667 |         return std::move(result);
668 |     }
669 | 
670 |     inline std::string utf32to8(const std::wstring& s)
671 |     {
672 |         std::string result;
673 |         utf32to8(s.begin(), s.end(), std::back_inserter(result));
674 |         return std::move(result);
675 |     }
676 | 
677 |     inline std::wstring utf8toW(const std::string& s)
678 |     {
679 |         std::wstring result;
680 |         utf8to32(s.begin(), s.end(), std::back_inserter(result));
681 |         return std::move(result);
682 |     }
683 | 
684 |     inline std::size_t find_invalid(const std::string& s)
685 |     {
686 |         std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
687 |         return (invalid == s.end()) ? std::string::npos : (invalid - s.begin());
688 |     }
689 | 
690 |     inline bool is_valid(const std::string& s)
691 |     {
692 |         return is_valid(s.begin(), s.end());
693 |     }
694 | 
695 |     inline std::string replace_invalid(const std::string& s, char32_t replacement)
696 |     {
697 |         std::string result;
698 |         replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
699 |         return std::move(result);
700 |     }
701 | 
702 |     inline std::string replace_invalid(const std::string& s)
703 |     {
704 |         std::string result;
705 |         replace_invalid(s.begin(), s.end(), std::back_inserter(result));
706 |         return std::move(result);
707 |     }
708 | 
709 |     inline bool starts_with_bom(const std::string& s)
710 |     {
711 |         return starts_with_bom(s.begin(), s.end());
712 |     }
713 | 
714 | } // namespace utf8
715 | 
716 | #endif //header guard
717 | 
718 | 
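A minimal usage sketch of the string-level helpers defined above, included for orientation. This is an editor's illustration rather than part of the repository, and the include path is an assumption:

    #include "utf.hpp"
    #include <cassert>
    #include <string>

    int main() {
        std::string s = "\xD0\xB4\xD0\xBE\xD0\xB1";         // "доб": three code points, six octets
        assert(utf8::is_valid(s));                          // well-formed UTF-8
        assert(utf8::distance(s.begin(), s.end()) == 3);    // counts code points, not octets

        std::u16string w = utf8::utf8to16(s);               // round-trip through UTF-16
        assert(utf8::utf16to8(w) == s);

        std::string broken = "abc\xFF";                     // 0xFF is never a valid UTF-8 octet
        std::string fixed = utf8::replace_invalid(broken);  // bad octet becomes U+FFFD
        assert(utf8::find_invalid(fixed) == std::string::npos);
        return 0;
    }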
--------------------------------------------------------------------------------
/test_grammars/expr/example.expr:
--------------------------------------------------------------------------------
 1 | // a comment
 2 | -5 + 3.2 +
 3 | "a line-
 4 | spanning
 5 | string" * "INTE\"RRU'PTED\\" *
 6 | (3 + 'some" \'chars')
 7 | / a_varIAble + CALL_MACRO(a, b, c)
 8 | // another comment
 9 | + function_call (
10 |     a + x,
11 |     b / y + x,
12 |     single_fn_call(
13 |         void_fn_call()) * (z + 1))
14 | -
15 | "joined adjacent "
16 | "strings that are" " adjacent"
--------------------------------------------------------------------------------
/test_grammars/expr/expressions.lemon:
--------------------------------------------------------------------------------
 1 | /*
 2 | @pymod expr_parse
 3 | 
 4 | @lexdef
 5 | !whitespace : \s
 6 | !comment : //.*\n
 7 | 
 8 | ' ' \ := CHAR
 9 | ' " \ s j := STRING
10 | 
11 | ADD := +
12 | SUB := -
13 | MUL := *
14 | DIV := /
15 | L_PAREN := (
16 | R_PAREN := )
17 | COMMA := ,
18 | 
19 | FLOAT_LIT : [0-9]+\.[0-9]+
20 | INT_LIT : [0-9]+
21 | FNCALL :: ([_a-z][_a-z0-9]*)\s*\(
22 | MACRO :: ([_A-Z][_A-Z0-9]*)\s*\(
23 | IDENT : [_a-z][_a-z0-9]*
24 | @endlex
25 | */
26 | 
27 | // token association, and implicit (ascending) priority
28 | %left COMMA FNCALL MACRO.
29 | %left ADD SUB.
30 | %left MUL DIV.
31 | 
32 | toplevel ::= expr(c1).    { _ = c1; }
33 | 
34 | expr(e) ::= expr(c1) ADD(o) expr(c2).    { e = _("+", {c1, c2}, ~o); }
35 | expr(e) ::= expr(c1) SUB(o) expr(c2).    { e = _("-", {c1, c2}, ~o); }
36 | expr(e) ::= expr(c1) MUL(o) expr(c2).    { e = _("*", {c1, c2}, ~o); }
37 | expr(e) ::= expr(c1) DIV(o) expr(c2).    { e = _("/", {c1, c2}, ~o); }
38 | expr(e) ::= SUB expr(c1). [MUL]    { e = _("neg", {c1}, ~c1); }
39 | expr(e) ::= L_PAREN expr(e1) R_PAREN.    { e = e1; }
40 | 
41 | expr(e) ::= varref(e1).    { e = e1; }
42 | varref(e) ::= IDENT(lit).    { e = _("varref", {_(lit)}, ~lit); }
43 | 
44 | expr(e) ::= fncall(e1).    { e = e1; }
45 | expr(e) ::= macro(e1).    { e = e1; }
46 | fncall(e) ::= FNCALL(lit1) arg_list(c2) R_PAREN.    { e = _("fncall", {_(lit1), c2}, ~lit1); }
47 | macro(e) ::= MACRO(lit1) arg_list(c2) R_PAREN.    { e = _("macro", {_(lit1), c2}, ~lit1); }
48 | 
49 | arg_list(L) ::= .    { L = _("arglist"); }
50 | arg_list(L) ::= expr(c1).    { L = _("arglist", {c1}, ~c1); }
51 | arg_list(L) ::= arg_list(L1) COMMA expr(e).    { L = L1 += e; }
52 | 
53 | 
54 | expr(e) ::= FLOAT_LIT(lit).    { e = _(lit); }
55 | expr(e) ::= INT_LIT(lit).    { e = _(lit); }
56 | 
57 | expr(e) ::= CHAR(lit).    { e = _(lit); }
58 | expr(e) ::= STRING(lit).    { e = _(lit); }
59 | 
60 | 
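The rule actions above are C++ fragments built on lemon-py's parse-node shorthand, presumably declared in src/lemon_py/ParseNode.hpp (not reproduced in this section). The notes below are inferred from how this grammar uses the shorthand, not from the header itself:

    // e = _(lit);                // wrap a token value (e.g. INT_LIT) into a leaf node
    // e = _("+", {c1, c2}, ~o);  // interior node named "+" with children c1 and c2;
    //                            // ~o attaches the source line of token o
    // L = L1 += e;               // += appends child e and yields the parent node
    // _ = c1;                    // in the top-level rule, assigns the tree root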
--------------------------------------------------------------------------------
/test_grammars/parasol/.gitignore:
--------------------------------------------------------------------------------
1 | concat_grammar.*
2 | parasol_parser.so
--------------------------------------------------------------------------------
/test_grammars/parasol/parasol.lemon:
--------------------------------------------------------------------------------
  1 | /*
  2 | @pymod parasol_parser
  3 | 
  4 | @lexdef
  5 | 
  6 | !whitespace : \s+
  7 | !comment : ;.*\n
  8 | 
  9 | STRUCT := struct : [^\w_\?]
 10 | INCLUDE := include : [^\w_\?]
 11 | IN := in : [^\w_\?]
 12 | ELSE := else : [^\w_\?]
 13 | DEF := def : [^\w_\?]
 14 | AS := as : [^\w_\?]
 15 | LET := let : [^\w_\?]
 16 | 
 17 | L_CURLY := {
 18 | R_CURLY := }
 19 | GOESTO := =>
 20 | LAMBDA := \
 21 | COMMA := ,
 22 | EQUALS := =
 23 | L_AND := &&
 24 | L_OR := ||
 25 | B_AND := &
 26 | B_OR := |
 27 | LESS := <
 28 | LESS_EQ := <=
 29 | GREATER := >
 30 | GREATER_EQ := >=
 31 | EQ := ==
 32 | NOT_EQ := !=
 33 | PLUS := +
 34 | MINUS := -
 35 | MULT := *
 36 | DIV := /
 37 | DOT := *.
 38 | SWIZZLE := .
 39 | SEQUENCE := ..
 40 | NOT := !
 41 | R_BRACKET := ]
 42 | COLON := :
 43 | L_PAREN := (
 44 | R_PAREN := )
 45 | ARRAY := @
 46 | 
 47 | FLOAT_LIT : -?[0-9]+\.[0-9]*
 48 | INT_LIT : -?[0-9]+
 49 | 
 50 | FNCALL : ([_a-z][_a-z0-9\?]*)\s*\(
 51 | SCOPEREF : ([_a-z][_a-z0-9\?]*)\s*\[
 52 | ID : [_a-z][_a-z0-9\?]*
 53 | 
 54 | @endlex
 55 | */
 56 | 
 57 | module ::= global_list(c1).    { _ = c1; }
 58 | 
 59 | 
 60 | global_list(L) ::= .    { L = _("global_list"); }
 61 | global_list(L) ::= global_list(RL) global_item(c1).    { L = RL += c1; }
 62 | 
 63 | 
 64 | global_item(q) ::= pipeline(q1).    { q = q1; }
 65 | global_item(q) ::= struct_def(q1).    { q = q1; }
 66 | 
 67 | 
 68 | pipeline(q) ::= id(c1) L_CURLY pipeline_contents(c2) R_CURLY.    { q = _("pipeline", {c1, c2}, ~c1); }
 69 | 
 70 | 
 71 | pipeline_contents(L) ::= .    { L = _("pipeline_contents"); }
 72 | pipeline_contents(L) ::= pipeline_contents(RL) pipeline_item(c1).    { L = RL += c1; }
 73 | 
 74 | 
 75 | pipeline_item(PI) ::= function_def(F).    {PI = F;}
 76 | pipeline_item(PI) ::= var_decl(E).    {PI = E;}
 77 | pipeline_item(PI) ::= scoped_var_decl(E).    {PI = E;}
 78 | pipeline_item(PI) ::= assignment_expr(E).    {PI = E;}
 79 | pipeline_item(PI) ::= include_decl(I).    {PI = I;}
 80 | 
 81 | 
 82 | function_def(F) ::= DEF(D) var_decl(VD) param_list(PL) GOESTO expr(E).    { F = _("function_def", {VD, PL, E}, ~D); }
 83 | function_def(F) ::= DEF(D) scoped_var_decl(VD) param_list(PL) GOESTO expr(E).    { F = _("function_def", {VD, PL, E}, ~D); }
 84 | 
 85 | 
 86 | lambda_def(L) ::= LAMBDA(LTOK) param_list(PL) GOESTO expr(E).    { L = _("lambda", {PL, E}, ~LTOK); }
 87 | 
 88 | 
 89 | param_list(PL) ::= .    { PL = _("param_list"); }
 90 | param_list(PL) ::= var_decl(V).    { PL = _("param_list", {V}); }
 91 | param_list(P) ::= param_list(PL) COMMA var_decl(V).    { P = PL += V; }
 92 | 
 93 | 
 94 | struct_def(S) ::= STRUCT(STOK) id(N) L_CURLY struct_contents(M) R_CURLY.    { S = _("struct", {N, M}, ~STOK); }
 95 | 
 96 | 
 97 | struct_contents(SC) ::= .    { SC = _("struct_contents"); }
 98 | struct_contents(S) ::= struct_contents(SC) var_decl(V).    { S = SC += V; }
 99 | 
100 | 
101 | include_decl(I) ::= INCLUDE(ITOK) id(P).    { I = _("include", {P}, ~ITOK); }
102 | include_decl(I) ::= INCLUDE(ITOK) id(P) AS id(A).    { I = _("include", {P, A}, ~ITOK);}
103 | 
104 | 
105 | // expressions... which is most of the language
106 | %right LAMBDA LET.
107 | %left GOESTO.
108 | %left COMMA FNCALL.
109 | %right EQUALS.
110 | %left L_AND L_OR.
111 | %left B_AND B_OR.
112 | %left LESS LESS_EQ GREATER GREATER_EQ.
113 | %left EQ NOT_EQ.
114 | %left PLUS MINUS.
115 | %left MULT DIV DOT.
116 | %left SWIZZLE SEQUENCE.
117 | %right NOT ELSE.
118 | 
119 | // declarative expressions
120 | expr(E) ::= scoped_var_decl(V).    { E = V;}
121 | 
122 | 
123 | scoped_var_decl(V) ::= scope(S) var_decl(DECL) R_BRACKET.    { V = _("scoped_var_decl", {S, DECL}, ~DECL); }
124 | 
125 | 
126 | scope(I) ::= SCOPEREF(S).    { I = _(S); }
127 | 
128 | 
129 | var_decl(V) ::= id(NAME).    {V = _("var_decl", {NAME}, ~NAME);}
130 | var_decl(V) ::= id(NAME) COLON type_id(TYPE) .    {V = _("var_decl", {NAME, TYPE}, ~NAME);}
131 | var_decl(V) ::= id(NAME) COLON integer(IDX).    {V = _("var_decl", {NAME, IDX}, ~NAME);}
132 | var_decl(V) ::= id(NAME) COLON type_id(TYPE) integer(IDX).    {V = _("var_decl", {NAME, TYPE, IDX}, ~NAME);}
133 | 
134 | 
135 | 
136 | // arithmetic expressions
137 | expr(E) ::= id(I).    {E = I;}
138 | expr(E) ::= float_(F).    {E = F;}
139 | expr(E) ::= integer(I).    {E = I;}
140 | expr(E) ::= function_call(F).    {E = F;}
141 | 
142 | expr(E) ::= assignment_expr(I).    {E = I;}
143 | 
144 | expr(E) ::= expr(L) PLUS expr(R).    {E = _("+", {L, R}, ~L);}
145 | expr(E) ::= expr(L) MINUS expr(R).    {E = _("-", {L, R}, ~L);}
146 | expr(E) ::= expr(L) MULT expr(R).    {E = _("*", {L, R}, ~L);}
147 | expr(E) ::= expr(L) DIV expr(R).    {E = _("/", {L, R}, ~L);}
148 | expr(E) ::= expr(L) DOT expr(R).    {E = _("*.", {L, R}, ~L);}
149 | expr(E) ::= expr(L) SEQUENCE expr(R).    {E = _("..", {L, R}, ~L);}
150 | expr(E) ::= NOT expr(I).    {E = _("!", {I}, ~I);}
151 | expr(E) ::= MINUS expr(I). [NOT]    {E = _("neg", {I}, ~I);}
152 | expr(E) ::= L_PAREN expr(I) R_PAREN.    {E = I;}
153 | expr(E) ::= let_expr(I).    {E = I;}
154 | expr(E) ::= case_set(CS).    {E = CS;}
155 | expr(E) ::= lambda_def(L).    {E = L;}
156 | expr(E) ::= expr(L) SWIZZLE expr(R).    {E = _(".", {L, R}, ~L);}
157 | 
158 | expr(E) ::= expr(L) L_AND expr(R).    {E = _("&&", {L, R}, ~L);}
159 | expr(E) ::= expr(L) L_OR expr(R).    {E = _("||", {L, R}, ~L);}
160 | 
161 | expr(E) ::= expr(L) B_AND expr(R).    {E = _("&", {L, R}, ~L);}
162 | expr(E) ::= expr(L) B_OR expr(R).    {E = _("|", {L, R}, ~L);}
163 | 
164 | expr(E) ::= expr(L) LESS expr(R).    {E = _("<", {L, R}, ~L);}
165 | expr(E) ::= expr(L) LESS_EQ expr(R).    {E = _("<=", {L, R}, ~L);}
166 | 
167 | expr(E) ::= expr(L) GREATER expr(R).    {E = _(">", {L, R}, ~L);}
168 | expr(E) ::= expr(L) GREATER_EQ expr(R).    {E = _(">=", {L, R}, ~L);}
169 | 
170 | expr(E) ::= expr(L) EQ expr(R).    {E = _("==", {L, R}, ~L);}
171 | expr(E) ::= expr(L) NOT_EQ expr(R).    {E = _("!=", {L, R}, ~L);}
172 | 
173 | 
174 | let_expr(L) ::= LET(LTOK) unscoped_assignment_list(AL) IN expr(E).    { L = _("let", {AL, E}, ~LTOK); }
175 | 
176 | 
177 | unscoped_assignment_list(AL) ::= .    {AL = _("unscoped_assignment_list"); }
178 | unscoped_assignment_list(A) ::= unscoped_assignment_list(AL) unscoped_assignment_expr(E).    {A = AL += E; }
179 | 
180 | 
181 | case_set(CS) ::= L_CURLY(LCTOK) case_list(CL) R_CURLY.    {CS = _("case_set", {CL}, ~LCTOK); }
182 | 
183 | 
184 | case_list(CL) ::= .    {CL = _("case_list");}
185 | case_list(C) ::= case_list(CL) case(CS).    {C = CL += CS; }
186 | 
187 | 
188 | case(C) ::= expr(COND) GOESTO expr(R).    {C = _("case", {COND, R}, ~COND); }
189 | 
190 | 
191 | assignment_expr(E) ::= unscoped_assignment_expr(I).    {E = I;}
192 | assignment_expr(E) ::= scoped_var_decl(L) EQUALS expr(R).    {E = _("=", {L, R}, ~L);}
193 | 
194 | 
195 | unscoped_assignment_expr(E) ::= var_decl(L) EQUALS expr(R).    {E = _("=", {L, R}, ~L);}
{E = _("=", {L, R}, ~L);} 196 | 197 | 198 | function_call(F) ::= fncall(NAME) arg_list(ARGS) R_PAREN. {F = _("function_call", {NAME, ARGS}, ~NAME); } 199 | 200 | 201 | fncall(I) ::= FNCALL(F). {I = _(F);} 202 | 203 | 204 | arg_list(PL) ::= . {PL = _("arg_list"); } 205 | arg_list(PL) ::= expr(E). {PL = _("arg_list", {E}, ~E); } 206 | arg_list(A) ::= arg_list(PL) COMMA expr(E). {A = PL += E; } 207 | 208 | 209 | id(I) ::= ID(IDL). { I = _(IDL); } 210 | 211 | 212 | type_id(I) ::= ID(IDL). {I = _("type_id", {_(IDL)}, ~IDL);} 213 | type_id(I) ::= ID(IDL) ARRAY integer(A). {I = _("type_id", {_(IDL), A}, ~IDL); } 214 | 215 | 216 | integer(I) ::= INT_LIT(IL). {I = _(IL); } 217 | 218 | 219 | float_(F) ::= FLOAT_LIT(FL). {F = _(FL); } 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | -------------------------------------------------------------------------------- /test_grammars/parasol/phong.prsl: -------------------------------------------------------------------------------- 1 | include_some { 2 | 3 | } 4 | 5 | vertex_pos { 6 | v[v_pos] = u[viewMatrix: mat4] * 7 | u[modelMatrix: mat4] * 8 | vec4(a[v_inPosition: vec3], 1) 9 | 10 | v[gl_Position] = u[projMatrix: mat4] * v_pos 11 | } 12 | 13 | vertex_surface_normals { 14 | v[v_normal] = u[u_normalMat: mat3] * a[v_inNormal: vec3] 15 | } 16 | 17 | struct point_light { 18 | position: vec3 19 | color: vec4 20 | } 21 | 22 | phong_lighting { 23 | v[v_color] = a[v_inColor: vec4] 24 | 25 | u[point_lights: point_light@16] ; declares an array of size 16 26 | v[v_normal: vec3] 27 | v[v_pos: vec3] 28 | 29 | def L 30 | lightPos: vec3, fragPos: vec3 => ; no stage scope for L, simple expression function 31 | normalize(lightPos - fragPos) 32 | 33 | 34 | def f[pointLightContrib] light => ; since pointLightContrib has f-stage scope... 35 | light.color * max(0, v_normal *. L(light.position, v_pos)) ; ... it closes over the fragment inputs. 36 | 37 | def f[pointLight] 38 | light => 39 | { 40 | any(light.color) => clamp(pointLightContrib(light), 0, 1) ; if light.color is truthy, there's your answer 41 | _ => vec4(0) ; _ is the default case 42 | } 43 | 44 | f[out_color: 0] = ; inline output index spec 45 | __(point_lights, ; __(array, f, g) is short (and optimized) for reduce(map(array, f), g). 46 | pointLight, 47 | \accum, item => accum + item) ; you could also just use `add` instead, but this is an example 48 | ; map/reduce is the main form of repetition in Parasol. 49 | 50 | } 51 | 52 | complete_phong_pipeline { 53 | include vertex_pos 54 | include vertex_surface_normals 55 | include phong_lighting 56 | 57 | ; the next part is optional, and the parasol compiler will provide 58 | ; indices if you don't want to spec them. You'll have to query them 59 | ; out of the resultant linked object. 60 | 61 | ; alternatively, they can be spec'd inline with a[varName: type index] 62 | ; although this makes pipelines such pipelines difficult to compose 63 | 64 | a[v_inPosition: 0] ; spec attribute indices 65 | a[v_inNormal: 1] 66 | a[v_inColor: 2] 67 | 68 | ; f[out_color] was spec'd inline. 
69 | }
70 | 
--------------------------------------------------------------------------------
/test_grammars/parasol/test_api.py:
--------------------------------------------------------------------------------
1 | import parasol_parser
2 | 
3 | with open('phong.prsl', 'r') as f:
4 |     t = parasol_parser.parse(f.read())
5 | 
6 | for idx, c in enumerate(t[0][1]):
7 |     print(repr(c))
8 | 
--------------------------------------------------------------------------------
/test_grammars/utf8_expr/example.expr:
--------------------------------------------------------------------------------
1 | 5Добавлять3.2-тестоМвая*ТЕСТ_МАКРО()
--------------------------------------------------------------------------------
/test_grammars/utf8_expr/expr_utf8.lemon:
--------------------------------------------------------------------------------
 1 | /*
 2 | @pymod expr_utf_parse
 3 | 
 4 | @lexdef
 5 | 
 6 | !whitespace : \s+
 7 | 
 8 | ' ' \ := CHAR
 9 | ' " \ s j := STRING
10 | 
11 | ADD := Добавлять
12 | SUB := -
13 | MUL := *
14 | DIV := /
15 | L_PAREN := (
16 | R_PAREN := )
17 | COMMA := ,
18 | 
19 | FLOAT_LIT : [0-9]+\.[0-9]+
20 | INT_LIT : [0-9]+
21 | FNCALL :: ([_а-я][_а-я0-9]*)\s*\(
22 | MACRO :: ([_А-Я][_А-Я0-9]*)\s*\(
23 | IDENT : [_а-яА-Я][_а-яА-Я0-9]*
24 | @endlex
25 | */
26 | 
27 | // token association, and implicit (ascending) priority
28 | %left COMMA FNCALL MACRO.
29 | %left ADD SUB.
30 | %left MUL DIV.
31 | 
32 | toplevel ::= expr(c1).    { _ = c1; }
33 | 
34 | expr(e) ::= expr(c1) ADD(o) expr(c2).    { e = _("+", {c1, c2}, ~o); }
35 | expr(e) ::= expr(c1) SUB(o) expr(c2).    { e = _("-", {c1, c2}, ~o); }
36 | expr(e) ::= expr(c1) MUL(o) expr(c2).    { e = _("*", {c1, c2}, ~o); }
37 | expr(e) ::= expr(c1) DIV(o) expr(c2).    { e = _("/", {c1, c2}, ~o); }
38 | expr(e) ::= SUB expr(c1). [MUL]    { e = _("neg", {c1}, ~c1); }
39 | expr(e) ::= L_PAREN expr(e1) R_PAREN.    { e = e1; }
40 | 
41 | expr(e) ::= varref(e1).    { e = e1; }
42 | varref(e) ::= IDENT(lit).    { e = _("varref", {_(lit)}, ~lit); }
43 | 
44 | expr(e) ::= fncall(e1).    { e = e1; }
45 | expr(e) ::= macro(e1).    { e = e1; }
46 | fncall(e) ::= FNCALL(lit1) arg_list(c2) R_PAREN.    { e = _("fncall", {_(lit1), c2}, ~lit1); }
47 | macro(e) ::= MACRO(lit1) arg_list(c2) R_PAREN.    { e = _("macro", {_(lit1), c2}, ~lit1); }
48 | 
49 | arg_list(L) ::= .    { L = _("arglist"); }
50 | arg_list(L) ::= expr(c1).    { L = _("arglist", {c1}, ~c1); }
51 | arg_list(L) ::= arg_list(L1) COMMA expr(e).    { L = L1 += e; }
52 | 
53 | 
54 | expr(e) ::= FLOAT_LIT(lit).    { e = _(lit); }
55 | expr(e) ::= INT_LIT(lit).    { e = _(lit); }
56 | 
57 | expr(e) ::= CHAR(lit).    { e = _(lit); }
58 | expr(e) ::= STRING(lit).    { e = _(lit); }
59 | 
60 | 
--------------------------------------------------------------------------------