├── .gitignore ├── bin ├── dissect └── dissect.php ├── docs ├── state_3.png ├── index.md ├── cli.md ├── common.md ├── ast.md ├── lexing.md └── parsing.md ├── CHANGELOG.md ├── tests ├── Dissect │ ├── Parser │ │ ├── LALR1 │ │ │ ├── Dumper │ │ │ │ ├── res │ │ │ │ │ ├── table │ │ │ │ │ │ ├── production.php │ │ │ │ │ │ └── debug.php │ │ │ │ │ └── graphviz │ │ │ │ │ │ ├── state.dot │ │ │ │ │ │ └── automaton.dot │ │ │ │ ├── ExampleGrammar.php │ │ │ │ ├── ProductionTableDumperTest.php │ │ │ │ ├── DebugTableDumperTest.php │ │ │ │ └── AutomatonDumperTest.php │ │ │ ├── ArithLexer.php │ │ │ ├── Analysis │ │ │ │ ├── StateTest.php │ │ │ │ ├── AutomatonTest.php │ │ │ │ ├── KernelSet │ │ │ │ │ └── KernelSetTest.php │ │ │ │ ├── ItemTest.php │ │ │ │ └── AnalyzerTest.php │ │ │ ├── ArithGrammar.php │ │ │ └── ParserTest.php │ │ ├── ExampleGrammar.php │ │ ├── RuleTest.php │ │ └── GrammarTest.php │ └── Lexer │ │ ├── StubLexer.php │ │ ├── StubRegexLexer.php │ │ ├── Recognizer │ │ ├── SimpleRecognizerTest.php │ │ └── RegexRecognizerTest.php │ │ ├── RegexLexerTest.php │ │ ├── SimpleLexerTest.php │ │ ├── StatefulLexerTest.php │ │ ├── AbstractLexerTest.php │ │ └── TokenStream │ │ └── ArrayTokenStreamTest.php └── bootstrap.php ├── .travis.yml ├── src └── Dissect │ ├── Parser │ ├── LALR1 │ │ ├── Analysis │ │ │ ├── KernelSet │ │ │ │ ├── Node.php │ │ │ │ └── KernelSet.php │ │ │ ├── Exception │ │ │ │ ├── ConflictException.php │ │ │ │ ├── ShiftReduceConflictException.php │ │ │ │ └── ReduceReduceConflictException.php │ │ │ ├── AnalysisResult.php │ │ │ ├── State.php │ │ │ ├── Automaton.php │ │ │ ├── Item.php │ │ │ └── Analyzer.php │ │ ├── Dumper │ │ │ ├── TableDumper.php │ │ │ ├── StringWriter.php │ │ │ ├── ProductionTableDumper.php │ │ │ ├── AutomatonDumper.php │ │ │ └── DebugTableDumper.php │ │ └── Parser.php │ ├── Parser.php │ ├── Exception │ │ └── UnexpectedTokenException.php │ ├── Rule.php │ └── Grammar.php │ ├── Lexer │ ├── Lexer.php │ ├── Token.php │ ├── Recognizer │ │ ├── Recognizer.php │ │ 
├── SimpleRecognizer.php │ │ └── RegexRecognizer.php │ ├── Exception │ │ └── RecognitionException.php │ ├── CommonToken.php │ ├── TokenStream │ │ ├── TokenStream.php │ │ └── ArrayTokenStream.php │ ├── RegexLexer.php │ ├── AbstractLexer.php │ ├── SimpleLexer.php │ └── StatefulLexer.php │ ├── Console │ ├── Application.php │ └── Command │ │ └── DissectCommand.php │ ├── Util │ └── Util.php │ └── Node │ ├── Node.php │ └── CommonNode.php ├── phpunit.xml ├── TODO.md ├── README.md ├── composer.json └── UNLICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | composer.phar 3 | composer.lock 4 | -------------------------------------------------------------------------------- /bin/dissect: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | array(0=>array('a'=>2,'$eof'=>-2,),2=>array('a'=>2,'b'=>-2,),3=>array('b'=>4,),1=>array('$eof'=>0,),4=>array('$eof'=>-1,'b'=>-1,),),'goto'=>array(0=>array('S'=>1,),2=>array('S'=>3,),)); 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | 3 | php: 4 | - 5.3 5 | - 5.4 6 | 7 | branches: 8 | only: 9 | - master 10 | - develop 11 | 12 | before_script: 13 | - wget http://getcomposer.org/composer.phar 14 | - php composer.phar dump-autoload 15 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | add('Dissect', __DIR__); 10 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Dumper/res/graphviz/state.dot: -------------------------------------------------------------------------------- 1 | digraph State2 { 2 | rankdir="LR"; 3 | 4 | 2 [label="State 2\n\nS → a • S b\nS → • a S b\nS → • [b]"]; 5 | 
3 [label="State 3"]; 6 | 7 | 2 -> 3 [label="S"]; 8 | 2 -> 2 [label="a"]; 9 | } 10 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/ExampleGrammar.php: -------------------------------------------------------------------------------- 1 | is('a', 'b', 'c') 11 | ->is('x', 'y', 'z'); 12 | 13 | $this->start('Foo'); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Dumper/ExampleGrammar.php: -------------------------------------------------------------------------------- 1 | is('a', 'S', 'b') 13 | ->is(/* empty */); 14 | 15 | $this->start('S'); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/KernelSet/Node.php: -------------------------------------------------------------------------------- 1 | kernel = $hashedKernel; 16 | $this->number = $number; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | ./tests 9 | 10 | 11 | 12 | 13 | 14 | ./src 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/RuleTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('y', $r->getComponent(1)); 16 | $this->assertNull($r->getComponent(2)); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Dumper/TableDumper.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | interface TableDumper 11 | { 12 | /** 13 | * Dumps the parse table. 14 | * 15 | * @param array $table The parse table. 16 | * 17 | * @return string The resulting string representation of the table. 
18 | */ 19 | public function dump(array $table); 20 | } 21 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/ArithLexer.php: -------------------------------------------------------------------------------- 1 | regex('INT', '/^[1-9][0-9]*/'); 12 | $this->token('('); 13 | $this->token(')'); 14 | $this->token('+'); 15 | $this->token('-'); 16 | $this->token('**'); 17 | $this->token('*'); 18 | $this->token('/'); 19 | $this->regex('WSP', "/^[ \r\n\t]+/"); 20 | $this->skip('WSP'); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Dumper/res/graphviz/automaton.dot: -------------------------------------------------------------------------------- 1 | digraph Automaton { 2 | rankdir="LR"; 3 | 4 | 0 [label="State 0\n\n• S\nS → • a S b\nS → • [$eof]"]; 5 | 1 [label="State 1\n\nS • [$eof]"]; 6 | 2 [label="State 2\n\nS → a • S b\nS → • a S b\nS → • [b]"]; 7 | 3 [label="State 3\n\nS → a S • b"]; 8 | 4 [label="State 4\n\nS → a S b • [$eof b]"]; 9 | 10 | 0 -> 1 [label="S"]; 11 | 0 -> 2 [label="a"]; 12 | 2 -> 3 [label="S"]; 13 | 2 -> 2 [label="a"]; 14 | 3 -> 4 [label="b"]; 15 | } 16 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | Goals 2 | ===== 3 | 4 | 1.1 5 | --- 6 | 7 | - Optional operator precedence support (à la *yacc*, *bison*) - ✔ 8 | - A performance-oriented regex lexer (based on doctrine/lexer) - ✔ 9 | - An option to generate a hybrid recursive ascent parser - □ 10 | 11 | 1.0 12 | --- 13 | 14 | - Compute reduction lookahead by the channel algorithm from *yacc* 15 | instead of the current LALR-by-SLR algorithm - ✔ 16 | - Change the analyzer API to allow for grammar debugging 17 | (provide access to resolved conflicts, dumping the automaton to DOT ...) 
- ✔ 18 | - Provide classes for dumping the parse table to PHP (both the dev & prod version) - ✔ 19 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/Lexer.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | interface Lexer 12 | { 13 | /** 14 | * Lexes the given string, returning a token stream. 15 | * 16 | * @param string $string The string to lex. 17 | * 18 | * @throws \Dissect\Lexer\Exception\RecognitionException 19 | * When unable to extract more tokens from the string. 20 | * 21 | * @return \Dissect\Lexer\TokenStream\TokenStream The resulting token stream. 22 | */ 23 | public function lex($string); 24 | } 25 | -------------------------------------------------------------------------------- /tests/Dissect/Lexer/StubLexer.php: -------------------------------------------------------------------------------- 1 | getCurrentLine()); 20 | 21 | return $token; 22 | } 23 | 24 | protected function shouldSkipToken(Token $t) 25 | { 26 | return $t->getType() === 'e'; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/Token.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | interface Token 11 | { 12 | /** 13 | * Returns the token type. 14 | * 15 | * @return mixed The token type. 16 | */ 17 | public function getType(); 18 | 19 | /** 20 | * Returns the token value. 21 | * 22 | * @return string The token value. 23 | */ 24 | public function getValue(); 25 | 26 | /** 27 | * Returns the line on which the token was found. 28 | * 29 | * @return int The line. 30 | */ 31 | public function getLine(); 32 | } 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome to Dissect! 
2 | - [master](https://github.com/jakubledl/dissect/tree/master) [![build status](https://travis-ci.org/jakubledl/dissect.png?branch=master)](https://travis-ci.org/jakubledl/dissect) - this branch always contains the last stable version. 3 | - [develop](https://github.com/jakubledl/dissect) [![build status](https://travis-ci.org/jakubledl/dissect.png?branch=develop)](https://travis-ci.org/jakubledl/dissect) - the unstable development branch. 4 | 5 | Dissect is a set of tools for lexical and syntactical analysis written 6 | in pure PHP. 7 | 8 | Documentation? 9 | -------------- 10 | 11 | [Here][docs]. 12 | 13 | [docs]: https://github.com/jakubledl/dissect/blob/master/docs/index.md 14 | -------------------------------------------------------------------------------- /src/Dissect/Parser/Parser.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | interface Parser 13 | { 14 | /** 15 | * The token type that represents an EOF. 16 | */ 17 | const EOF_TOKEN_TYPE = '$eof'; 18 | 19 | /** 20 | * Parses a token stream and returns the semantical value 21 | * of the input. 22 | * 23 | * @param \Dissect\Lexer\TokenStream\TokenStream $stream The token stream. 24 | * 25 | * @return mixed The semantical value of the input. 
26 | */ 27 | public function parse(TokenStream $stream); 28 | } 29 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Analysis/StateTest.php: -------------------------------------------------------------------------------- 1 | assertSame($item1, $state->get(1, 0)); 19 | 20 | $item2 = new Item(new Rule(2, 'T', array('T', '+', 'F')), 0); 21 | $state->add($item2); 22 | 23 | $this->assertSame($item2, $state->get(2, 0)); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/Recognizer/Recognizer.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | interface Recognizer 12 | { 13 | /** 14 | * Returns a boolean value specifying whether 15 | * the string matches or not and if it does, 16 | * returns the match in the second variable. 17 | * 18 | * @param string $string The string to match. 19 | * @param string $result The variable that gets set to the value of the match. 20 | * 21 | * @return boolean Whether the match was successful or not. 22 | */ 23 | public function match($string, &$result); 24 | } 25 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Dumper/ProductionTableDumperTest.php: -------------------------------------------------------------------------------- 1 | analyze($grammar)->getParseTable(); 18 | 19 | $dumper = new ProductionTableDumper(); 20 | $dumped = $dumper->dump($table); 21 | 22 | $this->assertStringEqualsFile(__DIR__ . 
'/res/table/production.php', $dumped); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Dumper/DebugTableDumperTest.php: -------------------------------------------------------------------------------- 1 | analyze($grammar); 18 | 19 | $dumper = new DebugTableDumper($grammar); 20 | $dumped = $dumper->dump($result->getParseTable()); 21 | 22 | $this->assertStringEqualsFile(__DIR__ . '/res/table/debug.php', $dumped); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jakubledl/dissect", 3 | "description": "Lexing and parsing in pure PHP", 4 | "keywords": ["lexing", "parsing", "ast", "parser"], 5 | "homepage": "https://github.com/jakubledl/dissect", 6 | "license": "unlicense", 7 | "authors": [ 8 | { 9 | "name": "Jakub Lédl", 10 | "email": "jakubledl@gmail.com" 11 | } 12 | ], 13 | 14 | "require": { 15 | "php": ">=5.3.3" 16 | }, 17 | 18 | "require-dev": { 19 | "symfony/console": "~2.1" 20 | }, 21 | 22 | "suggest": { 23 | "symfony/console": "for the command-line tool" 24 | }, 25 | 26 | "bin": ["bin/dissect.php", "bin/dissect"], 27 | 28 | "autoload": { 29 | "psr-0": { "Dissect": ["src/"] } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /tests/Dissect/Lexer/StubRegexLexer.php: -------------------------------------------------------------------------------- 1 | operators)) { 28 | return $value; 29 | } else { 30 | throw new RuntimeException(sprintf('Invalid token "%s"', $value)); 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/Recognizer/SimpleRecognizer.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class SimpleRecognizer implements Recognizer 12 | { 13 
| protected $string; 14 | 15 | /** 16 | * Constructor. 17 | * 18 | * @param string $string The string to match by. 19 | */ 20 | public function __construct($string) 21 | { 22 | $this->string = $string; 23 | } 24 | 25 | /** 26 | * {@inheritDoc} 27 | */ 28 | public function match($string, &$result) 29 | { 30 | if (strncmp($string, $this->string, strlen($this->string)) === 0) { 31 | $result = $this->string; 32 | 33 | return true; 34 | } 35 | 36 | return false; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/Exception/RecognitionException.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | class RecognitionException extends RuntimeException 13 | { 14 | protected $sourceLine; 15 | 16 | /** 17 | * Constructor. 18 | * 19 | * @param int $line The line in the source. 20 | */ 21 | public function __construct($line) 22 | { 23 | $this->sourceLine = $line; 24 | 25 | parent::__construct(sprintf("Cannot extract another token at line %d.", $line)); 26 | } 27 | 28 | /** 29 | * Returns the source line number where the exception occured. 30 | * 31 | * @return int The source line number. 32 | */ 33 | public function getSourceLine() 34 | { 35 | return $this->sourceLine; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/Recognizer/RegexRecognizer.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class RegexRecognizer implements Recognizer 12 | { 13 | protected $regex; 14 | 15 | /** 16 | * Constructor. 17 | * 18 | * @param string $regex The regex to use in the match. 
19 | */ 20 | public function __construct($regex) 21 | { 22 | $this->regex = $regex; 23 | } 24 | 25 | /** 26 | * {@inheritDoc} 27 | */ 28 | public function match($string, &$result) 29 | { 30 | $r = preg_match($this->regex, $string, $match, PREG_OFFSET_CAPTURE); 31 | 32 | if ($r === 1 && $match[0][1] === 0) { 33 | $result = $match[0][0]; 34 | 35 | return true; 36 | } 37 | 38 | return false; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tests/Dissect/Lexer/Recognizer/SimpleRecognizerTest.php: -------------------------------------------------------------------------------- 1 | match('class lorem ipsum', $value); 16 | 17 | $this->assertTrue($result); 18 | $this->assertNotNull($value); 19 | $this->assertEquals('class', $value); 20 | } 21 | 22 | /** 23 | * @test 24 | */ 25 | public function recognizerShouldFailAndTheValueShouldStayNull() 26 | { 27 | $recognizer = new SimpleRecognizer('class'); 28 | $result = $recognizer->match('lorem ipsum', $value); 29 | 30 | $this->assertFalse($result); 31 | $this->assertNull($value); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /bin/dissect.php: -------------------------------------------------------------------------------- 1 | run(); 31 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Analysis/AutomatonTest.php: -------------------------------------------------------------------------------- 1 | automaton = new Automaton(); 14 | $this->automaton->addState(new State(0, array())); 15 | $this->automaton->addState(new State(1, array())); 16 | } 17 | 18 | /** 19 | * @test 20 | */ 21 | public function addingATransitionShouldBeVisibleInTheTransitionTable() 22 | { 23 | $this->automaton->addTransition(0, 'a', 1); 24 | $table = $this->automaton->getTransitionTable(); 25 | 26 | $this->assertEquals(1, $table[0]['a']); 27 | } 28 | 29 | /** 30 | * @test 31 | */ 32 | public 
function aNewStateShouldBeIdentifiedByItsNumber() 33 | { 34 | $state = new State(2, array()); 35 | $this->automaton->addState($state); 36 | 37 | $this->assertSame($state, $this->automaton->getState(2)); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /tests/Dissect/Lexer/RegexLexerTest.php: -------------------------------------------------------------------------------- 1 | lexer = new StubRegexLexer(); 15 | } 16 | 17 | /** 18 | * @test 19 | */ 20 | public function itShouldCallGetTypeToRetrieveTokenType() 21 | { 22 | $stream = $this->lexer->lex('5 + 6'); 23 | 24 | $this->assertCount(4, $stream); 25 | $this->assertEquals('INT', $stream->get(0)->getType()); 26 | $this->assertEquals('+', $stream->get(1)->getType()); 27 | $this->assertEquals(Parser::EOF_TOKEN_TYPE, $stream->get(3)->getType()); 28 | } 29 | 30 | /** 31 | * @test 32 | */ 33 | public function itShouldTrackLineNumbers() 34 | { 35 | $stream = $this->lexer->lex("5\n+\n\n5"); 36 | 37 | $this->assertEquals(2, $stream->get(1)->getLine()); 38 | $this->assertEquals(4, $stream->get(2)->getLine()); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Dumper/AutomatonDumperTest.php: -------------------------------------------------------------------------------- 1 | analyze(new ExampleGrammar())->getAutomaton(); 16 | $this->dumper = new AutomatonDumper($automaton); 17 | } 18 | 19 | /** 20 | * @test 21 | */ 22 | public function dumpDumpsTheEntireAutomaton() 23 | { 24 | $this->assertStringEqualsFile( 25 | __DIR__ . '/res/graphviz/automaton.dot', 26 | $this->dumper->dump() 27 | ); 28 | } 29 | 30 | /** 31 | * @test 32 | */ 33 | public function dumpStateDumpsOnlyTheSpecifiedStateAndTransitions() 34 | { 35 | $this->assertStringEqualsFile( 36 | __DIR__ . 
'/res/graphviz/state.dot', 37 | $this->dumper->dumpState(2) 38 | ); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Analysis/KernelSet/KernelSetTest.php: -------------------------------------------------------------------------------- 1 | assertEquals(array(1, 3, 6, 7), KernelSet::hashKernel(array( 15 | array(2, 1), 16 | array(1, 0), 17 | array(2, 0), 18 | array(3, 0), 19 | ))); 20 | } 21 | 22 | /** 23 | * @test 24 | */ 25 | public function insertShouldInsertANewNodeIfNoIdenticalKernelExists() 26 | { 27 | $set = new KernelSet(); 28 | 29 | $this->assertEquals(0, $set->insert(array( 30 | array(2, 1), 31 | ))); 32 | 33 | $this->assertEquals(1, $set->insert(array( 34 | array(2, 2), 35 | ))); 36 | 37 | $this->assertEquals(2, $set->insert(array( 38 | array(1, 1), 39 | ))); 40 | 41 | $this->assertEquals(0, $set->insert(array( 42 | array(2, 1), 43 | ))); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/Exception/ConflictException.php: -------------------------------------------------------------------------------- 1 | 13 | */ 14 | class ConflictException extends LogicException 15 | { 16 | protected $state; 17 | protected $automaton; 18 | 19 | public function __construct($message, $state, Automaton $automaton) 20 | { 21 | parent::__construct($message); 22 | 23 | $this->state = $state; 24 | $this->automaton = $automaton; 25 | } 26 | 27 | /** 28 | * Returns the number of the inadequate state. 29 | * 30 | * @return int 31 | */ 32 | public function getStateNumber() 33 | { 34 | return $this->state; 35 | } 36 | 37 | /** 38 | * Returns the faulty automaton. 
39 | * 40 | * @return \Dissect\Parser\LALR1\Analysis\Automaton 41 | */ 42 | public function getAutomaton() 43 | { 44 | return $this->automaton; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author 9 | of this software dedicates any and all copyright interest in the 10 | software to the public domain. I make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. I intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/CommonToken.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | class CommonToken implements Token 11 | { 12 | /** 13 | * @var mixed 14 | */ 15 | protected $type; 16 | 17 | /** 18 | * @var string 19 | */ 20 | protected $value; 21 | 22 | /** 23 | * @var int 24 | */ 25 | protected $line; 26 | 27 | /** 28 | * Constructor. 29 | * 30 | * @param mixed $type The type of the token. 31 | * @param string $value The token value. 32 | * @param int $line The line. 33 | */ 34 | public function __construct($type, $value, $line) 35 | { 36 | $this->type = $type; 37 | $this->value = $value; 38 | $this->line = $line; 39 | } 40 | 41 | /** 42 | * {@inheritDoc} 43 | */ 44 | public function getType() 45 | { 46 | return $this->type; 47 | } 48 | 49 | /** 50 | * {@inheritDoc} 51 | */ 52 | public function getValue() 53 | { 54 | return $this->value; 55 | } 56 | 57 | /** 58 | * {@inheritDoc} 59 | */ 60 | public function getLine() 61 | { 62 | return $this->line; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Dumper/res/table/debug.php: -------------------------------------------------------------------------------- 1 | array( 5 | 0 => array( 6 | // on a shift and go to state 2 7 | 'a' => 2, 8 | 9 | // on $eof reduce by rule S -> /* empty */ 10 | '$eof' => -2, 11 | 12 | ), 13 | 14 | 1 => array( 15 | // on $eof accept the input 16 | '$eof' => 0, 17 | 18 | ), 19 | 20 | 2 => array( 21 | // on a shift and go to state 2 22 | 'a' => 2, 23 | 24 | // on b reduce by rule S -> /* empty */ 25 | 'b' => -2, 26 | 27 | ), 28 | 29 | 3 => array( 30 | // on b shift and go to state 4 31 | 'b' => 4, 32 | 33 | ), 34 | 35 | 4 => array( 36 | // on $eof reduce by rule S -> a S b 37 | '$eof' => -1, 38 | 39 | // on b reduce by rule S -> a S b 
40 | 'b' => -1, 41 | 42 | ), 43 | 44 | ), 45 | 46 | 'goto' => array( 47 | 0 => array( 48 | // on S go to state 1 49 | 'S' => 1, 50 | 51 | ), 52 | 53 | 2 => array( 54 | // on S go to state 3 55 | 'S' => 3, 56 | 57 | ), 58 | 59 | ), 60 | ); 61 | -------------------------------------------------------------------------------- /tests/Dissect/Lexer/Recognizer/RegexRecognizerTest.php: -------------------------------------------------------------------------------- 1 | match('lorem ipsum', $value); 16 | 17 | $this->assertTrue($result); 18 | $this->assertNotNull($value); 19 | $this->assertEquals('lorem', $value); 20 | } 21 | 22 | /** 23 | * @test 24 | */ 25 | public function recognizerShouldFailAndTheValueShouldStayNull() 26 | { 27 | $recognizer = new RegexRecognizer('/[a-z]+/'); 28 | $result = $recognizer->match('123 456', $value); 29 | 30 | $this->assertFalse($result); 31 | $this->assertNull($value); 32 | } 33 | 34 | /** 35 | * @test 36 | */ 37 | public function recognizerShouldFailIfTheMatchIsNotAtTheBeginningOfTheString() 38 | { 39 | $recognizer = new RegexRecognizer('/[a-z]+/'); 40 | $result = $recognizer->match('234 class', $value); 41 | 42 | $this->assertFalse($result); 43 | $this->assertNull($value); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/GrammarTest.php: -------------------------------------------------------------------------------- 1 | grammar = new ExampleGrammar(); 14 | } 15 | 16 | /** 17 | * @test 18 | */ 19 | public function ruleAlternativesShouldHaveTheSameName() 20 | { 21 | $rules = $this->grammar->getRules(); 22 | 23 | $this->assertEquals('Foo', $rules[1]->getName()); 24 | $this->assertEquals('Foo', $rules[2]->getName()); 25 | } 26 | 27 | /** 28 | * @test 29 | */ 30 | public function theGrammarShouldBeAugmentedWithAStartRule() 31 | { 32 | $this->assertEquals( 33 | Grammar::START_RULE_NAME, 34 | $this->grammar->getStartRule()->getName() 35 | ); 36 | 37 | $this->assertEquals( 
38 | array('Foo'), 39 | $this->grammar->getStartRule()->getComponents() 40 | ); 41 | } 42 | 43 | /** 44 | * @test 45 | */ 46 | public function shouldReturnAlternativesGroupedByName() 47 | { 48 | $rules = $this->grammar->getGroupedRules(); 49 | $this->assertCount(2, $rules['Foo']); 50 | } 51 | 52 | /** 53 | * @test 54 | */ 55 | public function nonterminalsShouldBeDetectedFromRuleNames() 56 | { 57 | $this->assertTrue($this->grammar->hasNonterminal('Foo')); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/Dissect/Console/Application.php: -------------------------------------------------------------------------------- 1 | 14 | */ 15 | class Application extends BaseApplication 16 | { 17 | // credit goes to everzet & kostiklv, since 18 | // I copied the BehatApplication class when 19 | // dealing with some CLI problems. 20 | public function __construct($version) 21 | { 22 | parent::__construct('Dissect', $version); 23 | } 24 | 25 | protected function getCommandName(InputInterface $input) 26 | { 27 | return 'dissect'; 28 | } 29 | 30 | protected function getDefaultCommands() 31 | { 32 | $default = parent::getDefaultCommands(); 33 | $default[] = new Command\DissectCommand(); 34 | 35 | return $default; 36 | } 37 | 38 | public function getDefinition() 39 | { 40 | return new InputDefinition(array( 41 | new InputOption('--help', '-h', InputOption::VALUE_NONE, 'Display this help message.'), 42 | new InputOption('--verbose', '-v', InputOption::VALUE_NONE, 'Increase verbosity of exceptions.'), 43 | new InputOption('--version', '-V', InputOption::VALUE_NONE, 'Display version information.'), 44 | )); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/ArithGrammar.php: -------------------------------------------------------------------------------- 1 | is('Expr', '+', 'Expr') 13 | ->call(function ($l, $_, $r) { 14 | return $l + $r; 15 | }) 16 | 17 | 
->is('Expr', '-', 'Expr') 18 | ->call(function ($l, $_, $r) { 19 | return $l - $r; 20 | }) 21 | 22 | ->is('Expr', '*', 'Expr') 23 | ->call(function ($l, $_, $r) { 24 | return $l * $r; 25 | }) 26 | 27 | ->is('Expr', '/', 'Expr') 28 | ->call(function ($l, $_, $r) { 29 | return $l / $r; 30 | }) 31 | 32 | ->is('Expr', '**', 'Expr') 33 | ->call(function ($l, $_, $r) { 34 | return pow($l, $r); 35 | }) 36 | 37 | ->is('(', 'Expr', ')') 38 | ->call(function ($_, $e, $_) { 39 | return $e; 40 | }) 41 | 42 | ->is('-', 'Expr')->prec(4) 43 | ->call(function ($_, $e) { 44 | return -$e; 45 | }) 46 | 47 | ->is('INT') 48 | ->call(function ($i) { 49 | return (int)$i->getValue(); 50 | }); 51 | 52 | $this->operators('+', '-')->left()->prec(1); 53 | $this->operators('*', '/')->left()->prec(2); 54 | $this->operators('**')->right()->prec(3); 55 | 56 | $this->start('Expr'); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Dumper/StringWriter.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | class StringWriter 11 | { 12 | protected $indent = 0; 13 | protected $string = ''; 14 | 15 | /** 16 | * Appends the given string. 17 | * 18 | * @param string $string The string to write. 19 | */ 20 | public function write($string) 21 | { 22 | $this->string .= $string; 23 | } 24 | 25 | /** 26 | * Gets the string as written so far. 27 | * 28 | * @return string The string. 29 | */ 30 | public function get() 31 | { 32 | return $this->string; 33 | } 34 | 35 | /** 36 | * Adds a level of indentation. 37 | */ 38 | public function indent() 39 | { 40 | $this->indent++; 41 | } 42 | 43 | /** 44 | * Removes a level of indentation. 45 | */ 46 | public function outdent() 47 | { 48 | $this->indent--; 49 | } 50 | 51 | /** 52 | * If a string is given, it writes 53 | * it with correct indentation and 54 | * a newline appended. 
When no string 55 | * is given, it adheres to the rule 56 | * that empty lines should be whitespace-free 57 | * (like vim) and doesn't append any 58 | * indentation. 59 | * 60 | * @param string $string The string to write. 61 | */ 62 | public function writeLine($string = null) 63 | { 64 | if ($string) { 65 | $this->write(sprintf( 66 | "%s%s\n", 67 | str_repeat(' ', $this->indent * 4), 68 | $string 69 | )); 70 | } else { 71 | $this->write("\n"); 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/ParserTest.php: -------------------------------------------------------------------------------- 1 | lexer = new ArithLexer(); 16 | $this->parser = new Parser(new ArithGrammar()); 17 | } 18 | 19 | /** 20 | * @test 21 | */ 22 | public function parserShouldProcessTheTokenStreamAndUseGrammarCallbacksForReductions() 23 | { 24 | $this->assertEquals(-2, $this->parser->parse($this->lexer->lex( 25 | '-1 - 1'))); 26 | 27 | $this->assertEquals(11664, $this->parser->parse($this->lexer->lex( 28 | '6 ** (1 + 1) ** 2 * (5 + 4)'))); 29 | 30 | $this->assertEquals(-4, $this->parser->parse($this->lexer->lex( 31 | '3 - 5 - 2'))); 32 | 33 | $this->assertEquals(262144, $this->parser->parse($this->lexer->lex( 34 | '4 ** 3 ** 2'))); 35 | } 36 | 37 | /** 38 | * @test 39 | */ 40 | public function parserShouldThrowAnExceptionOnInvalidInput() 41 | { 42 | try { 43 | $this->parser->parse($this->lexer->lex('6 ** 5 3')); 44 | $this->fail('Expected an UnexpectedTokenException.'); 45 | } catch (UnexpectedTokenException $e) { 46 | $this->assertEquals('INT', $e->getToken()->getType()); 47 | $this->assertEquals(array('$eof', '+', '-', '*', '/', '**', ')'), $e->getExpected()); 48 | $this->assertEquals(<<getMessage()); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/AnalysisResult.php: 
-------------------------------------------------------------------------------- 1 | 9 | */ 10 | class AnalysisResult 11 | { 12 | /** 13 | * @var \Dissect\Parser\LALR1\Analysis\Automaton 14 | */ 15 | protected $automaton; 16 | 17 | /** 18 | * @var array 19 | */ 20 | protected $parseTable; 21 | 22 | /** 23 | * @var array 24 | */ 25 | protected $resolvedConflicts; 26 | 27 | /** 28 | * Constructor. 29 | * 30 | * @param array $parseTable The parse table. 31 | * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton 32 | * @param array $conflicts An array of conflicts resolved during parse table 33 | * construction. 34 | */ 35 | public function __construct(array $parseTable, Automaton $automaton, array $conflicts) 36 | { 37 | $this->parseTable = $parseTable; 38 | $this->automaton = $automaton; 39 | $this->resolvedConflicts = $conflicts; 40 | } 41 | 42 | /** 43 | * Returns the handle-finding FSA. 44 | * 45 | * @return \Dissect\Parser\LALR1\Analysis\Automaton 46 | */ 47 | public function getAutomaton() 48 | { 49 | return $this->automaton; 50 | } 51 | 52 | /** 53 | * Returns the resulting parse table. 54 | * 55 | * @return array The parse table. 56 | */ 57 | public function getParseTable() 58 | { 59 | return $this->parseTable; 60 | } 61 | 62 | /** 63 | * Returns an array of resolved parse table conflicts. 64 | * 65 | * @return array The conflicts. 66 | */ 67 | public function getResolvedConflicts() 68 | { 69 | return $this->resolvedConflicts; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/Dissect/Parser/Exception/UnexpectedTokenException.php: -------------------------------------------------------------------------------- 1 | 12 | */ 13 | class UnexpectedTokenException extends RuntimeException 14 | { 15 | const MESSAGE = <<token = $token; 40 | $this->expected = $expected; 41 | 42 | if ($token->getValue() !== $token->getType()) { 43 | $info = $token->getValue() . ' (' . $token->getType() . 
')'; 44 | } else { 45 | $info = $token->getType(); 46 | } 47 | 48 | parent::__construct(sprintf( 49 | self::MESSAGE, 50 | $info, 51 | $token->getLine(), 52 | implode(', ', $expected) 53 | )); 54 | } 55 | 56 | /** 57 | * Returns the unexpected token. 58 | * 59 | * @return \Dissect\Lexer\Token The unexpected token. 60 | */ 61 | public function getToken() 62 | { 63 | return $this->token; 64 | } 65 | 66 | /** 67 | * Returns the expected token types. 68 | * 69 | * @return string[] The expected token types. 70 | */ 71 | public function getExpected() 72 | { 73 | return $this->expected; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /tests/Dissect/Lexer/SimpleLexerTest.php: -------------------------------------------------------------------------------- 1 | lexer = new SimpleLexer(); 16 | 17 | $this->lexer 18 | ->token('A', 'a') 19 | ->token('(') 20 | ->token('B', 'b') 21 | ->token(')') 22 | ->token('C', 'c') 23 | ->regex('WS', "/[ \n\t\r]+/") 24 | 25 | ->skip('WS'); 26 | } 27 | 28 | /** 29 | * @test 30 | */ 31 | public function simpleLexerShouldWalkThroughTheRecognizers() 32 | { 33 | $stream = $this->lexer->lex('a (b) c'); 34 | 35 | $this->assertEquals(6, $stream->count()); // with EOF 36 | $this->assertEquals('(', $stream->get(1)->getType()); 37 | $this->assertEquals(1, $stream->get(3)->getLine()); 38 | $this->assertEquals('C', $stream->get(4)->getType()); 39 | } 40 | 41 | /** 42 | * @test 43 | */ 44 | public function simpleLexerShouldSkipSpecifiedTokens() 45 | { 46 | $stream = $this->lexer->lex('a (b) c'); 47 | 48 | foreach ($stream as $token) { 49 | $this->assertNotEquals('WS', $token->getType()); 50 | } 51 | } 52 | 53 | /** 54 | * @test 55 | */ 56 | public function simpleLexerShouldReturnTheBestMatch() 57 | { 58 | $this->lexer->token('CLASS', 'class'); 59 | $this->lexer->regex('WORD', '/[a-z]+/'); 60 | 61 | $stream = $this->lexer->lex('class classloremipsum'); 62 | 63 | $this->assertEquals('CLASS', 
$stream->getCurrentToken()->getType()); 64 | $this->assertEquals('WORD', $stream->lookAhead(1)->getType()); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | Welcome to Dissect! 2 | =================== 3 | 4 | Dissect is a set of tools for lexical and syntactical analysis 5 | written in pure PHP. 6 | 7 | This guide assumes that you're already familiar with basic concepts 8 | of parsing. Explaining them is beyond the scope of this simple guide, 9 | so if you're not, see, for example, [this article][parsing]. 10 | This page serves as an index for individual documentation pages. 11 | 12 | 1. [Lexical analysis with Dissect](lexing.md) 13 | 1. [SimpleLexer](lexing.md#simplelexer) 14 | 2. [StatefulLexer](lexing.md#statefullexer) 15 | 3. [Improving lexer performance](lexing.md#improving-lexer-performance) 16 | 4. [RegexLexer](lexing.md#regexlexer) 17 | 2. [Parsing with Dissect](parsing.md) 18 | 1. [Why an LALR(1) parser?](parsing.md#why-an-lalr1-parser) 19 | 2. [Writing a grammar](parsing.md#writing-a-grammar) 20 | 3. [Example: Parsing mathematical expressions](parsing.md#example-parsing-mathematical-expressions) 21 | 4. [Invalid input](parsing.md#invalid-input) 22 | 5. [Precomputing the parse table](parsing.md#precomputing-the-parse-table) 23 | 6. [Resolving conflicts](parsing.md#resolving-conflicts) 24 | 3. [Building an AST](ast.md) 25 | 1. [Traversing the AST](ast.md#traversing-the-ast) 26 | 4. [Describing common syntactic structures](common.md) 27 | 1. [List of 1 or more `Foo`s](common.md#list-of-1-or-more-foos) 28 | 2. [List of 0 or more `Foo`s](common.md#list-of-0-or-more-foos) 29 | 3. [A comma separated list](common.md#a-comma-separated-list) 30 | 4. [Expressions](common.md#expressions) 31 | 5. [The command-line interface](cli.md) 32 | 1. [Running the tool](cli.md#running-the-tool) 33 | 2. 
[Dumping the parse table in the debug format](cli.md#dumping-the-parse-table-in-the-debug-format) 34 | 3. [Dumping the handle-finding automaton](cli.md#dumping-the-handle-finding-automaton) 35 | 36 | [parsing]: http://en.wikipedia.org/wiki/Parsing 37 | -------------------------------------------------------------------------------- /src/Dissect/Util/Util.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | abstract class Util 11 | { 12 | /** 13 | * Merges two or more sets by values. 14 | * 15 | * {a, b} union {b, c} = {a, b, c} 16 | * 17 | * @return array The union of given sets. 18 | */ 19 | public static function union() 20 | { 21 | return array_unique(call_user_func_array('array_merge', func_get_args())); 22 | } 23 | 24 | /** 25 | * Determines whether two sets have a difference. 26 | * 27 | * @param array $first The first set. 28 | * @param array $second The second set. 29 | * 30 | * @return boolean Whether there is a difference. 31 | */ 32 | public static function different(array $first, array $second) 33 | { 34 | return count(array_diff($first, $second)) !== 0; 35 | } 36 | 37 | /** 38 | * Determines length of a UTF-8 string. 39 | * 40 | * @param string $str The string in UTF-8 encoding. 41 | * 42 | * @return int The length. 43 | */ 44 | public static function stringLength($str) 45 | { 46 | return strlen(utf8_decode($str)); 47 | } 48 | 49 | /** 50 | * Extracts a substring of a UTF-8 string. 51 | * 52 | * @param string $str The string to extract the substring from. 53 | * @param int $position The position from which to start extracting. 54 | * @param int $length The length of the substring. 55 | * 56 | * @return string The substring. 57 | */ 58 | public static function substring($str, $position, $length = null) 59 | { 60 | static $lengthFunc = null; 61 | 62 | if ($lengthFunc === null) { 63 | $lengthFunc = function_exists('mb_substr') ? 
'mb_substr' : 'iconv_substr'; 64 | } 65 | 66 | if ($length === null) { 67 | $length = self::stringLength($str); 68 | } 69 | 70 | return $lengthFunc($str, $position, $length, 'UTF-8'); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/State.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | class State 11 | { 12 | /** 13 | * @var array 14 | */ 15 | protected $items = array(); 16 | 17 | /** 18 | * @var array 19 | */ 20 | protected $itemMap = array(); 21 | 22 | /** 23 | * @var int 24 | */ 25 | protected $number; 26 | 27 | /** 28 | * Constructor. 29 | * 30 | * @param int $number The number identifying this state. 31 | * @param array $items The initial items of this state. 32 | */ 33 | public function __construct($number, array $items) 34 | { 35 | $this->number = $number; 36 | 37 | foreach ($items as $item) { 38 | $this->add($item); 39 | } 40 | } 41 | 42 | /** 43 | * Adds a new item to this state. 44 | * 45 | * @param \Dissect\Parser\LALR1\Analysis\Item $item The new item. 46 | */ 47 | public function add(Item $item) 48 | { 49 | $this->items[] = $item; 50 | 51 | $this->itemMap[$item->getRule()->getNumber()][$item->getDotIndex()] = $item; 52 | } 53 | 54 | /** 55 | * Returns an item by its rule number and dot index. 56 | * 57 | * @param int $ruleNumber The number of the rule of the desired item. 58 | * @param int $dotIndex The dot index of the desired item. 59 | * 60 | * @return \Dissect\Parser\LALR1\Analysis\Item The item. 61 | */ 62 | public function get($ruleNumber, $dotIndex) 63 | { 64 | return $this->itemMap[$ruleNumber][$dotIndex]; 65 | } 66 | 67 | /** 68 | * Returns the number identifying this state. 69 | * 70 | * @return int 71 | */ 72 | public function getNumber() 73 | { 74 | return $this->number; 75 | } 76 | 77 | /** 78 | * Returns an array of items constituting this state. 79 | * 80 | * @return array The items. 
81 | */ 82 | public function getItems() 83 | { 84 | return $this->items; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/TokenStream/TokenStream.php: -------------------------------------------------------------------------------- 1 | 12 | */ 13 | interface TokenStream extends Countable, IteratorAggregate 14 | { 15 | /** 16 | * Returns the current position in the stream. 17 | * 18 | * @return int The current position in the stream. 19 | */ 20 | public function getPosition(); 21 | 22 | /** 23 | * Retrieves the current token. 24 | * 25 | * @return \Dissect\Lexer\Token The current token. 26 | */ 27 | public function getCurrentToken(); 28 | 29 | /** 30 | * Returns a look-ahead token. Negative values are allowed 31 | * and serve as look-behind. 32 | * 33 | * @param int $n The look-ahead. 34 | * 35 | * @throws \OutOfBoundsException If current position + $n is out of range. 36 | * 37 | * @return \Dissect\Lexer\Token The lookahead token. 38 | */ 39 | public function lookAhead($n); 40 | 41 | /** 42 | * Returns the token at absolute position $n. 43 | * 44 | * @param int $n The position. 45 | * 46 | * @throws \OutOfBoundsException If $n is out of range. 47 | * 48 | * @return \Dissect\Lexer\Token The token at position $n. 49 | */ 50 | public function get($n); 51 | 52 | /** 53 | * Moves the cursor to the absolute position $n. 54 | * 55 | * @param int $n The position. 56 | * 57 | * @throws \OutOfBoundsException If $n is out of range. 58 | */ 59 | public function move($n); 60 | 61 | /** 62 | * Moves the cursor by $n, relative to the current position. 63 | * 64 | * @param int $n The seek. 65 | * 66 | * @throws \OutOfBoundsException If current position + $n is out of range. 67 | */ 68 | public function seek($n); 69 | 70 | /** 71 | * Moves the cursor to the next token. 72 | * 73 | * @throws \OutOfBoundsException If at the end of the stream. 
74 | */ 75 | public function next(); 76 | } 77 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/Automaton.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class Automaton 12 | { 13 | /** 14 | * @var array 15 | */ 16 | protected $states = array(); 17 | 18 | /** 19 | * @var array 20 | */ 21 | protected $transitionTable = array(); 22 | 23 | /** 24 | * Adds a new automaton state. 25 | * 26 | * @param \Dissect\Parser\LALR1\Analysis\State $state The new state. 27 | */ 28 | public function addState(State $state) 29 | { 30 | $this->states[$state->getNumber()] = $state; 31 | } 32 | 33 | /** 34 | * Adds a new transition in the FSA. 35 | * 36 | * @param int $origin The number of the origin state. 37 | * @param string $label The symbol that triggers this transition. 38 | * @param int $dest The destination state number. 39 | */ 40 | public function addTransition($origin, $label, $dest) 41 | { 42 | $this->transitionTable[$origin][$label] = $dest; 43 | } 44 | 45 | /** 46 | * Returns a state by its number. 47 | * 48 | * @param int $number The state number. 49 | * 50 | * @return \Dissect\Parser\LALR1\Analysis\State The requested state. 51 | */ 52 | public function getState($number) 53 | { 54 | return $this->states[$number]; 55 | } 56 | 57 | /** 58 | * Does this automaton have a state identified by $number? 59 | * 60 | * @return boolean 61 | */ 62 | public function hasState($number) 63 | { 64 | return isset($this->states[$number]); 65 | } 66 | 67 | /** 68 | * Returns all states in this FSA. 69 | * 70 | * @return array The states of this FSA. 71 | */ 72 | public function getStates() 73 | { 74 | return $this->states; 75 | } 76 | 77 | /** 78 | * Returns the transition table for this automaton. 79 | * 80 | * @return array The transition table. 
81 | */ 82 | public function getTransitionTable() 83 | { 84 | return $this->transitionTable; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/RegexLexer.php: -------------------------------------------------------------------------------- 1 | 12 | * @author Jonathan Wage 13 | * @author Roman Borschel 14 | * @author Jakub Lédl 15 | */ 16 | abstract class RegexLexer implements Lexer 17 | { 18 | /** 19 | * {@inheritDoc} Splits the input with one combined pattern that is built once per concrete lexer class. 20 | */ 21 | public function lex($string) 22 | { 23 | static $regexes = array(); // keyed by concrete class: since PHP 8.1 an inherited method shares its static variables with every subclass 24 | $class = get_class($this); 25 | if (!isset($regexes[$class])) { 26 | $regexes[$class] = '/(' . implode(')|(', $this->getCatchablePatterns()) . ')|' 27 | . implode('|', $this->getNonCatchablePatterns()) . '/i'; 28 | } 29 | $regex = $regexes[$class]; 30 | $string = strtr($string, array("\r\n" => "\n", "\r" => "\n")); 31 | 32 | $flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE; 33 | $matches = preg_split($regex, $string, -1, $flags); 34 | $tokens = array(); 35 | $line = 1; 36 | $oldPosition = 0; 37 | 38 | foreach ($matches as $match) { 39 | list ($value, $position) = $match; 40 | 41 | $type = $this->getType($value); 42 | 43 | if ($position > 0) { 44 | $line += substr_count($string, "\n", $oldPosition, $position - $oldPosition); 45 | } 46 | 47 | $oldPosition = $position; 48 | 49 | $tokens[] = new CommonToken($type, $value, $line); 50 | } 51 | 52 | $tokens[] = new CommonToken(Parser::EOF_TOKEN_TYPE, '', $line); 53 | 54 | return new ArrayTokenStream($tokens); 55 | } 56 | 57 | /** 58 | * The patterns corresponding to tokens. 59 | * 60 | * @return array 61 | */ 62 | abstract protected function getCatchablePatterns(); 63 | 64 | /** 65 | * The patterns corresponding to tokens to be skipped. 66 | * 67 | * @return array 68 | */ 69 | abstract protected function getNonCatchablePatterns(); 70 | 71 | /** 72 | * Retrieves the token type. 
73 | * 74 | * @param string $value 75 | * 76 | * @return string $type 77 | */ 78 | abstract protected function getType(&$value); 79 | } 80 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/Exception/ShiftReduceConflictException.php: -------------------------------------------------------------------------------- 1 | 13 | */ 14 | class ShiftReduceConflictException extends ConflictException 15 | { 16 | /** 17 | * The exception message template. 18 | */ 19 | const MESSAGE = << %s 23 | 24 | (on lookahead "%s" in state %d). Restructure your grammar or choose a conflict resolution mode. 25 | EOT; 26 | 27 | /** 28 | * @var \Dissect\Parser\Rule 29 | */ 30 | protected $rule; 31 | 32 | /** 33 | * @var string 34 | */ 35 | protected $lookahead; 36 | 37 | /** 38 | * Constructor. 39 | * 40 | * @param \Dissect\Parser\Rule $rule The conflicting grammar rule. 41 | * @param string $lookahead The conflicting lookahead to shift. 42 | * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton The faulty automaton. 43 | */ 44 | public function __construct($state, Rule $rule, $lookahead, Automaton $automaton) 45 | { 46 | $components = $rule->getComponents(); 47 | 48 | parent::__construct( 49 | sprintf( 50 | self::MESSAGE, 51 | $rule->getNumber(), 52 | $rule->getName(), 53 | empty($components) ? '/* empty */' : implode(' ', $components), 54 | $lookahead, 55 | $state 56 | ), 57 | $state, 58 | $automaton 59 | ); 60 | 61 | $this->rule = $rule; 62 | $this->lookahead = $lookahead; 63 | } 64 | 65 | /** 66 | * Returns the conflicting rule. 67 | * 68 | * @return \Dissect\Parser\Rule The conflicting rule. 69 | */ 70 | public function getRule() 71 | { 72 | return $this->rule; 73 | } 74 | 75 | /** 76 | * Returns the conflicting lookahead. 77 | * 78 | * @return string The conflicting lookahead. 
79 | */ 80 | public function getLookahead() 81 | { 82 | return $this->lookahead; 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/KernelSet/KernelSet.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | class KernelSet 12 | { 13 | protected $nextNumber = 0; 14 | protected $root = null; 15 | 16 | /** 17 | * Inserts a new node in the BST and returns 18 | * the number of the new state if no such state 19 | * exists. Otherwise, returns the number of the 20 | * existing state. 21 | * 22 | * @param array $kernel The state kernel. 23 | * 24 | * @return int The state number. 25 | */ 26 | public function insert(array $kernel) 27 | { 28 | $kernel = KernelSet::hashKernel($kernel); 29 | 30 | if ($this->root === null) { 31 | $this->root = new Node($kernel, $n = $this->nextNumber++); 32 | 33 | return $n; 34 | } 35 | 36 | $node = $this->root; 37 | 38 | while (true) { 39 | if ($kernel < $node->kernel) { 40 | if ($node->left === null) { 41 | $node->left = new Node($kernel, $n = $this->nextNumber++); 42 | 43 | return $n; 44 | } else { 45 | $node = $node->left; 46 | } 47 | } elseif ($kernel > $node->kernel) { 48 | if ($node->right === null) { 49 | $node->right = new Node($kernel, $n = $this->nextNumber++); 50 | 51 | return $n; 52 | } else { 53 | $node = $node->right; 54 | } 55 | } else { 56 | return $node->number; 57 | } 58 | } 59 | } 60 | 61 | /** 62 | * Hashes a state kernel using a pairing function. 63 | * 64 | * @param array $kernel The kernel. 65 | * 66 | * @return array The hashed kernel. 
67 | */ 68 | public static function hashKernel(array $kernel) 69 | { 70 | $kernel = array_map(function ($tuple) { 71 | list ($car, $cdr) = $tuple; 72 | 73 | return ($car + $cdr) * ($car + $cdr + 1) / 2 + $cdr; 74 | }, $kernel); 75 | 76 | sort($kernel); 77 | 78 | return $kernel; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Analysis/ItemTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('b', $item->getActiveComponent()); 18 | } 19 | 20 | /** 21 | * @test 22 | */ 23 | public function itemShouldBeAReduceItemIfAllComponentsHaveBeenEncountered() 24 | { 25 | $item = new Item(new Rule(1, 'A', array('a', 'b', 'c')), 1); 26 | $this->assertFalse($item->isReduceItem()); 27 | 28 | $item = new Item(new Rule(1, 'A', array('a', 'b', 'c')), 3); 29 | $this->assertTrue($item->isReduceItem()); 30 | } 31 | 32 | /** 33 | * @test 34 | */ 35 | public function itemShouldPumpLookaheadIntoConnectedItems() 36 | { 37 | $item1 = new Item(new Rule(1, 'A', array('a', 'b', 'c')), 1); 38 | $item2 = new Item(new Rule(1, 'A', array('a', 'b', 'c')), 2); 39 | 40 | $item1->connect($item2); 41 | $item1->pump('d'); 42 | 43 | $this->assertContains('d', $item2->getLookahead()); 44 | } 45 | 46 | /** 47 | * @test 48 | */ 49 | public function itemShouldPumpTheSameLookaheadOnlyOnce() 50 | { 51 | $item1 = new Item(new Rule(1, 'A', array('a', 'b', 'c')), 1); 52 | 53 | $item2 = $this->getMock( 54 | 'Dissect\\Parser\\LALR1\\Analysis\\Item', 55 | array('pump'), 56 | array( 57 | new Rule(1, 'A', array('a', 'b', 'c')), 58 | 2, 59 | ) 60 | ); 61 | 62 | $item2->expects($this->once()) 63 | ->method('pump') 64 | ->with($this->equalTo('d')); 65 | 66 | $item1->connect($item2); 67 | 68 | $item1->pump('d'); 69 | $item1->pump('d'); 70 | } 71 | 72 | /** 73 | * @test 74 | */ 75 | public function getUnrecognizedComponentsShouldReturnAllComponentAfterTheDottedOne() 76 | { 77 | $item = new 
Item(new Rule(1, 'A', array('a', 'b', 'c')), 1); 78 | 79 | $this->assertEquals(array('c'), $item->getUnrecognizedComponents()); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /tests/Dissect/Lexer/StatefulLexerTest.php: -------------------------------------------------------------------------------- 1 | lexer = new StatefulLexer(); 16 | } 17 | 18 | /** 19 | * @test 20 | * @expectedException LogicException 21 | * @expectedExceptionMessage Define a lexer state first. 22 | */ 23 | public function addingNewTokenShouldThrowAnExceptionWhenNoStateIsBeingBuilt() 24 | { 25 | $this->lexer->regex('WORD', '/[a-z]+/'); 26 | } 27 | 28 | /** 29 | * @test 30 | * @expectedException LogicException 31 | */ 32 | public function anExceptionShouldBeThrownOnLexingWithoutAStartingState() 33 | { 34 | $this->lexer->state('root'); 35 | $this->lexer->lex('foo'); 36 | } 37 | 38 | /** 39 | * @test 40 | */ 41 | public function theStateMechanismShouldCorrectlyPushAndPopStatesFromTheStack() 42 | { 43 | $this->lexer->state('root') 44 | ->regex('WORD', '/[a-z]+/') 45 | ->regex('WS', "/[ \r\n\t]+/") 46 | ->token('"')->action('string') 47 | ->skip('WS'); 48 | 49 | $this->lexer->state('string') 50 | ->regex('STRING_CONTENTS', '/(\\\\"|[^"])*/') 51 | ->token('"')->action(StatefulLexer::POP_STATE); 52 | 53 | $this->lexer->start('root'); 54 | 55 | $stream = $this->lexer->lex('foo bar "long \\" string" baz quux'); 56 | 57 | $this->assertCount(8, $stream); 58 | $this->assertEquals('STRING_CONTENTS', $stream->get(3)->getType()); 59 | $this->assertEquals('long \\" string', $stream->get(3)->getValue()); 60 | $this->assertEquals('quux', $stream->get(6)->getValue()); 61 | } 62 | 63 | /** 64 | * @test 65 | */ 66 | public function defaultActionShouldBeNop() 67 | { 68 | $this->lexer->state('root') 69 | ->regex('WORD', '/[a-z]+/') 70 | ->regex('WS', "/[ \r\n\t]+/") 71 | ->skip('WS'); 72 | 73 | $this->lexer->state('string'); 74 | 75 | $this->lexer->start('root'); 76 
| 77 | $stream = $this->lexer->lex('foo bar'); 78 | $this->assertEquals(3, $stream->count()); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Dumper/ProductionTableDumper.php: -------------------------------------------------------------------------------- 1 | 12 | */ 13 | class ProductionTableDumper implements TableDumper 14 | { 15 | /** 16 | * {@inheritDoc} 17 | */ 18 | public function dump(array $table) 19 | { 20 | $writer = new StringWriter(); 21 | 22 | $this->writeIntro($writer); 23 | 24 | foreach ($table['action'] as $num => $state) { 25 | $this->writeState($writer, $num, $state); 26 | $writer->write(','); 27 | } 28 | 29 | $this->writeMiddle($writer); 30 | 31 | foreach($table['goto'] as $num => $map) { 32 | $this->writeGoto($writer, $num, $map); 33 | $writer->write(','); 34 | } 35 | 36 | $this->writeOutro($writer); 37 | 38 | $writer->write("\n"); // eof newline 39 | 40 | return $writer->get(); 41 | } 42 | 43 | protected function writeIntro(StringWriter $writer) 44 | { 45 | $writer->write("array("); 46 | } 47 | 48 | protected function writeState(StringWriter $writer, $num, $state) 49 | { 50 | $writer->write((string)$num . '=>array('); 51 | 52 | foreach ($state as $trigger => $action) { 53 | $this->writeAction($writer, $trigger, $action); 54 | $writer->write(','); 55 | } 56 | 57 | $writer->write(')'); 58 | } 59 | 60 | protected function writeAction(StringWriter $writer, $trigger, $action) 61 | { 62 | $writer->write(sprintf( 63 | "'%s'=>%d", 64 | $trigger, 65 | $action 66 | )); 67 | } 68 | 69 | protected function writeMiddle(StringWriter $writer) 70 | { 71 | $writer->write("),'goto'=>array("); 72 | } 73 | 74 | protected function writeGoto(StringWriter $writer, $num, $map) 75 | { 76 | $writer->write((string)$num . 
'=>array('); 77 | 78 | foreach ($map as $trigger => $destination) { 79 | $writer->write(sprintf( 80 | "'%s'=>%d", 81 | $trigger, 82 | $destination 83 | )); 84 | 85 | $writer->write(','); 86 | } 87 | 88 | $writer->write(')'); 89 | } 90 | 91 | protected function writeOutro(StringWriter $writer) 92 | { 93 | $writer->write('));'); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/Dissect/Node/Node.php: -------------------------------------------------------------------------------- 1 | 12 | */ 13 | interface Node extends Countable, IteratorAggregate 14 | { 15 | /** 16 | * Returns the children of this node. 17 | * 18 | * @return array The children belonging to this node. 19 | */ 20 | public function getNodes(); 21 | 22 | /** 23 | * Checks for existence of child node named $name. 24 | * 25 | * @param string $name The name of the child node. 26 | * 27 | * @return boolean Whether the node exists. 28 | */ 29 | public function hasNode($name); 30 | 31 | /** 32 | * Returns a child node specified by $name. 33 | * 34 | * @param int|string $name The name of the node. 35 | * 36 | * @return \Dissect\Node\Node The child node specified by $name. 37 | * 38 | * @throws \RuntimeException When no child node named $name exists. 39 | */ 40 | public function getNode($name); 41 | 42 | /** 43 | * Sets a child node. 44 | * 45 | * @param string $name The name. 46 | * @param \Dissect\Node\Node $child The new child node. 47 | */ 48 | public function setNode($name, Node $child); 49 | 50 | /** 51 | * Removes a child node by name. 52 | * 53 | * @param string $name The name. 54 | */ 55 | public function removeNode($name); 56 | 57 | /** 58 | * Returns all attributes of this node. 59 | * 60 | * @return array The attributes. 61 | */ 62 | public function getAttributes(); 63 | 64 | /** 65 | * Determines whether this node has an attribute 66 | * under $key. 67 | * 68 | * @param string $key The key. 
69 | * @return boolean Whether there's an attribute under $key. 70 | */ 71 | public function hasAttribute($key); 72 | 73 | /** 74 | * Gets an attribute by key. 75 | * 76 | * @param string $key The key. 77 | * @return mixed The attribute value. 78 | * 79 | * @throws \RuntimeException When no attribute exists under $key. 80 | */ 81 | public function getAttribute($key); 82 | 83 | /** 84 | * Sets an attribute by key. 85 | * 86 | * @param string $key The key. 87 | * @param mixed $value The new value. 88 | */ 89 | public function setAttribute($key, $value); 90 | 91 | /** 92 | * Removes an attribute by key. 93 | * 94 | * @param string $key The key. 95 | */ 96 | public function removeAttribute($key); 97 | } 98 | -------------------------------------------------------------------------------- /tests/Dissect/Lexer/AbstractLexerTest.php: -------------------------------------------------------------------------------- 1 | lexer = new StubLexer(); 16 | } 17 | 18 | /** 19 | * @test 20 | */ 21 | public function lexShouldDelegateToExtractTokenUpdatingTheLineAndOffsetAccordingly() 22 | { 23 | $stream = $this->lexer->lex("ab\nc"); 24 | 25 | $this->assertEquals('a', $stream->getCurrentToken()->getValue()); 26 | $this->assertEquals(1, $stream->getCurrentToken()->getLine()); 27 | $stream->next(); 28 | 29 | $this->assertEquals('b', $stream->getCurrentToken()->getValue()); 30 | $this->assertEquals(1, $stream->getCurrentToken()->getLine()); 31 | $stream->next(); 32 | 33 | $this->assertEquals("\n", $stream->getCurrentToken()->getValue()); 34 | $this->assertEquals(1, $stream->getCurrentToken()->getLine()); 35 | $stream->next(); 36 | 37 | $this->assertEquals('c', $stream->getCurrentToken()->getValue()); 38 | $this->assertEquals(2, $stream->getCurrentToken()->getLine()); 39 | } 40 | 41 | /** 42 | * @test 43 | */ 44 | public function lexShouldAppendAnEofTokenAutomatically() 45 | { 46 | $stream = $this->lexer->lex("abc"); 47 | $stream->seek(3); 48 | 49 | 
$this->assertEquals(Parser::EOF_TOKEN_TYPE, $stream->getCurrentToken()->getType()); 50 | $this->assertEquals(1, $stream->getCurrentToken()->getLine()); 51 | } 52 | 53 | /** 54 | * @test 55 | */ 56 | public function lexShouldThrowAnExceptionOnAnUnrecognizableToken() 57 | { 58 | try { 59 | $stream = $this->lexer->lex("abcd"); 60 | $this->fail('Expected a RecognitionException.'); 61 | } catch (RecognitionException $e) { 62 | $this->assertEquals(1, $e->getSourceLine()); 63 | } 64 | } 65 | 66 | /** 67 | * @test 68 | */ 69 | public function lexShouldNormalizeLineEndingsBeforeLexing() 70 | { 71 | $stream = $this->lexer->lex("a\r\nb"); 72 | $this->assertEquals("\n", $stream->get(1)->getValue()); 73 | } 74 | 75 | /** 76 | * @test 77 | */ 78 | public function lexShouldSkipTokensIfToldToDoSo() 79 | { 80 | $stream = $this->lexer->lex('aeb'); 81 | $this->assertNotEquals('e', $stream->get(1)->getType()); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/AbstractLexer.php: -------------------------------------------------------------------------------- 1 | 16 | */ 17 | abstract class AbstractLexer implements Lexer 18 | { 19 | /** 20 | * @var int 21 | */ 22 | private $line = 1; 23 | 24 | /** 25 | * Returns the current line. 26 | * 27 | * @return int The current line. 28 | */ 29 | protected function getCurrentLine() 30 | { 31 | return $this->line; 32 | } 33 | 34 | /** 35 | * Attempts to extract another token from the string. 36 | * Returns the token on success or null on failure. 37 | * 38 | * @param string $string The string to extract the token from. 39 | * 40 | * @return \Dissect\Lexer\Token|null The extracted token or null. 41 | */ 42 | abstract protected function extractToken($string); 43 | 44 | /** 45 | * Should given token be skipped? 46 | * 47 | * @param \Dissect\Lexer\Token $token The token to evaluate. 48 | * 49 | * @return boolean Whether to skip the token. 
50 | */ 51 | abstract protected function shouldSkipToken(Token $token); 52 | 53 | /** 54 | * {@inheritDoc} Line numbering restarts at 1 on every call. 55 | */ 56 | public function lex($string) 57 | { 58 | // normalize line endings 59 | $string = strtr($string, array("\r\n" => "\n", "\r" => "\n")); 60 | 61 | $tokens = array(); 62 | $position = 0; 63 | $originalLength = Util::stringLength($string); 64 | // a lexer instance may be reused, so restart the line counter for each input 65 | $this->line = 1; 66 | while (true) { 67 | $token = $this->extractToken($string); 68 | 69 | if ($token === null) { 70 | break; 71 | } 72 | 73 | if (!$this->shouldSkipToken($token)) { 74 | $tokens[] = $token; 75 | } 76 | 77 | $shift = Util::stringLength($token->getValue()); 78 | 79 | $position += $shift; 80 | 81 | // update the line: count the newlines of the consumed token itself, 82 | // because $position is a character offset while substr_count() 83 | // takes byte offsets (the two diverge on multi-byte input) 84 | $this->line += substr_count($token->getValue(), "\n"); 85 | 86 | $string = Util::substring($string, $shift); 87 | } 88 | 89 | if ($position !== $originalLength) { 90 | throw new RecognitionException($this->line); 91 | } 92 | 93 | $tokens[] = new CommonToken(Parser::EOF_TOKEN_TYPE, '', $this->line); 94 | 95 | return new ArrayTokenStream($tokens); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/Dissect/Parser/Rule.php: -------------------------------------------------------------------------------- 1 | 9 | */ 10 | class Rule 11 | { 12 | /** 13 | * @var int 14 | */ 15 | protected $number; 16 | 17 | /** 18 | * @var string 19 | */ 20 | protected $name; 21 | 22 | /** 23 | * @var string[] 24 | */ 25 | protected $components; 26 | 27 | /** 28 | * @var callable 29 | */ 30 | protected $callback = null; 31 | 32 | /** 33 | * @var int 34 | */ 35 | protected $precedence = null; 36 | 37 | /** 38 | * Constructor. 39 | * 40 | * @param int $number The number of the rule in the grammar. 41 | * @param string $name The name (lhs) of the rule ("A" in "A -> a b c") 42 | * @param string[] $components The components of this rule. 
43 | */ 44 | public function __construct($number, $name, array $components) 45 | { 46 | $this->number = $number; 47 | $this->name = $name; 48 | $this->components = $components; 49 | } 50 | 51 | /** 52 | * Returns the number of this rule. 53 | * 54 | * @return int The number of this rule. 55 | */ 56 | public function getNumber() 57 | { 58 | return $this->number; 59 | } 60 | 61 | /** 62 | * Returns the name of this rule. 63 | * 64 | * @return string The name of this rule. 65 | */ 66 | public function getName() 67 | { 68 | return $this->name; 69 | } 70 | 71 | /** 72 | * Returns the components of this rule. 73 | * 74 | * @return string[] The components of this rule. 75 | */ 76 | public function getComponents() 77 | { 78 | return $this->components; 79 | } 80 | 81 | /** 82 | * Returns a component at index $index or null 83 | * if index is out of range. 84 | * 85 | * @param int $index The index. 86 | * 87 | * @return string The component at index $index. 88 | */ 89 | public function getComponent($index) 90 | { 91 | if (!isset($this->components[$index])) { 92 | return null; 93 | } 94 | 95 | return $this->components[$index]; 96 | } 97 | 98 | /** 99 | * Sets the callback (the semantic value) of the rule. 100 | * 101 | * @param callable $callback The callback. 
102 | */ 103 | public function setCallback($callback) 104 | { 105 | $this->callback = $callback; 106 | } 107 | 108 | public function getCallback() 109 | { 110 | return $this->callback; 111 | } 112 | 113 | public function getPrecedence() 114 | { 115 | return $this->precedence; 116 | } 117 | 118 | public function setPrecedence($i) 119 | { 120 | $this->precedence = $i; 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/TokenStream/ArrayTokenStream.php: -------------------------------------------------------------------------------- 1 | 12 | */ 13 | class ArrayTokenStream implements TokenStream 14 | { 15 | /** 16 | * @var \Dissect\Lexer\Token[] 17 | */ 18 | protected $tokens; 19 | 20 | /** 21 | * @var int 22 | */ 23 | protected $position = 0; 24 | 25 | /** 26 | * Constructor. 27 | * 28 | * @param \Dissect\Lexer\Token[] $tokens The tokens in this stream. 29 | */ 30 | public function __construct(array $tokens) 31 | { 32 | $this->tokens = $tokens; 33 | } 34 | 35 | /** 36 | * {@inheritDoc} 37 | */ 38 | public function getPosition() 39 | { 40 | return $this->position; 41 | } 42 | 43 | /** 44 | * {@inheritDoc} 45 | */ 46 | public function getCurrentToken() 47 | { 48 | return $this->tokens[$this->position]; 49 | } 50 | 51 | /** 52 | * {@inheritDoc} 53 | */ 54 | public function lookAhead($n) 55 | { 56 | if (isset($this->tokens[$this->position + $n])) { 57 | return $this->tokens[$this->position + $n]; 58 | } 59 | 60 | throw new OutOfBoundsException('Invalid look-ahead.'); 61 | } 62 | 63 | /** 64 | * {@inheritDoc} 65 | */ 66 | public function get($n) 67 | { 68 | if (isset($this->tokens[$n])) { 69 | return $this->tokens[$n]; 70 | } 71 | 72 | throw new OutOfBoundsException('Invalid index.'); 73 | } 74 | 75 | /** 76 | * {@inheritDoc} 77 | */ 78 | public function move($n) 79 | { 80 | if (!isset($this->tokens[$n])) { 81 | throw new OutOfBoundsException('Invalid index to move to.'); 82 | } 83 | 84 | $this->position 
= $n; 85 | } 86 | 87 | /** 88 | * {@inheritDoc} 89 | */ 90 | public function seek($n) 91 | { 92 | if (!isset($this->tokens[$this->position + $n])) { 93 | throw new OutOfBoundsException('Invalid seek.'); 94 | } 95 | 96 | $this->position += $n; 97 | } 98 | 99 | /** 100 | * {@inheritDoc} 101 | */ 102 | public function next() 103 | { 104 | if (!isset($this->tokens[$this->position + 1])) { 105 | throw new OutOfBoundsException('Attempting to move beyond the end of the stream.'); 106 | } 107 | 108 | $this->position++; 109 | } 110 | 111 | /** 112 | * @return int 113 | */ 114 | public function count() 115 | { 116 | return count($this->tokens); 117 | } 118 | 119 | /** 120 | * @return \ArrayIterator 121 | */ 122 | public function getIterator() 123 | { 124 | return new ArrayIterator($this->tokens); 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/Dissect/Node/CommonNode.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | class CommonNode implements Node 13 | { 14 | /** 15 | * @var array 16 | */ 17 | protected $nodes; 18 | 19 | /** 20 | * @var array 21 | */ 22 | protected $attributes; 23 | 24 | /** 25 | * Constructor. 26 | * 27 | * @param array $attributes The attributes of this node. 28 | * @param array $children The children of this node. 
29 | */ 30 | public function __construct(array $attributes = array(), array $nodes = array()) 31 | { 32 | $this->attributes = $attributes; 33 | $this->nodes = $nodes; 34 | } 35 | 36 | /** 37 | * {@inheritDoc} 38 | */ 39 | public function getNodes() 40 | { 41 | return $this->nodes; 42 | } 43 | 44 | /** 45 | * {@inheritDoc} 46 | */ 47 | public function hasNode($key) 48 | { 49 | return isset($this->nodes[$key]); 50 | } 51 | 52 | /** 53 | * {@inheritDoc} 54 | */ 55 | public function getNode($key) 56 | { 57 | if (!isset($this->nodes[$key])) { 58 | throw new RuntimeException(sprintf('No child node "%s" exists.', $key)); 59 | } 60 | 61 | return $this->nodes[$key]; 62 | } 63 | 64 | /** 65 | * {@inheritDoc} 66 | */ 67 | public function setNode($key, Node $child) 68 | { 69 | $this->nodes[$key] = $child; 70 | } 71 | 72 | /** 73 | * {@inheritDoc} 74 | */ 75 | public function removeNode($key) 76 | { 77 | unset($this->nodes[$key]); 78 | } 79 | 80 | /** 81 | * {@inheritDoc} 82 | */ 83 | public function getAttributes() 84 | { 85 | return $this->attributes; 86 | } 87 | 88 | /** 89 | * {@inheritDoc} 90 | */ 91 | public function hasAttribute($key) 92 | { 93 | return isset($this->attributes[$key]); 94 | } 95 | 96 | /** 97 | * {@inheritDoc} 98 | */ 99 | public function getAttribute($key) 100 | { 101 | if (!isset($this->attributes[$key])) { 102 | throw new RuntimeException(sprintf('No attribute "%s" exists.', $key)); 103 | } 104 | 105 | return $this->attributes[$key]; 106 | } 107 | 108 | /** 109 | * {@inheritDoc} 110 | */ 111 | public function setAttribute($key, $value) 112 | { 113 | $this->attributes[$key] = $value; 114 | } 115 | 116 | /** 117 | * {@inheritDoc} 118 | */ 119 | public function removeAttribute($key) 120 | { 121 | unset($this->attributes[$key]); 122 | } 123 | 124 | public function count() 125 | { 126 | return count($this->nodes); 127 | } 128 | 129 | public function getIterator() 130 | { 131 | return new ArrayIterator($this->nodes); 132 | } 133 | } 134 | 
-------------------------------------------------------------------------------- /src/Dissect/Lexer/SimpleLexer.php: -------------------------------------------------------------------------------- 1 | 14 | */ 15 | class SimpleLexer extends AbstractLexer 16 | { 17 | /** 18 | * @var array 19 | */ 20 | protected $skipTokens = array(); 21 | 22 | /** 23 | * @var array 24 | */ 25 | protected $recognizers = array(); 26 | 27 | /** 28 | * Adds a new token definition. If given only one argument, 29 | * it assumes that the token type and recognized value are 30 | * identical. 31 | * 32 | * @param string $type The token type. 33 | * @param string $value The value to be recognized. 34 | * 35 | * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface. 36 | */ 37 | public function token($type, $value = null) 38 | { 39 | if ($value !== null && $value !== '') { 40 | $this->recognizers[$type] = new SimpleRecognizer($value); 41 | } else { 42 | $this->recognizers[$type] = new SimpleRecognizer($type); 43 | } 44 | 45 | return $this; 46 | } 47 | 48 | /** 49 | * Adds a new regex token definition. 50 | * 51 | * @param string $type The token type. 52 | * @param string $regex The regular expression used to match the token. 53 | * 54 | * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface. 55 | */ 56 | public function regex($type, $regex) 57 | { 58 | $this->recognizers[$type] = new RegexRecognizer($regex); 59 | 60 | return $this; 61 | } 62 | 63 | /** 64 | * Marks the token types given as arguments to be skipped. 65 | * 66 | * @param mixed $type,... Unlimited number of token types. 67 | * 68 | * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface. 
69 | */ 70 | public function skip() 71 | { 72 | $this->skipTokens = func_get_args(); 73 | 74 | return $this; 75 | } 76 | 77 | /** 78 | * {@inheritDoc} 79 | */ 80 | protected function shouldSkipToken(Token $token) 81 | { 82 | return in_array($token->getType(), $this->skipTokens); 83 | } 84 | 85 | /** 86 | * {@inheritDoc} 87 | */ 88 | protected function extractToken($string) 89 | { 90 | $value = $type = null; 91 | 92 | foreach ($this->recognizers as $t => $recognizer) { 93 | if ($recognizer->match($string, $v)) { 94 | if ($value === null || Util::stringLength($v) > Util::stringLength($value)) { 95 | $value = $v; 96 | $type = $t; 97 | } 98 | } 99 | } 100 | 101 | if ($type !== null) { 102 | return new CommonToken($type, $value, $this->getCurrentLine()); 103 | } 104 | 105 | return null; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Parser.php: -------------------------------------------------------------------------------- 1 | 14 | */ 15 | class Parser implements P\Parser 16 | { 17 | /** 18 | * @var \Dissect\Parser\Grammar 19 | */ 20 | protected $grammar; 21 | 22 | /** 23 | * @var array 24 | */ 25 | protected $parseTable; 26 | 27 | /** 28 | * Constructor. 29 | * 30 | * @param \Dissect\Parser\Grammar $grammar The grammar. 31 | * @param array $parseTable If given, the parser doesn't have to analyze the grammar. 
32 | */ 33 | public function __construct(P\Grammar $grammar, array $parseTable = null) 34 | { 35 | $this->grammar = $grammar; 36 | 37 | if ($parseTable) { 38 | $this->parseTable = $parseTable; 39 | } else { 40 | $analyzer = new Analyzer(); 41 | $this->parseTable = $analyzer->analyze($grammar)->getParseTable(); 42 | } 43 | } 44 | 45 | /** 46 | * {@inheritDoc} 47 | */ 48 | public function parse(TokenStream $stream) 49 | { 50 | $stateStack = array($currentState = 0); 51 | $args = array(); 52 | 53 | foreach ($stream as $token) { 54 | while (true) { 55 | $type = $token->getType(); 56 | 57 | if (!isset($this->parseTable['action'][$currentState][$type])) { 58 | // unexpected token 59 | 60 | throw new UnexpectedTokenException( 61 | $token, 62 | array_keys($this->parseTable['action'][$currentState]) 63 | ); 64 | } 65 | 66 | $action = $this->parseTable['action'][$currentState][$type]; 67 | 68 | if ($action > 0) { 69 | // shift 70 | 71 | $args[] = $token; 72 | $stateStack[] = $currentState = $action; 73 | 74 | break; 75 | } elseif ($action < 0) { 76 | // reduce 77 | $rule = $this->grammar->getRule(-$action); 78 | $popCount = count($rule->getComponents()); 79 | // use positive offsets; -$popCount is 0 for empty rules and would clear the stacks 80 | array_splice($stateStack, count($stateStack) - $popCount); 81 | $newArgs = array_splice($args, count($args) - $popCount); 82 | 83 | if ($callback = $rule->getCallback()) { 84 | $args[] = call_user_func_array($callback, $newArgs); 85 | } else { 86 | $args[] = isset($newArgs[0]) ? $newArgs[0] : null; 87 | } 88 | 89 | $state = $stateStack[count($stateStack) - 1]; 90 | $stateStack[] = $currentState = $this->parseTable['goto'] 91 | [$state][$rule->getName()]; 92 | } else { 93 | // accept 94 | 95 | return $args[0]; 96 | } 97 | } 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /docs/cli.md: -------------------------------------------------------------------------------- 1 | The command-line interface 2 | ========================== 3 | 4 | Dissect provides you with a command-line tool for processing and 5 | debugging your 
grammars. This chapter describes the tool and its 6 | options. 7 | 8 | Running the tool 9 | ---------------- 10 | 11 | Let's assume that the executable is located in a folder called `bin`. 12 | The most basic way to invoke it is 13 | 14 | $ bin/dissect 15 | 16 | This will analyze the given grammar and, if successful, save the parse 17 | table in a file `parse_table.php` in the same folder where you've 18 | defined your grammar. You can use `/` instead of `\` as the namespace 19 | separator or enclose the class name in quotes. 20 | 21 | To change the directory in which the parse table will be saved, use the 22 | `--output-dir` (or `-o`) option: 23 | 24 | $ bin/dissect --output-dir=../dir 25 | 26 | Dumping the parse table in the debug format 27 | ------------------------------------------- 28 | 29 | By default, the parse table will be saved as a single line of PHP code, 30 | with minimal whitespace. If you want to inspect the generated table 31 | manually, you can use the `--debug` (or `-d`) option: 32 | 33 | $ bin/dissect --debug 34 | 35 | The parse table will then be written in a human-readable way and with 36 | comments explaining the steps of the parser. 37 | 38 | Dumping the handle-finding automaton 39 | ------------------------------------ 40 | 41 | If you have an understanding of the LR parsing process, being able to 42 | inspect the LR automaton visually could be an aid in resolving potential 43 | grammar conflicts. In order to dump the automaton as a Graphviz graph, 44 | use the `--dfa` (or `-D`) option: 45 | 46 | $ bin/dissect --dfa 47 | 48 | This will create a file called `automaton.dot` in the output directory. 49 | You can then run something like 50 | 51 | dot -Tpng automaton.dot > automaton.png 52 | 53 | to render it as a PNG image. 54 | 55 | Of course, for more complex grammars, the automaton will quickly become rather large 56 | and unwieldy. 
You can then use the `--state` (or `-s`) option to dump 57 | only the specified state: 58 | 59 | $ bin/dissect --dfa --state=5 60 | 61 | As an example, let's say we use the following grammar: 62 | 63 | ```php 64 | class PalindromeGrammar extends Grammar 65 | { 66 | public function __construct() 67 | { 68 | $this('S') 69 | ->is('a', 'S', 'a') 70 | ->is('b', 'S', 'b') 71 | ->is(/* empty */); 72 | 73 | $this->start('S'); 74 | } 75 | } 76 | ``` 77 | 78 | When running the command-line tool, we'll notice a list of resolved 79 | conflicts in the output: 80 | 81 | Resolved a shift/reduce conflict in state 2 on lookahead a 82 | Resolved a shift/reduce conflict in state 3 on lookahead b 83 | 84 | If we wanted to examine the conflict in state 3, we could run 85 | 86 | $ bin/dissect PalindromeGrammar --dfa --state=3 87 | 88 | and then 89 | 90 | $ dot -Tpng state_3.dot > state_3.png 91 | 92 | The result will be the following image: 93 | 94 | ![State 3](https://raw.github.com/jakubledl/dissect/develop/docs/state_3.png) 95 | 96 | in which we can clearly see how the conflict arose: the state #3 calls 97 | both for a shift and a reduction by the rule `S -> ` on 98 | lookahead `b`. 99 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/Exception/ReduceReduceConflictException.php: -------------------------------------------------------------------------------- 1 | 13 | */ 14 | class ReduceReduceConflictException extends ConflictException 15 | { 16 | /** 17 | * The exception message template. 18 | */ 19 | const MESSAGE = << %s 23 | 24 | vs: 25 | 26 | %d. %s -> %s 27 | 28 | (on lookahead "%s" in state %d). Restructure your grammar or choose a conflict resolution mode. 
29 | EOT; 30 | 31 | /** 32 | * @var \Dissect\Parser\Rule 33 | */ 34 | protected $firstRule; 35 | 36 | /** 37 | * @var \Dissect\Parser\Rule 38 | */ 39 | protected $secondRule; 40 | 41 | /** 42 | * @var string 43 | */ 44 | protected $lookahead; 45 | 46 | /** 47 | * Constructor. 48 | * 49 | * @param int $state The number of the inadequate state. 50 | * @param \Dissect\Parser\Rule $firstRule The first conflicting grammar rule. 51 | * @param \Dissect\Parser\Rule $secondRule The second conflicting grammar rule. 52 | * @param string $lookahead The conflicting lookahead. 53 | * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton The faulty automaton. 54 | */ 55 | public function __construct($state, Rule $firstRule, Rule $secondRule, $lookahead, Automaton $automaton) 56 | { 57 | $components1 = $firstRule->getComponents(); 58 | $components2 = $secondRule->getComponents(); 59 | 60 | parent::__construct( 61 | sprintf( 62 | self::MESSAGE, 63 | $firstRule->getNumber(), 64 | $firstRule->getName(), 65 | empty($components1) ? '/* empty */' : implode(' ', $components1), 66 | $secondRule->getNumber(), 67 | $secondRule->getName(), 68 | empty($components2) ? '/* empty */' : implode(' ', $components2), 69 | $lookahead, 70 | $state 71 | ), 72 | $state, 73 | $automaton 74 | ); 75 | 76 | $this->firstRule = $firstRule; 77 | $this->secondRule = $secondRule; 78 | $this->lookahead = $lookahead; 79 | } 80 | 81 | /** 82 | * Returns the first conflicting rule. 83 | * 84 | * @return \Dissect\Parser\Rule The first conflicting rule. 85 | */ 86 | public function getFirstRule() 87 | { 88 | return $this->firstRule; 89 | } 90 | 91 | /** 92 | * Returns the second conflicting rule. 93 | * 94 | * @return \Dissect\Parser\Rule The second conflicting rule. 95 | */ 96 | public function getSecondRule() 97 | { 98 | return $this->secondRule; 99 | } 100 | 101 | /** 102 | * Returns the conflicting lookahead. 103 | * 104 | * @return string The conflicting lookahead. 
105 | */ 106 | public function getLookahead() 107 | { 108 | return $this->lookahead; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /tests/Dissect/Lexer/TokenStream/ArrayTokenStreamTest.php: -------------------------------------------------------------------------------- 1 | stream = new ArrayTokenStream(array( 15 | new CommonToken('INT', '6', 1, 1), 16 | new CommonToken('PLUS', '+', 1, 3), 17 | new CommonToken('INT', '5', 1, 5), 18 | new CommonToken('MINUS', '-', 1, 7), 19 | new CommonToken('INT', '3', 1, 9), 20 | )); 21 | } 22 | 23 | /** 24 | * @test 25 | */ 26 | public function theCursorShouldBeOnFirstTokenByDefault() 27 | { 28 | $this->assertEquals('6', $this->stream->getCurrentToken()->getValue()); 29 | } 30 | 31 | /** 32 | * @test 33 | */ 34 | public function getPositionShouldReturnCurrentPosition() 35 | { 36 | $this->stream->seek(2); 37 | $this->stream->next(); 38 | 39 | $this->assertEquals(3, $this->stream->getPosition()); 40 | } 41 | 42 | /** 43 | * @test 44 | */ 45 | public function lookAheadShouldReturnTheCorrectToken() 46 | { 47 | $this->assertEquals('5', $this->stream->lookAhead(2)->getValue()); 48 | } 49 | 50 | /** 51 | * @test 52 | * @expectedException OutOfBoundsException 53 | */ 54 | public function lookAheadShouldThrowAnExceptionWhenInvalid() 55 | { 56 | $this->stream->lookAhead(15); 57 | } 58 | 59 | /** 60 | * @test 61 | */ 62 | public function getShouldReturnATokenByAbsolutePosition() 63 | { 64 | $this->assertEquals('3', $this->stream->get(4)->getValue()); 65 | } 66 | 67 | /** 68 | * @test 69 | * @expectedException OutOfBoundsException 70 | */ 71 | public function getShouldThrowAnExceptionWhenInvalid() 72 | { 73 | $this->stream->get(15); 74 | } 75 | 76 | /** 77 | * @test 78 | */ 79 | public function moveShouldMoveTheCursorByToAnAbsolutePosition() 80 | { 81 | $this->stream->move(2); 82 | $this->assertEquals('5', $this->stream->getCurrentToken()->getValue()); 83 | } 84 | 85 | /** 86 | * @test 
87 | * @expectedException OutOfBoundsException 88 | */ 89 | public function moveShouldThrowAnExceptionWhenInvalid() 90 | { 91 | $this->stream->move(15); 92 | } 93 | 94 | /** 95 | * @test 96 | */ 97 | public function seekShouldMoveTheCursorByRelativeOffset() 98 | { 99 | $this->stream->seek(4); 100 | $this->assertEquals('3', $this->stream->getCurrentToken()->getValue()); 101 | } 102 | 103 | /** 104 | * @test 105 | * @expectedException OutOfBoundsException 106 | */ 107 | public function seekShouldThrowAnExceptionWhenInvalid() 108 | { 109 | $this->stream->seek(15); 110 | } 111 | 112 | /** 113 | * @test 114 | */ 115 | public function nextShouldMoveTheCursorOneTokenAhead() 116 | { 117 | $this->stream->next(); 118 | $this->assertEquals('PLUS', $this->stream->getCurrentToken()->getType()); 119 | 120 | $this->stream->next(); 121 | $this->assertEquals('5', $this->stream->getCurrentToken()->getValue()); 122 | } 123 | 124 | /** 125 | * @test 126 | * @expectedException OutOfBoundsException 127 | */ 128 | public function nextShouldThrowAnExceptionWhenAtTheEndOfTheStream() 129 | { 130 | $this->stream->seek(4); 131 | $this->stream->next(); 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /docs/common.md: -------------------------------------------------------------------------------- 1 | Describing common syntactic structures 2 | ====================================== 3 | 4 | This chapter of the documentation shows how to implement common 5 | grammar patterns like lists & repetitions in a way that's most efficient 6 | for a LALR(1) parser like Dissect. 
7 | 8 | List of 1 or more `Foo`s 9 | ------------------------ 10 | 11 | ```php 12 | $this('Foo+') 13 | ->is('Foo+', 'Foo') 14 | ->call(function ($list, $foo) { 15 | $list[] = $foo; 16 | 17 | return $list; 18 | }) 19 | 20 | ->is('Foo') 21 | ->call(function ($foo) { 22 | return [$foo]; 23 | }); 24 | ``` 25 | 26 | With some practice, it's very easy to see how this works: when the 27 | parser recognizes the first `Foo`, it reduces it to a single-item array 28 | and for each following `Foo`, it just pushes it onto the array. 29 | 30 | Note that `Foo+` is just a rule name, it could be equally well called 31 | `Foos`, `ListOfFoo` or anything else you feel like. 32 | 33 | List of 0 or more `Foo`s 34 | ------------------------ 35 | 36 | ```php 37 | $this('Foo*') 38 | ->is('Foo*', 'Foo') 39 | ->call(function ($list, $foo) { 40 | $list[] = $foo; 41 | 42 | return $list; 43 | }) 44 | 45 | ->is(/* empty */) 46 | ->call(function () { 47 | return []; 48 | }); 49 | ``` 50 | 51 | This works pretty much the same like the previous example, the only 52 | difference being that we allow `Foo*` to match nothing. 53 | 54 | A comma separated list 55 | ---------------------- 56 | 57 | The first example of this chapter is trivial to modify to include 58 | commas between the `Foo`s. Just change the second line to: 59 | 60 | ```php 61 | $this('Foo+') 62 | ->is('Foo+', ',', 'Foo') 63 | ... 64 | ``` 65 | 66 | The second example, however, cannot be modified so easily. We cannot 67 | just put a comma in the first alternative: 68 | 69 | ```php 70 | $this('Foo*') 71 | ->is('Foo*', ',', 'Foo') 72 | ... 73 | ``` 74 | 75 | since that would allow the list to start with a comma: 76 | 77 | , Foo , Foo , Foo 78 | 79 | Instead, we say that a "list of zero or more `Foo`s 80 | separated by commas" is actually "a list of one or more `Foo`s separated 81 | by commas or nothing at all". 
So our rule now becomes: 82 | 83 | ```php 84 | $this('Foo*') 85 | ->is('Foo+') 86 | 87 | ->is(/* empty */) 88 | ->call(function () { 89 | return []; 90 | }); 91 | 92 | $this('Foo+') 93 | ->is('Foo+', ',', 'Foo') 94 | ... 95 | ``` 96 | 97 | A note on left recursion 98 | ------------------------ 99 | 100 | One of the principal advantages of LR parsers over alternatives like LL 101 | or recursive descent is the ability to handle left-recursive rules, 102 | which are a natural expression of many grammar patterns. However, not 103 | only do LR parsers handle left recursion, they actually work *better* 104 | with left-recursive rules than with right-recursive ones in terms of 105 | memory, since a left-recursive rule can be recognized using a constant 106 | amount of memory, whereas for right-recursive rules, the amount of 107 | memory required grows linearly with each round of recursion. 108 | 109 | You may have noticed that all the examples above use left recursion for 110 | two reasons: efficiency and naturalness (you read arrays from left to 111 | right, not the other way around, right?). 112 | 113 | In short, when you *can* comfortably express your rule using left recursion, 114 | *do* so. 115 | 116 | Expressions 117 | ----------- 118 | 119 | A grammar for very basic mathematical expressions is described in the 120 | [chapter on parsing][arith]. It would require some modifications to allow 121 | for other operators, function calls, ternary operator(s), but there's a 122 | lot of grammars for practical programming languages on the internet that 123 | you can take inspiration from. 124 | 125 | For a familiar (although slightly less readable) example, take a look 126 | at [this grammar][php-grammar] for PHP itself. 
127 | 128 | [php-grammar]: https://github.com/php/php-src/blob/master/Zend/zend_language_parser.y 129 | [arith]: parsing.md#example-parsing-mathematical-expressions 130 | -------------------------------------------------------------------------------- /docs/ast.md: -------------------------------------------------------------------------------- 1 | Building an AST 2 | =============== 3 | 4 | Often, when parsing a language that's more complex than 5 | [mathematical expressions][prev], you will want to represent 6 | the input as an *abstract syntax tree*, or AST (for a real-life 7 | example, see [Twig][twig-ast] or [Gherkin][gherkin-ast]). 8 | 9 | Getting the AST of the input with Dissect is nothing special; the 10 | callbacks in your grammar can return anything, so they might as well 11 | return AST nodes. Dissect however helps you by providing a simple base 12 | class for the different node types: `Dissect\Node\CommonNode`. 13 | 14 | Let's say we want to create an AST for the mathematical expressions from 15 | the previous chapter. 
Since the input can consist of binary operations 16 | and integers, let's create a subclass for each case: 17 | 18 | ```php 19 | use Dissect\Node\CommonNode; 20 | use Dissect\Node\Node; 21 | 22 | class BinaryExpressionNode extends CommonNode 23 | { 24 | const PLUS = 1; 25 | const TIMES = 2; 26 | const POWER = 3; 27 | 28 | public function __construct(Node $left, $op, Node $right) 29 | { 30 | parent::__construct(['operator' => $op], [ 31 | 'left' => $left, 32 | 'right' => $right, 33 | ]); 34 | } 35 | 36 | public function getLeft() 37 | { 38 | return $this->getNode('left'); 39 | } 40 | 41 | public function getRight() 42 | { 43 | return $this->getNode('right'); 44 | } 45 | 46 | public function getOperator() 47 | { 48 | return $this->getAttribute('operator'); 49 | } 50 | } 51 | 52 | class IntNode extends CommonNode 53 | { 54 | public function __construct($value) 55 | { 56 | parent::__construct(['value' => $value]); 57 | } 58 | 59 | public function getValue() 60 | { 61 | return $this->getAttribute('value'); 62 | } 63 | } 64 | ``` 65 | 66 | The original constructor has two parameters, an array of node attributes and 67 | an array of child nodes. `Dissect\Node\Node` is an interface 68 | describing common operations for an AST node. 
69 | 70 | We can now easily modify the original grammar to build the AST: 71 | 72 | ```php 73 | $this('Additive') 74 | ->is('Additive', '+', 'Multiplicative') 75 | ->call(function ($l, $_, $r) { 76 | return new BinaryExpressionNode($l, BinaryExpressionNode::PLUS, $r); 77 | }) 78 | 79 | ->is('Multiplicative'); 80 | 81 | $this('Multiplicative') 82 | ->is('Multiplicative', '*', 'Power') 83 | ->call(function ($l, $_, $r) { 84 | return new BinaryExpressionNode($l, BinaryExpressionNode::TIMES, $r); 85 | }) 86 | 87 | ->is('Power'); 88 | 89 | $this('Power') 90 | ->is('Primary', '**', 'Power') 91 | ->call(function ($l, $_, $r) { 92 | return new BinaryExpressionNode($l, BinaryExpressionNode::POWER, $r); 93 | }) 94 | 95 | ->is('Primary'); 96 | 97 | $this('Primary') 98 | ->is('(', 'Additive', ')') 99 | ->call(function ($_, $e, $_) { 100 | return $e; 101 | }) 102 | 103 | ->is('INT') 104 | ->call(function ($int) { 105 | return new IntNode((int)$int->getValue()); 106 | }); 107 | ``` 108 | 109 | Traversing the AST 110 | ------------------ 111 | 112 | When we have the AST of our input, we want to interpret it somehow. 113 | The most common way to do this is to create a *node visitor* (sometimes 114 | called a *tree walker*). 
A trivial node visitor for our example could be 115 | the following recursive function: 116 | 117 | ```php 118 | function visit(Node $node) 119 | { 120 | if ($node instanceof BinaryExpressionNode) { 121 | switch ($node->getOperator()) { 122 | case BinaryExpressionNode::PLUS: 123 | return visit($node->getLeft()) + visit($node->getRight()); 124 | case BinaryExpressionNode::TIMES: 125 | return visit($node->getLeft()) * visit($node->getRight()); 126 | case BinaryExpressionNode::POWER: 127 | return pow(visit($node->getLeft()), visit($node->getRight())); 128 | } 129 | } elseif ($node instanceof IntNode) { 130 | return $node->getValue(); 131 | } else { 132 | throw new \Exception("Unknown node type."); 133 | } 134 | } 135 | 136 | echo visit($parser->parse(...)); 137 | ``` 138 | 139 | [prev]: parsing.md#example-parsing-mathematical-expressions 140 | [twig-ast]: https://github.com/fabpot/Twig/tree/master/lib/Twig/Node 141 | [gherkin-ast]: https://github.com/Behat/Gherkin/tree/master/src/Behat/Gherkin/Node 142 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/Item.php: -------------------------------------------------------------------------------- 1 | 15 | * A -> a . b c 16 | * 17 | * 18 | * This means that within this item, a has been recognized 19 | * and b is expected. If the dot is at the very end of the 20 | * rule: 21 | * 22 | *
 23 |  * A -> a b c .
 24 |  * 
25 | * 26 | * it means that the whole rule has been recognized and 27 | * can be reduced. 28 | * 29 | * @author Jakub Lédl 30 | */ 31 | class Item 32 | { 33 | /** 34 | * @var \Dissect\Parser\Rule 35 | */ 36 | protected $rule; 37 | 38 | /** 39 | * @var int 40 | */ 41 | protected $dotIndex; 42 | 43 | /** 44 | * @var array 45 | */ 46 | protected $lookahead = array(); 47 | 48 | /** 49 | * @var array 50 | */ 51 | protected $connected = array(); 52 | 53 | /** 54 | * Constructor. 55 | * 56 | * @param \Dissect\Parser\Rule $rule The rule of this item. 57 | * @param int $dotIndex The index of the dot in this item. 58 | */ 59 | public function __construct(Rule $rule, $dotIndex) 60 | { 61 | $this->rule = $rule; 62 | $this->dotIndex = $dotIndex; 63 | } 64 | 65 | /** 66 | * Returns the dot index of this item. 67 | * 68 | * @return int The dot index. 69 | */ 70 | public function getDotIndex() 71 | { 72 | return $this->dotIndex; 73 | } 74 | 75 | /** 76 | * Returns the currently expected component. 77 | * 78 | * If the item is: 79 | * 80 | *
 81 |      * A -> a . b c
 82 |      * 
83 | * 84 | * then this method returns the component "b". 85 | * 86 | * @return string The component. 87 | */ 88 | public function getActiveComponent() 89 | { 90 | return $this->rule->getComponent($this->dotIndex); 91 | } 92 | 93 | /** 94 | * Returns the rule of this item. 95 | * 96 | * @return \Dissect\Parser\Rule The rule. 97 | */ 98 | public function getRule() 99 | { 100 | return $this->rule; 101 | } 102 | 103 | /** 104 | * Determines whether this item is a reduce item. 105 | * 106 | * An item is a reduce item if the dot is at the very end: 107 | * 108 | *
109 |      * A -> a b c .
110 |      * 
111 | * 112 | * @return boolean Whether this item is a reduce item. 113 | */ 114 | public function isReduceItem() 115 | { 116 | return $this->dotIndex === count($this->rule->getComponents()); 117 | } 118 | 119 | /** 120 | * Connects two items with a lookahead pumping channel. 121 | * 122 | * @param \Dissect\Parser\LALR1\Analysis\Item $i The item. 123 | */ 124 | public function connect(Item $i) 125 | { 126 | $this->connected[] = $i; 127 | } 128 | 129 | /** 130 | * Pumps a lookahead token to this item and all items connected 131 | * to it. 132 | * 133 | * @param string $lookahead The lookahead token name. 134 | */ 135 | public function pump($lookahead) 136 | { 137 | if (!in_array($lookahead, $this->lookahead)) { 138 | $this->lookahead[] = $lookahead; 139 | 140 | foreach ($this->connected as $item) { 141 | $item->pump($lookahead); 142 | } 143 | } 144 | } 145 | 146 | /** 147 | * Pumps several lookahead tokens. 148 | * 149 | * @param array $lookahead The lookahead tokens. 150 | */ 151 | public function pumpAll(array $lookahead) 152 | { 153 | foreach ($lookahead as $l) { 154 | $this->pump($l); 155 | } 156 | } 157 | 158 | /** 159 | * Returns the computed lookahead for this item. 160 | * 161 | * @return string[] The lookahead symbols. 162 | */ 163 | public function getLookahead() 164 | { 165 | return $this->lookahead; 166 | } 167 | 168 | /** 169 | * Returns all components that haven't been recognized 170 | * so far. 171 | * 172 | * @return array The unrecognized components. 173 | */ 174 | public function getUnrecognizedComponents() 175 | { 176 | return array_slice($this->rule->getComponents(), $this->dotIndex + 1); 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Dumper/AutomatonDumper.php: -------------------------------------------------------------------------------- 1 | 14 | */ 15 | class AutomatonDumper 16 | { 17 | protected $automaton; 18 | 19 | /** 20 | * Constructor. 
21 | * 22 | * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton 23 | */ 24 | public function __construct(Automaton $automaton) 25 | { 26 | $this->automaton = $automaton; 27 | } 28 | 29 | /** 30 | * Dumps the entire automaton. 31 | * 32 | * @return string The automaton encoded in DOT. 33 | */ 34 | public function dump() 35 | { 36 | $writer = new StringWriter(); 37 | 38 | $this->writeHeader($writer); 39 | $writer->writeLine(); 40 | 41 | foreach ($this->automaton->getStates() as $state) { 42 | $this->writeState($writer, $state); 43 | } 44 | 45 | $writer->writeLine(); 46 | 47 | foreach ($this->automaton->getTransitionTable() as $num => $map) { 48 | foreach ($map as $trigger => $destination) { 49 | $writer->writeLine(sprintf( 50 | '%d -> %d [label="%s"];', 51 | $num, 52 | $destination, 53 | $trigger 54 | )); 55 | } 56 | } 57 | 58 | $writer->outdent(); 59 | $this->writeFooter($writer); 60 | 61 | return $writer->get(); 62 | } 63 | 64 | /** 65 | * Dumps only the specified state + any relevant 66 | * transitions. 67 | * 68 | * @param int $n The number of the state. 69 | * 70 | * @return string The output in DOT format. 71 | */ 72 | public function dumpState($n) 73 | { 74 | $writer = new StringWriter(); 75 | 76 | $this->writeHeader($writer, $n); 77 | $writer->writeLine(); 78 | 79 | $this->writeState($writer, $this->automaton->getState($n)); 80 | 81 | $table = $this->automaton->getTransitionTable(); 82 | $row = isset($table[$n]) ? 
$table[$n] : array(); 83 | 84 | foreach ($row as $dest) { 85 | if ($dest !== $n) { 86 | $this->writeState($writer, $this->automaton->getState($dest), false); 87 | } 88 | } 89 | 90 | $writer->writeLine(); 91 | 92 | foreach ($row as $trigger => $dest) { 93 | $writer->writeLine(sprintf( 94 | '%d -> %d [label="%s"];', 95 | $n, 96 | $dest, 97 | $trigger 98 | )); 99 | } 100 | 101 | $writer->outdent(); 102 | $this->writeFooter($writer); 103 | 104 | return $writer->get(); 105 | } 106 | 107 | protected function writeHeader(StringWriter $writer, $stateNumber = null) 108 | { 109 | $writer->writeLine(sprintf( 110 | 'digraph %s {', 111 | $stateNumber ? 'State' . $stateNumber : 'Automaton' 112 | )); 113 | 114 | $writer->indent(); 115 | $writer->writeLine('rankdir="LR";'); 116 | } 117 | 118 | protected function writeState(StringWriter $writer, State $state, $full = true) 119 | { 120 | $n = $state->getNumber(); 121 | 122 | $string = sprintf( 123 | '%d [label="State %d', 124 | $n, 125 | $n 126 | ); 127 | 128 | if ($full) { 129 | $string .= '\n\n'; 130 | $items = array(); 131 | 132 | foreach ($state->getItems() as $item) { 133 | $items[] = $this->formatItem($item); 134 | } 135 | 136 | $string .= implode('\n', $items); 137 | } 138 | 139 | $string .= '"];'; 140 | 141 | $writer->writeLine($string); 142 | } 143 | 144 | protected function formatItem(Item $item) 145 | { 146 | $rule = $item->getRule(); 147 | $components = $rule->getComponents(); 148 | 149 | // the dot 150 | array_splice($components, $item->getDotIndex(), 0, array('•')); 151 | 152 | if ($rule->getNumber() === 0) { 153 | $string = ''; 154 | } else { 155 | $string = sprintf("%s → ", $rule->getName()); 156 | } 157 | 158 | $string .= implode(' ', $components); 159 | 160 | if ($item->isReduceItem()) { 161 | $string .= sprintf( 162 | ' [%s]', 163 | implode(' ', $item->getLookahead()) 164 | ); 165 | } 166 | 167 | return $string; 168 | } 169 | 170 | protected function writeFooter(StringWriter $writer) 171 | { 172 | 
$writer->writeLine('}'); 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Dumper/DebugTableDumper.php: -------------------------------------------------------------------------------- 1 | 13 | */ 14 | class DebugTableDumper implements TableDumper 15 | { 16 | /** 17 | * @var \Dissect\Parser\Grammar 18 | */ 19 | protected $grammar; 20 | 21 | /** 22 | * @var \Dissect\Parser\LALR1\Dumper\StringWriter 23 | */ 24 | protected $writer; 25 | 26 | /** 27 | * @var boolean 28 | */ 29 | protected $written = false; 30 | 31 | /** 32 | * Constructor. 33 | * 34 | * @param \Dissect\Parser\Grammar $grammar The grammar of this parse table. 35 | */ 36 | public function __construct(Grammar $grammar) 37 | { 38 | $this->grammar = $grammar; 39 | $this->writer = new StringWriter(); 40 | } 41 | 42 | /** 43 | * {@inheritDoc} 44 | */ 45 | public function dump(array $table) 46 | { 47 | // for readability 48 | ksort($table['action']); 49 | ksort($table['goto']); 50 | 51 | // the grammar dictates the parse table, 52 | // therefore the result is always the same 53 | if (!$this->written) { 54 | $this->writeHeader(); 55 | $this->writer->indent(); 56 | 57 | foreach ($table['action'] as $n => $state) { 58 | $this->writeState($n, $state); 59 | $this->writer->writeLine(); 60 | } 61 | 62 | $this->writer->outdent(); 63 | $this->writeMiddle(); 64 | $this->writer->indent(); 65 | 66 | foreach ($table['goto'] as $n => $map) { 67 | $this->writeGoto($n, $map); 68 | $this->writer->writeLine(); 69 | } 70 | 71 | $this->writer->outdent(); 72 | $this->writeFooter(); 73 | 74 | $this->written = true; 75 | } 76 | 77 | return $this->writer->get(); 78 | } 79 | 80 | protected function writeHeader() 81 | { 82 | $this->writer->writeLine('writer->writeLine(); 84 | $this->writer->writeLine('return array('); 85 | $this->writer->indent(); 86 | $this->writer->writeLine("'action' => array("); 87 | } 88 | 89 | protected function writeState($n, array 
$state) 90 | { 91 | $this->writer->writeLine((string)$n . ' => array('); 92 | $this->writer->indent(); 93 | 94 | foreach ($state as $trigger => $action) { 95 | $this->writeAction($trigger, $action); 96 | $this->writer->writeLine(); 97 | } 98 | 99 | $this->writer->outdent(); 100 | $this->writer->writeLine('),'); 101 | } 102 | 103 | protected function writeAction($trigger, $action) 104 | { 105 | if ($action > 0) { 106 | $this->writer->writeLine(sprintf( 107 | '// on %s shift and go to state %d', 108 | $trigger, 109 | $action 110 | )); 111 | } elseif ($action < 0) { 112 | $rule = $this->grammar->getRule(-$action); 113 | $components = $rule->getComponents(); 114 | 115 | if (empty($components)) { 116 | $rhs = '/* empty */'; 117 | } else { 118 | $rhs = implode(' ', $components); 119 | } 120 | 121 | $this->writer->writeLine(sprintf( 122 | '// on %s reduce by rule %s -> %s', 123 | $trigger, 124 | $rule->getName(), 125 | $rhs 126 | )); 127 | } else { 128 | $this->writer->writeLine(sprintf( 129 | '// on %s accept the input', 130 | $trigger 131 | )); 132 | } 133 | 134 | $this->writer->writeLine(sprintf( 135 | "'%s' => %d,", 136 | $trigger, 137 | $action 138 | )); 139 | } 140 | 141 | protected function writeMiddle() 142 | { 143 | $this->writer->writeLine('),'); 144 | $this->writer->writeLine(); 145 | $this->writer->writeLine("'goto' => array("); 146 | } 147 | 148 | protected function writeGoto($n, array $map) 149 | { 150 | $this->writer->writeLine((string)$n . 
' => array('); 151 | $this->writer->indent(); 152 | 153 | foreach ($map as $sym => $dest) { 154 | $this->writer->writeLine(sprintf( 155 | '// on %s go to state %d', 156 | $sym, 157 | $dest 158 | )); 159 | 160 | $this->writer->writeLine(sprintf( 161 | "'%s' => %d,", 162 | $sym, 163 | $dest 164 | )); 165 | 166 | $this->writer->writeLine(); 167 | } 168 | 169 | $this->writer->outdent(); 170 | $this->writer->writeLine('),'); 171 | } 172 | 173 | protected function writeFooter() 174 | { 175 | $this->writer->writeLine('),'); 176 | $this->writer->outdent(); 177 | $this->writer->writeLine(');'); 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/Dissect/Lexer/StatefulLexer.php: -------------------------------------------------------------------------------- 1 | 15 | */ 16 | class StatefulLexer extends AbstractLexer 17 | { 18 | protected $states = array(); 19 | protected $stateStack = array(); 20 | protected $stateBeingBuilt = null; 21 | protected $typeBeingBuilt = null; 22 | 23 | /** 24 | * Signifies that no action should be taken on encountering a token. 25 | */ 26 | const NO_ACTION = 0; 27 | 28 | /** 29 | * Indicates that a state should be popped of the state stack on 30 | * encountering a token. 31 | */ 32 | const POP_STATE = 1; 33 | 34 | /** 35 | * Adds a new token definition. If given only one argument, 36 | * it assumes that the token type and recognized value are 37 | * identical. 38 | * 39 | * @param string $type The token type. 40 | * @param string $value The value to be recognized. 41 | * 42 | * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface. 
43 | */ 44 | public function token($type, $value = null) 45 | { 46 | if ($this->stateBeingBuilt === null) { 47 | throw new LogicException("Define a lexer state first."); 48 | } 49 | 50 | if ($value === null) { 51 | $value = $type; 52 | } 53 | 54 | $this->states[$this->stateBeingBuilt]['recognizers'][$type] = 55 | new SimpleRecognizer($value); 56 | 57 | $this->states[$this->stateBeingBuilt]['actions'][$type] = self::NO_ACTION; 58 | 59 | $this->typeBeingBuilt = $type; 60 | 61 | return $this; 62 | } 63 | 64 | /** 65 | * Adds a new regex token definition. 66 | * 67 | * @param string $type The token type. 68 | * @param string $regex The regular expression used to match the token. 69 | * 70 | * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface. 71 | */ 72 | public function regex($type, $regex) 73 | { 74 | if ($this->stateBeingBuilt === null) { 75 | throw new LogicException("Define a lexer state first."); 76 | } 77 | 78 | $this->states[$this->stateBeingBuilt]['recognizers'][$type] = 79 | new RegexRecognizer($regex); 80 | 81 | $this->states[$this->stateBeingBuilt]['actions'][$type] = self::NO_ACTION; 82 | 83 | $this->typeBeingBuilt = $type; 84 | 85 | return $this; 86 | } 87 | 88 | /** 89 | * Marks the token types given as arguments to be skipped. 90 | * 91 | * @param mixed $type,... Unlimited number of token types. 92 | * 93 | * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface. 94 | */ 95 | public function skip() 96 | { 97 | if ($this->stateBeingBuilt === null) { 98 | throw new LogicException("Define a lexer state first."); 99 | } 100 | 101 | $this->states[$this->stateBeingBuilt]['skip_tokens'] = func_get_args(); 102 | 103 | return $this; 104 | } 105 | 106 | /** 107 | * Registers a new lexer state. 108 | * 109 | * @param string $state The new state name. 110 | * 111 | * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface. 
112 | */ 113 | public function state($state) 114 | { 115 | $this->stateBeingBuilt = $state; 116 | 117 | $this->states[$state] = array( 118 | 'recognizers' => array(), 119 | 'actions' => array(), 120 | 'skip_tokens' => array(), 121 | ); 122 | 123 | return $this; 124 | } 125 | 126 | /** 127 | * Sets the starting state for the lexer. 128 | * 129 | * @param string $state The name of the starting state. 130 | * 131 | * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface. 132 | */ 133 | public function start($state) 134 | { 135 | $this->stateStack[] = $state; 136 | 137 | return $this; 138 | } 139 | 140 | /** 141 | * Sets an action for the token type that is currently being built. 142 | * 143 | * @param mixed $action The action to take. 144 | * 145 | * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface. 146 | */ 147 | public function action($action) 148 | { 149 | if ($this->stateBeingBuilt === null || $this->typeBeingBuilt === null) { 150 | throw new LogicException("Define a lexer state and type first."); 151 | } 152 | 153 | $this->states[$this->stateBeingBuilt]['actions'][$this->typeBeingBuilt] = $action; 154 | 155 | return $this; 156 | } 157 | 158 | /** 159 | * {@inheritDoc} 160 | */ 161 | protected function shouldSkipToken(Token $token) 162 | { 163 | $state = $this->states[$this->stateStack[count($this->stateStack) - 1]]; 164 | 165 | return in_array($token->getType(), $state['skip_tokens']); 166 | } 167 | 168 | /** 169 | * {@inheritDoc} 170 | */ 171 | protected function extractToken($string) 172 | { 173 | if (empty($this->stateStack)) { 174 | throw new LogicException("You must set a starting state before lexing."); 175 | } 176 | 177 | $value = $type = $action = null; 178 | $state = $this->states[$this->stateStack[count($this->stateStack) - 1]]; 179 | 180 | foreach ($state['recognizers'] as $t => $recognizer) { 181 | if ($recognizer->match($string, $v)) { 182 | if ($value === null || Util::stringLength($v) > 
Util::stringLength($value)) { 183 | $value = $v; 184 | $type = $t; 185 | $action = $state['actions'][$type]; 186 | } 187 | } 188 | } 189 | 190 | if ($type !== null) { 191 | if (is_string($action)) { // enter new state 192 | $this->stateStack[] = $action; 193 | } elseif ($action === self::POP_STATE) { 194 | array_pop($this->stateStack); 195 | } 196 | 197 | return new CommonToken($type, $value, $this->getCurrentLine()); 198 | } 199 | 200 | return null; 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /tests/Dissect/Parser/LALR1/Analysis/AnalyzerTest.php: -------------------------------------------------------------------------------- 1 | is('a', 'S', 'b') 23 | ->is(); 24 | 25 | $grammar->start('S'); 26 | 27 | $result = $this->getAnalysisResult($grammar); 28 | $table = $result->getAutomaton()->getTransitionTable(); 29 | 30 | $this->assertEquals(1, $table[0]['S']); 31 | $this->assertEquals(2, $table[0]['a']); 32 | $this->assertEquals(2, $table[2]['a']); 33 | $this->assertEquals(3, $table[2]['S']); 34 | $this->assertEquals(4, $table[3]['b']); 35 | } 36 | 37 | /** 38 | * @test 39 | */ 40 | public function lookaheadShouldBeCorrectlyPumped() 41 | { 42 | $grammar = new Grammar(); 43 | 44 | $grammar('S') 45 | ->is('A', 'B', 'C', 'D'); 46 | 47 | $grammar('A') 48 | ->is('a'); 49 | 50 | $grammar('B') 51 | ->is('b'); 52 | 53 | $grammar('C') 54 | ->is(/* empty */); 55 | 56 | $grammar('D') 57 | ->is('d'); 58 | 59 | $grammar->start('S'); 60 | 61 | $automaton = $this->getAnalysisResult($grammar)->getAutomaton(); 62 | 63 | $this->assertEquals( 64 | array(Parser::EOF_TOKEN_TYPE), 65 | $automaton->getState(1)->get(0, 1)->getLookahead() 66 | ); 67 | 68 | $this->assertEquals( 69 | array('b'), 70 | $automaton->getState(3)->get(2, 1)->getLookahead() 71 | ); 72 | 73 | $this->assertEquals( 74 | array('d'), 75 | $automaton->getState(4)->get(4, 0)->getLookahead() 76 | ); 77 | 78 | $this->assertEquals( 79 | array('d'), 80 | 
$automaton->getState(5)->get(3, 1)->getLookahead() 81 | ); 82 | 83 | $this->assertEquals( 84 | array(Parser::EOF_TOKEN_TYPE), 85 | $automaton->getState(7)->get(1, 4)->getLookahead() 86 | ); 87 | 88 | $this->assertEquals( 89 | array(Parser::EOF_TOKEN_TYPE), 90 | $automaton->getState(8)->get(5, 1)->getLookahead() 91 | ); 92 | } 93 | 94 | /** 95 | * @test 96 | */ 97 | public function parseTableShouldBeCorrectlyBuilt() 98 | { 99 | $grammar = new Grammar(); 100 | 101 | $grammar('S') 102 | ->is('a', 'S', 'b') 103 | ->is(/* empty */); 104 | 105 | $grammar->start('S'); 106 | 107 | $table = $this->getAnalysisResult($grammar)->getParseTable(); 108 | 109 | // shift(2) 110 | $this->assertEquals(2, $table['action'][0]['a']); 111 | 112 | // reduce(S -> ) 113 | $this->assertEquals(-2, $table['action'][0][Parser::EOF_TOKEN_TYPE]); 114 | 115 | // accept 116 | $this->assertEquals(0, $table['action'][1][Parser::EOF_TOKEN_TYPE]); 117 | 118 | // shift(2) 119 | $this->assertEquals(2, $table['action'][2]['a']); 120 | 121 | // reduce(S -> ) 122 | $this->assertEquals(-2, $table['action'][2]['b']); 123 | 124 | // shift(4) 125 | $this->assertEquals(4, $table['action'][3]['b']); 126 | 127 | // reduce(S -> a S b) 128 | $this->assertEquals(-1, $table['action'][4]['b']); 129 | $this->assertEquals(-1, $table['action'][4][Parser::EOF_TOKEN_TYPE]); 130 | 131 | $this->assertEquals(1, $table['goto'][0]['S']); 132 | $this->assertEquals(3, $table['goto'][2]['S']); 133 | } 134 | 135 | /** 136 | * @test 137 | */ 138 | public function unexpectedConflictsShouldThrowAnException() 139 | { 140 | $grammar = new Grammar(); 141 | 142 | $grammar('S') 143 | ->is('a', 'b', 'C', 'd') 144 | ->is('a', 'b', 'E', 'd'); 145 | 146 | $grammar('C') 147 | ->is(/* empty */); 148 | 149 | $grammar('E') 150 | ->is(/* empty */); 151 | 152 | $grammar->start('S'); 153 | 154 | try { 155 | $result = $this->getAnalysisResult($grammar); 156 | $this->fail('Expected an exception warning of a reduce/reduce conflict.'); 157 | } 
catch(ReduceReduceConflictException $e) { 158 | $this->assertEquals(3, $e->getStateNumber()); 159 | $this->assertEquals('d', $e->getLookahead()); 160 | $this->assertEquals(3, $e->getFirstRule()->getNumber()); 161 | $this->assertEquals(4, $e->getSecondRule()->getNumber()); 162 | } 163 | } 164 | 165 | /** 166 | * @test 167 | */ 168 | public function expectedConflictsShouldBeRecorded() 169 | { 170 | $grammar = new Grammar(); 171 | 172 | $grammar('S') 173 | ->is('S', 'S', 'S') 174 | ->is('S', 'S') 175 | ->is('b'); 176 | 177 | $grammar->resolve(Grammar::ALL); 178 | $grammar->start('S'); 179 | 180 | $conflicts = $this->getAnalysisResult($grammar)->getResolvedConflicts(); 181 | 182 | $this->assertCount(4, $conflicts); 183 | 184 | $conflict = $conflicts[0]; 185 | 186 | $this->assertEquals(3, $conflict['state']); 187 | $this->assertEquals('b', $conflict['lookahead']); 188 | $this->assertEquals(2, $conflict['rule']->getNumber()); 189 | $this->assertEquals(Grammar::SHIFT, $conflict['resolution']); 190 | 191 | $conflict = $conflicts[1]; 192 | 193 | $this->assertEquals(4, $conflict['state']); 194 | $this->assertEquals('b', $conflict['lookahead']); 195 | $this->assertEquals(1, $conflict['rule']->getNumber()); 196 | $this->assertEquals(Grammar::SHIFT, $conflict['resolution']); 197 | 198 | $conflict = $conflicts[2]; 199 | 200 | $this->assertEquals(4, $conflict['state']); 201 | $this->assertEquals(Parser::EOF_TOKEN_TYPE, $conflict['lookahead']); 202 | $this->assertEquals(1, $conflict['rules'][0]->getNumber()); 203 | $this->assertEquals(2, $conflict['rules'][1]->getNumber()); 204 | $this->assertEquals(Grammar::LONGER_REDUCE, $conflict['resolution']); 205 | 206 | $conflict = $conflicts[3]; 207 | 208 | $this->assertEquals(4, $conflict['state']); 209 | $this->assertEquals('b', $conflict['lookahead']); 210 | $this->assertEquals(2, $conflict['rule']->getNumber()); 211 | $this->assertEquals(Grammar::SHIFT, $conflict['resolution']); 212 | } 213 | 214 | protected function 
getAnalysisResult(Grammar $grammar) 215 | { 216 | return $this->getAnalyzer()->analyze($grammar); 217 | } 218 | 219 | protected function getAnalyzer() 220 | { 221 | if ($this->analyzer === null) { 222 | $this->analyzer = new Analyzer(); 223 | } 224 | 225 | return $this->analyzer; 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /src/Dissect/Console/Command/DissectCommand.php: -------------------------------------------------------------------------------- 1 | setName('dissect') 24 | ->addArgument('grammar-class', InputArgument::REQUIRED, 'The grammar class.') 25 | ->addOption('debug', 'd', InputOption::VALUE_NONE, 'Writes the parse table in the debug format.') 26 | ->addOption('dfa', 'D', InputOption::VALUE_NONE, 'Exports the LALR(1) DFA as a Graphviz graph.') 27 | ->addOption('state', 's', InputOption::VALUE_REQUIRED, 'Exports only the specified state instead of the entire DFA.') 28 | ->addOption('output-dir', 'o', InputOption::VALUE_REQUIRED, 'Overrides the default output directory.') 29 | ->setHelp(<<--output-dir option: 35 | 36 | --output-dir=../some/other/dir 37 | 38 | The parse table is by default written with minimal whitespace to make it compact. 39 | If you wish to inspect the table manually, you can export it in a readable and 40 | well-commented way with the --debug option. 41 | 42 | If you wish to inspect the handle-finding automaton for your grammar (perhaps 43 | to aid with grammar debugging), use the --dfa option. When in use, Dissect 44 | will create a file with the automaton exported as a Graphviz graph 45 | in the output directory. 
46 | 47 | Additionally, you can use the --state option to export only the specified 48 | state and any relevant transitions: 49 | 50 | --dfa --state=5 51 | EOT 52 | ); 53 | } 54 | 55 | protected function execute(InputInterface $input, OutputInterface $output) 56 | { 57 | $class = strtr( 58 | $input->getArgument('grammar-class'), 59 | '/', 60 | '\\' 61 | ); 62 | $formatter = $this->getHelperSet()->get('formatter'); 63 | 64 | $output->writeln('Analyzing...'); 65 | $output->writeln(''); 66 | 67 | if (!class_exists($class)) { 68 | $output->writeln(array( 69 | $formatter->formatBlock( 70 | sprintf('The class "%s" could not be found.', $class), 71 | 'error', 72 | true 73 | ), 74 | )); 75 | 76 | return 1; 77 | } 78 | 79 | $grammar = new $class(); 80 | 81 | if ($dir = $input->getOption('output-dir')) { 82 | $cwd = rtrim(getcwd(), DIRECTORY_SEPARATOR); 83 | 84 | $outputDir = $cwd . DIRECTORY_SEPARATOR . $dir; 85 | } else { 86 | $refl = new ReflectionClass($class); 87 | $outputDir = pathinfo($refl->getFileName(), PATHINFO_DIRNAME); 88 | } 89 | 90 | $analyzer = new Analyzer(); 91 | $automaton = null; 92 | 93 | try { 94 | $result = $analyzer->analyze($grammar); 95 | $conflicts = $result->getResolvedConflicts(); 96 | $automaton = $result->getAutomaton(); 97 | $table = $result->getParseTable(); 98 | 99 | if ($conflicts) { 100 | foreach ($conflicts as $conflict) { 101 | $output->writeln($this->formatConflict($conflict)); 102 | } 103 | 104 | $output->writeln(sprintf( 105 | "%d conflicts in total", 106 | count($conflicts) 107 | )); 108 | 109 | $output->writeln(''); 110 | } 111 | 112 | $output->writeln('Writing the parse table...'); 113 | 114 | $fileName = $outputDir . DIRECTORY_SEPARATOR . 
'parse_table.php'; 115 | 116 | if ($input->getOption('debug')) { 117 | $tableDumper = new DebugTableDumper($grammar); 118 | } else { 119 | $tableDumper = new ProductionTableDumper(); 120 | } 121 | 122 | $code = $tableDumper->dump($table); 123 | 124 | $ret = @file_put_contents($fileName, $code); 125 | if ($ret === false) { 126 | $output->writeln('Error writing the parse table'); 127 | } else { 128 | $output->writeln('Parse table written'); 129 | } 130 | } catch(ConflictException $e) { 131 | $output->writeln(array( 132 | $formatter->formatBlock( 133 | explode("\n", $e->getMessage()), 134 | 'error', 135 | true 136 | ), 137 | )); 138 | 139 | $automaton = $e->getAutomaton(); 140 | } 141 | 142 | if ($input->getOption('dfa')) { 143 | $output->writeln(''); 144 | 145 | $automatonDumper = new AutomatonDumper($automaton); 146 | 147 | if ($input->getOption('state') === null) { 148 | $output->writeln('Exporting the DFA...'); 149 | 150 | $dot = $automatonDumper->dump(); 151 | $file = 'automaton.dot'; 152 | } else { 153 | $state = (int)$input->getOption('state'); 154 | 155 | if (!$automaton->hasState($state)) { 156 | $output->writeln(array( 157 | $formatter->formatBlock( 158 | sprintf('The automaton has no state #%d', $state), 159 | 'error', 160 | true 161 | ), 162 | )); 163 | 164 | return 1; 165 | } 166 | 167 | $output->writeln(sprintf( 168 | 'Exporting the DFA state %d...', 169 | $state 170 | )); 171 | 172 | $dot = $automatonDumper->dumpState($state); 173 | $file = sprintf('state_%d.dot', $state); 174 | } 175 | 176 | $fileName = $outputDir . DIRECTORY_SEPARATOR . $file; 177 | $ret = @file_put_contents($fileName, $dot); 178 | 179 | if ($ret === false) { 180 | $output->writeln('Error writing to the file'); 181 | } else { 182 | $output->writeln('Successfully exported'); 183 | } 184 | } 185 | 186 | return 0; 187 | } 188 | 189 | protected function formatConflict(array $conflict) 190 | { 191 | $type = $conflict['resolution'] === Grammar::SHIFT 192 | ? 
'shift/reduce' 193 | : 'reduce/reduce'; 194 | 195 | return sprintf( 196 | "Resolved a %s conflict in state %d on lookahead %s", 197 | $type, 198 | $conflict['state'], 199 | $conflict['lookahead'] 200 | ); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /docs/lexing.md: -------------------------------------------------------------------------------- 1 | Lexical analysis with Dissect 2 | ============================= 3 | 4 | There are three classes for lexical analysis in Dissect, all under the 5 | namespace `Dissect\Lexer`: `SimpleLexer`, `StatefulLexer` and `RegexLexer`. 6 | 7 | SimpleLexer 8 | ----------- 9 | 10 | `SimpleLexer` simply accepts some token definitions and applies them on 11 | the input. Let's create a subclass for this chapter: 12 | 13 | ```php 14 | use Dissect\Lexer\SimpleLexer; 15 | 16 | class ArithLexer extends SimpleLexer 17 | { 18 | public function __construct() 19 | { 20 | // token definitions 21 | } 22 | } 23 | ``` 24 | 25 | ### Defining tokens 26 | 27 | There are 3 ways to define a token. The simplest one looks like this: 28 | 29 | ```php 30 | $this->token('+'); 31 | ``` 32 | 33 | This definition will simply match a plus symbol, using `+` both as the 34 | name and value of the token. You can use 2 arguments: 35 | 36 | ```php 37 | $this->token('CLASS', 'class'); 38 | ``` 39 | 40 | if you want the token name (first argument) to differ from what will actually be 41 | recognized (second argument). 
42 | 43 | The final way defines a token by a regular expression: 44 | 45 | ```php 46 | $this->regex('INT', '/^[1-9][0-9]*/'); 47 | ``` 48 | 49 | Let's now define some tokens we will use in the next chapter: 50 | 51 | ```php 52 | class ArithLexer extends SimpleLexer 53 | { 54 | public function __construct() 55 | { 56 | $this->regex('INT', '/^[1-9][0-9]*/'); 57 | $this->token('('); 58 | $this->token(')'); 59 | $this->token('+'); 60 | $this->token('*'); 61 | $this->token('**'); 62 | } 63 | } 64 | ``` 65 | 66 | > **Tip**: You can also chain the method calls using a fluent interface. 67 | 68 | ### Skipping tokens 69 | 70 | Some tokens have to be recognized, but we don't want them cluttering the 71 | output. The best example is probably whitespace tokens: the lexer has 72 | to recognize them, but they carry no meaning or value, so we can tell 73 | the lexer to `skip` them: 74 | 75 | ```php 76 | class ArithLexer extends SimpleLexer 77 | { 78 | public function __construct() 79 | { 80 | $this->regex('INT', '/^[1-9][0-9]*/'); 81 | $this->token('('); 82 | $this->token(')'); 83 | $this->token('+'); 84 | $this->token('*'); 85 | $this->token('**'); 86 | 87 | $this->regex('WSP', "/^[ \r\n\t]+/"); 88 | $this->skip('WSP'); 89 | } 90 | } 91 | ``` 92 | 93 | > You can pass any number of token names to the `skip` method. 94 | 95 | ### Lexing 96 | 97 | Now that we've defined our tokens, we can simply call: 98 | 99 | ```php 100 | $lexer = new ArithLexer(); 101 | $stream = $lexer->lex($input); 102 | ``` 103 | 104 | The return value is an object implementing the 105 | `Dissect\Lexer\TokenStream\TokenStream` interface. The interface defines 106 | several methods you can use to inspect and move through the token 107 | stream. See [TokenStream.php][tokenstream] for all the methods you can 108 | use. 109 | 110 | > If you `count` the token stream, you may be surprised to find out that 111 | > for input like `5 + 3`, it actually contains 4 tokens. 
That's because, 112 | > as the last step of lexing, a special token called `$eof` is appended 113 | > to mark the end of input. This is crucial to the parsing process, so 114 | > please, never define a token called `$eof` yourself. It could lead to 115 | > some pretty strange errors. Other forbidden token names are `$start` 116 | > and `$epsilon`. 117 | 118 | StatefulLexer 119 | ------------- 120 | 121 | `SimpleLexer` should work fine for general use cases. However, let's 122 | imagine we're lexing a very simple templating language: 123 | 124 | Outer content, {{ variable_name }}, other outer content 125 | 126 | `SimpleLexer` falls short here, because the outer content can be pretty 127 | much anything, while the content inside the tags has to be strictly 128 | interpreted. Furthermore, if we were to work with this template, we'd 129 | want to skip the whitespace inside tags, but keep it in the outer 130 | content. 131 | 132 | That's where `StatefulLexer` comes in; during lexing, it maintains a 133 | stack of states with the top one being the current one, and for each 134 | token, you can define the action the lexer should take after recognizing 135 | it. Let's see an example for our templating language: 136 | 137 | ```php 138 | use Dissect\Lexer\StatefulLexer; 139 | 140 | class TemplateLexer extends StatefulLexer 141 | { 142 | public function __construct() 143 | { 144 | $this->state('outside') 145 | ->regex('CONTENT', '/^[^"{{"]*/') 146 | ->token('{{')->action('tag'); 147 | 148 | $this->state('tag') 149 | ->regex('WSP', "/^[ \r\n\t]+/") 150 | ->regex('VAR', '/^[a-zA-Z_]+/') 151 | ->token('}}')->action(StatefulLexer::POP_STATE) 152 | ->skip('WSP'); 153 | 154 | $this->start('outside'); 155 | } 156 | } 157 | ``` 158 | 159 | Please note that before defining any tokens, we have to define a state. 160 | For the tokens that cause the state transition, we call `action` to 161 | specify what the lexer should do. 
The action can be either a string, in 162 | which case the lexer goes to the state specified by the string, or 163 | `StatefulLexer::POP_STATE`, which causes the lexer to pop the current 164 | state off the stack, essentially going back to the previous state. 165 | Finally, we tell the lexer in which state to start by calling `start`. 166 | 167 | Improving lexer performance 168 | --------------------------- 169 | 170 | There's one important trick to improve the performance of your lexers. 171 | The documentation uses it implicitly, but it requires an explicit mention: 172 | 173 | When using one of the lexer classes documented above and defining tokens 174 | using regular expressions, *always* anchor the regex at the beginning 175 | using `^` like this: 176 | 177 | ```php 178 | $this->regex('INT', '/^[1-9][0-9]*/'); 179 | ``` 180 | 181 | This little optimization will lead to substantial performance gains on 182 | any but the shortest input strings, since without anchoring, the PCRE 183 | engine would always look for matches throughout the entire remaining 184 | input string, which would be incredibly wasteful for long inputs. 185 | 186 | RegexLexer 187 | ---------- 188 | 189 | When designing the lexer classes, my goal was not to sacrifice 190 | user-friendliness for performance. However, I'm well aware that there 191 | are use cases that require the highest performance possible. That's 192 | why I adapted the highly performant but slightly less user-friendly 193 | [lexer][doctrinelexer] from [doctrine][doctrine] into Dissect. 
194 | 195 | The usage is almost identical to the original class, writing a lexer 196 | for the arithmetic expressions could look something like this: 197 | 198 | ```php 199 | use Dissect\Lexer\RegexLexer; 200 | use RuntimeException; 201 | 202 | class ArithLexer extends RegexLexer 203 | { 204 | protected $tokens = ['+', '*', '**', '(', ')']; 205 | 206 | protected function getCatchablePatterns() 207 | { 208 | return ['[1-9][0-9]*']; 209 | } 210 | 211 | protected function getNonCatchablePatterns() 212 | { 213 | return ['\s+']; 214 | } 215 | 216 | protected function getType(&$value) 217 | { 218 | if (is_numeric($value)) { 219 | $value = (int)$value; 220 | 221 | return 'INT'; 222 | } elseif (in_array($value, $this->tokens)) { 223 | // the types of the simple tokens equal their values here 224 | return $value; 225 | } else { 226 | throw new RuntimeException(sprintf('Invalid token "%s"', $value)); 227 | } 228 | } 229 | } 230 | ``` 231 | 232 | Continue 233 | -------- 234 | 235 | Now that we've demonstrated how to perform lexical analysis with 236 | Dissect, we can move onto syntactical analysis, commonly known as 237 | [parsing][parsing]. 238 | 239 | [tokenstream]: ../src/Dissect/Lexer/TokenStream/TokenStream.php 240 | [parsing]: parsing.md 241 | [doctrinelexer]: https://github.com/doctrine/lexer/blob/master/lib/Doctrine/Common/Lexer/AbstractLexer.php 242 | [doctrine]: https://github.com/doctrine/lexer 243 | -------------------------------------------------------------------------------- /src/Dissect/Parser/Grammar.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | class Grammar 13 | { 14 | /** 15 | * The name given to the rule the grammar is augmented with 16 | * when start() is called. 17 | */ 18 | const START_RULE_NAME = '$start'; 19 | 20 | /** 21 | * The epsilon symbol signifies an empty production. 
22 | */ 23 | const EPSILON = '$epsilon'; 24 | 25 | /** 26 | * @var \Dissect\Parser\Rule[] The rules of the grammar, indexed by rule number. 27 | */ 28 | protected $rules = array(); 29 | 30 | /** 31 | * @var array The rules grouped by the name of their nonterminal. 32 | */ 33 | protected $groupedRules = array(); 34 | 35 | /** 36 | * @var int The number given to the next rule created by is(). 37 | */ 38 | protected $nextRuleNumber = 1; 39 | 40 | /** 41 | * @var int Bitmask of the conflict resolution constants defined below. 42 | */ 43 | protected $conflictsMode = 9; // SHIFT | OPERATORS 44 | 45 | /** 46 | * @var string The name of the nonterminal currently being described. 47 | */ 48 | protected $currentNonterminal; 49 | 50 | /** 51 | * @var \Dissect\Parser\Rule The rule that call() and prec() currently apply to. 52 | */ 53 | protected $currentRule; 54 | 55 | /** 56 | * @var array Operator info ('prec' and 'assoc') keyed by token. 57 | */ 58 | protected $operators = array(); 59 | 60 | /** 61 | * @var array The tokens of the operator group currently being described. 62 | */ 63 | protected $currentOperators; 64 | 65 | /** 66 | * Signifies that the parser should not resolve any 67 | * grammar conflicts. 68 | */ 69 | const NONE = 0; 70 | 71 | /** 72 | * Signifies that the parser should resolve 73 | * shift/reduce conflicts by always shifting. 74 | */ 75 | const SHIFT = 1; 76 | 77 | /** 78 | * Signifies that the parser should resolve 79 | * reduce/reduce conflicts by reducing with 80 | * the longer rule. 81 | */ 82 | const LONGER_REDUCE = 2; 83 | 84 | /** 85 | * Signifies that the parser should resolve 86 | * reduce/reduce conflicts by reducing 87 | * with the rule that was given earlier in 88 | * the grammar. 89 | */ 90 | const EARLIER_REDUCE = 4; 91 | 92 | /** 93 | * Signifies that the conflicts should be 94 | * resolved by taking operator precedence 95 | * into account. 96 | */ 97 | const OPERATORS = 8; 98 | 99 | /** 100 | * Signifies that the parser should automatically 101 | * resolve all grammar conflicts. 102 | */ 103 | const ALL = 15; 104 | 105 | /** 106 | * Left operator associativity. 107 | */ 108 | const LEFT = 0; 109 | 110 | /** 111 | * Right operator associativity. 112 | */ 113 | const RIGHT = 1; 114 | 115 | /** 116 | * The operator is nonassociative.
117 | */ 118 | const NONASSOC = 2; 119 | 120 | public function __invoke($nonterminal) 121 | { 122 | $this->currentNonterminal = $nonterminal; 123 | 124 | return $this; 125 | } 126 | 127 | /** 128 | * Defines an alternative for a grammar rule. 129 | * 130 | * @param string... The components of the rule. 131 | * 132 | * @return \Dissect\Parser\Grammar This instance. 133 | */ 134 | public function is() 135 | { 136 | $this->currentOperators = null; 137 | 138 | if ($this->currentNonterminal === null) { 139 | throw new LogicException( 140 | 'You must specify a name of the rule first.' 141 | ); 142 | } 143 | 144 | $num = $this->nextRuleNumber++; 145 | 146 | $rule = new Rule($num, $this->currentNonterminal, func_get_args()); 147 | 148 | $this->rules[$num] = 149 | $this->currentRule = 150 | $this->groupedRules[$this->currentNonterminal][] = 151 | $rule; 152 | 153 | return $this; 154 | } 155 | 156 | /** 157 | * Sets the callback for the current rule. 158 | * 159 | * @param callable $callback The callback. 160 | * 161 | * @return \Dissect\Parser\Grammar This instance. 162 | */ 163 | public function call($callback) 164 | { 165 | if ($this->currentRule === null) { 166 | throw new LogicException( 167 | 'You must specify a rule first.' 168 | ); 169 | } 170 | 171 | $this->currentRule->setCallback($callback); 172 | 173 | return $this; 174 | } 175 | 176 | /** 177 | * Returns the set of rules of this grammar. 178 | * 179 | * @return \Dissect\Parser\Rule[] The rules. 180 | */ 181 | public function getRules() 182 | { 183 | return $this->rules; 184 | } 185 | 186 | public function getRule($number) 187 | { 188 | return $this->rules[$number]; 189 | } 190 | 191 | /** 192 | * Returns the nonterminal symbols of this grammar, i.e. the names that have at least one rule defined via is(). 193 | * 194 | * @return string[] The nonterminals. 195 | */ 196 | public function getNonterminals() 197 | { 198 | return array_keys($this->groupedRules); 199 | } 200 | 201 | /** 202 | * Returns rules grouped by nonterminal name.
203 | * 204 | * @return array The rules grouped by nonterminal name. 205 | */ 206 | public function getGroupedRules() 207 | { 208 | return $this->groupedRules; 209 | } 210 | 211 | /** 212 | * Sets a start rule for this grammar. 213 | * 214 | * @param string The name of the start rule. 215 | */ 216 | public function start($name) 217 | { 218 | $this->rules[0] = new Rule(0, self::START_RULE_NAME, array($name)); 219 | } 220 | 221 | /** 222 | * Returns the augmented start rule. For internal use only. 223 | * 224 | * @return \Dissect\Parser\Rule The start rule. 225 | */ 226 | public function getStartRule() 227 | { 228 | if (!isset($this->rules[0])) { 229 | throw new LogicException("No start rule specified."); 230 | } 231 | 232 | return $this->rules[0]; 233 | } 234 | 235 | /** 236 | * Sets the mode of conflict resolution. 237 | * 238 | * @param int $mode The bitmask for the mode. 239 | */ 240 | public function resolve($mode) 241 | { 242 | $this->conflictsMode = $mode; 243 | } 244 | 245 | /** 246 | * Returns the conflict resolution mode for this grammar. 247 | * 248 | * @return int The bitmask of the resolution mode. 249 | */ 250 | public function getConflictsMode() 251 | { 252 | return $this->conflictsMode; 253 | } 254 | 255 | /** 256 | * Does a nonterminal $name exist in the grammar? 257 | * 258 | * @param string $name The name of the nonterminal. 259 | * 260 | * @return boolean 261 | */ 262 | public function hasNonterminal($name) 263 | { 264 | return array_key_exists($name, $this->groupedRules); 265 | } 266 | 267 | /** 268 | * Defines a group of operators. 269 | * 270 | * @param string,... Any number of tokens that serve as the operators. 271 | * 272 | * @return \Dissect\Parser\Grammar This instance for fluent interface. 
273 | */ 274 | public function operators() 275 | { 276 | $this->currentRule = null; 277 | 278 | $ops = func_get_args(); 279 | 280 | $this->currentOperators = $ops; 281 | 282 | foreach ($ops as $op) { 283 | $this->operators[$op] = array( 284 | 'prec' => 1, 285 | 'assoc' => self::LEFT, 286 | ); 287 | } 288 | 289 | return $this; 290 | } 291 | 292 | /** 293 | * Marks the current group of operators as left-associative. 294 | * 295 | * @return \Dissect\Parser\Grammar This instance for fluent interface. 296 | */ 297 | public function left() 298 | { 299 | return $this->assoc(self::LEFT); 300 | } 301 | 302 | /** 303 | * Marks the current group of operators as right-associative. 304 | * 305 | * @return \Dissect\Parser\Grammar This instance for fluent interface. 306 | */ 307 | public function right() 308 | { 309 | return $this->assoc(self::RIGHT); 310 | } 311 | 312 | /** 313 | * Marks the current group of operators as nonassociative. 314 | * 315 | * @return \Dissect\Parser\Grammar This instance for fluent interface. 316 | */ 317 | public function nonassoc() 318 | { 319 | return $this->assoc(self::NONASSOC); 320 | } 321 | 322 | /** 323 | * Explicitly sets the associativity of the current group of operators. 324 | * 325 | * @param int $a One of Grammar::LEFT, Grammar::RIGHT, Grammar::NONASSOC 326 | * 327 | * @return \Dissect\Parser\Grammar This instance for fluent interface. 328 | */ 329 | public function assoc($a) 330 | { 331 | if (!$this->currentOperators) { 332 | throw new LogicException('Define a group of operators first.'); 333 | } 334 | 335 | foreach ($this->currentOperators as $op) { 336 | $this->operators[$op]['assoc'] = $a; 337 | } 338 | 339 | return $this; 340 | } 341 | 342 | /** 343 | * Sets the precedence (as an integer) of the current group of operators. 344 | * If no group of operators is being specified, sets the precedence 345 | * of the currently described rule. 346 | * 347 | * @param int $i The precedence as an integer.
348 | * 349 | * @return \Dissect\Parser\Grammar This instance for fluent interface. 350 | */ 351 | public function prec($i) 352 | { 353 | if (!$this->currentOperators) { 354 | if (!$this->currentRule) { 355 | throw new LogicException('Define a group of operators or a rule first.'); 356 | } else { 357 | $this->currentRule->setPrecedence($i); 358 | } 359 | } else { 360 | foreach ($this->currentOperators as $op) { 361 | $this->operators[$op]['prec'] = $i; 362 | } 363 | } 364 | 365 | return $this; 366 | } 367 | 368 | /** 369 | * Is the passed token an operator? 370 | * 371 | * @param string $token The token type. 372 | * 373 | * @return boolean 374 | */ 375 | public function hasOperator($token) 376 | { 377 | return array_key_exists($token, $this->operators); 378 | } 379 | 380 | public function getOperatorInfo($token) 381 | { 382 | return $this->operators[$token]; 383 | } 384 | } 385 | -------------------------------------------------------------------------------- /docs/parsing.md: -------------------------------------------------------------------------------- 1 | Parsing with Dissect 2 | ==================== 3 | 4 | Why an LALR(1) parser? 5 | ---------------------- 6 | 7 | Parsing is a task that's needed more often than one would think; 8 | for examples in some famous PHP projects, see [this parser][twigparser] 9 | from [Twig][twig] and [these][annotationsparser] [two][dqlparser] from 10 | [Doctrine][doctrine]. Chances are you've written one; if you did, it was 11 | most likely a [recursive descent parser][rdparser], just like the 12 | examples above. Now, such parsers have several disadvantages: first, 13 | they obviously have to be manually written. Second, they're *recursive*, 14 | which means one thing: nest the input deep enough (like an 15 | annotation, which has another annotation as a parameter, that annotation 16 | has another annotation as a parameter ...) 
and your PHP process blows up 17 | because of stack overflow (to be fair, you'd have to nest pretty deep). 18 | And third, such parsers belong to a class of parsers known as 19 | [LL(k)][llk], which means they're generally not as powerful as [LR(k)][lrk] 20 | parsers. For instance, they cannot handle left-recursive rules 21 | (rules like `A -> A ...`), which are probably the only sane way of 22 | expressing left-associative binary operators (like addition, for 23 | example). 24 | 25 | But let's get to actually parsing something. 26 | 27 | Writing a grammar 28 | ----------------- 29 | 30 | A grammar is represented by a subclass of `Dissect\Parser\Grammar`. 31 | 32 | ```php 33 | use Dissect\Parser\Grammar; 34 | 35 | class ArithGrammar extends Grammar 36 | { 37 | public function __construct() 38 | { 39 | // rule definitions 40 | } 41 | } 42 | ``` 43 | 44 | First, you tell Dissect which rule you are describing. Let's say we want 45 | to describe a rule for a `Sum`: 46 | 47 | ```php 48 | $this('Sum') 49 | ``` 50 | 51 | and then you specify what the rule actually `is`: 52 | 53 | ```php 54 | $this('Sum') 55 | ->is('int', '+', 'int'); 56 | ``` 57 | 58 | A rule can of course have many alternatives: 59 | 60 | ```php 61 | $this('Sum') 62 | ->is('int', '+', 'int') 63 | ->is('string', '+', 'string'); 64 | ``` 65 | 66 | and you will probably want to specify how to evaluate the rule: 67 | 68 | ```php 69 | $this('Sum') 70 | ->is('int', '+', 'int') 71 | ->call(function ($l, $_, $r) { 72 | return $l + $r; 73 | }) 74 | 75 | ->is('string', '+', 'string') 76 | ->call(function ($l, $_, $r) { 77 | return $l . $r; 78 | }); 79 | ``` 80 | 81 | > The number of arguments to the callback function is always equal 82 | > to the length of the rule to which it belongs. 83 | 84 | ### Empty rules 85 | 86 | A grammar can (and many times will) contain empty rules, that is, rules that 87 | can match 0 tokens of the input.
This is useful when, for example, 88 | describing a list of function arguments, which can be either empty or a list of 89 | values separated by commas. 90 | 91 | An empty rule is defined simply by calling `is` with 0 arguments: 92 | 93 | ```php 94 | $this('Empty') 95 | ->is(); 96 | ``` 97 | 98 | If you find this notation unclear, you can explicitly mark empty rules 99 | with a comment: 100 | 101 | ```php 102 | $this('Empty') 103 | ->is(/* empty */); 104 | ``` 105 | 106 | > **Beware:** When you don't specify a callback for a rule, Dissect 107 | > will default to returning the leftmost (first) component of the rule. You 108 | > are, however, required to specify a callback for an empty rule, since 109 | > in a rule with zero components, there is obviously no leftmost one. 110 | 111 | Example: Parsing mathematical expressions 112 | ----------------------------------------- 113 | 114 | In the chapter on lexing, we've created a lexer we will now use to 115 | process our expressions: 116 | 117 | ```php 118 | class ArithLexer extends SimpleLexer 119 | { 120 | public function __construct() 121 | { 122 | $this->regex('INT', '/^[1-9][0-9]*/'); 123 | $this->token('('); 124 | $this->token(')'); 125 | $this->token('+'); 126 | $this->token('*'); 127 | $this->token('**'); 128 | 129 | $this->regex('WSP', "/^[ \r\n\t]+/"); 130 | $this->skip('WSP'); 131 | } 132 | } 133 | 134 | $lexer = new ArithLexer(); 135 | ``` 136 | 137 | As for the grammar, let's start out slow, with only a single operator: 138 | 139 | ```php 140 | $this('Expr') 141 | ->is('Expr', '+', 'Expr') 142 | ->call(function ($l, $_, $r) { 143 | return $l + $r; 144 | }) 145 | 146 | ->is('INT') 147 | ->call(function ($i) { 148 | return (int)$i->getValue(); 149 | }); 150 | 151 | $this->start('Expr'); 152 | ``` 153 | 154 | These two rules specify an expression to be either two expressions 155 | separated by a plus or simply an integer. The call to `start()` 156 | sets the starting rule of the grammar.
157 | 158 | Now, we can simply pass the grammar to a parser object: 159 | 160 | ```php 161 | use Dissect\Parser\LALR1\Parser; 162 | 163 | $parser = new Parser(new ArithGrammar()); 164 | $stream = $lexer->lex('1 + 2 + 3'); 165 | echo $parser->parse($stream); 166 | // => 6 167 | ``` 168 | 169 | and yay, it works! 170 | 171 | ### Operator associativity 172 | 173 | Actually, it doesn't. It *seems* to work because addition happens to be 174 | commutative, but a problem appears once we add another rule to the 175 | grammar to represent subtraction: 176 | 177 | ```php 178 | $this('Expr') 179 | ->is('Expr', '+', 'Expr') ... 180 | 181 | ->is('Expr', '-', 'Expr') 182 | ->call(function ($l, $_, $r) { 183 | return $l - $r; 184 | }) 185 | 186 | ->is('INT') ... 187 | ``` 188 | 189 | The result looks like this: 190 | 191 | ```php 192 | $stream = $lexer->lex('3 - 5 - 2'); 193 | echo $parser->parse($stream); 194 | // => 0 195 | ``` 196 | 197 | Well, that's certainly incorrect. The problem is that our grammar 198 | actually contains a conflict (a *shift/reduce* conflict, if you're a fan 199 | of termini technici. See the [section on conflict resolution](#resolving-conflicts).) 200 | which Dissect automatically resolves in a way that makes our `+` and `-` 201 | operators right-associative. The problem is fortunately easy to solve: 202 | we have to mark them as left-associative operators: 203 | 204 | ```php 205 | ->is('INT') ... 206 | 207 | $this->operators('+', '-')->left(); 208 | ``` 209 | 210 | This makes Dissect treat the two tokens in a special way, the conflict 211 | is resolved to represent left-associativity and the parser works correctly: 212 | 213 | ```php 214 | $stream = $lexer->lex('3 - 5 - 2'); 215 | echo $parser->parse($stream); 216 | // => -4 217 | ``` 218 | 219 | ### Operator precedence 220 | 221 | Unfortunately, we're not out of the woods yet. 
When we add another two 222 | rules to represent multiplication and division, we see that the parser 223 | still makes mistakes: 224 | 225 | ```php 226 | $this('Expr') 227 | ... 228 | 229 | ->is('Expr', '*', 'Expr') 230 | ->call(function ($l, $_, $r) { 231 | return $l * $r; 232 | }) 233 | 234 | ->is('Expr', '/', 'Expr') 235 | ->call(function ($l, $_, $r) { 236 | return $l / $r; 237 | }) 238 | 239 | ... 240 | 241 | $this->operators('*', '/')->left(); 242 | ... 243 | 244 | $stream = $lexer->lex('2 + 3 * 5'); 245 | echo $parser->parse($stream); 246 | // => 25 247 | ``` 248 | 249 | The problem is that Dissect doesn't know anything about the precedence 250 | of our operators. But we can, of course, provide the necessary information: 251 | 252 | ```php 253 | $this->operators('+', '-')->left()->prec(1); 254 | $this->operators('*', '/')->left()->prec(2); 255 | 256 | ... 257 | 258 | $stream = $lexer->lex('2 + 3 * 5'); 259 | echo $parser->parse($stream); 260 | // => 17 261 | ``` 262 | 263 | The higher the integer passed to the `prec()` method, the higher the 264 | precedence of the specified operators. 265 | 266 | And we have the basic grammar for mathematical expressions in place! 267 | As an exercise, try to handle the rest of the tokens defined in the lexer: 268 | 269 | - Create a rule to handle parentheses around expressions. 270 | - Create a rule for the final operator, `**`, which represents 271 | exponentiation. Give it the highest precedence and make it 272 | *right-associative* (the method is, shockingly, called `right()`). 273 | 274 | ### Specifying precedences on rules instead of operators 275 | 276 | As a final touch, we'd like to add a unary minus operator to our grammar: 277 | 278 | ```php 279 | $this('Expr') 280 | ... 281 | 282 | ->is('-', 'Expr') 283 | ->call(function ($_, $e) { 284 | return -$e; 285 | }) 286 | ... 287 | ``` 288 | 289 | But you might feel that something is amiss. 
Unary minus should have the 290 | highest precedence, but we've specified the precedence of `-` to be the 291 | lowest, actually. But don't worry, we can assign precedences directly to 292 | rules: 293 | 294 | ```php 295 | $this('Expr') 296 | ... 297 | 298 | ->is('-', 'Expr')->prec(4) // higher than everything 299 | ->call(function ($_, $e) { 300 | return -$e; 301 | }) 302 | ... 303 | ``` 304 | 305 | ### Nonassociativity 306 | 307 | Apart from being left- or right-associative, operators can be 308 | nonassociative, which means that for an operator `op`, the input 309 | `a op b op c` means neither `(a op b) op c` nor `a op (b op c)`, 310 | but is considered a syntax error. 311 | 312 | This has certain use cases; for instance, one of the nonassociative 313 | operators in the grammar for PHP is `<`: when parsing `1 < 2 < 3`, 314 | the PHP parser reports a syntax error. 315 | 316 | The corresponding method in Dissect grammars is `nonassoc()`: 317 | 318 | ```php 319 | $this->operators('<', '>')->nonassoc()->prec(...); 320 | ``` 321 | 322 | ### Describing common syntactic structures 323 | 324 | To see how to describe commonly used syntactic structures such as 325 | repetitions and lists, see the [dedicated documentation section][common]. 326 | 327 | Invalid input 328 | ------------- 329 | 330 | When the parser encounters a syntactical error, it stops dead and 331 | throws a `Dissect\Parser\Exception\UnexpectedTokenException`. 332 | The exception gives you programmatic access to information about the 333 | problem: `getToken()` returns a `Dissect\Lexer\Token` representing the 334 | invalid token and `getExpected()` returns an array of token types the parser 335 | expected to encounter. 336 | 337 | Precomputing the parse table 338 | ---------------------------- 339 | 340 | The parser needs a *parse table* to decide what to do based on the given 341 | input.
That parse table is created from the grammar and, if we give the 342 | parser only the grammar, needs to be computed every time we instantiate 343 | the parser. 344 | 345 | Grammar analysis is costly; if you need the speed, a far better choice 346 | would be to precompute the table beforehand (perhaps as a part of your 347 | build process) like this: 348 | 349 | ```php 350 | use Dissect\Parser\LALR1\Analysis\Analyzer; 351 | 352 | $analyzer = new Analyzer(); 353 | $parseTable = $analyzer->analyze($grammar)->getParseTable(); 354 | ``` 355 | 356 | Now that we've got the parse table, we can dump it to a string which 357 | we then save to a file. To do this, we can use either 358 | `Dissect\Parser\LALR1\Dumper\ProductionTableDumper`: 359 | 360 | ```php 361 | $dumper = new ProductionTableDumper(); 362 | $php = $dumper->dump($parseTable); 363 | ``` 364 | 365 | which produces very compact, whitespace-free and absolutely unreadable 366 | code, or `Dissect\Parser\LALR1\Dumper\DebugTableDumper`: 367 | 368 | ```php 369 | $dumper = new DebugTableDumper($grammar); 370 | $php = $dumper->dump($parseTable); 371 | ``` 372 | 373 | which produces an indented, readable representation with comments 374 | explaining each step the parser takes when processing the input. 375 | 376 | ### Using the dumped parse table 377 | 378 | To use the dumped parse table, just write 379 | 380 | ```php 381 | $parser = new Parser($grammar, require $parseTableFile); 382 | ``` 383 | 384 | You still need to pass the grammar, since it contains the callbacks 385 | used to evaluate the input. 386 | 387 | > If you intend to use Dissect more like a traditional parser generator, 388 | > you don't actually need to do any of this, of course. Dissect provides a 389 | > command-line interface you can use to process and debug your grammars. 390 | > It's described in its own [documentation section][cli]. 391 | 392 | Resolving conflicts 393 | ------------------- 394 | 395 | *Caution, this is advanced stuff.
You probably won't ever need to worry 396 | about this.* 397 | 398 | LALR(1) is generally a very powerful parsing algorithm. However, there 399 | are practical grammars that are, unfortunately, almost-but-not-quite 400 | LALR(1). When running an LALR(1) analyzer on such grammars, one sees 401 | that they contain 2 types of conflicts: 402 | 403 | - **Shift/Reduce conflicts** - the parser doesn't know whether to shift 404 | another token or reduce what's on the stack. 405 | 406 | - **Reduce/Reduce conflicts** - the parser can reduce by multiple 407 | grammar rules. 408 | 409 | There are 4 commonly used ways of resolving such conflicts and Dissect allows you to 410 | combine them any way you want: 411 | 412 | 1. On a shift/reduce conflict, consult the operators' precedence 413 | and associativity information. The rules for resolution are a little 414 | complicated, but the conflict may be resolved as a reduce (either the 415 | precedence of the rule is higher than that of the shifted token or the 416 | token is left-associative), a shift (the rule precedence is lower or the 417 | token is right-associative) or even as an error (when the token is 418 | nonassociative). Note that Dissect doesn't report conflicts resolved 419 | using this technique, since they were intentionally created by the user 420 | and therefore are not really conflicts. Represented by the 421 | constant `Grammar::OPERATORS`. 422 | 423 | 2. On a shift/reduce conflict, always shift. This is represented by 424 | the constant `Grammar::SHIFT` and, together with the above method, 425 | is enabled by default. 426 | 427 | 3. On a reduce/reduce conflict, reduce using the longer rule. 428 | Represented by `Grammar::LONGER_REDUCE`. Both this and the previous 429 | way represent the same philosophy: take the largest bite possible. 430 | This is usually what the user intended to express. 431 | 432 | 4. On a reduce/reduce conflict, reduce using the rule that was 433 | declared earlier in the grammar.
Represented by 434 | `Grammar::EARLIER_REDUCE`. 435 | 436 | To specify precisely how Dissect should resolve parse table conflicts, 437 | call `resolve` on your grammar: 438 | 439 | ```php 440 | $this->resolve(Grammar::SHIFT | Grammar::OPERATORS | Grammar::LONGER_REDUCE); 441 | ``` 442 | 443 | There are two other constants: `Grammar::NONE` that forbids any 444 | conflicts in the grammar (even the operators-related ones) and 445 | `Grammar::ALL`, which is a combination of all the 4 above methods 446 | defined simply for convenience. 447 | 448 | [twigparser]: https://github.com/fabpot/Twig/blob/master/lib/Twig/Parser.php 449 | [twig]: https://github.com/fabpot/Twig 450 | [annotationsparser]: https://github.com/doctrine/common/blob/master/lib/Doctrine/Common/Annotations/DocParser.php 451 | [dqlparser]: https://github.com/doctrine/doctrine2/blob/master/lib/Doctrine/ORM/Query/Parser.php 452 | [doctrine]: https://github.com/doctrine 453 | [rdparser]: http://en.wikipedia.org/wiki/Recursive_descent_parser 454 | [llk]: http://en.wikipedia.org/wiki/LL_parser 455 | [lrk]: http://en.wikipedia.org/wiki/LR_parser 456 | [cli]: cli.md 457 | [common]: common.md 458 | -------------------------------------------------------------------------------- /src/Dissect/Parser/LALR1/Analysis/Analyzer.php: -------------------------------------------------------------------------------- 1 | 18 | */ 19 | class Analyzer 20 | { 21 | /** 22 | * Performs a grammar analysis. 23 | * 24 | * @param \Dissect\Parser\Grammar $grammar The grammar to analyse. 25 | * 26 | * @return \Dissect\Parser\LALR1\Analysis\AnalysisResult The result of the analysis. 27 | */ 28 | public function analyze(Grammar $grammar) 29 | { 30 | $automaton = $this->buildAutomaton($grammar); 31 | list($parseTable, $conflicts) = $this->buildParseTable($automaton, $grammar); 32 | 33 | return new AnalysisResult($parseTable, $automaton, $conflicts); 34 | } 35 | 36 | /** 37 | * Builds the handle-finding FSA from the grammar.
38 | * 39 | * @param \Dissect\Parser\Grammar $grammar The grammar. 40 | * 41 | * @return \Dissect\Parser\LALR1\Analysis\Automaton The resulting automaton. 42 | */ 43 | protected function buildAutomaton(Grammar $grammar) 44 | { 45 | // the eventual automaton 46 | $automaton = new Automaton(); 47 | 48 | // the queue of states that need processing 49 | $queue = new SplQueue(); 50 | 51 | // the BST for state kernels 52 | $kernelSet = new KernelSet(); 53 | 54 | // rules grouped by their name 55 | $groupedRules = $grammar->getGroupedRules(); 56 | 57 | // FIRST sets of nonterminals 58 | $firstSets = $this->calculateFirstSets($groupedRules); 59 | 60 | // keeps a list of tokens that need to be pumped 61 | // through the automaton 62 | $pumpings = array(); 63 | 64 | // the item from which the whole automaton 65 | // is derived 66 | $initialItem = new Item($grammar->getStartRule(), 0); 67 | 68 | // construct the initial state 69 | $state = new State($kernelSet->insert(array( 70 | array($initialItem->getRule()->getNumber(), $initialItem->getDotIndex()), 71 | )), array($initialItem)); 72 | 73 | // the initial item automatically has EOF 74 | // as its lookahead 75 | $pumpings[] = array($initialItem, array(Parser::EOF_TOKEN_TYPE)); 76 | 77 | $queue->enqueue($state); 78 | $automaton->addState($state); 79 | 80 | while (!$queue->isEmpty()) { 81 | $state = $queue->dequeue(); 82 | 83 | // items of this state are grouped by 84 | // the active component to calculate 85 | // transitions easily 86 | $groupedItems = array(); 87 | 88 | // calculate closure 89 | $added = array(); 90 | $currentItems = $state->getItems(); 91 | for ($x = 0; $x < count($currentItems); $x++) { 92 | $item = $currentItems[$x]; 93 | 94 | if (!$item->isReduceItem()) { 95 | $component = $item->getActiveComponent(); 96 | $groupedItems[$component][] = $item; 97 | 98 | // if nonterminal 99 | if ($grammar->hasNonterminal($component)) { 100 | 101 | // calculate lookahead 102 | $lookahead = array(); 103 | $cs = 
$item->getUnrecognizedComponents(); 104 | 105 | foreach ($cs as $i => $c) { 106 | if (!$grammar->hasNonterminal($c)) { 107 | // if terminal, add it and break the loop 108 | $lookahead = Util::union($lookahead, array($c)); 109 | 110 | break; 111 | } else { 112 | // if nonterminal 113 | $new = $firstSets[$c]; 114 | 115 | if (!in_array(Grammar::EPSILON, $new)) { 116 | // if the component doesn't derive 117 | // epsilon, merge FIRST sets and break 118 | $lookahead = Util::union($lookahead, $new); 119 | 120 | break; 121 | } else { 122 | // if it does 123 | 124 | if ($i < (count($cs) - 1)) { 125 | // if more components ahead, remove epsilon 126 | unset($new[array_search(Grammar::EPSILON, $new)]); 127 | } 128 | 129 | // and continue the loop 130 | $lookahead = Util::union($lookahead, $new); 131 | } 132 | } 133 | } 134 | 135 | // two items are connected if the unrecognized 136 | // part of rule 1 derives epsilon 137 | $connect = false; 138 | 139 | // only store the pumped tokens if there 140 | // actually is an unrecognized part 141 | $pump = true; 142 | 143 | if (empty($lookahead)) { 144 | $connect = true; 145 | $pump = false; 146 | } else { 147 | if (in_array(Grammar::EPSILON, $lookahead)) { 148 | unset($lookahead[array_search(Grammar::EPSILON, $lookahead)]); 149 | 150 | $connect = true; 151 | } 152 | } 153 | 154 | foreach ($groupedRules[$component] as $rule) { 155 | if (!in_array($component, $added)) { 156 | // if $component hasn't yet been expanded, 157 | // create new items for it 158 | $newItem = new Item($rule, 0); 159 | 160 | $currentItems[] = $newItem; 161 | $state->add($newItem); 162 | 163 | } else { 164 | // if it was expanded, each original 165 | // rule might bring new lookahead tokens, 166 | // so get the existing item from the current state 167 | $newItem = $state->get($rule->getNumber(), 0); 168 | } 169 | 170 | if ($connect) { 171 | $item->connect($newItem); 172 | } 173 | 174 | if ($pump) { 175 | $pumpings[] = array($newItem, $lookahead); 176 | } 177 | } 178 | } 179
| 180 | // mark the component as processed 181 | $added[] = $component; 182 | } 183 | } 184 | 185 | // calculate transitions 186 | foreach ($groupedItems as $thisComponent => $theseItems) { 187 | $newKernel = array(); 188 | 189 | foreach ($theseItems as $thisItem) { 190 | $newKernel[] = array( 191 | $thisItem->getRule()->getNumber(), 192 | $thisItem->getDotIndex() + 1, 193 | ); 194 | } 195 | 196 | $num = $kernelSet->insert($newKernel); 197 | 198 | if ($automaton->hasState($num)) { 199 | // the state already exists 200 | $automaton->addTransition($state->getNumber(), $thisComponent, $num); 201 | 202 | // extract the connected items from the target state 203 | $nextState = $automaton->getState($num); 204 | 205 | foreach ($theseItems as $thisItem) { 206 | $thisItem->connect( 207 | $nextState->get( 208 | $thisItem->getRule()->getNumber(), 209 | $thisItem->getDotIndex() + 1 210 | ) 211 | ); 212 | } 213 | } else { 214 | // new state needs to be created 215 | $newState = new State($num, array_map(function (Item $i) { 216 | $new = new Item($i->getRule(), $i->getDotIndex() + 1); 217 | 218 | // connect the two items 219 | $i->connect($new); 220 | 221 | return $new; 222 | }, $theseItems)); 223 | 224 | $automaton->addState($newState); 225 | $queue->enqueue($newState); 226 | 227 | $automaton->addTransition($state->getNumber(), $thisComponent, $num); 228 | } 229 | } 230 | } 231 | 232 | // pump all the lookahead tokens 233 | foreach ($pumpings as $pumping) { 234 | $pumping[0]->pumpAll($pumping[1]); 235 | } 236 | 237 | return $automaton; 238 | } 239 | 240 | /** 241 | * Encodes the handle-finding FSA as a LR parse table. 242 | * 243 | * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton 244 | * 245 | * @return array The parse table. 
*/
    protected function buildParseTable(Automaton $automaton, Grammar $grammar)
    {
        // Encoding of the 'action' table entries, as used throughout this method:
        //   positive integer => shift and go to that state,
        //   negative integer => reduce by the rule with number -n.
        // The method returns array($table, $conflicts), where $conflicts lists
        // every conflict that was resolved by one of the SHIFT, LONGER_REDUCE or
        // EARLIER_REDUCE strategies. Unresolvable conflicts are reported by
        // throwing ShiftReduceConflictException / ReduceReduceConflictException.
        $conflictsMode = $grammar->getConflictsMode();
        $conflicts = array();

        // $errors[state][token] is set when a nonassociative operator turned
        // a shift/reduce conflict into an explicit error entry; such cells
        // must not be filled again by later reduce items.
        $errors = array();

        // initialize the table
        $table = array(
            'action' => array(),
            'goto' => array(),
        );

        foreach ($automaton->getTransitionTable() as $num => $transitions) {
            foreach ($transitions as $trigger => $destination) {
                if (!$grammar->hasNonterminal($trigger)) {
                    // terminal implies shift
                    $table['action'][$num][$trigger] = $destination;
                } else {
                    // nonterminal goes in the goto table
                    $table['goto'][$num][$trigger] = $destination;
                }
            }
        }

        foreach ($automaton->getStates() as $num => $state) {
            if (!isset($table['action'][$num])) {
                $table['action'][$num] = array();
            }

            foreach ($state->getItems() as $item) {
                if (!$item->isReduceItem()) {
                    continue;
                }

                $rule = $item->getRule();
                $ruleNumber = $rule->getNumber();

                foreach ($item->getLookahead() as $token) {
                    if (isset($errors[$num][$token])) {
                        // there was a previous conflict resolved as an error
                        // entry for this token.
                        continue;
                    }

                    if (array_key_exists($token, $table['action'][$num])) {
                        // conflict
                        $instruction = $table['action'][$num][$token];

                        if ($instruction > 0) {
                            // the existing entry is a shift => shift/reduce conflict

                            if (($conflictsMode & Grammar::OPERATORS) && $grammar->hasOperator($token)) {
                                $operatorInfo = $grammar->getOperatorInfo($token);

                                $rulePrecedence = $rule->getPrecedence();

                                // unless the rule has given precedence
                                if ($rulePrecedence === null) {
                                    foreach (array_reverse($rule->getComponents()) as $c) {
                                        // try to extract it from the rightmost terminal
                                        if ($grammar->hasOperator($c)) {
                                            $ruleOperatorInfo = $grammar->getOperatorInfo($c);
                                            $rulePrecedence = $ruleOperatorInfo['prec'];

                                            break;
                                        }
                                    }
                                }

                                if ($rulePrecedence !== null) {
                                    // if we actually have a rule precedence

                                    $tokenPrecedence = $operatorInfo['prec'];

                                    if ($rulePrecedence > $tokenPrecedence) {
                                        // if the rule precedence is higher, reduce
                                        $table['action'][$num][$token] = -$ruleNumber;
                                    } elseif ($rulePrecedence < $tokenPrecedence) {
                                        // if the token precedence is higher, shift
                                        // (i.e. don't modify the table)
                                    } else {
                                        // precedences are equal, let's turn to associativity
                                        $assoc = $operatorInfo['assoc'];

                                        if ($assoc === Grammar::RIGHT) {
                                            // if right-associative, shift
                                            // (i.e. don't modify the table)
                                        } elseif ($assoc === Grammar::LEFT) {
                                            // if left-associative, reduce
                                            $table['action'][$num][$token] = -$ruleNumber;
                                        } elseif ($assoc === Grammar::NONASSOC) {
                                            // the token is nonassociative.
                                            // this actually means an input error, so
                                            // remove the shift entry from the table
                                            // and mark this as an explicit error
                                            // entry
                                            unset($table['action'][$num][$token]);
                                            $errors[$num][$token] = true;
                                        }
                                    }

                                    continue; // resolved the conflict, phew
                                }

                                // we couldn't calculate the precedence => the conflict was not resolved
                                // move along.
                            }

                            // s/r
                            if ($conflictsMode & Grammar::SHIFT) {
                                // keep the shift already in the table and
                                // just report how the conflict was resolved
                                $conflicts[] = array(
                                    'state' => $num,
                                    'lookahead' => $token,
                                    'rule' => $rule,
                                    'resolution' => Grammar::SHIFT,
                                );

                                continue;
                            }

                            throw new ShiftReduceConflictException(
                                $num,
                                $rule,
                                $token,
                                $automaton
                            );
                        }

                        // the existing entry is a reduction => r/r
                        $originalRule = $grammar->getRule(-$instruction);
                        $newRule = $rule;

                        if ($conflictsMode & Grammar::LONGER_REDUCE) {
                            $count1 = count($originalRule->getComponents());
                            $count2 = count($newRule->getComponents());

                            if ($count1 > $count2) {
                                // original rule is longer
                                $resolvedRules = array($originalRule, $newRule);

                                $conflicts[] = array(
                                    'state' => $num,
                                    'lookahead' => $token,
                                    'rules' => $resolvedRules,
                                    'resolution' => Grammar::LONGER_REDUCE,
                                );

                                continue;
                            } elseif ($count2 > $count1) {
                                // new rule is longer
                                $table['action'][$num][$token] = -$ruleNumber;
                                $resolvedRules = array($newRule, $originalRule);

                                $conflicts[] = array(
                                    'state' => $num,
                                    'lookahead' => $token,
                                    'rules' => $resolvedRules,
                                    'resolution' => Grammar::LONGER_REDUCE,
                                );

                                continue;
                            }

                            // equal lengths: fall through to the next strategy
                        }

                        if ($conflictsMode & Grammar::EARLIER_REDUCE) {
                            if (-$instruction < $ruleNumber) {
                                // original rule was earlier
                                $resolvedRules = array($originalRule, $newRule);

                                $conflicts[] = array(
                                    'state' => $num,
                                    'lookahead' => $token,
                                    'rules' => $resolvedRules,
                                    'resolution' => Grammar::EARLIER_REDUCE,
                                );

                                continue;
                            }

                            // new rule was earlier
                            $table['action'][$num][$token] = -$ruleNumber;

                            // The winning rule goes first, as in the other
                            // branches. This must be assigned *before* the
                            // conflict is recorded; otherwise the entry would
                            // carry an undefined value or the rules of some
                            // previously resolved conflict.
                            $resolvedRules = array($newRule, $originalRule);

                            $conflicts[] = array(
                                'state' => $num,
                                'lookahead' => $token,
                                'rules' => $resolvedRules,
                                'resolution' => Grammar::EARLIER_REDUCE,
                            );

                            continue;
                        }

                        // everything failed, throw an exception
                        throw new ReduceReduceConflictException(
                            $num,
                            $originalRule,
                            $newRule,
                            $token,
                            $automaton
                        );
                    }

                    // no conflict: plain reduce entry
                    $table['action'][$num][$token] = -$ruleNumber;
                }
            }
        }

        return array($table, $conflicts);
    }

    /**
     * Calculates the FIRST sets of all nonterminals.
     *
     * @param array $rules The rules grouped by the LHS.
     *
     * @return array Calculated FIRST sets.
*/
    protected function calculateFirstSets(array $rules)
    {
        // every nonterminal (i.e. every key of $rules) starts out with an
        // empty FIRST set
        $firstSets = array_fill_keys(array_keys($rules), array());

        // keep sweeping over all rules until a full pass adds nothing new
        $changed = true;

        while ($changed) {
            $changed = false;

            foreach ($rules as $lhs => $alternatives) {
                foreach ($alternatives as $rule) {
                    $symbols = $rule->getComponents();

                    if (empty($symbols)) {
                        // an empty right-hand side contributes epsilon
                        $candidate = array(Grammar::EPSILON);
                    } else {
                        $candidate = array();
                        $lastIndex = count($symbols) - 1;

                        foreach ($symbols as $position => $symbol) {
                            if (!array_key_exists($symbol, $rules)) {
                                // a terminal is added as-is and ends the scan
                                $candidate = Util::union($candidate, array($symbol));

                                break;
                            }

                            // a nonterminal contributes its current FIRST set
                            $symbolFirst = $firstSets[$symbol];
                            $epsilonKey = array_search(Grammar::EPSILON, $symbolFirst);

                            if ($epsilonKey === false) {
                                // no epsilon in its FIRST set, so nothing
                                // behind this symbol can contribute
                                $candidate = Util::union($candidate, $symbolFirst);

                                break;
                            }

                            if ($position < $lastIndex) {
                                // more symbols follow: epsilon is dropped here
                                // and only survives if the last symbol also
                                // has it in its FIRST set
                                unset($symbolFirst[$epsilonKey]);
                            }

                            $candidate = Util::union($candidate, $symbolFirst);
                        }
                    }

                    if (Util::different($candidate, $firstSets[$lhs])) {
                        $firstSets[$lhs] = Util::union($firstSets[$lhs], $candidate);
                        $changed = true;
                    }
                }
            }
        }

        return $firstSets;
    }
}

--------------------------------------------------------------------------------