├── .gitignore
├── bin
    ├── dissect
    └── dissect.php
├── docs
    ├── state_3.png
    ├── index.md
    ├── cli.md
    ├── common.md
    ├── ast.md
    ├── lexing.md
    └── parsing.md
├── CHANGELOG.md
├── tests
    ├── Dissect
    │   ├── Parser
    │   │   ├── LALR1
    │   │   │   ├── Dumper
    │   │   │   │   ├── res
    │   │   │   │   │   ├── table
    │   │   │   │   │   │   ├── production.php
    │   │   │   │   │   │   └── debug.php
    │   │   │   │   │   └── graphviz
    │   │   │   │   │   │   ├── state.dot
    │   │   │   │   │   │   └── automaton.dot
    │   │   │   │   ├── ExampleGrammar.php
    │   │   │   │   ├── ProductionTableDumperTest.php
    │   │   │   │   ├── DebugTableDumperTest.php
    │   │   │   │   └── AutomatonDumperTest.php
    │   │   │   ├── ArithLexer.php
    │   │   │   ├── Analysis
    │   │   │   │   ├── StateTest.php
    │   │   │   │   ├── AutomatonTest.php
    │   │   │   │   ├── KernelSet
    │   │   │   │   │   └── KernelSetTest.php
    │   │   │   │   ├── ItemTest.php
    │   │   │   │   └── AnalyzerTest.php
    │   │   │   ├── ArithGrammar.php
    │   │   │   └── ParserTest.php
    │   │   ├── ExampleGrammar.php
    │   │   ├── RuleTest.php
    │   │   └── GrammarTest.php
    │   └── Lexer
    │   │   ├── StubLexer.php
    │   │   ├── StubRegexLexer.php
    │   │   ├── Recognizer
    │   │       ├── SimpleRecognizerTest.php
    │   │       └── RegexRecognizerTest.php
    │   │   ├── RegexLexerTest.php
    │   │   ├── SimpleLexerTest.php
    │   │   ├── StatefulLexerTest.php
    │   │   ├── AbstractLexerTest.php
    │   │   └── TokenStream
    │   │       └── ArrayTokenStreamTest.php
    └── bootstrap.php
├── .travis.yml
├── src
    └── Dissect
    │   ├── Parser
    │       ├── LALR1
    │       │   ├── Analysis
    │       │   │   ├── KernelSet
    │       │   │   │   ├── Node.php
    │       │   │   │   └── KernelSet.php
    │       │   │   ├── Exception
    │       │   │   │   ├── ConflictException.php
    │       │   │   │   ├── ShiftReduceConflictException.php
    │       │   │   │   └── ReduceReduceConflictException.php
    │       │   │   ├── AnalysisResult.php
    │       │   │   ├── State.php
    │       │   │   ├── Automaton.php
    │       │   │   ├── Item.php
    │       │   │   └── Analyzer.php
    │       │   ├── Dumper
    │       │   │   ├── TableDumper.php
    │       │   │   ├── StringWriter.php
    │       │   │   ├── ProductionTableDumper.php
    │       │   │   ├── AutomatonDumper.php
    │       │   │   └── DebugTableDumper.php
    │       │   └── Parser.php
    │       ├── Parser.php
    │       ├── Exception
    │       │   └── UnexpectedTokenException.php
    │       ├── Rule.php
    │       └── Grammar.php
    │   ├── Lexer
    │       ├── Lexer.php
    │       ├── Token.php
    │       ├── Recognizer
    │       │   ├── Recognizer.php
    │       │   ├── SimpleRecognizer.php
    │       │   └── RegexRecognizer.php
    │       ├── Exception
    │       │   └── RecognitionException.php
    │       ├── CommonToken.php
    │       ├── TokenStream
    │       │   ├── TokenStream.php
    │       │   └── ArrayTokenStream.php
    │       ├── RegexLexer.php
    │       ├── AbstractLexer.php
    │       ├── SimpleLexer.php
    │       └── StatefulLexer.php
    │   ├── Console
    │       ├── Application.php
    │       └── Command
    │       │   └── DissectCommand.php
    │   ├── Util
    │       └── Util.php
    │   └── Node
    │       ├── Node.php
    │       └── CommonNode.php
├── phpunit.xml
├── TODO.md
├── README.md
├── composer.json
└── UNLICENSE


/.gitignore:
--------------------------------------------------------------------------------
1 | vendor/
2 | composer.phar
3 | composer.lock
4 | 


--------------------------------------------------------------------------------
/bin/dissect:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env php
2 | <?php
3 | 
4 | // Thin executable wrapper: the only thing it does is load dissect.php from this directory.
5 | require __DIR__ . '/dissect.php';
6 | 


--------------------------------------------------------------------------------
/docs/state_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jakubledl/dissect/HEAD/docs/state_3.png


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | Changelog
 2 | =========
 3 | 
 4 | 1.0.1 (2013-01-29)
 5 | ------------------
 6 | 
 7 | - 2b40f94: Fixed an invalid format in the CLI
 8 | 
 9 | 1.0.0 (2013-01-15)
10 | ------------------
11 | 
12 | - First release.
13 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Dumper/res/table/production.php:
--------------------------------------------------------------------------------
1 | <?php return array('action'=>array(0=>array('a'=>2,'$eof'=>-2,),2=>array('a'=>2,'b'=>-2,),3=>array('b'=>4,),1=>array('$eof'=>0,),4=>array('$eof'=>-1,'b'=>-1,),),'goto'=>array(0=>array('S'=>1,),2=>array('S'=>3,),));
2 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: php
 2 | 
 3 | php:
 4 |     - 5.3
 5 |     - 5.4
 6 | 
 7 | branches:
 8 |     only:
 9 |         - master
10 |         - develop
11 | 
12 | before_script:
13 |     - wget http://getcomposer.org/composer.phar
14 |     - php composer.phar dump-autoload
15 | 


--------------------------------------------------------------------------------
/tests/bootstrap.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | $autoloadFile = __DIR__ . '/../vendor/autoload.php';
 4 | 
 5 | // Bail out early when the autoloader has not been generated yet.
 6 | if (!file_exists($autoloadFile)) {
 7 |     die("Setup the project dependencies before running unit tests." . PHP_EOL);
 8 | }
 9 | 
10 | // The required file returns a loader object; the tests directory is
11 | // registered on it for the Dissect prefix so test-only classes resolve.
12 | $loader = require $autoloadFile;
13 | $loader->add('Dissect', __DIR__);
14 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Dumper/res/graphviz/state.dot:
--------------------------------------------------------------------------------
 1 | digraph State2 {
 2 |     rankdir="LR";
 3 | 
 4 |     2 [label="State 2\n\nS &rarr; a &bull; S b\nS &rarr; &bull; a S b\nS &rarr; &bull; [b]"];
 5 |     3 [label="State 3"];
 6 | 
 7 |     2 -> 3 [label="S"];
 8 |     2 -> 2 [label="a"];
 9 | }
10 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/ExampleGrammar.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser;
 4 | 
 5 | /**
 6 |  * A tiny grammar fixture living alongside the Parser tests.
 7 |  *
 8 |  * It declares Foo with the bodies `a b c` and `x y z` (presumably two
 9 |  * alternative productions - confirm against Grammar::is()) and names
10 |  * Foo as the start symbol.
11 |  */
12 | class ExampleGrammar extends Grammar
13 | {
14 |     public function __construct()
15 |     {
16 |         $this('Foo')
17 |             ->is('a', 'b', 'c')
18 |             ->is('x', 'y', 'z');
19 | 
20 |         $this->start('Foo');
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Dumper/ExampleGrammar.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Dumper;
 4 | 
 5 | use Dissect\Parser\Grammar;
 6 | 
 7 | /**
 8 |  * Grammar fixture for the dumper tests in this directory.
 9 |  *
10 |  * Declares S with the body `a S b` and an empty body, with S as the
11 |  * start symbol; the expected dumps for it are stored under res/.
12 |  */
13 | class ExampleGrammar extends Grammar
14 | {
15 |     public function __construct()
16 |     {
17 |         $this('S')
18 |             ->is('a', 'S', 'b')
19 |             ->is(/* empty */);
20 | 
21 |         $this->start('S');
22 |     }
23 | }
24 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/KernelSet/Node.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis\KernelSet;
 4 | 
 5 | /**
 6 |  * One entry of a kernel set: a hashed kernel together with the number
 7 |  * given to it, plus two links that start out as null.
 8 |  */
 9 | class Node
10 | {
11 |     /**
12 |      * @var array The hashed kernel held by this node.
13 |      */
14 |     public $kernel;
15 | 
16 |     /**
17 |      * @var mixed The number associated with the kernel.
18 |      */
19 |     public $number;
20 | 
21 |     /**
22 |      * @var mixed Left link, null until assigned from outside
23 |      *            (presumably a child Node - confirm in KernelSet).
24 |      */
25 |     public $left;
26 | 
27 |     /**
28 |      * @var mixed Right link, null until assigned from outside
29 |      *            (presumably a child Node - confirm in KernelSet).
30 |      */
31 |     public $right;
32 | 
33 |     /**
34 |      * Constructor.
35 |      *
36 |      * @param array $hashedKernel The hashed kernel to store.
37 |      * @param mixed $number       The number to associate with the kernel.
38 |      */
39 |     public function __construct(array $hashedKernel, $number)
40 |     {
41 |         $this->number = $number;
42 |         $this->kernel = $hashedKernel;
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/phpunit.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | 
 3 | <phpunit bootstrap="./tests/bootstrap.php" colors="true"
 4 |     backupGlobals="false">
 5 | 
 6 |     <testsuites>
 7 |         <testsuite name="Dissect test suite">
 8 |             <directory suffix="Test.php">./tests</directory>
 9 |         </testsuite>
10 |     </testsuites>
11 | 
12 |     <filter>
13 |         <whitelist>
14 |             <directory suffix=".php">./src</directory>
15 |         </whitelist>
16 |     </filter>
17 | </phpunit>
18 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/RuleTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser;
 4 | 
 5 | use PHPUnit_Framework_TestCase;
 6 | 
 7 | /**
 8 |  * Tests for component lookup on a Rule.
 9 |  */
10 | class RuleTest extends PHPUnit_Framework_TestCase
11 | {
12 |     /**
13 |      * A rule with two components answers index 1 but yields null for index 2.
14 |      *
15 |      * @test
16 |      */
17 |     public function getComponentShouldReturnNullIfAskedForComponentOutOfRange()
18 |     {
19 |         $components = array('x', 'y');
20 |         $rule = new Rule(1, 'Foo', $components);
21 | 
22 |         $lastComponent = $rule->getComponent(1);
23 |         $pastTheEnd = $rule->getComponent(2);
24 | 
25 |         $this->assertEquals('y', $lastComponent);
26 |         $this->assertNull($pastTheEnd);
27 |     }
28 | }
29 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Dumper/TableDumper.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Dumper;
 4 | 
 5 | /**
 6 |  * A common contract for parse table dumpers.
 7 |  *
 8 |  * @author Jakub Lédl <jakubledl@gmail.com>
 9 |  */
10 | interface TableDumper
11 | {
12 |     /**
13 |      * Dumps the parse table.
14 |      *
15 |      * @param array $table The parse table, e.g. the array obtained from
16 |      *                     getParseTable() on the analyzer's result.
17 |      *
18 |      * @return string The resulting string representation of the table.
19 |      */
20 |     public function dump(array $table);
21 | }
22 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/ArithLexer.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1;
 4 | 
 5 | use Dissect\Lexer\SimpleLexer;
 6 | 
 7 | /**
 8 |  * Lexer fixture for arithmetic expressions: an INT pattern, parentheses,
 9 |  * the operators + - ** * / and a WSP pattern that is passed to skip().
10 |  */
11 | class ArithLexer extends SimpleLexer
12 | {
13 |     public function __construct()
14 |     {
15 |         $this->regex('INT', '/^[1-9][0-9]*/');
16 | 
17 |         // Registration order is kept exactly; note that '**' is registered
18 |         // ahead of '*' (presumably so the longer operator is tried first).
19 |         $literals = array('(', ')', '+', '-', '**', '*', '/');
20 | 
21 |         foreach ($literals as $literal) {
22 |             $this->token($literal);
23 |         }
24 | 
25 |         $this->regex('WSP', "/^[ \r\n\t]+/");
26 |         $this->skip('WSP');
27 |     }
28 | }
29 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Dumper/res/graphviz/automaton.dot:
--------------------------------------------------------------------------------
 1 | digraph Automaton {
 2 |     rankdir="LR";
 3 | 
 4 |     0 [label="State 0\n\n&bull; S\nS &rarr; &bull; a S b\nS &rarr; &bull; [$eof]"];
 5 |     1 [label="State 1\n\nS &bull; [$eof]"];
 6 |     2 [label="State 2\n\nS &rarr; a &bull; S b\nS &rarr; &bull; a S b\nS &rarr; &bull; [b]"];
 7 |     3 [label="State 3\n\nS &rarr; a S &bull; b"];
 8 |     4 [label="State 4\n\nS &rarr; a S b &bull; [$eof b]"];
 9 | 
10 |     0 -> 1 [label="S"];
11 |     0 -> 2 [label="a"];
12 |     2 -> 3 [label="S"];
13 |     2 -> 2 [label="a"];
14 |     3 -> 4 [label="b"];
15 | }
16 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | Goals
 2 | =====
 3 | 
 4 | 1.1
 5 | ---
 6 | 
 7 | - Optional operator precedence support (à la *yacc*, *bison*) - &#10004;
 8 | - A performance-oriented regex lexer (based on doctrine/lexer) - &#10004;
 9 | - An option to generate a hybrid recursive ascent parser - &#9633;
10 | 
11 | 1.0
12 | ---
13 | 
14 | - Compute reduction lookahead by the channel algorithm from *yacc*
15 |   instead of the current LALR-by-SLR algorithm - &#10004;
16 | - Change the analyzer API to allow for grammar debugging
17 |   (provide access to resolved conflicts, dumping the automaton to DOT ...) - &#10004;
18 | - Provide classes for dumping the parse table to PHP (both the dev & prod version) - &#10004;
19 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/Lexer.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | /**
 6 |  * A lexer takes an input string and processes
 7 |  * it into a token stream.
 8 |  *
 9 |  * @author Jakub Lédl <jakubledl@gmail.com>
10 |  */
11 | interface Lexer
12 | {
13 |     /**
14 |      * Lexes the given string, returning a token stream.
15 |      *
16 |      * @param string $string The string to lex.
17 |      *
18 |      * @throws \Dissect\Lexer\Exception\RecognitionException
19 |      * When unable to extract more tokens from the string. The exception
20 |      * exposes the source line it was created with through getSourceLine().
21 |      *
22 |      * @return \Dissect\Lexer\TokenStream\TokenStream The resulting token stream.
23 |      */
24 |     public function lex($string);
25 | }
26 | 


--------------------------------------------------------------------------------
/tests/Dissect/Lexer/StubLexer.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | /**
 6 |  * Minimal AbstractLexer implementation for tests: the first byte of the
 7 |  * input becomes a token of its own, 'd' is unrecognizable and tokens of
 8 |  * type 'e' are reported as skippable.
 9 |  */
10 | class StubLexer extends AbstractLexer
11 | {
12 |     /**
13 |      * Extracts a one-byte token from the start of the string.
14 |      *
15 |      * @param string $string The remaining input.
16 |      *
17 |      * @return CommonToken|null The token, or null when the input is empty
18 |      * or starts with the unrecognizable character 'd'.
19 |      */
20 |     protected function extractToken($string)
21 |     {
22 |         // Plain emptiness check. The former strlen(utf8_decode(...)) form
23 |         // relied on a function that is deprecated as of PHP 8.2. The cast
24 |         // keeps null/false inputs treated as empty, exactly as before.
25 |         if ((string) $string === '') {
26 |             return null;
27 |         }
28 | 
29 |         $char = $string[0];
30 | 
31 |         if ($char === 'd') { // unrecognizable token
32 |             return null;
33 |         }
34 | 
35 |         // The character serves as both the token type and the token value.
36 |         return new CommonToken($char, $char, $this->getCurrentLine());
37 |     }
38 | 
39 |     /**
40 |      * Reports tokens of type 'e' as skippable.
41 |      *
42 |      * @param Token $t The token to inspect.
43 |      *
44 |      * @return boolean True for tokens whose type is 'e'.
45 |      */
46 |     protected function shouldSkipToken(Token $t)
47 |     {
48 |         return $t->getType() === 'e';
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/Token.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | /**
 6 |  * A common contract for tokens.
 7 |  *
 8 |  * @author Jakub Lédl <jakubledl@gmail.com>
 9 |  */
10 | interface Token
11 | {
12 |     /**
13 |      * Returns the token type.
14 |      *
15 |      * @return mixed The token type.
16 |      */
17 |     public function getType();
18 | 
19 |     /**
20 |      * Returns the token value.
21 |      *
22 |      * @return string The token value.
23 |      */
24 |     public function getValue();
25 | 
26 |     /**
27 |      * Returns the line on which the token was found.
28 |      *
29 |      * @return int The line number.
30 |      */
31 |     public function getLine();
32 | }
33 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Welcome to Dissect!
 2 | - [master](https://github.com/jakubledl/dissect/tree/master) [![build status](https://travis-ci.org/jakubledl/dissect.png?branch=master)](https://travis-ci.org/jakubledl/dissect) - this branch always contains the last stable version.
 3 | - [develop](https://github.com/jakubledl/dissect) [![build status](https://travis-ci.org/jakubledl/dissect.png?branch=develop)](https://travis-ci.org/jakubledl/dissect) - the unstable development branch.
 4 | 
 5 | Dissect is a set of tools for lexical and syntactical analysis written
 6 | in pure PHP.
 7 | 
 8 | Documentation?
 9 | --------------
10 | 
11 | [Here][docs].
12 | 
13 | [docs]: https://github.com/jakubledl/dissect/blob/master/docs/index.md
14 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/Parser.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser;
 4 | 
 5 | use Dissect\Lexer\TokenStream\TokenStream;
 6 | 
 7 | /**
 8 |  * The parser interface.
 9 |  *
10 |  * @author Jakub Lédl <jakubledl@gmail.com>
11 |  */
12 | interface Parser
13 | {
14 |     /**
15 |      * The token type that represents an EOF.
16 |      */
17 |     const EOF_TOKEN_TYPE = '$eof';
18 | 
19 |     /**
20 |      * Parses a token stream and returns the semantic value
21 |      * of the input.
22 |      *
23 |      * @param \Dissect\Lexer\TokenStream\TokenStream $stream The token stream.
24 |      *
25 |      * @return mixed The semantic value of the input.
26 |      */
27 |     public function parse(TokenStream $stream);
28 | }
29 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Analysis/StateTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis;
 4 | 
 5 | use Dissect\Parser\Rule;
 6 | use PHPUnit_Framework_TestCase;
 7 | 
 8 | class StateTest extends PHPUnit_Framework_TestCase
 9 | {
10 |     /**
11 |      * Items can be fetched by (rule number, position), both when handed
12 |      * to the constructor and when added afterwards.
13 |      *
14 |      * @test
15 |      */
16 |     public function stateShouldKeepItemsByRuleNumberAndPosition()
17 |     {
18 |         $initialRule = new Rule(1, 'E', array('E', '+', 'T'));
19 |         $initialItem = new Item($initialRule, 0);
20 |         $state = new State(0, array($initialItem));
21 | 
22 |         $this->assertSame($initialItem, $state->get(1, 0));
23 | 
24 |         $addedRule = new Rule(2, 'T', array('T', '+', 'F'));
25 |         $addedItem = new Item($addedRule, 0);
26 |         $state->add($addedItem);
27 | 
28 |         $this->assertSame($addedItem, $state->get(2, 0));
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/Recognizer/Recognizer.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer\Recognizer;
 4 | 
 5 | /**
 6 |  * Recognizers are used by the lexer to process
 7 |  * the input string.
 8 |  *
 9 |  * @author Jakub Lédl <jakubledl@gmail.com>
10 |  */
11 | interface Recognizer
12 | {
13 |     /**
14 |      * Returns a boolean value specifying whether
15 |      * the string matches or not and, if it does,
16 |      * stores the match in the by-reference second argument.
17 |      *
18 |      * @param string $string The string to match.
19 |      * @param string $result The variable that gets set to the value of the match.
20 |      *
21 |      * @return boolean Whether the match was successful or not.
22 |      */
23 |     public function match($string, &$result);
24 | }
25 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Dumper/ProductionTableDumperTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Dumper;
 4 | 
 5 | use Dissect\Parser\LALR1\Analysis\Analyzer;
 6 | use PHPUnit_Framework_TestCase;
 7 | 
 8 | class ProductionTableDumperTest extends PHPUnit_Framework_TestCase
 9 | {
10 |     /**
11 |      * The dump of the example grammar's parse table has to equal the
12 |      * stored fixture res/table/production.php exactly.
13 |      *
14 |      * @test
15 |      */
16 |     public function theWrittenTableShouldBeAsCompactAsPossible()
17 |     {
18 |         $analyzer = new Analyzer();
19 |         $analysis = $analyzer->analyze(new ExampleGrammar());
20 | 
21 |         $dumper = new ProductionTableDumper();
22 |         $actual = $dumper->dump($analysis->getParseTable());
23 | 
24 |         $this->assertStringEqualsFile(__DIR__ . '/res/table/production.php', $actual);
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Dumper/DebugTableDumperTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Dumper;
 4 | 
 5 | use Dissect\Parser\LALR1\Analysis\Analyzer;
 6 | use PHPUnit_Framework_TestCase;
 7 | 
 8 | class DebugTableDumperTest extends PHPUnit_Framework_TestCase
 9 | {
10 |     /**
11 |      * The debug dump of the example grammar's parse table has to equal
12 |      * the stored fixture res/table/debug.php exactly.
13 |      *
14 |      * @test
15 |      */
16 |     public function itDumpsAHumanReadableParseTableWithExplainingComments()
17 |     {
18 |         $grammar = new ExampleGrammar();
19 |         $analyzer = new Analyzer();
20 |         $analysis = $analyzer->analyze($grammar);
21 | 
22 |         // The debug dumper is given the grammar in addition to the table.
23 |         $dumper = new DebugTableDumper($grammar);
24 |         $expectedFile = __DIR__ . '/res/table/debug.php';
25 | 
26 |         $this->assertStringEqualsFile(
27 |             $expectedFile,
28 |             $dumper->dump($analysis->getParseTable())
29 |         );
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "jakubledl/dissect",
 3 |     "description": "Lexing and parsing in pure PHP",
 4 |     "keywords": ["lexing", "parsing", "ast", "parser"],
 5 |     "homepage": "https://github.com/jakubledl/dissect",
 6 |     "license": "unlicense",
 7 |     "authors": [
 8 |         {
 9 |             "name": "Jakub Lédl",
10 |             "email": "jakubledl@gmail.com"
11 |         }
12 |     ],
13 | 
14 |     "require": {
15 |         "php": ">=5.3.3"
16 |     },
17 | 
18 |     "require-dev": {
19 |         "symfony/console": "~2.1"
20 |     },
21 | 
22 |     "suggest": {
23 |         "symfony/console": "for the command-line tool"
24 |     },
25 | 
26 |     "bin": ["bin/dissect.php", "bin/dissect"],
27 | 
28 |     "autoload": {
29 |         "psr-0": { "Dissect": ["src/"] }
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/tests/Dissect/Lexer/StubRegexLexer.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | use RuntimeException;
 6 | 
 7 | /**
 8 |  * RegexLexer fixture: positive integers are catchable, runs of whitespace
 9 |  * are non-catchable, and + and - are the only accepted operators.
10 |  */
11 | class StubRegexLexer extends RegexLexer
12 | {
13 |     protected $operators = array('+', '-');
14 | 
15 |     protected function getCatchablePatterns()
16 |     {
17 |         $integer = '[1-9][0-9]*';
18 | 
19 |         return array($integer);
20 |     }
21 | 
22 |     protected function getNonCatchablePatterns()
23 |     {
24 |         $whitespace = '\s+';
25 | 
26 |         return array($whitespace);
27 |     }
28 | 
29 |     /**
30 |      * Maps a matched value to its token type.
31 |      *
32 |      * Numeric values are converted to int in place (the argument is taken
33 |      * by reference) and typed 'INT'; operators are typed as themselves.
34 |      *
35 |      * @param mixed $value The matched value.
36 |      *
37 |      * @throws RuntimeException When the value is neither numeric nor an operator.
38 |      *
39 |      * @return string The token type.
40 |      */
41 |     protected function getType(&$value)
42 |     {
43 |         if (is_numeric($value)) {
44 |             $value = (int)$value;
45 | 
46 |             return 'INT';
47 |         }
48 | 
49 |         if (!in_array($value, $this->operators)) {
50 |             throw new RuntimeException(sprintf('Invalid token "%s"', $value));
51 |         }
52 | 
53 |         return $value;
54 |     }
55 | }
56 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/Recognizer/SimpleRecognizer.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer\Recognizer;
 4 | 
 5 | /**
 6 |  * SimpleRecognizer matches when the input begins
 7 |  * with a fixed string (a strncmp prefix comparison).
 8 |  *
 9 |  * @author Jakub Lédl <jakubledl@gmail.com>
10 |  */
11 | class SimpleRecognizer implements Recognizer
12 | {
13 |     /**
14 |      * @var string The string the input has to start with.
15 |      */
16 |     protected $string;
17 | 
18 |     /**
19 |      * Constructor.
20 |      *
21 |      * @param string $string The string to match by.
22 |      */
23 |     public function __construct($string)
24 |     {
25 |         $this->string = $string;
26 |     }
27 | 
28 |     /**
29 |      * {@inheritDoc}
30 |      *
31 |      * Succeeds only if $string starts with the configured string; in that
32 |      * case $result receives the configured string. On failure $result is
33 |      * left untouched.
34 |      */
35 |     public function match($string, &$result)
36 |     {
37 |         $expected = $this->string;
38 | 
39 |         if (strncmp($string, $expected, strlen($expected)) !== 0) {
40 |             return false;
41 |         }
42 | 
43 |         $result = $expected;
44 | 
45 |         return true;
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/Exception/RecognitionException.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer\Exception;
 4 | 
 5 | use RuntimeException;
 6 | 
 7 | /**
 8 |  * Thrown when a lexer is unable to extract another token.
 9 |  *
10 |  * @author Jakub Lédl <jakubledl@gmail.com>
11 |  */
12 | class RecognitionException extends RuntimeException
13 | {
14 |     /**
15 |      * @var int The source line passed to the constructor.
16 |      */
17 |     protected $sourceLine;
18 | 
19 |     /**
20 |      * Builds the exception and its message from the source line.
21 |      *
22 |      * @param int $line The line in the source.
23 |      */
24 |     public function __construct($line)
25 |     {
26 |         $message = sprintf("Cannot extract another token at line %d.", $line);
27 | 
28 |         parent::__construct($message);
29 | 
30 |         $this->sourceLine = $line;
31 |     }
32 | 
33 |     /**
34 |      * Returns the source line number where the exception occurred.
35 |      *
36 |      * @return int The source line number.
37 |      */
38 |     public function getSourceLine()
39 |     {
40 |         return $this->sourceLine;
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/Recognizer/RegexRecognizer.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer\Recognizer;
 4 | 
 5 | /**
 6 |  * The RegexRecognizer matches a string using a
 7 |  * regular expression.
 8 |  *
 9 |  * @author Jakub Lédl <jakubledl@gmail.com>
10 |  */
11 | class RegexRecognizer implements Recognizer
12 | {
13 |     /**
14 |      * @var string The regular expression handed to preg_match().
15 |      */
16 |     protected $regex;
17 | 
18 |     /**
19 |      * Constructor.
20 |      *
21 |      * @param string $regex The regex to use in the match.
22 |      */
23 |     public function __construct($regex)
24 |     {
25 |         $this->regex = $regex;
26 |     }
27 | 
28 |     /**
29 |      * {@inheritDoc}
30 |      *
31 |      * A match counts only when it begins at offset 0 of the string;
32 |      * $result then receives the full matched text.
33 |      */
34 |     public function match($string, &$result)
35 |     {
36 |         $found = preg_match($this->regex, $string, $groups, PREG_OFFSET_CAPTURE);
37 | 
38 |         // With PREG_OFFSET_CAPTURE every group is a (text, offset) pair.
39 |         if ($found !== 1 || $groups[0][1] !== 0) {
40 |             return false;
41 |         }
42 | 
43 |         $result = $groups[0][0];
44 | 
45 |         return true;
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/tests/Dissect/Lexer/Recognizer/SimpleRecognizerTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer\Recognizer;
 4 | 
 5 | use PHPUnit_Framework_TestCase;
 6 | 
 7 | class SimpleRecognizerTest extends PHPUnit_Framework_TestCase
 8 | {
 9 |     /**
10 |      * @test
11 |      */
12 |     public function recognizerShouldMatchAndPassTheValueByReference()
13 |     {
14 |         $input = 'class lorem ipsum';
15 |         $recognizer = new SimpleRecognizer('class');
16 | 
17 |         // $matched is created by the by-reference call itself.
18 |         $succeeded = $recognizer->match($input, $matched);
19 | 
20 |         $this->assertTrue($succeeded);
21 |         $this->assertNotNull($matched);
22 |         $this->assertEquals('class', $matched);
23 |     }
24 | 
25 |     /**
26 |      * @test
27 |      */
28 |     public function recognizerShouldFailAndTheValueShouldStayNull()
29 |     {
30 |         $input = 'lorem ipsum';
31 |         $recognizer = new SimpleRecognizer('class');
32 | 
33 |         $succeeded = $recognizer->match($input, $matched);
34 | 
35 |         $this->assertFalse($succeeded);
36 |         $this->assertNull($matched);
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/bin/dissect.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | define('DISSECT_VERSION', 'DEV');
 4 | 
 5 | // Becomes true as soon as any Composer autoloader has been loaded.
 6 | $autoloaderLoaded = false;
 7 | 
 8 | // Step 1: the vendor directory of the current working directory, if any.
 9 | // Testing for autoload.php itself (not merely the directory) avoids a fatal
10 | // error when a vendor/ directory exists without a generated autoloader.
11 | $cwdAutoload = getcwd() . '/vendor/autoload.php';
12 | 
13 | if (is_file($cwdAutoload)) {
14 |     require_once $cwdAutoload;
15 |     $autoloaderLoaded = true;
16 | }
17 | 
18 | // Step 2: the first autoloader found relative to this script, trying
19 | // ../vendor and then ../../../../vendor. require_once prevents a second
20 | // inclusion when that file is the one already loaded in step 1.
21 | $packageVendors = array(
22 |     __DIR__ . '/../vendor',
23 |     __DIR__ . '/../../../../vendor',
24 | );
25 | 
26 | foreach ($packageVendors as $vendor) {
27 |     if (is_file($vendor . '/autoload.php')) {
28 |         require_once $vendor . '/autoload.php';
29 |         $autoloaderLoaded = true;
30 | 
31 |         break;
32 |     }
33 | }
34 | 
35 | // Failures are reported on STDERR with a non-zero exit status so that
36 | // calling scripts can detect them.
37 | if (!$autoloaderLoaded) {
38 |     fwrite(
39 |         STDERR,
40 |         'You must set up the project dependencies.' . PHP_EOL .
41 |         'To do that, run the following commands:' . PHP_EOL . PHP_EOL .
42 |         '$ curl -s http://getcomposer.org/installer | php' . PHP_EOL .
43 |         '$ php composer.phar install' . PHP_EOL
44 |     );
45 | 
46 |     exit(1);
47 | }
48 | 
49 | if (!class_exists('Symfony\Component\Console\Application')) {
50 |     fwrite(
51 |         STDERR,
52 |         'You must install the symfony/console package in order ' .
53 |         'to use the command-line tool.' . PHP_EOL
54 |     );
55 | 
56 |     exit(1);
57 | }
58 | 
59 | $app = new Dissect\Console\Application(DISSECT_VERSION);
60 | $app->run();
61 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Analysis/AutomatonTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis;
 4 | 
 5 | use PHPUnit_Framework_TestCase;
 6 | 
 7 | class AutomatonTest extends PHPUnit_Framework_TestCase
 8 | {
 9 |     protected $automaton;
10 | 
11 |     /**
12 |      * Every test starts from an automaton holding the empty states 0 and 1.
13 |      */
14 |     protected function setUp()
15 |     {
16 |         $automaton = new Automaton();
17 | 
18 |         foreach (array(0, 1) as $number) {
19 |             $automaton->addState(new State($number, array()));
20 |         }
21 | 
22 |         $this->automaton = $automaton;
23 |     }
24 | 
25 |     /**
26 |      * @test
27 |      */
28 |     public function addingATransitionShouldBeVisibleInTheTransitionTable()
29 |     {
30 |         $this->automaton->addTransition(0, 'a', 1);
31 | 
32 |         // The table is indexed by origin state, then by the transition label.
33 |         $transitions = $this->automaton->getTransitionTable();
34 | 
35 |         $this->assertEquals(1, $transitions[0]['a']);
36 |     }
37 | 
38 |     /**
39 |      * @test
40 |      */
41 |     public function aNewStateShouldBeIdentifiedByItsNumber()
42 |     {
43 |         $added = new State(2, array());
44 | 
45 |         $this->automaton->addState($added);
46 | 
47 |         $this->assertSame($added, $this->automaton->getState(2));
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/tests/Dissect/Lexer/RegexLexerTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | use Dissect\Parser\Parser;
 6 | use PHPUnit_Framework_TestCase;
 7 | 
 8 | class RegexLexerTest extends PHPUnit_Framework_TestCase
 9 | {
10 |     protected $lexer;
11 | 
12 |     protected function setUp()
13 |     {
14 |         $this->lexer = new StubRegexLexer();
15 |     }
16 | 
17 |     /**
18 |      * @test
19 |      */
20 |     public function itShouldCallGetTypeToRetrieveTokenType()
21 |     {
22 |         $tokens = $this->lexer->lex('5 + 6');
23 | 
24 |         // Four tokens are expected; the last one carries the EOF type.
25 |         $this->assertCount(4, $tokens);
26 | 
27 |         $expectedTypes = array(
28 |             0 => 'INT',
29 |             1 => '+',
30 |             3 => Parser::EOF_TOKEN_TYPE,
31 |         );
32 | 
33 |         foreach ($expectedTypes as $index => $type) {
34 |             $this->assertEquals($type, $tokens->get($index)->getType());
35 |         }
36 |     }
37 | 
38 |     /**
39 |      * @test
40 |      */
41 |     public function itShouldTrackLineNumbers()
42 |     {
43 |         $tokens = $this->lexer->lex("5\n+\n\n5");
44 | 
45 |         $plus = $tokens->get(1);
46 |         $secondFive = $tokens->get(2);
47 | 
48 |         $this->assertEquals(2, $plus->getLine());
49 |         $this->assertEquals(4, $secondFive->getLine());
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Dumper/AutomatonDumperTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Dumper;
 4 | 
 5 | use Dissect\Parser\LALR1\Analysis\Analyzer;
 6 | use PHPUnit_Framework_TestCase;
 7 | 
 8 | /**
 9 |  * Checks AutomatonDumper output against the stored Graphviz fixtures.
10 |  */
11 | class AutomatonDumperTest extends PHPUnit_Framework_TestCase
12 | {
13 |     protected $dumper;
14 | 
15 |     /**
16 |      * Builds a dumper for the automaton analyzed from ExampleGrammar.
17 |      */
18 |     protected function setUp()
19 |     {
20 |         $analyzer = new Analyzer();
21 |         $result = $analyzer->analyze(new ExampleGrammar());
22 | 
23 |         $this->dumper = new AutomatonDumper($result->getAutomaton());
24 |     }
25 | 
26 |     /**
27 |      * @test
28 |      */
29 |     public function dumpDumpsTheEntireAutomaton()
30 |     {
31 |         $expectedFile = __DIR__ . '/res/graphviz/automaton.dot';
32 |         $actual = $this->dumper->dump();
33 | 
34 |         $this->assertStringEqualsFile($expectedFile, $actual);
35 |     }
36 | 
37 |     /**
38 |      * @test
39 |      */
40 |     public function dumpStateDumpsOnlyTheSpecifiedStateAndTransitions()
41 |     {
42 |         $expectedFile = __DIR__ . '/res/graphviz/state.dot';
43 |         $actual = $this->dumper->dumpState(2);
44 | 
45 |         $this->assertStringEqualsFile($expectedFile, $actual);
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Analysis/KernelSet/KernelSetTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis\KernelSet;
 4 | 
 5 | use PHPUnit_Framework_TestCase;
 6 | 
 7 | /**
 8 |  * Tests kernel hashing and insertion in KernelSet.
 9 |  */
10 | class KernelSetTest extends PHPUnit_Framework_TestCase
11 | {
12 |     /**
13 |      * @test
14 |      */
15 |     public function kernelsShouldBeProperlyHashedAndOrdered()
16 |     {
17 |         $kernel = array(
18 |             array(2, 1),
19 |             array(1, 0),
20 |             array(2, 0),
21 |             array(3, 0),
22 |         );
23 | 
24 |         $this->assertEquals(array(1, 3, 6, 7), KernelSet::hashKernel($kernel));
25 |     }
26 | 
27 |     /**
28 |      * @test
29 |      */
30 |     public function insertShouldInsertANewNodeIfNoIdenticalKernelExists()
31 |     {
32 |         $set = new KernelSet();
33 | 
34 |         // Pairs of (kernel to insert, value insert() is expected to return);
35 |         // the last kernel repeats the first one.
36 |         $steps = array(
37 |             array(array(array(2, 1)), 0),
38 |             array(array(array(2, 2)), 1),
39 |             array(array(array(1, 1)), 2),
40 |             array(array(array(2, 1)), 0),
41 |         );
42 | 
43 |         foreach ($steps as $step) {
44 |             list($kernel, $expected) = $step;
45 |             $this->assertEquals($expected, $set->insert($kernel));
46 |         }
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/Exception/ConflictException.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis\Exception;
 4 | 
 5 | use Dissect\Parser\LALR1\Analysis\Automaton;
 6 | use LogicException;
 7 | 
 8 | /**
 9 |  * Base class for the exceptions thrown when an inadequate
10 |  * state is encountered during parse table construction.
11 |  *
12 |  * @author Jakub Lédl <jakubledl@gmail.com>
13 |  */
14 | class ConflictException extends LogicException
15 | {
16 |     protected $state;
17 |     protected $automaton;
18 | 
19 |     /**
20 |      * Constructor.
21 |      *
22 |      * @param string $message The exception message.
23 |      * @param int $state The number of the inadequate state.
24 |      * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton The faulty automaton.
25 |      */
26 |     public function __construct($message, $state, Automaton $automaton)
27 |     {
28 |         $this->automaton = $automaton;
29 |         $this->state = $state;
30 | 
31 |         parent::__construct($message);
32 |     }
33 | 
34 |     /**
35 |      * Gets the number of the inadequate state.
36 |      *
37 |      * @return int
38 |      */
39 |     public function getStateNumber()
40 |     {
41 |         return $this->state;
42 |     }
43 | 
44 |     /**
45 |      * Gets the faulty automaton passed to the constructor.
46 |      *
47 |      * @return \Dissect\Parser\LALR1\Analysis\Automaton
48 |      */
49 |     public function getAutomaton()
50 |     {
51 |         return $this->automaton;
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/UNLICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author
 9 | of this software dedicates any and all copyright interest in the
10 | software to the public domain. I make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. I intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org/>
25 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/CommonToken.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | /**
 6 |  * A basic Token implementation that stores the three values
 7 |  * handed to its constructor and returns them unchanged.
 8 |  *
 9 |  * @author Jakub Lédl <jakubledl@gmail.com>
10 |  */
11 | class CommonToken implements Token
12 | {
13 |     /**
14 |      * @var mixed
15 |      */
16 |     protected $type;
17 | 
18 |     /**
19 |      * @var string
20 |      */
21 |     protected $value;
22 | 
23 |     /**
24 |      * @var int
25 |      */
26 |     protected $line;
27 | 
28 |     /**
29 |      * Creates a token.
30 |      *
31 |      * @param mixed $type The token type.
32 |      * @param string $value The value of the token.
33 |      * @param int $line The line associated with the token.
34 |      */
35 |     public function __construct($type, $value, $line)
36 |     {
37 |         $this->line = $line;
38 |         $this->value = $value;
39 |         $this->type = $type;
40 |     }
41 | 
42 |     /**
43 |      * {@inheritDoc}
44 |      */
45 |     public function getType()
46 |     {
47 |         return $this->type;
48 |     }
49 | 
50 |     /**
51 |      * {@inheritDoc}
52 |      */
53 |     public function getValue()
54 |     {
55 |         return $this->value;
56 |     }
57 | 
58 |     /**
59 |      * {@inheritDoc}
60 |      */
61 |     public function getLine()
62 |     {
63 |         return $this->line;
64 |     }
65 | }
66 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Dumper/res/table/debug.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | return array(
 4 |     'action' => array(
 5 |         0 => array(
 6 |             // on a shift and go to state 2
 7 |             'a' => 2,
 8 | 
 9 |             // on $eof reduce by rule S -> /* empty */
10 |             '$eof' => -2,
11 | 
12 |         ),
13 | 
14 |         1 => array(
15 |             // on $eof accept the input
16 |             '$eof' => 0,
17 | 
18 |         ),
19 | 
20 |         2 => array(
21 |             // on a shift and go to state 2
22 |             'a' => 2,
23 | 
24 |             // on b reduce by rule S -> /* empty */
25 |             'b' => -2,
26 | 
27 |         ),
28 | 
29 |         3 => array(
30 |             // on b shift and go to state 4
31 |             'b' => 4,
32 | 
33 |         ),
34 | 
35 |         4 => array(
36 |             // on $eof reduce by rule S -> a S b
37 |             '$eof' => -1,
38 | 
39 |             // on b reduce by rule S -> a S b
40 |             'b' => -1,
41 | 
42 |         ),
43 | 
44 |     ),
45 | 
46 |     'goto' => array(
47 |         0 => array(
48 |             // on S go to state 1
49 |             'S' => 1,
50 | 
51 |         ),
52 | 
53 |         2 => array(
54 |             // on S go to state 3
55 |             'S' => 3,
56 | 
57 |         ),
58 | 
59 |     ),
60 | );
61 | 


--------------------------------------------------------------------------------
/tests/Dissect/Lexer/Recognizer/RegexRecognizerTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer\Recognizer;
 4 | 
 5 | use PHPUnit_Framework_TestCase;
 6 | 
 7 | /**
 8 |  * Tests RegexRecognizer::match() with the pattern /[a-z]+/.
 9 |  */
10 | class RegexRecognizerTest extends PHPUnit_Framework_TestCase
11 | {
12 |     /**
13 |      * @test
14 |      */
15 |     public function recognizerShouldMatchAndPassTheValueByReference()
16 |     {
17 |         $recognizer = new RegexRecognizer('/[a-z]+/');
18 |         $matched = $recognizer->match('lorem ipsum', $captured);
19 | 
20 |         $this->assertTrue($matched);
21 |         $this->assertNotNull($captured);
22 |         $this->assertEquals('lorem', $captured);
23 |     }
24 | 
25 |     /**
26 |      * @test
27 |      */
28 |     public function recognizerShouldFailAndTheValueShouldStayNull()
29 |     {
30 |         $recognizer = new RegexRecognizer('/[a-z]+/');
31 |         $matched = $recognizer->match('123 456', $captured);
32 | 
33 |         $this->assertFalse($matched);
34 |         $this->assertNull($captured);
35 |     }
36 | 
37 |     /**
38 |      * @test
39 |      */
40 |     public function recognizerShouldFailIfTheMatchIsNotAtTheBeginningOfTheString()
41 |     {
42 |         // The letters only start after the leading digits and the space.
43 |         $recognizer = new RegexRecognizer('/[a-z]+/');
44 |         $matched = $recognizer->match('234 class', $captured);
45 | 
46 |         $this->assertFalse($matched);
47 |         $this->assertNull($captured);
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/GrammarTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser;
 4 | 
 5 | use PHPUnit_Framework_TestCase;
 6 | 
 7 | /**
 8 |  * Tests the Grammar API through the ExampleGrammar fixture.
 9 |  */
10 | class GrammarTest extends PHPUnit_Framework_TestCase
11 | {
12 |     protected $grammar;
13 | 
14 |     /**
15 |      * Creates a fresh grammar for every test.
16 |      */
17 |     protected function setUp()
18 |     {
19 |         $this->grammar = new ExampleGrammar();
20 |     }
21 | 
22 |     /**
23 |      * @test
24 |      */
25 |     public function ruleAlternativesShouldHaveTheSameName()
26 |     {
27 |         $rules = $this->grammar->getRules();
28 | 
29 |         // Rules 1 and 2 are both expected to be named Foo.
30 |         foreach (array(1, 2) as $number) {
31 |             $this->assertEquals('Foo', $rules[$number]->getName());
32 |         }
33 |     }
34 | 
35 |     /**
36 |      * @test
37 |      */
38 |     public function theGrammarShouldBeAugmentedWithAStartRule()
39 |     {
40 |         $name = $this->grammar->getStartRule()->getName();
41 |         $this->assertEquals(Grammar::START_RULE_NAME, $name);
42 | 
43 |         $components = $this->grammar->getStartRule()->getComponents();
44 |         $this->assertEquals(array('Foo'), $components);
45 |     }
46 | 
47 |     /**
48 |      * @test
49 |      */
50 |     public function shouldReturnAlternativesGroupedByName()
51 |     {
52 |         $grouped = $this->grammar->getGroupedRules();
53 | 
54 |         $this->assertCount(2, $grouped['Foo']);
55 |     }
56 | 
57 |     /**
58 |      * @test
59 |      */
60 |     public function nonterminalsShouldBeDetectedFromRuleNames()
61 |     {
62 |         $known = $this->grammar->hasNonterminal('Foo');
63 | 
64 |         $this->assertTrue($known);
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/src/Dissect/Console/Application.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Console;
 4 | 
 5 | use Symfony\Component\Console\Application as BaseApplication;
 6 | use Symfony\Component\Console\Input\InputDefinition;
 7 | use Symfony\Component\Console\Input\InputInterface;
 8 | use Symfony\Component\Console\Input\InputOption;
 9 | 
10 | /**
11 |  * The console application behind the Dissect CLI.
12 |  *
13 |  * @author Jakub Lédl <jakubledl@gmail.com>
14 |  */
15 | class Application extends BaseApplication
16 | {
17 |     // credit goes to everzet & kostiklv, since
18 |     // I copied the BehatApplication class when
19 |     // dealing with some CLI problems.
20 | 
21 |     /**
22 |      * @param string $version The version passed on to the base application.
23 |      */
24 |     public function __construct($version)
25 |     {
26 |         parent::__construct('Dissect', $version);
27 |     }
28 | 
29 |     /**
30 |      * Always yields the name 'dissect', whatever the input is.
31 |      */
32 |     protected function getCommandName(InputInterface $input)
33 |     {
34 |         return 'dissect';
35 |     }
36 | 
37 |     /**
38 |      * Appends a DissectCommand to the parent's default commands.
39 |      */
40 |     protected function getDefaultCommands()
41 |     {
42 |         $commands = parent::getDefaultCommands();
43 |         $commands[] = new Command\DissectCommand();
44 | 
45 |         return $commands;
46 |     }
47 | 
48 |     /**
49 |      * Returns a definition holding the help, verbose and version options.
50 |      */
51 |     public function getDefinition()
52 |     {
53 |         $options = array(
54 |             new InputOption('--help',    '-h', InputOption::VALUE_NONE, 'Display this help message.'),
55 |             new InputOption('--verbose', '-v', InputOption::VALUE_NONE, 'Increase verbosity of exceptions.'),
56 |             new InputOption('--version', '-V', InputOption::VALUE_NONE, 'Display version information.'),
57 |         );
58 | 
59 |         return new InputDefinition($options);
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/ArithGrammar.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1;
 4 | 
 5 | use Dissect\Parser\Grammar;
 6 | 
 7 | /**
 8 |  * Test grammar for integer arithmetic. Each rule's callback
 9 |  * directly computes the value of the expression it matched.
10 |  */
11 | class ArithGrammar extends Grammar
12 | {
13 |     /**
14 |      * Declares the Expr rule alternatives, the operator
15 |      * associativity/precedence levels and the start rule.
16 |      */
17 |     public function __construct()
18 |     {
19 |         // Callback parameters named $_ are symbols the callback ignores.
20 |         $this('Expr')
21 |             ->is('Expr', '+', 'Expr')
22 |             ->call(function ($l, $_, $r) {
23 |                 return $l + $r;
24 |             })
25 | 
26 |             ->is('Expr', '-', 'Expr')
27 |             ->call(function ($l, $_, $r) {
28 |                 return $l - $r;
29 |             })
30 | 
31 |             ->is('Expr', '*', 'Expr')
32 |             ->call(function ($l, $_, $r) {
33 |                 return $l * $r;
34 |             })
35 | 
36 |             ->is('Expr', '/', 'Expr')
37 |             ->call(function ($l, $_, $r) {
38 |                 return $l / $r;
39 |             })
40 | 
41 |             ->is('Expr', '**', 'Expr')
42 |             ->call(function ($l, $_, $r) {
43 |                 return pow($l, $r);
44 |             })
45 | 
46 |             // Both parentheses are ignored, but they need distinct names:
47 |             // PHP 7+ refuses to compile a function that declares the
48 |             // same parameter name twice.
49 |             ->is('(', 'Expr', ')')
50 |             ->call(function ($open, $e, $close) {
51 |                 return $e;
52 |             })
53 | 
54 |             // Unary minus gets precedence level 4, above every operator
55 |             // level declared below.
56 |             ->is('-', 'Expr')->prec(4)
57 |             ->call(function ($_, $e) {
58 |                 return -$e;
59 |             })
60 | 
61 |             ->is('INT')
62 |             ->call(function ($i) {
63 |                 return (int)$i->getValue();
64 |             });
65 | 
66 |         $this->operators('+', '-')->left()->prec(1);
67 |         $this->operators('*', '/')->left()->prec(2);
68 |         $this->operators('**')->right()->prec(3);
69 | 
70 |         $this->start('Expr');
71 |     }
72 | }
73 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Dumper/StringWriter.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Dumper;
 4 | 
 5 | /**
 6 |  * Accumulates text in memory and keeps an indentation
 7 |  * level that is applied when whole lines are written.
 8 |  *
 9 |  * @author Jakub Lédl <jakubledl@gmail.com>
10 |  */
11 | class StringWriter
12 | {
13 |     protected $indent = 0;
14 |     protected $string = '';
15 | 
16 |     /**
17 |      * Appends the given string verbatim: no
18 |      * indentation and no newline are added.
19 |      *
20 |      * @param string $string The string to write.
21 |      */
22 |     public function write($string)
23 |     {
24 |         $this->string .= $string;
25 |     }
26 | 
27 |     /**
28 |      * Gets the string as written so far.
29 |      *
30 |      * @return string The string.
31 |      */
32 |     public function get()
33 |     {
34 |         return $this->string;
35 |     }
36 | 
37 |     /**
38 |      * Adds a level of indentation.
39 |      */
40 |     public function indent()
41 |     {
42 |         $this->indent++;
43 |     }
44 | 
45 |     /**
46 |      * Removes a level of indentation.
47 |      */
48 |     public function outdent()
49 |     {
50 |         $this->indent--;
51 |     }
52 | 
53 |     /**
54 |      * Writes a single line.
55 |      *
56 |      * A non-empty string is written with four spaces per
57 |      * indentation level in front of it and a newline appended.
58 |      * When no string (or an empty one) is given, only a newline
59 |      * is written, so that empty lines stay whitespace-free
60 |      * (like vim).
61 |      *
62 |      * @param string $string The string to write.
63 |      */
64 |     public function writeLine($string = null)
65 |     {
66 |         // Compare against the empty string instead of relying on
67 |         // truthiness: '0' is falsy in PHP but is still a line to write.
68 |         if ((string) $string === '') {
69 |             $this->write("\n");
70 | 
71 |             return;
72 |         }
73 | 
74 |         // More outdent() than indent() calls make the level negative;
75 |         // clamp it here because str_repeat() rejects negative counts.
76 |         $padding = str_repeat(' ', max(0, $this->indent) * 4);
77 | 
78 |         $this->write(sprintf("%s%s\n", $padding, $string));
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/ParserTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1;
 4 | 
 5 | use Dissect\Parser\Exception\UnexpectedTokenException;
 6 | use PHPUnit_Framework_TestCase;
 7 | 
 8 | /**
 9 |  * Runs the LALR(1) Parser against ArithGrammar and ArithLexer.
10 |  */
11 | class ParserTest extends PHPUnit_Framework_TestCase
12 | {
13 |     protected $lexer;
14 |     protected $parser;
15 | 
16 |     /**
17 |      * Creates a fresh lexer and parser for every test.
18 |      */
19 |     protected function setUp()
20 |     {
21 |         $this->lexer = new ArithLexer();
22 |         $this->parser = new Parser(new ArithGrammar());
23 |     }
24 | 
25 |     /**
26 |      * @test
27 |      */
28 |     public function parserShouldProcessTheTokenStreamAndUseGrammarCallbacksForReductions()
29 |     {
30 |         // Input expression => value the parser is expected to return.
31 |         $cases = array(
32 |             '-1 - 1' => -2,
33 |             '6 ** (1 + 1) ** 2 * (5 + 4)' => 11664,
34 |             '3 - 5 - 2' => -4,
35 |             '4 ** 3 ** 2' => 262144,
36 |         );
37 | 
38 |         foreach ($cases as $input => $expected) {
39 |             $stream = $this->lexer->lex($input);
40 | 
41 |             $this->assertEquals($expected, $this->parser->parse($stream));
42 |         }
43 |     }
44 | 
45 |     /**
46 |      * @test
47 |      */
48 |     public function parserShouldThrowAnExceptionOnInvalidInput()
49 |     {
50 |         $expectedTypes = array('$eof', '+', '-', '*', '/', '**', ')');
51 |         $expectedMessage = <<<EOT
52 | Unexpected 3 (INT) at line 1.
53 | 
54 | Expected one of \$eof, +, -, *, /, **, ).
55 | EOT;
56 | 
57 |         try {
58 |             $this->parser->parse($this->lexer->lex('6 ** 5 3'));
59 |             $this->fail('Expected an UnexpectedTokenException.');
60 |         } catch (UnexpectedTokenException $e) {
61 |             $this->assertEquals('INT', $e->getToken()->getType());
62 |             $this->assertEquals($expectedTypes, $e->getExpected());
63 |             $this->assertEquals($expectedMessage, $e->getMessage());
64 |         }
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/AnalysisResult.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis;
 4 | 
 5 | /**
 6 |  * Bundles what a grammar analysis produces: the parse table,
 7 |  * the automaton and the conflicts resolved along the way.
 8 |  *
 9 |  * @author Jakub Lédl <jakubledl@gmail.com>
10 |  */
11 | class AnalysisResult
12 | {
13 |     /**
14 |      * @var \Dissect\Parser\LALR1\Analysis\Automaton
15 |      */
16 |     protected $automaton;
17 | 
18 |     /**
19 |      * @var array
20 |      */
21 |     protected $parseTable;
22 | 
23 |     /**
24 |      * @var array
25 |      */
26 |     protected $resolvedConflicts;
27 | 
28 |     /**
29 |      * Constructor.
30 |      *
31 |      * @param array $parseTable The parse table.
32 |      * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton The automaton.
33 |      * @param array $conflicts The conflicts that were resolved
34 |      * while the parse table was being constructed.
35 |      */
36 |     public function __construct(array $parseTable, Automaton $automaton, array $conflicts)
37 |     {
38 |         $this->resolvedConflicts = $conflicts;
39 |         $this->automaton = $automaton;
40 |         $this->parseTable = $parseTable;
41 |     }
42 | 
43 |     /**
44 |      * Gets the handle-finding FSA.
45 |      *
46 |      * @return \Dissect\Parser\LALR1\Analysis\Automaton
47 |      */
48 |     public function getAutomaton()
49 |     {
50 |         return $this->automaton;
51 |     }
52 | 
53 |     /**
54 |      * Gets the parse table.
55 |      *
56 |      * @return array The parse table.
57 |      */
58 |     public function getParseTable()
59 |     {
60 |         return $this->parseTable;
61 |     }
62 | 
63 |     /**
64 |      * Gets the conflicts resolved during parse table construction.
65 |      *
66 |      * @return array The conflicts.
67 |      */
68 |     public function getResolvedConflicts()
69 |     {
70 |         return $this->resolvedConflicts;
71 |     }
72 | }
73 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/Exception/UnexpectedTokenException.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\Exception;
 4 | 
 5 | use Dissect\Lexer\Token;
 6 | use RuntimeException;
 7 | 
 8 | /**
 9 |  * Signals that the parser met a token it did not expect.
10 |  *
11 |  * @author Jakub Lédl <jakubledl@gmail.com>
12 |  */
13 | class UnexpectedTokenException extends RuntimeException
14 | {
15 |     const MESSAGE = <<<EOT
16 | Unexpected %s at line %d.
17 | 
18 | Expected one of %s.
19 | EOT;
20 | 
21 |     /**
22 |      * @var \Dissect\Lexer\Token
23 |      */
24 |     protected $token;
25 | 
26 |     /**
27 |      * @var string[]
28 |      */
29 |     protected $expected;
30 | 
31 |     /**
32 |      * Builds the exception message from the token and the expected types.
33 |      *
34 |      * @param \Dissect\Lexer\Token $token The unexpected token.
35 |      * @param string[] $expected The expected token types.
36 |      */
37 |     public function __construct(Token $token, array $expected)
38 |     {
39 |         $this->token = $token;
40 |         $this->expected = $expected;
41 | 
42 |         $value = $token->getValue();
43 |         $type = $token->getType();
44 | 
45 |         // Show "value (type)" unless the value is identical to the type,
46 |         // in which case the type alone is shown.
47 |         $info = $value === $type ? $type : $value . ' (' . $type . ')';
48 |         $expectedList = implode(', ', $expected);
49 | 
50 |         parent::__construct(sprintf(self::MESSAGE, $info, $token->getLine(), $expectedList));
51 |     }
52 | 
53 |     /**
54 |      * Gets the token that caused this exception.
55 |      *
56 |      * @return \Dissect\Lexer\Token The unexpected token.
57 |      */
58 |     public function getToken()
59 |     {
60 |         return $this->token;
61 |     }
62 | 
63 |     /**
64 |      * Gets the token types that were expected instead.
65 |      *
66 |      * @return string[] The expected token types.
67 |      */
68 |     public function getExpected()
69 |     {
70 |         return $this->expected;
71 |     }
72 | }
73 | 


--------------------------------------------------------------------------------
/tests/Dissect/Lexer/SimpleLexerTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | use Dissect\Lexer\Recognizer\RegexRecognizer;
 6 | use Dissect\Lexer\Recognizer\SimpleRecognizer;
 7 | use PHPUnit_Framework_TestCase;
 8 | 
 9 | /**
10 |  * Tests SimpleLexer with a handful of literal tokens and skipped whitespace.
11 |  */
12 | class SimpleLexerTest extends PHPUnit_Framework_TestCase
13 | {
14 |     protected $lexer;
15 | 
16 |     /**
17 |      * Configures a lexer with the tokens A, (, B, ), C and a skipped WS token.
18 |      */
19 |     public function setUp()
20 |     {
21 |         $lexer = new SimpleLexer();
22 | 
23 |         $lexer
24 |             ->token('A', 'a')
25 |             ->token('(')
26 |             ->token('B', 'b')
27 |             ->token(')')
28 |             ->token('C', 'c')
29 |             ->regex('WS', "/[ \n\t\r]+/")
30 | 
31 |             ->skip('WS');
32 | 
33 |         $this->lexer = $lexer;
34 |     }
35 | 
36 |     /**
37 |      * @test
38 |      */
39 |     public function simpleLexerShouldWalkThroughTheRecognizers()
40 |     {
41 |         $tokens = $this->lexer->lex('a (b) c');
42 | 
43 |         $this->assertEquals(6, $tokens->count()); // with EOF
44 |         $this->assertEquals('(', $tokens->get(1)->getType());
45 |         $this->assertEquals(1, $tokens->get(3)->getLine());
46 |         $this->assertEquals('C', $tokens->get(4)->getType());
47 |     }
48 | 
49 |     /**
50 |      * @test
51 |      */
52 |     public function simpleLexerShouldSkipSpecifiedTokens()
53 |     {
54 |         $tokens = $this->lexer->lex('a (b) c');
55 | 
56 |         // No token of the skipped WS type may show up in the stream.
57 |         foreach ($tokens as $current) {
58 |             $this->assertNotEquals('WS', $current->getType());
59 |         }
60 |     }
61 | 
62 |     /**
63 |      * @test
64 |      */
65 |     public function simpleLexerShouldReturnTheBestMatch()
66 |     {
67 |         $this->lexer->token('CLASS', 'class');
68 |         $this->lexer->regex('WORD', '/[a-z]+/');
69 | 
70 |         $tokens = $this->lexer->lex('class classloremipsum');
71 | 
72 |         $first = $tokens->getCurrentToken();
73 |         $this->assertEquals('CLASS', $first->getType());
74 | 
75 |         $second = $tokens->lookAhead(1);
76 |         $this->assertEquals('WORD', $second->getType());
77 |     }
78 | }
79 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | Welcome to Dissect!
 2 | ===================
 3 | 
 4 | Dissect is a set of tools for lexical and syntactical analysis
 5 | written in pure PHP.
 6 | 
 7 | This guide assumes that you're already familiar with basic concepts
 8 | of parsing. Explaining them is beyond the scope of this simple guide,
 9 | so if you're not, see, for example, [this article][parsing].
10 | This page serves as an index for individual documentation pages.
11 | 
12 | 1. [Lexical analysis with Dissect](lexing.md)
13 |     1. [SimpleLexer](lexing.md#simplelexer)
14 |     2. [StatefulLexer](lexing.md#statefullexer)
15 |     3. [Improving lexer performance](lexing.md#improving-lexer-performance)
16 |     4. [RegexLexer](lexing.md#regexlexer)
17 | 2. [Parsing with Dissect](parsing.md)
18 |     1. [Why an LALR(1) parser?](parsing.md#why-an-lalr1-parser)
19 |     2. [Writing a grammar](parsing.md#writing-a-grammar)
20 |     3. [Example: Parsing mathematical expressions](parsing.md#example-parsing-mathematical-expressions)
21 |     4. [Invalid input](parsing.md#invalid-input)
22 |     5. [Precomputing the parse table](parsing.md#precomputing-the-parse-table)
23 |     6. [Resolving conflicts](parsing.md#resolving-conflicts)
24 | 3. [Building an AST](ast.md)
25 |     1. [Traversing the AST](ast.md#traversing-the-ast)
26 | 4. [Describing common syntactic structures](common.md)
27 |     1. [List of 1 or more `Foo`s](common.md#list-of-1-or-more-foos)
28 |     2. [List of 0 or more `Foo`s](common.md#list-of-0-or-more-foos)
29 |     3. [A comma separated list](common.md#a-comma-separated-list)
30 |     4. [Expressions](common.md#expressions)
31 | 5. [The command-line interface](cli.md)
32 |     1. [Running the tool](cli.md#running-the-tool)
33 |     2. [Dumping the parse table in the debug format](cli.md#dumping-the-parse-table-in-the-debug-format)
34 |     3. [Dumping the handle-finding automaton](cli.md#dumping-the-handle-finding-automaton)
35 | 
36 | [parsing]: http://en.wikipedia.org/wiki/Parsing
37 | 


--------------------------------------------------------------------------------
/src/Dissect/Util/Util.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Util;
 4 | 
 5 | /**
 6 |  * Some utility functions.
 7 |  *
 8 |  * @author Jakub Lédl <jakubledl@gmail.com>
 9 |  */
10 | abstract class Util
11 | {
12 |     /**
13 |      * Merges any number of sets by values.
14 |      *
15 |      * {a, b} union {b, c} = {a, b, c}
16 |      *
17 |      * The keys of the result are those left over by array_unique()
18 |      * after the merge, so they are not necessarily consecutive.
19 |      *
20 |      * @return array The union of given sets; empty when no set is given.
21 |      */
22 |     public static function union()
23 |     {
24 |         $sets = func_get_args();
25 | 
26 |         // array_merge() without arguments is an error on older PHP
27 |         // versions, and the union of nothing is the empty set anyway.
28 |         if (count($sets) === 0) {
29 |             return array();
30 |         }
31 | 
32 |         return array_unique(call_user_func_array('array_merge', $sets));
33 |     }
34 | 
35 |     /**
36 |      * Determines whether the first set contains a value
37 |      * that is missing from the second one.
38 |      *
39 |      * The check is one-directional: values present only
40 |      * in $second do not count as a difference.
41 |      *
42 |      * @param array $first The first set.
43 |      * @param array $second The second set.
44 |      *
45 |      * @return boolean Whether there is a difference.
46 |      */
47 |     public static function different(array $first, array $second)
48 |     {
49 |         return count(array_diff($first, $second)) !== 0;
50 |     }
51 | 
52 |     /**
53 |      * Determines length of a UTF-8 string.
54 |      *
55 |      * @param string $str The string in UTF-8 encoding.
56 |      *
57 |      * @return int The length.
58 |      */
59 |     public static function stringLength($str)
60 |     {
61 |         // Prefer mbstring, as substring() does; utf8_decode() is
62 |         // deprecated as of PHP 8.2 and only serves as a fallback.
63 |         if (function_exists('mb_strlen')) {
64 |             return mb_strlen($str, 'UTF-8');
65 |         }
66 | 
67 |         return strlen(utf8_decode($str));
68 |     }
69 | 
70 |     /**
71 |      * Extracts a substring of a UTF-8 string.
72 |      *
73 |      * @param string $str The string to extract the substring from.
74 |      * @param int $position The position from which to start extracting.
75 |      * @param int $length The length of the substring. When omitted,
76 |      * everything from $position to the end of the string is returned.
77 |      *
78 |      * @return string The substring.
79 |      */
80 |     public static function substring($str, $position, $length = null)
81 |     {
82 |         // Resolved once and remembered for subsequent calls.
83 |         static $substrFunc = null;
84 | 
85 |         if ($substrFunc === null) {
86 |             $substrFunc = function_exists('mb_substr') ? 'mb_substr' : 'iconv_substr';
87 |         }
88 | 
89 |         // The full string length always reaches the end of the string,
90 |         // whatever the starting position is.
91 |         if ($length === null) {
92 |             $length = self::stringLength($str);
93 |         }
94 | 
95 |         return $substrFunc($str, $position, $length, 'UTF-8');
96 |     }
97 | }
98 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/State.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis;
 4 | 
 5 | /**
 6 |  * One state of a handle-finding FSA: a numbered collection of items.
 7 |  *
 8 |  * @author Jakub Lédl <jakubledl@gmail.com>
 9 |  */
10 | class State
11 | {
12 |     /**
13 |      * @var array
14 |      */
15 |     protected $items = array();
16 | 
17 |     /**
18 |      * @var array
19 |      */
20 |     protected $itemMap = array();
21 | 
22 |     /**
23 |      * @var int
24 |      */
25 |     protected $number;
26 | 
27 |     /**
28 |      * Constructor.
29 |      *
30 |      * @param int $number The number identifying this state.
31 |      * @param array $items The items this state starts out with.
32 |      */
33 |     public function __construct($number, array $items)
34 |     {
35 |         $this->number = $number;
36 | 
37 |         foreach ($items as $initial) {
38 |             $this->add($initial);
39 |         }
40 |     }
41 | 
42 |     /**
43 |      * Appends an item to this state and indexes it
44 |      * by its rule number and dot index.
45 |      *
46 |      * @param \Dissect\Parser\LALR1\Analysis\Item $item The new item.
47 |      */
48 |     public function add(Item $item)
49 |     {
50 |         $this->items[] = $item;
51 | 
52 |         $ruleNumber = $item->getRule()->getNumber();
53 |         $dotIndex = $item->getDotIndex();
54 | 
55 |         $this->itemMap[$ruleNumber][$dotIndex] = $item;
56 |     }
57 | 
58 |     /**
59 |      * Looks an item up by its rule number and dot index.
60 |      *
61 |      * @param int $ruleNumber The number of the rule of the desired item.
62 |      * @param int $dotIndex The dot index of the desired item.
63 |      *
64 |      * @return \Dissect\Parser\LALR1\Analysis\Item The item.
65 |      */
66 |     public function get($ruleNumber, $dotIndex)
67 |     {
68 |         return $this->itemMap[$ruleNumber][$dotIndex];
69 |     }
70 | 
71 |     /**
72 |      * Gets the number identifying this state.
73 |      *
74 |      * @return int
75 |      */
76 |     public function getNumber()
77 |     {
78 |         return $this->number;
79 |     }
80 | 
81 |     /**
82 |      * Gets the items of this state in the order they were added.
83 |      *
84 |      * @return array The items.
85 |      */
86 |     public function getItems()
87 |     {
88 |         return $this->items;
89 |     }
90 | }
91 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/TokenStream/TokenStream.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer\TokenStream;
 4 | 
 5 | use Countable;
 6 | use IteratorAggregate;
 7 | 
 8 | /**
 9 |  * A common contract for all token stream classes.
10 |  *
11 |  * @author Jakub Lédl <jakubledl@gmail.com>
12 |  */
13 | interface TokenStream extends Countable, IteratorAggregate
14 | {
15 |     /**
16 |      * Returns the current position in the stream.
17 |      *
18 |      * @return int The current position in the stream.
19 |      */
20 |     public function getPosition();
21 | 
22 |     /**
23 |      * Retrieves the token at the current position.
24 |      *
25 |      * @return \Dissect\Lexer\Token The current token.
26 |      */
27 |     public function getCurrentToken();
28 | 
29 |     /**
30 |      * Returns the token $n positions away from the current one.
31 |      * Negative values are allowed and serve as look-behind.
32 |      *
33 |      * @param int $n The offset relative to the current position.
34 |      *
35 |      * @throws \OutOfBoundsException If current position + $n is out of range.
36 |      *
37 |      * @return \Dissect\Lexer\Token The lookahead token.
38 |      */
39 |     public function lookAhead($n);
40 | 
41 |     /**
42 |      * Returns the token at absolute position $n.
43 |      *
44 |      * @param int $n The absolute position.
45 |      *
46 |      * @throws \OutOfBoundsException If $n is out of range.
47 |      *
48 |      * @return \Dissect\Lexer\Token The token at position $n.
49 |      */
50 |     public function get($n);
51 | 
52 |     /**
53 |      * Moves the cursor to the absolute position $n.
54 |      *
55 |      * @param int $n The absolute position.
56 |      *
57 |      * @throws \OutOfBoundsException If $n is out of range.
58 |      */
59 |     public function move($n);
60 | 
61 |     /**
62 |      * Moves the cursor by $n, relative to the current position.
63 |      *
64 |      * @param int $n The offset to move the cursor by.
65 |      *
66 |      * @throws \OutOfBoundsException If current position + $n is out of range.
67 |      */
68 |     public function seek($n);
69 | 
70 |     /**
71 |      * Moves the cursor to the next token.
72 |      *
73 |      * @throws \OutOfBoundsException If at the end of the stream.
74 |      */
75 |     public function next();
76 | }
77 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/Automaton.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis;
 4 | 
 5 | /**
 6 |  * A finite-state automaton for recognizing
 7 |  * grammar productions.
 8 |  *
 9 |  * @author Jakub Lédl <jakubledl@gmail.com>
10 |  */
11 | class Automaton
12 | {
13 |     /**
14 |      * @var array The states keyed by their number.
15 |      */
16 |     protected $states = array();
17 | 
18 |     /**
19 |      * @var array Destination state numbers keyed by origin state, then by label.
20 |      */
21 |     protected $transitionTable = array();
22 | 
23 |     /**
24 |      * Registers a state under its own number.
25 |      *
26 |      * @param \Dissect\Parser\LALR1\Analysis\State $state The new state.
27 |      */
28 |     public function addState(State $state)
29 |     {
30 |         $number = $state->getNumber();
31 | 
32 |         $this->states[$number] = $state;
33 |     }
34 | 
35 |     /**
36 |      * Records that $label leads from state $origin to state $dest.
37 |      *
38 |      * @param int $origin The number of the origin state.
39 |      * @param string $label The symbol that triggers this transition.
40 |      * @param int $dest The destination state number.
41 |      */
42 |     public function addTransition($origin, $label, $dest)
43 |     {
44 |         $this->transitionTable[$origin][$label] = $dest;
45 |     }
46 | 
47 |     /**
48 |      * Returns a state by its number.
49 |      *
50 |      * @param int $number The state number.
51 |      *
52 |      * @return \Dissect\Parser\LALR1\Analysis\State The requested state.
53 |      */
54 |     public function getState($number)
55 |     {
56 |         return $this->states[$number];
57 |     }
58 | 
59 |     /**
60 |      * Tells whether a state identified by $number has been added.
61 |      *
62 |      * @param int $number The state number.
63 |      *
64 |      * @return boolean
65 |      */
66 |     public function hasState($number)
67 |     {
68 |         return isset($this->states[$number]);
69 |     }
70 | 
71 |     /**
72 |      * @return array All states of this FSA, keyed by state number.
73 |      */
74 |     public function getStates()
75 |     {
76 |         return $this->states;
77 |     }
78 | 
79 |     /**
80 |      * @return array The transition table of this automaton.
81 |      */
82 |     public function getTransitionTable()
83 |     {
84 |         return $this->transitionTable;
85 |     }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/RegexLexer.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | use Dissect\Lexer\TokenStream\ArrayTokenStream;
 6 | use Dissect\Parser\Parser;
 7 | 
 8 | /**
 9 |  * Fast regex lexer, adapted from Doctrine.
10 |  *
11 |  * @author Guilherme Blanco <guilhermeblanco@hotmail.com>
12 |  * @author Jonathan Wage <jonwage@gmail.com>
13 |  * @author Roman Borschel <roman@code-factory.org>
14 |  * @author Jakub Lédl <jakubledl@gmail.com>
15 |  */
16 | abstract class RegexLexer implements Lexer
17 | {
18 |     /**
19 |      * The combined pattern, compiled lazily on the first call to lex().
20 |      *
21 |      * @var string|null
22 |      */
23 |     private $regex = null;
24 | 
25 |     /**
26 |      * {@inheritDoc}
27 |      */
28 |     public function lex($string)
29 |     {
30 |         // Cached per instance: a function-level static would be shared by
31 |         // other lexers running this inherited method and would leak one
32 |         // lexer's patterns into another.
33 |         if ($this->regex === null) {
34 |             $this->regex = '/(' . implode(')|(', $this->getCatchablePatterns()) . ')|'
35 |                 . implode('|', $this->getNonCatchablePatterns()) . '/i';
36 |         }
37 | 
38 |         // normalize line endings
39 |         $string = strtr($string, array("\r\n" => "\n", "\r" => "\n"));
40 | 
41 |         $flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
42 |         $matches = preg_split($this->regex, $string, -1, $flags);
43 |         $tokens = array();
44 |         $line = 1;
45 |         $oldPosition = 0;
46 | 
47 |         foreach ($matches as $match) {
48 |             list ($value, $position) = $match;
49 | 
50 |             $type = $this->getType($value);
51 | 
52 |             // count the newlines between the previous piece and this one
53 |             if ($position > 0) {
54 |                 $line += substr_count($string, "\n", $oldPosition, $position - $oldPosition);
55 |             }
56 | 
57 |             $oldPosition = $position;
58 | 
59 |             $tokens[] = new CommonToken($type, $value, $line);
60 |         }
61 | 
62 |         $tokens[] = new CommonToken(Parser::EOF_TOKEN_TYPE, '', $line);
63 | 
64 |         return new ArrayTokenStream($tokens);
65 |     }
66 | 
67 |     /**
68 |      * The patterns corresponding to tokens.
69 |      *
70 |      * @return array
71 |      */
72 |     abstract protected function getCatchablePatterns();
73 | 
74 |     /**
75 |      * The patterns corresponding to tokens to be skipped.
76 |      *
77 |      * @return array
78 |      */
79 |     abstract protected function getNonCatchablePatterns();
80 | 
81 |     /**
82 |      * Retrieves the token type.
83 |      *
84 |      * @param string $value The matched text (passed by reference).
85 |      *
86 |      * @return string $type
87 |      */
88 |     abstract protected function getType(&$value);
89 | }
90 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/Exception/ShiftReduceConflictException.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis\Exception;
 4 | 
 5 | use Dissect\Parser\LALR1\Analysis\Automaton;
 6 | use Dissect\Parser\Rule;
 7 | 
 8 | /**
 9 |  * Thrown when a grammar is not LALR(1) and exhibits
10 |  * a shift/reduce conflict.
11 |  *
12 |  * @author Jakub Lédl <jakubledl@gmail.com>
13 |  */
14 | class ShiftReduceConflictException extends ConflictException
15 | {
16 |     /**
17 |      * The exception message template.
18 |      */
19 |     const MESSAGE = <<<EOT
20 | The grammar exhibits a shift/reduce conflict on rule:
21 | 
22 |   %d. %s -> %s
23 | 
24 | (on lookahead "%s" in state %d). Restructure your grammar or choose a conflict resolution mode.
25 | EOT;
26 | 
27 |     /**
28 |      * @var \Dissect\Parser\Rule
29 |      */
30 |     protected $rule;
31 | 
32 |     /**
33 |      * @var string
34 |      */
35 |     protected $lookahead;
36 | 
37 |     /**
38 |      * Constructor.
39 |      *
40 |      * @param int $state The number of the conflicting state.
41 |      * @param \Dissect\Parser\Rule $rule The conflicting grammar rule.
42 |      * @param string $lookahead The conflicting lookahead to shift.
43 |      * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton The faulty automaton.
44 |      */
45 |     public function __construct($state, Rule $rule, $lookahead, Automaton $automaton)
46 |     {
47 |         $components = $rule->getComponents();
48 |         $rhs = empty($components) ? '/* empty */' : implode(' ', $components);
49 | 
50 |         $message = sprintf(
51 |             self::MESSAGE,
52 |             $rule->getNumber(),
53 |             $rule->getName(),
54 |             $rhs,
55 |             $lookahead,
56 |             $state
57 |         );
58 | 
59 |         parent::__construct($message, $state, $automaton);
60 | 
61 |         $this->rule = $rule;
62 |         $this->lookahead = $lookahead;
63 |     }
64 | 
65 |     /**
66 |      * Returns the rule the conflict was detected on.
67 |      *
68 |      * @return \Dissect\Parser\Rule The conflicting rule.
69 |      */
70 |     public function getRule()
71 |     {
72 |         return $this->rule;
73 |     }
74 | 
75 |     /**
76 |      * Returns the lookahead the conflict was detected on.
77 |      *
78 |      * @return string The conflicting lookahead.
79 |      */
80 |     public function getLookahead()
81 |     {
82 |         return $this->lookahead;
83 |     }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/KernelSet/KernelSet.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis\KernelSet;
 4 | 
 5 | /**
 6 |  * A BST implementation for more efficient lookup
 7 |  * of states by their kernel items.
 8 |  *
 9 |  * @author Jakub Lédl <jakubledl@gmail.com>
10 |  */
11 | class KernelSet
12 | {
13 |     protected $nextNumber = 0;
14 |     protected $root = null;
15 | 
16 |     /**
17 |      * Looks the kernel up in the BST. When it is already
18 |      * present, the number of the existing state is returned;
19 |      * otherwise a node carrying a newly allocated state number
20 |      * is attached to the tree and that number is returned.
21 |      *
22 |      * @param array $kernel The state kernel.
23 |      *
24 |      * @return int The state number.
25 |      */
26 |     public function insert(array $kernel)
27 |     {
28 |         $hashed = self::hashKernel($kernel);
29 | 
30 |         if ($this->root === null) {
31 |             $number = $this->nextNumber++;
32 |             $this->root = new Node($hashed, $number);
33 |             return $number;
34 |         }
35 | 
36 |         $current = $this->root;
37 | 
38 |         while (true) {
39 |             if ($hashed < $current->kernel) {
40 |                 $side = 'left';
41 |             } elseif ($hashed > $current->kernel) {
42 |                 $side = 'right';
43 |             } else {
44 |                 // identical kernel: the state already exists
45 |                 return $current->number;
46 |             }
47 | 
48 |             if ($current->$side === null) {
49 |                 $number = $this->nextNumber++;
50 |                 $current->$side = new Node($hashed, $number);
51 | 
52 |                 return $number;
53 |             }
54 | 
55 |             $current = $current->$side;
56 |         }
57 |     }
58 | 
59 |     /**
60 |      * Hashes a state kernel using a pairing function and sorts the result.
61 |      *
62 |      * @param array $kernel The kernel.
63 |      *
64 |      * @return array The hashed kernel.
65 |      */
66 |     public static function hashKernel(array $kernel)
67 |     {
68 |         $hashes = array();
69 | 
70 |         foreach ($kernel as $tuple) {
71 |             list ($car, $cdr) = $tuple;
72 |             $sum = $car + $cdr;
73 |             $hashes[] = $sum * ($sum + 1) / 2 + $cdr;
74 |         }
75 | 
76 |         sort($hashes);
77 | 
78 |         return $hashes;
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Analysis/ItemTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Analysis;
 4 | 
 5 | use Dissect\Parser\Rule;
 6 | use PHPUnit_Framework_TestCase;
 7 | 
 8 | class ItemTest extends PHPUnit_Framework_TestCase
 9 | {
10 |     /**
11 |      * Creates the rule "A -> a b c" (rule number 1) used by every test.
12 |      */
13 |     private function newRule()
14 |     {
15 |         return new Rule(1, 'A', array('a', 'b', 'c'));
16 |     }
17 | 
18 |     /**
19 |      * Creates an item over a rule from newRule() with the dot at $dotIndex.
20 |      */
21 |     private function newItem($dotIndex)
22 |     {
23 |         return new Item($this->newRule(), $dotIndex);
24 |     }
25 | 
26 |     /**
27 |      * @test
28 |      */
29 |     public function getActiveComponentShouldReturnTheComponentAboutToBeEncountered()
30 |     {
31 |         $this->assertEquals('b', $this->newItem(1)->getActiveComponent());
32 |     }
33 | 
34 |     /**
35 |      * @test
36 |      */
37 |     public function itemShouldBeAReduceItemIfAllComponentsHaveBeenEncountered()
38 |     {
39 |         $this->assertFalse($this->newItem(1)->isReduceItem());
40 |         $this->assertTrue($this->newItem(3)->isReduceItem());
41 |     }
42 | 
43 |     /**
44 |      * @test
45 |      */
46 |     public function itemShouldPumpLookaheadIntoConnectedItems()
47 |     {
48 |         $source = $this->newItem(1);
49 |         $target = $this->newItem(2);
50 | 
51 |         $source->connect($target);
52 |         $source->pump('d');
53 | 
54 |         $this->assertContains('d', $target->getLookahead());
55 |     }
56 | 
57 |     /**
58 |      * @test
59 |      */
60 |     public function itemShouldPumpTheSameLookaheadOnlyOnce()
61 |     {
62 |         $source = $this->newItem(1);
63 | 
64 |         $target = $this->getMock(
65 |             'Dissect\\Parser\\LALR1\\Analysis\\Item',
66 |             array('pump'),
67 |             array($this->newRule(), 2)
68 |         );
69 | 
70 |         $target->expects($this->once())
71 |             ->method('pump')
72 |             ->with($this->equalTo('d'));
73 | 
74 |         $source->connect($target);
75 | 
76 |         $source->pump('d');
77 |         $source->pump('d');
78 |     }
79 | 
80 |     /**
81 |      * @test
82 |      */
83 |     public function getUnrecognizedComponentsShouldReturnAllComponentAfterTheDottedOne()
84 |     {
85 |         $this->assertEquals(array('c'), $this->newItem(1)->getUnrecognizedComponents());
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/tests/Dissect/Lexer/StatefulLexerTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | use Dissect\Lexer\Recognizer\RegexRecognizer;
 6 | use Dissect\Lexer\Recognizer\SimpleRecognizer;
 7 | use PHPUnit_Framework_TestCase;
 8 | 
 9 | class StatefulLexerTest extends PHPUnit_Framework_TestCase
10 | {
11 |     protected $lexer;
12 | 
13 |     protected function setUp()
14 |     {
15 |         $this->lexer = new StatefulLexer();
16 |     }
17 | 
18 |     /**
19 |      * @test
20 |      * @expectedException LogicException
21 |      * @expectedExceptionMessage Define a lexer state first.
22 |      */
23 |     public function addingNewTokenShouldThrowAnExceptionWhenNoStateIsBeingBuilt()
24 |     {
25 |         $this->lexer->regex('WORD', '/[a-z]+/');
26 |     }
27 | 
28 |     /**
29 |      * @test
30 |      * @expectedException LogicException
31 |      */
32 |     public function anExceptionShouldBeThrownOnLexingWithoutAStartingState()
33 |     {
34 |         $lexer = $this->lexer;
35 | 
36 |         $lexer->state('root');
37 |         $lexer->lex('foo');
38 |     }
39 | 
40 |     /**
41 |      * @test
42 |      */
43 |     public function theStateMechanismShouldCorrectlyPushAndPopStatesFromTheStack()
44 |     {
45 |         $lexer = $this->lexer;
46 | 
47 |         $lexer->state('root')
48 |             ->regex('WORD', '/[a-z]+/')
49 |             ->regex('WS', "/[ \r\n\t]+/")
50 |             ->token('"')->action('string')
51 |             ->skip('WS');
52 | 
53 |         $lexer->state('string')
54 |             ->regex('STRING_CONTENTS', '/(\\\\"|[^"])*/')
55 |             ->token('"')->action(StatefulLexer::POP_STATE);
56 | 
57 |         $lexer->start('root');
58 | 
59 |         $stream = $lexer->lex('foo bar "long \\" string" baz quux');
60 | 
61 |         $this->assertCount(8, $stream);
62 | 
63 |         $contents = $stream->get(3);
64 |         $this->assertEquals('STRING_CONTENTS', $contents->getType());
65 |         $this->assertEquals('long \\" string', $contents->getValue());
66 |         $this->assertEquals('quux', $stream->get(6)->getValue());
67 |     }
68 | 
69 |     /**
70 |      * @test
71 |      */
72 |     public function defaultActionShouldBeNop()
73 |     {
74 |         $lexer = $this->lexer;
75 | 
76 |         $lexer->state('root')
77 |             ->regex('WORD', '/[a-z]+/')
78 |             ->regex('WS', "/[ \r\n\t]+/")
79 |             ->skip('WS');
80 | 
81 |         $lexer->state('string');
82 |         $lexer->start('root');
83 | 
84 |         $stream = $lexer->lex('foo bar');
85 | 
86 |         $this->assertEquals(3, count($stream));
87 |     }
88 | }
89 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Dumper/ProductionTableDumper.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Parser\LALR1\Dumper;
 4 | 
 5 | /**
 6 |  * A table dumper for production
 7 |  * environment - the dumped table
 8 |  * is compact, whitespace-free and
 9 |  * without any comments.
10 |  *
11 |  * @author Jakub Lédl <jakubledl@gmail.com>
12 |  */
13 | class ProductionTableDumper implements TableDumper
14 | {
15 |     /**
16 |      * {@inheritDoc}
17 |      */
18 |     public function dump(array $table)
19 |     {
20 |         $writer = new StringWriter();
21 | 
22 |         $this->writeIntro($writer);
23 | 
24 |         foreach ($table['action'] as $num => $state) {
25 |             $this->writeState($writer, $num, $state);
26 |             $writer->write(',');
27 |         }
28 | 
29 |         $this->writeMiddle($writer);
30 | 
31 |         foreach ($table['goto'] as $num => $map) {
32 |             $this->writeGoto($writer, $num, $map);
33 |             $writer->write(',');
34 |         }
35 | 
36 |         $this->writeOutro($writer);
37 | 
38 |         $writer->write("\n"); // eof newline
39 | 
40 |         return $writer->get();
41 |     }
42 | 
43 |     /** Writes the opening PHP tag and opens the 'action' array. */
44 |     protected function writeIntro(StringWriter $writer)
45 |     {
46 |         $writer->write("<?php return array('action'=>array(");
47 |     }
48 | 
49 |     /** Writes the action map of state $num; $state maps triggers to actions. */
50 |     protected function writeState(StringWriter $writer, $num, $state)
51 |     {
52 |         $writer->write((string)$num . '=>array(');
53 | 
54 |         foreach ($state as $trigger => $action) {
55 |             $this->writeAction($writer, $trigger, $action);
56 |             $writer->write(',');
57 |         }
58 | 
59 |         $writer->write(')');
60 |     }
61 | 
62 |     /**
63 |      * Writes one "trigger=>action" pair; var_export() escapes quotes and
64 |      * backslashes in the trigger so they cannot corrupt the dumped code.
65 |      */
66 |     protected function writeAction(StringWriter $writer, $trigger, $action)
67 |     {
68 |         $writer->write(sprintf('%s=>%d', var_export((string)$trigger, true), $action));
69 |     }
70 | 
71 |     /** Closes the 'action' array and opens the 'goto' array. */
72 |     protected function writeMiddle(StringWriter $writer)
73 |     {
74 |         $writer->write("),'goto'=>array(");
75 |     }
76 | 
77 |     /** Writes the goto map of state $num, escaping symbols as writeAction() does. */
78 |     protected function writeGoto(StringWriter $writer, $num, $map)
79 |     {
80 |         $writer->write((string)$num . '=>array(');
81 | 
82 |         foreach ($map as $trigger => $destination) {
83 |             $writer->write(sprintf('%s=>%d', var_export((string)$trigger, true), $destination));
84 |             $writer->write(',');
85 |         }
86 | 
87 |         $writer->write(')');
88 |     }
89 | 
90 |     /** Closes the 'goto' array and the outer array, then ends the statement. */
91 |     protected function writeOutro(StringWriter $writer)
92 |     {
93 |         $writer->write('));');
94 |     }
95 | }
96 | 


--------------------------------------------------------------------------------
/src/Dissect/Node/Node.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Node;
 4 | 
 5 | use Countable;
 6 | use IteratorAggregate;
 7 | 
 8 | /**
 9 |  * A basic contract for a node in an AST.
10 |  *
11 |  * @author Jakub Lédl <jakubledl@gmail.com>
12 |  */
13 | interface Node extends Countable, IteratorAggregate
14 | {
15 |     /**
16 |      * Returns the children of this node.
17 |      *
18 |      * @return array The children belonging to this node.
19 |      */
20 |     public function getNodes();
21 | 
22 |     /**
23 |      * Checks for the existence of a child node named $name.
24 |      *
25 |      * @param string $name The name of the child node.
26 |      *
27 |      * @return boolean Whether the child node exists.
28 |      */
29 |     public function hasNode($name);
30 | 
31 |     /**
32 |      * Returns a child node specified by $name.
33 |      *
34 |      * @param int|string $name The name of the node.
35 |      *
36 |      * @return \Dissect\Node\Node The child node specified by $name.
37 |      *
38 |      * @throws \RuntimeException When no child node named $name exists.
39 |      */
40 |     public function getNode($name);
41 | 
42 |     /**
43 |      * Sets a child node.
44 |      *
45 |      * @param string $name The name of the child node.
46 |      * @param \Dissect\Node\Node $child The new child node.
47 |      */
48 |     public function setNode($name, Node $child);
49 | 
50 |     /**
51 |      * Removes a child node by name.
52 |      *
53 |      * @param string $name The name of the child node.
54 |      */
55 |     public function removeNode($name);
56 | 
57 |     /**
58 |      * Returns all attributes of this node.
59 |      *
60 |      * @return array The attributes.
61 |      */
62 |     public function getAttributes();
63 | 
64 |     /**
65 |      * Determines whether this node has an attribute
66 |      * under $key.
67 |      *
68 |      * @param string $key The key.
69 |      * @return boolean Whether there's an attribute under $key.
70 |      */
71 |     public function hasAttribute($key);
72 | 
73 |     /**
74 |      * Gets an attribute by key.
75 |      *
76 |      * @param string $key The key.
77 |      * @return mixed The attribute value.
78 |      *
79 |      * @throws \RuntimeException When no attribute exists under $key.
80 |      */
81 |     public function getAttribute($key);
82 | 
83 |     /**
84 |      * Sets an attribute by key.
85 |      *
86 |      * @param string $key The key.
87 |      * @param mixed $value The new value.
88 |      */
89 |     public function setAttribute($key, $value);
90 | 
91 |     /**
92 |      * Removes an attribute by key.
93 |      *
94 |      * @param string $key The key.
95 |      */
96 |     public function removeAttribute($key);
97 | }
98 | 


--------------------------------------------------------------------------------
/tests/Dissect/Lexer/AbstractLexerTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | use Dissect\Lexer\Exception\RecognitionException;
 6 | use Dissect\Parser\Parser;
 7 | use PHPUnit_Framework_TestCase;
 8 | 
 9 | class AbstractLexerTest extends PHPUnit_Framework_TestCase
10 | {
11 |     protected $lexer;
12 | 
13 |     public function setUp()
14 |     {
15 |         $this->lexer = new StubLexer();
16 |     }
17 | 
18 |     /**
19 |      * @test
20 |      */
21 |     public function lexShouldDelegateToExtractTokenUpdatingTheLineAndOffsetAccordingly()
22 |     {
23 |         $stream = $this->lexer->lex("ab\nc");
24 | 
25 |         // expected (value, line) pairs in stream order
26 |         $expectations = array(
27 |             array('a', 1),
28 |             array('b', 1),
29 |             array("\n", 1),
30 |             array('c', 2),
31 |         );
32 | 
33 |         foreach ($expectations as $index => $expectation) {
34 |             list ($value, $line) = $expectation;
35 | 
36 |             if ($index > 0) {
37 |                 $stream->next();
38 |             }
39 | 
40 |             $token = $stream->getCurrentToken();
41 | 
42 |             $this->assertEquals($value, $token->getValue());
43 |             $this->assertEquals($line, $token->getLine());
44 |         }
45 |     }
46 | 
47 |     /**
48 |      * @test
49 |      */
50 |     public function lexShouldAppendAnEofTokenAutomatically()
51 |     {
52 |         $stream = $this->lexer->lex("abc");
53 |         $stream->seek(3);
54 | 
55 |         $eof = $stream->getCurrentToken();
56 | 
57 |         $this->assertEquals(Parser::EOF_TOKEN_TYPE, $eof->getType());
58 |         $this->assertEquals(1, $eof->getLine());
59 |     }
60 | 
61 |     /**
62 |      * @test
63 |      */
64 |     public function lexShouldThrowAnExceptionOnAnUnrecognizableToken()
65 |     {
66 |         try {
67 |             $this->lexer->lex("abcd");
68 |         } catch (RecognitionException $e) {
69 |             $this->assertEquals(1, $e->getSourceLine());
70 | 
71 |             return;
72 |         }
73 | 
74 |         $this->fail('Expected a RecognitionException.');
75 |     }
76 | 
77 |     /**
78 |      * @test
79 |      */
80 |     public function lexShouldNormalizeLineEndingsBeforeLexing()
81 |     {
82 |         $second = $this->lexer->lex("a\r\nb")->get(1);
83 | 
84 |         $this->assertEquals("\n", $second->getValue());
85 |     }
86 | 
87 |     /**
88 |      * @test
89 |      */
90 |     public function lexShouldSkipTokensIfToldToDoSo()
91 |     {
92 |         $second = $this->lexer->lex('aeb')->get(1);
93 | 
94 |         $this->assertNotEquals('e', $second->getType());
95 |     }
96 | }
97 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/AbstractLexer.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Dissect\Lexer;
 4 | 
 5 | use Dissect\Lexer\Exception\RecognitionException;
 6 | use Dissect\Lexer\TokenStream\ArrayTokenStream;
 7 | use Dissect\Parser\Parser;
 8 | use Dissect\Util\Util;
 9 | 
10 | /**
11 |  * A base class for a lexer. A subclass simply
12 |  * has to implement the extractToken and shouldSkipToken methods. Both
13 |  * SimpleLexer and StatefulLexer extend this class.
14 |  *
15 |  * @author Jakub Lédl <jakubledl@gmail.com>
16 |  */
17 | abstract class AbstractLexer implements Lexer
18 | {
19 |     /**
20 |      * @var int The line the lexer is currently at; the first line is 1.
21 |      */
22 |     private $line = 1;
23 | 
24 |     /**
25 |      * Returns the current line.
26 |      *
27 |      * @return int The current line.
28 |      */
29 |     protected function getCurrentLine()
30 |     {
31 |         return $this->line;
32 |     }
33 | 
34 |     /**
35 |      * Attempts to extract another token from the string.
36 |      * Returns the token on success or null on failure.
37 |      *
38 |      * @param string $string The string to extract the token from.
39 |      *
40 |      * @return \Dissect\Lexer\Token|null The extracted token or null.
41 |      */
42 |     abstract protected function extractToken($string);
43 | 
44 |     /**
45 |      * Should given token be skipped?
46 |      *
47 |      * @param \Dissect\Lexer\Token $token The token to evaluate.
48 |      *
49 |      * @return boolean Whether to skip the token.
50 |      */
51 |     abstract protected function shouldSkipToken(Token $token);
52 | 
53 |     /**
54 |      * {@inheritDoc}
55 |      */
56 |     public function lex($string)
57 |     {
58 |         // normalize line endings
59 |         $string = strtr($string, array("\r\n" => "\n", "\r" => "\n"));
60 | 
61 |         $this->line = 1; // a reused lexer must not continue a previous line count
62 |         $tokens = array();
63 |         $position = 0;
64 |         $originalLength = Util::stringLength($string);
65 | 
66 |         while (true) {
67 |             $token = $this->extractToken($string);
68 | 
69 |             if ($token === null) {
70 |                 break;
71 |             }
72 | 
73 |             if (!$this->shouldSkipToken($token)) {
74 |                 $tokens[] = $token;
75 |             }
76 | 
77 |             $shift = Util::stringLength($token->getValue());
78 | 
79 |             $position += $shift;
80 | 
81 |             // Count the newlines of the consumed text itself: $position is a
82 |             // character count and must not be used as a byte offset, otherwise
83 |             // multibyte input yields wrong line numbers.
84 |             $this->line += substr_count($token->getValue(), "\n");
85 | 
86 |             $string = Util::substring($string, $shift);
87 |         }
88 | 
89 |         if ($position !== $originalLength) {
90 |             throw new RecognitionException($this->line);
91 |         }
92 | 
93 |         $tokens[] = new CommonToken(Parser::EOF_TOKEN_TYPE, '', $this->line);
94 | 
95 |         return new ArrayTokenStream($tokens);
96 |     }
97 | }
98 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/Rule.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Parser;
  4 | 
  5 | /**
  6 |  * Represents a rule in a context-free grammar.
  7 |  *
  8 |  * @author Jakub Lédl <jakubledl@gmail.com>
  9 |  */
 10 | class Rule
 11 | {
 12 |     /**
 13 |      * @var int
 14 |      */
 15 |     protected $number;
 16 | 
 17 |     /**
 18 |      * @var string
 19 |      */
 20 |     protected $name;
 21 | 
 22 |     /**
 23 |      * @var string[]
 24 |      */
 25 |     protected $components;
 26 | 
 27 |     /**
 28 |      * @var callable|null
 29 |      */
 30 |     protected $callback = null;
 31 | 
 32 |     /**
 33 |      * @var int|null
 34 |      */
 35 |     protected $precedence = null;
 36 | 
 37 |     /**
 38 |      * Constructor.
 39 |      *
 40 |      * @param int $number The number of the rule in the grammar.
 41 |      * @param string $name The name (lhs) of the rule ("A" in "A -> a b c")
 42 |      * @param string[] $components The components of this rule.
 43 |      */
 44 |     public function __construct($number, $name, array $components)
 45 |     {
 46 |         $this->components = $components;
 47 |         $this->name = $name;
 48 |         $this->number = $number;
 49 |     }
 50 | 
 51 |     /**
 52 |      * Returns the number of this rule.
 53 |      *
 54 |      * @return int The number of this rule.
 55 |      */
 56 |     public function getNumber()
 57 |     {
 58 |         return $this->number;
 59 |     }
 60 | 
 61 |     /**
 62 |      * Returns the name (left-hand side) of this rule.
 63 |      *
 64 |      * @return string The name of this rule.
 65 |      */
 66 |     public function getName()
 67 |     {
 68 |         return $this->name;
 69 |     }
 70 | 
 71 |     /**
 72 |      * Returns the components of this rule.
 73 |      *
 74 |      * @return string[] The components of this rule.
 75 |      */
 76 |     public function getComponents()
 77 |     {
 78 |         return $this->components;
 79 |     }
 80 | 
 81 |     /**
 82 |      * Returns a component at index $index or null
 83 |      * if index is out of range.
 84 |      *
 85 |      * @param int $index The index.
 86 |      *
 87 |      * @return string|null The component at index $index.
 88 |      */
 89 |     public function getComponent($index)
 90 |     {
 91 |         $hasComponent = isset($this->components[$index]);
 92 | 
 93 |         return $hasComponent ? $this->components[$index] : null;
 94 |     }
 95 | 
 96 |     /**
 97 |      * Sets the callback (the semantic value) of the rule.
 98 |      *
 99 |      * @param callable $callback The callback.
100 |      */
101 |     public function setCallback($callback)
102 |     {
103 |         $this->callback = $callback;
104 |     }
105 | 
106 |     /**
107 |      * Returns the callback of the rule.
108 |      *
109 |      * @return callable|null The callback, or null when none has been set.
110 |      */
111 |     public function getCallback()
112 |     {
113 |         return $this->callback;
114 |     }
115 | 
116 |     /**
117 |      * Returns the precedence of the rule.
118 |      *
119 |      * @return int|null The precedence, or null when none has been set.
120 |      */
121 |     public function getPrecedence()
122 |     {
123 |         return $this->precedence;
124 |     }
125 | 
126 |     /**
127 |      * Sets the precedence of the rule.
128 |      *
129 |      * @param int $i The new precedence.
130 |      */
131 |     public function setPrecedence($i)
132 |     {
133 |         $this->precedence = $i;
134 |     }
135 | }
136 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/TokenStream/ArrayTokenStream.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Lexer\TokenStream;
  4 | 
  5 | use ArrayIterator;
  6 | use OutOfBoundsException;
  7 | 
  8 | /**
  9 |  * Token stream backed by a plain PHP array.
 10 |  *
 11 |  * @author Jakub Lédl <jakubledl@gmail.com>
 12 |  */
 13 | class ArrayTokenStream implements TokenStream
 14 | {
 15 |     /**
 16 |      * @var \Dissect\Lexer\Token[] The tokens held by this stream.
 17 |      */
 18 |     protected $tokens;
 19 | 
 20 |     /**
 21 |      * @var int Index into $tokens of the current token.
 22 |      */
 23 |     protected $position = 0;
 24 | 
 25 |     /**
 26 |      * Constructor.
 27 |      *
 28 |      * @param \Dissect\Lexer\Token[] $tokens The tokens in this stream.
 29 |      */
 30 |     public function __construct(array $tokens)
 31 |     {
 32 |         $this->tokens = $tokens;
 33 |     }
 34 | 
 35 |     /**
 36 |      * {@inheritDoc}
 37 |      */
 38 |     public function getPosition()
 39 |     {
 40 |         return $this->position;
 41 |     }
 42 | 
 43 |     /**
 44 |      * {@inheritDoc}
 45 |      */
 46 |     public function getCurrentToken()
 47 |     {
 48 |         $cursor = $this->position;
 49 | 
 50 |         return $this->tokens[$cursor];
 51 |     }
 52 | 
 53 |     /**
 54 |      * {@inheritDoc}
 55 |      */
 56 |     public function lookAhead($n)
 57 |     {
 58 |         $target = $this->position + $n;
 59 | 
 60 |         if (!isset($this->tokens[$target])) {
 61 |             throw new OutOfBoundsException('Invalid look-ahead.');
 62 |         }
 63 | 
 64 |         return $this->tokens[$target];
 65 |     }
 66 | 
 67 |     /**
 68 |      * {@inheritDoc}
 69 |      */
 70 |     public function get($n)
 71 |     {
 72 |         if (!isset($this->tokens[$n])) {
 73 |             throw new OutOfBoundsException('Invalid index.');
 74 |         }
 75 | 
 76 |         return $this->tokens[$n];
 77 |     }
 78 | 
 79 |     /**
 80 |      * {@inheritDoc}
 81 |      */
 82 |     public function move($n)
 83 |     {
 84 |         if (isset($this->tokens[$n])) {
 85 |             $this->position = $n;
 86 | 
 87 |             return;
 88 |         }
 89 | 
 90 |         throw new OutOfBoundsException('Invalid index to move to.');
 91 |     }
 92 | 
 93 |     /**
 94 |      * {@inheritDoc}
 95 |      */
 96 |     public function seek($n)
 97 |     {
 98 |         $target = $this->position + $n;
 99 | 
100 |         // The cursor only changes once the target is known to hold a token.
101 |         if (isset($this->tokens[$target])) {
102 |             $this->position = $target;
103 | 
104 |             return;
105 |         }
106 | 
107 |         throw new OutOfBoundsException('Invalid seek.');
108 |     }
109 | 
110 |     /**
111 |      * {@inheritDoc}
112 |      */
113 |     public function next()
114 |     {
115 |         $following = $this->position + 1;
116 | 
117 |         if (!isset($this->tokens[$following])) {
118 |             throw new OutOfBoundsException('Attempting to move beyond the end of the stream.');
119 |         }
120 | 
121 |         $this->position = $following;
122 |     }
123 | 
124 |     /**
125 |      * @return int The number of tokens in this stream.
126 |      */
127 |     public function count()
128 |     {
129 |         return count($this->tokens);
130 |     }
131 | 
132 |     /**
133 |      * @return \ArrayIterator An iterator over the tokens of this stream.
134 |      */
135 |     public function getIterator()
136 |     {
137 |         return new ArrayIterator($this->tokens);
138 |     }
139 | }
140 | 


--------------------------------------------------------------------------------
/src/Dissect/Node/CommonNode.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Node;
  4 | 
  5 | use RuntimeException;
  6 | 
  7 | /**
  8 |  * An AST node.
  9 |  *
 10 |  * Keeps two keyed collections: the child nodes and the attributes.
 11 |  *
 12 |  * @author Jakub Lédl <jakubledl@gmail.com>
 13 |  */
 14 | class CommonNode implements Node
 15 | {
 16 |     /**
 17 |      * @var array The child nodes, indexed by key.
 18 |      */
 19 |     protected $nodes;
 20 | 
 21 |     /**
 22 |      * @var array The attributes, indexed by key.
 23 |      */
 24 |     protected $attributes;
 25 | 
 26 |     /**
 27 |      * Constructor.
 28 |      *
 29 |      * @param array $attributes The attributes of this node.
 30 |      * @param array $nodes The children of this node.
 31 |      */
 32 |     public function __construct(array $attributes = array(), array $nodes = array())
 33 |     {
 34 |         $this->attributes = $attributes;
 35 |         $this->nodes = $nodes;
 36 |     }
 37 | 
 38 |     /**
 39 |      * {@inheritDoc}
 40 |      */
 41 |     public function getNodes()
 42 |     {
 43 |         return $this->nodes;
 44 |     }
 45 | 
 46 |     /**
 47 |      * {@inheritDoc}
 48 |      */
 49 |     public function hasNode($key)
 50 |     {
 51 |         return isset($this->nodes[$key]);
 52 |     }
 53 | 
 54 |     /**
 55 |      * {@inheritDoc}
 56 |      *
 57 |      * @throws \RuntimeException When no child node is set under $key.
 58 |      */
 59 |     public function getNode($key)
 60 |     {
 61 |         // Children are stored in $this->nodes, the same property the
 62 |         // constructor, getNodes() and hasNode() use.
 63 |         if (!isset($this->nodes[$key])) {
 64 |             throw new RuntimeException(sprintf('No child node "%s" exists.', $key));
 65 |         }
 66 | 
 67 |         return $this->nodes[$key];
 68 |     }
 69 | 
 70 |     /**
 71 |      * {@inheritDoc}
 72 |      */
 73 |     public function setNode($key, Node $child)
 74 |     {
 75 |         $this->nodes[$key] = $child;
 76 |     }
 77 | 
 78 |     /**
 79 |      * {@inheritDoc}
 80 |      */
 81 |     public function removeNode($key)
 82 |     {
 83 |         unset($this->nodes[$key]);
 84 |     }
 85 | 
 86 |     /**
 87 |      * {@inheritDoc}
 88 |      */
 89 |     public function getAttributes()
 90 |     {
 91 |         return $this->attributes;
 92 |     }
 93 | 
 94 |     /**
 95 |      * {@inheritDoc}
 96 |      */
 97 |     public function hasAttribute($key)
 98 |     {
 99 |         return isset($this->attributes[$key]);
100 |     }
101 | 
102 |     /**
103 |      * {@inheritDoc}
104 |      *
105 |      * @throws \RuntimeException When no attribute is set under $key.
106 |      */
107 |     public function getAttribute($key)
108 |     {
109 |         if (!isset($this->attributes[$key])) {
110 |             throw new RuntimeException(sprintf('No attribute "%s" exists.', $key));
111 |         }
112 | 
113 |         return $this->attributes[$key];
114 |     }
115 | 
116 |     /**
117 |      * {@inheritDoc}
118 |      */
119 |     public function setAttribute($key, $value)
120 |     {
121 |         $this->attributes[$key] = $value;
122 |     }
123 | 
124 |     /**
125 |      * {@inheritDoc}
126 |      */
127 |     public function removeAttribute($key)
128 |     {
129 |         unset($this->attributes[$key]);
130 |     }
131 | 
132 |     /**
133 |      * @return int The number of child nodes.
134 |      */
135 |     public function count()
136 |     {
137 |         return count($this->nodes);
138 |     }
139 | 
140 |     /**
141 |      * @return \ArrayIterator An iterator over the child nodes.
142 |      */
143 |     public function getIterator()
144 |     {
145 |         // Fully qualified: the file imports only RuntimeException, so a
146 |         // bare ArrayIterator would resolve inside this namespace.
147 |         return new \ArrayIterator($this->nodes);
148 |     }
149 | }
150 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/SimpleLexer.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Lexer;
  4 | 
  5 | use Dissect\Lexer\Recognizer\RegexRecognizer;
  6 | use Dissect\Lexer\Recognizer\SimpleRecognizer;
  7 | use Dissect\Util\Util;
  8 | 
  9 | /**
 10 |  * SimpleLexer uses specified recognizers
 11 |  * without keeping track of state.
 12 |  *
 13 |  * @author Jakub Lédl <jakubledl@gmail.com>
 14 |  */
 15 | class SimpleLexer extends AbstractLexer
 16 | {
 17 |     /**
 18 |      * @var array Token types marked to be skipped.
 19 |      */
 20 |     protected $skipTokens = array();
 21 | 
 22 |     /**
 23 |      * @var array Recognizers, indexed by the token type they produce.
 24 |      */
 25 |     protected $recognizers = array();
 26 | 
 27 |     /**
 28 |      * Adds a new token definition. If given only one argument,
 29 |      * it assumes that the token type and recognized value are
 30 |      * identical. Defining the same type again replaces the
 31 |      * earlier definition.
 32 |      *
 33 |      * @param string $type The token type.
 34 |      * @param string $value The value to be recognized.
 35 |      *
 36 |      * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface.
 37 |      */
 38 |     public function token($type, $value = null)
 39 |     {
 40 |         // Only a missing (null) or empty value falls back to the type.
 41 |         // A plain truthiness test would also discard '0', making it
 42 |         // impossible to define a token that recognizes that string.
 43 |         $recognized = ($value === null || $value === '') ? $type : $value;
 44 | 
 45 |         $this->recognizers[$type] = new SimpleRecognizer($recognized);
 46 | 
 47 |         return $this;
 48 |     }
 49 | 
 50 |     /**
 51 |      * Adds a new regex token definition.
 52 |      *
 53 |      * @param string $type The token type.
 54 |      * @param string $regex The regular expression used to match the token.
 55 |      *
 56 |      * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface.
 57 |      */
 58 |     public function regex($type, $regex)
 59 |     {
 60 |         $this->recognizers[$type] = new RegexRecognizer($regex);
 61 | 
 62 |         return $this;
 63 |     }
 64 | 
 65 |     /**
 66 |      * Marks the token types given as arguments to be skipped.
 67 |      * Each call replaces the list set by any previous call.
 68 |      *
 69 |      * @param mixed $type,... Unlimited number of token types.
 70 |      *
 71 |      * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface.
 72 |      */
 73 |     public function skip()
 74 |     {
 75 |         $this->skipTokens = func_get_args();
 76 | 
 77 |         return $this;
 78 |     }
 79 | 
 80 |     /**
 81 |      * {@inheritDoc}
 82 |      */
 83 |     protected function shouldSkipToken(Token $token)
 84 |     {
 85 |         return in_array($token->getType(), $this->skipTokens);
 86 |     }
 87 | 
 88 |     /**
 89 |      * {@inheritDoc}
 90 |      *
 91 |      * Every recognizer is tried and the match with the greatest
 92 |      * Util::stringLength() wins; when two matches compare equal,
 93 |      * the recognizer defined first is kept.
 94 |      */
 95 |     protected function extractToken($string)
 96 |     {
 97 |         $value = $type = null;
 98 | 
 99 |         foreach ($this->recognizers as $t => $recognizer) {
100 |             // NOTE(review): $v is presumably filled in by reference by
101 |             // match() - confirm against the Recognizer interface.
102 |             if ($recognizer->match($string, $v)) {
103 |                 // Strictly greater, so an earlier match survives a tie.
104 |                 if ($value === null || Util::stringLength($v) > Util::stringLength($value)) {
105 |                     $value = $v;
106 |                     $type = $t;
107 |                 }
108 |             }
109 |         }
110 | 
111 |         if ($type !== null) {
112 |             return new CommonToken($type, $value, $this->getCurrentLine());
113 |         }
114 | 
115 |         return null;
116 |     }
117 | }
118 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Parser.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Parser\LALR1;
  4 | 
  5 | use Dissect\Lexer\TokenStream\TokenStream;
  6 | use Dissect\Parser\Exception\UnexpectedTokenException;
  7 | use Dissect\Parser\LALR1\Analysis\Analyzer;
  8 | use Dissect\Parser as P;
  9 | 
 10 | /**
 11 |  * A LR parser.
 12 |  *
 13 |  * @author Jakub Lédl <jakubledl@gmail.com>
 14 |  */
 15 | class Parser implements P\Parser
 16 | {
 17 |     /**
 18 |      * @var \Dissect\Parser\Grammar
 19 |      */
 20 |     protected $grammar;
 21 | 
 22 |     /**
 23 |      * @var array Parse table with an 'action' and a 'goto' section.
 24 |      */
 25 |     protected $parseTable;
 26 | 
 27 |     /**
 28 |      * Constructor.
 29 |      *
 30 |      * @param \Dissect\Parser\Grammar $grammar The grammar.
 31 |      * @param array $parseTable If given, the parser doesn't have to analyze the grammar.
 32 |      */
 33 |     public function __construct(P\Grammar $grammar, array $parseTable = null)
 34 |     {
 35 |         $this->grammar = $grammar;
 36 | 
 37 |         if ($parseTable) {
 38 |             $this->parseTable = $parseTable;
 39 |         } else {
 40 |             $analyzer = new Analyzer();
 41 |             $this->parseTable = $analyzer->analyze($grammar)->getParseTable();
 42 |         }
 43 |     }
 44 | 
 45 |     /**
 46 |      * {@inheritDoc}
 47 |      *
 48 |      * @throws \Dissect\Parser\Exception\UnexpectedTokenException When the
 49 |      *         action table has no entry for the current state and token type.
 50 |      */
 51 |     public function parse(TokenStream $stream)
 52 |     {
 53 |         $stateStack = array($currentState = 0);
 54 |         $args = array();
 55 | 
 56 |         foreach ($stream as $token) {
 57 |             // Reductions do not consume the token, so keep consulting the
 58 |             // table with the same token until it is shifted or accepted.
 59 |             while (true) {
 60 |                 $type = $token->getType();
 61 | 
 62 |                 if (!isset($this->parseTable['action'][$currentState][$type])) {
 63 |                     // unexpected token
 64 | 
 65 |                     throw new UnexpectedTokenException(
 66 |                         $token,
 67 |                         array_keys($this->parseTable['action'][$currentState])
 68 |                     );
 69 |                 }
 70 | 
 71 |                 $action = $this->parseTable['action'][$currentState][$type];
 72 | 
 73 |                 if ($action > 0) {
 74 |                     // shift: a positive action is the state to move to
 75 | 
 76 |                     $args[] = $token;
 77 |                     $stateStack[] = $currentState = $action;
 78 | 
 79 |                     break;
 80 |                 } elseif ($action < 0) {
 81 |                     // reduce: a negative action is a negated rule number
 82 |                     $rule = $this->grammar->getRule(-$action);
 83 |                     $popCount = count($rule->getComponents());
 84 | 
 85 |                     // The length must be explicit. For a rule with no
 86 |                     // components $popCount is 0, and an offset of -0
 87 |                     // without a length would remove the whole array.
 88 |                     array_splice($stateStack, -$popCount, $popCount);
 89 |                     $newArgs = array_splice($args, -$popCount, $popCount);
 90 | 
 91 |                     if ($callback = $rule->getCallback()) {
 92 |                         $args[] = call_user_func_array($callback, $newArgs);
 93 |                     } else {
 94 |                         // Without a callback the value of the first
 95 |                         // component is passed through; an empty rule
 96 |                         // has none, so it yields null.
 97 |                         $args[] = isset($newArgs[0]) ? $newArgs[0] : null;
 98 |                     }
 99 | 
100 |                     $state = $stateStack[count($stateStack) - 1];
101 |                     $stateStack[] = $currentState = $this->parseTable['goto']
102 |                         [$state][$rule->getName()];
103 |                 } else {
104 |                     // accept
105 | 
106 |                     return $args[0];
107 |                 }
108 |             }
109 |         }
110 |     }
111 | }
112 | 


--------------------------------------------------------------------------------
/docs/cli.md:
--------------------------------------------------------------------------------
 1 | The command-line interface
 2 | ==========================
 3 | 
 4 | Dissect provides you with a command-line tool for processing and
 5 | debugging your grammars. This chapter describes the tool and its
 6 | options.
 7 | 
 8 | Running the tool
 9 | ----------------
10 | 
11 | Let's assume that the executable is located in a folder called `bin`.
12 | The most basic way to invoke it is
13 | 
14 |     $ bin/dissect <grammar-class>
15 | 
16 | This will analyze the given grammar and, if successful, save the parse
17 | table in a file `parse_table.php` in the same folder where you've
18 | defined your grammar. You can use `/` instead of `\` as the namespace
19 | separator or enclose the class name in quotes.
20 | 
21 | To change the directory in which the parse table will be saved, use the
22 | `--output-dir` (or `-o`) option:
23 | 
24 |     $ bin/dissect <grammar-class> --output-dir=../dir
25 | 
26 | Dumping the parse table in the debug format
27 | -------------------------------------------
28 | 
29 | By default, the parse table will be saved as a single line of PHP code,
30 | with minimal whitespace. If you want to inspect the generated table
31 | manually, you can use the `--debug` (or `-d`) option:
32 | 
33 |     $ bin/dissect <grammar-class> --debug
34 | 
35 | The parse table will then be written in a human-readable way and with
36 | comments explaining the steps of the parser.
37 | 
38 | Dumping the handle-finding automaton
39 | ------------------------------------
40 | 
41 | If you have an understanding of the LR parsing process, being able to
42 | inspect the LR automaton visually could be an aid in resolving potential
43 | grammar conflicts. In order to dump the automaton as a Graphviz graph,
44 | use the `--dfa` (or `-D`) option:
45 | 
46 |     $ bin/dissect <grammar-class> --dfa
47 | 
48 | This will create a file called `automaton.dot` in the output directory.
49 | You can then run something like
50 | 
51 |     dot -Tpng automaton.dot > automaton.png
52 | 
53 | to render it as a PNG image.
54 | 
55 | Of course, for more complex grammars, the automaton will quickly become rather large
56 | and unwieldy. You can then use the `--state` (or `-s`) option to dump
57 | only the specified state:
58 | 
59 |     $ bin/dissect <grammar-class> --dfa --state=5
60 | 
61 | As an example, let's say we use the following grammar:
62 | 
63 | ```php
64 | class PalindromeGrammar extends Grammar
65 | {
66 |     public function __construct()
67 |     {
68 |         $this('S')
69 |             ->is('a', 'S', 'a')
70 |             ->is('b', 'S', 'b')
71 |             ->is(/* empty */);
72 | 
73 |         $this->start('S');
74 |     }
75 | }
76 | ```
77 | 
78 | When running the command-line tool, we'll notice a list of resolved
79 | conflicts in the output:
80 | 
81 |     Resolved a shift/reduce conflict in state 2 on lookahead a
82 |     Resolved a shift/reduce conflict in state 3 on lookahead b
83 | 
84 | If we wanted to examine the conflict in state 3, we could run
85 | 
86 |     $ bin/dissect PalindromeGrammar --dfa --state=3
87 | 
88 | and then
89 | 
90 |     $ dot -Tpng state_3.dot > state_3.png
91 | 
92 | The result will be the following image:
93 | 
94 | ![State 3](https://raw.github.com/jakubledl/dissect/develop/docs/state_3.png)
95 | 
96 | in which we can clearly see how the conflict arose: the state #3 calls
97 | both for a shift and a reduction by the rule `S -> ` on
98 | lookahead `b`.
99 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/Exception/ReduceReduceConflictException.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Parser\LALR1\Analysis\Exception;
  4 | 
  5 | use Dissect\Parser\LALR1\Analysis\Automaton;
  6 | use Dissect\Parser\Rule;
  7 | 
  8 | /**
  9 |  * Signals a reduce/reduce conflict, i.e. that the analyzed
 10 |  * grammar is not LALR(1).
 11 |  *
 12 |  * @author Jakub Lédl <jakubledl@gmail.com>
 13 |  */
 14 | class ReduceReduceConflictException extends ConflictException
 15 | {
 16 |     /**
 17 |      * The exception message template.
 18 |      */
 19 |     const MESSAGE = <<<EOT
 20 | The grammar exhibits a reduce/reduce conflict on rules:
 21 | 
 22 |   %d. %s -> %s
 23 | 
 24 | vs:
 25 | 
 26 |   %d. %s -> %s
 27 | 
 28 | (on lookahead "%s" in state %d). Restructure your grammar or choose a conflict resolution mode.
 29 | EOT;
 30 | 
 31 |     /**
 32 |      * @var \Dissect\Parser\Rule The first conflicting rule.
 33 |      */
 34 |     protected $firstRule;
 35 | 
 36 |     /**
 37 |      * @var \Dissect\Parser\Rule The second conflicting rule.
 38 |      */
 39 |     protected $secondRule;
 40 | 
 41 |     /**
 42 |      * @var string The lookahead on which the rules conflict.
 43 |      */
 44 |     protected $lookahead;
 45 | 
 46 |     /**
 47 |      * Builds the exception message from the two rules and
 48 |      * remembers them together with the lookahead.
 49 |      *
 50 |      * @param int $state The number of the inadequate state.
 51 |      * @param \Dissect\Parser\Rule $firstRule The first conflicting grammar rule.
 52 |      * @param \Dissect\Parser\Rule $secondRule The second conflicting grammar rule.
 53 |      * @param string $lookahead The conflicting lookahead.
 54 |      * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton The faulty automaton.
 55 |      */
 56 |     public function __construct($state, Rule $firstRule, Rule $secondRule, $lookahead, Automaton $automaton)
 57 |     {
 58 |         $firstRhs = $firstRule->getComponents();
 59 |         $secondRhs = $secondRule->getComponents();
 60 | 
 61 |         // One entry per placeholder of MESSAGE, in order of appearance.
 62 |         $placeholders = array(
 63 |             $firstRule->getNumber(),
 64 |             $firstRule->getName(),
 65 |             $firstRhs ? implode(' ', $firstRhs) : '/* empty */',
 66 |             $secondRule->getNumber(),
 67 |             $secondRule->getName(),
 68 |             $secondRhs ? implode(' ', $secondRhs) : '/* empty */',
 69 |             $lookahead,
 70 |             $state,
 71 |         );
 72 | 
 73 |         parent::__construct(vsprintf(self::MESSAGE, $placeholders), $state, $automaton);
 74 | 
 75 |         $this->lookahead = $lookahead;
 76 |         $this->secondRule = $secondRule;
 77 |         $this->firstRule = $firstRule;
 78 |     }
 79 | 
 80 |     /**
 81 |      * @return \Dissect\Parser\Rule The first conflicting rule.
 82 |      */
 83 |     public function getFirstRule()
 84 |     {
 85 |         return $this->firstRule;
 86 |     }
 87 | 
 88 |     /**
 89 |      * @return \Dissect\Parser\Rule The second conflicting rule.
 90 |      */
 91 |     public function getSecondRule()
 92 |     {
 93 |         return $this->secondRule;
 94 |     }
 95 | 
 96 |     /**
 97 |      * @return string The conflicting lookahead.
 98 |      */
 99 |     public function getLookahead()
100 |     {
101 |         return $this->lookahead;
102 |     }
103 | }
104 | 


--------------------------------------------------------------------------------
/tests/Dissect/Lexer/TokenStream/ArrayTokenStreamTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Lexer\TokenStream;
  4 | 
  5 | use Dissect\Lexer\CommonToken;
  6 | use PHPUnit_Framework_TestCase;
  7 | 
  8 | class ArrayTokenStreamTest extends PHPUnit_Framework_TestCase
  9 | {
 10 |     /**
 11 |      * @var ArrayTokenStream The five-token stream "6 + 5 - 3", rebuilt for every test.
 12 |      */
 13 |     protected $stream;
 14 | 
 15 |     protected function setUp()
 16 |     {
 17 |         $tokens = array(
 18 |             new CommonToken('INT', '6', 1, 1),
 19 |             new CommonToken('PLUS', '+', 1, 3),
 20 |             new CommonToken('INT', '5', 1, 5),
 21 |             new CommonToken('MINUS', '-', 1, 7),
 22 |             new CommonToken('INT', '3', 1, 9),
 23 |         );
 24 | 
 25 |         $this->stream = new ArrayTokenStream($tokens);
 26 |     }
 27 | 
 28 |     /**
 29 |      * @test
 30 |      */
 31 |     public function theCursorShouldBeOnFirstTokenByDefault()
 32 |     {
 33 |         $current = $this->stream->getCurrentToken();
 34 | 
 35 |         $this->assertEquals('6', $current->getValue());
 36 |     }
 37 | 
 38 |     /**
 39 |      * @test
 40 |      */
 41 |     public function getPositionShouldReturnCurrentPosition()
 42 |     {
 43 |         $stream = $this->stream;
 44 |         $stream->seek(2);
 45 |         $stream->next();
 46 | 
 47 |         $this->assertEquals(3, $stream->getPosition());
 48 |     }
 49 | 
 50 |     /**
 51 |      * @test
 52 |      */
 53 |     public function lookAheadShouldReturnTheCorrectToken()
 54 |     {
 55 |         $ahead = $this->stream->lookAhead(2);
 56 | 
 57 |         $this->assertEquals('5', $ahead->getValue());
 58 |     }
 59 | 
 60 |     /**
 61 |      * @test
 62 |      * @expectedException OutOfBoundsException
 63 |      */
 64 |     public function lookAheadShouldThrowAnExceptionWhenInvalid()
 65 |     {
 66 |         $this->stream->lookAhead(15);
 67 |     }
 68 | 
 69 |     /**
 70 |      * @test
 71 |      */
 72 |     public function getShouldReturnATokenByAbsolutePosition()
 73 |     {
 74 |         $last = $this->stream->get(4);
 75 | 
 76 |         $this->assertEquals('3', $last->getValue());
 77 |     }
 78 | 
 79 |     /**
 80 |      * @test
 81 |      * @expectedException OutOfBoundsException
 82 |      */
 83 |     public function getShouldThrowAnExceptionWhenInvalid()
 84 |     {
 85 |         $this->stream->get(15);
 86 |     }
 87 | 
 88 |     /**
 89 |      * @test
 90 |      */
 91 |     public function moveShouldMoveTheCursorByToAnAbsolutePosition()
 92 |     {
 93 |         $this->stream->move(2);
 94 |         $current = $this->stream->getCurrentToken();
 95 | 
 96 |         $this->assertEquals('5', $current->getValue());
 97 |     }
 98 | 
 99 |     /**
100 |      * @test
101 |      * @expectedException OutOfBoundsException
102 |      */
103 |     public function moveShouldThrowAnExceptionWhenInvalid()
104 |     {
105 |         $this->stream->move(15);
106 |     }
107 | 
108 |     /**
109 |      * @test
110 |      */
111 |     public function seekShouldMoveTheCursorByRelativeOffset()
112 |     {
113 |         $this->stream->seek(4);
114 |         $current = $this->stream->getCurrentToken();
115 | 
116 |         $this->assertEquals('3', $current->getValue());
117 |     }
118 | 
119 |     /**
120 |      * @test
121 |      * @expectedException OutOfBoundsException
122 |      */
123 |     public function seekShouldThrowAnExceptionWhenInvalid()
124 |     {
125 |         $this->stream->seek(15);
126 |     }
127 | 
128 |     /**
129 |      * @test
130 |      */
131 |     public function nextShouldMoveTheCursorOneTokenAhead()
132 |     {
133 |         $stream = $this->stream;
134 | 
135 |         $stream->next();
136 |         $this->assertEquals('PLUS', $stream->getCurrentToken()->getType());
137 | 
138 |         $stream->next();
139 |         $this->assertEquals('5', $stream->getCurrentToken()->getValue());
140 |     }
141 | 
142 |     /**
143 |      * @test
144 |      * @expectedException OutOfBoundsException
145 |      */
146 |     public function nextShouldThrowAnExceptionWhenAtTheEndOfTheStream()
147 |     {
148 |         // Position 4 holds the last token, so next() has nowhere to go.
149 |         $this->stream->seek(4);
150 |         $this->stream->next();
151 |     }
152 | }
153 | 


--------------------------------------------------------------------------------
/docs/common.md:
--------------------------------------------------------------------------------
  1 | Describing common syntactic structures
  2 | ======================================
  3 | 
  4 | This chapter of the documentation shows how to implement common
  5 | grammar patterns like lists & repetitions in a way that's most efficient
  6 | for a LALR(1) parser like Dissect.
  7 | 
  8 | List of 1 or more `Foo`s
  9 | ------------------------
 10 | 
 11 | ```php
 12 | $this('Foo+')
 13 |     ->is('Foo+', 'Foo')
 14 |     ->call(function ($list, $foo) {
 15 |         $list[] = $foo;
 16 | 
 17 |         return $list;
 18 |     })
 19 | 
 20 |     ->is('Foo')
 21 |     ->call(function ($foo) {
 22 |         return [$foo];
 23 |     });
 24 | ```
 25 | 
 26 | With some practice, it's very easy to see how this works: when the
 27 | parser recognizes the first `Foo`, it reduces it to a single-item array
 28 | and for each following `Foo`, it just pushes it onto the array.
 29 | 
 30 | Note that `Foo+` is just a rule name, it could be equally well called
 31 | `Foos`, `ListOfFoo` or anything else you feel like.
 32 | 
 33 | List of 0 or more `Foo`s
 34 | ------------------------
 35 | 
 36 | ```php
 37 | $this('Foo*')
 38 |     ->is('Foo*', 'Foo')
 39 |     ->call(function ($list, $foo) {
 40 |         $list[] = $foo;
 41 | 
 42 |         return $list;
 43 |     })
 44 | 
 45 |     ->is(/* empty */)
 46 |     ->call(function () {
 47 |         return [];
 48 |     });
 49 | ```
 50 | 
 51 | This works pretty much the same as the previous example, the only
 52 | difference being that we allow `Foo*` to match nothing.
 53 | 
 54 | A comma separated list
 55 | ----------------------
 56 | 
 57 | The first example of this chapter is trivial to modify to include
 58 | commas between the `Foo`s. Just change the second line to:
 59 | 
 60 | ```php
 61 | $this('Foo+')
 62 |     ->is('Foo+', ',', 'Foo')
 63 | ...
 64 | ```
 65 | 
 66 | The second example, however, cannot be modified so easily. We cannot
 67 | just put a comma in the first alternative:
 68 | 
 69 | ```php
 70 | $this('Foo*')
 71 |     ->is('Foo*', ',', 'Foo')
 72 | ...
 73 | ```
 74 | 
 75 | since that would allow the list to start with a comma:
 76 | 
 77 |     , Foo , Foo , Foo
 78 | 
 79 | Instead, we say that a "list of zero or more `Foo`s
 80 | separated by commas" is actually "a list of one or more `Foo`s separated
 81 | by commas or nothing at all". So our rule now becomes:
 82 | 
 83 | ```php
 84 | $this('Foo*')
 85 |     ->is('Foo+')
 86 | 
 87 |     ->is(/* empty */)
 88 |     ->call(function () {
 89 |         return [];
 90 |     });
 91 | 
 92 | $this('Foo+')
 93 |     ->is('Foo+', ',', 'Foo')
 94 | ...
 95 | ```
 96 | 
 97 | A note on left recursion
 98 | ------------------------
 99 | 
100 | One of the principal advantages of LR parsers over alternatives like LL
101 | or recursive descent is the ability to handle left-recursive rules,
102 | which are a natural expression of many grammar patterns. However, not
103 | only do LR parsers handle left recursion, they actually work *better*
104 | with left-recursive rules than with right-recursive ones in terms of
105 | memory, since a left-recursive rule can be recognized using a constant
106 | amount of memory, whereas for right-recursive rules, the amount of
107 | memory required grows linearly with each round of recursion.
108 | 
109 | You may have noticed that all the examples above use left recursion for
110 | two reasons: efficiency and naturalness (you read arrays from left to
111 | right, not the other way around, right?).
112 | 
113 | In short, when you *can* comfortably express your rule using left recursion,
114 | *do* so.
115 | 
116 | Expressions
117 | -----------
118 | 
119 | A grammar for very basic mathematical expressions is described in the
120 | [chapter on parsing][arith]. It would require some modifications to allow
121 | for other operators, function calls, ternary operator(s), but there are a
122 | lot of grammars for practical programming languages on the internet that
123 | you can take inspiration from.
124 | 
125 | For a familiar (although slightly less readable) example, take a look
126 | at [this grammar][php-grammar] for PHP itself.
127 | 
128 | [php-grammar]: https://github.com/php/php-src/blob/master/Zend/zend_language_parser.y
129 | [arith]: parsing.md#example-parsing-mathematical-expressions
130 | 


--------------------------------------------------------------------------------
/docs/ast.md:
--------------------------------------------------------------------------------
  1 | Building an AST
  2 | ===============
  3 | 
  4 | Often, when parsing a language that's more complex than
  5 | [mathematical expressions][prev], you will want to represent
  6 | the input as an *abstract syntax tree*, or AST (for a real-life
  7 | example, see [Twig][twig-ast] or [Gherkin][gherkin-ast]).
  8 | 
  9 | Getting the AST of the input with Dissect is nothing special; the
 10 | callbacks in your grammar can return anything, so they might as well
 11 | return AST nodes. Dissect however helps you by providing a simple base
 12 | class for the different node types: `Dissect\Node\CommonNode`.
 13 | 
 14 | Let's say we want to create an AST for the mathematical expressions from
 15 | the previous chapter. Since the input can consist of binary operations
 16 | and integers, let's create a subclass for each case:
 17 | 
 18 | ```php
 19 | use Dissect\Node\CommonNode;
 20 | use Dissect\Node\Node;
 21 | 
 22 | class BinaryExpressionNode extends CommonNode
 23 | {
 24 |     const PLUS = 1;
 25 |     const TIMES = 2;
 26 |     const POWER = 3;
 27 | 
 28 |     public function __construct(Node $left, $op, Node $right)
 29 |     {
 30 |         parent::__construct(['operator' => $op], [
 31 |             'left' => $left,
 32 |             'right' => $right,
 33 |         ]);
 34 |     }
 35 | 
 36 |     public function getLeft()
 37 |     {
 38 |         return $this->getNode('left');
 39 |     }
 40 | 
 41 |     public function getRight()
 42 |     {
 43 |         return $this->getNode('right');
 44 |     }
 45 | 
 46 |     public function getOperator()
 47 |     {
 48 |         return $this->getAttribute('operator');
 49 |     }
 50 | }
 51 | 
 52 | class IntNode extends CommonNode
 53 | {
 54 |     public function __construct($value)
 55 |     {
 56 |         parent::__construct(['value' => $value]);
 57 |     }
 58 | 
 59 |     public function getValue()
 60 |     {
 61 |         return $this->getAttribute('value');
 62 |     }
 63 | }
 64 | ```
 65 | 
 66 | The parent constructor (`CommonNode::__construct`) takes two parameters: an
 67 | array of node attributes, followed by an array of child nodes.
 68 | `Dissect\Node\Node` is an interface describing common operations for an AST node.
 69 | 
 70 | We can now easily modify the original grammar to build the AST:
 71 | 
 72 | ```php
 73 | $this('Additive')
 74 |     ->is('Additive', '+', 'Multiplicative')
 75 |     ->call(function ($l, $_, $r) {
 76 |         return new BinaryExpressionNode($l, BinaryExpressionNode::PLUS, $r);
 77 |     })
 78 | 
 79 |     ->is('Multiplicative');
 80 | 
 81 | $this('Multiplicative')
 82 |     ->is('Multiplicative', '*', 'Power')
 83 |     ->call(function ($l, $_, $r) {
 84 |         return new BinaryExpressionNode($l, BinaryExpressionNode::TIMES, $r);
 85 |     })
 86 | 
 87 |     ->is('Power');
 88 | 
 89 | $this('Power')
 90 |     ->is('Primary', '**', 'Power')
 91 |     ->call(function ($l, $_, $r) {
 92 |         return new BinaryExpressionNode($l, BinaryExpressionNode::POWER, $r);
 93 |     })
 94 | 
 95 |     ->is('Primary');
 96 | 
 97 | $this('Primary')
 98 |     ->is('(', 'Additive', ')')
 99 |     ->call(function ($l, $e, $r) {
100 |         return $e;
101 |     })
102 | 
103 |     ->is('INT')
104 |     ->call(function ($int) {
105 |         return new IntNode((int)$int->getValue());
106 |     });
107 | ```
108 | 
109 | Traversing the AST
110 | ------------------
111 | 
112 | When we have the AST of our input, we want to interpret it somehow.
113 | The most common way to do this is to create a *node visitor* (sometimes
114 | called a *tree walker*). A trivial node visitor for our example could be
115 | the following recursive function:
116 | 
117 | ```php
118 | function visit(Node $node)
119 | {
120 |     if ($node instanceof BinaryExpressionNode) {
121 |         switch ($node->getOperator()) {
122 |             case BinaryExpressionNode::PLUS:
123 |                 return visit($node->getLeft()) + visit($node->getRight());
124 |             case BinaryExpressionNode::TIMES:
125 |                 return visit($node->getLeft()) * visit($node->getRight());
126 |             case BinaryExpressionNode::POWER:
127 |                 return pow(visit($node->getLeft()), visit($node->getRight()));
128 |         }
129 |     } elseif ($node instanceof IntNode) {
130 |         return $node->getValue();
131 |     } else {
132 |         throw new \Exception("Unknown node type.");
133 |     }
134 | }
135 | 
136 | echo visit($parser->parse(...));
137 | ```
138 | 
139 | [prev]: parsing.md#example-parsing-mathematical-expressions
140 | [twig-ast]: https://github.com/fabpot/Twig/tree/master/lib/Twig/Node
141 | [gherkin-ast]: https://github.com/Behat/Gherkin/tree/master/src/Behat/Gherkin/Node
142 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/Item.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Parser\LALR1\Analysis;
  4 | 
  5 | use Dissect\Parser\Rule;
  6 | 
  7 | /**
  8 |  * A LALR(1) item.
  9 |  *
 10 |  * An item represents a state where a part of
 11 |  * a grammar rule has been recognized. The current
 12 |  * position is marked by a dot:
 13 |  *
 14 |  * <pre>
 15 |  * A -> a . b c
 16 |  * </pre>
 17 |  *
 18 |  * This means that within this item, a has been recognized
 19 |  * and b is expected. If the dot is at the very end of the
 20 |  * rule:
 21 |  *
 22 |  * <pre>
 23 |  * A -> a b c .
 24 |  * </pre>
 25 |  *
 26 |  * it means that the whole rule has been recognized and
 27 |  * can be reduced.
 28 |  *
 29 |  * @author Jakub Lédl <jakubledl@gmail.com>
 30 |  */
 31 | class Item
 32 | {
 33 |     /**
 34 |      * @var \Dissect\Parser\Rule
 35 |      */
 36 |     protected $rule;
 37 | 
 38 |     /**
 39 |      * @var int
 40 |      */
 41 |     protected $dotIndex;
 42 | 
 43 |     /**
 44 |      * @var array
 45 |      */
 46 |     protected $lookahead = array();
 47 | 
 48 |     /**
 49 |      * @var array
 50 |      */
 51 |     protected $connected = array();
 52 | 
 53 |     /**
 54 |      * Constructor.
 55 |      *
 56 |      * @param \Dissect\Parser\Rule $rule The rule of this item.
 57 |      * @param int $dotIndex The index of the dot in this item.
 58 |      */
 59 |     public function __construct(Rule $rule, $dotIndex)
 60 |     {
 61 |         $this->rule = $rule;
 62 |         $this->dotIndex = $dotIndex;
 63 |     }
 64 | 
 65 |     /**
 66 |      * Returns the dot index of this item.
 67 |      *
 68 |      * @return int The dot index.
 69 |      */
 70 |     public function getDotIndex()
 71 |     {
 72 |         return $this->dotIndex;
 73 |     }
 74 | 
 75 |     /**
 76 |      * Returns the currently expected component.
 77 |      *
 78 |      * If the item is:
 79 |      *
 80 |      * <pre>
 81 |      * A -> a . b c
 82 |      * </pre>
 83 |      *
 84 |      * then this method returns the component "b".
 85 |      *
 86 |      * @return string The component.
 87 |      */
 88 |     public function getActiveComponent()
 89 |     {
 90 |         return $this->rule->getComponent($this->dotIndex);
 91 |     }
 92 | 
 93 |     /**
 94 |      * Returns the rule of this item.
 95 |      *
 96 |      * @return \Dissect\Parser\Rule The rule.
 97 |      */
 98 |     public function getRule()
 99 |     {
100 |         return $this->rule;
101 |     }
102 | 
103 |     /**
104 |      * Determines whether this item is a reduce item.
105 |      *
106 |      * An item is a reduce item if the dot is at the very end:
107 |      *
108 |      * <pre>
109 |      * A -> a b c .
110 |      * </pre>
111 |      *
112 |      * @return boolean Whether this item is a reduce item.
113 |      */
114 |     public function isReduceItem()
115 |     {
116 |         return $this->dotIndex === count($this->rule->getComponents());
117 |     }
118 | 
119 |     /**
120 |      * Connects two items with a lookahead pumping channel.
121 |      *
122 |      * @param \Dissect\Parser\LALR1\Analysis\Item $i The item.
123 |      */
124 |     public function connect(Item $i)
125 |     {
126 |         $this->connected[] = $i;
127 |     }
128 | 
129 |     /**
130 |      * Pumps a lookahead token to this item and all items connected
131 |      * to it.
132 |      *
133 |      * @param string $lookahead The lookahead token name.
134 |      */
135 |     public function pump($lookahead)
136 |     {
137 |         if (!in_array($lookahead, $this->lookahead)) {
138 |             $this->lookahead[] = $lookahead;
139 | 
140 |             foreach ($this->connected as $item) {
141 |                 $item->pump($lookahead);
142 |             }
143 |         }
144 |     }
145 | 
146 |     /**
147 |      * Pumps several lookahead tokens.
148 |      *
149 |      * @param array $lookahead The lookahead tokens.
150 |      */
151 |     public function pumpAll(array $lookahead)
152 |     {
153 |         foreach ($lookahead as $l) {
154 |             $this->pump($l);
155 |         }
156 |     }
157 | 
158 |     /**
159 |      * Returns the computed lookahead for this item.
160 |      *
161 |      * @return string[] The lookahead symbols.
162 |      */
163 |     public function getLookahead()
164 |     {
165 |         return $this->lookahead;
166 |     }
167 | 
168 |     /**
169 |      * Returns all components that haven't been recognized
170 |      * so far.
171 |      *
172 |      * @return array The unrecognized components.
173 |      */
174 |     public function getUnrecognizedComponents()
175 |     {
176 |         return array_slice($this->rule->getComponents(), $this->dotIndex + 1);
177 |     }
178 | }
179 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Dumper/AutomatonDumper.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Parser\LALR1\Dumper;
  4 | 
  5 | use Dissect\Parser\LALR1\Analysis\Automaton;
  6 | use Dissect\Parser\LALR1\Analysis\Item;
  7 | use Dissect\Parser\LALR1\Analysis\State;
  8 | 
  9 | /**
 10 |  * Dumps the handle-finding FSA in the
 11 |  * format used by Graphviz.
 12 |  *
 13 |  * @author Jakub Lédl <jakubledl@gmail.com>
 14 |  */
 15 | class AutomatonDumper
 16 | {
 17 |     protected $automaton;
 18 | 
 19 |     /**
 20 |      * Constructor.
 21 |      *
 22 |      * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton
 23 |      */
 24 |     public function __construct(Automaton $automaton)
 25 |     {
 26 |         $this->automaton = $automaton;
 27 |     }
 28 | 
 29 |     /**
 30 |      * Dumps the entire automaton.
 31 |      *
 32 |      * @return string The automaton encoded in DOT.
 33 |      */
 34 |     public function dump()
 35 |     {
 36 |         $writer = new StringWriter();
 37 | 
 38 |         $this->writeHeader($writer);
 39 |         $writer->writeLine();
 40 | 
 41 |         foreach ($this->automaton->getStates() as $state) {
 42 |             $this->writeState($writer, $state);
 43 |         }
 44 | 
 45 |         $writer->writeLine();
 46 | 
 47 |         foreach ($this->automaton->getTransitionTable() as $num => $map) {
 48 |             foreach ($map as $trigger => $destination) {
 49 |                 $writer->writeLine(sprintf(
 50 |                     '%d -> %d [label="%s"];',
 51 |                     $num,
 52 |                     $destination,
 53 |                     $trigger
 54 |                 ));
 55 |             }
 56 |         }
 57 | 
 58 |         $writer->outdent();
 59 |         $this->writeFooter($writer);
 60 | 
 61 |         return $writer->get();
 62 |     }
 63 | 
 64 |     /**
 65 |      * Dumps only the specified state + any relevant
 66 |      * transitions.
 67 |      *
 68 |      * @param int $n The number of the state.
 69 |      *
 70 |      * @return string The output in DOT format.
 71 |      */
 72 |     public function dumpState($n)
 73 |     {
 74 |         $writer = new StringWriter();
 75 | 
 76 |         $this->writeHeader($writer, $n);
 77 |         $writer->writeLine();
 78 | 
 79 |         $this->writeState($writer, $this->automaton->getState($n));
 80 | 
 81 |         $table = $this->automaton->getTransitionTable();
 82 |         $row = isset($table[$n]) ? $table[$n] : array();
 83 | 
 84 |         foreach ($row as $dest) {
 85 |             if ($dest !== $n) {
 86 |                 $this->writeState($writer, $this->automaton->getState($dest), false);
 87 |             }
 88 |         }
 89 | 
 90 |         $writer->writeLine();
 91 | 
 92 |         foreach ($row as $trigger => $dest) {
 93 |             $writer->writeLine(sprintf(
 94 |                 '%d -> %d [label="%s"];',
 95 |                 $n,
 96 |                 $dest,
 97 |                 $trigger
 98 |             ));
 99 |         }
100 | 
101 |         $writer->outdent();
102 |         $this->writeFooter($writer);
103 | 
104 |         return $writer->get();
105 |     }
106 | 
107 |     protected function writeHeader(StringWriter $writer, $stateNumber = null)
108 |     {
109 |         $writer->writeLine(sprintf(
110 |             'digraph %s {',
111 |             $stateNumber ? 'State' . $stateNumber : 'Automaton'
112 |         ));
113 | 
114 |         $writer->indent();
115 |         $writer->writeLine('rankdir="LR";');
116 |     }
117 | 
118 |     protected function writeState(StringWriter $writer, State $state, $full = true)
119 |     {
120 |         $n = $state->getNumber();
121 | 
122 |         $string = sprintf(
123 |             '%d [label="State %d',
124 |             $n,
125 |             $n
126 |         );
127 | 
128 |         if ($full) {
129 |             $string .= '\n\n';
130 |             $items = array();
131 | 
132 |             foreach ($state->getItems() as $item) {
133 |                 $items[] = $this->formatItem($item);
134 |             }
135 | 
136 |             $string .= implode('\n', $items);
137 |         }
138 | 
139 |         $string .= '"];';
140 | 
141 |         $writer->writeLine($string);
142 |     }
143 | 
144 |     protected function formatItem(Item $item)
145 |     {
146 |         $rule = $item->getRule();
147 |         $components = $rule->getComponents();
148 | 
149 |         // the dot
150 |         array_splice($components, $item->getDotIndex(), 0, array('&bull;'));
151 | 
152 |         if ($rule->getNumber() === 0) {
153 |             $string = '';
154 |         } else {
155 |             $string = sprintf("%s &rarr; ", $rule->getName());
156 |         }
157 | 
158 |         $string .= implode(' ', $components);
159 | 
160 |         if ($item->isReduceItem()) {
161 |             $string .= sprintf(
162 |                 ' [%s]',
163 |                 implode(' ', $item->getLookahead())
164 |             );
165 |         }
166 | 
167 |         return $string;
168 |     }
169 | 
170 |     protected function writeFooter(StringWriter $writer)
171 |     {
172 |         $writer->writeLine('}');
173 |     }
174 | }
175 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Dumper/DebugTableDumper.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Parser\LALR1\Dumper;
  4 | 
  5 | use Dissect\Parser\Grammar;
  6 | 
  7 | /**
  8 |  * Dumps a parse table using the debug format,
  9 |  * with comments explaining the actions of the
 10 |  * parser.
 11 |  *
 12 |  * @author Jakub Lédl <jakubledl@gmail.com>
 13 |  */
 14 | class DebugTableDumper implements TableDumper
 15 | {
 16 |     /**
 17 |      * @var \Dissect\Parser\Grammar
 18 |      */
 19 |     protected $grammar;
 20 | 
 21 |     /**
 22 |      * @var \Dissect\Parser\LALR1\Dumper\StringWriter
 23 |      */
 24 |     protected $writer;
 25 | 
 26 |     /**
 27 |      * @var boolean
 28 |      */
 29 |     protected $written = false;
 30 | 
 31 |     /**
 32 |      * Constructor.
 33 |      *
 34 |      * @param \Dissect\Parser\Grammar $grammar The grammar of this parse table.
 35 |      */
 36 |     public function __construct(Grammar $grammar)
 37 |     {
 38 |         $this->grammar = $grammar;
 39 |         $this->writer = new StringWriter();
 40 |     }
 41 | 
 42 |     /**
 43 |      * {@inheritDoc}
 44 |      */
 45 |     public function dump(array $table)
 46 |     {
 47 |         // for readability
 48 |         ksort($table['action']);
 49 |         ksort($table['goto']);
 50 | 
 51 |         // the grammar dictates the parse table,
 52 |         // therefore the result is always the same
 53 |         if (!$this->written) {
 54 |             $this->writeHeader();
 55 |             $this->writer->indent();
 56 | 
 57 |             foreach ($table['action'] as $n => $state) {
 58 |                 $this->writeState($n, $state);
 59 |                 $this->writer->writeLine();
 60 |             }
 61 | 
 62 |             $this->writer->outdent();
 63 |             $this->writeMiddle();
 64 |             $this->writer->indent();
 65 | 
 66 |             foreach ($table['goto'] as $n => $map) {
 67 |                 $this->writeGoto($n, $map);
 68 |                 $this->writer->writeLine();
 69 |             }
 70 | 
 71 |             $this->writer->outdent();
 72 |             $this->writeFooter();
 73 | 
 74 |             $this->written = true;
 75 |         }
 76 | 
 77 |         return $this->writer->get();
 78 |     }
 79 | 
 80 |     protected function writeHeader()
 81 |     {
 82 |         $this->writer->writeLine('<?php');
 83 |         $this->writer->writeLine();
 84 |         $this->writer->writeLine('return array(');
 85 |         $this->writer->indent();
 86 |         $this->writer->writeLine("'action' => array(");
 87 |     }
 88 | 
 89 |     protected function writeState($n, array $state)
 90 |     {
 91 |         $this->writer->writeLine((string)$n . ' => array(');
 92 |         $this->writer->indent();
 93 | 
 94 |         foreach ($state as $trigger => $action) {
 95 |             $this->writeAction($trigger, $action);
 96 |             $this->writer->writeLine();
 97 |         }
 98 | 
 99 |         $this->writer->outdent();
100 |         $this->writer->writeLine('),');
101 |     }
102 | 
103 |     protected function writeAction($trigger, $action)
104 |     {
105 |         if ($action > 0) {
106 |             $this->writer->writeLine(sprintf(
107 |                 '// on %s shift and go to state %d',
108 |                 $trigger,
109 |                 $action
110 |             ));
111 |         } elseif ($action < 0) {
112 |             $rule = $this->grammar->getRule(-$action);
113 |             $components = $rule->getComponents();
114 | 
115 |             if (empty($components)) {
116 |                 $rhs = '/* empty */';
117 |             } else {
118 |                 $rhs = implode(' ', $components);
119 |             }
120 | 
121 |             $this->writer->writeLine(sprintf(
122 |                 '// on %s reduce by rule %s -> %s',
123 |                 $trigger,
124 |                 $rule->getName(),
125 |                 $rhs
126 |             ));
127 |         } else {
128 |             $this->writer->writeLine(sprintf(
129 |                 '// on %s accept the input',
130 |                 $trigger
131 |             ));
132 |         }
133 | 
134 |         $this->writer->writeLine(sprintf(
135 |             "'%s' => %d,",
136 |             $trigger,
137 |             $action
138 |         ));
139 |     }
140 | 
141 |     protected function writeMiddle()
142 |     {
143 |         $this->writer->writeLine('),');
144 |         $this->writer->writeLine();
145 |         $this->writer->writeLine("'goto' => array(");
146 |     }
147 | 
148 |     protected function writeGoto($n, array $map)
149 |     {
150 |         $this->writer->writeLine((string)$n . ' => array(');
151 |         $this->writer->indent();
152 | 
153 |         foreach ($map as $sym => $dest) {
154 |             $this->writer->writeLine(sprintf(
155 |                 '// on %s go to state %d',
156 |                 $sym,
157 |                 $dest
158 |             ));
159 | 
160 |             $this->writer->writeLine(sprintf(
161 |                 "'%s' => %d,",
162 |                 $sym,
163 |                 $dest
164 |             ));
165 | 
166 |             $this->writer->writeLine();
167 |         }
168 | 
169 |         $this->writer->outdent();
170 |         $this->writer->writeLine('),');
171 |     }
172 | 
173 |     protected function writeFooter()
174 |     {
175 |         $this->writer->writeLine('),');
176 |         $this->writer->outdent();
177 |         $this->writer->writeLine(');');
178 |     }
179 | }
180 | 


--------------------------------------------------------------------------------
/src/Dissect/Lexer/StatefulLexer.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Lexer;
  4 | 
  5 | use Dissect\Lexer\Recognizer\RegexRecognizer;
  6 | use Dissect\Lexer\Recognizer\SimpleRecognizer;
  7 | use Dissect\Util\Util;
  8 | use LogicException;
  9 | 
 10 | /**
 11 |  * The StatefulLexer works like SimpleLexer,
 12 |  * but internally keeps notion of current lexer state.
 13 |  *
 14 |  * @author Jakub Lédl <jakubledl@gmail.com>
 15 |  */
 16 | class StatefulLexer extends AbstractLexer
 17 | {
 18 |     protected $states = array();
 19 |     protected $stateStack = array();
 20 |     protected $stateBeingBuilt = null;
 21 |     protected $typeBeingBuilt = null;
 22 | 
 23 |     /**
 24 |      * Signifies that no action should be taken on encountering a token.
 25 |      */
 26 |     const NO_ACTION = 0;
 27 | 
 28 |     /**
 29 |      * Indicates that a state should be popped of the state stack on
 30 |      * encountering a token.
 31 |      */
 32 |     const POP_STATE = 1;
 33 | 
 34 |     /**
 35 |      * Adds a new token definition. If given only one argument,
 36 |      * it assumes that the token type and recognized value are
 37 |      * identical.
 38 |      *
 39 |      * @param string $type The token type.
 40 |      * @param string $value The value to be recognized.
 41 |      *
 42 |      * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface.
 43 |      */
 44 |     public function token($type, $value = null)
 45 |     {
 46 |         if ($this->stateBeingBuilt === null) {
 47 |             throw new LogicException("Define a lexer state first.");
 48 |         }
 49 | 
 50 |         if ($value === null) {
 51 |             $value = $type;
 52 |         }
 53 | 
 54 |         $this->states[$this->stateBeingBuilt]['recognizers'][$type] =
 55 |             new SimpleRecognizer($value);
 56 | 
 57 |         $this->states[$this->stateBeingBuilt]['actions'][$type] = self::NO_ACTION;
 58 | 
 59 |         $this->typeBeingBuilt = $type;
 60 | 
 61 |         return $this;
 62 |     }
 63 | 
 64 |     /**
 65 |      * Adds a new regex token definition.
 66 |      *
 67 |      * @param string $type The token type.
 68 |      * @param string $regex The regular expression used to match the token.
 69 |      *
 70 |      * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface.
 71 |      */
 72 |     public function regex($type, $regex)
 73 |     {
 74 |         if ($this->stateBeingBuilt === null) {
 75 |             throw new LogicException("Define a lexer state first.");
 76 |         }
 77 | 
 78 |         $this->states[$this->stateBeingBuilt]['recognizers'][$type] =
 79 |             new RegexRecognizer($regex);
 80 | 
 81 |         $this->states[$this->stateBeingBuilt]['actions'][$type] = self::NO_ACTION;
 82 | 
 83 |         $this->typeBeingBuilt = $type;
 84 | 
 85 |         return $this;
 86 |     }
 87 | 
 88 |     /**
 89 |      * Marks the token types given as arguments to be skipped.
 90 |      *
 91 |      * @param mixed $type,... Unlimited number of token types.
 92 |      *
 93 |      * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface.
 94 |      */
 95 |     public function skip()
 96 |     {
 97 |         if ($this->stateBeingBuilt === null) {
 98 |             throw new LogicException("Define a lexer state first.");
 99 |         }
100 | 
101 |         $this->states[$this->stateBeingBuilt]['skip_tokens'] = func_get_args();
102 | 
103 |         return $this;
104 |     }
105 | 
106 |     /**
107 |      * Registers a new lexer state.
108 |      *
109 |      * @param string $state The new state name.
110 |      *
111 |      * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface.
112 |      */
113 |     public function state($state)
114 |     {
115 |         $this->stateBeingBuilt = $state;
116 | 
117 |         $this->states[$state] = array(
118 |             'recognizers' => array(),
119 |             'actions' => array(),
120 |             'skip_tokens' => array(),
121 |         );
122 | 
123 |         return $this;
124 |     }
125 | 
126 |     /**
127 |      * Sets the starting state for the lexer.
128 |      *
129 |      * @param string $state The name of the starting state.
130 |      *
131 |      * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface.
132 |      */
133 |     public function start($state)
134 |     {
135 |         $this->stateStack[] = $state;
136 | 
137 |         return $this;
138 |     }
139 | 
140 |     /**
141 |      * Sets an action for the token type that is currently being built.
142 |      *
143 |      * @param mixed $action The action to take.
144 |      *
145 |      * @return \Dissect\Lexer\SimpleLexer This instance for fluent interface.
146 |      */
147 |     public function action($action)
148 |     {
149 |         if ($this->stateBeingBuilt === null || $this->typeBeingBuilt === null) {
150 |             throw new LogicException("Define a lexer state and type first.");
151 |         }
152 | 
153 |         $this->states[$this->stateBeingBuilt]['actions'][$this->typeBeingBuilt] = $action;
154 | 
155 |         return $this;
156 |     }
157 | 
158 |     /**
159 |      * {@inheritDoc}
160 |      */
161 |     protected function shouldSkipToken(Token $token)
162 |     {
163 |         $state = $this->states[$this->stateStack[count($this->stateStack) - 1]];
164 | 
165 |         return in_array($token->getType(), $state['skip_tokens']);
166 |     }
167 | 
168 |     /**
169 |      * {@inheritDoc}
170 |      */
171 |     protected function extractToken($string)
172 |     {
173 |         if (empty($this->stateStack)) {
174 |             throw new LogicException("You must set a starting state before lexing.");
175 |         }
176 | 
177 |         $value = $type = $action = null;
178 |         $state = $this->states[$this->stateStack[count($this->stateStack) - 1]];
179 | 
180 |         foreach ($state['recognizers'] as $t => $recognizer) {
181 |             if ($recognizer->match($string, $v)) {
182 |                 if ($value === null || Util::stringLength($v) > Util::stringLength($value)) {
183 |                     $value = $v;
184 |                     $type = $t;
185 |                     $action = $state['actions'][$type];
186 |                 }
187 |             }
188 |         }
189 | 
190 |         if ($type !== null) {
191 |             if (is_string($action)) { // enter new state
192 |                 $this->stateStack[] = $action;
193 |             } elseif ($action === self::POP_STATE) {
194 |                 array_pop($this->stateStack);
195 |             }
196 | 
197 |             return new CommonToken($type, $value, $this->getCurrentLine());
198 |         }
199 | 
200 |         return null;
201 |     }
202 | }
203 | 


--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Analysis/AnalyzerTest.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Parser\LALR1\Analysis;
  4 | 
  5 | use Dissect\Parser\LALR1\Analysis\Exception\ReduceReduceConflictException;
  6 | use Dissect\Parser\Grammar;
  7 | use Dissect\Parser\Parser;
  8 | use PHPUnit_Framework_TestCase;
  9 | 
 10 | class AnalyzerTest extends PHPUnit_Framework_TestCase
 11 | {
 12 |     protected $analyzer = null;
 13 | 
 14 |     /**
 15 |      * @test
 16 |      */
 17 |     public function automatonShouldBeCorrectlyBuilt()
 18 |     {
 19 |         $grammar = new Grammar();
 20 | 
 21 |         $grammar('S')
 22 |             ->is('a', 'S', 'b')
 23 |             ->is();
 24 | 
 25 |         $grammar->start('S');
 26 | 
 27 |         $result = $this->getAnalysisResult($grammar);
 28 |         $table = $result->getAutomaton()->getTransitionTable();
 29 | 
 30 |         $this->assertEquals(1, $table[0]['S']);
 31 |         $this->assertEquals(2, $table[0]['a']);
 32 |         $this->assertEquals(2, $table[2]['a']);
 33 |         $this->assertEquals(3, $table[2]['S']);
 34 |         $this->assertEquals(4, $table[3]['b']);
 35 |     }
 36 | 
 37 |     /**
 38 |      * @test
 39 |      */
 40 |     public function lookaheadShouldBeCorrectlyPumped()
 41 |     {
 42 |         $grammar = new Grammar();
 43 | 
 44 |         $grammar('S')
 45 |             ->is('A', 'B', 'C', 'D');
 46 | 
 47 |         $grammar('A')
 48 |             ->is('a');
 49 | 
 50 |         $grammar('B')
 51 |             ->is('b');
 52 | 
 53 |         $grammar('C')
 54 |             ->is(/* empty */);
 55 | 
 56 |         $grammar('D')
 57 |             ->is('d');
 58 | 
 59 |         $grammar->start('S');
 60 | 
 61 |         $automaton = $this->getAnalysisResult($grammar)->getAutomaton();
 62 | 
 63 |         $this->assertEquals(
 64 |             array(Parser::EOF_TOKEN_TYPE),
 65 |             $automaton->getState(1)->get(0, 1)->getLookahead()
 66 |         );
 67 | 
 68 |         $this->assertEquals(
 69 |             array('b'),
 70 |             $automaton->getState(3)->get(2, 1)->getLookahead()
 71 |         );
 72 | 
 73 |         $this->assertEquals(
 74 |             array('d'),
 75 |             $automaton->getState(4)->get(4, 0)->getLookahead()
 76 |         );
 77 | 
 78 |         $this->assertEquals(
 79 |             array('d'),
 80 |             $automaton->getState(5)->get(3, 1)->getLookahead()
 81 |         );
 82 | 
 83 |         $this->assertEquals(
 84 |             array(Parser::EOF_TOKEN_TYPE),
 85 |             $automaton->getState(7)->get(1, 4)->getLookahead()
 86 |         );
 87 | 
 88 |         $this->assertEquals(
 89 |             array(Parser::EOF_TOKEN_TYPE),
 90 |             $automaton->getState(8)->get(5, 1)->getLookahead()
 91 |         );
 92 |     }
 93 | 
 94 |     /**
 95 |      * @test
 96 |      */
 97 |     public function parseTableShouldBeCorrectlyBuilt()
 98 |     {
 99 |         $grammar = new Grammar();
100 | 
101 |         $grammar('S')
102 |             ->is('a', 'S', 'b')
103 |             ->is(/* empty */);
104 | 
105 |         $grammar->start('S');
106 | 
107 |         $table = $this->getAnalysisResult($grammar)->getParseTable();
108 | 
109 |         // shift(2)
110 |         $this->assertEquals(2, $table['action'][0]['a']);
111 | 
112 |         // reduce(S -> )
113 |         $this->assertEquals(-2, $table['action'][0][Parser::EOF_TOKEN_TYPE]);
114 | 
115 |         // accept
116 |         $this->assertEquals(0, $table['action'][1][Parser::EOF_TOKEN_TYPE]);
117 | 
118 |         // shift(2)
119 |         $this->assertEquals(2, $table['action'][2]['a']);
120 | 
121 |         // reduce(S -> )
122 |         $this->assertEquals(-2, $table['action'][2]['b']);
123 | 
124 |         // shift(4)
125 |         $this->assertEquals(4, $table['action'][3]['b']);
126 | 
127 |         // reduce(S -> a S b)
128 |         $this->assertEquals(-1, $table['action'][4]['b']);
129 |         $this->assertEquals(-1, $table['action'][4][Parser::EOF_TOKEN_TYPE]);
130 | 
131 |         $this->assertEquals(1, $table['goto'][0]['S']);
132 |         $this->assertEquals(3, $table['goto'][2]['S']);
133 |     }
134 | 
135 |     /**
136 |      * Analyzing a grammar with a reduce/reduce conflict must throw.
137 |      *
138 |      * @test
139 |      */
140 |     public function unexpectedConflictsShouldThrowAnException()
141 |     {
142 |         // C and E both derive the empty string, so after "a b" either of
143 |         // them could be reduced when the lookahead is "d".
144 |         $grammar = new Grammar();
145 |         $grammar('S')
146 |             ->is('a', 'b', 'C', 'd')
147 |             ->is('a', 'b', 'E', 'd');
148 |         $grammar('C')->is(/* empty */);
149 |         $grammar('E')->is(/* empty */);
150 |         $grammar->start('S');
151 | 
152 |         try {
153 |             $this->getAnalysisResult($grammar);
154 |         } catch (ReduceReduceConflictException $conflict) {
155 |             $this->assertEquals(3, $conflict->getStateNumber());
156 |             $this->assertEquals('d', $conflict->getLookahead());
157 |             $this->assertEquals(3, $conflict->getFirstRule()->getNumber());
158 |             $this->assertEquals(4, $conflict->getSecondRule()->getNumber());
159 |             return;
160 |         }
161 | 
162 |         $this->fail('Expected an exception warning of a reduce/reduce conflict.');
163 |     }
164 | 
165 |     /**
166 |      * Conflicts resolved according to the grammar's mode must be recorded.
167 |      *
168 |      * @test
169 |      */
170 |     public function expectedConflictsShouldBeRecorded()
171 |     {
172 |         $grammar = new Grammar();
173 | 
174 |         $grammar('S')
175 |             ->is('S', 'S', 'S')
176 |             ->is('S', 'S')
177 |             ->is('b');
178 | 
179 |         // Grammar::ALL switches on every conflict resolution strategy.
180 |         $grammar->resolve(Grammar::ALL);
181 |         $grammar->start('S');
182 | 
183 |         $recorded = $this->getAnalysisResult($grammar)->getResolvedConflicts();
184 | 
185 |         $this->assertCount(4, $recorded);
186 | 
187 |         // Shift/reduce conflicts: index in the list, state, lookahead, rule number.
188 |         $shiftReduce = array(
189 |             array(0, 3, 'b', 2),
190 |             array(1, 4, 'b', 1),
191 |             array(3, 4, 'b', 2),
192 |         );
193 | 
194 |         foreach ($shiftReduce as $expected) {
195 |             list($index, $state, $lookahead, $ruleNumber) = $expected;
196 |             $actual = $recorded[$index];
197 | 
198 |             $this->assertEquals($state, $actual['state']);
199 |             $this->assertEquals($lookahead, $actual['lookahead']);
200 |             $this->assertEquals($ruleNumber, $actual['rule']->getNumber());
201 |             $this->assertEquals(Grammar::SHIFT, $actual['resolution']);
202 |         }
203 | 
204 |         // The remaining entry is a reduce/reduce conflict between rules 1 and 2.
205 |         $reduceReduce = $recorded[2];
206 | 
207 |         $this->assertEquals(4, $reduceReduce['state']);
208 |         $this->assertEquals(Parser::EOF_TOKEN_TYPE, $reduceReduce['lookahead']);
209 |         $this->assertEquals(1, $reduceReduce['rules'][0]->getNumber());
210 |         $this->assertEquals(2, $reduceReduce['rules'][1]->getNumber());
211 |         $this->assertEquals(Grammar::LONGER_REDUCE, $reduceReduce['resolution']);
212 |     }
213 | 
214 |     protected function getAnalysisResult(Grammar $grammar)
215 |     {
216 |         return $this->getAnalyzer()->analyze($grammar); // analyzer comes from getAnalyzer()
217 |     }
218 | 
219 |     protected function getAnalyzer()
220 |     {
221 |         // Create the analyzer on first use; later calls reuse the instance.
222 |         if (null === $this->analyzer) {
223 |             $this->analyzer = new Analyzer();
224 |         }
225 |         return $this->analyzer;
226 |     }
227 | }
228 | 


--------------------------------------------------------------------------------
/src/Dissect/Console/Command/DissectCommand.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Console\Command;
  4 | 
  5 | use Dissect\Parser\LALR1\Analysis\Exception\ConflictException;
  6 | use Dissect\Parser\LALR1\Analysis\Analyzer;
  7 | use Dissect\Parser\LALR1\Dumper\AutomatonDumper;
  8 | use Dissect\Parser\LALR1\Dumper\DebugTableDumper;
  9 | use Dissect\Parser\LALR1\Dumper\ProductionTableDumper;
 10 | use Dissect\Parser\Grammar;
 11 | use Symfony\Component\Console\Command\Command;
 12 | use Symfony\Component\Console\Input\InputArgument;
 13 | use Symfony\Component\Console\Input\InputInterface;
 14 | use Symfony\Component\Console\Input\InputOption;
 15 | use Symfony\Component\Console\Output\OutputInterface;
 16 | use ReflectionClass;
 17 | 
 18 | /**
 19 |  * Console command that analyzes a grammar class and writes its parse table
 20 |  * and, on request, a Graphviz export of the LALR(1) automaton.
 21 |  */
 22 | class DissectCommand extends Command
 23 | {
 24 |     /**
 25 |      * Registers the command name, its argument, its options and its help text.
 26 |      */
 27 |     protected function configure()
 28 |     {
 29 |         $this
 30 |             ->setName('dissect')
 31 |             ->addArgument('grammar-class', InputArgument::REQUIRED, 'The grammar class.')
 32 |             ->addOption('debug', 'd', InputOption::VALUE_NONE, 'Writes the parse table in the debug format.')
 33 |             ->addOption('dfa', 'D', InputOption::VALUE_NONE, 'Exports the LALR(1) DFA as a Graphviz graph.')
 34 |             ->addOption('state', 's', InputOption::VALUE_REQUIRED, 'Exports only the specified state instead of the entire DFA.')
 35 |             ->addOption('output-dir', 'o', InputOption::VALUE_REQUIRED, 'Overrides the default output directory.')
 36 |             ->setHelp(<<<EOT
 37 | Analyzes the given grammar and, if successful, exports the parse table to a PHP
 38 | file.
 39 | 
 40 | By default, the output directory is taken to be the one in which the grammar is
 41 | defined. You can change that with the <info>--output-dir</info> option:
 42 | 
 43 |  <info>--output-dir=../some/other/dir</info>
 44 | 
 45 | The parse table is by default written with minimal whitespace to make it compact.
 46 | If you wish to inspect the table manually, you can export it in a readable and
 47 | well-commented way with the <info>--debug</info> option.
 48 | 
 49 | If you wish to inspect the handle-finding automaton for your grammar (perhaps
 50 | to aid with grammar debugging), use the <info>--dfa</info> option. When in use, Dissect
 51 | will create a file with the automaton exported as a Graphviz graph
 52 | in the output directory.
 53 | 
 54 | Additionally, you can use the <info>--state</info> option to export only the specified
 55 | state and any relevant transitions:
 56 | 
 57 |  <info>--dfa --state=5</info>
 58 | EOT
 59 |             );
 60 |     }
 61 | 
 62 |     /**
 63 |      * Analyzes the grammar and writes the requested files.
 64 |      *
 65 |      * @param InputInterface  $input  The console input.
 66 |      * @param OutputInterface $output The console output.
 67 |      *
 68 |      * @return int 0 on success; 1 if the grammar class or the requested state
 69 |      *             does not exist, the analysis reports a conflict or a file
 70 |      *             cannot be written.
 71 |      */
 72 |     protected function execute(InputInterface $input, OutputInterface $output)
 73 |     {
 74 |         // Forward slashes are accepted in place of namespace separators.
 75 |         $class = strtr(
 76 |             $input->getArgument('grammar-class'),
 77 |             '/',
 78 |             '\\'
 79 |         );
 80 |         $formatter = $this->getHelperSet()->get('formatter');
 81 |         $exitCode = 0;
 82 | 
 83 |         $output->writeln('<info>Analyzing...</info>');
 84 |         $output->writeln('');
 85 | 
 86 |         if (!class_exists($class)) {
 87 |             $output->writeln(array(
 88 |                 $formatter->formatBlock(
 89 |                     sprintf('The class "%s" could not be found.', $class),
 90 |                     'error',
 91 |                     true
 92 |                 ),
 93 |             ));
 94 | 
 95 |             return 1;
 96 |         }
 97 | 
 98 |         $grammar = new $class();
 99 |         $outputDir = $this->resolveOutputDir($input->getOption('output-dir'), $class);
100 | 
101 |         $analyzer = new Analyzer();
102 |         $automaton = null;
103 | 
104 |         try {
105 |             $result = $analyzer->analyze($grammar);
106 |             $conflicts = $result->getResolvedConflicts();
107 |             $automaton = $result->getAutomaton();
108 |             $table = $result->getParseTable();
109 | 
110 |             if ($conflicts) {
111 |                 foreach ($conflicts as $conflict) {
112 |                     $output->writeln($this->formatConflict($conflict));
113 |                 }
114 | 
115 |                 $output->writeln(sprintf(
116 |                     '<info><comment>%d</comment> conflicts in total</info>',
117 |                     count($conflicts)
118 |                 ));
119 | 
120 |                 $output->writeln('');
121 |             }
122 | 
123 |             $output->writeln('<info>Writing the parse table...</info>');
124 | 
125 |             $fileName = $outputDir . DIRECTORY_SEPARATOR . 'parse_table.php';
126 | 
127 |             if ($input->getOption('debug')) {
128 |                 $tableDumper = new DebugTableDumper($grammar);
129 |             } else {
130 |                 $tableDumper = new ProductionTableDumper();
131 |             }
132 | 
133 |             $code = $tableDumper->dump($table);
134 | 
135 |             // The warning is suppressed because the failure is reported below.
136 |             $ret = @file_put_contents($fileName, $code);
137 |             if ($ret === false) {
138 |                 $output->writeln('<error>Error writing the parse table</error>');
139 |                 $exitCode = 1;
140 |             } else {
141 |                 $output->writeln('<info>Parse table written</info>');
142 |             }
143 |         } catch (ConflictException $e) {
144 |             $output->writeln(array(
145 |                 $formatter->formatBlock(
146 |                     explode("\n", $e->getMessage()),
147 |                     'error',
148 |                     true
149 |                 ),
150 |             ));
151 | 
152 |             // The exception carries the automaton, so --dfa can still export
153 |             // it even though no parse table was written.
154 |             $automaton = $e->getAutomaton();
155 |             $exitCode = 1;
156 |         }
157 | 
158 |         if ($input->getOption('dfa')) {
159 |             $output->writeln('');
160 | 
161 |             $automatonDumper = new AutomatonDumper($automaton);
162 | 
163 |             if ($input->getOption('state') === null) {
164 |                 $output->writeln('<info>Exporting the DFA...</info>');
165 | 
166 |                 $dot = $automatonDumper->dump();
167 |                 $file = 'automaton.dot';
168 |             } else {
169 |                 $state = (int)$input->getOption('state');
170 | 
171 |                 if (!$automaton->hasState($state)) {
172 |                     $output->writeln(array(
173 |                         $formatter->formatBlock(
174 |                             sprintf('The automaton has no state #%d', $state),
175 |                             'error',
176 |                             true
177 |                         ),
178 |                     ));
179 | 
180 |                     return 1;
181 |                 }
182 | 
183 |                 $output->writeln(sprintf(
184 |                     '<info>Exporting the DFA state <comment>%d</comment>...</info>',
185 |                     $state
186 |                 ));
187 | 
188 |                 $dot = $automatonDumper->dumpState($state);
189 |                 $file = sprintf('state_%d.dot', $state);
190 |             }
191 | 
192 |             $fileName = $outputDir . DIRECTORY_SEPARATOR . $file;
193 |             $ret = @file_put_contents($fileName, $dot);
194 | 
195 |             if ($ret === false) {
196 |                 $output->writeln('<error>Error writing to the file</error>');
197 |                 $exitCode = 1;
198 |             } else {
199 |                 $output->writeln('<info>Successfully exported</info>');
200 |             }
201 |         }
202 | 
203 |         return $exitCode;
204 |     }
205 | 
206 |     /**
207 |      * Determines the directory the generated files are written to.
208 |      *
209 |      * @param string|null $dir   The value of the --output-dir option, if given.
210 |      * @param string      $class The name of the grammar class.
211 |      *
212 |      * @return string The output directory.
213 |      */
214 |     private function resolveOutputDir($dir, $class)
215 |     {
216 |         if (!$dir) {
217 |             // Default to the directory of the file that defines the grammar.
218 |             $refl = new ReflectionClass($class);
219 | 
220 |             return pathinfo($refl->getFileName(), PATHINFO_DIRNAME);
221 |         }
222 | 
223 |         // Absolute paths (leading slash or backslash, or a Windows drive
224 |         // letter) are used as given instead of being appended to the CWD.
225 |         if (preg_match('~^(?:[/\\\\]|[a-zA-Z]:[/\\\\])~', $dir) === 1) {
226 |             return $dir;
227 |         }
228 | 
229 |         $cwd = rtrim(getcwd(), DIRECTORY_SEPARATOR);
230 | 
231 |         return $cwd . DIRECTORY_SEPARATOR . $dir;
232 |     }
233 | 
234 |     /**
235 |      * Formats a resolved conflict as one line of console output.
236 |      *
237 |      * @param array $conflict The conflict; the keys "resolution", "state"
238 |      *                        and "lookahead" are read.
239 |      *
240 |      * @return string The formatted message.
241 |      */
242 |     protected function formatConflict(array $conflict)
243 |     {
244 |         // Any resolution other than Grammar::SHIFT is reported as reduce/reduce.
245 |         $type = $conflict['resolution'] === Grammar::SHIFT
246 |             ? 'shift/reduce'
247 |             : 'reduce/reduce';
248 | 
249 |         return sprintf(
250 |             "<info>Resolved a <comment>%s</comment> conflict in state <comment>%d</comment> on lookahead <comment>%s</comment></info>",
251 |             $type,
252 |             $conflict['state'],
253 |             $conflict['lookahead']
254 |         );
255 |     }
256 | }
257 | 


--------------------------------------------------------------------------------
/docs/lexing.md:
--------------------------------------------------------------------------------
  1 | Lexical analysis with Dissect
  2 | =============================
  3 | 
  4 | There are three classes for lexical analysis in Dissect, all under the
  5 | namespace `Dissect\Lexer`: `SimpleLexer`, `StatefulLexer` and `RegexLexer`.
  6 | 
  7 | SimpleLexer
  8 | -----------
  9 | 
 10 | `SimpleLexer` simply accepts some token definitions and applies them on
 11 | the input. Let's create a subclass for this chapter:
 12 | 
 13 | ```php
 14 | use Dissect\Lexer\SimpleLexer;
 15 | 
 16 | class ArithLexer extends SimpleLexer
 17 | {
 18 |     public function __construct()
 19 |     {
 20 |         // token definitions
 21 |     }
 22 | }
 23 | ```
 24 | 
 25 | ### Defining tokens
 26 | 
 27 | There are 3 ways to define a token. The simplest one looks like this:
 28 | 
 29 | ```php
 30 | $this->token('+');
 31 | ```
 32 | 
 33 | This definition will simply match a plus symbol, using `+` both as the
 34 | name and value of the token. You can use 2 arguments:
 35 | 
 36 | ```php
 37 | $this->token('CLASS', 'class');
 38 | ```
 39 | 
 40 | if you want the token name (first argument) to differ from what will actually be
 41 | recognized (second argument).
 42 | 
 43 | The final way defines a token by a regular expression:
 44 | 
 45 | ```php
 46 | $this->regex('INT', '/^[1-9][0-9]*/');
 47 | ```
 48 | 
 49 | Let's now define some tokens we will use in the next chapter:
 50 | 
 51 | ```php
 52 | class ArithLexer extends SimpleLexer
 53 | {
 54 |     public function __construct()
 55 |     {
 56 |         $this->regex('INT', '/^[1-9][0-9]*/');
 57 |         $this->token('(');
 58 |         $this->token(')');
 59 |         $this->token('+');
 60 |         $this->token('*');
 61 |         $this->token('**');
 62 |     }
 63 | }
 64 | ```
 65 | 
 66 | > **Tip**: You can also chain the method calls using a fluent interface.
 67 | 
 68 | ### Skipping tokens
 69 | 
 70 | Some tokens have to be recognized, but we don't want them cluttering the
 71 | output. The best example is probably whitespace tokens: the lexer has
 72 | to recognize them, but they carry no meaning or value, so we can tell
 73 | the lexer to `skip` them:
 74 | 
 75 | ```php
 76 | class ArithLexer extends SimpleLexer
 77 | {
 78 |     public function __construct()
 79 |     {
 80 |         $this->regex('INT', '/^[1-9][0-9]*/');
 81 |         $this->token('(');
 82 |         $this->token(')');
 83 |         $this->token('+');
 84 |         $this->token('*');
 85 |         $this->token('**');
 86 | 
 87 |         $this->regex('WSP', "/^[ \r\n\t]+/");
 88 |         $this->skip('WSP');
 89 |     }
 90 | }
 91 | ```
 92 | 
 93 | > You can pass any number of token names to the `skip` method.
 94 | 
 95 | ### Lexing
 96 | 
 97 | Now that we've defined our tokens, we can simply call:
 98 | 
 99 | ```php
100 | $lexer = new ArithLexer();
101 | $stream = $lexer->lex($input);
102 | ```
103 | 
104 | The return value is an object implementing the
105 | `Dissect\Lexer\TokenStream\TokenStream` interface. The interface defines
106 | several methods you can use to inspect and move through the token
107 | stream. See [TokenStream.php][tokenstream] for all the methods you can
108 | use.
109 | 
110 | > If you `count` the token stream, you may be surprised to find out that
111 | > for input like `5 + 3`, it actually contains 4 tokens. That's because,
112 | > as the last step of lexing, a special token called `$eof` is appended
113 | > to mark the end of input. This is crucial to the parsing process, so
114 | > please, never define a token called `$eof` yourself. It could lead to
115 | > some pretty strange errors. Other forbidden token names are `$start`
116 | > and `$epsilon`.
117 | 
118 | StatefulLexer
119 | -------------
120 | 
121 | `SimpleLexer` should work fine for general use cases. However, let's
122 | imagine we're lexing a very simple templating language:
123 | 
124 |     Outer content, {{ variable_name }}, other outer content
125 | 
126 | `SimpleLexer` falls short here, because the outer content can be pretty
127 | much anything, while the content inside the tags has to be strictly
128 | interpreted. Furthermore, if we were to work with this template, we'd
129 | want to skip the whitespace inside tags, but keep it in the outer
130 | content.
131 | 
132 | That's where `StatefulLexer` comes in; during lexing, it maintains a
133 | stack of states with the top one being the current one, and for each
134 | token, you can define the action the lexer should take after recognizing
135 | it. Let's see an example for our templating language:
136 | 
137 | ```php
138 | use Dissect\Lexer\StatefulLexer;
139 | 
140 | class TemplateLexer extends StatefulLexer
141 | {
142 |     public function __construct()
143 |     {
144 |         $this->state('outside')
145 |             ->regex('CONTENT', '/^[^"{{"]*/')
146 |             ->token('{{')->action('tag');
147 | 
148 |         $this->state('tag')
149 |             ->regex('WSP', "/^[ \r\n\t]+/")
150 |             ->regex('VAR', '/^[a-zA-Z_]+/')
151 |             ->token('}}')->action(StatefulLexer::POP_STATE)
152 |             ->skip('WSP');
153 | 
154 |         $this->start('outside');
155 |     }
156 | }
157 | ```
158 | 
159 | Please note that before defining any tokens, we have to define a state.
160 | For the tokens that cause the state transition, we call `action` to
161 | specify what the lexer should do. The action can be either a string, in
162 | which case the lexer goes to the state specified by the string, or
163 | `StatefulLexer::POP_STATE`, which causes the lexer to pop the current
164 | state off the stack, essentially going back to the previous state.
165 | Finally, we tell the lexer in which state to start by calling `start`.
166 | 
167 | Improving lexer performance
168 | ---------------------------
169 | 
170 | There's one important trick to improve the performance of your lexers.
171 | The documentation uses it implicitly, but it requires an explicit mention:
172 | 
173 | When using one of the lexer classes documented above and defining tokens
174 | using regular expressions, *always* anchor the regex at the beginning
175 | using `^` like this:
176 | 
177 | ```php
178 | $this->regex('INT', '/^[1-9][0-9]*/');
179 | ```
180 | 
181 | This little optimization will lead to substantial performance gains on
182 | any but the shortest input strings, since without anchoring, the PCRE
183 | engine would always look for matches throughout the entire remaining
184 | input string, which would be incredibly wasteful for long inputs.
185 | 
186 | RegexLexer
187 | ----------
188 | 
189 | When designing the lexer classes, my goal was not to sacrifice
190 | user-friendliness for performance. However, I'm well aware that there
191 | are use cases that require the highest performance possible. That's
192 | why I adapted the highly performant but slightly less user-friendly
193 | [lexer][doctrinelexer] from [doctrine][doctrine] into Dissect.
194 | 
195 | The usage is almost identical to the original class; writing a lexer
196 | for the arithmetic expressions could look something like this:
197 | 
198 | ```php
199 | use Dissect\Lexer\RegexLexer;
200 | use RuntimeException;
201 | 
202 | class ArithLexer extends RegexLexer
203 | {
204 |     protected $tokens = ['+', '*', '**', '(', ')'];
205 | 
206 |     protected function getCatchablePatterns()
207 |     {
208 |         return ['[1-9][0-9]*'];
209 |     }
210 | 
211 |     protected function getNonCatchablePatterns()
212 |     {
213 |         return ['\s+'];
214 |     }
215 | 
216 |     protected function getType(&$value)
217 |     {
218 |         if (is_numeric($value)) {
219 |             $value = (int)$value;
220 | 
221 |             return 'INT';
222 |         } elseif (in_array($value, $this->tokens)) {
223 |             // the types of the simple tokens equal their values here
224 |             return $value;
225 |         } else {
226 |             throw new RuntimeException(sprintf('Invalid token "%s"', $value));
227 |         }
228 |     }
229 | }
230 | ```
231 | 
232 | Continue
233 | --------
234 | 
235 | Now that we've demonstrated how to perform lexical analysis with
236 | Dissect, we can move onto syntactical analysis, commonly known as
237 | [parsing][parsing].
238 | 
239 | [tokenstream]: ../src/Dissect/Lexer/TokenStream/TokenStream.php
240 | [parsing]: parsing.md
241 | [doctrinelexer]: https://github.com/doctrine/lexer/blob/master/lib/Doctrine/Common/Lexer/AbstractLexer.php
242 | [doctrine]: https://github.com/doctrine/lexer
243 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/Grammar.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Parser;
  4 | 
  5 | use LogicException;
  6 | 
  7 | /**
  8 |  * Represents a context-free grammar.
  9 |  *
 10 |  * @author Jakub Lédl <jakubledl@gmail.com>
 11 |  */
 12 | class Grammar
 13 | {
 14 |     /**
 15 |      * The name given to the rule the grammar is augmented with
 16 |      * when start() is called.
 17 |      */
 18 |     const START_RULE_NAME = '$start';
 19 | 
 20 |     /**
 21 |      * The epsilon symbol signifies an empty production.
 22 |      */
 23 |     const EPSILON = '$epsilon';
 24 | 
 25 |     /**
 26 |      * @var \Dissect\Parser\Rule[] The rules indexed by their number.
 27 |      */
 28 |     protected $rules = array();
 29 | 
 30 |     /**
 31 |      * @var array The rules grouped by the name of their nonterminal.
 32 |      */
 33 |     protected $groupedRules = array();
 34 | 
 35 |     /**
 36 |      * @var int The number given to the next rule defined with is().
 37 |      */
 38 |     protected $nextRuleNumber = 1;
 39 | 
 40 |     /**
 41 |      * @var int The bitmask of the conflict resolution mode.
 42 |      */
 43 |     protected $conflictsMode = 9; // SHIFT | OPERATORS
 44 | 
 45 |     /**
 46 |      * @var string The nonterminal selected by the last __invoke() call.
 47 |      */
 48 |     protected $currentNonterminal;
 49 | 
 50 |     /**
 51 |      * @var \Dissect\Parser\Rule The rule created by the last is() call, if any.
 52 |      */
 53 |     protected $currentRule;
 54 | 
 55 |     /**
 56 |      * @var array Operator info ("prec" and "assoc") indexed by token.
 57 |      */
 58 |     protected $operators = array();
 59 | 
 60 |     /**
 61 |      * @var array The tokens of the operator group being described, if any.
 62 |      */
 63 |     protected $currentOperators;
 64 | 
 65 |     /**
 66 |      * Signifies that the parser should not resolve any
 67 |      * grammar conflicts.
 68 |      */
 69 |     const NONE = 0;
 70 | 
 71 |     /**
 72 |      * Signifies that the parser should resolve
 73 |      * shift/reduce conflicts by always shifting.
 74 |      */
 75 |     const SHIFT = 1;
 76 | 
 77 |     /**
 78 |      * Signifies that the parser should resolve
 79 |      * reduce/reduce conflicts by reducing with
 80 |      * the longer rule.
 81 |      */
 82 |     const LONGER_REDUCE = 2;
 83 | 
 84 |     /**
 85 |      * Signifies that the parser should resolve
 86 |      * reduce/reduce conflicts by reducing
 87 |      * with the rule that was given earlier in
 88 |      * the grammar.
 89 |      */
 90 |     const EARLIER_REDUCE = 4;
 91 | 
 92 |     /**
 93 |      * Signifies that the conflicts should be
 94 |      * resolved by taking operator precedence
 95 |      * into account.
 96 |      */
 97 |     const OPERATORS = 8;
 98 | 
 99 |     /**
100 |      * Signifies that the parser should automatically
101 |      * resolve all grammar conflicts.
102 |      */
103 |     const ALL = 15;
104 | 
105 |     /**
106 |      * Left operator associativity.
107 |      */
108 |     const LEFT = 0;
109 | 
110 |     /**
111 |      * Right operator associativity.
112 |      */
113 |     const RIGHT = 1;
114 | 
115 |     /**
116 |      * The operator is nonassociative.
117 |      */
118 |     const NONASSOC = 2;
119 | 
120 |     /**
121 |      * Selects the nonterminal that subsequent is() calls define rules for.
122 |      *
123 |      * @param string $nonterminal The name of the nonterminal.
124 |      *
125 |      * @return \Dissect\Parser\Grammar This instance.
126 |      */
127 |     public function __invoke($nonterminal)
128 |     {
129 |         $this->currentNonterminal = $nonterminal;
130 | 
131 |         return $this;
132 |     }
133 | 
134 |     /**
135 |      * Defines an alternative for a grammar rule.
136 |      *
137 |      * @param string... The components of the rule.
138 |      *
139 |      * @return \Dissect\Parser\Grammar This instance.
140 |      *
141 |      * @throws \LogicException If no nonterminal has been selected yet.
142 |      */
143 |     public function is()
144 |     {
145 |         // Starting a rule ends the description of an operator group.
146 |         $this->currentOperators = null;
147 | 
148 |         if ($this->currentNonterminal === null) {
149 |             throw new LogicException(
150 |                 'You must specify a name of the rule first.'
151 |             );
152 |         }
153 | 
154 |         $num = $this->nextRuleNumber++;
155 | 
156 |         $rule = new Rule($num, $this->currentNonterminal, func_get_args());
157 | 
158 |         $this->rules[$num] =
159 |             $this->currentRule =
160 |             $this->groupedRules[$this->currentNonterminal][] =
161 |             $rule;
162 | 
163 |         return $this;
164 |     }
165 | 
166 |     /**
167 |      * Sets the callback for the current rule.
168 |      *
169 |      * @param callable $callback The callback.
170 |      *
171 |      * @return \Dissect\Parser\Grammar This instance.
172 |      *
173 |      * @throws \LogicException If no rule is currently being described.
174 |      */
175 |     public function call($callback)
176 |     {
177 |         if ($this->currentRule === null) {
178 |             throw new LogicException(
179 |                 'You must specify a rule first.'
180 |             );
181 |         }
182 | 
183 |         $this->currentRule->setCallback($callback);
184 | 
185 |         return $this;
186 |     }
187 | 
188 |     /**
189 |      * Returns the set of rules of this grammar.
190 |      *
191 |      * @return \Dissect\Parser\Rule[] The rules.
192 |      */
193 |     public function getRules()
194 |     {
195 |         return $this->rules;
196 |     }
197 | 
198 |     /**
199 |      * Returns the rule with the given number.
200 |      *
201 |      * @param int $number The number of the rule.
202 |      *
203 |      * @return \Dissect\Parser\Rule The rule.
204 |      */
205 |     public function getRule($number)
206 |     {
207 |         return $this->rules[$number];
208 |     }
209 | 
210 |     /**
211 |      * Returns the nonterminal symbols of this grammar.
212 |      *
213 |      * @return string[] The nonterminals.
214 |      */
215 |     public function getNonterminals()
216 |     {
217 |         // Every nonterminal defined with is() has an entry in $groupedRules;
218 |         // the class keeps no separate list of nonterminals.
219 |         return array_keys($this->groupedRules);
220 |     }
221 | 
222 |     /**
223 |      * Returns rules grouped by nonterminal name.
224 |      *
225 |      * @return array The rules grouped by nonterminal name.
226 |      */
227 |     public function getGroupedRules()
228 |     {
229 |         return $this->groupedRules;
230 |     }
231 | 
232 |     /**
233 |      * Sets a start rule for this grammar.
234 |      *
235 |      * @param string $name The name of the start rule.
236 |      */
237 |     public function start($name)
238 |     {
239 |         // The augmented start rule is always stored under the number 0.
240 |         $this->rules[0] = new Rule(0, self::START_RULE_NAME, array($name));
241 |     }
242 | 
243 |     /**
244 |      * Returns the augmented start rule. For internal use only.
245 |      *
246 |      * @return \Dissect\Parser\Rule The start rule.
247 |      *
248 |      * @throws \LogicException If start() has not been called.
249 |      */
250 |     public function getStartRule()
251 |     {
252 |         if (!isset($this->rules[0])) {
253 |             throw new LogicException("No start rule specified.");
254 |         }
255 | 
256 |         return $this->rules[0];
257 |     }
258 | 
259 |     /**
260 |      * Sets the mode of conflict resolution.
261 |      *
262 |      * @param int $mode The bitmask for the mode.
263 |      */
264 |     public function resolve($mode)
265 |     {
266 |         $this->conflictsMode = $mode;
267 |     }
268 | 
269 |     /**
270 |      * Returns the conflict resolution mode for this grammar.
271 |      *
272 |      * @return int The bitmask of the resolution mode.
273 |      */
274 |     public function getConflictsMode()
275 |     {
276 |         return $this->conflictsMode;
277 |     }
278 | 
279 |     /**
280 |      * Does a nonterminal $name exist in the grammar?
281 |      *
282 |      * @param string $name The name of the nonterminal.
283 |      *
284 |      * @return boolean
285 |      */
286 |     public function hasNonterminal($name)
287 |     {
288 |         return array_key_exists($name, $this->groupedRules);
289 |     }
290 | 
291 |     /**
292 |      * Defines a group of operators.
293 |      *
294 |      * @param string,... Any number of tokens that serve as the operators.
295 |      *
296 |      * @return \Dissect\Parser\Grammar This instance for fluent interface.
297 |      */
298 |     public function operators()
299 |     {
300 |         // No rule is being described any more; call() throws until is() runs.
301 |         $this->currentRule = null;
302 | 
303 |         $ops = func_get_args();
304 | 
305 |         $this->currentOperators = $ops;
306 | 
307 |         // Defaults until prec()/assoc() say otherwise: precedence 1, left.
308 |         foreach ($ops as $op) {
309 |             $this->operators[$op] = array(
310 |                 'prec' => 1,
311 |                 'assoc' => self::LEFT,
312 |             );
313 |         }
314 | 
315 |         return $this;
316 |     }
317 | 
318 |     /**
319 |      * Marks the current group of operators as left-associative.
320 |      *
321 |      * @return \Dissect\Parser\Grammar This instance for fluent interface.
322 |      */
323 |     public function left()
324 |     {
325 |         return $this->assoc(self::LEFT);
326 |     }
327 | 
328 |     /**
329 |      * Marks the current group of operators as right-associative.
330 |      *
331 |      * @return \Dissect\Parser\Grammar This instance for fluent interface.
332 |      */
333 |     public function right()
334 |     {
335 |         return $this->assoc(self::RIGHT);
336 |     }
337 | 
338 |     /**
339 |      * Marks the current group of operators as nonassociative.
340 |      *
341 |      * @return \Dissect\Parser\Grammar This instance for fluent interface.
342 |      */
343 |     public function nonassoc()
344 |     {
345 |         return $this->assoc(self::NONASSOC);
346 |     }
347 | 
348 |     /**
349 |      * Explicitly sets the associativity of the current group of operators.
350 |      *
351 |      * @param int $a One of Grammar::LEFT, Grammar::RIGHT, Grammar::NONASSOC
352 |      *
353 |      * @return \Dissect\Parser\Grammar This instance for fluent interface.
354 |      *
355 |      * @throws \LogicException If no group of operators is being described.
356 |      */
357 |     public function assoc($a)
358 |     {
359 |         if (!$this->currentOperators) {
360 |             throw new LogicException('Define a group of operators first.');
361 |         }
362 | 
363 |         foreach ($this->currentOperators as $op) {
364 |             $this->operators[$op]['assoc'] = $a;
365 |         }
366 | 
367 |         return $this;
368 |     }
369 | 
370 |     /**
371 |      * Sets the precedence (as an integer) of the current group of operators.
372 |      * If no group of operators is being specified, sets the precedence
373 |      * of the currently described rule.
374 |      *
375 |      * @param int $i The precedence as an integer.
376 |      *
377 |      * @return \Dissect\Parser\Grammar This instance for fluent interface.
378 |      *
379 |      * @throws \LogicException If neither an operator group nor a rule is
380 |      *                         being described.
381 |      */
382 |     public function prec($i)
383 |     {
384 |         if (!$this->currentOperators) {
385 |             if (!$this->currentRule) {
386 |                 throw new LogicException('Define a group of operators or a rule first.');
387 |             } else {
388 |                 $this->currentRule->setPrecedence($i);
389 |             }
390 |         } else {
391 |             foreach ($this->currentOperators as $op) {
392 |                 $this->operators[$op]['prec'] = $i;
393 |             }
394 |         }
395 | 
396 |         return $this;
397 |     }
398 | 
399 |     /**
400 |      * Is the passed token an operator?
401 |      *
402 |      * @param string $token The token type.
403 |      *
404 |      * @return boolean
405 |      */
406 |     public function hasOperator($token)
407 |     {
408 |         return array_key_exists($token, $this->operators);
409 |     }
410 | 
411 |     /**
412 |      * Returns the precedence and associativity of an operator.
413 |      *
414 |      * @param string $token The token type of the operator.
415 |      *
416 |      * @return array An array with the keys "prec" and "assoc".
417 |      */
418 |     public function getOperatorInfo($token)
419 |     {
420 |         return $this->operators[$token];
421 |     }
422 | }
423 | 


--------------------------------------------------------------------------------
/docs/parsing.md:
--------------------------------------------------------------------------------
  1 | Parsing with Dissect
  2 | ====================
  3 | 
  4 | Why an LALR(1) parser?
  5 | ----------------------
  6 | 
  7 | Parsing is a task that's needed more often than one would think;
  8 | for examples in some famous PHP projects, see [this parser][twigparser]
  9 | from [Twig][twig] and [these][annotationsparser] [two][dqlparser] from
 10 | [Doctrine][doctrine]. Chances are you've written one; if you did, it was
 11 | most likely a [recursive descent parser][rdparser], just like the
 12 | examples above. Now, such parsers have several disadvantages: first,
 13 | they obviously have to be manually written.  Second, they're *recursive*,
 14 | which means one thing: nest the input deep enough (like an
 15 | annotation, which has another annotation as a parameter, that annotation
 16 | has another annotation as a parameter ...) and your PHP process blows up
 17 | because of stack overflow (to be fair, you'd have to nest pretty deep).
 18 | And third, such parsers belong to a class of parsers known as
 19 | [LL(k)][llk], which means they're generally not as powerful as [LR(k)][lrk]
 20 | parsers. For instance, they cannot handle left-recursive rules
 21 | (rules like `A -> A ...`), which are probably the only sane way of
 22 | expressing left-associative binary operators (like addition, for
 23 | example).
 24 | 
 25 | But let's get to actually parsing something.
 26 | 
 27 | Writing a grammar
 28 | -----------------
 29 | 
 30 | A grammar is represented by a subclass of `Dissect\Parser\Grammar`.
 31 | 
 32 | ```php
 33 | use Dissect\Parser\Grammar;
 34 | 
 35 | class ArithGrammar extends Grammar
 36 | {
 37 |     public function __construct()
 38 |     {
 39 |         // rule definitions
 40 |     }
 41 | }
 42 | ```
 43 | 
 44 | First, you tell Dissect what rule you are describing. Let's say we want
 45 | to describe a rule for a `Sum`:
 46 | 
 47 | ```php
 48 | $this('Sum')
 49 | ```
 50 | 
 51 | and then you specify what the rule actually `is`:
 52 | 
 53 | ```php
 54 | $this('Sum')
 55 |     ->is('int', '+', 'int');
 56 | ```
 57 | 
 58 | A rule can of course have many alternatives:
 59 | 
 60 | ```php
 61 | $this('Sum')
 62 |     ->is('int', '+', 'int')
 63 |     ->is('string', '+', 'string');
 64 | ```
 65 | 
 66 | and you will probably want to specify how to evaluate the rule:
 67 | 
 68 | ```php
 69 | $this('Sum')
 70 |     ->is('int', '+', 'int')
 71 |     ->call(function ($l, $_, $r) {
 72 |         return $l + $r;
 73 |     })
 74 | 
 75 |     ->is('string', '+', 'string')
 76 |     ->call(function ($l, $_, $r) {
 77 |         return $l . $r;
 78 |     });
 79 | ```
 80 | 
 81 | > The number of arguments to the callback function is always equal
 82 | > to the length of the rule to which it belongs.
 83 | 
 84 | ### Empty rules
 85 | 
 86 | A grammar can (and many times will) contain empty rules, that is, rules that
 87 | can match 0 tokens of the input. This is useful when, for example,
 88 | describing a list of function arguments, which can be either empty or a list of
 89 | values separated by commas.
 90 | 
 91 | An empty rule is defined simply by calling `is` with 0 arguments:
 92 | 
 93 | ```php
 94 | $this('Empty')
 95 |     ->is();
 96 | ```
 97 | 
 98 | If you find this notation unclear, you can explicitly mark empty rules
 99 | with a comment:
100 | 
101 | ```php
102 | $this('Empty')
103 |     ->is(/* empty */);
104 | ```
105 | 
106 | > **Beware:** When you don't specify a callback for a rule, Dissect
107 | > will default to returning the leftmost (first) component of the rule. You
108 | > are, however, required to specify a callback for an empty rule, since
109 | > in a rule with zero components, there is obviously no leftmost one.
110 | 
111 | Example: Parsing mathematical expressions
112 | -----------------------------------------
113 | 
114 | In the chapter on lexing, we've created a lexer we will now use to
115 | process our expressions:
116 | 
117 | ```php
118 | class ArithLexer extends SimpleLexer
119 | {
120 |     public function __construct()
121 |     {
122 |         $this->regex('INT', '/^[1-9][0-9]*/');
123 |         $this->token('(');
124 |         $this->token(')');
125 |         $this->token('+');
126 |         $this->token('*');
127 |         $this->token('**');
128 | 
129 |         $this->regex('WSP', "/^[ \r\n\t]+/");
130 |         $this->skip('WSP');
131 |     }
132 | }
133 | 
134 | $lexer = new ArithLexer();
135 | ```
136 | 
137 | As for the grammar, let's start out slow, with only a single operator:
138 | 
139 | ```php
140 | $this('Expr')
141 |     ->is('Expr', '+', 'Expr')
142 |     ->call(function ($l, $_, $r) {
143 |         return $l + $r;
144 |     })
145 | 
146 |     ->is('INT')
147 |     ->call(function ($i) {
148 |         return (int)$i->getValue();
149 |     });
150 | 
151 | $this->start('Expr');
152 | ```
153 | 
154 | These two rules specify an expression to be either two expressions
155 | separated by a plus or simply an integer. The call to `start()`
156 | sets the starting rule of the grammar.
157 | 
158 | Now, we can simply pass the grammar to a parser object:
159 | 
160 | ```php
161 | use Dissect\Parser\LALR1\Parser;
162 | 
163 | $parser = new Parser(new ArithGrammar());
164 | $stream = $lexer->lex('1 + 2 + 3');
165 | echo $parser->parse($stream);
166 | // => 6
167 | ```
168 | 
169 | and yay, it works!
170 | 
171 | ### Operator associativity
172 | 
173 | Actually, it doesn't. It *seems* to work because addition happens to be
174 | associative, but a problem appears once we add another rule to the
175 | grammar to represent subtraction (the lexer needs a `-` token for this, too):
176 | 
177 | ```php
178 | $this('Expr')
179 |     ->is('Expr', '+', 'Expr') ...
180 | 
181 |     ->is('Expr', '-', 'Expr')
182 |     ->call(function ($l, $_, $r) {
183 |         return $l - $r;
184 |     })
185 | 
186 |     ->is('INT') ...
187 | ```
188 | 
189 | The result looks like this:
190 | 
191 | ```php
192 | $stream = $lexer->lex('3 - 5 - 2');
193 | echo $parser->parse($stream);
194 | // => 0
195 | ```
196 | 
197 | Well, that's certainly incorrect. The problem is that our grammar
198 | actually contains a conflict (a *shift/reduce* conflict, if you're a fan
199 | of termini technici; see the [section on conflict resolution](#resolving-conflicts))
200 | which Dissect automatically resolves in a way that makes our `+` and `-`
201 | operators right-associative. The problem is fortunately easy to solve:
202 | we have to mark them as left-associative operators:
203 | 
204 | ```php
205 |     ->is('INT') ...
206 | 
207 | $this->operators('+', '-')->left();
208 | ```
209 | 
210 | This makes Dissect treat the two tokens in a special way, the conflict
211 | is resolved to represent left-associativity and the parser works correctly:
212 | 
213 | ```php
214 | $stream = $lexer->lex('3 - 5 - 2');
215 | echo $parser->parse($stream);
216 | // => -4
217 | ```
218 | 
219 | ### Operator precedence
220 | 
221 | Unfortunately, we're not out of the woods yet. When we add another two
222 | rules to represent multiplication and division, we see that the parser
223 | still makes mistakes (division needs a `/` token in the lexer as well):
224 | 
225 | ```php
226 | $this('Expr')
227 |     ...
228 | 
229 |     ->is('Expr', '*', 'Expr')
230 |     ->call(function ($l, $_, $r) {
231 |         return $l * $r;
232 |     })
233 | 
234 |     ->is('Expr', '/', 'Expr')
235 |     ->call(function ($l, $_, $r) {
236 |         return $l / $r;
237 |     })
238 | 
239 |     ...
240 | 
241 |     $this->operators('*', '/')->left();
242 | ...
243 | 
244 | $stream = $lexer->lex('2 + 3 * 5');
245 | echo $parser->parse($stream);
246 | // => 25
247 | ```
248 | 
249 | The problem is that Dissect doesn't know anything about the precedence
250 | of our operators. But we can, of course, provide the necessary information:
251 | 
252 | ```php
253 | $this->operators('+', '-')->left()->prec(1);
254 | $this->operators('*', '/')->left()->prec(2);
255 | 
256 | ...
257 | 
258 | $stream = $lexer->lex('2 + 3 * 5');
259 | echo $parser->parse($stream);
260 | // => 17
261 | ```
262 | 
263 | The higher the integer passed to the `prec()` method, the higher the
264 | precedence of the specified operators.
265 | 
266 | And we have the basic grammar for mathematical expressions in place!
267 | As an exercise, try to handle the rest of the tokens defined in the lexer:
268 | 
269 | - Create a rule to handle parentheses around expressions.
270 | - Create a rule for the final operator, `**`, which represents
271 |   exponentiation. Give it the highest precedence and make it
272 |   *right-associative* (the method is, shockingly, called `right()`).
273 | 
274 | ### Specifying precedences on rules instead of operators
275 | 
276 | As a final touch, we'd like to add a unary minus operator to our grammar:
277 | 
278 | ```php
279 | $this('Expr')
280 |     ...
281 | 
282 |     ->is('-', 'Expr')
283 |     ->call(function ($_, $e) {
284 |         return -$e;
285 |     })
286 |     ...
287 | ```
288 | 
289 | But you might feel that something is amiss. Unary minus should have the
290 | highest precedence, but we've specified the precedence of `-` to be the
291 | lowest, actually. But don't worry, we can assign precedences directly to
292 | rules:
293 | 
294 | ```php
295 | $this('Expr')
296 |     ...
297 | 
298 |     ->is('-', 'Expr')->prec(4) // higher than everything
299 |     ->call(function ($_, $e) {
300 |         return -$e;
301 |     })
302 |     ...
303 | ```
304 | 
305 | ### Nonassociativity
306 | 
307 | Apart from being left- or right-associative, operators can be
308 | nonassociative, which means that for an operator `op`, the input
309 | `a op b op c` means neither `(a op b) op c` nor `a op (b op c)`,
310 | but is considered a syntax error.
311 | 
312 | This has certain use cases; for instance, one of the nonassociative
313 | operators in the grammar for PHP is `<`: when parsing `1 < 2 < 3`,
314 | the PHP parser reports a syntax error.
315 | 
316 | The corresponding method in Dissect grammars is `nonassoc()`:
317 | 
318 | ```php
319 | $this->operators('<', '>')->nonassoc()->prec(...);
320 | ```
321 | 
322 | ### Describing common syntactic structures
323 | 
324 | To see how to describe commonly used syntactic structures such as
325 | repetitions and lists, see the [dedicated documentation section][common].
326 | 
327 | Invalid input
328 | -------------
329 | 
330 | When the parser encounters a syntactical error, it stops dead and
331 | throws a `Dissect\Parser\Exception\UnexpectedTokenException`.
332 | The exception gives you programmatic access to information about the
333 | problem: `getToken()` returns a `Dissect\Lexer\Token` representing the
334 | invalid token and `getExpected()` returns an array of token types the parser
335 | expected to encounter.
336 | 
337 | Precomputing the parse table
338 | ----------------------------
339 | 
340 | The parser needs a *parse table* to decide what to do based on given
341 | input. That parse table is created from the grammar and, if we give the
342 | parser only the grammar, needs to be computed every time we instantiate
343 | the parser.
344 | 
345 | Grammar analysis is costly; if you need the speed, a far better choice
346 | would be to precompute the table beforehand (perhaps as a part of your
347 | build process) like this:
348 | 
349 | ```php
350 | use Dissect\Parser\LALR1\Analysis\Analyzer;
351 | 
352 | $analyzer = new Analyzer();
353 | $parseTable = $analyzer->analyze($grammar)->getParseTable();
354 | ```
355 | 
356 | Now that we've got the parse table, we can dump it to a string which
357 | we then save to a file. To do this, we can use either
358 | `Dissect\Parser\LALR1\Dumper\ProductionTableDumper`:
359 | 
360 | ```php
361 | $dumper = new ProductionTableDumper();
362 | $php = $dumper->dump($parseTable);
363 | ```
364 | 
365 | which produces very compact, whitespace-free and absolutely unreadable
366 | code, or `Dissect\Parser\LALR1\Dumper\DebugTableDumper`:
367 | 
368 | ```php
369 | $dumper = new DebugTableDumper($grammar);
370 | $php = $dumper->dump($parseTable);
371 | ```
372 | 
373 | which produces an indented, readable representation with comments
374 | explaining each step the parser takes when processing the input.
375 | 
376 | ### Using the dumped parse table
377 | 
378 | To use the dumped parse table, just write
379 | 
380 | ```php
381 | $parser = new Parser($grammar, require $parseTableFile);
382 | ```
383 | 
384 | You still need to pass the grammar, since it contains the callbacks
385 | used to evaluate the input.
386 | 
387 | > If you intend to use Dissect more like a traditional parser generator,
388 | > you don't actually need to do any of this, of course. Dissect provides a
389 | > command-line interface you can use to process and debug your grammars.
390 | > It's described in its own [documentation section][cli].
391 | 
392 | Resolving conflicts
393 | -------------------
394 | 
395 | *Caution, this is advanced stuff. You probably won't ever need to worry
396 | about this.*
397 | 
398 | LALR(1) is generally a very powerful parsing algorithm. However, there
399 | are practical grammars that are, unfortunately, almost-but-not-quite
400 | LALR(1). When running an LALR(1) analyzer on such grammars, one sees
401 | that they contain 2 types of conflicts:
402 | 
403 | - **Shift/Reduce conflicts** - the parser doesn't know whether to shift
404 |   another token or reduce what's on the stack.
405 | 
406 | - **Reduce/Reduce conflicts** - the parser can reduce by multiple
407 |   grammar rules.
408 | 
409 | There are 4 commonly used ways of resolving such conflicts and Dissect allows you to
410 | combine them any way you want:
411 | 
412 | 1. On a shift/reduce conflict, consult the operators' precedence
413 |    and associativity information. The rules for resolution are a little
414 |    complicated, but the conflict may be resolved as a reduce (either the
415 |    precedence of the rule is higher than that of the shifted token or the
416 |    token is left-associative), a shift (the rule precedence is lower or the
417 |    token is right-associative) or even as an error (when the token is
418 |    nonassociative). Note that Dissect doesn't report conflicts resolved
419 |    using this technique, since they were intentionally created by the user
420 |    and therefore are not really conflicts. Represented by the
421 |    constant `Grammar::OPERATORS`.
422 | 
423 | 2. On a shift/reduce conflict, always shift. This is represented by
424 |    the constant `Grammar::SHIFT` and, together with the above method,
425 |    is enabled by default.
426 | 
427 | 3. On a reduce/reduce conflict, reduce using the longer rule.
428 |    Represented by `Grammar::LONGER_REDUCE`. Both this and the previous
429 |    way represent the same philosophy: take the largest bite possible.
430 |    This is usually what the user intended to express.
431 | 
432 | 4. On a reduce/reduce conflict, reduce using the rule that was
433 |    declared earlier in the grammar. Represented by
434 |    `Grammar::EARLIER_REDUCE`.
435 | 
436 | To specify precisely how Dissect should resolve parse table conflicts,
437 | call `resolve` on your grammar:
438 | 
439 | ```php
440 | $this->resolve(Grammar::SHIFT | Grammar::OPERATORS | Grammar::LONGER_REDUCE);
441 | ```
442 | 
443 | There are two other constants: `Grammar::NONE` that forbids any
444 | conflicts in the grammar (even the operators-related ones) and
445 | `Grammar::ALL`, which is a combination of all the 4 above methods
446 | defined simply for convenience.
447 | 
448 | [twigparser]: https://github.com/fabpot/Twig/blob/master/lib/Twig/Parser.php
449 | [twig]: https://github.com/fabpot/Twig
450 | [annotationsparser]: https://github.com/doctrine/common/blob/master/lib/Doctrine/Common/Annotations/DocParser.php
451 | [dqlparser]: https://github.com/doctrine/doctrine2/blob/master/lib/Doctrine/ORM/Query/Parser.php
452 | [doctrine]: https://github.com/doctrine
453 | [rdparser]: http://en.wikipedia.org/wiki/Recursive_descent_parser
454 | [llk]: http://en.wikipedia.org/wiki/LL_parser
455 | [lrk]: http://en.wikipedia.org/wiki/LR_parser
456 | [cli]: cli.md
457 | [common]: common.md
458 | 


--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/Analyzer.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Dissect\Parser\LALR1\Analysis;
  4 | 
  5 | use Dissect\Parser\LALR1\Analysis\Exception\ReduceReduceConflictException;
  6 | use Dissect\Parser\LALR1\Analysis\Exception\ShiftReduceConflictException;
  7 | use Dissect\Parser\LALR1\Analysis\KernelSet\KernelSet;
  8 | use Dissect\Parser\Grammar;
  9 | use Dissect\Parser\Parser;
 10 | use Dissect\Util\Util;
 11 | use SplQueue;
 12 | 
 13 | /**
 14 |  * Performs a grammar analysis and returns
 15 |  * the result.
 16 |  *
 17 |  * @author Jakub Lédl <jakubledl@gmail.com>
 18 |  */
 19 | class Analyzer
 20 | {
 21 |     /**
 22 |      * Analyzes a grammar: builds its automaton, then encodes it as a parse table.
 23 |      *
 24 |      * @param \Dissect\Parser\Grammar $grammar The grammar to analyse.
 25 |      *
 26 |      * @return \Dissect\Parser\LALR1\Analysis\AnalysisResult The parse table, automaton and conflicts.
 27 |      */
 28 |     public function analyze(Grammar $grammar)
 29 |     {
 30 |         $fsa = $this->buildAutomaton($grammar);
 31 |         $tableAndConflicts = $this->buildParseTable($fsa, $grammar);
 32 | 
 33 |         return new AnalysisResult($tableAndConflicts[0], $fsa, $tableAndConflicts[1]);
 34 |     }
 35 | 
 36 |     /**
 37 |      * Builds the handle-finding FSA from the grammar: discovers all states, then pumps the lookahead tokens.
 38 |      *
 39 |      * @param \Dissect\Parser\Grammar $grammar The grammar to build the automaton for.
 40 |      *
 41 |      * @return \Dissect\Parser\LALR1\Analysis\Automaton The resulting automaton.
 42 |      */
 43 |     protected function buildAutomaton(Grammar $grammar)
 44 |     {
 45 |         // the automaton under construction; returned at the end
 46 |         $automaton = new Automaton();
 47 | 
 48 |         // the queue of states whose closure and transitions still need computing
 49 |         $queue = new SplQueue();
 50 | 
 51 |         // the BST for state kernels; insert() yields the number used for the state
 52 |         $kernelSet = new KernelSet();
 53 | 
 54 |         // rules grouped by their name
 55 |         $groupedRules = $grammar->getGroupedRules();
 56 | 
 57 |         // FIRST sets, indexed by nonterminal
 58 |         $firstSets = $this->calculateFirstSets($groupedRules);
 59 | 
 60 |         // keeps a list of (item, tokens) pairs that need to be pumped
 61 |         // through the automaton once all states have been built
 62 |         $pumpings = array();
 63 | 
 64 |         // the item from which the whole automaton
 65 |         // is derived: the start rule with the dot at index 0
 66 |         $initialItem = new Item($grammar->getStartRule(), 0);
 67 | 
 68 |         // construct the initial state from the kernel of the initial item
 69 |         $state = new State($kernelSet->insert(array(
 70 |             array($initialItem->getRule()->getNumber(), $initialItem->getDotIndex()),
 71 |         )), array($initialItem));
 72 | 
 73 |         // the initial item automatically has EOF
 74 |         // as its lookahead
 75 |         $pumpings[] = array($initialItem, array(Parser::EOF_TOKEN_TYPE));
 76 | 
 77 |         $queue->enqueue($state);
 78 |         $automaton->addState($state);
 79 | 
 80 |         while (!$queue->isEmpty()) {
 81 |             $state = $queue->dequeue();
 82 | 
 83 |             // items of this state are grouped by
 84 |             // their active component so that transitions
 85 |             // can be calculated easily afterwards
 86 |             $groupedItems = array();
 87 | 
 88 |             // calculate closure; items appended to $currentItems below are visited by this same loop
 89 |             $added = array();
 90 |             $currentItems = $state->getItems();
 91 |             for ($x = 0; $x < count($currentItems); $x++) {
 92 |                 $item = $currentItems[$x];
 93 | 
 94 |                 if (!$item->isReduceItem()) {
 95 |                     $component = $item->getActiveComponent();
 96 |                     $groupedItems[$component][] = $item;
 97 | 
 98 |                     // if nonterminal, expand it
 99 |                     if ($grammar->hasNonterminal($component)) {
100 | 
101 |                         // calculate lookahead from the item's unrecognized components
102 |                         $lookahead = array();
103 |                         $cs = $item->getUnrecognizedComponents();
104 | 
105 |                         foreach ($cs as $i => $c) {
106 |                             if (!$grammar->hasNonterminal($c)) {
107 |                                 // if terminal, add it and break the loop
108 |                                 $lookahead = Util::union($lookahead, array($c));
109 | 
110 |                                 break;
111 |                             } else {
112 |                                 // if nonterminal, consult its FIRST set
113 |                                 $new = $firstSets[$c];
114 | 
115 |                                 if (!in_array(Grammar::EPSILON, $new)) {
116 |                                     // if the component doesn't derive
117 |                                     // epsilon, merge FIRST sets and break
118 |                                     $lookahead = Util::union($lookahead, $new);
119 | 
120 |                                     break;
121 |                                 } else {
122 |                                     // if it does derive epsilon
123 | 
124 |                                     if ($i < (count($cs) - 1)) {
125 |                                         // if more components ahead, remove epsilon
126 |                                         unset($new[array_search(Grammar::EPSILON, $new)]);
127 |                                     }
128 | 
129 |                                     // and continue the loop
130 |                                     $lookahead = Util::union($lookahead, $new);
131 |                                 }
132 |                             }
133 |                         }
134 | 
135 |                         // two items are connected if the unrecognized
136 |                         // part of rule 1 derives epsilon (an empty lookahead is treated the same way)
137 |                         $connect = false;
138 | 
139 |                         // only store the pumped tokens if there
140 |                         // actually is an unrecognized part
141 |                         $pump = true;
142 | 
143 |                         if (empty($lookahead)) {
144 |                             $connect = true;
145 |                             $pump = false;
146 |                         } else {
147 |                             if (in_array(Grammar::EPSILON, $lookahead)) {
148 |                                 unset($lookahead[array_search(Grammar::EPSILON, $lookahead)]);
149 | 
150 |                                 $connect = true;
151 |                             }
152 |                         }
153 | 
154 |                         foreach ($groupedRules[$component] as $rule) {
155 |                             if (!in_array($component, $added)) {
156 |                                 // if $component hasn't yet been expanded,
157 |                                 // create new items for it
158 |                                 $newItem = new Item($rule, 0);
159 | 
160 |                                 $currentItems[] = $newItem;
161 |                                 $state->add($newItem);
162 | 
163 |                             } else {
164 |                                 // if it was expanded, each original
165 |                                 // rule might bring new lookahead tokens,
166 |                                 // so get the rule from the current state
167 |                                 $newItem = $state->get($rule->getNumber(), 0);
168 |                             }
169 | 
170 |                             if ($connect) {
171 |                                 $item->connect($newItem);
172 |                             }
173 | 
174 |                             if ($pump) {
175 |                                 $pumpings[] = array($newItem, $lookahead);
176 |                             }
177 |                         }
178 |                     }
179 | 
180 |                     // mark the component as processed (done only after all of its rules were handled above)
181 |                     $added[] = $component;
182 |                 }
183 |             }
184 | 
185 |             // calculate transitions: each group of items advances the dot over its shared active component
186 |             foreach ($groupedItems as $thisComponent => $theseItems) {
187 |                 $newKernel = array();
188 | 
189 |                 foreach ($theseItems as $thisItem) {
190 |                     $newKernel[] = array(
191 |                         $thisItem->getRule()->getNumber(),
192 |                         $thisItem->getDotIndex() + 1,
193 |                     );
194 |                 }
195 | 
196 |                 $num = $kernelSet->insert($newKernel);
197 | 
198 |                 if ($automaton->hasState($num)) {
199 |                     // the state already exists, so only a transition to it is added
200 |                     $automaton->addTransition($state->getNumber(), $thisComponent, $num);
201 | 
202 |                     // extract the connected items from the target state
203 |                     $nextState = $automaton->getState($num);
204 | 
205 |                     foreach ($theseItems as $thisItem) {
206 |                         $thisItem->connect(
207 |                             $nextState->get(
208 |                                 $thisItem->getRule()->getNumber(),
209 |                                 $thisItem->getDotIndex() + 1
210 |                             )
211 |                         );
212 |                     }
213 |                 } else {
214 |                     // new state needs to be created from the advanced items
215 |                     $newState = new State($num, array_map(function (Item $i) {
216 |                         $new = new Item($i->getRule(), $i->getDotIndex() + 1);
217 | 
218 |                         // connect the two items
219 |                         $i->connect($new);
220 | 
221 |                         return $new;
222 |                     }, $theseItems));
223 | 
224 |                     $automaton->addState($newState);
225 |                     $queue->enqueue($newState);
226 | 
227 |                     $automaton->addTransition($state->getNumber(), $thisComponent, $num);
228 |                 }
229 |             }
230 |         }
231 | 
232 |         // pump all the lookahead tokens, now that every state has been processed
233 |         foreach ($pumpings as $pumping) {
234 |             $pumping[0]->pumpAll($pumping[1]);
235 |         }
236 | 
237 |         return $automaton;
238 |     }
239 | 
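240 |     /**
241 |      * Encodes the handle-finding FSA as an LR parse table.
242 |      *
243 |      * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton The automaton to encode.
244 |      * @param \Dissect\Parser\Grammar $grammar The grammar providing the conflicts mode and operator info.
245 |      * @return array A two-element array: the parse table and the list of conflicts recorded while building it.
246 |      */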
247 |     protected function buildParseTable(Automaton $automaton, Grammar $grammar)
248 |     {
249 |         $conflictsMode = $grammar->getConflictsMode();
250 |         $conflicts = array();
251 |         $errors = array();
252 | 
253 |         // initialize the table
254 |         $table = array(
255 |             'action' => array(),
256 |             'goto' => array(),
257 |         );
258 | 
259 |         foreach ($automaton->getTransitionTable() as $num => $transitions) {
260 |             foreach ($transitions as $trigger => $destination) {
261 |                 if (!$grammar->hasNonterminal($trigger)) {
262 |                     // terminal implies shift
263 |                     $table['action'][$num][$trigger] = $destination;
264 |                 } else {
265 |                     // nonterminal goes in the goto table
266 |                     $table['goto'][$num][$trigger] = $destination;
267 |                 }
268 |             }
269 |         }
270 | 
271 |         foreach ($automaton->getStates() as $num => $state) {
272 |             if (!isset($table['action'][$num])) {
273 |                 $table['action'][$num] = array();
274 |             }
275 | 
276 |             foreach ($state->getItems() as $item) {
277 |                 if ($item->isReduceItem()) {
278 |                     $ruleNumber = $item->getRule()->getNumber();
279 | 
280 |                     foreach ($item->getLookahead() as $token) {
281 |                         if (isset($errors[$num]) && isset($errors[$num][$token])) {
282 |                             // there was a previous conflict resolved as an error
283 |                             // entry for this token.
284 | 
285 |                             continue;
286 |                         }
287 | 
288 |                         if (array_key_exists($token, $table['action'][$num])) {
289 |                             // conflict
290 |                             $instruction = $table['action'][$num][$token];
291 | 
292 |                             if ($instruction > 0) {
293 |                                 if ($conflictsMode & Grammar::OPERATORS) {
294 |                                     if ($grammar->hasOperator($token)) {
295 |                                         $operatorInfo = $grammar->getOperatorInfo($token);
296 | 
297 |                                         $rulePrecedence = $item->getRule()->getPrecedence();
298 | 
299 |                                         // unless the rule has given precedence
300 |                                         if ($rulePrecedence === null) {
301 |                                             foreach (array_reverse($item->getRule()->getComponents()) as $c) {
302 |                                                 // try to extract it from the rightmost terminal
303 |                                                 if ($grammar->hasOperator($c)) {
304 |                                                     $ruleOperatorInfo = $grammar->getOperatorInfo($c);
305 |                                                     $rulePrecedence = $ruleOperatorInfo['prec'];
306 | 
307 |                                                     break;
308 |                                                 }
309 |                                             }
310 |                                         }
311 | 
312 |                                         if ($rulePrecedence !== null) {
313 |                                             // if we actually have a rule precedence
314 | 
315 |                                             $tokenPrecedence = $operatorInfo['prec'];
316 | 
317 |                                             if ($rulePrecedence > $tokenPrecedence) {
318 |                                                 // if the rule precedence is higher, reduce
319 |                                                 $table['action'][$num][$token] = -$ruleNumber;
320 |                                             } elseif ($rulePrecedence < $tokenPrecedence) {
321 |                                                 // if the token precedence is higher, shift
322 |                                                 // (i.e. don't modify the table)
323 |                                             } else {
324 |                                                 // precedences are equal, let's turn to associativity
325 |                                                 $assoc = $operatorInfo['assoc'];
326 | 
327 |                                                 if ($assoc === Grammar::RIGHT) {
328 |                                                     // if right-associative, shift
329 |                                                     // (i.e. don't modify the table)
330 |                                                 } elseif ($assoc === Grammar::LEFT) {
331 |                                                     // if left-associative, reduce
332 |                                                     $table['action'][$num][$token] = -$ruleNumber;
333 |                                                 } elseif ($assoc === Grammar::NONASSOC) {
334 |                                                     // the token is nonassociative.
335 |                                                     // this actually means an input error, so
336 |                                                     // remove the shift entry from the table
337 |                                                     // and mark this as an explicit error
338 |                                                     // entry
339 |                                                     unset($table['action'][$num][$token]);
340 |                                                     $errors[$num][$token] = true;
341 |                                                 }
342 |                                             }
343 | 
344 |                                             continue; // resolved the conflict, phew
345 |                                         }
346 | 
347 |                                         // we couldn't calculate the precedence => the conflict was not resolved
348 |                                         // move along.
349 |                                     }
350 |                                 }
351 | 
352 |                                 // s/r
353 |                                 if ($conflictsMode & Grammar::SHIFT) {
354 |                                     $conflicts[] = array(
355 |                                         'state' => $num,
356 |                                         'lookahead' => $token,
357 |                                         'rule' => $item->getRule(),
358 |                                         'resolution' => Grammar::SHIFT,
359 |                                     );
360 | 
361 |                                     continue;
362 |                                 } else {
363 |                                     throw new ShiftReduceConflictException(
364 |                                         $num,
365 |                                         $item->getRule(),
366 |                                         $token,
367 |                                         $automaton
368 |                                     );
369 |                                 }
370 |                             } else {
371 |                                 // r/r
372 | 
373 |                                 $originalRule = $grammar->getRule(-$instruction);
374 |                                 $newRule = $item->getRule();
375 | 
376 |                                 if ($conflictsMode & Grammar::LONGER_REDUCE) {
377 | 
378 |                                     $count1 = count($originalRule->getComponents());
379 |                                     $count2 = count($newRule->getComponents());
380 | 
381 |                                     if ($count1 > $count2) {
382 |                                         // original rule is longer
383 |                                         $resolvedRules = array($originalRule, $newRule);
384 | 
385 |                                         $conflicts[] = array(
386 |                                             'state' => $num,
387 |                                             'lookahead' => $token,
388 |                                             'rules' => $resolvedRules,
389 |                                             'resolution' => Grammar::LONGER_REDUCE,
390 |                                         );
391 | 
392 |                                         continue;
393 |                                     } elseif ($count2 > $count1) {
394 |                                         // new rule is longer
395 |                                         $table['action'][$num][$token] = -$ruleNumber;
396 |                                         $resolvedRules = array($newRule, $originalRule);
397 | 
398 |                                         $conflicts[] = array(
399 |                                             'state' => $num,
400 |                                             'lookahead' => $token,
401 |                                             'rules' => $resolvedRules,
402 |                                             'resolution' => Grammar::LONGER_REDUCE,
403 |                                         );
404 | 
405 |                                         continue;
406 |                                     }
407 |                                 }
408 | 
409 |                                 if ($conflictsMode & Grammar::EARLIER_REDUCE) {
410 |                                     if (-$instruction < $ruleNumber) {
411 |                                         // original rule was earlier
412 |                                         $resolvedRules = array($originalRule, $newRule);
413 | 
414 |                                         $conflicts[] = array(
415 |                                             'state' => $num,
416 |                                             'lookahead' => $token,
417 |                                             'rules' => $resolvedRules,
418 |                                             'resolution' => Grammar::EARLIER_REDUCE,
419 |                                         );
420 | 
421 |                                         continue;
422 |                                     } else {
423 |                                         // new rule was earlier
424 |                                         $table['action'][$num][$token] = -$ruleNumber;
425 | 
426 |                                         $conflicts[] = array(
427 |                                             'state' => $num,
428 |                                             'lookahead' => $token,
429 |                                             'rules' => $resolvedRules,
430 |                                             'resolution' => Grammar::EARLIER_REDUCE,
431 |                                         );
432 |                                         $resolvedRules = array($newRule, $originalRule);
433 | 
434 |                                         continue;
435 |                                     }
436 |                                 }
437 | 
438 |                                 // everything failed, throw an exception
439 |                                 throw new ReduceReduceConflictException(
440 |                                     $num,
441 |                                     $originalRule,
442 |                                     $newRule,
443 |                                     $token,
444 |                                     $automaton
445 |                                 );
446 |                             }
447 |                         }
448 | 
449 |                         $table['action'][$num][$token] = -$ruleNumber;
450 |                     }
451 |                 }
452 |             }
453 |         }
454 | 
455 |         return array($table, $conflicts);
456 |     }
457 | 
458 |     /**
459 |      * Calculates the FIRST sets of all nonterminals.
460 |      *
461 |      * @param array $rules The rules grouped by the LHS.
462 |      *
463 |      * @return array Calculated FIRST sets.
464 |      */
465 |     protected function calculateFirstSets(array $rules)
466 |     {
467 |         // initialize
468 |         $firstSets = array();
469 | 
470 |         foreach (array_keys($rules) as $lhs) {
471 |             $firstSets[$lhs] = array();
472 |         }
473 | 
474 |         do {
475 |             $changes = false;
476 | 
477 |             foreach ($rules as $lhs => $ruleArray) {
478 |                 foreach ($ruleArray as $rule) {
479 |                     $components = $rule->getComponents();
480 |                     $new = array();
481 | 
482 |                     if (empty($components)) {
483 |                         $new = array(Grammar::EPSILON);
484 |                     } else {
485 |                         foreach ($components as $i => $component) {
486 |                             if (array_key_exists($component, $rules)) {
487 |                                 // if nonterminal, copy its FIRST set to
488 |                                 // this rule's first set
489 |                                 $x = $firstSets[$component];
490 | 
491 |                                 if (!in_array(Grammar::EPSILON, $x)) {
492 |                                     // if the component doesn't derive
493 |                                     // epsilon, merge the first sets and
494 |                                     // we're done
495 |                                     $new = Util::union($new, $x);
496 | 
497 |                                     break;
498 |                                 } else {
499 |                                     // if all components derive epsilon,
500 |                                     // the rule itself derives epsilon
501 | 
502 |                                     if ($i < (count($components) - 1)) {
503 |                                         // more components ahead, remove epsilon
504 |                                         unset($x[array_search(Grammar::EPSILON, $x)]);
505 |                                     }
506 | 
507 |                                     $new = Util::union($new, $x);
508 |                                 }
509 |                             } else {
510 |                                 // if terminal, simply add it the the FIRST set
511 |                                 // and we're done
512 |                                 $new = Util::union($new, array($component));
513 | 
514 |                                 break;
515 |                             }
516 |                         }
517 |                     }
518 | 
519 |                     if (Util::different($new, $firstSets[$lhs])) {
520 |                         $firstSets[$lhs] = Util::union($firstSets[$lhs], $new);
521 | 
522 |                         $changes = true;
523 |                     }
524 |                 }
525 |             }
526 |         } while ($changes);
527 | 
528 |         return $firstSets;
529 |     }
530 | }
531 | 


--------------------------------------------------------------------------------