30 | */
class Item
{
    /**
     * The rule this item is derived from.
     *
     * @var \Dissect\Parser\Rule
     */
    protected $rule;

    /**
     * The position of the dot within the rule's components.
     *
     * @var int
     */
    protected $dotIndex;

    /**
     * The lookahead token names pumped into this item so far.
     *
     * @var array
     */
    protected $lookahead = array();

    /**
     * The items that receive every lookahead pumped into this item.
     *
     * @var array
     */
    protected $connected = array();

    /**
     * Constructor.
     *
     * @param \Dissect\Parser\Rule $rule The rule of this item.
     * @param int $dotIndex The index of the dot in this item.
     */
    public function __construct(Rule $rule, $dotIndex)
    {
        $this->rule = $rule;
        $this->dotIndex = $dotIndex;
    }

    /**
     * Returns the dot index of this item.
     *
     * @return int The dot index.
     */
    public function getDotIndex()
    {
        return $this->dotIndex;
    }

    /**
     * Returns the currently expected component.
     *
     * If the item is:
     *
     *     A -> a . b c
     *
     * then this method returns the component "b".
     *
     * @return string The component.
     */
    public function getActiveComponent()
    {
        return $this->rule->getComponent($this->dotIndex);
    }

    /**
     * Returns the rule of this item.
     *
     * @return \Dissect\Parser\Rule The rule.
     */
    public function getRule()
    {
        return $this->rule;
    }

    /**
     * Determines whether this item is a reduce item.
     *
     * An item is a reduce item if the dot is at the very end:
     *
     *     A -> a b c .
     *
     * @return boolean Whether this item is a reduce item.
     */
    public function isReduceItem()
    {
        return $this->dotIndex === count($this->rule->getComponents());
    }

    /**
     * Connects two items with a lookahead pumping channel.
     *
     * Every lookahead later pumped into this item is forwarded to $i.
     *
     * @param \Dissect\Parser\LALR1\Analysis\Item $i The item.
     */
    public function connect(Item $i)
    {
        $this->connected[] = $i;
    }

    /**
     * Pumps a lookahead token to this item and all items connected
     * to it.
     *
     * A token that is already recorded is ignored, which also stops the
     * propagation when the connections form a cycle.
     *
     * @param string $lookahead The lookahead token name.
     */
    public function pump($lookahead)
    {
        // Token names are compared by their string form. A loose in_array()
        // would treat distinct numeric-looking names such as '0' and '00'
        // (or '10' and '1e1') as equal and silently drop the second one.
        $name = (string) $lookahead;

        foreach ($this->lookahead as $known) {
            if ((string) $known === $name) {
                return;
            }
        }

        // Record first, then forward: connected items may lead back here.
        $this->lookahead[] = $lookahead;

        foreach ($this->connected as $item) {
            $item->pump($lookahead);
        }
    }

    /**
     * Pumps several lookahead tokens.
     *
     * @param array $lookahead The lookahead tokens.
     */
    public function pumpAll(array $lookahead)
    {
        foreach ($lookahead as $l) {
            $this->pump($l);
        }
    }

    /**
     * Returns the computed lookahead for this item.
     *
     * @return string[] The lookahead symbols.
     */
    public function getLookahead()
    {
        return $this->lookahead;
    }

    /**
     * Returns the components that follow the active component.
     *
     * The slice starts at dot index + 1, so the active component itself
     * is not part of the result. If the item is:
     *
     *     A -> a . b c
     *
     * then this method returns array("c").
     *
     * @return array The components after the active one.
     */
    public function getUnrecognizedComponents()
    {
        return array_slice($this->rule->getComponents(), $this->dotIndex + 1);
    }
}
179 |
--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Dumper/AutomatonDumper.php:
--------------------------------------------------------------------------------
1 |
14 | */
class AutomatonDumper
{
    /**
     * @var \Dissect\Parser\LALR1\Analysis\Automaton
     */
    protected $automaton;

    /**
     * Constructor.
     *
     * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton
     */
    public function __construct(Automaton $automaton)
    {
        $this->automaton = $automaton;
    }

    /**
     * Dumps the entire automaton.
     *
     * @return string The automaton encoded in DOT.
     */
    public function dump()
    {
        $writer = new StringWriter();

        $this->writeHeader($writer);
        $writer->writeLine();

        foreach ($this->automaton->getStates() as $state) {
            $this->writeState($writer, $state);
        }

        $writer->writeLine();

        foreach ($this->automaton->getTransitionTable() as $num => $map) {
            foreach ($map as $trigger => $destination) {
                $this->writeTransition($writer, $num, $destination, $trigger);
            }
        }

        // writeHeader() indented the writer; undo that before closing the graph.
        $writer->outdent();
        $this->writeFooter($writer);

        return $writer->get();
    }

    /**
     * Dumps only the specified state + any relevant
     * transitions.
     *
     * The state itself is written with all of its items; the states it
     * leads to are written with their number only.
     *
     * @param int $n The number of the state.
     *
     * @return string The output in DOT format.
     */
    public function dumpState($n)
    {
        $writer = new StringWriter();

        $this->writeHeader($writer, $n);
        $writer->writeLine();

        $this->writeState($writer, $this->automaton->getState($n));

        $table = $this->automaton->getTransitionTable();
        $row = isset($table[$n]) ? $table[$n] : array();

        foreach ($row as $dest) {
            // A self-loop target has already been written in full above.
            if ($dest !== $n) {
                $this->writeState($writer, $this->automaton->getState($dest), false);
            }
        }

        $writer->writeLine();

        foreach ($row as $trigger => $dest) {
            $this->writeTransition($writer, $n, $dest, $trigger);
        }

        $writer->outdent();
        $this->writeFooter($writer);

        return $writer->get();
    }

    /**
     * Opens the digraph and indents the writer.
     *
     * @param StringWriter $writer The writer to emit to.
     * @param int|null $stateNumber The state being dumped, or null when
     *                              the whole automaton is dumped.
     */
    protected function writeHeader(StringWriter $writer, $stateNumber = null)
    {
        // Compare against null explicitly: state 0 is a valid state number
        // and must produce "State0", not fall back to "Automaton".
        $writer->writeLine(sprintf(
            'digraph %s {',
            $stateNumber !== null ? 'State' . $stateNumber : 'Automaton'
        ));

        $writer->indent();
        $writer->writeLine('rankdir="LR";');
    }

    /**
     * Writes the node statement of a single state.
     *
     * @param StringWriter $writer The writer to emit to.
     * @param State $state The state to write.
     * @param boolean $full Whether to list the items of the state in the label.
     */
    protected function writeState(StringWriter $writer, State $state, $full = true)
    {
        $n = $state->getNumber();

        $string = sprintf(
            '%d [label="State %d',
            $n,
            $n
        );

        if ($full) {
            // '\n' is single-quoted on purpose: DOT itself interprets the
            // backslash-n sequence inside a label as a line break.
            $string .= '\n\n';
            $items = array();

            foreach ($state->getItems() as $item) {
                $items[] = $this->escapeLabel($this->formatItem($item));
            }

            $string .= implode('\n', $items);
        }

        $string .= '"];';

        $writer->writeLine($string);
    }

    /**
     * Writes a single labelled edge.
     *
     * @param StringWriter $writer The writer to emit to.
     * @param int $from The number of the origin state.
     * @param int $to The number of the destination state.
     * @param string $trigger The symbol that triggers the transition.
     */
    protected function writeTransition(StringWriter $writer, $from, $to, $trigger)
    {
        $writer->writeLine(sprintf(
            '%d -> %d [label="%s"];',
            $from,
            $to,
            $this->escapeLabel($trigger)
        ));
    }

    /**
     * Formats an item as "Name → components" with a dot marking the
     * current position; reduce items get their lookahead appended.
     *
     * @param Item $item The item to format.
     *
     * @return string The unescaped textual form of the item.
     */
    protected function formatItem(Item $item)
    {
        $rule = $item->getRule();
        $components = $rule->getComponents();

        // the dot
        array_splice($components, $item->getDotIndex(), 0, array('•'));

        // Rule number 0 is printed without a "Name →" prefix.
        if ($rule->getNumber() === 0) {
            $string = '';
        } else {
            $string = sprintf("%s → ", $rule->getName());
        }

        $string .= implode(' ', $components);

        if ($item->isReduceItem()) {
            $string .= sprintf(
                ' [%s]',
                implode(' ', $item->getLookahead())
            );
        }

        return $string;
    }

    /**
     * Escapes text for use inside a double-quoted DOT label, so that
     * symbols containing a double quote or a backslash cannot terminate
     * or corrupt the label.
     *
     * @param string $text The raw text.
     *
     * @return string The escaped text.
     */
    protected function escapeLabel($text)
    {
        return addcslashes((string) $text, '"\\');
    }

    /**
     * Closes the digraph.
     *
     * @param StringWriter $writer The writer to emit to.
     */
    protected function writeFooter(StringWriter $writer)
    {
        $writer->writeLine('}');
    }
}
175 |
--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Dumper/DebugTableDumper.php:
--------------------------------------------------------------------------------
1 |
13 | */
class DebugTableDumper implements TableDumper
{
    /**
     * @var \Dissect\Parser\Grammar
     */
    protected $grammar;

    /**
     * @var \Dissect\Parser\LALR1\Dumper\StringWriter
     */
    protected $writer;

    /**
     * Whether the table has already been written to the writer.
     *
     * @var boolean
     */
    protected $written = false;

    /**
     * Constructor.
     *
     * @param \Dissect\Parser\Grammar $grammar The grammar of this parse table.
     */
    public function __construct(Grammar $grammar)
    {
        $this->grammar = $grammar;
        $this->writer = new StringWriter();
    }

    /**
     * {@inheritDoc}
     */
    public function dump(array $table)
    {
        // for readability
        ksort($table['action']);
        ksort($table['goto']);

        // the grammar dictates the parse table,
        // therefore the result is always the same
        if (!$this->written) {
            $this->writeHeader();
            $this->writer->indent();

            foreach ($table['action'] as $n => $state) {
                $this->writeState($n, $state);
                $this->writer->writeLine();
            }

            $this->writer->outdent();
            $this->writeMiddle();
            $this->writer->indent();

            foreach ($table['goto'] as $n => $map) {
                $this->writeGoto($n, $map);
                $this->writer->writeLine();
            }

            $this->writer->outdent();
            $this->writeFooter();

            $this->written = true;
        }

        return $this->writer->get();
    }

    /**
     * Writes the PHP open tag and opens the returned array and its
     * 'action' section.
     */
    protected function writeHeader()
    {
        $this->writer->writeLine('<?php');
        $this->writer->writeLine();
        $this->writer->writeLine('return array(');
        $this->writer->indent();
        $this->writer->writeLine("'action' => array(");
    }

    /**
     * Writes the action row of one state.
     *
     * @param int $n The state number.
     * @param array $state Map of trigger => action for the state.
     */
    protected function writeState($n, array $state)
    {
        $this->writer->writeLine((string)$n . ' => array(');
        $this->writer->indent();

        foreach ($state as $trigger => $action) {
            $this->writeAction($trigger, $action);
            $this->writer->writeLine();
        }

        $this->writer->outdent();
        $this->writer->writeLine('),');
    }

    /**
     * Writes one action entry preceded by an explanatory comment.
     *
     * A positive action is a shift to that state, a negative action is a
     * reduction by the rule with the negated number, and zero accepts.
     *
     * @param string $trigger The lookahead token name.
     * @param int $action The encoded action.
     */
    protected function writeAction($trigger, $action)
    {
        if ($action > 0) {
            $this->writer->writeLine(sprintf(
                '// on %s shift and go to state %d',
                $trigger,
                $action
            ));
        } elseif ($action < 0) {
            $rule = $this->grammar->getRule(-$action);
            $components = $rule->getComponents();

            if (empty($components)) {
                $rhs = '/* empty */';
            } else {
                $rhs = implode(' ', $components);
            }

            $this->writer->writeLine(sprintf(
                '// on %s reduce by rule %s -> %s',
                $trigger,
                $rule->getName(),
                $rhs
            ));
        } else {
            $this->writer->writeLine(sprintf(
                '// on %s accept the input',
                $trigger
            ));
        }

        $this->writer->writeLine(sprintf(
            '%s => %d,',
            $this->exportKey($trigger),
            $action
        ));
    }

    /**
     * Closes the 'action' section and opens the 'goto' section.
     */
    protected function writeMiddle()
    {
        $this->writer->writeLine('),');
        $this->writer->writeLine();
        $this->writer->writeLine("'goto' => array(");
    }

    /**
     * Writes the goto row of one state.
     *
     * @param int $n The state number.
     * @param array $map Map of symbol => destination state.
     */
    protected function writeGoto($n, array $map)
    {
        $this->writer->writeLine((string)$n . ' => array(');
        $this->writer->indent();

        foreach ($map as $sym => $dest) {
            $this->writer->writeLine(sprintf(
                '// on %s go to state %d',
                $sym,
                $dest
            ));

            $this->writer->writeLine(sprintf(
                '%s => %d,',
                $this->exportKey($sym),
                $dest
            ));

            $this->writer->writeLine();
        }

        $this->writer->outdent();
        $this->writer->writeLine('),');
    }

    /**
     * Closes the 'goto' section and the returned array.
     */
    protected function writeFooter()
    {
        $this->writer->writeLine('),');
        $this->writer->outdent();
        $this->writer->writeLine(');');
    }

    /**
     * Renders a symbol as a single-quoted PHP string literal.
     *
     * For ordinary names the result is the plain 'name' form; names
     * containing a single quote or a backslash are escaped so that the
     * generated file stays valid PHP.
     *
     * @param string $symbol The symbol name.
     *
     * @return string The PHP literal.
     */
    protected function exportKey($symbol)
    {
        return var_export((string) $symbol, true);
    }
}
180 |
--------------------------------------------------------------------------------
/src/Dissect/Lexer/StatefulLexer.php:
--------------------------------------------------------------------------------
1 |
15 | */
class StatefulLexer extends AbstractLexer
{
    /**
     * State definitions keyed by state name. Each one holds the keys
     * 'recognizers', 'actions' and 'skip_tokens'.
     *
     * @var array
     */
    protected $states = array();

    /**
     * Stack of state names; the last element is the current state.
     *
     * @var array
     */
    protected $stateStack = array();

    /**
     * Name of the state that token definitions are currently added to.
     *
     * @var string|null
     */
    protected $stateBeingBuilt = null;

    /**
     * Type of the most recently defined token; target of action().
     *
     * @var string|null
     */
    protected $typeBeingBuilt = null;

    /**
     * Signifies that no action should be taken on encountering a token.
     */
    const NO_ACTION = 0;

    /**
     * Indicates that a state should be popped off the state stack on
     * encountering a token.
     */
    const POP_STATE = 1;

    /**
     * Adds a new token definition. If given only one argument,
     * it assumes that the token type and recognized value are
     * identical.
     *
     * @param string $type The token type.
     * @param string $value The value to be recognized.
     *
     * @return \Dissect\Lexer\StatefulLexer This instance for fluent interface.
     *
     * @throws LogicException When no state has been defined yet.
     */
    public function token($type, $value = null)
    {
        if ($this->stateBeingBuilt === null) {
            throw new LogicException("Define a lexer state first.");
        }

        if ($value === null) {
            $value = $type;
        }

        $this->states[$this->stateBeingBuilt]['recognizers'][$type] =
            new SimpleRecognizer($value);

        $this->states[$this->stateBeingBuilt]['actions'][$type] = self::NO_ACTION;

        $this->typeBeingBuilt = $type;

        return $this;
    }

    /**
     * Adds a new regex token definition.
     *
     * @param string $type The token type.
     * @param string $regex The regular expression used to match the token.
     *
     * @return \Dissect\Lexer\StatefulLexer This instance for fluent interface.
     *
     * @throws LogicException When no state has been defined yet.
     */
    public function regex($type, $regex)
    {
        if ($this->stateBeingBuilt === null) {
            throw new LogicException("Define a lexer state first.");
        }

        $this->states[$this->stateBeingBuilt]['recognizers'][$type] =
            new RegexRecognizer($regex);

        $this->states[$this->stateBeingBuilt]['actions'][$type] = self::NO_ACTION;

        $this->typeBeingBuilt = $type;

        return $this;
    }

    /**
     * Marks the token types given as arguments to be skipped in the
     * state currently being built. Each call replaces the skip list of
     * that state.
     *
     * @param mixed $type,... Unlimited number of token types.
     *
     * @return \Dissect\Lexer\StatefulLexer This instance for fluent interface.
     *
     * @throws LogicException When no state has been defined yet.
     */
    public function skip()
    {
        if ($this->stateBeingBuilt === null) {
            throw new LogicException("Define a lexer state first.");
        }

        $this->states[$this->stateBeingBuilt]['skip_tokens'] = func_get_args();

        return $this;
    }

    /**
     * Registers a new lexer state and makes it the target of the
     * following token definitions.
     *
     * @param string $state The new state name.
     *
     * @return \Dissect\Lexer\StatefulLexer This instance for fluent interface.
     */
    public function state($state)
    {
        $this->stateBeingBuilt = $state;

        $this->states[$state] = array(
            'recognizers' => array(),
            'actions' => array(),
            'skip_tokens' => array(),
        );

        return $this;
    }

    /**
     * Sets the starting state for the lexer by pushing it onto the
     * state stack.
     *
     * @param string $state The name of the starting state.
     *
     * @return \Dissect\Lexer\StatefulLexer This instance for fluent interface.
     */
    public function start($state)
    {
        $this->stateStack[] = $state;

        return $this;
    }

    /**
     * Sets an action for the token type that is currently being built.
     *
     * @param mixed $action The action to take: a state name to enter,
     *                      or one of the NO_ACTION / POP_STATE constants.
     *
     * @return \Dissect\Lexer\StatefulLexer This instance for fluent interface.
     *
     * @throws LogicException When no state or token type has been defined yet.
     */
    public function action($action)
    {
        if ($this->stateBeingBuilt === null || $this->typeBeingBuilt === null) {
            throw new LogicException("Define a lexer state and type first.");
        }

        $this->states[$this->stateBeingBuilt]['actions'][$this->typeBeingBuilt] = $action;

        return $this;
    }

    /**
     * {@inheritDoc}
     */
    protected function shouldSkipToken(Token $token)
    {
        // A POP_STATE action may have emptied the stack; with no current
        // state there is no skip list to consult.
        if (empty($this->stateStack)) {
            return false;
        }

        $state = $this->states[$this->stateStack[count($this->stateStack) - 1]];

        // Token types travel through array keys, so a numeric name such as
        // '0' arrives here as the integer 0. Comparing the string forms
        // keeps '0' matching '0' while preventing the loose-comparison
        // false positives (0 == 'WSP' on PHP < 8, '0' == '00').
        $type = (string) $token->getType();

        foreach ($state['skip_tokens'] as $skipped) {
            if ((string) $skipped === $type) {
                return true;
            }
        }

        return false;
    }

    /**
     * {@inheritDoc}
     *
     * Tries every recognizer of the current state and keeps the longest
     * match; on a tie the recognizer defined first wins. The action
     * attached to the winning token type is applied to the state stack
     * before the token is returned.
     *
     * @throws LogicException When no starting state has been set.
     */
    protected function extractToken($string)
    {
        if (empty($this->stateStack)) {
            throw new LogicException("You must set a starting state before lexing.");
        }

        $value = $type = $action = null;
        $state = $this->states[$this->stateStack[count($this->stateStack) - 1]];

        foreach ($state['recognizers'] as $t => $recognizer) {
            if ($recognizer->match($string, $v)) {
                if ($value === null || Util::stringLength($v) > Util::stringLength($value)) {
                    $value = $v;
                    $type = $t;
                    $action = $state['actions'][$type];
                }
            }
        }

        if ($type !== null) {
            if (is_string($action)) { // enter new state
                $this->stateStack[] = $action;
            } elseif ($action === self::POP_STATE) {
                array_pop($this->stateStack);
            }

            return new CommonToken($type, $value, $this->getCurrentLine());
        }

        return null;
    }
}
203 |
--------------------------------------------------------------------------------
/tests/Dissect/Parser/LALR1/Analysis/AnalyzerTest.php:
--------------------------------------------------------------------------------
1 | is('a', 'S', 'b')
23 | ->is();
24 |
25 | $grammar->start('S');
26 |
27 | $result = $this->getAnalysisResult($grammar);
28 | $table = $result->getAutomaton()->getTransitionTable();
29 |
30 | $this->assertEquals(1, $table[0]['S']);
31 | $this->assertEquals(2, $table[0]['a']);
32 | $this->assertEquals(2, $table[2]['a']);
33 | $this->assertEquals(3, $table[2]['S']);
34 | $this->assertEquals(4, $table[3]['b']);
35 | }
36 |
37 | /**
38 | * @test
39 | */
public function lookaheadShouldBeCorrectlyPumped()
{
    // S -> A B C D, where C derives the empty string.
    $grammar = new Grammar();

    $grammar('S')->is('A', 'B', 'C', 'D');
    $grammar('A')->is('a');
    $grammar('B')->is('b');
    $grammar('C')->is(/* empty */);
    $grammar('D')->is('d');

    $grammar->start('S');

    $dfa = $this->getAnalysisResult($grammar)->getAutomaton();

    $endOfInput = array(Parser::EOF_TOKEN_TYPE);
    $beforeB = array('b');
    $beforeD = array('d');

    // NOTE(review): get() presumably takes (rule number, dot index) - confirm
    // against the State class.
    $this->assertEquals($endOfInput, $dfa->getState(1)->get(0, 1)->getLookahead());
    $this->assertEquals($beforeB, $dfa->getState(3)->get(2, 1)->getLookahead());
    $this->assertEquals($beforeD, $dfa->getState(4)->get(4, 0)->getLookahead());
    $this->assertEquals($beforeD, $dfa->getState(5)->get(3, 1)->getLookahead());
    $this->assertEquals($endOfInput, $dfa->getState(7)->get(1, 4)->getLookahead());
    $this->assertEquals($endOfInput, $dfa->getState(8)->get(5, 1)->getLookahead());
}
93 |
94 | /**
95 | * @test
96 | */
public function parseTableShouldBeCorrectlyBuilt()
{
    // S -> a S b | (empty)
    $grammar = new Grammar();

    $grammar('S')
        ->is('a', 'S', 'b')
        ->is(/* empty */);

    $grammar->start('S');

    $table = $this->getAnalysisResult($grammar)->getParseTable();
    $action = $table['action'];
    $goto = $table['goto'];
    $eof = Parser::EOF_TOKEN_TYPE;

    // state 0: shift(2) on a, reduce(S -> ) at end of input
    $this->assertEquals(2, $action[0]['a']);
    $this->assertEquals(-2, $action[0][$eof]);

    // state 1: accept
    $this->assertEquals(0, $action[1][$eof]);

    // state 2: shift(2) on a, reduce(S -> ) on b
    $this->assertEquals(2, $action[2]['a']);
    $this->assertEquals(-2, $action[2]['b']);

    // state 3: shift(4) on b
    $this->assertEquals(4, $action[3]['b']);

    // state 4: reduce(S -> a S b) on b and at end of input
    $this->assertEquals(-1, $action[4]['b']);
    $this->assertEquals(-1, $action[4][$eof]);

    // goto entries for S
    $this->assertEquals(1, $goto[0]['S']);
    $this->assertEquals(3, $goto[2]['S']);
}
134 |
135 | /**
136 | * @test
137 | */
public function unexpectedConflictsShouldThrowAnException()
{
    // C and E are both empty, so after "a b" either could be reduced on d.
    $grammar = new Grammar();

    $grammar('S')
        ->is('a', 'b', 'C', 'd')
        ->is('a', 'b', 'E', 'd');

    $grammar('C')->is(/* empty */);
    $grammar('E')->is(/* empty */);

    $grammar->start('S');

    $conflict = null;

    try {
        $this->getAnalysisResult($grammar);
    } catch (ReduceReduceConflictException $e) {
        $conflict = $e;
    }

    if ($conflict === null) {
        $this->fail('Expected an exception warning of a reduce/reduce conflict.');
    }

    $this->assertEquals(3, $conflict->getStateNumber());
    $this->assertEquals('d', $conflict->getLookahead());
    $this->assertEquals(3, $conflict->getFirstRule()->getNumber());
    $this->assertEquals(4, $conflict->getSecondRule()->getNumber());
}
164 |
165 | /**
166 | * @test
167 | */
public function expectedConflictsShouldBeRecorded()
{
    // A deliberately ambiguous grammar with every resolution enabled.
    $grammar = new Grammar();

    $grammar('S')
        ->is('S', 'S', 'S')
        ->is('S', 'S')
        ->is('b');

    $grammar->resolve(Grammar::ALL);
    $grammar->start('S');

    $conflicts = $this->getAnalysisResult($grammar)->getResolvedConflicts();

    $this->assertCount(4, $conflicts);

    // 1st: shift/reduce in state 3 on b, involving rule 2
    $first = $conflicts[0];
    $this->assertEquals(3, $first['state']);
    $this->assertEquals('b', $first['lookahead']);
    $this->assertEquals(2, $first['rule']->getNumber());
    $this->assertEquals(Grammar::SHIFT, $first['resolution']);

    // 2nd: shift/reduce in state 4 on b, involving rule 1
    $second = $conflicts[1];
    $this->assertEquals(4, $second['state']);
    $this->assertEquals('b', $second['lookahead']);
    $this->assertEquals(1, $second['rule']->getNumber());
    $this->assertEquals(Grammar::SHIFT, $second['resolution']);

    // 3rd: reduce/reduce in state 4 at end of input, between rules 1 and 2
    $third = $conflicts[2];
    $this->assertEquals(4, $third['state']);
    $this->assertEquals(Parser::EOF_TOKEN_TYPE, $third['lookahead']);
    $this->assertEquals(1, $third['rules'][0]->getNumber());
    $this->assertEquals(2, $third['rules'][1]->getNumber());
    $this->assertEquals(Grammar::LONGER_REDUCE, $third['resolution']);

    // 4th: shift/reduce in state 4 on b, involving rule 2
    $fourth = $conflicts[3];
    $this->assertEquals(4, $fourth['state']);
    $this->assertEquals('b', $fourth['lookahead']);
    $this->assertEquals(2, $fourth['rule']->getNumber());
    $this->assertEquals(Grammar::SHIFT, $fourth['resolution']);
}
213 |
protected function getAnalysisResult(Grammar $grammar)
{
    // Run the (lazily created) analyzer on the given grammar.
    $analyzer = $this->getAnalyzer();

    return $analyzer->analyze($grammar);
}
218 |
protected function getAnalyzer()
{
    // Reuse the analyzer once it has been created.
    if ($this->analyzer !== null) {
        return $this->analyzer;
    }

    $this->analyzer = new Analyzer();

    return $this->analyzer;
}
227 | }
228 |
--------------------------------------------------------------------------------
/src/Dissect/Console/Command/DissectCommand.php:
--------------------------------------------------------------------------------
1 | setName('dissect')
24 | ->addArgument('grammar-class', InputArgument::REQUIRED, 'The grammar class.')
25 | ->addOption('debug', 'd', InputOption::VALUE_NONE, 'Writes the parse table in the debug format.')
26 | ->addOption('dfa', 'D', InputOption::VALUE_NONE, 'Exports the LALR(1) DFA as a Graphviz graph.')
27 | ->addOption('state', 's', InputOption::VALUE_REQUIRED, 'Exports only the specified state instead of the entire DFA.')
28 | ->addOption('output-dir', 'o', InputOption::VALUE_REQUIRED, 'Overrides the default output directory.')
29 | ->setHelp(<<--output-dir option:
35 |
36 | --output-dir=../some/other/dir
37 |
38 | The parse table is by default written with minimal whitespace to make it compact.
39 | If you wish to inspect the table manually, you can export it in a readable and
40 | well-commented way with the --debug option.
41 |
42 | If you wish to inspect the handle-finding automaton for your grammar (perhaps
43 | to aid with grammar debugging), use the --dfa option. When in use, Dissect
44 | will create a file with the automaton exported as a Graphviz graph
45 | in the output directory.
46 |
47 | Additionally, you can use the --state option to export only the specified
48 | state and any relevant transitions:
49 |
50 | --dfa --state=5
51 | EOT
52 | );
53 | }
54 |
/**
 * Analyzes the grammar class given on the command line, writes its parse
 * table and optionally exports the automaton (or one state) as DOT.
 *
 * @param InputInterface $input The console input.
 * @param OutputInterface $output The console output.
 *
 * @return int 1 when the grammar class or the requested state does not
 *             exist, 0 otherwise.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    // Allow "Vendor/Package/Grammar" as a shell-friendly spelling of the class name.
    $class = strtr(
        $input->getArgument('grammar-class'),
        '/',
        '\\'
    );
    $formatter = $this->getHelperSet()->get('formatter');

    $output->writeln('Analyzing...');
    $output->writeln('');

    if (!class_exists($class)) {
        $output->writeln(array(
            $formatter->formatBlock(
                sprintf('The class "%s" could not be found.', $class),
                'error',
                true
            ),
        ));

        return 1;
    }

    $grammar = new $class();

    if ($dir = $input->getOption('output-dir')) {
        // Only relative paths are resolved against the working directory;
        // an absolute path (POSIX, UNC or drive-letter form) is used as
        // given instead of being glued onto the cwd.
        $isAbsolute = $dir[0] === '/'
            || $dir[0] === '\\'
            || preg_match('~^[A-Za-z]:[\\\\/]~', $dir) === 1;

        if ($isAbsolute) {
            $outputDir = $dir;
        } else {
            $cwd = rtrim(getcwd(), DIRECTORY_SEPARATOR);

            $outputDir = $cwd . DIRECTORY_SEPARATOR . $dir;
        }
    } else {
        // Default: the directory of the file that declares the grammar class.
        $refl = new ReflectionClass($class);
        $outputDir = pathinfo($refl->getFileName(), PATHINFO_DIRNAME);
    }

    $analyzer = new Analyzer();
    $automaton = null;

    try {
        $result = $analyzer->analyze($grammar);
        $conflicts = $result->getResolvedConflicts();
        $automaton = $result->getAutomaton();
        $table = $result->getParseTable();

        if ($conflicts) {
            foreach ($conflicts as $conflict) {
                $output->writeln($this->formatConflict($conflict));
            }

            $output->writeln(sprintf(
                "%d conflicts in total",
                count($conflicts)
            ));

            $output->writeln('');
        }

        $output->writeln('Writing the parse table...');

        $fileName = $outputDir . DIRECTORY_SEPARATOR . 'parse_table.php';

        if ($input->getOption('debug')) {
            $tableDumper = new DebugTableDumper($grammar);
        } else {
            $tableDumper = new ProductionTableDumper();
        }

        $code = $tableDumper->dump($table);

        // The warning is suppressed because the failure is reported below.
        $ret = @file_put_contents($fileName, $code);
        if ($ret === false) {
            $output->writeln('Error writing the parse table');
        } else {
            $output->writeln('Parse table written');
        }
    } catch(ConflictException $e) {
        $output->writeln(array(
            $formatter->formatBlock(
                explode("\n", $e->getMessage()),
                'error',
                true
            ),
        ));

        // The automaton is still available for a --dfa export, which helps
        // with debugging the conflict.
        $automaton = $e->getAutomaton();
    }

    if ($input->getOption('dfa')) {
        $output->writeln('');

        $automatonDumper = new AutomatonDumper($automaton);

        if ($input->getOption('state') === null) {
            $output->writeln('Exporting the DFA...');

            $dot = $automatonDumper->dump();
            $file = 'automaton.dot';
        } else {
            $state = (int)$input->getOption('state');

            if (!$automaton->hasState($state)) {
                $output->writeln(array(
                    $formatter->formatBlock(
                        sprintf('The automaton has no state #%d', $state),
                        'error',
                        true
                    ),
                ));

                return 1;
            }

            $output->writeln(sprintf(
                'Exporting the DFA state %d...',
                $state
            ));

            $dot = $automatonDumper->dumpState($state);
            $file = sprintf('state_%d.dot', $state);
        }

        $fileName = $outputDir . DIRECTORY_SEPARATOR . $file;
        $ret = @file_put_contents($fileName, $dot);

        if ($ret === false) {
            $output->writeln('Error writing to the file');
        } else {
            $output->writeln('Successfully exported');
        }
    }

    return 0;
}
188 |
/**
 * Builds the one-line message reported for a resolved conflict.
 *
 * @param array $conflict The conflict record; the keys 'resolution',
 *                        'state' and 'lookahead' are read.
 *
 * @return string The message.
 */
protected function formatConflict(array $conflict)
{
    // A conflict resolved by shifting is reported as shift/reduce; every
    // other resolution is reported as reduce/reduce.
    if ($conflict['resolution'] === Grammar::SHIFT) {
        $kind = 'shift/reduce';
    } else {
        $kind = 'reduce/reduce';
    }

    $state = $conflict['state'];
    $lookahead = $conflict['lookahead'];

    return sprintf(
        "Resolved a %s conflict in state %d on lookahead %s",
        $kind,
        $state,
        $lookahead
    );
}
202 | }
203 |
--------------------------------------------------------------------------------
/docs/lexing.md:
--------------------------------------------------------------------------------
1 | Lexical analysis with Dissect
2 | =============================
3 |
4 | There are three classes for lexical analysis in Dissect, all under the
5 | namespace `Dissect\Lexer`: `SimpleLexer`, `StatefulLexer` and `RegexLexer`.
6 |
7 | SimpleLexer
8 | -----------
9 |
10 | `SimpleLexer` simply accepts some token definitions and applies them on
11 | the input. Let's create a subclass for this chapter:
12 |
13 | ```php
14 | use Dissect\Lexer\SimpleLexer;
15 |
16 | class ArithLexer extends SimpleLexer
17 | {
18 | public function __construct()
19 | {
20 | // token definitions
21 | }
22 | }
23 | ```
24 |
25 | ### Defining tokens
26 |
27 | There are 3 ways to define a token. The simplest one looks like this:
28 |
29 | ```php
30 | $this->token('+');
31 | ```
32 |
33 | This definition will simply match a plus symbol, using `+` both as the
34 | name and value of the token. You can use 2 arguments:
35 |
36 | ```php
37 | $this->token('CLASS', 'class');
38 | ```
39 |
40 | if you want the token name (first argument) to differ from what will actually be
41 | recognized (second argument).
42 |
43 | The final way defines a token by a regular expression:
44 |
45 | ```php
46 | $this->regex('INT', '/^[1-9][0-9]*/');
47 | ```
48 |
49 | Let's now define some tokens we will use in the next chapter:
50 |
51 | ```php
52 | class ArithLexer extends SimpleLexer
53 | {
54 | public function __construct()
55 | {
56 | $this->regex('INT', '/^[1-9][0-9]*/');
57 | $this->token('(');
58 | $this->token(')');
59 | $this->token('+');
60 | $this->token('*');
61 | $this->token('**');
62 | }
63 | }
64 | ```
65 |
66 | > **Tip**: You can also chain the method calls using a fluent interface.
67 |
68 | ### Skipping tokens
69 |
70 | Some tokens have to be recognized, but we don't want them cluttering the
71 | output. The best example is probably whitespace tokens: the lexer has
72 | to recognize them, but they carry no meaning or value, so we can tell
73 | the lexer to `skip` them:
74 |
75 | ```php
76 | class ArithLexer extends SimpleLexer
77 | {
78 | public function __construct()
79 | {
80 | $this->regex('INT', '/^[1-9][0-9]*/');
81 | $this->token('(');
82 | $this->token(')');
83 | $this->token('+');
84 | $this->token('*');
85 | $this->token('**');
86 |
87 | $this->regex('WSP', "/^[ \r\n\t]+/");
88 | $this->skip('WSP');
89 | }
90 | }
91 | ```
92 |
93 | > You can pass any number of token names to the `skip` method.
94 |
95 | ### Lexing
96 |
97 | Now that we've defined our tokens, we can simply call:
98 |
99 | ```php
100 | $lexer = new ArithLexer();
101 | $stream = $lexer->lex($input);
102 | ```
103 |
104 | The return value is an object implementing the
105 | `Dissect\Lexer\TokenStream\TokenStream` interface. The interface defines
106 | several methods you can use to inspect and move through the token
107 | stream. See [TokenStream.php][tokenstream] for all the methods you can
108 | use.
109 |
110 | > If you `count` the token stream, you may be surprised to find out that
111 | > for input like `5 + 3`, it actually contains 4 tokens. That's because,
112 | > as the last step of lexing, a special token called `$eof` is appended
113 | > to mark the end of input. This is crucial to the parsing process, so
114 | > please, never define a token called `$eof` yourself. It could lead to
115 | some pretty strange errors. Other forbidden token names are `$start`
116 | > and `$epsilon`.
117 |
118 | StatefulLexer
119 | -------------
120 |
121 | `SimpleLexer` should work fine for general use cases. However, let's
122 | imagine we're lexing a very simple templating language:
123 |
124 | Outer content, {{ variable_name }}, other outer content
125 |
126 | `SimpleLexer` falls short here, because the outer content can be pretty
127 | much anything, while the content inside the tags has to be strictly
128 | interpreted. Furthermore, if we were to work with this template, we'd
129 | want to skip the whitespace inside tags, but keep it in the outer
130 | content.
131 |
132 | That's where `StatefulLexer` comes in; during lexing, it maintains a
133 | stack of states with the top one being the current one, and for each
134 | token, you can define the action the lexer should take after recognizing
135 | it. Let's see an example for our templating language:
136 |
137 | ```php
138 | use Dissect\Lexer\StatefulLexer;
139 |
140 | class TemplateLexer extends StatefulLexer
141 | {
142 | public function __construct()
143 | {
144 | $this->state('outside')
145 | ->regex('CONTENT', '/^[^"{{"]*/')
146 | ->token('{{')->action('tag');
147 |
148 | $this->state('tag')
149 | ->regex('WSP', "/^[ \r\n\t]+/")
150 | ->regex('VAR', '/^[a-zA-Z_]+/')
151 | ->token('}}')->action(StatefulLexer::POP_STATE)
152 | ->skip('WSP');
153 |
154 | $this->start('outside');
155 | }
156 | }
157 | ```
158 |
159 | Please note that before defining any tokens, we have to define a state.
160 | For the tokens that cause a state transition, we call `action` to
161 | specify what the lexer should do. The action can be either a string, in
162 | which case the lexer goes to the state specified by the string, or
163 | `StatefulLexer::POP_STATE`, which causes the lexer to pop the current
164 | state off the stack, essentially going back to the previous state.
165 | Finally, we tell the lexer in which state to start by calling `start`.
166 |
167 | Improving lexer performance
168 | ---------------------------
169 |
170 | There's one important trick to improve the performance of your lexers.
171 | The documentation uses it implicitly, but it requires an explicit mention:
172 |
173 | When using one of the lexer classes documented above and defining tokens
174 | using regular expressions, *always* anchor the regex at the beginning
175 | using `^` like this:
176 |
177 | ```php
178 | $this->regex('INT', '/^[1-9][0-9]*/');
179 | ```
180 |
181 | This little optimization will lead to substantial performance gains on
182 | any but the shortest input strings, since without anchoring, the PCRE
183 | engine would always look for matches throughout the entire remaining
184 | input string, which would be incredibly wasteful for long inputs.
185 |
186 | RegexLexer
187 | ----------
188 |
189 | When designing the lexer classes, my goal was not to sacrifice
190 | user-friendliness for performance. However, I'm well aware that there
191 | are use cases that require the highest performance possible. That's
192 | why I adapted the highly performant but slightly less user-friendly
193 | [lexer][doctrinelexer] from [doctrine][doctrine] into Dissect.
194 |
195 | The usage is almost identical to the original class; writing a lexer
196 | for the arithmetic expressions could look something like this:
197 |
198 | ```php
199 | use Dissect\Lexer\RegexLexer;
200 | use RuntimeException;
201 |
202 | class ArithLexer extends RegexLexer
203 | {
204 | protected $tokens = ['+', '*', '**', '(', ')'];
205 |
206 | protected function getCatchablePatterns()
207 | {
208 | return ['[1-9][0-9]*'];
209 | }
210 |
211 | protected function getNonCatchablePatterns()
212 | {
213 | return ['\s+'];
214 | }
215 |
216 | protected function getType(&$value)
217 | {
218 | if (is_numeric($value)) {
219 | $value = (int)$value;
220 |
221 | return 'INT';
222 | } elseif (in_array($value, $this->tokens)) {
223 | // the types of the simple tokens equal their values here
224 | return $value;
225 | } else {
226 | throw new RuntimeException(sprintf('Invalid token "%s"', $value));
227 | }
228 | }
229 | }
230 | ```
231 |
232 | Continue
233 | --------
234 |
235 | Now that we've demonstrated how to perform lexical analysis with
236 | Dissect, we can move onto syntactical analysis, commonly known as
237 | [parsing][parsing].
238 |
239 | [tokenstream]: ../src/Dissect/Lexer/TokenStream/TokenStream.php
240 | [parsing]: parsing.md
241 | [doctrinelexer]: https://github.com/doctrine/lexer/blob/master/lib/Doctrine/Common/Lexer/AbstractLexer.php
242 | [doctrine]: https://github.com/doctrine/lexer
243 |
--------------------------------------------------------------------------------
/src/Dissect/Parser/Grammar.php:
--------------------------------------------------------------------------------
1 |
11 | */
class Grammar
{
    /**
     * The name given to the rule the grammar is augmented with
     * when start() is called.
     */
    const START_RULE_NAME = '$start';

    /**
     * The epsilon symbol signifies an empty production.
     */
    const EPSILON = '$epsilon';

    /**
     * Signifies that the parser should not resolve any
     * grammar conflicts.
     */
    const NONE = 0;

    /**
     * Signifies that the parser should resolve
     * shift/reduce conflicts by always shifting.
     */
    const SHIFT = 1;

    /**
     * Signifies that the parser should resolve
     * reduce/reduce conflicts by reducing with
     * the longer rule.
     */
    const LONGER_REDUCE = 2;

    /**
     * Signifies that the parser should resolve
     * reduce/reduce conflicts by reducing
     * with the rule that was given earlier in
     * the grammar.
     */
    const EARLIER_REDUCE = 4;

    /**
     * Signifies that the conflicts should be
     * resolved by taking operator precedence
     * into account.
     */
    const OPERATORS = 8;

    /**
     * Signifies that the parser should automatically
     * resolve all grammar conflicts.
     */
    const ALL = 15;

    /**
     * Left operator associativity.
     */
    const LEFT = 0;

    /**
     * Right operator associativity.
     */
    const RIGHT = 1;

    /**
     * The operator is nonassociative.
     */
    const NONASSOC = 2;

    /**
     * All rules of the grammar, indexed by rule number. Index 0 is
     * reserved for the augmented start rule created by start().
     *
     * @var \Dissect\Parser\Rule[]
     */
    protected $rules = array();

    /**
     * The rules grouped by the name of the nonterminal they define.
     *
     * @var array
     */
    protected $groupedRules = array();

    /**
     * The number the next rule created by is() will receive.
     *
     * @var int
     */
    protected $nextRuleNumber = 1;

    /**
     * The bitmask of the conflict resolution mode.
     *
     * @var int
     */
    protected $conflictsMode = 9; // SHIFT | OPERATORS

    /**
     * The nonterminal selected by the last __invoke() call.
     *
     * @var string
     */
    protected $currentNonterminal;

    /**
     * The rule created by the last is() call; reset by operators().
     *
     * @var \Dissect\Parser\Rule
     */
    protected $currentRule;

    /**
     * Operator information ('prec' and 'assoc') indexed by token type.
     *
     * @var array
     */
    protected $operators = array();

    /**
     * The group of operators defined by the last operators() call;
     * reset by is().
     *
     * @var array
     */
    protected $currentOperators;

    /**
     * Selects the nonterminal that the following is() calls describe.
     *
     * @param string $nonterminal The name of the nonterminal.
     *
     * @return \Dissect\Parser\Grammar This instance.
     */
    public function __invoke($nonterminal)
    {
        $this->currentNonterminal = $nonterminal;

        return $this;
    }

    /**
     * Defines an alternative for a grammar rule.
     *
     * @param string... The components of the rule.
     *
     * @return \Dissect\Parser\Grammar This instance.
     *
     * @throws \LogicException When no nonterminal has been selected yet.
     */
    public function is()
    {
        // from now on, prec() refers to the rule, not to an operator group
        $this->currentOperators = null;

        if ($this->currentNonterminal === null) {
            throw new LogicException(
                'You must specify a name of the rule first.'
            );
        }

        $num = $this->nextRuleNumber++;

        $rule = new Rule($num, $this->currentNonterminal, func_get_args());

        $this->groupedRules[$this->currentNonterminal][] = $rule;
        $this->currentRule = $rule;
        $this->rules[$num] = $rule;

        return $this;
    }

    /**
     * Sets the callback for the current rule.
     *
     * @param callable $callback The callback.
     *
     * @return \Dissect\Parser\Grammar This instance.
     *
     * @throws \LogicException When no rule has been defined yet.
     */
    public function call($callback)
    {
        if ($this->currentRule === null) {
            throw new LogicException(
                'You must specify a rule first.'
            );
        }

        $this->currentRule->setCallback($callback);

        return $this;
    }

    /**
     * Returns the set of rules of this grammar.
     *
     * @return \Dissect\Parser\Rule[] The rules.
     */
    public function getRules()
    {
        return $this->rules;
    }

    /**
     * Returns the rule stored under the given number.
     *
     * No existence check is performed; the number is used
     * directly as an index into the set of rules.
     *
     * @param int $number The number of the rule.
     *
     * @return \Dissect\Parser\Rule The rule.
     */
    public function getRule($number)
    {
        return $this->rules[$number];
    }

    /**
     * Returns the nonterminal symbols of this grammar.
     *
     * The names are taken from the grouped rules, the same source
     * hasNonterminal() consults, so both methods always agree.
     *
     * @return string[] The nonterminals.
     */
    public function getNonterminals()
    {
        return array_keys($this->groupedRules);
    }

    /**
     * Returns rules grouped by nonterminal name.
     *
     * @return array The rules grouped by nonterminal name.
     */
    public function getGroupedRules()
    {
        return $this->groupedRules;
    }

    /**
     * Sets a start rule for this grammar.
     *
     * The grammar is augmented with rule number 0, named
     * START_RULE_NAME, whose only component is $name. This rule
     * is not added to the grouped rules.
     *
     * @param string $name The name of the start rule.
     */
    public function start($name)
    {
        $this->rules[0] = new Rule(0, self::START_RULE_NAME, array($name));
    }

    /**
     * Returns the augmented start rule. For internal use only.
     *
     * @return \Dissect\Parser\Rule The start rule.
     *
     * @throws \LogicException When start() has not been called.
     */
    public function getStartRule()
    {
        if (!isset($this->rules[0])) {
            throw new LogicException("No start rule specified.");
        }

        return $this->rules[0];
    }

    /**
     * Sets the mode of conflict resolution.
     *
     * @param int $mode The bitmask for the mode.
     */
    public function resolve($mode)
    {
        $this->conflictsMode = $mode;
    }

    /**
     * Returns the conflict resolution mode for this grammar.
     *
     * @return int The bitmask of the resolution mode.
     */
    public function getConflictsMode()
    {
        return $this->conflictsMode;
    }

    /**
     * Does a nonterminal $name exist in the grammar?
     *
     * @param string $name The name of the nonterminal.
     *
     * @return boolean
     */
    public function hasNonterminal($name)
    {
        return array_key_exists($name, $this->groupedRules);
    }

    /**
     * Defines a group of operators.
     *
     * Every operator starts out left-associative with precedence 1;
     * defining an already known operator again resets it to these
     * defaults.
     *
     * @param string,... Any number of tokens that serve as the operators.
     *
     * @return \Dissect\Parser\Grammar This instance for fluent interface.
     */
    public function operators()
    {
        // from now on, prec() refers to this group, not to a rule
        $this->currentRule = null;

        $ops = func_get_args();

        $this->currentOperators = $ops;

        foreach ($ops as $op) {
            $this->operators[$op] = array(
                'prec' => 1,
                'assoc' => self::LEFT,
            );
        }

        return $this;
    }

    /**
     * Marks the current group of operators as left-associative.
     *
     * @return \Dissect\Parser\Grammar This instance for fluent interface.
     */
    public function left()
    {
        return $this->assoc(self::LEFT);
    }

    /**
     * Marks the current group of operators as right-associative.
     *
     * @return \Dissect\Parser\Grammar This instance for fluent interface.
     */
    public function right()
    {
        return $this->assoc(self::RIGHT);
    }

    /**
     * Marks the current group of operators as nonassociative.
     *
     * @return \Dissect\Parser\Grammar This instance for fluent interface.
     */
    public function nonassoc()
    {
        return $this->assoc(self::NONASSOC);
    }

    /**
     * Explicitly sets the associativity of the current group of operators.
     *
     * @param int $a One of Grammar::LEFT, Grammar::RIGHT, Grammar::NONASSOC
     *
     * @return \Dissect\Parser\Grammar This instance for fluent interface.
     *
     * @throws \LogicException When no group of operators is being defined.
     */
    public function assoc($a)
    {
        if (!$this->currentOperators) {
            throw new LogicException('Define a group of operators first.');
        }

        foreach ($this->currentOperators as $op) {
            $this->operators[$op]['assoc'] = $a;
        }

        return $this;
    }

    /**
     * Sets the precedence (as an integer) of the current group of operators.
     * If no group of operators is being specified, sets the precedence
     * of the currently described rule.
     *
     * @param int $i The precedence as an integer.
     *
     * @return \Dissect\Parser\Grammar This instance for fluent interface.
     *
     * @throws \LogicException When neither a group of operators nor a rule
     *                         is currently being defined.
     */
    public function prec($i)
    {
        // an operator group takes priority over a rule
        if ($this->currentOperators) {
            foreach ($this->currentOperators as $op) {
                $this->operators[$op]['prec'] = $i;
            }

            return $this;
        }

        if (!$this->currentRule) {
            throw new LogicException('Define a group of operators or a rule first.');
        }

        $this->currentRule->setPrecedence($i);

        return $this;
    }

    /**
     * Is the passed token an operator?
     *
     * @param string $token The token type.
     *
     * @return boolean
     */
    public function hasOperator($token)
    {
        return array_key_exists($token, $this->operators);
    }

    /**
     * Returns the information stored for an operator.
     *
     * No existence check is performed; use hasOperator() first
     * when the token may not be an operator.
     *
     * @param string $token The token type.
     *
     * @return array An array with the keys 'prec' and 'assoc'.
     */
    public function getOperatorInfo($token)
    {
        return $this->operators[$token];
    }
}
385 |
--------------------------------------------------------------------------------
/docs/parsing.md:
--------------------------------------------------------------------------------
1 | Parsing with Dissect
2 | ====================
3 |
4 | Why an LALR(1) parser?
5 | ----------------------
6 |
7 | Parsing is a task that's needed more often than one would think;
8 | for examples in some famous PHP projects, see [this parser][twigparser]
9 | from [Twig][twig] and [these][annotationsparser] [two][dqlparser] from
10 | [Doctrine][doctrine]. Chances are you've written one; if you did, it was
11 | most likely a [recursive descent parser][rdparser], just like the
12 | examples above. Now, such parsers have several disadvantages: first,
13 | they obviously have to be manually written. Second, they're *recursive*,
14 | which means one thing: nest the input deep enough (like an
15 | annotation, which has another annotation as a parameter, that annotation
16 | has another annotation as a parameter ...) and your PHP process blows up
17 | because of stack overflow (to be fair, you'd have to nest pretty deep).
18 | And third, such parsers belong to a class of parsers known as
19 | [LL(k)][llk], which means they're generally not as powerful as [LR(k)][lrk]
20 | parsers. For instance, they cannot handle left-recursive rules
21 | (rules like `A -> A ...`), which are probably the only sane way of
22 | expressing left-associative binary operators (like addition, for
23 | example).
24 |
25 | But let's get to actually parsing something.
26 |
27 | Writing a grammar
28 | -----------------
29 |
30 | A grammar is represented by a subclass of `Dissect\Parser\Grammar`.
31 |
32 | ```php
33 | use Dissect\Parser\Grammar;
34 |
35 | class ArithGrammar extends Grammar
36 | {
37 | public function __construct()
38 | {
39 | // rule definitions
40 | }
41 | }
42 | ```
43 |
44 | First, you tell Dissect what rule you are describing. Let's say we want
45 | to describe a rule for a `Sum`:
46 |
47 | ```php
48 | $this('Sum')
49 | ```
50 |
51 | and then you specify what the rule actually `is`:
52 |
53 | ```php
54 | $this('Sum')
55 | ->is('int', '+', 'int');
56 | ```
57 |
58 | A rule can of course have many alternatives:
59 |
60 | ```php
61 | $this('Sum')
62 | ->is('int', '+', 'int')
63 | ->is('string', '+', 'string');
64 | ```
65 |
66 | and you will probably want to specify how to evaluate the rule:
67 |
68 | ```php
69 | $this('Sum')
70 | ->is('int', '+', 'int')
71 | ->call(function ($l, $_, $r) {
72 | return $l + $r;
73 | })
74 |
75 | ->is('string', '+', 'string')
76 | ->call(function ($l, $_, $r) {
77 | return $l . $r;
78 | });
79 | ```
80 |
81 | > The number of arguments to the callback function is always equal
82 | > to the length of the rule to which it belongs.
83 |
84 | ### Empty rules
85 |
86 | A grammar can (and many times will) contain empty rules, that is, rules that
87 | can match 0 tokens of the input. This is useful when, for example,
88 | describing a list of function arguments, which can be either empty or a list of
89 | values separated by commas.
90 |
91 | An empty rule is defined simply by calling `is` with 0 arguments:
92 |
93 | ```php
94 | $this('Empty')
95 | ->is();
96 | ```
97 |
98 | If you find this notation unclear, you can explicitly mark empty rules
99 | with a comment:
100 |
101 | ```php
102 | $this('Empty')
103 | ->is(/* empty */);
104 | ```
105 |
106 | > **Beware:** When you don't specify a callback for a rule, Dissect
107 | > will default to returning the leftmost (first) component of the rule. You
108 | > are, however, required to specify a callback for an empty rule, since
109 | > in a rule with zero components, there is obviously no leftmost one.
110 |
111 | Example: Parsing mathematical expressions
112 | -----------------------------------------
113 |
114 | In the chapter on lexing, we've created a lexer we will now use to
115 | process our expressions:
116 |
117 | ```php
118 | class ArithLexer extends SimpleLexer
119 | {
120 | public function __construct()
121 | {
122 | $this->regex('INT', '/^[1-9][0-9]*/');
123 | $this->token('(');
124 | $this->token(')');
125 | $this->token('+');
126 | $this->token('*');
127 | $this->token('**');
128 |
129 | $this->regex('WSP', "/^[ \r\n\t]+/");
130 | $this->skip('WSP');
131 | }
132 | }
133 |
134 | $lexer = new ArithLexer();
135 | ```
136 |
137 | As for the grammar, let's start out slow, with only a single operator:
138 |
139 | ```php
140 | $this('Expr')
141 | ->is('Expr', '+', 'Expr')
142 | ->call(function ($l, $_, $r) {
143 | return $l + $r;
144 | })
145 |
146 | ->is('INT')
147 | ->call(function ($i) {
148 | return (int)$i->getValue();
149 | });
150 |
151 | $this->start('Expr');
152 | ```
153 |
154 | These two rules specify an expression to be either two expressions
155 | separated by a plus or simply an integer. The call to `start()`
156 | sets the starting rule of the grammar.
157 |
158 | Now, we can simply pass the grammar to a parser object:
159 |
160 | ```php
161 | use Dissect\Parser\LALR1\Parser;
162 |
163 | $parser = new Parser(new ArithGrammar());
164 | $stream = $lexer->lex('1 + 2 + 3');
165 | echo $parser->parse($stream);
166 | // => 6
167 | ```
168 |
169 | and yay, it works!
170 |
171 | ### Operator associativity
172 |
173 | Actually, it doesn't. It *seems* to work because addition happens to be
174 | associative, but a problem appears once we add another rule to the
175 | grammar to represent subtraction:
176 |
177 | ```php
178 | $this('Expr')
179 | ->is('Expr', '+', 'Expr') ...
180 |
181 | ->is('Expr', '-', 'Expr')
182 | ->call(function ($l, $_, $r) {
183 | return $l - $r;
184 | })
185 |
186 | ->is('INT') ...
187 | ```
188 |
189 | The result looks like this:
190 |
191 | ```php
192 | $stream = $lexer->lex('3 - 5 - 2');
193 | echo $parser->parse($stream);
194 | // => 0
195 | ```
196 |
197 | Well, that's certainly incorrect. The problem is that our grammar
198 | actually contains a conflict (a *shift/reduce* conflict, if you're a fan
199 | of termini technici. See the [section on conflict resolution](#resolving-conflicts).)
200 | which Dissect automatically resolves in a way that makes our `+` and `-`
201 | operators right-associative. The problem is fortunately easy to solve:
202 | we have to mark them as left-associative operators:
203 |
204 | ```php
205 | ->is('INT') ...
206 |
207 | $this->operators('+', '-')->left();
208 | ```
209 |
210 | This makes Dissect treat the two tokens in a special way, the conflict
211 | is resolved to represent left-associativity and the parser works correctly:
212 |
213 | ```php
214 | $stream = $lexer->lex('3 - 5 - 2');
215 | echo $parser->parse($stream);
216 | // => -4
217 | ```
218 |
219 | ### Operator precedence
220 |
221 | Unfortunately, we're not out of the woods yet. When we add another two
222 | rules to represent multiplication and division, we see that the parser
223 | still makes mistakes:
224 |
225 | ```php
226 | $this('Expr')
227 | ...
228 |
229 | ->is('Expr', '*', 'Expr')
230 | ->call(function ($l, $_, $r) {
231 | return $l * $r;
232 | })
233 |
234 | ->is('Expr', '/', 'Expr')
235 | ->call(function ($l, $_, $r) {
236 | return $l / $r;
237 | })
238 |
239 | ...
240 |
241 | $this->operators('*', '/')->left();
242 | ...
243 |
244 | $stream = $lexer->lex('2 + 3 * 5');
245 | echo $parser->parse($stream);
246 | // => 25
247 | ```
248 |
249 | The problem is that Dissect doesn't know anything about the precedence
250 | of our operators. But we can, of course, provide the necessary information:
251 |
252 | ```php
253 | $this->operators('+', '-')->left()->prec(1);
254 | $this->operators('*', '/')->left()->prec(2);
255 |
256 | ...
257 |
258 | $stream = $lexer->lex('2 + 3 * 5');
259 | echo $parser->parse($stream);
260 | // => 17
261 | ```
262 |
263 | The higher the integer passed to the `prec()` method, the higher the
264 | precedence of the specified operators.
265 |
266 | And we have the basic grammar for mathematical expressions in place!
267 | As an exercise, try to handle the rest of the tokens defined in the lexer:
268 |
269 | - Create a rule to handle parentheses around expressions.
270 | - Create a rule for the final operator, `**`, which represents
271 | exponentiation. Give it the highest precedence and make it
272 | *right-associative* (the method is, shockingly, called `right()`).
273 |
274 | ### Specifying precedences on rules instead of operators
275 |
276 | As a final touch, we'd like to add a unary minus operator to our grammar:
277 |
278 | ```php
279 | $this('Expr')
280 | ...
281 |
282 | ->is('-', 'Expr')
283 | ->call(function ($_, $e) {
284 | return -$e;
285 | })
286 | ...
287 | ```
288 |
289 | But you might feel that something is amiss. Unary minus should have the
290 | highest precedence, but we've specified the precedence of `-` to be the
291 | lowest, actually. But don't worry, we can assign precedences directly to
292 | rules:
293 |
294 | ```php
295 | $this('Expr')
296 | ...
297 |
298 | ->is('-', 'Expr')->prec(4) // higher than everything
299 | ->call(function ($_, $e) {
300 | return -$e;
301 | })
302 | ...
303 | ```
304 |
305 | ### Nonassociativity
306 |
307 | Apart from being left- or right-associative, operators can be
308 | nonassociative, which means that for an operator `op`, the input
309 | `a op b op c` means neither `(a op b) op c` nor `a op (b op c)`,
310 | but is considered a syntax error.
311 |
312 | This has certain use cases; for instance, one of the nonassociative
313 | operators in the grammar for PHP is `<`: when parsing `1 < 2 < 3`,
314 | the PHP parser reports a syntax error.
315 |
316 | The corresponding method in Dissect grammars is `nonassoc()`:
317 |
318 | ```php
319 | $this->operators('<', '>')->nonassoc()->prec(...);
320 | ```
321 |
322 | ### Describing common syntactic structures
323 |
324 | To see how to describe commonly used syntactic structures such as
325 | repetitions and lists, see the [dedicated documentation section][common].
326 |
327 | Invalid input
328 | -------------
329 |
330 | When the parser encounters a syntactical error, it stops dead and
331 | throws a `Dissect\Parser\Exception\UnexpectedTokenException`.
332 | The exception gives you programmatic access to information about the
333 | problem: `getToken()` returns a `Dissect\Lexer\Token` representing the
334 | invalid token and `getExpected()` returns an array of token types the parser
335 | expected to encounter.
336 |
337 | Precomputing the parse table
338 | ----------------------------
339 |
340 | The parser needs a *parse table* to decide what to do based on given
341 | input. That parse table is created from the grammar and, if we give the
342 | parser only the grammar, needs to be computed every time we instantiate
343 | the parser.
344 |
345 | Grammar analysis is costly; if you need the speed, a far better choice
346 | would be to precompute the table beforehand (perhaps as a part of your
347 | build process) like this:
348 |
349 | ```php
350 | use Dissect\Parser\LALR1\Analysis\Analyzer;
351 |
352 | $analyzer = new Analyzer();
353 | $parseTable = $analyzer->analyze($grammar)->getParseTable();
354 | ```
355 |
356 | Now that we've got the parse table, we can dump it to a string which
357 | we then save to a file. To do this, we can use either
358 | `Dissect\Parser\LALR1\Dumper\ProductionTableDumper`:
359 |
360 | ```php
361 | $dumper = new ProductionTableDumper();
362 | $php = $dumper->dump($parseTable);
363 | ```
364 |
365 | which produces very compact, whitespace-free and absolutely unreadable
366 | code, or `Dissect\Parser\LALR1\Dumper\DebugTableDumper`:
367 |
368 | ```php
369 | $dumper = new DebugTableDumper($grammar);
370 | $php = $dumper->dump($parseTable);
371 | ```
372 |
373 | which produces indented, readable representation with comments
374 | explaining each step the parser takes when processing the input.
375 |
376 | ### Using the dumped parse table
377 |
378 | To use the dumped parse table, just write
379 |
380 | ```php
381 | $parser = new Parser($grammar, require $parseTableFile);
382 | ```
383 |
384 | You still need to pass the grammar, since it contains the callbacks
385 | used to evaluate the input.
386 |
387 | > If you intend to use Dissect more like a traditional parser generator,
388 | > you don't actually need to do any of this, of course. Dissect provides a
389 | > command-line interface you can use to process and debug your grammars.
390 | > It's described in its own [documentation section][cli].
391 |
392 | Resolving conflicts
393 | -------------------
394 |
395 | *Caution, this is advanced stuff. You probably won't ever need to worry
396 | about this.*
397 |
398 | LALR(1) is generally a very powerful parsing algorithm. However, there
399 | are practical grammars that are, unfortunately, almost-but-not-quite
400 | LALR(1). When running an LALR(1) analyzer on such grammars, one sees
401 | that they contain 2 types of conflicts:
402 |
403 | - **Shift/Reduce conflicts** - the parser doesn't know whether to shift
404 | another token or reduce what's on the stack.
405 |
406 | - **Reduce/Reduce conflicts** - the parser can reduce by multiple
407 | grammar rules.
408 |
409 | There are 4 commonly used ways of resolving such conflicts and Dissect allows you to
410 | combine them any way you want:
411 |
412 | 1. On a shift/reduce conflict, consult the operator precedence
413 | and associativity information. The rules for resolution are a little
414 | complicated, but the conflict may be resolved as a reduce (either the
415 | precedence of the rule is higher than that of the shifted token or the
416 | token is left-associative), a shift (the rule precedence is lower or the
417 | token is right-associative) or even as an error (when the token is
418 | nonassociative). Note that Dissect doesn't report conflicts resolved
419 | using this technique, since they were intentionally created by the user
420 | and therefore are not really conflicts. Represented by the
421 | constant `Grammar::OPERATORS`.
422 |
423 | 2. On a shift/reduce conflict, always shift. This is represented by
424 | the constant `Grammar::SHIFT` and, together with the above method,
425 | is enabled by default.
426 |
427 | 3. On a reduce/reduce conflict, reduce using the longer rule.
428 | Represented by `Grammar::LONGER_REDUCE`. Both this and the previous
429 | way represent the same philosophy: take the largest bite possible.
430 | This is usually what the user intended to express.
431 |
432 | 4. On a reduce/reduce conflict, reduce using the rule that was
433 | declared earlier in the grammar. Represented by
434 | `Grammar::EARLIER_REDUCE`.
435 |
436 | To specify precisely how Dissect should resolve parse table conflicts,
437 | call `resolve` on your grammar:
438 |
439 | ```php
440 | $this->resolve(Grammar::SHIFT | Grammar::OPERATORS | Grammar::LONGER_REDUCE);
441 | ```
442 |
443 | There are two other constants: `Grammar::NONE` that forbids any
444 | conflicts in the grammar (even the operators-related ones) and
445 | `Grammar::ALL`, which is a combination of all the 4 above methods
446 | defined simply for convenience.
447 |
448 | [twigparser]: https://github.com/fabpot/Twig/blob/master/lib/Twig/Parser.php
449 | [twig]: https://github.com/fabpot/Twig
450 | [annotationsparser]: https://github.com/doctrine/common/blob/master/lib/Doctrine/Common/Annotations/DocParser.php
451 | [dqlparser]: https://github.com/doctrine/doctrine2/blob/master/lib/Doctrine/ORM/Query/Parser.php
452 | [doctrine]: https://github.com/doctrine
453 | [rdparser]: http://en.wikipedia.org/wiki/Recursive_descent_parser
454 | [llk]: http://en.wikipedia.org/wiki/LL_parser
455 | [lrk]: http://en.wikipedia.org/wiki/LR_parser
456 | [cli]: cli.md
457 | [common]: common.md
458 |
--------------------------------------------------------------------------------
/src/Dissect/Parser/LALR1/Analysis/Analyzer.php:
--------------------------------------------------------------------------------
1 |
18 | */
19 | class Analyzer
20 | {
/**
 * Runs the complete analysis of a grammar.
 *
 * The automaton is built from the grammar first; the parse table
 * and the list of conflicts are then built from that automaton.
 *
 * @param \Dissect\Parser\Grammar $grammar The grammar to analyse.
 *
 * @return \Dissect\Parser\LALR1\Analysis\AnalysisResult The result of the analysis.
 */
public function analyze(Grammar $grammar)
{
    $fsa = $this->buildAutomaton($grammar);

    // buildParseTable() returns a pair: the table first, the conflicts second
    list($table, $foundConflicts) = $this->buildParseTable($fsa, $grammar);

    return new AnalysisResult($table, $fsa, $foundConflicts);
}
35 |
36 | /**
37 | * Builds the handle-finding FSA from the grammar.
38 | *
39 | * @param \Dissect\Parser\Grammar $grammar The grammar.
40 | *
41 | * @return \Dissect\Parser\LALR1\Analysis\Automaton The resulting automaton.
42 | */
43 | protected function buildAutomaton(Grammar $grammar)
44 | {
45 | // the eventual automaton
46 | $automaton = new Automaton();
47 |
48 | // the queue of states that need processing
49 | $queue = new SplQueue();
50 |
51 | // the BST for state kernels
52 | $kernelSet = new KernelSet();
53 |
54 | // rules grouped by their name
55 | $groupedRules = $grammar->getGroupedRules();
56 |
57 | // FIRST sets of nonterminals
58 | $firstSets = $this->calculateFirstSets($groupedRules);
59 |
60 | // keeps a list of tokens that need to be pumped
61 | // through the automaton
62 | $pumpings = array();
63 |
64 | // the item from which the whole automaton
65 | // is derived
66 | $initialItem = new Item($grammar->getStartRule(), 0);
67 |
68 | // construct the initial state
69 | $state = new State($kernelSet->insert(array(
70 | array($initialItem->getRule()->getNumber(), $initialItem->getDotIndex()),
71 | )), array($initialItem));
72 |
73 | // the initial item automatically has EOF
74 | // as its lookahead
75 | $pumpings[] = array($initialItem, array(Parser::EOF_TOKEN_TYPE));
76 |
77 | $queue->enqueue($state);
78 | $automaton->addState($state);
79 |
80 | while (!$queue->isEmpty()) {
81 | $state = $queue->dequeue();
82 |
83 | // items of this state are grouped by
84 | // the active component to calculate
85 | // transitions easily
86 | $groupedItems = array();
87 |
88 | // calculate closure
89 | $added = array();
90 | $currentItems = $state->getItems();
91 | for ($x = 0; $x < count($currentItems); $x++) {
92 | $item = $currentItems[$x];
93 |
94 | if (!$item->isReduceItem()) {
95 | $component = $item->getActiveComponent();
96 | $groupedItems[$component][] = $item;
97 |
98 | // if nonterminal
99 | if ($grammar->hasNonterminal($component)) {
100 |
101 | // calculate lookahead
102 | $lookahead = array();
103 | $cs = $item->getUnrecognizedComponents();
104 |
105 | foreach ($cs as $i => $c) {
106 | if (!$grammar->hasNonterminal($c)) {
107 | // if terminal, add it and break the loop
108 | $lookahead = Util::union($lookahead, array($c));
109 |
110 | break;
111 | } else {
112 | // if nonterminal
113 | $new = $firstSets[$c];
114 |
115 | if (!in_array(Grammar::EPSILON, $new)) {
116 | // if the component doesn't derive
117 | // epsilon, merge FIRST sets and break
118 | $lookahead = Util::union($lookahead, $new);
119 |
120 | break;
121 | } else {
122 | // if it does
123 |
124 | if ($i < (count($cs) - 1)) {
125 | // if more components ahead, remove epsilon
126 | unset($new[array_search(Grammar::EPSILON, $new)]);
127 | }
128 |
129 | // and continue the loop
130 | $lookahead = Util::union($lookahead, $new);
131 | }
132 | }
133 | }
134 |
135 | // two items are connected if the unrecognized
136 | // part of rule 1 derives epsilon
137 | $connect = false;
138 |
139 | // only store the pumped tokens if there
140 | // actually is an unrecognized part
141 | $pump = true;
142 |
143 | if (empty($lookahead)) {
144 | $connect = true;
145 | $pump = false;
146 | } else {
147 | if (in_array(Grammar::EPSILON, $lookahead)) {
148 | unset($lookahead[array_search(Grammar::EPSILON, $lookahead)]);
149 |
150 | $connect = true;
151 | }
152 | }
153 |
154 | foreach ($groupedRules[$component] as $rule) {
155 | if (!in_array($component, $added)) {
156 | // if $component hasn't yet been expanded,
157 | // create new items for it
158 | $newItem = new Item($rule, 0);
159 |
160 | $currentItems[] = $newItem;
161 | $state->add($newItem);
162 |
163 | } else {
164 | // if it was expanded, each original
165 | // rule might bring new lookahead tokens,
166 | // so get the rule from the current state
167 | $newItem = $state->get($rule->getNumber(), 0);
168 | }
169 |
170 | if ($connect) {
171 | $item->connect($newItem);
172 | }
173 |
174 | if ($pump) {
175 | $pumpings[] = array($newItem, $lookahead);
176 | }
177 | }
178 | }
179 |
180 | // mark the component as processed
181 | $added[] = $component;
182 | }
183 | }
184 |
185 | // calculate transitions
186 | foreach ($groupedItems as $thisComponent => $theseItems) {
187 | $newKernel = array();
188 |
189 | foreach ($theseItems as $thisItem) {
190 | $newKernel[] = array(
191 | $thisItem->getRule()->getNumber(),
192 | $thisItem->getDotIndex() + 1,
193 | );
194 | }
195 |
196 | $num = $kernelSet->insert($newKernel);
197 |
198 | if ($automaton->hasState($num)) {
199 | // the state already exists
200 | $automaton->addTransition($state->getNumber(), $thisComponent, $num);
201 |
202 | // extract the connected items from the target state
203 | $nextState = $automaton->getState($num);
204 |
205 | foreach ($theseItems as $thisItem) {
206 | $thisItem->connect(
207 | $nextState->get(
208 | $thisItem->getRule()->getNumber(),
209 | $thisItem->getDotIndex() + 1
210 | )
211 | );
212 | }
213 | } else {
214 | // new state needs to be created
215 | $newState = new State($num, array_map(function (Item $i) {
216 | $new = new Item($i->getRule(), $i->getDotIndex() + 1);
217 |
218 | // connect the two items
219 | $i->connect($new);
220 |
221 | return $new;
222 | }, $theseItems));
223 |
224 | $automaton->addState($newState);
225 | $queue->enqueue($newState);
226 |
227 | $automaton->addTransition($state->getNumber(), $thisComponent, $num);
228 | }
229 | }
230 | }
231 |
232 | // pump all the lookahead tokens
233 | foreach ($pumpings as $pumping) {
234 | $pumping[0]->pumpAll($pumping[1]);
235 | }
236 |
237 | return $automaton;
238 | }
239 |
/**
 * Encodes the handle-finding FSA as a LR parse table.
 *
 * Transitions triggered by terminals become shift entries of the "action"
 * table (the destination state number), transitions triggered by
 * nonterminals become entries of the "goto" table, and every reduce item
 * adds a reduce entry (the negated rule number) for each of its lookahead
 * tokens. When a reduce entry collides with an existing entry, the
 * conflict is resolved according to the conflicts mode of the grammar.
 *
 * @param \Dissect\Parser\LALR1\Analysis\Automaton $automaton The automaton.
 * @param Grammar $grammar The grammar providing the conflicts mode,
 *                         operator information and rules.
 *
 * @throws ShiftReduceConflictException When a shift/reduce conflict cannot
 *                                      be resolved under the conflicts mode.
 * @throws ReduceReduceConflictException When a reduce/reduce conflict cannot
 *                                       be resolved under the conflicts mode.
 *
 * @return array A two-element array: the parse table (with the "action"
 *               and "goto" keys) and the list of resolved conflicts.
 */
protected function buildParseTable(Automaton $automaton, Grammar $grammar)
{
    $conflictsMode = $grammar->getConflictsMode();
    $conflicts = array();

    // [state][token] => true for entries that were turned into explicit
    // errors by a nonassociative operator
    $errors = array();

    // initialize the table
    $table = array(
        'action' => array(),
        'goto' => array(),
    );

    foreach ($automaton->getTransitionTable() as $num => $transitions) {
        foreach ($transitions as $trigger => $destination) {
            if (!$grammar->hasNonterminal($trigger)) {
                // terminal implies shift
                $table['action'][$num][$trigger] = $destination;
            } else {
                // nonterminal goes in the goto table
                $table['goto'][$num][$trigger] = $destination;
            }
        }
    }

    foreach ($automaton->getStates() as $num => $state) {
        if (!isset($table['action'][$num])) {
            $table['action'][$num] = array();
        }

        foreach ($state->getItems() as $item) {
            if (!$item->isReduceItem()) {
                continue;
            }

            $ruleNumber = $item->getRule()->getNumber();

            foreach ($item->getLookahead() as $token) {
                if (isset($errors[$num][$token])) {
                    // there was a previous conflict resolved as an error
                    // entry for this token.
                    continue;
                }

                if (!array_key_exists($token, $table['action'][$num])) {
                    // no conflict, simply store the reduction
                    $table['action'][$num][$token] = -$ruleNumber;

                    continue;
                }

                // conflict
                $instruction = $table['action'][$num][$token];

                if ($instruction > 0) {
                    // the existing entry is a shift
                    if (($conflictsMode & Grammar::OPERATORS) && $grammar->hasOperator($token)) {
                        $operatorInfo = $grammar->getOperatorInfo($token);
                        $rulePrecedence = $item->getRule()->getPrecedence();

                        // unless the rule has given precedence
                        if ($rulePrecedence === null) {
                            foreach (array_reverse($item->getRule()->getComponents()) as $c) {
                                // try to extract it from the rightmost terminal
                                if ($grammar->hasOperator($c)) {
                                    $ruleOperatorInfo = $grammar->getOperatorInfo($c);
                                    $rulePrecedence = $ruleOperatorInfo['prec'];

                                    break;
                                }
                            }
                        }

                        if ($rulePrecedence !== null) {
                            // if we actually have a rule precedence
                            $tokenPrecedence = $operatorInfo['prec'];

                            if ($rulePrecedence > $tokenPrecedence) {
                                // if the rule precedence is higher, reduce
                                $table['action'][$num][$token] = -$ruleNumber;
                            } elseif ($rulePrecedence === $tokenPrecedence
                                || !($rulePrecedence < $tokenPrecedence)
                            ) {
                                // precedences are equal, let's turn to associativity
                                $assoc = $operatorInfo['assoc'];

                                if ($assoc === Grammar::LEFT) {
                                    // if left-associative, reduce
                                    $table['action'][$num][$token] = -$ruleNumber;
                                } elseif ($assoc === Grammar::NONASSOC) {
                                    // the token is nonassociative.
                                    // this actually means an input error, so
                                    // remove the shift entry from the table
                                    // and mark this as an explicit error
                                    // entry
                                    unset($table['action'][$num][$token]);
                                    $errors[$num][$token] = true;
                                }

                                // if right-associative, shift
                                // (i.e. don't modify the table)
                            }

                            // if the token precedence is higher, shift
                            // (i.e. don't modify the table)

                            continue; // resolved the conflict, phew
                        }

                        // we couldn't calculate the precedence => the conflict
                        // was not resolved, move along.
                    }

                    // s/r
                    if ($conflictsMode & Grammar::SHIFT) {
                        $conflicts[] = array(
                            'state' => $num,
                            'lookahead' => $token,
                            'rule' => $item->getRule(),
                            'resolution' => Grammar::SHIFT,
                        );

                        continue;
                    }

                    throw new ShiftReduceConflictException(
                        $num,
                        $item->getRule(),
                        $token,
                        $automaton
                    );
                }

                // r/r
                $originalRule = $grammar->getRule(-$instruction);
                $newRule = $item->getRule();

                if ($conflictsMode & Grammar::LONGER_REDUCE) {
                    $count1 = count($originalRule->getComponents());
                    $count2 = count($newRule->getComponents());

                    if ($count1 > $count2) {
                        // original rule is longer, the table stays as it is
                        $conflicts[] = array(
                            'state' => $num,
                            'lookahead' => $token,
                            'rules' => array($originalRule, $newRule),
                            'resolution' => Grammar::LONGER_REDUCE,
                        );

                        continue;
                    } elseif ($count2 > $count1) {
                        // new rule is longer
                        $table['action'][$num][$token] = -$ruleNumber;

                        $conflicts[] = array(
                            'state' => $num,
                            'lookahead' => $token,
                            'rules' => array($newRule, $originalRule),
                            'resolution' => Grammar::LONGER_REDUCE,
                        );

                        continue;
                    }

                    // same length, fall through to the next strategy
                }

                if ($conflictsMode & Grammar::EARLIER_REDUCE) {
                    if (-$instruction < $ruleNumber) {
                        // original rule was earlier, the table stays as it is
                        $resolvedRules = array($originalRule, $newRule);
                    } else {
                        // new rule was earlier
                        $table['action'][$num][$token] = -$ruleNumber;

                        // the pair must be built before it is recorded so
                        // that the conflict lists the winning rule first
                        $resolvedRules = array($newRule, $originalRule);
                    }

                    $conflicts[] = array(
                        'state' => $num,
                        'lookahead' => $token,
                        'rules' => $resolvedRules,
                        'resolution' => Grammar::EARLIER_REDUCE,
                    );

                    continue;
                }

                // everything failed, throw an exception
                throw new ReduceReduceConflictException(
                    $num,
                    $originalRule,
                    $newRule,
                    $token,
                    $automaton
                );
            }
        }
    }

    return array($table, $conflicts);
}
457 |
/**
 * Computes the FIRST set of every nonterminal by fixed-point iteration.
 *
 * The sets start out empty and all rules are scanned repeatedly; the
 * iteration stops after the first full pass that adds nothing new.
 *
 * @param array $rules The rules grouped by the LHS.
 *
 * @return array The FIRST sets, keyed by nonterminal.
 */
protected function calculateFirstSets(array $rules)
{
    // every nonterminal starts out with an empty FIRST set
    $firstSets = array_fill_keys(array_keys($rules), array());

    $changed = true;

    while ($changed) {
        $changed = false;

        foreach ($rules as $lhs => $alternatives) {
            foreach ($alternatives as $rule) {
                $components = $rule->getComponents();
                $lastIndex = count($components) - 1;

                // an empty rule derives epsilon directly
                $candidate = empty($components)
                    ? array(Grammar::EPSILON)
                    : array();

                foreach ($components as $position => $symbol) {
                    if (!array_key_exists($symbol, $rules)) {
                        // a terminal is added as it is and ends the scan
                        $candidate = Util::union($candidate, array($symbol));

                        break;
                    }

                    // a nonterminal contributes its current FIRST set
                    $symbolFirst = $firstSets[$symbol];
                    $nullable = in_array(Grammar::EPSILON, $symbolFirst);

                    if ($nullable && $position < $lastIndex) {
                        // more components ahead: epsilon is only kept when
                        // the nullable component is the last one
                        unset($symbolFirst[array_search(Grammar::EPSILON, $symbolFirst)]);
                    }

                    $candidate = Util::union($candidate, $symbolFirst);

                    if (!$nullable) {
                        // the component doesn't derive epsilon, so nothing
                        // behind it can contribute
                        break;
                    }
                }

                if (Util::different($candidate, $firstSets[$lhs])) {
                    $firstSets[$lhs] = Util::union($firstSets[$lhs], $candidate);
                    $changed = true;
                }
            }
        }
    }

    return $firstSets;
}
530 | }
531 |
--------------------------------------------------------------------------------