├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── bin
└── .gitignore
├── build-doc.hxml
├── build-each.hxml
├── build-interp.hxml
├── build-js.hxml
├── build.hxml
├── haxelib.json
├── hxparse.hxproj
├── src
├── byte
│ └── ByteData.hx
└── hxparse
│ ├── LexEngine.hx
│ ├── Lexer.hx
│ ├── LexerTokenSource.hx
│ ├── NoMatch.hx
│ ├── Parser.hx
│ ├── ParserBuilder.hx
│ ├── ParserBuilderImpl.macro.hx
│ ├── ParserError.hx
│ ├── Position.hx
│ ├── RuleBuilder.hx
│ ├── Ruleset.hx
│ ├── State.hx
│ ├── TokenSource.hx
│ ├── Unexpected.hx
│ ├── UnexpectedChar.hx
│ ├── Utils.hx
│ └── debug
│ └── LexerGraph.hx
└── test
├── ArithmeticParser.hx
├── JSONParser.hx
├── PrintfParser.hx
├── Test.hx
└── UnicodeTestLexer.hx
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | /dump
3 | .vscode/
4 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: haxe
2 |
3 | before_install:
4 | - sudo apt-get update
5 | - sudo apt-get install mono-devel
6 |
7 | hxml:
8 | - build.hxml
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Simon Krajewski
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | hxparse
2 | =======
3 |
4 | [](https://travis-ci.org/Simn/hxparse)
5 |
6 | This library provides tools for creating lexers and parsers in Haxe.
7 |
8 | ### Installation
9 |
10 | Install the library via [haxelib](http://lib.haxe.org/p/hxparse)
11 | ```
12 | haxelib install hxparse
13 | ```
14 |
15 | ### Usage
16 |
17 | - Writing a Lexer: https://github.com/Simn/hxparse/wiki/Writing-a-Lexer
18 | - Writing a Parser: https://github.com/Simn/hxparse/wiki/Writing-a-Parser
19 | - API: http://simn.github.io/hxparse/hxparse/index.html
20 |
--------------------------------------------------------------------------------
/bin/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
/build-doc.hxml:
--------------------------------------------------------------------------------
1 | build-each.hxml
2 |
3 | -dce std
4 | -neko neko.n
5 | --no-output
6 | -xml bin/hxparse.xml
--------------------------------------------------------------------------------
/build-each.hxml:
--------------------------------------------------------------------------------
1 | -cp src
2 | -cp test
3 | -main Test
4 | -dce full
5 | -lib unifill
--------------------------------------------------------------------------------
/build-interp.hxml:
--------------------------------------------------------------------------------
1 | build-each.hxml
2 | --interp
3 | --times
--------------------------------------------------------------------------------
/build-js.hxml:
--------------------------------------------------------------------------------
1 | build-each.hxml
2 | -js bin/hxparse.js
--------------------------------------------------------------------------------
/build.hxml:
--------------------------------------------------------------------------------
1 | build-each.hxml
2 | -lib unifill
3 | --each
4 |
5 | --next
6 | -D dump=pretty
7 | -neko bin/hxparse.n
8 |
9 | --next
10 | -swf bin/hxparse.swf
11 |
12 | --next
13 | -swf-version 8
14 | -swf bin/hxparse8.swf
15 |
16 | --next
17 | -js bin/hxparse.js
18 |
19 | --next
20 | -php bin/php
21 |
22 | --next
23 | -cpp bin/cpp
24 |
25 | #--next
26 | #-java bin/java
27 |
28 | --next
29 | -cs bin/cs
30 |
31 | --next
32 | -python bin/hxparse.py
--------------------------------------------------------------------------------
/haxelib.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "hxparse",
3 | "url": "https://github.com/Simn/hxparse",
4 | "license": "MIT",
5 | "classPath": "src",
6 | "description": "This library provides tools for creating lexers and parsers in haxe.",
7 | "version": "4.3.0",
8 | "releasenote": "update",
9 | "contributors": ["Simn"]
10 | }
11 |
--------------------------------------------------------------------------------
/hxparse.hxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 | "$(CompilerPath)/haxe" build.hxml
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/src/byte/ByteData.hx:
--------------------------------------------------------------------------------
1 | package byte;
2 |
3 | abstract ByteData(haxe.io.Bytes) {
4 |
5 | public var length(get,never):Int;
6 | inline function get_length() return this.length;
7 |
8 | inline public function readByte(i:Int) return this.get(i);
9 |
10 | inline function new(data) {
11 | this = data;
12 | }
13 |
14 | inline static public function ofString(s:String):ByteData {
15 | return new ByteData(haxe.io.Bytes.ofString(s));
16 | }
17 |
18 | inline static public function ofBytes(b:haxe.io.Bytes):ByteData {
19 | return new ByteData(b);
20 | }
21 |
22 | inline public function readString(pos:Int, len:Int) {
23 | return this.getString(pos, len);
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/src/hxparse/LexEngine.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
3 | /**
4 | LexEngine handles pattern parsing and state transformation.
5 |
6 | This class is used by the `Lexer` and rarely has to be interacted with
7 | directly.
8 |
9 | The static `parse` method transforms a single `String` to a `Pattern`.
10 | Multiple patterns can then be passed to the constructor to generate the
11 | state machine, which is obtainable from the `firstState` method.
12 | **/
13 | class LexEngine {
14 |
15 | var uid : Int;
16 | var nodes : Array;
17 | var finals : Array;
18 | var states : Array;
19 | var hstates : Map;
20 |
21 | /**
22 | Creates a new LexEngine from `patterns`.
23 |
24 | Each LexEngine maintains a state machine, whose initial state can be
25 | obtained from the `firstState` method. After this, `this` LexEngine can
26 | be discarded.
27 |
28 | If `patterns` is null, the result is unspecified.
29 | **/
30 | public function new( patterns : Array ) {
31 | nodes = [];
32 | finals = [];
33 | states = [];
34 | hstates = new Map();
35 | uid = 0;
36 | var pid = 0;
37 | for ( p in patterns ) {
38 | var id = pid++;
39 | var f = node(id);
40 | var n = initNode(p, f,id);
41 | nodes.push(n);
42 | finals.push(f);
43 | }
44 | makeState(addNodes([], nodes));
45 | }
46 |
47 | /**
48 | Returns the entry state of the state machine generated by `this`
49 | LexEngine.
50 | **/
51 | public function firstState() {
52 | return states[0];
53 | }
54 |
55 | function makeState( nodes : Array ) {
56 | var buf = new StringBuf();
57 | for( n in nodes ) {
58 | buf.add(n.id);
59 | buf.addChar("-".code);
60 | }
61 | var key = buf.toString();
62 | var s = hstates.get(key);
63 | if( s != null )
64 | return s;
65 |
66 | s = new State();
67 | states.push(s);
68 | hstates.set(key, s);
69 |
70 | var trans = getTransitions(nodes);
71 |
72 | for ( t in trans ) {
73 | var target = makeState(t.n);
74 | for (chr in t.chars) {
75 | for (i in chr.min...(chr.max + 1)) {
76 | s.trans.set(i, target);
77 | }
78 | }
79 | }
80 |
81 | function setFinal() {
82 | for( f in finals )
83 | for( n in nodes )
84 | if( n == f ) {
85 | s.finalId = n.pid;
86 | return;
87 | }
88 | }
89 | if (s.finalId == -1)
90 | setFinal();
91 | return s;
92 | }
93 |
94 | function getTransitions( nodes : Array ) {
95 | var tl = [];
96 | for( n in nodes )
97 | for( t in n.trans )
98 | tl.push(t);
99 |
100 | // Merge transition with the same target
101 | tl.sort(function(t1, t2) return t1.n.id - t2.n.id);
102 | var t0 = tl[0];
103 | for( i in 1...tl.length ) {
104 | var t1 = tl[i];
105 | if( t0.n == t1.n ) {
106 | tl[i - 1] = null;
107 | t1 = { chars : cunion(t0.chars, t1.chars), n : t1.n };
108 | tl[i] = t1;
109 | }
110 | t0 = t1;
111 | }
112 | while( tl.remove(null) ) {
113 | }
114 |
115 | // Split char sets to make them disjoint
116 | var allChars = EMPTY;
117 | var allStates = new List<{ chars : Charset, n : Array }>();
118 | for( t in tl ) {
119 | var states = new List();
120 | states.push( { chars : cdiff(t.chars, allChars), n : [t.n] } );
121 | for( s in allStates ) {
122 | var nodes = s.n.copy();
123 | nodes.push(t.n);
124 | states.push( { chars : cinter(s.chars,t.chars), n : nodes } );
125 | states.push( { chars : cdiff(s.chars, t.chars), n : s.n } );
126 | }
127 | for( s in states )
128 | if( s.chars.length == 0 )
129 | states.remove(s);
130 | allChars = cunion(allChars, t.chars);
131 | allStates = states;
132 | }
133 |
134 | // Epsilon closure of targets
135 | var states = [];
136 | for( s in allStates )
137 | states.push({ chars : s.chars, n : addNodes([], s.n) });
138 |
139 | // Canonical ordering
140 | states.sort(function(s1, s2) {
141 | var a = s1.chars.length;
142 | var b = s2.chars.length;
143 | for( i in 0...(a < b?a:b) ) {
144 | var a = s1.chars[i];
145 | var b = s2.chars[i];
146 | if( a.min != b.min )
147 | return b.min - a.min;
148 | if( a.max != b.max )
149 | return b.max - a.max;
150 | }
151 | if( a < b )
152 | return b - a;
153 | return 0;
154 | });
155 | return states;
156 | }
157 |
158 | function addNode( nodes : Array, n : Node ) {
159 | for( n2 in nodes )
160 | if( n == n2 )
161 | return;
162 | nodes.push(n);
163 | addNodes(nodes, n.epsilon);
164 | }
165 |
166 | function addNodes( nodes : Array, add : Array ) {
167 | for( n in add )
168 | addNode(nodes, n);
169 | return nodes;
170 | }
171 |
172 | inline function node(pid) {
173 | return new Node(uid++, pid);
174 | }
175 |
176 | function initNode( p : Pattern, finalId : Node, pid : Int ) {
177 | return switch( p ) {
178 | case Empty:
179 | finalId;
180 | case Match(c):
181 | var n = node(pid);
182 | n.trans.push({ chars : c, n : finalId });
183 | n;
184 | case Star(p):
185 | var n = node(pid);
186 | var an = initNode(p,n,pid);
187 | n.epsilon.push(an);
188 | n.epsilon.push(finalId);
189 | n;
190 | case Plus(p):
191 | var n = node(pid);
192 | var an = initNode(p,n,pid);
193 | n.epsilon.push(an);
194 | n.epsilon.push(finalId);
195 | an;
196 | case Next(a,b):
197 | initNode(a, initNode(b, finalId,pid),pid);
198 | case Choice(a,b):
199 | var n = node(pid);
200 | n.epsilon.push(initNode(a,finalId,pid));
201 | n.epsilon.push(initNode(b,finalId,pid));
202 | n;
203 | case Group(p):
204 | initNode(p, finalId, pid);
205 | }
206 | }
207 |
208 | // ----------------------- PATTERN PARSING ---------------------------
209 |
210 | static inline var MAX_CODE = 255;
211 | static var EMPTY:Charset = [];
212 | static var ALL_CHARS = [ new CharRange( 0, MAX_CODE ) ];
213 |
214 | static inline function single( c : Int ) : Charset {
215 | return [ { min : c, max : c } ];
216 | }
217 |
218 | /**
219 | Parses the `pattern` `String` and returns an instance of `Pattern`.
220 |
221 | If `pattern` is not a valid pattern string, an exception of `String` is
222 | thrown.
223 |
224 | The following meta characters are supported:
225 |
226 | - `*`: zero or more
227 | - `+`: one or more
228 | - `?`: zero or one
229 | - `|`: or
230 | - `[`: begin char range
231 | - `]`: end char range
232 | - `(`: begin group
233 | - `)`: end group
234 | - `\`: escape next char
235 |
236 | These characters must be escaped if they are part of the pattern, by
237 | using `\\*`, `\\]` etc.
238 | **/
239 | public static function parse( pattern : String ) : Pattern {
240 | var p = parseInner(byte.ByteData.ofString(pattern));
241 | if( p == null ) throw "Invalid pattern '" + pattern + "'";
242 | return p.pattern;
243 | }
244 |
245 | static function next( a, b ) {
246 | return a == Empty ? b : Next(a, b);
247 | }
248 |
249 | static function plus(r) {
250 | return switch( r ) {
251 | case Next(r1, r2): Next(r1, plus(r2));
252 | default: Plus(r);
253 | }
254 | }
255 |
256 | static function star(r) {
257 | return switch( r ) {
258 | case Next(r1, r2): Next(r1, star(r2));
259 | default: Star(r);
260 | }
261 | }
262 |
263 | static function opt(r) {
264 | return switch( r ) {
265 | case Next(r1, r2): Next(r1, opt(r2));
266 | default: Choice(r, Empty);
267 | }
268 | }
269 |
270 | static function cinter(c1,c2) {
271 | return ccomplement(cunion(ccomplement(c1), ccomplement(c2)));
272 | }
273 |
274 | static function cdiff(c1,c2) {
275 | return ccomplement(cunion(ccomplement(c1), c2));
276 | }
277 |
278 | static function ccomplement( c : Charset ) {
279 | var first = c[0];
280 | var start = first != null && first.min == -1 ? c.shift().max + 1 : -1;
281 | var out: Charset = [];
282 | for( k in c ) {
283 | out.push( { min : start, max : k.min - 1 } );
284 | start = k.max + 1;
285 | }
286 | if( start <= MAX_CODE )
287 | out.push( { min : start, max : MAX_CODE } );
288 | return out;
289 | }
290 |
291 | static function cunion( ca : Charset, cb : Charset ) {
292 | var i = 0, j = 0;
293 | var out = [];
294 | var a = ca[i++], b = cb[j++];
295 | while( true ) {
296 | if( a == null ) {
297 | out.push(b);
298 | while( j < cb.length )
299 | out.push(cb[j++]);
300 | break;
301 | }
302 | if( b == null ) {
303 | out.push(a);
304 | while( i < ca.length )
305 | out.push(ca[i++]);
306 | break;
307 | }
308 | if( a.min <= b.min ) {
309 | if( a.max + 1 < b.min ) {
310 | out.push(a);
311 | a = ca[i++];
312 | } else if( a.max < b.max ) {
313 | b = { min : a.min, max : b.max };
314 | a = ca[i++];
315 | } else
316 | b = cb[j++];
317 | } else {
318 | // swap
319 | var tmp = ca;
320 | ca = cb;
321 | cb = tmp;
322 | var tmp = j;
323 | j = i;
324 | i = tmp;
325 | var tmp = a;
326 | a = b;
327 | b = tmp;
328 | }
329 | }
330 | return out;
331 | }
332 |
333 | static function parseInner( pattern : byte.ByteData, i : Int = 0, pDepth : Int = 0 ) : { pattern: Pattern, pos: Int } {
334 | function readChar() {
335 | var c = pattern.readByte(i++);
336 | if ( StringTools.isEof(c) ) {
337 | c = '\\'.code;
338 | } else if (c == "x".code) {
339 | c = Std.parseInt("0x" + pattern.readString(i, 2));
340 | i += 2;
341 | } else if (c >= "0".code && c <= "9".code) {
342 | var v = c - 48;
343 | while(true) {
344 | var cNext = pattern.readByte(i);
345 | if (cNext >= "0".code && cNext <= "9".code) {
346 | v = v * 10 + (cNext - 48);
347 | ++i;
348 | } else {
349 | break;
350 | }
351 | }
352 | c = v;
353 | }
354 | return c;
355 | }
356 |
357 | var r = Empty;
358 | var l = pattern.length;
359 | while( i < l ) {
360 | var c = pattern.readByte(i++);
361 | if (c > 255) throw c;
362 | switch( c ) {
363 | case '+'.code if (r != Empty):
364 | r = plus(r);
365 | case '*'.code if (r != Empty):
366 | r = star(r);
367 | case '?'.code if (r != Empty):
368 | r = opt(r);
369 | case '|'.code if (r != Empty):
370 | var r2 = parseInner(pattern, i);
371 | return {pattern: Choice(r, r2.pattern), pos: r2.pos};
372 | case '.'.code:
373 | r = next(r, Match(ALL_CHARS));
374 | case '('.code:
375 | var r2 = parseInner(pattern, i, pDepth + 1);
376 | i = r2.pos;
377 | r = next(r, r2.pattern);
378 | case ')'.code:
379 | if (r == Empty) throw "Empty group";
380 | return { pattern: Group(r), pos: i};
381 | case '['.code if (pattern.length > 1):
382 | var range = 0;
383 | var acc:Charset = [];
384 | var not = pattern.readByte(i) == '^'.code;
385 | if( not ) i++;
386 | while( true ) {
387 | var c = pattern.readByte(i++);
388 | if( c == ']'.code ) {
389 | if( range != 0 ) return null;
390 | break;
391 | } else if( c == '-'.code ) {
392 | if( range != 0 ) return null;
393 | var last = acc.pop();
394 | if( last == null )
395 | acc.push( { min : c, max : c } );
396 | else {
397 | if( last.min != last.max ) return null;
398 | range = last.min;
399 | }
400 | } else {
401 | if( c == '\\'.code ) {
402 | c = readChar();
403 | }
404 | if( range == 0 )
405 | acc.push( { min : c, max : c } );
406 | else {
407 | acc.push( { min : range, max : c } );
408 | range = 0;
409 | }
410 | }
411 | }
412 | var g:Charset = [];
413 | for( k in acc )
414 | g = cunion(g, [k]);
415 | if( not )
416 | g = cdiff(ALL_CHARS, g);
417 | r = next(r, Match(g));
418 | case '\\'.code:
419 | c = readChar();
420 | r = next(r, Match(single(c)));
421 | default:
422 | r = next(r, Match(single(c)));
423 | }
424 | }
425 | if (pDepth != 0) throw 'Found unclosed parenthesis while parsing "$pattern"';
426 | return {pattern:r, pos: i};
427 | }
428 | }
429 |
430 | private enum Pattern {
431 | Empty;
432 | Match( c : Charset );
433 | Star( p : Pattern );
434 | Plus( p : Pattern );
435 | Next( p1 : Pattern, p2 : Pattern );
436 | Choice( p1 : Pattern, p2 : Pattern );
437 | Group ( p : Pattern );
438 | }
439 |
440 | @:structInit private class CharRange {
441 | public var min:Int;
442 | public var max:Int;
443 | public function new(min,max) {
444 | this.min = min;
445 | this.max = max;
446 | }
447 | }
448 | private typedef Charset = Array;
449 |
450 | private class Node {
451 | public var id : Int;
452 | public var pid : Int;
453 | public var trans : Array<{ chars : Charset, n : Node }>;
454 | public var epsilon : Array;
455 | public function new(id, pid) {
456 | this.id = id;
457 | this.pid = pid;
458 | trans = [];
459 | epsilon = [];
460 | }
461 | }
462 |
463 | private class Transition {
464 | public var chars : Charset;
465 | public function new(chars) {
466 | this.chars = chars;
467 | }
468 | public function toString() {
469 | return Std.string(chars);
470 | }
471 | }
472 |
--------------------------------------------------------------------------------
/src/hxparse/Lexer.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
3 | /**
4 | Lexer matches a sequence of characters against a set of rule patterns.
5 |
6 | An instance of Lexer is created once for each input and maintains state
7 | for that input. Tokens can then be obtained by calling the `token` method,
8 | passing an instance of `Ruleset`.
9 |
10 | Rule sets can be created manually, or by calling the static `buildRuleset`
11 | method.
12 | **/
13 | class Lexer {
14 |
15 | /**
16 | The `String` that was matched by the most recent invocation of the
17 | `token` method.
18 | **/
19 | public var current(default, null):String;
20 |
21 | var input:byte.ByteData;
22 | var source:String;
23 | var pos:Int;
24 |
25 | /**
26 | Creates a new Lexer for `input`.
27 |
28 | If `sourceName` is provided, it is used in error messages to denote
29 | the position of an error.
30 |
31 | If `input` is null, the result is unspecified.
32 | **/
33 | public function new(input:byte.ByteData, sourceName:String = "") {
34 | current = "";
35 | this.input = input;
36 | source = sourceName;
37 | pos = 0;
38 | }
39 |
40 | /**
41 | Returns the current position of `this` Lexer.
42 | **/
43 | public inline function curPos():Position {
44 | return new Position(source, pos - current.length, pos);
45 | }
46 |
47 | /**
48 | Returns the next token according to `ruleset`.
49 |
50 | This method starts with `ruleset.state` and reads characters from `this`
51 | input until no further state transitions are possible. It always returns
52 | the longest match.
53 |
54 | If a character is read which has no transition defined, an
55 | `UnexpectedChar` exception is thrown.
56 |
57 | If the input is in the end of file state upon method invocation,
58 | `ruleset.eofFunction` is called with `this` Lexer as argument. If
59 | `ruleset` defines no `eofFunction` field, a `haxe.io.Eof` exception
60 | is thrown.
61 |
62 | If `ruleset` is null, the result is unspecified.
63 | **/
64 | public function token(ruleset:Ruleset):T {
65 | if (pos == input.length) {
66 | if (ruleset.eofFunction != null) return ruleset.eofFunction(this);
67 | else throw new haxe.io.Eof();
68 | }
69 | var state = ruleset.state;
70 | var lastMatch = null;
71 | var lastMatchPos = pos;
72 | var start = pos;
73 |
74 | #if expose_lexer_state
75 | stateCallback(state, pos, -1);
76 | #end
77 |
78 | while(true) {
79 | if (state.finalId > -1) {
80 | lastMatch = state;
81 | lastMatchPos = pos;
82 | }
83 | if (pos == input.length) {
84 | break;
85 | }
86 | var i = input.readByte(pos);
87 | ++pos;
88 | state = state.trans.get(i);
89 |
90 | #if expose_lexer_state
91 | stateCallback(state, pos-1, i);
92 | #end
93 |
94 | if (state == null)
95 | break;
96 | }
97 | pos = lastMatchPos;
98 | current = input.readString(start, pos - start);
99 | if (lastMatch == null || lastMatch.finalId == -1)
100 | throw new UnexpectedChar(String.fromCharCode(input.readByte(pos)), curPos());
101 | return ruleset.functions[lastMatch.finalId](this);
102 | }
103 |
104 | #if expose_lexer_state
105 | /**
106 |
107 | @param state `null` if it's the last state visited
108 | @param position Position of the byte read
109 | @param input Transition input byte, `-1` if initial state
110 | **/
111 | dynamic public function stateCallback(state:State, position:Int, input:Int) {}
112 | #end
113 |
114 | /**
115 | Builds a `Ruleset` from the given `rules` `Array`.
116 |
117 | For each element of `rules`, its `rule` `String` is parsed into a
118 | `Pattern` using `LexEngine.parse`.
119 |
120 | If `rules` is null, the result is unspecified.
121 | **/
122 | static public function buildRuleset(rules:Array<{rule:String,func:Lexer->Token}>, name:String = "") {
123 | var cases = [];
124 | var functions = [];
125 | var eofFunction = null;
126 | for (rule in rules) {
127 | if (rule.rule == "") {
128 | eofFunction = rule.func;
129 | } else {
130 | cases.push(LexEngine.parse(rule.rule));
131 | functions.push(rule.func);
132 | }
133 | }
134 | return new Ruleset(new LexEngine(cases).firstState(), functions, eofFunction, name);
135 | }
136 | }
--------------------------------------------------------------------------------
/src/hxparse/LexerTokenSource.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
3 | class LexerTokenSource {
4 | var lexer:Lexer;
5 | public var ruleset:Ruleset;
6 |
7 | public function new(lexer, ruleset){
8 | this.lexer = lexer;
9 | this.ruleset = ruleset;
10 | }
11 |
12 | public function token():Token{
13 | return lexer.token(ruleset);
14 | }
15 |
16 | public function curPos():Position{
17 | return lexer.curPos();
18 | }
19 | }
--------------------------------------------------------------------------------
/src/hxparse/NoMatch.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
3 | /**
4 | A NoMatch exception is thrown if an outer token matching fails.
5 |
6 | Matching can continue because no tokens have been consumed.
7 | **/
8 | class NoMatch extends ParserError {
9 |
10 | /**
11 | The token which was encountered and could not be matched.
12 | **/
13 | public var token(default, null):T;
14 |
15 | /**
16 | Creates a new NoMatch exception.
17 | **/
18 | public function new(pos:hxparse.Position, token:T) {
19 | super(pos);
20 | this.token = token;
21 | }
22 |
23 | override public function toString() {
24 | return 'No match: $token';
25 | }
26 | }
--------------------------------------------------------------------------------
/src/hxparse/Parser.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
3 | /**
4 | Parser is the base class for all custom parsers.
5 |
6 | The intended usage is to extend it and utilize its method as an API where
7 | required.
8 | */
9 | @:generic
10 | class Parser, Token> {
11 |
12 | /**
13 | Returns the last matched token.
14 |
15 | This is a convenience property for accessing `cache[offset - 1]`.
16 | **/
17 | public var last(default, null):Token;
18 |
19 | var stream:S;
20 | var token:haxe.ds.GenericStack.GenericCell;
21 |
22 | /**
23 | Creates a new Parser instance over `TokenSource` `stream`
24 | **/
25 | public function new(stream:S) {
26 | this.stream = stream;
27 | }
28 |
29 | /**
30 | Returns the `n`th token without consuming it.
31 | **/
32 | @:dox(show)
33 | #if cs inline #end // Workaround for https://github.com/HaxeFoundation/haxe/issues/3212
34 | function peek(n:Int):Token {
35 | if (token == null) {
36 | token = new haxe.ds.GenericStack.GenericCell(stream.token(), null);
37 | n--;
38 | }
39 | var tok = token;
40 | while (n > 0) {
41 | if (tok.next == null) tok.next = new haxe.ds.GenericStack.GenericCell(stream.token(), null);
42 | tok = tok.next;
43 | n--;
44 | }
45 | return tok.elt;
46 | }
47 |
48 | /**
49 | Consumes the current token.
50 |
51 | This method is automatically called after a successful match.
52 | **/
53 | @:dox(show)
54 | inline function junk() {
55 | last = token.elt;
56 | token = token.next;
57 | }
58 |
59 | /**
60 | Returns the current lexer position.
61 | **/
62 | @:dox(show)
63 | public inline function curPos() {
64 | return stream.curPos();
65 | }
66 |
67 | /**
68 | Invokes `f` and then `separatorFunc` with the current token until the
69 | result of that call is `false`.
70 |
71 | The result is an Array containing the results of all calls to `f`.
72 |
73 | A typical use case is parsing function arguments which are separated by
74 | a comma.
75 | **/
76 | @:dox(show)
77 | function parseSeparated(separatorFunc:Token->Bool, f:Void->T):Array {
78 | var acc = [];
79 | while(true) {
80 | try {
81 | acc.push(f());
82 | } catch(e:hxparse.NoMatch) {
83 | break;
84 | }
85 | if (separatorFunc(peek(0))) {
86 | junk();
87 | } else {
88 | break;
89 | }
90 | }
91 | return acc;
92 | }
93 |
94 | /**
95 | Returns the result of calling `f()` if a match is made, or `null`
96 | otherwise.
97 | **/
98 | @:dox(show)
99 | function parseOptional(f:Void->T) {
100 | try {
101 | return f();
102 | } catch(e:hxparse.NoMatch) {
103 | return null;
104 | }
105 | }
106 |
107 | /**
108 | Calls `f` until no match can be made.
109 |
110 | The result is an Array containing the results of all calls to `f`.
111 | **/
112 | @:dox(show)
113 | function parseRepeat(f:Void->T) {
114 | var acc = [];
115 | while(true) {
116 | try {
117 | acc.push(f());
118 | } catch(e:hxparse.NoMatch) {
119 | return acc;
120 | }
121 | }
122 | }
123 |
124 | /**
125 | Returns the result of calling `f()` if a match is made, or throw
126 | `Unexpected` otherwise.
127 | **/
128 | function parseExpect(f:Void->T) {
129 | try {
130 | return f();
131 | } catch(_:NoMatch) {
132 | unexpected();
133 | }
134 | }
135 |
136 | /**
137 | Throws `NoMatch` exception, which contains last matched position and token.
138 | **/
139 | inline function noMatch() {
140 | return new NoMatch(stream.curPos(), peek(0));
141 | }
142 |
143 | /**
144 | Throws `Unexpected` exception, which contains last matched position and token.
145 | **/
146 | inline function unexpected():Dynamic {
147 | throw new Unexpected(peek(0), stream.curPos());
148 | }
149 |
150 | /**
151 | Macro that processes and returns the result of `switch`.
152 | **/
153 | @:access(hxparse.ParserBuilderImpl.transformSwitch)
154 | static public macro function parse(e:haxe.macro.Expr) {
155 | switch (e.expr) {
156 | case ESwitch(_, cases, edef) | EParenthesis({expr: ESwitch(_, cases, edef)}):
157 | return hxparse.ParserBuilderImpl.transformSwitch(cases, edef);
158 | case _:
159 | return haxe.macro.Context.error("Expected switch expression", e.pos);
160 | }
161 | }
162 | }
163 |
--------------------------------------------------------------------------------
/src/hxparse/ParserBuilder.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
3 | @:autoBuild(hxparse.ParserBuilderImpl.build())
4 | interface ParserBuilder { }
--------------------------------------------------------------------------------
/src/hxparse/ParserBuilderImpl.macro.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
3 | import haxe.macro.Context;
4 | import haxe.macro.Expr;
5 |
6 | using haxe.macro.Tools;
7 | using Lambda;
8 |
9 | private typedef ParserCase = {
10 | expr: Expr,
11 | head: Expr,
12 | tail: Array
13 | }
14 |
15 | private enum CaseGroup {
16 | Simple(group:Array);
17 | Complex(c:ParserCase);
18 | }
19 |
20 | class ParserBuilderImpl {
21 | static public function build():Array {
22 | var fields = Context.getBuildFields();
23 | for (field in fields) {
24 | switch(field.kind) {
25 | case FFun(fun) if (fun.expr != null):
26 | fun.expr = map(fun.expr);
27 | case _:
28 | }
29 | }
30 | return fields;
31 | }
32 |
33 | static function punion(p1:Position, p2:Position) {
34 | var p1 = Context.getPosInfos(p1);
35 | var p2 = Context.getPosInfos(p2);
36 | return Context.makePosition({
37 | file: p1.file,
38 | min: p1.min < p2.min ? p1.min : p2.min,
39 | max: p1.max > p2.max ? p1.max : p2.max
40 | });
41 | }
42 |
43 | static function map(e:Expr) {
44 | return switch(e.expr) {
45 | case ESwitch({expr: EConst(CIdent("stream"))}, cl, edef):
46 | transformSwitch(cl, edef);
47 | case EBlock([]):
48 | e;
49 | case EBlock(el):
50 | var elast = el.pop();
51 | var el = el.map(map);
52 | el.push(map(elast));
53 | macro @:pos(e.pos) $b{el};
54 | case _: e.map(map);
55 | }
56 | }
57 |
58 | static function transformSwitch(cl:Array, edef:Null) {
59 | if (edef != null)
60 | cl.push({values: [macro _], expr: edef, guard: null});
61 | return transformCases(cl);
62 | }
63 |
64 | static function transformCases(cl:Array) {
65 | var groups = [];
66 | var group = [];
67 | var def = noMatch;
68 | for (c in cl) {
69 | switch(c.values) {
70 | case [{expr:EArrayDecl(el)}]:
71 | var head = el.shift();
72 | var chead = {head:head, tail: el, expr:c.expr == null ? macro null : map(c.expr)};
73 | switch(head.expr) {
74 | case EBinop(_):
75 | if (group.length > 0) groups.push(Simple(group));
76 | groups.push(Complex(chead));
77 | group = [];
78 | case _:
79 | group.push(chead);
80 | }
81 | case [{expr:EConst(CIdent("_"))}]:
82 | def = c.expr == null ? macro null : map(c.expr);
83 | case [e]:
84 | Context.error("Expected [ patterns ]", e.pos);
85 | case _:
86 | Context.error("Comma notation is not allowed while matching streams", punion(c.values[0].pos, c.values[c.values.length - 1].pos));
87 | }
88 | }
89 | if (group.length > 0)
90 | groups.push(Simple(group));
91 |
92 | var last = groups.pop();
93 | var elast = makeCase(last,def);
94 | while (groups.length > 0) {
95 | elast = makeCase(groups.pop(), elast);
96 | }
97 | return elast;
98 | }
99 |
100 | static var unexpected = macro unexpected();
101 | static var noMatch = macro throw noMatch();
102 |
/**
	Compiles a single case group `g` into an expression, using `def` as
	the fallback expression when nothing in the group matches.
**/
static function makeCase(g:CaseGroup, def:Expr) {
	return switch(g) {
		case Simple(group):
			var cl = group.map(makeInner);
			// A successful head match consumes the peeked token.
			cl.iter(function(c) {
				c.expr = macro @:pos(c.expr.pos) { junk(); ${c.expr}; };
			});
			{
				pos: def.pos,
				expr: ESwitch(macro peek(0), cl, def)
			}
		case Complex(c):
			var inner = makeInner(c);
			makePattern(c.head, inner.expr, def);
	}
}
119 |
/**
	Builds the case body for `c`: the tail patterns are chained from last
	to first, each falling back to `unexpected`, because once the head
	pattern has matched, a failing tail match is irrecoverable.
**/
static function makeInner(c:ParserCase) {
	var last = c.tail.pop();
	if (last == null) {
		// No tail patterns: a plain single-token case.
		return {values:[c.head], guard:null, expr: c.expr};
	}
	var elast = makePattern(last, c.expr, unexpected);
	while (c.tail.length > 0)
		elast = makePattern(c.tail.pop(), elast, unexpected);
	return {values: [c.head], guard: null, expr: elast};
}
130 |
/**
	Compiles one stream pattern `pat`, with `e` as the expression to
	evaluate on success and `def` on failure.

	Supported pattern forms:
	- `ident = expr`: binds `expr` to `ident` (plain evaluation when the
	  failure branch is one of the shared sentinels, otherwise wrapped in
	  a `NoMatch`-catching extractor via `buildExtractor`);
	- `pattern && guard`: matches `peek(0)` with a guard condition;
	- `pattern || expr`: matches with a custom error message expression;
	- anything else: a plain match against `peek(0)`.
**/
static function makePattern(pat:Expr, e:Expr, def:Expr) {
	return switch(pat.expr) {
		case EBinop(OpAssign, {expr: EConst(CIdent(s))}, e2):
			if (def == unexpected || def == noMatch) {
				var e1 = s == "_" ? e2 : macro var $s = $e2;
				macro {
					$e1;
					$e;
				}
			} else {
				buildExtractor(pat, e, e2, s, def);
			}
		case EBinop(OpBoolAnd, e1, e2):
			macro @:pos(pat.pos) {
				switch peek(0) {
					case $e1 if ($e2):
						junk();
						$e;
					case _: $def;
				}
			}
		case EBinop(OpBoolOr, e1, e2):
			makePattern(e1, e, macro throw stream.curPos() + ": " +$e2);
		case _:
			macro @:pos(pat.pos) switch peek(0) {
				case $pat:
					junk();
					$e;
				case _: $def;
			}
	}
}
163 |
/**
	Builds a binding `s = e2` whose evaluation may itself fail with
	`hxparse.NoMatch`; in that case control transfers to `def` instead of
	propagating the exception.
**/
static function buildExtractor(pat, e, e2, s, def) {
	var e1 = s == "_" ? e2 : macro var $s = $e2;
	return macro @:pos(pat.pos) {
		try {
			$e1;
			$e;
		} catch (_:hxparse.NoMatch) {
			$def;
		}
	}
}
175 | }
--------------------------------------------------------------------------------
/src/hxparse/ParserError.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
/**
	Base class of all exceptions raised by hxparse parsers and lexers.
**/
class ParserError {
	/**
		The position in the input at which the error occurred.
	**/
	public var pos(default, null):Position;

	/**
		Creates a new ParserError located at `pos`.
	**/
	public function new(pos:Position) {
		this.pos = pos;
	}

	/**
		Returns a human-readable description of `this` error.
	**/
	public function toString():String {
		return "Parser error";
	}
}
--------------------------------------------------------------------------------
/src/hxparse/Position.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
/**
	The position information maintained by `Lexer`.
**/
class Position {
	/**
		Name of the source.
	**/
	public var psource : String;

	/**
		The first character position, counting from the beginning of the input.
	**/
	public var pmin : Int;

	/**
		The last character position, counting from the beginning of the input.
	**/
	public var pmax : Int;

	/**
		Creates a new `Position` from the given information.
	**/
	public function new(source, min, max) {
		psource = source;
		pmin = min;
		pmax = max;
	}

	/**
		Returns a readable representation of `this` position.
	**/
	public function toString() {
		return '$psource:characters $pmin-$pmax';
	}

	/**
		Resolves `this` position within `input` to line numbers and
		per-line character offsets (lines counted from 1, characters
		counted from the start of their line).
	**/
	public function getLinePosition(input:byte.ByteData) {
		var lineMin = 1;
		var lineMax = 1;
		var posMin = 0;
		var posMax = 0;
		var cur = 0;
		// Scan up to pmin, tracking the line number and the offset at
		// which the line containing pmin starts.
		while (cur < pmin) {
			if (input.readByte(cur) == "\n".code) {
				lineMin++;
				posMin = cur + 1;
			}
			cur++;
		}
		lineMax = lineMin;
		posMax = posMin;
		// Turn the line-start offset into a character offset within the line.
		posMin = cur - posMin;
		// Continue scanning up to pmax the same way.
		while (cur < pmax) {
			if (input.readByte(cur) == "\n".code) {
				lineMax++;
				posMax = cur + 1;
			}
			cur++;
		}
		posMax = cur - posMax;
		return {
			lineMin: lineMin,
			lineMax: lineMax,
			posMin: posMin,
			posMax: posMax
		}
	}

	/**
		Formats `this` position by resolving line numbers within `input`.

		If `input` is null, the result is unspecified.
	**/
	public function format(input:byte.ByteData) {
		var linePos = getLinePosition(input);
		if (linePos.lineMin != linePos.lineMax) {
			return '${psource}:lines ${linePos.lineMin}-${linePos.lineMax}';
		} else {
			return '${psource}:${linePos.lineMin}: characters ${linePos.posMin}-${linePos.posMax}';
		}
	}

	/**
		Unifies two positions `p1` and `p2`, using the minimum `pmin` and
		maximum `pmax` of both.

		The resulting `psource` and `pline` are taken from `p1`.

		If `p1` or `p2` are null, the result is unspecified.
	**/
	static public function union(p1:Position, p2:Position) {
		return new Position(p1.psource, p1.pmin < p2.pmin ? p1.pmin : p2.pmin, p1.pmax > p2.pmax ? p1.pmax : p2.pmax);
	}
}
96 |
// Line/character resolution result, as produced by `Position.getLinePosition`.
private typedef Position2 = {
	lineMin: Int,
	lineMax: Int,
	posMin: Int,
	posMax: Int
}
103 |
--------------------------------------------------------------------------------
/src/hxparse/RuleBuilder.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
3 | import haxe.macro.Context;
4 | import haxe.macro.Expr;
5 |
6 | using Lambda;
7 | using haxe.macro.Tools;
8 |
/**
	The RuleBuilder interfaces provides syntactic shortcuts for writing lexer
	rules.

	Implementing this interface triggers the `RuleBuilderImpl.build` macro,
	which expands static `@:rule` and `@:mapping` fields of the class.
**/
#if !macro
@:autoBuild(hxparse.RuleBuilderImpl.build())
#end
interface RuleBuilder { }
17 |
class RuleBuilderImpl {
	/**
		Build macro entry point: expands static `@:rule` and `@:mapping`
		fields of the implementing class, drops `@:ruleHelper` fields from
		the output, and appends a generated `generatedRulesets` field
		collecting all rulesets.
	**/
	macro static public function build():Array {
		var fields = Context.getBuildFields();
		var fieldExprs = new Map();
		var delays = [];
		var ret = [];
		var rules = [];
		for (field in fields) {
			if (field.access.exists(function(a) return a == AStatic))
				switch(field.kind) {
					case FVar(t, e) if (e != null):
						switch(e.expr) {
							case EMeta({name: ":rule"}, e):
								rules.push(field.name);
								// Transformation is delayed so that all helper
								// field expressions are collected first.
								delays.push(transformRule.bind(field, e, t, fieldExprs));
							case EMeta({name: ":mapping", params: args}, e):
								var offset = switch(args) {
									case [{expr: EConst(CInt(i))}]: Std.parseInt(i);
									case _: 0;
								}
								delays.push(transformMapping.bind(field, e, offset));
							case _:
								fieldExprs.set(field.name, e);
						}
					case _:
				}
			if (!field.meta.exists(function(m) return m.name == ":ruleHelper")) {
				ret.push(field);
			}
		}
		for (delay in delays)
			delay();
		var ruleIdents = [for (rv in rules) macro $i{rv}];
		// Synthesize: public static var generatedRulesets:Array<hxparse.Ruleset<Dynamic>>
		ret.push( {
			name: "generatedRulesets",
			access: [APublic, AStatic],
			kind: FVar(TPath({
				name: "Array",
				pack: [],
				params: [TPType(TPath({
					name: "Ruleset",
					pack: ["hxparse"],
					params: [TPType(TPath( {
						name: "Dynamic",
						pack: []
					}))]
				}))]
			}), macro $a{ruleIdents}),
			pos: Context.currentPos()
		});
		return ret;
	}

	#if macro

	#if unifill

	/**
		Rewrites character classes in rule string `s` so that multi-byte
		code points work: each `[...]` group becomes an alternation of
		parenthesized code points. Not-ranges (`[^...]`) and ranges over
		non-ASCII code points are rejected with an error. `p` is the
		position of the rule, used to report errors at sub-positions.
	**/
	static function handleUnicode(s:String, p:Position) {
		function getPosInfo(i, l) {
			var p = Context.getPosInfos(p);
			return Context.makePosition({
				min: p.min + i,
				max: p.min + i + l,
				file: p.file
			});
		}
		var uLength = unifill.Unifill.uLength(s);
		if (uLength == s.length) {
			// Single-byte characters only: nothing to rewrite.
			return s;
		}
		var buf = new StringBuf();
		var itr = new unifill.InternalEncodingIter(s, 0, s.length);
		while (itr.hasNext()) {
			var i = itr.next();
			var c = unifill.InternalEncoding.charAt(s, i);
			switch (c) {
				case '[':
					buf.add("(");
					var first = true;
					while(true) {
						if (!itr.hasNext()) {
							Context.error("Unterminated regular expression", getPosInfo(itr.index, 1));
						}
						var i = itr.next();
						var c = unifill.InternalEncoding.charAt(s, i);
						switch (c) {
							case "]":
								break;
							case "^" if (first):
								var p = unifill.InternalEncoding.codePointCount(s, 0, i);
								Context.error("Not-ranges are not supported in unicode strings", getPosInfo(i, 1));
							case _:
								if (!first) {
									buf.add("|");
								}
								buf.add("(");
								if (!itr.hasNext()) {
									Context.error("Unterminated regular expression", getPosInfo(itr.index, 1));
								}
								var w = unifill.InternalEncoding.codePointWidthAt(s, i);
								if (unifill.InternalEncoding.charAt(s, i + w) == "-") {
									// Range notation a-b within the class.
									itr.next();
									if (!itr.hasNext()) {
										Context.error("Unterminated regular expression", getPosInfo(itr.index, 1));
									}
									var k = itr.next();
									var cNext = unifill.InternalEncoding.charAt(s, k);
									if (unifill.InternalEncoding.codePointAt(c, 0) > 0x7F) {
										Context.error("Unicode ranges are not supported", getPosInfo(i, 3));
									} else {
										buf.add("[");
										buf.add(c);
										buf.add("-");
										buf.add(cNext);
										buf.add("]");
									}
								} else {
									buf.add(c);
								}
								buf.add(")");
						}
						first = false;
					}
					buf.add(")");
				case _:
					buf.add(c);
			}
		}
		return buf.toString();
	}

	#end

	/**
		Turns a rule expression into its regex string form. Accepts string
		literals, identifiers referencing helper fields, `+` concatenation
		and regex literals (without flags); anything else is an error.
	**/
	static function makeRule(fields:Map, rule:Expr):String {
		return switch(rule) {
			case macro $v{(s:String)}: #if unifill handleUnicode(s, rule.pos) #else s #end;
			case macro $i{i}: makeRule(fields, fields.get(i));
			case macro $e1 + $e2: "(" + makeRule(fields, e1) +")(" + makeRule(fields, e2) +")";
			case {expr:EConst(CRegexp(r, opt))}:
				if (opt != "") {
					Context.error("Cannot use regular expression flags for lexer rules", rule.pos);
				}
				r;
			case _: Context.error("Invalid rule", rule.pos);
		}
	}

	/**
		Rewrites a `@:rule` field: each `pattern => expr` entry becomes a
		`{rule, func}` pair, and the field initializer is replaced by a
		call to `hxparse.Lexer.buildRuleset`.
	**/
	static function transformRule(field:Field, e:Expr, t:ComplexType, fields:Map) {
		var el = switch(e.expr) {
			case EArrayDecl(el): el;
			case _: Context.error("Expected pattern => function map declaration", e.pos);
		}
		var el = el.map(function(e) {
			function loop(e:Expr) {
				return switch(e.expr) {
					case EBinop(OpArrow, rule, e):
						macro @:pos(e.pos) {rule:$v{makeRule(fields, rule)}, func:function(lexer:hxparse.Lexer):$t return $e};
					case EConst(CIdent(s)) if (fields.exists(s)):
						// Reference to a helper field: expand it in place.
						loop(fields.get(s));
					case _:
						Context.error("Expected pattern => function", e.pos);
				}
			}
			return loop(e);
		});
		var e = macro $a{el};
		var e = macro hxparse.Lexer.buildRuleset($e, $v{field.name});
		field.kind = FVar(null, e);
		return e;
	}

	/**
		Rewrites a `@:mapping` field: builds a map from enum constructor
		names (with the character at `offset` lowercased) to the
		constructors themselves.
	**/
	static function transformMapping(field:Field, e:Expr, offset:Int) {
		var t = Context.typeof(e).follow();
		var sl = [];
		switch(t) {
			case TAnonymous(_.get() => {status: AEnumStatics(_.get() => e)}):
				for (f in e.names) {
					var name = macro @:pos(e.pos) $i{f};
					var cName = f.charAt(offset).toLowerCase() + f.substr(offset + 1);
					sl.push(macro $v{cName} => $name);
				}
			case _:
				Context.error("Invalid mapping type", e.pos);
		}
		var e = macro $a{sl};
		field.kind = FVar(null, e);
		return e;
	}

	#end
}
--------------------------------------------------------------------------------
/src/hxparse/Ruleset.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
/**
	A Ruleset wraps an input state and the semantic callback functions for the
	`Lexer`.
**/
class Ruleset {

	/**
		The initial state.
	**/
	public var state:State;

	/**
		The semantic functions, invoked when a rule matches.
	**/
	public var functions:ArrayToken>;

	/**
		The callback function for when end of file state is reached.
	**/
	public var eofFunction:Lexer->Token;

	/**
		Informative name for the state, if any. Generated automatically from field name by RuleBuilder if @:rule is used.
	**/
	public var name:String;

	/**
		Creates a new Ruleset from the given `state`, `functions` and
		`eofFunction`. `name` defaults to the empty string.
	**/
	public function new(state, functions, eofFunction, name = "") {
		this.state = state;
		this.functions = functions;
		this.eofFunction = eofFunction;
		this.name = name;
	}
}
--------------------------------------------------------------------------------
/src/hxparse/State.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
/**
	Represents a state in the state machine generated by the `LexEngine`.
**/
class State {
	/**
		The transition vector, where the index corresponds to a char code.
	**/
	public var trans:haxe.ds.Vector;

	/**
		The final state id; -1 if `this` state is not a final state.
	**/
	public var finalId:Int;

	/**
		Creates a new State with an empty 256-entry transition vector and
		no final id.
	**/
	public function new() {
		finalId = -1;
		trans = new haxe.ds.Vector(256);
	}
}
--------------------------------------------------------------------------------
/src/hxparse/TokenSource.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
/**
	Defines the structure of a type usable as input for a `Parser`.
**/
typedef TokenSource = {

	/**
		Returns the next token.
	**/
	function token():Token;

	/**
		Returns the current `Position` of `this` TokenSource.
	**/
	function curPos():Position;
}
--------------------------------------------------------------------------------
/src/hxparse/Unexpected.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
/**
	Unexpected is thrown by `Parser.serror`, which is invoked when an inner
	token matching fails.

	Unlike `NoMatch`, this exception denotes that the stream is in an
	irrecoverable state because tokens have been consumed.
**/
class Unexpected extends ParserError {

	/**
		The token which was found.
	**/
	public var token:Token;

	/**
		Creates a new instance of Unexpected with the found `token` at
		position `pos`.
	**/
	public function new(token:Token, pos) {
		super(pos);
		this.token = token;
	}

	/**
		Returns a readable representation of `this` exception.
	**/
	override public function toString() {
		return 'Unexpected $token';
	}
}
--------------------------------------------------------------------------------
/src/hxparse/UnexpectedChar.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
/**
	UnexpectedChar is thrown by `Lexer.token` if it encounters a character for
	which no state transition is defined.
**/
class UnexpectedChar extends ParserError {

	/**
		The offending character.
	**/
	public var char:String;

	/**
		Creates a new UnexpectedChar for `char` at position `pos`.
	**/
	public function new(char, pos) {
		super(pos);
		this.char = char;
	}

	/**
		Returns a human-readable description of `this` exception.
	**/
	override public function toString():String {
		return 'Unexpected $char';
	}
}
--------------------------------------------------------------------------------
/src/hxparse/Utils.hx:
--------------------------------------------------------------------------------
1 | package hxparse;
2 |
3 | import hxparse.Unexpected;
4 | import hxparse.UnexpectedChar;
5 | import hxparse.NoMatch;
6 |
/**
	This class provides some static utility methods.
**/
class Utils {

	/**
		Tries to invoke `f` and return its value, while catching the lexer and
		parser exceptions `hxparse.NoMatch`, `hxparse.Unexpected` and
		`hxparse.UnexpectedChar`.

		If no exception occurs, the result of `f` is returned.

		Otherwise the caught exception is rethrown as `String` in a human-
		readable representation and with positions formatted within `input`.

		If `input` or `f` are null, the result is unspecified.
	**/
	static public function catchErrors(input:byte.ByteData, f:Void->T) {
		try {
			return f();
		} catch(e:ParserError) {
			// All hxparse exceptions extend ParserError, so this single
			// clause covers NoMatch, Unexpected and UnexpectedChar.
			throw e.pos.format(input) + ": " + e.toString();
		}
	}
}
--------------------------------------------------------------------------------
/src/hxparse/debug/LexerGraph.hx:
--------------------------------------------------------------------------------
1 | package hxparse.debug;
2 |
3 | #if !hxdotgraph
4 | #error "Using this class requires -lib hxdotgraph"
5 | #end
6 |
7 | import hxparse.Ruleset;
8 | import hxparse.State;
9 | import dot.Graph;
10 | import dot.Node;
11 | import dot.Attribute;
12 | using Lambda;
13 |
/**
	Renders the state machine of a `Ruleset` as dot graph code
	(requires -lib hxdotgraph).
**/
class LexerGraph {

	/**
		Returns the dot code for the state machine of `ruleset`.
	**/
	static public function printRuleset(ruleset:Ruleset):String {
		var lexerGraph = new LexerGraph(ruleset);
		return lexerGraph.graph.getDotCode();
	}

	var graph:Graph;
	var ruleset:Ruleset;
	// Maps already-visited states to their graph nodes to avoid duplicates.
	var map:Map;

	function new(ruleset:Ruleset) {
		this.ruleset = ruleset;
		this.graph = new Graph([RankDir(Lr)], true);
		map = new Map();
		processState(ruleset.state);
	}

	/**
		Adds a node for `state` (and, transitively, all states reachable
		from it) to the graph and returns that node. Final states are
		drawn as double circles; transitions to the same target state are
		merged into one edge labeled with the character range.
	**/
	function processState(state:State) {
		if (map.exists(state)) {
			return map[state];
		}
		var attrs = [Label("")];
		if (state.finalId > -1) {
			attrs.push(Shape(Doublecircle));
		}

		var node = graph.node(attrs);
		map[state] = node;

		// Group outgoing char codes by their target state.
		var targets = new Map();
		for (i in 0...256) {
			if (state.trans[i] == null) {
				continue;
			}
			var target = state.trans[i];
			if (!targets.exists(target)) {
				targets[target] = [i];
			} else {
				targets[target].push(i);
			}
		}

		for (target in targets.keys()) {
			var il = targets[target];
			var targetNode = processState(target);
			var edgeLabel = getRangeString(il);
			graph.edge(node, targetNode, [Label(edgeLabel)]);
		}

		return node;
	}

	/**
		Builds a compact label for the char codes `il`, collapsing
		consecutive codes into `a-b` ranges. Very large sets (more than
		240 codes) are printed as the complement (`[^...]`).
	**/
	function getRangeString(il:Array) {
		if (il.length > 240) {
			return "[^" + getRangeString(complementOf(il)) + "]";
		} else if (il.length == 1) {
			return printCode(il[0]);
		}

		var ranges = [];
		var i = 0;
		var last = -1;
		var start = -1;
		// Flushes the currently open range [start, last] to `ranges`.
		function addRange() {
			if (start == last) {
				ranges.push(printCode(start));
			} else {
				ranges.push(printCode(start) + "-" +printCode(last));
			}
		}
		while (i < il.length) {
			var cur = il[i];
			if (start == -1) {
				start = cur;
				++i;
			} else if (cur != last + 1) {
				// Gap found: close the current range and start a new one.
				addRange();
				start = -1;
			} else {
				++i;
			}
			last = cur;
		}
		if (start != -1) {
			addRange();
		}
		return ranges.join(" ");
	}

	/**
		Prints a single char code, escaping characters that are special in
		dot labels and printing codes outside the printable ASCII range
		numerically.
	**/
	function printCode(i:Int) {
		if (i >= 32 && i <= 0x7F) {
			return switch (i) {
				case '"'.code: '\\"';
				case '\\'.code: '\\\\';
				case ' '.code: "' '";
				case _: String.fromCharCode(i);
			}
		} else {
			return "\\\\" +i;
		}
	}

	// Returns all char codes 0-255 that are not contained in `il`.
	function complementOf(il:Array) {
		var ret = [];
		for (i in 0...256) {
			if (!il.has(i)) {
				ret.push(i);
			}
		}
		return ret;
	}
}
127 |
--------------------------------------------------------------------------------
/test/ArithmeticParser.hx:
--------------------------------------------------------------------------------
// Binary operators of the arithmetic language.
enum ArithmeticBinop {
	OpAdd;
	OpSub;
	OpMul;
	OpDiv;
}

// Tokens produced by ArithmeticLexer.
enum ArithmeticToken {
	TNumber(f:Float);
	TPOpen;
	TPClose;
	TBinop(op:ArithmeticBinop);
	TEof;
}

// AST of parsed arithmetic expressions.
enum ArithmeticExpr {
	ENumber(f:Float);
	EBinop(op:ArithmeticBinop, e1:ArithmeticExpr, e2:ArithmeticExpr);
	EParenthesis(e:ArithmeticExpr);
	ENeg(e:ArithmeticExpr);
}
22 |
class ArithmeticLexer extends hxparse.Lexer implements hxparse.RuleBuilder {
	/**
		Token rules for the arithmetic language. Whitespace is skipped by
		recursively lexing the next token.
	**/
	static public var tok = @:rule [
		// Original rule `[1-9][0-9]*` could not lex the literal `0`;
		// the alternation below accepts it while matching everything the
		// original matched. Integers only; no fractional literals.
		"0|([1-9][0-9]*)" => TNumber(Std.parseFloat(lexer.current)), // lazy...
		"\\(" => TPOpen,
		"\\)" => TPClose,
		"\\+" => TBinop(OpAdd),
		"\\-" => TBinop(OpSub),
		"\\*" => TBinop(OpMul),
		"\\/" => TBinop(OpDiv),
		"[\r\n\t ]" => lexer.token(tok),
		"" => TEof
	];
}
36 |
class ArithmeticParser extends hxparse.Parser, ArithmeticToken> implements hxparse.ParserBuilder {
	/**
		Parses a single expression, including any trailing binop chain.
	**/
	public function parse() {
		return switch stream {
			case [TNumber(f)]:
				parseNext(ENumber(f));
			case [TPOpen, e = parse(), TPClose]:
				parseNext(EParenthesis(e));
			case [TBinop(OpSub), e = parse()]:
				parseNext(ENeg(e));
		}
	}

	// Continues after `e1`, consuming a binop and its right operand if present.
	function parseNext(e1:ArithmeticExpr) {
		return switch stream {
			case [TBinop(op), e2 = parse()]:
				binop(e1, op, e2);
			case _:
				e1;
		}
	}

	// Builds a binop node, rotating the tree so that * and / bind more
	// strongly than + and -.
	function binop(e1:ArithmeticExpr, op:ArithmeticBinop, e2:ArithmeticExpr) {
		return switch [e2, op] {
			case [EBinop(op2 = OpAdd | OpSub, e3, e4), OpMul | OpDiv]:
				// precedence
				EBinop(op2, EBinop(op, e1, e3), e4);
			case _:
				EBinop(op, e1, e2);
		}
	}
}
68 |
class ArithmeticEvaluator {
	/**
		Recursively evaluates the expression tree `e` to a Float.
	**/
	static public function eval(e:ArithmeticExpr):Float {
		switch (e) {
			case ENumber(value):
				return value;
			case EParenthesis(inner):
				return eval(inner);
			case ENeg(inner):
				return -eval(inner);
			case EBinop(OpAdd, lhs, rhs):
				return eval(lhs) + eval(rhs);
			case EBinop(OpSub, lhs, rhs):
				return eval(lhs) - eval(rhs);
			case EBinop(OpMul, lhs, rhs):
				return eval(lhs) * eval(rhs);
			case EBinop(OpDiv, lhs, rhs):
				return eval(lhs) / eval(rhs);
		}
	}
}
--------------------------------------------------------------------------------
/test/JSONParser.hx:
--------------------------------------------------------------------------------
1 | import hxparse.Parser.parse as parse;
2 |
// Tokens produced by JSONLexer.
private enum Token {
	TBrOpen;
	TBrClose;
	TComma;
	TDblDot;
	TBkOpen;
	TBkClose;
	TDash;
	TDot;
	TTrue;
	TFalse;
	TNull;
	TNumber(v:String);
	TString(v:String);
	TEof;
}
19 |
class JSONLexer extends hxparse.Lexer implements hxparse.RuleBuilder {

	// Accumulates the contents of the string literal currently being lexed.
	static var buf:StringBuf;

	/**
		Top-level JSON token rules. Whitespace is skipped by recursively
		lexing the next token.
	**/
	public static var tok = @:rule [
		"{" => TBrOpen,
		"}" => TBrClose,
		"," => TComma,
		":" => TDblDot,
		"[" => TBkOpen,
		"]" => TBkClose,
		"-" => TDash,
		"\\." => TDot,
		"true" => TTrue,
		"false" => TFalse,
		"null" => TNull,
		// The fraction dot must be escaped (`\\.`): an unescaped `.` is a
		// wildcard in these patterns (compare the `TDot` rule above), so
		// the original rule would lex e.g. "1x5" as a single number.
		"-?(([1-9][0-9]*)|0)(\\.[0-9]+)?([eE][\\+\\-]?[0-9]+)?" => TNumber(lexer.current),
		'"' => {
			buf = new StringBuf();
			lexer.token(string);
			TString(buf.toString());
		},
		"[\r\n\t ]" => lexer.token(tok),
		"" => TEof
	];

	/**
		Rules for the interior of a string literal: resolves the supported
		escape sequences into `buf` and terminates on the closing quote.
	**/
	static var string = @:rule [
		"\\\\t" => {
			buf.addChar("\t".code);
			lexer.token(string);
		},
		"\\\\n" => {
			buf.addChar("\n".code);
			lexer.token(string);
		},
		"\\\\r" => {
			buf.addChar("\r".code);
			lexer.token(string);
		},
		'\\\\"' => {
			buf.addChar('"'.code);
			lexer.token(string);
		},
		"\\\\u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]" => {
			buf.add(String.fromCharCode(Std.parseInt("0x" +lexer.current.substr(2))));
			lexer.token(string);
		},
		'"' => {
			lexer.curPos().pmax;
		},
		'[^"]' => {
			buf.add(lexer.current);
			lexer.token(string);
		},
	];
}
76 |
class JSONParser extends hxparse.Parser, Token> {
	/**
		Creates a JSON parser over `input`, reporting positions against
		`sourceName`.
	**/
	public function new(input:byte.ByteData, sourceName:String) {
		var lexer = new JSONLexer(input, sourceName);
		var ts = new hxparse.LexerTokenSource(lexer, JSONLexer.tok);
		super(ts);
	}

	/**
		Parses a single JSON value. Note that numbers are returned in
		their string representation (see `TNumber`).
	**/
	public function parseJson():Dynamic {
		return parse(switch stream {
			case [TBrOpen, obj = object({})]: obj;
			case [TBkOpen, arr = array([])]: arr;
			case [TNumber(s)]: s;
			case [TTrue]: true;
			case [TFalse]: false;
			case [TNull]: null;
			case [TString(s)]: s;
		});
	}

	// Parses the remaining fields of an object into `obj`; the opening
	// brace has already been consumed.
	function object(obj:{}) {
		return parse(switch stream {
			case [TBrClose]: obj;
			case [TString(s), TDblDot, e = parseJson()]:
				Reflect.setField(obj, s, e);
				switch stream {
					case [TBrClose]: obj;
					case [TComma]: object(obj);
				}
		});
	}

	// Parses the remaining elements of an array into `acc`; the opening
	// bracket has already been consumed.
	function array(acc:Array) {
		return parse(switch stream {
			case [TBkClose]: acc;
			case [elt = parseJson()]:
				acc.push(elt);
				switch stream {
					case [TBkClose]: acc;
					case [TComma]: array(acc);
				}
		});
	}
}
120 |
--------------------------------------------------------------------------------
/test/PrintfParser.hx:
--------------------------------------------------------------------------------
// Tokens produced by PrintfLexer.
enum PToken {
	Eof;
	Placeholder;
	Dot;
	Number(i:Int);
	Literal(s:String);
	Flag(flag:PFlag);
	Value(v:PValue);
}

// Placeholder flags (see PrintfLexer.placeholder for the characters).
enum PFlag {
	Zero;
	Alt;
	Plus;
	Minus;
	Space;
}

// Value specifier kinds for placeholders.
// NOTE(review): the GADT type parameters of PValue and Fmt appear to have
// been stripped during extraction (e.g. `VInt:PValue` lacks a parameter,
// `FmtA>` below is garbled) — verify against the original source.
enum PValue {
	VInt:PValue;
	VString:PValue;
	VBool:PValue;
	VFloat:PValue;
}

// AST of a parsed format string.
enum Fmt {
	Lit(s:String):Fmt;
	Val(v:PValue):FmtA>;
	Cat(a:Fmt, b:Fmt):Fmt;
}
31 |
class PrintfLexer extends hxparse.Lexer implements hxparse.RuleBuilder {

	/**
		Top-level rules: everything is a literal except `$` placeholders;
		`$$` produces a literal token containing the matched text.
	**/
	static public var tok = @:rule [
		"$" => Placeholder,
		"$$" => Literal(lexer.current),
		"[^$]+" => Literal(lexer.current),
		"" => Eof
	];

	/**
		Rules used inside a placeholder: flags, width/precision numbers,
		the precision dot and the final value specifier.
	**/
	static public var placeholder = @:rule [
		"0" => Flag(Zero),
		"#" => Flag(Alt),
		" " => Flag(Space),
		"+" => Flag(Plus),
		"-" => Flag(Minus),
		"[1-9][0-9]*" => Number(Std.parseInt(lexer.current)),
		"\\." => Dot,
		"i" => Value(VInt),
		"f" => Value(VFloat),
		"s" => Value(VString),
		"b" => Value(VBool),
	];
}
55 |
class PrintfParser extends hxparse.Parser, PToken> implements hxparse.ParserBuilder {
	/**
		Creates a printf-format parser over `input`.
	**/
	public function new(input:byte.ByteData) {
		var lexer = new PrintfLexer(input);
		var ts = new hxparse.LexerTokenSource(lexer, PrintfLexer.tok);
		super(ts);
	}

	/**
		Parses the whole format string into a right-nested `Cat` chain,
		or null at end of input.
	**/
	public function parse() {
		var v:Fmt = switch stream {
			case [Literal(s)]: Lit(s);
			case [Placeholder]:
				// Placeholders are lexed with their own ruleset; the
				// previous ruleset is restored afterwards.
				var current = stream.ruleset;
				stream.ruleset = PrintfLexer.placeholder;
				var r = parsePlaceholder();
				stream.ruleset = current;
				r;
			case [Eof]: null;
		}
		if (v == null) return null;
		var next = parse();
		return next == null ? v : Cat(v, next);
	}

	// Parses flags, an optional width, an optional `.precision` and the
	// mandatory value specifier of a placeholder.
	function parsePlaceholder() {
		var flags = parseFlags([]);
		var width = switch stream {
			case [Number(n)]: n;
			case _: -1;
		}
		var precision = switch stream {
			case [Dot, Number(n)]: n;
			case _: -1;
		}
		return switch stream {
			case [Value(v)]: Val(v); // we omit the config for simplicity reasons
			case _: unexpected();
		}
	}

	// Accumulates leading flags into `acc`.
	function parseFlags(acc:Array) {
		return switch stream {
			case [Flag(x)]:
				acc.push(x);
				parseFlags(acc);
			case _: acc;
		}
	}
}
--------------------------------------------------------------------------------
/test/Test.hx:
--------------------------------------------------------------------------------
class Test {
	/**
		Smoke test driver: exercises the printf, JSON, unicode and
		arithmetic parsers and reports the elapsed time.
	**/
	static function main() {

		var t0 = haxe.Timer.stamp();

		var parser = new PrintfParser(byte.ByteData.ofString("Valu$$e: $-050.2f kg"));
		trace(parser.parse());

		var parser = new JSONParser(byte.ByteData.ofString('{ "key": [true, false, null], "other\tkey": [12, 12.1, 0, 0.1, 0.9e1, 0.9E1, 9E-1] }'), "jsontest");
		trace(parser.parseJson());

		// Using haxe.Utf8
		var value = 'hello âê€𩸽ùあ𠀀ÊÀÁÂÃÄÅÆÇÈÉËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáãäåæçèéëìíîïðñòóôõöøúûüýþÿ№ unicode';
		var lexer = new UnicodeTestLexer( byte.ByteData.ofString( value ), 'uft8-test' );
		var tokens = [];

		// Lexing runs until an exception terminates the loop (e.g. at EOF).
		try while (true) {
			tokens.push( lexer.token( UnicodeTestLexer.root ) );
		} catch (_e:Dynamic) {
			trace(_e);
		}
		trace( tokens );

		var numTests = 0;
		// Parses `s`, evaluates the resulting AST and compares against `expected`.
		function eq(expected:Float, s:String) {
			++numTests;
			var lexer = new ArithmeticParser.ArithmeticLexer(byte.ByteData.ofString(s));
			var ts = new hxparse.LexerTokenSource(lexer, ArithmeticParser.ArithmeticLexer.tok);
			var parser = new ArithmeticParser(ts);
			var result = ArithmeticParser.ArithmeticEvaluator.eval(parser.parse());
			if (expected != result) {
				trace('Error in "$s"; expected $expected but was $result');
			}
		}
		eq(1, "1");
		eq(2, "1 + 1");
		eq(6, "2 * 3");
		eq(2, "6 / 3");
		eq(1.5, "3 / 2");
		eq(10, "2 * 3 + 4");
		eq(14, "2 * (3 + 4)");
		eq(18, "9 + (3 * 4) - 3 / (1 * 1)");
		eq(-9, "-9");
		eq(-12, "-(4 + 8)");
		eq(12, "--12");
		eq(8, "2*(3-(2+(-3)))");

		// haxe.Timer.stamp() is in seconds; convert before reporting ms
		// (the original printed raw seconds labeled as "ms").
		var diff = (haxe.Timer.stamp() - t0) * 1000;
		trace('Done $numTests tests in $diff ms');
	}
}
52 |
--------------------------------------------------------------------------------
/test/UnicodeTestLexer.hx:
--------------------------------------------------------------------------------
1 | package ;
2 |
3 | import hxparse.Lexer;
4 | import hxparse.RuleBuilder;
5 | import haxe.Utf8;
6 |
/**
 * Lexer exercising unicode rules in several notations (literal characters,
 * \u escapes and raw byte escapes).
 * @author Skial Bainn
 */
class UnicodeTestLexer extends Lexer implements RuleBuilder {

	public static var root = @:rule [
		'â' => lexer.current,
		'ê' => lexer.current,
		'ù' => lexer.current,
		"あ𠀀" => lexer.current,
		'\u00CA' => lexer.current, // Ê
		'\u20AC' => lexer.current, // €
		'\u{29e3d}' => lexer.current, // 𩸽
		'[ a-zA-Z0-9ÀÁÂÔÕÖØÙÚÛÜÝÞßàáãäåæçèéëìíîïðñòóôõöøúûüýþÿ№あ𠀀]' => lexer.current,
		'\\195[\\131-\\139]' => lexer.current, // byte escapes in decimal notation — presumably UTF-8 byte values; verify
		'\\xC3[\\x8c-\\x93]' => lexer.current, // byte escapes in hex notation
		//'[Ã-Ë]' => lexer.current
	];

}
28 |
--------------------------------------------------------------------------------