├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── bin └── .gitignore ├── build-doc.hxml ├── build-each.hxml ├── build-interp.hxml ├── build-js.hxml ├── build.hxml ├── haxelib.json ├── hxparse.hxproj ├── src ├── byte │ └── ByteData.hx └── hxparse │ ├── LexEngine.hx │ ├── Lexer.hx │ ├── LexerTokenSource.hx │ ├── NoMatch.hx │ ├── Parser.hx │ ├── ParserBuilder.hx │ ├── ParserBuilderImpl.macro.hx │ ├── ParserError.hx │ ├── Position.hx │ ├── RuleBuilder.hx │ ├── Ruleset.hx │ ├── State.hx │ ├── TokenSource.hx │ ├── Unexpected.hx │ ├── UnexpectedChar.hx │ ├── Utils.hx │ └── debug │ └── LexerGraph.hx └── test ├── ArithmeticParser.hx ├── JSONParser.hx ├── PrintfParser.hx ├── Test.hx └── UnicodeTestLexer.hx /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | /dump 3 | .vscode/ 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: haxe 2 | 3 | before_install: 4 | - sudo apt-get update 5 | - sudo apt-get install mono-devel 6 | 7 | hxml: 8 | - build.hxml 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Simon Krajewski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | hxparse 2 | ======= 3 | 4 | [![TravisCI Build Status](https://api.travis-ci.org/Simn/hxparse.svg?branch=development)](https://travis-ci.org/Simn/hxparse) 5 | 6 | This library provides tools for creating lexers and parsers in Haxe. 7 | 8 | ### Installation 9 | 10 | Install the library via [haxelib](http://lib.haxe.org/p/hxparse) 11 | ``` 12 | haxelib install hxparse 13 | ``` 14 | 15 | ### Usage 16 | 17 | - Writing a Lexer: https://github.com/Simn/hxparse/wiki/Writing-a-Lexer 18 | - Writing a Parser: https://github.com/Simn/hxparse/wiki/Writing-a-Parser 19 | - API: http://simn.github.io/hxparse/hxparse/index.html 20 | -------------------------------------------------------------------------------- /bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /build-doc.hxml: -------------------------------------------------------------------------------- 1 | build-each.hxml 2 | 3 | -dce std 4 | -neko neko.n 5 | --no-output 6 | -xml bin/hxparse.xml -------------------------------------------------------------------------------- /build-each.hxml: -------------------------------------------------------------------------------- 1 | -cp src 2 | -cp test 3 | -main 
Test 4 | -dce full 5 | -lib unifill -------------------------------------------------------------------------------- /build-interp.hxml: -------------------------------------------------------------------------------- 1 | build-each.hxml 2 | --interp 3 | --times -------------------------------------------------------------------------------- /build-js.hxml: -------------------------------------------------------------------------------- 1 | build-each.hxml 2 | -js bin/hxparse.js -------------------------------------------------------------------------------- /build.hxml: -------------------------------------------------------------------------------- 1 | build-each.hxml 2 | -lib unifill 3 | --each 4 | 5 | --next 6 | -D dump=pretty 7 | -neko bin/hxparse.n 8 | 9 | --next 10 | -swf bin/hxparse.swf 11 | 12 | --next 13 | -swf-version 8 14 | -swf bin/hxparse8.swf 15 | 16 | --next 17 | -js bin/hxparse.js 18 | 19 | --next 20 | -php bin/php 21 | 22 | --next 23 | -cpp bin/cpp 24 | 25 | #--next 26 | #-java bin/java 27 | 28 | --next 29 | -cs bin/cs 30 | 31 | --next 32 | -python bin/hxparse.py -------------------------------------------------------------------------------- /haxelib.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "hxparse", 3 | "url": "https://github.com/Simn/hxparse", 4 | "license": "MIT", 5 | "classPath": "src", 6 | "description": "This library provides tools for creating lexers and parsers in haxe.", 7 | "version": "4.3.0", 8 | "releasenote": "update", 9 | "contributors": ["Simn"] 10 | } 11 | -------------------------------------------------------------------------------- /hxparse.hxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | "$(CompilerPath)/haxe" build.hxml 48 | 49 
| 50 | 51 | 52 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /src/byte/ByteData.hx: -------------------------------------------------------------------------------- 1 | package byte; 2 | 3 | abstract ByteData(haxe.io.Bytes) { 4 | 5 | public var length(get,never):Int; 6 | inline function get_length() return this.length; 7 | 8 | inline public function readByte(i:Int) return this.get(i); 9 | 10 | inline function new(data) { 11 | this = data; 12 | } 13 | 14 | inline static public function ofString(s:String):ByteData { 15 | return new ByteData(haxe.io.Bytes.ofString(s)); 16 | } 17 | 18 | inline static public function ofBytes(b:haxe.io.Bytes):ByteData { 19 | return new ByteData(b); 20 | } 21 | 22 | inline public function readString(pos:Int, len:Int) { 23 | return this.getString(pos, len); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/hxparse/LexEngine.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | LexEngine handles pattern parsing and state transformation. 5 | 6 | This class is used by the `Lexer` and rarely has to be interacted with 7 | directly. 8 | 9 | The static `parse` method transforms a single `String` to a `Pattern`. 10 | Multiple patterns can then be passed to the constructor to generate the 11 | state machine, which is obtainable from the `firstState` method. 12 | **/ 13 | class LexEngine { 14 | 15 | var uid : Int; 16 | var nodes : Array; 17 | var finals : Array; 18 | var states : Array; 19 | var hstates : Map; 20 | 21 | /** 22 | Creates a new LexEngine from `patterns`. 23 | 24 | Each LexEngine maintains a state machine, whose initial state can be 25 | obtained from the `firstState` method. After this, `this` LexEngine can 26 | be discarded. 27 | 28 | If `patterns` is null, the result is unspecified. 
29 | **/ 30 | public function new( patterns : Array ) { 31 | nodes = []; 32 | finals = []; 33 | states = []; 34 | hstates = new Map(); 35 | uid = 0; 36 | var pid = 0; 37 | for ( p in patterns ) { 38 | var id = pid++; 39 | var f = node(id); 40 | var n = initNode(p, f,id); 41 | nodes.push(n); 42 | finals.push(f); 43 | } 44 | makeState(addNodes([], nodes)); 45 | } 46 | 47 | /** 48 | Returns the entry state of the state machine generated by `this` 49 | LexEngine. 50 | **/ 51 | public function firstState() { 52 | return states[0]; 53 | } 54 | 55 | function makeState( nodes : Array ) { 56 | var buf = new StringBuf(); 57 | for( n in nodes ) { 58 | buf.add(n.id); 59 | buf.addChar("-".code); 60 | } 61 | var key = buf.toString(); 62 | var s = hstates.get(key); 63 | if( s != null ) 64 | return s; 65 | 66 | s = new State(); 67 | states.push(s); 68 | hstates.set(key, s); 69 | 70 | var trans = getTransitions(nodes); 71 | 72 | for ( t in trans ) { 73 | var target = makeState(t.n); 74 | for (chr in t.chars) { 75 | for (i in chr.min...(chr.max + 1)) { 76 | s.trans.set(i, target); 77 | } 78 | } 79 | } 80 | 81 | function setFinal() { 82 | for( f in finals ) 83 | for( n in nodes ) 84 | if( n == f ) { 85 | s.finalId = n.pid; 86 | return; 87 | } 88 | } 89 | if (s.finalId == -1) 90 | setFinal(); 91 | return s; 92 | } 93 | 94 | function getTransitions( nodes : Array ) { 95 | var tl = []; 96 | for( n in nodes ) 97 | for( t in n.trans ) 98 | tl.push(t); 99 | 100 | // Merge transition with the same target 101 | tl.sort(function(t1, t2) return t1.n.id - t2.n.id); 102 | var t0 = tl[0]; 103 | for( i in 1...tl.length ) { 104 | var t1 = tl[i]; 105 | if( t0.n == t1.n ) { 106 | tl[i - 1] = null; 107 | t1 = { chars : cunion(t0.chars, t1.chars), n : t1.n }; 108 | tl[i] = t1; 109 | } 110 | t0 = t1; 111 | } 112 | while( tl.remove(null) ) { 113 | } 114 | 115 | // Split char sets to make them disjoint 116 | var allChars = EMPTY; 117 | var allStates = new List<{ chars : Charset, n : Array }>(); 118 | for( 
t in tl ) { 119 | var states = new List(); 120 | states.push( { chars : cdiff(t.chars, allChars), n : [t.n] } ); 121 | for( s in allStates ) { 122 | var nodes = s.n.copy(); 123 | nodes.push(t.n); 124 | states.push( { chars : cinter(s.chars,t.chars), n : nodes } ); 125 | states.push( { chars : cdiff(s.chars, t.chars), n : s.n } ); 126 | } 127 | for( s in states ) 128 | if( s.chars.length == 0 ) 129 | states.remove(s); 130 | allChars = cunion(allChars, t.chars); 131 | allStates = states; 132 | } 133 | 134 | // Epsilon closure of targets 135 | var states = []; 136 | for( s in allStates ) 137 | states.push({ chars : s.chars, n : addNodes([], s.n) }); 138 | 139 | // Canonical ordering 140 | states.sort(function(s1, s2) { 141 | var a = s1.chars.length; 142 | var b = s2.chars.length; 143 | for( i in 0...(a < b?a:b) ) { 144 | var a = s1.chars[i]; 145 | var b = s2.chars[i]; 146 | if( a.min != b.min ) 147 | return b.min - a.min; 148 | if( a.max != b.max ) 149 | return b.max - a.max; 150 | } 151 | if( a < b ) 152 | return b - a; 153 | return 0; 154 | }); 155 | return states; 156 | } 157 | 158 | function addNode( nodes : Array, n : Node ) { 159 | for( n2 in nodes ) 160 | if( n == n2 ) 161 | return; 162 | nodes.push(n); 163 | addNodes(nodes, n.epsilon); 164 | } 165 | 166 | function addNodes( nodes : Array, add : Array ) { 167 | for( n in add ) 168 | addNode(nodes, n); 169 | return nodes; 170 | } 171 | 172 | inline function node(pid) { 173 | return new Node(uid++, pid); 174 | } 175 | 176 | function initNode( p : Pattern, finalId : Node, pid : Int ) { 177 | return switch( p ) { 178 | case Empty: 179 | finalId; 180 | case Match(c): 181 | var n = node(pid); 182 | n.trans.push({ chars : c, n : finalId }); 183 | n; 184 | case Star(p): 185 | var n = node(pid); 186 | var an = initNode(p,n,pid); 187 | n.epsilon.push(an); 188 | n.epsilon.push(finalId); 189 | n; 190 | case Plus(p): 191 | var n = node(pid); 192 | var an = initNode(p,n,pid); 193 | n.epsilon.push(an); 194 | 
n.epsilon.push(finalId); 195 | an; 196 | case Next(a,b): 197 | initNode(a, initNode(b, finalId,pid),pid); 198 | case Choice(a,b): 199 | var n = node(pid); 200 | n.epsilon.push(initNode(a,finalId,pid)); 201 | n.epsilon.push(initNode(b,finalId,pid)); 202 | n; 203 | case Group(p): 204 | initNode(p, finalId, pid); 205 | } 206 | } 207 | 208 | // ----------------------- PATTERN PARSING --------------------------- 209 | 210 | static inline var MAX_CODE = 255; 211 | static var EMPTY:Charset = []; 212 | static var ALL_CHARS = [ new CharRange( 0, MAX_CODE ) ]; 213 | 214 | static inline function single( c : Int ) : Charset { 215 | return [ { min : c, max : c } ]; 216 | } 217 | 218 | /** 219 | Parses the `pattern` `String` and returns an instance of `Pattern`. 220 | 221 | If `pattern` is not a valid pattern string, an exception of `String` is 222 | thrown. 223 | 224 | The following meta characters are supported: 225 | 226 | - `*`: zero or more 227 | - `+`: one or more 228 | - `?`: zero or one 229 | - `|`: or 230 | - `[`: begin char range 231 | - `]`: end char range 232 | - `(`: begin group 233 | - `)`: end group 234 | - `\`: escape next char 235 | 236 | These characters must be escaped if they are part of the pattern, by 237 | using `\\*`, `\\]` etc. 238 | **/ 239 | public static function parse( pattern : String ) : Pattern { 240 | var p = parseInner(byte.ByteData.ofString(pattern)); 241 | if( p == null ) throw "Invalid pattern '" + pattern + "'"; 242 | return p.pattern; 243 | } 244 | 245 | static function next( a, b ) { 246 | return a == Empty ? 
b : Next(a, b); 247 | } 248 | 249 | static function plus(r) { 250 | return switch( r ) { 251 | case Next(r1, r2): Next(r1, plus(r2)); 252 | default: Plus(r); 253 | } 254 | } 255 | 256 | static function star(r) { 257 | return switch( r ) { 258 | case Next(r1, r2): Next(r1, star(r2)); 259 | default: Star(r); 260 | } 261 | } 262 | 263 | static function opt(r) { 264 | return switch( r ) { 265 | case Next(r1, r2): Next(r1, opt(r2)); 266 | default: Choice(r, Empty); 267 | } 268 | } 269 | 270 | static function cinter(c1,c2) { 271 | return ccomplement(cunion(ccomplement(c1), ccomplement(c2))); 272 | } 273 | 274 | static function cdiff(c1,c2) { 275 | return ccomplement(cunion(ccomplement(c1), c2)); 276 | } 277 | 278 | static function ccomplement( c : Charset ) { 279 | var first = c[0]; 280 | var start = first != null && first.min == -1 ? c.shift().max + 1 : -1; 281 | var out: Charset = []; 282 | for( k in c ) { 283 | out.push( { min : start, max : k.min - 1 } ); 284 | start = k.max + 1; 285 | } 286 | if( start <= MAX_CODE ) 287 | out.push( { min : start, max : MAX_CODE } ); 288 | return out; 289 | } 290 | 291 | static function cunion( ca : Charset, cb : Charset ) { 292 | var i = 0, j = 0; 293 | var out = []; 294 | var a = ca[i++], b = cb[j++]; 295 | while( true ) { 296 | if( a == null ) { 297 | out.push(b); 298 | while( j < cb.length ) 299 | out.push(cb[j++]); 300 | break; 301 | } 302 | if( b == null ) { 303 | out.push(a); 304 | while( i < ca.length ) 305 | out.push(ca[i++]); 306 | break; 307 | } 308 | if( a.min <= b.min ) { 309 | if( a.max + 1 < b.min ) { 310 | out.push(a); 311 | a = ca[i++]; 312 | } else if( a.max < b.max ) { 313 | b = { min : a.min, max : b.max }; 314 | a = ca[i++]; 315 | } else 316 | b = cb[j++]; 317 | } else { 318 | // swap 319 | var tmp = ca; 320 | ca = cb; 321 | cb = tmp; 322 | var tmp = j; 323 | j = i; 324 | i = tmp; 325 | var tmp = a; 326 | a = b; 327 | b = tmp; 328 | } 329 | } 330 | return out; 331 | } 332 | 333 | static function parseInner( 
pattern : byte.ByteData, i : Int = 0, pDepth : Int = 0 ) : { pattern: Pattern, pos: Int } { /* Parses `pattern` starting at byte offset `i`; `pDepth` counts the '(' groups that are currently open. Returns the parsed pattern together with the offset just past the consumed input, or null for a malformed `[...]` range. */ 334 | /* Reads the character following a backslash: `xHH` is a hex code, a run of digits is a decimal char code, end of input yields a literal backslash, anything else is taken as-is. */ function readChar() { 335 | var c = pattern.readByte(i++); 336 | if ( StringTools.isEof(c) ) { 337 | c = '\\'.code; 338 | } else if (c == "x".code) { 339 | c = Std.parseInt("0x" + pattern.readString(i, 2)); 340 | i += 2; 341 | } else if (c >= "0".code && c <= "9".code) { 342 | var v = c - 48; 343 | while(true) { 344 | var cNext = pattern.readByte(i); 345 | if (cNext >= "0".code && cNext <= "9".code) { 346 | v = v * 10 + (cNext - 48); 347 | ++i; 348 | } else { 349 | break; 350 | } 351 | } 352 | c = v; 353 | } 354 | return c; 355 | } 356 | 357 | var r = Empty; 358 | var l = pattern.length; 359 | while( i < l ) { 360 | var c = pattern.readByte(i++); 361 | if (c > 255) throw c; 362 | switch( c ) { 363 | case '+'.code if (r != Empty): 364 | r = plus(r); 365 | case '*'.code if (r != Empty): 366 | r = star(r); 367 | case '?'.code if (r != Empty): 368 | r = opt(r); 369 | case '|'.code if (r != Empty): 370 | /* forward pDepth so that an unclosed '(' is still reported when the group contains '|' */ var r2 = parseInner(pattern, i, pDepth); 371 | return {pattern: Choice(r, r2.pattern), pos: r2.pos}; 372 | case '.'.code: 373 | r = next(r, Match(ALL_CHARS)); 374 | case '('.code: 375 | var r2 = parseInner(pattern, i, pDepth + 1); 376 | i = r2.pos; 377 | r = next(r, r2.pattern); 378 | case ')'.code: 379 | if (r == Empty) throw "Empty group"; 380 | return { pattern: Group(r), pos: i}; 381 | case '['.code if (pattern.length > 1): 382 | var range = 0; 383 | var acc:Charset = []; 384 | var not = pattern.readByte(i) == '^'.code; 385 | if( not ) i++; 386 | while( true ) { 387 | var c = pattern.readByte(i++); 388 | if( c == ']'.code ) { 389 | if( range != 0 ) return null; 390 | break; 391 | } else if( c == '-'.code ) { 392 | if( range != 0 ) return null; 393 | var last = acc.pop(); 394 | if( last == null ) 395 | acc.push( { min : c, max : c } ); 396 | else { 397 | if( last.min != last.max ) return null; 398 | range = last.min; 399 | } 400 | } else { 401 | if( c == '\\'.code ) { 402 | c = 
readChar(); 403 | } 404 | if( range == 0 ) 405 | acc.push( { min : c, max : c } ); 406 | else { 407 | acc.push( { min : range, max : c } ); 408 | range = 0; 409 | } 410 | } 411 | } 412 | var g:Charset = []; 413 | for( k in acc ) 414 | g = cunion(g, [k]); 415 | if( not ) 416 | g = cdiff(ALL_CHARS, g); 417 | r = next(r, Match(g)); 418 | case '\\'.code: 419 | c = readChar(); 420 | r = next(r, Match(single(c))); 421 | default: 422 | r = next(r, Match(single(c))); 423 | } 424 | } 425 | if (pDepth != 0) throw 'Found unclosed parenthesis while parsing "$pattern"'; 426 | return {pattern:r, pos: i}; 427 | } 428 | } 429 | 430 | private enum Pattern { 431 | Empty; 432 | Match( c : Charset ); 433 | Star( p : Pattern ); 434 | Plus( p : Pattern ); 435 | Next( p1 : Pattern, p2 : Pattern ); 436 | Choice( p1 : Pattern, p2 : Pattern ); 437 | Group ( p : Pattern ); 438 | } 439 | 440 | @:structInit private class CharRange { 441 | public var min:Int; 442 | public var max:Int; 443 | public function new(min,max) { 444 | this.min = min; 445 | this.max = max; 446 | } 447 | } 448 | private typedef Charset = Array; 449 | 450 | private class Node { 451 | public var id : Int; 452 | public var pid : Int; 453 | public var trans : Array<{ chars : Charset, n : Node }>; 454 | public var epsilon : Array; 455 | public function new(id, pid) { 456 | this.id = id; 457 | this.pid = pid; 458 | trans = []; 459 | epsilon = []; 460 | } 461 | } 462 | 463 | private class Transition { 464 | public var chars : Charset; 465 | public function new(chars) { 466 | this.chars = chars; 467 | } 468 | public function toString() { 469 | return Std.string(chars); 470 | } 471 | } 472 | -------------------------------------------------------------------------------- /src/hxparse/Lexer.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | Lexer matches a sequence of characters against a set of rule patterns. 
5 | 6 | An instance of Lexer is created once for each input and maintains state 7 | for that input. Tokens can then be obtained by calling the `token` method, 8 | passing an instance of `Ruleset`. 9 | 10 | Rule sets can be created manually, or by calling the static `buildRuleset` 11 | method. 12 | **/ 13 | class Lexer { 14 | 15 | /** 16 | The `String` that was matched by the most recent invocation of the 17 | `token` method. 18 | **/ 19 | public var current(default, null):String; 20 | 21 | var input:byte.ByteData; 22 | var source:String; 23 | var pos:Int; 24 | 25 | /** 26 | Creates a new Lexer for `input`. 27 | 28 | If `sourceName` is provided, it is used in error messages to denote 29 | the position of an error. 30 | 31 | If `input` is null, the result is unspecified. 32 | **/ 33 | public function new(input:byte.ByteData, sourceName:String = "") { 34 | current = ""; 35 | this.input = input; 36 | source = sourceName; 37 | pos = 0; 38 | } 39 | 40 | /** 41 | Returns the current position of `this` Lexer. 42 | **/ 43 | public inline function curPos():Position { 44 | return new Position(source, pos - current.length, pos); 45 | } 46 | 47 | /** 48 | Returns the next token according to `ruleset`. 49 | 50 | This method starts with `ruleset.state` and reads characters from `this` 51 | input until no further state transitions are possible. It always returns 52 | the longest match. 53 | 54 | If a character is read which has no transition defined, an 55 | `UnexpectedChar` exception is thrown. 56 | 57 | If the input is in the end of file state upon method invocation, 58 | `ruleset.eofFunction` is called with `this` Lexer as argument. If 59 | `ruleset` defines no `eofFunction` field, a `haxe.io.Eof` exception 60 | is thrown. 61 | 62 | If `ruleset` is null, the result is unspecified. 
63 | **/ 64 | public function token(ruleset:Ruleset):T { 65 | if (pos == input.length) { 66 | if (ruleset.eofFunction != null) return ruleset.eofFunction(this); 67 | else throw new haxe.io.Eof(); 68 | } 69 | var state = ruleset.state; 70 | var lastMatch = null; 71 | var lastMatchPos = pos; 72 | var start = pos; 73 | 74 | #if expose_lexer_state 75 | stateCallback(state, pos, -1); 76 | #end 77 | 78 | while(true) { 79 | if (state.finalId > -1) { 80 | lastMatch = state; 81 | lastMatchPos = pos; 82 | } 83 | if (pos == input.length) { 84 | break; 85 | } 86 | var i = input.readByte(pos); 87 | ++pos; 88 | state = state.trans.get(i); 89 | 90 | #if expose_lexer_state 91 | stateCallback(state, pos-1, i); 92 | #end 93 | 94 | if (state == null) 95 | break; 96 | } 97 | pos = lastMatchPos; 98 | current = input.readString(start, pos - start); 99 | if (lastMatch == null || lastMatch.finalId == -1) 100 | throw new UnexpectedChar(String.fromCharCode(input.readByte(pos)), curPos()); 101 | return ruleset.functions[lastMatch.finalId](this); 102 | } 103 | 104 | #if expose_lexer_state 105 | /** 106 | 107 | @param state `null` if it's the last state visited 108 | @param position Position of the byte read 109 | @param input Transition input byte, `-1` if initial state 110 | **/ 111 | dynamic public function stateCallback(state:State, position:Int, input:Int) {} 112 | #end 113 | 114 | /** 115 | Builds a `Ruleset` from the given `rules` `Array`. 116 | 117 | For each element of `rules`, its `rule` `String` is parsed into a 118 | `Pattern` using `LexEngine.parse`. 119 | 120 | If `rules` is null, the result is unspecified. 
121 | **/ 122 | static public function buildRuleset(rules:Array<{rule:String,func:Lexer->Token}>, name:String = "") { 123 | var cases = []; 124 | var functions = []; 125 | var eofFunction = null; 126 | for (rule in rules) { 127 | if (rule.rule == "") { 128 | eofFunction = rule.func; 129 | } else { 130 | cases.push(LexEngine.parse(rule.rule)); 131 | functions.push(rule.func); 132 | } 133 | } 134 | return new Ruleset(new LexEngine(cases).firstState(), functions, eofFunction, name); 135 | } 136 | } -------------------------------------------------------------------------------- /src/hxparse/LexerTokenSource.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | class LexerTokenSource { 4 | var lexer:Lexer; 5 | public var ruleset:Ruleset; 6 | 7 | public function new(lexer, ruleset){ 8 | this.lexer = lexer; 9 | this.ruleset = ruleset; 10 | } 11 | 12 | public function token():Token{ 13 | return lexer.token(ruleset); 14 | } 15 | 16 | public function curPos():Position{ 17 | return lexer.curPos(); 18 | } 19 | } -------------------------------------------------------------------------------- /src/hxparse/NoMatch.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | A NoMatch exception is thrown if an outer token matching fails. 5 | 6 | Matching can continue because no tokens have been consumed. 7 | **/ 8 | class NoMatch extends ParserError { 9 | 10 | /** 11 | The token which was encountered and could not be matched. 12 | **/ 13 | public var token(default, null):T; 14 | 15 | /** 16 | Creates a new NoMatch exception. 
17 | **/ 18 | public function new(pos:hxparse.Position, token:T) { 19 | super(pos); 20 | this.token = token; 21 | } 22 | 23 | override public function toString() { 24 | return 'No match: $token'; 25 | } 26 | } -------------------------------------------------------------------------------- /src/hxparse/Parser.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | Parser is the base class for all custom parsers. 5 | 6 | The intended usage is to extend it and utilize its methods as an API where 7 | required. 8 | */ 9 | @:generic 10 | class Parser, Token> { 11 | 12 | /** 13 | Returns the last matched token. 14 | 15 | This is the token most recently consumed by `junk`. 16 | **/ 17 | public var last(default, null):Token; 18 | 19 | var stream:S; 20 | var token:haxe.ds.GenericStack.GenericCell; 21 | 22 | /** 23 | Creates a new Parser instance over `TokenSource` `stream` 24 | **/ 25 | public function new(stream:S) { 26 | this.stream = stream; 27 | } 28 | 29 | /** 30 | Returns the `n`th token without consuming it. `n` is zero-based: `peek(0)` is the next unconsumed token. Tokens are read from `stream` on demand and kept in a linked buffer until `junk` consumes them. 31 | **/ 32 | @:dox(show) 33 | #if cs inline #end // Workaround for https://github.com/HaxeFoundation/haxe/issues/3212 34 | function peek(n:Int):Token { 35 | if (token == null) { 36 | token = new haxe.ds.GenericStack.GenericCell(stream.token(), null); 37 | /* no `n--` here: the cell just created is token 0, so `n` more steps are still needed */ 38 | } 39 | var tok = token; 40 | while (n > 0) { 41 | if (tok.next == null) tok.next = new haxe.ds.GenericStack.GenericCell(stream.token(), null); 42 | tok = tok.next; 43 | n--; 44 | } 45 | return tok.elt; 46 | } 47 | 48 | /** 49 | Consumes the current token. 50 | 51 | This method is automatically called after a successful match. 52 | **/ 53 | @:dox(show) 54 | inline function junk() { 55 | last = token.elt; 56 | token = token.next; 57 | } 58 | 59 | /** 60 | Returns the current lexer position. 
61 | **/ 62 | @:dox(show) 63 | public inline function curPos() { 64 | return stream.curPos(); 65 | } 66 | 67 | /** 68 | Invokes `f` and then `separatorFunc` with the current token until the 69 | result of that call is `false`, or until `f` raises `NoMatch`. A matched separator token is consumed. 70 | 71 | The result is an Array containing the results of all calls to `f`. 72 | 73 | A typical use case is parsing function arguments which are separated by 74 | a comma. 75 | **/ 76 | @:dox(show) 77 | function parseSeparated(separatorFunc:Token->Bool, f:Void->T):Array { 78 | var acc = []; 79 | while(true) { 80 | try { 81 | acc.push(f()); 82 | } catch(e:hxparse.NoMatch) { 83 | break; 84 | } 85 | if (separatorFunc(peek(0))) { 86 | junk(); 87 | } else { 88 | break; 89 | } 90 | } 91 | return acc; 92 | } 93 | 94 | /** 95 | Returns the result of calling `f()` if a match is made, or `null` 96 | otherwise. 97 | **/ 98 | @:dox(show) 99 | function parseOptional(f:Void->T) { 100 | try { 101 | return f(); 102 | } catch(e:hxparse.NoMatch) { 103 | return null; 104 | } 105 | } 106 | 107 | /** 108 | Calls `f` until no match can be made. 109 | 110 | The result is an Array containing the results of all calls to `f`. 111 | **/ 112 | @:dox(show) 113 | function parseRepeat(f:Void->T) { 114 | var acc = []; 115 | while(true) { 116 | try { 117 | acc.push(f()); 118 | } catch(e:hxparse.NoMatch) { 119 | return acc; 120 | } 121 | } 122 | } 123 | 124 | /** 125 | Returns the result of calling `f()` if a match is made, or throws 126 | `Unexpected` (via `unexpected()`) otherwise. 127 | **/ 128 | function parseExpect(f:Void->T) { 129 | try { 130 | return f(); 131 | } catch(_:NoMatch) { 132 | unexpected(); 133 | } 134 | } 135 | 136 | /** 137 | Creates and returns a `NoMatch` exception holding the current stream position and the next unconsumed token (`peek(0)`). It does not throw by itself; the caller is expected to `throw` the result. 138 | **/ 139 | inline function noMatch() { 140 | return new NoMatch(stream.curPos(), peek(0)); 141 | } 142 | 143 | /** 144 | Throws `Unexpected` exception, which contains last matched position and token. 
145 | **/ 146 | inline function unexpected():Dynamic { 147 | throw new Unexpected(peek(0), stream.curPos()); 148 | } 149 | 150 | /** 151 | Macro that processes and returns the result of `switch`. 152 | **/ 153 | @:access(hxparse.ParserBuilderImpl.transformSwitch) 154 | static public macro function parse(e:haxe.macro.Expr) { 155 | switch (e.expr) { 156 | case ESwitch(_, cases, edef) | EParenthesis({expr: ESwitch(_, cases, edef)}): 157 | return hxparse.ParserBuilderImpl.transformSwitch(cases, edef); 158 | case _: 159 | return haxe.macro.Context.error("Expected switch expression", e.pos); 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/hxparse/ParserBuilder.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | @:autoBuild(hxparse.ParserBuilderImpl.build()) 4 | interface ParserBuilder { } -------------------------------------------------------------------------------- /src/hxparse/ParserBuilderImpl.macro.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | import haxe.macro.Context; 4 | import haxe.macro.Expr; 5 | 6 | using haxe.macro.Tools; 7 | using Lambda; 8 | 9 | private typedef ParserCase = { 10 | expr: Expr, 11 | head: Expr, 12 | tail: Array 13 | } 14 | 15 | private enum CaseGroup { 16 | Simple(group:Array); 17 | Complex(c:ParserCase); 18 | } 19 | 20 | class ParserBuilderImpl { 21 | static public function build():Array { 22 | var fields = Context.getBuildFields(); 23 | for (field in fields) { 24 | switch(field.kind) { 25 | case FFun(fun) if (fun.expr != null): 26 | fun.expr = map(fun.expr); 27 | case _: 28 | } 29 | } 30 | return fields; 31 | } 32 | 33 | static function punion(p1:Position, p2:Position) { 34 | var p1 = Context.getPosInfos(p1); 35 | var p2 = Context.getPosInfos(p2); 36 | return Context.makePosition({ 37 | file: p1.file, 38 | min: p1.min < p2.min ? 
p1.min : p2.min, 39 | max: p1.max > p2.max ? p1.max : p2.max 40 | }); 41 | } 42 | 43 | static function map(e:Expr) { 44 | return switch(e.expr) { 45 | case ESwitch({expr: EConst(CIdent("stream"))}, cl, edef): 46 | transformSwitch(cl, edef); 47 | case EBlock([]): 48 | e; 49 | case EBlock(el): 50 | var elast = el.pop(); 51 | var el = el.map(map); 52 | el.push(map(elast)); 53 | macro @:pos(e.pos) $b{el}; 54 | case _: e.map(map); 55 | } 56 | } 57 | 58 | static function transformSwitch(cl:Array, edef:Null) { 59 | if (edef != null) 60 | cl.push({values: [macro _], expr: edef, guard: null}); 61 | return transformCases(cl); 62 | } 63 | 64 | static function transformCases(cl:Array) { 65 | var groups = []; 66 | var group = []; 67 | var def = noMatch; 68 | for (c in cl) { 69 | switch(c.values) { 70 | case [{expr:EArrayDecl(el)}]: 71 | var head = el.shift(); 72 | var chead = {head:head, tail: el, expr:c.expr == null ? macro null : map(c.expr)}; 73 | switch(head.expr) { 74 | case EBinop(_): 75 | if (group.length > 0) groups.push(Simple(group)); 76 | groups.push(Complex(chead)); 77 | group = []; 78 | case _: 79 | group.push(chead); 80 | } 81 | case [{expr:EConst(CIdent("_"))}]: 82 | def = c.expr == null ? 
macro null : map(c.expr); 83 | case [e]: 84 | Context.error("Expected [ patterns ]", e.pos); 85 | case _: 86 | Context.error("Comma notation is not allowed while matching streams", punion(c.values[0].pos, c.values[c.values.length - 1].pos)); 87 | } 88 | } 89 | if (group.length > 0) 90 | groups.push(Simple(group)); 91 | 92 | var last = groups.pop(); 93 | var elast = makeCase(last,def); 94 | while (groups.length > 0) { 95 | elast = makeCase(groups.pop(), elast); 96 | } 97 | return elast; 98 | } 99 | 100 | static var unexpected = macro unexpected(); 101 | static var noMatch = macro throw noMatch(); 102 | 103 | static function makeCase(g:CaseGroup, def:Expr) { 104 | return switch(g) { 105 | case Simple(group): 106 | var cl = group.map(makeInner); 107 | cl.iter(function(c) { 108 | c.expr = macro @:pos(c.expr.pos) { junk(); ${c.expr}; }; 109 | }); 110 | { 111 | pos: def.pos, 112 | expr: ESwitch(macro peek(0), cl, def) 113 | } 114 | case Complex(c): 115 | var inner = makeInner(c); 116 | makePattern(c.head, inner.expr, def); 117 | } 118 | } 119 | 120 | static function makeInner(c:ParserCase) { 121 | var last = c.tail.pop(); 122 | if (last == null) { 123 | return {values:[c.head], guard:null, expr: c.expr}; 124 | } 125 | var elast = makePattern(last, c.expr, unexpected); 126 | while (c.tail.length > 0) 127 | elast = makePattern(c.tail.pop(), elast, unexpected); 128 | return {values: [c.head], guard: null, expr: elast}; 129 | } 130 | 131 | static function makePattern(pat:Expr, e:Expr, def:Expr) { 132 | return switch(pat.expr) { 133 | case EBinop(OpAssign, {expr: EConst(CIdent(s))}, e2): 134 | if (def == unexpected || def == noMatch) { 135 | var e1 = s == "_" ? 
e2 : macro var $s = $e2; 136 | macro { 137 | $e1; 138 | $e; 139 | } 140 | } else { 141 | buildExtractor(pat, e, e2, s, def); 142 | } 143 | case EBinop(OpBoolAnd, e1, e2): 144 | macro @:pos(pat.pos) { 145 | switch peek(0) { 146 | case $e1 if ($e2): 147 | junk(); 148 | $e; 149 | case _: $def; 150 | } 151 | } 152 | case EBinop(OpBoolOr, e1, e2): 153 | makePattern(e1, e, macro throw stream.curPos() + ": " +$e2); 154 | case _: 155 | macro @:pos(pat.pos) switch peek(0) { 156 | case $pat: 157 | junk(); 158 | $e; 159 | case _: $def; 160 | } 161 | } 162 | } 163 | 164 | /** Builds the `ident = expr` form for a custom fallback: evaluates the binding (or just `e2` when the name is `_`) followed by `e` inside a `try`, and evaluates `def` when `hxparse.NoMatch` is caught. Because `e` is inside the `try` as well, a `NoMatch` raised while evaluating `e` also selects `def`. **/ static function buildExtractor(pat, e, e2, s, def) { 165 | var e1 = s == "_" ? e2 : macro var $s = $e2; 166 | return macro @:pos(pat.pos) { 167 | try { 168 | $e1; 169 | $e; 170 | } catch (_:hxparse.NoMatch) { 171 | $def; 172 | } 173 | } 174 | } 175 | } -------------------------------------------------------------------------------- /src/hxparse/ParserError.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | This is the base class of all parser errors. 5 | **/ 6 | class ParserError { 7 | /** 8 | The position in the input where `this` exception occurred. 9 | **/ 10 | public var pos(default, null):Position; 11 | 12 | /** Creates a new ParserError located at `pos`. **/ public function new(pos:Position) { 13 | this.pos = pos; 14 | } 15 | 16 | /** Returns a readable representation of `this` error. **/ public function toString() { 17 | return "Parser error"; 18 | } 19 | } -------------------------------------------------------------------------------- /src/hxparse/Position.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | The position information maintained by `Lexer`. 5 | **/ 6 | class Position { 7 | /** 8 | Name of the source. 9 | **/ 10 | public var psource : String; 11 | 12 | /** 13 | The first character position, counting from the beginning of the input. 14 | **/ 15 | public var pmin : Int; 16 | 17 | /** 18 | The last character position, counting from the beginning of the input. 
19 | **/ 20 | public var pmax : Int; 21 | 22 | /** 23 | Creates a new `Position` from the given information. 24 | **/ 25 | public function new(source, min, max) { 26 | psource = source; 27 | pmin = min; 28 | pmax = max; 29 | } 30 | 31 | /** 32 | Returns a readable representation of `this` position. 33 | **/ 34 | public function toString() { 35 | return '$psource:characters $pmin-$pmax'; 36 | } 37 | 38 | /** Resolves `pmin` and `pmax` against `input` by scanning it for newline bytes. `lineMin` and `lineMax` are line numbers starting at 1; `posMin` and `posMax` are the offsets of `pmin` and `pmax` from the start of their respective lines, counted in the units read by `input.readByte`. **/ public function getLinePosition(input:byte.ByteData) { 39 | var lineMin = 1; 40 | var lineMax = 1; 41 | var posMin = 0; 42 | var posMax = 0; 43 | var cur = 0; 44 | while (cur < pmin) { 45 | if (input.readByte(cur) == "\n".code) { 46 | lineMin++; 47 | posMin = cur + 1; 48 | } 49 | cur++; 50 | } 51 | /* Up to here `posMin` held the offset of the line start; the scan for `pmax` continues from the same line, then `posMin` is turned into a column. */ lineMax = lineMin; 52 | posMax = posMin; 53 | posMin = cur - posMin; 54 | while (cur < pmax) { 55 | if (input.readByte(cur) == "\n".code) { 56 | lineMax++; 57 | posMax = cur + 1; 58 | } 59 | cur++; 60 | } 61 | posMax = cur - posMax; 62 | return { 63 | lineMin: lineMin, 64 | lineMax: lineMax, 65 | posMin: posMin, 66 | posMax: posMax 67 | } 68 | } 69 | 70 | /** 71 | Formats `this` position by resolving line numbers within `input`. 72 | 73 | A position spanning several lines is formatted as a line range, otherwise as the line followed by the character range within it. 74 | 75 | If `input` is null, the result is unspecified. 74 | **/ 75 | public function format(input:byte.ByteData) { 76 | var linePos = getLinePosition(input); 77 | if (linePos.lineMin != linePos.lineMax) { 78 | return '${psource}:lines ${linePos.lineMin}-${linePos.lineMax}'; 79 | } else { 80 | return '${psource}:${linePos.lineMin}: characters ${linePos.posMin}-${linePos.posMax}'; 81 | } 82 | } 83 | 84 | /** 85 | Unifies two positions `p1` and `p2`, using the minimum `pmin` and 86 | maximum `pmax` of both. 87 | 88 | The resulting `psource` is taken from `p1`. 89 | 90 | If `p1` or `p2` are null, the result is unspecified. 91 | **/ 92 | static public function union(p1:Position, p2:Position) { 93 | return new Position(p1.psource, p1.pmin < p2.pmin ? p1.pmin : p2.pmin, p1.pmax > p2.pmax ? 
p1.pmax : p2.pmax); 94 | } 95 | } 96 | 97 | /** NOTE(review): matches the shape returned by `getLinePosition`, but nothing visible in this file refers to it - confirm whether it is still needed. **/ private typedef Position2 = { 98 | lineMin: Int, 99 | lineMax: Int, 100 | posMin: Int, 101 | posMax: Int 102 | } 103 | -------------------------------------------------------------------------------- /src/hxparse/RuleBuilder.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | import haxe.macro.Context; 4 | import haxe.macro.Expr; 5 | 6 | using Lambda; 7 | using haxe.macro.Tools; 8 | 9 | /** 10 | The RuleBuilder interface provides syntactic shortcuts for writing lexer 11 | rules. 12 | **/ 13 | #if !macro 14 | @:autoBuild(hxparse.RuleBuilderImpl.build()) 15 | #end 16 | interface RuleBuilder { } 17 | 18 | class RuleBuilderImpl { 19 | /** Build macro behind `RuleBuilder`. Static variables initialized with `@:rule` or `@:mapping` are scheduled for `transformRule` / `transformMapping`; every other initialized static variable is recorded by name so rules can refer to it. Fields marked `@:ruleHelper` are dropped from the result. Finally a public static field `generatedRulesets` is appended, initialized with an array of all `@:rule` fields. **/ macro static public function build():Array { 20 | var fields = Context.getBuildFields(); 21 | var fieldExprs = new Map(); 22 | var delays = []; 23 | var ret = []; 24 | var rules = []; 25 | for (field in fields) { 26 | if (field.access.exists(function(a) return a == AStatic)) 27 | switch(field.kind) { 28 | case FVar(t, e) if (e != null): 29 | switch(e.expr) { 30 | case EMeta({name: ":rule"}, e): 31 | rules.push(field.name); 32 | delays.push(transformRule.bind(field, e, t, fieldExprs)); 33 | case EMeta({name: ":mapping", params: args}, e): 34 | /* A single integer argument of `@:mapping` is the name offset; anything else means 0. */ var offset = switch(args) { 35 | case [{expr: EConst(CInt(i))}]: Std.parseInt(i); 36 | case _: 0; 37 | } 38 | delays.push(transformMapping.bind(field, e, offset)); 39 | case _: 40 | fieldExprs.set(field.name, e); 41 | } 42 | case _: 43 | } 44 | if (!field.meta.exists(function(m) return m.name == ":ruleHelper")) { 45 | ret.push(field); 46 | } 47 | } 48 | /* The transformations run only now, after `fieldExprs` holds every plain static initializer, so a rule may refer to a field declared after it. */ for (delay in delays) 49 | delay(); 50 | var ruleIdents = [for (rv in rules) macro $i{rv}]; 51 | ret.push( { 52 | name: "generatedRulesets", 53 | access: [APublic, AStatic], 54 | kind: FVar(TPath({ 55 | name: "Array", 56 | pack: [], 57 | params: [TPType(TPath({ 58 | name: "Ruleset", 59 | pack: ["hxparse"], 60 | params: [TPType(TPath( { 61 | name: 
"Dynamic", 62 | pack: [] 63 | }))] 64 | }))] 65 | }), macro $a{ruleIdents}), 66 | pos: Context.currentPos() 67 | }); 68 | return ret; 69 | } 70 | 71 | #if macro 72 | 73 | #if unifill 74 | 75 | /** Preprocesses the rule string `s`; `p` is the position used to locate errors. If `unifill.Unifill.uLength(s)` equals `s.length`, `s` is returned unchanged. Otherwise every character class `[...]` is rewritten as a parenthesized alternation of its members, e.g. `[ab]` becomes `((a)|(b))`. A range inside a class is kept as `[x-y]` unless the code point of its first character is above 0x7F, which is reported as an error, as are negated classes and unterminated classes. **/ static function handleUnicode(s:String, p:Position) { 76 | function getPosInfo(i, l) { 77 | var p = Context.getPosInfos(p); 78 | return Context.makePosition({ 79 | min: p.min + i, 80 | max: p.min + i + l, 81 | file: p.file 82 | }); 83 | } 84 | var uLength = unifill.Unifill.uLength(s); 85 | if (uLength == s.length) { 86 | return s; 87 | } 88 | var buf = new StringBuf(); 89 | var itr = new unifill.InternalEncodingIter(s, 0, s.length); 90 | while (itr.hasNext()) { 91 | var i = itr.next(); 92 | var c = unifill.InternalEncoding.charAt(s, i); 93 | switch (c) { 94 | case '[': 95 | buf.add("("); 96 | var first = true; 97 | while(true) { 98 | if (!itr.hasNext()) { 99 | Context.error("Unterminated regular expression", getPosInfo(itr.index, 1)); 100 | } 101 | var i = itr.next(); 102 | var c = unifill.InternalEncoding.charAt(s, i); 103 | switch (c) { 104 | case "]": 105 | break; 106 | case "^" if (first): 107 | var p = unifill.InternalEncoding.codePointCount(s, 0, i); 108 | Context.error("Not-ranges are not supported in unicode strings", getPosInfo(i, 1)); 109 | case _: 110 | if (!first) { 111 | buf.add("|"); 112 | } 113 | buf.add("("); 114 | if (!itr.hasNext()) { 115 | Context.error("Unterminated regular expression", getPosInfo(itr.index, 1)); 116 | } 117 | var w = unifill.InternalEncoding.codePointWidthAt(s, i); 118 | /* A `-` right after the current character starts a range: skip it and read the upper bound. */ if (unifill.InternalEncoding.charAt(s, i + w) == "-") { 119 | itr.next(); 120 | if (!itr.hasNext()) { 121 | Context.error("Unterminated regular expression", getPosInfo(itr.index, 1)); 122 | } 123 | var k = itr.next(); 124 | var cNext = unifill.InternalEncoding.charAt(s, k); 125 | if (unifill.InternalEncoding.codePointAt(c, 0) > 0x7F) { 126 | Context.error("Unicode ranges are not supported", getPosInfo(i, 3)); 127 | } else { 128 | buf.add("["); 129 | buf.add(c); 130 | 
buf.add("-"); 131 | buf.add(cNext); 132 | buf.add("]"); 133 | } 134 | } else { 135 | buf.add(c); 136 | } 137 | buf.add(")"); 138 | } 139 | first = false; 140 | } 141 | buf.add(")"); 142 | case _: 143 | buf.add(c); 144 | } 145 | } 146 | return buf.toString(); 147 | } 148 | 149 | #end 150 | 151 | /** Converts the rule expression `rule` into a pattern string. A string literal is used as is (passed through `handleUnicode` when `unifill` is defined), an identifier is looked up in `fields` and converted recursively, `a + b` becomes the concatenation `(a)(b)`, and a regular expression literal contributes its pattern, flags being rejected. Anything else is reported as an invalid rule. NOTE(review): an identifier that is missing from `fields` passes null to the recursive call - confirm how that case is meant to be reported. **/ static function makeRule(fields:Map, rule:Expr):String { 152 | return switch(rule) { 153 | case macro $v{(s:String)}: #if unifill handleUnicode(s, rule.pos) #else s #end; 154 | case macro $i{i}: makeRule(fields, fields.get(i)); 155 | case macro $e1 + $e2: "(" + makeRule(fields, e1) +")(" + makeRule(fields, e2) +")"; 156 | case {expr:EConst(CRegexp(r, opt))}: 157 | if (opt != "") { 158 | Context.error("Cannot use regular expression flags for lexer rules", rule.pos); 159 | } 160 | r; 161 | case _: Context.error("Invalid rule", rule.pos); 162 | } 163 | } 164 | 165 | /** Transforms the `@:rule` initializer `e` of `field`. `e` must be an array declaration whose entries are `pattern => expression`; an entry may also be an identifier naming an entry of `fields`, whose expression is used in its place. Each entry becomes an object with the pattern string from `makeRule` and a function taking `lexer` that returns the expression, typed with the declared field type `t`. The field initializer is replaced by a call to `hxparse.Lexer.buildRuleset` with these objects and the field name, and the explicit field type is dropped. **/ static function transformRule(field:Field, e:Expr, t:ComplexType, fields:Map) { 166 | var el = switch(e.expr) { 167 | case EArrayDecl(el): el; 168 | case _: Context.error("Expected pattern => function map declaration", e.pos); 169 | } 170 | var el = el.map(function(e) { 171 | function loop(e:Expr) { 172 | return switch(e.expr) { 173 | case EBinop(OpArrow, rule, e): 174 | macro @:pos(e.pos) {rule:$v{makeRule(fields, rule)}, func:function(lexer:hxparse.Lexer):$t return $e}; 175 | case EConst(CIdent(s)) if (fields.exists(s)): 176 | loop(fields.get(s)); 177 | case _: 178 | Context.error("Expected pattern => function", e.pos); 179 | } 180 | } 181 | return loop(e); 182 | }); 183 | var e = macro $a{el}; 184 | var e = macro hxparse.Lexer.buildRuleset($e, $v{field.name}); 185 | field.kind = FVar(null, e); 186 | return e; 187 | } 188 | 189 | /** Transforms the `@:mapping` initializer `e` of `field`. `e` must type as the static side of an enum; the initializer is replaced by a map literal with one entry per constructor name. The key is the name without its first `offset` characters and with the then-first character lower-cased, the value is the constructor identifier. **/ static function transformMapping(field:Field, e:Expr, offset:Int) { 190 | var t = Context.typeof(e).follow(); 191 | var sl = []; 192 | switch(t) { 193 | case TAnonymous(_.get() => {status: AEnumStatics(_.get() => e)}): 194 | for (f in e.names) { 195 | var name = macro @:pos(e.pos) $i{f}; 
196 | /* Key: the constructor name without its first `offset` characters, first remaining character lower-cased. */ var cName = f.charAt(offset).toLowerCase() + f.substr(offset + 1); 197 | sl.push(macro $v{cName} => $name); 198 | } 199 | case _: 200 | Context.error("Invalid mapping type", e.pos); 201 | } 202 | var e = macro $a{sl}; 203 | field.kind = FVar(null, e); 204 | return e; 205 | } 206 | 207 | #end 208 | } -------------------------------------------------------------------------------- /src/hxparse/Ruleset.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | A Ruleset wraps an input state and the semantic callback functions for the 5 | `Lexer`. 6 | **/ 7 | class Ruleset { 8 | 9 | /** 10 | The initial state. 11 | **/ 12 | public var state:State; 13 | 14 | /** 15 | The semantic functions. 16 | **/ 17 | public var functions:ArrayToken>; 18 | 19 | /** 20 | The callback function for when end of file state is reached. 21 | **/ 22 | public var eofFunction:Lexer->Token; 23 | 24 | /** 25 | Informative name for `this` ruleset, if any. Generated automatically from field name by RuleBuilder if @:rule is used. 26 | **/ 27 | public var name:String; 28 | 29 | /** 30 | Creates a new Ruleset. `name` defaults to the empty string. 31 | **/ 32 | public function new(state, functions, eofFunction, name = "") { 33 | this.state = state; 34 | this.functions = functions; 35 | this.eofFunction = eofFunction; 36 | this.name = name; 37 | } 38 | } -------------------------------------------------------------------------------- /src/hxparse/State.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | Represents a state in the state machine generated by the `LexEngine`. 5 | **/ 6 | class State { 7 | /** 8 | The transition vector, where the index corresponds to a char code. 9 | **/ 10 | public var trans:haxe.ds.Vector; 11 | 12 | /** 13 | The id of the final state; the constructor initializes it to -1. 14 | **/ 15 | public var finalId:Int; 16 | 17 | /** 18 | Creates a new State. 
19 | **/ 20 | public function new() { 21 | /* Start with `finalId` at -1 and a transition vector of 256 entries. */ finalId = -1; 22 | trans = new haxe.ds.Vector(256); 23 | } 24 | } -------------------------------------------------------------------------------- /src/hxparse/TokenSource.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | Defines the structure of a type usable as input for a `Parser`. 5 | **/ 6 | typedef TokenSource = { 7 | 8 | /** 9 | Returns the next token. 10 | **/ 11 | function token():Token; 12 | 13 | /** 14 | Returns the current `Position` of `this` TokenSource. 15 | **/ 16 | function curPos():Position; 17 | } -------------------------------------------------------------------------------- /src/hxparse/Unexpected.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | Unexpected is thrown by `Parser.serror`, which is invoked when an inner 5 | token matching fails. 6 | 7 | Unlike `NoMatch`, this exception denotes that the stream is in an 8 | irrecoverable state because tokens have been consumed. 9 | **/ 10 | class Unexpected extends ParserError { 11 | 12 | /** 13 | The token which was found. 14 | **/ 15 | public var token:Token; 16 | 17 | /** 18 | Creates a new instance of Unexpected for `token` at position `pos`. 19 | **/ 20 | public function new(token:Token, pos) { 21 | super(pos); 22 | this.token = token; 23 | } 24 | 25 | /** 26 | Returns a readable representation of `this` exception. 27 | **/ 28 | override public function toString() { 29 | return 'Unexpected $token'; 30 | } 31 | } -------------------------------------------------------------------------------- /src/hxparse/UnexpectedChar.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | /** 4 | UnexpectedChar is thrown by `Lexer.token` if it encounters a character for 5 | which no state transition is defined. 
6 | **/ 7 | class UnexpectedChar extends ParserError { 8 | 9 | /** 10 | The character which caused `this` exception. 11 | **/ 12 | public var char:String; 13 | 14 | /** 15 | Creates a new instance of UnexpectedChar for `char` at position `pos`. 16 | **/ 17 | public function new(char, pos) { 18 | super(pos); 19 | this.char = char; 20 | } 21 | 22 | /** 23 | Returns a readable representation of `this` exception. 24 | **/ 25 | override public function toString() { 26 | return 'Unexpected $char'; 27 | } 28 | } -------------------------------------------------------------------------------- /src/hxparse/Utils.hx: -------------------------------------------------------------------------------- 1 | package hxparse; 2 | 3 | import hxparse.Unexpected; 4 | import hxparse.UnexpectedChar; 5 | import hxparse.NoMatch; 6 | 7 | /** 8 | This class provides some static utility methods. 9 | **/ 10 | class Utils { 11 | 12 | /** 13 | Tries to invoke `f` and return its value, while catching the lexer and 14 | parser exceptions `hxparse.NoMatch`, `hxparse.Unexpected` and 15 | `hxparse.UnexpectedChar`. NOTE(review): the implementation catches `hxparse.ParserError`; `Unexpected` and `UnexpectedChar` extend it, while `NoMatch` is not visible here - confirm that it does too. 16 | 17 | If no exception occurs, the result of `f` is returned. 18 | 19 | Otherwise the caught exception is rethrown as `String` in a human- 20 | readable representation and with positions formatted within `input`. 21 | 22 | If `input` or `f` are null, the result is unspecified. 
23 | **/ 24 | static public function catchErrors(input:byte.ByteData, f:Void->T) { 25 | try { 26 | return f(); 27 | } catch(e:ParserError) { 28 | /* Rethrown as a String: the formatted position, a colon and the error text. */ throw e.pos.format(input) + ": " + e.toString(); 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /src/hxparse/debug/LexerGraph.hx: -------------------------------------------------------------------------------- 1 | package hxparse.debug; 2 | 3 | #if !hxdotgraph 4 | #error "Using this class requires -lib hxdotgraph" 5 | #end 6 | 7 | import hxparse.Ruleset; 8 | import hxparse.State; 9 | import dot.Graph; 10 | import dot.Node; 11 | import dot.Attribute; 12 | using Lambda; 13 | 14 | /** Builds a `dot.Graph` from the states reachable from the initial state of a `Ruleset`. Requires `-lib hxdotgraph`. **/ class LexerGraph { 15 | 16 | /** Builds the graph for `ruleset` and returns the result of its `getDotCode()`. **/ static public function printRuleset(ruleset:Ruleset):String { 17 | var lexerGraph = new LexerGraph(ruleset); 18 | return lexerGraph.graph.getDotCode(); 19 | } 20 | 21 | var graph:Graph; 22 | var ruleset:Ruleset; 23 | /** Nodes already created, keyed by state. **/ var map:Map; 24 | 25 | /** Creates the graph and populates it starting from `ruleset.state`. **/ function new(ruleset:Ruleset) { 26 | this.ruleset = ruleset; 27 | this.graph = new Graph([RankDir(Lr)], true); 28 | map = new Map(); 29 | processState(ruleset.state); 30 | } 31 | 32 | /** Returns the graph node for `state`, creating it on the first visit. The node is stored in `map` before the targets are processed, so a state that is reached again (including through a cycle) yields the existing node. States with `finalId > -1` get a double-circle shape. Transitions are grouped by target state, so each target gets one edge labelled with all char codes leading to it. **/ function processState(state:State) { 33 | if (map.exists(state)) { 34 | return map[state]; 35 | } 36 | var attrs = [Label("")]; 37 | if (state.finalId > -1) { 38 | attrs.push(Shape(Doublecircle)); 39 | } 40 | 41 | var node = graph.node(attrs); 42 | map[state] = node; 43 | 44 | var targets = new Map(); 45 | for (i in 0...256) { 46 | if (state.trans[i] == null) { 47 | continue; 48 | } 49 | var target = state.trans[i]; 50 | if (!targets.exists(target)) { 51 | targets[target] = [i]; 52 | } else { 53 | targets[target].push(i); 54 | } 55 | } 56 | 57 | for (target in targets.keys()) { 58 | var il = targets[target]; 59 | var targetNode = processState(target); 60 | var edgeLabel = getRangeString(il); 61 | graph.edge(node, targetNode, [Label(edgeLabel)]); 62 | } 63 | 64 | return node; 65 | } 66 | 67 | /** Formats the char codes `il` as an edge label. More than 240 codes are printed as the negation `[^...]` of their complement, a single code is printed on its own, and otherwise runs of consecutive codes are collapsed into `a-b` ranges joined by spaces. Relies on `il` being in ascending order, which is how `processState` builds it. **/ function getRangeString(il:Array) { 68 | if (il.length > 
240) { 69 | return "[^" + getRangeString(complementOf(il)) + "]"; 70 | } else if (il.length == 1) { 71 | return printCode(il[0]); 72 | } 73 | 74 | var ranges = []; 75 | var i = 0; 76 | var last = -1; 77 | var start = -1; 78 | function addRange() { 79 | if (start == last) { 80 | ranges.push(printCode(start)); 81 | } else { 82 | ranges.push(printCode(start) + "-" +printCode(last)); 83 | } 84 | } 85 | while (i < il.length) { 86 | var cur = il[i]; 87 | if (start == -1) { 88 | start = cur; 89 | ++i; 90 | } else if (cur != last + 1) { 91 | /* Gap found: flush the pending range. `i` is not advanced, so the same code opens the next range on the following iteration. */ addRange(); 92 | start = -1; 93 | } else { 94 | ++i; 95 | } 96 | last = cur; 97 | } 98 | if (start != -1) { 99 | addRange(); 100 | } 101 | return ranges.join(" "); 102 | } 103 | 104 | /** Returns the label text for char code `i`. Codes from 32 to 0x7F are printed as the character itself, except that the double quote is prefixed with a backslash, the backslash is doubled and the space is wrapped in single quotes. All other codes are printed as two backslashes followed by the decimal code. NOTE(review): the range includes 0x7F, which is printed as a raw character - confirm that this is intended. **/ function printCode(i:Int) { 105 | if (i >= 32 && i <= 0x7F) { 106 | return switch (i) { 107 | case '"'.code: '\\"'; 108 | case '\\'.code: '\\\\'; 109 | case ' '.code: "' '"; 110 | case _: String.fromCharCode(i); 111 | } 112 | } else { 113 | return "\\\\" +i; 114 | } 115 | } 116 | 117 | /** Returns, in ascending order, every code from 0 to 255 that is not contained in `il`. **/ function complementOf(il:Array) { 118 | var ret = []; 119 | for (i in 0...256) { 120 | if (!il.has(i)) { 121 | ret.push(i); 122 | } 123 | } 124 | return ret; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /test/ArithmeticParser.hx: -------------------------------------------------------------------------------- 1 | /** Binary operators of the arithmetic test grammar. **/ enum ArithmeticBinop { 2 | OpAdd; 3 | OpSub; 4 | OpMul; 5 | OpDiv; 6 | } 7 | 8 | /** Tokens produced by `ArithmeticLexer`. **/ enum ArithmeticToken { 9 | TNumber(f:Float); 10 | TPOpen; 11 | TPClose; 12 | TBinop(op:ArithmeticBinop); 13 | TEof; 14 | } 15 | 16 | /** Expression tree built by `ArithmeticParser` and evaluated by `ArithmeticEvaluator`. **/ enum ArithmeticExpr { 17 | ENumber(f:Float); 18 | EBinop(op:ArithmeticBinop, e1:ArithmeticExpr, e2:ArithmeticExpr); 19 | EParenthesis(e:ArithmeticExpr); 20 | ENeg(e:ArithmeticExpr); 21 | } 22 | 23 | /** Lexer of the arithmetic test grammar. The number rule only accepts digit sequences that do not start with 0. **/ class ArithmeticLexer extends hxparse.Lexer implements hxparse.RuleBuilder { 24 | static public var tok = @:rule [ 25 | "[1-9][0-9]*" => TNumber(Std.parseFloat(lexer.current)), // lazy... 
26 | "\\(" => TPOpen, 27 | "\\)" => TPClose, 28 | "\\+" => TBinop(OpAdd), 29 | "\\-" => TBinop(OpSub), 30 | "\\*" => TBinop(OpMul), 31 | "\\/" => TBinop(OpDiv), 32 | "[\r\n\t ]" => lexer.token(tok), 33 | "" => TEof 34 | ]; 35 | } 36 | 37 | /** Parser of the arithmetic test grammar, written with the stream switch syntax of `hxparse.ParserBuilder`. **/ class ArithmeticParser extends hxparse.Parser, ArithmeticToken> implements hxparse.ParserBuilder { 38 | /** Parses one expression: a number, a parenthesized expression or a negation, each followed by an optional binary operation (see `parseNext`). The operand of a negation is whatever the nested `parse()` call returns, i.e. the whole following expression. **/ public function parse() { 39 | return switch stream { 40 | case [TNumber(f)]: 41 | parseNext(ENumber(f)); 42 | case [TPOpen, e = parse(), TPClose]: 43 | parseNext(EParenthesis(e)); 44 | case [TBinop(OpSub), e = parse()]: 45 | parseNext(ENeg(e)); 46 | } 47 | } 48 | 49 | /** If a binary operator follows, parses the right operand and combines it with `e1` through `binop`; otherwise returns `e1` unchanged. **/ function parseNext(e1:ArithmeticExpr) { 50 | return switch stream { 51 | case [TBinop(op), e2 = parse()]: 52 | binop(e1, op, e2); 53 | case _: 54 | e1; 55 | } 56 | } 57 | 58 | /** Combines `e1 op e2`. When `op` is `OpMul` or `OpDiv` and `e2` is itself an `OpAdd` or `OpSub` node, the tree is rotated so that `op` is applied to `e1` and the left operand of `e2` first. **/ function binop(e1:ArithmeticExpr, op:ArithmeticBinop, e2:ArithmeticExpr) { 59 | return switch [e2, op] { 60 | case [EBinop(op2 = OpAdd | OpSub, e3, e4), OpMul | OpDiv]: 61 | // precedence 62 | EBinop(op2, EBinop(op, e1, e3), e4); 63 | case _: 64 | EBinop(op, e1, e2); 65 | } 66 | } 67 | } 68 | 69 | class ArithmeticEvaluator { 70 | /** Evaluates the expression tree `e` recursively to a Float. **/ static public function eval(e:ArithmeticExpr):Float { 71 | return switch(e) { 72 | case ENumber(f): 73 | f; 74 | case EBinop(op, e1, e2): 75 | switch(op) { 76 | case OpAdd: 77 | eval(e1) + eval(e2); 78 | case OpSub: 79 | eval(e1) - eval(e2); 80 | case OpMul: 81 | eval(e1) * eval(e2); 82 | case OpDiv: 83 | eval(e1) / eval(e2); 84 | } 85 | case EParenthesis(e1): 86 | eval(e1); 87 | case ENeg(e1): 88 | -eval(e1); 89 | } 90 | } 91 | } -------------------------------------------------------------------------------- /test/JSONParser.hx: -------------------------------------------------------------------------------- 1 | import hxparse.Parser.parse as parse; 2 | 3 | /** Tokens produced by `JSONLexer`. **/ private enum Token { 4 | TBrOpen; 5 | TBrClose; 6 | TComma; 7 | TDblDot; 8 | TBkOpen; 9 | TBkClose; 10 | TDash; 11 | TDot; 12 | TTrue; 13 | TFalse; 14 | TNull; 15 | TNumber(v:String); 16 | TString(v:String); 
17 | TEof; 18 | } 19 | 20 | class JSONLexer extends hxparse.Lexer implements hxparse.RuleBuilder { 21 | 22 | /** Collects the content of the string literal currently being lexed by the `string` ruleset. It is static, so it is shared by all lexer instances. **/ static var buf:StringBuf; 23 | 24 | /** Main ruleset. An opening double quote resets `buf`, hands over to the `string` ruleset and yields the collected text as `TString`; whitespace is skipped by lexing the next token. **/ public static var tok = @:rule [ 25 | "{" => TBrOpen, 26 | "}" => TBrClose, 27 | "," => TComma, 28 | ":" => TDblDot, 29 | "[" => TBkOpen, 30 | "]" => TBkClose, 31 | "-" => TDash, 32 | "\\." => TDot, 33 | "true" => TTrue, 34 | "false" => TFalse, 35 | "null" => TNull, 36 | /* NOTE(review): the `.` in front of the fraction digits is not escaped, unlike in the TDot rule above - confirm that it is meant to match only a literal dot. */ "-?(([1-9][0-9]*)|0)(.[0-9]+)?([eE][\\+\\-]?[0-9]+)?" => TNumber(lexer.current), 37 | '"' => { 38 | buf = new StringBuf(); 39 | lexer.token(string); 40 | TString(buf.toString()); 41 | }, 42 | "[\r\n\t ]" => lexer.token(tok), 43 | "" => TEof 44 | ]; 45 | 46 | /** Ruleset used inside a string literal. Every rule except the closing double quote appends to `buf` and continues with this ruleset; the closing quote ends the recursion and yields the current `pmax`. **/ static var string = @:rule [ 47 | "\\\\t" => { 48 | buf.addChar("\t".code); 49 | lexer.token(string); 50 | }, 51 | "\\\\n" => { 52 | buf.addChar("\n".code); 53 | lexer.token(string); 54 | }, 55 | "\\\\r" => { 56 | buf.addChar("\r".code); 57 | lexer.token(string); 58 | }, 59 | '\\\\"' => { 60 | buf.addChar('"'.code); 61 | lexer.token(string); 62 | }, 63 | "\\\\u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]" => { 64 | /* Drop the two leading characters of the match and parse the four hex digits. */ buf.add(String.fromCharCode(Std.parseInt("0x" +lexer.current.substr(2)))); 65 | lexer.token(string); 66 | }, 67 | '"' => { 68 | lexer.curPos().pmax; 69 | }, 70 | '[^"]' => { 71 | buf.add(lexer.current); 72 | lexer.token(string); 73 | }, 74 | ]; 75 | } 76 | 77 | class JSONParser extends hxparse.Parser, Token> { 78 | /** Creates a parser reading `input` through a `JSONLexer` that starts with the `tok` ruleset; `sourceName` is passed on to the lexer. **/ public function new(input:byte.ByteData, sourceName:String) { 79 | var lexer = new JSONLexer(input, sourceName); 80 | var ts = new hxparse.LexerTokenSource(lexer, JSONLexer.tok); 81 | super(ts); 82 | } 83 | 84 | /** Parses one JSON value. Objects and arrays are built by `object` and `array`; numbers are returned as the matched String, not converted. **/ public function parseJson():Dynamic { 85 | return parse(switch stream { 86 | case [TBrOpen, obj = object({})]: obj; 87 | case [TBkOpen, arr = array([])]: arr; 88 | case [TNumber(s)]: s; 89 | case [TTrue]: true; 90 | case [TFalse]: false; 91 | case [TNull]: null; 92 | case [TString(s)]: s; 93 | }); 94 | } 95 | 96 | /** Parses the remainder of an object after its opening brace, setting each parsed field on `obj`, and returns `obj`. After a comma the function recurses and accepts a closing brace again, so a trailing comma is accepted. **/ function object(obj:{}) { 97 | return parse(switch stream { 
98 | case [TBrClose]: obj; 99 | case [TString(s), TDblDot, e = parseJson()]: 100 | Reflect.setField(obj, s, e); 101 | switch stream { 102 | case [TBrClose]: obj; 103 | case [TComma]: object(obj); 104 | } 105 | }); 106 | } 107 | 108 | /** Parses the remainder of an array after its opening bracket, pushing each parsed element onto `acc`, and returns `acc`. After a comma the function recurses and accepts a closing bracket again, so a trailing comma is accepted. **/ function array(acc:Array) { 109 | return parse(switch stream { 110 | case [TBkClose]: acc; 111 | case [elt = parseJson()]: 112 | acc.push(elt); 113 | switch stream { 114 | case [TBkClose]: acc; 115 | case [TComma]: array(acc); 116 | } 117 | }); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /test/PrintfParser.hx: -------------------------------------------------------------------------------- 1 | /** Tokens produced by the two rulesets of `PrintfLexer`. **/ enum PToken { 2 | Eof; 3 | Placeholder; 4 | Dot; 5 | Number(i:Int); 6 | Literal(s:String); 7 | Flag(flag:PFlag); 8 | Value(v:PValue); 9 | } 10 | 11 | /** Flags that may follow the placeholder character. **/ enum PFlag { 12 | Zero; 13 | Alt; 14 | Plus; 15 | Minus; 16 | Space; 17 | } 18 | 19 | /** Value kinds selected by the conversion letter of a placeholder. **/ enum PValue { 20 | VInt:PValue; 21 | VString:PValue; 22 | VBool:PValue; 23 | VFloat:PValue; 24 | } 25 | 26 | /** Parsed format: a literal, a value placeholder, or the concatenation of two formats. **/ enum Fmt { 27 | Lit(s:String):Fmt; 28 | Val(v:PValue):FmtA>; 29 | Cat(a:Fmt, b:Fmt):Fmt; 30 | } 31 | 32 | class PrintfLexer extends hxparse.Lexer implements hxparse.RuleBuilder { 33 | 34 | /** Ruleset for text outside a placeholder. **/ static public var tok = @:rule [ 35 | "$" => Placeholder, 36 | "$$" => Literal(lexer.current), 37 | "[^$]+" => Literal(lexer.current), 38 | "" => Eof 39 | ]; 40 | 41 | /** Ruleset for the content of a placeholder: flags, numbers, the dot and the conversion letters. It has no end-of-input rule. **/ static public var placeholder = @:rule [ 42 | "0" => Flag(Zero), 43 | "#" => Flag(Alt), 44 | " " => Flag(Space), 45 | "+" => Flag(Plus), 46 | "-" => Flag(Minus), 47 | "[1-9][0-9]*" => Number(Std.parseInt(lexer.current)), 48 | "\\." 
=> Dot, 49 | "i" => Value(VInt), 50 | "f" => Value(VFloat), 51 | "s" => Value(VString), 52 | "b" => Value(VBool), 53 | ]; 54 | } 55 | 56 | class PrintfParser extends hxparse.Parser, PToken> implements hxparse.ParserBuilder { 57 | /** Creates a parser reading `input` through a `PrintfLexer` that starts with the `tok` ruleset. **/ public function new(input:byte.ByteData) { 58 | var lexer = new PrintfLexer(input); 59 | var ts = new hxparse.LexerTokenSource(lexer, PrintfLexer.tok); 60 | super(ts); 61 | } 62 | 63 | /** Parses the whole input into a `Fmt`, chaining the parts with `Cat`; returns null when the input is at `Eof`. While a placeholder is parsed the stream ruleset is switched to `PrintfLexer.placeholder` and restored afterwards. **/ public function parse() { 64 | var v:Fmt = switch stream { 65 | case [Literal(s)]: Lit(s); 66 | case [Placeholder]: 67 | var current = stream.ruleset; 68 | stream.ruleset = PrintfLexer.placeholder; 69 | var r = parsePlaceholder(); 70 | stream.ruleset = current; 71 | r; 72 | case [Eof]: null; 73 | } 74 | if (v == null) return null; 75 | var next = parse(); 76 | return next == null ? v : Cat(v, next); 77 | } 78 | 79 | /** Parses flags, an optional width, an optional precision and the mandatory conversion letter. Flags, width and precision are consumed but not part of the returned `Val`. **/ function parsePlaceholder() { 80 | var flags = parseFlags([]); 81 | var width = switch stream { 82 | case [Number(n)]: n; 83 | case _: -1; 84 | } 85 | var precision = switch stream { 86 | case [Dot, Number(n)]: n; 87 | case _: -1; 88 | } 89 | return switch stream { 90 | case [Value(v)]: Val(v); // we omit the config for simplicity reasons 91 | case _: unexpected(); 92 | } 93 | } 94 | 95 | /** Collects consecutive `Flag` tokens into `acc` and returns it. **/ function parseFlags(acc:Array) { 96 | return switch stream { 97 | case [Flag(x)]: 98 | acc.push(x); 99 | parseFlags(acc); 100 | case _: acc; 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /test/Test.hx: -------------------------------------------------------------------------------- 1 | /** Runs the printf, JSON, unicode lexer and arithmetic examples and traces their results. **/ class Test { 2 | static function main() { 3 | 4 | var t0 = haxe.Timer.stamp(); 5 | 6 | var parser = new PrintfParser(byte.ByteData.ofString("Valu$$e: $-050.2f kg")); 7 | trace(parser.parse()); 8 | 9 | var parser = new JSONParser(byte.ByteData.ofString('{ "key": [true, false, null], "other\tkey": [12, 12.1, 0, 0.1, 0.9e1, 0.9E1, 9E-1] }'), "jsontest"); 10 | trace(parser.parseJson()); 11 | 12 | // Using haxe.Utf8 13 | var 
value = 'hello âê€𩸽ùあ𠀀ÊÀÁÂÃÄÅÆÇÈÉËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáãäåæçèéëìíîïðñòóôõöøúûüýþÿ№ unicode'; 14 | var lexer = new UnicodeTestLexer( byte.ByteData.ofString( value ), 'uft8-test' ); 15 | var tokens = []; 16 | 17 | /* `while (true)` has no break: the loop only ends through an exception from `lexer.token`, which is traced below. */ try while (true) { 18 | tokens.push( lexer.token( UnicodeTestLexer.root ) ); 19 | } catch (_e:Dynamic) { 20 | trace(_e); 21 | } 22 | trace( tokens ); 23 | 24 | var numTests = 0; 25 | /** Lexes, parses and evaluates `s` and traces an error when the result differs from `expected`. Failures are only traced, they do not abort the run. **/ function eq(expected:Float, s:String) { 26 | ++numTests; 27 | var lexer = new ArithmeticParser.ArithmeticLexer(byte.ByteData.ofString(s)); 28 | var ts = new hxparse.LexerTokenSource(lexer, ArithmeticParser.ArithmeticLexer.tok); 29 | var parser = new ArithmeticParser(ts); 30 | var result = ArithmeticParser.ArithmeticEvaluator.eval(parser.parse()); 31 | if (expected != result) { 32 | trace('Error in "$s"; expected $expected but was $result'); 33 | } 34 | } 35 | eq(1, "1"); 36 | eq(2, "1 + 1"); 37 | eq(6, "2 * 3"); 38 | eq(2, "6 / 3"); 39 | eq(1.5, "3 / 2"); 40 | eq(10, "2 * 3 + 4"); 41 | eq(14, "2 * (3 + 4)"); 42 | eq(18, "9 + (3 * 4) - 3 / (1 * 1)"); 43 | eq(-9, "-9"); 44 | eq(-12, "-(4 + 8)"); 45 | eq(12, "--12"); 46 | eq(8, "2*(3-(2+(-3)))"); 47 | 48 | /* NOTE(review): `haxe.Timer.stamp()` is documented to return seconds, so `diff` is in seconds although the message below says ms - confirm and adjust the unit. `diff` also covers the printf, JSON and unicode runs above, not only the `eq` calls. */ var diff = haxe.Timer.stamp() - t0; 49 | trace('Done $numTests tests in $diff ms'); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /test/UnicodeTestLexer.hx: -------------------------------------------------------------------------------- 1 | package ; 2 | 3 | import hxparse.Lexer; 4 | import hxparse.RuleBuilder; 5 | import haxe.Utf8; 6 | 7 | /** 8 | * ... 
9 | * @author Skial Bainn 10 | */ 11 | class UnicodeTestLexer extends Lexer implements RuleBuilder { 12 | 13 | /** Rules covering literal non-ASCII characters, unicode escapes, a character class with non-ASCII members, and patterns written with decimal and hexadecimal escapes (presumably byte values - confirm against LexEngine). Every rule returns the matched text, `lexer.current`. There is no end-of-input rule. **/ public static var root = @:rule [ 14 | 'â' => lexer.current, 15 | 'ê' => lexer.current, 16 | 'ù' => lexer.current, 17 | "あ𠀀" => lexer.current, 18 | '\u00CA' => lexer.current, // Ê 19 | '\u20AC' => lexer.current, // € 20 | '\u{29e3d}' => lexer.current, // 𩸽 21 | '[ a-zA-Z0-9ÀÁÂÔÕÖØÙÚÛÜÝÞßàáãäåæçèéëìíîïðñòóôõöøúûüýþÿ№あ𠀀]' => lexer.current, 22 | '\\195[\\131-\\139]' => lexer.current, 23 | '\\xC3[\\x8c-\\x93]' => lexer.current, 24 | //'[Ã-Ë]' => lexer.current 25 | ]; 26 | 27 | } 28 | --------------------------------------------------------------------------------