├── .gitignore ├── COPYING ├── README.md ├── captures.go ├── comparison ├── comparison.html ├── highlight.js └── style.css ├── instructions.go ├── match.go ├── match_test.go └── peg.go /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .*.swp 3 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2010 Markus Jarderot 4 | Copyright (c) 2013 Harley Laue 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pego 2 | ==== 3 | 4 | This is a pattern matching library for Go. It is based on lpeg, which uses a flavor of PEG. 
5 | 6 | This is the official continuation of the project of the same name started by Markus Jarderot. 7 | He wrote the implementation and I've updated the project to work with newer Go versions. 8 | 9 | The original project page is located here: https://code.google.com/p/pego/ 10 | 11 | ## Example 12 | ```go 13 | pat := Grm("S", map[string]*Pattern{ 14 | "S": Ref("A").Clist(), 15 | "A": Seq( 16 | NegSet("()").Rep(0, -1), 17 | Seq( 18 | Ref("B"), 19 | NegSet("()").Rep(0, -1), 20 | ).Rep(0, -1)).Csimple(), 21 | "B": Seq( 22 | "(", Ref("A"), ")"), 23 | }) 24 | ``` 25 | 26 | ## More information 27 | * [LPeg - Parsing Expression Grammars For Lua](http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html) - Source of inspiration 28 | * [A Text Pattern-Matching Tool based on Parsing Expression Grammars](http://www.inf.puc-rio.br/~roberto/docs/peg.pdf) - Paper on the implementation of LPeg. 29 | -------------------------------------------------------------------------------- /captures.go: -------------------------------------------------------------------------------- 1 | // vim: ff=unix ts=3 sw=3 noet 2 | 3 | package pego 4 | 5 | import ( 6 | "errors" 7 | "fmt" 8 | "regexp" 9 | "strconv" 10 | "strings" 11 | ) 12 | 13 | // Interface for all capture handlers 14 | type CaptureHandler interface { 15 | Process(input string, start, end int, captures *CapStack, subcaps int) (interface{}, error) 16 | } 17 | 18 | // Captures the matched substring 19 | type SimpleCapture struct{} 20 | 21 | func (h *SimpleCapture) String() string { return "simple" } 22 | func (h *SimpleCapture) Process(input string, start, end int, captures *CapStack, subcaps int) (interface{}, error) { 23 | return input[start:end], nil 24 | } 25 | 26 | // Captures the current input position 27 | type PositionCapture struct{} 28 | 29 | func (h *PositionCapture) String() string { return "position" } 30 | func (h *PositionCapture) Process(input string, start, end int, captures *CapStack, subcaps int) (interface{}, error) { 31 
| return start, nil 32 | } 33 | 34 | // Captures a constant value 35 | type ConstCapture struct { 36 | value interface{} 37 | } 38 | 39 | func (h *ConstCapture) String() string { 40 | return fmt.Sprintf("const(%v)", h.value) 41 | } 42 | func (h *ConstCapture) Process(input string, start, end int, captures *CapStack, subcaps int) (interface{}, error) { 43 | return h.value, nil 44 | } 45 | 46 | // Captures a list of all sub-captures 47 | type ListCapture struct{} 48 | 49 | func (h *ListCapture) String() string { return "list" } 50 | func (h *ListCapture) Process(input string, start, end int, captures *CapStack, subcaps int) (interface{}, error) { 51 | subs := captures.Pop(subcaps) 52 | ret := make([]interface{}, len(subs)) 53 | for i := range subs { 54 | ret[i] = subs[i].value 55 | } 56 | return ret, nil 57 | } 58 | 59 | // Calls a function with all sub-captures, and captures the return value. 60 | // If functions reports an error, let it bubble up. 61 | type FunctionCapture struct { 62 | function func([]*CaptureResult) (interface{}, error) 63 | } 64 | 65 | func (h *FunctionCapture) String() string { return "function" } 66 | func (h *FunctionCapture) Process(input string, start, end int, captures *CapStack, subcaps int) (interface{}, error) { 67 | subs := captures.Pop(subcaps) 68 | return h.function(subs) 69 | } 70 | 71 | // Capture a string created from a format applied to the sub-captures. 
72 | type StringCapture struct { 73 | format string 74 | } 75 | 76 | func (h *StringCapture) String() string { 77 | return fmt.Sprintf("string(%q)", h.format) 78 | } 79 | func (h *StringCapture) Process(input string, start, end int, captures *CapStack, subcaps int) (interface{}, error) { 80 | subs := captures.Pop(subcaps) 81 | p := regexp.MustCompile(`{[0-9]+}|{{|{}`) 82 | var err error 83 | ret := p.ReplaceAllStringFunc(h.format, func(s string) string { 84 | switch s[1] { 85 | case '{': 86 | return "{" 87 | case '}': 88 | return "}" 89 | } 90 | if err != nil { 91 | return "" 92 | } 93 | var i int 94 | i, err = strconv.Atoi(s[1 : len(s)-1]) 95 | if err == nil && i >= len(subs) { 96 | err = errors.New("String format number out of range") 97 | } 98 | if err != nil { 99 | return "" 100 | } 101 | return fmt.Sprintf("%v", subs[i].value) 102 | }) 103 | return ret, err 104 | } 105 | 106 | // Capture a string, with all sub-captures replaced by their string-representation. 107 | // XXX(mizardx): Better explaination? 108 | type SubstCapture struct{} 109 | 110 | func (h *SubstCapture) String() string { return "subst" } 111 | func (h *SubstCapture) Process(input string, start, end int, captures *CapStack, subcaps int) (interface{}, error) { 112 | subs := captures.Pop(subcaps) 113 | ret := make([]string, 0) 114 | pos := start 115 | for _, c := range subs { 116 | if c.start > pos { 117 | ret = append(ret, input[pos:c.start]) 118 | } 119 | ret = append(ret, fmt.Sprintf("%v", c.value)) 120 | pos = c.end 121 | } 122 | if pos < end { 123 | ret = append(ret, input[pos:end]) 124 | } 125 | return strings.Join(ret, ""), nil 126 | } 127 | -------------------------------------------------------------------------------- /comparison/comparison.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Comparison between pego and LPeg 5 | 6 | 7 | 8 | 9 | 10 |

Comparison between pego and LPeg

11 | 12 | 29 | 30 |

Low-level stuff

31 | 32 |

Low-level stuff interpreted by the virtual machine.

33 | 34 |

Instructions

35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 46 | 47 | 48 | 49 | 52 | 53 | 54 | 55 | 58 | 59 | 60 | 61 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 78 | 79 | 80 | 81 | 84 | 85 | 86 | 87 | 90 | 91 | 92 | 93 | 96 | 97 | 98 | 99 | 102 | 103 | 104 | 105 | 110 | 111 | 112 | 113 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 141 | 142 | 143 | 144 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 159 | 160 | 161 | 162 | 165 | 166 | 167 | 168 | 169 | 170 |
LPegpego
enum Opcode IAny;
type IAny struct {
 44 |    count int
 45 | }
enum Opcode IChar;
type IChar struct {
 50 |    char byte
 51 | }
enum Opcode ISet;
type ICharset struct {
 56 |    mask [8]uint32
 57 | }
enum Opcode ISpan;
type ISpan struct {
 62 |    ICharset // embedding
 63 | }
enum Opcode IRet;
type IReturn struct{}
enum Opcode IEnd;
type IEnd struct{}
enum Opcode IChoice;
type IChoice struct {
 76 |    offset int
 77 | }
enum Opcode IJmp;
type IJump struct {
 82 |    offset int
 83 | }
enum Opcode ICall;
type ICall struct {
 88 |    offset int
 89 | }
enum Opcode IOpenCall;
type IOpenCall struct {
 94 |    name string
 95 | }
enum Opcode ICommit;
type ICommit struct {
100 |    offset int
101 | }
enum Opcode IPartialCommit;
106 |
type IPartialCommit struct {
107 |    offset int
108 | }
109 |
enum Opcode IBackCommit;
114 |
type IBackCommit struct {
115 |    offset int
116 | }
117 |
enum Opcode IFailTwice;
type IFailTwice struct{}
enum Opcode IFail;
type IFail struct{}
enum Opcode IGiveup;
type IGiveUp struct{}
enum Opcode IFunc;
Won't do
enum Opcode IFullCapture;
type IFullCapture struct {
138 |    capOffset int 
139 |    handler CaptureHandler
140 | }
enum Opcode IEmptyCapture;
type IEmptyCapture struct {
145 |    capOffset int 
146 |    handler CaptureHandler
147 | }
enum Opcode IEmptyCaptureIdx;
Undecided
enum Opcode IOpenCapture;
type IOpenCapture struct {
156 |    capOffset int 
157 |    handler CaptureHandler
158 | }
enum Opcode ICloseCapture;
type ICloseCapture struct {
163 |    capOffset int
164 | }
enum Opcode ICloseRunTime;
To do
171 | ^top 172 | 173 |

Captures

174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 |
LPegpego
enum CapKind Cclose;
Not needed
enum CapKind Cposition;
type PositionCapture struct{}
enum CapKind Cconst;
type ConstCapture struct {
191 |    value interface{}
192 | }
enum CapKind Cbackref;
To do
enum CapKind Carg;
To do
enum CapKind Csimple;
type SimpleCapture struct{}
enum CapKind Ctable;
type ListCapture struct{}
enum CapKind Cfunction;
type FunctionCapture struct {
213 |    function func([]*CaptureResult) (interface{}, error)
214 | }
enum CapKind Cquery;
To do
enum CapKind Cstring;
type StringCapture struct {
223 |    format string
224 | }
enum CapKind Csubst;
type SubstCapture struct{}
enum CapKind Cfold;
Undecided
enum CapKind Cruntime;
To do
enum CapKind Cgroup;
To do
243 | ^top 244 | 245 |

Mid-level stuff

246 | 247 |

Used for building patterns.

248 | ^top 249 | 250 |

Constructors

251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 |
LPegpegoComment
lpeg.P(pattern)
P(pattern)
Return the pattern unmodified
lpeg.P(string)
Pat(string) = Lit(string)
Literal match
lpeg.P(number)
Pat(number) = Any(number)
Match exactly number characters, whatever they are.
lpeg.P(-number)
Pat(-number) = Not(Any(number))
Assert that fewer than number characters remain in the input.
lpeg.P(true)
Pat(true) = Succ()
Always match.
lpeg.P(false)
Pat(false) = Fail()
Never match.
lpeg.P(table)
Grm(string, map[string]*Pattern)
Compile a grammar.
lpeg.P(function)
Everything is a match-time capture. Should be changed.Match-time capture.
lpeg.R(...)
To do.Character ranges.
lpeg.S(string)
Set(string)
Character set.
lpeg.V(string)
Ref(string)
Non-terminal.
lpeg.locale([table])
UndecidedPredefined locale-dependent sets.
319 | ^top 320 | 321 |

Captures

322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 |
LPegpegoComments
lpeg.C(pattern)
Csimple(pattern) = pattern.Csimple()
Captures the matched substring.
lpeg.Carg(n)
UndecidedCaptures the n'th extra argument to the matching function.
lpeg.Cb(name)
To doBackreference.
lpeg.Cc(value)
Cconst(value)
Captures the given value.
lpeg.Cf(patt, func)
UndecidedFolding of func over the captures of patt.
lpeg.Cg(patt, [name])
UndecidedThe captures of patt, optionally tagged with name.
lpeg.Cp()
Cposition()
Captures the input position.
lpeg.Cs(patt)
Csubst(patt) = patt.Csubst()
Nested captures replace their matches.
lpeg.Ct(patt)
Clist(patt) = patt.Clist()
A table (list) with all captures from patt.
patt / string
Cstring(patt, format)
The captures of patt are used as arguments to format a string.
patt / table
To doThe first capture of patt is used to index the table.
patt / function
Cfunction(patt, func) = patt.Cfunction(func)
The captures of patt are used as arguments to the function.
lpeg.Cmt(patt, function)
UndecidedLike patt / function, except that it is executed immediately.
395 | ^top 396 | 397 |

Methods/Operators

398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 |
LPegpegoComment
#patt
And(patt)
Assert that patt matches.
-patt
Not(patt)
patt1 + patt2 + patt3
Or(patt1,patt2,patt3) = patt1.Or(patt2,patt3)
Ordered choice.
patt1 - patt2
Seq(Not(patt2),patt1) = patt1.Exc(patt2)
patt1 * patt2
Seq(patt1,patt2)
Sequence of matches.
patt ^ n
Rep(patt,min,max) = patt.Rep(min,max)
Repetition.
436 | ^top 437 | 438 |

High-level stuff

439 | 440 |

PEG grammar.

441 | ^top 442 | 443 |

PEG grammar and expressions

444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 |
LPegpegoComment
(p)
To doGrouping.
'string'
To doLiteral string.
"string"
To doLiteral string.
[class]
To doCharacter class.
.
To doAny character.
%name
To doPredefined pattern.
<name>
To doNon-terminal.
{}
To doPosition capture.
{ p }
To doSimple capture.
{: p :}
To doAnonymous group capture.
{:name: p :}
To doNamed group capture.
{~ p ~}
To doSubstitution capture.
=name
To doBack reference.
p ?
To doOptional match.
p *
To doZero or more repetitions.
p +
To doOne or more repetitions.
p ^ n
To doExactly n repetitions.
p ^ +n
To doAt least n repetitions.
p ^ -n
To doAt most n repetitions.
p -> 'string'
To doString capture.
p -> "string"
To doString capture.
p -> {}
To doTable capture.
p -> name
To doFunction/query/string capture, with name pulled from elsewhere.
p => name
To doMatch-time capture.
& p
To doAnd predicate.
! p
To doNot predicate.
p1 p2
To doSequence.
p1 / p2
To doOrdered choice.
name <- p
To doGrammar
597 | ^top 598 | 599 | 607 | 608 | 609 | -------------------------------------------------------------------------------- /comparison/highlight.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | // Mapping of language -> class name -> list of string/regex 3 | var languages = { 4 | "c": { 5 | "keyword": [ 6 | "auto", "break", "case", "char", "const", "continue", "default", "do", 7 | "double", "else", "enum", "extern", "float", "for", "goto", "if", "int", 8 | "long", "register", "return", "short", "signed", "sizeof", "static", 9 | "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", 10 | "while" 11 | ], 12 | "separator": [ 13 | "~", "}", "||", "|=", "|", "{", "^=", "^", "]", "[", "?", ">>=", ">>", ">=", 14 | ">", "==", "=", "<=", "<<=", "<<", "<", ";", ":", "/=", "/", "...", ".", 15 | "->", "-=", "--", "-", ",", "+=", "++", "+", "*=", "*", ")", "(", "&=", 16 | "&&", "&", "%=", "%", "##", "#", "!=", "!" 17 | ], 18 | "type": [ 19 | "auto", "char", "const", "double", "extern", "float", "int", "int16_t", 20 | "int32_t", "int64_t", "int8_t", "long", "register", "short", "signed", 21 | "uint16_t", "uint32_t", "uint64_t", "uint8_t", "unsigned", "volatile" 22 | ], 23 | "value": [ 24 | /"(?:\\.|[^"\\])*"/, // string literal 25 | /'(?:\\.|[^"\\])*'/, // char literal 26 | /(?:\b\d+\.\d+|\.\d+)(?:E[+-]?\d+)?\b/i, // floating point literal (1) 27 | /\b\d+(?:\.?E[+-]?\d+\b|\.)/i, // floating point literal (2) 28 | /\b[1-9]\d+\b/, // decimal integer literal 29 | /\b0x[0-9a-f]+\b/i, // hexadecimal integer literal 30 | /\b0[0-7]*\b/ // octal integer literal 31 | ], 32 | "comment": [ 33 | /\/\*[\s\S]*?\*\//, // Multi-line comment 34 | /\/\/.*/, // Single-line comment 35 | /#.*(?:\\\n.*)/ // Pre-processor instruction 36 | ] 37 | }, 38 | "lua": { 39 | "keyword": [ 40 | "and", "break", "do", "else", "elseif", "end", "for", "function", "if", 41 | "in", "local", "not", "or", "repeat", "return", "then", "until", 
"while" 42 | ], 43 | "separator": [ 44 | "~=", "}", "{", "^", "]", "[", ">=", ">", "==", "=", "<=", "<", ";", ":", 45 | "/", "...", "..", ".", "-", ",", "+", "*", ")", "(", "%", "#" 46 | ], 47 | "value": [ 48 | /"(?:\\.|\\\n|[^\\"])*"/, 49 | /'(?:\\.|\\\n|[^\\"])*'/, 50 | /\[(=*)\[(?:.|\n)*?]\1]/, 51 | /\b\d\+(?:\.\d+)?(?:E[+-]?\d+)?\b/i, 52 | /\b0x[0-9a-f]+\b/i 53 | ], 54 | "comment": [ 55 | /--(?!\[=*\[).*/, 56 | /--\[(=*)\[(?:.|\n)*?]\1]/ 57 | ] 58 | }, 59 | "go": { 60 | "keyword": [ 61 | "break", "default", "func", "interface", "select", "case", "defer", "go", 62 | "map", "struct", "chan", "else", "goto", "package", "switch", "const", 63 | "fallthrough", "if", "range", "type", "continue", "for", "import", "return", 64 | "var" 65 | ], 66 | "type": [ 67 | "uintptr", "uint8", "uint64", "uint32", "uint16", "uint", "string", "int8", 68 | "int64", "int32", "int16", "int", "float64", "float32", "float", 69 | "complex64", "complex128", "complex", "byte" 70 | ], 71 | "separator": [ 72 | "}", "||", "|=", "|", "{", "^=", "^", "]", "[", ">>=", ">>", ">=", ">", 73 | "==", "=", "<=", "<<=", "<<", "<-", "<", ";", ":=", ":", "/=", "/", "...", 74 | ".", "-=", "--", "-", ",", "+=", "++", "+", "*=", "*", ")", "(", "&^=", "&^", 75 | "&=", "&&", "&", "%=", "%", "!=", "!", 76 | ], 77 | "value": [ 78 | /"(?:\\.|[^\\"])*"/, // string literal 79 | /'(?:\\.|[^\\'])+'/, // character literal 80 | /`[^`]*`/, // raw string literal 81 | /\b(?:[1-9]\d*i?|0i?|0x[0-9a-f]+|0[0-7]+)\b/i, // integer literal 82 | /(?:\b\d+\.\d+|\.\d+)(?:E[+-]?\d+)?i?\b/i, // floating point literal (1) 83 | /\b\d+(?:\.?E[+-]?\d+i?\b|\.i\b|\.)/i, // floating point literal (2) 84 | ], 85 | "comment": [ 86 | /\/\*[\s\S]*?\*\//, // Multi-line comment 87 | /\/\/.*/ // Single-line comment 88 | ] 89 | } 90 | }; 91 | 92 | var escapeRegexp = function(s) { 93 | var meta = /[\\^$*+?.()|{[]/g; 94 | var escaper = function(match) { 95 | return "\\" + match; 96 | }; 97 | return s.replace(meta, escaper); 98 | } 99 | 100 | var 
compile = function(strings) { 101 | var regex = []; 102 | strings.sort(function(a,b) { 103 | // longest first 104 | if (a.length < b.length) { 105 | return 1; 106 | } 107 | else if (a.length > b.length) { 108 | return -1; 109 | } 110 | else { 111 | return 0; 112 | } 113 | }); 114 | for (var i = 0; i < strings.length; i++) { 115 | var p = escapeRegexp(strings[i]); 116 | if (/^\w+$/.test(strings[i])) { 117 | p = "\\b" + p + "\\b"; 118 | } 119 | regex.push(p); 120 | } 121 | return new RegExp(regex.join("|"), "g"); 122 | } 123 | 124 | var patcmp = function(a, b) { 125 | // first then longest 126 | if (a.index < b.index) { 127 | return -1; 128 | } 129 | else if (a.index > b.index) { 130 | return 1; 131 | } 132 | else if (a.size > b.size) { 133 | return -1; 134 | } 135 | else if (a.size < b.size) { 136 | return 1; 137 | } 138 | else { 139 | return 0; 140 | } 141 | } 142 | 143 | var rebuildRegExp = function(source, ignoreCase, multiLine) { 144 | var flags = "g"; 145 | if (ignoreCase) { flags += "i"; } 146 | if (multiLine) { flags += "m"; } 147 | return new RegExp(source, flags); 148 | } 149 | 150 | var highlight = function(name, node, patterns) { 151 | if (node.nodeType == 3) { 152 | var text = node.data; 153 | //~console.log("Highlighting:", text); 154 | var positions = []; 155 | for (var i = 0; i < patterns.length; i++) { 156 | var e = patterns[i]; 157 | e.re.lastIndex = 0; 158 | var match = e.re.exec(text); 159 | if (match == null) { 160 | //~console.log("Could not find any", name, e.name, e.count, ":", e.re); 161 | continue; 162 | } 163 | positions.push({ 164 | "re": e.re, 165 | "name": e.name, 166 | "count": e.count, 167 | "index": match.index, 168 | "size": match[0].length 169 | }); 170 | } 171 | var offset = 0; // position of last split 172 | var pos = 0; // start of next search 173 | var count = 0; 174 | while (positions.length > 0) { 175 | positions.sort(patcmp); 176 | var e = positions[0]; 177 | if (count > 10) { 178 | //~console.log("aborting..."); 179 | 
//~console.log("regex:", e.re); 180 | break; 181 | } 182 | count++; 183 | //~console.log("Highlighting", name, e.name, e.count, ":", text.substr(e.index, e.size), "( i:", e.index, "s:", e.size, ")"); 184 | if (e.size > 0) { 185 | var middleBit; 186 | if (e.index > offset) { 187 | middleBit = node.splitText(e.index-offset); 188 | } 189 | else { 190 | middleBit = node; 191 | } 192 | var endBit = middleBit.splitText(e.size); 193 | var span = document.createElement("span"); 194 | span.className = e.name; 195 | span.appendChild(middleBit.cloneNode(true)); 196 | middleBit.parentNode.replaceChild(span, middleBit); 197 | node = endBit; 198 | offset = e.index + e.size; 199 | } 200 | pos = e.index + e.size; 201 | if (e.size == 0) { pos++; } 202 | for (var i = 0; i < positions.length; i++) { 203 | var e = positions[i]; 204 | if (e.index >= offset) { break } 205 | e.re.global = true; 206 | e.re.lastIndex = pos; 207 | var match = e.re.exec(text); 208 | if (match == null) { 209 | positions.splice(i,1); 210 | i--; 211 | } 212 | else { 213 | //~console.log("from", e.index, pos, "next", e.name, e.count, "at", match.index, ":", match[0]); 214 | positions[i].index = match.index; 215 | positions[i].size = match[0].length; 216 | } 217 | } 218 | } 219 | } 220 | else if (node.nodeType == 1 && node.childNodes && 221 | !/^(?:script|style)$/i.test(node.tagName)) { 222 | var children = node.childNodes; 223 | for (var i = 0; i < children.length; i++) { 224 | highlight(name, children[i], patterns); 225 | } 226 | } 227 | } 228 | 229 | var prepareCache = {} 230 | var prepare = function(name, rules) { 231 | var patterns = prepareCache[name]; 232 | if (patterns !== undefined) { 233 | return patterns; 234 | } 235 | patterns = []; 236 | for (var key in rules) { 237 | if (!rules.hasOwnProperty(key) || rules[key].length == 0) { 238 | continue; 239 | } 240 | var values = rules[key]; 241 | var strings = []; 242 | var regexes = []; 243 | for (var j = 0; j < values.length; j++) { 244 | var v = values[j]; 
245 | if (typeof v == "string" || v instanceof String) { 246 | strings.push(v); 247 | } 248 | else if (v instanceof RegExp) { 249 | regexes.push(v); 250 | } 251 | } 252 | count = 1; 253 | if (strings.length > 0) { 254 | patterns.push({ 255 | "re": compile(strings), 256 | "name": key, 257 | "count": count, 258 | "index": -1, 259 | "size": -1 260 | }); 261 | count++; 262 | } 263 | for (var j = 0; j < regexes.length; j++) { 264 | var re = regexes[j]; 265 | if (!re.global) { 266 | re.global = true; 267 | if (!re.global) { 268 | // Unable to modify flags (Google Chrome) 269 | re = rebuildRegExp(re.source,re.ignoreCase,re.multiline); 270 | } 271 | } 272 | patterns.push({ 273 | "re": re, 274 | "name": key, 275 | "count": count, 276 | "index": -1, 277 | "size": -1 278 | }); 279 | count++; 280 | } 281 | } 282 | prepareCache[name] = patterns; 283 | return patterns; 284 | } 285 | 286 | var HighlightNode = function(node, lang) { 287 | if (languages[lang] === undefined) { return; } 288 | var patterns = prepare(lang,languages[lang]); 289 | highlight(lang, node, patterns); 290 | } 291 | 292 | window.HighlightNode = HighlightNode; 293 | })(); 294 | -------------------------------------------------------------------------------- /comparison/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: #000; 3 | color: #FFF; 4 | } 5 | a { 6 | text-decoration: none; 7 | color: #87CEEB; 8 | } 9 | a:hover { 10 | text-decoration: underline; 11 | } 12 | .comparison td { 13 | padding: 0; 14 | vertical-align: top; 15 | } 16 | .comparison td pre { 17 | background-color: #333; 18 | color: #FFF; 19 | padding: 5px; 20 | margin: 0; 21 | } 22 | .separator { 23 | font-weight: bold; 24 | } 25 | .type { 26 | font-weight: bold; 27 | color: #BDB76B; 28 | } 29 | .value { 30 | color: #FFA0A0; 31 | } 32 | .keyword { 33 | font-weight: bold; 34 | color: #F0E68C; 35 | } 36 | .comment { 37 | color: #87CEEB; 38 | } 39 | 
-------------------------------------------------------------------------------- /instructions.go: -------------------------------------------------------------------------------- 1 | // vim: ff=unix ts=3 sw=3 noet 2 | 3 | package pego 4 | 5 | import ( 6 | "fmt" 7 | "strings" 8 | ) 9 | 10 | // Interface for instructions. 11 | // XXX(mizardx): Make this more specific? 12 | type Instruction interface { 13 | fmt.Stringer 14 | } 15 | 16 | // Match a single character. 17 | type IChar struct { 18 | char byte 19 | } 20 | 21 | func (op *IChar) String() string { 22 | return fmt.Sprintf("Char %#02x", op.char) 23 | } 24 | 25 | // Relative jump. 26 | type IJump struct { 27 | offset int 28 | } 29 | 30 | func (op *IJump) String() string { 31 | return fmt.Sprintf("Jump %+d", op.offset) 32 | } 33 | 34 | // Add a fallback point at offset, and continue on next instruction. 35 | type IChoice struct { 36 | offset int 37 | } 38 | 39 | func (op *IChoice) String() string { 40 | return fmt.Sprintf("Choice %+d", op.offset) 41 | } 42 | 43 | // Unresolved call. Used by grammars. 44 | type IOpenCall struct { 45 | name string 46 | } 47 | 48 | func (op *IOpenCall) String() string { 49 | return fmt.Sprintf("OpenCall %q", op.name) 50 | } 51 | 52 | // Push return address to stack, and do a relative jump. 53 | type ICall struct { 54 | offset int 55 | } 56 | 57 | func (op *ICall) String() string { 58 | return fmt.Sprintf("Call %+d", op.offset) 59 | } 60 | 61 | // Pop a fallback point, and do a relative jump. 62 | type ICommit struct { 63 | offset int 64 | } 65 | 66 | func (op *ICommit) String() string { 67 | return fmt.Sprintf("Commit %+d", op.offset) 68 | } 69 | 70 | // Update top stack entry, and do a relative jump. 71 | type IPartialCommit struct { 72 | offset int 73 | } 74 | 75 | func (op *IPartialCommit) String() string { 76 | return fmt.Sprintf("PartialCommit %+d", op.offset) 77 | } 78 | 79 | // Backtracks to a fallback-point, and does a relative jump. 
80 | type IBackCommit struct { 81 | offset int 82 | } 83 | 84 | func (op *IBackCommit) String() string { 85 | return fmt.Sprintf("BackCommit %+d", op.offset) 86 | } 87 | 88 | // Pop a return address from the stack and jump to it. 89 | type IReturn struct{} 90 | 91 | func (op *IReturn) String() string { return "Return" } 92 | 93 | // Roll back to closest fallback point. 94 | type IFail struct{} 95 | 96 | func (op *IFail) String() string { return "Fail" } 97 | 98 | // Roll back to the next closest fallback point. 99 | type IFailTwice struct{} 100 | 101 | func (op *IFailTwice) String() string { return "FailTwice" } 102 | 103 | // End of program (Last instuction only) 104 | type IEnd struct{} 105 | 106 | func (op *IEnd) String() string { return "End" } 107 | 108 | // Stop matching and return a negative result. 109 | type IGiveUp struct{} 110 | 111 | func (op *IGiveUp) String() string { return "GiveUp" } 112 | 113 | // Open a new capture 114 | type IOpenCapture struct { 115 | capOffset int 116 | handler CaptureHandler 117 | } 118 | 119 | func (op *IOpenCapture) String() string { 120 | return fmt.Sprintf("Capture open %+d (%v)", -op.capOffset, op.handler) 121 | } 122 | 123 | // Close the nearest open capture 124 | type ICloseCapture struct { 125 | capOffset int 126 | } 127 | 128 | func (op *ICloseCapture) String() string { 129 | return fmt.Sprintf("Capture close %+d", -op.capOffset) 130 | } 131 | 132 | // Open and close a new capture of fixed size 133 | type IFullCapture struct { 134 | capOffset int 135 | handler CaptureHandler 136 | } 137 | 138 | func (op *IFullCapture) String() string { 139 | return fmt.Sprintf("Capture full %+d (%s)", -op.capOffset, op.handler) 140 | } 141 | 142 | // Open and close a new empty capture 143 | type IEmptyCapture struct { 144 | capOffset int 145 | handler CaptureHandler 146 | } 147 | 148 | func (op *IEmptyCapture) String() string { 149 | return fmt.Sprintf("Capture empty (%s)", op.handler) 150 | } 151 | 152 | // Match a character from a 
// set. ICharset stores the set as a 256-bit bitmap: byte c is a member
// when bit c&0x1F of word chars[c>>5] is set.
// TODO(mizardx): Unicode?
type ICharset struct {
	chars [8]uint32
}

// String renders the set as "Charset [ ... ]", or "Charset [^ ... ]" when
// byte 0x00 is a member, in which case the bytes listed are the ones that
// are NOT in the set. Consecutive bytes are collapsed: a run of one or two
// is listed byte by byte, a longer run as "lo-hi". Printable ASCII other
// than space is shown literally, everything else as \xHH.
func (op *ICharset) String() string {
	// Membership of byte 0 decides whether the set is printed negated.
	def := op.chars[0] & 1
	opening := "["
	if def != 0 {
		opening = "[^"
	}
	parts := []string{"Charset", opening}

	fmtChar := func(char int) string {
		if 32 < char && char < 127 {
			return fmt.Sprintf("%c", char)
		}
		return fmt.Sprintf("\\x%02X", char)
	}

	// start is the first byte of the run being collected, or -1 when no
	// run is open. The extra iteration at i == 256 flushes a run that
	// reaches the last byte.
	start := -1
	for i := 0; i <= 256; i++ {
		if i < 256 && (op.chars[i>>5]>>uint(i&0x1F))&1 != def {
			if start == -1 {
				start = i
			}
			continue
		}
		if start == -1 {
			continue
		}
		last := i - 1
		switch last - start {
		case 0:
			parts = append(parts, fmtChar(start))
		case 1:
			parts = append(parts, fmtChar(start), fmtChar(last))
		default:
			parts = append(parts, fmt.Sprintf("%s-%s", fmtChar(start), fmtChar(last)))
		}
		start = -1
	}

	// Close the bracket opened above.
	parts = append(parts, "]")
	return strings.Join(parts, " ")
}

// Charset contains character?
205 | func (op *ICharset) Has(char byte) bool { 206 | return op.chars[int(char>>5)]&(1<>5), int(hi>>5) 212 | lobit, hibit := uint(lo&0x1F), uint(hi&0x1F) 213 | for i := lobyt; i <= hibyt; i++ { 214 | if lobyt <= i && i <= hibyt { 215 | mask := ^uint32(0) 216 | if i == lobyt { 217 | mask &^= (1 << lobit) - 1 218 | } 219 | if i == hibyt { 220 | mask &^= (^uint32(1)) << hibit 221 | } 222 | op.chars[i] |= mask 223 | } 224 | } 225 | } 226 | 227 | // Negate the whole set 228 | func (op *ICharset) negate() { 229 | for i := range op.chars { 230 | op.chars[i] = ^op.chars[i] 231 | } 232 | } 233 | 234 | // Match zero or more characters from a set 235 | type ISpan struct { 236 | ICharset 237 | } 238 | 239 | func (op *ISpan) String() string { 240 | s := (&op.ICharset).String() 241 | i := strings.Index(s, " ") 242 | return "ISpan" + s[i:] 243 | } 244 | 245 | // Match `count` of any character 246 | type IAny struct { 247 | count int 248 | } 249 | 250 | func (op *IAny) String() string { 251 | return fmt.Sprintf("Any x %d", op.count) 252 | } 253 | -------------------------------------------------------------------------------- /match.go: -------------------------------------------------------------------------------- 1 | // vim: ff=unix ts=3 sw=3 noet 2 | 3 | package pego 4 | 5 | import ( 6 | "errors" 7 | "fmt" 8 | "strings" 9 | ) 10 | 11 | // Call/fallback stack 12 | 13 | type StackEntry struct { 14 | p, i, c int 15 | } 16 | 17 | type Stack struct { 18 | slice []interface{} 19 | } 20 | 21 | func (s *Stack) Len() int { 22 | return len(s.slice) 23 | } 24 | 25 | func (s *Stack) Pop() interface{} { 26 | var i interface{} 27 | i, s.slice = s.slice[len(s.slice)-1], s.slice[:len(s.slice)-1] 28 | return i 29 | } 30 | 31 | func (s *Stack) Push(i interface{}) { 32 | s.slice = append(s.slice, i) 33 | } 34 | 35 | func (s *Stack) At(i int) interface{} { 36 | return s.slice[i] 37 | } 38 | 39 | func (s *Stack) String() string { 40 | ret := make([]string, 0) 41 | //ret.Push("[") 42 | ret = 
append(ret, "[") 43 | for _, v := range s.slice { 44 | switch v := v.(type) { 45 | case *StackEntry: 46 | //ret.Push(fmt.Sprintf("%v", *v)) 47 | ret = append(ret, fmt.Sprintf("%v", *v)) 48 | default: 49 | //ret.Push(fmt.Sprintf("%v", v)) 50 | ret = append(ret, fmt.Sprintf("%v", v)) 51 | } 52 | } 53 | //ret.Push("]") 54 | ret = append(ret, "]") 55 | return strings.Join(ret, " ") 56 | } 57 | 58 | // === Capture stack === 59 | 60 | type CapStack struct { 61 | data []*CaptureEntry 62 | top int 63 | } 64 | 65 | type CaptureEntry struct { 66 | p, start, end int 67 | handler CaptureHandler 68 | value interface{} 69 | } 70 | 71 | func NewCapStack() *CapStack { 72 | return &CapStack{} 73 | } 74 | 75 | func (s *CapStack) String() string { 76 | ret := make([]string, 0) 77 | //ret.Push("[") 78 | ret = append(ret, "[") 79 | var i int 80 | for i = 0; i < s.top; i++ { 81 | //ret.Push(fmt.Sprintf("%v", s.data[i])) 82 | ret = append(ret, fmt.Sprintf("%v", s.data[i])) 83 | } 84 | //ret.Push("<|") 85 | for ; i < len(s.data); i++ { 86 | //ret.Push(fmt.Sprintf("%v", s.data[i])) 87 | ret = append(ret, fmt.Sprintf("%v", s.data[i])) 88 | } 89 | //ret.Push("]") 90 | ret = append(ret, "]") 91 | return strings.Join(ret, " ") 92 | } 93 | 94 | // Open and return an new capture 95 | func (s *CapStack) Open(p int, start int) *CaptureEntry { 96 | if s.data == nil { 97 | s.data = make([]*CaptureEntry, 8) 98 | } else if len(s.data) == s.top { 99 | newData := make([]*CaptureEntry, 2*len(s.data)+1) 100 | copy(newData, s.data) 101 | s.data = newData 102 | } 103 | s.data[s.top] = &CaptureEntry{p: p, start: start, end: -1} 104 | s.top++ 105 | return s.data[s.top-1] 106 | } 107 | 108 | // Close and return the closest open capture 109 | func (s *CapStack) Close(end int) (*CaptureEntry, int) { 110 | var i int 111 | for i = s.top - 1; i >= 0; i-- { 112 | if s.data[i].end == -1 { 113 | s.data[i].end = end 114 | return s.data[i], s.top - i - 1 115 | } 116 | } 117 | return nil, 0 118 | } 119 | 120 | // Used 
when returning the values 121 | // Similar to CaptureEntry, but without some internal values 122 | type CaptureResult struct { 123 | start, end int 124 | value interface{} 125 | } 126 | 127 | // Pop and return the top `count` captures 128 | func (s *CapStack) Pop(count int) []*CaptureResult { 129 | subcaps := make([]*CaptureResult, count) 130 | i := s.top - count 131 | for j := 0; j < count; j++ { 132 | subcaps[j] = &CaptureResult{s.data[i+j].start, s.data[i+j].end, s.data[i+j].value} 133 | } 134 | s.top -= count 135 | return subcaps 136 | } 137 | 138 | // Create and return a mark 139 | func (s *CapStack) Mark() int { 140 | return s.top 141 | } 142 | 143 | // Rollback to a previous mark 144 | func (s *CapStack) Rollback(mark int) { 145 | s.top = mark 146 | } 147 | 148 | // Main match function 149 | func Match(program *Pattern, input string) (interface{}, error, int) { 150 | const FAIL = -1 151 | var p, i, c int 152 | stack := &Stack{make([]interface{}, 0)} 153 | captures := NewCapStack() 154 | for p = 0; p < len(*program); { 155 | if p == FAIL { 156 | // Unroll stack until a fallback point is reached 157 | if stack.Len() == 0 { 158 | return nil, errors.New("Stack is empty"), i 159 | } 160 | switch e := stack.Pop().(type) { 161 | case *StackEntry: 162 | p, i, c = e.p, e.i, e.c 163 | captures.Rollback(c) 164 | case int: 165 | } 166 | continue 167 | } 168 | // fmt.Printf("%6d %s\n", p, (*program)[p]) 169 | switch op := (*program)[p].(type) { 170 | default: 171 | return nil, errors.New(fmt.Sprintf("Unimplemented: %#v", (*program)[p])), i 172 | case nil: 173 | // Noop 174 | p++ 175 | case *IChar: 176 | if i < len(input) && input[i] == op.char { 177 | p++ 178 | i++ 179 | } else { 180 | p = FAIL 181 | } 182 | case *ICharset: 183 | if i < len(input) && op.Has(input[i]) { 184 | p++ 185 | i++ 186 | } else { 187 | p = FAIL 188 | } 189 | case *ISpan: 190 | for i < len(input) && op.Has(input[i]) { 191 | i++ 192 | } 193 | p++ 194 | case *IAny: 195 | if i+op.count > len(input) { 
196 | p = FAIL 197 | } else { 198 | p++ 199 | i += op.count 200 | } 201 | case *IJump: 202 | p += op.offset 203 | case *IChoice: 204 | stack.Push(&StackEntry{p + op.offset, i, captures.Mark()}) 205 | p++ 206 | case *IOpenCall: 207 | return nil, errors.New(fmt.Sprintf("Unresolved name: %q", op.name)), i 208 | case *ICall: 209 | stack.Push(p + 1) 210 | p += op.offset 211 | case *IReturn: 212 | if stack.Len() == 0 { 213 | return nil, errors.New("Return with empty stack"), i 214 | } 215 | e, ok := stack.Pop().(int) 216 | if !ok { 217 | return nil, errors.New("Expecting return address on stack; Found failure address"), i 218 | } 219 | p = e 220 | case *ICommit: 221 | if stack.Len() == 0 { 222 | return nil, errors.New("Commit with empty stack"), i 223 | } 224 | _, ok := stack.Pop().(*StackEntry) 225 | if !ok { 226 | return nil, errors.New("Expecting failure address on stack; Found return address"), i 227 | } 228 | p += op.offset 229 | case *IPartialCommit: 230 | if stack.Len() == 0 { 231 | return nil, errors.New("PartialCommit with empty stack"), i 232 | } 233 | e, ok := stack.At(stack.Len() - 1).(*StackEntry) 234 | if !ok { 235 | return nil, errors.New("Expecting failure address on stack; Found return address"), i 236 | } 237 | e.i = i 238 | e.c = captures.Mark() 239 | p += op.offset 240 | case *IBackCommit: 241 | if stack.Len() == 0 { 242 | return nil, errors.New("BackCommit with empty stack"), i 243 | } 244 | e, ok := stack.Pop().(*StackEntry) 245 | if !ok { 246 | return nil, errors.New("Expecting failure address on stack; Found return address"), i 247 | } 248 | i = e.i 249 | captures.Rollback(e.c) 250 | p += op.offset 251 | case *IOpenCapture: 252 | e := captures.Open(p, i-op.capOffset) 253 | if op.handler == nil { 254 | e.handler = &SimpleCapture{} 255 | } else { 256 | e.handler = op.handler 257 | } 258 | p++ 259 | case *ICloseCapture: 260 | e, count := captures.Close(i - op.capOffset) 261 | v, err := e.handler.Process(input, e.start, e.end, captures, count) 262 | 
if err != nil { 263 | return nil, err, i 264 | } 265 | e.value = v 266 | p++ 267 | case *IFullCapture: 268 | e := captures.Open(p, i-op.capOffset) 269 | if op.handler == nil { 270 | e.handler = &SimpleCapture{} 271 | } else { 272 | e.handler = op.handler 273 | } 274 | captures.Close(i) 275 | v, err := e.handler.Process(input, e.start, e.end, captures, 0) 276 | if err != nil { 277 | return nil, err, i 278 | } 279 | e.value = v 280 | p++ 281 | case *IEmptyCapture: 282 | e := captures.Open(p, i-op.capOffset) 283 | if op.handler == nil { 284 | e.handler = &SimpleCapture{} 285 | } else { 286 | e.handler = op.handler 287 | } 288 | captures.Close(i - op.capOffset) 289 | v, err := e.handler.Process(input, e.start, e.end, captures, 0) 290 | if err != nil { 291 | return nil, err, i 292 | } 293 | e.value = v 294 | p++ 295 | case *IFail: 296 | p = FAIL 297 | case *IFailTwice: 298 | if stack.Len() == 0 { 299 | return nil, errors.New("IFailTwice with empty stack"), i 300 | } 301 | e, ok := stack.Pop().(*StackEntry) 302 | if !ok { 303 | return nil, errors.New("Expecting failure address on stack; Found return address"), i 304 | } 305 | i = e.i 306 | captures.Rollback(e.c) 307 | p = FAIL 308 | case *IGiveUp: 309 | return nil, nil, i 310 | case *IEnd: 311 | caps := captures.Pop(captures.top) 312 | var ret interface{} 313 | if len(caps) > 0 && caps[0] != nil { 314 | ret = caps[0].value 315 | } 316 | return ret, nil, i 317 | } 318 | } 319 | return nil, errors.New("Invalid jump or missing End instruction."), i 320 | } 321 | -------------------------------------------------------------------------------- /match_test.go: -------------------------------------------------------------------------------- 1 | package pego 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestSimpleMatch(t *testing.T) { 9 | pat := Grm("S", map[string]*Pattern{ 10 | "S": Ref("A").Clist(), 11 | "A": Seq( 12 | NegSet("()").Rep(0, -1), 13 | Seq( 14 | Ref("B"), 15 | NegSet("()").Rep(0, -1), 16 | ).Rep(0, 
-1)).Csimple(), 17 | "B": Seq( 18 | "(", Ref("A"), ")"), 19 | }) 20 | fmt.Println("Compiled pattern:") 21 | fmt.Println(pat) 22 | 23 | tests := []string{ 24 | "x", "(x)", "a(b(c)d(e)f)g", ")", 25 | } 26 | 27 | for _, s := range tests { 28 | fmt.Printf("\n\n=== MATCHING %q ===\n", s) 29 | fmt.Println("Trace:") 30 | r, err, pos := Match(pat, s) 31 | 32 | if r != nil { 33 | fmt.Printf("Return value: %v\n", r) 34 | } 35 | if err != nil { 36 | t.Errorf("Error: %#v\n", err) 37 | } 38 | 39 | fmt.Printf("End position: %d\n", pos) 40 | if pos != len(s) { 41 | t.Error("Failed to match whole input") 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /peg.go: -------------------------------------------------------------------------------- 1 | // vim: ff=unix ts=3 sw=3 noet 2 | 3 | package pego 4 | 5 | import ( 6 | "fmt" 7 | "strings" 8 | ) 9 | 10 | type Pattern []Instruction 11 | 12 | func (p *Pattern) String() string { 13 | ret := make([]string, len(*p)) 14 | for i, op := range *p { 15 | ret[i] = fmt.Sprintf("%6d %s", i, op) 16 | } 17 | return strings.Join(ret, "\n") 18 | } 19 | 20 | // List of alternatives. 21 | func (p *Pattern) Or(ps ...interface{}) *Pattern { 22 | var ret *Pattern 23 | var p2 *Pattern 24 | for i := len(ps) - 1; i >= -1; i-- { 25 | if i == -1 { 26 | p2 = p 27 | } else { 28 | p2 = Pat(ps[i]) 29 | } 30 | if ret == nil { 31 | ret = p2 32 | } else { 33 | ret = Or(p2, ret) 34 | } 35 | } 36 | return ret 37 | } 38 | 39 | // Match this pattern, except for when `pred` matches. 40 | func (p *Pattern) Exc(pred *Pattern) *Pattern { 41 | return Seq(Not(pred), p) 42 | } 43 | 44 | // Match this pattern between `min` and `max` times. 45 | // max == -1 means unlimited. 46 | func (p *Pattern) Rep(min, max int) *Pattern { 47 | return Rep(p, min, max) 48 | } 49 | 50 | // Resolve an open reference. Consider using a grammar instead. 
51 | func (p *Pattern) Resolve(name string, target *Pattern) *Pattern { 52 | return Grm("__start", map[string]*Pattern{ 53 | "__start": p, 54 | name: target, 55 | }) 56 | } 57 | 58 | // A simple capture of this pattern. 59 | func (p *Pattern) Csimple() *Pattern { 60 | return Csimple(p) 61 | } 62 | 63 | // A list capture of this pattern. 64 | func (p *Pattern) Clist() *Pattern { 65 | return Clist(p) 66 | } 67 | 68 | // A function capture of this pattern. 69 | func (p *Pattern) Cfunc(f func([]*CaptureResult) (interface{}, error)) *Pattern { 70 | return Cfunc(p, f) 71 | } 72 | 73 | // A string capture of this pattern. 74 | func (p *Pattern) Cstring(format string) *Pattern { 75 | return Cstring(p, format) 76 | } 77 | 78 | // A substition capture of this pattern. 79 | func (p *Pattern) Csubst() *Pattern { 80 | return Csubst(p) 81 | } 82 | 83 | // A sequence of values, instructions and other patterns. 84 | // (See Seq2) 85 | func Seq(args ...interface{}) *Pattern { 86 | return Seq2(args) 87 | } 88 | 89 | // A sequence of values, instructions and other patterns 90 | // Instructions that represent jumps are updated to match 91 | // the sizes of the argument. 92 | func Seq2(args []interface{}) *Pattern { 93 | size := 0 94 | // Figure out where each argument will be placed in 95 | // the final pattern. 96 | offsets := make(map[int]int, len(args)) 97 | for i := range args { 98 | offsets[i] = size 99 | switch v := args[i].(type) { 100 | case *Pattern: 101 | size += len(*v) - 1 102 | case Instruction: 103 | size += 1 104 | default: 105 | // Convert a value to a pattern. 106 | v2 := Pat(v) 107 | args[i] = v2 108 | size += len(*v2) - 1 109 | } 110 | } 111 | offsets[len(args)] = size 112 | // Construct the final pattern. 
113 | ret := make(Pattern, size+1) 114 | pos := 0 115 | for i := range args { 116 | switch v := args[i].(type) { 117 | case *Pattern: 118 | copy(ret[pos:], *v) 119 | pos += len(*v) - 1 120 | case *IJump: 121 | ret[pos] = &IJump{offsets[i+v.offset] - pos} 122 | pos++ 123 | case *IChoice: 124 | ret[pos] = &IChoice{offsets[i+v.offset] - pos} 125 | pos++ 126 | case *ICall: 127 | ret[pos] = &ICall{offsets[i+v.offset] - pos} 128 | pos++ 129 | case *ICommit: 130 | ret[pos] = &ICommit{offsets[i+v.offset] - pos} 131 | pos++ 132 | case Instruction: 133 | ret[pos] = v 134 | pos++ 135 | } 136 | } 137 | ret[pos] = &IEnd{} 138 | return &ret 139 | } 140 | 141 | // Always succeeds (an empty pattern). 142 | func Succ() *Pattern { 143 | return Seq() 144 | } 145 | 146 | // Always fails. 147 | func Fail() *Pattern { 148 | return Seq( 149 | &IFail{}, 150 | ) 151 | } 152 | 153 | // Matches `n` of any character. 154 | func Any(n int) *Pattern { 155 | return Seq( 156 | &IAny{n}, 157 | ) 158 | } 159 | 160 | // Matches `char` 161 | func Char(char byte) *Pattern { 162 | return Seq( 163 | &IChar{char}, 164 | ) 165 | } 166 | 167 | func isfail(p *Pattern) bool { 168 | _, ok := (*p)[0].(*IFail) 169 | return ok 170 | } 171 | func issucc(p *Pattern) bool { 172 | _, ok := (*p)[0].(*IEnd) 173 | return ok 174 | } 175 | 176 | // Ordered choice of p1 and p2 177 | func Or(p1, p2 *Pattern) *Pattern { 178 | if isfail(p1) { 179 | return p2 180 | } else if issucc(p1) || isfail(p2) { 181 | return p1 182 | } 183 | return Seq( 184 | &IChoice{3}, 185 | p1, 186 | &ICommit{2}, 187 | p2, 188 | ) 189 | } 190 | 191 | // Repeat pattern between `min` and `max` times. 192 | // max == -1 means unlimited. 
193 | func Rep(p *Pattern, min, max int) *Pattern { 194 | var size int 195 | if max < 0 { 196 | size = min + 3 197 | } else { 198 | size = min + 2*(max-min) + 1 199 | } 200 | args := make([]interface{}, size) 201 | for i := 0; i < min; i++ { 202 | args[i] = p 203 | } 204 | pos := min 205 | if max < 0 { 206 | args[pos+0] = &IChoice{3} 207 | args[pos+1] = p 208 | args[pos+2] = &ICommit{-2} 209 | pos += 3 210 | } else { 211 | args[pos+0] = &IChoice{2 * (max - min)} 212 | pos++ 213 | for i := min; i < max; i++ { 214 | args[pos+0] = p 215 | args[pos+1] = &IPartialCommit{1} 216 | pos += 2 217 | } 218 | args[pos+0] = &ICommit{1} 219 | pos++ 220 | } 221 | return Seq2(args) 222 | } 223 | 224 | // Negative look-ahead for the pattern. 225 | func Not(p *Pattern) *Pattern { 226 | return Seq( 227 | &IChoice{3}, 228 | p, 229 | &IFailTwice{}, 230 | ) 231 | } 232 | 233 | // Positive look-ahead for the pattern. 234 | func And(p *Pattern) *Pattern { 235 | return Seq( 236 | &IChoice{3}, 237 | p, 238 | &IBackCommit{2}, 239 | &IFail{}, 240 | ) 241 | } 242 | 243 | // Open reference to a name. Use with grammars. 244 | func Ref(name string) *Pattern { 245 | return Seq( 246 | &IOpenCall{name}, 247 | ) 248 | } 249 | 250 | // Match the text litteraly. 251 | func Lit(text string) *Pattern { 252 | args := make([]interface{}, len(text)) 253 | for i := 0; i < len(text); i++ { 254 | args[i] = &IChar{text[i]} 255 | } 256 | return Seq2(args) 257 | } 258 | 259 | // Match a grammar. 
260 | // start: name of the first pattern 261 | // grammar: map of names to patterns 262 | func Grm(start string, grammar map[string]*Pattern) *Pattern { 263 | // Figure out where each pattern begins, so that open 264 | // references can be resolved 265 | refs := map[string]int{"": 0} 266 | size := 2 267 | order := make([]string, len(grammar)) 268 | i := 0 269 | for name, p := range grammar { 270 | if len(name) == 0 { 271 | panic("Invalid name") 272 | } 273 | order[i] = name 274 | i += 1 275 | refs[name] = size 276 | size += len(*p) 277 | } 278 | // Construct the final pattern 279 | ret := make(Pattern, size+1) 280 | ret[0] = &ICall{refs[start] - 0} 281 | ret[1] = &IJump{size - 1} 282 | for _, name := range order { 283 | copy(ret[refs[name]:], *grammar[name]) 284 | ret[refs[name]+len(*grammar[name])-1] = &IReturn{} 285 | } 286 | ret[len(ret)-1] = &IEnd{} 287 | // Update references 288 | for i, op := range ret { 289 | if op2, ok := op.(*IOpenCall); ok { 290 | if offset, ok := refs[op2.name]; ok { 291 | ret[i] = &ICall{offset - i} 292 | } 293 | } 294 | } 295 | return &ret 296 | } 297 | 298 | // Match a set of characters. 299 | func Set(chars string) *Pattern { 300 | mask := [...]uint32{0, 0, 0, 0, 0, 0, 0, 0} 301 | for i := 0; i < len(chars); i++ { 302 | c := chars[i] 303 | mask[c>>5] |= 1 << (c & 0x1F) 304 | } 305 | return Seq(&ICharset{mask}) 306 | } 307 | 308 | // Match a negated set of characters. Opposite of Set() 309 | func NegSet(chars string) *Pattern { 310 | const N = ^uint32(0) 311 | mask := [...]uint32{N, N, N, N, N, N, N, N} 312 | for i := 0; i < len(chars); i++ { 313 | c := chars[i] 314 | mask[c>>5] &^= 1 << (c & 0x1F) 315 | } 316 | return Seq(&ICharset{mask}) 317 | } 318 | 319 | // Resolve a value to a pattern. 320 | // Patterns are return unmodified. 321 | // * true gives a pattern that always succeeds. Equivalent to Succ(). 322 | // * false gives a pattern that always fails. Equivalent to Fail(). 
323 | // * n < 0 asserts that there are at least -n characters. 324 | // Equivalent to And(Any(n)). 325 | // * n >= 0 matches n characters. Equivalent to Any(n). 326 | // * A string matches itself. Equivalent to Lit(value) 327 | func Pat(value interface{}) *Pattern { 328 | switch v := value.(type) { 329 | case *Pattern: 330 | return v 331 | case bool: 332 | if v { 333 | return Succ() 334 | } else { 335 | return Fail() 336 | } 337 | case int: 338 | if v >= 0 { 339 | return Any(v) 340 | } else { 341 | return And(Any(-v)) 342 | } 343 | case string: 344 | return Lit(v) 345 | } 346 | // TODO(mizardx): Proper error handling. 347 | return nil 348 | } 349 | 350 | // Does a simple capture of the pattern. 351 | func Csimple(p *Pattern) *Pattern { 352 | return Seq( 353 | &IOpenCapture{0, &SimpleCapture{}}, 354 | p, 355 | &ICloseCapture{}, 356 | ) 357 | } 358 | 359 | // Does a position capture. 360 | func Cposition() *Pattern { 361 | return Seq( 362 | &IEmptyCapture{0, &PositionCapture{}}, 363 | ) 364 | } 365 | 366 | // Does a constant capture. 367 | func Cconst(value interface{}) *Pattern { 368 | return Seq( 369 | &IEmptyCapture{0, &ConstCapture{value}}, 370 | ) 371 | } 372 | 373 | // Does a list capture. 374 | func Clist(p *Pattern) *Pattern { 375 | return Seq( 376 | &IOpenCapture{0, &ListCapture{}}, 377 | p, 378 | &ICloseCapture{}, 379 | ) 380 | } 381 | 382 | // Does a function capture. 383 | func Cfunc(p *Pattern, f func([]*CaptureResult) (interface{}, error)) *Pattern { 384 | return Seq( 385 | &IOpenCapture{0, &FunctionCapture{f}}, 386 | p, 387 | &ICloseCapture{}, 388 | ) 389 | } 390 | 391 | // Does a string capture. 392 | func Cstring(p *Pattern, format string) *Pattern { 393 | return Seq( 394 | &IOpenCapture{0, &StringCapture{format}}, 395 | p, 396 | &ICloseCapture{}, 397 | ) 398 | } 399 | 400 | // Does a substitution capture. 
401 | func Csubst(p *Pattern) *Pattern { 402 | return Seq( 403 | &IOpenCapture{0, &SubstCapture{}}, 404 | p, 405 | &ICloseCapture{}, 406 | ) 407 | } 408 | --------------------------------------------------------------------------------