├── test ├── g0-trivial.rx ├── 30-wild.rx ├── g1-abab.rx ├── q1-plus.rx ├── g2-digits.rx ├── d1-dragon2.rx ├── 16-anchor.rx ├── d2-demo.rx ├── g4-cgpat.rx ├── g5-chgcards.rx ├── x3-repl.rcx ├── g6-minim.rx ├── g0-trivial.std ├── 08-ealts.rx ├── d3-cfloat.rx ├── Makefile ├── q1-plus.std ├── 22-csugly.rx ├── 14-nestpar.rx ├── 00-const.rx ├── 29-pcre.rx ├── 02-repl.rx ├── 04-erepl.rx ├── g3-floating.rx ├── g1-abab.std ├── 26-cscompl.rx ├── d5-abc1.rx ├── 06-alts.rx ├── 10-parens.rx ├── 12-altpar.rx ├── 05-cxrepl.rx ├── 18-basic.rx ├── 20-cset.rx ├── d7-jellyfish.rx ├── 92-reject.rx ├── d9-braid.rx ├── x1-simple.rcx ├── 03-nrepl.rx ├── g2-digits.std ├── x3-repl.rx ├── d1-dragon2.std3 ├── d8-starship.rx ├── 54-libmisc.rx ├── 90-bad.rx ├── 58-libmail.rx ├── 50-libnum.rx ├── g4-cgpat.std ├── 28-unicode.rx ├── 34-real.rx ├── g5-chgcards.std ├── g6-minim.std ├── x1-simple.rx ├── d1-dragon2.std2 ├── d5-abc1.std3 ├── 24-csesc.rx ├── 41-cswww.rx ├── d2-demo.std3 ├── 40-found.rx ├── 47-tutsplus.rx ├── d1-dragon2.std ├── 52-libstr.rx ├── d2-demo.std2 ├── d3-cfloat.std3 ├── 32-simple.rx ├── x3-repl.std ├── g3-floating.std ├── 56-libclk.rx ├── x1-simple.std ├── d2-demo.std ├── 80-questions.std2 ├── 92-reject.std ├── d3-cfloat.std2 ├── d3-cfloat.std ├── d5-abc1.std ├── runtest.sh ├── 30-wild.std ├── 90-bad.std ├── d5-abc1.std2 ├── 48-fireball.rx ├── x9-questions.rx ├── 54-libmisc.std ├── 58-libmail.std ├── 80-questions.rx ├── 50-libnum.std ├── 40-found.std ├── d7-jellyfish.std2 ├── 29-pcre.std ├── 52-libstr.std ├── 56-libclk.std ├── 48-fireball.std └── 46-mbynens.rx ├── webapp ├── static │ ├── icon.png │ ├── border.png │ ├── itest.png │ ├── background.png │ ├── functions.js │ └── style.css ├── app.yaml └── code │ ├── contact.go │ ├── syntax.go │ ├── about.go │ ├── graph.go │ ├── main.go │ ├── body.go │ ├── info.go │ ├── draw.go │ ├── home.go │ ├── examine.go │ └── form.go ├── go.mod ├── .gitignore ├── TODO ├── README ├── BUILD ├── rxq └── rxq.go ├── rxpick └── rxpick.go ├── rxsys 
└── system.go ├── LICENSE ├── NOTES ├── Makefile ├── bit_test.go ├── synth.go ├── util.go ├── catnode.go ├── rx.go ├── matnode.go ├── rxx └── rxx.go ├── rxg └── rxg.go ├── rxtime └── rxtime.go ├── altnode.go ├── node.go ├── bkt_test.go ├── bitset.go ├── charset.go ├── input.go ├── brackets.go ├── repnode.go └── graph.go /test/g0-trivial.rx: -------------------------------------------------------------------------------- 1 | #! rxg -R 2 | ab*c 3 | -------------------------------------------------------------------------------- /test/30-wild.rx: -------------------------------------------------------------------------------- 1 | {.} 2 | {.+} 3 | {.*.*.*} 4 | -------------------------------------------------------------------------------- /test/g1-abab.rx: -------------------------------------------------------------------------------- 1 | #! rxg -R 2 | (a|b)*abb 3 | b(ab)*a 4 | -------------------------------------------------------------------------------- /test/q1-plus.rx: -------------------------------------------------------------------------------- 1 | #! rxq 'ab+c' - 2 | ac 3 | abc 4 | abbc 5 | aabc 6 | abcc 7 | aacc 8 | -------------------------------------------------------------------------------- /test/g2-digits.rx: -------------------------------------------------------------------------------- 1 | #! rxg -R 2 | # various integer forms 3 | \d+ 4 | \d*[1-9] 5 | [1-9]\d* 6 | -------------------------------------------------------------------------------- /webapp/static/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proebsting/re/HEAD/webapp/static/icon.png -------------------------------------------------------------------------------- /test/d1-dragon2.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -m -l -n -d 2 | #! rxplor -N - 3 | #! 
rxplor -D - 4 | (a|b)*abb 5 | -------------------------------------------------------------------------------- /webapp/static/border.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proebsting/re/HEAD/webapp/static/border.png -------------------------------------------------------------------------------- /webapp/static/itest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proebsting/re/HEAD/webapp/static/itest.png -------------------------------------------------------------------------------- /test/16-anchor.rx: -------------------------------------------------------------------------------- 1 | ^ 2 | $ 3 | ^$ 4 | ^a 5 | b$ 6 | ^c$ 7 | ^def$ 8 | ^x+y*z+$ 9 | q\$ 10 | -------------------------------------------------------------------------------- /test/d2-demo.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -m -l -n -d 2 | #! rxplor -N - 3 | #! rxplor -D - 4 | (a|b)*abb 5 | b(ab)*a 6 | -------------------------------------------------------------------------------- /webapp/static/background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proebsting/re/HEAD/webapp/static/background.png -------------------------------------------------------------------------------- /test/g4-cgpat.rx: -------------------------------------------------------------------------------- 1 | #! rxg -R 2 | # shortened charge-card numbers 3 | 3[47]\d{3} 4 | 4\d{2}(\d\d\d)? 5 | 5[1-5]\d{4} 6 | \d{6} 7 | -------------------------------------------------------------------------------- /test/g5-chgcards.rx: -------------------------------------------------------------------------------- 1 | #! rxg -R 2 | # valid credit card numbers 3 | 3[47]\d{13} 4 | 4\d{12}(\d\d\d)? 
5 | 5[1-5]\d{14} 6 | \d{16} 7 | -------------------------------------------------------------------------------- /test/x3-repl.rcx: -------------------------------------------------------------------------------- 1 | a 2 | ac 3 | abc 4 | abbc 5 | abbbc 6 | abbbbc 7 | abbbbbc 8 | abbbbbbc 9 | abbbbbbbc 10 | abbbbbbbbc 11 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/proebsting/re 2 | 3 | go 1.16 4 | 5 | replace rx => ./. 6 | 7 | require rx v0.0.0-00010101000000-000000000000 8 | -------------------------------------------------------------------------------- /test/g6-minim.rx: -------------------------------------------------------------------------------- 1 | #! rxg -R 2 | # problems minimizing in combination 3 | () 4 | a? 5 | a?(ba)*b? 6 | (a|)(ba)*(b|) 7 | a(ba)*b?|b(ab)*a? 8 | -------------------------------------------------------------------------------- /test/g0-trivial.std: -------------------------------------------------------------------------------- 1 | {"Expressions":[ 2 | {"Index":0,"Rexpr":"ab*c"} 3 | ], 4 | "Examples":[ 5 | {"State":2,"RXset":[0],"Example":"ac"} 6 | ]} 7 | -------------------------------------------------------------------------------- /test/08-ealts.rx: -------------------------------------------------------------------------------- 1 | # alternation with empty alternatives 2 | a| 3 | |a 4 | a|b| 5 | a||b 6 | a|b|c| 7 | |a|b|c|d 8 | a||c||e 9 | ab|c||def|g 10 | -------------------------------------------------------------------------------- /test/d3-cfloat.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -m -l -n -d 2 | #! rxplor -N - 3 | #! rxplor -D - 4 | [+-]?(((\d+\.\d*|\.\d+)([eE][+-]?\d+)?)|(\d+[eE][+-]?\d+))[fFlL]? 
5 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for rx tests 2 | 3 | default: test 4 | 5 | test: 6 | runtest.sh 7 | : 8 | : 9 | 10 | clean: 11 | rm -f *.tmp *.out 12 | -------------------------------------------------------------------------------- /test/q1-plus.std: -------------------------------------------------------------------------------- 1 | regexp: ab+c 2 | REJECT: #! rxq 'ab+c' - 3 | REJECT: ac 4 | accept: abc 5 | accept: abbc 6 | REJECT: aabc 7 | REJECT: abcc 8 | REJECT: aacc 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files for Git to ignore 2 | PROFILE 3 | expt.rx 4 | /timings 5 | /workshop 6 | tmp.* 7 | *~ 8 | *.out 9 | *.tmp 10 | .nfs* 11 | .DS_Store 12 | -------------------------------------------------------------------------------- /test/22-csugly.rx: -------------------------------------------------------------------------------- 1 | # ugly but legal cset (bracket expression) forms 2 | [-] 3 | [-x] 4 | [x-] 5 | [[x] 6 | [[x-] 7 | []x] 8 | []x-] 9 | []x[-] 10 | [][] 11 | -------------------------------------------------------------------------------- /test/14-nestpar.rx: -------------------------------------------------------------------------------- 1 | # nested parentheses 2 | ((a)) 3 | (((a))) 4 | (((a*))) 5 | (((a)*)) 6 | (((a))*) 7 | (((a)))* 8 | ((a*b)@(c*d)#)* 9 | ((b|f)(a|e))* 10 | ((n|t)(o|e)+)* 11 | -------------------------------------------------------------------------------- /test/00-const.rx: -------------------------------------------------------------------------------- 1 | # constant regexps including the empty one 2 | () 3 | a 4 | bc 5 | def 6 | 417-00-9423 7 | 515-032-6910 8 | bite the wax tadpole 9 | __\a__\e__\f__\n__\r__\t__\v__ 10 | 
-------------------------------------------------------------------------------- /test/29-pcre.rx: -------------------------------------------------------------------------------- 1 | # test handling of PCRE extensions 2 | a+b+(?#comment)c+d+ 3 | a+b+(?#3IИЯ3)c+d+ 4 | (?#)o*(?#) 5 | a*(?:bc)*d* 6 | a*(?'center'bc)*d* 7 | a*(?bc)*d* 8 | a*(?Pbc)*d* 9 | -------------------------------------------------------------------------------- /test/02-repl.rx: -------------------------------------------------------------------------------- 1 | # simple postfix replication 2 | a? 3 | a+ 4 | a* 5 | a?b 6 | a+b 7 | a*b 8 | ab? 9 | ab+ 10 | ab* 11 | a?b? 12 | a+b+ 13 | a*b* 14 | a?b?c? 15 | a*b*c* 16 | a+b+c+ 17 | wh?o+p! 18 | -------------------------------------------------------------------------------- /test/04-erepl.rx: -------------------------------------------------------------------------------- 1 | # replication of null strings 2 | ? 3 | * 4 | + 5 | ?? 6 | ** 7 | ++ 8 | ??? 9 | *** 10 | +++ 11 | ?*+ 12 | *+? 13 | ?+* 14 | +?*? 15 | ?*?+? 16 | ?+?*+?*+?*+?*+*???*?*+?+*?+? 17 | -------------------------------------------------------------------------------- /test/g3-floating.rx: -------------------------------------------------------------------------------- 1 | #! rxg -R 2 | # floating point number forms 3 | \d*\.\d+|\d+\.\d* 4 | [+-]?(\d+\.\d*|\.\d+)([dDeE][+-]?\d\d?\d?)? 5 | [+-]?(((\d+\.\d*|\.\d+)([eE][+-]?\d+)?)|(\d+[eE][+-]?\d+))[fFlL]? 
6 | -------------------------------------------------------------------------------- /test/g1-abab.std: -------------------------------------------------------------------------------- 1 | {"Expressions":[ 2 | {"Index":0,"Rexpr":"(a|b)*abb"}, 3 | {"Index":1,"Rexpr":"b(ab)*a"} 4 | ], 5 | "Examples":[ 6 | {"State":4,"RXset":[1],"Example":"ba"}, 7 | {"State":6,"RXset":[0],"Example":"abb"} 8 | ]} 9 | -------------------------------------------------------------------------------- /test/26-cscompl.rx: -------------------------------------------------------------------------------- 1 | # complemented character sets 2 | [^ -`]* 3 | [^ -@]* 4 | [^ -/]* 5 | [^ -/:-@[-`{-~]* 6 | [^0-9]* 7 | [^A-Za-z0-9]* 8 | # these last two should be identical (under -Z) 9 | [^A-Za-z0-9_]* 10 | [^\w]* 11 | -------------------------------------------------------------------------------- /test/d5-abc1.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -m -l -n -d 2 | #! rxplor -N - 3 | #! rxplor -D - 4 | 5 | # one or more characters from an alphabet of three with no doubled letters 6 | a(ba)*b?|b(ab)*a?|a?(ba)*b?c((a(ba)*b?|b(ab)*a?)c)*(a(ba)*b?|b(ab)*a?)? 7 | -------------------------------------------------------------------------------- /test/06-alts.rx: -------------------------------------------------------------------------------- 1 | # basic alternation 2 | a|b 3 | ab|c 4 | a|bc 5 | ab|cd 6 | a|b|c 7 | a|b|c|d 8 | ab|cd|ef 9 | ab|cd|ef|gh 10 | abc|def 11 | abc|def|ghi 12 | a|bc|def|ghij 13 | pqrs|tuv|wx|yz 14 | abc|d|efgh|ij|kmlno|pqr|s|tuv|wxy|z 15 | -------------------------------------------------------------------------------- /test/10-parens.rx: -------------------------------------------------------------------------------- 1 | # simple uses of parentheses 2 | (a) 3 | ((a)) 4 | a(b)* 5 | a((b))* 6 | (ab)* 7 | a(bb)?c 8 | ((ab))* 9 | ((ab)*) 10 | a(bc)*d 11 | a(bcd)+e 12 | (ab)*(cd)*(ef)* 13 | (ab)+(cd)+(ef)+ 14 | (ab)?(cd)?(ef)? 
15 | (ab)+(cd)*(ef)? 16 | -------------------------------------------------------------------------------- /test/12-altpar.rx: -------------------------------------------------------------------------------- 1 | # parentheses with alternation 2 | (ab)|c 3 | a(b|c)d 4 | (ab)|c|(de) 5 | a|(bc|de|fg)|h 6 | a(b|c)(d|e)(f|g)h 7 | (ab|cd)(ef|gh) 8 | (ab|cd)|(ef|gh) 9 | (ab|cd|ef)(gh|ij|kp) 10 | x(ab|cd|ef)y(gh|ij|kp)z 11 | (a|bc)d(e|fg)|(w|x)(y|z) 12 | -------------------------------------------------------------------------------- /test/05-cxrepl.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -T -p -n -d 2 | # replication in complex situations 3 | (){3} 4 | (){2,4} 5 | (abc){3} 6 | (a|b){3} 7 | (ab|cd){3} 8 | (a*){3} 9 | (a{3})* 10 | (a{3})+ 11 | (a{3})? 12 | (a{3}){2} 13 | (a{3}){1,5} 14 | a{3}b{4} 15 | a{3}|b{4}|c{5} 16 | -------------------------------------------------------------------------------- /test/18-basic.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -T 2 | #! rxplor -T -p -n -d 3 | a 4 | ab 5 | abc 6 | ab*c 7 | a|b 8 | a|b|c|d 9 | ab|cd 10 | abc|de 11 | |a|bc|def 12 | () 13 | (a) 14 | (ab) 15 | (ab*) 16 | (ab)* 17 | (a|b) 18 | (a|b)* 19 | AB(|) 20 | C(|) 21 | C(A|B) 22 | C(AC|B)D 23 | -------------------------------------------------------------------------------- /test/20-cset.rx: -------------------------------------------------------------------------------- 1 | [a]+ 2 | [bc]+ 3 | [def]+ 4 | [ghij]+ 5 | [lmnop]+ 6 | [tuvwxyz]+ 7 | [ACDFGHJKLMOPQRSUVWXYZ]+ 8 | [aeiuo]+ 9 | [aeiuo]+y? 10 | [a-z]+ 11 | [A-Z]+ 12 | [a-zA-Z]+ 13 | [_a-zA-Z]+ 14 | [0-9]+ 15 | [_a-zA-Z0-9]+ 16 | [_a-zA-Z][_a-zA-Z0-9]* 17 | [A-HO-Z][A-Z]* 18 | -------------------------------------------------------------------------------- /test/d7-jellyfish.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -m -l -n -d 2 | #! 
rxplor -N - 3 | #! rxplor -D - 4 | 5 | [+-]?\d+ 6 | \+1-\d\d\d-\d\d\d-\d\d\d\d 7 | \d\d\d-\d\d-\d\d\d\d 8 | \d{16} 9 | 3[47]\d{13} 10 | [12][90]\d\d[01][0-9][012][0-9] 11 | (0[1-9]|1[012])(0[1-9]|[12]\d|3[01])(19|20)\d\d 12 | -------------------------------------------------------------------------------- /webapp/app.yaml: -------------------------------------------------------------------------------- 1 | application: regxplorer 2 | version: 1 3 | runtime: go 4 | api_version: go1 5 | 6 | handlers: 7 | # static directory holds images, css, etc 8 | - url: /static 9 | static_dir: static 10 | # everything else goes to the Go program 11 | - url: /.* 12 | script: _go_app 13 | -------------------------------------------------------------------------------- /test/92-reject.rx: -------------------------------------------------------------------------------- 1 | # things that other accept but we don't 2 | abc[[:digit:]]efg 3 | abc\cIdef 4 | [abc\cIdef] 5 | abc\pDdef 6 | [abc\pDdef] 7 | abc\PDdef 8 | [abc\PDdef] 9 | a??b 10 | a*?b 11 | a+?b 12 | (abc)*? 13 | abc\b.* 14 | (ab)cd\1 15 | (ab)cd\5 16 | (ab)cd\9 17 | (^ab)|(^cd) 18 | (ef$)|(gh$) 19 | -------------------------------------------------------------------------------- /test/d9-braid.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -m -l -n -d 2 | #! rxplor -N - 3 | #! rxplor -D - 4 | 5 | # this combination produces a braided DFA 6 | \d*[1-9] 7 | \d{7}|\d{10} 8 | \d{10} 9 | 4\d{12}(\d\d\d)? 
10 | 5[1-5]\d{14} 11 | 3[47]\d{13} 12 | [0-2][0-9][0-5][0-9][0-5][0-9] 13 | (0[1-9]|1[012])(0[1-9]|[12]\d|3[01])(19|20)\d\d 14 | -------------------------------------------------------------------------------- /test/x1-simple.rcx: -------------------------------------------------------------------------------- 1 | Guadalajara 2 | Stephen Jay Gould 3 | Homer.J.Simpson 4 | 525-00-1234 5 | (404)123-4567 6 | (404)323-4567 7 | arizona.edu 8 | www.cs.arizona.edu 9 | www.cs.arizona.edu/icon 10 | www.cs.arizona.edu/icon/ 11 | http://www.cs.arizona.edu/icon 12 | gmt@arizona 13 | gmt@arizona.edu 14 | gmt@cs.arizona.edu 15 | -------------------------------------------------------------------------------- /test/03-nrepl.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -T -p -n -d 2 | # numeric replication {m,n} 3 | ab{0}c 4 | ab{1}c 5 | ab{2}c 6 | ab{3}c 7 | ab{4}c 8 | ab{0,0}c 9 | ab{0,1}c 10 | ab{1,1}c 11 | ab{0,2}c 12 | ab{1,2}c 13 | ab{2,2}c 14 | ab{0,3}c 15 | ab{1,3}c 16 | ab{2,3}c 17 | ab{3,3}c 18 | ab{2,5}c 19 | ab{0,}c 20 | ab{1,}c 21 | ab{2,}c 22 | ab{3,}c 23 | ab{4,}c 24 | -------------------------------------------------------------------------------- /test/g2-digits.std: -------------------------------------------------------------------------------- 1 | {"Expressions":[ 2 | {"Index":0,"Rexpr":"\\d+"}, 3 | {"Index":1,"Rexpr":"\\d*[1-9]"}, 4 | {"Index":2,"Rexpr":"[1-9]\\d*"} 5 | ], 6 | "Examples":[ 7 | {"State":1,"RXset":[0],"Example":"0"}, 8 | {"State":2,"RXset":[0,1,2],"Example":"1"}, 9 | {"State":3,"RXset":[0,1],"Example":"05"}, 10 | {"State":4,"RXset":[0,2],"Example":"10"} 11 | ]} 12 | -------------------------------------------------------------------------------- /test/x3-repl.rx: -------------------------------------------------------------------------------- 1 | #! 
rxx x3-repl.rx x3-repl.rcx 2 | ac 3 | ab?c 4 | ab+c 5 | ab*c 6 | ab{0}c 7 | ab{1}c 8 | ab{2}c 9 | ab{3}c 10 | ab{4}c 11 | ab{0,0}c 12 | ab{0,1}c 13 | ab{1,1}c 14 | ab{0,2}c 15 | ab{1,2}c 16 | ab{2,2}c 17 | ab{0,3}c 18 | ab{1,3}c 19 | ab{2,3}c 20 | ab{3,3}c 21 | ab{2,5}c 22 | ab{0,}c 23 | ab{1,}c 24 | ab{2,}c 25 | ab{3,}c 26 | ab{4,}c 27 | ab{1,3}b{1,3}c 28 | -------------------------------------------------------------------------------- /test/d1-dragon2.std3: -------------------------------------------------------------------------------- 1 | // DFA: (a|b)*abb 2 | digraph DFA { 3 | label="DFA: (a|b)*abb" 4 | node [shape=circle, height=.3, width=.3, margin=0, fontsize=10] 5 | s0 [shape=triangle, regular=true] 6 | s0->s0[label=" b"] 7 | s0->s3[label=" a"] 8 | s1 [shape=doublecircle] 9 | s1->s0[label=" b"] 10 | s1->s3[label=" a"] 11 | s2->s1[label=" b"] 12 | s2->s3[label=" a"] 13 | s3->s2[label=" b"] 14 | s3->s3[label=" a"] 15 | } 16 | -------------------------------------------------------------------------------- /test/d8-starship.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -m -l -n -d 2 | #! rxplor -N - 3 | #! rxplor -D - 4 | 5 | \d+ 6 | \d*[1-9] 7 | [1-9]\d* 8 | \d\d\d-(\d[1-9]|[1-9]\d)-\d\d\d\d 9 | [3456]\d{15} 10 | [0-2][0-9]:[0-5][0-9](:[0-5][0-9])? 11 | \d{8} 12 | (19|20)?\d\d(0[1-9]|1[012])(0[1-9]|[12]\d|3[01]) 13 | [01][0-9][012][0-9][12][90]\d\d 14 | (0[1-9]|1[012])/(0[1-9]|[12]\d|3[01])/((19|20)\d\d) 15 | (0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(19|20)\d\d 16 | -------------------------------------------------------------------------------- /test/54-libmisc.rx: -------------------------------------------------------------------------------- 1 | # RegExLib.com -- browse -- Misc 2 | # 3 | # 42. Decimal IP numbers 4 | (([01]?\d?\d|2[0-4]\d|25[0-5])\.){3}([01]?\d?\d|2[0-4]\d|25[0-5]) 5 | # 314. 
US Phone numbers with area code 6 | ([\(]{1}[0-9]{3}[\)]{1}[ |\-]{0,1}|^[0-9]{3}[\-| ])?[0-9]{3}(\-| ){1}[0-9]{4} 7 | # 3. US Zip+4 code 8 | \d{5}-\d{4} 9 | # 260. UK postcode 10 | ([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA) 11 | -------------------------------------------------------------------------------- /test/90-bad.rx: -------------------------------------------------------------------------------- 1 | # irregular (i.e. illegal) expressions 2 | ) 3 | ( 4 | abc) 5 | (def 6 | g|h) 7 | i|j|k) 8 | (m|n 9 | (o|p|q 10 | \ 11 | abc\ 12 | [ 13 | [^ 14 | [] 15 | [^] 16 | [\] 17 | [abc 18 | [def\] 19 | [t-f] 20 | a{z 21 | a{1z 22 | a{1,z 23 | a{1,2z 24 | a{1,2bz 25 | a{}z 26 | a{,}z 27 | a{1,,2}z 28 | a{,2}z 29 | a{2,1}z 30 | a{p,3}z 31 | a{4,q}z 32 | a{x,y}z 33 | a{x}z 34 | a{xyzzy}z 35 | a++ 36 | b** 37 | c?? 38 | d{2}{3} 39 | e{5}? 40 | -------------------------------------------------------------------------------- /test/58-libmail.rx: -------------------------------------------------------------------------------- 1 | # RegExLib.com -- browse -- Email 2 | # 3 | # 26. "Email validator that adheres directly to the specification" 4 | ([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?) 5 | # 356. "my all-time favourite e-mail validator" 6 | (\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3}) 7 | # 1012. "A short and sweet email address validator" 8 | ([0-9a-zA-Z]+[-._+&])*[0-9a-zA-Z]+@([-0-9a-zA-Z]+[.])+[a-zA-Z]{2,6} 9 | -------------------------------------------------------------------------------- /test/50-libnum.rx: -------------------------------------------------------------------------------- 1 | # RegExLib.com -- browse -- Numbers 2 | # 3 | # 575. numbers w/ or w/o decimal places and commas 4 | # BUG: . not escaped 5 | (((\d{1,3})(,\d{3})*)|(\d+))(.\d+)? 6 | # 185. 
numbers w/ or w/o exponential form 7 | [+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)? 8 | # 943. numbers allowing sign and leading zeroes 9 | # BUGGY 10 | \-?\(?([0-9]{0,3}(\,?[0-9]{3})*(\.?[0-9]*))\)? 11 | # 126. "US currency" 12 | \$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})? 13 | -------------------------------------------------------------------------------- /test/g4-cgpat.std: -------------------------------------------------------------------------------- 1 | {"Expressions":[ 2 | {"Index":0,"Rexpr":"3[47]\\d{3}"}, 3 | {"Index":1,"Rexpr":"4\\d{2}(\\d\\d\\d)?"}, 4 | {"Index":2,"Rexpr":"5[1-5]\\d{4}"}, 5 | {"Index":3,"Rexpr":"\\d{6}"} 6 | ], 7 | "Examples":[ 8 | {"State":11,"RXset":[1],"Example":"482"}, 9 | {"State":18,"RXset":[0],"Example":"34808"}, 10 | {"State":21,"RXset":[3],"Example":"769151"}, 11 | {"State":22,"RXset":[1,3],"Example":"482062"}, 12 | {"State":23,"RXset":[2,3],"Example":"536400"} 13 | ]} 14 | -------------------------------------------------------------------------------- /test/28-unicode.rx: -------------------------------------------------------------------------------- 1 | # various applications of Unicode characters 2 | # latin1 3 | [\xA0-\xFF]+[À-ÿ]+[¡-¿]+[\u00C0-\u00FF]+ 4 | # latin extended-A 5 | [\u0100-\u017F]+[Ā-ſ]+ 6 | # one- and two-byte characters 7 | [Α-Ϋ]+[ -\u07FF]+[\xA0-߷]+ 8 | # three-byte characters 9 | [\u0800-\uFFFF]+ 10 | # all mixed up 11 | [å∫ç∂´ƒ©˙ˆ∆˚¬µ˜øπœ®ß†¨√∑≈¥Ω]+ 12 | # a fanciful example from Wikipedia 13 | üñîçøðé@example\.com 14 | # Cat Stevens 15 | Ρούμπυ γλυκεία, έλα ξανά, έλα ξανά κοντά μου 16 | -------------------------------------------------------------------------------- /webapp/code/contact.go: -------------------------------------------------------------------------------- 1 | // contact.go -- generate "contact" page 2 | 3 | // 'contact' is not a verb in this house. 
4 | // -- Nero Wolfe 5 | 6 | package webapp 7 | 8 | import ( 9 | "fmt" 10 | "net/http" 11 | ) 12 | 13 | // contact generates a page with a mailto: link for feedback 14 | func contact(w http.ResponseWriter, r *http.Request) { 15 | putheader(w, r, "Contact Us") 16 | fmt.Fprintf(w, ` 17 |

We love feedback. 18 |

Send mail to %s. 19 | `, MAILTO) 20 | putfooter(w, r) 21 | } 22 | -------------------------------------------------------------------------------- /test/34-real.rx: -------------------------------------------------------------------------------- 1 | # words 2 | [a-zA-Z][a-z]+ 3 | [a-zA-Z][a-z]* 4 | [a-zA-Z][a-z]*('[a-z]+)? 5 | # phrases 6 | [a-zA-Z ]+ 7 | # identifiers 8 | [a-zA-Z0-9]+ 9 | [a-zA-Z0-9]+(\.[a-zA-Z0-9]+)+ 10 | # ssn 11 | [0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9] 12 | # telno 13 | \([0-9][0-9][0-9]\)[2-9][0-9][0-9]-[0-9][0-9][0-9][0-9] 14 | # www 15 | (http://|)(www\.)?[a-zA-Z0-9]+(\.[a-zA-Z0-9]+)+(/[-_\.\?a-zA-Z0-9]*)* 16 | # email 17 | [a-zA-Z][a-zA-Z0-9]*([-_.][a-zA-Z0-9]*)*@([a-zA-Z0-9]+\.)+[a-zA-Z0-9]{2,} 18 | -------------------------------------------------------------------------------- /test/g5-chgcards.std: -------------------------------------------------------------------------------- 1 | {"Expressions":[ 2 | {"Index":0,"Rexpr":"3[47]\\d{13}"}, 3 | {"Index":1,"Rexpr":"4\\d{12}(\\d\\d\\d)?"}, 4 | {"Index":2,"Rexpr":"5[1-5]\\d{14}"}, 5 | {"Index":3,"Rexpr":"\\d{16}"} 6 | ], 7 | "Examples":[ 8 | {"State":51,"RXset":[1],"Example":"4820627896809"}, 9 | {"State":58,"RXset":[0],"Example":"348086240477247"}, 10 | {"State":61,"RXset":[3],"Example":"7691511507924704"}, 11 | {"State":62,"RXset":[1,3],"Example":"4820627896809679"}, 12 | {"State":63,"RXset":[2,3],"Example":"5364008348080480"} 13 | ]} 14 | -------------------------------------------------------------------------------- /test/g6-minim.std: -------------------------------------------------------------------------------- 1 | {"Expressions":[ 2 | {"Index":0,"Rexpr":"()"}, 3 | {"Index":1,"Rexpr":"a?"}, 4 | {"Index":2,"Rexpr":"a?(ba)*b?"}, 5 | {"Index":3,"Rexpr":"(a|)(ba)*(b|)"}, 6 | {"Index":4,"Rexpr":"a(ba)*b?|b(ab)*a?"} 7 | ], 8 | "Examples":[ 9 | {"State":0,"RXset":[0,1,2,3],"Example":""}, 10 | {"State":1,"RXset":[1,2,3,4],"Example":"a"}, 11 | 
{"State":2,"RXset":[2,3,4],"Example":"b"}, 12 | {"State":3,"RXset":[2,3,4],"Example":"ab"}, 13 | {"State":4,"RXset":[2,3,4],"Example":"ba"}, 14 | {"State":5,"RXset":[2,3,4],"Example":"aba"} 15 | ]} 16 | -------------------------------------------------------------------------------- /test/x1-simple.rx: -------------------------------------------------------------------------------- 1 | #! rxx x1-simple.rx x1-simple.rcx 2 | [a-zA-Z]+ 3 | [\w]+ 4 | [a-zA-Z ]+ 5 | [-_\.a-zA-Z0-9]+ 6 | [a-zA-Z0-9]+(\.[a-zA-Z0-9]+)+ 7 | # ssn and telno 8 | [0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9] 9 | \([0-9][0-9][0-9]\)[2-9][0-9][0-9]-[0-9][0-9][0-9][0-9] 10 | # www and email 11 | (http://|)[a-zA-Z0-9]+(\.[a-zA-Z0-9]*)+(/[-_\.\?a-zA-Z0-9]*)* 12 | [a-zA-Z][-_\.a-zA-Z0-9]*@[a-zA-Z0-9]*(\.[a-zA-Z0-9]*)* 13 | [a-zA-Z][-_\.a-zA-Z0-9]*@[a-zA-Z0-9]*(\.[a-zA-Z0-9]*)? 14 | [a-zA-Z][-_\.a-zA-Z0-9]*@[a-zA-Z0-9]*(\.[a-zA-Z0-9]*)+ 15 | -------------------------------------------------------------------------------- /test/d1-dragon2.std2: -------------------------------------------------------------------------------- 1 | // NFA: (a|b)*abb 2 | digraph NFA { 3 | label="NFA: (a|b)*abb" 4 | node [shape=circle, height=.3, margin=0, fontsize=10] 5 | i->p0[label=" a"] 6 | i->p1[label=" b"] 7 | i->p2[label=" a"] 8 | i [shape=triangle, regular=true, label=""] 9 | p0 [label="p0"] 10 | p0->p0[label=" a"] 11 | p0->p1[label=" b"] 12 | p0->p2[label=" a"] 13 | p1 [label="p1"] 14 | p1->p0[label=" a"] 15 | p1->p1[label=" b"] 16 | p1->p2[label=" a"] 17 | p2 [label="p2"] 18 | p2->p3[label=" b"] 19 | p3 [label="p3"] 20 | p3->p4[label=" b"] 21 | p4 [label="p4"] 22 | p4 [shape=doublecircle] 23 | } 24 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | RX TO-DO LIST 2 | (Things to consider) 3 | (All items still remaining will probably not be done.) 
4 | 5 | REGEXPS: 6 | Expand wildcards and classes to include Unicode 7 | Add POSIX classes? 8 | Add \b anchor? Embedded ^ and $? 9 | Add flags such as "case insensitive"?? 10 | Other Perl stuff? 11 | 12 | UTILITIES: 13 | Handle newlines in exprs somehow? 14 | Add metadata to the JSON output of rxg 15 | In rxcluster, subdivide large clusters 16 | 17 | CLEANUP: 18 | Add func for "canonizing" DFA (clearer output, easier testing) 19 | Make gofmt pre-commit hook automatic in every Git clone 20 | -------------------------------------------------------------------------------- /test/d5-abc1.std3: -------------------------------------------------------------------------------- 1 | // DFA: a(ba)*b?|b(ab)*a?|a?(ba)*b?c((a(ba)*b?|b(ab)*a?)c)*(a(ba)*b?|b(ab)*a?)? 2 | digraph DFA { 3 | label="DFA: a(ba)*b?|b(ab)*a?|a?(ba)*b?c((a(ba)*b?|b(ab)*a?)c)*(a(ba)*b?|b(ab)*a?)?" 4 | node [shape=circle, height=.3, width=.3, margin=0, fontsize=10] 5 | s0 [shape=triangle, regular=true] 6 | s0->s1[label=" a"] 7 | s0->s2[label=" b"] 8 | s0->s3[label=" c"] 9 | s1 [shape=doublecircle] 10 | s1->s2[label=" b"] 11 | s1->s3[label=" c"] 12 | s2 [shape=doublecircle] 13 | s2->s1[label=" a"] 14 | s2->s3[label=" c"] 15 | s3 [shape=doublecircle] 16 | s3->s1[label=" a"] 17 | s3->s2[label=" b"] 18 | } 19 | -------------------------------------------------------------------------------- /test/24-csesc.rx: -------------------------------------------------------------------------------- 1 | # character set escapes 2 | # n.b. 
in POSIX, \ in [] should not be special 3 | [\-]+ 4 | [\]]* 5 | [ab\[cd\-gh\]ij]+ 6 | # perl inventions 7 | \d+ 8 | [\d]+ 9 | [\d0IZESB]+ 10 | 0x([\dABCDEF][\dABCDEF])+ 11 | \w+ 12 | [\w]+ 13 | \w+-\w+ 14 | <\s> 15 | # C-style escapes 16 | <[\a]> 17 | <[\b]> 18 | <[\e]> 19 | <[\f]> 20 | <[\n]> 21 | <[\r]> 22 | <[\t]> 23 | <[\v]> 24 | # escapes in ranges 25 | [a-w] 26 | [\142-x] 27 | [c-\x79] 28 | [\x64-\u007A] 29 | # big sets \D \W \S 30 | [\D]+ 31 | [\W]+ 32 | [\S]+ 33 | # the following pairs should be identical 34 | [\d]+ 35 | [^\D]+ 36 | [\w]+ 37 | [^\W]+ 38 | [\s]+ 39 | [^\S]+ 40 | -------------------------------------------------------------------------------- /test/41-cswww.rx: -------------------------------------------------------------------------------- 1 | # Samples from old www.cs.arizona.edu/scripts by gmt 2 | # 3 | # functions.php: getvalid 4 | [A-Za-z0-9]+ 5 | # functions.php: fixamps 6 | \&([[:alnum:]]*([^;[:alnum:]]| 7 | # functions.php: mailto 8 | ([^@]+)@(.*)\.arizona\.edu 9 | # nelson.php: else 'Not a valid integer.' 10 | [+-]?[0-9]+ 11 | # nelson.php: else 'Date must be in YYYY-MM-DD form.' 12 | [12][0-9][0-9][0-9]-[01][0-9]-[0-3][0-9] 13 | # nelson.php: else 'Time must be in HH:MM:SS form.' 14 | [012][0-9]:[0-5][0-9](:[0-5][0-9])? 
15 | # 16 | # From WHM grading script (Ruby): 17 | (\/\/[ \t]*)?[Ww][Hh][Mm][ \t]*says[ \t]*:[ \tA-Za-z0-9`~!@\#$%^&\*\(\)\-_\+=\[\]{}\|\:\;'",.<>?\/]+ 18 | -------------------------------------------------------------------------------- /test/d2-demo.std3: -------------------------------------------------------------------------------- 1 | // DFA: 2 expressions 2 | digraph DFA { 3 | label="DFA: 2 expressions" 4 | node [shape=circle, height=.3, width=.3, margin=0, fontsize=10] 5 | s0 [shape=triangle, regular=true] 6 | s0->s3[label=" b"] 7 | s0->s4[label=" a"] 8 | s1 [shape=doubleoctagon, label="s1 9 | B"] 10 | s1->s4[label=" a"] 11 | s1->s5[label=" b"] 12 | s2 [shape=doubleoctagon, label="s2 13 | A"] 14 | s2->s4[label=" a"] 15 | s2->s7[label=" b"] 16 | s3->s1[label=" a"] 17 | s3->s7[label=" b"] 18 | s4->s4[label=" a"] 19 | s4->s6[label=" b"] 20 | s5->s1[label=" a"] 21 | s5->s2[label=" b"] 22 | s6->s2[label=" b"] 23 | s6->s4[label=" a"] 24 | s7->s4[label=" a"] 25 | s7->s7[label=" b"] 26 | } 27 | -------------------------------------------------------------------------------- /test/40-found.rx: -------------------------------------------------------------------------------- 1 | # Miscellaneous samples from the web 2 | # 3 | #U: http://www.regular-expressions.info/ 4 | #D: (main page) "any email address" 5 | [A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4} 6 | #U: http://www.regular-expressions.info/email.html 7 | #D: (final email example) 8 | [a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum) 9 | #U: http://compilers.iecc.com/comparch/article/01-10-072 10 | #D: (telno "Yecchhh") 11 | (\([0-9]{3}\)[ ]{1}){0,1}[0-9]{3}\-[0-9]{4} 12 | #U: http://compilers.iecc.com/comparch/article/01-10-081 13 | #D: (a reformulation) 14 | (\(\d\d\d\) )?\d\d\d-\d\d\d\d 15 | -------------------------------------------------------------------------------- /README: 
-------------------------------------------------------------------------------- 1 | This is the regular expression project of 2 | 3 | Todd Proebsting 4 | Gregg Townsend 5 | Jasmin Uribe 6 | 7 | Department of Computer Science 8 | The University of Arizona 9 | Tucson, Arizona USA 10 | 11 | regex@cs.arizona.edu 12 | 13 | It comprises a library of Go language functions for experimenting with 14 | regular expressions and some associated command-line Unix utilities. 15 | It implements the regex.cs.arizona.edu website. 16 | 17 | This is primarily a research project, so there is no separate formal 18 | documentation. Library functions have comments designed for use with 19 | the godoc utility. Utility programs are also documented with comments 20 | which in this case are most easily viewed in a text editor. 21 | 22 | See also: LICENSE, BUILD, NOTES files. 23 | -------------------------------------------------------------------------------- /test/47-tutsplus.rx: -------------------------------------------------------------------------------- 1 | # "8 Regular Expressions You Should Know" by Vasili sampled 24-feb-2014 2 | # code.tutsplus.com/tutorials/8-regular-expressions-you-should-know--net-6149 3 | # 4 | # 1. Matching a Username 5 | [a-z0-9_-]{3,16} 6 | # 7 | # 2. Matching a Password 8 | [a-z0-9_-]{6,18}$ 9 | # 10 | # 3. Matching a Hex Value [a color spec, apparently?] 11 | #?([a-f0-9]{6}|[a-f0-9]{3}) 12 | # 4. Matching a Slug 13 | [a-z0-9-]+ 14 | # 5. Matching an Email 15 | ([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6}) 16 | # 6. Matching a URL 17 | (https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/? 18 | # 7. Matching an IP Address 19 | (?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?) 20 | # 8. 
Matching an HTML Tag 21 | <([a-z]+)([^<]+)*(?:>(.*)<\/\1>|\s+\/>) 22 | -------------------------------------------------------------------------------- /test/d1-dragon2.std: -------------------------------------------------------------------------------- 1 | 2 | #! rxplor -m -l -n -d 3 | 4 | #! rxplor -N - 5 | 6 | #! rxplor -D - 7 | 8 | expr 0: (a|b)*abb 9 | ----------------------- NFA ---------------------- 10 | Inputs: [ab] 11 | Witnesses: [ab] 12 | begin => { 0 1 2 } 13 | p0. a => { 0 1 2 } 14 | p1. b => { 0 1 2 } 15 | p2. a => { 3 } 16 | p3. b => { 4 } 17 | p4. b => { 5 } 18 | p5. # => { } 19 | ----------------- Unoptimized DFA ---------------- 20 | s0. { p0 p1 p2 } [b]:s0 [a]:s1 21 | s1. { p0 p1 p2 p3 } [a]:s1 [b]:s2 22 | s2. { p0 p1 p2 p4 } [a]:s1 [b]:s3 23 | s3# { p0 p1 p2 p5 } [b]:s0 [a]:s1 24 | ------------------ Minimized DFA ----------------- 25 | s0. { p0 p1 p2 } [b]:s0 [a]:s3 26 | s1# { p0 p1 p2 p5 } [b]:s0 [a]:s3 27 | s2. { p0 p1 p2 p4 } [b]:s1 [a]:s3 28 | s3. { p0 p1 p2 p3 } [b]:s2 [a]:s3 29 | -------------------------------------------------------------------------------- /test/52-libstr.rx: -------------------------------------------------------------------------------- 1 | # RegExLib.com -- browse -- Strings 2 | # 3 | # 535. Social Security numbers within allocated ranges 4 | (?!000)([0-6]\d{2}|7([0-6]\d|7[012]))([ -]?)(?!00)\d\d\3(?!0000)\d{4} 5 | # 672. GUID values 6 | [{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|}]? 7 | # 580. passwords length 8-20 with at least one digit, cannot start with digit 8 | (?=[^\d_].*?\d)\w(\w|[!@#$%]){7,20} 9 | # 656. spam trap for pseudo-spellings of "Viagra" 10 | [v,V,(\\/)](\W|)[i,I,1,l,L](\W|)[a,A,@,(\/\\)](\W|)[g,G](\W|)[r,R](\W|)[a,A,@,(\/\\))] 11 | # 49. major credit cards (AmEx len 15, others len 16) 12 | ((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13} 13 | # 595. 
UK vehicle registration number from September 2001 14 | ([A-HK-PRSVWY][A-HJ-PR-Y])\s?([0][2-9]|[1-9][0-9])\s?[A-HJ-PR-Z]{3} 15 | -------------------------------------------------------------------------------- /test/d2-demo.std2: -------------------------------------------------------------------------------- 1 | // NFA: 2 expressions 2 | digraph NFA { 3 | label="NFA: 2 expressions" 4 | node [shape=circle, height=.3, margin=0, fontsize=10] 5 | i->p0[label=" a"] 6 | i->p1[label=" b"] 7 | i->p2[label=" a"] 8 | i->p6[label=" b"] 9 | i [shape=triangle, regular=true, label=""] 10 | p0 [label="p0"] 11 | p0->p0[label=" a"] 12 | p0->p1[label=" b"] 13 | p0->p2[label=" a"] 14 | p1 [label="p1"] 15 | p1->p0[label=" a"] 16 | p1->p1[label=" b"] 17 | p1->p2[label=" a"] 18 | p2 [label="p2"] 19 | p2->p3[label=" b"] 20 | p3 [label="p3"] 21 | p3->p4[label=" b"] 22 | p4 [label="p4"] 23 | p4 [shape=doublecircle] 24 | p6 [label="p6"] 25 | p6->p7[label=" a"] 26 | p6->p9[label=" a"] 27 | p7 [label="p7"] 28 | p7->p8[label=" b"] 29 | p8 [label="p8"] 30 | p8->p7[label=" a"] 31 | p8->p9[label=" a"] 32 | p9 [label="p9"] 33 | p9 [shape=doublecircle] 34 | } 35 | -------------------------------------------------------------------------------- /test/d3-cfloat.std3: -------------------------------------------------------------------------------- 1 | // DFA: [+-]?(((\d+\.\d*|\.\d+)([eE][+-]?\d+)?)|(\d+[eE][+-]?\d+))[fFlL]? 2 | digraph DFA { 3 | label="DFA: [+-]?(((\\d+\\.\\d*|\\.\\d+)([eE][+-]?\\d+)?)|(\\d+[eE][+-]?\\d+))[fFlL]?" 
4 | node [shape=circle, height=.3, width=.3, margin=0, fontsize=10] 5 | s0 [shape=triangle, regular=true] 6 | s0->s2[label=" ."] 7 | s0->s3[label=" [0-9]"] 8 | s0->s4[label=" [+-]"] 9 | s1 [shape=doublecircle] 10 | s1->s1[label=" [0-9]"] 11 | s1->s5[label=" [FLfl]"] 12 | s1->s7[label=" [Ee]"] 13 | s2->s1[label=" [0-9]"] 14 | s3->s1[label=" ."] 15 | s3->s3[label=" [0-9]"] 16 | s3->s7[label=" [Ee]"] 17 | s4->s2[label=" ."] 18 | s4->s3[label=" [0-9]"] 19 | s5 [shape=doublecircle] 20 | s6 [shape=doublecircle] 21 | s6->s5[label=" [FLfl]"] 22 | s6->s6[label=" [0-9]"] 23 | s7->s6[label=" [0-9]"] 24 | s7->s8[label=" [+-]"] 25 | s8->s6[label=" [0-9]"] 26 | } 27 | -------------------------------------------------------------------------------- /test/32-simple.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -T -p -n -d 2 | # source not recorded 3 | C(AC|B)D 4 | #A: Aho and Ullman 5 | #T: Dragon2: automata example 6 | (a|b)*abb 7 | #A: Sedgewick 8 | #T: Genome Marker 9 | gcg(cgg|agg)*ctg 10 | #A: Sedgewick 11 | #T: multiples of 3, in binary 12 | (0|1(01*0)*1)* 13 | # A simple expr that had problems minimizing 14 | (10)*|ac|dc 15 | #A: gmt 16 | #T: Strings without repeating chars (alphabet size 2) 17 | a?(ba)*b? 18 | # This expr uncovered TWO minimization bugs 19 | a(ba)*b?|b(ab)*a? 20 | #A: gmt 21 | #T: Experiments with word construction 22 | [bcfpt][lr]?([aeiou]|oo|ee)[dmnt] 23 | W(hit|ens|at|in|est|al)[eioa][lmnrxs](ford|by|dale|bot|man|son|gard|hold) 24 | # an example that showed the move-while-distinguishing minimization bug 25 | # originally from http://regexlib.com/REDetails.aspx?regexp_id=654 26 | ^([A-Za-z0-9]\s?)+([,]\s?([A-Za-z0-9]\s?)+)*$ 27 | -------------------------------------------------------------------------------- /test/x3-repl.std: -------------------------------------------------------------------------------- 1 | 2 | #! 
rxx x3-repl.rx x3-repl.rcx 3 | 1: ac 4 | 2: ab?c 5 | 3: ab+c 6 | 4: ab*c 7 | 5: ab{0}c 8 | 6: ab{1}c 9 | 7: ab{2}c 10 | 8: ab{3}c 11 | 9: ab{4}c 12 | A: ab{0,0}c 13 | B: ab{0,1}c 14 | C: ab{1,1}c 15 | D: ab{0,2}c 16 | E: ab{1,2}c 17 | F: ab{2,2}c 18 | G: ab{0,3}c 19 | H: ab{1,3}c 20 | I: ab{2,3}c 21 | J: ab{3,3}c 22 | K: ab{2,5}c 23 | L: ab{0,}c 24 | M: ab{1,}c 25 | N: ab{2,}c 26 | O: ab{3,}c 27 | P: ab{4,}c 28 | Q: ab{1,3}b{1,3}c 29 | 30 | -------------------------- a 31 | 12-45----AB-D--G----L----- ac 32 | -234-6----BCDE-GH---LM---- abc 33 | --34--7-----DEFGHI-KLMN--Q abbc 34 | --34---8-------GHIJKLMNO-Q abbbc 35 | --34----9----------KLMNOPQ abbbbc 36 | --34---------------KLMNOPQ abbbbbc 37 | --34----------------LMNOPQ abbbbbbc 38 | --34----------------LMNOP- abbbbbbbc 39 | --34----------------LMNOP- abbbbbbbbc 40 | -------------------------------------------------------------------------------- /test/g3-floating.std: -------------------------------------------------------------------------------- 1 | {"Expressions":[ 2 | {"Index":0,"Rexpr":"\\d*\\.\\d+|\\d+\\.\\d*"}, 3 | {"Index":1,"Rexpr":"[+-]?(\\d+\\.\\d*|\\.\\d+)([dDeE][+-]?\\d\\d?\\d?)?"}, 4 | {"Index":2,"Rexpr":"[+-]?(((\\d+\\.\\d*|\\.\\d+)([eE][+-]?\\d+)?)|(\\d+[eE][+-]?\\d+))[fFlL]?"} 5 | ], 6 | "Examples":[ 7 | {"State":6,"RXset":[0,1,2],"Example":".6"}, 8 | {"State":7,"RXset":[0,1,2],"Example":"3."}, 9 | {"State":9,"RXset":[1,2],"Example":"+.8"}, 10 | {"State":10,"RXset":[1,2],"Example":"+5."}, 11 | {"State":13,"RXset":[2],"Example":".6L"}, 12 | {"State":15,"RXset":[2],"Example":"3E6"}, 13 | {"State":17,"RXset":[1],"Example":".6d4"}, 14 | {"State":19,"RXset":[1,2],"Example":".6e6"}, 15 | {"State":20,"RXset":[1],"Example":".6d47"}, 16 | {"State":21,"RXset":[1,2],"Example":".6e60"}, 17 | {"State":22,"RXset":[1],"Example":".6d472"}, 18 | {"State":23,"RXset":[1,2],"Example":".6e607"}, 19 | {"State":24,"RXset":[2],"Example":".6e6070"} 20 | ]} 21 | 
-------------------------------------------------------------------------------- /BUILD: -------------------------------------------------------------------------------- 1 | BUILDING THE RX CODE 2 | 3 | Everything here is written in the Go programming language. 4 | Download Go (and read its documentation) from golang.org. 5 | 6 | Go requires that the environment variable GOPATH be set. 7 | This source directory should be located two levels below that, 8 | at $GOPATH/src/rx. 9 | 10 | The command-level programs are found in the ./rx* subdirectories 11 | and are documented in their comment headers. To build them, run 12 | make build 13 | which will produce executable binaries in $GOPATH/bin. 14 | 15 | The default Makefile target is oriented towards development. 16 | It runs the above build followed by a series of automated tests. 17 | Finally it runs the "rxplor" program on a local file "expt.rx" 18 | if this file exists. 19 | 20 | The webapp directory implements the regex.cs.arizona.edu website. 21 | It is built separately by "make serve" (to run locally) or 22 | "make deploy" (to upload). Either of these additionally requires 23 | the Google App Engine in the search path. 24 | -------------------------------------------------------------------------------- /test/56-libclk.rx: -------------------------------------------------------------------------------- 1 | # RegExLib.com -- browse -- Dates and Times 2 | # 3 | # 981. 24-hour time hh:mm[:ss] 4 | (([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))? 5 | # 193. Date in MySQL DB format 6 | ([0-9]{4})-([0-9]{1,2})-([0-9]{1,2}) 7 | # 235. "Improved date validation" (M*/D*/Y*) 1920-2019 8 | ((0?[13578]|10|12)(-|\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))) 9 | # 406.
Full names of the months 10 | (?:J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber) 11 | # 969. RFC2822 date 12 | (?:\s*(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s*)?(0?[1-9]|[1-2][0-9]|3[01])\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(19[0-9]{2}|[2-9][0-9]{3}|[0-9]{2})\s+(2[0-3]|[0-1][0-9]):([0-5][0-9])(?::(60|[0-5][0-9]))?\s+([-\+][0-9]{2}[0-5][0-9]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))(\s*\((\\\(|\\\)|(?<=[^\\])\((?)|(?<=[^\\])\)(?<-C>)|[^\(\)]*)*(?(C)(?!))\))*\s* 13 | -------------------------------------------------------------------------------- /rxq/rxq.go: -------------------------------------------------------------------------------- 1 | /* 2 | rxq.go -- regular expression query 3 | 4 | usage: rxq "rexpr" [file] 5 | 6 | Rxq reads strings, one per line, from file (default stdin). 7 | Each string is tested against the regular expression rexpr, 8 | and is printed with a label of "accept" or "REJECT". 9 | 10 | Spring-2014 / gmt 11 | */ 12 | package main 13 | 14 | import ( 15 | "bufio" 16 | "fmt" 17 | "log" 18 | "os" 19 | "rx" 20 | ) 21 | 22 | // main compiles os.Args[1] and labels each input line "accept" or "REJECT". 23 | func main() { 24 | var ifile *bufio.Scanner 25 | if len(os.Args) == 2 { 26 | ifile = rx.MkScanner("-") 27 | } else if len(os.Args) == 3 { 28 | ifile = rx.MkScanner(os.Args[2]) 29 | } else { 30 | log.Fatal("usage: rxq \"rexpr\" [file]") 31 | } 32 | spec := os.Args[1] 33 | fmt.Printf("regexp: %s\n", spec) 34 | dfa, err := rx.Compile(spec) 35 | if err != nil { 36 | log.Fatal(err) 37 | } 38 | 39 | // load and process candidate strings 40 | for ifile.Scan() { 41 | s := ifile.Text() 42 | if dfa.Accepts(s) != nil { 43 | fmt.Println("accept:", s) 44 | } else { 45 | fmt.Println("REJECT:", s) 46 | } 47 | } 48 | rx.CkErr(ifile.Err()) 49 | } 50 | -------------------------------------------------------------------------------- /test/x1-simple.std: -------------------------------------------------------------------------------- 1 | 2 | #!
rxx x1-simple.rx x1-simple.rcx 3 | 1: [a-zA-Z]+ 4 | 2: [\w]+ 5 | 3: [a-zA-Z ]+ 6 | 4: [-_\.a-zA-Z0-9]+ 7 | 5: [a-zA-Z0-9]+(\.[a-zA-Z0-9]+)+ 8 | # ssn and telno 9 | 6: [0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9] 10 | 7: \([0-9][0-9][0-9]\)[2-9][0-9][0-9]-[0-9][0-9][0-9][0-9] 11 | # www and email 12 | 8: (http://|)[a-zA-Z0-9]+(\.[a-zA-Z0-9]*)+(/[-_\.\?a-zA-Z0-9]*)* 13 | 9: [a-zA-Z][-_\.a-zA-Z0-9]*@[a-zA-Z0-9]*(\.[a-zA-Z0-9]*)* 14 | A: [a-zA-Z][-_\.a-zA-Z0-9]*@[a-zA-Z0-9]*(\.[a-zA-Z0-9]*)? 15 | B: [a-zA-Z][-_\.a-zA-Z0-9]*@[a-zA-Z0-9]*(\.[a-zA-Z0-9]*)+ 16 | 17 | 1234- -- ---- Guadalajara 18 | --3-- -- ---- Stephen Jay Gould 19 | ---45 -- 8--- Homer.J.Simpson 20 | ---4- 6- ---- 525-00-1234 21 | ----- -- ---- (404)123-4567 22 | ----- -7 ---- (404)323-4567 23 | ---45 -- 8--- arizona.edu 24 | ---45 -- 8--- www.cs.arizona.edu 25 | ----- -- 8--- www.cs.arizona.edu/icon 26 | ----- -- 8--- www.cs.arizona.edu/icon/ 27 | ----- -- 8--- http://www.cs.arizona.edu/icon 28 | ----- -- -9A- gmt@arizona 29 | ----- -- -9AB gmt@arizona.edu 30 | ----- -- -9-B gmt@cs.arizona.edu 31 | -------------------------------------------------------------------------------- /rxpick/rxpick.go: -------------------------------------------------------------------------------- 1 | /* 2 | rxpick.go - pick out regular expressions by ordinal number 3 | 4 | usage: rxpick exprfile i ... 5 | 6 | Rxpick reads regular expressions from exprfile and prints 7 | those corresponding to the ordinals given as command arguments, 8 | with their metadata if any. 
9 | 10 | spring 2014 / gmt 11 | */ 12 | package main 13 | 14 | import ( 15 | "fmt" 16 | "log" 17 | "os" 18 | "rx" 19 | "strconv" 20 | ) 21 | 22 | func main() { 23 | // get command line options 24 | if len(os.Args) < 3 { 25 | log.Fatal("usage: rxpick exprfile i ...") 26 | } 27 | filename := os.Args[1] 28 | xset := &rx.BitSet{} 29 | for i := 2; i < len(os.Args); i++ { 30 | n, err := strconv.Atoi(os.Args[i]) 31 | rx.CkErr(err) 32 | xset.Set(n) 33 | } 34 | 35 | // load expressions from file 36 | exprs := rx.LoadExpressions(filename, nil) 37 | 38 | // print desired entries 39 | for _, i := range xset.Members() { 40 | fmt.Printf("\n# { %d }\n", i) 41 | if i >= len(exprs) { 42 | fmt.Printf("# OUT OF RANGE\n") 43 | continue 44 | } 45 | exprs[i].ShowMeta(os.Stdout, "") 46 | fmt.Println(exprs[i].Expr) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /rxsys/system.go: -------------------------------------------------------------------------------- 1 | // system.go -- nonportable interface functions. 2 | // 3 | // This file holds system interface functions not usable in the App Engine. 4 | 5 | package rxsys 6 | 7 | import ( 8 | "fmt" 9 | "rx" 10 | "syscall" 11 | "time" 12 | ) 13 | 14 | // CPUtime returns the current CPU usage (user time + system time). 15 | func CPUtime() time.Duration { 16 | var ustruct syscall.Rusage 17 | rx.CkErr(syscall.Getrusage(0, &ustruct)) // 0 is RUSAGE_SELF on Linux/BSD: this process 18 | user := time.Duration(syscall.TimevalToNsec(ustruct.Utime)) 19 | sys := time.Duration(syscall.TimevalToNsec(ustruct.Stime)) 20 | // Getrusage reports user and system time separately; the total is their sum 21 | return user + sys 22 | } 23 | 24 | // Interval returns the CPU time (user + system) since the preceding call. 25 | func Interval() time.Duration { 26 | total := CPUtime() 27 | delta := total - prevTotal 28 | prevTotal = total 29 | return delta 30 | } 31 | 32 | var prevTotal time.Duration // total time at last check 33 | 34 | // ShowInterval calcs and (unless label is empty) prints the last interval.
35 | func ShowInterval(label string) { 36 | dt := Interval().Seconds() 37 | if label != "" { 38 | fmt.Printf("%7.3f %s\n", dt, label) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /test/d2-demo.std: -------------------------------------------------------------------------------- 1 | 2 | #! rxplor -m -l -n -d 3 | 4 | #! rxplor -N - 5 | 6 | #! rxplor -D - 7 | 8 | expr 0: (a|b)*abb 9 | 10 | expr 1: b(ab)*a 11 | ----------------------- NFA ---------------------- 12 | Inputs: [ab] 13 | Witnesses: [ab] 14 | begin => { 0 1 2 6 } 15 | p0. a => { 0 1 2 } 16 | p1. b => { 0 1 2 } 17 | p2. a => { 3 } 18 | p3. b => { 4 } 19 | p4. b => { 5 } 20 | p5. # => { } 21 | p6. b => { 7 9 } 22 | p7. a => { 8 } 23 | p8. b => { 7 9 } 24 | p9. a => { 10 } 25 | p10. # => { } 26 | ----------------- Unoptimized DFA ---------------- 27 | s0. { p0 p1 p2 p6 } [a]:s1 [b]:s2 28 | s1. { p0 p1 p2 p3 } [a]:s1 [b]:s3 29 | s2. { p0 p1 p2 p7 p9 } [a]:s4 [b]:s5 30 | s3. { p0 p1 p2 p4 } [a]:s1 [b]:s6 31 | s4# { p0 p1 p2 p3 p8 p10 } [a]:s1 [b]:s7 32 | s5. { p0 p1 p2 } [a]:s1 [b]:s5 33 | s6# { p0 p1 p2 p5 } [a]:s1 [b]:s5 34 | s7. { p0 p1 p2 p4 p7 p9 } [a]:s4 [b]:s6 35 | ------------------ Minimized DFA ----------------- 36 | s0. { p0 p1 p2 p6 } [b]:s3 [a]:s4 37 | s1# { p0 p1 p2 p3 p8 p10 } [a]:s4 [b]:s5 38 | s2# { p0 p1 p2 p5 } [a]:s4 [b]:s7 39 | s3. { p0 p1 p2 p7 p9 } [a]:s1 [b]:s7 40 | s4. { p0 p1 p2 p3 } [a]:s4 [b]:s6 41 | s5. { p0 p1 p2 p4 p7 p9 } [a]:s1 [b]:s2 42 | s6. { p0 p1 p2 p4 } [b]:s2 [a]:s4 43 | s7. { p0 p1 p2 } [a]:s4 [b]:s7 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2014 Arizona Board of Regents; all rights reserved. 2 | 3 | This software is being provided by the copyright holders under the 4 | following license. 
By obtaining, using and/or copying this software, you 5 | agree that you have read, understood, and will comply with the following 6 | terms and conditions: 7 | 8 | Permission to use, copy, modify, and distribute this software and its 9 | documentation for any purpose and without fee or royalty is hereby granted, 10 | provided that the full text of this notice appears on all copies of the 11 | software and documentation or portions thereof, including modifications, 12 | that you make. 13 | 14 | This software is provided "as is," and copyright holders make no 15 | representations or warranties, express or implied. By way of example, but 16 | not limitation, copyright holders make no representations or warranties of 17 | merchantability or fitness for any particular purpose or that the use of the 18 | software or documentation will not infringe any third party patents, 19 | copyrights, trademarks or other rights. Copyright holders will bear no 20 | liability for any use of this software or documentation. 21 | 22 | The name and trademarks of copyright holders may not be used in advertising 23 | or publicity pertaining to the software without specific, written prior 24 | permission. Title to copyright in this software and any associated 25 | documentation will at all times remain with copyright holders. 26 | -------------------------------------------------------------------------------- /webapp/code/syntax.go: -------------------------------------------------------------------------------- 1 | // syntax.go -- generate syntax explanation page 2 | 3 | package webapp 4 | 5 | import ( 6 | "html/template" 7 | "net/http" 8 | ) 9 | 10 | // syntax generates a page outlining the accepted syntax 11 | func syntax(w http.ResponseWriter, r *http.Request) { 12 | putheader(w, r, "Syntax") 13 | tSyntax.Execute(w, nil) 14 | putfooter(w, r) 15 | } 16 | 17 | var tSyntax = template.Must(template.New("syntax").Parse(` 18 |

We implement traditional regular expressions with a few simple extensions. 19 |

The following forms are handled:

20 |       abc  a|b|c  a(b|c)d
21 |       a?  b*  c+  d{m,n}
22 |       \a \e \f \n \r \t \v \046 \xF7 \u03A8
23 |       .  \d \s \w \D \S \W
24 |       [abc]  [^abc]  [a-c]  [\x]
25 | 
26 |

All expressions are “anchored”. 27 | An initial ^ and/or final $ is ignored. 28 |
Embedded anchors, as well as other anchor forms like \b, 29 | are illegal. 30 |

Wildcard character sets (for 31 |   .   \w   \D   [^\d]   etc.) 32 | are limited to the ASCII subset [\x01-\x7F] of Unicode. 33 |

Most Perl (? forms are illegal, 34 | but two kinds are allowed and ignored:

35 |       Comments:  (?#...)
36 |       Captures:  (?:  (?'id'  (?<id>  (?P<id>
37 | 
38 |

Leading and trailing spaces are trimmed to tolerate imprecision in 39 | copying and pasting. To inhibit trimming, parenthesize the expression. 40 | `)) 41 | -------------------------------------------------------------------------------- /test/80-questions.std2: -------------------------------------------------------------------------------- 1 | 66 expression(s) in 43 cluster(s) using 50% threshold 2 | 3 | cluster 8: "1" (8/8) 4 | 8. 0|-?[1-9]\d* 5 | 7. [+-]?\d+ 6 | 6. [1-9]\d* 7 | 5. \d*[1-9] 8 | 4. \d+ 9 | 3. 1?(01)*0? 10 | 1. 0|1[01]* 11 | 0. [01]+ 12 | 13 | cluster 15: "20714-3413" (2/2) 14 | 15. \d{5}-\d{4} 15 | 14. \d{5}(-\d{4})? 16 | 17 | cluster 19: "4169708589" (4/4) 18 | 19. ([2-9]\d\d)?[2-9]\d\d\d\d\d\d 19 | 18. [2-9]\d\d[2-9]\d\d\d\d\d\d 20 | 17. \d{10} 21 | 16. \d{7}|\d{10} 22 | 23 | cluster 21: "(492) 015-0820" (2/2) 24 | 21. ([2-9]\d\d-|\([2-9]\d\d\) ?)\d\d\d-\d\d\d\d 25 | 20. (\d\d\d-|\(\d\d\d\) ?)\d\d\d-\d\d\d\d 26 | 27 | cluster 28: "739-04-2803" (3/3) 28 | 28. ([0-6]\d{2}|7([0-6]\d|7[012]))-\d\d-\d\d\d\d 29 | 27. \d\d\d-(\d[1-9]|[1-9]\d)-\d\d\d\d 30 | 26. \d\d\d-\d\d-\d\d\d\d 31 | 32 | cluster 30: "5518709128260289" (2/2) 33 | 30. [3456]\d{15} 34 | 29. \d{16} 35 | 36 | cluster 39: "213428" (2/2) 37 | 39. ([01]\d|2[0-3])[0-5][0-9][0-5][0-9] 38 | 38. [0-2][0-9][0-5][0-9][0-5][0-9] 39 | 40 | cluster 42: "01:50:50" (3/3) 41 | 42. (0?\d|1[0-2]):[0-5][0-9](:[0-5][0-9])? 42 | 41. ([01]\d|2[0-3]):[0-5][0-9](:[0-5][0-9])? 43 | 40. [0-2][0-9]:[0-5][0-9](:[0-5][0-9])? 44 | 45 | cluster 46: "19420404" (3/3) 46 | 46. (19|20)?\d\d(0[1-9]|1[012])(0[1-9]|[12]\d|3[01]) 47 | 45. (19|20)\d\d(0[1-9]|1[012])(0[1-9]|[12]\d|3[01]) 48 | 44. [12][90]\d\d[01][0-9][012][0-9] 49 | 50 | cluster 59: "fLRX@E3VG.d0HZ" (4/4) 51 | 59. \w[-.'\w]*@\w+(\.\w+)+ 52 | 58. [-.'\w]+@\w+(\.\w+)+ 53 | 57. \w+@\w+(\.\w+)+ 54 | 56. 
\w+@\w+\.\w+ 55 | -------------------------------------------------------------------------------- /test/92-reject.std: -------------------------------------------------------------------------------- 1 | Options: -R -T -g -h -i -l -v -I 0 2 | 3 | # things that other accept but we don't 4 | 5 | ERROR: abc[[:digit:]]efg 6 | rx: [:class:] unimplemented: in "abc[[:digit:]]efg" 7 | 8 | ERROR: abc\cIdef 9 | rx: '\cx' unimplemented: in "abc\cIdef" 10 | 11 | ERROR: [abc\cIdef] 12 | rx: '\cx' unimplemented: in "[abc\cIdef]" 13 | 14 | ERROR: abc\pDdef 15 | rx: '\px' unimplemented: in "abc\pDdef" 16 | 17 | ERROR: [abc\pDdef] 18 | rx: '\px' unimplemented: in "[abc\pDdef]" 19 | 20 | ERROR: abc\PDdef 21 | rx: '\Px' unimplemented: in "abc\PDdef" 22 | 23 | ERROR: [abc\PDdef] 24 | rx: '\Px' unimplemented: in "[abc\PDdef]" 25 | 26 | ERROR: a??b 27 | rx: prefer-fewer '?' unimplemented: in "a??b" 28 | 29 | ERROR: a*?b 30 | rx: prefer-fewer '?' unimplemented: in "a*?b" 31 | 32 | ERROR: a+?b 33 | rx: prefer-fewer '?' unimplemented: in "a+?b" 34 | 35 | ERROR: (abc)*? 36 | rx: prefer-fewer '?' unimplemented: in "(abc)*?" 37 | 38 | ERROR: abc\b.* 39 | rx: \b (boundary) unimplemented: in "abc\b.*" 40 | 41 | ERROR: (ab)cd\1 42 | rx: \1 (backref) unimplemented: in "(ab)cd\1" 43 | 44 | ERROR: (ab)cd\5 45 | rx: \5 (backref) unimplemented: in "(ab)cd\5" 46 | 47 | ERROR: (ab)cd\9 48 | rx: \9 (backref) unimplemented: in "(ab)cd\9" 49 | 50 | ERROR: (^ab)|(^cd) 51 | rx: Embedded '^' unimplemented: in "(^ab)|(^cd)" 52 | 53 | ERROR: (ef$)|(gh$) 54 | rx: Embedded '$' unimplemented: in "(ef$)|(gh$)" 55 | 0 expression(s) loaded 56 | (17 expression(s) rejected) 57 | -------------------------------------------------------------------------------- /NOTES: -------------------------------------------------------------------------------- 1 | Some notes about the RX code 2 | 3 | Here are some random thoughts to help understand and navigate the RX code. 
4 | 5 | 6 | PACKAGE ORGANIZATION 7 | The main package rx is for "library" type routines. 8 | The subpackage rx/rxsys holds library routines not acceptable in web apps. 9 | Most of the significant code should go in the library; 10 | utility programs should just provide an interface to the library. 11 | Subpackages rxplor, rxquest, etc implement command-level programs. 12 | Subtree webapp implements the regex.cs.arizona.edu website. 13 | 14 | VISIBILITY 15 | [Recall: Capitalized symbols are exported from a package.] 16 | Not everything capitalized should be considered an external "feature". 17 | Some things are exported for access by debugging printfs in commands. 18 | Parse tree node fields must all be exportable to be "gobbable". 19 | 20 | STRUCTS 21 | Structs are uniformly passed by reference (as pointers), not values. 22 | [Interface types such as Node and error are pointers without explicit '*'.] 23 | 24 | CODING STYLE 25 | Code is formatted according to the standard Go rules. 26 | "go fmt" should be run before any check-in. 27 | There is a "pre-commit" Git hook available to check this. 28 | Lines should be limited to 80 columns for better human comprehension 29 | (even though we're not limited to punch cards or fixed-width terminals). 30 | 31 | COMMENTING 32 | Terminology is based on the "Dragon book" [Aho & Ullman 1977]. 33 | Struct, func, and package comments should work with "godoc" 34 | (so: no blank line between header comment and struct or func). 35 | -------------------------------------------------------------------------------- /test/d3-cfloat.std2: -------------------------------------------------------------------------------- 1 | // NFA: [+-]?(((\d+\.\d*|\.\d+)([eE][+-]?\d+)?)|(\d+[eE][+-]?\d+))[fFlL]? 2 | digraph NFA { 3 | label="NFA: [+-]?(((\\d+\\.\\d*|\\.\\d+)([eE][+-]?\\d+)?)|(\\d+[eE][+-]?\\d+))[fFlL]?" 
4 | node [shape=circle, height=.3, margin=0, fontsize=10] 5 | i->p0[label=" [+-]"] 6 | i->p1[label=" [0-9]"] 7 | i->p4[label=" ."] 8 | i->p9[label=" [0-9]"] 9 | i [shape=triangle, regular=true, label=""] 10 | p0 [label="p0"] 11 | p0->p1[label=" [0-9]"] 12 | p0->p4[label=" ."] 13 | p0->p9[label=" [0-9]"] 14 | p1 [label="p1"] 15 | p1->p1[label=" [0-9]"] 16 | p1->p2[label=" ."] 17 | p2 [label="p2"] 18 | p2->p3[label=" [0-9]"] 19 | p2->p6[label=" [Ee]"] 20 | p2->p13[label=" [FLfl]"] 21 | p2 [shape=doublecircle] 22 | p3 [label="p3"] 23 | p3->p3[label=" [0-9]"] 24 | p3->p6[label=" [Ee]"] 25 | p3->p13[label=" [FLfl]"] 26 | p3 [shape=doublecircle] 27 | p4 [label="p4"] 28 | p4->p5[label=" [0-9]"] 29 | p5 [label="p5"] 30 | p5->p5[label=" [0-9]"] 31 | p5->p6[label=" [Ee]"] 32 | p5->p13[label=" [FLfl]"] 33 | p5 [shape=doublecircle] 34 | p6 [label="p6"] 35 | p6->p7[label=" [+-]"] 36 | p6->p8[label=" [0-9]"] 37 | p7 [label="p7"] 38 | p7->p8[label=" [0-9]"] 39 | p8 [label="p8"] 40 | p8->p8[label=" [0-9]"] 41 | p8->p13[label=" [FLfl]"] 42 | p8 [shape=doublecircle] 43 | p9 [label="p9"] 44 | p9->p9[label=" [0-9]"] 45 | p9->p10[label=" [Ee]"] 46 | p10 [label="p10"] 47 | p10->p11[label=" [+-]"] 48 | p10->p12[label=" [0-9]"] 49 | p11 [label="p11"] 50 | p11->p12[label=" [0-9]"] 51 | p12 [label="p12"] 52 | p12->p12[label=" [0-9]"] 53 | p12->p13[label=" [FLfl]"] 54 | p12 [shape=doublecircle] 55 | p13 [label="p13"] 56 | p13 [shape=doublecircle] 57 | } 58 | -------------------------------------------------------------------------------- /test/d3-cfloat.std: -------------------------------------------------------------------------------- 1 | 2 | #! rxplor -m -l -n -d 3 | 4 | #! rxplor -N - 5 | 6 | #! rxplor -D - 7 | 8 | expr 0: [+-]?(((\d+\.\d*|\.\d+)([eE][+-]?\d+)?)|(\d+[eE][+-]?\d+))[fFlL]? 9 | ----------------------- NFA ---------------------- 10 | Inputs: [+.0-9EFLefl-] 11 | Witnesses: [+.0EF] 12 | begin => { 0 1 4 9 } 13 | p0. [+-] => { 1 4 9 } 14 | p1. [0-9] => { 1 2 } 15 | p2. . 
=> { 3 6 13 14 } 16 | p3. [0-9] => { 3 6 13 14 } 17 | p4. . => { 5 } 18 | p5. [0-9] => { 5 6 13 14 } 19 | p6. [Ee] => { 7 8 } 20 | p7. [+-] => { 8 } 21 | p8. [0-9] => { 8 13 14 } 22 | p9. [0-9] => { 9 10 } 23 | p10. [Ee] => { 11 12 } 24 | p11. [+-] => { 12 } 25 | p12. [0-9] => { 12 13 14 } 26 | p13. [FLfl] => { 14 } 27 | p14. # => { } 28 | ----------------- Unoptimized DFA ---------------- 29 | s0. { p0 p1 p4 p9 } [+-]:s1 [.]:s2 [0-9]:s3 30 | s1. { p1 p4 p9 } [.]:s2 [0-9]:s3 31 | s2. { p5 } [0-9]:s4 32 | s3. { p1 p2 p9 p10 } [0-9]:s3 [.]:s5 [Ee]:s6 33 | s4# { p5 p6 p13 p14 } [0-9]:s4 [Ee]:s7 [FLfl]:s8 34 | s5# { p3 p6 p13 p14 } [0-9]:s5 [Ee]:s7 [FLfl]:s8 35 | s6. { p11 p12 } [+-]:s9 [0-9]:s10 36 | s7. { p7 p8 } [+-]:s11 [0-9]:s12 37 | s8# { p14 } 38 | s9. { p12 } [0-9]:s10 39 | s10# { p12 p13 p14 } [FLfl]:s8 [0-9]:s10 40 | s11. { p8 } [0-9]:s12 41 | s12# { p8 p13 p14 } [FLfl]:s8 [0-9]:s12 42 | ------------------ Minimized DFA ----------------- 43 | s0. { p0 p1 p4 p9 } [.]:s2 [0-9]:s3 [+-]:s4 44 | s1# { p3 p5 p6 p13 p14 } [0-9]:s1 [FLfl]:s5 [Ee]:s7 45 | s2. { p5 } [0-9]:s1 46 | s3. { p1 p2 p9 p10 } [.]:s1 [0-9]:s3 [Ee]:s7 47 | s4. { p1 p4 p9 } [.]:s2 [0-9]:s3 48 | s5# { p14 } 49 | s6# { p8 p12 p13 p14 } [FLfl]:s5 [0-9]:s6 50 | s7. { p7 p8 p11 p12 } [0-9]:s6 [+-]:s8 51 | s8. { p8 p12 } [0-9]:s6 52 | -------------------------------------------------------------------------------- /webapp/code/about.go: -------------------------------------------------------------------------------- 1 | // about.go -- generate "about" page 2 | 3 | package webapp 4 | 5 | import ( 6 | "fmt" 7 | "net/http" 8 | ) 9 | 10 | // about generates a page describing and crediting the website 11 | func about(w http.ResponseWriter, r *http.Request) { 12 | putheader(w, r, "About") 13 | fmt.Fprintf(w, ` 14 |

This website presents work by: 15 |

Todd Proebsting 16 |
Gregg Townsend 17 |
Jasmin Uribe 18 |

Department of Computer Science 19 |
The University of Arizona 20 |
Tucson, Arizona, USA 21 |

Send mail to %s. 22 |

The website uses our own custom regular expression software written in the 23 | Go programming language. 24 | Source code is available at 25 | GitHub. 26 |

Graph drawing uses the Graphviz 27 | layout package as ported to JavaScript by 28 | Viz.js 29 | using emscripten. 30 | This means that Graphviz runs in your own browser to lay out and draw 31 | the graphs! 32 | The first graph fetches a 2.5 MB script, but subsequent graphs draw quickly. 33 |

The “Download” buttons for saving graphs use 34 | cutting-edge browser features. If they don't seem to do anything, look 35 | in your Downloads folder for NFA.svg or 36 | DFA.svg (Firefox); download.svg (Chrome or Opera); 37 | or in a separate window, which can then be saved manually (Safari). 38 | `, MAILTO) 39 | putfooter(w, r) 40 | } 41 | -------------------------------------------------------------------------------- /webapp/code/graph.go: -------------------------------------------------------------------------------- 1 | // graph.go -- code for displaying DFA and NFA machines as lists or graphs 2 | 3 | package webapp 4 | 5 | import ( 6 | "bytes" 7 | "fmt" 8 | "net/http" 9 | "rx" 10 | ) 11 | 12 | // multaut displays a multi-NFA and multi-DFA 13 | func multaut(w http.ResponseWriter, r *http.Request) { 14 | 15 | // must read all input before writing anything 16 | exprlist := getexprs(r) 17 | nx := len(exprlist) 18 | 19 | // parse and echo the input 20 | treelist := make([]rx.Node, 0, nx) 21 | putheader(w, r, "Multi-expression Automata") 22 | fmt.Fprintf(w, "

%d expressions:\n", nx) 23 | for i, s := range exprlist { 24 | fmt.Fprintf(w, "
%c:   %s\n", 25 | rx.AcceptLabels[i], hx(s)) 26 | tree, err := rx.Parse(s) 27 | if !showerror(w, err) { 28 | treelist = append(treelist, rx.Augment(tree, i)) 29 | } 30 | } 31 | 32 | if nx > 0 && len(treelist) == nx { // if no errors 33 | dfa := rx.MultiDFA(treelist) // build combined DFA 34 | dmin := dfa.Minimize() 35 | showaut(w, dmin, exprlist) 36 | } 37 | putfooter(w, r) 38 | } 39 | 40 | // print NFA and DFA, with buttons linking to display page 41 | func showaut(w http.ResponseWriter, dfa *rx.DFA, exprlist []string) { 42 | 43 | fmt.Fprintln(w, `

`) 44 | 45 | nfaBuffer := &bytes.Buffer{} 46 | dfa.ShowNFA(nfaBuffer, "") 47 | fmt.Fprintf(w, "

NFA

\n%s
\n", 48 | hx(string(nfaBuffer.Bytes()))) 49 | formlink(w, "/drawNFA", exprlist, "Draw the graph") 50 | 51 | fmt.Fprintln(w, `
`) 52 | dfaBuffer := &bytes.Buffer{} 53 | dfa.ShowStates(dfaBuffer, "") 54 | fmt.Fprintf(w, 55 | "

DFA

\n%s
\n", 56 | hx(string(dfaBuffer.Bytes()))) 57 | formlink(w, "/drawDFA", exprlist, "Draw the graph") 58 | 59 | fmt.Fprintln(w, `
`) 60 | } 61 | -------------------------------------------------------------------------------- /webapp/code/main.go: -------------------------------------------------------------------------------- 1 | // main.go -- general control of the web application 2 | 3 | package webapp 4 | 5 | import ( 6 | "fmt" 7 | "html" 8 | "math/rand" 9 | "net/http" 10 | "rx" 11 | "time" 12 | ) 13 | 14 | // address for sending mail (generates a mailto: link) 15 | const MAILTO = `` 16 | 17 | // init registers URLs for dispatching and sets a random seed 18 | func init() { 19 | http.HandleFunc("/", home) // home.go; anything unmatched 20 | http.HandleFunc("/examine", examine) // examine.go 21 | http.HandleFunc("/details", details) // examine.go 22 | http.HandleFunc("/drawDFA", drawDFA) // draw.go 23 | http.HandleFunc("/drawNFA", drawNFA) // draw.go 24 | http.HandleFunc("/compare", compare) // compare.go 25 | http.HandleFunc("/combos", combos) // compare.go 26 | http.HandleFunc("/multaut", multaut) // graph.go 27 | http.HandleFunc("/syntax", syntax) // syntax.go 28 | http.HandleFunc("/about", about) // about.go 29 | http.HandleFunc("/contact", contact) // contact.go 30 | http.HandleFunc("/info", info) // info.go 31 | rand.Seed(int64(time.Now().Nanosecond())) 32 | rx.MaxComplexity = 200 // at least twice what's needed for examples 33 | } 34 | 35 | // hx escapes an arbitrary stringable value for output as HTML 36 | func hx(s interface{}) string { 37 | return html.EscapeString(fmt.Sprint(s)) 38 | } 39 | 40 | // showerror displays an error and returns true if its argument is not nil 41 | func showerror(w http.ResponseWriter, err error) bool { 42 | if err == nil { 43 | return false 44 | } 45 | fmt.Fprintf(w, "
Error:\n") 46 | if pe, ok := err.(*rx.ParseError); ok { 47 | fmt.Fprintf(w, " %s
In expression: %s", 48 | pe.Message, hx(pe.BadExpr)) 49 | } else { 50 | fmt.Fprintf(w, "%s", hx(err)) 51 | } 52 | fmt.Fprintf(w, "
\n") 53 | return true 54 | } 55 | -------------------------------------------------------------------------------- /webapp/code/body.go: -------------------------------------------------------------------------------- 1 | // body.go -- generate headers and footers for all pages 2 | 3 | package webapp 4 | 5 | import ( 6 | "html/template" 7 | "io" 8 | "net/http" 9 | ) 10 | 11 | // putheader outputs our standard HTML page header 12 | func putheader(w http.ResponseWriter, r *http.Request, title string) { 13 | data := struct{ Prefix, Title, Favicon string }{ 14 | "RegEx", title, "icon.png"} 15 | if r.Host == "localhost:8080" { 16 | data.Favicon = "itest.png" 17 | } 18 | tHeader.Execute(w, data) 19 | } 20 | 21 | var tHeader = template.Must(template.New("header").Parse( 22 | ` 23 | 24 | {{.Prefix}}: {{.Title}} 25 | 26 | 27 | 28 | 29 | 30 | 31 |

{{.Prefix}}: {{.Title}}

32 | `)) 33 | 34 | // putfooter outputs our standard HTML page footer 35 | func putfooter(w io.Writer, r *http.Request) { 36 | tFooter.Execute(w, nil) 37 | } 38 | 39 | var tFooter = template.Must(template.New("footer").Parse( 40 | `



41 |

42 | Home 43 | | Examine 44 | | Compare 45 | | Syntax 46 | | About 47 | | Contact 48 | 49 | 50 | 52 | 53 | 54 | `)) 55 | -------------------------------------------------------------------------------- /test/d5-abc1.std: -------------------------------------------------------------------------------- 1 | 2 | #! rxplor -m -l -n -d 3 | 4 | #! rxplor -N - 5 | 6 | #! rxplor -D - 7 | 8 | 9 | 10 | # one or more characters from an alphabet of three with no doubled letters 11 | 12 | expr 0: a(ba)*b?|b(ab)*a?|a?(ba)*b?c((a(ba)*b?|b(ab)*a?)c)*(a(ba)*b?|b(ab)*a?)? 13 | ----------------------- NFA ---------------------- 14 | Inputs: [abc] 15 | Witnesses: [abc] 16 | begin => { 0 4 8 9 11 12 } 17 | p0. a => { 1 3 30 } 18 | p1. b => { 2 } 19 | p2. a => { 1 3 30 } 20 | p3. b => { 30 } 21 | p4. b => { 5 7 30 } 22 | p5. a => { 6 } 23 | p6. b => { 5 7 30 } 24 | p7. a => { 30 } 25 | p8. a => { 9 11 12 } 26 | p9. b => { 10 } 27 | p10. a => { 9 11 12 } 28 | p11. b => { 12 } 29 | p12. c => { 13 17 22 26 30 } 30 | p13. a => { 14 16 21 } 31 | p14. b => { 15 } 32 | p15. a => { 14 16 21 } 33 | p16. b => { 21 } 34 | p17. b => { 18 20 21 } 35 | p18. a => { 19 } 36 | p19. b => { 18 20 21 } 37 | p20. a => { 21 } 38 | p21. c => { 13 17 22 26 30 } 39 | p22. a => { 23 25 30 } 40 | p23. b => { 24 } 41 | p24. a => { 23 25 30 } 42 | p25. b => { 30 } 43 | p26. b => { 27 29 30 } 44 | p27. a => { 28 } 45 | p28. b => { 27 29 30 } 46 | p29. a => { 30 } 47 | p30. # => { } 48 | ----------------- Unoptimized DFA ---------------- 49 | s0. 
{ p0 p4 p8 p9 p11 p12 } [a]:s1 [b]:s2 [c]:s3 50 | s1# { p1 p3 p9 p11 p12 p30 } [c]:s3 [b]:s4 51 | s2# { p5 p7 p10 p12 p30 } [c]:s3 [a]:s5 52 | s3# { p13 p17 p22 p26 p30 } [a]:s6 [b]:s7 53 | s4# { p2 p10 p12 p30 } [a]:s1 [c]:s3 54 | s5# { p6 p9 p11 p12 p30 } [b]:s2 [c]:s3 55 | s6# { p14 p16 p21 p23 p25 p30 } [c]:s3 [b]:s8 56 | s7# { p18 p20 p21 p27 p29 p30 } [c]:s3 [a]:s9 57 | s8# { p15 p21 p24 p30 } [c]:s3 [a]:s6 58 | s9# { p19 p21 p28 p30 } [c]:s3 [b]:s7 59 | ------------------ Minimized DFA ----------------- 60 | s0. { p0 p4 p8 p9 p11 p12 } [a]:s1 [b]:s2 [c]:s3 61 | s1# { p1 p3 p6 p9 p11 p12 p14 p16 p19 p21 p23 p25 p28 p30 } [b]:s2 [c]:s3 62 | s2# { p2 p5 p7 p10 p12 p15 p18 p20 p21 p24 p27 p29 p30 } [a]:s1 [c]:s3 63 | s3# { p13 p17 p22 p26 p30 } [a]:s1 [b]:s2 64 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for RX library and programs 2 | 3 | PKG = rx 4 | PG1 = $(PKG)/rxplor $(PKG)/rxpick $(PKG)/rxquest $(PKG)/rxcluster 5 | PG2 = $(PKG)/rxtime $(PKG)/rxg $(PKG)/rxq $(PKG)/rxx 6 | PROGS = $(PG1) $(PG2) 7 | GOBIN = $$GOPATH/bin 8 | 9 | DEMO='-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d{1,3})?' 10 | 11 | 12 | # The default is to rebuild, run all tests, run expt if present. 13 | default: build test expt 14 | 15 | # "make build" compiles all programs (and the library). 16 | build: .FORCE 17 | go install $(PROGS) 18 | 19 | # "make test" runs unit tests and the shell-based tests 20 | test: .FORCE 21 | go test 22 | cd test; $(MAKE) 23 | 24 | # "make expt" runs "rxplor -a expt.rx" if expt.rx exists. 25 | # This allows adding a quick temporary test to the build process. 26 | expt: 27 | test -f expt.rx && $(GOBIN)/rxplor -a expt.rx || : 28 | 29 | # "make bundle" combines all sources into a single file on standard output. 30 | # This requires the Kernighan and Pike "bundle" utility in the path. 
31 | bundle: 32 | @bundle *.go */*.go webapp/code/*go 33 | 34 | # "make fmt" formats all the source files to Go standards 35 | # This should be done before checking in any code. 36 | # If "go fmt" echoes a filename, it has modified that file. 37 | fmt: 38 | go fmt *.go 39 | go fmt rxsys/*.go 40 | go fmt rxplor/rxplor.go 41 | go fmt rxpick/rxpick.go 42 | go fmt rxquest/rxquest.go 43 | go fmt rxcluster/rxcluster.go 44 | go fmt rxtime/rxtime.go 45 | go fmt rxg/rxg.go 46 | go fmt rxq/rxq.go 47 | go fmt rxx/rxx.go 48 | go fmt webapp/code/*.go 49 | 50 | # "make demo" displays a graph of the DFA of the exprs defined above. 51 | demo: 52 | rxplor -D @ -e $(DEMO) 53 | 54 | # "make serve" builds and runs the web app on localhost:8080. 55 | # The server runs until killed. 56 | serve: 57 | goapp serve webapp 58 | 59 | # "make deploy" uploads the web app to appspot.com. 60 | deploy: 61 | goapp deploy webapp 62 | 63 | 64 | # "make clean" removes the products of building and testing. 65 | clean: 66 | go clean -i $(PKG) $(PROGS) 67 | cd test; $(MAKE) clean 68 | 69 | 70 | .FORCE: # target meaning "always run" 71 | -------------------------------------------------------------------------------- /webapp/static/functions.js: -------------------------------------------------------------------------------- 1 | 74 | -------------------------------------------------------------------------------- /bit_test.go: -------------------------------------------------------------------------------- 1 | // bit_test.go -- unit tests for the BitSet functions 2 | // 3 | // These tests are run by the "go test" command. 4 | 5 | package rx 6 | 7 | import ( 8 | "fmt" 9 | "testing" 10 | ) 11 | 12 | // TestBits exercises BitSet operations and checks the results. 
13 | func TestBits(t *testing.T) { 14 | fmt.Println("bit_test.go: TestBits") 15 | bs := &BitSet{} 16 | ck(t, "e00", true, bs.IsEmpty()) 17 | ck(t, "n00", 0, bs.Count()) 18 | ck(t, "l00", 0, bs.LowBit()) 19 | ck(t, "h00", -1, bs.HighBit()) 20 | ck(t, "s00", "{ }", bs) 21 | ck(t, "s0", "{ 0 }", bs.Set(0)) 22 | ck(t, "s2", "{ 0 2 }", bs.Set(2)) 23 | ck(t, "c0", "{ 2 }", bs.Clear(0)) 24 | ck(t, "l2", 2, bs.LowBit()) 25 | ck(t, "h2", 2, bs.HighBit()) 26 | ck(t, "s3a", "{ 2 3 }", bs.Set(3)) 27 | ck(t, "s3b", "{ 2 3 }", bs.Set(3)) // no harm setting twice 28 | ck(t, "s5", "{ 2 3 5 }", bs.Set(5)) 29 | ck(t, "n5", 3, bs.Count()) 30 | ck(t, "c3", "{ 2 5 }", bs.Clear(3)) 31 | ck(t, "t2", true, bs.Test(2)) 32 | ck(t, "t3", false, bs.Test(3)) 33 | ck(t, "e25", false, bs.IsEmpty()) 34 | bs25 := (&BitSet{}).Set(2).Set(5) 35 | ck(t, "bs25", "{ 2 5 }", bs25) 36 | bs58 := (&BitSet{}).Set(8).Set(5) 37 | ck(t, "bs58", "{ 5 8 }", bs58) 38 | ck(t, "eq25", true, bs25.Equals(bs)) 39 | ck(t, "!eq0", false, bs25.Equals(&BitSet{})) 40 | ck(t, "eq58", false, bs58.Equals(bs25)) 41 | ck(t, "and", "{ 5 }", bs25.And(bs58)) 42 | ck(t, "or", "{ 2 5 8 }", bs25.Or(bs58)) 43 | ck(t, "bs25b", "{ 2 5 }", bs25) 44 | ck(t, "bs58b", "{ 5 8 }", bs58) 45 | ck(t, "l58", 5, bs58.LowBit()) 46 | ck(t, "h58", 8, bs58.HighBit()) 47 | ck(t, "orw", "{ 2 5 8 }", bs25.OrWith(bs58)) 48 | ck(t, "andw", "{ 5 8 }", bs58.AndWith(bs25)) 49 | ck(t, "andn", "{ 2 }", bs25.AndNot(bs58)) 50 | cset := CharSet("abcdeiouvw") 51 | ck(t, "cset", "{ 97 98 99 100 101 105 111 117 118 119 }", cset) 52 | ck(t, "brac", "[a-eiouvw]", cset.Bracketed()) 53 | } 54 | 55 | // ck validates expected versus actual values (after string conversion). 
56 | func ck(t *testing.T, label string, expected interface{}, actual interface{}) { 57 | ex := fmt.Sprint(expected) 58 | ac := fmt.Sprint(actual) 59 | if ex != ac { 60 | t.Error(label, ": expected \"", ex, "\", got \"", ac, "\"") 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /synth.go: -------------------------------------------------------------------------------- 1 | // synth.go -- code for generating examples of matching expressions 2 | 3 | package rx 4 | 5 | // A DFAexample is a matching string synthesized from a DFA. 6 | type DFAexample struct { 7 | State int // index of accepting state in DFA 8 | RXset *BitSet // set of matching regular expression indexes 9 | Example string // example string 10 | } 11 | 12 | // A partx is a partially built example used on the task list. 13 | type partx struct { 14 | ds *DFAstate // DFA state pointer 15 | path string // how we got here 16 | } 17 | 18 | // DFA.Synthesize produces a set of examples base on a DFA. 19 | // One example is produced for each distinct accepting state in the DFA. 20 | // Each example indicates which accepting state was reached and gives the 21 | // set of regular expressions matched at that point (by their indexes). 22 | func (dfa *DFA) Synthesize() []DFAexample { 23 | 24 | // initialize the task list 25 | todo := make([]partx, 0) // list of things to do 26 | todo = append(todo, partx{dfa.Dstates[0], ""}) 27 | 28 | // process states in breadth-first fashion 29 | results := make([]DFAexample, 0) 30 | for len(todo) > 0 { // while to-do list non-empty 31 | curr := todo[0] 32 | todo = todo[1:] // consume one entry 33 | ds := curr.ds 34 | if ds.Marked { // if we've already been here 35 | continue 36 | } 37 | ds.Marked = true // mark this state as visited 38 | 39 | // if this is an accepting state, note the result 40 | if ds.AccSet != nil { 41 | // we could re-follow the path to change random chars, 42 | // but would that variety be better, or worse? 
43 | results = append(results, DFAexample{ 44 | ds.Index, ds.AccSet, curr.path}) 45 | } 46 | 47 | // add all unmarked nodes reachable in one more step 48 | slist, xmap := curr.ds.InvertMap() 49 | alist := slist.Members() 50 | // for greater randomness we could produce the list here, 51 | // but again, would that be better -- or worse? 52 | for _, arc := range alist { 53 | dest := dfa.Dstates[arc] 54 | if !dest.Marked { 55 | c := xmap[arc].RandChar() 56 | s := curr.path + string(c) 57 | todo = append(todo, partx{dest, s}) 58 | } 59 | } 60 | } 61 | 62 | return results 63 | } 64 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | // util.go -- miscellaneous utility helpers 2 | // 3 | // This file collects small and often unrelated general-purpose helper 4 | // functions that are not closely associated with any other file. 5 | 6 | package rx 7 | 8 | import ( 9 | "encoding/json" 10 | "fmt" 11 | "io" 12 | "log" 13 | "reflect" 14 | "sort" 15 | "strconv" 16 | ) 17 | 18 | // GCD returns the greatest common denominator of two integers 19 | func GCD(a, b int) int { 20 | for b != 0 { 21 | a, b = b, a%b 22 | } 23 | return a 24 | } 25 | 26 | // CkErr aborts with a fatal error if e is not nil. 27 | func CkErr(e error) { // abort if e is not nil 28 | if e != nil { 29 | log.Fatal(e) 30 | } 31 | } 32 | 33 | // Protect adds backslash notation, but no quotes, 34 | // to protect unprintables in a string. 35 | func Protect(s string) string { 36 | s = strconv.Quote(s) 37 | return s[1 : len(s)-1] 38 | } 39 | 40 | // ShowLabel prints a label, if not empty, in a standard format. 
41 | func ShowLabel(f io.Writer, label string) { 42 | const decor = "--------------------------------------------------" 43 | const total = len(decor) 44 | if label != "" { 45 | n := len(label) + 2 46 | z := (total - n) / 2 47 | a := total - n - z 48 | fmt.Fprintf(f, "%s %s %s\n", decor[:a], label, decor[:z]) 49 | } 50 | } 51 | 52 | // KeyList returns the keys of a string:string map in sorted order. 53 | func KeyList(m map[string]string) []string { 54 | keys := make([]string, 0, len(m)) 55 | for k := range m { 56 | keys = append(keys, k) 57 | } 58 | sort.Strings(keys) 59 | return keys 60 | } 61 | 62 | // Jlist writes a slice of anything Marshalable to a file in JSON format, 63 | // one entry per line. No newline is written at the end. 64 | func Jlist(f io.Writer, slc interface{}) { 65 | switch reflect.TypeOf(slc).Kind() { 66 | case reflect.Slice: 67 | a := reflect.ValueOf(slc) 68 | n := a.Len() 69 | fmt.Fprintln(f, "[") 70 | for i := 0; i < n; i++ { 71 | json, err := json.Marshal(a.Index(i).Interface()) 72 | CkErr(err) 73 | if i < n-1 { 74 | json = append(json, ',') 75 | } 76 | fmt.Fprintf(f, "%s\n", string(json)) 77 | } 78 | fmt.Fprint(f, "]") 79 | default: 80 | panic("Jlist: unimplemented type") 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /test/runtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # runtest [file.rx...] - run tests and validate outputs 4 | # 5 | # For each .rx file, this script runs rxplor (or another program) and 6 | # compares the output with the corresponding .std file. 7 | # 8 | # The command "rxplor -T" is run by default. If the .rx file begins 9 | # with a #! line, the command from that line is run instead. 10 | # If the file begins with multiple #! lines, the subsequent commands 11 | # are executed and their outputs compared against .std2, .std3, etc. 
12 | 13 | 14 | # function definition 15 | runtest() { # runtest basename n command -- run one test and check output 16 | B=$1 # basename 17 | N=$2 # subtest number 18 | C=$3 # program command 19 | I=$B.rx # input file 20 | O=$B.out${N%1} # output file 21 | S=$B.std${N%1} # standard file for comparison 22 | 23 | printf "%-16s %s\n" "$B.$N:" "$C" 24 | eval "$C" <$I >$O # run the command 25 | if [ $? -eq 0 ] && cmp $S $O; then 26 | rm $O # normal exit, files match, so remove output file 27 | else 28 | diff -u $S $O | sed 18q 29 | echo ------------------------------------------------------------------ 30 | FAILED="$FAILED $B.$N" 31 | fi 32 | } 33 | 34 | 35 | # if no test files are specified, run them all 36 | if [ $# = 0 ]; then 37 | set - `ls *.rx` # add all .rx files as command args 38 | fi 39 | 40 | # loop through the chosen tests 41 | PATH=$GOPATH/bin:$PATH 42 | unset RX_COMPLEXITY 43 | echo "" 44 | FAILED= 45 | for F in $*; do # for each file on command line 46 | B=${F%.*} # basename 47 | I=$B.rx # input file 48 | N=0 # number of tests run 49 | exec <$I # redirect stdin 50 | while read LINE; do 51 | case "$LINE" in 52 | "#!"*) # this is a test spec 53 | N=$(($N+1)) # count it 54 | CMD=${LINE#??} # extract the command 55 | runtest $B $N "$CMD" # run it as a test 56 | ;; 57 | ""|*) 58 | break;; 59 | esac 60 | done 61 | if [ $N = 0 ]; then # if no explicit test named in file 62 | runtest $B 1 " rxplor -T" # then run rxplor -T 63 | fi 64 | done 65 | 66 | # summarize the results 67 | echo "" 68 | if [ "$FAILED" = "" ]; then 69 | echo "All tests passed." 
70 | echo "" 71 | exit 0 72 | else 73 | echo "Tests failed: $FAILED" 74 | echo "" 75 | exit 1 76 | fi 77 | -------------------------------------------------------------------------------- /test/30-wild.std: -------------------------------------------------------------------------------- 1 | Options: -R -T -g -h -i -l -v -I 0 2 | 3 | expr 0: {.} 4 | tree: (({[\x01-\x7f])}) 5 | augmnt: ((({[\x01-\x7f])})#) 6 | length: 3 to 3 7 | cplxty: 5 8 | -------------------- Examples -------------------- 9 | ex(0): {$} {9} {w} {x} {s} {X} {=} {D} {I} {!} {d} {i} {W} {6} 10 | ex(1): {7} {'} {V} {f} {{} {c} {~} {C} {Z} {T} {B} {n} {1} {s} 11 | ex(2): {6} {}} {p} {{} {q} {t} {,} {P} {~} {u} {2} {'} {^} {q} 12 | ex(3): {M} { } {|} {\"} {G} {)} {;} {2} {u} {g} {\\} {]} {@} {h} 13 | ex(5): {1} {x} {O} {l} {n} {O} {\\} {>} {^} {\\} {]} {<} {-} {%} 14 | ex(8): {+} {m} {'} {\\} {/} {P} {x} {]} {Z} {G} {f} {$} {.} {)} 15 | ---------------- Examples from DFA --------------- 16 | s1: {p} 17 | 18 | expr 1: {.+} 19 | tree: (({[\x01-\x7f]+)}) 20 | augmnt: ((({[\x01-\x7f]+)})#) 21 | length: 3 to * 22 | cplxty: 5 23 | -------------------- Examples -------------------- 24 | ex(0): {$} {9} {w} {x} {s} {X} {=} {D} {I} {!} {d} {i} {W} {6} 25 | ex(1): {7} {'} {V} {f} {{} {c} {~} {C} {Z} {T} {B} {n} {1} {s} 26 | ex(2): {p} {$} {{V} {`} {,} {=} {Gu} {Z} {>^} {y} {w} { } {eN} 27 | ex(3): {/;} {@Gu} {.{\\} {(\\@} {U1} {1} {qo} {l} {_HO} {h'>} {FX} 28 | ex(5): {-#c%} {>} {7m} {~e\\} {;} {_xB7} {Z} {x2f]B} {.DC} {B} {/} 29 | ex(8): {xZS^[d\"} {5>S} {6]*M+L8>} {JSW??} {;(sQI} {!IC|X.~`} {.7Y} {H} 30 | ---------------- Examples from DFA --------------- 31 | s1: {:} 32 | 33 | expr 2: {.*.*.*} 34 | tree: (((({[\x01-\x7f]*)[\x01-\x7f]*)[\x01-\x7f]*)}) 35 | augmnt: ((((({[\x01-\x7f]*)[\x01-\x7f]*)[\x01-\x7f]*)})#) 36 | length: 2 to * 37 | cplxty: 9 38 | -------------------- Examples -------------------- 39 | ex(0): {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} 40 | ex(1): {G} {7} {} {`} {}[} {Z} {Bn} 
{T6} {} {p} {Vq} {} {}V} {;} 41 | ex(2): {Jw{} {|e\"(G} {//P} {uK.{} {]\\`+} {=ZLt} {k} {n} {\\h>t^F} 42 | ex(3): {cUz>} {7ma~e\\} {;tP_x7} {ZGx2} {} {C)\\B7} {U?x-(xZ} {\"7 5>} 43 | ex(5): {>g{SW??} {;(QI%3!} {.~`Th} {1(jH<;^5$&} {U^oWC} {s>28, 0) 51 | data.Vtime = t.Format("01/02 15:04") 52 | } 53 | 54 | putheader(w, r, "Info") 55 | 56 | fmt.Fprint(w, 57 | "

(Information for use of the website maintainers.)\n

") 58 | fmt.Fprintf(w, "= cg = \n") // .cg 59 | fmt.Fprintf(w, "= cw = \n") // .cw 60 | for i := 0; i < NCOLORS; i++ { 61 | fmt.Fprintf(w, "= %s = \n", 62 | colorname(i), colorname(i)) 63 | } 64 | fmt.Fprintln(w) 65 | tInfo.Execute(w, data) 66 | putfooter(w, r) 67 | } 68 | 69 | var tInfo = template.Must(template.New("info").Parse( 70 | `

Host: {{.Req.Host}} 71 |
Datacenter: {{.Dctr}} ({{.GoArch}} {{.GoOs}}) 72 |
Go Version: {{.GoVer}} 73 |
App Version ID: {{.VID}} ({{.Vtime}}) 74 |

MaxComplexity: {{.MaxCx}} 75 |

Request Header

76 |

{{range $k, $v := .Req.Header}}{{$k}} : {{$v}}
77 | {{end}} 78 |

Request Body

79 | {{printf "%s" .Body}} 80 | {{if .BE}}

BODY ERROR: {{.BE}}{{end}} 81 | `)) 82 | -------------------------------------------------------------------------------- /catnode.go: -------------------------------------------------------------------------------- 1 | // catnode.go -- parse tree node for concatenation of two subtrees 2 | 3 | package rx 4 | 5 | import ( 6 | "fmt" 7 | ) 8 | 9 | // ConcatNode is a parse tree node for concatenating two subpatterns. 10 | // Unlike an AltNode it is *not* generalized to other than two children. 11 | type ConcatNode struct { 12 | L Node 13 | R Node 14 | NodeData 15 | } 16 | 17 | // ConcatNode.Children returns a list of the two child nodes. 18 | func (d *ConcatNode) Children() []Node { 19 | return []Node{d.L, d.R} 20 | } 21 | 22 | // ConcatNode.MinLen sums the min lengths of its subpatterns. 23 | func (d *ConcatNode) MinLen() int { 24 | return d.L.MinLen() + d.R.MinLen() 25 | } 26 | 27 | // ConcatNode.MaxLen sums the max lengths of its subpatterns. 28 | // A value of -1 means that the length is unbounded. 29 | func (d *ConcatNode) MaxLen() int { 30 | llen := d.L.MaxLen() 31 | rlen := d.R.MaxLen() 32 | if llen < 0 || rlen < 0 { // if unbounded 33 | return -1 34 | } else { 35 | return llen + rlen 36 | } 37 | } 38 | 39 | // ConcatNode.SetNFL sets the Nullable, FirstPos, LastPos fields. 40 | func (d *ConcatNode) SetNFL() { 41 | d.Nullable = d.L.nullable() && d.R.nullable() 42 | if d.L.nullable() { 43 | d.FirstPos = d.L.firstPos().Or(d.R.firstPos()) 44 | } else { 45 | d.FirstPos = d.L.firstPos() 46 | } 47 | if d.R.nullable() { 48 | d.LastPos = d.R.lastPos().Or(d.L.lastPos()) 49 | } else { 50 | d.LastPos = d.R.lastPos() 51 | } 52 | } 53 | 54 | // ConcatNode.SetFollow registers FollowPos nodes due to concatenation. 
55 | func (d *ConcatNode) SetFollow(pmap []*MatchNode) { 56 | for _, i := range d.L.lastPos().Members() { 57 | for _, f := range d.R.firstPos().Members() { 58 | pmap[i].followPos().Set(f) 59 | } 60 | } 61 | } 62 | 63 | // ConcatNode.Example appends one example from each subpattern. 64 | func (d *ConcatNode) Example(s []byte, n int) []byte { 65 | s = d.L.Example(s, n) 66 | s = d.R.Example(s, n) 67 | return s 68 | } 69 | 70 | // ConcatNode.String appends a parenthesized concatenation of subpatterns. 71 | func (d *ConcatNode) String() string { 72 | return fmt.Sprintf("(%s%s)", d.L, d.R) 73 | } 74 | 75 | // Concatenate makes a ConcatNode, optimizing if either arg is an Epsilon. 76 | func Concatenate(d Node, e Node) Node { 77 | if d == nil || IsEpsilon(d) { 78 | return e 79 | } 80 | if e == nil || IsEpsilon(e) { 81 | return d 82 | } 83 | return &ConcatNode{d, e, nildata} 84 | } 85 | -------------------------------------------------------------------------------- /rx.go: -------------------------------------------------------------------------------- 1 | // rx.go -- some top-level entry points for the rx library package 2 | 3 | // Rx provides facilities for dealing with regular expressions. 4 | package rx 5 | 6 | // Match tests whether a string is matched by a regular expression. 7 | func Match(rexpr string, s string) (bool, error) { 8 | dfa, err := Compile(rexpr) 9 | if err != nil { 10 | return false, err 11 | } 12 | return (dfa.Accepts(s) != nil), nil 13 | } 14 | 15 | // Compile makes a minimized DFA from a regular expression. 16 | // The DFA can be reused for multiple matches, saving compilation costs. 
17 | func Compile(rexpr string) (*DFA, error) { 18 | ptree, err := Parse(rexpr) // make faithful parse tree 19 | if err != nil { 20 | return nil, err 21 | } 22 | atree := Augment(ptree, 0) // make augmented parse tree 23 | dfa := BuildDFA(atree) // build DFA from augmented tree 24 | dfa = dfa.Minimize() // minimize the number of states 25 | return dfa, nil // return it 26 | 27 | } 28 | 29 | // DFA.Accepts returns the set of regexps that accept a string, or nil. 30 | // This function treats the input string as Unicode runes. 31 | func (dfa *DFA) Accepts(s string) *BitSet { 32 | state := dfa.Dstates[0] 33 | for _, r := range s { 34 | state = state.Dnext[int(r)] 35 | if state == nil { 36 | return nil // unmatched char 37 | } 38 | } 39 | return state.AccSet // end of string; return accepting set if any 40 | } 41 | 42 | // Augment produces a modified parse tree in preparation for building a DFA. 43 | // The original tree is modified in place, and a new root is returned. 44 | // This new root concatenates an Accept node to the input tree. 45 | // Additionally, any fixed {m,n} replications with m>1 or n>1 are replaced 46 | // by concatenations of duplicated subtrees. 
47 | func Augment(tree Node, rxindex int) Node { 48 | Walk(tree, nil, func(d Node) { 49 | switch d.(type) { 50 | case *ConcatNode: 51 | cnode := d.(*ConcatNode) 52 | cnode.L = replfix(cnode.L) 53 | cnode.R = replfix(cnode.R) 54 | case *AltNode: 55 | anode := d.(*AltNode) 56 | alts := make([]Node, 0, len(anode.Alts)) 57 | for _, e := range anode.Alts { 58 | alts = append(alts, replfix(e)) 59 | } 60 | anode.Alts = alts 61 | case *ReplNode: 62 | rnode := d.(*ReplNode) 63 | rnode.Child = replfix(rnode.Child) 64 | case *MatchNode: 65 | 66 | } 67 | }) 68 | tree = replfix(tree) // need to handle top node, too 69 | return Concatenate(tree, Accept(rxindex)) 70 | } 71 | -------------------------------------------------------------------------------- /matnode.go: -------------------------------------------------------------------------------- 1 | // matnode.go -- parse tree node for matching any one character of a set 2 | 3 | package rx 4 | 5 | // MatchNode is a leaf node that matches exactly one char from a given set. 6 | // It generalizes the textbook leaf node that matches a particular char. 7 | // A special MatchNode with an empty set represents an acceptance marker. 8 | type MatchNode struct { 9 | Cset *BitSet // the characters that will match 10 | Posn int // integer "position" designator of leaf 11 | RxIndex int // which RE does this Accept node belong to? 12 | NodeData 13 | } 14 | 15 | // MatchAny creates a MatchNode for a given set of characters. 16 | func MatchAny(cs *BitSet) Node { 17 | return &MatchNode{cs, 0, 0, nildata} 18 | } 19 | 20 | // Accept returns a special MatchNode with an empty cset. 21 | func Accept(rxindex int) Node { 22 | return &MatchNode{&BitSet{}, 0, rxindex, nildata} 23 | } 24 | 25 | // IsAccept returns true for an Accept node. 26 | func IsAccept(d Node) bool { 27 | mnode, ok := d.(*MatchNode) 28 | return ok && mnode.Cset.IsEmpty() 29 | } 30 | 31 | // MatchNode.Children returns an empty list. 
32 | func (d *MatchNode) Children() []Node { 33 | return barren 34 | } 35 | 36 | var barren = make([]Node, 0, 0) // empty list of children 37 | 38 | // MatchNode.MinLen always returns 1 (except 0 for an AcceptNode). 39 | func (d *MatchNode) MinLen() int { 40 | if IsAccept(d) { 41 | return 0 42 | } else { 43 | return 1 44 | } 45 | } 46 | 47 | // MatchNode.MaxLen always returns 1 (except 0 for an AcceptNode). 48 | func (d *MatchNode) MaxLen() int { 49 | if IsAccept(d) { 50 | return 0 51 | } else { 52 | return 1 53 | } 54 | } 55 | 56 | // MatchNode.SetNFL sets the Nullable, FirstPos, LastPos fields. 57 | func (d *MatchNode) SetNFL() { 58 | d.Nullable = false 59 | d.FirstPos = &BitSet{} 60 | d.LastPos = &BitSet{} 61 | d.FirstPos.Set(d.Posn) 62 | d.LastPos.Set(d.Posn) 63 | } 64 | 65 | // MatchNode.SetFollow has nothing to do. 66 | func (d *MatchNode) SetFollow(pmap []*MatchNode) { 67 | } 68 | 69 | // MatchNode.Example appends a single randomly chosen matching character. 70 | // (Note that this may be multiple UTF-8 bytes.) 71 | func (d *MatchNode) Example(s []byte, n int) []byte { 72 | if IsAccept(d) { 73 | return s // don't alter if Accept node 74 | } else { 75 | // assumes cset is not empty 76 | return append(s, string(d.Cset.RandChar())...) 77 | } 78 | } 79 | 80 | // MatchNode.String returns a singleton character or a bracketed expression. 81 | func (d *MatchNode) String() string { 82 | if d.Cset.IsEmpty() { 83 | return "#" // special "accept" node 84 | } else { 85 | return d.Cset.Unbracketed() 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /rxx/rxx.go: -------------------------------------------------------------------------------- 1 | /* 2 | rxx.go -- regular expression cross product with candidate list 3 | 4 | usage: rxx efile sfile 5 | 6 | Rxx reads a set of up to 61 regular expressions, one per line, 7 | from efile. 
It then tests every line from sfile against each 8 | regular expression, printing a grid of results on standard output. 9 | 10 | A line beginning with '#', or an empty line, is treated as a comment. 11 | 12 | Spring-2014 / gmt 13 | */ 14 | package main 15 | 16 | import ( 17 | "fmt" 18 | "log" 19 | "os" 20 | "rx" 21 | ) 22 | 23 | type tester struct { // one regular expression for testing 24 | label string // one-character label 25 | spec string // regular expression specification 26 | tree rx.Node // unaugmented parse tree 27 | index int // result index 28 | } 29 | 30 | func main() { 31 | args := os.Args 32 | if len(args) != 3 { 33 | log.Fatal("usage: rxx efile sfile") 34 | } 35 | ename := args[1] 36 | sfile := rx.MkScanner(args[2]) 37 | 38 | labels := 39 | "123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 40 | elist := make([]tester, 0, len(labels)) 41 | 42 | // load and compile regexps 43 | fmt.Println() 44 | tlist := make([]rx.Node, 0) // list of valid parse trees 45 | rx.LoadExpressions(ename, func(x *rx.RegExParsed) { 46 | spec := x.Expr 47 | ptree := x.Tree 48 | err := x.Err 49 | if err != nil { 50 | fmt.Printf("ERR %s\n", spec) 51 | elist = append(elist, tester{" ", spec, nil, 0}) 52 | } else if ptree == nil { 53 | fmt.Printf(" %s\n", spec) 54 | elist = append(elist, tester{" ", spec, nil, 0}) 55 | } else { 56 | i := len(tlist) 57 | if i >= len(labels) { 58 | log.Fatal("too many regular expressions") 59 | } 60 | label := string(labels[i : i+1]) 61 | fmt.Printf("%s: %s\n", label, spec) 62 | atree := rx.Augment(ptree, len(tlist)) 63 | tlist = append(tlist, atree) 64 | elist = append(elist, tester{label, spec, ptree, i}) 65 | } 66 | }) 67 | 68 | dfa := rx.MultiDFA(tlist) 69 | _ = dfa.Minimize() // should have no effect 70 | _ = dfa.Minimize() // should again have no effect 71 | dfa = dfa.Minimize() // not necessary, but a good stress test 72 | dfa = dfa.Minimize() // especially if done more than once 73 | 74 | // read and test candidate strings 75 
| fmt.Println() 76 | for sfile.Scan() { 77 | s := string(sfile.Bytes()) 78 | results := dfa.Accepts(s) 79 | if results == nil { 80 | results = &rx.BitSet{} 81 | } 82 | for _, e := range elist { 83 | if e.tree == nil { 84 | fmt.Print(" ") 85 | } else { 86 | if results.Test(e.index) { 87 | fmt.Print(e.label) 88 | } else { 89 | fmt.Print("-") 90 | } 91 | } 92 | } 93 | fmt.Printf(" %s\n", s) 94 | } 95 | rx.CkErr(sfile.Err()) 96 | } 97 | -------------------------------------------------------------------------------- /test/90-bad.std: -------------------------------------------------------------------------------- 1 | Options: -R -T -g -h -i -l -v -I 0 2 | 3 | # irregular (i.e. illegal) expressions 4 | 5 | ERROR: ) 6 | rx: unmatched ')': in ")" 7 | 8 | ERROR: ( 9 | rx: unclosed '(': in "(" 10 | 11 | ERROR: abc) 12 | rx: unmatched ')': in "abc)" 13 | 14 | ERROR: (def 15 | rx: unclosed '(': in "(def" 16 | 17 | ERROR: g|h) 18 | rx: unmatched ')': in "g|h)" 19 | 20 | ERROR: i|j|k) 21 | rx: unmatched ')': in "i|j|k)" 22 | 23 | ERROR: (m|n 24 | rx: unclosed '(': in "(m|n" 25 | 26 | ERROR: (o|p|q 27 | rx: unclosed '(': in "(o|p|q" 28 | 29 | ERROR: \ 30 | rx: '\' at end: in "\" 31 | 32 | ERROR: abc\ 33 | rx: '\' at end: in "abc\" 34 | 35 | ERROR: [ 36 | rx: unclosed '[': in "[" 37 | 38 | ERROR: [^ 39 | rx: unclosed '[': in "[^" 40 | 41 | ERROR: [] 42 | rx: unclosed '[': in "[]" 43 | 44 | ERROR: [^] 45 | rx: unclosed '[': in "[^]" 46 | 47 | ERROR: [\] 48 | rx: unclosed '[': in "[\]" 49 | 50 | ERROR: [abc 51 | rx: unclosed '[': in "[abc" 52 | 53 | ERROR: [def\] 54 | rx: unclosed '[': in "[def\]" 55 | 56 | ERROR: [t-f] 57 | rx: invalid range: in "[t-f]" 58 | 59 | ERROR: a{z 60 | rx: malformed "{m,n}": in "a{z" 61 | 62 | ERROR: a{1z 63 | rx: malformed "{m,n}": in "a{1z" 64 | 65 | ERROR: a{1,z 66 | rx: malformed "{m,n}": in "a{1,z" 67 | 68 | ERROR: a{1,2z 69 | rx: malformed "{m,n}": in "a{1,2z" 70 | 71 | ERROR: a{1,2bz 72 | rx: malformed "{m,n}": in "a{1,2bz" 73 | 74 | ERROR: a{}z 
75 | rx: malformed "{m,n}": in "a{}z" 76 | 77 | ERROR: a{,}z 78 | rx: malformed "{m,n}": in "a{,}z" 79 | 80 | ERROR: a{1,,2}z 81 | rx: malformed "{m,n}": in "a{1,,2}z" 82 | 83 | ERROR: a{,2}z 84 | rx: malformed "{m,n}": in "a{,2}z" 85 | 86 | ERROR: a{2,1}z 87 | rx: malformed "{m,n}": in "a{2,1}z" 88 | 89 | ERROR: a{p,3}z 90 | rx: malformed "{m,n}": in "a{p,3}z" 91 | 92 | ERROR: a{4,q}z 93 | rx: malformed "{m,n}": in "a{4,q}z" 94 | 95 | ERROR: a{x,y}z 96 | rx: malformed "{m,n}": in "a{x,y}z" 97 | 98 | ERROR: a{x}z 99 | rx: malformed "{m,n}": in "a{x}z" 100 | 101 | ERROR: a{xyzzy}z 102 | rx: malformed "{m,n}": in "a{xyzzy}z" 103 | 104 | ERROR: a++ 105 | rx: multiple adjacent duplication symbols: in "a++" 106 | 107 | ERROR: b** 108 | rx: multiple adjacent duplication symbols: in "b**" 109 | 110 | ERROR: c?? 111 | rx: prefer-fewer '?' unimplemented: in "c??" 112 | 113 | ERROR: d{2}{3} 114 | rx: multiple adjacent duplication symbols: in "d{2}{3}" 115 | 116 | ERROR: e{5}? 117 | rx: prefer-fewer '?' unimplemented: in "e{5}?" 118 | 0 expression(s) loaded 119 | (38 expression(s) rejected) 120 | -------------------------------------------------------------------------------- /test/d5-abc1.std2: -------------------------------------------------------------------------------- 1 | // NFA: a(ba)*b?|b(ab)*a?|a?(ba)*b?c((a(ba)*b?|b(ab)*a?)c)*(a(ba)*b?|b(ab)*a?)? 2 | digraph NFA { 3 | label="NFA: a(ba)*b?|b(ab)*a?|a?(ba)*b?c((a(ba)*b?|b(ab)*a?)c)*(a(ba)*b?|b(ab)*a?)?" 
4 | node [shape=circle, height=.3, margin=0, fontsize=10] 5 | i->p0[label=" a"] 6 | i->p4[label=" b"] 7 | i->p8[label=" a"] 8 | i->p9[label=" b"] 9 | i->p11[label=" b"] 10 | i->p12[label=" c"] 11 | i [shape=triangle, regular=true, label=""] 12 | p0 [label="p0"] 13 | p0->p1[label=" b"] 14 | p0->p3[label=" b"] 15 | p0 [shape=doublecircle] 16 | p1 [label="p1"] 17 | p1->p2[label=" a"] 18 | p2 [label="p2"] 19 | p2->p1[label=" b"] 20 | p2->p3[label=" b"] 21 | p2 [shape=doublecircle] 22 | p3 [label="p3"] 23 | p3 [shape=doublecircle] 24 | p4 [label="p4"] 25 | p4->p5[label=" a"] 26 | p4->p7[label=" a"] 27 | p4 [shape=doublecircle] 28 | p5 [label="p5"] 29 | p5->p6[label=" b"] 30 | p6 [label="p6"] 31 | p6->p5[label=" a"] 32 | p6->p7[label=" a"] 33 | p6 [shape=doublecircle] 34 | p7 [label="p7"] 35 | p7 [shape=doublecircle] 36 | p8 [label="p8"] 37 | p8->p9[label=" b"] 38 | p8->p11[label=" b"] 39 | p8->p12[label=" c"] 40 | p9 [label="p9"] 41 | p9->p10[label=" a"] 42 | p10 [label="p10"] 43 | p10->p9[label=" b"] 44 | p10->p11[label=" b"] 45 | p10->p12[label=" c"] 46 | p11 [label="p11"] 47 | p11->p12[label=" c"] 48 | p12 [label="p12"] 49 | p12->p13[label=" a"] 50 | p12->p17[label=" b"] 51 | p12->p22[label=" a"] 52 | p12->p26[label=" b"] 53 | p12 [shape=doublecircle] 54 | p13 [label="p13"] 55 | p13->p14[label=" b"] 56 | p13->p16[label=" b"] 57 | p13->p21[label=" c"] 58 | p14 [label="p14"] 59 | p14->p15[label=" a"] 60 | p15 [label="p15"] 61 | p15->p14[label=" b"] 62 | p15->p16[label=" b"] 63 | p15->p21[label=" c"] 64 | p16 [label="p16"] 65 | p16->p21[label=" c"] 66 | p17 [label="p17"] 67 | p17->p18[label=" a"] 68 | p17->p20[label=" a"] 69 | p17->p21[label=" c"] 70 | p18 [label="p18"] 71 | p18->p19[label=" b"] 72 | p19 [label="p19"] 73 | p19->p18[label=" a"] 74 | p19->p20[label=" a"] 75 | p19->p21[label=" c"] 76 | p20 [label="p20"] 77 | p20->p21[label=" c"] 78 | p21 [label="p21"] 79 | p21->p13[label=" a"] 80 | p21->p17[label=" b"] 81 | p21->p22[label=" a"] 82 | p21->p26[label=" b"] 83 
| p21 [shape=doublecircle] 84 | p22 [label="p22"] 85 | p22->p23[label=" b"] 86 | p22->p25[label=" b"] 87 | p22 [shape=doublecircle] 88 | p23 [label="p23"] 89 | p23->p24[label=" a"] 90 | p24 [label="p24"] 91 | p24->p23[label=" b"] 92 | p24->p25[label=" b"] 93 | p24 [shape=doublecircle] 94 | p25 [label="p25"] 95 | p25 [shape=doublecircle] 96 | p26 [label="p26"] 97 | p26->p27[label=" a"] 98 | p26->p29[label=" a"] 99 | p26 [shape=doublecircle] 100 | p27 [label="p27"] 101 | p27->p28[label=" b"] 102 | p28 [label="p28"] 103 | p28->p27[label=" a"] 104 | p28->p29[label=" a"] 105 | p28 [shape=doublecircle] 106 | p29 [label="p29"] 107 | p29 [shape=doublecircle] 108 | } 109 | -------------------------------------------------------------------------------- /webapp/code/draw.go: -------------------------------------------------------------------------------- 1 | // draw.go -- code for drawing a DFA or NFA 2 | 3 | package webapp 4 | 5 | import ( 6 | "fmt" 7 | "html/template" 8 | "net/http" 9 | "rx" 10 | ) 11 | 12 | // drawDFA draws a DFA. 13 | func drawDFA(w http.ResponseWriter, r *http.Request) { 14 | draw(w, r, "DFA") 15 | } 16 | 17 | // drawNFA draws a DFA. 18 | func drawNFA(w http.ResponseWriter, r *http.Request) { 19 | draw(w, r, "NFA") 20 | } 21 | 22 | // draw produces a Dot file for rendering a DFA or NFA in the user's browser. 23 | func draw(w http.ResponseWriter, r *http.Request, which string) { 24 | 25 | exprlist := getexprs(r) // must load data before writing anything 26 | nx := len(exprlist) 27 | 28 | putheader(w, r, which+" Graph") // write page header 29 | fmt.Fprintln(w, "

") 30 | 31 | treelist := make([]rx.Node, 0) 32 | for i, e := range exprlist { 33 | if nx > 1 { 34 | fmt.Fprintf(w, "%c.   ", rx.AcceptLabels[i]) 35 | } 36 | fmt.Fprintf(w, "%s
\n", hx(e)) 37 | tree, err := rx.Parse(e) 38 | if !showerror(w, err) { 39 | treelist = append(treelist, rx.Augment(tree, i)) 40 | } 41 | } 42 | 43 | if nx > 0 && len(treelist) == nx { // if no errors 44 | dfa := rx.MultiDFA(treelist) // build combined DFA 45 | dmin := dfa.Minimize() // minimize it 46 | 47 | fmt.Fprintln(w, ``) 57 | 58 | tDraw.Execute(w, which) 59 | } 60 | putfooter(w, r) 61 | } 62 | 63 | var tDraw = template.Must(template.New("draw").Parse(` 64 | 65 | 79 |

{{if eq . "NFA"}} The unlabeled node is the start state. 80 | {{else}} State s0 is the start state.{{end}} 81 | {{if eq . "Multi"}} Double octagons are acceptance states, with capital 82 | letters indicating which expressions are accepted. 83 | {{else}} Double circles are acceptance states.{{end}} 84 | Edge labels indicate input characters or classes of characters. 85 |

86 | 88 | 90 | `)) 91 | -------------------------------------------------------------------------------- /rxg/rxg.go: -------------------------------------------------------------------------------- 1 | /* 2 | rxg.go -- regular expression multiple example generator 3 | 4 | usage: rxg [-R] [exprfile] 5 | 6 | Rxg reads a set of regular expressions and synthesizes one example 7 | corresponding to each accepting state in the combined DFA. 8 | 9 | -R produce reproducible output by using a fixed random seed 10 | 11 | Input is one unadorned regular expression per line. 12 | A line beginning with '#', or an empty line, is treated as a comment. 13 | 14 | Output is a struct of two arrays in JSON format. The first array 15 | lists the regular expressions with input numbers. The second lists 16 | examples with state numbers and sets of matching regular expressions. 17 | 18 | Example: 19 | For the input 20 | \d+ 21 | \d*[1-9] 22 | [1-9]\d* 23 | the output is: 24 | {"Expressions":[ 25 | {"Index":0,"Rexpr":"\\d+"}, 26 | {"Index":1,"Rexpr":"\\d*[1-9]"}, 27 | {"Index":2,"Rexpr":"[1-9]\\d*"} 28 | ], 29 | "Examples":[ 30 | {"State":1,"RXset":[0],"Example":"0"}, 31 | {"State":2,"RXset":[0,1,2],"Example":"7"}, 32 | {"State":3,"RXset":[0,1],"Example":"02"}, 33 | {"State":4,"RXset":[0,2],"Example":"70"} 34 | ]} 35 | 36 | Spring-2014 / gmt 37 | */ 38 | package main 39 | 40 | import ( 41 | "flag" 42 | "fmt" 43 | "math/rand" 44 | "os" 45 | "rx" 46 | "time" 47 | ) 48 | 49 | type RegEx struct { // one regexp for JSON output 50 | Index int // index number 51 | Rexpr string // regular expression 52 | } 53 | 54 | type Example struct { // one example for JSON output 55 | State int // index of accepting state in DFA 56 | RXset []int // set of matching regular expression indexes 57 | Example string // example string 58 | } 59 | 60 | func main() { 61 | 62 | rflag := flag.Bool("R", false, "reproducible output") 63 | flag.Parse() 64 | if *rflag { 65 | rand.Seed(0) 66 | } else { 67 | 
rand.Seed(int64(time.Now().Nanosecond())) 68 | } 69 | 70 | // load and process regexps 71 | exprs := make([]*RegEx, 0) 72 | tlist := make([]rx.Node, 0) 73 | rx.LoadExpressions(rx.OneInputFile(), func(l *rx.RegExParsed) { 74 | if l.Err != nil { 75 | fmt.Fprintln(os.Stderr, l.Err) 76 | } 77 | if l.Tree != nil { 78 | atree := rx.Augment(l.Tree, len(tlist)) 79 | tlist = append(tlist, atree) 80 | exprs = append(exprs, &RegEx{len(exprs), l.Expr}) 81 | } 82 | }) 83 | 84 | // echo the input with index numbers 85 | fmt.Print(`{"Expressions":`) 86 | rx.Jlist(os.Stdout, exprs) 87 | fmt.Println(",") 88 | 89 | // build the DFA and produce examples 90 | synthx := rx.MultiDFA(tlist).Synthesize() 91 | 92 | // convert into expected form with int array replacing BitSet 93 | results := make([]*Example, 0, len(synthx)) 94 | for _, x := range synthx { 95 | results = append(results, 96 | &Example{x.State, x.RXset.Members(), x.Example}) 97 | } 98 | 99 | // output the array of synthesized examples 100 | fmt.Print(`"Examples":`) 101 | rx.Jlist(os.Stdout, results) 102 | fmt.Println("}") 103 | } 104 | -------------------------------------------------------------------------------- /test/48-fireball.rx: -------------------------------------------------------------------------------- 1 | #! rxplor 2 | 3 | # Daring Fireball by John Gruber 4 | # 5 | # November 2009: daringfireball.net/2009/11/liberal_regex_for_matching_urls 6 | \b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))) 7 | # 8 | # July 2010: daringfireball.net/2010/07/improved_regex_for_matching_urls 9 | (?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])) 10 | # July 2010 -- Web URLs only (e.g. 
not mailto:) 11 | (?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])) 12 | # 13 | # His latest "all URLs" gist.github.com/gruber/249502 sampled 20 Feb 2014 14 | (?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])) 15 | # His latest "web URLs" gist.github.com/gruber/8891611 sampled 20 Feb 2014 16 | (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(? This website provides some simple tools for experimenting with 24 | “classic” 25 | regular 26 | expressions as described in computer science 27 | textbooks and implemented in the early versions of 28 | grep. 29 | These tools are an outgrowth of research exploring automata that 30 | implement multiple formal languages simultaneously. 
31 | `) 32 | refexamine(w) 33 | refcompare(w) 34 | fmt.Fprintln(w, ` 35 |

The Syntax page briefly outlines the 36 | forms of regular expressions implemented here.`) 37 | putfooter(w, r) 38 | } 39 | 40 | // refexamine advertises the Examine page 41 | func refexamine(w http.ResponseWriter) { 42 | fmt.Fprint(w, ` 43 |

On the Examine page you can enter 44 | a regular expression to generate several kinds of data: 45 | parse trees, synthetic examples, and the automata state lists. 46 | Links from there produce diagrams of either the 47 | NFA 48 | or 49 | DFA 50 | for the language. 51 | `) 52 | tree, _ := rx.Parse(HomeExample) 53 | augt := rx.Augment(tree, 0) 54 | fmt.Fprintf(w, ` 55 |

Regular Expression: %s 56 |
Augmented Parse Tree: %s 57 |
Examples:
`, hx(HomeExample), hx(augt)) 58 | genexamples(w, augt, 0) 59 | genexamples(w, augt, 2) 60 | genexamples(w, augt, 4) 61 | fmt.Fprintln(w, `
`) 62 | genexam(w, "submit this example to see full output", HomeExample) 63 | fmt.Fprintln(w, `
`) 64 | } 65 | 66 | // refcompare advertises the Compare page 67 | func refcompare(w http.ResponseWriter) { 68 | fmt.Fprintf(w, ` 69 |

The Compare page accepts multiple expressions 70 | and shows how their languages overlap or differ. 71 | The results page shows synthesized examples and indicates which expressions 72 | they match. 73 | You can also submit your own examples for testing. 74 | Again, there are links to produce automata diagrams. 75 |

76 | %d expressions: 77 | `, len(HomeCompare)) 78 | treelist := lpxc(w, HomeCompare) 79 | dfa := rx.MultiDFA(treelist) 80 | synthx := dfa.Synthesize() 81 | trylist := make([]string, 0) 82 | for _, x := range synthx { 83 | trylist = append(trylist, x.Example) 84 | } 85 | fmt.Fprintln(w, `

`) 86 | showgrid(w, dfa, len(treelist), trylist) 87 | fmt.Fprintln(w, `
`) 88 | gencomp(w, "submit this example to see full output", HomeCompare) 89 | fmt.Fprintln(w, `
`) 90 | } 91 | -------------------------------------------------------------------------------- /rxtime/rxtime.go: -------------------------------------------------------------------------------- 1 | /* 2 | rxtime.go - combine regular expressions n ways 3 | 4 | usage: rxtime exprfile [n] 5 | 6 | Rxtime reads regular expressions from exprfile and prints statistics, 7 | including timings, for all possible combinations of n expressions, 8 | just 1 by default. 9 | 10 | Each output line shows, in this order: 11 | Number of states in the initial combined DFA 12 | Number of states in the minimized DFA 13 | Time in seconds to produce the combined DFA 14 | Time in seconds to minimize the DFA 15 | If n == 1, the computed "complexity score" 16 | Ordinals of the expressions combined in this DFA 17 | 18 | Erroneous expressions are silently ignored. 19 | 20 | spring 2014 / gmt 21 | */ 22 | package main 23 | 24 | import ( 25 | "fmt" 26 | "log" 27 | "os" 28 | "rx" 29 | "rx/rxsys" 30 | "strconv" 31 | ) 32 | 33 | func main() { 34 | // get command line options 35 | nways := 1 36 | if len(os.Args) == 3 { 37 | nways, _ = strconv.Atoi(os.Args[2]) 38 | } 39 | if len(os.Args) < 2 || len(os.Args) > 3 || nways < 1 { 40 | log.Fatal("usage: rxtime exprfile [n]") 41 | } 42 | filename := os.Args[1] 43 | 44 | // load expressions from file 45 | exprs := rx.LoadExpressions(filename, nil) 46 | nexprs := len(exprs) 47 | if nways < 1 || nways > nexprs { 48 | log.Fatal(fmt.Sprintf( 49 | "cannot combine %d expressions(s) in %d way(s)", 50 | nexprs, nways)) 51 | } 52 | 53 | // record individual complexity scores and make augmented parse trees 54 | cx := make([]int, nexprs) 55 | for i, t := range exprs { 56 | cx[i] = rx.ComplexityScore(t.Tree) 57 | t.Tree = rx.Augment(t.Tree, i) 58 | } 59 | 60 | // initialize index list for first combination {0,1,2...} 61 | xlist := make([]int, nways) 62 | for i := range xlist { 63 | xlist[i] = i 64 | } 65 | 66 | // try all possible n-way combinations by varying the 
// advance steps an index list to the next combination in sequence.
//
// xlist holds len(xlist) strictly increasing indexes chosen from the range
// 0..nitems-1; the first combination is {0,1,2,...}.  The list is updated
// in place -- the returned slice is the same slice that was passed in --
// and nil is returned, leaving xlist untouched, once no later combination
// exists.
func advance(xlist []int, nitems int) []int {
	k := len(xlist)
	// Scan right to left for a slot that is still below its ceiling.
	// Slot p can hold at most nitems-(k-p), which leaves room for the
	// k-p-1 larger values that must follow it.
	for p := k - 1; p >= 0; p-- {
		if xlist[p] >= nitems-(k-p) {
			continue // already at its ceiling; try the slot to the left
		}
		// Bump this slot, then restart every later slot just above it.
		xlist[p]++
		for q := p + 1; q < k; q++ {
			xlist[q] = xlist[q-1] + 1
		}
		return xlist
	}
	return nil // every slot is at its ceiling: no more combinations
}
Decimal IP numbers 8 | 9 | expr 0: (([01]?\d?\d|2[0-4]\d|25[0-5])\.){3}([01]?\d?\d|2[0-4]\d|25[0-5]) 10 | tree: ((((([01]?[0-9]?)[0-9])|((2[0-4])[0-9])|((25)[0-5])).){3}((([01]?[0-9]?)[0-9])|((2[0-4])[0-9])|((25)[0-5]))) 11 | augmnt: (((((((([01]?[0-9]?)[0-9])|((2[0-4])[0-9])|((25)[0-5])).)(((([01]?[0-9]?)[0-9])|((2[0-4])[0-9])|((25)[0-5])).))(((([01]?[0-9]?)[0-9])|((2[0-4])[0-9])|((25)[0-5])).))((([01]?[0-9]?)[0-9])|((2[0-4])[0-9])|((25)[0-5])))#) 12 | length: 7 to 15 13 | cplxty: 71 14 | -------------------- Examples -------------------- 15 | ex(0): 4.217.8.250 1.208.1.212 8.230.250.250 2.254.7.255 8.255.6.8 16 | ex(1): 254.01.250.2 239.03.7.069 250.204.217.14 228.254.1.05 17 | ex(2): 238.252.252.254 255.233.250.252 255.0.38.217 255.254.46.204 18 | ex(3): 255.242.251.216 201.217.210.210 147.2.209.254 250.219.250.251 19 | ex(5): 245.227.11.07 252.255.6.227 247.09.253.209 2.253.252.254 20 | ex(8): 2.233.33.13 190.251.252.236 61.66.216.254 253.217.250.97 21 | ---------------- Examples from DFA --------------- 22 | s1: 1.0.0.0 23 | s9: 1.0.0.2 24 | s17: 1.0.0.6 25 | s8: 1.0.0.25 26 | s16: 1.0.0.27 27 | 28 | # 314. US Phone numbers with area code 29 | 30 | ERROR: ([\(]{1}[0-9]{3}[\)]{1}[ |\-]{0,1}|^[0-9]{3}[\-| ])?[0-9]{3}(\-| ){1}[0-9]{4} 31 | rx: Embedded '^' unimplemented: in "([\(]{1}[0-9]{3}[\)]{1}[ |\-]{0,1}|^[0-9]{3}[\-| ])?[0-9]{3}(\-| ){1}[0-9]{4}" 32 | 33 | # 3. 
US Zip+4 code 34 | 35 | expr 1: \d{5}-\d{4} 36 | tree: (([0-9]{5}-)[0-9]{4}) 37 | augmnt: ((((((([0-9][0-9])[0-9])[0-9])[0-9])-)((([0-9][0-9])[0-9])[0-9]))#) 38 | length: 10 to 10 39 | cplxty: 12 40 | -------------------- Examples -------------------- 41 | ex(0): 44365-7788 87982-1004 58601-2012 78548-0094 74689-8027 08429-7464 42 | ex(1): 36478-9433 48212-2620 62088-9001 19910-9642 26893-9403 78887-5707 43 | ex(2): 49791-0347 14007-0329 08464-0376 85246-8793 27998-4209 10688-2380 44 | ex(3): 41853-8832 09305-6682 70799-8007 43383-3675 36348-5644 07465-9043 45 | ex(5): 83817-3780 46560-9260 06990-1621 51198-7389 10951-0219 94772-8228 46 | ex(8): 44232-9055 10733-1747 86699-2663 68792-6179 52752-6019 41981-8796 47 | ---------------- Examples from DFA --------------- 48 | s1: 66674-7604 49 | 50 | # 260. UK postcode 51 | 52 | expr 2: ([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA) 53 | tree: ((((((([0-9A-PR-UWYZ][0-9A-HK-Y])[0-9AEHMNPRTVXY]?)[0-9ABEHMNPRV-Y]?) 
{1,2})[0-9])[ABD-HJLN-UW-Z]{2})|((((((GI)R) )0)A)A)) 54 | augmnt: (((((((([0-9A-PR-UWYZ][0-9A-HK-Y])[0-9AEHMNPRTVXY]?)[0-9ABEHMNPRV-Y]?)( ?))[0-9])([ABD-HJLN-UW-Z][ABD-HJLN-UW-Z]))|((((((GI)R) )0)A)A))#) 55 | length: 6 to 9 56 | cplxty: 29 57 | -------------------- Examples -------------------- 58 | ex(0): P4 6ZZ HT 9EE CF 5TG GIR 0AA GIR 0AA GIR 0AA M8 4EZ B2 4QP 59 | ex(1): GIR 0AA GIR 0AA 0B 8LB MCAH 4YS 7KY 8YX GIR 0AA J3N 9TN 60 | ex(2): GIR 0AA GIR 0AA CAV8 2SJ GIR 0AA G32 6XA N54 0RD G9 7DJ 61 | ex(3): 9AHX 5JY C09 3EU K90V 1TD GIR 0AA GIR 0AA GIR 0AA GIR 0AA 62 | ex(5): DOXW 8ZH GIR 0AA JBB 5RL R5H3 6SG GIR 0AA GIR 0AA HLR8 1OE 63 | ex(8): GIR 0AA GIR 0AA JPB 7NB TQ6 5WR GIR 0AA GIR 0AA JT4 1AB 1R 4SY 64 | ---------------- Examples from DFA --------------- 65 | s1: J7 9DS 66 | 3 expression(s) loaded 67 | (1 expression(s) rejected) 68 | -------------------------------------------------------------------------------- /test/58-libmail.std: -------------------------------------------------------------------------------- 1 | Options: -R -T -g -h -i -l -v -I 0 2 | 3 | # RegExLib.com -- browse -- Email 4 | 5 | # 6 | 7 | # 26. "Email validator that adheres directly to the specification" 8 | 9 | expr 0: ([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?) 10 | tree: (((([.0-9A-Z_a-z-]+@)((((((([[0-9]{1,3}).)[0-9]{1,3}).)[0-9]{1,3}).)|([0-9A-Za-z-]+.)+))([A-Za-z]{2,4}|[0-9]{1,3}))]?) 
11 | augmnt: ((((([.0-9A-Z_a-z-]+@)((((((([(([0-9][0-9]?)[0-9]?)).)(([0-9][0-9]?)[0-9]?)).)(([0-9][0-9]?)[0-9]?)).)|([0-9A-Za-z-]+.)+))(((([A-Za-z][A-Za-z])[A-Za-z]?)[A-Za-z]?)|(([0-9][0-9]?)[0-9]?)))]?)#) 12 | length: 5 to * 13 | cplxty: 38 14 | -------------------- Examples -------------------- 15 | ex(0): c@y.JD Q@[9.2.1.YB 9@[6.0.2.ZB Z@[4.8.7.MF k@[9.7.6.XQ g@L.6 16 | ex(1): f@[0.8.01.19 o@[994.38.750.ZR T@[471.041.08.20 5@o.62 17 | ex(2): X@[62.83.82.00 e@[90.043.33.34 F@[04.90.2.61] 1J@[7.66.9.SlqH 18 | ex(3): g00@[2.537.66.gKf] 9@D.MFN DUM@S.JY.t.dhX rdt@[719.0.64.qWUL 19 | ex(5): kw8h@[83.23.024.vA Z95@NA-V.2onf.EDHH.Q.Y.GZ] 6i@[64.3.9.30 20 | ex(8): 1fB0G4VF@[40.3.441.75 6Eg@[40.775.817.16] 21 | ---------------- Examples from DFA --------------- 22 | s1: E@q.8 23 | s12: E@q.8] 24 | s14: E@q.84 25 | s13: E@q.yY 26 | s24: E@q.845 27 | s25: E@q.yYe 28 | s21: E@[5.5.2.8 29 | s22: E@[5.5.2.PB 30 | s27: E@[5.5.2.88 31 | s28: E@[5.5.2.PBp 32 | s23: E@[5.5.2.882 33 | 34 | # 356. "my all-time favourite e-mail validator" 35 | 36 | expr 1: (\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3}) 37 | tree: (((((((([0-9A-Z_a-z][.0-9A-Z_a-z-]*)[0-9A-Z_a-z])@)[0-9A-Z_a-z])[.0-9A-Z_a-z-]*)[0-9A-Z_a-z]).)[0-9A-Z_a-z]{2,3}) 38 | augmnt: ((((((((([0-9A-Z_a-z][.0-9A-Z_a-z-]*)[0-9A-Z_a-z])@)[0-9A-Z_a-z])[.0-9A-Z_a-z-]*)[0-9A-Z_a-z]).)(([0-9A-Z_a-z][0-9A-Z_a-z])[0-9A-Z_a-z]?))#) 39 | length: 8 to * 40 | cplxty: 19 41 | -------------------- Examples -------------------- 42 | ex(0): HW@Oq.s2 o3@_k.Kc lI@qB.NJ Td@ei.Dp eA@zW.uc eC@B0.Kb Td@0K.hM 43 | ex(1): 0W6@tyo.u3 Oo@JY.qw fO@yvu.aP 1u@9A5.i3 G6l@dW.bZ 80@kl1.qB 44 | ex(2): cILN@77b_.wd n72@fJf.Gc m.le@n_XG.tN kQD@v3.UD2 vCwt@ita.IZ 45 | ex(3): G_ah@9s.cD4 5t@gBzy.sey 0n@q-Dc.S3 ql@XDEy6.hYQ YLy@6z.hw 46 | ex(5): luIb1@HERp.bXl ZvAfV7E@CnYgMDf.a_J mJg@dt3ic.Rmo hedg@wDh.k6 47 | ex(8): TaY6FN@oso.Ip swXVMMekq@knpUr3G.Rs cHe5Y@tZ9.dkb 3ZVbc@5cREw.2l 48 | ---------------- Examples from DFA --------------- 49 | s1: V6@5O.YR 50 | s9: V6@5O.YRV 51 | 
52 | # 1012. "A short and sweet email address validator" 53 | 54 | expr 2: ([0-9a-zA-Z]+[-._+&])*[0-9a-zA-Z]+@([-0-9a-zA-Z]+[.])+[a-zA-Z]{2,6} 55 | tree: ((((([0-9A-Za-z]+[&+.;_amp-])*[0-9A-Za-z]+)@)([0-9A-Za-z-]+.)+)[A-Za-z]{2,6}) 56 | augmnt: (((((([0-9A-Za-z]+[&+.;_amp-])*[0-9A-Za-z]+)@)([0-9A-Za-z-]+.)+)((((([A-Za-z][A-Za-z])[A-Za-z]?)[A-Za-z]?)[A-Za-z]?)[A-Za-z]?))#) 57 | length: 6 to * 58 | cplxty: 18 59 | -------------------- Examples -------------------- 60 | ex(0): H@w.yJ 2@-.Dv z@n.qY q@5.jD m@x.ZB b@V.Dc c@e.yM K@I.Ut 0@B.oX 61 | ex(1): O;6@8.YVaW y-J@e.EkmVr l;O@T.KrNYx e@C.hueR 8@o.PpvOey v@r.Sx 62 | ex(2): RVpfW.f@w.lQb F.dk@yo.dmsM n_E.N@l.x.lGbQo f9-c@Zb.o.jwBj 63 | ex(3): 4QY+u6;hw_ak4@W.ucvj z;A.p5_lZe@fV.sMnY Vn;J@YKt.6.CuUP 64 | ex(5): KJNFo-NIpJ;af@kw8h.apUr.zlGicV 65 | ex(8): gkAe0voy@b.-nRFd.52eEVsV.BLkhE 66 | ---------------- Examples from DFA --------------- 67 | s1: P@T.pr 68 | s10: P@T.prq 69 | s9: P@T.prqP 70 | s8: P@T.prqPI 71 | s7: P@T.prqPID 72 | 3 expression(s) loaded 73 | -------------------------------------------------------------------------------- /test/80-questions.rx: -------------------------------------------------------------------------------- 1 | #! rxplor -T 2 | #! rxcluster -i 1 3 | #! printf 'n\\nn\\nn\\ny\\ny\\ny\\n' | rxquest -i 1 80-questions.rx 4 | 5 | # things to try to distinguish via "20 questions" 6 | # some of these are deliberately incomplete, imprecise, or even incorrect 7 | 8 | # binary number 9 | [01]+ 10 | # canonical binary number 11 | 0|1[01]* 12 | # binary number divisible by 3 (Sedgewick) 13 | (0|1(01*0)*1)* 14 | # binary number with no doubled digits 15 | 1?(01)*0? 16 | 17 | # unsigned integer 18 | \d+ 19 | # integer not (ending in!) 
0 20 | \d*[1-9] 21 | # nonzero integer, no leading 0 22 | [1-9]\d* 23 | # possibly signed integer 24 | [+-]?\d+ 25 | # canonical integer form (-i or i, no +, or 0) 26 | 0|-?[1-9]\d* 27 | 28 | # number with decimal point 29 | \d*\.\d+|\d+\.\d* 30 | # FORTRAN floating point constant 31 | [+-]?(\d+\.\d*|\.\d+)([dDeE][+-]?\d\d?\d?)? 32 | # C floating point constant 33 | [+-]?(((\d+\.\d*|\.\d+)([eE][+-]?\d+)?)|(\d+[eE][+-]?\d+))[fFlL]? 34 | # JSON number 35 | -?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d{1,3})? 36 | 37 | # US Zip Code 38 | \d{5} 39 | \d{5}(-\d{4})? 40 | \d{5}-\d{4} 41 | 42 | # US Phone Number 43 | \d{7}|\d{10} 44 | \d{10} 45 | [2-9]\d\d[2-9]\d\d\d\d\d\d 46 | ([2-9]\d\d)?[2-9]\d\d\d\d\d\d 47 | (\d\d\d-|\(\d\d\d\) ?)\d\d\d-\d\d\d\d 48 | ([2-9]\d\d-|\([2-9]\d\d\) ?)\d\d\d-\d\d\d\d 49 | \+1 \d\d\d \d\d\d \d\d\d\d 50 | \+1-\d\d\d-\d\d\d-\d\d\d\d 51 | \+1\.\d\d\d\.\d\d\d\.\d\d\d\d 52 | 53 | # US Social Security Number 54 | \d{9} 55 | \d\d\d-\d\d-\d\d\d\d 56 | \d\d\d-(\d[1-9]|[1-9]\d)-\d\d\d\d 57 | ([0-6]\d{2}|7([0-6]\d|7[012]))-\d\d-\d\d\d\d 58 | 59 | # some from: http://www.regular-expressions.info/creditcard.html 60 | # 16-digit credit card number 61 | \d{16} 62 | # slightly more plausible number 63 | [3456]\d{15} 64 | # 16-digit credit card number with optional spaces 65 | \d{4} ?\d{4} ?\d{4} ?\d{4} 66 | # Visa card including old 13-digit form 67 | 4\d{12}(\d\d\d)? 68 | # MasterCard 69 | 5[1-5]\d{14} 70 | # American Express 71 | 3[47]\d{13} 72 | # Discover Card 73 | 6(011|5\d\d)\d{12} 74 | # Credit card number of correct length for issuer 75 | 3\d{14}|[456]\d{15} 76 | 77 | # time 78 | \d{6} 79 | [0-2][0-9][0-5][0-9][0-5][0-9] 80 | ([01]\d|2[0-3])[0-5][0-9][0-5][0-9] 81 | [0-2][0-9]:[0-5][0-9](:[0-5][0-9])? 82 | ([01]\d|2[0-3]):[0-5][0-9](:[0-5][0-9])? 83 | (0?\d|1[0-2]):[0-5][0-9](:[0-5][0-9])? 
84 | 85 | # date yyyymmdd 86 | \d{8} 87 | [12][90]\d\d[01][0-9][012][0-9] 88 | (19|20)\d\d(0[1-9]|1[012])(0[1-9]|[12]\d|3[01]) 89 | (19|20)?\d\d(0[1-9]|1[012])(0[1-9]|[12]\d|3[01]) 90 | (19|20)\d\d\-(0[1-9]|1[012])\-(0[1-9]|[12]\d|3[01]) 91 | (19|20)\d\d\.(0[1-9]|1[012])\.(0[1-9]|[12]\d|3[01]) 92 | # date mmddyyyy 93 | [01][0-9][012][0-9][12][90]\d\d 94 | (0[1-9]|1[012])(0[1-9]|[12]\d|3[01])(19|20)\d\d 95 | (0[1-9]|1[012])/(0[1-9]|[12]\d|3[01])/((19|20)\d\d) 96 | (0?[1-9]|1[012])/(0?[1-9]|[12]\d|3[01])/((19|20)?\d\d) 97 | # date ddmmyyyy 98 | (0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(19|20)\d\d 99 | (0[1-9]|[12]\d|3[01])/(0[1-9]|1[012])/(19|20)\d\d 100 | (0?[1-9]|[12]\d|3[01])/(0?[1-9]|1[012])/(19|20)?\d\d 101 | 102 | # email (simplified) 103 | \w+@\w+\.\w+ 104 | \w+@\w+(\.\w+)+ 105 | [-.'\w]+@\w+(\.\w+)+ 106 | \w[-.'\w]*@\w+(\.\w+)+ 107 | # from RegExLib 356 & 1012 108 | (\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3}) 109 | ([0-9a-zA-Z]+[-._+&])*[0-9a-zA-Z]+@([-0-9a-zA-Z]+[.])+[a-zA-Z]{2,6} 110 | 111 | # web (simplified) 112 | www\.[a-z]{3,}\.(com|edu|net|gov|mil|co\.uk)(/.*)? 113 | www(\.[a-z]{3,})+\.[a-z]{1,3}(/.*)? 114 | www(\.[a-z]{2,})+\.[a-z]{1,3}(/.*)? 115 | (https?://)?\w+(\.\w+)(/.*)? 116 | -------------------------------------------------------------------------------- /webapp/code/examine.go: -------------------------------------------------------------------------------- 1 | // examine.go -- code for inspecting a single expression in detail 2 | 3 | package webapp 4 | 5 | import ( 6 | "fmt" 7 | "net/http" 8 | "rx" 9 | "strings" 10 | ) 11 | 12 | const linemax = 79 // max output line length for generated examples 13 | 14 | // examine presents a query page for examining a single expression 15 | func examine(w http.ResponseWriter, r *http.Request) { 16 | putheader(w, r, "Inspection Query") 17 | fmt.Fprintln(w, `

Here you can specify 18 | a regular expression to generate several kinds of data: 19 | parse trees, synthetic examples, and the automata state lists. 20 | Links from the results page produce diagrams of either the 21 | NFA 22 | or 23 | DFA 24 | for the language. 25 | `) 26 | putform(w, "/details", "Enter a regular expression:", 1, nil, 0, nil) 27 | fmt.Fprintln(w, `

Or choose one of these examples:`) 28 | for _, x := range examples { 29 | genexam(w, x.Caption, x.Expr) 30 | } 31 | putfooter(w, r) 32 | } 33 | 34 | var examples = []struct{ Expr, Caption string }{ 35 | {HomeExample, HomeExLabel}, 36 | {`-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?`, "JSON number"}, 37 | {`(0|1(01*0)*1)*`, "Binary number divisible by 3"}, 38 | {`A(BA)*B?|B(AB)*A?|A?(BA)*B?C((A(BA)*B?|B(AB)*A?)C)*(A(BA)*B?|B(AB)*A?)?`, 39 | "ABCs with no letter doubled"}, 40 | {`\([2-9]\d\d\) [2-9]\d\d\-\d{4}`, "US telephone number"}, 41 | {`[0-8]\d\d-\d\d-\d{4}`, "US social security number"}, 42 | {`(19|20)\d\d\-(0[1-9]|1[012])\-(0[1-9]|[12]\d|3[01])`, "ISO 8601 date"}, 43 | {`([01]\d|2[0-3]):[0-5]\d:[0-5]\dZ`, "ISO 8601 time"}, 44 | {`\w+@\w+(\.\w+)+`, "Naive e-mail address"}, 45 | } 46 | 47 | // details responds to an inspection request for a single expression 48 | func details(w http.ResponseWriter, r *http.Request) { 49 | field := fmt.Sprintf("v%d", baseExpr) 50 | expr := r.FormValue(field) // must read before any writing 51 | expr = strings.TrimSpace(expr) // trim leading/trailing blanks 52 | putheader(w, r, "Inspect Expression") 53 | tree, err := rx.Parse(expr) 54 | if err == nil { 55 | fmt.Fprintf(w, "

Regular Expression: %s\n", hx(expr)) 56 | // must print (or at least stringize) tree before augmenting 57 | fmt.Fprintf(w, "

Initial Parse Tree: %s\n", hx(tree)) 58 | 59 | augt := rx.Augment(tree, 0) 60 | dfa := rx.BuildDFA(augt) 61 | dmin := dfa.Minimize() 62 | 63 | fmt.Fprintf(w, "

Augmented Tree: %s\n", hx(augt)) 64 | fmt.Fprintf(w, "

Examples

\n

") 65 | genexamples(w, tree, 0) 66 | genexamples(w, tree, 1) 67 | genexamples(w, tree, 2) 68 | genexamples(w, tree, 3) 69 | genexamples(w, tree, 5) 70 | genexamples(w, tree, 8) 71 | 72 | showaut(w, dmin, []string{expr}) 73 | } else { 74 | fmt.Fprint(w, "

") 75 | showerror(w, err) 76 | } 77 | 78 | fmt.Fprint(w, "

Try another?

") 79 | putform(w, "/details", "Enter a regular expression:", 80 | 1, []string{expr}, 0, nil) 81 | putfooter(w, r) 82 | } 83 | 84 | // genexamples writes a line of specimen strings matching the expression 85 | func genexamples(w http.ResponseWriter, tree rx.Node, maxrepl int) { 86 | nprinted := 0 87 | ncolm := 0 88 | for { 89 | s := rx.Specimen(tree, maxrepl) 90 | t := rx.Protect(s) 91 | ncolm += 2 + len(t) 92 | if nprinted > 0 && ncolm > linemax { 93 | break 94 | } 95 | fmt.Fprintf(w, " %s   ", hx(t)) 96 | nprinted++ 97 | } 98 | fmt.Fprint(w, "
\n") 99 | } 100 | -------------------------------------------------------------------------------- /altnode.go: -------------------------------------------------------------------------------- 1 | // altnode.go -- parse tree node offering multiple alternative subtrees 2 | 3 | package rx 4 | 5 | import ( 6 | "fmt" 7 | "math/rand" 8 | ) 9 | 10 | // An AltNode generalizes the two-child parse tree node of textbooks to 11 | // allow an arbitrary number of alternatives. This makes it easier to 12 | // give multiple alternatives equal probability when generating examples 13 | // from the parse tree. The degenerate form with zero children functions 14 | // as an Epsilon node. 15 | type AltNode struct { 16 | Alts []Node 17 | NodeData 18 | } 19 | 20 | // Epsilon returns a special AltNode exhibiting no alternatives. 21 | func Epsilon() Node { 22 | return &AltNode{make([]Node, 0), nildata} 23 | } 24 | 25 | // Node.IsEpsilon returns true for an Epsilon node. 26 | func IsEpsilon(d Node) bool { 27 | anode, ok := d.(*AltNode) 28 | return ok && len(anode.Alts) == 0 29 | } 30 | 31 | // AltNode.Children returns the list of alternative subtrees. 32 | func (d *AltNode) Children() []Node { 33 | return d.Alts 34 | } 35 | 36 | // AltNode.MinLen returns the smallest minimum of its subpatterns. 37 | func (d *AltNode) MinLen() int { 38 | n := 0 39 | for i, e := range d.Alts { 40 | emin := e.MinLen() 41 | if i == 0 || emin < n { 42 | n = emin 43 | } 44 | } 45 | return n 46 | } 47 | 48 | // AltNode.MaxLen returns the largest maxima of its subpatterns. 49 | // A value of -1 means that the length is unbounded. 50 | func (d *AltNode) MaxLen() int { 51 | n := 0 52 | for _, e := range d.Alts { 53 | emax := e.MaxLen() 54 | if emax < 0 { // if unbounded 55 | return -1 56 | } 57 | if emax > n { 58 | n = emax 59 | } 60 | } 61 | return n 62 | } 63 | 64 | // AltNode.SetNFL sets the Nullable, FirstPos, LastPos fields. 
65 | func (d *AltNode) SetNFL() { 66 | d.Nullable = (len(d.Alts) == 0) // only if an Epsilon 67 | d.FirstPos = &BitSet{} 68 | d.LastPos = &BitSet{} 69 | for _, e := range d.Alts { 70 | d.Nullable = d.Nullable || e.nullable() 71 | d.FirstPos.OrWith(e.firstPos()) 72 | d.LastPos.OrWith(e.lastPos()) 73 | } 74 | } 75 | 76 | // AltNode.SetFollow has nothing to do. 77 | func (d *AltNode) SetFollow(pmap []*MatchNode) { 78 | } 79 | 80 | // AltNode.Example chooses one subpattern to generate an example. 81 | func (d *AltNode) Example(s []byte, n int) []byte { 82 | if IsEpsilon(d) { 83 | return s // was an Epsilon 84 | } else { 85 | return d.Alts[rand.Intn(len(d.Alts))].Example(s, n) 86 | } 87 | } 88 | 89 | // AltNode.String shows all subpatterns separated by | in parentheses. 90 | func (d *AltNode) String() string { 91 | b := make([]byte, 0) 92 | b = append(b, '(') 93 | for k, v := range d.Alts { 94 | if k > 0 { 95 | b = append(b, '|') 96 | } 97 | b = append(b, fmt.Sprint(v)...) 98 | } 99 | b = append(b, ')') 100 | return string(b) 101 | } 102 | 103 | // Alternate makes an AltNode, collapsing multiple alternatives. 
104 | func Alternate(d Node, e Node) Node { 105 | // if left is nil (not Epsilon), just return right 106 | // (this makes certain loops easier) 107 | if d == nil { 108 | return e 109 | } 110 | // if right is non-Epsilon AltNode, and left is not, combine 111 | altd, okd := d.(*AltNode) 112 | alte, oke := e.(*AltNode) 113 | if (oke && len(alte.Alts) > 0) && !(okd && len(altd.Alts) > 0) { 114 | // insert at left end for intuitive ordering 115 | alist := append(alte.Alts, nil) 116 | copy(alist[1:], alist[0:]) 117 | alist[0] = d 118 | alte.Alts = alist 119 | return alte 120 | } else { 121 | return &AltNode{append(make([]Node, 0), d, e), nildata} 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /webapp/code/form.go: -------------------------------------------------------------------------------- 1 | // form.go -- form generation 2 | 3 | package webapp 4 | 5 | import ( 6 | "fmt" 7 | "io" 8 | "net/http" 9 | "rx" 10 | "strings" 11 | ) 12 | 13 | // Form configuration values. 
14 | var ( 15 | nExpr = 4 // default number of comparison fields 16 | nTest = 2 // default number of example labels 17 | baseExpr = 100 // base for numbering expressions 18 | baseTest = 200 // base for numbering test cases 19 | maxExpr = len(rx.AcceptLabels) // limit on exprs we can label 20 | maxTest = maxExpr // arbitrarily make this same 21 | ) 22 | 23 | // getexprs retrieves and trims submitted expr values 24 | // (if called with none by W3C validator, make one up for better validation) 25 | func getexprs(r *http.Request) []string { 26 | exprlist := make([]string, 0) 27 | for i := 0; i < maxExpr; i++ { 28 | arg := r.FormValue(fmt.Sprintf("v%d", baseExpr+i)) 29 | arg = strings.TrimSpace(arg) 30 | if arg != "" { 31 | exprlist = append(exprlist, arg) 32 | } 33 | } 34 | if len(exprlist) > 0 { 35 | return exprlist 36 | } 37 | if strings.HasPrefix(r.Header.Get("User-Agent"), "W3C_Validator") { 38 | return []string{"(a|b)*abb"} 39 | } 40 | return exprlist 41 | } 42 | 43 | // putform outputs a form for submitting nx expressions and nt tests 44 | // with at least one empty field if possible. 45 | // 46 | // changes in the HTML generated here may require corresponding changes 47 | // in the Javascript addslots() function. 48 | func putform(w io.Writer, target string, label string, 49 | nx int, exprs []string, nt int, tests []string) { 50 | 51 | fmt.Fprintf(w, ` 52 |
53 |
%s  
54 |
55 | `, target, label) 57 | if nx > 1 && nt > 1 { 58 | fmt.Fprintf(w, ` 59 | `, 61 | baseExpr, maxExpr, baseTest, maxTest) 62 | } 63 | fmt.Fprintf(w, ` 64 |
65 |
`) 66 | putfields(w, exprs, nx, baseExpr, maxExpr) 67 | if nt > 0 { 68 | fmt.Fprintf(w, ` 69 |
Enter examples (optional):
`) 70 | putfields(w, tests, nt, baseTest, maxTest) 71 | } 72 | fmt.Fprintf(w, ` 73 |
`) 74 | } 75 | 76 | // putfields outputs a sequence of text input fields. 77 | func putfields(w io.Writer, values []string, n int, base int, max int) { 78 | if n > 1 && len(values) < max { 79 | values = append(values, "") 80 | } 81 | for len(values) < n { 82 | values = append(values, "") 83 | } 84 | for i, v := range values { 85 | fmt.Fprintf(w, ` 86 |
`, 87 | base+i, base+i, hx(v)) 88 | } 89 | } 90 | 91 | // genexam generates a link (actually a form) to "examine" one predefined regex 92 | func genexam(w io.Writer, label string, expr string) { 93 | formlink(w, "/details", []string{expr}, label) 94 | } 95 | 96 | // gencomp generates a link (actually a form) to "compare" multiple regexes 97 | func gencomp(w io.Writer, label string, exprs []string) { 98 | formlink(w, "/combos", exprs, label) 99 | } 100 | 101 | // formlink generates a link-like form for submitting canned examples 102 | func formlink(w io.Writer, path string, exprs []string, label string) { 103 | fmt.Fprintf(w, ` 104 |
`, path) 105 | for i, v := range exprs { 106 | fmt.Fprintf(w, ` 107 | `, baseExpr+i, hx(v)) 108 | } 109 | fmt.Fprintf(w, ` 110 |
`, hx(label)) 111 | } 112 | -------------------------------------------------------------------------------- /node.go: -------------------------------------------------------------------------------- 1 | // node.go -- parse tree node supertype 2 | // 3 | // It's not clear that this is the best way to accomplish "variant records", 4 | // nor is it clear that this is not. 5 | 6 | package rx 7 | 8 | import ( 9 | "fmt" 10 | "io" 11 | ) 12 | 13 | // Node is the "parent class" of all parse tree node subtypes. 14 | // All subtype fields must also be exportable for use with package gob. 15 | // 16 | // The four proper subtypes are MatchNode, ConcatNode, ReplNode, and AltNode. 17 | // Epsilon and Accept are special AltNode and MatchNode forms respectively. 18 | // 19 | // A parse tree is referenced by its root node. There is nothing special 20 | // about the root node, and every subtree is also a valid parse tree. 21 | // However, the First/Last/Follow sets only make sense in the context of 22 | // the root node with respect to which they were computed. 23 | type Node interface { 24 | nullable() bool // return nullable value 25 | firstPos() *BitSet // return FirstPos value 26 | lastPos() *BitSet // return LastPos value 27 | followPos() *BitSet // return FollowPos value 28 | clearFollow() // clear FollowPos set 29 | Children() []Node // return children for tree walking 30 | MinLen() int // return min len matched (0 if nullable) 31 | MaxLen() int // return max len matched (-1 for infinity) 32 | Example([]byte, int) []byte // append example of max repl n to buf 33 | SetNFL() // set Nullable, FirstPos, LastPos 34 | SetFollow([]*MatchNode) // set FollowPos 35 | String() string // return string for printing 36 | } 37 | 38 | // NodeData is included (anonymously) in every Node subtype. 39 | // Common getter functions are defined below for every field. 40 | // The FollowPos sets represent arcs of the NFA implementing the parse tree. 
41 | type NodeData struct { 42 | Nullable bool // can this subtree match empty string? 43 | FirstPos *BitSet // possible initial nodes ("positions") 44 | LastPos *BitSet // possible final nodes ("positions") 45 | FollowPos *BitSet // positions that can follow in NFA 46 | } 47 | 48 | func (dp *NodeData) nullable() bool { return dp.Nullable } 49 | func (dp *NodeData) firstPos() *BitSet { return dp.FirstPos } 50 | func (dp *NodeData) lastPos() *BitSet { return dp.LastPos } 51 | func (dp *NodeData) followPos() *BitSet { return dp.FollowPos } 52 | func (dp *NodeData) clearFollow() { dp.FollowPos = &BitSet{} } 53 | 54 | var nildata = NodeData{} // convenient for initialization 55 | 56 | // Specimen generates a synthetic example from a parse tree 57 | func Specimen(tree Node, maxrepl int) string { 58 | return string(tree.Example(make([]byte, 0), maxrepl)) 59 | } 60 | 61 | // VisitFunc is a function to be executed at each node when walking a tree. 62 | type VisitFunc func(d Node) 63 | 64 | // Walk calls pre and post for every node, before and after visiting children. 65 | // One of these can be nil to accomplish preorder or postorder traversal. 66 | func Walk(tree Node, pre VisitFunc, post VisitFunc) { 67 | if pre != nil { 68 | pre(tree) 69 | } 70 | for _, c := range tree.Children() { 71 | Walk(c, pre, post) 72 | } 73 | if post != nil { 74 | post(tree) 75 | } 76 | } 77 | 78 | // treenodes prints details of the parse tree. 
79 | func (dfa *DFA) ShowTree(f io.Writer, tree Node, label string) { 80 | ShowLabel(f, label) 81 | indent := "" 82 | Walk(tree, func(d Node) { 83 | indent = indent + " " 84 | c := "F" 85 | if d.nullable() { 86 | c = "T" 87 | } 88 | fmt.Fprintf(f, "%s{%s, ", indent[3:], c) 89 | for _, k := range d.firstPos().Members() { 90 | fmt.Fprint(f, dfa.Leaves[k]) 91 | } 92 | fmt.Fprint(f, ", ") 93 | for _, k := range d.lastPos().Members() { 94 | fmt.Fprint(f, dfa.Leaves[k]) 95 | } 96 | fmt.Fprintln(f, "} ", d) 97 | }, func(d Node) { 98 | indent = indent[3:] 99 | }) 100 | } 101 | -------------------------------------------------------------------------------- /test/50-libnum.std: -------------------------------------------------------------------------------- 1 | Options: -R -T -g -h -i -l -v -I 0 2 | 3 | # RegExLib.com -- browse -- Numbers 4 | 5 | # 6 | 7 | # 575. numbers w/ or w/o decimal places and commas 8 | 9 | # BUG: . not escaped 10 | 11 | expr 0: (((\d{1,3})(,\d{3})*)|(\d+))(.\d+)? 12 | tree: ((([0-9]{1,3}(,[0-9]{3})*)|[0-9]+)([\x01-\x7f][0-9]+)?) 13 | augmnt: (((((([0-9][0-9]?)[0-9]?)(,(([0-9][0-9])[0-9]))*)|[0-9]+)([\x01-\x7f][0-9]+)?)#) 14 | length: 1 to * 15 | cplxty: 15 16 | -------------------- Examples -------------------- 17 | ex(0): 4 6 6 7 8 7 8 6 0 4 8 0 6 0 2 8 4 3 0 4 4 8 7 0 18 | ex(1): 8,746 78,091:4 8B4 3 222 6 9,991 9 2}9 4 7 837q9 0 56|9 7 19 | ex(2): 08,908,641 68@6 892,981,209W8 58,209,28661 50c9 5 6 90 043B67 20 | ex(3): 86 6)04 0 6S9 37\"0 66,260,699,116 119 781%602 94h4 2[77 66 21 | ex(5): 3617 6699O63 89,179,275,760~19812 76,717,600 64,047,112,926,735,359 22 | ex(8): 09,827,224,350,214,214,087,422183344333 5783819t2606188 4]36 23 | ---------------- Examples from DFA --------------- 24 | s1: 0 25 | s10: 04 26 | s2: 0C4 27 | s5: 0,8 28 | s9: 045 29 | s4: 0,89 30 | s8: 0457 31 | s3: 0,892 32 | 33 | # 185. numbers w/ or w/o exponential form 34 | 35 | expr 1: [+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)? 
36 | tree: (([+-]?((([0-9]*.?)[0-9]+)|(([0-9]+.?)[0-9]*)))(([Ee][+-]?)[0-9]+)?) 37 | augmnt: ((([+-]?((([0-9]*.?)[0-9]+)|(([0-9]+.?)[0-9]*)))(([Ee][+-]?)[0-9]+)?)#) 38 | length: 1 to * 39 | cplxty: 19 40 | -------------------- Examples -------------------- 41 | ex(0): 4 6 6 7 8 7 8 6 0 4 8 0 6 0 2 8 4 3 0 4 4 8 7 0 42 | ex(1): 9 -7e+9 +.3E+8 2. 8 01e+9 -8E-0 +7e7 7 +.9e+7 -.0e8 .3e-4 43 | ex(2): +4210 80 86e+30 9e6 99 .38e53 86 499 0E+83 43.46e-6 9916 44 | ex(3): +5161e+4 84.77 44219 +3.74e99 36921 +756 -981796 179 74604e+2 45 | ex(5): -6800.84 2243.72E46 3297e+33336 +8.9026 8984.3 6.43 693 46 | ex(8): 97562E61 14.3 303304449e-50277143 3081715E-31 3.750619 372284238 47 | ---------------- Examples from DFA --------------- 48 | s1: 1 49 | s4: 1. 50 | s6: 1e4 51 | 52 | # 943. numbers allowing sign and leading zeroes 53 | 54 | # BUGGY 55 | 56 | expr 2: \-?\(?([0-9]{0,3}(\,?[0-9]{3})*(\.?[0-9]*))\)? 57 | tree: (((-?(?)(([0-9]{0,3}(,?[0-9]{3})*)(.?[0-9]*))))?) 58 | augmnt: ((((-?(?)(((([0-9]?[0-9]?)[0-9]?)(,?(([0-9][0-9])[0-9]))*)(.?[0-9]*))))?)#) 59 | length: 0 to * 60 | cplxty: 19 61 | -------------------- Examples -------------------- 62 | ex(0): 63 | ex(1): 667) 0. -288304) 7. (4. -16394. (2 0,910. 3,037 -(67) 71347. 64 | ex(2): -2879,79942 -80 8,418.8 -3082799 43.6) -(67 (02536.) 65 | ex(3): (5,867,910,1609) -459,726.42) -503174.69) 6,161,527,760.19 66 | ex(5): -01926735359462.7) 9482,224.072) -(,229.) 33,381,902061898361. -(66 67 | ex(8): -(93630,044949.743502) -(40,715,517,310.7407) 68 | ---------------- Examples from DFA --------------- 69 | s0: 70 | s2: 1 71 | s3: - 72 | s6: ) 73 | s7: . 74 | s8: ,592 75 | s10: ,5929 76 | s9: ,59295 77 | 78 | # 126. "US currency" 79 | 80 | expr 3: \$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})? 81 | tree: (($(([0-9]{1,3}(,[0-9]{3})*)|[0-9]+))(.[0-9]{2})?) 
82 | augmnt: ((($(((([0-9][0-9]?)[0-9]?)(,(([0-9][0-9])[0-9]))*)|[0-9]+))(.([0-9][0-9]))?)#) 83 | length: 2 to * 84 | cplxty: 18 85 | -------------------- Examples -------------------- 86 | ex(0): $3 $6 $8 $7 $2 $0 $5 $0 $2 $2 $5 $3 $9 $4 $9 $0 $0 $2 87 | ex(1): $47 $0.36 $8.43 $82,620 $8,011.10 $6 $63,403.88 $5.76 $0 $5 88 | ex(2): $08,908,641 $8.46 $73,981,209.68 $3 $28,652 $50.09 $5,270,990 89 | ex(3): $634 $44 $690.00 $318.38 $37 $66,260,699,116 $198 $91 $602.94 90 | ex(5): $89,726.44 $190.10 $6174.66 $26636 $161.52 $760.41 $1,646.17 $0 91 | ex(8): $9,350,592,625,379,680,994 $23,335,721,321 $83,979,334,333.57 $19390 92 | ---------------- Examples from DFA --------------- 93 | s1: $6 94 | s12: $66 95 | s11: $664 96 | s4: $6.08 97 | s10: $6645 98 | s7: $6,577 99 | 4 expression(s) loaded 100 | -------------------------------------------------------------------------------- /test/40-found.std: -------------------------------------------------------------------------------- 1 | Options: -R -T -g -h -i -l -v -I 0 2 | 3 | # Miscellaneous samples from the web 4 | 5 | # 6 | 7 | #D: (main page) "any email address" 8 | #U: http://www.regular-expressions.info/ 9 | expr 0: [A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4} 10 | tree: (((([%+.0-9A-Z_-]+@)[.0-9A-Z-]+).)[A-Z]{2,4}) 11 | augmnt: ((((([%+.0-9A-Z_-]+@)[.0-9A-Z-]+).)((([A-Z][A-Z])[A-Z]?)[A-Z]?))#) 12 | length: 6 to * 13 | cplxty: 12 14 | -------------------- Examples -------------------- 15 | ex(0): V@B.HI Y@1.KB K@N.OC L@K.CK Z@Z.VW E@E.GZ _@W.HY E@G.AE Z@..UE 16 | ex(1): S@R.ZQS A@E.JU J@0.KI Y@D.BOD E@J.FR G@Y.PLL W@U.IPTO L@F.IKF 17 | ex(2): DO@R.JOY S@C..OKE -@B4.LHKS 2@C7.LCW 5@OM.KX OP@7.IID O4@7.CTJ 18 | ex(3): Z@9.AZ 9@Q.RZ JXS@XHI.CZT -T@C.UCP .@46-.XYF 7JD@JO.PGRO 19 | ex(5): 91I6@AXH.SNP G@Z.VSF ._C@LKZWK.RUMX 0197R@MJF.7.DT 2_@795YE.GAU 20 | ex(8): +KXE8@-.LH T3F@LBF4-LG.ET -MXG1.4@OCP2.FOF W.5LSFAG@KO61TB.IOAI 21 | ---------------- Examples from DFA --------------- 22 | s1: -@R.EV 23 | s8: -@R.EVI 24 | s7: 
-@R.EVIB 25 | 26 | #D: (final email example) 27 | #U: http://www.regular-expressions.info/email.html 28 | expr 1: [a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum) 29 | tree: (((([!#-'*+/-9=?^-~-]+(.[!#-'*+/-9=?^-~-]+)*)@)(([0-9a-z]([0-9a-z-]*[0-9a-z])?).)+)([A-Z]{2}|((co)m)|((or)g)|((ne)t)|((ed)u)|((go)v)|((mi)l)|((bi)z)|(((in)f)o)|(((mo)b)i)|(((na)m)e)|(((ae)r)o)|(((as)i)a)|(((jo)b)s)|(((((mu)s)e)u)m))) 30 | augmnt: ((((([!#-'*+/-9=?^-~-]+(.[!#-'*+/-9=?^-~-]+)*)@)(([0-9a-z]([0-9a-z-]*[0-9a-z])?).)+)(([A-Z][A-Z])|((co)m)|((or)g)|((ne)t)|((ed)u)|((go)v)|((mi)l)|((bi)z)|(((in)f)o)|(((mo)b)i)|(((na)m)e)|(((ae)r)o)|(((as)i)a)|(((jo)b)s)|(((((mu)s)e)u)m)))#) 31 | length: 6 to * 32 | cplxty: 107 33 | -------------------- Examples -------------------- 34 | ex(0): k@9.biz w@k.asia 9@k.aero u@9.jobs '@e.biz 8@k.edu s@n.info 35 | ex(1): $.`@9.asia s@f.com m.+@e.mobi _.i@7y7.mobi z.q@kaq.mobi _.e@u.mobi 36 | ex(2): r8@f.f8n.info n.{.^@5.g.net '.*.+3@x.net 4x./y.^@na7.g.net 37 | ex(3): 9?!@q9eky.mil xb`.k.&t_@x.a.y.edu !jz.hu@vn5.e.org 38 | ex(5): 62.7g.$9^.1.y~@24.qdap9ru.jm9o.m.name 39 | ex(8): p-@v1eb07k.mobi 3c8pp4m.8~.!??}.jk$'3=c.{.j-#@7.0.7q8j.vlttixf.0.com 40 | ---------------- Examples from DFA --------------- 41 | s1: =@9.SM 42 | s31: =@9.biz 43 | 44 | #D: (telno "Yecchhh") 45 | #U: http://compilers.iecc.com/comparch/article/01-10-072 46 | expr 2: (\([0-9]{3}\)[ ]{1}){0,1}[0-9]{3}\-[0-9]{4} 47 | tree: ((((((([0-9]{3}))) )?[0-9]{3})-)[0-9]{4}) 48 | augmnt: (((((((((([0-9][0-9])[0-9])))) )?(([0-9][0-9])[0-9]))-)((([0-9][0-9])[0-9])[0-9]))#) 49 | length: 8 to 14 50 | cplxty: 20 51 | -------------------- Examples -------------------- 52 | ex(0): 443-5677 888-9826 100-5860 162-1278 548-0094 746-9780 270-4290 53 | ex(1): 390-1364 (394) 482-2326 062-8889 011-9109 (422) 939-4037 887-5707 54 | ex(2): 497-1703 714-0780 (908) 410-7685 462-7932 (981) 
091-6885 380-2098 55 | ex(3): (883) 930-0668 707-9080 743-8393 753-3488 (440) 659-0432 053-3189 56 | ex(5): 465-0592 006-9011 215-1986 (891) 516-2199 772-8228 593-7269 57 | ex(8): (107) 617-7866 (926) 687-2161 (527) 760-9419 128-9646 58 | ---------------- Examples from DFA --------------- 59 | s1: 640-4207 60 | 61 | #D: (a reformulation) 62 | #U: http://compilers.iecc.com/comparch/article/01-10-081 63 | expr 3: (\(\d\d\d\) )?\d\d\d-\d\d\d\d 64 | tree: (((((((((((((([0-9])[0-9])[0-9]))) )?[0-9])[0-9])[0-9])-)[0-9])[0-9])[0-9])[0-9]) 65 | augmnt: ((((((((((((((([0-9])[0-9])[0-9]))) )?[0-9])[0-9])[0-9])-)[0-9])[0-9])[0-9])[0-9])#) 66 | length: 8 to 14 67 | cplxty: 27 68 | -------------------- Examples -------------------- 69 | ex(0): 443-5677 888-9826 100-5860 162-1278 548-0094 746-9780 270-4290 70 | ex(1): 390-1364 (394) 482-2326 062-8889 011-9109 (422) 939-4037 887-5707 71 | ex(2): 497-1703 714-0780 (908) 410-7685 462-7932 (981) 091-6885 380-2098 72 | ex(3): (883) 930-0668 707-9080 743-8393 753-3488 (440) 659-0432 053-3189 73 | ex(5): 465-0592 006-9011 215-1986 (891) 516-2199 772-8228 593-7269 74 | ex(8): (107) 617-7866 (926) 687-2161 (527) 760-9419 128-9646 75 | ---------------- Examples from DFA --------------- 76 | s1: 640-4207 77 | 4 expression(s) loaded 78 | -------------------------------------------------------------------------------- /bkt_test.go: -------------------------------------------------------------------------------- 1 | // bkt_test.go -- unit tests for bracket-expression parsing 2 | // 3 | // These tests validate bxparse() and also BitSet.Bracketed(). 4 | // 5 | // These tests are run by the "go test" command. 6 | 7 | package rx 8 | 9 | import ( 10 | "fmt" 11 | "testing" 12 | ) 13 | 14 | // TestBrackets runs a series of self-contained small tests. 
15 | func TestBrackets(t *testing.T) { 16 | fmt.Println("bkt_test.go: TestBrackets") 17 | // try(t, string, expected) 18 | try(t, `[a]`, `[a]`) 19 | try(t, `[bc]`, `[bc]`) 20 | try(t, `[def]`, `[def]`) 21 | try(t, `[ghij]`, `[g-j]`) 22 | try(t, `[lmnop]`, `[l-p]`) 23 | try(t, `[tuvwxyz]`, `[t-z]`) 24 | try(t, `[ACDFGHJKLMOPQRSUVWXYZ]`, `[ACDFGHJ-MO-SU-Z]`) 25 | try(t, `[aeiuo]`, `[aeiou]`) 26 | try(t, `[aeiuo]`, `[aeiou]`) 27 | try(t, `[a-z]`, `[a-z]`) 28 | try(t, `[A-Z]`, `[A-Z]`) 29 | try(t, `[a-zA-Z]`, `[A-Za-z]`) 30 | try(t, `[_a-zA-Z]`, `[A-Z_a-z]`) 31 | try(t, `[0-9]`, `[0-9]`) 32 | try(t, `[_a-zA-Z0-9]`, `[0-9A-Z_a-z]`) 33 | try(t, `[_a-zA-Z]`, `[A-Z_a-z]`) 34 | try(t, `[A-HO-Z]`, `[A-HO-Z]`) 35 | // ugly but legal cset (bracket expression) forms 36 | try(t, `[-]`, `[-]`) 37 | try(t, `[-x]`, `[x-]`) 38 | try(t, `[x-]`, `[x-]`) 39 | try(t, `[[x]`, `[[x]`) 40 | try(t, `[[x-]`, `[[x-]`) 41 | try(t, `[]x]`, `[]x]`) 42 | try(t, `[]x-]`, `[]x-]`) 43 | try(t, `[]x[-]`, `[][x-]`) 44 | try(t, `[][]`, `[][]`) 45 | // ] and - at end of range 46 | try(t, `[$-\-]`, `[$-,-]`) 47 | try(t, `[]\\[ZYX]`, `[]X-\]`) 48 | try(t, `[]$-\-\\[ZYX]`, `[]$-,X-\-]`) 49 | try(t, `[-.-\]]]`, `[].-\-]`) 50 | // character set escapes 51 | // n.b. 
in POSIX, \ in [] is not special, but we follow Perl here 52 | try(t, `[\-]`, `[-]`) 53 | try(t, `[\]]`, `[]]`) 54 | try(t, `[ab\[cd\-gh\]ij]`, `[][a-dg-j-]`) 55 | try(t, `[*&=!+]`, `[!&*+=]`) 56 | try(t, `[\*\&\=\!\+]`, `[!&*+=]`) 57 | // perl inventions 58 | try(t, `[\d]`, `[0-9]`) 59 | try(t, `[\d0IZESB]`, `[0-9BEISZ]`) 60 | try(t, `[\w]`, `[0-9A-Z_a-z]`) 61 | // C-style escapes 62 | try(t, `[\a]`, `[\a]`) 63 | try(t, `[\b]`, `[\b]`) 64 | try(t, `[\e]`, `[\x1b]`) 65 | try(t, `[\f]`, `[\f]`) 66 | try(t, `[\n]`, `[\n]`) 67 | try(t, `[\r]`, `[\r]`) 68 | try(t, `[\t]`, `[\t]`) 69 | try(t, `[\v]`, `[\v]`) 70 | try(t, `[\y]`, `'\y' unrecognized`) 71 | // octal escapes 72 | try(t, `[\0]`, `[\x00]`) 73 | try(t, `[\00]`, `[\x00]`) 74 | try(t, `[\000]`, `[\x00]`) 75 | try(t, `[\1a]`, `[\x01a]`) 76 | try(t, `[\11b]`, `[\tb]`) 77 | try(t, `[\111c]`, `[Ic]`) 78 | try(t, `[\7]`, `[\a]`) 79 | try(t, `[\043]`, `[#]`) 80 | try(t, `[\45]`, `[%]`) 81 | try(t, `[\75]`, `[=]`) 82 | try(t, `[\107]`, `[G]`) 83 | try(t, `[\176]`, `[~]`) 84 | // hex escapes 85 | try(t, `[\x2a3]`, `[*3]`) 86 | try(t, `[\u006B4]`, `[4k]`) 87 | try(t, `[\x`, `malformed '\xhh'`) 88 | try(t, `[\x5`, `malformed '\xhh'`) 89 | try(t, `[\x5G`, `malformed '\xhh'`) 90 | try(t, `[\u`, `malformed '\uhhhh'`) 91 | try(t, `[\u173]`, `malformed '\uhhhh'`) 92 | try(t, `[\umass]`, `malformed '\uhhhh'`) 93 | // big sets \D \W \S 94 | try(t, `[\D]`, `[\x01-/:-\x7f]`) 95 | try(t, `[\W]`, "[\\x01-/:-@[-^`{-\\x7f]") 96 | try(t, `[\S]`, `[\x01-\b\x0e-\x1f!-\x7f]`) 97 | // the following pairs should be identical 98 | try(t, `[\d]`, `[0-9]`) 99 | try(t, `[^\D]`, `[0-9]`) 100 | try(t, `[\w]`, `[0-9A-Z_a-z]`) 101 | try(t, `[^\W]`, `[0-9A-Z_a-z]`) 102 | try(t, `[\s]`, `[\t-\r ]`) 103 | try(t, `[^\S]`, `[\t-\r ]`) 104 | // complemented character sets 105 | try(t, "[^ -`]", `[\x01-\x1fa-\x7f]`) 106 | try(t, `[^ -@]`, `[\x01-\x1fA-\x7f]`) 107 | try(t, `[^ -/]`, `[\x01-\x1f0-\x7f]`) 108 | try(t, "[^ -/:-@[-`{-~]", 
`[\x01-\x1f0-9A-Za-z\x7f]`) 109 | try(t, `[^0-9]`, `[\x01-/:-\x7f]`) 110 | try(t, `[^A-Za-z0-9]`, "[\\x01-/:-@[-`{-\\x7f]") 111 | // these last two should be identical (under -Z) 112 | try(t, `[^A-Za-z0-9_]`, "[\\x01-/:-@[-^`{-\\x7f]") 113 | try(t, `[^\w]`, "[\\x01-/:-@[-^`{-\\x7f]") 114 | // erroneous 115 | try(t, `[`, `unclosed '['`) 116 | try(t, `[^`, `unclosed '['`) 117 | try(t, `[]`, `unclosed '['`) 118 | try(t, `[^]`, `unclosed '['`) 119 | try(t, `[\]`, `unclosed '['`) 120 | try(t, `[abc`, `unclosed '['`) 121 | try(t, `[def\]`, `unclosed '['`) 122 | } 123 | 124 | // try parses one string and checks the result. 125 | // The input string should begin with the initial '[' to be ignored. 126 | func try(t *testing.T, input string, expected string) { 127 | var output string 128 | subj := []rune(input[1:]) 129 | result, errmsg := bxparse(subj) 130 | if result == nil { 131 | output = string(errmsg) 132 | } else { 133 | output = result.Bracketed() 134 | } 135 | if output != expected { 136 | t.Error(input, "=>", output, "(expected ", expected, ")") 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /bitset.go: -------------------------------------------------------------------------------- 1 | // bitset.go -- a datatype for representing sets of small integers 2 | // 3 | // Most of these operations are functional in concept, 4 | // but a few modify the receiver: Set, Clear, OrWith, AndWith. 5 | // 6 | // based originally on code from the Go Playground 7 | // http://play.golang.org/p/NpHns5EBnQ as of 11-Feb-2014 8 | 9 | package rx 10 | 11 | import ( 12 | "fmt" 13 | "math/big" 14 | ) 15 | 16 | // A BitSet is a simple bit-mapped representation of a set of small ints. 17 | // No explicit constructor is needed; use new(BitSet) or &BitSet{}. 18 | type BitSet struct { 19 | Bits big.Int 20 | } 21 | 22 | // BitSet.Set sets one bit in a BitSet. 
23 | func (b *BitSet) Set(bit int) *BitSet { 24 | b.Bits.SetBit(&b.Bits, bit, 1) 25 | return b 26 | } 27 | 28 | // BitSet.Clear clears one bit in a BitSet. 29 | func (b *BitSet) Clear(bit int) *BitSet { 30 | b.Bits.SetBit(&b.Bits, bit, 0) 31 | return b 32 | } 33 | 34 | // BitSet.Test returns true if the specified BitSet bit is set. 35 | func (b *BitSet) Test(bit int) bool { 36 | return b.Bits.Bit(bit) == 1 37 | } 38 | 39 | // BitSet.IsEmpty returns true if no bits are set in the BitSet. 40 | func (b *BitSet) IsEmpty() bool { 41 | return b.Bits.BitLen() == 0 42 | } 43 | 44 | // BitSet.Count returns the number of bits that are set. 45 | func (b *BitSet) Count() int { 46 | n := 0 47 | l := b.LowBit() 48 | h := b.HighBit() 49 | for i := l; i <= h; i++ { // for all values up to highest 50 | if b.Test(i) { // if this value is included 51 | n++ // count it 52 | } 53 | } 54 | return n 55 | } 56 | 57 | // BitSet.LowBit returns the number of the smallest bit set. 58 | // It returns 0 if the BitSet is empty. 59 | func (b *BitSet) LowBit() int { 60 | // inspired by thoughts of HAKMEM... 61 | bigTemp.Sub(&b.Bits, bigOne) 62 | bigTemp.Xor(&b.Bits, bigTemp) 63 | bigTemp.Add(bigOne, bigTemp) 64 | n := bigTemp.BitLen() - 2 65 | if n >= 0 { 66 | return n 67 | } else { 68 | return 0 69 | } 70 | } 71 | 72 | // BitSet.HighBit returns the number of the highest bit set. 73 | // It returns -1 if the BitSet is empty. 74 | func (b *BitSet) HighBit() int { 75 | return b.Bits.BitLen() - 1 76 | } 77 | 78 | var bigOne = big.NewInt(1) // static constant used in LowBit() 79 | var bigTemp = big.NewInt(0) // static temporary used in LowBit() 80 | 81 | // BitSet.Equals returns true if the argument set is identical to this one. 82 | func (b1 *BitSet) Equals(b2 *BitSet) bool { 83 | return (b1.Bits.Cmp(&b2.Bits) == 0) 84 | } 85 | 86 | // BitSet.Or produces a new BitSet that is the union of its inputs. 
87 | func (b1 *BitSet) Or(b2 *BitSet) *BitSet { 88 | b3 := new(BitSet) 89 | b3.Bits.Or(&b1.Bits, &b2.Bits) 90 | return b3 91 | } 92 | 93 | // BitSet.OrWith accomplishes an OR-in-place, eliminating a memory allocation. 94 | func (b1 *BitSet) OrWith(b2 *BitSet) *BitSet { 95 | b1.Bits.Or(&b1.Bits, &b2.Bits) 96 | return b1 97 | } 98 | 99 | // BitSet.And produces a new BitSet that is the intersection of its inputs. 100 | func (b1 *BitSet) And(b2 *BitSet) *BitSet { 101 | b3 := new(BitSet) 102 | b3.Bits.And(&b1.Bits, &b2.Bits) 103 | return b3 104 | } 105 | 106 | // BitSet.AndWith accomplishes an And-in-place, eliminating a memory allocn. 107 | func (b1 *BitSet) AndWith(b2 *BitSet) *BitSet { 108 | b1.Bits.And(&b1.Bits, &b2.Bits) 109 | return b1 110 | } 111 | 112 | // BitSet.AndNot produces a new BitSet that clears the bits of b2 from b1. 113 | func (b1 *BitSet) AndNot(b2 *BitSet) *BitSet { 114 | b3 := new(BitSet) 115 | b3.Bits.AndNot(&b1.Bits, &b2.Bits) 116 | return b3 117 | } 118 | 119 | // BitSet.Key returns an unprintable string usable as a map key. 120 | // (Neither a BitSet nor the underlying big.Int is a legal key type.) 121 | func (b *BitSet) Key() string { 122 | if b == nil { 123 | return "" 124 | } else { 125 | return string(b.Bits.Bytes()) 126 | } 127 | } 128 | 129 | // BitSet.Members returns a slice containing the values found in the set. 130 | // This is the easiest way to iterate through the members of a bit set: 131 | // for _, i := range bset.Members() { ... } 132 | func (b *BitSet) Members() []int { 133 | m := make([]int, 0, 0) // initial capacity 0 is faster than h-l+1 134 | l := b.LowBit() 135 | h := b.HighBit() 136 | for i := l; i <= h; i++ { // for all values up to highest 137 | if b.Test(i) { // if this value is included 138 | m = append(m, i) 139 | } 140 | } 141 | return m 142 | } 143 | 144 | // BitSet.String() returns a set-notation representation of the bitset. 145 | // This is used automatically when printing a bitset with "%s" format. 
146 | func (b *BitSet) String() string { 147 | m := b.Members() 148 | s := make([]byte, 0, 4*len(m)) 149 | s = append(s, '{') 150 | for _, i := range m { 151 | s = append(s, fmt.Sprintf(" %d", i)...) 152 | } 153 | return string(append(s, " }"...)) 154 | } 155 | -------------------------------------------------------------------------------- /charset.go: -------------------------------------------------------------------------------- 1 | // charset.go -- bit set extensions for use as sets of characters 2 | // 3 | // These additional functions support the use of a BitSet as a set of chars. 4 | // No distinct type is defined, however -- it's still a BitSet. 5 | // 6 | // Note that "all characters" (for purposes of wildcarding or complementing) 7 | // defines a set of just the ASCII characters [\x01-\x7F]. 8 | 9 | package rx 10 | 11 | import ( 12 | "math/rand" 13 | "strconv" 14 | "unicode/utf8" 15 | ) 16 | 17 | // Predefined global character sets. 18 | // Once constructed, these should be treated as constant. 19 | var ( 20 | SpaceSet *BitSet = CharSet("\t\n\v\f\r ") 21 | DigitSet *BitSet = CharSet("0123456789") 22 | UpperSet *BitSet = CharSet("ABCDEFGHIJKLMNOPQRSTUVWXYZ") 23 | LowerSet *BitSet = CharSet("abcdefghijklmnopqrstuvwxyz") 24 | LetterSet *BitSet = UpperSet.Or(LowerSet) 25 | WordSet *BitSet = LetterSet.Or(DigitSet).Set('_') 26 | CtrlSet *BitSet = CharRange('\x00', '\x1F').Or(CharRange('\x7F', '\x9F')) 27 | AllChars *BitSet = CharRange('\x01', '\x7F') // matched by "." 28 | NonDigit *BitSet = DigitSet.CharCompl() 29 | NonSpace *BitSet = SpaceSet.CharCompl() 30 | NonWord *BitSet = WordSet.CharCompl() 31 | ) 32 | 33 | // CharSet makes a BitSet from a string of member characters. 34 | func CharSet(s string) *BitSet { 35 | cs := new(BitSet) 36 | for _, ch := range s { 37 | cs.Set(int(ch)) 38 | } 39 | return cs 40 | } 41 | 42 | // CharRange makes a BitSet from a range of characters. 
43 | func CharRange(low int, high int) *BitSet { 44 | cs := new(BitSet) 45 | for ; low <= high; low++ { 46 | cs.Set(low) 47 | } 48 | return cs 49 | } 50 | 51 | // BitSet.CharCompl produces a new BitSet that is the complement of its inputs 52 | // with respect to the universe of matchable characters AllChars. 53 | func (b1 *BitSet) CharCompl() *BitSet { 54 | b3 := new(BitSet) 55 | b3.Bits.Xor(&b1.Bits, &AllChars.Bits) 56 | return b3 57 | } 58 | 59 | // BitSet.RandChar returns a single randomly chosen BitSet element. 60 | // Control characters are avoided unless nothing else is available. 61 | func (b *BitSet) RandChar() rune { 62 | low := b.LowBit() // lowest eligible char 63 | high := b.HighBit() // highest eligible char 64 | if low < ' ' || high > '~' { // if range extends beyond ASCII printables 65 | b2 := b.AndNot(CtrlSet) // remove the control characters 66 | if !b2.IsEmpty() { // and use that set if non-empty 67 | low = b2.LowBit() 68 | high = b2.HighBit() 69 | } 70 | } 71 | // pick a random char between low and high inclusive. 72 | // if it's part of the set, we're done. 73 | c := low + rand.Intn(high-low+1) 74 | if b.Test(c) { 75 | return rune(c) 76 | } 77 | // otherwise, pick a random stride and find one. 78 | span := high - low + 1 79 | stride := rand.Intn(span) 80 | for GCD(stride, span) > 1 { 81 | stride-- 82 | } 83 | c = low + ((c - low + stride) % span) 84 | for !b.Test(c) { 85 | c = low + ((c - low + stride) % span) 86 | } 87 | return rune(c) 88 | } 89 | 90 | // BitSet.Unbracketed() returns a single character or else a bracketed form. 91 | func (b *BitSet) Unbracketed() string { 92 | s := b.Bracketed() 93 | if utf8.RuneCountInString(s) == 3 { 94 | return s[1 : len(s)-1] // return one char without brackets 95 | } else { 96 | return s // return multiple characters with brackets 97 | } 98 | } 99 | 100 | // BitSet.Bracketed() returns a bracket-expression form of a character set, 101 | // using ranges if appropriate and escaping (only) unprintables. 
102 | // ] and - are specially placed at beginning and end respectively. 103 | func (b *BitSet) Bracketed() string { 104 | l := b.LowBit() 105 | h := b.HighBit() 106 | open := "[" 107 | close := "]" 108 | s := make([]byte, 0) 109 | for i := l; i <= h; i++ { // for all chars up to highest 110 | if b.Test(i) { // if char is included 111 | if i == ']' { 112 | open = "[]" // move ']' to front 113 | continue 114 | } 115 | if i == '-' { 116 | close = "-]" // defer '-' to end 117 | continue 118 | } 119 | s = append(s, cprotect(rune(i))...) // show char 120 | var j int 121 | for j = i + 1; b.Test(j); j++ { 122 | // count consecutive inclusions 123 | } 124 | if j-i > 3 { // if worth using [a-z] form 125 | if (j-1) == '-' || (j-1) == ']' { 126 | // don't end range with '-' or ']' 127 | j-- 128 | } 129 | i = j - 1 130 | s = append(s, '-') 131 | s = append(s, cprotect(rune(i))...) 132 | } 133 | } 134 | } 135 | return open + string(s) + close 136 | } 137 | 138 | // cprotect returns its argument if printable, else a backslash form. 
139 | func cprotect(r rune) string { 140 | if strconv.IsPrint(r) { 141 | return string(r) 142 | } else { 143 | s := strconv.QuoteRune(r) 144 | return s[1 : len(s)-1] 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /test/d7-jellyfish.std2: -------------------------------------------------------------------------------- 1 | // NFA: 7 expressions 2 | digraph NFA { 3 | label="NFA: 7 expressions" 4 | node [shape=circle, height=.3, margin=0, fontsize=10] 5 | i->p0[label=" [+-]"] 6 | i->p1[label=" [0-9]"] 7 | i->p3[label=" +"] 8 | i->p19[label=" [0-9]"] 9 | i->p31[label=" [0-9]"] 10 | i->p48[label=" 3"] 11 | i->p64[label=" [12]"] 12 | i->p73[label=" 0"] 13 | i->p75[label=" 1"] 14 | i [shape=triangle, regular=true, label=""] 15 | p0 [label="p0"] 16 | p0->p1[label=" [0-9]"] 17 | p1 [label="p1"] 18 | p1->p1[label=" [0-9]"] 19 | p1 [shape=doublecircle] 20 | p3 [label="p3"] 21 | p3->p4[label=" 1"] 22 | p4 [label="p4"] 23 | p4->p5[label=" -"] 24 | p5 [label="p5"] 25 | p5->p6[label=" [0-9]"] 26 | p6 [label="p6"] 27 | p6->p7[label=" [0-9]"] 28 | p7 [label="p7"] 29 | p7->p8[label=" [0-9]"] 30 | p8 [label="p8"] 31 | p8->p9[label=" -"] 32 | p9 [label="p9"] 33 | p9->p10[label=" [0-9]"] 34 | p10 [label="p10"] 35 | p10->p11[label=" [0-9]"] 36 | p11 [label="p11"] 37 | p11->p12[label=" [0-9]"] 38 | p12 [label="p12"] 39 | p12->p13[label=" -"] 40 | p13 [label="p13"] 41 | p13->p14[label=" [0-9]"] 42 | p14 [label="p14"] 43 | p14->p15[label=" [0-9]"] 44 | p15 [label="p15"] 45 | p15->p16[label=" [0-9]"] 46 | p16 [label="p16"] 47 | p16->p17[label=" [0-9]"] 48 | p17 [label="p17"] 49 | p17 [shape=doublecircle] 50 | p19 [label="p19"] 51 | p19->p20[label=" [0-9]"] 52 | p20 [label="p20"] 53 | p20->p21[label=" [0-9]"] 54 | p21 [label="p21"] 55 | p21->p22[label=" -"] 56 | p22 [label="p22"] 57 | p22->p23[label=" [0-9]"] 58 | p23 [label="p23"] 59 | p23->p24[label=" [0-9]"] 60 | p24 [label="p24"] 61 | p24->p25[label=" -"] 62 | p25 [label="p25"] 63 | 
p25->p26[label=" [0-9]"] 64 | p26 [label="p26"] 65 | p26->p27[label=" [0-9]"] 66 | p27 [label="p27"] 67 | p27->p28[label=" [0-9]"] 68 | p28 [label="p28"] 69 | p28->p29[label=" [0-9]"] 70 | p29 [label="p29"] 71 | p29 [shape=doublecircle] 72 | p31 [label="p31"] 73 | p31->p32[label=" [0-9]"] 74 | p32 [label="p32"] 75 | p32->p33[label=" [0-9]"] 76 | p33 [label="p33"] 77 | p33->p34[label=" [0-9]"] 78 | p34 [label="p34"] 79 | p34->p35[label=" [0-9]"] 80 | p35 [label="p35"] 81 | p35->p36[label=" [0-9]"] 82 | p36 [label="p36"] 83 | p36->p37[label=" [0-9]"] 84 | p37 [label="p37"] 85 | p37->p38[label=" [0-9]"] 86 | p38 [label="p38"] 87 | p38->p39[label=" [0-9]"] 88 | p39 [label="p39"] 89 | p39->p40[label=" [0-9]"] 90 | p40 [label="p40"] 91 | p40->p41[label=" [0-9]"] 92 | p41 [label="p41"] 93 | p41->p42[label=" [0-9]"] 94 | p42 [label="p42"] 95 | p42->p43[label=" [0-9]"] 96 | p43 [label="p43"] 97 | p43->p44[label=" [0-9]"] 98 | p44 [label="p44"] 99 | p44->p45[label=" [0-9]"] 100 | p45 [label="p45"] 101 | p45->p46[label=" [0-9]"] 102 | p46 [label="p46"] 103 | p46 [shape=doublecircle] 104 | p48 [label="p48"] 105 | p48->p49[label=" [47]"] 106 | p49 [label="p49"] 107 | p49->p50[label=" [0-9]"] 108 | p50 [label="p50"] 109 | p50->p51[label=" [0-9]"] 110 | p51 [label="p51"] 111 | p51->p52[label=" [0-9]"] 112 | p52 [label="p52"] 113 | p52->p53[label=" [0-9]"] 114 | p53 [label="p53"] 115 | p53->p54[label=" [0-9]"] 116 | p54 [label="p54"] 117 | p54->p55[label=" [0-9]"] 118 | p55 [label="p55"] 119 | p55->p56[label=" [0-9]"] 120 | p56 [label="p56"] 121 | p56->p57[label=" [0-9]"] 122 | p57 [label="p57"] 123 | p57->p58[label=" [0-9]"] 124 | p58 [label="p58"] 125 | p58->p59[label=" [0-9]"] 126 | p59 [label="p59"] 127 | p59->p60[label=" [0-9]"] 128 | p60 [label="p60"] 129 | p60->p61[label=" [0-9]"] 130 | p61 [label="p61"] 131 | p61->p62[label=" [0-9]"] 132 | p62 [label="p62"] 133 | p62 [shape=doublecircle] 134 | p64 [label="p64"] 135 | p64->p65[label=" [09]"] 136 | p65 [label="p65"] 137 | 
p65->p66[label=" [0-9]"] 138 | p66 [label="p66"] 139 | p66->p67[label=" [0-9]"] 140 | p67 [label="p67"] 141 | p67->p68[label=" [01]"] 142 | p68 [label="p68"] 143 | p68->p69[label=" [0-9]"] 144 | p69 [label="p69"] 145 | p69->p70[label=" [012]"] 146 | p70 [label="p70"] 147 | p70->p71[label=" [0-9]"] 148 | p71 [label="p71"] 149 | p71 [shape=doublecircle] 150 | p73 [label="p73"] 151 | p73->p74[label=" [1-9]"] 152 | p74 [label="p74"] 153 | p74->p77[label=" 0"] 154 | p74->p79[label=" [12]"] 155 | p74->p81[label=" 3"] 156 | p75 [label="p75"] 157 | p75->p76[label=" [012]"] 158 | p76 [label="p76"] 159 | p76->p77[label=" 0"] 160 | p76->p79[label=" [12]"] 161 | p76->p81[label=" 3"] 162 | p77 [label="p77"] 163 | p77->p78[label=" [1-9]"] 164 | p78 [label="p78"] 165 | p78->p83[label=" 1"] 166 | p78->p85[label=" 2"] 167 | p79 [label="p79"] 168 | p79->p80[label=" [0-9]"] 169 | p80 [label="p80"] 170 | p80->p83[label=" 1"] 171 | p80->p85[label=" 2"] 172 | p81 [label="p81"] 173 | p81->p82[label=" [01]"] 174 | p82 [label="p82"] 175 | p82->p83[label=" 1"] 176 | p82->p85[label=" 2"] 177 | p83 [label="p83"] 178 | p83->p84[label=" 9"] 179 | p84 [label="p84"] 180 | p84->p87[label=" [0-9]"] 181 | p85 [label="p85"] 182 | p85->p86[label=" 0"] 183 | p86 [label="p86"] 184 | p86->p87[label=" [0-9]"] 185 | p87 [label="p87"] 186 | p87->p88[label=" [0-9]"] 187 | p88 [label="p88"] 188 | p88 [shape=doublecircle] 189 | } 190 | -------------------------------------------------------------------------------- /input.go: -------------------------------------------------------------------------------- 1 | // input.go -- regular expression input 2 | 3 | package rx 4 | 5 | import ( 6 | "bufio" 7 | "flag" 8 | "fmt" 9 | "io" 10 | "log" 11 | "os" 12 | "regexp" 13 | "strings" 14 | ) 15 | 16 | // Globals set as a side effect of loading input 17 | var ( 18 | InputRegExCount int // number of expressions successfully loaded 19 | InputErrorCount int // number of unacceptable expressions rejected 20 | ) 21 | 22 | // A 
RegExParsed is a single parsed regular expression. 23 | // If Tree is not nil then the expression was parsed as valid. 24 | // If Tree is nil and Err is not, Err is a parsing error. 25 | // If Tree is nil and Err is nil, the struct can represent a comment. 26 | // Meta includes all immediately preceding metadata lines. 27 | type RegExParsed struct { 28 | Expr string // input string 29 | Tree Node // parse tree 30 | Err error // parse error 31 | Meta map[string]string // metadata list 32 | } 33 | 34 | // RegExParsed.IsExpr returns true if the struct represents an expression, 35 | // not a comment, whether or not it is valid. 36 | func (rxp *RegExParsed) IsExpr() bool { 37 | return rxp.Tree != nil || rxp.Err != nil 38 | } 39 | 40 | // RegExParsed.ShowMeta prints the expression's metadata intelligently. 41 | func (rxp *RegExParsed) ShowMeta(f io.Writer, indent string) { 42 | if rxp.Meta != nil { 43 | for _, k := range KeyList(rxp.Meta) { 44 | for _, s := range strings.Split(rxp.Meta[k], "\n") { 45 | fmt.Fprintf(f, "%s#%s: %s\n", indent, k, s) 46 | } 47 | } 48 | } 49 | } 50 | 51 | // LoadExpressions reads a file and parses the expressions found. 52 | // A filename of "" or "-" reads from standard input. Any file error is fatal. 53 | // See LoadFromScanner for details. 54 | func LoadExpressions(fname string, f func(*RegExParsed)) []*RegExParsed { 55 | return LoadFromScanner(MkScanner(fname), f) 56 | } 57 | 58 | // LoadFromScanner reads and parses expressions from a bufio.Scanner. 59 | // 60 | // Empty lines and lines beginning with '#' are treated as comments. 61 | // If non-nil, the function f is called for each non-metadata line read. 62 | // The returned array contains only successfully parsed expressions. 63 | // 64 | // Metadata from comments matching the pattern "^#\w+:" is accumulated and 65 | // returned with the next non-metadata line (whether comment or expr). 66 | // 67 | // The globals InputRegExCount and InputExprErrors are set by this function. 
68 | func LoadFromScanner(efile *bufio.Scanner, f func(*RegExParsed)) []*RegExParsed { 69 | mpat := regexp.MustCompile(`^#(\w+): *(.*)`) 70 | elist := make([]*RegExParsed, 0) 71 | meta := make(map[string]string) 72 | InputRegExCount = 0 73 | InputErrorCount = 0 74 | for efile.Scan() { 75 | line := efile.Text() 76 | e := &RegExParsed{Expr: line} 77 | if IsComment(line) { 78 | r := mpat.FindStringSubmatch(line) 79 | if r != nil { // if recognized metadata format 80 | addMeta(meta, r[1], r[2]) // accumulate metadata 81 | continue // and don't call 82 | } else { 83 | e.Meta = meta // return accumulation 84 | } 85 | } else { 86 | e.Tree, e.Err = Parse(line) // parse input 87 | if e.Tree != nil { // if okay 88 | elist = append(elist, e) // save parse tree 89 | InputRegExCount++ // count success 90 | } else { 91 | InputErrorCount++ // else count error 92 | } 93 | e.Meta = meta // accumulated metadata 94 | } 95 | if f != nil { 96 | f(e) 97 | } 98 | meta = make(map[string]string) // reset meta collection 99 | } 100 | CkErr(efile.Err()) 101 | return elist 102 | } 103 | 104 | // addMeta grows the metadata, concatenating with \n if the key is a duplicate. 105 | func addMeta(meta map[string]string, key string, val string) { 106 | if meta[key] == "" { 107 | meta[key] = val 108 | } else { 109 | meta[key] = meta[key] + "\n" + val 110 | } 111 | } 112 | 113 | // OneInputFile returns the name of the input file from the command line. 114 | // The program is aborted with an error message if multiple arguments appear. 115 | // If no arguments are present, "-" is returned to represent standard input. 116 | func OneInputFile() string { 117 | flag.Parse() // in case not already called 118 | args := flag.Args() 119 | switch len(args) { 120 | case 0: 121 | return "-" 122 | case 1: 123 | return args[0] 124 | default: 125 | log.Fatal("too many arguments") 126 | } 127 | return "" //NOTREACHED 128 | } 129 | 130 | // MkScanner creates a Scanner for reading from a file, aborting on error. 
131 | // A filename of "-" reads standard input. 132 | func MkScanner(fname string) *bufio.Scanner { // return scanner for file 133 | if fname == "" || fname == "-" { 134 | return bufio.NewScanner(os.Stdin) 135 | } else { 136 | f, err := os.Open(fname) 137 | CkErr(err) 138 | return bufio.NewScanner(f) 139 | } 140 | } 141 | 142 | // IsComment returns true if a line begins with '#' or is empty. 143 | func IsComment(s string) bool { 144 | return len(s) == 0 || s[0] == '#' 145 | } 146 | -------------------------------------------------------------------------------- /test/29-pcre.std: -------------------------------------------------------------------------------- 1 | Options: -R -T -g -h -i -l -v -I 0 2 | 3 | # test handling of PCRE extensions 4 | 5 | expr 0: a+b+(?#comment)c+d+ 6 | tree: (((a+b+)c+)d+) 7 | augmnt: ((((a+b+)c+)d+)#) 8 | length: 4 to * 9 | cplxty: 7 10 | -------------------- Examples -------------------- 11 | ex(0): abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd 12 | ex(1): abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd 13 | ex(2): abbcd aabbccd aabbcdd abbcdd aabccd abbcd abccdd abccdd abcdd 14 | ex(3): aaabbbcccd abcccdd aaabbcdd abcdd abbbcccd aabccddd abccdd 15 | ex(5): abcccccddddd aaabbccdddd abbccddddd aaabccccddddd aaabccdddd 16 | ex(8): aaaaaabccdddddddd aaaaabbbbbbccccccccdddd aaabbbbbbcccccdddddddd 17 | ---------------- Examples from DFA --------------- 18 | s1: abcd 19 | 20 | expr 1: a+b+(?#3IИЯ3)c+d+ 21 | tree: (((a+b+)c+)d+) 22 | augmnt: ((((a+b+)c+)d+)#) 23 | length: 4 to * 24 | cplxty: 7 25 | -------------------- Examples -------------------- 26 | ex(0): abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd 27 | ex(1): abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd abcd 28 | ex(2): abbcd aabbccd aabbcdd abbcdd aabccd abbcd abccdd abccdd abcdd 29 | ex(3): aaabbbcccd abcccdd aaabbcdd abcdd abbbcccd aabccddd abccdd 30 | ex(5): abcccccddddd aaabbccdddd abbccddddd aaabccccddddd aaabccdddd 31 | ex(8): 
aaaaaabccdddddddd aaaaabbbbbbccccccccdddd aaabbbbbbcccccdddddddd 32 | ---------------- Examples from DFA --------------- 33 | s1: abcd 34 | 35 | expr 2: (?#)o*(?#) 36 | tree: o* 37 | augmnt: (o*#) 38 | length: 0 to * 39 | cplxty: 1 40 | -------------------- Examples -------------------- 41 | ex(0): 42 | ex(1): o o o o o o o o o o o o o 43 | ex(2): o oo o oo oo o o oo oo oo oo oo 44 | ex(3): oo oo o o ooo oo ooo o oo o o o ooo ooo oo 45 | ex(5): ooo oo o ooooo oo ooo ooooo ooooo ooo oooo oooo ooooo o 46 | ex(8): oo oooooooo ooo o oo ooooooo oooooo ooooooo ooooo oooo oooo 47 | ---------------- Examples from DFA --------------- 48 | s0: 49 | 50 | expr 3: a*(?:bc)*d* 51 | tree: ((a*(bc)*)d*) 52 | augmnt: (((a*(bc)*)d*)#) 53 | length: 0 to * 54 | cplxty: 7 55 | -------------------- Examples -------------------- 56 | ex(0): 57 | ex(1): d abc bc bc a a abc bcd a a bc bc a ad ad ad abc 58 | ex(2): d abcbc abcbc dd dd d add d bcbcd d aabcdd add d abcbcdd 59 | ex(3): bcbc bcbcbcdd aabcd abcbc aaabcbcd aaa aabcbcbcddd aaabc bcbcbc 60 | ex(5): aaaaabcbcdddd aabcddd aaabcd aaaabcdd abcbcbcbcdd abcbcbcdd 61 | ex(8): aaaaaaabcbcbcbcbc aabcbcbcbcbcdddd aabcbcbcbcbcbcbcbcddd 62 | ---------------- Examples from DFA --------------- 63 | s0: 64 | s2: d 65 | s3: bc 66 | 67 | expr 4: a*(?'center'bc)*d* 68 | tree: ((a*(bc)*)d*) 69 | augmnt: (((a*(bc)*)d*)#) 70 | length: 0 to * 71 | cplxty: 7 72 | -------------------- Examples -------------------- 73 | ex(0): 74 | ex(1): d abc bc bc a a abc bcd a a bc bc a ad ad ad abc 75 | ex(2): d abcbc abcbc dd dd d add d bcbcd d aabcdd add d abcbcdd 76 | ex(3): bcbc bcbcbcdd aabcd abcbc aaabcbcd aaa aabcbcbcddd aaabc bcbcbc 77 | ex(5): aaaaabcbcdddd aabcddd aaabcd aaaabcdd abcbcbcbcdd abcbcbcdd 78 | ex(8): aaaaaaabcbcbcbcbc aabcbcbcbcbcdddd aabcbcbcbcbcbcbcbcddd 79 | ---------------- Examples from DFA --------------- 80 | s0: 81 | s2: d 82 | s3: bc 83 | 84 | expr 5: a*(?bc)*d* 85 | tree: ((a*(bc)*)d*) 86 | augmnt: (((a*(bc)*)d*)#) 87 | length: 0 to * 
88 | cplxty: 7 89 | -------------------- Examples -------------------- 90 | ex(0): 91 | ex(1): d abc bc bc a a abc bcd a a bc bc a ad ad ad abc 92 | ex(2): d abcbc abcbc dd dd d add d bcbcd d aabcdd add d abcbcdd 93 | ex(3): bcbc bcbcbcdd aabcd abcbc aaabcbcd aaa aabcbcbcddd aaabc bcbcbc 94 | ex(5): aaaaabcbcdddd aabcddd aaabcd aaaabcdd abcbcbcbcdd abcbcbcdd 95 | ex(8): aaaaaaabcbcbcbcbc aabcbcbcbcbcdddd aabcbcbcbcbcbcbcbcddd 96 | ---------------- Examples from DFA --------------- 97 | s0: 98 | s2: d 99 | s3: bc 100 | 101 | expr 6: a*(?Pbc)*d* 102 | tree: ((a*(bc)*)d*) 103 | augmnt: (((a*(bc)*)d*)#) 104 | length: 0 to * 105 | cplxty: 7 106 | -------------------- Examples -------------------- 107 | ex(0): 108 | ex(1): d abc bc bc a a abc bcd a a bc bc a ad ad ad abc 109 | ex(2): d abcbc abcbc dd dd d add d bcbcd d aabcdd add d abcbcdd 110 | ex(3): bcbc bcbcbcdd aabcd abcbc aaabcbcd aaa aabcbcbcddd aaabc bcbcbc 111 | ex(5): aaaaabcbcdddd aabcddd aaabcd aaaabcdd abcbcbcbcdd abcbcbcdd 112 | ex(8): aaaaaaabcbcbcbcbc aabcbcbcbcbcdddd aabcbcbcbcbcbcbcbcddd 113 | ---------------- Examples from DFA --------------- 114 | s0: 115 | s2: d 116 | s3: bc 117 | 7 expression(s) loaded 118 | -------------------------------------------------------------------------------- /test/52-libstr.std: -------------------------------------------------------------------------------- 1 | Options: -R -T -g -h -i -l -v -I 0 2 | 3 | # RegExLib.com -- browse -- Strings 4 | 5 | # 6 | 7 | # 535. Social Security numbers within allocated ranges 8 | 9 | ERROR: (?!000)([0-6]\d{2}|7([0-6]\d|7[012]))([ -]?)(?!00)\d\d\3(?!0000)\d{4} 10 | rx: '(?...' unimplemented: in "(?!000)([0-6]\d{2}|7([0-6]\d|7[012]))([ -]?)(?!00)\d\d\3(?!0000)\d{4}" 11 | 12 | # 672. GUID values 13 | 14 | expr 0: [{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|}]? 15 | tree: ((((([({|]?[0-9A-Fa-f]{8})-?)([0-9A-Fa-f]{4}-?){3})[0-9A-Fa-f]{12})[)|}]?) 
16 | augmnt: (((((([({|]?((((((([0-9A-Fa-f][0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f]))-?)(((((([0-9A-Fa-f][0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])-?)(((([0-9A-Fa-f][0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])-?))(((([0-9A-Fa-f][0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])-?)))((((((((((([0-9A-Fa-f][0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f])[0-9A-Fa-f]))[)|}]?)#) 17 | length: 32 to 38 18 | cplxty: 46 19 | -------------------- Examples -------------------- 20 | ex(0): 10AB18779cD8aafFADf5eDA1FF3Cd379 AbfD36EfccCd4B42AF9B9bcd5ADfeC19 21 | ex(1): 28DA3355D9DF1DDd-fDa6-26D7e8De06da| 22 | ex(2): 96fCd228aCe40a41-bfabbD1Ca60A5D9E da3B97BFdBB3-a0ea-dCaf-BeC6cd74EFa7 23 | ex(3): FEb258ec-abA7-F0A41fa9-4F1f3aEEbee3) 24 | ex(5): 3a8E2DaF4CCd-af8e-8c589a1B5db90928} 25 | ex(8): {03Ba3DDf-7e9FFD94-f0cD-D2a9B4dEa797} 26 | ---------------- Examples from DFA --------------- 27 | s1: F3FfBa4dbb9e6bdEFB2B3C9aCDC20aCb 28 | s34: F3FfBa4dbb9e6bdEFB2B3C9aCDC20aCb) 29 | 30 | # 580. passwords length 8-20 with at least one digit, cannot start with digit 31 | 32 | ERROR: (?=[^\d_].*?\d)\w(\w|[!@#$%]){7,20} 33 | rx: '(?...' unimplemented: in "(?=[^\d_].*?\d)\w(\w|[!@#$%]){7,20}" 34 | 35 | # 656. 
spam trap for pseudo-spellings of "Viagra" 36 | 37 | expr 1: [v,V,(\\/)](\W|)[i,I,1,l,L](\W|)[a,A,@,(\/\\)](\W|)[g,G](\W|)[r,R](\W|)[a,A,@,(\/\\))] 38 | tree: (((((((((([(),/V\v]([\x01-/:-@[-^`{-\x7f]|()))[,1ILil])([\x01-/:-@[-^`{-\x7f]|()))[(),/@A\a])([\x01-/:-@[-^`{-\x7f]|()))[,Gg])([\x01-/:-@[-^`{-\x7f]|()))[,Rr])([\x01-/:-@[-^`{-\x7f]|()))[(),/@A\a]) 39 | augmnt: ((((((((((([(),/V\v]([\x01-/:-@[-^`{-\x7f]|()))[,1ILil])([\x01-/:-@[-^`{-\x7f]|()))[(),/@A\a])([\x01-/:-@[-^`{-\x7f]|()))[,Gg])([\x01-/:-@[-^`{-\x7f]|()))[,Rr])([\x01-/:-@[-^`{-\x7f]|()))[(),/@A\a])#) 40 | length: 6 to 11 41 | cplxty: 31 42 | -------------------- Examples -------------------- 43 | ex(0): (i;(\\gA{gR~, V1%\\?G<,>@ v]i,,,,A )L/,]gr|, 44 | ex(1): ),&),,\\a \\)lA!g,.( \\}1^,gR@ ,%i>)%,,@\\ Vi(#GR\"( ,),(@*,,( 45 | ex(2): V|,)Gr;A V;i|/,r!, (,@G;R\\ V1,G,<) Vi/Gr:/ /i(,,+/ \\,]/,r;, 46 | ex(3): ){i}\\{,\",( )L$/',{,A )I|(*G-r( V=1&\\!G.,A ,L\\@g,>a //l\\/G$,:) 47 | ex(5): v,#).GR$/ VI,gr@ )L/&g>R\\ v`i#,g]R[/ VL=\\^GR&) v.,/],,/a 48 | ex(8): /I@\\,,/ \\,!A\"g\",A )),!(G+r) ,;I'A#,^,{( v%I),R/ /1 ),RA 49 | ---------------- Examples from DFA --------------- 50 | s1: /,(,,/ 51 | s7: /,(,,a 52 | s8: /,(,,, 53 | s9: /,(,R/ 54 | s20: /,(G,, 55 | s10: /,,,,\\ 56 | s21: /,,,,, 57 | s26: /,A,,, 58 | 59 | # 49. 
major credit cards (AmEx len 15, others len 16) 60 | 61 | expr 2: ((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13} 62 | tree: (((((((((4[0-9]{3})|((5[1-5])[0-9]{2})|(((60)1)1))-?)[0-9]{4})-?)[0-9]{4})-?)[0-9]{4})|((3[,47])[0-9]{13})) 63 | augmnt: ((((((((((4(([0-9][0-9])[0-9]))|((5[1-5])([0-9][0-9]))|(((60)1)1))-?)((([0-9][0-9])[0-9])[0-9]))-?)((([0-9][0-9])[0-9])[0-9]))-?)((([0-9][0-9])[0-9])[0-9]))|((3[,47])(((((((((((([0-9][0-9])[0-9])[0-9])[0-9])[0-9])[0-9])[0-9])[0-9])[0-9])[0-9])[0-9])[0-9])))#) 64 | length: 15 to 19 65 | cplxty: 58 66 | -------------------- Examples -------------------- 67 | ex(0): 4656778887982610 4860162012785483 5574689780270842 376407784390913 68 | ex(1): 60119001-9109-4226 4994378873577697 45564979-0347-0078 69 | ex(2): 3,7998142091068 4380209886521853 4320-05068270-9080 70 | ex(3): 49903200-6318-3817 54045605-6006-0116 41196738-09510219 71 | ex(5): 6011-1905-07331747 4999663679211795 53761941-1287-4677 72 | ex(8): 42496073-35926256 3,6800994848273 43357214-14607342 73 | ---------------- Examples from DFA --------------- 74 | s1: 349332644696936 75 | 76 | # 595. 
UK vehicle registration number from September 2001 77 | 78 | expr 3: ([A-HK-PRSVWY][A-HJ-PR-Y])\s?([0][2-9]|[1-9][0-9])\s?[A-HJ-PR-Z]{3} 79 | tree: ((((([A-HK-PRSVWY][A-HJ-PR-Y])[\t-\r ]?)((0[2-9])|([1-9][0-9])))[\t-\r ]?)[A-HJ-PR-Z]{3}) 80 | augmnt: (((((([A-HK-PRSVWY][A-HJ-PR-Y])[\t-\r ]?)((0[2-9])|([1-9][0-9])))[\t-\r ]?)(([A-HJ-PR-Z][A-HJ-PR-Z])[A-HJ-PR-Z]))#) 81 | length: 7 to 9 82 | cplxty: 19 83 | -------------------- Examples -------------------- 84 | ex(0): YO25HZK DS38EAN AA06NNC CU87EVW DS07GZM BD28UOH AN05ENC GE07MGP 85 | ex(1): SO14SXK NW06EEM NE05 TSF AY06 XLU FD 09 XOX BO 09 OAF WE 08 AOP 86 | ex(2): NH06AEG MJ 59DSH EB06 NWK MR32KXA PS 06 ZDA OG07TDJ AN04 AXF 87 | ex(3): SF 03 SBE FF 07ZCZ GH38PXU PK 06PBM WS 39EFF RG21AMN YJ72SKZ 88 | ex(5): RD80 LKZ DS14 KSV YJ07GTR WL59 TPU RV34 YXO NR 09 ZFL KL08NUO 89 | ex(8): FW 05 UUR LS 02WLH EX07 YGN WU 43TSP SC 05 WHZ SS 75 LYX EX 02RWO 90 | ---------------- Examples from DFA --------------- 91 | s1: BE06UWO 92 | 4 expression(s) loaded 93 | (2 expression(s) rejected) 94 | -------------------------------------------------------------------------------- /brackets.go: -------------------------------------------------------------------------------- 1 | // brackets.go -- parsing of bracket expression 2 | 3 | package rx 4 | 5 | import ( 6 | "fmt" 7 | "strconv" 8 | "unicode" 9 | ) 10 | 11 | // bxparse parses a string as a bracket expression, returning the 12 | // computed set of characters and the remaining unprocessed part of s. 13 | // It assumes the introductory '[' has already been stripped from s. 14 | // 15 | // If an error is found, bxparse returns (nil, errmsg). 
16 | // 17 | // bxparse implements: [abc] [^abc] [a-c] [\x] 18 | func bxparse(s []rune) (*BitSet, []rune) { 19 | 20 | result := &BitSet{} 21 | compl := false 22 | 23 | // check for initial '^' 24 | if len(s) > 0 && s[0] == '^' { 25 | compl = true 26 | s = s[1:] 27 | } 28 | cprev := 0 // no previous character 29 | // process body of expression 30 | for len(s) > 0 { 31 | ch := int(s[0]) 32 | s = s[1:] 33 | switch ch { 34 | case '[': 35 | // ordinary, but diagnose [:class:] 36 | if len(s) > 2 && s[0] == ':' && 37 | unicode.IsLetter(s[1]) { 38 | return nil, []rune("[:class:] unimplemented") 39 | } else { 40 | result.Set(ch) 41 | } 42 | case '-': 43 | // range of chars 44 | if cprev != 0 && len(s) > 0 && s[0] != ']' { 45 | ch = int(s[0]) 46 | s = s[1:] 47 | if ch == '\\' { 48 | var eset *BitSet 49 | eset, s = bescape(s) 50 | if eset == nil { 51 | return nil, s 52 | } 53 | ch = eset.LowBit() 54 | } 55 | if ch < cprev { 56 | return nil, []rune("invalid range") 57 | } 58 | for j := cprev; j <= ch; j++ { 59 | result.Set(j) 60 | } 61 | } else { 62 | result.Set(ch) 63 | } 64 | case ']': 65 | // set is complete unless this is first char 66 | if !result.IsEmpty() { 67 | if compl { 68 | result = result.CharCompl() 69 | } 70 | return result, s 71 | } else { 72 | // initial ']' is ordinary 73 | result.Set(ch) 74 | } 75 | case '\\': 76 | if len(s) > 0 { 77 | var eset *BitSet 78 | eset, s = bescape(s) 79 | if eset == nil { 80 | return nil, s 81 | } 82 | result.OrWith(eset) 83 | ch = eset.HighBit() 84 | } // else: error caught on next iteration 85 | default: 86 | // an ordinary char; add to set 87 | result.Set(ch) 88 | } 89 | cprev = ch 90 | } 91 | return nil, []rune("unclosed '['") 92 | } 93 | 94 | // bescape interprets a backslash sequence in the context of a bracket 95 | // expression from which the initial \ has already been consumed. 96 | // In this context \b is a backspace. bescape returns the computed 97 | // charset and the remaining unescaped portion of the string. 
98 | // If an error is found, bescape returns (nil, errmsg). 99 | // 100 | // bescape implements: 101 | // \a \b \e \f \n \r \t \v \046 \xF7 \u03A8 102 | // \d \s \w \D \S \W 103 | func bescape(s []rune) (*BitSet, []rune) { 104 | if len(s) == 0 { 105 | return nil, []rune("'\\' at end") 106 | } 107 | c := int(s[0]) 108 | s = s[1:] 109 | switch c { 110 | case '0', '1', '2', '3', '4', '5', '6', '7': 111 | v := c - '0' // first digit 112 | if o := octal(s); o >= 0 { // optional 2nd digit 113 | v = 8*v + o 114 | s = s[1:] 115 | } 116 | if o := octal(s); o >= 0 { // optional 3nd digit 117 | v = 8*v + o 118 | s = s[1:] 119 | } 120 | return (&BitSet{}).Set(v), s 121 | case 'a': 122 | return (&BitSet{}).Set('\a'), s 123 | case 'b': 124 | return (&BitSet{}).Set('\b'), s 125 | case 'c': 126 | return nil, []rune("'\\cx' unimplemented") 127 | case 'd': 128 | return DigitSet, s 129 | case 'e': 130 | return (&BitSet{}).Set('\033'), s 131 | case 'f': 132 | return (&BitSet{}).Set('\f'), s 133 | case 'n': 134 | return (&BitSet{}).Set('\n'), s 135 | case 'p': 136 | return nil, []rune("'\\px' unimplemented") 137 | case 'r': 138 | return (&BitSet{}).Set('\r'), s 139 | case 's': 140 | return SpaceSet, s 141 | case 't': 142 | return (&BitSet{}).Set('\t'), s 143 | case 'u': 144 | v := hexl(s, 4) 145 | if v >= 0 { 146 | return (&BitSet{}).Set(v), s[4:] 147 | } else { 148 | return nil, []rune("malformed '\\uhhhh'") 149 | } 150 | case 'v': 151 | return (&BitSet{}).Set('\v'), s 152 | case 'w': 153 | return WordSet, s 154 | case 'x': 155 | v := hexl(s, 2) 156 | if v >= 0 { 157 | return (&BitSet{}).Set(v), s[2:] 158 | } else { 159 | return nil, []rune("malformed '\\xhh'") 160 | } 161 | case 'D': 162 | return NonDigit, s 163 | case 'P': 164 | return nil, []rune("'\\Px' unimplemented") 165 | case 'S': 166 | return NonSpace, s 167 | case 'W': 168 | return NonWord, s 169 | 170 | default: 171 | if unicode.IsLetter(rune(c)) { 172 | return nil, []rune(fmt.Sprintf("'\\%c' unrecognized", c)) 173 | } 
else { 174 | return (&BitSet{}).Set(c), s 175 | } 176 | } 177 | } 178 | 179 | // octal returns the value of the first digit of s, or -1 if not octal digit. 180 | func octal(s []rune) int { 181 | if len(s) > 0 && s[0] >= '0' && s[0] <= '7' { 182 | return int(s[0]) - '0' 183 | } else { 184 | return -1 185 | } 186 | } 187 | 188 | // hexl returns the value of the first n hex digits of s, or -1 if bad. 189 | func hexl(s []rune, n int) int { 190 | if len(s) < n { 191 | return -1 192 | } 193 | v, err := strconv.ParseInt(string(s[0:n]), 16, 64) 194 | if err == nil { 195 | return int(v) 196 | } else { 197 | return -1 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /repnode.go: -------------------------------------------------------------------------------- 1 | // repnode.go -- parse tree node for replication of a subtree 2 | 3 | package rx 4 | 5 | import ( 6 | "bytes" 7 | "encoding/gob" 8 | "fmt" 9 | "math/rand" 10 | ) 11 | 12 | // A ReplNode represents controlled (or not) replication: e?, e+, e*, e{m,n}. 13 | // It is a generalization of the parse tree "*" node of textbooks. 14 | // In an augmented parse tree, m and n cannot exceed 1. 15 | type ReplNode struct { 16 | Min int // minimum number of occurrences (0 or 1) 17 | Max int // maximum (a positive limit, or -1 meaning infinity) 18 | Child Node // subpattern being replicated 19 | NodeData 20 | } 21 | 22 | // ReplNode.Children returns a list consisting of the one child. 23 | func (d *ReplNode) Children() []Node { 24 | return []Node{d.Child} 25 | } 26 | 27 | // ReplNode.MinLen returns the minimum length after replication. 28 | func (d *ReplNode) MinLen() int { 29 | return d.Min * d.Child.MinLen() 30 | } 31 | 32 | // ReplNode.MaxLen returns the maximum length after replication. 33 | // A value of -1 means that the length is unbounded. 
34 | func (d *ReplNode) MaxLen() int { 35 | n := d.Child.MaxLen() 36 | if n == 0 || d.Max == 0 { // if only matches empty string 37 | return 0 38 | } else if n < 0 || d.Max < 0 { // if unbounded 39 | return -1 40 | } else { 41 | return d.Max * n // calculable maximum length 42 | } 43 | } 44 | 45 | // ReplNode.SetNFL sets the Nullable, FirstPos, LastPos fields. 46 | func (d *ReplNode) SetNFL() { 47 | d.Nullable = d.Min == 0 || d.Child.nullable() 48 | d.FirstPos = d.Child.firstPos() 49 | d.LastPos = d.Child.lastPos() 50 | } 51 | 52 | // ReplNode.SetFollow registers FollowPos nodes. 53 | func (d *ReplNode) SetFollow(pmap []*MatchNode) { 54 | if d.Max != 1 { // if just 1, self can't follow 55 | for _, i := range d.LastPos.Members() { 56 | for _, f := range d.FirstPos.Members() { 57 | pmap[i].followPos().Set(f) 58 | } 59 | } 60 | } 61 | } 62 | 63 | // ReplNode.Example produces an example with maximum replication n. 64 | func (d *ReplNode) Example(s []byte, n int) []byte { 65 | m := n // save original n for propagation to child 66 | // limit n to maximum allowed by the regexp 67 | if n > d.Max && d.Max >= 0 { 68 | n = d.Max 69 | } 70 | // choose desired replication count randomly within legal range 71 | if n > d.Min { 72 | n = d.Min + rand.Intn(n-d.Min+1) 73 | } else { 74 | n = d.Min 75 | } 76 | // and finally replicate 77 | for i := 0; i < n; i++ { 78 | s = d.Child.Example(s, m) 79 | } 80 | return s 81 | } 82 | 83 | // ReplNode.String produces a string representation using a postfix 84 | // replication operator: e* or e+ or e? or e{n} or e{n,} or e{m,n}. 
85 | func (d *ReplNode) String() string { 86 | if d.Max < 0 { 87 | if d.Min == 0 { 88 | return fmt.Sprintf("%s*", d.Child) 89 | } else if d.Min == 1 { 90 | return fmt.Sprintf("%s+", d.Child) 91 | } else { 92 | return fmt.Sprintf("%s{%d,}", d.Child, d.Min) 93 | } 94 | } else if d.Max == d.Min { 95 | return fmt.Sprintf("%s{%d}", d.Child, d.Min) 96 | } else if d.Max == 1 && d.Min == 0 { 97 | return fmt.Sprintf("%s?", d.Child) 98 | } else { 99 | return fmt.Sprintf("%s{%d,%d}", d.Child, d.Min, d.Max) 100 | } 101 | } 102 | 103 | // replfix returns a replacement subtree if counting is needed, e.g. a{3}. 104 | // The original subtree is returned if it is okay. 105 | func replfix(d Node) Node { 106 | r, ok := d.(*ReplNode) 107 | if !ok { 108 | return d // nothing to do, not a replication node 109 | } 110 | if r.Min < 2 && r.Max < 2 { 111 | return r // nothing to do, return as is 112 | } 113 | 114 | // We need to split this node into a concatenation of two or more 115 | // deep copies, each with a modified ReplNode at the top. 116 | // Do this by bundling the subtree into a gob and then decoding 117 | // as many times as needed. 118 | gob.Register(&MatchNode{}) // register concrete types 119 | gob.Register(&ConcatNode{}) 120 | gob.Register(&AltNode{}) 121 | gob.Register(&ReplNode{}) 122 | wbuf := new(bytes.Buffer) // writable output buffer 123 | enc := gob.NewEncoder(wbuf) // create encoder 124 | CkErr(enc.Encode(&r.Child)) // encode the tree 125 | rbuf := bytes.NewReader(wbuf.Bytes()) // make resettable input buffer 126 | 127 | // Final result will be a concatenation of a left side of duplicate 128 | // nodes followed by a final node handling leftovers. 
129 | var lside Node = nil // l side = empty 130 | rside := &ReplNode{r.Min, r.Max, degob(rbuf), nildata} // r side = copy 131 | for rside.Min > 1 || (rside.Min > 0 && rside.Min < rside.Max) { 132 | lside = Concatenate(lside, degob(rbuf)) 133 | rside.Min-- 134 | if rside.Max > 0 { 135 | rside.Max-- 136 | } 137 | } 138 | for rside.Max > 1 { 139 | optr := ReplNode{0, 1, degob(rbuf), nildata} 140 | lside = Concatenate(lside, &optr) 141 | rside.Max-- 142 | } 143 | if rside.Min == 1 && rside.Max == 1 { // if max==min originally 144 | return Concatenate(lside, rside.Child) // simpler case 145 | } else { 146 | return Concatenate(lside, rside) // e.g. regexp{5,*} 147 | } 148 | } 149 | 150 | // degob converts a gob into a new copy of a subtree. 151 | func degob(buf *bytes.Reader) Node { 152 | var tree Node 153 | buf.Seek(0, 0) 154 | dec := gob.NewDecoder(buf) 155 | CkErr(dec.Decode(&tree)) 156 | return tree 157 | } 158 | -------------------------------------------------------------------------------- /test/56-libclk.std: -------------------------------------------------------------------------------- 1 | Options: -R -T -g -h -i -l -v -I 0 2 | 3 | # RegExLib.com -- browse -- Dates and Times 4 | 5 | # 6 | 7 | # 981. 24-hour time hh:mm[:ss] 8 | 9 | expr 0: (([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))? 10 | tree: ((((([01]?[0-9])|(2[0-3])):)([0-5]?[0-9]))(:([0-5]?[0-9]))?) 
11 | augmnt: (((((([01]?[0-9])|(2[0-3])):)([0-5]?[0-9]))(:([0-5]?[0-9]))?)#) 12 | length: 3 to 8 13 | cplxty: 19 14 | -------------------- Examples -------------------- 15 | ex(0): 4:6 23:8 8:9 2:1 0:5 6:1 2:1 7:5 8:0 9:7 6:9 20:7 8:2 21:6 16 | ex(1): 3:9:4 21:3:2 21:2 0:8:1 21:9:2 8:39 18:7:07 10:5:4 23:23 20:7 17 | ex(2): 03:8:6 7:07:1 0:46 03:2 02:5 05:8 20:05 8:47:8 7:38:47 20:8 18 | ex(3): 19:3 5:3:23 14:00 00:26 9:1:1 23:6:9 23:20 21:32 2:5:32 04:3 19 | ex(5): 21:6:26 08:21 21:7:00 22:01 16:7:56 6:7 00:41:9 0:10:04 5:59 20 | ex(8): 2:42 21:01 21:0 22:9:3 20:13 20:1:02 6:8:3 16:9 22:06 20:56:7 21 | ---------------- Examples from DFA --------------- 22 | s1: 0:1 23 | s4: 0:7 24 | s5: 0:1:0 25 | s8: 0:1:8 26 | 27 | # 193. Date in MySQL DB format 28 | 29 | expr 1: ([0-9]{4})-([0-9]{1,2})-([0-9]{1,2}) 30 | tree: (((([0-9]{4}-)[0-9]{1,2})-)[0-9]{1,2}) 31 | augmnt: (((((((([0-9][0-9])[0-9])[0-9])-)([0-9][0-9]?))-)([0-9][0-9]?))#) 32 | length: 8 to 10 33 | cplxty: 14 34 | -------------------- Examples -------------------- 35 | ex(0): 4436-6-7 8887-8-6 1004-8-0 1620-2-8 5483-0-4 7468-7-0 2708-2-0 36 | ex(1): 4390-16-7 3943-4-1 3262-6-8 8900-19-09 6422-83-9 0378-83-7 37 | ex(2): 0497-10-41 4007-02-0 4641-36-5 4628-92-98 1420-1-8 5238-2-9 38 | ex(3): 8832-90-0 6827-79-8 0743-89-65 3634-86-4 7465-9-3 0053-38-88 39 | ex(5): 0592-0-90 1162-51-8 7389-05-6 2199-72-8 2845-37-66 4423-10-50 40 | ex(8): 6999-6-6 7921-19-25 2760-9-9 1287-6-71 7960-6-7 2760-7-1 41 | ---------------- Examples from DFA --------------- 42 | s1: 2462-6-9 43 | s9: 2462-6-98 44 | 45 | # 235. 
"Improved date validation" (M*/D*/Y*) 1920-2019 46 | 47 | expr 2: ((0?[13578]|10|12)(-|\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))) 48 | tree: (((((((0?[13578])|(10)|(12))(-|/))([1-9]|(0[1-9])|([12][0-9]?)|(3[01]?)))(-|/))((((19)[2-9])[0-9])|(((20)[01])[0-9])|([0189][0-9])))|((((((0?[2469])|(11))(-|/))([1-9]|(0[1-9])|([12][0-9]?)|(30?)))(-|/))((((19)[2-9])[0-9])|(((20)[01])[0-9])|([0189][0-9])))) 49 | augmnt: ((((((((0?[13578])|(10)|(12))(-|/))([1-9]|(0[1-9])|([12][0-9]?)|(3[01]?)))(-|/))((((19)[2-9])[0-9])|(((20)[01])[0-9])|([0189][0-9])))|((((((0?[2469])|(11))(-|/))([1-9]|(0[1-9])|([12][0-9]?)|(30?)))(-|/))((((19)[2-9])[0-9])|(((20)[01])[0-9])|([0189][0-9]))))#) 50 | length: 6 to 10 51 | cplxty: 96 52 | -------------------- Examples -------------------- 53 | ex(0): 3/3-1962 12-9-1992 6-5/2009 2-1-2006 1-3-2007 12/1-2002 12-2-2011 54 | ex(1): 07-08-1962 03-05/1994 10-2-80 11/1-2019 06-3-1980 12-5/18 55 | ex(2): 7/7-1983 11/3-2004 10/3/2016 11-9/2004 12/7-99 07/30-2010 56 | ex(3): 10/3-2003 10/07/1987 5/9/12 06-05-1999 12/3-17 2-2-2010 57 | ex(5): 10/3-99 10-04-2012 09-2/1999 01/07/1979 11-5-84 02-07-96 10-30-90 58 | ex(8): 12/06-1959 11-09/1934 11/3-2012 06-02/2017 11/07-1979 10/1-2008 59 | ---------------- Examples from DFA --------------- 60 | s1: 1-2-82 61 | s9: 1-2-19 62 | 63 | # 406. 
Full names of the months 64 | 65 | expr 3: (?:J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber) 66 | tree: ((J((((((an)u)a)r)y)|(u((ne)|(ly)))))|(((((((Fe)b)r)u)a)r)y)|((Ma)(((rc)h)|y))|(A((((pr)i)l)|((((ug)u)s)t)))|((((((((((Se)p)t)|((No)v)|((De)c))e)m)|(((Oc)t)o))b)e)r)) 67 | augmnt: (((J((((((an)u)a)r)y)|(u((ne)|(ly)))))|(((((((Fe)b)r)u)a)r)y)|((Ma)(((rc)h)|y))|(A((((pr)i)l)|((((ug)u)s)t)))|((((((((((Se)p)t)|((No)v)|((De)c))e)m)|(((Oc)t)o))b)e)r))#) 68 | length: 3 to 9 69 | cplxty: 105 70 | -------------------- Examples -------------------- 71 | ex(0): November March June February April December November August 72 | ex(1): December March July October August August February July January 73 | ex(2): February March August May February March January April January 74 | ex(3): August August November April April February January February 75 | ex(5): August February March July November January October February 76 | ex(8): May October February January February December March March 77 | ---------------- Examples from DFA --------------- 78 | s1: May 79 | 80 | # 969. RFC2822 date 81 | 82 | ERROR: (?:\s*(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s*)?(0?[1-9]|[1-2][0-9]|3[01])\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(19[0-9]{2}|[2-9][0-9]{3}|[0-9]{2})\s+(2[0-3]|[0-1][0-9]):([0-5][0-9])(?::(60|[0-5][0-9]))?\s+([-\+][0-9]{2}[0-5][0-9]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))(\s*\((\\\(|\\\)|(?<=[^\\])\((?)|(?<=[^\\])\)(?<-C>)|[^\(\)]*)*(?(C)(?!))\))*\s* 83 | rx: '(?...' 
unimplemented: in "(?:\s*(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s*)?(0?[1-9]|[1-2][0-9]|3[01])\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(19[0-9]{2}|[2-9][0-9]{3}|[0-9]{2})\s+(2[0-3]|[0-1][0-9]):([0-5][0-9])(?::(60|[0-5][0-9]))?\s+([-\+][0-9]{2}[0-5][0-9]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))(\s*\((\\\(|\\\)|(?<=[^\\])\((?)|(?<=[^\\])\)(?<-C>)|[^\(\)]*)*(?(C)(?!))\))*\s*" 84 | 4 expression(s) loaded 85 | (1 expression(s) rejected) 86 | -------------------------------------------------------------------------------- /test/48-fireball.std: -------------------------------------------------------------------------------- 1 | 2 | ERROR: \b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))) 3 | rx: \b (boundary) unimplemented: in "\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))" 4 | 5 | ERROR: (?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])) 6 | rx: '(?...' unimplemented: in "(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))" 7 | 8 | ERROR: (?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])) 9 | rx: '(?...' unimplemented: in "(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))" 10 | 11 | ERROR: (?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])) 12 | rx: '(?...' 
unimplemented: in "(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))" 13 | 14 | ERROR: (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?p%d[label=\" %s\"]\n", 33 | p, dfa.Leaves[p]) 34 | } 35 | } 36 | fmt.Fprintf(f, "i [shape=%s, regular=true, label=\"\"]\n", startshape) 37 | for i, l := range dfa.Leaves { 38 | if IsAccept(l) { 39 | continue 40 | } 41 | fmt.Fprintf(f, "p%d [label=\"p%d\"]\n", i, i) 42 | for _, p := range l.FollowPos.Members() { 43 | if IsAccept(dfa.Leaves[p]) { 44 | fmt.Fprintf(f, "p%d [shape=doublecircle]\n", i) 45 | } else { 46 | fmt.Fprintf(f, "p%d->p%d[label=\" %s\"]\n", 47 | i, p, dfa.Leaves[p]) 48 | } 49 | } 50 | } 51 | fmt.Fprintln(f, "}") 
52 | } 53 | 54 | // DFA.ToDot generates a Dot (GraphViz) representation of the DFA. 55 | func (dfa *DFA) ToDot(f io.Writer, title string, labels string) { 56 | fmt.Fprintln(f, "//", title) 57 | fmt.Fprintln(f, "digraph DFA {") 58 | fmt.Fprintf(f, "label=%s\n", strconv.Quote(title)) 59 | fmt.Fprintln(f, 60 | "node [shape=circle, height=.3, width=.3, margin=0, fontsize=10]") 61 | fmt.Fprintln(f, "s0 [shape=triangle, regular=true]") 62 | for _, src := range dfa.Dstates { 63 | if src.AccSet != nil { 64 | if labels == "" { 65 | fmt.Fprintf(f, "s%d [shape=doublecircle]\n", 66 | src.Index) 67 | } else { 68 | fmt.Fprintf(f, 69 | "s%d [shape=doubleoctagon, label=\"s%d\n", 70 | src.Index, src.Index) 71 | for _, i := range src.AccSet.Members() { 72 | fmt.Fprintf(f, "%c", labels[i]) 73 | } 74 | fmt.Fprintf(f, "\"]\n") 75 | } 76 | } 77 | slist, xmap := src.InvertMap() 78 | for _, dst := range slist.Members() { 79 | fmt.Fprintf(f, "s%d->s%d[label=\" %s\"]\n", 80 | src.Index, dst, xmap[dst].Unbracketed()) 81 | } 82 | } 83 | fmt.Fprintln(f, "}") 84 | } 85 | 86 | // WriteGraph writes a graph based on the extension of the given filename. 87 | // *.dot -> Dot (GraphViz) format (default for unrecognized forms) 88 | // *.gif -> GIF (Graphics Interchange Format) 89 | // *.pdf -> PDF (Portable Document Format) 90 | // *.png -> PNG (Portable Network Graphics) 91 | // *.svg -> SVG (Scalable Vector Graphics) 92 | // 93 | // The argument genfunc is a function to actually generate the Dot output. 94 | // If another format is wanted, output is written to a temporary file and 95 | // then "dot" is run from the path to convert it. 96 | // 97 | // If the filename is "@", another temporary file is written in SVG format 98 | // and a viewer is opened. This temporary file is never deleted because we 99 | // don't know when it's safe to remove it. 100 | // 101 | // If the filename is "-", standard output is written in Dot format. 
102 | func WriteGraph(filename string, genfunc func(io.Writer)) { 103 | var err error 104 | var otype string // output conversion type 105 | var dotfile *os.File // output file for Dot format 106 | 107 | // check what type of output is wanted 108 | switch { 109 | case filename == FNAME_VIEW: // view interactively 110 | otype = "-Tsvg" 111 | case strings.HasSuffix(filename, ".gif"): 112 | otype = "-Tgif" 113 | case strings.HasSuffix(filename, ".pdf"): 114 | otype = "-Tpdf" 115 | case strings.HasSuffix(filename, ".png"): 116 | otype = "-Tpng" 117 | case strings.HasSuffix(filename, ".svg"): 118 | otype = "-Tsvg" 119 | case filename == FNAME_STDOUT: // Dot on stdout 120 | dotfile = os.Stdout 121 | default: // write .dot directly w/o temp file 122 | dotfile, err = os.Create(filename) 123 | CkErr(err) 124 | } 125 | if dotfile == nil { // if we need to use a temporary file 126 | dotfile, err = ioutil.TempFile("", "rxplor") 127 | CkErr(err) 128 | } 129 | 130 | // generate the Dot file 131 | genfunc(dotfile) 132 | CkErr(dotfile.Close()) 133 | if otype == "" { // if nothing more to do 134 | return 135 | } 136 | 137 | // convert from Dot format to desired output format 138 | dotname := dotfile.Name() 139 | outname := filename 140 | if outname == FNAME_VIEW { 141 | outname = dotname + ".svg" 142 | } 143 | CkErr(exec.Command("dot", otype, dotname, "-o", outname).Run()) 144 | os.Remove(dotname) 145 | if filename != FNAME_VIEW { 146 | return // no viewer wanted 147 | } 148 | 149 | // run a viewer 150 | if runtime.GOOS == "darwin" { // if Macintosh 151 | CkErr(exec.Command("open", "-W", outname).Run()) 152 | } else { 153 | CkErr(exec.Command("xdg-open", outname).Run()) 154 | } 155 | // DISABLED: os.Remove(outname) 156 | // We don't remove the temp file because we don't know when it's safe. 157 | // It's especially problematic when multiple views are open at once. 158 | // It would be nice to find a solution for this. 
159 | } 160 | -------------------------------------------------------------------------------- /test/46-mbynens.rx: -------------------------------------------------------------------------------- 1 | # "In search of the perfect URL validation regex" 2 | # mathiasbynens.be/demo/url-regex sampled 20-feb-2014 3 | # (I've removed PHP delimiters, anchors, suffix modifiers (typically iS)) 4 | # 5 | # Spoon Library (979 chars) 6 | (((http|ftp|https):\/{2})+(([0-9a-z_-]+\.)+(aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx|cy|cz|cz|de|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mn|mn|mo|mp|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|nom|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ra|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw|arpa)(:[0-9]+)?((\/([~0-9a-zA-Z\#\+\%@\.\/_-]+))?(\?[0-9a-zA-Z\+\%@\/&\[\];=_-]+)?)?)) 7 | # @krijnhoetmer (115 chars) 8 | (^|[\s.:;?\-\]<\(])(https?://[-\w;/?:@&=+$\|\_.!~*\|'()\[\]%#,☺]+[\w/#](\(\))?)(?=$|[\s',\|\(\).:;?\-\[\]>\)]) 9 | # @gruber (71 chars) 10 | (([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))) 11 | # @gruber v2 (218 chars) 12 | ((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])) 13 | # @cowboy (1241 chars) 14 | 
[a-z\d.-]+://[^<>\s]+|\b(?:(?:(?:[^\s!@#$%^&*()_=+[\]{}\|;:'",.<>/?]+)\.)+(?:ac|ad|aero|ae|af|ag|ai|al|am|an|ao|aq|arpa|ar|asia|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|biz|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cat|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|coop|com|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|info|int|in|io|iq|ir|is|it|je|jm|jobs|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mobi|mo|mp|mq|mr|ms|mt|museum|mu|mv|mw|mx|my|mz|name|na|nc|net|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pro|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|travel|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|xn--0zwm56d|xn--11b5bs3a9aj6g|xn--80akhbyknj4f|xn--9t4b11yi5a|xn--deba0ad|xn--g6w251d|xn--hgbk6aj7f53bba|xn--hlcj6aya9esc7a|xn--jxalpdlp|xn--kgbechtv|xn--zckzah|ye|yt|yu|za|zm|zw)|(?:(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])\.){3}(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]))(?:[;/][^#?<>\s]*)?(?:\?[^#<>\s]*)?(?:#[^<>\s]*)?(?!\w)) 15 | # Jeffrey Friedl (241 chars) 16 | ((ftp|https?)://[-\w]+(\.\w[-\w]*)+|(?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+(?: com\b|edu\b|biz\b|gov\b|in(?:t|fo)\b|mil\b|net\b|org\b|[a-z][a-z]\b))(\:\d+)?(/[^.!,?;"'<>()\[\]{}\s\x7F-\xFF]*(?:[.!,?]+[^.!,?;"'<>()\[\]{}\s\x7F-\xFF]+)*)? 17 | # @mattfarina (287 chars) 18 | ([a-z][a-z0-9\*\-\.]*):\/\/(?:(?:(?:[\w\.\-\+!$&'\(\)*\+,;=]|%[0-9a-f]{2})+:)*(?:[\w\.\-\+%!$&'\(\)*\+,;=]|%[0-9a-f]{2})+@)?(?:(?:[a-z0-9\-\.]|%[0-9a-f]{2})+|(?:\[(?:[0-9a-f]{0,4}:)*(?:[0-9a-f]{0,4})\]))(?::[0-9]+)?(?:[\/|\?](?:[\w#!:\.\?\+=&@!$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})*)? 
19 | # @stephenhay (38 chars) 20 | (https?|ftp)://[^\s/$.?#].[^\s]* 21 | # @scottgonzales (1347 chars) 22 | ([a-z]([a-z]|\d|\+|-|\.)*):(\/\/(((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:)*@)?((\[(|(v[\da-f]{1,}\.(([a-z]|\d|-|\.|_|~)|[!\$&'\(\)\*\+,;=]|:)+))\])|((\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5]))|(([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=])*)(:\d*)?)(\/(([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)*)*|(\/((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)+(\/(([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)*)*)?)|((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)+(\/(([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)*)*)|((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)){0})(\?((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)|[\xE000-\xF8FF]|\/|\?)*)?(\#((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)|\/|\?)*)? 23 | # @rodneyrehm (109 chars) 24 | ((https?://|ftp://|www\.|[^\s:=]+@www\.).*?[a-z_\/0-9\-\#=&])(?=(\.|,|;|\?|\!)?("|'|«|»|\[|\s|\r|\n|$)) 25 | # @imme_emosol (54 chars) 26 | (https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)? 
27 | # @diegoperini (502 chars) 28 | (?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})(?!169\.254(?:\.\d{1,3}){2})(?!192\.168(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)*(?:\.(?:[a-z\x{00a1}-\x{ffff}]{2,})))(?::\d{2,5})?(?:/[^\s]*)? 29 | --------------------------------------------------------------------------------