├── LICENSE ├── Makefile ├── backtrack.c ├── compile.c ├── main.c ├── parse.y ├── pike.c ├── recursive.c ├── regexp.h ├── sub.c └── thompson.c /LICENSE: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions are 5 | // met: 6 | // 7 | // * Redistributions of source code must retain the above copyright 8 | // notice, this list of conditions and the following disclaimer. 9 | // * Redistributions in binary form must reproduce the above 10 | // copyright notice, this list of conditions and the following disclaimer 11 | // in the documentation and/or other materials provided with the 12 | // distribution. 13 | // * Neither the name of Google, Inc. nor the names of its 14 | // contributors may be used to endorse or promote products derived from 15 | // this software without specific prior written permission. 16 | // 17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# --------------------------------------------------------------------------------
# /Makefile:
# --------------------------------------------------------------------------------
# Copyright 2007-2009 Russ Cox. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

CC=gcc
CFLAGS=-ggdb -Wall -O2

# Name of the executable; the link and clean rules refer to it via $(TARG).
TARG=re
OFILES=\
	backtrack.o\
	compile.o\
	main.o\
	pike.o\
	recursive.o\
	sub.o\
	thompson.o\
	y.tab.o

HFILES=\
	regexp.h\
	y.tab.h

# clean names a command, not a file, so it must run even if a file
# called "clean" exists.
.PHONY: clean

$(TARG): $(OFILES)
	$(CC) -o $(TARG) $(OFILES)

# Every object is rebuilt when a header changes, including the
# bison-generated y.tab.h.
%.o: %.c $(HFILES)
	$(CC) -c $(CFLAGS) $*.c

y.tab.h y.tab.c: parse.y
	bison -v -y parse.y

clean:
	rm -f *.o core $(TARG) y.tab.[ch] y.output
# --------------------------------------------------------------------------------
# /backtrack.c:
# --------------------------------------------------------------------------------
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4 | 5 | #include "regexp.h" 6 | 7 | typedef struct Thread Thread; 8 | struct Thread 9 | { 10 | Inst *pc; 11 | char *sp; 12 | Sub *sub; 13 | }; 14 | 15 | static Thread 16 | thread(Inst *pc, char *sp, Sub *sub) 17 | { 18 | Thread t = {pc, sp, sub}; 19 | return t; 20 | } 21 | 22 | int 23 | backtrack(Prog *prog, char *input, char **subp, int nsubp) 24 | { 25 | enum { MAX = 1000 }; 26 | Thread ready[MAX]; 27 | int i, nready; 28 | Inst *pc; 29 | char *sp; 30 | Sub *sub; 31 | 32 | /* queue initial thread */ 33 | sub = newsub(nsubp); 34 | for(i=0; isub[i] = nil; 36 | ready[0] = thread(prog->start, input, sub); 37 | nready = 1; 38 | 39 | /* run threads in stack order */ 40 | while(nready > 0) { 41 | --nready; /* pop state for next thread to run */ 42 | pc = ready[nready].pc; 43 | sp = ready[nready].sp; 44 | sub = ready[nready].sub; 45 | assert(sub->ref > 0); 46 | for(;;) { 47 | switch(pc->opcode) { 48 | case Char: 49 | if(*sp != pc->c) 50 | goto Dead; 51 | pc++; 52 | sp++; 53 | continue; 54 | case Any: 55 | if(*sp == 0) 56 | goto Dead; 57 | pc++; 58 | sp++; 59 | continue; 60 | case Match: 61 | for(i=0; isub[i]; 63 | decref(sub); 64 | return 1; 65 | case Jmp: 66 | pc = pc->x; 67 | continue; 68 | case Split: 69 | if(nready >= MAX) 70 | fatal("backtrack overflow"); 71 | ready[nready++] = thread(pc->y, sp, incref(sub)); 72 | pc = pc->x; /* continue current thread */ 73 | continue; 74 | case Save: 75 | sub = update(sub, pc->n, sp); 76 | pc++; 77 | continue; 78 | } 79 | } 80 | Dead: 81 | decref(sub); 82 | } 83 | return 0; 84 | } 85 | 86 | -------------------------------------------------------------------------------- /compile.c: -------------------------------------------------------------------------------- 1 | // Copyright 2007-2009 Russ Cox. All Rights Reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | #include "regexp.h" 6 | 7 | static Inst *pc; 8 | static int count(Regexp*); 9 | static void emit(Regexp*); 10 | 11 | Prog* 12 | compile(Regexp *r) 13 | { 14 | int n; 15 | Prog *p; 16 | 17 | n = count(r) + 1; 18 | p = mal(sizeof *p + n*sizeof p->start[0]); 19 | p->start = (Inst*)(p+1); 20 | pc = p->start; 21 | emit(r); 22 | pc->opcode = Match; 23 | pc++; 24 | p->len = pc - p->start; 25 | return p; 26 | } 27 | 28 | // how many instructions does r need? 29 | static int 30 | count(Regexp *r) 31 | { 32 | switch(r->type) { 33 | default: 34 | fatal("bad count"); 35 | case Alt: 36 | return 2 + count(r->left) + count(r->right); 37 | case Cat: 38 | return count(r->left) + count(r->right); 39 | case Lit: 40 | case Dot: 41 | return 1; 42 | case Paren: 43 | return 2 + count(r->left); 44 | case Quest: 45 | return 1 + count(r->left); 46 | case Star: 47 | return 2 + count(r->left); 48 | case Plus: 49 | return 1 + count(r->left); 50 | } 51 | } 52 | 53 | static void 54 | emit(Regexp *r) 55 | { 56 | Inst *p1, *p2, *t; 57 | 58 | switch(r->type) { 59 | default: 60 | fatal("bad emit"); 61 | 62 | case Alt: 63 | pc->opcode = Split; 64 | p1 = pc++; 65 | p1->x = pc; 66 | emit(r->left); 67 | pc->opcode = Jmp; 68 | p2 = pc++; 69 | p1->y = pc; 70 | emit(r->right); 71 | p2->x = pc; 72 | break; 73 | 74 | case Cat: 75 | emit(r->left); 76 | emit(r->right); 77 | break; 78 | 79 | case Lit: 80 | pc->opcode = Char; 81 | pc->c = r->ch; 82 | pc++; 83 | break; 84 | 85 | case Dot: 86 | pc++->opcode = Any; 87 | break; 88 | 89 | case Paren: 90 | pc->opcode = Save; 91 | pc->n = 2*r->n; 92 | pc++; 93 | emit(r->left); 94 | pc->opcode = Save; 95 | pc->n = 2*r->n + 1; 96 | pc++; 97 | break; 98 | 99 | case Quest: 100 | pc->opcode = Split; 101 | p1 = pc++; 102 | p1->x = pc; 103 | emit(r->left); 104 | p1->y = pc; 105 | if(r->n) { // non-greedy 106 | t = p1->x; 107 | p1->x = p1->y; 108 | p1->y = t; 109 | } 110 | break; 111 | 112 | case Star: 113 | pc->opcode = Split; 114 | p1 = pc++; 115 | p1->x = pc; 116 | 
emit(r->left); 117 | pc->opcode = Jmp; 118 | pc->x = p1; 119 | pc++; 120 | p1->y = pc; 121 | if(r->n) { // non-greedy 122 | t = p1->x; 123 | p1->x = p1->y; 124 | p1->y = t; 125 | } 126 | break; 127 | 128 | case Plus: 129 | p1 = pc; 130 | emit(r->left); 131 | pc->opcode = Split; 132 | pc->x = p1; 133 | p2 = pc; 134 | pc++; 135 | p2->y = pc; 136 | if(r->n) { // non-greedy 137 | t = p2->x; 138 | p2->x = p2->y; 139 | p2->y = t; 140 | } 141 | break; 142 | } 143 | } 144 | 145 | void 146 | printprog(Prog *p) 147 | { 148 | Inst *pc, *e; 149 | 150 | pc = p->start; 151 | e = p->start + p->len; 152 | 153 | for(; pc < e; pc++) { 154 | switch(pc->opcode) { 155 | default: 156 | fatal("printprog"); 157 | case Split: 158 | printf("%2d. split %d, %d\n", (int)(pc-p->start), (int)(pc->x-p->start), (int)(pc->y-p->start)); 159 | break; 160 | case Jmp: 161 | printf("%2d. jmp %d\n", (int)(pc-p->start), (int)(pc->x-p->start)); 162 | break; 163 | case Char: 164 | printf("%2d. char %c\n", (int)(pc-p->start), pc->c); 165 | break; 166 | case Any: 167 | printf("%2d. any\n", (int)(pc-p->start)); 168 | break; 169 | case Match: 170 | printf("%2d. match\n", (int)(pc-p->start)); 171 | break; 172 | case Save: 173 | printf("%2d. save %d\n", (int)(pc-p->start), pc->n); 174 | } 175 | } 176 | } 177 | 178 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | // Copyright 2007-2009 Russ Cox. All Rights Reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | #include "regexp.h" 6 | 7 | struct { 8 | char *name; 9 | int (*fn)(Prog*, char*, char**, int); 10 | } tab[] = { 11 | "recursive", recursiveprog, 12 | "recursiveloop", recursiveloopprog, 13 | "backtrack", backtrack, 14 | "thompson", thompsonvm, 15 | "pike", pikevm, 16 | }; 17 | 18 | void 19 | usage(void) 20 | { 21 | fprintf(stderr, "usage: re regexp string...\n"); 22 | exit(2); 23 | } 24 | 25 | int 26 | main(int argc, char **argv) 27 | { 28 | int i, j, k, l; 29 | Regexp *re; 30 | Prog *prog; 31 | char *sub[MAXSUB]; 32 | 33 | if(argc < 2) 34 | usage(); 35 | 36 | re = parse(argv[1]); 37 | printre(re); 38 | printf("\n"); 39 | 40 | prog = compile(re); 41 | printprog(prog); 42 | 43 | for(i=2; i0; k--) 54 | if(sub[k-1]) 55 | break; 56 | for(l=0; l CHAR EOL 22 | %type alt concat repeat single line 23 | %type count 24 | 25 | %% 26 | 27 | line: alt EOL 28 | { 29 | parsed_regexp = $1; 30 | return 1; 31 | } 32 | 33 | alt: 34 | concat 35 | | alt '|' concat 36 | { 37 | $$ = reg(Alt, $1, $3); 38 | } 39 | ; 40 | 41 | concat: 42 | repeat 43 | | concat repeat 44 | { 45 | $$ = reg(Cat, $1, $2); 46 | } 47 | ; 48 | 49 | repeat: 50 | single 51 | | single '*' 52 | { 53 | $$ = reg(Star, $1, nil); 54 | } 55 | | single '*' '?' 56 | { 57 | $$ = reg(Star, $1, nil); 58 | $$->n = 1; 59 | } 60 | | single '+' 61 | { 62 | $$ = reg(Plus, $1, nil); 63 | } 64 | | single '+' '?' 65 | { 66 | $$ = reg(Plus, $1, nil); 67 | $$->n = 1; 68 | } 69 | | single '?' 70 | { 71 | $$ = reg(Quest, $1, nil); 72 | } 73 | | single '?' '?' 74 | { 75 | $$ = reg(Quest, $1, nil); 76 | $$->n = 1; 77 | } 78 | ; 79 | 80 | count: 81 | { 82 | $$ = ++nparen; 83 | } 84 | ; 85 | 86 | single: 87 | '(' count alt ')' 88 | { 89 | $$ = reg(Paren, $3, nil); 90 | $$->n = $2; 91 | } 92 | | '(' '?' ':' alt ')' 93 | { 94 | $$ = $4; 95 | } 96 | | CHAR 97 | { 98 | $$ = reg(Lit, nil, nil); 99 | $$->ch = $1; 100 | } 101 | | '.' 
{
	$$ = reg(Dot, nil, nil);
}
;

%%

static char *input;		/* unread remainder of the pattern text */
static Regexp *parsed_regexp;	/* tree stored by the `line' rule */
static int nparen;		/* capture groups numbered so far */
int gen;

/*
 * Lexer: end of pattern yields EOL, the operator characters are returned
 * as themselves, and anything else is a CHAR whose value is in yylval.c.
 */
static int
yylex(void)
{
	int ch;

	if(input == NULL || *input == 0)
		return EOL;
	ch = *input++;
	if(strchr("|*+?():.", ch) == NULL) {
		yylval.c = ch;
		return CHAR;
	}
	return ch;
}

/* Print "fatal error: " plus the formatted message on stderr, then exit(2). */
void
fatal(char *fmt, ...)
{
	va_list ap;

	fprintf(stderr, "fatal error: ");
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fprintf(stderr, "\n");
	exit(2);
}

/* Parser error hook: any syntax error is fatal. */
static void
yyerror(char *s)
{
	fatal("%s", s);
}


/*
 * Parse the pattern s and return Cat(NgStar(Dot), Paren(0, tree)):
 * the parsed tree wrapped in capture group 0 and preceded by a
 * non-greedy `.*'.
 */
Regexp*
parse(char *s)
{
	Regexp *whole, *skip;

	input = s;
	parsed_regexp = nil;
	nparen = 0;
	if(yyparse() != 1)
		yyerror("did not parse");
	if(parsed_regexp == nil)
		yyerror("parser nil");

	whole = reg(Paren, parsed_regexp, nil);	// $0 parens
	skip = reg(Star, reg(Dot, nil, nil), nil);
	skip->n = 1;	// non-greedy
	return reg(Cat, skip, whole);
}

/* malloc n zeroed bytes; fatal() on allocation failure. */
void*
mal(int n)
{
	void *mem;

	mem = malloc(n);
	if(mem == nil)
		fatal("out of memory");
	memset(mem, 0, n);
	return mem;
}

/* Allocate a zeroed Regexp node and fill in its type and children. */
Regexp*
reg(int type, Regexp *left, Regexp *right)
{
	Regexp *node;

	node = mal(sizeof *node);
	node->type = type;
	node->left = left;
	node->right = right;
	return node;
}

/* Print "name(left, right)" for a two-child node. */
static void
printbinary(char *name, Regexp *r)
{
	printf("%s(", name);
	printre(r->left);
	printf(", ");
	printre(r->right);
	printf(")");
}

/* Print "name(left)" for a repetition node, prefixed by "Ng" when r->n is set. */
static void
printrepeat(char *name, Regexp *r)
{
	if(r->n)
		printf("Ng");
	printf("%s(", name);
	printre(r->left);
	printf(")");
}

/* Print the tree r in prefix form; unknown node types print "???". */
void
printre(Regexp *r)
{
	switch(r->type) {
	case Alt:
		printbinary("Alt", r);
		break;

	case Cat:
		printbinary("Cat", r);
		break;

	case Lit:
		printf("Lit(%c)", r->ch);
		break;

	case Dot:
		printf("Dot");
		break;

	case Paren:
		printf("Paren(%d, ", r->n);
		printre(r->left);
		printf(")");
		break;

	case Star:
		printrepeat("Star", r);
		break;

	case Plus:
		printrepeat("Plus", r);
		break;

	case Quest:
		printrepeat("Quest", r);
		break;

	default:
		printf("???");
		break;
	}
}
// --------------------------------------------------------------------------------
// /pike.c:
// --------------------------------------------------------------------------------
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4 | 5 | #include "regexp.h" 6 | 7 | typedef struct Thread Thread; 8 | struct Thread 9 | { 10 | Inst *pc; 11 | Sub *sub; 12 | }; 13 | 14 | typedef struct ThreadList ThreadList; 15 | struct ThreadList 16 | { 17 | int n; 18 | Thread t[1]; 19 | }; 20 | 21 | static Thread 22 | thread(Inst *pc, Sub *sub) 23 | { 24 | Thread t = {pc, sub}; 25 | return t; 26 | } 27 | 28 | static ThreadList* 29 | threadlist(int n) 30 | { 31 | return mal(sizeof(ThreadList)+n*sizeof(Thread)); 32 | } 33 | 34 | static void 35 | addthread(ThreadList *l, Thread t, char *sp) 36 | { 37 | if(t.pc->gen == gen) { 38 | decref(t.sub); 39 | return; // already on list 40 | } 41 | t.pc->gen = gen; 42 | 43 | switch(t.pc->opcode) { 44 | default: 45 | l->t[l->n] = t; 46 | l->n++; 47 | break; 48 | case Jmp: 49 | addthread(l, thread(t.pc->x, t.sub), sp); 50 | break; 51 | case Split: 52 | addthread(l, thread(t.pc->x, incref(t.sub)), sp); 53 | addthread(l, thread(t.pc->y, t.sub), sp); 54 | break; 55 | case Save: 56 | addthread(l, thread(t.pc+1, update(t.sub, t.pc->n, sp)), sp); 57 | break; 58 | } 59 | } 60 | 61 | int 62 | pikevm(Prog *prog, char *input, char **subp, int nsubp) 63 | { 64 | int i, len; 65 | ThreadList *clist, *nlist, *tmp; 66 | Inst *pc; 67 | char *sp; 68 | Sub *sub, *matched; 69 | 70 | matched = nil; 71 | for(i=0; isub[i] = nil; 76 | 77 | len = prog->len; 78 | clist = threadlist(len); 79 | nlist = threadlist(len); 80 | 81 | gen++; 82 | addthread(clist, thread(prog->start, sub), input); 83 | matched = 0; 84 | for(sp=input;; sp++) { 85 | if(clist->n == 0) 86 | break; 87 | // printf("%d(%02x).", (int)(sp - input), *sp & 0xFF); 88 | gen++; 89 | for(i=0; in; i++) { 90 | pc = clist->t[i].pc; 91 | sub = clist->t[i].sub; 92 | // printf(" %d", (int)(pc - prog->start)); 93 | switch(pc->opcode) { 94 | case Char: 95 | if(*sp != pc->c) { 96 | decref(sub); 97 | break; 98 | } 99 | case Any: 100 | if(*sp == 0) { 101 | decref(sub); 102 | break; 103 | } 104 | addthread(nlist, thread(pc+1, sub), sp+1); 105 | break; 
106 | case Match: 107 | if(matched) 108 | decref(matched); 109 | matched = sub; 110 | for(i++; i < clist->n; i++) 111 | decref(clist->t[i].sub); 112 | goto BreakFor; 113 | // Jmp, Split, Save handled in addthread, so that 114 | // machine execution matches what a backtracker would do. 115 | // This is discussed (but not shown as code) in 116 | // Regular Expression Matching: the Virtual Machine Approach. 117 | } 118 | } 119 | BreakFor: 120 | // printf("\n"); 121 | tmp = clist; 122 | clist = nlist; 123 | nlist = tmp; 124 | nlist->n = 0; 125 | if(*sp == '\0') 126 | break; 127 | } 128 | if(matched) { 129 | for(i=0; isub[i]; 131 | decref(matched); 132 | return 1; 133 | } 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /recursive.c: -------------------------------------------------------------------------------- 1 | // Copyright 2007-2009 Russ Cox. All Rights Reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | #include "regexp.h" 6 | 7 | int 8 | recursive(Inst *pc, char *sp, char **subp, int nsubp) 9 | { 10 | char *old; 11 | 12 | switch(pc->opcode) { 13 | case Char: 14 | if(*sp != pc->c) 15 | return 0; 16 | case Any: 17 | if(*sp == '\0') 18 | return 0; 19 | return recursive(pc+1, sp+1, subp, nsubp); 20 | case Match: 21 | return 1; 22 | case Jmp: 23 | return recursive(pc->x, sp, subp, nsubp); 24 | case Split: 25 | if(recursive(pc->x, sp, subp, nsubp)) 26 | return 1; 27 | return recursive(pc->y, sp, subp, nsubp); 28 | case Save: 29 | if(pc->n >= nsubp) 30 | return recursive(pc+1, sp, subp, nsubp); 31 | old = subp[pc->n]; 32 | subp[pc->n] = sp; 33 | if(recursive(pc+1, sp, subp, nsubp)) 34 | return 1; 35 | subp[pc->n] = old; 36 | return 0; 37 | } 38 | fatal("recursive"); 39 | return -1; 40 | } 41 | 42 | int 43 | recursiveprog(Prog *prog, char *input, char **subp, int nsubp) 44 | { 45 | return recursive(prog->start, input, subp, nsubp); 46 | } 47 | 48 | int 49 | recursiveloop(Inst *pc, char *sp, char **subp, int nsubp) 50 | { 51 | char *old; 52 | 53 | for(;;) { 54 | switch(pc->opcode) { 55 | case Char: 56 | if(*sp != pc->c) 57 | return 0; 58 | case Any: 59 | pc++; 60 | sp++; 61 | continue; 62 | case Match: 63 | return 1; 64 | case Jmp: 65 | pc = pc->x; 66 | continue; 67 | case Split: 68 | if(recursiveloop(pc->x, sp, subp, nsubp)) 69 | return 1; 70 | pc = pc->y; 71 | continue; 72 | case Save: 73 | if(pc->n >= nsubp) { 74 | pc++; 75 | continue; 76 | } 77 | old = subp[pc->n]; 78 | subp[pc->n] = sp; 79 | if(recursiveloop(pc+1, sp, subp, nsubp)) 80 | return 1; 81 | subp[pc->n] = old; 82 | return 0; 83 | } 84 | fatal("recursiveloop"); 85 | } 86 | } 87 | 88 | int 89 | recursiveloopprog(Prog *prog, char *input, char **subp, int nsubp) 90 | { 91 | return recursiveloop(prog->start, input, subp, nsubp); 92 | } 93 | -------------------------------------------------------------------------------- /regexp.h: 
// --------------------------------------------------------------------------------
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#ifndef REGEXP_H
#define REGEXP_H

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>

#define nil ((void*)0)
#define nelem(x) (sizeof(x)/sizeof((x)[0]))

typedef struct Regexp Regexp;
typedef struct Prog Prog;
typedef struct Inst Inst;

/* Node of the parsed pattern tree. */
struct Regexp
{
	int type;	/* one of the Regexp.type constants below */
	int n;		/* Paren: group number; Star/Plus/Quest: non-zero means non-greedy */
	int ch;		/* Lit: the literal character */
	Regexp *left;
	Regexp *right;
};

enum	/* Regexp.type */
{
	Alt = 1,
	Cat,
	Lit,
	Dot,
	Paren,
	Quest,
	Star,
	Plus,
};

Regexp *parse(char*);
Regexp *reg(int type, Regexp *left, Regexp *right);
void printre(Regexp*);
void fatal(char*, ...);
void *mal(int);

/* Compiled program: len instructions beginning at start. */
struct Prog
{
	Inst *start;
	int len;
};

/* One instruction of a compiled program. */
struct Inst
{
	int opcode;	/* one of the Inst.opcode constants below */
	int c;		/* Char: character to match */
	int n;		/* Save: capture slot to record into */
	Inst *x;	/* Jmp: target; Split: first target */
	Inst *y;	/* Split: second target */
	int gen;	// global state, oooh!
};

enum	/* Inst.opcode */
{
	Char = 1,
	Match,
	Jmp,
	Split,
	Any,
	Save,
};

Prog *compile(Regexp*);
void printprog(Prog*);

/* List generation counter compared against Inst.gen by the thread-list matchers. */
extern int gen;

enum {
	MAXSUB = 20
};

typedef struct Sub Sub;

/* Reference-counted set of capture pointers. */
struct Sub
{
	int ref;
	int nsub;
	char *sub[MAXSUB];
};

Sub *newsub(int n);
Sub *incref(Sub*);
Sub *copy(Sub*);	/* NOTE(review): declared here, but no definition appears in sub.c */
Sub *update(Sub*, int, char*);
void decref(Sub*);

/*
 * Matchers.  Each takes a program, the NUL-terminated input, an array
 * that receives capture pointers and that array's length, and returns
 * non-zero on a match.
 */
int backtrack(Prog*, char*, char**, int);
int pikevm(Prog*, char*, char**, int);
int recursiveloopprog(Prog*, char*, char**, int);
int recursiveprog(Prog*, char*, char**, int);
int thompsonvm(Prog*, char*, char**, int);

#endif /* REGEXP_H */
// --------------------------------------------------------------------------------
// /sub.c:
// --------------------------------------------------------------------------------
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4 | 5 | #include "regexp.h" 6 | 7 | Sub *freesub; 8 | 9 | Sub* 10 | newsub(int n) 11 | { 12 | Sub *s; 13 | 14 | s = freesub; 15 | if(s != nil) 16 | freesub = (Sub*)s->sub[0]; 17 | else 18 | s = mal(sizeof *s); 19 | s->nsub = n; 20 | s->ref = 1; 21 | return s; 22 | } 23 | 24 | Sub* 25 | incref(Sub *s) 26 | { 27 | s->ref++; 28 | return s; 29 | } 30 | 31 | Sub* 32 | update(Sub *s, int i, char *p) 33 | { 34 | Sub *s1; 35 | int j; 36 | 37 | if(s->ref > 1) { 38 | s1 = newsub(s->nsub); 39 | for(j=0; jnsub; j++) 40 | s1->sub[j] = s->sub[j]; 41 | s->ref--; 42 | s = s1; 43 | } 44 | s->sub[i] = p; 45 | return s; 46 | } 47 | 48 | void 49 | decref(Sub *s) 50 | { 51 | if(--s->ref == 0) { 52 | s->sub[0] = (char*)freesub; 53 | freesub = s; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /thompson.c: -------------------------------------------------------------------------------- 1 | // Copyright 2007-2009 Russ Cox. All Rights Reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | #include "regexp.h" 6 | 7 | typedef struct Thread Thread; 8 | struct Thread 9 | { 10 | Inst *pc; 11 | }; 12 | 13 | typedef struct ThreadList ThreadList; 14 | struct ThreadList 15 | { 16 | int n; 17 | Thread t[1]; 18 | }; 19 | 20 | static Thread 21 | thread(Inst *pc) 22 | { 23 | Thread t = {pc}; 24 | return t; 25 | } 26 | 27 | static ThreadList* 28 | threadlist(int n) 29 | { 30 | return mal(sizeof(ThreadList)+n*sizeof(Thread)); 31 | } 32 | 33 | static void 34 | addthread(ThreadList *l, Thread t) 35 | { 36 | if(t.pc->gen == gen) 37 | return; // already on list 38 | 39 | t.pc->gen = gen; 40 | l->t[l->n] = t; 41 | l->n++; 42 | 43 | switch(t.pc->opcode) { 44 | case Jmp: 45 | addthread(l, thread(t.pc->x)); 46 | break; 47 | case Split: 48 | addthread(l, thread(t.pc->x)); 49 | addthread(l, thread(t.pc->y)); 50 | break; 51 | case Save: 52 | addthread(l, thread(t.pc+1)); 53 | break; 54 | } 55 | } 56 | 57 | int 58 | thompsonvm(Prog *prog, char *input, char **subp, int nsubp) 59 | { 60 | int i, len, matched; 61 | ThreadList *clist, *nlist, *tmp; 62 | Inst *pc; 63 | char *sp; 64 | 65 | for(i=0; ilen; 69 | clist = threadlist(len); 70 | nlist = threadlist(len); 71 | 72 | if(nsubp >= 1) 73 | subp[0] = input; 74 | gen++; 75 | addthread(clist, thread(prog->start)); 76 | matched = 0; 77 | for(sp=input;; sp++) { 78 | if(clist->n == 0) 79 | break; 80 | // printf("%d(%02x).", (int)(sp - input), *sp & 0xFF); 81 | gen++; 82 | for(i=0; in; i++) { 83 | pc = clist->t[i].pc; 84 | // printf(" %d", (int)(pc - prog->start)); 85 | switch(pc->opcode) { 86 | case Char: 87 | if(*sp != pc->c) 88 | break; 89 | case Any: 90 | if(*sp == 0) 91 | break; 92 | addthread(nlist, thread(pc+1)); 93 | break; 94 | case Match: 95 | if(nsubp >= 2) 96 | subp[1] = sp; 97 | matched = 1; 98 | goto BreakFor; 99 | // Jmp, Split, Save handled in addthread, so that 100 | // machine execution matches what a backtracker would do. 
101 | // This is discussed (but not shown as code) in 102 | // Regular Expression Matching: the Virtual Machine Approach. 103 | } 104 | } 105 | BreakFor: 106 | // printf("\n"); 107 | tmp = clist; 108 | clist = nlist; 109 | nlist = tmp; 110 | nlist->n = 0; 111 | if(*sp == '\0') 112 | break; 113 | } 114 | return matched; 115 | } 116 | --------------------------------------------------------------------------------