├── .gitignore ├── Makefile ├── README.md ├── capture.c ├── compile.c ├── main.c ├── parse.c ├── regex.h └── vm.c /.gitignore: -------------------------------------------------------------------------------- 1 | main 2 | *.o 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS += -std=c99 2 | 3 | main: main.o parse.o capture.o compile.o vm.o 4 | main.o parse.o capture.o compile.o vm.o: regex.h 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Regex 2 | 3 | A parody of http://code.google.com/p/re1/ with the intention to provide a 4 | simple, concise and constructive regex engine implementation. 5 | 6 | The parser part is a proof-of-concept design employing Shunting-yard algorithm 7 | to illustrate to feasibility to process unary operators and surrounds like 8 | parentheses. 9 | 10 | # Syntax 11 | 12 | a? match 1 or 0 times 13 | a* match 0 or more times 14 | a+ match 1 or more times 15 | a?? match 1 or 0 times, not greedily 16 | a*? match 0 or more, not greedily 17 | a+? match 1 or more, not greedily 18 | . match any character 19 | | alternative 20 | (a) capturing parentheses 21 | 22 | # Build 23 | 24 | make 25 | 26 | # Usage 27 | 28 | ./main '(ray|yar)' 'hello, ray' 29 | -------------------------------------------------------------------------------- /capture.c: -------------------------------------------------------------------------------- 1 | #include "regex.h" 2 | 3 | Capture *free_capture; 4 | 5 | Capture *newCapture(size_t size) 6 | { 7 | Capture *c; 8 | if (free_capture) { 9 | c = free_capture; 10 | free_capture = (Capture*)free_capture->captures[0]; 11 | } else 12 | c = malloc(sizeof(Capture)); 13 | c->size = size; 14 | c->ref = 1; 15 | return c; 16 | } 17 | 18 | Capture *incRef(Capture *c) 19 | { 20 | c->ref++; 21 | return c; 22 | } 23 | 24 | Capture *decRef(Capture *c) 25 | { 26 | if (! --c->ref) { 27 | c->captures[0] = (char*)free_capture; 28 | free_capture = c; 29 | } 30 | return c; 31 | } 32 | 33 | Capture *updateCapture(Capture *c, int i, const char *pos) 34 | { 35 | if (c->ref > 1) { 36 | Capture *cc = newCapture(c->size); 37 | memcpy(cc->captures, c->captures, sizeof(*c->captures) * c->size); 38 | c->ref--; 39 | c = cc; 40 | } 41 | c->captures[i] = pos; 42 | return c; 43 | } 44 | -------------------------------------------------------------------------------- /compile.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "regex.h" 3 | 4 | static Instruction *pc; 5 | 6 | static size_t count(Regex *r) 7 | { 8 | switch (r->type) { 9 | case Lit: 10 | case Dot: 11 | return 1; 12 | case Paren: 13 | case Star: case NGStar: 14 | return 2 + count(r->left); 15 | case Optional: case NGOptional: 16 | case Plus: case NGPlus: 17 | return 1 + count(r->left); 18 | case Concat: 19 | return count(r->left) + count(r->right); 20 | case Alternative: 21 | return 2 + count(r->left) + count(r->right); 22 | default: 23 | assert(0); 24 | } 25 | } 26 | 27 | static void build(Regex *r) 28 | { 29 | switch (r->type) { 30 | case Lit: 31 | pc->op = Char; 32 | pc++->ch = r->ch; 33 | break; 34 | case Dot: 35 | pc++->op = Any; 36 | break; 37 | case Concat: 38 | build(r->left); 39 | build(r->right); 40 | break; 41 | case Alternative: 42 | { 43 | Instruction *x = pc++; 44 | x->op = Split; 45 | x->left = pc; 46 | build(r->left); 47 | x->right = pc + 1; 48 | Instruction *y = pc++; 49 | y->op = Jmp; 50 | build(r->right); 51 | y->left = pc; 52 | break; 53 | } 54 | case Optional: 55 | case NGOptional: 56 | { 57 | Instruction *x = pc++; 58 | x->op = Split; 59 | x->left = pc; 60 | build(r->left); 61 | x->right = pc; 62 | if (r->type == NGOptional) 63 | swap(x->left, x->right); 64 | break; 65 | } 66 | case Star: 67 | case NGStar: 68 | { 69 | Instruction *x = pc++; 70 | x->op = Split; 71 | x->left = pc; 72 | build(r->left); 73 | pc->op = Jmp; 74 | pc++->left = x; 75 | x->right = pc; 76 | if (r->type == NGStar) 77 | swap(x->left, x->right); 78 | break; 79 | } 80 | case Plus: 81 | case NGPlus: 82 | { 83 | Instruction *x = pc; 84 | build(r->left); 85 | pc->op = Split; 86 | pc->left = x; 87 | pc->right = pc + 1; 88 | if (r->type == NGPlus) 89 | swap(pc->left, pc->right); 90 | pc++; 91 | break; 92 | } 93 | case Paren: 94 | pc->op = Save; 95 | pc++->save_id = 2 * r->capture_id; 96 | build(r->left); 97 | pc->op = Save; 98 | pc++->save_id = 2 * r->capture_id + 1; 99 | break; 100 | } 101 | } 102 | 103 | Program *compile(Regex *r) 104 | { 105 | size_t size = count(r) + 1; 106 | Program *p = malloc(sizeof(Program) + size * sizeof(Instruction)); 107 | pc = p->start = (Instruction*)(p+1); 108 | build(r); 109 | (pc++)->op = Match; 110 | p->size = pc - p->start; 111 | return p; 112 | } 113 | 114 | void printProgram(Program *p) 115 | { 116 | Instruction *s = p->start; 117 | for (size_t i = 0; i < p->size; i++) { 118 | Instruction *pc = s + i; 119 | printf("%2zd ", i); 120 | switch (pc->op) { 121 | case Char: 122 | printf("char %c\n", pc->ch); 123 | break; 124 | case Any: 125 | puts("any"); 126 | break; 127 | case Split: 128 | printf("split %td, %td\n", pc->left - s, pc->right - s); 129 | break; 130 | case Save: 131 | printf("save %d\n", pc->save_id); 132 | break; 133 | case Jmp: 134 | printf("jmp %td\n", pc->left - s); 135 | break; 136 | case Match: 137 | puts("match"); 138 | break; 139 | default: 140 | abort(); 141 | break; 142 | } 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | #include "regex.h" 2 | 3 | int main(int argc, char *argv[]) 4 | { 5 | char s[MAXN]; 6 | if (argc < 2) 7 | return 1; 8 | Regex *r = newRegex(argv[1]); 9 | printRegex(r, 0); 10 | 11 | Program *p = compile(r); 12 | printProgram(p); 13 | puts(""); 14 | puts("+++"); 15 | puts(""); 16 | 17 | const char *captures[MAXCAP]; 18 | for (int i = 2; i < argc; i++) 19 | if (! run(p, argv[i], captures, MAXCAP)) 20 | puts("unmatch"); 21 | else { 22 | puts("match"); 23 | for (size_t i = 0; i < MAXN; i++) { 24 | if (! captures[i]) break; 25 | printf("%zd: %td\n", i, captures[i] - argv[2]); 26 | } 27 | } 28 | 29 | free(p); 30 | destroyRegex(r); 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /parse.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "regex.h" 6 | 7 | static const int isp[Alternative + 1] = { 8 | [Sentinel] = 1, 9 | [Paren] = 2, 10 | [Alternative] = 4, 11 | [Concat] = 6, 12 | [Optional] = 8, 13 | [Star] = 8, 14 | [Plus] = 8, 15 | [NGOptional] = 8, 16 | [NGStar] = 8, 17 | [NGPlus] = 8, 18 | }; 19 | static const int icp[Alternative + 1] = { 20 | [Sentinel] = 1, 21 | [Paren] = 9, 22 | [Alternative] = 3, 23 | [Concat] = 5, 24 | [Optional] = 7, 25 | [Star] = 7, 26 | [Plus] = 7, 27 | [NGOptional] = 7, 28 | [NGStar] = 7, 29 | [NGPlus] = 7, 30 | [RParen] = 2, 31 | }; 32 | static const enum Token c2t[256] = { 33 | ['\0'] = Sentinel, 34 | ['('] = Paren, 35 | [')'] = RParen, 36 | ['?'] = Optional, 37 | ['+'] = Plus, 38 | ['*'] = Star, 39 | ['|'] = Alternative, 40 | }; 41 | 42 | static Regex *newObj(int type, Regex *left, Regex *right) 43 | { 44 | Regex *r = malloc(sizeof(Regex)); 45 | r->type = type; 46 | r->left = left; 47 | r->right = right; 48 | r->capture_id = 0; // no use 49 | return r; 50 | } 51 | 52 | void destroyRegex(Regex *r) 53 | { 54 | switch (token_type[r->type]) { 55 | case Surround: 56 | case Unary: 57 | destroyRegex(r->left); 58 | break; 59 | case Binary: 60 | destroyRegex(r->left); 61 | destroyRegex(r->right); 62 | break; 63 | } 64 | free(r); 65 | } 66 | 67 | Regex *newRegex(const char *s) 68 | { 69 | static Regex *obj[MAXN]; 70 | static enum Token op[MAXN]; 71 | enum Token tok; 72 | bool flag = false, quit = false; 73 | int ic, nobj = 0, nop = 0, nparen = 0; 74 | op[nop++] = Sentinel; 75 | while (! quit) { 76 | // implicit concat 77 | tok = *s; 78 | if (c2t[tok]) tok = c2t[tok]; 79 | if (flag && (tok < 256 || tok == Paren)) { 80 | flag = false; 81 | tok = Concat; 82 | ic = icp[Concat]; 83 | } else { 84 | s++; 85 | ic = icp[tok]; 86 | flag = tok != Paren && tok != Alternative; 87 | if (tok == Sentinel) 88 | quit = true; 89 | } 90 | 91 | if (! ic) { 92 | if (tok == '.') 93 | obj[nobj++] = newObj(Dot, NULL, NULL); 94 | else { 95 | obj[nobj] = newObj(Lit, NULL, NULL); 96 | obj[nobj++]->ch = tok; 97 | } 98 | continue; 99 | } 100 | 101 | if (! quit && *s == '?') { 102 | if (tok == Optional) 103 | tok = NGOptional, s++; 104 | else if (tok == Star) 105 | tok = NGStar, s++; 106 | else if (tok == Plus) 107 | tok = NGPlus, s++; 108 | } 109 | 110 | bool new_obj = true; 111 | for (; nop > 0 && isp[op[nop-1]] >= ic; nop--) { 112 | switch (token_type[op[nop-1]]) { 113 | case Surround: 114 | if (op[nop-1] == Paren) { 115 | obj[nobj-2]->left = obj[nobj-1]; 116 | nobj--; 117 | } 118 | break; 119 | case Unary: 120 | break; 121 | case Binary: 122 | obj[nobj-2]->right = obj[nobj-1]; 123 | nobj--; 124 | break; 125 | } 126 | if (isp[op[nop-1]] == ic) { 127 | new_obj = false; 128 | nop--; 129 | break; 130 | } 131 | } 132 | if (new_obj) { 133 | op[nop++] = tok; 134 | if (tok == Paren) { 135 | obj[nobj] = newObj(tok, NULL, NULL); 136 | obj[nobj++]->capture_id = nparen++; 137 | } else // unary or binary 138 | obj[nobj-1] = newObj(tok, obj[nobj-1], NULL); 139 | } 140 | } 141 | assert(nop == 0 && nobj == 1); 142 | //return obj[0]; 143 | // implicit .*? 144 | return newObj(Concat, newObj(NGStar, newObj(Dot, NULL, NULL), NULL), newObj(Paren, obj[0], NULL)); 145 | } 146 | 147 | void printRegex(Regex *r, int d) 148 | { 149 | switch (r->type) { 150 | case Alternative: 151 | printf("%*sAlternative\n", 2*d, ""); 152 | printRegex(r->left, d + 1); 153 | printRegex(r->right, d + 1); 154 | break; 155 | case Concat: 156 | printf("%*sConcat\n", 2*d, ""); 157 | printRegex(r->left, d + 1); 158 | printRegex(r->right, d + 1); 159 | break; 160 | case Lit: 161 | printf("%*sLit %c\n", 2*d, "", r->ch); 162 | break; 163 | case Dot: 164 | printf("%*sDot\n", 2*d, ""); 165 | break; 166 | case Paren: 167 | printf("%*sParen\n", 2*d, ""); 168 | printRegex(r->left, d + 1); 169 | break; 170 | case Optional: 171 | printf("%*sOptional\n", 2*d, ""); 172 | printRegex(r->left, d + 1); 173 | break; 174 | case NGOptional: 175 | printf("%*sNGOptional\n", 2*d, ""); 176 | printRegex(r->left, d + 1); 177 | break; 178 | case Star: 179 | printf("%*sStar\n", 2*d, ""); 180 | printRegex(r->left, d + 1); 181 | break; 182 | case NGStar: 183 | printf("%*sNGStar\n", 2*d, ""); 184 | printRegex(r->left, d + 1); 185 | break; 186 | case Plus: 187 | printf("%*sPlus\n", 2*d, ""); 188 | printRegex(r->left, d + 1); 189 | break; 190 | case NGPlus: 191 | printf("%*sNGPlus\n", 2*d, ""); 192 | printRegex(r->left, d + 1); 193 | break; 194 | default: 195 | break; 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /regex.h: -------------------------------------------------------------------------------- 1 | #ifndef REGEX_H 2 | #define REGEX_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define MAXN 92 10 | #define MAXCAP 20 11 | #define swap(x,y) do \ 12 | { unsigned char swap_temp[sizeof(x) == sizeof(y) ? (signed)sizeof(x) : -1]; \ 13 | memcpy(swap_temp,&y,sizeof(x)); \ 14 | memcpy(&y,&x, sizeof(x)); \ 15 | memcpy(&x,swap_temp,sizeof(x)); \ 16 | } while(0) 17 | 18 | enum Token { 19 | Lit = 256, 20 | Dot, 21 | 22 | Sentinel, 23 | Paren, 24 | RParen, 25 | 26 | Optional, 27 | Star, 28 | Plus, 29 | NGOptional, 30 | NGStar, 31 | NGPlus, 32 | 33 | Concat, 34 | Alternative, 35 | }; 36 | 37 | enum Op { 38 | Char, 39 | Any, 40 | Save, 41 | Match, 42 | Split, 43 | Jmp, 44 | }; 45 | 46 | enum TokenType { 47 | NonOp, 48 | Surround, 49 | Unary, 50 | Binary, 51 | }; 52 | 53 | static const enum TokenType token_type[Alternative + 1] = { 54 | [Char] = NonOp, 55 | [Dot] = NonOp, 56 | 57 | [Optional] = Unary, 58 | [Star] = Unary, 59 | [Plus] = Unary, 60 | [NGOptional] = Unary, 61 | [NGStar] = Unary, 62 | [NGPlus] = Unary, 63 | 64 | [Concat] = Binary, 65 | [Alternative] = Binary, 66 | 67 | [Sentinel] = Surround, 68 | [Paren] = Surround, 69 | }; 70 | 71 | typedef struct Regex Regex; 72 | struct Regex 73 | { 74 | enum Token type; 75 | union { 76 | char ch; 77 | int capture_id; 78 | }; 79 | Regex *left, *right; 80 | }; 81 | 82 | typedef struct Instruction Instruction; 83 | struct Instruction 84 | { 85 | enum Op op; 86 | union { 87 | char ch; 88 | int save_id; 89 | }; 90 | int timestamp; 91 | Instruction *left, *right; 92 | }; 93 | 94 | typedef struct Program Program; 95 | struct Program 96 | { 97 | Instruction *start; 98 | size_t size; 99 | }; 100 | 101 | typedef struct Capture Capture; 102 | struct Capture 103 | { 104 | size_t ref, size; 105 | const char *captures[MAXCAP]; 106 | }; 107 | 108 | Regex *newRegex(const char*); 109 | void printRegex(Regex *, int); 110 | void destroyRegex(Regex *r); 111 | 112 | Program *compile(Regex *); 113 | void printProgram(Program *); 114 | 115 | Capture *newCapture(size_t); 116 | Capture *incRef(Capture *); 117 | Capture *decRef(Capture *); 118 | Capture *updateCapture(Capture *, int, const char *); 119 | 120 | bool run(Program *, const char *, const char **, size_t); 121 | 122 | #endif /* end of include guard: REGEX_H */ 123 | -------------------------------------------------------------------------------- /vm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "regex.h" 4 | 5 | typedef struct { 6 | Instruction *pc; 7 | Capture *capture; 8 | } State; 9 | 10 | typedef struct { 11 | size_t size; 12 | State states[0]; 13 | } StateList; 14 | 15 | int tick; 16 | 17 | static StateList* newStateList(size_t size) 18 | { 19 | StateList *sl = malloc(sizeof(StateList) + size * sizeof(State)); 20 | sl->size = 0; 21 | return sl; 22 | } 23 | 24 | static State state(Instruction *pc, Capture *capture) 25 | { 26 | State s = {pc, capture}; 27 | return s; 28 | } 29 | 30 | static void addState(StateList *sl, State s, const char *pos) 31 | { 32 | if (s.pc->timestamp == tick) { 33 | decRef(s.capture); 34 | return; 35 | } 36 | 37 | s.pc->timestamp = tick; 38 | switch (s.pc->op) { 39 | case Jmp: 40 | addState(sl, state(s.pc->left, s.capture), pos); 41 | break; 42 | case Split: 43 | addState(sl, state(s.pc->left, incRef(s.capture)), pos); 44 | addState(sl, state(s.pc->right, s.capture), pos); 45 | break; 46 | case Save: 47 | addState(sl, state(s.pc + 1, updateCapture(s.capture, s.pc->save_id, pos)), pos); 48 | break; 49 | default: 50 | sl->states[sl->size++] = s; 51 | break; 52 | } 53 | } 54 | 55 | bool run(Program *p, const char *input, const char **captures, size_t ncaptures) 56 | { 57 | Capture *capture = newCapture(ncaptures), *match = NULL; 58 | memset(capture->captures, 0, sizeof(*capture->captures) * ncaptures); 59 | 60 | StateList *cur = newStateList(p->size), 61 | *suc = newStateList(p->size); 62 | for (size_t i = 0; i < p->size; i++) 63 | p->start[i].timestamp = 0; 64 | tick = 1; 65 | addState(cur, state(p->start, capture), input); 66 | 67 | for (; ; input++) { 68 | tick++; 69 | for (size_t i = 0; i < cur->size; i++) { 70 | Instruction *pc = cur->states[i].pc; 71 | Capture *capture = cur->states[i].capture; 72 | switch (pc->op) { 73 | case Char: 74 | if (pc->ch == *input) 75 | addState(suc, state(pc + 1, capture), input + 1); 76 | else 77 | decRef(capture); 78 | break; 79 | case Any: 80 | if (*input) 81 | addState(suc, state(pc + 1, capture), input + 1); 82 | else 83 | decRef(capture); 84 | break; 85 | case Match: 86 | if (match) decRef(match); 87 | match = capture; 88 | while (++i < cur->size) 89 | decRef(cur->states[i].capture); 90 | goto Break; 91 | } 92 | } 93 | Break: 94 | { 95 | StateList *t = cur; 96 | cur = suc; 97 | suc = t; 98 | suc->size = 0; 99 | if (! *input) 100 | break; 101 | } 102 | } 103 | 104 | free(cur); 105 | free(suc); 106 | 107 | if (match) { 108 | for (size_t i = 0; i < ncaptures; i++) 109 | captures[i] = match->captures[i]; 110 | decRef(match); 111 | return true; 112 | } 113 | return false; 114 | } 115 | --------------------------------------------------------------------------------