/*
 * Repository listing this source was extracted from:
 *
 *   ├── .gitignore
 *   ├── Makefile
 *   └── re.c
 *
 * /.gitignore:
 *   re
 *   *.sw[po]
 *
 * /Makefile:
 *   re: re.c
 *   	clang -Wall -g -o re re.c
 *
 * /re.c: a small regular expression engine.  A pattern is compiled into
 * an array of Op records (an NFA); matching keeps the set of live Range
 * ops in a bitset and advances it one input byte at a time.
 *
 * NOTE(review): the header names had been stripped from the seven
 * #include lines.  Six are determined by the identifiers used below;
 * <stddef.h> is a guess for the seventh -- confirm against the original.
 */
#include <assert.h>
#include <setjmp.h>
#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void die(char *reason);

/* Op kinds. */
enum {
	Range,	/* consume one byte c with rstart <= c < rstart+rlen */
	Jump,	/* fork: continue both at i+jmp and at i+1 */
	Goto,	/* continue at i+jmp only */
	Accept,	/* the pattern has matched */
};

typedef struct Op Op;
struct Op {
	union {
		unsigned int rstart;	/* Range: first byte value accepted */
		int jmp;		/* Jump/Goto: offset relative to this op */
	};
	unsigned short rlen;		/* Range: number of byte values accepted */
	short ty;			/* one of Range, Jump, Goto, Accept */
};

typedef struct Re Re;
struct Re {
	Op *nfa;		/* compiled program, owned by the Re */
	size_t len, size;	/* ops in use / ops allocated */
};

typedef struct MatchState MatchState;
struct MatchState {
	unsigned char *s, *n;	/* current / next state bitsets, one bit per op */
	size_t pos;		/* number of input bytes consumed so far */
	enum {
		Active,
		Match,
		/* Fail, */
	} state;
};

/* Description of the last compilation error, set when recompile() returns 0. */
char *errs;


/* compiling */

/*
 * Append one op of kind ty to re, growing the array when it is full.
 * Extra arguments by kind:
 *   Range:      unsigned int rstart, int rlen
 *   Jump, Goto: int jmp
 *   Accept:     none
 * Calls die() when memory runs out.
 *
 * ty is an int, not a short: calling va_start on a parameter whose type
 * changes under default argument promotion is undefined behaviour.
 */
static void
pushop(Re *re, int ty, ...)
{
	va_list ap;
	Op *op;

	if (re->len >= re->size) {
		size_t nsize = re->size ? re->size * 2 : 10;
		Op *grown = realloc(re->nfa, nsize * sizeof *grown);

		if (!grown)
			die("(re.c) out of memory");
		re->nfa = grown;
		re->size = nsize;
	}

	op = &re->nfa[re->len++];
	memset(op, 0, sizeof *op);
	op->ty = (short)ty;

	va_start(ap, ty);
	switch (ty) {
	case Range:
		op->rstart = va_arg(ap, unsigned int);
		op->rlen = (unsigned short)va_arg(ap, int);
		break;
	case Jump:
	case Goto:
		op->jmp = va_arg(ap, int);
		break;
	default:
		break;
	}
	va_end(ap);
}

/* Features we support:
 *  - \      (take the next character literally)
 *  - .
 *  - [x-y]
 *  - [^x-y] (not yet)
 *  - |
 *  - ( )
 *  - *
 *  - +
 *  - ?
 */
static Re *pre;		/* program being built by the parser */
static jmp_buf pboom;	/* where parse errors unwind to (set in recompile) */
static char *errloc;	/* position in the pattern of the last parse error */

/* Value of parse()'s atom index while there is nothing to repeat yet. */
#define NOATOM ((size_t)-1)

/* Record a parse error at where and unwind to recompile(). */
static void
fail(char *where, char *what)
{
	errloc = where;
	errs = what;
	longjmp(pboom, 1);
}

/*
 * Insert a Jump op at index at, shifting everything from at onwards up by
 * one slot.  The jump targets the op that will be pushed next, plus extra.
 * Offsets stored in the shifted ops are relative, so they stay valid.
 */
static void
insjump(size_t at, int extra)
{
	Op *slot;

	pushop(pre, Accept);	/* dummy push, only to make room */
	slot = &pre->nfa[at];	/* taken after the push: the array may have moved */
	memmove(slot + 1, slot, (pre->len - 1 - at) * sizeof *slot);
	slot->ty = Jump;
	slot->jmp = (int)(pre->len - at) + extra;
}

static void parseor(char **ps); /* forward ref */

/*
 * Compile one alternative: everything up to the next '|', ')' or the end
 * of the pattern.  *ps is advanced to that terminator.
 */
static void
parse(char **ps)
{
	int c, beg, end;
	size_t op;	/* index of the first op of the most recent atom */
	char *s;

	s = *ps;
	op = NOATOM;

	/* Bytes are read as unsigned char so values >= 0x80 stay positive. */
	for (c = (unsigned char)*s; c && c != '|' && c != ')';
	     c = (unsigned char)*++s)
		switch (c) {

		case '(':
			op = pre->len;
			*ps = s+1;
			parseor(ps);
			s = *ps;
			if (*s != ')')
				fail(s, "expected )");
			break;

		case '[':
			op = pre->len;
			s++;
			/* The short-circuit order keeps s from running past the NUL. */
			if ((beg = (unsigned char)*s++) == 0
			|| *s++ != '-'
			|| (end = (unsigned char)*s++) == 0
			|| *s != ']'
			|| end < beg)	/* a reversed range would wrap rlen */
				fail(s-1, "ill formed range");
			pushop(pre, Range, (unsigned int)beg, end-beg+1);
			break;

		case '.':
			op = pre->len;
			pushop(pre, Range, 0u, 0xffff);
			break;

		case '+':
		case '*':
			if (op == NOATOM)
				fail(s, "nothing to repeat");
			/* Loop back to the start of the atom. */
			pushop(pre, Jump, -(int)(pre->len - op));
			if (c == '+')
				break;
			/* fallthrough */

		case '?':
			if (op == NOATOM)
				fail(s, "nothing to repeat");
			/* Let the atom be skipped altogether. */
			insjump(op, 0);
			break;

		case '\\':
			c = (unsigned char)*++s;
			if (!c)
				fail(s-1, "invalid \\");
			/* fallthrough */

		default:
			op = pre->len;
			pushop(pre, Range, (unsigned int)c, 1);
			break;

		}

	*ps = s;
}

/*
 * Compile a '|'-separated list of alternatives.  On return *ps points at
 * the closing ')' or at the end of the pattern.
 */
static void
parseor(char **ps)
{
	size_t j, g;

	j = pre->len;
	parse(ps);

	while (**ps && **ps != ')') {
		assert(**ps == '|');
		++*ps;

		/*
		 * Fork in front of everything compiled so far: one branch
		 * falls into it, the other skips the Goto pushed below and
		 * lands on the new alternative.
		 */
		insjump(j, 1);
		g = pre->len;
		pushop(pre, Goto, 0);

		parse(ps);

		/* The earlier alternatives leave through the Goto. */
		pre->nfa[g].jmp = (int)(pre->len - g);
	}
}

/*
 * Compile the pattern str into re.  Returns 1 on success.  On failure
 * returns 0, leaves re empty and points errs at a message that stays
 * valid until the next call.
 */
int
recompile(Re *re, char *str)
{
	char *s;
	static char errmsg[512];

	s = str;
	re->nfa = NULL;
	re->len = 0;
	re->size = 0;

	if (setjmp(pboom)) {
		snprintf(errmsg, sizeof errmsg, "%s at character %td",
		         errs, errloc-s);
		errs = errmsg;
		free(re->nfa);
		/* Reset so that a later refree() cannot free twice. */
		re->nfa = NULL;
		re->len = 0;
		re->size = 0;
		return 0;
	}

	pre = re;
	parseor(&str);
	if (*str != 0)
		fail(str, "spurious character");
	pushop(re, Accept);
	return 1;
}

/* Release the program held by re; safe to call more than once. */
void
refree(Re *re)
{
	free(re->nfa);
	re->nfa = NULL;
	re->len = 0;
	re->size = 0;
}


/* matching */

/*
 * Follow Jump and Goto ops starting at op i, setting the bit of every
 * Range op reached in n.  Returns 1 when an Accept op is reachable.
 *
 * seen holds one bit per op and marks the Jump/Goto ops already expanded.
 * Without it a loop whose body can match the empty string, such as
 * (a*)*, recurses without end.
 */
static int
step(unsigned char *n, unsigned char *seen, Re *re, size_t i)
{
	Op *op = &re->nfa[i];
	int acc;

	switch (op->ty) {

	case Range:
		n[i/8] |= 1u << i%8;
		return 0;

	case Accept:
		return 1;

	case Goto:
	case Jump:
		if (seen[i/8] & (1u << i%8))
			return 0;
		seen[i/8] |= 1u << i%8;
		acc = step(n, seen, re, i+op->jmp);
		if (op->ty == Jump)
			acc |= step(n, seen, re, i+1);
		return acc;

	default:
		return 0;

	}
}

/*
 * Prepare m for matching against re.  The state is Match straight away
 * when the pattern accepts the empty string.  Release with reend().
 */
void
rebegin(MatchState *m, Re *re)
{
	size_t nbytes = (re->len+7) / 8;
	unsigned char *seen;

	m->s = calloc(nbytes, 1);
	m->n = calloc(nbytes, 1);
	seen = calloc(nbytes, 1);
	if (!m->s || !m->n || !seen)
		die("(re.c) out of memory");
	m->pos = 0;
	if (step(m->s, seen, re, 0))
		m->state = Match;
	else
		m->state = Active;
	free(seen);
}

/* Release the bitsets allocated by rebegin(). */
void
reend(MatchState *m)
{
	free(m->s);
	free(m->n);
	m->s = m->n = NULL;
}

/*
 * Feed the NUL-terminated string str to the matcher.  Stops at the first
 * byte that completes a match, leaving m->state == Match and m->pos at
 * the number of bytes consumed.  Does nothing unless m is Active.
 */
void
refeed(MatchState *m, Re *re, char *str)
{
	unsigned char *seen, *cur, *t;
	unsigned int c;
	size_t i, nbytes;
	Op *op;

	if (m->state != Active)
		return;

	nbytes = (re->len+7) / 8;
	seen = calloc(nbytes, 1);
	if (!seen)
		die("(re.c) out of memory");

	for (; (c = (unsigned char)*str) != 0; str++) {

		m->pos++;
		cur = m->s;
		/* Restart the pattern at every position. */
		step(m->n, seen, re, 0);

		for (i=0; i<re->len; i++) {
			if ((cur[i/8] & (1u << i%8)) == 0)
				continue;

			op = &re->nfa[i];
			assert(op->ty == Range);
			if (c < op->rstart || c - op->rstart >= op->rlen)
				continue;
			/*
			 * seen is kept across the calls made for one input
			 * byte: whatever an expanded jump reaches is in
			 * m->n already, and a call that reached Accept
			 * would have returned from here.
			 */
			if (step(m->n, seen, re, i+1)) {
				m->state = Match;
				free(seen);
				return;
			}
		}

		t = m->s; /* swap next and current state */
		m->s = m->n;
		m->n = t;
		memset(t, 0, nbytes);
		memset(seen, 0, nbytes);
	}

	free(seen);
}


/* test */

/* Print reason on stderr and exit with status 1. */
void
die(char *reason)
{
	fprintf(stderr, "dying, %s\n", reason);
	exit(1);
}

/* Print the compiled program of re on f, one op per line. */
void
repp(FILE *f, Re *re)
{
	size_t i;

	for (i=0; i<re->len; i++)
		switch (re->nfa[i].ty) {
		case Accept:
			fprintf(f, "%02zu: Accept\n", i);
			break;
		case Jump:
			fprintf(f, "%02zu: Jump %d\n", i,
			        re->nfa[i].jmp);
			break;
		case Goto:
			fprintf(f, "%02zu: Goto %d\n", i,
			        re->nfa[i].jmp);
			break;
		case Range:
			fprintf(f, "%02zu: Range %u %hu\n", i,
			        re->nfa[i].rstart, re->nfa[i].rlen);
			break;
		default:
			break;
		}
}

/*
 * Command line driver: compiles av[1], prints the program, then matches
 * every line of standard input against it.  Define RE_NO_MAIN to leave
 * it out and link the engine into another program.
 *
 * NOTE(review): the text under review breaks off inside the for-loop
 * condition below.  The function is reproduced exactly as far as it is
 * visible and is NOT completed here; restore the remainder from the
 * original file.
 */
#ifndef RE_NO_MAIN
int
main(int ac, char *av[])
{
	char line[8196];
	MatchState ms;
	Re re;

	if (ac<2)
		return 1;

	if (!recompile(&re, av[1])) {
		printf("regexp compilation error: %s\n", errs);
		return 1;
	}
	repp(stdout, &re);

	while (fgets(line, 8196, stdin)) {
		rebegin(&ms, &re);
		refeed(&ms, &re, line);
		if (ms.state == Match) {
			size_t s;

			for (s=0; s
#endif /* RE_NO_MAIN */