├── .github └── workflows │ ├── c.yml │ └── coverity.yml ├── Makefile ├── NOTICE ├── README.md ├── regaux.c ├── regcomp.c ├── regcomp.h ├── regerror.c ├── regexec.c ├── regexp9.3 ├── regexp9.7 ├── regexp9.h ├── regsub.c ├── rregexec.c ├── rregsub.c ├── test.c ├── test2.c ├── utf.c └── utf.h /.github/workflows/c.yml: -------------------------------------------------------------------------------- 1 | name: C 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: make clean 17 | run: make clean 18 | - name: make 19 | run: make 20 | -------------------------------------------------------------------------------- /.github/workflows/coverity.yml: -------------------------------------------------------------------------------- 1 | name: Coverity Scan 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | coverity: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: vapier/coverity-scan-action@v1 13 | with: 14 | email: ${{ secrets.COVERITY_SCAN_EMAIL }} 15 | token: ${{ secrets.COVERITY_SCAN_TOKEN }} 16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CFLAGS+=-Wall -Wextra -O3 -c -g 3 | O=o 4 | LIB=libregexp9.a 5 | 6 | RANLIB=true 7 | 8 | LIB=libregexp9.a 9 | 10 | OFILES=\ 11 | regcomp.$O\ 12 | regerror.$O\ 13 | regexec.$O\ 14 | regsub.$O\ 15 | regaux.$O\ 16 | rregexec.$O\ 17 | rregsub.$O\ 18 | utf.$O\ 19 | 20 | HFILES=\ 21 | regexp9.h\ 22 | regcomp.h\ 23 | utf.h\ 24 | 25 | all: $(LIB) 26 | 27 | install: $(LIB) 28 | mkdir -p $(PREFIX)/share/man/man3 $(PREFIX)/share/man/man7 29 | install -m 0644 regexp9.3 $(PREFIX)/share/man/man3/regexp9.3 30 | install -m 0644 regexp9.7 $(PREFIX)/share/man/man7/regexp9.7 31 | mkdir -p $(PREFIX)/lib 32 | install 
-m 0644 $(LIB) $(PREFIX)/lib/$(LIB) 33 | mkdir -p $(PREFIX)/include 34 | install -m 0644 regexp9.h $(PREFIX)/include/regexp9.h 35 | 36 | test: test.$O $(LIB) 37 | $(CC) -o test test.$O $(LIB) 38 | 39 | test2: test2.$O $(LIB) 40 | $(CC) -o test2 test2.$O $(LIB) 41 | 42 | $(LIB): $(OFILES) 43 | $(AR) $(ARFLAGS) $(LIB) $(OFILES) 44 | $(RANLIB) $(LIB) 45 | 46 | %.$O: %.c 47 | $(CC) $(CFLAGS) $*.c 48 | 49 | $(OFILES): $(HFILES) 50 | 51 | clean: 52 | rm -f $(OFILES) $(LIB) 53 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | This is a Unix port of the Plan 9 regular expression library, by Rob Pike. 2 | Please send comments about the packaging to Russ Cox . 3 | 4 | Copyright © 2021 Plan 9 Foundation 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://github.com/0intro/libregexp/workflows/C/badge.svg)](https://github.com/0intro/libregexp/actions/workflows/c.yml) 2 | [![Coverity Scan Build Status](https://scan.coverity.com/projects/0intro-libregexp/badge.svg)](https://scan.coverity.com/projects/0intro-libregexp) 3 | 4 | This is a Unix port of the Plan 9 regular expression library, 5 | originally done for the Inferno operating system. 6 | 7 | Russ Cox repackaged this to build as a standalone 8 | Unix library. 9 | 10 | David du Colombier repackaged Russ Cox's work, 11 | removed dependencies on libutf and libfmt and 12 | fixed various issues. 13 | -------------------------------------------------------------------------------- /regaux.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "regexp9.h" 3 | #include "regcomp.h" 4 | 5 | 6 | /* 7 | * save a new match in mp 8 | */ 9 | extern void 10 | _renewmatch(Resub *mp, int ms, Resublist *sp) 11 | { 12 | int i; 13 | 14 | if(mp==0 || ms<=0) 15 | return; 16 | if(mp[0].s.sp==0 || sp->m[0].s.spm[0].s.sp==mp[0].s.sp && sp->m[0].e.ep>mp[0].e.ep)){ 18 | for(i=0; im[i]; 20 | for(; iinst; p++){ 39 | if(p->inst == ip){ 40 | if(sep->m[0].s.sp < p->se.m[0].s.sp){ 41 | if(ms > 1) 42 | p->se = *sep; 43 | else 44 | p->se.m[0] = sep->m[0]; 45 | } 46 | return 0; 47 | } 48 | } 49 | p->inst = ip; 50 | if(ms > 1) 51 | p->se = *sep; 52 | else 53 | p->se.m[0] = sep->m[0]; 54 | (++p)->inst = 0; 55 | return p; 56 | } 57 | 58 | /* 59 | * same as renewthread, but called with 60 | * initial empty start pointer. 
61 | */ 62 | extern Relist* 63 | _renewemptythread(Relist *lp, /* _relist to add to */ 64 | Reinst *ip, /* instruction to add */ 65 | int ms, 66 | char *sp) /* pointers to subexpressions */ 67 | { 68 | Relist *p; 69 | 70 | for(p=lp; p->inst; p++){ 71 | if(p->inst == ip){ 72 | if(sp < p->se.m[0].s.sp) { 73 | if(ms > 1) 74 | memset(&p->se, 0, sizeof(p->se)); 75 | p->se.m[0].s.sp = sp; 76 | } 77 | return 0; 78 | } 79 | } 80 | p->inst = ip; 81 | if(ms > 1) 82 | memset(&p->se, 0, sizeof(p->se)); 83 | p->se.m[0].s.sp = sp; 84 | (++p)->inst = 0; 85 | return p; 86 | } 87 | 88 | extern Relist* 89 | _rrenewemptythread(Relist *lp, /* _relist to add to */ 90 | Reinst *ip, /* instruction to add */ 91 | int ms, 92 | Rune *rsp) /* pointers to subexpressions */ 93 | { 94 | Relist *p; 95 | 96 | for(p=lp; p->inst; p++){ 97 | if(p->inst == ip){ 98 | if(rsp < p->se.m[0].s.rsp) { 99 | if(ms > 1) 100 | memset(&p->se, 0, sizeof(p->se)); 101 | p->se.m[0].s.rsp = rsp; 102 | } 103 | return 0; 104 | } 105 | } 106 | p->inst = ip; 107 | if(ms > 1) 108 | memset(&p->se, 0, sizeof(p->se)); 109 | p->se.m[0].s.rsp = rsp; 110 | (++p)->inst = 0; 111 | return p; 112 | } 113 | -------------------------------------------------------------------------------- /regcomp.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "regexp9.h" 6 | #include "regcomp.h" 7 | 8 | #define TRUE 1 9 | #define FALSE 0 10 | 11 | /* 12 | * Parser Information 13 | */ 14 | typedef 15 | struct Node 16 | { 17 | Reinst* first; 18 | Reinst* last; 19 | }Node; 20 | 21 | /* max character classes per program is nelem(reprog->class) */ 22 | static Reprog *reprog; 23 | 24 | /* max rune ranges per character class is nelem(classp->spans)/2 */ 25 | #define NCCRUNE nelem(classp->spans) 26 | 27 | #define NSTACK 20 28 | static Node andstack[NSTACK]; 29 | static Node *andp; 30 | static int atorstack[NSTACK]; 31 | static int* atorp; 32 | static int 
cursubid; /* id of current subexpression */ 33 | static int subidstack[NSTACK]; /* parallel to atorstack */ 34 | static int* subidp; 35 | static int lastwasand; /* Last token was operand */ 36 | static int nbra; 37 | static char* exprp; /* pointer to next character in source expression */ 38 | static int lexdone; 39 | static unsigned int nclass; 40 | static Reclass*classp; 41 | static Reinst* freep; 42 | static int errors; 43 | static Rune yyrune; /* last lex'd rune */ 44 | static Reclass*yyclassp; /* last lex'd class */ 45 | 46 | /* predeclared crap */ 47 | static void operator(int); 48 | static void pushand(Reinst*, Reinst*); 49 | static void pushator(int); 50 | static void evaluntil(int); 51 | static int bldcclass(void); 52 | 53 | static jmp_buf regkaboom; 54 | 55 | static void 56 | rcerror(char *s) 57 | { 58 | errors++; 59 | regerror(s); 60 | longjmp(regkaboom, 1); 61 | } 62 | 63 | static Reinst* 64 | newinst(int t) 65 | { 66 | freep->type = t; 67 | freep->l.left = 0; 68 | freep->r.right = 0; 69 | return freep++; 70 | } 71 | 72 | static void 73 | operand(int t) 74 | { 75 | Reinst *i; 76 | 77 | if(lastwasand) 78 | operator(CAT); /* catenate is implicit */ 79 | i = newinst(t); 80 | 81 | if(t == CCLASS || t == NCCLASS) 82 | i->r.cp = yyclassp; 83 | if(t == RUNE) 84 | i->r.r = yyrune; 85 | 86 | pushand(i, i); 87 | lastwasand = TRUE; 88 | } 89 | 90 | static void 91 | operator(int t) 92 | { 93 | if(t==RBRA && --nbra<0) 94 | rcerror("unmatched right paren"); 95 | if(t==LBRA){ 96 | if(++cursubid >= NSUBEXP) 97 | rcerror("too many subexpressions"); 98 | nbra++; 99 | if(lastwasand) 100 | operator(CAT); 101 | }else 102 | evaluntil(t); 103 | if(t != RBRA) 104 | pushator(t); 105 | lastwasand = FALSE; 106 | if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 107 | lastwasand = TRUE; /* these look like operands */ 108 | } 109 | /* report message s followed by the single character c; s is truncated so that c and the NUL always fit in buf */ 110 | static void 111 | regerr2(char *s, int c) 112 | { 113 | char buf[100]; 114 | char *cp = buf; 115 | while(*s && cp < buf+sizeof(buf)-2) /* leave two bytes: one for c, one for the terminator */ 116 | *cp++ = *s++; 117 | *cp++ = c; 118 
| *cp = '\0'; 119 | rcerror(buf); 120 | } 121 | 122 | static void 123 | cant(char *s) 124 | { 125 | char buf[100]; 126 | strncpy(buf, "can't happen: ", sizeof(buf)); 127 | strncat(buf, s, sizeof(buf)-1); 128 | rcerror(buf); 129 | } 130 | 131 | static void 132 | pushand(Reinst *f, Reinst *l) 133 | { 134 | if(andp >= &andstack[NSTACK]) 135 | cant("operand stack overflow"); 136 | andp->first = f; 137 | andp->last = l; 138 | andp++; 139 | } 140 | 141 | static void 142 | pushator(int t) 143 | { 144 | if(atorp >= &atorstack[NSTACK]) 145 | cant("operator stack overflow"); 146 | *atorp++ = t; 147 | *subidp++ = cursubid; 148 | } 149 | 150 | static Node* 151 | popand(int op) 152 | { 153 | Reinst *inst; 154 | 155 | if(andp <= &andstack[0]){ 156 | regerr2("missing operand for ", op); 157 | inst = newinst(NOP); 158 | pushand(inst,inst); 159 | } 160 | return --andp; 161 | } 162 | 163 | static int 164 | popator(void) 165 | { 166 | if(atorp <= &atorstack[0]) 167 | cant("operator stack underflow"); 168 | --subidp; 169 | return *--atorp; 170 | } 171 | 172 | static void 173 | evaluntil(int pri) 174 | { 175 | Node *op1, *op2; 176 | Reinst *inst1, *inst2; 177 | 178 | while(pri==RBRA || atorp[-1]>=pri){ 179 | switch(popator()){ 180 | default: 181 | rcerror("unknown operator in evaluntil"); 182 | break; 183 | case LBRA: /* must have been RBRA */ 184 | op1 = popand('('); 185 | inst2 = newinst(RBRA); 186 | inst2->r.subid = *subidp; 187 | op1->last->l.next = inst2; 188 | inst1 = newinst(LBRA); 189 | inst1->r.subid = *subidp; 190 | inst1->l.next = op1->first; 191 | pushand(inst1, inst2); 192 | return; 193 | case OR: 194 | op2 = popand('|'); 195 | op1 = popand('|'); 196 | inst2 = newinst(NOP); 197 | op2->last->l.next = inst2; 198 | op1->last->l.next = inst2; 199 | inst1 = newinst(OR); 200 | inst1->r.right = op1->first; 201 | inst1->l.left = op2->first; 202 | pushand(inst1, inst2); 203 | break; 204 | case CAT: 205 | op2 = popand(0); 206 | op1 = popand(0); 207 | op1->last->l.next = op2->first; 
208 | pushand(op1->first, op2->last); 209 | break; 210 | case STAR: 211 | op2 = popand('*'); 212 | inst1 = newinst(OR); 213 | op2->last->l.next = inst1; 214 | inst1->r.right = op2->first; 215 | pushand(inst1, inst1); 216 | break; 217 | case PLUS: 218 | op2 = popand('+'); 219 | inst1 = newinst(OR); 220 | op2->last->l.next = inst1; 221 | inst1->r.right = op2->first; 222 | pushand(op2->first, inst1); 223 | break; 224 | case QUEST: 225 | op2 = popand('?'); 226 | inst1 = newinst(OR); 227 | inst2 = newinst(NOP); 228 | inst1->l.left = inst2; 229 | inst1->r.right = op2->first; 230 | op2->last->l.next = inst2; 231 | pushand(inst1, inst2); 232 | break; 233 | } 234 | } 235 | } 236 | 237 | static Reprog* 238 | optimize(Reprog *pp) 239 | { 240 | Reinst *inst, *target; 241 | int size; 242 | Reprog *npp; 243 | Reclass *cl; 244 | int diff; 245 | 246 | /* 247 | * get rid of NOOP chains 248 | */ 249 | for(inst=pp->firstinst; inst->type!=END; inst++){ 250 | target = inst->l.next; 251 | while(target->type == NOP) 252 | target = target->l.next; 253 | inst->l.next = target; 254 | } 255 | 256 | /* 257 | * The original allocation is for an area larger than 258 | * necessary. Reallocate to the actual space used 259 | * and then relocate the code. 
260 | */ 261 | size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); 262 | npp = realloc(pp, size); 263 | if(npp==0 || npp==pp) 264 | return pp; 265 | diff = (char *)npp - (char *)pp; 266 | freep = (Reinst *)((char *)freep + diff); 267 | for(inst=npp->firstinst; insttype){ 269 | case OR: 270 | case STAR: 271 | case PLUS: 272 | case QUEST: 273 | inst->r.right = (void*)((char*)inst->r.right + diff); 274 | break; 275 | case CCLASS: 276 | case NCCLASS: 277 | inst->r.right = (void*)((char*)inst->r.right + diff); 278 | cl = inst->r.cp; 279 | cl->end = (void*)((char*)cl->end + diff); 280 | break; 281 | } 282 | inst->l.left = (void*)((char*)inst->l.left + diff); 283 | } 284 | npp->startinst = (void*)((char*)npp->startinst + diff); 285 | return npp; 286 | } 287 | 288 | #ifdef DEBUG 289 | static void 290 | dumpstack(void){ 291 | Node *stk; 292 | int *ip; 293 | 294 | printf("operators\n"); 295 | for(ip=atorstack; ipfirst->type, stk->last->type); 300 | } 301 | 302 | static void 303 | dump(Reprog *pp) 304 | { 305 | Reinst *l; 306 | Rune *p; 307 | 308 | l = pp->firstinst; 309 | do{ 310 | printf("%d:\t0%o\t%d\t%d", (int)(l-pp->firstinst), l->type, 311 | (int)(l->l.left-pp->firstinst), (int)(l->r.right-pp->firstinst)); 312 | if(l->type == RUNE) 313 | printf("\t%C\n", l->r.r); 314 | else if(l->type == CCLASS || l->type == NCCLASS){ 315 | printf("\t["); 316 | if(l->type == NCCLASS) 317 | printf("^"); 318 | for(p = l->r.cp->spans; p < l->r.cp->end; p += 2) 319 | if(p[0] == p[1]) 320 | printf("%C", p[0]); 321 | else 322 | printf("%C-%C", p[0], p[1]); 323 | printf("]\n"); 324 | } else 325 | printf("\n"); 326 | }while(l++->type); 327 | } 328 | #endif 329 | 330 | static Reclass* 331 | newclass(void) 332 | { 333 | if(nclass >= nelem(reprog->class)) 334 | rcerror("too many character classes; increase Reprog.class size"); 335 | return &(classp[nclass++]); 336 | } 337 | 338 | static int 339 | nextc(Rune *rp) 340 | { 341 | if(lexdone){ 342 | *rp = 0; 343 | return 1; 344 | } 345 | 
exprp += chartorune(rp, exprp); 346 | if(*rp == '\\'){ 347 | exprp += chartorune(rp, exprp); 348 | return 1; 349 | } 350 | if(*rp == 0) 351 | lexdone = 1; 352 | return 0; 353 | } 354 | 355 | static int 356 | lex(int literal, int dot_type) 357 | { 358 | int quoted; 359 | 360 | quoted = nextc(&yyrune); 361 | if(literal || quoted){ 362 | if(yyrune == 0) 363 | return END; 364 | return RUNE; 365 | } 366 | 367 | switch(yyrune){ 368 | case 0: 369 | return END; 370 | case '*': 371 | return STAR; 372 | case '?': 373 | return QUEST; 374 | case '+': 375 | return PLUS; 376 | case '|': 377 | return OR; 378 | case '.': 379 | return dot_type; 380 | case '(': 381 | return LBRA; 382 | case ')': 383 | return RBRA; 384 | case '^': 385 | return BOL; 386 | case '$': 387 | return EOL; 388 | case '[': 389 | return bldcclass(); 390 | } 391 | return RUNE; 392 | } 393 | 394 | static int 395 | bldcclass(void) 396 | { 397 | int type; 398 | Rune r[NCCRUNE]; 399 | Rune *p, *ep, *np; 400 | Rune rune; 401 | int quoted; 402 | 403 | /* we have already seen the '[' */ 404 | type = CCLASS; 405 | yyclassp = newclass(); 406 | 407 | /* look ahead for negation */ 408 | /* SPECIAL CASE!!! 
negated classes don't match \n */ 409 | ep = r; 410 | quoted = nextc(&rune); 411 | if(!quoted && rune == '^'){ 412 | type = NCCLASS; 413 | quoted = nextc(&rune); 414 | *ep++ = '\n'; 415 | *ep++ = '\n'; 416 | } 417 | 418 | /* parse class into a set of spans */ 419 | while(ep < &r[NCCRUNE-1]){ 420 | if(rune == 0){ 421 | rcerror("malformed '[]'"); 422 | return 0; 423 | } 424 | if(!quoted && rune == ']') 425 | break; 426 | if(!quoted && rune == '-'){ 427 | if(ep == r){ 428 | rcerror("malformed '[]'"); 429 | return 0; 430 | } 431 | quoted = nextc(&rune); 432 | if((!quoted && rune == ']') || rune == 0){ 433 | rcerror("malformed '[]'"); 434 | return 0; 435 | } 436 | *(ep-1) = rune; 437 | } else { 438 | *ep++ = rune; 439 | *ep++ = rune; 440 | } 441 | quoted = nextc(&rune); 442 | } 443 | if(ep >= &r[NCCRUNE-1]) { 444 | rcerror("char class too large; increase Reclass.spans size"); 445 | return 0; 446 | } 447 | 448 | /* sort on span start */ 449 | for(p = r; p < ep; p += 2){ 450 | for(np = p; np < ep; np += 2) 451 | if(*np < *p){ 452 | rune = np[0]; 453 | np[0] = p[0]; 454 | p[0] = rune; 455 | rune = np[1]; 456 | np[1] = p[1]; 457 | p[1] = rune; 458 | } 459 | } 460 | 461 | /* merge spans */ 462 | np = yyclassp->spans; 463 | p = r; 464 | if(r == ep) 465 | yyclassp->end = np; 466 | else { 467 | np[0] = *p++; 468 | np[1] = *p++; 469 | for(; p < ep; p += 2) 470 | /* overlapping or adjacent ranges? 
*/ 471 | if(p[0] <= np[1] + 1){ 472 | if(p[1] >= np[1]) 473 | np[1] = p[1]; /* coalesce */ 474 | } else { 475 | np += 2; 476 | np[0] = p[0]; 477 | np[1] = p[1]; 478 | } 479 | yyclassp->end = np+2; 480 | } 481 | 482 | return type; 483 | } 484 | 485 | static Reprog* 486 | regcomp1(char *s, int literal, int dot_type) 487 | { 488 | int token; 489 | Reprog *volatile pp; 490 | 491 | /* get memory for the program */ 492 | pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); 493 | if(pp == 0){ 494 | regerror("out of memory"); 495 | return 0; 496 | } 497 | freep = pp->firstinst; 498 | classp = pp->class; 499 | errors = 0; 500 | 501 | if(setjmp(regkaboom)) 502 | goto out; 503 | 504 | /* go compile the sucker */ 505 | lexdone = 0; 506 | exprp = s; 507 | nclass = 0; 508 | nbra = 0; 509 | atorp = atorstack; 510 | andp = andstack; 511 | subidp = subidstack; 512 | lastwasand = FALSE; 513 | cursubid = 0; 514 | 515 | /* Start with a low priority operator to prime parser */ 516 | pushator(START-1); 517 | while((token = lex(literal, dot_type)) != END){ 518 | if((token&0300) == OPERATOR) 519 | operator(token); 520 | else 521 | operand(token); 522 | } 523 | 524 | /* Close with a low priority operator */ 525 | evaluntil(START); 526 | 527 | /* Force END */ 528 | operand(END); 529 | evaluntil(START); 530 | #ifdef DEBUG 531 | dumpstack(); 532 | #endif 533 | if(nbra) 534 | rcerror("unmatched left paren"); 535 | --andp; /* points to first and only operand */ 536 | pp->startinst = andp->first; 537 | #ifdef DEBUG 538 | dump(pp); 539 | #endif 540 | pp = optimize(pp); 541 | #ifdef DEBUG 542 | printf("start: %d\n", (int)(andp->first-pp->firstinst)); 543 | dump(pp); 544 | #endif 545 | out: 546 | if(errors){ 547 | free(pp); 548 | pp = 0; 549 | } 550 | return pp; 551 | } 552 | 553 | extern Reprog* 554 | regcomp(char *s) 555 | { 556 | return regcomp1(s, 0, ANY); 557 | } 558 | 559 | extern Reprog* 560 | regcomplit(char *s) 561 | { 562 | return regcomp1(s, 1, ANY); 563 | } 564 | 565 | extern 
Reprog* 566 | regcompnl(char *s) 567 | { 568 | return regcomp1(s, 0, ANYNL); 569 | } 570 | -------------------------------------------------------------------------------- /regcomp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * substitution list 3 | */ 4 | #define nelem(x) (sizeof(x)/sizeof((x)[0])) /* element count of a true array (not a pointer) */ 5 | 6 | #define NSUBEXP 32 /* slots in a Resublist; m[0] is the whole match */ 7 | typedef struct Resublist Resublist; 8 | struct Resublist 9 | { 10 | Resub m[NSUBEXP]; 11 | }; 12 | 13 | /* 14 | * Actions and Tokens (Reinst types) 15 | * 16 | * 02xx are operators, value == precedence 17 | * 03xx are tokens, i.e. operands for operators 18 | */ 19 | #define RUNE 0177 /* Literal rune operand */ 20 | #define OPERATOR 0200 /* Bitmask of all operators */ 21 | #define START 0200 /* Start, used for marker on stack */ 22 | #define RBRA 0201 /* Right bracket, ) */ 23 | #define LBRA 0202 /* Left bracket, ( */ 24 | #define OR 0203 /* Alternation, | */ 25 | #define CAT 0204 /* Concatenation, implicit operator */ 26 | #define STAR 0205 /* Closure, * */ 27 | #define PLUS 0206 /* a+ == aa* */ 28 | #define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's */ 29 | #define ANY 0300 /* Any character except newline, . */ 30 | #define ANYNL 0301 /* Any character including newline, . 
*/ 31 | #define NOP 0302 /* No operation, internal use only */ 32 | #define BOL 0303 /* Beginning of line, ^ */ 33 | #define EOL 0304 /* End of line, $ */ 34 | #define CCLASS 0305 /* Character class, [] */ 35 | #define NCCLASS 0306 /* Negated character class, [] */ 36 | #define END 0377 /* Terminate: match found */ 37 | 38 | /* 39 | * regexec execution lists 40 | */ 41 | #define LISTSIZE 10 42 | #define BIGLISTSIZE (25*LISTSIZE) 43 | typedef struct Relist Relist; 44 | struct Relist 45 | { 46 | Reinst* inst; /* Reinstruction of the thread */ 47 | Resublist se; /* matched subexpressions in this thread */ 48 | }; 49 | typedef struct Reljunk Reljunk; 50 | struct Reljunk 51 | { 52 | Relist* relist[2]; 53 | Relist* reliste[2]; 54 | int starttype; 55 | Rune startchar; 56 | char* starts; 57 | char* eol; 58 | Rune* rstarts; 59 | Rune* reol; 60 | }; 61 | 62 | extern Relist* _renewthread(Relist*, Reinst*, int, Resublist*); 63 | extern void _renewmatch(Resub*, int, Resublist*); 64 | extern Relist* _renewemptythread(Relist*, Reinst*, int, char*); 65 | extern Relist* _rrenewemptythread(Relist*, Reinst*, int, Rune*); 66 | -------------------------------------------------------------------------------- /regerror.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "regexp9.h" 5 | 6 | void 7 | regerror(char *s) 8 | { 9 | char buf[132]; 10 | 11 | strncpy(buf, "regerror: ", sizeof(buf)); 12 | strncat(buf, s, sizeof(buf)-1); 13 | strncat(buf, "\n", sizeof(buf)-1); 14 | write(2, buf, strlen(buf)); 15 | exit(1); 16 | } 17 | -------------------------------------------------------------------------------- /regexec.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "regexp9.h" 3 | #include "regcomp.h" 4 | 5 | 6 | /* 7 | * return 0 if no match 8 | * >0 if a match 9 | * <0 if we ran out of _relist space 10 | */ 11 | static int 12 | regexec1(Reprog 
*progp, /* program to run */ 13 | char *bol, /* string to run machine on */ 14 | Resub *mp, /* subexpression elements */ 15 | int ms, /* number of elements at mp */ 16 | Reljunk *j 17 | ) 18 | { 19 | int flag=0; 20 | Reinst *inst; 21 | Relist *tlp; 22 | char *s; 23 | int i, checkstart; 24 | Rune r, *rp, *ep; 25 | int n; 26 | Relist* tl; /* This list, next list */ 27 | Relist* nl; 28 | Relist* tle; /* ends of this and next list */ 29 | Relist* nle; 30 | int match; 31 | char *p; 32 | 33 | match = 0; 34 | checkstart = j->starttype; 35 | if(mp) 36 | for(i=0; irelist[0][0].inst = 0; 41 | j->relist[1][0].inst = 0; 42 | 43 | /* Execute machine once for each character, including terminal NUL */ 44 | s = j->starts; 45 | do{ 46 | /* fast check for first char */ 47 | if(checkstart) { 48 | switch(j->starttype) { 49 | case RUNE: 50 | p = utfrune(s, j->startchar); 51 | if(p == 0 || s == j->eol) 52 | return match; 53 | s = p; 54 | break; 55 | case BOL: 56 | if(s == bol) 57 | break; 58 | p = utfrune(s, '\n'); 59 | if(p == 0 || s == j->eol) 60 | return match; 61 | s = p+1; 62 | break; 63 | } 64 | } 65 | r = *(unsigned char*)s; 66 | if(r < Runeself) 67 | n = 1; 68 | else 69 | n = chartorune(&r, s); 70 | 71 | /* switch run lists */ 72 | tl = j->relist[flag]; 73 | tle = j->reliste[flag]; 74 | nl = j->relist[flag^=1]; 75 | nle = j->reliste[flag]; 76 | nl->inst = 0; 77 | 78 | /* Add first instruction to current list */ 79 | if(match == 0) 80 | _renewemptythread(tl, progp->startinst, ms, s); 81 | 82 | /* Execute machine until current list is empty */ 83 | for(tlp=tl; tlp->inst; tlp++){ /* assignment = */ 84 | for(inst = tlp->inst; inst; inst = inst->l.next){ 85 | switch(inst->type){ 86 | case RUNE: /* regular character */ 87 | if(inst->r.r == r){ 88 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 89 | return -1; 90 | } 91 | break; 92 | case LBRA: 93 | tlp->se.m[inst->r.subid].s.sp = s; 94 | continue; 95 | case RBRA: 96 | tlp->se.m[inst->r.subid].e.ep = s; 97 | continue; 98 | case 
ANY: 99 | if(r != '\n') 100 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 101 | return -1; 102 | break; 103 | case ANYNL: 104 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 105 | return -1; 106 | break; 107 | case BOL: 108 | if(s == bol || *(s-1) == '\n') 109 | continue; 110 | break; 111 | case EOL: 112 | if(s == j->eol || r == 0 || r == '\n') 113 | continue; 114 | break; 115 | case CCLASS: 116 | ep = inst->r.cp->end; 117 | for(rp = inst->r.cp->spans; rp < ep; rp += 2) 118 | if(r >= rp[0] && r <= rp[1]){ 119 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 120 | return -1; 121 | break; 122 | } 123 | break; 124 | case NCCLASS: 125 | ep = inst->r.cp->end; 126 | for(rp = inst->r.cp->spans; rp < ep; rp += 2) 127 | if(r >= rp[0] && r <= rp[1]) 128 | break; 129 | if(rp == ep) 130 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 131 | return -1; 132 | break; 133 | case OR: 134 | /* evaluate right choice later */ 135 | if(_renewthread(tlp, inst->r.right, ms, &tlp->se) == tle) 136 | return -1; 137 | /* efficiency: advance and re-evaluate */ 138 | continue; 139 | case END: /* Match! 
*/ 140 | match = 1; 141 | tlp->se.m[0].e.ep = s; 142 | if(mp != 0) 143 | _renewmatch(mp, ms, &tlp->se); 144 | break; 145 | } 146 | break; 147 | } 148 | } 149 | if(s == j->eol) 150 | break; 151 | checkstart = j->starttype && nl->inst==0; 152 | s += n; 153 | }while(r); 154 | return match; 155 | } 156 | 157 | static int 158 | regexec2(Reprog *progp, /* program to run */ 159 | char *bol, /* string to run machine on */ 160 | Resub *mp, /* subexpression elements */ 161 | int ms, /* number of elements at mp */ 162 | Reljunk *j 163 | ) 164 | { 165 | int rv; 166 | Relist *relist0, *relist1; 167 | 168 | /* mark space */ 169 | relist0 = malloc(BIGLISTSIZE*sizeof(Relist)); 170 | if(relist0 == NULL) 171 | return -1; 172 | relist1 = malloc(BIGLISTSIZE*sizeof(Relist)); 173 | if(relist1 == NULL){ 174 | free(relist0); 175 | return -1; 176 | } 177 | j->relist[0] = relist0; 178 | j->relist[1] = relist1; 179 | j->reliste[0] = relist0 + BIGLISTSIZE - 2; 180 | j->reliste[1] = relist1 + BIGLISTSIZE - 2; 181 | 182 | rv = regexec1(progp, bol, mp, ms, j); 183 | free(relist0); 184 | free(relist1); 185 | return rv; 186 | } 187 | 188 | extern int 189 | regexec(Reprog *progp, /* program to run */ 190 | char *bol, /* string to run machine on */ 191 | Resub *mp, /* subexpression elements */ 192 | int ms) /* number of elements at mp */ 193 | { 194 | Reljunk j; 195 | Relist relist0[LISTSIZE], relist1[LISTSIZE]; 196 | int rv; 197 | 198 | /* 199 | * use user-specified starting/ending location if specified 200 | */ 201 | j.starts = bol; 202 | j.eol = 0; 203 | if(mp && ms>0){ 204 | if(mp->s.sp) 205 | j.starts = mp->s.sp; 206 | if(mp->e.ep) 207 | j.eol = mp->e.ep; 208 | } 209 | j.starttype = 0; 210 | j.startchar = 0; 211 | if(progp->startinst->type == RUNE && progp->startinst->r.r < Runeself) { 212 | j.starttype = RUNE; 213 | j.startchar = progp->startinst->r.r; 214 | } 215 | if(progp->startinst->type == BOL) 216 | j.starttype = BOL; 217 | 218 | /* mark space */ 219 | j.relist[0] = relist0; 220 | 
j.relist[1] = relist1; 221 | j.reliste[0] = relist0 + nelem(relist0) - 2; 222 | j.reliste[1] = relist1 + nelem(relist1) - 2; 223 | 224 | rv = regexec1(progp, bol, mp, ms, &j); 225 | if(rv >= 0) 226 | return rv; 227 | rv = regexec2(progp, bol, mp, ms, &j); 228 | if(rv >= 0) 229 | return rv; 230 | return -1; 231 | } 232 | -------------------------------------------------------------------------------- /regexp9.3: -------------------------------------------------------------------------------- 1 | .TH REGEXP 3 2 | .SH NAME 3 | regcomp, regcomplit, regcompnl, regexec, regsub, rregexec, rregsub, regerror \- regular expression 4 | .SH SYNOPSIS 5 | .B #include 6 | .br 7 | .B #include 8 | .br 9 | .B #include 10 | .PP 11 | .ta \w'\fLRegprog 'u 12 | .B 13 | Reprog *regcomp(char *exp) 14 | .PP 15 | .B 16 | Reprog *regcomplit(char *exp) 17 | .PP 18 | .B 19 | Reprog *regcompnl(char *exp) 20 | .PP 21 | .nf 22 | .B 23 | int regexec(Reprog *prog, char *string, Resub *match, int msize) 24 | .PP 25 | .nf 26 | .B 27 | void regsub(char *source, char *dest, int dlen, Resub *match, int msize) 28 | .PP 29 | .nf 30 | .B 31 | int rregexec(Reprog *prog, Rune *string, Resub *match, int msize) 32 | .PP 33 | .nf 34 | .B 35 | void rregsub(Rune *source, Rune *dest, int dlen, Resub *match, int msize) 36 | .PP 37 | .B 38 | void regerror(char *msg) 39 | .SH DESCRIPTION 40 | .I Regcomp 41 | compiles a 42 | regular expression and returns 43 | a pointer to the generated description. 44 | The space is allocated by 45 | .IR malloc (3) 46 | and may be released by 47 | .IR free . 48 | Regular expressions are exactly as in 49 | .IR regexp (7). 50 | .PP 51 | .I Regcomplit 52 | is like 53 | .I regcomp 54 | except that all characters are treated literally. 55 | .I Regcompnl 56 | is like 57 | .I regcomp 58 | except that the 59 | .B . 60 | metacharacter matches all characters, including newlines. 
61 | .PP 62 | .I Regexec 63 | matches a null-terminated 64 | .I string 65 | against the compiled regular expression in 66 | .IR prog . 67 | If it matches, 68 | .I regexec 69 | returns 70 | .B 1 71 | and fills in the array 72 | .I match 73 | with character pointers to the substrings of 74 | .I string 75 | that correspond to the 76 | parenthesized subexpressions of 77 | .IR exp : 78 | .BI match[ i ].s.sp 79 | points to the beginning and 80 | .BI match[ i ].e.ep 81 | points just beyond 82 | the end of the 83 | .IR i th 84 | substring. 85 | (Subexpression 86 | .I i 87 | begins at the 88 | .IR i th 89 | left parenthesis, counting from 1.) 90 | Pointers in 91 | .B match[0] 92 | pick out the substring that corresponds to 93 | the whole regular expression. 94 | Unused elements of 95 | .I match 96 | are filled with zeros. 97 | Matches involving 98 | .LR * , 99 | .LR + , 100 | and 101 | .L ? 102 | are extended as far as possible. 103 | The number of array elements in 104 | .I match 105 | is given by 106 | .IR msize . 107 | The structure of elements of 108 | .I match 109 | is: 110 | .IP 111 | .EX 112 | typedef struct { 113 | union { 114 | char *sp; 115 | Rune *rsp; 116 | } s; 117 | union { 118 | char *ep; 119 | Rune *rep; 120 | } e; 121 | } Resub; 122 | .EE 123 | .LP 124 | If 125 | .B match[0].s.sp 126 | is nonzero on entry, 127 | .I regexec 128 | starts matching at that point within 129 | .IR string . 130 | If 131 | .B match[0].e.ep 132 | is nonzero on entry, 133 | the last character matched is the one 134 | preceding that point. 135 | .PP 136 | .I Regsub 137 | places in 138 | .I dest 139 | a substitution instance of 140 | .I source 141 | in the context of the last 142 | .I regexec 143 | performed using 144 | .IR match . 145 | Each instance of 146 | .BI \e n\f1, 147 | where 148 | .I n 149 | is a digit, is replaced by the 150 | string delimited by 151 | .BI match[ n ].s.sp 152 | and 153 | .BI match[ n ].e.ep\f1. 
154 | Each instance of 155 | .L & 156 | is replaced by the string delimited by 157 | .B match[0].s.sp 158 | and 159 | .BR match[0].e.ep . 160 | The substitution will always be null terminated and 161 | trimmed to fit into dlen bytes. 162 | .PP 163 | .IR Regerror , 164 | called whenever an error is detected in 165 | .IR regcomp , 166 | writes the string 167 | .I msg 168 | on the standard error file and exits. 169 | .I Regerror 170 | can be replaced to perform 171 | special error processing. 172 | If the user-supplied 173 | .I regerror 174 | returns rather than exits, 175 | .I regcomp 176 | will return 0. 177 | .PP 178 | .I Rregexec 179 | and 180 | .I rregsub 181 | are variants of 182 | .I regexec 183 | and 184 | .I regsub 185 | that use strings of 186 | .B Runes 187 | instead of strings of 188 | .BR chars . 189 | With these routines, the 190 | .I rsp 191 | and 192 | .I rep 193 | fields of the 194 | .I match 195 | array elements should be used. 196 | .SH SOURCE 197 | .B \*9/src/libregexp 198 | .SH "SEE ALSO" 199 | .IR grep (1) 200 | .SH DIAGNOSTICS 201 | .I Regcomp 202 | returns 203 | .B 0 204 | for an illegal expression 205 | or other failure. 206 | .I Regexec 207 | returns 0 208 | if 209 | .I string 210 | is not matched, and \-1 if it runs out of list space or memory. 211 | .SH BUGS 212 | There is no way to specify or match a NUL character; NULs terminate patterns and strings. 213 | -------------------------------------------------------------------------------- /regexp9.7: -------------------------------------------------------------------------------- 1 | .TH REGEXP 7 2 | .SH NAME 3 | regexp \- Plan 9 regular expression notation 4 | .SH DESCRIPTION 5 | This manual page describes the regular expression 6 | syntax used by the Plan 9 regular expression library 7 | .IR regexp (3). 8 | It is the form used by 9 | .IR egrep (1) 10 | before 11 | .I egrep 12 | got complicated. 13 | .PP 14 | A 15 | .I "regular expression" 16 | specifies 17 | a set of strings of characters. 
18 | A member of this set of strings is said to be 19 | .I matched 20 | by the regular expression. In many applications 21 | a delimiter character, commonly 22 | .LR / , 23 | bounds a regular expression. 24 | In the following specification for regular expressions 25 | the word `character' means any character (rune) but newline. 26 | .PP 27 | The syntax for a regular expression 28 | .B e0 29 | is 30 | .IP 31 | .EX 32 | e3: literal | charclass | '.' | '^' | '$' | '(' e0 ')' 33 | 34 | e2: e3 35 | | e2 REP 36 | 37 | REP: '*' | '+' | '?' 38 | 39 | e1: e2 40 | | e1 e2 41 | 42 | e0: e1 43 | | e0 '|' e1 44 | .EE 45 | .PP 46 | A 47 | .B literal 48 | is any non-metacharacter, or a metacharacter 49 | (one of 50 | .BR .*+?[]()|\e^$ ), 51 | or the delimiter 52 | preceded by 53 | .LR \e . 54 | .PP 55 | A 56 | .B charclass 57 | is a nonempty string 58 | .I s 59 | bracketed 60 | .BI [ \|s\| ] 61 | (or 62 | .BI [^ s\| ]\fR); 63 | it matches any character in (or not in) 64 | .IR s . 65 | A negated character class never 66 | matches newline. 67 | A substring 68 | .IB a - b\f1, 69 | with 70 | .I a 71 | and 72 | .I b 73 | in ascending 74 | order, stands for the inclusive 75 | range of 76 | characters between 77 | .I a 78 | and 79 | .IR b . 80 | In 81 | .IR s , 82 | the metacharacters 83 | .LR - , 84 | .LR ] , 85 | an initial 86 | .LR ^ , 87 | and the regular expression delimiter 88 | must be preceded by a 89 | .LR \e ; 90 | other metacharacters 91 | have no special meaning and 92 | may appear unescaped. 93 | .PP 94 | A 95 | .L . 96 | matches any character. 97 | .PP 98 | A 99 | .L ^ 100 | matches the beginning of a line; 101 | .L $ 102 | matches the end of the line. 103 | .PP 104 | The 105 | .B REP 106 | operators match zero or more 107 | .RB ( * ), 108 | one or more 109 | .RB ( + ), 110 | zero or one 111 | .RB ( ? ), 112 | instances respectively of the preceding regular expression 113 | .BR e2 . 
114 | .PP 115 | A concatenated regular expression, 116 | .BR "e1\|e2" , 117 | matches a match to 118 | .B e1 119 | followed by a match to 120 | .BR e2 . 121 | .PP 122 | An alternative regular expression, 123 | .BR "e0\||\|e1" , 124 | matches either a match to 125 | .B e0 126 | or a match to 127 | .BR e1 . 128 | .PP 129 | A match to any part of a regular expression 130 | extends as far as possible without preventing 131 | a match to the remainder of the regular expression. 132 | .SH "SEE ALSO" 133 | .IR regexp (3) 134 | -------------------------------------------------------------------------------- /regexp9.h: -------------------------------------------------------------------------------- 1 | #ifndef _REGEXP9_H_ 2 | #define _REGEXP9_H_ 1 3 | #if defined(__cplusplus) 4 | extern "C" { 5 | #endif 6 | 7 | #ifdef AUTOLIB 8 | AUTOLIB(regexp9) 9 | #endif 10 | 11 | #include "utf.h" 12 | 13 | typedef struct Resub Resub; 14 | typedef struct Reclass Reclass; 15 | typedef struct Reinst Reinst; 16 | typedef struct Reprog Reprog; 17 | 18 | /* 19 | * Sub expression matches 20 | */ 21 | struct Resub{ 22 | union 23 | { 24 | char *sp; 25 | Rune *rsp; 26 | }s; 27 | union 28 | { 29 | char *ep; 30 | Rune *rep; 31 | }e; 32 | }; 33 | 34 | /* 35 | * character class, each pair of rune's defines a range 36 | */ 37 | struct Reclass{ 38 | Rune *end; 39 | Rune spans[64]; 40 | }; 41 | 42 | /* 43 | * Machine instructions 44 | */ 45 | struct Reinst{ 46 | int type; 47 | union { 48 | Reclass *cp; /* class pointer */ 49 | Rune r; /* character */ 50 | int subid; /* sub-expression id for RBRA and LBRA */ 51 | Reinst *right; /* right child of OR */ 52 | }r; 53 | union { /* regexp relies on these two being in the same union */ 54 | Reinst *left; /* left child of OR */ 55 | Reinst *next; /* next instruction for CAT & LBRA */ 56 | }l; 57 | }; 58 | 59 | /* 60 | * Reprogram definition 61 | */ 62 | struct Reprog{ 63 | Reinst *startinst; /* start pc */ 64 | Reclass class[16]; /* .data */ 65 | Reinst 
firstinst[5]; /* .text */ 66 | }; 67 | 68 | extern Reprog *regcomp9(char*); 69 | extern Reprog *regcomplit9(char*); 70 | extern Reprog *regcompnl9(char*); 71 | extern void regerror9(char*); 72 | extern int regexec9(Reprog*, char*, Resub*, int); 73 | extern void regsub9(char*, char*, int, Resub*, int); 74 | 75 | extern int rregexec9(Reprog*, Rune*, Resub*, int); 76 | extern void rregsub9(Rune*, Rune*, int, Resub*, int); 77 | 78 | /* 79 | * Darwin simply cannot handle having routines that 80 | * override other library routines. 81 | */ 82 | #ifndef NOPLAN9DEFINES 83 | #define regcomp regcomp9 84 | #define regcomplit regcomplit9 85 | #define regcompnl regcompnl9 86 | #define regerror regerror9 87 | #define regexec regexec9 88 | #define regsub regsub9 89 | #define rregexec rregexec9 90 | #define rregsub rregsub9 91 | #endif 92 | 93 | #if defined(__cplusplus) 94 | } 95 | #endif 96 | #endif 97 | -------------------------------------------------------------------------------- /regsub.c: -------------------------------------------------------------------------------- 1 | #include "regexp9.h" 2 | 3 | /* substitute into one string using the matches from the last regexec(): a backslash followed by a digit N copies submatch N, & copies the whole match, a backslash followed by any other character copies that character; output is truncated to dlen bytes including the terminating NUL, and nothing is written when dlen <= 0 */ 4 | extern void 5 | regsub(char *sp, /* source string */ 6 | char *dp, /* destination string */ 7 | int dlen, 8 | Resub *mp, /* subexpression elements */ 9 | int ms) /* number of elements pointed to by mp */ 10 | { 11 | char *ssp, *ep; 12 | int i; 13 | 14 | if(dlen <= 0) /* no room even for the terminating NUL */ 15 | return; 16 | ep = dp+dlen-1; /* last byte is reserved for the NUL */ 17 | while(*sp != '\0'){ 18 | if(*sp == '\\'){ 19 | switch(*++sp){ 20 | case '0': 21 | case '1': 22 | case '2': 23 | case '3': 24 | case '4': 25 | case '5': 26 | case '6': 27 | case '7': 28 | case '8': 29 | case '9': 30 | i = *sp-'0'; 31 | if(mp!=0 && ms>i && mp[i].s.sp != 0) /* check ms before indexing mp[i] */ 32 | for(ssp = mp[i].s.sp; ssp < mp[i].e.ep; ssp++) 33 | if(dp < ep) 34 | *dp++ = *ssp; 35 | break; 36 | case '\\': 37 | if(dp < ep) 38 | *dp++ = '\\'; 39 | break; 40 | case '\0': 41 | sp--; /* trailing backslash: step back so the loop sees the NUL */ 42 | break; 43 | default: 44 | if(dp < ep) 45 | *dp++ = *sp; 46 | break; 
47 | } 48 | }else if(*sp == '&'){ 49 | if(mp!=0 && ms>0 && mp[0].s.sp != 0) /* check ms before reading mp[0] */ 50 | for(ssp = mp[0].s.sp; ssp < mp[0].e.ep; ssp++) 51 | if(dp < ep) 52 | *dp++ = *ssp; 53 | }else{ 54 | if(dp < ep) 55 | *dp++ = *sp; 56 | } 57 | sp++; 58 | } 59 | *dp = '\0'; 60 | } 61 | -------------------------------------------------------------------------------- /rregexec.c: -------------------------------------------------------------------------------- 1 | #include "regexp9.h" 2 | #include "regcomp.h" 3 | 4 | /* 5 | * return 0 if no match 6 | * >0 if a match 7 | * <0 if we ran out of _relist space 8 | */ 9 | static int 10 | rregexec1(Reprog *progp, /* program to run */ 11 | Rune *bol, /* string to run machine on */ 12 | Resub *mp, /* subexpression elements */ 13 | int ms, /* number of elements at mp */ 14 | Reljunk *j) 15 | { 16 | int flag=0; 17 | Reinst *inst; 18 | Relist *tlp; 19 | Rune *s; 20 | int i, checkstart; 21 | Rune r, *rp, *ep; 22 | Relist* tl; /* This list, next list */ 23 | Relist* nl; 24 | Relist* tle; /* ends of this and next list */ 25 | Relist* nle; 26 | int match; 27 | Rune *p; 28 | 29 | match = 0; 30 | checkstart = j->startchar; 31 | if(mp) 32 | for(i=0; i<ms; i++) { 33 | mp[i].s.rsp = 0; 34 | mp[i].e.rep = 0; 35 | } 36 | j->relist[0][0].inst = 0; 37 | j->relist[1][0].inst = 0; 38 | 39 | /* Execute machine once for each character, including terminal NUL */ 40 | s = j->rstarts; 41 | do{ 42 | /* fast check for first char */ 43 | if(checkstart) { 44 | switch(j->starttype) { 45 | case RUNE: 46 | p = runestrchr(s, j->startchar); 47 | if(p == 0 || s == j->reol) 48 | return match; 49 | s = p; 50 | break; 51 | case BOL: 52 | if(s == bol) 53 | break; 54 | p = runestrchr(s, '\n'); 55 | if(p == 0 || s == j->reol) 56 | return match; 57 | s = p+1; 58 | break; 59 | } 60 | } 61 | 62 | r = *s; 63 | 64 | /* switch run lists */ 65 | tl = j->relist[flag]; 66 | tle = j->reliste[flag]; 67 | nl = j->relist[flag^=1]; 68 | nle = j->reliste[flag]; 69 | nl->inst = 0; 70 | 71 | /* Add first instruction to current list */ 72 | _rrenewemptythread(tl, 
progp->startinst, ms, s); 73 | 74 | /* Execute machine until current list is empty */ 75 | for(tlp=tl; tlp->inst; tlp++){ 76 | for(inst=tlp->inst; ; inst = inst->l.next){ 77 | switch(inst->type){ 78 | case RUNE: /* regular character */ 79 | if(inst->r.r == r) 80 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 81 | return -1; 82 | break; 83 | case LBRA: 84 | tlp->se.m[inst->r.subid].s.rsp = s; 85 | continue; 86 | case RBRA: 87 | tlp->se.m[inst->r.subid].e.rep = s; 88 | continue; 89 | case ANY: 90 | if(r != '\n') 91 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 92 | return -1; 93 | break; 94 | case ANYNL: 95 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 96 | return -1; 97 | break; 98 | case BOL: 99 | if(s == bol || *(s-1) == '\n') 100 | continue; 101 | break; 102 | case EOL: 103 | if(s == j->reol || r == 0 || r == '\n') 104 | continue; 105 | break; 106 | case CCLASS: 107 | ep = inst->r.cp->end; 108 | for(rp = inst->r.cp->spans; rp < ep; rp += 2) 109 | if(r >= rp[0] && r <= rp[1]){ 110 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 111 | return -1; 112 | break; 113 | } 114 | break; 115 | case NCCLASS: 116 | ep = inst->r.cp->end; 117 | for(rp = inst->r.cp->spans; rp < ep; rp += 2) 118 | if(r >= rp[0] && r <= rp[1]) 119 | break; 120 | if(rp == ep) 121 | if(_renewthread(nl, inst->l.next, ms, &tlp->se)==nle) 122 | return -1; 123 | break; 124 | case OR: 125 | /* evaluate right choice later */ 126 | if(_renewthread(tlp, inst->r.right, ms, &tlp->se) == tle) 127 | return -1; 128 | /* efficiency: advance and re-evaluate */ 129 | continue; 130 | case END: /* Match! 
*/ 131 | match = 1; 132 | tlp->se.m[0].e.rep = s; 133 | if(mp != 0) 134 | _renewmatch(mp, ms, &tlp->se); 135 | break; 136 | } 137 | break; 138 | } 139 | } 140 | if(s == j->reol) 141 | break; 142 | checkstart = j->startchar && nl->inst==0; 143 | s++; 144 | }while(r); 145 | return match; 146 | } 147 | 148 | static int 149 | rregexec2(Reprog *progp, /* program to run */ 150 | Rune *bol, /* string to run machine on */ 151 | Resub *mp, /* subexpression elements */ 152 | int ms, /* number of elements at mp */ 153 | Reljunk *j 154 | ) 155 | { 156 | Relist relist0[5*LISTSIZE], relist1[5*LISTSIZE]; 157 | 158 | /* mark space */ 159 | j->relist[0] = relist0; 160 | j->relist[1] = relist1; 161 | j->reliste[0] = relist0 + nelem(relist0) - 2; 162 | j->reliste[1] = relist1 + nelem(relist1) - 2; 163 | 164 | return rregexec1(progp, bol, mp, ms, j); 165 | } 166 | 167 | extern int 168 | rregexec(Reprog *progp, /* program to run */ 169 | Rune *bol, /* string to run machine on */ 170 | Resub *mp, /* subexpression elements */ 171 | int ms) /* number of elements at mp */ 172 | { 173 | Reljunk j; 174 | Relist relist0[LISTSIZE], relist1[LISTSIZE]; 175 | int rv; 176 | 177 | /* 178 | * use user-specified starting/ending location if specified 179 | */ 180 | j.rstarts = bol; 181 | j.reol = 0; 182 | if(mp && ms>0){ 183 | if(mp->s.sp) 184 | j.rstarts = mp->s.rsp; 185 | if(mp->e.ep) 186 | j.reol = mp->e.rep; 187 | } 188 | j.starttype = 0; 189 | j.startchar = 0; 190 | if(progp->startinst->type == RUNE && progp->startinst->r.r < Runeself) { 191 | j.starttype = RUNE; 192 | j.startchar = progp->startinst->r.r; 193 | } 194 | if(progp->startinst->type == BOL) 195 | j.starttype = BOL; 196 | 197 | /* mark space */ 198 | j.relist[0] = relist0; 199 | j.relist[1] = relist1; 200 | j.reliste[0] = relist0 + nelem(relist0) - 2; 201 | j.reliste[1] = relist1 + nelem(relist1) - 2; 202 | 203 | rv = rregexec1(progp, bol, mp, ms, &j); 204 | if(rv >= 0) 205 | return rv; 206 | rv = rregexec2(progp, bol, mp, ms, &j); 207 
| if(rv >= 0) 208 | return rv; 209 | return -1; 210 | } 211 | -------------------------------------------------------------------------------- /rregsub.c: -------------------------------------------------------------------------------- 1 | #include "regexp9.h" 2 | 3 | /* substitute into one string using the matches from the last regexec() */ 4 | extern void 5 | rregsub(Rune *sp, /* source string */ 6 | Rune *dp, /* destination string */ 7 | int dlen, 8 | Resub *mp, /* subexpression elements */ 9 | int ms) /* number of elements pointed to by mp */ 10 | { 11 | Rune *ssp, *ep; 12 | int i; 13 | 14 | ep = dp+(dlen/sizeof(Rune))-1; 15 | while(*sp != '\0'){ 16 | if(*sp == '\\'){ 17 | switch(*++sp){ 18 | case '0': 19 | case '1': 20 | case '2': 21 | case '3': 22 | case '4': 23 | case '5': 24 | case '6': 25 | case '7': 26 | case '8': 27 | case '9': 28 | i = *sp-'0'; 29 | if(mp!=0 && mp[i].s.rsp != 0 && ms>i) 30 | for(ssp = mp[i].s.rsp; 31 | ssp < mp[i].e.rep; 32 | ssp++) 33 | if(dp < ep) 34 | *dp++ = *ssp; 35 | break; 36 | case '\\': 37 | if(dp < ep) 38 | *dp++ = '\\'; 39 | break; 40 | case '\0': 41 | sp--; 42 | break; 43 | default: 44 | if(dp < ep) 45 | *dp++ = *sp; 46 | break; 47 | } 48 | }else if(*sp == '&'){ 49 | if(mp!=0 && mp[0].s.rsp != 0 && ms>0) 50 | for(ssp = mp[0].s.rsp; 51 | ssp < mp[0].e.rep; ssp++) 52 | if(dp < ep) 53 | *dp++ = *ssp; 54 | }else{ 55 | if(dp < ep) 56 | *dp++ = *sp; 57 | } 58 | sp++; 59 | } 60 | *dp = '\0'; 61 | } 62 | -------------------------------------------------------------------------------- /test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "regexp9.h" 5 | 6 | struct x 7 | { 8 | char *re; 9 | char *s; 10 | Reprog *p; 11 | }; 12 | 13 | struct x t[] = { 14 | { "^[^!@]+$", "/bin/upas/aliasmail '&'", 0 }, 15 | { "^local!(.*)$", "/mail/box/\\1/mbox", 0 }, 16 | { "^plan9!(.*)$", "\\1", 0 }, 17 | { "^helix!(.*)$", "\\1", 0 }, 18 | { "^([^!]+)@([^!@]+)$", 
"\\2!\\1", 0 }, 19 | { "^(uk\\.[^!]*)(!.*)$", "/bin/upas/uk2uk '\\1' '\\2'", 0 }, 20 | { "^[^!]*\\.[^!]*!.*$", "inet!&", 0 }, 21 | { "^\xE2\x98\xBA$", "smiley", 0 }, 22 | { "^(coma|research|pipe|pyxis|inet|hunny|gauss)!(.*)$", "/mail/lib/qmail '\\s' 'net!\\1' '\\2'", 0 }, 23 | { "^.*$", "/mail/lib/qmail '\\s' 'net!research' '&'", 0 }, 24 | { 0, 0, 0 }, 25 | }; 26 | 27 | int 28 | main(int ac, char **av) 29 | { 30 | Resub rs[10]; 31 | char dst[128]; 32 | struct x *tp; 33 | 34 | if(ac != 2) 35 | exit(1); 36 | 37 | for(tp = t; tp->re; tp++) 38 | tp->p = regcomp(tp->re); 39 | 40 | for(tp = t; tp->re; tp++){ 41 | printf("%s VIA %s", av[1], tp->re); 42 | memset(rs, 0, sizeof rs); 43 | if(regexec(tp->p, av[1], rs, 10)){ 44 | regsub(tp->s, dst, sizeof dst, rs, 10); 45 | printf(" sub %s -> %s", tp->s, dst); 46 | } 47 | printf("\n"); 48 | } 49 | exit(0); 50 | } 51 | -------------------------------------------------------------------------------- /test2.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "regexp9.h" 5 | 6 | int 7 | main() 8 | { 9 | Resub rs[10]; 10 | Reprog *p; 11 | char *s; 12 | 13 | p = regcomp("[^a-z]"); 14 | s = "\n"; 15 | if(regexec(p, s, rs, 10)) 16 | printf("%s %p %p %p\n", s, s, rs[0].s.sp, rs[0].e.ep); 17 | s = "0"; 18 | if(regexec(p, s, rs, 10)) 19 | printf("%s %p %p %p\n", s, s, rs[0].s.sp, rs[0].e.ep); 20 | exit(0); 21 | } 22 | -------------------------------------------------------------------------------- /utf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * The authors of this software are Rob Pike and Ken Thompson. 3 | * Copyright (c) 2002 by Lucent Technologies. 
4 | * Permission to use, copy, modify, and distribute this software for any 5 | * purpose without fee is hereby granted, provided that this entire notice 6 | * is included in all copies of any software which is or includes a copy 7 | * or modification of this software and in all copies of the supporting 8 | * documentation for such software. 9 | * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED 10 | * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE 11 | * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY 12 | * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 13 | */ 14 | #include 15 | #include "utf.h" 16 | 17 | enum 18 | { 19 | Bit1 = 7, 20 | Bitx = 6, 21 | Bit2 = 5, 22 | Bit3 = 4, 23 | Bit4 = 3, 24 | Bit5 = 2, 25 | 26 | T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ 27 | Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ 28 | T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ 29 | T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ 30 | T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ 31 | T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ 32 | 33 | Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ 34 | Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ 35 | Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ 36 | Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ 37 | 38 | Maskx = (1< T1 53 | */ 54 | c = *(unsigned char*)str; 55 | if(c < Tx) { 56 | *rune = c; 57 | return 1; 58 | } 59 | 60 | /* 61 | * two character sequence 62 | * 0080-07FF => T2 Tx 63 | */ 64 | c1 = *(unsigned char*)(str+1) ^ Tx; 65 | if(c1 & Testx) 66 | goto bad; 67 | if(c < T3) { 68 | if(c < T2) 69 | goto bad; 70 | l = ((c << Bitx) | c1) & Rune2; 71 | if(l <= Rune1) 72 | goto bad; 73 | *rune = l; 74 | return 2; 75 | } 76 | 77 | /* 78 | * three character sequence 79 | * 0800-FFFF => T3 Tx Tx 80 | */ 81 | c2 = *(unsigned char*)(str+2) ^ Tx; 82 | if(c2 & Testx) 83 | goto 
bad; 84 | if(c < T4) { 85 | l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 86 | if(l <= Rune2) 87 | goto bad; 88 | *rune = l; 89 | return 3; 90 | } 91 | 92 | /* 93 | * four character sequence 94 | * 10000-10FFFF => T4 Tx Tx Tx 95 | */ 96 | if(UTFmax >= 4) { 97 | c3 = *(unsigned char*)(str+3) ^ Tx; 98 | if(c3 & Testx) 99 | goto bad; 100 | if(c < T5) { 101 | l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 102 | if(l <= Rune3) 103 | goto bad; 104 | if(l > Runemax) 105 | goto bad; 106 | *rune = l; 107 | return 4; 108 | } 109 | } 110 | 111 | /* 112 | * bad decoding 113 | */ 114 | bad: 115 | *rune = Bad; 116 | return 1; 117 | } 118 | 119 | Rune* 120 | runestrchr(Rune *s, Rune c) 121 | { 122 | Rune c0 = c; 123 | Rune c1; 124 | 125 | if(c == 0) { 126 | while(*s++) 127 | ; 128 | return s-1; 129 | } 130 | 131 | while((c1 = *s++)) 132 | if(c1 == c0) 133 | return s-1; 134 | return 0; 135 | } 136 | 137 | char* 138 | utfrune(char *s, Rune c) 139 | { 140 | Rune c1; 141 | Rune r; 142 | int n; 143 | 144 | if(c < Runesync) /* not part of utf sequence */ 145 | return strchr(s, c); 146 | 147 | for(;;) { 148 | c1 = *(unsigned char*)s; 149 | if(c1 < Runeself) { /* one byte rune */ 150 | if(c1 == 0) 151 | return 0; 152 | if(c1 == c) 153 | return s; 154 | s++; 155 | continue; 156 | } 157 | n = chartorune(&r, s); 158 | if(r == c) 159 | return s; 160 | s += n; 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /utf.h: -------------------------------------------------------------------------------- 1 | #ifndef _UTF_H_ 2 | #define _UTF_H_ 1 3 | #if defined(__cplusplus) 4 | extern "C" { 5 | #endif 6 | 7 | typedef unsigned int Rune; /* 32 bits */ 8 | 9 | enum 10 | { 11 | UTFmax = 4, /* maximum bytes per rune */ 12 | Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ 13 | Runeself = 0x80, /* rune and UTF sequences are the same (<) */ 14 | Runeerror = 0xFFFD, /* decoding error in UTF */ 15 | Runemax = 0x10FFFF 
/* maximum rune value */ 16 | }; 17 | 18 | int chartorune(Rune *rune, char *str); 19 | Rune* runestrchr(Rune *s, Rune c); 20 | char* utfrune(char *s, Rune c); 21 | 22 | #if defined(__cplusplus) 23 | } 24 | #endif 25 | #endif 26 | --------------------------------------------------------------------------------