├── additional ├── formal.ref.h ├── re3.c ├── nregex.h ├── zregex.h ├── regexl.h ├── ref.c ├── zre.h └── nre.h ├── array.h ├── LICENSE ├── regexp.h ├── regex.h ├── README.md ├── mkfile ├── Makefile ├── README ├── sed.h ├── sed3.cpp ├── re0.cpp ├── ChangeLog ├── sed0.cpp ├── grep.cpp ├── testgrep.sh ├── re.h ├── sed.1 ├── sed2.cpp ├── testsed.sh ├── testre.cpp ├── testre.dat ├── sed1.cpp ├── re1.cpp └── re2.cpp /additional/formal.ref.h: -------------------------------------------------------------------------------- 1 | /v/lib/eign 6 2 | formal.ref 3 | -------------------------------------------------------------------------------- /additional/re3.c: -------------------------------------------------------------------------------- 1 | void 2 | printnew(Rex *rex) 3 | { 4 | int t; 5 | printf("new %s\n", 6 | rex==0? "ESPACE": 7 | (t=rex->type)==OK? "OK": 8 | t==ANCHOR? "ANCHOR": 9 | t==END? "END": 10 | t==DOT? "DOT": 11 | t==ONECHAR? "ONECHAR": 12 | t==STRING? "STRING": 13 | t==KMP? "KMP": 14 | t==TRIE? "TRIE": 15 | t==CLASS? "CLASS": 16 | t==BACK? "BACK": 17 | t==SUBEXP? "SUBEXP": 18 | t==ALT? "ALT": 19 | t==REP? "REP": 20 | t==TEMP? "TEMP": 21 | "HUH"); 22 | } 23 | -------------------------------------------------------------------------------- /array.h: -------------------------------------------------------------------------------- 1 | /* expandable array, begins pointing to preallocated 2 | space. 
can be reallocated if space runs out */ 3 | 4 | #ifndef ARRAY_H 5 | #define ARRAY_H 6 | 7 | template struct Array { 8 | enum { SIZE = 20 }; 9 | T *p; // where the array is 10 | int size; // how big it is 11 | T space[SIZE]; // initial space 12 | Array() { p = space; size = SIZE; } 13 | ~Array() { if(p != space) delete(p); } 14 | T& operator[] (int i) { return p[i]; } 15 | int realloc(int i) // when array must grow 16 | { 17 | i *= 2; 18 | T *q = new T[i]; 19 | if(q == 0) 20 | return 1; 21 | memmove(q, p, size*sizeof(T)); 22 | if(p != space) 23 | delete p; 24 | p = q; 25 | size = i; 26 | return 0; 27 | } 28 | int assure(int i) { return i>=size? realloc(i): 0; } 29 | // assure p[i] exists 30 | char *bytes() const { return (char *) p; } 31 | }; 32 | #endif 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | /**************************************************************** 2 | Copyright (C) Lucent Technologies 1998 3 | All Rights Reserved 4 | 5 | Permission to use, copy, modify, and distribute this software and 6 | its documentation for any purpose and without fee is hereby 7 | granted, provided that the above copyright notice appear in all 8 | copies and that both that the copyright notice and this 9 | permission notice and warranty disclaimer appear in supporting 10 | documentation, and that the name Lucent Technologies or any of 11 | its entities not be used in advertising or publicity pertaining 12 | to distribution of the software without specific, written prior 13 | permission. 14 | 15 | LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 | INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
17 | IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 | SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 | IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 | ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 | THIS SOFTWARE. 23 | ****************************************************************/ 24 | -------------------------------------------------------------------------------- /regexp.h: -------------------------------------------------------------------------------- 1 | #pragma src "/sys/src/libregexp" 2 | #pragma lib "libregexp.a" 3 | 4 | typedef struct Resub Resub; 5 | typedef struct Reclass Reclass; 6 | typedef struct Reinst Reinst; 7 | typedef struct Reprog Reprog; 8 | 9 | /* 10 | * Sub expression matches 11 | */ 12 | struct Resub{ 13 | union 14 | { 15 | char *sp; 16 | Rune *rsp; 17 | }; 18 | union 19 | { 20 | char *ep; 21 | Rune *rep; 22 | }; 23 | }; 24 | 25 | /* 26 | * character class, each pair of rune's defines a range 27 | */ 28 | struct Reclass{ 29 | Rune *end; 30 | Rune spans[64]; 31 | }; 32 | 33 | /* 34 | * Machine instructions 35 | */ 36 | struct Reinst{ 37 | int type; 38 | union { 39 | Reclass *cp; /* class pointer */ 40 | Rune r; /* character */ 41 | int subid; /* sub-expression id for RBRA and LBRA */ 42 | Reinst *right; /* right child of OR */ 43 | }; 44 | union { /* regexp relies on these two being in the same union */ 45 | Reinst *left; /* left child of OR */ 46 | Reinst *next; /* next instruction for CAT & LBRA */ 47 | }; 48 | }; 49 | 50 | /* 51 | * Reprogram definition 52 | */ 53 | struct Reprog{ 54 | Reinst *startinst; /* start pc */ 55 | Reclass class[16]; /* .data */ 56 | Reinst firstinst[5]; /* .text */ 57 | }; 58 | 59 | extern Reprog *regcomp(char*); 60 | extern Reprog *regcomplit(char*); 61 | extern Reprog *regcompnl(char*); 62 | extern void regerror(char*); 63 | extern int regexec(Reprog*, char*, 
Resub*, int); 64 | extern void regsub(char*, char*, Resub*, int); 65 | extern int rregexec(Reprog*, Rune*, Resub*, int); 66 | extern void rregsub(Rune*, Rune*, Resub*, int); 67 | -------------------------------------------------------------------------------- /regex.h: -------------------------------------------------------------------------------- 1 | #ifndef REGEX_H 2 | #define REGEX_H 3 | #include /* for size_t */ 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | typedef long regoff_t; 10 | 11 | typedef struct { 12 | regoff_t rm_so; /* offset of start */ 13 | regoff_t rm_eo; /* offset of end */ 14 | } regmatch_t; 15 | 16 | typedef struct { 17 | size_t re_nsub; /* number of subexpressions */ 18 | /* local fields, not specified by posix */ 19 | struct Rex *rex; /* compiled expression */ 20 | int flags; /* flags from regcomp() */ 21 | unsigned char *map; /* for REG_ICASE folding */ 22 | int unused1; 23 | } regex_t; 24 | 25 | int regcomp(regex_t*, const char*, int); 26 | int regexec(const regex_t*, const char*, size_t, regmatch_t*, int); 27 | size_t regerror(int, const regex_t*, char*, size_t); 28 | void regfree(regex_t*); 29 | 30 | /* functions needed by grep (nonstandard) */ 31 | 32 | int regcomb(regex_t*, regex_t*); 33 | int regnexec(const regex_t*, const char*, size_t, size_t, regmatch_t*, int); 34 | 35 | /* regcomp flags */ 36 | #define REG_EXTENDED 0x0001 37 | #define REG_ICASE 0x0002 38 | #define REG_NOSUB 0x0004 39 | #define REG_NEWLINE 0x0008 40 | /* regexec flags */ 41 | #define REG_NOTBOL 0x0010 42 | #define REG_NOTEOL 0x0020 43 | /* nonstandard flags */ 44 | #define REG_NULL 0x0040 /* allow null patterns for grep */ 45 | #define REG_ANCH 0x0080 /* grep option -x (no Kmp) */ 46 | #define REG_LITERAL 0x0100 /* grep option -F (no operators) */ 47 | #define REG_AUGMENTED 0x0200 /* allow & and ! 
operators */ 48 | 49 | enum { /* regex error codes */ 50 | REG_NOMATCH = 1, 51 | REG_BADPAT, 52 | REG_ECOLLATE, 53 | REG_ECTYPE, 54 | REG_EESCAPE, 55 | REG_ESUBREG, 56 | REG_EBRACK, 57 | REG_EPAREN, 58 | REG_EBRACE, 59 | REG_BADBR, 60 | REG_ERANGE, 61 | REG_ESPACE, 62 | REG_BADRPT 63 | }; 64 | 65 | #ifdef __cplusplus 66 | } 67 | #endif 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /additional/nregex.h: -------------------------------------------------------------------------------- 1 | #ifndef REGEX_H 2 | #define REGEX_H 3 | #include /* for size_t */ 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | typedef long regoff_t; 10 | 11 | typedef struct { 12 | regoff_t rm_so; /* offset of start */ 13 | regoff_t rm_eo; /* offset of end */ 14 | } regmatch_t; 15 | 16 | typedef struct { 17 | size_t re_nsub; /* number of subexpressions */ 18 | /* local fields, not specified by posix */ 19 | struct Rex *rex; /* compiled expression */ 20 | int flags; /* flags from regcomp() */ 21 | unsigned char *map; /* for REG_ICASE folding */ 22 | int unused1; 23 | } regex_t; 24 | 25 | int regcomp(regex_t*, const char*, int); 26 | int regexec(const regex_t*, const char*, size_t, regmatch_t*, int); 27 | size_t regerror(int, const regex_t*, char*, size_t); 28 | void regfree(regex_t*); 29 | 30 | /* functions needed by grep (nonstandard) */ 31 | 32 | int regcomb(regex_t*, regex_t*); 33 | int regnexec(const regex_t*, const char*, size_t, size_t, regmatch_t*, int); 34 | 35 | /* regcomp flags */ 36 | #define REG_EXTENDED 0x0001 37 | #define REG_ICASE 0x0002 38 | #define REG_NOSUB 0x0004 39 | #define REG_NEWLINE 0x0008 40 | /* regexec flags */ 41 | #define REG_NOTBOL 0x0010 42 | #define REG_NOTEOL 0x0020 43 | /* nonstandard flags */ 44 | #define REG_NULL 0x0040 /* allow null patterns for grep */ 45 | #define REG_ANCH 0x0080 /* grep option -x (no Kmp) */ 46 | #define REG_LITERAL 0x0100 /* grep option -F (no operators) */ 47 | 
#define REG_AUGMENTED 0x0200 /* allow & and ! operators */ 48 | #define REG_WHICH 0x0400 /* enable \: progress marks */ 49 | 50 | enum { /* regex error codes */ 51 | REG_NOMATCH = -1, 52 | REG_BADPAT = -2, 53 | REG_ECOLLATE = -3, 54 | REG_ECTYPE = -4, 55 | REG_EESCAPE = -5, 56 | REG_ESUBREG = -6, 57 | REG_EBRACK = -7, 58 | REG_EPAREN = -8, 59 | REG_EBRACE = -9, 60 | REG_BADBR = -10, 61 | REG_ERANGE = -11, 62 | REG_ESPACE = -12, 63 | REG_BADRPT = -13 64 | }; 65 | 66 | #ifdef __cplusplus 67 | } 68 | #endif 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /additional/zregex.h: -------------------------------------------------------------------------------- 1 | #ifndef REGEX_H 2 | #define REGEX_H 3 | #include /* for size_t */ 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | typedef long regoff_t; 10 | 11 | typedef struct { 12 | regoff_t rm_so; /* offset of start */ 13 | regoff_t rm_eo; /* offset of end */ 14 | } regmatch_t; 15 | 16 | typedef struct { 17 | size_t re_nsub; /* number of subexpressions */ 18 | /* local fields, not specified by posix */ 19 | struct Rex *rex; /* compiled expression */ 20 | int flags; /* flags from regcomp() */ 21 | unsigned char *map; /* for REG_ICASE folding */ 22 | int unused1; 23 | } regex_t; 24 | 25 | int regcomp(regex_t*, const char*, int); 26 | int regexec(const regex_t*, const char*, size_t, regmatch_t*, int); 27 | size_t regerror(int, const regex_t*, char*, size_t); 28 | void regfree(regex_t*); 29 | 30 | /* functions used by grep (nonstandard) */ 31 | 32 | int regcomb(regex_t*, regex_t*); 33 | int regnexec(const regex_t*, const char*, size_t, size_t, regmatch_t*, int); 34 | 35 | /* regcomp flags */ 36 | #define REG_EXTENDED 0x0001 37 | #define REG_ICASE 0x0002 38 | #define REG_NOSUB 0x0004 39 | #define REG_NEWLINE 0x0008 40 | /* regexec flags */ 41 | #define REG_NOTBOL 0x0010 42 | #define REG_NOTEOL 0x0020 43 | /* nonstandard flags */ 44 | #define REG_NULL 0x0040 /* 
allow null patterns for grep */ 45 | #define REG_LEFT 0x0080 /* implied ^, grep -x (no Kmp) */ 46 | #define REG_RIGHT 0x0100 /* implied $, grep -x */ 47 | #define REG_LITERAL 0x0200 /* grep -F (no operators) */ 48 | #define REG_AUGMENTED 0x0400 /* allow & and ! operators */ 49 | #define REG_WHICH 0x1000 /* enable 50 | 51 | enum { /* regex error codes */ 52 | REG_NOMATCH = -1, 53 | REG_BADPAT = -2, 54 | REG_ECOLLATE = -3, 55 | REG_ECTYPE = -4, 56 | REG_EESCAPE = -5, 57 | REG_ESUBREG = -6, 58 | REG_EBRACK = -7, 59 | REG_EPAREN = -8, 60 | REG_EBRACE = -9, 61 | REG_BADBR = -10, 62 | REG_ERANGE = -11, 63 | REG_ESPACE = -12, 64 | REG_BADRPT = -13 65 | }; 66 | 67 | #ifdef __cplusplus 68 | } 69 | #endif 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Doug McIlroy's C++ Regular Expression Matcher 2 | 3 | ### Introduction 4 | 5 | This is Doug McIlroy's C++ regular expression matcher. 6 | He wrote it while still at Bell Labs, for Cfront 4.0. 7 | 8 | This code was converted to C by Glenn Fowler for use in 9 | David Korn's [AST](https://github.com/att/ast) package. 10 | 11 | In 2003 Dr. McIlroy attempted to modernize it, but did not get very far. 12 | 13 | In July of 2017, I convinced him to send me the code as-is. 14 | I did this because: 15 | 16 | * I wanted to see this code preserved somewhere public. 17 | * I have an interest in regular expression matchers. 18 | * I also wanted to try to modernize it, for at least C++ 98. 19 | 20 | ### Contents 21 | 22 | I have attempted to construct a Git repo to represent the code's history 23 | as close as I can reconstruct it. The initial commit should represent 24 | the state of the code as it was in 1998 when Dr. McIlroy left Bell Labs. 25 | The dates on the files are based on what his filesystem shows, 26 | as sent to me in email. 
27 | 28 | I have added a `ChangeLog` file that pretends to track the work, 29 | but of course this is my own invention. 30 | 31 | The `additional` directory contains some additional files Dr. McIlroy 32 | sent me that seem to be part of his work to modernize the package in 2003, 33 | so I have dated the additional files as being from 2003. 34 | 35 | ### Branches 36 | 37 | The **master** branch has the original files in the initial commit. 38 | The `additional` files are in the second commit, and the third commit 39 | contains changes sent by Dr. McIlroy to compile on a modern Linux system. 40 | 41 | One file was accidentally left out: `array.c`. This has been added with 42 | a commit date matching what Dr. McIlroy shows in his file system. I 43 | have tagged this point in the code and also saved it aside in the 44 | `original-code` branch. 45 | 46 | Similarly for the `testgrep.sh` and `testsed.sh` scripts, which have 47 | been added back into the code base. 48 | 49 | Going forward will be my own work to modernize the package and to make 50 | it buildable using `make` instead of the Bell Labs `mk` tool. 51 | 52 | ### Plans 53 | 54 | Here are my thoughts: 55 | 56 | 1. Rename C++ files to have a `.cpp` extension. (DONE) 57 | 2. Update the `Makefile` from Dr. McIlroy. (DONE) 58 | 3. Compile the code without warnings using `g++`, and 59 | if possible, also `clang`. (DONE, against versions 5, 7, and 8 of `g++`.) 60 | 4. Make the code pass the original tests supplied by Dr. McIlroy. 61 | 5. Review the code and try to improve the use of C++. 62 | 63 | ##### Last updated: 64 | 65 | Sun Jul 22 16:59:17 IDT 2018 66 | 67 | Arnold Robbins 68 | [arnold at skeeve.com](mailto:arnold@skeeve.com) 69 | -------------------------------------------------------------------------------- /mkfile: -------------------------------------------------------------------------------- 1 | BUILTINS = 2 | CFLAGS = -I.
3 | CCFLAGS = $CFLAGS -B 4 | CC = cc 5 | 6 | all:V: re1.o re2.o grep sed 7 | 8 | retest: testre testre.dat 9 | testre bundle 66 | 67 | 68 | 69 | re_to_mona: mona.tab.c lex.yy.c 70 | cc mona.tab.c -ll -ly -o re_to_mona 71 | 72 | lex.yy.c: mona.lex 73 | lex mona.lex 74 | 75 | mona.tab.c: mona.rie 76 | rie mona.rie 77 | sed '/#line/d' mona.tab.c >junk 78 | mv junk mona.tab.c 79 | 80 | clean: 81 | rm -f *.o *.ii sed grep testre re Dre?.c *dummy 82 | rm -f btestre prof.out Dre?.int.o 83 | rm -f mona.tab.c lex.yy.c re_to_mona 84 | rm -f a.out core # just in case 85 | rm -f in out expect pat # testgrep.sh 86 | rm -f SCRIPT INPUT OUTPUT* RESULT NOWHERE DIAG # testsed.sh 87 | 88 | 89 | btestre: bre1.o bre2.o testre.o 90 | rm prof.out 91 | lcc -b -g bre[12].o testre.o -o btestre 92 | 93 | bre(.)\.o:R: Ddummy 94 | sed 's/^Dre/bre/' Dre$stem1.ii >bre$stem1.c 95 | CC $CCFLAGS -c -DDEBUG -S bre$stem1.c 96 | mv bre$stem1.int.c bre$stem1.c 97 | lcc -b -N -I. -I/usr/include -c bre$stem1.c 98 | 99 | bretest: btestre 100 | btestre bundle 95 | 96 | clean: 97 | rm -f *.o *.ii sed grep testre re Dre?.cpp *dummy 98 | rm -f btestre prof.out Dre?.int.o 99 | rm -f a.out core # just in case 100 | rm -f in out expect pat # testgrep.sh 101 | rm -f SCRIPT INPUT OUTPUT* RESULT NOWHERE DIAG # testsed.sh 102 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This package implements Posix regular-expression facilities: 2 | regex.h 3 | regcomp() 4 | regexec() 5 | regfree() 6 | regerror() 7 | grep 8 | sed 9 | 10 | The four reg* functions are implemented in files re1.o and re2.o. 11 | 12 | Some of the programs are written in C++, but the object files 13 | re1.o and re2.o are intended to be loadable by cc. The mkfile 14 | uses option -B of cfront 4.0 to achieve this. 
15 | 16 | To make everything listed above: 17 | 18 | mk all 19 | 20 | To test the regular-expression routines: 21 | 22 | mk retest 23 | 24 | To test the utilities: 25 | 26 | mk sedtest 27 | mk greptest 28 | 29 | The routines implement the apparent consensus of the "RE-experts 30 | group" about the "longest-leftmost" rule for choosing 31 | among ambiguous parses. The rules are applied by outside-in 32 | structural recursion. 33 | 34 | 1. Catenation and alternation are right-associative. 35 | 2. Parentheses affect association. 36 | 3. A closure P* is treated as P{0,infinity}. 37 | 4. A match to a pattern is maximized before 38 | the matches to its subpatterns. 39 | 5. If P can match the null string, a match to P{u,v} 40 | may contain null matches to P only as necessary, i.e. 41 | if deleting the match changes something else (other 42 | than another omittable null match) in the overall 43 | match. (Requires the null match either to be one of 44 | the first u matches.) 45 | 6. P{u,v} is treated as P P{u-1,v-1} if u > 1. 46 | P | P{1,v-1} if u = 1 47 | | P{1,v} if u = 0 48 | 7. In a catenation AB, the length of B is 49 | maximized after A and the subpatterns of A. 50 | 8. In an alternation A|B, the longest of matches 51 | to A or B is taken, with ties going to A. 52 | 53 | Rule 5 differs from the Toronto interpretation and thus could 54 | still be overridden by the standards committee. The Toronto 55 | rule was 56 | 57 | 5T. If P can match the null string, a match of P{0,v} 58 | to the null string includes a match of P to the 59 | null string. Otherwise a match of P{u,v} may 60 | contain null matches to P only if it contains 61 | exactly u matches to P. 62 | 63 | The regex routines support some extensions under control of regcomp 64 | flags for the benefit of Posix grep, which does not have exactly 65 | the same regular expressions as regex: REG_LITERAL for grep -F, 66 | REG_NULL for grep in general and REG_ANCH for grep -x.
A more 67 | interesting flag, REG_AUGMENTED implies REG_EXTENDED and adds 68 | two more operators: 69 | 70 | ! negation; match strings that the operand doesn't 71 | & match strings that match both catenations 72 | 73 | For example a! matches "abc". 74 | a! also matches the null string at the beginning of "a". 75 | a!&. matches the b in "abc"; it does not match "a". 76 | (.*\*/.*)! matches a string that contains no instance of 77 | "*/"; it would be useful in identifying C comments. 78 | 79 | The regex match array will only contain (-1,-1) entries for 80 | subexpressions in a negated expression. It will contain useful 81 | entries for both sides of a conjunction. The order of precedence 82 | of binary operators is catenation then conjunction then alternation. 83 | 84 | 9. In a conjunction A&B, the length of B is 85 | maximized after A and the subpatterns of A. 86 | -------------------------------------------------------------------------------- /sed.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "regex.h" 3 | 4 | 5 | typedef unsigned char uchar; 6 | 7 | typedef struct { 8 | uchar *w; /* write pointer */ 9 | uchar *e; /* end */ 10 | uchar *s; /* start */ 11 | } Text; 12 | 13 | extern void compile(Text *script, Text *raw); 14 | extern void execute(Text *script, Text *input); 15 | extern int recomp(Text *script, Text *t, int seof); 16 | extern int match(uchar *re, Text *data, int gflag); 17 | extern int substitute(regex_t*, Text* data, uchar *rhs, int gf); 18 | extern regex_t *readdr(int addr); 19 | extern void tcopy(Text *from, Text *to); 20 | void printscript(Text *script); 21 | extern void warn(const char*, ...); 22 | extern void quit(const char*, ...); 23 | extern void vacate(Text*); 24 | extern void synwarn(const char*); 25 | extern void syntax(const char*); 26 | extern int readline(Text*); 27 | extern int ateof(void); 28 | extern void coda(void); 29 | 30 | #define exch(a, b, t) ((t)=(a), (a)=(b), (b)=(t)) 31 | 32 | /*
space management; assure room for n more chars in Text */ 33 | #define assure(/*Text*/t, /*int*/ n) \ 34 | if((t)->s==0 || (t)->w>=(t)->e-n-1) \ 35 | grow(t, n); \ 36 | else 37 | extern void grow(Text*, int); 38 | 39 | /* round character pointer up to integer pointer. 40 | portable to the cray; simpler tricks are not */ 41 | 42 | #define intp(/*uchar**/p) (int*)(p + sizeof(int) - 1 \ 43 | - (p+sizeof(int)-1 - (uchar*)0)%sizeof(int)) 44 | 45 | extern int recno; 46 | extern int nflag; 47 | extern int qflag; 48 | extern int sflag; 49 | extern int bflag; 50 | extern int options; 51 | extern const char *stdouterr; 52 | 53 | extern Text files; 54 | 55 | /* SCRIPT LAYOUT 56 | 57 | script commands are packed thus: 58 | 0,1,or2 address words signed + for numbers - for regexp 59 | if 2 addresses, then another word indicates activity 60 | positive: active, the record number where activated 61 | negative: inactive, sign or-ed with number where deactivated 62 | instruction word 63 | high byte IMASK+flags; flags are NEG and SEL 64 | next byte command code (a letter) 65 | next two bytes, length of this command, including addrs 66 | (length is a multiple of 4; capacity could be expanded 67 | by counting the length in words instead of bytes) 68 | after instruction word 69 | on s command 70 | offset of regexp in rebuf 71 | word containing flags p,w plus n (n=0 => g) 72 | replacement text 73 | word containing file designator, if flag w 74 | on y command 75 | 256-byte transliteration table 76 | on b and t command 77 | offset of label in script 78 | */ 79 | 80 | enum { 81 | BYTE = 8, 82 | 83 | IMASK = (int)0xC0000000,/* instruction flag */ 84 | NEG = 0x01000000, /* instruction written with ! 
*/ 85 | 86 | LMASK = 0xffff, /* low half word */ 87 | AMASK = (int)(~0u>>1), /* address mask, clear sign bit */ 88 | INACT = ~AMASK, /* inactive bit, the sign bit */ 89 | 90 | DOLLAR = AMASK, /* huge address */ 91 | REGADR = ~AMASK, /* context address */ 92 | 93 | PFLAG = (int)0x80000000,/* s/../../p */ 94 | WFLAG = 0x40000000 /* s/../../w */ 95 | }; 96 | 97 | extern int pack(int neg, int cmd, int length); 98 | extern int *instr(uchar*); 99 | #define code(/*int*/ inst) ((inst)>>2*BYTE & 0xff) 100 | #define nexti(/*uchar**/ p) ((p) + (*instr(p)&LMASK)) 101 | -------------------------------------------------------------------------------- /sed3.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sed.h" 3 | 4 | void docopy(uchar *where, int n); 5 | int dosub(uchar *where, uchar *rp); 6 | 7 | Text retemp; /* holds a rewritten regex, without delimiter */ 8 | 9 | int 10 | recomp(Text *rebuf, Text *t, int delim) 11 | { 12 | static int lastre; 13 | uchar *w; 14 | vacate(&retemp); 15 | for(w=t->w; ; retemp.w++,w++) { 16 | assure(&retemp, 2) 17 | *retemp.w = *w; 18 | if(*w == delim) 19 | break; 20 | else if(*w==0 || *w=='\n') 21 | syntax("unterminated address"); 22 | else if(*w != '\\') 23 | continue; 24 | else if(*++w==delim) 25 | *retemp.w = delim; 26 | else if(*w == 'n') 27 | *retemp.w = '\n'; 28 | else if(*w==0 || *w=='\n') 29 | syntax("unterminated regular expression"); 30 | else { 31 | assure(&retemp, 2); 32 | *++retemp.w = *w; 33 | } 34 | } 35 | *retemp.w = 0; 36 | 37 | assure(rebuf, sizeof(regex_t)); 38 | if(*retemp.s != 0) { 39 | if(regcomp((regex_t*)rebuf->w,(char*)retemp.s,options) != 0) 40 | syntax("bad regular expression"); 41 | lastre = rebuf->w - rebuf->s; 42 | rebuf->w += sizeof(regex_t); 43 | } else if(rebuf->w == rebuf->s) 44 | syntax("no previous regular expression"); 45 | t->w = w + 1; 46 | return lastre; 47 | } 48 | 49 | Text gendata; 50 | 51 | #define NMATCH 10 52 | regmatch_t matches[NMATCH]; 
53 | #define so matches[0].rm_so 54 | #define eo matches[0].rm_eo 55 | 56 | int 57 | substitute(regex_t *re, Text* data, uchar *rhs, int n) 58 | { 59 | Text t; 60 | uchar *where = data->s; 61 | if(regexec(re, (char*)data->s, NMATCH, matches, 0)) 62 | return 0; 63 | vacate(&gendata); 64 | if(n == 0) 65 | do { 66 | docopy(where, so); 67 | if(!dosub(where, rhs)) 68 | return 0; 69 | where += eo; 70 | if(eo == so) 71 | if(where < data->w) 72 | docopy(where++, 1); 73 | else 74 | goto done; 75 | } while(regexec(re, (char*)where, NMATCH, matches, REG_NOTBOL) == 0); 76 | else { 77 | while(--n > 0) { 78 | where += eo; 79 | if(eo == so) 80 | if(where < data->w) 81 | where++; 82 | else 83 | return 0; 84 | if(regexec(re, (char*)where, NMATCH, matches, REG_NOTBOL)) 85 | return 0; 86 | } 87 | docopy(data->s, where-data->s+so); 88 | if(!dosub(where, rhs)) 89 | return 0; 90 | where += eo; 91 | } 92 | eo = so = data->w - where; 93 | docopy(where, so); 94 | done: 95 | exch(gendata, *data, t); 96 | return 1; 97 | } 98 | 99 | void 100 | docopy(uchar *where, int n) 101 | { 102 | assure(&gendata, n+1); 103 | memmove(gendata.w, where, n); 104 | gendata.w += n; 105 | *gendata.w = 0; 106 | } 107 | 108 | /* interpretation problem: if there is no match for \1, say, 109 | does the substitition occur? dosub uses a null string. 
110 | a change where indicated will abort the substitution */ 111 | 112 | int 113 | dosub(uchar *where, uchar *rp) 114 | { 115 | int c, n; 116 | regmatch_t *m; 117 | 118 | while(c = *rp++) { 119 | if(c == '\\') { 120 | c = *rp++; 121 | if (c >= '1' && c <= '9') { 122 | m = matches + c - '0'; 123 | if(m->rm_eo == -1) 124 | continue; /* or return 0 */ 125 | n = m->rm_eo - m->rm_so; 126 | assure(&gendata, n); 127 | memmove(gendata.w,where+m->rm_so,n); 128 | gendata.w += n; 129 | continue; 130 | } 131 | } else if(c == '&') { 132 | assure(&gendata, eo-so); 133 | memmove(gendata.w,where+so,eo-so); 134 | gendata.w += eo-so; 135 | continue; 136 | } 137 | assure(&gendata, 1); 138 | *gendata.w++ = c; 139 | } 140 | return 1; 141 | } 142 | -------------------------------------------------------------------------------- /re0.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef DEBUG 6 | #include "re.h" 7 | #else 8 | #include "regex.h" 9 | #endif 10 | #include 11 | #include 12 | #include 13 | extern "C" void abort(); 14 | 15 | /* regular expression tester 16 | usage: re [options] [pattern] 17 | options: 18 | -a ARE, augmented regular expression 19 | -b BRE, basic regular expression (default) 20 | -e ERE, extended regular expression 21 | -a ARE, augmented regular expression 22 | -i ignore case 23 | -tc trace the compilation 24 | -te trace the execution 25 | if a pattern is present, use like grep 26 | if not, enter patterns and strings as alternate lines 27 | it echoes compiled patterns in a bastard notation 28 | and prints the match array in the form 29 | (m0,n0)(m1,n1)(m2,n2)... with trailing (-1,-1) entries 30 | summarized as k*(?,?) 
31 | an empty pattern is taken as a repeat of previous */ 32 | 33 | #define UNTOUCHED -2 34 | 35 | void matchprint(regmatch_t *m, char *s, int n) 36 | { 37 | int i; 38 | regoff_t so, eo; 39 | char *p; 40 | if(m == 0) 41 | printf("No match\n"); 42 | else { 43 | while(--n >= 0) 44 | if(m[n].rm_so != UNTOUCHED) 45 | break; 46 | int k = 0; 47 | while(n>=0 && m[n].rm_so==-1) { 48 | n--; 49 | k++; 50 | } 51 | for(i=0; i<=n; i++) { 52 | so = m[i].rm_so; 53 | if(so < 0) 54 | printf("(?,?)"); 55 | else { 56 | eo = m[i].rm_eo; 57 | printf("(%ld,%ld) ", so, eo); 58 | for(p=s+so; pprint(); 136 | #endif 137 | printf("\n"); 138 | if(fgets(s,sizeof s,stdin) == 0) 139 | break; 140 | *strchr(s, '\n') = 0; 141 | for(i=0; i 2 | 3 | * Rename all *.c files to *.cpp. 4 | * Makefile: Adjusted accordingly. Add rule for sed0.cpp, use 5 | CXXFLAGS. 6 | * README.md: Updated. 7 | 8 | 2018-07-22 Arnold D. Robbins 9 | 10 | * testsed.sh: Fix calls to close() in embedded awk program. 11 | * sed0.c: Remove trailing whitespace from one of the lines. 12 | * Makefile: Compile sed for debugging. 13 | * README.md: Updated. 14 | 15 | 2018-07-20 Arnold D. Robbins 16 | 17 | Get things to compile without warnings. 18 | 19 | * Makefile (retest): Use ./testre. Remove all references to 20 | Dr. McIlroy's private malloc.c. Remove references to array.c. 21 | * array.h (realloc): Move the definition here from array.c. 22 | * array.c: Removed, and all references to it removed. 23 | * grep.c: Include for declaration of getopt(). 24 | (main): Add type to declaration. 25 | (getline, execute, doregerror, warn, error): Use const char * 26 | instead of char *. 27 | (execute): Call line.bytes() to get the data for printing. 28 | Fixes a core dump. 29 | * re.h (Rex::dprint): Use const char *. 30 | (Dup::in): Add leading type to declaration. 31 | (Kmp::parse): Add leading type to declaration. 32 | (orset): Renamed from 'or' which is a keyword in modern C++. 33 | Fix all uses. 
34 | * re1.c (printpos): Use %ld for a long value. 35 | (Rex::dprint): Use const char *. 36 | * sed.h (warn, quit, synwarn, syntax, stdouterr): Use const char *. 37 | * sed0.c (ustrncmp): Remove leading cast to uchar*, use const 38 | for first parameter's cast. 39 | (main): Add type to declaration. 40 | (struct input): Need tag for some compilers. 41 | * sed1.c (ustrcmp, ustrcrpy, ustrchr): Use const for first parameter. 42 | (synwarn, syntax): Use const char *. 43 | * sed2.c (stdouterr): Use const char *. 44 | (struct digram): Need tag for some compilers. 45 | * README.md: Updated. 46 | 47 | 2018-07-20 M. Douglas McIlroy 48 | 49 | Add test files to the repo: 50 | 51 | -rwxr-xr-x 1 doug faculty 6258 Jun 5 1996 testgrep.sh 52 | -rw-r--r-- 1 doug faculty 8328 Apr 16 1997 testsed.sh 53 | 54 | 2018-05-30 Arnold D. Robbins 55 | 56 | * README.md: Updated. 57 | 58 | 2018-05-30 M. Douglas McIlroy 59 | 60 | Add array.c to the repo. 61 | 62 | -rw-r--r-- 1 doug faculty 375 Dec 1 1995 array.c 63 | 64 | 2017-10-15 Arnold D. Robbins 65 | 66 | * LICENSE: New file, added after consultation with Dr. McIlroy. 67 | 68 | 2017-10-10 Arnold D. Robbins 69 | 70 | * README.md: New file, for GitHub. 71 | 72 | 2017-10-10 M. Douglas McIlroy 73 | 74 | * re0.c, re1.c, re2.c, testre.c: Changes after running 75 | through a current C++ compiler. 76 | 77 | 2003-01-26 M. Douglas McIlroy 78 | 79 | Additional files while attempting to modernize the library, 80 | in the `additional' directory: 81 | 82 | -rw-r--r-- 1 doug faculty 25 Jan 26 2003 formal.ref.h 83 | -rw-r--r-- 1 doug faculty 8218 Jan 26 2003 nre.h 84 | -rw-r--r-- 1 doug faculty 1706 Jan 26 2003 nregex.h 85 | -rw-r--r-- 1 doug faculty 370 Jan 26 2003 re3.c 86 | -rw-r--r-- 1 doug faculty 4875 Jan 26 2003 ref.c 87 | -rw-r--r-- 1 doug faculty 4622 Jan 26 2003 regexl.h 88 | -rw-r--r-- 1 doug faculty 7976 Jan 26 2003 zre.h 89 | -rw-r--r-- 1 doug faculty 1729 Jan 26 2003 zregex.h 90 | 91 | 1998-03-04 M. 
Douglas McIlroy 92 | 93 | Initial commit: 94 | 95 | -rw-r--r-- 1 doug staff 2737 Jun 6 1995 Makefile 96 | -rw-rw-r-- 1 doug staff 2968 Mar 4 1998 README 97 | -rw-rw-r-- 1 doug staff 536 Sep 27 1995 array.h 98 | -rw-rw-r-- 1 doug staff 5654 Jun 4 1996 grep.c 99 | -rw-rw-r-- 1 doug staff 2489 Jun 5 1996 mkfile 100 | -rw-rw-r-- 1 doug staff 8021 Mar 4 1998 re.h 101 | -rw-rw-r-- 1 doug staff 3154 Jan 21 1996 re0.c 102 | -rw-rw-r-- 1 doug staff 21376 Mar 4 1998 re1.c 103 | -rw-rw-r-- 1 doug staff 21751 Mar 4 1998 re2.c 104 | -rw-rw-r-- 1 doug staff 1585 Jun 5 1996 regex.h 105 | -rw-rw-r-- 1 doug staff 1298 Jun 8 1993 regexp.h 106 | -rw-r--r-- 1 doug staff 8382 Sep 11 1997 sed.1 107 | -rw-rw-r-- 1 doug staff 2819 Sep 11 1997 sed.h 108 | -rw-rw-r-- 1 doug staff 4501 Sep 11 1997 sed0.c 109 | -rw-rw-r-- 1 doug staff 12798 Sep 11 1997 sed1.c 110 | -rw-rw-r-- 1 doug staff 8739 Apr 16 1997 sed2.c 111 | -rw-rw-r-- 1 doug staff 2859 Sep 11 1997 sed3.c 112 | -rw-rw-r-- 1 doug staff 9464 Mar 4 1998 testre.c 113 | -rw-rw-r-- 1 doug staff 11557 Jan 25 1996 testre.dat 114 | -------------------------------------------------------------------------------- /additional/regexl.h: -------------------------------------------------------------------------------- 1 | /* Posix BRE (basic regular expression) recognizer. 2 | A backtrack parser on Floyd's model. (Backtracking is 3 | nearly unavoidable; BRE parsing is NP-complete, aside 4 | from the minor limitation to 9 backreferences.) 5 | 6 | The program realizes an outside-in understanding 7 | of the longest-leftmost rule, which chooses "the 8 | longest" recursively by length of substrings in 9 | an in-order walk over a parse tree (where catenation 10 | is right-associative). */ 11 | 12 | enum RexType { 13 | OK, /* always succeed, used internally */ 14 | ANCHOR, /* initial ^ */ 15 | END, /* final $ */ 16 | DOT, /* . */ 17 | STRING, /* some chars */ 18 | CLASS, /* [...] 
*/ 19 | BACK, /* \1, \2, etc */ 20 | SUBEXP, /* \(...\) */ 21 | REP, /* closure */ 22 | SEQ, /* concatenation */ 23 | NREXTYPE /* dummy */ 24 | }; 25 | 26 | enum { REGEX = NREXTYPE }; /* for debugging */ 27 | 28 | typedef adt Seg; 29 | typedef adt Rex; 30 | typedef (int, Seg*, int) Match; /* length, subexpressions, best */ 31 | enum { NMATCH = 9 }; 32 | enum { DONE = -1 }; 33 | 34 | /* A parsing function takes a regular expression e, a Segment s, 35 | an array m of subexpression matches, and a demand channel c. 36 | It communicates back to its caller a sequence of match lengths 37 | and subexpression-match lists. A length of DONE 38 | denotes no more matches. Each time a better match 39 | is reported, the "best" flag is 1, else 0. 40 | 41 | A demand channel (Chan) has a control channel, by which 42 | the caller asks for another alternative (MORE), an end 43 | of parsing (KILL), or to RESET the calculation of "best". */ 44 | 45 | /* the functions .make, .unmake initialize and destroy; 46 | in other adt's functions .new and .free do heap allocation */ 47 | 48 | enum Ctl { MORE, KILL, RESET }; 49 | adt Chan { 50 | extern chan(Ctl) ctl; 51 | extern chan(Match) data; 52 | Chan make(); 53 | void unmake(Chan); 54 | }; 55 | 56 | typedef void Parse(Rex *e, Seg s, Seg *m, Chan c); 57 | 58 | Parse *parse[]; 59 | 60 | void (*rexprint[])(Rex *); 61 | 62 | void (*rexfree[])(Rex *); 63 | 64 | /* A segment is a string pointer and length. A length 65 | of -1 for a subexpression match means no match.
66 | 67 | Function prefix returns 1 if segment a is a prefix 68 | of segment b, else 0 69 | 70 | Function append appends segment s as backref pattern n 71 | to match list m1, clearing out all later backref pats, 72 | putting result in match list m2 73 | Function clear copies m1 to m2 and clears from backref 74 | n onwards */ 75 | 76 | adt Seg { 77 | extern byte *p; 78 | extern int n; 79 | Seg make(byte *p, int n); 80 | int prefix(Seg a, Seg b); 81 | void append(Seg s, Seg *m1, int n, Seg *m2); 82 | void clear(int n, Seg *m1, Seg *m2); 83 | void next(*Seg); 84 | void prev(*Seg); 85 | }; 86 | 87 | void matchprint(Seg *m, Seg s); 88 | 89 | /* A set of ascii characters, represented as a bitstring */ 90 | 91 | adt Set { 92 | byte cl[256/8]; 93 | void clear(*Set); 94 | void init(*Set, Seg s); 95 | void insert(*Set, byte c); 96 | int in(*Set, byte c); 97 | void or(*Set, Set*); 98 | void neg(*Set); 99 | }; 100 | 101 | /* adt's for the various kinds of Rex. Each has a 102 | recognizer .parse, a .new, a .free, and a debugging 103 | .print function */ 104 | 105 | adt Ok { 106 | Rex *new(); 107 | void free(Rex*); 108 | Parse parse; 109 | void print(*Rex); 110 | }; 111 | 112 | adt Anchor { 113 | extern Rex *rex; 114 | Rex *new(Rex *e); 115 | void free(Rex*); 116 | Parse parse; 117 | void print(*Rex); 118 | }; 119 | 120 | adt End { 121 | Rex *new(); 122 | void free(Rex*); 123 | Parse parse; 124 | void print(*Rex); 125 | }; 126 | 127 | adt Dot { 128 | Rex *new(); 129 | void free(Rex*); 130 | Parse parse; 131 | void print(*Rex); 132 | }; 133 | 134 | adt Class { 135 | Set cl; 136 | Rex *new(Seg); 137 | void free(Rex*); 138 | Parse parse; 139 | int in(*Class, int c); 140 | void or(*Class, Set*); 141 | void neg(*Class); 142 | void print(*Rex); 143 | }; 144 | 145 | adt String { 146 | extern Seg seg; 147 | Rex *new(Seg); 148 | void free(Rex*); 149 | Parse parse; 150 | void print(*Rex); 151 | }; 152 | 153 | adt Back { 154 | int n; 155 | Rex *new(int); 156 | void free(Rex*); 157 | Parse
parse; 158 | void print(*Rex); 159 | }; 160 | 161 | adt Subexp { 162 | int n; 163 | extern Rex *rex; 164 | Rex *new(int n, Rex *); 165 | void free(Rex*); 166 | Parse parse; 167 | void print(*Rex); 168 | }; 169 | 170 | adt Rep { 171 | int lo; 172 | int hi; 173 | int n; /* largest preceding backref index */ 174 | extern Rex *rex; 175 | Rex *new(int lo, int hi, int n, Rex*); 176 | void free(Rex*); 177 | Parse parse; 178 | void print(*Rex); 179 | }; 180 | 181 | adt Seq { 182 | extern Rex *rex; 183 | extern Rex *seq; 184 | Rex *new(Rex *rex, Rex *seq); 185 | void free(Rex*); 186 | Parse parse; 187 | void print(*Rex); 188 | }; 189 | enum Hard { EASY, HARD }; 190 | aggr Stat { /* used only in Rex.hard1() */ 191 | int n; /* length of regex, if no closure or backref */ 192 | int s; /* number of simple closures */ 193 | int c; /* number of closures */ 194 | int b; /* number of backrefs */ 195 | }; 196 | adt Rex { 197 | extern RexType type; 198 | union { 199 | Ok; 200 | Anchor; 201 | End; 202 | Dot; 203 | String; 204 | Class; 205 | Back; 206 | Subexp; 207 | Rep; 208 | Seq; 209 | }; 210 | Rex *new(*Rex, RexType type); /* called from new()s */ 211 | Hard hard(*Rex); 212 | intern Stat hard1(*Rex); 213 | void free(*Rex); 214 | void print(*Rex); 215 | }; 216 | 217 | /* parse a possibly anchored regular expression */ 218 | 219 | Seg *regex(Rex *rex, Seg s); 220 | Rex *regcomp(Seg s); 221 | 222 | -------------------------------------------------------------------------------- /sed0.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "sed.h" 7 | 8 | void readscript(Text*, char*); 9 | void copyscript(Text*, uchar*); 10 | void initinput(int, char **); 11 | FILE *aopen(char*); 12 | 13 | #define ustrncmp(a,b,c) strncmp((const char*)(a), (const char*)(b), c) 14 | 15 | int recno; /* current record number */ 16 | int nflag; /* nonprint option */ 17 | int qflag; /* command q 
executed */ 18 | int sflag; /* substitution has occurred */ 19 | int bflag; /* strip leading blanks from c,a,i */ 20 | int options; /* conjunction, negation */ 21 | 22 | int 23 | main(int argc, char **argv) 24 | { 25 | static Text script; 26 | static Text data; 27 | for(;;) { 28 | switch(getopt(argc, argv, "bnf:e:")) { 29 | case 'b': 30 | bflag++; 31 | continue; 32 | case 'e': 33 | copyscript(&data, (uchar*)optarg); 34 | continue; 35 | case 'f': 36 | readscript(&data, optarg); 37 | continue; 38 | case 'n': 39 | nflag++; 40 | continue; 41 | case '?': 42 | quit("usage: sed [-n] script [files]\n" 43 | " sed [-n] [-f scriptfile] " 44 | "[-e script] [files]"); 45 | case -1: 46 | break; 47 | } 48 | break; 49 | } 50 | if(data.s == 0) { 51 | if(optind >= argc) 52 | quit("no script"); 53 | copyscript(&data, (uchar*)argv[optind++]); 54 | } 55 | if(ustrncmp(data.s, "#n", 2) == 0) 56 | nflag = 1; 57 | copyscript(&data, (uchar*)"\n\n"); /* e.g. s/a/\ */ 58 | compile(&script, &data); 59 | /* printscript(&script); /* debugging */ 60 | 61 | initinput(argc-optind, argv+optind); 62 | for(;;) { 63 | data.w = data.s; 64 | if(!readline(&data)) 65 | break; 66 | execute(&script, &data); 67 | } 68 | if(fclose(stdout) == EOF) 69 | quit(stdouterr); 70 | return 0; 71 | } 72 | 73 | void 74 | grow(Text *t, int n) 75 | { 76 | int w = t->w - t->s; 77 | int e = t->e - t->s + (n/BUFSIZ+1)*BUFSIZ; 78 | t->s = (uchar*)realloc(t->s, e); 79 | if(t->s == 0) 80 | quit("out of space"); 81 | t->w = t->s + w; 82 | t->e = t->s + e; 83 | } 84 | 85 | /* BUG: a segment that ends with a comment whose 86 | last character is \ causes a diagnostic */ 87 | 88 | void 89 | safescript(Text *t) 90 | { 91 | if(t->w > t->s+1 && t->w[-2] == '\\') 92 | warn("script segment ends with \\"); 93 | } 94 | 95 | void 96 | readscript(Text *t, char *s) 97 | { 98 | int n; 99 | FILE *f = aopen(s); 100 | for(;;) { 101 | assure(t, 4); 102 | n = fread(t->w, 1, t->e - t->w - 3, f); 103 | if(n <= 0) 104 | break; 105 | t->w += n; 106 | } 
107 | fclose(f); 108 | if(t->w > t->s && t->w[-1] != '\n') { 109 | *t->w++ = '\n'; 110 | warn("newline appended to script segment"); 111 | } 112 | *t->w = 0; 113 | safescript(t); 114 | } 115 | 116 | void 117 | copyscript(Text *t, uchar *s) 118 | { 119 | do { 120 | assure(t, 2); 121 | } while(*t->w++ = *s++); 122 | if(--t->w > t->s && t->w[-1] != '\n') { 123 | *t->w++ = '\n'; 124 | *t->w = 0; 125 | } 126 | safescript(t); 127 | } 128 | 129 | /* DATA INPUT */ 130 | 131 | struct input { 132 | int iargc; /* # of files not fully read */ 133 | char **iargv; /* current file */ 134 | FILE *ifile; /* current input file */ 135 | } input; 136 | /* getch fetches char from current file 137 | returns EOF at final end of file 138 | leaves iargc==0 after line $ 139 | */ 140 | #define getch(cp) if((*(cp)=getc(input.ifile))==EOF) \ 141 | *(cp)=gopen(); else 142 | int gopen(void); /* called only by getch() */ 143 | 144 | int 145 | readline(Text *t) 146 | { 147 | int c; 148 | int len = t->w - t->s; 149 | coda(); 150 | if(qflag || ateof()) 151 | return 0; 152 | for(;;) { 153 | assure(t, 2); 154 | getch(&c); 155 | if(c == '\n') 156 | break; 157 | else if(c != EOF) 158 | *t->w++ = c; 159 | else if(t->w - t->s == len) 160 | return 0; 161 | else { 162 | warn("newline appended"); 163 | break; 164 | } 165 | } 166 | *t->w = 0; /* for safety */ 167 | getch(&c); /* to identify line $ */ 168 | if(c != EOF) 169 | ungetc(c, input.ifile); 170 | recno++; 171 | sflag = 0; 172 | return 1; 173 | } 174 | 175 | int 176 | gopen(void) 177 | { 178 | int c = EOF; 179 | while(c==EOF && --input.iargc > 0) { 180 | fclose(input.ifile); 181 | input.ifile = aopen(*++input.iargv); 182 | c = getc(input.ifile); 183 | } 184 | return c; 185 | } 186 | 187 | int 188 | ateof(void) 189 | { 190 | return input.iargc <= 0; 191 | } 192 | 193 | void 194 | initinput(int argc, char **argv) 195 | { 196 | input.iargc = argc; 197 | input.iargv = argv; 198 | if(input.iargc == 0) { 199 | input.iargc = 1; /* for ateof() */ 200 | 
input.ifile = stdin; 201 | } else 202 | input.ifile = aopen(*input.iargv); 203 | } 204 | 205 | FILE * 206 | aopen(char *s) 207 | { 208 | FILE *f = fopen(s, "r"); 209 | if(f == 0) 210 | quit("cannot open %s", s); 211 | return f; 212 | } 213 | 214 | void 215 | warn(const char *format, ...) 216 | { 217 | va_list args; 218 | fprintf(stderr,"sed warning: "); 219 | va_start(args, format); 220 | vfprintf(stderr, format, args); 221 | va_end(args); 222 | fprintf(stderr,"\n"); 223 | } 224 | 225 | void 226 | quit(const char *format, ...) 227 | { 228 | va_list args; 229 | fprintf(stderr,"sed error: "); 230 | va_start(args, format); 231 | vfprintf(stderr, format, args); 232 | va_end(args); 233 | fprintf(stderr,"\n"); 234 | exit(1); 235 | } 236 | 237 | /* debugging code 1; compile and execute stubs. 238 | simply prints the already collected script and 239 | prints numbered input lines 240 | 241 | void 242 | compile(Text *script, Text *t) 243 | { 244 | uchar *s = t->s; 245 | assure(script, 1); 246 | *script->w++ = 0; 247 | while(*s) putchar(*s++); 248 | } 249 | 250 | void 251 | execute(Text *x, Text *y) 252 | { 253 | x = x; 254 | printf("%d: %s", recno, y->s); 255 | } 256 | 257 | */ 258 | -------------------------------------------------------------------------------- /additional/ref.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | /* 4 | Slow and certain Posix RE recognizer. 
5 | */ 6 | 7 | /* 8 | stack of endpoints for adjudicating best match */ 9 | 10 | char line[100]; 11 | char *last; 12 | 13 | enum { NMATCH = 20 }; 14 | struct Match { 15 | char *s; // start pointer 16 | char *e; // end pointer 17 | }; 18 | Match match[NMATCH]; 19 | Match bestmatch[NMATCH]; 20 | 21 | struct Save { 22 | Match space[NMATCH]; 23 | Save(int nsub = NMATCH); 24 | restore() { memmove(match, space, sizeof match); } 25 | }; 26 | 27 | Save::Save(int nsub) 28 | { 29 | memmove(space, match, sizeof match); 30 | memset(&match[nsub],0,(NMATCH-nsub)*sizeof(Match)); 31 | } 32 | 33 | struct End { 34 | enum { SIZE = 100 }; 35 | int n; 36 | struct { 37 | int serial; 38 | char *s; 39 | } pos[SIZE]; 40 | } end, best; 41 | 42 | inline void push() { end.n++; } 43 | inline void pop() { end.:n--; } 44 | void push(int serial, int s) 45 | { 46 | end.pos[end.n].serial = serial; 47 | end.pos[end.n].s = s; 48 | push(); 49 | if(end.n >= End::SIZE) 50 | abort(); 51 | }; 52 | 53 | struct Rex { 54 | int serial; 55 | virtual void print() { } 56 | virtual void parse(char *s, Rex *cont); 57 | }; 58 | 59 | /* Continuation routines to get control at the end of 60 | a component. 
61 | Coda: simple coda 62 | ACoda: Alt coda backpatch position into stack 63 | PCoda: push position onto stack 64 | SCoda: push; also record regmatch_t.eo 65 | */ 66 | Struct Coda : Rex { 67 | Rex *rex; // the successor 68 | Rex *cont; // and its continuation 69 | Coda(Rex *rex, Rex *cont) : rex(rex), cont(cont) { } 70 | void parse(char *s, Rex *cont); 71 | }; 72 | 73 | struct Pcoda : Coda { 74 | int serial; 75 | Pcoda(int serial, Rex *rex, Rex *cont) 76 | : serial(serial), Coda(rex, cont) { } 77 | void parse(char *s, Rex *cont); 78 | }; 79 | 80 | struct ACoda : Coda { 81 | int n; // original value of end.n 82 | ACoda(Rex *rex, Rex *cont, int n) 83 | : Coda(rex, cont), n(n) { } 84 | void parse(char *s, Rex *cont); 85 | }; 86 | 87 | struct SCoda : PCoda { 88 | int nsub; 89 | SCoda(int serial, int nsub, Rex *rex, Rex *cont) 90 | : nsub(nsub), PCoda(serial, rex, cont) { } 91 | void parse(char *s, Rex *cont); 92 | }; 93 | 94 | struct Done : Rex { 95 | void parse(char *s, Rex *cont); 96 | } 97 | 98 | struct Char : Rex { 99 | char c; 100 | Char(int c) c(c) { } 101 | void print() { printf("<%d:%c>", serial, c); } 102 | void parse(char *s, Rex *cont); 103 | }; 104 | 105 | struct Dot : Rex { 106 | void print(); { printf(".<%d>", serial); } 107 | void parse(char *s, Rex *cont); 108 | }; 109 | 110 | struct Seq : Rex { 111 | Rex *left; 112 | Rex *right; 113 | Seq(Rex *left, Rex *right) : left(left), right(right) { } 114 | void print(); { left->print(); right->print(); } 115 | void parse(char *s, Rex *cont); 116 | }; 117 | 118 | struct Alt : Rex { 119 | Rex *left; 120 | Rex *right; 121 | Alt(Rex *left, Rex *right) : left(left), right(right) { } 122 | void print() { 123 | void parse(char *s, Rex *cont); 124 | }; 125 | 126 | void Alt::print() 127 | { 128 | printf("<%d:", serial); 129 | left->print(); 130 | printf("|"); 131 | right->print(); 132 | printf(">"); 133 | } 134 | 135 | struct Sub : Rex { 136 | int nsub; 137 | Rex *rex; 138 | Sub(int nsub, Rex *rex) : nsub(nsub), 
rex(rex) { } 139 | void print(); 140 | void parse(char *s, Rex *cont); 141 | }; 142 | 143 | void Sub::print() 144 | { 145 | printf("(%d:", serial); 146 | rex->print(); 147 | printf(")"); 148 | } 149 | 150 | struct Back : Rex { 151 | int nsub; 152 | void print() { printf("<%d:\\%d>", serial, nsub); } 153 | void parse(char *s, Rex *cont); 154 | }; 155 | 156 | struct Rep : Rex { 157 | Rex *rex; 158 | void print(); 159 | void parse(char *s, Rex *cont); 160 | }; 161 | 162 | void Rep::print() 163 | { 164 | printf("<%d:", serial); 165 | rex->print(); 166 | printf(">*"); 167 | } 168 | 169 | void Done::parse(char *s, Rex *cont) 170 | { 171 | int i; 172 | for(i=0; i"end.pos[i].serial,end.pos[i].s-line); 174 | printf("\n"); 175 | } 176 | 177 | void Char::parse(char *s, Rex *cont) 178 | { 179 | if(s>=last || *s!=c) 180 | return; 181 | push(serial, s); 182 | cont->parse(s+1, 0, tree); 183 | pop(); 184 | } 185 | 186 | void Dot::parse(char *s, Rex *cont) 187 | { 188 | if(s>=last) 189 | return; 190 | push(serial, s); 191 | cont->parse(s+1, 0, tree); 192 | pop(); 193 | } 194 | 195 | void Alt::parse(char *s, Rex *cont) 196 | { 197 | Save save; 198 | ACoda coda(cont, 0, end.n); 199 | push(serial, 0); 200 | left->parse(s, &coda); 201 | save.restore(); 202 | right->parse(s, &coda); 203 | pop(); 204 | save.restore(); 205 | } 206 | 207 | void Seq::parse(char *s, Rex *cont) 208 | { 209 | Next next(right, cont); 210 | left->parse(s, &next); 211 | } 212 | 213 | void Sub::parse(char *s, Rex *cont) 214 | { 215 | Save save; 216 | SCoda coda(serial, nsub, cont, 0); 217 | rex->parse(s, &coda); 218 | save.restore(); 219 | } 220 | 221 | void Rep::parse(char *s, Rex *cont) 222 | { 223 | PCoda coda(serial, rex, &coda); 224 | push(serial, s); 225 | cont->parse(s, 0); 226 | pop(); 227 | } 228 | 229 | void Coda::parse(char *s, Rex*) 230 | { 231 | rex->parse(s, cont); 232 | } 233 | 234 | void Pcoda::parse(char *s, Rex*) 235 | { 236 | push(serial, s); 237 | Coda::parse(s, 0); 238 | pop(); 239 | } 240 | 
241 | void ACoda::parse(char *s, Rex*) 242 | { 243 | end.pos[end.n].s = s; 244 | Coda::parse(s, 0); 245 | } 246 | 247 | void SCoda::parse(char *s, Rex*) 248 | { 249 | match[nsub].e = s; 250 | PCoda::parse(s, 0); 251 | } 252 | 253 | Rex *mkAlt() 254 | { 255 | Rex *alt1 = mkSeq(); 256 | if(*s != '|') 257 | return alt1; 258 | s++; 259 | Rex *alt2 = mkAlt(); 260 | return new Alt(alt1, alt2); 261 | } 262 | 263 | Rex *mkSeq() 264 | { 265 | Rex *seq1 = mkPrim(); 266 | if(*s == '*') { 267 | s++; 268 | seq1 = new Rep(seq1); 269 | } 270 | Rex *seq2 = MkSeq(); 271 | if(seq2 == 0) 272 | return seq1; 273 | return new Seq(seq1, seq2); 274 | } 275 | 276 | Rex *mkPrim() 277 | { 278 | Rex *prim; 279 | if(*s == 0) 280 | return 0; 281 | if(*s == '(') { 282 | s++: 283 | prim = mkAlt(); 284 | prim = new Sub(++nsub, prim); 285 | if(*s++ != ')') 286 | abort(); 287 | } else if(s == '.') { 288 | s++; 289 | prim = new Dot; 290 | } else if(isalpha(*s)) 291 | prim = new Char(*s++); 292 | else 293 | abort(); 294 | return prim; 295 | -------------------------------------------------------------------------------- /grep.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "array.h" 7 | #include "regex.h" 8 | 9 | 10 | /* this grep is based on the Posix re package. 11 | unfortunately it has to have a nonstandard interface. 12 | 1. fgrep does not have usual operators. REG_LITERAL 13 | caters for this. 14 | 2. grep allows null expressions, hence REG_NULL. 15 | 3. it may be possible to combine the multiple 16 | patterns of grep into single patterns. important 17 | special cases are handled by regcomb(). 18 | 4. anchoring by -x has to be done separately from 19 | compilation (remember that fgrep has no ^ or $ operator), 20 | hence REG_ANCH. 
(An honest, but slow 21 | alternative: run regexec with REG_NOSUB off and nmatch=1 22 | and check whether the match is full length) 23 | */ 24 | 25 | int Eflag; // like egrep 26 | int Fflag; // like fgrep 27 | int cflag; // count number of hits 28 | int lflag; // list files with hits 29 | int qflag; // quiet; return status but no output 30 | int nflag; // line numbers 31 | int sflag; // no messages for unopenable files 32 | int vflag; // reverse sense; seek nonmatches 33 | int hflag; // do not print file-name headers 34 | 35 | /* the Array<> definitions allow for a quantity of patterns, 36 | or a length of input line that is unbounded except by 37 | the amount of memory available */ 38 | 39 | Array argpat; 40 | int nargpat; 41 | Array filepat; 42 | int nfilepat; 43 | Array re; 44 | int nre; 45 | Array line; 46 | 47 | int hits, anyhits; 48 | int retval = 1; // what to return for no hits 49 | int options = REG_NOSUB | REG_NULL; 50 | int nfiles; 51 | 52 | void grepcomp(); 53 | void docomp(char *s); 54 | int getline(FILE *input, const char *name); 55 | void execute(FILE *input, const char *name); 56 | void doregerror(int result, const char *name, int lineno); 57 | void warn(const char *s, const char *t); 58 | void error(const char *s, const char *t); 59 | 60 | int 61 | main(int argc, char **argv) 62 | { 63 | for(;;) { 64 | switch(getopt(argc, argv, "AEFclqinsvxe:f:h")) { 65 | case 'A': 66 | options |= REG_AUGMENTED; 67 | if(REG_AUGMENTED) 68 | continue; 69 | 70 | case '?': 71 | fprintf(stderr, 72 | "usage: grep -EFclqinsvxh pattern [file] ...\n" 73 | " grep -EFclqinsvxh -ef pattern-or-file ... 
[file] ...\n"); 74 | exit(2); 75 | case 'E': 76 | Eflag = 1; 77 | options |= REG_EXTENDED; 78 | continue; 79 | case 'F': 80 | Fflag = 1; 81 | options |= REG_LITERAL; 82 | continue; 83 | case 'c': 84 | cflag = 1; 85 | continue; 86 | case 'l': 87 | lflag = 1; 88 | continue; 89 | case 'q': 90 | qflag = 1; 91 | continue; 92 | case 'i': 93 | options |= REG_ICASE; 94 | continue; 95 | case 'n': 96 | nflag = 1; 97 | continue; 98 | case 's': 99 | sflag = 1; 100 | continue; 101 | case 'v': 102 | vflag = 1; 103 | continue; 104 | case 'x': 105 | options |= REG_ANCH; 106 | continue; 107 | case 'h': 108 | hflag = 1; 109 | continue; 110 | case 'e': 111 | argpat.assure(nargpat); 112 | argpat[nargpat++] = optarg; 113 | continue; 114 | case 'f': 115 | filepat.assure(nfilepat); 116 | filepat[nfilepat++] = optarg; 117 | continue; 118 | case -1: 119 | break; 120 | } 121 | break; 122 | } 123 | if(nargpat + nfilepat == 0) 124 | if(optind >= argc) 125 | error("no pattern", ""); 126 | else 127 | argpat[nargpat++] = argv[optind++]; 128 | if(Fflag+Eflag > 1) 129 | error("-E and -F are incompatible", ""); 130 | grepcomp(); 131 | nfiles = argc - optind; 132 | if(nfiles <= 0) 133 | execute(stdin, "(standard input)"); 134 | else for( ; optind= 0) 170 | docomp(&line[0]); 171 | else if(!sflag) 172 | error("cannot open", filepat[i]); 173 | else 174 | retval = 2; 175 | fclose(patfile); 176 | } 177 | if(nre == 0) 178 | error("no pattern", ""); 179 | } 180 | 181 | void 182 | docomp(char *s) 183 | { 184 | if(re.assure(nre)) 185 | error("out of space at--", s); 186 | int result = regcomp(&re[nre], s, options); 187 | if(result) 188 | doregerror(result, s, 0); 189 | if(!nre || !regcomb(&re[nre-1], &re[nre])) 190 | nre++; 191 | } 192 | 193 | int 194 | getline(FILE *input, const char *name) 195 | { 196 | int c, j; 197 | for(j=0; ; j++) { 198 | if(line.assure(j)) 199 | error("out of space reading ", name); 200 | switch(c = getc(input)) { 201 | default: 202 | line[j] = c; 203 | continue; 204 | case EOF: 205 | 
if(j == 0) 206 | return -1; 207 | warn("newline appended to ", name); 208 | case '\n': 209 | line[j] = 0; 210 | return j; 211 | } 212 | } 213 | } 214 | 215 | void 216 | execute(FILE *input, const char *name) 217 | { 218 | int i, lineno; 219 | int hits = 0; 220 | for(lineno=1; ; lineno++) { 221 | int n = getline(input, name); 222 | if(n < 0) 223 | break; 224 | for(i=0; i1 && !hflag) 238 | printf("%s:", name); 239 | if(nflag) 240 | printf("%d:", lineno); 241 | printf("%s\n", line.bytes()); 242 | } 243 | } 244 | if(hits) 245 | anyhits = 1; 246 | if(qflag) 247 | return; 248 | if(lflag && hits) 249 | printf("%s\n", name); 250 | if(!lflag && cflag) { 251 | if(nfiles>1 && !hflag) 252 | printf("%s:", name); 253 | printf("%d\n", hits); 254 | } 255 | } 256 | 257 | void 258 | doregerror(int result, const char *name, int lineno) 259 | { 260 | char errbuf[100]; 261 | if(result==0 || result==REG_NOMATCH) 262 | return; 263 | regerror(result, 0, errbuf, sizeof(errbuf)); 264 | fprintf(stderr, "grep: %s: %s", errbuf, name); 265 | if(lineno) 266 | fprintf(stderr, ":%d\n", lineno); 267 | else 268 | fprintf(stderr, "\n"); 269 | exit(2); 270 | } 271 | 272 | void 273 | warn(const char *s, const char *t) 274 | { 275 | fprintf(stderr, "grep: %s %s\n", s, t); 276 | } 277 | 278 | void 279 | error(const char *s, const char *t) 280 | { 281 | warn(s, t); 282 | exit(2); 283 | } 284 | 285 | -------------------------------------------------------------------------------- /testgrep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # this test script for grep assumes that the regular expression 3 | # package has been tested independently 4 | 5 | echo 6 | echo program under test: 7 | which grep 8 | echo test numbers below denote progress, not trouble 9 | echo 10 | 11 | # for tests that produce a file "out", given the answer in "expect" 12 | # usage: compare testnumber 13 | compare() { 14 | if cmp out expect >/dev/null 2>&1 15 | then : 16 | else 
echo test $1 failed 17 | fi 18 | } 19 | 20 | # for tests that produce a single line of output 21 | # usage: check 'expected value' testnumber 22 | check() { 23 | grep -x -q "$1" || echo $2 failed 24 | } 25 | 26 | # for tests that are expected to produce an empty "out" file 27 | # usage: empty testnumber 28 | empty() { 29 | if test -s out 30 | then echo test $1 failed 31 | fi 32 | } 33 | 34 | trap "rm -f in out expect pat; exit" 0 1 2 13 15 35 | 36 | #--------------------------------------------- 37 | TEST=00 # -q, needed by check() 38 | echo $TEST 39 | 40 | grep -q . /dev/null > out && echo ${TEST}A failed 41 | empty ${TEST}B 42 | grep -q -v . /dev/null >out && echo ${TEST}C failed 43 | empty ${TEST}D 44 | (echo x | grep -q . >out) || echo ${TEST}E failed 45 | empty ${TEST}F 46 | (echo x | grep -v -q . >out) && echo ${TEST}G failed 47 | empty ${TEST}H 48 | 49 | #--------------------------------------------- 50 | TEST=01 # basic sanity of BRE, ERE, -x, -v, -e, 51 | echo $TEST 52 | 53 | awk 'BEGIN{ for(i=0;i<=10000;i++) print i }' >in expect <out 62 | compare ${TEST}A 63 | 64 | grep -x '10*' in >out 65 | compare ${TEST}B 66 | 67 | grep -x -e '10*' in >out 68 | compare ${TEST}C 69 | 70 | grep -E '^1(00)*0?$' in >out 71 | compare ${TEST}D 72 | 73 | grep -x '[^[:digit:]]*[[=one=]][[.zero.]]\{0,\}' in >out 74 | compare ${TEST}E 75 | 76 | grep -x -E '[^[:digit:]]*[[=one=]][[.zero.]]{0,}' in >out 77 | compare ${TEST}F 78 | 79 | grep -e '^1$\(0$\2\)*$ 80 | ^10$\(0$\2\)*$' in >out 81 | compare ${TEST}G 82 | 83 | grep -e '^1$\(0$\2\)*$' -e '^10$\(0$\2\)*$' in >out 84 | compare ${TEST}H 85 | 86 | grep -e '1$\(0$\2\)*' -e '10$\(0$\2\)*' -x in >out 87 | compare ${TEST}I 88 | 89 | grep -v -E '[2-9]|1.*1|^0' in >out 90 | compare ${TEST}J 91 | 92 | grep -E -x '1(0{0,2}){1,2}' in >out 93 | compare ${TEST}K 94 | 95 | grep -E '1*^10*$' in >out 96 | compare ${TEST}L 97 | 98 | #--------------------------------------------- 99 | TEST=02 # character classes, -c, -i 100 | echo $TEST 101 | 
102 | # make a file of one-char lines, omitting NUL(0) and newline(10) 103 | # assumes that no byte bigger than 127 is in any char class 104 | awk 'BEGIN { for(i=1; i<256; i++) if(i!=10) printf "%c\n", i }' >in expect <out 127 | grep -c '[[:alnum:]]' in >>out 128 | grep -c '[[:alpha:]]' in >>out 129 | grep -c '[[:blank:]]' in >>out 130 | grep -c '[[:cntrl:]]' in >>out 131 | grep -c '[[:digit:]]' in >>out 132 | grep -c '[[:graph:]]' in >>out 133 | grep -c '[[:lower:]]' in >>out 134 | grep -c '[[:print:]]' in >>out 135 | grep -c '[[:punct:]]' in >>out 136 | grep -c '[[:space:]]' in >>out 137 | grep -c '[[:upper:]]' in >>out 138 | grep -c '[[:xdigit:]]' in >>out 139 | grep -c -i '[[:alnum:]]' in >>out 140 | grep -c -i '[[:alpha:]]' in >>out 141 | grep -c -i '[[:lower:]]' in >>out 142 | grep -c -i '[[:upper:]]' in >>out 143 | 144 | compare ${TEST}B 145 | 146 | #--------------------------------------------- 147 | TEST=03 # null expressions, dot 148 | echo $TEST 149 | 150 | # make a file of one-char lines, omitting NUL(0) and newline(10) 151 | awk 'BEGIN { for(i=1; i<256; i++) if(i!=10) printf "%c\n", i }' >in expect <out 163 | grep -c -x '' in >>out 164 | grep -c -x -E '' in >>out 165 | grep -c -e 'a' in >>out 166 | grep -c -e ' 167 | a' in >>out 168 | grep -c -e 'a 169 | ' in >>out 170 | grep -c -x 'a 171 | ' in >>out 172 | 173 | compare $TEST 174 | 175 | #--------------------------------------------- 176 | TEST=04 # -f, -F, big pattern 177 | echo $TEST 178 | 179 | awk 'BEGIN{ for(i=0;i<10000;i++) print i }' >in pat expect <out 198 | grep -c -f pat -v >out 199 | grep -c -F -fpat >out 200 | grep -c -F -fpat -v >out 201 | grep -c -x -fpat in >>out 202 | grep -c -x -fpat -v in >>out 203 | grep -c -x -F -f pat in >>out 204 | grep -c -x -F -f pat -v in >>out 205 | grep -c -E -fpat >out 206 | grep -c -E -fpat -v >out 207 | grep -c -x -E -f pat in >>out 208 | grep -c -x -E -f pat -v in >>out 209 | 210 | compare ${TEST}A 211 | 212 | 
#--------------------------------------------- 213 | TEST=05 # -n, -c, -q, -l 214 | echo $TEST 215 | 216 | awk 'BEGIN{ for(i=1;i<10000;i++) print i }' >in out 219 | grep -v -q '^$.*$:\1$' out && echo ${TEST}A failed 220 | grep -c . out | check '90' ${TEST}B 221 | grep -l . out | check 'out' ${TEST}C 222 | grep -l . expect <out 230 | compare ${TEST}E 231 | 232 | grep -l . /dev/null in in /dev/null >out 233 | compare ${TEST}F 234 | 235 | grep -l -v . in in >out 236 | empty ${TEST}G 237 | 238 | grep -l -q . in in >out 239 | empty ${TEST}I 240 | 241 | grep -c . /dev/null | check '0' ${TEST}J 242 | 243 | cat >expect <out 249 | compare ${TEST}K 250 | 251 | #--------------------------------------------- 252 | TEST=06 # exit status, -s 253 | echo $TEST 254 | 255 | for q in '' -q 256 | do 257 | for opt in -e -c -l 258 | do 259 | 260 | grep $q $opt . /dev/null >/dev/null 261 | case $? in 262 | 0) echo test ${TEST}A$q$opt failed ;; 263 | 1) : ;; 264 | *) echo test ${TEST}B$q$opt failed 265 | esac 266 | 267 | echo x | grep $q $opt . >/dev/null 268 | case $? in 269 | 0) : ;; 270 | *) echo test ${TEST}C$q$opt failed 271 | esac 272 | 273 | grep $q $opt . nonexistent 2>/dev/null 274 | case $? in 275 | 0|1) echo test ${TEST}D$q$opt failed 276 | esac 277 | 278 | grep -s $q $opt . nonexistent 2>out 279 | case $? in 280 | 0|1) echo test ${TEST}E$q$opt failed 281 | esac 282 | empty ${TEST}F 283 | 284 | echo x >in 285 | grep -s $q $opt . in nonexistent 2>out >/dev/null 286 | case $? 
in 287 | 0) : ;; 288 | *) echo test ${TEST}G$q$opt failed 289 | esac 290 | empty ${TEST}H 291 | 292 | done 293 | done 294 | 295 | #--------------------------------------------- 296 | TEST=07 # -F, metacharacters, null scripts 297 | echo $TEST 298 | 299 | # make a file of one-char lines, omitting NUL(0) and newline(10) 300 | awk 'BEGIN { for(i=1; i<256; i++) if(i!=10) printf "%c\n", i }' >in /dev/null | check '3' ${TEST}A 305 | 306 | grep -c -F '' in 2>/dev/null | check '254' ${TEST}B 307 | grep -c -x -F '' in 2>/dev/null | check '0' ${TEST}C 308 | 309 | cat <pat 310 | 311 | x 312 | ! 313 | cat <expect 314 | in:1 315 | pat:2 316 | ! 317 | 318 | grep -c -F -f pat in | check '254' ${TEST}D 319 | grep -c -F -x -f pat in | check '1' ${TEST}E 320 | grep -c -F -x -f pat in pat >out 321 | compare ${TEST}F 322 | 323 | #--------------------------------------------- 324 | TEST=08 # -x 325 | echo $TEST 326 | 327 | cat <in 328 | a 329 | b 330 | ab 331 | ba 332 | ! 333 | 334 | cat <expect 335 | b 336 | ab 337 | ! 338 | 339 | grep -x -E 'a.|b' in >out 340 | compare ${TEST}A 341 | -------------------------------------------------------------------------------- /additional/zre.h: -------------------------------------------------------------------------------- 1 | /* Posix BRE/ERE recognizer. */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "regex.h" 8 | #include "array.h" 9 | 10 | #undef RE_DUP_MAX // posix puts this in limits.h! 
11 | enum { RE_DUP_MAX = -(INT_MIN/2)-1, // 2*RE_DUP_MAX won't overflow 12 | RE_DUP_INF = RE_DUP_MAX + 1, // infinity, for * 13 | BACK_REF_MAX = 9 14 | }; 15 | 16 | #ifndef REG_NULL 17 | #define REG_NULL 0 18 | #endif 19 | #ifndef REG_ANCH 20 | #define REG_ANCH 0 21 | #endif 22 | #ifndef REG_LITERAL 23 | #define REG_LITERAL 0 24 | #endif 25 | #ifndef REG_AUGMENTED 26 | #define REG_AUGMENTED 0 27 | #endif 28 | 29 | /* it is believed that the codes defined in regex.h are 30 | contiguous, but their order is not recalled */ 31 | 32 | enum { CFLAGS = REG_EXTENDED | REG_ICASE | REG_NOSUB | REG_NEWLINE, 33 | EFLAGS = REG_NOTBOL | REG_NOTEOL, 34 | GFLAGS = REG_NULL | REG_ANCH | REG_LITERAL | REG_AUGMENTED, 35 | ALLBIT0 = CFLAGS | EFLAGS | GFLAGS, 36 | NEWBIT1 = (ALLBIT0<<1) & ~ALLBIT0, 37 | NEWBIT2 = NEWBIT1 << 1, 38 | NEWBIT3 = NEWBIT2 << 1 39 | }; 40 | 41 | typedef unsigned char uchar; 42 | 43 | #define elementsof(x) (sizeof(x)/sizeof(x[0])) 44 | #define ustrlen(s) strlen((char*)(s)) 45 | #define ustrncmp(a,b,n) strncmp((char*)(a), (char*)(b), n) 46 | #define ustrchr(s,n) (uchar*)strchr((char*)(s), n) 47 | 48 | /* avoid dependence on the C++ library */ 49 | #ifdef DEBUG 50 | extern int Rexmalloc; 51 | void *operator new(size_t); 52 | void operator delete(void*); 53 | #define VIRTUAL virtual 54 | extern void flagprint(regex_t*); 55 | #else 56 | inline void *operator new(size_t size) { return malloc(size); } 57 | inline void operator delete(void *p) { free(p); } 58 | #define VIRTUAL 59 | #endif 60 | 61 | struct Seg; 62 | 63 | enum Type { 64 | OK, // null string, used internally 65 | ANCHOR, // initial ^ 66 | END, // final $ 67 | DOT, // . 68 | ONECHAR, // a single-character literal 69 | STRING, // some chars 70 | TRIE, // alternation of strings 71 | CLASS, // [...] 
72 | BACK, // \1, \2, etc 73 | SUBEXP, // $...$ 74 | ALT, // a|b 75 | CONJ, // a&b 76 | REP, // Kleene closure 77 | NEG, // negation 78 | KMP, // Knuth-Morris-Pratt 79 | KR, // modified Karp-Rabin 80 | DONE, // completed match, used internally 81 | TEMP // node kept on stack 82 | }; 83 | 84 | enum { // local environment flags 85 | SPACE = NEWBIT1, // out of space 86 | EASY = 0, // greedy match known to work 87 | HARD = NEWBIT2, // otherwise 88 | }; 89 | 90 | struct Eenv; // environment during regexec() 91 | struct Cenv; // environment during regcomp() 92 | struct Stat; // used during regcomp() 93 | 94 | /* Rex is a node in a regular expression; TEMP nodes live 95 | temporarily on the stack during recognition by regexec; 96 | their next pointer should not be followed by ~Rex() 97 | */ 98 | struct Rex { 99 | uchar type; // what flavor of Rex 100 | short serial; // subpattern number 101 | Rex *next; // following part of reg exp 102 | Rex(int type=TEMP) : type(type), next(0) { } 103 | virtual ~Rex(); 104 | virtual Stat stat(Cenv*); 105 | virtual int serialize(int n); 106 | virtual int parse(uchar*, Rex*, Eenv*); 107 | VIRTUAL void print(); 108 | int follow(uchar *s, Rex *cont, Eenv *env); 109 | protected: 110 | void dprint(char *, uchar *); 111 | }; 112 | 113 | struct Dup : Rex { // for all duplicated expressions 114 | int lo, hi; 115 | Dup(int lo, int hi, int typ) : 116 | Rex(typ), lo(lo), hi(hi) { } 117 | Stat stat(Cenv*); 118 | void print(); 119 | }; 120 | 121 | /* A segment is a string pointer and length. A length 122 | of -1 for a subexpression match means no match. 
123 | */ 124 | struct Seg { 125 | uchar *p; 126 | int n; 127 | Seg() { } 128 | Seg copy(); 129 | Seg(uchar *p, int n) : p(p), n(n) { } 130 | void next(int d=1) { p+=d; n-=d; } 131 | void prev(int d=1) { p-=d; n+=d; } 132 | }; 133 | 134 | // A set of ascii characters, represented as a bit string 135 | 136 | struct Set { 137 | uchar cl[(UCHAR_MAX+1)/CHAR_BIT]; 138 | Set() { memset(cl,0,sizeof cl); } 139 | void insert(uchar c); 140 | in(uchar c) { return (cl[c/CHAR_BIT]>>(c%CHAR_BIT)) & 1; } 141 | void or(Set*); 142 | void neg(); 143 | void clear(); 144 | }; 145 | 146 | /* various kinds of Rex. Each has a recognizer .parse and 147 | a debugging .print function 148 | 149 | A parsing function takes a Segment s, and a regular 150 | expression for the continuation of the parse. 151 | The environment has a match list in which are recorded the 152 | current strings for each referenceable subexpression */ 153 | 154 | struct Ok : Rex { 155 | Ok() : Rex(OK) { }; 156 | int parse(uchar *, Rex*,Eenv*); 157 | void print(); 158 | }; 159 | 160 | struct Anchor : Rex { 161 | Anchor() : Rex(ANCHOR) { } 162 | int parse(uchar*, Rex*,Eenv*); 163 | void print(); 164 | }; 165 | 166 | struct End : Rex { 167 | End() : Rex(END) { } 168 | int parse(uchar*, Rex*,Eenv*); 169 | void print(); 170 | }; 171 | 172 | struct Dot : Dup { 173 | Dot(int lo=1, int hi=1) : Dup(lo, hi, DOT) { } 174 | int parse(uchar*, Rex*, Eenv*); 175 | void print(); 176 | }; 177 | 178 | struct Class : Dup { 179 | Set cl; 180 | Class() : Dup(1,1,CLASS), cl() { } 181 | int parse(uchar *, Rex*,Eenv*); 182 | int in(int c) { return cl.in(c); } 183 | void or(Set*); 184 | void icase(uchar *map); 185 | void neg(int cflags); 186 | void print(); 187 | }; 188 | 189 | struct Onechar : Dup { 190 | uchar c; 191 | Onechar(int c, int lo=1, int hi=1) : 192 | Dup(lo,hi,ONECHAR), c(c) { } 193 | int parse(uchar*, Rex*, Eenv*); 194 | void print(); 195 | }; 196 | 197 | struct String : Rex { 198 | Seg seg; 199 | String(Seg seg, uchar *map = 0); 
200 | ~String() { delete(seg.p); } 201 | Stat stat(Cenv*); 202 | int parse(uchar *, Rex*, Eenv*); 203 | void print(); 204 | protected: 205 | String() { }; // for Kmp and Kr only 206 | }; 207 | 208 | struct Kmp : String { // for string first in pattern 209 | Array fail; 210 | Kmp(Seg seg, int*); // ICASE-mapped already 211 | parse(uchar*, Rex*, Eenv*); 212 | }; 213 | 214 | /* data structure for an alternation of pure strings 215 | son points to a subtree of all strings with a common 216 | prefix ending in character c. sib links alternate 217 | letters in the same position of a word. end=1 if 218 | some word ends with c. the order of strings is 219 | irrelevant, except long words must be investigated 220 | before short ones. The first level of trie is indexed 221 | into buckets. 222 | */ 223 | struct Trie : Rex { 224 | enum { MASK = UCHAR_MAX, NROOT = MASK+1 }; 225 | struct Tnode { 226 | uchar c; 227 | uchar end; 228 | Tnode *son; 229 | Tnode *sib; 230 | Tnode(uchar c) : c(c), end(0), son(0), sib(0) { } 231 | ~Tnode() { delete son; delete sib; } 232 | }; 233 | int min, max; // length of entry 234 | Tnode *root[NROOT]; // index of trie roots 235 | int insert(uchar*); 236 | Trie() : Rex(TRIE), min(INT_MAX), max(0) { 237 | memset(root, 0, sizeof(root)); } 238 | ~Trie() { for(int i=0; i&); 245 | }; 246 | 247 | struct Back : Rex { 248 | int n; 249 | Back(int n) : Rex(BACK), n(n) { } 250 | Stat stat(Cenv*); 251 | int parse(uchar *, Rex*,Eenv*); 252 | void print(); 253 | }; 254 | 255 | struct Subexp : Rex { 256 | short n; // subexpression number 257 | uchar used; // nonzero if backreferenced 258 | Rex *rex; // contents 259 | Subexp(int n, Rex *rex) 260 | : Rex(SUBEXP), n(n), rex(rex), used(0) { } 261 | ~Subexp() { delete(rex); } 262 | int serialize(int); 263 | Stat stat(Cenv*); 264 | int parse(uchar *, Rex*,Eenv*); 265 | void print(); 266 | }; 267 | 268 | struct Alt: Rex { 269 | int n1, n2; // same as in Rep 270 | Rex *left; 271 | Rex *right; 272 | int rserial; 273 | 
Alt(int n1, int n2, Rex* left, Rex *right) : 274 | Rex(ALT), n1(n1), n2(n2), left(left), right(right) { } 275 | ~Alt() { delete(left); delete(right); } 276 | int serialize(int); 277 | Stat stat(Cenv*); 278 | int parse(uchar *, Rex*,Eenv*); 279 | void print(); 280 | }; 281 | 282 | struct Conj: Rex { 283 | Rex *left; 284 | Rex *right; 285 | Conj(Rex *left, Rex *right) : 286 | Rex(CONJ), left(left), right(right) { } 287 | ~Conj() { delete left; delete right; } 288 | int serialize(int); 289 | Stat stat(Cenv*); 290 | int parse(uchar*, Rex*, Eenv*); 291 | void print(); 292 | }; 293 | 294 | struct Rep : Dup { 295 | int n1; // subexpression number, or 0 296 | int n2; // last contained subexpression number 297 | Rex *rex; 298 | Rep(int lo, int hi, int n1, int n2, Rex *rex) : 299 | Dup(lo,hi,REP), n1(n1), n2(n2), 300 | rex(rex) { } 301 | ~Rep() { delete rex; }; 302 | int serialize(int); 303 | Stat stat(Cenv*); 304 | int parse(uchar *, Rex*, Eenv*); 305 | int dorep(int, uchar *, Rex*, Eenv*); 306 | void print(); 307 | }; 308 | 309 | struct Neg : Rex { 310 | Rex *rex; 311 | Neg(Rex *rex) : Rex(NEG), rex(rex) { } 312 | ~Neg() { delete rex; } 313 | int serialize(int); 314 | Stat stat(Cenv*); 315 | int parse(uchar*, Rex*, Eenv*); 316 | void print(); 317 | }; 318 | 319 | /* Only one copy of class Done is ever needed, however 320 | that copy is initialized dynamically, not statically. 321 | in order to avoid dependence on C++ library, so the 322 | object code can be loaded by cc */ 323 | 324 | struct Done : Rex { 325 | static Rex *done; // pointer to the one copy 326 | int parse(uchar *, Rex*, Eenv*); 327 | void print() { } 328 | }; 329 | -------------------------------------------------------------------------------- /re.h: -------------------------------------------------------------------------------- 1 | /* Posix BRE/ERE recognizer. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "regex.h" 8 | #include "array.h" 9 | 10 | #undef RE_DUP_MAX // posix puts this in limits.h! 11 | enum { RE_DUP_MAX = -(INT_MIN/2)-1, // 2*RE_DUP_MAX won't overflow 12 | RE_DUP_INF = RE_DUP_MAX + 1, // infinity, for * 13 | BACK_REF_MAX = 9 14 | }; 15 | 16 | #ifndef REG_NULL 17 | #define REG_NULL 0 18 | #endif 19 | #ifndef REG_ANCH 20 | #define REG_ANCH 0 21 | #endif 22 | #ifndef REG_LITERAL 23 | #define REG_LITERAL 0 24 | #endif 25 | #ifndef REG_AUGMENTED 26 | #define REG_AUGMENTED 0 27 | #endif 28 | 29 | /* it is believed that the codes defined in regex.h are 30 | contiguous, but their order is not recalled */ 31 | 32 | enum { CFLAGS = REG_EXTENDED | REG_ICASE | REG_NOSUB | REG_NEWLINE, 33 | EFLAGS = REG_NOTBOL | REG_NOTEOL, 34 | GFLAGS = REG_NULL | REG_ANCH | REG_LITERAL | REG_AUGMENTED, 35 | ALLBIT0 = CFLAGS | EFLAGS | GFLAGS, 36 | NEWBIT1 = (ALLBIT0<<1) & ~ALLBIT0, 37 | NEWBIT2 = NEWBIT1 << 1, 38 | NEWBIT3 = NEWBIT2 << 1 39 | }; 40 | 41 | typedef unsigned char uchar; 42 | 43 | #define elementsof(x) (sizeof(x)/sizeof(x[0])) 44 | #define ustrlen(s) strlen((char*)(s)) 45 | #define ustrncmp(a,b,n) strncmp((char*)(a), (char*)(b), n) 46 | #define ustrchr(s,n) (uchar*)strchr((char*)(s), n) 47 | 48 | /* avoid dependence on the C++ library */ 49 | #ifdef DEBUG 50 | extern int Rexmalloc; 51 | void *operator new(size_t); 52 | void operator delete(void*); 53 | #define VIRTUAL virtual 54 | extern void flagprint(regex_t*); 55 | #else 56 | inline void *operator new(size_t size) { return malloc(size); } 57 | inline void operator delete(void *p) { free(p); } 58 | #define VIRTUAL 59 | #endif 60 | 61 | struct Seg; 62 | 63 | enum Type { 64 | OK, // null string, used internally 65 | ANCHOR, // initial ^ 66 | END, // final $ 67 | DOT, // . 68 | ONECHAR, // a single-character literal 69 | STRING, // some chars 70 | TRIE, // alternation of strings 71 | CLASS, // [...] 
72 | BACK, // \1, \2, etc 73 | SUBEXP, // $...$ 74 | ALT, // a|b 75 | CONJ, // a&b 76 | REP, // Kleene closure 77 | NEG, // negation 78 | KMP, // Knuth-Morris-Pratt 79 | KR, // modified Karp-Rabin 80 | DONE, // completed match, used internally 81 | TEMP // node kept on stack 82 | }; 83 | 84 | enum { // local environment flags 85 | SPACE = NEWBIT1, // out of space 86 | EASY = 0, // greedy match known to work 87 | HARD = NEWBIT2, // otherwise 88 | ONCE = NEWBIT3 // if 1st parse fails, quit 89 | }; 90 | 91 | struct Eenv; // environment during regexec() 92 | struct Cenv; // environment during regcomp() 93 | struct Stat; // used during regcomp() 94 | 95 | /* Rex is a node in a regular expression; TEMP nodes live 96 | temporarily on the stack during recognition by regexec; 97 | their next pointer should not be followed by ~Rex() 98 | */ 99 | struct Rex { 100 | uchar type; // what flavor of Rex 101 | short serial; // subpattern number 102 | Rex *next; // following part of reg exp 103 | Rex(int type=TEMP) : type(type), next(0) { } 104 | virtual ~Rex(); 105 | virtual Stat stat(Cenv*); 106 | virtual int serialize(int n); 107 | virtual int parse(uchar*, Rex*, Eenv*); 108 | VIRTUAL void print(); 109 | int follow(uchar *s, Rex *cont, Eenv *env); 110 | protected: 111 | void dprint(const char *, const uchar *); 112 | }; 113 | 114 | struct Dup : Rex { // for all duplicated expressions 115 | int lo, hi; 116 | Dup(int lo, int hi, int typ) : 117 | Rex(typ), lo(lo), hi(hi) { } 118 | Stat stat(Cenv*); 119 | void print(); 120 | }; 121 | 122 | /* A segment is a string pointer and length. A length 123 | of -1 for a subexpression match means no match. 
124 | */ 125 | struct Seg { 126 | uchar *p; 127 | int n; 128 | Seg() { } 129 | Seg copy(); 130 | Seg(uchar *p, int n) : p(p), n(n) { } 131 | void next(int d=1) { p+=d; n-=d; } 132 | void prev(int d=1) { p-=d; n+=d; } 133 | }; 134 | 135 | // A set of ascii characters, represented as a bit string 136 | 137 | struct Set { 138 | uchar cl[(UCHAR_MAX+1)/CHAR_BIT]; 139 | Set() { memset(cl,0,sizeof cl); } 140 | void insert(uchar c); 141 | int in(uchar c) { return (cl[c/CHAR_BIT]>>(c%CHAR_BIT)) & 1; } 142 | void orset(Set*); 143 | void neg(); 144 | void clear(); 145 | }; 146 | 147 | /* various kinds of Rex. Each has a recognizer .parse and 148 | a debugging .print function 149 | 150 | A parsing function takes a Segment s, and a regular 151 | expression for the continuation of the parse. 152 | The environment has a match list in which are recorded the 153 | current strings for each referenceable subexpression */ 154 | 155 | struct Ok : Rex { 156 | Ok() : Rex(OK) { }; 157 | int parse(uchar *, Rex*,Eenv*); 158 | void print(); 159 | }; 160 | 161 | struct Anchor : Rex { 162 | Anchor() : Rex(ANCHOR) { } 163 | int parse(uchar*, Rex*,Eenv*); 164 | void print(); 165 | }; 166 | 167 | struct End : Rex { 168 | End() : Rex(END) { } 169 | int parse(uchar*, Rex*,Eenv*); 170 | void print(); 171 | }; 172 | 173 | struct Dot : Dup { 174 | Dot(int lo=1, int hi=1) : Dup(lo, hi, DOT) { } 175 | int parse(uchar*, Rex*, Eenv*); 176 | void print(); 177 | }; 178 | 179 | struct Class : Dup { 180 | Set cl; 181 | Class() : Dup(1,1,CLASS), cl() { } 182 | int parse(uchar *, Rex*,Eenv*); 183 | int in(int c) { return cl.in(c); } 184 | void orset(Set*); 185 | void icase(uchar *map); 186 | void neg(int cflags); 187 | void print(); 188 | }; 189 | 190 | struct Onechar : Dup { 191 | uchar c; 192 | Onechar(int c, int lo=1, int hi=1) : 193 | Dup(lo,hi,ONECHAR), c(c) { } 194 | int parse(uchar*, Rex*, Eenv*); 195 | void print(); 196 | }; 197 | 198 | struct String : Rex { 199 | Seg seg; 200 | String(Seg seg, uchar 
*map = 0); 201 | ~String() { delete(seg.p); } 202 | Stat stat(Cenv*); 203 | int parse(uchar *, Rex*, Eenv*); 204 | void print(); 205 | protected: 206 | String() { }; // for Kmp and Kr only 207 | }; 208 | 209 | struct Kmp : String { // for string first in pattern 210 | Array fail; 211 | Kmp(Seg seg, int*); // ICASE-mapped already 212 | int parse(uchar*, Rex*, Eenv*); 213 | }; 214 | 215 | /* data structure for an alternation of pure strings 216 | son points to a subtree of all strings with a common 217 | prefix ending in character c. sib links alternate 218 | letters in the same position of a word. end=1 if 219 | some word ends with c. the order of strings is 220 | irrelevant, except long words must be investigated 221 | before short ones. The first level of trie is indexed 222 | into buckets. 223 | */ 224 | struct Trie : Rex { 225 | enum { MASK = UCHAR_MAX, NROOT = MASK+1 }; 226 | struct Tnode { 227 | uchar c; 228 | uchar end; 229 | Tnode *son; 230 | Tnode *sib; 231 | Tnode(uchar c) : c(c), end(0), son(0), sib(0) { } 232 | ~Tnode() { delete son; delete sib; } 233 | }; 234 | int min, max; // length of entry 235 | Tnode *root[NROOT]; // index of trie roots 236 | int insert(uchar*); 237 | Trie() : Rex(TRIE), min(INT_MAX), max(0) { 238 | memset(root, 0, sizeof(root)); } 239 | ~Trie() { for(int i=0; i&); 246 | }; 247 | 248 | struct Back : Rex { 249 | int n; 250 | Back(int n) : Rex(BACK), n(n) { } 251 | Stat stat(Cenv*); 252 | int parse(uchar *, Rex*,Eenv*); 253 | void print(); 254 | }; 255 | 256 | struct Subexp : Rex { 257 | short n; // subexpression number 258 | uchar used; // nonzero if backreferenced 259 | Rex *rex; // contents 260 | Subexp(int n, Rex *rex) 261 | : Rex(SUBEXP), n(n), rex(rex), used(0) { } 262 | ~Subexp() { delete(rex); } 263 | int serialize(int); 264 | Stat stat(Cenv*); 265 | int parse(uchar *, Rex*,Eenv*); 266 | void print(); 267 | }; 268 | 269 | struct Alt: Rex { 270 | int n1, n2; // same as in Rep 271 | Rex *left; 272 | Rex *right; 273 | int 
rserial; 274 | Alt(int n1, int n2, Rex* left, Rex *right) : 275 | Rex(ALT), n1(n1), n2(n2), left(left), right(right) { } 276 | ~Alt() { delete(left); delete(right); } 277 | int serialize(int); 278 | Stat stat(Cenv*); 279 | int parse(uchar *, Rex*,Eenv*); 280 | void print(); 281 | }; 282 | 283 | struct Conj: Rex { 284 | Rex *left; 285 | Rex *right; 286 | Conj(Rex *left, Rex *right) : 287 | Rex(CONJ), left(left), right(right) { } 288 | ~Conj() { delete left; delete right; } 289 | int serialize(int); 290 | Stat stat(Cenv*); 291 | int parse(uchar*, Rex*, Eenv*); 292 | void print(); 293 | }; 294 | 295 | struct Rep : Dup { 296 | int n1; // subexpression number, or 0 297 | int n2; // last contained subexpression number 298 | Rex *rex; 299 | Rep(int lo, int hi, int n1, int n2, Rex *rex) : 300 | Dup(lo,hi,REP), n1(n1), n2(n2), 301 | rex(rex) { } 302 | ~Rep() { delete rex; }; 303 | int serialize(int); 304 | Stat stat(Cenv*); 305 | int parse(uchar *, Rex*, Eenv*); 306 | int dorep(int, uchar *, Rex*, Eenv*); 307 | void print(); 308 | }; 309 | 310 | struct Neg : Rex { 311 | Rex *rex; 312 | Neg(Rex *rex) : Rex(NEG), rex(rex) { } 313 | ~Neg() { delete rex; } 314 | int serialize(int); 315 | Stat stat(Cenv*); 316 | int parse(uchar*, Rex*, Eenv*); 317 | void print(); 318 | }; 319 | 320 | /* Only one copy of class Done is ever needed, however 321 | that copy is initialized dynamically, not statically. 322 | in order to avoid dependence on C++ library, so the 323 | object code can be loaded by cc */ 324 | 325 | struct Done : Rex { 326 | static Rex *done; // pointer to the one copy 327 | int parse(uchar *, Rex*, Eenv*); 328 | void print() { } 329 | }; 330 | -------------------------------------------------------------------------------- /additional/nre.h: -------------------------------------------------------------------------------- 1 | /* Posix BRE/ERE recognizer. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "nregex.h" 8 | #include "array.h" 9 | 10 | #undef RE_DUP_MAX // posix puts this in limits.h! 11 | enum { RE_DUP_MAX = -(INT_MIN/2)-1, // 2*RE_DUP_MAX won't overflow 12 | RE_DUP_INF = RE_DUP_MAX + 1, // infinity, for * 13 | BACK_REF_MAX = 9 14 | }; 15 | 16 | #ifndef REG_NULL 17 | #define REG_NULL 0 18 | #endif 19 | #ifndef REG_ANCH 20 | #define REG_ANCH 0 21 | #endif 22 | #ifndef REG_LITERAL 23 | #define REG_LITERAL 0 24 | #endif 25 | #ifndef REG_AUGMENTED 26 | #define REG_AUGMENTED 0 27 | #endif 28 | 29 | /* it is believed that the codes defined in regex.h are 30 | contiguous, but their order is not recalled */ 31 | 32 | enum { CFLAGS = REG_EXTENDED | REG_ICASE | REG_NOSUB | REG_NEWLINE, 33 | EFLAGS = REG_NOTBOL | REG_NOTEOL, 34 | GFLAGS = REG_NULL | REG_ANCH | REG_LITERAL | REG_AUGMENTED | REG_WHICH, 35 | ALLBIT0 = CFLAGS | EFLAGS | GFLAGS, 36 | NEWBIT1 = (ALLBIT0<<1) & ~ALLBIT0, 37 | NEWBIT2 = NEWBIT1 << 1, 38 | NEWBIT3 = NEWBIT2 << 1 39 | }; 40 | 41 | typedef unsigned char uchar; 42 | 43 | #define elementsof(x) (sizeof(x)/sizeof(x[0])) 44 | #define ustrlen(s) strlen((char*)(s)) 45 | #define ustrncmp(a,b,n) strncmp((char*)(a), (char*)(b), n) 46 | #define ustrchr(s,n) (uchar*)strchr((char*)(s), n) 47 | 48 | /* avoid dependence on the C++ library */ 49 | #ifdef DEBUG 50 | extern int Rexmalloc; 51 | void *operator new(size_t); 52 | void operator delete(void*); 53 | #define VIRTUAL virtual 54 | extern void flagprint(regex_t*); 55 | #else 56 | inline void *operator new(size_t size) { return malloc(size); } 57 | inline void operator delete(void *p) { free(p); } 58 | #define VIRTUAL 59 | #endif 60 | 61 | struct Seg; 62 | 63 | enum Type { 64 | OK, // null string, used internally 65 | ANCHOR, // initial ^ 66 | END, // final $ 67 | DOT, // . 68 | ONECHAR, // a single-character literal 69 | STRING, // some chars 70 | TRIE, // alternation of strings 71 | CLASS, // [...] 
72 | BACK, // \1, \2, etc 73 | SUBEXP, // $...$ 74 | ALT, // a|b 75 | CONJ, // a&b 76 | REP, // Kleene closure 77 | NEG, // negation 78 | KMP, // Knuth-Morris-Pratt 79 | KR, // modified Karp-Rabin 80 | DONE, // completed match, used internally 81 | TEMP // node kept on stack 82 | }; 83 | 84 | enum { // local environment flags 85 | SPACE = NEWBIT1, // out of space 86 | EASY = 0, // greedy match known to work 87 | HARD = NEWBIT2, // otherwise 88 | ONCE = NEWBIT3 // if 1st parse fails, quit 89 | }; 90 | 91 | struct Eenv; // environment during regexec() 92 | struct Cenv; // environment during regcomp() 93 | struct Stat; // used during regcomp() 94 | 95 | /* Rex is a node in a regular expression; TEMP nodes live 96 | temporarily on the stack during recognition by regexec; 97 | their next pointer should not be followed by ~Rex() 98 | */ 99 | struct Rex { 100 | uchar type; // what flavor of Rex 101 | uchar prog; // progress mark 102 | short serial; // subpattern number 103 | Rex *next; // following part of reg exp 104 | Rex(int type=TEMP) : type(type), next(0), prog(0) { } 105 | virtual ~Rex(); 106 | virtual Stat stat(Cenv*); 107 | virtual int serialize(int n); 108 | virtual int parse(uchar*, int, Rex*, Eenv*); 109 | VIRTUAL void print(); 110 | int follow(uchar *s, int, Rex *cont, Eenv *env); 111 | protected: 112 | void dprint(char *, uchar *); 113 | }; 114 | 115 | struct Dup : Rex { // for all duplicated expressions 116 | int lo, hi; 117 | Dup(int lo, int hi, int typ) : 118 | Rex(typ), lo(lo), hi(hi) { } 119 | Stat stat(Cenv*); 120 | void print(); 121 | }; 122 | 123 | /* A segment is a string pointer and length. A length 124 | of -1 for a subexpression match means no match. 
125 | */ 126 | struct Seg { 127 | uchar *p; 128 | int n; 129 | Seg() { } 130 | Seg copy(); 131 | Seg(uchar *p, int n) : p(p), n(n) { } 132 | void next(int d=1) { p+=d; n-=d; } 133 | void prev(int d=1) { p-=d; n+=d; } 134 | }; 135 | 136 | // A set of ascii characters, represented as a bit string 137 | 138 | struct Set { 139 | uchar cl[(UCHAR_MAX+1)/CHAR_BIT]; 140 | Set() { memset(cl,0,sizeof cl); } 141 | void insert(uchar c); 142 | in(uchar c) { return (cl[c/CHAR_BIT]>>(c%CHAR_BIT)) & 1; } 143 | void or(Set*); 144 | void neg(); 145 | void clear(); 146 | }; 147 | 148 | /* various kinds of Rex. Each has a recognizer .parse and 149 | a debugging .print function 150 | 151 | A parsing function takes a Segment s, and a regular 152 | expression for the continuation of the parse. 153 | The environment has a match list in which are recorded the 154 | current strings for each referenceable subexpression */ 155 | 156 | struct Ok : Rex { 157 | Ok() : Rex(OK) { }; 158 | int parse(uchar *, int, Rex*,Eenv*); 159 | void print(); 160 | }; 161 | 162 | struct Anchor : Rex { 163 | Anchor() : Rex(ANCHOR) { } 164 | int parse(uchar*, int, Rex*,Eenv*); 165 | void print(); 166 | }; 167 | 168 | struct End : Rex { 169 | End() : Rex(END) { } 170 | int parse(uchar*, int, Rex*,Eenv*); 171 | void print(); 172 | }; 173 | 174 | struct Dot : Dup { 175 | Dot(int lo=1, int hi=1) : Dup(lo, hi, DOT) { } 176 | int parse(uchar*, int, Rex*, Eenv*); 177 | void print(); 178 | }; 179 | 180 | struct Class : Dup { 181 | Set cl; 182 | Class() : Dup(1,1,CLASS), cl() { } 183 | int parse(uchar *, int, Rex*,Eenv*); 184 | int in(int c) { return cl.in(c); } 185 | void or(Set*); 186 | void icase(uchar *map); 187 | void neg(int cflags); 188 | void print(); 189 | }; 190 | 191 | struct Onechar : Dup { 192 | uchar c; 193 | Onechar(int c, int lo=1, int hi=1) : 194 | Dup(lo,hi,ONECHAR), c(c) { } 195 | int parse(uchar*, int, Rex*, Eenv*); 196 | void print(); 197 | }; 198 | 199 | struct String : Rex { 200 | Seg seg; 201 | 
String(Seg seg, uchar *map = 0); 202 | ~String() { delete(seg.p); } 203 | Stat stat(Cenv*); 204 | int parse(uchar *, int, Rex*, Eenv*); 205 | void print(); 206 | protected: 207 | String() { }; // for Kmp and Kr only 208 | }; 209 | 210 | struct Kmp : String { // for string first in pattern 211 | Array fail; 212 | Kmp(Seg seg, int*); // ICASE-mapped already 213 | parse(uchar*, int, Rex*, Eenv*); 214 | }; 215 | 216 | /* data structure for an alternation of pure strings 217 | son points to a subtree of all strings with a common 218 | prefix ending in character c. sib links alternate 219 | letters in the same position of a word. end=1 if 220 | some word ends with c. the order of strings is 221 | irrelevant, except long words must be investigated 222 | before short ones. The first level of trie is indexed 223 | into buckets. 224 | */ 225 | struct Trie : Rex { 226 | enum { MASK = UCHAR_MAX, NROOT = MASK+1 }; 227 | struct Tnode { 228 | uchar c; 229 | uchar end; 230 | uchar prog; 231 | Tnode *son; 232 | Tnode *sib; 233 | Tnode(uchar c) : c(c), end(0), prog(0), son(0), sib(0) { } 234 | ~Tnode() { delete son; delete sib; } 235 | }; 236 | int min, max; // length of entry 237 | Tnode *root[NROOT]; // index of trie roots 238 | int insert(uchar*, int); 239 | Trie() : Rex(TRIE), min(INT_MAX), max(0) { 240 | memset(root, 0, sizeof(root)); } 241 | ~Trie() { for(int i=0; i&); 248 | }; 249 | 250 | struct Back : Rex { 251 | int n; 252 | Back(int n) : Rex(BACK), n(n) { } 253 | Stat stat(Cenv*); 254 | int parse(uchar *, int, Rex*,Eenv*); 255 | void print(); 256 | }; 257 | 258 | struct Subexp : Rex { 259 | short n; // subexpression number 260 | uchar used; // nonzero if backreferenced 261 | Rex *rex; // contents 262 | Subexp(int n, Rex *rex) 263 | : Rex(SUBEXP), n(n), rex(rex), used(0) { } 264 | ~Subexp() { delete(rex); } 265 | int serialize(int); 266 | Stat stat(Cenv*); 267 | int parse(uchar *, int, Rex*,Eenv*); 268 | void print(); 269 | }; 270 | 271 | struct Alt: Rex { 272 | int n1, n2; 
// same as in Rep 273 | Rex *left; 274 | Rex *right; 275 | int rserial; 276 | Alt(int n1, int n2, Rex* left, Rex *right) : 277 | Rex(ALT), n1(n1), n2(n2), left(left), right(right) { } 278 | ~Alt() { delete(left); delete(right); } 279 | int serialize(int); 280 | Stat stat(Cenv*); 281 | int parse(uchar *, int, Rex*,Eenv*); 282 | void print(); 283 | }; 284 | 285 | struct Conj: Rex { 286 | Rex *left; 287 | Rex *right; 288 | Conj(Rex *left, Rex *right) : 289 | Rex(CONJ), left(left), right(right) { } 290 | ~Conj() { delete left; delete right; } 291 | int serialize(int); 292 | Stat stat(Cenv*); 293 | int parse(uchar*, int, Rex*, Eenv*); 294 | void print(); 295 | }; 296 | 297 | struct Rep : Dup { 298 | int n1; // subexpression number, or 0 299 | int n2; // last contained subexpression number 300 | Rex *rex; 301 | Rep(int lo, int hi, int n1, int n2, Rex *rex) : 302 | Dup(lo,hi,REP), n1(n1), n2(n2), 303 | rex(rex) { } 304 | ~Rep() { delete rex; }; 305 | int serialize(int); 306 | Stat stat(Cenv*); 307 | int parse(uchar *, int, Rex*, Eenv*); 308 | int dorep(int, uchar *, int, Rex*, Eenv*); 309 | void print(); 310 | }; 311 | 312 | struct Neg : Rex { 313 | Rex *rex; 314 | Neg(Rex *rex) : Rex(NEG), rex(rex) { } 315 | ~Neg() { delete rex; } 316 | int serialize(int); 317 | Stat stat(Cenv*); 318 | int parse(uchar*, int, Rex*, Eenv*); 319 | void print(); 320 | }; 321 | 322 | /* Only one copy of class Done is ever needed, however 323 | that copy is initialized dynamically, not statically. 
324 | in order to avoid dependence on C++ library, so the 325 | object code can be loaded by cc */ 326 | 327 | struct Done : Rex { 328 | static Rex *done; // pointer to the one copy 329 | int parse(uchar *, int, Rex*, Eenv*); 330 | void print() { } 331 | }; 332 | -------------------------------------------------------------------------------- /sed.1: -------------------------------------------------------------------------------- 1 | .TH SED 1 2 | .CT 1 files editor 3 | .SH NAME 4 | sed \- stream editor 5 | .SH SYNOPSIS 6 | .B sed 7 | [ 8 | .B -nb 9 | ] 10 | .I script 11 | [ 12 | .I file ... 13 | ] 14 | .PP 15 | .B sed 16 | [ 17 | option ... 18 | ] [ 19 | .I file ... 20 | ] 21 | .SH DESCRIPTION 22 | .I Sed 23 | copies the named 24 | .I files 25 | (standard input default) to the standard output, 26 | edited according to a 27 | .I script 28 | of commands. 29 | The script may be accumulated from 30 | one or more 31 | .B -e 32 | and 33 | .B -f 34 | options. 35 | The options are 36 | .TP 37 | .BI -e " ascript 38 | Append 39 | .I ascript 40 | to the script. 41 | .TP 42 | .BI -f " sfile 43 | Append 44 | .I sfile 45 | to the script. 46 | .TP 47 | .B -n 48 | Suppress default output, giving output only as directed by 49 | .BR p 50 | in the script. 51 | .TP 52 | .B -b 53 | Strip leading blanks from 54 | .I text 55 | in commands (nonstandard option for interpreting some old scripts). 56 | .PP 57 | A script consists of commands, one per line (with semicolon 58 | equivalent to newline, a common but nonstandard convention). 59 | The form of a command is 60 | .IP 61 | [\fIaddress\fR [\f5,\fI address\fR] ] \fIfunction\fR [\fIargument\fR ...] 
62 | .PP 63 | In normal operation 64 | .I sed 65 | cyclically copies a line of input into a 66 | .I pattern space 67 | (unless there is something left after a 68 | .L D 69 | command), 70 | applies in sequence 71 | all commands whose 72 | .I addresses 73 | select that pattern space, 74 | and at the end of the script copies the pattern space 75 | to the standard output (except under 76 | .BR -n ) 77 | and deletes the pattern space. 78 | .PP 79 | An 80 | .I address 81 | is either a decimal number that counts 82 | input lines cumulatively across files, a 83 | .L $ 84 | that 85 | addresses the last line of input, or a context address, 86 | .BI / regular-expression / , 87 | as in 88 | .IR ed (1) 89 | with the added conventions that 90 | .IP 91 | .L \en 92 | matches an embedded newline and 93 | .BI \e c 94 | for any character 95 | .I c 96 | matches 97 | .I c. 98 | .IP 99 | .BI \e "c ... c", 100 | for any character 101 | .I c, 102 | can play the delimiting role of 103 | .BR / ... / . 104 | .PP 105 | A command line with no addresses selects every pattern space. 106 | .PP 107 | A command line with 108 | one address selects each pattern space that matches the address. 109 | (Address 110 | .L 0 111 | is never matched.) 112 | .PP 113 | A command line with 114 | two addresses selects the inclusive range from the first 115 | pattern space that matches the first address through 116 | the next pattern space that matches 117 | the second, or the end of input. 118 | (If the second address is a number less than or equal 119 | to the line number first selected, only one 120 | line is selected.) 121 | Thereafter the process is repeated, looking again for the 122 | first address. 123 | .PP 124 | The negation function 125 | .L ! 126 | complements the set of selected pattern spaces. 127 | .PP 128 | In the following list of functions the 129 | maximum number of permissible addresses 130 | for each function is indicated in parentheses. 
131 | .PP 132 | A 133 | .I text 134 | argument consists of one or more lines, 135 | all but the last of which end with 136 | .L \e 137 | to hide the 138 | newline. 139 | Backslashes in text are treated like backslashes 140 | in the replacement string of an 141 | .L s 142 | command, 143 | and may be used to protect initial blanks and tabs 144 | against the stripping that is done on 145 | every script line. 146 | .PP 147 | An 148 | .I rfile 149 | or 150 | .I wfile 151 | argument must terminate the command 152 | line and must be preceded by exactly one blank. 153 | Each 154 | .I wfile 155 | is created before processing begins. 156 | .TP 157 | .RB (1) \|a\e 158 | .br 159 | .ns 160 | .TP 161 | .I text 162 | Append. 163 | Place 164 | .I text 165 | on the output before 166 | reading the next input line. 167 | .HP 168 | .RB (2) \|b 169 | .I label 170 | .br 171 | Branch to the 172 | .B : 173 | command bearing the 174 | .I label. 175 | If 176 | .I label 177 | is empty, branch to the end of the script. 178 | .TP 179 | .RB (2) \|c\e 180 | .br 181 | .ns 182 | .TP 183 | .I text 184 | Change. 185 | Delete the pattern space. 186 | With 0 or 1 address or at the end of a 2-address range, place 187 | .I text 188 | on the output. 189 | Start the next cycle. 190 | .TP 191 | .RB (2) \|d 192 | Delete the pattern space. 193 | Start the next cycle. 194 | .TP 195 | .RB (2) \|D 196 | Delete the initial segment of the 197 | pattern space through the first newline. 198 | Start the next cycle. 199 | .TP 200 | .RB (2) \|g 201 | Replace the contents of the pattern space 202 | by the contents of the hold space. 203 | .TP 204 | .RB (2) \|G 205 | Append the contents of the hold space to the pattern space. 206 | .TP 207 | .RB (2) \|h 208 | Replace the contents of the hold space by the contents of the pattern space. 209 | .TP 210 | .RB (2) \|H 211 | Append the contents of the pattern space to the hold space. 212 | .TP 213 | .RB (1) \|i\e 214 | .br 215 | .ns 216 | .TP 217 | .I text 218 | Insert. 
219 | Place 220 | .I text 221 | on the standard output. 222 | .TP 223 | .RB (2) \|l 224 | Literal. 225 | Place an unambiguous image of the pattern 226 | space on the standard output, 227 | using C escape sequences. 228 | Break long lines, indicating the breakpoint by 229 | a single backslash. 230 | Append 231 | .B \en 232 | if pattern space ends with space or newline. 233 | .TP 234 | .RB (2) \|n 235 | Copy the pattern space to the standard output. 236 | Replace the pattern space with the next line of input. 237 | .TP 238 | .RB (2) \|N 239 | Append the next line of input to the pattern space 240 | with an embedded newline. 241 | (The current line number changes.) 242 | .TP 243 | .RB (2) \|p 244 | Print. 245 | Copy the pattern space to the standard output. 246 | .TP 247 | .RB (2) \|P 248 | Copy the initial segment of the pattern space through 249 | the first newline to the standard output. 250 | .TP 251 | .RB (1) \|q 252 | Quit. 253 | Branch to the end of the script. 254 | Do not start a new cycle. 255 | .HP 256 | .RB (2) \|r 257 | .I rfile 258 | .br 259 | Read the contents of 260 | .IR rfile . 261 | Place them on the output before reading 262 | the next input line. 263 | .TP 264 | .RB (2) \|s/\fIregular-expression\fP/\fIreplacement\fP/\fIflags 265 | Substitute the 266 | .I replacement 267 | string for instances of the 268 | .I regular-expression 269 | in the pattern space. 270 | Any character may be used as a delimiter instead of 271 | .LR / . 272 | In the replacement text, 273 | .L & 274 | stands for a copy of the matched part 275 | of the input text; 276 | .BI \e n, 277 | where 278 | .I n 279 | is a digit (1-9) 280 | stands for a copy of the last match for the 281 | .IR n th 282 | parenthesized (with 283 | .BR \e( ... \e) ) 284 | subexpression; and 285 | .L \e 286 | quotes the following nondigit (even a newline). 
287 | .I Flags 288 | is zero or more of 289 | .RS 290 | .TP 291 | .I n 292 | Substitute for the 293 | .IR n th 294 | occurrence of the regular expression; 295 | .IR n =1 296 | by default. 297 | .TP 298 | .B g 299 | Global. 300 | Substitute for all non-overlapping instances of the 301 | regular expression. 302 | .TP 303 | .B p 304 | Print the pattern space if a substitution was made. 305 | .TP 306 | .BI w " wfile" 307 | Write. 308 | Append the pattern space to 309 | .I wfile 310 | if a substitution 311 | was made. 312 | .RE 313 | .HP 314 | .RB (2) \|t 315 | .I label 316 | .br 317 | Test. 318 | Branch to the 319 | .L : 320 | command bearing the 321 | .I label 322 | if any 323 | substitutions have been made since the most recent 324 | reading of an input line or execution of a 325 | .LR t . 326 | If 327 | .I label 328 | is empty, branch to the end of the script. 329 | .TP 330 | .RB (2) \|w 331 | .I wfile 332 | .br 333 | Write. 334 | Append the pattern space to 335 | .I wfile. 336 | .TP 337 | .RB (2) \|x 338 | Exchange the contents of the pattern and hold spaces. 339 | .TP 340 | .RB (2) \|y/\fIstring1\fP/\fIstring2\fP/ 341 | Transform. 342 | Replace all occurrences of characters in 343 | .I string1 344 | with the corresponding character in 345 | .I string2. 346 | The lengths of 347 | .I 348 | string1 349 | and 350 | .I string2 351 | must be equal. 352 | .HP 353 | .RB (2) ! 354 | .I function 355 | .br 356 | Don't. 357 | Apply the 358 | .I function 359 | (or group, if 360 | .I function 361 | is 362 | .LR { ) 363 | only to lines 364 | .I not 365 | selected by the address(es). 366 | .HP 367 | .RB (0) \|: 368 | .I label 369 | .br 370 | This command does nothing; it bears a 371 | .I label 372 | for 373 | .B b 374 | and 375 | .B t 376 | commands to branch to. 377 | .TP 378 | .RB (1) \|= 379 | Place the current line number on the standard output as a line. 
380 | .TP 381 | .RB (2) \|{ 382 | Execute the following commands through a matching 383 | .L } 384 | function only when the pattern space is selected. 385 | .TP 386 | .RB (0) \| 387 | Ignore this empty line. 388 | .TP 389 | .RB (0) \|# 390 | Comment. Ignore this line (and any semicolons in it). 391 | .ne 4 392 | .SH EXAMPLES 393 | .TP 394 | .B sed 10q file 395 | Print the first 10 lines of the file. 396 | .TP 397 | .B sed '/^$/d' 398 | Delete empty lines from standard input. 399 | .TP 400 | .B sed 's/UNIX/& system/g' 401 | Replace every instance of 402 | .L UNIX 403 | by 404 | .LR "UNIX system" . 405 | .PP 406 | .EX 407 | sed 's/ *$// \fRdrop trailing blanks\fP 408 | /^$/d \fRdrop empty lines\fP 409 | s/ */\e \fRreplace blanks by newlines\fP 410 | /g 411 | /^$/d' chapter* 412 | .EE 413 | .ns 414 | .IP 415 | Print the files 416 | .BR chapter1 , 417 | .BR chapter2 , 418 | etc. one word to a line. 419 | .PP 420 | .EX 421 | nroff -ms manuscript | sed ' 422 | ${ 423 | /^$/p \fRif last line of file is empty, print it\fP 424 | } 425 | //N \fRif current line is empty, append next line\fP 426 | /^\en$/D' \fRif two lines are empty, delete the first\fP 427 | .EE 428 | .ns 429 | .IP 430 | Delete all but one of each group of empty lines from a 431 | formatted manuscript. 432 | .PP 433 | .EX 434 | ls /usr/* | sed ' 435 | /^$/d \fRdelete empty lines\fP 436 | /^[/].*:$/{ \fRlook for lines like \fP/usr/lem: 437 | s/:$/\e// \fRreplace \fP:\fR by \fP/ 438 | h \fRhold directory name\fP 439 | d \fRdon't print; get next line\fP 440 | } 441 | G \fRappend held directory name\fP 442 | s/\e(.*\e)\en\e(.*\e)/\e2\e1/' \fRexchange file and directory\fP 443 | .EE 444 | .ns 445 | .IP 446 | List all files in user directories, as 447 | .B ls -d /usr/*/* 448 | would do if it didn't cause argument list overflow. 
449 | .SH SEE ALSO 450 | .IR ed (1), 451 | .IR gre (1), 452 | .IR awk (1), 453 | .IR lex (1), 454 | .IR cut (1), 455 | .IR split (1), 456 | .IR sam (9.1) 457 | .SH BUGS 458 | If input is from a pipe, buffering may consume 459 | characters beyond a line on which a 460 | .L q 461 | command is executed. 462 | -------------------------------------------------------------------------------- /sed2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "sed.h" 6 | 7 | #define ustrchr(p, c) (uchar*)strchr((char*)(p), c) 8 | 9 | int selected(uchar*, Text*); 10 | 11 | #define Re Ie 12 | #define Ce Ie 13 | #define Se Ie 14 | #define re ae 15 | 16 | /* execution functions return pointer to next instruction */ 17 | 18 | typedef uchar *exef(Text*, uchar *, Text*); 19 | exef Ce, Se, Ee; /* colon, semicolon, equal */ 20 | exef Le, Re; /* left {, right { */ 21 | exef Ie, vv; /* ignore, error */ 22 | exef De, Ge, He, Ne, Pe; 23 | exef ae, be, ce, de, ge, he, ie, le, ne; 24 | exef pe, qe, re, se, te, we, xe, ye; 25 | 26 | static exef *excom[128] = { 27 | vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,Ie,vv,vv,vv,vv,vv, 28 | vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv, 29 | vv,vv,vv,Ie,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv, /* # */ 30 | vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,Ce,Se,vv,Ee,vv,vv, /* :;= */ 31 | vv,vv,vv,vv,De,vv,vv,Ge,He,vv,vv,vv,vv,vv,Ne,vv, /* DGHN */ 32 | Pe,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv,vv, /* P */ 33 | vv,ae,be,ce,de,vv,vv,ge,he,ie,vv,vv,le,vv,ne,vv, /* a-n */ 34 | pe,qe,re,se,te,vv,vv,we,xe,ye,vv,Le,vv,Re,vv,vv /* p-y{} */ 35 | }; 36 | 37 | #define IBUG "interpreter bug %d" 38 | const char *stdouterr = "writing standard output"; 39 | 40 | Text hold; 41 | 42 | void 43 | cputchar(int c) 44 | { 45 | if(putchar(c) == EOF) 46 | quit(stdouterr); 47 | } 48 | 49 | void 50 | writeline(Text *data) 51 | { 52 | int n = data->w - data->s; 53 | if(fwrite(data->s, 1, n, stdout) != n) 54 | 
quit(stdouterr); 55 | cputchar('\n'); 56 | } 57 | 58 | void 59 | execute(Text *script, Text *data) 60 | { 61 | uchar *pc; 62 | int sel; 63 | for(pc = script->s; pc < script->w; ) { 64 | sel = selected(pc, data); 65 | if(sel) { 66 | int cmd = code(*instr(pc)); 67 | if(sel==2 && cmd=='c') 68 | cmd = 'd'; 69 | pc = excom[cmd](script, pc, data); 70 | if(pc == 0) 71 | return; 72 | } else 73 | pc = nexti(pc); 74 | } 75 | if(!nflag) 76 | writeline(data); 77 | } 78 | 79 | /* return 1 if action is to be taken on current line, 80 | -1 if (numeric) address has been passed, 81 | 0 otherwise*/ 82 | int 83 | sel1(int addr, Text *data) 84 | { 85 | if(addr & REGADR) 86 | return regexec(readdr(addr),(char*)data->s,0,0,0) == 0; 87 | if(addr == recno) 88 | return 1; 89 | if(addr == DOLLAR) 90 | return ateof(); 91 | if(addr < recno) 92 | return -1; 93 | return 0; 94 | } 95 | 96 | /* return 2 on non-final line of a selected range, 97 | 1 on any other selected line, 98 | 0 on non-selected lines 99 | (the 1-2 distinction matters only for 'c' commands) */ 100 | 101 | int 102 | selected(uchar *pc, Text *data) 103 | { 104 | int active; 105 | int *ipc = (int*)pc; /* points to address words */ 106 | int *q = instr(pc); /* points to instruction word */ 107 | int neg = !!(*q & NEG); 108 | switch(q - ipc) { 109 | case 0: /* 0 address */ 110 | return !neg; 111 | case 1: /* 1 address */ 112 | return neg ^ sel1(ipc[0], data)==1; 113 | case 2: 114 | quit(IBUG,1); 115 | case 3: /* 2 address */ 116 | q--; /* points to activity indicator */ 117 | active = !(*q & INACT); 118 | if((*q&AMASK) < recno) { 119 | switch(sel1(ipc[active], data)) { 120 | case 0: 121 | if((active&ateof()) == 0) 122 | break; 123 | case 1: 124 | *q = recno; 125 | if(active) 126 | *q |= INACT; 127 | return (neg^1) << (!active&!ateof()); 128 | case -1: 129 | if(active) { 130 | *q = recno | INACT; 131 | return neg; 132 | } 133 | } 134 | } 135 | return (neg^active) << 1; 136 | default: 137 | quit(IBUG,2); 138 | return 0; /* dummy */ 
139 | } 140 | } 141 | 142 | void 143 | vacate(Text *t) 144 | { 145 | assure(t, 1); 146 | t->w = t->s; 147 | *t->w = 0; 148 | } 149 | 150 | void 151 | tcopy(Text *from, Text *to) 152 | { 153 | int n = from->w - from->s; 154 | assure(to, n+1); 155 | memmove(to->w, from->s, n); 156 | to->w += n; 157 | *to->w = 0; 158 | } 159 | 160 | 161 | /* EASY COMMANDS */ 162 | 163 | uchar * 164 | vv(Text *script, uchar *pc, Text *data) 165 | { 166 | script = script; 167 | pc = pc; 168 | data = data; 169 | quit(IBUG,3); 170 | return 0; /* dummy */ 171 | } 172 | 173 | uchar * 174 | be(Text *script, uchar *pc, Text *data) 175 | { 176 | script = script; 177 | data = data; 178 | return script->s + instr(pc)[1]; 179 | } 180 | 181 | uchar * 182 | De(Text *script, uchar *pc, Text *data) 183 | { 184 | int n; 185 | uchar *end = (uchar*)ustrchr(data->s, '\n'); 186 | if(end == 0) 187 | return de(script, pc, data); 188 | end++; 189 | n = data->w - end; 190 | memmove(data->s, end, n+1); 191 | data->w = data->s + n; 192 | return script->s; 193 | } 194 | 195 | uchar * 196 | de(Text *script, uchar *pc, Text *data) 197 | { 198 | pc = pc; 199 | vacate(data); 200 | return 0; 201 | } 202 | 203 | uchar * 204 | Ee(Text *script, uchar *pc, Text *data) 205 | { 206 | script = script; 207 | data = data; 208 | if(printf("%d\n", recno) <= 0) 209 | quit(stdouterr); 210 | return nexti(pc); 211 | } 212 | 213 | uchar * 214 | Ge(Text *script, uchar *pc, Text *data) 215 | { 216 | script = script; 217 | if(hold.s == 0) 218 | vacate(&hold); 219 | if(data->w > data->s) 220 | *data->w++ = '\n'; 221 | tcopy(&hold, data); 222 | return nexti(pc); 223 | } 224 | 225 | uchar * 226 | ge(Text *script, uchar *pc, Text *data) 227 | { 228 | vacate(data); 229 | return Ge(script, pc, data); 230 | } 231 | 232 | uchar * 233 | He(Text *script, uchar *pc, Text *data) 234 | { 235 | script = script; 236 | assure(&hold, 1); 237 | *hold.w++ = '\n'; 238 | tcopy(data, &hold); 239 | return nexti(pc); 240 | } 241 | 242 | uchar * 243 | he(Text 
*script, uchar *pc, Text *data) 244 | { 245 | script = script; 246 | vacate(&hold); 247 | tcopy(data, &hold); 248 | return nexti(pc); 249 | } 250 | 251 | uchar * 252 | Ie(Text *script, uchar *pc, Text *data) 253 | { 254 | script = script; 255 | data = data; 256 | return nexti(pc); 257 | } 258 | 259 | uchar * 260 | ie(Text *script, uchar *pc, Text *data) 261 | { 262 | script = script; 263 | data = data; 264 | if(printf("%s", (char*)(instr(pc)+1)) <= 0) 265 | quit(stdouterr); 266 | return nexti(pc); 267 | } 268 | 269 | uchar * 270 | Le(Text *script, uchar *pc, Text *data) 271 | { 272 | script = script; 273 | data = data; 274 | return (uchar*)(instr(pc)+1); 275 | } 276 | 277 | uchar * 278 | Ne(Text *script, uchar *pc, Text *data) 279 | { 280 | assure(data, 1); 281 | *data->w++ = '\n'; 282 | if(readline(data)) 283 | return nexti(pc); 284 | *--data->w = 0; 285 | return de(script, pc, data); 286 | } 287 | 288 | uchar * 289 | ne(Text *script, uchar *pc, Text *data) 290 | { 291 | if(!nflag) 292 | writeline(data); 293 | vacate(data); 294 | if(readline(data)) 295 | return nexti(pc); 296 | return 0; 297 | } 298 | 299 | uchar * 300 | Pe(Text *script, uchar *pc, Text *data) 301 | { 302 | int n; 303 | uchar *end = ustrchr(data->s, '\n'); 304 | if(end == 0) 305 | n = data->w - data->s; 306 | else 307 | n = end - data->s; 308 | if(fwrite(data->s, 1, n, stdout) != n) 309 | quit(stdouterr); 310 | cputchar('\n'); 311 | script = script; 312 | return nexti(pc); 313 | } 314 | 315 | uchar * 316 | pe(Text *script, uchar *pc, Text *data) 317 | { 318 | writeline(data); 319 | script = script; 320 | return nexti(pc); 321 | } 322 | 323 | uchar * 324 | qe(Text *script, uchar *pc, Text *data) 325 | { 326 | pc = pc; 327 | data = data; 328 | qflag++; 329 | return script->w; 330 | } 331 | 332 | uchar * 333 | te(Text *script, uchar *pc, Text *data) 334 | { 335 | int tflag = sflag; 336 | sflag = 0; 337 | if(tflag) 338 | return be(script, pc, data); 339 | else 340 | return nexti(pc); 341 | } 342 | 343 
| uchar * 344 | ww(Text *script, uchar *pc, Text *data, int offset) 345 | { 346 | int *q = (int*)(files.s + offset); 347 | FILE *f = *(FILE**)q; 348 | int n = data->w - data->s; 349 | assure(data, 1); 350 | *data->w = '\n'; 351 | if(fwrite(data->s, 1, n+1, f) != n+1 || 352 | fflush(f) == EOF) /* in case of subsequent r */ 353 | quit("error writing %s", (char*)(q+1)); 354 | *data->w = 0; 355 | script = script; 356 | return nexti(pc); 357 | } 358 | 359 | uchar * 360 | we(Text *script, uchar *pc, Text *data) 361 | { 362 | return ww(script, pc, data, instr(pc)[1]); 363 | } 364 | 365 | uchar * 366 | xe(Text *script, uchar *pc, Text *data) 367 | { 368 | uchar *t; 369 | script = script; 370 | if(hold.s == 0) 371 | vacate(&hold); 372 | exch(data->s, hold.s, t); 373 | exch(data->e, hold.e, t); 374 | exch(data->w, hold.w, t); 375 | return nexti(pc); 376 | } 377 | 378 | uchar * 379 | ye(Text *script, uchar *pc, Text *data) 380 | { 381 | uchar *s = (uchar*)data->s; 382 | uchar *w = (uchar*)data->w; 383 | uchar *tbl = (uchar*)(instr(pc)+1); 384 | for( ; ss; sw; s++, i++) { 429 | if(i >= 60) { 430 | cputchar('\\'); 431 | cputchar('\n'); 432 | i = 0; 433 | } 434 | for(j=0; j/dev/null 12 | then awk=awk 13 | elif nawk 'func error() { }' /dev/null 14 | then awk=nawk 15 | elif gawk 'func error() { }' /dev/null 16 | then awk=gawk 17 | else echo cannot find good awk, good bye; exit 1 18 | fi 19 | 20 | rm -f SCRIPT INPUT OUTPUT* RESULT NOWHERE 21 | $awk ' 22 | /^TEST/ { if(phase!=0) error() 23 | if(NF>=2) testno=$2; else testno++; next } 24 | /^SCRIPT/ { args = "" 25 | for(i=2; i<=NF; i++) args = args " " $i } 26 | /^SCRIPT|^INPUT|^OUTPUT/ { phase=$1; printf "" >$1; next } 27 | /^END/ { print testno 28 | command = "sed " args " -f SCRIPT RESULT" 29 | r = system(command) 30 | if(r) print "test " testno " returned " r 31 | if(system("cmp RESULT OUTPUT >/dev/null 2>&1")) { 32 | print "test " testno " FAILED" } 33 | close("SCRIPT"); close("INPUT"); close("OUTPUT") 34 | phase=0; next } 35 | 
phase==0 { next } 36 | { gsub(/;/,"\n",$0); print >phase } 37 | ' <<'FINI' 38 | 39 | # if one of the sections (SCRIPT, INPUT, OUTPUT) of a test 40 | # is missing the previous one is used. tests labeled 41 | # with the same number, e.g. 02A and 02B, are so linked 42 | # and should be kept together in order. 43 | 44 | # anything on the SCRIPT line becomes an argument to sed 45 | 46 | # semicolons are replaced by newlines in each section 47 | 48 | TEST 01A # =, -n, blank lines 49 | SCRIPT -n 50 | ;=; 51 | INPUT 52 | a;b;c 53 | OUTPUT 54 | 1;2;3 55 | END 56 | 57 | TEST 01B # comments, including #n 58 | SCRIPT 59 | #n;=;#comment 60 | END 61 | 62 | TEST 01C # empty script 63 | SCRIPT 64 | OUTPUT 65 | a;b;c 66 | END 67 | 68 | TEST 01D # do-nothing script 69 | SCRIPT 70 | # nothing 71 | 72 | END 73 | 74 | TEST 01E # substitution; regexp dot; &; substitution flag 75 | SCRIPT 76 | s/./&&/; s/./&x/; s/./&y/2; s/./&z/5 77 | OUTPUT 78 | axya;bxyb;cxyc 79 | END 80 | 81 | TEST 02A # line counting, ranges, overlaps 82 | SCRIPT 83 | 1,2d; 2,6d; 4,5d; 7,10d 84 | INPUT 85 | 1;2;3;4;5;6;7;8 86 | OUTPUT 87 | 3;6 88 | END 89 | 90 | TEST 02B # -e 91 | SCRIPT -e 1,2d -e 2,6d 92 | 4,5d; 7,10d 93 | END 94 | 95 | TEST 02C # bypassing end of range 96 | SCRIPT -n 97 | 2,5d; 1,3p; 1,6p 98 | OUTPUT 99 | 1;1;6 100 | END 101 | 102 | TEST 02C # negation, address $, print 103 | SCRIPT 104 | 3,$!d; $d; p; 4,5d 105 | OUTPUT 106 | 3;3;4;5;6;6;7;7 107 | END 108 | 109 | TEST 03 # regexp addresses, append, insert, change 110 | SCRIPT 111 | /a/a\;A\;A 112 | /a/a\;B 113 | /d/i\;D 114 | /c/c\;C\;C 115 | $i\;E 116 | INPUT 117 | a;b;c;d 118 | OUTPUT 119 | a;A;A;B;b;C;C;D;E;d 120 | END 121 | 122 | TEST 04 # braces 123 | SCRIPT 124 | 3,7{ 125 | /a/s/a/&&/;/b/,/./s/./&&&/ 126 | } 127 | INPUT 128 | a;b;a;b;a;a;b;a;b 129 | OUTPUT 130 | a;b;aa;bbb;aaaa;aa;bbb;a;b 131 | END 132 | 133 | TEST 05A # hold, get 134 | SCRIPT 135 | $!H;$!d;$G 136 | INPUT 137 | 1;2;3;4 138 | OUTPUT 139 | 4;;1;2;3 140 | END 141 | 142 | TEST 
05B # hold, exchange, get, brace without newline 143 | SCRIPT 144 | 1{h;d;};3x;$g 145 | OUTPUT 146 | 2;1;3 147 | END 148 | 149 | TEST 05D # quit 150 | SCRIPT 151 | a\;x;q;= 152 | OUTPUT 153 | 1;x 154 | END 155 | 156 | TEST 06 # next, regexp $ 157 | SCRIPT 158 | s/$/x/;n;s/./&y/;N;s/.../&z/;= 159 | INPUT 160 | a;b;c 161 | OUTPUT 162 | ax;3;by;zc 163 | END 164 | 165 | TEST 07A # newline, flag p 166 | SCRIPT -n 167 | /../s/./&\;/;p;P;s/\n//p 168 | INPUT 169 | a;bc;d 170 | OUTPUT 171 | a;a;b;c;b;bc;d;d 172 | END 173 | 174 | TEST 07B # write, flag w 175 | SCRIPT -n 176 | /../s/./&\;/;w RESULT 177 | s/\n/x/w RESULT 178 | OUTPUT 179 | a;b;c;bxc;d 180 | END 181 | 182 | TEST 08 # character classes, flag g 183 | SCRIPT 184 | 1s/[a-z][a-z]*// 185 | 2s/[[:digit:]][[:digit:]]*// 186 | 3s/[^a-z]/X/g 187 | INPUT 188 | AZ`abyz{ 189 | /0189: 190 | a1b2c3d 191 | OUTPUT 192 | AZ`{ 193 | /: 194 | aXbXcXd 195 | END 196 | 197 | TEST 09 # null matches 198 | SCRIPT 199 | 1,2s/a*/x/g 200 | INPUT 201 | 123 202 | aaa 203 | OUTPUT 204 | x1x2x3x 205 | xx 206 | END 207 | 208 | TEST 10 # longest match, unmatched subexpressions 209 | SCRIPT 210 | s/$...$*$..$*/:\1:\2:/ 211 | INPUT 212 | abc 213 | abcd 214 | abcde 215 | abcdef 216 | abcdefg 217 | OUTPUT 218 | :abc:: 219 | ::cd: 220 | :abc:de: 221 | :def:: 222 | :abc:fg: 223 | END 224 | 225 | TEST 11 # metacharacters in substition 226 | SCRIPT 227 | 1s/$/\&/ 228 | 2s/$/\b/ 229 | 3s/$/\\/ 230 | 4s/$/\// 231 | 5s&$&\&& 232 | INPUT 233 | 1;2;3;4;5 234 | OUTPUT 235 | 1&;2b;3\;4/;5 236 | END 237 | 238 | TEST 12A # branches 239 | SCRIPT 240 | :x 241 | /a/{;s/a/x/;bx 242 | } 243 | INPUT 244 | aaa;b;abca 245 | OUTPUT 246 | xxx;b;xbcx 247 | END 248 | 249 | TEST 12B # long labels may be truncated 250 | SCRIPT 251 | :longlabel 252 | /a/s/a/x/;tlonglabel 253 | END 254 | 255 | TEST 12C # jump to end of script 256 | SCRIPT 257 | 3b 258 | /a/s/a/x/g 259 | OUTPUT 260 | xxx;b;abca 261 | END 262 | 263 | 264 | TEST 13A # playing with end of bracket range 265 | SCRIPT 
-n 266 | /c/d 267 | /a/,/d/{ 268 | /b/,/c/{ 269 | = 270 | } 271 | } 272 | INPUT 273 | a;b;c;d;a 274 | OUTPUT 275 | 2;4;5 276 | END 277 | 278 | TEST 13B # end of change range 279 | SCRIPT 280 | /a/,/b/{ 281 | /b/,/c/c\ 282 | x 283 | } 284 | OUTPUT 285 | a;c;d;x 286 | END 287 | 288 | TEST 13C # end of change range 289 | INPUT 290 | a;b;c;a;c;b;d 291 | OUTPUT 292 | a;c;x;d 293 | END 294 | 295 | TEST 14 # end of change range 296 | SCRIPT 297 | /a/,/b/c\;c 298 | INPUT 299 | a;b;a 300 | OUTPUT 301 | c;c 302 | END 303 | 304 | TEST 15 # weird delimiters, remembered expression (could fail) 305 | SCRIPT 306 | 1s1$.$\11\11 307 | 2s.$\.$\..\.\1. 308 | 3s*$.$\**\*\1* 309 | 4s&$.$\&&\1\&& 310 | \1$\1$\11s//\1/ 311 | \&$\&$\&&s//\&b&/ 312 | INPUT 313 | a1b 314 | abc 315 | abc 316 | a&b 317 | 11b 318 | a&& 319 | OUTPUT 320 | 1b 321 | .ac 322 | *c 323 | aa&b 324 | 1b 325 | a&b&& 326 | END 327 | 328 | TEST 16 # 7680-char line, backreferencing, // could fail 329 | SCRIPT 330 | s/.*/&&&&&&&&/;s//&&&&&&&&/;s//&&&&&&&&/;h 331 | s/[^8]//g 332 | s/^$.*$\1\1\1$/\1/;s//\1/;s//\1/;s//\1/p 333 | g 334 | s/^$.*$\1\1\1$/\1/;s//\1/;s//\1/;s//\1/;s/$.*$\1/\1/ 335 | INPUT 336 | 123456787654321 337 | OUTPUT 338 | 88 339 | 123456787654321 340 | END 341 | 342 | TEST 17 # r from w file, nonexistent r 343 | SCRIPT -n 344 | r NOWHERE 345 | r RESULT 346 | w RESULT 347 | INPUT 348 | 1;2;3 349 | OUTPUT 350 | 1;1;2;1;2;3 351 | END 352 | 353 | TEST 18A # eof in n and N 354 | SCRIPT 355 | a\;1;n 356 | INPUT 357 | a 358 | OUTPUT 359 | a 360 | 1 361 | END 362 | 363 | TEST 18B 364 | SCRIPT 365 | a\;1;N 366 | OUTPUT 367 | 1 368 | END 369 | 370 | TEST 19 # transliterate 371 | SCRIPT 372 | y/abc/ABC/ 373 | y:/\\:.:\/.\:: 374 | INPUT 375 | abcABCabcdef 376 | 1/:2.\ 377 | OUTPUT 378 | ABCABCABCdef 379 | 1\.2:/ 380 | END 381 | 382 | TEST 20 # N, D 383 | SCRIPT 384 | =;N;p;D;s/.*/x/ 385 | INPUT 386 | a;b;c 387 | OUTPUT 388 | 1;a;b;2;b;c;3 389 | END 390 | 391 | TEST 20A # D, G initial states 392 | SCRIPT 393 | 1D;G;h 
394 | OUTPUT 395 | b;;c;b; 396 | END 397 | 398 | TEST 21 # interaction of a,c,r 399 | SCRIPT 400 | $!a\;A 401 | $!r INPUT 402 | $!a\;B 403 | 1,2c\;C 404 | INPUT 405 | a;b;c 406 | OUTPUT 407 | A;a;b;c;B;C;A;a;b;c;B;c 408 | END 409 | 410 | TEST 22A # multiple substitutions for null string 411 | SCRIPT 412 | s/a*/b/g 413 | INPUT 414 | aaa 415 | ccc 416 | OUTPUT 417 | bb 418 | bcbcbcb 419 | END 420 | 421 | TEST 22B 422 | SCRIPT 423 | s/a*/b/2 424 | OUTPUT 425 | aaab;cbcc 426 | END 427 | 428 | TEST 22C 429 | SCRIPT 430 | s/a*/b/3 431 | OUTPUT 432 | aaa;ccbc 433 | END 434 | 435 | 436 | FINI 437 | 438 | TEST=a0 # perverse semicolons 439 | 440 | echo $TEST 441 | echo 'a;b' | sed 's;\;;x;'| sed -n '/axb/!s/.*/test'$TEST' FAILED/p' 442 | 443 | TEST=a1 # multiple files, script argumen 444 | echo $TEST 445 | (echo a; echo b; echo c) >INPUT 446 | case "`sed -n '$=' INPUT INPUT INPUT`" in 447 | 9) : ;; 448 | *) echo test $TEST FAILED 449 | esac 450 | 451 | TEST=a2 # l command: weird chars, line folding 452 | # script argument, stdin 453 | echo $TEST 454 | 455 | awk 'BEGIN{printf "\ta%c\\\n", 1}' /dev/null | 456 | sed -n ' 457 | s/.*/&&&&&&&&&&/ 458 | H 459 | H 460 | H 461 | G 462 | l 463 | ' | sed ' 464 | :x 465 | /\\$/{ 466 | $bz 467 | N 468 | s/\\\n// 469 | tx 470 | bz 471 | } 472 | s/\$$// 473 | s/\\n// 474 | s/\\n// 475 | s/\\n// 476 | s/\\n// 477 | tw 478 | :z 479 | s/.*/test '$TEST' FAILED/ 480 | q 481 | :w 482 | s/\\ta\\001\\\\//g 483 | /^$/!bz 484 | d 485 | ' 486 | 487 | TEST=a3 # multiple w files, multiple inputs, OUTPUT1 empty 488 | 489 | echo $TEST 490 | 491 | rm -f OUTPUT* 492 | awk 'BEGIN{for(i=1;i<=9;i++) print i}' /dev/null | 493 | sed -n ' 494 | 9,$d 495 | w OUTPUT9 496 | 8,$d 497 | w OUTPUT8 498 | 7,$d 499 | w OUTPUT7 500 | 6,$d 501 | w OUTPUT6 502 | 5,$d 503 | w OUTPUT5 504 | 4,$d 505 | w OUTPUT4 506 | 3,$d 507 | w OUTPUT3 508 | 2,$d 509 | w OUTPUT2 510 | 1,$d 511 | w OUTPUT1 512 | ' 513 | sed -n '$=' OUTPUT1 OUTPUT2 OUTPUT3 OUTPUT4 OUTPUT5 \ 514 | OUTPUT6 
OUTPUT7 OUTPUT8 OUTPUT9 | 515 | sed -n '/36/!s/.*/test '$TEST' FAILED/p' 516 | 517 | TEST=a4 # assorted errors; each field is an argument 518 | 519 | echo $TEST 520 | y=0 521 | rm -f NOWHERE 522 | while read x 523 | do if sed $x DIAG && test ! -s DIAG 524 | then case $y in 525 | 0) echo bad or dubious usage not diagnosed: 526 | y=1 527 | esac 528 | echo \"$x\" 529 | fi 530 | done <<'END' 531 | = NOWHERE 532 | -e :x -e :x 533 | r/dev/null 534 | -f NOWHERE 535 | 536 | 0p 537 | //p 538 | bx 539 | / 540 | /a 541 | /\\ 542 | /a/ 543 | =;= 544 | 1,c 545 | 1,/ 546 | 1,2,3p 547 | /\\(/p 548 | /\\1/p 549 | s/a/b 550 | s/a/b/q 551 | s/a/b/g3 552 | s/a/b/gg 553 | s/a/b/pp 554 | s/a/b/wNOWHERE 555 | y/a 556 | y/a/ 557 | y/a/b 558 | y/aa/bb/ 559 | y/aa/ab/ 560 | y/a/bb/ 561 | y/aa/b/ 562 | 1,2= 563 | 1,2q 564 | : 565 | \\ 566 | a 567 | c 568 | e 569 | f 570 | i 571 | j 572 | k 573 | m 574 | o 575 | r 576 | s 577 | u 578 | v 579 | w 580 | y 581 | z 582 | { 583 | } 584 | 1 585 | pq 586 | dq 587 | aq 588 | !!p 589 | 1# 590 | a\\ 591 | END 592 | 593 | TEST=a5 # assorted errors; each line is a script 594 | 595 | echo $TEST 596 | y=0 597 | while read x 598 | do if sed "$x" DIAG && test ! -s DIAG 599 | then case $y in 600 | 0) echo bad or dubious usage not diagnosed: 601 | y=1 602 | esac 603 | echo \"$x\" 604 | fi 605 | done <<'END' 606 | w . 607 | s /a/b/ 608 | s/a/b/w . 609 | 1, 2p 610 | END 611 | 612 | echo 613 | echo Checking some customary extensions. 614 | echo 615 | 616 | echo 'Is semicolon usable as newline?' 617 | if sed '=;=' /dev/null 618 | then echo Yes. 619 | echo 'Does semicolon terminate a label?' 620 | echo No. | sed ':x;s/No./Yes./' 621 | else echo No. 622 | fi 623 | 624 | echo 'Can previous regular expression be abbreviated as //'? 625 | if sed '/a/s///' /dev/null 626 | then echo Yes. 627 | echo 'Is the meaning of // static or dynamic?' 
628 | echo ab | sed ' 629 | /a/bx 630 | /b/= 631 | :x 632 | s/// 633 | /a/s/.*/Static./ 634 | /b/s/.*/Dynamic./ 635 | ' 636 | else echo No. 637 | fi 638 | 639 | echo 'Is space optional in r and w commands?' 640 | if sed 'w/dev/null' /dev/null 641 | then echo Yes. 642 | else echo No. 643 | fi 644 | 645 | echo 'Can \ precede a non-special character in regular expression?' 646 | if sed '/\y/b' /dev/null 647 | then echo Yes. 648 | else echo No. 649 | fi 650 | 651 | echo 'Can \ precede a non-special character in substitution text?' 652 | if sed 's/x/\y/' /dev/null 653 | then echo Yes. 654 | else echo No. 655 | fi 656 | 657 | echo 'Are spaces allowed between addresses?' 658 | if sed '1, 2p' /dev/null 659 | then echo Yes. 660 | else echo No. 661 | fi 662 | 663 | echo 'Does ^ in $^a$ denote anchoring?' 664 | echo Yes. | sed '/$^Yes.$/!s/Yes/No/' 665 | -------------------------------------------------------------------------------- /testre.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * regex tester 3 | * 4 | * testre [-n] [-tN] [-v] < testre.dat 5 | * 6 | * -n repeat each test with REG_NOSUB 7 | * -tN time limit, N sec per test (default=10, no limit=0) 8 | * -v list each test line 9 | * 10 | * see comments in testre.dat for description of format 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "regex.h" 19 | #include 20 | #include 21 | 22 | #ifndef REG_NULL 23 | #define REG_NULL 0 24 | #endif 25 | #ifndef REG_ANCH 26 | #define REG_ANCH 0 27 | #endif 28 | #ifndef REG_LITERAL 29 | #define REG_LITERAL 0 30 | #endif 31 | #ifndef REG_AUGMENTED 32 | #define REG_AUGMENTED 0 33 | #endif 34 | 35 | #ifdef DEBUG /* tied to MDM's regex package */ 36 | #define MSTAT 1 37 | extern int mallocblocks; /* to look for memory leaks */ 38 | extern int mallocbytes; 39 | extern int freeblocks; 40 | #else 41 | #define MSTAT 0 42 | int mallocblocks; /* to keep the compiler happy */ 43 | int 
mallocbytes; 44 | int freeblocks; 45 | #endif 46 | 47 | #define elementsof(x) (sizeof(x)/sizeof(x[0])) 48 | #define streq(a,b) !strcmp(a,b) 49 | 50 | #define NOTEST ~0 51 | 52 | struct codes { 53 | int code; 54 | char *name; 55 | } codes[] = { 56 | {REG_NOMATCH, "NOMATCH"}, 57 | {REG_BADPAT, "BADPAT"}, 58 | {REG_ECOLLATE, "ECOLLATE"}, 59 | {REG_ECTYPE, "ECTYPE"}, 60 | {REG_EESCAPE, "EESCAPE"}, 61 | {REG_ESUBREG, "ESUBREG"}, 62 | {REG_EBRACK, "EBRACK"}, 63 | {REG_EPAREN, "EPAREN"}, 64 | {REG_EBRACE, "EBRACE"}, 65 | {REG_BADBR, "BADBR"}, 66 | {REG_ERANGE, "ERANGE"}, 67 | {REG_ESPACE, "ESPACE"}, 68 | {REG_BADRPT, "BADRPT"} 69 | }; 70 | 71 | int errors; 72 | int lineno; 73 | char *which; 74 | int prog; 75 | int nflag; 76 | int verbose; 77 | int timelim = 10; 78 | char *nosubmsg = ""; 79 | regmatch_t NOMATCH = {-2, -2}; 80 | 81 | static char* 82 | null(char* s) 83 | { 84 | return s ? (*s ? s : (char*)"NULL") : (char*)"NULL"; 85 | } 86 | 87 | static void 88 | report(char *comment, char *re, char *s) 89 | { 90 | errors++; 91 | printf("%d:%s versus %s %s %s %s", 92 | lineno, null(re), null(s), which, nosubmsg, comment); 93 | } 94 | 95 | static void 96 | bad(char *comment, char *re, char *s) 97 | { 98 | nosubmsg = ""; 99 | report(comment, re, s); 100 | printf(", test run abandoned\n"); 101 | exit(1); 102 | } 103 | 104 | static void 105 | doregerror(int code, regex_t *preg) 106 | { 107 | char buf[200]; 108 | char *msg = buf; 109 | 110 | switch(code) { 111 | case -SIGBUS: 112 | msg = "bus error"; 113 | break; 114 | case -SIGSEGV: 115 | msg = "memory fault"; 116 | break; 117 | case -SIGALRM: 118 | msg = "did not terminate"; 119 | break; 120 | default: 121 | regerror(code, preg, msg, sizeof buf); 122 | break; 123 | } 124 | printf("%s\n", msg); 125 | } 126 | 127 | static int 128 | readfield(char *f, char end) 129 | { 130 | int c; 131 | for(;;) { 132 | *f = 0; 133 | c = getc(stdin); 134 | if(c == EOF) 135 | return 1; 136 | if(c == end) 137 | break; 138 | if(c == '\n') 139 | 
return 1; 140 | *f++ = c; 141 | } 142 | if(c == '\t') { 143 | while(c == end) 144 | c = getc(stdin); 145 | ungetc(c, stdin); 146 | } 147 | return 0; 148 | } 149 | 150 | static int 151 | hex(int c) 152 | { 153 | return isdigit(c)? c-'0': 154 | isupper(c)? c-'A'+10: 155 | c-'a'+10; 156 | } 157 | 158 | void escape(char *s) 159 | { 160 | char *t; 161 | for(t=s; *t=*s; s++, t++) { 162 | if(*s != '\\') 163 | continue; 164 | switch(*++s) { 165 | case 0: 166 | *++t = 0; 167 | break; 168 | case 'n': 169 | *t = '\n'; 170 | break; 171 | case 'x': 172 | if(!isxdigit(s[1]) || !isxdigit(s[2])) 173 | bad("bad \\x\n", 0, 0); 174 | *t = hex(*++s) << 4; 175 | *t |= hex(*++s); 176 | break; 177 | default: 178 | s--; 179 | } 180 | } 181 | } 182 | 183 | static void 184 | getprog(char *ans) 185 | { 186 | char *s = ans + strlen(ans); 187 | while(isdigit(s[-1])) 188 | s--; 189 | if(*s == 0) 190 | prog = -1; 191 | else 192 | prog = atoi(s); 193 | } 194 | 195 | static int 196 | readline(char *spec, char *re, char *s, char *ans) 197 | { 198 | int c = getchar(); 199 | switch(c) { 200 | case EOF: 201 | return 0; 202 | case '#': 203 | while(c != '\n') 204 | c = getchar(); 205 | case '\n': 206 | *spec = 0; 207 | return 1; 208 | } 209 | ungetc(c, stdin); 210 | if(readfield(spec, '\t')) return 0; 211 | if(readfield(re, '\t')) return 0; 212 | if(readfield(s, '\t')) return 0; 213 | if(readfield(ans, '\n')) return 0; 214 | escape(re); 215 | escape(s); 216 | getprog(ans); 217 | return 1; 218 | } 219 | 220 | static void 221 | matchprint(regmatch_t *match, int nmatch, int m) 222 | { 223 | int i; 224 | for( ; nmatch>m; nmatch--) 225 | if(match[nmatch-1].rm_so != -1) 226 | break; 227 | for(i=0; i"); 349 | while((p = *++argv) && *p == '-') 350 | for(;;) 351 | { 352 | switch(*++p) 353 | { 354 | case 0: 355 | break; 356 | case 'n': 357 | nflag = 1; 358 | printf(", NOSUB"); 359 | continue; 360 | case 't': 361 | if(*++p == 0) 362 | p = "0"; 363 | timelim = atoi(p); 364 | break; 365 | case 'v': 366 | verbose = 
1; 367 | printf(", verbose"); 368 | continue; 369 | default: 370 | printf(", invalid option %c", *p); 371 | continue; 372 | } 373 | break; 374 | } 375 | if(p) 376 | printf(", argument(s) ignored"); 377 | printf("\n"); 378 | signal(SIGALRM, gotcha); 379 | // signal(SIGBUS, gotcha); 380 | // signal(SIGSEGV, gotcha); 381 | while(readline(spec, re, s, ans)) { 382 | lineno++; 383 | if(*spec == 0) 384 | continue; 385 | 386 | /* interpret: */ 387 | 388 | cflags = eflags = are = bre = ere = lre = 0; 389 | nmatch = 20; 390 | for(p=spec; *p; p++) { 391 | if(isdigit(*p)) { 392 | nmatch = strtol(p, &p, 10); 393 | p--; 394 | continue; 395 | } 396 | switch(*p) { 397 | case 'A': 398 | are = REG_AUGMENTED; 399 | continue; 400 | case 'B': 401 | bre = 1; 402 | continue; 403 | case 'E': 404 | ere = 1; 405 | continue; 406 | case 'L': 407 | lre = REG_LITERAL; 408 | continue; 409 | case 'N': 410 | cflags |= nonstd(REG_NOSUB); 411 | continue; 412 | case 'I': 413 | cflags |= nonstd(REG_ICASE); 414 | continue; 415 | case 'W': 416 | cflags |= nonstd(REG_NEWLINE); 417 | continue; 418 | case 'U': 419 | cflags |= nonstd(REG_NULL); 420 | continue; 421 | case 'C': 422 | cflags |= nonstd(REG_ANCH); 423 | continue; 424 | case 'b': 425 | eflags |= nonstd(REG_NOTBOL); 426 | continue; 427 | case 'e': 428 | eflags |= nonstd(REG_NOTEOL); 429 | continue; 430 | default: 431 | bad("bad spec\n", re, s); 432 | } 433 | } 434 | if(streq(re, "NULL")) 435 | re[0] = 0; 436 | if((cflags|eflags) == NOTEST) 437 | continue; 438 | 439 | compile: 440 | fflush(stdout); 441 | if(bre) { 442 | which = "BRE"; 443 | bre = 0; 444 | flags = cflags; 445 | } else if(ere) { 446 | which = "ERE"; 447 | ere = 0; 448 | flags = cflags | REG_EXTENDED; 449 | } else if(are) { 450 | which = "ARE"; 451 | are = 0; 452 | flags= cflags | REG_AUGMENTED; 453 | } else if (lre) { 454 | which = "LRE"; 455 | lre = 0; 456 | flags = cflags | REG_LITERAL; 457 | } else 458 | continue; 459 | 460 | nosub: 461 | nosubmsg = (char*)((flags^cflags)®_NOSUB? 
" (NOSUB)": ""); 462 | testno++; 463 | cret = alarmcomp(&preg, re, flags); 464 | if(cret == 0) { 465 | if(!streq(ans, "NULL") && 466 | !streq(ans,"NOMATCH") && 467 | ans[0]!='(') { 468 | report("regcomp should fail and didn't", re, ans); 469 | printf("\n"); 470 | continue; 471 | } 472 | } else if(streq(ans,"NULL") || ans[0]=='(') { 473 | report("regcomp failed: ", re, ans); 474 | doregerror(cret, &preg); 475 | goto next; 476 | } else if(cret==REG_BADPAT || cret==codeval(ans)) 477 | goto next; 478 | else if(streq(ans, "BADPAT")) 479 | goto next; 480 | else { 481 | report("regcomp failed with unexpected answer: ", re, ans); 482 | errors--; 483 | doregerror(cret, &preg); 484 | goto next; 485 | } 486 | 487 | /* execute: */ 488 | 489 | for(i=0; i= 0) { 496 | if(eret == prog) 497 | eret = 0; 498 | else 499 | report("wrong progress mark:", re, s); 500 | } 501 | 502 | if(eret != 0) { 503 | if(!streq(ans, "NOMATCH")) { 504 | report("regexec failed: ", re, s); 505 | doregerror(eret, &preg); 506 | } 507 | } else if(streq(ans,"NOMATCH")) { 508 | report("regexec should fail and didn't: " ,re, s); 509 | matchprint(match, nmatch, 0); 510 | } else if(streq(ans,"NULL") || flags®_NOSUB) 511 | matchcheck(0, match, ans, re, s); 512 | else 513 | matchcheck(nmatch, match, ans, re, s); 514 | regfree(&preg); 515 | 516 | next: 517 | if(nflag && (flags®_NOSUB)==0) { 518 | flags |= REG_NOSUB; 519 | goto nosub; 520 | } 521 | goto compile; 522 | } 523 | printf("%d lines, ", lineno); 524 | printf("%d tests, %d errors\n", testno, errors); 525 | if(MSTAT) { 526 | printf("%d blocks allocated", mallocblocks); 527 | printf(", %d bytes allocated", mallocbytes); 528 | printf(", %d blocks lost\n", mallocblocks-freeblocks); 529 | } 530 | return 0; 531 | } 532 | -------------------------------------------------------------------------------- /testre.dat: -------------------------------------------------------------------------------- 1 | # posix regular expression tests 2 | 3 | # input lines may be 
blank, or comment beginning with #, 4 | # or a test spec, four fields separated by 1 or more tabs 5 | 6 | # field 1, what tests to apply, one or more chars 7 | # B BRE 8 | # E ERE 9 | # A ARE (skip if REG_AUGMENTED is undefined) 10 | # N REG_NOSUB 11 | # I REG_ICASE 12 | # W REG_NEWLINE 13 | # U REG_NULL (skip if REG_NULL is undefined) 14 | # C REG_ANCH (skip if REG_ANCH is undefined) 15 | # L REG_LITERAL (skip if REG_LITERAL is undefined) 16 | # b REG_NOTBOL 17 | # e REG_NOTEOL 18 | # numb use numb for nmatch (20 by default) 19 | 20 | # field 2, a regular expression, or NULL meaning empty. 21 | 22 | # field 3, a string to match, or NULL meaning empty. 23 | 24 | # field 4, outcome. one of the posix error codes (with REG_ 25 | # deleted) or the match array, a list of (m,n) entries 26 | # with m and n being first and last+1 positions in string, 27 | # or NULL if REG_NOSUB is in effect and success is expected. 28 | # BADPAT is acceptable in place of any regcomp error code 29 | # and vice versa. 
30 | 31 | # fields 2 and 3 may contain certain escape sequences: 32 | # \n \xhh (hexadecimal) 33 | 34 | # the distinct regcomp() error values are parlous; 35 | # BADPAT matches any error and vice versa 36 | 37 | # basic sanity 38 | 39 | BEA abracadabra$ abracadabracadabra (7,18) 40 | BEAI aBrAcAdAbRa$ AbRaCaDaBrA (0,11) 41 | BEAI aBrAc 1ABRA2abra3abrac (11,16) 42 | BEA a...b abababbb (2,7) 43 | 44 | # "null" expressions 45 | 46 | B  () (0,0)(0,0) 47 | E  () (0,2) 48 | B () () (0,2) 49 | EA ) () (1,2) 50 | EA \) () (1,2) 51 | EA } {} (1,2) 52 | BEA a] a]a (0,2) 53 | 54 | EA NULL NULL BADPAT 55 | EA ()* NULL BADPAT 56 | B NULL NULL BADPAT 57 | B $a NULL EPAREN 58 | EA (a NULL EPAREN 59 | B $ NULL EPAREN 60 | B \{ NULL BADPAT 61 | EA { NULL BADPAT 62 | BEA \} NULL BADPAT 63 | BEA \] NULL BADPAT 64 | 65 | # anchoring 66 | 67 | BEA ^abc xa NOMATCH 68 | BEA ^a ax (0,1) 69 | BEA \^a a^a (1,3) 70 | BEA a\^ a^ (0,2) 71 | BEA a$ aa$ NOMATCH 72 | BEA a$ aa (1,2) 73 | BEA a\$ aa NOMATCH 74 | BEA a\$ a$ (0,2) 75 | BEA ^$ NULL (0,0) 76 | 77 | B $^ $^ (0,2) 78 | B $^ NULL NOMATCH 79 | EA $^ $^ NOMATCH 80 | EA $^ NULL (0,0) 81 | 82 | EA x^a x^a NOMATCH 83 | B x^a x^a (0,3) 84 | EA $a $a NOMATCH 85 | B $a $a (0,2) 86 | EA ^* a (0,0) 87 | B ^* * (0,1) 88 | #B $^$ and $$$not tested; std allows multiple behaviors 89 | B $\^$ ^ (0,1)(0,1) 90 | B $\$$ $ (0,1)(0,1) 91 | EA (^)* NULL (0,0) 92 | EA (^)+ NULL (0,0)(0,0) 93 | EA ^(^) NULL (0,0)(0,0) 94 | EA a($) aa (1,2)(2,2) 95 | EA a*(^a) aa (0,1)(0,1) 96 | B $* $$ (0,2) 97 | EA $* $$ (0,0) 98 | 99 | # longest leftmost match 100 | 101 | EA (..)*(...)* a (0,0)(?,?)(?,?) 102 | EA (..)*(...)* abc (0,3)(?,?)(0,3) 103 | EA (..)*(...)* abcd (0,4)(2,4)(?,?) 104 | EA (..)*(...)*.* abc (0,3)(0,2)(?,?) 105 | EA ((..)*(...)*).* abc (0,3)(0,3)(?,?)(0,3) 106 | EA ((...)*(..)*).* abcd (0,4)(0,4)(?,?)(2,4) 107 | EA (...)*(..)*.* abcd (0,4)(0,3)(?,?) 
108 | EA (a|ab)(bc|c) abc (0,3)(0,2)(2,3) 109 | EA (ab|a)(bc|c) abc (0,3)(0,2)(2,3) 110 | EA (ab)c|abc abc (0,3)(0,2) 111 | EA abc|(ab)c abc (0,3)(?,?) 112 | 113 | B $..$*$...$* a (0,0)(?,?)(?,?) 114 | B $..$*$...$* abc (0,3)(?,?)(0,3) 115 | B $..$*$...$* abcd (0,4)(2,4)(?,?) 116 | B $..$*$...$*.* abc (0,3)(0,2)(?,?) 117 | B $\(..$*$...$*\).* abc (0,3)(0,3)(?,?)(0,3) 118 | 119 | # backreference 120 | 121 | B $\(.$\2\)*a xxbxxyya (3,8)(5,7)(5,6) 122 | B $\(x$\2\)*a xxbxxyyaa (7,8)(?,?)(?,?) 123 | B $\(x$*a\)* xaxxaa (0,6)(5,6)(?,?) 124 | B $\(a*b*$\2\)* abbabbaa (0,8)(6,8)(6,7) 125 | 126 | B $.*$\1\1$.*$\2\2\2 aaa (0,3)(0,1)(3,3) 127 | B $.*$\1\1$.*$\2\2\2 aaaa (0,4)(0,0)(0,1) 128 | B $.*$\1\1$.*$\2\2\2 aaaaa (0,4)(0,0)(0,1) 129 | B $.*$\1\1$.*$\2\2\2 aaaaaa (0,6)(0,2)(6,6) 130 | B $.*$\1\1$.*$\2\2\2 aaaaaaa (0,7)(0,1)(3,4) 131 | B $.*$\1\1$.*$\2\{3\} aaaaaaa (0,7)(0,1)(3,4) 132 | 133 | B $.*$\1* ababab (0,6)(0,6) 134 | B $.*$\1*\1 ababab (0,6)(0,2) 135 | BI $.*$\1*\1 abABAb (0,6)(0,2) 136 | B $\(b$\)*\2 ab NOMATCH 137 | B $\(a$\)*$\2$*$b$ ab (0,2)(0,1)(0,1)(?,?)(1,2) 138 | 139 | B $\(a$$b$*\2\3\)* a (0,0)(?,?)(?,?)(?,?) 140 | B $\(a$$b$*\2\3\)* aa (0,0)(?,?)(?,?)(?,?) 141 | B $\(a$$b$*\2\3\)* abab (0,4)(0,4)(0,1)(1,2) 142 | B $\(a$$b$*\2\3\)* ababaab (0,4)(0,4)(0,1)(1,2) 143 | B $\(a$$b$*\2\3\)* abbab (0,5)(0,5)(0,1)(2,3) 144 | 145 | B $\(x$\2\)*$\(x$\4\4\)* xxxxxxy (0,6)(4,6)(4,5)(?,?)(?,?) 
146 | 147 | B [ab]*$b[ab]*$\1 baaabaababbaaabaabab (0,20)(0,10) 148 | B [ab]*$b[ab]*$\1[ab]* baaabaababbaaabaabab (0,20)(9,10) 149 | 150 | B a\1 NULL ESUBREG 151 | B $\1$ NULL ESUBREG 152 | B $.$\0 NULL ESUBREG 153 | 154 | # duplications 155 | 156 | B * * (0,1) 157 | B a+ aa+ (1,3) 158 | B $*x$ x*x (1,3)(1,3) 159 | EA a{0}b ab (1,2) 160 | BEAI ba*BA* AaBaAbAa (2,8) 161 | EA (a+ba*)* abaabaaabaaaa (0,13)(7,13) 162 | EA ((ax)+b(ax)*)* axbaxaxbaxaxaxbaxaxaxax (0,23)(12,23)(12,14)(21,23) 163 | EA (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 164 | EA (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 165 | B $a*$$b\{0,1\}$$b\{1,\}$b\{3\} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 166 | 167 | EA (a*){1,3} aaa (0,3)(0,3) 168 | EA (a*){2,3} aaa (0,3)(3,3) 169 | EA (a*){1,} aaa (0,3)(0,3) 170 | EA (a*){2} aaa (0,3)(3,3) 171 | B $a*$\{1,3\} aaa (0,3)(0,3) 172 | B $a*$\{2,3\} aaa (0,3)(3,3) 173 | B $a*$\{1,\} aaa (0,3)(0,3) 174 | B $a*$\{2\} aaa (0,3)(3,3) 175 | B $a*$\{1,3\}\1 aaa (0,3)(3,3) 176 | B $a*$\{2,3\}\1 aaa (0,3)(3,3) 177 | B $a*$\{1,\}\1 aaa (0,3)(3,3) 178 | B $a*$\{2\}\1 aaa (0,3)(3,3) 179 | 180 | EA * NULL BADRPT 181 | EA + NULL BADRPT 182 | EA | NULL BADPAT 183 | EA ? NULL BADRPT 184 | EA (*x) NULL BADRPT 185 | EA (a|) NULL BADPAT 186 | EA a{17 NULL EBRACE 187 | B a\{17 NULL EBRACE 188 | EA a{2,1} NULL BADBR 189 | B a\{2,1\} NULL BADBR 190 | EA a{9876543210} NULL BADBR 191 | EA a{-1} NULL BADBR 192 | EA a{1,10 NULL EBRACE 193 | EA a{1,10u} NULL BADBR 194 | B a\{1,10.\} NULL BADBR 195 | 196 | # alternations (these tests are not adequate to test an 197 | # optimizing compiler, because many of them can be 198 | # easily converted to simpler expressions, often involving 199 | # character classes) 200 | 201 | EA (a|at|ate|aten)(ten|en|n|d) atend (0,5)(0,4)(4,5) 202 | EA (a|aa|aaa|aaaaa)(aa|aaaa|aaa) aaaaaa (0,6)(0,3)(3,6) 203 | EA (a|a.|a..|a....)(a.|a...|a..) 
aaaaaa (0,6)(0,3)(3,6) 204 | EA (a*b|b*a)(a*b|b*a) bbaa (0,4)(0,3)(3,4) 205 | EA (ab|((ab)c|abc))c* abc (0,3)(0,3)(0,3)(0,2) 206 | EA ((a*b*c*)|(a*c*b*))* abcacb (0,6)(3,6)(?,?)(3,6) 207 | EA (a|(a|a)) a (0,1)(0,1)(?,?) 208 | EA ((a|a)|a) a (0,1)(0,1)(0,1) 209 | EA a*(a|aa) aaaa (0,4)(3,4) 210 | EA a*(a.|aa) aaaa (0,4)(2,4) 211 | EA a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) 212 | EA a(b)|a(b)c abc (0,3)(?,?)(1,2) 213 | EA .*(a|b)? b (0,1)(?,?) 214 | EA (a|b)?.* b (0,1)(0,1) 215 | EA .*(a|b).* aa (0,2)(1,2) 216 | EA (a|b)c|a(b|c) ac (0,2)(0,1)(?,?) 217 | EA (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) 218 | EA (a|b)*c|(a|ab)*c abc (0,3)(1,2)(?,?) 219 | EA (a|b)*c|(a|ab)*c xc (1,2)(?,?)(?,?) 220 | EA ((a|ab)(bc|c)|abc) abc (0,3)(0,3)(0,2)(2,3) 221 | EA ((.a|.ab)(bc.|c.)|abc.) xabcx (0,5)(0,5)(0,3)(3,5) 222 | EA (a|b)*|(c|d)* cd (0,2)(?,?)(1,2) 223 | EA (a.|b.)*|(c.|d.)* cxdx (0,4)(?,?)(2,4) 224 | EA (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)(?,?) 225 | EA (aa|ba)*a.*(b|bb) abbb (0,4)(?,?)(3,4) 226 | EA (a{1}a|ba)*a.*(b{1}|bb) abbb (0,4)(?,?)(3,4) 227 | EA .*|.*(a|b)? aa (0,2)(?,?) 228 | EA a?(ab|ba)ab abab (0,4)(0,2) 229 | EA a?(ac{0}b|ba)ab abab (0,4)(0,2) 230 | EA ab|abab aabab (1,5) 231 | EA ab|abab abbabab (0,2) 232 | EA aba|bab|bba baaabbbaba (5,8) 233 | EA aba|bab baaabbbaba (6,9) 234 | EA .*(bc|abcd) abcd (0,4)(0,4) 235 | 236 | EA (aa|aaa)*|(a|aaaaa) a (0,1)(?,?)(0,1) 237 | EA (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)(?,?) 238 | EA (aa|aaa)*|(a|aaaaa) aaa (0,3)(0,3)(?,?) 239 | EA (aa|aaa)*|(a|aaaaa) aaaa (0,4)(2,4)(?,?) 240 | EA (aa|aaa)*|(a|aaaaa) aaaaa (0,5)(3,5)(?,?) 241 | EA (a.|a..)*|(a|a....) a (0,1)(?,?)(0,1) 242 | EA (a.|.a.)*|(a|.a...) aa (0,2)(0,2)(?,?) 243 | EA (a.|..a)*|(a|..a..) aaa (0,3)(0,3)(?,?) 244 | EA (a.|a..)*|(a|...a.) aaaa (0,4)(2,4)(?,?) 245 | EA (a.|.a.)*|(a|....a) aaaaa (0,5)(3,5)(?,?) 
246 | 247 | EA ab|a xabc (1,3) 248 | EA ab|a xxabc (2,4) 249 | EA ab|cd bc NOMATCH 250 | EAI (Ab|cD)* aBcD (0,4)(2,4) 251 | 252 | EA (a|) NULL BADPAT 253 | EA a||b NULL BADPAT 254 | 255 | EA (\x7f\x80|\x80\x81|\x81\x82)* \x7f\x80\x81\x82 (0,4)(2,4) 256 | 257 | 258 | # character classes 259 | 260 | BEA [a-z][0-9] AZ90az90 (5,7) 261 | BEA [[:lower:]][[:digit:]] AZ90az90 (5,7) 262 | BEA [[.a.]-z][[.zero.]-9] AZ90az90 (5,7) 263 | BEAI [a-z][0-9] AZ90az90 (1,3) 264 | BEAI [[:upper:]][[:lower:]] aBCde (0,2) 265 | EA [[:lower:]]+ `az{ (1,3) 266 | EA [[:upper:]]+ @AZ[ (1,3) 267 | BEA [\t]* t\ (0,2) 268 | EA [^a-z]+ a@Z`{z (1,5) 269 | B [$[:xdigit:].,]\{1,\} gfa$1,234.AFG (1,12) 270 | EAI [[=Z=]]+ azZa (1,3) 271 | BEA [[=a=]b]* abc (0,2) 272 | EA [a[=b=]]+ cabc (1,3) 273 | BEA [[.-.][.].]-[.^.]]* ]-^a (0,3) 274 | BEA [[-]] [[-]] (2,4) 275 | BEA [---] a-b (1,2) 276 | BEA [^-] --a (2,3) 277 | BEA [a-]* --a (0,3) 278 | 279 | BEA []][[.].]][[.right-square-bracket.]] a]]] (1,4) 280 | EA [[-[.right-square-bracket.]]+ Z[\]^ (1,4) 281 | 282 | BEA [z-a] NULL ERANGE 283 | BEA [^] NULL EBRACK 284 | BEA [[:upper:]-z] NULL ERANGE 285 | BEA [[.a.]-[=b=]]* NULL ERANGE 286 | BEA [[.NULL.]] NULL ECOLLATE 287 | BEA abc[[ NULL EBRACK 288 | BEA [^abc NULL EBRACK 289 | BEA [A-Z-a] NULL BADPAT 290 | BEA [[..]] NULL ECOLLATE 291 | BEA [[.- NULL BADPAT 292 | BEA [[.^.] 
NULL EBRACK 293 | BEA [[.?.]] NULL ECOLLATE 294 | BEA [[.[.]] NULL ECOLLATE 295 | BEA [[ NULL EBRACK 296 | BEA [[= NULL BADPAT 297 | BEA [[=a NULL BADPAT 298 | BEA [[=a= NULL BADPAT 299 | BEA [[=a=] NULL EBRACK 300 | BEA [[:greek:]] NULL ECTYPE 301 | BEA [[:upperc:]] NULL ECTYPE 302 | BEA [[:upper=]] NULL BADPAT 303 | BEA [[:upper:}] NULL BADPAT 304 | BEA [[=aleph=]] NULL ECOLLATE 305 | 306 | # metacharacters; see also null expressions 307 | 308 | B \$$\^^\\\.()\[] $$^^\.()[] (0,10) 309 | EA \$\^\\\.\{})\[] $^\.{}())[] (0,11) 310 | 311 | BEA \ NULL EESCAPE 312 | BEA [ NULL EBRACK 313 | EA ( NULL EPAREN 314 | BEA \\\ NULL EESCAPE 315 | BEA \t NULL BADPAT 316 | 317 | # funny characters 318 | 319 | BEA [[.NUL.]] NULL NOMATCH 320 | BEA [[.NUL.]-[.STX.]] \x03\x01 (1,2) 321 | BEA [[.DEL.]-\x81] \xfe\x80\x82 (1,2) 322 | EA a[[.NUL.]]?b ab (0,2) 323 | BEA a[[.NUL.]] a NOMATCH 324 | BEA .* \x01\xff (0,2) 325 | BEA [^[.NUL.]]* \x01\xff (0,2) 326 | 327 | # NOTBOL, NOTEOL, NEWLINE 328 | 329 | BEAb ^a abc NOMATCH 330 | BEAe a$ aaa NOMATCH 331 | BEAeW a$ aa\na (1,2) 332 | BEA . \n (0,1) 333 | BEAW . \n NOMATCH 334 | BEAW .*$ a\na (0,1) 335 | BEA \n \n (0,1) 336 | BEAW \n \n (0,1) 337 | BEAbW ^a a\na (2,3) 338 | BEA [^a] \n (0,1) 339 | BEAW [^a] \n NOMATCH 340 | EAW .^ \na NOMATCH 341 | EAW $. 
\n NOMATCH 342 | EAW $ \n (0,0) 343 | EAW \n^ \na (0,1) 344 | BEAW [[.newline.]] \n (0,1) 345 | BEA [[.newline.]] \n (0,1) 346 | 347 | # NOSUB 348 | 349 | EAN (a)(b)(c) abc NULL 350 | BEAN xxx xxx NULL 351 | BEAN xxx xx NOMATCH 352 | 353 | # mouthfuls 354 | 355 | B $.$$.$$.$$.$$.$$.$$.$$.$$.$$.$\9\8\7\6\5\4\3\2\1 1234567890987654321 (0,19)(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,9)(9,10) 356 | B5 $.$$.$$.$$.$$.$$.$$.$$.$$.$$.$\9\8\7\6\5\4\3\2\1 1234567890987654321 (0,19)(0,1)(1,2)(2,3)(3,4) 357 | EA1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) 358 | EA1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) 359 | EA1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/60 NOMATCH 360 | EA1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) 361 | EA3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) 362 | EA3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) 363 | EA3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))|xx) xx (0,2)(0,2)(?,?) 364 | EA3N ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))|xx) xx NULL 365 | EA a?(ab|ba)* abababababababababababababababababababababababababababababababababababababababab (0,80)(78,80) 366 | EA a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) 367 | EA abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) 368 | EA abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) 369 | EA aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) 370 | 371 | # augmented re's 372 | 373 | A a! a (0,0) 374 | A [abc]+&[bcd]* abcd (1,3) 375 | A [abc]*&[bcd]+ abcd (1,3) 376 | A (a.*)!&(.+) abb (1,3)(?,?)(1,3) 377 | A xy(.*xy.*)!xy xy.xx.yy.xy.xy (0,11)(?,?) 378 | A a\! aa!a (1,3) 379 | E a! aa!a (1,3) 380 | E a\! aa!a BADPAT 381 | A x!a* NULL (0,0) 382 | A ((...|.....)*)! aa (0,2) 383 | A ((...|.....)*)! 
aaa (0,2) 384 | A ((...|.....)*)! aaaa (0,4) 385 | A ((...|.....)*)! aaaaa (0,4) 386 | A ((...|.....)*)! aaaaaa (0,4) 387 | A ((...|.....)*)! aaaaaaa (0,7) 388 | A ((...|.....)*)! aaaaaaaa (0,7) 389 | A ((...|.....)*)! aaaaaaaaa (0,7) 390 | A ((...)*(.....)*)! aa (0,2) 391 | A ((...)*(.....)*)! aaa (0,2) 392 | A ((...)*(.....)*)! aaaa (0,4) 393 | A ((...)*(.....)*)! aaaaa (0,4) 394 | A ((...)*(.....)*)! aaaaaa (0,4) 395 | A ((...)*(.....)*)! aaaaaaa (0,7) 396 | A ((...)*(.....)*)! aaaaaaaa (0,7) 397 | A ((...)*(.....)*)! aaaaaaaaa (0,7) 398 | -------------------------------------------------------------------------------- /sed1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "sed.h" 6 | 7 | #define ustrlen(p) strlen((char*)(p)) 8 | #define ustrcmp(p, q) strcmp((const char*)(p), (const char*)(q)) 9 | #define ustrcpy(p, q) (uchar*)strcpy((char*)(p), (const char*)(q)) 10 | #define ustrchr(p, c) (uchar*)strchr((const char*)(p), c) 11 | 12 | int blank(Text*); 13 | void fixlabels(Text*); 14 | void fixbrack(Text*); 15 | void ckludge(Text*, int, int, int, Text*); 16 | int addr(Text*, Text*); 17 | int pack(int, int, int); 18 | int* instr(uchar*); 19 | uchar *succi(uchar*); 20 | extern void jprint(regex_t*); /* secret entry into regex pkg */ 21 | 22 | int semicolon; 23 | Text rebuf; 24 | 25 | uchar adrs[256] = { /* max no. 
of addrs, 3 is illegal */ 26 | 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, /* */ 27 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 28 | 3, 2, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* !# */ 29 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 3, 1, 3, 3, /* := */ 30 | 3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, /* DGHN */ 31 | 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* P */ 32 | 3, 1, 2, 2, 2, 3, 3, 2, 2, 1, 3, 3, 2, 3, 2, 3, /* a-n */ 33 | 2, 1, 2, 2, 2, 3, 3, 2, 2, 2, 3, 2, 3, 0, 3, 3, /* p-y{} */ 34 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 35 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 36 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 37 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 38 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 39 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 40 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 41 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 42 | }; 43 | 44 | #define Ec Tc /* commands that have same compilation method */ 45 | #define Dc Tc 46 | #define Gc Tc 47 | #define Hc Tc 48 | #define Nc Tc 49 | #define Pc Tc 50 | #define dc Tc 51 | #define gc Tc 52 | #define hc Tc 53 | #define lc Tc 54 | #define nc Tc 55 | #define pc Tc 56 | #define qc Tc 57 | #define xc Tc 58 | #define tc bc 59 | #define ic ac 60 | #define cc ac 61 | 62 | typedef void cmdf(Text*, Text*); 63 | cmdf Xc, Cc, Ec; /* comment #, colon, equal */ 64 | cmdf Lc, Rc; /* left {, right }, */ 65 | cmdf Ic, Tc, xx; /* ignore, trivial, error */ 66 | cmdf Dc, Gc, Hc, Nc, Pc; 67 | cmdf ac, bc, cc, dc, gc, hc, ic, nc; 68 | cmdf pc, qc, rc, sc, tc, wc, xc, yc; 69 | 70 | static cmdf *docom[128] = { 71 | xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,Ic,xx,xx,xx,xx,xx, /* */ 72 | xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx, 73 | xx,Ic,xx,Xc,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx, /* !# */ 74 | xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,Cc,Ic,xx,Ec,xx,xx, /* :;= */ 75 | xx,xx,xx,xx,Dc,xx,xx,Gc,Hc,xx,xx,xx,xx,xx,Nc,xx, /* DGHN */ 76 | 
Pc,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx,xx, /* P */ 77 | xx,ac,bc,cc,dc,xx,xx,gc,hc,ic,xx,xx,lc,xx,nc,xx, /* a-n */ 78 | pc,qc,rc,sc,tc,xx,xx,wc,xc,yc,xx,Lc,xx,Rc,xx,xx /* p-y{} */ 79 | }; 80 | 81 | uchar *synl; /* current line pointer for syntax errors */ 82 | 83 | void 84 | compile(Text *script, Text *t) 85 | { 86 | int loc; /* progam counter */ 87 | int neg; /* ! in effect */ 88 | int cmd; 89 | int naddr; 90 | int *q; /* address of instruction word */ 91 | t->w = t->s; /* here w is a read pointer */ 92 | while(*t->w) { 93 | assure(script, 4*sizeof(int)); 94 | loc = script->w - script->s; 95 | synl = t->w; 96 | naddr = 0; 97 | while(blank(t)) ; 98 | naddr += addr(script, t); 99 | if(naddr && *t->w ==',') { 100 | t->w++; 101 | naddr += addr(script, t); 102 | if(naddr < 2) 103 | syntax("missing address"); 104 | } 105 | q = (int*)script->w; 106 | if(naddr == 2) 107 | *q++ = INACT; 108 | script->w = (uchar*)(q+1); 109 | neg = 0; 110 | for(;;) { 111 | while(blank(t)); 112 | cmd = *t->w++; 113 | if(neg && docom[cmd&0xff]==Ic) 114 | syntax("improper !"); 115 | if(cmd != '!') 116 | break; 117 | neg = NEG; 118 | } 119 | if(!neg) { 120 | switch(adrs[cmd]) { 121 | case 1: 122 | if(naddr <= 1) 123 | break; 124 | case 0: 125 | if(naddr == 0) 126 | break; 127 | syntax("too many addresses"); 128 | } 129 | } 130 | docom[cmd&0xff](script, t); 131 | switch(*t->w) { 132 | case 0: 133 | script->w = script->s + loc; 134 | break; 135 | default: 136 | if(cmd == '{') 137 | break; 138 | syntax("junk after command"); 139 | case ';': 140 | if(!semicolon++) 141 | synwarn("semicolon separators"); 142 | case '\n': 143 | t->w++; 144 | } 145 | *q = pack(neg,cmd,script->w-script->s-loc); 146 | } 147 | fixbrack(script); 148 | fixlabels(script); 149 | } 150 | 151 | 152 | /* COMMAND LAYOUT */ 153 | 154 | int 155 | blank(Text *t) 156 | { 157 | if(*t->w==' ' || *t->w=='\t') { 158 | t->w++; 159 | return 1; 160 | } else 161 | return 0; 162 | } 163 | 164 | int * 165 | instr(uchar *p) /* get address 
of command word */ 166 | { 167 | int *q = (int*)p; 168 | while((*q & IMASK) != IMASK) 169 | q++; 170 | return q; 171 | } 172 | 173 | uchar * 174 | succi(uchar *p) 175 | { 176 | int *q = instr(p); 177 | if(code(*q) == '{') 178 | return (uchar*)(q+1); 179 | else 180 | return p + (*q & LMASK); 181 | } 182 | 183 | int 184 | pack(int neg, int cmd, int length) 185 | { 186 | int l = length & LMASK; 187 | if(length != l) 188 | syntax(" or too long"); 189 | return IMASK | neg | cmd << 2*BYTE | l; 190 | } 191 | 192 | void 193 | putint(Text *s, int n) 194 | { 195 | assure(s, sizeof(int)); 196 | *(int*)s->w = n; 197 | s->w += sizeof(int); 198 | } 199 | 200 | int 201 | number(Text *t) 202 | { 203 | unsigned n = 0; 204 | while(isdigit(*t->w)) { 205 | if(n > (INT_MAX-9)/10) 206 | syntax("number too big"); 207 | n = n*10 + *t->w++ - '0'; 208 | } 209 | return n; 210 | } 211 | 212 | int 213 | addr(Text *script, Text *t) 214 | { 215 | int n; 216 | switch(*t->w) { 217 | default: 218 | return 0; 219 | case '$': 220 | t->w++; 221 | n = DOLLAR; 222 | break; 223 | case '\\': 224 | t->w++; 225 | if(*t->w=='\n' ||*t->w=='\\') 226 | syntax("bad regexp delimiter"); 227 | case '/': 228 | n = recomp(&rebuf, t, *t->w++) | REGADR; 229 | break; 230 | case '0': case '1': case '2': case '3': case '4': 231 | case '5': case '6': case '7': case '8': case '9': 232 | n = number(t); 233 | if(n == 0) 234 | syntax("address is zero"); 235 | } 236 | putint(script, n); 237 | return 1; 238 | } 239 | 240 | regex_t * 241 | readdr(int x) 242 | { 243 | return (regex_t*)(rebuf.s + (x&AMASK)); 244 | } 245 | 246 | /* LABEL HANDLING */ 247 | 248 | /* the labels array consists of int values followed by strings. 
249 | value -1 means unassigned; other values are relative to the 250 | beginning of the script 251 | 252 | on the first pass, every script ref to a label becomes the 253 | integer offset of that label in the labels array, or -1 if 254 | it is a branch to the end of script 255 | 256 | on the second pass (fixlabels), the script ref is replaced 257 | by the value from the labels array. */ 258 | 259 | Text labels; 260 | 261 | int * 262 | lablook(uchar *l, Text *labels) 263 | { 264 | uchar *p, *q; 265 | int n; 266 | assure(labels, 1); 267 | for(p = labels->s; p < labels->w; ) { 268 | q = p + sizeof(int); 269 | if(ustrcmp(q, l) == 0) 270 | return (int*)p; 271 | q += ustrlen(q) + 1; 272 | p = (uchar*)intp(q); 273 | } 274 | n = ustrlen(l); 275 | assure(labels, sizeof(int)+n+1+sizeof(int)); 276 | *(int*)p = -1; 277 | q = p + sizeof(int); 278 | ustrcpy(q, l); 279 | q += ustrlen(q) + 1; 280 | labels->w = (uchar*)intp(q); 281 | return (int*)p; 282 | } 283 | 284 | /* find pos in label list; assign value i to label if i>=0 */ 285 | 286 | int 287 | getlab(Text *t, int i) 288 | { 289 | int *p; 290 | uchar *u; 291 | while(blank(t)); /* not exactly posix */ 292 | for(u=t->w; *t->w!='\n'; t->w++) 293 | if(!isprint(*t->w) || *t->w==' ') 294 | synwarn("invisible character in name"); 295 | if(u == t->w) 296 | return -1; 297 | *t->w = 0; 298 | p = lablook(u, &labels); 299 | if(*p == -1) 300 | *p = i; 301 | else if(i != -1) 302 | syntax("duplicate label"); 303 | *t->w = '\n'; 304 | return (uchar*)p - labels.s; 305 | } 306 | 307 | void 308 | Cc(Text *script, Text *t) /* colon */ 309 | { 310 | if(getlab(t, script->w - sizeof(int) - script->s) == -1) 311 | syntax("missing label"); 312 | } 313 | 314 | void 315 | bc(Text *script, Text *t) 316 | { 317 | int g; 318 | g = getlab(t, -1); /* relative pointer to label list */ 319 | putint(script, g); 320 | } 321 | 322 | void 323 | fixlabels(Text *script) 324 | { 325 | uchar *p; 326 | int *q; 327 | for(p=script->s; pw; p=succi(p)) { 328 | q = 
instr(p); 329 | switch(code(*q)) { 330 | case 't': 331 | case 'b': 332 | if(q[1] == -1) 333 | q[1] = script->w - script->s; 334 | else if(*(int*)(labels.s+q[1]) != -1) 335 | q[1] = *(int*)(labels.s+q[1]); 336 | else 337 | quit("undefined label: ", 338 | labels.s+q[1]+sizeof(int)); 339 | } 340 | } 341 | free(labels.s); 342 | } 343 | 344 | /* FILES */ 345 | 346 | Text files; 347 | 348 | void 349 | rc(Text *script, Text *t) 350 | { 351 | uchar *u; 352 | if(!blank(t)) 353 | synwarn("no space before file name"); 354 | while(blank(t)) ; 355 | for(u=t->w; *t->w!='\n'; t->w++) ; 356 | if(u == t->w) 357 | syntax("missing file name"); 358 | *t->w = 0; 359 | putint(script, (uchar*)lablook(u, &files) - files.s); 360 | *t->w = '\n'; 361 | } 362 | 363 | void 364 | wc(Text *script, Text *t) 365 | { 366 | int *p; 367 | rc(script, t); 368 | p = (int*)(files.s + ((int*)script->w)[-1]); 369 | if(*p != -1) 370 | return; 371 | *(FILE**)p = fopen((char*)(p+1), "w"); 372 | if(*p == 0) 373 | syntax("can't open file for writing"); 374 | } 375 | 376 | /* BRACKETS */ 377 | 378 | Text brack; 379 | 380 | /* Lc() stacks (in brack) the location of the { command word. 381 | Rc() stuffs into that word the offset of the } sequel 382 | relative to the command word. 383 | fixbrack() modifies the offset to be relative to the 384 | beginning of the instruction, including addresses. 
*/ 385 | 386 | void /* { */ 387 | Lc(Text *script, Text *t) 388 | { 389 | while(blank(t)); 390 | putint(&brack, script->w - sizeof(int) - script->s); 391 | } 392 | 393 | void /* } */ 394 | Rc(Text *script, Text *t) 395 | { 396 | int l; 397 | int *p; 398 | t = t; 399 | if(brack.w == 0 || (brack.w-=sizeof(int)) < brack.s) 400 | syntax("unmatched }"); 401 | l = *(int*)brack.w; 402 | p = (int*)(script->s + l); 403 | l = script->w - script->s - l; 404 | if(l >= LMASK - 3*sizeof(int)) /* fixbrack could add 3 */ 405 | syntax("{command-list} too long)"); 406 | *p = *p&~LMASK | l; 407 | } 408 | 409 | void 410 | fixbrack(Text *script) 411 | { 412 | uchar *p; 413 | int *q; 414 | if(brack.w == 0) 415 | return; 416 | if(brack.w > brack.s) 417 | syntax("unmatched {"); 418 | for(p=script->s; pw; p=succi(p)) { 419 | q = instr(p); 420 | if(code(*q) == '{') 421 | *q += (uchar*)q - p; 422 | } 423 | free(brack.s); 424 | } 425 | 426 | /* EASY COMMANDS */ 427 | 428 | void 429 | Xc(Text *script, Text *t) /* # */ 430 | { 431 | script = script; /* avoid use/set diagnostics */ 432 | if(t->s[1]=='n') 433 | nflag = 1; 434 | while(*t->w != '\n') 435 | t->w++; 436 | } 437 | 438 | void 439 | Ic(Text *script, Text *t) /* ignore */ 440 | { 441 | script = script; 442 | t->w--; 443 | } 444 | 445 | void 446 | Tc(Text *script, Text *t) /* trivial to compile */ 447 | { 448 | script = script; 449 | t = t; 450 | } 451 | 452 | void 453 | xx(Text *script, Text *t) 454 | { 455 | script = script; 456 | t = t; 457 | syntax("unknown command"); 458 | } 459 | 460 | /* MISCELLANY */ 461 | 462 | void 463 | ac(Text *script, Text *t) 464 | { 465 | if(*t->w++ != '\\' || *t->w++ != '\n') 466 | syntax("\\ missing after command"); 467 | for(;;) { 468 | while(bflag && blank(t)) ; 469 | assure(script, 2 + sizeof(int)); 470 | switch(*t->w) { 471 | case 0: 472 | quit("bug: missed end of "); 473 | case '\n': 474 | *script->w++ = *t->w; 475 | *script->w++ = 0; 476 | script->w = (uchar*)intp(script->w); 477 | return; 478 | 
case '\\': 479 | t->w++; 480 | default: 481 | *script->w++ = *t->w++; 482 | } 483 | } 484 | } 485 | void 486 | yc(Text *script, Text *t) 487 | { 488 | int i; 489 | int delim = *t->w++; 490 | uchar *s = script->w; 491 | uchar *p, *q; 492 | uchar c, d; 493 | if(delim == '\n' || delim=='\\') 494 | syntax("missing delimiter"); 495 | assure(script, 256); 496 | for(i=0; i<256; i++) 497 | s[i] = 0; 498 | for(q=t->w; *q!=delim; q++) 499 | if(*q == '\n') 500 | syntax("missing delimiter"); 501 | else if(*q=='\\' && q[1]==delim) 502 | q++; 503 | for(p=t->w, q++; *p != delim; p++, q++) { 504 | if(*p=='\\' && p[1]==delim) 505 | p++; 506 | if(*q == '\n') 507 | syntax("missing delimiter"); 508 | if(*q == delim) 509 | syntax("string lengths differ"); 510 | if(*q=='\\' && q[1]==delim) 511 | q++; 512 | if(s[*p] && s[*p]!=*q) 513 | syntax("ambiguous map"); 514 | if(s[*p]) 515 | synwarn("redundant map"); 516 | s[*p] = *q; 517 | } 518 | if(*q++ != delim) 519 | syntax("string lengths differ"); 520 | for(i=0; i<256; i++) 521 | if(s[i] == 0) 522 | s[i] = i; 523 | t->w = q; 524 | script->w += 256; 525 | } 526 | 527 | void 528 | sc(Text *script, Text *t) 529 | { 530 | int c, flags, re; 531 | int *q; 532 | int n = -1; 533 | int nsub; 534 | int delim = *t->w++; 535 | switch(delim) { 536 | case '\n': 537 | case '\\': 538 | syntax("improper delimiter"); 539 | } 540 | re = recomp(&rebuf, t, delim); 541 | putint(script, re); 542 | nsub = readdr(re)->re_nsub; 543 | flags = script->w - script->s; 544 | putint(script, 0); /* space for flags */ 545 | while((c=*t->w++) != delim) { 546 | assure(script, 3+sizeof(int*)); 547 | if(c == '\n') 548 | syntax("unterminated command"); 549 | else if(c == '\\') { 550 | int d = *t->w; 551 | if(d==delim) 552 | ; 553 | else if(d=='&' || d=='\\') 554 | *script->w++ = c; 555 | else if(d>='0' && d<='9') { 556 | if(d > '0'+nsub) 557 | syntax("improper backreference"); 558 | *script->w++ = c; 559 | } 560 | c = *t->w++; 561 | } 562 | *script->w++ = c; 563 | } 564 | 
*script->w++ = 0; 565 | script->w = (uchar*)intp(script->w); 566 | q = (int*)(script->s + flags); 567 | *q = 0; 568 | for(;;) { 569 | switch(*t->w) { 570 | case '0': case '1': case '2': case '3': case '4': 571 | case '5': case '6': case '7': case '8': case '9': 572 | if(n != -1) 573 | syntax("extra flags"); 574 | n = number(t); 575 | if(n == 0 || (n&(PFLAG|WFLAG)) != 0) 576 | syntax("count out of range"); 577 | continue; 578 | case 'p': 579 | if(*q & PFLAG) 580 | syntax("extra flags"); 581 | t->w++; 582 | *q |= PFLAG; 583 | continue; 584 | case 'g': 585 | t->w++; 586 | if(n != -1) 587 | syntax("extra flags"); 588 | n = 0; 589 | continue; 590 | case 'w': 591 | t->w++; 592 | *q |= WFLAG; 593 | wc(script, t); 594 | } 595 | break; 596 | } 597 | *q |= n==-1? 1: n; 598 | } 599 | 600 | void 601 | synwarn(const char *s) 602 | { 603 | uchar *t = ustrchr(synl, '\n'); 604 | warn("%s: %.*s", s, t-synl, synl); 605 | } 606 | 607 | void 608 | syntax(const char *s) 609 | { 610 | uchar *t = ustrchr(synl, '\n'); 611 | quit("%s: %.*s", s, t-synl, synl); 612 | } 613 | 614 | 615 | void 616 | printscript(Text *script) 617 | { 618 | /* 619 | uchar *s; 620 | int *q; 621 | for(s=script->s; sw; s = succi(s)) { 622 | q = (int*)s; 623 | if((*q&IMASK) != IMASK) { 624 | if((*q®ADR) == 0) 625 | printf("%d", *q); 626 | else 627 | jprint((regex_t*)(*q & AMASK)); 628 | q++; 629 | } 630 | if((*q&IMASK) != IMASK) { 631 | if((*q®ADR) == 0) 632 | printf(",%d", *q); 633 | else 634 | jprint((regex_t*)(*q & AMASK)); 635 | q += 2; 636 | } 637 | if(code(*q) == '\n') 638 | continue; 639 | printf("%s%c\n", *q&NEG?"!":"", code(*q)); 640 | } 641 | */ 642 | } 643 | 644 | /* debugging code 2; execute stub. 
645 | prints the compiled script (without arguments) 646 | then each input line with line numbers 647 | 648 | void 649 | execute(Text *script, Text *y) 650 | { 651 | if(recno == 1) 652 | printscript(script); 653 | printf("%d:%s",recno,y->s); 654 | } 655 | 656 | */ 657 | -------------------------------------------------------------------------------- /re1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "re.h" 6 | 7 | /* regular expression recognizer. parse() is coded in 8 | continuation-passing style. 9 | see re.h for other descriptive comments */ 10 | 11 | #ifdef DEBUG 12 | int edebug = 0; // OR of types to be traced in parsing 13 | #define debug(type, msg, s) \ 14 | if(edebug & (1< pos; // posns of certain subpatterns 61 | Array bestpos; // ditto for best match 62 | Array match;// subexrs in current match 63 | Array best; // ditto in best match yet 64 | Eenv(const regex_t *preg, int eflags, uchar *string, size_t len); 65 | int pushpos(Rex*, uchar*, int); 66 | void poppos() { npos--; } 67 | }; 68 | 69 | int Eenv::pushpos(Rex *rex, uchar *p, int b_e ) 70 | { 71 | if(pos.assure(npos+1)) // +1 is probably superstition 72 | return 1; 73 | pos[npos].serial = rex->serial; 74 | pos[npos].p = p; 75 | pos[npos].be = b_e; 76 | npos++; 77 | return 0; 78 | } 79 | 80 | #ifdef DEBUG 81 | static void printpos(Pos *pos, int n, Eenv *env) /* for debugging */ 82 | { 83 | int i; 84 | for(i=0; ip); 92 | if(pos[i].be == ENDP) printf(")"); 93 | printf(" "); 94 | } 95 | printf("\n"); 96 | fflush(stdout); 97 | } 98 | #else 99 | #define printpos(pos, n, env) 100 | #endif 101 | 102 | static regmatch_t NOMATCH = { -1, -1 }; 103 | 104 | inline 105 | Eenv::Eenv(const regex_t *preg, int eflags, uchar *string, size_t len) : 106 | preg(preg), p(string), last(string+len), 107 | flags(eflags&EFLAGS | preg->flags) 108 | { 109 | int n = preg->re_nsub; 110 | if(match.assure(n) || best.assure(n)) { 
111 | flags |= SPACE; 112 | return; 113 | } 114 | npos = nbestpos = 0; 115 | best[0].rm_so = 0; 116 | best[0].rm_eo = -1; 117 | } 118 | 119 | Seg Seg::copy() 120 | { 121 | Seg seg(new uchar[n+1], n); 122 | if(seg.p) { 123 | memmove(seg.p, p, (size_t)n); 124 | seg.p[n] = 0; 125 | } 126 | return seg; 127 | } 128 | 129 | void Set::insert(uchar c) 130 | { 131 | cl[c/CHAR_BIT] |= 1 << (c%CHAR_BIT); 132 | } 133 | void Set::neg() 134 | { 135 | int i; 136 | for(i=0; (unsigned)icl[i]; 144 | } 145 | void Set::clear() 146 | { 147 | memset(cl, 0, sizeof(cl)); 148 | } 149 | 150 | #ifdef DEBUG 151 | int mallocbytes; 152 | int mallocblocks = -1; // -1 accounts for Done::done 153 | int freeblocks; 154 | void *operator new(size_t size) { 155 | void *p = malloc(size); 156 | if(p) { 157 | mallocbytes += size; 158 | mallocblocks++; 159 | } 160 | return p; 161 | } 162 | void operator delete(void *p) { 163 | if(p) { 164 | freeblocks++; 165 | free(p); 166 | } 167 | } 168 | #endif 169 | 170 | Rex::~Rex() 171 | { 172 | if(type!=TEMP) 173 | delete next; 174 | } 175 | 176 | void Rex::dprint(const char *msg, const uchar *s) 177 | { 178 | printf("%s _", msg); 179 | print(); 180 | printf("_ _%s_\n", s); 181 | } 182 | void Rex::print() { } 183 | 184 | #ifdef DEBUG // debuggint output routines 185 | extern "C" void jprint(regex_t *preg) 186 | { 187 | preg->rex->print(); 188 | printf("\n"); 189 | } 190 | 191 | void flagprint(regex_t *re) 192 | { 193 | int flags = re->flags; 194 | if(flags®_EXTENDED) printf("EXTENDED:"); 195 | if(flags®_AUGMENTED) printf("AUGMENTED:"); 196 | if(flags®_ICASE) printf("ICASE:"); 197 | if(flags®_NOSUB) printf("NOSUB:"); 198 | if(flags®_NEWLINE) printf("NEWLINE:"); 199 | if(flags®_NOTBOL) printf("NOTBOL:"); 200 | if(flags®_NOTEOL) printf("NOTEOL:"); 201 | if(flags®_NULL) printf("NULL:"); 202 | if(flags®_ANCH) printf("ANCH:"); 203 | if(flags®_LITERAL) printf("LITERAL:"); 204 | if(flags&HARD) printf("HARD:"); 205 | if(flags&ONCE) printf("ONCE:"); 206 | } 207 | 208 | void 
Dup::print() 209 | { 210 | if(lo == 1 && hi == 1) 211 | ; 212 | else if(lo == 0 && hi == RE_DUP_INF) 213 | printf("*"); 214 | else if(hi == lo) 215 | printf("\\{%d\\}", lo); 216 | else if(hi == RE_DUP_INF) 217 | printf("\\{%d,\\}", lo); 218 | else 219 | printf("\\{%d,%d\\}", lo, hi); 220 | if(next) 221 | next->print(); 222 | } 223 | void Ok::print() { 224 | if(next) 225 | next->print(); 226 | } 227 | void Anchor::print() { 228 | printf("^"); 229 | if(next) 230 | next->print(); 231 | } 232 | void End::print() { 233 | printf("$"); 234 | if(next) 235 | next->print(); 236 | } 237 | void Dot::print() 238 | { 239 | printf("."); 240 | this->Dup::print(); 241 | } 242 | void Onechar::print() 243 | { 244 | printf("%c", c); 245 | this->Dup::print(); 246 | } 247 | void Class::print() 248 | { 249 | int i; 250 | printf("["); 251 | for(i=0; i<128; i++) 252 | if(in(i)) 253 | printf(isprint(i)?"%c":"\\x%.2x", i); 254 | printf("]"); 255 | this->Dup::print(); 256 | } 257 | void String::print() 258 | { 259 | printf("%.*s", seg.n, seg.p); 260 | if(next) 261 | next->print(); 262 | } 263 | void Trie::print() 264 | { 265 | int i; 266 | Array s; 267 | int count = 0; 268 | for(i=0; i &s) 276 | { 277 | for(;;) { 278 | s.assure(n); 279 | s[n] = node->c; 280 | if(node->son) { 281 | print(node->son, n+1, s); 282 | if(node->end) 283 | printf("|"); 284 | } 285 | if(node->end) 286 | printf("%.*s", n+1, &s[0]); 287 | node = node->sib; 288 | if(node == 0) 289 | return; 290 | printf("|"); 291 | } 292 | } 293 | void Back::print() 294 | { 295 | printf("\\%d", n); 296 | if(next) 297 | next->print(); 298 | } 299 | void Subexp::print() 300 | { 301 | printf("\$"); 302 | rex->print(); 303 | printf("\$"); 304 | if(next) 305 | next->print(); 306 | } 307 | void Alt::print() 308 | { 309 | left->print(); 310 | printf("|"); 311 | right->print(); 312 | } 313 | void Conj::print() 314 | { 315 | left->print(); 316 | printf("&"); 317 | right->print(); 318 | } 319 | void Rep::print() 320 | { 321 | rex->print(); 322 | 
this->Dup::print(); 323 | } 324 | void Neg::print() 325 | { 326 | rex->print(); 327 | printf("!"); 328 | if(next) 329 | next->print(); 330 | } 331 | #endif 332 | 333 | int Rex::parse(uchar*, Rex*, Eenv*) // pure virtual, avoid libC++ 334 | { 335 | abort(); // "can't happen" 336 | return 0; 337 | } 338 | 339 | inline int Rex::follow(uchar *s, Rex *cont, Eenv *env) 340 | { 341 | return next? next->parse(s, cont, env): 342 | cont->parse(s, 0, env); 343 | } 344 | 345 | int Ok::parse(uchar *s, Rex *cont, Eenv *env) 346 | { 347 | debug(OK, "Ok", s); 348 | return follow(s, cont, env); 349 | } 350 | 351 | int Anchor::parse(uchar *s, Rex *cont, Eenv *env) 352 | { 353 | debug(ANCHOR, "Anchor", s); 354 | if((env->flags&REG_NEWLINE) && 355 | s>env->p && s[-1]=='\n' || 356 | !(env->flags&REG_NOTBOL) && s==env->p) 357 | return follow(s, cont, env); 358 | return NONE; 359 | } 360 | 361 | int End::parse(uchar *s, Rex *cont, Eenv *env) 362 | { 363 | debug(END, "End", s); 364 | if((*s==0 && !(env->flags&REG_NOTEOL)) || 365 | (env->flags&REG_NEWLINE) && *s=='\n') 366 | return follow(s, cont, env); 367 | return NONE; 368 | } 369 | 370 | int Dot::parse(uchar *s, Rex *cont, Eenv *env) 371 | { 372 | debug(DOT, "Dot", s); 373 | int n = hi; 374 | if(n > env->last-s) 375 | n = env->last-s; 376 | if(env->flags&REG_NEWLINE) { 377 | for(int i=0 ; i=lo; s--) 383 | switch(follow(s, cont, env)) { 384 | case BEST: 385 | return BEST; 386 | case BAD: 387 | return BAD; 388 | case GOOD: 389 | result = GOOD; 390 | } 391 | return result; 392 | } 393 | 394 | int Onechar::parse(uchar *s, Rex *cont, Eenv *env) 395 | { 396 | debug(ONECHAR, "Onechar", s); 397 | int n = hi; 398 | uchar *map = env->preg->map; 399 | if(n > env->last-s) 400 | n = env->last-s; 401 | int i = 0; 402 | for( ; i=lo; s--) 407 | switch(follow(s, cont, env)) { 408 | case BEST: 409 | return BEST; 410 | case BAD: 411 | return BAD; 412 | case GOOD: 413 | result = GOOD; 414 | } 415 | return result; 416 | } 417 | 418 | int Class::parse(uchar *s, Rex *cont,
Eenv *env) 419 | { 420 | debug(CLASS, "Class", s); 421 | int n = hi; 422 | if(n > env->last-s) 423 | n = env->last-s; 424 | for(int i=0; i=lo; s--) 429 | switch(follow(s, cont, env)) { 430 | case BEST: 431 | return BEST; 432 | case BAD: 433 | return BAD; 434 | case GOOD: 435 | result = GOOD; 436 | } 437 | return result; 438 | } 439 | void Class::orset(Set *y) 440 | { 441 | cl.orset(y); 442 | } 443 | void Class::neg(int cflags) 444 | { 445 | cl.neg(); 446 | if(cflags&REG_NEWLINE) 447 | cl.cl['\n'/CHAR_BIT] &= ~(1 << ('\n'%CHAR_BIT)); 448 | } 449 | void Class::icase(uchar *map) 450 | { 451 | if(map['A'] != map['a']) 452 | return; 453 | for(int i=0; i<256; i++) 454 | if(cl.in(i)) { 455 | cl.insert(toupper(i)); 456 | cl.insert(tolower(i)); 457 | } 458 | } 459 | 460 | String::String(Seg s, uchar *map) : Rex(STRING), seg(s) 461 | { 462 | uchar *p; 463 | if(map) 464 | for(p=seg.p; *p; p++) 465 | *p = map[*p]; 466 | } 467 | int String::parse(uchar *s, Rex *cont, Eenv *env) 468 | { 469 | debug(STRING, "String", s); 470 | if(s+seg.n > env->last) 471 | return NONE; 472 | uchar *map = env->preg->map; 473 | uchar *p = seg.p; 474 | while(*p) 475 | if(map[*s++] != *p++) 476 | return NONE; 477 | return follow(s, cont, env); 478 | } 479 | 480 | /* Knuth-Morris-Pratt, adapted from Cormen-Leiserson-Rivest */ 481 | Kmp::Kmp(Seg seg, int *flags) : String(seg) 482 | { 483 | type = KMP; 484 | if(fail.assure(seg.n)) { 485 | *flags |= SPACE; 486 | return; 487 | } 488 | int q, k ; 489 | fail[0] = k = -1; 490 | for(q=1; q=0 && seg.p[k+1] != seg.p[q]) 492 | k = fail[k]; 493 | if(seg.p[k+1] == seg.p[q]) 494 | k++; 495 | fail[q] = k; 496 | } 497 | } 498 | int Kmp::parse(uchar *s, Rex* cont, Eenv *env) 499 | { 500 | debug(KMP, "Kmp", s); 501 | uchar *map = env->preg->map; 502 | uchar *t = s; 503 | uchar *last = env->last; 504 | while(t+seg.n <= last) { 505 | int k = -1; 506 | for( ; t=0 && seg.p[k+1] != map[*t]) 508 | k = fail[k]; 509 | if(seg.p[k+1] == map[*t]) 510 | k++; 511 | if(k+1 == seg.n) {
512 | env->best[0].rm_so = ++t - s - seg.n; 513 | switch(follow(t, cont, env)) { 514 | case GOOD: 515 | case BEST: 516 | return BEST; 517 | case BAD: 518 | return BAD; 519 | } 520 | t -= seg.n - 1; 521 | break; 522 | } 523 | } 524 | } 525 | return NONE; 526 | } 527 | 528 | 529 | int Trie::parse(uchar *s, Rex *contin, Eenv *env) 530 | { 531 | Tnode *node = root[env->preg->map[*s]&MASK]; 532 | if(node==0 || s+min>env->last) 533 | return NONE; 534 | return parse(node, s, contin, env); 535 | } 536 | int Trie::parse(Tnode *node, uchar *s, Rex* contin, Eenv *env) 537 | { 538 | debug(TRIE, "Trie", s); 539 | uchar *map = env->preg->map; 540 | for(;;) { 541 | if(s >= env->last) 542 | return NONE; 543 | while(node->c != map[*s]) { 544 | node = node->sib; 545 | if(node == 0) 546 | return NONE; 547 | } 548 | if(node->end) 549 | break; 550 | node = node->son; 551 | s++; 552 | } 553 | int longresult = NONE; 554 | if(node->son) 555 | longresult = parse(node->son, s+1, contin, env); 556 | if(longresult==BEST || longresult==BAD) 557 | return longresult; 558 | int shortresult = follow(s+1, contin, env); 559 | return shortresult==NONE? 
longresult: shortresult; 560 | } 561 | /* returns 1 if out of space 562 | string s must be nonempty */ 563 | int Trie::insert(uchar *s) 564 | { 565 | int len; 566 | Tnode *node = root[*s&MASK]; 567 | if(node == 0) 568 | node = root[*s&MASK] = new Tnode(*s); 569 | for(len=1; ; ) { 570 | if(node == 0) 571 | return 1; 572 | if(node->c == *s) { 573 | if(s[1] == 0) 574 | break; 575 | if(node->son == 0) 576 | node->son = new Tnode(s[1]); 577 | node = node->son; 578 | len++; 579 | s++; 580 | } else { 581 | if(node->sib == 0) 582 | node->sib = new Tnode(*s); 583 | node = node->sib; 584 | } 585 | } 586 | if(len < min) 587 | min = len; 588 | else if(len > max) 589 | max = len; 590 | node->end = 1; 591 | return 0; 592 | } 593 | 594 | int Back::parse(uchar *s, Rex *cont, Eenv *env) 595 | { 596 | regmatch_t &m = env->match[n]; 597 | debug(BACK, "Back", s); 598 | if(m.rm_so < 0) 599 | return NONE; 600 | uchar *p = env->p + m.rm_so; 601 | long n = m.rm_eo - m.rm_so; 602 | if(s+n > env->last) 603 | return NONE; 604 | uchar *map = env->preg->map; 605 | while(--n >= 0) 606 | if(map[*s++] != map[*p++]) 607 | return NONE; 608 | return follow(s, cont, env); 609 | } 610 | 611 | struct Subexp1 : Rex { 612 | Rex *cont; 613 | Subexp *ref; 614 | Subexp1(Subexp *ref, Rex *cont) : ref(ref), 615 | cont(cont) { next = ref->next; } 616 | int parse(uchar*, Rex*, Eenv*); 617 | }; 618 | int Subexp::parse(uchar *s, Rex *cont, Eenv *env) 619 | { 620 | debug(SUBEXP, "Subexp", s); 621 | int result; 622 | regoff_t &so = env->match[n].rm_so; 623 | Subexp1 subexp1(this, cont); 624 | so = s - env->p; 625 | if(env->pushpos(this, s, BEGS)) 626 | return BAD; 627 | result = rex->parse(s, &subexp1, env); 628 | env->poppos(); 629 | so = -1; 630 | return result; 631 | } 632 | int Subexp1::parse(uchar *s, Rex*, Eenv *env) 633 | { 634 | debug(SUBEXP, "Subexp1", s); 635 | int result; 636 | regoff_t &eo = env->match[ref->n].rm_eo; 637 | eo = s - env->p; 638 | if(env->pushpos(ref, s, ENDP)) 639 | return BAD; 640 | 
result = follow(s, cont, env); 641 | env->poppos(); 642 | eo = -1; 643 | return result; 644 | } 645 | 646 | /* save and restore match records around alternate attempts, 647 | so that fossils will not be left in the match array. 648 | (These are the only entries in the match array that 649 | are not otherwise guaranteed to have current data 650 | in them when they get used) If there's too much 651 | to save, dynamically allocate space, 652 | The recognizer will slow to a crawl, 653 | allocating memory on every repetition 654 | but it will only happen if 20 parentheses 655 | occur under one * or in one alternation. 656 | */ 657 | struct Save { 658 | int n1, n2; 659 | Array area; 660 | Save(int n1, int n2, Eenv *env); 661 | void restore(Eenv *env); 662 | }; 663 | Save::Save(int n1, int nn2, Eenv *env) : n1(n1), n2(nn2) 664 | { 665 | regmatch_t *match = &env->match[0]; 666 | if(n1 != 0) { 667 | int i = n2 - n1; 668 | if(area.assure(i)) { 669 | env->flags |= SPACE; 670 | n2 = n1; 671 | return; 672 | } 673 | regmatch_t *a = &area[0]; 674 | match += n1; 675 | do { 676 | *a++ = *match; 677 | *match++ = NOMATCH; 678 | } while(--i >= 0); 679 | } 680 | } 681 | 682 | void Save::restore(Eenv *env) 683 | { 684 | regmatch_t *match = &env->match[0]; 685 | if(n1 != 0) { 686 | int i = n2 - n1; 687 | match += n1; 688 | regmatch_t *a = &area[0]; 689 | do { 690 | *match++ = *a++; 691 | } while(--i >= 0); 692 | } 693 | } 694 | 695 | /* Alt1 is a catcher, solely to get control at the end of an 696 | alternative to keep records for comparing matches. 
697 | */ 698 | 699 | struct Alt1 : Rex { 700 | Rex *cont; 701 | Alt1(Rex *cont, int ser) : cont(cont) { serial = ser; } 702 | int parse(uchar*, Rex*, Eenv*); 703 | }; 704 | int Alt::parse(uchar *s, Rex *cont, Eenv *env) 705 | { 706 | debug(ALT, "Altl", s); 707 | Save save(n1, n2, env); 708 | if(env->flags&SPACE) 709 | return BAD; 710 | if(env->pushpos(this, s, BEGA)) 711 | return BAD; 712 | Alt1 alt1(cont, serial); 713 | int result = left->parse(s, &alt1, env); 714 | if(result!=BEST && result!=BAD) { 715 | debug(ALT, "Altr", s); 716 | save.restore(env); 717 | env->pos[env->npos-1].serial = rserial; 718 | alt1.serial = rserial; 719 | int rightresult = right->parse(s, &alt1, env); 720 | if(rightresult != NONE) 721 | result = rightresult; 722 | } 723 | env->poppos(); 724 | save.restore(env); 725 | return result; 726 | } 727 | int Alt1::parse(uchar *s, Rex*, Eenv *env) 728 | { 729 | if(env->pushpos(this, s, ENDP)) 730 | return BAD; 731 | int result = follow(s, cont, env); 732 | env->poppos(); 733 | return result; 734 | } 735 | 736 | struct Conj2: Rex { // right catcher 737 | uchar *last; // end of left match 738 | Rex *cont; // ambient continuation 739 | int parse(uchar*, Rex*, Eenv*); 740 | Conj2(Rex *cont, Rex *nex) : cont(cont) { next = nex; } 741 | }; 742 | struct Conj1 : Rex { // left catcher 743 | uchar *p; // beginning of left match 744 | Rex *right; // right pattern 745 | Conj2 *conj2p; // right catcher 746 | Conj1(uchar *p, Rex *right, Conj2 *conj2p) : 747 | p(p), right(right), conj2p(conj2p) { } 748 | int parse(uchar*, Rex*, Eenv*); 749 | }; 750 | int Conj::parse(uchar *s, Rex *cont, Eenv *env) 751 | { 752 | debug(CONJ, "Conjl", s); 753 | Conj2 conj2(cont, next); 754 | Conj1 conj1(s, right, &conj2); 755 | return left->parse(s, &conj1, env); 756 | } 757 | int Conj1::parse(uchar *s, Rex*, Eenv *env) 758 | { 759 | debug(CONJ, "Conjr", p); 760 | conj2p->last = s; 761 | return right->parse(p, conj2p, env); 762 | } 763 | int Conj2::parse(uchar *s, Rex*, Eenv *env) 
764 | { 765 | if(s != last) 766 | return NONE; 767 | return follow(s, cont, env); 768 | } 769 | 770 | /* Rep1 nodes are catchers. One is created on the stack for 771 | each iteration of a complex repetition. 772 | */ 773 | 774 | struct Rep1 : Rex { 775 | struct Rep *ref; // where the original node is 776 | uchar *p1; // where this iteration began 777 | int n; // iteration count 778 | Rex *cont; 779 | Rep1(Rep *ref, uchar *p1, int n, Rex *cont) 780 | : ref(ref), p1(p1), n(n), cont(cont) { 781 | next = ref->next; serial=ref->serial; } 782 | int parse(uchar *, Rex*, Eenv*); 783 | }; 784 | int Rep::dorep(int n, uchar *s, Rex *cont, Eenv *env) 785 | { 786 | int result = NONE; 787 | if(hi > n) { 788 | Rep1 rep1(this, s, n+1, cont); 789 | Save save(n1, n2, env); 790 | if(env->flags&SPACE) 791 | return BAD; 792 | if(env->pushpos(this, s, BEGI)) 793 | return BAD; 794 | result = rex->parse(s, &rep1, env); 795 | env->poppos(); 796 | save.restore(env); 797 | } 798 | if(result==BEST || result==BAD || lo>n) 799 | return result; 800 | if(env->pushpos(this, s, ENDP)) // end BEGR 801 | return BAD; 802 | int res1 = follow(s, cont, env); 803 | env->poppos(); 804 | return res1==NONE? 
result: res1; 805 | } 806 | int Rep1::parse(uchar *s, Rex*, Eenv *env) 807 | { 808 | int result; 809 | debug(REP, "Rep1", s); 810 | if(env->pushpos(this, s, ENDP)) // end BEGI 811 | return BAD; 812 | if(s==p1 && n>ref->lo) // optional empty iteration 813 | if(env->flags&REG_EXTENDED || (env->flags&HARD) == 0) 814 | result = NONE; // unwanted 815 | else if(env->pushpos(this, s, ENDP)) // end BEGR 816 | return BAD; 817 | else { 818 | result = follow(s, cont, env); 819 | env->poppos(); 820 | } 821 | else 822 | result = ref->dorep(n, s, cont, env); 823 | env->poppos(); 824 | return result; 825 | } 826 | int Rep::parse(uchar *s, Rex *cont, Eenv *env) 827 | { 828 | debug(REP, "Rep", s); 829 | if(env->pushpos(this, s, BEGR)) 830 | return BAD; 831 | int result = dorep(0, s, cont, env); 832 | env->poppos(); 833 | return result; 834 | } 835 | 836 | /* Neg1 catcher determines what string lengths can be matched, 837 | then Neg investigates continuations of other lengths. 838 | this is inefficient. for EASY expressions, we can do better: 839 | since matches to rex will be enumerated in decreasing order, 840 | we can investigate continuations whenever a length is 841 | skipped.
*/ 842 | 843 | struct Neg1 : Rex { 844 | uchar *p; // start of negated match 845 | Array index; // bit array of string sizes seen 846 | Neg1(uchar *p, int n); 847 | int parse(uchar *s, Rex*, Eenv*); 848 | void bitset(int n) { 849 | index[n/CHAR_BIT] |= 1<<(n%CHAR_BIT); } 850 | int bittest(int n) { 851 | return index[n/CHAR_BIT] & (1<<(n%CHAR_BIT)); } 852 | }; 853 | int Neg::parse(uchar *s, Rex *cont, Eenv *env) 854 | { 855 | debug(NEG, "Neg", s); 856 | int n = env->last - s; 857 | Neg1 neg1(s, n); 858 | if(rex->parse(s, &neg1, env) == BAD) 859 | return BAD; 860 | int result = NONE; 861 | for( ; n>=0; n--) { 862 | if(neg1.bittest(n)) 863 | continue; 864 | int res1 = follow(s+n, cont, env); 865 | if(res1==BAD || res1==BEST) 866 | return res1; 867 | if(res1 == GOOD) 868 | result = GOOD; 869 | } 870 | return result; 871 | } 872 | Neg1::Neg1(uchar *p, int n) : p(p) 873 | { 874 | n = (n+CHAR_BIT-1)/CHAR_BIT; 875 | index.assure(n); 876 | memset(index.p,0,n+1); 877 | } 878 | int Neg1::parse(uchar *s, Rex*, Eenv*) 879 | { 880 | debug(NEG, "Neg1", s); 881 | bitset(s-p); 882 | return NONE; 883 | } 884 | 885 | 886 | static Pos *rpos(Pos *a) /* find matching right pos record */ 887 | { 888 | int serial = a->serial; 889 | int inner; 890 | for(inner=0;;) { 891 | if((++a)->serial != serial) 892 | continue; 893 | if(a->be != ENDP) 894 | inner++; 895 | else if(inner-- <= 0) 896 | return a; 897 | } 898 | } 899 | 900 | /* two matches are known to have the same length 901 | os is start of old pos array, ns is start of new, 902 | oend and nend are end+1 pointers to ends of arrays. 903 | oe and ne are ends (not end+1) of subarrays. 
904 | returns 1 if new is better, -1 if old, else 0 */ 905 | 906 | static int better(Pos *os, Pos *ns, Pos *oend, Pos *nend) 907 | { 908 | Pos *oe, *ne; 909 | int k; 910 | for( ; osserial > os->serial) 912 | return -1; 913 | if(os->serial > ns->serial) 914 | abort(); // folk theorem bites the dust 915 | if(os->p > ns->p) 916 | return -1; 917 | if(ns->p > os->p) 918 | return 1; // believed impossible 919 | oe = rpos(os); 920 | ne = rpos(ns); 921 | if(ne->p > oe->p) 922 | return 1; 923 | if(oe->p > ne->p) 924 | return -1; 925 | k = better(os+1, ns+1, oe, ne); 926 | if(k) 927 | return k; 928 | } 929 | if(ns < nend) 930 | abort(); // another one bites the dust 931 | return os < oend; // true => inessential null 932 | } 933 | 934 | int Done::parse(uchar *s, Rex*, Eenv *env) 935 | { 936 | if(edebug & (1<pos[0], env->npos, env); 939 | } 940 | if(env->flags&REG_ANCH && s!=env->last) 941 | return NONE; 942 | if(env->flags & REG_NOSUB) 943 | return BEST; 944 | int n = s - env->p; 945 | int nsub = env->preg->re_nsub; 946 | if((env->flags&HARD) == 0) { 947 | env->best[0].rm_eo = n; 948 | memmove(&env->best[1], &env->match[1], 949 | nsub*sizeof(regmatch_t)); 950 | return BEST; 951 | } 952 | if(env->best[0].rm_eo >= 0) { /* only happens on HARD */ 953 | long d = env->best[0].rm_eo; 954 | if(n < d) 955 | return GOOD; 956 | if(n == d) { 957 | if(edebug & (1<bestpos[0], 959 | env->nbestpos, env); 960 | d = better(&env->bestpos[0], 961 | &env->pos[0], 962 | &env->bestpos[env->nbestpos], 963 | &env->pos[env->npos]); 964 | if(d <= 0) 965 | return GOOD; 966 | } 967 | } 968 | env->best[0].rm_eo = n; 969 | memmove(&env->best[1], &env->match[1], 970 | nsub*sizeof(regmatch_t)); 971 | n = env->npos; 972 | if(env->bestpos.assure(n)) { 973 | env->flags |= SPACE; 974 | return BAD; 975 | } 976 | env->nbestpos = n; 977 | memmove(&env->bestpos[0], &env->pos[0], n*sizeof(Pos)); 978 | return GOOD; 979 | } 980 | 981 | /* regnexec is a side door for use when string length is known.
982 | returning REG_BADPAT or REG_ESPACE is not explicitly 983 | countenanced by the standard. */ 984 | 985 | int regnexec(const regex_t *preg, const char *string, size_t len, 986 | size_t nmatch, regmatch_t *match, int eflags) 987 | { 988 | int i; 989 | if(preg->rex == 0) // not required, but kind 990 | return REG_BADPAT; 991 | Eenv env(preg, eflags, (uchar*)string, len); 992 | if(env.flags&SPACE) 993 | return REG_ESPACE; 994 | if(env.flags&REG_NOSUB) 995 | nmatch = 0; 996 | for(i=0; (unsigned)ire_nsub; i++) 997 | env.match[i] = NOMATCH; 998 | 999 | while(preg->rex->parse((uchar*)string,Done::done,&env) == NONE) { 1000 | if(env.flags & ONCE) 1001 | return REG_NOMATCH; 1002 | if((uchar*)++string > env.last) 1003 | return REG_NOMATCH; 1004 | env.best[0].rm_so++; 1005 | } 1006 | if(env.flags & SPACE) 1007 | return REG_ESPACE; 1008 | 1009 | for(i=0; (unsigned)ire_nsub) 1011 | match[i] = env.best[i]; 1012 | else 1013 | match[i] = NOMATCH; 1014 | return 0; 1015 | } 1016 | 1017 | int regexec(const regex_t *preg, const char *string, size_t nmatch, 1018 | regmatch_t *match, int eflags) 1019 | { 1020 | const char *s = string; 1021 | while(*s) 1022 | s++; 1023 | return regnexec(preg, string, s-string, nmatch, match, eflags); 1024 | } 1025 | -------------------------------------------------------------------------------- /re2.cpp: -------------------------------------------------------------------------------- 1 | // regular expression compiler 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "re.h" 9 | 10 | #ifdef DEBUG 11 | int cdebug = 0; 12 | #else 13 | #define cdebug 0 14 | #endif 15 | 16 | Rex* Done::done; 17 | 18 | /* Lexical analysis */ 19 | 20 | enum Token { 21 | T_META = UCHAR_MAX+1, // must be first 22 | T_CFLX, 23 | T_DOT, 24 | T_END, 25 | T_BAD, 26 | T_DOLL, 27 | T_STAR, 28 | T_PLUS, 29 | T_QUES, 30 | T_OPEN, 31 | T_CLOSE, 32 | T_LEFT, 33 | T_RIGHT, 34 | T_BRA, 35 | T_BAR, 36 | T_AND, 37 | T_BANG, 38 | T_BACK = 512, 39 | T_NEXT =
T_BACK+BACK_REF_MAX+1 // dummy spaceholder 40 | }; 41 | 42 | /* table of special characters. the "funny" things get special 43 | treatment at ends of BRE. escindex[c], if nonzero, tells 44 | where to find key c in the escape table. to make this 45 | work, row zero of the escape table can't be used */ 46 | 47 | enum { BRE, ERE, ARE }; 48 | 49 | 50 | static uchar escindex[UCHAR_MAX+1]; 51 | 52 | static struct { 53 | uchar key; 54 | struct { short unesc, esc; } val[3]; 55 | } escape[] = { 56 | // key BRE ERE ARE 57 | { }, 58 | { '\\', '\\', '\\', '\\', '\\', '\\', '\\' }, 59 | { '^', '^', '^', T_CFLX, '^', T_CFLX, '^' }, // funny 60 | { '.', T_DOT, '.', T_DOT, '.', T_DOT, '.' }, 61 | { '$', '$', '$', T_DOLL, '$', T_DOLL, '$' }, // funny 62 | { '*', T_STAR, '*', T_STAR, '*', T_STAR, '*' }, 63 | { '[', T_BRA, '[', T_BRA, '[', T_BRA, '[' }, 64 | { '|', '|', T_BAD, T_BAR, '|', T_BAR, '|' }, 65 | { '+', '+', T_BAD, T_PLUS, '+', T_PLUS, '+' }, 66 | { '?', '?', T_BAD, T_QUES, '?', T_QUES, '?' }, 67 | { '(', '(', T_OPEN, T_OPEN, '(', T_OPEN, '(' }, 68 | { ')', ')', T_CLOSE,T_CLOSE,')', T_CLOSE,')' }, 69 | { '{', '{', T_LEFT, T_LEFT, '{', T_LEFT, '{' }, 70 | { '&', '&', T_BAD, '&', T_BAD, T_AND, '&' }, 71 | { '!', '!', T_BAD, '!', T_BAD, T_BANG, '!' }, 72 | { '}', '}', T_RIGHT, '}', T_BAD, '}', T_BAD }, 73 | { '1', '1', T_BACK+1, '1', T_BAD, '1', T_BAD }, 74 | { '2', '2', T_BACK+2, '2', T_BAD, '2', T_BAD }, 75 | { '3', '3', T_BACK+3, '3', T_BAD, '3', T_BAD }, 76 | { '4', '4', T_BACK+4, '4', T_BAD, '4', T_BAD }, 77 | { '5', '5', T_BACK+5, '5', T_BAD, '5', T_BAD }, 78 | { '6', '6', T_BACK+6, '6', T_BAD, '6', T_BAD }, 79 | { '7', '7', T_BACK+7, '7', T_BAD, '7', T_BAD }, 80 | { '8', '8', T_BACK+8, '8', T_BAD, '8', T_BAD }, 81 | { '9', '9', T_BACK+9, '9', T_BAD, '9', T_BAD } 82 | }; 83 | 84 | /* character maps for REG_ICASE. 
conceptually they are statics 85 | in regex_t, but initializing them would entail dependence 86 | on C++ runtime system, which we don't want */ 87 | 88 | static uchar ident[UCHAR_MAX+1]; 89 | static uchar fold[UCHAR_MAX+1]; 90 | 91 | static void 92 | init() 93 | { 94 | int i; 95 | for(i=0; (unsigned)iERE 130 | flags&REG_EXTENDED? ERE: 131 | BRE; 132 | posixkludge = retype == BRE; 133 | expr = cursor; 134 | memset(paren, 0, sizeof(paren)); 135 | } 136 | 137 | #ifdef DEBUG 138 | static void 139 | printnew(Rex *rex) 140 | { 141 | int t; 142 | printf("new %s\n", 143 | rex==0? "ESPACE": 144 | (t=rex->type)==OK? "OK": 145 | t==ANCHOR? "ANCHOR": 146 | t==END? "END": 147 | t==DOT? "DOT": 148 | t==ONECHAR? "ONECHAR": 149 | t==STRING? "STRING": 150 | t==KMP? "KMP": 151 | t==KR? "KR": 152 | t==TRIE? "TRIE": 153 | t==CLASS? "CLASS": 154 | t==BACK? "BACK": 155 | t==SUBEXP? "SUBEXP": 156 | t==ALT? "ALT": 157 | t==REP? "REP": 158 | t==TEMP? "TEMP": 159 | "HUH"); 160 | } 161 | #else 162 | #define printnew(rex) 163 | #endif 164 | 165 | static Rex *ERROR = 0; 166 | 167 | Rex *NEWinit(Rex *rex, Cenv *env) 168 | { 169 | if(cdebug) 170 | printnew(rex); 171 | if(env->flags&SPACE) { 172 | delete rex; 173 | return ERROR; 174 | } else 175 | return rex; 176 | } 177 | #define NEW(x) NEWinit(new x, env) 178 | 179 | /* determine whether greedy matching will work, i.e. produce 180 | the best match first. such expressions are "easy", and 181 | need no backtracking once a complete match is found. 182 | if an expression has backreferences or alts it's hard 183 | else if it has only one closure it's easy 184 | else if all closures are simple (i.e. one-character) it's easy 185 | else it's hard.
186 | */ 187 | 188 | /* struct in which statistics about an re are gathered */ 189 | 190 | struct Stat { 191 | int n; // min length 192 | uchar s; // number of simple closures 193 | uchar c; // number of closures 194 | uchar b; // number of backrefs 195 | uchar t; // number of tries 196 | int a; // number of alternations 197 | uchar p; // number of parens (subexpressions) 198 | uchar o; // nonzero on overflow of some field 199 | Stat() { memset(this, 0, sizeof(Stat)); } 200 | }; 201 | 202 | static Stat addStat(Stat &st1, Stat &st2) 203 | { 204 | Stat st = st1; 205 | st.n += st2.n; 206 | st.s += st2.s; 207 | st.c += st2.c; 208 | st.b += st2.b; 209 | st.a += st2.a; 210 | st.p += st2.p; 211 | st.t += st2.t; 212 | if(st.nstat(env); 222 | return addStat(st1, st2); 223 | } 224 | Stat Rex::stat(Cenv *env) 225 | { 226 | static Stat nullStat; 227 | return addStat(nullStat, next, env); 228 | } 229 | Stat Dup::stat(Cenv *env) 230 | { 231 | Stat st1; 232 | st1.n = lo; 233 | st1.s = st1.c = hi != lo; 234 | return addStat(st1, next, env); 235 | } 236 | Stat Back::stat(Cenv *env) 237 | { 238 | static Stat backStat; 239 | backStat.b = 1; 240 | return addStat(backStat, next, env); 241 | } 242 | Stat Subexp::stat(Cenv *env) 243 | { 244 | Stat st = rex->stat(env); 245 | if(env->backref & 1<stat(env); 254 | Stat st2 = right->stat(env); 255 | Stat st = addStat(st1, st2); 256 | st.n = st1.n<=st2.n? 
st1.n: st2.n; 257 | if(++st.a <= 0) 258 | st.o |= 1; 259 | return addStat(st, next, env); 260 | } 261 | Stat Conj::stat(Cenv *env) 262 | { 263 | Stat st1 = left->stat(env); 264 | Stat st = addStat(st1, right, env); 265 | return addStat(st, next, env); 266 | } 267 | Stat Rep::stat(Cenv *env) 268 | { 269 | Stat st = rex->stat(env); 270 | if(st.n == 1 && st.c+st.b == 0) 271 | st.s++; 272 | st.c++; 273 | st.n *= lo; 274 | if(st.n<0 || st.c<=0) 275 | st.o |= 1; 276 | return addStat(st, next, env); 277 | } 278 | Stat Neg::stat(Cenv *env) 279 | { 280 | Stat st = rex->stat(env); 281 | return addStat(st, next, env); 282 | } 283 | Stat String::stat(Cenv *env) 284 | { 285 | Stat st; 286 | st.n = seg.n; 287 | return addStat(st, next, env); 288 | } 289 | Stat Trie::stat(Cenv *env) 290 | { 291 | Stat st; 292 | st.n = min; 293 | if(min == max) 294 | return st; 295 | if(++st.t <= 0) 296 | st.o |= 1; 297 | return st; 298 | } 299 | 300 | int hard(Stat *stat) 301 | { 302 | if(stat->a | stat->b) 303 | return HARD; 304 | else if(stat->t <=1 && stat->c == 0) 305 | return EASY; 306 | else if(stat->t) 307 | return HARD; 308 | else if(stat->c<=1 || stat->s==stat->c) 309 | return EASY; 310 | else 311 | return HARD; 312 | } 313 | 314 | /* Assign subpattern numbers by a preorder tree walk. */ 315 | 316 | int Rex::serialize(int n) 317 | { 318 | serial = n++; // hygiene; won't be used 319 | return next? next->serialize(n): n; 320 | } 321 | 322 | int Subexp::serialize(int n) 323 | { 324 | serial = n++; 325 | n = rex->serialize(n); 326 | return next? next->serialize(n): n; 327 | } 328 | 329 | int Alt::serialize(int n) 330 | { 331 | serial = n++; 332 | n = left->serialize(n); 333 | rserial = n++; 334 | n = right->serialize(n); 335 | return next? next->serialize(n): n; 336 | } 337 | 338 | int Conj::serialize(int n) 339 | { 340 | serial = n++; 341 | n = left->serialize(n); 342 | n = right->serialize(n); 343 | return next? 
next->serialize(n): n; 344 | } 345 | 346 | int Rep::serialize(int n) 347 | { 348 | serial = n++; 349 | n = rex->serialize(n); 350 | return next? next->serialize(n): n; 351 | } 352 | 353 | int Neg::serialize(int n) 354 | { 355 | serial = n++; 356 | n = rex->serialize(n); 357 | return next? next->serialize(n): n; 358 | } 359 | /* extended and escaped are both 0-1 arguments */ 360 | 361 | 362 | /* this stuff gets around posix failure to define isblank, 363 | and the fact that ctype functions are macros */ 364 | 365 | static int Isalnum(int c) { return isalnum(c); } 366 | static int Isalpha(int c) { return isalpha(c); } 367 | static int Isblank(int c) { return c==' ' || c=='\t'; } 368 | static int Iscntrl(int c) { return iscntrl(c); } 369 | static int Isdigit(int c) { return isdigit(c); } 370 | static int Isgraph(int c) { return isgraph(c); } 371 | static int Islower(int c) { return islower(c); } 372 | static int Isprint(int c) { return isprint(c); } 373 | static int Ispunct(int c) { return ispunct(c); } 374 | static int Isspace(int c) { return isspace(c); } 375 | static int Isupper(int c) { return isupper(c); } 376 | static int Isxdigit(int c){ return isxdigit(c);} 377 | 378 | static struct { 379 | const char *name; 380 | int(*ctype)(int); 381 | } ctype[] = { 382 | { "alnum", Isalnum }, 383 | { "alpha", Isalpha }, 384 | { "blank", Isblank }, 385 | { "cntrl", Iscntrl }, 386 | { "digit", Isdigit }, 387 | { "graph", Isgraph }, 388 | { "lower", Islower }, 389 | { "print", Isprint }, 390 | { "punct", Ispunct }, 391 | { "space", Isspace }, 392 | { "upper", Isupper }, 393 | { "xdigit",Isxdigit} 394 | }; 395 | 396 | static int 397 | getcharcl(int c, Set *set, Cenv *env) 398 | { 399 | int i, j, n; 400 | for(i=0; (unsigned)icursor.n > n+2 && 403 | strncmp(ctype[i].name,(char*)env->cursor.p,n) == 0 && 404 | env->cursor.p[n] == c && env->cursor.p[n+1] == ']') { 405 | env->cursor.next(n+2); 406 | int (*f)(int) = ctype[i].ctype; 407 | for(j=0; jinsert(j); 410 | return 1; 411 | } 
412 | } 413 | return 0; 414 | } 415 | 416 | /* find a collating element delimited by [c c], where c is 417 | either '=' or '.' */ 418 | static int 419 | findcollelem(int c, Cenv *env) 420 | { 421 | int i; 422 | if(env->cursor.n > 3 && env->cursor.p[1] == c && 423 | env->cursor.p[2] == ']') { 424 | i = env->cursor.p[0]; 425 | env->cursor.next(3); 426 | return i; 427 | } 428 | return -1; 429 | } 430 | 431 | static int 432 | token(Cenv *env) 433 | { 434 | long n = env->cursor.n; 435 | if(n <= 0) 436 | return T_END; 437 | int c = *env->cursor.p; 438 | if(env->flags & REG_LITERAL) 439 | return c; 440 | if(env->posixkludge) { 441 | env->posixkludge = 0; 442 | if(c == '*') 443 | return c; // * first in subexpr 444 | } 445 | if(c == '\\') { 446 | if(n < 2) 447 | return T_BAD; 448 | c = env->cursor.p[1]; 449 | if(c=='(' && env->retype==BRE) 450 | env->posixkludge = 1; 451 | else if(c==')' && env->retype==BRE && env->parnest==0) 452 | return T_BAD; 453 | if(escindex[c]) 454 | return escape[escindex[c]].val[env->retype].esc; 455 | else 456 | return T_BAD; 457 | } 458 | if(c=='$' && n==1) 459 | return T_DOLL; 460 | else if(c=='^' && env->retype==BRE && env->cursor.p==env->expr.p) { 461 | env->posixkludge = 1; 462 | return T_CFLX; 463 | } else if(c==')' && env->retype!=BRE && env->parnest==0) 464 | return c; 465 | return escindex[c]?
escape[escindex[c]].val[env->retype].unesc: c; 466 | } 467 | inline void 468 | eat(Cenv *env) 469 | { 470 | env->cursor.next(1 + (*env->cursor.p=='\\')); 471 | } 472 | 473 | static Rex* // bracket expression 474 | regbra(Cenv *env) 475 | { 476 | Class *r = (Class*)NEW(Class); 477 | Set set; 478 | int c, i, neg, last, inrange, init; 479 | neg = 0; 480 | if(env->cursor.n>0 && *env->cursor.p=='^') { 481 | env->cursor.next(); 482 | neg = 1; 483 | } 484 | if(env->cursor.n < 2) 485 | goto error; 486 | inrange = 0; // 0=no, 1=possibly, 2=definitely 487 | for(init=1; ; init=0) { 488 | if(env->cursor.n <= 0) 489 | goto error; 490 | c = *env->cursor.p; 491 | env->cursor.next(); 492 | if(c == ']') { 493 | if(init) { 494 | last = c; 495 | inrange = 1; 496 | continue; 497 | } 498 | if(inrange != 0) 499 | set.insert(last); 500 | if(inrange == 2) 501 | set.insert('-'); 502 | break; 503 | } else if(c == '-') { 504 | if(inrange == 0 && !init) 505 | goto error; 506 | if(inrange == 1) { 507 | inrange = 2; 508 | continue; 509 | } 510 | } else if(c == '[') { 511 | if(env->cursor.n < 2) 512 | goto error; 513 | c = *env->cursor.p; 514 | switch(c) { 515 | case ':': 516 | env->cursor.next(); 517 | if(inrange == 1) 518 | set.insert(last); 519 | if(!getcharcl(c, &set, env)) 520 | goto error; 521 | inrange = 0; 522 | continue; 523 | case '=': 524 | env->cursor.next(); 525 | if(inrange == 2) 526 | goto error; 527 | if(inrange == 1) 528 | set.insert(last); 529 | i = findcollelem(c, env); 530 | if(i == -1) 531 | goto error; 532 | set.insert(i); 533 | inrange = 0; 534 | continue; 535 | case '.': 536 | env->cursor.next(); 537 | c = findcollelem(c, env); 538 | if(c == -1) 539 | goto error; 540 | break; 541 | default: 542 | c = '['; 543 | } 544 | } 545 | if(inrange == 2) { 546 | if(last > c) 547 | goto error; 548 | for(i=last; i<=c; i++) 549 | set.insert(i); 550 | inrange = 0; 551 | } else if(inrange == 1) 552 | set.insert(last); 553 | else 554 | inrange = 1; 555 | last = c; 556 | } 557 | 
r->orset(&set); 558 | r->icase(env->map); 559 | if(neg) 560 | r->neg(env->flags); 561 | return r; 562 | error: 563 | delete r; 564 | return ERROR; 565 | } 566 | 567 | static Rex* 568 | regRep(Rex *e, int n1, int n2, Cenv *env) 569 | { 570 | if(cdebug) 571 | printf("regRep %lx.%d\n", 572 | (unsigned long)env->cursor.p,env->cursor.n); 573 | int c; 574 | unsigned long m = 0; 575 | unsigned long n = RE_DUP_INF; 576 | char *sp, *ep; 577 | if(e == ERROR) 578 | return e; 579 | c = token(env); 580 | switch(c) { 581 | default: 582 | return e; 583 | case T_BANG: 584 | eat(env); 585 | return NEW(Neg(e)); 586 | case T_QUES: 587 | n = 1; 588 | eat(env); 589 | break; 590 | case T_STAR: 591 | eat(env); 592 | break; 593 | case T_PLUS: 594 | m = 1; 595 | eat(env); 596 | break; 597 | case T_LEFT: 598 | eat(env); 599 | errno = 0; 600 | sp = (char*)env->cursor.p; 601 | n = m = strtoul(sp, &ep, 10); 602 | if(ep == sp || ep-sp >= env->cursor.n) 603 | goto error; 604 | if(*ep == ',') { 605 | sp = ep + 1; 606 | n = strtoul(sp, &ep, 10); 607 | if(ep == sp) 608 | n = RE_DUP_INF; 609 | else if(n > RE_DUP_MAX) 610 | goto error; 611 | else if(ep-sp >= env->cursor.n) 612 | goto error; 613 | } 614 | env->cursor.next(ep - (char*)env->cursor.p); 615 | if(errno || m > n || m > RE_DUP_MAX) 616 | goto error; 617 | else if(env->flags®_EXTENDED) { 618 | if(token(env) != '}') 619 | goto error; 620 | } else if(token(env) != T_RIGHT) 621 | goto error; 622 | eat(env); 623 | break; 624 | } 625 | switch(e->type) { 626 | case DOT: 627 | case CLASS: 628 | case ONECHAR: 629 | ((Dup*)e)->lo = (int)m; 630 | ((Dup*)e)->hi = (int)n; 631 | return e; 632 | } 633 | return NEW(Rep((int)m, (int)n, n1, n2, e)); 634 | error: 635 | delete e; 636 | return ERROR; 637 | } 638 | 639 | /* combine e and f into a sequence, collapsing them if 640 | either is Ok, or if both are Dots. 
*/ 641 | 642 | static Rex * 643 | mkSeq(Rex *e, Rex *f) 644 | { 645 | Rex *g; 646 | if(f == ERROR) { 647 | delete e; 648 | return f; 649 | } else if(e->type == OK) { 650 | delete e; 651 | return f; 652 | } else if(f->type == OK) { 653 | g = (Rex*)f->next; 654 | f->next = 0; 655 | delete f; 656 | f = g; 657 | } else if(e->type==DOT && f->type==DOT) { 658 | unsigned m = ((Dot*)e)->lo + ((Dot*)f)->lo; 659 | unsigned n = ((Dot*)e)->hi + ((Dot*)f)->hi; 660 | if(m <= RE_DUP_MAX) { 661 | if(((Dot*)e)->hi > RE_DUP_MAX || 662 | ((Dot*)f)->hi > RE_DUP_MAX) { 663 | n = RE_DUP_INF; 664 | goto n_ok; 665 | } else if(n <= RE_DUP_MAX) { // unless ovfl, 666 | n_ok: ((Dot*)e)->lo = m; // combine 667 | ((Dot*)e)->hi = n; 668 | g = (Rex*)f->next; 669 | f->next = 0; 670 | delete f; 671 | f = g; 672 | } 673 | } 674 | } 675 | e->next = f; 676 | return e; 677 | } 678 | 679 | static Rex *regSeq(Cenv *env); 680 | static Rex *regConj(Cenv *env); 681 | static Rex *regTrie(Rex *e, Rex *f, Cenv *env); 682 | 683 | static Rex * 684 | regAlt(int n1, Cenv *env) // n1= no. 
of 1st subexpr in alt 685 | { 686 | if(cdebug) 687 | printf("regAlt %lx.%d parno=%d\n", 688 | (unsigned long)env->cursor.p, env->cursor.n, env->parno); 689 | Rex *e = regConj(env); 690 | if(e == ERROR) 691 | return e; 692 | else if(token(env) != T_BAR) 693 | return e; 694 | eat(env); 695 | Rex *f = regAlt(n1, env); 696 | if(f == ERROR) { 697 | delete e; 698 | return f; 699 | } 700 | Rex *g = regTrie(e, f, env); 701 | if(g != ERROR) 702 | return g; 703 | if(e->type==OK || f->type==OK) 704 | goto bad; 705 | g = NEW(Alt(n1, env->parno, e, f)); 706 | if(g != ERROR) 707 | return g; 708 | bad: 709 | delete e; 710 | delete f; 711 | return ERROR; 712 | } 713 | 714 | static Rex* 715 | regConj(Cenv *env) 716 | { 717 | if(cdebug) 718 | printf("regConj %lx.%d parno=%d\n", 719 | (unsigned long)env->cursor.p, env->cursor.n, env->parno); 720 | Rex *e = regSeq(env); 721 | if(env->retype != ARE) 722 | return e; 723 | if(e == ERROR) 724 | return e; 725 | if(token(env) != T_AND) 726 | return e; 727 | eat(env); 728 | Rex *f = regConj(env); 729 | if(f == ERROR) { 730 | delete e; 731 | return f; 732 | } 733 | Rex *g = NEW(Conj(e, f)); 734 | if(g == ERROR){ 735 | delete e; 736 | delete f; 737 | } 738 | return g; 739 | } 740 | 741 | /* regTrie tries to combine nontrivial e and f into a Trie. 
unless 742 | ERROR is returned, e and f are deleted as far as possible */ 743 | 744 | static int 745 | isstring(Rex *e) 746 | { 747 | switch(e->type) { 748 | case KMP: 749 | case KR: 750 | case STRING: 751 | return 1; 752 | case ONECHAR: 753 | return ((Onechar*)e)->lo==1 && ((Onechar*)e)->hi==1; 754 | } 755 | return 0; 756 | } 757 | static int 758 | insert(Rex *f, Trie *g) 759 | { 760 | uchar temp[2]; 761 | switch(f->type) { 762 | case KMP: 763 | case KR: 764 | case STRING: 765 | return g->insert(((String*)f)->seg.p); 766 | case ONECHAR: 767 | temp[0] = ((Onechar*)f)->c; 768 | temp[1] = 0; 769 | return g->insert(temp); 770 | } 771 | return 1; // shouldn't happen 772 | } 773 | static Rex * 774 | regTrie(Rex *e, Rex *f, Cenv *env) 775 | { 776 | Trie *g = (Trie*)f; 777 | if(e->next || f->next || !isstring(e)) 778 | return ERROR; 779 | if(isstring(f)) { 780 | g = (Trie*)NEW(Trie()); // env is used here 781 | if(g == ERROR) 782 | return ERROR; 783 | if(insert(f, g)) 784 | goto nospace; 785 | } else if(f->type != TRIE) 786 | return ERROR; 787 | if(insert(e, g)) 788 | goto nospace; 789 | delete e; 790 | if(f != g) 791 | delete f; 792 | return g; 793 | nospace: 794 | if(g != f) 795 | delete g; 796 | return ERROR; 797 | } 798 | 799 | static Rex * 800 | regSeq(Cenv *env) 801 | { 802 | Rex *e, *f, *g; 803 | int c, parno; 804 | uchar ch; 805 | uchar data[101]; 806 | if(cdebug) 807 | printf("regSeq %lx.%d parno=%d\n", 808 | (unsigned long)env->cursor.p, env->cursor.n, env->parno); 809 | Seg string(data, 0); 810 | for( ; ; ch = c) { // get string 811 | c = token(env); 812 | if(c>T_META || (unsigned)string.n>=sizeof(data)-1) 813 | break; 814 | string.p[string.n++] = c; 815 | eat(env); 816 | } 817 | if(c == T_BAD) 818 | return ERROR; 819 | if(string.n > 0) switch(c) { 820 | case T_STAR: 821 | case T_PLUS: 822 | case T_LEFT: 823 | case T_QUES: 824 | case T_BANG: 825 | string.n--; 826 | if(string.n < 0) 827 | return ERROR; 828 | if(string.n == 0) 829 | e = NEW(Ok); 830 | else { 831 
| Seg copy = string.copy(); 832 | if(copy.p == 0) 833 | return ERROR; 834 | e = NEW(String(copy, env->map)); 835 | } 836 | f = NEW(Onechar(env->map[ch])); 837 | f = regRep(f, 0, 0, env); 838 | if(f == ERROR) { 839 | delete e; 840 | return f; 841 | } 842 | g = regSeq(env); 843 | return mkSeq(e, mkSeq(f, g)); 844 | default: 845 | e = NEW(String(string.copy(), env->map)); 846 | f = regSeq(env); 847 | return mkSeq(e, f); 848 | } else if(c > T_BACK) { 849 | eat(env); 850 | c -= T_BACK; 851 | if(c>env->parno || env->paren[c] == 0) 852 | return ERROR; 853 | env->backref |= 1<flags®_EXTENDED) 869 | e = regRep(e, 0, 0, env); 870 | break; 871 | case T_OPEN: 872 | eat(env); 873 | ++env->parnest; 874 | parno = ++env->parno; 875 | e = regAlt(parno+1, env); 876 | if(e == ERROR) 877 | break; 878 | if(e->type==OK && env->flags®_EXTENDED) { 879 | delete e; 880 | return ERROR; 881 | } 882 | if(token(env) != T_CLOSE) { 883 | delete e; 884 | return ERROR; 885 | } 886 | --env->parnest; 887 | eat(env); 888 | if(parno <= BACK_REF_MAX) 889 | env->paren[parno] = 1; 890 | e = NEW(Subexp(parno, e)); 891 | if(e == ERROR) 892 | break; 893 | e = regRep(e, parno, env->parno, env); 894 | break; 895 | case T_BRA: 896 | eat(env); 897 | e = regRep(regbra(env), 0, 0, env); 898 | break; 899 | case T_DOT: 900 | eat(env); 901 | e = regRep(NEW(Dot), 0, 0, env); 902 | break; 903 | default: 904 | return ERROR; 905 | } 906 | if(e != ERROR && env->cursor.n > 0) 907 | e = mkSeq(e, regSeq(env)); 908 | return e; 909 | } 910 | 911 | 912 | /* rewrite the expression tree for some special cases. 913 | 1. it is a null expression - illegal 914 | 2. it begins with an unanchored string - use KMP algorithm 915 | 3. it begins with .* or ^ - regexec only need try it ONCE 916 | 4. 
it begins with one of the above parenthesized and unduplicated 917 | */ 918 | 919 | static int 920 | special(regex_t *preg, Cenv *env) 921 | { 922 | Rex* kmp; 923 | Rex* rex = preg->rex; 924 | String *string; 925 | if(rex == ERROR) 926 | return 0; 927 | switch(rex->type) { 928 | case SUBEXP: 929 | for(;;) { 930 | rex = ((Subexp*)rex)->rex; 931 | switch(rex->type) { 932 | case SUBEXP: 933 | continue; 934 | case DOT: 935 | goto dot; 936 | case ANCHOR: 937 | goto anchor; 938 | default: 939 | return 0; 940 | } 941 | } 942 | dot: 943 | case DOT: // .* 944 | if(((Dot*)rex)->lo==0 && ((Dot*)rex)->hi==RE_DUP_INF) 945 | return ONCE; 946 | return 0; 947 | case OK: // empty regexp 948 | if(env->flags & REG_NULL) 949 | return ONCE; 950 | regfree(preg); 951 | return 0; 952 | case STRING: 953 | if(env->flags & (REG_ANCH | REG_LITERAL)) 954 | return 0; 955 | string = (String*)rex; 956 | kmp = NEW(Kmp(string->seg, &env->flags)); 957 | if(kmp==ERROR || env->flags&SPACE) 958 | return 0; 959 | kmp->next = rex->next; 960 | preg->rex = kmp; 961 | string->seg = Seg(0,0); 962 | string->next = 0; 963 | delete string; 964 | return ONCE; 965 | anchor: 966 | case ANCHOR: 967 | if(!(env->flags & REG_NEWLINE)) 968 | return ONCE; 969 | } 970 | return 0; 971 | } 972 | 973 | int 974 | regcomp(regex_t *preg, const char *pattern, int cflags) 975 | { 976 | preg->rex = 0; 977 | if(Done::done==0 && (Done::done=new Done)==0) 978 | return REG_ESPACE; 979 | if(cflags & REG_AUGMENTED) 980 | cflags |= REG_EXTENDED; 981 | cflags &= CFLAGS|GFLAGS; 982 | Cenv env(pattern, cflags); 983 | if(env.flags&SPACE) 984 | return REG_ESPACE; 985 | 986 | preg->rex = regAlt(1, &env); 987 | cflags |= special(preg, &env); 988 | if(preg->rex == ERROR) 989 | return env.flags&SPACE? 
REG_ESPACE: REG_BADPAT; 990 | 991 | preg->rex->serialize(1); 992 | Stat st = preg->rex->stat(&env); 993 | if(st.o) { 994 | regfree(preg); 995 | return REG_ESPACE; 996 | } 997 | cflags |= hard(&st); 998 | if(cflags & REG_ANCH) 999 | cflags |= ONCE; 1000 | preg->flags = cflags; 1001 | preg->re_nsub = st.p; 1002 | preg->map = env.map; 1003 | return 0; 1004 | } 1005 | 1006 | void 1007 | regfree(regex_t *preg) 1008 | { 1009 | delete preg->rex; 1010 | preg->rex = ERROR; 1011 | } 1012 | 1013 | size_t 1014 | regerror(int errcode, const regex_t*, char *errbuf, size_t errbuf_size) 1015 | { 1016 | if(errbuf_size == 0) 1017 | return 0; 1018 | const char *s = "unknown error"; 1019 | switch(errcode) { 1020 | case 0: 1021 | s = "success"; 1022 | break; 1023 | case REG_NOMATCH: 1024 | s = "no match"; 1025 | break; 1026 | case REG_ESPACE: 1027 | s = "out of space"; 1028 | break; 1029 | case REG_BADPAT: 1030 | case REG_ECOLLATE: 1031 | case REG_ECTYPE: 1032 | case REG_EESCAPE: 1033 | case REG_ESUBREG: 1034 | case REG_EBRACK: 1035 | case REG_EPAREN: 1036 | case REG_EBRACE: 1037 | case REG_BADBR: 1038 | case REG_ERANGE: 1039 | case REG_BADRPT: 1040 | s = "improper regular expression"; 1041 | break; 1042 | } 1043 | strncpy(errbuf, s, errbuf_size); 1044 | errbuf[errbuf_size-1] = 0; 1045 | return 1+strlen(errbuf); 1046 | } 1047 | 1048 | /* combine two regular expressions if possible, 1049 | replacing first with the combination and freeing second. 1050 | return 1 on success. 
1051 | the only combinations handled are building a Trie 1052 | from String|Kmp|Trie and String|Kmp */ 1053 | 1054 | int 1055 | regcomb(regex_t *preg0, regex_t *preg1) 1056 | { 1057 | if(cdebug) 1058 | printf("regcomb\n"); 1059 | Rex *rex0 = preg0->rex; 1060 | Rex *rex1 = preg1->rex; 1061 | Cenv env(preg0->flags); 1062 | if(rex0==ERROR || rex0->next || 1063 | rex1==ERROR || rex1->next) 1064 | return 0; 1065 | Rex *g = regTrie(rex1, rex0, &env); 1066 | if(g == 0) 1067 | return 0; 1068 | preg0->rex = g; 1069 | if((preg0->flags®_ANCH) == 0) 1070 | preg0->flags &= ~ONCE; 1071 | preg1->rex = ERROR; 1072 | return 1; 1073 | } 1074 | --------------------------------------------------------------------------------