├── .github └── workflows │ └── c-cpp.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── clib.json ├── formal_verification.md ├── re.c ├── re.h ├── scripts ├── exrex.py ├── regex_test.py └── regex_test_neg.py └── tests ├── test1.c ├── test2.c ├── test_compile.c ├── test_print.c ├── test_rand.c └── test_rand_neg.c /.github/workflows/c-cpp.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: make clean 20 | run: make clean 21 | - name: make all 22 | run: make all 23 | - name: make test 24 | run: make test 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | /tests/* 4 | !/tests/*.c 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. 
We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <http://unlicense.org> 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Compiler to use - can be replaced by clang for instance 2 | CC := gcc 3 | 4 | # Number of random text expressions to generate, for random testing 5 | NRAND_TESTS := 1000 6 | 7 | # Flags to pass to compiler 8 | CFLAGS := -O3 -Wall -Wextra -std=c99 -I.
9 | 10 | all: 11 | @$(CC) $(CFLAGS) re.c tests/test1.c -o tests/test1 12 | @$(CC) $(CFLAGS) re.c tests/test2.c -o tests/test2 13 | @$(CC) $(CFLAGS) re.c tests/test_rand.c -o tests/test_rand 14 | @$(CC) $(CFLAGS) re.c tests/test_rand_neg.c -o tests/test_rand_neg 15 | @$(CC) $(CFLAGS) re.c tests/test_compile.c -o tests/test_compile 16 | 17 | clean: 18 | @rm -f tests/test1 tests/test2 tests/test_rand tests/test_compile 19 | @#@$(foreach test_bin,$(TEST_BINS), rm -f $(test_bin) ; ) 20 | @rm -f a.out 21 | @rm -f *.o 22 | 23 | 24 | test: all 25 | @$(test python) 26 | @echo 27 | @echo Testing hand-picked regex\'s: 28 | @./tests/test1 29 | @echo Testing handling of invalid regex patterns 30 | @./tests/test_compile 31 | @echo Testing patterns against $(NRAND_TESTS) random strings matching the Python implementation and comparing: 32 | @echo 33 | @python ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS) 34 | @python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) 35 | @python ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS) 36 | @python ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS) 37 | @python ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS) 38 | @python ./scripts/regex_test.py [^\\w] $(NRAND_TESTS) 39 | @python ./scripts/regex_test.py [^1-4] $(NRAND_TESTS) 40 | @python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) 41 | @python ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS) 42 | @python ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS) 43 | @python ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? $(NRAND_TESTS) 44 | @python ./scripts/regex_test.py [-1-5]+[-1-2]-[-] $(NRAND_TESTS) 45 | @python ./scripts/regex_test.py [-1-3]-[-]+ $(NRAND_TESTS) 46 | @python ./scripts/regex_test.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) 47 | @python ./scripts/regex_test.py [-1-2]* $(NRAND_TESTS) 48 | @python ./scripts/regex_test.py \\s?[a-fKL098]+-? 
$(NRAND_TESTS) 49 | @python ./scripts/regex_test.py [\\-]* $(NRAND_TESTS) 50 | @python ./scripts/regex_test.py [\\\\]+ $(NRAND_TESTS) 51 | @python ./scripts/regex_test.py [0-9a-fA-F]+ $(NRAND_TESTS) 52 | @python ./scripts/regex_test.py [1379][2468][abcdef] $(NRAND_TESTS) 53 | @python ./scripts/regex_test.py [012345-9]?[0123-789] $(NRAND_TESTS) 54 | @python ./scripts/regex_test.py [012345-9] $(NRAND_TESTS) 55 | @python ./scripts/regex_test.py [0-56789] $(NRAND_TESTS) 56 | @python ./scripts/regex_test.py [abc-zABC-Z] $(NRAND_TESTS) 57 | @python ./scripts/regex_test.py [a\d]?1234 $(NRAND_TESTS) 58 | @python ./scripts/regex_test.py .*123faerdig $(NRAND_TESTS) 59 | @python ./scripts/regex_test.py .?\\w+jsj $(NRAND_TESTS) 60 | @python ./scripts/regex_test.py [?to][+to][?ta][*ta] $(NRAND_TESTS) 61 | @python ./scripts/regex_test.py \\d+ $(NRAND_TESTS) 62 | @python ./scripts/regex_test.py [a-z]+ $(NRAND_TESTS) 63 | @python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) 64 | @python ./scripts/regex_test.py \\w $(NRAND_TESTS) 65 | @python ./scripts/regex_test.py \\d $(NRAND_TESTS) 66 | @python ./scripts/regex_test.py [\\d] $(NRAND_TESTS) 67 | @python ./scripts/regex_test.py [^\\d] $(NRAND_TESTS) 68 | @python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) 69 | @echo 70 | @echo 71 | @echo 72 | @echo Testing rejection of patterns against $(NRAND_TESTS) random strings also rejected by the Python implementation: 73 | @echo 74 | @python ./scripts/regex_test_neg.py \\d+ $(NRAND_TESTS) 75 | @python ./scripts/regex_test_neg.py [a-z]+ $(NRAND_TESTS) 76 | @python ./scripts/regex_test_neg.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) 77 | @python ./scripts/regex_test_neg.py ^\\w $(NRAND_TESTS) 78 | @python ./scripts/regex_test_neg.py ^\\d $(NRAND_TESTS) 79 | @python ./scripts/regex_test_neg.py [\\d] $(NRAND_TESTS) 80 | @python ./scripts/regex_test_neg.py ^[^\\d] $(NRAND_TESTS) 81 | @python ./scripts/regex_test_neg.py [^\\w]+ $(NRAND_TESTS) 82 | @python ./scripts/regex_test_neg.py ^[\\w]+ 
$(NRAND_TESTS) 83 | @python ./scripts/regex_test_neg.py ^[^0-9] $(NRAND_TESTS) 84 | @python ./scripts/regex_test_neg.py [a-z].[A-Z] $(NRAND_TESTS) 85 | @python ./scripts/regex_test_neg.py [-1-3]-[-]+ $(NRAND_TESTS) 86 | @python ./scripts/regex_test_neg.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) 87 | @python ./scripts/regex_test_neg.py [-0-9]+ $(NRAND_TESTS) 88 | @python ./scripts/regex_test_neg.py [\\-]+ $(NRAND_TESTS) 89 | @python ./scripts/regex_test_neg.py [\\\\]+ $(NRAND_TESTS) 90 | @python ./scripts/regex_test_neg.py [0-9a-fA-F]+ $(NRAND_TESTS) 91 | @python ./scripts/regex_test_neg.py [1379][2468][abcdef] $(NRAND_TESTS) 92 | @python ./scripts/regex_test_neg.py [012345-9] $(NRAND_TESTS) 93 | @python ./scripts/regex_test_neg.py [0-56789] $(NRAND_TESTS) 94 | @python ./scripts/regex_test_neg.py .*123faerdig $(NRAND_TESTS) 95 | @echo 96 | @echo 97 | @./tests/test2 98 | @echo 99 | @echo 100 | 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![CI](https://github.com/kokke/tiny-regex-c/workflows/CI/badge.svg) 2 | # tiny-regex-c 3 | # A small regex implementation in C 4 | ### Description 5 | Small and portable [Regular Expression](https://en.wikipedia.org/wiki/Regular_expression) (regex) library written in C. 6 | 7 | Design is inspired by Rob Pike's regex-code for the book *"Beautiful Code"* [available online here](http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html). 8 | 9 | Supports a subset of the syntax and semantics of the Python standard library implementation (the `re`-module). 10 | 11 | **I will gladly accept patches correcting bugs.** 12 | 13 | ### Design goals 14 | The main design goal of this library is to be small, correct, self contained and use few resources while retaining acceptable performance and feature completeness. Clarity of the code is also highly valued. 
15 | 16 | ### Notable features and omissions 17 | - Small code and binary size: 500 SLOC, ~3kb binary for x86. Statically #define'd memory usage / allocation. 18 | - No use of dynamic memory allocation (i.e. no calls to `malloc` / `free`). 19 | - To avoid call-stack exhaustion, iterative searching is preferred over recursive by default (can be changed with a pre-processor flag). 20 | - No support for capturing groups or named capture: `(?P<name>group)` etc. 21 | - Thorough testing: [exrex](https://github.com/asciimoo/exrex) is used to randomly generate test-cases from regex patterns, which are fed into the regex code for verification. Try `make test` to generate a few thousand test cases yourself. 22 | - Verification-harness for [KLEE Symbolic Execution Engine](https://klee.github.io), see [formal verification.md](https://github.com/kokke/tiny-regex-c/blob/master/formal_verification.md). 23 | - Provides character length of matches. 24 | - Compiled for x86 using GCC 7.2.0 and optimizing for size, the binary takes up ~2-3kb code space and allocates ~0.5kb RAM : 25 | ``` 26 | > gcc -Os -c re.c 27 | > size re.o 28 | text data bss dec hex filename 29 | 2404 0 304 2708 a94 re.o 30 | 31 | ``` 32 | 33 | 34 | 35 | ### API 36 | This is the public / exported API: 37 | ```C 38 | /* Typedef'd pointer to hide implementation details. */ 39 | typedef struct regex_t* re_t; 40 | 41 | /* Compiles regex string pattern to a regex_t-array. */ 42 | re_t re_compile(const char* pattern); 43 | 44 | /* Finds matches of the compiled pattern inside text. */ 45 | int re_matchp(re_t pattern, const char* text, int* matchlength); 46 | 47 | /* Finds matches of pattern inside text (compiles first automatically). */ 48 | int re_match(const char* pattern, const char* text, int* matchlength); 49 | ``` 50 | 51 | ### Supported regex-operators 52 | The following features / regex-operators are supported by this library.
53 | 54 | NOTE: inverted character classes are buggy - see the test harness for concrete examples. 55 | 56 | 57 | - `.` Dot, matches any character 58 | - `^` Start anchor, matches beginning of string 59 | - `$` End anchor, matches end of string 60 | - `*` Asterisk, match zero or more (greedy) 61 | - `+` Plus, match one or more (greedy) 62 | - `?` Question, match zero or one (non-greedy) 63 | - `[abc]` Character class, match if one of {'a', 'b', 'c'} 64 | - `[^abc]` Inverted class, match if NOT one of {'a', 'b', 'c'} 65 | - `[a-zA-Z]` Character ranges, the character set of the ranges { a-z | A-Z } 66 | - `\s` Whitespace, \t \f \r \n \v and spaces 67 | - `\S` Non-whitespace 68 | - `\w` Alphanumeric, [a-zA-Z0-9_] 69 | - `\W` Non-alphanumeric 70 | - `\d` Digits, [0-9] 71 | - `\D` Non-digits 72 | 73 | ### Usage 74 | Compile a regex from ASCII-string (char-array) to a custom pattern structure using `re_compile()`. 75 | 76 | Search a text-string for a regex and get an index into the string, using `re_match()` or `re_matchp()`. 77 | 78 | The returned index points to the first place in the string, where the regex pattern matches. 79 | 80 | The integer pointer passed will hold the length of the match. 81 | 82 | If the regular expression doesn't match, the matching function returns an index of -1 to indicate failure. 83 | 84 | ### Examples 85 | Example of usage: 86 | ```C 87 | /* Standard int to hold length of match */ 88 | int match_length; 89 | 90 | /* Standard null-terminated C-string to search: */ 91 | const char* string_to_search = "ahem.. 'hello world !' 
.."; 92 | 93 | /* Compile a simple regular expression using character classes, meta-char and greedy + non-greedy quantifiers: */ 94 | re_t pattern = re_compile("[Hh]ello [Ww]orld\\s*[!]?"); 95 | 96 | /* Check if the regex matches the text: */ 97 | int match_idx = re_matchp(pattern, string_to_search, &match_length); 98 | if (match_idx != -1) 99 | { 100 | printf("match at idx %i, %i chars long.\n", match_idx, match_length); 101 | } 102 | ``` 103 | 104 | For more usage examples I encourage you to look at the code in the `tests`-folder. 105 | 106 | ### TODO 107 | - Fix the implementation of inverted character classes. 108 | - Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`. 109 | - Add `example.c` that demonstrates usage. 110 | - Add `tests/test_perf.c` for performance and time measurements. 111 | - Testing: Improve pattern rejection testing. 112 | 113 | ### FAQ 114 | - *Q: What differentiates this library from other C regex implementations?* 115 | 116 | A: Well, the small size for one. 500 lines of C-code compiling to 2-3kb ROM, using very little RAM. 117 | 118 | ### License 119 | All material in this repository is in the public domain. 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /clib.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tiny-regex-c", 3 | "version": "0.1.0", 4 | "repo": "kokke/tiny-regex-c", 5 | "keywords": ["tiny", "regex", "pcre"], 6 | "license": "Public Domain", 7 | "makefile": "Makefile", 8 | "src": [ 9 | "re.h", 10 | "re.c" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /formal_verification.md: -------------------------------------------------------------------------------- 1 | # Using KLEE for formal verification 2 | 3 | Here is a crude demo of formal verification of tiny-regex. 
This is heavily borrowed from the work of [@DavidKorczynski](https://twitter.com/davkorcz/) - see https://www.youtube.com/watch?v=z6bsk-lsk1Q or [#44](https://github.com/kokke/tiny-regex-c/issues/44) for more context. 4 | 5 | I am using the [KLEE Symbolic Execution Engine](https://klee.github.io/) and their Docker image here on a Debian-based host. 6 | 7 | What this does is mechanically try to prove the absence of all run-time errors, memory corruption bugs and other problems by symbolic execution. We mark the inputs as being symbolic, so that the tool knows to use that as the "search space". That means KLEE checks all possible inputs of the form we give it. 8 | 9 | Steps: 10 | 11 | - Get the KLEE Docker image: ` $ sudo docker pull klee/klee ` 12 | - Run the KLEE Docker image: ` $ sudo docker run --rm -ti --ulimit='stack=-1:-1' klee/klee ` 13 | - NOTE: You should see a command prompt like this: ` klee@cc0c26c5b84c:~$ ` 14 | - Fetch `re.h`: ` klee@cc0c26c5b84c:~$ wget https://raw.githubusercontent.com/kokke/tiny-regex-c/master/re.h ` 15 | - Fetch `re.c`: ` klee@cc0c26c5b84c:~$ wget https://raw.githubusercontent.com/kokke/tiny-regex-c/master/re.c ` 16 | - Run your favorite editor, and insert the code below at the bottom of `re.c` 17 | ```C 18 | /* 19 | tiny-regex KLEE test driver 20 | kindly contributed by @DavidKorczynski - see https://github.com/kokke/tiny-regex-c/issues/44 21 | */ 22 | 23 | int main(int argc, char* argv[]) 24 | { 25 | /* test input - ten chars used as a regex-pattern input */ 26 | char arr[10]; 27 | 28 | /* make input symbolic, to search all paths through the code */ 29 | /* i.e. the input is checked for all possible ten-char combinations */ 30 | klee_make_symbolic(arr, sizeof(arr), "arr"); 31 | 32 | /* assume proper NULL termination */ 33 | klee_assume(arr[sizeof(arr) - 1] == 0); 34 | 35 | /* verify absence of run-time errors - go!
*/ 36 | re_compile(arr); 37 | 38 | return 0; 39 | } 40 | ``` 41 | - Alternatively, run this command: 42 | ` klee@cc0c26c5b84c:~$ echo "int main(int argc,char* argv[]){ char arr[10]; klee_make_symbolic(arr, sizeof(arr), \"arr\"); klee_assume(arr[sizeof(arr)-1] == 0); re_compile(arr); return 0; }" >> re.c ` 43 | - Compile and emit LLVM bitcode: ` klee@cc0c26c5b84c:~$ clang -emit-llvm -g -c -O0 -Xclang -disable-O0-optnone re.c ` [(NOTE: flags passed to clang are the ones "recommended" by the KLEE project)](https://klee.github.io/tutorials/testing-function/) 44 | - Run KLEE and wait for 5-10 minutes: ` klee@cc0c26c5b84c:~$ klee --libc=uclibc re.bc ` 45 | - A positive result looks like this: 46 | ``` 47 | klee@cc0c26c5b84c:~$ klee --libc=uclibc re.bc 48 | KLEE: NOTE: Using klee-uclibc : /tmp/klee_build90stp_z3/runtime/lib/klee-uclibc.bca 49 | KLEE: output directory is "/home/klee/klee-out-3" 50 | KLEE: Using STP solver backend 51 | warning: Linking two modules of different target triples: re.bc' is 'x86_64-unknown-linux-gnu' whereas '__uClibc_main.os' is 'x86_64-pc-linux-gnu' 52 | 53 | KLEE: WARNING: undefined reference to function: __syscall_rt_sigaction 54 | KLEE: WARNING: undefined reference to function: close 55 | KLEE: WARNING: undefined reference to function: fcntl 56 | KLEE: WARNING: undefined reference to function: fstat 57 | KLEE: WARNING: undefined reference to function: ioctl 58 | KLEE: WARNING: undefined reference to function: lseek64 59 | KLEE: WARNING: undefined reference to function: mkdir 60 | KLEE: WARNING: undefined reference to function: open 61 | KLEE: WARNING: undefined reference to function: open64 62 | KLEE: WARNING: undefined reference to function: read 63 | KLEE: WARNING: undefined reference to function: sigprocmask 64 | KLEE: WARNING: undefined reference to function: stat 65 | KLEE: WARNING: undefined reference to function: write 66 | KLEE: WARNING: undefined reference to function: kill (UNSAFE)! 
67 | KLEE: WARNING: executable has module level assembly (ignoring) 68 | KLEE: WARNING ONCE: calling external: ioctl(0, 21505, 94666720729472) at libc/termios/tcgetattr.c:43 12 69 | KLEE: WARNING ONCE: calling __user_main with extra arguments. 70 | KLEE: WARNING ONCE: skipping fork (memory cap exceeded) 71 | KLEE: WARNING: killing 12290 states (over memory cap: 2102MB) 72 | KLEE: WARNING: killing 11467 states (over memory cap: 2101MB) 73 | 74 | KLEE: done: total instructions = 104365773 75 | KLEE: done: completed paths = 801298 76 | KLEE: done: generated tests = 801298 77 | klee@cc0c26c5b84c:~$ 78 | ``` 79 | 80 | Similarly, the code below tests both `re_compile(...)` and `re_match(...)`, which should be sufficient for coverage of the core logic. 81 | Depending on your hardware, you should be able to increase the sizes of `pat` and `txt` to increase your confidence in the verification. 82 | 83 | 84 | ```C 85 | /* 86 | tiny-regex KLEE test driver 87 | kindly contributed by @DavidKorczynski - see https://github.com/kokke/tiny-regex-c/issues/44 88 | */ 89 | 90 | int main(int argc, char* argv[]) 91 | { 92 | /* test input - a regex-pattern and a text string to search in */ 93 | char pat[7]; 94 | char txt[3]; 95 | 96 | /* make input symbolic, to search all paths through the code */ 97 | /* i.e. the input is checked for all possible ten-char combinations */ 98 | klee_make_symbolic(pat, sizeof(pat), "pat"); 99 | klee_make_symbolic(txt, sizeof(txt), "txt"); 100 | 101 | /* assume proper NULL termination */ 102 | klee_assume(pat[sizeof(pat) - 1] == 0); 103 | klee_assume(txt[sizeof(txt) - 1] == 0); 104 | 105 | /* verify absence of run-time errors - go!
*/ 106 | int l; 107 | re_match(pat, txt, &l); 108 | 109 | return 0; 110 | } 111 | ``` 112 | 113 | My modest hardware (T420/i5-2520M@2.5GHz/8GB) completes a check of a 7-char pattern and a 3-char text string in 20-30 minutes (size includes null-termination), whereas 8/5 takes +8 hours, 8/6 takes 14 hours: 114 | 115 | ``` 116 | klee@780432c1aaae0:~$ clang -emit-llvm -g -c -O0 -Xclang -disable-O0-optnone re.c 117 | klee@780432c1aaae0:~$ time klee --libc=uclibc --optimize re.bc 118 | KLEE: NOTE: Using klee-uclibc : /tmp/klee_build90stp_z3/runtime/lib/klee-uclibc.bca 119 | KLEE: output directory is "/home/klee/klee-out-0" 120 | KLEE: Using STP solver backend 121 | warning: Linking two modules of different target triples: re.bc' is 'x86_64-unknown-linux-gnu' whereas '__uClibc_main.os' is 'x86_64-pc-linux-gnu' 122 | 123 | KLEE: WARNING: undefined reference to function: fcntl 124 | KLEE: WARNING: undefined reference to function: fstat 125 | KLEE: WARNING: undefined reference to function: ioctl 126 | KLEE: WARNING: undefined reference to function: open 127 | KLEE: WARNING: undefined reference to function: write 128 | KLEE: WARNING: executable has module level assembly (ignoring) 129 | KLEE: WARNING ONCE: calling external: ioctl(0, 21505, 94248844458320) at libc/termios/tcgetattr:43 12 130 | KLEE: WARNING ONCE: calling __user_main with extra arguments. 
131 | KLEE: WARNING ONCE: skipping fork (memory cap exceeded) 132 | 133 | KLEE: done: total instructions = 201292178 134 | KLEE: done: completed paths = 910249 135 | KLEE: done: generated tests = 910249 136 | 137 | real 29m16.633s 138 | user 19m38.438s 139 | sys 9m34.654s 140 | klee@780432c1aaae0:~$ 141 | ``` 142 | 143 | -------------------------------------------------------------------------------- /re.c: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Mini regex-module inspired by Rob Pike's regex code described in: 4 | * 5 | * http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html 6 | * 7 | * 8 | * 9 | * Supports: 10 | * --------- 11 | * '.' Dot, matches any character 12 | * '^' Start anchor, matches beginning of string 13 | * '$' End anchor, matches end of string 14 | * '*' Asterisk, match zero or more (greedy) 15 | * '+' Plus, match one or more (greedy) 16 | * '?' Question, match zero or one (non-greedy) 17 | * '[abc]' Character class, match if one of {'a', 'b', 'c'} 18 | * '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken! 19 | * '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z } 20 | * '\s' Whitespace, \t \f \r \n \v and spaces 21 | * '\S' Non-whitespace 22 | * '\w' Alphanumeric, [a-zA-Z0-9_] 23 | * '\W' Non-alphanumeric 24 | * '\d' Digits, [0-9] 25 | * '\D' Non-digits 26 | * 27 | * 28 | */ 29 | 30 | 31 | 32 | #include "re.h" 33 | #include <stdio.h> 34 | #include <ctype.h> 35 | 36 | /* Definitions: */ 37 | 38 | #define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */ 39 | #define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer. */ 40 | 41 | 42 | enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ }; 43 | 44 | typedef struct regex_t 45 | { 46 | unsigned char type; /* CHAR, STAR, etc.
*/ 47 | union 48 | { 49 | unsigned char ch; /* the character itself */ 50 | unsigned char* ccl; /* OR a pointer to characters in class */ 51 | } u; 52 | } regex_t; 53 | 54 | 55 | 56 | /* Private function declarations: */ 57 | static int matchpattern(regex_t* pattern, const char* text, int* matchlength); 58 | static int matchcharclass(char c, const char* str); 59 | static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength); 60 | static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength); 61 | static int matchone(regex_t p, char c); 62 | static int matchdigit(char c); 63 | static int matchalpha(char c); 64 | static int matchwhitespace(char c); 65 | static int matchmetachar(char c, const char* str); 66 | static int matchrange(char c, const char* str); 67 | static int matchdot(char c); 68 | static int ismetachar(char c); 69 | 70 | 71 | 72 | /* Public functions: */ 73 | int re_match(const char* pattern, const char* text, int* matchlength) 74 | { 75 | return re_matchp(re_compile(pattern), text, matchlength); 76 | } 77 | 78 | int re_matchp(re_t pattern, const char* text, int* matchlength) 79 | { 80 | *matchlength = 0; 81 | if (pattern != 0) 82 | { 83 | if (pattern[0].type == BEGIN) 84 | { 85 | return ((matchpattern(&pattern[1], text, matchlength)) ? 0 : -1); 86 | } 87 | else 88 | { 89 | int idx = -1; 90 | 91 | do 92 | { 93 | idx += 1; 94 | 95 | if (matchpattern(pattern, text, matchlength)) 96 | { 97 | if (text[0] == '\0') 98 | return -1; 99 | 100 | return idx; 101 | } 102 | } 103 | while (*text++ != '\0'); 104 | } 105 | } 106 | return -1; 107 | } 108 | 109 | re_t re_compile(const char* pattern) 110 | { 111 | /* The sizes of the two static arrays below substantiates the static RAM usage of this module. 112 | MAX_REGEXP_OBJECTS is the max number of symbols in the expression. 113 | MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. 
*/ 114 | static regex_t re_compiled[MAX_REGEXP_OBJECTS]; 115 | static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN]; 116 | int ccl_bufidx = 1; 117 | 118 | char c; /* current char in pattern */ 119 | int i = 0; /* index into pattern */ 120 | int j = 0; /* index into re_compiled */ 121 | 122 | while (pattern[i] != '\0' && (j+1 < MAX_REGEXP_OBJECTS)) 123 | { 124 | c = pattern[i]; 125 | 126 | switch (c) 127 | { 128 | /* Meta-characters: */ 129 | case '^': { re_compiled[j].type = BEGIN; } break; 130 | case '$': { re_compiled[j].type = END; } break; 131 | case '.': { re_compiled[j].type = DOT; } break; 132 | case '*': { re_compiled[j].type = STAR; } break; 133 | case '+': { re_compiled[j].type = PLUS; } break; 134 | case '?': { re_compiled[j].type = QUESTIONMARK; } break; 135 | /* case '|': { re_compiled[j].type = BRANCH; } break; <-- not working properly */ 136 | 137 | /* Escaped character-classes (\s \w ...): */ 138 | case '\\': 139 | { 140 | if (pattern[i+1] != '\0') 141 | { 142 | /* Skip the escape-char '\\' */ 143 | i += 1; 144 | /* ... and check the next */ 145 | switch (pattern[i]) 146 | { 147 | /* Meta-character: */ 148 | case 'd': { re_compiled[j].type = DIGIT; } break; 149 | case 'D': { re_compiled[j].type = NOT_DIGIT; } break; 150 | case 'w': { re_compiled[j].type = ALPHA; } break; 151 | case 'W': { re_compiled[j].type = NOT_ALPHA; } break; 152 | case 's': { re_compiled[j].type = WHITESPACE; } break; 153 | case 'S': { re_compiled[j].type = NOT_WHITESPACE; } break; 154 | 155 | /* Escaped character, e.g. '.' or '$' */ 156 | default: 157 | { 158 | re_compiled[j].type = CHAR; 159 | re_compiled[j].u.ch = pattern[i]; 160 | } break; 161 | } 162 | } 163 | /* '\\' as last char in pattern -> invalid regular expression. */ 164 | /* 165 | else 166 | { 167 | re_compiled[j].type = CHAR; 168 | re_compiled[j].ch = pattern[i]; 169 | } 170 | */ 171 | } break; 172 | 173 | /* Character class: */ 174 | case '[': 175 | { 176 | /* Remember where the char-buffer starts. 
*/ 177 | int buf_begin = ccl_bufidx; 178 | 179 | /* Look-ahead to determine if negated */ 180 | if (pattern[i+1] == '^') 181 | { 182 | re_compiled[j].type = INV_CHAR_CLASS; 183 | i += 1; /* Increment i to avoid including '^' in the char-buffer */ 184 | if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */ 185 | { 186 | return 0; 187 | } 188 | } 189 | else 190 | { 191 | re_compiled[j].type = CHAR_CLASS; 192 | } 193 | 194 | /* Copy characters inside [..] to buffer */ 195 | while ( (pattern[++i] != ']') 196 | && (pattern[i] != '\0')) /* Missing ] */ 197 | { 198 | if (pattern[i] == '\\') 199 | { 200 | if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) 201 | { 202 | //fputs("exceeded internal buffer!\n", stderr); 203 | return 0; 204 | } 205 | if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '\\' */ 206 | { 207 | return 0; 208 | } 209 | ccl_buf[ccl_bufidx++] = pattern[i++]; 210 | } 211 | else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) 212 | { 213 | //fputs("exceeded internal buffer!\n", stderr); 214 | return 0; 215 | } 216 | ccl_buf[ccl_bufidx++] = pattern[i]; 217 | } 218 | if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) 219 | { 220 | /* Catches cases such as [00000000000000000000000000000000000000][ */ 221 | //fputs("exceeded internal buffer!\n", stderr); 222 | return 0; 223 | } 224 | /* Null-terminate string end */ 225 | ccl_buf[ccl_bufidx++] = 0; 226 | re_compiled[j].u.ccl = &ccl_buf[buf_begin]; 227 | } break; 228 | 229 | /* Other characters: */ 230 | default: 231 | { 232 | re_compiled[j].type = CHAR; 233 | re_compiled[j].u.ch = c; 234 | } break; 235 | } 236 | /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */ 237 | if (pattern[i] == 0) 238 | { 239 | return 0; 240 | } 241 | 242 | i += 1; 243 | j += 1; 244 | } 245 | /* 'UNUSED' is a sentinel used to indicate end-of-pattern */ 246 | re_compiled[j].type = UNUSED; 247 | 248 | return (re_t) 
re_compiled; 249 | } 250 | 251 | void re_print(regex_t* pattern) 252 | { 253 | const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" }; 254 | 255 | int i; 256 | int j; 257 | char c; 258 | for (i = 0; i < MAX_REGEXP_OBJECTS; ++i) 259 | { 260 | if (pattern[i].type == UNUSED) 261 | { 262 | break; 263 | } 264 | 265 | printf("type: %s", types[pattern[i].type]); 266 | if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS) 267 | { 268 | printf(" ["); 269 | for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j) 270 | { 271 | c = pattern[i].u.ccl[j]; 272 | if ((c == '\0') || (c == ']')) 273 | { 274 | break; 275 | } 276 | printf("%c", c); 277 | } 278 | printf("]"); 279 | } 280 | else if (pattern[i].type == CHAR) 281 | { 282 | printf(" '%c'", pattern[i].u.ch); 283 | } 284 | printf("\n"); 285 | } 286 | } 287 | 288 | 289 | 290 | /* Private functions: */ 291 | static int matchdigit(char c) 292 | { 293 | return isdigit(c); 294 | } 295 | static int matchalpha(char c) 296 | { 297 | return isalpha(c); 298 | } 299 | static int matchwhitespace(char c) 300 | { 301 | return isspace(c); 302 | } 303 | static int matchalphanum(char c) 304 | { 305 | return ((c == '_') || matchalpha(c) || matchdigit(c)); 306 | } 307 | static int matchrange(char c, const char* str) 308 | { 309 | return ( (c != '-') 310 | && (str[0] != '\0') 311 | && (str[0] != '-') 312 | && (str[1] == '-') 313 | && (str[2] != '\0') 314 | && ( (c >= str[0]) 315 | && (c <= str[2]))); 316 | } 317 | static int matchdot(char c) 318 | { 319 | #if defined(RE_DOT_MATCHES_NEWLINE) && (RE_DOT_MATCHES_NEWLINE == 1) 320 | (void)c; 321 | return 1; 322 | #else 323 | return c != '\n' && c != '\r'; 324 | #endif 325 | } 326 | static int ismetachar(char c) 327 | { 328 | return ((c == 's') || (c == 'S') || (c == 'w') || (c == 'W') || (c == 'd') || (c == 'D')); 329 | } 330 | 331 | 
static int matchmetachar(char c, const char* str) 332 | { 333 | switch (str[0]) 334 | { 335 | case 'd': return matchdigit(c); 336 | case 'D': return !matchdigit(c); 337 | case 'w': return matchalphanum(c); 338 | case 'W': return !matchalphanum(c); 339 | case 's': return matchwhitespace(c); 340 | case 'S': return !matchwhitespace(c); 341 | default: return (c == str[0]); 342 | } 343 | } 344 | 345 | static int matchcharclass(char c, const char* str) 346 | { 347 | do 348 | { 349 | if (matchrange(c, str)) 350 | { 351 | return 1; 352 | } 353 | else if (str[0] == '\\') 354 | { 355 | /* Escape-char: increment str-ptr and match on next char */ 356 | str += 1; 357 | if (matchmetachar(c, str)) 358 | { 359 | return 1; 360 | } 361 | else if ((c == str[0]) && !ismetachar(c)) 362 | { 363 | return 1; 364 | } 365 | } 366 | else if (c == str[0]) 367 | { 368 | if (c == '-') 369 | { 370 | return ((str[-1] == '\0') || (str[1] == '\0')); 371 | } 372 | else 373 | { 374 | return 1; 375 | } 376 | } 377 | } 378 | while (*str++ != '\0'); 379 | 380 | return 0; 381 | } 382 | 383 | static int matchone(regex_t p, char c) 384 | { 385 | switch (p.type) 386 | { 387 | case DOT: return matchdot(c); 388 | case CHAR_CLASS: return matchcharclass(c, (const char*)p.u.ccl); 389 | case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p.u.ccl); 390 | case DIGIT: return matchdigit(c); 391 | case NOT_DIGIT: return !matchdigit(c); 392 | case ALPHA: return matchalphanum(c); 393 | case NOT_ALPHA: return !matchalphanum(c); 394 | case WHITESPACE: return matchwhitespace(c); 395 | case NOT_WHITESPACE: return !matchwhitespace(c); 396 | default: return (p.u.ch == c); 397 | } 398 | } 399 | 400 | static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength) 401 | { 402 | int prelen = *matchlength; 403 | const char* prepoint = text; 404 | while ((text[0] != '\0') && matchone(p, *text)) 405 | { 406 | text++; 407 | (*matchlength)++; 408 | } 409 | while (text >= prepoint) 410 | { 411 | if 
(matchpattern(pattern, text--, matchlength)) 412 | return 1; 413 | (*matchlength)--; 414 | } 415 | 416 | *matchlength = prelen; 417 | return 0; 418 | } 419 | 420 | static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength) 421 | { 422 | const char* prepoint = text; 423 | while ((text[0] != '\0') && matchone(p, *text)) 424 | { 425 | text++; 426 | (*matchlength)++; 427 | } 428 | while (text > prepoint) 429 | { 430 | if (matchpattern(pattern, text--, matchlength)) 431 | return 1; 432 | (*matchlength)--; 433 | } 434 | 435 | return 0; 436 | } 437 | 438 | static int matchquestion(regex_t p, regex_t* pattern, const char* text, int* matchlength) 439 | { 440 | if (p.type == UNUSED) 441 | return 1; 442 | if (matchpattern(pattern, text, matchlength)) 443 | return 1; 444 | if (*text && matchone(p, *text++)) 445 | { 446 | if (matchpattern(pattern, text, matchlength)) 447 | { 448 | (*matchlength)++; 449 | return 1; 450 | } 451 | } 452 | return 0; 453 | } 454 | 455 | 456 | #if 0 457 | 458 | /* Recursive matching */ 459 | static int matchpattern(regex_t* pattern, const char* text, int *matchlength) 460 | { 461 | int pre = *matchlength; 462 | if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) 463 | { 464 | return matchquestion(pattern[1], &pattern[2], text, matchlength); 465 | } 466 | else if (pattern[1].type == STAR) 467 | { 468 | return matchstar(pattern[0], &pattern[2], text, matchlength); 469 | } 470 | else if (pattern[1].type == PLUS) 471 | { 472 | return matchplus(pattern[0], &pattern[2], text, matchlength); 473 | } 474 | else if ((pattern[0].type == END) && pattern[1].type == UNUSED) 475 | { 476 | return text[0] == '\0'; 477 | } 478 | else if ((text[0] != '\0') && matchone(pattern[0], text[0])) 479 | { 480 | (*matchlength)++; 481 | return matchpattern(&pattern[1], text+1); 482 | } 483 | else 484 | { 485 | *matchlength = pre; 486 | return 0; 487 | } 488 | } 489 | 490 | #else 491 | 492 | /* Iterative matching */ 493 | static int 
matchpattern(regex_t* pattern, const char* text, int* matchlength) 494 | { 495 | int pre = *matchlength; 496 | do 497 | { 498 | if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) 499 | { 500 | return matchquestion(pattern[0], &pattern[2], text, matchlength); 501 | } 502 | else if (pattern[1].type == STAR) 503 | { 504 | return matchstar(pattern[0], &pattern[2], text, matchlength); 505 | } 506 | else if (pattern[1].type == PLUS) 507 | { 508 | return matchplus(pattern[0], &pattern[2], text, matchlength); 509 | } 510 | else if ((pattern[0].type == END) && pattern[1].type == UNUSED) 511 | { 512 | return (text[0] == '\0'); 513 | } 514 | /* Branching is not working properly 515 | else if (pattern[1].type == BRANCH) 516 | { 517 | return (matchpattern(pattern, text) || matchpattern(&pattern[2], text)); 518 | } 519 | */ 520 | (*matchlength)++; 521 | } 522 | while ((text[0] != '\0') && matchone(*pattern++, *text++)); 523 | 524 | *matchlength = pre; 525 | return 0; 526 | } 527 | 528 | #endif 529 | -------------------------------------------------------------------------------- /re.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Mini regex-module inspired by Rob Pike's regex code described in: 4 | * 5 | * http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html 6 | * 7 | * 8 | * 9 | * Supports: 10 | * --------- 11 | * '.' Dot, matches any character 12 | * '^' Start anchor, matches beginning of string 13 | * '$' End anchor, matches end of string 14 | * '*' Asterisk, match zero or more (greedy) 15 | * '+' Plus, match one or more (greedy) 16 | * '?' Question, match zero or one (non-greedy) 17 | * '[abc]' Character class, match if one of {'a', 'b', 'c'} 18 | * '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken! 
19 | * '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z } 20 | * '\s' Whitespace, \t \f \r \n \v and spaces 21 | * '\S' Non-whitespace 22 | * '\w' Alphanumeric, [a-zA-Z0-9_] 23 | * '\W' Non-alphanumeric 24 | * '\d' Digits, [0-9] 25 | * '\D' Non-digits 26 | * 27 | * 28 | */ 29 | 30 | #ifndef _TINY_REGEX_C 31 | #define _TINY_REGEX_C 32 | 33 | 34 | #ifndef RE_DOT_MATCHES_NEWLINE 35 | /* Define to 0 if you DON'T want '.' to match '\r' + '\n' */ 36 | #define RE_DOT_MATCHES_NEWLINE 1 37 | #endif 38 | 39 | #ifdef __cplusplus 40 | extern "C"{ 41 | #endif 42 | 43 | 44 | 45 | /* Typedef'd pointer to get abstract datatype. */ 46 | typedef struct regex_t* re_t; 47 | 48 | 49 | /* Compile regex string pattern to a regex_t-array. */ 50 | re_t re_compile(const char* pattern); 51 | 52 | 53 | /* Find matches of the compiled pattern inside text. */ 54 | int re_matchp(re_t pattern, const char* text, int* matchlength); 55 | 56 | 57 | /* Find matches of the txt pattern inside text (will compile automatically first). */ 58 | int re_match(const char* pattern, const char* text, int* matchlength); 59 | 60 | 61 | #ifdef __cplusplus 62 | } 63 | #endif 64 | 65 | #endif /* ifndef _TINY_REGEX_C */ 66 | -------------------------------------------------------------------------------- /scripts/exrex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # This file is part of exrex. 5 | # 6 | # exrex is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU Affero General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # exrex is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 14 | # GNU Affero General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU Affero General Public License 17 | # along with exrex. If not, see < http://www.gnu.org/licenses/ >. 18 | # 19 | # (C) 2012- by Adam Tauber, 20 | 21 | try: 22 | from future_builtins import map, range 23 | except: 24 | pass 25 | from re import match, U 26 | try: 27 | import re._parser as sre_parse 28 | except ImportError: # Python < 3.11 29 | from re import sre_parse 30 | from itertools import tee 31 | from random import choice, randint 32 | from types import GeneratorType 33 | 34 | from sys import version_info 35 | IS_PY3 = version_info[0] == 3 36 | IS_PY36_OR_GREATER = IS_PY3 and version_info[1] > 5 37 | 38 | if IS_PY3: 39 | unichr = chr 40 | 41 | __all__ = ( 42 | 'generate', 43 | 'CATEGORIES', 44 | 'count', 45 | 'parse', 46 | 'getone', 47 | 'sre_to_string', 48 | 'simplify' 49 | ) 50 | 51 | CATEGORIES = { 52 | sre_parse.CATEGORY_SPACE: sorted(sre_parse.WHITESPACE), 53 | sre_parse.CATEGORY_DIGIT: sorted(sre_parse.DIGITS), 54 | #sre_parse.CATEGORY_WORD: [unichr(x) for x in range(256) if 55 | # match(r'\w', unichr(x), U)], 56 | sre_parse.CATEGORY_WORD: list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'), 57 | sre_parse.CATEGORY_NOT_WORD: [unichr(x) for x in range(256) if 58 | match(r'\W', unichr(x), U)], 59 | 'category_any': [unichr(x) for x in range(32, 123)] 60 | } 61 | 62 | 63 | def _build_reverse_categories(): 64 | reverse = {} 65 | for key, value in sre_parse.CATEGORIES.items(): 66 | if not hasattr(value[1], '__iter__'): 67 | continue 68 | 69 | for vv in value[1]: 70 | if value[0] == sre_parse.IN and vv[0] == sre_parse.CATEGORY: 71 | reverse.update({vv[1]: key}) 72 | 73 | return reverse 74 | 75 | 76 | REVERSE_CATEGORIES = _build_reverse_categories() 77 | 78 | 79 | def comb(g, i): 80 | for c in g: 81 | g2, i = tee(i) 82 | for c2 in g2: 83 | yield c + c2 84 | 85 | 86 | def mappend(g, c): 87 | for cc in g: 88 | yield cc + c 
89 | 90 | 91 | def dappend(g, d, k): 92 | for cc in g: 93 | yield cc + d[k] 94 | 95 | 96 | def _in(d): 97 | ret = [] 98 | neg = False 99 | for i in d: 100 | if i[0] == sre_parse.RANGE: 101 | subs = map(unichr, range(i[1][0], i[1][1] + 1)) 102 | if neg: 103 | for char in subs: 104 | try: 105 | ret.remove(char) 106 | except: 107 | pass 108 | else: 109 | ret.extend(subs) 110 | elif i[0] == sre_parse.LITERAL: 111 | if neg: 112 | try: 113 | ret.remove(unichr(i[1])) 114 | except: 115 | pass 116 | else: 117 | ret.append(unichr(i[1])) 118 | elif i[0] == sre_parse.CATEGORY: 119 | subs = CATEGORIES.get(i[1], ['']) 120 | if neg: 121 | for char in subs: 122 | try: 123 | ret.remove(char) 124 | except: 125 | pass 126 | else: 127 | ret.extend(subs) 128 | elif i[0] == sre_parse.NEGATE: 129 | ret = list(CATEGORIES['category_any']) 130 | neg = True 131 | return ret 132 | 133 | 134 | def prods(orig, ran, items, limit, grouprefs): 135 | for o in orig: 136 | for r in ran: 137 | if r == 0: 138 | yield o 139 | else: 140 | ret = [o] 141 | for _ in range(r): 142 | ret = ggen( 143 | ret, _gen, items, limit=limit, count=False, grouprefs=grouprefs) 144 | for i in ret: 145 | yield i 146 | 147 | 148 | def ggen(g1, f, *args, **kwargs): 149 | groupref = None 150 | grouprefs = kwargs.get('grouprefs', {}) 151 | if 'groupref' in kwargs.keys(): 152 | groupref = kwargs.pop('groupref') 153 | for a in g1: 154 | g2 = f(*args, **kwargs) 155 | if isinstance(g2, GeneratorType): 156 | for b in g2: 157 | grouprefs[groupref] = b 158 | yield a + b 159 | else: 160 | yield g2 161 | 162 | 163 | def concit(g1, seqs, limit, grouprefs): 164 | for a in g1: 165 | for s in seqs: 166 | for b in _gen(s, limit, grouprefs=grouprefs): 167 | yield a + b 168 | 169 | 170 | def _gen(d, limit=20, count=False, grouprefs=None): 171 | """docstring for _gen""" 172 | if grouprefs is None: 173 | grouprefs = {} 174 | ret = [''] 175 | strings = 0 176 | literal = False 177 | for i in d: 178 | if i[0] == sre_parse.IN: 179 | subs = 
_in(i[1]) 180 | if count: 181 | strings = (strings or 1) * len(subs) 182 | ret = comb(ret, subs) 183 | elif i[0] == sre_parse.LITERAL: 184 | literal = True 185 | ret = mappend(ret, unichr(i[1])) 186 | elif i[0] == sre_parse.CATEGORY: 187 | subs = CATEGORIES.get(i[1], ['']) 188 | if count: 189 | strings = (strings or 1) * len(subs) 190 | ret = comb(ret, subs) 191 | elif i[0] == sre_parse.ANY: 192 | subs = CATEGORIES['category_any'] 193 | if count: 194 | strings = (strings or 1) * len(subs) 195 | ret = comb(ret, subs) 196 | elif i[0] == sre_parse.MAX_REPEAT or i[0] == sre_parse.MIN_REPEAT: 197 | items = list(i[1][2]) 198 | if i[1][1] + 1 - i[1][0] >= limit: 199 | r1 = i[1][0] 200 | r2 = i[1][0] + limit 201 | else: 202 | r1 = i[1][0] 203 | r2 = i[1][1] + 1 204 | ran = range(r1, r2) 205 | if count: 206 | branch_count = 0 207 | for p in ran: 208 | branch_count += pow(_gen(items, limit, True, grouprefs), p) 209 | strings = (strings or 1) * branch_count 210 | 211 | ret = prods(ret, ran, items, limit, grouprefs) 212 | elif i[0] == sre_parse.BRANCH: 213 | if count: 214 | for x in i[1][1]: 215 | strings += _gen(x, limit, True, grouprefs) or 1 216 | ret = concit(ret, i[1][1], limit, grouprefs) 217 | elif i[0] == sre_parse.SUBPATTERN or i[0] == sre_parse.ASSERT: 218 | subexpr = i[1][1] 219 | if IS_PY36_OR_GREATER and i[0] == sre_parse.SUBPATTERN: 220 | subexpr = i[1][3] 221 | if count: 222 | strings = ( 223 | strings or 1) * (sum(ggen([0], _gen, subexpr, limit=limit, count=True, grouprefs=grouprefs)) or 1) 224 | ret = ggen(ret, _gen, subexpr, limit=limit, count=False, grouprefs=grouprefs, groupref=i[1][0]) 225 | # ignore ^ and $ 226 | elif i[0] == sre_parse.AT: 227 | continue 228 | elif i[0] == sre_parse.NOT_LITERAL: 229 | subs = list(CATEGORIES['category_any']) 230 | if unichr(i[1]) in subs: 231 | subs.remove(unichr(i[1])) 232 | if count: 233 | strings = (strings or 1) * len(subs) 234 | ret = comb(ret, subs) 235 | elif i[0] == sre_parse.GROUPREF: 236 | ret = dappend(ret, 
grouprefs, i[1]) 237 | elif i[0] == sre_parse.ASSERT_NOT: 238 | pass 239 | else: 240 | print('[!] cannot handle expression ' + repr(i)) 241 | 242 | if count: 243 | if strings == 0 and literal: 244 | inc = True 245 | for i in d: 246 | if i[0] not in (sre_parse.AT, sre_parse.LITERAL): 247 | inc = False 248 | if inc: 249 | strings = 1 250 | return strings 251 | 252 | return ret 253 | 254 | 255 | def _randone(d, limit=20, grouprefs=None): 256 | if grouprefs is None: 257 | grouprefs = {} 258 | """docstring for _randone""" 259 | ret = '' 260 | for i in d: 261 | if i[0] == sre_parse.IN: 262 | ret += choice(_in(i[1])) 263 | elif i[0] == sre_parse.LITERAL: 264 | ret += unichr(i[1]) 265 | elif i[0] == sre_parse.CATEGORY: 266 | ret += choice(CATEGORIES.get(i[1], [''])) 267 | elif i[0] == sre_parse.ANY: 268 | ret += choice(CATEGORIES['category_any']) 269 | elif i[0] == sre_parse.MAX_REPEAT or i[0] == sre_parse.MIN_REPEAT: 270 | if i[1][1] + 1 - i[1][0] >= limit: 271 | min, max = i[1][0], i[1][0] + limit - 1 272 | else: 273 | min, max = i[1][0], i[1][1] 274 | for _ in range(randint(min, max)): 275 | ret += _randone(list(i[1][2]), limit, grouprefs) 276 | elif i[0] == sre_parse.BRANCH: 277 | ret += _randone(choice(i[1][1]), limit, grouprefs) 278 | elif i[0] == sre_parse.SUBPATTERN or i[0] == sre_parse.ASSERT: 279 | subexpr = i[1][1] 280 | if IS_PY36_OR_GREATER and i[0] == sre_parse.SUBPATTERN: 281 | subexpr = i[1][3] 282 | subp = _randone(subexpr, limit, grouprefs) 283 | if i[1][0]: 284 | grouprefs[i[1][0]] = subp 285 | ret += subp 286 | elif i[0] == sre_parse.AT: 287 | continue 288 | elif i[0] == sre_parse.NOT_LITERAL: 289 | c = list(CATEGORIES['category_any']) 290 | if unichr(i[1]) in c: 291 | c.remove(unichr(i[1])) 292 | ret += choice(c) 293 | elif i[0] == sre_parse.GROUPREF: 294 | ret += grouprefs[i[1]] 295 | elif i[0] == sre_parse.ASSERT_NOT: 296 | pass 297 | else: 298 | print('[!] 
cannot handle expression "%s"' % str(i)) 299 | 300 | return ret 301 | 302 | 303 | def sre_to_string(sre_obj, paren=True): 304 | """sre_parse object to string 305 | 306 | :param sre_obj: Output of sre_parse.parse() 307 | :type sre_obj: list 308 | :rtype: str 309 | """ 310 | ret = u'' 311 | for i in sre_obj: 312 | if i[0] == sre_parse.IN: 313 | prefix = '' 314 | if len(i[1]) and i[1][0][0] == sre_parse.NEGATE: 315 | prefix = '^' 316 | ret += u'[{0}{1}]'.format(prefix, sre_to_string(i[1], paren=paren)) 317 | elif i[0] == sre_parse.LITERAL: 318 | u = unichr(i[1]) 319 | ret += u if u not in sre_parse.SPECIAL_CHARS else '\\{0}'.format(u) 320 | elif i[0] == sre_parse.CATEGORY: 321 | ret += REVERSE_CATEGORIES[i[1]] 322 | elif i[0] == sre_parse.ANY: 323 | ret += '.' 324 | elif i[0] == sre_parse.BRANCH: 325 | # TODO simplifications here 326 | parts = [sre_to_string(x, paren=paren) for x in i[1][1]] 327 | if not any(parts): 328 | continue 329 | if i[1][0]: 330 | if len(parts) == 1: 331 | paren = False 332 | prefix = '' 333 | else: 334 | prefix = '?:' 335 | branch = '|'.join(parts) 336 | if paren: 337 | ret += '({0}{1})'.format(prefix, branch) 338 | else: 339 | ret += '{0}'.format(branch) 340 | elif i[0] == sre_parse.SUBPATTERN: 341 | subexpr = i[1][1] 342 | if IS_PY36_OR_GREATER and i[0] == sre_parse.SUBPATTERN: 343 | subexpr = i[1][3] 344 | if i[1][0]: 345 | ret += '({0})'.format(sre_to_string(subexpr, paren=False)) 346 | else: 347 | ret += '{0}'.format(sre_to_string(subexpr, paren=paren)) 348 | elif i[0] == sre_parse.NOT_LITERAL: 349 | ret += '[^{0}]'.format(unichr(i[1])) 350 | elif i[0] == sre_parse.MAX_REPEAT: 351 | if i[1][0] == i[1][1]: 352 | range_str = '{{{0}}}'.format(i[1][0]) 353 | else: 354 | if i[1][0] == 0 and i[1][1] - i[1][0] == sre_parse.MAXREPEAT: 355 | range_str = '*' 356 | elif i[1][0] == 1 and i[1][1] - i[1][0] == sre_parse.MAXREPEAT - 1: 357 | range_str = '+' 358 | else: 359 | range_str = '{{{0},{1}}}'.format(i[1][0], i[1][1]) 360 | ret += 
sre_to_string(i[1][2], paren=paren) + range_str 361 | elif i[0] == sre_parse.MIN_REPEAT: 362 | if i[1][0] == 0 and i[1][1] == sre_parse.MAXREPEAT: 363 | range_str = '*?' 364 | elif i[1][0] == 1 and i[1][1] == sre_parse.MAXREPEAT: 365 | range_str = '+?' 366 | elif i[1][1] == sre_parse.MAXREPEAT: 367 | range_str = '{{{0},}}?'.format(i[1][0]) 368 | else: 369 | range_str = '{{{0},{1}}}?'.format(i[1][0], i[1][1]) 370 | ret += sre_to_string(i[1][2], paren=paren) + range_str 371 | elif i[0] == sre_parse.GROUPREF: 372 | ret += '\\{0}'.format(i[1]) 373 | elif i[0] == sre_parse.AT: 374 | if i[1] == sre_parse.AT_BEGINNING: 375 | ret += '^' 376 | elif i[1] == sre_parse.AT_END: 377 | ret += '$' 378 | elif i[0] == sre_parse.NEGATE: 379 | pass 380 | elif i[0] == sre_parse.RANGE: 381 | ret += '{0}-{1}'.format(unichr(i[1][0]), unichr(i[1][1])) 382 | elif i[0] == sre_parse.ASSERT: 383 | if i[1][0]: 384 | ret += '(?={0})'.format(sre_to_string(i[1][1], paren=False)) 385 | else: 386 | ret += '{0}'.format(sre_to_string(i[1][1], paren=paren)) 387 | elif i[0] == sre_parse.ASSERT_NOT: 388 | pass 389 | else: 390 | print('[!] 
cannot handle expression "%s"' % str(i)) 391 | return ret 392 | 393 | 394 | def simplify(regex_string): 395 | """Simplify a regular expression 396 | 397 | :param regex_string: Regular expression 398 | :type regex_string: str 399 | :rtype: str 400 | """ 401 | r = parse(regex_string) 402 | return sre_to_string(r) 403 | 404 | 405 | def parse(s): 406 | """Regular expression parser 407 | 408 | :param s: Regular expression 409 | :type s: str 410 | :rtype: list 411 | """ 412 | if IS_PY3: 413 | r = sre_parse.parse(s, flags=U) 414 | else: 415 | r = sre_parse.parse(s.decode('utf-8'), flags=U) 416 | return list(r) 417 | 418 | 419 | def generate(s, limit=20): 420 | """Creates a generator that generates all matching strings to a given regular expression 421 | 422 | :param s: Regular expression 423 | :type s: str 424 | :param limit: Range limit 425 | :type limit: int 426 | :returns: string generator object 427 | """ 428 | return _gen(parse(s), limit) 429 | 430 | 431 | def count(s, limit=20): 432 | """Counts all matching strings to a given regular expression 433 | 434 | :param s: Regular expression 435 | :type s: str 436 | :param limit: Range limit 437 | :type limit: int 438 | :rtype: int 439 | :returns: number of matching strings 440 | """ 441 | return _gen(parse(s), limit, count=True) 442 | 443 | 444 | def getone(regex_string, limit=20): 445 | """Returns a random matching string to a given regular expression 446 | """ 447 | return _randone(parse(regex_string), limit) 448 | 449 | 450 | def argparser(): 451 | import argparse 452 | from sys import stdout 453 | argp = argparse.ArgumentParser( 454 | description='exrex - regular expression string generator') 455 | argp.add_argument( 456 | '-o', '--output', 457 | help='Output file - default is STDOUT', 458 | metavar='FILE', 459 | default=stdout, 460 | type=argparse.FileType('w', encoding='utf-8') 461 | ) 462 | argp.add_argument( 463 | '-l', '--limit', 464 | help='Max limit for range size - default is 20', 465 | default=20, 466 | 
action='store', 467 | type=int, 468 | metavar='N' 469 | ) 470 | argp.add_argument( 471 | '-c', '--count', 472 | help='Count matching strings', 473 | default=False, 474 | action='store_true' 475 | ) 476 | argp.add_argument( 477 | '-m', '--max-number', 478 | help='Max number of strings - default is -1', 479 | default=-1, 480 | action='store', 481 | type=int, 482 | metavar='N' 483 | ) 484 | argp.add_argument( 485 | '-r', '--random', 486 | help='Returns a random string that matches to the regex', 487 | default=False, 488 | action='store_true' 489 | ) 490 | argp.add_argument( 491 | '-s', '--simplify', 492 | help='Simplifies a regular expression', 493 | default=False, 494 | action='store_true' 495 | ) 496 | argp.add_argument( 497 | '-d', '--delimiter', 498 | help='Delimiter - default is \\n', 499 | default='\n' 500 | ) 501 | argp.add_argument( 502 | '-v', '--verbose', 503 | action='store_true', 504 | help='Verbose mode', 505 | default=False 506 | ) 507 | argp.add_argument( 508 | 'regex', 509 | metavar='REGEX', 510 | help='REGEX string' 511 | ) 512 | return vars(argp.parse_args()) 513 | 514 | 515 | def __main__(): 516 | from sys import exit, stderr 517 | args = argparser() 518 | if args['verbose']: 519 | args['output'].write( 520 | '%r%s' % (parse(args['regex']), args['delimiter'])) 521 | if args['count']: 522 | args['output'].write( 523 | '%d%s' % (count(args['regex'], limit=args['limit']), args['delimiter'])) 524 | exit(0) 525 | if args['random']: 526 | args['output'].write( 527 | '%s%s' % (getone(args['regex'], limit=args['limit']), args['delimiter'])) 528 | exit(0) 529 | if args['simplify']: 530 | args['output'].write( 531 | '%s%s' % (simplify(args['regex']), args['delimiter'])) 532 | exit(0) 533 | try: 534 | g = generate(args['regex'], args['limit']) 535 | except Exception as e: 536 | stderr.write('[!] 
Error: %s\n' % e) 537 | exit(1) 538 | args['output'].write(next(g)) 539 | args['max_number'] -= 1 540 | for s in g: 541 | if args['max_number'] == 0: 542 | break 543 | args['max_number'] -= 1 544 | args['output'].write(args['delimiter']) 545 | args['output'].write(s) 546 | if args['delimiter'] == '\n': 547 | args['output'].write('\n') 548 | 549 | 550 | if __name__ == '__main__': 551 | __main__() 552 | -------------------------------------------------------------------------------- /scripts/regex_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This program generates random text that matches a given regex-pattern. 5 | The pattern is given via sys.argv and the generated text is passed to 6 | the binary 'tests/test_rand' to check if the generated text also matches 7 | the regex-pattern in the C implementation. 8 | The exit-code of the testing program, is used to determine test success. 9 | 10 | This script is called by the Makefile when doing 'make test' 11 | """ 12 | 13 | 14 | import re 15 | import sys 16 | import exrex 17 | from subprocess import call 18 | 19 | 20 | def get_one(pattern): 21 | """Ensure that Python's re-module agrees the example matches the pattern""" 22 | while True: 23 | p = exrex.getone(pattern) 24 | m = re.match(pattern, p) 25 | if m: 26 | return p 27 | 28 | prog = "./tests/test_rand" 29 | 30 | if len(sys.argv) < 2: 31 | print("") 32 | print("usage: %s pattern [nrepeat]" % sys.argv[0]) 33 | print(" where [nrepeat] is optional") 34 | print("") 35 | sys.exit(-1) 36 | 37 | own_prog = sys.argv[0] 38 | pattern = sys.argv[1] 39 | if len(sys.argv) > 2: 40 | ntests = int(sys.argv[2]) 41 | else: 42 | ntests = 10 43 | nfails = 0 44 | repeats = ntests 45 | 46 | 47 | try: 48 | repeats = int(sys.argv[2]) 49 | except: 50 | pass 51 | 52 | 53 | sys.stdout.write("%-35s" % (" pattern '%s': " % pattern)) 54 | 55 | 56 | while repeats >= 0: 57 | try: 58 | repeats -= 1 59 | example = 
get_one(pattern) 60 | #print("%s %s %s" % (prog, pattern, example)) 61 | ret = call([prog, "\"%s\"" % pattern, "\"%s\"" % example]) 62 | if ret != 0: 63 | escaped = repr(example) # escapes special chars for better printing 64 | print(" FAIL : doesn't match %s as expected [%s]." % (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]) )) 65 | nfails += 1 66 | 67 | except: 68 | import traceback 69 | print("EXCEPTION!", traceback.format_exc()) 70 | ntests -= 1 71 | repeats += 1 72 | #nfails += 1 73 | 74 | sys.stdout.write("%4d/%d tests succeeded \n" % (ntests - nfails, ntests)) 75 | #print("") 76 | 77 | -------------------------------------------------------------------------------- /scripts/regex_test_neg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This program generates random text that matches a given regex-pattern. 5 | The pattern is given via sys.argv and the generated text is passed to 6 | the binary 'tests/test_rand' to check if the generated text also matches 7 | the regex-pattern in the C implementation. 8 | The exit-code of the testing program, is used to determine test success. 
9 | 10 | This script is called by the Makefile when doing 'make test' 11 | """ 12 | 13 | 14 | import re 15 | import sys 16 | import string 17 | import random 18 | from subprocess import call 19 | 20 | 21 | prog = "./tests/test_rand_neg" 22 | 23 | if len(sys.argv) < 2: 24 | print("") 25 | print("usage: %s pattern [nrepeat]" % sys.argv[0]) 26 | print(" where [nrepeat] is optional") 27 | print("") 28 | sys.exit(-1) 29 | 30 | own_prog = sys.argv[0] 31 | pattern = sys.argv[1] 32 | if len(sys.argv) > 2: 33 | ntests = int(sys.argv[2]) 34 | else: 35 | ntests = 10 36 | nfails = 0 37 | repeats = ntests 38 | 39 | 40 | try: 41 | repeats = int(sys.argv[2]) 42 | except: 43 | pass 44 | 45 | sys.stdout.write("%-35s" % (" pattern '%s': " % pattern)) 46 | 47 | 48 | 49 | 50 | def gen_no_match(pattern, minlen=1, maxlen=50, maxattempts=500): 51 | nattempts = 0 52 | while True: 53 | nattempts += 1 54 | ret = "".join([random.choice(string.printable) for i in range(random.Random().randint(minlen, maxlen))]) 55 | if re.findall(pattern, ret) == []: 56 | return ret 57 | if nattempts >= maxattempts: 58 | raise Exception("Could not generate string that did not match the regex pattern '%s' after %d attempts" % (pattern, nattempts)) 59 | 60 | 61 | 62 | while repeats >= 0: 63 | try: 64 | repeats -= 1 65 | example = gen_no_match(pattern) 66 | #print("%s %s %s" % (prog, pattern, example)) 67 | ret = call([prog, "\"%s\"" % pattern, "\"%s\"" % example]) 68 | if ret != 0: 69 | escaped = repr(example) # escapes special chars for better printing 70 | print(" FAIL : matches %s unexpectedly [%s]." 
% (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]) )) 71 | nfails += 1 72 | 73 | except: 74 | #import traceback 75 | #print("EXCEPTION!") 76 | #raw_input(traceback.format_exc()) 77 | ntests -= 1 78 | repeats += 1 79 | #nfails += 1 80 | 81 | sys.stdout.write("%4d/%d tests succeeded \n" % (ntests - nfails, ntests)) 82 | #print("") 83 | -------------------------------------------------------------------------------- /tests/test1.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Testing various regex-patterns 3 | */ 4 | 5 | #include 6 | #include 7 | #include "re.h" 8 | 9 | 10 | #define OK ((char*) 1) 11 | #define NOK ((char*) 0) 12 | 13 | 14 | char* test_vector[][4] = 15 | { 16 | { OK, "\\d", "5", (char*) 1 }, 17 | { OK, "\\w+", "hej", (char*) 3 }, 18 | { OK, "\\s", "\t \n", (char*) 1 }, 19 | { NOK, "\\S", "\t \n", (char*) 0 }, 20 | { OK, "[\\s]", "\t \n", (char*) 1 }, 21 | { NOK, "[\\S]", "\t \n", (char*) 0 }, 22 | { NOK, "\\D", "5", (char*) 0 }, 23 | { NOK, "\\W+", "hej", (char*) 0 }, 24 | { OK, "[0-9]+", "12345", (char*) 5 }, 25 | { OK, "\\D", "hej", (char*) 1 }, 26 | { NOK, "\\d", "hej", (char*) 0 }, 27 | { OK, "[^\\w]", "\\", (char*) 1 }, 28 | { OK, "[\\W]", "\\", (char*) 1 }, 29 | { NOK, "[\\w]", "\\", (char*) 0 }, 30 | { OK, "[^\\d]", "d", (char*) 1 }, 31 | { NOK, "[\\d]", "d", (char*) 0 }, 32 | { NOK, "[^\\D]", "d", (char*) 0 }, 33 | { OK, "[\\D]", "d", (char*) 1 }, 34 | { OK, "^.*\\\\.*$", "c:\\Tools", (char*) 8 }, 35 | { OK, "^.*\\\\.*$", "c:\\Tools", (char*) 8 }, 36 | { OK, ".?\\w+jsj$", "%JxLLcVx8wxrjsj", (char*) 15 }, 37 | { OK, ".?\\w+jsj$", "=KbvUQjsj", (char*) 9 }, 38 | { OK, ".?\\w+jsj$", "^uDnoZjsj", (char*) 9 }, 39 | { OK, ".?\\w+jsj$", "UzZbjsj", (char*) 7 }, 40 | { OK, ".?\\w+jsj$", "\"wjsj", (char*) 5 }, 41 | { OK, ".?\\w+jsj$", "zLa_FTEjsj", (char*) 10 }, 42 | { OK, ".?\\w+jsj$", "\"mw3p8_Ojsj", (char*) 11 }, 43 | { OK, "^[\\+-]*[\\d]+$", "+27", (char*) 3 }, 44 | { OK, "[abc]", "1c2", 
(char*) 1 }, 45 | { NOK, "[abc]", "1C2", (char*) 0 }, 46 | { OK, "[1-5]+", "0123456789", (char*) 5 }, 47 | { OK, "[.2]", "1C2", (char*) 1 }, 48 | { OK, "a*$", "Xaa", (char*) 2 }, 49 | { OK, "a*$", "Xaa", (char*) 2 }, 50 | { OK, "[a-h]+", "abcdefghxxx", (char*) 8 }, 51 | { NOK, "[a-h]+", "ABCDEFGH", (char*) 0 }, 52 | { OK, "[A-H]+", "ABCDEFGH", (char*) 8 }, 53 | { NOK, "[A-H]+", "abcdefgh", (char*) 0 }, 54 | { OK, "[^\\s]+", "abc def", (char*) 3 }, 55 | { OK, "[^fc]+", "abc def", (char*) 2 }, 56 | { OK, "[^d\\sf]+", "abc def", (char*) 3 }, 57 | { OK, "\n", "abc\ndef", (char*) 1 }, 58 | { OK, "b.\\s*\n", "aa\r\nbb\r\ncc\r\n\r\n",(char*) 4 }, 59 | { OK, ".*c", "abcabc", (char*) 6 }, 60 | { OK, ".+c", "abcabc", (char*) 6 }, 61 | { OK, "[b-z].*", "ab", (char*) 1 }, 62 | { OK, "b[k-z]*", "ab", (char*) 1 }, 63 | { NOK, "[0-9]", " - ", (char*) 0 }, 64 | { OK, "[^0-9]", " - ", (char*) 1 }, 65 | { OK, "0|", "0|", (char*) 2 }, 66 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "0s:00:00", (char*) 0 }, 67 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "000:00", (char*) 0 }, 68 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "00:0000", (char*) 0 }, 69 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "100:0:00", (char*) 0 }, 70 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "00:100:00", (char*) 0 }, 71 | { NOK, "\\d\\d:\\d\\d:\\d\\d", "0:00:100", (char*) 0 }, 72 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:0:0", (char*) 5 }, 73 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:00:0", (char*) 6 }, 74 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:0:00", (char*) 5 }, 75 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:0:0", (char*) 6 }, 76 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:00:0", (char*) 7 }, 77 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:0:00", (char*) 6 }, 78 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:00:00", (char*) 6 }, 79 | { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:00:00", (char*) 7 }, 80 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !", (char*) 12 }, 81 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello world !", (char*) 12 }, 82 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello World !", (char*) 12 }, 
83 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world! ", (char*) 11 }, 84 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !", (char*) 13 }, 85 | { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello World !", (char*) 15 }, 86 | { NOK, "\\d\\d?:\\d\\d?:\\d\\d?", "a:0", (char*) 0 }, /* Failing test case reported in https://github.com/kokke/tiny-regex-c/issues/12 */ 87 | /* 88 | { OK, "[^\\w][^-1-4]", ")T", (char*) 2 }, 89 | { OK, "[^\\w][^-1-4]", ")^", (char*) 2 }, 90 | { OK, "[^\\w][^-1-4]", "*)", (char*) 2 }, 91 | { OK, "[^\\w][^-1-4]", "!.", (char*) 2 }, 92 | { OK, "[^\\w][^-1-4]", " x", (char*) 2 }, 93 | { OK, "[^\\w][^-1-4]", "$b", (char*) 2 }, 94 | */ 95 | { OK, ".?bar", "real_bar", (char*) 4 }, 96 | { NOK, ".?bar", "real_foo", (char*) 0 }, 97 | { NOK, "X?Y", "Z", (char*) 0 }, 98 | { OK, "[a-z]+\nbreak", "blahblah\nbreak", (char*) 14 }, 99 | { OK, "[a-z\\s]+\nbreak", "bla bla \nbreak", (char*) 14 }, 100 | }; 101 | 102 | 103 | void re_print(re_t); 104 | 105 | int main() 106 | { 107 | char* text; 108 | char* pattern; 109 | int should_fail; 110 | int length; 111 | int correctlen; 112 | size_t ntests = sizeof(test_vector) / sizeof(*test_vector); 113 | size_t nfailed = 0; 114 | size_t i; 115 | 116 | for (i = 0; i < ntests; ++i) 117 | { 118 | pattern = test_vector[i][1]; 119 | text = test_vector[i][2]; 120 | should_fail = (test_vector[i][0] == NOK); 121 | correctlen = (int)(test_vector[i][3]); 122 | 123 | int m = re_match(pattern, text, &length); 124 | 125 | if (should_fail) 126 | { 127 | if (m != (-1)) 128 | { 129 | printf("\n"); 130 | re_print(re_compile(pattern)); 131 | fprintf(stderr, "[%lu/%lu]: pattern '%s' matched '%s' unexpectedly, matched %i chars. \n", (i+1), ntests, pattern, text, length); 132 | nfailed += 1; 133 | } 134 | } 135 | else 136 | { 137 | if (m == (-1)) 138 | { 139 | printf("\n"); 140 | re_print(re_compile(pattern)); 141 | fprintf(stderr, "[%lu/%lu]: pattern '%s' didn't match '%s' as expected. 
\n", (i+1), ntests, pattern, text); 142 | nfailed += 1; 143 | } 144 | else if (length != correctlen) 145 | { 146 | fprintf(stderr, "[%lu/%lu]: pattern '%s' matched '%i' chars of '%s'; expected '%i'. \n", (i+1), ntests, pattern, length, text, correctlen); 147 | nfailed += 1; 148 | } 149 | } 150 | } 151 | 152 | // printf("\n"); 153 | printf("%lu/%lu tests succeeded.\n", ntests - nfailed, ntests); 154 | printf("\n"); 155 | printf("\n"); 156 | printf("\n"); 157 | 158 | return nfailed; /* 0 if all tests passed */ 159 | } 160 | -------------------------------------------------------------------------------- /tests/test_compile.c: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | This file tests two bug patterns reported by @DavidKorczynski in https://github.com/kokke/tiny-regex-c/issues/44 4 | 5 | */ 6 | 7 | #include 8 | #include /* for NULL */ 9 | #include "re.h" 10 | 11 | 12 | int main() 13 | { 14 | /* Test 1: inverted set without a closing ']' */ 15 | assert(re_compile("\\\x01[^\\\xff][^") == NULL); 16 | 17 | /* Test 2: set with an incomplete escape sequence and without a closing ']' */ 18 | assert(re_compile("\\\x01[^\\\xff][\\") == NULL); 19 | 20 | return 0; 21 | } 22 | 23 | -------------------------------------------------------------------------------- /tests/test_print.c: -------------------------------------------------------------------------------- 1 | /* 2 | This program prints out a verbose explanation of a given regular expression. 
3 | */ 4 | 5 | #include 6 | #include "re.h" 7 | 8 | 9 | int main(int argc, char** argv) 10 | { 11 | if (argc == 2) 12 | { 13 | re_print(re_compile(argv[1])); 14 | } 15 | else 16 | { 17 | printf("\nUsage: %s \n", argv[0]); 18 | } 19 | return -2; 20 | } 21 | 22 | -------------------------------------------------------------------------------- /tests/test_rand.c: -------------------------------------------------------------------------------- 1 | /* 2 | This program tries to match a given regular expression with text given as input to stdin. 3 | If the text is a match for the pattern, the program returns 0. 4 | If the text doesn't match the pattern, the program returns -2. 5 | 6 | This program is used in random testing to test a lot of random text and regex together. 7 | See ./scripts/regex_test.py and the Makefile for this project for the gritty details. 8 | */ 9 | 10 | #include 11 | #include "re.h" 12 | 13 | 14 | int main(int argc, char** argv) 15 | { 16 | int length; 17 | if (argc == 3) 18 | { 19 | int m = re_match(argv[1], argv[2], &length); 20 | if (m != -1) 21 | return 0; 22 | } 23 | else 24 | { 25 | printf("\nUsage: %s \n", argv[0]); 26 | } 27 | return -2; 28 | } 29 | 30 | -------------------------------------------------------------------------------- /tests/test_rand_neg.c: -------------------------------------------------------------------------------- 1 | /* 2 | Negative version of test_rand.c -- returns true if no match 3 | 4 | This program tries to match a given regular expression with text given as input to stdin. 5 | If the text is NOT a match for the pattern, the program returns 0. 6 | If the text does match the pattern, the program returns -2. 7 | 8 | This program is used in random testing to test a lot of random text and regex together. 9 | See ./scripts/regex_test_neg.py and the Makefile for this project for the gritty details. 
10 | */
11 |
12 | #include <stdio.h>
13 | #include "re.h"
14 |
15 |
16 | /*
17 |   Expects two arguments: the pattern and the text to search.
18 |
19 |   Returns 0 when re_match() reports no match (-1).
20 |   Returns -2 when the text does match or the argument count is wrong.
21 | */
22 | int main(int argc, char** argv)
23 | {
24 |   int length = 0; /* required by re_match(); the match length itself is not used here */
25 |
26 |   if (argc != 3)
27 |   {
28 |     printf("\nUsage: %s PATTERN TEXT \n", argv[0]);
29 |     return -2;
30 |   }
31 |
32 |   if (re_match(argv[1], argv[2], &length) == -1)
33 |   {
34 |     return 0;
35 |   }
36 |
37 |   return -2;
38 | }
39 |
--------------------------------------------------------------------------------