├── .github
    └── workflows
    │   └── c-cpp.yml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── clib.json
├── formal_verification.md
├── re.c
├── re.h
├── scripts
    ├── exrex.py
    ├── regex_test.py
    └── regex_test_neg.py
└── tests
    ├── test1.c
    ├── test2.c
    ├── test_compile.c
    ├── test_print.c
    ├── test_rand.c
    └── test_rand_neg.c


/.github/workflows/c-cpp.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 |   pull_request:
 7 |     branches: [ master ]
 8 | 
 9 |   # Allows you to run this workflow manually from the Actions tab
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v4   # v2 ran on the retired Node 12 runtime
19 |     - name: make clean
20 |       run: make clean
21 |     - name: make all
22 |       run: make all
23 |     - name: make test             # the Makefile's test recipe invokes `python`
24 |       run: make test
25 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.pyc
3 | /tests/*
4 | !/tests/*.c
5 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org>
25 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | # Compiler to use - can be replaced by clang for instance
  2 | CC := gcc
  3 | 
  4 | # Number of random text expressions to generate, for random testing
  5 | NRAND_TESTS := 1000
  6 | 
  7 | # Flags to pass to compiler
  8 | CFLAGS := -O3 -Wall -Wextra -std=c99 -I.
  9 | .PHONY: all clean test
 10 | all:   # Build every test binary; each one compiles re.c together with its test source
 11 | 	@$(CC) $(CFLAGS) re.c tests/test1.c         -o tests/test1
 12 | 	@$(CC) $(CFLAGS) re.c tests/test2.c         -o tests/test2
 13 | 	@$(CC) $(CFLAGS) re.c tests/test_rand.c     -o tests/test_rand
 14 | 	@$(CC) $(CFLAGS) re.c tests/test_rand_neg.c -o tests/test_rand_neg
 15 | 	@$(CC) $(CFLAGS) re.c tests/test_compile.c  -o tests/test_compile
 16 | 
 17 | clean:   # Remove every binary that `all` produces, plus stray compiler output
 18 | 	@rm -f tests/test1 tests/test2 tests/test_compile
 19 | 	@rm -f tests/test_rand tests/test_rand_neg
 20 | 	@rm -f a.out
 21 | 	@rm -f *.o
 22 | 
 23 | 
 24 | test: all   # Run the hand-written tests, then the randomized comparisons driven by the Python scripts
 25 | 	@command -v python >/dev/null 2>&1 || { echo "error: 'python' not found in PATH (required by the random tests)" >&2 ; exit 1 ; }
 26 | 	@echo
 27 | 	@echo Testing hand-picked regex\'s:
 28 | 	@./tests/test1
 29 | 	@echo Testing handling of invalid regex patterns
 30 | 	@./tests/test_compile
 31 | 	@echo Testing patterns against $(NRAND_TESTS) random strings matching the Python implementation and comparing:
 32 | 	@echo
 33 | 	@python ./scripts/regex_test.py \\d+\\w?\\D\\d             $(NRAND_TESTS)
 34 | 	@python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]*          $(NRAND_TESTS)
 35 | 	@python ./scripts/regex_test.py \\w*\\d?\\w\\?             $(NRAND_TESTS)
 36 | 	@python ./scripts/regex_test.py [^\\d]+\\\\?\\s            $(NRAND_TESTS)
 37 | 	@python ./scripts/regex_test.py [^\\w][^-1-4]              $(NRAND_TESTS)
 38 | 	@python ./scripts/regex_test.py [^\\w]                     $(NRAND_TESTS)
 39 | 	@python ./scripts/regex_test.py [^1-4]                     $(NRAND_TESTS)
 40 | 	@python ./scripts/regex_test.py [^-1-4]                    $(NRAND_TESTS)
 41 | 	@python ./scripts/regex_test.py [^\\d]+\\s?[\\w]*          $(NRAND_TESTS)
 42 | 	@python ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.].       $(NRAND_TESTS)
 43 | 	@python ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]?       $(NRAND_TESTS)
 44 | 	@python ./scripts/regex_test.py [-1-5]+[-1-2]-[-]          $(NRAND_TESTS)
 45 | 	@python ./scripts/regex_test.py [-1-3]-[-]+                $(NRAND_TESTS)
 46 | 	@python ./scripts/regex_test.py [1-5]+[-1-2]-[\\-]         $(NRAND_TESTS)
 47 | 	@python ./scripts/regex_test.py [-1-2]*                    $(NRAND_TESTS)
 48 | 	@python ./scripts/regex_test.py \\s?[a-fKL098]+-?          $(NRAND_TESTS)
 49 | 	@python ./scripts/regex_test.py [\\-]*                     $(NRAND_TESTS)
 50 | 	@python ./scripts/regex_test.py [\\\\]+                    $(NRAND_TESTS)
 51 | 	@python ./scripts/regex_test.py [0-9a-fA-F]+               $(NRAND_TESTS)
 52 | 	@python ./scripts/regex_test.py [1379][2468][abcdef]       $(NRAND_TESTS)
 53 | 	@python ./scripts/regex_test.py [012345-9]?[0123-789]      $(NRAND_TESTS)
 54 | 	@python ./scripts/regex_test.py [012345-9]                 $(NRAND_TESTS)
 55 | 	@python ./scripts/regex_test.py [0-56789]                  $(NRAND_TESTS)
 56 | 	@python ./scripts/regex_test.py [abc-zABC-Z]               $(NRAND_TESTS)
 57 | 	@python ./scripts/regex_test.py [a\\d]?1234                $(NRAND_TESTS)
 58 | 	@python ./scripts/regex_test.py .*123faerdig               $(NRAND_TESTS)
 59 | 	@python ./scripts/regex_test.py .?\\w+jsj                  $(NRAND_TESTS)
 60 | 	@python ./scripts/regex_test.py [?to][+to][?ta][*ta]       $(NRAND_TESTS)
 61 | 	@python ./scripts/regex_test.py \\d+                       $(NRAND_TESTS)
 62 | 	@python ./scripts/regex_test.py [a-z]+                     $(NRAND_TESTS)
 63 | 	@python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]*          $(NRAND_TESTS)
 64 | 	@python ./scripts/regex_test.py \\w                        $(NRAND_TESTS)
 65 | 	@python ./scripts/regex_test.py \\d                        $(NRAND_TESTS)
 66 | 	@python ./scripts/regex_test.py [\\d]                      $(NRAND_TESTS)
 67 | 	@python ./scripts/regex_test.py [^\\d]                     $(NRAND_TESTS)
 68 | 	@python ./scripts/regex_test.py [^-1-4]                    $(NRAND_TESTS)
 69 | 	@echo
 70 | 	@echo
 71 | 	@echo
 72 | 	@echo Testing rejection of patterns against $(NRAND_TESTS) random strings also rejected by the Python implementation:
 73 | 	@echo
 74 | 	@python ./scripts/regex_test_neg.py \\d+                   $(NRAND_TESTS)
 75 | 	@python ./scripts/regex_test_neg.py [a-z]+                 $(NRAND_TESTS)
 76 | 	@python ./scripts/regex_test_neg.py \\s+[a-zA-Z0-9?]*      $(NRAND_TESTS)
 77 | 	@python ./scripts/regex_test_neg.py ^\\w                   $(NRAND_TESTS)
 78 | 	@python ./scripts/regex_test_neg.py ^\\d                   $(NRAND_TESTS)
 79 | 	@python ./scripts/regex_test_neg.py [\\d]                  $(NRAND_TESTS)
 80 | 	@python ./scripts/regex_test_neg.py ^[^\\d]                $(NRAND_TESTS)
 81 | 	@python ./scripts/regex_test_neg.py [^\\w]+                $(NRAND_TESTS)
 82 | 	@python ./scripts/regex_test_neg.py ^[\\w]+                $(NRAND_TESTS)
 83 | 	@python ./scripts/regex_test_neg.py ^[^0-9]                $(NRAND_TESTS)
 84 | 	@python ./scripts/regex_test_neg.py [a-z].[A-Z]            $(NRAND_TESTS)
 85 | 	@python ./scripts/regex_test_neg.py [-1-3]-[-]+            $(NRAND_TESTS)
 86 | 	@python ./scripts/regex_test_neg.py [1-5]+[-1-2]-[\\-]     $(NRAND_TESTS)
 87 | 	@python ./scripts/regex_test_neg.py [-0-9]+                $(NRAND_TESTS)
 88 | 	@python ./scripts/regex_test_neg.py [\\-]+                 $(NRAND_TESTS)
 89 | 	@python ./scripts/regex_test_neg.py [\\\\]+                $(NRAND_TESTS)
 90 | 	@python ./scripts/regex_test_neg.py [0-9a-fA-F]+           $(NRAND_TESTS)
 91 | 	@python ./scripts/regex_test_neg.py [1379][2468][abcdef]   $(NRAND_TESTS)
 92 | 	@python ./scripts/regex_test_neg.py [012345-9]             $(NRAND_TESTS)
 93 | 	@python ./scripts/regex_test_neg.py [0-56789]              $(NRAND_TESTS)
 94 | 	@python ./scripts/regex_test_neg.py .*123faerdig           $(NRAND_TESTS)
 95 | 	@echo
 96 | 	@echo
 97 | 	@./tests/test2
 98 | 	@echo
 99 | 	@echo
100 | 
101 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ![CI](https://github.com/kokke/tiny-regex-c/workflows/CI/badge.svg)
  2 | # tiny-regex-c
  3 | # A small regex implementation in C
  4 | ### Description
  5 | Small and portable [Regular Expression](https://en.wikipedia.org/wiki/Regular_expression) (regex) library written in C. 
  6 | 
  7 | Design is inspired by Rob Pike's regex-code for the book *"Beautiful Code"* [available online here](http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html).
  8 | 
  9 | Supports a subset of the syntax and semantics of the Python standard library implementation (the `re`-module).
 10 | 
 11 | **I will gladly accept patches correcting bugs.**
 12 | 
 13 | ### Design goals
 14 | The main design goal of this library is to be small, correct, self contained and use few resources while retaining acceptable performance and feature completeness. Clarity of the code is also highly valued.
 15 | 
 16 | ### Notable features and omissions
 17 | - Small code and binary size: 500 SLOC, ~3kb binary for x86. Statically #define'd memory usage / allocation.
 18 | - No use of dynamic memory allocation (i.e. no calls to `malloc` / `free`).
 19 | - To avoid call-stack exhaustion, iterative searching is preferred over recursive by default (can be changed with a pre-processor flag).
 20 | - No support for capturing groups or named capture: `(?P<name>group)` etc.
 21 | - Thorough testing : [exrex](https://github.com/asciimoo/exrex) is used to randomly generate test-cases from regex patterns, which are fed into the regex code for verification. Try `make test` to generate a few thousand test cases yourself.
 22 | - Verification-harness for [KLEE Symbolic Execution Engine](https://klee.github.io), see [formal_verification.md](https://github.com/kokke/tiny-regex-c/blob/master/formal_verification.md).
 23 | - Provides character length of matches.
 24 | - Compiled for x86 using GCC 7.2.0 and optimizing for size, the binary takes up ~2-3kb code space and allocates ~0.5kb RAM :
 25 |   ```
 26 |   > gcc -Os -c re.c
 27 |   > size re.o
 28 |       text     data     bss     dec     hex filename
 29 |       2404        0     304    2708     a94 re.o
 30 |       
 31 |   ```
 32 | 
 33 | 
 34 | 
 35 | ### API
 36 | This is the public / exported API:
 37 | ```C
 38 | /* Typedef'd pointer to hide implementation details. */
 39 | typedef struct regex_t* re_t;
 40 | 
 41 | /* Compiles regex string pattern to a regex_t-array. */
 42 | re_t re_compile(const char* pattern);
 43 | 
 44 | /* Finds matches of the compiled pattern inside text. */
 45 | int  re_matchp(re_t pattern, const char* text, int* matchlength);
 46 | 
 47 | /* Finds matches of pattern inside text (compiles first automatically). */
 48 | int  re_match(const char* pattern, const char* text, int* matchlength);
 49 | ```
 50 | 
 51 | ### Supported regex-operators
 52 | The following features / regex-operators are supported by this library.
 53 | 
 54 | NOTE: inverted character classes are buggy - see the test harness for concrete examples.
 55 | 
 56 | 
 57 |   -  `.`         Dot, matches any character
 58 |   -  `^`         Start anchor, matches beginning of string
 59 |   -  `$`         End anchor, matches end of string
 60 |   -  `*`         Asterisk, match zero or more (greedy)
 61 |   -  `+`         Plus, match one or more (greedy)
 62 |   -  `?`         Question, match zero or one (non-greedy)
 63 |   -  `[abc]`     Character class, match if one of {'a', 'b', 'c'}
 64 |   -  `[^abc]`   Inverted class, match if NOT one of {'a', 'b', 'c'}
 65 |   -  `[a-zA-Z]` Character ranges, the character set of the ranges { a-z | A-Z }
 66 |   -  `\s`       Whitespace, \t \f \r \n \v and spaces
 67 |   -  `\S`       Non-whitespace
 68 |   -  `\w`       Alphanumeric, [a-zA-Z0-9_]
 69 |   -  `\W`       Non-alphanumeric
 70 |   -  `\d`       Digits, [0-9]
 71 |   -  `\D`       Non-digits
 72 | 
 73 | ### Usage
 74 | Compile a regex from ASCII-string (char-array) to a custom pattern structure using `re_compile()`.
 75 | 
 76 | Search a text-string for a regex and get an index into the string, using `re_match()` or `re_matchp()`.
 77 | 
 78 | The returned index points to the first place in the string, where the regex pattern matches.
 79 | 
 80 | The integer pointer passed will hold the length of the match.
 81 | 
 82 | If the regular expression doesn't match, the matching function returns an index of -1 to indicate failure.
 83 | 
 84 | ### Examples
 85 | Example of usage:
 86 | ```C
 87 | /* Standard int to hold length of match */
 88 | int match_length;
 89 | 
 90 | /* Standard null-terminated C-string to search: */
 91 | const char* string_to_search = "ahem.. 'hello world !' ..";
 92 | 
 93 | /* Compile a simple regular expression using character classes, meta-char and greedy + non-greedy quantifiers: */
 94 | re_t pattern = re_compile("[Hh]ello [Ww]orld\\s*[!]?");
 95 | 
 96 | /* Check if the regex matches the text: */
 97 | int match_idx = re_matchp(pattern, string_to_search, &match_length);
 98 | if (match_idx != -1)
 99 | {
100 |   printf("match at idx %i, %i chars long.\n", match_idx, match_length);
101 | }
102 | ```
103 | 
104 | For more usage examples I encourage you to look at the code in the `tests`-folder.
105 | 
106 | ### TODO
107 | - Fix the implementation of inverted character classes.
108 | - Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`.
109 | - Add `example.c` that demonstrates usage.
110 | - Add `tests/test_perf.c` for performance and time measurements.
111 | - Testing: Improve pattern rejection testing.
112 | 
113 | ### FAQ
114 | - *Q: What differentiates this library from other C regex implementations?*
115 | 
116 |   A: Well, the small size for one. 500 lines of C-code compiling to 2-3kb ROM, using very little RAM.
117 | 
118 | ### License
119 | All material in this repository is in the public domain.
120 | 
121 | 
122 | 
123 |  
124 | 


--------------------------------------------------------------------------------
/clib.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tiny-regex-c",
 3 |   "version": "0.1.0",
 4 |   "repo": "kokke/tiny-regex-c",
 5 |   "keywords": ["tiny", "regex", "pcre"],
 6 |   "license": "Public Domain",
 7 |   "makefile": "Makefile",
 8 |   "src": [
 9 |     "re.h",
10 |     "re.c"
11 |   ]
12 | }
13 | 


--------------------------------------------------------------------------------
/formal_verification.md:
--------------------------------------------------------------------------------
  1 | # Using KLEE for formal verification
  2 | 
  3 | Here is a crude demo of formal verification of tiny-regex. It borrows heavily from the work of [@DavidKorczynski](https://twitter.com/davkorcz/) - see https://www.youtube.com/watch?v=z6bsk-lsk1Q or [#44](https://github.com/kokke/tiny-regex-c/issues/44) for more context.
  4 | 
  5 | I am using the [KLEE Symbolic Execution Engine](https://klee.github.io/) and their Docker image here on a Debian-based host.
  6 | 
  7 | What this does is mechanically try to prove the absence of all run-time errors, memory corruption bugs and other problems by symbolic execution. We mark the inputs as being symbolic, so that the tool knows to use that as the "search space". That means KLEE checks all possible inputs of the form we give it.
  8 | 
  9 | Steps:
 10 | 
 11 | - Get the KLEE Docker image: ` $ sudo docker pull klee/klee `
 12 | - Run the KLEE Docker image: ` $ sudo docker run --rm -ti --ulimit='stack=-1:-1' klee/klee `
 13 | - NOTE: You should see a command prompt like this: ` klee@cc0c26c5b84c:~$ `
 14 | - Fetch `re.h`: ` klee@cc0c26c5b84c:~$ wget https://raw.githubusercontent.com/kokke/tiny-regex-c/master/re.h `
 15 | - Fetch `re.c`: ` klee@cc0c26c5b84c:~$ wget https://raw.githubusercontent.com/kokke/tiny-regex-c/master/re.c `
 16 | - Run your favorite editor, and insert the code below in the bottom of `re.c`
 17 | ```C
 18 | /*
 19 | tiny-regex KLEE test driver
 20 | kindly contributed by @DavidKorczynski - see https://github.com/kokke/tiny-regex-c/issues/44
 21 | */
 22 | 
 23 | int main(int argc, char* argv[])
 24 | {
 25 |   /* test input - ten chars used as a regex-pattern input */
 26 |   char arr[10];
 27 | 
 28 |   /* make input symbolic, to search all paths through the code */
 29 |   /* i.e. the input is checked for all possible ten-char combinations */
 30 |   klee_make_symbolic(arr, sizeof(arr), "arr"); 
 31 | 
 32 |   /* assume proper NULL termination */
 33 |   klee_assume(arr[sizeof(arr) - 1] == 0);
 34 | 
 35 |   /* verify absence of run-time errors - go! */
 36 |   re_compile(arr);
 37 | 
 38 |   return 0;
 39 | }
 40 | ```
 41 | - Alternatively, run this command:
 42 | ` klee@cc0c26c5b84c:~$ echo "int main(int argc,char* argv[]){ char arr[10]; klee_make_symbolic(arr, sizeof(arr), \"arr\"); klee_assume(arr[sizeof(arr)-1] == 0); re_compile(arr); return 0; }" >> re.c `
 43 | - Compile and emit LLVM bitcode: ` klee@cc0c26c5b84c:~$ clang -emit-llvm -g -c -O0 -Xclang -disable-O0-optnone re.c ` [(NOTE: flags passed to clang are the ones "recommended" by the KLEE project)](https://klee.github.io/tutorials/testing-function/)
 44 | - Run KLEE and wait for 5-10 minutes: ` klee@cc0c26c5b84c:~$ klee --libc=uclibc re.bc `
 45 | - A positive result looks like this:
 46 | ```
 47 | klee@cc0c26c5b84c:~$ klee --libc=uclibc re.bc
 48 | KLEE: NOTE: Using klee-uclibc : /tmp/klee_build90stp_z3/runtime/lib/klee-uclibc.bca
 49 | KLEE: output directory is "/home/klee/klee-out-3"
 50 | KLEE: Using STP solver backend
 51 | warning: Linking two modules of different target triples: re.bc' is 'x86_64-unknown-linux-gnu' whereas '__uClibc_main.os' is 'x86_64-pc-linux-gnu'
 52 | 
 53 | KLEE: WARNING: undefined reference to function: __syscall_rt_sigaction
 54 | KLEE: WARNING: undefined reference to function: close
 55 | KLEE: WARNING: undefined reference to function: fcntl
 56 | KLEE: WARNING: undefined reference to function: fstat
 57 | KLEE: WARNING: undefined reference to function: ioctl
 58 | KLEE: WARNING: undefined reference to function: lseek64
 59 | KLEE: WARNING: undefined reference to function: mkdir
 60 | KLEE: WARNING: undefined reference to function: open
 61 | KLEE: WARNING: undefined reference to function: open64
 62 | KLEE: WARNING: undefined reference to function: read
 63 | KLEE: WARNING: undefined reference to function: sigprocmask
 64 | KLEE: WARNING: undefined reference to function: stat
 65 | KLEE: WARNING: undefined reference to function: write
 66 | KLEE: WARNING: undefined reference to function: kill (UNSAFE)!
 67 | KLEE: WARNING: executable has module level assembly (ignoring)
 68 | KLEE: WARNING ONCE: calling external: ioctl(0, 21505, 94666720729472) at libc/termios/tcgetattr.c:43 12
 69 | KLEE: WARNING ONCE: calling __user_main with extra arguments.
 70 | KLEE: WARNING ONCE: skipping fork (memory cap exceeded)
 71 | KLEE: WARNING: killing 12290 states (over memory cap: 2102MB)
 72 | KLEE: WARNING: killing 11467 states (over memory cap: 2101MB)
 73 | 
 74 | KLEE: done: total instructions = 104365773
 75 | KLEE: done: completed paths = 801298
 76 | KLEE: done: generated tests = 801298
 77 | klee@cc0c26c5b84c:~$ 
 78 | ```
 79 | 
 80 | Similarly, the code below tests both `re_compile(...)` and `re_match(...)` which should be sufficient for coverage of the core logic.
 81 | Depending on your hardware, you should be able to increase the sizes of `pat` and `txt` to increase your confidence in the verification.
 82 | 
 83 | 
 84 | ```C
 85 | /*
 86 | tiny-regex KLEE test driver
 87 | kindly contributed by @DavidKorczynski - see https://github.com/kokke/tiny-regex-c/issues/44
 88 | */
 89 | 
 90 | int main(int argc, char* argv[])
 91 | {
 92 |   /* test input - a regex-pattern and a text string to search in */
 93 |   char pat[7];
 94 |   char txt[3];
 95 | 
 96 |   /* make input symbolic, to search all paths through the code */
 97 |   /* i.e. every possible 7-char pattern is checked against every possible 3-char text */
 98 |   klee_make_symbolic(pat, sizeof(pat), "pat"); 
 99 |   klee_make_symbolic(txt, sizeof(txt), "txt"); 
100 | 
101 |   /* assume proper NULL termination */
102 |   klee_assume(pat[sizeof(pat) - 1] == 0);
103 |   klee_assume(txt[sizeof(txt) - 1] == 0);
104 | 
105 |   /* verify absence of run-time errors - go! */
106 |   int l;
107 |   re_match(pat, txt, &l);
108 | 
109 |   return 0;
110 | }
111 | ```
112 | 
113 | My modest hardware (T420/i5-2520M@2.5GHz/8GB) completes a check of a 7-char pattern and a 3-char text string in 20-30 minutes (sizes include null-termination), whereas 8/5 takes more than 8 hours and 8/6 takes 14 hours:
114 | 
115 | ```
116 | klee@780432c1aaae0:~$ clang -emit-llvm -g -c -O0 -Xclang -disable-O0-optnone re.c
117 | klee@780432c1aaae0:~$ time klee --libc=uclibc --optimize re.bc
118 | KLEE: NOTE: Using klee-uclibc : /tmp/klee_build90stp_z3/runtime/lib/klee-uclibc.bca
119 | KLEE: output directory is "/home/klee/klee-out-0"
120 | KLEE: Using STP solver backend
121 | warning: Linking two modules of different target triples: re.bc' is 'x86_64-unknown-linux-gnu' whereas '__uClibc_main.os' is 'x86_64-pc-linux-gnu'
122 | 
123 | KLEE: WARNING: undefined reference to function: fcntl
124 | KLEE: WARNING: undefined reference to function: fstat
125 | KLEE: WARNING: undefined reference to function: ioctl
126 | KLEE: WARNING: undefined reference to function: open
127 | KLEE: WARNING: undefined reference to function: write
128 | KLEE: WARNING: executable has module level assembly (ignoring)
129 | KLEE: WARNING ONCE: calling external: ioctl(0, 21505, 94248844458320) at libc/termios/tcgetattr:43 12
130 | KLEE: WARNING ONCE: calling __user_main with extra arguments.
131 | KLEE: WARNING ONCE: skipping fork (memory cap exceeded)
132 | 
133 | KLEE: done: total instructions = 201292178
134 | KLEE: done: completed paths = 910249
135 | KLEE: done: generated tests = 910249
136 | 
137 | real    29m16.633s
138 | user    19m38.438s
139 | sys     9m34.654s
140 | klee@780432c1aaae0:~$ 
141 | ```
142 | 
143 | 


--------------------------------------------------------------------------------
/re.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *
  3 |  * Mini regex-module inspired by Rob Pike's regex code described in:
  4 |  *
  5 |  * http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html
  6 |  *
  7 |  *
  8 |  *
  9 |  * Supports:
 10 |  * ---------
 11 |  *   '.'        Dot, matches any character
 12 |  *   '^'        Start anchor, matches beginning of string
 13 |  *   '$'        End anchor, matches end of string
 14 |  *   '*'        Asterisk, match zero or more (greedy)
 15 |  *   '+'        Plus, match one or more (greedy)
 16 |  *   '?'        Question, match zero or one (non-greedy)
 17 |  *   '[abc]'    Character class, match if one of {'a', 'b', 'c'}
 18 |  *   '[^abc]'   Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken!
 19 |  *   '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z }
 20 |  *   '\s'       Whitespace, \t \f \r \n \v and spaces
 21 |  *   '\S'       Non-whitespace
 22 |  *   '\w'       Alphanumeric, [a-zA-Z0-9_]
 23 |  *   '\W'       Non-alphanumeric
 24 |  *   '\d'       Digits, [0-9]
 25 |  *   '\D'       Non-digits
 26 |  *
 27 |  *
 28 |  */
 29 | 
 30 | 
 31 | 
 32 | #include "re.h"
 33 | #include <stdio.h>
 34 | #include <ctype.h>
 35 | 
 36 | /* Definitions: */
 37 | 
 38 | #define MAX_REGEXP_OBJECTS      30    /* Max number of compiled symbols per expression, incl. the end sentinel. */
 39 | #define MAX_CHAR_CLASS_LEN      40    /* Size of the buffer shared by all character classes in an expression.   */
 40 | 
 41 | 
 42 | enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ }; /* Node types; UNUSED marks end-of-pattern. */
 43 | 
 44 | typedef struct regex_t   /* One element (node) of a compiled pattern. */
 45 | {
 46 |   unsigned char  type;   /* Node kind: CHAR, STAR, etc.                     */
 47 |   union                  /* Payload; the valid member depends on `type`.    */
 48 |   {
 49 |     unsigned char  ch;   /* CHAR: the literal character itself              */
 50 |     unsigned char* ccl;  /* CHAR_CLASS / INV_CHAR_CLASS: NUL-terminated characters of the class */
 51 |   } u;
 52 | } regex_t;
 53 | 
 54 | 
 55 | 
 56 | /* Private function declarations: */
 57 | static int matchpattern(regex_t* pattern, const char* text, int* matchlength);
 58 | static int matchcharclass(char c, const char* str);
 59 | static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength);
 60 | static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength);
 61 | static int matchone(regex_t p, char c);
 62 | static int matchdigit(char c);
 63 | static int matchalpha(char c);
 64 | static int matchwhitespace(char c);
 65 | static int matchmetachar(char c, const char* str);
 66 | static int matchrange(char c, const char* str);
 67 | static int matchdot(char c);
 68 | static int ismetachar(char c);
 69 | 
 70 | 
 71 | 
 72 | /* Public functions: */
 73 | int re_match(const char* pattern, const char* text, int* matchlength)
 74 | { /* Convenience wrapper: compiles `pattern`, then searches `text`; returns whatever re_matchp() returns. */
 75 |   return re_matchp(re_compile(pattern), text, matchlength); /* a failed compile yields 0, which re_matchp() turns into -1 */
 76 | }
 77 | 
 78 | int re_matchp(re_t pattern, const char* text, int* matchlength)
 79 | { /* Returns the first index in `text` at which the compiled pattern matches, or -1 if none (or if `pattern` is null). */
 80 |   *matchlength = 0; /* always reset before searching */
 81 |   if (pattern != 0)
 82 |   {
 83 |     if (pattern[0].type == BEGIN)
 84 |     {
 85 |       return ((matchpattern(&pattern[1], text, matchlength)) ? 0 : -1); /* anchored: only offset 0 is tried */
 86 |     }
 87 |     else
 88 |     {
 89 |       int idx = -1; /* start offset of the current attempt; incremented at the top of the loop */
 90 | 
 91 |       do
 92 |       {
 93 |         idx += 1;
 94 | 
 95 |         if (matchpattern(pattern, text, matchlength))
 96 |         {
 97 |           if (text[0] == '\0') /* attempt started at the terminator: reported as no match */
 98 |             return -1;
 99 | 
100 |           return idx;
101 |         }
102 |       }
103 |       while (*text++ != '\0'); /* the last attempt is made with `text` at the terminator */
104 |     }
105 |   }
106 |   return -1;
107 | }
108 | 
109 | re_t re_compile(const char* pattern)
110 | {
111 |   /* Compiles `pattern` into the static node array below and returns a pointer to it, or 0 if the
112 |      pattern is rejected. The storage is static, so every call overwrites the previous result.
113 |      MAX_REGEXP_OBJECTS bounds the node array, MAX_CHAR_CLASS_LEN the buffer shared by all char-classes. */
114 |   static regex_t re_compiled[MAX_REGEXP_OBJECTS];
115 |   static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN];
116 |   int ccl_bufidx = 1;
117 | 
118 |   char c;     /* current char in pattern   */
119 |   int i = 0;  /* index into pattern        */
120 |   int j = 0;  /* index into re_compiled    */
121 | 
122 |   while (pattern[i] != '\0' && (j+1 < MAX_REGEXP_OBJECTS)) /* keeps one slot free for the end sentinel */
123 |   {
124 |     c = pattern[i];
125 | 
126 |     switch (c)
127 |     {
128 |       /* Meta-characters: */
129 |       case '^': {    re_compiled[j].type = BEGIN;           } break;
130 |       case '$': {    re_compiled[j].type = END;             } break;
131 |       case '.': {    re_compiled[j].type = DOT;             } break;
132 |       case '*': {    re_compiled[j].type = STAR;            } break;
133 |       case '+': {    re_compiled[j].type = PLUS;            } break;
134 |       case '?': {    re_compiled[j].type = QUESTIONMARK;    } break;
135 | /*    case '|': {    re_compiled[j].type = BRANCH;          } break; <-- not working properly */
136 | 
137 |       /* Escaped character-classes (\s \w ...): */
138 |       case '\\':
139 |       {
140 |         if (pattern[i+1] != '\0')
141 |         {
142 |           /* Skip the escape-char '\\' */
143 |           i += 1;
144 |           /* ... and check the next */
145 |           switch (pattern[i])
146 |           {
147 |             /* Meta-character: */
148 |             case 'd': {    re_compiled[j].type = DIGIT;            } break;
149 |             case 'D': {    re_compiled[j].type = NOT_DIGIT;        } break;
150 |             case 'w': {    re_compiled[j].type = ALPHA;            } break;
151 |             case 'W': {    re_compiled[j].type = NOT_ALPHA;        } break;
152 |             case 's': {    re_compiled[j].type = WHITESPACE;       } break;
153 |             case 'S': {    re_compiled[j].type = NOT_WHITESPACE;   } break;
154 | 
155 |             /* Escaped character, e.g. '.' or '$' */
156 |             default:
157 |             {
158 |               re_compiled[j].type = CHAR;
159 |               re_compiled[j].u.ch = pattern[i];
160 |             } break;
161 |           }
162 |         }
163 |         /* '\\' as last char in pattern -> invalid regular expression. */
164 |         else
165 |         {
166 |           /* Reject it: otherwise re_compiled[j].type is never written in this call
167 |              and the node keeps whatever an earlier compile left in the static array. */
168 |           return 0;
169 |         }
170 | 
171 |       } break;
172 | 
173 |       /* Character class: */
174 |       case '[':
175 |       {
176 |         /* Remember where the char-buffer starts. */
177 |         int buf_begin = ccl_bufidx;
178 | 
179 |         /* Look-ahead to determine if negated */
180 |         if (pattern[i+1] == '^')
181 |         {
182 |           re_compiled[j].type = INV_CHAR_CLASS;
183 |           i += 1; /* Increment i to avoid including '^' in the char-buffer */
184 |           if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */
185 |           {
186 |             return 0;
187 |           }
188 |         }
189 |         else
190 |         {
191 |           re_compiled[j].type = CHAR_CLASS;
192 |         }
193 | 
194 |         /* Copy characters inside [..] to buffer */
195 |         while (    (pattern[++i] != ']')
196 |                 && (pattern[i]   != '\0')) /* Missing ] */
197 |         {
198 |           if (pattern[i] == '\\')
199 |           {
200 |             if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) /* an escape needs two slots: '\\' and the char after it */
201 |             {
202 |               //fputs("exceeded internal buffer!\n", stderr);
203 |               return 0;
204 |             }
205 |             if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '\\' */
206 |             {
207 |               return 0;
208 |             }
209 |             ccl_buf[ccl_bufidx++] = pattern[i++];
210 |           }
211 |           else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
212 |           {
213 |               //fputs("exceeded internal buffer!\n", stderr);
214 |               return 0;
215 |           }
216 |           ccl_buf[ccl_bufidx++] = pattern[i];
217 |         }
218 |         if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
219 |         {
220 |             /* Catches cases such as [00000000000000000000000000000000000000][ */
221 |             //fputs("exceeded internal buffer!\n", stderr);
222 |             return 0;
223 |         }
224 |         /* Null-terminate string end */
225 |         ccl_buf[ccl_bufidx++] = 0;
226 |         re_compiled[j].u.ccl = &ccl_buf[buf_begin];
227 |       } break;
228 | 
229 |       /* Other characters: */
230 |       default:
231 |       {
232 |         re_compiled[j].type = CHAR;
233 |         re_compiled[j].u.ch = c;
234 |       } break;
235 |     }
236 |     /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
237 |     if (pattern[i] == 0)
238 |     {
239 |       return 0;
240 |     }
241 | 
242 |     i += 1;
243 |     j += 1;
244 |   }
245 |   /* 'UNUSED' is a sentinel used to indicate end-of-pattern */
246 |   re_compiled[j].type = UNUSED;
247 | 
248 |   return (re_t) re_compiled;
249 | }
250 | 
251 | void re_print(regex_t* pattern)
252 | {
253 |   const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" };
254 | 
255 |   int i;
256 |   int j;
257 |   char c;
258 |   for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
259 |   {
260 |     if (pattern[i].type == UNUSED)
261 |     {
262 |       break;
263 |     }
264 | 
265 |     printf("type: %s", types[pattern[i].type]);
266 |     if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
267 |     {
268 |       printf(" [");
269 |       for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
270 |       {
271 |         c = pattern[i].u.ccl[j];
272 |         if ((c == '\0') || (c == ']'))
273 |         {
274 |           break;
275 |         }
276 |         printf("%c", c);
277 |       }
278 |       printf("]");
279 |     }
280 |     else if (pattern[i].type == CHAR)
281 |     {
282 |       printf(" '%c'", pattern[i].u.ch);
283 |     }
284 |     printf("\n");
285 |   }
286 | }
287 | 
288 | 
289 | 
290 | /* Private functions: */
/*
 * Returns nonzero when c is a decimal digit.
 * The cast matters: isdigit() is only defined for values representable as
 * unsigned char (or EOF), and a plain char may be negative for bytes >= 0x80.
 */
static int matchdigit(char c)
{
  return isdigit((unsigned char)c);
}
/*
 * Returns nonzero when c is an alphabetic character.
 * The cast matters: isalpha() is only defined for values representable as
 * unsigned char (or EOF), and a plain char may be negative for bytes >= 0x80.
 */
static int matchalpha(char c)
{
  return isalpha((unsigned char)c);
}
/*
 * Returns nonzero when c is a whitespace character.
 * The cast matters: isspace() is only defined for values representable as
 * unsigned char (or EOF), and a plain char may be negative for bytes >= 0x80.
 */
static int matchwhitespace(char c)
{
  return isspace((unsigned char)c);
}
/* Returns 1 when c is an underscore, a letter or a digit; 0 otherwise. */
static int matchalphanum(char c)
{
  if (c == '_')
  {
    return 1;
  }
  if (matchalpha(c))
  {
    return 1;
  }
  return matchdigit(c) ? 1 : 0;
}
/*
 * Returns 1 when str starts with a range of the form "x-y" and c lies in
 * [x, y]; 0 otherwise. A '-' in c never matches a range, and neither does
 * a range whose first character is '-' or whose upper bound is missing.
 * The checks run left to right so no byte past the terminator is read.
 */
static int matchrange(char c, const char* str)
{
  if (c == '-')
  {
    return 0;
  }
  if ((str[0] == '\0') || (str[0] == '-'))
  {
    return 0;
  }
  if (str[1] != '-')
  {
    return 0;
  }
  if (str[2] == '\0')
  {
    return 0;
  }
  return ((str[0] <= c) && (c <= str[2]));
}
/*
 * Decides whether '.' matches c. With RE_DOT_MATCHES_NEWLINE defined to 1
 * every character matches; otherwise '\n' and '\r' are excluded.
 */
static int matchdot(char c)
{
#if defined(RE_DOT_MATCHES_NEWLINE) && (RE_DOT_MATCHES_NEWLINE == 1)
  (void)c;
  return 1;
#else
  return !((c == '\n') || (c == '\r'));
#endif
}
/* Returns 1 when c is one of the class letters s, S, w, W, d, D; else 0. */
static int ismetachar(char c)
{
  switch (c)
  {
    case 's':
    case 'S':
    case 'w':
    case 'W':
    case 'd':
    case 'D':
      return 1;
    default:
      return 0;
  }
}
330 | 
/*
 * Matches c against the escape letter at str[0]: d/D digit or not,
 * w/W alphanumeric-or-underscore or not, s/S whitespace or not.
 * Any other letter is compared literally with c.
 */
static int matchmetachar(char c, const char* str)
{
  const char kind = str[0];

  if (kind == 'd')
  {
    return  matchdigit(c);
  }
  if (kind == 'D')
  {
    return !matchdigit(c);
  }
  if (kind == 'w')
  {
    return  matchalphanum(c);
  }
  if (kind == 'W')
  {
    return !matchalphanum(c);
  }
  if (kind == 's')
  {
    return  matchwhitespace(c);
  }
  if (kind == 'S')
  {
    return !matchwhitespace(c);
  }
  return (c == kind);
}
344 | 
/*
 * Returns nonzero when c is a member of the character class described by
 * the NUL-terminated string str (the text between '[' and ']').
 *
 * Each position in the class is tried in turn as:
 *   - a range "x-y" (see matchrange),
 *   - a backslash escape, matched via matchmetachar on the following char,
 *   - a literal character.
 * A literal '-' only counts as a member when it is the first or the last
 * character of the class.
 */
static int matchcharclass(char c, const char* str)
{
  /* Remember where the class begins so "first character" can be tested
     without reading the byte in front of the string (str[-1]), which lies
     outside the buffer when the class starts at the buffer's first byte. */
  const char* const first = str;

  do
  {
    if (matchrange(c, str))
    {
      return 1;
    }
    else if (str[0] == '\\')
    {
      /* Escape-char: increment str-ptr and match on next char.
         matchmetachar already falls back to a literal comparison for
         non-class letters, so no separate literal check is needed here. */
      str += 1;
      if (matchmetachar(c, str))
      {
        return 1;
      }
    }
    else if (c == str[0])
    {
      if (c == '-')
      {
        /* '-' is literal only at the very start or very end of the class. */
        return ((str == first) || (str[1] == '\0'));
      }
      else
      {
        return 1;
      }
    }
  }
  while (*str++ != '\0');

  return 0;
}
382 | 
383 | static int matchone(regex_t p, char c)
384 | {
385 |   switch (p.type)
386 |   {
387 |     case DOT:            return matchdot(c);
388 |     case CHAR_CLASS:     return  matchcharclass(c, (const char*)p.u.ccl);
389 |     case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p.u.ccl);
390 |     case DIGIT:          return  matchdigit(c);
391 |     case NOT_DIGIT:      return !matchdigit(c);
392 |     case ALPHA:          return  matchalphanum(c);
393 |     case NOT_ALPHA:      return !matchalphanum(c);
394 |     case WHITESPACE:     return  matchwhitespace(c);
395 |     case NOT_WHITESPACE: return !matchwhitespace(c);
396 |     default:             return  (p.u.ch == c);
397 |   }
398 | }
399 | 
400 | static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength)
401 | {
402 |   int prelen = *matchlength;
403 |   const char* prepoint = text;
404 |   while ((text[0] != '\0') && matchone(p, *text))
405 |   {
406 |     text++;
407 |     (*matchlength)++;
408 |   }
409 |   while (text >= prepoint)
410 |   {
411 |     if (matchpattern(pattern, text--, matchlength))
412 |       return 1;
413 |     (*matchlength)--;
414 |   }
415 | 
416 |   *matchlength = prelen;
417 |   return 0;
418 | }
419 | 
420 | static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength)
421 | {
422 |   const char* prepoint = text;
423 |   while ((text[0] != '\0') && matchone(p, *text))
424 |   {
425 |     text++;
426 |     (*matchlength)++;
427 |   }
428 |   while (text > prepoint)
429 |   {
430 |     if (matchpattern(pattern, text--, matchlength))
431 |       return 1;
432 |     (*matchlength)--;
433 |   }
434 | 
435 |   return 0;
436 | }
437 | 
438 | static int matchquestion(regex_t p, regex_t* pattern, const char* text, int* matchlength)
439 | {
440 |   if (p.type == UNUSED)
441 |     return 1;
442 |   if (matchpattern(pattern, text, matchlength))
443 |       return 1;
444 |   if (*text && matchone(p, *text++))
445 |   {
446 |     if (matchpattern(pattern, text, matchlength))
447 |     {
448 |       (*matchlength)++;
449 |       return 1;
450 |     }
451 |   }
452 |   return 0;
453 | }
454 | 
455 | 
456 | #if 0
457 | 
458 | /* Recursive matching */
459 | static int matchpattern(regex_t* pattern, const char* text, int *matchlength)
460 | {
461 |   int pre = *matchlength;
462 |   if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
463 |   {
464 |     return matchquestion(pattern[1], &pattern[2], text, matchlength);
465 |   }
466 |   else if (pattern[1].type == STAR)
467 |   {
468 |     return matchstar(pattern[0], &pattern[2], text, matchlength);
469 |   }
470 |   else if (pattern[1].type == PLUS)
471 |   {
472 |     return matchplus(pattern[0], &pattern[2], text, matchlength);
473 |   }
474 |   else if ((pattern[0].type == END) && pattern[1].type == UNUSED)
475 |   {
476 |     return text[0] == '\0';
477 |   }
478 |   else if ((text[0] != '\0') && matchone(pattern[0], text[0]))
479 |   {
480 |     (*matchlength)++;
481 |     return matchpattern(&pattern[1], text+1);
482 |   }
483 |   else
484 |   {
485 |     *matchlength = pre;
486 |     return 0;
487 |   }
488 | }
489 | 
490 | #else
491 | 
492 | /* Iterative matching */
/*
 * Matches the node sequence 'pattern' against 'text', starting exactly at
 * 'text'.
 *
 * pattern      compiled nodes; an UNUSED node marks the end of the sequence
 * text         NUL-terminated input, matched from its first character
 * matchlength  running count of matched characters; restored to its value
 *              on entry when the loop ends without a match
 *
 * Quantified nodes ('?', '*', '+') are handed off to their helpers together
 * with the nodes that follow the quantifier. Returns nonzero on a match
 * and 0 otherwise.
 */
static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
{
  int pre = *matchlength;
  do
  {
    /* Reaching UNUSED means every node matched: matchquestion returns 1
       straight away for an UNUSED node. */
    if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
    {
      return matchquestion(pattern[0], &pattern[2], text, matchlength);
    }
    else if (pattern[1].type == STAR)
    {
      return matchstar(pattern[0], &pattern[2], text, matchlength);
    }
    else if (pattern[1].type == PLUS)
    {
      return matchplus(pattern[0], &pattern[2], text, matchlength);
    }
    /* A trailing END anchor only matches at the end of the input. */
    else if ((pattern[0].type == END) && pattern[1].type == UNUSED)
    {
      return (text[0] == '\0');
    }
/*  Branching is not working properly
    else if (pattern[1].type == BRANCH)
    {
      return (matchpattern(pattern, text) || matchpattern(&pattern[2], text));
    }
*/
  /* Counted before the loop condition below tests the character; if that
     test fails the count is rolled back to 'pre' after the loop. */
  (*matchlength)++;
  }
  while ((text[0] != '\0') && matchone(*pattern++, *text++));

  *matchlength = pre;
  return 0;
}
527 | 
528 | #endif
529 | 


--------------------------------------------------------------------------------
/re.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *
 3 |  * Mini regex-module inspired by Rob Pike's regex code described in:
 4 |  *
 5 |  * http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html
 6 |  *
 7 |  *
 8 |  *
 9 |  * Supports:
10 |  * ---------
11 |  *   '.'        Dot, matches any character
12 |  *   '^'        Start anchor, matches beginning of string
13 |  *   '$'        End anchor, matches end of string
14 |  *   '*'        Asterisk, match zero or more (greedy)
15 |  *   '+'        Plus, match one or more (greedy)
16 |  *   '?'        Question, match zero or one (non-greedy)
17 |  *   '[abc]'    Character class, match if one of {'a', 'b', 'c'}
18 |  *   '[^abc]'   Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken!
19 |  *   '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z }
20 |  *   '\s'       Whitespace, \t \f \r \n \v and spaces
21 |  *   '\S'       Non-whitespace
22 |  *   '\w'       Alphanumeric, [a-zA-Z0-9_]
23 |  *   '\W'       Non-alphanumeric
24 |  *   '\d'       Digits, [0-9]
25 |  *   '\D'       Non-digits
26 |  *
27 |  *
28 |  */
29 | 
/* Include guard. */
#ifndef _TINY_REGEX_C
#define _TINY_REGEX_C


#ifndef RE_DOT_MATCHES_NEWLINE
/* Controls whether '.' matches line breaks. Defaults to 1 (it does);
 * define to 0 before including this header if you DON'T want '.' to
 * match '\r' and '\n'. */
#define RE_DOT_MATCHES_NEWLINE 1
#endif

#ifdef __cplusplus
extern "C"{
#endif



/* Opaque handle to a compiled pattern: a typedef'd pointer, so callers never
 * see the layout of struct regex_t. */
typedef struct regex_t* re_t;


/* Compile the regex string 'pattern' into a regex_t array and return a
 * handle to it. Patterns the compiler rejects yield a null handle. */
re_t re_compile(const char* pattern);


/* Find a match of the compiled 'pattern' inside 'text'.
 * NOTE(review): the meaning of the return value and of '*matchlength' is
 * defined by the implementation in re.c - confirm there before relying on it. */
int re_matchp(re_t pattern, const char* text, int* matchlength);


/* Same as re_matchp, but takes the pattern as text and compiles it
 * automatically first. */
int re_match(const char* pattern, const char* text, int* matchlength);


#ifdef __cplusplus
}
#endif

#endif /* ifndef _TINY_REGEX_C */
66 | 


--------------------------------------------------------------------------------
/scripts/exrex.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # This file is part of exrex.
  5 | #
  6 | # exrex is free software: you can redistribute it and/or modify
  7 | # it under the terms of the GNU Affero General Public License as published by
  8 | # the Free Software Foundation, either version 3 of the License, or
  9 | # (at your option) any later version.
 10 | #
 11 | # exrex is distributed in the hope that it will be useful,
 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | # GNU Affero General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU Affero General Public License
 17 | # along with exrex. If not, see < http://www.gnu.org/licenses/ >.
 18 | #
 19 | # (C) 2012- by Adam Tauber, <asciimoo@gmail.com>
 20 | 
 21 | try:
 22 |     from future_builtins import map, range
 23 | except:
 24 |     pass
 25 | from re import match, U
 26 | try:
 27 |     import re._parser as sre_parse
 28 | except ImportError: # Python < 3.11
 29 |     from re import sre_parse
 30 | from itertools import tee
 31 | from random import choice, randint
 32 | from types import GeneratorType
 33 | 
 34 | from sys import version_info
 35 | IS_PY3 = version_info[0] == 3
 36 | IS_PY36_OR_GREATER = IS_PY3 and version_info[1] > 5
 37 | 
 38 | if IS_PY3:
 39 |     unichr = chr
 40 | 
 41 | __all__ = (
 42 |     'generate',
 43 |     'CATEGORIES',
 44 |     'count',
 45 |     'parse',
 46 |     'getone',
 47 |     'sre_to_string',
 48 |     'simplify'
 49 | )
 50 | 
# Characters that may be produced for each sre_parse category when
# generating example strings. Keys are sre_parse category constants, plus
# the string 'category_any' used for '.' and as the base set for negations.
CATEGORIES = {
    sre_parse.CATEGORY_SPACE: sorted(sre_parse.WHITESPACE),
    sre_parse.CATEGORY_DIGIT: sorted(sre_parse.DIGITS),
    # Regex-derived word set, disabled in favour of the explicit ASCII list below:
    #sre_parse.CATEGORY_WORD: [unichr(x) for x in range(256) if
    #                          match(r'\w', unichr(x), U)],
    sre_parse.CATEGORY_WORD: list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'),
    # Every code point below 256 that the Unicode-aware r'\W' matches.
    sre_parse.CATEGORY_NOT_WORD: [unichr(x) for x in range(256) if
                                  match(r'\W', unichr(x), U)],
    # Code points 32..122, i.e. space through 'z'.
    'category_any': [unichr(x) for x in range(32, 123)]
}
 61 | 
 62 | 
 63 | def _build_reverse_categories():
 64 |     reverse = {}
 65 |     for key, value in sre_parse.CATEGORIES.items():
 66 |         if not hasattr(value[1], '__iter__'):
 67 |             continue
 68 | 
 69 |         for vv in value[1]:
 70 |             if value[0] == sre_parse.IN and vv[0] == sre_parse.CATEGORY:
 71 |                 reverse.update({vv[1]: key})
 72 | 
 73 |     return reverse
 74 | 
 75 | 
 76 | REVERSE_CATEGORIES = _build_reverse_categories()
 77 | 
 78 | 
 79 | def comb(g, i):
 80 |     for c in g:
 81 |         g2, i = tee(i)
 82 |         for c2 in g2:
 83 |             yield c + c2
 84 | 
 85 | 
 86 | def mappend(g, c):
 87 |     for cc in g:
 88 |         yield cc + c
 89 | 
 90 | 
 91 | def dappend(g, d, k):
 92 |     for cc in g:
 93 |         yield cc + d[k]
 94 | 
 95 | 
 96 | def _in(d):
 97 |     ret = []
 98 |     neg = False
 99 |     for i in d:
100 |         if i[0] == sre_parse.RANGE:
101 |             subs = map(unichr, range(i[1][0], i[1][1] + 1))
102 |             if neg:
103 |                 for char in subs:
104 |                     try:
105 |                         ret.remove(char)
106 |                     except:
107 |                         pass
108 |             else:
109 |                 ret.extend(subs)
110 |         elif i[0] == sre_parse.LITERAL:
111 |             if neg:
112 |                 try:
113 |                     ret.remove(unichr(i[1]))
114 |                 except:
115 |                     pass
116 |             else:
117 |                 ret.append(unichr(i[1]))
118 |         elif i[0] == sre_parse.CATEGORY:
119 |             subs = CATEGORIES.get(i[1], [''])
120 |             if neg:
121 |                 for char in subs:
122 |                     try:
123 |                         ret.remove(char)
124 |                     except:
125 |                         pass
126 |             else:
127 |                 ret.extend(subs)
128 |         elif i[0] == sre_parse.NEGATE:
129 |             ret = list(CATEGORIES['category_any'])
130 |             neg = True
131 |     return ret
132 | 
133 | 
def prods(orig, ran, items, limit, grouprefs):
    """Yield each string of ``orig`` followed by ``r`` repetitions of ``items``.

    For every prefix in ``orig`` and every repeat count ``r`` in ``ran``:
    a count of 0 yields the prefix unchanged; otherwise the strings
    generated for ``items`` are chained onto the prefix ``r`` times.
    """
    for prefix in orig:
        for r in ran:
            if r == 0:
                yield prefix
                continue
            chain = [prefix]
            for _ in range(r):
                chain = ggen(
                    chain, _gen, items, limit=limit, count=False, grouprefs=grouprefs)
            for produced in chain:
                yield produced
146 | 
147 | 
def ggen(g1, f, *args, **kwargs):
    """Combine each item of ``g1`` with the results of calling ``f``.

    ``f(*args, **kwargs)`` is called once per item of ``g1``; the keyword
    ``groupref`` is consumed here and not forwarded. If the call returns a
    generator, each produced value is stored in ``grouprefs[groupref]`` and
    yielded appended to the item. Any other return value is yielded as is.
    """
    groupref = kwargs.pop('groupref', None)
    grouprefs = kwargs.get('grouprefs', {})
    for a in g1:
        produced = f(*args, **kwargs)
        if not isinstance(produced, GeneratorType):
            yield produced
            continue
        for b in produced:
            grouprefs[groupref] = b
            yield a + b
161 | 
162 | 
def concit(g1, seqs, limit, grouprefs):
    """Yield each item of ``g1`` followed by every string of every branch.

    ``seqs`` holds the alternative sub-expressions; each one is expanded
    with ``_gen`` for every item of ``g1``.
    """
    for head in g1:
        for branch in seqs:
            for tail in _gen(branch, limit, grouprefs=grouprefs):
                yield head + tail
168 | 
169 | 
def _gen(d, limit=20, count=False, grouprefs=None):
    """Generate (or count) the strings described by a parsed regex.

    :param d: sequence of parsed regex items, as produced by ``parse``
    :param limit: cap on how many repeat counts are tried per quantifier
    :param count: when true, return the number of strings instead
    :param grouprefs: dict shared across the expansion that holds the most
        recent text produced for each group
    :returns: with ``count`` false, an iterable of strings (the list
        ``['']`` when no item contributed, otherwise a chain of lazy
        generators); with ``count`` true, an integer
    """
    if grouprefs is None:
        grouprefs = {}
    # Start from the single empty string; each item wraps 'ret' in another
    # lazy generator that extends every string produced so far.
    ret = ['']
    strings = 0
    literal = False
    for i in d:
        if i[0] == sre_parse.IN:
            subs = _in(i[1])
            if count:
                strings = (strings or 1) * len(subs)
            ret = comb(ret, subs)
        elif i[0] == sre_parse.LITERAL:
            literal = True
            ret = mappend(ret, unichr(i[1]))
        elif i[0] == sre_parse.CATEGORY:
            subs = CATEGORIES.get(i[1], [''])
            if count:
                strings = (strings or 1) * len(subs)
            ret = comb(ret, subs)
        elif i[0] == sre_parse.ANY:
            subs = CATEGORIES['category_any']
            if count:
                strings = (strings or 1) * len(subs)
            ret = comb(ret, subs)
        elif i[0] == sre_parse.MAX_REPEAT or i[0] == sre_parse.MIN_REPEAT:
            items = list(i[1][2])
            # Clamp the span of repeat counts to at most 'limit' values.
            if i[1][1] + 1 - i[1][0] >= limit:
                r1 = i[1][0]
                r2 = i[1][0] + limit
            else:
                r1 = i[1][0]
                r2 = i[1][1] + 1
            ran = range(r1, r2)
            if count:
                branch_count = 0
                for p in ran:
                    branch_count += pow(_gen(items, limit, True, grouprefs), p)
                strings = (strings or 1) * branch_count

            ret = prods(ret, ran, items, limit, grouprefs)
        elif i[0] == sre_parse.BRANCH:
            if count:
                for x in i[1][1]:
                    strings += _gen(x, limit, True, grouprefs) or 1
            ret = concit(ret, i[1][1], limit, grouprefs)
        elif i[0] == sre_parse.SUBPATTERN or i[0] == sre_parse.ASSERT:
            subexpr = i[1][1]
            # On Python 3.6+ the sub-expression of a SUBPATTERN is read from
            # index 3 instead of index 1.
            if IS_PY36_OR_GREATER and i[0] == sre_parse.SUBPATTERN:
                subexpr = i[1][3]
            if count:
                strings = (
                    strings or 1) * (sum(ggen([0], _gen, subexpr, limit=limit, count=True, grouprefs=grouprefs)) or 1)
            ret = ggen(ret, _gen, subexpr, limit=limit, count=False, grouprefs=grouprefs, groupref=i[1][0])
        # ignore ^ and $
        elif i[0] == sre_parse.AT:
            continue
        elif i[0] == sre_parse.NOT_LITERAL:
            subs = list(CATEGORIES['category_any'])
            if unichr(i[1]) in subs:
                subs.remove(unichr(i[1]))
            if count:
                strings = (strings or 1) * len(subs)
            ret = comb(ret, subs)
        elif i[0] == sre_parse.GROUPREF:
            ret = dappend(ret, grouprefs, i[1])
        elif i[0] == sre_parse.ASSERT_NOT:
            pass
        else:
            print('[!] cannot handle expression ' + repr(i))

    if count:
        # A pattern made only of literals and anchors describes exactly one
        # string, but nothing above has counted it yet.
        if strings == 0 and literal:
            inc = True
            for i in d:
                if i[0] not in (sre_parse.AT, sre_parse.LITERAL):
                    inc = False
            if inc:
                strings = 1
        return strings

    return ret
253 | 
254 | 
def _randone(d, limit=20, grouprefs=None):
    """Build one random string for a parsed regex.

    :param d: sequence of parsed regex items, as produced by ``parse``
    :param limit: cap on the span of repeat counts considered per quantifier
    :param grouprefs: dict holding the text generated for each group so far,
        used to resolve back-references
    :returns: the generated string
    """
    if grouprefs is None:
        grouprefs = {}
    ret = ''
    for i in d:
        if i[0] == sre_parse.IN:
            ret += choice(_in(i[1]))
        elif i[0] == sre_parse.LITERAL:
            ret += unichr(i[1])
        elif i[0] == sre_parse.CATEGORY:
            ret += choice(CATEGORIES.get(i[1], ['']))
        elif i[0] == sre_parse.ANY:
            ret += choice(CATEGORIES['category_any'])
        elif i[0] == sre_parse.MAX_REPEAT or i[0] == sre_parse.MIN_REPEAT:
            # Clamp the span of repeat counts to 'limit' values; the bounds
            # are inclusive because randint includes both ends.
            if i[1][1] + 1 - i[1][0] >= limit:
                lo, hi = i[1][0], i[1][0] + limit - 1
            else:
                lo, hi = i[1][0], i[1][1]
            for _ in range(randint(lo, hi)):
                ret += _randone(list(i[1][2]), limit, grouprefs)
        elif i[0] == sre_parse.BRANCH:
            ret += _randone(choice(i[1][1]), limit, grouprefs)
        elif i[0] == sre_parse.SUBPATTERN or i[0] == sre_parse.ASSERT:
            subexpr = i[1][1]
            # On Python 3.6+ the sub-expression of a SUBPATTERN is read from
            # index 3 instead of index 1.
            if IS_PY36_OR_GREATER and i[0] == sre_parse.SUBPATTERN:
                subexpr = i[1][3]
            subp = _randone(subexpr, limit, grouprefs)
            if i[1][0]:
                grouprefs[i[1][0]] = subp
            ret += subp
        elif i[0] == sre_parse.AT:
            continue
        elif i[0] == sre_parse.NOT_LITERAL:
            c = list(CATEGORIES['category_any'])
            if unichr(i[1]) in c:
                c.remove(unichr(i[1]))
            ret += choice(c)
        elif i[0] == sre_parse.GROUPREF:
            ret += grouprefs[i[1]]
        elif i[0] == sre_parse.ASSERT_NOT:
            pass
        else:
            print('[!] cannot handle expression "%s"' % str(i))

    return ret
301 | 
302 | 
def sre_to_string(sre_obj, paren=True):
    """sre_parse object to string

    :param sre_obj: Output of sre_parse.parse()
    :type sre_obj: list
    :param paren: whether alternations are wrapped in parentheses
    :type paren: bool
    :rtype: str
    """
    ret = u''
    for i in sre_obj:
        if i[0] == sre_parse.IN:
            prefix = ''
            if len(i[1]) and i[1][0][0] == sre_parse.NEGATE:
                prefix = '^'
            ret += u'[{0}{1}]'.format(prefix, sre_to_string(i[1], paren=paren))
        elif i[0] == sre_parse.LITERAL:
            u = unichr(i[1])
            ret += u if u not in sre_parse.SPECIAL_CHARS else '\\{0}'.format(u)
        elif i[0] == sre_parse.CATEGORY:
            ret += REVERSE_CATEGORIES[i[1]]
        elif i[0] == sre_parse.ANY:
            ret += '.'
        elif i[0] == sre_parse.BRANCH:
            # TODO simplifications here
            parts = [sre_to_string(x, paren=paren) for x in i[1][1]]
            if not any(parts):
                continue
            if i[1][0]:
                if len(parts) == 1:
                    paren = False
                prefix = ''
            else:
                prefix = '?:'
            branch = '|'.join(parts)
            if paren:
                ret += '({0}{1})'.format(prefix, branch)
            else:
                ret += '{0}'.format(branch)
        elif i[0] == sre_parse.SUBPATTERN:
            subexpr = i[1][1]
            # On Python 3.6+ the sub-expression of a SUBPATTERN is read from
            # index 3 instead of index 1.
            if IS_PY36_OR_GREATER and i[0] == sre_parse.SUBPATTERN:
                subexpr = i[1][3]
            if i[1][0]:
                ret += '({0})'.format(sre_to_string(subexpr, paren=False))
            else:
                ret += '{0}'.format(sre_to_string(subexpr, paren=paren))
        elif i[0] == sre_parse.NOT_LITERAL:
            ret += '[^{0}]'.format(unichr(i[1]))
        elif i[0] == sre_parse.MAX_REPEAT:
            if i[1][0] == i[1][1]:
                range_str = '{{{0}}}'.format(i[1][0])
            else:
                if i[1][0] == 0 and i[1][1] - i[1][0] == sre_parse.MAXREPEAT:
                    range_str = '*'
                elif i[1][0] == 1 and i[1][1] - i[1][0] == sre_parse.MAXREPEAT - 1:
                    range_str = '+'
                elif i[1][1] == sre_parse.MAXREPEAT:
                    # Open-ended repeat such as a{2,}: writing MAXREPEAT out
                    # as a number would give a pattern that cannot be parsed
                    # again, so emit the open form (as MIN_REPEAT does below).
                    range_str = '{{{0},}}'.format(i[1][0])
                else:
                    range_str = '{{{0},{1}}}'.format(i[1][0], i[1][1])
            ret += sre_to_string(i[1][2], paren=paren) + range_str
        elif i[0] == sre_parse.MIN_REPEAT:
            if i[1][0] == 0 and i[1][1] == sre_parse.MAXREPEAT:
                range_str = '*?'
            elif i[1][0] == 1 and i[1][1] == sre_parse.MAXREPEAT:
                range_str = '+?'
            elif i[1][1] == sre_parse.MAXREPEAT:
                range_str = '{{{0},}}?'.format(i[1][0])
            else:
                range_str = '{{{0},{1}}}?'.format(i[1][0], i[1][1])
            ret += sre_to_string(i[1][2], paren=paren) + range_str
        elif i[0] == sre_parse.GROUPREF:
            ret += '\\{0}'.format(i[1])
        elif i[0] == sre_parse.AT:
            if i[1] == sre_parse.AT_BEGINNING:
                ret += '^'
            elif i[1] == sre_parse.AT_END:
                ret += '$'
        elif i[0] == sre_parse.NEGATE:
            # The '^' is emitted by the enclosing IN branch.
            pass
        elif i[0] == sre_parse.RANGE:
            ret += '{0}-{1}'.format(unichr(i[1][0]), unichr(i[1][1]))
        elif i[0] == sre_parse.ASSERT:
            if i[1][0]:
                ret += '(?={0})'.format(sre_to_string(i[1][1], paren=False))
            else:
                ret += '{0}'.format(sre_to_string(i[1][1], paren=paren))
        elif i[0] == sre_parse.ASSERT_NOT:
            pass
        else:
            print('[!] cannot handle expression "%s"' % str(i))
    return ret
392 | 
393 | 
def simplify(regex_string):
    """Simplify a regular expression

    Parses the expression and renders the parsed form back to text.

    :param regex_string: Regular expression
    :type regex_string: str
    :rtype: str
    """
    return sre_to_string(parse(regex_string))
403 | 
404 | 
def parse(s):
    """Regular expression parser

    On Python 2 the input is decoded from UTF-8 before parsing.

    :param s: Regular expression
    :type s: str
    :rtype: list
    """
    source = s if IS_PY3 else s.decode('utf-8')
    return list(sre_parse.parse(source, flags=U))
417 | 
418 | 
def generate(s, limit=20):
    """Creates a generator that generates all matching strings to a given regular expression

    :param s: Regular expression
    :type s: str
    :param limit: Range limit
    :type limit: int
    :returns: string generator object
    """
    parsed = parse(s)
    return _gen(parsed, limit)
429 | 
430 | 
def count(s, limit=20):
    """Counts all matching strings to a given regular expression

    :param s: Regular expression
    :type s: str
    :param limit: Range limit
    :type limit: int
    :rtype: int
    :returns: number of matching strings
    """
    parsed = parse(s)
    return _gen(parsed, limit, count=True)
442 | 
443 | 
def getone(regex_string, limit=20):
    """Returns a random matching string to a given regular expression

    :param regex_string: Regular expression
    :param limit: Range limit
    """
    parsed = parse(regex_string)
    return _randone(parsed, limit)
448 | 
449 | 
def argparser():
    """Parse the exrex command line and return the options as a dict.

    Keys: ``output``, ``limit``, ``count``, ``max_number``, ``random``,
    ``simplify``, ``delimiter``, ``verbose`` and ``regex``.
    """
    import argparse
    from sys import stdout

    cli = argparse.ArgumentParser(
        description='exrex - regular expression string generator')

    cli.add_argument(
        '-o', '--output',
        metavar='FILE',
        type=argparse.FileType('w', encoding='utf-8'),
        default=stdout,
        help='Output file - default is STDOUT')
    cli.add_argument(
        '-l', '--limit',
        metavar='N',
        type=int,
        action='store',
        default=20,
        help='Max limit for range size - default is 20')
    cli.add_argument(
        '-c', '--count',
        action='store_true',
        default=False,
        help='Count matching strings')
    cli.add_argument(
        '-m', '--max-number',
        metavar='N',
        type=int,
        action='store',
        default=-1,
        help='Max number of strings - default is -1')
    cli.add_argument(
        '-r', '--random',
        action='store_true',
        default=False,
        help='Returns a random string that matches to the regex')
    cli.add_argument(
        '-s', '--simplify',
        action='store_true',
        default=False,
        help='Simplifies a regular expression')
    cli.add_argument(
        '-d', '--delimiter',
        default='\n',
        help='Delimiter - default is \\n')
    cli.add_argument(
        '-v', '--verbose',
        action='store_true',
        default=False,
        help='Verbose mode')
    cli.add_argument(
        'regex',
        metavar='REGEX',
        help='REGEX string')

    options = cli.parse_args()
    return vars(options)
513 | 
514 | 
def __main__():
    """Command-line entry point.

    Depending on the flags, writes the parsed form (verbose), the number of
    matching strings, one random match, or the simplified expression, and
    exits. Otherwise writes the generated strings separated by the
    delimiter, stopping after ``max_number`` strings when that option is
    positive.
    """
    from sys import exit, stderr
    args = argparser()
    if args['verbose']:
        args['output'].write(
            '%r%s' % (parse(args['regex']), args['delimiter']))
    if args['count']:
        args['output'].write(
            '%d%s' % (count(args['regex'], limit=args['limit']), args['delimiter']))
        exit(0)
    if args['random']:
        args['output'].write(
            '%s%s' % (getone(args['regex'], limit=args['limit']), args['delimiter']))
        exit(0)
    if args['simplify']:
        args['output'].write(
            '%s%s' % (simplify(args['regex']), args['delimiter']))
        exit(0)
    try:
        # generate() hands back a plain list for patterns that contribute no
        # items (e.g. '' or '^$'); iter() makes next() valid in every case.
        g = iter(generate(args['regex'], args['limit']))
    except Exception as e:
        stderr.write('[!] Error: %s\n' % e)
        exit(1)
    # The first string is written without a leading delimiter. A pattern
    # that yields nothing produces no output rather than a StopIteration
    # traceback.
    no_result = object()
    first = next(g, no_result)
    if first is no_result:
        return
    args['output'].write(first)
    args['max_number'] -= 1
    for s in g:
        if args['max_number'] == 0:
            break
        args['max_number'] -= 1
        args['output'].write(args['delimiter'])
        args['output'].write(s)
    if args['delimiter'] == '\n':
        args['output'].write('\n')
549 | 
550 | if __name__ == '__main__':
551 |     __main__()
552 | 


--------------------------------------------------------------------------------
/scripts/regex_test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 |   This program generates random text that matches a given regex-pattern.
 5 |   The pattern is given via sys.argv and the generated text is passed to
 6 |   the binary 'tests/test_rand' to check if the generated text also matches
 7 |   the regex-pattern in the C implementation.
 8 |   The exit-code of the testing program, is used to determine test success.
 9 | 
10 |   This script is called by the Makefile when doing 'make test'
11 | """
12 | 
13 | 
14 | import re
15 | import sys
16 | import exrex
17 | from subprocess import call
18 | 
19 | 
def get_one(pattern):
  """Return a generated example that Python's re-module also accepts.

  Keeps drawing random examples for the pattern until re.match agrees that
  the example matches.
  """
  while True:
    candidate = exrex.getone(pattern)
    if re.match(pattern, candidate):
      return candidate
27 | 
28 | prog = "./tests/test_rand"
29 | 
30 | if len(sys.argv) < 2:
31 |   print("")
32 |   print("usage: %s pattern [nrepeat]" % sys.argv[0])
33 |   print("  where [nrepeat] is optional")
34 |   print("")
35 |   sys.exit(-1)
36 | 
37 | own_prog = sys.argv[0]
38 | pattern = sys.argv[1]
39 | if len(sys.argv) > 2:
40 |   ntests = int(sys.argv[2])
41 | else:
42 |   ntests = 10
43 | nfails = 0
44 | repeats = ntests
45 | 
46 | 
47 | try:
48 |   repeats = int(sys.argv[2])
49 | except:
50 |   pass
51 | 
52 | 
53 | sys.stdout.write("%-35s" % ("  pattern '%s': " % pattern))
54 | 
55 | 
56 | while repeats >= 0:
57 |   try:
58 |     repeats -= 1
59 |     example = get_one(pattern)
60 |     #print("%s %s %s" % (prog, pattern, example))
61 |     ret = call([prog, "\"%s\"" % pattern, "\"%s\"" % example])
62 |     if ret != 0:
63 |       escaped = repr(example) # escapes special chars for better printing
64 |       print("    FAIL : doesn't match %s as expected [%s]." % (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]) ))
65 |       nfails += 1
66 | 
67 |   except:
68 |     import traceback
69 |     print("EXCEPTION!", traceback.format_exc())
70 |     ntests -= 1
71 |     repeats += 1
72 |     #nfails += 1
73 | 
74 | sys.stdout.write("%4d/%d tests succeeded \n" % (ntests - nfails, ntests))
75 | #print("")
76 | 
77 | 


--------------------------------------------------------------------------------
/scripts/regex_test_neg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 |   This program generates random text that does NOT match a given regex-pattern.
 5 |   The pattern is given via sys.argv and the generated text is passed to
 6 |   the binary 'tests/test_rand_neg' to check that the generated text is also
 7 |   rejected by the regex-pattern in the C implementation.
 8 |   The exit-code of the testing program is used to determine test success.
 9 | 
10 |   This script is called by the Makefile when doing 'make test'
11 | """
12 | 
13 | 
14 | import re
15 | import sys
16 | import string
17 | import random
18 | from subprocess import call
19 | 
20 | 
21 | prog = "./tests/test_rand_neg"
22 | 
23 | if len(sys.argv) < 2:
24 |   print("")
25 |   print("usage: %s pattern [nrepeat]" % sys.argv[0])
26 |   print("  where [nrepeat] is optional")
27 |   print("")
28 |   sys.exit(-1)
29 | 
30 | own_prog = sys.argv[0]
31 | pattern = sys.argv[1]
32 | if len(sys.argv) > 2:
33 |   ntests = int(sys.argv[2])
34 | else:
35 |   ntests = 10
36 | nfails = 0
37 | repeats = ntests
38 | 
39 | 
40 | try:
41 |   repeats = int(sys.argv[2])
42 | except:
43 |   pass
44 | 
45 | sys.stdout.write("%-35s" % ("  pattern '%s': " % pattern))
46 | 
47 | 
48 | 
49 | 
50 | def gen_no_match(pattern, minlen=1, maxlen=50, maxattempts=500):
51 |   nattempts = 0
52 |   while True:
53 |     nattempts += 1
54 |     ret = "".join([random.choice(string.printable) for i in range(random.Random().randint(minlen, maxlen))])
55 |     if re.findall(pattern, ret) == []:
56 |       return ret
57 |     if nattempts >= maxattempts:
58 |       raise Exception("Could not generate string that did not match the regex pattern '%s' after %d attempts" % (pattern, nattempts))
59 | 
60 | 
61 | 
62 | while repeats >= 0:
63 |   try:
64 |     repeats -= 1
65 |     example = gen_no_match(pattern)
66 |     #print("%s %s %s" % (prog, pattern, example))
67 |     ret = call([prog, "\"%s\"" % pattern, "\"%s\"" % example])
68 |     if ret != 0:
69 |       escaped = repr(example) # escapes special chars for better printing
70 |       print("    FAIL : matches %s unexpectedly [%s]." % (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]) ))
71 |       nfails += 1
72 | 
73 |   except:
74 |     #import traceback
75 |     #print("EXCEPTION!")
76 |     #raw_input(traceback.format_exc())
77 |     ntests -= 1
78 |     repeats += 1
79 |     #nfails += 1
80 | 
81 | sys.stdout.write("%4d/%d tests succeeded \n" % (ntests - nfails, ntests))
82 | #print("")
83 | 


--------------------------------------------------------------------------------
/tests/test1.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Testing various regex-patterns
  3 |  */
  4 | 
  5 | #include <stdio.h>
  6 | #include <string.h>
  7 | #include "re.h"
  8 | 
  9 | 
 10 | #define OK    ((char*) 1)
 11 | #define NOK   ((char*) 0)
 12 | /* One row per test case: { OK/NOK (is a match expected?), pattern, text, expected match length }. */
 13 | /* The OK/NOK flag and the length are integers stored in char* slots; main() casts them back.    */
 14 | char* test_vector[][4] =
 15 | {
 16 |   { OK,  "\\d",                       "5",                (char*) 1      },
 17 |   { OK,  "\\w+",                      "hej",              (char*) 3      },
 18 |   { OK,  "\\s",                       "\t \n",            (char*) 1      },
 19 |   { NOK, "\\S",                       "\t \n",            (char*) 0      },
 20 |   { OK,  "[\\s]",                     "\t \n",            (char*) 1      },
 21 |   { NOK, "[\\S]",                     "\t \n",            (char*) 0      },
 22 |   { NOK, "\\D",                       "5",                (char*) 0      },
 23 |   { NOK, "\\W+",                      "hej",              (char*) 0      },
 24 |   { OK,  "[0-9]+",                    "12345",            (char*) 5      },
 25 |   { OK,  "\\D",                       "hej",              (char*) 1      },
 26 |   { NOK, "\\d",                       "hej",              (char*) 0      },
 27 |   { OK,  "[^\\w]",                    "\\",               (char*) 1      },
 28 |   { OK,  "[\\W]",                     "\\",               (char*) 1      },
 29 |   { NOK, "[\\w]",                     "\\",               (char*) 0      },
 30 |   { OK,  "[^\\d]",                    "d",                (char*) 1      },
 31 |   { NOK, "[\\d]",                     "d",                (char*) 0      },
 32 |   { NOK, "[^\\D]",                    "d",                (char*) 0      },
 33 |   { OK,  "[\\D]",                     "d",                (char*) 1      },
 34 |   { OK,  "^.*\\\\.*$",                "c:\\Tools",        (char*) 8      },
 35 |   { OK,  "^.*\\\\.*$",                "c:\\Tools",        (char*) 8      },
 36 |   { OK,  ".?\\w+jsj$",                "%JxLLcVx8wxrjsj",  (char*) 15     },
 37 |   { OK,  ".?\\w+jsj$",                "=KbvUQjsj",        (char*) 9      },
 38 |   { OK,  ".?\\w+jsj$",                "^uDnoZjsj",        (char*) 9      },
 39 |   { OK,  ".?\\w+jsj$",                "UzZbjsj",          (char*) 7      },
 40 |   { OK,  ".?\\w+jsj$",                "\"wjsj",           (char*) 5      },
 41 |   { OK,  ".?\\w+jsj$",                "zLa_FTEjsj",       (char*) 10     },
 42 |   { OK,  ".?\\w+jsj$",                "\"mw3p8_Ojsj",     (char*) 11     },
 43 |   { OK,  "^[\\+-]*[\\d]+$",           "+27",              (char*) 3      },
 44 |   { OK,  "[abc]",                     "1c2",              (char*) 1      },
 45 |   { NOK, "[abc]",                     "1C2",              (char*) 0      },
 46 |   { OK,  "[1-5]+",                    "0123456789",       (char*) 5      },
 47 |   { OK,  "[.2]",                      "1C2",              (char*) 1      },
 48 |   { OK,  "a*$",                       "Xaa",              (char*) 2      },
 49 |   { OK,  "a*$",                       "Xaa",              (char*) 2      },
 50 |   { OK,  "[a-h]+",                    "abcdefghxxx",      (char*) 8      },
 51 |   { NOK, "[a-h]+",                    "ABCDEFGH",         (char*) 0      },
 52 |   { OK,  "[A-H]+",                    "ABCDEFGH",         (char*) 8      },
 53 |   { NOK, "[A-H]+",                    "abcdefgh",         (char*) 0      },
 54 |   { OK,  "[^\\s]+",                   "abc def",          (char*) 3      },
 55 |   { OK,  "[^fc]+",                    "abc def",          (char*) 2      },
 56 |   { OK,  "[^d\\sf]+",                 "abc def",          (char*) 3      },
 57 |   { OK,  "\n",                        "abc\ndef",         (char*) 1      },
 58 |   { OK,  "b.\\s*\n",                  "aa\r\nbb\r\ncc\r\n\r\n",(char*) 4      },
 59 |   { OK,  ".*c",                       "abcabc",           (char*) 6      },
 60 |   { OK,  ".+c",                       "abcabc",           (char*) 6      },
 61 |   { OK,  "[b-z].*",                   "ab",               (char*) 1      },
 62 |   { OK,  "b[k-z]*",                   "ab",               (char*) 1      },
 63 |   { NOK, "[0-9]",                     "  - ",             (char*) 0      },
 64 |   { OK,  "[^0-9]",                    "  - ",             (char*) 1      },
 65 |   { OK,  "0|",                        "0|",               (char*) 2      },
 66 |   { NOK, "\\d\\d:\\d\\d:\\d\\d",      "0s:00:00",         (char*) 0      },
 67 |   { NOK, "\\d\\d:\\d\\d:\\d\\d",      "000:00",           (char*) 0      },
 68 |   { NOK, "\\d\\d:\\d\\d:\\d\\d",      "00:0000",          (char*) 0      },
 69 |   { NOK, "\\d\\d:\\d\\d:\\d\\d",      "100:0:00",         (char*) 0      },
 70 |   { NOK, "\\d\\d:\\d\\d:\\d\\d",      "00:100:00",        (char*) 0      },
 71 |   { NOK, "\\d\\d:\\d\\d:\\d\\d",      "0:00:100",         (char*) 0      },
 72 |   { OK,  "\\d\\d?:\\d\\d?:\\d\\d?",   "0:0:0",            (char*) 5      },
 73 |   { OK,  "\\d\\d?:\\d\\d?:\\d\\d?",   "0:00:0",           (char*) 6      },
 74 |   { OK,  "\\d\\d?:\\d\\d?:\\d\\d?",   "0:0:00",           (char*) 5      },
 75 |   { OK,  "\\d\\d?:\\d\\d?:\\d\\d?",   "00:0:0",           (char*) 6      },
 76 |   { OK,  "\\d\\d?:\\d\\d?:\\d\\d?",   "00:00:0",          (char*) 7      },
 77 |   { OK,  "\\d\\d?:\\d\\d?:\\d\\d?",   "00:0:00",          (char*) 6      },
 78 |   { OK,  "\\d\\d?:\\d\\d?:\\d\\d?",   "0:00:00",          (char*) 6      },
 79 |   { OK,  "\\d\\d?:\\d\\d?:\\d\\d?",   "00:00:00",         (char*) 7      },
 80 |   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !",    (char*) 12     },
 81 |   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "hello world !",    (char*) 12     },
 82 |   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello World !",    (char*) 12     },
 83 |   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello world!   ",  (char*) 11     },
 84 |   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello world  !",   (char*) 13     },
 85 |   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "hello World    !", (char*) 15     },
 86 |   { NOK, "\\d\\d?:\\d\\d?:\\d\\d?",   "a:0",              (char*) 0      }, /* Failing test case reported in https://github.com/kokke/tiny-regex-c/issues/12 */
 87 | /*
 88 |   { OK,  "[^\\w][^-1-4]",     ")T",          (char*) 2      },
 89 |   { OK,  "[^\\w][^-1-4]",     ")^",          (char*) 2      },
 90 |   { OK,  "[^\\w][^-1-4]",     "*)",          (char*) 2      },
 91 |   { OK,  "[^\\w][^-1-4]",     "!.",          (char*) 2      },
 92 |   { OK,  "[^\\w][^-1-4]",     " x",          (char*) 2      },
 93 |   { OK,  "[^\\w][^-1-4]",     "$b",          (char*) 2      },
 94 | */
 95 |   { OK,  ".?bar",                      "real_bar",        (char*) 4      },
 96 |   { NOK, ".?bar",                      "real_foo",        (char*) 0      },
 97 |   { NOK, "X?Y",                        "Z",               (char*) 0      },
 98 |   { OK, "[a-z]+\nbreak",              "blahblah\nbreak",  (char*) 14     },
 99 |   { OK, "[a-z\\s]+\nbreak",           "bla bla \nbreak",  (char*) 14     },
100 | };
101 | 
102 | 
103 | void re_print(re_t);
104 | 
105 | int main(void)
106 | {
107 |     /* Runs every row of test_vector through re_match(); returns the number of failed rows. */
108 |     char *pattern, *text;
109 |     int should_fail, length, correctlen;
110 |     const size_t ntests = sizeof(test_vector) / sizeof(*test_vector);
111 |     size_t nfailed = 0;
112 |     size_t i;
113 |     /* size_t values are cast to unsigned long for printing: "%lu" with a bare size_t is undefined where the types differ */
114 |     const unsigned long total = (unsigned long) ntests;
115 | 
116 |     for (i = 0; i < ntests; ++i)
117 |     {
118 |         const unsigned long testno = (unsigned long) (i + 1);
119 |         int m;
120 | 
121 |         pattern = test_vector[i][1];
122 |         text = test_vector[i][2];
123 |         should_fail = (test_vector[i][0] == NOK);
124 |         /* the expected length is stored as a fake pointer; convert via size_t instead of casting a pointer straight to int */
125 |         correctlen = (int)(size_t)(test_vector[i][3]);
126 | 
127 |         m = re_match(pattern, text, &length);
128 | 
129 |         if (should_fail)
130 |         {
131 |             if (m != (-1))
132 |             {
133 |                 printf("\n");
134 |                 re_print(re_compile(pattern));
135 |                 fprintf(stderr, "[%lu/%lu]: pattern '%s' matched '%s' unexpectedly, matched %i chars. \n", testno, total, pattern, text, length);
136 |                 nfailed += 1;
137 |             }
138 |         }
139 |         else
140 |         {
141 |             if (m == (-1))
142 |             {
143 |                 printf("\n");
144 |                 re_print(re_compile(pattern));
145 |                 fprintf(stderr, "[%lu/%lu]: pattern '%s' didn't match '%s' as expected. \n", testno, total, pattern, text);
146 |                 nfailed += 1;
147 |             }
148 |             else if (length != correctlen)
149 |             {
150 |                 fprintf(stderr, "[%lu/%lu]: pattern '%s' matched '%i' chars of '%s'; expected '%i'. \n", testno, total, pattern, length, text, correctlen);
151 |                 nfailed += 1;
152 |             }
153 |         }
154 |     }
155 | 
156 |     printf("%lu/%lu tests succeeded.\n", (unsigned long) (ntests - nfailed), total);
157 |     printf("\n\n\n");
158 |     return (int) nfailed; /* 0 if all tests passed */
159 | }
160 | 


--------------------------------------------------------------------------------
/tests/test_compile.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 | This file tests two bug patterns reported by @DavidKorczynski in https://github.com/kokke/tiny-regex-c/issues/44
 4 | 
 5 | */
 6 | 
 7 | #include <assert.h>
 8 | #include <stdlib.h> /* for NULL */
 9 | #include "re.h"
10 | 
11 | 
12 | int main()
13 | {
14 |   /* Test 1: an inverted set that is never closed by ']' must be rejected */
15 |   re_t unclosed_inverted_set = re_compile("\\\x01[^\\\xff][^");
16 |   /* Test 2: a set ending in an incomplete escape sequence, again without ']', must be rejected too */
17 |   re_t unclosed_escaped_set = re_compile("\\\x01[^\\\xff][\\");
18 |   assert(unclosed_inverted_set == NULL);
19 |   assert(unclosed_escaped_set == NULL);
20 |   return 0;
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/tests/test_print.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |     This program prints out a verbose explanation of a given regular expression.
 3 | */
 4 | 
 5 | #include <stdio.h>
 6 | #include "re.h"
 7 | 
 8 | 
 9 | int main(int argc, char** argv)
10 | {
11 |   re_t compiled;
12 |   if (argc != 2)
13 |     printf("\nUsage: %s <PATTERN> \n", argv[0]);
14 |   /* re_compile() yields NULL for a pattern it rejects; do not hand that on to re_print() */
15 |   else if ((compiled = re_compile(argv[1])) != NULL)
16 |     re_print(compiled);
17 |   else
18 |     fprintf(stderr, "Could not compile pattern '%s'\n", argv[1]);
19 |   return -2; /* NOTE(review): also returned after a successful print -- confirm this is intended */
20 | }
21 | 
22 | 


--------------------------------------------------------------------------------
/tests/test_rand.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |     This program tries to match a given regular expression with a text, both given as command-line arguments.
 3 |     If the text is a match for the pattern, the program returns 0.
 4 |     If the text doesn't match the pattern, the program returns -2.
 5 | 
 6 |     This program is used in random testing to test a lot of random text and regex together.
 7 |     See ./scripts/regex_test.py and the Makefile for this project for the gritty details.
 8 | */
 9 | 
10 | #include <stdio.h>
11 | #include "re.h"
12 | 
13 | 
14 | int main(int argc, char** argv)
15 | {
16 |   int length;
17 | 
18 |   /* Exactly two arguments are required: the pattern and the text */
19 |   if (argc != 3)
20 |   {
21 |     printf("\nUsage: %s <PATTERN> <TEXT> \n", argv[0]);
22 |     return -2;
23 |   }
24 | 
25 |   /* re_match() reports -1 when nothing matched;
26 |      any other result means the text matched, which is signalled with exit code 0 */
27 |   return (re_match(argv[1], argv[2], &length) != -1) ? 0 : -2;
28 | }
29 | 
30 | 


--------------------------------------------------------------------------------
/tests/test_rand_neg.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Negative version of test_rand.c -- returns true if no match
 3 | 
 4 |     This program tries to match a given regular expression with a text, both given as command-line arguments.
 5 |     If the text is NOT a match for the pattern, the program returns 0.
 6 |     If the text does match the pattern, the program returns -2.
 7 | 
 8 |     This program is used in random testing to test a lot of random text and regex together.
 9 |     See ./scripts/regex_test_neg.py and the Makefile for this project for the gritty details.
10 | */
11 | 
12 | #include <stdio.h>
13 | #include "re.h"
14 | 
15 | 
16 | int main(int argc, char** argv)
17 | {
18 |   int length;
19 | 
20 |   /* Exactly two arguments are required: the pattern and the text */
21 |   if (argc != 3)
22 |   {
23 |     printf("\nUsage: %s <PATTERN> <TEXT> \n", argv[0]);
24 |     return -2;
25 |   }
26 | 
27 |   /* Negative test: re_match() reporting -1 (no match) is the expected outcome
28 |      and yields exit code 0; a match of any kind yields -2 */
29 |   return (re_match(argv[1], argv[2], &length) == -1) ? 0 : -2;
30 | }
31 | 


--------------------------------------------------------------------------------