├── .gitignore ├── clib.json ├── tests ├── test_print.c ├── test_end_anchor.c ├── nok.lst ├── test_rand_neg.c ├── test_rand.c ├── ok.lst ├── test_compile.c └── test1.c ├── .github └── workflows │ └── c-cpp.yml ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── scripts ├── regex_test.py ├── regex_test_neg.py ├── exrex.py └── exrex_uni.py ├── re.h ├── README.md ├── formal_verification.md └── re.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | .gdbinit 4 | /tests/* 5 | !/tests/*.c 6 | -------------------------------------------------------------------------------- /clib.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tiny-regex-c", 3 | "version": "0.1.0", 4 | "repo": "kokke/tiny-regex-c", 5 | "keywords": ["tiny", "regex", "pcre"], 6 | "license": "Public Domain", 7 | "makefile": "Makefile", 8 | "src": [ 9 | "re.h", 10 | "re.c" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /tests/test_print.c: -------------------------------------------------------------------------------- 1 | /* 2 | This program prints out a verbose explanation of a given regular expression. 3 | */ 4 | 5 | #include 6 | #include "re.h" 7 | 8 | 9 | int main(int argc, char** argv) 10 | { 11 | if (argc == 2) 12 | { 13 | re_print(re_compile(argv[1])); 14 | } 15 | else 16 | { 17 | printf("\nUsage: %s \n", argv[0]); 18 | } 19 | return -2; 20 | } 21 | 22 | -------------------------------------------------------------------------------- /tests/test_end_anchor.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "re.h" 4 | 5 | int main() { 6 | 7 | const char *text = "table football"; 8 | const char *pattern = "l$"; 9 | int index,len; 10 | 11 | index = re_match(pattern, text, &len); 12 | 13 | if (index==13 && len==1) { 14 | return 0; 15 | } else { 16 | printf("ERROR! 
index=%d len=%d \n",index,len); 17 | return -1; 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /tests/nok.lst: -------------------------------------------------------------------------------- 1 | \S "\t \n" 0 2 | [\S] "\t \n" 0 3 | \D "5" 0 4 | \W+ "hej" 0 5 | \d "hej" 0 6 | [\w] "\" 0 7 | [\d] "d" 0 8 | [^\D] "d" 0 9 | [abc] "1C2" 0 10 | [a-h]+ "ABCDEFGH" 0 11 | [A-H]+ "abcdefgh" 0 12 | [0-9] " - " 0 13 | \s+[a-zA-Z0-9?]* "xyz" 0 14 | \d\d:\d\d:\d\d "0s:00:00" 0 15 | \d\d:\d\d:\d\d "000:00" 0 16 | \d\d:\d\d:\d\d "00:0000" 0 17 | \d\d:\d\d:\d\d "100:0:00" 0 18 | \d\d:\d\d:\d\d "00:100:00" 0 19 | \d\d:\d\d:\d\d "0:00:100" 0 20 | \d\d?:\d\d?:\d\d? "a:0" 0 21 | .?bar "real_foo" 0 22 | X?Y "Z" 0 23 | a\ "a\" 0 24 | \ "\" 0 25 | \w{3} "ab" 0 26 | \w{3,} "ab" 0 27 | \w{3,4} "ab" 0 28 | a^ "ba" 0 29 | -------------------------------------------------------------------------------- /.github/workflows/c-cpp.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | branches: [ master ] 7 | 8 | # Allows you to run this workflow manually from the Actions tab 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: make clean 19 | run: make clean 20 | - name: make all 21 | run: make all 22 | - name: make test 23 | run: make test 24 | - name: setup cbmc 25 | run: sudo apt install -y --no-install-recommends cbmc 26 | - name: make verify with cbmc 27 | run: make verify 28 | -------------------------------------------------------------------------------- /tests/test_rand_neg.c: -------------------------------------------------------------------------------- 1 | /* 2 | Negative version of test_rand.c -- returns true if no match 3 | 4 | This program tries to match a given regular expression with text given as input to stdin. 
5 | If the text is NOT a match for the pattern, the program returns 0. 6 | If the text does match the pattern, the program returns -2. 7 | 8 | This program is used in random testing to test a lot of random text and regex together. 9 | See ./scripts/regex_test_neg.py and the Makefile for this project for the gritty details. 10 | */ 11 | 12 | #include 13 | #include "re.h" 14 | 15 | 16 | int main(int argc, char** argv) 17 | { 18 | int length; 19 | if (argc == 3) 20 | { 21 | int m = re_match(argv[1], argv[2], &length); 22 | if (m == -1) 23 | return 0; 24 | } 25 | else 26 | { 27 | printf("\nUsage: %s \n", argv[0]); 28 | } 29 | return -2; 30 | } 31 | -------------------------------------------------------------------------------- /tests/test_rand.c: -------------------------------------------------------------------------------- 1 | /* 2 | This program tries to match a given regular expression with text given as input to stdin. 3 | If the text is a match for the pattern, the program returns 0. 4 | If the text doesn't match the pattern, the program returns -2. 5 | 6 | This program is used in random testing to test a lot of random text and regex together. 7 | See ./scripts/regex_test.py and the Makefile for this project for the gritty details. 8 | */ 9 | 10 | #include 11 | #include "re.h" 12 | 13 | void re_print(re_t); 14 | 15 | int main(int argc, char** argv) 16 | { 17 | int length; 18 | if (argc == 3) 19 | { 20 | int m = re_match(argv[1], argv[2], &length); 21 | if (m != -1) 22 | return 0; 23 | printf("\n"); 24 | re_print(re_compile(argv[1])); 25 | fprintf(stderr, "pattern '%s' didn't match '%s' as expected. 
\n", argv[1], argv[2]); 26 | } 27 | else 28 | { 29 | printf("\nUsage: %s \n", argv[0]); 30 | } 31 | return -2; 32 | } 33 | 34 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | set(CMAKE_C_STANDARD 17) 4 | set(CMAKE_C_STANDARD_REQUIRED ON) 5 | set(CMAKE_C_EXTENSIONS OFF) 6 | 7 | option(TINY_REGEX_C_DOT_MATCHES_NEWLINE "'.' matches '\r' + '\n'" ON) 8 | option(TINY_REGEX_C_RECURSIVE_MATCHING "Enable recursive matching" OFF) 9 | 10 | if(!TINY_REGEX_C_DOT_MATCHES_NEWLINE) 11 | add_definitions(-DRE_DOT_MATCHES_NEWLINE=0) 12 | endif() 13 | 14 | if(TINY_REGEX_C_RECURSIVE_MATCHING) 15 | add_definitions(-DRECURSIVE_RE) 16 | endif() 17 | 18 | project(TinyRegexC 19 | VERSION 20 | "0.0.0" 21 | DESCRIPTION 22 | "Small, portable regex pattern matcher in C." 23 | HOMEPAGE_URL 24 | "https://github.com/kokke/tiny-regex-c" 25 | LANGUAGES C) 26 | 27 | add_library(TinyRegexC STATIC 28 | re.h 29 | re.c) 30 | 31 | target_include_directories(libGimbal 32 | PUBLIC 33 | $ 34 | $ 35 | $ 36 | PRIVATE 37 | ${CMAKE_CURRENT_SOURCE_DIR} 38 | ) 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. 
We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /tests/ok.lst: -------------------------------------------------------------------------------- 1 | \d "5" 1 2 | \w+ "hej" 3 3 | \s "\t \n" 1 4 | [\s] "\t \n" 1 5 | [0-9]+ "12345" 5 6 | \D "hej" 1 7 | [^\w] "\" 1 8 | [\W] "\" 1 9 | [^\d] "d" 1 10 | [\D] "d" 1 11 | ^.*\\.*$ "c:\Tools" 8 12 | ^[\+-]*[\d]+$ "+27" 3 13 | [abc] "1c2" 1 14 | [1-5]+ "0123456789" 5 15 | [.2] "1C2" 1 16 | a*$ "Xaa" 2 17 | a*$ "Xaa" 2 18 | [a-h]+ "abcdefghxxx" 8 19 | [A-H]+ "ABCDEFGH" 8 20 | [^\s]+ "abc def" 3 21 | [^fc]+ "abc def" 2 22 | [^d\sf]+ "abc def" 3 23 | \n "abc\ndef" 1 24 | b.\s*\n "aa\r\nbb\r\ncc\r\n\r\n" 4 25 | .*c "abcabc" 6 26 | .+c "abcabc" 6 27 | [b-z].* "ab" 1 28 | b[k-z]* "ab" 1 29 | [^0-9] " - " 1 30 | [1-5-]+[-1-2]-[-] "13132231--353444-511--" 22 31 | \[a-z\] "[a-z]" 5 32 | 0| "0" 1 33 | 0| "" 0 34 | 0| "0|" 1 35 | ^0| "x0" 0 36 | \d\d?:\d\d?:\d\d? "0:0:0" 5 37 | \d\d?:\d\d?:\d\d? "0:00:0" 6 38 | \d\d?:\d\d?:\d\d? "0:0:00" 5 39 | \d\d?:\d\d?:\d\d? "00:0:0" 6 40 | \d\d?:\d\d?:\d\d? "00:00:0" 7 41 | \d\d?:\d\d?:\d\d? "00:0:00" 6 42 | \d\d?:\d\d?:\d\d? "0:00:00" 6 43 | \d\d?:\d\d?:\d\d? "00:00:00" 7 44 | [Hh]ello [Ww]orld\s*[!]? "Hello world !" 12 45 | [Hh]ello [Ww]orld\s*[!]? "hello world !" 
12 46 | [Hh]ello [Ww]orld\s*[!]? "Hello World !" 12 47 | [Hh]ello [Ww]orld\s*[!]? "Hello world! " 11 48 | [Hh]ello [Ww]orld\s*[!]? "Hello world !" 13 49 | [Hh]ello [Ww]orld\s*[!]? "hello World !" 14 50 | .?bar "real_bar" 4 51 | [a-z]+\nbreak "blahblah\nbreak" 14 52 | [a-z\s]+\nbreak "bla bla \nbreak" 14 53 | [^\w][^-1-4] ")T" 2 54 | [^\w][^-1-4] ")^" 2 55 | [^\w][^-1-4] "*)" 2 56 | [^\w][^-1-4] "!." 2 57 | [^\w][^-1-4] " x" 2 58 | [^\w][^-1-4] "$b" 2 59 | \\ "\" 1 60 | \x4C "L" 1 61 | \x4 "\\x4" 3 62 | \x4X "\\x4X" 4 63 | 0|1 "0" 1 64 | [A-Z]|[0-9] "0" 1 65 | \w|\s "_ " 1 66 | \w{2} "ab" 2 67 | \w{2,} "abc" 3 68 | [a-z]{2,} "abcd" 4 69 | \w{2,3} "abc" 3 70 | \w{,2} "abc" 2 71 | \w{,3} "abc" 3 72 | \w{,4} "abc" 3 73 | {2} "{2}" 3 74 | x{} "x{}" 3 75 | x{1,2,} "x{1,2,}" 7 76 | x{,2,} "x{,2,}" 6 77 | x{,} "x{,}" 4 78 | x{-2} "x{-2}" 5 79 | ([a-z][0-9])+ "a0b1" 4 80 | ([a-z][0-9]){3} "a0b1c2" 6 81 | #((ab)|b)+ "abbb" 4 82 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC := cc 2 | CFLAGS := -O3 -Wall -Wextra 3 | #CFLAGS := -g -Wall -Wextra -std=c99 -DDEBUG 4 | 5 | # Number of random text expressions to generate, for random testing 6 | NRAND_TESTS := 1000 7 | 8 | PYTHON != if (python --version 2>&1 | grep -q 'Python 3\..*'); then \ 9 | echo 'python'; \ 10 | elif command -v python3 >/dev/null 2>&1; then \ 11 | echo 'python3'; \ 12 | else \ 13 | echo 'Error: no compatible python 3 version found.' >&2; \ 14 | exit 1; \ 15 | fi 16 | TEST_BINS = tests/test1 tests/test2 tests/test_compile tests/test_rand tests/test_rand_neg 17 | 18 | all: $(TEST_BINS) 19 | 20 | tests/test1: re.c tests/test1.c 21 | @$(CC) -I. $(CFLAGS) re.c tests/test1.c -o $@ 22 | tests/test2: re.c tests/test2.c 23 | @$(CC) -I. $(CFLAGS) re.c tests/test2.c -o $@ 24 | tests/test_compile: re.c tests/test_compile.c 25 | @$(CC) -I. 
$(CFLAGS) re.c tests/test_compile.c -o $@ 26 | tests/test_rand: re.c tests/test_rand.c 27 | @$(CC) -I. $(CFLAGS) re.c tests/test_rand.c -o $@ 28 | tests/test_rand_neg: re.c tests/test_rand_neg.c 29 | @$(CC) -I. $(CFLAGS) re.c tests/test_rand_neg.c -o $@ 30 | 31 | clean: 32 | @rm -f $(TEST_BINS) 33 | @rm -f a.out 34 | @rm -f *.o 35 | 36 | test-pyok: tests/test_rand 37 | @$(test $(PYTHON)) 38 | @$(PYTHON) ./scripts/regex_test.py tests/ok.lst $(NRAND_TESTS) 39 | 40 | test-pynok: tests/test_rand_neg 41 | @$(test $(PYTHON)) 42 | @$(PYTHON) ./scripts/regex_test_neg.py tests/nok.lst $(NRAND_TESTS) 43 | 44 | test: all 45 | @./tests/test1 46 | $(MAKE) test-pyok 47 | $(MAKE) test-pynok 48 | @./tests/test_compile 49 | @./tests/test2 50 | 51 | CBMC := cbmc 52 | 53 | # unwindset: loop max MAX_REGEXP_OBJECTS patterns 54 | # --enum-range-check not with cbmc 5.10 on ubuntu-latest 55 | verify: 56 | $(CBMC) -DCPROVER --unwindset 8 --unwind 16 --depth 16 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check $(CBMC_ARGS) re.c 57 | -------------------------------------------------------------------------------- /scripts/regex_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This python program generates random text that matches a given regex-pattern. 5 | The patterns are given via sys.argv and the generated text is passed to 6 | the binary 'tests/test_rand' to check if the generated text also matches 7 | the regex-pattern in the C implementation. 8 | The exit-code of the testing program, is used to determine test success. 
9 | 10 | This script is called by the Makefile when doing 'make test' 11 | """ 12 | 13 | 14 | import re 15 | import sys 16 | import exrex 17 | from subprocess import call 18 | 19 | 20 | prog = "./tests/test_rand" 21 | 22 | if len(sys.argv) < 2: 23 | print("") 24 | print("usage: %s pattern-file [ntests or 10] [repeat]" % sys.argv[0]) 25 | print("") 26 | sys.exit(-1) 27 | 28 | own_prog = sys.argv[0] 29 | pattern_file = sys.argv[1] 30 | if len(sys.argv) > 2: 31 | ntests = int(sys.argv[2]) 32 | else: 33 | ntests = 10 34 | nfails = 0 35 | repeats = ntests 36 | old_pattern = "" 37 | if len(sys.argv) > 3: 38 | repeats = int(sys.argv[3]) 39 | 40 | sys.stdout.write("Testing patterns against %d random strings matching the Python implementation and comparing::\n" % ntests) 41 | 42 | with open(pattern_file, 'rt') as f: 43 | for line in f: 44 | parts = line.split('\t') 45 | pattern = parts[0] 46 | if pattern == old_pattern: 47 | break 48 | old_pattern = pattern 49 | r = 50 50 | while r < 0: 51 | try: 52 | g = exrex.generate(pattern) 53 | break 54 | except: 55 | pass 56 | 57 | sys.stdout.write(" pattern '%s':\n" % pattern) 58 | 59 | while repeats > 0: 60 | try: 61 | repeats -= 1 62 | example = exrex.getone(pattern) 63 | print("%s \"%s\" \"%s\"" % (prog, pattern, example)) 64 | ret = call([prog, "\"%s\"" % pattern, "\"%s\"" % example]) 65 | if ret != 0: 66 | escaped = repr(example) # escapes special chars for better printing 67 | print(" FAIL : %s doesn't match %s as expected [%s]." 
% (pattern, escaped, ", ".join([("0x%02x" % ord(e)) for e in example]) )) 68 | nfails += 1 69 | 70 | except: 71 | #import traceback 72 | #print("EXCEPTION!") 73 | #raw_input(traceback.format_exc()) 74 | ntests -= 1 75 | repeats = 0 76 | #nfails += 1 77 | 78 | sys.stdout.write("%4d/%d tests succeeded.\n\n" % (ntests - nfails, ntests)) 79 | #print("") 80 | 81 | -------------------------------------------------------------------------------- /re.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Mini regex-module inspired by Rob Pike's regex code described in: 4 | * 5 | * http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html 6 | * 7 | * 8 | * 9 | * Supports: 10 | * --------- 11 | * '.' Dot, matches any character 12 | * '^' Start anchor, matches beginning of string 13 | * '$' End anchor, matches end of string 14 | * '*' Asterisk, match zero or more (greedy) 15 | * '+' Plus, match one or more (greedy) 16 | * '?' Question, match zero or one (non-greedy) 17 | * '[abc]' Character class, match if one of {'a', 'b', 'c'} 18 | * '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken! 19 | * '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z } 20 | * '\s' Whitespace, \t \f \r \n \v and spaces 21 | * '\S' Non-whitespace 22 | * '\w' Alphanumeric, [a-zA-Z0-9_] 23 | * '\W' Non-alphanumeric 24 | * '\d' Digits, [0-9] 25 | * '\D' Non-digits 26 | * 27 | * 28 | */ 29 | 30 | #ifndef _TINY_REGEX_C 31 | #define _TINY_REGEX_C 32 | 33 | 34 | #ifndef RE_DOT_MATCHES_NEWLINE 35 | /* Define to 0 if you DON'T want '.' to match '\r' + '\n' */ 36 | #define RE_DOT_MATCHES_NEWLINE 1 37 | #endif 38 | 39 | #ifdef __cplusplus 40 | extern "C"{ 41 | #endif 42 | 43 | /* Typedef'd pointer to get abstract datatype. 
*/ 44 | typedef struct regex_t* re_t; 45 | 46 | /* Compile regex string pattern to custom buffer, returning # of bytes used */ 47 | re_t re_compile_to(const char* pattern, unsigned char* re_data, unsigned* bytes); 48 | 49 | /* Compile regex string pattern to a regex_t-array, using internal buffer */ 50 | re_t re_compile(const char* pattern); 51 | 52 | /* Reconstruct a regex string from a compiled pattern */ 53 | void re_string(re_t pattern, char* buffer, unsigned* size); 54 | 55 | /* Returns the size in bytes of a compiled pattern */ 56 | unsigned re_size(re_t pattern); 57 | 58 | /* Compares two compiled patterns for equality */ 59 | int re_compare(re_t pattern1, re_t pattern2); 60 | 61 | /* Find matches of the compiled pattern inside text. */ 62 | int re_matchp(re_t pattern, const char* text, int* matchlength); 63 | 64 | /* Find matches of the txt pattern inside text (will compile automatically first). */ 65 | int re_match(const char* pattern, const char* text, int* matchlength); 66 | 67 | 68 | #ifdef __cplusplus 69 | } 70 | #endif 71 | 72 | #endif /* ifndef _TINY_REGEX_C */ 73 | -------------------------------------------------------------------------------- /scripts/regex_test_neg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This program generates random text that matches a given regex-pattern. 5 | The patterns are given via sys.argv and the generated text is passed to 6 | the binary 'tests/test_rand' to check if the generated text also matches 7 | the regex-pattern in the C implementation. 8 | The exit-code of the testing program, is used to determine test success. 
9 | 10 | This script is called by the Makefile when doing 'make test' 11 | """ 12 | 13 | 14 | import re 15 | import sys 16 | import string 17 | import random 18 | from subprocess import call 19 | 20 | 21 | prog = "./tests/test_rand_neg" 22 | 23 | if len(sys.argv) < 2: 24 | print("") 25 | print("usage: %s pattern-file [ntests or 10] [repeat]" % sys.argv[0]) 26 | print("") 27 | sys.exit(-1) 28 | 29 | own_prog = sys.argv[0] 30 | pattern_file = sys.argv[1] 31 | if len(sys.argv) > 2: 32 | ntests = int(sys.argv[2]) 33 | else: 34 | ntests = 10 35 | nfails = 0 36 | repeats = ntests 37 | old_pattern = "" 38 | if len(sys.argv) > 3: 39 | repeats = int(sys.argv[3]) 40 | 41 | sys.stdout.write("Testing rejection of patterns against %d random strings also rejected by the Python implementation:\n" % ntests) 42 | 43 | def gen_no_match(pattern, minlen=1, maxlen=50, maxattempts=500): 44 | nattempts = 0 45 | while True: 46 | nattempts += 1 47 | ret = "".join([random.choice(string.printable) for i in range(random.Random().randint(minlen, maxlen))]) 48 | if re.findall(pattern, ret) == []: 49 | return ret 50 | if nattempts >= maxattempts: 51 | raise Exception("Could not generate string that did not match the regex pattern '%s' after %d attempts" % (pattern, nattempts)) 52 | 53 | with open(pattern_file, 'rt') as f: 54 | for line in f: 55 | parts = line.split('\t') 56 | pattern = parts[0] 57 | if pattern == old_pattern: 58 | break 59 | old_pattern = pattern 60 | sys.stdout.write(" pattern '%s':\n" % pattern) 61 | 62 | while repeats > 0: 63 | try: 64 | repeats -= 1 65 | example = gen_no_match(pattern) 66 | #print("%s %s %s" % (prog, pattern, example)) 67 | ret = call([prog, "\"%s\"" % pattern, "\"%s\"" % example]) 68 | if ret != 0: 69 | escaped = repr(example) # escapes special chars for better printing 70 | print(" FAIL : matches %s unexpectedly [%s]." 
% (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]) )) 71 | nfails += 1 72 | 73 | except: 74 | #import traceback 75 | #print("EXCEPTION!") 76 | #raw_input(traceback.format_exc()) 77 | ntests -= 1 78 | repeats = 0 79 | #nfails += 1 80 | 81 | sys.stdout.write("%4d/%d tests succeeded.\n\n" % (ntests - nfails, ntests)) 82 | -------------------------------------------------------------------------------- /tests/test_compile.c: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | This file tests two bug patterns reported by @DavidKorczynski in 4 | https://github.com/kokke/tiny-regex-c/issues/44 5 | 6 | And some structural issues with nested groups. 7 | 8 | */ 9 | 10 | #include 11 | #include 12 | #include /* for NULL */ 13 | #include "re.h" 14 | 15 | void re_print(re_t pattern); 16 | typedef struct regex_t 17 | { 18 | unsigned type; /* CHAR, STAR, etc. */ 19 | union 20 | { 21 | char ch; /* the character itself */ 22 | char* ccl; /* OR a pointer to characters in class */ 23 | unsigned char group_num; /* OR the number of group patterns. */ 24 | unsigned char group_start; /* OR for GROUPEND, the start index of the group. */ 25 | struct { 26 | unsigned short n; /* match n times */ 27 | unsigned short m; /* match n to m times */ 28 | }; 29 | } u; 30 | } regex_t; 31 | 32 | int main() 33 | { 34 | size_t i; 35 | int failed = 0; 36 | int ntests = 0; 37 | printf("Testing handling of invalid regex patterns:\n"); 38 | const char *const tests[] = { 39 | /* Test 1: inverted set without a closing ']' */ 40 | "\\\x01[^\\\xff][^", 41 | /* Test 2: set with an incomplete escape sequence and without a closing ']' */ 42 | "\\\x01[^\\\xff][\\", 43 | /* Invalid escape. 
'\\' as last char without previous \\ */ 44 | "\\", 45 | /* incomplete char classes */ 46 | "[^", "[abc\\", 47 | /* overlong char classes */ 48 | "[0123456789012345678901234567890123456789]", 49 | "[01234567890123456789\\0123456789012345678]", 50 | "[00000000000000000000000000000000000000][", 51 | /* quantifiers without context: nothing to repeat at position 0 */ 52 | "+", "?", "*", 53 | /* Tests 7-12: invalid quantifiers. */ 54 | /* note that python and perl allows these, and matches them exact. */ 55 | // "{2}", "x{}", "x{1,2,}", "x{,2,}", "x{-2}", 56 | }; 57 | 58 | for (i = 0; i < sizeof(tests)/sizeof(*tests); i++) 59 | { 60 | const char *s = tests[i]; 61 | ntests++; 62 | regex_t *p = re_compile(s); 63 | if (p != NULL) 64 | { 65 | printf(" [%d] re_compile(\"%s\") must not compile.\n", ntests, s); 66 | re_print(p); 67 | failed++; 68 | } 69 | } 70 | printf(" %d/%d tests succeeded.\n", ntests-failed, ntests); 71 | 72 | printf("Testing compilation of nested groups:\n"); 73 | re_t p = re_compile("((ab)|b)+"); 74 | int printed = 0; 75 | 76 | ntests++; 77 | if (p[0].u.group_num != 6) 78 | { 79 | printf(" [%d] wrong [0].group_num %hu for ((ab)|b)+\n", ntests, p[0].u.group_num); 80 | if (!printed) 81 | re_print(p); 82 | printed = 1; 83 | failed++; 84 | } 85 | ntests++; 86 | if (p[1].u.group_num != 2) 87 | { 88 | printf(" [%u] wrong [1].group_num %hu.\n", ntests, p[1].u.group_num); 89 | if (!printed) 90 | re_print(p); 91 | printed = 1; 92 | failed++; 93 | } 94 | ntests++; 95 | if (p[4].u.group_start != 1) 96 | { 97 | printf(" [%u] wrong [4].group_start %hu.\n", ntests, p[4].u.group_start); 98 | if (!printed) 99 | re_print(p); 100 | printed = 1; 101 | failed++; 102 | } 103 | ntests++; 104 | if (p[7].u.group_start != 0) 105 | { 106 | printf(" [%u] wrong [7].group_start %hu.\n", ntests, p[7].u.group_start); 107 | if (!printed) 108 | re_print(p); 109 | printed = 1; 110 | failed++; 111 | } 112 | 113 | printf(" %d/%d tests succeeded.\n", ntests-failed, ntests); 114 | return 
failed ? 1 : 0; 115 | } 116 | 117 | -------------------------------------------------------------------------------- /tests/test1.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Testing various regex-patterns 3 | */ 4 | 5 | #include 6 | #include 7 | #include 8 | #ifdef _UNICODE 9 | # include 10 | #endif 11 | #include "re.h" 12 | 13 | struct test_case { 14 | char *rx; 15 | char *text; 16 | int len; 17 | }; 18 | 19 | void re_print(re_t); 20 | 21 | /* "\\n" => "\n" */ 22 | char *cunquote (char* s, int l) 23 | { 24 | int i; 25 | char *r = malloc (l + 1); 26 | for (i=0; i= size) 84 | { 85 | size *= 2; 86 | vec = realloc (vec, size * sizeof(struct test_case)); 87 | } 88 | } 89 | *ntests = i; 90 | return vec; 91 | } 92 | 93 | void free_test_cases (struct test_case* test_case, int ntests) 94 | { 95 | for (int i=0; i < ntests; i++) 96 | { 97 | free (test_case[i].rx); 98 | free (test_case[i].text); 99 | } 100 | free (test_case); 101 | } 102 | 103 | int do_test (struct test_case* test_case, int i, int ntests, int ok) 104 | { 105 | char* text; 106 | char* pattern; 107 | int should_fail; 108 | int length; 109 | int correctlen; 110 | int nfailed = 0; 111 | 112 | pattern = test_case[i].rx; 113 | text = test_case[i].text; 114 | should_fail = ok == 0; 115 | correctlen = test_case[i].len; 116 | 117 | int m = re_match(pattern, text, &length); 118 | 119 | if (should_fail) 120 | { 121 | if (m != (-1)) 122 | { 123 | printf("\n"); 124 | re_print(re_compile(pattern)); 125 | fprintf(stderr, "[%d/%d]: pattern '%s' matched '%s' unexpectedly, matched %i chars. \n", i+1, ntests, pattern, text, length); 126 | nfailed += 1; 127 | } 128 | } 129 | else 130 | { 131 | if (m == (-1)) 132 | { 133 | printf("\n"); 134 | re_print(re_compile(pattern)); 135 | fprintf(stderr, "[%d/%d]: pattern '%s' didn't match '%s' as expected. 
\n", (i+1), ntests, pattern, text); 136 | nfailed += 1; 137 | } 138 | else if (length != correctlen) 139 | { 140 | printf("\n"); 141 | re_print(re_compile(pattern)); 142 | fprintf(stderr, "[%d/%d]: pattern '%s' matched '%i' chars of '%s'; expected '%i'. \n", (i+1), ntests, pattern, length, text, correctlen); 143 | nfailed += 1; 144 | } 145 | } 146 | return nfailed; 147 | } 148 | 149 | int main() 150 | { 151 | int ntests, ntests_nok; 152 | int nfailed = 0; 153 | int i; 154 | 155 | printf("Testing hand-picked regex patterns\n"); 156 | 157 | //setlocale(LC_CTYPE, "en_US.UTF-8"); 158 | struct test_case* tests_ok = read_tests ("tests/ok.lst", &ntests); 159 | for (i = 0; i < ntests; ++i) 160 | { 161 | nfailed += do_test (tests_ok, i, ntests, 1); 162 | } 163 | free_test_cases (tests_ok, ntests); 164 | 165 | struct test_case* tests_nok = read_tests ("tests/nok.lst", &ntests_nok); 166 | for (i = 0; i < ntests_nok; ++i) 167 | { 168 | nfailed += do_test (tests_nok, i, ntests_nok, 0); 169 | } 170 | free_test_cases (tests_nok, ntests_nok); 171 | ntests += ntests_nok; 172 | 173 | // regression test for unhandled BEGIN in the middle of an expression 174 | // we need to test text strings with all possible values for the second 175 | // byte because re.c was matching it against an uninitalized value, so 176 | // it could be anything 177 | int length; 178 | const char* pattern = "a^"; 179 | for (i = 0; i < 255; i++) { 180 | char text_buf[] = { 'a', i, '\0' }; 181 | int m = re_match(pattern, text_buf, &length); 182 | if (m != -1) { 183 | fprintf(stderr, "[%d/%d]: pattern '%s' matched '%s' unexpectedly", ntests, ntests, pattern, text_buf); 184 | nfailed += 1; 185 | break; 186 | } 187 | } 188 | ntests++; 189 | printf(" %d/%d tests succeeded.\n", ntests - nfailed, ntests); 190 | 191 | return nfailed; /* 0 if all tests passed */ 192 | } 193 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | Falco Girgis's Changelist: 2 | - merged several bugfixes that were just sitting as pending PRs 3 | - merged several features that were just sitting as pending PRs 4 | - added CMake support 5 | - exposed configuration definitions as CMake options 6 | - modified internal symbol representation to be far more compact 7 | - everything is byte-packed as tightly as humanly possible 8 | - character class substrings are now stored within the symbol 9 | - far more cache coherent 10 | - added support for user-supplied compiled pattern storage 11 | - allows you to heap allocate and maintain more than one compiled pattern 12 | - Fixed multi matches `{n}`, `{,m}`, `{n,}`, `{n, m}`, which were only partially working previously 13 | - Added stringifier to (sort of) go back to string form from a compiled regexp 14 | - Broke print() function 15 | - Broke recursive pattern matching 16 | 17 | # 18 | ![CI](https://github.com/kokke/tiny-regex-c/workflows/CI/badge.svg) 19 | # tiny-regex-c 20 | # A small regex implementation in C 21 | ### Description 22 | Small and portable [Regular Expression](https://en.wikipedia.org/wiki/Regular_expression) (regex) library written in C. 23 | 24 | Design is inspired by Rob Pike's regex-code for the book *"Beautiful Code"* [available online here](http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html). 25 | 26 | Supports a subset of the syntax and semantics of the Python standard library implementation (the `re`-module). 27 | 28 | **I will gladly accept patches correcting bugs.** 29 | 30 | ### Design goals 31 | The main design goal of this library is to be small, correct, self contained and use few resources while retaining acceptable performance and feature completeness. Clarity of the code is also highly valued. 32 | 33 | ### Notable features and omissions 34 | - Small code and binary size: 500 SLOC, ~3kb binary for x86. 
Statically #define'd memory usage / allocation. 35 | - NOTE: support added for user-specified storage -- Falco 36 | - No use of dynamic memory allocation (i.e. no calls to `malloc` / `free`). 37 | - To avoid call-stack exhaustion, iterative searching is preferred over recursive by default (can be changed with a pre-processor flag). 38 | - No support for capturing groups or named capture: `(?P<name>group)` etc. 39 | - Thorough testing : [exrex](https://github.com/asciimoo/exrex) is used to randomly generate test-cases from regex patterns, which are fed into the regex code for verification. Try `make test` to generate a few thousand test cases yourself. 40 | - Verification-harness for [KLEE Symbolic Execution Engine](https://klee.github.io), see [formal_verification.md](https://github.com/kokke/tiny-regex-c/blob/master/formal_verification.md). 41 | - Provides character length of matches. 42 | - Compiled for x86 using GCC 7.2.0 and optimizing for size, the binary takes up ~2-3kb code space and allocates ~0.5kb RAM : 43 | ``` 44 | > gcc -Os -c re.c 45 | > size re.o 46 | text data bss dec hex filename 47 | 2404 0 304 2708 a94 re.o 48 | 49 | ``` 50 | 51 | 52 | 53 | ### API 54 | This is the public / exported API: 55 | ```C 56 | /* Typedef'd pointer to hide implementation details. */ 57 | typedef struct regex_t* re_t; 58 | 59 | /* Compiles regex string pattern to a regex_t-array. */ 60 | re_t re_compile(const char* pattern); 61 | 62 | /* Finds matches of the compiled pattern inside text. */ 63 | int re_matchp(re_t pattern, const char* text, int* matchlength); 64 | 65 | /* Finds matches of pattern inside text (compiles first automatically). */ 66 | int re_match(const char* pattern, const char* text, int* matchlength); 67 | ``` 68 | 69 | ### Supported regex-operators 70 | The following features / regex-operators are supported by this library. 
71 | 72 | 73 | - `.` Dot, matches any character 74 | - `^` Start anchor, matches beginning of string 75 | - `$` End anchor, matches end of string 76 | - `*` Asterisk, match zero or more (greedy) 77 | - `+` Plus, match one or more (greedy) 78 | - `?` Question, match zero or one (non-greedy) 79 | - `{n}` Exact Quantifier 80 | - `{n,}` Match n or more times 81 | - `{,m}` Match m or less times 82 | - `{n,m}` Match n to m times 83 | - `[abc]` Character class, match if one of {'a', 'b', 'c'} 84 | - `[^abc]` Inverted class, match if NOT one of {'a', 'b', 'c'} 85 | - `[a-zA-Z]` Character ranges, the character set of the ranges { a-z | A-Z } 86 | - `\s` Whitespace, '\t' '\f' '\r' '\n' '\v' and spaces 87 | - `\S` Non-whitespace 88 | - `\w` Alphanumeric, [a-zA-Z0-9_] 89 | - `\W` Non-alphanumeric 90 | - `\d` Digits, [0-9] 91 | - `\D` Non-digits 92 | - `\xXX` Hex-encoded byte 93 | - `|` Branch Or, e.g. a|A, \w|\s 94 | - `(...)` Group 95 | 96 | ### Usage 97 | Compile a regex from ASCII-string (char-array) to a custom pattern structure using `re_compile()`. 98 | 99 | Search a text-string for a regex and get an index into the string, using `re_match()` or `re_matchp()`. 100 | 101 | The returned index points to the first place in the string, where the regex pattern matches. 102 | 103 | The integer pointer passed will hold the length of the match. 104 | 105 | If the regular expression doesn't match, the matching function returns an index of -1 to indicate failure. 106 | 107 | ### Examples 108 | Example of usage: 109 | ```C 110 | /* Standard int to hold length of match */ 111 | int match_length; 112 | 113 | /* Standard null-terminated C-string to search: */ 114 | const char* string_to_search = "ahem.. 'hello world !' 
.."; 115 | 116 | /* Compile a simple regular expression using character classes, meta-char and greedy quantifiers: */ 117 | re_t pattern = re_compile("[Hh]ello [Ww]orld\\s*[!]?"); 118 | 119 | /* Check if the regex matches the text: */ 120 | int match_idx = re_matchp(pattern, string_to_search, &match_length); 121 | if (match_idx != -1) 122 | { 123 | printf("match at idx %i, %i chars long.\n", match_idx, match_length); 124 | } 125 | ``` 126 | 127 | For more usage examples I encourage you to look at the code in the `tests`-folder. 128 | 129 | ### TODO 130 | - Fix length with nested groups, e.g. `((ab)|b)+` =~ abbb => 7 not 4. 131 | - Add `example.c` that demonstrates usage. 132 | - Add `tests/test_perf.c` for performance and time measurements. 133 | - Add optional multibyte support (e.g. UTF-8). On non-wchar systems roll our own. 134 | - Word boundary: \b \B 135 | - Non-greedy, lazy quantifiers (??, +?, *?, {n,m}?) 136 | - Case-insensitive option or API. `re_matchi()` 137 | - `re_match_capture()` with groups. 138 | - '.' may not match '\r' nor '\n', unless a single-line option is given. 139 | 140 | ### FAQ 141 | - *Q: What differentiates this library from other C regex implementations?* 142 | 143 | A: Well, the small size for one. 500 lines of C-code compiling to 2-3kb ROM, using very little RAM. 144 | 145 | ### License 146 | All material in this repository is in the public domain. 147 | 148 | -------------------------------------------------------------------------------- /formal_verification.md: -------------------------------------------------------------------------------- 1 | # Using KLEE for formal verification 2 | 3 | Here is a crude demo of formal verification of tiny-regex. This is a hefty plagiat of [@DavidKorczynski](https://twitter.com/davkorcz/) - see https://www.youtube.com/watch?v=z6bsk-lsk1Q or [#44](https://github.com/kokke/tiny-regex-c/issues/44) for more context. 
4 | 5 | I am using the [KLEE Symbolic Execution Engine](https://klee.github.io/) and their Docker image here on a Debian-based host. 6 | 7 | What this does is mechanically try to prove the absence of all run-time errors, memory corruption bugs and other problems by symbolic execution. We mark the inputs as being symbolic, so that the tool knows to use that as the "search space". That means KLEE checks all possible inputs of the form we give it. 8 | 9 | Steps: 10 | 11 | - Get the KLEE Docker image: ` $ sudo docker pull klee/klee ` 12 | - Run the KLEE Docker image: ` $ sudo docker run --rm -ti --ulimit='stack=-1:-1' klee/klee ` 13 | - NOTE: You should see a command prompt like this: ` klee@cc0c26c5b84c:~$ ` 14 | - Fetch `re.h`: ` klee@cc0c26c5b84c:~$ wget https://raw.githubusercontent.com/kokke/tiny-regex-c/master/re.h ` 15 | - Fetch `re.c`: ` klee@cc0c26c5b84c:~$ wget https://raw.githubusercontent.com/kokke/tiny-regex-c/master/re.c ` 16 | - Run your favorite editor, and insert the code below at the bottom of `re.c` 17 | ```C 18 | /* 19 | tiny-regex KLEE test driver 20 | kindly contributed by @DavidKorczynski - see https://github.com/kokke/tiny-regex-c/issues/44 21 | */ 22 | 23 | int main(int argc, char* argv[]) 24 | { 25 | /* test input - ten chars used as a regex-pattern input */ 26 | char arr[10]; 27 | 28 | /* make input symbolic, to search all paths through the code */ 29 | /* i.e. the input is checked for all possible ten-char combinations */ 30 | klee_make_symbolic(arr, sizeof(arr), "arr"); 31 | 32 | /* assume proper NULL termination */ 33 | klee_assume(arr[sizeof(arr) - 1] == 0); 34 | 35 | /* verify absence of run-time errors - go! 
*/ 36 | re_compile(arr); 37 | 38 | return 0; 39 | } 40 | ``` 41 | - Alternatively, run this command: 42 | ` klee@cc0c26c5b84c:~$ echo "int main(int argc,char* argv[]){ char arr[10]; klee_make_symbolic(arr, sizeof(arr), \"arr\"); klee_assume(arr[sizeof(arr)-1] == 0); re_compile(arr); return 0; }" >> re.c ` 43 | - Compile and emit LLVM bitcode: ` klee@cc0c26c5b84c:~$ clang -emit-llvm -g -c -O0 -Xclang -disable-O0-optnone re.c ` [(NOTE: flags passed to clang are the ones "recommended" by the KLEE project)](https://klee.github.io/tutorials/testing-function/) 44 | - Run KLEE and wait for 5-10 minutes: ` klee@cc0c26c5b84c:~$ klee --libc=uclibc re.bc ` 45 | - A positive result looks like this: 46 | ``` 47 | klee@cc0c26c5b84c:~$ klee --libc=uclibc re.bc 48 | KLEE: NOTE: Using klee-uclibc : /tmp/klee_build90stp_z3/runtime/lib/klee-uclibc.bca 49 | KLEE: output directory is "/home/klee/klee-out-3" 50 | KLEE: Using STP solver backend 51 | warning: Linking two modules of different target triples: re.bc' is 'x86_64-unknown-linux-gnu' whereas '__uClibc_main.os' is 'x86_64-pc-linux-gnu' 52 | 53 | KLEE: WARNING: undefined reference to function: __syscall_rt_sigaction 54 | KLEE: WARNING: undefined reference to function: close 55 | KLEE: WARNING: undefined reference to function: fcntl 56 | KLEE: WARNING: undefined reference to function: fstat 57 | KLEE: WARNING: undefined reference to function: ioctl 58 | KLEE: WARNING: undefined reference to function: lseek64 59 | KLEE: WARNING: undefined reference to function: mkdir 60 | KLEE: WARNING: undefined reference to function: open 61 | KLEE: WARNING: undefined reference to function: open64 62 | KLEE: WARNING: undefined reference to function: read 63 | KLEE: WARNING: undefined reference to function: sigprocmask 64 | KLEE: WARNING: undefined reference to function: stat 65 | KLEE: WARNING: undefined reference to function: write 66 | KLEE: WARNING: undefined reference to function: kill (UNSAFE)! 
67 | KLEE: WARNING: executable has module level assembly (ignoring) 68 | KLEE: WARNING ONCE: calling external: ioctl(0, 21505, 94666720729472) at libc/termios/tcgetattr.c:43 12 69 | KLEE: WARNING ONCE: calling __user_main with extra arguments. 70 | KLEE: WARNING ONCE: skipping fork (memory cap exceeded) 71 | KLEE: WARNING: killing 12290 states (over memory cap: 2102MB) 72 | KLEE: WARNING: killing 11467 states (over memory cap: 2101MB) 73 | 74 | KLEE: done: total instructions = 104365773 75 | KLEE: done: completed paths = 801298 76 | KLEE: done: generated tests = 801298 77 | klee@cc0c26c5b84c:~$ 78 | ``` 79 | 80 | Similarly, the code below tests both `re_compile(...)` and `re_match(...)` which should be sufficient for coverage of the core logic. 81 | Depending on your hardware, you should be able to increase the sizes of `pat` and `txt` to increase your confidence in the verification. 82 | 83 | 84 | ```C 85 | /* 86 | tiny-regex KLEE test driver 87 | kindly contributed by @DavidKorczynski - see https://github.com/kokke/tiny-regex-c/issues/44 88 | */ 89 | 90 | int main(int argc, char* argv[]) 91 | { 92 | /* test input - a regex-pattern and a text string to search in */ 93 | char pat[7]; 94 | char txt[3]; 95 | 96 | /* make input symbolic, to search all paths through the code */ 97 | /* i.e. the input is checked for all possible ten-char combinations */ 98 | klee_make_symbolic(pat, sizeof(pat), "pat"); 99 | klee_make_symbolic(txt, sizeof(txt), "txt"); 100 | 101 | /* assume proper NULL termination */ 102 | klee_assume(pat[sizeof(pat) - 1] == 0); 103 | klee_assume(txt[sizeof(txt) - 1] == 0); 104 | 105 | /* verify absence of run-time errors - go! 
*/ 106 | int l; 107 | re_match(pat, txt, &l); 108 | 109 | return 0; 110 | } 111 | ``` 112 | 113 | My modest hardware (T420/i5-2520M@2.5GHz/8GB) completes a check of a 7-char pattern and a 3-char text string in 20-30 minutes (size includes null-termination), whereas 8/5 takes +8 hours, 8/6 takes 14 hours: 114 | 115 | ``` 116 | klee@780432c1aaae0:~$ clang -emit-llvm -g -c -O0 -Xclang -disable-O0-optnone re.c 117 | klee@780432c1aaae0:~$ time klee --libc=uclibc --optimize re.bc 118 | KLEE: NOTE: Using klee-uclibc : /tmp/klee_build90stp_z3/runtime/lib/klee-uclibc.bca 119 | KLEE: output directory is "/home/klee/klee-out-0" 120 | KLEE: Using STP solver backend 121 | warning: Linking two modules of different target triples: re.bc' is 'x86_64-unknown-linux-gnu' whereas '__uClibc_main.os' is 'x86_64-pc-linux-gnu' 122 | 123 | KLEE: WARNING: undefined reference to function: fcntl 124 | KLEE: WARNING: undefined reference to function: fstat 125 | KLEE: WARNING: undefined reference to function: ioctl 126 | KLEE: WARNING: undefined reference to function: open 127 | KLEE: WARNING: undefined reference to function: write 128 | KLEE: WARNING: executable has module level assembly (ignoring) 129 | KLEE: WARNING ONCE: calling external: ioctl(0, 21505, 94248844458320) at libc/termios/tcgetattr:43 12 130 | KLEE: WARNING ONCE: calling __user_main with extra arguments. 131 | KLEE: WARNING ONCE: skipping fork (memory cap exceeded) 132 | 133 | KLEE: done: total instructions = 201292178 134 | KLEE: done: completed paths = 910249 135 | KLEE: done: generated tests = 910249 136 | 137 | real 29m16.633s 138 | user 19m38.438s 139 | sys 9m34.654s 140 | klee@780432c1aaae0:~$ 141 | ``` 142 | 143 | ---- 144 | 145 | For the formal verifier CBMC just call make verify. 146 | This verifier is much faster than klee. 
147 | https://www.cprover.org/cbmc/ 148 | -------------------------------------------------------------------------------- /scripts/exrex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file is part of exrex. 4 | # 5 | # exrex is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Affero General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # exrex is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Affero General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Affero General Public License 16 | # along with exrex. If not, see < http://www.gnu.org/licenses/ >. 17 | # 18 | # (C) 2012- by Adam Tauber, 19 | 20 | try: 21 | from future_builtins import map, range 22 | except: 23 | pass 24 | from re import match, sre_parse 25 | from itertools import product, chain, tee 26 | from random import choice,randint 27 | import string 28 | 29 | __all__ = ('generate', 'CATEGORIES', 'count', 'parse', 'getone') 30 | 31 | CATEGORIES = {'category_space' : sorted(sre_parse.WHITESPACE) 32 | ,'category_digit' : sorted(sre_parse.DIGITS) 33 | ,'category_not_digit' : [chr(x) for x in range(32, 123) if 34 | match('\D', chr(x))] 35 | ,'category_any' : [chr(x) for x in range(32, 123)] 36 | ,'category_word' : sorted( frozenset(string.ascii_letters + string.digits + "_") ) 37 | ,'category_not_word' : [chr(x) for x in range(32, 123) if 38 | match('\W', chr(x))] 39 | } 40 | 41 | def comb(g, i): 42 | for c in g: 43 | g2,i = tee(i) 44 | for c2 in g2: 45 | yield c+c2 46 | 47 | def mappend(g, c): 48 | for cc in g: 49 | yield cc+c 50 | 51 | def _in(d): 52 | ret = [] 53 | neg = False 54 | for i in 
d: 55 | if i[0] == 'range': 56 | subs = map(chr, range(i[1][0], i[1][1]+1)) 57 | if neg: 58 | for char in subs: 59 | try: 60 | ret.remove(char) 61 | except: 62 | pass 63 | else: 64 | ret.extend(subs) 65 | elif i[0] == 'literal': 66 | if neg: 67 | try: 68 | ret.remove(chr(i[1])) 69 | except: 70 | pass 71 | else: 72 | ret.append(chr(i[1])) 73 | elif i[0] == 'category': 74 | subs = CATEGORIES.get(i[1], ['']) 75 | if neg: 76 | for char in subs: 77 | try: 78 | ret.remove(char) 79 | except: 80 | pass 81 | else: 82 | ret.extend(subs) 83 | elif i[0] == 'negate': 84 | ret = list(CATEGORIES['category_any']) 85 | neg = True 86 | return ret 87 | 88 | 89 | def prods(orig, ran, items): 90 | for o in orig: 91 | for r in ran: 92 | for s in product(items, repeat=r): 93 | yield o+''.join(s) 94 | 95 | def ggen(g1, f, *args, **kwargs): 96 | for a in g1: 97 | g2 = f(*args, **kwargs) 98 | if isinstance(g2, int): 99 | yield g2 100 | else: 101 | for b in g2: 102 | yield a+b 103 | 104 | def _gen(d, limit=20, count=False): 105 | """docstring for _gen""" 106 | ret = [''] 107 | strings = 0 108 | for i in d: 109 | if i[0] == 'in': 110 | subs = _in(i[1]) 111 | if count: 112 | strings = (strings or 1) * len(subs) 113 | ret = comb(ret, subs) 114 | elif i[0] == 'literal': 115 | ret = mappend(ret, chr(i[1])) 116 | elif i[0] == 'category': 117 | subs = CATEGORIES.get(i[1], ['']) 118 | if count: 119 | strings = (strings or 1) * len(subs) 120 | ret = comb(ret, subs) 121 | elif i[0] == 'any': 122 | subs = CATEGORIES['category_any'] 123 | if count: 124 | strings = (strings or 1) * len(subs) 125 | ret = comb(ret, subs) 126 | elif i[0] == 'max_repeat': 127 | chars = filter(None, _gen(list(i[1][2]), limit)) 128 | if i[1][1]+1 - i[1][0] >= limit: 129 | ran = range(i[1][0], i[1][0]+limit) 130 | else: 131 | ran = range(i[1][0], i[1][1]+1) 132 | if count: 133 | for i in ran: 134 | strings += pow(len(chars), i) 135 | ret = prods(ret, ran, chars) 136 | elif i[0] == 'branch': 137 | subs = 
list(chain.from_iterable(_gen(list(x), limit) for x in i[1][1])) 138 | if count: 139 | strings = (strings or 1) * (len(subs) or 1) 140 | ret = comb(ret, subs) 141 | elif i[0] == 'subpattern': 142 | if count: 143 | strings = (strings or 1) * (sum(ggen([0], _gen, i[1][1], limit=limit, count=True)) or 1) 144 | ret = ggen(ret, _gen, i[1][1], limit=limit, count=False) 145 | # ignore ^ and $ 146 | elif i[0] == 'at': 147 | continue 148 | elif i[0] == 'not_literal': 149 | subs = list(CATEGORIES['category_any']) 150 | subs.remove(chr(i[1])) 151 | if count: 152 | strings = (strings or 1) * len(subs) 153 | ret = comb(ret, subs) 154 | elif i[0] == 'assert': 155 | print(i[1][1]) 156 | continue 157 | else: 158 | #print('[!] cannot handle expression ' + repr(i)) 159 | raise Exception('[!] cannot handle expression ' + repr(i)) 160 | 161 | if count: 162 | return strings 163 | 164 | return ret 165 | 166 | def _randone(d, limit=20): 167 | """docstring for _randone""" 168 | ret = '' 169 | for i in d: 170 | if i[0] == 'in': 171 | ret += choice(_in(i[1])) 172 | elif i[0] == 'literal': 173 | ret += chr(i[1]) 174 | elif i[0] == 'category': 175 | ret += choice(CATEGORIES.get(i[1], [''])) 176 | elif i[0] == 'any': 177 | ret += choice(CATEGORIES['category_any']) 178 | elif i[0] == 'max_repeat': 179 | chars = filter(None, _gen(list(i[1][2]), limit)) 180 | if i[1][1]+1 - i[1][0] >= limit: 181 | min,max = i[1][0], i[1][0]+limit 182 | else: 183 | min,max = i[1][0], i[1][1] 184 | for _ in range(randint(min, max)): 185 | ret += choice(chars) 186 | elif i[0] == 'branch': 187 | ret += choice(list(chain.from_iterable(_gen(list(x), limit) for x in i[1][1]))) 188 | elif i[0] == 'subpattern': 189 | ret += _randone(i[1][1], limit) 190 | elif i[0] == 'at': 191 | continue 192 | elif i[0] == 'not_literal': 193 | c=list(CATEGORIES['category_any']) 194 | c.remove(chr(i[1])) 195 | ret += choice(c) 196 | else: 197 | #print('[!] cannot handle expression "%s"' % str(i)) 198 | raise Exception('[!] 
cannot handle expression "%s"' % str(i)) 199 | 200 | return ret 201 | 202 | 203 | def parse(s): 204 | """Regular expression parser 205 | :param s: Regular expression 206 | :type s: str 207 | :rtype: list 208 | """ 209 | r = sre_parse.parse(s) 210 | return list(r) 211 | 212 | def generate(s, limit=20): 213 | """Creates a generator that generates all matching strings to a given regular expression 214 | :param s: Regular expression 215 | :type s: str 216 | :param limit: Range limit 217 | :type limit: int 218 | :returns: string generator object 219 | """ 220 | return _gen(parse(s), limit) 221 | 222 | def count(s, limit=20): 223 | """Counts all matching strings to a given regular expression 224 | :param s: Regular expression 225 | :type s: str 226 | :param limit: Range limit 227 | :type limit: int 228 | :rtype: int 229 | :returns: number of matching strings 230 | """ 231 | return _gen(parse(s), limit, count=True) 232 | 233 | def getone(regex_string, limit=20): 234 | """Returns a random matching string to a given regular expression 235 | """ 236 | return _randone(parse(regex_string), limit) 237 | 238 | def argparser(): 239 | import argparse 240 | from sys import stdout 241 | argp = argparse.ArgumentParser(description='exrex - regular expression string generator') 242 | argp.add_argument('-o', '--output' 243 | ,help = 'Output file - default is STDOUT' 244 | ,metavar = 'FILE' 245 | ,default = stdout 246 | ,type = argparse.FileType('w') 247 | ) 248 | argp.add_argument('-l', '--limit' 249 | ,help = 'Max limit for range size - default is 20' 250 | ,default = 20 251 | ,action = 'store' 252 | ,type = int 253 | ,metavar = 'N' 254 | ) 255 | argp.add_argument('-c', '--count' 256 | ,help = 'Count matching strings' 257 | ,default = False 258 | ,action = 'store_true' 259 | ) 260 | argp.add_argument('-r', '--random' 261 | ,help = 'Returns a random string that matches to the regex' 262 | ,default = False 263 | ,action = 'store_true' 264 | ) 265 | argp.add_argument('-d', '--delimiter' 
266 | ,help = 'Delimiter - default is \\n' 267 | ,default = '\n' 268 | ) 269 | argp.add_argument('-v', '--verbose' 270 | ,action = 'store_true' 271 | ,help = 'Verbose mode' 272 | ,default = False 273 | ) 274 | argp.add_argument('regex' 275 | ,metavar = 'REGEX' 276 | ,help = 'REGEX string' 277 | ) 278 | return vars(argp.parse_args()) 279 | 280 | def __main__(): 281 | from sys import exit, stderr 282 | # 'as(d|f)qw(e|r|s)[a-zA-Z]{2,3}' 283 | # 'as(QWE|Z([XC]|Y|U)V){2,3}asdf' 284 | # '.?' 285 | # '.+' 286 | # 'asdf.{1,4}qwer{2,5}' 287 | # 'a(b)?(c)?(d)?' 288 | # 'a[b][c][d]?[e]? 289 | args = argparser() 290 | if args['verbose']: 291 | args['output'].write('%r%s' % (parse(args['regex'], limit=args['limit']), args['delimiter'])) 292 | if args['count']: 293 | args['output'].write('%d%s' % (count(args['regex'], limit=args['limit']), args['delimiter'])) 294 | exit(0) 295 | if args['random']: 296 | args['output'].write('%s%s' % (getone(args['regex'], limit=args['limit']), args['delimiter'])) 297 | exit(0) 298 | try: 299 | g = generate(args['regex'], args['limit']) 300 | except: 301 | print >> stderr, '[!] Error: ', e 302 | exit(1) 303 | for s in g: 304 | try: 305 | args['output'].write(s+args['delimiter']) 306 | except: 307 | break 308 | 309 | if __name__ == '__main__': 310 | __main__() 311 | 312 | -------------------------------------------------------------------------------- /scripts/exrex_uni.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This file is part of exrex. 5 | # 6 | # exrex is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU Affero General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 
10 | # 11 | # exrex is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU Affero General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU Affero General Public License 17 | # along with exrex. If not, see < http://www.gnu.org/licenses/ >. 18 | # 19 | # (C) 2012- by Adam Tauber, 20 | 21 | try: 22 | from future_builtins import map, range 23 | except: 24 | pass 25 | from re import sre_parse, U 26 | from itertools import tee 27 | from random import choice, randint 28 | from types import GeneratorType 29 | 30 | from sys import version_info 31 | IS_PY3 = version_info[0] == 3 32 | IS_PY36_OR_GREATER = IS_PY3 and version_info[1] > 5 33 | 34 | if IS_PY3: 35 | unichr = chr 36 | 37 | __all__ = ( 38 | 'generate', 39 | 'CATEGORIES', 40 | 'count', 41 | 'parse', 42 | 'getone', 43 | 'sre_to_string', 44 | 'simplify' 45 | ) 46 | 47 | CATEGORIES = { 48 | sre_parse.CATEGORY_SPACE: sorted(sre_parse.WHITESPACE), 49 | sre_parse.CATEGORY_DIGIT: sorted(sre_parse.DIGITS), 50 | 'category_any': [unichr(x) for x in range(32, 123)] 51 | } 52 | 53 | 54 | def _build_reverse_categories(): 55 | reverse = {} 56 | for key, value in sre_parse.CATEGORIES.items(): 57 | if not hasattr(value[1], '__iter__'): 58 | continue 59 | 60 | for vv in value[1]: 61 | if value[0] == sre_parse.IN and vv[0] == sre_parse.CATEGORY: 62 | reverse.update({vv[1]: key}) 63 | 64 | return reverse 65 | 66 | 67 | REVERSE_CATEGORIES = _build_reverse_categories() 68 | 69 | 70 | def comb(g, i): 71 | for c in g: 72 | g2, i = tee(i) 73 | for c2 in g2: 74 | yield c + c2 75 | 76 | 77 | def mappend(g, c): 78 | for cc in g: 79 | yield cc + c 80 | 81 | 82 | def dappend(g, d, k): 83 | for cc in g: 84 | yield cc + d[k] 85 | 86 | 87 | def _in(d): 88 | ret = [] 89 | neg = False 90 | for i in d: 91 | if i[0] == sre_parse.RANGE: 92 | subs = map(unichr, 
range(i[1][0], i[1][1] + 1)) 93 | if neg: 94 | for char in subs: 95 | try: 96 | ret.remove(char) 97 | except: 98 | pass 99 | else: 100 | ret.extend(subs) 101 | elif i[0] == sre_parse.LITERAL: 102 | if neg: 103 | try: 104 | ret.remove(unichr(i[1])) 105 | except: 106 | pass 107 | else: 108 | ret.append(unichr(i[1])) 109 | elif i[0] == sre_parse.CATEGORY: 110 | subs = CATEGORIES.get(i[1], ['']) 111 | if neg: 112 | for char in subs: 113 | try: 114 | ret.remove(char) 115 | except: 116 | pass 117 | else: 118 | ret.extend(subs) 119 | elif i[0] == sre_parse.NEGATE: 120 | ret = list(CATEGORIES['category_any']) 121 | neg = True 122 | return ret 123 | 124 | 125 | def prods(orig, ran, items, limit, grouprefs): 126 | for o in orig: 127 | for r in ran: 128 | if r == 0: 129 | yield o 130 | else: 131 | ret = [o] 132 | for _ in range(r): 133 | ret = ggen( 134 | ret, _gen, items, limit=limit, count=False, grouprefs=grouprefs) 135 | for i in ret: 136 | yield i 137 | 138 | 139 | def ggen(g1, f, *args, **kwargs): 140 | groupref = None 141 | grouprefs = kwargs.get('grouprefs', {}) 142 | if 'groupref' in kwargs.keys(): 143 | groupref = kwargs.pop('groupref') 144 | for a in g1: 145 | g2 = f(*args, **kwargs) 146 | if isinstance(g2, GeneratorType): 147 | for b in g2: 148 | grouprefs[groupref] = b 149 | yield a + b 150 | else: 151 | yield g2 152 | 153 | 154 | def concit(g1, seqs, limit, grouprefs): 155 | for a in g1: 156 | for s in seqs: 157 | for b in _gen(s, limit, grouprefs=grouprefs): 158 | yield a + b 159 | 160 | 161 | def _gen(d, limit=20, count=False, grouprefs=None): 162 | """docstring for _gen""" 163 | if grouprefs is None: 164 | grouprefs = {} 165 | ret = [''] 166 | strings = 0 167 | literal = False 168 | for i in d: 169 | if i[0] == sre_parse.IN: 170 | subs = _in(i[1]) 171 | if count: 172 | strings = (strings or 1) * len(subs) 173 | ret = comb(ret, subs) 174 | elif i[0] == sre_parse.LITERAL: 175 | literal = True 176 | ret = mappend(ret, unichr(i[1])) 177 | elif i[0] == 
sre_parse.CATEGORY: 178 | subs = CATEGORIES.get(i[1], ['']) 179 | if count: 180 | strings = (strings or 1) * len(subs) 181 | ret = comb(ret, subs) 182 | elif i[0] == sre_parse.ANY: 183 | subs = CATEGORIES['category_any'] 184 | if count: 185 | strings = (strings or 1) * len(subs) 186 | ret = comb(ret, subs) 187 | elif i[0] == sre_parse.MAX_REPEAT: 188 | items = list(i[1][2]) 189 | if i[1][1] + 1 - i[1][0] >= limit: 190 | ran = range(i[1][0], i[1][0] + limit) 191 | r1 = i[1][0] 192 | r2 = i[1][0] + limit 193 | else: 194 | r1 = i[1][0] 195 | r2 = i[1][1] + 1 196 | ran = range(r1, r2) 197 | if count: 198 | for p in ran: 199 | strings += pow(_gen(items, limit, True, grouprefs), p) or 1 200 | ret = prods(ret, ran, items, limit, grouprefs) 201 | elif i[0] == sre_parse.BRANCH: 202 | if count: 203 | for x in i[1][1]: 204 | strings += _gen(x, limit, True, grouprefs) or 1 205 | ret = concit(ret, i[1][1], limit, grouprefs) 206 | elif i[0] == sre_parse.SUBPATTERN or i[0] == sre_parse.ASSERT: 207 | subexpr = i[1][1] 208 | if IS_PY36_OR_GREATER and i[0] == sre_parse.SUBPATTERN: 209 | subexpr = i[1][3] 210 | if count: 211 | strings = ( 212 | strings or 1) * (sum(ggen([0], _gen, subexpr, limit=limit, count=True, grouprefs=grouprefs)) or 1) 213 | ret = ggen(ret, _gen, subexpr, limit=limit, count=False, grouprefs=grouprefs, groupref=i[1][0]) 214 | # ignore ^ and $ 215 | elif i[0] == sre_parse.AT: 216 | continue 217 | elif i[0] == sre_parse.NOT_LITERAL: 218 | subs = list(CATEGORIES['category_any']) 219 | if unichr(i[1]) in subs: 220 | subs.remove(unichr(i[1])) 221 | if count: 222 | strings = (strings or 1) * len(subs) 223 | ret = comb(ret, subs) 224 | elif i[0] == sre_parse.GROUPREF: 225 | ret = dappend(ret, grouprefs, i[1]) 226 | elif i[0] == sre_parse.ASSERT_NOT: 227 | pass 228 | else: 229 | print('[!] 
cannot handle expression ' + repr(i)) 230 | 231 | if count: 232 | if strings == 0 and literal: 233 | inc = True 234 | for i in d: 235 | if i[0] not in (sre_parse.AT, sre_parse.LITERAL): 236 | inc = False 237 | if inc: 238 | strings = 1 239 | return strings 240 | 241 | return ret 242 | 243 | 244 | def _randone(d, limit=20, grouprefs=None): 245 | if grouprefs is None: 246 | grouprefs = {} 247 | """docstring for _randone""" 248 | ret = '' 249 | for i in d: 250 | if i[0] == sre_parse.IN: 251 | ret += choice(_in(i[1])) 252 | elif i[0] == sre_parse.LITERAL: 253 | ret += unichr(i[1]) 254 | elif i[0] == sre_parse.CATEGORY: 255 | ret += choice(CATEGORIES.get(i[1], [''])) 256 | elif i[0] == sre_parse.ANY: 257 | ret += choice(CATEGORIES['category_any']) 258 | elif i[0] == sre_parse.MAX_REPEAT: 259 | if i[1][1] + 1 - i[1][0] >= limit: 260 | min, max = i[1][0], i[1][0] + limit - 1 261 | else: 262 | min, max = i[1][0], i[1][1] 263 | for _ in range(randint(min, max)): 264 | ret += _randone(list(i[1][2]), limit, grouprefs) 265 | elif i[0] == sre_parse.BRANCH: 266 | ret += _randone(choice(i[1][1]), limit, grouprefs) 267 | elif i[0] == sre_parse.SUBPATTERN or i[0] == sre_parse.ASSERT: 268 | subexpr = i[1][1] 269 | if IS_PY36_OR_GREATER and i[0] == sre_parse.SUBPATTERN: 270 | subexpr = i[1][3] 271 | subp = _randone(subexpr, limit, grouprefs) 272 | if i[1][0]: 273 | grouprefs[i[1][0]] = subp 274 | ret += subp 275 | elif i[0] == sre_parse.AT: 276 | continue 277 | elif i[0] == sre_parse.NOT_LITERAL: 278 | c = list(CATEGORIES['category_any']) 279 | if unichr(i[1]) in c: 280 | c.remove(unichr(i[1])) 281 | ret += choice(c) 282 | elif i[0] == sre_parse.GROUPREF: 283 | ret += grouprefs[i[1]] 284 | elif i[0] == sre_parse.ASSERT_NOT: 285 | pass 286 | else: 287 | print('[!] 
cannot handle expression "%s"' % str(i)) 288 | 289 | return ret 290 | 291 | 292 | def sre_to_string(sre_obj, paren=True): 293 | """sre_parse object to string 294 | 295 | :param sre_obj: Output of sre_parse.parse() 296 | :type sre_obj: list 297 | :rtype: str 298 | """ 299 | ret = u'' 300 | for i in sre_obj: 301 | if i[0] == sre_parse.IN: 302 | prefix = '' 303 | if len(i[1]) and i[1][0][0] == sre_parse.NEGATE: 304 | prefix = '^' 305 | ret += u'[{0}{1}]'.format(prefix, sre_to_string(i[1], paren=paren)) 306 | elif i[0] == sre_parse.LITERAL: 307 | ret += unichr(i[1]) 308 | elif i[0] == sre_parse.CATEGORY: 309 | ret += REVERSE_CATEGORIES[i[1]] 310 | elif i[0] == sre_parse.ANY: 311 | ret += '.' 312 | elif i[0] == sre_parse.BRANCH: 313 | # TODO simplifications here 314 | parts = [sre_to_string(x, paren=paren) for x in i[1][1]] 315 | if not any(parts): 316 | continue 317 | if i[1][0]: 318 | if len(parts) == 1: 319 | paren = False 320 | prefix = '' 321 | else: 322 | prefix = '?:' 323 | branch = '|'.join(parts) 324 | if paren: 325 | ret += '({0}{1})'.format(prefix, branch) 326 | else: 327 | ret += '{0}'.format(branch) 328 | elif i[0] == sre_parse.SUBPATTERN: 329 | subexpr = i[1][1] 330 | if IS_PY36_OR_GREATER and i[0] == sre_parse.SUBPATTERN: 331 | subexpr = i[1][3] 332 | if i[1][0]: 333 | ret += '({0})'.format(sre_to_string(subexpr, paren=False)) 334 | else: 335 | ret += '{0}'.format(sre_to_string(subexpr, paren=paren)) 336 | elif i[0] == sre_parse.NOT_LITERAL: 337 | ret += '[^{0}]'.format(unichr(i[1])) 338 | elif i[0] == sre_parse.MAX_REPEAT: 339 | if i[1][0] == i[1][1]: 340 | range_str = '{{{0}}}'.format(i[1][0]) 341 | else: 342 | if i[1][0] == 0 and i[1][1] - i[1][0] == sre_parse.MAXREPEAT: 343 | range_str = '*' 344 | elif i[1][0] == 1 and i[1][1] - i[1][0] == sre_parse.MAXREPEAT - 1: 345 | range_str = '+' 346 | else: 347 | range_str = '{{{0},{1}}}'.format(i[1][0], i[1][1]) 348 | ret += sre_to_string(i[1][2], paren=paren) + range_str 349 | elif i[0] == sre_parse.GROUPREF: 
350 | ret += '\\{0}'.format(i[1]) 351 | elif i[0] == sre_parse.AT: 352 | if i[1] == sre_parse.AT_BEGINNING: 353 | ret += '^' 354 | elif i[1] == sre_parse.AT_END: 355 | ret += '$' 356 | elif i[0] == sre_parse.NEGATE: 357 | pass 358 | elif i[0] == sre_parse.RANGE: 359 | ret += '{0}-{1}'.format(unichr(i[1][0]), unichr(i[1][1])) 360 | elif i[0] == sre_parse.ASSERT: 361 | if i[1][0]: 362 | ret += '(?={0})'.format(sre_to_string(i[1][1], paren=False)) 363 | else: 364 | ret += '{0}'.format(sre_to_string(i[1][1], paren=paren)) 365 | elif i[0] == sre_parse.ASSERT_NOT: 366 | pass 367 | else: 368 | print('[!] cannot handle expression "%s"' % str(i)) 369 | return ret 370 | 371 | 372 | def simplify(regex_string): 373 | """Simplify a regular expression 374 | 375 | :param regex_string: Regular expression 376 | :type regex_string: str 377 | :rtype: str 378 | """ 379 | r = parse(regex_string) 380 | return sre_to_string(r) 381 | 382 | 383 | def parse(s): 384 | """Regular expression parser 385 | 386 | :param s: Regular expression 387 | :type s: str 388 | :rtype: list 389 | """ 390 | if IS_PY3: 391 | r = sre_parse.parse(s, flags=U) 392 | else: 393 | r = sre_parse.parse(s.decode('utf-8'), flags=U) 394 | return list(r) 395 | 396 | 397 | def generate(s, limit=20): 398 | """Creates a generator that generates all matching strings to a given regular expression 399 | 400 | :param s: Regular expression 401 | :type s: str 402 | :param limit: Range limit 403 | :type limit: int 404 | :returns: string generator object 405 | """ 406 | return _gen(parse(s), limit) 407 | 408 | 409 | def count(s, limit=20): 410 | """Counts all matching strings to a given regular expression 411 | 412 | :param s: Regular expression 413 | :type s: str 414 | :param limit: Range limit 415 | :type limit: int 416 | :rtype: int 417 | :returns: number of matching strings 418 | """ 419 | return _gen(parse(s), limit, count=True) 420 | 421 | 422 | def getone(regex_string, limit=20): 423 | """Returns a random matching string to a 
given regular expression 424 | """ 425 | return _randone(parse(regex_string), limit) 426 | 427 | 428 | def argparser(): 429 | import argparse 430 | from sys import stdout 431 | argp = argparse.ArgumentParser( 432 | description='exrex - regular expression string generator') 433 | argp.add_argument( 434 | '-o', '--output', 435 | help='Output file - default is STDOUT', 436 | metavar='FILE', 437 | default=stdout, 438 | type=argparse.FileType('w') 439 | ) 440 | argp.add_argument( 441 | '-l', '--limit', 442 | help='Max limit for range size - default is 20', 443 | default=20, 444 | action='store', 445 | type=int, 446 | metavar='N' 447 | ) 448 | argp.add_argument( 449 | '-c', '--count', 450 | help='Count matching strings', 451 | default=False, 452 | action='store_true' 453 | ) 454 | argp.add_argument( 455 | '-m', '--max-number', 456 | help='Max number of strings - default is -1', 457 | default=-1, 458 | action='store', 459 | type=int, 460 | metavar='N' 461 | ) 462 | argp.add_argument( 463 | '-r', '--random', 464 | help='Returns a random string that matches to the regex', 465 | default=False, 466 | action='store_true' 467 | ) 468 | argp.add_argument( 469 | '-s', '--simplify', 470 | help='Simplifies a regular expression', 471 | default=False, 472 | action='store_true' 473 | ) 474 | argp.add_argument( 475 | '-d', '--delimiter', 476 | help='Delimiter - default is \\n', 477 | default='\n' 478 | ) 479 | argp.add_argument( 480 | '-v', '--verbose', 481 | action='store_true', 482 | help='Verbose mode', 483 | default=False 484 | ) 485 | argp.add_argument( 486 | 'regex', 487 | metavar='REGEX', 488 | help='REGEX string' 489 | ) 490 | return vars(argp.parse_args()) 491 | 492 | 493 | def __main__(): 494 | from sys import exit, stderr 495 | args = argparser() 496 | if args['verbose']: 497 | args['output'].write( 498 | '%r%s' % (parse(args['regex']), args['delimiter'])) 499 | if args['count']: 500 | args['output'].write( 501 | '%d%s' % (count(args['regex'], limit=args['limit']), 
args['delimiter'])) 502 | exit(0) 503 | if args['random']: 504 | args['output'].write( 505 | '%s%s' % (getone(args['regex'], limit=args['limit']), args['delimiter'])) 506 | exit(0) 507 | if args['simplify']: 508 | args['output'].write( 509 | '%s%s' % (simplify(args['regex']), args['delimiter'])) 510 | exit(0) 511 | try: 512 | g = generate(args['regex'], args['limit']) 513 | except Exception as e: 514 | stderr.write('[!] Error: %s\n' % e) 515 | exit(1) 516 | args['output'].write(next(g)) 517 | args['max_number'] -= 1 518 | for s in g: 519 | if args['max_number'] == 0: 520 | break 521 | args['max_number'] -= 1 522 | args['output'].write(args['delimiter']) 523 | args['output'].write(s) 524 | if args['delimiter'] == '\n': 525 | args['output'].write('\n') 526 | 527 | 528 | if __name__ == '__main__': 529 | __main__() 530 | -------------------------------------------------------------------------------- /re.c: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Mini regex-module inspired by Rob Pike's regex code described in: 4 | * 5 | * http://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html 6 | * 7 | * 8 | * 9 | * Supports: 10 | * --------- 11 | * '.' Dot, matches any character 12 | * '^' Start anchor, matches beginning of string 13 | * '$' End anchor, matches end of string 14 | * '*' Asterisk, match zero or more (greedy) 15 | * '+' Plus, match one or more (greedy) 16 | * '?' Question, match zero or one (non-greedy) 17 | * '[abc]' Character class, match if one of {'a', 'b', 'c'} 18 | * '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'} 19 | * '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z } 20 | * '\s' Whitespace, \t \f \r \n \v and spaces 21 | * '\S' Non-whitespace 22 | * '\w' Alphanumeric, [a-zA-Z0-9_] 23 | * '\W' Non-alphanumeric 24 | * '\d' Digits, [0-9] 25 | * '\D' Non-digits 26 | * '\xXX' Hex-encoded byte 27 | * '|' Branch Or, e.g. 
a|A, \w|\s 28 | * '{n}' Match n times 29 | * '{n,}' Match n or more times 30 | * '{,m}' Match m or less times 31 | * '{n,m}' Match n to m times 32 | 33 | * FIXME: 34 | * '(...)' Group 35 | * 36 | * TODO: 37 | * - multibyte support (mbtowc, esp. UTF-8. maybe hardcode UTF-8 without libc locale insanity) 38 | * - \b word boundary support 39 | */ 40 | 41 | 42 | #include "re.h" 43 | #include 44 | #include 45 | #include 46 | #ifdef _UNICODE 47 | # include 48 | # include 49 | #endif 50 | 51 | /* Definitions: */ 52 | 53 | #define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */ 54 | #ifndef CPROVER 55 | #define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */ 56 | #else 57 | #define MAX_REGEXP_OBJECTS 8 /* faster formal proofs */ 58 | #endif 59 | 60 | #define MAX_REGEXP_LEN 70 61 | 62 | #ifdef DEBUG 63 | #define DEBUG_P(...) fprintf(stderr, __VA_ARGS__) 64 | #else 65 | #define DEBUG_P(...) 66 | #endif 67 | 68 | enum regex_type_e { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, 69 | CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, 70 | NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, BRANCH, GROUP, GROUPEND, 71 | TIMES, TIMES_N, TIMES_M, TIMES_NM }; 72 | 73 | typedef struct regex_t 74 | { 75 | unsigned short type; /* CHAR, STAR, etc. */ 76 | union 77 | { 78 | struct { 79 | char ch; /* the character itself */ 80 | char data[]; 81 | }; 82 | unsigned char group_size; /* OR the number of group patterns. */ 83 | unsigned char group_start; /* OR for GROUPEND, the start index of the group. 
*/ 84 | struct { 85 | unsigned short n; /* match n times */ 86 | unsigned short m; /* match n to m times */ 87 | }; 88 | } u; 89 | } regex_t; 90 | 91 | static unsigned getsize(regex_t* pattern) 92 | { 93 | unsigned size = sizeof(unsigned short); 94 | switch(pattern->type) { 95 | case GROUP: 96 | case GROUPEND: 97 | size += sizeof(unsigned short) * 2; 98 | break; 99 | case TIMES: 100 | case TIMES_N: 101 | case TIMES_M: 102 | case TIMES_NM: 103 | size += sizeof(unsigned short) * 2; 104 | break; 105 | case CHAR: 106 | size += sizeof(unsigned short) * 2; 107 | break; 108 | case CHAR_CLASS: 109 | case INV_CHAR_CLASS: 110 | size += sizeof(unsigned short) + strlen(&pattern->u.data[-1]); 111 | default: 112 | break; 113 | } 114 | 115 | if(size % 2) 116 | ++size; 117 | 118 | return size; 119 | } 120 | 121 | static re_t getnext(regex_t* pattern) 122 | { 123 | return (re_t)(((unsigned char*)pattern) + getsize(pattern)); 124 | } 125 | 126 | static re_t getindex(regex_t* pattern, int index) 127 | { 128 | for(int i = 1; i <= index; ++i) 129 | pattern = getnext(pattern); 130 | 131 | return pattern; 132 | } 133 | 134 | /* Private function declarations: */ 135 | static int matchpattern(regex_t* pattern, const char* text, int* matchlength, int *num_patterns); 136 | static int matchcharclass(char c, const char* str); 137 | static int matchstar(regex_t* p, regex_t* pattern, const char* text, int* matchlength); 138 | static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchlength); 139 | static int matchquestion(regex_t* p, regex_t* pattern, const char* text, int* matchlength); 140 | static int matchbranch(regex_t* p, regex_t* pattern, const char* text, int* matchlength); 141 | static int matchtimes(regex_t* p, unsigned short n, const char* text, int* matchlength); 142 | static int matchtimes_n(regex_t* p, unsigned short n, const char* text, int* matchlength); 143 | static int matchtimes_m(regex_t* p, unsigned short m, const char* text, int* matchlength); 144 | 
static int matchtimes_nm(regex_t* p, unsigned short n, unsigned short m, 145 | const char* text, int* matchlength); 146 | static int matchgroup(regex_t* p, const char* text, int* matchlength); 147 | static int matchone(regex_t* p, char c); 148 | static int matchdigit(char c); 149 | static int matchalpha(char c); 150 | static int matchwhitespace(char c); 151 | static int matchmetachar(char c, const char* str); 152 | static int matchrange(char c, const char* str); 153 | static int matchdot(char c); 154 | static int ismetachar(char c); 155 | static int hex (char c); 156 | 157 | /* Public functions: */ 158 | int re_match(const char* pattern, const char* text, int* matchlength) 159 | { 160 | return re_matchp(re_compile(pattern), text, matchlength); 161 | } 162 | 163 | int re_matchp(re_t pattern, const char* text, int* matchlength) 164 | { 165 | int num_patterns = 0; 166 | *matchlength = 0; 167 | if (pattern != 0) 168 | { 169 | if (pattern->type == BEGIN) 170 | { 171 | return ((matchpattern(getnext(pattern), text, matchlength, &num_patterns)) ? 0 : -1); 172 | } 173 | else 174 | { 175 | int idx = -1; 176 | 177 | do 178 | { 179 | idx += 1; 180 | 181 | if (matchpattern(pattern, text, matchlength, &num_patterns)) 182 | { 183 | // empty branch matches null (i.e. 
ok, but *matchlength == 0) 184 | if (*matchlength && text[0] == '\0') 185 | return -1; 186 | 187 | return idx; 188 | } 189 | 190 | // Reset match length for the next starting point 191 | *matchlength = 0; 192 | 193 | } 194 | while (*text++ != '\0'); 195 | } 196 | } 197 | return -1; 198 | } 199 | 200 | re_t re_compile_to(const char* pattern, unsigned char* re_data, unsigned* size) 201 | { 202 | memset(re_data, 0, *size); 203 | 204 | char c; /* current char in pattern */ 205 | int i = 0; /* index into pattern */ 206 | int j = 0; /* index into re_data */ 207 | unsigned bytes = *size; 208 | *size = 0; 209 | 210 | regex_t *re_compiled = (regex_t*)(re_data); 211 | 212 | while (pattern[i] != '\0' && ((char*)re_compiled < (char*)re_data + bytes - sizeof(re_compiled))) 213 | { 214 | c = pattern[i]; 215 | 216 | switch (c) 217 | { 218 | /* Meta-characters: */ 219 | case '^': { re_compiled->type = BEGIN; } break; 220 | case '$': { re_compiled->type = END; } break; 221 | case '.': { re_compiled->type = DOT; } break; 222 | case '|': { re_compiled->type = BRANCH; } break; 223 | case '*': 224 | { 225 | if (j > 0) 226 | re_compiled->type = STAR; 227 | else // nothing to repeat at position 0 228 | return 0; 229 | } break; 230 | case '+': 231 | { 232 | if (j > 0) 233 | re_compiled->type = PLUS; 234 | else // nothing to repeat at position 0 235 | return 0; 236 | } break; 237 | case '?': 238 | { 239 | if (j > 0) 240 | re_compiled->type = QUESTIONMARK; 241 | else // nothing to repeat at position 0 242 | return 0; 243 | } break; 244 | 245 | case '(': 246 | { 247 | char *p = strrchr(&pattern[i], ')'); 248 | if (p && *(p - 1) != '\\') 249 | { 250 | re_compiled->type = GROUP; 251 | re_compiled->u.group_size = 0; 252 | } 253 | /* '(' without matching ')' */ 254 | else 255 | return 0; 256 | break; 257 | } 258 | case ')': 259 | { 260 | int nestlevel = 0; 261 | int k = j - 1; 262 | /* search back to next innermost groupstart */ 263 | for (; k >= 0; k--) 264 | { 265 | regex_t* cur = 
getindex((regex_t*)re_data, k); 266 | if (k < j && cur->type == GROUPEND) 267 | nestlevel++; 268 | else if (cur->type == GROUP) 269 | { 270 | if (nestlevel == 0) 271 | { 272 | cur->u.group_size = j - k - 1; 273 | re_compiled->type = GROUPEND; 274 | re_compiled->u.group_start = k; // index of group 275 | break; 276 | } 277 | nestlevel--; 278 | } 279 | } 280 | /* ')' without matching '(' */ 281 | if (k < 0) 282 | return 0; 283 | break; 284 | } 285 | case '{': 286 | { 287 | unsigned short n, m; 288 | char *p = strchr (&pattern[i+1], '}'); 289 | re_compiled->type = CHAR; 290 | re_compiled->u.ch = c; 291 | //re_compiled->u.data_len = 1; 292 | if (!p || j == 0) // those invalid quantifiers are compiled as is 293 | { // (in python and perl) 294 | re_compiled->type = CHAR; 295 | re_compiled->u.ch = c; 296 | //re_compiled->u.data_len = 1; 297 | } 298 | else if (2 != sscanf (&pattern[i], "{%hd,%hd}", &n, &m)) 299 | { 300 | int o; 301 | if (!(2 == sscanf (&pattern[i], "{%hd,}%n", &n, &o) && pattern[o] == '\0') || 302 | n == 0 || n > 32767) 303 | { 304 | if (1 != sscanf (&pattern[i], "{,%hd}", &m) || 305 | *(p-1) == ',' || m == 0 || m > 32767) 306 | { 307 | if (1 == sscanf (&pattern[i], "{%hd}", &n) && 308 | n > 0 && n <= 32767) 309 | { 310 | re_compiled->type = TIMES; 311 | re_compiled->u.n = n; 312 | } 313 | } 314 | else 315 | { 316 | re_compiled->type = TIMES_M; 317 | re_compiled->u.m = m; 318 | } 319 | } 320 | else 321 | { 322 | re_compiled->type = TIMES_N; 323 | re_compiled->u.n = n; 324 | } 325 | } 326 | else 327 | { 328 | // m must be greater than n, and none of them may be 0 or negative. 
329 | if (!(n == 0 || m == 0 || n > 32767 || m > 32767 || m <= n || *(p-1) == ',')) 330 | { 331 | re_compiled->type = TIMES_NM; 332 | re_compiled->u.n = n; 333 | re_compiled->u.m = m; 334 | } 335 | } 336 | if (re_compiled->type != CHAR) 337 | i += (p - &pattern[i]); 338 | break; 339 | } 340 | /* Escaped character-classes (\s \S \w \W \d \D \*): */ 341 | case '\\': 342 | { 343 | if (pattern[i+1] != '\0') 344 | { 345 | /* Skip the escape-char '\\' */ 346 | i += 1; 347 | /* ... and check the next */ 348 | switch (pattern[i]) 349 | { 350 | /* Meta-characters: */ 351 | case 'd': { re_compiled->type = DIGIT; } break; 352 | case 'D': { re_compiled->type = NOT_DIGIT; } break; 353 | case 'w': { re_compiled->type = ALPHA; } break; 354 | case 'W': { re_compiled->type = NOT_ALPHA; } break; 355 | case 's': { re_compiled->type = WHITESPACE; } break; 356 | case 'S': { re_compiled->type = NOT_WHITESPACE; } break; 357 | case 'x': { 358 | /* \xXX */ 359 | re_compiled->type = CHAR; 360 | i++; 361 | int h = hex(pattern[i]); 362 | if (h == -1) 363 | { 364 | re_compiled->u.ch = '\\'; 365 | re_compiled->type = CHAR; 366 | 367 | re_compiled = getnext(re_compiled); 368 | re_compiled->u.ch = 'x'; 369 | re_compiled->type = CHAR; 370 | 371 | re_compiled = getnext(re_compiled); 372 | re_compiled->u.ch = pattern[i]; 373 | re_compiled->type = CHAR; 374 | break; 375 | } 376 | re_compiled->u.ch = h << 4; 377 | h = hex(pattern[++i]); 378 | if (h != -1) 379 | re_compiled->u.ch += h; 380 | else 381 | { 382 | re_compiled->u.ch = '\\'; 383 | re_compiled->type = CHAR; 384 | 385 | re_compiled = getnext(re_compiled); 386 | re_compiled->u.ch = 'x'; 387 | re_compiled->type = CHAR; 388 | 389 | re_compiled = getnext(re_compiled); 390 | re_compiled->u.ch = pattern[i-1]; 391 | re_compiled->type = CHAR; 392 | 393 | if (pattern[i]) 394 | { 395 | re_compiled = getnext(re_compiled); 396 | re_compiled->u.ch = pattern[i]; 397 | re_compiled->type = CHAR; 398 | } 399 | } 400 | } break; 401 | 402 | /* Escaped character, 
e.g. '.', '$' or '\\' */ 403 | default: 404 | { 405 | re_compiled->type = CHAR; 406 | re_compiled->u.ch = pattern[i]; 407 | } break; 408 | } 409 | } 410 | /* '\\' as last char without previous \\ -> invalid regular expression. */ 411 | else 412 | return 0; 413 | } break; 414 | 415 | /* Character class: */ 416 | case '[': 417 | { 418 | int charIdx = -1; 419 | 420 | /* Look-ahead to determine if negated */ 421 | if (pattern[i+1] == '^') 422 | { 423 | re_compiled->type = INV_CHAR_CLASS; 424 | i += 1; /* Increment i to avoid including '^' in the char-buffer */ 425 | if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */ 426 | { 427 | return 0; 428 | } 429 | } 430 | else 431 | { 432 | re_compiled->type = CHAR_CLASS; 433 | } 434 | 435 | /* Copy characters inside [..] to buffer */ 436 | while ( (pattern[++i] != ']') 437 | && (pattern[i] != '\0')) /* Missing ] */ 438 | { 439 | if (pattern[i] == '\\') 440 | { 441 | 442 | if (&re_compiled->u.data[charIdx] >= (char*)re_data + bytes) 443 | { 444 | //fputs("exceeded internal buffer!\n", stderr); 445 | return 0; 446 | } 447 | 448 | if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '\\' */ 449 | { 450 | return 0; 451 | } 452 | re_compiled->u.data[charIdx++] = pattern[i++]; 453 | } 454 | else if (&re_compiled->u.data[charIdx] >= (char*)re_data + bytes) 455 | { 456 | //fputs("exceeded internal buffer!\n", stderr); 457 | return 0; 458 | } 459 | 460 | re_compiled->u.data[charIdx++] = pattern[i]; 461 | } 462 | 463 | if (&re_compiled->u.data[charIdx] >= (char*)re_data + bytes) 464 | { 465 | /* Catches cases such as [00000000000000000000000000000000000000][ */ 466 | //fputs("exceeded internal buffer!\n", stderr); 467 | return 0; 468 | } 469 | 470 | /* Null-terminate string end */ 471 | re_compiled->u.data[charIdx++] = '\0'; 472 | } break; 473 | 474 | case '\0': // EOL (dead-code) 475 | return 0; 476 | 477 | /* Other characters: */ 478 | default: 479 | { 480 | re_compiled->type = CHAR; 
481 | // cbmc: arithmetic overflow on signed to unsigned type conversion in c 482 | re_compiled->u.ch = c; 483 | } break; 484 | } 485 | i += 1; 486 | j += 1; 487 | re_compiled = getnext(re_compiled); 488 | } 489 | /* 'UNUSED' is a sentinel used to indicate end-of-pattern */ 490 | re_compiled->type = UNUSED; 491 | 492 | /* Calculate final, compressed actual size. */ 493 | *size = (unsigned char*)getnext(re_compiled) - re_data; 494 | 495 | return (re_t) re_data; 496 | } 497 | 498 | re_t re_compile(const char* pattern) 499 | { 500 | static unsigned char buffer[MAX_REGEXP_OBJECTS * sizeof(regex_t)]; 501 | unsigned size = sizeof(buffer); 502 | return re_compile_to(pattern, buffer, &size); 503 | } 504 | 505 | unsigned re_size(re_t pattern) 506 | { 507 | unsigned bytes = 0; 508 | 509 | while(pattern) 510 | { 511 | bytes += getsize(pattern); 512 | 513 | if(pattern->type == UNUSED) 514 | break; 515 | 516 | pattern = getnext(pattern); 517 | } 518 | 519 | return bytes; 520 | } 521 | 522 | int re_compare(re_t pattern1, re_t pattern2) { 523 | int result = 0; 524 | 525 | const unsigned totalSize1 = re_size(pattern1); 526 | const unsigned totalSize2 = re_size(pattern2); 527 | 528 | if(totalSize1 > totalSize2) 529 | return 1; 530 | else if(totalSize2 > totalSize1) 531 | return -1; 532 | 533 | while(pattern1 && pattern2) { 534 | unsigned size1 = getsize(pattern1); 535 | unsigned size2 = getsize(pattern2); 536 | 537 | if(size1 > size2) 538 | return 1; 539 | else if(size2 > size1) 540 | return -1; 541 | 542 | result = memcmp(pattern1, pattern2, size1); 543 | 544 | if(result != 0) 545 | return result; 546 | 547 | if(pattern1->type == UNUSED) 548 | break; 549 | 550 | pattern1 = getnext(pattern1); 551 | pattern2 = getnext(pattern2); 552 | 553 | } 554 | 555 | return result; 556 | } 557 | 558 | #define re_string_cat_fmt_(buff, ...) 
\ 559 | do { \ 560 | sprintf(tmp_buff, __VA_ARGS__); \ 561 | strncat(buff, tmp_buff, count - *size - 1); \ 562 | *size = strlen(buff); \ 563 | if(*size >= count) \ 564 | return; \ 565 | } while(0) 566 | 567 | void re_string(regex_t* pattern, char* buffer, unsigned* size) 568 | { 569 | #if 0 570 | const char *const types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH", "GROUP", "GROUPEND", "TIMES", "TIMES_N", "TIMES_M", "TIMES_NM" }; 571 | #endif 572 | unsigned count = *size; 573 | unsigned char i = 0; 574 | int j; 575 | unsigned char group_end = 0; 576 | char c; 577 | char tmp_buff[128]; 578 | 579 | *size = 0; 580 | buffer[0] = '\0'; 581 | 582 | if (!pattern) 583 | return; 584 | while(*size < count) 585 | { 586 | if (pattern->type == UNUSED) 587 | { 588 | break; 589 | } 590 | 591 | //if (group_end && i == group_end) 592 | // printf(" )\n"); 593 | #if 0 594 | if (pattern->type <= TIMES_NM) 595 | re_string_cat_fmt_(buffer, "type: %s", types[pattern->type]); 596 | else 597 | re_string_cat_fmt_(buffer, "invalid type: %d", pattern->type); 598 | #endif 599 | if (pattern->type == CHAR_CLASS || pattern->type == INV_CHAR_CLASS) 600 | { 601 | re_string_cat_fmt_(buffer, "["); 602 | if (pattern->type == INV_CHAR_CLASS) 603 | re_string_cat_fmt_(buffer, "^"); 604 | j = -1; 605 | while((c = pattern->u.data[j])) 606 | { 607 | if (c == ']') 608 | { 609 | break; 610 | } 611 | re_string_cat_fmt_(buffer, "%c", c); 612 | ++j; 613 | } 614 | re_string_cat_fmt_(buffer, "]"); 615 | } 616 | else if (pattern->type == CHAR) 617 | { 618 | re_string_cat_fmt_(buffer, "%c", pattern->u.ch); 619 | } 620 | else if (pattern->type == TIMES) 621 | { 622 | re_string_cat_fmt_(buffer, "{%hu}", pattern->u.n); 623 | } 624 | else if (pattern->type == TIMES_N) 625 | { 626 | re_string_cat_fmt_(buffer, "{%hu,}", pattern->u.n); 627 | } 628 | else if (pattern->type == 
TIMES_M) 629 | { 630 | re_string_cat_fmt_(buffer, "{,%hu}", pattern->u.m); 631 | } 632 | else if (pattern->type == TIMES_NM) 633 | { 634 | re_string_cat_fmt_(buffer, "{%hu,%hu}", pattern->u.n, pattern->u.m); 635 | } 636 | else if (pattern->type == GROUP) 637 | { 638 | group_end = i + pattern->u.group_size; 639 | if (group_end >= MAX_REGEXP_OBJECTS) 640 | return; 641 | re_string_cat_fmt_(buffer, " ("); 642 | } 643 | else if (pattern->type == GROUPEND) 644 | { 645 | re_string_cat_fmt_(buffer, " )"); 646 | } 647 | else if(pattern->type == BEGIN) 648 | { 649 | re_string_cat_fmt_(buffer, "^"); 650 | } 651 | else if(pattern->type == END) 652 | { 653 | re_string_cat_fmt_(buffer, "$"); 654 | } 655 | else if(pattern->type == QUESTIONMARK) 656 | { 657 | re_string_cat_fmt_(buffer, "?"); 658 | } 659 | else if(pattern->type == DIGIT) 660 | { 661 | re_string_cat_fmt_(buffer, "\\d"); 662 | } 663 | //re_string_cat_fmt_(buffer, "\n"); 664 | ++i; 665 | pattern = getnext(pattern); 666 | } 667 | 668 | } 669 | 670 | static int hex (char c) 671 | { 672 | if (c >= 'a' && c <= 'f') 673 | return c - 'a' + 10; 674 | else if (c >= 'A' && c <= 'F') 675 | return c - 'A' + 10; 676 | else if (c >= '0' && c <= '9') 677 | return c - '0'; 678 | else 679 | return -1; 680 | } 681 | 682 | /* Private functions: */ 683 | static int matchdigit(char c) 684 | { 685 | return isdigit((unsigned char)c); 686 | } 687 | static int matchalpha(char c) 688 | { 689 | return isalpha((unsigned char)c); 690 | } 691 | static int matchwhitespace(char c) 692 | { 693 | return isspace((unsigned char)c); 694 | } 695 | static int matchalphanum(char c) 696 | { 697 | return ((c == '_') || matchalpha(c) || matchdigit(c)); 698 | } 699 | static int matchrange(char c, const char* str) 700 | { 701 | return ( (c != '-') 702 | && (str[0] != '\0') 703 | && (str[0] != '-') 704 | && (str[1] == '-') 705 | && (str[2] != '\0') 706 | && ( (c >= str[0]) 707 | && (c <= str[2]))); 708 | } 709 | static int matchdot(char c) 710 | { 711 | #if 
defined(RE_DOT_MATCHES_NEWLINE) && (RE_DOT_MATCHES_NEWLINE == 1) 712 | (void)c; 713 | return 1; 714 | #else 715 | return c != '\n' && c != '\r'; 716 | #endif 717 | } 718 | static int ismetachar(char c) 719 | { 720 | return ((c == 's') || (c == 'S') || (c == 'w') || (c == 'W') || (c == 'd') || (c == 'D')); 721 | } 722 | 723 | static int matchmetachar(char c, const char* str) 724 | { 725 | switch (str[0]) 726 | { 727 | case 'd': return matchdigit(c); 728 | case 'D': return !matchdigit(c); 729 | case 'w': return matchalphanum(c); 730 | case 'W': return !matchalphanum(c); 731 | case 's': return matchwhitespace(c); 732 | case 'S': return !matchwhitespace(c); 733 | default: return (c == str[0]); 734 | } 735 | } 736 | 737 | static int matchcharclass(char c, const char* str) 738 | { 739 | do 740 | { 741 | if (matchrange(c, str)) 742 | { 743 | DEBUG_P("%c matches %s\n", c, str); 744 | return 1; 745 | } 746 | else if (str[0] == '\\') 747 | { 748 | /* Escape-char: increment str-ptr and match on next char */ 749 | str += 1; 750 | if (matchmetachar(c, str)) 751 | { 752 | return 1; 753 | } 754 | else if ((c == str[0]) && !ismetachar(c)) 755 | { 756 | return 1; 757 | } 758 | } 759 | else if (c == str[0]) 760 | { 761 | if (c == '-') 762 | { 763 | if ((str[-1] == '\0') || (str[1] == '\0')) 764 | return 1; 765 | // else continue 766 | } 767 | else 768 | { 769 | return 1; 770 | } 771 | } 772 | } 773 | while (*str++ != '\0'); 774 | 775 | DEBUG_P("%c did not match prev. 
ccl\n", c); 776 | return 0; 777 | } 778 | 779 | static int matchone(regex_t* p, char c) 780 | { 781 | DEBUG_P("ONE %d matches %c?\n", p->type, c); 782 | switch (p->type) 783 | { 784 | case DOT: return matchdot(c); 785 | case CHAR_CLASS: return matchcharclass(c, (const char*)&p->u.data[-1]); 786 | case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)&p->u.data[-1]); 787 | case DIGIT: return matchdigit(c); 788 | case NOT_DIGIT: return !matchdigit(c); 789 | case ALPHA: return matchalphanum(c); 790 | case NOT_ALPHA: return !matchalphanum(c); 791 | case WHITESPACE: return matchwhitespace(c); 792 | case NOT_WHITESPACE: return !matchwhitespace(c); 793 | case GROUPEND: return 1; 794 | case BEGIN: return 0; 795 | default: return (p->u.ch == c); 796 | } 797 | } 798 | 799 | static int matchstar(regex_t* p, regex_t* pattern, const char* text, int* matchlength) 800 | { 801 | int num_patterns = 0; 802 | return matchplus(p, pattern, text, matchlength) || 803 | matchpattern(pattern, text, matchlength, &num_patterns); 804 | } 805 | 806 | static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchlength) 807 | { 808 | int num_patterns = 0; 809 | const char* prepoint = text; 810 | while ((text[0] != '\0') && matchone(p, *text)) 811 | { 812 | DEBUG_P("+ matches %s\n", text); 813 | text++; 814 | } 815 | for (; text > prepoint; text--) 816 | { 817 | if (matchpattern(pattern, text, matchlength, &num_patterns)) 818 | { 819 | *matchlength += text - prepoint; 820 | return 1; 821 | } 822 | DEBUG_P("+ pattern does not match %s\n", &text[1]); 823 | } 824 | DEBUG_P("+ pattern did not match %s\n", prepoint); 825 | return 0; 826 | } 827 | 828 | static int matchquestion(regex_t* p, regex_t* pattern, const char* text, int* matchlength) 829 | { 830 | int num_patterns = 0; 831 | if (p->type == UNUSED) 832 | return 1; 833 | if (matchpattern(pattern, text, matchlength, &num_patterns)) 834 | { 835 | #ifdef DEBUG 836 | re_print(pattern); 837 | DEBUG_P("? 
matched %s\n", text); 838 | #endif 839 | return 1; 840 | } 841 | if (*text && matchone(p, *text++)) 842 | { 843 | if (matchpattern(pattern, text, matchlength, &num_patterns)) 844 | { 845 | (*matchlength)++; 846 | #ifdef DEBUG 847 | re_print(pattern); 848 | DEBUG_P("? matched %s\n", text); 849 | #endif 850 | return 1; 851 | } 852 | } 853 | return 0; 854 | } 855 | 856 | static int matchtimes(regex_t* p, unsigned short n, const char* text, int* matchlength) 857 | { 858 | unsigned short i = 0; 859 | int pre = *matchlength; 860 | /* Match the pattern n times */ 861 | while (*text && matchone(p, *text++) && i < n) 862 | { 863 | (*matchlength)++; 864 | i++; 865 | } 866 | if (i == n) 867 | return 1; 868 | *matchlength = pre; 869 | return 0; 870 | } 871 | 872 | static int matchtimes_n(regex_t* p, unsigned short n, const char* text, int* matchlength) 873 | { 874 | unsigned short i = 0; 875 | int pre = *matchlength; 876 | /* Match the pattern n or more times */ 877 | while (*text && matchone(p, *text++)) 878 | { 879 | i++; 880 | ++(*matchlength); 881 | } 882 | if (i >= n) 883 | return 1; 884 | *matchlength = pre; 885 | return 0; 886 | } 887 | 888 | static int matchtimes_m(regex_t* p, unsigned short m, const char* text, int* matchlength) 889 | { 890 | unsigned short i = 0; 891 | /* Match the pattern max m times */ 892 | while (*text && matchone(p, *text++) && i < m) 893 | { 894 | (*matchlength)++; 895 | i++; 896 | } 897 | return 1; 898 | } 899 | 900 | static int matchtimes_nm(regex_t* p, unsigned short n, unsigned short m, const char* text, int* matchlength) 901 | { 902 | unsigned short i = 0; 903 | int pre = *matchlength; 904 | /* Match the pattern n to m times */ 905 | while (*text && matchone(p, *text++) && i < m) 906 | { 907 | (*matchlength)++; 908 | i++; 909 | } 910 | if (i >= n && i <= m) 911 | return 1; 912 | *matchlength = pre; 913 | return 0; 914 | } 915 | 916 | static int matchbranch(regex_t* p, regex_t* pattern, const char* text, int* matchlength) 917 | { 918 | int 
num_patterns = 0; 919 | const char* prepoint = text; 920 | if (p->type == UNUSED) 921 | return 1; 922 | /* Match the current p (previous) */ 923 | if (*text && matchone(p, *text++)) 924 | { 925 | (*matchlength)++; 926 | return 1; 927 | } 928 | if (pattern->type == UNUSED) 929 | // empty branch "0|" allows NULL text 930 | return 1; 931 | /* or the next branch */ 932 | if (matchpattern(pattern, prepoint, matchlength, &num_patterns)) 933 | return 1; 934 | return 0; 935 | } 936 | 937 | static int matchgroup(regex_t* p, const char* text, int* matchlength) 938 | { 939 | int pre = *matchlength; 940 | int num_patterns = 0, length = pre; 941 | regex_t* groupstart = p; 942 | const regex_t* groupend = getindex(p, p->u.group_size + 1);//&p[p->u.group_size + 1]; 943 | DEBUG_P("does GROUP (%u) match %s?\n", (unsigned)p->u.group_size, text); 944 | p = getnext(p); 945 | while (p < groupend) 946 | { 947 | if (p->type == UNUSED) // only with invalid external compiles 948 | return 0; 949 | if (!matchpattern(p, text, &length, &num_patterns)) 950 | { 951 | DEBUG_P("GROUP did not match %.*s (len %d, patterns %d)\n", length, text-*matchlength, *matchlength, num_patterns); 952 | *matchlength = pre; 953 | return 0; 954 | } 955 | DEBUG_P("GROUP did match %.*s (len %d, patterns %d)\n", length, text-*matchlength, *matchlength, num_patterns); 956 | int delta = length - *matchlength; 957 | text += delta; 958 | p = getindex(groupstart, num_patterns); 959 | *matchlength += delta; 960 | } 961 | DEBUG_P("ENDGROUP did match %s (len %d, patterns %d)\n", text-*matchlength, *matchlength, num_patterns); 962 | return 1; 963 | } 964 | 965 | static inline int ismultimatch(unsigned char type) { 966 | switch(type) { 967 | case TIMES: 968 | case TIMES_N: 969 | case TIMES_M: 970 | case TIMES_NM: 971 | return 1; 972 | default: 973 | return 0; 974 | } 975 | } 976 | 977 | /* Iterative matching */ 978 | static int matchpattern(regex_t* pattern, const char* text, int* matchlength, int *num_patterns) 979 | { 980 | 
int pre = *matchlength; 981 | while(1) 982 | { 983 | if(pattern->type == UNUSED) 984 | { 985 | return 1; 986 | } 987 | 988 | regex_t* next_pattern = getnext(pattern); 989 | 990 | if (next_pattern->type == QUESTIONMARK) 991 | { 992 | return matchquestion(pattern, getnext(next_pattern), text, matchlength); 993 | } 994 | else if (next_pattern->type == STAR) 995 | { 996 | //int i = (pattern[1].type == GROUPEND) ? pattern[1].u.group_start : 0; 997 | return matchstar(pattern, getnext(next_pattern), text, matchlength); 998 | } 999 | else if (next_pattern->type == PLUS) 1000 | { 1001 | DEBUG_P("PLUS match %s?\n", text); 1002 | //int i = (pattern[1].type == GROUPEND) ? pattern[1].u.group_start : 0; 1003 | return matchplus(pattern, getnext(next_pattern), text, matchlength); 1004 | } 1005 | else if(ismultimatch(next_pattern->type)) { 1006 | int retval = 0; 1007 | if (next_pattern->type == TIMES) 1008 | { 1009 | //int i = (pattern[1].type == GROUPEND) ? pattern[1].u.group_start : 0; 1010 | retval = matchtimes(pattern, next_pattern->u.n, text, matchlength); 1011 | } 1012 | else if (next_pattern->type == TIMES_N) 1013 | { 1014 | retval = matchtimes_n(pattern, next_pattern->u.n, text, matchlength); 1015 | } 1016 | else if (next_pattern->type == TIMES_M) 1017 | { 1018 | retval = matchtimes_m(pattern, next_pattern->u.m, text, matchlength); 1019 | } 1020 | else if (next_pattern->type == TIMES_NM) 1021 | { 1022 | //int i = (pattern[1].type == GROUPEND) ? pattern[1].u.group_start : 0; 1023 | retval = matchtimes_nm(pattern, next_pattern->u.n, next_pattern->u.m, text, 1024 | matchlength); 1025 | } 1026 | 1027 | if(!retval) return 0; 1028 | else { 1029 | pre = *matchlength; 1030 | (*num_patterns)++; 1031 | pattern = getnext(next_pattern); 1032 | text += *matchlength; 1033 | if(*text == '\0') return retval; 1034 | continue; 1035 | } 1036 | 1037 | } 1038 | else if (next_pattern->type == BRANCH) 1039 | { 1040 | //int i = (pattern[1].type == GROUPEND) ? 
pattern[1].u.group_start : 0; 1041 | return matchbranch(pattern, getnext(next_pattern), text, matchlength); 1042 | } 1043 | else if (pattern->type == GROUPEND) 1044 | { 1045 | (*num_patterns)++; 1046 | DEBUG_P("GROUPEND matches %.*s (len %d, patterns %d)\n", *matchlength, text-*matchlength, *matchlength, *num_patterns); 1047 | return 1; 1048 | } 1049 | else if (pattern->type == GROUP) 1050 | { 1051 | const int beforelen = *matchlength; 1052 | const int retval = matchgroup(pattern, text, matchlength); 1053 | 1054 | if(!retval) return 0; 1055 | else { 1056 | text += (*matchlength - beforelen); 1057 | pre = *matchlength; 1058 | (*num_patterns) += pattern->u.group_size + 2; 1059 | pattern = getindex(pattern, pattern->u.group_size + 2); 1060 | if(*text == '\0') return retval; 1061 | continue; 1062 | } 1063 | } 1064 | else if ((pattern->type == END) && next_pattern->type == UNUSED) 1065 | { 1066 | return (text[0] == '\0'); 1067 | } 1068 | (*matchlength)++; 1069 | (*num_patterns)++; 1070 | 1071 | if(text[0] == '\0') 1072 | break; 1073 | if(!matchone(pattern, *(text++))) 1074 | break; 1075 | pattern = next_pattern; 1076 | 1077 | } 1078 | 1079 | *matchlength = pre; 1080 | return 0; 1081 | } 1082 | 1083 | #ifdef CPROVER 1084 | #define N 24 1085 | 1086 | /* Formal verification with cbmc: */ 1087 | /* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c 1088 | */ 1089 | 1090 | void verify_re_compile() 1091 | { 1092 | /* test input - ten chars used as a regex-pattern input */ 1093 | char arr[N]; 1094 | /* make input symbolic, to search all paths through the code */ 1095 | /* i.e. 
the input is checked for all possible ten-char combinations */ 1096 | for (int i=0; i -127 && arr[i] < 128); 1099 | } 1100 | /* assume proper NULL termination */ 1101 | assume(arr[sizeof(arr) - 1] == 0); 1102 | /* verify abscence of run-time errors - go! */ 1103 | re_compile(arr); 1104 | } 1105 | 1106 | void verify_re_print() 1107 | { 1108 | regex_t pattern[MAX_REGEXP_OBJECTS]; 1109 | for (unsigned char i=0; i= 0 && pattern[i].type <= 255); 1112 | pattern[i].u.ccl = nondet_long(); 1113 | } 1114 | re_print(&pattern); 1115 | } 1116 | 1117 | void verify_re_match() 1118 | { 1119 | int length; 1120 | regex_t pattern[MAX_REGEXP_OBJECTS]; 1121 | char arr[N]; 1122 | 1123 | for (unsigned char i=0; i= 0 && pattern[i].type <= 255); 1127 | assume(pattern[i].u.ccl >= 0 && pattern[i].u.ccl <= ~1); 1128 | } 1129 | for (int i=0; i -127 && arr[i] < 128); 1131 | } 1132 | /* assume proper NULL termination */ 1133 | assume(arr[sizeof(arr) - 1] == 0); 1134 | 1135 | re_match(&pattern, arr, &length); 1136 | } 1137 | 1138 | int main(int argc, char* argv[]) 1139 | { 1140 | verify_re_compile(); 1141 | verify_re_print(); 1142 | verify_re_match(); 1143 | return 0; 1144 | } 1145 | #endif 1146 | --------------------------------------------------------------------------------