├── .gitattributes ├── .gitignore ├── LICENSE.md ├── README.md ├── reg_exp.cpp └── reg_exp.h /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 
2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <http://unlicense.org> 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # C++ Regex Matcher 2 | 3 | This is an educational library for matching regular expressions. 4 | It runs extremely fast, a few orders of magnitude faster than the built-in C++11 libraries, and outperforms almost any other widely known library. 5 | However, it is very minimalistic: it operates on ASCII symbols only and supports neither multi-threading nor multi-core execution. 6 | The library was tested under iOS, Android, Windows and Linux. 
7 | -------------------------------------------------------------------------------- /reg_exp.cpp: -------------------------------------------------------------------------------- 1 | // Includes 2 | #include "reg_exp.h" 3 | #include // printfs 4 | #include // memory operations like memcmp() memset() 5 | 6 | /****************************************************************************/ 7 | // Basic definitions of types. Feel free to remove if you have them already defined. 8 | #ifndef FALSE 9 | #define FALSE (0) 10 | #endif 11 | #ifndef TRUE 12 | #define TRUE (1) 13 | #endif 14 | #ifndef MAX 15 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 16 | #endif 17 | typedef unsigned char MYBOOL, UCHAR; // Feel free to change boolean to 'int' or C++ 'bool'. 18 | #if defined(__ANDROID__) || defined(__APPLE__) || defined(__MACH__) 19 | typedef __int32_t INT32; // Android and iOS specific INT32. Feel free to switch to 64bits integer 20 | #define strcpy_s(dst,size,src) strcpy(dst,src) // Forward compatibility to safe I/O methods. Activate only if needed (not needed for windows devices) 21 | #else 22 | typedef signed __int32 INT32; // Windows 23 | #endif 24 | 25 | /****************************************************************************/ 26 | // Fast Comparison of 8,16,32 bits int - without branching. Use in loops where speed is crucial. 27 | #define is_int_negative( num1 ) ((((INT32) (num1) )&0x80000000)>>31) // (num1<0 ) ? 1 : 0 28 | inline INT32 is_int_notZero( INT32 num1 ){ return is_int_negative(num1|-num1); } // (num1!=0 ) ? 1 : 0, Since (-X)|X is 0 for X==0 and negative for every other X. 29 | #define is_int_notEqual( num1,num2) is_int_notZero(num1^num2) // (num1!=num2)? 1 : 0 30 | #define is_int_equal( num1,num2) (is_int_notZero(num1^num2)^0x1) // (num1==num2)? 1 : 0 31 | #define is_int_inRange(i,L,U) ((((INT32((i)-(L)))|(INT32((U)-(i))))^0x80000000)>>31) // ((i>=L)&&(i<=U) ? 1 : 0 32 | #define isDigit( chr) is_int_inRange(chr,'0','9') // ('9'>=chr>='0') ? 
1 : 0 33 | 34 | /****************************************************************************/ 35 | static int atoui_simple(const char* s){ // Read positive integer from string. Like "123" is converted to 123. 36 | int resI = 0; 37 | for (;isDigit(*s); s++) 38 | resI = resI*10 + (s[0]-'0'); 39 | return resI; 40 | } 41 | 42 | /****************************************************************************/ 43 | // Define the types of special regex character commands (as bits). 44 | #define TYPE_CHAR (0) // Default 0 - Single character. Like: A b 7 . 45 | #define TYPE_PREFIX (1) // Sepcial '\\' command for Abbreviations and using special characters. like \\?. This is the only command that actually uses 2 characters 46 | #define TYPE_SUFFIX (2) // Command has iteration suffix like a*, z+, b{3,5} 47 | #define TYPE_OPEN (4) // Left Parentheses: On of the following { ( [ . This rule opens a sub expression 48 | #define TYPE_CLOSE (8) // Right parentheses: } ) ] 49 | #define TYPE_RECURSION (16) // Termination of recursive call. (command is a suffix of previous one. Like previous is 'A' and current is '{2}') Todo: Not used yet 50 | 51 | #define NO_MATCH (-1) // Returned when regular expression cannot be matched to the string. 52 | /****************************************************************************/ 53 | // Define a command structure (single language rule) 54 | typedef struct{ 55 | char id; // The character which represennts the command. Like: * ? [ 56 | char attr; // Type of the command. Can be combination of above types 57 | void* f; // Pointer to function which proccesses the current command. Polymorphism (C style) 58 | } Cmd; 59 | 60 | /****************************************************************************/ 61 | // Total there are 6 functions 62 | // First parameter is the pattern. 
// Second is the sample string and optional third is pointer to the end of the pattern
// Each one returns the number of consumed characters in sample string or NO_MATCH if inapplicable
// (like 'a+' was applied on 'bbb'). Note 'a*' can be applied on 'bbb' and it consumes zero characters.

// Two kinds of pointers to the functions that process the commands:
typedef int (*Stand_Func)(const char* pat, const char* sam);                    // Standard functions and characters like () . [] \\ A 7 ....
typedef int (*SuffixFunc)(const char* pat, const char* sam, const char* endp);  // Suffix functions for multiple occurrences like * + ? {}

// The handlers themselves. Declared up front because the command table refers to them.
static inline int c_achar(   const char* pat, const char* sam);                   // Single char comparison. Like 'b'
static inline int c_any(     const char* pat, const char* sam);                   // 'Any' comparison: '.' Note: '.' may consume few bytes (1 char) when working with unicodes, and only 1 byte (= 1 char) for ascii.
static inline int c_extended(const char* pat, const char* sam);                   // Special extended abbreviations starting with '\\'
static inline int c_group(   const char* pat, const char* sam);                   // Sub pattern. Grouping characters. Like (the) when searching inside 'I am the master of the realm'
static inline int c_option(  const char* pat, const char* sam);                   // Selection of one option to match. [aA]ce matches both words: Ace and ace
static inline int c_multi(   const char* pat, const char* sam, const char* endp); // Multiple occurrence of the character. Like A+, A* A{4} A?

/****************************************************************************/
// Define the table of command (regex rules).
For each id, it's length, type of command and processing function 78 | // Rules of commands: TYPE_CLOSE follows TYPE_OPEN immediately in command table 79 | static const Cmd cmd_tbl[] = { 80 | '(', TYPE_OPEN|TYPE_RECURSION, (void*)c_group, 81 | ')', TYPE_CLOSE|TYPE_RECURSION, (void*)NULL, 82 | '|', TYPE_CLOSE|TYPE_RECURSION, (void*)NULL, 83 | '[', TYPE_OPEN, (void*)c_option, 84 | ']', TYPE_CLOSE, (void*)NULL, 85 | '{', TYPE_SUFFIX|TYPE_OPEN, (void*)c_multi, 86 | '}', TYPE_CLOSE, (void*)NULL, 87 | '*', TYPE_SUFFIX, (void*)c_multi, 88 | '+', TYPE_SUFFIX, (void*)c_multi, 89 | '?', TYPE_SUFFIX, (void*)c_multi, 90 | '\\', TYPE_PREFIX, (void*)c_extended, 91 | '.', TYPE_CHAR, (void*)c_any, 92 | 0, TYPE_CHAR, (void*)c_achar, 93 | }; 94 | 95 | #define cmdLength( cmd) (1 + ((cmd)->attr&TYPE_PREFIX)) // All commands take 1 character + optional prefix character 96 | #define isSuffix(cmd) ( (cmd)->attr&TYPE_SUFFIX) // Does current command is a suffix of previous one. Like previous is 'A' and current is '{2}' 97 | #define isOpen( cmd) ( (cmd)->attr&TYPE_OPEN ) // Does this command opens a sub expression 98 | // Inverse table of the above (given a character like '*', 'C', '\' get the appropriate command). We use a look up table for all possible ASCII characters 99 | static const Cmd* get_cmd_byChar[128]; 100 | static int isInitialized = 0; // Was the look up table above initialized 101 | #define get_cmd(c) get_cmd_byChar[(c)&0x7F] // Get the command strucutre by character. 
For '(' will return '(' command 102 | #define isReservedSymbol(c) (get_cmd(c)->id != 0) // Is the given character a reserved symbol (used as regex command) 103 | 104 | /****************************************************************************/ 105 | /****************************** Aux functions *******************************/ 106 | /****************************************************************************/ 107 | // Initialize the get_cmd_byChar[] look up table 108 | static inline void init_regex_mechanism_private(void){ 109 | if (isInitialized) return; 110 | const Cmd* cmd = cmd_tbl, *end = cmd_tbl; 111 | while (end->id) end++; // Find the last default command (character processing) and store it in 'end' 112 | for (int i=0; i<128; i++) 113 | get_cmd_byChar[i] = end; // Set the whole look up table to point to the default command 114 | for (; cmdid] = cmd; // For all the real commands set an entry in the look up table 116 | isInitialized = TRUE; 117 | } 118 | 119 | /****************************************************************************/ 120 | // Find the end of the string. 121 | static inline const char* endOfString(const char *str){ 122 | while (*str) str++; 123 | return str; 124 | } 125 | 126 | /****************************************************************************/ 127 | // Find first occurence of character in the string. Returns the poitner in 'str' starting at 'c' or NULL if not found 128 | static inline const char* findFirstCinS(const char *str, const char c){ 129 | while ((*str)&&(*str-c)) str++; 130 | return (*str) ? str : NULL; 131 | } 132 | 133 | /****************************************************************************/ 134 | // Given an expression starting with left parentheses 'p' return a pointer after the end of this expression (right parentheses 'rp'). 135 | // Example: given p=[a*(1+3)[z@][aa]]tp5 returns pointer to: 'tp5' by skipping 136 | // the [a*(1+3)[z@][aa]] expression. 
137 | // Note: handles nested parentheses by remembring the parentheses depth. Example: (((X)Y)Z)A - X is in depth 3, Z - in depth 1, A in depth zero. 138 | // Extension _uc means uncompiled. needs to read forward in real time to find the end of the expression. 139 | static inline const char* findExpressionEnd_uc(const char *p, const char rp){ 140 | char lp = *p; // left parethesis 141 | int depth = 1; // amount of '('- amount of ')'. 142 | MYBOOL isValidCommand = TRUE; // \\[ doesn't count as valid parentheses since it should be treated as part of the text. 143 | for (p++; is_int_notZero(*p)&is_int_notZero(depth); p++){ 144 | depth += isValidCommand * (is_int_equal(*p,lp) - is_int_equal(*p,rp)); // Update the depth only if parenthesis is valid. each 'lp' causes +1, rp causes -1 145 | isValidCommand = is_int_notEqual(*p,'\\'); // If current charachter == '\' than the parentheses become invalid 146 | } 147 | return (depth==0) ? p : NULL; // (depth==0) -> Expresion parentheses was matched, otherwise end of string reached and Parentheses are not balanced 148 | } 149 | 150 | /****************************************************************************/ 151 | // Same as findExpressionEnd_uc() but supports the the set-union extended POSIX defenition. 152 | // This method considers the symbol '|' as closing parentheses. So (aab|ccd|ef) will return 'aab' as first expression. 153 | // On the second execution will return 'ccd' and on the third will return 'ef'. 154 | // Extension _uc means uncompiled. needs to read forward in real time to find the end of the expression. 155 | static inline const char* findExpressionEnd_UnionSet_uc(const char *p, const char lp, const char rp){ 156 | int depth = 1; 157 | MYBOOL isValidCommand = TRUE; 158 | for (p++; (*p!='\0')&&(depth!=0); p++){ 159 | depth += isValidCommand * (is_int_equal(*p,lp) - is_int_equal(*p,rp)); 160 | depth -= is_int_equal(depth,1)&is_int_equal(*p,'|'); // '|' affects only on top level. 
For example: (XXX(a|B|)DDD) the '|' are not possible alternatives 161 | isValidCommand = is_int_notEqual(*p,'\\'); 162 | } 163 | return (depth==0) ? p : NULL; 164 | } 165 | 166 | /****************************************************************************/ 167 | // Find next unit of pattern 168 | static const char* goToNextPat_uc(const char* cur){ 169 | const Cmd* cmd = get_cmd(*cur); // Get the command 170 | if isOpen(cmd) // If this is open command: [,(,{ than search for closeing character Otherwise just advance forward 171 | return findExpressionEnd_uc(cur,(cmd+1)->id); // Find the closing parentheses of cur. 172 | return cur + cmdLength(cmd); // Just skip the command 173 | } 174 | 175 | /****************************************************************************/ 176 | /****************************** Compilation *********************************/ 177 | /****************************************************************************/ 178 | // Assumes the pattern is legal. Compiles it into 'C'. Returns the end of the pattern on success or NULL on failure. 179 | const char* tCompiledRegex::compile(const char *pat){ 180 | init_regex_mechanism_private(); 181 | start = pat; // Pointer to the pattern 182 | // For each 'OPEN' rules calculate the length of the expression. Todo: make it O(n) instead of O(n^2) for worst case of "((((((((((A))))))))))" 183 | for (int i = 0; *pat; pat++, i++) 184 | exprLen[i] = (UCHAR)(goToNextPat_uc(pat) - pat); // 'i' is alwyas equals to (pat - start). Initialize the length of current 185 | end = pat; 186 | 187 | // For each '(' ')' rules calculate union set if relevant. Like (A(z*)A|BB|CC) 188 | memset(unionLen,0,sizeof(*unionLen)*(end-start)); 189 | pat = start; 190 | for (int i = 0; *pat; pat++, i++){ 191 | if ((*pat!='|')||(pat[-1]=='\\')) // We don't care about non unions. 192 | unionLen[i] = exprLen[i]; 193 | else if (unionLen[i]==0){ // If we already calculated the length for current union, skip it. 
194 | // We are by definition at the first union. Example: For (AA|BB|CC), We are at |BB|CC). 195 | int open; 196 | for (open = i-1; start + open + exprLen[open] <= pat; open--); // Go backwards until we find the '(' of the current union. 197 | // OK, now start+open points exactly to the '(' that opened a union. Moreover 'pat' points to the first '|' 198 | const char *next = pat, *cur = start+open; // Iterate over all the '|' and for each store the length until the next '|' 199 | while (*next =='|'){ 200 | unionLen[cur-start] = (UCHAR)(next - cur + 1); // Mark the current '|' 201 | cur = next; // Advance to the next '|' or the terminating ')' 202 | next = findExpressionEnd_UnionSet_uc(next, '(',')') -1; 203 | } 204 | unionLen[cur-start] = (UCHAR)(next - cur + 1); // MArk for the last '|' the length until the terminating ')' 205 | } 206 | } 207 | return end; 208 | } 209 | 210 | static const tCompiledRegex* compiledRegexPtr = NULL; // Pointer to the current regex-used. This line ruins multi-threading capabilities. Feel free to change the system architecture to support mutli-threading 211 | static const char* EOS = NULL; // Pointer to the end of current processed sample. Same reason as above 212 | 213 | /****************************************************************************/ 214 | /**************************** Command Handlers ******************************/ 215 | /****************************************************************************/ 216 | // Main method. Matches pattern to sample string. Returns the number of used characters 217 | // Or NO_MATCH if impossible to match. 218 | // 'endp' is pointer to the end of the pattern. 219 | // Declaration of the main mehtod is needed since it is recursivly called. 
220 | static int match(const char* pat, const char* sam, const char* endp); 221 | 222 | /****************************************************************************/ 223 | // Any char comparison is always true 224 | static inline int c_any( const char* pat, const char* sam){ 225 | return 1; 226 | } 227 | 228 | /****************************************************************************/ 229 | // Single char comparison. Match uses one charactr. Wrong returns NO_MATCH. 230 | static inline int c_achar(const char* pat, const char* sam){ 231 | return (*pat == *sam) ? 1 : NO_MATCH; 232 | } 233 | 234 | /****************************************************************************/ 235 | static inline int c_group( const char* pat, const char* sam){ 236 | const char *close = compiledRegexPtr->getExpressionEnd_UnionSet(pat); 237 | if (!close) return NO_MATCH; // Could not match the paretheses. Wrong expresion. Exit 238 | int nCharsMatched; 239 | while (close[-1]=='|'){ 240 | nCharsMatched = match(pat+1, sam, close-1); // +1 and -1 remove the parentheses 241 | if (nCharsMatched >= 0) 242 | return nCharsMatched; 243 | pat = close-1; // Advance to the next alternative 244 | close = compiledRegexPtr->getExpressionEnd_UnionSet(pat); 245 | if (!close) return NO_MATCH; // Could not match the paretheses. Wrong expresion. Exit 246 | } 247 | return match(pat+1, sam, close-1); // Execute the final alternative. 
248 | } 249 | 250 | /****************************************************************************/ 251 | // All possible abbreviations 252 | static inline int c_extended( const char* pat, const char* sam){ 253 | #define ABB_LENGTH (32) 254 | char abbr[ABB_LENGTH] = ""; 255 | switch (*++pat){ 256 | case 'd': strcpy_s(abbr, ABB_LENGTH, "[0-9]"); break; // Digit 257 | case 'D': strcpy_s(abbr, ABB_LENGTH, "[^0-9]"); break; // Non-digit 258 | case 'x': strcpy_s(abbr, ABB_LENGTH, "[0-9A-Fa-f]"); break; // Hex digit 259 | case 'X': strcpy_s(abbr, ABB_LENGTH, "[^0-9A-Fa-f]"); break; // Non Hex 260 | case 'w': strcpy_s(abbr, ABB_LENGTH, "[0-9A-Za-z_]"); break; // Word character 261 | case 'W': strcpy_s(abbr, ABB_LENGTH, "[^0-9A-Za-z_]"); break; 262 | case 'h': strcpy_s(abbr, ABB_LENGTH, "[0-9A-Za-z]"); break; // head of word character 263 | case 'H': strcpy_s(abbr, ABB_LENGTH, "[^0-9A-Za-z]"); break; 264 | case 'a': strcpy_s(abbr, ABB_LENGTH, "[A-Za-z]"); break; // Alphabetic character 265 | case 'A': strcpy_s(abbr, ABB_LENGTH, "[^A-Za-z]"); break; 266 | case 'l': strcpy_s(abbr, ABB_LENGTH, "[a-z]"); break; // Lowercase character 267 | case 'L': strcpy_s(abbr, ABB_LENGTH, "[^a-z]"); break; 268 | case 'u': strcpy_s(abbr, ABB_LENGTH, "[A-Z]"); break; // Uppercase character 269 | case 'U': strcpy_s(abbr, ABB_LENGTH, "[^A-Z]"); break; 270 | case 's': strcpy_s(abbr, ABB_LENGTH, "[ \t\r\n\v\f]"); break; // Whitespace characters 271 | case 'S': strcpy_s(abbr, ABB_LENGTH, "[^ \t\r\n\v\f]"); break; 272 | } 273 | 274 | if (*abbr) return match(abbr, sam, endOfString(abbr)); 275 | else return c_achar(pat,sam); // Unknown abbreviation. Just assume that it is a character comparison 276 | } 277 | 278 | /****************************************************************************/ 279 | // Chose one of the options in []. 
Like [\\-0-9$_#] 280 | static inline int c_option( const char* pat, const char* sam){ 281 | const char *from = NULL; // If we have [a-z] 'from' is 'a', 'to' is 'z' 282 | const char *to = NULL; // If we have [qQ] 'from' is 'q' and 'Q', 'to' is not needed 283 | const char *close = compiledRegexPtr->getExpressionEnd(pat);// Extract the expression inside the [] parentheses 284 | pat++; close--; // +1 and -1 remove the parentheses 285 | int negationOp = ((*pat == '^') ? NO_MATCH : 1); // Check for negation flag. Invert character [^a-z], representing negation operator 286 | if (negationOp<0) 287 | pat++; 288 | 289 | while (pat < close){ 290 | if (*pat == '-' && from){ // Check for range selection. Like 0-9, where we already have the from 291 | to = pat + 1; // Find the 'to' 292 | if (*to == '\\') to++; // Comparison with reserved character. like \- or \* 293 | // Test for range 294 | if is_int_inRange(*sam,*from,*to) 295 | return negationOp; // We have found a match. If 'not' is active than this is a violation of the pattern 296 | pat = to + 1; // So *sam didn't match the current range, try the next range. Like a-z and A-Z 297 | continue; 298 | } 299 | 300 | from = pat; // Beggining of the pattern. Initialize 'from' 301 | if (*from == '\\'){ 302 | from++; pat++; // Comparison with reserved character. like \\* or \\? 303 | } 304 | 305 | if (*sam == *from) 306 | return negationOp; // Comparison of single letter. Like [a-ZAB] 307 | pat++; 308 | } 309 | return -negationOp; // We tested all the options and nothing was mathing. 310 | } 311 | 312 | /****************************************************************************/ 313 | // Multiple occurence of a character 314 | static inline int c_multi( const char* pat, const char* sam, const char* endp){ 315 | const Cmd* cmd = get_cmd(*pat); 316 | int nCharsMatched; // How many characters the multi repitition consumed. 317 | int nRestCharsMatched = NO_MATCH; // How many characters the rest of the pattern consumes (if it exists). 
318 | int nRepitions; // Counter, how many repititions were made up to now in a loop 319 | const char* start_sam = sam; 320 | const char *ends = EOS; // Get the end of the sample (stored in cache, instead of recalculation) 321 | const char *foundMatchAt = NULL; // We already found a match but want to try and find a longer matching string 322 | const char *multi = compiledRegexPtr->getExpressionEnd(pat); // Multi occurence pattern: {}, *,?,+ 323 | const char *next_pat = compiledRegexPtr->getExpressionEnd(multi); // The rest of the pattern. 324 | 325 | // Calculate Min/Max numbers of needed occurences 326 | int min = 0, max = 1; // Initialization not really needed. Just in case. 327 | switch (*multi){ 328 | case '{': // For range of repetition: {4} or {4-8} 329 | { 330 | const char* comma = findFirstCinS(multi, ','); 331 | const char* rEnd = findFirstCinS(multi, '}'); 332 | // Read the minimum value 333 | min = atoui_simple(multi+1); 334 | // If comma exists inside {} than read also the maximum value 335 | if (comma){ 336 | if (comma < rEnd-1) max = MAX(atoui_simple(comma + 1),1); // Read Max, Max must be at least 1; 337 | else if (comma == rEnd-1) max = (1<<30); // Max does not exists: '{min,}', assume 1 billion is enough. Can use MAX_INT instead. Daniel did not want a dependency on 338 | } 339 | else max = MAX(min,1); // No range: Like {4}, Max must be at least 1; 340 | } 341 | break; 342 | case '+': min = 1; max = (1<<30); break; 343 | case '?': min = 0; max = 1; break; 344 | case '*': min = 0; max = (1<<30); break; 345 | } 346 | 347 | // If (min==0), we first try to match the rest of pattern 348 | if ((min==0)&&(*next_pat)){ 349 | nRestCharsMatched = match(next_pat, sam, endp); 350 | if (nRestCharsMatched>=0) 351 | foundMatchAt = start_sam + nRestCharsMatched; // Yes! The rest of the sample string matches the rest of the pattern. But maybe we can do more repititions and still be fine. 
like '.*b' matched to 'ab' but can also match 'abccqqb' 352 | // Note: if nRestCharsMatched==0 than the rest of the pattern can be matched to an empty string. Success is guaranteed. Now we want to match as much repititions as we can. Like 'a*b?' was matched to first character of 'aaaz' but can be matched to 'aaa'. 353 | } 354 | 355 | // OK. We need to take at least one repitiotion. Enter the loop 356 | nRepitions = 0; 357 | while (sam < ends){ 358 | nCharsMatched = ((Stand_Func)cmd->f)(pat, sam); // Find the pattern for the i'th time. 359 | if (nCharsMatched < 0){ 360 | // No more repetitions are possible 361 | if (nRepitions < min) return NO_MATCH; // We need at least 'min' but failed 362 | 363 | if (*next_pat){ 364 | // If (nRestCharsMatched < 0) 365 | // We have enough iterations but we already know that the rest of pattern can't be matched. If we found a good solution earlier return it. Otherwise no solution for matching 366 | // Else We have found a good solution right now and no more iterations possible. Return the good solution. 367 | return (foundMatchAt) ? (int)(foundMatchAt-start_sam) : NO_MATCH; 368 | } 369 | return (int)(sam-start_sam); // Macth found. Use 'nRepitions' 370 | } 371 | sam += nCharsMatched; // Found 'i' repitiotions. Advance pointers 372 | nRepitions++; 373 | 374 | if (nRepitions < min) continue; // If we still havent reached the minimal amount of repititions than continue to gather more repetitions. 375 | 376 | // OK, we have at least 'min' iterations, Time to check the if the rest of the 377 | // pattern can be matched. If not we will look for more occurences. 378 | // Otherwise we will use the current amount of occurences. 
379 | if (*next_pat){ 380 | nRestCharsMatched = match(next_pat, sam, endp); 381 | if (nRestCharsMatched>=0) 382 | foundMatchAt = sam + nRestCharsMatched; // See explanation of the code line 'foundMatchAt = start_sam + nRestCharsMatched;' above 383 | } 384 | 385 | if (nRepitions == max){ // Check the maximal limit of repititions. 386 | if (*next_pat) 387 | return (foundMatchAt) ? (int)(foundMatchAt-start_sam) : NO_MATCH; // See explanation for this exact code line above 388 | return (int)(sam-start_sam); // Macth found. Use maximal possible amounts of repitions. 389 | } 390 | } 391 | 392 | // None of the iterations yielded a consistent match. We exited the loop due to end of sample string. 393 | if (nRepitions < min) return NO_MATCH; // Sample string terminated and we didn't get our minimal amount. 394 | 395 | if ((*next_pat)&&(nRestCharsMatched < 0)) 396 | return (foundMatchAt) ? (int)(foundMatchAt-start_sam) : NO_MATCH; // Sample string terminated and the rest of the pattern cannot be matched to an empty string. If we found a good solution return it. Otherwise no solution for matching 397 | return (int)(sam-start_sam); // No following patterns that require aditional characters and we got enough iterations. like a*z? on a string of 'aaaa' 398 | } 399 | 400 | /****************************************************************************/ 401 | // Match pattern to the 'sam' string from its beginning. 402 | // Returns the amount of consumed characters if match was successfull. Otherwise returns NO_MATCH. 403 | // Note: 0 means successfull match. For example 'a?' 
is matched to 'bc' with zero occurences of 'a' 404 | static int match(const char* pat, const char* sam, const char* endp){ 405 | const Cmd* cmd; 406 | int nCharsMatched; 407 | const char* start_sam = sam, *next_pat; 408 | 409 | if (!pat) 410 | return NO_MATCH; // NULL pattern is illegal 411 | while (pat < endp){ 412 | next_pat = compiledRegexPtr->getExpressionEnd(pat); // Find next pattern to see if it is a suffix like *, {x,y},?,+ 413 | if (next_pat==NULL) 414 | return NO_MATCH; // Wrong regular expression. For example '(A)))' 415 | cmd = get_cmd(*next_pat); // Check the next command if it is a suffix 416 | if (isSuffix(cmd)){ // 'cmd' is indeed a suffix. like 'z{3,7}'. Activate {3,7} on pattern 'z'. 417 | int matchedLen = ((SuffixFunc)cmd->f)(pat, sam, endp); // Execute the suffix 418 | return (matchedLen>=0) ? (int)(sam-start_sam) + matchedLen : NO_MATCH; 419 | } 420 | else{ // No suffix 421 | cmd = get_cmd(*pat); 422 | // if (cmd->attr&TYPE_RECURSION){ To do: Handle the the case of the bug (.*)AB is not matched to 'ZAB' because .* consumes 3 letters 423 | nCharsMatched = ((Stand_Func)cmd->f)(pat, sam); 424 | if (nCharsMatched < 0) 425 | return NO_MATCH; // If matching failed return NO_MATCH. 
426 | 427 | sam += nCharsMatched; // Advance to next pattern 428 | pat = next_pat; 429 | } 430 | } 431 | return (int)(sam-start_sam); 432 | } 433 | 434 | /****************************************************************************/ 435 | /******************************** API methods *******************************/ 436 | /****************************************************************************/ 437 | const char* regex_search(const char* pattern, const char* sampleString, int* resLen){ 438 | tCompiledRegex builtInCompiledRegex; 439 | builtInCompiledRegex.compile(pattern); 440 | return builtInCompiledRegex.search(sampleString, resLen); 441 | } 442 | 443 | /****************************************************************************/ 444 | const char* tCompiledRegex::search(const char* sampleString, int* resLen) const{ 445 | compiledRegexPtr = this; // Store 'this' as current regex 446 | const char* pattern = start; 447 | const char* endPattern = end; 448 | const char* endOfSearch= EOS = endOfString(sampleString); // When comparint the pattern to sample string we will search the entire sample. 449 | if (pattern == endPattern){ // Empty pattern is matched with zero length matching 450 | *resLen = 0; 451 | return sampleString; 452 | } 453 | 454 | if (pattern[0]=='^'){ // Check if first characters forces a matching to beggining of the string 455 | pattern++; // Skip the '^' 456 | endOfSearch = sampleString+1; // Allow match only for first position. 457 | } 458 | 459 | // Try to match from every possible place in the sample string 460 | for (;sampleString != endOfSearch; sampleString++){ 461 | *resLen = match(pattern, sampleString, endPattern); 462 | if (*resLen > 0) // Note the >0 comparison and not >=. We do not allow empty string match. 463 | return sampleString; // Full match was found. Return the current location in sample string. 464 | } 465 | 466 | sampleString = EOS; // No match was found through the entire search. 
467 | *resLen = match(pattern, sampleString, endPattern); // Try to match pattern to empty string (since we didn't allow it before). 468 | return (*resLen>= 0) ? sampleString : NULL; // Empty sample string was matched to pattern (like a*b*c*) (if==0). Otherwise no match 469 | }; 470 | 471 | /****************************************************************************/ 472 | /****************************** UNITEST methods *****************************/ 473 | /****************************************************************************/ 474 | #ifdef REGEX_UNITEST 475 | void regex_debug(const char* pattern, const char* sampleString, const char* trueAnswer){ 476 | int len = 0; 477 | char output[256]; 478 | const char* res; 479 | const char error[] = "@NO"; 480 | const char empty[] = ""; 481 | 482 | res = regex_search(pattern, sampleString, &len); 483 | if (len>0){ memcpy(output,res, len); output[len] = '\0'; } 484 | else if (len==0) memcpy(output,empty,strlen(empty)+1); 485 | else memcpy(output,error,strlen(error)+1); 486 | // Print only errors 487 | if (strcmp(output,trueAnswer)) fprintf(stderr,"Error: Reg:%s\t\t in %s\t\t : %s,\n", pattern, sampleString, output); 488 | else fprintf(stderr,"Test OK!\n"); 489 | } 490 | 491 | /****************************************************************************/ 492 | // Sample of regular expression and true answers. 
Checks that mechanism works fine 493 | static void regex_debug_private_tests(void){ 494 | regex_debug("(\\(*.\\[)[qQ]", "((a[Q]","((a[Q"); 495 | regex_debug("((b)c)", "abc","bc"); 496 | regex_debug("(a)((b)c)?", "abc","abc"); 497 | regex_debug("(a)(((b))c)(d)?", "abc","abc"); 498 | regex_debug("ba*(ac)?", "baaa","baaa"); 499 | regex_debug("a{1,2}c?q?", "aaa","aa"); 500 | regex_debug("a{3,}c?q?", "aaacq","aaacq"); 501 | regex_debug("a{2}v", "aaav","aav"); 502 | regex_debug("a*b*c*", "",""); // Testy empty matches 503 | regex_debug("a*","zb",""); 504 | regex_debug("a*b*c*q", "","@NO"); 505 | regex_debug("ba*", "baaa","baaa"); 506 | regex_debug("ba*(ac)+", "baaac","baaac"); 507 | regex_debug("ba+", "baaa","baaa"); 508 | regex_debug("ba{0,3}", "baaa","baaa"); 509 | regex_debug("ba{1,2}c?q?", "baaa","baa"); 510 | regex_debug("[0-3]*(22a)", "12222a","12222a"); 511 | regex_debug("b[0-9]+", "abc123xyz","@NO"); 512 | regex_debug("ac?.[0-2]*", "abc123xyz","ab"); 513 | regex_debug("ac?b[q-z]*c{0,4}.[0-2]*(22a)", "abc12222a","abc12222a"); 514 | regex_debug("(6(0|4|5)_)+", "61_60_64_65_A","60_64_65_"); 515 | regex_debug("(6(0|4|5)_)*", "61_60_64_65_A","60_64_65_"); 516 | regex_debug("(6(011|44|5)_)*", "6012_6011_644_65_6541_","6011_644_65_"); 517 | regex_debug("^a[ \t\r\n\v\f]*", "za \n ","@NO"); // Test spaces 518 | regex_debug("a[ \t\r\n\v\f]*b", "ga \tbv","a \tb"); 519 | regex_debug("a[ \t\r\n\v\f]*", "za \t\t \f\n ","a \t\t \f\n "); 520 | regex_debug("f..k","zfolky","folk"); 521 | regex_debug("va*b","kkvaaaab","vaaaab"); 522 | regex_debug(".*b","ababc","abab"); 523 | regex_debug("z.*b","xyzababc","zabab"); 524 | regex_debug(".*b","bc","b"); 525 | regex_debug("[abc]1{2}(cat|pup+y|dog).{2}","_a11puppy11_","a11puppy11"); 526 | regex_debug("[abc]*\\|","zab|","ab|"); 527 | regex_debug("[abc]*\\|[AC-Z]*","zab|ACDB","ab|ACD"); 528 | regex_debug("[abc]*\\|[ABCK]*","zab|ACKvv","ab|ACK"); 529 | regex_debug("[ABCK\\+UZ0-9]*","AAZ+BUB","AAZ+BUB"); 530 | 
regex_debug("[ABCKUZ0-9\\-]*","AAZ-BUB","AAZ-BUB"); 531 | regex_debug("[ABCK\\-UZ0-9]*","AAZ-BUB","AAZ-BUB"); 532 | regex_debug("[abc]*\\|[A\\?B-Z]*","zab|A?BCD","ab|A?BCD"); 533 | regex_debug("[^xyz]","zab","a"); 534 | regex_debug("[^xyc]+","zab","zab"); 535 | regex_debug("([0-9]+a{2,4})+q","1aa23aaa445aaaaq","1aa23aaa445aaaaq"); 536 | regex_debug("([abc]?[01]?)*","a1b00aab","a1b00aab"); 537 | regex_debug("[a-c]+(x{2,4})*","cxxxxxxxx","cxxxxxxxx"); 538 | regex_debug("([a-c]+(x{1,2})?)+x","cxccxxccxxx","cxccxxccxxx"); 539 | regex_debug("a[0-9]+b", "za789b","a789b"); 540 | regex_debug("[^A-Za-z]*","abc12..","12.."); 541 | regex_debug("^a\\+{3}\\*\\?b","a+++*?bc","a+++*?b"); // Check special characters 542 | regex_debug("\\[\\[","a[?][[1","[["); 543 | regex_debug("(V\\.?A\\.?T\\.?|TAX|TVA)( *[^A-Z]? +)?[A-_\\. ]*([0-9]+([\\.,][0-9]{1,2})?%?[0-9]+([\\.,][0-9]{1,2})?%_? +)?([A-_\\. ]*)?([0-9]+[\\.,][0-9]{1,2} +)?[A-_]? *[\\-\\+]?[0-9]+(,[0-9]{3})*[\\.,][0-9]{1,2} *.? *\n"," VAT 3.93\n","VAT 3.93\n"); 544 | regex_debug("\\[\\]","a[?][]1","[]1"); 545 | regex_debug("(\\[\\?\\])*\\|(\\*)+","a[?][?]|**1","[?][?]|**"); 546 | regex_debug("(.+)(.+)","AB","AB"); // Known bug in Daniels implementation! 547 | regex_debug("((.*A)BABA)Z","ZABABABABAZQ","ZABABABABAZ"); // Known bug in Daniels implementation! 548 | } 549 | 550 | #endif 551 | /****************************************************************************/ 552 | // EOF. -------------------------------------------------------------------------------- /reg_exp.h: -------------------------------------------------------------------------------- 1 | #ifndef REGULAR_EXPRESSION_H_DANIEL 2 | #define REGULAR_EXPRESSION_H_DANIEL 3 | /* 4 | * Coded By : Daniel Herman Shmulyan 5 | * Description : Proccesing regular expressions. Given an expression and a string returns the longest substring matching the expression. 
6 | * See examples in the implementation of regex_debug() function 7 | * Based on rules described at: http://en.wikipedia.org/wiki/Regular_expression. 8 | * This is a recursive regex (It does not compile the regular expression to an automaton as described in http://swtch.com/~rsc/regexp/regexp1.html) 9 | * But it uses a small constant amount of memory for compilation and is extremely fast 10 | * Designed to work on simple regular expressions (ASCII only and no longer than 256 bytes) 11 | */ 12 | 13 | /****************************************************************************/ 14 | // Regular expression definitions and rules: 15 | // Single character is matched like 'abc' to 'zabcd' from place 1, length 3. 16 | // . - is matched to any character. 'a..d' matches 'abcd', 'aaad','aqwd' 17 | // * - 0 or more occurrences: 'a*' is matched to '', 'a', 'aa', 'aaaaaa' 18 | // + - 1 or more occurrences: 'a+' is matched to 'a', 'aa', 'aaaaaa' but not '' 19 | // ? - 0 or 1 occurrences: 'ab?c' is matched to 'ac' and 'abc' 20 | // {N} - exactly N occurrences: 'a{4}' is matched to 'aaaa' 21 | // {M,N} - Between M and N occurrences: 'a{1,3}' is matched to 'a','aa','aaa' but not '' nor 'aaaaa' 22 | // {M,} - At least M but with no maximum limit. 'a{2,}' is matched to 'aa','aaaaa','aaaaaaaaa' 23 | // Use \\ to include reserved characters: '\\[p\\]*' is matched to '[p]' 24 | // [] - Matches a single character that is contained within the brackets. [abc] is matched to 'a', 'b', 'c' but not 'd' 25 | // [^] - Invert selection: [^abc] is matched to any possible character except 'a', 'b', 'c' 26 | // [a-z] - Selection range: [A-Z] is matched to any capital English letter 27 | // [0-9a-zA-Z\\-\\+\\.] is matched to any alpha-numeric character (part of number or English word) 28 | // () - Used to group expression. 
Like: '(ab)*' is matched to 'abababab' 29 | // | - Choose one of alternative expressions: (cat|dog|au+a) is matched to 'cat' or 'dog' or 'aua' or 'auua' or 'auuua' etc 30 | // You can use any combinations of the above, Example: (_[hc]?at|me) matches "_hat", "_cat", "_at" and "me". 31 | // 32 | // Known Issues: 33 | // Since we use recursion on (), the greedy '.*' mechanism may cause '*' to grab too many characters. Example: "(.*A)BA" in "ABA" will return NULL since '.*' is wrongly matched to 'AB'. 34 | // No multi threading support (a single regex matching cannot run on several CPUs and neither can several regex matches run in parallel). 35 | // Special first character (appears as first character of the regular expression pattern). 36 | // ^ - Match to beginning of the string. Example: "ab" matches "zab" at place 1 but "^ab" does not match "zab" at all. 37 | // $ - Match to the end. Not supported yet. Daniel, Todo... Example: 'ab' matches 'abz' with length 2, but '$ab' will not match 'abz' at all. 38 | // ] - Choose last match. Not supported yet. Daniel, Todo... Example: 'ab' will match 'ab1ab3' at first 2 characters, but ']ab' will match to the second (last) appearance of 'ab' 39 | 40 | // Abbreviations: 41 | // \d ie. [0-9] digit 42 | // \D ie. [^0-9] non-digit 43 | // \x ie. [0-9A-Fa-f] hex digit 44 | // \X ie. [^0-9A-Fa-f] 45 | // \w ie. [0-9A-Za-z_] word character 46 | // \W ie. [^0-9A-Za-z_] 47 | // \h ie. [0-9A-Za-z] head of word character 48 | // \H ie. [^0-9A-Za-z] 49 | // \a ie. [A-Za-z] alphabetic character 50 | // \A ie. [^A-Za-z] 51 | // \l ie. [a-z] lowercase character 52 | // \L ie. [^a-z] 53 | // \u ie. [A-Z] uppercase character 54 | // \U ie. [^A-Z] 55 | // \s ie. [ \t\r\n\v\f] Whitespace characters 56 | // \S ie. 
[^ \t\r\n\v\f] Non-whitespace characters 57 | 58 | /****************************************************************************/ 59 | /******************************** Functions *********************************/ 60 | /****************************************************************************/ 61 | // Structures for compilation of regex pattern (faster execution of regex, at cost of initial calculation and memory usage). The faster execution is achieved by precalculating for each expression its length 62 | struct tCompiledRegex{ 63 | private: 64 | const char* start; // Pointer to beggining of the pattern 65 | const char* end; // Pointer to the end of the pattern 66 | unsigned char exprLen[ 256]; // For each sub expression - stores its length (faster jumping between expressions). Supports maximal expression length of 256. 67 | unsigned char unionLen[256]; // For each union set selection (aab|ccd|ef) - stores its length (faster jumping between union sets) 68 | 69 | public: 70 | const char* getExpressionEnd( const char *ex) const { return ex + exprLen[ (int)(ex-start)]; } // Given a pointer to beggining of sub expresion, returns pointer to its end 71 | const char* getExpressionEnd_UnionSet(const char *ex) const { return ex + unionLen[(int)(ex-start)]; } // Same for union set 72 | 73 | const char* compile(const char *pattern); // Assumes the pattern is legal expresion no longer than 256 bytes and compiles the pattern. Returns the end of the pattern on success or NULL on failure. 74 | const char* search( const char *sampleString, int* resLen) const; // Returns a pointer to the beginning of the matched part and sets resLen to store the length of the mathced part. 75 | }; 76 | 77 | const char* regex_search(const char* pattern, const char* sampleString, int* resLen); // Compile and search in one function. A C style API. 78 | 79 | #endif // H 80 | /****************************************************************************/ 81 | // EOF. 
82 | --------------------------------------------------------------------------------