├── .gitattributes
├── .gitignore
├── LICENSE.md
├── README.md
├── reg_exp.cpp
└── reg_exp.h


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 | 
24 | # OSX
25 | # =========================
26 | 
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear on external disk
35 | .Spotlight-V100
36 | .Trashes
37 | 
38 | # Directories potentially created on remote AFP share
39 | .AppleDB
40 | .AppleDesktop
41 | Network Trash Folder
42 | Temporary Items
43 | .apdisk
44 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org>
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # C++ Regex Matcher
2 | 
3 | This is an educational library for matching regular expressions.
4 | It runs extremely fast, a few orders of magnitude faster than the built-in C++11 regex library, and outperforms almost every other widely known library.
5 | However, it is very minimalistic: it operates on ASCII symbols only and supports neither multi-threading nor multi-core execution.
6 | The library was tested under iOS, Android, Windows and Linux. 
7 | 


--------------------------------------------------------------------------------
/reg_exp.cpp:
--------------------------------------------------------------------------------
  1 | ﻿// Includes
#include "reg_exp.h"
#include <stdint.h>								// int32_t
#include <stdio.h>								// printfs
#include <string.h>								// memory operations like memcmp() memset()
  5 | 
  6 | /****************************************************************************/
  7 | // Basic definitions of types. Feel free to remove if you have them already defined.
  8 | #ifndef FALSE
  9 | 	#define	FALSE		(0)
 10 | #endif
 11 | #ifndef TRUE
 12 | 	#define	TRUE		(1)
 13 | #endif
 14 | #ifndef MAX
 15 | 	#define MAX(a,b)	((a) > (b) ? (a) : (b))
 16 | #endif
 17 | typedef unsigned char		MYBOOL, UCHAR;		// Feel free to change boolean to 'int' or C++ 'bool'. 
 18 | #if defined(__ANDROID__) || defined(__APPLE__) || defined(__MACH__)    
 19 | 	typedef __int32_t			INT32;			// Android and iOS specific INT32. Feel free to switch to 64bits integer
 20 | 	#define strcpy_s(dst,size,src)			strcpy(dst,src)			// Forward compatibility to safe I/O methods. Activate only if needed (not needed for windows devices)
 21 | #else
 22 | 	typedef signed   __int32    INT32;			// Windows
 23 | #endif
 24 | 
 25 | /****************************************************************************/
 26 | // Fast Comparison of 8,16,32 bits int - without branching. Use in loops where speed is crucial. 
 27 | #define		   is_int_negative(       num1     ) ((((INT32) (num1)        )&0x80000000)>>31)	// (num1<0   ) ? 1 : 0
 28 | inline INT32 is_int_notZero( INT32  num1     ){	return is_int_negative(num1|-num1); }			// (num1!=0  ) ? 1 : 0, Since (-X)|X is 0 for X==0 and negative for every other X.
 29 | #define		   is_int_notEqual(	   num1,num2)	is_int_notZero(num1^num2)						// (num1!=num2)? 1 : 0
 30 | #define		   is_int_equal(	   num1,num2)	(is_int_notZero(num1^num2)^0x1)					// (num1==num2)? 1 : 0
 31 | #define        is_int_inRange(i,L,U)  ((((INT32((i)-(L)))|(INT32((U)-(i))))^0x80000000)>>31)	// ((i>=L)&&(i<=U) ? 1 : 0
 32 | #define        isDigit(               chr)       is_int_inRange(chr,'0','9')					// ('9'>=chr>='0') ? 1 : 0
 33 | 
 34 | /****************************************************************************/
 35 | static int atoui_simple(const char* s){			// Read positive integer from string. Like "123" is converted to 123.
 36 | 	int resI = 0;															
 37 | 	for (;isDigit(*s); s++)
 38 | 		resI = resI*10 + (s[0]-'0');
 39 | 	return resI;
 40 | }
 41 | 
 42 | /****************************************************************************/
 43 | // Define the types of special regex character commands (as bits). 
 44 | #define		TYPE_CHAR				(0)			// Default 0 -  Single character. Like: A b 7 .
 45 | #define     TYPE_PREFIX				(1)			// Sepcial '\\' command for Abbreviations and using special characters. like \\?. This is the only command that actually uses 2 characters
 46 | #define     TYPE_SUFFIX				(2)			// Command has iteration suffix like a*, z+, b{3,5} 
 47 | #define     TYPE_OPEN				(4)			// Left Parentheses: On of the following { ( [ . This rule opens a sub expression
 48 | #define     TYPE_CLOSE				(8)			// Right parentheses: } ) ]
 49 | #define     TYPE_RECURSION			(16)		// Termination of recursive call. (command is a suffix of previous one. Like previous is 'A' and current is '{2}') Todo: Not used yet
 50 | 
 51 | #define		NO_MATCH				(-1)		// Returned when regular expression cannot be matched to the string.
 52 | /****************************************************************************/
 53 | // Define a command structure (single language rule)
 54 | typedef struct{
 55 |     char	id;			// The character which represennts the command. Like: * ? [
 56 |     char	attr;		// Type of the command. Can be combination of above types
 57 |     void*	f;			// Pointer to function which proccesses the current command. Polymorphism (C style)
 58 | } Cmd;
 59 | 
 60 | /****************************************************************************/
 61 | // Total there are 6 functions
 62 | // First parameter is the pattern. Second is the sample string and optional third is pointer to the end of the pattern
 63 | // Each one returns the number of consumed characters in sample string or NO_MATCH if inapplicable 
 64 | // (like 'a+' was applied on 'bbb'). Note 'a*' can be applied on 'bbb' and it consumes zero characters.
 65 | static inline int c_achar(	 const char* pat, const char* sam);					 // Handling single char comparison. Like 'b'
 66 | static inline int c_any(	 const char* pat, const char* sam);					 // Handlling 'any' comparison. '.'  Note: '.' may consume few bytes (1 char) when working with unicodes, and only 1 byte (= 1 char) for ascii.
 67 | static inline int c_extended(const char* pat, const char* sam);					 // Handaling special extended abreviations starting with '\\'
 68 | static inline int c_group(	 const char* pat, const char* sam);					 // Sub pattern. Grouping characters. Like (the) when searching inside 'I am the master of the realm'
 69 | static inline int c_option(	 const char* pat, const char* sam);					 // Selection of one option to match [aA]ce mathces both words: Ace and ace
 70 | static inline int c_multi(	 const char* pat, const char* sam, const char* endp);// Multiple occurance of the character. Like A+, A* A{4} A?
 71 | 
 72 | // Define pointers to functions that process the special commands. 2 Types
 73 | typedef int (*Stand_Func)(const char* pat, const char* sam                 );	// Standard functions and characters like () . [] \\ A 7 ....
 74 | typedef int (*SuffixFunc)(const char* pat, const char* sam,const char* endp);	// Suffix functions for multiple occurences like * + ? {}
 75 | 
 76 | /****************************************************************************/
 77 | // Define the table of command (regex rules). For each id, it's length, type of command and processing function
 78 | // Rules of commands: TYPE_CLOSE follows TYPE_OPEN immediately in command table
 79 | static const Cmd cmd_tbl[] = {
 80 |     '(', TYPE_OPEN|TYPE_RECURSION,		(void*)c_group,
 81 |     ')', TYPE_CLOSE|TYPE_RECURSION,		(void*)NULL,        
 82 |     '|', TYPE_CLOSE|TYPE_RECURSION,		(void*)NULL,        
 83 |     '[', TYPE_OPEN,						(void*)c_option,     
 84 |     ']', TYPE_CLOSE,					(void*)NULL,        
 85 |     '{', TYPE_SUFFIX|TYPE_OPEN,			(void*)c_multi,    
 86 |     '}', TYPE_CLOSE,					(void*)NULL,       
 87 |     '*', TYPE_SUFFIX,					(void*)c_multi,    
 88 |     '+', TYPE_SUFFIX,					(void*)c_multi,    
 89 |     '?', TYPE_SUFFIX,					(void*)c_multi,    
 90 |    '\\', TYPE_PREFIX,					(void*)c_extended, 
 91 |     '.', TYPE_CHAR,						(void*)c_any,         
 92 |       0, TYPE_CHAR,						(void*)c_achar,    
 93 | };
 94 | 
 95 | #define cmdLength(  cmd)		(1 + ((cmd)->attr&TYPE_PREFIX))		// All commands take 1 character + optional prefix character
 96 | #define    isSuffix(cmd)		(     (cmd)->attr&TYPE_SUFFIX)		// Does current command is a suffix of previous one. Like previous is 'A' and current is '{2}'
 97 | #define    isOpen(  cmd)        (     (cmd)->attr&TYPE_OPEN  )		// Does this command opens a sub expression
 98 | // Inverse table of the above (given a character like '*', 'C', '\' get the appropriate command). We use a look up table for all possible ASCII characters
 99 | static const Cmd* get_cmd_byChar[128];
100 | static int        isInitialized = 0;								// Was the look up table above initialized
101 | #define get_cmd(c)               get_cmd_byChar[(c)&0x7F]			// Get the command strucutre by character. For '(' will return '(' command
102 | #define isReservedSymbol(c)      (get_cmd(c)->id != 0)				// Is the given character a reserved symbol (used as regex command)
103 | 
104 | /****************************************************************************/
105 | /****************************** Aux functions *******************************/
106 | /****************************************************************************/
107 | // Initialize the get_cmd_byChar[] look up table
108 | static inline void init_regex_mechanism_private(void){
109 | 	if (isInitialized) return;
110 |     const Cmd* cmd = cmd_tbl, *end = cmd_tbl;
111 |     while (end->id) end++;									// Find the last default command (character processing) and store it in 'end'
112 | 	for (int i=0; i<128; i++)
113 | 		get_cmd_byChar[i]		= end;						// Set the whole look up table to point to the default command
114 | 	for (; cmd<end; cmd++)
115 | 		get_cmd_byChar[cmd->id] = cmd;						// For all the real commands set an entry in the look up table
116 | 	isInitialized = TRUE;
117 | }
118 | 
119 | /****************************************************************************/
// Return a pointer to the terminating '\0' of 'str'.
static inline const char* endOfString(const char *str){
	return str + strlen(str);
}
125 | 
126 | /****************************************************************************/
// Find the first occurrence of character 'c' in 'str'.
// Returns a pointer into 'str' at that character, or NULL if it does not occur.
// Searching for '\0' always yields NULL.
static inline const char* findFirstCinS(const char *str, const char c){
	for (; *str != '\0'; str++){
		if (*str == c)
			return str;
	}
	return NULL;
}
132 | 
133 | /****************************************************************************/
// Given an expression that starts with an opening bracket at 'p', return a pointer just past the
// matching closing bracket 'rp'.
// Example: given p=[a*(1+3)[z@][aa]]tp5 and rp=']' returns a pointer to 'tp5', skipping
//          the whole [a*(1+3)[z@][aa]] expression.
// Nested brackets of the same kind are handled by tracking the depth. Example: (((X)Y)Z)A  - X is at depth 3, Z at depth 1, A at depth zero.
// A character that follows a backslash is escaped and never counts as a bracket. An escaped
// backslash ('\\' followed by '\\') is itself a literal and does NOT escape the character after it.
// Returns NULL if the end of the string is reached before the brackets are balanced.
// The suffix _uc means uncompiled: the function scans forward at run time to find the end of the expression.
static inline const char* findExpressionEnd_uc(const char *p, const char rp){
	const char	lp      = *p;								// Opening bracket
	int			depth   = 1;								// Amount of 'lp' seen minus amount of 'rp' seen
	int			escaped = 0;								// Non-zero when the previous character was an unescaped backslash
	for (p++; (*p != '\0') && (depth != 0); p++){
		if (escaped){										// Escaped character: plain text, whatever it is
			escaped = 0;
			continue;
		}
		if (*p == '\\'){									// Backslash: the next character is plain text
			escaped = 1;
			continue;
		}
		depth += (*p == lp) - (*p == rp);					// Each 'lp' causes +1, each 'rp' causes -1
	}
	return (depth == 0) ? p : NULL;							// depth==0 -> brackets matched. Otherwise the string ended with unbalanced brackets
}
149 | 
150 | /****************************************************************************/
// Same as findExpressionEnd_uc() but supports the set-union (alternation) extension of POSIX.
// 'p' points to the opening bracket 'lp' or to a '|' of the group. The symbol '|' at the top level of
// the group is treated as a closing bracket, so for (aab|ccd|ef) the first call (on '(') returns a
// pointer just past the first '|', the second call (on the first '|') a pointer just past the second '|'
// and the third call a pointer just past the ')'.
// A '|' inside a nested group does not terminate the alternative: in (XXX(a|B|)DDD) the '|' belong to the inner group.
// A character that follows a backslash is escaped: it is neither a bracket nor a separator. An escaped
// backslash is itself a literal and does not escape the character after it.
// Returns NULL if the end of the string is reached first.
// The suffix _uc means uncompiled: the function scans forward at run time to find the end of the expression.
static inline const char* findExpressionEnd_UnionSet_uc(const char *p, const char lp, const char rp){
	int		depth   = 1;									// Amount of 'lp' seen minus amount of 'rp' seen
	int		escaped = 0;									// Non-zero when the previous character was an unescaped backslash
	for (p++; (*p != '\0') && (depth != 0); p++){
		if (escaped){										// Escaped character: plain text, whatever it is
			escaped = 0;
			continue;
		}
		if (*p == '\\'){									// Backslash: the next character is plain text
			escaped = 1;
			continue;
		}
		depth += (*p == lp) - (*p == rp);
		if ((depth == 1) && (*p == '|'))					// '|' counts only at the top level of the group
			depth--;
	}
	return (depth == 0) ? p : NULL;
}
165 | 
166 | /****************************************************************************/
167 | // Find next unit of pattern
168 | static const char* goToNextPat_uc(const char* cur){   	
169 |     const Cmd* cmd = get_cmd(*cur);						// Get the command	
170 |     if isOpen(cmd)										// If this is open command: [,(,{ than search for closeing character Otherwise just advance forward
171 | 		return findExpressionEnd_uc(cur,(cmd+1)->id);	// Find the closing parentheses of cur.	
172 | 	return cur + cmdLength(cmd);						// Just skip the command
173 | }
174 | 
175 | /****************************************************************************/
176 | /****************************** Compilation *********************************/
177 | /****************************************************************************/
178 | // Assumes the pattern is legal. Compiles it into 'C'. Returns the end of the pattern on success or NULL on failure.
179 | const char* tCompiledRegex::compile(const char *pat){
180 | 	init_regex_mechanism_private();
181 | 	start = pat;																			// Pointer to the pattern
182 | 	// For each 'OPEN' rules calculate the length of the expression. Todo: make it O(n) instead of O(n^2) for worst case of "((((((((((A))))))))))"
183 | 	for (int i = 0; *pat; pat++, i++)
184 | 		exprLen[i]   = (UCHAR)(goToNextPat_uc(pat) - pat);									// 'i' is alwyas equals to (pat - start). Initialize the length of current 
185 | 	end = pat;
186 | 	
187 | 	// For each '(' ')' rules calculate union set if relevant. Like (A(z*)A|BB|CC)
188 | 	memset(unionLen,0,sizeof(*unionLen)*(end-start));
189 | 	pat = start;
190 | 	for (int i = 0; *pat; pat++, i++){
191 | 		if ((*pat!='|')||(pat[-1]=='\\'))													// We don't care about non unions.
192 | 			unionLen[i] = exprLen[i];
193 | 		else if (unionLen[i]==0){															// If we already calculated the length for current union, skip it.
194 | 			// We are by definition at the first union. Example: For (AA|BB|CC), We are at |BB|CC). 
195 | 			int open;
196 | 			for (open = i-1; start + open + exprLen[open] <= pat; open--);					// Go backwards until we find the '(' of the current union. 
197 | 			// OK, now start+open points exactly to the '(' that opened a union. Moreover 'pat' points to the first '|'
198 | 			const char *next = pat, *cur = start+open;										// Iterate over all the '|' and for each store the length until the next '|'
199 | 			while (*next =='|'){
200 | 				unionLen[cur-start] = (UCHAR)(next - cur + 1);								// Mark the current '|'
201 | 				cur = next;																	// Advance to the next '|' or the terminating ')'
202 | 				next = findExpressionEnd_UnionSet_uc(next, '(',')') -1;
203 | 			}
204 | 			unionLen[cur-start] = (UCHAR)(next - cur + 1);									// MArk for the last '|' the length until the terminating ')'
205 | 		} 
206 | 	}
207 | 	return end;
208 | }
209 | 
// State shared by all command handlers during one search. tCompiledRegex::search() sets both values
// before matching starts. Keeping them in file-scope statics is what ruins multi-threading
// capabilities. Feel free to change the system architecture to support multi-threading.
static const tCompiledRegex* compiledRegexPtr = NULL;	// Pointer to the regex currently in use.
static const char*	 		 EOS = NULL;				// Pointer to the end of the sample string currently processed.
212 | 
213 | /****************************************************************************/
214 | /**************************** Command Handlers ******************************/
215 | /****************************************************************************/
// Main method. Matches the pattern to the sample string. Returns the number of characters used,
// or NO_MATCH if matching is impossible.
// 'endp' is a pointer to the end of the pattern.
// The forward declaration is needed because the method is called recursively by the command handlers.
static int match(const char* pat, const char* sam, const char* endp);
221 | 
222 | /****************************************************************************/
223 | // Any char comparison is always true
224 | static inline int c_any(	const char* pat, const char* sam){
225 | 	return 1;
226 | }
227 | 
228 | /****************************************************************************/
229 | // Single char comparison. Match uses one charactr. Wrong returns NO_MATCH.
230 | static inline int c_achar(const char* pat, const char* sam){
231 | 	return (*pat == *sam) ? 1 : NO_MATCH;
232 | }
233 | 
234 | /****************************************************************************/
235 | static inline int c_group(	const char* pat, const char* sam){
236 | 	const char *close = compiledRegexPtr->getExpressionEnd_UnionSet(pat);
237 |     if (!close)  return NO_MATCH;						// Could not match the paretheses. Wrong expresion. Exit
238 | 	int nCharsMatched;
239 | 	while (close[-1]=='|'){
240 | 		nCharsMatched = match(pat+1, sam, close-1);		// +1 and -1 remove the parentheses
241 | 		if (nCharsMatched >= 0)
242 | 			return nCharsMatched;
243 | 		pat   = close-1;								// Advance to the next alternative
244 | 		close = compiledRegexPtr->getExpressionEnd_UnionSet(pat);
245 | 		if (!close)  return NO_MATCH;					// Could not match the paretheses. Wrong expresion. Exit
246 | 	}
247 |     return match(pat+1, sam, close-1);					// Execute the final alternative.
248 | }
249 | 
250 | /****************************************************************************/
251 | // All possible abbreviations
252 | static inline int c_extended(	const char* pat, const char* sam){
253 | 	#define ABB_LENGTH  (32)
254 |     char abbr[ABB_LENGTH] = "";    
255 |     switch (*++pat){
256 |         case 'd':	strcpy_s(abbr, ABB_LENGTH, "[0-9]");			break;	// Digit
257 |         case 'D':   strcpy_s(abbr, ABB_LENGTH, "[^0-9]");			break;  // Non-digit
258 |         case 'x':   strcpy_s(abbr, ABB_LENGTH, "[0-9A-Fa-f]");		break;	// Hex digit
259 |         case 'X':	strcpy_s(abbr, ABB_LENGTH, "[^0-9A-Fa-f]");		break;	// Non Hex
260 |         case 'w':   strcpy_s(abbr, ABB_LENGTH, "[0-9A-Za-z_]");		break;	// Word character
261 |         case 'W':	strcpy_s(abbr, ABB_LENGTH, "[^0-9A-Za-z_]");	break;
262 |         case 'h':   strcpy_s(abbr, ABB_LENGTH, "[0-9A-Za-z]");		break;	// head of word character
263 |         case 'H':	strcpy_s(abbr, ABB_LENGTH, "[^0-9A-Za-z]");		break;
264 |         case 'a':   strcpy_s(abbr, ABB_LENGTH, "[A-Za-z]");			break;	// Alphabetic character
265 |         case 'A':	strcpy_s(abbr, ABB_LENGTH, "[^A-Za-z]");		break;
266 |         case 'l':	strcpy_s(abbr, ABB_LENGTH, "[a-z]");			break;	// Lowercase character
267 |         case 'L':	strcpy_s(abbr, ABB_LENGTH, "[^a-z]");			break;
268 |         case 'u':   strcpy_s(abbr, ABB_LENGTH, "[A-Z]");			break;	// Uppercase character
269 |         case 'U':	strcpy_s(abbr, ABB_LENGTH, "[^A-Z]");			break;
270 |         case 's':   strcpy_s(abbr, ABB_LENGTH, "[ \t\r\n\v\f]");	break;	// Whitespace characters
271 |         case 'S':	strcpy_s(abbr, ABB_LENGTH, "[^ \t\r\n\v\f]");	break;
272 |     }
273 | 
274 |     if (*abbr)	return match(abbr, sam, endOfString(abbr));
275 |     else		return c_achar(pat,sam);						// Unknown abbreviation. Just assume that it is a character comparison
276 | }
277 | 
278 | /****************************************************************************/
279 | // Chose one of the options in []. Like [\\-0-9$_#]
280 | static inline  int c_option(	const char* pat, const char* sam){
281 |     const char *from  = NULL;									// If we have [a-z]		'from' is 'a', 'to' is 'z'
282 |     const char *to	  = NULL;									// If we have [qQ]		'from' is 'q' and 'Q', 'to' is not needed
283 | 	const char *close = compiledRegexPtr->getExpressionEnd(pat);// Extract the expression inside the [] parentheses
284 | 	pat++; close--;												// +1 and -1 remove the parentheses	
285 |     int negationOp = ((*pat == '^') ? NO_MATCH : 1);			// Check for negation flag. Invert character [^a-z], representing negation operator
286 | 	if (negationOp<0)
287 | 		pat++;
288 | 
289 |     while (pat < close){		
290 |         if (*pat == '-' && from){								// Check for range selection. Like 0-9, where we already have the from
291 | 			to = pat + 1;										// Find the 'to'
292 |             if (*to == '\\')  to++;								// Comparison with reserved character. like \-  or \*			
293 | 			// Test for range			
294 |             if is_int_inRange(*sam,*from,*to)				
295 |                 return negationOp;								// We have found a match. If 'not' is active than this is a violation of the pattern			
296 |             pat = to + 1;										// So *sam didn't match the current range, try the next range. Like a-z and A-Z
297 |             continue;
298 |         }
299 | 
300 |         from = pat;												// Beggining of the pattern. Initialize 'from'
301 |         if (*from == '\\'){ 
302 | 			from++; pat++;										// Comparison with reserved character. like \\* or \\?			
303 | 		}
304 | 		
305 |         if (*sam == *from)
306 |             return negationOp;									// Comparison of single letter. Like [a-ZAB]
307 |         pat++;
308 |     }
309 |     return -negationOp;											// We tested all the options and nothing was mathing.  
310 | }
311 | 
312 | /****************************************************************************/
313 | // Multiple occurence of a character
314 | static inline int c_multi(	const char* pat, const char* sam, const char* endp){
315 |     const Cmd* cmd = get_cmd(*pat);
316 |     int   nCharsMatched;														// How many characters the multi repitition consumed.
317 |     int   nRestCharsMatched = NO_MATCH;											// How many characters the rest of the pattern consumes (if it exists).
318 |     int   nRepitions;															// Counter, how many repititions were made up to now in a loop
319 |     const char* start_sam = sam;
320 |     const char *ends      = EOS;												// Get the end of the sample (stored in cache, instead of recalculation)
321 | 	const char *foundMatchAt = NULL;											// We already found a match but want to try and find a longer matching string
322 | 	const char *multi    = compiledRegexPtr->getExpressionEnd(pat);				// Multi occurence pattern: {}, *,?,+
323 |     const char *next_pat = compiledRegexPtr->getExpressionEnd(multi);			// The rest of the pattern.
324 | 
325 | 	// Calculate Min/Max numbers of needed occurences
326 |     int min = 0, max = 1;														// Initialization not really needed. Just in case.	
327 |     switch (*multi){
328 |         case '{':  // For range of repetition: {4} or {4-8}
329 |             {
330 |                 const char* comma = findFirstCinS(multi, ',');
331 | 				const char* rEnd  = findFirstCinS(multi, '}');
332 | 				// Read the minimum value
333 |                 min = atoui_simple(multi+1);
334 | 				// If comma exists inside {} than read also the maximum value
335 | 				if (comma){
336 | 					if       (comma <  rEnd-1)	max = MAX(atoui_simple(comma + 1),1);	// Read Max, Max must be at least 1;
337 | 					else if  (comma == rEnd-1)  max = (1<<30);					// Max does not exists: '{min,}', assume 1 billion is enough. Can use MAX_INT instead. Daniel did not want a dependency on <limits.h>
338 | 				}
339 | 				else                            max = MAX(min,1);				// No range: Like {4}, Max must be at least 1;
340 |             } 
341 | 			break;
342 |         case '+':		min = 1;		max = (1<<30);		break;
343 |         case '?':		min = 0;		max = 1;			break;
344 |         case '*':		min = 0;		max = (1<<30);		break;
345 |     }
346 | 
347 | 	// If (min==0), we first try to match the rest of pattern
348 | 	if ((min==0)&&(*next_pat)){
349 | 		nRestCharsMatched = match(next_pat, sam, endp);
350 | 		if (nRestCharsMatched>=0)			
351 | 			foundMatchAt = start_sam + nRestCharsMatched;	// Yes! The rest of the sample string matches the rest of the pattern. But maybe we can do more repititions and still be fine. like '.*b' matched to 'ab' but can also match  'abccqqb'
352 | 		// Note: if nRestCharsMatched==0 than the rest of the pattern can be matched to an empty string. Success is guaranteed. Now we want to match as much repititions as we can. Like 'a*b?' was matched to first character of 'aaaz' but can be matched to 'aaa'.
353 | 	}
354 | 
355 | 	// OK. We need to take at least one repitiotion. Enter the loop
356 | 	nRepitions = 0;
357 |     while (sam < ends){
358 |         nCharsMatched = ((Stand_Func)cmd->f)(pat, sam);									// Find the pattern for the i'th time.
359 |         if (nCharsMatched < 0){			
360 | 			// No more repetitions are possible
361 | 			if (nRepitions < min) return NO_MATCH;										// We need at least 'min' but failed
362 | 
363 | 			if  (*next_pat){
364 | 				// If (nRestCharsMatched < 0)
365 | 				//      We have enough iterations but we already know that the rest of pattern can't be matched. If we found a good solution earlier return it. Otherwise no solution for matching
366 | 				// Else We have found a good solution right now and no more iterations possible. Return the good solution.
367 | 				return (foundMatchAt) ? (int)(foundMatchAt-start_sam) : NO_MATCH; 
368 | 			}
369 | 			return (int)(sam-start_sam);												// Macth found. Use 'nRepitions'
370 | 		}        
371 |         sam += nCharsMatched;															// Found 'i' repitiotions. Advance pointers
372 |         nRepitions++;
373 | 
374 |         if (nRepitions < min) continue;													// If we still havent reached the minimal amount of repititions than continue to gather more repetitions.
375 | 		
376 | 		// OK, we have at least 'min' iterations, Time to check the if the rest of the 
377 | 		// pattern can be matched. If not we will look for more occurences.
378 | 		// Otherwise we will use the current amount of occurences.
379 |         if (*next_pat){
380 |             nRestCharsMatched = match(next_pat, sam, endp);
381 |             if (nRestCharsMatched>=0)
382 | 				foundMatchAt = sam + nRestCharsMatched;									// See explanation of the code line 'foundMatchAt = start_sam + nRestCharsMatched;' above
383 |         }
384 | 		
385 | 	    if (nRepitions == max){															// Check the maximal limit of repititions.
386 | 			if  (*next_pat)
387 | 				return (foundMatchAt) ? (int)(foundMatchAt-start_sam) : NO_MATCH;		// See explanation for this exact code line above 
388 | 			return (int)(sam-start_sam);												// Macth found. Use maximal possible amounts of repitions.
389 | 		}
390 | 	}
391 | 
392 |     // None of the iterations yielded a consistent match. We exited the loop due to end of sample string.
393 |     if (nRepitions < min) return NO_MATCH;												// Sample string terminated and we didn't get our minimal amount.
394 | 
395 | 	if ((*next_pat)&&(nRestCharsMatched < 0))
396 | 		return (foundMatchAt) ? (int)(foundMatchAt-start_sam) : NO_MATCH;				// Sample string terminated and the rest of the pattern cannot be matched to an empty string. If we found a good solution return it. Otherwise no solution for matching	
397 |     return (int)(sam-start_sam);														// No following patterns that require aditional characters and we got enough iterations. like a*z? on a string of 'aaaa'
398 | }
399 | 
400 | /****************************************************************************/
401 | // Match pattern to the 'sam' string from its beginning. 
402 | // Returns the amount of consumed characters if match was successfull. Otherwise returns NO_MATCH.
403 | // Note: 0 means successfull match. For example 'a?' is matched to 'bc' with zero occurences of 'a'
404 | static int match(const char* pat, const char* sam, const char* endp){
405 |     const Cmd* cmd;
406 |     int  nCharsMatched;
407 |     const char* start_sam = sam, *next_pat;
408 | 
409 | 	if (!pat)
410 | 		return NO_MATCH;												// NULL pattern is illegal
411 |     while (pat < endp){		
412 |         next_pat  = compiledRegexPtr->getExpressionEnd(pat);			// Find next pattern to see if it is a suffix like *, {x,y},?,+
413 | 		if (next_pat==NULL)
414 | 			return NO_MATCH;											// Wrong regular expression. For example '(A)))'
415 | 		cmd = get_cmd(*next_pat);										// Check the next command if it is a suffix
416 | 		if (isSuffix(cmd)){												// 'cmd' is indeed a suffix. like 'z{3,7}'. Activate {3,7} on pattern 'z'.
417 | 			int matchedLen = ((SuffixFunc)cmd->f)(pat, sam, endp);		// Execute the suffix
418 | 			return (matchedLen>=0) ? (int)(sam-start_sam) + matchedLen : NO_MATCH;
419 |         }
420 |         else{															// No suffix
421 | 			cmd = get_cmd(*pat);
422 | 			// if (cmd->attr&TYPE_RECURSION){ To do: Handle the the case of the bug (.*)AB is not matched to 'ZAB' because .* consumes 3 letters
423 |             nCharsMatched = ((Stand_Func)cmd->f)(pat, sam);
424 |             if (nCharsMatched < 0) 
425 | 				return NO_MATCH;										// If matching failed return NO_MATCH.
426 | 			
427 |             sam += nCharsMatched;										// Advance to next pattern
428 |             pat = next_pat;
429 |         }
430 |     }
431 |     return (int)(sam-start_sam);
432 | }
433 | 
434 | /****************************************************************************/
435 | /******************************** API methods *******************************/
436 | /****************************************************************************/
437 | const char* regex_search(const char* pattern, const char* sampleString, int* resLen){
438 | 	tCompiledRegex builtInCompiledRegex;
439 | 	builtInCompiledRegex.compile(pattern);
440 | 	return builtInCompiledRegex.search(sampleString, resLen);
441 | }
442 | 
443 | /****************************************************************************/
444 | const char* tCompiledRegex::search(const char* sampleString, int* resLen) const{
445 | 	compiledRegexPtr	   = this;										// Store 'this' as current regex
446 | 	const char* pattern    = start;
447 | 	const char* endPattern = end;
448 | 	const char* endOfSearch= EOS = endOfString(sampleString);			// When comparint the pattern to sample string we will search the entire sample.
449 | 	if (pattern == endPattern){											// Empty pattern is matched with zero length matching		
450 | 		*resLen = 0;
451 | 		return sampleString;
452 | 	}
453 | 	
454 | 	if (pattern[0]=='^'){												// Check if first characters forces a matching to beggining of the string
455 | 		pattern++;														// Skip the '^'
456 | 		endOfSearch = sampleString+1;									// Allow match only for first position.
457 | 	}
458 | 
459 | 	// Try to match from every possible place in the sample string
460 |     for (;sampleString != endOfSearch; sampleString++){
461 |         *resLen = match(pattern, sampleString, endPattern);
462 |         if (*resLen > 0)												// Note the >0 comparison and not >=. We do not allow empty string match.
463 |             return sampleString;										// Full match was found. Return the current location in sample string.
464 |     }
465 | 	
466 | 	sampleString = EOS;													// No match was found through the entire search.
467 | 	*resLen = match(pattern, sampleString, endPattern);					// Try to match pattern to empty string (since we didn't allow it before).
468 |      return (*resLen>= 0) ? sampleString : NULL;						// Empty sample string was matched to pattern (like a*b*c*) (if==0). Otherwise no match
469 | };
470 | 
471 | /****************************************************************************/
472 | /****************************** UNITEST methods *****************************/
473 | /****************************************************************************/
474 | #ifdef REGEX_UNITEST
475 | void regex_debug(const char* pattern, const char* sampleString, const char* trueAnswer){
476 | 	int   len = 0;
477 | 	char  output[256];
478 | 	const char* res;
479 | 	const char error[] = "@NO";
480 | 	const char empty[] = "";
481 | 
482 | 	res =  regex_search(pattern, sampleString, &len);
483 | 	if (len>0){			memcpy(output,res, len); output[len] = '\0';	}
484 | 	else if (len==0)	memcpy(output,empty,strlen(empty)+1);
485 | 	else				memcpy(output,error,strlen(error)+1);	
486 | 	// Print only errors
487 | 	if (strcmp(output,trueAnswer))		fprintf(stderr,"Error: Reg:%s\t\t in %s\t\t : %s,\n", pattern, sampleString, output);
488 | 	else								fprintf(stderr,"Test OK!\n");
489 | }
490 | 
491 | /****************************************************************************/
492 | // Sample of regular expression and true answers. Checks that mechanism works fine
493 | static void regex_debug_private_tests(void){
494 | 	regex_debug("(\\(*.\\[)[qQ]", "((a[Q]","((a[Q");
495 | 	regex_debug("((b)c)", "abc","bc");
496 | 	regex_debug("(a)((b)c)?", "abc","abc");
497 | 	regex_debug("(a)(((b))c)(d)?", "abc","abc");
498 | 	regex_debug("ba*(ac)?", "baaa","baaa");
499 | 	regex_debug("a{1,2}c?q?", "aaa","aa");
500 | 	regex_debug("a{3,}c?q?", "aaacq","aaacq");
501 | 	regex_debug("a{2}v", "aaav","aav");
502 | 	regex_debug("a*b*c*", "","");												// Testy empty matches
503 | 	regex_debug("a*","zb","");   
504 | 	regex_debug("a*b*c*q", "","@NO");
505 | 	regex_debug("ba*", "baaa","baaa");
506 | 	regex_debug("ba*(ac)+", "baaac","baaac");
507 | 	regex_debug("ba+", "baaa","baaa");
508 | 	regex_debug("ba{0,3}", "baaa","baaa");
509 | 	regex_debug("ba{1,2}c?q?", "baaa","baa");
510 | 	regex_debug("[0-3]*(22a)", "12222a","12222a");
511 | 	regex_debug("b[0-9]+", "abc123xyz","@NO");
512 | 	regex_debug("ac?.[0-2]*", "abc123xyz","ab");
513 | 	regex_debug("ac?b[q-z]*c{0,4}.[0-2]*(22a)", "abc12222a","abc12222a");
514 | 	regex_debug("(6(0|4|5)_)+", "61_60_64_65_A","60_64_65_");
515 | 	regex_debug("(6(0|4|5)_)*", "61_60_64_65_A","60_64_65_");
516 | 	regex_debug("(6(011|44|5)_)*", "6012_6011_644_65_6541_","6011_644_65_");
517 | 	regex_debug("^a[ \t\r\n\v\f]*", "za \n ","@NO");							// Test spaces
518 | 	regex_debug("a[ \t\r\n\v\f]*b", "ga \tbv","a \tb");
519 | 	regex_debug("a[ \t\r\n\v\f]*", "za \t\t \f\n ","a \t\t \f\n ");
520 | 	regex_debug("f..k","zfolky","folk");
521 | 	regex_debug("va*b","kkvaaaab","vaaaab");
522 | 	regex_debug(".*b","ababc","abab");
523 | 	regex_debug("z.*b","xyzababc","zabab");
524 | 	regex_debug(".*b","bc","b");
525 | 	regex_debug("[abc]1{2}(cat|pup+y|dog).{2}","_a11puppy11_","a11puppy11");   
526 | 	regex_debug("[abc]*\\|","zab|","ab|");   
527 | 	regex_debug("[abc]*\\|[AC-Z]*","zab|ACDB","ab|ACD");   
528 | 	regex_debug("[abc]*\\|[ABCK]*","zab|ACKvv","ab|ACK");   
529 | 	regex_debug("[ABCK\\+UZ0-9]*","AAZ+BUB","AAZ+BUB");   
530 | 	regex_debug("[ABCKUZ0-9\\-]*","AAZ-BUB","AAZ-BUB");			
531 | 	regex_debug("[ABCK\\-UZ0-9]*","AAZ-BUB","AAZ-BUB");							
532 | 	regex_debug("[abc]*\\|[A\\?B-Z]*","zab|A?BCD","ab|A?BCD");   
533 | 	regex_debug("[^xyz]","zab","a");   
534 | 	regex_debug("[^xyc]+","zab","zab");   
535 | 	regex_debug("([0-9]+a{2,4})+q","1aa23aaa445aaaaq","1aa23aaa445aaaaq");   
536 | 	regex_debug("([abc]?[01]?)*","a1b00aab","a1b00aab");   
537 | 	regex_debug("[a-c]+(x{2,4})*","cxxxxxxxx","cxxxxxxxx");   
538 | 	regex_debug("([a-c]+(x{1,2})?)+x","cxccxxccxxx","cxccxxccxxx");   
539 | 	regex_debug("a[0-9]+b",   "za789b","a789b");
540 | 	regex_debug("[^A-Za-z]*","abc12..","12..");
541 | 	regex_debug("^a\\+{3}\\*\\?b","a+++*?bc","a+++*?b");						// Check special characters
542 | 	regex_debug("\\[\\[","a[?][[1","[[");
543 | 	regex_debug("(V\\.?A\\.?T\\.?|TAX|TVA)( *[^A-Z]? +)?[A-_\\. ]*([0-9]+([\\.,][0-9]{1,2})?%?[0-9]+([\\.,][0-9]{1,2})?%_? +)?([A-_\\. ]*)?([0-9]+[\\.,][0-9]{1,2} +)?[A-_]? *[\\-\\+]?[0-9]+(,[0-9]{3})*[\\.,][0-9]{1,2} *.? *\n","         VAT                      3.93\n","VAT                      3.93\n");
544 | 	regex_debug("\\[\\]","a[?][]1","[]1");										
545 | 	regex_debug("(\\[\\?\\])*\\|(\\*)+","a[?][?]|**1","[?][?]|**");				
546 | 	regex_debug("(.+)(.+)","AB","AB");											// Known bug in Daniels implementation!
547 | 	regex_debug("((.*A)BABA)Z","ZABABABABAZQ","ZABABABABAZ");					// Known bug in Daniels implementation!
548 | }
549 | 
550 | #endif
551 | /****************************************************************************/
552 | // EOF.


--------------------------------------------------------------------------------
/reg_exp.h:
--------------------------------------------------------------------------------
 1 | #ifndef REGULAR_EXPRESSION_H_DANIEL
 2 | #define REGULAR_EXPRESSION_H_DANIEL
 3 | /* 
 4 | * Coded By    : Daniel Herman Shmulyan
 5 | * Description : Processing regular expressions. Given an expression and a string, returns the longest substring matching the expression.
 6 | *				See examples in the implementation of regex_debug() function
 7 | *				Based on rules described at: http://en.wikipedia.org/wiki/Regular_expression.
 8 | *				This is a recursive regex (It does not compile regular expression to automaton like described in http://swtch.com/~rsc/regexp/regexp1.html)
 9 | *               But it uses a small, constant amount of memory for compilation and is extremely fast
10 | *				Designed to work on simple regular expressions (ASCII only and no longer than 256 bytes)
11 | */
12 | 
13 | /****************************************************************************/
14 | // Regular expression definitions and rules:
15 | //		Single character is matched like 'abc' to 'zabcd' from place 1, length 3.
16 | //		. - is matched to any character. 'a..d' matches 'abcd', 'aaad','aqwd'
17 | //		* - 0 or more occurrences: 'a*' is matched to '', 'a', 'aa', 'aaaaaa'
18 | //		+ - 1 or more occurrences: 'a+' is matched to 'a', 'aa', 'aaaaaa' but not ''
19 | //		? - 0 or 1    occurrences: 'ab?c' is matched to 'ac' and 'abc'
20 | //		{N} - exactly N occurrences: 'a{4}' is matched to 'aaaa'
21 | //		{M,N} - Between M and N occurrences: 'a{1,3}' is matched to 'a','aa','aaa' but not '' nor 'aaaaa'
22 | //		{M,}  - At least M but with no maximum limit. 'a{2,}' is matched to 'aa','aaaaa','aaaaaaaaa'
23 | //		Use \\ to include reserved characters: '\\[p\\]*' is matched to '[p]'
24 | //		[]  - Matches a single character that is contained within the brackets. [abc] is matched to 'a', 'b', 'c' but not 'd'
25 | //		[^] - Invert selection: [^abc] is matched to any possible character except 'a', 'b', 'c'
26 | //		[a-z] - Selection range: [A-Z] is matched to any capital english letter
27 | //		[0-9a-zA-Z\\-\\+\\.] is matched to any alpha-numeric character (part of number or english word)
28 | //		() - Used to group expressions. Like:  '(ab)*' is matched to 'abababab'
29 | //		|  - Choose one of alternative expressions: (cat|dog|au+a) is matched 'cat' or 'dog' or 'aua' or 'auua' or 'auuua' etc
30 | //		You can use any combination of the above. Example: (_[hc]?at|me) matches "_hat", "_cat", "_at" and "me".
31 | //
32 | // Known Issues:
33 | //		Since we use recursion on (), the greedy '.*' mechanism may cause '*' to grab too many characters. Example: "(.*A)BA" in "ABA" will return NULL since '.*' is wrongly matched to 'AB'.
34 | //		No multi-threading support (a single regex match cannot run on several CPUs, and neither can several regex matches run in parallel).
35 | // Special first character (appears as first character of the regular expression  pattern).
36 | //		^ - Match to beginning of the string. Example: "ab" matches "zab" at place 1 but "^ab" does not match "zab" at all.
37 | //		$ - Match to the end. Not supported yet. Daniel, Todo...  Example: 'ab' matches 'abz' with length 2, but '$ab' will not match 'abz' at all.
38 | //		] - Chose last match. Not supported yet. Daniel, Todo...  Example: 'ab' will match 'ab1ab3' at first 2 characters. but ']ab' will match to the second (last) appearance of 'ab'
39 | 
40 | // Abbreviations:
41 | //		\d       ie. [0-9]          digit
42 | //		\D       ie. [^0-9]         non-digit
43 | //		\x       ie. [0-9A-Fa-f]    hex digit
44 | //		\X       ie. [^0-9A-Fa-f]
45 | //		\w       ie. [0-9A-Za-z_]   word character         
46 | //		\W       ie. [^0-9A-Za-z_]
47 | //		\h       ie. [0-9A-Za-z]    head of word character     
48 | //		\H       ie. [^0-9A-Za-z]
49 | //		\a       ie. [A-Za-z]       alphabetic character     
50 | //		\A       ie. [^A-Za-z]
51 | //		\l       ie. [a-z]          lowercase character     
52 | //		\L       ie. [^a-z]
53 | //		\u       ie. [A-Z]          uppercase character     
54 | //		\U       ie. [^A-Z]
55 | //		\s		 ie. [ \t\r\n\v\f]	Whitespace characters
56 | //		\S		 ie. [^ \t\r\n\v\f]	Non-whitespace characters
57 | 
58 | /****************************************************************************/
59 | /******************************** Functions *********************************/
60 | /****************************************************************************/
// Compiled form of a regex pattern. Compilation precalculates, for every position in the pattern,
// the length of the expression starting there, so that matching can jump between expressions
// without rescanning the pattern (faster execution at the cost of an initial calculation and memory).
struct tCompiledRegex{
private:
    const char* start;					// Pointer to the beginning of the pattern
	const char* end;					// Pointer to the end of the pattern
	unsigned char exprLen[ 256];		// Per pattern position: length of the sub expression starting there. One byte per position, 256 positions.
	unsigned char unionLen[256];		// Per pattern position: length of the union set alternative (aab|ccd|ef) starting there

public:
	// Given a pointer to the beginning of a sub expression inside the pattern, returns a pointer to its end.
	const char* getExpressionEnd(const char *ex) const {
		const int offset = (int)(ex - start);		// Position of 'ex' relative to the pattern start
		return ex + exprLen[offset];
	}

	// Same as getExpressionEnd(), but uses the union set length table.
	const char* getExpressionEnd_UnionSet(const char *ex) const {
		const int offset = (int)(ex - start);
		return ex + unionLen[offset];
	}

	const char* compile(const char *pattern);							// Assumes the pattern is a legal expression no longer than 256 bytes and compiles it. Returns the end of the pattern on success or NULL on failure.
	const char* search( const char *sampleString, int* resLen) const;	// Returns a pointer to the beginning of the matched part and sets resLen to the length of the matched part.
};
76 | 
77 | const char* regex_search(const char* pattern, const char* sampleString, int* resLen); // Compile and search in one function. A C style API.
78 | 
79 | #endif // H 
80 | /****************************************************************************/
81 | // EOF.
82 | 


--------------------------------------------------------------------------------