├── README.md ├── Regexp.h ├── examples ├── GlobalMatch │ └── GlobalMatch.pde ├── GlobalReplace │ └── GlobalReplace.pde ├── GlobalReplace_Entities │ └── GlobalReplace_Entities.pde ├── Match │ └── Match.pde └── MatchCount │ └── MatchCount.pde ├── keywords.txt ├── library.properties └── src ├── Regexp.cpp └── Regexp.h /README.md: -------------------------------------------------------------------------------- 1 | Regexp 2 | ====== 3 | 4 | Regular expression parser for microcontrollers based on the Lua one. 5 | 6 | Documentation on interfacing with the library, and other details at: 7 | 8 | http://www.gammon.com.au/forum/?id=11063 9 | 10 | ## Documentation on regular expressions (Lua patterns) 11 | 12 | * [Official Lua documentation](http://www.lua.org/manual/5.2/manual.html#6.4.1) 13 | 14 | * [Simplified documentation from MUSHclient help](http://www.gammon.com.au/scripts/doc.php?lua=string.find) 15 | -------------------------------------------------------------------------------- /Regexp.h: -------------------------------------------------------------------------------- 1 | #include "src/Regexp.h" 2 | -------------------------------------------------------------------------------- /examples/GlobalMatch/GlobalMatch.pde: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // called for each match 4 | void match_callback (const char * match, // matching string (not null-terminated) 5 | const unsigned int length, // length of matching string 6 | const MatchState & ms) // MatchState in use (to get captures) 7 | { 8 | char cap [10]; // must be large enough to hold captures 9 | 10 | Serial.print ("Matched: "); 11 | Serial.write ((byte *) match, length); 12 | Serial.println (); 13 | 14 | for (byte i = 0; i < ms.level; i++) 15 | { 16 | Serial.print ("Capture "); 17 | Serial.print (i, DEC); 18 | Serial.print (" = "); 19 | ms.GetCapture (cap, i); 20 | Serial.println (cap); 21 | } // end of for each capture 22 | 23 | } // end of 
match_callback 24 | 25 | 26 | void setup () 27 | { 28 | Serial.begin (115200); 29 | Serial.println (); 30 | unsigned long count; 31 | 32 | // what we are searching (the target) 33 | char buf [100] = "The quick brown fox jumps over the lazy wolf"; 34 | 35 | // match state object 36 | MatchState ms (buf); 37 | 38 | // original buffer 39 | Serial.println (buf); 40 | 41 | // search for three letters followed by a space (two captures) 42 | count = ms.GlobalMatch ("(%a+)( )", match_callback); 43 | 44 | // show results 45 | Serial.print ("Found "); 46 | Serial.print (count); // 8 in this case 47 | Serial.println (" matches."); 48 | 49 | 50 | } // end of setup 51 | 52 | void loop () {} -------------------------------------------------------------------------------- /examples/GlobalReplace/GlobalReplace.pde: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // called for every match 4 | void replace_callback (const char * match, // what we found 5 | const unsigned int length, // how long it was 6 | const char * & replacement, // put replacement here 7 | unsigned int & replacement_length, // put replacement length here 8 | const MatchState & ms) // for looking up captures 9 | { 10 | 11 | // show matching text 12 | Serial.print("Match = "); 13 | Serial.write((byte *) match, length); 14 | Serial.println (); 15 | 16 | replacement = "Nick"; 17 | replacement_length = 4; 18 | } // end of replace_callback 19 | 20 | void setup () 21 | { 22 | Serial.begin (115200); 23 | Serial.println (); 24 | unsigned long count; 25 | 26 | // what we are searching (the target) 27 | char buf [100] = "The quick brown fox jumps over the lazy wolf"; 28 | 29 | // match state object 30 | MatchState ms (buf); 31 | 32 | // original buffer 33 | Serial.println (buf); 34 | 35 | // search for three letters 36 | count = ms.GlobalReplace ("%a+", replace_callback); 37 | 38 | // show results 39 | Serial.print ("Converted string: "); 40 | Serial.println (buf); 41 | 
Serial.print ("Found "); 42 | Serial.print (count); // 9 in this case 43 | Serial.println (" matches."); 44 | 45 | // copy in new target 46 | strcpy (buf, "But does it get goat's blood out?"); 47 | ms.Target (buf); // recompute length 48 | 49 | // replace vowels with * 50 | count = ms.GlobalReplace ("[aeiou]", "*"); 51 | 52 | // show results 53 | Serial.print ("Converted string: "); 54 | Serial.println (buf); 55 | Serial.print ("Found "); 56 | Serial.print (count); // 13 in this case 57 | Serial.println (" matches."); 58 | 59 | } // end of setup 60 | 61 | void loop () {} 62 | -------------------------------------------------------------------------------- /examples/GlobalReplace_Entities/GlobalReplace_Entities.pde: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // called for every match 4 | void replace_callback (const char * match, // what we found 5 | const unsigned int length, // how long it was 6 | const char * & replacement, // put replacement here 7 | unsigned int & replacement_length, // put replacement length here 8 | const MatchState & ms) // for looking up captures 9 | { 10 | static byte c; // for holding replacement byte, must be static 11 | 12 | char hexdigits [3]; // to hold hex string 13 | 14 | // get first capture 15 | ms.GetCapture (hexdigits, 0); 16 | // convert from hex to printable 17 | c = strtol (hexdigits, NULL, 16); 18 | 19 | // set as replacement 20 | replacement = (char *) &c; 21 | replacement_length = 1; 22 | } // end of replace_callback 23 | 24 | 25 | void setup () 26 | { 27 | Serial.begin (115200); 28 | 29 | // what we are searching 30 | char buf [100] = "%7B%22John+Doe%22%7D"; 31 | 32 | // for matching regular expressions 33 | MatchState ms (buf); 34 | 35 | // easy part, replace + by space 36 | ms.GlobalReplace ("%+", " "); 37 | 38 | // replace %xx (eg. 
%22) by what the hex code represents 39 | ms.GlobalReplace ("%%(%x%x)", replace_callback); 40 | 41 | Serial.println (buf); 42 | 43 | } // end of setup 44 | 45 | void loop () {} 46 | -------------------------------------------------------------------------------- /examples/Match/Match.pde: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void setup () 4 | { 5 | Serial.begin (115200); 6 | 7 | // match state object 8 | MatchState ms; 9 | 10 | // what we are searching (the target) 11 | char buf [100] = "The quick brown fox jumps over the lazy wolf"; 12 | ms.Target (buf); // set its address 13 | Serial.println (buf); 14 | 15 | char result = ms.Match ("f.x"); 16 | 17 | if (result > 0) 18 | { 19 | Serial.print ("Found match at: "); 20 | Serial.println (ms.MatchStart); // 16 in this case 21 | Serial.print ("Match length: "); 22 | Serial.println (ms.MatchLength); // 3 in this case 23 | } 24 | else 25 | Serial.println ("No match."); 26 | 27 | } // end of setup 28 | 29 | void loop () {} 30 | -------------------------------------------------------------------------------- /examples/MatchCount/MatchCount.pde: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void setup () 4 | { 5 | Serial.begin (115200); 6 | 7 | // match state object 8 | MatchState ms; 9 | 10 | // what we are searching (the target) 11 | char buf [100] = "The quick brown fox jumps over the lazy wolf"; 12 | ms.Target (buf); // set its address 13 | 14 | unsigned int count = ms.MatchCount ("[aeiou]"); 15 | 16 | Serial.println (buf); 17 | Serial.print ("Found "); 18 | Serial.print (count); // 11 in this case 19 | Serial.println (" matches."); 20 | 21 | } // end of setup 22 | 23 | void loop () {} 24 | -------------------------------------------------------------------------------- /keywords.txt: -------------------------------------------------------------------------------- 1 | MatchState KEYWORD1 2 | Match 
KEYWORD2 3 | Target KEYWORD2 4 | GetMatch KEYWORD2 5 | GetCapture KEYWORD2 6 | GetResult KEYWORD2 7 | MatchCount KEYWORD2 8 | GlobalMatch KEYWORD2 9 | GlobalReplace KEYWORD2 10 | -------------------------------------------------------------------------------- /library.properties: -------------------------------------------------------------------------------- 1 | name=Regexp 2 | version=0.1.0 3 | author=Nick Gammon 4 | maintainer=Nick Gammon 5 | sentence=Regular expression parser for microcontrollers 6 | paragraph=Based upon Lua implementation 7 | category=Uncategorized 8 | url=https://github.com/nickgammon/Regexp 9 | architectures=* 10 | -------------------------------------------------------------------------------- /src/Regexp.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Regular-expression matching library for Arduino. 4 | 5 | Written by Nick Gammon. 6 | Date: 30 April 2011 7 | 8 | Heavily based on the Lua regular expression matching library written by Roberto Ierusalimschy. 9 | 10 | Adapted to run on the Arduino by Nick Gammon. 11 | 12 | VERSION 13 | 14 | Version 1.0 - 30th April 2011 : initial release. 15 | Version 1.1 - 1st May 2011 : added some helper functions, made more modular. 16 | Version 1.2 - 19th May 2011 : added more helper functions for replacing etc. 17 | 18 | 19 | LICENSE 20 | 21 | 22 | Copyright © 1994–2010 Lua.org, PUC-Rio. 
23 | 24 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), 25 | to deal in the Software without restriction, including without limitation the rights to use, 26 | copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 27 | and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 28 | 29 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 30 | 31 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 32 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 33 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 34 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 35 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 36 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 37 | OR OTHER DEALINGS IN THE SOFTWARE. 38 | 39 | 40 | USAGE 41 | 42 | 43 | Find the first match of the regular expression "pattern" in the supplied string, starting at position "index". 44 | 45 | If found, returns REGEXP_MATCHED (1). 46 | 47 | Also MatchStart and MatchLength in the MatchState structure are set to the start offset and length of the match. 48 | 49 | The capture array in the MatchState structure has the locations and lengths of each capture. 50 | 51 | If not found, returns REGEXP_NOMATCH (0). 52 | 53 | On a parsing error (eg. trailing % symbol) returns a negative number. 
54 | 55 | 56 | EXAMPLE OF CALLING ON THE ARDUINO 57 | 58 | // ------------------------------------- // 59 | 60 | #include <Regexp.h> 61 | 62 | void setup () 63 | { 64 | 65 | Serial.begin (115200); 66 | Serial.println (); 67 | 68 | MatchState ms; 69 | char buf [100]; // large enough to hold expected string, or malloc it 70 | 71 | // string we are searching 72 | ms.Target ("Testing: answer=42"); 73 | 74 | // search it 75 | char result = ms.Match ("(%a+)=(%d+)", 0); 76 | 77 | // check results 78 | 79 | switch (result) 80 | { 81 | case REGEXP_MATCHED: 82 | 83 | Serial.println ("-----"); 84 | Serial.print ("Matched on: "); 85 | Serial.println (ms.GetMatch (buf)); 86 | 87 | // matching offsets in ms.capture 88 | 89 | Serial.print ("Captures: "); 90 | Serial.println (ms.level); 91 | 92 | for (int j = 0; j < ms.level; j++) 93 | { 94 | Serial.print ("Capture number: "); 95 | Serial.println (j + 1, DEC); 96 | Serial.print ("Text: '"); 97 | Serial.print (ms.GetCapture (buf, j)); 98 | Serial.println ("'"); 99 | 100 | } 101 | break; 102 | 103 | case REGEXP_NOMATCH: 104 | Serial.println ("No match."); 105 | break; 106 | 107 | default: 108 | Serial.print ("Regexp error: "); 109 | Serial.println (result, DEC); 110 | break; 111 | 112 | } // end of switch 113 | 114 | } // end of setup 115 | 116 | void loop () {} // end of loop 117 | 118 | // ------------------------------------- // 119 | 120 | 121 | PATTERNS 122 | 123 | Patterns 124 | 125 | The standard patterns (character classes) you can search for are: 126 | 127 | 128 | . --- (a dot) represents all characters. 129 | %a --- all letters. 130 | %c --- all control characters. 131 | %d --- all digits. 132 | %l --- all lowercase letters. 133 | %p --- all punctuation characters. 134 | %s --- all space characters. 135 | %u --- all uppercase letters. 136 | %w --- all alphanumeric characters. 137 | %x --- all hexadecimal digits. 138 | %z --- the character with hex representation 0x00 (null). 139 | %% --- a single '%' character. 
140 | 141 | %1 --- captured pattern 1. 142 | %2 --- captured pattern 2 (and so on). 143 | %f[s] transition from not in set 's' to in set 's'. 144 | %b() balanced pair ( ... ) 145 | 146 | 147 | Important! - the uppercase versions of the above represent the complement of the class. 148 | eg. %U represents everything except uppercase letters, %D represents everything except digits. 149 | 150 | There are some "magic characters" (such as %) that have special meanings. These are: 151 | 152 | 153 | ^ $ ( ) % . [ ] * + - ? 154 | 155 | 156 | If you want to use those in a pattern (as themselves) you must precede them by a % symbol. 157 | 158 | eg. %% would match a single % 159 | 160 | You can build your own pattern classes (sets) by using square brackets, eg. 161 | 162 | 163 | [abc] ---> matches a, b or c 164 | [a-z] ---> matches lowercase letters (same as %l) 165 | [^abc] ---> matches anything except a, b or c 166 | [%a%d] ---> matches all letters and digits 167 | 168 | [%a%d_] ---> matches all letters, digits and underscore 169 | [%[%]] ---> matches square brackets (had to escape them with %) 170 | 171 | 172 | You can use pattern classes in the form %x in the set. 173 | If you use other characters (like periods and brackets, etc.) they are simply themselves. 174 | 175 | You can specify a range of characters inside a set by using simple characters (not pattern classes like %a) separated by a hyphen. 176 | For example, [A-Z] or [0-9]. These can be combined with other things. For example [A-Z0-9] or [A-Z,.]. 177 | 178 | The end-points of a range must be given in ascending order. That is, [A-Z] would match upper-case letters, but [Z-A] would not match anything. 179 | 180 | You can negate a set by starting it with a "^" symbol, thus [^0-9] is everything except the digits 0 to 9. 181 | The negation applies to the whole set, so [^%a%d] would match anything except letters or digits. 182 | Anywhere except the first position of a set, the "^" symbol is simply itself. 
183 | 184 | Inside a set (that is a sequence delimited by square brackets) the only "magic" characters are: 185 | 186 | ] ---> to end the set, unless preceded by % 187 | % ---> to introduce a character class (like %a), or magic character (like "]") 188 | ^ ---> in the first position only, to negate the set (eg. [^A-Z]) 189 | - ---> between two characters, to specify a range (eg. [A-F]) 190 | 191 | 192 | Thus, inside a set, characters like "." and "?" are just themselves. 193 | 194 | The repetition characters, which can follow a character, class or set, are: 195 | 196 | 197 | + ---> 1 or more repetitions (greedy) 198 | * ---> 0 or more repetitions (greedy) 199 | 200 | - ---> 0 or more repetitions (non greedy) 201 | ? ---> 0 or 1 repetition only 202 | 203 | 204 | A "greedy" match will match on as many characters as possible, a non-greedy one will match on as few as possible. 205 | 206 | The standard "anchor" characters apply: 207 | 208 | 209 | ^ ---> anchor to start of subject string 210 | $ ---> anchor to end of subject string 211 | 212 | 213 | You can also use round brackets to specify "captures": 214 | 215 | 216 | You see (.*) here 217 | 218 | 219 | Here, whatever matches (.*) becomes the first capture. 220 | 221 | You can also refer to matched substrings (captures) later on in an expression: 222 | 223 | eg. This would match: 224 | 225 | string = "You see dogs and dogs" 226 | regexp = "You see (.*) and %1" 227 | 228 | 229 | This example shows how you can look for a repetition of a word matched earlier, whatever that word was ("dogs" in this case). 230 | 231 | As a special case, an empty capture string returns as the captured pattern, the position of itself in the string. eg. 
232 | 233 | string = "You see dogs and dogs" 234 | regexp = "You .* ()dogs .*" 235 | 236 | This would return a capture with an offset of 8, and a length of CAP_POSITION (-2) 237 | 238 | Finally you can look for nested "balanced" things (such as parentheses) by using %b, like this: 239 | 240 | 241 | string = "I see a (big fish (swimming) in the pond) here" 242 | regexp = "%b()" 243 | 244 | 245 | After %b you put 2 characters, which indicate the start and end of the balanced pair. 246 | If it finds a nested version it keeps processing until we are back at the top level. 247 | In this case the matching string was "(big fish (swimming) in the pond)". 248 | 249 | 250 | */ 251 | 252 | 253 | #include 254 | #include 255 | #include 256 | #include "Regexp.h" 257 | 258 | // for throwing errors 259 | static jmp_buf regexp_error_return; 260 | typedef unsigned char byte; 261 | 262 | // error codes raised during regexp processing 263 | static byte error (const char err) 264 | { 265 | // does not return 266 | longjmp (regexp_error_return, err); 267 | return 0; // keep compiler happy 268 | } // end of error 269 | 270 | static int check_capture (MatchState *ms, int l) { 271 | l -= '1'; 272 | if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) 273 | return error(ERR_INVALID_CAPTURE_INDEX); 274 | return l; 275 | } // end of check_capture 276 | 277 | static int capture_to_close (MatchState *ms) { 278 | int level = ms->level; 279 | for (level--; level>=0; level--) 280 | if (ms->capture[level].len == CAP_UNFINISHED) return level; 281 | return error(ERR_INVALID_PATTERN_CAPTURE); 282 | } // end of capture_to_close 283 | 284 | static const char *classend (MatchState *ms, const char *p) { 285 | switch (*p++) { 286 | case REGEXP_ESC: { 287 | if (*p == '\0') 288 | error(ERR_MALFORMED_PATTERN_ENDS_WITH_ESCAPE); 289 | return p+1; 290 | } 291 | case '[': { 292 | if (*p == '^') p++; 293 | do { /* look for a `]' */ 294 | if (*p == '\0') 295 | 
error(ERR_MALFORMED_PATTERN_ENDS_WITH_RH_SQUARE_BRACKET); 296 | if (*(p++) == REGEXP_ESC && *p != '\0') 297 | p++; /* skip escapes (e.g. `%]') */ 298 | } while (*p != ']'); 299 | return p+1; 300 | } 301 | default: { 302 | return p; 303 | } 304 | } 305 | } // end of classend 306 | 307 | 308 | static int match_class (int c, int cl) { 309 | int res; 310 | switch (tolower(cl)) { 311 | case 'a' : res = isalpha(c); break; 312 | case 'c' : res = iscntrl(c); break; 313 | case 'd' : res = isdigit(c); break; 314 | case 'l' : res = islower(c); break; 315 | case 'p' : res = ispunct(c); break; 316 | case 's' : res = isspace(c); break; 317 | case 'u' : res = isupper(c); break; 318 | case 'w' : res = isalnum(c); break; 319 | case 'x' : res = isxdigit(c); break; 320 | case 'z' : res = (c == 0); break; 321 | default: return (cl == c); 322 | } 323 | return (islower(cl) ? res : !res); 324 | } // end of match_class 325 | 326 | 327 | static int matchbracketclass (int c, const char *p, const char *ec) { 328 | int sig = 1; 329 | if (*(p+1) == '^') { 330 | sig = 0; 331 | p++; /* skip the `^' */ 332 | } 333 | while (++p < ec) { 334 | if (*p == REGEXP_ESC) { 335 | p++; 336 | if (match_class(c, uchar(*p))) 337 | return sig; 338 | } 339 | else if ((*(p+1) == '-') && (p+2 < ec)) { 340 | p+=2; 341 | if (uchar(*(p-2)) <= c && c <= uchar(*p)) 342 | return sig; 343 | } 344 | else if (uchar(*p) == c) return sig; 345 | } 346 | return !sig; 347 | } // end of matchbracketclass 348 | 349 | 350 | static int singlematch (int c, const char *p, const char *ep) { 351 | switch (*p) { 352 | case '.': return 1; /* matches any char */ 353 | case REGEXP_ESC: return match_class(c, uchar(*(p+1))); 354 | case '[': return matchbracketclass(c, p, ep-1); 355 | default: return (uchar(*p) == c); 356 | } 357 | } // end of singlematch 358 | 359 | 360 | static const char *match (MatchState *ms, const char *s, const char *p); 361 | 362 | 363 | static const char *matchbalance (MatchState *ms, const char *s, 364 | const char 
*p) { 365 | if (*p == 0 || *(p+1) == 0) 366 | error(ERR_UNBALANCED_PATTERN); 367 | if (*s != *p) return NULL; 368 | else { 369 | int b = *p; 370 | int e = *(p+1); 371 | int cont = 1; 372 | while (++s < ms->src_end) { 373 | if (*s == e) { 374 | if (--cont == 0) return s+1; 375 | } 376 | else if (*s == b) cont++; 377 | } 378 | } 379 | return NULL; /* string ends out of balance */ 380 | } // end of matchbalance 381 | 382 | 383 | static const char *max_expand (MatchState *ms, const char *s, 384 | const char *p, const char *ep) { 385 | int i = 0; /* counts maximum expand for item */ 386 | while ((s+i)src_end && singlematch(uchar(*(s+i)), p, ep)) 387 | i++; 388 | /* keeps trying to match with the maximum repetitions */ 389 | while (i>=0) { 390 | const char *res = match(ms, (s+i), ep+1); 391 | if (res) return res; 392 | i--; /* else didn't match; reduce 1 repetition to try again */ 393 | } 394 | return NULL; 395 | } // end of max_expand 396 | 397 | 398 | static const char *min_expand (MatchState *ms, const char *s, 399 | const char *p, const char *ep) { 400 | for (;;) { 401 | const char *res = match(ms, s, ep+1); 402 | if (res != NULL) 403 | return res; 404 | else if (ssrc_end && singlematch(uchar(*s), p, ep)) 405 | s++; /* try with one more repetition */ 406 | else return NULL; 407 | } 408 | } // end of min_expand 409 | 410 | 411 | static const char *start_capture (MatchState *ms, const char *s, 412 | const char *p, int what) { 413 | const char *res; 414 | int level = ms->level; 415 | if (level >= MAXCAPTURES) error(ERR_TOO_MANY_CAPTURES); 416 | ms->capture[level].init = s; 417 | ms->capture[level].len = what; 418 | ms->level = level+1; 419 | if ((res=match(ms, s, p)) == NULL) /* match failed? 
*/ 420 | ms->level--; /* undo capture */ 421 | return res; 422 | } // end of start_capture 423 | 424 | 425 | static const char *end_capture (MatchState *ms, const char *s, 426 | const char *p) { 427 | int l = capture_to_close(ms); 428 | const char *res; 429 | ms->capture[l].len = s - ms->capture[l].init; /* close capture */ 430 | if ((res = match(ms, s, p)) == NULL) /* match failed? */ 431 | ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ 432 | return res; 433 | } // end of end_capture 434 | 435 | 436 | static const char *match_capture (MatchState *ms, const char *s, int l) { 437 | size_t len; 438 | l = check_capture(ms, l); 439 | len = ms->capture[l].len; 440 | if ((size_t)(ms->src_end-s) >= len && 441 | memcmp(ms->capture[l].init, s, len) == 0) 442 | return s+len; 443 | else return NULL; 444 | } // end of match_capture 445 | 446 | 447 | static const char *match (MatchState *ms, const char *s, const char *p) { 448 | init: /* using goto's to optimize tail recursion */ 449 | switch (*p) { 450 | case '(': { /* start capture */ 451 | if (*(p+1) == ')') /* position capture? */ 452 | return start_capture(ms, s, p+2, CAP_POSITION); 453 | else 454 | return start_capture(ms, s, p+1, CAP_UNFINISHED); 455 | } 456 | case ')': { /* end capture */ 457 | return end_capture(ms, s, p+1); 458 | } 459 | case REGEXP_ESC: { 460 | switch (*(p+1)) { 461 | case 'b': { /* balanced string? */ 462 | s = matchbalance(ms, s, p+2); 463 | if (s == NULL) return NULL; 464 | p+=4; goto init; /* else return match(ms, s, p+4); */ 465 | } 466 | case 'f': { /* frontier? */ 467 | const char *ep; char previous; 468 | p += 2; 469 | if (*p != '[') 470 | error(ERR_MISSING_LH_SQUARE_BRACKET_AFTER_ESC_F); 471 | ep = classend(ms, p); /* points to what is next */ 472 | previous = (s == ms->src) ? 
'\0' : *(s-1); 473 | if (matchbracketclass(uchar(previous), p, ep-1) || 474 | !matchbracketclass(uchar(*s), p, ep-1)) return NULL; 475 | p=ep; goto init; /* else return match(ms, s, ep); */ 476 | } 477 | default: { 478 | if (isdigit(uchar(*(p+1)))) { /* capture results (%0-%9)? */ 479 | s = match_capture(ms, s, uchar(*(p+1))); 480 | if (s == NULL) return NULL; 481 | p+=2; goto init; /* else return match(ms, s, p+2) */ 482 | } 483 | goto dflt; /* case default */ 484 | } 485 | } 486 | } 487 | case '\0': { /* end of pattern */ 488 | return s; /* match succeeded */ 489 | } 490 | case '$': { 491 | if (*(p+1) == '\0') /* is the `$' the last char in pattern? */ 492 | return (s == ms->src_end) ? s : NULL; /* check end of string */ 493 | else goto dflt; 494 | } 495 | default: dflt: { /* it is a pattern item */ 496 | const char *ep = classend(ms, p); /* points to what is next */ 497 | int m = ssrc_end && singlematch(uchar(*s), p, ep); 498 | switch (*ep) { 499 | case '?': { /* optional */ 500 | const char *res; 501 | if (m && ((res=match(ms, s+1, ep+1)) != NULL)) 502 | return res; 503 | p=ep+1; goto init; /* else return match(ms, s, ep+1); */ 504 | } 505 | case '*': { /* 0 or more repetitions */ 506 | return max_expand(ms, s, p, ep); 507 | } 508 | case '+': { /* 1 or more repetitions */ 509 | return (m ? max_expand(ms, s+1, p, ep) : NULL); 510 | } 511 | case '-': { /* 0 or more repetitions (minimum) */ 512 | return min_expand(ms, s, p, ep); 513 | } 514 | default: { 515 | if (!m) return NULL; 516 | s++; p=ep; goto init; /* else return match(ms, s+1, ep); */ 517 | } 518 | } 519 | } 520 | } 521 | } // end of match 522 | 523 | 524 | // functions below written by Nick Gammon ... 
525 | 526 | char MatchState::Match (const char * pattern, unsigned int index) 527 | { 528 | // set up for throwing errors 529 | char rtn = setjmp (regexp_error_return); 530 | 531 | // error return 532 | if (rtn) 533 | return ((result = rtn)); 534 | 535 | if (!src) 536 | error (ERR_NO_TARGET_STRING); 537 | 538 | if (index > src_len) 539 | index = src_len; 540 | 541 | int anchor = (*pattern == '^') ? (pattern++, 1) : 0; 542 | const char *s1 =src + index; 543 | src_end = src + src_len; 544 | 545 | // iterate through target string, character by character unless anchored 546 | do { 547 | const char *res; 548 | level = 0; 549 | if ((res=match(this, s1, pattern)) != NULL) 550 | { 551 | MatchStart = s1 - src; 552 | MatchLength = res - s1; 553 | return (result = REGEXP_MATCHED); 554 | } // end of match at this position 555 | } while (s1++ < src_end && !anchor); 556 | 557 | return (result = REGEXP_NOMATCH); // no match 558 | 559 | } // end of regexp 560 | 561 | // set up the target string 562 | void MatchState::Target (char * s) 563 | { 564 | Target (s, strlen (s)); 565 | } // end of MatchState::Target 566 | 567 | void MatchState::Target (char * s, const unsigned int len) 568 | { 569 | src = s; 570 | src_len = len; 571 | result = REGEXP_NOMATCH; 572 | } // end of MatchState::Target 573 | 574 | // copy the match string to user-supplied buffer 575 | // buffer must be large enough to hold it 576 | char * MatchState::GetMatch (char * s) const 577 | { 578 | if (result != REGEXP_MATCHED) 579 | s [0] = 0; 580 | else 581 | { 582 | memcpy (s, &src [MatchStart], MatchLength); 583 | s [MatchLength] = 0; // null-terminated string 584 | } 585 | return s; 586 | } // end of MatchState::GetMatch 587 | 588 | // get one of the capture strings (zero-relative level) 589 | // buffer must be large enough to hold it 590 | char * MatchState::GetCapture (char * s, const int n) const 591 | { 592 | if (result != REGEXP_MATCHED || n >= level || capture [n].len <= 0) 593 | s [0] = 0; 594 | else 595 | { 
596 | memcpy (s, capture [n].init, capture [n].len); 597 | s [capture [n].len] = 0; // null-terminated string 598 | } 599 | return s; 600 | } // end of MatchState::GetCapture 601 | 602 | // match repeatedly on a string, return count of matches 603 | unsigned int MatchState::MatchCount (const char * pattern) 604 | { 605 | unsigned int count = 0; 606 | 607 | // keep matching until we run out of matches 608 | for (unsigned int index = 0; 609 | Match (pattern, index) > 0 && 610 | index < src_len; // otherwise empty matches loop 611 | count++) 612 | // increment index ready for next time, go forwards at least one byte 613 | index = MatchStart + (MatchLength == 0 ? 1 : MatchLength); 614 | 615 | return count; 616 | 617 | } // end of MatchState::MatchCount 618 | 619 | // match repeatedly on a string, call function f for each match 620 | unsigned int MatchState::GlobalMatch (const char * pattern, GlobalMatchCallback f) 621 | { 622 | unsigned int count = 0; 623 | 624 | // keep matching until we run out of matches 625 | for (unsigned int index = 0; 626 | Match (pattern, index) > 0; 627 | count++) 628 | { 629 | f (& src [MatchStart], MatchLength, *this); 630 | // increment index ready for next time, go forwards at least one byte 631 | index = MatchStart + (MatchLength == 0 ? 1 : MatchLength); 632 | } // end of for each match 633 | return count; 634 | 635 | } // end of MatchState::GlobalMatch 636 | 637 | // match repeatedly on a string, call function f for each match 638 | // f sets replacement string, incorporate replacement and continue 639 | // maximum of max_count replacements if max_count > 0 640 | // replacement string in GlobalReplaceCallback must stay in scope (eg. 
static string or literal) 641 | unsigned int MatchState::GlobalReplace (const char * pattern, GlobalReplaceCallback f, const unsigned int max_count) 642 | { 643 | unsigned int count = 0; 644 | 645 | // keep matching until we run out of matches 646 | for (unsigned int index = 0; 647 | Match (pattern, index) > 0 && // stop when no match 648 | index < src_len && // otherwise empty matches loop 649 | (max_count == 0 || count < max_count); // stop when count reached 650 | count++) 651 | { 652 | // default is to replace with self 653 | const char * replacement = &src [MatchStart]; 654 | unsigned int replacement_length = MatchLength; 655 | 656 | // increment index ready for next time, go forwards at least one byte 657 | if (MatchLength == 0) 658 | index = MatchStart + 1; // go forwards at least one byte or we will loop forever 659 | else 660 | { 661 | // increment index ready for next time, 662 | index = MatchStart + MatchLength; 663 | 664 | // call function to find replacement text 665 | f (&src [MatchStart], MatchLength, replacement, replacement_length, *this); 666 | 667 | // see how much memory we need to move 668 | int lengthDiff = MatchLength - replacement_length; 669 | 670 | // copy the rest of the buffer backwards/forwards to allow for the length difference 671 | memmove (&src [index - lengthDiff], &src [index], src_len - index); 672 | 673 | // copy in the replacement 674 | memmove (&src [MatchStart], replacement, replacement_length); 675 | 676 | // adjust the index for the next search 677 | index -= lengthDiff; 678 | // and the length of the source 679 | src_len -= lengthDiff; 680 | } // end if matching at least one byte 681 | } // end of for each match 682 | 683 | // put a terminating null in 684 | src [src_len] = 0; 685 | return count; 686 | } // end of MatchState::GlobalReplace 687 | 688 | 689 | // match repeatedly on a string, replaces with replacement string for each match 690 | // maximum of max_count replacements if max_count > 0 691 | // replacement string 
in GlobalReplaceCallback must stay in scope (eg. static string or literal) 692 | unsigned int MatchState::GlobalReplace (const char * pattern, const char * replacement, const unsigned int max_count) 693 | { 694 | unsigned int count = 0; 695 | unsigned int replacement_length = strlen (replacement); 696 | 697 | // keep matching until we run out of matches 698 | for (unsigned int index = 0; 699 | Match (pattern, index) > 0 && // stop when no match 700 | index < src_len && // otherwise empty matches loop 701 | (max_count == 0 || count < max_count); // stop when count reached 702 | count++) 703 | { 704 | if (MatchLength == 0) 705 | index = MatchStart + 1; // go forwards at least one byte or we will loop forever 706 | else 707 | { 708 | // increment index ready for next time, 709 | index = MatchStart + MatchLength; 710 | 711 | // see how much memory we need to move 712 | int lengthDiff = MatchLength - replacement_length; 713 | 714 | // copy the rest of the buffer backwards/forwards to allow for the length difference 715 | memmove (&src [index - lengthDiff], &src [index], src_len - index); 716 | 717 | // copy in the replacement 718 | memmove (&src [MatchStart], replacement, replacement_length); 719 | 720 | // adjust the index for the next search 721 | index -= lengthDiff; 722 | // and the length of the source 723 | src_len -= lengthDiff; 724 | } // end if matching at least one byte 725 | 726 | } // end of for each match 727 | 728 | // put a terminating null in 729 | src [src_len] = 0; 730 | return count; 731 | } // end of MatchState::GlobalReplace 732 | 733 | -------------------------------------------------------------------------------- /src/Regexp.h: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Regular-expression matching library for Arduino. 4 | 5 | Written by Nick Gammon. 6 | Date: 30 April 2011 7 | 8 | Heavily based on the Lua regular expression matching library written by Roberto Ierusalimschy. 
/*

 Adapted to run on the Arduino by Nick Gammon.


 VERSION

  Version 1.0 - 30th April 2011 : initial release.
  Version 1.1 -  1st May 2011   : added some helper functions, made more modular.
  Version 1.2 - 19th May 2011   : added more helper functions for replacing etc.


*/

#pragma once


// Maximum of captures we can return.
// Increase if you need more, decrease to save memory.
#define MAXCAPTURES 32

// the "magic escape" character
#define REGEXP_ESC '%'

// special characters that have to be escaped
// (not used in the library, but you might need this)
#define REGEXP_SPECIALS "^$*+?.([%-"

// Result codes from calling regexp:

// we got a match
#define REGEXP_MATCHED 1

// no match, or not attempted to match yet
#define REGEXP_NOMATCH 0

// errors when matching
// (parenthesised, like the CAP_ values below, so they expand safely inside any expression)
#define ERR_INVALID_CAPTURE_INDEX (-1)
#define ERR_INVALID_PATTERN_CAPTURE (-2)
#define ERR_MALFORMED_PATTERN_ENDS_WITH_ESCAPE (-3)
#define ERR_MALFORMED_PATTERN_ENDS_WITH_RH_SQUARE_BRACKET (-4)
#define ERR_UNBALANCED_PATTERN (-5)
#define ERR_TOO_MANY_CAPTURES (-6)
#define ERR_MISSING_LH_SQUARE_BRACKET_AFTER_ESC_F (-7)
#define ERR_NO_TARGET_STRING (-8)


/* macro to `unsign' a character */
#define uchar(c) ((unsigned char)(c))

// special capture "lengths"
#define CAP_UNFINISHED (-1)
#define CAP_POSITION (-2)

class MatchState;  // forward definition for the callback routines

// Signature of the function passed to GlobalMatch; it is called for each match.
typedef void (*GlobalMatchCallback)   (const char * match,          // matching string (not null-terminated)
                                       const unsigned int length,   // length of matching string
                                       const MatchState & ms);      // MatchState in use (to get captures)

// Signature of the function passed to the callback form of GlobalReplace.
// replacement and replacement_length are non-const references, i.e. values the callback can assign.
typedef void (*GlobalReplaceCallback) (const char * match,          // matching string (not null-terminated)
                                       const unsigned int length,   // length of matching string
                                       const char * & replacement,
                                       unsigned int & replacement_length,
                                       const MatchState & ms);      // MatchState in use (to get captures)

// Holds the target string, the outcome of the most recent match and its captures.
typedef class MatchState {
private:

  char result;  // result of last Match call

public:

  // Every constructor gives all scalar members a defined value, so fields such as level and
  // MatchLength can be read safely before the first Match call. The capture array itself is
  // left alone; level == 0 says that none of its entries are in use.

  // constructor
  MatchState ()
    : result (REGEXP_NOMATCH), src (0), src_len (0), src_end (0),
      MatchStart (0), MatchLength (0), level (0)
    {}

  // constructor from null-terminated string
  MatchState (char * s)
    : result (REGEXP_NOMATCH), src (0), src_len (0), src_end (0),
      MatchStart (0), MatchLength (0), level (0)
    { Target (s); }

  // constructor from string and length
  MatchState (char * s, const unsigned int len)
    : result (REGEXP_NOMATCH), src (0), src_len (0), src_end (0),
      MatchStart (0), MatchLength (0), level (0)
    { Target (s, len); }

  // supply these two:
  char *src;              /* source string */
  unsigned int src_len;   /* length of source string */

  // used internally
  char *src_end;          /* end of source string */

  // returned fields:

  unsigned int MatchStart;    // zero-relative offset of start of match
  unsigned int MatchLength;   // length of match

  int level;   /* total number of captures in array below (finished or unfinished) */

  // capture addresses and lengths
  struct {
    const char *init;
    int len;              // might be CAP_UNFINISHED or CAP_POSITION
  } capture[MAXCAPTURES];

  // add target string, null-terminated
  void Target (char * s);
  // add target string, with specified length
  void Target (char * s, const unsigned int len);
  // do a match on a supplied pattern and zero-relative starting point
  char Match (const char * pattern, unsigned int index = 0);
  // return the matching string
  char * GetMatch (char * s) const;
  // return capture string n
  char * GetCapture (char * s, const int n) const;
  // get result of previous match
  char GetResult () const { return result; }

  // count number of matches on a supplied pattern
  unsigned int MatchCount (const char * pattern);
  // iterate with a supplied pattern, call function f for each match
  // returns count of matches
  unsigned int GlobalMatch (const char * pattern, GlobalMatchCallback f);
  // iterate with a supplied pattern, call function f for each match, maximum of max_count matches if max_count > 0
  // returns count of replacements
  unsigned int GlobalReplace (const char * pattern, GlobalReplaceCallback f, const unsigned int max_count = 0);
  // iterate with a supplied pattern, replaces with replacement string, maximum of max_count matches if max_count > 0
  // returns count of replacements
  unsigned int GlobalReplace (const char * pattern, const char * replacement, const unsigned int max_count = 0);

} MatchState;