├── README.md └── emacs-24.2-regex.patch /README.md: -------------------------------------------------------------------------------- 1 | emacs-regex-lookaround 2 | ====================== 3 | 4 | This patch adds lookahead and lookbehind assertions to Emacs 24.2 regular expressions. 5 | 6 | cd emacs 7 | patch -b -p0 < ../emacs-24.2-regex.patch 8 | 9 | Then compile Emacs. 10 | 11 | 12 | This patch is based on http://emacs.1067599.n5.nabble.com/Patch-for-lookaround-assertion-in-regexp-td121057.html#a23863009 13 | which produced errors when applied to Emacs 24.2; this version fixes that with small changes. 14 | -------------------------------------------------------------------------------- /emacs-24.2-regex.patch: -------------------------------------------------------------------------------- 1 | --- src/regex.c 2012-08-23 13:33:42.000000000 +0800 2 | +++ src/regex.c 2013-02-04 21:15:08.320351000 +0800 3 | @@ -673,8 +673,15 @@ 4 | syntaxspec, 5 | 6 | /* Matches any character whose syntax is not that specified. */ 7 | - notsyntaxspec 8 | + notsyntaxspec, 9 | 10 | + lookahead, 11 | + lookahead_not, 12 | + lookbehind, 13 | + lookbehind_not, 14 | + lookaround_succeed, 15 | + lookaround_fail 16 | + 17 | #ifdef emacs 18 | ,before_dot, /* Succeeds if before point. */ 19 | at_dot, /* Succeeds if at point. 
*/ 20 | @@ -966,6 +973,36 @@ 21 | fprintf (stderr, "/stop_memory/%d", *p++); 22 | break; 23 | 24 | + case lookahead: 25 | + extract_number_and_incr (&mcnt, &p); 26 | + fprintf (stderr, "/lookahead/%d", mcnt); 27 | + break; 28 | + 29 | + case lookahead_not: 30 | + extract_number_and_incr (&mcnt, &p); 31 | + fprintf (stderr, "/lookahead_not/%d", mcnt); 32 | + break; 33 | + 34 | + case lookbehind: 35 | + extract_number_and_incr (&mcnt, &p); 36 | + extract_number_and_incr (&mcnt2, &p); 37 | + fprintf (stderr, "/lookbehind/%d/%d", mcnt, mcnt2); 38 | + break; 39 | + 40 | + case lookbehind_not: 41 | + extract_number_and_incr (&mcnt, &p); 42 | + extract_number_and_incr (&mcnt2, &p); 43 | + fprintf (stderr, "/lookbehind_not/%d/%d", mcnt, mcnt2); 44 | + break; 45 | + 46 | + case lookaround_succeed: 47 | + fprintf (stderr, "/lookaround_succeed"); 48 | + break; 49 | + 50 | + case lookaround_fail: 51 | + fprintf (stderr, "/lookaround_fail"); 52 | + break; 53 | + 54 | case duplicate: 55 | fprintf (stderr, "/duplicate/%d", *p++); 56 | break; 57 | @@ -1529,11 +1566,17 @@ 58 | } \ 59 | else \ 60 | { \ 61 | - regend[pfreg] = POP_FAILURE_POINTER (); \ 62 | - regstart[pfreg] = POP_FAILURE_POINTER (); \ 63 | - DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \ 64 | - pfreg, regstart[pfreg], regend[pfreg]); \ 65 | - } \ 66 | + re_char *start, *end; /* Both pointers are popped even when the registers are not restored. */ \ 67 | + end = POP_FAILURE_POINTER (); \ 68 | + start = POP_FAILURE_POINTER (); \ 69 | + if (!discard_saved_regs) /* Restore into the register this macro popped (pfreg). */ \ 70 | + { \ 71 | + regstart[pfreg] = start; \ 72 | + regend[pfreg] = end; \ 73 | + DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \ 74 | + pfreg, regstart[pfreg], regend[pfreg]); \ 75 | + } \ 76 | + } \ 77 | } while (0) 78 | 79 | /* Check that we are not stuck in an infinite loop. 
*/ 80 | @@ -1652,6 +1695,29 @@ 81 | DEBUG_STATEMENT (nfailure_points_popped++); \ 82 | } while (0) /* POP_FAILURE_POINT */ 83 | 84 | +#define FINISH_LOOKAROUND() \ 85 | + do { /* Pop failure frames until a lookaround frame is found, then resume at its saved string position. */ \ 86 | + re_char *str, *pat; \ 87 | + re_opcode_t op; \ 88 | + discard_saved_regs = 1; \ 89 | + while (!FAIL_STACK_EMPTY ()) \ 90 | + { \ 91 | + POP_FAILURE_POINT (str, pat); \ 92 | + op = (re_opcode_t) *pat; \ 93 | + if (op == lookahead \ 94 | + || op == lookahead_not \ 95 | + || op == lookbehind \ 96 | + || op == lookbehind_not) \ 97 | + { \ 98 | + d = str; \ 99 | + dend = ((d >= string1 && d <= end1) \ 100 | + ? end_match_1 : end_match_2); \ 101 | + break; \ 102 | + } \ 103 | + } \ 104 | + discard_saved_regs = 0; \ 105 | + } while (0) 106 | + 107 | 108 | 109 | /* Registers are set to a sentinel when they haven't yet matched. */ 110 | @@ -1841,6 +1907,7 @@ 111 | pattern_offset_t fixup_alt_jump; 112 | pattern_offset_t laststart_offset; 113 | regnum_t regnum; 114 | + int lookaround; 115 | } compile_stack_elt_t; 116 | 117 | 118 | @@ -2433,6 +2500,8 @@ 119 | compile_stack, 120 | regnum_t regnum)); 121 | 122 | +static int exact_chars_in_pattern_buffer _RE_ARGS ((struct re_pattern_buffer *bufp, re_char *p, re_char *pend)); 123 | + 124 | /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 125 | Returns one of error codes defined in `regex.h', or zero for success. 126 | 127 | @@ -3168,6 +3237,7 @@ 128 | handle_open: 129 | { 130 | int shy = 0; 131 | + int lookaround = 0; 132 | regnum_t regnum = 0; 133 | if (p+1 < pend) 134 | { 135 | @@ -3189,6 +3259,27 @@ 136 | case '1': case '2': case '3': case '4': 137 | case '5': case '6': case '7': case '8': case '9': 138 | regnum = 10*regnum + (c - '0'); break; 139 | + case '=': 140 | + /* Positive lookahead assertion. */ 141 | + shy = lookaround = 1; 142 | + break; 143 | + case '!': 144 | + /* Negative lookahead assertion. 
*/ 145 | + shy = lookaround = 2; 146 | + break; 147 | + case '<': 148 | + { 149 | + PATFETCH (c); 150 | + if (c == '=') 151 | + /* Positive lookbehind assertion. */ 152 | + shy = lookaround = -1; 153 | + else if (c == '!') 154 | + /* Negative lookbehind assertion. */ 155 | + shy = lookaround = -2; 156 | + else 157 | + FREE_STACK_RETURN (REG_BADPAT); 158 | + } 159 | + break; 160 | default: 161 | /* Only (?:...) is supported right now. */ 162 | FREE_STACK_RETURN (REG_BADPAT); 163 | @@ -3235,7 +3326,8 @@ 164 | = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; 165 | COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; 166 | COMPILE_STACK_TOP.regnum = regnum; 167 | - 168 | + COMPILE_STACK_TOP.lookaround = lookaround; 169 | + 170 | /* Do not push a start_memory for groups beyond the last one 171 | we can represent in the compiled pattern. */ 172 | if (regnum <= MAX_REGNUM && regnum > 0) 173 | @@ -3284,7 +3376,8 @@ 174 | later groups should continue to be numbered higher, 175 | as in `(ab)c(de)' -- the second group is #2. */ 176 | regnum_t regnum; 177 | - 178 | + int lookaround; 179 | + 180 | compile_stack.avail--; 181 | begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; 182 | fixup_alt_jump 183 | @@ -3296,13 +3389,40 @@ 184 | /* If we've reached MAX_REGNUM groups, then this open 185 | won't actually generate any code, so we'll have to 186 | clear pending_exact explicitly. */ 187 | + lookaround = COMPILE_STACK_TOP.lookaround; 188 | pending_exact = 0; 189 | 190 | /* We're at the end of the group, so now we know how many 191 | groups were inside this one. */ 192 | if (regnum <= MAX_REGNUM && regnum > 0) 193 | BUF_PUSH_2 (stop_memory, regnum); 194 | - } 195 | + else if (lookaround) 196 | + { 197 | + if (lookaround > 0) 198 | + { 199 | + /* Positive/negative lookahead assertion. */ 200 | + GET_BUFFER_SPACE (3); 201 | + INSERT_JUMP (lookaround == 1 ? 
lookahead : lookahead_not, laststart, b + 4); 202 | + b += 3; 203 | + } 204 | + else 205 | + { 206 | + /* Positive/negative lookbehind assertion. */ 207 | + int count = exact_chars_in_pattern_buffer (bufp, laststart, b); 208 | + if (count == -1) /* variable length */ 209 | + FREE_STACK_RETURN (REG_BADPAT); 210 | + 211 | + GET_BUFFER_SPACE (5); 212 | + INSERT_JUMP2 (lookaround == -1 ? lookbehind : lookbehind_not, laststart, b + 6, count); 213 | + b += 5; 214 | + } 215 | + 216 | + /* Negative form. */ 217 | + if (lookaround > 1 || lookaround < -1) 218 | + BUF_PUSH (lookaround_fail); 219 | + BUF_PUSH (lookaround_succeed); 220 | + } 221 | + } 222 | break; 223 | 224 | 225 | @@ -3840,10 +3960,16 @@ 226 | /* After an alternative? */ 227 | || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)) 228 | /* After a shy subexpression? */ 229 | - || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern 230 | - && prev[-1] == '?' && prev[-2] == '(' 231 | - && (syntax & RE_NO_BK_PARENS 232 | - || (prev - 3 >= pattern && prev[-3] == '\\'))); 233 | + || ((syntax & RE_SHY_GROUPS) 234 | + && ((prev - 2 >= pattern 235 | + && prev[-1] == '?' && prev[-2] == '(' 236 | + && (syntax & RE_NO_BK_PARENS 237 | + || (prev - 3 >= pattern && prev[-3] == '\\'))) 238 | + || (prev - 3 >= pattern 239 | + && (*prev == '=' || *prev == '!') 240 | + && prev[-1] == '<' && prev[-2] == '?' && prev[-3] == '(' 241 | + && (syntax & RE_NO_BK_PARENS 242 | + || (prev - 4 >= pattern && prev[-4] == '\\'))))); 243 | } 244 | 245 | 246 | @@ -4079,7 +4205,13 @@ 247 | match_any_multibyte_characters = true; 248 | } 249 | break; 250 | - 251 | + case lookahead: 252 | + case lookahead_not: 253 | + case lookbehind: 254 | + case lookbehind_not: 255 | + if (!fastmap) break; 256 | + return -1; 257 | + 258 | /* All cases after this match the empty string. These end with 259 | `continue'. 
*/ 260 | 261 | @@ -4706,6 +4838,93 @@ 262 | return p; 263 | } 264 | 265 | +static int 266 | +exact_chars_in_pattern_buffer (bufp, p, pend) 267 | + struct re_pattern_buffer *bufp; 268 | + re_char *p, *pend; 269 | +{ 270 | + int count = 0; 271 | + while (p < pend) 272 | + { 273 | + switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 274 | + { 275 | + case exactn: 276 | + { 277 | + int mcnt = *p++; 278 | + int buf_charlen; 279 | + while (mcnt > 0) { 280 | + STRING_CHAR_AND_LENGTH (p, buf_charlen); 281 | + p += buf_charlen; 282 | + mcnt -= buf_charlen; 283 | + count++; 284 | + } 285 | + } 286 | + break; 287 | + case start_memory: 288 | + case stop_memory: 289 | + p++; 290 | + break; 291 | +#ifdef emacs 292 | + case categoryspec: 293 | + case notcategoryspec: 294 | +#endif /* emacs */ 295 | + case syntaxspec: 296 | + case notsyntaxspec: 297 | + p++; 298 | + case anychar: 299 | + count++; 300 | + break; 301 | + 302 | + case charset: 303 | + case charset_not: 304 | + if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1)) 305 | + { 306 | + int mcnt; 307 | + p = CHARSET_RANGE_TABLE (p - 1); 308 | + EXTRACT_NUMBER_AND_INCR (mcnt, p); 309 | + p = CHARSET_RANGE_TABLE_END (p, mcnt); 310 | + } 311 | + else 312 | + p += 1 + CHARSET_BITMAP_SIZE (p - 1); 313 | + count++; 314 | + break; 315 | + 316 | +#ifdef emacs 317 | + case before_dot: 318 | + case at_dot: 319 | + case after_dot: 320 | +#endif /* emacs */ 321 | + case no_op: 322 | + case begline: 323 | + case endline: 324 | + case begbuf: 325 | + case endbuf: 326 | + case wordbound: 327 | + case notwordbound: 328 | + case wordbeg: 329 | + case wordend: 330 | + case symbeg: 331 | + case symend: 332 | + /* Zero width. */ 333 | + continue; 334 | + case lookahead: 335 | + case lookahead_not: 336 | + case lookbehind: 337 | + case lookbehind_not: 338 | + /* Skip to lookaround_succeed. 
*/ 339 | + while (p < pend) 340 | + { 341 | + if ((re_opcode_t) *p++ == lookaround_succeed) 342 | + break; 343 | + } 344 | + break; 345 | + default: 346 | + return -1; 347 | + } 348 | + } 349 | + return count; 350 | +} 351 | + 352 | /* Non-zero if "p1 matches something" implies "p2 fails". */ 353 | static int 354 | mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2) 355 | @@ -5049,6 +5268,9 @@ 356 | re_char **best_regstart, **best_regend; 357 | #endif 358 | 359 | + /* Discard a saved register from the stack. */ 360 | + boolean discard_saved_regs = 0; 361 | + 362 | /* Logically, this is `best_regend[0]'. But we don't want to have to 363 | allocate space for that if we're not allocating space for anything 364 | else (see below). Also, we never need info about register 0 for 365 | @@ -5621,6 +5843,77 @@ 366 | p += 1; 367 | break; 368 | 369 | + case lookahead: 370 | + case lookahead_not: 371 | + DEBUG_PRINT1 ((re_opcode_t) *(p - 1) == lookahead ? "EXECUTING lookahead.\n" : "EXECUTING lookahead_not.\n"); 372 | + 373 | + p += 2; 374 | + PUSH_FAILURE_POINT (p - 3, d); 375 | + break; 376 | + 377 | + case lookbehind: 378 | + case lookbehind_not: 379 | + { 380 | + int mcnt, count; 381 | + boolean not = (re_opcode_t) *(p - 1) != lookbehind; 382 | + 383 | + EXTRACT_NUMBER_AND_INCR (mcnt, p); 384 | + EXTRACT_NUMBER_AND_INCR (count, p); 385 | + 386 | + DEBUG_PRINT2 (not 387 | + ? "EXECUTING lookbehind_not %d.\n" 388 | + : "EXECUTING lookbehind %d.\n", count); 389 | + 390 | + dfail = d; 391 | + while (d != string1 && count > 0) 392 | + { 393 | + if (d == string2) 394 | + { 395 | + if (!string1) 396 | + break; 397 | + d = end1; 398 | + dend = end_match_1; 399 | + } 400 | + 401 | + if (target_multibyte) 402 | + { 403 | + re_char *dhead = (d >= string1 && d <= end1) ? 
string1 : string2; 404 | + PREV_CHAR_BOUNDARY (d, dhead); 405 | + } 406 | + else 407 | + d--; 408 | + count--; 409 | + } 410 | + 411 | + if (count > 0) 412 | + { 413 | + if (not) 414 | + { 415 | + /* There is not enough string left to match. 416 | + So just let the assertion succeed here. */ 417 | + d = dfail; 418 | + p = p - 2 + mcnt; 419 | + break; 420 | + } 421 | + else 422 | + goto fail; 423 | + } 424 | + 425 | + PUSH_FAILURE_POINT (p - 5, dfail); 426 | + } 427 | + break; 428 | + 429 | + case lookaround_succeed: 430 | + DEBUG_PRINT1 ("EXECUTING lookaround_succeed.\n"); 431 | + 432 | + FINISH_LOOKAROUND(); 433 | + break; 434 | + 435 | + case lookaround_fail: 436 | + DEBUG_PRINT1 ("EXECUTING lookaround_fail.\n"); 437 | + 438 | + FINISH_LOOKAROUND(); 439 | + goto fail; 440 | 441 | /* \ has been turned into a `duplicate' command which is 442 | followed by the numeric value of as the register number. */ 443 | @@ -6269,12 +6562,16 @@ 444 | case on_failure_jump_loop: 445 | case on_failure_jump: 446 | case succeed_n: 447 | + case lookahead_not: 448 | + case lookbehind_not: 449 | d = str; 450 | continue_failure_jump: 451 | EXTRACT_NUMBER_AND_INCR (mcnt, pat); 452 | p = pat + mcnt; 453 | break; 454 | 455 | + case lookahead: 456 | + case lookbehind: 457 | case no_op: 458 | /* A special frame used for nastyloops. */ 459 | goto fail; 460 | --------------------------------------------------------------------------------