├── LICENSE.txt ├── README.md ├── index.html ├── package.json └── src ├── pcre.css └── pcre.js /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021-2025, Xavier G. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeMirror PCRE mode 2 | 3 | This is a [CodeMirror](https://codemirror.net/) mode that brings syntax highlighting for [Perl Compatible Regular Expressions (PCRE)](https://www.pcre.org/). 4 | 5 | ## How to use 6 | ### Basic use 7 | Load `pcre.js` and `pcre.css` at appropriate locations in your HTML structure. 8 | Specify `mode: 'pcre'` when creating your CodeMirror instance. 9 | 10 | ### Configuration 11 | codemirror-mode-pcre supports extended mode (`x` flag) and actually enables it by default. This can be turned off by passing `extended: false` when creating the CodeMirror instance. 12 | 13 | ### Theming 14 | This mode does not leverage CodeMirror's default tokens (they are not well suited to regular expressions). Consequently, if you use a theme other than the default one, you will likely want to write your own `pcre.css` file. 15 | 16 | **Breaking change:** starting with version 2.0.0, the `cm-end-group` style is no longer supported and should no longer be used. It is replaced with `cm-start-group`. 17 | 18 | ### Nesting 19 | codemirror-mode-pcre can be nested within another mode, i.e. it can highlight regular expressions for another mode. 20 | This requires adjusting the other mode though. 21 | See the demo page for an example of such nesting. 22 | 23 | ## Non-features 24 | codemirror-mode-pcre does **not** offer: 25 | - completion (e.g. suggesting POSIX class names or script names for `\p` and `\P`); 26 | - tooltips reflecting what the various parts of an expression actually mean. 27 | 28 | Those may come in the future though. 29 | 30 | ## License 31 | Like the PCRE library, this mode is released under the 3-clause BSD license. 
32 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | CodeMirror PCRE mode 7 | 13 | 14 | 21 | 22 | 23 |

CodeMirror PCRE mode

24 |

25 | This is a CodeMirror mode that brings 26 | syntax highlighting for Perl Compatible Regular Expressions (PCRE). 27 |

28 |

29 | MIME types defined: 30 |

text/x-regex
text/x-pcre-regex

34 |

35 |

36 | Options: 37 |

extended: boolean: initial state of the 'x' flag; default: true.

40 |

41 |

42 | Table of contents: 43 |

Use as nested mode 45 |
man pcresyntax 46 |
Examples from man pcrepattern 47 |
Other details 48 |

49 |

50 |

Use as nested mode

51 |

52 | Below are demonstrations of how the PCRE mode can be used to highlight regular expressions within other languages. 53 | This first example is an nginx configuration snippet: 54 |

55 | 70 |

This second example is a simple list of one-line regexes with comments:

71 | 79 |

man pcresyntax

80 |

81 | This is a slightly adjusted copy of `man pcresyntax`. This man page reflects most PCRE syntactic structures in a 82 | colourful way thanks to CodeMirror and the PCRE mode. 83 |

84 |

85 | # PCRESYNTAX(3)                                  Library Functions Manual                                 PCRESYNTAX(3)
 86 | #
 87 | # NAME
 88 | #        PCRE - Perl-compatible regular expressions
 89 | #
 90 | # PCRE REGULAR EXPRESSION SYNTAX SUMMARY
 91 | #
 92 | #        The  full  syntax  and  semantics  of  the regular expressions that are supported by PCRE are described in the
 93 | #        pcrepattern documentation. This document contains a quick-reference summary of the syntax.
 94 | #
 95 | # QUOTING
 96 | 
 97 |           \x         where x is non-alphanumeric is a literal x
 98 |           \Q...\E    treat enclosed characters as literal
 99 | 
100 | # CHARACTERS
101 | 
102 |           \a       # alarm, that is, the BEL character (hex 07)
103 |           \cx      # "control-x", where x is any ASCII character
104 |           \e       # escape (hex 1B)
105 |           \f       # form feed (hex 0C)
106 |           \n       # newline (hex 0A)
107 |           \r       # carriage return (hex 0D)
108 |           \t       # tab (hex 09)
109 |           \077     # character with octal code 0dd
110 |           \777     # character with octal code ddd, or backreference
111 |           \o{777}  # character with octal code ddd..
112 |           \xff     # character with hex code hh
113 |           \x{fffe} # character with hex code hhh..
114 | 
115 | #        Note that \0dd is always an octal code, and that \8 and \9 are the literal characters "8" and "9".
116 | #
117 | # CHARACTER TYPES
118 | 
119 |           .        # any character except newline;
120 |                    #   in dotall mode, any character whatsoever
121 |           \C       # one data unit, even in UTF mode (best avoided)
122 |           \d       # a decimal digit
123 |           \D       # a character that is not a decimal digit
124 |           \h       # a horizontal white space character
125 |           \H       # a character that is not a horizontal white space character
126 |           \N       # a character that is not a newline
127 |           \p{Pi}   # a character with the xx property
128 |           \P{Pi}   # a character without the xx property
129 |           \R       # a newline sequence
130 |           \s       # a white space character
131 |           \S       # a character that is not a white space character
132 |           \v       # a vertical white space character
133 |           \V       # a character that is not a vertical white space character
134 |           \w       # a "word" character
135 |           \W       # a "non-word" character
136 |           \X       # a Unicode extended grapheme cluster
137 | 
138 | #        By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode or in the 16-bit and 32-bit
139 | #        libraries.  However,  if  locale-specific  matching  is happening, \s and \w may also match characters with code
140 | #        points in the range 128-255. If the PCRE_UCP option is set, the behaviour of these escape sequences is changed
141 | #        to use Unicode properties and they match many more characters.
142 | 
143 | # GENERAL CATEGORY PROPERTIES FOR \p and \P
144 | 
145 |        \p{C}       # Other
146 |        \p{Cc}      # Control
147 |        \p{Cf}      # Format
148 |        \p{Cn}      # Unassigned
149 |        \p{Co}      # Private use
150 |        \p{Cs}      # Surrogate
151 | 
152 |        \p{L}       # Letter
153 |        \p{Ll}      # Lower case letter
154 |        \p{Lm}      # Modifier letter
155 |        \p{Lo}      # Other letter
156 |        \p{Lt}      # Title case letter
157 |        \p{Lu}      # Upper case letter
158 |        \p{L&}      # Ll, Lu, or Lt
159 | 
160 |        \p{M}       # Mark
161 |        \p{Mc}      # Spacing mark
162 |        \p{Me}      # Enclosing mark
163 |        \p{Mn}      # Non-spacing mark
164 | 
165 |        \p{N}       # Number
166 |        \p{Nd}      # Decimal number
167 |        \p{Nl}      # Letter number
168 |        \p{No}      # Other number
169 | 
170 |        \p{P}       # Punctuation
171 |        \p{Pc}      # Connector punctuation
172 |        \p{Pd}      # Dash punctuation
173 |        \p{Pe}      # Close punctuation
174 |        \p{Pf}      # Final punctuation
175 |        \p{Pi}      # Initial punctuation
176 |        \p{Po}      # Other punctuation
177 |        \p{Ps}      # Open punctuation
178 | 
179 |        \p{S}       # Symbol
180 |        \p{Sc}      # Currency symbol
181 |        \p{Sk}      # Modifier symbol
182 |        \p{Sm}      # Mathematical symbol
183 |        \p{So}      # Other symbol
184 | 
185 |        \p{Z}       # Separator
186 |        \p{Zl}      # Line separator
187 |        \p{Zp}      # Paragraph separator
188 |        \p{Zs}      # Space separator
189 | 
190 | # PCRE SPECIAL CATEGORY PROPERTIES FOR \p and \P
191 | 
192 |        \p{Xan}     # Alphanumeric: union of properties L and N
193 |        \p{Xps}     # POSIX space: property Z or tab, NL, VT, FF, CR
194 |        \p{Xsp}     # Perl space: property Z or tab, NL, VT, FF, CR
195 |        \p{Xuc}     # Universally-named character: one that can be
196 |                    #   represented by a Universal Character Name
197 |        \p{Xwd}     # Perl word: property Xan or underscore
198 | 
199 | #        Perl  and  POSIX  space  are  now  the same. Perl added VT to its space character set at release 5.18 and PCRE
200 | #        changed at release 8.34.
201 | 
202 | # SCRIPT NAMES FOR \p AND \P
203 | 
204 |        \p{Arabic}, \p{Armenian}, \p{Avestan}, \p{Balinese}, \p{Bamum}, \p{Bassa_Vah}, \p{Batak}, \p{Bengali},
205 |        \p{Bopomofo}, \p{Brahmi}, \p{Braille}, \p{Buginese}, \p{Buhid}, \p{Canadian_Aboriginal}, \p{Carian},
206 |        \p{Caucasian_Albanian}, \p{Chakma}, \p{Cham}, \p{Cherokee}, \p{Common}, \p{Coptic}, \p{Cuneiform},
207 |        \p{Cypriot}, \p{Cyrillic}, \p{Deseret}, \p{Devanagari}, \p{Duployan}, \p{Egyptian_Hieroglyphs}, \p{Elbasan},
208 |        \p{Ethiopic}, \p{Georgian}, \p{Glagolitic}, \p{Gothic}, \p{Grantha}, \p{Greek}, \p{Gujarati}, \p{Gurmukhi},
209 |        \p{Han}, \p{Hangul}, \p{Hanunoo}, \p{Hebrew}, \p{Hiragana}, \p{Imperial_Aramaic}, \p{Inherited},
210 |        \p{Inscriptional_Pahlavi}, \p{Inscriptional_Parthian}, \p{Javanese}, \p{Kaithi}, \p{Kannada}, \p{Katakana},
211 |        \p{Kayah_Li}, \p{Kharoshthi}, \p{Khmer}, \p{Khojki}, \p{Khudawadi}, \p{Lao}, \p{Latin}, \p{Lepcha}, \p{Limbu},
212 |        \p{Linear_A}, \p{Linear_B}, \p{Lisu}, \p{Lycian}, \p{Lydian}, \p{Mahajani}, \p{Malayalam}, \p{Mandaic},
213 |        \p{Manichaean}, \p{Meetei_Mayek}, \p{Mende_Kikakui}, \p{Meroitic_Cursive}, \p{Meroitic_Hieroglyphs}, \p{Miao},
214 |        \p{Modi}, \p{Mongolian}, \p{Mro}, \p{Myanmar}, \p{Nabataean}, \p{New_Tai_Lue}, \p{Nko}, \p{Ogham},
215 |        \p{Ol_Chiki}, \p{Old_Italic}, \p{Old_North_Arabian}, \p{Old_Permic}, \p{Old_Persian}, \p{Old_South_Arabian},
216 |        \p{Old_Turkic}, \p{Oriya}, \p{Osmanya}, \p{Pahawh_Hmong}, \p{Palmyrene}, \p{Pau_Cin_Hau}, \p{Phags_Pa},
217 |        \p{Phoenician}, \p{Psalter_Pahlavi}, \p{Rejang}, \p{Runic}, \p{Samaritan}, \p{Saurashtra}, \p{Sharada},
218 |        \p{Shavian}, \p{Siddham}, \p{Sinhala}, \p{Sora_Sompeng}, \p{Sundanese}, \p{Syloti_Nagri}, \p{Syriac},
219 |        \p{Tagalog}, \p{Tagbanwa}, \p{Tai_Le}, \p{Tai_Tham}, \p{Tai_Viet}, \p{Takri}, \p{Tamil}, \p{Telugu},
220 |        \p{Thaana}, \p{Thai}, \p{Tibetan}, \p{Tifinagh}, \p{Tirhuta}, \p{Ugaritic}, \p{Vai}, \p{Warang_Citi}, \p{Yi}.
221 | 
222 | # CHARACTER CLASSES
223 | 
224 |        [...]        # positive character class
225 |        [^...]       # negative character class
226 |        [x-y]        # range (can be used for hex characters)
227 |        [[:word:]]   # positive POSIX named set
228 |        [[:^word:]]  # negative POSIX named set
229 | 
230 |        [[:alnum:]]  # alphanumeric
231 |        [[:alpha:]]  # alphabetic
232 |        [[:ascii:]]  # 0-127
233 |        [[:blank:]]  # space or tab
234 |        [[:cntrl:]]  # control character
235 |        [[:digit:]]  # decimal digit
236 |        [[:graph:]]  # printing, excluding space
237 |        [[:lower:]]  # lower case letter
238 |        [[:print:]]  # printing, including space
239 |        [[:punct:]]  # printing, excluding alphanumeric
240 |        [[:space:]]  # white space
241 |        [[:upper:]]  # upper case letter
242 |        [[:word:]]   # same as \w
243 |        [[:xdigit:]] # hexadecimal digit
244 | 
245 | #        In  PCRE,  POSIX  character set names recognize only ASCII characters by default, but some of them use Unicode
246 | #        properties if PCRE_UCP is set. You can use \Q...\E inside a character class.
247 | 
248 | # QUANTIFIERS
249 | 
250 |           ?         # 0 or 1, greedy
251 |           ?+        # 0 or 1, possessive
252 |           ??        # 0 or 1, lazy
253 |           *         # 0 or more, greedy
254 |           *+        # 0 or more, possessive
255 |           *?        # 0 or more, lazy
256 |           +         # 1 or more, greedy
257 |           ++        # 1 or more, possessive
258 |           +?        # 1 or more, lazy
259 |           {1}       # exactly n
260 |           {1,6}     # at least n, no more than m, greedy
261 |           {1,6}+    # at least n, no more than m, possessive
262 |           {1,6}?    # at least n, no more than m, lazy
263 |           {1,}      # n or more, greedy
264 |           {1,}+     # n or more, possessive
265 |           {1,}?     # n or more, lazy
266 | 
267 | # ANCHORS AND SIMPLE ASSERTIONS
268 | 
269 |           \b        # word boundary
270 |           \B        # not a word boundary
271 |           ^         # start of subject
272 |                     #  also after internal newline in multiline mode
273 |           \A        # start of subject
274 |           $         # end of subject
275 |                     #  also before newline at end of subject
276 |                     #  also before internal newline in multiline mode
277 |           \Z        # end of subject
278 |                     #  also before newline at end of subject
279 |           \z        # end of subject
280 |           \G        # first matching position in subject
281 | 
282 | # MATCH POINT RESET
283 | 
284 |           \K        # reset start of match
285 | 
286 | #        \K is honoured in positive assertions, but ignored in negative ones.
287 | 
288 | # ALTERNATION
289 | 
290 |           expr|expr|expr...
291 | 
292 | # CAPTURING
293 | 
294 |           (...)         # capturing group
295 |           (?<name>...)  # named capturing group (Perl)
296 |           (?'name'...)  # named capturing group (Perl)
297 |           (?P<name>...) # named capturing group (Python)
298 |           (?:...)       # non-capturing group
299 |           (?|...)       # non-capturing group; reset group numbers for
300 |                         #  capturing groups in each alternative
301 | 
302 | # ATOMIC GROUPS
303 | 
304 |           (?>...)       # atomic, non-capturing group
305 | 
306 | # COMMENT
307 | 
308 |           (?#....)      # comment (not nestable)
309 | 
310 | # OPTION SETTING
311 | 
312 |           (?i)          # caseless
313 |           (?J)          # allow duplicate names
314 |           (?m)          # multiline
315 |           (?s)          # single line (dotall)
316 |           (?U)          # default ungreedy (lazy)
317 |           (?x)          # extended (ignore white space)
318 |           (?-iUs)       # unset option(s)
319 | 
320 | #        The following are recognized only at the very start of a pattern or after one of the  newline  or  \R  options
321 | #        with similar syntax. More than one of them may appear.
322 | 
323 |           (*LIMIT_MATCH=4)     # set the match limit to d (decimal number)
324 |           (*LIMIT_RECURSION=3) # set the recursion limit to d (decimal number)
325 |           (*NO_AUTO_POSSESS)   # no auto-possessification (PCRE_NO_AUTO_POSSESS)
326 |           (*NO_START_OPT)      # no start-match optimization (PCRE_NO_START_OPTIMIZE)
327 |           (*UTF8)              # set UTF-8 mode: 8-bit library (PCRE_UTF8)
328 |           (*UTF16)             # set UTF-16 mode: 16-bit library (PCRE_UTF16)
329 |           (*UTF32)             # set UTF-32 mode: 32-bit library (PCRE_UTF32)
330 |           (*UTF)               # set appropriate UTF mode for the library in use
331 |           (*UCP)               # set PCRE_UCP (use Unicode properties for \d etc)
332 | 
333 | #        Note  that  LIMIT_MATCH  and  LIMIT_RECURSION  can  only  reduce  the value of the limits set by the caller of
334 | #        pcre_exec(), not increase them.
335 | 
336 | # NEWLINE CONVENTION
337 | 
338 | #        These are recognized only at the very start of the pattern or after option settings with a similar syntax.
339 | 
340 |           (*CR)         # carriage return only
341 |           (*LF)         # linefeed only
342 |           (*CRLF)       # carriage return followed by linefeed
343 |           (*ANYCRLF)    # all three of the above
344 |           (*ANY)        # any Unicode newline sequence
345 | 
346 | # WHAT \R MATCHES
347 | 
348 | #        These are recognized only at the very start of the pattern or after option setting with a similar syntax.
349 | 
350 |           (*BSR_ANYCRLF) # CR, LF, or CRLF
351 |           (*BSR_UNICODE) # any Unicode newline sequence
352 | 
353 | # LOOKAHEAD AND LOOKBEHIND ASSERTIONS
354 | 
355 |           (?=...)       # positive look ahead
356 |           (?!...)       # negative look ahead
357 |           (?<=...)      # positive look behind
358 |           (?<!...)      # negative look behind
359 | 
360 | #        Each top-level branch of a look behind must be of a fixed length.
361 | 
362 | # BACKREFERENCES
363 | 
364 |           \1            # reference by number (can be ambiguous)
365 |           \g2           # reference by number
366 |           \g{3}         # reference by number
367 |           \g{-4}        # relative reference by number
368 |           \k<name>      # reference by name (Perl)
369 |           \k'name'      # reference by name (Perl)
370 |           \g{name}      # reference by name (Perl)
371 |           \k{name}      # reference by name (.NET)
372 |           (?P=name)     # reference by name (Python)
373 | 
374 | # SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
375 | 
376 |           (?R)          # recurse whole pattern
377 |           (?1)          # call subpattern by absolute number
378 |           (?+2)         # call subpattern by relative number
379 |           (?-3)         # call subpattern by relative number
380 |           (?&name)      # call subpattern by name (Perl)
381 |           (?P>name)     # call subpattern by name (Python)
382 |           \g<name>      # call subpattern by name (Oniguruma)
383 |           \g'name'      # call subpattern by name (Oniguruma)
384 |           \g<4>         # call subpattern by absolute number (Oniguruma)
385 |           \g'5'         # call subpattern by absolute number (Oniguruma)
386 |           \g<+6>        # call subpattern by relative number (PCRE extension)
387 |           \g'+7'        # call subpattern by relative number (PCRE extension)
388 |           \g<-8>        # call subpattern by relative number (PCRE extension)
389 |           \g'-9'        # call subpattern by relative number (PCRE extension)
390 | 
391 | # CONDITIONAL PATTERNS
392 | 
393 |           (?(condition)yes-pattern)
394 |           (?(condition)yes-pattern|no-pattern)
395 | 
396 |           (?(1)...)      # absolute reference condition
397 |           (?(+2)...)     # relative reference condition
398 |           (?(-3)...)     # relative reference condition
399 |           (?(<name>)...) # named reference condition (Perl)
400 |           (?('name')...) # named reference condition (Perl)
401 |           (?(name)...)   # named reference condition (PCRE)
402 |           (?(R)...)      # overall recursion condition
403 |           (?(R4)...)     # specific group recursion condition
404 |           (?(R&name)...) # specific recursion condition
405 |           (?(DEFINE)...) # define subpattern for reference
406 |           (?(?=assert).) # assertion condition
407 | 
408 | # BACKTRACKING CONTROL
409 | 
410 | #        The following act immediately they are reached:
411 | 
412 |           (*ACCEPT)     # force successful match
413 |           (*FAIL)       (?# force backtrack; synonym) (*F)
414 |           (*MARK:NAME)  (?# set name to be passed back; synonym) (*:NAME)
415 | 
416 | #        The following act only when a subsequent match failure causes a backtrack to reach  them.  They  all  force  a
417 | #        match  failure,  but they differ in what happens afterwards. Those that advance the start-of-match point do so
418 | #        only if the pattern is not anchored.
419 | 
420 |           (*COMMIT)     # overall failure, no advance of starting point
421 |           (*PRUNE)      # advance to next starting character
422 |           (*PRUNE:NAME) # equivalent to (*MARK:NAME)(*PRUNE)
423 |           (*SKIP)       # advance to current matching position
424 |           (*SKIP:NAME)  # advance to position corresponding to an earlier
425 |                         # (*MARK:NAME); if not found, the (*SKIP) is ignored
426 |           (*THEN)       # local failure, backtrack to next alternation
427 |           (*THEN:NAME)  # equivalent to (*MARK:NAME)(*THEN)
428 | 
429 | # CALLOUTS
430 | 
431 |           (?C)    # callout
432 |           (?C255) # callout with data n
433 | 
434 | # SEE ALSO
435 | #
436 | #        pcrepattern(3), pcreapi(3), pcrecallout(3), pcrematching(3), pcre(3).
437 | #
438 | # AUTHOR
439 | #
440 | #        Philip Hazel
441 | #        University Computing Service
442 | #        Cambridge CB2 3QH, England.
443 | #
444 | # REVISION
445 | #
446 | #        Last updated: 08 January 2014
447 | #        Copyright (c) 1997-2014 University of Cambridge.
448 | #
449 | # PCRE 8.35                                          08 January 2014                                      PCRESYNTAX(3)

450 |

Examples from man pcrepattern

451 |

452 | `man pcrepattern` is much longer than `man pcresyntax`, which is why only its examples are reproduced below. 453 |

454 |

455 | (*CR)a.b
456 | 
457 | \Qabc$xyz\E \Qabc\$xyz\E \Qabc\E\$\Qxyz\E
458 | 
459 | \040  # is another way of writing an ASCII space
460 | \40   # is the same, provided there are fewer than 40 previous capturing subpatterns
461 | \7    # is always a back reference
462 | \11   # might be a back reference, or another way of writing a tab
463 | \011  # is always a tab
464 | \0113 # is a tab followed by the character "3"
465 | \113  # might be a back reference, otherwise the character with octal code 113
466 | \377  # might be a back reference, otherwise the value 255 (decimal)
467 | \81   # is either a back reference, or the two characters "8" and "1"
468 | 
469 | \xdc is exactly the same as \x{dc}, or \u00dc in JavaScript mode.
470 | 
471 | [In addition, inside a character class, \b is interpreted as the backspace character (hex 08).]
472 | 
473 | [\N is not allowed in a character class.]
474 | [\B, \R, and \X are not special inside a character class.]
475 | 
476 | # In 8-bit non-UTF-8 mode \R is equivalent to the following:
477 | (?>\r\n|\n|\x0b|\f|\r|\x85)
478 | 
479 | foo\Kbar
480 | (foo)\Kbar
481 | ^abc$
482 | 
483 | (?|	(?=[\x00-\x7f])(\C) |
484 | 	(?=[\x80-\x{7ff}])(\C)(\C) |
485 | 	(?=[\x{800}-\x{ffff}])(\C)(\C)(\C) |
486 | 	(?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C))
487 | 
488 | [aeiou] [^aeiou] [d-m]
489 | [W-]46] [W-\]46]
490 | [z-\xff]
491 | [A-\d] [A-[:digit:]]
492 | [\000-\037]
493 | (?i)[W-c] [][\\^_`wxyzabc]
494 | [\xc8-\xcb]
495 | # If a closing square bracket is required as a member of the class, it should be the first data character in the class
496 | # (after an initial circumflex, if present) or escaped with a backslash.
497 | []\\^_`wxyzabc]
498 | 
499 | [01[:alpha:]%]
500 | [12[:^digit:]]
501 | 
502 | [[:<:]]  is converted to  \b(?=\w)
503 | [[:>:]]  is converted to  \b(?<=\w)
504 | # Only these exact character sequences are recognized. A sequence such as
505 | [a[:<:]b]
506 | # provokes an error for an unrecognized POSIX class name.
507 | 
508 | gilbert|sullivan
509 | (a(?i)b)c
510 | (a(?i)b|c)
511 | 
512 | cat(aract|erpillar|)
513 |  ((red|white) (king|queen))
514 | ((?:red|white) (king|queen))
515 | 
516 | (?i:saturday|sunday)
517 | (?:(?i)saturday|sunday)
518 | 
519 | (?|(Sat)ur|(Sun))day
520 | /(?|(abc)|(def))\1/
521 | /(?|(abc)|(def))(?1)/
522 | 
523 | (?<DN>Mon|Fri|Sun)(?:day)?|
524 | (?<DN>Tue)(?:sday)?|
525 | (?<DN>Wed)(?:nesday)?|
526 | (?<DN>Thu)(?:rsday)?|
527 | (?<DN>Sat)(?:urday)?
528 | 
529 | 
530 | (?:(?<n>foo)|(?<n>bar))\k<n>
531 | z{2,4}
532 | [aeiou]{3,}
533 | \d{8}
534 | (a?)*
535 | /\*.*\*/
536 | /\*.*?\*/
537 | \d??\d
538 | (.*)abc\1
539 | (?>.*?a)b
540 | (tweedle[dume]{3}\s*)+
541 | /(a|(b))+/
542 | \d+foo
543 | (?>\d+)foo
544 | \d++foo
545 | (abc|xyz){2,3}+
546 | (\D+|<\d+>)*[!?]
547 | ((?>\D+)|<\d+>)*[!?]
548 | (ring), \1
549 | (ring), \g1
550 | (ring), \g{1}
551 | (abc(def)ghi)\g{-1}
552 | (sens|respons)e and \1ibility
553 | ((?i)rah)\s+\1
554 | (?<p1>(?i)rah)\s+\k<p1>
555 | (?'p1'(?i)rah)\s+\k{p1}
556 | (?P<p1>(?i)rah)\s+(?P=p1)
557 | (?<p1>(?i)rah)\s+\g{p1}
558 | (a|(bc))\2
559 | (a\1)
560 | (a|b\1)+
561 | \w+(?=;)
562 | foo(?!bar)
563 | (?!foo)bar
564 | (?<!foo)bar
565 | (?<=bullock|donkey)
566 | (?<!dogs?|cats?)
567 | (?<=ab(c|de))
568 | (?<=abc|abde)
569 | abcd$
570 | ^.*abcd$
571 | ^.*+(?<=abcd)
572 | (?<=\d{3})(?<!999)foo
573 | (?<=\d{3}...)(?<!999)foo
574 | (?<=(?<!foo)bar)baz
575 | (?<=\d{3}(?!999)...)foo
576 | 
577 | (?(condition)yes-pattern)
578 | (?(condition)yes-pattern|no-pattern)
579 | 
580 | (?(1) (A|B|C) | (D | (?(2)E|F) | E) )
581 | 
582 | ( $ )?    [^()]+    (?(1) $ )
583 | 
584 | ( $ )?    [^()]+    (?(-1) $ )
585 | 
586 | (?<OPEN> $ )?    [^()]+    (?(<OPEN>) $ )
587 | 
588 | (?(R3)...) or (?(R&name)...)
589 | 
590 | (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
591 | \b (?&byte) (\.(?&byte)){3} \b
592 | 
593 | 
594 | (?(?=[^a-z]*[a-z])
595 | \d{2}-[a-z]{3}-\d{2}  |  \d{2}-\d{2}-\d{2} )
596 | 
597 | abc #comment \n still comment
598 | ( $ ( [^()]++ | (?1) )* $ )
599 | (?<pn> $ ( [^()]++ | (?&pn) )* $ )
600 | (ab(cd)ef)
601 | < (?: (?(R) \d++  | [^<>]*+) | (?R)) * >
602 | ^(.|(.)(?1)\2)$
603 | ^((.)(?1)\2|.)$
604 | ^((.)(?1)\2|.?)$
605 | ^(?:((.)(?1)\2|)|((.)(?3)\4|.))
606 | ^\W*+(?:((.)\W*+(?1)\W*+\2|)|((.)\W*+(?3)\W*+\4|\W*+.\W*+))\W*+$
607 | ^(.)(\1|a(?2))
608 | 
609 | (...(absolute)...)...(?2)...
610 | (...(relative)...)...(?-1)...
611 | (...(?+1)...(relative)...)
612 | (sens|respons)e and \1ibility
613 | (sens|respons)e and (?1)ibility
614 | (abc)(?i:(?-1))
615 | (?<pn> $ ( (?>[^()]+) | \g<pn> )* $ )
616 | (sens|respons)e and \g'1'ibility
617 | (abc)(?i:\g<-1>)
618 | (?C1)abc(?C2)def
619 | # An explicit callout may also be set at this position, as in this example:
620 | # Note that this applies only to assertion conditions, not to other types of condition.
621 | (?(?C9)(?=a)abc|def)
622 | A((?:A|B(*ACCEPT)|C)D)
623 | a+(?C)(*FAIL)
624 | /X(*MARK:A)Y|X(*MARK:B)Z/K
625 | a+(*COMMIT)b
626 | /(*COMMIT)abc/
627 | a+(*SKIP)b
628 | ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ...
629 | A (B(*THEN)C) | D
630 | A (B(*THEN)C | (*FAIL)) | D
631 | ^.*? (?(?=a) a | b(*THEN)c )
632 | (A(*COMMIT)B(*THEN)C|ABD)
633 | ...(*COMMIT)(*PRUNE)...
634 | /(a(*COMMIT)b)+ac/

635 |

Other details

636 |

637 | # Option setting: extended mode (x) is enabled by default
638 | (?-i) (?-J) (?-m) (?-s) (?-U) (?-x) # This part is no longer in extended mode so this is not a comment
639 | (?iJm) (?-iJm) (?iJm-sUx) (?xJm-s-U-i) # Back to extended mode: this is a comment
640 | 
641 | # Reset group numbers:
642 | (?|(reset)|(group)|(numbers))
643 | 
644 | ## Everything that can take a name:
645 | # Capturing groups:
646 |  (?<named>capturing_group)  (?<1badname>capturing_group)  (?<nametoolooooooooooooooooooooooooong>capturing_group)
647 | (?P<named>capturing_group) (?P<2badname>capturing_group) (?P<nametoolooooooooooooooooooooooooong>capturing_group)
648 |  (?'named'capturing_group)  (?'3badname'capturing_group)  (?'nametoolooooooooooooooooooooooooong'capturing_group)
649 | 
650 | # Backreferences:
651 |  \g{name}  \g{1badname}  \g{nametoolooooooooooooooooooooooooong}
652 |  \k{name}  \k{1badname}  \k{nametoolooooooooooooooooooooooooong}
653 |  \k<name>  \k<1badname>  \k<nametoolooooooooooooooooooooooooong>
654 |  \k'name'  \k'1badname'  \k'nametoolooooooooooooooooooooooooong'
655 | (?P=name) (?P=1badname) (?P=nametoolooooooooooooooooooooooooong)
656 | 
657 | # Subroutines:
658 | (?P>name) (?P>1badname) (?P>nametoolooooooooooooooooooooooooong)
659 |  (?&name)  (?&1badname)  (?&nametoolooooooooooooooooooooooooong)
660 |  \g<name>  \g<1badname>  \g<nametoolooooooooooooooooooooooooong>
661 |  \g'name'  \g'1badname'  \g'nametoolooooooooooooooooooooooooong'
662 | 
663 | # Conditions:
664 |  (?(<name>)...) (?(<1badname>)...) (?(<nametoolooooooooooooooooooooooooong>)...)
665 |  (?('name')...) (?('1badname')...) (?('nametoolooooooooooooooooooooooooong')...)
666 |   (?(name)...)   (?(1badname)...)   (?(nametoolooooooooooooooooooooooooong)...)
667 | (?(R&name)...) (?(R&1badname)...) (?(R&nametoolooooooooooooooooooooooooong)...)
668 | 
669 | # Verbs:
670 |      (*:name)      (*:1badname)      (*:nametoolooooooooooooooooooooooooong)
671 |  (*MARK:name)  (*MARK:1badname)  (*MARK:nametoolooooooooooooooooooooooooong)
672 | (*PRUNE:name) (*PRUNE:1badname) (*PRUNE:nametoolooooooooooooooooooooooooong)
673 |  (*SKIP:name)  (*SKIP:1badname)  (*SKIP:nametoolooooooooooooooooooooooooong)
674 |  (*THEN:name)  (*THEN:1badname)  (*THEN:nametoolooooooooooooooooooooooooong)
675 | 
676 | # Nested groups on a single line:
677 |   (?<level1>  (?<level2>  (?<level3>  (?<level4>  (?<level5>  (?<level6>  (?<level7>  (?<level8>  8)  7)  6)  5)  4)  3)  2)  1)
678 | 
679 | # Nested groups on multiple lines:
680 | 	(?<level1>
681 | 		(?<level2>
682 | 			(?<level3>
683 | 				(?<level4>
684 | 					(?<level5>
685 | 						(?<level6>
686 | 							(?<level7>
687 | 								(?<level8>
688 | 									This ought to be enough for everyone ©
689 | 								)
690 | 							)
691 | 						)
692 | 					)
693 | 				)
694 | 			)
695 | 		)
696 | 	)
697 | 
698 | \[\\\#/\]       # [\#/] escaped
699 | 
700 | # Better \Q...\E test:
701 | \Q Everything between \Q and \ E is escaped: !@#$%^&*()_-+[]{} \E
702 | 
703 | # \Q...\E in a character class:
704 | [123\QYou can use \Q...\ E inside character classes\E456]
705 | 
706 | # It remains ugly in extended mode though:
707 | \Q hello
708 |    I span
709 |    multiple lines
710 |    and this is ugly
711 | \E
712 | 
713 | # Octal notations:
714 | \0 \07 \077 \123 \o{456}
715 | 
716 | # Hexadecimal notations:
717 | \x \xa \xaa \xAA \x{bb} \u12ff
718 | 
719 | (?C) (?C0) (?C1) (?C23) (?C234) (?C255) (?C256) # callouts
720 | 
721 | \d \D \h \H \v \V \w \W
722 | \n \N # LF (line feed, 0x0A) character vs any character that is not a newline
723 | 
724 | # More \p tests:
725 | \p{C}   \P{C}   \p{^C}   \P{^C}
726 | \p{L&}  \P{L&}  \p{^L&}  \P{^L&}
727 | \p{Xwd} \P{Xwd} \p{^Xwd} \P{^Xwd}
728 | \p{Han} \P{Han} \p{^Han} \P{^Han}
729 | \p{Hna} \P{Hna} \p{^Hna} \P{^Hna} # The Han/Hna typo is made visible
730 | 
731 | # POSIX named classes are supported only within a class:
732 | [:word:] [:^space:]
733 | [
734 | 	[:alnum:]  [:alpha:]  [:ascii:]  [:blank:]   [:cntrl:]
735 | 	[:digit:]  [:graph:]  [:lower:]  [:print:]   [:punct:]
736 | 	[:space:]  [:upper:]  [:word:]   [:xdigit:]  [:fake:]
737 | 	[:^alnum:] [:^alpha:] [:^ascii:] [:^blank:]  [:^cntrl:]
738 | 	[:^digit:] [:^graph:] [:^lower:] [:^print:]  [:^punct:]
739 | 	[:^space:] [:^upper:] [:^word:]  [:^xdigit:] [:^fake:]
740 | ]
741 | 
742 | # Quantifiers: there are no {,m} quantifiers:
743 | x{,6}      # For example, {,6} is not  a  quantifier, but a literal string of four characters.
744 | 
745 | # Conditional patterns: DEFINE should not prevent names starting with "DEFINE"
746 | (?(DEFINER)yes-pattern|no-pattern)

747 | 753 | 759 | 760 | 813 | 834 | 857 | 858 | 859 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "codemirror-mode-pcre", 3 | "version": "2.0.0", 4 | "description": "Perl Compatible Regular Expressions (PCRE) mode for CodeMirror", 5 | "main": "src/pcre.js", 6 | "scripts": { 7 | "test": "true" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/xavierog/codemirror-mode-pcre.git" 12 | }, 13 | "keywords": [ 14 | "codemirror-mode", 15 | "pcre", 16 | "pcre-regex" 17 | ], 18 | "author": "Xavier G.", 19 | "license": "BSD-3-Clause", 20 | "bugs": { 21 | "url": "https://github.com/xavierog/codemirror-mode-pcre/issues" 22 | }, 23 | "homepage": "https://github.com/xavierog/codemirror-mode-pcre#readme" 24 | } 25 | -------------------------------------------------------------------------------- /src/pcre.css: -------------------------------------------------------------------------------- 1 | .cm-s-default span.cm-group1 { background: #ffff7f40; } 2 | .cm-s-default span.cm-group2 { background: #aaffff40; } 3 | .cm-s-default span.cm-group3 { background: #ffff7f60; } 4 | .cm-s-default span.cm-group4 { background: #aaffff60; } 5 | .cm-s-default span.cm-group5 { background: #ffff7f80; } 6 | .cm-s-default span.cm-group6 { background: #aaffff80; } 7 | .cm-s-default span.cm-group7 { background: #ffff7fa0; } 8 | .cm-s-default span.cm-group8 { background: #aaffffa0; } 9 | 10 | .cm-s-default span.cm-start-group, 11 | .cm-s-default span.cm-alternation 12 | { 13 | font-weight: bold; 14 | } 15 | 16 | .cm-s-default span.cm-condition { } 17 | 18 | .cm-s-default span.cm-option-sequence { background: #5cb85c; color: white; font-weight: bold; } 19 | .cm-s-default span.cm-verb { background: #5459b8; color: white; font-weight: bold; } 20 | 21 | .cm-s-default span.cm-backreference { color: #25ba36; font-weight: bold; } 
22 | .cm-s-default span.cm-callout { background: #ff00ff; color: white; } 23 | .cm-s-default span.cm-condition-subroutine, 24 | .cm-s-default span.cm-subroutine { color: #aa00ff; font-weight: bold; } 25 | .cm-s-default span.cm-quantifier { background: #aad1f760; } 26 | 27 | .cm-s-default span.cm-character-class { background: #f9ca6960; } 28 | 29 | .cm-s-default span.cm-non-printing-character { color: #aa0000; } 30 | .cm-s-default span.cm-generic-character-type { color: blue; } 31 | 32 | .cm-s-default span.cm-escaped-character { font-weight: bold; } 33 | .cm-s-default span.cm-escaped-sequence-start, 34 | .cm-s-default span.cm-escaped-sequence-end, 35 | .cm-s-default span.cm-escaped-sequence { 36 | border-top: 1px solid #7f7f7f; 37 | border-bottom: 1px solid #7f7f7f; 38 | } 39 | .cm-s-default span.cm-escaped-sequence-start, 40 | .cm-s-default span.cm-escaped-sequence-end { 41 | color: white; 42 | background-color: #00000080; 43 | } 44 | 45 | .cm-s-default span.cm-anchor { background: black; color: white; font-weight: bold; } 46 | 47 | .cm-s-default span.cm-define { background: blue; color: yellow; } 48 | .cm-s-default span.cm-name { color: blue; font-weight: normal; } 49 | .cm-s-default span.cm-verb.cm-name { color: yellow; } 50 | .cm-s-default span.cm-err { background: #ff4300c0; } 51 | -------------------------------------------------------------------------------- /src/pcre.js: -------------------------------------------------------------------------------- 1 | // Declare global variables to avoid warnings in JSHint 2 | /* global CodeMirror, define */ 3 | 4 | (function (mod) { 5 | if (typeof exports === "object" && typeof module === "object") // CommonJS 6 | mod(require("codemirror/lib/codemirror")); 7 | else if (typeof define === "function" && define.amd) // AMD 8 | define(["codemirror/lib/codemirror"], mod); 9 | else // Plain browser env 10 | mod(CodeMirror); 11 | })(function (CodeMirror) { 12 | "use strict"; 13 | 14 | CodeMirror.defineMode('pcre', 
function(editor_options, mode_options) { 15 | // Default settings: 16 | var options = { 17 | extended: true, 18 | }; 19 | // Override default settings with user-provided settings: 20 | if ('extended' in mode_options) options.extended = Boolean(mode_options.extended); 21 | 22 | var delimiters = { 23 | '<': '>', 24 | '[': ']', 25 | '{': '}', 26 | '(': ')', 27 | }; 28 | // Behaviour of alphanumeric characters after a backslash character (normal context): 29 | var backslash_in_normal_context = { 30 | '0': 'non-printing-character', 31 | '1': 'backreference', 32 | '2': 'backreference', 33 | '3': 'backreference', 34 | '4': 'backreference', 35 | '5': 'backreference', 36 | '6': 'backreference', 37 | '7': 'backreference', 38 | '8': 'backreference', 39 | '9': 'backreference', 40 | 'A': 'anchor', // \A start of subject 41 | 'B': 'anchor', // \B not a word boundary 42 | 'C': 'generic-character-type', // \C one data unit, even in UTF mode (best avoided) 43 | 'D': 'generic-character-type', // \D any character that is not a decimal digit 44 | 'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message 45 | 'F': '', // \F matches F 46 | 'G': 'anchor', // \G first matching position in subject 47 | 'H': 'generic-character-type', // \H any character that is not a horizontal white space character 48 | 'I': '', // \I matches I 49 | 'J': '', // \J matches J 50 | 'K': 'anchor', // \K reset start of match (neither an anchor nor a simple assertion) 51 | 'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 52 | 'M': '', // \M matches M 53 | 'N': 'generic-character-type', // \N a character that is not a newline 54 | 'O': '', // \O matches O 55 | 'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence 56 | 'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences. 
57 | 'R': 'generic-character-type', // \R a newline sequence 58 | 'S': 'generic-character-type', // \S any character that is not a white space character 59 | 'T': '', // \T matches T 60 | 'U': 'unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 61 | 'V': 'generic-character-type', // \V any character that is not a vertical white space character 62 | 'W': 'generic-character-type', // \W any "non-word" character 63 | 'X': 'generic-character-type', // \X a Unicode extended grapheme cluster 64 | 'Y': '', // \Y matches Y 65 | 'Z': 'anchor', // \Z matches at the end of the subject; also matches before a newline at the end of the subject 66 | 'a': 'non-printing-character', // \a alarm, that is, the BEL character (hex 07) 67 | 'b': 'anchor', // \b word boundary 68 | 'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character 69 | 'd': 'generic-character-type', // \d any decimal digi 70 | 'e': 'non-printing-character', // \e escape (hex 1B) 71 | 'f': 'non-printing-character', // \f form feed (hex 0C) 72 | 'g': 'err a-number-reference-must-not-be-zero', // a numbered reference must not be zero 73 | 'h': 'generic-character-type', // \h any horizontal white space character 74 | 'i': '', // \i matches i 75 | 'j': '', // \j matches j 76 | 'k': 'err backslash-k-is-not-followed-by-a-name', // \k is not followed by a [...] 
name 77 | 'l': 'unsupported-espace-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 78 | 'm': '', // \m matches m 79 | 'n': 'non-printing-character', // \n linefeed (hex 0A) 80 | 'o': '', // \o matches o 81 | 'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence 82 | 'q': '', // \q matches q 83 | 'r': 'non-printing-character', // \r carriage return (hex 0D) 84 | 's': 'generic-character-type', // \s any white space character 85 | 't': 'non-printing-character', // \t tab (hex 09) 86 | 'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 87 | 'v': 'generic-character-type', // \v any vertical white space character 88 | 'w': 'generic-character-type', // any "word" character 89 | 'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT) 90 | 'y': '', // \y matches y 91 | 'z': 'anchor', // \z end of subject 92 | }; 93 | // Behaviour of alphanumeric characters after a backslash character (character class context, i.e. [...]): 94 | var backslash_in_character_class = { 95 | '0': 'non-printing-character', // octal code 96 | '1': 'non-printing-character', // octal code 97 | '2': 'non-printing-character', // octal code 98 | '3': 'non-printing-character', // octal code 99 | '4': 'non-printing-character', // octal code 100 | '5': 'non-printing-character', // octal code 101 | '6': 'non-printing-character', // octal code 102 | '7': 'non-printing-character', // octal code 103 | '8': '', // \8 matches 8 104 | '9': '', // \9 matches 9 105 | 'A': '', // \A matches A 106 | 'B': '', // \B matches B -- \B, \R, and \X are not special inside a character class. 
107 | 'C': '', // \C matches C 108 | 'D': 'generic-character-type', // \D any character that is not a decimal digit 109 | 'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message 110 | 'F': '', // \F matches F 111 | 'G': '', // \G matches G 112 | 'H': 'generic-character-type', // \H any character that is not a horizontal white space character 113 | 'I': '', // \I matches I 114 | 'J': '', // \J matches J 115 | 'K': '', // \K matches K 116 | 'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 117 | 'M': '', // \M matches M 118 | 'N': 'err backslash-n-is-not-supported-in-a-class', // \N is not allowed in a character class. 119 | 'O': '', // \O matches O 120 | 'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence 121 | 'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences. 122 | 'R': '', // \R matches R -- \B, \R, and \X are not special inside a character class. 123 | 'S': 'generic-character-type', // \S any character that is not a white space character 124 | 'T': '', // \T matches T 125 | 'U': 'unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 126 | 'V': 'generic-character-type', // \V any character that is not a vertical white space character 127 | 'W': 'generic-character-type', // \W any "non-word" character 128 | 'X': '', // \X matches X -- \B, \R, and \X are not special inside a character class. 
129 | 'Y': '', // \Y matches Y 130 | 'Z': '', // \Z matches Z 131 | 'a': 'non-printing-character', // \a alarm, that is, the BEL character (hex 07) 132 | 'b': 'non-printing-character', // inside a character class, \b is interpreted as the backspace character (hex 08) 133 | 'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character 134 | 'd': 'generic-character-type', // \d any decimal digi 135 | 'e': 'non-printing-character', // \e escape (hex 1B) 136 | 'f': 'non-printing-character', // \f form feed (hex 0C) 137 | 'g': '', // \g matches g 138 | 'h': 'generic-character-type', // \h any horizontal white space character 139 | 'i': '', // \i matches i 140 | 'j': '', // \j matches j 141 | 'k': '', // \k matches k 142 | 'l': 'unsupported-espace-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 143 | 'm': '', // \m matches m 144 | 'n': 'non-printing-character', // \n linefeed (hex 0A) 145 | 'o': '', // \o matches o 146 | 'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence 147 | 'q': '', // \q matches q 148 | 'r': 'non-printing-character', // \r carriage return (hex 0D) 149 | 's': 'generic-character-type', // \s any white space character 150 | 't': 'non-printing-character', // \t tab (hex 09) 151 | 'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 152 | 'v': 'generic-character-type', // \v any vertical white space character 153 | 'w': 'generic-character-type', // any "word" character 154 | 'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT) 155 | 'y': '', // \y matches y 156 | 'z': '', // \z matches z 157 | }; 158 | var backslask_p_properties = { 159 | // GENERAL CATEGORY PROPERTIES FOR \p and \P 160 | 'C': 'Other', 161 | 'Cc': 'Control', 162 | 'Cf': 'Format', 163 | 'Cn': 'Unassigned', 164 | 'Co': 'Private use', 165 | 'Cs': 'Surrogate', 166 | 167 | 'L': 'Letter', 168 | 'Ll': 'Lower case letter', 169 | 'Lm': 'Modifier letter', 170 | 
'Lo': 'Other letter', 171 | 'Lt': 'Title case letter', 172 | 'Lu': 'Upper case letter', 173 | 'L&': 'Ll, Lu, or Lt', 174 | 175 | 'M': 'Mark', 176 | 'Mc': 'Spacing mark', 177 | 'Me': 'Enclosing mark', 178 | 'Mn': 'Non-spacing mark', 179 | 180 | 'N': 'Number', 181 | 'Nd': 'Decimal number', 182 | 'Nl': 'Letter number', 183 | 'No': 'Other number', 184 | 185 | 'P': 'Punctuation', 186 | 'Pc': 'Connector punctuation', 187 | 'Pd': 'Dash punctuation', 188 | 'Pe': 'Close punctuation', 189 | 'Pf': 'Final punctuation', 190 | 'Pi': 'Initial punctuation', 191 | 'Po': 'Other punctuation', 192 | 'Ps': 'Open punctuation', 193 | 194 | 'S': 'Symbol', 195 | 'Sc': 'Currency symbol', 196 | 'Sk': 'Modifier symbol', 197 | 'Sm': 'Mathematical symbol', 198 | 'So': 'Other symbol', 199 | 200 | 'Z': 'Separator', 201 | 'Zl': 'Line separator', 202 | 'Zp': 'Paragraph separator', 203 | 'Zs': 'Space separator', 204 | 205 | // PCRE SPECIAL CATEGORY PROPERTIES FOR \p and \P 206 | 'Xan': 'Alphanumeric: union of properties L and N', 207 | 'Xps': 'POSIX space: property Z or tab, NL, VT, FF, CR', 208 | 'Xsp': 'Perl space: property Z or tab, NL, VT, FF, CR', 209 | 'Xuc': 'Univerally-named character: one that can be represented by a Universal Character Name', 210 | 'Xwd': 'Perl word: property Xan or underscore', 211 | 212 | // SCRIPT NAMES FOR \p AND \P 213 | 'Arabic': true, 214 | 'Armenian': true, 215 | 'Avestan': true, 216 | 'Balinese': true, 217 | 'Bamum': true, 218 | 'Bassa_Vah': true, 219 | 'Batak': true, 220 | 'Bengali': true, 221 | 'Bopomofo': true, 222 | 'Brahmi': true, 223 | 'Braille': true, 224 | 'Buginese': true, 225 | 'Buhid': true, 226 | 'Canadian_Aboriginal': true, 227 | 'Carian': true, 228 | 'Caucasian_Albanian': true, 229 | 'Chakma': true, 230 | 'Cham': true, 231 | 'Cherokee': true, 232 | 'Common': true, 233 | 'Coptic': true, 234 | 'Cuneiform': true, 235 | 'Cypriot': true, 236 | 'Cyrillic': true, 237 | 'Deseret': true, 238 | 'Devanagari': true, 239 | 'Duployan': true, 240 | 
'Egyptian_Hieroglyphs': true, 241 | 'Elbasan': true, 242 | 'Ethiopic': true, 243 | 'Georgian': true, 244 | 'Glagolitic': true, 245 | 'Gothic': true, 246 | 'Grantha': true, 247 | 'Greek': true, 248 | 'Gujarati': true, 249 | 'Gurmukhi': true, 250 | 'Han': true, 251 | 'Hangul': true, 252 | 'Hanunoo': true, 253 | 'Hebrew': true, 254 | 'Hiragana': true, 255 | 'Imperial_Aramaic': true, 256 | 'Inherited': true, 257 | 'Inscriptional_Pahlavi': true, 258 | 'Inscriptional_Parthian': true, 259 | 'Javanese': true, 260 | 'Kaithi': true, 261 | 'Kannada': true, 262 | 'Katakana': true, 263 | 'Kayah_Li': true, 264 | 'Kharoshthi': true, 265 | 'Khmer': true, 266 | 'Khojki': true, 267 | 'Khudawadi': true, 268 | 'Lao': true, 269 | 'Latin': true, 270 | 'Lepcha': true, 271 | 'Limbu': true, 272 | 'Linear_A': true, 273 | 'Linear_B': true, 274 | 'Lisu': true, 275 | 'Lycian': true, 276 | 'Lydian': true, 277 | 'Mahajani': true, 278 | 'Malayalam': true, 279 | 'Mandaic': true, 280 | 'Manichaean': true, 281 | 'Meetei_Mayek': true, 282 | 'Mende_Kikakui': true, 283 | 'Meroitic_Cursive': true, 284 | 'Meroitic_Hieroglyphs': true, 285 | 'Miao': true, 286 | 'Modi': true, 287 | 'Mongolian': true, 288 | 'Mro': true, 289 | 'Myanmar': true, 290 | 'Nabataean': true, 291 | 'New_Tai_Lue': true, 292 | 'Nko': true, 293 | 'Ogham': true, 294 | 'Ol_Chiki': true, 295 | 'Old_Italic': true, 296 | 'Old_North_Arabian': true, 297 | 'Old_Permic': true, 298 | 'Old_Persian': true, 299 | 'Old_South_Arabian': true, 300 | 'Old_Turkic': true, 301 | 'Oriya': true, 302 | 'Osmanya': true, 303 | 'Pahawh_Hmong': true, 304 | 'Palmyrene': true, 305 | 'Pau_Cin_Hau': true, 306 | 'Phags_Pa': true, 307 | 'Phoenician': true, 308 | 'Psalter_Pahlavi': true, 309 | 'Rejang': true, 310 | 'Runic': true, 311 | 'Samaritan': true, 312 | 'Saurashtra': true, 313 | 'Sharada': true, 314 | 'Shavian': true, 315 | 'Siddham': true, 316 | 'Sinhala': true, 317 | 'Sora_Sompeng': true, 318 | 'Sundanese': true, 319 | 'Syloti_Nagri': true, 320 | 'Syriac': true, 
321 | 'Tagalog': true, 322 | 'Tagbanwa': true, 323 | 'Tai_Le': true, 324 | 'Tai_Tham': true, 325 | 'Tai_Viet': true, 326 | 'Takri': true, 327 | 'Tamil': true, 328 | 'Telugu': true, 329 | 'Thaana': true, 330 | 'Thai': true, 331 | 'Tibetan': true, 332 | 'Tifinagh': true, 333 | 'Tirhuta': true, 334 | 'Ugaritic': true, 335 | 'Vai': true, 336 | 'Warang_Citi': true, 337 | 'Yi': true, 338 | }; 339 | var backslash_p_regex_string = '[pP]\\{\\^?([\\w&]+)\\}'; 340 | var backslash_p_regex = new RegExp(backslash_p_regex_string); 341 | 342 | var posix_named_sets = { 343 | 'alnum': 'alphanumeric', 344 | 'alpha': 'alphabetic', 345 | 'ascii': '0-127', 346 | 'blank': 'space or tab', 347 | 'cntrl': 'control character', 348 | 'digit': 'decimal digit', 349 | 'graph': 'printing, excluding space', 350 | 'lower': 'lower case letter', 351 | 'print': 'printing, including space', 352 | 'punct': 'printing, excluding alphanumeric', 353 | 'space': 'white space', 354 | 'upper': 'upper case letter', 355 | 'word': 'same as \\w', 356 | 'xdigit': 'hexadecimal digit', 357 | }; 358 | // Include '<' and '>' to spot errors such as [a[:<:]b] 359 | var posix_named_sets_regex_string = '\\[:\\^?([\\w<>]+):]'; 360 | var posix_named_sets_regex = new RegExp(posix_named_sets_regex_string); 361 | 362 | var callout_regex_string = '\$\\?C(\\d{0,3})\$'; 363 | var callout_regex = new RegExp(callout_regex_string); 364 | 365 | var assertion_regex_string = '\\(\\? reference by name (Perl) 503 | // \k'name' reference by name (Perl) 504 | // \k{name} reference by name (.NET) 505 | if (stream.match(/k[<'{]/, false)) return push(state, 'backreference'); 506 | if (stream.match(/[0-9]+/)) return 'backreference'; 507 | 508 | // For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either 509 | // in angle brackets or single quotes, is an alternative syntax for referencing a subpattern as a 510 | // "subroutine". 
511 | if (stream.match(/g<[-+]?[0-9]+>/)) return 'subroutine'; 512 | if (stream.match(/g'[-+]?[0-9]+'/)) return 'subroutine'; 513 | if (stream.match(/g[<']/, false)) return push(state, 'subroutine'); 514 | } 515 | // At this stage, we have looked for: 516 | // - a backslash followed by nothing 517 | // - a backslash followed by a single non-alphanumeric character 518 | // - a backslash followed by 1 or more characters to achieve a special, context-dependent meaning 519 | // Look for a backslash followed by a single alphanumeric character: 520 | var backslash_p = in_character_class ? backslash_in_character_class : backslash_in_normal_context; 521 | return backslash_p[stream.next()]; 522 | } 523 | 524 | function handle_name(stream, state) { 525 | var ret, rem, consume_limit; 526 | var ch = stream.next(); 527 | // Names must start with a non-digit. 528 | if (!state.name_value.length && (!ch.match(/\w/) || ch.match(/\d/))) { 529 | ret = 'err erroneous-start-of-name'; 530 | consume_limit = 0; 531 | } 532 | // Names consist of up to 32 alphanumeric characters and underscores. 533 | else if (state.name_value.length > 31) { 534 | ret = 'err name-too-long'; 535 | consume_limit = -1; 536 | } else consume_limit = 32 - state.name_value.length - 1; 537 | state.name_value += ch; 538 | if (consume_limit < 0) { 539 | if (rem = stream.match(/^\w+/)) state.name_value += rem[0]; 540 | } else while (consume_limit --) { 541 | if (rem = stream.match(/^\w/)) state.name_value += rem[0]; 542 | else break; 543 | } 544 | var next_char = stream.peek(); 545 | if (!next_char || !next_char.match(/\w/)) return pop(state, ret); 546 | return all_tokens(state, ret); 547 | } 548 | 549 | function handle_callout(stream, state) { 550 | // (?C) callout 551 | // (?Cn) callout with data n 552 | var rem = stream.match(callout_regex); 553 | if (rem) { 554 | return Number(rem[1]) < 256 ? 
'callout' : 'err erroneous-callout-number'; 555 | } 556 | return false; 557 | } 558 | 559 | function handle_condition_subroutines(stream, state) { 560 | if (stream.peek() === ')') { 561 | pop(state); 562 | return tokenBase(stream, state); 563 | } 564 | stream.eat('R'); 565 | if (stream.eat('&')) return expect_name(state); 566 | stream.match(/\d+/); 567 | return pop(state); 568 | } 569 | 570 | function handle_conditions(stream, state) { 571 | var condition_state = current_context_state(state); 572 | var expected_end = read_expected_end(stream, state); 573 | if (expected_end) return expected_end; 574 | if (condition_state.ok) { 575 | pop(state); 576 | return tokenBase(stream, state); 577 | } 578 | // (?(DEFINE)... define subpattern for reference 579 | if (stream.match(/DEFINE(?=\))/)) { 580 | return pop(state, 'define'); 581 | } 582 | // (?(R)... overall recursion condition 583 | // (?(Rn)... specific group recursion condition 584 | // (?(R&name)...) specific recursion condition 585 | if (stream.match(/R(\d+|&\w+|)\)/, false)) { 586 | condition_state.ok = true; 587 | push(state, 'condition-subroutine'); 588 | return tokenBase(stream, state); 589 | } 590 | // (?(n)... absolute reference condition 591 | // (?(+n)... relative reference condition 592 | // (?(-n)... relative reference condition 593 | if (stream.match(/(-|\+|)\d+/)) { 594 | condition_state.ok = true; 595 | return all_tokens(state, 'backreference'); 596 | } 597 | var rem = stream.match(/([<'])/); 598 | if (rem) { 599 | condition_state.ok = false; 600 | expect_end(state, delimiter(rem[1])); 601 | return expect_name(state); 602 | } 603 | if (stream.match(/\w+/, false)) { 604 | condition_state.ok = true; // the "name" state will handle everything for us 605 | return expect_name(state); 606 | } 607 | // If the condition is not in any of the above formats, it must be an assertion. This may be a positive or 608 | // negative lookahead or lookbehind assertion. 609 | if (stream.match(/\?...) 
named capturing group (Perl) 647 | // (?'name'...) named capturing group (Perl) 648 | // (?P...) named capturing group (Python) 649 | rem = stream.match(/\(\?P?([<'])/); 650 | if (rem) { 651 | expect_end(state, delimiter(rem[1])); 652 | return expect_name(state); 653 | } 654 | // Same as (?: but with options, e.g. (?x-i: 655 | if (stream.match(group_options_regex, false)) { 656 | // As a convenient shorthand, if any option settings are required at the start of a non-capturing 657 | // subpattern, the option letters may appear between the "?" and the ":". 658 | stream.match('(?'); 659 | start_group_state.option_shorthand = 1; 660 | return all_tokens(state); 661 | } 662 | // "(?(" typically marks the start of a condition: (?(condition)yes-pattern|no-pattern) 663 | if (stream.match('(?') && stream.peek() === '(') { 664 | // An explicit callout may be set just before an assertion condition: (?(?C7)(?|&)/); 688 | if (rem) { 689 | expect_end(state, delimiter(rem[1])); 690 | return expect_name(state); 691 | } 692 | stream.next(); 693 | return all_tokens(state, 'err erroneous-subroutine'); 694 | } 695 | 696 | function handle_verb(stream, state) { 697 | var expected_end = read_expected_end(stream, state); 698 | if (expected_end) return expected_end; 699 | expect_end(state, ')'); 700 | return expect_name(state); 701 | } 702 | 703 | function update_options(state, options) { 704 | // We are only interested in x (extended mode). 705 | var enable = true, new_state = null, i = 0, c = null; 706 | for (; i < options.length; ++i) { 707 | c = options[i]; 708 | if (c === '-') enable = false; 709 | else if (c === 'x') new_state = enable; 710 | } 711 | if (new_state !== null) state.extended = new_state; 712 | } 713 | 714 | function tokenBase(stream, state) { 715 | var rem, ret; // stand for Regular Expression Match and RETurn, respectively. 
716 | 717 | // Get current state, current char, next char: 718 | var ch = stream.peek(); 719 | if (!ch) return; 720 | var current_state = current(state); 721 | var group_state; 722 | 723 | if (current_state === 'name') return handle_name(stream, state); 724 | if (current_state === 'condition') return handle_conditions(stream, state); 725 | if (current_state === 'condition-subroutine') return handle_condition_subroutines(stream, state); 726 | if (current_state === 'start-group') return handle_start_group(stream, state); 727 | if (current_state === 'backreference') return handle_backreference(stream, state); 728 | if (current_state === 'subroutine') return handle_subroutine(stream, state); 729 | if (current_state === 'verb') return handle_verb(stream, state); 730 | 731 | if (current_state === 'escaped-sequence') { 732 | if (stream.match('\\E')) return pop(state, 'escaped-sequence-end'); 733 | consume(stream); 734 | return all_tokens(state); 735 | } 736 | 737 | // Escaped characters: 738 | if (stream.match(/\\./, false)) return all_tokens(state, handle_backslash(stream, state)); 739 | 740 | if (stream.match('[', false)) { 741 | if (current_state !== 'character-class') { 742 | if (stream.match(posix_named_sets_regex)) { 743 | return all_tokens(state, 'err posix-outside-class-unsupported'); 744 | } 745 | // In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and 746 | // [[:>:]] is used for matching "start of word" and "end of word". 747 | if (stream.match('[[:<:]]') || stream.match('[[:>:]]')) return all_tokens(state, 'anchor'); 748 | // At this stage, we do have a new character class: 749 | push(state, 'character-class'); 750 | stream.eat('['); 751 | stream.eat('^'); 752 | // If a closing square bracket is required as a member of the class, it should be the first data 753 | // character in the class (after an initial circumflex, if present) or escaped with a backslash. 
754 | // Note: ']' should be on the same line as '[', even in extended mode. 755 | stream.eat(']'); 756 | return all_tokens(state); 757 | } 758 | } 759 | 760 | if (current_state === 'character-class') { 761 | rem = stream.match(posix_named_sets_regex); 762 | if (rem) { 763 | if (rem[1] in posix_named_sets) return all_tokens(state, 'generic-character-type'); 764 | else return all_tokens(state, 'err unknown-posix-class-name'); 765 | } 766 | if (stream.eat(']')) return pop(state); 767 | consume(stream); 768 | return all_tokens(state); 769 | } 770 | 771 | // Regular comments in extended mode: 772 | if (state.extended && stream.eat('#')) { 773 | stream.skipToEnd(); 774 | return 'comment'; 775 | } 776 | 777 | if (stream.eat('{')) { 778 | // exactly n: 779 | if (stream.match(/\d+\}/)) return all_tokens(state, 'quantifier'); 780 | // "at least n, no more than m" and "n or more", greedy, possessive or lazy: 781 | if (stream.match(/\d+,\d*\}[+?]?/)) return all_tokens(state, 'quantifier'); 782 | } 783 | 784 | if (stream.eat('|')) { 785 | return all_tokens(state, 'alternation'); 786 | } 787 | 788 | if (stream.peek() === '(') { 789 | if (stream.match(/$\*(?:UTF(?:8|16|32|)|UCP|NO_AUTO_POSSESS|NO_START_OPT)$/)) return all_tokens(state, 'option-sequence'); 790 | if (stream.match(/$\*LIMIT_(?:RECURSION|MATCH)=[0-9]+$/)) return all_tokens(state, 'option-sequence'); 791 | // Newline convention + what \R matches: 792 | if (stream.match(/$\*(?:CR|LF|CRLF|ANYCRLF|ANY|BSR_(?:ANYCRLF|UNICODE))$/)) return all_tokens(state, 'option-sequence'); 793 | // Backtracking control: 794 | if (stream.match(/$\*(?:ACCEPT|FAIL|F|COMMIT|PRUNE|SKIP|THEN)$/)) return all_tokens(state, 'verb'); 795 | if (stream.match(/$\*(?:MARK|PRUNE|SKIP|THEN|):/)) return push(state, 'verb', {}, 'verb'); 796 | rem = stream.match(option_sequence_regex); 797 | if (rem) { 798 | update_options(state, rem[0]); 799 | return all_tokens(state, 'option-sequence'); 800 | } 801 | // (?#....) 
comment (not nestable) 802 | if (stream.match(/\(\?#[^)]*$/)) return all_tokens(state, 'comment'); 803 | // (?P=name) reference by name (Python) 804 | if (stream.match(/$\?P=/, false)) return push(state, 'backreference'); 805 | // (?&name) call subpattern by name (Perl) 806 | // (?P>name) call subpattern by name (Python) 807 | if (stream.match(/\(\?(P>|&)/, false)) return push(state, 'subroutine'); 808 | // (?n) call subpattern by absolute number 809 | // (?+n) call subpattern by relative number 810 | // (?-n) call subpattern by relative number 811 | if (stream.match(/\(\?(\-|\+|)\d+$/)) return all_tokens(state, 'subroutine'); 812 | // (?R) recurse whole pattern 813 | if (stream.match('(?R)')) return all_tokens(state, 'subroutine'); 814 | // Callouts: 815 | var callout = handle_callout(stream, state); 816 | if (callout) return all_tokens(state, callout); 817 | 818 | // At this stage, we have a new group: 819 | ++ state.group_level; 820 | group_state = 'group' + state.group_level; 821 | push(state, group_state); 822 | 823 | // (?=...) positive look ahead 824 | // (?!...) negative look ahead 825 | // (?<=...) positive look behind 826 | // (?...) 
atomic, non-capturing group 831 | if (stream.match(/\(\?[:|>]/)) return all_tokens(state, 'start-group'); 832 | 833 | if (stream.match('(?', false)) { 834 | push(state, 'start-group'); 835 | return tokenBase(stream, state); 836 | } 837 | stream.eat('('); 838 | return all_tokens(state, 'start-group'); 839 | } 840 | 841 | if (stream.peek() === ')') { 842 | if (current_state && current_state.match(/^group/)) { 843 | ret = 'start-group'; // formerly 'end-group' but that used to confuse matchbrackets (see issue #4) 844 | if (current_context_state(state).leave_closing_parenthesis) ret = ''; 845 | else stream.next(); 846 | -- state.group_level; 847 | return pop(state, ret); 848 | } 849 | stream.next(); 850 | return all_tokens(state, 'err unmatched-closing-parenthesis'); 851 | } 852 | 853 | // Anchors 854 | if (stream.eat('^') || stream.eat('$')) return all_tokens(state, 'anchor'); 855 | if (stream.eat('.')) return all_tokens(state, 'generic-character-type'); 856 | // Quantifiers: 0 or 1, 0 or more, 1 or more, greedy: 857 | if (stream.eat('?') || stream.eat('*') || stream.eat('+')) { 858 | // Handle possessive and lazy variants: 859 | stream.eat(/[+?]/); 860 | return all_tokens(state, 'quantifier'); 861 | } 862 | consume(stream); 863 | return all_tokens(state); 864 | } 865 | 866 | function startState() { 867 | return { 868 | context: [], 869 | context_state: [], 870 | group_level: 0, 871 | name_value: '', 872 | extended: options.extended, 873 | }; 874 | } 875 | 876 | function copyState(o) { // o = original 877 | var i, oo, oc, key, c = startState(); // c = copy, oo = original object, oc = object copy 878 | for (i = 0; i < o.context_state.length; ++i) { 879 | oo = o.context_state[i]; 880 | oc = {}; 881 | for (key in oo) oc[key] = (key === 'expected') ? 
oo[key].slice() : oo[key]; 882 | c.context_state.push(oc); 883 | } 884 | c.context = o.context.slice(); 885 | c.group_level = o.group_level; 886 | c.name_value = o.name_value; 887 | c.extended = o.extended; 888 | return c; 889 | } 890 | 891 | return { 892 | startState: startState, 893 | copyState: copyState, 894 | token: tokenBase, 895 | }; 896 | }); 897 | 898 | CodeMirror.defineMIME('text/x-regex', 'pcre'); 899 | CodeMirror.defineMIME('text/x-pcre-regex', 'pcre'); 900 | 901 | }); 902 | --------------------------------------------------------------------------------