├── LICENSE.txt ├── README.md ├── index.html ├── package.json └── src ├── pcre.css └── pcre.js /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021-2025, Xavier G. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeMirror PCRE mode 2 | 3 | This is a [CodeMirror](https://codemirror.net/) mode that brings syntax highlighting for [Perl Compatible Regular Expressions (PCRE)](https://www.pcre.org/). 4 | 5 | ## How to use 6 | ### Basic use 7 | Load `pcre.js` and `pcre.css` at appropriate locations in your HTML structure. 8 | Specify `mode: 'pcre'` when creating your CodeMirror instance. 9 | 10 | ### Configuration 11 | codemirror-mode-pcre supports extended mode (`x` flag) and enables it by default. This can be turned off by passing `extended: false` in the mode specification when creating the CodeMirror instance, e.g. `mode: {name: 'pcre', extended: false}`. 12 | 13 | ### Theming 14 | This mode does not leverage CodeMirror's default tokens (they are not well suited to regular expressions). Consequently, if you use a theme other than the default one, you will likely want to write your own `pcre.css` file. 15 | 16 | **Breaking change:** starting with version 2.0.0, the `cm-end-group` style is no longer supported and should no longer be used. It is replaced with `cm-start-group`. 17 | 18 | ### Nesting 19 | codemirror-mode-pcre can be nested within another mode, i.e. it can highlight regular expressions for another mode. 20 | This requires adjusting the other mode though. 21 | See the demo page for an example of such nesting. 22 | 23 | ## Non-features 24 | codemirror-mode-pcre does **not** offer: 25 | - completion (e.g. suggesting POSIX class names or script names for `\p` and `\P`); 26 | - tooltips reflecting what the various parts of an expression actually mean. 27 | 28 | Those may come in the future though. 29 | 30 | ## License 31 | Like the PCRE library, this mode is released under the 3-clause BSD license. 
32 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | CodeMirror PCRE mode 7 | 13 | 14 | 21 | 22 | 23 |

CodeMirror PCRE mode

24 |

25 | This is a CodeMirror mode that brings 26 | syntax highlighting for Perl Compatible Regular Expressions (PCRE). 27 |

28 |

29 | MIME types defined: 30 |

34 |

35 |

36 | Options: 37 |

40 |

41 |

42 | Table of contents: 43 |

49 |

50 |

Use as nested mode

51 |

52 | Below are demonstrations of how the PCRE mode can be used to highlight regular expressions within other languages. 53 | This first example is an nginx configuration snippet: 54 |

55 | 70 |

This second example is a simple list of one-line regexes with comments:

71 | 79 |

man pcresyntax

80 |

81 | This is a slightly adjusted copy of `man pcresyntax`. This man page reflects most PCRE syntactic structures in a 82 | colourful way thanks to CodeMirror and the PCRE mode. 83 |

84 | 450 |

Examples from man pcrepattern

451 |

452 | `man pcrepattern` is much longer than `man pcresyntax`, which is why only its examples were reproduced below. 453 |

454 | 635 |

Other details

636 | 747 | 753 | 759 | 760 | 813 | 834 | 857 | 858 | 859 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "codemirror-mode-pcre", 3 | "version": "2.0.0", 4 | "description": "Perl Compatible Regular Expressions (PCRE) mode for CodeMirror", 5 | "main": "src/pcre.js", 6 | "scripts": { 7 | "test": "true" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/xavierog/codemirror-mode-pcre.git" 12 | }, 13 | "keywords": [ 14 | "codemirror-mode", 15 | "pcre", 16 | "pcre-regex" 17 | ], 18 | "author": "Xavier G.", 19 | "license": "BSD-3-Clause", 20 | "bugs": { 21 | "url": "https://github.com/xavierog/codemirror-mode-pcre/issues" 22 | }, 23 | "homepage": "https://github.com/xavierog/codemirror-mode-pcre#readme" 24 | } 25 | -------------------------------------------------------------------------------- /src/pcre.css: -------------------------------------------------------------------------------- 1 | .cm-s-default span.cm-group1 { background: #ffff7f40; } 2 | .cm-s-default span.cm-group2 { background: #aaffff40; } 3 | .cm-s-default span.cm-group3 { background: #ffff7f60; } 4 | .cm-s-default span.cm-group4 { background: #aaffff60; } 5 | .cm-s-default span.cm-group5 { background: #ffff7f80; } 6 | .cm-s-default span.cm-group6 { background: #aaffff80; } 7 | .cm-s-default span.cm-group7 { background: #ffff7fa0; } 8 | .cm-s-default span.cm-group8 { background: #aaffffa0; } 9 | 10 | .cm-s-default span.cm-start-group, 11 | .cm-s-default span.cm-alternation 12 | { 13 | font-weight: bold; 14 | } 15 | 16 | .cm-s-default span.cm-condition { } 17 | 18 | .cm-s-default span.cm-option-sequence { background: #5cb85c; color: white; font-weight: bold; } 19 | .cm-s-default span.cm-verb { background: #5459b8; color: white; font-weight: bold; } 20 | 21 | .cm-s-default span.cm-backreference { color: #25ba36; font-weight: 
bold; } 22 | .cm-s-default span.cm-callout { background: #ff00ff; color: white; } 23 | .cm-s-default span.cm-condition-subroutine, 24 | .cm-s-default span.cm-subroutine { color: #aa00ff; font-weight: bold; } 25 | .cm-s-default span.cm-quantifier { background: #aad1f760; } 26 | 27 | .cm-s-default span.cm-character-class { background: #f9ca6960; } 28 | 29 | .cm-s-default span.cm-non-printing-character { color: #aa0000; } 30 | .cm-s-default span.cm-generic-character-type { color: blue; } 31 | 32 | .cm-s-default span.cm-escaped-character { font-weight: bold; } 33 | .cm-s-default span.cm-escaped-sequence-start, 34 | .cm-s-default span.cm-escaped-sequence-end, 35 | .cm-s-default span.cm-escaped-sequence { 36 | border-top: 1px solid #7f7f7f; 37 | border-bottom: 1px solid #7f7f7f; 38 | } 39 | .cm-s-default span.cm-escaped-sequence-start, 40 | .cm-s-default span.cm-escaped-sequence-end { 41 | color: white; 42 | background-color: #00000080; 43 | } 44 | 45 | .cm-s-default span.cm-anchor { background: black; color: white; font-weight: bold; } 46 | 47 | .cm-s-default span.cm-define { background: blue; color: yellow; } 48 | .cm-s-default span.cm-name { color: blue; font-weight: normal; } 49 | .cm-s-default span.cm-verb.cm-name { color: yellow; } 50 | .cm-s-default span.cm-err { background: #ff4300c0; } 51 | -------------------------------------------------------------------------------- /src/pcre.js: -------------------------------------------------------------------------------- 1 | // Declare global variables to avoid warnings in JSHint 2 | /* global CodeMirror, define */ 3 | 4 | (function (mod) { 5 | if (typeof exports === "object" && typeof module === "object") // CommonJS 6 | mod(require("codemirror/lib/codemirror")); 7 | else if (typeof define === "function" && define.amd) // AMD 8 | define(["codemirror/lib/codemirror"], mod); 9 | else // Plain browser env 10 | mod(CodeMirror); 11 | })(function (CodeMirror) { 12 | "use strict"; 13 | 14 | CodeMirror.defineMode('pcre', 
function(editor_options, mode_options) { 15 | // Default settings: 16 | var options = { 17 | extended: true, 18 | }; 19 | // Override default settings with user-provided settings: 20 | if ('extended' in mode_options) options.extended = Boolean(mode_options.extended); 21 | 22 | var delimiters = { 23 | '<': '>', 24 | '[': ']', 25 | '{': '}', 26 | '(': ')', 27 | }; 28 | // Behaviour of alphanumeric characters after a backslash character (normal context): 29 | var backslash_in_normal_context = { 30 | '0': 'non-printing-character', 31 | '1': 'backreference', 32 | '2': 'backreference', 33 | '3': 'backreference', 34 | '4': 'backreference', 35 | '5': 'backreference', 36 | '6': 'backreference', 37 | '7': 'backreference', 38 | '8': 'backreference', 39 | '9': 'backreference', 40 | 'A': 'anchor', // \A start of subject 41 | 'B': 'anchor', // \B not a word boundary 42 | 'C': 'generic-character-type', // \C one data unit, even in UTF mode (best avoided) 43 | 'D': 'generic-character-type', // \D any character that is not a decimal digit 44 | 'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message 45 | 'F': '', // \F matches F 46 | 'G': 'anchor', // \G first matching position in subject 47 | 'H': 'generic-character-type', // \H any character that is not a horizontal white space character 48 | 'I': '', // \I matches I 49 | 'J': '', // \J matches J 50 | 'K': 'anchor', // \K reset start of match (neither an anchor nor a simple assertion) 51 | 'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 52 | 'M': '', // \M matches M 53 | 'N': 'generic-character-type', // \N a character that is not a newline 54 | 'O': '', // \O matches O 55 | 'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence 56 | 'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences. 
57 | 'R': 'generic-character-type', // \R a newline sequence 58 | 'S': 'generic-character-type', // \S any character that is not a white space character 59 | 'T': '', // \T matches T 60 | 'U': 'unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 61 | 'V': 'generic-character-type', // \V any character that is not a vertical white space character 62 | 'W': 'generic-character-type', // \W any "non-word" character 63 | 'X': 'generic-character-type', // \X a Unicode extended grapheme cluster 64 | 'Y': '', // \Y matches Y 65 | 'Z': 'anchor', // \Z matches at the end of the subject; also matches before a newline at the end of the subject 66 | 'a': 'non-printing-character', // \a alarm, that is, the BEL character (hex 07) 67 | 'b': 'anchor', // \b word boundary 68 | 'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character 69 | 'd': 'generic-character-type', // \d any decimal digi 70 | 'e': 'non-printing-character', // \e escape (hex 1B) 71 | 'f': 'non-printing-character', // \f form feed (hex 0C) 72 | 'g': 'err a-number-reference-must-not-be-zero', // a numbered reference must not be zero 73 | 'h': 'generic-character-type', // \h any horizontal white space character 74 | 'i': '', // \i matches i 75 | 'j': '', // \j matches j 76 | 'k': 'err backslash-k-is-not-followed-by-a-name', // \k is not followed by a [...] 
name 77 | 'l': 'unsupported-espace-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 78 | 'm': '', // \m matches m 79 | 'n': 'non-printing-character', // \n linefeed (hex 0A) 80 | 'o': '', // \o matches o 81 | 'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence 82 | 'q': '', // \q matches q 83 | 'r': 'non-printing-character', // \r carriage return (hex 0D) 84 | 's': 'generic-character-type', // \s any white space character 85 | 't': 'non-printing-character', // \t tab (hex 09) 86 | 'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 87 | 'v': 'generic-character-type', // \v any vertical white space character 88 | 'w': 'generic-character-type', // any "word" character 89 | 'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT) 90 | 'y': '', // \y matches y 91 | 'z': 'anchor', // \z end of subject 92 | }; 93 | // Behaviour of alphanumeric characters after a backslash character (character class context, i.e. [...]): 94 | var backslash_in_character_class = { 95 | '0': 'non-printing-character', // octal code 96 | '1': 'non-printing-character', // octal code 97 | '2': 'non-printing-character', // octal code 98 | '3': 'non-printing-character', // octal code 99 | '4': 'non-printing-character', // octal code 100 | '5': 'non-printing-character', // octal code 101 | '6': 'non-printing-character', // octal code 102 | '7': 'non-printing-character', // octal code 103 | '8': '', // \8 matches 8 104 | '9': '', // \9 matches 9 105 | 'A': '', // \A matches A 106 | 'B': '', // \B matches B -- \B, \R, and \X are not special inside a character class. 
107 | 'C': '', // \C matches C 108 | 'D': 'generic-character-type', // \D any character that is not a decimal digit 109 | 'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message 110 | 'F': '', // \F matches F 111 | 'G': '', // \G matches G 112 | 'H': 'generic-character-type', // \H any character that is not a horizontal white space character 113 | 'I': '', // \I matches I 114 | 'J': '', // \J matches J 115 | 'K': '', // \K matches K 116 | 'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 117 | 'M': '', // \M matches M 118 | 'N': 'err backslash-n-is-not-supported-in-a-class', // \N is not allowed in a character class. 119 | 'O': '', // \O matches O 120 | 'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence 121 | 'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences. 122 | 'R': '', // \R matches R -- \B, \R, and \X are not special inside a character class. 123 | 'S': 'generic-character-type', // \S any character that is not a white space character 124 | 'T': '', // \T matches T 125 | 'U': 'unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 126 | 'V': 'generic-character-type', // \V any character that is not a vertical white space character 127 | 'W': 'generic-character-type', // \W any "non-word" character 128 | 'X': '', // \X matches X -- \B, \R, and \X are not special inside a character class. 
129 | 'Y': '', // \Y matches Y 130 | 'Z': '', // \Z matches Z 131 | 'a': 'non-printing-character', // \a alarm, that is, the BEL character (hex 07) 132 | 'b': 'non-printing-character', // inside a character class, \b is interpreted as the backspace character (hex 08) 133 | 'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character 134 | 'd': 'generic-character-type', // \d any decimal digi 135 | 'e': 'non-printing-character', // \e escape (hex 1B) 136 | 'f': 'non-printing-character', // \f form feed (hex 0C) 137 | 'g': '', // \g matches g 138 | 'h': 'generic-character-type', // \h any horizontal white space character 139 | 'i': '', // \i matches i 140 | 'j': '', // \j matches j 141 | 'k': '', // \k matches k 142 | 'l': 'unsupported-espace-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 143 | 'm': '', // \m matches m 144 | 'n': 'non-printing-character', // \n linefeed (hex 0A) 145 | 'o': '', // \o matches o 146 | 'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence 147 | 'q': '', // \q matches q 148 | 'r': 'non-printing-character', // \r carriage return (hex 0D) 149 | 's': 'generic-character-type', // \s any white space character 150 | 't': 'non-printing-character', // \t tab (hex 09) 151 | 'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u 152 | 'v': 'generic-character-type', // \v any vertical white space character 153 | 'w': 'generic-character-type', // any "word" character 154 | 'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT) 155 | 'y': '', // \y matches y 156 | 'z': '', // \z matches z 157 | }; 158 | var backslask_p_properties = { 159 | // GENERAL CATEGORY PROPERTIES FOR \p and \P 160 | 'C': 'Other', 161 | 'Cc': 'Control', 162 | 'Cf': 'Format', 163 | 'Cn': 'Unassigned', 164 | 'Co': 'Private use', 165 | 'Cs': 'Surrogate', 166 | 167 | 'L': 'Letter', 168 | 'Ll': 'Lower case letter', 169 | 'Lm': 'Modifier letter', 170 | 
'Lo': 'Other letter', 171 | 'Lt': 'Title case letter', 172 | 'Lu': 'Upper case letter', 173 | 'L&': 'Ll, Lu, or Lt', 174 | 175 | 'M': 'Mark', 176 | 'Mc': 'Spacing mark', 177 | 'Me': 'Enclosing mark', 178 | 'Mn': 'Non-spacing mark', 179 | 180 | 'N': 'Number', 181 | 'Nd': 'Decimal number', 182 | 'Nl': 'Letter number', 183 | 'No': 'Other number', 184 | 185 | 'P': 'Punctuation', 186 | 'Pc': 'Connector punctuation', 187 | 'Pd': 'Dash punctuation', 188 | 'Pe': 'Close punctuation', 189 | 'Pf': 'Final punctuation', 190 | 'Pi': 'Initial punctuation', 191 | 'Po': 'Other punctuation', 192 | 'Ps': 'Open punctuation', 193 | 194 | 'S': 'Symbol', 195 | 'Sc': 'Currency symbol', 196 | 'Sk': 'Modifier symbol', 197 | 'Sm': 'Mathematical symbol', 198 | 'So': 'Other symbol', 199 | 200 | 'Z': 'Separator', 201 | 'Zl': 'Line separator', 202 | 'Zp': 'Paragraph separator', 203 | 'Zs': 'Space separator', 204 | 205 | // PCRE SPECIAL CATEGORY PROPERTIES FOR \p and \P 206 | 'Xan': 'Alphanumeric: union of properties L and N', 207 | 'Xps': 'POSIX space: property Z or tab, NL, VT, FF, CR', 208 | 'Xsp': 'Perl space: property Z or tab, NL, VT, FF, CR', 209 | 'Xuc': 'Univerally-named character: one that can be represented by a Universal Character Name', 210 | 'Xwd': 'Perl word: property Xan or underscore', 211 | 212 | // SCRIPT NAMES FOR \p AND \P 213 | 'Arabic': true, 214 | 'Armenian': true, 215 | 'Avestan': true, 216 | 'Balinese': true, 217 | 'Bamum': true, 218 | 'Bassa_Vah': true, 219 | 'Batak': true, 220 | 'Bengali': true, 221 | 'Bopomofo': true, 222 | 'Brahmi': true, 223 | 'Braille': true, 224 | 'Buginese': true, 225 | 'Buhid': true, 226 | 'Canadian_Aboriginal': true, 227 | 'Carian': true, 228 | 'Caucasian_Albanian': true, 229 | 'Chakma': true, 230 | 'Cham': true, 231 | 'Cherokee': true, 232 | 'Common': true, 233 | 'Coptic': true, 234 | 'Cuneiform': true, 235 | 'Cypriot': true, 236 | 'Cyrillic': true, 237 | 'Deseret': true, 238 | 'Devanagari': true, 239 | 'Duployan': true, 240 | 
'Egyptian_Hieroglyphs': true, 241 | 'Elbasan': true, 242 | 'Ethiopic': true, 243 | 'Georgian': true, 244 | 'Glagolitic': true, 245 | 'Gothic': true, 246 | 'Grantha': true, 247 | 'Greek': true, 248 | 'Gujarati': true, 249 | 'Gurmukhi': true, 250 | 'Han': true, 251 | 'Hangul': true, 252 | 'Hanunoo': true, 253 | 'Hebrew': true, 254 | 'Hiragana': true, 255 | 'Imperial_Aramaic': true, 256 | 'Inherited': true, 257 | 'Inscriptional_Pahlavi': true, 258 | 'Inscriptional_Parthian': true, 259 | 'Javanese': true, 260 | 'Kaithi': true, 261 | 'Kannada': true, 262 | 'Katakana': true, 263 | 'Kayah_Li': true, 264 | 'Kharoshthi': true, 265 | 'Khmer': true, 266 | 'Khojki': true, 267 | 'Khudawadi': true, 268 | 'Lao': true, 269 | 'Latin': true, 270 | 'Lepcha': true, 271 | 'Limbu': true, 272 | 'Linear_A': true, 273 | 'Linear_B': true, 274 | 'Lisu': true, 275 | 'Lycian': true, 276 | 'Lydian': true, 277 | 'Mahajani': true, 278 | 'Malayalam': true, 279 | 'Mandaic': true, 280 | 'Manichaean': true, 281 | 'Meetei_Mayek': true, 282 | 'Mende_Kikakui': true, 283 | 'Meroitic_Cursive': true, 284 | 'Meroitic_Hieroglyphs': true, 285 | 'Miao': true, 286 | 'Modi': true, 287 | 'Mongolian': true, 288 | 'Mro': true, 289 | 'Myanmar': true, 290 | 'Nabataean': true, 291 | 'New_Tai_Lue': true, 292 | 'Nko': true, 293 | 'Ogham': true, 294 | 'Ol_Chiki': true, 295 | 'Old_Italic': true, 296 | 'Old_North_Arabian': true, 297 | 'Old_Permic': true, 298 | 'Old_Persian': true, 299 | 'Old_South_Arabian': true, 300 | 'Old_Turkic': true, 301 | 'Oriya': true, 302 | 'Osmanya': true, 303 | 'Pahawh_Hmong': true, 304 | 'Palmyrene': true, 305 | 'Pau_Cin_Hau': true, 306 | 'Phags_Pa': true, 307 | 'Phoenician': true, 308 | 'Psalter_Pahlavi': true, 309 | 'Rejang': true, 310 | 'Runic': true, 311 | 'Samaritan': true, 312 | 'Saurashtra': true, 313 | 'Sharada': true, 314 | 'Shavian': true, 315 | 'Siddham': true, 316 | 'Sinhala': true, 317 | 'Sora_Sompeng': true, 318 | 'Sundanese': true, 319 | 'Syloti_Nagri': true, 320 | 'Syriac': true, 
321 | 'Tagalog': true, 322 | 'Tagbanwa': true, 323 | 'Tai_Le': true, 324 | 'Tai_Tham': true, 325 | 'Tai_Viet': true, 326 | 'Takri': true, 327 | 'Tamil': true, 328 | 'Telugu': true, 329 | 'Thaana': true, 330 | 'Thai': true, 331 | 'Tibetan': true, 332 | 'Tifinagh': true, 333 | 'Tirhuta': true, 334 | 'Ugaritic': true, 335 | 'Vai': true, 336 | 'Warang_Citi': true, 337 | 'Yi': true, 338 | }; 339 | var backslash_p_regex_string = '[pP]\\{\\^?([\\w&]+)\\}'; 340 | var backslash_p_regex = new RegExp(backslash_p_regex_string); 341 | 342 | var posix_named_sets = { 343 | 'alnum': 'alphanumeric', 344 | 'alpha': 'alphabetic', 345 | 'ascii': '0-127', 346 | 'blank': 'space or tab', 347 | 'cntrl': 'control character', 348 | 'digit': 'decimal digit', 349 | 'graph': 'printing, excluding space', 350 | 'lower': 'lower case letter', 351 | 'print': 'printing, including space', 352 | 'punct': 'printing, excluding alphanumeric', 353 | 'space': 'white space', 354 | 'upper': 'upper case letter', 355 | 'word': 'same as \\w', 356 | 'xdigit': 'hexadecimal digit', 357 | }; 358 | // Include '<' and '>' to spot errors such as [a[:<:]b] 359 | var posix_named_sets_regex_string = '\\[:\\^?([\\w<>]+):]'; 360 | var posix_named_sets_regex = new RegExp(posix_named_sets_regex_string); 361 | 362 | var callout_regex_string = '\\(\\?C(\\d{0,3})\\)'; 363 | var callout_regex = new RegExp(callout_regex_string); 364 | 365 | var assertion_regex_string = '\\(\\? reference by name (Perl) 503 | // \k'name' reference by name (Perl) 504 | // \k{name} reference by name (.NET) 505 | if (stream.match(/k[<'{]/, false)) return push(state, 'backreference'); 506 | if (stream.match(/[0-9]+/)) return 'backreference'; 507 | 508 | // For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either 509 | // in angle brackets or single quotes, is an alternative syntax for referencing a subpattern as a 510 | // "subroutine". 
511 | if (stream.match(/g<[-+]?[0-9]+>/)) return 'subroutine'; 512 | if (stream.match(/g'[-+]?[0-9]+'/)) return 'subroutine'; 513 | if (stream.match(/g[<']/, false)) return push(state, 'subroutine'); 514 | } 515 | // At this stage, we have looked for: 516 | // - a backslash followed by nothing 517 | // - a backslash followed by a single non-alphanumeric character 518 | // - a backslash followed by 1 or more characters to achieve a special, context-dependent meaning 519 | // Look for a backslash followed by a single alphanumeric character: 520 | var backslash_p = in_character_class ? backslash_in_character_class : backslash_in_normal_context; 521 | return backslash_p[stream.next()]; 522 | } 523 | 524 | function handle_name(stream, state) { 525 | var ret, rem, consume_limit; 526 | var ch = stream.next(); 527 | // Names must start with a non-digit. 528 | if (!state.name_value.length && (!ch.match(/\w/) || ch.match(/\d/))) { 529 | ret = 'err erroneous-start-of-name'; 530 | consume_limit = 0; 531 | } 532 | // Names consist of up to 32 alphanumeric characters and underscores. 533 | else if (state.name_value.length > 31) { 534 | ret = 'err name-too-long'; 535 | consume_limit = -1; 536 | } else consume_limit = 32 - state.name_value.length - 1; 537 | state.name_value += ch; 538 | if (consume_limit < 0) { 539 | if (rem = stream.match(/^\w+/)) state.name_value += rem[0]; 540 | } else while (consume_limit --) { 541 | if (rem = stream.match(/^\w/)) state.name_value += rem[0]; 542 | else break; 543 | } 544 | var next_char = stream.peek(); 545 | if (!next_char || !next_char.match(/\w/)) return pop(state, ret); 546 | return all_tokens(state, ret); 547 | } 548 | 549 | function handle_callout(stream, state) { 550 | // (?C) callout 551 | // (?Cn) callout with data n 552 | var rem = stream.match(callout_regex); 553 | if (rem) { 554 | return Number(rem[1]) < 256 ? 
'callout' : 'err erroneous-callout-number'; 555 | } 556 | return false; 557 | } 558 | 559 | function handle_condition_subroutines(stream, state) { 560 | if (stream.peek() === ')') { 561 | pop(state); 562 | return tokenBase(stream, state); 563 | } 564 | stream.eat('R'); 565 | if (stream.eat('&')) return expect_name(state); 566 | stream.match(/\d+/); 567 | return pop(state); 568 | } 569 | 570 | function handle_conditions(stream, state) { 571 | var condition_state = current_context_state(state); 572 | var expected_end = read_expected_end(stream, state); 573 | if (expected_end) return expected_end; 574 | if (condition_state.ok) { 575 | pop(state); 576 | return tokenBase(stream, state); 577 | } 578 | // (?(DEFINE)... define subpattern for reference 579 | if (stream.match(/DEFINE(?=\))/)) { 580 | return pop(state, 'define'); 581 | } 582 | // (?(R)... overall recursion condition 583 | // (?(Rn)... specific group recursion condition 584 | // (?(R&name)...) specific recursion condition 585 | if (stream.match(/R(\d+|&\w+|)\)/, false)) { 586 | condition_state.ok = true; 587 | push(state, 'condition-subroutine'); 588 | return tokenBase(stream, state); 589 | } 590 | // (?(n)... absolute reference condition 591 | // (?(+n)... relative reference condition 592 | // (?(-n)... relative reference condition 593 | if (stream.match(/(-|\+|)\d+/)) { 594 | condition_state.ok = true; 595 | return all_tokens(state, 'backreference'); 596 | } 597 | var rem = stream.match(/([<'])/); 598 | if (rem) { 599 | condition_state.ok = false; 600 | expect_end(state, delimiter(rem[1])); 601 | return expect_name(state); 602 | } 603 | if (stream.match(/\w+/, false)) { 604 | condition_state.ok = true; // the "name" state will handle everything for us 605 | return expect_name(state); 606 | } 607 | // If the condition is not in any of the above formats, it must be an assertion. This may be a positive or 608 | // negative lookahead or lookbehind assertion. 609 | if (stream.match(/\?...) 
named capturing group (Perl) 647 | // (?'name'...) named capturing group (Perl) 648 | // (?P...) named capturing group (Python) 649 | rem = stream.match(/\(\?P?([<'])/); 650 | if (rem) { 651 | expect_end(state, delimiter(rem[1])); 652 | return expect_name(state); 653 | } 654 | // Same as (?: but with options, e.g. (?x-i: 655 | if (stream.match(group_options_regex, false)) { 656 | // As a convenient shorthand, if any option settings are required at the start of a non-capturing 657 | // subpattern, the option letters may appear between the "?" and the ":". 658 | stream.match('(?'); 659 | start_group_state.option_shorthand = 1; 660 | return all_tokens(state); 661 | } 662 | // "(?(" typically marks the start of a condition: (?(condition)yes-pattern|no-pattern) 663 | if (stream.match('(?') && stream.peek() === '(') { 664 | // An explicit callout may be set just before an assertion condition: (?(?C7)(?|&)/); 688 | if (rem) { 689 | expect_end(state, delimiter(rem[1])); 690 | return expect_name(state); 691 | } 692 | stream.next(); 693 | return all_tokens(state, 'err erroneous-subroutine'); 694 | } 695 | 696 | function handle_verb(stream, state) { 697 | var expected_end = read_expected_end(stream, state); 698 | if (expected_end) return expected_end; 699 | expect_end(state, ')'); 700 | return expect_name(state); 701 | } 702 | 703 | function update_options(state, options) { 704 | // We are only interested in x (extended mode). 705 | var enable = true, new_state = null, i = 0, c = null; 706 | for (; i < options.length; ++i) { 707 | c = options[i]; 708 | if (c === '-') enable = false; 709 | else if (c === 'x') new_state = enable; 710 | } 711 | if (new_state !== null) state.extended = new_state; 712 | } 713 | 714 | function tokenBase(stream, state) { 715 | var rem, ret; // stand for Regular Expression Match and RETurn, respectively. 
716 | 717 | // Get current state, current char, next char: 718 | var ch = stream.peek(); 719 | if (!ch) return; 720 | var current_state = current(state); 721 | var group_state; 722 | 723 | if (current_state === 'name') return handle_name(stream, state); 724 | if (current_state === 'condition') return handle_conditions(stream, state); 725 | if (current_state === 'condition-subroutine') return handle_condition_subroutines(stream, state); 726 | if (current_state === 'start-group') return handle_start_group(stream, state); 727 | if (current_state === 'backreference') return handle_backreference(stream, state); 728 | if (current_state === 'subroutine') return handle_subroutine(stream, state); 729 | if (current_state === 'verb') return handle_verb(stream, state); 730 | 731 | if (current_state === 'escaped-sequence') { 732 | if (stream.match('\\E')) return pop(state, 'escaped-sequence-end'); 733 | consume(stream); 734 | return all_tokens(state); 735 | } 736 | 737 | // Escaped characters: 738 | if (stream.match(/\\./, false)) return all_tokens(state, handle_backslash(stream, state)); 739 | 740 | if (stream.match('[', false)) { 741 | if (current_state !== 'character-class') { 742 | if (stream.match(posix_named_sets_regex)) { 743 | return all_tokens(state, 'err posix-outside-class-unsupported'); 744 | } 745 | // In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and 746 | // [[:>:]] is used for matching "start of word" and "end of word". 747 | if (stream.match('[[:<:]]') || stream.match('[[:>:]]')) return all_tokens(state, 'anchor'); 748 | // At this stage, we do have a new character class: 749 | push(state, 'character-class'); 750 | stream.eat('['); 751 | stream.eat('^'); 752 | // If a closing square bracket is required as a member of the class, it should be the first data 753 | // character in the class (after an initial circumflex, if present) or escaped with a backslash. 
754 | // Note: ']' should be on the same line as '[', even in extended mode. 755 | stream.eat(']'); 756 | return all_tokens(state); 757 | } 758 | } 759 | 760 | if (current_state === 'character-class') { 761 | rem = stream.match(posix_named_sets_regex); 762 | if (rem) { 763 | if (rem[1] in posix_named_sets) return all_tokens(state, 'generic-character-type'); 764 | else return all_tokens(state, 'err unknown-posix-class-name'); 765 | } 766 | if (stream.eat(']')) return pop(state); 767 | consume(stream); 768 | return all_tokens(state); 769 | } 770 | 771 | // Regular comments in extended mode: 772 | if (state.extended && stream.eat('#')) { 773 | stream.skipToEnd(); 774 | return 'comment'; 775 | } 776 | 777 | if (stream.eat('{')) { 778 | // exactly n: 779 | if (stream.match(/\d+\}/)) return all_tokens(state, 'quantifier'); 780 | // "at least n, no more than m" and "n or more", greedy, possessive or lazy: 781 | if (stream.match(/\d+,\d*\}[+?]?/)) return all_tokens(state, 'quantifier'); 782 | } 783 | 784 | if (stream.eat('|')) { 785 | return all_tokens(state, 'alternation'); 786 | } 787 | 788 | if (stream.peek() === '(') { 789 | if (stream.match(/\(\*(?:UTF(?:8|16|32|)|UCP|NO_AUTO_POSSESS|NO_START_OPT)\)/)) return all_tokens(state, 'option-sequence'); 790 | if (stream.match(/\(\*LIMIT_(?:RECURSION|MATCH)=[0-9]+\)/)) return all_tokens(state, 'option-sequence'); 791 | // Newline convention + what \R matches: 792 | if (stream.match(/\(\*(?:CR|LF|CRLF|ANYCRLF|ANY|BSR_(?:ANYCRLF|UNICODE))\)/)) return all_tokens(state, 'option-sequence'); 793 | // Backtracking control: 794 | if (stream.match(/\(\*(?:ACCEPT|FAIL|F|COMMIT|PRUNE|SKIP|THEN)\)/)) return all_tokens(state, 'verb'); 795 | if (stream.match(/\(\*(?:MARK|PRUNE|SKIP|THEN|):/)) return push(state, 'verb', {}, 'verb'); 796 | rem = stream.match(option_sequence_regex); 797 | if (rem) { 798 | update_options(state, rem[0]); 799 | return all_tokens(state, 'option-sequence'); 800 | } 801 | // (?#....) 
comment (not nestable) 802 | if (stream.match(/\(\?#[^)]*\)/)) return all_tokens(state, 'comment'); 803 | // (?P=name) reference by name (Python) 804 | if (stream.match(/\(\?P=/, false)) return push(state, 'backreference'); 805 | // (?&name) call subpattern by name (Perl) 806 | // (?P>name) call subpattern by name (Python) 807 | if (stream.match(/\(\?(P>|&)/, false)) return push(state, 'subroutine'); 808 | // (?n) call subpattern by absolute number 809 | // (?+n) call subpattern by relative number 810 | // (?-n) call subpattern by relative number 811 | if (stream.match(/\(\?(\-|\+|)\d+\)/)) return all_tokens(state, 'subroutine'); 812 | // (?R) recurse whole pattern 813 | if (stream.match('(?R)')) return all_tokens(state, 'subroutine'); 814 | // Callouts: 815 | var callout = handle_callout(stream, state); 816 | if (callout) return all_tokens(state, callout); 817 | 818 | // At this stage, we have a new group: 819 | ++ state.group_level; 820 | group_state = 'group' + state.group_level; 821 | push(state, group_state); 822 | 823 | // (?=...) positive look ahead 824 | // (?!...) negative look ahead 825 | // (?<=...) positive look behind 826 | // (?...) 
atomic, non-capturing group 831 | if (stream.match(/\(\?[:|>]/)) return all_tokens(state, 'start-group'); 832 | 833 | /* Any remaining construct that opens with a parenthesis followed by a question mark: the non-consuming match leaves the stream untouched, the start-group state is pushed and tokenBase is re-entered, presumably so that the start-group dispatch near its top (handle_start_group) tokenizes it. */ if (stream.match('(?', false)) { 834 | push(state, 'start-group'); 835 | return tokenBase(stream, state); 836 | } 837 | /* Otherwise this is a plain opening parenthesis: consume it and style it as start-group. */ stream.eat('('); 838 | return all_tokens(state, 'start-group'); 839 | } 840 | 841 | /* Closing parenthesis: when the current state is a group state (its name starts with group), group_level is decremented and the state is popped; otherwise the parenthesis is consumed and reported as unmatched. */ if (stream.peek() === ')') { 842 | if (current_state && current_state.match(/^group/)) { 843 | ret = 'start-group'; // formerly 'end-group' but that used to confuse matchbrackets (see issue #4) 844 | /* leave_closing_parenthesis set on the current context state: the parenthesis is left in the stream and the style passed to pop is empty. */ if (current_context_state(state).leave_closing_parenthesis) ret = ''; 845 | else stream.next(); 846 | -- state.group_level; 847 | return pop(state, ret); 848 | } 849 | stream.next(); 850 | return all_tokens(state, 'err unmatched-closing-parenthesis'); 851 | } 852 | 853 | // Anchors 854 | if (stream.eat('^') || stream.eat('$')) return all_tokens(state, 'anchor'); 855 | if (stream.eat('.')) return all_tokens(state, 'generic-character-type'); 856 | // Quantifiers: 0 or 1, 0 or more, 1 or more, greedy: 857 | if (stream.eat('?') || stream.eat('*') || stream.eat('+')) { 858 | // Handle possessive and lazy variants: 859 | stream.eat(/[+?]/); 860 | return all_tokens(state, 'quantifier'); 861 | } 862 | consume(stream); 863 | return all_tokens(state); 864 | } 865 | /** Creates the initial mode state: empty context and context_state arrays, group_level 0, an empty name_value, and extended copied from options.extended (options is defined outside this excerpt). @returns {Object} a new state object. */ 866 | function startState() { 867 | return { 868 | context: [], 869 | context_state: [], 870 | group_level: 0, 871 | name_value: '', 872 | extended: options.extended, 873 | }; 874 | } 875 | /** Copies a mode state. Starts from startState(), then rebuilds context_state entry by entry: each entry becomes a new object holding every enumerable key of the original entry, where the value under the expected key is duplicated with slice() and every other value is assigned as-is (so non-primitive values stay shared with the original). context is copied with slice(); group_level, name_value and extended are assigned directly, which overrides the extended value startState() took from options. @param {Object} o - the state to copy. @returns {Object} the copy. */ 876 | function copyState(o) { // o = original 877 | var i, oo, oc, key, c = startState(); // c = copy, oo = original object, oc = object copy 878 | for (i = 0; i < o.context_state.length; ++i) { 879 | oo = o.context_state[i]; 880 | oc = {}; 881 | for (key in oo) oc[key] = (key === 'expected') ? 
oo[key].slice() : oo[key]; 882 | c.context_state.push(oc); 883 | } 884 | c.context = o.context.slice(); 885 | c.group_level = o.group_level; 886 | c.name_value = o.name_value; 887 | c.extended = o.extended; 888 | return c; 889 | } 890 | /* Object handed back by the enclosing function (whose header is outside this excerpt): exposes startState and copyState under their own names and tokenBase under the name token. */ 891 | return { 892 | startState: startState, 893 | copyState: copyState, 894 | token: tokenBase, 895 | }; 896 | }); 897 | /* Map both MIME types to the mode named pcre. */ 898 | CodeMirror.defineMIME('text/x-regex', 'pcre'); 899 | CodeMirror.defineMIME('text/x-pcre-regex', 'pcre'); 900 | 901 | }); 902 | --------------------------------------------------------------------------------