├── LICENSE.txt
├── README.md
├── index.html
├── package.json
└── src
├── pcre.css
└── pcre.js
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2021-2025, Xavier G.
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CodeMirror PCRE mode
2 |
3 | This is a [CodeMirror](https://codemirror.net/) mode that brings syntax highlighting for [Perl Compatible Regular Expressions (PCRE)](https://www.pcre.org/).
4 |
5 | ## How to use
6 | ### Basic use
7 | Load `pcre.js` and `pcre.css` at appropriate locations in your HTML structure.
8 | Mention `mode: 'pcre'` when creating your CodeMirror instance.
9 |
10 | ### Configuration
11 | codemirror-mode-pcre supports extended mode (`x` flag) and actually enables it by default. This can be turned off by passing `extended: false` when creating the CodeMirror instance.
12 |
13 | ### Theming
14 | This mode does not leverage CodeMirror's default tokens (they are not exactly fitted for regular expressions). Consequently, if you use a theme other than the default one, you will likely want to write your own `pcre.css` file.
15 |
16 | **Breaking change:** starting with version 2.0.0, the `cm-end-group` style is no longer supported and should no longer be used. It is replaced with `cm-start-group`.
17 |
18 | ### Nesting
19 | codemirror-mode-pcre can be nested within another mode, i.e. it can highlight regular expressions for another mode.
20 | This requires adjusting the other mode though.
21 | See the demo page for an example of such nesting.
22 |
23 | ## Non-features
24 | codemirror-mode-pcre does **not** offer:
25 | - completion (e.g. suggesting POSIX class names or script names for `\p` and `\P`);
26 | - tooltips reflecting what the various parts of an expression actually mean.
27 |
28 | Those may come in the future though.
29 |
30 | ## License
31 | Like the PCRE library, this mode is released under the 3-clause BSD license.
32 |
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
52 | Below are demonstrations of how the PCRE mode can be used to highlight regular expressions within other languages.
53 | This first example is an nginx configuration snippet:
54 |
55 |
70 |
This second example is a simple list of one-line regexes with comments:
71 |
79 |
man pcresyntax
80 |
81 | This is a slightly adjusted copy of `man pcresyntax`. This man page reflects most PCRE syntactic structures in a
82 | colourful way thanks to CodeMirror and the PCRE mode.
83 |
84 |
450 |
Examples from man pcrepattern
451 |
452 | `man pcrepattern` is much longer than `man pcresyntax`, which is why only its examples were reproduced below.
453 |
454 |
635 |
Other details
636 |
747 |
753 |
759 |
760 |
813 |
834 |
857 |
858 |
859 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "codemirror-mode-pcre",
3 | "version": "2.0.0",
4 | "description": "Perl Compatible Regular Expressions (PCRE) mode for CodeMirror",
5 | "main": "src/pcre.js",
6 | "scripts": {
7 | "test": "true"
8 | },
9 | "repository": {
10 | "type": "git",
11 | "url": "git+https://github.com/xavierog/codemirror-mode-pcre.git"
12 | },
13 | "keywords": [
14 | "codemirror-mode",
15 | "pcre",
16 | "pcre-regex"
17 | ],
18 | "author": "Xavier G.",
19 | "license": "BSD-3-Clause",
20 | "bugs": {
21 | "url": "https://github.com/xavierog/codemirror-mode-pcre/issues"
22 | },
23 | "homepage": "https://github.com/xavierog/codemirror-mode-pcre#readme"
24 | }
25 |
--------------------------------------------------------------------------------
/src/pcre.css:
--------------------------------------------------------------------------------
1 | .cm-s-default span.cm-group1 { background: #ffff7f40; }
2 | .cm-s-default span.cm-group2 { background: #aaffff40; }
3 | .cm-s-default span.cm-group3 { background: #ffff7f60; }
4 | .cm-s-default span.cm-group4 { background: #aaffff60; }
5 | .cm-s-default span.cm-group5 { background: #ffff7f80; }
6 | .cm-s-default span.cm-group6 { background: #aaffff80; }
7 | .cm-s-default span.cm-group7 { background: #ffff7fa0; }
8 | .cm-s-default span.cm-group8 { background: #aaffffa0; }
9 |
10 | .cm-s-default span.cm-start-group,
11 | .cm-s-default span.cm-alternation
12 | {
13 | font-weight: bold;
14 | }
15 |
16 | .cm-s-default span.cm-condition { }
17 |
18 | .cm-s-default span.cm-option-sequence { background: #5cb85c; color: white; font-weight: bold; }
19 | .cm-s-default span.cm-verb { background: #5459b8; color: white; font-weight: bold; }
20 |
21 | .cm-s-default span.cm-backreference { color: #25ba36; font-weight: bold; }
22 | .cm-s-default span.cm-callout { background: #ff00ff; color: white; }
23 | .cm-s-default span.cm-condition-subroutine,
24 | .cm-s-default span.cm-subroutine { color: #aa00ff; font-weight: bold; }
25 | .cm-s-default span.cm-quantifier { background: #aad1f760; }
26 |
27 | .cm-s-default span.cm-character-class { background: #f9ca6960; }
28 |
29 | .cm-s-default span.cm-non-printing-character { color: #aa0000; }
30 | .cm-s-default span.cm-generic-character-type { color: blue; }
31 |
32 | .cm-s-default span.cm-escaped-character { font-weight: bold; }
33 | .cm-s-default span.cm-escaped-sequence-start,
34 | .cm-s-default span.cm-escaped-sequence-end,
35 | .cm-s-default span.cm-escaped-sequence {
36 | border-top: 1px solid #7f7f7f;
37 | border-bottom: 1px solid #7f7f7f;
38 | }
39 | .cm-s-default span.cm-escaped-sequence-start,
40 | .cm-s-default span.cm-escaped-sequence-end {
41 | color: white;
42 | background-color: #00000080;
43 | }
44 |
45 | .cm-s-default span.cm-anchor { background: black; color: white; font-weight: bold; }
46 |
47 | .cm-s-default span.cm-define { background: blue; color: yellow; }
48 | .cm-s-default span.cm-name { color: blue; font-weight: normal; }
49 | .cm-s-default span.cm-verb.cm-name { color: yellow; }
50 | .cm-s-default span.cm-err { background: #ff4300c0; }
51 |
--------------------------------------------------------------------------------
/src/pcre.js:
--------------------------------------------------------------------------------
1 | // Declare global variables to avoid warnings in JSHint
2 | /* global CodeMirror, define */
3 |
4 | (function (mod) {
5 | if (typeof exports === "object" && typeof module === "object") // CommonJS
6 | mod(require("codemirror/lib/codemirror"));
7 | else if (typeof define === "function" && define.amd) // AMD
8 | define(["codemirror/lib/codemirror"], mod);
9 | else // Plain browser env
10 | mod(CodeMirror);
11 | })(function (CodeMirror) {
12 | "use strict";
13 |
14 | CodeMirror.defineMode('pcre', function(editor_options, mode_options) {
// Effective mode settings, initialised with the defaults:
var options = {
    extended: true,
};
// A user-provided `extended` setting wins over the default and is coerced to a boolean:
if ('extended' in mode_options) {
    options.extended = !!mode_options.extended;
}
21 |
// Opening delimiter -> closing delimiter that pairs with it:
var delimiters = { '<': '>', '[': ']', '{': '}', '(': ')' };
// Behaviour of alphanumeric characters after a backslash character (normal context).
// Each value is the token emitted for the escape; an empty string means the escape simply matches the
// character itself, and tokens starting with "err" flag sequences that PCRE rejects or that are malformed.
var backslash_in_normal_context = {
    '0': 'non-printing-character',
    '1': 'backreference',
    '2': 'backreference',
    '3': 'backreference',
    '4': 'backreference',
    '5': 'backreference',
    '6': 'backreference',
    '7': 'backreference',
    '8': 'backreference',
    '9': 'backreference',
    'A': 'anchor', // \A start of subject
    'B': 'anchor', // \B not a word boundary
    'C': 'generic-character-type', // \C one data unit, even in UTF mode (best avoided)
    'D': 'generic-character-type', // \D any character that is not a decimal digit
    'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message
    'F': '', // \F matches F
    'G': 'anchor', // \G first matching position in subject
    'H': 'generic-character-type', // \H any character that is not a horizontal white space character
    'I': '', // \I matches I
    'J': '', // \J matches J
    'K': 'anchor', // \K reset start of match (neither an anchor nor a simple assertion)
    'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
    'M': '', // \M matches M
    'N': 'generic-character-type', // \N a character that is not a newline
    'O': '', // \O matches O
    'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
    'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences.
    'R': 'generic-character-type', // \R a newline sequence
    'S': 'generic-character-type', // \S any character that is not a white space character
    'T': '', // \T matches T
    'U': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
    'V': 'generic-character-type', // \V any character that is not a vertical white space character
    'W': 'generic-character-type', // \W any "non-word" character
    'X': 'generic-character-type', // \X a Unicode extended grapheme cluster
    'Y': '', // \Y matches Y
    'Z': 'anchor', // \Z matches at the end of the subject; also matches before a newline at the end of the subject
    'a': 'non-printing-character', // \a alarm, that is, the BEL character (hex 07)
    'b': 'anchor', // \b word boundary
    'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character
    'd': 'generic-character-type', // \d any decimal digit
    'e': 'non-printing-character', // \e escape (hex 1B)
    'f': 'non-printing-character', // \f form feed (hex 0C)
    'g': 'err a-number-reference-must-not-be-zero', // a numbered reference must not be zero
    'h': 'generic-character-type', // \h any horizontal white space character
    'i': '', // \i matches i
    'j': '', // \j matches j
    'k': 'err backslash-k-is-not-followed-by-a-name', // \k is not followed by a [...] name
    'l': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
    'm': '', // \m matches m
    'n': 'non-printing-character', // \n linefeed (hex 0A)
    'o': '', // \o matches o
    'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
    'q': '', // \q matches q
    'r': 'non-printing-character', // \r carriage return (hex 0D)
    's': 'generic-character-type', // \s any white space character
    't': 'non-printing-character', // \t tab (hex 09)
    'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
    'v': 'generic-character-type', // \v any vertical white space character
    'w': 'generic-character-type', // \w any "word" character
    'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT)
    'y': '', // \y matches y
    'z': 'anchor', // \z end of subject
};
// Behaviour of alphanumeric characters after a backslash character (character class context, i.e. [...]).
// Each value is the token emitted for the escape; an empty string means the escape simply matches the
// character itself, and tokens starting with "err" flag sequences that PCRE rejects or that are malformed.
var backslash_in_character_class = {
    '0': 'non-printing-character', // octal code
    '1': 'non-printing-character', // octal code
    '2': 'non-printing-character', // octal code
    '3': 'non-printing-character', // octal code
    '4': 'non-printing-character', // octal code
    '5': 'non-printing-character', // octal code
    '6': 'non-printing-character', // octal code
    '7': 'non-printing-character', // octal code
    '8': '', // \8 matches 8
    '9': '', // \9 matches 9
    'A': '', // \A matches A
    'B': '', // \B matches B -- \B, \R, and \X are not special inside a character class.
    'C': '', // \C matches C
    'D': 'generic-character-type', // \D any character that is not a decimal digit
    'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message
    'F': '', // \F matches F
    'G': '', // \G matches G
    'H': 'generic-character-type', // \H any character that is not a horizontal white space character
    'I': '', // \I matches I
    'J': '', // \J matches J
    'K': '', // \K matches K
    'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
    'M': '', // \M matches M
    'N': 'err backslash-n-is-not-supported-in-a-class', // \N is not allowed in a character class.
    'O': '', // \O matches O
    'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
    'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences.
    'R': '', // \R matches R -- \B, \R, and \X are not special inside a character class.
    'S': 'generic-character-type', // \S any character that is not a white space character
    'T': '', // \T matches T
    'U': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
    'V': 'generic-character-type', // \V any character that is not a vertical white space character
    'W': 'generic-character-type', // \W any "non-word" character
    'X': '', // \X matches X -- \B, \R, and \X are not special inside a character class.
    'Y': '', // \Y matches Y
    'Z': '', // \Z matches Z
    'a': 'non-printing-character', // \a alarm, that is, the BEL character (hex 07)
    'b': 'non-printing-character', // inside a character class, \b is interpreted as the backspace character (hex 08)
    'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character
    'd': 'generic-character-type', // \d any decimal digit
    'e': 'non-printing-character', // \e escape (hex 1B)
    'f': 'non-printing-character', // \f form feed (hex 0C)
    'g': '', // \g matches g
    'h': 'generic-character-type', // \h any horizontal white space character
    'i': '', // \i matches i
    'j': '', // \j matches j
    'k': '', // \k matches k
    'l': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
    'm': '', // \m matches m
    'n': 'non-printing-character', // \n linefeed (hex 0A)
    'o': '', // \o matches o
    'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
    'q': '', // \q matches q
    'r': 'non-printing-character', // \r carriage return (hex 0D)
    's': 'generic-character-type', // \s any white space character
    't': 'non-printing-character', // \t tab (hex 09)
    'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
    'v': 'generic-character-type', // \v any vertical white space character
    'w': 'generic-character-type', // \w any "word" character
    'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT)
    'y': '', // \y matches y
    'z': '', // \z matches z
};
// Property names accepted between the braces of \p{...} and \P{...}.
// General category and PCRE special properties map to a human-readable description; script names map to true.
// (The identifier's spelling, "backslask", is kept as-is because other code refers to it under that name.)
var backslask_p_properties = (function () {
    var properties = {
        // GENERAL CATEGORY PROPERTIES FOR \p and \P
        'C': 'Other',
        'Cc': 'Control',
        'Cf': 'Format',
        'Cn': 'Unassigned',
        'Co': 'Private use',
        'Cs': 'Surrogate',

        'L': 'Letter',
        'Ll': 'Lower case letter',
        'Lm': 'Modifier letter',
        'Lo': 'Other letter',
        'Lt': 'Title case letter',
        'Lu': 'Upper case letter',
        'L&': 'Ll, Lu, or Lt',

        'M': 'Mark',
        'Mc': 'Spacing mark',
        'Me': 'Enclosing mark',
        'Mn': 'Non-spacing mark',

        'N': 'Number',
        'Nd': 'Decimal number',
        'Nl': 'Letter number',
        'No': 'Other number',

        'P': 'Punctuation',
        'Pc': 'Connector punctuation',
        'Pd': 'Dash punctuation',
        'Pe': 'Close punctuation',
        'Pf': 'Final punctuation',
        'Pi': 'Initial punctuation',
        'Po': 'Other punctuation',
        'Ps': 'Open punctuation',

        'S': 'Symbol',
        'Sc': 'Currency symbol',
        'Sk': 'Modifier symbol',
        'Sm': 'Mathematical symbol',
        'So': 'Other symbol',

        'Z': 'Separator',
        'Zl': 'Line separator',
        'Zp': 'Paragraph separator',
        'Zs': 'Space separator',

        // PCRE SPECIAL CATEGORY PROPERTIES FOR \p and \P
        'Xan': 'Alphanumeric: union of properties L and N',
        'Xps': 'POSIX space: property Z or tab, NL, VT, FF, CR',
        'Xsp': 'Perl space: property Z or tab, NL, VT, FF, CR',
        'Xuc': 'Universally-named character: one that can be represented by a Universal Character Name',
        'Xwd': 'Perl word: property Xan or underscore',
    };

    // SCRIPT NAMES FOR \p AND \P
    var script_names = [
        'Arabic', 'Armenian', 'Avestan',
        'Balinese', 'Bamum', 'Bassa_Vah', 'Batak', 'Bengali', 'Bopomofo', 'Brahmi', 'Braille', 'Buginese', 'Buhid',
        'Canadian_Aboriginal', 'Carian', 'Caucasian_Albanian', 'Chakma', 'Cham', 'Cherokee', 'Common', 'Coptic',
        'Cuneiform', 'Cypriot', 'Cyrillic',
        'Deseret', 'Devanagari', 'Duployan',
        'Egyptian_Hieroglyphs', 'Elbasan', 'Ethiopic',
        'Georgian', 'Glagolitic', 'Gothic', 'Grantha', 'Greek', 'Gujarati', 'Gurmukhi',
        'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
        'Imperial_Aramaic', 'Inherited', 'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
        'Javanese',
        'Kaithi', 'Kannada', 'Katakana', 'Kayah_Li', 'Kharoshthi', 'Khmer', 'Khojki', 'Khudawadi',
        'Lao', 'Latin', 'Lepcha', 'Limbu', 'Linear_A', 'Linear_B', 'Lisu', 'Lycian', 'Lydian',
        'Mahajani', 'Malayalam', 'Mandaic', 'Manichaean', 'Meetei_Mayek', 'Mende_Kikakui', 'Meroitic_Cursive',
        'Meroitic_Hieroglyphs', 'Miao', 'Modi', 'Mongolian', 'Mro', 'Myanmar',
        'Nabataean', 'New_Tai_Lue', 'Nko',
        'Ogham', 'Ol_Chiki', 'Old_Italic', 'Old_North_Arabian', 'Old_Permic', 'Old_Persian', 'Old_South_Arabian',
        'Old_Turkic', 'Oriya', 'Osmanya',
        'Pahawh_Hmong', 'Palmyrene', 'Pau_Cin_Hau', 'Phags_Pa', 'Phoenician', 'Psalter_Pahlavi',
        'Rejang', 'Runic',
        'Samaritan', 'Saurashtra', 'Sharada', 'Shavian', 'Siddham', 'Sinhala', 'Sora_Sompeng', 'Sundanese',
        'Syloti_Nagri', 'Syriac',
        'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tai_Tham', 'Tai_Viet', 'Takri', 'Tamil', 'Telugu', 'Thaana', 'Thai',
        'Tibetan', 'Tifinagh', 'Tirhuta',
        'Ugaritic',
        'Vai',
        'Warang_Citi',
        'Yi',
    ];
    for (var i = 0; i < script_names.length; ++ i) {
        properties[script_names[i]] = true;
    }
    return properties;
})();
// Matches what follows the backslash in \p{name}, \P{name}, \p{^name} and \P{^name}; group 1 captures the name:
var backslash_p_regex_string = '[pP]\\{\\^?([\\w&]+)\\}';
var backslash_p_regex = new RegExp(backslash_p_regex_string);
341 |
// POSIX named sets usable as [:name:] -- each name maps to a short description of what the set matches:
var posix_named_sets = {
    'alnum':  'alphanumeric',
    'alpha':  'alphabetic',
    'ascii':  '0-127',
    'blank':  'space or tab',
    'cntrl':  'control character',
    'digit':  'decimal digit',
    'graph':  'printing, excluding space',
    'lower':  'lower case letter',
    'print':  'printing, including space',
    'punct':  'printing, excluding alphanumeric',
    'space':  'white space',
    'upper':  'upper case letter',
    'word':   'same as \\w',
    'xdigit': 'hexadecimal digit',
};
// Group 1 captures the set name. '<' and '>' are accepted in names on purpose, so that erroneous constructs
// such as [a[:<:]b] are still recognised as (invalid) named sets:
var posix_named_sets_regex_string = '\\[:\\^?([\\w<>]+):]';
var posix_named_sets_regex = new RegExp(posix_named_sets_regex_string);
361 |
// Pattern sources first...
// - callout: (?C) or (?Cn), n being up to 3 digits captured in group 1
// - assertion: start of a lookahead, i.e. (?= or (?!
// - condition callout: a callout immediately followed by the start of an assertion
var callout_regex_string = '\\(\\?C(\\d{0,3})\\)';
var assertion_regex_string = '\\(\\?[=!]';
var condition_callout_regex_string = [callout_regex_string, assertion_regex_string].join('');

// ... then the compiled expressions:
var callout_regex = new RegExp(callout_regex_string);
var assertion_regex = new RegExp(assertion_regex_string);
var condition_callout_regex = new RegExp(condition_callout_regex_string);
370 |
// (?i) caseless
// (?J) allow duplicate names
// (?m) multiline
// (?s) single line (dotall)
// (?U) default ungreedy (lazy)
// (?x) extended (ignore white space)
// (?-...) unset option(s)
// + combinations e.g. (?im-sx) or (?iJm-s-U-x)
//
// This accepts exactly the same strings as the former '(?:-?[iJmsUx]+)+' (runs of option letters, each run
// optionally preceded by a single '-'), but without nesting one quantifier inside another: the nested form
// backtracks exponentially on inputs such as "(?iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii" that lack the closing
// delimiter. The outer non-capturing group keeps the string safe to embed in larger expressions.
var options_regex_string = '(?:-?[iJmsUx]+(?:-[iJmsUx]+)*)';

// Standalone option sequence, e.g. (?x-i)
var option_sequence_regex_string = '\\(\\?' + options_regex_string + '\\)';
var option_sequence_regex = new RegExp(option_sequence_regex_string);

// Start of non-capturing group with options, e.g. (?i-U:
var group_options_regex_string = '\\(\\?' + options_regex_string + ':';
var group_options_regex = new RegExp(group_options_regex_string);
388 |
389 | // Helper functions:
// Returns the closing delimiter that pairs with `ch`.
// A character that is not a known opening delimiter (e.g. a single quote) is its own closing delimiter.
function delimiter(ch) {
    if (ch in delimiters) {
        return delimiters[ch];
    }
    return ch;
}
// Returns the innermost (most recently pushed) context, or false when the context stack is empty.
function current(state) {
    var depth = state.context.length;
    return depth ? state.context[depth - 1] : false;
}
// Advances the stream by a run of word characters if there is one, by a single character otherwise.
function consume(stream) {
    // As a nested mode, we should not consume too much, so that the nesting mode stays in charge.
    // That said, eating \w is usually safe:
    var word = stream.match(/\w+/);
    if (word) {
        return;
    }
    stream.next();
}
// Builds the style string for the current position: every context on the stack, separated by spaces,
// followed by `token` when one is given.
function all_tokens(state, token) {
    var contexts = state.context.join(' ');
    if (!token) {
        return contexts;
    }
    // Only insert a separator when there is something before it: leading spaces confuse matchbrackets
    // (see issue #4).
    return contexts ? contexts + ' ' + token : contexts + token;
}
// Enters a new context (with its own state object, empty unless one is provided).
// Returns the style computed *before* entering, i.e. the new context is not part of the returned style.
function push(state, new_context, new_context_state, token) {
    var style_before_push = all_tokens(state, token);
    var entry_state = new_context_state || {};
    state.context.push(new_context);
    state.context_state.push(entry_state);
    return style_before_push;
}
// Leaves the innermost context and discards its state.
// The returned style still includes the context that was just left, followed by `token` when one is given.
function pop(state, token) {
    var left_context = state.context.pop();
    state.context_state.pop();
    var trailing = token ? left_context + ' ' + token : left_context;
    return all_tokens(state, trailing);
}
// Returns the state object attached to the innermost context (undefined when the stack is empty).
function current_context_state(state) {
    var stack = state.context_state;
    var top = stack.length - 1;
    return stack[top];
}
// Starts reading a name: resets the accumulated name and enters the 'name' context.
// Returns the style computed by push().
function expect_name(state) {
    state.name_value = '';
    var style = push(state, 'name');
    return style;
}
// Records, in the state of the innermost context, the characters that must terminate it.
// `end_string` is split into a fresh array of single characters stored as `expected`.
// Returns the state object of the innermost context.
function expect_end(state, end_string) {
    var context_state = current_context_state(state);
    // Index-based copy: one array entry per element of end_string, in order.
    context_state.expected = Array.prototype.slice.call(end_string);
    return context_state;
}
// Consumes one character of the terminator recorded by expect_end(), if any is still pending.
// Returns:
// - false when the innermost context expects no (more) terminating characters -- nothing is consumed;
// - an error style when the character read is not the expected one;
// - the style for the character otherwise; the context is left once the whole terminator was read.
function read_expected_end(stream, state) {
    var pending = current_context_state(state).expected;
    if (!pending || !pending.length) {
        return false;
    }
    var wanted = pending.shift();
    var found = stream.next();
    if (found !== wanted) {
        return all_tokens(state, 'err erroneous-end-of-token');
    }
    return pending.length ? all_tokens(state) : pop(state);
}
456 |
// Tokenizes one escape sequence; the stream is expected to be positioned on the backslash.
// Returns the style of the sequence. It may be an empty string for escapes that merely match the escaped
// character, or an "err ..." style for sequences that are malformed or not supported by PCRE. References by
// name (\g{name}, \k<name>, \g<name>, ...) are not consumed here: the matching context is pushed instead.
function handle_backslash(stream, state) {
    stream.eat('\\');
    if (!stream.peek()) return 'err backslash-at-end-of-pattern';

    // The backslash character has several uses. Firstly, if it is followed by a character that is not a number
    // or a letter, it takes away any special meaning that character may have.
    if (stream.match(/[^0-9a-zA-Z]/)) return 'escaped-character';

    // \Q is used to start an escaped sequence:
    // NOTE(review): when already inside an escaped sequence, the 'Q' has been consumed by the time the
    // context test fails -- presumably unreachable, confirm against the caller.
    if (stream.match('Q') && current(state) !== 'escaped-sequence') {
        push(state, 'escaped-sequence');
        return 'escaped-sequence-start';
    }

    // \cx "control-x", where x is any ASCII character
    if (stream.match(/c[ -~]/)) return 'non-printing-character';

    // \0dd character with octal code 0dd
    if (stream.match(/0[0-7]{0,2}/)) return 'non-printing-character';
    // \ddd character with octal code ddd, or back reference
    if (stream.match(/[1-7][0-7]{1,2}/)) return 'non-printing-character';
    // \o{ddd..} character with octal code ddd..
    if (stream.match(/o\{[0-7]+\}/)) return 'non-printing-character';
    // \x{hhh..} character with hex code hhh.. (non-JavaScript mode)
    if (stream.match(/x\{[0-9a-fA-F]+}/)) return 'non-printing-character';
    // \xhh character with hex code hh
    if (stream.match(/x[0-9a-fA-F]{0,2}/)) return 'non-printing-character';
    // \uhhhh character with hex code hhhh (JavaScript mode only)
    if (stream.match(/u[0-9a-fA-F]{4}/)) return 'non-printing-character';

    // \p{...} and \P{...}:
    var rem = stream.match(backslash_p_regex);
    if (rem) {
        // Own properties only: the `in` operator would also accept names inherited from Object.prototype,
        // e.g. \p{constructor} or \p{toString}.
        if (Object.prototype.hasOwnProperty.call(backslask_p_properties, rem[1])) return 'generic-character-type';
        else return 'err unknown-property-name-after-p';
    }
    // \pL, \PN, ...: braces are optional around a single-letter general category property:
    if (stream.match(/[pP][CLMNPSZ]/)) return 'generic-character-type';

    var in_character_class = (current(state) === 'character-class');
    // Nothing in this condition can be found in a character class:
    if (!in_character_class) {
        // The sequence \g followed by an unsigned or a negative number, optionally enclosed in braces, is an
        // absolute or relative back reference. A named back reference can be coded as \g{name}.
        if (stream.match(/g-?[0-9]+/)) return 'backreference';
        if (stream.match(/g\{-?[0-9]+\}/)) return 'backreference';
        if (stream.match(/g\{/, false)) return push(state, 'backreference');
        // \k<name> reference by name (Perl)
        // \k'name' reference by name (Perl)
        // \k{name} reference by name (.NET)
        if (stream.match(/k[<'{]/, false)) return push(state, 'backreference');
        if (stream.match(/[0-9]+/)) return 'backreference';

        // For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either
        // in angle brackets or single quotes, is an alternative syntax for referencing a subpattern as a
        // "subroutine".
        if (stream.match(/g<[-+]?[0-9]+>/)) return 'subroutine';
        if (stream.match(/g'[-+]?[0-9]+'/)) return 'subroutine';
        if (stream.match(/g[<']/, false)) return push(state, 'subroutine');
    }
    // At this stage, we have looked for:
    // - a backslash followed by nothing
    // - a backslash followed by a single non-alphanumeric character
    // - a backslash followed by 1 or more characters to achieve a special, context-dependent meaning
    // Look for a backslash followed by a single alphanumeric character:
    var backslash_p = in_character_class ? backslash_in_character_class : backslash_in_normal_context;
    return backslash_p[stream.next()];
}
523 |
// Reads (part of) a name and accumulates it in state.name_value.
// The 'name' context is left as soon as the next character cannot be part of a name. Otherwise it is kept so
// that the remainder of the name is handled (and possibly flagged as too long) by the next call.
function handle_name(stream, state) {
    var error_token, matched, remaining;
    var first = stream.next();
    var already_read = state.name_value.length;

    if (!already_read && !(first.match(/\w/) && !first.match(/\d/))) {
        // Names must start with a non-digit.
        error_token = 'err erroneous-start-of-name';
        remaining = 0;
    } else if (already_read > 31) {
        // Names consist of up to 32 alphanumeric characters and underscores.
        error_token = 'err name-too-long';
        remaining = -1;
    } else {
        remaining = 32 - already_read - 1;
    }

    state.name_value += first;
    if (remaining < 0) {
        // The name is too long already: swallow whatever is left of it in one go.
        matched = stream.match(/^\w+/);
        if (matched) state.name_value += matched[0];
    } else {
        // Read one character at a time, up to the 32-character limit.
        for (; remaining > 0; remaining --) {
            matched = stream.match(/^\w/);
            if (!matched) break;
            state.name_value += matched[0];
        }
    }

    var upcoming = stream.peek();
    if (upcoming && upcoming.match(/\w/)) {
        return all_tokens(state, error_token);
    }
    return pop(state, error_token);
}
548 |
// Tokenizes a callout:
// (?C) callout
// (?Cn) callout with data n
// Returns false (consuming nothing) when the stream does not start with a callout.
function handle_callout(stream, state) {
    var found = stream.match(callout_regex);
    if (!found) {
        return false;
    }
    // An absent number converts to 0; numbers from 256 upwards are flagged as erroneous:
    if (Number(found[1]) < 256) {
        return 'callout';
    }
    return 'err erroneous-callout-number';
}
558 |
// Tokenizes the R, Rn or R&name part of a recursion condition.
// - On a closing parenthesis, the context is left and tokenizing is handed back to tokenBase().
// - "R&" switches to reading a name.
// - Otherwise an optional "R" and optional digits are consumed and the context is left.
function handle_condition_subroutines(stream, state) {
    if (stream.peek() !== ')') {
        stream.eat('R');
        if (stream.eat('&')) {
            return expect_name(state);
        }
        stream.match(/\d+/);
        return pop(state);
    }
    pop(state);
    return tokenBase(stream, state);
}
569 |
// Tokenizes the condition of a conditional group (presumably what sits between "(?(" and the matching ")";
// the dispatch to this function is not visible from here).
// The `ok` flag of the context state records that the condition itself was recognised, so that the next call
// leaves the context and hands tokenizing back to tokenBase().
function handle_conditions(stream, state) {
    // Fetch the context state first: read_expected_end() may leave the context.
    var cond = current_context_state(state);
    var closing = read_expected_end(stream, state);
    if (closing) {
        return closing;
    }
    if (cond.ok) {
        pop(state);
        return tokenBase(stream, state);
    }

    // (?(DEFINE)... define subpattern for reference
    if (stream.match(/DEFINE(?=\))/)) {
        return pop(state, 'define');
    }

    // (?(R)... overall recursion condition
    // (?(Rn)... specific group recursion condition
    // (?(R&name)...) specific recursion condition
    if (stream.match(/R(\d+|&\w+|)\)/, false)) {
        cond.ok = true;
        push(state, 'condition-subroutine');
        return tokenBase(stream, state);
    }

    // (?(n)... absolute reference condition
    // (?(+n)... relative reference condition
    // (?(-n)... relative reference condition
    if (stream.match(/(-|\+|)\d+/)) {
        cond.ok = true;
        return all_tokens(state, 'backreference');
    }

    // (?(<name>)... and (?('name')...: a delimited name; the closing delimiter is read once the name is done.
    var opening = stream.match(/([<'])/);
    if (opening) {
        cond.ok = false;
        expect_end(state, delimiter(opening[1]));
        return expect_name(state);
    }

    // (?(name)...: a bare name
    if (stream.match(/\w+/, false)) {
        cond.ok = true; // the "name" state will handle everything for us
        return expect_name(state);
    }

    // If the condition is not in any of the above formats, it must be an assertion. This may be a positive or
    // negative lookahead or lookbehind assertion.
    if (stream.match(/\?[=!]/)) {
        cond.ok = true; // the "group" state will handle everything for us
        // Ensure "group" leaves the closing parenthesis untouched so "start-group" can consume it:
        var assertion_group_state = {'leave_closing_parenthesis': true};
        var assertion_group = 'group' + (++ state.group_level);
        return push(state, assertion_group, assertion_group_state, 'start-group');
    }

    // Nothing recognisable: flag a single character as erroneous and carry on.
    stream.next();
    return all_tokens(state, 'err erroneous-condition');
}
618 |
619 | function handle_start_group(stream, state) {
620 | var start_group_state = current_context_state(state);
621 | var expected_end = read_expected_end(stream, state);
622 | if (expected_end) return expected_end;
623 | var rem;
624 | if (start_group_state.option_shorthand === 1) {
625 | // A shorthand option was spotted, handle it:
626 | start_group_state.option_shorthand = 2;
627 | stream.match(/[^:]+/);
628 | return all_tokens(state, 'option-sequence');
629 | }
630 | if (start_group_state.option_shorthand === 2) {
631 | // A shorthand option was handled, finish the job:
632 | stream.eat(':');
633 | return pop(state);
634 | }
635 | if (start_group_state.condition_callout === 1) {
636 | // A pre-condition callout was spotted, handle it:
637 | start_group_state.condition_callout = 2;
638 | return all_tokens(state, handle_callout(stream, state));
639 | }
640 | if (start_group_state.condition_callout === 2) {
641 | // A pre-condition callout was handled, resume
642 | stream.eat('(');
643 | expect_end(state, ')');
644 | return push(state, 'condition');
645 | }
646 | // (?<name>...) named capturing group (Perl)
647 | // (?'name'...) named capturing group (Perl)
648 | // (?P<name>...) named capturing group (Python)
649 | rem = stream.match(/\(\?P?([<'])/);
650 | if (rem) {
651 | expect_end(state, delimiter(rem[1]));
652 | return expect_name(state);
653 | }
654 | // Same as (?: but with options, e.g. (?x-i:
655 | if (stream.match(group_options_regex, false)) {
656 | // As a convenient shorthand, if any option settings are required at the start of a non-capturing
657 | // subpattern, the option letters may appear between the "?" and the ":".
658 | stream.match('(?');
659 | start_group_state.option_shorthand = 1;
660 | return all_tokens(state);
661 | }
662 | // "(?(" typically marks the start of a condition: (?(condition)yes-pattern|no-pattern)
663 | if (stream.match('(?') && stream.peek() === '(') {
664 | // An explicit callout may be set just before an assertion condition: (?(?C7)(?|&)/);
688 | if (rem) {
689 | expect_end(state, delimiter(rem[1]));
690 | return expect_name(state);
691 | }
692 | stream.next();
693 | return all_tokens(state, 'err erroneous-subroutine');
694 | }
695 |
/**
 * Tokenizer for the "verb" state.
 *
 * If read_expected_end() produces a token, that token is returned as is. Otherwise ')' is registered
 * as the expected end and the stream is handed to the name reader.
 *
 * @param stream  Stream to read from.
 * @param state   Mode state.
 * @returns       The token from read_expected_end() when there is one, else the result of
 *                expect_name().
 */
function handle_verb(stream, state) {
    var token = read_expected_end(stream, state);
    if (!token) {
        expect_end(state, ')');
        token = expect_name(state);
    }
    return token;
}
702 |
/**
 * Update state.extended from an option sequence.
 *
 * Only the "x" (extended mode) flag matters here. Every "x" seen before the first "-" turns the
 * mode on, every "x" seen after it turns the mode off, and the last one wins. state.extended is
 * left alone when the sequence contains no "x".
 *
 * @param state    Mode state whose `extended` property may be overwritten.
 * @param options  Option sequence, read one character at a time.
 */
function update_options(state, options) {
    var turning_on = true;
    var extended; // stays undefined until an "x" flag is met
    Array.prototype.forEach.call(options, function (flag) {
        if (flag === '-') {
            turning_on = false;
        } else if (flag === 'x') {
            extended = turning_on;
        }
    });
    if (extended !== undefined) state.extended = extended;
}
713 |
/**
 * Main tokenizer: consume one token from `stream` and return its style.
 *
 * States that own their syntax (name, condition, condition-subroutine, start-group, backreference,
 * subroutine, verb) are delegated to their dedicated handlers; everything else is recognised here.
 *
 * @param stream  Stream being tokenized; peek/eat/match/next/skipToEnd are called on it.
 * @param state   Mode state; its context stack, group_level and extended flag are read and updated.
 * @returns       The style produced by all_tokens()/push()/pop() or by a delegated handler,
 *                'comment' for an extended-mode comment, or undefined when nothing is left to read.
 */
function tokenBase(stream, state) {
    var rem, ret; // stand for Regular Expression Match and RETurn, respectively.

    // Nothing left to read: no token to emit.
    var ch = stream.peek();
    if (!ch) return;
    var current_state = current(state);
    var group_state;

    if (current_state === 'name') return handle_name(stream, state);
    if (current_state === 'condition') return handle_conditions(stream, state);
    if (current_state === 'condition-subroutine') return handle_condition_subroutines(stream, state);
    if (current_state === 'start-group') return handle_start_group(stream, state);
    if (current_state === 'backreference') return handle_backreference(stream, state);
    if (current_state === 'subroutine') return handle_subroutine(stream, state);
    if (current_state === 'verb') return handle_verb(stream, state);

    // Inside an escaped sequence, everything is literal until "\E":
    if (current_state === 'escaped-sequence') {
        if (stream.match('\\E')) return pop(state, 'escaped-sequence-end');
        consume(stream);
        return all_tokens(state);
    }

    // Escaped characters:
    if (stream.match(/\\./, false)) return all_tokens(state, handle_backslash(stream, state));

    if (stream.match('[', false)) {
        if (current_state !== 'character-class') {
            if (stream.match(posix_named_sets_regex)) {
                return all_tokens(state, 'err posix-outside-class-unsupported');
            }
            // In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and
            // [[:>:]] is used for matching "start of word" and "end of word".
            if (stream.match('[[:<:]]') || stream.match('[[:>:]]')) return all_tokens(state, 'anchor');
            // At this stage, we do have a new character class:
            push(state, 'character-class');
            stream.eat('[');
            stream.eat('^');
            // If a closing square bracket is required as a member of the class, it should be the first data
            // character in the class (after an initial circumflex, if present) or escaped with a backslash.
            // Note: ']' should be on the same line as '[', even in extended mode.
            stream.eat(']');
            return all_tokens(state);
        }
    }

    if (current_state === 'character-class') {
        rem = stream.match(posix_named_sets_regex);
        if (rem) {
            if (rem[1] in posix_named_sets) return all_tokens(state, 'generic-character-type');
            else return all_tokens(state, 'err unknown-posix-class-name');
        }
        if (stream.eat(']')) return pop(state);
        consume(stream);
        return all_tokens(state);
    }

    // Regular comments in extended mode:
    if (state.extended && stream.eat('#')) {
        stream.skipToEnd();
        return 'comment';
    }

    if (stream.eat('{')) {
        // exactly n:
        if (stream.match(/\d+\}/)) return all_tokens(state, 'quantifier');
        // "at least n, no more than m" and "n or more", greedy, possessive or lazy:
        if (stream.match(/\d+,\d*\}[+?]?/)) return all_tokens(state, 'quantifier');
        // Not a quantifier, so the brace is a literal. Return it on its own: falling through would
        // glue it to whatever follows (e.g. "{|" would become a single 'alternation' token).
        return all_tokens(state);
    }

    if (stream.eat('|')) {
        return all_tokens(state, 'alternation');
    }

    if (stream.peek() === '(') {
        if (stream.match(/\(\*(?:UTF(?:8|16|32|)|UCP|NO_AUTO_POSSESS|NO_START_OPT)\)/)) return all_tokens(state, 'option-sequence');
        if (stream.match(/\(\*LIMIT_(?:RECURSION|MATCH)=[0-9]+\)/)) return all_tokens(state, 'option-sequence');
        // Newline convention + what \R matches:
        if (stream.match(/\(\*(?:CR|LF|CRLF|ANYCRLF|ANY|BSR_(?:ANYCRLF|UNICODE))\)/)) return all_tokens(state, 'option-sequence');
        // Backtracking control:
        if (stream.match(/\(\*(?:ACCEPT|FAIL|F|COMMIT|PRUNE|SKIP|THEN)\)/)) return all_tokens(state, 'verb');
        if (stream.match(/\(\*(?:MARK|PRUNE|SKIP|THEN|):/)) return push(state, 'verb', {}, 'verb');
        rem = stream.match(option_sequence_regex);
        if (rem) {
            update_options(state, rem[0]);
            return all_tokens(state, 'option-sequence');
        }
        // (?#....)  comment (not nestable)
        if (stream.match(/\(\?#[^)]*\)/)) return all_tokens(state, 'comment');
        // (?P=name) reference by name (Python)
        if (stream.match(/\(\?P=/, false)) return push(state, 'backreference');
        // (?&name)  call subpattern by name (Perl)
        // (?P>name) call subpattern by name (Python)
        if (stream.match(/\(\?(P>|&)/, false)) return push(state, 'subroutine');
        // (?n)      call subpattern by absolute number
        // (?+n)     call subpattern by relative number
        // (?-n)     call subpattern by relative number
        if (stream.match(/\(\?(\-|\+|)\d+\)/)) return all_tokens(state, 'subroutine');
        // (?R)      recurse whole pattern
        if (stream.match('(?R)')) return all_tokens(state, 'subroutine');
        // Callouts:
        var callout = handle_callout(stream, state);
        if (callout) return all_tokens(state, callout);

        // At this stage, we have a new group:
        ++ state.group_level;
        group_state = 'group' + state.group_level;
        push(state, group_state);

        // (?=...)   positive look ahead
        // (?!...)   negative look ahead
        // (?<=...)  positive look behind
        // (?<!...)  negative look behind
        // A "<" only counts when "=" or "!" follows, so named groups such as (?<name>...) are not
        // caught here.
        if (stream.match(/\(\?<?[=!]/)) return all_tokens(state, 'start-group');
        // (?:...)   non-capturing group
        // (?|...)   non-capturing group with group numbers reset in each alternative
        // (?>...)   atomic, non-capturing group
        if (stream.match(/\(\?[:|>]/)) return all_tokens(state, 'start-group');

        // Any other "(?" opener is left to the "start-group" state:
        if (stream.match('(?', false)) {
            push(state, 'start-group');
            return tokenBase(stream, state);
        }
        // Plain capturing group:
        stream.eat('(');
        return all_tokens(state, 'start-group');
    }

    if (stream.peek() === ')') {
        if (current_state && current_state.match(/^group/)) {
            ret = 'start-group'; // formerly 'end-group' but that used to confuse matchbrackets (see issue #4)
            if (current_context_state(state).leave_closing_parenthesis) ret = '';
            else stream.next();
            -- state.group_level;
            return pop(state, ret);
        }
        stream.next();
        return all_tokens(state, 'err unmatched-closing-parenthesis');
    }

    // Anchors
    if (stream.eat('^') || stream.eat('$')) return all_tokens(state, 'anchor');
    if (stream.eat('.')) return all_tokens(state, 'generic-character-type');
    // Quantifiers: 0 or 1, 0 or more, 1 or more, greedy:
    if (stream.eat('?') || stream.eat('*') || stream.eat('+')) {
        // Handle possessive and lazy variants:
        stream.eat(/[+?]/);
        return all_tokens(state, 'quantifier');
    }
    consume(stream);
    return all_tokens(state);
}
865 |
/**
 * Build the initial mode state.
 *
 * @returns  A new object with empty `context` and `context_state` stacks, `group_level` 0, an empty
 *           `name_value`, and `extended` taken from the mode options.
 */
function startState() {
    var fresh = {};
    fresh.context = [];
    fresh.context_state = [];
    fresh.group_level = 0;
    fresh.name_value = '';
    fresh.extended = options.extended;
    return fresh;
}
875 |
/**
 * Produce an independent copy of a mode state.
 *
 * The `context` array and every `context_state` entry are duplicated. Within an entry only the
 * `expected` array is copied; other values are shared with the original.
 *
 * @param o  Original state.
 * @returns  The copy.
 */
function copyState(o) {
    // Duplicate a single context_state entry.
    function clone_entry(entry) {
        var duplicate = {}, key;
        for (key in entry) {
            if (key === 'expected') duplicate[key] = entry[key].slice();
            else duplicate[key] = entry[key];
        }
        return duplicate;
    }

    var copy = startState();
    var index = 0;
    while (index < o.context_state.length) {
        copy.context_state.push(clone_entry(o.context_state[index]));
        index += 1;
    }
    copy.context = o.context.slice();
    copy.group_level = o.group_level;
    copy.name_value = o.name_value;
    copy.extended = o.extended;
    return copy;
}
890 |
891 | return {
892 | startState: startState,
893 | copyState: copyState,
894 | token: tokenBase,
895 | };
896 | });
897 |
898 | CodeMirror.defineMIME('text/x-regex', 'pcre');
899 | CodeMirror.defineMIME('text/x-pcre-regex', 'pcre');
900 |
901 | });
902 |
--------------------------------------------------------------------------------