├── .gitignore ├── .vscode ├── extensions.json ├── tasks.json └── settings.json ├── .gitattributes ├── .github └── workflows │ ├── build.yml │ └── pr.yml ├── .yo-rc.json ├── package.json ├── table-nonbinary-unicode-properties.html ├── gulpfile.js ├── LICENSE ├── table-unicode-general-category-values.html ├── README.md ├── table-binary-unicode-properties.html ├── table-unicode-script-values.html └── spec.emu /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | coverage 3 | docs -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "rbuckton.ecmarkup-vscode" 4 | ] 5 | } -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | index.html -diff merge=ours 2 | spec.js -diff merge=ours 3 | spec.css -diff merge=ours -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "type": "gulp", 8 | "task": "build", 9 | "group": { 10 | "kind": "build", 11 | "isDefault": true 12 | } 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[markdown]": { 3 | "files.trimTrailingWhitespace": false 4 | }, 5 | "[html]": { 6 | "editor.insertSpaces": true, 7 | "editor.tabSize": 2, 8 | }, 9 | "[ecmarkup]": { 10 | "editor.insertSpaces": true, 11 | "editor.tabSize": 2, 12 | }, 13 | 
"files.associations": { 14 | "*.html": "ecmarkup", 15 | "*.emu": "ecmarkup" 16 | } 17 | } -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Publish Spec to gh-pages 2 | on: 3 | push: 4 | branches: [ main ] 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - run: npm install --legacy-peer-deps 11 | - run: npm run compile 12 | - name: Deploy 13 | uses: JamesIves/github-pages-deploy-action@4.1.4 14 | with: 15 | branch: gh-pages 16 | folder: docs 17 | clean-exclude: | 18 | pr 19 | -------------------------------------------------------------------------------- /.yo-rc.json: -------------------------------------------------------------------------------- 1 | { 2 | "generator-ecmascript-proposal": { 3 | "promptValues": { 4 | "hasChampion": true, 5 | "championName": "Ron Buckton", 6 | "championGithub": "rbuckton", 7 | "spec": "https://rbuckton.github.io/proposal-regexp-modifiers", 8 | "stage": "0", 9 | "sections": [ 10 | "prior-art", 11 | "syntax", 12 | "semantics", 13 | "examples", 14 | "grammar", 15 | "references", 16 | "prior-discussion" 17 | ], 18 | "vscode": true, 19 | "build": "gulp", 20 | "githubWorkflowCI": true, 21 | "githubWorkflowPR": true 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "proposal-regexp-modifiers", 3 | "version": "0.0.0", 4 | "private": true, 5 | "description": "Regular Expression Pattern Modifiers for ECMAScript", 6 | "homepage": "https://github.com/rbuckton/proposal-regexp-modifiers#readme", 7 | "author": { 8 | "name": "Ron Buckton", 9 | "email": "ron.buckton@microsoft.com" 10 | }, 11 | "keywords": [ 12 | "javascript", 13 | "ecmascript" 14 | ], 15 | "scripts": { 16 | "compile": 
"gulp build", 17 | "start": "gulp start" 18 | }, 19 | "license": "SEE LICENSE IN https://tc39.github.io/ecma262/#sec-copyright-and-software-license", 20 | "devDependencies": { 21 | "@tc39/ecma262-biblio": "^2.0.2322", 22 | "del": "^6.0.0", 23 | "ecmarkup": "^12.1.0", 24 | "gulp": "^4.0.2", 25 | "gulp-emu": "^2.1.0", 26 | "gulp-live-server": "0.0.31", 27 | "gulp-rename": "^2.0.0" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /table-nonbinary-unicode-properties.html: -------------------------------------------------------------------------------- 1 | 2 | Non-binary Unicode property aliases and their canonical property names 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
Property name and aliasesCanonical property name
`General_Category``General_Category`
`gc`
`Script``Script`
`sc`
`Script_Extensions``Script_Extensions`
`scx`
32 |
33 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: Publish PR to gh-pages/pr/ 2 | on: 3 | pull_request: 4 | branches: [ main ] 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | if: ${{ github.event.number }} 9 | steps: 10 | - uses: actions/checkout@v2 11 | - run: npm install --legacy-peer-deps 12 | - run: npm run compile 13 | - name: Deploy 14 | uses: JamesIves/github-pages-deploy-action@4.1.4 15 | with: 16 | branch: gh-pages 17 | folder: docs 18 | target-folder: pr/${{ github.event.number }}/ 19 | - id: get-preview-url 20 | name: Get preview url 21 | run: echo "::set-output name=preview-url::https://tc39.es/$(basename $GITHUB_REPOSITORY)/pr/${{ github.event.number }}" 22 | shell: bash 23 | - name: Post Preview Comment 24 | uses: phulsechinmay/rewritable-pr-comment@v0.3.0 25 | with: 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | COMMENT_IDENTIFIER: tc39_pr_preview_comment 28 | message: | 29 | A preview of this PR can be found at ${{ steps.get-preview-url.outputs.preview-url }}. 30 | -------------------------------------------------------------------------------- /gulpfile.js: -------------------------------------------------------------------------------- 1 | const del = require("del"); 2 | const path = require("path"); 3 | const gulp = require("gulp"); 4 | const emu = require("gulp-emu"); 5 | const rename = require("gulp-rename"); 6 | const gls = require("gulp-live-server"); 7 | 8 | gulp.task("clean", () => del("docs/**/*")); 9 | 10 | gulp.task("build", () => gulp 11 | .src(["spec.emu"]) 12 | .pipe(emu({ 13 | log: require("ecmarkup/lib/utils").logVerbose, 14 | warn: err => { 15 | const file = path.resolve(err.file || "spec.emu"); 16 | const message = `Warning: ${file}:${typeof err.line === "number" ? 
`${err.line}:${err.column}:` : ""} ${err.message}`; 17 | require("ecmarkup/lib/utils").logWarning(message); 18 | }, 19 | ecma262Biblio: false, 20 | })) 21 | .pipe(rename("index.html")) 22 | .pipe(gulp.dest("docs"))); 23 | 24 | gulp.task("watch", () => gulp 25 | .watch(["spec.emu"], gulp.task("build"))); 26 | 27 | gulp.task("start", gulp.parallel("watch", () => { 28 | const server = gls.static("docs", 8080); 29 | const promise = server.start(); 30 | (/** @type {import("chokidar").FSWatcher}*/(gulp.watch(["docs/**/*"]))) 31 | .on("change", file => { 32 | server.notify({ path: path.resolve(file) }); 33 | }); 34 | return promise; 35 | })); 36 | 37 | gulp.task("default", gulp.task("build")); -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Ron Buckton, Ecma International 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /table-unicode-general-category-values.html: -------------------------------------------------------------------------------- 1 | 2 | Value aliases and canonical values for the Unicode property `General_Category` 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 
213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 |
Property value and aliasesCanonical property value
`Cased_Letter``Cased_Letter`
`LC`
`Close_Punctuation``Close_Punctuation`
`Pe`
`Connector_Punctuation``Connector_Punctuation`
`Pc`
`Control``Control`
`Cc`
`cntrl`
`Currency_Symbol``Currency_Symbol`
`Sc`
`Dash_Punctuation``Dash_Punctuation`
`Pd`
`Decimal_Number``Decimal_Number`
`Nd`
`digit`
`Enclosing_Mark``Enclosing_Mark`
`Me`
`Final_Punctuation``Final_Punctuation`
`Pf`
`Format``Format`
`Cf`
`Initial_Punctuation``Initial_Punctuation`
`Pi`
`Letter``Letter`
`L`
`Letter_Number``Letter_Number`
`Nl`
`Line_Separator``Line_Separator`
`Zl`
`Lowercase_Letter``Lowercase_Letter`
`Ll`
`Mark``Mark`
`M`
`Combining_Mark`
`Math_Symbol``Math_Symbol`
`Sm`
`Modifier_Letter``Modifier_Letter`
`Lm`
`Modifier_Symbol``Modifier_Symbol`
`Sk`
`Nonspacing_Mark``Nonspacing_Mark`
`Mn`
`Number``Number`
`N`
`Open_Punctuation``Open_Punctuation`
`Ps`
`Other``Other`
`C`
`Other_Letter``Other_Letter`
`Lo`
`Other_Number``Other_Number`
`No`
`Other_Punctuation``Other_Punctuation`
`Po`
`Other_Symbol``Other_Symbol`
`So`
`Paragraph_Separator``Paragraph_Separator`
`Zp`
`Private_Use``Private_Use`
`Co`
`Punctuation``Punctuation`
`P`
`punct`
`Separator``Separator`
`Z`
`Space_Separator``Space_Separator`
`Zs`
`Spacing_Mark``Spacing_Mark`
`Mc`
`Surrogate``Surrogate`
`Cs`
`Symbol``Symbol`
`S`
`Titlecase_Letter``Titlecase_Letter`
`Lt`
`Unassigned``Unassigned`
`Cn`
`Uppercase_Letter``Uppercase_Letter`
`Lu`
289 |
290 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Regular Expression Pattern Modifiers for ECMAScript 3 | 4 | 5 | 6 | 7 | ## Status 8 | 9 | **Stage:** 4 10 | **Champion:** Ron Buckton ([@rbuckton](https://github.com/rbuckton)) 11 | 12 | _For detailed status of this proposal see [TODO](#todo), below._ 13 | 14 | 15 | 16 | ## Authors 17 | 18 | * Ron Buckton ([@rbuckton](https://github.com/rbuckton)) 19 | 20 | 21 | 22 | # Motivations 23 | 24 | One common capability amongst the majority of regular expression engines that 25 | is commonly used by parsers, syntax highlighters, and other tools is the capability to 26 | control a subset of regular expression flags such as: 27 | 28 | - `i` — Ignore Case 29 | - `m` — Multiline 30 | - `s` — Single-line (a.k.a. "dot all") 31 | - `x` — Extended mode (see https://github.com/rbuckton/proposal-regexp-x-mode) 32 | 33 | Modifiers are especially helpful when regular expressions are defined in a context 34 | where executable code cannot be evaluated, such as a JSON configuration file or 35 | TextMate tmLanguage grammar file. 36 | 37 | As part of this proposal, we will investigate each existing (and future-proposed) RegExp flag 38 | to determine whether they are feasible to be used as modifiers. 
39 | 40 | 41 | 42 | 43 | # Prior Art 44 | 45 | * [Perl](https://rbuckton.github.io/regexp-features/engines/perl.html#feature-modifiers) 46 | * [PCRE](https://rbuckton.github.io/regexp-features/engines/pcre.html#feature-modifiers) 47 | * [Boost.Regex](https://rbuckton.github.io/regexp-features/engines/boost.regex.html#feature-modifiers) 48 | * [.NET](https://rbuckton.github.io/regexp-features/engines/dotnet.html#feature-modifiers) 49 | * [Oniguruma](https://rbuckton.github.io/regexp-features/engines/oniguruma.html#feature-modifiers) 50 | * [Hyperscan](https://rbuckton.github.io/regexp-features/engines/hyperscan.html#feature-modifiers) 51 | * [ICU](https://rbuckton.github.io/regexp-features/engines/icu.html#feature-modifiers) 52 | * [Glib/GRegex](https://rbuckton.github.io/regexp-features/engines/glib-gregex.html#feature-modifiers) 53 | 54 | See https://rbuckton.github.io/regexp-features/features/modifiers.html for additional information. 55 | 56 | 57 | 58 | # Syntax 59 | 60 | Modifiers allow you to change the currently active RegExp flags within a subexpression. 61 | 62 | - `(?imsx-imsx:subexpression)` — Sets or unsets (using `-`) the specified RegExp flags for the subexpression. 63 | - ~~`(?imsx-imsx)` — Sets or unsets (using `-`) the specified RegExp flags starting at the current position until the next closing `)` or the end of the pattern.~~ 64 | 65 | > NOTE: Certain flags cannot be modified mid-expression. These currently include `g` (global), `y` (sticky), `u` (unicode), and `d` (hasIndices). 66 | 67 | > NOTE: The actual supported flags will be determined on a case-by-case basis. See [#1](https://github.com/tc39/proposal-regexp-modifiers/issues/1). 68 | 69 | > NOTE: This has no conflicts with existing syntax, as ECMAScript currently produces an error for this syntax in both `u` and non-`u` modes. 70 | 71 | > NOTE: The "self-bounded" form (`(?imsx-imsx:subexpression)`) advanced to Stage 2 on December 15th, 2021. 
72 | 73 | > NOTE: The "unbounded" form (`(?imsx-imsx)`) is no longer being considered as part of this proposal as of December 15th, 2021. 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | # Examples 85 | 86 | ```js 87 | const re1 = /^[a-z](?-i:[a-z])$/i; 88 | re1.test("ab"); // true 89 | re1.test("Ab"); // true 90 | re1.test("aB"); // false 91 | 92 | const re2 = /^(?i:[a-z])[a-z]$/; 93 | re2.test("ab"); // true 94 | re2.test("Ab"); // true 95 | re2.test("aB"); // false 96 | ``` 97 | 98 | 99 | 100 | 101 | 106 | 107 | 108 | 109 | 113 | 114 | 115 | 116 | 121 | 122 | 123 | # History 124 | 125 | - October 27th, 2021 — Proposed for Stage 1 ([slides](https://1drv.ms/p/s!AjgWTO11Fk-Tkfl7c6yR-2P8T4gn0w?e=cvaUL2)) 126 | - Outcome: Advanced to Stage 1 127 | - December 15th, 2021 — Proposed for Stage 2 ([slides](https://1drv.ms/p/s!AjgWTO11Fk-Tkfs3yIyrh3hZ2k6PCQ?e=Yodx4H)) 128 | - Outcome: Advanced to Stage 2 with "self-bounded" form only ("unbounded" form did not advance). 129 | - Stage 2 Reviewers: Richard Gibson, Waldemar Horwat 130 | - June 7th, 2022 — Proposed for Stage 3 ([slides](https://1drv.ms/p/s!AjgWTO11Fk-Tkf5daRnRsxu8BY5Nsg?e=UKVf8W)) 131 | - Outcome: [Advanced to Stage 3](https://github.com/tc39/notes/blob/31edb829db604fdb0255b21238b20898b66cee41/meetings/2022-06/jun-07.md) 132 | 133 | 134 | # TODO 135 | 136 | The following is a high-level list of tasks to progress through each stage of the [TC39 proposal process](https://tc39.github.io/process-document/): 137 | 138 | ### Stage 1 Entrance Criteria 139 | 140 | * [x] Identified a "[champion][Champion]" who will advance the addition. 141 | * [x] [Prose][Prose] outlining the problem or need and the general shape of a solution. 142 | * [x] Illustrative [examples][Examples] of usage. 143 | * [ ] ~~High-level [API][API].~~ 144 | 145 | ### Stage 2 Entrance Criteria 146 | 147 | * [x] [Initial specification text][Specification]. 148 | * [ ] ~~[Transpiler support][Transpiler] (_Optional_)~~. 
149 | 150 | ### Stage 2.7 Entrance Criteria 151 | 152 | * [x] [Complete specification text][Specification]. 153 | * [x] Designated reviewers have [signed off][Stage3ReviewerSignOff] on the current spec text (1 of 2). 154 | * [x] The ECMAScript editor has [signed off][Stage3EditorSignOff] on the current spec text. 155 | 156 | ### Stage 3 Entrance Criteria 157 | 158 | * [x] [Test262](https://github.com/tc39/test262) acceptance tests have been written for mainline usage scenarios and [merged][Test262PullRequest]. 159 | 160 | ### Stage 4 Entrance Criteria 161 | 162 | * [x] Two compatible implementations which pass the acceptance tests: 163 | * [x] [V8][Implementation1] - Shipping in [Chrome 125](https://developer.chrome.com/release-notes/125)/Edge 125 164 | * [x] [SpiderMonkey][Implementation2] - Shipping in Firefox 130 behind a flag, Shipping in Firefox 132b (Nightly) [unflagged](https://bugzilla.mozilla.org/show_bug.cgi?id=1913752) 165 | * [ ] JSC 166 | * [ ] [Engine262](https://github.com/engine262/engine262/pull/229) 167 | * [x] A [pull request][Ecma262PullRequest] has been sent to tc39/ecma262 with the integrated spec text. 168 | * [ ] The ECMAScript editor has signed off on the [pull request][Ecma262PullRequest]. 
169 | * [x] [Kevin Gibbons](https://github.com/tc39/ecma262/pull/3221#pullrequestreview-1735554031) 170 | * [ ] [Michael Ficarra](https://github.com/tc39/ecma262/pull/3221#pullrequestreview-1784954743) 171 | 172 | 173 | 174 | 175 | [Process]: https://tc39.es/process-document/ 176 | [Proposals]: https://github.com/tc39/proposals/ 177 | [Grammarkdown]: http://github.com/rbuckton/grammarkdown#readme 178 | [Champion]: #status 179 | [Prose]: #motivations 180 | [Examples]: #examples 181 | [API]: #api 182 | [Specification]: https://rbuckton.github.io/proposal-regexp-modifiers 183 | 184 | [Transpiler]: #todo 185 | [Stage3ReviewerSignOff]: #todo 186 | [Stage3EditorSignOff]: #todo 187 | [Test262PullRequest]: https://github.com/tc39/test262/pull/3960 188 | [Implementation1]: https://bugs.chromium.org/p/v8/issues/detail?id=12956 189 | [Implementation2]: https://bugzilla.mozilla.org/show_bug.cgi?id=1899813 190 | [Ecma262PullRequest]: https://github.com/tc39/ecma262/pull/3221 191 | -------------------------------------------------------------------------------- /table-binary-unicode-properties.html: -------------------------------------------------------------------------------- 1 | 2 | Binary Unicode property aliases and their canonical property names 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 
| 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 |
Property name and aliasesCanonical property name
`ASCII``ASCII`
`ASCII_Hex_Digit``ASCII_Hex_Digit`
`AHex`
`Alphabetic``Alphabetic`
`Alpha`
`Any``Any`
`Assigned``Assigned`
`Bidi_Control``Bidi_Control`
`Bidi_C`
`Bidi_Mirrored``Bidi_Mirrored`
`Bidi_M`
`Case_Ignorable``Case_Ignorable`
`CI`
`Cased``Cased`
`Changes_When_Casefolded``Changes_When_Casefolded`
`CWCF`
`Changes_When_Casemapped``Changes_When_Casemapped`
`CWCM`
`Changes_When_Lowercased``Changes_When_Lowercased`
`CWL`
`Changes_When_NFKC_Casefolded``Changes_When_NFKC_Casefolded`
`CWKCF`
`Changes_When_Titlecased``Changes_When_Titlecased`
`CWT`
`Changes_When_Uppercased``Changes_When_Uppercased`
`CWU`
`Dash``Dash`
`Default_Ignorable_Code_Point``Default_Ignorable_Code_Point`
`DI`
`Deprecated``Deprecated`
`Dep`
`Diacritic``Diacritic`
`Dia`
`Emoji``Emoji`
`Emoji_Component``Emoji_Component`
`EComp`
`Emoji_Modifier``Emoji_Modifier`
`EMod`
`Emoji_Modifier_Base``Emoji_Modifier_Base`
`EBase`
`Emoji_Presentation``Emoji_Presentation`
`EPres`
`Extended_Pictographic``Extended_Pictographic`
`ExtPict`
`Extender``Extender`
`Ext`
`Grapheme_Base``Grapheme_Base`
`Gr_Base`
`Grapheme_Extend``Grapheme_Extend`
`Gr_Ext`
`Hex_Digit``Hex_Digit`
`Hex`
`IDS_Binary_Operator``IDS_Binary_Operator`
`IDSB`
`IDS_Trinary_Operator``IDS_Trinary_Operator`
`IDST`
`ID_Continue``ID_Continue`
`IDC`
`ID_Start``ID_Start`
`IDS`
`Ideographic``Ideographic`
`Ideo`
`Join_Control``Join_Control`
`Join_C`
`Logical_Order_Exception``Logical_Order_Exception`
`LOE`
`Lowercase``Lowercase`
`Lower`
`Math``Math`
`Noncharacter_Code_Point``Noncharacter_Code_Point`
`NChar`
`Pattern_Syntax``Pattern_Syntax`
`Pat_Syn`
`Pattern_White_Space``Pattern_White_Space`
`Pat_WS`
`Quotation_Mark``Quotation_Mark`
`QMark`
`Radical``Radical`
`Regional_Indicator``Regional_Indicator`
`RI`
`Sentence_Terminal``Sentence_Terminal`
`STerm`
`Soft_Dotted``Soft_Dotted`
`SD`
`Terminal_Punctuation``Terminal_Punctuation`
`Term`
`Unified_Ideograph``Unified_Ideograph`
`UIdeo`
`Uppercase``Uppercase`
`Upper`
`Variation_Selector``Variation_Selector`
`VS`
`White_Space``White_Space`
`space`
`XID_Continue``XID_Continue`
`XIDC`
`XID_Start``XID_Start`
`XIDS`
358 |
359 | -------------------------------------------------------------------------------- /table-unicode-script-values.html: -------------------------------------------------------------------------------- 1 | 2 | Value aliases and canonical values for the Unicode properties `Script` and `Script_Extensions` 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 
| 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 
636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | 789 | 790 | 791 | 792 | 793 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 | 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 851 | 852 | 853 | 854 | 855 | 856 | 857 | 858 | 859 | 860 | 861 | 862 | 863 | 864 | 865 | 866 | 867 | 868 | 869 | 870 | 871 | 872 | 873 | 874 | 875 | 876 | 877 | 878 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 | 888 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | 922 | 923 | 924 | 925 | 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | 950 | 951 | 952 | 953 | 954 | 955 | 956 | 957 | 958 | 959 | 960 | 961 | 962 | 963 | 964 | 965 | 966 | 967 | 968 | 
969 | 970 | 971 | 972 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 980 | 981 | 982 | 983 | 984 | 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | 1014 | 1015 | 1016 | 1017 | 1018 | 1019 | 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | 1026 | 1027 | 1028 | 1029 | 1030 | 1031 | 1032 | 1033 | 1034 | 1035 | 1036 | 1037 | 1038 | 1039 | 1040 | 1041 | 1042 | 1043 | 1044 | 1045 | 1046 | 1047 | 1048 | 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | 1055 | 1056 | 1057 | 1058 | 1059 | 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | 1066 | 1067 | 1068 | 1069 | 1070 | 1071 | 1072 | 1073 | 1074 | 1075 | 1076 | 1077 | 1078 | 1079 | 1080 | 1081 | 1082 | 1083 | 1084 | 1085 | 1086 | 1087 | 1088 | 1089 | 1090 |
Property value and aliasesCanonical property value
`Adlam``Adlam`
`Adlm`
`Ahom``Ahom`
`Anatolian_Hieroglyphs``Anatolian_Hieroglyphs`
`Hluw`
`Arabic``Arabic`
`Arab`
`Armenian``Armenian`
`Armn`
`Avestan``Avestan`
`Avst`
`Balinese``Balinese`
`Bali`
`Bamum``Bamum`
`Bamu`
`Bassa_Vah``Bassa_Vah`
`Bass`
`Batak``Batak`
`Batk`
`Bengali``Bengali`
`Beng`
`Bhaiksuki``Bhaiksuki`
`Bhks`
`Bopomofo``Bopomofo`
`Bopo`
`Brahmi``Brahmi`
`Brah`
`Braille``Braille`
`Brai`
`Buginese``Buginese`
`Bugi`
`Buhid``Buhid`
`Buhd`
`Canadian_Aboriginal``Canadian_Aboriginal`
`Cans`
`Carian``Carian`
`Cari`
`Caucasian_Albanian``Caucasian_Albanian`
`Aghb`
`Chakma``Chakma`
`Cakm`
`Cham``Cham`
`Chorasmian``Chorasmian`
`Chrs`
`Cherokee``Cherokee`
`Cher`
`Common``Common`
`Zyyy`
`Coptic``Coptic`
`Copt`
`Qaac`
`Cuneiform``Cuneiform`
`Xsux`
`Cypriot``Cypriot`
`Cprt`
`Cyrillic``Cyrillic`
`Cyrl`
`Deseret``Deseret`
`Dsrt`
`Devanagari``Devanagari`
`Deva`
`Dives_Akuru``Dives_Akuru`
`Diak`
`Dogra``Dogra`
`Dogr`
`Duployan``Duployan`
`Dupl`
`Egyptian_Hieroglyphs``Egyptian_Hieroglyphs`
`Egyp`
`Elbasan``Elbasan`
`Elba`
`Elymaic``Elymaic`
`Elym`
`Ethiopic``Ethiopic`
`Ethi`
`Georgian``Georgian`
`Geor`
`Glagolitic``Glagolitic`
`Glag`
`Gothic``Gothic`
`Goth`
`Grantha``Grantha`
`Gran`
`Greek``Greek`
`Grek`
`Gujarati``Gujarati`
`Gujr`
`Gunjala_Gondi``Gunjala_Gondi`
`Gong`
`Gurmukhi``Gurmukhi`
`Guru`
`Han``Han`
`Hani`
`Hangul``Hangul`
`Hang`
`Hanifi_Rohingya``Hanifi_Rohingya`
`Rohg`
`Hanunoo``Hanunoo`
`Hano`
`Hatran``Hatran`
`Hatr`
`Hebrew``Hebrew`
`Hebr`
`Hiragana``Hiragana`
`Hira`
`Imperial_Aramaic``Imperial_Aramaic`
`Armi`
`Inherited``Inherited`
`Zinh`
`Qaai`
`Inscriptional_Pahlavi``Inscriptional_Pahlavi`
`Phli`
`Inscriptional_Parthian``Inscriptional_Parthian`
`Prti`
`Javanese``Javanese`
`Java`
`Kaithi``Kaithi`
`Kthi`
`Kannada``Kannada`
`Knda`
`Katakana``Katakana`
`Kana`
`Kayah_Li``Kayah_Li`
`Kali`
`Kharoshthi``Kharoshthi`
`Khar`
`Khitan_Small_Script``Khitan_Small_Script`
`Kits`
`Khmer``Khmer`
`Khmr`
`Khojki``Khojki`
`Khoj`
`Khudawadi``Khudawadi`
`Sind`
`Lao``Lao`
`Laoo`
`Latin``Latin`
`Latn`
`Lepcha``Lepcha`
`Lepc`
`Limbu``Limbu`
`Limb`
`Linear_A``Linear_A`
`Lina`
`Linear_B``Linear_B`
`Linb`
`Lisu``Lisu`
`Lycian``Lycian`
`Lyci`
`Lydian``Lydian`
`Lydi`
`Mahajani``Mahajani`
`Mahj`
`Makasar``Makasar`
`Maka`
`Malayalam``Malayalam`
`Mlym`
`Mandaic``Mandaic`
`Mand`
`Manichaean``Manichaean`
`Mani`
`Marchen``Marchen`
`Marc`
`Medefaidrin``Medefaidrin`
`Medf`
`Masaram_Gondi``Masaram_Gondi`
`Gonm`
`Meetei_Mayek``Meetei_Mayek`
`Mtei`
`Mende_Kikakui``Mende_Kikakui`
`Mend`
`Meroitic_Cursive``Meroitic_Cursive`
`Merc`
`Meroitic_Hieroglyphs``Meroitic_Hieroglyphs`
`Mero`
`Miao``Miao`
`Plrd`
`Modi``Modi`
`Mongolian``Mongolian`
`Mong`
`Mro``Mro`
`Mroo`
`Multani``Multani`
`Mult`
`Myanmar``Myanmar`
`Mymr`
`Nabataean``Nabataean`
`Nbat`
`Nandinagari``Nandinagari`
`Nand`
`New_Tai_Lue``New_Tai_Lue`
`Talu`
`Newa``Newa`
`Nko``Nko`
`Nkoo`
`Nushu``Nushu`
`Nshu`
`Nyiakeng_Puachue_Hmong``Nyiakeng_Puachue_Hmong`
`Hmnp`
`Ogham``Ogham`
`Ogam`
`Ol_Chiki``Ol_Chiki`
`Olck`
`Old_Hungarian``Old_Hungarian`
`Hung`
`Old_Italic``Old_Italic`
`Ital`
`Old_North_Arabian``Old_North_Arabian`
`Narb`
`Old_Permic``Old_Permic`
`Perm`
`Old_Persian``Old_Persian`
`Xpeo`
`Old_Sogdian``Old_Sogdian`
`Sogo`
`Old_South_Arabian``Old_South_Arabian`
`Sarb`
`Old_Turkic``Old_Turkic`
`Orkh`
`Oriya``Oriya`
`Orya`
`Osage``Osage`
`Osge`
`Osmanya``Osmanya`
`Osma`
`Pahawh_Hmong``Pahawh_Hmong`
`Hmng`
`Palmyrene``Palmyrene`
`Palm`
`Pau_Cin_Hau``Pau_Cin_Hau`
`Pauc`
`Phags_Pa``Phags_Pa`
`Phag`
`Phoenician``Phoenician`
`Phnx`
`Psalter_Pahlavi``Psalter_Pahlavi`
`Phlp`
`Rejang``Rejang`
`Rjng`
`Runic``Runic`
`Runr`
`Samaritan``Samaritan`
`Samr`
`Saurashtra``Saurashtra`
`Saur`
`Sharada``Sharada`
`Shrd`
`Shavian``Shavian`
`Shaw`
`Siddham``Siddham`
`Sidd`
`SignWriting``SignWriting`
`Sgnw`
`Sinhala``Sinhala`
`Sinh`
`Sogdian``Sogdian`
`Sogd`
`Sora_Sompeng``Sora_Sompeng`
`Sora`
`Soyombo``Soyombo`
`Soyo`
`Sundanese``Sundanese`
`Sund`
`Syloti_Nagri``Syloti_Nagri`
`Sylo`
`Syriac``Syriac`
`Syrc`
`Tagalog``Tagalog`
`Tglg`
`Tagbanwa``Tagbanwa`
`Tagb`
`Tai_Le``Tai_Le`
`Tale`
`Tai_Tham``Tai_Tham`
`Lana`
`Tai_Viet``Tai_Viet`
`Tavt`
`Takri``Takri`
`Takr`
`Tamil``Tamil`
`Taml`
`Tangut``Tangut`
`Tang`
`Telugu``Telugu`
`Telu`
`Thaana``Thaana`
`Thaa`
`Thai``Thai`
`Tibetan``Tibetan`
`Tibt`
`Tifinagh``Tifinagh`
`Tfng`
`Tirhuta``Tirhuta`
`Tirh`
`Ugaritic``Ugaritic`
`Ugar`
`Vai``Vai`
`Vaii`
`Wancho``Wancho`
`Wcho`
`Warang_Citi``Warang_Citi`
`Wara`
`Yezidi``Yezidi`
`Yezi`
`Yi``Yi`
`Yiii`
`Zanabazar_Square``Zanabazar_Square`
`Zanb`
1091 |
1092 | -------------------------------------------------------------------------------- /spec.emu: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
   6 | title: Regular Expression Pattern Modifiers for ECMAScript
   7 | stage: 3
   8 | contributors: Ron Buckton, Ecma International
   9 | 
10 | 11 | 12 | 13 | 14 |

Introduction

15 |

See the proposal repository for background material and discussion.

16 |
17 | 18 | 19 |

Text Processing

20 | 21 | 22 |

RegExp (Regular Expression) Objects

23 |

A RegExp object contains a regular expression and the associated flags.

24 | 25 |

The form and functionality of regular expressions is modelled after the regular expression facility in the Perl 5 programming language.

26 |
27 | 28 | 29 |

Patterns

30 |

The RegExp constructor applies the following grammar to the input pattern String. An error occurs if the grammar cannot interpret the String as an expansion of |Pattern|.

31 |

Syntax

32 | 33 | Pattern[UnicodeMode, N] :: 34 | Disjunction[?UnicodeMode, ?N] 35 | 36 | Disjunction[UnicodeMode, N] :: 37 | Alternative[?UnicodeMode, ?N] 38 | Alternative[?UnicodeMode, ?N] `|` Disjunction[?UnicodeMode, ?N] 39 | 40 | Alternative[UnicodeMode, N] :: 41 | [empty] 42 | Alternative[?UnicodeMode, ?N] Term[?UnicodeMode, ?N] 43 | 44 | Term[UnicodeMode, N] :: 45 | Assertion[?UnicodeMode, ?N] 46 | Atom[?UnicodeMode, ?N] 47 | Atom[?UnicodeMode, ?N] Quantifier 48 | 49 | Assertion[UnicodeMode, N] :: 50 | `^` 51 | `$` 52 | `\` `b` 53 | `\` `B` 54 | `(` `?` `=` Disjunction[?UnicodeMode, ?N] `)` 55 | `(` `?` `!` Disjunction[?UnicodeMode, ?N] `)` 56 | `(` `?` `<=` Disjunction[?UnicodeMode, ?N] `)` 57 | `(` `?` `<!` Disjunction[?UnicodeMode, ?N] `)` 58 | 59 | Quantifier :: 60 | QuantifierPrefix 61 | QuantifierPrefix `?` 62 | 63 | QuantifierPrefix :: 64 | `*` 65 | `+` 66 | `?` 67 | `{` DecimalDigits[~Sep] `}` 68 | `{` DecimalDigits[~Sep] `,` `}` 69 | `{` DecimalDigits[~Sep] `,` DecimalDigits[~Sep] `}` 70 | 71 | Atom[UnicodeMode, N] :: 72 | PatternCharacter 73 | `.` 74 | `\` AtomEscape[?UnicodeMode, ?N] 75 | CharacterClass[?UnicodeMode] 76 | `(` GroupSpecifier[?UnicodeMode] Disjunction[?UnicodeMode, ?N] `)` 77 | `(` `?` `:` Disjunction[?UnicodeMode, ?N] `)` 78 | `(` `?` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ?N] `)` 79 | `(` `?` RegularExpressionFlags `-` RegularExpressionFlags `:` Disjunction[?UnicodeMode, ?N] `)` 80 | 81 | SyntaxCharacter :: one of 82 | `^` `$` `\` `.` `*` `+` `?` `(` `)` `[` `]` `{` `}` `|` 83 | 84 | PatternCharacter :: 85 | SourceCharacter but not SyntaxCharacter 86 | 87 | AtomEscape[UnicodeMode, N] :: 88 | DecimalEscape 89 | CharacterClassEscape[?UnicodeMode] 90 | CharacterEscape[?UnicodeMode] 91 | [+N] `k` GroupName[?UnicodeMode] 92 | 93 | CharacterEscape[UnicodeMode] :: 94 | ControlEscape 95 | `c` ControlLetter 96 | `0` [lookahead ∉ DecimalDigit] 97 | HexEscapeSequence 98 | RegExpUnicodeEscapeSequence[?UnicodeMode] 99 | 
IdentityEscape[?UnicodeMode] 100 | 101 | ControlEscape :: one of 102 | `f` `n` `r` `t` `v` 103 | 104 | ControlLetter :: one of 105 | `a` `b` `c` `d` `e` `f` `g` `h` `i` `j` `k` `l` `m` `n` `o` `p` `q` `r` `s` `t` `u` `v` `w` `x` `y` `z` 106 | `A` `B` `C` `D` `E` `F` `G` `H` `I` `J` `K` `L` `M` `N` `O` `P` `Q` `R` `S` `T` `U` `V` `W` `X` `Y` `Z` 107 | 108 | GroupSpecifier[UnicodeMode] :: 109 | [empty] 110 | `?` GroupName[?UnicodeMode] 111 | 112 | GroupName[UnicodeMode] :: 113 | `<` RegExpIdentifierName[?UnicodeMode] `>` 114 | 115 | RegExpIdentifierName[UnicodeMode] :: 116 | RegExpIdentifierStart[?UnicodeMode] 117 | RegExpIdentifierName[?UnicodeMode] RegExpIdentifierPart[?UnicodeMode] 118 | 119 | RegExpIdentifierStart[UnicodeMode] :: 120 | IdentifierStartChar 121 | `\` RegExpUnicodeEscapeSequence[+UnicodeMode] 122 | [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate 123 | 124 | RegExpIdentifierPart[UnicodeMode] :: 125 | IdentifierPartChar 126 | `\` RegExpUnicodeEscapeSequence[+UnicodeMode] 127 | [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate 128 | 129 | RegExpUnicodeEscapeSequence[UnicodeMode] :: 130 | [+UnicodeMode] `u` HexLeadSurrogate `\u` HexTrailSurrogate 131 | [+UnicodeMode] `u` HexLeadSurrogate 132 | [+UnicodeMode] `u` HexTrailSurrogate 133 | [+UnicodeMode] `u` HexNonSurrogate 134 | [~UnicodeMode] `u` Hex4Digits 135 | [+UnicodeMode] `u{` CodePoint `}` 136 | 137 | UnicodeLeadSurrogate :: 138 | > any Unicode code point in the inclusive range 0xD800 to 0xDBFF 139 | 140 | UnicodeTrailSurrogate :: 141 | > any Unicode code point in the inclusive range 0xDC00 to 0xDFFF 142 | 143 |

Each `\\u` |HexTrailSurrogate| for which the choice of associated `u` |HexLeadSurrogate| is ambiguous shall be associated with the nearest possible `u` |HexLeadSurrogate| that would otherwise have no corresponding `\\u` |HexTrailSurrogate|.

144 | 145 | HexLeadSurrogate :: 146 | Hex4Digits [> but only if the MV of |Hex4Digits| is in the inclusive range 0xD800 to 0xDBFF] 147 | 148 | HexTrailSurrogate :: 149 | Hex4Digits [> but only if the MV of |Hex4Digits| is in the inclusive range 0xDC00 to 0xDFFF] 150 | 151 | HexNonSurrogate :: 152 | Hex4Digits [> but only if the MV of |Hex4Digits| is not in the inclusive range 0xD800 to 0xDFFF] 153 | 154 | IdentityEscape[UnicodeMode] :: 155 | [+UnicodeMode] SyntaxCharacter 156 | [+UnicodeMode] `/` 157 | [~UnicodeMode] SourceCharacter but not UnicodeIDContinue 158 | 159 | DecimalEscape :: 160 | NonZeroDigit DecimalDigits[~Sep]? [lookahead ∉ DecimalDigit] 161 | 162 | CharacterClassEscape[UnicodeMode] :: 163 | `d` 164 | `D` 165 | `s` 166 | `S` 167 | `w` 168 | `W` 169 | [+UnicodeMode] `p{` UnicodePropertyValueExpression `}` 170 | [+UnicodeMode] `P{` UnicodePropertyValueExpression `}` 171 | 172 | UnicodePropertyValueExpression :: 173 | UnicodePropertyName `=` UnicodePropertyValue 174 | LoneUnicodePropertyNameOrValue 175 | 176 | UnicodePropertyName :: 177 | UnicodePropertyNameCharacters 178 | 179 | UnicodePropertyNameCharacters :: 180 | UnicodePropertyNameCharacter UnicodePropertyNameCharacters? 181 | 182 | UnicodePropertyValue :: 183 | UnicodePropertyValueCharacters 184 | 185 | LoneUnicodePropertyNameOrValue :: 186 | UnicodePropertyValueCharacters 187 | 188 | UnicodePropertyValueCharacters :: 189 | UnicodePropertyValueCharacter UnicodePropertyValueCharacters? 
190 | 191 | UnicodePropertyValueCharacter :: 192 | UnicodePropertyNameCharacter 193 | DecimalDigit 194 | 195 | UnicodePropertyNameCharacter :: 196 | ControlLetter 197 | `_` 198 | 199 | CharacterClass[UnicodeMode] :: 200 | `[` [lookahead != `^`] ClassRanges[?UnicodeMode] `]` 201 | `[` `^` ClassRanges[?UnicodeMode] `]` 202 | 203 | ClassRanges[UnicodeMode] :: 204 | [empty] 205 | NonemptyClassRanges[?UnicodeMode] 206 | 207 | NonemptyClassRanges[UnicodeMode] :: 208 | ClassAtom[?UnicodeMode] 209 | ClassAtom[?UnicodeMode] NonemptyClassRangesNoDash[?UnicodeMode] 210 | ClassAtom[?UnicodeMode] `-` ClassAtom[?UnicodeMode] ClassRanges[?UnicodeMode] 211 | 212 | NonemptyClassRangesNoDash[UnicodeMode] :: 213 | ClassAtom[?UnicodeMode] 214 | ClassAtomNoDash[?UnicodeMode] NonemptyClassRangesNoDash[?UnicodeMode] 215 | ClassAtomNoDash[?UnicodeMode] `-` ClassAtom[?UnicodeMode] ClassRanges[?UnicodeMode] 216 | 217 | ClassAtom[UnicodeMode] :: 218 | `-` 219 | ClassAtomNoDash[?UnicodeMode] 220 | 221 | ClassAtomNoDash[UnicodeMode] :: 222 | SourceCharacter but not one of `\` or `]` or `-` 223 | `\` ClassEscape[?UnicodeMode] 224 | 225 | ClassEscape[UnicodeMode] :: 226 | `b` 227 | [+UnicodeMode] `-` 228 | CharacterClassEscape[?UnicodeMode] 229 | CharacterEscape[?UnicodeMode] 230 | 231 | 232 | 233 |

A number of productions in this section are given alternative definitions in section B.1.2 (Regular Expressions Patterns).

234 |
235 |
236 | 237 | 238 |

Pattern Semantics

239 | 240 |

Notation

241 |

The descriptions below use the following aliases:

242 |
    243 |
  • 244 | _Input_ is a List whose elements are the characters of the String being matched by the regular expression pattern. Each character is either a code unit or a code point, depending upon the kind of pattern involved. The notation _Input_[_n_] means the _n_th character of _Input_, where _n_ can range between 0 (inclusive) and _InputLength_ (exclusive). 245 |
  • 246 |
  • 247 | _InputLength_ is the number of characters in _Input_. 248 |
  • 249 |
  • 250 | _NcapturingParens_ is the total number of left-capturing parentheses (i.e. the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes) in the pattern. A left-capturing parenthesis is any `(` pattern character that is matched by the `(` terminal of the Atom :: `(` GroupSpecifier Disjunction `)` production. 251 |
  • 252 |
  • 253 | _DotAll_ is *true* if the RegExp object's [[OriginalFlags]] internal slot contains *"s"* and otherwise is *false*. 254 |
  • 255 |
  • 256 | _IgnoreCase_ is *true* if the RegExp object's [[OriginalFlags]] internal slot contains *"i"* and otherwise is *false*. 257 |
  • 258 |
  • 259 | _Multiline_ is *true* if the RegExp object's [[OriginalFlags]] internal slot contains *"m"* and otherwise is *false*. 260 |
  • 261 |
  • 262 | _Unicode_ is *true* if the RegExp object's [[OriginalFlags]] internal slot contains *"u"* and otherwise is *false*. 263 |
  • 264 |
  • 265 | _WordCharacters_ is the mathematical set that is the union of all sixty-three characters in *"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_"* (letters, numbers, and U+005F (LOW LINE) in the Unicode Basic Latin block) and all characters _c_ for which _c_ is not in that set but Canonicalize(_c_) is. _WordCharacters_ cannot contain more than sixty-three characters unless _Unicode_ and _IgnoreCase_ are both *true*. 266 |
  • 267 |
268 |

Furthermore, the descriptions below use the following internal data structures:

269 |
    270 |
  • 271 | A CharSet is a mathematical set of characters. When the _Unicode_ flag is *true*, “all characters” means the CharSet containing all code point values; otherwise “all characters” means the CharSet containing all code unit values. 272 |
  • 273 |
  • 274 | A State is an ordered pair (_endIndex_, _captures_) where _endIndex_ is an integer and _captures_ is a List of _NcapturingParens_ values. States are used to represent partial match states in the regular expression matching algorithms. The _endIndex_ is one plus the index of the last input character matched so far by the pattern, while _captures_ holds the results of capturing parentheses. The _n_th element of _captures_ is either a List of characters that represents the value obtained by the _n_th set of capturing parentheses or *undefined* if the _n_th set of capturing parentheses hasn't been reached yet. Due to backtracking, many States may be in use at any time during the matching process. 275 |
  • 276 |
  • 277 | A MatchResult is either a State or the special token ~failure~ that indicates that the match failed. 278 |
  • 279 |
  • 280 | A Continuation is an Abstract Closure that takes one State argument and returns a MatchResult result. The Continuation attempts to match the remaining portion (specified by the closure's captured values) of the pattern against _Input_, starting at the intermediate state given by its State argument. If the match succeeds, the Continuation returns the final State that it reached; if the match fails, the Continuation returns ~failure~. 281 |
  • 282 |
  • 283 | A Matcher is an Abstract Closure that takes two arguments—a State and a Continuation—and returns a MatchResult result. A Matcher attempts to match a middle subpattern (specified by the closure's captured values) of the pattern against _Input_, starting at the intermediate state given by its State argument. The Continuation argument should be a closure that matches the rest of the pattern. After matching the subpattern of a pattern to obtain a new State, the Matcher then calls Continuation on that new State to test if the rest of the pattern can match as well. If it can, the Matcher returns the State returned by Continuation; if not, the Matcher may try different choices at its choice points, repeatedly calling Continuation until it either succeeds or all possibilities have been exhausted. 284 |
  • 285 |
286 |
287 | 288 | 289 | 290 |

Static Semantics: Early Errors

291 | Atom :: `(` `?` RegularExpressionFlags `:` Disjunction `)` 292 |
    293 |
  • It is a Syntax Error if the source text matched by |RegularExpressionFlags| contains any code point other than `i`, `m`, or `s`, or if it contains the same code point more than once. 294 |
295 | Atom :: `(` `?` RegularExpressionFlags `-` RegularExpressionFlags `:` Disjunction `)` 296 |
    297 |
  • It is a Syntax Error if the source text matched by the first |RegularExpressionFlags| and the source text matched by the second |RegularExpressionFlags| are both empty. 298 |
  • It is a Syntax Error if the source text matched by the first |RegularExpressionFlags| contains any code point other than `i`, `m`, or `s`, or contains the same code point more than once. 299 |
  • It is a Syntax Error if the source text matched by the second |RegularExpressionFlags| contains any code point other than `i`, `m`, or `s`, or contains the same code point more than once. 300 |
  • It is a Syntax Error if any code point in the source text matched by the first |RegularExpressionFlags| is also contained in the source text matched by the second |RegularExpressionFlags|. 301 |
302 |
303 | 304 | 305 |

Modifiers Records

306 |

A Modifiers Record is a Record value used to encapsulate information about the regular expression flags that apply to a subpattern.

307 |

Modifiers Records have the fields listed in the following table.

308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 |
Field Name | Value | Meaning
[[DotAll]] | a Boolean | Indicates whether the *"s"* flag is currently enabled.
[[IgnoreCase]] | a Boolean | Indicates whether the *"i"* flag is currently enabled.
[[Multiline]] | a Boolean | Indicates whether the *"m"* flag is currently enabled.
331 |
332 |
333 |
334 | 335 | 336 |

Runtime Semantics: CompilePattern

337 |
338 |
description
339 |
It returns an Abstract Closure that takes a String and a non-negative integer and returns a MatchResult.
340 |
341 | Pattern :: Disjunction 342 | 343 | 1. Let _modifiers_ be the Modifiers Record { [[DotAll]]: _DotAll_, [[IgnoreCase]]: _IgnoreCase_, [[Multiline]]: _Multiline_ }. 344 | 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~forward~ and _modifiers_. 345 | 1. Return a new Abstract Closure with parameters (_str_, _index_) that captures _m_ and performs the following steps when called: 346 | 1. Assert: Type(_str_) is String. 347 | 1. Assert: _index_ is a non-negative integer which is ≤ the length of _str_. 348 | 1. If _Unicode_ is *true*, let _Input_ be StringToCodePoints(_str_). Otherwise, let _Input_ be a List whose elements are the code units that are the elements of _str_. _Input_ will be used throughout the algorithms in Pattern Semantics. Each element of _Input_ is considered to be a character. 349 | 1. Let _InputLength_ be the number of characters contained in _Input_. This alias will be used throughout the algorithms in Pattern Semantics. 350 | 1. Let _listIndex_ be the index into _Input_ of the character that was obtained from element _index_ of _str_. 351 | 1. Let _c_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: 352 | 1. Assert: _y_ is a State. 353 | 1. Return _y_. 354 | 1. Let _cap_ be a List of _NcapturingParens_ *undefined* values, indexed 1 through _NcapturingParens_. 355 | 1. Let _x_ be the State (_listIndex_, _cap_). 356 | 1. Return _m_(_x_, _c_). 357 | 358 | 359 |

A Pattern compiles to an Abstract Closure value. RegExpBuiltinExec can then apply this procedure to a String and an offset within the String to determine whether the pattern would match starting at exactly that offset within the String, and, if it does match, what the values of the capturing parentheses would be. The algorithms in Pattern Semantics are designed so that compiling a pattern may throw a *SyntaxError* exception; on the other hand, once the pattern is successfully compiled, applying the resulting Abstract Closure to find a match in a String cannot throw an exception (except for any implementation-defined exceptions that can occur anywhere such as out-of-memory).

360 |
361 |
362 | 363 | 364 |

365 | Runtime Semantics: CompileSubpattern ( 366 | _direction_: ~forward~ or ~backward~, 367 | _modifiers_: a Modifiers Record, 368 | ): a Matcher 369 |

370 |
371 |
372 | 373 |

This section is amended in B.1.2.4.

374 |
375 | 376 | 377 | Disjunction :: Alternative `|` Disjunction 378 | 379 | 1. Let _m1_ be CompileSubpattern of |Alternative| with arguments _direction_ and _modifiers_. 380 | 1. Let _m2_ be CompileSubpattern of |Disjunction| with arguments _direction_ and _modifiers_. 381 | 1. Return a new Matcher with parameters (_x_, _c_) that captures _m1_ and _m2_ and performs the following steps when called: 382 | 1. Assert: _x_ is a State. 383 | 1. Assert: _c_ is a Continuation. 384 | 1. Let _r_ be _m1_(_x_, _c_). 385 | 1. If _r_ is not ~failure~, return _r_. 386 | 1. Return _m2_(_x_, _c_). 387 | 388 | 389 |

The `|` regular expression operator separates two alternatives. The pattern first tries to match the left |Alternative| (followed by the sequel of the regular expression); if it fails, it tries to match the right |Disjunction| (followed by the sequel of the regular expression). If the left |Alternative|, the right |Disjunction|, and the sequel all have choice points, all choices in the sequel are tried before moving on to the next choice in the left |Alternative|. If choices in the left |Alternative| are exhausted, the right |Disjunction| is tried instead of the left |Alternative|. Any capturing parentheses inside a portion of the pattern skipped by `|` produce *undefined* values instead of Strings. Thus, for example,

390 |
/a|ab/.exec("abc")
391 |

returns the result *"a"* and not *"ab"*. Moreover,

392 |
/((a)|(ab))((c)|(bc))/.exec("abc")
393 |

returns the array

394 |
["abc", "a", "a", undefined, "bc", undefined, "bc"]
395 |

and not

396 |
["abc", "ab", undefined, "ab", "c", "c", undefined]
397 |

The order in which the two alternatives are tried is independent of the value of _direction_.

398 |
399 | 400 | 401 | Alternative :: [empty] 402 | 403 | 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: 404 | 1. Assert: _x_ is a State. 405 | 1. Assert: _c_ is a Continuation. 406 | 1. Return _c_(_x_). 407 | 408 | Alternative :: Alternative Term 409 | 410 | 1. Let _m1_ be CompileSubpattern of |Alternative| with arguments _direction_ and _modifiers_. 411 | 1. Let _m2_ be CompileSubpattern of |Term| with arguments _direction_ and _modifiers_. 412 | 1. If _direction_ is ~forward~, then 413 | 1. Let _m_ be a new Matcher with parameters (_x_, _c_) that captures _m1_ and _m2_ and performs the following steps when called: 414 | 1. Assert: _x_ is a State. 415 | 1. Assert: _c_ is a Continuation. 416 | 1. Let _d_ be a new Continuation with parameters (_y_) that captures _c_ and _m2_ and performs the following steps when called: 417 | 1. Assert: _y_ is a State. 418 | 1. Return _m2_(_y_, _c_). 419 | 1. Return _m1_(_x_, _d_). 420 | 1. Else, 421 | 1. Assert: _direction_ is ~backward~. 422 | 1. Let _m_ be a new Matcher with parameters (_x_, _c_) that captures _m1_ and _m2_ and performs the following steps when called: 423 | 1. Assert: _x_ is a State. 424 | 1. Assert: _c_ is a Continuation. 425 | 1. Let _d_ be a new Continuation with parameters (_y_) that captures _c_ and _m1_ and performs the following steps when called: 426 | 1. Assert: _y_ is a State. 427 | 1. Return _m1_(_y_, _c_). 428 | 1. Return _m2_(_x_, _d_). 429 | 430 | 431 |

Consecutive |Term|s try to simultaneously match consecutive portions of _Input_. When _direction_ is ~forward~, if the left |Alternative|, the right |Term|, and the sequel of the regular expression all have choice points, all choices in the sequel are tried before moving on to the next choice in the right |Term|, and all choices in the right |Term| are tried before moving on to the next choice in the left |Alternative|. When _direction_ is ~backward~, the evaluation order of |Alternative| and |Term| is reversed.

432 |
433 | 434 | 435 | Term :: Assertion 436 | 437 | 1. Return CompileAssertion of |Assertion| with argument _modifiers_. 438 | 439 | 440 |

The resulting Matcher is independent of _direction_.

441 |
442 | Term :: Atom 443 | 444 | 1. Return CompileAtom of |Atom| with arguments _direction_ and _modifiers_. 445 | 446 | Term :: Atom Quantifier 447 | 448 | 1. Let _m_ be CompileAtom of |Atom| with arguments _direction_ and _modifiers_. 449 | 1. Let _q_ be CompileQuantifier of |Quantifier|. 450 | 1. Assert: _q_.[[Min]] ≤ _q_.[[Max]]. 451 | 1. Let _parenIndex_ be the number of left-capturing parentheses in the entire regular expression that occur to the left of this |Term|. This is the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes prior to or enclosing this |Term|. 452 | 1. Let _parenCount_ be the number of left-capturing parentheses in |Atom|. This is the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes enclosed by |Atom|. 453 | 1. Return a new Matcher with parameters (_x_, _c_) that captures _m_, _q_, _parenIndex_, and _parenCount_ and performs the following steps when called: 454 | 1. Assert: _x_ is a State. 455 | 1. Assert: _c_ is a Continuation. 456 | 1. Return RepeatMatcher(_m_, _q_.[[Min]], _q_.[[Max]], _q_.[[Greedy]], _x_, _c_, _parenIndex_, _parenCount_). 457 | 458 |
459 | 460 | 461 |

462 | Runtime Semantics: CompileAssertion ( 463 | _modifiers_: a Modifiers Record, 464 | ): a Matcher 465 |

466 |
467 |
468 | 469 |

This section is amended in B.1.2.5.

470 |
471 | Assertion :: `^` 472 | 473 | 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: 474 | 1. Assert: _x_ is a State. 475 | 1. Assert: _c_ is a Continuation. 476 | 1. Let _e_ be _x_'s _endIndex_. 477 | 1. If _e_ = 0, or if _modifiers_.[[Multiline]] is *true* and the character _Input_[_e_ - 1] is one of |LineTerminator|, then 478 | 1. Return _c_(_x_). 479 | 1. Return ~failure~. 480 | 481 | 482 |

Even when the `y` flag is used with a pattern, `^` always matches only at the beginning of _Input_, or (if _modifiers_.[[Multiline]] is *true*) at the beginning of a line.

483 |
484 | Assertion :: `$` 485 | 486 | 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: 487 | 1. Assert: _x_ is a State. 488 | 1. Assert: _c_ is a Continuation. 489 | 1. Let _e_ be _x_'s _endIndex_. 490 | 1. If _e_ = _InputLength_, or if _modifiers_.[[Multiline]] is *true* and the character _Input_[_e_] is one of |LineTerminator|, then 491 | 1. Return _c_(_x_). 492 | 1. Return ~failure~. 493 | 494 | Assertion :: `\` `b` 495 | 496 | 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: 497 | 1. Assert: _x_ is a State. 498 | 1. Assert: _c_ is a Continuation. 499 | 1. Let _e_ be _x_'s _endIndex_. 500 | 1. Let _a_ be IsWordChar(_e_ - 1, _modifiers_). 501 | 1. Let _b_ be IsWordChar(_e_, _modifiers_). 502 | 1. If _a_ is *true* and _b_ is *false*, or if _a_ is *false* and _b_ is *true*, return _c_(_x_). 503 | 1. Return ~failure~. 504 | 505 | Assertion :: `\` `B` 506 | 507 | 1. Return a new Matcher with parameters (_x_, _c_) that captures nothing and performs the following steps when called: 508 | 1. Assert: _x_ is a State. 509 | 1. Assert: _c_ is a Continuation. 510 | 1. Let _e_ be _x_'s _endIndex_. 511 | 1. Let _a_ be IsWordChar(_e_ - 1, _modifiers_). 512 | 1. Let _b_ be IsWordChar(_e_, _modifiers_). 513 | 1. If _a_ is *true* and _b_ is *true*, or if _a_ is *false* and _b_ is *false*, return _c_(_x_). 514 | 1. Return ~failure~. 515 | 516 | Assertion :: `(` `?` `=` Disjunction `)` 517 | 518 | 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~forward~ and _modifiers_. 519 | 1. Return a new Matcher with parameters (_x_, _c_) that captures _m_ and performs the following steps when called: 520 | 1. Assert: _x_ is a State. 521 | 1. Assert: _c_ is a Continuation. 522 | 1. Let _d_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: 523 | 1. Assert: _y_ is a State. 
524 | 1. Return _y_. 525 | 1. Let _r_ be _m_(_x_, _d_). 526 | 1. If _r_ is ~failure~, return ~failure~. 527 | 1. Let _y_ be _r_'s State. 528 | 1. Let _cap_ be _y_'s _captures_ List. 529 | 1. Let _xe_ be _x_'s _endIndex_. 530 | 1. Let _z_ be the State (_xe_, _cap_). 531 | 1. Return _c_(_z_). 532 | 533 | Assertion :: `(` `?` `!` Disjunction `)` 534 | 535 | 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~forward~ and _modifiers_. 536 | 1. Return a new Matcher with parameters (_x_, _c_) that captures _m_ and performs the following steps when called: 537 | 1. Assert: _x_ is a State. 538 | 1. Assert: _c_ is a Continuation. 539 | 1. Let _d_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: 540 | 1. Assert: _y_ is a State. 541 | 1. Return _y_. 542 | 1. Let _r_ be _m_(_x_, _d_). 543 | 1. If _r_ is not ~failure~, return ~failure~. 544 | 1. Return _c_(_x_). 545 | 546 | Assertion :: `(` `?` `<=` Disjunction `)` 547 | 548 | 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~backward~ and _modifiers_. 549 | 1. Return a new Matcher with parameters (_x_, _c_) that captures _m_ and performs the following steps when called: 550 | 1. Assert: _x_ is a State. 551 | 1. Assert: _c_ is a Continuation. 552 | 1. Let _d_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: 553 | 1. Assert: _y_ is a State. 554 | 1. Return _y_. 555 | 1. Let _r_ be _m_(_x_, _d_). 556 | 1. If _r_ is ~failure~, return ~failure~. 557 | 1. Let _y_ be _r_'s State. 558 | 1. Let _cap_ be _y_'s _captures_ List. 559 | 1. Let _xe_ be _x_'s _endIndex_. 560 | 1. Let _z_ be the State (_xe_, _cap_). 561 | 1. Return _c_(_z_). 562 | 563 | Assertion :: `(` `?` `<!` Disjunction `)` 564 | 565 | 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments ~backward~ and _modifiers_. 566 | 1. 
Return a new Matcher with parameters (_x_, _c_) that captures _m_ and performs the following steps when called: 567 | 1. Assert: _x_ is a State. 568 | 1. Assert: _c_ is a Continuation. 569 | 1. Let _d_ be a new Continuation with parameters (_y_) that captures nothing and performs the following steps when called: 570 | 1. Assert: _y_ is a State. 571 | 1. Return _y_. 572 | 1. Let _r_ be _m_(_x_, _d_). 573 | 1. If _r_ is not ~failure~, return ~failure~. 574 | 1. Return _c_(_x_). 575 | 576 | 577 | 578 |

579 | IsWordChar ( 580 | _e_: an integer, 581 | _modifiers_: a Modifiers Record, 582 | ) 583 |

584 |
585 |
586 | 587 | 1. If _e_ = -1 or _e_ is _InputLength_, return *false*. 588 | 1. Let _c_ be the character _Input_[_e_]. 589 | 1. Let _wordCharacters_ be GetWordCharacters(_modifiers_). 590 | 1. If _c_ is in _wordCharacters_, return *true*. 591 | 1. Return *false*. 592 | 593 |
594 |
595 | 596 | 597 |

598 | Runtime Semantics: CompileAtom ( 599 | _direction_: ~forward~ or ~backward~, 600 | _modifiers_: a Modifiers Record, 601 | ): a Matcher 602 |

603 |
604 |
605 | 606 |

This section is amended in B.1.2.6.

607 |
608 | 609 | 610 | Atom :: PatternCharacter 611 | 612 | 1. Let _ch_ be the character matched by |PatternCharacter|. 613 | 1. Let _A_ be a one-element CharSet containing the character _ch_. 614 | 1. Return CharacterSetMatcher(_A_, *false*, _direction_, _modifiers_). 615 | 616 | Atom :: `.` 617 | 618 | 1. Let _A_ be the CharSet of all characters. 619 | 1. If _modifiers_.[[DotAll]] is not *true*, then 620 | 1. Remove from _A_ all characters corresponding to a code point on the right-hand side of the |LineTerminator| production. 621 | 1. Return CharacterSetMatcher(_A_, *false*, _direction_, _modifiers_). 622 | 623 | Atom :: CharacterClass 624 | 625 | 1. Let _cc_ be CompileCharacterClass of |CharacterClass|. 626 | 1. Return CharacterSetMatcher(_cc_.[[CharSet]], _cc_.[[Invert]], _direction_, _modifiers_). 627 | 628 | Atom :: `(` GroupSpecifier Disjunction `)` 629 | 630 | 1. Let _m_ be CompileSubpattern of |Disjunction| with arguments _direction_ and _modifiers_. 631 | 1. Let _parenIndex_ be the number of left-capturing parentheses in the entire regular expression that occur to the left of this |Atom|. This is the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes prior to or enclosing this |Atom|. 632 | 1. Return a new Matcher with parameters (_x_, _c_) that captures _direction_, _m_, and _parenIndex_ and performs the following steps when called: 633 | 1. Assert: _x_ is a State. 634 | 1. Assert: _c_ is a Continuation. 635 | 1. Let _d_ be a new Continuation with parameters (_y_) that captures _x_, _c_, _direction_, and _parenIndex_ and performs the following steps when called: 636 | 1. Assert: _y_ is a State. 637 | 1. Let _cap_ be a copy of _y_'s _captures_ List. 638 | 1. Let _xe_ be _x_'s _endIndex_. 639 | 1. Let _ye_ be _y_'s _endIndex_. 640 | 1. If _direction_ is ~forward~, then 641 | 1. Assert: _xe_ ≤ _ye_. 642 | 1. Let _s_ be a List whose elements are the characters of _Input_ at indices _xe_ (inclusive) through _ye_ (exclusive). 643 | 1. 
Else, 644 | 1. Assert: _direction_ is ~backward~. 645 | 1. Assert: _ye_ ≤ _xe_. 646 | 1. Let _s_ be a List whose elements are the characters of _Input_ at indices _ye_ (inclusive) through _xe_ (exclusive). 647 | 1. Set _cap_[_parenIndex_ + 1] to _s_. 648 | 1. Let _z_ be the State (_ye_, _cap_). 649 | 1. Return _c_(_z_). 650 | 1. Return _m_(_x_, _d_). 651 | 652 | 653 | 654 | Atom :: `(` `?` `:` Disjunction `)` 655 | 656 | 1. Return CompileSubpattern of |Disjunction| with arguments _direction_ and _modifiers_. 657 | 658 | 659 | 660 | 661 | Atom :: `(` `?` RegularExpressionFlags `:` Disjunction `)` 662 | 663 | 1. Let _addModifiers_ be the source text matched by |RegularExpressionFlags|. 664 | 1. Let _removeModifiers_ be the empty String. 665 | 1. Let _newModifiers_ be UpdateModifiers(_modifiers_, CodePointsToString(_addModifiers_), _removeModifiers_). 666 | 1. Return CompileSubpattern of |Disjunction| with arguments _direction_ and _newModifiers_. 667 | 668 | Atom :: `(` `?` RegularExpressionFlags `-` RegularExpressionFlags `:` Disjunction `)` 669 | 670 | 1. Let _addModifiers_ be the source text matched by the first |RegularExpressionFlags|. 671 | 1. Let _removeModifiers_ be the source text matched by the second |RegularExpressionFlags|. 672 | 1. Let _newModifiers_ be UpdateModifiers(_modifiers_, CodePointsToString(_addModifiers_), CodePointsToString(_removeModifiers_)). 673 | 1. Return CompileSubpattern of |Disjunction| with arguments _direction_ and _newModifiers_. 674 | 675 | 676 | 677 | 678 | AtomEscape :: DecimalEscape 679 | 680 | 1. Let _n_ be the CapturingGroupNumber of |DecimalEscape|. 681 | 1. Assert: _n_ ≤ _NcapturingParens_. 682 | 1. Return BackreferenceMatcher(_n_, _direction_, _modifiers_). 683 | 684 | 685 |

An escape sequence of the form `\\` followed by a non-zero decimal number _n_ matches the result of the _n_th set of capturing parentheses (<emu-xref href="#sec-pattern-notation"></emu-xref>). It is an error if the regular expression has fewer than _n_ capturing parentheses. If the regular expression has _n_ or more capturing parentheses but the _n_th one is *undefined* because it has not captured anything, then the backreference always succeeds.

686 |
687 | AtomEscape :: CharacterEscape 688 | 689 | 1. Let _cv_ be the CharacterValue of |CharacterEscape|. 690 | 1. Let _ch_ be the character whose character value is _cv_. 691 | 1. Let _A_ be a one-element CharSet containing the character _ch_. 692 | 1. Return CharacterSetMatcher(_A_, *false*, _direction_, _modifiers_). 693 | 694 | AtomEscape :: CharacterClassEscape 695 | 696 | 1. Let _A_ be CompileToCharSet of |CharacterClassEscape|. 697 | 1. Return CharacterSetMatcher(_A_, *false*, _direction_, _modifiers_). 698 | 699 | AtomEscape :: `k` GroupName 700 | 701 | 1. Search the enclosing |Pattern| for an instance of a |GroupSpecifier| containing a |RegExpIdentifierName| which has a CapturingGroupName equal to the CapturingGroupName of the |RegExpIdentifierName| contained in |GroupName|. 702 | 1. Assert: A unique such |GroupSpecifier| is found. 703 | 1. Let _parenIndex_ be the number of left-capturing parentheses in the entire regular expression that occur to the left of the located |GroupSpecifier|. This is the total number of Atom :: `(` GroupSpecifier Disjunction `)` Parse Nodes prior to or enclosing the located |GroupSpecifier|, including its immediately enclosing |Atom|. 704 | 1. Return BackreferenceMatcher(_parenIndex_, _direction_, _modifiers_). 705 | 706 | 707 | 708 |

709 | CharacterSetMatcher ( 710 | _A_: a CharSet, 711 | _invert_: a Boolean, 712 | _direction_: ~forward~ or ~backward~, 713 | _modifiers_: a Modifiers Record, 714 | ): a Matcher 715 |

716 |
717 |
718 | 719 | 1. Return a new Matcher with parameters (_x_, _c_) that captures _A_, _invert_, _direction_, and _modifiers_ and performs the following steps when called: 720 | 1. Assert: _x_ is a State. 721 | 1. Assert: _c_ is a Continuation. 722 | 1. Let _e_ be _x_'s _endIndex_. 723 | 1. If _direction_ is ~forward~, let _f_ be _e_ + 1. 724 | 1. Else, let _f_ be _e_ - 1. 725 | 1. If _f_ < 0 or _f_ > _InputLength_, return ~failure~. 726 | 1. Let _index_ be min(_e_, _f_). 727 | 1. Let _ch_ be the character _Input_[_index_]. 728 | 1. Let _cc_ be Canonicalize(_ch_, _modifiers_). 729 | 1. If there exists a member _a_ of _A_ such that Canonicalize(_a_, _modifiers_) is _cc_, let _found_ be *true*. Otherwise, let _found_ be *false*. 730 | 1. If _invert_ is *false* and _found_ is *false*, return ~failure~. 731 | 1. If _invert_ is *true* and _found_ is *true*, return ~failure~. 732 | 1. Let _cap_ be _x_'s _captures_ List. 733 | 1. Let _y_ be the State (_f_, _cap_). 734 | 1. Return _c_(_y_). 735 | 736 |
737 | 738 | 739 |

740 | BackreferenceMatcher ( 741 | _n_: a positive integer, 742 | _direction_: ~forward~ or ~backward~, 743 | _modifiers_: a Modifiers Record, 744 | ): a Matcher 745 |

746 |
747 |
748 | 749 | 1. Assert: _n_ ≥ 1. 750 | 1. Return a new Matcher with parameters (_x_, _c_) that captures _n_, _direction_, and _modifiers_ and performs the following steps when called: 751 | 1. Assert: _x_ is a State. 752 | 1. Assert: _c_ is a Continuation. 753 | 1. Let _cap_ be _x_'s _captures_ List. 754 | 1. Let _s_ be _cap_[_n_]. 755 | 1. If _s_ is *undefined*, return _c_(_x_). 756 | 1. Let _e_ be _x_'s _endIndex_. 757 | 1. Let _len_ be the number of elements in _s_. 758 | 1. If _direction_ is ~forward~, let _f_ be _e_ + _len_. 759 | 1. Else, let _f_ be _e_ - _len_. 760 | 1. If _f_ < 0 or _f_ > _InputLength_, return ~failure~. 761 | 1. Let _g_ be min(_e_, _f_). 762 | 1. If there exists an integer _i_ between 0 (inclusive) and _len_ (exclusive) such that Canonicalize(_s_[_i_], _modifiers_) is not the same character value as Canonicalize(_Input_[_g_ + _i_], _modifiers_), return ~failure~. 763 | 1. Let _y_ be the State (_f_, _cap_). 764 | 1. Return _c_(_y_). 765 | 766 |
767 | 768 | 769 |

770 | Canonicalize ( 771 | _ch_: a character, 772 | _modifiers_: a Modifiers Record, 773 | ): a character 774 |

775 |
776 |
777 | 778 | 1. If _Unicode_ is *true* and _modifiers_.[[IgnoreCase]] is *true*, then 779 | 1. If the file CaseFolding.txt of the Unicode Character Database provides a simple or common case folding mapping for _ch_, return the result of applying that mapping to _ch_. 780 | 1. Return _ch_. 781 | 1. If _modifiers_.[[IgnoreCase]] is *false*, return _ch_. 782 | 1. Assert: _ch_ is a UTF-16 code unit. 783 | 1. Let _cp_ be the code point whose numeric value is that of _ch_. 784 | 1. Let _u_ be the result of toUppercase(« _cp_ »), according to the Unicode Default Case Conversion algorithm. 785 | 1. Let _uStr_ be CodePointsToString(_u_). 786 | 1. If _uStr_ does not consist of a single code unit, return _ch_. 787 | 1. Let _cu_ be _uStr_'s single code unit element. 788 | 1. If the numeric value of _ch_ ≥ 128 and the numeric value of _cu_ < 128, return _ch_. 789 | 1. Return _cu_. 790 | 791 | 792 |

Parentheses of the form `(` |Disjunction| `)` serve both to group the components of the |Disjunction| pattern together and to save the result of the match. The result can be used either in a backreference (`\\` followed by a non-zero decimal number), referenced in a replace String, or returned as part of an array from the regular expression matching Abstract Closure. To inhibit the capturing behaviour of parentheses, use the form `(?:` |Disjunction| `)` instead.

793 |
794 | 795 |

The form `(?=` |Disjunction| `)` specifies a zero-width positive lookahead. In order for it to succeed, the pattern inside |Disjunction| must match at the current position, but the current position is not advanced before matching the sequel. If |Disjunction| can match at the current position in several ways, only the first one is tried. Unlike other regular expression operators, there is no backtracking into a `(?=` form (this unusual behaviour is inherited from Perl). This only matters when the |Disjunction| contains capturing parentheses and the sequel of the pattern contains backreferences to those captures.

796 |

For example,

797 |
/(?=(a+))/.exec("baaabac")
798 |

matches the empty String immediately after the first `b` and therefore returns the array:

799 |
["", "aaa"]
800 |

To illustrate the lack of backtracking into the lookahead, consider:

801 |
/(?=(a+))a*b\1/.exec("baaabac")
802 |

This expression returns

803 |
["aba", "a"]
804 |

and not:

805 |
["aaaba", "a"]
806 |
807 | 808 |

The form `(?!` |Disjunction| `)` specifies a zero-width negative lookahead. In order for it to succeed, the pattern inside |Disjunction| must fail to match at the current position. The current position is not advanced before matching the sequel. |Disjunction| can contain capturing parentheses, but backreferences to them only make sense from within |Disjunction| itself. Backreferences to these capturing parentheses from elsewhere in the pattern always return *undefined* because the negative lookahead must fail for the pattern to succeed. For example,

809 |
/(.*?)a(?!(a+)b\2c)\2(.*)/.exec("baaabaac")
810 |

looks for an `a` not immediately followed by some positive number n of `a`'s, a `b`, another n `a`'s (specified by the first `\\2`) and a `c`. The second `\\2` is outside the negative lookahead, so it matches against *undefined* and therefore always succeeds. The whole expression returns the array:

811 |
["baaabaac", "ba", undefined, "abaac"]
812 |
813 | 814 |

In case-insignificant matches when _Unicode_ is *true*, all characters are implicitly case-folded using the simple mapping provided by the Unicode standard immediately before they are compared. The simple mapping always maps to a single code point, so it does not map, for example, `ß` (U+00DF) to `SS`. It may however map a code point outside the Basic Latin range to a character within, for example, `ſ` (U+017F) to `s`. Such characters are not mapped if _Unicode_ is *false*. This prevents Unicode code points such as U+017F and U+212A from matching regular expressions such as `/[a-z]/i`, but they will match `/[a-z]/ui`.

815 |
816 |
817 |
818 | 819 | 820 |

Runtime Semantics: CompileToCharSet ( ): a CharSet

821 |
822 |
823 | 824 |

This section is amended in <emu-xref href="#sec-compiletocharset-annexb"></emu-xref>.

825 |
826 | 827 | 828 | ClassRanges :: [empty] 829 | 830 | 1. Return the empty CharSet. 831 | 832 | 833 | 834 | NonemptyClassRanges :: ClassAtom NonemptyClassRangesNoDash 835 | 836 | 1. Let _A_ be CompileToCharSet of |ClassAtom|. 837 | 1. Let _B_ be CompileToCharSet of |NonemptyClassRangesNoDash|. 838 | 1. Return the union of CharSets _A_ and _B_. 839 | 840 | NonemptyClassRanges :: ClassAtom `-` ClassAtom ClassRanges 841 | 842 | 1. Let _A_ be CompileToCharSet of the first |ClassAtom|. 843 | 1. Let _B_ be CompileToCharSet of the second |ClassAtom|. 844 | 1. Let _C_ be CompileToCharSet of |ClassRanges|. 845 | 1. Let _D_ be CharacterRange(_A_, _B_). 846 | 1. Return the union of _D_ and _C_. 847 | 848 | 849 | 850 | NonemptyClassRangesNoDash :: ClassAtomNoDash NonemptyClassRangesNoDash 851 | 852 | 1. Let _A_ be CompileToCharSet of |ClassAtomNoDash|. 853 | 1. Let _B_ be CompileToCharSet of |NonemptyClassRangesNoDash|. 854 | 1. Return the union of CharSets _A_ and _B_. 855 | 856 | NonemptyClassRangesNoDash :: ClassAtomNoDash `-` ClassAtom ClassRanges 857 | 858 | 1. Let _A_ be CompileToCharSet of |ClassAtomNoDash|. 859 | 1. Let _B_ be CompileToCharSet of |ClassAtom|. 860 | 1. Let _C_ be CompileToCharSet of |ClassRanges|. 861 | 1. Let _D_ be CharacterRange(_A_, _B_). 862 | 1. Return the union of _D_ and _C_. 863 | 864 | 865 |

|ClassRanges| can expand into a single |ClassAtom| and/or ranges of two |ClassAtom| separated by dashes. In the latter case the |ClassRanges| includes all characters between the first |ClassAtom| and the second |ClassAtom|, inclusive; an error occurs if either |ClassAtom| does not represent a single character (for example, if one is \w) or if the first |ClassAtom|'s character value is greater than the second |ClassAtom|'s character value.

866 |
867 | 868 |

Even if the pattern ignores case, the case of the two ends of a range is significant in determining which characters belong to the range. Thus, for example, the pattern `/[E-F]/i` matches only the letters `E`, `F`, `e`, and `f`, while the pattern `/[E-f]/i` matches all upper and lower-case letters in the Unicode Basic Latin block as well as the symbols `[`, `\\`, `]`, `^`, `_`, and <code>`</code>.

869 |
870 | 871 |

A `-` character can be treated literally or it can denote a range. It is treated literally if it is the first or last character of |ClassRanges|, the beginning or end limit of a range specification, or immediately follows a range specification.

872 |
873 | 874 | 875 | ClassAtom :: `-` 876 | 877 | 1. Return the CharSet containing the single character `-` U+002D (HYPHEN-MINUS). 878 | 879 | 880 | 881 | ClassAtomNoDash :: SourceCharacter but not one of `\` or `]` or `-` 882 | 883 | 1. Return the CharSet containing the character matched by |SourceCharacter|. 884 | 885 | 886 | 887 | 888 | ClassEscape :: `b` 889 | 890 | ClassEscape :: `-` 891 | 892 | ClassEscape :: CharacterEscape 893 | 894 | 895 | 1. Let _cv_ be the CharacterValue of this |ClassEscape|. 896 | 1. Let _c_ be the character whose character value is _cv_. 897 | 1. Return the CharSet containing the single character _c_. 898 | 899 | 900 |

A |ClassAtom| can use any of the escape sequences that are allowed in the rest of the regular expression except for `\\b`, `\\B`, and backreferences. Inside a |CharacterClass|, `\\b` means the backspace character, while `\\B` and backreferences raise errors. Using a backreference inside a |ClassAtom| causes an error.

901 |
902 | 903 | 904 | CharacterClassEscape :: `d` 905 | 906 | 1. Return the ten-element CharSet containing the characters `0` through `9` inclusive. 907 | 908 | CharacterClassEscape :: `D` 909 | 910 | 1. Return the CharSet containing all characters not in the CharSet returned by CharacterClassEscape :: `d` . 911 | 912 | CharacterClassEscape :: `s` 913 | 914 | 1. Return the CharSet containing all characters corresponding to a code point on the right-hand side of the |WhiteSpace| or |LineTerminator| productions. 915 | 916 | CharacterClassEscape :: `S` 917 | 918 | 1. Return the CharSet containing all characters not in the CharSet returned by CharacterClassEscape :: `s` . 919 | 920 | CharacterClassEscape :: `w` 921 | 922 | 1. Return GetWordCharacters(_modifiers_). 923 | 924 | CharacterClassEscape :: `W` 925 | 926 | 1. Return the CharSet containing all characters not in the CharSet returned by CharacterClassEscape :: `w` . 927 | 928 | CharacterClassEscape :: `p{` UnicodePropertyValueExpression `}` 929 | 930 | 1. Return the CharSet containing all Unicode code points included in CompileToCharSet of |UnicodePropertyValueExpression|. 931 | 932 | CharacterClassEscape :: `P{` UnicodePropertyValueExpression `}` 933 | 934 | 1. Return the CharSet containing all Unicode code points not included in CompileToCharSet of |UnicodePropertyValueExpression|. 935 | 936 | UnicodePropertyValueExpression :: UnicodePropertyName `=` UnicodePropertyValue 937 | 938 | 1. Let _ps_ be SourceText of |UnicodePropertyName|. 939 | 1. Let _p_ be UnicodeMatchProperty(_ps_). 940 | 1. Assert: _p_ is a Unicode property name or property alias listed in the “Property name and aliases” column of <emu-xref href="#table-nonbinary-unicode-properties"></emu-xref>. 941 | 1. Let _vs_ be SourceText of |UnicodePropertyValue|. 942 | 1. Let _v_ be UnicodeMatchPropertyValue(_p_, _vs_). 943 | 1. Return the CharSet containing all Unicode code points whose character database definition includes the property _p_ with value _v_. 
944 | 945 | UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue 946 | 947 | 1. Let _s_ be SourceText of |LoneUnicodePropertyNameOrValue|. 948 | 1. If UnicodeMatchPropertyValue(`General_Category`, _s_) is identical to a List of Unicode code points that is the name of a Unicode general category or general category alias listed in the “Property value and aliases” column of <emu-xref href="#table-unicode-general-category-values"></emu-xref>, then 949 | 1. Return the CharSet containing all Unicode code points whose character database definition includes the property “General_Category” with value _s_. 950 | 1. Let _p_ be UnicodeMatchProperty(_s_). 951 | 1. Assert: _p_ is a binary Unicode property or binary property alias listed in the “Property name and aliases” column of <emu-xref href="#table-binary-unicode-properties"></emu-xref>. 952 | 1. Return the CharSet containing all Unicode code points whose character database definition includes the property _p_ with value “True”. 953 | 954 |
955 | 956 | 957 | 958 |

959 | 960 | GetWordCharacters ( 961 | _modifiers_: a Modifiers Record, 962 | ): a CharSet 963 | 964 |

965 |
966 |
967 | 968 | 1. Let _wordCharacters_ be the mathematical set that is the union of all sixty-three characters in *"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_"* (letters, numbers, and U+005F (LOW LINE) in the Unicode Basic Latin block) and all characters _c_ for which _c_ is not in that set but Canonicalize(_c_, _modifiers_) is. 969 | 1. Return _wordCharacters_. 970 | 971 | 972 | _wordCharacters_ cannot contain more than sixty-three characters unless _Unicode_ and _modifiers_.[[IgnoreCase]] are both *true*. 973 | 974 |
975 | 976 | 977 |

978 | 979 | UpdateModifiers ( 980 | _modifiers_: a Modifiers Record, 981 | _add_: a String, 982 | _remove_: a String, 983 | ): a Modifiers Record 984 | 985 |

986 |
987 |
988 | 989 | 1. Let _dotAll_ be _modifiers_.[[DotAll]]. 990 | 1. Let _ignoreCase_ be _modifiers_.[[IgnoreCase]]. 991 | 1. Let _multiline_ be _modifiers_.[[Multiline]]. 992 | 1. If _add_ contains *"s"*, set _dotAll_ to *true*. 993 | 1. If _add_ contains *"i"*, set _ignoreCase_ to *true*. 994 | 1. If _add_ contains *"m"*, set _multiline_ to *true*. 995 | 1. If _remove_ contains *"s"*, set _dotAll_ to *false*. 996 | 1. If _remove_ contains *"i"*, set _ignoreCase_ to *false*. 997 | 1. If _remove_ contains *"m"*, set _multiline_ to *false*. 998 | 1. Return the Modifiers Record { [[DotAll]]: _dotAll_, [[IgnoreCase]]: _ignoreCase_, [[Multiline]]: _multiline_ }. 999 | 1000 |
1001 |
1002 |
1003 |
1004 |
1005 | 1006 | 1007 |

Additional ECMAScript Features for Web Browsers

1008 | 1009 | 1010 |

Additional Syntax

1011 | 1012 | 1013 |

Regular Expressions Patterns

1014 |

The syntax of <emu-xref href="#sec-patterns"></emu-xref> is modified and extended as follows. These changes introduce ambiguities that are broken by the ordering of grammar productions and by contextual information. When parsing using the following grammar, each alternative is considered only if previous production alternatives do not match.

1015 |

This alternative pattern grammar and semantics only changes the syntax and semantics of BMP patterns. The following grammar extensions include productions parameterized with the [UnicodeMode] parameter. However, none of these extensions change the syntax of Unicode patterns recognized when parsing with the [UnicodeMode] parameter present on the goal symbol.

1016 |

Syntax

1017 | 1018 | Term[UnicodeMode, N] :: 1019 | [+UnicodeMode] Assertion[+UnicodeMode, ?N] 1020 | [+UnicodeMode] Atom[+UnicodeMode, ?N] Quantifier 1021 | [+UnicodeMode] Atom[+UnicodeMode, ?N] 1022 | [~UnicodeMode] QuantifiableAssertion[?N] Quantifier 1023 | [~UnicodeMode] Assertion[~UnicodeMode, ?N] 1024 | [~UnicodeMode] ExtendedAtom[?N] Quantifier 1025 | [~UnicodeMode] ExtendedAtom[?N] 1026 | 1027 | Assertion[UnicodeMode, N] :: 1028 | `^` 1029 | `$` 1030 | `\` `b` 1031 | `\` `B` 1032 | [+UnicodeMode] `(` `?` `=` Disjunction[+UnicodeMode, ?N] `)` 1033 | [+UnicodeMode] `(` `?` `!` Disjunction[+UnicodeMode, ?N] `)` 1034 | [~UnicodeMode] QuantifiableAssertion[?N] 1035 | `(` `?` `<=` Disjunction[?UnicodeMode, ?N] `)` 1036 | `(` `?` `<!` Disjunction[?UnicodeMode, ?N] `)` 1037 | 1038 | QuantifiableAssertion[N] :: 1039 | `(` `?` `=` Disjunction[~UnicodeMode, ?N] `)` 1040 | `(` `?` `!` Disjunction[~UnicodeMode, ?N] `)` 1041 | 1042 | ExtendedAtom[N] :: 1043 | `.` 1044 | `\` AtomEscape[~UnicodeMode, ?N] 1045 | `\` [lookahead == `c`] 1046 | CharacterClass[~UnicodeMode] 1047 | `(` Disjunction[~UnicodeMode, ?N] `)` 1048 | `(` `?` `:` Disjunction[~UnicodeMode, ?N] `)` 1049 | `(` `?` RegularExpressionFlags `:` Disjunction[~UnicodeMode, ?N] `)` 1050 | `(` `?` RegularExpressionFlags `-` RegularExpressionFlags `:` Disjunction[~UnicodeMode, ?N] `)` 1051 | InvalidBracedQuantifier 1052 | ExtendedPatternCharacter 1053 | 1054 | InvalidBracedQuantifier :: 1055 | `{` DecimalDigits[~Sep] `}` 1056 | `{` DecimalDigits[~Sep] `,` `}` 1057 | `{` DecimalDigits[~Sep] `,` DecimalDigits[~Sep] `}` 1058 | 1059 | ExtendedPatternCharacter :: 1060 | SourceCharacter but not one of `^` `$` `\` `.` `*` `+` `?` `(` `)` `[` `|` 1061 | 1062 | AtomEscape[UnicodeMode, N] :: 1063 | [+UnicodeMode] DecimalEscape 1064 | [~UnicodeMode] DecimalEscape [> but only if the CapturingGroupNumber of |DecimalEscape| is ≤ _NcapturingParens_] 1065 | CharacterClassEscape[?UnicodeMode] 1066 | CharacterEscape[?UnicodeMode, ?N] 1067 | 
[+N] `k` GroupName[?UnicodeMode] 1068 | 1069 | CharacterEscape[UnicodeMode, N] :: 1070 | ControlEscape 1071 | `c` ControlLetter 1072 | `0` [lookahead ∉ DecimalDigit] 1073 | HexEscapeSequence 1074 | RegExpUnicodeEscapeSequence[?UnicodeMode] 1075 | [~UnicodeMode] LegacyOctalEscapeSequence 1076 | IdentityEscape[?UnicodeMode, ?N] 1077 | 1078 | IdentityEscape[UnicodeMode, N] :: 1079 | [+UnicodeMode] SyntaxCharacter 1080 | [+UnicodeMode] `/` 1081 | [~UnicodeMode] SourceCharacterIdentityEscape[?N] 1082 | 1083 | SourceCharacterIdentityEscape[N] :: 1084 | [~N] SourceCharacter but not `c` 1085 | [+N] SourceCharacter but not one of `c` or `k` 1086 | 1087 | ClassAtomNoDash[UnicodeMode, N] :: 1088 | SourceCharacter but not one of `\` or `]` or `-` 1089 | `\` ClassEscape[?UnicodeMode, ?N] 1090 | `\` [lookahead == `c`] 1091 | 1092 | ClassEscape[UnicodeMode, N] :: 1093 | `b` 1094 | [+UnicodeMode] `-` 1095 | [~UnicodeMode] `c` ClassControlLetter 1096 | CharacterClassEscape[?UnicodeMode] 1097 | CharacterEscape[?UnicodeMode, ?N] 1098 | 1099 | ClassControlLetter :: 1100 | DecimalDigit 1101 | `_` 1102 | 1103 | 1104 |

When the same left-hand side occurs with both [+UnicodeMode] and [\~UnicodeMode] guards, it is to control the disambiguation priority.

1105 |
1106 |
1107 |
1108 |
--------------------------------------------------------------------------------