├── .nvmrc ├── .travis-github-deploy-key.enc ├── .editorconfig ├── .gitignore ├── package.json ├── table-nonbinary-unicode-properties.html ├── .travis.yml ├── table-unicode-general-category-values.html ├── table-binary-unicode-properties.html ├── spec.html ├── README.md └── table-unicode-script-values.html /.nvmrc: -------------------------------------------------------------------------------- 1 | 6 2 | -------------------------------------------------------------------------------- /.travis-github-deploy-key.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tc39/proposal-regexp-unicode-property-escapes/HEAD/.travis-github-deploy-key.enc -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = tab 6 | end_of_line = lf 7 | insert_final_newline = true 8 | trim_trailing_whitespace = true 9 | 10 | [{README.md,package.json,spec.html,.travis.yml}] 11 | indent_style = space 12 | indent_size = 2 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | package-lock.json 2 | 3 | dist 4 | 5 | # Installed npm modules 6 | node_modules 7 | 8 | # Folder view configuration files 9 | .DS_Store 10 | Desktop.ini 11 | 12 | # Thumbnail cache files 13 | ._* 14 | Thumbs.db 15 | 16 | # Files that might appear on external disks 17 | .Spotlight-V100 18 | .Trashes 19 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "private": true, 3 | "scripts": { 4 | "test": ":", 5 | "build": "mkdir -p dist; ecmarkup --verbose spec.html dist/index.html --css dist/ecmarkup.css --js 
dist/ecmarkup.js" 6 | }, 7 | "devDependencies": { 8 | "@alrra/travis-scripts": "^3.0.1", 9 | "ecmarkup": "^3.12.0" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /table-nonbinary-unicode-properties.html: -------------------------------------------------------------------------------- 1 | 2 | Non-binary Unicode property aliases and their canonical property names 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 17 | 18 | 19 | 20 | 26 | 27 | 28 | 29 | 35 | 36 | 37 |

Property name and aliases	Canonical property name
12 \| 13 \| `General_Category` 14 \| `gc` 15 \| 16 \|	`General_Category`
21 \| 22 \| `Script` 23 \| `sc` 24 \| 25 \|	`Script`
30 \| 31 \| `Script_Extensions` 32 \| `scx` 33 \| 34 \|	`Script_Extensions`

38 | 39 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | after_success: 3 | - $(npm bin)/set-up-ssh --key "${encrypted_7d5e65f18d78_key}" 4 | --iv "${encrypted_7d5e65f18d78_iv}" 5 | --path-encrypted-key .travis-github-deploy-key.enc 6 | - $(npm bin)/update-branch --commands 'npm run build' 7 | --commit-message "Update gh-pages @ ${TRAVIS_COMMIT}" 8 | --directory 'dist' 9 | --distribution-branch 'gh-pages' 10 | --source-branch 'master' 11 | git: 12 | depth: 1 13 | branches: 14 | only: 15 | - master 16 | env: 17 | global: 18 | secure: "npte8Ch8CnVYBkM0wTzQz5FF7FmrIDrnR4n/4YJ/zUo3CgaQbYeQ55eK44ge4Etu08em52BBlaBXq5eQ8kdZyLNf4pENGh/IzmSE6uvLKgsT2LV2IKXvWhpgc+D+nhDvUm0XnhuJ2HxfHCspYXDJxBzzhZUFsjvvApKwmi1L4crMum0anjmf+d1ob2E5ZFrcV5BdSv1fds7a5aOU232FDoJDp9HbTEnz5TteS1P+CyJu0R0hD7w2PZj7vU8ZaP7h+Pa7tJc1y92pcTostMw+z6FFhxpunsPWdvT4vkn5Tx7fVBQgoWSLP250/soXIaRY7fKq2qvnq7E9dRI5lqgOGzcLiuTiMHdSia+1zRxqEdPqIBEyLLKfZBAq3s77TiQOAiuwIr+dvKsTAAlbKqGrLc6kZfrvUlekHtP5C8nNhExbmBOSAs0vFK1EeavNONLVxqftMhbcxjc7+fDWe1/KtpDhSK6X/hlB/LGnYSDF5CTak01mNPDO578Be+YhC2q3Au+Ns/z0JLR6XWyd/8qRYunvHeP8eZHscJ2OyA/Aa7LWjXngXEQDsZDM80KQSlDe1/NoZAV1QEcnQ/WMWAmHhP9cx2kbQv8qh9m8yWT4QesQTm08y4MMsGtgm389VwLE5QhIb4OaGL3KSZ8IWJzeNjfGNmMFLjtltICd8vQ2b0k=" 19 | -------------------------------------------------------------------------------- /table-unicode-general-category-values.html: -------------------------------------------------------------------------------- 1 | 2 | Value aliases and canonical values for the Unicode property `General_Category` 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 17 | 18 | 19 | 20 | 26 | 27 | 28 | 29 | 35 | 36 | 37 | 38 | 45 | 46 | 47 | 48 | 54 | 55 | 56 | 57 | 63 | 64 | 65 | 66 | 73 | 74 | 75 | 76 | 82 | 83 | 84 | 85 | 91 | 92 | 93 | 94 | 100 | 101 | 102 | 103 | 109 | 110 | 111 | 112 | 118 | 119 | 120 | 121 | 127 | 128 | 129 | 130 | 136 | 
137 | 138 | 139 | 145 | 146 | 147 | 148 | 155 | 156 | 157 | 158 | 164 | 165 | 166 | 167 | 173 | 174 | 175 | 176 | 182 | 183 | 184 | 185 | 191 | 192 | 193 | 194 | 200 | 201 | 202 | 203 | 209 | 210 | 211 | 212 | 218 | 219 | 220 | 221 | 227 | 228 | 229 | 230 | 236 | 237 | 238 | 239 | 245 | 246 | 247 | 248 | 254 | 255 | 256 | 257 | 263 | 264 | 265 | 266 | 272 | 273 | 274 | 275 | 282 | 283 | 284 | 285 | 291 | 292 | 293 | 294 | 300 | 301 | 302 | 303 | 309 | 310 | 311 | 312 | 318 | 319 | 320 | 321 | 327 | 328 | 329 | 330 | 336 | 337 | 338 | 339 | 345 | 346 | 347 | 348 | 354 | 355 | 356 |

Property value and aliases	Canonical property value
12 \| 13 \| `Cased_Letter` 14 \| `LC` 15 \| 16 \|	`Cased_Letter`
21 \| 22 \| `Close_Punctuation` 23 \| `Pe` 24 \| 25 \|	`Close_Punctuation`
30 \| 31 \| `Connector_Punctuation` 32 \| `Pc` 33 \| 34 \|	`Connector_Punctuation`
39 \| 40 \| `Control` 41 \| `Cc` 42 \| `cntrl` 43 \| 44 \|	`Control`
49 \| 50 \| `Currency_Symbol` 51 \| `Sc` 52 \| 53 \|	`Currency_Symbol`
58 \| 59 \| `Dash_Punctuation` 60 \| `Pd` 61 \| 62 \|	`Dash_Punctuation`
67 \| 68 \| `Decimal_Number` 69 \| `Nd` 70 \| `digit` 71 \| 72 \|	`Decimal_Number`
77 \| 78 \| `Enclosing_Mark` 79 \| `Me` 80 \| 81 \|	`Enclosing_Mark`
86 \| 87 \| `Final_Punctuation` 88 \| `Pf` 89 \| 90 \|	`Final_Punctuation`
95 \| 96 \| `Format` 97 \| `Cf` 98 \| 99 \|	`Format`
104 \| 105 \| `Initial_Punctuation` 106 \| `Pi` 107 \| 108 \|	`Initial_Punctuation`
113 \| 114 \| `Letter` 115 \| `L` 116 \| 117 \|	`Letter`
122 \| 123 \| `Letter_Number` 124 \| `Nl` 125 \| 126 \|	`Letter_Number`
131 \| 132 \| `Line_Separator` 133 \| `Zl` 134 \| 135 \|	`Line_Separator`
140 \| 141 \| `Lowercase_Letter` 142 \| `Ll` 143 \| 144 \|	`Lowercase_Letter`
149 \| 150 \| `Mark` 151 \| `M` 152 \| `Combining_Mark` 153 \| 154 \|	`Mark`
159 \| 160 \| `Math_Symbol` 161 \| `Sm` 162 \| 163 \|	`Math_Symbol`
168 \| 169 \| `Modifier_Letter` 170 \| `Lm` 171 \| 172 \|	`Modifier_Letter`
177 \| 178 \| `Modifier_Symbol` 179 \| `Sk` 180 \| 181 \|	`Modifier_Symbol`
186 \| 187 \| `Nonspacing_Mark` 188 \| `Mn` 189 \| 190 \|	`Nonspacing_Mark`
195 \| 196 \| `Number` 197 \| `N` 198 \| 199 \|	`Number`
204 \| 205 \| `Open_Punctuation` 206 \| `Ps` 207 \| 208 \|	`Open_Punctuation`
213 \| 214 \| `Other` 215 \| `C` 216 \| 217 \|	`Other`
222 \| 223 \| `Other_Letter` 224 \| `Lo` 225 \| 226 \|	`Other_Letter`
231 \| 232 \| `Other_Number` 233 \| `No` 234 \| 235 \|	`Other_Number`
240 \| 241 \| `Other_Punctuation` 242 \| `Po` 243 \| 244 \|	`Other_Punctuation`
249 \| 250 \| `Other_Symbol` 251 \| `So` 252 \| 253 \|	`Other_Symbol`
258 \| 259 \| `Paragraph_Separator` 260 \| `Zp` 261 \| 262 \|	`Paragraph_Separator`
267 \| 268 \| `Private_Use` 269 \| `Co` 270 \| 271 \|	`Private_Use`
276 \| 277 \| `Punctuation` 278 \| `P` 279 \| `punct` 280 \| 281 \|	`Punctuation`
286 \| 287 \| `Separator` 288 \| `Z` 289 \| 290 \|	`Separator`
295 \| 296 \| `Space_Separator` 297 \| `Zs` 298 \| 299 \|	`Space_Separator`
304 \| 305 \| `Spacing_Mark` 306 \| `Mc` 307 \| 308 \|	`Spacing_Mark`
313 \| 314 \| `Surrogate` 315 \| `Cs` 316 \| 317 \|	`Surrogate`
322 \| 323 \| `Symbol` 324 \| `S` 325 \| 326 \|	`Symbol`
331 \| 332 \| `Titlecase_Letter` 333 \| `Lt` 334 \| 335 \|	`Titlecase_Letter`
340 \| 341 \| `Unassigned` 342 \| `Cn` 343 \| 344 \|	`Unassigned`
349 \| 350 \| `Uppercase_Letter` 351 \| `Lu` 352 \| 353 \|	`Uppercase_Letter`

357 | 358 | -------------------------------------------------------------------------------- /table-binary-unicode-properties.html: -------------------------------------------------------------------------------- 1 | 2 | Binary Unicode property aliases and their canonical property names 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 21 | 22 | 23 | 24 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 47 | 48 | 49 | 50 | 56 | 57 | 58 | 59 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 78 | 79 | 80 | 81 | 87 | 88 | 89 | 90 | 96 | 97 | 98 | 99 | 105 | 106 | 107 | 108 | 114 | 115 | 116 | 117 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 136 | 137 | 138 | 139 | 145 | 146 | 147 | 148 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 183 | 184 | 185 | 186 | 192 | 193 | 194 | 195 | 201 | 202 | 203 | 204 | 210 | 211 | 212 | 213 | 219 | 220 | 221 | 222 | 228 | 229 | 230 | 231 | 237 | 238 | 239 | 240 | 246 | 247 | 248 | 249 | 255 | 256 | 257 | 258 | 264 | 265 | 266 | 267 | 273 | 274 | 275 | 276 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 295 | 296 | 297 | 298 | 304 | 305 | 306 | 307 | 313 | 314 | 315 | 316 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 335 | 336 | 337 | 338 | 344 | 345 | 346 | 347 | 353 | 354 | 355 | 356 | 362 | 363 | 364 | 365 | 371 | 372 | 373 | 374 | 380 | 381 | 382 | 383 | 389 | 390 | 391 | 392 | 398 | 399 | 400 | 401 | 407 | 408 | 409 | 410 | 416 | 417 | 418 |

Property name and aliases	Canonical property name
`ASCII`	`ASCII`
16 \| 17 \| `ASCII_Hex_Digit` 18 \| `AHex` 19 \| 20 \|	`ASCII_Hex_Digit`
25 \| 26 \| `Alphabetic` 27 \| `Alpha` 28 \| 29 \|	`Alphabetic`
`Any`	`Any`
`Assigned`	`Assigned`
42 \| 43 \| `Bidi_Control` 44 \| `Bidi_C` 45 \| 46 \|	`Bidi_Control`
51 \| 52 \| `Bidi_Mirrored` 53 \| `Bidi_M` 54 \| 55 \|	`Bidi_Mirrored`
60 \| 61 \| `Case_Ignorable` 62 \| `CI` 63 \| 64 \|	`Case_Ignorable`
`Cased`	`Cased`
73 \| 74 \| `Changes_When_Casefolded` 75 \| `CWCF` 76 \| 77 \|	`Changes_When_Casefolded`
82 \| 83 \| `Changes_When_Casemapped` 84 \| `CWCM` 85 \| 86 \|	`Changes_When_Casemapped`
91 \| 92 \| `Changes_When_Lowercased` 93 \| `CWL` 94 \| 95 \|	`Changes_When_Lowercased`
100 \| 101 \| `Changes_When_NFKC_Casefolded` 102 \| `CWKCF` 103 \| 104 \|	`Changes_When_NFKC_Casefolded`
109 \| 110 \| `Changes_When_Titlecased` 111 \| `CWT` 112 \| 113 \|	`Changes_When_Titlecased`
118 \| 119 \| `Changes_When_Uppercased` 120 \| `CWU` 121 \| 122 \|	`Changes_When_Uppercased`
`Dash`	`Dash`
131 \| 132 \| `Default_Ignorable_Code_Point` 133 \| `DI` 134 \| 135 \|	`Default_Ignorable_Code_Point`
140 \| 141 \| `Deprecated` 142 \| `Dep` 143 \| 144 \|	`Deprecated`
149 \| 150 \| `Diacritic` 151 \| `Dia` 152 \| 153 \|	`Diacritic`
`Emoji`	`Emoji`
`Emoji_Component`	`Emoji_Component`
`Emoji_Modifier`	`Emoji_Modifier`
`Emoji_Modifier_Base`	`Emoji_Modifier_Base`
`Emoji_Presentation`	`Emoji_Presentation`
178 \| 179 \| `Extender` 180 \| `Ext` 181 \| 182 \|	`Extender`
187 \| 188 \| `Grapheme_Base` 189 \| `Gr_Base` 190 \| 191 \|	`Grapheme_Base`
196 \| 197 \| `Grapheme_Extend` 198 \| `Gr_Ext` 199 \| 200 \|	`Grapheme_Extend`
205 \| 206 \| `Hex_Digit` 207 \| `Hex` 208 \| 209 \|	`Hex_Digit`
214 \| 215 \| `IDS_Binary_Operator` 216 \| `IDSB` 217 \| 218 \|	`IDS_Binary_Operator`
223 \| 224 \| `IDS_Trinary_Operator` 225 \| `IDST` 226 \| 227 \|	`IDS_Trinary_Operator`
232 \| 233 \| `ID_Continue` 234 \| `IDC` 235 \| 236 \|	`ID_Continue`
241 \| 242 \| `ID_Start` 243 \| `IDS` 244 \| 245 \|	`ID_Start`
250 \| 251 \| `Ideographic` 252 \| `Ideo` 253 \| 254 \|	`Ideographic`
259 \| 260 \| `Join_Control` 261 \| `Join_C` 262 \| 263 \|	`Join_Control`
268 \| 269 \| `Logical_Order_Exception` 270 \| `LOE` 271 \| 272 \|	`Logical_Order_Exception`
277 \| 278 \| `Lowercase` 279 \| `Lower` 280 \| 281 \|	`Lowercase`
`Math`	`Math`
290 \| 291 \| `Noncharacter_Code_Point` 292 \| `NChar` 293 \| 294 \|	`Noncharacter_Code_Point`
299 \| 300 \| `Pattern_Syntax` 301 \| `Pat_Syn` 302 \| 303 \|	`Pattern_Syntax`
308 \| 309 \| `Pattern_White_Space` 310 \| `Pat_WS` 311 \| 312 \|	`Pattern_White_Space`
317 \| 318 \| `Quotation_Mark` 319 \| `QMark` 320 \| 321 \|	`Quotation_Mark`
`Radical`	`Radical`
330 \| 331 \| `Regional_Indicator` 332 \| `RI` 333 \| 334 \|	`Regional_Indicator`
339 \| 340 \| `Sentence_Terminal` 341 \| `STerm` 342 \| 343 \|	`Sentence_Terminal`
348 \| 349 \| `Soft_Dotted` 350 \| `SD` 351 \| 352 \|	`Soft_Dotted`
357 \| 358 \| `Terminal_Punctuation` 359 \| `Term` 360 \| 361 \|	`Terminal_Punctuation`
366 \| 367 \| `Unified_Ideograph` 368 \| `UIdeo` 369 \| 370 \|	`Unified_Ideograph`
375 \| 376 \| `Uppercase` 377 \| `Upper` 378 \| 379 \|	`Uppercase`
384 \| 385 \| `Variation_Selector` 386 \| `VS` 387 \| 388 \|	`Variation_Selector`
393 \| 394 \| `White_Space` 395 \| `space` 396 \| 397 \|	`White_Space`
402 \| 403 \| `XID_Continue` 404 \| `XIDC` 405 \| 406 \|	`XID_Continue`
411 \| 412 \| `XID_Start` 413 \| `XIDS` 414 \| 415 \|	`XID_Start`

419 | 420 | -------------------------------------------------------------------------------- /spec.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

  4 | title: Unicode property escapes in regular expressions
  5 | status: proposal
  6 | stage: 4
  7 | location: https://tc39.github.io/proposal-regexp-unicode-property-escapes/
  8 | copyright: false
  9 | contributors: Mathias Bynens
 10 |

11 | 12 | 13 | 34 | 35 |

The syntax listed in 21.2.1 Patterns is modified as follows.

36 | 37 | 38 | ControlLetter :: one of 39 | `a` `b` `c` `d` `e` `f` `g` `h` `i` `j` `k` `l` `m` `n` `o` `p` `q` `r` `s` `t` `u` `v` `w` `x` `y` `z` 40 | `A` `B` `C` `D` `E` `F` `G` `H` `I` `J` `K` `L` `M` `N` `O` `P` `Q` `R` `S` `T` `U` `V` `W` `X` `Y` `Z` 41 | 42 | CharacterClassEscape[U] :: 43 | `d` 44 | `D` 45 | `s` 46 | `S` 47 | `w` 48 | `W` 49 | [+U] `p{` UnicodePropertyValueExpression `}` 50 | [+U] `P{` UnicodePropertyValueExpression `}` 51 | 52 | UnicodePropertyValueExpression :: 53 | UnicodePropertyName `=` UnicodePropertyValue 54 | LoneUnicodePropertyNameOrValue 55 | 56 | UnicodePropertyNameCharacter :: 57 | ControlLetter 58 | `_` 59 | 60 | UnicodePropertyNameCharacters :: 61 | UnicodePropertyNameCharacter UnicodePropertyNameCharacters? 62 | 63 | UnicodePropertyName :: 64 | UnicodePropertyNameCharacters 65 | 66 | UnicodePropertyValueCharacter :: 67 | UnicodePropertyNameCharacter 68 | `0` 69 | `1` 70 | `2` 71 | `3` 72 | `4` 73 | `5` 74 | `6` 75 | `7` 76 | `8` 77 | `9` 78 | 79 | UnicodePropertyValueCharacters :: 80 | UnicodePropertyValueCharacter UnicodePropertyValueCharacters? 81 | 82 | UnicodePropertyValue :: 83 | UnicodePropertyValueCharacters 84 | 85 | LoneUnicodePropertyNameOrValue :: 86 | UnicodePropertyValueCharacters 87 | 88 | 89 | 90 |

Static Semantics: SourceText

91 | UnicodePropertyNameCharacters :: UnicodePropertyNameCharacter UnicodePropertyNameCharacters? 92 | UnicodePropertyValueCharacters :: UnicodePropertyValueCharacter UnicodePropertyValueCharacters? 93 | 94 | 1. Return the List, in source text order, of Unicode code points in the source text matched by this production. 95 | 96 | 97 | 98 |

99 | 100 |

The following items are appended to 21.2.1.1 Static Semantics: Early Errors.

101 | 102 | UnicodePropertyValueExpression :: UnicodePropertyName `=` UnicodePropertyValue 103 |

105 | It is a Syntax Error if the List of Unicode code points that is SourceText of UnicodePropertyName is not identical to a List of Unicode code points that is a Unicode property name or property alias listed in the “Property name and aliases” column of <emu-xref href="#table-nonbinary-unicode-properties"></emu-xref>. 106 |
108 | It is a Syntax Error if the List of Unicode code points that is SourceText of UnicodePropertyValue is not identical to a List of Unicode code points that is a value or value alias for the Unicode property or property alias given by SourceText of UnicodePropertyName listed in the “Property value and aliases” column of the corresponding tables <emu-xref href="#table-unicode-general-category-values"></emu-xref> or <emu-xref href="#table-unicode-script-values"></emu-xref>. 109 |

111 | 112 | UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue 113 |

115 | It is a Syntax Error if the List of Unicode code points that is SourceText of LoneUnicodePropertyNameOrValue is not identical to a List of Unicode code points that is a Unicode general category or general category alias listed in the “Property value and aliases” column of <emu-xref href="#table-unicode-general-category-values"></emu-xref>, nor a binary property or binary property alias listed in the “Property name and aliases” column of <emu-xref href="#table-binary-unicode-properties"></emu-xref>. 116 |

118 | 119 |

120 | 121 |

The following two abstract operations are appended to 21.2.2.8 Atom.

122 | 123 | 124 |

Runtime Semantics: UnicodeMatchProperty ( _p_ )

125 |

The algorithm uses values from the following tables, which associate supported Unicode property names and property aliases and their canonical property names.

126 |

Implementations must support the following non-binary Unicode properties and their property aliases:

127 | 128 |

Additionally, implementations must support the following binary Unicode properties and their property aliases:

129 | 130 |

The abstract operation UnicodeMatchProperty takes a parameter _p_ that is a List of Unicode code points and performs the following steps:

131 | 132 | 1. Assert: _p_ is a List of Unicode code points that is identical to a List of Unicode code points that is a Unicode property name or property alias listed in the “Property name and aliases” column of or . 133 | 1. Let _p_ be the canonical property name of _p_ as given in the “Canonical property name” column of the corresponding row. 134 | 1. Return the List of Unicode code points of _p_. 135 | 136 |

To ensure interoperability, implementations must not extend Unicode property support to the remaining properties.

137 |

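Implementations must only recognize the property aliases listed in <emu-xref href="#table-nonbinary-unicode-properties"></emu-xref> and <emu-xref href="#table-binary-unicode-properties"></emu-xref>.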

138 |

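Implementations must only recognize the property value aliases and canonical property value names listed in <emu-xref href="#table-unicode-general-category-values"></emu-xref> and <emu-xref href="#table-unicode-script-values"></emu-xref>.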

139 | 140 |

For example, `Script_Extensions` (property name) and `scx` (property alias) are valid, but `script_extensions` or `Scx` aren’t.

141 | 142 | 143 |

The listed properties form a superset of what UTS18 RL1.2 requires.

144 | 145 | 146 | 147 | 148 |

Runtime Semantics: UnicodeMatchPropertyValue ( _p_, _v_ )

149 |

The algorithm uses values from the following tables, which associate canonical Unicode property names and their supported values and value aliases:

150 | 151 | 152 |

The abstract operation UnicodeMatchPropertyValue takes two parameters _p_ and _v_, each of which is a List of Unicode code points, and performs the following steps:

153 | 154 | 1. Assert: _p_ is a List of Unicode code points that is identical to a List of Unicode code points that is a canonical, unaliased Unicode property name listed in the “Canonical property name” column of . 155 | 1. Assert: _v_ is a List of Unicode code points that is identical to a List of Unicode code points that is a property value or property value alias for Unicode property _p_ listed in the “Property value and aliases” column of or . 156 | 1. Let _value_ be the canonical property value of _v_ as given in the “Canonical property value” column of the corresponding row. 157 | 1. Return the List of Unicode code points of _value_. 158 | 159 |

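Only the canonical property values and property value aliases listed in <emu-xref href="#table-unicode-general-category-values"></emu-xref> and <emu-xref href="#table-unicode-script-values"></emu-xref> must be recognized.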

160 | 161 |

For example, `Xpeo` and `Old_Persian` are valid `Script_Extensions` values, but `xpeo` and `Old Persian` aren’t.

162 | 163 | 164 |

This algorithm differs from the matching rules for symbolic values listed in UAX44: case, white space, U+002D (HYPHEN-MINUS), and U+005F (LOW LINE) are not ignored, and the `Is` prefix is not supported.

165 | 166 | 167 | 168 |

169 | 170 |

The following is appended to the list of productions in 21.2.2.12 CharacterClassEscape.

171 | 172 |

The production CharacterClassEscape :: `\p{` UnicodePropertyValueExpression `}` evaluates by returning the CharSet containing all Unicode code points included in the CharSet returned by UnicodePropertyValueExpression.

173 |

The production CharacterClassEscape :: `\P{` UnicodePropertyValueExpression `}` evaluates by returning the CharSet containing all Unicode code points not included in the CharSet returned by UnicodePropertyValueExpression.

174 |

The production UnicodePropertyValueExpression :: UnicodePropertyName `=` UnicodePropertyValue evaluates as follows:

175 | 176 | 1. Let _p_ be ! UnicodeMatchProperty(_UnicodePropertyName_). 177 | 1. Assert: _p_ is a Unicode property name or property alias listed in the “Property name and aliases” column of . 178 | 1. Let _v_ be ! UnicodeMatchPropertyValue(_p_, _UnicodePropertyValue_). 179 | 1. Return the CharSet containing all Unicode code points whose character database definition includes the property _p_ with value _v_. 180 | 181 |

The production UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue evaluates as follows:

182 | 183 | 1. If ! UnicodeMatchPropertyValue(`"General_Category"`, _LoneUnicodePropertyNameOrValue_) is identical to a List of Unicode code points that is the name of a Unicode general category or general category alias listed in the “Property value and aliases” column of , then 184 | 1. Return the CharSet containing all Unicode code points whose character database definition includes the property `General_Category` with value _LoneUnicodePropertyNameOrValue_. 185 | 1. Let _p_ be ! UnicodeMatchProperty(_LoneUnicodePropertyNameOrValue_). 186 | 1. Assert: _p_ is a binary Unicode property or binary property alias listed in the “Property name and aliases” column of . 187 | 1. Return the CharSet containing all Unicode code points whose character database definition includes the property _p_ with value |True|. 188 | 189 | 190 |

The following is appended to the bibliography.

191 | 192 | 193 | 194 |

Bibliography

195 |

197 | Unicode Technical Standard #18: Unicode Regular Expressions, available at <https://unicode.org/reports/tr18/> 198 |
200 | Unicode Standard Annex #24: Unicode `Script` Property, available at <https://unicode.org/reports/tr24/> 201 |
203 | Unicode Standard Annex #44: Unicode Character Database, available at <https://unicode.org/reports/tr44/> 204 |
206 | Unicode Technical Report #51: Unicode Emoji, available at <https://unicode.org/reports/tr51/> 207 |

209 | 210 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ECMAScript proposal: Unicode property escapes in regular expressions 2 | 3 | ## Status 4 | 5 | This proposal is at stage 4 of [the TC39 process](https://tc39.github.io/process-document/) and is scheduled to be included in ES2018. 6 | 7 | ## Motivation 8 | 9 | The Unicode Standard assigns various properties and property values to every symbol. For example, to get the set of symbols that are used exclusively in the Greek script, search the Unicode database for symbols whose `Script` property is set to `Greek`. 10 | 11 | There currently is no way to access these Unicode character properties natively in ECMAScript regular expressions. This makes it painful for developers to support full Unicode in their regular expressions. They currently have two options, neither of which is ideal: 12 | 13 | 1. Use a library such as [XRegExp](https://github.com/slevithan/xregexp) to create the regular expressions at run-time: 14 | 15 | ```js 16 | const regexGreekSymbol = XRegExp('\\p{Greek}', 'A'); 17 | regexGreekSymbol.test('π'); 18 | // → true 19 | ``` 20 | 21 | The downside of this approach is that the XRegExp library is a run-time dependency which may not be ideal for performance-sensitive applications. For usage on the web, there is an additional load-time performance penalty: `xregexp-all-min.js.gz` takes up over 35 KB of space after minifying and applying gzip compression. Whenever the Unicode Standard is updated, a new version of XRegExp must be published and end users need to update their XRegExp copy in order to use the latest available data. 22 | 23 | 2. 
Use a library such as [Regenerate](https://github.com/mathiasbynens/regenerate) to generate the regular expression at build time: 24 | 25 | ```js 26 | const regenerate = require('regenerate'); 27 | const codePoints = require('unicode-9.0.0/Script/Greek/code-points.js'); 28 | const set = regenerate(codePoints); 29 | set.toString(); 30 | // → '[\u0370-\u0373\u0375-\u0377\u037A-\u037D\u037F\u0384\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03E1\u03F0-\u03FF\u1D26-\u1D2A\u1D5D-\u1D61\u1D66-\u1D6A\u1DBF\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FC4\u1FC6-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEF\u1FF2-\u1FF4\u1FF6-\u1FFE\u2126\uAB65]|\uD800[\uDD40-\uDD8E\uDDA0]|\uD834[\uDE00-\uDE45]' 31 | // Imagine there’s more code here to save this pattern to a file. 32 | ``` 33 | 34 | This approach results in optimal run-time performance, although the generated regular expressions tend to be fairly large in size (which could lead to load-time performance problems on the web). The biggest downside is that it requires a build script, which gets painful as the developer needs more Unicode-aware regular expressions. Whenever the Unicode Standard is updated, the build script must be updated and its results must be deployed in order to use the latest available data. 35 | 36 | ## Proposed solution 37 | 38 | We propose the addition of _Unicode property escapes_ of the form `\p{…}` and `\P{…}`. Unicode property escapes are a new type of escape sequence available in regular expressions that have the `u` flag set. With this feature, the above regular expression could be written as: 39 | 40 | ```js 41 | const regexGreekSymbol = /\p{Script=Greek}/u; 42 | regexGreekSymbol.test('π'); 43 | // → true 44 | ``` 45 | 46 | This proposal solves all the abovementioned problems: 47 | 48 | * It is no longer painful to create Unicode-aware regular expressions. 49 | * There is no dependency on run-time libraries. 
50 | * The regular expression patterns are compact and readable — no more file size bloat. 51 | * Creating a script that generates the regular expression at build time is no longer necessary. 52 | * Code that uses Unicode property escapes stays up-to-date “automatically” from the developer’s point of view: whenever the Unicode Standard gets an update, the ECMAScript engine updates its data. 53 | 54 | ## High-level API 55 | 56 | Unicode property escapes for non-binary Unicode properties look like this: 57 | 58 |

\p{UnicodePropertyName=UnicodePropertyValue}

59 | 60 | The aliases defined in [`PropertyAliases.txt`](http://unicode.org/Public/UNIDATA/PropertyAliases.txt) and [`PropertyValueAliases.txt`](http://unicode.org/Public/UNIDATA/PropertyValueAliases.txt) may be used instead of the canonical property and value names. The use of an unknown property name or value triggers an early `SyntaxError`. 61 | 62 | For binary properties, the following syntax is available: 63 | 64 |

\p{LoneUnicodePropertyNameOrValue}

65 | 66 | This syntax may also be used as a shorthand for `General_Category` values, e.g. `\p{Letter}` instead of `\p{General_Category=Letter}`. 67 | 68 | `\P{…}` is the negated form of `\p{…}`. 69 | 70 | Implementations must support the list of Unicode properties and their property aliases mentioned in the spec proposal. This includes `General_Category`, `Script`, `Script_Extensions`, and some binary properties (including but not limited to `Alphabetic`, `Uppercase`, `Lowercase`, `White_Space`, `Noncharacter_Code_Point`, `Default_Ignorable_Code_Point`, `Any`, `ASCII`, `Assigned`, `ID_Start`, `ID_Continue`, `Join_Control`, `Emoji_Presentation`, `Emoji_Modifier`, `Emoji_Modifier_Base`, etc.). This is a superset of what [UTS18 RL1.2](http://unicode.org/reports/tr18/#RL1.2) requires. To ensure interoperability, implementations must not extend Unicode property support to the remaining properties. 71 | 72 | ### FAQ 73 | 74 | #### What about backwards compatibility? 75 | 76 | In regular expressions without the `u` flag, the pattern `\p` is an (unnecessary) escape sequence for `p`. Patterns of the form `\p{Letter}` might already be present in existing regular expressions without the `u` flag, and therefore we cannot assign new meaning to such patterns without breaking backwards compatibility. 77 | 78 | For this reason, ECMAScript 2015 made unnecessary escape sequences like `\p` and `\P` [throw an exception](https://bugs.ecmascript.org/show_bug.cgi?id=3157) when the `u` flag is set. This enables us to change the meaning of `\p{…}` and `\P{…}` in regular expressions with the `u` flag without breaking backwards compatibility. 79 | 80 | #### Why not support loose matching? 81 | 82 | [UAX44-LM3](http://unicode.org/reports/tr44/#Matching_Symbolic) specifies the loose matching rules for comparing Unicode property and value aliases. 
83 | 84 | > Ignore case, whitespace, underscores, hyphens, […] 85 | 86 | Loose matching makes `\p{lB=Ba}` equivalent to `\p{Line_Break=Break_After}` or `/\p{___lower C-A-S-E___}/u` equivalent to `/\p{Lowercase}/u`. We assert that this feature does not add any value, and in fact harms code readability and maintainability. 87 | 88 | Should the need arise, then support for loose matching can always be added later, as part of a separate ECMAScript proposal. If we add it now, however, there is no going back. 89 | 90 | #### Why not support the `is` prefix? 91 | 92 | [UAX44-LM3](http://unicode.org/reports/tr44/#Matching_Symbolic) specifies the loose matching rules for comparing Unicode property and value aliases, one of which is: 93 | 94 | > Ignore […] any initial prefix string `is`. 95 | 96 | This rule makes `Script=IsGreek` and `IsScript=Greek` equivalent to `Script=Greek`. We assert that this feature does not add any value, and in fact harms code readability. It introduces ambiguity and increases implementation complexity, since some property values or aliases already start with `is`, e.g. `Decomposition_Type=Isolated` and `Line_Break=IS` which is an alias for `Line_Break=Infix_Numeric`. 97 | 98 | Compatibility with Unicode property escapes in other languages is not an argument either, since [no existing regular expression engine](http://unicode.org/mail-arch/unicode-ml/y2016-m06/0012.html) seems to implement the `is` prefix exactly as described in UAX44-LM3, and those that partially implement it wildly differ in behavior. 99 | 100 | Strictness is preferred over ambiguity. 101 | 102 | Should the need arise, then support for the `is` prefix can always be added later, as part of a separate ECMAScript proposal. If we add it now, however, there is no going back. 103 | 104 | #### Why not support e.g. `\pL` as a shorthand for `\p{L}`? 105 | 106 | This shorthand doesn’t add any value and as such the added implementation complexity (small as it may be) isn’t worth it. 
`\p{L}` works; there’s no reason to introduce another syntax for it other than compatibility with other languages which is a utopian goal anyhow. 107 | 108 | Should the need arise, then support for this shorthand can always be added later, as part of a separate ECMAScript proposal. If we add it now, however, there is no going back. 109 | 110 | #### Why use `=` (and not something else) as a separator? 111 | 112 | The `=` in `\p{…=…}` aligns with the `=` in `(?=…)` for positive lookaheads and `(?<=…)` for positive lookbehinds. Also, `=` is what most regular expression engines use as a separator. [See issue #8 for more information.](https://github.com/tc39/proposal-regexp-unicode-property-escapes/issues/8) 113 | 114 | #### Why not support `:` as a separator in addition to `=`? 115 | 116 | Supporting multiple separators doesn’t add any value and as such the added implementation complexity (small as it may be) isn’t worth it. `\p{Script_Extensions=Greek}` works; there’s no reason to introduce another syntax for it other than compatibility with other languages which is a utopian goal anyhow. 117 | 118 | Should the need arise, then support for the `:` separator can always be added later, as part of a separate ECMAScript proposal. If we add it now, however, there is no going back. 119 | 120 | #### Why not support e.g. `\p{ScriptName}` as a shorthand for `\p{Script=ScriptName}`? 121 | 122 | In the majority of use cases, `Script_Extensions` should be used over `Script`. [UAX24](http://unicode.org/reports/tr24/#Multiple_Script_Values) explains this nicely with practical examples. As such, it would make more sense to add a shorthand for `Script_Extensions` than for `Script`. Doing either would cause confusion, however, since the sets of values for these two properties are identical. For example, it wouldn’t be clear if `\p{Old_Persian}` refers to the `Script` or `Script_Extensions` with that name.
123 | 124 | #### Why not overload `\u{…}` instead of adding `\p{…}` and `\P{…}`? 125 | 126 | The main argument in favor of overloading `\u{…}` is that the `u` hints that the escape relates to Unicode. We assert that this hint is unnecessary, as the required `u` flag on the regular expression already indicates Unicode. 127 | 128 | The `p` in `\p{…}` stands for “property”. Combined with the `u` flag, this indicates nicely that the expression within the braces relates to a Unicode property. 129 | 130 | Overloading `\u{…}` introduces an ambiguity. Imagine a new binary property or general category named `Beef` is added to the Unicode Standard. Since `Beef` consists of hexadecimal digits only (`[A-Fa-f0-9]`), it’s unclear whether `\u{Beef}` is a code point escape sequence for [U+BEEF HANGUL SYLLABLE BBEGS](https://codepoints.net/U+BEEF) or whether it’s a property escape sequence referring to the property/category named `Beef`. 131 | 132 | Other languages that already support Unicode property escapes use `\p{…}` and `\P{…}`. Although compatibility with these other implementations is a non-goal (since they’re not compatible amongst themselves to begin with), it makes sense to follow the tradition here and re-use the base syntax that developers are already familiar with. 133 | 134 | #### Why not support the `Name` property (`\p{Name=…}`)? 135 | 136 | Developers already have a way to refer to a specific symbol without having to use that symbol in their source code: Unicode code point escapes of the form `\u{1D306}`. As such, the need to support `\p{Name=TETRAGRAM FOR CENTRE}` is not strong enough to warrant inclusion in this proposal. 137 | 138 | Support for the `Name` property can always be added later, as part of a separate ECMAScript proposal. If we add it now, however, there is no going back.
139 | 140 | ## Illustrative examples 141 | 142 | ### Unicode-aware version of `\d` 143 | 144 | To match any decimal number in Unicode rather than just ASCII `[0-9]`, use `\p{Decimal_Number}` instead of `\d` as per [UTS18](http://unicode.org/reports/tr18/#digit). 145 | 146 | ```js 147 | const regex = /^\p{Decimal_Number}+$/u; 148 | regex.test('𝟏𝟐𝟑𝟜𝟝𝟞𝟩𝟪𝟫𝟬𝟭𝟮𝟯𝟺𝟻𝟼'); 149 | // → true 150 | ``` 151 | 152 | ### Unicode-aware version of `\D` 153 | 154 | To match any Unicode symbol that is not a decimal number rather than just `[^0-9]`, use `\P{Decimal_Number}` instead of `\D`. 155 | 156 | ```js 157 | const regex = /^\P{Decimal_Number}+$/u; 158 | regex.test('Իմ օդաթիռը լի է օձաձկերով'); 159 | // → true 160 | ``` 161 | 162 | ### Unicode-aware version of `\w` 163 | 164 | To match any word symbol in Unicode rather than just ASCII `[a-zA-Z0-9_]`, use `[\p{Alphabetic}\p{Mark}\p{Decimal_Number}\p{Connector_Punctuation}\p{Join_Control}]` as per [UTS18](http://unicode.org/reports/tr18/#word). 165 | 166 | ```js 167 | const regex = /([\p{Alphabetic}\p{Mark}\p{Decimal_Number}\p{Connector_Punctuation}\p{Join_Control}]+)/gu; 168 | const text = ` 169 | Amharic: የኔ ማንዣበቢያ መኪና በዓሣዎች ተሞልቷል 170 | Bengali: আমার হভারক্রাফ্ট কুঁচে মাছ-এ ভরা হয়ে গেছে 171 | Georgian: ჩემი ხომალდი საჰაერო ბალიშზე სავსეა გველთევზებით 172 | Macedonian: Моето летачко возило е полно со јагули 173 | Vietnamese: Tàu cánh ngầm của tôi đầy lươn 174 | `; 175 | 176 | let match; 177 | while (match = regex.exec(text)) { 178 | const word = match[1]; 179 | console.log(`Matched word with length ${ word.length }: ${ word }`); 180 | } 181 | ``` 182 | 183 | Console output: 184 | 185 | ``` 186 | Matched word with length 7: Amharic 187 | Matched word with length 2: የኔ 188 | Matched word with length 6: ማንዣበቢያ 189 | Matched word with length 3: መኪና 190 | Matched word with length 5: በዓሣዎች 191 | Matched word with length 5: ተሞልቷል 192 | Matched word with length 7: Bengali 193 | Matched word with length 4: আমার 194 | Matched word with length 
11: হভারক্রাফ্ট 195 | Matched word with length 5: কুঁচে 196 | Matched word with length 3: মাছ 197 | Matched word with length 1: এ 198 | Matched word with length 3: ভরা 199 | Matched word with length 3: হয়ে 200 | Matched word with length 4: গেছে 201 | Matched word with length 8: Georgian 202 | Matched word with length 4: ჩემი 203 | Matched word with length 7: ხომალდი 204 | Matched word with length 7: საჰაერო 205 | Matched word with length 7: ბალიშზე 206 | Matched word with length 6: სავსეა 207 | Matched word with length 12: გველთევზებით 208 | Matched word with length 10: Macedonian 209 | Matched word with length 5: Моето 210 | Matched word with length 7: летачко 211 | Matched word with length 6: возило 212 | Matched word with length 1: е 213 | Matched word with length 5: полно 214 | Matched word with length 2: со 215 | Matched word with length 6: јагули 216 | Matched word with length 10: Vietnamese 217 | Matched word with length 3: Tàu 218 | Matched word with length 4: cánh 219 | Matched word with length 4: ngầm 220 | Matched word with length 3: của 221 | Matched word with length 3: tôi 222 | Matched word with length 3: đầy 223 | Matched word with length 4: lươn 224 | ``` 225 | 226 | ### Unicode-aware version of `\W` 227 | 228 | To match any non-word symbol in Unicode rather than just `[^a-zA-Z0-9_]`, use `[^\p{Alphabetic}\p{Mark}\p{Decimal_Number}\p{Connector_Punctuation}\p{Join_Control}]`. 229 | 230 | ### Matching emoji 231 | 232 | To match emoji symbols, the binary properties from [UTR51](http://unicode.org/reports/tr51/) come in handy. 233 | 234 | ```js 235 | const regex = /\p{Emoji_Modifier_Base}\p{Emoji_Modifier}?|\p{Emoji_Presentation}|\p{Emoji}\uFE0F/gu; 236 | ``` 237 | 238 | This regular expression matches, from left to right: 239 | 240 | 1. emoji with optional modifiers (`\p{Emoji_Modifier_Base}\p{Emoji_Modifier}?`); 241 | 2. any remaining symbols that render as emoji rather than text by default (`\p{Emoji_Presentation}`); 242 | 3. 
symbols that render as text by default, but are forced to render as emoji using U+FE0F VARIATION SELECTOR-16 (`\p{Emoji}\uFE0F`). 243 | 244 | ```js 245 | const regex = /\p{Emoji_Modifier_Base}\p{Emoji_Modifier}?|\p{Emoji_Presentation}|\p{Emoji}\uFE0F/gu; 246 | const text = ` 247 | \u{231A}: ⌚ default emoji presentation character (Emoji_Presentation) 248 | \u{2194}\u{FE0F}: ↔️ default text presentation character rendered as emoji 249 | \u{1F469}: 👩 emoji modifier base (Emoji_Modifier_Base) 250 | \u{1F469}\u{1F3FF}: 👩🏿 emoji modifier base followed by a modifier 251 | `; 252 | 253 | let match; 254 | while (match = regex.exec(text)) { 255 | const emoji = match[0]; 256 | console.log(`Matched sequence ${ emoji } — code points: ${ [...emoji].length }`); 257 | } 258 | ``` 259 | 260 | Console output: 261 | 262 | ``` 263 | Matched sequence ⌚ — code points: 1 264 | Matched sequence ⌚ — code points: 1 265 | Matched sequence ↔️ — code points: 2 266 | Matched sequence ↔️ — code points: 2 267 | Matched sequence 👩 — code points: 1 268 | Matched sequence 👩 — code points: 1 269 | Matched sequence 👩🏿 — code points: 2 270 | Matched sequence 👩🏿 — code points: 2 271 | ``` 272 | 273 | ### Other examples 274 | 275 | Match any numeric symbol in Unicode, including non-decimal symbols such as Roman numerals: 276 | 277 | ```js 278 | const regex = /^\p{Number}+$/u; 279 | regex.test('²³¹¼½¾𝟏𝟐𝟑𝟜𝟝𝟞𝟩𝟪𝟫𝟬𝟭𝟮𝟯𝟺𝟻𝟼㉛㉜㉝ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫⅬⅭⅮⅯⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹⅺⅻⅼⅽⅾⅿ'); 280 | // → true 281 | ``` 282 | 283 | Match ECMAScript [`IdentifierStart`](https://tc39.github.io/ecma262/#prod-IdentifierStart) or [`IdentifierPart`](https://tc39.github.io/ecma262/#prod-IdentifierPart) symbols [without the need for complex regular expressions generated by build scripts](https://gist.github.com/mathiasbynens/6334847): 284 | 285 | ```js 286 | const regexIdentifierStart = /[$_\p{ID_Start}]/u; 287 | const regexIdentifierPart = /[$\u200C\u200D\p{ID_Continue}]/u; 288 | const regexIdentifierName = 
/^(?:[$_\p{ID_Start}])(?:[$\u200C\u200D\p{ID_Continue}])*$/u; 289 | ``` 290 | 291 | ## Specification 292 | 293 | * [Ecmarkup source](https://github.com/tc39/proposal-regexp-unicode-property-escapes/blob/master/spec.html) 294 | * [HTML version](https://tc39.github.io/proposal-regexp-unicode-property-escapes/) 295 | 296 | ## Implementations 297 | 298 | * [V8](https://bugs.chromium.org/p/v8/issues/detail?id=4743), shipping in Chrome 64 299 | * [Safari/JavaScriptCore](https://developer.apple.com/safari/technology-preview/release-notes/) beginning in Safari Technology Preview 42 300 | * [regexpu (transpiler)](https://github.com/mathiasbynens/regexpu) with the `{ unicodePropertyEscape: true }` option enabled 301 | * [online demo](https://mothereff.in/regexpu#input=/%5Cp%7BLetter%7D/u&unicodePropertyEscape=1) 302 | * [exhaustive list of supported properties](https://github.com/mathiasbynens/regexpu-core/blob/master/property-escapes.md) 303 | * [Babel plugin](https://github.com/mathiasbynens/babel-plugin-transform-unicode-property-regex) 304 | -------------------------------------------------------------------------------- /table-unicode-script-values.html: -------------------------------------------------------------------------------- 1 | 2 | Value aliases and canonical values for the Unicode properties `Script` and `Script_Extensions` 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 17 | 18 | 19 | 20 | 26 | 27 | 28 | 29 | 35 | 36 | 37 | 38 | 44 | 45 | 46 | 47 | 53 | 54 | 55 | 56 | 62 | 63 | 64 | 65 | 71 | 72 | 73 | 74 | 80 | 81 | 82 | 83 | 89 | 90 | 91 | 92 | 98 | 99 | 100 | 101 | 107 | 108 | 109 | 110 | 116 | 117 | 118 | 119 | 125 | 126 | 127 | 128 | 134 | 135 | 136 | 137 | 143 | 144 | 145 | 146 | 152 | 153 | 154 | 155 | 161 | 162 | 163 | 164 | 170 | 171 | 172 | 173 | 179 | 180 | 181 | 182 | 188 | 189 | 190 | 191 | 197 | 198 | 199 | 200 | 206 | 207 | 208 | 209 | 215 | 216 | 217 | 218 | 224 | 225 | 226 | 227 | 234 | 235 | 236 | 237 | 243 | 244 | 245 | 246 | 252 | 253 | 254 | 255 | 
261 | 262 | 263 | 264 | 270 | 271 | 272 | 273 | 279 | 280 | 281 | 282 | 288 | 289 | 290 | 291 | 297 | 298 | 299 | 300 | 306 | 307 | 308 | 309 | 315 | 316 | 317 | 318 | 324 | 325 | 326 | 327 | 333 | 334 | 335 | 336 | 342 | 343 | 344 | 345 | 351 | 352 | 353 | 354 | 360 | 361 | 362 | 363 | 369 | 370 | 371 | 372 | 378 | 379 | 380 | 381 | 387 | 388 | 389 | 390 | 396 | 397 | 398 | 399 | 405 | 406 | 407 | 408 | 414 | 415 | 416 | 417 | 423 | 424 | 425 | 426 | 432 | 433 | 434 | 435 | 441 | 442 | 443 | 444 | 451 | 452 | 453 | 454 | 460 | 461 | 462 | 463 | 469 | 470 | 471 | 472 | 478 | 479 | 480 | 481 | 487 | 488 | 489 | 490 | 496 | 497 | 498 | 499 | 505 | 506 | 507 | 508 | 514 | 515 | 516 | 517 | 523 | 524 | 525 | 526 | 532 | 533 | 534 | 535 | 541 | 542 | 543 | 544 | 550 | 551 | 552 | 553 | 559 | 560 | 561 | 562 | 568 | 569 | 570 | 571 | 577 | 578 | 579 | 580 | 586 | 587 | 588 | 589 | 595 | 596 | 597 | 598 | 604 | 605 | 606 | 607 | 613 | 614 | 615 | 616 | 622 | 623 | 624 | 625 | 631 | 632 | 633 | 634 | 640 | 641 | 642 | 643 | 649 | 650 | 651 | 652 | 658 | 659 | 660 | 661 | 667 | 668 | 669 | 670 | 676 | 677 | 678 | 679 | 685 | 686 | 687 | 688 | 694 | 695 | 696 | 697 | 703 | 704 | 705 | 706 | 712 | 713 | 714 | 715 | 721 | 722 | 723 | 724 | 730 | 731 | 732 | 733 | 739 | 740 | 741 | 742 | 748 | 749 | 750 | 751 | 757 | 758 | 759 | 760 | 766 | 767 | 768 | 769 | 775 | 776 | 777 | 778 | 784 | 785 | 786 | 787 | 793 | 794 | 795 | 796 | 802 | 803 | 804 | 805 | 811 | 812 | 813 | 814 | 820 | 821 | 822 | 823 | 829 | 830 | 831 | 832 | 838 | 839 | 840 | 841 | 847 | 848 | 849 | 850 | 856 | 857 | 858 | 859 | 865 | 866 | 867 | 868 | 874 | 875 | 876 | 877 | 883 | 884 | 885 | 886 | 892 | 893 | 894 | 895 | 901 | 902 | 903 | 904 | 910 | 911 | 912 | 913 | 919 | 920 | 921 | 922 | 928 | 929 | 930 | 931 | 937 | 938 | 939 | 940 | 946 | 947 | 948 | 949 | 955 | 956 | 957 | 958 | 964 | 965 | 966 | 967 | 973 | 974 | 975 | 976 | 982 | 983 | 984 | 985 | 991 | 992 | 993 | 994 | 1000 | 1001 | 1002 | 1003 | 
1009 | 1010 | 1011 | 1012 | 1018 | 1019 | 1020 | 1021 | 1027 | 1028 | 1029 | 1030 | 1036 | 1037 | 1038 | 1039 | 1045 | 1046 | 1047 | 1048 | 1054 | 1055 | 1056 | 1057 | 1063 | 1064 | 1065 | 1066 | 1072 | 1073 | 1074 | 1075 | 1081 | 1082 | 1083 | 1084 | 1090 | 1091 | 1092 | 1093 | 1099 | 1100 | 1101 | 1102 | 1108 | 1109 | 1110 | 1111 | 1117 | 1118 | 1119 | 1120 | 1126 | 1127 | 1128 | 1129 | 1135 | 1136 | 1137 | 1138 | 1144 | 1145 | 1146 | 1147 | 1153 | 1154 | 1155 | 1156 | 1162 | 1163 | 1164 | 1165 | 1171 | 1172 | 1173 | 1174 | 1180 | 1181 | 1182 | 1183 | 1189 | 1190 | 1191 | 1192 | 1198 | 1199 | 1200 | 1201 | 1207 | 1208 | 1209 | 1210 | 1216 | 1217 | 1218 | 1219 | 1225 | 1226 | 1227 | 1228 | 1234 | 1235 | 1236 | 1237 | 1243 | 1244 | 1245 | 1246 | 1252 | 1253 | 1254 | 1255 | 1261 | 1262 | 1263 | 1264 | 1270 | 1271 | 1272 | 1273 | 1279 | 1280 | 1281 |

Property value and aliases	Canonical property value
12 \| 13 \| `Adlam` 14 \| `Adlm` 15 \| 16 \|	`Adlam`
21 \| 22 \| `Ahom` 23 \| `Ahom` 24 \| 25 \|	`Ahom`
30 \| 31 \| `Anatolian_Hieroglyphs` 32 \| `Hluw` 33 \| 34 \|	`Anatolian_Hieroglyphs`
39 \| 40 \| `Arabic` 41 \| `Arab` 42 \| 43 \|	`Arabic`
48 \| 49 \| `Armenian` 50 \| `Armn` 51 \| 52 \|	`Armenian`
57 \| 58 \| `Avestan` 59 \| `Avst` 60 \| 61 \|	`Avestan`
66 \| 67 \| `Balinese` 68 \| `Bali` 69 \| 70 \|	`Balinese`
75 \| 76 \| `Bamum` 77 \| `Bamu` 78 \| 79 \|	`Bamum`
84 \| 85 \| `Bassa_Vah` 86 \| `Bass` 87 \| 88 \|	`Bassa_Vah`
93 \| 94 \| `Batak` 95 \| `Batk` 96 \| 97 \|	`Batak`
102 \| 103 \| `Bengali` 104 \| `Beng` 105 \| 106 \|	`Bengali`
111 \| 112 \| `Bhaiksuki` 113 \| `Bhks` 114 \| 115 \|	`Bhaiksuki`
120 \| 121 \| `Bopomofo` 122 \| `Bopo` 123 \| 124 \|	`Bopomofo`
129 \| 130 \| `Brahmi` 131 \| `Brah` 132 \| 133 \|	`Brahmi`
138 \| 139 \| `Braille` 140 \| `Brai` 141 \| 142 \|	`Braille`
147 \| 148 \| `Buginese` 149 \| `Bugi` 150 \| 151 \|	`Buginese`
156 \| 157 \| `Buhid` 158 \| `Buhd` 159 \| 160 \|	`Buhid`
165 \| 166 \| `Canadian_Aboriginal` 167 \| `Cans` 168 \| 169 \|	`Canadian_Aboriginal`
174 \| 175 \| `Carian` 176 \| `Cari` 177 \| 178 \|	`Carian`
183 \| 184 \| `Caucasian_Albanian` 185 \| `Aghb` 186 \| 187 \|	`Caucasian_Albanian`
192 \| 193 \| `Chakma` 194 \| `Cakm` 195 \| 196 \|	`Chakma`
201 \| 202 \| `Cham` 203 \| `Cham` 204 \| 205 \|	`Cham`
210 \| 211 \| `Cherokee` 212 \| `Cher` 213 \| 214 \|	`Cherokee`
219 \| 220 \| `Common` 221 \| `Zyyy` 222 \| 223 \|	`Common`
228 \| 229 \| `Coptic` 230 \| `Copt` 231 \| `Qaac` 232 \| 233 \|	`Coptic`
238 \| 239 \| `Cuneiform` 240 \| `Xsux` 241 \| 242 \|	`Cuneiform`
247 \| 248 \| `Cypriot` 249 \| `Cprt` 250 \| 251 \|	`Cypriot`
256 \| 257 \| `Cyrillic` 258 \| `Cyrl` 259 \| 260 \|	`Cyrillic`
265 \| 266 \| `Deseret` 267 \| `Dsrt` 268 \| 269 \|	`Deseret`
274 \| 275 \| `Devanagari` 276 \| `Deva` 277 \| 278 \|	`Devanagari`
283 \| 284 \| `Duployan` 285 \| `Dupl` 286 \| 287 \|	`Duployan`
292 \| 293 \| `Egyptian_Hieroglyphs` 294 \| `Egyp` 295 \| 296 \|	`Egyptian_Hieroglyphs`
301 \| 302 \| `Elbasan` 303 \| `Elba` 304 \| 305 \|	`Elbasan`
310 \| 311 \| `Ethiopic` 312 \| `Ethi` 313 \| 314 \|	`Ethiopic`
319 \| 320 \| `Georgian` 321 \| `Geor` 322 \| 323 \|	`Georgian`
328 \| 329 \| `Glagolitic` 330 \| `Glag` 331 \| 332 \|	`Glagolitic`
337 \| 338 \| `Gothic` 339 \| `Goth` 340 \| 341 \|	`Gothic`
346 \| 347 \| `Grantha` 348 \| `Gran` 349 \| 350 \|	`Grantha`
355 \| 356 \| `Greek` 357 \| `Grek` 358 \| 359 \|	`Greek`
364 \| 365 \| `Gujarati` 366 \| `Gujr` 367 \| 368 \|	`Gujarati`
373 \| 374 \| `Gurmukhi` 375 \| `Guru` 376 \| 377 \|	`Gurmukhi`
382 \| 383 \| `Han` 384 \| `Hani` 385 \| 386 \|	`Han`
391 \| 392 \| `Hangul` 393 \| `Hang` 394 \| 395 \|	`Hangul`
400 \| 401 \| `Hanunoo` 402 \| `Hano` 403 \| 404 \|	`Hanunoo`
409 \| 410 \| `Hatran` 411 \| `Hatr` 412 \| 413 \|	`Hatran`
418 \| 419 \| `Hebrew` 420 \| `Hebr` 421 \| 422 \|	`Hebrew`
427 \| 428 \| `Hiragana` 429 \| `Hira` 430 \| 431 \|	`Hiragana`
436 \| 437 \| `Imperial_Aramaic` 438 \| `Armi` 439 \| 440 \|	`Imperial_Aramaic`
445 \| 446 \| `Inherited` 447 \| `Zinh` 448 \| `Qaai` 449 \| 450 \|	`Inherited`
455 \| 456 \| `Inscriptional_Pahlavi` 457 \| `Phli` 458 \| 459 \|	`Inscriptional_Pahlavi`
464 \| 465 \| `Inscriptional_Parthian` 466 \| `Prti` 467 \| 468 \|	`Inscriptional_Parthian`
473 \| 474 \| `Javanese` 475 \| `Java` 476 \| 477 \|	`Javanese`
482 \| 483 \| `Kaithi` 484 \| `Kthi` 485 \| 486 \|	`Kaithi`
491 \| 492 \| `Kannada` 493 \| `Knda` 494 \| 495 \|	`Kannada`
500 \| 501 \| `Katakana` 502 \| `Kana` 503 \| 504 \|	`Katakana`
509 \| 510 \| `Kayah_Li` 511 \| `Kali` 512 \| 513 \|	`Kayah_Li`
518 \| 519 \| `Kharoshthi` 520 \| `Khar` 521 \| 522 \|	`Kharoshthi`
527 \| 528 \| `Khmer` 529 \| `Khmr` 530 \| 531 \|	`Khmer`
536 \| 537 \| `Khojki` 538 \| `Khoj` 539 \| 540 \|	`Khojki`
545 \| 546 \| `Khudawadi` 547 \| `Sind` 548 \| 549 \|	`Khudawadi`
554 \| 555 \| `Lao` 556 \| `Laoo` 557 \| 558 \|	`Lao`
563 \| 564 \| `Latin` 565 \| `Latn` 566 \| 567 \|	`Latin`
572 \| 573 \| `Lepcha` 574 \| `Lepc` 575 \| 576 \|	`Lepcha`
581 \| 582 \| `Limbu` 583 \| `Limb` 584 \| 585 \|	`Limbu`
590 \| 591 \| `Linear_A` 592 \| `Lina` 593 \| 594 \|	`Linear_A`
599 \| 600 \| `Linear_B` 601 \| `Linb` 602 \| 603 \|	`Linear_B`
608 \| 609 \| `Lisu` 610 \| `Lisu` 611 \| 612 \|	`Lisu`
617 \| 618 \| `Lycian` 619 \| `Lyci` 620 \| 621 \|	`Lycian`
626 \| 627 \| `Lydian` 628 \| `Lydi` 629 \| 630 \|	`Lydian`
635 \| 636 \| `Mahajani` 637 \| `Mahj` 638 \| 639 \|	`Mahajani`
644 \| 645 \| `Malayalam` 646 \| `Mlym` 647 \| 648 \|	`Malayalam`
653 \| 654 \| `Mandaic` 655 \| `Mand` 656 \| 657 \|	`Mandaic`
662 \| 663 \| `Manichaean` 664 \| `Mani` 665 \| 666 \|	`Manichaean`
671 \| 672 \| `Marchen` 673 \| `Marc` 674 \| 675 \|	`Marchen`
680 \| 681 \| `Masaram_Gondi` 682 \| `Gonm` 683 \| 684 \|	`Masaram_Gondi`
689 \| 690 \| `Meetei_Mayek` 691 \| `Mtei` 692 \| 693 \|	`Meetei_Mayek`
698 \| 699 \| `Mende_Kikakui` 700 \| `Mend` 701 \| 702 \|	`Mende_Kikakui`
707 \| 708 \| `Meroitic_Cursive` 709 \| `Merc` 710 \| 711 \|	`Meroitic_Cursive`
716 \| 717 \| `Meroitic_Hieroglyphs` 718 \| `Mero` 719 \| 720 \|	`Meroitic_Hieroglyphs`
725 \| 726 \| `Miao` 727 \| `Plrd` 728 \| 729 \|	`Miao`
734 \| 735 \| `Modi` 736 \| `Modi` 737 \| 738 \|	`Modi`
743 \| 744 \| `Mongolian` 745 \| `Mong` 746 \| 747 \|	`Mongolian`
752 \| 753 \| `Mro` 754 \| `Mroo` 755 \| 756 \|	`Mro`
761 \| 762 \| `Multani` 763 \| `Mult` 764 \| 765 \|	`Multani`
770 \| 771 \| `Myanmar` 772 \| `Mymr` 773 \| 774 \|	`Myanmar`
779 \| 780 \| `Nabataean` 781 \| `Nbat` 782 \| 783 \|	`Nabataean`
788 \| 789 \| `New_Tai_Lue` 790 \| `Talu` 791 \| 792 \|	`New_Tai_Lue`
797 \| 798 \| `Newa` 799 \| `Newa` 800 \| 801 \|	`Newa`
806 \| 807 \| `Nko` 808 \| `Nkoo` 809 \| 810 \|	`Nko`
815 \| 816 \| `Nushu` 817 \| `Nshu` 818 \| 819 \|	`Nushu`
824 \| 825 \| `Ogham` 826 \| `Ogam` 827 \| 828 \|	`Ogham`
833 \| 834 \| `Ol_Chiki` 835 \| `Olck` 836 \| 837 \|	`Ol_Chiki`
842 \| 843 \| `Old_Hungarian` 844 \| `Hung` 845 \| 846 \|	`Old_Hungarian`
851 \| 852 \| `Old_Italic` 853 \| `Ital` 854 \| 855 \|	`Old_Italic`
860 \| 861 \| `Old_North_Arabian` 862 \| `Narb` 863 \| 864 \|	`Old_North_Arabian`
869 \| 870 \| `Old_Permic` 871 \| `Perm` 872 \| 873 \|	`Old_Permic`
878 \| 879 \| `Old_Persian` 880 \| `Xpeo` 881 \| 882 \|	`Old_Persian`
887 \| 888 \| `Old_South_Arabian` 889 \| `Sarb` 890 \| 891 \|	`Old_South_Arabian`
896 \| 897 \| `Old_Turkic` 898 \| `Orkh` 899 \| 900 \|	`Old_Turkic`
905 \| 906 \| `Oriya` 907 \| `Orya` 908 \| 909 \|	`Oriya`
914 \| 915 \| `Osage` 916 \| `Osge` 917 \| 918 \|	`Osage`
923 \| 924 \| `Osmanya` 925 \| `Osma` 926 \| 927 \|	`Osmanya`
932 \| 933 \| `Pahawh_Hmong` 934 \| `Hmng` 935 \| 936 \|	`Pahawh_Hmong`
941 \| 942 \| `Palmyrene` 943 \| `Palm` 944 \| 945 \|	`Palmyrene`
950 \| 951 \| `Pau_Cin_Hau` 952 \| `Pauc` 953 \| 954 \|	`Pau_Cin_Hau`
959 \| 960 \| `Phags_Pa` 961 \| `Phag` 962 \| 963 \|	`Phags_Pa`
968 \| 969 \| `Phoenician` 970 \| `Phnx` 971 \| 972 \|	`Phoenician`
977 \| 978 \| `Psalter_Pahlavi` 979 \| `Phlp` 980 \| 981 \|	`Psalter_Pahlavi`
986 \| 987 \| `Rejang` 988 \| `Rjng` 989 \| 990 \|	`Rejang`
995 \| 996 \| `Runic` 997 \| `Runr` 998 \| 999 \|	`Runic`
1004 \| 1005 \| `Samaritan` 1006 \| `Samr` 1007 \| 1008 \|	`Samaritan`
1013 \| 1014 \| `Saurashtra` 1015 \| `Saur` 1016 \| 1017 \|	`Saurashtra`
1022 \| 1023 \| `Sharada` 1024 \| `Shrd` 1025 \| 1026 \|	`Sharada`
1031 \| 1032 \| `Shavian` 1033 \| `Shaw` 1034 \| 1035 \|	`Shavian`
1040 \| 1041 \| `Siddham` 1042 \| `Sidd` 1043 \| 1044 \|	`Siddham`
1049 \| 1050 \| `SignWriting` 1051 \| `Sgnw` 1052 \| 1053 \|	`SignWriting`
1058 \| 1059 \| `Sinhala` 1060 \| `Sinh` 1061 \| 1062 \|	`Sinhala`
1067 \| 1068 \| `Sora_Sompeng` 1069 \| `Sora` 1070 \| 1071 \|	`Sora_Sompeng`
1076 \| 1077 \| `Soyombo` 1078 \| `Soyo` 1079 \| 1080 \|	`Soyombo`
1085 \| 1086 \| `Sundanese` 1087 \| `Sund` 1088 \| 1089 \|	`Sundanese`
1094 \| 1095 \| `Syloti_Nagri` 1096 \| `Sylo` 1097 \| 1098 \|	`Syloti_Nagri`
1103 \| 1104 \| `Syriac` 1105 \| `Syrc` 1106 \| 1107 \|	`Syriac`
1112 \| 1113 \| `Tagalog` 1114 \| `Tglg` 1115 \| 1116 \|	`Tagalog`
1121 \| 1122 \| `Tagbanwa` 1123 \| `Tagb` 1124 \| 1125 \|	`Tagbanwa`
1130 \| 1131 \| `Tai_Le` 1132 \| `Tale` 1133 \| 1134 \|	`Tai_Le`
1139 \| 1140 \| `Tai_Tham` 1141 \| `Lana` 1142 \| 1143 \|	`Tai_Tham`
1148 \| 1149 \| `Tai_Viet` 1150 \| `Tavt` 1151 \| 1152 \|	`Tai_Viet`
1157 \| 1158 \| `Takri` 1159 \| `Takr` 1160 \| 1161 \|	`Takri`
1166 \| 1167 \| `Tamil` 1168 \| `Taml` 1169 \| 1170 \|	`Tamil`
1175 \| 1176 \| `Tangut` 1177 \| `Tang` 1178 \| 1179 \|	`Tangut`
1184 \| 1185 \| `Telugu` 1186 \| `Telu` 1187 \| 1188 \|	`Telugu`
1193 \| 1194 \| `Thaana` 1195 \| `Thaa` 1196 \| 1197 \|	`Thaana`
1202 \| 1203 \| `Thai` 1204 \| `Thai` 1205 \| 1206 \|	`Thai`
1211 \| 1212 \| `Tibetan` 1213 \| `Tibt` 1214 \| 1215 \|	`Tibetan`
1220 \| 1221 \| `Tifinagh` 1222 \| `Tfng` 1223 \| 1224 \|	`Tifinagh`
1229 \| 1230 \| `Tirhuta` 1231 \| `Tirh` 1232 \| 1233 \|	`Tirhuta`
1238 \| 1239 \| `Ugaritic` 1240 \| `Ugar` 1241 \| 1242 \|	`Ugaritic`
1247 \| 1248 \| `Vai` 1249 \| `Vaii` 1250 \| 1251 \|	`Vai`
1256 \| 1257 \| `Warang_Citi` 1258 \| `Wara` 1259 \| 1260 \|	`Warang_Citi`
1265 \| 1266 \| `Yi` 1267 \| `Yiii` 1268 \| 1269 \|	`Yi`
1274 \| 1275 \| `Zanabazar_Square` 1276 \| `Zanb` 1277 \| 1278 \|	`Zanabazar_Square`

1282 | 1283 | --------------------------------------------------------------------------------