├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── UNICODE-LICENSE
├── pyunormalize
│   ├── __init__.py
│   ├── _unicode.py
│   ├── _version.py
│   ├── normalization.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── data
│   │   │   └── NormalizationTest.txt
│   │   ├── test_pyunormalize.py
│   │   └── unicode_conformance.py
│   └── tools
│       └── generate_unicode.py
├── setup.py
└── tox.ini
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021-2024, Marc Lodewijck

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
prune pyunormalize/tests
global-exclude *.pyc __pycache__
exclude tox.ini
include UNICODE-LICENSE
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pyunormalize
A pure Python implementation of the **Unicode normalization algorithm**, independent of Python’s core Unicode database. This package conforms to version 16.0 of the Unicode standard, released in September 2024, and has been rigorously tested for accuracy using the official [Unicode test file](https://www.unicode.org/Public/16.0.0/ucd/NormalizationTest.txt).
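
Unlike the standard library’s `unicodedata` module, whose data tracks the Unicode version bundled with each Python release, this package ships its own normalization data, so results do not depend on the interpreter in use. A quick comparison (a minimal sketch; the `unicodedata` result reflects whichever UCD version your Python build bundles):

```python
>>> import unicodedata
>>> from pyunormalize import NFC
>>> NFC("e\u0301") == unicodedata.normalize("NFC", "e\u0301") == "\u00E9"
True
```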
3 | 4 | ### Installation and updates 5 | To install the package, run: 6 | ```shell 7 | pip install pyunormalize 8 | ``` 9 | 10 | To upgrade to the latest version, run: 11 | ```shell 12 | pip install pyunormalize --upgrade 13 | ``` 14 | 15 | ### Unicode character database (UCD) version 16 | To retrieve the version of the Unicode character database in use: 17 | ```python 18 | >>> from pyunormalize import UCD_VERSION 19 | >>> UCD_VERSION 20 | '16.0.0' 21 | ``` 22 | 23 | ### Example usage 24 | ```python 25 | >>> from pyunormalize import NFC, NFD, NFKC, NFKD 26 | >>> s = "élève" # "\u00E9\u006C\u00E8\u0076\u0065" 27 | >>> nfc = NFC(s) 28 | >>> nfd = NFD(s) 29 | >>> nfc == s 30 | True 31 | >>> nfd == nfc 32 | False 33 | >>> " ".join([f"{ord(x):04X}" for x in nfc]) 34 | '00E9 006C 00E8 0076 0065' 35 | >>> " ".join([f"{ord(x):04X}" for x in nfd]) 36 | '0065 0301 006C 0065 0300 0076 0065' 37 | >>> 38 | >>> s = "⑴ ffi ²" 39 | >>> NFC(s), NFKC(s), NFD(s), NFKD(s) 40 | ('⑴ ffi ²', '(1) ffi 2', '⑴ ffi ²', '(1) ffi 2') 41 | 42 | >>> from pyunormalize import normalize 43 | >>> normalize("NFKD", "⑴ ffi ²") 44 | '(1) ffi 2' 45 | >>> forms = ["NFC", "NFD", "NFKC", "NFKD"] 46 | >>> [normalize(f, "\u017F\u0307\u0323") for f in forms] 47 | ['ẛ̣', 'ẛ̣', 'ṩ', 'ṩ'] 48 | ``` 49 | 50 | ### Related resources 51 | This implementation is based on the following resources: 52 | - [Section 3.11, “Normalization Forms,” in the Unicode core specification, version 16.0.0](https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G49537) 53 | - [Unicode Standard Annex #15: Unicode Normalization Forms, revision 56](https://www.unicode.org/reports/tr15/tr15-56.html) 54 | 55 | ### Licenses 56 | The code is licensed under the [MIT license](https://github.com/mlodewijck/pyunormalize/blob/master/LICENSE). 57 | 58 | Usage of Unicode data files is subject to the [UNICODE TERMS OF USE](https://www.unicode.org/copyright.html). Additional rights and restrictions regarding Unicode data files and software are outlined in the [Unicode Data Files and Software License](https://www.unicode.org/license.txt), a copy of which is included as [UNICODE-LICENSE](https://github.com/mlodewijck/pyunormalize/blob/master/UNICODE-LICENSE). -------------------------------------------------------------------------------- /UNICODE-LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlodewijck/pyunormalize/4b45c576567fb0293acb93a308c97cbaba3caa5f/UNICODE-LICENSE -------------------------------------------------------------------------------- /pyunormalize/__init__.py: -------------------------------------------------------------------------------- 1 | """Utility for Unicode normalization. 2 | 3 | This is a pure Python implementation of the Unicode normalization algorithm, 4 | independent of the Python core Unicode database, and ensuring compliance 5 | with version 16.0 of the Unicode standard (released in September 2024). It has 6 | been rigorously tested using the official Unicode test file, available 7 | at https://www.unicode.org/Public/16.0.0/ucd/NormalizationTest.txt. 8 | 9 | For the formal specification of the Unicode normalization algorithm, 10 | see Section 3.11, "Normalization Forms," in the Unicode core specification. 11 | 12 | Copyright (c) 2021-2024, Marc Lodewijck 13 | All rights reserved. 14 | 15 | This software is distributed under the MIT license. 
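
A minimal doctest-style check (illustrative only; see the README for
fuller examples):

    >>> from pyunormalize import NFC, NFD
    >>> NFC("e\u0301") == "\u00E9"
    True
    >>> NFD("\u00E9") == "e\u0301"
    True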
"""

import sys

# Avoid f-strings in this module so that, on unsupported interpreters,
# the explicit SystemExit message below is reported rather than a
# SyntaxError raised at compile time.
if sys.version_info < (3, 6):
    raise SystemExit("\n{} requires Python 3.6 or later.".format(__package__))
del sys

__all__ = [
    "NFC",
    "NFD",
    "NFKC",
    "NFKD",
    "normalize",
    "UCD_VERSION",
    "UNICODE_VERSION",
    "__version__",
]

# Unicode standard used to process the data
UNICODE_VERSION = UCD_VERSION = "16.0.0"


from pyunormalize import _version
__version__ = _version.__version__
del _version

from pyunormalize._unicode import _UNICODE_VERSION
if _UNICODE_VERSION != UNICODE_VERSION:
    raise SystemExit(
        "Unicode version mismatch in pyunormalize._unicode "
        "(expected {}, found {}).".format(UNICODE_VERSION, _UNICODE_VERSION)
    )
del _UNICODE_VERSION

from pyunormalize.normalization import *
--------------------------------------------------------------------------------
/pyunormalize/_version.py:
--------------------------------------------------------------------------------
__version__ = "16.0.0"
--------------------------------------------------------------------------------
/pyunormalize/normalization.py:
--------------------------------------------------------------------------------
"""Unicode normalization algorithms."""

from pyunormalize._unicode import (
    _COMPOSITION_EXCLUSIONS,
    _DECOMP_BY_CHARACTER,
    _NFC__QC_NO_OR_MAYBE,
    _NFD__QC_NO,
    _NFKC_QC_NO_OR_MAYBE,
    _NFKD_QC_NO,
    _NON_ZERO_CCC_TABLE,
)

# Hangul syllables for modern Korean
_SB = 0xAC00
_SL = 0xD7A3

# Hangul leading consonants (syllable onsets)
_LB = 0x1100
_LL = 0x1112

# Hangul vowels (syllable nuclei)
_VB = 0x1161
_VL = 0x1175

# Hangul trailing consonants (syllable codas)
_TB = 0x11A8
_TL = 0x11C2

# Number of Hangul vowels
_VCOUNT = 21

# Number of Hangul trailing consonants,
# with the additional case of no trailing consonant
_TCOUNT = 27 + 1

# Dictionary mapping characters to their full canonical decompositions,
# not including Hangul syllables
_FULL_CDECOMP_BY_CHAR = {}

# Dictionary mapping characters to their full compatibility decompositions,
# not including Hangul syllables
_FULL_KDECOMP_BY_CHAR = {}

# Dictionary mapping canonical decompositions to their canonical composite,
# not including Hangul syllables
_COMPOSITE_BY_CDECOMP = {}

# Note: As Hangul compositions and decompositions are algorithmic,
# the corresponding operations are performed in code rather than by storing
# the data in general-purpose tables.


def _full_decomposition(decomp_dict):
    # A full decomposition of a character sequence results from decomposing
    # each of the characters in the sequence until no characters can be
    # further decomposed.

    for key in decomp_dict:
        tmp = []
        decomposition = [key]

        while True:
            for x in decomposition:
                if x in decomp_dict:
                    tmp.extend(decomp_dict[x])
                else:
                    tmp.append(x)

            if tmp == decomposition:
                decomp_dict[key] = decomposition  # done with decomposition
                break

            decomposition = tmp
            tmp = []


def _populate_decomposition_dictionaries(decomp_by_character):
    # Populate dictionaries with full canonical decompositions
    # and full compatibility decompositions.
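    #
    # Values in `decomp_by_character` are lists of code points; compatibility
    # mappings additionally carry a leading formatting-tag string such as
    # "<compat>" or "<font>" (see tools/generate_unicode.py), which is why
    # the type of the first element is checked below.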
80 | 81 | for key, val in decomp_by_character.items(): 82 | if isinstance(val[0], int): 83 | # assert len(val) in (1, 2) 84 | 85 | if len(val) == 2 and val[0] not in _NON_ZERO_CCC_TABLE: 86 | _COMPOSITE_BY_CDECOMP[tuple(val)] = key 87 | 88 | _FULL_CDECOMP_BY_CHAR[key] = _FULL_KDECOMP_BY_CHAR[key] = val 89 | else: 90 | _FULL_KDECOMP_BY_CHAR[key] = val[1:] 91 | 92 | # Make full canonical decomposition 93 | _full_decomposition(_FULL_CDECOMP_BY_CHAR) 94 | 95 | # Make full compatibility decomposition 96 | _full_decomposition(_FULL_KDECOMP_BY_CHAR) 97 | 98 | 99 | # Populate full decomposition dictionaries 100 | _populate_decomposition_dictionaries(_DECOMP_BY_CHARACTER) 101 | 102 | del _DECOMP_BY_CHARACTER 103 | 104 | 105 | # 106 | # Public interface 107 | # 108 | 109 | def NFC(unistr): 110 | """Return the canonical equivalent "composed" form of the original Unicode 111 | string `unistr`. This function transforms the Unicode string into the 112 | Unicode "normalization form C", where character sequences are replaced by 113 | canonically equivalent composites, where possible, while compatibility 114 | characters are unaffected. 115 | 116 | For performance optimization, the function verifies whether the input 117 | string is already in NFC. If it is, the original string is returned 118 | directly to avoid unnecessary processing. 119 | 120 | Args: 121 | unistr (str): The input Unicode string. 122 | 123 | Returns: 124 | str: The NFC normalized Unicode string. 125 | 126 | Examples: 127 | 128 | >>> unistr = "élève" 129 | >>> nfc = NFC(unistr) 130 | >>> unistr, nfc 131 | ('élève', 'élève') 132 | >>> nfc == unistr 133 | False 134 | >>> " ".join(f"{ord(x):04X}" for x in unistr) 135 | '0065 0301 006C 0065 0300 0076 0065' 136 | >>> " ".join(f"{ord(x):04X}" for x in nfc) 137 | '00E9 006C 00E8 0076 0065' 138 | 139 | >>> unistr = "한국" 140 | >>> nfc = NFC(unistr) 141 | >>> unistr, nfc 142 | ('한국', '한국') 143 | >>> " ".join(f"{ord(x):04X}" for x in unistr) 144 | '1112 1161 11AB 1100 116E 11A8' 145 | >>> " ".join(f"{ord(x):04X}" for x in nfc) 146 | 'D55C AD6D' 147 | 148 | >>> NFC("ffi") 149 | 'ffi' 150 | 151 | """ 152 | prev_ccc = 0 153 | 154 | for u in unistr: 155 | u = ord(u) 156 | 157 | if u in _NFC__QC_NO_OR_MAYBE: 158 | break 159 | 160 | if u not in _NON_ZERO_CCC_TABLE: 161 | continue 162 | 163 | curr_ccc = _NON_ZERO_CCC_TABLE[u] 164 | 165 | if curr_ccc < prev_ccc: 166 | break 167 | 168 | prev_ccc = curr_ccc 169 | else: 170 | return unistr 171 | 172 | result = map(chr, _compose([*map(ord, NFD(unistr))])) 173 | 174 | return "".join(result) 175 | 176 | 177 | def NFD(unistr): 178 | """Return the canonical equivalent "decomposed" form of the original 179 | Unicode string `unistr`. This function transforms the Unicode string into 180 | the Unicode "normalization form D", where composite characters are replaced 181 | by canonically equivalent character sequences, in canonical order, while 182 | compatibility characters are unaffected. 183 | 184 | For performance optimization, the function verifies whether the input 185 | string is already in NFD. If it is, the original string is returned 186 | directly to avoid unnecessary processing. 187 | 188 | Args: 189 | unistr (str): The input Unicode string. 190 | 191 | Returns: 192 | str: The NFD normalized Unicode string. 
193 | 194 | Examples: 195 | 196 | >>> unistr = "élève" 197 | >>> nfd = NFD(unistr) 198 | >>> unistr, nfd 199 | ('élève', 'élève') 200 | >>> nfd == unistr 201 | False 202 | >>> " ".join(f"{ord(x):04X}" for x in unistr) 203 | '00E9 006C 00E8 0076 0065' 204 | >>> " ".join(f"{ord(x):04X}" for x in nfd) 205 | '0065 0301 006C 0065 0300 0076 0065' 206 | 207 | >>> unistr = "한국" 208 | >>> nfd = NFD(unistr) 209 | >>> unistr, nfd 210 | ('한국', '한국') 211 | >>> " ".join(f"{ord(x):04X}" for x in unistr) 212 | 'D55C AD6D' 213 | >>> " ".join(f"{ord(x):04X}" for x in nfd) 214 | '1112 1161 11AB 1100 116E 11A8' 215 | 216 | >>> NFD("ffi") 217 | 'ffi' 218 | 219 | """ 220 | prev_ccc = 0 221 | 222 | for u in unistr: 223 | u = ord(u) 224 | 225 | if u in _NFD__QC_NO: 226 | break 227 | 228 | if u not in _NON_ZERO_CCC_TABLE: 229 | continue 230 | 231 | curr_ccc = _NON_ZERO_CCC_TABLE[u] 232 | 233 | if curr_ccc < prev_ccc: 234 | break 235 | 236 | prev_ccc = curr_ccc 237 | else: 238 | return unistr 239 | 240 | result = map(chr, _reorder(_decompose(unistr))) 241 | 242 | return "".join(result) 243 | 244 | 245 | def NFKC(unistr): 246 | """Return the compatibility equivalent "composed" form of the original 247 | Unicode string `unistr`. This function transforms the Unicode string into 248 | the Unicode "normalization form KC", where character sequences are replaced 249 | by canonically equivalent composites, where possible, and compatibility 250 | characters are replaced by their nominal counterparts. 251 | 252 | For performance optimization, the function verifies whether the input 253 | string is already in NFKC. If it is, the original string is returned 254 | directly to avoid unnecessary processing. 255 | 256 | Args: 257 | unistr (str): The input Unicode string. 258 | 259 | Returns: 260 | str: The NFKC normalized Unicode string. 261 | 262 | Example: 263 | >>> NFKC("ffi") 264 | 'ffi' 265 | 266 | """ 267 | prev_ccc = 0 268 | 269 | for u in unistr: 270 | u = ord(u) 271 | 272 | if u in _NFKC_QC_NO_OR_MAYBE: 273 | break 274 | 275 | if u not in _NON_ZERO_CCC_TABLE: 276 | continue 277 | 278 | curr_ccc = _NON_ZERO_CCC_TABLE[u] 279 | 280 | if curr_ccc < prev_ccc: 281 | break 282 | 283 | prev_ccc = curr_ccc 284 | else: 285 | return unistr 286 | 287 | result = map(chr, _compose([*map(ord, NFKD(unistr))])) 288 | 289 | return "".join(result) 290 | 291 | 292 | def NFKD(unistr): 293 | """Return the compatibility equivalent "decomposed" form of the original 294 | Unicode string `unistr`. This function transforms the Unicode string into 295 | the Unicode "normalization form KD", where composite characters are 296 | replaced by canonically equivalent character sequences, in canonical order, 297 | and compatibility characters are replaced by their nominal counterparts. 298 | 299 | For performance optimization, the function verifies whether the input 300 | string is already in NFKD. If it is, the original string is returned 301 | directly to avoid unnecessary processing. 302 | 303 | Args: 304 | unistr (str): The input Unicode string. 305 | 306 | Returns: 307 | str: The NFKD normalized Unicode string. 
308 | 309 | Example: 310 | >>> NFKD("⑴") 311 | '(1)' 312 | 313 | """ 314 | prev_ccc = 0 315 | 316 | for u in unistr: 317 | u = ord(u) 318 | 319 | if u in _NFKD_QC_NO: 320 | break 321 | 322 | if u not in _NON_ZERO_CCC_TABLE: 323 | continue 324 | 325 | curr_ccc = _NON_ZERO_CCC_TABLE[u] 326 | 327 | if curr_ccc < prev_ccc: 328 | break 329 | 330 | prev_ccc = curr_ccc 331 | else: 332 | return unistr 333 | 334 | result = map(chr, _reorder(_decompose(unistr, compatibility=True))) 335 | 336 | return "".join(result) 337 | 338 | 339 | # Dictionary for normalization forms dispatch 340 | _normalization_forms = { 341 | "NFC": NFC, 342 | "NFD": NFD, 343 | "NFKC": NFKC, 344 | "NFKD": NFKD, 345 | } 346 | 347 | def normalize(form, unistr): 348 | """Transform the Unicode string `unistr` into the Unicode normalization 349 | form `form`. Valid values for `form` are "NFC", "NFD", "NFKC", and "NFKD". 350 | 351 | Args: 352 | form (str): The normalization form to apply, one of "NFC", "NFD", 353 | "NFKC", or "NFKD". 354 | 355 | unistr (str): The input Unicode string to be normalized. 356 | 357 | Returns: 358 | str: The normalized Unicode string. 359 | 360 | Examples: 361 | 362 | >>> normalize("NFKD", "⑴ ffi ²") 363 | '(1) ffi 2' 364 | 365 | >>> forms = ["NFC", "NFD", "NFKC", "NFKD"] 366 | >>> [normalize(f, "\u017F\u0307\u0323") for f in forms] 367 | ['ẛ̣', 'ẛ̣', 'ṩ', 'ṩ'] 368 | 369 | """ 370 | return _normalization_forms[form](unistr) 371 | 372 | 373 | # 374 | # Internals 375 | # 376 | 377 | def _decompose(unistr, *, compatibility=False): 378 | # Compute the full decomposition of the Unicode string based 379 | # on the specified normalization form. The type of full decomposition 380 | # chosen depends on which Unicode normalization form is involved. For NFC 381 | # or NFD, it performs a full canonical decomposition. For NFKC or NFKD, 382 | # it performs a full compatibility decomposition. 383 | 384 | result = [] 385 | decomp = _FULL_KDECOMP_BY_CHAR if compatibility else _FULL_CDECOMP_BY_CHAR 386 | 387 | for u in unistr: 388 | u = ord(u) 389 | 390 | if u in decomp: 391 | result.extend(decomp[u]) 392 | elif _SB <= u <= _SL: 393 | result.extend(_decompose_hangul_syllable(u)) 394 | else: 395 | result.append(u) 396 | 397 | return result 398 | 399 | 400 | def _decompose_hangul_syllable(cp): 401 | # Perform Hangul syllable decomposition algorithm to derive the full 402 | # canonical decomposition of a precomposed Hangul syllable into its 403 | # constituent jamo characters. 404 | 405 | sindex = cp - _SB 406 | tindex = sindex % _TCOUNT 407 | q = (sindex - tindex) // _TCOUNT 408 | V = _VB + (q % _VCOUNT) 409 | L = _LB + (q // _VCOUNT) 410 | 411 | if tindex: 412 | # LVT syllable 413 | return (L, V, _TB - 1 + tindex) 414 | 415 | # LV syllable 416 | return (L, V) 417 | 418 | 419 | def _reorder(elements): 420 | # Perform canonical ordering algorithm. Once a string has been fully 421 | # decomposed, this algorithm ensures that any sequences of combining marks 422 | # within it are arranged in a well-defined order. Only combining marks with 423 | # non-zero Canonical_Combining_Class property values are subject to 424 | # potential reordering. The canonical ordering imposed by both composed 425 | # and decomposed normalization forms is crucial for ensuring the uniqueness 426 | # of normal forms. 
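    #
    # The loop below is a bubble sort over the code-point list: adjacent
    # pairs are exchanged only when both characters have non-zero CCC values
    # and the first value is strictly greater than the second. Recording the
    # position of the last swap lets each pass skip the already-sorted tail.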

    n = len(elements)

    while n > 1:
        new_n = 0
        i = 1

        while i < n:
            ccc_b = _NON_ZERO_CCC_TABLE.get(elements[i])

            if not ccc_b:
                # elements[i] is a starter; neither (i - 1, i) nor
                # (i, i + 1) can be exchanged, so skip ahead.
                i += 2
                continue

            ccc_a = _NON_ZERO_CCC_TABLE.get(elements[i - 1])

            if not ccc_a or ccc_a <= ccc_b:
                i += 1
                continue

            elements[i - 1], elements[i] = elements[i], elements[i - 1]

            new_n = i
            i += 1

        n = new_n

    return elements


def _compose(elements):
    # Canonical composition algorithm to transform a fully decomposed
    # and canonically ordered string into its most fully composed but still
    # canonically equivalent sequence.

    for i, x in enumerate(elements):
        if x is None or x in _NON_ZERO_CCC_TABLE:
            continue

        last_cc = False
        blocked = False

        for j, y in enumerate(elements[i + 1 :], i + 1):
            if y in _NON_ZERO_CCC_TABLE:
                last_cc = True
            else:
                blocked = True

            if blocked and last_cc:
                continue

            prev = elements[j - 1]

            if (prev is None
                    or prev not in _NON_ZERO_CCC_TABLE
                    or _NON_ZERO_CCC_TABLE[prev] < _NON_ZERO_CCC_TABLE[y]):

                pair = (x, y)

                if pair in _COMPOSITE_BY_CDECOMP:
                    precomp = _COMPOSITE_BY_CDECOMP[pair]
                else:
                    precomp = _compose_hangul_syllable(*pair)

                if precomp is None or precomp in _COMPOSITION_EXCLUSIONS:
                    if blocked:
                        break
                else:
                    elements[i] = x = precomp
                    elements[j] = None

                    if blocked:
                        blocked = False
                    else:
                        last_cc = False

    # Filter with an explicit None test; `filter(None, ...)` would also
    # drop the valid code point U+0000.
    return [x for x in elements if x is not None]


def _compose_hangul_syllable(x, y):
    # Perform Hangul syllable composition algorithm to derive the mapping
    # of a canonically decomposed sequence of Hangul jamo characters
    # to an equivalent precomposed Hangul syllable.
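    #
    # Arithmetic (see "Hangul Syllable Composition" in the Unicode core
    # specification):
    #   LV  = SBase + (LIndex * VCount + VIndex) * TCount
    #   LVT = LV + TIndex, where TIndex counts from 1 because TIndex 0 means
    #   "no trailing consonant" -- hence the `_TB - 1` offset below.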

    if _LB <= x <= _LL and _VB <= y <= _VL:
        # Compose a leading consonant and a vowel into an LV syllable
        return _SB + (((x - _LB) * _VCOUNT) + y - _VB) * _TCOUNT

    if _SB <= x <= _SL and not (x - _SB) % _TCOUNT and _TB <= y <= _TL:
        # Compose an LV syllable and a trailing consonant into an LVT syllable
        return x + y - (_TB - 1)

    return None


if __name__ == "__main__":
    import doctest
    doctest.testmod()
--------------------------------------------------------------------------------
/pyunormalize/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlodewijck/pyunormalize/4b45c576567fb0293acb93a308c97cbaba3caa5f/pyunormalize/tests/__init__.py
--------------------------------------------------------------------------------
/pyunormalize/tests/test_pyunormalize.py:
--------------------------------------------------------------------------------
"""Unit tests for pyunormalize."""

import unittest

from pyunormalize.normalization import _decompose, _reorder, _compose
from pyunormalize import (
    NFC,
    NFD,
    NFKC,
    NFKD,
    normalize,
    UNICODE_VERSION as _UNICODE_VERSION,
)

UNICODE_VERSION = "16.0.0"


class Misc(unittest.TestCase):

    def test_UNICODE_VERSION(self):
        self.assertEqual(_UNICODE_VERSION, UNICODE_VERSION)

    def test_normalize(self):
        # Characters whose normalization forms
        # under NFC, NFD, NFKC, and NFKD are all different:
        #   ϓ U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
        #   ϔ U+03D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
        #   ẛ U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
        for s in ["\u03D3", "\u03D4", "\u1E9B"]:
            self.assertEqual(normalize("NFC", s), NFC(s))
            self.assertEqual(normalize("NFD", s), NFD(s))
            self.assertEqual(normalize("NFKC", s), NFKC(s))
            self.assertEqual(normalize("NFKD", s), NFKD(s))

    def test_internals(self):

        self.assertEqual(
            _decompose("\u00C0"),
            [0x0041, 0x0300]
        )

        self.assertEqual(
            _decompose("\u00BE", compatibility=True),
            [0x0033, 0x2044, 0x0034]
        )

        self.assertEqual(
            _decompose("힡"),
            [0x1112, 0x1175, 0x11C0]
        )

        self.assertEqual(
            _reorder([0x017F, 0x0307, 0x0323]),
            [0x017F, 0x0323, 0x0307]
        )

        s = "a\u0328\u0302\u0301"  # a + ogonek + circumflex + acute
        self.assertEqual(
            _decompose(s),
            [0x0061, 0x0328, 0x0302, 0x0301]
        )
        self.assertEqual(
            _reorder([0x0061, 0x0328, 0x0302, 0x0301]),
            [0x0061, 0x0328, 0x0302, 0x0301]
        )
        self.assertEqual(
            _compose([0x0061, 0x0328, 0x0302, 0x0301]),
            [0x0105, 0x0302, 0x0301]
        )

        # The normalization pipeline decomposes first, then reorders the
        # resulting code points, then composes.
        s = "\u0105\u0302\u0301"  # a-ogonek + circumflex + acute
        self.assertEqual(
            _compose(_reorder(_decompose(s))),
            [0x0105, 0x0302, 0x0301]
        )

        s = "\u0105\u0301\u0302"  # a-ogonek + acute + circumflex
        self.assertEqual(
            _decompose(s),
            [0x0061, 0x0328, 0x0301, 0x0302]
        )
        self.assertEqual(
            _reorder([0x0061, 0x0328, 0x0301, 0x0302]),
            [0x0061, 0x0328, 0x0301, 0x0302]
        )
        self.assertEqual(
            _compose([0x0061, 0x0328, 0x0301, 0x0302]),
            [0x0105, 0x0301, 0x0302]
        )

        self.assertEqual(
            _compose(_reorder(_decompose(s))),
            [0x0105, 0x0301, 0x0302]
        )

        # At https://www.unicode.org/versions/Unicode16.0.0/UnicodeStandard-16.0.pdf,
        # p. 140: "The replacement of the Starter L in R2 requires continuing
        # to check the succeeding characters until the character at that
        # position is no longer part of any Non-blocked Pair that can be
        # replaced by a Primary Composite. For example, consider the following
        # hypothetical coded character sequence: <z, overlay, cedilla,
        # diaeresis-below, acute>. None of the first three combining marks
        # forms a Primary Composite with the letter z. However, the fourth
        # combining mark in the sequence, acute, does form a Primary Composite
        # with z, and it is not Blocked from the z. Therefore, R2 mandates the
        # replacement of the sequence <z, acute> with <z-acute>, even though
        # there are three other combining marks intervening in the sequence."
        items = [0x007A, 0x0335, 0x0327, 0x0324, 0x0301]
        self.assertEqual(
            _compose(items),
            [0x017A, 0x0335, 0x0327, 0x0324]
        )


if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/pyunormalize/tests/unicode_conformance.py:
--------------------------------------------------------------------------------
"""Unicode conformance testing.

Information about conformance testing for Unicode normalization forms:
- https://www.unicode.org/Public/16.0.0/ucd/NormalizationTest.txt
- https://www.unicode.org/reports/tr15/tr15-56.html
"""

import os
import time

from pyunormalize import (
    NFC,
    NFD,
    NFKC,
    NFKD,
    UNICODE_VERSION,
)

# Unicode conformance test file
UNICODE_FILE = "NormalizationTest.txt"


def parse(lines):
    # Check file version
    assert UNICODE_VERSION in lines[0], "Wrong Unicode version number."
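    # Each data line of NormalizationTest.txt carries five semicolon-separated
    # fields -- source; NFC; NFD; NFKC; NFKD -- each of which is a
    # space-separated list of hex code points, followed by a comment field.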
26 | 27 | data = [] # list of lists 28 | dec = [] # needed for character by character test 29 | 30 | for num, line in enumerate(lines, 1): 31 | if line and not line.startswith(("#", "@")): 32 | *c, _ = line.split(";", 5) 33 | 34 | rec = [ 35 | "".join([chr(int(x, 16)) for x in seq.split()]) 36 | for seq in c 37 | ] 38 | 39 | # rec: [source, nfc, nfd, nfkc, nfkd] 40 | data.append([num, *rec]) 41 | 42 | if not " " in c[0]: 43 | dec.append(int(c[0], 16)) 44 | 45 | s = set(dec) 46 | chars = [chr(x) for x in range(0x110000) if x not in s] 47 | 48 | return data, chars 49 | 50 | 51 | def main(): 52 | data_dir = os.path.join("pyunormalize", "tests", "data") 53 | path = os.path.join(data_dir, UNICODE_FILE) 54 | 55 | with open(path, encoding="utf-8") as f: 56 | lines = f.read().splitlines() 57 | 58 | data, chars = parse(lines) 59 | 60 | counter = 0 61 | start_time = time.perf_counter() 62 | 63 | # 64 | # NFC 65 | # c2 == toNFC(c1) == toNFC(c2) == toNFC(c3) 66 | # c4 == toNFC(c4) == toNFC(c5) 67 | # 68 | 69 | print(f"\nNormalization Form C\n{'-' * 70}") 70 | 71 | s = f = 0 72 | for record in data: 73 | num, source, nfc, nfd, nfkc, nfkd = record 74 | 75 | lst1 = [] 76 | lst1.append(NFC(source)) 77 | lst1.append(NFC(nfc)) 78 | lst1.append(NFC(nfd)) 79 | 80 | lst2 = [] 81 | lst2.append(NFC(nfkc)) 82 | lst2.append(NFC(nfkd)) 83 | 84 | if (lst1.count(nfc) == len(lst1) 85 | and lst2.count(nfkc) == len(lst2)): 86 | s += 1 87 | else: 88 | f += 1 89 | print(f"Failed on line {num}") 90 | 91 | r = s + f 92 | if f: 93 | print(f"FAIL ({r:,} items, {f:,} failures)\n") 94 | else: 95 | print(f"OK ({r:,} items)\n") 96 | counter += 1 97 | 98 | # 99 | # NFD 100 | # c3 == toNFD(c1) == toNFD(c2) == toNFD(c3) 101 | # c5 == toNFD(c4) == toNFD(c5) 102 | # 103 | 104 | print(f"Normalization Form D\n{'-' * 70}") 105 | 106 | s = f = 0 107 | for record in data: 108 | num, source, nfc, nfd, nfkc, nfkd = record 109 | 110 | lst1 = [] 111 | lst1.append(NFD(source)) 112 | lst1.append(NFD(nfc)) 113 | lst1.append(NFD(nfd)) 114 | 115 | lst2 = [] 116 | lst2.append(NFD(nfkc)) 117 | lst2.append(NFD(nfkd)) 118 | 119 | if (lst1.count(nfd) == len(lst1) 120 | and lst2.count(nfkd) == len(lst2)): 121 | s += 1 122 | else: 123 | f += 1 124 | print(f"Failed on line {num}") 125 | 126 | r = s + f 127 | if f: 128 | print(f"FAIL ({r:,} items, {f:,} failures)\n") 129 | else: 130 | print(f"OK ({r:,} items)\n") 131 | counter += 1 132 | 133 | # 134 | # NFKC 135 | # c4 == toNFKC(c1) == toNFKC(c2) == toNFKC(c3) == toNFKC(c4) == toNFKC(c5) 136 | # 137 | 138 | print(f"Normalization Form KC\n{'-' * 70}") 139 | 140 | s = f = 0 141 | for record in data: 142 | num, source, nfc, nfd, nfkc, nfkd = record 143 | 144 | lst = [] 145 | lst.append(NFKC(source)) 146 | lst.append(NFKC(nfc)) 147 | lst.append(NFKC(nfd)) 148 | lst.append(NFKC(nfkc)) 149 | lst.append(NFKC(nfkd)) 150 | 151 | if lst.count(nfkc) == len(lst): 152 | s += 1 153 | else: 154 | f += 1 155 | print(f"Failed on line {num}") 156 | 157 | r = s + f 158 | if f: 159 | print(f"FAIL ({r:,} items, {f:,} failures)\n") 160 | else: 161 | print(f"OK ({r:,} items)\n") 162 | counter += 1 163 | 164 | # 165 | # NFKD 166 | # c5 == toNFKD(c1) == toNFKD(c2) == toNFKD(c3) == toNFKD(c4) == toNFKD(c5) 167 | # 168 | 169 | print(f"Normalization Form KD\n{'-' * 70}") 170 | 171 | s = f = 0 172 | for record in data: 173 | num, source, nfc, nfd, nfkc, nfkd = record 174 | 175 | lst = [] 176 | lst.append(NFKD(source)) 177 | lst.append(NFKD(nfc)) 178 | lst.append(NFKD(nfd)) 179 | lst.append(NFKD(nfkc)) 180 | lst.append(NFKD(nfkd)) 181 | 
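        # The record passes only if every normalization above matches the
        # expected NFKD column (c5) of the test file.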
        if lst.count(nfkd) == len(lst):
            s += 1
        else:
            f += 1
            print(f"Failed on line {num}")

    r = s + f
    if f:
        print(f"FAIL ({r:,} items, {f:,} failures)\n")
    else:
        print(f"OK ({r:,} items)\n")
        counter += 1

    #
    # Character by character test
    # X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
    #

    print(f"Character by character test, all normalization forms\n{'-' * 70}")

    s = f = 0
    for x in chars:
        lst = []
        lst.append(NFC(x))
        lst.append(NFD(x))
        lst.append(NFKC(x))
        lst.append(NFKD(x))

        if lst.count(x) == len(lst):
            s += 1
        else:
            f += 1
            print(f"Failed for U+{ord(x):04X}")

    r = s + f
    if f:
        print(f"FAIL ({r:,} items, {f:,} failures)\n")
    else:
        print(f"OK ({r:,} items)\n")
        counter += 1

    uax = f"UAX #15, version {UNICODE_VERSION}."

    if counter == 5:
        print(f".. Implementation conforms to {uax}")
    else:
        print(f".. Implementation does not conform to {uax}")

    print(f".. {time.perf_counter() - start_time:.3f} seconds")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/pyunormalize/tools/generate_unicode.py:
--------------------------------------------------------------------------------
# This script generates the pyunormalize._unicode module.
#
# Input files:
#   https://www.unicode.org/Public/16.0.0/ucd/CompositionExclusions.txt
#   https://www.unicode.org/Public/16.0.0/ucd/DerivedNormalizationProps.txt
#   https://www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt
#
# Output file:
#   _unicode.py (written to the current working directory)
#
# The output file must be copied to the `pyunormalize` directory.

import pathlib
import urllib.error
import urllib.request

UNICODE_VERSION = "16.0.0"
SCRIPT_PATH = "/".join(pathlib.Path(__file__).parts[-3:])

# Files from the Unicode character database (UCD)
EXCLUSIONS = "CompositionExclusions.txt"
PROPS = "DerivedNormalizationProps.txt"
UNICODE_DATA = "UnicodeData.txt"


def read_remote(filename):
    url = f"https://www.unicode.org/Public/{UNICODE_VERSION}/ucd/"

    try:
        print("\n.. Fetching URL...")
        response = urllib.request.urlopen(f"{url}{filename}")
    except urllib.error.HTTPError as e:
        raise Exception(
            f"The server could not fulfill the request. Error code: {e.code}"
        ) from e
    except urllib.error.URLError as e:
        raise Exception(
            f"We failed to reach a server.\nReason:\n{e.reason}"
        ) from e

    print(f".. Extracting data from {filename}")
    return response.read().decode("utf-8").splitlines()


def check_version(line):
    assert UNICODE_VERSION in line, "Wrong Unicode version number."


def main():
    # Current working directory
    cwd = pathlib.Path.cwd()

    #
    # Unicode file: UnicodeData.txt
    #

    try:
        lines = (cwd / UNICODE_DATA).read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        lines = read_remote(UNICODE_DATA)
        print(".. Done.")

    # File version is not specified in UnicodeData.txt
    # and therefore cannot be checked.
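    # UnicodeData.txt has one record per line with semicolon-separated fields;
    # the unpacking below uses field 0 (code point), field 3
    # (Canonical_Combining_Class), and field 5 (Decomposition_Type and
    # Decomposition_Mapping).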

    ccc_list = []
    dcp_list = []

    for line in lines:
        code, _, _, ccc, _, dcp, *_ = line.split(";", 6)

        if ccc != "0":
            ccc_list.append(f"    0x{code:0>5}: {ccc:>3},")

        if dcp:
            dec_dcp = []

            for c in dcp.split(" "):
                dec_dcp.append(f'"{c}"' if c.startswith("<") else f"0x{c}")

            dcp_list.append(f"    0x{code:0>5}: [{', '.join(dec_dcp)}],")

    #
    # Unicode file: CompositionExclusions.txt
    #

    try:
        lines = (cwd / EXCLUSIONS).read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        lines = read_remote(EXCLUSIONS)
        print(".. Done.")

    # Check file version
    check_version(lines[0])

    exclusions_list = []

    for line in lines:
        line = line.rstrip()
        if line and not line.startswith("#"):
            code = line.split("#")[0].rstrip()
            exclusions_list.append(f"    0x{code:0>5},")

    #
    # Unicode file: DerivedNormalizationProps.txt
    #

    try:
        lines = (cwd / PROPS).read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        lines = read_remote(PROPS)
        print(".. Done.")

    # Check file version
    check_version(lines[0])

    tmp = []

    start = lines.index(
        "# Property: NFD_Quick_Check"
    )
    stop = lines.index(
        "# Derived Property: Expands_On_NFD (DEPRECATED as of Unicode 6.0.0)"
    )

    for line in lines[start:stop]:
        if not line or line.startswith("#"):
            continue
        tmp.append(line)

    NFD_QC_NO_list = []
    NFKD_QC_NO_list = []
    NFC_QC_NO_list = []
    NFC_QC_MAYBE_list = []
    NFKC_QC_NO_list = []
    NFKC_QC_MAYBE_list = []

    prop_values = {
        "NFD_QC":  NFD_QC_NO_list,
        "NFKD_QC": NFKD_QC_NO_list,
        "NFC_QC":  (NFC_QC_NO_list, NFC_QC_MAYBE_list),
        "NFKC_QC": (NFKC_QC_NO_list, NFKC_QC_MAYBE_list),
    }

    for line in tmp:
        data = line.split(" # ")[0].split(";")
        code, prop, prop_val = [d.strip() for d in data]

        # NFC_QC and NFKC_QC records distinguish between the property
        # values "N" (No) and "M" (Maybe); the NFD_QC and NFKD_QC records
        # listed in this block are all "N".
        if prop in ("NFC_QC", "NFKC_QC"):
            tmp_list = prop_values[prop][0 if prop_val == "N" else 1]
        else:
            tmp_list = prop_values[prop]

        if ".." in code:
            start, end = code.split("..")
            tmp_list.append(
                f"    *range(0x{start:0>5}, 0x{end:0>5} + 1),"
            )
        else:
            tmp_list.append(f"    0x{code:0>5},")


    dcp = "\n".join(dcp_list)
    ccc = "\n".join(ccc_list)
    exclusions = "\n".join(exclusions_list)
    NFD_QC_N = "\n".join(NFD_QC_NO_list)
    NFKD_QC_N = "\n".join(NFKD_QC_NO_list)
    NFC_QC_N = "\n".join(NFC_QC_NO_list)
    NFC_QC_M = "\n".join(NFC_QC_MAYBE_list)
    NFKC_QC_N = "\n".join(NFKC_QC_NO_list)
    NFKC_QC_M = "\n".join(NFKC_QC_MAYBE_list)

    with open(cwd / "_unicode.py", "w", encoding="utf-8", newline="\n") as f:
        f.write(f'''\
"""Data derived from the Unicode character database (UCD).

This file was generated from {SCRIPT_PATH}
"""

_UNICODE_VERSION = "{UNICODE_VERSION}"

# Dictionary mapping characters to their canonical decompositions,
# not including Hangul syllables
_DECOMP_BY_CHARACTER = {{
{dcp}
}}

# Dictionary mapping characters with non-zero canonical combining class values
# to their corresponding values
_NON_ZERO_CCC_TABLE = {{
{ccc}
}}

# Characters which are excluded from composition
_COMPOSITION_EXCLUSIONS = {{
{exclusions}
}}

# NFC_Quick_Check=No
# Characters that cannot ever occur in the normalization form C
_NFC__QC_NO = set([
{NFC_QC_N}
])

# NFC_Quick_Check=Maybe
# Characters that may or may not occur in the normalization form C,
# depending on the context
_NFC__QC_MAYBE = set([
{NFC_QC_M}
])

# Code points listed for NFC_Quick_Check=No or NFC_Quick_Check=Maybe
_NFC__QC_NO_OR_MAYBE = _NFC__QC_NO | _NFC__QC_MAYBE

# NFD_Quick_Check=No
# Characters that cannot ever occur in the normalization form D
_NFD__QC_NO = set([
{NFD_QC_N}
])

# NFKC_Quick_Check=No
# Characters that cannot ever occur in the normalization form KC
_NFKC_QC_NO = set([
{NFKC_QC_N}
])

# NFKC_Quick_Check=Maybe
# Characters that may or may not occur in the normalization form KC,
# depending on the context
_NFKC_QC_MAYBE = set([
{NFKC_QC_M}
])

# Code points listed for NFKC_Quick_Check=No or NFKC_Quick_Check=Maybe
_NFKC_QC_NO_OR_MAYBE = _NFKC_QC_NO | _NFKC_QC_MAYBE

# NFKD_Quick_Check=No
# Characters that cannot ever occur in the normalization form KD
_NFKD_QC_NO = set([
{NFKD_QC_N}
])

del _NFC__QC_NO, _NFC__QC_MAYBE, _NFKC_QC_NO, _NFKC_QC_MAYBE
''')


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Setup script for pyunormalize."""

import os
from setuptools import setup, find_packages

URL = "https://github.com/mlodewijck/pyunormalize"


def get_version():
    version_file = os.path.join("pyunormalize", "_version.py")
    namespace = {}
    with open(version_file) as f:
        exec(compile(f.read(), version_file, "exec"), namespace)
    return namespace["__version__"]

with open("README.md", encoding="utf-8") as f:
    README = f.read()

setup(
    name="pyunormalize",
    version=get_version(),
    description=(
        "Unicode normalization forms (NFC, NFKC, NFD, NFKD). A library "
        "independent of the Python core Unicode database."
    ),
    long_description=README,
    long_description_content_type="text/markdown",
    author="Marc Lodewijck",
    author_email="mlodewijck@gmail.com",
    license="MIT",
    url=URL,
    project_urls={
        "Bug Reports": "{}/issues".format(URL),
        "Source": "{}/".format(URL),
    },
    keywords=[
        "Unicode",
        "Unicode data",
        "Unicode normalization",
        "normalization",
        "NFC",
        "NFD",
        "NFKC",
        "NFKD",
        "Unicode Normalization Forms",
        "Canonical Ordering Algorithm",
        "Canonical Composition Algorithm",
        "canonical ordering",
        "canonical composition",
        "Hangul Syllable Composition Algorithm",
        "Hangul Syllable Decomposition Algorithm",
        "Hangul syllables",
        "Hangul jamo characters",
    ],
    # Trove classifiers
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Topic :: Software Development",
        "Topic :: Software Development :: Internationalization",
        "Topic :: Text Processing",
        "Topic :: Text Processing :: Linguistic",
        "Topic :: Utilities",
    ],
    python_requires=">=3.6",
    # Exclude the test package by its full dotted name; a bare "tests"
    # pattern would only match a top-level package of that name.
    packages=find_packages(
        exclude=["pyunormalize.tests", "pyunormalize.tests.*"]
    ),
    include_package_data=True,
    # All data files matched by MANIFEST.in will get included
    # if they are inside a package directory.
    zip_safe=False,
)
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py36, py37, py38, py39, py310, py311, py312

[testenv]
commands =
    python -m unittest discover -s pyunormalize/tests -p "test_*.py"
    python pyunormalize/tests/unicode_conformance.py
--------------------------------------------------------------------------------