├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── UNICODE-LICENSE
├── pyunormalize
│   ├── __init__.py
│   ├── _unicode.py
│   ├── _version.py
│   ├── normalization.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── data
│   │   │   └── NormalizationTest.txt
│   │   ├── test_pyunormalize.py
│   │   └── unicode_conformance.py
│   └── tools
│       └── generate_unicode.py
├── setup.py
└── tox.ini
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021-2024, Marc Lodewijck

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
prune pyunormalize/tests
global-exclude *.pyc __pycache__
exclude tox.ini
include UNICODE-LICENSE
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pyunormalize
A pure Python implementation of the **Unicode normalization algorithm**, independent of Python’s core Unicode database. This package conforms to version 16.0 of the Unicode standard, released in September 2024, and has been rigorously tested for accuracy using the official [Unicode test file](https://www.unicode.org/Public/16.0.0/ucd/NormalizationTest.txt).
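
Unlike the standard library’s `unicodedata` module, whose data tracks the Unicode version bundled with each Python release, this package ships its own normalization data, so results do not depend on the interpreter in use. A quick comparison (a minimal sketch; the `unicodedata` result reflects whichever UCD version your Python build bundles):

```python
>>> import unicodedata
>>> from pyunormalize import NFC
>>> NFC("e\u0301") == unicodedata.normalize("NFC", "e\u0301") == "\u00E9"
True
```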
3 | 4 | ### Installation and updates 5 | To install the package, run: 6 | ```shell 7 | pip install pyunormalize 8 | ``` 9 | 10 | To upgrade to the latest version, run: 11 | ```shell 12 | pip install pyunormalize --upgrade 13 | ``` 14 | 15 | ### Unicode character database (UCD) version 16 | To retrieve the version of the Unicode character database in use: 17 | ```python 18 | >>> from pyunormalize import UCD_VERSION 19 | >>> UCD_VERSION 20 | '16.0.0' 21 | ``` 22 | 23 | ### Example usage 24 | ```python 25 | >>> from pyunormalize import NFC, NFD, NFKC, NFKD 26 | >>> s = "élève" # "\u00E9\u006C\u00E8\u0076\u0065" 27 | >>> nfc = NFC(s) 28 | >>> nfd = NFD(s) 29 | >>> nfc == s 30 | True 31 | >>> nfd == nfc 32 | False 33 | >>> " ".join([f"{ord(x):04X}" for x in nfc]) 34 | '00E9 006C 00E8 0076 0065' 35 | >>> " ".join([f"{ord(x):04X}" for x in nfd]) 36 | '0065 0301 006C 0065 0300 0076 0065' 37 | >>> 38 | >>> s = "⑴ ffi ²" 39 | >>> NFC(s), NFKC(s), NFD(s), NFKD(s) 40 | ('⑴ ffi ²', '(1) ffi 2', '⑴ ffi ²', '(1) ffi 2') 41 | 42 | >>> from pyunormalize import normalize 43 | >>> normalize("NFKD", "⑴ ffi ²") 44 | '(1) ffi 2' 45 | >>> forms = ["NFC", "NFD", "NFKC", "NFKD"] 46 | >>> [normalize(f, "\u017F\u0307\u0323") for f in forms] 47 | ['ẛ̣', 'ẛ̣', 'ṩ', 'ṩ'] 48 | ``` 49 | 50 | ### Related resources 51 | This implementation is based on the following resources: 52 | - [Section 3.11, “Normalization Forms,” in the Unicode core specification, version 16.0.0](https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G49537) 53 | - [Unicode Standard Annex #15: Unicode Normalization Forms, revision 56](https://www.unicode.org/reports/tr15/tr15-56.html) 54 | 55 | ### Licenses 56 | The code is licensed under the [MIT license](https://github.com/mlodewijck/pyunormalize/blob/master/LICENSE). 57 | 58 | Usage of Unicode data files is subject to the [UNICODE TERMS OF USE](https://www.unicode.org/copyright.html). Additional rights and restrictions regarding Unicode data files and software are outlined in the [Unicode Data Files and Software License](https://www.unicode.org/license.txt), a copy of which is included as [UNICODE-LICENSE](https://github.com/mlodewijck/pyunormalize/blob/master/UNICODE-LICENSE). -------------------------------------------------------------------------------- /UNICODE-LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlodewijck/pyunormalize/4b45c576567fb0293acb93a308c97cbaba3caa5f/UNICODE-LICENSE -------------------------------------------------------------------------------- /pyunormalize/__init__.py: -------------------------------------------------------------------------------- 1 | """Utility for Unicode normalization. 2 | 3 | This is a pure Python implementation of the Unicode normalization algorithm, 4 | independent of the Python core Unicode database, and ensuring compliance 5 | with version 16.0 of the Unicode standard (released in September 2024). It has 6 | been rigorously tested using the official Unicode test file, available 7 | at https://www.unicode.org/Public/16.0.0/ucd/NormalizationTest.txt. 8 | 9 | For the formal specification of the Unicode normalization algorithm, 10 | see Section 3.11, "Normalization Forms," in the Unicode core specification. 11 | 12 | Copyright (c) 2021-2024, Marc Lodewijck 13 | All rights reserved. 14 | 15 | This software is distributed under the MIT license. 
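
A minimal doctest-style check (illustrative only; see the README for
fuller examples):

    >>> from pyunormalize import NFC, NFD
    >>> NFC("e\u0301") == "\u00E9"
    True
    >>> NFD("\u00E9") == "e\u0301"
    True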
"""

import sys

# Avoid f-strings in this module so that, on unsupported interpreters,
# the explicit SystemExit message below is reported rather than a
# SyntaxError raised at compile time.
if sys.version_info < (3, 6):
    raise SystemExit("\n{} requires Python 3.6 or later.".format(__package__))
del sys

__all__ = [
    "NFC",
    "NFD",
    "NFKC",
    "NFKD",
    "normalize",
    "UCD_VERSION",
    "UNICODE_VERSION",
    "__version__",
]

# Unicode standard used to process the data
UNICODE_VERSION = UCD_VERSION = "16.0.0"


from pyunormalize import _version
__version__ = _version.__version__
del _version

from pyunormalize._unicode import _UNICODE_VERSION
if _UNICODE_VERSION != UNICODE_VERSION:
    raise SystemExit(
        "Unicode version mismatch in pyunormalize._unicode "
        "(expected {}, found {}).".format(UNICODE_VERSION, _UNICODE_VERSION)
    )
del _UNICODE_VERSION

from pyunormalize.normalization import *
--------------------------------------------------------------------------------
/pyunormalize/_version.py:
--------------------------------------------------------------------------------
__version__ = "16.0.0"
--------------------------------------------------------------------------------
/pyunormalize/normalization.py:
--------------------------------------------------------------------------------
"""Unicode normalization algorithms."""

from pyunormalize._unicode import (
    _COMPOSITION_EXCLUSIONS,
    _DECOMP_BY_CHARACTER,
    _NFC__QC_NO_OR_MAYBE,
    _NFD__QC_NO,
    _NFKC_QC_NO_OR_MAYBE,
    _NFKD_QC_NO,
    _NON_ZERO_CCC_TABLE,
)

# Hangul syllables for modern Korean
_SB = 0xAC00
_SL = 0xD7A3

# Hangul leading consonants (syllable onsets)
_LB = 0x1100
_LL = 0x1112

# Hangul vowels (syllable nuclei)
_VB = 0x1161
_VL = 0x1175

# Hangul trailing consonants (syllable codas)
_TB = 0x11A8
_TL = 0x11C2

# Number of Hangul vowels
_VCOUNT = 21

# Number of Hangul trailing consonants,
# with the additional case of no trailing consonant
_TCOUNT = 27 + 1

# Dictionary mapping characters to their full canonical decompositions,
# not including Hangul syllables
_FULL_CDECOMP_BY_CHAR = {}

# Dictionary mapping characters to their full compatibility decompositions,
# not including Hangul syllables
_FULL_KDECOMP_BY_CHAR = {}

# Dictionary mapping canonical decompositions to their canonical composite,
# not including Hangul syllables
_COMPOSITE_BY_CDECOMP = {}

# Note: As Hangul compositions and decompositions are algorithmic,
# the corresponding operations are performed in code rather than by storing
# the data in general-purpose tables.


def _full_decomposition(decomp_dict):
    # A full decomposition of a character sequence results from decomposing
    # each of the characters in the sequence until no characters can be
    # further decomposed.

    for key in decomp_dict:
        tmp = []
        decomposition = [key]

        while True:
            for x in decomposition:
                if x in decomp_dict:
                    tmp.extend(decomp_dict[x])
                else:
                    tmp.append(x)

            if tmp == decomposition:
                decomp_dict[key] = decomposition  # done with decomposition
                break

            decomposition = tmp
            tmp = []


def _populate_decomposition_dictionaries(decomp_by_character):
    # Populate dictionaries with full canonical decompositions
    # and full compatibility decompositions.
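    #
    # Values in `decomp_by_character` are lists of code points; compatibility
    # mappings additionally carry a leading formatting-tag string such as
    # "<compat>" or "<font>" (see tools/generate_unicode.py), which is why
    # the type of the first element is checked below.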
80 | 81 | for key, val in decomp_by_character.items(): 82 | if isinstance(val[0], int): 83 | # assert len(val) in (1, 2) 84 | 85 | if len(val) == 2 and val[0] not in _NON_ZERO_CCC_TABLE: 86 | _COMPOSITE_BY_CDECOMP[tuple(val)] = key 87 | 88 | _FULL_CDECOMP_BY_CHAR[key] = _FULL_KDECOMP_BY_CHAR[key] = val 89 | else: 90 | _FULL_KDECOMP_BY_CHAR[key] = val[1:] 91 | 92 | # Make full canonical decomposition 93 | _full_decomposition(_FULL_CDECOMP_BY_CHAR) 94 | 95 | # Make full compatibility decomposition 96 | _full_decomposition(_FULL_KDECOMP_BY_CHAR) 97 | 98 | 99 | # Populate full decomposition dictionaries 100 | _populate_decomposition_dictionaries(_DECOMP_BY_CHARACTER) 101 | 102 | del _DECOMP_BY_CHARACTER 103 | 104 | 105 | # 106 | # Public interface 107 | # 108 | 109 | def NFC(unistr): 110 | """Return the canonical equivalent "composed" form of the original Unicode 111 | string `unistr`. This function transforms the Unicode string into the 112 | Unicode "normalization form C", where character sequences are replaced by 113 | canonically equivalent composites, where possible, while compatibility 114 | characters are unaffected. 115 | 116 | For performance optimization, the function verifies whether the input 117 | string is already in NFC. If it is, the original string is returned 118 | directly to avoid unnecessary processing. 119 | 120 | Args: 121 | unistr (str): The input Unicode string. 122 | 123 | Returns: 124 | str: The NFC normalized Unicode string. 125 | 126 | Examples: 127 | 128 | >>> unistr = "élève" 129 | >>> nfc = NFC(unistr) 130 | >>> unistr, nfc 131 | ('élève', 'élève') 132 | >>> nfc == unistr 133 | False 134 | >>> " ".join(f"{ord(x):04X}" for x in unistr) 135 | '0065 0301 006C 0065 0300 0076 0065' 136 | >>> " ".join(f"{ord(x):04X}" for x in nfc) 137 | '00E9 006C 00E8 0076 0065' 138 | 139 | >>> unistr = "한국" 140 | >>> nfc = NFC(unistr) 141 | >>> unistr, nfc 142 | ('한국', '한국') 143 | >>> " ".join(f"{ord(x):04X}" for x in unistr) 144 | '1112 1161 11AB 1100 116E 11A8' 145 | >>> " ".join(f"{ord(x):04X}" for x in nfc) 146 | 'D55C AD6D' 147 | 148 | >>> NFC("ffi") 149 | 'ffi' 150 | 151 | """ 152 | prev_ccc = 0 153 | 154 | for u in unistr: 155 | u = ord(u) 156 | 157 | if u in _NFC__QC_NO_OR_MAYBE: 158 | break 159 | 160 | if u not in _NON_ZERO_CCC_TABLE: 161 | continue 162 | 163 | curr_ccc = _NON_ZERO_CCC_TABLE[u] 164 | 165 | if curr_ccc < prev_ccc: 166 | break 167 | 168 | prev_ccc = curr_ccc 169 | else: 170 | return unistr 171 | 172 | result = map(chr, _compose([*map(ord, NFD(unistr))])) 173 | 174 | return "".join(result) 175 | 176 | 177 | def NFD(unistr): 178 | """Return the canonical equivalent "decomposed" form of the original 179 | Unicode string `unistr`. This function transforms the Unicode string into 180 | the Unicode "normalization form D", where composite characters are replaced 181 | by canonically equivalent character sequences, in canonical order, while 182 | compatibility characters are unaffected. 183 | 184 | For performance optimization, the function verifies whether the input 185 | string is already in NFD. If it is, the original string is returned 186 | directly to avoid unnecessary processing. 187 | 188 | Args: 189 | unistr (str): The input Unicode string. 190 | 191 | Returns: 192 | str: The NFD normalized Unicode string. 
193 | 194 | Examples: 195 | 196 | >>> unistr = "élève" 197 | >>> nfd = NFD(unistr) 198 | >>> unistr, nfd 199 | ('élève', 'élève') 200 | >>> nfd == unistr 201 | False 202 | >>> " ".join(f"{ord(x):04X}" for x in unistr) 203 | '00E9 006C 00E8 0076 0065' 204 | >>> " ".join(f"{ord(x):04X}" for x in nfd) 205 | '0065 0301 006C 0065 0300 0076 0065' 206 | 207 | >>> unistr = "한국" 208 | >>> nfd = NFD(unistr) 209 | >>> unistr, nfd 210 | ('한국', '한국') 211 | >>> " ".join(f"{ord(x):04X}" for x in unistr) 212 | 'D55C AD6D' 213 | >>> " ".join(f"{ord(x):04X}" for x in nfd) 214 | '1112 1161 11AB 1100 116E 11A8' 215 | 216 | >>> NFD("ffi") 217 | 'ffi' 218 | 219 | """ 220 | prev_ccc = 0 221 | 222 | for u in unistr: 223 | u = ord(u) 224 | 225 | if u in _NFD__QC_NO: 226 | break 227 | 228 | if u not in _NON_ZERO_CCC_TABLE: 229 | continue 230 | 231 | curr_ccc = _NON_ZERO_CCC_TABLE[u] 232 | 233 | if curr_ccc < prev_ccc: 234 | break 235 | 236 | prev_ccc = curr_ccc 237 | else: 238 | return unistr 239 | 240 | result = map(chr, _reorder(_decompose(unistr))) 241 | 242 | return "".join(result) 243 | 244 | 245 | def NFKC(unistr): 246 | """Return the compatibility equivalent "composed" form of the original 247 | Unicode string `unistr`. This function transforms the Unicode string into 248 | the Unicode "normalization form KC", where character sequences are replaced 249 | by canonically equivalent composites, where possible, and compatibility 250 | characters are replaced by their nominal counterparts. 251 | 252 | For performance optimization, the function verifies whether the input 253 | string is already in NFKC. If it is, the original string is returned 254 | directly to avoid unnecessary processing. 255 | 256 | Args: 257 | unistr (str): The input Unicode string. 258 | 259 | Returns: 260 | str: The NFKC normalized Unicode string. 261 | 262 | Example: 263 | >>> NFKC("ffi") 264 | 'ffi' 265 | 266 | """ 267 | prev_ccc = 0 268 | 269 | for u in unistr: 270 | u = ord(u) 271 | 272 | if u in _NFKC_QC_NO_OR_MAYBE: 273 | break 274 | 275 | if u not in _NON_ZERO_CCC_TABLE: 276 | continue 277 | 278 | curr_ccc = _NON_ZERO_CCC_TABLE[u] 279 | 280 | if curr_ccc < prev_ccc: 281 | break 282 | 283 | prev_ccc = curr_ccc 284 | else: 285 | return unistr 286 | 287 | result = map(chr, _compose([*map(ord, NFKD(unistr))])) 288 | 289 | return "".join(result) 290 | 291 | 292 | def NFKD(unistr): 293 | """Return the compatibility equivalent "decomposed" form of the original 294 | Unicode string `unistr`. This function transforms the Unicode string into 295 | the Unicode "normalization form KD", where composite characters are 296 | replaced by canonically equivalent character sequences, in canonical order, 297 | and compatibility characters are replaced by their nominal counterparts. 298 | 299 | For performance optimization, the function verifies whether the input 300 | string is already in NFKD. If it is, the original string is returned 301 | directly to avoid unnecessary processing. 302 | 303 | Args: 304 | unistr (str): The input Unicode string. 305 | 306 | Returns: 307 | str: The NFKD normalized Unicode string. 
308 | 309 | Example: 310 | >>> NFKD("⑴") 311 | '(1)' 312 | 313 | """ 314 | prev_ccc = 0 315 | 316 | for u in unistr: 317 | u = ord(u) 318 | 319 | if u in _NFKD_QC_NO: 320 | break 321 | 322 | if u not in _NON_ZERO_CCC_TABLE: 323 | continue 324 | 325 | curr_ccc = _NON_ZERO_CCC_TABLE[u] 326 | 327 | if curr_ccc < prev_ccc: 328 | break 329 | 330 | prev_ccc = curr_ccc 331 | else: 332 | return unistr 333 | 334 | result = map(chr, _reorder(_decompose(unistr, compatibility=True))) 335 | 336 | return "".join(result) 337 | 338 | 339 | # Dictionary for normalization forms dispatch 340 | _normalization_forms = { 341 | "NFC": NFC, 342 | "NFD": NFD, 343 | "NFKC": NFKC, 344 | "NFKD": NFKD, 345 | } 346 | 347 | def normalize(form, unistr): 348 | """Transform the Unicode string `unistr` into the Unicode normalization 349 | form `form`. Valid values for `form` are "NFC", "NFD", "NFKC", and "NFKD". 350 | 351 | Args: 352 | form (str): The normalization form to apply, one of "NFC", "NFD", 353 | "NFKC", or "NFKD". 354 | 355 | unistr (str): The input Unicode string to be normalized. 356 | 357 | Returns: 358 | str: The normalized Unicode string. 359 | 360 | Examples: 361 | 362 | >>> normalize("NFKD", "⑴ ffi ²") 363 | '(1) ffi 2' 364 | 365 | >>> forms = ["NFC", "NFD", "NFKC", "NFKD"] 366 | >>> [normalize(f, "\u017F\u0307\u0323") for f in forms] 367 | ['ẛ̣', 'ẛ̣', 'ṩ', 'ṩ'] 368 | 369 | """ 370 | return _normalization_forms[form](unistr) 371 | 372 | 373 | # 374 | # Internals 375 | # 376 | 377 | def _decompose(unistr, *, compatibility=False): 378 | # Compute the full decomposition of the Unicode string based 379 | # on the specified normalization form. The type of full decomposition 380 | # chosen depends on which Unicode normalization form is involved. For NFC 381 | # or NFD, it performs a full canonical decomposition. For NFKC or NFKD, 382 | # it performs a full compatibility decomposition. 383 | 384 | result = [] 385 | decomp = _FULL_KDECOMP_BY_CHAR if compatibility else _FULL_CDECOMP_BY_CHAR 386 | 387 | for u in unistr: 388 | u = ord(u) 389 | 390 | if u in decomp: 391 | result.extend(decomp[u]) 392 | elif _SB <= u <= _SL: 393 | result.extend(_decompose_hangul_syllable(u)) 394 | else: 395 | result.append(u) 396 | 397 | return result 398 | 399 | 400 | def _decompose_hangul_syllable(cp): 401 | # Perform Hangul syllable decomposition algorithm to derive the full 402 | # canonical decomposition of a precomposed Hangul syllable into its 403 | # constituent jamo characters. 404 | 405 | sindex = cp - _SB 406 | tindex = sindex % _TCOUNT 407 | q = (sindex - tindex) // _TCOUNT 408 | V = _VB + (q % _VCOUNT) 409 | L = _LB + (q // _VCOUNT) 410 | 411 | if tindex: 412 | # LVT syllable 413 | return (L, V, _TB - 1 + tindex) 414 | 415 | # LV syllable 416 | return (L, V) 417 | 418 | 419 | def _reorder(elements): 420 | # Perform canonical ordering algorithm. Once a string has been fully 421 | # decomposed, this algorithm ensures that any sequences of combining marks 422 | # within it are arranged in a well-defined order. Only combining marks with 423 | # non-zero Canonical_Combining_Class property values are subject to 424 | # potential reordering. The canonical ordering imposed by both composed 425 | # and decomposed normalization forms is crucial for ensuring the uniqueness 426 | # of normal forms. 
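    #
    # The loop below is a bubble sort over the code-point list: adjacent
    # pairs are exchanged only when both characters have non-zero CCC values
    # and the first value is strictly greater than the second. Recording the
    # position of the last swap lets each pass skip the already-sorted tail.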

    n = len(elements)

    while n > 1:
        new_n = 0
        i = 1

        while i < n:
            ccc_b = _NON_ZERO_CCC_TABLE.get(elements[i])

            if not ccc_b:
                # elements[i] is a starter; neither (i - 1, i) nor
                # (i, i + 1) can be exchanged, so skip ahead.
                i += 2
                continue

            ccc_a = _NON_ZERO_CCC_TABLE.get(elements[i - 1])

            if not ccc_a or ccc_a <= ccc_b:
                i += 1
                continue

            elements[i - 1], elements[i] = elements[i], elements[i - 1]

            new_n = i
            i += 1

        n = new_n

    return elements


def _compose(elements):
    # Canonical composition algorithm to transform a fully decomposed
    # and canonically ordered string into its most fully composed but still
    # canonically equivalent sequence.

    for i, x in enumerate(elements):
        if x is None or x in _NON_ZERO_CCC_TABLE:
            continue

        last_cc = False
        blocked = False

        for j, y in enumerate(elements[i + 1 :], i + 1):
            if y in _NON_ZERO_CCC_TABLE:
                last_cc = True
            else:
                blocked = True

            if blocked and last_cc:
                continue

            prev = elements[j - 1]

            if (prev is None
                    or prev not in _NON_ZERO_CCC_TABLE
                    or _NON_ZERO_CCC_TABLE[prev] < _NON_ZERO_CCC_TABLE[y]):

                pair = (x, y)

                if pair in _COMPOSITE_BY_CDECOMP:
                    precomp = _COMPOSITE_BY_CDECOMP[pair]
                else:
                    precomp = _compose_hangul_syllable(*pair)

                if precomp is None or precomp in _COMPOSITION_EXCLUSIONS:
                    if blocked:
                        break
                else:
                    elements[i] = x = precomp
                    elements[j] = None

                    if blocked:
                        blocked = False
                    else:
                        last_cc = False

    # Filter with an explicit None test; `filter(None, ...)` would also
    # drop the valid code point U+0000.
    return [x for x in elements if x is not None]


def _compose_hangul_syllable(x, y):
    # Perform Hangul syllable composition algorithm to derive the mapping
    # of a canonically decomposed sequence of Hangul jamo characters
    # to an equivalent precomposed Hangul syllable.
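    #
    # Arithmetic (see "Hangul Syllable Composition" in the Unicode core
    # specification):
    #   LV  = SBase + (LIndex * VCount + VIndex) * TCount
    #   LVT = LV + TIndex, where TIndex counts from 1 because TIndex 0 means
    #   "no trailing consonant" -- hence the `_TB - 1` offset below.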

    if _LB <= x <= _LL and _VB <= y <= _VL:
        # Compose a leading consonant and a vowel into an LV syllable
        return _SB + (((x - _LB) * _VCOUNT) + y - _VB) * _TCOUNT

    if _SB <= x <= _SL and not (x - _SB) % _TCOUNT and _TB <= y <= _TL:
        # Compose an LV syllable and a trailing consonant into an LVT syllable
        return x + y - (_TB - 1)

    return None


if __name__ == "__main__":
    import doctest
    doctest.testmod()
--------------------------------------------------------------------------------
/pyunormalize/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlodewijck/pyunormalize/4b45c576567fb0293acb93a308c97cbaba3caa5f/pyunormalize/tests/__init__.py
--------------------------------------------------------------------------------
/pyunormalize/tests/test_pyunormalize.py:
--------------------------------------------------------------------------------
"""Unit tests for pyunormalize."""

import unittest

from pyunormalize.normalization import _decompose, _reorder, _compose
from pyunormalize import (
    NFC,
    NFD,
    NFKC,
    NFKD,
    normalize,
    UNICODE_VERSION as _UNICODE_VERSION,
)

UNICODE_VERSION = "16.0.0"


class Misc(unittest.TestCase):

    def test_UNICODE_VERSION(self):
        self.assertEqual(_UNICODE_VERSION, UNICODE_VERSION)

    def test_normalize(self):
        # Characters whose normalization forms
        # under NFC, NFD, NFKC, and NFKD are all different:
        #   ϓ U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
        #   ϔ U+03D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
        #   ẛ U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
        for s in ["\u03D3", "\u03D4", "\u1E9B"]:
            self.assertEqual(normalize("NFC", s), NFC(s))
            self.assertEqual(normalize("NFD", s), NFD(s))
            self.assertEqual(normalize("NFKC", s), NFKC(s))
            self.assertEqual(normalize("NFKD", s), NFKD(s))

    def test_internals(self):

        self.assertEqual(
            _decompose("\u00C0"),
            [0x0041, 0x0300]
        )

        self.assertEqual(
            _decompose("\u00BE", compatibility=True),
            [0x0033, 0x2044, 0x0034]
        )

        self.assertEqual(
            _decompose("힡"),
            [0x1112, 0x1175, 0x11C0]
        )

        self.assertEqual(
            _reorder([0x017F, 0x0307, 0x0323]),
            [0x017F, 0x0323, 0x0307]
        )

        s = "a\u0328\u0302\u0301"  # a + ogonek + circumflex + acute
        self.assertEqual(
            _decompose(s),
            [0x0061, 0x0328, 0x0302, 0x0301]
        )
        self.assertEqual(
            _reorder([0x0061, 0x0328, 0x0302, 0x0301]),
            [0x0061, 0x0328, 0x0302, 0x0301]
        )
        self.assertEqual(
            _compose([0x0061, 0x0328, 0x0302, 0x0301]),
            [0x0105, 0x0302, 0x0301]
        )

        # The normalization pipeline decomposes first, then reorders the
        # resulting code points, then composes.
        s = "\u0105\u0302\u0301"  # a-ogonek + circumflex + acute
        self.assertEqual(
            _compose(_reorder(_decompose(s))),
            [0x0105, 0x0302, 0x0301]
        )

        s = "\u0105\u0301\u0302"  # a-ogonek + acute + circumflex
        self.assertEqual(
            _decompose(s),
            [0x0061, 0x0328, 0x0301, 0x0302]
        )
        self.assertEqual(
            _reorder([0x0061, 0x0328, 0x0301, 0x0302]),
            [0x0061, 0x0328, 0x0301, 0x0302]
        )
        self.assertEqual(
            _compose([0x0061, 0x0328, 0x0301, 0x0302]),
            [0x0105, 0x0301, 0x0302]
        )

        self.assertEqual(
            _compose(_reorder(_decompose(s))),
            [0x0105, 0x0301, 0x0302]
        )

        # At https://www.unicode.org/versions/Unicode16.0.0/UnicodeStandard-16.0.pdf,
        # p. 140: "The replacement of the Starter L in R2 requires continuing
        # to check the succeeding characters until the character at that
        # position is no longer part of any Non-blocked Pair that can be
        # replaced by a Primary Composite. For example, consider the following
        # hypothetical coded character sequence: <z, overlay, cedilla,
        # diaeresis-below, acute>. None of the first three combining marks
        # forms a Primary Composite with the letter z. However, the fourth
        # combining mark in the sequence, acute, does form a Primary Composite
        # with z, and it is not Blocked from the z. Therefore, R2 mandates the
        # replacement of the sequence <z, acute> with <z-acute>, even though
        # there are three other combining marks intervening in the sequence."
        items = [0x007A, 0x0335, 0x0327, 0x0324, 0x0301]
        self.assertEqual(
            _compose(items),
            [0x017A, 0x0335, 0x0327, 0x0324]
        )


if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/pyunormalize/tests/unicode_conformance.py:
--------------------------------------------------------------------------------
"""Unicode conformance testing.

Information about conformance testing for Unicode normalization forms:
- https://www.unicode.org/Public/16.0.0/ucd/NormalizationTest.txt
- https://www.unicode.org/reports/tr15/tr15-56.html
"""

import os
import time

from pyunormalize import (
    NFC,
    NFD,
    NFKC,
    NFKD,
    UNICODE_VERSION,
)

# Unicode conformance test file
UNICODE_FILE = "NormalizationTest.txt"


def parse(lines):
    # Check file version
    assert UNICODE_VERSION in lines[0], "Wrong Unicode version number."
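    # Each data line of NormalizationTest.txt carries five semicolon-separated
    # fields -- source; NFC; NFD; NFKC; NFKD -- each of which is a
    # space-separated list of hex code points, followed by a comment field.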
26 | 27 | data = [] # list of lists 28 | dec = [] # needed for character by character test 29 | 30 | for num, line in enumerate(lines, 1): 31 | if line and not line.startswith(("#", "@")): 32 | *c, _ = line.split(";", 5) 33 | 34 | rec = [ 35 | "".join([chr(int(x, 16)) for x in seq.split()]) 36 | for seq in c 37 | ] 38 | 39 | # rec: [source, nfc, nfd, nfkc, nfkd] 40 | data.append([num, *rec]) 41 | 42 | if not " " in c[0]: 43 | dec.append(int(c[0], 16)) 44 | 45 | s = set(dec) 46 | chars = [chr(x) for x in range(0x110000) if x not in s] 47 | 48 | return data, chars 49 | 50 | 51 | def main(): 52 | data_dir = os.path.join("pyunormalize", "tests", "data") 53 | path = os.path.join(data_dir, UNICODE_FILE) 54 | 55 | with open(path, encoding="utf-8") as f: 56 | lines = f.read().splitlines() 57 | 58 | data, chars = parse(lines) 59 | 60 | counter = 0 61 | start_time = time.perf_counter() 62 | 63 | # 64 | # NFC 65 | # c2 == toNFC(c1) == toNFC(c2) == toNFC(c3) 66 | # c4 == toNFC(c4) == toNFC(c5) 67 | # 68 | 69 | print(f"\nNormalization Form C\n{'-' * 70}") 70 | 71 | s = f = 0 72 | for record in data: 73 | num, source, nfc, nfd, nfkc, nfkd = record 74 | 75 | lst1 = [] 76 | lst1.append(NFC(source)) 77 | lst1.append(NFC(nfc)) 78 | lst1.append(NFC(nfd)) 79 | 80 | lst2 = [] 81 | lst2.append(NFC(nfkc)) 82 | lst2.append(NFC(nfkd)) 83 | 84 | if (lst1.count(nfc) == len(lst1) 85 | and lst2.count(nfkc) == len(lst2)): 86 | s += 1 87 | else: 88 | f += 1 89 | print(f"Failed on line {num}") 90 | 91 | r = s + f 92 | if f: 93 | print(f"FAIL ({r:,} items, {f:,} failures)\n") 94 | else: 95 | print(f"OK ({r:,} items)\n") 96 | counter += 1 97 | 98 | # 99 | # NFD 100 | # c3 == toNFD(c1) == toNFD(c2) == toNFD(c3) 101 | # c5 == toNFD(c4) == toNFD(c5) 102 | # 103 | 104 | print(f"Normalization Form D\n{'-' * 70}") 105 | 106 | s = f = 0 107 | for record in data: 108 | num, source, nfc, nfd, nfkc, nfkd = record 109 | 110 | lst1 = [] 111 | lst1.append(NFD(source)) 112 | lst1.append(NFD(nfc)) 113 | lst1.append(NFD(nfd)) 114 | 115 | lst2 = [] 116 | lst2.append(NFD(nfkc)) 117 | lst2.append(NFD(nfkd)) 118 | 119 | if (lst1.count(nfd) == len(lst1) 120 | and lst2.count(nfkd) == len(lst2)): 121 | s += 1 122 | else: 123 | f += 1 124 | print(f"Failed on line {num}") 125 | 126 | r = s + f 127 | if f: 128 | print(f"FAIL ({r:,} items, {f:,} failures)\n") 129 | else: 130 | print(f"OK ({r:,} items)\n") 131 | counter += 1 132 | 133 | # 134 | # NFKC 135 | # c4 == toNFKC(c1) == toNFKC(c2) == toNFKC(c3) == toNFKC(c4) == toNFKC(c5) 136 | # 137 | 138 | print(f"Normalization Form KC\n{'-' * 70}") 139 | 140 | s = f = 0 141 | for record in data: 142 | num, source, nfc, nfd, nfkc, nfkd = record 143 | 144 | lst = [] 145 | lst.append(NFKC(source)) 146 | lst.append(NFKC(nfc)) 147 | lst.append(NFKC(nfd)) 148 | lst.append(NFKC(nfkc)) 149 | lst.append(NFKC(nfkd)) 150 | 151 | if lst.count(nfkc) == len(lst): 152 | s += 1 153 | else: 154 | f += 1 155 | print(f"Failed on line {num}") 156 | 157 | r = s + f 158 | if f: 159 | print(f"FAIL ({r:,} items, {f:,} failures)\n") 160 | else: 161 | print(f"OK ({r:,} items)\n") 162 | counter += 1 163 | 164 | # 165 | # NFKD 166 | # c5 == toNFKD(c1) == toNFKD(c2) == toNFKD(c3) == toNFKD(c4) == toNFKD(c5) 167 | # 168 | 169 | print(f"Normalization Form KD\n{'-' * 70}") 170 | 171 | s = f = 0 172 | for record in data: 173 | num, source, nfc, nfd, nfkc, nfkd = record 174 | 175 | lst = [] 176 | lst.append(NFKD(source)) 177 | lst.append(NFKD(nfc)) 178 | lst.append(NFKD(nfd)) 179 | lst.append(NFKD(nfkc)) 180 | lst.append(NFKD(nfkd)) 181 | 
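        # The record passes only if every normalization above matches the
        # expected NFKD column (c5) of the test file.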
        if lst.count(nfkd) == len(lst):
            s += 1
        else:
            f += 1
            print(f"Failed on line {num}")

    r = s + f
    if f:
        print(f"FAIL ({r:,} items, {f:,} failures)\n")
    else:
        print(f"OK ({r:,} items)\n")
        counter += 1

    #
    # Character by character test
    # X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
    #

    print(f"Character by character test, all normalization forms\n{'-' * 70}")

    s = f = 0
    for x in chars:
        lst = []
        lst.append(NFC(x))
        lst.append(NFD(x))
        lst.append(NFKC(x))
        lst.append(NFKD(x))

        if lst.count(x) == len(lst):
            s += 1
        else:
            f += 1
            print(f"Failed for U+{ord(x):04X}")

    r = s + f
    if f:
        print(f"FAIL ({r:,} items, {f:,} failures)\n")
    else:
        print(f"OK ({r:,} items)\n")
        counter += 1

    uax = f"UAX #15, version {UNICODE_VERSION}."

    if counter == 5:
        print(f".. Implementation conforms to {uax}")
    else:
        print(f".. Implementation does not conform to {uax}")

    print(f".. {time.perf_counter() - start_time:.3f} seconds")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/pyunormalize/tools/generate_unicode.py:
--------------------------------------------------------------------------------
# This script generates the pyunormalize._unicode module.
#
# Input files:
#   https://www.unicode.org/Public/16.0.0/ucd/CompositionExclusions.txt
#   https://www.unicode.org/Public/16.0.0/ucd/DerivedNormalizationProps.txt
#   https://www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt
#
# Output file:
#   _unicode.py (written to the current working directory)
#
# The output file must be copied to the `pyunormalize` directory.

import pathlib
import urllib.error
import urllib.request

UNICODE_VERSION = "16.0.0"
SCRIPT_PATH = "/".join(pathlib.Path(__file__).parts[-3:])

# Files from the Unicode character database (UCD)
EXCLUSIONS = "CompositionExclusions.txt"
PROPS = "DerivedNormalizationProps.txt"
UNICODE_DATA = "UnicodeData.txt"


def read_remote(filename):
    url = f"https://www.unicode.org/Public/{UNICODE_VERSION}/ucd/"

    try:
        print("\n.. Fetching URL...")
        response = urllib.request.urlopen(f"{url}{filename}")
    except urllib.error.HTTPError as e:
        raise Exception(
            f"The server could not fulfill the request. Error code: {e.code}"
        ) from e
    except urllib.error.URLError as e:
        raise Exception(
            f"We failed to reach a server.\nReason:\n{e.reason}"
        ) from e

    print(f".. Extracting data from {filename}")
    return response.read().decode("utf-8").splitlines()


def check_version(line):
    assert UNICODE_VERSION in line, "Wrong Unicode version number."


def main():
    # Current working directory
    cwd = pathlib.Path.cwd()

    #
    # Unicode file: UnicodeData.txt
    #

    try:
        lines = (cwd / UNICODE_DATA).read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        lines = read_remote(UNICODE_DATA)
        print(".. Done.")

    # File version is not specified in UnicodeData.txt
    # and therefore cannot be checked.
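    # UnicodeData.txt has one record per line with semicolon-separated fields;
    # the unpacking below uses field 0 (code point), field 3
    # (Canonical_Combining_Class), and field 5 (Decomposition_Type and
    # Decomposition_Mapping).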

    ccc_list = []
    dcp_list = []

    for line in lines:
        code, _, _, ccc, _, dcp, *_ = line.split(";", 6)

        if ccc != "0":
            ccc_list.append(f"    0x{code:0>5}: {ccc:>3},")

        if dcp:
            dec_dcp = []

            for c in dcp.split(" "):
                dec_dcp.append(f'"{c}"' if c.startswith("<") else f"0x{c}")

            dcp_list.append(f"    0x{code:0>5}: [{', '.join(dec_dcp)}],")

    #
    # Unicode file: CompositionExclusions.txt
    #

    try:
        lines = (cwd / EXCLUSIONS).read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        lines = read_remote(EXCLUSIONS)
        print(".. Done.")

    # Check file version
    check_version(lines[0])

    exclusions_list = []

    for line in lines:
        line = line.rstrip()
        if line and not line.startswith("#"):
            code = line.split("#")[0].rstrip()
            exclusions_list.append(f"    0x{code:0>5},")

    #
    # Unicode file: DerivedNormalizationProps.txt
    #

    try:
        lines = (cwd / PROPS).read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        lines = read_remote(PROPS)
        print(".. Done.")

    # Check file version
    check_version(lines[0])

    tmp = []

    start = lines.index(
        "# Property: NFD_Quick_Check"
    )
    stop = lines.index(
        "# Derived Property: Expands_On_NFD (DEPRECATED as of Unicode 6.0.0)"
    )

    for line in lines[start:stop]:
        if not line or line.startswith("#"):
            continue
        tmp.append(line)

    NFD_QC_NO_list = []
    NFKD_QC_NO_list = []
    NFC_QC_NO_list = []
    NFC_QC_MAYBE_list = []
    NFKC_QC_NO_list = []
    NFKC_QC_MAYBE_list = []

    prop_values = {
        "NFD_QC":  NFD_QC_NO_list,
        "NFKD_QC": NFKD_QC_NO_list,
        "NFC_QC":  (NFC_QC_NO_list, NFC_QC_MAYBE_list),
        "NFKC_QC": (NFKC_QC_NO_list, NFKC_QC_MAYBE_list),
    }

    for line in tmp:
        data = line.split(" # ")[0].split(";")
        code, prop, prop_val = [d.strip() for d in data]

        # NFC_QC and NFKC_QC records distinguish between the property
        # values "N" (No) and "M" (Maybe); the NFD_QC and NFKD_QC records
        # listed in this block are all "N".
        if prop in ("NFC_QC", "NFKC_QC"):
            tmp_list = prop_values[prop][0 if prop_val == "N" else 1]
        else:
            tmp_list = prop_values[prop]

        if ".." in code:
            start, end = code.split("..")
            tmp_list.append(
                f"    *range(0x{start:0>5}, 0x{end:0>5} + 1),"
            )
        else:
            tmp_list.append(f"    0x{code:0>5},")


    dcp = "\n".join(dcp_list)
    ccc = "\n".join(ccc_list)
    exclusions = "\n".join(exclusions_list)
    NFD_QC_N = "\n".join(NFD_QC_NO_list)
    NFKD_QC_N = "\n".join(NFKD_QC_NO_list)
    NFC_QC_N = "\n".join(NFC_QC_NO_list)
    NFC_QC_M = "\n".join(NFC_QC_MAYBE_list)
    NFKC_QC_N = "\n".join(NFKC_QC_NO_list)
    NFKC_QC_M = "\n".join(NFKC_QC_MAYBE_list)

    with open(cwd / "_unicode.py", "w", encoding="utf-8", newline="\n") as f:
        f.write(f'''\
"""Data derived from the Unicode character database (UCD).

This file was generated from {SCRIPT_PATH}
"""

_UNICODE_VERSION = "{UNICODE_VERSION}"

# Dictionary mapping characters to their canonical decompositions,
# not including Hangul syllables
_DECOMP_BY_CHARACTER = {{
{dcp}
}}

# Dictionary mapping characters with non-zero canonical combining class values
# to their corresponding values
_NON_ZERO_CCC_TABLE = {{
{ccc}
}}

# Characters which are excluded from composition
_COMPOSITION_EXCLUSIONS = {{
{exclusions}
}}

# NFC_Quick_Check=No
# Characters that cannot ever occur in the normalization form C
_NFC__QC_NO = set([
{NFC_QC_N}
])

# NFC_Quick_Check=Maybe
# Characters that may or may not occur in the normalization form C,
# depending on the context
_NFC__QC_MAYBE = set([
{NFC_QC_M}
])

# Code points listed for NFC_Quick_Check=No or NFC_Quick_Check=Maybe
_NFC__QC_NO_OR_MAYBE = _NFC__QC_NO | _NFC__QC_MAYBE

# NFD_Quick_Check=No
# Characters that cannot ever occur in the normalization form D
_NFD__QC_NO = set([
{NFD_QC_N}
])

# NFKC_Quick_Check=No
# Characters that cannot ever occur in the normalization form KC
_NFKC_QC_NO = set([
{NFKC_QC_N}
])

# NFKC_Quick_Check=Maybe
# Characters that may or may not occur in the normalization form KC,
# depending on the context
_NFKC_QC_MAYBE = set([
{NFKC_QC_M}
])

# Code points listed for NFKC_Quick_Check=No or NFKC_Quick_Check=Maybe
_NFKC_QC_NO_OR_MAYBE = _NFKC_QC_NO | _NFKC_QC_MAYBE

# NFKD_Quick_Check=No
# Characters that cannot ever occur in the normalization form KD
_NFKD_QC_NO = set([
{NFKD_QC_N}
])

del _NFC__QC_NO, _NFC__QC_MAYBE, _NFKC_QC_NO, _NFKC_QC_MAYBE
''')


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Setup script for pyunormalize."""

import os
from setuptools import setup, find_packages

URL = "https://github.com/mlodewijck/pyunormalize"


def get_version():
    version_file = os.path.join("pyunormalize", "_version.py")
    namespace = {}
    with open(version_file) as f:
        exec(compile(f.read(), version_file, "exec"), namespace)
    return namespace["__version__"]

with open("README.md", encoding="utf-8") as f:
    README = f.read()

setup(
    name="pyunormalize",
    version=get_version(),
    description=(
        "Unicode normalization forms (NFC, NFKC, NFD, NFKD). A library "
        "independent of the Python core Unicode database."
    ),
    long_description=README,
    long_description_content_type="text/markdown",
    author="Marc Lodewijck",
    author_email="mlodewijck@gmail.com",
    license="MIT",
    url=URL,
    project_urls={
        "Bug Reports": "{}/issues".format(URL),
        "Source": "{}/".format(URL),
    },
    keywords=[
        "Unicode",
        "Unicode data",
        "Unicode normalization",
        "normalization",
        "NFC",
        "NFD",
        "NFKC",
        "NFKD",
        "Unicode Normalization Forms",
        "Canonical Ordering Algorithm",
        "Canonical Composition Algorithm",
        "canonical ordering",
        "canonical composition",
        "Hangul Syllable Composition Algorithm",
        "Hangul Syllable Decomposition Algorithm",
        "Hangul syllables",
        "Hangul jamo characters",
    ],
    # Trove classifiers
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Topic :: Software Development",
        "Topic :: Software Development :: Internationalization",
        "Topic :: Text Processing",
        "Topic :: Text Processing :: Linguistic",
        "Topic :: Utilities",
    ],
    python_requires=">=3.6",
    # Exclude the test package by its full dotted name; a bare "tests"
    # pattern would only match a top-level package of that name.
    packages=find_packages(
        exclude=["pyunormalize.tests", "pyunormalize.tests.*"]
    ),
    include_package_data=True,
    # All data files matched by MANIFEST.in will get included
    # if they are inside a package directory.
    zip_safe=False,
)
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py36, py37, py38, py39, py310, py311, py312

[testenv]
commands =
    python -m unittest discover -s pyunormalize/tests -p "test_*.py"
    python pyunormalize/tests/unicode_conformance.py
--------------------------------------------------------------------------------