├── .github └── workflows │ └── publish.yml ├── .gitignore ├── .mailmap ├── .readthedocs.yaml ├── CHANGELOG.md ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── _static │ └── css │ │ └── custom.css ├── avoid.rst ├── bad_encodings.rst ├── cite.rst ├── cli.rst ├── conf.py ├── config.rst ├── detect.rst ├── encodings.rst ├── explain.rst ├── fixes.rst ├── heuristic.rst ├── images │ └── shipping-label.png └── index.rst ├── ftfy ├── __init__.py ├── bad_codecs │ ├── __init__.py │ ├── sloppy.py │ └── utf8_variants.py ├── badness.py ├── chardata.py ├── cli.py ├── fixes.py ├── formatting.py └── py.typed ├── mypy.ini ├── notebook ├── excel-export.png └── ftfy talk.ipynb ├── notes └── mysteries.txt ├── pyproject.toml ├── pytest.ini ├── scripts └── char_data_table.py ├── tests ├── __init__.py ├── face.txt ├── test-cases │ ├── README.md │ ├── in-the-wild.json │ ├── known-failures.json │ ├── language-names.json │ ├── negative.json │ └── synthetic.json ├── test_bytes.py ├── test_characters.py ├── test_cli.py ├── test_encodings.py ├── test_entities.py └── test_examples_in_json.py ├── tox.ini └── uv.lock /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python distribution 📦 to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | jobs: 9 | build: 10 | name: Build distribution 📦 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.x" 19 | - name: Install pypa/build 20 | run: >- 21 | python3 -m 22 | pip install 23 | build 24 | --user 25 | - name: Build a binary wheel and a source tarball 26 | run: python3 -m build 27 | - name: Store the distribution packages 28 | uses: actions/upload-artifact@v4 29 | with: 30 | name: python-package-distributions 31 | path: dist/ 32 | 33 | publish-to-pypi: 34 | name: >- 35 | Publish Python distribution 📦 to PyPI 36 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 37 | needs: 38 | - build 39 | runs-on: ubuntu-latest 40 | environment: 41 | name: pypi 42 | url: https://pypi.org/p/ftfy 43 | permissions: 44 | id-token: write # IMPORTANT: mandatory for trusted publishing 45 | 46 | steps: 47 | - name: Download all the dists 48 | uses: actions/download-artifact@v4 49 | with: 50 | name: python-package-distributions 51 | path: dist/ 52 | - name: Publish distribution 📦 to PyPI 53 | uses: pypa/gh-action-pypi-publish@release/v1 54 | 55 | github-release: 56 | name: >- 57 | Sign the Python distribution 📦 with Sigstore 58 | and upload them to GitHub Release 59 | needs: 60 | - publish-to-pypi 61 | runs-on: ubuntu-latest 62 | 63 | permissions: 64 | contents: write # IMPORTANT: mandatory for making GitHub Releases 65 | id-token: write # IMPORTANT: mandatory for sigstore 66 | 67 | steps: 68 | - name: Download all the dists 69 | uses: actions/download-artifact@v4 70 | with: 71 | name: python-package-distributions 72 | path: dist/ 73 | - name: Sign the dists with Sigstore 74 | uses: sigstore/gh-action-sigstore-python@v3.0.0 75 | with: 76 | inputs: >- 77 | ./dist/*.tar.gz 78 | ./dist/*.whl 79 | - name: Create GitHub Release 80 | env: 81 | GITHUB_TOKEN: ${{ github.token }} 82 | run: >- 83 | gh release create 84 | '${{ github.ref_name }}' 85 | --repo '${{ github.repository }}' 86 | --notes "" 87 | - name: Upload artifact signatures to GitHub Release 88 | env: 89 | GITHUB_TOKEN: ${{ github.token }} 90 | # Upload to GitHub Release using the `gh` CLI. 
91 | # `dist/` contains the built packages, and the 92 | # sigstore-produced signatures and certificates. 93 | run: >- 94 | gh release upload 95 | '${{ github.ref_name }}' dist/** 96 | --repo '${{ github.repository }}' 97 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | .coverage 4 | dist 5 | *.egg-info 6 | build 7 | _build 8 | twitterlogs 9 | .eggs 10 | .pytest_cache 11 | .tox 12 | specimens 13 | .vscode 14 | .python-version 15 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | # Robyn has used different names and e-mail addresses in the course of this project. Map them all to her current name and e-mail. 2 | Robyn Speer 3 | Robyn Speer 4 | Robyn Speer 5 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-24.04 11 | tools: 12 | python: "3.11" 13 | commands: 14 | - asdf plugin add uv 15 | - asdf install uv latest 16 | - asdf global uv latest 17 | - uv venv 18 | - uv sync 19 | - .venv/bin/python -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html 20 | 21 | # Build documentation in the docs/ directory with Sphinx 22 | sphinx: 23 | configuration: docs/conf.py 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Version 6.3.1 (October 25, 2024) 2 | 3 | - Fixed `license` metadata field in pyproject.toml. 4 | - Removed extraneous files from the `hatchling` sdist output. 5 | 6 | ## Version 6.3.0 (October 8, 2024) 7 | 8 | - Switched packaging from poetry to uv. 9 | - Uses modern Python packaging exclusively (no setup.py). 10 | - Added support for mojibake in Windows-1257 (Baltic). 11 | - Detects mojibake for "Ü" in an uppercase word, such as "ZURÜCK". 12 | - Expanded a heuristic that notices improbable punctuation. 13 | - Fixed a false positive involving two concatenated strings, one of which began with the § sign. 14 | - Rewrote `chardata.py` to be more human-readable and debuggable, instead of being full of 15 | keysmash-like character sets. 16 | 17 | ## Version 6.2.3 (August 5, 2024) 18 | 19 | - Updated PyPI metadata. 20 | 21 | ## Version 6.2.2 (August 5, 2024) 22 | 23 | - Updated Read the Docs config so that docs might build again. 24 | 25 | ## Version 6.2.1 (August 5, 2024) 26 | 27 | - Updated setup.py and tox.ini to indicate support for Python 3.8 through 3.13. 28 | - Replaced the text file used in CLI tests with a better one that tests the same issue. 29 | - Lints and auto-formatting using ruff. 30 | - Packaging and test fixes by Michał Górny. 31 | 32 | ## Version 6.2.0 (March 15, 2024) 33 | 34 | - Fixed a case where an en-dash and a space near other mojibake would be 35 | interpreted (probably incorrectly) as MacRoman mojibake. 36 | - Added [project.urls] metadata to pyproject.toml. 
37 | - README contains license clarifications for entitled jerks. 38 | 39 | ## Version 6.1.3 (November 21, 2023) 40 | 41 | - Updated wcwidth. 42 | - Switched to the Apache 2.0 license. 43 | - Dropped support for Python 3.7. 44 | 45 | ## Version 6.1.2 (February 17, 2022) 46 | 47 | - Added type information for `guess_bytes`. 48 | 49 | ## Version 6.1.1 (February 9, 2022) 50 | 51 | - Updated the heuristic to fix the letter ß in UTF-8/MacRoman mojibake, 52 | which had regressed since version 5.6. 53 | 54 | - Packaging fixes to pyproject.toml. 55 | 56 | ## Version 6.1 (February 9, 2022) 57 | 58 | - Updated the heuristic to fix the letter Ñ with more confidence. 59 | 60 | - Fixed type annotations and added py.typed. 61 | 62 | - ftfy is packaged using Poetry now, and wheels are created and uploaded to 63 | PyPI. 64 | 65 | ## Version 6.0.3 (May 14, 2021) 66 | 67 | - Allow the keyword argument `fix_entities` as a deprecated alias for 68 | `unescape_html`, raising a warning. 69 | 70 | - `ftfy.formatting` functions now disregard ANSI terminal escapes when 71 | calculating text width. 72 | 73 | 74 | ## Version 6.0.2 (May 4, 2021) 75 | 76 | This version is purely a cosmetic change, updating the maintainer's e-mail 77 | address and the project's canonical location on GitHub. 78 | 79 | 80 | ## Version 6.0.1 (April 12, 2021) 81 | 82 | - The `remove_terminal_escapes` step was accidentally not being used. This 83 | version restores it. 84 | 85 | - Specified in setup.py that ftfy 6 requires Python 3.6 or later. 86 | 87 | - Use a lighter link color when the docs are viewed in dark mode. 88 | 89 | ## Version 6.0 (April 2, 2021) 90 | 91 | - New function: `ftfy.fix_and_explain()` can describe all the transformations 92 | that happen when fixing a string. This is similar to what 93 | `ftfy.fixes.fix_encoding_and_explain()` did in previous versions, but it 94 | can fix more than the encoding. 95 | 96 | - `fix_and_explain()` and `fix_encoding_and_explain()` are now in the top-level 97 | ftfy module. 98 | 99 | - Changed the heuristic entirely. ftfy no longer needs to categorize every 100 | Unicode character, but only characters that are expected to appear in 101 | mojibake. 102 | 103 | - Because of the new heuristic, ftfy will no longer have to release a new 104 | version for every new version of Unicode. It should also run faster and 105 | use less RAM when imported. 106 | 107 | - The heuristic `ftfy.badness.is_bad(text)` can be used to determine whether 108 | there appears to be mojibake in a string. Some users were already using 109 | the old function `sequence_weirdness()` for that, but this one is actually 110 | designed for that purpose. 111 | 112 | - Instead of a pile of named keyword arguments, ftfy functions now take in 113 | a TextFixerConfig object. The keyword arguments still work, and become 114 | settings that override the defaults in TextFixerConfig. 115 | 116 | - Added support for UTF-8 mixups with Windows-1253 and Windows-1254. 117 | 118 | - Overhauled the documentation: https://ftfy.readthedocs.org 119 | 120 | ## Version 5.9 (February 10, 2021) 121 | 122 | This version is brought to you by the letter à and the number 0xC3. 123 | 124 | - Tweaked the heuristic to decode, for example, "à" as the letter "à" 125 | more often. 126 | 127 | - This combines with the non-breaking-space fixer to decode "à " as "à" as 128 | well. 
However, in many cases, the text " à " was intended to be " à ", 129 | preserving the space -- the underlying mojibake had two spaces after it, but 130 | the Web coalesced them into one. We detect this case based on common French 131 | and Portuguese words, and preserve the space when it appears intended. 132 | 133 | Thanks to @zehavoc for bringing to my attention how common this case is. 134 | 135 | - Updated the data file of Unicode character categories to Unicode 13, as 136 | used in Python 3.9. (No matter what version of Python you're on, ftfy uses 137 | the same data.) 138 | 139 | ## Version 5.8 (July 17, 2020) 140 | 141 | - Improved detection of UTF-8 mojibake of Greek, Cyrillic, Hebrew, and Arabic 142 | scripts. 143 | 144 | - Fixed the undeclared dependency on setuptools by removing the use of 145 | `pkg_resources`. 146 | 147 | ## Version 5.7 (February 18, 2020) 148 | 149 | - Updated the data file of Unicode character categories to Unicode 12.1, as 150 | used in Python 3.8. (No matter what version of Python you're on, ftfy uses 151 | the same data.) 152 | 153 | - Corrected an omission where short sequences involving the ACUTE ACCENT 154 | character were not being fixed. 155 | 156 | ## Version 5.6 (August 7, 2019) 157 | 158 | - The `unescape_html` function now supports all the HTML5 entities that appear 159 | in `html.entities.html5`, including those with long names such as 160 | `˝`. 161 | 162 | - Unescaping of numeric HTML entities now uses the standard library's 163 | `html.unescape`, making edge cases consistent. 164 | 165 | (The reason we don't run `html.unescape` on all text is that it's not always 166 | appropriate to apply, and can lead to false positive fixes. The text 167 | "This&NotThat" should not have "&Not" replaced by a symbol, as 168 | `html.unescape` would do.) 169 | 170 | - On top of Python's support for HTML5 entities, ftfy will also convert HTML 171 | escapes of common Latin capital letters that are (nonstandardly) written 172 | in all caps, such as `&NTILDE;` for `Ñ`. 173 | 174 | 175 | ## Version 5.5.1 (September 14, 2018) 176 | 177 | - Added Python 3.7 support. 178 | 179 | - Updated the data file of Unicode character categories to Unicode 11, as used 180 | in Python 3.7.0. (No matter what version of Python you're on, ftfy uses the 181 | same data.) 182 | 183 | 184 | ## Version 5.5 (September 6, 2018) 185 | 186 | - Recent versions have emphasized making a reasonable attempt to fix short, 187 | common mojibake sequences, such as `û`. In this version, we've expanded the 188 | heuristics to recognize these sequences in MacRoman as well as Windows-125x 189 | encodings. 190 | 191 | - A related rule for fixing isolated Windows-1252/UTF-8 mixups, even when they 192 | were inconsistent with the rest of the string, claimed to work on Latin-1/UTF-8 193 | mixups as well, but in practice it didn't. We've made the rule more robust. 194 | 195 | - Fixed a failure when testing the CLI on Windows. 196 | 197 | - Removed the `pytest-runner` invocation from setup.py, as it created complex 198 | dependencies that would stop setup.py from working in some environments. 199 | The `pytest` command still works fine. `pytest-runner` is just too clever. 200 | 201 | 202 | ## Version 5.4.1 (June 14, 2018) 203 | 204 | - Fixed a bug in the `setup.py` metadata. 205 | 206 | This bug was causing ftfy, a package that fixes encoding mismatches, to not 207 | install in some environments due to an encoding mismatch. (We were really 208 | putting the "meta" in "metadata" here.) 
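To make the `unescape_html` behavior described under version 5.6 above concrete, here is a minimal sketch (the function is `ftfy.fixes.unescape_html`; the example strings are illustrative, not taken from the changelog):

```python
from ftfy.fixes import unescape_html

# A standard HTML5 entity decodes as expected
print(unescape_html("ma&ntilde;ana"))    # mañana

# A nonstandard all-caps entity for a common Latin capital letter also decodes
print(unescape_html("ESPA&NTILDE;A"))    # ESPAÑA
```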
209 | 210 | 211 | ## Version 5.4 (June 1, 2018) 212 | 213 | - ftfy was still too conservative about fixing short mojibake sequences, 214 | such as "août" -> "août", when the broken version contained punctuation 215 | such as curly or angle quotation marks. 216 | 217 | The new heuristic observes in some cases that, even if quotation marks are 218 | expected to appear next to letters, it is strange to have an accented capital 219 | A before the quotation mark and more letters after the quotation mark. 220 | 221 | - Provides better metadata for the new PyPI. 222 | 223 | - Switched from nosetests to pytest. 224 | 225 | 226 | ## Version 5.3 (January 25, 2018) 227 | 228 | - A heuristic has been too conservative since version 4.2, causing a regression 229 | compared to previous versions: ftfy would fail to fix mojibake of common 230 | characters such as `á` when seen in isolation. A new heuristic now makes it 231 | possible to fix more of these common cases with less evidence. 232 | 233 | 234 | ## Version 5.2 (November 27, 2017) 235 | 236 | - The command-line tool will not accept the same filename as its input 237 | and output. (Previously, this would write a zero-length file.) 238 | 239 | - The `uncurl_quotes` fixer, which replaces curly quotes with straight quotes, 240 | now also replaces MODIFIER LETTER APOSTROPHE. 241 | 242 | - Codepoints that contain two Latin characters crammed together for legacy 243 | encoding reasons are replaced by those two separate characters, even in NFC 244 | mode. We formerly did this just with ligatures such as `fi` and `IJ`, but now 245 | this includes the Afrikaans digraph `ʼn` and Serbian/Croatian digraphs such as 246 | `dž`. 247 | 248 | 249 | ## Version 5.1.1 and 4.4.3 (May 15, 2017) 250 | 251 | These releases fix two unrelated problems with the tests, one in each version. 252 | 253 | - v5.1.1: fixed the CLI tests (which are new in v5) so that they pass 254 | on Windows, as long as the Python output encoding is UTF-8. 255 | 256 | - v4.4.3: added the `# coding: utf-8` declaration to two files that were 257 | missing it, so that tests can run on Python 2. 258 | 259 | ## Version 5.1 (April 7, 2017) 260 | 261 | - Removed the dependency on `html5lib` by dropping support for Python 3.2. 262 | 263 | We previously used the dictionary `html5lib.constants.entities` to decode 264 | HTML entities. In Python 3.3 and later, that exact dictionary is now in the 265 | standard library as `html.entities.html5`. 266 | 267 | - Moved many test cases about how particular text should be fixed into 268 | `test_cases.json`, which may ease porting to other languages. 269 | 270 | The functionality of this version remains the same as 5.0.2 and 4.4.2. 271 | 272 | 273 | ## Version 5.0.2 and 4.4.2 (March 21, 2017) 274 | 275 | Added a `MANIFEST.in` that puts files such as the license file and this 276 | changelog inside the source distribution. 277 | 278 | 279 | ## Version 5.0.1 and 4.4.1 (March 10, 2017) 280 | 281 | Bug fix: 282 | 283 | - The `unescape_html` fixer will decode entities between `€` and `Ÿ` 284 | as what they would be in Windows-1252, even without the help of 285 | `fix_encoding`. 286 | 287 | This better matches what Web browsers do, and fixes a regression that version 288 | 4.4 introduced in an example that uses `…` as an ellipsis. 289 | 290 | 291 | ## Version 5.0 (February 17, 2017) 292 | 293 | Breaking changes: 294 | 295 | - Dropped support for Python 2. If you need Python 2 support, you should get 296 | version 4.4, which has the same features as this version. 
297 | 298 | - The top-level functions require their arguments to be given as keyword 299 | arguments. 300 | 301 | Version 5.0 also now has tests for the command-line invocation of ftfy. 302 | 303 | 304 | ## Version 4.4.0 (February 17, 2017) 305 | 306 | Heuristic changes: 307 | 308 | - ftfy can now fix mojibake involving the Windows-1250 or ISO-8859-2 encodings. 309 | 310 | - The `fix_entities` fixer is now applied after `fix_encoding`. This makes 311 | more situations resolvable when both fixes are needed. 312 | 313 | - With a few exceptions for commonly-used characters such as `^`, it is now 314 | considered "weird" whenever a diacritic appears in non-combining form, 315 | such as the diaeresis character `¨`. 316 | 317 | - It is also now weird when IPA phonetic letters, besides `ə`, appear next to 318 | capital letters. 319 | 320 | - These changes to the heuristics, and others we've made in recent versions, 321 | let us lower the "cost" for fixing mojibake in some encodings, causing them 322 | to be fixed in more cases. 323 | 324 | 325 | ## Version 4.3.1 (January 12, 2017) 326 | 327 | Bug fix: 328 | 329 | - `remove_control_chars` was removing U+0D ('\r') prematurely. That's the 330 | job of `fix_line_breaks`. 331 | 332 | 333 | ## Version 4.3.0 (December 29, 2016) 334 | 335 | ftfy has gotten by for four years without dependencies on other Python 336 | libraries, but now we can spare ourselves some code and some maintenance burden 337 | by delegating certain tasks to other libraries that already solve them well. 338 | This version now depends on the `html5lib` and `wcwidth` libraries. 339 | 340 | Feature changes: 341 | 342 | - The `remove_control_chars` fixer will now remove some non-ASCII control 343 | characters as well, such as deprecated Arabic control characters and 344 | byte-order marks. Bidirectional controls are still left as is. 345 | 346 | This should have no impact on well-formed text, while cleaning up many 347 | characters that the Unicode Consortium deems "not suitable for markup" 348 | (see Unicode Technical Report #20). 349 | 350 | - The `unescape_html` fixer uses a more thorough list of HTML entities, 351 | which it imports from `html5lib`. 352 | 353 | - `ftfy.formatting` now uses `wcwidth` to compute the width that a string 354 | will occupy in a text console. 355 | 356 | Heuristic changes: 357 | 358 | - Updated the data file of Unicode character categories to Unicode 9, as used 359 | in Python 3.6.0. (No matter what version of Python you're on, ftfy uses the 360 | same data.) 361 | 362 | Pending deprecations: 363 | 364 | - The `remove_bom` option will become deprecated in 5.0, because it has been 365 | superseded by `remove_control_chars`. 366 | 367 | - ftfy 5.0 will remove the previously deprecated name `fix_text_encoding`. It 368 | was renamed to `fix_encoding` in 4.0. 369 | 370 | - ftfy 5.0 will require Python 3.2 or later, as planned. Python 2 users, please 371 | specify `ftfy < 5` in your dependencies if you haven't already. 372 | 373 | 374 | ## Version 4.2.0 (September 28, 2016) 375 | 376 | Heuristic changes: 377 | 378 | - Math symbols next to currency symbols are no longer considered 'weird' by the 379 | heuristic. This fixes a false positive where text that involved the 380 | multiplication sign and British pounds or euros (as in '5×£35') could turn 381 | into Hebrew letters. 
382 | 383 | - A heuristic that used to be a bonus for certain punctuation now also gives a 384 | bonus to successfully decoding other common codepoints, such as the 385 | non-breaking space, the degree sign, and the byte order mark. 386 | 387 | - In version 4.0, we tried to "future-proof" the categorization of emoji (as a 388 | kind of symbol) to include codepoints that would likely be assigned to emoji 389 | later. The future happened, and there are even more emoji than we expected. 390 | We have expanded the range to include those emoji, too. 391 | 392 | ftfy is still mostly based on information from Unicode 8 (as Python 3.5 is), 393 | but this expanded range should include the emoji from Unicode 9 and 10. 394 | 395 | - Emoji are increasingly being modified by variation selectors and skin-tone 396 | modifiers. Those codepoints are now grouped with 'symbols' in ftfy, so they 397 | fit right in with emoji, instead of being considered 'marks' as their Unicode 398 | category would suggest. 399 | 400 | This enables fixing mojibake that involves iOS's new diverse emoji. 401 | 402 | - An old heuristic that wasn't necessary anymore considered Latin text with 403 | high-numbered codepoints to be 'weird', but this is normal in languages such 404 | as Vietnamese and Azerbaijani. This does not seem to have caused any false 405 | positives, but it caused ftfy to be too reluctant to fix some cases of broken 406 | text in those languages. 407 | 408 | The heuristic has been changed, and all languages that use Latin letters 409 | should be on even footing now. 410 | 411 | 412 | ## Version 4.1.1 (April 13, 2016) 413 | 414 | - Bug fix: in the command-line interface, the `-e` option had no effect on 415 | Python 3 when using standard input. Now, it correctly lets you specify 416 | a different encoding for standard input. 417 | 418 | 419 | ## Version 4.1.0 (February 25, 2016) 420 | 421 | Heuristic changes: 422 | 423 | - ftfy can now deal with "lossy" mojibake. If your text has been run through 424 | a strict Windows-1252 decoder, such as the one in Python, it may contain 425 | the replacement character � (U+FFFD) where there were bytes that are 426 | unassigned in Windows-1252. 427 | 428 | Although ftfy won't recover the lost information, it can now detect this 429 | situation, replace the entire lossy character with �, and decode the rest of 430 | the characters. Previous versions would be unable to fix any string that 431 | contained U+FFFD. 432 | 433 | As an example, text in curly quotes that gets corrupted `“ like this â€�` 434 | now gets fixed to be `“ like this �`. 435 | 436 | - Updated the data file of Unicode character categories to Unicode 8.0, as used 437 | in Python 3.5.0. (No matter what version of Python you're on, ftfy uses the 438 | same data.) 439 | 440 | - Heuristics now count characters such as `~` and `^` as punctuation instead 441 | of wacky math symbols, improving the detection of mojibake in some edge cases. 442 | 443 | New features: 444 | 445 | - A new module, `ftfy.formatting`, can be used to justify Unicode text in a 446 | monospaced terminal. It takes into account that each character can take up 447 | anywhere from 0 to 2 character cells. 448 | 449 | - Internally, the `utf-8-variants` codec was simplified and optimized. 450 | 451 | 452 | ## Version 4.0.0 (April 10, 2015) 453 | 454 | Breaking changes: 455 | 456 | - The default normalization form is now NFC, not NFKC. 
NFKC replaces a large 457 | number of characters with 'equivalent' characters, and some of these 458 | replacements are useful, but some are not desirable to do by default. 459 | 460 | - The `fix_text` function has some new options that perform more targeted 461 | operations that are part of NFKC normalization, such as 462 | `fix_character_width`, without requiring hitting all your text with the huge 463 | mallet that is NFKC. 464 | 465 | - If you were already using NFC normalization, or in general if you want to 466 | preserve the *spacing* of CJK text, you should be sure to set 467 | `fix_character_width=False`. 468 | 469 | - The `remove_unsafe_private_use` parameter has been removed entirely, after 470 | two versions of deprecation. The function name `fix_bad_encoding` is also 471 | gone. 472 | 473 | New features: 474 | 475 | - Fixers for strange new forms of mojibake, including particularly clear cases 476 | of mixed UTF-8 and Windows-1252. 477 | 478 | - New heuristics, so that ftfy can fix more stuff, while maintaining 479 | approximately zero false positives. 480 | 481 | - The command-line tool trusts you to know what encoding your *input* is in, 482 | and assumes UTF-8 by default. You can still tell it to guess with the `-g` 483 | option. 484 | 485 | - The command-line tool can be configured with options, and can be used as a 486 | pipe. 487 | 488 | - Recognizes characters that are new in Unicode 7.0, as well as emoji from 489 | Unicode 8.0+ that may already be in use on iOS. 490 | 491 | Deprecations: 492 | 493 | - `fix_text_encoding` is being renamed again, for conciseness and consistency. 494 | It's now simply called `fix_encoding`. The name `fix_text_encoding` is 495 | available but emits a warning. 496 | 497 | Pending deprecations: 498 | 499 | - Python 2.6 support is largely coincidental. 500 | 501 | - Python 2.7 support is on notice. If you use Python 2, be sure to pin a 502 | version of ftfy less than 5.0 in your requirements. 503 | 504 | 505 | ## Version 3.4.0 (January 15, 2015) 506 | 507 | New features: 508 | 509 | - `ftfy.fixes.fix_surrogates` will fix all 16-bit surrogate codepoints, 510 | which would otherwise break various encoding and output functions. 511 | 512 | Deprecations: 513 | 514 | - `remove_unsafe_private_use` emits a warning, and will disappear in the 515 | next minor or major version. 516 | 517 | 518 | ## Version 3.3.1 (December 12, 2014) 519 | 520 | This version restores compatibility with Python 2.6. 521 | 522 | 523 | ## Version 3.3.0 (August 16, 2014) 524 | 525 | Heuristic changes: 526 | 527 | - Certain symbols are marked as "ending punctuation" that may naturally occur 528 | after letters. When they follow an accented capital letter and look like 529 | mojibake, they will not be "fixed" without further evidence. 530 | An example is that "MARQUÉ…" will become "MARQUÉ...", and not "MARQUɅ". 531 | 532 | New features: 533 | 534 | - `ftfy.explain_unicode` is a diagnostic function that shows you what's going 535 | on in a Unicode string. It shows you a table with each code point in 536 | hexadecimal, its glyph, its name, and its Unicode category. 537 | 538 | - `ftfy.fixes.decode_escapes` adds a feature missing from the standard library: 539 | it lets you decode a Unicode string with backslashed escape sequences in it 540 | (such as "\u2014") the same way that Python itself would. 541 | 542 | - `ftfy.streamtester` is a release of the code that I use to test ftfy on 543 | an endless stream of real-world data from Twitter. 
With the new heuristics, 544 | the false positive rate of ftfy is about 1 per 6 million tweets. (See 545 | the "Accuracy" section of the documentation.) 546 | 547 | Deprecations: 548 | 549 | - Python 2.6 is no longer supported. 550 | 551 | - `remove_unsafe_private_use` is no longer needed in any current version of 552 | Python. This fixer will disappear in a later version of ftfy. 553 | 554 | 555 | ## Version 3.2.0 (June 27, 2014) 556 | 557 | - `fix_line_breaks` fixes three additional characters that are considered line 558 | breaks in some environments, such as Javascript, and Python's "codecs" 559 | library. These are all now replaced with \n: 560 | 561 | U+0085 <control>, with alias "NEXT LINE" 562 | U+2028 LINE SEPARATOR 563 | U+2029 PARAGRAPH SEPARATOR 564 | 565 | 566 | ## Version 3.1.3 (May 15, 2014) 567 | 568 | - Fix `utf-8-variants` so it never outputs surrogate codepoints, even on 569 | Python 2 where that would otherwise be possible. 570 | 571 | 572 | ## Version 3.1.2 (January 29, 2014) 573 | 574 | - Fix bug in 3.1.1 where strings with backslashes in them could never be fixed 575 | 576 | 577 | ## Version 3.1.1 (January 29, 2014) 578 | 579 | - Add the `ftfy.bad_codecs` package, which registers new codecs that can 580 | decode things that Python may otherwise refuse to decode (a usage sketch appears below): 581 | 582 | - `utf-8-variants`, which decodes CESU-8 and its Java lookalike 583 | 584 | - `sloppy-windows-*`, which decodes character-map encodings while treating 585 | unmapped characters as Latin-1 586 | 587 | - Simplify the code using `ftfy.bad_codecs`. 588 | 589 | 590 | ## Version 3.0.6 (November 5, 2013) 591 | 592 | - `fix_entities` can now be True, False, or 'auto'. The new case is True, which 593 | will decode all entities, even in text that already contains angle brackets. 594 | This may also be faster, because it doesn't have to check. 595 | - `build_data.py` will refuse to run on Python < 3.3, to prevent building 596 | an inconsistent data file. 597 | 598 | 599 | ## Version 3.0.5 (November 1, 2013) 600 | 601 | - Fix the arguments to `fix_file`, because they were totally wrong. 602 | 603 | 604 | ## Version 3.0.4 (October 1, 2013) 605 | 606 | - Restore compatibility with Python 2.6. 607 | 608 | 609 | ## Version 3.0.3 (September 9, 2013) 610 | 611 | - Fixed an ugly regular expression bug that prevented ftfy from importing on a 612 | narrow build of Python. 613 | 614 | 615 | ## Version 3.0.2 (September 4, 2013) 616 | 617 | - Fixed some false positives. 618 | 619 | - Basically, 3.0.1 was too eager to treat text as MacRoman or cp437 when 620 | three consecutive characters coincidentally decoded as UTF-8. Increased the 621 | cost of those encodings so that they have to successfully decode multiple 622 | UTF-8 characters. 623 | 624 | - See `tests/test_real_tweets.py` for the new test cases that were added as a 625 | result.
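The codecs registered by `ftfy.bad_codecs` (introduced in version 3.1.1 above) plug into Python's normal codec machinery once the package has been imported. A minimal sketch, with illustrative byte strings rather than examples from the changelog:

```python
import ftfy.bad_codecs  # importing this module registers the extra codecs

# CESU-8-style bytes: U+1F41F written as an encoded surrogate pair, which the
# strict built-in UTF-8 codec would reject
print(b"\xed\xa0\xbd\xed\xb0\x9f".decode("utf-8-variants"))

# sloppy-windows-1252 decodes every byte; bytes that standard Windows-1252
# leaves unmapped (such as 0x81) are treated as Latin-1 instead of raising
print(repr(b"caf\xe9 \x81".decode("sloppy-windows-1252")))
```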
626 | 627 | 628 | ## Version 3.0.1 (August 30, 2013) 629 | 630 | - Fix bug in `fix_java_encoding` that led to only the first instance of 631 | CESU-8 badness per line being fixed 632 | - Add a fixer that removes unassigned characters that can break Python 3.3 633 | (http://bugs.python.org/issue18183) 634 | 635 | 636 | ## Version 3.0 (August 26, 2013) 637 | 638 | - Generally runs faster 639 | - Idempotent 640 | - Simplified decoding logic 641 | - Understands more encodings and more kinds of mistakes 642 | - Takes options that enable or disable particular normalization steps 643 | - Long line handling: now the time-consuming step (`fix_text_encoding`) will be 644 | consistently skipped on long lines, but all other fixes will apply 645 | - Tested on millions of examples from Twitter, ensuring a near-zero rate of 646 | false positives 647 | 648 | 649 | ## Version 2.0.2 (June 20, 2013) 650 | 651 | - Fix breaking up of long lines, so it can't go into an infinite loop 652 | 653 | 654 | ## Version 2.0.1 (March 19, 2013) 655 | 656 | - Restored Python 2.6 support 657 | 658 | 659 | ## Version 2.0 (January 30, 2013) 660 | 661 | - Python 3 support 662 | - Use fast Python built-ins to speed up fixes 663 | - Bugfixes 664 | 665 | 666 | ## Version 1.0 (August 24, 2012) 667 | 668 | - Made into its own package with no dependencies, instead of a part of 669 | `metanl` 670 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2023 Robyn Speer 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft tests 2 | 3 | include *.md 4 | include *.txt 5 | 6 | global-exclude __pycache__ *.py[cod] 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ftfy: fixes text for you 2 | 3 | [![PyPI package](https://badge.fury.io/py/ftfy.svg)](https://badge.fury.io/py/ftfy) 4 | [![Docs](https://readthedocs.org/projects/ftfy/badge/?version=latest)](https://ftfy.readthedocs.org/en/latest/) 5 | 6 | ```python 7 | 8 | >>> from ftfy import fix_encoding 9 | >>> print(fix_encoding("(à¸‡'âŒ£')à¸‡")) 10 | (ง'⌣')ง 11 | 12 | ``` 13 | 14 | The full documentation of ftfy is available at [ftfy.readthedocs.org](https://ftfy.readthedocs.org).
The documentation covers a lot more than this README, so here are some links into it: 15 | 16 | - [Fixing problems and getting explanations](https://ftfy.readthedocs.io/en/latest/explain.html) 17 | - [Configuring ftfy](https://ftfy.readthedocs.io/en/latest/config.html) 18 | - [Encodings ftfy can handle](https://ftfy.readthedocs.io/en/latest/encodings.html) 19 | - [“Fixer” functions](https://ftfy.readthedocs.io/en/latest/fixes.html) 20 | - [Is ftfy an encoding detector?](https://ftfy.readthedocs.io/en/latest/detect.html) 21 | - [Heuristics for detecting mojibake](https://ftfy.readthedocs.io/en/latest/heuristic.html) 22 | - [Support for “bad” encodings](https://ftfy.readthedocs.io/en/latest/bad_encodings.html) 23 | - [Command-line usage](https://ftfy.readthedocs.io/en/latest/cli.html) 24 | - [Citing ftfy](https://ftfy.readthedocs.io/en/latest/cite.html) 25 | 26 | ## Testimonials 27 | 28 | - “My life is livable again!” 29 | — [@planarrowspace](https://twitter.com/planarrowspace) 30 | - “A handy piece of magic” 31 | — [@simonw](https://twitter.com/simonw) 32 | - “Saved me a large amount of frustrating dev work” 33 | — [@iancal](https://twitter.com/iancal) 34 | - “ftfy did the right thing right away, with no faffing about. Excellent work, solving a very tricky real-world (whole-world!) problem.” 35 | — Brennan Young 36 | - “I have no idea when I’m gonna need this, but I’m definitely bookmarking it.” 37 | — [/u/ocrow](https://reddit.com/u/ocrow) 38 | 39 | ## What it does 40 | 41 | Here are some examples (found in the real world) of what ftfy can do: 42 | 43 | ftfy can fix mojibake (encoding mix-ups), by detecting patterns of characters that were clearly meant to be UTF-8 but were decoded as something else: 44 | 45 | >>> import ftfy 46 | >>> ftfy.fix_text('✔ No problems') 47 | '✔ No problems' 48 | 49 | Does this sound impossible? It's really not. UTF-8 is a well-designed encoding that makes it obvious when it's being misused, and a string of mojibake usually contains all the information we need to recover the original string. 50 | 51 | ftfy can fix multiple layers of mojibake simultaneously: 52 | 53 | >>> ftfy.fix_text('The Mona Lisa doesn’t have eyebrows.') 54 | "The Mona Lisa doesn't have eyebrows." 55 | 56 | It can fix mojibake that has had "curly quotes" applied on top of it, which cannot be consistently decoded until the quotes are uncurled: 57 | 58 | >>> ftfy.fix_text("l’humanité") 59 | "l'humanité" 60 | 61 | ftfy can fix mojibake that would have included the character U+A0 (non-breaking space), but the U+A0 was turned into an ASCII space and then combined with another following space: 62 | 63 | >>> ftfy.fix_text('Ã\xa0 perturber la réflexion') 64 | 'à perturber la réflexion' 65 | >>> ftfy.fix_text('Ã perturber la réflexion') 66 | 'à perturber la réflexion' 67 | 68 | ftfy can also decode HTML entities that appear outside of HTML, even in cases where the entity has been incorrectly capitalized: 69 | 70 | >>> # by the HTML 5 standard, only 'P&Eacute;REZ' is acceptable 71 | >>> ftfy.fix_text('P&EACUTE;REZ') 72 | 'PÉREZ' 73 | 74 | These fixes are not applied in all cases, because ftfy has a strongly-held goal of avoiding false positives -- it should never change correctly-decoded text to something else. 75 | 76 | The following text could be encoded in Windows-1252 and decoded in UTF-8, and it would decode as 'MARQUɅ'. However, the original text is already sensible, so it is unchanged.
77 | 78 | >>> ftfy.fix_text('IL Y MARQUÉ…') 79 | 'IL Y MARQUÉ…' 80 | 81 | ## Installing 82 | 83 | ftfy is a Python 3 package that can be installed using `pip` or `uv pip`: 84 | 85 | pip install ftfy 86 | 87 | (Or use `pip3 install ftfy` on systems where Python 2 and 3 are both globally installed and `pip` refers to Python 2.) 88 | 89 | If you use `poetry`, you can use ftfy as a dependency in the usual way (such as `poetry add ftfy`). 90 | 91 | ### Local development 92 | 93 | ftfy is developed using [uv](https://github.com/astral-sh/uv). You can build a virtual environment with its local dependencies by running `uv venv`, and test it with `uv run pytest`. 94 | 95 | ## Who maintains ftfy? 96 | 97 | I'm Robyn Speer, also known as Elia Robyn Lake. You can find my projects 98 | [on GitHub](https://github.com/rspeer) and my posts on [my own blog](https://posts.arborelia.net). 99 | 100 | ## Citing ftfy 101 | 102 | ftfy has been used as a crucial data processing step in major NLP research. 103 | 104 | It's important to give credit appropriately to everyone whose work you build on in research. This includes software, not just high-status contributions such as mathematical models. All I ask when you use ftfy for research is that you cite it. 105 | 106 | ftfy has a citable record [on Zenodo](https://zenodo.org/record/2591652). A citation of ftfy may look like this: 107 | 108 | Robyn Speer. (2019). ftfy (Version 5.5). Zenodo. 109 | http://doi.org/10.5281/zenodo.2591652 110 | 111 | In BibTeX format, the citation is:: 112 | 113 | @misc{speer-2019-ftfy, 114 | author = {Robyn Speer}, 115 | title = {ftfy}, 116 | note = {Version 5.5}, 117 | year = 2019, 118 | howpublished = {Zenodo}, 119 | doi = {10.5281/zenodo.2591652}, 120 | url = {https://doi.org/10.5281/zenodo.2591652} 121 | } 122 | 123 | ## Important license clarifications 124 | 125 | If you do not follow ftfy's license, you do not have a license to ftfy. 126 | 127 | This sounds obvious and tautological, but there are people who think open source licenses mean that they can just do what they want, especially in the field of generative AI. It's a permissive license but you still have to follow it. The [Apache license](https://www.apache.org/licenses/LICENSE-2.0) is the only thing that gives you permission to use and copy ftfy; otherwise, all rights are reserved. 128 | 129 | If you use or distribute ftfy, you must follow the terms of the [Apache license](https://www.apache.org/licenses/LICENSE-2.0), including that you must attribute the author of ftfy (Robyn Speer) correctly. 130 | 131 | You _may not_ make a derived work of ftfy that obscures its authorship, such as by putting its code in an AI training dataset, including the code in AI training at runtime, or using a generative AI that copies code from such a dataset. 132 | 133 | At my discretion, I may notify you of a license violation, and give you a chance to either remedy it or delete all copies of ftfy in your possession. 134 | 135 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ftfy.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ftfy.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/ftfy" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ftfy" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. 
The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | @import url('https://fonts.googleapis.com/css2?family=Inconsolata:wght@400;700&family=Source+Sans+Pro:ital,wght@0,400;0,700;1,400;1,700&display=swap'); 2 | -------------------------------------------------------------------------------- /docs/avoid.rst: -------------------------------------------------------------------------------- 1 | How can I avoid producing mojibake? 2 | =================================== 3 | 4 | Read the Python Unicode HOWTO 5 | ----------------------------- 6 | 7 | The `Python Unicode HOWTO`_ is a useful introduction to how to use Unicode correctly in Python. If you find yourself confused about the difference between bytes and characters, or you need to unlearn bad habits from Python 2, it's a great place to start. 8 | 9 | .. _`Python Unicode HOWTO`: https://docs.python.org/3/howto/unicode.html 10 | 11 | Assume UTF-8 12 | ------------ 13 | 14 | **Assume text is in UTF-8** unless you have a specific reason to believe it isn't. 15 | 16 | In the 2020s, `UTF-8 is everywhere`_, especially in text meant to be transferred over the Internet. Most mojibake comes from decoding correct UTF-8 as if it were some other encoding. 17 | 18 | .. _`UTF-8 is everywhere`: http://utf8everywhere.org/ 19 | 20 | In Python 3, you should use the Unicode string type (`str`) for all operations. You should open text files in UTF-8:: 21 | 22 | openfile = open(filename, encoding='utf-8', errors='replace') 23 | 24 | When you are specifically working with bytes and you need to turn them into text, you should decode them as UTF-8:: 25 | 26 | text = bytebuffer.decode('utf-8', 'replace') 27 | 28 | The exceptions, the cases where you're not using UTF-8, are few but relevant. If you're interacting with C APIs, you'll need to represent your text as bytes in the format the API expects. Windows APIs in particular expect UTF-16. 29 | 30 | We're mostly past the dark days when encodings were "character maps" of 256 possible characters, one byte per character. An unfortunate thing that keeps them alive is Microsoft Excel, whose "Export" feature will pick a 256-character encoding *based on your computer's operating system and default language*. So: 31 | 32 | Don't export CSVs from Excel 33 | ---------------------------- 34 | 35 | I know that I'm telling you not to do something that may seem like a requirement of doing your job. But **don't export CSV files from Excel** if you have any other choice. Though Excel CSVs look right on basic ASCII characters, on any other text, it either won't work or won't do what you want. Excel CSVs aren't even interoperable between different computers. 36 | 37 | My recommendation is to use Google Sheets to create CSVs, and keep Excel files in .xlsx format so the Unicode won't be mangled. 38 | 39 | If you must export a CSV-like file from Excel, you can find an option to tell Excel to export in "Unicode Text", and it will create a tab-separated UTF-16 file. This is not a very widely-used format, but at least it's not mojibake. 40 | 41 | You can follow these `unwieldy directions from a SalesForce help article`_ to use Excel and Notepad to create a UTF-8 CSV. 
You can see why I don't recommend this process. 42 | 43 | .. _`unwieldy directions from a SalesForce help article`: https://help.salesforce.com/articleView?id=000324657&type=1&mode=1 44 | 45 | Don't use chardet 46 | ----------------- 47 | 48 | Encoding detection on raw bytes is not a good idea. It was important in the '90s, during the rough transition to Unicode -- and the most popular way of doing it, ``chardet``, hasn't changed since the '90s. 49 | 50 | A heuristic designed before there was multilingual social media, before there were emoji, is not going to work correctly in the 2020s. 51 | 52 | When chardet sees the *correct* UTF-8 encoding of an emoji, it will have no idea what it's looking at, because it won't match anything in its training data. Often, it will guess that it's Turkish encoded in Windows-1254. On other reasonable text, it will guess the "iso-8859-2" encoding, an encoding that you'd very rarely see used intentionally. Because the problem is inherent to the design of chardet, it's not easily fixed. 53 | 54 | chardet was built on the assumption that "encoding detection is language detection", which is no longer true. Web sites now contain text in multiple languages, and for the most part they use UTF-8 regardless of the language. 55 | 56 | I've strengthened my recommendation from "don't trust chardet's output" to "don't use chardet", because there's no realistic way to use chardet without trusting its output. We've reached a situation where major Python packages such as ``requests`` assume that chardet is correct, and yet the changing nature of text means that chardet is more wrong with each passing year. 57 | 58 | So how should you interpret raw bytes of text if you're not told what encoding they're in? As UTF-8. Text is UTF-8 until proven otherwise. 59 | 60 | ASCII isn't extended 61 | -------------------- 62 | 63 | A sign that something is about to go wrong with encodings is if a developer is talking about "extended ASCII". 64 | 65 | ASCII is a set of 128 character codes (95 of them displayable). It has not had any new characters added to it since the backslash was added in 1967. 66 | 67 | Because ASCII is a 7-bit encoding but our computers use 8-bit bytes, it seems clear that ASCII *could* be extended to assign a meaning to all 256 possible bytes. There are many different encodings that have done so, and they're all incompatible with one another, which is why treating bytes as characters is a bad idea and why we have Unicode now. 68 | 69 | Many developers refer to one of these encodings as "extended ASCII", whose colloquial meaning is "the encoding of 256 characters that I learned first". Its meaning is completely dependent on the country you were in and the operating system you were using when you started programming: 70 | 71 | - My "extended ASCII" when I learned to program was IBM codepage 437, the one that was used in US versions of MS-DOS. 72 | - To many people, "extended ASCII" is Windows codepage 1252, which they'd find in the Character Map of their Windows 9x computer, at least if they were in North America or Western Europe. 73 | - To others in other countries, it could be a different Windows codepage, such as 1251 (which contains Cyrillic letters) or 1250 (which contains a different set of accented letters for Eastern European languages). 74 | - Or it might be Latin-1, the common name for the ISO-8859-1 standard that became the first 256 characters of Unicode.
Latin-1 is easy to implement by accident, such as when you see byte C2 and assume it means Unicode codepoint U+00C2 -- what you get by incorrectly running `chr()` on each byte. 75 | 76 | "Extended ASCII" doesn't specify which encoding you mean, and often indicates that you don't realize that different people are thinking of different sets of 256 characters. 77 | 78 | Instead of "extended ASCII", say the name of the encoding such as "Latin-1", "Windows-1252", "Windows-1250", "codepage 437", or maybe "I don't know what it is but it looks right on my machine". 79 | 80 | And then revise things so that you use UTF-8, which is still a superset of ASCII but can represent every Unicode character. 81 | -------------------------------------------------------------------------------- /docs/bad_encodings.rst: -------------------------------------------------------------------------------- 1 | Support for "bad" encodings 2 | =========================== 3 | 4 | .. automodule:: ftfy.bad_codecs 5 | 6 | "Sloppy" encodings 7 | ------------------ 8 | .. automodule:: ftfy.bad_codecs.sloppy 9 | 10 | Variants of UTF-8 11 | ----------------- 12 | .. automodule:: ftfy.bad_codecs.utf8_variants 13 | 14 | -------------------------------------------------------------------------------- /docs/cite.rst: -------------------------------------------------------------------------------- 1 | .. _cite: 2 | 3 | Citing ftfy 4 | =========== 5 | ftfy has been used as a data processing step in major NLP research, including OpenAI's original GPT. 6 | 7 | It's important to give credit appropriately to everyone whose work you build on in research. This includes software, not just high-status contributions such as mathematical models. All I ask when you use ftfy for research is that you cite it. 8 | 9 | ftfy has a citable record on `Zenodo`_. A citation of ftfy may look like this: 10 | 11 | Robyn Speer. (2019). ftfy (Version 5.5). Zenodo. 12 | http://doi.org/10.5281/zenodo.2591652 13 | 14 | In BibTeX format, the citation is:: 15 | 16 | @misc{speer-2019-ftfy, 17 | author = {Robyn Speer}, 18 | title = {ftfy}, 19 | note = {Version 5.5}, 20 | year = 2019, 21 | howpublished = {Zenodo}, 22 | doi = {10.5281/zenodo.2591652}, 23 | url = {https://doi.org/10.5281/zenodo.2591652} 24 | } 25 | 26 | .. _Zenodo: https://zenodo.org/record/2591652 27 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Command-line usage 2 | ================== 3 | ftfy can be used from the command line. By default, it takes UTF-8 input and 4 | writes it to UTF-8 output, fixing problems in its Unicode as it goes. 5 | 6 | Here's the usage documentation for the `ftfy` command: 7 | 8 | .. code-block:: text 9 | 10 | usage: ftfy [-h] [-o OUTPUT] [-g] [-e ENCODING] [-n NORMALIZATION] 11 | [--preserve-entities] 12 | [filename] 13 | 14 | ftfy (fixes text for you), version 6.0 15 | 16 | positional arguments: 17 | filename The file whose Unicode is to be fixed. Defaults to -, 18 | meaning standard input. 19 | 20 | optional arguments: 21 | -h, --help show this help message and exit 22 | -o OUTPUT, --output OUTPUT 23 | The file to output to. Defaults to -, meaning standard 24 | output. 25 | -g, --guess Ask ftfy to guess the encoding of your input. This is 26 | risky. Overrides -e. 27 | -e ENCODING, --encoding ENCODING 28 | The encoding of the input. Defaults to UTF-8. 
29 | -n NORMALIZATION, --normalization NORMALIZATION 30 | The normalization of Unicode to apply. Defaults to 31 | NFC. Can be "none". 32 | --preserve-entities Leave HTML entities as they are. The default is to 33 | decode them, as long as no HTML tags have appeared in 34 | the file. 35 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # 2 | # ftfy documentation build configuration file, created by 3 | # sphinx-quickstart on Wed Aug 28 03:18:27 2013. 4 | # 5 | # This file is execfile()d with the current directory set to its containing dir. 6 | # 7 | # Note that not all possible configuration values are present in this 8 | # autogenerated file. 9 | # 10 | # All configuration values have a default; values that are commented out 11 | # serve to show the default. 12 | 13 | 14 | # If extensions (or modules to document with autodoc) are in another directory, 15 | # add these directories to sys.path here. If the directory is relative to the 16 | # documentation root, use os.path.abspath to make it absolute, like shown here. 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | # -- General configuration ----------------------------------------------------- 20 | 21 | # If your documentation needs a minimal Sphinx version, state it here. 22 | # needs_sphinx = '1.0' 23 | 24 | # Add any Sphinx extension module names here, as strings. They can be extensions 25 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 26 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"] 27 | 28 | # Add any paths that contain templates here, relative to this directory. 29 | templates_path = ["_templates"] 30 | 31 | # The suffix of source filenames. 32 | source_suffix = ".rst" 33 | 34 | # The encoding of source files. 35 | # source_encoding = 'utf-8-sig' 36 | 37 | # The master toctree document. 38 | master_doc = "index" 39 | 40 | # General information about the project. 41 | project = "ftfy" 42 | copyright = "2024, Robyn Speer" 43 | 44 | # The version info for the project you're documenting, acts as replacement for 45 | # |version| and |release|, also used in various other places throughout the 46 | # built documents. 47 | # 48 | # The short X.Y version. 49 | version = "6.3" 50 | # The full version, including alpha/beta/rc tags. 51 | release = "6.3.1" 52 | 53 | # The language for content autogenerated by Sphinx. Refer to documentation 54 | # for a list of supported languages. 55 | # language = None 56 | 57 | # There are two options for replacing |today|: either, you set today to some 58 | # non-false value, then it is used: 59 | # today = '' 60 | # Else, today_fmt is used as the format for a strftime call. 61 | # today_fmt = '%B %d, %Y' 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | exclude_patterns = ["_build"] 66 | 67 | # The reST default role (used for this markup: `text`) to use for all documents. 68 | default_role = "code" 69 | 70 | # If true, '()' will be appended to :func: etc. cross-reference text. 71 | # add_function_parentheses = True 72 | 73 | # If true, the current module name will be prepended to all description 74 | # unit titles (such as .. function::). 75 | # add_module_names = True 76 | 77 | # If true, sectionauthor and moduleauthor directives will be shown in the 78 | # output. They are ignored by default. 
79 | # show_authors = False 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = "default" 83 | pygments_dark_style = "monokai" 84 | 85 | # A list of ignored prefixes for module index sorting. 86 | # modindex_common_prefix = [] 87 | 88 | # If true, keep warnings as "system message" paragraphs in the built documents. 89 | # keep_warnings = False 90 | 91 | 92 | # -- Options for HTML output --------------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | html_theme = "furo" 97 | 98 | # Theme options are theme-specific and customize the look and feel of a theme 99 | # further. For a list of options available for each theme, see the 100 | # documentation. 101 | html_theme_options = { 102 | "light_css_variables": { 103 | "color-brand-primary": "#7C4DFF", 104 | "color-brand-content": "#7C4DFF", 105 | "font-stack": "Source Sans Pro, sans-serif", 106 | "font-stack--monospace": "Inconsolata", 107 | "code-font-size": "18px", 108 | # I don't know why furo wants inline code to be so small, but don't let it 109 | "font-size--small--2": "100%", 110 | }, 111 | "dark_css_variables": { 112 | "color-brand-primary": "#AC8DFF", 113 | "color-brand-content": "#AC8DFF", 114 | "font-stack": "Source Sans Pro, sans-serif", 115 | "font-stack--monospace": "Inconsolata", 116 | "code-font-size": "18px", 117 | "font-size--small--2": "100%", 118 | }, 119 | } 120 | html_css_files = [ 121 | "css/custom.css", 122 | ] 123 | 124 | # Add any paths that contain custom themes here, relative to this directory. 125 | # html_theme_path = [] 126 | 127 | # The name for this set of Sphinx documents. If None, it defaults to 128 | # " v documentation". 129 | html_title = "ftfy: fixes text for you" 130 | 131 | # A shorter title for the navigation bar. Default is the same as html_title. 132 | html_short_title = "ftfy" 133 | 134 | # The name of an image file (relative to this directory) to place at the top 135 | # of the sidebar. 136 | # html_logo = None 137 | 138 | # The name of an image file (within the static path) to use as favicon of the 139 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 140 | # pixels large. 141 | # html_favicon = None 142 | 143 | # Add any paths that contain custom static files (such as style sheets) here, 144 | # relative to this directory. They are copied after the builtin static files, 145 | # so a file named "default.css" will overwrite the builtin "default.css". 146 | html_static_path = ["_static"] 147 | 148 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 149 | # using the given strftime format. 150 | html_last_updated_fmt = "%b %d, %Y" 151 | 152 | # If true, SmartyPants will be used to convert quotes and dashes to 153 | # typographically correct entities. 154 | # html_use_smartypants = True 155 | 156 | # Custom sidebar templates, maps document names to template names. 157 | # html_sidebars = {} 158 | 159 | # Additional templates that should be rendered to pages, maps page names to 160 | # template names. 161 | # html_additional_pages = {} 162 | 163 | # If false, no module index is generated. 164 | # html_domain_indices = True 165 | 166 | # If false, no index is generated. 167 | # html_use_index = True 168 | 169 | # If true, the index is split into individual pages for each letter. 170 | # html_split_index = False 171 | 172 | # If true, links to the reST sources are added to the pages. 
173 | html_show_sourcelink = False 174 | 175 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 176 | # html_show_sphinx = True 177 | 178 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 179 | # html_show_copyright = True 180 | 181 | # If true, an OpenSearch description file will be output, and all pages will 182 | # contain a tag referring to it. The value of this option must be the 183 | # base URL from which the finished HTML is served. 184 | # html_use_opensearch = '' 185 | 186 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 187 | # html_file_suffix = None 188 | 189 | # Output file base name for HTML help builder. 190 | htmlhelp_basename = "ftfydoc" 191 | 192 | 193 | # -- Options for LaTeX output -------------------------------------------------- 194 | 195 | latex_elements = { 196 | # The paper size ('letterpaper' or 'a4paper'). 197 | # 'papersize': 'letterpaper', 198 | # The font size ('10pt', '11pt' or '12pt'). 199 | # 'pointsize': '10pt', 200 | # Additional stuff for the LaTeX preamble. 201 | # 'preamble': '', 202 | } 203 | 204 | # Grouping the document tree into LaTeX files. List of tuples 205 | # (source start file, target name, title, author, documentclass [howto/manual]). 206 | latex_documents = [] 207 | 208 | # The name of an image file (relative to this directory) to place at the top of 209 | # the title page. 210 | # latex_logo = None 211 | 212 | # For "manual" documents, if this is true, then toplevel headings are parts, 213 | # not chapters. 214 | # latex_use_parts = False 215 | 216 | # If true, show page references after internal links. 217 | # latex_show_pagerefs = False 218 | 219 | # If true, show URL addresses after external links. 220 | # latex_show_urls = False 221 | 222 | # Documents to append as an appendix to all manuals. 223 | # latex_appendices = [] 224 | 225 | # If false, no module index is generated. 226 | # latex_domain_indices = True 227 | 228 | 229 | # -- Options for manual page output -------------------------------------------- 230 | 231 | # One entry per manual page. List of tuples 232 | # (source start file, name, description, authors, manual section). 233 | man_pages = [("index", "ftfy", "ftfy Documentation", ["Robyn Speer"], 1)] 234 | 235 | # If true, show URL addresses after external links. 236 | # man_show_urls = False 237 | 238 | 239 | # -- Options for Texinfo output ------------------------------------------------ 240 | 241 | # Grouping the document tree into Texinfo files. List of tuples 242 | # (source start file, target name, title, author, 243 | # dir menu entry, description, category) 244 | texinfo_documents = [] 245 | 246 | # Documents to append as an appendix to all manuals. 247 | # texinfo_appendices = [] 248 | 249 | # If false, no module index is generated. 250 | # texinfo_domain_indices = True 251 | 252 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 253 | # texinfo_show_urls = 'footnote' 254 | 255 | # If true, do not generate a @detailmenu in the "Top" node's menu. 256 | # texinfo_no_detailmenu = False 257 | -------------------------------------------------------------------------------- /docs/config.rst: -------------------------------------------------------------------------------- 1 | .. _config: 2 | 3 | Configuring ftfy 4 | ================ 5 | 6 | The main functions of ftfy -- :func:`ftfy.fix_text` and :func:`ftfy.fix_and_explain` -- run text through a sequence of fixes. 
If the text changed, it will run them through again, so that you can be sure the output ends up in a standard form that will be unchanged by ftfy. 7 | 8 | All the fixes are on by default, but you can pass in a configuration object or keyword options to turn them off. Check that the default fixes are appropriate for your use case. For example: 9 | 10 | - You should set `unescape_html` to False if the output is meant to be interpreted as HTML. 11 | 12 | - You should set `fix_character_width` to False if you want to preserve the spacing of CJK text. 13 | 14 | - You should set `uncurl_quotes` to False if you want to preserve quotation marks with nice typography. You could even consider doing the opposite of `uncurl_quotes`, running `smartypants`_ on the result to make all the punctuation typographically nice. 15 | 16 | - To be cautious and only fix mojibake when it can be fixed with a consistent sequence of encoding and decoding steps, you should set `decode_inconsistent_utf8` to False. 17 | 18 | .. _smartypants: http://pythonhosted.org/smartypants/ 19 | 20 | If the only fix you need is to detect and repair decoding errors (mojibake), use the :func:`ftfy.fix_encoding` function directly. However, note that mojibake is often entangled with other issues such as the curliness of quotation marks, so limiting the process to this step might make some mojibake unfixable. 21 | 22 | The TextFixerConfig object 23 | -------------------------- 24 | 25 | The top-level functions of ftfy take a `config` argument that is an instance of :class:`ftfy.TextFixerConfig`. If this argument is None, the configuration will use its default values. 26 | 27 | .. autoclass:: ftfy.TextFixerConfig() 28 | 29 | Keyword arguments 30 | ----------------- 31 | The top-level functions also accept keyword arguments in place of a `config` argument. Given these keyword arguments, they will pass them to the :class:`ftfy.TextFixerConfig` constructor, overriding the default values of those configuration options. 32 | -------------------------------------------------------------------------------- /docs/detect.rst: -------------------------------------------------------------------------------- 1 | Is ftfy an encoding detector? 2 | ============================= 3 | 4 | No, it's a mojibake detector (and fixer). That makes its task much easier, because it doesn't have to guess the encoding of everything: it can leave correct-looking text as it is. 5 | 6 | Encoding detectors have ended up being a bad idea, and they are largely responsible for *creating* the problems that ftfy has to fix. 7 | 8 | The text that you put into ftfy should be Unicode that you've attempted to decode correctly. ftfy doesn't accept bytes as input. 9 | 10 | There is a lot of Unicode out there that has already been mangled by mojibake, even when decoded properly. That is, you might correctly interpret the text as UTF-8, and what the UTF-8 text really says is a mojibake string like "réflexion" that needs to be decoded *again*. This is when you need ftfy. 11 | 12 | 13 | I really need to guess the encoding of some bytes 14 | ------------------------------------------------- 15 | 16 | I understand. Sometimes we can't have nice things. 17 | 18 | Though it's not part of the usual operation of ftfy, ftfy *does* contain a byte-encoding-guesser that tries to be less terrible than other byte-encoding-guessers in common cases. Instead of using probabilistic heuristics, it picks up on very strong signals like "having a UTF-16 byte-order mark" or "decoding successfully as UTF-8". 
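Here's a minimal sketch of what calling it looks like (the exact return value shown is an assumption based on the behavior described above, not a guarantee)::

    >>> import ftfy
    >>> ftfy.guess_bytes(b'caf\xc3\xa9')   # decodes cleanly as UTF-8, so UTF-8 wins
    ('café', 'utf-8')

:func:`ftfy.guess_bytes` returns the decoded text along with the name of the encoding it settled on, so you can inspect the guess instead of silently trusting it.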
19 | 20 | This function won't solve everything. It can't solve everything. In particular, it has no capacity to guess non-Unicode CJK encodings such as Shift-JIS or Big5. 21 | 22 | .. autofunction:: ftfy.guess_bytes 23 | 24 | -------------------------------------------------------------------------------- /docs/encodings.rst: -------------------------------------------------------------------------------- 1 | Encodings ftfy can handle 2 | ========================= 3 | 4 | ftfy can't fix all possible mix-ups. Its goal is to cover the most common encoding mix-ups while keeping false positives to a very low rate. 5 | 6 | ftfy can understand text that was decoded as any of these single-byte encodings: 7 | 8 | - Latin-1 (ISO-8859-1) 9 | - Windows-1250 (cp1250 -- used in Microsoft products in Eastern Europe) 10 | - Windows-1251 (cp1251 -- used in Microsoft products in Russia) 11 | - Windows-1252 (cp1252 -- used in Microsoft products in Western Europe and the Americas) 12 | - Windows-1253 (cp1253 -- used in Microsoft products in Greece) 13 | - Windows-1254 (cp1254 -- used in Microsoft products in Türkiye) 14 | - Windows-1257 (cp1257 -- used in Microsoft products in Baltic countries) 15 | - ISO-8859-2 (which is not quite the same as Windows-1250) 16 | - MacRoman (used on Mac OS 9 and earlier) 17 | - cp437 (it's the "text mode" in your video card firmware) 18 | 19 | when it was actually intended to be decoded as one of these variable-length encodings: 20 | 21 | - UTF-8 22 | - CESU-8 (a common, incorrect implementation of UTF-8) 23 | 24 | It can also understand text that was intended as Windows-1252 but decoded as Latin-1. That's the very common case where things like smart-quotes and bullets turn into single weird control characters. 25 | 26 | However, ftfy cannot understand other mixups between single-byte encodings, because it is extremely difficult to detect which mixup in particular is the one that happened. 27 | 28 | We also can't handle the legacy encodings used for Chinese, Japanese, and Korean, such as ``shift-jis`` and ``gb18030``. See `issue #34`_ for why this is so hard. 29 | 30 | I tried adding support for cp850, the cp437-workalike that supported European languages, but I couldn't find any real examples that it fixed, and it introduced some false positives. 31 | 32 | .. _`issue #34`: https://github.com/rspeer/python-ftfy/issues/34 33 | 34 | Remember that the input to ftfy is Unicode, so it handles actual CJK *text* just fine. It just can't discover that a CJK *encoding* introduced mojibake into the text. 35 | -------------------------------------------------------------------------------- /docs/explain.rst: -------------------------------------------------------------------------------- 1 | Fixing problems and getting explanations 2 | ======================================== 3 | 4 | Ode to a Shipping Label 5 | ----------------------- 6 | 7 | A `poem about mojibake`_, whose original author might be `Carlos Bueno on Facebook`_, shows a shipping label that serves as an excellent example for this section, addressed to the surname `L&AMP;ATILDE;&AMP;SUP3;PEZ`. 8 | 9 | .. _`poem about mojibake`: https://imgur.com/4J7Il0m 10 | .. _`Carlos Bueno on Facebook`: https://www.facebook.com/cmb/posts/619241744770551:0 11 | 12 | .. 
image:: images/shipping-label.png 13 | :width: 600 14 | :alt: A package addressed to a name including "L&AMP;ATILDE;&AMP;SUP3;PEZ" 15 | 16 | We can use ftfy not only to fix the text that was on the label, but to show us what happened to it (like the poem does):: 17 | 18 | >>> from ftfy import fix_and_explain, apply_plan 19 | >>> shipping_label = "L&AMP;ATILDE;&AMP;SUP3;PEZ" 20 | >>> fixed, explanation = fix_and_explain(shipping_label) 21 | >>> fixed 22 | 'LóPEZ' 23 | 24 | >>> explanation 25 | [('apply', 'unescape_html'), 26 | ('apply', 'unescape_html'), 27 | ('apply', 'unescape_html'), 28 | ('encode', 'latin-1'), 29 | ('decode', 'utf-8')] 30 | 31 | The capitalization is inconsistent because the encoding of a lowercase "ó" is in there, but everything was printed in capital letters. 32 | 33 | The explanation may even be able to be applied to different text with the same problem:: 34 | 35 | >>> label2 = "CARR&AMP;ATILDE;&AMP;COPY;" 36 | >>> apply_plan(label2, explanation) 37 | 'CARRé' 38 | 39 | Functions that fix text 40 | ----------------------- 41 | 42 | The function that you'll probably use most often is :func:`ftfy.fix_text`, which applies all the fixes it can to every line of text, and returns the fixed text. 43 | 44 | .. autofunction:: ftfy.fix_text 45 | 46 | :func:`ftfy.fix_and_explain` takes the same arguments as :func:`ftfy.fix_text`, but provides an explanation, like we saw in the first section. 47 | 48 | .. autofunction:: ftfy.fix_and_explain 49 | 50 | Unlike :func:`ftfy.fix_text`, :func:`ftfy.fix_and_explain` doesn't separate the text into lines that it fixes separately -- because it's looking for a unified explanation of what happened to the text, not a different one for each line. 51 | 52 | A more targeted function is :func:`ftfy.fix_encoding_and_explain`, which only fixes problems that can be solved by encoding and decoding the text, not other problems such as HTML entities: 53 | 54 | .. autofunction:: ftfy.fix_encoding_and_explain 55 | 56 | This function has a counterpart that returns just the fixed string, without the explanation. It still fixes the string as a whole, not line by line. 57 | 58 | .. autofunction:: ftfy.fix_encoding 59 | 60 | The return type of the `..._and_explain` functions is a kind of NamedTuple called `ExplainedText`: 61 | 62 | .. autoclass:: ftfy.ExplainedText 63 | 64 | These explanations can be re-applied to text using :func:`apply_plan`: 65 | 66 | .. autofunction:: ftfy.apply_plan 67 | 68 | Showing the characters in a string 69 | ---------------------------------- 70 | 71 | A different kind of explanation you might need is simply a breakdown of what Unicode characters a string contains. For this, ftfy provides a utility function, :func:`ftfy.explain_unicode()`. 72 | 73 | .. autofunction:: ftfy.explain_unicode 74 | 75 | A command-line utility that provides similar information, and even more detail, is lunasorcery's `utf8info`_. 76 | 77 | .. _`utf8info`: https://github.com/lunasorcery/utf8info -------------------------------------------------------------------------------- /docs/fixes.rst: -------------------------------------------------------------------------------- 1 | "Fixer" functions 2 | ================= 3 | 4 | .. 
automodule:: ftfy.fixes 5 | :members: decode_escapes, decode_inconsistent_utf8, fix_c1_controls, 6 | fix_character_width, fix_latin_ligatures, fix_line_breaks, 7 | fix_surrogates, remove_control_chars, remove_terminal_escapes, 8 | replace_lossy_sequences, restore_byte_a0, uncurl_quotes, 9 | unescape_html 10 | -------------------------------------------------------------------------------- /docs/heuristic.rst: -------------------------------------------------------------------------------- 1 | .. _heuristic: 2 | 3 | Heuristics for detecting mojibake 4 | ================================= 5 | 6 | The "badness" heuristic 7 | ----------------------- 8 | 9 | .. automodule:: ftfy.badness 10 | :members: badness, is_bad 11 | 12 | 13 | The "UTF-8 detector" heuristic 14 | ------------------------------ 15 | A more narrow heuristic is defined in ``chardata.py`` as ``UTF8_DETECTOR_RE``. This heuristic looks for specific sequences of mojibake characters that come from the decoding of common UTF-8 sequences. 16 | 17 | Text that matches this regular expression can be partially fixed by :func:`ftfy.fixes.decode_inconsistent_utf8`, even when the string as a whole doesn't decode consistently. 18 | 19 | Because of this, the expression requires that the match isn't preceded by likely UTF-8 characters -- if this were allowed, then it might pick two or three characters out of a larger mess of mojibake to decode as another character while leaving the rest untouched. This makes the problem more confusing, doesn't really solve anything, and can even pile up recursively to decode as entirely arbitrary characters. 20 | 21 | 22 | The "lossy UTF-8" heuristic 23 | ---------------------------- 24 | ``chardata.py`` also includes ``LOSSY_UTF8_RE``, which is used similarly to the "UTF-8 detector" heuristic. This regular expression matches sequences that look like they were incorrectly decoded from UTF-8, but with characters replaced by question marks or the Unicode replacement character `�`. 25 | 26 | Characters that match this heuristic will be replaced by `�` in the :func:`ftfy.fixes.replace_lossy_sequences` fixer. -------------------------------------------------------------------------------- /docs/images/shipping-label.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/python-ftfy/74dd0452b48286a3770013b3a02755313bd5575e/docs/images/shipping-label.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ftfy: fixes text for you 2 | ======================== 3 | 4 | *Version 6.3* 5 | 6 | “Assume all external input is the result of (a series of) bugs.” 7 | — `RFC 9225`_: Software Defects Considered Harmful 8 | 9 | .. _`RFC 9225`: https://www.rfc-editor.org/rfc/rfc9225.html 10 | 11 | **ftfy** fixes Unicode that's broken in various ways. 12 | 13 | The goal of ftfy is to **take in bad Unicode and output good Unicode**, for use in your Unicode-aware code. 14 | 15 | This is different from taking in non-Unicode and outputting Unicode, which is not a goal of ftfy. It also isn't designed to protect you from having to write Unicode-aware code. ftfy helps those who help themselves. 16 | 17 | Of course you're better off if your input is decoded properly and has no glitches. But you often don't have any control over your input; it's someone else's mistake, but it's your problem now. 
ftfy will do everything it can to fix the problem. 18 | 19 | ftfy is a heuristic that was designed (not machine-learned) by Robyn Speer. If you use ftfy in research, including pre-processing your language model data, you need to cite it: see :ref:`cite`. 20 | 21 | .. toctree:: 22 | :maxdepth: 1 23 | 24 | explain 25 | config 26 | encodings 27 | fixes 28 | detect 29 | avoid 30 | heuristic 31 | bad_encodings 32 | cli 33 | cite 34 | 35 | 36 | Some quick examples 37 | ------------------- 38 | 39 | Here are some examples (found in the real world) of what ftfy can do: 40 | 41 | ftfy can fix mojibake (encoding mix-ups), by detecting patterns of characters that were clearly meant to be UTF-8 but were decoded as something else: 42 | 43 | >>> import ftfy 44 | >>> ftfy.fix_text('✔ No problems') 45 | '✔ No problems' 46 | 47 | Does this sound impossible? It's really not. UTF-8 is a well-designed encoding that makes it obvious when it's being misused, and a string of mojibake usually contains all the information we need to recover the original string. 48 | 49 | ftfy can fix multiple layers of mojibake simultaneously: 50 | 51 | >>> ftfy.fix_text('The Mona Lisa doesn’t have eyebrows.') 52 | "The Mona Lisa doesn't have eyebrows." 53 | 54 | It can fix mojibake that has had "curly quotes" applied on top of it, which cannot be consistently decoded until the quotes are uncurled: 55 | 56 | >>> ftfy.fix_text("l’humanité") 57 | "l'humanité" 58 | 59 | ftfy can fix mojibake that would have included the character U+A0 (non-breaking space), but the U+A0 was turned into an ASCII space and then combined with another following space: 60 | 61 | >>> ftfy.fix_text('Ã\xa0 perturber la réflexion') 62 | 'à perturber la réflexion' 63 | >>> ftfy.fix_text('à perturber la réflexion') 64 | 'à perturber la réflexion' 65 | 66 | ftfy can also decode HTML entities that appear outside of HTML, even in cases where the entity has been incorrectly capitalized: 67 | 68 | >>> # by the HTML 5 standard, only 'PÉREZ' is acceptable 69 | >>> ftfy.fix_text('P&EACUTE;REZ') 70 | 'PÉREZ' 71 | 72 | These fixes are not applied in all cases, because ftfy has a strongly-held goal of avoiding false positives -- it should never change correctly-decoded text to something else. 73 | 74 | The following text could be encoded in Windows-1252 and decoded in UTF-8, and it would decode as 'MARQUɅ'. However, the original text is already sensible, so it is unchanged. 75 | 76 | >>> ftfy.fix_text('IL Y MARQUÉ…') 77 | 'IL Y MARQUÉ…' 78 | -------------------------------------------------------------------------------- /ftfy/bad_codecs/__init__.py: -------------------------------------------------------------------------------- 1 | r""" 2 | The `ftfy.bad_codecs` module gives Python the ability to decode some common, 3 | flawed encodings. 4 | 5 | Python does not want you to be sloppy with your text. Its encoders and decoders 6 | ("codecs") follow the relevant standards whenever possible, which means that 7 | when you get text that *doesn't* follow those standards, you'll probably fail 8 | to decode it. Or you might succeed at decoding it for implementation-specific 9 | reasons, which is perhaps worse. 10 | 11 | There are some encodings out there that Python wishes didn't exist, which are 12 | widely used outside of Python: 13 | 14 | - "utf-8-variants", a family of not-quite-UTF-8 encodings, including the 15 | ever-popular CESU-8 and "Java modified UTF-8". 
16 | - "Sloppy" versions of character map encodings, where bytes that don't map to 17 | anything will instead map to the Unicode character with the same number. 18 | 19 | Simply importing this module, or in fact any part of the `ftfy` package, will 20 | make these new "bad codecs" available to Python through the standard Codecs 21 | API. You never have to actually call any functions inside `ftfy.bad_codecs`. 22 | 23 | However, if you want to call something because your code checker insists on it, 24 | you can call ``ftfy.bad_codecs.ok()``. 25 | 26 | A quick example of decoding text that's encoded in CESU-8: 27 | 28 | >>> import ftfy.bad_codecs 29 | >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants')) 30 | 😍 31 | """ 32 | 33 | import codecs 34 | from encodings import normalize_encoding 35 | from typing import Optional 36 | 37 | _CACHE: dict[str, codecs.CodecInfo] = {} 38 | 39 | # Define some aliases for 'utf-8-variants'. All hyphens get turned into 40 | # underscores, because of `normalize_encoding`. 41 | UTF8_VAR_NAMES = ( 42 | "utf_8_variants", 43 | "utf8_variants", 44 | "utf_8_variant", 45 | "utf8_variant", 46 | "utf_8_var", 47 | "utf8_var", 48 | "cesu_8", 49 | "cesu8", 50 | "java_utf_8", 51 | "java_utf8", 52 | ) 53 | 54 | 55 | def search_function(encoding: str) -> Optional[codecs.CodecInfo]: 56 | """ 57 | Register our "bad codecs" with Python's codecs API. This involves adding 58 | a search function that takes in an encoding name, and returns a codec 59 | for that encoding if it knows one, or None if it doesn't. 60 | 61 | The encodings this will match are: 62 | 63 | - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N', 64 | where the non-sloppy version is an encoding that leaves some bytes 65 | unmapped to characters. 66 | - The 'utf-8-variants' encoding, which has the several aliases seen 67 | above. 68 | """ 69 | if encoding in _CACHE: 70 | return _CACHE[encoding] 71 | 72 | norm_encoding = normalize_encoding(encoding) 73 | codec = None 74 | if norm_encoding in UTF8_VAR_NAMES: 75 | from ftfy.bad_codecs.utf8_variants import CODEC_INFO 76 | 77 | codec = CODEC_INFO 78 | elif norm_encoding.startswith("sloppy_"): 79 | from ftfy.bad_codecs.sloppy import CODECS 80 | 81 | codec = CODECS.get(norm_encoding) 82 | 83 | if codec is not None: 84 | _CACHE[encoding] = codec 85 | 86 | return codec 87 | 88 | 89 | def ok() -> None: 90 | """ 91 | A feel-good function that gives you something to call after importing 92 | this package. 93 | 94 | Why is this here? Pyflakes. Pyflakes gets upset when you import a module 95 | and appear not to use it. It doesn't know that you're using it when 96 | you use the ``unicode.encode`` and ``bytes.decode`` methods with certain 97 | encodings. 98 | """ 99 | 100 | 101 | codecs.register(search_function) 102 | -------------------------------------------------------------------------------- /ftfy/bad_codecs/sloppy.py: -------------------------------------------------------------------------------- 1 | r""" 2 | `ftfy.bad_codecs.sloppy` provides character-map encodings that fill their "holes" 3 | in a messy but common way: by outputting the Unicode codepoints with the same 4 | numbers. 5 | 6 | This is incredibly ugly, and it's also in the HTML5 standard. 7 | 8 | A single-byte encoding maps each byte to a Unicode character, except that some 9 | bytes are left unmapped. In the commonly-used Windows-1252 encoding, for 10 | example, bytes 0x81 and 0x8D, among others, have no meaning. 
11 | 12 | Python, wanting to preserve some sense of decorum, will handle these bytes 13 | as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're 14 | different from each other. It just hasn't defined what they are in terms of 15 | Unicode. 16 | 17 | Software that has to interoperate with Windows-1252 and Unicode -- such as all 18 | the common Web browsers -- will pick some Unicode characters for them to map 19 | to, and the characters they pick are the Unicode characters with the same 20 | numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the 21 | resulting characters tend to fall into a range of Unicode that's set aside for 22 | obsolete Latin-1 control characters anyway. 23 | 24 | These sloppy codecs let Python do the same thing, thus interoperating with 25 | other software that works this way. It defines a sloppy version of many 26 | single-byte encodings with holes. (There is no need for a sloppy version of 27 | an encoding without holes: for example, there is no such thing as 28 | sloppy-iso-8859-2 or sloppy-macroman.) 29 | 30 | The following encodings will become defined: 31 | 32 | - sloppy-windows-1250 (Central European, sort of based on ISO-8859-2) 33 | - sloppy-windows-1251 (Cyrillic) 34 | - sloppy-windows-1252 (Western European, based on Latin-1) 35 | - sloppy-windows-1253 (Greek, sort of based on ISO-8859-7) 36 | - sloppy-windows-1254 (Turkish, based on ISO-8859-9) 37 | - sloppy-windows-1255 (Hebrew, based on ISO-8859-8) 38 | - sloppy-windows-1256 (Arabic) 39 | - sloppy-windows-1257 (Baltic, based on ISO-8859-13) 40 | - sloppy-windows-1258 (Vietnamese) 41 | - sloppy-cp874 (Thai, based on ISO-8859-11) 42 | - sloppy-iso-8859-3 (Maltese and Esperanto, I guess) 43 | - sloppy-iso-8859-6 (different Arabic) 44 | - sloppy-iso-8859-7 (Greek) 45 | - sloppy-iso-8859-8 (Hebrew) 46 | - sloppy-iso-8859-11 (Thai) 47 | 48 | Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be 49 | defined. 50 | 51 | Five of these encodings (`sloppy-windows-1250` through `sloppy-windows-1254`) 52 | are used within ftfy. 53 | 54 | Here are some examples, using :func:`ftfy.explain_unicode` to illustrate how 55 | sloppy-windows-1252 merges Windows-1252 with Latin-1: 56 | 57 | >>> from ftfy import explain_unicode 58 | >>> some_bytes = b'\x80\x81\x82' 59 | >>> explain_unicode(some_bytes.decode('latin-1')) 60 | U+0080 \x80 [Cc] 61 | U+0081 \x81 [Cc] 62 | U+0082 \x82 [Cc] 63 | 64 | >>> explain_unicode(some_bytes.decode('windows-1252', 'replace')) 65 | U+20AC € [Sc] EURO SIGN 66 | U+FFFD � [So] REPLACEMENT CHARACTER 67 | U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK 68 | 69 | >>> explain_unicode(some_bytes.decode('sloppy-windows-1252')) 70 | U+20AC € [Sc] EURO SIGN 71 | U+0081 \x81 [Cc] 72 | U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK 73 | """ 74 | 75 | from __future__ import annotations 76 | 77 | import codecs 78 | from encodings import normalize_encoding 79 | 80 | REPLACEMENT_CHAR = "\ufffd" 81 | 82 | 83 | def make_sloppy_codec(encoding: str) -> codecs.CodecInfo: 84 | """ 85 | Take a codec name, and return a 'sloppy' version of that codec that can 86 | encode and decode the unassigned bytes in that encoding. 87 | 88 | Single-byte encodings in the standard library are defined using some 89 | boilerplate classes surrounding the functions that do the actual work, 90 | `codecs.charmap_decode` and `charmap_encode`. This function, given an 91 | encoding name, *defines* those boilerplate classes. 92 | """ 93 | # Make a bytestring of all 256 possible bytes. 
94 | all_bytes = bytes(range(256)) 95 | 96 | # Get a list of what they would decode to in Latin-1. 97 | sloppy_chars = list(all_bytes.decode("latin-1")) 98 | 99 | # Get a list of what they decode to in the given encoding. Use the 100 | # replacement character for unassigned bytes. 101 | decoded_chars = all_bytes.decode(encoding, errors="replace") 102 | 103 | # Update the sloppy_chars list. Each byte that was successfully decoded 104 | # gets its decoded value in the list. The unassigned bytes are left as 105 | # they are, which gives their decoding in Latin-1. 106 | for i, char in enumerate(decoded_chars): 107 | if char != REPLACEMENT_CHAR: 108 | sloppy_chars[i] = char 109 | 110 | # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute" 111 | # control code, to encode the Unicode replacement character U+FFFD. 112 | sloppy_chars[0x1A] = REPLACEMENT_CHAR 113 | 114 | # Create the data structures that tell the charmap methods how to encode 115 | # and decode in this sloppy encoding. 116 | decoding_table = "".join(sloppy_chars) 117 | encoding_table = codecs.charmap_build(decoding_table) 118 | 119 | # Now produce all the class boilerplate. Look at the Python source for 120 | # `encodings.cp1252` for comparison; this is almost exactly the same, 121 | # except I made it follow pep8. 122 | class Codec(codecs.Codec): 123 | def encode(self, input: str, errors: str | None = "strict") -> tuple[bytes, int]: 124 | return codecs.charmap_encode(input, errors, encoding_table) 125 | 126 | def decode(self, input: bytes, errors: str | None = "strict") -> tuple[str, int]: 127 | return codecs.charmap_decode(input, errors, decoding_table) # type: ignore[arg-type] 128 | 129 | class IncrementalEncoder(codecs.IncrementalEncoder): 130 | def encode(self, input: str, final: bool = False) -> bytes: 131 | return codecs.charmap_encode(input, self.errors, encoding_table)[0] 132 | 133 | class IncrementalDecoder(codecs.IncrementalDecoder): 134 | def decode(self, input: bytes, final: bool = False) -> str: # type: ignore[override] 135 | return codecs.charmap_decode(input, self.errors, decoding_table)[0] # type: ignore[arg-type] 136 | 137 | class StreamWriter(Codec, codecs.StreamWriter): 138 | pass 139 | 140 | class StreamReader(Codec, codecs.StreamReader): 141 | pass 142 | 143 | return codecs.CodecInfo( 144 | name="sloppy-" + encoding, 145 | encode=Codec().encode, 146 | decode=Codec().decode, # type: ignore[arg-type] 147 | incrementalencoder=IncrementalEncoder, 148 | incrementaldecoder=IncrementalDecoder, 149 | streamreader=StreamReader, 150 | streamwriter=StreamWriter, 151 | ) 152 | 153 | 154 | # Define a codec for each incomplete encoding. The resulting CODECS dictionary 155 | # can be used by the main module of ftfy.bad_codecs. 156 | CODECS = {} 157 | INCOMPLETE_ENCODINGS = ( 158 | [f"windows-{num}" for num in range(1250, 1259)] 159 | + [f"iso-8859-{num}" for num in (3, 6, 7, 8, 11)] 160 | + [f"cp{num}" for num in range(1250, 1259)] 161 | + ["cp874"] 162 | ) 163 | 164 | for _encoding in INCOMPLETE_ENCODINGS: 165 | _new_name = normalize_encoding("sloppy-" + _encoding) 166 | CODECS[_new_name] = make_sloppy_codec(_encoding) 167 | -------------------------------------------------------------------------------- /ftfy/bad_codecs/utf8_variants.py: -------------------------------------------------------------------------------- 1 | r""" 2 | This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can 3 | decode text that's been encoded with a popular non-standard version of UTF-8. 
4 | This includes CESU-8, the accidental encoding made by layering UTF-8 on top of 5 | UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for 6 | codepoint 0. 7 | 8 | This is particularly relevant in Python 3, which provides no other way of 9 | decoding CESU-8 [1]_. 10 | 11 | The easiest way to use the codec is to simply import `ftfy.bad_codecs`: 12 | 13 | >>> import ftfy.bad_codecs 14 | >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var') 15 | >>> print(repr(result).lstrip('u')) 16 | 'here comes a null! \x00' 17 | 18 | The codec does not at all enforce "correct" CESU-8. For example, the Unicode 19 | Consortium's not-quite-standard describing CESU-8 requires that there is only 20 | one possible encoding of any character, so it does not allow mixing of valid 21 | UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8 22 | decoder does. 23 | 24 | Characters in the Basic Multilingual Plane still have only one encoding. This 25 | codec still enforces the rule, within the BMP, that characters must appear in 26 | their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`, 27 | instead of just `0x00`, may be used to encode the null character `U+0000`, like 28 | in Java. 29 | 30 | If you encode with this codec, you get legitimate UTF-8. Decoding with this 31 | codec and then re-encoding is not idempotent, although encoding and then 32 | decoding is. So this module won't produce CESU-8 for you. Look for that 33 | functionality in the sister module, "Breaks Text For You", coming approximately 34 | never. 35 | 36 | .. [1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: 37 | first decode the bytes (incorrectly), then encode them, then decode them 38 | again, using UTF-8 as the codec every time. But Python 2 is dead, so use 39 | ftfy instead. 40 | """ 41 | 42 | import codecs 43 | import re 44 | from encodings.utf_8 import ( 45 | IncrementalDecoder as UTF8IncrementalDecoder, 46 | ) 47 | from encodings.utf_8 import ( 48 | IncrementalEncoder as UTF8IncrementalEncoder, 49 | ) 50 | from typing import Callable, Optional 51 | 52 | NAME = "utf-8-variants" 53 | 54 | # This regular expression matches all possible six-byte CESU-8 sequences, 55 | # plus truncations of them at the end of the string. (If any of the 56 | # subgroups matches $, then all the subgroups after it also have to match $, 57 | # as there are no more characters to match.) 58 | CESU8_EXPR = ( 59 | b"(" 60 | b"\xed" 61 | b"([\xa0-\xaf]|$)" 62 | b"([\x80-\xbf]|$)" 63 | b"(\xed|$)" 64 | b"([\xb0-\xbf]|$)" 65 | b"([\x80-\xbf]|$)" 66 | b")" 67 | ) 68 | 69 | CESU8_RE = re.compile(CESU8_EXPR) 70 | 71 | # This expression matches isolated surrogate characters that aren't 72 | # CESU-8, which have to be handled carefully on Python 2. 73 | SURROGATE_EXPR = b"(\xed([\xa0-\xbf]|$)([\x80-\xbf]|$))" 74 | 75 | # This expression matches the Java encoding of U+0, including if it's 76 | # truncated and we need more bytes. 77 | NULL_EXPR = b"(\xc0(\x80|$))" 78 | 79 | # This regex matches cases that we need to decode differently from 80 | # standard UTF-8. 81 | SPECIAL_BYTES_RE = re.compile(b"|".join([NULL_EXPR, CESU8_EXPR, SURROGATE_EXPR])) 82 | 83 | 84 | class IncrementalDecoder(UTF8IncrementalDecoder): 85 | """ 86 | An incremental decoder that extends Python's built-in UTF-8 decoder. 87 | 88 | This encoder needs to take in bytes, possibly arriving in a stream, and 89 | output the correctly decoded text. 
The general strategy for doing this 90 | is to fall back on the real UTF-8 decoder whenever possible, because 91 | the real UTF-8 decoder is way optimized, but to call specialized methods 92 | we define here for the cases the real encoder isn't expecting. 93 | """ 94 | 95 | @staticmethod 96 | def _buffer_decode( # type: ignore[override] 97 | input: bytes, errors: Optional[str], final: bool 98 | ) -> tuple[str, int]: 99 | """ 100 | Decode bytes that may be arriving in a stream, following the Codecs 101 | API. 102 | 103 | `input` is the incoming sequence of bytes. `errors` tells us how to 104 | handle errors, though we delegate all error-handling cases to the real 105 | UTF-8 decoder to ensure correct behavior. `final` indicates whether 106 | this is the end of the sequence, in which case we should raise an 107 | error given incomplete input. 108 | 109 | Returns as much decoded text as possible, and the number of bytes 110 | consumed. 111 | """ 112 | # decoded_segments are the pieces of text we have decoded so far, 113 | # and position is our current position in the byte string. (Bytes 114 | # before this position have been consumed, and bytes after it have 115 | # yet to be decoded.) 116 | decoded_segments = [] 117 | position = 0 118 | while True: 119 | # Use _buffer_decode_step to decode a segment of text. 120 | decoded, consumed = IncrementalDecoder._buffer_decode_step( 121 | input[position:], errors, final 122 | ) 123 | if consumed == 0: 124 | # Either there's nothing left to decode, or we need to wait 125 | # for more input. Either way, we're done for now. 126 | break 127 | 128 | # Append the decoded text to the list, and update our position. 129 | decoded_segments.append(decoded) 130 | position += consumed 131 | 132 | if final: 133 | # _buffer_decode_step must consume all the bytes when `final` is 134 | # true. 135 | assert position == len(input) 136 | 137 | return "".join(decoded_segments), position 138 | 139 | @staticmethod 140 | def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> tuple[str, int]: 141 | """ 142 | There are three possibilities for each decoding step: 143 | 144 | - Decode as much real UTF-8 as possible. 145 | - Decode a six-byte CESU-8 sequence at the current position. 146 | - Decode a Java-style null at the current position. 147 | 148 | This method figures out which step is appropriate, and does it. 149 | """ 150 | # Get a reference to the superclass method that we'll be using for 151 | # most of the real work. 152 | sup = UTF8IncrementalDecoder._buffer_decode 153 | 154 | # Find the next byte position that indicates a variant of UTF-8. 155 | match = SPECIAL_BYTES_RE.search(input) 156 | if match is None: 157 | return sup(input, errors, final) 158 | 159 | cutoff = match.start() 160 | if cutoff > 0: 161 | return sup(input[:cutoff], errors, True) 162 | 163 | # Some byte sequence that we intend to handle specially matches 164 | # at the beginning of the input. 165 | if input.startswith(b"\xc0"): 166 | if len(input) > 1: 167 | # Decode the two-byte sequence 0xc0 0x80. 168 | return "\u0000", 2 169 | if final: 170 | # We hit the end of the stream. Let the superclass method 171 | # handle it. 172 | return sup(input, errors, True) 173 | # Wait to see another byte. 174 | return "", 0 175 | # Decode a possible six-byte sequence starting with 0xed. 
176 | return IncrementalDecoder._buffer_decode_surrogates(sup, input, errors, final) 177 | 178 | @staticmethod 179 | def _buffer_decode_surrogates( 180 | sup: Callable[[bytes, Optional[str], bool], tuple[str, int]], 181 | input: bytes, 182 | errors: Optional[str], 183 | final: bool, 184 | ) -> tuple[str, int]: 185 | """ 186 | When we have improperly encoded surrogates, we can still see the 187 | bits that they were meant to represent. 188 | 189 | The surrogates were meant to encode a 20-bit number, to which we 190 | add 0x10000 to get a codepoint. That 20-bit number now appears in 191 | this form: 192 | 193 | 11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst 194 | 195 | The CESU8_RE above matches byte sequences of this form. Then we need 196 | to extract the bits and assemble a codepoint number from them. 197 | """ 198 | if len(input) < 6: 199 | if final: 200 | # We found 0xed near the end of the stream, and there aren't 201 | # six bytes to decode. Delegate to the superclass method to 202 | # handle it as normal UTF-8. It might be a Hangul character 203 | # or an error. 204 | return sup(input, errors, final) 205 | # We found a surrogate, the stream isn't over yet, and we don't 206 | # know enough of the following bytes to decode anything, so 207 | # consume zero bytes and wait. 208 | return "", 0 209 | if CESU8_RE.match(input): 210 | # Given this is a CESU-8 sequence, do some math to pull out 211 | # the intended 20-bit value, and consume six bytes. 212 | codepoint = ( 213 | ((input[1] & 0x0F) << 16) 214 | + ((input[2] & 0x3F) << 10) 215 | + ((input[4] & 0x0F) << 6) 216 | + (input[5] & 0x3F) 217 | + 0x10000 218 | ) 219 | return chr(codepoint), 6 220 | # This looked like a CESU-8 sequence, but it wasn't one. 221 | # 0xed indicates the start of a three-byte sequence, so give 222 | # three bytes to the superclass to decode as usual. 223 | return sup(input[:3], errors, False) 224 | 225 | 226 | # The encoder is identical to UTF-8. 227 | IncrementalEncoder = UTF8IncrementalEncoder 228 | 229 | 230 | class StreamWriter(codecs.StreamWriter): 231 | @staticmethod 232 | def encode(input: str, errors: str = "strict") -> tuple[bytes, int]: 233 | return IncrementalEncoder(errors).encode(input, final=True), len(input) 234 | 235 | 236 | class StreamReader(codecs.StreamReader): 237 | @staticmethod 238 | def decode(input: bytes, errors: str = "strict") -> tuple[str, int]: 239 | return IncrementalDecoder(errors).decode(input, final=True), len(input) 240 | 241 | 242 | CODEC_INFO = codecs.CodecInfo( 243 | name=NAME, 244 | encode=StreamWriter.encode, 245 | decode=StreamReader.decode, # type: ignore[arg-type] 246 | incrementalencoder=IncrementalEncoder, 247 | incrementaldecoder=IncrementalDecoder, 248 | streamreader=StreamReader, 249 | streamwriter=StreamWriter, 250 | ) 251 | -------------------------------------------------------------------------------- /ftfy/badness.py: -------------------------------------------------------------------------------- 1 | """ 2 | `ftfy.badness` contains a heuristic that detects likely mojibake. 3 | 4 | This heuristic signals to ftfy which segments of text need to be fixed, and 5 | also indicates when the text can stop being fixed. 6 | 7 | The design of this heuristic is that we categorize the approximately 400 8 | Unicode characters that occur in UTF-8 mojibake, specifically the characters 9 | that come from mixing up UTF-8 with the other encodings we support. 
We 10 | identify sequences and contexts of these characters that are much more likely 11 | to be mojibake than intended strings, such as lowercase accented letters 12 | followed immediately by currency symbols. 13 | """ 14 | 15 | import warnings 16 | import re 17 | 18 | 19 | # There are only a few hundred characters that occur in known UTF-8 mojibake, and we can 20 | # characterize them: 21 | 22 | MOJIBAKE_CATEGORIES = { 23 | # Characters that appear in many different contexts. Sequences that contain 24 | # them are not inherently mojibake 25 | "common": ( 26 | "\N{NO-BREAK SPACE}" 27 | "\N{SOFT HYPHEN}" 28 | "\N{MIDDLE DOT}" 29 | "\N{ACUTE ACCENT}" 30 | "\N{EN DASH}" 31 | "\N{EM DASH}" 32 | "\N{HORIZONTAL BAR}" 33 | "\N{HORIZONTAL ELLIPSIS}" 34 | "\N{RIGHT SINGLE QUOTATION MARK}" 35 | ), 36 | # the C1 control character range, which have no uses outside of mojibake anymore 37 | "c1": "\x80-\x9f", 38 | # Characters that are nearly 100% used in mojibake 39 | "bad": ( 40 | "\N{BROKEN BAR}" 41 | "\N{CURRENCY SIGN}" 42 | "\N{DIAERESIS}" 43 | "\N{NOT SIGN}" 44 | "\N{MACRON}" 45 | "\N{CEDILLA}" 46 | "\N{LATIN SMALL LETTER F WITH HOOK}" 47 | "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # it's not a modifier 48 | "\N{CARON}" 49 | "\N{BREVE}" 50 | "\N{OGONEK}" 51 | "\N{SMALL TILDE}" 52 | "\N{DAGGER}" 53 | "\N{DOUBLE DAGGER}" 54 | "\N{PER MILLE SIGN}" 55 | "\N{REVERSED NOT SIGN}" 56 | "\N{LOZENGE}" 57 | "\ufffd" 58 | # Theoretically these would appear in 'numeric' contexts, but when they 59 | # co-occur with other mojibake characters, it's not really ambiguous 60 | "\N{FEMININE ORDINAL INDICATOR}" 61 | "\N{MASCULINE ORDINAL INDICATOR}" 62 | ), 63 | # Characters used in legalese 64 | "law": ( 65 | "\N{PILCROW SIGN}" 66 | "\N{SECTION SIGN}" 67 | ), 68 | "currency": ( 69 | "\N{CENT SIGN}" 70 | "\N{POUND SIGN}" 71 | "\N{YEN SIGN}" 72 | "\N{PESETA SIGN}" 73 | "\N{EURO SIGN}" 74 | ), 75 | "start_punctuation": ( 76 | "\N{INVERTED EXCLAMATION MARK}" 77 | "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}" 78 | "\N{INVERTED QUESTION MARK}" 79 | "\N{COPYRIGHT SIGN}" 80 | "\N{GREEK TONOS}" 81 | "\N{GREEK DIALYTIKA TONOS}" 82 | "\N{LEFT SINGLE QUOTATION MARK}" 83 | "\N{SINGLE LOW-9 QUOTATION MARK}" 84 | "\N{LEFT DOUBLE QUOTATION MARK}" 85 | "\N{DOUBLE LOW-9 QUOTATION MARK}" 86 | "\N{BULLET}" 87 | "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" 88 | "\uf8ff" # OS-specific symbol, usually the Apple logo 89 | ), 90 | "end_punctuation": ( 91 | "\N{REGISTERED SIGN}" 92 | "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" 93 | "\N{DOUBLE ACUTE ACCENT}" 94 | "\N{RIGHT DOUBLE QUOTATION MARK}" 95 | "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" 96 | "\N{TRADE MARK SIGN}" 97 | ), 98 | "numeric": ( 99 | "\N{SUPERSCRIPT TWO}" 100 | "\N{SUPERSCRIPT THREE}" 101 | "\N{SUPERSCRIPT ONE}" 102 | "\N{PLUS-MINUS SIGN}" 103 | "\N{VULGAR FRACTION ONE QUARTER}" 104 | "\N{VULGAR FRACTION ONE HALF}" 105 | "\N{VULGAR FRACTION THREE QUARTERS}" 106 | "\N{MULTIPLICATION SIGN}" 107 | "\N{MICRO SIGN}" 108 | "\N{DIVISION SIGN}" 109 | "\N{FRACTION SLASH}" 110 | "\N{PARTIAL DIFFERENTIAL}" 111 | "\N{INCREMENT}" 112 | "\N{N-ARY PRODUCT}" 113 | "\N{N-ARY SUMMATION}" 114 | "\N{SQUARE ROOT}" 115 | "\N{INFINITY}" 116 | "\N{INTERSECTION}" 117 | "\N{INTEGRAL}" 118 | "\N{ALMOST EQUAL TO}" 119 | "\N{NOT EQUAL TO}" 120 | "\N{IDENTICAL TO}" 121 | "\N{LESS-THAN OR EQUAL TO}" 122 | "\N{GREATER-THAN OR EQUAL TO}" 123 | "\N{NUMERO SIGN}" 124 | ), 125 | # Letters that might be used to make emoticon faces (kaomoji), and 126 | # therefore might need to appear in more 
improbable-looking contexts. 127 | # 128 | # These are concatenated character ranges for use in a regex. I know 129 | # they look like faces themselves. I think expressing the ranges like 130 | # this helps to illustrate why we need to be careful with these 131 | # characters. 132 | "kaomoji": ( 133 | "Ò-Ö" 134 | "Ù-Ü" 135 | "ò-ö" 136 | "ø-ü" 137 | "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" 138 | "\N{LATIN CAPITAL LETTER O WITH MACRON}" 139 | "\N{LATIN CAPITAL LETTER U WITH MACRON}" 140 | "\N{LATIN CAPITAL LETTER U WITH OGONEK}" 141 | "\N{DEGREE SIGN}" 142 | ), 143 | "upper_accented": ( 144 | # LATIN CAPITAL LETTER A WITH GRAVE - LATIN CAPITAL LETTER N WITH TILDE 145 | "\xc0-\xd1" 146 | # skip capital O's and U's that could be used in kaomoji, but 147 | # include Ø because it's very common in Arabic mojibake: 148 | "\N{LATIN CAPITAL LETTER O WITH STROKE}" 149 | "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" 150 | "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" 151 | "\N{LATIN CAPITAL LETTER A WITH BREVE}" 152 | "\N{LATIN CAPITAL LETTER A WITH MACRON}" 153 | "\N{LATIN CAPITAL LETTER A WITH OGONEK}" 154 | "\N{LATIN CAPITAL LETTER C WITH ACUTE}" 155 | "\N{LATIN CAPITAL LETTER C WITH CARON}" 156 | "\N{LATIN CAPITAL LETTER D WITH CARON}" 157 | "\N{LATIN CAPITAL LETTER D WITH STROKE}" 158 | "\N{LATIN CAPITAL LETTER E WITH OGONEK}" 159 | "\N{LATIN CAPITAL LETTER E WITH CARON}" 160 | "\N{LATIN CAPITAL LETTER E WITH MACRON}" 161 | "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}" 162 | "\N{LATIN CAPITAL LETTER G WITH BREVE}" 163 | "\N{LATIN CAPITAL LETTER G WITH CEDILLA}" 164 | "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" 165 | "\N{LATIN CAPITAL LETTER I WITH MACRON}" 166 | "\N{LATIN CAPITAL LETTER K WITH CEDILLA}" 167 | "\N{LATIN CAPITAL LETTER L WITH ACUTE}" 168 | "\N{LATIN CAPITAL LETTER L WITH CARON}" 169 | "\N{LATIN CAPITAL LETTER L WITH STROKE}" 170 | "\N{LATIN CAPITAL LETTER L WITH CEDILLA}" 171 | "\N{LATIN CAPITAL LETTER N WITH ACUTE}" 172 | "\N{LATIN CAPITAL LETTER N WITH CARON}" 173 | "\N{LATIN CAPITAL LETTER N WITH CEDILLA}" 174 | "\N{LATIN CAPITAL LIGATURE OE}" 175 | "\N{LATIN CAPITAL LETTER R WITH CARON}" 176 | "\N{LATIN CAPITAL LETTER S WITH ACUTE}" 177 | "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" 178 | "\N{LATIN CAPITAL LETTER S WITH CARON}" 179 | "\N{LATIN CAPITAL LETTER T WITH CEDILLA}" 180 | "\N{LATIN CAPITAL LETTER T WITH CARON}" 181 | "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}" 182 | "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}" 183 | "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" 184 | "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" 185 | "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" 186 | "\N{LATIN CAPITAL LETTER Z WITH CARON}" 187 | "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" 188 | ), 189 | "lower_accented": ( 190 | "\N{LATIN SMALL LETTER SHARP S}" 191 | # LATIN SMALL LETTER A WITH GRAVE - LATIN SMALL LETTER N WITH TILDE 192 | "\xe0-\xf1" 193 | # skip o's and u's that could be used in kaomoji 194 | "\N{LATIN SMALL LETTER A WITH BREVE}" 195 | "\N{LATIN SMALL LETTER A WITH OGONEK}" 196 | "\N{LATIN SMALL LETTER A WITH MACRON}" 197 | "\N{LATIN SMALL LETTER C WITH ACUTE}" 198 | "\N{LATIN SMALL LETTER C WITH CARON}" 199 | "\N{LATIN SMALL LETTER D WITH CARON}" 200 | "\N{LATIN SMALL LETTER D WITH STROKE}" 201 | "\N{LATIN SMALL LETTER E WITH OGONEK}" 202 | "\N{LATIN SMALL LETTER E WITH CARON}" 203 | "\N{LATIN SMALL LETTER E WITH MACRON}" 204 | "\N{LATIN SMALL LETTER E WITH DOT ABOVE}" 205 | "\N{LATIN SMALL LETTER G WITH BREVE}" 206 | "\N{LATIN SMALL LETTER G WITH CEDILLA}" 207 | "\N{LATIN SMALL LETTER I WITH 
OGONEK}" 208 | "\N{LATIN SMALL LETTER I WITH MACRON}" 209 | "\N{LATIN SMALL LETTER K WITH CEDILLA}" 210 | "\N{LATIN SMALL LETTER L WITH ACUTE}" 211 | "\N{LATIN SMALL LETTER L WITH CARON}" 212 | "\N{LATIN SMALL LETTER L WITH STROKE}" 213 | "\N{LATIN SMALL LETTER L WITH CEDILLA}" 214 | "\N{LATIN SMALL LIGATURE OE}" 215 | "\N{LATIN SMALL LETTER R WITH ACUTE}" 216 | "\N{LATIN SMALL LETTER S WITH ACUTE}" 217 | "\N{LATIN SMALL LETTER S WITH CEDILLA}" 218 | "\N{LATIN SMALL LETTER S WITH CARON}" 219 | "\N{LATIN SMALL LETTER T WITH CARON}" 220 | "\N{LATIN SMALL LETTER U WITH DIAERESIS}" 221 | "\N{LATIN SMALL LETTER Z WITH ACUTE}" 222 | "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" 223 | "\N{LATIN SMALL LETTER Z WITH CARON}" 224 | "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" 225 | "\N{LATIN SMALL LIGATURE FI}" 226 | "\N{LATIN SMALL LIGATURE FL}" 227 | ), 228 | "upper_common": ( 229 | "\N{LATIN CAPITAL LETTER THORN}" 230 | "\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}" 231 | # not included under 'accented' because these can commonly 232 | # occur at ends of words, in positions where they'd be detected 233 | # as mojibake 234 | "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" 235 | "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" 236 | "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" 237 | "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" 238 | "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" 239 | "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" 240 | "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" 241 | "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}" 242 | "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}" 243 | "\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}" 244 | ), 245 | "lower_common": ( 246 | # lowercase thorn does not appear in mojibake 247 | "\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}" 248 | "\N{GREEK SMALL LETTER ALPHA WITH TONOS}" 249 | "\N{GREEK SMALL LETTER EPSILON WITH TONOS}" 250 | "\N{GREEK SMALL LETTER ETA WITH TONOS}" 251 | "\N{GREEK SMALL LETTER IOTA WITH TONOS}" 252 | "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}" 253 | "\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}" 254 | ), 255 | "box": ( 256 | # omit the single horizontal line, might be used in kaomoji 257 | "│┌┐┘├┤┬┼" 258 | "\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}" 259 | "▀▄█▌▐░▒▓" 260 | ), 261 | } 262 | 263 | 264 | # We can now build a regular expression that detects unlikely juxtapositions 265 | # of characters, mostly based on their categories. 266 | # 267 | # Another regular expression, which detects sequences that look more specifically 268 | # like UTF-8 mojibake, appears in chardata.py. 269 | # 270 | # This is a verbose regular expression, with whitespace added for somewhat more 271 | # readability. Remember that the only spaces that count as literal spaces in this 272 | # expression are ones inside character classes (square brackets). 
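# A note on how the template below gets assembled: the placeholders such as
# {c1} and {bad} are filled in by the .format(**MOJIBAKE_CATEGORIES) call at
# the end of the pattern. For example, the first alternative, [{c1}], expands
# to the character class [\x80-\x9f], so a single stray C1 control character
# counts as one match of badness all on its own.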
273 | 274 | BADNESS_RE = re.compile( 275 | r""" 276 | [{c1}] 277 | | 278 | [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}] [{bad}] 279 | | 280 | [a-zA-Z] [{lower_common}{upper_common}] [{bad}] 281 | | 282 | [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}] 283 | | 284 | [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}] 285 | | 286 | [{box}{end_punctuation}{currency}{numeric}] [{lower_accented}] 287 | | 288 | [{lower_accented}{box}{end_punctuation}] [{currency}] 289 | | 290 | \s [{upper_accented}] [{currency}] 291 | | 292 | [{upper_accented}{box}] [{numeric}{law}] 293 | | 294 | [{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}] 295 | | 296 | [{lower_accented}{upper_accented}{currency}{numeric}{box}{law}] [{end_punctuation}] [{start_punctuation}] 297 | | 298 | [{currency}{numeric}{box}] [{start_punctuation}] 299 | | 300 | [a-z] [{upper_accented}] [{start_punctuation}{currency}] 301 | | 302 | [{box}] [{kaomoji}] 303 | | 304 | [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}{law}] [{box}] 305 | | 306 | [{box}] [{end_punctuation}] 307 | | 308 | [{lower_accented}{upper_accented}] [{start_punctuation}{end_punctuation}] \w 309 | | 310 | 311 | # The ligature œ when not followed by an unaccented Latin letter 312 | [Œœ][^A-Za-z] 313 | | 314 | 315 | # Degree signs after capital letters 316 | [{upper_accented}]° 317 | | 318 | 319 | # Common Windows-1252 2-character mojibake that isn't covered by the cases above 320 | [ÂÃÎÐ][€œŠš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´] 321 | | 322 | × [²³] 323 | | 324 | # Windows-1252 mojibake of Arabic words needs to include the 'common' characters. 325 | # To compensate, we require four characters to be matched. 326 | [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] 327 | [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] 328 | | 329 | 330 | # Windows-1252 mojibake that starts 3-character sequences for some South Asian 331 | # alphabets 332 | à[²µ¹¼½¾] 333 | | 334 | 335 | # MacRoman mojibake that isn't covered by the cases above 336 | √[±∂†≠®™´≤≥¥µø] 337 | | 338 | ≈[°¢] 339 | | 340 | ‚Ä[ìîïòôúùû†°¢π] 341 | | 342 | ‚[âó][àä°ê] 343 | | 344 | 345 | # Windows-1251 mojibake of characters in the U+2000 range 346 | †347 | | 348 | 349 | # Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet. 350 | # Because the 2-character sequences involved here may be common, we require 351 | # seeing a 3-character sequence. 352 | [ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС] 353 | | 354 | # A distinctive five-character sequence of Cyrillic letters, which can be 355 | # Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters. 356 | # Require a Latin letter nearby. 
357 | ГўВЂВ.[A-Za-z ] 358 | | 359 | 360 | # Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself 361 | Ã[\xa0¡] 362 | | 363 | [a-z]\s?[ÃÂ][ ] 364 | | 365 | ^[ÃÂ][ ] 366 | | 367 | 368 | # Cases where  precedes a character as an encoding of exactly the same 369 | # character, and the character is common enough 370 | [a-z.,?!{end_punctuation}]  [ {start_punctuation}{end_punctuation}] 371 | | 372 | 373 | # Windows-1253 mojibake of characters in the U+2000 range 374 | β€[™\xa0Ά\xad®°] 375 | | 376 | 377 | # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet 378 | [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ] 379 | | 380 | 381 | # Windows-1257 mojibake of characters in the U+2000 range 382 | †383 | """.format( 384 | **MOJIBAKE_CATEGORIES 385 | ), 386 | re.VERBOSE, 387 | ) 388 | 389 | 390 | def sequence_weirdness(text: str) -> int: 391 | """ 392 | This was the name of the heuristic used in ftfy 2.x through 5.x. As an 393 | attempt at compatibility with external code that calls the heuristic 394 | directly, we redirect to our new heuristic, :func:`badness`. 395 | """ 396 | warnings.warn( 397 | "`sequence_weirdness()` is an old heuristic, and the current " 398 | "closest equivalent is `ftfy.badness.badness()`" 399 | ) 400 | return badness(text) 401 | 402 | 403 | def badness(text: str) -> int: 404 | """ 405 | Get the 'badness' of a sequence of text, counting the number of unlikely 406 | character sequences. A badness greater than 0 indicates that some of it 407 | seems to be mojibake. 408 | """ 409 | return len(BADNESS_RE.findall(text)) 410 | 411 | 412 | def is_bad(text: str) -> bool: 413 | """ 414 | Returns true iff the given text looks like it contains mojibake. 415 | 416 | This can be faster than `badness`, because it returns when the first match 417 | is found to a regex instead of counting matches. Note that as strings get 418 | longer, they have a higher chance of returning True for `is_bad(string)`. 419 | """ 420 | return bool(BADNESS_RE.search(text)) 421 | -------------------------------------------------------------------------------- /ftfy/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | A command-line utility for fixing text found in a file. 3 | """ 4 | 5 | import os 6 | import sys 7 | from pathlib import Path 8 | from typing import Union 9 | 10 | from ftfy import TextFixerConfig, __version__, fix_file 11 | 12 | ENCODE_ERROR_TEXT_UNIX = """ftfy error: 13 | Unfortunately, this output stream does not support Unicode. 14 | 15 | Your system locale may be very old or misconfigured. You should use a locale 16 | that supports UTF-8. One way to do this is to `export LANG=C.UTF-8`. 17 | """ 18 | 19 | ENCODE_ERROR_TEXT_WINDOWS = """ftfy error: 20 | Unfortunately, this output stream does not support Unicode. 21 | 22 | You might be trying to output to the Windows Command Prompt (cmd.exe), which 23 | does not fully support Unicode for historical reasons. In general, we recommend 24 | finding a way to run Python without using cmd.exe. 25 | 26 | You can work around this problem by using the '-o filename' option in ftfy to 27 | output to a file instead. 28 | """ 29 | 30 | DECODE_ERROR_TEXT = """ftfy error: 31 | This input couldn't be decoded as %r. We got the following error: 32 | 33 | %s 34 | 35 | ftfy works best when its input is in a known encoding. You can use `ftfy -g` 36 | to guess, if you're desperate. Otherwise, give the encoding name with the 37 | `-e` option, such as `ftfy -e latin-1`. 
38 | """ 39 | 40 | SAME_FILE_ERROR_TEXT = """ftfy error: 41 | Can't read and write the same file. Please output to a new file instead. 42 | """ 43 | 44 | 45 | def main() -> None: 46 | """ 47 | Run ftfy as a command-line utility. 48 | """ 49 | import argparse 50 | 51 | parser = argparse.ArgumentParser( 52 | description=f"ftfy (fixes text for you), version {__version__}" 53 | ) 54 | parser.add_argument( 55 | "filename", 56 | default="-", 57 | nargs="?", 58 | help="The file whose Unicode is to be fixed. Defaults to -, meaning standard input.", 59 | ) 60 | parser.add_argument( 61 | "-o", 62 | "--output", 63 | type=str, 64 | default="-", 65 | help="The file to output to. Defaults to -, meaning standard output.", 66 | ) 67 | parser.add_argument( 68 | "-g", 69 | "--guess", 70 | action="store_true", 71 | help="Ask ftfy to guess the encoding of your input. This is risky. Overrides -e.", 72 | ) 73 | parser.add_argument( 74 | "-e", 75 | "--encoding", 76 | type=str, 77 | default="utf-8", 78 | help="The encoding of the input. Defaults to UTF-8.", 79 | ) 80 | parser.add_argument( 81 | "-n", 82 | "--normalization", 83 | type=str, 84 | default="NFC", 85 | help='The normalization of Unicode to apply. Defaults to NFC. Can be "none".', 86 | ) 87 | parser.add_argument( 88 | "--preserve-entities", 89 | action="store_true", 90 | help="Leave HTML entities as they are. The default " 91 | "is to decode them, as long as no HTML tags have appeared in the file.", 92 | ) 93 | 94 | args = parser.parse_args() 95 | 96 | encoding = args.encoding 97 | if args.guess: 98 | encoding = None 99 | 100 | if args.filename == "-": 101 | # Get a standard input stream made of bytes, so we can decode it as 102 | # whatever encoding is necessary. 103 | file = sys.stdin.buffer 104 | else: 105 | file = Path(args.filename).open("rb") 106 | 107 | if args.output == "-": 108 | outfile = sys.stdout 109 | else: 110 | if os.path.realpath(args.output) == os.path.realpath(args.filename): 111 | sys.stderr.write(SAME_FILE_ERROR_TEXT) 112 | sys.exit(1) 113 | outfile = Path(args.output).open("w", encoding="utf-8") 114 | 115 | normalization = args.normalization 116 | if normalization.lower() == "none": 117 | normalization = None 118 | 119 | unescape_html: Union[str, bool] 120 | unescape_html = False if args.preserve_entities else "auto" 121 | 122 | config = TextFixerConfig(unescape_html=unescape_html, normalization=normalization) 123 | 124 | try: 125 | for line in fix_file(file, encoding=encoding, config=config): 126 | try: 127 | outfile.write(line) 128 | except UnicodeEncodeError: 129 | if sys.platform == "win32": 130 | sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS) 131 | else: 132 | sys.stderr.write(ENCODE_ERROR_TEXT_UNIX) 133 | sys.exit(1) 134 | except UnicodeDecodeError as err: 135 | sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err)) 136 | sys.exit(1) 137 | 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /ftfy/fixes.py: -------------------------------------------------------------------------------- 1 | """ 2 | The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text` 3 | can perform, and provides the functions that are named in "explanations" 4 | such as the output of :func:`ftfy.fix_and_explain`. 
5 | 6 | Two of these functions are particularly useful on their own, as more robust 7 | versions of functions in the Python standard library: 8 | 9 | - :func:`ftfy.fixes.decode_escapes` 10 | - :func:`ftfy.fixes.unescape_html` 11 | """ 12 | 13 | import codecs 14 | import html 15 | import re 16 | import warnings 17 | from re import Match 18 | from typing import Any 19 | 20 | import ftfy 21 | from ftfy.badness import is_bad 22 | from ftfy.chardata import ( 23 | ALTERED_UTF8_RE, 24 | C1_CONTROL_RE, 25 | CONTROL_CHARS, 26 | DOUBLE_QUOTE_RE, 27 | HTML_ENTITIES, 28 | HTML_ENTITY_RE, 29 | LIGATURES, 30 | LOSSY_UTF8_RE, 31 | SINGLE_QUOTE_RE, 32 | UTF8_DETECTOR_RE, 33 | WIDTH_MAP, 34 | ) 35 | 36 | 37 | def fix_encoding_and_explain(text: str) -> Any: 38 | """ 39 | Deprecated copy of `ftfy.fix_encoding_and_explain()`. 40 | """ 41 | warnings.warn( 42 | "`fix_encoding_and_explain()` has moved to the main module of ftfy.", 43 | DeprecationWarning, 44 | stacklevel=2, 45 | ) 46 | return ftfy.fix_encoding_and_explain(text) 47 | 48 | 49 | def fix_encoding(text: str) -> str: 50 | """ 51 | Deprecated copy of `ftfy.fix_encoding()`. 52 | """ 53 | warnings.warn( 54 | "`fix_encoding()` has moved to the main module of ftfy.", 55 | DeprecationWarning, 56 | stacklevel=2, 57 | ) 58 | return ftfy.fix_encoding(text) 59 | 60 | 61 | def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: 62 | """ 63 | Deprecated copy of `ftfy.apply_plan()`. 64 | """ 65 | warnings.warn( 66 | "`apply_plan()` has moved to the main module of ftfy.", 67 | DeprecationWarning, 68 | stacklevel=2, 69 | ) 70 | return ftfy.apply_plan(text, plan) 71 | 72 | 73 | def _unescape_fixup(match: Match[str]) -> str: 74 | """ 75 | Replace one matched HTML entity with the character it represents, 76 | if possible. 77 | """ 78 | text = match.group(0) 79 | if text in HTML_ENTITIES: 80 | return HTML_ENTITIES[text] 81 | elif text.startswith("&#"): 82 | unescaped: str = html.unescape(text) 83 | 84 | # If html.unescape only decoded part of the string, that's not what 85 | # we want. The semicolon should be consumed. 86 | if ";" in unescaped: 87 | return text 88 | else: 89 | return unescaped 90 | else: 91 | return text 92 | 93 | 94 | def unescape_html(text: str) -> str: 95 | """ 96 | Decode HTML entities and character references, including some nonstandard 97 | ones written in all-caps. 98 | 99 | Python has a built-in called `html.unescape` that can decode HTML escapes, 100 | including a bunch of messy edge cases such as decoding escapes without 101 | semicolons such as "&amp". 102 | 103 | If you know you've got HTML-escaped text, applying `html.unescape` is the 104 | right way to convert it to plain text. But in ambiguous situations, that 105 | would create false positives. For example, the informally written text 106 | "this&not that" should not automatically be decoded as "this¬ that". 107 | 108 | In this function, we decode the escape sequences that appear in the 109 | `html.entities.html5` dictionary, as long as they are the unambiguous ones 110 | that end in semicolons. 111 | 112 | We also decode all-caps versions of Latin letters and common symbols. 113 | If a database contains the name 'P&EACUTE;REZ', we can read that and intuit 114 | that it was supposed to say 'PÉREZ'. This is limited to a smaller set of 115 | entities, because there are many instances where entity names are 116 | case-sensitive in complicated ways.
117 | 118 | >>> unescape_html('&lt;tag&gt;') 119 | '<tag>' 120 | 121 | >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock') 122 | '𝒥ohn ℋancock' 123 | 124 | >>> unescape_html('&checkmark;') 125 | '✓' 126 | 127 | >>> unescape_html('P&eacute;rez') 128 | 'Pérez' 129 | 130 | >>> unescape_html('P&EACUTE;REZ') 131 | 'PÉREZ' 132 | 133 | >>> unescape_html('BUNDESSTRA&SZLIG;E') 134 | 'BUNDESSTRASSE' 135 | 136 | >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;') 137 | 'ñ Ñ Ñ &nTILDE;' 138 | """ 139 | return HTML_ENTITY_RE.sub(_unescape_fixup, text) 140 | 141 | 142 | ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])") 143 | 144 | 145 | def remove_terminal_escapes(text: str) -> str: 146 | r""" 147 | Strip out "ANSI" terminal escape sequences, such as those that produce 148 | colored text on Unix. 149 | 150 | >>> print(remove_terminal_escapes( 151 | ... "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m" 152 | ... )) 153 | I'm blue, da ba dee da ba doo... 154 | """ 155 | return ANSI_RE.sub("", text) 156 | 157 | 158 | def uncurl_quotes(text: str) -> str: 159 | r""" 160 | Replace curly quotation marks with straight equivalents. 161 | 162 | >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d')) 163 | "here's a test" 164 | """ 165 | return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text)) 166 | 167 | 168 | def fix_latin_ligatures(text: str) -> str: 169 | """ 170 | Replace single-character ligatures of Latin letters, such as 'ﬁ', with the 171 | characters that they contain, as in 'fi'. Latin ligatures are usually not 172 | intended in text strings (though they're lovely in *rendered* text). If 173 | you have such a ligature in your string, it is probably a result of a 174 | copy-and-paste glitch. 175 | 176 | We leave ligatures in other scripts alone to be safe. They may be intended, 177 | and removing them may lose information. If you want to take apart nearly 178 | all ligatures, use NFKC normalization. 179 | 180 | >>> print(fix_latin_ligatures("ﬂuffiest")) 181 | fluffiest 182 | """ 183 | return text.translate(LIGATURES) 184 | 185 | 186 | def fix_character_width(text: str) -> str: 187 | """ 188 | The ASCII characters, katakana, and Hangul characters have alternate 189 | "halfwidth" or "fullwidth" forms that help text line up in a grid. 190 | 191 | If you don't need these width properties, you probably want to replace 192 | these characters with their standard form, which is what this function 193 | does. 194 | 195 | Note that this replaces the ideographic space, U+3000, with the ASCII 196 | space, U+20. 197 | 198 | >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ")) 199 | LOUD NOISES 200 | >>> print(fix_character_width("Ｕターン")) # this means "U-turn" 201 | Uターン 202 | """ 203 | return text.translate(WIDTH_MAP) 204 | 205 | 206 | def fix_line_breaks(text: str) -> str: 207 | r""" 208 | Convert all line breaks to Unix style. 209 | 210 | This will convert the following sequences into the standard \\n 211 | line break: 212 | 213 | - CRLF (\\r\\n), used on Windows and in some communication protocols 214 | - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided 215 | software such as Microsoft Office for Mac 216 | - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by 217 | Unicode and used to sow confusion and discord 218 | - NEXT LINE (\\x85), a C1 control character that is certainly not what you 219 | meant 220 | 221 | The NEXT LINE character is a bit of an odd case, because it 222 | usually won't show up if `fix_encoding` is also being run. 223 | \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.
224 | 225 | >>> print(fix_line_breaks( 226 | ... "This string is made of two things:\u2029" 227 | ... "1. Unicode\u2028" 228 | ... "2. Spite" 229 | ... )) 230 | This string is made of two things: 231 | 1. Unicode 232 | 2. Spite 233 | 234 | For further testing and examples, let's define a function to make sure 235 | we can see the control characters in their escaped form: 236 | 237 | >>> def eprint(text): 238 | ... print(text.encode('unicode-escape').decode('ascii')) 239 | 240 | >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi.")) 241 | Content-type: text/plain\n\nHi. 242 | 243 | >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users")) 244 | This is how Microsoft \n trolls Mac users 245 | 246 | >>> eprint(fix_line_breaks("What is this \x85 I don't even")) 247 | What is this \n I don't even 248 | """ 249 | return ( 250 | text.replace("\r\n", "\n") 251 | .replace("\r", "\n") 252 | .replace("\u2028", "\n") 253 | .replace("\u2029", "\n") 254 | .replace("\u0085", "\n") 255 | ) 256 | 257 | 258 | SURROGATE_RE = re.compile("[\ud800-\udfff]") 259 | SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]") 260 | 261 | 262 | def convert_surrogate_pair(match: Match[str]) -> str: 263 | """ 264 | Convert a surrogate pair to the single codepoint it represents. 265 | 266 | This implements the formula described at: 267 | http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates 268 | """ 269 | pair = match.group(0) 270 | codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00) 271 | return chr(codept) 272 | 273 | 274 | def fix_surrogates(text: str) -> str: 275 | """ 276 | Replace 16-bit surrogate codepoints with the characters they represent 277 | (when properly paired), or with \ufffd otherwise. 278 | 279 | >>> high_surrogate = chr(0xd83d) 280 | >>> low_surrogate = chr(0xdca9) 281 | >>> print(fix_surrogates(high_surrogate + low_surrogate)) 282 | 💩 283 | >>> print(fix_surrogates(low_surrogate + high_surrogate)) 284 | �� 285 | 286 | The above doctest had to be very carefully written, because even putting 287 | the Unicode escapes of the surrogates in the docstring was causing 288 | various tools to fail, which I think just goes to show why this fixer is 289 | necessary. 290 | """ 291 | if SURROGATE_RE.search(text): 292 | text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text) 293 | text = SURROGATE_RE.sub("\ufffd", text) 294 | return text 295 | 296 | 297 | def remove_control_chars(text: str) -> str: 298 | """ 299 | Remove various control characters that you probably didn't intend to be in 300 | your text. Many of these characters appear in the table of "Characters not 301 | suitable for use with markup" at 302 | http://www.unicode.org/reports/tr20/tr20-9.html. 
303 | 304 | This includes: 305 | 306 | - ASCII control characters, except for the important whitespace characters 307 | (U+00 to U+08, U+0B, U+0E to U+1F, U+7F) 308 | - Deprecated Arabic control characters (U+206A to U+206F) 309 | - Interlinear annotation characters (U+FFF9 to U+FFFB) 310 | - The Object Replacement Character (U+FFFC) 311 | - The byte order mark (U+FEFF) 312 | 313 | However, these similar characters are left alone: 314 | 315 | - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D, 316 | U+2028, and U+2029) 317 | - C1 control characters (U+80 to U+9F) -- even though they are basically 318 | never used intentionally, they are important clues about what mojibake 319 | has happened 320 | - Control characters that affect glyph rendering, such as joiners and 321 | right-to-left marks (U+200C to U+200F, U+202A to U+202E) 322 | - Musical notation control characters (U+1D173 to U+1D17A) because wow if 323 | you're using those you probably have a good reason 324 | - Tag characters, because they are now used in emoji sequences such as 325 | "Flag of Wales" 326 | """ 327 | return text.translate(CONTROL_CHARS) 328 | 329 | 330 | def remove_bom(text: str) -> str: 331 | r""" 332 | Remove a byte-order mark that was accidentally decoded as if it were part 333 | of the text. 334 | 335 | >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?")) 336 | Where do you want to go today? 337 | """ 338 | return text.lstrip(chr(0xFEFF)) 339 | 340 | 341 | # Define a regex to match valid escape sequences in Python string literals. 342 | ESCAPE_SEQUENCE_RE = re.compile( 343 | r""" 344 | ( \\U........ # 8-digit hex escapes 345 | | \\u.... # 4-digit hex escapes 346 | | \\x.. # 2-digit hex escapes 347 | | \\[0-7]{1,3} # Octal escapes 348 | | \\N\{[^}]+\} # Unicode characters by name 349 | | \\[\\'"abfnrtv] # Single-character escapes 350 | )""", 351 | re.UNICODE | re.VERBOSE, 352 | ) 353 | 354 | 355 | def decode_escapes(text: str) -> str: 356 | r""" 357 | Decode backslashed escape sequences, including \\x, \\u, and \\U character 358 | references, even in the presence of other Unicode. 359 | 360 | This function has to be called specifically. It's not run automatically by 361 | ftfy, because escaped text is not necessarily a mistake, and there is no 362 | way to distinguish when it is. 363 | 364 | This is what Python's "string-escape" and "unicode-escape" codecs were 365 | meant to do, but in contrast, this actually works. It will decode the 366 | string exactly the same way that the Python interpreter decodes its string 367 | literals. 368 | 369 | >>> factoid = '\\u20a1 is the currency symbol for the colón.' 370 | >>> print(factoid[1:]) 371 | u20a1 is the currency symbol for the colón. 372 | >>> print(decode_escapes(factoid)) 373 | ₡ is the currency symbol for the colón. 374 | 375 | Even though Python itself can read string literals with a combination of 376 | escapes and literal Unicode -- you're looking at one right now -- the 377 | "unicode-escape" codec doesn't work on literal Unicode. (See 378 | http://stackoverflow.com/a/24519338/773754 for more details.) 379 | 380 | Instead, this function searches for just the parts of a string that 381 | represent escape sequences, and decodes them, leaving the rest alone. All 382 | valid escape sequences are made of ASCII characters, and this allows 383 | "unicode-escape" to work correctly. 384 | """ 385 | 386 | def decode_match(match: Match[str]) -> str: 387 | "Given a regex match, decode the escape sequence it contains." 
388 | return codecs.decode(match.group(0), "unicode-escape") 389 | 390 | return ESCAPE_SEQUENCE_RE.sub(decode_match, text) 391 | 392 | 393 | # This regex implements an exception to restore_byte_a0, so we can decode the 394 | # very common mojibake of (for example) "à la mode" as "à la mode", not "àla 395 | # mode". 396 | # 397 | # If byte C3 appears with a single space after it -- most commonly this shows 398 | # up as " à " appearing as an entire word -- we'll insert \xa0 while keeping 399 | # the space. Without this change, we would decode "à" as the start of the next 400 | # word, such as "àla". It's almost always intended to be a separate word, as in 401 | # "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces 402 | # get coalesced into "à la". 403 | # 404 | # We make exceptions for the Portuguese words "às", "àquele", "àquela", 405 | # "àquilo" and their plurals -- these are contractions of, for example, "a 406 | # aquele" and are very common. Note that the final letter is important to 407 | # distinguish this case from French "à quel point". 408 | # 409 | # Other instances in Portuguese, such as "àfrica", seem to be typos (intended 410 | # to be "África" with the accent in the other direction). 411 | # 412 | # Unfortunately, "à" is a common letter in Catalan, and mojibake of words that 413 | # contain it will end up with inserted spaces. We can't do the right thing with 414 | # every word. The cost is that the mojibake text "fà cil" will be interpreted as 415 | # "fà cil", not "fàcil". 416 | A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )") 417 | 418 | 419 | def restore_byte_a0(byts: bytes) -> bytes: 420 | """ 421 | Some mojibake has been additionally altered by a process that said "hmm, 422 | byte A0, that's basically a space!" and replaced it with an ASCII space. 423 | When the A0 is part of a sequence that we intend to decode as UTF-8, 424 | changing byte A0 to 20 would make it fail to decode. 425 | 426 | This process finds sequences that would convincingly decode as UTF-8 if 427 | byte 20 were changed to A0, and puts back the A0. For the purpose of 428 | deciding whether this is a good idea, this step gets a cost of twice 429 | the number of bytes that are changed. 430 | 431 | This is used as a step within `fix_encoding`. 432 | """ 433 | byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts) 434 | 435 | def replacement(match: Match[bytes]) -> bytes: 436 | "The function to apply when this regex matches." 437 | return match.group(0).replace(b"\x20", b"\xa0") 438 | 439 | return ALTERED_UTF8_RE.sub(replacement, byts) 440 | 441 | 442 | def replace_lossy_sequences(byts: bytes) -> bytes: 443 | """ 444 | This function identifies sequences where information has been lost in 445 | a "sloppy" codec, indicated by byte 1A, and if they would otherwise look 446 | like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD. 447 | 448 | A further explanation: 449 | 450 | ftfy can now fix text in a few cases that it would previously fix 451 | incompletely, because of the fact that it can't successfully apply the fix 452 | to the entire string. A very common case of this is when characters have 453 | been erroneously decoded as windows-1252, but instead of the "sloppy" 454 | windows-1252 that passes through unassigned bytes, the unassigned bytes get 455 | turned into U+FFFD (�), so we can't tell what they were. 456 | 457 | This most commonly happens with curly quotation marks that appear 458 | ``“ like this â€�``. 
459 | 460 | We can do better by building on ftfy's "sloppy codecs" to let them handle 461 | less-sloppy but more-lossy text. When they encounter the character ``�``, 462 | instead of refusing to encode it, they encode it as byte 1A -- an 463 | ASCII control code called SUBSTITUTE that once was meant for about the same 464 | purpose. We can then apply a fixer that looks for UTF-8 sequences where 465 | some continuation bytes have been replaced by byte 1A, and decode the whole 466 | sequence as �; if that doesn't work, it'll just turn the byte back into � 467 | itself. 468 | 469 | As a result, the above text ``“ like this â€�`` will decode as 470 | ``“ like this �``. 471 | 472 | If U+1A was actually in the original string, then the sloppy codecs will 473 | not be used, and this function will not be run, so your weird control 474 | character will be left alone but wacky fixes like this won't be possible. 475 | 476 | This is used as a transcoder within `fix_encoding`. 477 | """ 478 | return LOSSY_UTF8_RE.sub("\ufffd".encode(), byts) 479 | 480 | 481 | def decode_inconsistent_utf8(text: str) -> str: 482 | """ 483 | Sometimes, text from one encoding ends up embedded within text from a 484 | different one. This is common enough that we need to be able to fix it. 485 | 486 | This is used as a transcoder within `fix_encoding`. 487 | """ 488 | 489 | def fix_embedded_mojibake(match: Match[str]) -> str: 490 | substr = match.group(0) 491 | 492 | # Require the match to be shorter, so that this doesn't recurse infinitely 493 | if len(substr) < len(text) and is_bad(substr): 494 | return ftfy.fix_encoding(substr) 495 | else: 496 | return substr 497 | 498 | return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text) 499 | 500 | 501 | def _c1_fixer(match: Match[str]) -> str: 502 | return match.group(0).encode("latin-1").decode("sloppy-windows-1252") 503 | 504 | 505 | def fix_c1_controls(text: str) -> str: 506 | """ 507 | If text still contains C1 control characters, treat them as their 508 | Windows-1252 equivalents. This matches what Web browsers do. 509 | """ 510 | return C1_CONTROL_RE.sub(_c1_fixer, text) 511 | -------------------------------------------------------------------------------- /ftfy/formatting.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides functions for justifying Unicode text in a monospaced 3 | display such as a terminal. 4 | 5 | We used to have our own implementation here, but now we mostly rely on 6 | the 'wcwidth' library. 7 | """ 8 | 9 | from unicodedata import normalize 10 | 11 | from wcwidth import wcswidth, wcwidth 12 | 13 | from ftfy.fixes import remove_terminal_escapes 14 | 15 | 16 | def character_width(char: str) -> int: 17 | r""" 18 | Determine the width that a character is likely to be displayed as in 19 | a monospaced terminal. The width for a printable character will 20 | always be 0, 1, or 2. 21 | 22 | Nonprintable or control characters will return -1, a convention that comes 23 | from wcwidth. 24 | 25 | >>> character_width('車') 26 | 2 27 | >>> character_width('A') 28 | 1 29 | >>> character_width('\N{ZERO WIDTH JOINER}') 30 | 0 31 | >>> character_width('\n') 32 | -1 33 | """ 34 | return int(wcwidth(char)) 35 | 36 | 37 | def monospaced_width(text: str) -> int: 38 | r""" 39 | Return the number of character cells that this string is likely to occupy 40 | when displayed in a monospaced, modern, Unicode-aware terminal emulator. 41 | We refer to this as the "display width" of the string. 
42 | 43 | This can be useful for formatting text that may contain non-spacing 44 | characters, or CJK characters that take up two character cells. 45 | 46 | Returns -1 if the string contains a non-printable or control character. 47 | 48 | >>> monospaced_width('ちゃぶ台返し') 49 | 12 50 | >>> len('ちゃぶ台返し') 51 | 6 52 | >>> monospaced_width('owl\N{SOFT HYPHEN}flavored') 53 | 11 54 | >>> monospaced_width('example\x80') 55 | -1 56 | 57 | A more complex example: The Korean word 'ibnida' can be written with 3 58 | pre-composed characters or 7 jamo. Either way, it *looks* the same and 59 | takes up 6 character cells. 60 | 61 | >>> monospaced_width('입니다') 62 | 6 63 | >>> monospaced_width('\u110b\u1175\u11b8\u1102\u1175\u1103\u1161') 64 | 6 65 | 66 | The word "blue" with terminal escapes to make it blue still takes up only 67 | 4 characters, when shown as intended. 68 | >>> monospaced_width('\x1b[34mblue\x1b[m') 69 | 4 70 | """ 71 | # NFC-normalize the text first, so that we don't need special cases for 72 | # Hangul jamo. 73 | # 74 | # Remove terminal escapes before calculating width, because if they are 75 | # displayed as intended, they will have zero width. 76 | return int(wcswidth(remove_terminal_escapes(normalize("NFC", text)))) 77 | 78 | 79 | def display_ljust(text: str, width: int, fillchar: str = " ") -> str: 80 | """ 81 | Return `text` left-justified in a Unicode string whose display width, 82 | in a monospaced terminal, should be at least `width` character cells. 83 | The rest of the string will be padded with `fillchar`, which must be 84 | a width-1 character. 85 | 86 | "Left" here means toward the beginning of the string, which may actually 87 | appear on the right in an RTL context. This is similar to the use of the 88 | word "left" in "left parenthesis". 89 | 90 | >>> lines = ['Table flip', '(╯°□°)╯︵ ┻━┻', 'ちゃぶ台返し'] 91 | >>> for line in lines: 92 | ... print(display_ljust(line, 20, '▒')) 93 | Table flip▒▒▒▒▒▒▒▒▒▒ 94 | (╯°□°)╯︵ ┻━┻▒▒▒▒▒▒▒ 95 | ちゃぶ台返し▒▒▒▒▒▒▒▒ 96 | 97 | This example, and the similar ones that follow, should come out justified 98 | correctly when viewed in a monospaced terminal. It will probably not look 99 | correct if you're viewing this code or documentation in a Web browser. 100 | """ 101 | if character_width(fillchar) != 1: 102 | msg = "The padding character must have display width 1" 103 | raise ValueError(msg) 104 | 105 | text_width = monospaced_width(text) 106 | if text_width == -1: 107 | # There's a control character here, so just don't add padding 108 | return text 109 | 110 | padding = max(0, width - text_width) 111 | return text + fillchar * padding 112 | 113 | 114 | def display_rjust(text: str, width: int, fillchar: str = " ") -> str: 115 | """ 116 | Return `text` right-justified in a Unicode string whose display width, 117 | in a monospaced terminal, should be at least `width` character cells. 118 | The rest of the string will be padded with `fillchar`, which must be 119 | a width-1 character. 120 | 121 | "Right" here means toward the end of the string, which may actually be on 122 | the left in an RTL context. This is similar to the use of the word "right" 123 | in "right parenthesis". 124 | 125 | >>> lines = ['Table flip', '(╯°□°)╯︵ ┻━┻', 'ちゃぶ台返し'] 126 | >>> for line in lines: 127 | ... 
print(display_rjust(line, 20, '▒')) 128 | ▒▒▒▒▒▒▒▒▒▒Table flip 129 | ▒▒▒▒▒▒▒(╯°□°)╯︵ ┻━┻ 130 | ▒▒▒▒▒▒▒▒ちゃぶ台返し 131 | """ 132 | if character_width(fillchar) != 1: 133 | msg = "The padding character must have display width 1" 134 | raise ValueError(msg) 135 | 136 | text_width = monospaced_width(text) 137 | if text_width == -1: 138 | return text 139 | 140 | padding = max(0, width - text_width) 141 | return fillchar * padding + text 142 | 143 | 144 | def display_center(text: str, width: int, fillchar: str = " ") -> str: 145 | """ 146 | Return `text` centered in a Unicode string whose display width, in a 147 | monospaced terminal, should be at least `width` character cells. The rest 148 | of the string will be padded with `fillchar`, which must be a width-1 149 | character. 150 | 151 | >>> lines = ['Table flip', '(╯°□°)╯︵ ┻━┻', 'ちゃぶ台返し'] 152 | >>> for line in lines: 153 | ... print(display_center(line, 20, '▒')) 154 | ▒▒▒▒▒Table flip▒▒▒▒▒ 155 | ▒▒▒(╯°□°)╯︵ ┻━┻▒▒▒▒ 156 | ▒▒▒▒ちゃぶ台返し▒▒▒▒ 157 | """ 158 | if character_width(fillchar) != 1: 159 | msg = "The padding character must have display width 1" 160 | raise ValueError(msg) 161 | 162 | text_width = monospaced_width(text) 163 | if text_width == -1: 164 | return text 165 | 166 | padding = max(0, width - text_width) 167 | left_padding = padding // 2 168 | right_padding = padding - left_padding 169 | return fillchar * left_padding + text + fillchar * right_padding 170 | -------------------------------------------------------------------------------- /ftfy/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/python-ftfy/74dd0452b48286a3770013b3a02755313bd5575e/ftfy/py.typed -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | files = ftfy 3 | check_untyped_defs = True 4 | disallow_any_generics = True 5 | disallow_incomplete_defs = False 6 | disallow_subclassing_any = True 7 | disallow_untyped_calls = False 8 | disallow_untyped_decorators = False 9 | disallow_untyped_defs = False 10 | no_implicit_optional = True 11 | no_implicit_reexport = False 12 | strict_equality = True 13 | warn_redundant_casts = True 14 | warn_return_any = True 15 | warn_unused_configs = True 16 | warn_unused_ignores = True 17 | python_version = 3.9 18 | 19 | [mypy-wcwidth] 20 | ignore_missing_imports = True 21 | 22 | -------------------------------------------------------------------------------- /notebook/excel-export.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/python-ftfy/74dd0452b48286a3770013b3a02755313bd5575e/notebook/excel-export.png -------------------------------------------------------------------------------- /notes/mysteries.txt: -------------------------------------------------------------------------------- 1 | on https://www.nipette.com/article-6358031.html, a comment is signed 'MÃ\x83©Ã\x82¬Ã\x82¡nie'. 2 | This happens to be triple-UTF-8 for 'M鬡nie', but that's probably not the name they meant. 3 | 4 | What exactly did https://www.horoskopy-horoskop.cz/clanek/431-numerologicky-vyznam-jmena-jaromir 5 | mean when they said 'TadeÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂáÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂá' ? 6 | 7 | https://mtlurb.com/tags/arbres/ 8 | 'montrã©al' probably isn't in cp850, but what is it? 
9 | 10 | 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ftfy" 3 | version = "6.3.1" 4 | description = "Fixes mojibake and other problems with Unicode, after the fact" 5 | authors = [{ name = "Robyn Speer", email = "rspeer@arborelia.net" }] 6 | license = { text = "Apache-2.0" } 7 | readme = "README.md" 8 | dependencies = ["wcwidth"] 9 | requires-python = ">=3.9" 10 | 11 | [project.scripts] 12 | ftfy = "ftfy.cli:main" 13 | 14 | [project.urls] 15 | Homepage = "https://ftfy.readthedocs.io/en/latest/" 16 | Documentation = "https://ftfy.readthedocs.io/en/latest/" 17 | Repository = "https://github.com/rspeer/python-ftfy" 18 | Issues = "https://github.com/rspeer/python-ftfy/issues/" 19 | Changelog = "https://github.com/rspeer/python-ftfy/blob/main/CHANGELOG.md" 20 | Blog = "https://posts.arborelia.net" 21 | 22 | [build-system] 23 | requires = ["hatchling"] 24 | build-backend = "hatchling.build" 25 | 26 | [tool.hatch.build.targets.sdist] 27 | exclude = ["^.github/", "scripts/", ".readthedocs.yaml", "notes/", "notebook/"] 28 | 29 | [tool.uv] 30 | dev-dependencies = [ 31 | "Sphinx >=7, <8", 32 | "furo >= 2024.7.18", 33 | "pytest >= 8.3.2, < 9", 34 | "ruff", 35 | ] 36 | 37 | [tool.ruff] 38 | exclude = ["badness.py", "notebook"] 39 | line-length = 100 40 | target-version = "py39" 41 | 42 | [tool.ruff.lint] 43 | select = ["B", "F", "I", "N", "ANN", "UP", "RUF", "C4", "EM", "PIE", "RSE", "TCH", "PTH", "FURB"] 44 | ignore = [ 45 | "ANN101", 46 | "ANN401", 47 | "RUF001", # complains about Unicode characters that belong in my docstrings 48 | "RUF002", # complains about Unicode characters that belong in my docstrings 49 | "PIE808", # explicitly starting ranges at 0 sometimes helps with readability 50 | ] 51 | 52 | [tool.ruff.lint.per-file-ignores] 53 | "tests/*" = ["ANN"] 54 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-modules --ignore=setup.py --ignore=specimens --ignore=scripts --ignore=docs 3 | -------------------------------------------------------------------------------- /scripts/char_data_table.py: -------------------------------------------------------------------------------- 1 | """ 2 | Used to regenerate character tables in ftfy/chardata.py with explanatory comments. 
3 | """ 4 | 5 | import unicodedata 6 | from dataclasses import dataclass 7 | 8 | from ftfy.chardata import UTF8_CLUES 9 | 10 | 11 | @dataclass 12 | class CharData: 13 | name: str 14 | codept: int 15 | encodings: list[tuple[str, int]] 16 | 17 | def sort_key(self) -> tuple[int, str, int]: 18 | if self.name.startswith("LATIN "): 19 | return (0, self.name, self.codept) 20 | return (1, "", self.codept) 21 | 22 | 23 | SAFE_ENCODINGS = [ 24 | "latin-1", 25 | "windows-1252", 26 | "windows-1251", 27 | "windows-1250", 28 | "windows-1253", 29 | "windows-1254", 30 | "windows-1257", 31 | ] 32 | 33 | 34 | def show_char_table(chars: str, byte_min: int = 0, byte_max: int = 0xFF) -> None: 35 | char_data: list[CharData] = [] 36 | for char in chars: 37 | name = unicodedata.name(char, "") 38 | codept = ord(char) 39 | encodings: list[tuple[str, int]] = [] 40 | for encoding in SAFE_ENCODINGS: 41 | try: 42 | encoded: bytes = char.encode(encoding) 43 | byte: int = encoded[0] 44 | encodings.append((encoding, byte)) 45 | except UnicodeEncodeError: 46 | pass 47 | if encodings: 48 | char_data.append(CharData(name=name, codept=codept, encodings=encodings)) 49 | else: 50 | print(f"No relevant encoding for {codept=}, {name=}") 51 | char_data.sort(key=CharData.sort_key) 52 | for cd in char_data: 53 | encoding_info: list[str] = [] 54 | for encoding, byte in cd.encodings: 55 | if byte_min <= byte <= byte_max: 56 | info_str = f"{encoding}:{byte:X}" 57 | encoding_info.append(info_str) 58 | encoding_explanation = encoding_info[0] if encoding_info else "???" 59 | print(f' "\\N{{{cd.name}}}" # {encoding_explanation}') 60 | 61 | 62 | def run() -> None: 63 | print("# utf8_first_of_2") 64 | show_char_table(UTF8_CLUES["utf8_first_of_2"], 0xC2, 0xDF) 65 | print("# utf8_first_of_3") 66 | show_char_table(UTF8_CLUES["utf8_first_of_3"], 0xE0, 0xEF) 67 | print("# utf8_first_of_4") 68 | show_char_table(UTF8_CLUES["utf8_first_of_4"], 0xF0, 0xF3) 69 | print("# utf8_continuation") 70 | print(r' "\x80-\xbf"') 71 | show_char_table(UTF8_CLUES["utf8_continuation"][3:], 0x80, 0xBF) 72 | print("# utf8_continuation_strict") 73 | print(r' "\x80-\xbf"') 74 | show_char_table(UTF8_CLUES["utf8_continuation_strict"][3:], 0x80, 0xBF) 75 | 76 | 77 | if __name__ == "__main__": 78 | run() 79 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/python-ftfy/74dd0452b48286a3770013b3a02755313bd5575e/tests/__init__.py -------------------------------------------------------------------------------- /tests/face.txt: -------------------------------------------------------------------------------- 1 | â”’(⌣˛⌣)┎ 2 | -------------------------------------------------------------------------------- /tests/test-cases/README.md: -------------------------------------------------------------------------------- 1 | # ftfy test cases 2 | 3 | This directory contains JSON files with test cases for ftfy. Many of them are real mojibake found in the wild, such as by listening to the Twitter firehose (when that existed), searching through the OSCAR web crawl, or in issue reports from users. 4 | 5 | Cases labeled "synthetic" were not found in the wild, but were instead constructed to test a particular edge case. 6 | 7 | Cases labeled "negative" are not mojibake but look like they could be. We're testing that ftfy does not alter the text (except for its usual processing such as un-curling quotes).
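To make these expectations concrete, here is a rough sketch of how a single case can be checked by hand. The `case` dict is a made-up example in the shape described under "Structure of a test case" below; the real test driver lives in `tests/test_examples_in_json.py`.

```python
import ftfy

# A hypothetical "pass" case, shaped like the entries in these JSON files
case = {
    "label": "UTF-8 mojibake read as Windows-1252",
    "original": "cafÃ©",
    "fixed": "café",
    "expect": "pass",
}

if case["expect"] == "pass":
    # `fixed` is what full fixing should produce
    assert ftfy.fix_text(case["original"]) == case["fixed"]
    # `fixed-encoding` falls back to `fixed` when it isn't specified
    expected_encoding_fix = case.get("fixed-encoding", case["fixed"])
    assert ftfy.fix_encoding(case["original"]) == expected_encoding_fix
```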
8 | 9 | `known-failures.json` contains cases that we would do better at with an improved heuristic. Most of these are false negatives, where ftfy does not figure out how to fix the text. ftfy aims to have no false positives, but there is one synthetic false positive in `known-failures.json`. 10 | 11 | ## Structure of a test case 12 | 13 | A test case contains the following fields: 14 | 15 | - `label`: A description of the test case, shown when pytest runs in verbose mode. 16 | - `comment`: Further details on the test case because JSON doesn't have comments. 17 | - `original`: The text to run through ftfy. 18 | - `fixed-encoding` (optional): the expected result of `ftfy.fix_encoding(original)`. If unspecified, uses the value from `fixed`. 19 | - `fixed`: the expected result of `ftfy.fix_text(original)`. 20 | - `expect`: "pass" for test cases that should pass, or "fail" for known failures. -------------------------------------------------------------------------------- /tests/test-cases/in-the-wild.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Low-codepoint emoji", 4 | "comment": "From the ancient era before widespread emoji support on Twitter", 5 | "original": "He's Justinâ\u009d¤", 6 | "fixed": "He's Justin❤", 7 | "expect": "pass" 8 | }, 9 | { 10 | "label": "UTF-8 / MacRoman mix-up about smurfs", 11 | "original": "Le Schtroumpf Docteur conseille g√¢teaux et baies schtroumpfantes pour un r√©gime √©quilibr√©.", 12 | "fixed": "Le Schtroumpf Docteur conseille gâteaux et baies schtroumpfantes pour un régime équilibré.", 13 | "expect": "pass" 14 | }, 15 | { 16 | "label": "Checkmark that almost looks okay as mojibake", 17 | "original": "✔ No problems", 18 | "fixed": "✔ No problems", 19 | "expect": "pass" 20 | }, 21 | { 22 | "label": "UTF-8 / Windows-1251 Russian mixup about futbol", 23 | "original": "РґРѕСЂРѕРіРµ Р\u0098Р·-РїРѕРґ #футбол", 24 | "fixed": "дороге Из-под #футбол", 25 | "expect": "pass" 26 | }, 27 | { 28 | "label": "Latin-1 / Windows-1252 mixup in German", 29 | "original": "\u0084Handwerk bringt dich überall hin\u0093: Von der YOU bis nach Monaco", 30 | "fixed-encoding": "„Handwerk bringt dich überall hin“: Von der YOU bis nach Monaco", 31 | "fixed": "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco", 32 | "expect": "pass" 33 | }, 34 | { 35 | "label": "Latin-1 / Windows-1252 mixup of the replacement character", 36 | "original": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", 37 | "fixed": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", 38 | "expect": "pass" 39 | }, 40 | { 41 | "label": "CESU-8 / Windows-1252 emoji", 42 | "original": "Hi guys í ½í¸\u008d", 43 | "fixed": "Hi guys 😍", 44 | "expect": "pass" 45 | }, 46 | { 47 | "label": "CESU-8 / Latin-1 emoji", 48 | "original": "hihi RT username: â\u0098ºí ½í¸\u0098", 49 | "fixed": "hihi RT username: ☺😘", 50 | "expect": "pass" 51 | }, 52 | { 53 | "label": "Latin-1 / Windows-1252 mixup in Turkish", 54 | "original": "Beta Haber: Hırsızı Büyü Korkuttu", 55 | "fixed": "Beta Haber: Hırsızı Büyü Korkuttu", 56 | "expect": "pass" 57 | }, 58 | { 59 | "label": "Latin-1 / Windows-1252 mixup in İstanbul (issue #192)", 60 | "original": "İstanbul", 61 | "fixed": "İstanbul", 62 | "expect": "pass" 63 | }, 64 | { 65 | "label": "Latin-1 / Windows-1252 mixup in German (issue #188)", 66 | "original": "RUF MICH ZURÜCK", 67 | "fixed": "RUF MICH 
ZURÜCK", 68 | "expect": "pass" 69 | }, 70 | { 71 | "label": "Latin-1 / Windows-1252 mixup in Rīga (issue #192)", 72 | "original": "RÄ«ga", 73 | "fixed": "Rīga", 74 | "expect": "pass" 75 | }, 76 | { 77 | "label": "UTF-8 / Windows-1251 mixed up twice in Russian", 78 | "original": "приятности. РІСњВ¤", 79 | "fixed": "приятности. ❤", 80 | "expect": "pass" 81 | }, 82 | { 83 | "label": "UTF-8 / Windows-1252 mixed up twice in Malay", 84 | "original": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romanceâ€Â\u009d.", 85 | "fixed-encoding": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romance”.", 86 | "fixed": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".", 87 | "expect": "pass" 88 | }, 89 | { 90 | "label": "UTF-8 / Windows-1252 mixed up twice in naming Iggy Pop", 91 | "original": "Iggy Pop (né Jim Osterberg)", 92 | "fixed": "Iggy Pop (né Jim Osterberg)", 93 | "expect": "pass" 94 | }, 95 | { 96 | "label": "Left quote is UTF-8, right quote is Latin-1, both encoded in Windows-1252", 97 | "original": "Direzione Pd, ok â\u0080\u009csenza modifiche\u0094 all'Italicum.", 98 | "fixed-encoding": "Direzione Pd, ok “senza modifiche” all'Italicum.", 99 | "fixed": "Direzione Pd, ok \"senza modifiche\" all'Italicum.", 100 | "expect": "pass" 101 | }, 102 | { 103 | "label": "UTF-8 / sloppy Windows-1252 mixed up twice in a triumphant emoticon", 104 | "original": "selamat berpuasa sob (Ã\u00a0¸‡'̀⌣'ÃŒÂ\u0081)Ã\u00a0¸‡", 105 | "fixed": "selamat berpuasa sob (ง'̀⌣'́)ง", 106 | "expect": "pass" 107 | }, 108 | { 109 | "label": "UTF-8 / Windows-1252 mixed up three times", 110 | "original": "The Mona Lisa doesn’t have eyebrows.", 111 | "fixed-encoding": "The Mona Lisa doesn’t have eyebrows.", 112 | "fixed": "The Mona Lisa doesn't have eyebrows.", 113 | "expect": "pass" 114 | }, 115 | { 116 | "label": "UTF-8 / Codepage 437 mixup in Russian", 117 | "original": "#╨┐╤Ç╨░╨▓╨╕╨╗╤î╨╜╨╛╨╡╨┐╨╕╤é╨░╨╜╨╕╨╡", 118 | "fixed": "#правильноепитание", 119 | "expect": "pass" 120 | }, 121 | { 122 | "label": "UTF-8 / Windows-1252 mixup in French", 123 | "original": "Hôtel de Police", 124 | "fixed": "Hôtel de Police", 125 | "expect": "pass" 126 | }, 127 | { 128 | "label": "UTF-8 / Windows-1250 mixup in French", 129 | "original": "Liège Avenue de l'HĂ´pital", 130 | "fixed": "Liège Avenue de l'Hôpital", 131 | "expect": "pass" 132 | }, 133 | { 134 | "label": "UTF-8 / Windows-1252 mixup in Vietnamese", 135 | "original": "Tại sao giá hạt sầu riêng lại lên giá?", 136 | "fixed": "Tại sao giá hạt sầu riêng lại lên giá?", 137 | "expect": "pass" 138 | }, 139 | { 140 | "label": "Science! Mid-word Greek letter gets fixed correctly", 141 | "original": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", 142 | "fixed": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", 143 | "expect": "pass" 144 | }, 145 | { 146 | "label": "For goodness' sake. We can come close to fixing this, but fail in the last step", 147 | "original": "ItÃ?¢â?¬â?¢s classic. ItÃ?¢â?¬â?¢s epic. ItÃ?¢â?¬â?¢s ELIZABETH BENNET for goodnessÃ?¢â?¬â?¢ sake!", 148 | "fixed": "It�¢��s classic. It�¢��s epic. 
It�¢��s ELIZABETH BENNET for goodness�¢�� sake!", 149 | "expect": "pass" 150 | }, 151 | { 152 | "label": "lossy UTF-8 / Windows-1250 mixup in Spanish", 153 | "original": "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂ­a", 154 | "fixed": "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía", 155 | "expect": "pass" 156 | }, 157 | { 158 | "label": "UTF-8 / sloppy Windows-1250 mixup in English", 159 | "original": "It was named „scars´ stones“ after the rock-climbers who got hurt while climbing on it.", 160 | "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", 161 | "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", 162 | "expect": "pass" 163 | }, 164 | { 165 | "label": "The same text as above, but as a UTF-8 / ISO-8859-2 mixup", 166 | "original": "It was namedÂ\u00a0â\u0080\u009escars´ stonesâ\u0080\u009c after the rock-climbers who got hurt while climbing on it.", 167 | "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", 168 | "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", 169 | "expect": "pass" 170 | }, 171 | { 172 | "label": "UTF-8 / ISO-8859-2 mixup in Czech", 173 | "comment": "This says 'I've had enough of the third millennium', which is great because it involves software decisions made in the second", 174 | "original": "MĂĄm dost tĹ\u0099etĂ\u00adho tisĂ\u00adciletĂ\u00ad", 175 | "fixed": "Mám dost třetího tisíciletí", 176 | "expect": "pass" 177 | }, 178 | { 179 | "label": "UTF-8 / Windows-1252 mixup in mixed French and Arabic", 180 | "comment": "A difficult test case that can depend on the order that steps are applied", 181 | "original": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", 182 | "fixed-encoding": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", 183 | "fixed": "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", 184 | "expect": "pass" 185 | }, 186 | { 187 | "label": "UTF-8 / sloppy Windows-1250 mixup in Romanian", 188 | "original": "vedere Ă®nceĹŁoĹźatÄ\u0083", 189 | "fixed": "vedere înceţoşată", 190 | "expect": "pass" 191 | }, 192 | { 193 | "label": "UTF-8 / Windows-1250 mixup in Slovak", 194 | "original": "NapĂ\u00adšte nám !", 195 | "fixed": "Napíšte nám !", 196 | "expect": "pass" 197 | }, 198 | { 199 | "label": "UTF-8 / Windows-1252 mixup in Spanish", 200 | "original": "DOS AÑOS", 201 | "fixed": "DOS AÑOS", 202 | "expect": "pass" 203 | }, 204 | { 205 | "label": "UTF-8 / Windows-1252 followed by UTF-8 / Windows-1251", 206 | "original": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", 207 | "fixed": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", 208 | "expect": "pass" 209 | }, 210 | { 211 | "label": 
"fancy Unicode crossing-out, but mojibaked", 212 | "original": "hotel $49 $̶6̶3̶ updated 2018", 213 | "fixed": "hotel $49 $̶6̶3̶ updated 2018", 214 | "expect": "pass" 215 | }, 216 | { 217 | "label": "A face with UTF-8 / sloppy Windows-1252 mixed up twice", 218 | "original": "ââ€\u009d’(⌣˛⌣)ââ€\u009dŽ", 219 | "fixed": "┒(⌣˛⌣)┎", 220 | "expect": "pass" 221 | }, 222 | { 223 | "label": "We can mostly decode the face above when we lose the character U+009D", 224 | "original": "ââ€�’(⌣˛⌣)ââ€�Ž", 225 | "fixed": "�(⌣˛⌣)�", 226 | "expect": "pass" 227 | }, 228 | { 229 | "label": "Lossy decoding can have plain ASCII question marks, as well", 230 | "original": "The ICR has been upgraded to “bb+â€? from “bbâ€?", 231 | "fixed-encoding": "The ICR has been upgraded to “bb+� from “bb�", 232 | "fixed": "The ICR has been upgraded to \"bb+� from \"bb�", 233 | "expect": "pass" 234 | }, 235 | { 236 | "label": "CESU-8 / Latin-1 mixup over several emoji", 237 | "comment": "You tried", 238 | "original": "I just figured out how to tweet emojis! â\u009a½í\u00a0½í¸\u0080í\u00a0½í¸\u0081í\u00a0½í¸\u0082í\u00a0½í¸\u0086í\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008e", 239 | "fixed": "I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎", 240 | "expect": "pass" 241 | }, 242 | { 243 | "label": "An absolutely hopeless garble", 244 | "comment": "If we try too hard to decode this, we'll recursively apply `decode_inconsistent_utf8` until the characters turn into random Han and katakana characters.", 245 | "original": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", 246 | "fixed-encoding": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", 247 | "fixed": "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â", 248 | "expect": "pass" 249 | }, 250 | { 251 | "label": "Inconsistent UTF-8 / Latin-1 mojibake", 252 | "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099\u0085", 253 | "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", 254 | "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", 255 | "expect": "pass" 256 | }, 257 | { 258 | "label": "Inconsistent UTF-8 / Latin-1 mojibake with an ellipsis from the Windows-1252 character set", 259 | "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099…", 260 | "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", 261 | "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", 262 | "expect": "pass" 263 | }, 264 | { 265 | "label": "Inconsistent mojibake in Portuguese", 266 | "original": "Campeonatos > III Divisão - Série F > Jornadas Classificação", 267 | "fixed": "Campeonatos > III Divisão - Série F > Jornadas Classificação", 268 | "expect": "pass" 269 | }, 270 | { 271 | "label": "Handle Afrikaans 'n character", 272 | "original": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", 273 | "fixed-encoding": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", 274 | "fixed": "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.", 275 | "expect": "pass" 276 | }, 277 | { 278 | "label": "Handle Croatian single-codepoint digraphs", 279 | "original": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore 
na čipu", 280 | "fixed-encoding": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", 281 | "fixed": "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", 282 | "expect": "pass" 283 | }, 284 | { 285 | "label": "A with an acute accent, in isolation", 286 | "original": "Nicolás", 287 | "fixed": "Nicolás", 288 | "expect": "pass" 289 | }, 290 | { 291 | "label": "sharp S, in isolation, via MacRoman encoding", 292 | "comment": "regression reported in issue #186", 293 | "original": "wei√ü", 294 | "fixed": "weiß", 295 | "expect": "pass" 296 | }, 297 | { 298 | "label": "French example containing non-breaking spaces", 299 | "original": "ART TRIP Ã\u00a0 l'office de tourisme", 300 | "fixed": "ART TRIP à l'office de tourisme", 301 | "expect": "pass" 302 | }, 303 | { 304 | "label": "English example in UTF-8 / Windows-1251 with a ligature", 305 | "original": "This is signiп¬Ѓcantly lower than the respective share", 306 | "fixed-encoding": "This is significantly lower than the respective share", 307 | "fixed": "This is significantly lower than the respective share", 308 | "expect": "pass" 309 | }, 310 | { 311 | "label": "'à' remains its own word, even if spaces after it get coalesced into one", 312 | "original": "à perturber la réflexion des théologiens jusqu'à nos jours", 313 | "fixed": "à perturber la réflexion des théologiens jusqu'à nos jours", 314 | "expect": "pass" 315 | }, 316 | { 317 | "label": "Fix 'à' in inconsistent mojibake", 318 | "original": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", 319 | "fixed-encoding": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", 320 | "fixed": "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation", 321 | "expect": "pass" 322 | }, 323 | { 324 | "label": "The Portuguese word 'às' does not become 'à s' due to the French fix", 325 | "original": "com especial atenção à s crianças", 326 | "fixed": "com especial atenção às crianças", 327 | "expect": "pass" 328 | }, 329 | { 330 | "label": "This is why we require a space after the 's' in 'às'", 331 | "original": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", 332 | "fixed": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", 333 | "expect": "pass" 334 | }, 335 | { 336 | "label": "We can fix 'à' in windows-1251 sometimes as well", 337 | "original": "La rГ©gion de Dnepropetrovsk se trouve Г l’ouest de l’Ukraine", 338 | "fixed-encoding": "La région de Dnepropetrovsk se trouve à l’ouest de l’Ukraine", 339 | "fixed": "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine", 340 | "expect": "pass" 341 | }, 342 | { 343 | "label": "'à quele' is the Portuguese word 'àquele', not 'à quele'", 344 | "original": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante à quele observado nas lesões por imunocomplexo em excesso de anticorpos", 345 | "fixed": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos", 346 | 
"expect": "pass" 347 | }, 348 | { 349 | "label": "A complex, lossy pile-up of mojibake in Portuguese", 350 | "original": "â € ðŸ“� Regulamento: â € âš ï¸� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. âš ï¸� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. âš ï¸� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até à s 19h do mesmo dia em uma nova publicação em nosso instagram. â € Boa sorte!!! 😀ðŸ�°", 351 | "fixed": "⠀ �\u00a0Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. ⠀ Boa sorte!!!\u00a0😀�", 352 | "expect": "pass" 353 | }, 354 | { 355 | "label": "UTF-8 / Windows-1252 mixup in Gaelic involving non-breaking spaces", 356 | "original": "CÃ\u00a0nan nan GÃ\u00a0idheal", 357 | "fixed": "Cànan nan Gàidheal", 358 | "expect": "pass" 359 | }, 360 | { 361 | "label": "UTF-8 / Windows-1251 mixup in tweet spam", 362 | "original": "Blog Traffic Tip 2 – Broadcast Email Your Blog", 363 | "fixed": "Blog Traffic Tip 2 – Broadcast Email Your Blog", 364 | "expect": "pass" 365 | }, 366 | { 367 | "label": "UTF-8 / Windows-1251 mixup", 368 | "original": "S&P Confirms Ukrsotsbank’s “B-“ Rating", 369 | "fixed-encoding": "S&P Confirms Ukrsotsbank’s “B-“ Rating", 370 | "fixed": "S&P Confirms Ukrsotsbank's \"B-\" Rating", 371 | "expect": "pass" 372 | }, 373 | { 374 | "label": "Dutch example with ë", 375 | "comment": "from issue reported by MicroJackson", 376 | "original": "ongeëvenaard", 377 | "fixed-encoding": "ongeëvenaard", 378 | "fixed": "ongeëvenaard", 379 | "expect": "pass" 380 | }, 381 | { 382 | "label": "HTML entity on top of UTF-8 / Latin-1", 383 | "original": "10μs", 384 | "fixed-encoding": "10μs", 385 | "fixed": "10μs", 386 | "expect": "pass" 387 | }, 388 | { 389 | "label": "Three layers of UTF-8 / MacRoman mixup in French", 390 | "comment": "You're welcome", 391 | "original": "Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in Flash Player 8", 392 | "fixed": "Merci de télécharger le plug-in Flash Player 8", 393 | "expect": "pass" 394 | }, 395 | { 396 | "label": "UTF-8 / MacRoman mixup in French", 397 | "original": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter‚Ķ", 398 | "fixed": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…", 399 | "expect": "pass" 400 | }, 401 | { 402 | "label": "Italian UTF-8 / MacRoman example with ò", 403 | "original": "Le Vigne di Zam√≤", 404 | "fixed": "Le Vigne di Zamò", 405 | "expect": "pass" 406 | }, 407 | { 408 | "label": "Punctuation pile-up should actually be musical notes", 409 | "original": "Engkau masih yg terindah, indah di dalam hatiku♫~", 410 | "fixed": "Engkau masih yg terindah, indah di dalam hatiku♫~", 411 | "expect": "pass" 412 | }, 413 | { 414 | "label": "Latvian UTF-8 / Windows-1257 mojibake", 415 | "original": "Å veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", 416 | "fixed": "Šveices baņķieri gaida konkrētus 
investīciju projektus", 417 | "expect": "pass" 418 | }, 419 | { 420 | "label": "Latvian UTF-8 / MacRoman mojibake", 421 | "original": "SaeimƒÅ ievƒìlƒìtƒÅs partijas \"Progresƒ´vie\" lƒ´dzvadƒ´tƒÅja Anto≈Üina ≈Öena≈°eva atbild uz ≈æurnƒÅlistu jautƒÅjumiem pƒìc partijas tik≈°anƒÅs ar Valsts prezidentu Rƒ´gas pilƒ´,", 422 | "fixed": "Saeimā ievēlētās partijas \"Progresīvie\" līdzvadītāja Antoņina Ņenaševa atbild uz žurnālistu jautājumiem pēc partijas tikšanās ar Valsts prezidentu Rīgas pilī,", 423 | "expect": "pass" 424 | }, 425 | { 426 | "label": "Lithuanian UTF-8 / Windows-1257 mojibake", 427 | "original": "Å iaip ÄÆdomu, kaip ÄÆsivaizduoji. Visų pirma tam reikia laiko.", 428 | "fixed": "Šiaip įdomu, kaip įsivaizduoji. Visų pirma tam reikia laiko.", 429 | "expect": "pass" 430 | }, 431 | { 432 | "label": "Lithuanian UTF-8 / Windows-1250 mojibake", 433 | "original": "Lietuva pagrÄŻstai gali paklausti: Ĺ˝inoma, kad ne.", 434 | "fixed": "Lietuva pagrįstai gali paklausti: Žinoma, kad ne.", 435 | "expect": "pass" 436 | }, 437 | { 438 | "label": "Hebrew UTF-8 / Windows-1252 mojibake", 439 | "comment": "reported by SuperIRabbit as issue #158", 440 | "original": "בהודעה", 441 | "fixed": "בהודעה", 442 | "expect": "pass" 443 | }, 444 | { 445 | "label": "Wide comma in UTF-8 / Windows-1252", 446 | "original": "Ningbo,China", 447 | "fixed-encoding": "Ningbo,China", 448 | "fixed": "Ningbo,China", 449 | "expect": "pass" 450 | } 451 | ] -------------------------------------------------------------------------------- /tests/test-cases/known-failures.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Misleading mix-up in Spanish", 4 | "comment": "The original text has mojibake, but the sequence 'á \u0093' can decode as U+1813 MONGOLIAN DIGIT THREE, when the whole string should really just decode as a Latin-1/Windows-1252 mixup", 5 | "original": "tiene demora y está \u0093próximo a resolverse\u0094", 6 | "fixed": "tiene demora y está \"próximo a resolverse\"", 7 | "expect": "fail" 8 | }, 9 | { 10 | "label": "Two levels of inconsistent mojibake", 11 | "comment": "The en-dash was mojibaked in UTF-8 / Windows-1252 as three characters, two of which were mojibaked again as Windows-1252 / Latin-1, and the third of which was mojibaked as UTF-8 / Latin-1. 
Unfortunately, if we fix this, we leave ourselves room to greedily 'decode' random Han characters in complex Latin-alphabet mojibake", 12 | "original": "Arsenal v Wolfsburg: pre-season friendly â\u0080â\u0080\u009c live!", 13 | "fixed": "Arsenal v Wolfsburg: pre-season friendly – live!", 14 | "expect": "fail" 15 | }, 16 | { 17 | "label": "A-with-grave in Vietnamese", 18 | "comment": "Currently adds extra spaces that shouldn't be there", 19 | "original": "Xem clip hĂ i, phim hĂ i má»›i hay nhất", 20 | "fixed": "Xem clip hài, phim hài mới hay nhất", 21 | "expect": "fail" 22 | }, 23 | { 24 | "label": "Latin-1 / MacRoman mixup in Spanish", 25 | "comment": "Requires something like encoding detection", 26 | "original": "Deja dos heridos hundimiento de barco tur\u0092stico en Acapulco.", 27 | "fixed": "Deja dos heridos hundimiento de barco turístico en Acapulco.", 28 | "expect": "fail" 29 | }, 30 | { 31 | "label": "subtle UTF-8 / codepage 437 mixup in Spanish", 32 | "original": "┬┐que diferencia hay?", 33 | "fixed": "¿que diferencia hay?", 34 | "expect": "fail" 35 | }, 36 | { 37 | "label": "Latin-1 / MacRoman mixup in Spanish, 2 characters", 38 | "comment": "Requires something like encoding detection", 39 | "original": "Habitantes de Coatl\u0087n conf\u0092an en proyecto de edil electo independiente", 40 | "fixed": "Habitantes de Coatlán confían en proyecto de edil electo independiente", 41 | "expect": "fail" 42 | }, 43 | { 44 | "label": "An example with 'à' in windows-1251 where we need our heuristic to be bolder", 45 | "original": "faites attention Г bien vous renseigner avant sur le mГ©dicament", 46 | "fixed": "faites attention à bien vous renseigner avant sur le médicament", 47 | "expect": "fail" 48 | }, 49 | { 50 | "label": "Italian UTF-8 / MacRoman mojibake that looks like math", 51 | "comment": "False negative: 'pi√π' is a bit too reasonable to fix", 52 | "original": "Sarai ricontattato dal nostro Esperto al pi√π presto.", 53 | "fixed": "Sarai ricontattato dal nostro Esperto al più presto.", 54 | "expect": "fail" 55 | }, 56 | { 57 | "label": "Synthetic: Incomplete UTF-8 / Windows-1252 mixup in Arabic", 58 | "comment": "I find text like this in OSCAR a fair amount, but couldn't isolate a good example that tested digits. The intended text means 'more than 100 countries'.", 59 | "original": "أكثر من Ù Ù Ù¡ بلد", 60 | "fixed": "أكثر من ٠٠١ بلد", 61 | "expect": "fail" 62 | }, 63 | { 64 | "label": "Synthetic, false positive: the title of a manga, in weird capitalized romaji, with a non-breaking space", 65 | "comment": "Testing tells me I should worry about cases like this, though I haven't seen a real example. 
Searching for similar real text yields a lot of examples that actually come out fine.", 66 | "original": "MISUTÂ\u00a0AJIKKO", 67 | "fixed": "MISUTÂ\u00a0AJIKKO", 68 | "expect": "fail" 69 | } 70 | ] -------------------------------------------------------------------------------- /tests/test-cases/language-names.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Messy language names: Czech", 4 | "comment": "This and several following examples came from the same language selector", 5 | "original": "ÄŒeÅ¡tina", 6 | "fixed": "Čeština", 7 | "expect": "pass" 8 | }, 9 | { 10 | "label": "Messy language names: Gaelic", 11 | "comment": "note that if U+A0 is replaced by a space, it comes out slightly incorrectly as 'Gà idhlig'", 12 | "original": "GÃ\u00a0idhlig", 13 | "fixed": "Gàidhlig", 14 | "expect": "pass" 15 | }, 16 | { 17 | "label": "Messy language names: Lithuanian", 18 | "original": "Lietuvių", 19 | "fixed": "Lietuvių", 20 | "expect": "pass" 21 | }, 22 | { 23 | "label": "Messy language names: Slovak", 24 | "original": "SlovenÄ�ina", 25 | "fixed": "Sloven�ina", 26 | "expect": "pass" 27 | }, 28 | { 29 | "label": "Messy language names: Vietnamese", 30 | "original": "Tiếng Việt", 31 | "fixed": "Tiếng Việt", 32 | "expect": "pass" 33 | }, 34 | { 35 | "label": "Messy language names: Greek", 36 | "original": "Ελληνικά", 37 | "fixed": "Ελληνικά", 38 | "expect": "pass" 39 | }, 40 | { 41 | "label": "Messy language names: Bulgarian", 42 | "original": "българÑ�ки език", 43 | "fixed": "българ�ки език", 44 | "expect": "pass" 45 | }, 46 | { 47 | "label": "Messy language names: Russian", 48 | "original": "РуÑ�Ñ�кий", 49 | "fixed": "Ру��кий", 50 | "expect": "pass" 51 | }, 52 | { 53 | "label": "Messy language names: Serbian [Cyrillic]", 54 | "original": "CрпÑ�ки [ћирилицом]", 55 | "fixed": "Cрп�ки [ћирилицом]", 56 | "expect": "pass" 57 | }, 58 | { 59 | "label": "Messy language names: Hebrew", 60 | "original": "עברית", 61 | "fixed": "עברית", 62 | "expect": "pass" 63 | }, 64 | { 65 | "label": "Messy language names: Russian", 66 | "original": "РуÑ�Ñ�кий", 67 | "fixed": "Ру��кий", 68 | "expect": "pass" 69 | }, 70 | { 71 | "label": "Messy language names: Hindi", 72 | "comment": "My terminal has difficulty rendering the mostly-fixed text", 73 | "original": "हिनà¥�दी", 74 | "fixed": "\u0939\u093f\u0928\ufffd\u0926\u0940", 75 | "expect": "pass" 76 | }, 77 | { 78 | "label": "Messy language names: Tamil", 79 | "comment": "My terminal has difficulty rendering the mostly-fixed text", 80 | "original": "தமிழà¯�", 81 | "fixed": "\u0ba4\u0bae\u0bbf\u0bb4\ufffd", 82 | "expect": "pass" 83 | }, 84 | { 85 | "label": "Messy language names: Thai", 86 | "original": "ภาษาไทย", 87 | "fixed": "ภาษาไทย", 88 | "expect": "pass" 89 | }, 90 | { 91 | "label": "Messy language names: Simplified Chinese", 92 | "original": "简体ä¸\u00adæ–‡", 93 | "fixed": "简体中文", 94 | "expect": "pass" 95 | }, 96 | { 97 | "label": "Messy language names: Traditional Chinese", 98 | "original": "æ\u00ad£é«”ä¸\u00adæ–‡", 99 | "fixed": "正體中文", 100 | "expect": "pass" 101 | }, 102 | { 103 | "label": "Messy language names: Japanese", 104 | "original": "日本語", 105 | "fixed": "日本語", 106 | "expect": "pass" 107 | }, 108 | { 109 | "label": "Messy language names: Korean", 110 | "original": "한êµ\u00adì–´", 111 | "fixed": "한국어", 112 | "expect": "pass" 113 | }, 114 | { 115 | "label": "Messy language name in cp437: Czech", 116 | "comment": "A synthetic example, I suppose, but goes with the other language name tests", 117 | 
"original": "─îe┼ítina", 118 | "fixed": "Čeština", 119 | "expect": "pass" 120 | }, 121 | { 122 | "label": "Messy language name in cp437: Vietnamese", 123 | "original": "Tiß║┐ng Viß╗çt", 124 | "fixed": "Tiếng Việt", 125 | "expect": "pass" 126 | } 127 | ] -------------------------------------------------------------------------------- /tests/test-cases/negative.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Negative: Using diaereses as quotation marks in Greek", 4 | "comment": "Examples in this file might be detected as mojibake-like, but should not be changed", 5 | "original": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", 6 | "fixed": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", 7 | "expect": "pass" 8 | }, 9 | { 10 | "label": "Negative: Don't fix a multiplication symbol in quotes", 11 | "original": "higher values (“+” and “×” curves) in the superficial region", 12 | "fixed-encoding": "higher values (“+” and “×” curves) in the superficial region", 13 | "fixed": "higher values (\"+\" and \"×\" curves) in the superficial region", 14 | "expect": "pass" 15 | }, 16 | { 17 | "label": "Sort of negative: this inconsistent mojibake could be Latin-1 or MacRoman, and it was meant to be Latin-1, but it's safest to not decode it as either", 18 | "comment": "issue #202", 19 | "original": "Bremer/Mccoy – DrÃ¥ber", 20 | "fixed": "Bremer/Mccoy – DrÃ¥ber", 21 | "expect": "pass" 22 | }, 23 | { 24 | "label": "Negative: 'è' preceded by a non-breaking space is not a small capital Y", 25 | "original": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", 26 | "fixed": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", 27 | "expect": "pass" 28 | }, 29 | { 30 | "label": "Negative: multiplication sign and ellipsis", 31 | "comment": "Should not turn into a dot below", 32 | "original": "4288×…", 33 | "fixed": "4288×…", 34 | "expect": "pass" 35 | }, 36 | { 37 | "label": "Negative: accents are sometimes used as quotes", 38 | "comment": "Under a previous heuristic, this tested the CESU-8 decoder, which would try to decode it and fail when it hit the end of the string", 39 | "original": "``toda produzida pronta pra assa aí´´", 40 | "fixed": "``toda produzida pronta pra assa aí´´", 41 | "expect": "pass" 42 | }, 43 | { 44 | "label": "Negative: 'Õ' followed by an ellipsis", 45 | "comment": "Should not turn into the Armenian letter Յ", 46 | "original": "HUHLL Õ…", 47 | "fixed": "HUHLL Õ…", 48 | "expect": "pass" 49 | }, 50 | { 51 | "label": "Negative: 'Ê' followed by an ellipsis", 52 | "comment": "Should not turn into a squat reversed esh", 53 | "original": "RETWEET SE VOCÊ…", 54 | "fixed": "RETWEET SE VOCÊ…", 55 | "expect": "pass" 56 | }, 57 | { 58 | "label": "Negative: 'É' followed by an ellipsis", 59 | "comment": "Should not turn into 'MARQUɅ'", 60 | "original": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", 61 | "fixed": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", 62 | "expect": "pass" 63 | }, 64 | { 65 | "label": "Negative: 'Ó' followed by an ellipsis", 66 | "comment": "Should not turn into 'SӅ'", 67 | "original": "TEM QUE SEGUIR, SDV SÓ…", 68 | "fixed": "TEM QUE SEGUIR, SDV SÓ…", 69 | "expect": "pass" 70 | }, 71 | { 72 | "label": "Negative: 'É' followed by a curly apostrophe", 73 | "comment": "Should not turn into 'ZZAJɒs'", 74 | "original": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", 75 | "fixed-encoding": "Join ZZAJÉ’s Official Fan List and receive news, 
events, and more!", 76 | "fixed": "Join ZZAJÉ's Official Fan List and receive news, events, and more!", 77 | "expect": "pass" 78 | }, 79 | { 80 | "label": "Negative: 'é' preceded by curly apostrophe", 81 | "comment": "Should not turn into 'LՎpisode'", 82 | "original": "L’épisode 8 est trop fou ouahh", 83 | "fixed-encoding": "L’épisode 8 est trop fou ouahh", 84 | "fixed": "L'épisode 8 est trop fou ouahh", 85 | "expect": "pass" 86 | }, 87 | { 88 | "label": "Negative: three raised eyebrows or something?", 89 | "comment": "Should not turn into private use character U+F659", 90 | "original": "Ôôô VIDA MINHA", 91 | "fixed": "Ôôô VIDA MINHA", 92 | "expect": "pass" 93 | }, 94 | { 95 | "label": "Negative: copyright sign preceded by non-breaking space", 96 | "comment": "Should not turn into 'ʩ'", 97 | "original": "[x]\u00a0©", 98 | "fixed": "[x]\u00a0©", 99 | "expect": "pass" 100 | }, 101 | { 102 | "label": "Negative: en dash and infinity sign", 103 | "comment": "Should not turn into '2012Ѱ'", 104 | "original": "2012—∞", 105 | "fixed": "2012—∞", 106 | "expect": "pass" 107 | }, 108 | { 109 | "label": "Negative: This Е is a Ukrainian letter, but nothing else is wrong", 110 | "original": "SENSЕ - Oleg Tsedryk", 111 | "fixed": "SENSЕ - Oleg Tsedryk", 112 | "expect": "pass" 113 | }, 114 | { 115 | "label": "Negative: angry face", 116 | "comment": "The face should not turn into '`«'", 117 | "original": "OK??:( `¬´ ):", 118 | "fixed": "OK??:( `¬´ ):", 119 | "expect": "pass" 120 | }, 121 | { 122 | "label": "Negative, synthetic: face with glasses and a raised eyebrow", 123 | "original": "( o¬ô )", 124 | "fixed": "( o¬ô )", 125 | "expect": "pass" 126 | }, 127 | { 128 | "label": "Negative: triangle and degree sign", 129 | "comment": "I'm not really sure what it *is* supposed to be, but it's not 'ơ'", 130 | "original": "∆°", 131 | "fixed": "∆°", 132 | "expect": "pass" 133 | }, 134 | { 135 | "label": "Negative: Portuguese with inverted question mark", 136 | "comment": "Former false positive - it should not turn into 'QUEM ɿ'", 137 | "original": "ESSE CARA AI QUEM É¿", 138 | "fixed": "ESSE CARA AI QUEM É¿", 139 | "expect": "pass" 140 | }, 141 | { 142 | "label": "Negative: Portuguese with acute accents as quotation marks", 143 | "comment": "Former false positive - the end should not turn into a superscript H", 144 | "original": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", 145 | "fixed": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", 146 | "expect": "pass" 147 | }, 148 | { 149 | "label": "Negative: Finnish Ä followed by a non-breaking space", 150 | "comment": "Former false positive - should not become a G with a dot", 151 | "original": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", 152 | "fixed": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", 153 | "expect": "pass" 154 | }, 155 | { 156 | "label": "Negative: multiplying by currency", 157 | "comment": "Former false positive - should not become the Hebrew letter 'final pe'", 158 | "original": "Offering 5×£35 pin ups", 159 | "fixed": "Offering 5×£35 pin ups", 160 | "expect": "pass" 161 | }, 162 | { 163 | "label": "Negative: registered chocolate brand name", 164 | "comment": "Former false positive - should not become the IPA letter 'lezh'", 165 | "original": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", 166 | "fixed": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", 167 | "expect": "pass" 168 | }, 169 | { 170 | "label": "Negative: it looks like 
Windows-1257 mojibake but someone writes their name this way", 171 | "comment": "Should not become a cedilla", 172 | "original": "Connect with Āø on Facebook", 173 | "fixed": "Connect with Āø on Facebook", 174 | "expect": "pass" 175 | }, 176 | { 177 | "label": "Mostly negative: we only need to fix C1 control characters", 178 | "comment": "We should not decode 'é\u0085 ' as '酠'", 179 | "original": "C'est vrai que nous n'en avons pas encore beaucoup parlé\u0085 Tu sais, ça fait de nombreuses années", 180 | "fixed": "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années", 181 | "expect": "pass" 182 | }, 183 | { 184 | "label": "Negative: We don't fix à in all contexts", 185 | "original": "C O N C L U S à O", 186 | "fixed": "C O N C L U S à O", 187 | "expect": "pass" 188 | }, 189 | { 190 | "label": "Negative: Two concatenated strings", 191 | "comment": "Should not turn into 'fratarak᧠141'", 192 | "original": "Oborzos, per. Vahbarz, frataraká§ 141", 193 | "fixed": "Oborzos, per. Vahbarz, frataraká§ 141", 194 | "expect": "pass" 195 | }, 196 | { 197 | "label": "Negative: Indonesian leetspeak", 198 | "original": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", 199 | "fixed": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", 200 | "expect": "pass" 201 | }, 202 | { 203 | "label": "Negative: math in Unicode", 204 | "comment": "This isn't mojibake, it's an actual equation", 205 | "original": "(-1/2)! = √π", 206 | "fixed": "(-1/2)! = √π", 207 | "expect": "pass" 208 | }, 209 | { 210 | "label": "Negative: Leet line-art", 211 | "comment": "The heuristic before v6 loved to 'fix' this and decode it as 'ôaſaſaſaſa'", 212 | "original": "├┤a┼┐a┼┐a┼┐a┼┐a", 213 | "fixed": "├┤a┼┐a┼┐a┼┐a┼┐a", 214 | "expect": "pass" 215 | } 216 | ] -------------------------------------------------------------------------------- /tests/test-cases/synthetic.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Synthetic: we can recognize à in some cases when it's the only mojibake", 4 | "comment": "Examples in this file were made up to test something, instead of found in the wild", 5 | "original": "voilà le travail", 6 | "fixed": "voilà le travail", 7 | "expect": "pass" 8 | }, 9 | { 10 | "label": "Synthetic: we can recognize à at the end of a word when it absorbs a following space", 11 | "original": "voilà le travail", 12 | "fixed": "voilà le travail", 13 | "expect": "pass" 14 | }, 15 | { 16 | "label": "Synthetic: Hebrew UTF-8 / Windows-1250 mojibake", 17 | "original": "בהודעה", 18 | "fixed": "בהודעה", 19 | "expect": "pass" 20 | }, 21 | { 22 | "label": "Synthetic: Hebrew UTF-8 / MacRoman mojibake", 23 | "original": "◊ë◊î◊ï◊ì◊¢◊î", 24 | "fixed": "בהודעה", 25 | "expect": "pass" 26 | }, 27 | { 28 | "label": "Synthetic: Hebrew UTF-8 / Latin-1 mojibake", 29 | "comment": "This example uses low-numbered codepoints to spell 'ABBA' in Hebrew, so that it falls into the range where Latin-1 is different from Windows-1252. 
As a bonus, this example looks right even if your RTL text rendering isn't working.", 30 | "original": "×\u0090×\u0091×\u0091×\u0090", 31 | "fixed": "אבבא", 32 | "expect": "pass" 33 | }, 34 | { 35 | "label": "Synthetic: Arabic UTF-8 / Windows-1252 mojibake", 36 | "original": "رسالة", 37 | "fixed": "رسالة", 38 | "expect": "pass" 39 | }, 40 | { 41 | "label": "Synthetic: Arabic UTF-8 / Windows-1250 mojibake", 42 | "original": "رسالة", 43 | "fixed": "رسالة", 44 | "expect": "pass" 45 | }, 46 | { 47 | "label": "Synthetic: Arabic UTF-8 / MacRoman mojibake", 48 | "original": "ÿ±ÿ≥ÿߟÑÿ©", 49 | "fixed": "رسالة", 50 | "expect": "pass" 51 | }, 52 | { 53 | "label": "Synthetic, negative: Brontë's name does not end with a Korean syllable", 54 | "comment": "The original example of why ftfy needs heuristics", 55 | "original": "I'm not such a fan of Charlotte Brontë…”", 56 | "fixed-encoding": "I'm not such a fan of Charlotte Brontë…”", 57 | "fixed": "I'm not such a fan of Charlotte Brontë…\"", 58 | "expect": "pass" 59 | }, 60 | { 61 | "label": "Synthetic, negative: hypothetical Swedish product name", 62 | "comment": "This used to be a constructed example of a false positive, until you added another symbol", 63 | "original": "AHÅ™, the new sofa from IKEA", 64 | "fixed": "AHÅ™, the new sofa from IKEA", 65 | "expect": "pass" 66 | }, 67 | { 68 | "label": "Synthetic, negative: Ukrainian capital letters", 69 | "comment": "We need to fix Windows-1251 conservatively, or else this decodes as '²ʲ'", 70 | "original": "ВІКІ is Ukrainian for WIKI", 71 | "fixed": "ВІКІ is Ukrainian for WIKI", 72 | "expect": "pass" 73 | }, 74 | { 75 | "label": "Synthetic, negative: don't leak our internal use of byte 0x1A", 76 | "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", 77 | "original": "These control characters \u001a are apparently intentional \u0081", 78 | "fixed-encoding": "These control characters \u001a are apparently intentional \u0081", 79 | "fixed": "These control characters are apparently intentional \u0081", 80 | "expect": "pass" 81 | }, 82 | { 83 | "label": "Synthetic, negative: U+1A on its own", 84 | "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", 85 | "original": "Here's a control character: \u001a", 86 | "fixed-encoding": "Here's a control character: \u001a", 87 | "fixed": "Here's a control character: ", 88 | "expect": "pass" 89 | }, 90 | { 91 | "label": "Synthetic, negative: A-with-circle as an Angstrom sign", 92 | "comment": "Should not turn into '10 ŗ'", 93 | "original": "a radius of 10 Å—", 94 | "fixed": "a radius of 10 Å—", 95 | "expect": "pass" 96 | }, 97 | { 98 | "label": "Synthetic, negative: Spanish with exclamation points on the wrong sides", 99 | "original": "!YO SÉ¡", 100 | "fixed": "!YO SÉ¡", 101 | "expect": "pass" 102 | }, 103 | { 104 | "label": "Synthetic: fix text with backslashes in it", 105 | "comment": "Tests for a regression on a long-ago bug", 106 | "original": "<40\\% vs \u00e2\u0089\u00a540\\%", 107 | "fixed": "<40\\% vs ≥40\\%", 108 | "expect": "pass" 109 | }, 110 | { 111 | "label": "Synthetic: curly quotes with mismatched encoding glitches in Latin-1", 112 | "original": "\u00e2\u0080\u009cmismatched quotes\u0085\u0094", 113 | "fixed-encoding": "“mismatched quotes…”", 114 | "fixed": "\"mismatched quotes…\"", 115 | "expect": "pass" 116 | }, 117 | { 118 | "label": "Synthetic: curly quotes with mismatched encoding glitches in 
Windows-1252", 119 | "original": "“mismatched quotes…”", 120 | "fixed-encoding": "“mismatched quotes…”", 121 | "fixed": "\"mismatched quotes…\"", 122 | "expect": "pass" 123 | }, 124 | { 125 | "label": "Synthetic: lossy decoding in sloppy-windows-1252", 126 | "original": "“lossy decodingâ€�", 127 | "fixed-encoding": "“lossy decoding�", 128 | "fixed": "\"lossy decoding�", 129 | "expect": "pass" 130 | }, 131 | { 132 | "label": "Synthetic: French word for August in windows-1252", 133 | "original": "août", 134 | "fixed-encoding": "août", 135 | "fixed": "août", 136 | "expect": "pass" 137 | }, 138 | { 139 | "label": "Synthetic: French word for hotel in all-caps windows-1252", 140 | "original": "HÔTEL", 141 | "fixed-encoding": "HÔTEL", 142 | "fixed": "HÔTEL", 143 | "expect": "pass" 144 | }, 145 | { 146 | "label": "Synthetic: Scottish Gaelic word for 'subject' in all-caps windows-1252", 147 | "original": "CÙIS", 148 | "fixed-encoding": "CÙIS", 149 | "fixed": "CÙIS", 150 | "expect": "pass" 151 | }, 152 | { 153 | "label": "Synthetic, negative: Romanian word before a non-breaking space", 154 | "comment": "The word literally means 'not even once', which might be a good recommendation about fixing Romanian mojibake", 155 | "original": "NICIODATĂ\u00a0", 156 | "fixed": "NICIODATĂ\u00a0", 157 | "expect": "pass" 158 | }, 159 | { 160 | "label": "Synthetic, negative: Be careful around curly apostrophes", 161 | "comment": "It shouldn't end up saying 'a lot of Òs'", 162 | "original": "There are a lot of Ã’s in mojibake text", 163 | "fixed-encoding": "There are a lot of Ã’s in mojibake text", 164 | "fixed": "There are a lot of Ã's in mojibake text", 165 | "expect": "pass" 166 | }, 167 | { 168 | "label": "Synthetic, negative: Romanian word before a trademark sign", 169 | "comment": "We would change 'DATÙ' to 'DATÙ' if it passed the badness heuristic", 170 | "original": "NICIODATĂ™", 171 | "fixed": "NICIODATĂ™", 172 | "expect": "pass" 173 | }, 174 | { 175 | "label": "Synthetic, negative: Lithuanian word before a trademark sign", 176 | "comment": "Similar to the above example. Shouldn't turn into U+0619 ARABIC SMALL DAMMA", 177 | "original": "TRANSFORMATORIŲ™", 178 | "fixed": "TRANSFORMATORIŲ™", 179 | "expect": "pass" 180 | }, 181 | { 182 | "label": "Synthetic, negative: Norwegian capitalized nonsense", 183 | "comment": "We're shouting that the island of Håøya is gullible. It should not turn into 'HŨYA ER BLŨYD'.", 184 | "original": "HÅØYA ER BLÅØYD", 185 | "fixed": "HÅØYA ER BLÅØYD", 186 | "expect": "pass" 187 | }, 188 | { 189 | "label": "Synthetic, negative: raised eyebrow kaomoji", 190 | "original": "Ō¬o", 191 | "fixed": "Ō¬o", 192 | "expect": "pass" 193 | }, 194 | { 195 | "label": "Synthetic, negative: Camel-cased Serbian that looks like a UTF-8 / Windows-1251 mixup", 196 | "comment": "I made this text up, but it seems like it means 'HelloDevil'. Could be a username or something.", 197 | "original": "ПоздравЂаво", 198 | "fixed": "ПоздравЂаво", 199 | "expect": "pass" 200 | }, 201 | { 202 | "label": "Synthetic: mojibake with trademark sign at the end of a word", 203 | "comment": "I recall the correct version of this text from a sign in the movie Amélie. 
Now we can help her twin Amélie, who makes mojibaked signs.", 204 | "original": "OÙ ET QUAND?", 205 | "fixed": "OÙ ET QUAND?", 206 | "expect": "pass" 207 | } 208 | ] -------------------------------------------------------------------------------- /tests/test_bytes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ftfy import guess_bytes 4 | from ftfy.bad_codecs.utf8_variants import IncrementalDecoder 5 | 6 | TEST_ENCODINGS = ["utf-16", "utf-8", "sloppy-windows-1252"] 7 | 8 | TEST_STRINGS = [ 9 | "Renée\nFleming", 10 | "Noël\nCoward", 11 | "Señor\nCardgage", 12 | "€ • £ • ¥", 13 | "¿Qué?", 14 | ] 15 | 16 | 17 | @pytest.mark.parametrize("string", TEST_STRINGS) 18 | def test_guess_bytes(string): 19 | for encoding in TEST_ENCODINGS: 20 | result_str, result_encoding = guess_bytes(string.encode(encoding)) 21 | assert result_str == string 22 | assert result_encoding == encoding 23 | 24 | if "\n" in string: 25 | old_mac_bytes = string.replace("\n", "\r").encode("macroman") 26 | result_str, result_encoding = guess_bytes(old_mac_bytes) 27 | assert result_str == string.replace("\n", "\r") 28 | 29 | 30 | def test_guess_bytes_null(): 31 | bowdlerized_null = b"null\xc0\x80separated" 32 | result_str, result_encoding = guess_bytes(bowdlerized_null) 33 | assert result_str == "null\x00separated" 34 | assert result_encoding == "utf-8-variants" 35 | 36 | 37 | def test_incomplete_sequences(): 38 | test_bytes = b"surrogates: \xed\xa0\x80\xed\xb0\x80 / null: \xc0\x80" 39 | test_string = "surrogates: \U00010000 / null: \x00" 40 | 41 | # Test that we can feed this string to decode() in multiple pieces, and no 42 | # matter where the break between those pieces is, we get the same result. 43 | for split_point in range(len(test_string) + 1): 44 | left = test_bytes[:split_point] 45 | right = test_bytes[split_point:] 46 | 47 | decoder = IncrementalDecoder() 48 | got = decoder.decode(left, final=False) 49 | got += decoder.decode(right) 50 | assert got == test_string 51 | -------------------------------------------------------------------------------- /tests/test_characters.py: -------------------------------------------------------------------------------- 1 | from ftfy import ( 2 | fix_and_explain, 3 | fix_encoding, 4 | fix_text, 5 | ) 6 | from ftfy.chardata import possible_encoding 7 | from ftfy.fixes import fix_surrogates, remove_control_chars 8 | 9 | 10 | def test_possible_encoding(): 11 | for codept in range(256): 12 | char = chr(codept) 13 | assert possible_encoding(char, "latin-1") 14 | 15 | 16 | def test_byte_order_mark(): 17 | assert fix_encoding("") == "\ufeff" 18 | 19 | 20 | def test_control_chars(): 21 | text = ( 22 | "\ufeffSometimes, \ufffcbad ideas \x7f\ufffalike these characters\ufffb " 23 | "\u206aget standardized.\r\n" 24 | ) 25 | fixed = "Sometimes, bad ideas like these characters get standardized.\r\n" 26 | assert remove_control_chars(text) == fixed 27 | 28 | 29 | def test_welsh_flag(): 30 | # ftfy used to remove "tag characters", but they have been repurposed in the 31 | # "Flag of England", "Flag of Scotland", and "Flag of Wales" emoji sequences. 32 | text = "This flag has a dragon on it 🏴󠁧󠁢󠁷󠁬󠁳󠁿" 33 | assert remove_control_chars(text) == text 34 | 35 | 36 | def test_ohio_flag(): 37 | # I did not expect to find the "Flag of Ohio" emoji in the wild but there it is. 38 | # Test that this emoji (which no emoji database believes has been implemented) 39 | # passes through unchanged. 
40 |     text = "#superman #ohio 🏴\U000e0075\U000e0073\U000e006f\U000e0068\U000e007f #cleveland #usa 🇺🇸" 41 |     assert fix_text(text) == text 42 | 43 | 44 | def test_surrogates(): 45 |     assert fix_surrogates("\udbff\udfff") == "\U0010ffff" 46 |     assert fix_surrogates("\ud800\udc00") == "\U00010000" 47 | 48 | 49 | def test_color_escapes(): 50 |     fixed, plan = fix_and_explain("\001\033[36;44mfoo") 51 |     print(plan) 52 |     assert fixed == "foo" 53 |     assert plan == [ 54 |         ("apply", "remove_terminal_escapes"), 55 |         ("apply", "remove_control_chars"), 56 |     ] 57 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | # Get the filename of 'face.txt', an example of mojibake 8 | THIS_DIR = Path(__file__).parent 9 | TEST_FILENAME = THIS_DIR / "face.txt" 10 | CORRECT_OUTPUT = os.linesep.join(["┒(⌣˛⌣)┎", ""]) 11 | FAILED_OUTPUT = os.linesep.join( 12 |     [ 13 |         "ftfy error:", 14 |         "This input couldn't be decoded as 'windows-1252'. We got the following error:", 15 |         "", 16 |         "    'charmap' codec can't decode byte 0x9d in position 4: character maps to <undefined>", 17 |         "", 18 |         "ftfy works best when its input is in a known encoding. You can use `ftfy -g`", 19 |         "to guess, if you're desperate. Otherwise, give the encoding name with the", 20 |         "`-e` option, such as `ftfy -e latin-1`.", 21 |         "", 22 |     ] 23 | ) 24 | 25 | 26 | def get_command_output(args, stdin=None): 27 |     return subprocess.check_output(args, stdin=stdin, stderr=subprocess.STDOUT, timeout=5).decode( 28 |         "utf-8" 29 |     ) 30 | 31 | 32 | def test_basic(): 33 |     output = get_command_output(["ftfy", TEST_FILENAME]) 34 |     assert output == CORRECT_OUTPUT 35 | 36 | 37 | def test_guess_bytes(): 38 |     output = get_command_output(["ftfy", "-g", TEST_FILENAME]) 39 |     assert output == CORRECT_OUTPUT 40 | 41 | 42 | def test_alternate_encoding(): 43 |     # The file isn't really in Windows-1252. But that's a problem ftfy 44 |     # can fix, if it's allowed to be sloppy when reading the file. 45 |     output = get_command_output(["ftfy", "-e", "sloppy-windows-1252", TEST_FILENAME]) 46 |     assert output == CORRECT_OUTPUT 47 | 48 | 49 | def test_wrong_encoding(): 50 |     # It's more of a problem when the file doesn't actually decode.
51 | with pytest.raises(subprocess.CalledProcessError) as exception: 52 | get_command_output(["ftfy", "-e", "windows-1252", TEST_FILENAME]) 53 | assert exception.value.output.decode("utf-8") == FAILED_OUTPUT 54 | 55 | 56 | def test_same_file(): 57 | with pytest.raises(subprocess.CalledProcessError) as exception: 58 | get_command_output(["ftfy", TEST_FILENAME, "-o", TEST_FILENAME]) 59 | error = exception.value.output.decode("utf-8") 60 | assert error.startswith("ftfy error:") 61 | assert "Can't read and write the same file" in error 62 | 63 | 64 | def test_stdin(): 65 | with TEST_FILENAME.open("rb") as infile: 66 | output = get_command_output(["ftfy"], stdin=infile) 67 | assert output == CORRECT_OUTPUT 68 | -------------------------------------------------------------------------------- /tests/test_encodings.py: -------------------------------------------------------------------------------- 1 | from ftfy import bad_codecs, guess_bytes 2 | 3 | 4 | def test_cesu8(): 5 | cls1 = bad_codecs.search_function("cesu8").__class__ 6 | cls2 = bad_codecs.search_function("cesu-8").__class__ 7 | assert cls1 == cls2 8 | 9 | test_bytes = b"\xed\xa6\x9d\xed\xbd\xb7 is an unassigned character, and \xc0\x80 is null" 10 | test_text = "\U00077777 is an unassigned character, and \x00 is null" 11 | assert test_bytes.decode("cesu8") == test_text 12 | 13 | 14 | def test_russian_crash(): 15 | thebytes = b"\xe8\xed\xe2\xe5\xed\xf2\xe0\xf0\xe8\xe7\xe0\xf6\xe8\xff " 16 | # We don't care what the result is, but this shouldn't crash 17 | thebytes.decode("utf-8-variants", "replace") 18 | 19 | # This shouldn't crash either 20 | guess_bytes(thebytes) 21 | -------------------------------------------------------------------------------- /tests/test_entities.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ftfy import fix_text, fix_text_segment 4 | from ftfy.fixes import unescape_html 5 | 6 | 7 | def test_entities(): 8 | example = "&\n\n&" 9 | assert fix_text(example) == "&\n\n&" 10 | assert fix_text_segment(example) == "&\n\n&" 11 | 12 | assert fix_text(example, unescape_html=True) == "&\n\n&" 13 | assert fix_text_segment(example, unescape_html=True) == "&\n\n&" 14 | 15 | assert fix_text(example, unescape_html=False) == "&\n\n&" 16 | assert fix_text_segment(example, unescape_html=False) == "&\n\n&" 17 | 18 | assert fix_text_segment("<>", unescape_html=False) == "<>" 19 | assert fix_text_segment("<>", unescape_html=True) == "<>" 20 | assert fix_text_segment("<>") == "<>" 21 | assert fix_text_segment("jednocześnie") == "jednocześnie" 22 | assert fix_text_segment("JEDNOCZEŚNIE") == "JEDNOCZEŚNIE" 23 | assert fix_text_segment("ellipsis…", normalization="NFKC") == "ellipsis..." 24 | assert fix_text_segment("ellipsis…", normalization="NFKC") == "ellipsis..." 
25 | assert fix_text_segment("broken") == "broken\x81" 26 | assert fix_text_segment("&amp;amp;") == "&" 27 | assert unescape_html("euro €") == "euro €" 28 | assert unescape_html("EURO &EURO;") == "EURO €" 29 | assert unescape_html("not an entity x6;") == "not an entity x6;" 30 | assert unescape_html("JEDNOCZE&SACUTE;NIE") == "JEDNOCZEŚNIE" 31 | assert unescape_html("V&SCARON;ICHNI") == "VŠICHNI" 32 | assert unescape_html("￿") == "" 33 | assert unescape_html("�") == "\ufffd" 34 | assert ( 35 | fix_text_segment("this is just informal english ¬ html") 36 | == "this is just informal english ¬ html" 37 | ) 38 | 39 | 40 | def test_old_parameter_name(): 41 | example = "&\n\n&" 42 | with pytest.deprecated_call(): 43 | assert fix_text(example, fix_entities=True) == "&\n\n&" 44 | with pytest.deprecated_call(): 45 | assert fix_text(example, fix_entities=False) == "&\n\n&" 46 | -------------------------------------------------------------------------------- /tests/test_examples_in_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test ftfy's fixes using the data in `test_cases.json`. 3 | 4 | I collected many test cases by listening to the Twitter streaming API for 5 | millions of tweets, picking out examples with high weirdness, and seeing what 6 | ftfy decoded them to. There are some impressive things that can happen to text, 7 | even in an ecosystem that is supposedly entirely UTF-8. 8 | 9 | Some examples come from the Common Crawl (particularly those involving 10 | Windows-1250 mojibake, which is more common on arbitrary Web pages than on 11 | Twitter), and some examples marked as 'synthetic' are contrived to test 12 | particular features of ftfy. 13 | 14 | Each test case is a dictionary containing the following items: 15 | 16 | - "label": a label that will identify the test case in nosetests 17 | - "original": the text to be ftfy'd 18 | - "fixed": what the result of ftfy.fix_text should be on this text 19 | 20 | There are also two optional fields: 21 | 22 | - "fixed-encoding": what the result of just ftfy.fix_encoding should be. 23 | If missing, it will be considered to be the same as "fixed". 24 | - "comment": possibly-enlightening commentary on the test case. 
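Every test case also carries an "expect" field, set to "pass" or "fail", recording
whether ftfy is currently expected to get the case right; the "fail" cases are run
as expected failures. As an illustration, the first entry of
tests/test-cases/language-names.json looks like this:

    {
        "label": "Messy language names: Czech",
        "comment": "This and several following examples came from the same language selector",
        "original": "ÄŒeÅ¡tina",
        "fixed": "Čeština",
        "expect": "pass"
    }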
25 | """ 26 | 27 | import json 28 | from pathlib import Path 29 | 30 | import pytest 31 | 32 | from ftfy import apply_plan, fix_and_explain, fix_encoding_and_explain, fix_text 33 | 34 | THIS_DIR = Path(__file__).parent 35 | TEST_CASE_DIR = THIS_DIR / "test-cases" 36 | 37 | 38 | def load_test_data() -> list[dict]: 39 | test_data = [] 40 | for filepath in TEST_CASE_DIR.glob("*.json"): 41 | test_data.extend(json.load(filepath.open())) 42 | return test_data 43 | 44 | 45 | TEST_DATA = load_test_data() 46 | 47 | TESTS_THAT_PASS = [test for test in TEST_DATA if test["expect"] == "pass"] 48 | TESTS_THAT_FAIL = [test for test in TEST_DATA if test["expect"] == "fail"] 49 | 50 | 51 | @pytest.mark.parametrize("test_case", TEST_DATA) 52 | def test_well_formed_example(test_case): 53 | assert test_case["expect"] in ("pass", "fail") 54 | 55 | 56 | @pytest.mark.parametrize("test_case", TESTS_THAT_PASS) 57 | def test_json_example(test_case): 58 | # Run one example from the data file 59 | orig = test_case["original"] 60 | fixed = test_case["fixed"] 61 | 62 | # Make sure that we can fix the text as intended 63 | assert fix_text(orig) == fixed 64 | 65 | # Make sure that fix_and_explain outputs a plan that we can successfully 66 | # run to reproduce its result 67 | fixed_output, plan = fix_and_explain(orig) 68 | assert apply_plan(orig, plan) == fixed_output 69 | 70 | # Do the same for fix_encoding_and_explain 71 | encoding_fix, plan = fix_encoding_and_explain(orig) 72 | assert apply_plan(orig, plan) == encoding_fix 73 | 74 | # Ask for the encoding fix a different way, by disabling all the other steps 75 | # in the config object 76 | assert ( 77 | fix_text( 78 | orig, 79 | unescape_html=False, 80 | remove_terminal_escapes=False, 81 | fix_character_width=False, 82 | fix_latin_ligatures=False, 83 | uncurl_quotes=False, 84 | fix_line_breaks=False, 85 | fix_surrogates=False, 86 | remove_control_chars=False, 87 | normalization=None, 88 | ) 89 | == encoding_fix 90 | ) 91 | 92 | # Make sure we can decode the text as intended 93 | assert fix_text(orig) == fixed 94 | assert encoding_fix == test_case.get("fixed-encoding", fixed) 95 | 96 | # Make sure we can decode as intended even with an extra layer of badness 97 | extra_bad = orig.encode("utf-8").decode("latin-1") 98 | assert fix_text(extra_bad) == fixed 99 | 100 | 101 | @pytest.mark.parametrize("test_case", TESTS_THAT_FAIL) 102 | @pytest.mark.xfail(strict=True) 103 | def test_failing_json_example(test_case): 104 | # Run an example from the data file that we believe will fail, due to 105 | # ftfy's heuristic being insufficient 106 | orig = test_case["original"] 107 | fixed = test_case["fixed"] 108 | 109 | encoding_fix, plan = fix_encoding_and_explain(orig) 110 | assert encoding_fix == test_case.get("fixed-encoding", fixed) 111 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py39, py310, py311, py312, py313 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | wcwidth 8 | commands = pytest 9 | --------------------------------------------------------------------------------