├── .github └── workflows │ └── publish.yml ├── .gitignore ├── .mailmap ├── .readthedocs.yaml ├── CHANGELOG.md ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── _static │ └── css │ │ └── custom.css ├── avoid.rst ├── bad_encodings.rst ├── cite.rst ├── cli.rst ├── conf.py ├── config.rst ├── detect.rst ├── encodings.rst ├── explain.rst ├── fixes.rst ├── heuristic.rst ├── images │ └── shipping-label.png └── index.rst ├── ftfy ├── __init__.py ├── bad_codecs │ ├── __init__.py │ ├── sloppy.py │ └── utf8_variants.py ├── badness.py ├── chardata.py ├── cli.py ├── fixes.py ├── formatting.py └── py.typed ├── mypy.ini ├── notebook ├── excel-export.png └── ftfy talk.ipynb ├── notes └── mysteries.txt ├── pyproject.toml ├── pytest.ini ├── scripts └── char_data_table.py ├── tests ├── __init__.py ├── face.txt ├── test-cases │ ├── README.md │ ├── in-the-wild.json │ ├── known-failures.json │ ├── language-names.json │ ├── negative.json │ └── synthetic.json ├── test_bytes.py ├── test_characters.py ├── test_cli.py ├── test_encodings.py ├── test_entities.py └── test_examples_in_json.py ├── tox.ini └── uv.lock /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python distribution 📦 to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | jobs: 9 | build: 10 | name: Build distribution 📦 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.x" 19 | - name: Install pypa/build 20 | run: >- 21 | python3 -m 22 | pip install 23 | build 24 | --user 25 | - name: Build a binary wheel and a source tarball 26 | run: python3 -m build 27 | - name: Store the distribution packages 28 | uses: actions/upload-artifact@v4 29 | with: 30 | name: python-package-distributions 31 | path: dist/ 32 | 33 | publish-to-pypi: 34 | name: >- 35 | Publish Python distribution 📦 to PyPI 36 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 37 | needs: 38 | - build 39 | runs-on: ubuntu-latest 40 | environment: 41 | name: pypi 42 | url: https://pypi.org/p/ftfy 43 | permissions: 44 | id-token: write # IMPORTANT: mandatory for trusted publishing 45 | 46 | steps: 47 | - name: Download all the dists 48 | uses: actions/download-artifact@v4 49 | with: 50 | name: python-package-distributions 51 | path: dist/ 52 | - name: Publish distribution 📦 to PyPI 53 | uses: pypa/gh-action-pypi-publish@release/v1 54 | 55 | github-release: 56 | name: >- 57 | Sign the Python distribution 📦 with Sigstore 58 | and upload them to GitHub Release 59 | needs: 60 | - publish-to-pypi 61 | runs-on: ubuntu-latest 62 | 63 | permissions: 64 | contents: write # IMPORTANT: mandatory for making GitHub Releases 65 | id-token: write # IMPORTANT: mandatory for sigstore 66 | 67 | steps: 68 | - name: Download all the dists 69 | uses: actions/download-artifact@v4 70 | with: 71 | name: python-package-distributions 72 | path: dist/ 73 | - name: Sign the dists with Sigstore 74 | uses: sigstore/gh-action-sigstore-python@v3.0.0 75 | with: 76 | inputs: >- 77 | ./dist/*.tar.gz 78 | ./dist/*.whl 79 | - name: Create GitHub Release 80 | env: 81 | GITHUB_TOKEN: ${{ github.token }} 82 | run: >- 83 | gh release create 84 | '${{ github.ref_name }}' 85 | --repo '${{ github.repository }}' 86 | --notes "" 87 | - name: Upload artifact signatures to GitHub Release 88 | env: 89 | GITHUB_TOKEN: ${{ github.token }} 90 | # Upload to GitHub Release using the `gh` CLI. 
91 | # `dist/` contains the built packages, and the 92 | # sigstore-produced signatures and certificates. 93 | run: >- 94 | gh release upload 95 | '${{ github.ref_name }}' dist/** 96 | --repo '${{ github.repository }}' 97 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | .coverage 4 | dist 5 | *.egg-info 6 | build 7 | _build 8 | twitterlogs 9 | .eggs 10 | .pytest_cache 11 | .tox 12 | specimens 13 | .vscode 14 | .python-version 15 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | # Robyn has used different names and e-mail addresses in the course of this project. Map them all to her current name and e-mail. 2 | Robyn Speer 3 | Robyn Speer 4 | Robyn Speer 5 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-24.04 11 | tools: 12 | python: "3.11" 13 | commands: 14 | - asdf plugin add uv 15 | - asdf install uv latest 16 | - asdf global uv latest 17 | - uv venv 18 | - uv sync 19 | - .venv/bin/python -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html 20 | 21 | # Build documentation in the docs/ directory with Sphinx 22 | sphinx: 23 | configuration: docs/conf.py 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Version 6.3.1 (October 25, 2024) 2 | 3 | - Fixed `license` metadata field in pyproject.toml. 4 | - Removed extraneous files from the `hatchling` sdist output. 5 | 6 | ## Version 6.3.0 (October 8, 2024) 7 | 8 | - Switched packaging from poetry to uv. 9 | - Uses modern Python packaging exclusively (no setup.py). 10 | - Added support for mojibake in Windows-1257 (Baltic). 11 | - Detects mojibake for "Ü" in an uppercase word, such as "ZURÜCK". 12 | - Expanded a heuristic that notices improbable punctuation. 13 | - Fixed a false positive involving two concatenated strings, one of which began with the § sign. 14 | - Rewrote `chardata.py` to be more human-readable and debuggable, instead of being full of 15 | keysmash-like character sets. 16 | 17 | ## Version 6.2.3 (August 5, 2024) 18 | 19 | - Updated PyPI metadata. 20 | 21 | ## Version 6.2.2 (August 5, 2024) 22 | 23 | - Updated Read the Docs config so that docs might build again. 24 | 25 | ## Version 6.2.1 (August 5, 2024) 26 | 27 | - Updated setup.py and tox.ini to indicate support for Python 3.8 through 3.13. 28 | - Replaced the text file used in CLI tests with a better one that tests the same issue. 29 | - Lints and auto-formatting using ruff. 30 | - Packaging and test fixes by Michał Górny. 31 | 32 | ## Version 6.2.0 (March 15, 2024) 33 | 34 | - Fixed a case where an en-dash and a space near other mojibake would be 35 | interpreted (probably incorrectly) as MacRoman mojibake. 36 | - Added [project.urls] metadata to pyproject.toml. 
37 | - README contains license clarifications for entitled jerks. 38 | 39 | ## Version 6.1.3 (November 21, 2023) 40 | 41 | - Updated wcwidth. 42 | - Switched to the Apache 2.0 license. 43 | - Dropped support for Python 3.7. 44 | 45 | ## Version 6.1.2 (February 17, 2022) 46 | 47 | - Added type information for `guess_bytes`. 48 | 49 | ## Version 6.1.1 (February 9, 2022) 50 | 51 | - Updated the heuristic to fix the letter ß in UTF-8/MacRoman mojibake, 52 | which had regressed since version 5.6. 53 | 54 | - Packaging fixes to pyproject.toml. 55 | 56 | ## Version 6.1 (February 9, 2022) 57 | 58 | - Updated the heuristic to fix the letter Ñ with more confidence. 59 | 60 | - Fixed type annotations and added py.typed. 61 | 62 | - ftfy is packaged using Poetry now, and wheels are created and uploaded to 63 | PyPI. 64 | 65 | ## Version 6.0.3 (May 14, 2021) 66 | 67 | - Allow the keyword argument `fix_entities` as a deprecated alias for 68 | `unescape_html`, raising a warning. 69 | 70 | - `ftfy.formatting` functions now disregard ANSI terminal escapes when 71 | calculating text width. 72 | 73 | 74 | ## Version 6.0.2 (May 4, 2021) 75 | 76 | This version is purely a cosmetic change, updating the maintainer's e-mail 77 | address and the project's canonical location on GitHub. 78 | 79 | 80 | ## Version 6.0.1 (April 12, 2021) 81 | 82 | - The `remove_terminal_escapes` step was accidentally not being used. This 83 | version restores it. 84 | 85 | - Specified in setup.py that ftfy 6 requires Python 3.6 or later. 86 | 87 | - Use a lighter link color when the docs are viewed in dark mode. 88 | 89 | ## Version 6.0 (April 2, 2021) 90 | 91 | - New function: `ftfy.fix_and_explain()` can describe all the transformations 92 | that happen when fixing a string. This is similar to what 93 | `ftfy.fixes.fix_encoding_and_explain()` did in previous versions, but it 94 | can fix more than the encoding. 95 | 96 | - `fix_and_explain()` and `fix_encoding_and_explain()` are now in the top-level 97 | ftfy module. 98 | 99 | - Changed the heuristic entirely. ftfy no longer needs to categorize every 100 | Unicode character, but only characters that are expected to appear in 101 | mojibake. 102 | 103 | - Because of the new heuristic, ftfy will no longer have to release a new 104 | version for every new version of Unicode. It should also run faster and 105 | use less RAM when imported. 106 | 107 | - The heuristic `ftfy.badness.is_bad(text)` can be used to determine whether 108 | there appears to be mojibake in a string. Some users were already using 109 | the old function `sequence_weirdness()` for that, but this one is actually 110 | designed for that purpose. 111 | 112 | - Instead of a pile of named keyword arguments, ftfy functions now take in 113 | a TextFixerConfig object. The keyword arguments still work, and become 114 | settings that override the defaults in TextFixerConfig. 115 | 116 | - Added support for UTF-8 mixups with Windows-1253 and Windows-1254. 117 | 118 | - Overhauled the documentation: https://ftfy.readthedocs.org 119 | 120 | ## Version 5.9 (February 10, 2021) 121 | 122 | This version is brought to you by the letter à and the number 0xC3. 123 | 124 | - Tweaked the heuristic to decode, for example, "à" as the letter "à" 125 | more often. 126 | 127 | - This combines with the non-breaking-space fixer to decode "à " as "à" as 128 | well. 
However, in many cases, the text " à " was intended to be " à ", 129 | preserving the space -- the underlying mojibake had two spaces after it, but 130 | the Web coalesced them into one. We detect this case based on common French 131 | and Portuguese words, and preserve the space when it appears intended. 132 | 133 | Thanks to @zehavoc for bringing to my attention how common this case is. 134 | 135 | - Updated the data file of Unicode character categories to Unicode 13, as 136 | used in Python 3.9. (No matter what version of Python you're on, ftfy uses 137 | the same data.) 138 | 139 | ## Version 5.8 (July 17, 2020) 140 | 141 | - Improved detection of UTF-8 mojibake of Greek, Cyrillic, Hebrew, and Arabic 142 | scripts. 143 | 144 | - Fixed the undeclared dependency on setuptools by removing the use of 145 | `pkg_resources`. 146 | 147 | ## Version 5.7 (February 18, 2020) 148 | 149 | - Updated the data file of Unicode character categories to Unicode 12.1, as 150 | used in Python 3.8. (No matter what version of Python you're on, ftfy uses 151 | the same data.) 152 | 153 | - Corrected an omission where short sequences involving the ACUTE ACCENT 154 | character were not being fixed. 155 | 156 | ## Version 5.6 (August 7, 2019) 157 | 158 | - The `unescape_html` function now supports all the HTML5 entities that appear 159 | in `html.entities.html5`, including those with long names such as 160 | `˝`. 161 | 162 | - Unescaping of numeric HTML entities now uses the standard library's 163 | `html.unescape`, making edge cases consistent. 164 | 165 | (The reason we don't run `html.unescape` on all text is that it's not always 166 | appropriate to apply, and can lead to false positive fixes. The text 167 | "This&NotThat" should not have "&Not" replaced by a symbol, as 168 | `html.unescape` would do.) 169 | 170 | - On top of Python's support for HTML5 entities, ftfy will also convert HTML 171 | escapes of common Latin capital letters that are (nonstandardly) written 172 | in all caps, such as `&NTILDE;` for `Ñ`. 173 | 174 | 175 | ## Version 5.5.1 (September 14, 2018) 176 | 177 | - Added Python 3.7 support. 178 | 179 | - Updated the data file of Unicode character categories to Unicode 11, as used 180 | in Python 3.7.0. (No matter what version of Python you're on, ftfy uses the 181 | same data.) 182 | 183 | 184 | ## Version 5.5 (September 6, 2018) 185 | 186 | - Recent versions have emphasized making a reasonable attempt to fix short, 187 | common mojibake sequences, such as `û`. In this version, we've expanded the 188 | heuristics to recognize these sequences in MacRoman as well as Windows-125x 189 | encodings. 190 | 191 | - A related rule for fixing isolated Windows-1252/UTF-8 mixups, even when they 192 | were inconsistent with the rest of the string, claimed to work on Latin-1/UTF-8 193 | mixups as well, but in practice it didn't. We've made the rule more robust. 194 | 195 | - Fixed a failure when testing the CLI on Windows. 196 | 197 | - Removed the `pytest-runner` invocation from setup.py, as it created complex 198 | dependencies that would stop setup.py from working in some environments. 199 | The `pytest` command still works fine. `pytest-runner` is just too clever. 200 | 201 | 202 | ## Version 5.4.1 (June 14, 2018) 203 | 204 | - Fixed a bug in the `setup.py` metadata. 205 | 206 | This bug was causing ftfy, a package that fixes encoding mismatches, to not 207 | install in some environments due to an encoding mismatch. (We were really 208 | putting the "meta" in "metadata" here.) 
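To make the `unescape_html` behavior described under version 5.6 above concrete, here is a minimal sketch (the function is `ftfy.fixes.unescape_html`; the example strings are illustrative, not taken from the changelog):

```python
from ftfy.fixes import unescape_html

# A standard HTML5 entity decodes as expected
print(unescape_html("ma&ntilde;ana"))    # mañana

# A nonstandard all-caps entity for a common Latin capital letter also decodes
print(unescape_html("ESPA&NTILDE;A"))    # ESPAÑA
```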
209 | 210 | 211 | ## Version 5.4 (June 1, 2018) 212 | 213 | - ftfy was still too conservative about fixing short mojibake sequences, 214 | such as "août" -> "août", when the broken version contained punctuation 215 | such as curly or angle quotation marks. 216 | 217 | The new heuristic observes in some cases that, even if quotation marks are 218 | expected to appear next to letters, it is strange to have an accented capital 219 | A before the quotation mark and more letters after the quotation mark. 220 | 221 | - Provides better metadata for the new PyPI. 222 | 223 | - Switched from nosetests to pytest. 224 | 225 | 226 | ## Version 5.3 (January 25, 2018) 227 | 228 | - A heuristic has been too conservative since version 4.2, causing a regression 229 | compared to previous versions: ftfy would fail to fix mojibake of common 230 | characters such as `á` when seen in isolation. A new heuristic now makes it 231 | possible to fix more of these common cases with less evidence. 232 | 233 | 234 | ## Version 5.2 (November 27, 2017) 235 | 236 | - The command-line tool will not accept the same filename as its input 237 | and output. (Previously, this would write a zero-length file.) 238 | 239 | - The `uncurl_quotes` fixer, which replaces curly quotes with straight quotes, 240 | now also replaces MODIFIER LETTER APOSTROPHE. 241 | 242 | - Codepoints that contain two Latin characters crammed together for legacy 243 | encoding reasons are replaced by those two separate characters, even in NFC 244 | mode. We formerly did this just with ligatures such as `fi` and `IJ`, but now 245 | this includes the Afrikaans digraph `ʼn` and Serbian/Croatian digraphs such as 246 | `dž`. 247 | 248 | 249 | ## Version 5.1.1 and 4.4.3 (May 15, 2017) 250 | 251 | These releases fix two unrelated problems with the tests, one in each version. 252 | 253 | - v5.1.1: fixed the CLI tests (which are new in v5) so that they pass 254 | on Windows, as long as the Python output encoding is UTF-8. 255 | 256 | - v4.4.3: added the `# coding: utf-8` declaration to two files that were 257 | missing it, so that tests can run on Python 2. 258 | 259 | ## Version 5.1 (April 7, 2017) 260 | 261 | - Removed the dependency on `html5lib` by dropping support for Python 3.2. 262 | 263 | We previously used the dictionary `html5lib.constants.entities` to decode 264 | HTML entities. In Python 3.3 and later, that exact dictionary is now in the 265 | standard library as `html.entities.html5`. 266 | 267 | - Moved many test cases about how particular text should be fixed into 268 | `test_cases.json`, which may ease porting to other languages. 269 | 270 | The functionality of this version remains the same as 5.0.2 and 4.4.2. 271 | 272 | 273 | ## Version 5.0.2 and 4.4.2 (March 21, 2017) 274 | 275 | Added a `MANIFEST.in` that puts files such as the license file and this 276 | changelog inside the source distribution. 277 | 278 | 279 | ## Version 5.0.1 and 4.4.1 (March 10, 2017) 280 | 281 | Bug fix: 282 | 283 | - The `unescape_html` fixer will decode entities between `€` and `Ÿ` 284 | as what they would be in Windows-1252, even without the help of 285 | `fix_encoding`. 286 | 287 | This better matches what Web browsers do, and fixes a regression that version 288 | 4.4 introduced in an example that uses `…` as an ellipsis. 289 | 290 | 291 | ## Version 5.0 (February 17, 2017) 292 | 293 | Breaking changes: 294 | 295 | - Dropped support for Python 2. If you need Python 2 support, you should get 296 | version 4.4, which has the same features as this version. 
297 | 298 | - The top-level functions require their arguments to be given as keyword 299 | arguments. 300 | 301 | Version 5.0 also now has tests for the command-line invocation of ftfy. 302 | 303 | 304 | ## Version 4.4.0 (February 17, 2017) 305 | 306 | Heuristic changes: 307 | 308 | - ftfy can now fix mojibake involving the Windows-1250 or ISO-8859-2 encodings. 309 | 310 | - The `fix_entities` fixer is now applied after `fix_encoding`. This makes 311 | more situations resolvable when both fixes are needed. 312 | 313 | - With a few exceptions for commonly-used characters such as `^`, it is now 314 | considered "weird" whenever a diacritic appears in non-combining form, 315 | such as the diaeresis character `¨`. 316 | 317 | - It is also now weird when IPA phonetic letters, besides `ə`, appear next to 318 | capital letters. 319 | 320 | - These changes to the heuristics, and others we've made in recent versions, 321 | let us lower the "cost" for fixing mojibake in some encodings, causing them 322 | to be fixed in more cases. 323 | 324 | 325 | ## Version 4.3.1 (January 12, 2017) 326 | 327 | Bug fix: 328 | 329 | - `remove_control_chars` was removing U+0D ('\r') prematurely. That's the 330 | job of `fix_line_breaks`. 331 | 332 | 333 | ## Version 4.3.0 (December 29, 2016) 334 | 335 | ftfy has gotten by for four years without dependencies on other Python 336 | libraries, but now we can spare ourselves some code and some maintenance burden 337 | by delegating certain tasks to other libraries that already solve them well. 338 | This version now depends on the `html5lib` and `wcwidth` libraries. 339 | 340 | Feature changes: 341 | 342 | - The `remove_control_chars` fixer will now remove some non-ASCII control 343 | characters as well, such as deprecated Arabic control characters and 344 | byte-order marks. Bidirectional controls are still left as is. 345 | 346 | This should have no impact on well-formed text, while cleaning up many 347 | characters that the Unicode Consortium deems "not suitable for markup" 348 | (see Unicode Technical Report #20). 349 | 350 | - The `unescape_html` fixer uses a more thorough list of HTML entities, 351 | which it imports from `html5lib`. 352 | 353 | - `ftfy.formatting` now uses `wcwidth` to compute the width that a string 354 | will occupy in a text console. 355 | 356 | Heuristic changes: 357 | 358 | - Updated the data file of Unicode character categories to Unicode 9, as used 359 | in Python 3.6.0. (No matter what version of Python you're on, ftfy uses the 360 | same data.) 361 | 362 | Pending deprecations: 363 | 364 | - The `remove_bom` option will become deprecated in 5.0, because it has been 365 | superseded by `remove_control_chars`. 366 | 367 | - ftfy 5.0 will remove the previously deprecated name `fix_text_encoding`. It 368 | was renamed to `fix_encoding` in 4.0. 369 | 370 | - ftfy 5.0 will require Python 3.2 or later, as planned. Python 2 users, please 371 | specify `ftfy < 5` in your dependencies if you haven't already. 372 | 373 | 374 | ## Version 4.2.0 (September 28, 2016) 375 | 376 | Heuristic changes: 377 | 378 | - Math symbols next to currency symbols are no longer considered 'weird' by the 379 | heuristic. This fixes a false positive where text that involved the 380 | multiplication sign and British pounds or euros (as in '5×£35') could turn 381 | into Hebrew letters. 
382 | 383 | - A heuristic that used to be a bonus for certain punctuation now also gives a 384 | bonus to successfully decoding other common codepoints, such as the 385 | non-breaking space, the degree sign, and the byte order mark. 386 | 387 | - In version 4.0, we tried to "future-proof" the categorization of emoji (as a 388 | kind of symbol) to include codepoints that would likely be assigned to emoji 389 | later. The future happened, and there are even more emoji than we expected. 390 | We have expanded the range to include those emoji, too. 391 | 392 | ftfy is still mostly based on information from Unicode 8 (as Python 3.5 is), 393 | but this expanded range should include the emoji from Unicode 9 and 10. 394 | 395 | - Emoji are increasingly being modified by variation selectors and skin-tone 396 | modifiers. Those codepoints are now grouped with 'symbols' in ftfy, so they 397 | fit right in with emoji, instead of being considered 'marks' as their Unicode 398 | category would suggest. 399 | 400 | This enables fixing mojibake that involves iOS's new diverse emoji. 401 | 402 | - An old heuristic that wasn't necessary anymore considered Latin text with 403 | high-numbered codepoints to be 'weird', but this is normal in languages such 404 | as Vietnamese and Azerbaijani. This does not seem to have caused any false 405 | positives, but it caused ftfy to be too reluctant to fix some cases of broken 406 | text in those languages. 407 | 408 | The heuristic has been changed, and all languages that use Latin letters 409 | should be on even footing now. 410 | 411 | 412 | ## Version 4.1.1 (April 13, 2016) 413 | 414 | - Bug fix: in the command-line interface, the `-e` option had no effect on 415 | Python 3 when using standard input. Now, it correctly lets you specify 416 | a different encoding for standard input. 417 | 418 | 419 | ## Version 4.1.0 (February 25, 2016) 420 | 421 | Heuristic changes: 422 | 423 | - ftfy can now deal with "lossy" mojibake. If your text has been run through 424 | a strict Windows-1252 decoder, such as the one in Python, it may contain 425 | the replacement character � (U+FFFD) where there were bytes that are 426 | unassigned in Windows-1252. 427 | 428 | Although ftfy won't recover the lost information, it can now detect this 429 | situation, replace the entire lossy character with �, and decode the rest of 430 | the characters. Previous versions would be unable to fix any string that 431 | contained U+FFFD. 432 | 433 | As an example, text in curly quotes that gets corrupted `“ like this â€�` 434 | now gets fixed to be `“ like this �`. 435 | 436 | - Updated the data file of Unicode character categories to Unicode 8.0, as used 437 | in Python 3.5.0. (No matter what version of Python you're on, ftfy uses the 438 | same data.) 439 | 440 | - Heuristics now count characters such as `~` and `^` as punctuation instead 441 | of wacky math symbols, improving the detection of mojibake in some edge cases. 442 | 443 | New features: 444 | 445 | - A new module, `ftfy.formatting`, can be used to justify Unicode text in a 446 | monospaced terminal. It takes into account that each character can take up 447 | anywhere from 0 to 2 character cells. 448 | 449 | - Internally, the `utf-8-variants` codec was simplified and optimized. 450 | 451 | 452 | ## Version 4.0.0 (April 10, 2015) 453 | 454 | Breaking changes: 455 | 456 | - The default normalization form is now NFC, not NFKC. 
NFKC replaces a large 457 | number of characters with 'equivalent' characters, and some of these 458 | replacements are useful, but some are not desirable to do by default. 459 | 460 | - The `fix_text` function has some new options that perform more targeted 461 | operations that are part of NFKC normalization, such as 462 | `fix_character_width`, without requiring hitting all your text with the huge 463 | mallet that is NFKC. 464 | 465 | - If you were already using NFC normalization, or in general if you want to 466 | preserve the *spacing* of CJK text, you should be sure to set 467 | `fix_character_width=False`. 468 | 469 | - The `remove_unsafe_private_use` parameter has been removed entirely, after 470 | two versions of deprecation. The function name `fix_bad_encoding` is also 471 | gone. 472 | 473 | New features: 474 | 475 | - Fixers for strange new forms of mojibake, including particularly clear cases 476 | of mixed UTF-8 and Windows-1252. 477 | 478 | - New heuristics, so that ftfy can fix more stuff, while maintaining 479 | approximately zero false positives. 480 | 481 | - The command-line tool trusts you to know what encoding your *input* is in, 482 | and assumes UTF-8 by default. You can still tell it to guess with the `-g` 483 | option. 484 | 485 | - The command-line tool can be configured with options, and can be used as a 486 | pipe. 487 | 488 | - Recognizes characters that are new in Unicode 7.0, as well as emoji from 489 | Unicode 8.0+ that may already be in use on iOS. 490 | 491 | Deprecations: 492 | 493 | - `fix_text_encoding` is being renamed again, for conciseness and consistency. 494 | It's now simply called `fix_encoding`. The name `fix_text_encoding` is 495 | available but emits a warning. 496 | 497 | Pending deprecations: 498 | 499 | - Python 2.6 support is largely coincidental. 500 | 501 | - Python 2.7 support is on notice. If you use Python 2, be sure to pin a 502 | version of ftfy less than 5.0 in your requirements. 503 | 504 | 505 | ## Version 3.4.0 (January 15, 2015) 506 | 507 | New features: 508 | 509 | - `ftfy.fixes.fix_surrogates` will fix all 16-bit surrogate codepoints, 510 | which would otherwise break various encoding and output functions. 511 | 512 | Deprecations: 513 | 514 | - `remove_unsafe_private_use` emits a warning, and will disappear in the 515 | next minor or major version. 516 | 517 | 518 | ## Version 3.3.1 (December 12, 2014) 519 | 520 | This version restores compatibility with Python 2.6. 521 | 522 | 523 | ## Version 3.3.0 (August 16, 2014) 524 | 525 | Heuristic changes: 526 | 527 | - Certain symbols are marked as "ending punctuation" that may naturally occur 528 | after letters. When they follow an accented capital letter and look like 529 | mojibake, they will not be "fixed" without further evidence. 530 | An example is that "MARQUÉ…" will become "MARQUÉ...", and not "MARQUɅ". 531 | 532 | New features: 533 | 534 | - `ftfy.explain_unicode` is a diagnostic function that shows you what's going 535 | on in a Unicode string. It shows you a table with each code point in 536 | hexadecimal, its glyph, its name, and its Unicode category. 537 | 538 | - `ftfy.fixes.decode_escapes` adds a feature missing from the standard library: 539 | it lets you decode a Unicode string with backslashed escape sequences in it 540 | (such as "\u2014") the same way that Python itself would. 541 | 542 | - `ftfy.streamtester` is a release of the code that I use to test ftfy on 543 | an endless stream of real-world data from Twitter. 
With the new heuristics, 544 | the false positive rate of ftfy is about 1 per 6 million tweets. (See 545 | the "Accuracy" section of the documentation.) 546 | 547 | Deprecations: 548 | 549 | - Python 2.6 is no longer supported. 550 | 551 | - `remove_unsafe_private_use` is no longer needed in any current version of 552 | Python. This fixer will disappear in a later version of ftfy. 553 | 554 | 555 | ## Version 3.2.0 (June 27, 2014) 556 | 557 | - `fix_line_breaks` fixes three additional characters that are considered line 558 | breaks in some environments, such as Javascript, and Python's "codecs" 559 | library. These are all now replaced with \n: 560 | 561 | U+0085 <control>, with alias "NEXT LINE" 562 | U+2028 LINE SEPARATOR 563 | U+2029 PARAGRAPH SEPARATOR 564 | 565 | 566 | ## Version 3.1.3 (May 15, 2014) 567 | 568 | - Fix `utf-8-variants` so it never outputs surrogate codepoints, even on 569 | Python 2 where that would otherwise be possible. 570 | 571 | 572 | ## Version 3.1.2 (January 29, 2014) 573 | 574 | - Fix bug in 3.1.1 where strings with backslashes in them could never be fixed 575 | 576 | 577 | ## Version 3.1.1 (January 29, 2014) 578 | 579 | - Add the `ftfy.bad_codecs` package, which registers new codecs that can 580 | decode things that Python may otherwise refuse to decode (a usage sketch appears below): 581 | 582 | - `utf-8-variants`, which decodes CESU-8 and its Java lookalike 583 | 584 | - `sloppy-windows-*`, which decodes character-map encodings while treating 585 | unmapped characters as Latin-1 586 | 587 | - Simplify the code using `ftfy.bad_codecs`. 588 | 589 | 590 | ## Version 3.0.6 (November 5, 2013) 591 | 592 | - `fix_entities` can now be True, False, or 'auto'. The new case is True, which 593 | will decode all entities, even in text that already contains angle brackets. 594 | This may also be faster, because it doesn't have to check. 595 | - `build_data.py` will refuse to run on Python < 3.3, to prevent building 596 | an inconsistent data file. 597 | 598 | 599 | ## Version 3.0.5 (November 1, 2013) 600 | 601 | - Fix the arguments to `fix_file`, because they were totally wrong. 602 | 603 | 604 | ## Version 3.0.4 (October 1, 2013) 605 | 606 | - Restore compatibility with Python 2.6. 607 | 608 | 609 | ## Version 3.0.3 (September 9, 2013) 610 | 611 | - Fixed an ugly regular expression bug that prevented ftfy from importing on a 612 | narrow build of Python. 613 | 614 | 615 | ## Version 3.0.2 (September 4, 2013) 616 | 617 | - Fixed some false positives. 618 | 619 | - Basically, 3.0.1 was too eager to treat text as MacRoman or cp437 when 620 | three consecutive characters coincidentally decoded as UTF-8. Increased the 621 | cost of those encodings so that they have to successfully decode multiple 622 | UTF-8 characters. 623 | 624 | - See `tests/test_real_tweets.py` for the new test cases that were added as a 625 | result.
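The codecs registered by `ftfy.bad_codecs` (introduced in version 3.1.1 above) plug into Python's normal codec machinery once the package has been imported. A minimal sketch, with illustrative byte strings rather than examples from the changelog:

```python
import ftfy.bad_codecs  # importing this module registers the extra codecs

# CESU-8-style bytes: U+1F41F written as an encoded surrogate pair, which the
# strict built-in UTF-8 codec would reject
print(b"\xed\xa0\xbd\xed\xb0\x9f".decode("utf-8-variants"))

# sloppy-windows-1252 decodes every byte; bytes that standard Windows-1252
# leaves unmapped (such as 0x81) are treated as Latin-1 instead of raising
print(repr(b"caf\xe9 \x81".decode("sloppy-windows-1252")))
```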
626 | 627 | 628 | ## Version 3.0.1 (August 30, 2013) 629 | 630 | - Fix bug in `fix_java_encoding` that led to only the first instance of 631 | CESU-8 badness per line being fixed 632 | - Add a fixer that removes unassigned characters that can break Python 3.3 633 | (http://bugs.python.org/issue18183) 634 | 635 | 636 | ## Version 3.0 (August 26, 2013) 637 | 638 | - Generally runs faster 639 | - Idempotent 640 | - Simplified decoding logic 641 | - Understands more encodings and more kinds of mistakes 642 | - Takes options that enable or disable particular normalization steps 643 | - Long line handling: now the time-consuming step (`fix_text_encoding`) will be 644 | consistently skipped on long lines, but all other fixes will apply 645 | - Tested on millions of examples from Twitter, ensuring a near-zero rate of 646 | false positives 647 | 648 | 649 | ## Version 2.0.2 (June 20, 2013) 650 | 651 | - Fix breaking up of long lines, so it can't go into an infinite loop 652 | 653 | 654 | ## Version 2.0.1 (March 19, 2013) 655 | 656 | - Restored Python 2.6 support 657 | 658 | 659 | ## Version 2.0 (January 30, 2013) 660 | 661 | - Python 3 support 662 | - Use fast Python built-ins to speed up fixes 663 | - Bugfixes 664 | 665 | 666 | ## Version 1.0 (August 24, 2012) 667 | 668 | - Made into its own package with no dependencies, instead of a part of 669 | `metanl` 670 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2023 Robyn Speer 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft tests 2 | 3 | include *.md 4 | include *.txt 5 | 6 | global-exclude __pycache__ *.py[cod] 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ftfy: fixes text for you 2 | 3 | [![PyPI package](https://badge.fury.io/py/ftfy.svg)](https://badge.fury.io/py/ftfy) 4 | [![Docs](https://readthedocs.org/projects/ftfy/badge/?version=latest)](https://ftfy.readthedocs.org/en/latest/) 5 | 6 | ```python 7 | 8 | >>> from ftfy import fix_encoding 9 | >>> print(fix_encoding("(à¸‡'âŒ£')à¸‡")) 10 | (ง'⌣')ง 11 | 12 | ``` 13 | 14 | The full documentation of ftfy is available at [ftfy.readthedocs.org](https://ftfy.readthedocs.org).
The documentation covers a lot more than this README, so here are some links into it: 15 | 16 | - [Fixing problems and getting explanations](https://ftfy.readthedocs.io/en/latest/explain.html) 17 | - [Configuring ftfy](https://ftfy.readthedocs.io/en/latest/config.html) 18 | - [Encodings ftfy can handle](https://ftfy.readthedocs.io/en/latest/encodings.html) 19 | - [“Fixer” functions](https://ftfy.readthedocs.io/en/latest/fixes.html) 20 | - [Is ftfy an encoding detector?](https://ftfy.readthedocs.io/en/latest/detect.html) 21 | - [Heuristics for detecting mojibake](https://ftfy.readthedocs.io/en/latest/heuristic.html) 22 | - [Support for “bad” encodings](https://ftfy.readthedocs.io/en/latest/bad_encodings.html) 23 | - [Command-line usage](https://ftfy.readthedocs.io/en/latest/cli.html) 24 | - [Citing ftfy](https://ftfy.readthedocs.io/en/latest/cite.html) 25 | 26 | ## Testimonials 27 | 28 | - “My life is livable again!” 29 | — [@planarrowspace](https://twitter.com/planarrowspace) 30 | - “A handy piece of magic” 31 | — [@simonw](https://twitter.com/simonw) 32 | - “Saved me a large amount of frustrating dev work” 33 | — [@iancal](https://twitter.com/iancal) 34 | - “ftfy did the right thing right away, with no faffing about. Excellent work, solving a very tricky real-world (whole-world!) problem.” 35 | — Brennan Young 36 | - “I have no idea when I’m gonna need this, but I’m definitely bookmarking it.” 37 | — [/u/ocrow](https://reddit.com/u/ocrow) 38 | 39 | ## What it does 40 | 41 | Here are some examples (found in the real world) of what ftfy can do: 42 | 43 | ftfy can fix mojibake (encoding mix-ups), by detecting patterns of characters that were clearly meant to be UTF-8 but were decoded as something else: 44 | 45 | >>> import ftfy 46 | >>> ftfy.fix_text('✔ No problems') 47 | '✔ No problems' 48 | 49 | Does this sound impossible? It's really not. UTF-8 is a well-designed encoding that makes it obvious when it's being misused, and a string of mojibake usually contains all the information we need to recover the original string. 50 | 51 | ftfy can fix multiple layers of mojibake simultaneously: 52 | 53 | >>> ftfy.fix_text('The Mona Lisa doesn’t have eyebrows.') 54 | "The Mona Lisa doesn't have eyebrows." 55 | 56 | It can fix mojibake that has had "curly quotes" applied on top of it, which cannot be consistently decoded until the quotes are uncurled: 57 | 58 | >>> ftfy.fix_text("l’humanité") 59 | "l'humanité" 60 | 61 | ftfy can fix mojibake that would have included the character U+A0 (non-breaking space), but the U+A0 was turned into an ASCII space and then combined with another following space: 62 | 63 | >>> ftfy.fix_text('Ã\xa0 perturber la réflexion') 64 | 'à perturber la réflexion' 65 | >>> ftfy.fix_text('Ã perturber la réflexion') 66 | 'à perturber la réflexion' 67 | 68 | ftfy can also decode HTML entities that appear outside of HTML, even in cases where the entity has been incorrectly capitalized: 69 | 70 | >>> # by the HTML 5 standard, only 'P&Eacute;REZ' is acceptable 71 | >>> ftfy.fix_text('P&EACUTE;REZ') 72 | 'PÉREZ' 73 | 74 | These fixes are not applied in all cases, because ftfy has a strongly-held goal of avoiding false positives -- it should never change correctly-decoded text to something else. 75 | 76 | The following text could be encoded in Windows-1252 and decoded in UTF-8, and it would decode as 'MARQUɅ'. However, the original text is already sensible, so it is unchanged.
77 | 78 | >>> ftfy.fix_text('IL Y MARQUÉ…') 79 | 'IL Y MARQUÉ…' 80 | 81 | ## Installing 82 | 83 | ftfy is a Python 3 package that can be installed using `pip` or `uv pip`: 84 | 85 | pip install ftfy 86 | 87 | (Or use `pip3 install ftfy` on systems where Python 2 and 3 are both globally installed and `pip` refers to Python 2.) 88 | 89 | If you use `poetry`, you can use ftfy as a dependency in the usual way (such as `poetry add ftfy`). 90 | 91 | ### Local development 92 | 93 | ftfy is developed using [uv](https://github.com/astral-sh/uv). You can build a virtual environment with its local dependencies by running `uv venv`, and test it with `uv run pytest`. 94 | 95 | ## Who maintains ftfy? 96 | 97 | I'm Robyn Speer, also known as Elia Robyn Lake. You can find my projects 98 | [on GitHub](https://github.com/rspeer) and my posts on [my own blog](https://posts.arborelia.net). 99 | 100 | ## Citing ftfy 101 | 102 | ftfy has been used as a crucial data processing step in major NLP research. 103 | 104 | It's important to give credit appropriately to everyone whose work you build on in research. This includes software, not just high-status contributions such as mathematical models. All I ask when you use ftfy for research is that you cite it. 105 | 106 | ftfy has a citable record [on Zenodo](https://zenodo.org/record/2591652). A citation of ftfy may look like this: 107 | 108 | Robyn Speer. (2019). ftfy (Version 5.5). Zenodo. 109 | http://doi.org/10.5281/zenodo.2591652 110 | 111 | In BibTeX format, the citation is:: 112 | 113 | @misc{speer-2019-ftfy, 114 | author = {Robyn Speer}, 115 | title = {ftfy}, 116 | note = {Version 5.5}, 117 | year = 2019, 118 | howpublished = {Zenodo}, 119 | doi = {10.5281/zenodo.2591652}, 120 | url = {https://doi.org/10.5281/zenodo.2591652} 121 | } 122 | 123 | ## Important license clarifications 124 | 125 | If you do not follow ftfy's license, you do not have a license to ftfy. 126 | 127 | This sounds obvious and tautological, but there are people who think open source licenses mean that they can just do what they want, especially in the field of generative AI. It's a permissive license but you still have to follow it. The [Apache license](https://www.apache.org/licenses/LICENSE-2.0) is the only thing that gives you permission to use and copy ftfy; otherwise, all rights are reserved. 128 | 129 | If you use or distribute ftfy, you must follow the terms of the [Apache license](https://www.apache.org/licenses/LICENSE-2.0), including that you must attribute the author of ftfy (Robyn Speer) correctly. 130 | 131 | You _may not_ make a derived work of ftfy that obscures its authorship, such as by putting its code in an AI training dataset, including the code in AI training at runtime, or using a generative AI that copies code from such a dataset. 132 | 133 | At my discretion, I may notify you of a license violation, and give you a chance to either remedy it or delete all copies of ftfy in your possession. 134 | 135 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ftfy.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ftfy.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/ftfy" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ftfy" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. 
The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | @import url('https://fonts.googleapis.com/css2?family=Inconsolata:wght@400;700&family=Source+Sans+Pro:ital,wght@0,400;0,700;1,400;1,700&display=swap'); 2 | -------------------------------------------------------------------------------- /docs/avoid.rst: -------------------------------------------------------------------------------- 1 | How can I avoid producing mojibake? 2 | =================================== 3 | 4 | Read the Python Unicode HOWTO 5 | ----------------------------- 6 | 7 | The `Python Unicode HOWTO`_ is a useful introduction to how to use Unicode correctly in Python. If you find yourself confused about the difference between bytes and characters, or you need to unlearn bad habits from Python 2, it's a great place to start. 8 | 9 | .. _`Python Unicode HOWTO`: https://docs.python.org/3/howto/unicode.html 10 | 11 | Assume UTF-8 12 | ------------ 13 | 14 | **Assume text is in UTF-8** unless you have a specific reason to believe it isn't. 15 | 16 | In the 2020s, `UTF-8 is everywhere`_, especially in text meant to be transferred over the Internet. Most mojibake comes from decoding correct UTF-8 as if it were some other encoding. 17 | 18 | .. _`UTF-8 is everywhere`: http://utf8everywhere.org/ 19 | 20 | In Python 3, you should use the Unicode string type (`str`) for all operations. You should open text files in UTF-8:: 21 | 22 | openfile = open(filename, encoding='utf-8', errors='replace') 23 | 24 | When you are specifically working with bytes and you need to turn them into text, you should decode them as UTF-8:: 25 | 26 | text = bytebuffer.decode('utf-8', 'replace') 27 | 28 | The exceptions, the cases where you're not using UTF-8, are few but relevant. If you're interacting with C APIs, you'll need to represent your text as bytes in the format the API expects. Windows APIs in particular expect UTF-16. 29 | 30 | We're mostly past the dark days when encodings were "character maps" of 256 possible characters, one byte per character. An unfortunate thing that keeps them alive is Microsoft Excel, whose "Export" feature will pick a 256-character encoding *based on your computer's operating system and default language*. So: 31 | 32 | Don't export CSVs from Excel 33 | ---------------------------- 34 | 35 | I know that I'm telling you not to do something that may seem like a requirement of doing your job. But **don't export CSV files from Excel** if you have any other choice. Though Excel CSVs look right on basic ASCII characters, on any other text, it either won't work or won't do what you want. Excel CSVs aren't even interoperable between different computers. 36 | 37 | My recommendation is to use Google Sheets to create CSVs, and keep Excel files in .xlsx format so the Unicode won't be mangled. 38 | 39 | If you must export a CSV-like file from Excel, you can find an option to tell Excel to export in "Unicode Text", and it will create a tab-separated UTF-16 file. This is not a very widely-used format, but at least it's not mojibake. 40 | 41 | You can follow these `unwieldy directions from a SalesForce help article`_ to use Excel and Notepad to create a UTF-8 CSV. 
You can see why I don't recommend this process. 42 | 43 | .. _`unwieldy directions from a SalesForce help article`: https://help.salesforce.com/articleView?id=000324657&type=1&mode=1 44 | 45 | Don't use chardet 46 | ----------------- 47 | 48 | Encoding detection on raw bytes is not a good idea. It was important in the '90s, during the rough transition to Unicode -- and the most popular way of doing it, ``chardet``, hasn't changed since the '90s. 49 | 50 | A heuristic designed before there was multilingual social media, before there were emoji, is not going to work correctly in the 2020s. 51 | 52 | When chardet sees the *correct* UTF-8 encoding of an emoji, it will have no idea what it's looking at, because it won't match anything in its training data. Often, it will guess that it's Turkish encoded in Windows-1254. On other reasonable text, it will guess the "iso-8859-2" encoding, an encoding that you'd very rarely see used intentionally. Because the problem is inherent to the design of chardet, it's not easily fixed. 53 | 54 | chardet was built on the assumption that "encoding detection is language detection", which is no longer true. Web sites now contain text in multiple languages, and for the most part they use UTF-8 regardless of the language. 55 | 56 | I've strengthened my recommendation from "don't trust chardet's output" to "don't use chardet", because there's no realistic way to use chardet without trusting its output. We've reached a situation where major Python packages such as ``requests`` assume that chardet is correct, and yet the changing nature of text means that chardet is more wrong with each passing year. 57 | 58 | So how should you interpret raw bytes of text if you're not told what encoding they're in? As UTF-8. Text is UTF-8 until proven otherwise. 59 | 60 | ASCII isn't extended 61 | -------------------- 62 | 63 | A sign that something is about to go wrong with encodings is if a developer is talking about "extended ASCII". 64 | 65 | ASCII is a set of 128 character codes (95 of them displayable). It has not had any new characters added to it since the backslash was added in 1967. 66 | 67 | Because ASCII is a 7-bit encoding but our computers use 8-bit bytes, it seems clear that ASCII *could* be extended to assign a meaning to all 256 possible bytes. There are many different encodings that have done so, and they're all incompatible with one another, which is why treating bytes as characters is a bad idea and why we have Unicode now. 68 | 69 | Many developers refer to one of these encodings as "extended ASCII", whose colloquial meaning is "the encoding of 256 characters that I learned first". Its meaning is completely dependent on the country you were in and the operating system you were using when you started programming: 70 | 71 | - My "extended ASCII" when I learned to program was IBM codepage 437, the one that was used in US versions of MS-DOS. 72 | - To many people, "extended ASCII" is Windows codepage 1252, which they'd find in the Character Map of their Windows 9x computer, at least if they were in North America or Western Europe. 73 | - To others in other countries, it could be a different Windows codepage, such as 1251 (which contains Cyrillic letters) or 1250 (which contains a different set of accented letters for Eastern European languages). 74 | - Or it might be Latin-1, the common name for the ISO-8859-1 standard that became the first 256 characters of Unicode.
Latin-1 is easy to implement by accident, such as when you see byte C2 and assume it means Unicode codepoint U+00C2 -- what you get by incorrectly running `chr()` on each byte. 75 | 76 | "Extended ASCII" doesn't specify which encoding you mean, and often indicates that you don't realize that different people are thinking of different sets of 256 characters. 77 | 78 | Instead of "extended ASCII", say the name of the encoding such as "Latin-1", "Windows-1252", "Windows-1250", "codepage 437", or maybe "I don't know what it is but it looks right on my machine". 79 | 80 | And then revise things so that you use UTF-8, which is still a superset of ASCII but can represent every Unicode character. 81 | -------------------------------------------------------------------------------- /docs/bad_encodings.rst: -------------------------------------------------------------------------------- 1 | Support for "bad" encodings 2 | =========================== 3 | 4 | .. automodule:: ftfy.bad_codecs 5 | 6 | "Sloppy" encodings 7 | ------------------ 8 | .. automodule:: ftfy.bad_codecs.sloppy 9 | 10 | Variants of UTF-8 11 | ----------------- 12 | .. automodule:: ftfy.bad_codecs.utf8_variants 13 | 14 | -------------------------------------------------------------------------------- /docs/cite.rst: -------------------------------------------------------------------------------- 1 | .. _cite: 2 | 3 | Citing ftfy 4 | =========== 5 | ftfy has been used as a data processing step in major NLP research, including OpenAI's original GPT. 6 | 7 | It's important to give credit appropriately to everyone whose work you build on in research. This includes software, not just high-status contributions such as mathematical models. All I ask when you use ftfy for research is that you cite it. 8 | 9 | ftfy has a citable record on `Zenodo`_. A citation of ftfy may look like this: 10 | 11 | Robyn Speer. (2019). ftfy (Version 5.5). Zenodo. 12 | http://doi.org/10.5281/zenodo.2591652 13 | 14 | In BibTeX format, the citation is:: 15 | 16 | @misc{speer-2019-ftfy, 17 | author = {Robyn Speer}, 18 | title = {ftfy}, 19 | note = {Version 5.5}, 20 | year = 2019, 21 | howpublished = {Zenodo}, 22 | doi = {10.5281/zenodo.2591652}, 23 | url = {https://doi.org/10.5281/zenodo.2591652} 24 | } 25 | 26 | .. _Zenodo: https://zenodo.org/record/2591652 27 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Command-line usage 2 | ================== 3 | ftfy can be used from the command line. By default, it takes UTF-8 input and 4 | writes it to UTF-8 output, fixing problems in its Unicode as it goes. 5 | 6 | Here's the usage documentation for the `ftfy` command: 7 | 8 | .. code-block:: text 9 | 10 | usage: ftfy [-h] [-o OUTPUT] [-g] [-e ENCODING] [-n NORMALIZATION] 11 | [--preserve-entities] 12 | [filename] 13 | 14 | ftfy (fixes text for you), version 6.0 15 | 16 | positional arguments: 17 | filename The file whose Unicode is to be fixed. Defaults to -, 18 | meaning standard input. 19 | 20 | optional arguments: 21 | -h, --help show this help message and exit 22 | -o OUTPUT, --output OUTPUT 23 | The file to output to. Defaults to -, meaning standard 24 | output. 25 | -g, --guess Ask ftfy to guess the encoding of your input. This is 26 | risky. Overrides -e. 27 | -e ENCODING, --encoding ENCODING 28 | The encoding of the input. Defaults to UTF-8. 
29 | -n NORMALIZATION, --normalization NORMALIZATION 30 | The normalization of Unicode to apply. Defaults to 31 | NFC. Can be "none". 32 | --preserve-entities Leave HTML entities as they are. The default is to 33 | decode them, as long as no HTML tags have appeared in 34 | the file. 35 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # 2 | # ftfy documentation build configuration file, created by 3 | # sphinx-quickstart on Wed Aug 28 03:18:27 2013. 4 | # 5 | # This file is execfile()d with the current directory set to its containing dir. 6 | # 7 | # Note that not all possible configuration values are present in this 8 | # autogenerated file. 9 | # 10 | # All configuration values have a default; values that are commented out 11 | # serve to show the default. 12 | 13 | 14 | # If extensions (or modules to document with autodoc) are in another directory, 15 | # add these directories to sys.path here. If the directory is relative to the 16 | # documentation root, use os.path.abspath to make it absolute, like shown here. 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | # -- General configuration ----------------------------------------------------- 20 | 21 | # If your documentation needs a minimal Sphinx version, state it here. 22 | # needs_sphinx = '1.0' 23 | 24 | # Add any Sphinx extension module names here, as strings. They can be extensions 25 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 26 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"] 27 | 28 | # Add any paths that contain templates here, relative to this directory. 29 | templates_path = ["_templates"] 30 | 31 | # The suffix of source filenames. 32 | source_suffix = ".rst" 33 | 34 | # The encoding of source files. 35 | # source_encoding = 'utf-8-sig' 36 | 37 | # The master toctree document. 38 | master_doc = "index" 39 | 40 | # General information about the project. 41 | project = "ftfy" 42 | copyright = "2024, Robyn Speer" 43 | 44 | # The version info for the project you're documenting, acts as replacement for 45 | # |version| and |release|, also used in various other places throughout the 46 | # built documents. 47 | # 48 | # The short X.Y version. 49 | version = "6.3" 50 | # The full version, including alpha/beta/rc tags. 51 | release = "6.3.1" 52 | 53 | # The language for content autogenerated by Sphinx. Refer to documentation 54 | # for a list of supported languages. 55 | # language = None 56 | 57 | # There are two options for replacing |today|: either, you set today to some 58 | # non-false value, then it is used: 59 | # today = '' 60 | # Else, today_fmt is used as the format for a strftime call. 61 | # today_fmt = '%B %d, %Y' 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | exclude_patterns = ["_build"] 66 | 67 | # The reST default role (used for this markup: `text`) to use for all documents. 68 | default_role = "code" 69 | 70 | # If true, '()' will be appended to :func: etc. cross-reference text. 71 | # add_function_parentheses = True 72 | 73 | # If true, the current module name will be prepended to all description 74 | # unit titles (such as .. function::). 75 | # add_module_names = True 76 | 77 | # If true, sectionauthor and moduleauthor directives will be shown in the 78 | # output. They are ignored by default. 
79 | # show_authors = False 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = "default" 83 | pygments_dark_style = "monokai" 84 | 85 | # A list of ignored prefixes for module index sorting. 86 | # modindex_common_prefix = [] 87 | 88 | # If true, keep warnings as "system message" paragraphs in the built documents. 89 | # keep_warnings = False 90 | 91 | 92 | # -- Options for HTML output --------------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | html_theme = "furo" 97 | 98 | # Theme options are theme-specific and customize the look and feel of a theme 99 | # further. For a list of options available for each theme, see the 100 | # documentation. 101 | html_theme_options = { 102 | "light_css_variables": { 103 | "color-brand-primary": "#7C4DFF", 104 | "color-brand-content": "#7C4DFF", 105 | "font-stack": "Source Sans Pro, sans-serif", 106 | "font-stack--monospace": "Inconsolata", 107 | "code-font-size": "18px", 108 | # I don't know why furo wants inline code to be so small, but don't let it 109 | "font-size--small--2": "100%", 110 | }, 111 | "dark_css_variables": { 112 | "color-brand-primary": "#AC8DFF", 113 | "color-brand-content": "#AC8DFF", 114 | "font-stack": "Source Sans Pro, sans-serif", 115 | "font-stack--monospace": "Inconsolata", 116 | "code-font-size": "18px", 117 | "font-size--small--2": "100%", 118 | }, 119 | } 120 | html_css_files = [ 121 | "css/custom.css", 122 | ] 123 | 124 | # Add any paths that contain custom themes here, relative to this directory. 125 | # html_theme_path = [] 126 | 127 | # The name for this set of Sphinx documents. If None, it defaults to 128 | # " v documentation". 129 | html_title = "ftfy: fixes text for you" 130 | 131 | # A shorter title for the navigation bar. Default is the same as html_title. 132 | html_short_title = "ftfy" 133 | 134 | # The name of an image file (relative to this directory) to place at the top 135 | # of the sidebar. 136 | # html_logo = None 137 | 138 | # The name of an image file (within the static path) to use as favicon of the 139 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 140 | # pixels large. 141 | # html_favicon = None 142 | 143 | # Add any paths that contain custom static files (such as style sheets) here, 144 | # relative to this directory. They are copied after the builtin static files, 145 | # so a file named "default.css" will overwrite the builtin "default.css". 146 | html_static_path = ["_static"] 147 | 148 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 149 | # using the given strftime format. 150 | html_last_updated_fmt = "%b %d, %Y" 151 | 152 | # If true, SmartyPants will be used to convert quotes and dashes to 153 | # typographically correct entities. 154 | # html_use_smartypants = True 155 | 156 | # Custom sidebar templates, maps document names to template names. 157 | # html_sidebars = {} 158 | 159 | # Additional templates that should be rendered to pages, maps page names to 160 | # template names. 161 | # html_additional_pages = {} 162 | 163 | # If false, no module index is generated. 164 | # html_domain_indices = True 165 | 166 | # If false, no index is generated. 167 | # html_use_index = True 168 | 169 | # If true, the index is split into individual pages for each letter. 170 | # html_split_index = False 171 | 172 | # If true, links to the reST sources are added to the pages. 
173 | html_show_sourcelink = False 174 | 175 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 176 | # html_show_sphinx = True 177 | 178 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 179 | # html_show_copyright = True 180 | 181 | # If true, an OpenSearch description file will be output, and all pages will 182 | # contain a tag referring to it. The value of this option must be the 183 | # base URL from which the finished HTML is served. 184 | # html_use_opensearch = '' 185 | 186 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 187 | # html_file_suffix = None 188 | 189 | # Output file base name for HTML help builder. 190 | htmlhelp_basename = "ftfydoc" 191 | 192 | 193 | # -- Options for LaTeX output -------------------------------------------------- 194 | 195 | latex_elements = { 196 | # The paper size ('letterpaper' or 'a4paper'). 197 | # 'papersize': 'letterpaper', 198 | # The font size ('10pt', '11pt' or '12pt'). 199 | # 'pointsize': '10pt', 200 | # Additional stuff for the LaTeX preamble. 201 | # 'preamble': '', 202 | } 203 | 204 | # Grouping the document tree into LaTeX files. List of tuples 205 | # (source start file, target name, title, author, documentclass [howto/manual]). 206 | latex_documents = [] 207 | 208 | # The name of an image file (relative to this directory) to place at the top of 209 | # the title page. 210 | # latex_logo = None 211 | 212 | # For "manual" documents, if this is true, then toplevel headings are parts, 213 | # not chapters. 214 | # latex_use_parts = False 215 | 216 | # If true, show page references after internal links. 217 | # latex_show_pagerefs = False 218 | 219 | # If true, show URL addresses after external links. 220 | # latex_show_urls = False 221 | 222 | # Documents to append as an appendix to all manuals. 223 | # latex_appendices = [] 224 | 225 | # If false, no module index is generated. 226 | # latex_domain_indices = True 227 | 228 | 229 | # -- Options for manual page output -------------------------------------------- 230 | 231 | # One entry per manual page. List of tuples 232 | # (source start file, name, description, authors, manual section). 233 | man_pages = [("index", "ftfy", "ftfy Documentation", ["Robyn Speer"], 1)] 234 | 235 | # If true, show URL addresses after external links. 236 | # man_show_urls = False 237 | 238 | 239 | # -- Options for Texinfo output ------------------------------------------------ 240 | 241 | # Grouping the document tree into Texinfo files. List of tuples 242 | # (source start file, target name, title, author, 243 | # dir menu entry, description, category) 244 | texinfo_documents = [] 245 | 246 | # Documents to append as an appendix to all manuals. 247 | # texinfo_appendices = [] 248 | 249 | # If false, no module index is generated. 250 | # texinfo_domain_indices = True 251 | 252 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 253 | # texinfo_show_urls = 'footnote' 254 | 255 | # If true, do not generate a @detailmenu in the "Top" node's menu. 256 | # texinfo_no_detailmenu = False 257 | -------------------------------------------------------------------------------- /docs/config.rst: -------------------------------------------------------------------------------- 1 | .. _config: 2 | 3 | Configuring ftfy 4 | ================ 5 | 6 | The main functions of ftfy -- :func:`ftfy.fix_text` and :func:`ftfy.fix_and_explain` -- run text through a sequence of fixes. 
If the text changed, it will run them through again, so that you can be sure the output ends up in a standard form that will be unchanged by ftfy. 7 | 8 | All the fixes are on by default, but you can pass in a configuration object or keyword options to turn them off. Check that the default fixes are appropriate for your use case. For example: 9 | 10 | - You should set `unescape_html` to False if the output is meant to be interpreted as HTML. 11 | 12 | - You should set `fix_character_width` to False if you want to preserve the spacing of CJK text. 13 | 14 | - You should set `uncurl_quotes` to False if you want to preserve quotation marks with nice typography. You could even consider doing the opposite of `uncurl_quotes`, running `smartypants`_ on the result to make all the punctuation typographically nice. 15 | 16 | - To be cautious and only fix mojibake when it can be fixed with a consistent sequence of encoding and decoding steps, you should set `decode_inconsistent_utf8` to False. 17 | 18 | .. _smartypants: http://pythonhosted.org/smartypants/ 19 | 20 | If the only fix you need is to detect and repair decoding errors (mojibake), use the :func:`ftfy.fix_encoding` function directly. However, note that mojibake is often entangled with other issues such as the curliness of quotation marks, so limiting the process to this step might make some mojibake unfixable. 21 | 22 | The TextFixerConfig object 23 | -------------------------- 24 | 25 | The top-level functions of ftfy take a `config` argument that is an instance of :class:`ftfy.TextFixerConfig`. If this argument is None, the configuration will use its default values. 26 | 27 | .. autoclass:: ftfy.TextFixerConfig() 28 | 29 | Keyword arguments 30 | ----------------- 31 | The top-level functions also accept keyword arguments in place of a `config` argument. Given these keyword arguments, they will pass them to the :class:`ftfy.TextFixerConfig` constructor, overriding the default values of those configuration options. 32 | -------------------------------------------------------------------------------- /docs/detect.rst: -------------------------------------------------------------------------------- 1 | Is ftfy an encoding detector? 2 | ============================= 3 | 4 | No, it's a mojibake detector (and fixer). That makes its task much easier, because it doesn't have to guess the encoding of everything: it can leave correct-looking text as it is. 5 | 6 | Encoding detectors have ended up being a bad idea, and they are largely responsible for *creating* the problems that ftfy has to fix. 7 | 8 | The text that you put into ftfy should be Unicode that you've attempted to decode correctly. ftfy doesn't accept bytes as input. 9 | 10 | There is a lot of Unicode out there that has already been mangled by mojibake, even when decoded properly. That is, you might correctly interpret the text as UTF-8, and what the UTF-8 text really says is a mojibake string like "réflexion" that needs to be decoded *again*. This is when you need ftfy. 11 | 12 | 13 | I really need to guess the encoding of some bytes 14 | ------------------------------------------------- 15 | 16 | I understand. Sometimes we can't have nice things. 17 | 18 | Though it's not part of the usual operation of ftfy, ftfy *does* contain a byte-encoding-guesser that tries to be less terrible than other byte-encoding-guessers in common cases. Instead of using probabilistic heuristics, it picks up on very strong signals like "having a UTF-16 byte-order mark" or "decoding successfully as UTF-8". 
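Here's a minimal sketch of what calling it looks like (the exact return value shown is an assumption based on the behavior described above, not a guarantee)::

    >>> import ftfy
    >>> ftfy.guess_bytes(b'caf\xc3\xa9')   # decodes cleanly as UTF-8, so UTF-8 wins
    ('café', 'utf-8')

:func:`ftfy.guess_bytes` returns the decoded text along with the name of the encoding it settled on, so you can inspect the guess instead of silently trusting it.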
19 | 20 | This function won't solve everything. It can't solve everything. In particular, it has no capacity to guess non-Unicode CJK encodings such as Shift-JIS or Big5. 21 | 22 | .. autofunction:: ftfy.guess_bytes 23 | 24 | -------------------------------------------------------------------------------- /docs/encodings.rst: -------------------------------------------------------------------------------- 1 | Encodings ftfy can handle 2 | ========================= 3 | 4 | ftfy can't fix all possible mix-ups. Its goal is to cover the most common encoding mix-ups while keeping false positives to a very low rate. 5 | 6 | ftfy can understand text that was decoded as any of these single-byte encodings: 7 | 8 | - Latin-1 (ISO-8859-1) 9 | - Windows-1250 (cp1250 -- used in Microsoft products in Eastern Europe) 10 | - Windows-1251 (cp1251 -- used in Microsoft products in Russia) 11 | - Windows-1252 (cp1252 -- used in Microsoft products in Western Europe and the Americas) 12 | - Windows-1253 (cp1253 -- used in Microsoft products in Greece) 13 | - Windows-1254 (cp1254 -- used in Microsoft products in Türkiye) 14 | - Windows-1257 (cp1257 -- used in Microsoft products in Baltic countries) 15 | - ISO-8859-2 (which is not quite the same as Windows-1250) 16 | - MacRoman (used on Mac OS 9 and earlier) 17 | - cp437 (it's the "text mode" in your video card firmware) 18 | 19 | when it was actually intended to be decoded as one of these variable-length encodings: 20 | 21 | - UTF-8 22 | - CESU-8 (a common, incorrect implementation of UTF-8) 23 | 24 | It can also understand text that was intended as Windows-1252 but decoded as Latin-1. That's the very common case where things like smart-quotes and bullets turn into single weird control characters. 25 | 26 | However, ftfy cannot understand other mixups between single-byte encodings, because it is extremely difficult to detect which mixup in particular is the one that happened. 27 | 28 | We also can't handle the legacy encodings used for Chinese, Japanese, and Korean, such as ``shift-jis`` and ``gb18030``. See `issue #34`_ for why this is so hard. 29 | 30 | I tried adding support for cp850, the cp437-workalike that supported European languages, but I couldn't find any real examples that it fixed, and it introduced some false positives. 31 | 32 | .. _`issue #34`: https://github.com/rspeer/python-ftfy/issues/34 33 | 34 | Remember that the input to ftfy is Unicode, so it handles actual CJK *text* just fine. It just can't discover that a CJK *encoding* introduced mojibake into the text. 35 | -------------------------------------------------------------------------------- /docs/explain.rst: -------------------------------------------------------------------------------- 1 | Fixing problems and getting explanations 2 | ======================================== 3 | 4 | Ode to a Shipping Label 5 | ----------------------- 6 | 7 | A `poem about mojibake`_, whose original author might be `Carlos Bueno on Facebook`_, shows a shipping label that serves as an excellent example for this section, addressed to the surname `L&AMP;ATILDE;&AMP;SUP3;PEZ`. 8 | 9 | .. _`poem about mojibake`: https://imgur.com/4J7Il0m 10 | .. _`Carlos Bueno on Facebook`: https://www.facebook.com/cmb/posts/619241744770551:0 11 | 12 | .. 
image:: images/shipping-label.png 13 | :width: 600 14 | :alt: A package addressed to a name including "L&AMP;ATILDE;&AMP;SUP3;PEZ" 15 | 16 | We can use ftfy not only to fix the text that was on the label, but to show us what happened to it (like the poem does):: 17 | 18 | >>> from ftfy import fix_and_explain, apply_plan 19 | >>> shipping_label = "L&AMP;ATILDE;&AMP;SUP3;PEZ" 20 | >>> fixed, explanation = fix_and_explain(shipping_label) 21 | >>> fixed 22 | 'LóPEZ' 23 | 24 | >>> explanation 25 | [('apply', 'unescape_html'), 26 | ('apply', 'unescape_html'), 27 | ('apply', 'unescape_html'), 28 | ('encode', 'latin-1'), 29 | ('decode', 'utf-8')] 30 | 31 | The capitalization is inconsistent because the encoding of a lowercase "ó" is in there, but everything was printed in capital letters. 32 | 33 | The explanation may even be able to be applied to different text with the same problem:: 34 | 35 | >>> label2 = "CARR&AMP;ATILDE;&AMP;COPY;" 36 | >>> apply_plan(label2, explanation) 37 | 'CARRé' 38 | 39 | Functions that fix text 40 | ----------------------- 41 | 42 | The function that you'll probably use most often is :func:`ftfy.fix_text`, which applies all the fixes it can to every line of text, and returns the fixed text. 43 | 44 | .. autofunction:: ftfy.fix_text 45 | 46 | :func:`ftfy.fix_and_explain` takes the same arguments as :func:`ftfy.fix_text`, but provides an explanation, like we saw in the first section. 47 | 48 | .. autofunction:: ftfy.fix_and_explain 49 | 50 | Unlike :func:`ftfy.fix_text`, :func:`ftfy.fix_and_explain` doesn't separate the text into lines that it fixes separately -- because it's looking for a unified explanation of what happened to the text, not a different one for each line. 51 | 52 | A more targeted function is :func:`ftfy.fix_encoding_and_explain`, which only fixes problems that can be solved by encoding and decoding the text, not other problems such as HTML entities: 53 | 54 | .. autofunction:: ftfy.fix_encoding_and_explain 55 | 56 | This function has a counterpart that returns just the fixed string, without the explanation. It still fixes the string as a whole, not line by line. 57 | 58 | .. autofunction:: ftfy.fix_encoding 59 | 60 | The return type of the `..._and_explain` functions is a kind of NamedTuple called `ExplainedText`: 61 | 62 | .. autoclass:: ftfy.ExplainedText 63 | 64 | These explanations can be re-applied to text using :func:`apply_plan`: 65 | 66 | .. autofunction:: ftfy.apply_plan 67 | 68 | Showing the characters in a string 69 | ---------------------------------- 70 | 71 | A different kind of explanation you might need is simply a breakdown of what Unicode characters a string contains. For this, ftfy provides a utility function, :func:`ftfy.explain_unicode()`. 72 | 73 | .. autofunction:: ftfy.explain_unicode 74 | 75 | A command-line utility that provides similar information, and even more detail, is lunasorcery's `utf8info`_. 76 | 77 | .. _`utf8info`: https://github.com/lunasorcery/utf8info -------------------------------------------------------------------------------- /docs/fixes.rst: -------------------------------------------------------------------------------- 1 | "Fixer" functions 2 | ================= 3 | 4 | .. 
automodule:: ftfy.fixes 5 | :members: decode_escapes, decode_inconsistent_utf8, fix_c1_controls, 6 | fix_character_width, fix_latin_ligatures, fix_line_breaks, 7 | fix_surrogates, remove_control_chars, remove_terminal_escapes, 8 | replace_lossy_sequences, restore_byte_a0, uncurl_quotes, 9 | unescape_html 10 | -------------------------------------------------------------------------------- /docs/heuristic.rst: -------------------------------------------------------------------------------- 1 | .. _heuristic: 2 | 3 | Heuristics for detecting mojibake 4 | ================================= 5 | 6 | The "badness" heuristic 7 | ----------------------- 8 | 9 | .. automodule:: ftfy.badness 10 | :members: badness, is_bad 11 | 12 | 13 | The "UTF-8 detector" heuristic 14 | ------------------------------ 15 | A more narrow heuristic is defined in ``chardata.py`` as ``UTF8_DETECTOR_RE``. This heuristic looks for specific sequences of mojibake characters that come from the decoding of common UTF-8 sequences. 16 | 17 | Text that matches this regular expression can be partially fixed by :func:`ftfy.fixes.decode_inconsistent_utf8`, even when the string as a whole doesn't decode consistently. 18 | 19 | Because of this, the expression requires that the match isn't preceded by likely UTF-8 characters -- if this were allowed, then it might pick two or three characters out of a larger mess of mojibake to decode as another character while leaving the rest untouched. This makes the problem more confusing, doesn't really solve anything, and can even pile up recursively to decode as entirely arbitrary characters. 20 | 21 | 22 | The "lossy UTF-8" heuristic 23 | ---------------------------- 24 | ``chardata.py`` also includes ``LOSSY_UTF8_RE``, which is used similarly to the "UTF-8 detector" heuristic. This regular expression matches sequences that look like they were incorrectly decoded from UTF-8, but with characters replaced by question marks or the Unicode replacement character `�`. 25 | 26 | Characters that match this heuristic will be replaced by `�` in the :func:`ftfy.fixes.replace_lossy_sequences` fixer. -------------------------------------------------------------------------------- /docs/images/shipping-label.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/python-ftfy/74dd0452b48286a3770013b3a02755313bd5575e/docs/images/shipping-label.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ftfy: fixes text for you 2 | ======================== 3 | 4 | *Version 6.3* 5 | 6 | “Assume all external input is the result of (a series of) bugs.” 7 | — `RFC 9225`_: Software Defects Considered Harmful 8 | 9 | .. _`RFC 9225`: https://www.rfc-editor.org/rfc/rfc9225.html 10 | 11 | **ftfy** fixes Unicode that's broken in various ways. 12 | 13 | The goal of ftfy is to **take in bad Unicode and output good Unicode**, for use in your Unicode-aware code. 14 | 15 | This is different from taking in non-Unicode and outputting Unicode, which is not a goal of ftfy. It also isn't designed to protect you from having to write Unicode-aware code. ftfy helps those who help themselves. 16 | 17 | Of course you're better off if your input is decoded properly and has no glitches. But you often don't have any control over your input; it's someone else's mistake, but it's your problem now. 
ftfy will do everything it can to fix the problem. 18 | 19 | ftfy is a heuristic that was designed (not machine-learned) by Robyn Speer. If you use ftfy in research, including pre-processing your language model data, you need to cite it: see :ref:`cite`. 20 | 21 | .. toctree:: 22 | :maxdepth: 1 23 | 24 | explain 25 | config 26 | encodings 27 | fixes 28 | detect 29 | avoid 30 | heuristic 31 | bad_encodings 32 | cli 33 | cite 34 | 35 | 36 | Some quick examples 37 | ------------------- 38 | 39 | Here are some examples (found in the real world) of what ftfy can do: 40 | 41 | ftfy can fix mojibake (encoding mix-ups), by detecting patterns of characters that were clearly meant to be UTF-8 but were decoded as something else: 42 | 43 | >>> import ftfy 44 | >>> ftfy.fix_text('✔ No problems') 45 | '✔ No problems' 46 | 47 | Does this sound impossible? It's really not. UTF-8 is a well-designed encoding that makes it obvious when it's being misused, and a string of mojibake usually contains all the information we need to recover the original string. 48 | 49 | ftfy can fix multiple layers of mojibake simultaneously: 50 | 51 | >>> ftfy.fix_text('The Mona Lisa doesn’t have eyebrows.') 52 | "The Mona Lisa doesn't have eyebrows." 53 | 54 | It can fix mojibake that has had "curly quotes" applied on top of it, which cannot be consistently decoded until the quotes are uncurled: 55 | 56 | >>> ftfy.fix_text("l’humanité") 57 | "l'humanité" 58 | 59 | ftfy can fix mojibake that would have included the character U+A0 (non-breaking space), but the U+A0 was turned into an ASCII space and then combined with another following space: 60 | 61 | >>> ftfy.fix_text('Ã\xa0 perturber la réflexion') 62 | 'à perturber la réflexion' 63 | >>> ftfy.fix_text('à perturber la réflexion') 64 | 'à perturber la réflexion' 65 | 66 | ftfy can also decode HTML entities that appear outside of HTML, even in cases where the entity has been incorrectly capitalized: 67 | 68 | >>> # by the HTML 5 standard, only 'PÉREZ' is acceptable 69 | >>> ftfy.fix_text('P&EACUTE;REZ') 70 | 'PÉREZ' 71 | 72 | These fixes are not applied in all cases, because ftfy has a strongly-held goal of avoiding false positives -- it should never change correctly-decoded text to something else. 73 | 74 | The following text could be encoded in Windows-1252 and decoded in UTF-8, and it would decode as 'MARQUɅ'. However, the original text is already sensible, so it is unchanged. 75 | 76 | >>> ftfy.fix_text('IL Y MARQUÉ…') 77 | 'IL Y MARQUÉ…' 78 | -------------------------------------------------------------------------------- /ftfy/bad_codecs/__init__.py: -------------------------------------------------------------------------------- 1 | r""" 2 | The `ftfy.bad_codecs` module gives Python the ability to decode some common, 3 | flawed encodings. 4 | 5 | Python does not want you to be sloppy with your text. Its encoders and decoders 6 | ("codecs") follow the relevant standards whenever possible, which means that 7 | when you get text that *doesn't* follow those standards, you'll probably fail 8 | to decode it. Or you might succeed at decoding it for implementation-specific 9 | reasons, which is perhaps worse. 10 | 11 | There are some encodings out there that Python wishes didn't exist, which are 12 | widely used outside of Python: 13 | 14 | - "utf-8-variants", a family of not-quite-UTF-8 encodings, including the 15 | ever-popular CESU-8 and "Java modified UTF-8". 
16 | - "Sloppy" versions of character map encodings, where bytes that don't map to 17 | anything will instead map to the Unicode character with the same number. 18 | 19 | Simply importing this module, or in fact any part of the `ftfy` package, will 20 | make these new "bad codecs" available to Python through the standard Codecs 21 | API. You never have to actually call any functions inside `ftfy.bad_codecs`. 22 | 23 | However, if you want to call something because your code checker insists on it, 24 | you can call ``ftfy.bad_codecs.ok()``. 25 | 26 | A quick example of decoding text that's encoded in CESU-8: 27 | 28 | >>> import ftfy.bad_codecs 29 | >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants')) 30 | 😍 31 | """ 32 | 33 | import codecs 34 | from encodings import normalize_encoding 35 | from typing import Optional 36 | 37 | _CACHE: dict[str, codecs.CodecInfo] = {} 38 | 39 | # Define some aliases for 'utf-8-variants'. All hyphens get turned into 40 | # underscores, because of `normalize_encoding`. 41 | UTF8_VAR_NAMES = ( 42 | "utf_8_variants", 43 | "utf8_variants", 44 | "utf_8_variant", 45 | "utf8_variant", 46 | "utf_8_var", 47 | "utf8_var", 48 | "cesu_8", 49 | "cesu8", 50 | "java_utf_8", 51 | "java_utf8", 52 | ) 53 | 54 | 55 | def search_function(encoding: str) -> Optional[codecs.CodecInfo]: 56 | """ 57 | Register our "bad codecs" with Python's codecs API. This involves adding 58 | a search function that takes in an encoding name, and returns a codec 59 | for that encoding if it knows one, or None if it doesn't. 60 | 61 | The encodings this will match are: 62 | 63 | - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N', 64 | where the non-sloppy version is an encoding that leaves some bytes 65 | unmapped to characters. 66 | - The 'utf-8-variants' encoding, which has the several aliases seen 67 | above. 68 | """ 69 | if encoding in _CACHE: 70 | return _CACHE[encoding] 71 | 72 | norm_encoding = normalize_encoding(encoding) 73 | codec = None 74 | if norm_encoding in UTF8_VAR_NAMES: 75 | from ftfy.bad_codecs.utf8_variants import CODEC_INFO 76 | 77 | codec = CODEC_INFO 78 | elif norm_encoding.startswith("sloppy_"): 79 | from ftfy.bad_codecs.sloppy import CODECS 80 | 81 | codec = CODECS.get(norm_encoding) 82 | 83 | if codec is not None: 84 | _CACHE[encoding] = codec 85 | 86 | return codec 87 | 88 | 89 | def ok() -> None: 90 | """ 91 | A feel-good function that gives you something to call after importing 92 | this package. 93 | 94 | Why is this here? Pyflakes. Pyflakes gets upset when you import a module 95 | and appear not to use it. It doesn't know that you're using it when 96 | you use the ``unicode.encode`` and ``bytes.decode`` methods with certain 97 | encodings. 98 | """ 99 | 100 | 101 | codecs.register(search_function) 102 | -------------------------------------------------------------------------------- /ftfy/bad_codecs/sloppy.py: -------------------------------------------------------------------------------- 1 | r""" 2 | `ftfy.bad_codecs.sloppy` provides character-map encodings that fill their "holes" 3 | in a messy but common way: by outputting the Unicode codepoints with the same 4 | numbers. 5 | 6 | This is incredibly ugly, and it's also in the HTML5 standard. 7 | 8 | A single-byte encoding maps each byte to a Unicode character, except that some 9 | bytes are left unmapped. In the commonly-used Windows-1252 encoding, for 10 | example, bytes 0x81 and 0x8D, among others, have no meaning. 
11 | 12 | Python, wanting to preserve some sense of decorum, will handle these bytes 13 | as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're 14 | different from each other. It just hasn't defined what they are in terms of 15 | Unicode. 16 | 17 | Software that has to interoperate with Windows-1252 and Unicode -- such as all 18 | the common Web browsers -- will pick some Unicode characters for them to map 19 | to, and the characters they pick are the Unicode characters with the same 20 | numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the 21 | resulting characters tend to fall into a range of Unicode that's set aside for 22 | obsolete Latin-1 control characters anyway. 23 | 24 | These sloppy codecs let Python do the same thing, thus interoperating with 25 | other software that works this way. It defines a sloppy version of many 26 | single-byte encodings with holes. (There is no need for a sloppy version of 27 | an encoding without holes: for example, there is no such thing as 28 | sloppy-iso-8859-2 or sloppy-macroman.) 29 | 30 | The following encodings will become defined: 31 | 32 | - sloppy-windows-1250 (Central European, sort of based on ISO-8859-2) 33 | - sloppy-windows-1251 (Cyrillic) 34 | - sloppy-windows-1252 (Western European, based on Latin-1) 35 | - sloppy-windows-1253 (Greek, sort of based on ISO-8859-7) 36 | - sloppy-windows-1254 (Turkish, based on ISO-8859-9) 37 | - sloppy-windows-1255 (Hebrew, based on ISO-8859-8) 38 | - sloppy-windows-1256 (Arabic) 39 | - sloppy-windows-1257 (Baltic, based on ISO-8859-13) 40 | - sloppy-windows-1258 (Vietnamese) 41 | - sloppy-cp874 (Thai, based on ISO-8859-11) 42 | - sloppy-iso-8859-3 (Maltese and Esperanto, I guess) 43 | - sloppy-iso-8859-6 (different Arabic) 44 | - sloppy-iso-8859-7 (Greek) 45 | - sloppy-iso-8859-8 (Hebrew) 46 | - sloppy-iso-8859-11 (Thai) 47 | 48 | Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be 49 | defined. 50 | 51 | Five of these encodings (`sloppy-windows-1250` through `sloppy-windows-1254`) 52 | are used within ftfy. 53 | 54 | Here are some examples, using :func:`ftfy.explain_unicode` to illustrate how 55 | sloppy-windows-1252 merges Windows-1252 with Latin-1: 56 | 57 | >>> from ftfy import explain_unicode 58 | >>> some_bytes = b'\x80\x81\x82' 59 | >>> explain_unicode(some_bytes.decode('latin-1')) 60 | U+0080 \x80 [Cc] 61 | U+0081 \x81 [Cc] 62 | U+0082 \x82 [Cc] 63 | 64 | >>> explain_unicode(some_bytes.decode('windows-1252', 'replace')) 65 | U+20AC € [Sc] EURO SIGN 66 | U+FFFD � [So] REPLACEMENT CHARACTER 67 | U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK 68 | 69 | >>> explain_unicode(some_bytes.decode('sloppy-windows-1252')) 70 | U+20AC € [Sc] EURO SIGN 71 | U+0081 \x81 [Cc] 72 | U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK 73 | """ 74 | 75 | from __future__ import annotations 76 | 77 | import codecs 78 | from encodings import normalize_encoding 79 | 80 | REPLACEMENT_CHAR = "\ufffd" 81 | 82 | 83 | def make_sloppy_codec(encoding: str) -> codecs.CodecInfo: 84 | """ 85 | Take a codec name, and return a 'sloppy' version of that codec that can 86 | encode and decode the unassigned bytes in that encoding. 87 | 88 | Single-byte encodings in the standard library are defined using some 89 | boilerplate classes surrounding the functions that do the actual work, 90 | `codecs.charmap_decode` and `charmap_encode`. This function, given an 91 | encoding name, *defines* those boilerplate classes. 92 | """ 93 | # Make a bytestring of all 256 possible bytes. 
94 | all_bytes = bytes(range(256)) 95 | 96 | # Get a list of what they would decode to in Latin-1. 97 | sloppy_chars = list(all_bytes.decode("latin-1")) 98 | 99 | # Get a list of what they decode to in the given encoding. Use the 100 | # replacement character for unassigned bytes. 101 | decoded_chars = all_bytes.decode(encoding, errors="replace") 102 | 103 | # Update the sloppy_chars list. Each byte that was successfully decoded 104 | # gets its decoded value in the list. The unassigned bytes are left as 105 | # they are, which gives their decoding in Latin-1. 106 | for i, char in enumerate(decoded_chars): 107 | if char != REPLACEMENT_CHAR: 108 | sloppy_chars[i] = char 109 | 110 | # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute" 111 | # control code, to encode the Unicode replacement character U+FFFD. 112 | sloppy_chars[0x1A] = REPLACEMENT_CHAR 113 | 114 | # Create the data structures that tell the charmap methods how to encode 115 | # and decode in this sloppy encoding. 116 | decoding_table = "".join(sloppy_chars) 117 | encoding_table = codecs.charmap_build(decoding_table) 118 | 119 | # Now produce all the class boilerplate. Look at the Python source for 120 | # `encodings.cp1252` for comparison; this is almost exactly the same, 121 | # except I made it follow pep8. 122 | class Codec(codecs.Codec): 123 | def encode(self, input: str, errors: str | None = "strict") -> tuple[bytes, int]: 124 | return codecs.charmap_encode(input, errors, encoding_table) 125 | 126 | def decode(self, input: bytes, errors: str | None = "strict") -> tuple[str, int]: 127 | return codecs.charmap_decode(input, errors, decoding_table) # type: ignore[arg-type] 128 | 129 | class IncrementalEncoder(codecs.IncrementalEncoder): 130 | def encode(self, input: str, final: bool = False) -> bytes: 131 | return codecs.charmap_encode(input, self.errors, encoding_table)[0] 132 | 133 | class IncrementalDecoder(codecs.IncrementalDecoder): 134 | def decode(self, input: bytes, final: bool = False) -> str: # type: ignore[override] 135 | return codecs.charmap_decode(input, self.errors, decoding_table)[0] # type: ignore[arg-type] 136 | 137 | class StreamWriter(Codec, codecs.StreamWriter): 138 | pass 139 | 140 | class StreamReader(Codec, codecs.StreamReader): 141 | pass 142 | 143 | return codecs.CodecInfo( 144 | name="sloppy-" + encoding, 145 | encode=Codec().encode, 146 | decode=Codec().decode, # type: ignore[arg-type] 147 | incrementalencoder=IncrementalEncoder, 148 | incrementaldecoder=IncrementalDecoder, 149 | streamreader=StreamReader, 150 | streamwriter=StreamWriter, 151 | ) 152 | 153 | 154 | # Define a codec for each incomplete encoding. The resulting CODECS dictionary 155 | # can be used by the main module of ftfy.bad_codecs. 156 | CODECS = {} 157 | INCOMPLETE_ENCODINGS = ( 158 | [f"windows-{num}" for num in range(1250, 1259)] 159 | + [f"iso-8859-{num}" for num in (3, 6, 7, 8, 11)] 160 | + [f"cp{num}" for num in range(1250, 1259)] 161 | + ["cp874"] 162 | ) 163 | 164 | for _encoding in INCOMPLETE_ENCODINGS: 165 | _new_name = normalize_encoding("sloppy-" + _encoding) 166 | CODECS[_new_name] = make_sloppy_codec(_encoding) 167 | -------------------------------------------------------------------------------- /ftfy/bad_codecs/utf8_variants.py: -------------------------------------------------------------------------------- 1 | r""" 2 | This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can 3 | decode text that's been encoded with a popular non-standard version of UTF-8. 
4 | This includes CESU-8, the accidental encoding made by layering UTF-8 on top of 5 | UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for 6 | codepoint 0. 7 | 8 | This is particularly relevant in Python 3, which provides no other way of 9 | decoding CESU-8 [1]_. 10 | 11 | The easiest way to use the codec is to simply import `ftfy.bad_codecs`: 12 | 13 | >>> import ftfy.bad_codecs 14 | >>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var') 15 | >>> print(repr(result).lstrip('u')) 16 | 'here comes a null! \x00' 17 | 18 | The codec does not at all enforce "correct" CESU-8. For example, the Unicode 19 | Consortium's not-quite-standard describing CESU-8 requires that there is only 20 | one possible encoding of any character, so it does not allow mixing of valid 21 | UTF-8 and CESU-8. This codec *does* allow that, just like Python 2's UTF-8 22 | decoder does. 23 | 24 | Characters in the Basic Multilingual Plane still have only one encoding. This 25 | codec still enforces the rule, within the BMP, that characters must appear in 26 | their shortest form. There is one exception: the sequence of bytes `0xc0 0x80`, 27 | instead of just `0x00`, may be used to encode the null character `U+0000`, like 28 | in Java. 29 | 30 | If you encode with this codec, you get legitimate UTF-8. Decoding with this 31 | codec and then re-encoding is not idempotent, although encoding and then 32 | decoding is. So this module won't produce CESU-8 for you. Look for that 33 | functionality in the sister module, "Breaks Text For You", coming approximately 34 | never. 35 | 36 | .. [1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: 37 | first decode the bytes (incorrectly), then encode them, then decode them 38 | again, using UTF-8 as the codec every time. But Python 2 is dead, so use 39 | ftfy instead. 40 | """ 41 | 42 | import codecs 43 | import re 44 | from encodings.utf_8 import ( 45 | IncrementalDecoder as UTF8IncrementalDecoder, 46 | ) 47 | from encodings.utf_8 import ( 48 | IncrementalEncoder as UTF8IncrementalEncoder, 49 | ) 50 | from typing import Callable, Optional 51 | 52 | NAME = "utf-8-variants" 53 | 54 | # This regular expression matches all possible six-byte CESU-8 sequences, 55 | # plus truncations of them at the end of the string. (If any of the 56 | # subgroups matches $, then all the subgroups after it also have to match $, 57 | # as there are no more characters to match.) 58 | CESU8_EXPR = ( 59 | b"(" 60 | b"\xed" 61 | b"([\xa0-\xaf]|$)" 62 | b"([\x80-\xbf]|$)" 63 | b"(\xed|$)" 64 | b"([\xb0-\xbf]|$)" 65 | b"([\x80-\xbf]|$)" 66 | b")" 67 | ) 68 | 69 | CESU8_RE = re.compile(CESU8_EXPR) 70 | 71 | # This expression matches isolated surrogate characters that aren't 72 | # CESU-8, which have to be handled carefully on Python 2. 73 | SURROGATE_EXPR = b"(\xed([\xa0-\xbf]|$)([\x80-\xbf]|$))" 74 | 75 | # This expression matches the Java encoding of U+0, including if it's 76 | # truncated and we need more bytes. 77 | NULL_EXPR = b"(\xc0(\x80|$))" 78 | 79 | # This regex matches cases that we need to decode differently from 80 | # standard UTF-8. 81 | SPECIAL_BYTES_RE = re.compile(b"|".join([NULL_EXPR, CESU8_EXPR, SURROGATE_EXPR])) 82 | 83 | 84 | class IncrementalDecoder(UTF8IncrementalDecoder): 85 | """ 86 | An incremental decoder that extends Python's built-in UTF-8 decoder. 87 | 88 | This encoder needs to take in bytes, possibly arriving in a stream, and 89 | output the correctly decoded text. 
The general strategy for doing this 90 | is to fall back on the real UTF-8 decoder whenever possible, because 91 | the real UTF-8 decoder is way optimized, but to call specialized methods 92 | we define here for the cases the real encoder isn't expecting. 93 | """ 94 | 95 | @staticmethod 96 | def _buffer_decode( # type: ignore[override] 97 | input: bytes, errors: Optional[str], final: bool 98 | ) -> tuple[str, int]: 99 | """ 100 | Decode bytes that may be arriving in a stream, following the Codecs 101 | API. 102 | 103 | `input` is the incoming sequence of bytes. `errors` tells us how to 104 | handle errors, though we delegate all error-handling cases to the real 105 | UTF-8 decoder to ensure correct behavior. `final` indicates whether 106 | this is the end of the sequence, in which case we should raise an 107 | error given incomplete input. 108 | 109 | Returns as much decoded text as possible, and the number of bytes 110 | consumed. 111 | """ 112 | # decoded_segments are the pieces of text we have decoded so far, 113 | # and position is our current position in the byte string. (Bytes 114 | # before this position have been consumed, and bytes after it have 115 | # yet to be decoded.) 116 | decoded_segments = [] 117 | position = 0 118 | while True: 119 | # Use _buffer_decode_step to decode a segment of text. 120 | decoded, consumed = IncrementalDecoder._buffer_decode_step( 121 | input[position:], errors, final 122 | ) 123 | if consumed == 0: 124 | # Either there's nothing left to decode, or we need to wait 125 | # for more input. Either way, we're done for now. 126 | break 127 | 128 | # Append the decoded text to the list, and update our position. 129 | decoded_segments.append(decoded) 130 | position += consumed 131 | 132 | if final: 133 | # _buffer_decode_step must consume all the bytes when `final` is 134 | # true. 135 | assert position == len(input) 136 | 137 | return "".join(decoded_segments), position 138 | 139 | @staticmethod 140 | def _buffer_decode_step(input: bytes, errors: Optional[str], final: bool) -> tuple[str, int]: 141 | """ 142 | There are three possibilities for each decoding step: 143 | 144 | - Decode as much real UTF-8 as possible. 145 | - Decode a six-byte CESU-8 sequence at the current position. 146 | - Decode a Java-style null at the current position. 147 | 148 | This method figures out which step is appropriate, and does it. 149 | """ 150 | # Get a reference to the superclass method that we'll be using for 151 | # most of the real work. 152 | sup = UTF8IncrementalDecoder._buffer_decode 153 | 154 | # Find the next byte position that indicates a variant of UTF-8. 155 | match = SPECIAL_BYTES_RE.search(input) 156 | if match is None: 157 | return sup(input, errors, final) 158 | 159 | cutoff = match.start() 160 | if cutoff > 0: 161 | return sup(input[:cutoff], errors, True) 162 | 163 | # Some byte sequence that we intend to handle specially matches 164 | # at the beginning of the input. 165 | if input.startswith(b"\xc0"): 166 | if len(input) > 1: 167 | # Decode the two-byte sequence 0xc0 0x80. 168 | return "\u0000", 2 169 | if final: 170 | # We hit the end of the stream. Let the superclass method 171 | # handle it. 172 | return sup(input, errors, True) 173 | # Wait to see another byte. 174 | return "", 0 175 | # Decode a possible six-byte sequence starting with 0xed. 
176 | return IncrementalDecoder._buffer_decode_surrogates(sup, input, errors, final) 177 | 178 | @staticmethod 179 | def _buffer_decode_surrogates( 180 | sup: Callable[[bytes, Optional[str], bool], tuple[str, int]], 181 | input: bytes, 182 | errors: Optional[str], 183 | final: bool, 184 | ) -> tuple[str, int]: 185 | """ 186 | When we have improperly encoded surrogates, we can still see the 187 | bits that they were meant to represent. 188 | 189 | The surrogates were meant to encode a 20-bit number, to which we 190 | add 0x10000 to get a codepoint. That 20-bit number now appears in 191 | this form: 192 | 193 | 11101101 1010abcd 10efghij 11101101 1011klmn 10opqrst 194 | 195 | The CESU8_RE above matches byte sequences of this form. Then we need 196 | to extract the bits and assemble a codepoint number from them. 197 | """ 198 | if len(input) < 6: 199 | if final: 200 | # We found 0xed near the end of the stream, and there aren't 201 | # six bytes to decode. Delegate to the superclass method to 202 | # handle it as normal UTF-8. It might be a Hangul character 203 | # or an error. 204 | return sup(input, errors, final) 205 | # We found a surrogate, the stream isn't over yet, and we don't 206 | # know enough of the following bytes to decode anything, so 207 | # consume zero bytes and wait. 208 | return "", 0 209 | if CESU8_RE.match(input): 210 | # Given this is a CESU-8 sequence, do some math to pull out 211 | # the intended 20-bit value, and consume six bytes. 212 | codepoint = ( 213 | ((input[1] & 0x0F) << 16) 214 | + ((input[2] & 0x3F) << 10) 215 | + ((input[4] & 0x0F) << 6) 216 | + (input[5] & 0x3F) 217 | + 0x10000 218 | ) 219 | return chr(codepoint), 6 220 | # This looked like a CESU-8 sequence, but it wasn't one. 221 | # 0xed indicates the start of a three-byte sequence, so give 222 | # three bytes to the superclass to decode as usual. 223 | return sup(input[:3], errors, False) 224 | 225 | 226 | # The encoder is identical to UTF-8. 227 | IncrementalEncoder = UTF8IncrementalEncoder 228 | 229 | 230 | class StreamWriter(codecs.StreamWriter): 231 | @staticmethod 232 | def encode(input: str, errors: str = "strict") -> tuple[bytes, int]: 233 | return IncrementalEncoder(errors).encode(input, final=True), len(input) 234 | 235 | 236 | class StreamReader(codecs.StreamReader): 237 | @staticmethod 238 | def decode(input: bytes, errors: str = "strict") -> tuple[str, int]: 239 | return IncrementalDecoder(errors).decode(input, final=True), len(input) 240 | 241 | 242 | CODEC_INFO = codecs.CodecInfo( 243 | name=NAME, 244 | encode=StreamWriter.encode, 245 | decode=StreamReader.decode, # type: ignore[arg-type] 246 | incrementalencoder=IncrementalEncoder, 247 | incrementaldecoder=IncrementalDecoder, 248 | streamreader=StreamReader, 249 | streamwriter=StreamWriter, 250 | ) 251 | -------------------------------------------------------------------------------- /ftfy/badness.py: -------------------------------------------------------------------------------- 1 | """ 2 | `ftfy.badness` contains a heuristic that detects likely mojibake. 3 | 4 | This heuristic signals to ftfy which segments of text need to be fixed, and 5 | also indicates when the text can stop being fixed. 6 | 7 | The design of this heuristic is that we categorize the approximately 400 8 | Unicode characters that occur in UTF-8 mojibake, specifically the characters 9 | that come from mixing up UTF-8 with the other encodings we support. 
We 10 | identify sequences and contexts of these characters that are much more likely 11 | to be mojibake than intended strings, such as lowercase accented letters 12 | followed immediately by currency symbols. 13 | """ 14 | 15 | import warnings 16 | import re 17 | 18 | 19 | # There are only a few hundred characters that occur in known UTF-8 mojibake, and we can 20 | # characterize them: 21 | 22 | MOJIBAKE_CATEGORIES = { 23 | # Characters that appear in many different contexts. Sequences that contain 24 | # them are not inherently mojibake 25 | "common": ( 26 | "\N{NO-BREAK SPACE}" 27 | "\N{SOFT HYPHEN}" 28 | "\N{MIDDLE DOT}" 29 | "\N{ACUTE ACCENT}" 30 | "\N{EN DASH}" 31 | "\N{EM DASH}" 32 | "\N{HORIZONTAL BAR}" 33 | "\N{HORIZONTAL ELLIPSIS}" 34 | "\N{RIGHT SINGLE QUOTATION MARK}" 35 | ), 36 | # the C1 control character range, which have no uses outside of mojibake anymore 37 | "c1": "\x80-\x9f", 38 | # Characters that are nearly 100% used in mojibake 39 | "bad": ( 40 | "\N{BROKEN BAR}" 41 | "\N{CURRENCY SIGN}" 42 | "\N{DIAERESIS}" 43 | "\N{NOT SIGN}" 44 | "\N{MACRON}" 45 | "\N{CEDILLA}" 46 | "\N{LATIN SMALL LETTER F WITH HOOK}" 47 | "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # it's not a modifier 48 | "\N{CARON}" 49 | "\N{BREVE}" 50 | "\N{OGONEK}" 51 | "\N{SMALL TILDE}" 52 | "\N{DAGGER}" 53 | "\N{DOUBLE DAGGER}" 54 | "\N{PER MILLE SIGN}" 55 | "\N{REVERSED NOT SIGN}" 56 | "\N{LOZENGE}" 57 | "\ufffd" 58 | # Theoretically these would appear in 'numeric' contexts, but when they 59 | # co-occur with other mojibake characters, it's not really ambiguous 60 | "\N{FEMININE ORDINAL INDICATOR}" 61 | "\N{MASCULINE ORDINAL INDICATOR}" 62 | ), 63 | # Characters used in legalese 64 | "law": ( 65 | "\N{PILCROW SIGN}" 66 | "\N{SECTION SIGN}" 67 | ), 68 | "currency": ( 69 | "\N{CENT SIGN}" 70 | "\N{POUND SIGN}" 71 | "\N{YEN SIGN}" 72 | "\N{PESETA SIGN}" 73 | "\N{EURO SIGN}" 74 | ), 75 | "start_punctuation": ( 76 | "\N{INVERTED EXCLAMATION MARK}" 77 | "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}" 78 | "\N{INVERTED QUESTION MARK}" 79 | "\N{COPYRIGHT SIGN}" 80 | "\N{GREEK TONOS}" 81 | "\N{GREEK DIALYTIKA TONOS}" 82 | "\N{LEFT SINGLE QUOTATION MARK}" 83 | "\N{SINGLE LOW-9 QUOTATION MARK}" 84 | "\N{LEFT DOUBLE QUOTATION MARK}" 85 | "\N{DOUBLE LOW-9 QUOTATION MARK}" 86 | "\N{BULLET}" 87 | "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" 88 | "\uf8ff" # OS-specific symbol, usually the Apple logo 89 | ), 90 | "end_punctuation": ( 91 | "\N{REGISTERED SIGN}" 92 | "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" 93 | "\N{DOUBLE ACUTE ACCENT}" 94 | "\N{RIGHT DOUBLE QUOTATION MARK}" 95 | "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" 96 | "\N{TRADE MARK SIGN}" 97 | ), 98 | "numeric": ( 99 | "\N{SUPERSCRIPT TWO}" 100 | "\N{SUPERSCRIPT THREE}" 101 | "\N{SUPERSCRIPT ONE}" 102 | "\N{PLUS-MINUS SIGN}" 103 | "\N{VULGAR FRACTION ONE QUARTER}" 104 | "\N{VULGAR FRACTION ONE HALF}" 105 | "\N{VULGAR FRACTION THREE QUARTERS}" 106 | "\N{MULTIPLICATION SIGN}" 107 | "\N{MICRO SIGN}" 108 | "\N{DIVISION SIGN}" 109 | "\N{FRACTION SLASH}" 110 | "\N{PARTIAL DIFFERENTIAL}" 111 | "\N{INCREMENT}" 112 | "\N{N-ARY PRODUCT}" 113 | "\N{N-ARY SUMMATION}" 114 | "\N{SQUARE ROOT}" 115 | "\N{INFINITY}" 116 | "\N{INTERSECTION}" 117 | "\N{INTEGRAL}" 118 | "\N{ALMOST EQUAL TO}" 119 | "\N{NOT EQUAL TO}" 120 | "\N{IDENTICAL TO}" 121 | "\N{LESS-THAN OR EQUAL TO}" 122 | "\N{GREATER-THAN OR EQUAL TO}" 123 | "\N{NUMERO SIGN}" 124 | ), 125 | # Letters that might be used to make emoticon faces (kaomoji), and 126 | # therefore might need to appear in more 
improbable-looking contexts. 127 | # 128 | # These are concatenated character ranges for use in a regex. I know 129 | # they look like faces themselves. I think expressing the ranges like 130 | # this helps to illustrate why we need to be careful with these 131 | # characters. 132 | "kaomoji": ( 133 | "Ò-Ö" 134 | "Ù-Ü" 135 | "ò-ö" 136 | "ø-ü" 137 | "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" 138 | "\N{LATIN CAPITAL LETTER O WITH MACRON}" 139 | "\N{LATIN CAPITAL LETTER U WITH MACRON}" 140 | "\N{LATIN CAPITAL LETTER U WITH OGONEK}" 141 | "\N{DEGREE SIGN}" 142 | ), 143 | "upper_accented": ( 144 | # LATIN CAPITAL LETTER A WITH GRAVE - LATIN CAPITAL LETTER N WITH TILDE 145 | "\xc0-\xd1" 146 | # skip capital O's and U's that could be used in kaomoji, but 147 | # include Ø because it's very common in Arabic mojibake: 148 | "\N{LATIN CAPITAL LETTER O WITH STROKE}" 149 | "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" 150 | "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" 151 | "\N{LATIN CAPITAL LETTER A WITH BREVE}" 152 | "\N{LATIN CAPITAL LETTER A WITH MACRON}" 153 | "\N{LATIN CAPITAL LETTER A WITH OGONEK}" 154 | "\N{LATIN CAPITAL LETTER C WITH ACUTE}" 155 | "\N{LATIN CAPITAL LETTER C WITH CARON}" 156 | "\N{LATIN CAPITAL LETTER D WITH CARON}" 157 | "\N{LATIN CAPITAL LETTER D WITH STROKE}" 158 | "\N{LATIN CAPITAL LETTER E WITH OGONEK}" 159 | "\N{LATIN CAPITAL LETTER E WITH CARON}" 160 | "\N{LATIN CAPITAL LETTER E WITH MACRON}" 161 | "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}" 162 | "\N{LATIN CAPITAL LETTER G WITH BREVE}" 163 | "\N{LATIN CAPITAL LETTER G WITH CEDILLA}" 164 | "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" 165 | "\N{LATIN CAPITAL LETTER I WITH MACRON}" 166 | "\N{LATIN CAPITAL LETTER K WITH CEDILLA}" 167 | "\N{LATIN CAPITAL LETTER L WITH ACUTE}" 168 | "\N{LATIN CAPITAL LETTER L WITH CARON}" 169 | "\N{LATIN CAPITAL LETTER L WITH STROKE}" 170 | "\N{LATIN CAPITAL LETTER L WITH CEDILLA}" 171 | "\N{LATIN CAPITAL LETTER N WITH ACUTE}" 172 | "\N{LATIN CAPITAL LETTER N WITH CARON}" 173 | "\N{LATIN CAPITAL LETTER N WITH CEDILLA}" 174 | "\N{LATIN CAPITAL LIGATURE OE}" 175 | "\N{LATIN CAPITAL LETTER R WITH CARON}" 176 | "\N{LATIN CAPITAL LETTER S WITH ACUTE}" 177 | "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" 178 | "\N{LATIN CAPITAL LETTER S WITH CARON}" 179 | "\N{LATIN CAPITAL LETTER T WITH CEDILLA}" 180 | "\N{LATIN CAPITAL LETTER T WITH CARON}" 181 | "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}" 182 | "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}" 183 | "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" 184 | "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" 185 | "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" 186 | "\N{LATIN CAPITAL LETTER Z WITH CARON}" 187 | "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" 188 | ), 189 | "lower_accented": ( 190 | "\N{LATIN SMALL LETTER SHARP S}" 191 | # LATIN SMALL LETTER A WITH GRAVE - LATIN SMALL LETTER N WITH TILDE 192 | "\xe0-\xf1" 193 | # skip o's and u's that could be used in kaomoji 194 | "\N{LATIN SMALL LETTER A WITH BREVE}" 195 | "\N{LATIN SMALL LETTER A WITH OGONEK}" 196 | "\N{LATIN SMALL LETTER A WITH MACRON}" 197 | "\N{LATIN SMALL LETTER C WITH ACUTE}" 198 | "\N{LATIN SMALL LETTER C WITH CARON}" 199 | "\N{LATIN SMALL LETTER D WITH CARON}" 200 | "\N{LATIN SMALL LETTER D WITH STROKE}" 201 | "\N{LATIN SMALL LETTER E WITH OGONEK}" 202 | "\N{LATIN SMALL LETTER E WITH CARON}" 203 | "\N{LATIN SMALL LETTER E WITH MACRON}" 204 | "\N{LATIN SMALL LETTER E WITH DOT ABOVE}" 205 | "\N{LATIN SMALL LETTER G WITH BREVE}" 206 | "\N{LATIN SMALL LETTER G WITH CEDILLA}" 207 | "\N{LATIN SMALL LETTER I WITH 
OGONEK}" 208 | "\N{LATIN SMALL LETTER I WITH MACRON}" 209 | "\N{LATIN SMALL LETTER K WITH CEDILLA}" 210 | "\N{LATIN SMALL LETTER L WITH ACUTE}" 211 | "\N{LATIN SMALL LETTER L WITH CARON}" 212 | "\N{LATIN SMALL LETTER L WITH STROKE}" 213 | "\N{LATIN SMALL LETTER L WITH CEDILLA}" 214 | "\N{LATIN SMALL LIGATURE OE}" 215 | "\N{LATIN SMALL LETTER R WITH ACUTE}" 216 | "\N{LATIN SMALL LETTER S WITH ACUTE}" 217 | "\N{LATIN SMALL LETTER S WITH CEDILLA}" 218 | "\N{LATIN SMALL LETTER S WITH CARON}" 219 | "\N{LATIN SMALL LETTER T WITH CARON}" 220 | "\N{LATIN SMALL LETTER U WITH DIAERESIS}" 221 | "\N{LATIN SMALL LETTER Z WITH ACUTE}" 222 | "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" 223 | "\N{LATIN SMALL LETTER Z WITH CARON}" 224 | "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" 225 | "\N{LATIN SMALL LIGATURE FI}" 226 | "\N{LATIN SMALL LIGATURE FL}" 227 | ), 228 | "upper_common": ( 229 | "\N{LATIN CAPITAL LETTER THORN}" 230 | "\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}" 231 | # not included under 'accented' because these can commonly 232 | # occur at ends of words, in positions where they'd be detected 233 | # as mojibake 234 | "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" 235 | "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" 236 | "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" 237 | "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" 238 | "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" 239 | "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" 240 | "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" 241 | "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}" 242 | "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}" 243 | "\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}" 244 | ), 245 | "lower_common": ( 246 | # lowercase thorn does not appear in mojibake 247 | "\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}" 248 | "\N{GREEK SMALL LETTER ALPHA WITH TONOS}" 249 | "\N{GREEK SMALL LETTER EPSILON WITH TONOS}" 250 | "\N{GREEK SMALL LETTER ETA WITH TONOS}" 251 | "\N{GREEK SMALL LETTER IOTA WITH TONOS}" 252 | "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}" 253 | "\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}" 254 | ), 255 | "box": ( 256 | # omit the single horizontal line, might be used in kaomoji 257 | "│┌┐┘├┤┬┼" 258 | "\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}" 259 | "▀▄█▌▐░▒▓" 260 | ), 261 | } 262 | 263 | 264 | # We can now build a regular expression that detects unlikely juxtapositions 265 | # of characters, mostly based on their categories. 266 | # 267 | # Another regular expression, which detects sequences that look more specifically 268 | # like UTF-8 mojibake, appears in chardata.py. 269 | # 270 | # This is a verbose regular expression, with whitespace added for somewhat more 271 | # readability. Remember that the only spaces that count as literal spaces in this 272 | # expression are ones inside character classes (square brackets). 
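# A note on how the template below gets assembled: the placeholders such as
# {c1} and {bad} are filled in by the .format(**MOJIBAKE_CATEGORIES) call at
# the end of the pattern. For example, the first alternative, [{c1}], expands
# to the character class [\x80-\x9f], so a single stray C1 control character
# counts as one match of badness all on its own.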
273 | 274 | BADNESS_RE = re.compile( 275 | r""" 276 | [{c1}] 277 | | 278 | [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}] [{bad}] 279 | | 280 | [a-zA-Z] [{lower_common}{upper_common}] [{bad}] 281 | | 282 | [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}] 283 | | 284 | [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}] 285 | | 286 | [{box}{end_punctuation}{currency}{numeric}] [{lower_accented}] 287 | | 288 | [{lower_accented}{box}{end_punctuation}] [{currency}] 289 | | 290 | \s [{upper_accented}] [{currency}] 291 | | 292 | [{upper_accented}{box}] [{numeric}{law}] 293 | | 294 | [{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}] 295 | | 296 | [{lower_accented}{upper_accented}{currency}{numeric}{box}{law}] [{end_punctuation}] [{start_punctuation}] 297 | | 298 | [{currency}{numeric}{box}] [{start_punctuation}] 299 | | 300 | [a-z] [{upper_accented}] [{start_punctuation}{currency}] 301 | | 302 | [{box}] [{kaomoji}] 303 | | 304 | [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}{law}] [{box}] 305 | | 306 | [{box}] [{end_punctuation}] 307 | | 308 | [{lower_accented}{upper_accented}] [{start_punctuation}{end_punctuation}] \w 309 | | 310 | 311 | # The ligature œ when not followed by an unaccented Latin letter 312 | [Œœ][^A-Za-z] 313 | | 314 | 315 | # Degree signs after capital letters 316 | [{upper_accented}]° 317 | | 318 | 319 | # Common Windows-1252 2-character mojibake that isn't covered by the cases above 320 | [ÂÃÎÐ][€œŠš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´] 321 | | 322 | × [²³] 323 | | 324 | # Windows-1252 mojibake of Arabic words needs to include the 'common' characters. 325 | # To compensate, we require four characters to be matched. 326 | [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] 327 | [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] 328 | | 329 | 330 | # Windows-1252 mojibake that starts 3-character sequences for some South Asian 331 | # alphabets 332 | à[²µ¹¼½¾] 333 | | 334 | 335 | # MacRoman mojibake that isn't covered by the cases above 336 | √[±∂†≠®™´≤≥¥µø] 337 | | 338 | ≈[°¢] 339 | | 340 | ‚Ä[ìîïòôúùû†°¢π] 341 | | 342 | ‚[âó][àä°ê] 343 | | 344 | 345 | # Windows-1251 mojibake of characters in the U+2000 range 346 | †347 | | 348 | 349 | # Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet. 350 | # Because the 2-character sequences involved here may be common, we require 351 | # seeing a 3-character sequence. 352 | [ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС] 353 | | 354 | # A distinctive five-character sequence of Cyrillic letters, which can be 355 | # Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters. 356 | # Require a Latin letter nearby. 
357 | ГўВЂВ.[A-Za-z ] 358 | | 359 | 360 | # Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself 361 | Ã[\xa0¡] 362 | | 363 | [a-z]\s?[ÃÂ][ ] 364 | | 365 | ^[ÃÂ][ ] 366 | | 367 | 368 | # Cases where  precedes a character as an encoding of exactly the same 369 | # character, and the character is common enough 370 | [a-z.,?!{end_punctuation}]  [ {start_punctuation}{end_punctuation}] 371 | | 372 | 373 | # Windows-1253 mojibake of characters in the U+2000 range 374 | β€[™\xa0Ά\xad®°] 375 | | 376 | 377 | # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet 378 | [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ] 379 | | 380 | 381 | # Windows-1257 mojibake of characters in the U+2000 range 382 | †383 | """.format( 384 | **MOJIBAKE_CATEGORIES 385 | ), 386 | re.VERBOSE, 387 | ) 388 | 389 | 390 | def sequence_weirdness(text: str) -> int: 391 | """ 392 | This was the name of the heuristic used in ftfy 2.x through 5.x. As an 393 | attempt at compatibility with external code that calls the heuristic 394 | directly, we redirect to our new heuristic, :func:`badness`. 395 | """ 396 | warnings.warn( 397 | "`sequence_weirdness()` is an old heuristic, and the current " 398 | "closest equivalent is `ftfy.badness.badness()`" 399 | ) 400 | return badness(text) 401 | 402 | 403 | def badness(text: str) -> int: 404 | """ 405 | Get the 'badness' of a sequence of text, counting the number of unlikely 406 | character sequences. A badness greater than 0 indicates that some of it 407 | seems to be mojibake. 408 | """ 409 | return len(BADNESS_RE.findall(text)) 410 | 411 | 412 | def is_bad(text: str) -> bool: 413 | """ 414 | Returns true iff the given text looks like it contains mojibake. 415 | 416 | This can be faster than `badness`, because it returns when the first match 417 | is found to a regex instead of counting matches. Note that as strings get 418 | longer, they have a higher chance of returning True for `is_bad(string)`. 419 | """ 420 | return bool(BADNESS_RE.search(text)) 421 | -------------------------------------------------------------------------------- /ftfy/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | A command-line utility for fixing text found in a file. 3 | """ 4 | 5 | import os 6 | import sys 7 | from pathlib import Path 8 | from typing import Union 9 | 10 | from ftfy import TextFixerConfig, __version__, fix_file 11 | 12 | ENCODE_ERROR_TEXT_UNIX = """ftfy error: 13 | Unfortunately, this output stream does not support Unicode. 14 | 15 | Your system locale may be very old or misconfigured. You should use a locale 16 | that supports UTF-8. One way to do this is to `export LANG=C.UTF-8`. 17 | """ 18 | 19 | ENCODE_ERROR_TEXT_WINDOWS = """ftfy error: 20 | Unfortunately, this output stream does not support Unicode. 21 | 22 | You might be trying to output to the Windows Command Prompt (cmd.exe), which 23 | does not fully support Unicode for historical reasons. In general, we recommend 24 | finding a way to run Python without using cmd.exe. 25 | 26 | You can work around this problem by using the '-o filename' option in ftfy to 27 | output to a file instead. 28 | """ 29 | 30 | DECODE_ERROR_TEXT = """ftfy error: 31 | This input couldn't be decoded as %r. We got the following error: 32 | 33 | %s 34 | 35 | ftfy works best when its input is in a known encoding. You can use `ftfy -g` 36 | to guess, if you're desperate. Otherwise, give the encoding name with the 37 | `-e` option, such as `ftfy -e latin-1`. 
38 | """ 39 | 40 | SAME_FILE_ERROR_TEXT = """ftfy error: 41 | Can't read and write the same file. Please output to a new file instead. 42 | """ 43 | 44 | 45 | def main() -> None: 46 | """ 47 | Run ftfy as a command-line utility. 48 | """ 49 | import argparse 50 | 51 | parser = argparse.ArgumentParser( 52 | description=f"ftfy (fixes text for you), version {__version__}" 53 | ) 54 | parser.add_argument( 55 | "filename", 56 | default="-", 57 | nargs="?", 58 | help="The file whose Unicode is to be fixed. Defaults to -, meaning standard input.", 59 | ) 60 | parser.add_argument( 61 | "-o", 62 | "--output", 63 | type=str, 64 | default="-", 65 | help="The file to output to. Defaults to -, meaning standard output.", 66 | ) 67 | parser.add_argument( 68 | "-g", 69 | "--guess", 70 | action="store_true", 71 | help="Ask ftfy to guess the encoding of your input. This is risky. Overrides -e.", 72 | ) 73 | parser.add_argument( 74 | "-e", 75 | "--encoding", 76 | type=str, 77 | default="utf-8", 78 | help="The encoding of the input. Defaults to UTF-8.", 79 | ) 80 | parser.add_argument( 81 | "-n", 82 | "--normalization", 83 | type=str, 84 | default="NFC", 85 | help='The normalization of Unicode to apply. Defaults to NFC. Can be "none".', 86 | ) 87 | parser.add_argument( 88 | "--preserve-entities", 89 | action="store_true", 90 | help="Leave HTML entities as they are. The default " 91 | "is to decode them, as long as no HTML tags have appeared in the file.", 92 | ) 93 | 94 | args = parser.parse_args() 95 | 96 | encoding = args.encoding 97 | if args.guess: 98 | encoding = None 99 | 100 | if args.filename == "-": 101 | # Get a standard input stream made of bytes, so we can decode it as 102 | # whatever encoding is necessary. 103 | file = sys.stdin.buffer 104 | else: 105 | file = Path(args.filename).open("rb") 106 | 107 | if args.output == "-": 108 | outfile = sys.stdout 109 | else: 110 | if os.path.realpath(args.output) == os.path.realpath(args.filename): 111 | sys.stderr.write(SAME_FILE_ERROR_TEXT) 112 | sys.exit(1) 113 | outfile = Path(args.output).open("w", encoding="utf-8") 114 | 115 | normalization = args.normalization 116 | if normalization.lower() == "none": 117 | normalization = None 118 | 119 | unescape_html: Union[str, bool] 120 | unescape_html = False if args.preserve_entities else "auto" 121 | 122 | config = TextFixerConfig(unescape_html=unescape_html, normalization=normalization) 123 | 124 | try: 125 | for line in fix_file(file, encoding=encoding, config=config): 126 | try: 127 | outfile.write(line) 128 | except UnicodeEncodeError: 129 | if sys.platform == "win32": 130 | sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS) 131 | else: 132 | sys.stderr.write(ENCODE_ERROR_TEXT_UNIX) 133 | sys.exit(1) 134 | except UnicodeDecodeError as err: 135 | sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err)) 136 | sys.exit(1) 137 | 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /ftfy/fixes.py: -------------------------------------------------------------------------------- 1 | """ 2 | The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text` 3 | can perform, and provides the functions that are named in "explanations" 4 | such as the output of :func:`ftfy.fix_and_explain`. 
5 | 6 | Two of these functions are particularly useful on their own, as more robust 7 | versions of functions in the Python standard library: 8 | 9 | - :func:`ftfy.fixes.decode_escapes` 10 | - :func:`ftfy.fixes.unescape_html` 11 | """ 12 | 13 | import codecs 14 | import html 15 | import re 16 | import warnings 17 | from re import Match 18 | from typing import Any 19 | 20 | import ftfy 21 | from ftfy.badness import is_bad 22 | from ftfy.chardata import ( 23 | ALTERED_UTF8_RE, 24 | C1_CONTROL_RE, 25 | CONTROL_CHARS, 26 | DOUBLE_QUOTE_RE, 27 | HTML_ENTITIES, 28 | HTML_ENTITY_RE, 29 | LIGATURES, 30 | LOSSY_UTF8_RE, 31 | SINGLE_QUOTE_RE, 32 | UTF8_DETECTOR_RE, 33 | WIDTH_MAP, 34 | ) 35 | 36 | 37 | def fix_encoding_and_explain(text: str) -> Any: 38 | """ 39 | Deprecated copy of `ftfy.fix_encoding_and_explain()`. 40 | """ 41 | warnings.warn( 42 | "`fix_encoding_and_explain()` has moved to the main module of ftfy.", 43 | DeprecationWarning, 44 | stacklevel=2, 45 | ) 46 | return ftfy.fix_encoding_and_explain(text) 47 | 48 | 49 | def fix_encoding(text: str) -> str: 50 | """ 51 | Deprecated copy of `ftfy.fix_encoding()`. 52 | """ 53 | warnings.warn( 54 | "`fix_encoding()` has moved to the main module of ftfy.", 55 | DeprecationWarning, 56 | stacklevel=2, 57 | ) 58 | return ftfy.fix_encoding(text) 59 | 60 | 61 | def apply_plan(text: str, plan: list[tuple[str, str]]) -> str: 62 | """ 63 | Deprecated copy of `ftfy.apply_plan()`. 64 | """ 65 | warnings.warn( 66 | "`apply_plan()` has moved to the main module of ftfy.", 67 | DeprecationWarning, 68 | stacklevel=2, 69 | ) 70 | return ftfy.apply_plan(text, plan) 71 | 72 | 73 | def _unescape_fixup(match: Match[str]) -> str: 74 | """ 75 | Replace one matched HTML entity with the character it represents, 76 | if possible. 77 | """ 78 | text = match.group(0) 79 | if text in HTML_ENTITIES: 80 | return HTML_ENTITIES[text] 81 | elif text.startswith("&#"): 82 | unescaped: str = html.unescape(text) 83 | 84 | # If html.unescape only decoded part of the string, that's not what 85 | # we want. The semicolon should be consumed. 86 | if ";" in unescaped: 87 | return text 88 | else: 89 | return unescaped 90 | else: 91 | return text 92 | 93 | 94 | def unescape_html(text: str) -> str: 95 | """ 96 | Decode HTML entities and character references, including some nonstandard 97 | ones written in all-caps. 98 | 99 | Python has a built-in called `html.unescape` that can decode HTML escapes, 100 | including a bunch of messy edge cases such as decoding escapes without 101 | semicolons such as "&amp". 102 | 103 | If you know you've got HTML-escaped text, applying `html.unescape` is the 104 | right way to convert it to plain text. But in ambiguous situations, that 105 | would create false positives. For example, the informally written text 106 | "this&not that" should not automatically be decoded as "this¬ that". 107 | 108 | In this function, we decode the escape sequences that appear in the 109 | `html.entities.html5` dictionary, as long as they are the unambiguous ones 110 | that end in semicolons. 111 | 112 | We also decode all-caps versions of Latin letters and common symbols. 113 | If a database contains the name 'P&EACUTE;REZ', we can read that and intuit 114 | that it was supposed to say 'PÉREZ'. This is limited to a smaller set of 115 | entities, because there are many instances where entity names are 116 | case-sensitive in complicated ways.
117 | 118 | >>> unescape_html('&lt;tag&gt;') 119 | '<tag>' 120 | 121 | >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock') 122 | '𝒥ohn ℋancock' 123 | 124 | >>> unescape_html('&checkmark;') 125 | '✓' 126 | 127 | >>> unescape_html('P&eacute;rez') 128 | 'Pérez' 129 | 130 | >>> unescape_html('P&EACUTE;REZ') 131 | 'PÉREZ' 132 | 133 | >>> unescape_html('BUNDESSTRA&SZLIG;E') 134 | 'BUNDESSTRASSE' 135 | 136 | >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;') 137 | 'ñ Ñ Ñ &nTILDE;' 138 | """ 139 | return HTML_ENTITY_RE.sub(_unescape_fixup, text) 140 | 141 | 142 | ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])") 143 | 144 | 145 | def remove_terminal_escapes(text: str) -> str: 146 | r""" 147 | Strip out "ANSI" terminal escape sequences, such as those that produce 148 | colored text on Unix. 149 | 150 | >>> print(remove_terminal_escapes( 151 | ... "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m" 152 | ... )) 153 | I'm blue, da ba dee da ba doo... 154 | """ 155 | return ANSI_RE.sub("", text) 156 | 157 | 158 | def uncurl_quotes(text: str) -> str: 159 | r""" 160 | Replace curly quotation marks with straight equivalents. 161 | 162 | >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d')) 163 | "here's a test" 164 | """ 165 | return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text)) 166 | 167 | 168 | def fix_latin_ligatures(text: str) -> str: 169 | """ 170 | Replace single-character ligatures of Latin letters, such as 'ﬁ', with the 171 | characters that they contain, as in 'fi'. Latin ligatures are usually not 172 | intended in text strings (though they're lovely in *rendered* text). If 173 | you have such a ligature in your string, it is probably a result of a 174 | copy-and-paste glitch. 175 | 176 | We leave ligatures in other scripts alone to be safe. They may be intended, 177 | and removing them may lose information. If you want to take apart nearly 178 | all ligatures, use NFKC normalization. 179 | 180 | >>> print(fix_latin_ligatures("ﬂuffiest")) 181 | fluffiest 182 | """ 183 | return text.translate(LIGATURES) 184 | 185 | 186 | def fix_character_width(text: str) -> str: 187 | """ 188 | The ASCII characters, katakana, and Hangul characters have alternate 189 | "halfwidth" or "fullwidth" forms that help text line up in a grid. 190 | 191 | If you don't need these width properties, you probably want to replace 192 | these characters with their standard form, which is what this function 193 | does. 194 | 195 | Note that this replaces the ideographic space, U+3000, with the ASCII 196 | space, U+20. 197 | 198 | >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ")) 199 | LOUD NOISES 200 | >>> print(fix_character_width("Ｕターン")) # this means "U-turn" 201 | Uターン 202 | """ 203 | return text.translate(WIDTH_MAP) 204 | 205 | 206 | def fix_line_breaks(text: str) -> str: 207 | r""" 208 | Convert all line breaks to Unix style. 209 | 210 | This will convert the following sequences into the standard \\n 211 | line break: 212 | 213 | - CRLF (\\r\\n), used on Windows and in some communication protocols 214 | - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided 215 | software such as Microsoft Office for Mac 216 | - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by 217 | Unicode and used to sow confusion and discord 218 | - NEXT LINE (\\x85), a C1 control character that is certainly not what you 219 | meant 220 | 221 | The NEXT LINE character is a bit of an odd case, because it 222 | usually won't show up if `fix_encoding` is also being run. 223 | \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.
224 | 225 | >>> print(fix_line_breaks( 226 | ... "This string is made of two things:\u2029" 227 | ... "1. Unicode\u2028" 228 | ... "2. Spite" 229 | ... )) 230 | This string is made of two things: 231 | 1. Unicode 232 | 2. Spite 233 | 234 | For further testing and examples, let's define a function to make sure 235 | we can see the control characters in their escaped form: 236 | 237 | >>> def eprint(text): 238 | ... print(text.encode('unicode-escape').decode('ascii')) 239 | 240 | >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi.")) 241 | Content-type: text/plain\n\nHi. 242 | 243 | >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users")) 244 | This is how Microsoft \n trolls Mac users 245 | 246 | >>> eprint(fix_line_breaks("What is this \x85 I don't even")) 247 | What is this \n I don't even 248 | """ 249 | return ( 250 | text.replace("\r\n", "\n") 251 | .replace("\r", "\n") 252 | .replace("\u2028", "\n") 253 | .replace("\u2029", "\n") 254 | .replace("\u0085", "\n") 255 | ) 256 | 257 | 258 | SURROGATE_RE = re.compile("[\ud800-\udfff]") 259 | SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]") 260 | 261 | 262 | def convert_surrogate_pair(match: Match[str]) -> str: 263 | """ 264 | Convert a surrogate pair to the single codepoint it represents. 265 | 266 | This implements the formula described at: 267 | http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates 268 | """ 269 | pair = match.group(0) 270 | codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00) 271 | return chr(codept) 272 | 273 | 274 | def fix_surrogates(text: str) -> str: 275 | """ 276 | Replace 16-bit surrogate codepoints with the characters they represent 277 | (when properly paired), or with \ufffd otherwise. 278 | 279 | >>> high_surrogate = chr(0xd83d) 280 | >>> low_surrogate = chr(0xdca9) 281 | >>> print(fix_surrogates(high_surrogate + low_surrogate)) 282 | 💩 283 | >>> print(fix_surrogates(low_surrogate + high_surrogate)) 284 | �� 285 | 286 | The above doctest had to be very carefully written, because even putting 287 | the Unicode escapes of the surrogates in the docstring was causing 288 | various tools to fail, which I think just goes to show why this fixer is 289 | necessary. 290 | """ 291 | if SURROGATE_RE.search(text): 292 | text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text) 293 | text = SURROGATE_RE.sub("\ufffd", text) 294 | return text 295 | 296 | 297 | def remove_control_chars(text: str) -> str: 298 | """ 299 | Remove various control characters that you probably didn't intend to be in 300 | your text. Many of these characters appear in the table of "Characters not 301 | suitable for use with markup" at 302 | http://www.unicode.org/reports/tr20/tr20-9.html. 
303 | 304 | This includes: 305 | 306 | - ASCII control characters, except for the important whitespace characters 307 | (U+00 to U+08, U+0B, U+0E to U+1F, U+7F) 308 | - Deprecated Arabic control characters (U+206A to U+206F) 309 | - Interlinear annotation characters (U+FFF9 to U+FFFB) 310 | - The Object Replacement Character (U+FFFC) 311 | - The byte order mark (U+FEFF) 312 | 313 | However, these similar characters are left alone: 314 | 315 | - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D, 316 | U+2028, and U+2029) 317 | - C1 control characters (U+80 to U+9F) -- even though they are basically 318 | never used intentionally, they are important clues about what mojibake 319 | has happened 320 | - Control characters that affect glyph rendering, such as joiners and 321 | right-to-left marks (U+200C to U+200F, U+202A to U+202E) 322 | - Musical notation control characters (U+1D173 to U+1D17A) because wow if 323 | you're using those you probably have a good reason 324 | - Tag characters, because they are now used in emoji sequences such as 325 | "Flag of Wales" 326 | """ 327 | return text.translate(CONTROL_CHARS) 328 | 329 | 330 | def remove_bom(text: str) -> str: 331 | r""" 332 | Remove a byte-order mark that was accidentally decoded as if it were part 333 | of the text. 334 | 335 | >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?")) 336 | Where do you want to go today? 337 | """ 338 | return text.lstrip(chr(0xFEFF)) 339 | 340 | 341 | # Define a regex to match valid escape sequences in Python string literals. 342 | ESCAPE_SEQUENCE_RE = re.compile( 343 | r""" 344 | ( \\U........ # 8-digit hex escapes 345 | | \\u.... # 4-digit hex escapes 346 | | \\x.. # 2-digit hex escapes 347 | | \\[0-7]{1,3} # Octal escapes 348 | | \\N\{[^}]+\} # Unicode characters by name 349 | | \\[\\'"abfnrtv] # Single-character escapes 350 | )""", 351 | re.UNICODE | re.VERBOSE, 352 | ) 353 | 354 | 355 | def decode_escapes(text: str) -> str: 356 | r""" 357 | Decode backslashed escape sequences, including \\x, \\u, and \\U character 358 | references, even in the presence of other Unicode. 359 | 360 | This function has to be called specifically. It's not run automatically by 361 | ftfy, because escaped text is not necessarily a mistake, and there is no 362 | way to distinguish when it is. 363 | 364 | This is what Python's "string-escape" and "unicode-escape" codecs were 365 | meant to do, but in contrast, this actually works. It will decode the 366 | string exactly the same way that the Python interpreter decodes its string 367 | literals. 368 | 369 | >>> factoid = '\\u20a1 is the currency symbol for the colón.' 370 | >>> print(factoid[1:]) 371 | u20a1 is the currency symbol for the colón. 372 | >>> print(decode_escapes(factoid)) 373 | ₡ is the currency symbol for the colón. 374 | 375 | Even though Python itself can read string literals with a combination of 376 | escapes and literal Unicode -- you're looking at one right now -- the 377 | "unicode-escape" codec doesn't work on literal Unicode. (See 378 | http://stackoverflow.com/a/24519338/773754 for more details.) 379 | 380 | Instead, this function searches for just the parts of a string that 381 | represent escape sequences, and decodes them, leaving the rest alone. All 382 | valid escape sequences are made of ASCII characters, and this allows 383 | "unicode-escape" to work correctly. 384 | """ 385 | 386 | def decode_match(match: Match[str]) -> str: 387 | "Given a regex match, decode the escape sequence it contains." 
388 | return codecs.decode(match.group(0), "unicode-escape") 389 | 390 | return ESCAPE_SEQUENCE_RE.sub(decode_match, text) 391 | 392 | 393 | # This regex implements an exception to restore_byte_a0, so we can decode the 394 | # very common mojibake of (for example) "à la mode" as "à la mode", not "àla 395 | # mode". 396 | # 397 | # If byte C3 appears with a single space after it -- most commonly this shows 398 | # up as " à " appearing as an entire word -- we'll insert \xa0 while keeping 399 | # the space. Without this change, we would decode "à" as the start of the next 400 | # word, such as "àla". It's almost always intended to be a separate word, as in 401 | # "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces 402 | # get coalesced into "à la". 403 | # 404 | # We make exceptions for the Portuguese words "às", "àquele", "àquela", 405 | # "àquilo" and their plurals -- these are contractions of, for example, "a 406 | # aquele" and are very common. Note that the final letter is important to 407 | # distinguish this case from French "à quel point". 408 | # 409 | # Other instances in Portuguese, such as "àfrica", seem to be typos (intended 410 | # to be "África" with the accent in the other direction). 411 | # 412 | # Unfortunately, "à" is a common letter in Catalan, and mojibake of words that 413 | # contain it will end up with inserted spaces. We can't do the right thing with 414 | # every word. The cost is that the mojibake text "fà cil" will be interpreted as 415 | # "fà cil", not "fàcil". 416 | A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )") 417 | 418 | 419 | def restore_byte_a0(byts: bytes) -> bytes: 420 | """ 421 | Some mojibake has been additionally altered by a process that said "hmm, 422 | byte A0, that's basically a space!" and replaced it with an ASCII space. 423 | When the A0 is part of a sequence that we intend to decode as UTF-8, 424 | changing byte A0 to 20 would make it fail to decode. 425 | 426 | This process finds sequences that would convincingly decode as UTF-8 if 427 | byte 20 were changed to A0, and puts back the A0. For the purpose of 428 | deciding whether this is a good idea, this step gets a cost of twice 429 | the number of bytes that are changed. 430 | 431 | This is used as a step within `fix_encoding`. 432 | """ 433 | byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts) 434 | 435 | def replacement(match: Match[bytes]) -> bytes: 436 | "The function to apply when this regex matches." 437 | return match.group(0).replace(b"\x20", b"\xa0") 438 | 439 | return ALTERED_UTF8_RE.sub(replacement, byts) 440 | 441 | 442 | def replace_lossy_sequences(byts: bytes) -> bytes: 443 | """ 444 | This function identifies sequences where information has been lost in 445 | a "sloppy" codec, indicated by byte 1A, and if they would otherwise look 446 | like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD. 447 | 448 | A further explanation: 449 | 450 | ftfy can now fix text in a few cases that it would previously fix 451 | incompletely, because of the fact that it can't successfully apply the fix 452 | to the entire string. A very common case of this is when characters have 453 | been erroneously decoded as windows-1252, but instead of the "sloppy" 454 | windows-1252 that passes through unassigned bytes, the unassigned bytes get 455 | turned into U+FFFD (�), so we can't tell what they were. 456 | 457 | This most commonly happens with curly quotation marks that appear 458 | ``“ like this â€�``. 
459 | 460 | We can do better by building on ftfy's "sloppy codecs" to let them handle 461 | less-sloppy but more-lossy text. When they encounter the character ``�``, 462 | instead of refusing to encode it, they encode it as byte 1A -- an 463 | ASCII control code called SUBSTITUTE that once was meant for about the same 464 | purpose. We can then apply a fixer that looks for UTF-8 sequences where 465 | some continuation bytes have been replaced by byte 1A, and decode the whole 466 | sequence as �; if that doesn't work, it'll just turn the byte back into � 467 | itself. 468 | 469 | As a result, the above text ``“ like this â€�`` will decode as 470 | ``“ like this �``. 471 | 472 | If U+1A was actually in the original string, then the sloppy codecs will 473 | not be used, and this function will not be run, so your weird control 474 | character will be left alone but wacky fixes like this won't be possible. 475 | 476 | This is used as a transcoder within `fix_encoding`. 477 | """ 478 | return LOSSY_UTF8_RE.sub("\ufffd".encode(), byts) 479 | 480 | 481 | def decode_inconsistent_utf8(text: str) -> str: 482 | """ 483 | Sometimes, text from one encoding ends up embedded within text from a 484 | different one. This is common enough that we need to be able to fix it. 485 | 486 | This is used as a transcoder within `fix_encoding`. 487 | """ 488 | 489 | def fix_embedded_mojibake(match: Match[str]) -> str: 490 | substr = match.group(0) 491 | 492 | # Require the match to be shorter, so that this doesn't recurse infinitely 493 | if len(substr) < len(text) and is_bad(substr): 494 | return ftfy.fix_encoding(substr) 495 | else: 496 | return substr 497 | 498 | return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text) 499 | 500 | 501 | def _c1_fixer(match: Match[str]) -> str: 502 | return match.group(0).encode("latin-1").decode("sloppy-windows-1252") 503 | 504 | 505 | def fix_c1_controls(text: str) -> str: 506 | """ 507 | If text still contains C1 control characters, treat them as their 508 | Windows-1252 equivalents. This matches what Web browsers do. 509 | """ 510 | return C1_CONTROL_RE.sub(_c1_fixer, text) 511 | -------------------------------------------------------------------------------- /ftfy/formatting.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides functions for justifying Unicode text in a monospaced 3 | display such as a terminal. 4 | 5 | We used to have our own implementation here, but now we mostly rely on 6 | the 'wcwidth' library. 7 | """ 8 | 9 | from unicodedata import normalize 10 | 11 | from wcwidth import wcswidth, wcwidth 12 | 13 | from ftfy.fixes import remove_terminal_escapes 14 | 15 | 16 | def character_width(char: str) -> int: 17 | r""" 18 | Determine the width that a character is likely to be displayed as in 19 | a monospaced terminal. The width for a printable character will 20 | always be 0, 1, or 2. 21 | 22 | Nonprintable or control characters will return -1, a convention that comes 23 | from wcwidth. 24 | 25 | >>> character_width('車') 26 | 2 27 | >>> character_width('A') 28 | 1 29 | >>> character_width('\N{ZERO WIDTH JOINER}') 30 | 0 31 | >>> character_width('\n') 32 | -1 33 | """ 34 | return int(wcwidth(char)) 35 | 36 | 37 | def monospaced_width(text: str) -> int: 38 | r""" 39 | Return the number of character cells that this string is likely to occupy 40 | when displayed in a monospaced, modern, Unicode-aware terminal emulator. 41 | We refer to this as the "display width" of the string. 
42 | 43 | This can be useful for formatting text that may contain non-spacing 44 | characters, or CJK characters that take up two character cells. 45 | 46 | Returns -1 if the string contains a non-printable or control character. 47 | 48 | >>> monospaced_width('ちゃぶ台返し') 49 | 12 50 | >>> len('ちゃぶ台返し') 51 | 6 52 | >>> monospaced_width('owl\N{SOFT HYPHEN}flavored') 53 | 11 54 | >>> monospaced_width('example\x80') 55 | -1 56 | 57 | A more complex example: The Korean word 'ibnida' can be written with 3 58 | pre-composed characters or 7 jamo. Either way, it *looks* the same and 59 | takes up 6 character cells. 60 | 61 | >>> monospaced_width('입니다') 62 | 6 63 | >>> monospaced_width('\u110b\u1175\u11b8\u1102\u1175\u1103\u1161') 64 | 6 65 | 66 | The word "blue" with terminal escapes to make it blue still takes up only 67 | 4 characters, when shown as intended. 68 | >>> monospaced_width('\x1b[34mblue\x1b[m') 69 | 4 70 | """ 71 | # NFC-normalize the text first, so that we don't need special cases for 72 | # Hangul jamo. 73 | # 74 | # Remove terminal escapes before calculating width, because if they are 75 | # displayed as intended, they will have zero width. 76 | return int(wcswidth(remove_terminal_escapes(normalize("NFC", text)))) 77 | 78 | 79 | def display_ljust(text: str, width: int, fillchar: str = " ") -> str: 80 | """ 81 | Return `text` left-justified in a Unicode string whose display width, 82 | in a monospaced terminal, should be at least `width` character cells. 83 | The rest of the string will be padded with `fillchar`, which must be 84 | a width-1 character. 85 | 86 | "Left" here means toward the beginning of the string, which may actually 87 | appear on the right in an RTL context. This is similar to the use of the 88 | word "left" in "left parenthesis". 89 | 90 | >>> lines = ['Table flip', '(╯°□°)╯︵ ┻━┻', 'ちゃぶ台返し'] 91 | >>> for line in lines: 92 | ... print(display_ljust(line, 20, '▒')) 93 | Table flip▒▒▒▒▒▒▒▒▒▒ 94 | (╯°□°)╯︵ ┻━┻▒▒▒▒▒▒▒ 95 | ちゃぶ台返し▒▒▒▒▒▒▒▒ 96 | 97 | This example, and the similar ones that follow, should come out justified 98 | correctly when viewed in a monospaced terminal. It will probably not look 99 | correct if you're viewing this code or documentation in a Web browser. 100 | """ 101 | if character_width(fillchar) != 1: 102 | msg = "The padding character must have display width 1" 103 | raise ValueError(msg) 104 | 105 | text_width = monospaced_width(text) 106 | if text_width == -1: 107 | # There's a control character here, so just don't add padding 108 | return text 109 | 110 | padding = max(0, width - text_width) 111 | return text + fillchar * padding 112 | 113 | 114 | def display_rjust(text: str, width: int, fillchar: str = " ") -> str: 115 | """ 116 | Return `text` right-justified in a Unicode string whose display width, 117 | in a monospaced terminal, should be at least `width` character cells. 118 | The rest of the string will be padded with `fillchar`, which must be 119 | a width-1 character. 120 | 121 | "Right" here means toward the end of the string, which may actually be on 122 | the left in an RTL context. This is similar to the use of the word "right" 123 | in "right parenthesis". 124 | 125 | >>> lines = ['Table flip', '(╯°□°)╯︵ ┻━┻', 'ちゃぶ台返し'] 126 | >>> for line in lines: 127 | ... 
print(display_rjust(line, 20, '▒')) 128 | ▒▒▒▒▒▒▒▒▒▒Table flip 129 | ▒▒▒▒▒▒▒(╯°□°)╯︵ ┻━┻ 130 | ▒▒▒▒▒▒▒▒ちゃぶ台返し 131 | """ 132 | if character_width(fillchar) != 1: 133 | msg = "The padding character must have display width 1" 134 | raise ValueError(msg) 135 | 136 | text_width = monospaced_width(text) 137 | if text_width == -1: 138 | return text 139 | 140 | padding = max(0, width - text_width) 141 | return fillchar * padding + text 142 | 143 | 144 | def display_center(text: str, width: int, fillchar: str = " ") -> str: 145 | """ 146 | Return `text` centered in a Unicode string whose display width, in a 147 | monospaced terminal, should be at least `width` character cells. The rest 148 | of the string will be padded with `fillchar`, which must be a width-1 149 | character. 150 | 151 | >>> lines = ['Table flip', '(╯°□°)╯︵ ┻━┻', 'ちゃぶ台返し'] 152 | >>> for line in lines: 153 | ... print(display_center(line, 20, '▒')) 154 | ▒▒▒▒▒Table flip▒▒▒▒▒ 155 | ▒▒▒(╯°□°)╯︵ ┻━┻▒▒▒▒ 156 | ▒▒▒▒ちゃぶ台返し▒▒▒▒ 157 | """ 158 | if character_width(fillchar) != 1: 159 | msg = "The padding character must have display width 1" 160 | raise ValueError(msg) 161 | 162 | text_width = monospaced_width(text) 163 | if text_width == -1: 164 | return text 165 | 166 | padding = max(0, width - text_width) 167 | left_padding = padding // 2 168 | right_padding = padding - left_padding 169 | return fillchar * left_padding + text + fillchar * right_padding 170 | -------------------------------------------------------------------------------- /ftfy/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/python-ftfy/74dd0452b48286a3770013b3a02755313bd5575e/ftfy/py.typed -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | files = ftfy 3 | check_untyped_defs = True 4 | disallow_any_generics = True 5 | disallow_incomplete_defs = False 6 | disallow_subclassing_any = True 7 | disallow_untyped_calls = False 8 | disallow_untyped_decorators = False 9 | disallow_untyped_defs = False 10 | no_implicit_optional = True 11 | no_implicit_reexport = False 12 | strict_equality = True 13 | warn_redundant_casts = True 14 | warn_return_any = True 15 | warn_unused_configs = True 16 | warn_unused_ignores = True 17 | python_version = 3.9 18 | 19 | [mypy-wcwidth] 20 | ignore_missing_imports = True 21 | 22 | -------------------------------------------------------------------------------- /notebook/excel-export.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/python-ftfy/74dd0452b48286a3770013b3a02755313bd5575e/notebook/excel-export.png -------------------------------------------------------------------------------- /notes/mysteries.txt: -------------------------------------------------------------------------------- 1 | on https://www.nipette.com/article-6358031.html, a comment is signed 'MÃ\x83©Ã\x82¬Ã\x82¡nie'. 2 | This happens to be triple-UTF-8 for 'M鬡nie', but that's probably not the name they meant. 3 | 4 | What exactly did https://www.horoskopy-horoskop.cz/clanek/431-numerologicky-vyznam-jmena-jaromir 5 | mean when they said 'TadeÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂáÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂá' ? 6 | 7 | https://mtlurb.com/tags/arbres/ 8 | 'montrã©al' probably isn't in cp850, but what is it? 
9 | 10 | 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ftfy" 3 | version = "6.3.1" 4 | description = "Fixes mojibake and other problems with Unicode, after the fact" 5 | authors = [{ name = "Robyn Speer", email = "rspeer@arborelia.net" }] 6 | license = { text = "Apache-2.0" } 7 | readme = "README.md" 8 | dependencies = ["wcwidth"] 9 | requires-python = ">=3.9" 10 | 11 | [project.scripts] 12 | ftfy = "ftfy.cli:main" 13 | 14 | [project.urls] 15 | Homepage = "https://ftfy.readthedocs.io/en/latest/" 16 | Documentation = "https://ftfy.readthedocs.io/en/latest/" 17 | Repository = "https://github.com/rspeer/python-ftfy" 18 | Issues = "https://github.com/rspeer/python-ftfy/issues/" 19 | Changelog = "https://github.com/rspeer/python-ftfy/blob/main/CHANGELOG.md" 20 | Blog = "https://posts.arborelia.net" 21 | 22 | [build-system] 23 | requires = ["hatchling"] 24 | build-backend = "hatchling.build" 25 | 26 | [tool.hatch.build.targets.sdist] 27 | exclude = ["^.github/", "scripts/", ".readthedocs.yaml", "notes/", "notebook/"] 28 | 29 | [tool.uv] 30 | dev-dependencies = [ 31 | "Sphinx >=7, <8", 32 | "furo >= 2024.7.18", 33 | "pytest >= 8.3.2, < 9", 34 | "ruff", 35 | ] 36 | 37 | [tool.ruff] 38 | exclude = ["badness.py", "notebook"] 39 | line-length = 100 40 | target-version = "py39" 41 | 42 | [tool.ruff.lint] 43 | select = ["B", "F", "I", "N", "ANN", "UP", "RUF", "C4", "EM", "PIE", "RSE", "TCH", "PTH", "FURB"] 44 | ignore = [ 45 | "ANN101", 46 | "ANN401", 47 | "RUF001", # complains about Unicode characters that belong in my docstrings 48 | "RUF002", # complains about Unicode characters that belong in my docstrings 49 | "PIE808", # explicitly starting ranges at 0 sometimes helps with readability 50 | ] 51 | 52 | [tool.ruff.lint.per-file-ignores] 53 | "tests/*" = ["ANN"] 54 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-modules --ignore=setup.py --ignore=specimens --ignore=scripts --ignore=docs 3 | -------------------------------------------------------------------------------- /scripts/char_data_table.py: -------------------------------------------------------------------------------- 1 | """ 2 | Used to regenerate character tables in ftfy/chardata.py with explanatory comments. 
3 | """ 4 | 5 | import unicodedata 6 | from dataclasses import dataclass 7 | 8 | from ftfy.chardata import UTF8_CLUES 9 | 10 | 11 | @dataclass 12 | class CharData: 13 | name: str 14 | codept: int 15 | encodings: list[tuple[str, int]] 16 | 17 | def sort_key(self) -> tuple[int, str, int]: 18 | if self.name.startswith("LATIN "): 19 | return (0, self.name, self.codept) 20 | return (1, "", self.codept) 21 | 22 | 23 | SAFE_ENCODINGS = [ 24 | "latin-1", 25 | "windows-1252", 26 | "windows-1251", 27 | "windows-1250", 28 | "windows-1253", 29 | "windows-1254", 30 | "windows-1257", 31 | ] 32 | 33 | 34 | def show_char_table(chars: str, byte_min: int = 0, byte_max: int = 0xFF) -> None: 35 | char_data: list[CharData] = [] 36 | for char in chars: 37 | name = unicodedata.name(char, "") 38 | codept = ord(char) 39 | encodings: list[tuple[str, int]] = [] 40 | for encoding in SAFE_ENCODINGS: 41 | try: 42 | encoded: bytes = char.encode(encoding) 43 | byte: int = encoded[0] 44 | encodings.append((encoding, byte)) 45 | except UnicodeEncodeError: 46 | pass 47 | if encodings: 48 | char_data.append(CharData(name=name, codept=codept, encodings=encodings)) 49 | else: 50 | print(f"No relevant encoding for {codept=}, {name=}") 51 | char_data.sort(key=CharData.sort_key) 52 | for cd in char_data: 53 | encoding_info: list[str] = [] 54 | for encoding, byte in cd.encodings: 55 | if byte_min <= byte <= byte_max: 56 | info_str = f"{encoding}:{byte:X}" 57 | encoding_info.append(info_str) 58 | encoding_explanation = encoding_info[0] if encoding_info else "???" 59 | print(f' "\\N{{{cd.name}}}" # {encoding_explanation}') 60 | 61 | 62 | def run() -> None: 63 | print("# utf8_first_of_2") 64 | show_char_table(UTF8_CLUES["utf8_first_of_2"], 0xC2, 0xDF) 65 | print("# utf8_first_of_3") 66 | show_char_table(UTF8_CLUES["utf8_first_of_3"], 0xE0, 0xEF) 67 | print("# utf8_first_of_4") 68 | show_char_table(UTF8_CLUES["utf8_first_of_4"], 0xF0, 0xF3) 69 | print("# utf8_continuation") 70 | print(r' "\x80-\xbf"') 71 | show_char_table(UTF8_CLUES["utf8_continuation"][3:], 0x80, 0xBF) 72 | print("# utf8_continuation_strict") 73 | print(r' "\x80-\xbf"') 74 | show_char_table(UTF8_CLUES["utf8_continuation_strict"][3:], 0x80, 0xBF) 75 | 76 | 77 | if __name__ == "__main__": 78 | run() 79 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rspeer/python-ftfy/74dd0452b48286a3770013b3a02755313bd5575e/tests/__init__.py -------------------------------------------------------------------------------- /tests/face.txt: -------------------------------------------------------------------------------- 1 | â”’(⌣˛⌣)┎ 2 | -------------------------------------------------------------------------------- /tests/test-cases/README.md: -------------------------------------------------------------------------------- 1 | # ftfy test cases 2 | 3 | This directory contains JSON files with test cases for ftfy. Many of them are real mojibake found in the wild, such as by listening to the Twitter firehose (when that existed), searching through the OSCAR web crawl, or in issue reports from users. 4 | 5 | Cases labeled "synthetic" were not found in the wild, but were instead constructed to test a particular edge case. 6 | 7 | Cases labeled "negative" are not mojibake but look like they could be. We're testing that ftfy does not alter the text (except for its usual processing such as un-curling quotes).
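To make these expectations concrete, here is a rough sketch of how a single case can be checked by hand. The `case` dict is a made-up example in the shape described under "Structure of a test case" below; the real test driver lives in `tests/test_examples_in_json.py`.

```python
import ftfy

# A hypothetical "pass" case, shaped like the entries in these JSON files
case = {
    "label": "UTF-8 mojibake read as Windows-1252",
    "original": "cafÃ©",
    "fixed": "café",
    "expect": "pass",
}

if case["expect"] == "pass":
    # `fixed` is what full fixing should produce
    assert ftfy.fix_text(case["original"]) == case["fixed"]
    # `fixed-encoding` falls back to `fixed` when it isn't specified
    expected_encoding_fix = case.get("fixed-encoding", case["fixed"])
    assert ftfy.fix_encoding(case["original"]) == expected_encoding_fix
```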
8 | 9 | `known-failures.json` contains cases that we would do better at with an improved heuristic. Most of these are false negatives, where ftfy does not figure out how to fix the text. ftfy aims to have no false positives, but there is one synthetic false positive in `known-failures.json`. 10 | 11 | ## Structure of a test case 12 | 13 | A test case contains the following fields: 14 | 15 | - `label`: A description of the test case, shown when pytest runs in verbose mode. 16 | - `comment`: Further details on the test case because JSON doesn't have comments. 17 | - `original`: The text to run through ftfy. 18 | - `fixed-encoding` (optional): the expected result of `ftfy.fix_encoding(original)`. If unspecified, uses the value from `fixed`. 19 | - `fixed`: the expected result of `ftfy.fix_text(original)`. 20 | - `expect`: "pass" for test cases that should pass, or "fail" for known failures. -------------------------------------------------------------------------------- /tests/test-cases/in-the-wild.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Low-codepoint emoji", 4 | "comment": "From the ancient era before widespread emoji support on Twitter", 5 | "original": "He's Justinâ\u009d¤", 6 | "fixed": "He's Justin❤", 7 | "expect": "pass" 8 | }, 9 | { 10 | "label": "UTF-8 / MacRoman mix-up about smurfs", 11 | "original": "Le Schtroumpf Docteur conseille g√¢teaux et baies schtroumpfantes pour un r√©gime √©quilibr√©.", 12 | "fixed": "Le Schtroumpf Docteur conseille gâteaux et baies schtroumpfantes pour un régime équilibré.", 13 | "expect": "pass" 14 | }, 15 | { 16 | "label": "Checkmark that almost looks okay as mojibake", 17 | "original": "✔ No problems", 18 | "fixed": "✔ No problems", 19 | "expect": "pass" 20 | }, 21 | { 22 | "label": "UTF-8 / Windows-1251 Russian mixup about futbol", 23 | "original": "РґРѕСЂРѕРіРµ Р\u0098Р·-РїРѕРґ #футбол", 24 | "fixed": "дороге Из-под #футбол", 25 | "expect": "pass" 26 | }, 27 | { 28 | "label": "Latin-1 / Windows-1252 mixup in German", 29 | "original": "\u0084Handwerk bringt dich überall hin\u0093: Von der YOU bis nach Monaco", 30 | "fixed-encoding": "„Handwerk bringt dich überall hin“: Von der YOU bis nach Monaco", 31 | "fixed": "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco", 32 | "expect": "pass" 33 | }, 34 | { 35 | "label": "Latin-1 / Windows-1252 mixup of the replacement character", 36 | "original": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", 37 | "fixed": "Some comments may be republished on the website or in the newspaper � email addresses will not be published.", 38 | "expect": "pass" 39 | }, 40 | { 41 | "label": "CESU-8 / Windows-1252 emoji", 42 | "original": "Hi guys í ½í¸\u008d", 43 | "fixed": "Hi guys 😍", 44 | "expect": "pass" 45 | }, 46 | { 47 | "label": "CESU-8 / Latin-1 emoji", 48 | "original": "hihi RT username: â\u0098ºí ½í¸\u0098", 49 | "fixed": "hihi RT username: ☺😘", 50 | "expect": "pass" 51 | }, 52 | { 53 | "label": "Latin-1 / Windows-1252 mixup in Turkish", 54 | "original": "Beta Haber: Hırsızı Büyü Korkuttu", 55 | "fixed": "Beta Haber: Hırsızı Büyü Korkuttu", 56 | "expect": "pass" 57 | }, 58 | { 59 | "label": "Latin-1 / Windows-1252 mixup in İstanbul (issue #192)", 60 | "original": "İstanbul", 61 | "fixed": "İstanbul", 62 | "expect": "pass" 63 | }, 64 | { 65 | "label": "Latin-1 / Windows-1252 mixup in German (issue #188)", 66 | "original": "RUF MICH ZURÜCK", 67 | "fixed": "RUF MICH 
ZURÜCK", 68 | "expect": "pass" 69 | }, 70 | { 71 | "label": "Latin-1 / Windows-1252 mixup in Rīga (issue #192)", 72 | "original": "RÄ«ga", 73 | "fixed": "Rīga", 74 | "expect": "pass" 75 | }, 76 | { 77 | "label": "UTF-8 / Windows-1251 mixed up twice in Russian", 78 | "original": "приятности. РІСњВ¤", 79 | "fixed": "приятности. ❤", 80 | "expect": "pass" 81 | }, 82 | { 83 | "label": "UTF-8 / Windows-1252 mixed up twice in Malay", 84 | "original": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romanceâ€Â\u009d.", 85 | "fixed-encoding": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romance”.", 86 | "fixed": "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".", 87 | "expect": "pass" 88 | }, 89 | { 90 | "label": "UTF-8 / Windows-1252 mixed up twice in naming Iggy Pop", 91 | "original": "Iggy Pop (né Jim Osterberg)", 92 | "fixed": "Iggy Pop (né Jim Osterberg)", 93 | "expect": "pass" 94 | }, 95 | { 96 | "label": "Left quote is UTF-8, right quote is Latin-1, both encoded in Windows-1252", 97 | "original": "Direzione Pd, ok â\u0080\u009csenza modifiche\u0094 all'Italicum.", 98 | "fixed-encoding": "Direzione Pd, ok “senza modifiche” all'Italicum.", 99 | "fixed": "Direzione Pd, ok \"senza modifiche\" all'Italicum.", 100 | "expect": "pass" 101 | }, 102 | { 103 | "label": "UTF-8 / sloppy Windows-1252 mixed up twice in a triumphant emoticon", 104 | "original": "selamat berpuasa sob (Ã\u00a0¸‡'̀⌣'ÃŒÂ\u0081)Ã\u00a0¸‡", 105 | "fixed": "selamat berpuasa sob (ง'̀⌣'́)ง", 106 | "expect": "pass" 107 | }, 108 | { 109 | "label": "UTF-8 / Windows-1252 mixed up three times", 110 | "original": "The Mona Lisa doesn’t have eyebrows.", 111 | "fixed-encoding": "The Mona Lisa doesn’t have eyebrows.", 112 | "fixed": "The Mona Lisa doesn't have eyebrows.", 113 | "expect": "pass" 114 | }, 115 | { 116 | "label": "UTF-8 / Codepage 437 mixup in Russian", 117 | "original": "#╨┐╤Ç╨░╨▓╨╕╨╗╤î╨╜╨╛╨╡╨┐╨╕╤é╨░╨╜╨╕╨╡", 118 | "fixed": "#правильноепитание", 119 | "expect": "pass" 120 | }, 121 | { 122 | "label": "UTF-8 / Windows-1252 mixup in French", 123 | "original": "Hôtel de Police", 124 | "fixed": "Hôtel de Police", 125 | "expect": "pass" 126 | }, 127 | { 128 | "label": "UTF-8 / Windows-1250 mixup in French", 129 | "original": "Liège Avenue de l'HĂ´pital", 130 | "fixed": "Liège Avenue de l'Hôpital", 131 | "expect": "pass" 132 | }, 133 | { 134 | "label": "UTF-8 / Windows-1252 mixup in Vietnamese", 135 | "original": "Tại sao giá hạt sầu riêng lại lên giá?", 136 | "fixed": "Tại sao giá hạt sầu riêng lại lên giá?", 137 | "expect": "pass" 138 | }, 139 | { 140 | "label": "Science! Mid-word Greek letter gets fixed correctly", 141 | "original": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", 142 | "fixed": "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.", 143 | "expect": "pass" 144 | }, 145 | { 146 | "label": "For goodness' sake. We can come close to fixing this, but fail in the last step", 147 | "original": "ItÃ?¢â?¬â?¢s classic. ItÃ?¢â?¬â?¢s epic. ItÃ?¢â?¬â?¢s ELIZABETH BENNET for goodnessÃ?¢â?¬â?¢ sake!", 148 | "fixed": "It�¢��s classic. It�¢��s epic. 
It�¢��s ELIZABETH BENNET for goodness�¢�� sake!", 149 | "expect": "pass" 150 | }, 151 | { 152 | "label": "lossy UTF-8 / Windows-1250 mixup in Spanish", 153 | "original": "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂ­a", 154 | "fixed": "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía", 155 | "expect": "pass" 156 | }, 157 | { 158 | "label": "UTF-8 / sloppy Windows-1250 mixup in English", 159 | "original": "It was named „scars´ stones“ after the rock-climbers who got hurt while climbing on it.", 160 | "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", 161 | "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", 162 | "expect": "pass" 163 | }, 164 | { 165 | "label": "The same text as above, but as a UTF-8 / ISO-8859-2 mixup", 166 | "original": "It was namedÂ\u00a0â\u0080\u009escars´ stonesâ\u0080\u009c after the rock-climbers who got hurt while climbing on it.", 167 | "fixed-encoding": "It was named\u00a0„scars´ stones“ after the rock-climbers who got hurt while climbing on it.", 168 | "fixed": "It was named\u00a0\"scars´ stones\" after the rock-climbers who got hurt while climbing on it.", 169 | "expect": "pass" 170 | }, 171 | { 172 | "label": "UTF-8 / ISO-8859-2 mixup in Czech", 173 | "comment": "This says 'I've had enough of the third millennium', which is great because it involves software decisions made in the second", 174 | "original": "MĂĄm dost tĹ\u0099etĂ\u00adho tisĂ\u00adciletĂ\u00ad", 175 | "fixed": "Mám dost třetího tisíciletí", 176 | "expect": "pass" 177 | }, 178 | { 179 | "label": "UTF-8 / Windows-1252 mixup in mixed French and Arabic", 180 | "comment": "A difficult test case that can depend on the order that steps are applied", 181 | "original": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", 182 | "fixed-encoding": "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", 183 | "fixed": "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.", 184 | "expect": "pass" 185 | }, 186 | { 187 | "label": "UTF-8 / sloppy Windows-1250 mixup in Romanian", 188 | "original": "vedere Ă®nceĹŁoĹźatÄ\u0083", 189 | "fixed": "vedere înceţoşată", 190 | "expect": "pass" 191 | }, 192 | { 193 | "label": "UTF-8 / Windows-1250 mixup in Slovak", 194 | "original": "NapĂ\u00adšte nám !", 195 | "fixed": "Napíšte nám !", 196 | "expect": "pass" 197 | }, 198 | { 199 | "label": "UTF-8 / Windows-1252 mixup in Spanish", 200 | "original": "DOS AÑOS", 201 | "fixed": "DOS AÑOS", 202 | "expect": "pass" 203 | }, 204 | { 205 | "label": "UTF-8 / Windows-1252 followed by UTF-8 / Windows-1251", 206 | "original": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", 207 | "fixed": "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator", 208 | "expect": "pass" 209 | }, 210 | { 211 | "label": 
"fancy Unicode crossing-out, but mojibaked", 212 | "original": "hotel $49 $̶6̶3̶ updated 2018", 213 | "fixed": "hotel $49 $̶6̶3̶ updated 2018", 214 | "expect": "pass" 215 | }, 216 | { 217 | "label": "A face with UTF-8 / sloppy Windows-1252 mixed up twice", 218 | "original": "ââ€\u009d’(⌣˛⌣)ââ€\u009dŽ", 219 | "fixed": "┒(⌣˛⌣)┎", 220 | "expect": "pass" 221 | }, 222 | { 223 | "label": "We can mostly decode the face above when we lose the character U+009D", 224 | "original": "ââ€�’(⌣˛⌣)ââ€�Ž", 225 | "fixed": "�(⌣˛⌣)�", 226 | "expect": "pass" 227 | }, 228 | { 229 | "label": "Lossy decoding can have plain ASCII question marks, as well", 230 | "original": "The ICR has been upgraded to “bb+â€? from “bbâ€?", 231 | "fixed-encoding": "The ICR has been upgraded to “bb+� from “bb�", 232 | "fixed": "The ICR has been upgraded to \"bb+� from \"bb�", 233 | "expect": "pass" 234 | }, 235 | { 236 | "label": "CESU-8 / Latin-1 mixup over several emoji", 237 | "comment": "You tried", 238 | "original": "I just figured out how to tweet emojis! â\u009a½í\u00a0½í¸\u0080í\u00a0½í¸\u0081í\u00a0½í¸\u0082í\u00a0½í¸\u0086í\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008eí\u00a0½í¸\u008e", 239 | "fixed": "I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎", 240 | "expect": "pass" 241 | }, 242 | { 243 | "label": "An absolutely hopeless garble", 244 | "comment": "If we try too hard to decode this, we'll recursively apply `decode_inconsistent_utf8` until the characters turn into random Han and katakana characters.", 245 | "original": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", 246 | "fixed-encoding": "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â", 247 | "fixed": "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â", 248 | "expect": "pass" 249 | }, 250 | { 251 | "label": "Inconsistent UTF-8 / Latin-1 mojibake", 252 | "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099\u0085", 253 | "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", 254 | "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", 255 | "expect": "pass" 256 | }, 257 | { 258 | "label": "Inconsistent UTF-8 / Latin-1 mojibake with an ellipsis from the Windows-1252 character set", 259 | "original": "Ecuadorâ\u0080\u0099s â\u0080\u0098purely political decision on Assangeâ\u0080\u0099 is likely result of â\u0080\u0098US pressureâ\u0080\u0099…", 260 | "fixed-encoding": "Ecuador’s ‘purely political decision on Assange’ is likely result of ‘US pressure’…", 261 | "fixed": "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…", 262 | "expect": "pass" 263 | }, 264 | { 265 | "label": "Inconsistent mojibake in Portuguese", 266 | "original": "Campeonatos > III Divisão - Série F > Jornadas Classificação", 267 | "fixed": "Campeonatos > III Divisão - Série F > Jornadas Classificação", 268 | "expect": "pass" 269 | }, 270 | { 271 | "label": "Handle Afrikaans 'n character", 272 | "original": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", 273 | "fixed-encoding": "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.", 274 | "fixed": "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.", 275 | "expect": "pass" 276 | }, 277 | { 278 | "label": "Handle Croatian single-codepoint digraphs", 279 | "original": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore 
na čipu", 280 | "fixed-encoding": "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", 281 | "fixed": "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu", 282 | "expect": "pass" 283 | }, 284 | { 285 | "label": "A with an acute accent, in isolation", 286 | "original": "Nicolás", 287 | "fixed": "Nicolás", 288 | "expect": "pass" 289 | }, 290 | { 291 | "label": "sharp S, in isolation, via MacRoman encoding", 292 | "comment": "regression reported in issue #186", 293 | "original": "wei√ü", 294 | "fixed": "weiß", 295 | "expect": "pass" 296 | }, 297 | { 298 | "label": "French example containing non-breaking spaces", 299 | "original": "ART TRIP Ã\u00a0 l'office de tourisme", 300 | "fixed": "ART TRIP à l'office de tourisme", 301 | "expect": "pass" 302 | }, 303 | { 304 | "label": "English example in UTF-8 / Windows-1251 with a ligature", 305 | "original": "This is signiп¬Ѓcantly lower than the respective share", 306 | "fixed-encoding": "This is significantly lower than the respective share", 307 | "fixed": "This is significantly lower than the respective share", 308 | "expect": "pass" 309 | }, 310 | { 311 | "label": "'à' remains its own word, even if spaces after it get coalesced into one", 312 | "original": "à perturber la réflexion des théologiens jusqu'à nos jours", 313 | "fixed": "à perturber la réflexion des théologiens jusqu'à nos jours", 314 | "expect": "pass" 315 | }, 316 | { 317 | "label": "Fix 'à' in inconsistent mojibake", 318 | "original": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", 319 | "fixed-encoding": "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation", 320 | "fixed": "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation", 321 | "expect": "pass" 322 | }, 323 | { 324 | "label": "The Portuguese word 'às' does not become 'à s' due to the French fix", 325 | "original": "com especial atenção à s crianças", 326 | "fixed": "com especial atenção às crianças", 327 | "expect": "pass" 328 | }, 329 | { 330 | "label": "This is why we require a space after the 's' in 'às'", 331 | "original": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", 332 | "fixed": "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.", 333 | "expect": "pass" 334 | }, 335 | { 336 | "label": "We can fix 'à' in windows-1251 sometimes as well", 337 | "original": "La rГ©gion de Dnepropetrovsk se trouve Г l’ouest de l’Ukraine", 338 | "fixed-encoding": "La région de Dnepropetrovsk se trouve à l’ouest de l’Ukraine", 339 | "fixed": "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine", 340 | "expect": "pass" 341 | }, 342 | { 343 | "label": "'à quele' is the Portuguese word 'àquele', not 'à quele'", 344 | "original": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante à quele observado nas lesões por imunocomplexo em excesso de anticorpos", 345 | "fixed": "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos", 346 | 
"expect": "pass" 347 | }, 348 | { 349 | "label": "A complex, lossy pile-up of mojibake in Portuguese", 350 | "original": "â € ðŸ“� Regulamento: â € âš ï¸� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. âš ï¸� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. âš ï¸� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até à s 19h do mesmo dia em uma nova publicação em nosso instagram. â € Boa sorte!!! 😀ðŸ�°", 351 | "fixed": "⠀ �\u00a0Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. ⠀ Boa sorte!!!\u00a0😀�", 352 | "expect": "pass" 353 | }, 354 | { 355 | "label": "UTF-8 / Windows-1252 mixup in Gaelic involving non-breaking spaces", 356 | "original": "CÃ\u00a0nan nan GÃ\u00a0idheal", 357 | "fixed": "Cànan nan Gàidheal", 358 | "expect": "pass" 359 | }, 360 | { 361 | "label": "UTF-8 / Windows-1251 mixup in tweet spam", 362 | "original": "Blog Traffic Tip 2 – Broadcast Email Your Blog", 363 | "fixed": "Blog Traffic Tip 2 – Broadcast Email Your Blog", 364 | "expect": "pass" 365 | }, 366 | { 367 | "label": "UTF-8 / Windows-1251 mixup", 368 | "original": "S&P Confirms Ukrsotsbank’s “B-“ Rating", 369 | "fixed-encoding": "S&P Confirms Ukrsotsbank’s “B-“ Rating", 370 | "fixed": "S&P Confirms Ukrsotsbank's \"B-\" Rating", 371 | "expect": "pass" 372 | }, 373 | { 374 | "label": "Dutch example with ë", 375 | "comment": "from issue reported by MicroJackson", 376 | "original": "ongeëvenaard", 377 | "fixed-encoding": "ongeëvenaard", 378 | "fixed": "ongeëvenaard", 379 | "expect": "pass" 380 | }, 381 | { 382 | "label": "HTML entity on top of UTF-8 / Latin-1", 383 | "original": "10μs", 384 | "fixed-encoding": "10μs", 385 | "fixed": "10μs", 386 | "expect": "pass" 387 | }, 388 | { 389 | "label": "Three layers of UTF-8 / MacRoman mixup in French", 390 | "comment": "You're welcome", 391 | "original": "Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in Flash Player 8", 392 | "fixed": "Merci de télécharger le plug-in Flash Player 8", 393 | "expect": "pass" 394 | }, 395 | { 396 | "label": "UTF-8 / MacRoman mixup in French", 397 | "original": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter‚Ķ", 398 | "fixed": "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…", 399 | "expect": "pass" 400 | }, 401 | { 402 | "label": "Italian UTF-8 / MacRoman example with ò", 403 | "original": "Le Vigne di Zam√≤", 404 | "fixed": "Le Vigne di Zamò", 405 | "expect": "pass" 406 | }, 407 | { 408 | "label": "Punctuation pile-up should actually be musical notes", 409 | "original": "Engkau masih yg terindah, indah di dalam hatiku♫~", 410 | "fixed": "Engkau masih yg terindah, indah di dalam hatiku♫~", 411 | "expect": "pass" 412 | }, 413 | { 414 | "label": "Latvian UTF-8 / Windows-1257 mojibake", 415 | "original": "Å veices baņķieri gaida konkrÄ“tus investÄ«ciju projektus", 416 | "fixed": "Šveices baņķieri gaida konkrētus 
investīciju projektus", 417 | "expect": "pass" 418 | }, 419 | { 420 | "label": "Latvian UTF-8 / MacRoman mojibake", 421 | "original": "SaeimƒÅ ievƒìlƒìtƒÅs partijas \"Progresƒ´vie\" lƒ´dzvadƒ´tƒÅja Anto≈Üina ≈Öena≈°eva atbild uz ≈æurnƒÅlistu jautƒÅjumiem pƒìc partijas tik≈°anƒÅs ar Valsts prezidentu Rƒ´gas pilƒ´,", 422 | "fixed": "Saeimā ievēlētās partijas \"Progresīvie\" līdzvadītāja Antoņina Ņenaševa atbild uz žurnālistu jautājumiem pēc partijas tikšanās ar Valsts prezidentu Rīgas pilī,", 423 | "expect": "pass" 424 | }, 425 | { 426 | "label": "Lithuanian UTF-8 / Windows-1257 mojibake", 427 | "original": "Å iaip ÄÆdomu, kaip ÄÆsivaizduoji. Visų pirma tam reikia laiko.", 428 | "fixed": "Šiaip įdomu, kaip įsivaizduoji. Visų pirma tam reikia laiko.", 429 | "expect": "pass" 430 | }, 431 | { 432 | "label": "Lithuanian UTF-8 / Windows-1250 mojibake", 433 | "original": "Lietuva pagrÄŻstai gali paklausti: Ĺ˝inoma, kad ne.", 434 | "fixed": "Lietuva pagrįstai gali paklausti: Žinoma, kad ne.", 435 | "expect": "pass" 436 | }, 437 | { 438 | "label": "Hebrew UTF-8 / Windows-1252 mojibake", 439 | "comment": "reported by SuperIRabbit as issue #158", 440 | "original": "בהודעה", 441 | "fixed": "בהודעה", 442 | "expect": "pass" 443 | }, 444 | { 445 | "label": "Wide comma in UTF-8 / Windows-1252", 446 | "original": "Ningbo,China", 447 | "fixed-encoding": "Ningbo,China", 448 | "fixed": "Ningbo,China", 449 | "expect": "pass" 450 | } 451 | ] -------------------------------------------------------------------------------- /tests/test-cases/known-failures.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Misleading mix-up in Spanish", 4 | "comment": "The original text has mojibake, but the sequence 'á \u0093' can decode as U+1813 MONGOLIAN DIGIT THREE, when the whole string should really just decode as a Latin-1/Windows-1252 mixup", 5 | "original": "tiene demora y está \u0093próximo a resolverse\u0094", 6 | "fixed": "tiene demora y está \"próximo a resolverse\"", 7 | "expect": "fail" 8 | }, 9 | { 10 | "label": "Two levels of inconsistent mojibake", 11 | "comment": "The en-dash was mojibaked in UTF-8 / Windows-1252 as three characters, two of which were mojibaked again as Windows-1252 / Latin-1, and the third of which was mojibaked as UTF-8 / Latin-1. 
Unfortunately, if we fix this, we leave ourselves room to greedily 'decode' random Han characters in complex Latin-alphabet mojibake", 12 | "original": "Arsenal v Wolfsburg: pre-season friendly â\u0080â\u0080\u009c live!", 13 | "fixed": "Arsenal v Wolfsburg: pre-season friendly – live!", 14 | "expect": "fail" 15 | }, 16 | { 17 | "label": "A-with-grave in Vietnamese", 18 | "comment": "Currently adds extra spaces that shouldn't be there", 19 | "original": "Xem clip hĂ i, phim hĂ i má»›i hay nhất", 20 | "fixed": "Xem clip hài, phim hài mới hay nhất", 21 | "expect": "fail" 22 | }, 23 | { 24 | "label": "Latin-1 / MacRoman mixup in Spanish", 25 | "comment": "Requires something like encoding detection", 26 | "original": "Deja dos heridos hundimiento de barco tur\u0092stico en Acapulco.", 27 | "fixed": "Deja dos heridos hundimiento de barco turístico en Acapulco.", 28 | "expect": "fail" 29 | }, 30 | { 31 | "label": "subtle UTF-8 / codepage 437 mixup in Spanish", 32 | "original": "┬┐que diferencia hay?", 33 | "fixed": "¿que diferencia hay?", 34 | "expect": "fail" 35 | }, 36 | { 37 | "label": "Latin-1 / MacRoman mixup in Spanish, 2 characters", 38 | "comment": "Requires something like encoding detection", 39 | "original": "Habitantes de Coatl\u0087n conf\u0092an en proyecto de edil electo independiente", 40 | "fixed": "Habitantes de Coatlán confían en proyecto de edil electo independiente", 41 | "expect": "fail" 42 | }, 43 | { 44 | "label": "An example with 'à' in windows-1251 where we need our heuristic to be bolder", 45 | "original": "faites attention Г bien vous renseigner avant sur le mГ©dicament", 46 | "fixed": "faites attention à bien vous renseigner avant sur le médicament", 47 | "expect": "fail" 48 | }, 49 | { 50 | "label": "Italian UTF-8 / MacRoman mojibake that looks like math", 51 | "comment": "False negative: 'pi√π' is a bit too reasonable to fix", 52 | "original": "Sarai ricontattato dal nostro Esperto al pi√π presto.", 53 | "fixed": "Sarai ricontattato dal nostro Esperto al più presto.", 54 | "expect": "fail" 55 | }, 56 | { 57 | "label": "Synthetic: Incomplete UTF-8 / Windows-1252 mixup in Arabic", 58 | "comment": "I find text like this in OSCAR a fair amount, but couldn't isolate a good example that tested digits. The intended text means 'more than 100 countries'.", 59 | "original": "أكثر من Ù Ù Ù¡ بلد", 60 | "fixed": "أكثر من ٠٠١ بلد", 61 | "expect": "fail" 62 | }, 63 | { 64 | "label": "Synthetic, false positive: the title of a manga, in weird capitalized romaji, with a non-breaking space", 65 | "comment": "Testing tells me I should worry about cases like this, though I haven't seen a real example. 
Searching for similar real text yields a lot of examples that actually come out fine.", 66 | "original": "MISUTÂ\u00a0AJIKKO", 67 | "fixed": "MISUTÂ\u00a0AJIKKO", 68 | "expect": "fail" 69 | } 70 | ] -------------------------------------------------------------------------------- /tests/test-cases/language-names.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Messy language names: Czech", 4 | "comment": "This and several following examples came from the same language selector", 5 | "original": "ÄŒeÅ¡tina", 6 | "fixed": "Čeština", 7 | "expect": "pass" 8 | }, 9 | { 10 | "label": "Messy language names: Gaelic", 11 | "comment": "note that if U+A0 is replaced by a space, it comes out slightly incorrectly as 'Gà idhlig'", 12 | "original": "GÃ\u00a0idhlig", 13 | "fixed": "Gàidhlig", 14 | "expect": "pass" 15 | }, 16 | { 17 | "label": "Messy language names: Lithuanian", 18 | "original": "Lietuvių", 19 | "fixed": "Lietuvių", 20 | "expect": "pass" 21 | }, 22 | { 23 | "label": "Messy language names: Slovak", 24 | "original": "SlovenÄ�ina", 25 | "fixed": "Sloven�ina", 26 | "expect": "pass" 27 | }, 28 | { 29 | "label": "Messy language names: Vietnamese", 30 | "original": "Tiếng Việt", 31 | "fixed": "Tiếng Việt", 32 | "expect": "pass" 33 | }, 34 | { 35 | "label": "Messy language names: Greek", 36 | "original": "Ελληνικά", 37 | "fixed": "Ελληνικά", 38 | "expect": "pass" 39 | }, 40 | { 41 | "label": "Messy language names: Bulgarian", 42 | "original": "българÑ�ки език", 43 | "fixed": "българ�ки език", 44 | "expect": "pass" 45 | }, 46 | { 47 | "label": "Messy language names: Russian", 48 | "original": "РуÑ�Ñ�кий", 49 | "fixed": "Ру��кий", 50 | "expect": "pass" 51 | }, 52 | { 53 | "label": "Messy language names: Serbian [Cyrillic]", 54 | "original": "CрпÑ�ки [ћирилицом]", 55 | "fixed": "Cрп�ки [ћирилицом]", 56 | "expect": "pass" 57 | }, 58 | { 59 | "label": "Messy language names: Hebrew", 60 | "original": "עברית", 61 | "fixed": "עברית", 62 | "expect": "pass" 63 | }, 64 | { 65 | "label": "Messy language names: Russian", 66 | "original": "РуÑ�Ñ�кий", 67 | "fixed": "Ру��кий", 68 | "expect": "pass" 69 | }, 70 | { 71 | "label": "Messy language names: Hindi", 72 | "comment": "My terminal has difficulty rendering the mostly-fixed text", 73 | "original": "हिनà¥�दी", 74 | "fixed": "\u0939\u093f\u0928\ufffd\u0926\u0940", 75 | "expect": "pass" 76 | }, 77 | { 78 | "label": "Messy language names: Tamil", 79 | "comment": "My terminal has difficulty rendering the mostly-fixed text", 80 | "original": "தமிழà¯�", 81 | "fixed": "\u0ba4\u0bae\u0bbf\u0bb4\ufffd", 82 | "expect": "pass" 83 | }, 84 | { 85 | "label": "Messy language names: Thai", 86 | "original": "ภาษาไทย", 87 | "fixed": "ภาษาไทย", 88 | "expect": "pass" 89 | }, 90 | { 91 | "label": "Messy language names: Simplified Chinese", 92 | "original": "简体ä¸\u00adæ–‡", 93 | "fixed": "简体中文", 94 | "expect": "pass" 95 | }, 96 | { 97 | "label": "Messy language names: Traditional Chinese", 98 | "original": "æ\u00ad£é«”ä¸\u00adæ–‡", 99 | "fixed": "正體中文", 100 | "expect": "pass" 101 | }, 102 | { 103 | "label": "Messy language names: Japanese", 104 | "original": "日本語", 105 | "fixed": "日本語", 106 | "expect": "pass" 107 | }, 108 | { 109 | "label": "Messy language names: Korean", 110 | "original": "한êµ\u00adì–´", 111 | "fixed": "한국어", 112 | "expect": "pass" 113 | }, 114 | { 115 | "label": "Messy language name in cp437: Czech", 116 | "comment": "A synthetic example, I suppose, but goes with the other language name tests", 117 | 
"original": "─îe┼ítina", 118 | "fixed": "Čeština", 119 | "expect": "pass" 120 | }, 121 | { 122 | "label": "Messy language name in cp437: Vietnamese", 123 | "original": "Tiß║┐ng Viß╗çt", 124 | "fixed": "Tiếng Việt", 125 | "expect": "pass" 126 | } 127 | ] -------------------------------------------------------------------------------- /tests/test-cases/negative.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Negative: Using diaereses as quotation marks in Greek", 4 | "comment": "Examples in this file might be detected as mojibake-like, but should not be changed", 5 | "original": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", 6 | "fixed": "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές", 7 | "expect": "pass" 8 | }, 9 | { 10 | "label": "Negative: Don't fix a multiplication symbol in quotes", 11 | "original": "higher values (“+” and “×” curves) in the superficial region", 12 | "fixed-encoding": "higher values (“+” and “×” curves) in the superficial region", 13 | "fixed": "higher values (\"+\" and \"×\" curves) in the superficial region", 14 | "expect": "pass" 15 | }, 16 | { 17 | "label": "Sort of negative: this inconsistent mojibake could be Latin-1 or MacRoman, and it was meant to be Latin-1, but it's safest to not decode it as either", 18 | "comment": "issue #202", 19 | "original": "Bremer/Mccoy – DrÃ¥ber", 20 | "fixed": "Bremer/Mccoy – DrÃ¥ber", 21 | "expect": "pass" 22 | }, 23 | { 24 | "label": "Negative: 'è' preceded by a non-breaking space is not a small capital Y", 25 | "original": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", 26 | "fixed": "Con il corpo e lo spirito ammaccato,\u00a0è come se nel cuore avessi un vetro conficcato.", 27 | "expect": "pass" 28 | }, 29 | { 30 | "label": "Negative: multiplication sign and ellipsis", 31 | "comment": "Should not turn into a dot below", 32 | "original": "4288×…", 33 | "fixed": "4288×…", 34 | "expect": "pass" 35 | }, 36 | { 37 | "label": "Negative: accents are sometimes used as quotes", 38 | "comment": "Under a previous heuristic, this tested the CESU-8 decoder, which would try to decode it and fail when it hit the end of the string", 39 | "original": "``toda produzida pronta pra assa aí´´", 40 | "fixed": "``toda produzida pronta pra assa aí´´", 41 | "expect": "pass" 42 | }, 43 | { 44 | "label": "Negative: 'Õ' followed by an ellipsis", 45 | "comment": "Should not turn into the Armenian letter Յ", 46 | "original": "HUHLL Õ…", 47 | "fixed": "HUHLL Õ…", 48 | "expect": "pass" 49 | }, 50 | { 51 | "label": "Negative: 'Ê' followed by an ellipsis", 52 | "comment": "Should not turn into a squat reversed esh", 53 | "original": "RETWEET SE VOCÊ…", 54 | "fixed": "RETWEET SE VOCÊ…", 55 | "expect": "pass" 56 | }, 57 | { 58 | "label": "Negative: 'É' followed by an ellipsis", 59 | "comment": "Should not turn into 'MARQUɅ'", 60 | "original": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", 61 | "fixed": "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…", 62 | "expect": "pass" 63 | }, 64 | { 65 | "label": "Negative: 'Ó' followed by an ellipsis", 66 | "comment": "Should not turn into 'SӅ'", 67 | "original": "TEM QUE SEGUIR, SDV SÓ…", 68 | "fixed": "TEM QUE SEGUIR, SDV SÓ…", 69 | "expect": "pass" 70 | }, 71 | { 72 | "label": "Negative: 'É' followed by a curly apostrophe", 73 | "comment": "Should not turn into 'ZZAJɒs'", 74 | "original": "Join ZZAJÉ’s Official Fan List and receive news, events, and more!", 75 | "fixed-encoding": "Join ZZAJÉ’s Official Fan List and receive news, 
events, and more!", 76 | "fixed": "Join ZZAJÉ's Official Fan List and receive news, events, and more!", 77 | "expect": "pass" 78 | }, 79 | { 80 | "label": "Negative: 'é' preceded by curly apostrophe", 81 | "comment": "Should not turn into 'LՎpisode'", 82 | "original": "L’épisode 8 est trop fou ouahh", 83 | "fixed-encoding": "L’épisode 8 est trop fou ouahh", 84 | "fixed": "L'épisode 8 est trop fou ouahh", 85 | "expect": "pass" 86 | }, 87 | { 88 | "label": "Negative: three raised eyebrows or something?", 89 | "comment": "Should not turn into private use character U+F659", 90 | "original": "Ôôô VIDA MINHA", 91 | "fixed": "Ôôô VIDA MINHA", 92 | "expect": "pass" 93 | }, 94 | { 95 | "label": "Negative: copyright sign preceded by non-breaking space", 96 | "comment": "Should not turn into 'ʩ'", 97 | "original": "[x]\u00a0©", 98 | "fixed": "[x]\u00a0©", 99 | "expect": "pass" 100 | }, 101 | { 102 | "label": "Negative: en dash and infinity sign", 103 | "comment": "Should not turn into '2012Ѱ'", 104 | "original": "2012—∞", 105 | "fixed": "2012—∞", 106 | "expect": "pass" 107 | }, 108 | { 109 | "label": "Negative: This Е is a Ukrainian letter, but nothing else is wrong", 110 | "original": "SENSЕ - Oleg Tsedryk", 111 | "fixed": "SENSЕ - Oleg Tsedryk", 112 | "expect": "pass" 113 | }, 114 | { 115 | "label": "Negative: angry face", 116 | "comment": "The face should not turn into '`«'", 117 | "original": "OK??:( `¬´ ):", 118 | "fixed": "OK??:( `¬´ ):", 119 | "expect": "pass" 120 | }, 121 | { 122 | "label": "Negative, synthetic: face with glasses and a raised eyebrow", 123 | "original": "( o¬ô )", 124 | "fixed": "( o¬ô )", 125 | "expect": "pass" 126 | }, 127 | { 128 | "label": "Negative: triangle and degree sign", 129 | "comment": "I'm not really sure what it *is* supposed to be, but it's not 'ơ'", 130 | "original": "∆°", 131 | "fixed": "∆°", 132 | "expect": "pass" 133 | }, 134 | { 135 | "label": "Negative: Portuguese with inverted question mark", 136 | "comment": "Former false positive - it should not turn into 'QUEM ɿ'", 137 | "original": "ESSE CARA AI QUEM É¿", 138 | "fixed": "ESSE CARA AI QUEM É¿", 139 | "expect": "pass" 140 | }, 141 | { 142 | "label": "Negative: Portuguese with acute accents as quotation marks", 143 | "comment": "Former false positive - the end should not turn into a superscript H", 144 | "original": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", 145 | "fixed": "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´", 146 | "expect": "pass" 147 | }, 148 | { 149 | "label": "Negative: Finnish Ä followed by a non-breaking space", 150 | "comment": "Former false positive - should not become a G with a dot", 151 | "original": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", 152 | "fixed": "SELKÄ\u00a0EDELLÄ\u00a0MAAHAN via @YouTube", 153 | "expect": "pass" 154 | }, 155 | { 156 | "label": "Negative: multiplying by currency", 157 | "comment": "Former false positive - should not become the Hebrew letter 'final pe'", 158 | "original": "Offering 5×£35 pin ups", 159 | "fixed": "Offering 5×£35 pin ups", 160 | "expect": "pass" 161 | }, 162 | { 163 | "label": "Negative: registered chocolate brand name", 164 | "comment": "Former false positive - should not become the IPA letter 'lezh'", 165 | "original": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", 166 | "fixed": "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional", 167 | "expect": "pass" 168 | }, 169 | { 170 | "label": "Negative: it looks like 
Windows-1257 mojibake but someone writes their name this way", 171 | "comment": "Should not become a cedilla", 172 | "original": "Connect with Āø on Facebook", 173 | "fixed": "Connect with Āø on Facebook", 174 | "expect": "pass" 175 | }, 176 | { 177 | "label": "Mostly negative: we only need to fix C1 control characters", 178 | "comment": "We should not decode 'é\u0085 ' as '酠'", 179 | "original": "C'est vrai que nous n'en avons pas encore beaucoup parlé\u0085 Tu sais, ça fait de nombreuses années", 180 | "fixed": "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années", 181 | "expect": "pass" 182 | }, 183 | { 184 | "label": "Negative: We don't fix à in all contexts", 185 | "original": "C O N C L U S à O", 186 | "fixed": "C O N C L U S à O", 187 | "expect": "pass" 188 | }, 189 | { 190 | "label": "Negative: Two concatenated strings", 191 | "comment": "Should not turn into 'fratarak᧠141'", 192 | "original": "Oborzos, per. Vahbarz, frataraká§ 141", 193 | "fixed": "Oborzos, per. Vahbarz, frataraká§ 141", 194 | "expect": "pass" 195 | }, 196 | { 197 | "label": "Negative: Indonesian leetspeak", 198 | "original": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", 199 | "fixed": "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...", 200 | "expect": "pass" 201 | }, 202 | { 203 | "label": "Negative: math in Unicode", 204 | "comment": "This isn't mojibake, it's an actual equation", 205 | "original": "(-1/2)! = √π", 206 | "fixed": "(-1/2)! = √π", 207 | "expect": "pass" 208 | }, 209 | { 210 | "label": "Negative: Leet line-art", 211 | "comment": "The heuristic before v6 loved to 'fix' this and decode it as 'ôaſaſaſaſa'", 212 | "original": "├┤a┼┐a┼┐a┼┐a┼┐a", 213 | "fixed": "├┤a┼┐a┼┐a┼┐a┼┐a", 214 | "expect": "pass" 215 | } 216 | ] -------------------------------------------------------------------------------- /tests/test-cases/synthetic.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "Synthetic: we can recognize à in some cases when it's the only mojibake", 4 | "comment": "Examples in this file were made up to test something, instead of found in the wild", 5 | "original": "voilà le travail", 6 | "fixed": "voilà le travail", 7 | "expect": "pass" 8 | }, 9 | { 10 | "label": "Synthetic: we can recognize à at the end of a word when it absorbs a following space", 11 | "original": "voilà le travail", 12 | "fixed": "voilà le travail", 13 | "expect": "pass" 14 | }, 15 | { 16 | "label": "Synthetic: Hebrew UTF-8 / Windows-1250 mojibake", 17 | "original": "בהודעה", 18 | "fixed": "בהודעה", 19 | "expect": "pass" 20 | }, 21 | { 22 | "label": "Synthetic: Hebrew UTF-8 / MacRoman mojibake", 23 | "original": "◊ë◊î◊ï◊ì◊¢◊î", 24 | "fixed": "בהודעה", 25 | "expect": "pass" 26 | }, 27 | { 28 | "label": "Synthetic: Hebrew UTF-8 / Latin-1 mojibake", 29 | "comment": "This example uses low-numbered codepoints to spell 'ABBA' in Hebrew, so that it falls into the range where Latin-1 is different from Windows-1252. 
As a bonus, this example looks right even if your RTL text rendering isn't working.", 30 | "original": "×\u0090×\u0091×\u0091×\u0090", 31 | "fixed": "אבבא", 32 | "expect": "pass" 33 | }, 34 | { 35 | "label": "Synthetic: Arabic UTF-8 / Windows-1252 mojibake", 36 | "original": "رسالة", 37 | "fixed": "رسالة", 38 | "expect": "pass" 39 | }, 40 | { 41 | "label": "Synthetic: Arabic UTF-8 / Windows-1250 mojibake", 42 | "original": "رسالة", 43 | "fixed": "رسالة", 44 | "expect": "pass" 45 | }, 46 | { 47 | "label": "Synthetic: Arabic UTF-8 / MacRoman mojibake", 48 | "original": "ÿ±ÿ≥ÿߟÑÿ©", 49 | "fixed": "رسالة", 50 | "expect": "pass" 51 | }, 52 | { 53 | "label": "Synthetic, negative: Brontë's name does not end with a Korean syllable", 54 | "comment": "The original example of why ftfy needs heuristics", 55 | "original": "I'm not such a fan of Charlotte Brontë…”", 56 | "fixed-encoding": "I'm not such a fan of Charlotte Brontë…”", 57 | "fixed": "I'm not such a fan of Charlotte Brontë…\"", 58 | "expect": "pass" 59 | }, 60 | { 61 | "label": "Synthetic, negative: hypothetical Swedish product name", 62 | "comment": "This used to be a constructed example of a false positive, until you added another symbol", 63 | "original": "AHÅ™, the new sofa from IKEA", 64 | "fixed": "AHÅ™, the new sofa from IKEA", 65 | "expect": "pass" 66 | }, 67 | { 68 | "label": "Synthetic, negative: Ukrainian capital letters", 69 | "comment": "We need to fix Windows-1251 conservatively, or else this decodes as '²ʲ'", 70 | "original": "ВІКІ is Ukrainian for WIKI", 71 | "fixed": "ВІКІ is Ukrainian for WIKI", 72 | "expect": "pass" 73 | }, 74 | { 75 | "label": "Synthetic, negative: don't leak our internal use of byte 0x1A", 76 | "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", 77 | "original": "These control characters \u001a are apparently intentional \u0081", 78 | "fixed-encoding": "These control characters \u001a are apparently intentional \u0081", 79 | "fixed": "These control characters are apparently intentional \u0081", 80 | "expect": "pass" 81 | }, 82 | { 83 | "label": "Synthetic, negative: U+1A on its own", 84 | "comment": "We use byte 0x1A internally as an encoding of U+FFFD, but literal occurrences of U+1A are just ASCII control characters", 85 | "original": "Here's a control character: \u001a", 86 | "fixed-encoding": "Here's a control character: \u001a", 87 | "fixed": "Here's a control character: ", 88 | "expect": "pass" 89 | }, 90 | { 91 | "label": "Synthetic, negative: A-with-circle as an Angstrom sign", 92 | "comment": "Should not turn into '10 ŗ'", 93 | "original": "a radius of 10 Å—", 94 | "fixed": "a radius of 10 Å—", 95 | "expect": "pass" 96 | }, 97 | { 98 | "label": "Synthetic, negative: Spanish with exclamation points on the wrong sides", 99 | "original": "!YO SÉ¡", 100 | "fixed": "!YO SÉ¡", 101 | "expect": "pass" 102 | }, 103 | { 104 | "label": "Synthetic: fix text with backslashes in it", 105 | "comment": "Tests for a regression on a long-ago bug", 106 | "original": "<40\\% vs \u00e2\u0089\u00a540\\%", 107 | "fixed": "<40\\% vs ≥40\\%", 108 | "expect": "pass" 109 | }, 110 | { 111 | "label": "Synthetic: curly quotes with mismatched encoding glitches in Latin-1", 112 | "original": "\u00e2\u0080\u009cmismatched quotes\u0085\u0094", 113 | "fixed-encoding": "“mismatched quotes…”", 114 | "fixed": "\"mismatched quotes…\"", 115 | "expect": "pass" 116 | }, 117 | { 118 | "label": "Synthetic: curly quotes with mismatched encoding glitches in 
Windows-1252", 119 | "original": "“mismatched quotes…”", 120 | "fixed-encoding": "“mismatched quotes…”", 121 | "fixed": "\"mismatched quotes…\"", 122 | "expect": "pass" 123 | }, 124 | { 125 | "label": "Synthetic: lossy decoding in sloppy-windows-1252", 126 | "original": "“lossy decodingâ€�", 127 | "fixed-encoding": "“lossy decoding�", 128 | "fixed": "\"lossy decoding�", 129 | "expect": "pass" 130 | }, 131 | { 132 | "label": "Synthetic: French word for August in windows-1252", 133 | "original": "août", 134 | "fixed-encoding": "août", 135 | "fixed": "août", 136 | "expect": "pass" 137 | }, 138 | { 139 | "label": "Synthetic: French word for hotel in all-caps windows-1252", 140 | "original": "HÔTEL", 141 | "fixed-encoding": "HÔTEL", 142 | "fixed": "HÔTEL", 143 | "expect": "pass" 144 | }, 145 | { 146 | "label": "Synthetic: Scottish Gaelic word for 'subject' in all-caps windows-1252", 147 | "original": "CÙIS", 148 | "fixed-encoding": "CÙIS", 149 | "fixed": "CÙIS", 150 | "expect": "pass" 151 | }, 152 | { 153 | "label": "Synthetic, negative: Romanian word before a non-breaking space", 154 | "comment": "The word literally means 'not even once', which might be a good recommendation about fixing Romanian mojibake", 155 | "original": "NICIODATĂ\u00a0", 156 | "fixed": "NICIODATĂ\u00a0", 157 | "expect": "pass" 158 | }, 159 | { 160 | "label": "Synthetic, negative: Be careful around curly apostrophes", 161 | "comment": "It shouldn't end up saying 'a lot of Òs'", 162 | "original": "There are a lot of Ã’s in mojibake text", 163 | "fixed-encoding": "There are a lot of Ã’s in mojibake text", 164 | "fixed": "There are a lot of Ã's in mojibake text", 165 | "expect": "pass" 166 | }, 167 | { 168 | "label": "Synthetic, negative: Romanian word before a trademark sign", 169 | "comment": "We would change 'DATÙ' to 'DATÙ' if it passed the badness heuristic", 170 | "original": "NICIODATĂ™", 171 | "fixed": "NICIODATĂ™", 172 | "expect": "pass" 173 | }, 174 | { 175 | "label": "Synthetic, negative: Lithuanian word before a trademark sign", 176 | "comment": "Similar to the above example. Shouldn't turn into U+0619 ARABIC SMALL DAMMA", 177 | "original": "TRANSFORMATORIŲ™", 178 | "fixed": "TRANSFORMATORIŲ™", 179 | "expect": "pass" 180 | }, 181 | { 182 | "label": "Synthetic, negative: Norwegian capitalized nonsense", 183 | "comment": "We're shouting that the island of Håøya is gullible. It should not turn into 'HŨYA ER BLŨYD'.", 184 | "original": "HÅØYA ER BLÅØYD", 185 | "fixed": "HÅØYA ER BLÅØYD", 186 | "expect": "pass" 187 | }, 188 | { 189 | "label": "Synthetic, negative: raised eyebrow kaomoji", 190 | "original": "Ō¬o", 191 | "fixed": "Ō¬o", 192 | "expect": "pass" 193 | }, 194 | { 195 | "label": "Synthetic, negative: Camel-cased Serbian that looks like a UTF-8 / Windows-1251 mixup", 196 | "comment": "I made this text up, but it seems like it means 'HelloDevil'. Could be a username or something.", 197 | "original": "ПоздравЂаво", 198 | "fixed": "ПоздравЂаво", 199 | "expect": "pass" 200 | }, 201 | { 202 | "label": "Synthetic: mojibake with trademark sign at the end of a word", 203 | "comment": "I recall the correct version of this text from a sign in the movie Amélie. 
Now we can help her twin Amélie, who makes mojibaked signs.", 204 | "original": "OÙ ET QUAND?", 205 | "fixed": "OÙ ET QUAND?", 206 | "expect": "pass" 207 | } 208 | ] -------------------------------------------------------------------------------- /tests/test_bytes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ftfy import guess_bytes 4 | from ftfy.bad_codecs.utf8_variants import IncrementalDecoder 5 | 6 | TEST_ENCODINGS = ["utf-16", "utf-8", "sloppy-windows-1252"] 7 | 8 | TEST_STRINGS = [ 9 | "Renée\nFleming", 10 | "Noël\nCoward", 11 | "Señor\nCardgage", 12 | "€ • £ • ¥", 13 | "¿Qué?", 14 | ] 15 | 16 | 17 | @pytest.mark.parametrize("string", TEST_STRINGS) 18 | def test_guess_bytes(string): 19 | for encoding in TEST_ENCODINGS: 20 | result_str, result_encoding = guess_bytes(string.encode(encoding)) 21 | assert result_str == string 22 | assert result_encoding == encoding 23 | 24 | if "\n" in string: 25 | old_mac_bytes = string.replace("\n", "\r").encode("macroman") 26 | result_str, result_encoding = guess_bytes(old_mac_bytes) 27 | assert result_str == string.replace("\n", "\r") 28 | 29 | 30 | def test_guess_bytes_null(): 31 | bowdlerized_null = b"null\xc0\x80separated" 32 | result_str, result_encoding = guess_bytes(bowdlerized_null) 33 | assert result_str == "null\x00separated" 34 | assert result_encoding == "utf-8-variants" 35 | 36 | 37 | def test_incomplete_sequences(): 38 | test_bytes = b"surrogates: \xed\xa0\x80\xed\xb0\x80 / null: \xc0\x80" 39 | test_string = "surrogates: \U00010000 / null: \x00" 40 | 41 | # Test that we can feed this string to decode() in multiple pieces, and no 42 | # matter where the break between those pieces is, we get the same result. 43 | for split_point in range(len(test_string) + 1): 44 | left = test_bytes[:split_point] 45 | right = test_bytes[split_point:] 46 | 47 | decoder = IncrementalDecoder() 48 | got = decoder.decode(left, final=False) 49 | got += decoder.decode(right) 50 | assert got == test_string 51 | -------------------------------------------------------------------------------- /tests/test_characters.py: -------------------------------------------------------------------------------- 1 | from ftfy import ( 2 | fix_and_explain, 3 | fix_encoding, 4 | fix_text, 5 | ) 6 | from ftfy.chardata import possible_encoding 7 | from ftfy.fixes import fix_surrogates, remove_control_chars 8 | 9 | 10 | def test_possible_encoding(): 11 | for codept in range(256): 12 | char = chr(codept) 13 | assert possible_encoding(char, "latin-1") 14 | 15 | 16 | def test_byte_order_mark(): 17 | assert fix_encoding("") == "\ufeff" 18 | 19 | 20 | def test_control_chars(): 21 | text = ( 22 | "\ufeffSometimes, \ufffcbad ideas \x7f\ufffalike these characters\ufffb " 23 | "\u206aget standardized.\r\n" 24 | ) 25 | fixed = "Sometimes, bad ideas like these characters get standardized.\r\n" 26 | assert remove_control_chars(text) == fixed 27 | 28 | 29 | def test_welsh_flag(): 30 | # ftfy used to remove "tag characters", but they have been repurposed in the 31 | # "Flag of England", "Flag of Scotland", and "Flag of Wales" emoji sequences. 32 | text = "This flag has a dragon on it 🏴󠁧󠁢󠁷󠁬󠁳󠁿" 33 | assert remove_control_chars(text) == text 34 | 35 | 36 | def test_ohio_flag(): 37 | # I did not expect to find the "Flag of Ohio" emoji in the wild but there it is. 38 | # Test that this emoji (which no emoji database believes has been implemented) 39 | # passes through unchanged. 
40 |     text = "#superman #ohio 🏴\U000e0075\U000e0073\U000e006f\U000e0068\U000e007f #cleveland #usa 🇺🇸" 41 |     assert fix_text(text) == text 42 | 43 | 44 | def test_surrogates(): 45 |     assert fix_surrogates("\udbff\udfff") == "\U0010ffff" 46 |     assert fix_surrogates("\ud800\udc00") == "\U00010000" 47 | 48 | 49 | def test_color_escapes(): 50 |     fixed, plan = fix_and_explain("\001\033[36;44mfoo") 51 |     print(plan) 52 |     assert fixed == "foo" 53 |     assert plan == [ 54 |         ("apply", "remove_terminal_escapes"), 55 |         ("apply", "remove_control_chars"), 56 |     ] 57 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | # Get the filename of 'face.txt', an example of mojibake 8 | THIS_DIR = Path(__file__).parent 9 | TEST_FILENAME = THIS_DIR / "face.txt" 10 | CORRECT_OUTPUT = os.linesep.join(["┒(⌣˛⌣)┎", ""]) 11 | FAILED_OUTPUT = os.linesep.join( 12 |     [ 13 |         "ftfy error:", 14 |         "This input couldn't be decoded as 'windows-1252'. We got the following error:", 15 |         "", 16 |         "    'charmap' codec can't decode byte 0x9d in position 4: character maps to <undefined>", 17 |         "", 18 |         "ftfy works best when its input is in a known encoding. You can use `ftfy -g`", 19 |         "to guess, if you're desperate. Otherwise, give the encoding name with the", 20 |         "`-e` option, such as `ftfy -e latin-1`.", 21 |         "", 22 |     ] 23 | ) 24 | 25 | 26 | def get_command_output(args, stdin=None): 27 |     return subprocess.check_output(args, stdin=stdin, stderr=subprocess.STDOUT, timeout=5).decode( 28 |         "utf-8" 29 |     ) 30 | 31 | 32 | def test_basic(): 33 |     output = get_command_output(["ftfy", TEST_FILENAME]) 34 |     assert output == CORRECT_OUTPUT 35 | 36 | 37 | def test_guess_bytes(): 38 |     output = get_command_output(["ftfy", "-g", TEST_FILENAME]) 39 |     assert output == CORRECT_OUTPUT 40 | 41 | 42 | def test_alternate_encoding(): 43 |     # The file isn't really in Windows-1252. But that's a problem ftfy 44 |     # can fix, if it's allowed to be sloppy when reading the file. 45 |     output = get_command_output(["ftfy", "-e", "sloppy-windows-1252", TEST_FILENAME]) 46 |     assert output == CORRECT_OUTPUT 47 | 48 | 49 | def test_wrong_encoding(): 50 |     # It's more of a problem when the file doesn't actually decode.
51 | with pytest.raises(subprocess.CalledProcessError) as exception: 52 | get_command_output(["ftfy", "-e", "windows-1252", TEST_FILENAME]) 53 | assert exception.value.output.decode("utf-8") == FAILED_OUTPUT 54 | 55 | 56 | def test_same_file(): 57 | with pytest.raises(subprocess.CalledProcessError) as exception: 58 | get_command_output(["ftfy", TEST_FILENAME, "-o", TEST_FILENAME]) 59 | error = exception.value.output.decode("utf-8") 60 | assert error.startswith("ftfy error:") 61 | assert "Can't read and write the same file" in error 62 | 63 | 64 | def test_stdin(): 65 | with TEST_FILENAME.open("rb") as infile: 66 | output = get_command_output(["ftfy"], stdin=infile) 67 | assert output == CORRECT_OUTPUT 68 | -------------------------------------------------------------------------------- /tests/test_encodings.py: -------------------------------------------------------------------------------- 1 | from ftfy import bad_codecs, guess_bytes 2 | 3 | 4 | def test_cesu8(): 5 | cls1 = bad_codecs.search_function("cesu8").__class__ 6 | cls2 = bad_codecs.search_function("cesu-8").__class__ 7 | assert cls1 == cls2 8 | 9 | test_bytes = b"\xed\xa6\x9d\xed\xbd\xb7 is an unassigned character, and \xc0\x80 is null" 10 | test_text = "\U00077777 is an unassigned character, and \x00 is null" 11 | assert test_bytes.decode("cesu8") == test_text 12 | 13 | 14 | def test_russian_crash(): 15 | thebytes = b"\xe8\xed\xe2\xe5\xed\xf2\xe0\xf0\xe8\xe7\xe0\xf6\xe8\xff " 16 | # We don't care what the result is, but this shouldn't crash 17 | thebytes.decode("utf-8-variants", "replace") 18 | 19 | # This shouldn't crash either 20 | guess_bytes(thebytes) 21 | -------------------------------------------------------------------------------- /tests/test_entities.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ftfy import fix_text, fix_text_segment 4 | from ftfy.fixes import unescape_html 5 | 6 | 7 | def test_entities(): 8 | example = "&\n\n&" 9 | assert fix_text(example) == "&\n\n&" 10 | assert fix_text_segment(example) == "&\n\n&" 11 | 12 | assert fix_text(example, unescape_html=True) == "&\n\n&" 13 | assert fix_text_segment(example, unescape_html=True) == "&\n\n&" 14 | 15 | assert fix_text(example, unescape_html=False) == "&\n\n&" 16 | assert fix_text_segment(example, unescape_html=False) == "&\n\n&" 17 | 18 | assert fix_text_segment("<>", unescape_html=False) == "<>" 19 | assert fix_text_segment("<>", unescape_html=True) == "<>" 20 | assert fix_text_segment("<>") == "<>" 21 | assert fix_text_segment("jednocześnie") == "jednocześnie" 22 | assert fix_text_segment("JEDNOCZEŚNIE") == "JEDNOCZEŚNIE" 23 | assert fix_text_segment("ellipsis…", normalization="NFKC") == "ellipsis..." 24 | assert fix_text_segment("ellipsis…", normalization="NFKC") == "ellipsis..." 
25 | assert fix_text_segment("broken") == "broken\x81" 26 | assert fix_text_segment("&amp;amp;") == "&" 27 | assert unescape_html("euro €") == "euro €" 28 | assert unescape_html("EURO &EURO;") == "EURO €" 29 | assert unescape_html("not an entity x6;") == "not an entity x6;" 30 | assert unescape_html("JEDNOCZE&SACUTE;NIE") == "JEDNOCZEŚNIE" 31 | assert unescape_html("V&SCARON;ICHNI") == "VŠICHNI" 32 | assert unescape_html("￿") == "" 33 | assert unescape_html("�") == "\ufffd" 34 | assert ( 35 | fix_text_segment("this is just informal english ¬ html") 36 | == "this is just informal english ¬ html" 37 | ) 38 | 39 | 40 | def test_old_parameter_name(): 41 | example = "&\n\n&" 42 | with pytest.deprecated_call(): 43 | assert fix_text(example, fix_entities=True) == "&\n\n&" 44 | with pytest.deprecated_call(): 45 | assert fix_text(example, fix_entities=False) == "&\n\n&" 46 | -------------------------------------------------------------------------------- /tests/test_examples_in_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test ftfy's fixes using the data in `test_cases.json`. 3 | 4 | I collected many test cases by listening to the Twitter streaming API for 5 | millions of tweets, picking out examples with high weirdness, and seeing what 6 | ftfy decoded them to. There are some impressive things that can happen to text, 7 | even in an ecosystem that is supposedly entirely UTF-8. 8 | 9 | Some examples come from the Common Crawl (particularly those involving 10 | Windows-1250 mojibake, which is more common on arbitrary Web pages than on 11 | Twitter), and some examples marked as 'synthetic' are contrived to test 12 | particular features of ftfy. 13 | 14 | Each test case is a dictionary containing the following items: 15 | 16 | - "label": a label that will identify the test case in nosetests 17 | - "original": the text to be ftfy'd 18 | - "fixed": what the result of ftfy.fix_text should be on this text 19 | 20 | There are also two optional fields: 21 | 22 | - "fixed-encoding": what the result of just ftfy.fix_encoding should be. 23 | If missing, it will be considered to be the same as "fixed". 24 | - "comment": possibly-enlightening commentary on the test case. 
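Every test case also carries an "expect" field, set to "pass" or "fail", recording
whether ftfy is currently expected to get the case right; the "fail" cases are run
as expected failures. As an illustration, the first entry of
tests/test-cases/language-names.json looks like this:

    {
        "label": "Messy language names: Czech",
        "comment": "This and several following examples came from the same language selector",
        "original": "ÄŒeÅ¡tina",
        "fixed": "Čeština",
        "expect": "pass"
    }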
25 | """ 26 | 27 | import json 28 | from pathlib import Path 29 | 30 | import pytest 31 | 32 | from ftfy import apply_plan, fix_and_explain, fix_encoding_and_explain, fix_text 33 | 34 | THIS_DIR = Path(__file__).parent 35 | TEST_CASE_DIR = THIS_DIR / "test-cases" 36 | 37 | 38 | def load_test_data() -> list[dict]: 39 | test_data = [] 40 | for filepath in TEST_CASE_DIR.glob("*.json"): 41 | test_data.extend(json.load(filepath.open())) 42 | return test_data 43 | 44 | 45 | TEST_DATA = load_test_data() 46 | 47 | TESTS_THAT_PASS = [test for test in TEST_DATA if test["expect"] == "pass"] 48 | TESTS_THAT_FAIL = [test for test in TEST_DATA if test["expect"] == "fail"] 49 | 50 | 51 | @pytest.mark.parametrize("test_case", TEST_DATA) 52 | def test_well_formed_example(test_case): 53 | assert test_case["expect"] in ("pass", "fail") 54 | 55 | 56 | @pytest.mark.parametrize("test_case", TESTS_THAT_PASS) 57 | def test_json_example(test_case): 58 | # Run one example from the data file 59 | orig = test_case["original"] 60 | fixed = test_case["fixed"] 61 | 62 | # Make sure that we can fix the text as intended 63 | assert fix_text(orig) == fixed 64 | 65 | # Make sure that fix_and_explain outputs a plan that we can successfully 66 | # run to reproduce its result 67 | fixed_output, plan = fix_and_explain(orig) 68 | assert apply_plan(orig, plan) == fixed_output 69 | 70 | # Do the same for fix_encoding_and_explain 71 | encoding_fix, plan = fix_encoding_and_explain(orig) 72 | assert apply_plan(orig, plan) == encoding_fix 73 | 74 | # Ask for the encoding fix a different way, by disabling all the other steps 75 | # in the config object 76 | assert ( 77 | fix_text( 78 | orig, 79 | unescape_html=False, 80 | remove_terminal_escapes=False, 81 | fix_character_width=False, 82 | fix_latin_ligatures=False, 83 | uncurl_quotes=False, 84 | fix_line_breaks=False, 85 | fix_surrogates=False, 86 | remove_control_chars=False, 87 | normalization=None, 88 | ) 89 | == encoding_fix 90 | ) 91 | 92 | # Make sure we can decode the text as intended 93 | assert fix_text(orig) == fixed 94 | assert encoding_fix == test_case.get("fixed-encoding", fixed) 95 | 96 | # Make sure we can decode as intended even with an extra layer of badness 97 | extra_bad = orig.encode("utf-8").decode("latin-1") 98 | assert fix_text(extra_bad) == fixed 99 | 100 | 101 | @pytest.mark.parametrize("test_case", TESTS_THAT_FAIL) 102 | @pytest.mark.xfail(strict=True) 103 | def test_failing_json_example(test_case): 104 | # Run an example from the data file that we believe will fail, due to 105 | # ftfy's heuristic being insufficient 106 | orig = test_case["original"] 107 | fixed = test_case["fixed"] 108 | 109 | encoding_fix, plan = fix_encoding_and_explain(orig) 110 | assert encoding_fix == test_case.get("fixed-encoding", fixed) 111 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py39, py310, py311, py312, py313 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | wcwidth 8 | commands = pytest 9 | --------------------------------------------------------------------------------