├── .gitignore ├── LICENSE ├── README.md ├── pyproject.toml ├── sanitext ├── __init__.py ├── cli.py ├── emoji_set.py ├── homoglyph_map.py └── text_sanitization.py ├── tests ├── conftest.py ├── test_cli.py ├── test_homoglyph_map.py └── test_text_sanitization.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | poetry.lock 173 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Panayiotis Panayiotou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sanitext 2 | 3 | **Sanitize text from LLMs** 4 | 5 | Sanitext is a **command-line tool** and **Python library** for detecting and removing unwanted characters in text. It supports: 6 | 7 | - ASCII-only sanitization (default) 8 | - Custom character allowlists (`--allow-chars`, `--allow-file`) 9 | - Interactive review of non-allowed characters (`--interactive`) 10 | 11 | ## Installation 12 | 13 | ```bash 14 | pip install sanitext 15 | ``` 16 | 17 | By default, sanitext uses the string in your clipboard unless you specify one with `--string`. 18 | 19 | ## CLI usage example 20 | 21 | ```bash 22 | # Process the clipboard content & copy back to clipboard 23 | sanitext 24 | # Detect characters but don't modify 25 | sanitext --detect 26 | # Process clipboard + show detected characters (most common command) 27 | sanitext -v 28 | # Process clipboard + show input, detected characters & output 29 | sanitext -vv 30 | # Process the provided string and print it 31 | sanitext --string "Héllø, 𝒲𝑜𝓇𝓁𝒹!" 32 | # Allow additional characters (for now, only single unicode code point characters) 33 | sanitext --allow-chars "αøñç" 34 | # Allow characters from a file 35 | sanitext --allow-file allowed_chars.txt 36 | # Allow single code point emoji 37 | sanitext --allow-emoji 38 | # Prompt user for handling disallowed characters 39 | # y (Yes) -> keep it 40 | # n (No) -> remove it 41 | # r (Replace) -> provide a replacement character 42 | sanitext --interactive 43 | # Allow emojis 44 | sanitext --allow-emoji 45 | ``` 46 | 47 | ## Python library usage example 48 | 49 | ```python 50 | from sanitext.text_sanitization import ( 51 | sanitize_text, 52 | detect_suspicious_characters, 53 | get_allowed_characters, 54 | ) 55 | 56 | text = "“2×3 – 4 = 5”😎󠅒󠅟󠅣󠅣" 57 | 58 | # Detect suspicious characters 59 | suspicious_characters = detect_suspicious_characters(text) 60 | print(f"Suspicious characters: {suspicious_characters}") 61 | # [('“', 'LEFT DOUBLE QUOTATION MARK'), ('×', 'MULTIPLICATION SIGN'), ('–', 'EN DASH'), ('”', 'RIGHT DOUBLE QUOTATION MARK')] 62 | 63 | # Sanitize text to all ASCII 64 | sanitized_text = sanitize_text(text) 65 | print(f"Sanitized text: {sanitized_text}") # "2x3 - 4 = 5" 66 | # Allow the multiplication sign 67 | allowed_characters = get_allowed_characters() 68 | allowed_characters.add("×") 69 | sanitized_text = sanitize_text(text, allowed_characters=allowed_characters) 70 | print(f"Sanitized text: {sanitized_text}") # "2×3 - 4 = 5" 71 | # Allow the emoji (but clean it from the encoded message "boss") 72 | allowed_characters = get_allowed_characters(allow_emoji=True) 73 | sanitized_text = sanitize_text(text, allowed_characters=allowed_characters) 74 | print(f"Sanitized text: {sanitized_text}") # "2x3 - 4 = 5"😎 75 | ``` 76 | 77 | ## Dev setup 78 | 79 | ```bash 80 | # Install dependencies 81 | poetry install 82 | # Use it 83 | poetry run python sanitext/cli.py --help 84 | poetry run python sanitext/cli.py --string "your string" 85 | # Run tests 86 | poetry run pytest 87 | poetry run pytest -s tests/test_cli.py 88 | # Run tests over different python versions (TODO: setup github action) 89 | poetry run tox 90 | # Publish to PyPI 91 | poetry build 92 | poetry publish 93 | ``` 94 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "sanitext" 3 | version = "0.1.0" 4 | description = "" 5 | authors = [ 6 | {name = "panispani",email = "p.panayiotou2@gmail.com"} 7 | ] 8 | license = {text = "MIT"} 9 | readme = "README.md" 10 | requires-python = ">=3.9,<4.0" 11 | dependencies = [ 12 | "typer (>=0.15.2,<0.16.0)", 13 | "pyperclip (>=1.9.0,<2.0.0)" 14 | ] 15 | 16 | 17 | [build-system] 18 | requires = ["poetry-core>=2.0.0,<3.0.0"] 19 | build-backend = "poetry.core.masonry.api" 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | pytest = "^8.3.5" 23 | tox = "^4.24.1" 24 | 25 | [project.scripts] 26 | sanitext = "sanitext.cli:app" 27 | 28 | [project.urls] 29 | Article = "https://www.panispani.com/blog/2025/sanitext/" 30 | Repository = "https://github.com/panispani/sanitext" -------------------------------------------------------------------------------- /sanitext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/panispani/sanitext/97d7aa3cd901e20b57ebb167151286a8752f468f/sanitext/__init__.py -------------------------------------------------------------------------------- /sanitext/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | sanitext: A command-line tool and Python library for text sanitization. 3 | 4 | Features: 5 | - Detect suspicious characters in text. 6 | - Sanitize text by removing or replacing non-allowed characters. 7 | - Customizable character filtering: 8 | - By default, only allows ASCII printable characters. 9 | - Optionally allow Unicode characters (--allow-unicode). 10 | - Specify additional allowed characters (--allow-chars). 11 | - Load a file containing allowed characters (--allow-file). 12 | - Interactive mode (--interactive): 13 | - Manually decide what to do with disallowed characters (keep, remove, replace). 14 | 15 | Usage examples: 16 | - sanitext --detect # Detect characters only 17 | - sanitext --string "text" # Process the provided string and print it 18 | - sanitext # Process the clipboard string, copy to clipboard, print if unchanged 19 | - sanitext --verbose # Process + show detected info 20 | - sanitext --very-verbose # Process + show input, detected info, and output 21 | - sanitext --allow-chars "αñøç" # Allow additional characters (only single unicode code point) 22 | - sanitext --allow-file allowed_chars.txt # Allow characters from a file 23 | - sanitext --allow-emoji # Allow single code point emoji 24 | - sanitext --interactive # Prompt user for handling disallowed characters 25 | """ 26 | 27 | import pyperclip 28 | import typer 29 | from pathlib import Path 30 | 31 | from sanitext.text_sanitization import ( 32 | detect_suspicious_characters, 33 | sanitize_text, 34 | get_allowed_characters, 35 | ) 36 | 37 | 38 | app = typer.Typer() 39 | 40 | 41 | @app.command() 42 | def main( 43 | detect: bool = typer.Option( 44 | False, "--detect", "-d", help="Detect characters only." 45 | ), 46 | string: str = typer.Option( 47 | None, "--string", "-s", help="Process the provided string and print it." 48 | ), 49 | verbose: bool = typer.Option( 50 | False, "--verbose", "-v", help="Verbose mode (process + show detected info)." 51 | ), 52 | very_verbose: bool = typer.Option( 53 | False, 54 | "--very-verbose", 55 | "-vv", 56 | help="Very verbose mode (process + show input, detected info, and output).", 57 | ), 58 | allow_chars: str = typer.Option( 59 | None, 60 | "--allow-chars", 61 | help='Additional characters to allow, e.g. --allow-chars "αñøç"', 62 | ), 63 | allow_emoji: bool = typer.Option( 64 | False, 65 | "--allow-emoji", 66 | help='Allow single code point emoji"', # TODO: extend to multiple codepoints 67 | ), 68 | allow_file: Path = typer.Option( 69 | None, 70 | "--allow-file", 71 | help="Path to a file containing characters to allow (one big string or multiple lines).", 72 | exists=True, 73 | file_okay=True, 74 | dir_okay=False, 75 | readable=True, 76 | ), 77 | interactive: bool = typer.Option( 78 | False, 79 | "--interactive", 80 | "-i", 81 | help="Interactive prompt for disallowed characters.", 82 | ), 83 | ): 84 | # Get text from either CLI or clipboard 85 | text = string if string is not None else pyperclip.paste() 86 | if not text: 87 | typer.echo( 88 | "Error: No text provided (clipboard is empty and no string was given).", 89 | err=True, 90 | ) 91 | raise typer.Exit(1) 92 | 93 | allowed_characters = get_allowed_characters( 94 | allow_chars=allow_chars, 95 | allow_file=allow_file, 96 | allow_emoji=allow_emoji, 97 | ) 98 | 99 | # If detection-only, just do detection and exit 100 | if detect: 101 | detected_info = detect_suspicious_characters( 102 | text, allowed_characters=allowed_characters 103 | ) 104 | typer.echo(f"Detected: {detected_info}") 105 | raise typer.Exit(0) 106 | 107 | # Otherwise, sanitize 108 | processed_text = sanitize_text( 109 | text, 110 | allowed_characters=allowed_characters, 111 | interactive=interactive, 112 | ) 113 | 114 | if very_verbose: 115 | detected_info = detect_suspicious_characters( 116 | text, allowed_characters=allowed_characters 117 | ) 118 | typer.echo(f"Input: {text}") 119 | typer.echo(f"Detected: {detected_info}") 120 | typer.echo(f"Output: {processed_text}") 121 | elif verbose: 122 | detected_info = detect_suspicious_characters( 123 | text, allowed_characters=allowed_characters 124 | ) 125 | typer.echo(f"Detected: {detected_info}") 126 | 127 | # If no `--string`, copy back to clipboard 128 | if string is None: 129 | if processed_text != text: 130 | pyperclip.copy(processed_text) 131 | typer.echo("Processed and copied to clipboard.") 132 | else: 133 | typer.echo("No changes!") 134 | else: 135 | typer.echo(processed_text) 136 | 137 | 138 | if __name__ == "__main__": 139 | app() 140 | -------------------------------------------------------------------------------- /sanitext/emoji_set.py: -------------------------------------------------------------------------------- 1 | # Extracted from https://unicode.org/emoji/charts/emoji-list.html 2 | # Only single unicode code points supported for now. 3 | EMOJI_SET = { 4 | chr(0x1F600), # 😀 5 | chr(0x1F603), # 😃 6 | chr(0x1F604), # 😄 7 | chr(0x1F601), # 😁 8 | chr(0x1F606), # 😆 9 | chr(0x1F605), # 😅 10 | chr(0x1F923), # 🤣 11 | chr(0x1F602), # 😂 12 | chr(0x1F642), # 🙂 13 | chr(0x1F643), # 🙃 14 | chr(0x1FAE0), # 🫠 15 | chr(0x1F609), # 😉 16 | chr(0x1F60A), # 😊 17 | chr(0x1F607), # 😇 18 | chr(0x1F970), # 🥰 19 | chr(0x1F60D), # 😍 20 | chr(0x1F929), # 🤩 21 | chr(0x1F618), # 😘 22 | chr(0x1F617), # 😗 23 | chr(0x263A), # ☺ 24 | chr(0x1F61A), # 😚 25 | chr(0x1F619), # 😙 26 | chr(0x1F972), # 🥲 27 | chr(0x1F60B), # 😋 28 | chr(0x1F61B), # 😛 29 | chr(0x1F61C), # 😜 30 | chr(0x1F92A), # 🤪 31 | chr(0x1F61D), # 😝 32 | chr(0x1F911), # 🤑 33 | chr(0x1F917), # 🤗 34 | chr(0x1F92D), # 🤭 35 | chr(0x1FAE2), # 🫢 36 | chr(0x1FAE3), # 🫣 37 | chr(0x1F92B), # 🤫 38 | chr(0x1F914), # 🤔 39 | chr(0x1FAE1), # 🫡 40 | chr(0x1F910), # 🤐 41 | chr(0x1F928), # 🤨 42 | chr(0x1F610), # 😐 43 | chr(0x1F611), # 😑 44 | chr(0x1F636), # 😶 45 | chr(0x1FAE5), # 🫥 46 | chr(0x1F60F), # 😏 47 | chr(0x1F612), # 😒 48 | chr(0x1F644), # 🙄 49 | chr(0x1F62C), # 😬 50 | chr(0x1F925), # 🤥 51 | chr(0x1FAE8), # 🫨 52 | chr(0x1F60C), # 😌 53 | chr(0x1F614), # 😔 54 | chr(0x1F62A), # 😪 55 | chr(0x1F924), # 🤤 56 | chr(0x1F634), # 😴 57 | chr(0x1FAE9), # 🫩 58 | chr(0x1F637), # 😷 59 | chr(0x1F912), # 🤒 60 | chr(0x1F915), # 🤕 61 | chr(0x1F922), # 🤢 62 | chr(0x1F92E), # 🤮 63 | chr(0x1F927), # 🤧 64 | chr(0x1F975), # 🥵 65 | chr(0x1F976), # 🥶 66 | chr(0x1F974), # 🥴 67 | chr(0x1F635), # 😵 68 | chr(0x1F92F), # 🤯 69 | chr(0x1F920), # 🤠 70 | chr(0x1F973), # 🥳 71 | chr(0x1F978), # 🥸 72 | chr(0x1F60E), # 😎 73 | chr(0x1F913), # 🤓 74 | chr(0x1F9D0), # 🧐 75 | chr(0x1F615), # 😕 76 | chr(0x1FAE4), # 🫤 77 | chr(0x1F61F), # 😟 78 | chr(0x1F641), # 🙁 79 | chr(0x2639), # ☹ 80 | chr(0x1F62E), # 😮 81 | chr(0x1F62F), # 😯 82 | chr(0x1F632), # 😲 83 | chr(0x1F633), # 😳 84 | chr(0x1F97A), # 🥺 85 | chr(0x1F979), # 🥹 86 | chr(0x1F626), # 😦 87 | chr(0x1F627), # 😧 88 | chr(0x1F628), # 😨 89 | chr(0x1F630), # 😰 90 | chr(0x1F625), # 😥 91 | chr(0x1F622), # 😢 92 | chr(0x1F62D), # 😭 93 | chr(0x1F631), # 😱 94 | chr(0x1F616), # 😖 95 | chr(0x1F623), # 😣 96 | chr(0x1F61E), # 😞 97 | chr(0x1F613), # 😓 98 | chr(0x1F629), # 😩 99 | chr(0x1F62B), # 😫 100 | chr(0x1F971), # 🥱 101 | chr(0x1F624), # 😤 102 | chr(0x1F621), # 😡 103 | chr(0x1F620), # 😠 104 | chr(0x1F92C), # 🤬 105 | chr(0x1F608), # 😈 106 | chr(0x1F47F), # 👿 107 | chr(0x1F480), # 💀 108 | chr(0x2620), # ☠ 109 | chr(0x1F4A9), # 💩 110 | chr(0x1F921), # 🤡 111 | chr(0x1F479), # 👹 112 | chr(0x1F47A), # 👺 113 | chr(0x1F47B), # 👻 114 | chr(0x1F47D), # 👽 115 | chr(0x1F47E), # 👾 116 | chr(0x1F916), # 🤖 117 | chr(0x1F63A), # 😺 118 | chr(0x1F638), # 😸 119 | chr(0x1F639), # 😹 120 | chr(0x1F63B), # 😻 121 | chr(0x1F63C), # 😼 122 | chr(0x1F63D), # 😽 123 | chr(0x1F640), # 🙀 124 | chr(0x1F63F), # 😿 125 | chr(0x1F63E), # 😾 126 | chr(0x1F648), # 🙈 127 | chr(0x1F649), # 🙉 128 | chr(0x1F64A), # 🙊 129 | chr(0x1F48C), # 💌 130 | chr(0x1F498), # 💘 131 | chr(0x1F49D), # 💝 132 | chr(0x1F496), # 💖 133 | chr(0x1F497), # 💗 134 | chr(0x1F493), # 💓 135 | chr(0x1F49E), # 💞 136 | chr(0x1F495), # 💕 137 | chr(0x1F49F), # 💟 138 | chr(0x2763), # ❣ 139 | chr(0x1F494), # 💔 140 | chr(0x2764), # ❤ 141 | chr(0x1FA77), # 🩷 142 | chr(0x1F9E1), # 🧡 143 | chr(0x1F49B), # 💛 144 | chr(0x1F49A), # 💚 145 | chr(0x1F499), # 💙 146 | chr(0x1FA75), # 🩵 147 | chr(0x1F49C), # 💜 148 | chr(0x1F90E), # 🤎 149 | chr(0x1F5A4), # 🖤 150 | chr(0x1FA76), # 🩶 151 | chr(0x1F90D), # 🤍 152 | chr(0x1F48B), # 💋 153 | chr(0x1F4AF), # 💯 154 | chr(0x1F4A2), # 💢 155 | chr(0x1F4A5), # 💥 156 | chr(0x1F4AB), # 💫 157 | chr(0x1F4A6), # 💦 158 | chr(0x1F4A8), # 💨 159 | chr(0x1F573), # 🕳 160 | chr(0x1F4AC), # 💬 161 | chr(0x1F5E8), # 🗨 162 | chr(0x1F5EF), # 🗯 163 | chr(0x1F4AD), # 💭 164 | chr(0x1F4A4), # 💤 165 | chr(0x1F44B), # 👋 166 | chr(0x1F91A), # 🤚 167 | chr(0x1F590), # 🖐 168 | chr(0x270B), # ✋ 169 | chr(0x1F596), # 🖖 170 | chr(0x1FAF1), # 🫱 171 | chr(0x1FAF2), # 🫲 172 | chr(0x1FAF3), # 🫳 173 | chr(0x1FAF4), # 🫴 174 | chr(0x1FAF7), # 🫷 175 | chr(0x1FAF8), # 🫸 176 | chr(0x1F44C), # 👌 177 | chr(0x1F90C), # 🤌 178 | chr(0x1F90F), # 🤏 179 | chr(0x270C), # ✌ 180 | chr(0x1F91E), # 🤞 181 | chr(0x1FAF0), # 🫰 182 | chr(0x1F91F), # 🤟 183 | chr(0x1F918), # 🤘 184 | chr(0x1F919), # 🤙 185 | chr(0x1F448), # 👈 186 | chr(0x1F449), # 👉 187 | chr(0x1F446), # 👆 188 | chr(0x1F595), # 🖕 189 | chr(0x1F447), # 👇 190 | chr(0x261D), # ☝ 191 | chr(0x1FAF5), # 🫵 192 | chr(0x1F44D), # 👍 193 | chr(0x1F44E), # 👎 194 | chr(0x270A), # ✊ 195 | chr(0x1F44A), # 👊 196 | chr(0x1F91B), # 🤛 197 | chr(0x1F91C), # 🤜 198 | chr(0x1F44F), # 👏 199 | chr(0x1F64C), # 🙌 200 | chr(0x1FAF6), # 🫶 201 | chr(0x1F450), # 👐 202 | chr(0x1F932), # 🤲 203 | chr(0x1F91D), # 🤝 204 | chr(0x1F64F), # 🙏 205 | chr(0x270D), # ✍ 206 | chr(0x1F485), # 💅 207 | chr(0x1F933), # 🤳 208 | chr(0x1F4AA), # 💪 209 | chr(0x1F9BE), # 🦾 210 | chr(0x1F9BF), # 🦿 211 | chr(0x1F9B5), # 🦵 212 | chr(0x1F9B6), # 🦶 213 | chr(0x1F442), # 👂 214 | chr(0x1F9BB), # 🦻 215 | chr(0x1F443), # 👃 216 | chr(0x1F9E0), # 🧠 217 | chr(0x1FAC0), # 🫀 218 | chr(0x1FAC1), # 🫁 219 | chr(0x1F9B7), # 🦷 220 | chr(0x1F9B4), # 🦴 221 | chr(0x1F440), # 👀 222 | chr(0x1F441), # 👁 223 | chr(0x1F445), # 👅 224 | chr(0x1F444), # 👄 225 | chr(0x1FAE6), # 🫦 226 | chr(0x1F476), # 👶 227 | chr(0x1F9D2), # 🧒 228 | chr(0x1F466), # 👦 229 | chr(0x1F467), # 👧 230 | chr(0x1F9D1), # 🧑 231 | chr(0x1F471), # 👱 232 | chr(0x1F468), # 👨 233 | chr(0x1F9D4), # 🧔 234 | chr(0x1F469), # 👩 235 | chr(0x1F9D3), # 🧓 236 | chr(0x1F474), # 👴 237 | chr(0x1F475), # 👵 238 | chr(0x1F64D), # 🙍 239 | chr(0x1F64E), # 🙎 240 | chr(0x1F645), # 🙅 241 | chr(0x1F646), # 🙆 242 | chr(0x1F481), # 💁 243 | chr(0x1F64B), # 🙋 244 | chr(0x1F9CF), # 🧏 245 | chr(0x1F647), # 🙇 246 | chr(0x1F926), # 🤦 247 | chr(0x1F937), # 🤷 248 | chr(0x1F46E), # 👮 249 | chr(0x1F575), # 🕵 250 | chr(0x1F482), # 💂 251 | chr(0x1F977), # 🥷 252 | chr(0x1F477), # 👷 253 | chr(0x1FAC5), # 🫅 254 | chr(0x1F934), # 🤴 255 | chr(0x1F478), # 👸 256 | chr(0x1F473), # 👳 257 | chr(0x1F472), # 👲 258 | chr(0x1F9D5), # 🧕 259 | chr(0x1F935), # 🤵 260 | chr(0x1F470), # 👰 261 | chr(0x1F930), # 🤰 262 | chr(0x1FAC3), # 🫃 263 | chr(0x1FAC4), # 🫄 264 | chr(0x1F931), # 🤱 265 | chr(0x1F47C), # 👼 266 | chr(0x1F385), # 🎅 267 | chr(0x1F936), # 🤶 268 | chr(0x1F9B8), # 🦸 269 | chr(0x1F9B9), # 🦹 270 | chr(0x1F9D9), # 🧙 271 | chr(0x1F9DA), # 🧚 272 | chr(0x1F9DB), # 🧛 273 | chr(0x1F9DC), # 🧜 274 | chr(0x1F9DD), # 🧝 275 | chr(0x1F9DE), # 🧞 276 | chr(0x1F9DF), # 🧟 277 | chr(0x1F9CC), # 🧌 278 | chr(0x1F486), # 💆 279 | chr(0x1F487), # 💇 280 | chr(0x1F6B6), # 🚶 281 | chr(0x1F9CD), # 🧍 282 | chr(0x1F9CE), # 🧎 283 | chr(0x1F3C3), # 🏃 284 | chr(0x1F483), # 💃 285 | chr(0x1F57A), # 🕺 286 | chr(0x1F574), # 🕴 287 | chr(0x1F46F), # 👯 288 | chr(0x1F9D6), # 🧖 289 | chr(0x1F9D7), # 🧗 290 | chr(0x1F93A), # 🤺 291 | chr(0x1F3C7), # 🏇 292 | chr(0x26F7), # ⛷ 293 | chr(0x1F3C2), # 🏂 294 | chr(0x1F3CC), # 🏌 295 | chr(0x1F3C4), # 🏄 296 | chr(0x1F6A3), # 🚣 297 | chr(0x1F3CA), # 🏊 298 | chr(0x26F9), # ⛹ 299 | chr(0x1F3CB), # 🏋 300 | chr(0x1F6B4), # 🚴 301 | chr(0x1F6B5), # 🚵 302 | chr(0x1F938), # 🤸 303 | chr(0x1F93C), # 🤼 304 | chr(0x1F93D), # 🤽 305 | chr(0x1F93E), # 🤾 306 | chr(0x1F939), # 🤹 307 | chr(0x1F9D8), # 🧘 308 | chr(0x1F6C0), # 🛀 309 | chr(0x1F6CC), # 🛌 310 | chr(0x1F46D), # 👭 311 | chr(0x1F46B), # 👫 312 | chr(0x1F46C), # 👬 313 | chr(0x1F48F), # 💏 314 | chr(0x1F491), # 💑 315 | chr(0x1F5E3), # 🗣 316 | chr(0x1F464), # 👤 317 | chr(0x1F465), # 👥 318 | chr(0x1FAC2), # 🫂 319 | chr(0x1F46A), # 👪 320 | chr(0x1F463), # 👣 321 | chr(0x1FAC6), # 🫆 322 | chr(0x1F9B0), # 🦰 323 | chr(0x1F9B1), # 🦱 324 | chr(0x1F9B3), # 🦳 325 | chr(0x1F9B2), # 🦲 326 | chr(0x1F435), # 🐵 327 | chr(0x1F412), # 🐒 328 | chr(0x1F98D), # 🦍 329 | chr(0x1F9A7), # 🦧 330 | chr(0x1F436), # 🐶 331 | chr(0x1F415), # 🐕 332 | chr(0x1F9AE), # 🦮 333 | chr(0x1F429), # 🐩 334 | chr(0x1F43A), # 🐺 335 | chr(0x1F98A), # 🦊 336 | chr(0x1F99D), # 🦝 337 | chr(0x1F431), # 🐱 338 | chr(0x1F408), # 🐈 339 | chr(0x1F981), # 🦁 340 | chr(0x1F42F), # 🐯 341 | chr(0x1F405), # 🐅 342 | chr(0x1F406), # 🐆 343 | chr(0x1F434), # 🐴 344 | chr(0x1FACE), # 🫎 345 | chr(0x1FACF), # 🫏 346 | chr(0x1F40E), # 🐎 347 | chr(0x1F984), # 🦄 348 | chr(0x1F993), # 🦓 349 | chr(0x1F98C), # 🦌 350 | chr(0x1F9AC), # 🦬 351 | chr(0x1F42E), # 🐮 352 | chr(0x1F402), # 🐂 353 | chr(0x1F403), # 🐃 354 | chr(0x1F404), # 🐄 355 | chr(0x1F437), # 🐷 356 | chr(0x1F416), # 🐖 357 | chr(0x1F417), # 🐗 358 | chr(0x1F43D), # 🐽 359 | chr(0x1F40F), # 🐏 360 | chr(0x1F411), # 🐑 361 | chr(0x1F410), # 🐐 362 | chr(0x1F42A), # 🐪 363 | chr(0x1F42B), # 🐫 364 | chr(0x1F999), # 🦙 365 | chr(0x1F992), # 🦒 366 | chr(0x1F418), # 🐘 367 | chr(0x1F9A3), # 🦣 368 | chr(0x1F98F), # 🦏 369 | chr(0x1F99B), # 🦛 370 | chr(0x1F42D), # 🐭 371 | chr(0x1F401), # 🐁 372 | chr(0x1F400), # 🐀 373 | chr(0x1F439), # 🐹 374 | chr(0x1F430), # 🐰 375 | chr(0x1F407), # 🐇 376 | chr(0x1F43F), # 🐿 377 | chr(0x1F9AB), # 🦫 378 | chr(0x1F994), # 🦔 379 | chr(0x1F987), # 🦇 380 | chr(0x1F43B), # 🐻 381 | chr(0x1F428), # 🐨 382 | chr(0x1F43C), # 🐼 383 | chr(0x1F9A5), # 🦥 384 | chr(0x1F9A6), # 🦦 385 | chr(0x1F9A8), # 🦨 386 | chr(0x1F998), # 🦘 387 | chr(0x1F9A1), # 🦡 388 | chr(0x1F43E), # 🐾 389 | chr(0x1F983), # 🦃 390 | chr(0x1F414), # 🐔 391 | chr(0x1F413), # 🐓 392 | chr(0x1F423), # 🐣 393 | chr(0x1F424), # 🐤 394 | chr(0x1F425), # 🐥 395 | chr(0x1F426), # 🐦 396 | chr(0x1F427), # 🐧 397 | chr(0x1F54A), # 🕊 398 | chr(0x1F985), # 🦅 399 | chr(0x1F986), # 🦆 400 | chr(0x1F9A2), # 🦢 401 | chr(0x1F989), # 🦉 402 | chr(0x1F9A4), # 🦤 403 | chr(0x1FAB6), # 🪶 404 | chr(0x1F9A9), # 🦩 405 | chr(0x1F99A), # 🦚 406 | chr(0x1F99C), # 🦜 407 | chr(0x1FABD), # 🪽 408 | chr(0x1FABF), # 🪿 409 | chr(0x1F438), # 🐸 410 | chr(0x1F40A), # 🐊 411 | chr(0x1F422), # 🐢 412 | chr(0x1F98E), # 🦎 413 | chr(0x1F40D), # 🐍 414 | chr(0x1F432), # 🐲 415 | chr(0x1F409), # 🐉 416 | chr(0x1F995), # 🦕 417 | chr(0x1F996), # 🦖 418 | chr(0x1F433), # 🐳 419 | chr(0x1F40B), # 🐋 420 | chr(0x1F42C), # 🐬 421 | chr(0x1F9AD), # 🦭 422 | chr(0x1F41F), # 🐟 423 | chr(0x1F420), # 🐠 424 | chr(0x1F421), # 🐡 425 | chr(0x1F988), # 🦈 426 | chr(0x1F419), # 🐙 427 | chr(0x1F41A), # 🐚 428 | chr(0x1FAB8), # 🪸 429 | chr(0x1FABC), # 🪼 430 | chr(0x1F980), # 🦀 431 | chr(0x1F99E), # 🦞 432 | chr(0x1F990), # 🦐 433 | chr(0x1F991), # 🦑 434 | chr(0x1F9AA), # 🦪 435 | chr(0x1F40C), # 🐌 436 | chr(0x1F98B), # 🦋 437 | chr(0x1F41B), # 🐛 438 | chr(0x1F41C), # 🐜 439 | chr(0x1F41D), # 🐝 440 | chr(0x1FAB2), # 🪲 441 | chr(0x1F41E), # 🐞 442 | chr(0x1F997), # 🦗 443 | chr(0x1FAB3), # 🪳 444 | chr(0x1F577), # 🕷 445 | chr(0x1F578), # 🕸 446 | chr(0x1F982), # 🦂 447 | chr(0x1F99F), # 🦟 448 | chr(0x1FAB0), # 🪰 449 | chr(0x1FAB1), # 🪱 450 | chr(0x1F9A0), # 🦠 451 | chr(0x1F490), # 💐 452 | chr(0x1F338), # 🌸 453 | chr(0x1F4AE), # 💮 454 | chr(0x1FAB7), # 🪷 455 | chr(0x1F3F5), # 🏵 456 | chr(0x1F339), # 🌹 457 | chr(0x1F940), # 🥀 458 | chr(0x1F33A), # 🌺 459 | chr(0x1F33B), # 🌻 460 | chr(0x1F33C), # 🌼 461 | chr(0x1F337), # 🌷 462 | chr(0x1FABB), # 🪻 463 | chr(0x1F331), # 🌱 464 | chr(0x1FAB4), # 🪴 465 | chr(0x1F332), # 🌲 466 | chr(0x1F333), # 🌳 467 | chr(0x1F334), # 🌴 468 | chr(0x1F335), # 🌵 469 | chr(0x1F33E), # 🌾 470 | chr(0x1F33F), # 🌿 471 | chr(0x2618), # ☘ 472 | chr(0x1F340), # 🍀 473 | chr(0x1F341), # 🍁 474 | chr(0x1F342), # 🍂 475 | chr(0x1F343), # 🍃 476 | chr(0x1FAB9), # 🪹 477 | chr(0x1FABA), # 🪺 478 | chr(0x1F344), # 🍄 479 | chr(0x1FABE), # 🪾 480 | chr(0x1F347), # 🍇 481 | chr(0x1F348), # 🍈 482 | chr(0x1F349), # 🍉 483 | chr(0x1F34A), # 🍊 484 | chr(0x1F34B), # 🍋 485 | chr(0x1F34C), # 🍌 486 | chr(0x1F34D), # 🍍 487 | chr(0x1F96D), # 🥭 488 | chr(0x1F34E), # 🍎 489 | chr(0x1F34F), # 🍏 490 | chr(0x1F350), # 🍐 491 | chr(0x1F351), # 🍑 492 | chr(0x1F352), # 🍒 493 | chr(0x1F353), # 🍓 494 | chr(0x1FAD0), # 🫐 495 | chr(0x1F95D), # 🥝 496 | chr(0x1F345), # 🍅 497 | chr(0x1FAD2), # 🫒 498 | chr(0x1F965), # 🥥 499 | chr(0x1F951), # 🥑 500 | chr(0x1F346), # 🍆 501 | chr(0x1F954), # 🥔 502 | chr(0x1F955), # 🥕 503 | chr(0x1F33D), # 🌽 504 | chr(0x1F336), # 🌶 505 | chr(0x1FAD1), # 🫑 506 | chr(0x1F952), # 🥒 507 | chr(0x1F96C), # 🥬 508 | chr(0x1F966), # 🥦 509 | chr(0x1F9C4), # 🧄 510 | chr(0x1F9C5), # 🧅 511 | chr(0x1F95C), # 🥜 512 | chr(0x1FAD8), # 🫘 513 | chr(0x1F330), # 🌰 514 | chr(0x1FADA), # 🫚 515 | chr(0x1FADB), # 🫛 516 | chr(0x1FADC), # 🫜 517 | chr(0x1F35E), # 🍞 518 | chr(0x1F950), # 🥐 519 | chr(0x1F956), # 🥖 520 | chr(0x1FAD3), # 🫓 521 | chr(0x1F968), # 🥨 522 | chr(0x1F96F), # 🥯 523 | chr(0x1F95E), # 🥞 524 | chr(0x1F9C7), # 🧇 525 | chr(0x1F9C0), # 🧀 526 | chr(0x1F356), # 🍖 527 | chr(0x1F357), # 🍗 528 | chr(0x1F969), # 🥩 529 | chr(0x1F953), # 🥓 530 | chr(0x1F354), # 🍔 531 | chr(0x1F35F), # 🍟 532 | chr(0x1F355), # 🍕 533 | chr(0x1F32D), # 🌭 534 | chr(0x1F96A), # 🥪 535 | chr(0x1F32E), # 🌮 536 | chr(0x1F32F), # 🌯 537 | chr(0x1FAD4), # 🫔 538 | chr(0x1F959), # 🥙 539 | chr(0x1F9C6), # 🧆 540 | chr(0x1F95A), # 🥚 541 | chr(0x1F373), # 🍳 542 | chr(0x1F958), # 🥘 543 | chr(0x1F372), # 🍲 544 | chr(0x1FAD5), # 🫕 545 | chr(0x1F963), # 🥣 546 | chr(0x1F957), # 🥗 547 | chr(0x1F37F), # 🍿 548 | chr(0x1F9C8), # 🧈 549 | chr(0x1F9C2), # 🧂 550 | chr(0x1F96B), # 🥫 551 | chr(0x1F371), # 🍱 552 | chr(0x1F358), # 🍘 553 | chr(0x1F359), # 🍙 554 | chr(0x1F35A), # 🍚 555 | chr(0x1F35B), # 🍛 556 | chr(0x1F35C), # 🍜 557 | chr(0x1F35D), # 🍝 558 | chr(0x1F360), # 🍠 559 | chr(0x1F362), # 🍢 560 | chr(0x1F363), # 🍣 561 | chr(0x1F364), # 🍤 562 | chr(0x1F365), # 🍥 563 | chr(0x1F96E), # 🥮 564 | chr(0x1F361), # 🍡 565 | chr(0x1F95F), # 🥟 566 | chr(0x1F960), # 🥠 567 | chr(0x1F961), # 🥡 568 | chr(0x1F366), # 🍦 569 | chr(0x1F367), # 🍧 570 | chr(0x1F368), # 🍨 571 | chr(0x1F369), # 🍩 572 | chr(0x1F36A), # 🍪 573 | chr(0x1F382), # 🎂 574 | chr(0x1F370), # 🍰 575 | chr(0x1F9C1), # 🧁 576 | chr(0x1F967), # 🥧 577 | chr(0x1F36B), # 🍫 578 | chr(0x1F36C), # 🍬 579 | chr(0x1F36D), # 🍭 580 | chr(0x1F36E), # 🍮 581 | chr(0x1F36F), # 🍯 582 | chr(0x1F37C), # 🍼 583 | chr(0x1F95B), # 🥛 584 | chr(0x2615), # ☕ 585 | chr(0x1FAD6), # 🫖 586 | chr(0x1F375), # 🍵 587 | chr(0x1F376), # 🍶 588 | chr(0x1F37E), # 🍾 589 | chr(0x1F377), # 🍷 590 | chr(0x1F378), # 🍸 591 | chr(0x1F379), # 🍹 592 | chr(0x1F37A), # 🍺 593 | chr(0x1F37B), # 🍻 594 | chr(0x1F942), # 🥂 595 | chr(0x1F943), # 🥃 596 | chr(0x1FAD7), # 🫗 597 | chr(0x1F964), # 🥤 598 | chr(0x1F9CB), # 🧋 599 | chr(0x1F9C3), # 🧃 600 | chr(0x1F9C9), # 🧉 601 | chr(0x1F9CA), # 🧊 602 | chr(0x1F962), # 🥢 603 | chr(0x1F37D), # 🍽 604 | chr(0x1F374), # 🍴 605 | chr(0x1F944), # 🥄 606 | chr(0x1F52A), # 🔪 607 | chr(0x1FAD9), # 🫙 608 | chr(0x1F3FA), # 🏺 609 | chr(0x1F30D), # 🌍 610 | chr(0x1F30E), # 🌎 611 | chr(0x1F30F), # 🌏 612 | chr(0x1F310), # 🌐 613 | chr(0x1F5FA), # 🗺 614 | chr(0x1F5FE), # 🗾 615 | chr(0x1F9ED), # 🧭 616 | chr(0x1F3D4), # 🏔 617 | chr(0x26F0), # ⛰ 618 | chr(0x1F30B), # 🌋 619 | chr(0x1F5FB), # 🗻 620 | chr(0x1F3D5), # 🏕 621 | chr(0x1F3D6), # 🏖 622 | chr(0x1F3DC), # 🏜 623 | chr(0x1F3DD), # 🏝 624 | chr(0x1F3DE), # 🏞 625 | chr(0x1F3DF), # 🏟 626 | chr(0x1F3DB), # 🏛 627 | chr(0x1F3D7), # 🏗 628 | chr(0x1F9F1), # 🧱 629 | chr(0x1FAA8), # 🪨 630 | chr(0x1FAB5), # 🪵 631 | chr(0x1F6D6), # 🛖 632 | chr(0x1F3D8), # 🏘 633 | chr(0x1F3DA), # 🏚 634 | chr(0x1F3E0), # 🏠 635 | chr(0x1F3E1), # 🏡 636 | chr(0x1F3E2), # 🏢 637 | chr(0x1F3E3), # 🏣 638 | chr(0x1F3E4), # 🏤 639 | chr(0x1F3E5), # 🏥 640 | chr(0x1F3E6), # 🏦 641 | chr(0x1F3E8), # 🏨 642 | chr(0x1F3E9), # 🏩 643 | chr(0x1F3EA), # 🏪 644 | chr(0x1F3EB), # 🏫 645 | chr(0x1F3EC), # 🏬 646 | chr(0x1F3ED), # 🏭 647 | chr(0x1F3EF), # 🏯 648 | chr(0x1F3F0), # 🏰 649 | chr(0x1F492), # 💒 650 | chr(0x1F5FC), # 🗼 651 | chr(0x1F5FD), # 🗽 652 | chr(0x26EA), # ⛪ 653 | chr(0x1F54C), # 🕌 654 | chr(0x1F6D5), # 🛕 655 | chr(0x1F54D), # 🕍 656 | chr(0x26E9), # ⛩ 657 | chr(0x1F54B), # 🕋 658 | chr(0x26F2), # ⛲ 659 | chr(0x26FA), # ⛺ 660 | chr(0x1F301), # 🌁 661 | chr(0x1F303), # 🌃 662 | chr(0x1F3D9), # 🏙 663 | chr(0x1F304), # 🌄 664 | chr(0x1F305), # 🌅 665 | chr(0x1F306), # 🌆 666 | chr(0x1F307), # 🌇 667 | chr(0x1F309), # 🌉 668 | chr(0x2668), # ♨ 669 | chr(0x1F3A0), # 🎠 670 | chr(0x1F6DD), # 🛝 671 | chr(0x1F3A1), # 🎡 672 | chr(0x1F3A2), # 🎢 673 | chr(0x1F488), # 💈 674 | chr(0x1F3AA), # 🎪 675 | chr(0x1F682), # 🚂 676 | chr(0x1F683), # 🚃 677 | chr(0x1F684), # 🚄 678 | chr(0x1F685), # 🚅 679 | chr(0x1F686), # 🚆 680 | chr(0x1F687), # 🚇 681 | chr(0x1F688), # 🚈 682 | chr(0x1F689), # 🚉 683 | chr(0x1F68A), # 🚊 684 | chr(0x1F69D), # 🚝 685 | chr(0x1F69E), # 🚞 686 | chr(0x1F68B), # 🚋 687 | chr(0x1F68C), # 🚌 688 | chr(0x1F68D), # 🚍 689 | chr(0x1F68E), # 🚎 690 | chr(0x1F690), # 🚐 691 | chr(0x1F691), # 🚑 692 | chr(0x1F692), # 🚒 693 | chr(0x1F693), # 🚓 694 | chr(0x1F694), # 🚔 695 | chr(0x1F695), # 🚕 696 | chr(0x1F696), # 🚖 697 | chr(0x1F697), # 🚗 698 | chr(0x1F698), # 🚘 699 | chr(0x1F699), # 🚙 700 | chr(0x1F6FB), # 🛻 701 | chr(0x1F69A), # 🚚 702 | chr(0x1F69B), # 🚛 703 | chr(0x1F69C), # 🚜 704 | chr(0x1F3CE), # 🏎 705 | chr(0x1F3CD), # 🏍 706 | chr(0x1F6F5), # 🛵 707 | chr(0x1F9BD), # 🦽 708 | chr(0x1F9BC), # 🦼 709 | chr(0x1F6FA), # 🛺 710 | chr(0x1F6B2), # 🚲 711 | chr(0x1F6F4), # 🛴 712 | chr(0x1F6F9), # 🛹 713 | chr(0x1F6FC), # 🛼 714 | chr(0x1F68F), # 🚏 715 | chr(0x1F6E3), # 🛣 716 | chr(0x1F6E4), # 🛤 717 | chr(0x1F6E2), # 🛢 718 | chr(0x26FD), # ⛽ 719 | chr(0x1F6DE), # 🛞 720 | chr(0x1F6A8), # 🚨 721 | chr(0x1F6A5), # 🚥 722 | chr(0x1F6A6), # 🚦 723 | chr(0x1F6D1), # 🛑 724 | chr(0x1F6A7), # 🚧 725 | chr(0x2693), # ⚓ 726 | chr(0x1F6DF), # 🛟 727 | chr(0x26F5), # ⛵ 728 | chr(0x1F6F6), # 🛶 729 | chr(0x1F6A4), # 🚤 730 | chr(0x1F6F3), # 🛳 731 | chr(0x26F4), # ⛴ 732 | chr(0x1F6E5), # 🛥 733 | chr(0x1F6A2), # 🚢 734 | chr(0x2708), # ✈ 735 | chr(0x1F6E9), # 🛩 736 | chr(0x1F6EB), # 🛫 737 | chr(0x1F6EC), # 🛬 738 | chr(0x1FA82), # 🪂 739 | chr(0x1F4BA), # 💺 740 | chr(0x1F681), # 🚁 741 | chr(0x1F69F), # 🚟 742 | chr(0x1F6A0), # 🚠 743 | chr(0x1F6A1), # 🚡 744 | chr(0x1F6F0), # 🛰 745 | chr(0x1F680), # 🚀 746 | chr(0x1F6F8), # 🛸 747 | chr(0x1F6CE), # 🛎 748 | chr(0x1F9F3), # 🧳 749 | chr(0x231B), # ⌛ 750 | chr(0x23F3), # ⏳ 751 | chr(0x231A), # ⌚ 752 | chr(0x23F0), # ⏰ 753 | chr(0x23F1), # ⏱ 754 | chr(0x23F2), # ⏲ 755 | chr(0x1F570), # 🕰 756 | chr(0x1F55B), # 🕛 757 | chr(0x1F567), # 🕧 758 | chr(0x1F550), # 🕐 759 | chr(0x1F55C), # 🕜 760 | chr(0x1F551), # 🕑 761 | chr(0x1F55D), # 🕝 762 | chr(0x1F552), # 🕒 763 | chr(0x1F55E), # 🕞 764 | chr(0x1F553), # 🕓 765 | chr(0x1F55F), # 🕟 766 | chr(0x1F554), # 🕔 767 | chr(0x1F560), # 🕠 768 | chr(0x1F555), # 🕕 769 | chr(0x1F561), # 🕡 770 | chr(0x1F556), # 🕖 771 | chr(0x1F562), # 🕢 772 | chr(0x1F557), # 🕗 773 | chr(0x1F563), # 🕣 774 | chr(0x1F558), # 🕘 775 | chr(0x1F564), # 🕤 776 | chr(0x1F559), # 🕙 777 | chr(0x1F565), # 🕥 778 | chr(0x1F55A), # 🕚 779 | chr(0x1F566), # 🕦 780 | chr(0x1F311), # 🌑 781 | chr(0x1F312), # 🌒 782 | chr(0x1F313), # 🌓 783 | chr(0x1F314), # 🌔 784 | chr(0x1F315), # 🌕 785 | chr(0x1F316), # 🌖 786 | chr(0x1F317), # 🌗 787 | chr(0x1F318), # 🌘 788 | chr(0x1F319), # 🌙 789 | chr(0x1F31A), # 🌚 790 | chr(0x1F31B), # 🌛 791 | chr(0x1F31C), # 🌜 792 | chr(0x1F321), # 🌡 793 | chr(0x2600), # ☀ 794 | chr(0x1F31D), # 🌝 795 | chr(0x1F31E), # 🌞 796 | chr(0x1FA90), # 🪐 797 | chr(0x2B50), # ⭐ 798 | chr(0x1F31F), # 🌟 799 | chr(0x1F320), # 🌠 800 | chr(0x1F30C), # 🌌 801 | chr(0x2601), # ☁ 802 | chr(0x26C5), # ⛅ 803 | chr(0x26C8), # ⛈ 804 | chr(0x1F324), # 🌤 805 | chr(0x1F325), # 🌥 806 | chr(0x1F326), # 🌦 807 | chr(0x1F327), # 🌧 808 | chr(0x1F328), # 🌨 809 | chr(0x1F329), # 🌩 810 | chr(0x1F32A), # 🌪 811 | chr(0x1F32B), # 🌫 812 | chr(0x1F32C), # 🌬 813 | chr(0x1F300), # 🌀 814 | chr(0x1F308), # 🌈 815 | chr(0x1F302), # 🌂 816 | chr(0x2602), # ☂ 817 | chr(0x2614), # ☔ 818 | chr(0x26F1), # ⛱ 819 | chr(0x26A1), # ⚡ 820 | chr(0x2744), # ❄ 821 | chr(0x2603), # ☃ 822 | chr(0x26C4), # ⛄ 823 | chr(0x2604), # ☄ 824 | chr(0x1F525), # 🔥 825 | chr(0x1F4A7), # 💧 826 | chr(0x1F30A), # 🌊 827 | chr(0x1F383), # 🎃 828 | chr(0x1F384), # 🎄 829 | chr(0x1F386), # 🎆 830 | chr(0x1F387), # 🎇 831 | chr(0x1F9E8), # 🧨 832 | chr(0x2728), # ✨ 833 | chr(0x1F388), # 🎈 834 | chr(0x1F389), # 🎉 835 | chr(0x1F38A), # 🎊 836 | chr(0x1F38B), # 🎋 837 | chr(0x1F38D), # 🎍 838 | chr(0x1F38E), # 🎎 839 | chr(0x1F38F), # 🎏 840 | chr(0x1F390), # 🎐 841 | chr(0x1F391), # 🎑 842 | chr(0x1F9E7), # 🧧 843 | chr(0x1F380), # 🎀 844 | chr(0x1F381), # 🎁 845 | chr(0x1F397), # 🎗 846 | chr(0x1F39F), # 🎟 847 | chr(0x1F3AB), # 🎫 848 | chr(0x1F396), # 🎖 849 | chr(0x1F3C6), # 🏆 850 | chr(0x1F3C5), # 🏅 851 | chr(0x1F947), # 🥇 852 | chr(0x1F948), # 🥈 853 | chr(0x1F949), # 🥉 854 | chr(0x26BD), # ⚽ 855 | chr(0x26BE), # ⚾ 856 | chr(0x1F94E), # 🥎 857 | chr(0x1F3C0), # 🏀 858 | chr(0x1F3D0), # 🏐 859 | chr(0x1F3C8), # 🏈 860 | chr(0x1F3C9), # 🏉 861 | chr(0x1F3BE), # 🎾 862 | chr(0x1F94F), # 🥏 863 | chr(0x1F3B3), # 🎳 864 | chr(0x1F3CF), # 🏏 865 | chr(0x1F3D1), # 🏑 866 | chr(0x1F3D2), # 🏒 867 | chr(0x1F94D), # 🥍 868 | chr(0x1F3D3), # 🏓 869 | chr(0x1F3F8), # 🏸 870 | chr(0x1F94A), # 🥊 871 | chr(0x1F94B), # 🥋 872 | chr(0x1F945), # 🥅 873 | chr(0x26F3), # ⛳ 874 | chr(0x26F8), # ⛸ 875 | chr(0x1F3A3), # 🎣 876 | chr(0x1F93F), # 🤿 877 | chr(0x1F3BD), # 🎽 878 | chr(0x1F3BF), # 🎿 879 | chr(0x1F6F7), # 🛷 880 | chr(0x1F94C), # 🥌 881 | chr(0x1F3AF), # 🎯 882 | chr(0x1FA80), # 🪀 883 | chr(0x1FA81), # 🪁 884 | chr(0x1F52B), # 🔫 885 | chr(0x1F3B1), # 🎱 886 | chr(0x1F52E), # 🔮 887 | chr(0x1FA84), # 🪄 888 | chr(0x1F3AE), # 🎮 889 | chr(0x1F579), # 🕹 890 | chr(0x1F3B0), # 🎰 891 | chr(0x1F3B2), # 🎲 892 | chr(0x1F9E9), # 🧩 893 | chr(0x1F9F8), # 🧸 894 | chr(0x1FA85), # 🪅 895 | chr(0x1FAA9), # 🪩 896 | chr(0x1FA86), # 🪆 897 | chr(0x2660), # ♠ 898 | chr(0x2665), # ♥ 899 | chr(0x2666), # ♦ 900 | chr(0x2663), # ♣ 901 | chr(0x265F), # ♟ 902 | chr(0x1F0CF), # 🃏 903 | chr(0x1F004), # 🀄 904 | chr(0x1F3B4), # 🎴 905 | chr(0x1F3AD), # 🎭 906 | chr(0x1F5BC), # 🖼 907 | chr(0x1F3A8), # 🎨 908 | chr(0x1F9F5), # 🧵 909 | chr(0x1FAA1), # 🪡 910 | chr(0x1F9F6), # 🧶 911 | chr(0x1FAA2), # 🪢 912 | chr(0x1F453), # 👓 913 | chr(0x1F576), # 🕶 914 | chr(0x1F97D), # 🥽 915 | chr(0x1F97C), # 🥼 916 | chr(0x1F9BA), # 🦺 917 | chr(0x1F454), # 👔 918 | chr(0x1F455), # 👕 919 | chr(0x1F456), # 👖 920 | chr(0x1F9E3), # 🧣 921 | chr(0x1F9E4), # 🧤 922 | chr(0x1F9E5), # 🧥 923 | chr(0x1F9E6), # 🧦 924 | chr(0x1F457), # 👗 925 | chr(0x1F458), # 👘 926 | chr(0x1F97B), # 🥻 927 | chr(0x1FA71), # 🩱 928 | chr(0x1FA72), # 🩲 929 | chr(0x1FA73), # 🩳 930 | chr(0x1F459), # 👙 931 | chr(0x1F45A), # 👚 932 | chr(0x1FAAD), # 🪭 933 | chr(0x1F45B), # 👛 934 | chr(0x1F45C), # 👜 935 | chr(0x1F45D), # 👝 936 | chr(0x1F6CD), # 🛍 937 | chr(0x1F392), # 🎒 938 | chr(0x1FA74), # 🩴 939 | chr(0x1F45E), # 👞 940 | chr(0x1F45F), # 👟 941 | chr(0x1F97E), # 🥾 942 | chr(0x1F97F), # 🥿 943 | chr(0x1F460), # 👠 944 | chr(0x1F461), # 👡 945 | chr(0x1FA70), # 🩰 946 | chr(0x1F462), # 👢 947 | chr(0x1FAAE), # 🪮 948 | chr(0x1F451), # 👑 949 | chr(0x1F452), # 👒 950 | chr(0x1F3A9), # 🎩 951 | chr(0x1F393), # 🎓 952 | chr(0x1F9E2), # 🧢 953 | chr(0x1FA96), # 🪖 954 | chr(0x26D1), # ⛑ 955 | chr(0x1F4FF), # 📿 956 | chr(0x1F484), # 💄 957 | chr(0x1F48D), # 💍 958 | chr(0x1F48E), # 💎 959 | chr(0x1F507), # 🔇 960 | chr(0x1F508), # 🔈 961 | chr(0x1F509), # 🔉 962 | chr(0x1F50A), # 🔊 963 | chr(0x1F4E2), # 📢 964 | chr(0x1F4E3), # 📣 965 | chr(0x1F4EF), # 📯 966 | chr(0x1F514), # 🔔 967 | chr(0x1F515), # 🔕 968 | chr(0x1F3BC), # 🎼 969 | chr(0x1F3B5), # 🎵 970 | chr(0x1F3B6), # 🎶 971 | chr(0x1F399), # 🎙 972 | chr(0x1F39A), # 🎚 973 | chr(0x1F39B), # 🎛 974 | chr(0x1F3A4), # 🎤 975 | chr(0x1F3A7), # 🎧 976 | chr(0x1F4FB), # 📻 977 | chr(0x1F3B7), # 🎷 978 | chr(0x1FA97), # 🪗 979 | chr(0x1F3B8), # 🎸 980 | chr(0x1F3B9), # 🎹 981 | chr(0x1F3BA), # 🎺 982 | chr(0x1F3BB), # 🎻 983 | chr(0x1FA95), # 🪕 984 | chr(0x1F941), # 🥁 985 | chr(0x1FA98), # 🪘 986 | chr(0x1FA87), # 🪇 987 | chr(0x1FA88), # 🪈 988 | chr(0x1FA89), # 🪉 989 | chr(0x1F4F1), # 📱 990 | chr(0x1F4F2), # 📲 991 | chr(0x260E), # ☎ 992 | chr(0x1F4DE), # 📞 993 | chr(0x1F4DF), # 📟 994 | chr(0x1F4E0), # 📠 995 | chr(0x1F50B), # 🔋 996 | chr(0x1FAAB), # 🪫 997 | chr(0x1F50C), # 🔌 998 | chr(0x1F4BB), # 💻 999 | chr(0x1F5A5), # 🖥 1000 | chr(0x1F5A8), # 🖨 1001 | chr(0x2328), # ⌨ 1002 | chr(0x1F5B1), # 🖱 1003 | chr(0x1F5B2), # 🖲 1004 | chr(0x1F4BD), # 💽 1005 | chr(0x1F4BE), # 💾 1006 | chr(0x1F4BF), # 💿 1007 | chr(0x1F4C0), # 📀 1008 | chr(0x1F9EE), # 🧮 1009 | chr(0x1F3A5), # 🎥 1010 | chr(0x1F39E), # 🎞 1011 | chr(0x1F4FD), # 📽 1012 | chr(0x1F3AC), # 🎬 1013 | chr(0x1F4FA), # 📺 1014 | chr(0x1F4F7), # 📷 1015 | chr(0x1F4F8), # 📸 1016 | chr(0x1F4F9), # 📹 1017 | chr(0x1F4FC), # 📼 1018 | chr(0x1F50D), # 🔍 1019 | chr(0x1F50E), # 🔎 1020 | chr(0x1F56F), # 🕯 1021 | chr(0x1F4A1), # 💡 1022 | chr(0x1F526), # 🔦 1023 | chr(0x1F3EE), # 🏮 1024 | chr(0x1FA94), # 🪔 1025 | chr(0x1F4D4), # 📔 1026 | chr(0x1F4D5), # 📕 1027 | chr(0x1F4D6), # 📖 1028 | chr(0x1F4D7), # 📗 1029 | chr(0x1F4D8), # 📘 1030 | chr(0x1F4D9), # 📙 1031 | chr(0x1F4DA), # 📚 1032 | chr(0x1F4D3), # 📓 1033 | chr(0x1F4D2), # 📒 1034 | chr(0x1F4C3), # 📃 1035 | chr(0x1F4DC), # 📜 1036 | chr(0x1F4C4), # 📄 1037 | chr(0x1F4F0), # 📰 1038 | chr(0x1F5DE), # 🗞 1039 | chr(0x1F4D1), # 📑 1040 | chr(0x1F516), # 🔖 1041 | chr(0x1F3F7), # 🏷 1042 | chr(0x1F4B0), # 💰 1043 | chr(0x1FA99), # 🪙 1044 | chr(0x1F4B4), # 💴 1045 | chr(0x1F4B5), # 💵 1046 | chr(0x1F4B6), # 💶 1047 | chr(0x1F4B7), # 💷 1048 | chr(0x1F4B8), # 💸 1049 | chr(0x1F4B3), # 💳 1050 | chr(0x1F9FE), # 🧾 1051 | chr(0x1F4B9), # 💹 1052 | chr(0x2709), # ✉ 1053 | chr(0x1F4E7), # 📧 1054 | chr(0x1F4E8), # 📨 1055 | chr(0x1F4E9), # 📩 1056 | chr(0x1F4E4), # 📤 1057 | chr(0x1F4E5), # 📥 1058 | chr(0x1F4E6), # 📦 1059 | chr(0x1F4EB), # 📫 1060 | chr(0x1F4EA), # 📪 1061 | chr(0x1F4EC), # 📬 1062 | chr(0x1F4ED), # 📭 1063 | chr(0x1F4EE), # 📮 1064 | chr(0x1F5F3), # 🗳 1065 | chr(0x270F), # ✏ 1066 | chr(0x2712), # ✒ 1067 | chr(0x1F58B), # 🖋 1068 | chr(0x1F58A), # 🖊 1069 | chr(0x1F58C), # 🖌 1070 | chr(0x1F58D), # 🖍 1071 | chr(0x1F4DD), # 📝 1072 | chr(0x1F4BC), # 💼 1073 | chr(0x1F4C1), # 📁 1074 | chr(0x1F4C2), # 📂 1075 | chr(0x1F5C2), # 🗂 1076 | chr(0x1F4C5), # 📅 1077 | chr(0x1F4C6), # 📆 1078 | chr(0x1F5D2), # 🗒 1079 | chr(0x1F5D3), # 🗓 1080 | chr(0x1F4C7), # 📇 1081 | chr(0x1F4C8), # 📈 1082 | chr(0x1F4C9), # 📉 1083 | chr(0x1F4CA), # 📊 1084 | chr(0x1F4CB), # 📋 1085 | chr(0x1F4CC), # 📌 1086 | chr(0x1F4CD), # 📍 1087 | chr(0x1F4CE), # 📎 1088 | chr(0x1F587), # 🖇 1089 | chr(0x1F4CF), # 📏 1090 | chr(0x1F4D0), # 📐 1091 | chr(0x2702), # ✂ 1092 | chr(0x1F5C3), # 🗃 1093 | chr(0x1F5C4), # 🗄 1094 | chr(0x1F5D1), # 🗑 1095 | chr(0x1F512), # 🔒 1096 | chr(0x1F513), # 🔓 1097 | chr(0x1F50F), # 🔏 1098 | chr(0x1F510), # 🔐 1099 | chr(0x1F511), # 🔑 1100 | chr(0x1F5DD), # 🗝 1101 | chr(0x1F528), # 🔨 1102 | chr(0x1FA93), # 🪓 1103 | chr(0x26CF), # ⛏ 1104 | chr(0x2692), # ⚒ 1105 | chr(0x1F6E0), # 🛠 1106 | chr(0x1F5E1), # 🗡 1107 | chr(0x2694), # ⚔ 1108 | chr(0x1F4A3), # 💣 1109 | chr(0x1FA83), # 🪃 1110 | chr(0x1F3F9), # 🏹 1111 | chr(0x1F6E1), # 🛡 1112 | chr(0x1FA9A), # 🪚 1113 | chr(0x1F527), # 🔧 1114 | chr(0x1FA9B), # 🪛 1115 | chr(0x1F529), # 🔩 1116 | chr(0x2699), # ⚙ 1117 | chr(0x1F5DC), # 🗜 1118 | chr(0x2696), # ⚖ 1119 | chr(0x1F9AF), # 🦯 1120 | chr(0x1F517), # 🔗 1121 | chr(0x26D3), # ⛓ 1122 | chr(0x1FA9D), # 🪝 1123 | chr(0x1F9F0), # 🧰 1124 | chr(0x1F9F2), # 🧲 1125 | chr(0x1FA9C), # 🪜 1126 | chr(0x1FA8F), # 🪏 1127 | chr(0x2697), # ⚗ 1128 | chr(0x1F9EA), # 🧪 1129 | chr(0x1F9EB), # 🧫 1130 | chr(0x1F9EC), # 🧬 1131 | chr(0x1F52C), # 🔬 1132 | chr(0x1F52D), # 🔭 1133 | chr(0x1F4E1), # 📡 1134 | chr(0x1F489), # 💉 1135 | chr(0x1FA78), # 🩸 1136 | chr(0x1F48A), # 💊 1137 | chr(0x1FA79), # 🩹 1138 | chr(0x1FA7C), # 🩼 1139 | chr(0x1FA7A), # 🩺 1140 | chr(0x1FA7B), # 🩻 1141 | chr(0x1F6AA), # 🚪 1142 | chr(0x1F6D7), # 🛗 1143 | chr(0x1FA9E), # 🪞 1144 | chr(0x1FA9F), # 🪟 1145 | chr(0x1F6CF), # 🛏 1146 | chr(0x1F6CB), # 🛋 1147 | chr(0x1FA91), # 🪑 1148 | chr(0x1F6BD), # 🚽 1149 | chr(0x1FAA0), # 🪠 1150 | chr(0x1F6BF), # 🚿 1151 | chr(0x1F6C1), # 🛁 1152 | chr(0x1FAA4), # 🪤 1153 | chr(0x1FA92), # 🪒 1154 | chr(0x1F9F4), # 🧴 1155 | chr(0x1F9F7), # 🧷 1156 | chr(0x1F9F9), # 🧹 1157 | chr(0x1F9FA), # 🧺 1158 | chr(0x1F9FB), # 🧻 1159 | chr(0x1FAA3), # 🪣 1160 | chr(0x1F9FC), # 🧼 1161 | chr(0x1FAE7), # 🫧 1162 | chr(0x1FAA5), # 🪥 1163 | chr(0x1F9FD), # 🧽 1164 | chr(0x1F9EF), # 🧯 1165 | chr(0x1F6D2), # 🛒 1166 | chr(0x1F6AC), # 🚬 1167 | chr(0x26B0), # ⚰ 1168 | chr(0x1FAA6), # 🪦 1169 | chr(0x26B1), # ⚱ 1170 | chr(0x1F9FF), # 🧿 1171 | chr(0x1FAAC), # 🪬 1172 | chr(0x1F5FF), # 🗿 1173 | chr(0x1FAA7), # 🪧 1174 | chr(0x1FAAA), # 🪪 1175 | chr(0x1F3E7), # 🏧 1176 | chr(0x1F6AE), # 🚮 1177 | chr(0x1F6B0), # 🚰 1178 | chr(0x267F), # ♿ 1179 | chr(0x1F6B9), # 🚹 1180 | chr(0x1F6BA), # 🚺 1181 | chr(0x1F6BB), # 🚻 1182 | chr(0x1F6BC), # 🚼 1183 | chr(0x1F6BE), # 🚾 1184 | chr(0x1F6C2), # 🛂 1185 | chr(0x1F6C3), # 🛃 1186 | chr(0x1F6C4), # 🛄 1187 | chr(0x1F6C5), # 🛅 1188 | chr(0x26A0), # ⚠ 1189 | chr(0x1F6B8), # 🚸 1190 | chr(0x26D4), # ⛔ 1191 | chr(0x1F6AB), # 🚫 1192 | chr(0x1F6B3), # 🚳 1193 | chr(0x1F6AD), # 🚭 1194 | chr(0x1F6AF), # 🚯 1195 | chr(0x1F6B1), # 🚱 1196 | chr(0x1F6B7), # 🚷 1197 | chr(0x1F4F5), # 📵 1198 | chr(0x1F51E), # 🔞 1199 | chr(0x2622), # ☢ 1200 | chr(0x2623), # ☣ 1201 | chr(0x2B06), # ⬆ 1202 | chr(0x2197), # ↗ 1203 | chr(0x27A1), # ➡ 1204 | chr(0x2198), # ↘ 1205 | chr(0x2B07), # ⬇ 1206 | chr(0x2199), # ↙ 1207 | chr(0x2B05), # ⬅ 1208 | chr(0x2196), # ↖ 1209 | chr(0x2195), # ↕ 1210 | chr(0x2194), # ↔ 1211 | chr(0x21A9), # ↩ 1212 | chr(0x21AA), # ↪ 1213 | chr(0x2934), # ⤴ 1214 | chr(0x2935), # ⤵ 1215 | chr(0x1F503), # 🔃 1216 | chr(0x1F504), # 🔄 1217 | chr(0x1F519), # 🔙 1218 | chr(0x1F51A), # 🔚 1219 | chr(0x1F51B), # 🔛 1220 | chr(0x1F51C), # 🔜 1221 | chr(0x1F51D), # 🔝 1222 | chr(0x1F6D0), # 🛐 1223 | chr(0x269B), # ⚛ 1224 | chr(0x1F549), # 🕉 1225 | chr(0x2721), # ✡ 1226 | chr(0x2638), # ☸ 1227 | chr(0x262F), # ☯ 1228 | chr(0x271D), # ✝ 1229 | chr(0x2626), # ☦ 1230 | chr(0x262A), # ☪ 1231 | chr(0x262E), # ☮ 1232 | chr(0x1F54E), # 🕎 1233 | chr(0x1F52F), # 🔯 1234 | chr(0x1FAAF), # 🪯 1235 | chr(0x2648), # ♈ 1236 | chr(0x2649), # ♉ 1237 | chr(0x264A), # ♊ 1238 | chr(0x264B), # ♋ 1239 | chr(0x264C), # ♌ 1240 | chr(0x264D), # ♍ 1241 | chr(0x264E), # ♎ 1242 | chr(0x264F), # ♏ 1243 | chr(0x2650), # ♐ 1244 | chr(0x2651), # ♑ 1245 | chr(0x2652), # ♒ 1246 | chr(0x2653), # ♓ 1247 | chr(0x26CE), # ⛎ 1248 | chr(0x1F500), # 🔀 1249 | chr(0x1F501), # 🔁 1250 | chr(0x1F502), # 🔂 1251 | chr(0x25B6), # ▶ 1252 | chr(0x23E9), # ⏩ 1253 | chr(0x23ED), # ⏭ 1254 | chr(0x23EF), # ⏯ 1255 | chr(0x25C0), # ◀ 1256 | chr(0x23EA), # ⏪ 1257 | chr(0x23EE), # ⏮ 1258 | chr(0x1F53C), # 🔼 1259 | chr(0x23EB), # ⏫ 1260 | chr(0x1F53D), # 🔽 1261 | chr(0x23EC), # ⏬ 1262 | chr(0x23F8), # ⏸ 1263 | chr(0x23F9), # ⏹ 1264 | chr(0x23FA), # ⏺ 1265 | chr(0x23CF), # ⏏ 1266 | chr(0x1F3A6), # 🎦 1267 | chr(0x1F505), # 🔅 1268 | chr(0x1F506), # 🔆 1269 | chr(0x1F4F6), # 📶 1270 | chr(0x1F6DC), # 🛜 1271 | chr(0x1F4F3), # 📳 1272 | chr(0x1F4F4), # 📴 1273 | chr(0x2640), # ♀ 1274 | chr(0x2642), # ♂ 1275 | chr(0x26A7), # ⚧ 1276 | chr(0x2716), # ✖ 1277 | chr(0x2795), # ➕ 1278 | chr(0x2796), # ➖ 1279 | chr(0x2797), # ➗ 1280 | chr(0x1F7F0), # 🟰 1281 | chr(0x267E), # ♾ 1282 | chr(0x203C), # ‼ 1283 | chr(0x2049), # ⁉ 1284 | chr(0x2753), # ❓ 1285 | chr(0x2754), # ❔ 1286 | chr(0x2755), # ❕ 1287 | chr(0x2757), # ❗ 1288 | chr(0x3030), # 〰 1289 | chr(0x1F4B1), # 💱 1290 | chr(0x1F4B2), # 💲 1291 | chr(0x2695), # ⚕ 1292 | chr(0x267B), # ♻ 1293 | chr(0x269C), # ⚜ 1294 | chr(0x1F531), # 🔱 1295 | chr(0x1F4DB), # 📛 1296 | chr(0x1F530), # 🔰 1297 | chr(0x2B55), # ⭕ 1298 | chr(0x2705), # ✅ 1299 | chr(0x2611), # ☑ 1300 | chr(0x2714), # ✔ 1301 | chr(0x274C), # ❌ 1302 | chr(0x274E), # ❎ 1303 | chr(0x27B0), # ➰ 1304 | chr(0x27BF), # ➿ 1305 | chr(0x303D), # 〽 1306 | chr(0x2733), # ✳ 1307 | chr(0x2734), # ✴ 1308 | chr(0x2747), # ❇ 1309 | chr(0x00A9), # © 1310 | chr(0x00AE), # ® 1311 | chr(0x2122), # ™ 1312 | chr(0x1FADF), # 🫟 1313 | chr(0x1F51F), # 🔟 1314 | chr(0x1F520), # 🔠 1315 | chr(0x1F521), # 🔡 1316 | chr(0x1F522), # 🔢 1317 | chr(0x1F523), # 🔣 1318 | chr(0x1F524), # 🔤 1319 | chr(0x1F170), # 🅰 1320 | chr(0x1F18E), # 🆎 1321 | chr(0x1F171), # 🅱 1322 | chr(0x1F191), # 🆑 1323 | chr(0x1F192), # 🆒 1324 | chr(0x1F193), # 🆓 1325 | chr(0x2139), # ℹ 1326 | chr(0x1F194), # 🆔 1327 | chr(0x24C2), # Ⓜ 1328 | chr(0x1F195), # 🆕 1329 | chr(0x1F196), # 🆖 1330 | chr(0x1F17E), # 🅾 1331 | chr(0x1F197), # 🆗 1332 | chr(0x1F17F), # 🅿 1333 | chr(0x1F198), # 🆘 1334 | chr(0x1F199), # 🆙 1335 | chr(0x1F19A), # 🆚 1336 | chr(0x1F201), # 🈁 1337 | chr(0x1F202), # 🈂 1338 | chr(0x1F237), # 🈷 1339 | chr(0x1F236), # 🈶 1340 | chr(0x1F22F), # 🈯 1341 | chr(0x1F250), # 🉐 1342 | chr(0x1F239), # 🈹 1343 | chr(0x1F21A), # 🈚 1344 | chr(0x1F232), # 🈲 1345 | chr(0x1F251), # 🉑 1346 | chr(0x1F238), # 🈸 1347 | chr(0x1F234), # 🈴 1348 | chr(0x1F233), # 🈳 1349 | chr(0x3297), # ㊗ 1350 | chr(0x3299), # ㊙ 1351 | chr(0x1F23A), # 🈺 1352 | chr(0x1F235), # 🈵 1353 | chr(0x1F534), # 🔴 1354 | chr(0x1F7E0), # 🟠 1355 | chr(0x1F7E1), # 🟡 1356 | chr(0x1F7E2), # 🟢 1357 | chr(0x1F535), # 🔵 1358 | chr(0x1F7E3), # 🟣 1359 | chr(0x1F7E4), # 🟤 1360 | chr(0x26AB), # ⚫ 1361 | chr(0x26AA), # ⚪ 1362 | chr(0x1F7E5), # 🟥 1363 | chr(0x1F7E7), # 🟧 1364 | chr(0x1F7E8), # 🟨 1365 | chr(0x1F7E9), # 🟩 1366 | chr(0x1F7E6), # 🟦 1367 | chr(0x1F7EA), # 🟪 1368 | chr(0x1F7EB), # 🟫 1369 | chr(0x2B1B), # ⬛ 1370 | chr(0x2B1C), # ⬜ 1371 | chr(0x25FC), # ◼ 1372 | chr(0x25FB), # ◻ 1373 | chr(0x25FE), # ◾ 1374 | chr(0x25FD), # ◽ 1375 | chr(0x25AA), # ▪ 1376 | chr(0x25AB), # ▫ 1377 | chr(0x1F536), # 🔶 1378 | chr(0x1F537), # 🔷 1379 | chr(0x1F538), # 🔸 1380 | chr(0x1F539), # 🔹 1381 | chr(0x1F53A), # 🔺 1382 | chr(0x1F53B), # 🔻 1383 | chr(0x1F4A0), # 💠 1384 | chr(0x1F518), # 🔘 1385 | chr(0x1F533), # 🔳 1386 | chr(0x1F532), # 🔲 1387 | chr(0x1F3C1), # 🏁 1388 | chr(0x1F6A9), # 🚩 1389 | chr(0x1F38C), # 🎌 1390 | chr(0x1F3F4), # 🏴 1391 | chr(0x1F3F3), # 🏳 1392 | } 1393 | -------------------------------------------------------------------------------- /sanitext/homoglyph_map.py: -------------------------------------------------------------------------------- 1 | # A comprehensive mapping of Unicode homoglyphs to ASCII characters 2 | 3 | HOMOGLYPH_MAP = { 4 | # Latin Homoglyphs 5 | chr(0xC0): "A", # "À" 6 | chr(0xC1): "A", # "Á" 7 | chr(0xC2): "A", # "Â" 8 | chr(0xC3): "A", # "Ã" 9 | chr(0xC4): "A", # "Ä" 10 | chr(0xC5): "A", # "Å" 11 | chr(0x100): "A", # "Ā" 12 | chr(0x102): "A", # "Ă" 13 | chr(0x104): "A", # "Ą" 14 | chr(0xE0): "a", # "à" 15 | chr(0xE1): "a", # "á" 16 | chr(0xE2): "a", # "â" 17 | chr(0xE3): "a", # "ã" 18 | chr(0xE4): "a", # "ä" 19 | chr(0xE5): "a", # "å" 20 | chr(0x101): "a", # "ā" 21 | chr(0x103): "a", # "ă" 22 | chr(0x105): "a", # "ą" 23 | chr(0x1D400): "A", # "𝐀" 24 | chr(0xFF21): "A", # "A" 25 | chr(0x042C): "b", # 'Ь' 26 | chr(0x1D401): "B", # "𝐁" 27 | chr(0xFF22): "B", # "B" 28 | chr(0xC7): "C", # "Ç" 29 | chr(0x106): "C", # "Ć" 30 | chr(0x108): "C", # "Ĉ" 31 | chr(0x10A): "C", # "Ċ" 32 | chr(0x10C): "C", # "Č" 33 | chr(0xE7): "c", # "ç" 34 | chr(0x107): "c", # "ć" 35 | chr(0x109): "c", # "ĉ" 36 | chr(0x10B): "c", # "ċ" 37 | chr(0x10D): "c", # "č" 38 | chr(0x2102): "C", # "ℂ" 39 | chr(0xFF23): "C", # "C" 40 | chr(0xD0): "D", # "Ð" 41 | chr(0x10E): "D", # "Ď" 42 | chr(0x110): "D", # "Đ" 43 | chr(0xF0): "d", # "ð" 44 | chr(0x10F): "d", # "ď" 45 | chr(0x111): "d", # "đ" 46 | chr(0xFF24): "D", # "D" 47 | chr(0xC8): "E", # "È" 48 | chr(0xC9): "E", # "É" 49 | chr(0xCA): "E", # "Ê" 50 | chr(0xCB): "E", # "Ë" 51 | chr(0x112): "E", # "Ē" 52 | chr(0x114): "E", # "Ĕ" 53 | chr(0x116): "E", # "Ė" 54 | chr(0x118): "E", # "Ę" 55 | chr(0x11A): "E", # "Ě" 56 | chr(0x2130): "E", # "ℰ" 57 | chr(0xFF25): "E", # "E" 58 | chr(0xE8): "e", # "è" 59 | chr(0xE9): "e", # "é" 60 | chr(0xEA): "e", # "ê" 61 | chr(0xEB): "e", # "ë" 62 | chr(0x113): "e", # "ē" 63 | chr(0x115): "e", # "ĕ" 64 | chr(0x117): "e", # "ė" 65 | chr(0x119): "e", # "ę" 66 | chr(0x11B): "e", # "ě" 67 | chr(0x2131): "F", # "ℱ" 68 | chr(0x1D4A2): "G", # '𝒢' 69 | chr(0x210B): "H", # 'ℋ' 70 | chr(0xCC): "I", # "Ì" 71 | chr(0xCD): "I", # "Í" 72 | chr(0xCE): "I", # "Î" 73 | chr(0xCF): "I", # "Ï" 74 | chr(0x128): "I", # "Ĩ" 75 | chr(0x12A): "I", # "Ī" 76 | chr(0x12C): "I", # "Ĭ" 77 | chr(0x12E): "I", # "Į" 78 | chr(0x130): "I", # "İ" 79 | chr(0xEC): "i", # "ì" 80 | chr(0xED): "i", # "í" 81 | chr(0xEE): "i", # "î" 82 | chr(0xEF): "i", # "ï" 83 | chr(0x129): "i", # "ĩ" 84 | chr(0x12B): "i", # "ī" 85 | chr(0x12D): "i", # "ĭ" 86 | chr(0x12F): "i", # "į" 87 | chr(0x131): "i", # "ı" 88 | chr(0x217C): "l", # 'ⅼ' 89 | chr(0xD1): "N", # "Ñ" 90 | chr(0x143): "N", # "Ń" 91 | chr(0x145): "N", # "Ņ" 92 | chr(0x147): "N", # "Ň" 93 | chr(0xF1): "n", # "ñ" 94 | chr(0x0578): "n", # 'ո' 95 | chr(0x144): "n", # "ń" 96 | chr(0x146): "n", # "ņ" 97 | chr(0x148): "n", # "ň" 98 | chr(0xD2): "O", # "Ò" 99 | chr(0xD3): "O", # "Ó" 100 | chr(0xD4): "O", # "Ô" 101 | chr(0xD5): "O", # "Õ" 102 | chr(0xD6): "O", # "Ö" 103 | chr(0xD8): "O", # "Ø" 104 | chr(0x14C): "O", # "Ō" 105 | chr(0x14E): "O", # "Ŏ" 106 | chr(0x150): "O", # "Ő" 107 | chr(0xF2): "o", # "ò" 108 | chr(0xF3): "o", # "ó" 109 | chr(0xF4): "o", # "ô" 110 | chr(0xF5): "o", # "õ" 111 | chr(0xF6): "o", # "ö" 112 | chr(0xF8): "o", # "ø" 113 | chr(0x14D): "o", # "ō" 114 | chr(0x14F): "o", # "ŏ" 115 | chr(0x151): "o", # "ő" 116 | chr(0x051B): "q", # 'ԛ' 117 | chr(0x0433): "r", # 'г' 118 | chr(0x211D): "R", # "ℝ" 119 | chr(0x0455): "s", # 'ѕ' 120 | chr(0xD9): "U", # "Ù" 121 | chr(0xDA): "U", # "Ú" 122 | chr(0xDB): "U", # "Û" 123 | chr(0xDC): "U", # "Ü" 124 | chr(0x168): "U", # "Ũ" 125 | chr(0x16A): "U", # "Ū" 126 | chr(0x16C): "U", # "Ŭ" 127 | chr(0x16E): "U", # "Ů" 128 | chr(0x170): "U", # "Ű" 129 | chr(0x172): "U", # "Ų" 130 | chr(0xF9): "u", # "ù" 131 | chr(0xFA): "u", # "ú" 132 | chr(0xFB): "u", # "û" 133 | chr(0xFC): "u", # "ü" 134 | chr(0x169): "u", # "ũ" 135 | chr(0x16B): "u", # "ū" 136 | chr(0x16D): "u", # "ŭ" 137 | chr(0x16F): "u", # "ů" 138 | chr(0x171): "u", # "ű" 139 | chr(0x173): "u", # "ų" 140 | chr(0x051D): "w", # 'ԝ' 141 | chr(0x1D22): "z", # 'ᴢ' 142 | chr(0x2124): "Z", # "ℤ" 143 | # Greek Homoglyphs 144 | chr(0x391): "A", # "Α" 145 | chr(0x392): "B", # "Β" 146 | chr(0x395): "E", # "Ε" 147 | chr(0x396): "Z", # "Ζ" 148 | chr(0x397): "H", # "Η" 149 | chr(0x399): "I", # "Ι" 150 | chr(0x39A): "K", # "Κ" 151 | chr(0x39C): "M", # "Μ" 152 | chr(0x39D): "N", # "Ν" 153 | chr(0x39F): "O", # "Ο" 154 | chr(0x3A1): "P", # "Ρ" 155 | chr(0x3A4): "T", # "Τ" 156 | chr(0x3A5): "Y", # "Υ" 157 | chr(0x3A7): "X", # "Χ" 158 | chr(0x3B1): "a", # "α" 159 | chr(0x3B2): "b", # "β" 160 | chr(0x3B3): "y", # "γ" 161 | chr(0x3B4): "d", # "δ" 162 | chr(0x3B5): "e", # "ε" 163 | chr(0x3B6): "z", # "ζ" 164 | chr(0x3B7): "h", # "η" 165 | chr(0x3B9): "i", # "ι" 166 | chr(0x3BA): "k", # "κ" 167 | chr(0x3BD): "v", # "ν" 168 | chr(0x3BF): "o", # "ο" 169 | chr(0x3C1): "p", # "ρ" 170 | chr(0x3C3): "s", # "σ" 171 | chr(0x3C4): "t", # "τ" 172 | chr(0x3C5): "y", # "υ" 173 | chr(0x3C7): "x", # "χ" 174 | chr(0x3C9): "w", # "ω" 175 | # Cyrillic Homoglyphs 176 | chr(0x410): "A", # "А" 177 | chr(0x412): "B", # "В" 178 | chr(0x415): "E", # "Е" 179 | chr(0x41A): "K", # "К" 180 | chr(0x41C): "M", # "М" 181 | chr(0x41D): "H", # "Н" 182 | chr(0x41E): "O", # "О" 183 | chr(0x420): "P", # "Р" 184 | chr(0x421): "C", # "С" 185 | chr(0x422): "T", # "Т" 186 | chr(0x423): "Y", # "У" 187 | chr(0x430): "a", # "а" 188 | chr(0x501): "d", # "ԁ" 189 | chr(0x435): "e", # "е" 190 | chr(0x456): "i", # "і" 191 | chr(0x43A): "k", # "к" 192 | chr(0x43C): "m", # "м" 193 | chr(0x43E): "o", # "о" 194 | chr(0x440): "p", # "р" 195 | chr(0x441): "c", # "с" 196 | chr(0x442): "t", # "т" 197 | chr(0x443): "y", # "у" 198 | chr(0x445): "x", # "х" 199 | # Numbers 200 | chr(0xFF10): "0", # "0" 201 | chr(0xFF11): "1", # "1" 202 | chr(0xFF12): "2", # "2" 203 | chr(0xFF13): "3", # "3" 204 | chr(0xFF14): "4", # "4" 205 | chr(0xFF15): "5", # "5" 206 | chr(0xFF16): "6", # "6" 207 | chr(0xFF17): "7", # "7" 208 | chr(0xFF18): "8", # "8" 209 | chr(0xFF19): "9", # "9" 210 | chr(0x2160): "I", # "Ⅰ" 211 | chr(0x2161): "II", # "Ⅱ" 212 | chr(0x2162): "III", # "Ⅲ" 213 | chr(0x2163): "IV", # "Ⅳ" 214 | chr(0x2164): "V", # "Ⅴ" 215 | chr(0x2165): "VI", # "Ⅵ" 216 | chr(0x2166): "VII", # "Ⅶ" 217 | chr(0x2167): "VIII", # "Ⅷ" 218 | chr(0x2168): "IX", # "Ⅸ" 219 | chr(0x2169): "X", # "Ⅹ" 220 | # Common symbols 221 | chr(0x201C): '"', # "“" 222 | chr(0x201D): '"', # "”" 223 | chr(0x201E): '"', # "„" 224 | chr(0x2018): "'", # "‘" 225 | chr(0x2019): "'", # "’" 226 | chr(0x2022): "*", # "•" 227 | chr(0x2023): ">", # "‣" 228 | chr(0x2026): "...", # "…" 229 | chr(0x2014): "-", # "—" 230 | chr(0x2013): "-", # "–" 231 | chr(0x2010): "-", # "‐" 232 | chr(0xAB): '"', # "«" 233 | chr(0xBB): '"', # "»" 234 | chr(0x2039): "'", # "‹" 235 | chr(0x203A): "'", # "›" 236 | chr(0x2024): ".", # '․' 237 | chr(0x00BF): "?", # '¿' 238 | chr(0x00A1): "!", # '¡' 239 | chr(0xFF0E): ".", # '.' 240 | chr(0xFF0C): ",", # ',' 241 | chr(0x2212): "-", # "−" 242 | chr(0xB1): "+/-", # "±" 243 | chr(0xD7): "x", # "×" 244 | chr(0xF7): "/", # "÷" 245 | chr(0xA9): "(c)", # "©" 246 | chr(0xAE): "(R)", # "®" 247 | chr(0x2122): "(TM)", # "™" 248 | # Whitespace 249 | chr(0x2009): " ", # " " 250 | chr(0x202F): " ", # " " 251 | chr(0x2007): " ", # " " 252 | chr(0x2000): " ", # " " 253 | chr(0x2001): " ", # " " 254 | chr(0x2800): " ", # "⠀" 255 | chr(0x180E): " ", # "᠎" 256 | chr(0x2002): " ", # ' ' 257 | chr(0x2003): " ", # ' ' 258 | # Invinsible 259 | chr(0x200B): "", # "​" 260 | chr(0x200C): "", # "‌" 261 | chr(0x200D): "", # "‍" 262 | chr(0x200E): "", # "‎" 263 | chr(0x200F): "", # "‏" 264 | chr(0xFEFF): "", # "" 265 | chr(0x202A): "", # "‪" 266 | chr(0x202B): "", # "‫" 267 | chr(0x202C): "", # "‬" 268 | chr(0x202D): "", # "‭" 269 | chr(0x202E): "", # "‮" 270 | chr(0x2060): "", # "⁠" 271 | chr(0x2061): "", # "⁡" 272 | chr(0x2062): "", # "⁢" 273 | chr(0x2063): "", # "⁣" 274 | chr(0x2064): "", # "⁤" 275 | chr(0x2066): "", # "⁦" 276 | chr(0x2067): "", # "⁧" 277 | chr(0x2068): "", # "⁨" 278 | chr(0x2069): "", # "⁩" 279 | chr(0x206A): "", # "" 280 | chr(0x206B): "", # "" 281 | chr(0x206C): "", # "" 282 | chr(0x206D): "", # "" 283 | chr(0x206E): "", # "" 284 | chr(0x206F): "", # "" 285 | } 286 | 287 | 288 | def get_homoglyph_replacement(char): 289 | """Returns the ASCII replacement for a given Unicode character if available.""" 290 | return HOMOGLYPH_MAP.get(char, char) 291 | -------------------------------------------------------------------------------- /sanitext/text_sanitization.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | import re 3 | import string 4 | import sys 5 | from sanitext.homoglyph_map import get_homoglyph_replacement 6 | from sanitext.emoji_set import EMOJI_SET 7 | 8 | 9 | def get_allowed_characters(allow_emoji=False, allow_chars=None, allow_file=None): 10 | """ 11 | Build and return the set of allowed characters based on: 12 | - default ASCII printable 13 | - user-specified flag to allow single code point emojis 14 | - user-specified chars 15 | - user-specified file (pathlib.Path object) 16 | """ 17 | allowed = set(string.printable) 18 | 19 | if allow_emoji: 20 | allowed.update(EMOJI_SET) 21 | 22 | # If user provides extra chars via CLI: 23 | if allow_chars: 24 | allowed.update(set(allow_chars)) 25 | 26 | # If user provides a file of allowed chars: 27 | if allow_file: 28 | text_from_file = allow_file.read_text(encoding="utf-8", errors="replace") 29 | allowed.update(text_from_file) 30 | 31 | return allowed 32 | 33 | 34 | def sanitize_text(text, allowed_characters=get_allowed_characters(), interactive=False): 35 | """ 36 | Remove or replace characters not in the allowed set. Optionally prompt the user interactively. 37 | Returns the sanitized text. 38 | """ 39 | # Identify disallowed characters 40 | disallowed_chars = sorted(set(ch for ch in text if ch not in allowed_characters)) 41 | if not disallowed_chars: 42 | # If nothing disallowed, just return original text 43 | return text 44 | 45 | # If interactive is enabled, ask the user for each unique disallowed char 46 | char_decisions = {} 47 | if interactive: 48 | for ch in disallowed_chars: 49 | # Decision for this character already been taken 50 | if ch in char_decisions: 51 | continue 52 | # Provide some info about the character 53 | char_info = f"'{ch}' (U+{ord(ch):04X}, {unicodedata.name(ch, 'Unknown')})" 54 | while True: 55 | decision = ( 56 | input( 57 | f"Character {char_info} is not allowed. Keep [y], Remove [n], or Replace [r]? " 58 | ) 59 | .strip() 60 | .lower() 61 | ) 62 | if decision in ("y", "n", "r"): 63 | if decision == "y": 64 | # Keep => add to allowed set for this run 65 | char_decisions[ch] = ch 66 | elif decision == "n": 67 | # Remove => map to empty string 68 | char_decisions[ch] = "" 69 | else: 70 | # Replace => ask user for replacement 71 | replacement = input("Enter replacement character(s): ") 72 | char_decisions[ch] = replacement 73 | break 74 | else: 75 | print("Invalid input. Please enter 'y', 'n', or 'r'.") 76 | else: 77 | for ch in disallowed_chars: 78 | closest = closest_ascii(ch, allowed_characters) 79 | char_decisions[ch] = ( 80 | closest if set(closest).issubset(allowed_characters) else "" 81 | ) 82 | 83 | # Build the sanitized text 84 | sanitized_chars = [] 85 | for ch in text: 86 | if ch in disallowed_chars: 87 | sanitized_chars.append(char_decisions[ch]) 88 | else: 89 | sanitized_chars.append(ch) 90 | 91 | return "".join(sanitized_chars) 92 | 93 | 94 | def closest_ascii(char, allowed_characters): 95 | """Returns the closest ASCII character for a given Unicode character.""" 96 | # Try homoglyph replacement first 97 | mapped = get_homoglyph_replacement(char) 98 | if mapped in allowed_characters: 99 | return mapped # Direct replacement 100 | 101 | # Try Unicode normalization (NFKC) 102 | normalized = unicodedata.normalize("NFKC", char) 103 | if all(c in allowed_characters for c in normalized): 104 | return normalized # Safe replacement 105 | 106 | # Try Unicode decomposition 107 | # Examples: 108 | # 'é' (U+00E9) decomposes to 'e' + ◌́ (acute accent). 109 | # Ⅵ (U+2165) decomposes to 'V' + 'I' 110 | # fi (U+FB01) decomposes to 'f' + 'i' 111 | decomposed = unicodedata.decomposition(char) 112 | if decomposed: 113 | # Remove non-hex parts (e.g., "") 114 | hex_parts = [ 115 | part 116 | for part in decomposed.split() 117 | if all(c in "0123456789ABCDEF" for c in part) 118 | ] 119 | # Convert hex to ASCII characters 120 | ascii_chars = [chr(int(part, 16)) for part in hex_parts if int(part, 16) < 128] 121 | # Only keep allowed characters 122 | ascii_chars = [c for c in ascii_chars if c in allowed_characters] 123 | return "".join(ascii_chars) if ascii_chars else "" 124 | 125 | # If no good match, return "" 126 | return "" 127 | 128 | 129 | def detect_suspicious_characters(text, allowed_characters=get_allowed_characters()): 130 | """ 131 | Finds characters in the text that are not ASCII letters, digits, punctuation, or common whitespace. 132 | 133 | Args: 134 | text (str): The input text to check. 135 | allowed_characters (set): Set of allowed characters 136 | 137 | Returns: 138 | list of tuple: A list of tuples, each containing a suspicious character and its Unicode name. 139 | """ 140 | return [ 141 | (char, unicodedata.name(char, "Unknown")) 142 | for char in text 143 | if char not in allowed_characters 144 | ] 145 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | # Add the project toor directory to the Python path so we can import it in the tests 5 | current_dir = os.path.dirname(os.path.realpath(__file__)) 6 | project_root = os.path.abspath(os.path.join(current_dir, os.pardir)) 7 | # project_root = os.path.abspath(os.path.join(current_dir, os.pardir, "sanitext")) 8 | sys.path.insert(0, project_root) 9 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pyperclip 3 | from typer.testing import CliRunner 4 | from unittest.mock import patch 5 | from pathlib import Path 6 | import tempfile 7 | 8 | from sanitext.cli import app 9 | 10 | runner = CliRunner() 11 | 12 | 13 | def test_cli_detect(): 14 | """Test detection of unicode anomalies.""" 15 | result = runner.invoke(app, ["--detect", "-s", "Thіs іs а test."]) 16 | assert "Detected:" in result.output 17 | # Check that suspicious chars are reported 18 | # Example substring checks for partial Unicode names 19 | assert "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I" in result.output 20 | assert "CYRILLIC SMALL LETTER A" in result.output 21 | assert result.exit_code == 0 22 | 23 | 24 | def test_cli_process(): 25 | """Test processing and replacing text.""" 26 | result = runner.invoke(app, ["--string", "Thіs іs а test.🔥"]) 27 | assert "This is a test." in result.output 28 | assert result.exit_code == 0 29 | 30 | 31 | def test_cli_verbose(): 32 | """Test verbose mode output.""" 33 | result = runner.invoke(app, ["--verbose", "-s", "Thіs іs а test."]) 34 | assert "Detected:" in result.output 35 | assert "Output: This is a test." not in result.output 36 | assert result.exit_code == 0 37 | 38 | 39 | def test_cli_very_verbose(): 40 | """Test very verbose mode output.""" 41 | result = runner.invoke(app, ["--very-verbose", "-s", "Thіs іs а test."]) 42 | assert "Input: Thіs іs а test." in result.output 43 | assert "Detected:" in result.output 44 | assert "Output: This is a test." in result.output 45 | assert result.exit_code == 0 46 | 47 | 48 | def test_cli_clipboard(monkeypatch): 49 | """Test clipboard processing.""" 50 | monkeypatch.setattr(pyperclip, "paste", lambda: "Thіs іs а test.") # Mock clipboard 51 | result = runner.invoke(app) 52 | assert "Processed and copied to clipboard." in result.output 53 | # Confirm the final sanitized text is indeed in the clipboard 54 | assert pyperclip.paste() == "This is a test." 55 | assert result.exit_code == 0 56 | 57 | 58 | def test_cli_no_clipboard(monkeypatch): 59 | """Test error when clipboard is empty.""" 60 | monkeypatch.setattr(pyperclip, "paste", lambda: "") # Empty clipboard 61 | result = runner.invoke(app) 62 | assert ( 63 | "Error: No text provided (clipboard is empty and no string was given)." 64 | in result.output 65 | ) 66 | assert result.exit_code == 1 67 | 68 | 69 | def test_cli_allow_chars(): 70 | """ 71 | Test allowing extra characters manually via --allow-chars. 72 | """ 73 | # 'ä' is normally disallowed. If we allow it explicitly, it should pass through. 74 | input_text = "Look, an umlaut: ä" 75 | result = runner.invoke(app, ["--allow-chars", "ä", "-s", input_text]) 76 | assert "Look, an umlaut: ä" in result.output 77 | # Without --allow-chars "ä", it would normally become "Look, an umlaut: a" 78 | assert result.exit_code == 0 79 | 80 | 81 | def test_cli_allow_emoji(): 82 | """ 83 | Test allowing single code point emoji via --allow-emoji. 84 | """ 85 | # '😎' is normally disallowed. If we allow it explicitly, it should pass through. 86 | input_text = "Look, a boss 😎" 87 | result = runner.invoke(app, ["--allow-emoji", "-s", input_text]) 88 | assert "Look, a boss 😎" in result.output 89 | # Without --allow-emoji, it would normally become "Look, a boss " 90 | assert result.exit_code == 0 91 | 92 | 93 | def test_cli_allow_file(): 94 | """ 95 | Test allowing extra characters from a file. 96 | """ 97 | # We want to allow 'é' from a file 98 | extra_chars = "é\n" 99 | input_text = "Café ø" 100 | with tempfile.TemporaryDirectory() as tmpdir: 101 | fpath = Path(tmpdir) / "allowed_chars.txt" 102 | # Write the extra char to the file 103 | fpath.write_text(extra_chars, encoding="utf-8") 104 | 105 | # Now run the CLI using that file 106 | result = runner.invoke(app, ["--allow-file", str(fpath), "-s", input_text]) 107 | assert ( 108 | "Café o" in result.output 109 | ), "Expected 'é' to remain because we allowed it." 110 | assert result.exit_code == 0 111 | 112 | 113 | def test_cli_detect_with_allowed_file(): 114 | """ 115 | Use --detect with a file-based allowed char. 116 | The char from the file is allowed, other disallowed remain suspicious. 117 | """ 118 | extra_chars = "é\n" # We'll allow 'é' only 119 | input_text = "Café ☯" 120 | with tempfile.TemporaryDirectory() as tmpdir: 121 | fpath = Path(tmpdir) / "allowed_chars.txt" 122 | fpath.write_text(extra_chars, encoding="utf-8") 123 | 124 | result = runner.invoke( 125 | app, ["--detect", "--allow-file", str(fpath), "-s", input_text] 126 | ) 127 | # 'é' is allowed now, so it shouldn't appear in "Detected:" 128 | # '☯' is disallowed => must appear in "Detected:" 129 | assert "Café" not in result.output # It's not a direct sanitization 130 | # Instead, we see "Detected: [ ... (☯, 'YIN YANG') ... ]" 131 | assert "☯" in result.output 132 | # We do not expect 'é' in the detected list 133 | assert "é" not in result.output 134 | assert "Detected:" in result.output 135 | assert result.exit_code == 0 136 | 137 | 138 | def test_cli_string_none_copy_back_if_changed(monkeypatch): 139 | """ 140 | If --string is not provided, we pull from clipboard. 141 | If the sanitized text changes, it is copied back. 142 | If no changes, we see "No changes!" 143 | """ 144 | # 1. Case: text has disallowed characters => it changes => "Processed and copied..." 145 | monkeypatch.setattr(pyperclip, "paste", lambda: "Café ☯") # Disallowed char: ☯ 146 | with patch.object(pyperclip, "copy") as mock_copy: 147 | result = runner.invoke(app) 148 | assert "Processed and copied to clipboard." in result.output 149 | # The sanitized text should presumably be "Cafe " 150 | # or "Cafe " (depending on newlines) or similar. 151 | sanitized = mock_copy.call_args[0][0] 152 | assert "☯" not in sanitized # The symbol should be removed 153 | assert result.exit_code == 0 154 | 155 | # 2. Case: text has only allowed ASCII => no changes => "No changes!" 156 | monkeypatch.setattr(pyperclip, "paste", lambda: "Just ASCII!") 157 | with patch.object(pyperclip, "copy") as mock_copy: 158 | result = runner.invoke(app) 159 | assert "No changes!" in result.output 160 | assert not mock_copy.called, "Should not copy if nothing changed." 161 | assert result.exit_code == 0 162 | 163 | 164 | def test_cli_interactive_keep(monkeypatch): 165 | """ 166 | Demonstrate interactive mode for a single disallowed character, 167 | where the user chooses 'keep' (y). 168 | """ 169 | # Suppose the input text is "Café". 'é' is disallowed by default ASCII rules. 170 | text = "Café" 171 | 172 | # We'll have the user input "y" => keep the char 173 | # (The sanitize_text logic should then preserve it.) 174 | def mock_input(prompt): 175 | return "y" 176 | 177 | monkeypatch.setattr("builtins.input", mock_input) 178 | monkeypatch.setattr(pyperclip, "paste", lambda: text) 179 | 180 | result = runner.invoke(app, ["--interactive"]) 181 | 182 | # We expect the final output to contain "Café" because we "kept" 'é' 183 | assert "No changes!" in result.output 184 | # Possibly "Processed and copied to clipboard." 185 | # because the text was changed from the default logic's perspective 186 | # (well, actually we 'kept' the char, so let's see if it was considered changed or not). 187 | # If the code doesn't consider "keep" as a no-change scenario, it'll copy. 188 | # It's up to the internal logic. We'll be lenient here, just check exit code. 189 | assert result.exit_code == 0 190 | 191 | 192 | def test_cli_interactive_remove_replace(monkeypatch): 193 | """ 194 | More advanced interactive test: multiple distinct disallowed chars, 195 | user decisions: first => remove, second => replace with '?'. 196 | """ 197 | # Input has 2 disallowed characters: "é" and "☯" 198 | text = "Some text: Café ☯" 199 | 200 | # We'll queue the interactive responses: 201 | # For 'é' => user chooses 'n' => remove 202 | # For '☯' => user chooses 'r' => then input "?" 203 | user_inputs = iter(["n", "r", "?"]) 204 | 205 | def mock_input(prompt): 206 | return next(user_inputs) 207 | 208 | monkeypatch.setattr("builtins.input", mock_input) 209 | monkeypatch.setattr(pyperclip, "paste", lambda: text) 210 | 211 | result = runner.invoke(app, ["--interactive", "-vv"]) 212 | 213 | # We expect: "Café" => "Caf" because 'é' was removed, 214 | # Then " ☯" => " ?" because '☯' was replaced with '?' 215 | # So the final is "Some text: Caf ?" 216 | assert "Some text: Caf ?" in result.output, f"Got: {result.output}" 217 | assert result.exit_code == 0 218 | 219 | 220 | def test_cli_help(): 221 | """ 222 | Simple check that `sanitext --help` works (and doesn't crash). 223 | """ 224 | result = runner.invoke(app, ["--help"]) 225 | assert result.exit_code == 0 226 | # Just check that some usage text is displayed 227 | assert ( 228 | "Usage: main [OPTIONS]" in result.output 229 | or "Usage: cli [OPTIONS]" in result.output 230 | ) 231 | assert "--detect" in result.output 232 | assert "--interactive" in result.output 233 | assert "--allow-chars" in result.output 234 | assert "--allow-emoji" in result.output 235 | assert "--allow-file" in result.output 236 | -------------------------------------------------------------------------------- /tests/test_homoglyph_map.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from sanitext.homoglyph_map import get_homoglyph_replacement, HOMOGLYPH_MAP 3 | 4 | 5 | class TestHomoglyphMap(unittest.TestCase): 6 | 7 | def test_known_mappings(self): 8 | """Test some known homoglyph mappings.""" 9 | test_cases = { 10 | "À": "A", 11 | "é": "e", 12 | "Ø": "O", 13 | "Ⅵ": "VI", 14 | "×": "x", 15 | "—": "-", 16 | "“": '"', 17 | "’": "'", 18 | } 19 | for char, expected in test_cases.items(): 20 | with self.subTest(char=char): 21 | self.assertEqual(get_homoglyph_replacement(char), expected) 22 | 23 | def test_identity_mappings(self): 24 | """Ensure ASCII characters remain unchanged.""" 25 | ascii_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" 26 | for char in ascii_chars: 27 | with self.subTest(char=char): 28 | self.assertEqual(get_homoglyph_replacement(char), char) 29 | 30 | def test_unmapped_characters(self): 31 | """Test that unmapped characters return themselves.""" 32 | unmapped_chars = ["∑", "∞", "🔥", "💀"] 33 | for char in unmapped_chars: 34 | with self.subTest(char=char): 35 | self.assertEqual(get_homoglyph_replacement(char), char) 36 | 37 | def test_homoglyph_dict_integrity(self): 38 | """Ensure all mapped characters exist in HOMOGLYPH_MAP.""" 39 | for key, value in HOMOGLYPH_MAP.items(): 40 | with self.subTest(key=key): 41 | self.assertIsInstance(key, str) 42 | self.assertIsInstance(value, str) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /tests/test_text_sanitization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import string 3 | import unicodedata 4 | import tempfile 5 | from pathlib import Path 6 | 7 | from sanitext.text_sanitization import ( 8 | get_allowed_characters, 9 | sanitize_text, 10 | closest_ascii, 11 | detect_suspicious_characters, 12 | ) 13 | from sanitext.emoji_set import EMOJI_SET 14 | 15 | 16 | @pytest.fixture 17 | def ascii_allowed(): 18 | """ 19 | A fixture that returns the default set of ASCII-printable allowed characters. 20 | """ 21 | return get_allowed_characters() 22 | 23 | 24 | # ------------------------------------------------------------------- 25 | # Tests for get_allowed_characters 26 | # ------------------------------------------------------------------- 27 | 28 | 29 | def test_get_allowed_characters_default(ascii_allowed): 30 | """ 31 | By default, the allowed set should contain `string.printable` but not beyond. 32 | """ 33 | # Check that typical ASCII chars are included 34 | for ch in "ABC123!@# \t\n\r": 35 | assert ch in ascii_allowed, f"Default allowed set should contain '{ch}'" 36 | 37 | # Check that a typical non-ASCII char is excluded 38 | assert "é" not in ascii_allowed, "Default allowed set should NOT contain 'é'" 39 | 40 | # Check that an emoji is excluded 41 | assert "😀" not in ascii_allowed, "Default allowed set should NOT contain '😀'" 42 | 43 | 44 | def test_get_allowed_characters_custom_chars(ascii_allowed): 45 | """ 46 | Adding a custom set of characters via 'allow_chars'. 47 | """ 48 | extra_chars = "ⓝⓔⓦ" 49 | custom_allowed = get_allowed_characters(allow_chars=extra_chars) 50 | 51 | # Everything in ascii_allowed should still be included 52 | for ch in ascii_allowed: 53 | assert ch in custom_allowed 54 | 55 | # The extra characters should also be included 56 | for ch in extra_chars: 57 | assert ch in custom_allowed, f"Custom allowed set should contain '{ch}'" 58 | 59 | 60 | def test_get_allowed_characters_emoji(ascii_allowed): 61 | """ 62 | Adding support for single code point emoji via 'allow_emoji'. 63 | """ 64 | custom_allowed = get_allowed_characters(allow_emoji=True) 65 | 66 | # Everything in ascii_allowed should still be included 67 | for ch in ascii_allowed: 68 | assert ch in custom_allowed 69 | 70 | # Emojis should also be included 71 | assert "😀" in custom_allowed, f"Custom allowed set should contain '😀'" 72 | for ch in EMOJI_SET: 73 | assert ch in custom_allowed, f"Custom allowed set should contain '{ch}'" 74 | 75 | 76 | def test_get_allowed_characters_from_file(ascii_allowed): 77 | """ 78 | Create a temporary file with some extra characters, then pass 79 | that file to 'get_allowed_characters(allow_file=...)'. 80 | """ 81 | extra_chars = "é⛄✅" 82 | with tempfile.TemporaryDirectory() as tmpdir: 83 | filepath = Path(tmpdir) / "allowed.txt" 84 | filepath.write_text(extra_chars, encoding="utf-8") 85 | 86 | custom_allowed = get_allowed_characters(allow_file=filepath) 87 | 88 | # All default ASCII chars must remain 89 | for ch in ascii_allowed: 90 | assert ch in custom_allowed 91 | 92 | # Now our extra characters from the file should be included 93 | for ch in extra_chars: 94 | assert ch in custom_allowed 95 | 96 | 97 | # ------------------------------------------------------------------- 98 | # Tests for closest_ascii 99 | # ------------------------------------------------------------------- 100 | 101 | 102 | @pytest.mark.parametrize( 103 | "char,expected", 104 | [ 105 | # 1) Simple accent 106 | ("é", "e"), 107 | # 2) Example with a roman numeral => "Ⅵ" => "VI" 108 | ("Ⅵ", "VI"), 109 | # 3) Ligature fi => "fi" => "fi" 110 | ("fi", "fi"), 111 | # 4) Already ASCII => "A" => "A" 112 | ("A", "A"), 113 | ], 114 | ) 115 | def test_closest_ascii_simple(char, expected, ascii_allowed): 116 | """ 117 | Tests straightforward decomposition or normalization. 118 | """ 119 | replaced = closest_ascii(char, ascii_allowed) 120 | assert replaced == expected, f"Expected '{char}' -> '{expected}', got '{replaced}'" 121 | 122 | 123 | def test_closest_ascii_disallowed_result(ascii_allowed): 124 | """ 125 | If the normalization yields some chars not in the allowed set, those should be dropped. 126 | For example, if we artificially remove 'V' from the allowed set, then 'Ⅵ' might degrade further. 127 | """ 128 | custom_set = ascii_allowed.copy() 129 | custom_set.discard("V") # Remove 'V' from the set to cause a partial fallback 130 | 131 | replaced = closest_ascii("Ⅵ", custom_set) 132 | # "Ⅵ" normally decomposes to "V" + "I", but 'V' is disallowed, 133 | # so only "I" can remain if "I" is still in the set. 134 | assert replaced == "I", f"Expected 'I' if 'V' is disallowed, but got '{replaced}'" 135 | 136 | 137 | def test_closest_ascii_no_decomposition_or_normalization(ascii_allowed): 138 | """ 139 | If there's no decomposition or normalization that yields an allowed char, 140 | we expect an empty string. 141 | """ 142 | # Choose a random symbol that doesn't decompose to ASCII, e.g. '☯' 143 | symbol = "☯" 144 | replaced = closest_ascii(symbol, ascii_allowed) 145 | # Default ASCII set doesn't include '☯', 146 | # and it doesn't have a direct NFKC or decomposition to ASCII letters. 147 | assert replaced == "", f"Expected an empty string if no decomposition possible." 148 | 149 | 150 | # ------------------------------------------------------------------- 151 | # Tests for detect_suspicious_characters 152 | # ------------------------------------------------------------------- 153 | 154 | 155 | def test_detect_suspicious_characters_none(ascii_allowed): 156 | text = "Hello, world!\n\t123" 157 | # This is all within ASCII printable, so we expect an empty list 158 | suspicious = detect_suspicious_characters(text, ascii_allowed) 159 | assert suspicious == [], "Expected no suspicious characters." 160 | 161 | 162 | def test_detect_suspicious_characters_mixed(ascii_allowed): 163 | text = "Hello, wörld! Ⅵ abc fi і\n" 164 | # Among these, "ö", "Ⅵ", and "fi" are not in default ASCII allowed 165 | suspicious = detect_suspicious_characters(text, ascii_allowed) 166 | # We'll just check that we found them, not the exact order 167 | found_chars = [item[0] for item in suspicious] 168 | assert "ö" in found_chars 169 | assert "Ⅵ" in found_chars 170 | assert "fi" in found_chars 171 | assert "і" in found_chars 172 | 173 | # Also verify that we get the correct Unicode names 174 | # (we won't do an exact match because names can differ slightly by Python version, 175 | # but we can do a substring check or partial check) 176 | for ch, name in suspicious: 177 | assert ch in found_chars 178 | # Just do a sanity check 179 | assert len(name) > 1, "Unicode name should be a non-empty string." 180 | 181 | 182 | def test_detect_suspicious_characters_empty(ascii_allowed): 183 | text = "" 184 | suspicious = detect_suspicious_characters(text, ascii_allowed) 185 | assert suspicious == [], "Empty text should yield no suspicious characters." 186 | 187 | 188 | @pytest.mark.parametrize( 189 | "text, expected", 190 | [ 191 | ("Hello, world! 👋", [("👋", "WAVING HAND SIGN")]), 192 | ( 193 | "Thіs tеxt cоntaіns homoglyphs.", # Uses homoglyphs 194 | [ 195 | ("і", "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I"), 196 | ("е", "CYRILLIC SMALL LETTER IE"), 197 | ("о", "CYRILLIC SMALL LETTER O"), 198 | ("і", "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I"), 199 | ], 200 | ), 201 | ("Normal ASCII text.", []), # No anomalies 202 | ("Invisible​ character.", [("​", "ZERO WIDTH SPACE")]), # Invisible character 203 | ( 204 | "𝑇ℎ𝑖𝑠.", # Unicode math characters 205 | [ 206 | ("𝑇", "MATHEMATICAL ITALIC CAPITAL T"), 207 | ("ℎ", "PLANCK CONSTANT"), 208 | ("𝑖", "MATHEMATICAL ITALIC SMALL I"), 209 | ("𝑠", "MATHEMATICAL ITALIC SMALL S"), 210 | ], 211 | ), 212 | ("​", [("​", "ZERO WIDTH SPACE")]), # Just an invisible character 213 | ( 214 | "​ ​", # Multiple invisible characters 215 | [ 216 | ("​", "ZERO WIDTH SPACE"), 217 | (" ", "NARROW NO-BREAK SPACE"), 218 | ("​", "ZERO WIDTH SPACE"), 219 | ], 220 | ), 221 | ("", []), # Empty string 222 | ], 223 | ) 224 | def test_detect_suspicious_characters_parametrized(text, expected): 225 | detected_characters = detect_suspicious_characters( 226 | text, allowed_characters=get_allowed_characters() 227 | ) 228 | assert detected_characters == expected, ( 229 | f"Failed text: {text}, " 230 | f"Found: {detected_characters}, " 231 | f"Expected: {expected}" 232 | ) 233 | 234 | 235 | # ------------------------------------------------------------------- 236 | # Tests for sanitize_text (non-interactive) 237 | # ------------------------------------------------------------------- 238 | 239 | 240 | # TODO: how to run only one of these easily 241 | @pytest.mark.parametrize( 242 | "text, expected", 243 | [ 244 | # No disallowed => Should remain the same 245 | ("Hello, world!\n", "Hello, world!\n"), 246 | # Contains a decomposable char => e.g., "é" 247 | ("Café", "Cafe"), 248 | # Contains a symbol that can't be decomposed => e.g., "☯" 249 | ("Peace ☯ within", "Peace within"), # '☯' replaced with "" 250 | # Mixed example => "Ⅵ is VI" => "VI is VI" 251 | ("Ⅵ is VI", "VI is VI"), 252 | # Homoglyphs 253 | ( 254 | "Thіs tеxt cоntaіns homoglyphs.", 255 | "This text contains homoglyphs.", 256 | ), 257 | # No changes 258 | ("Normal ASCII text.", "Normal ASCII text."), 259 | # Remove invisible character 260 | ("Invisible​ character.", "Invisible character."), 261 | # Convert math bold 262 | ("𝑇ℎ𝑖𝑠 𝑡𝑒𝑥𝑡 𝑢𝑠𝑒𝑠 𝑚𝑎𝑡ℎ 𝑏𝑜𝑙𝑑.", "This text uses math bold."), 263 | # Remove multiple invisible characters 264 | ("​ ​", " "), 265 | # Remove standalone invisible character 266 | ("​", ""), 267 | # Empty input should remain empty 268 | ("", ""), 269 | ], 270 | ) 271 | def test_sanitize_text_default(text, expected, ascii_allowed): 272 | """ 273 | By default (non-interactive), sanitize_text should use closest_ascii 274 | to handle disallowed characters. 275 | """ 276 | sanitized = sanitize_text(text, allowed_characters=ascii_allowed, interactive=False) 277 | assert sanitized == expected, ( 278 | f"Failed text: {text}, " f"Found: {sanitized}, " f"Expected: {expected}" 279 | ) 280 | 281 | 282 | def test_sanitize_text_no_disallowed_return_same(ascii_allowed): 283 | text = "Just ASCII printable stuff 123 !@#\n" 284 | sanitized = sanitize_text(text, allowed_characters=ascii_allowed, interactive=False) 285 | assert ( 286 | sanitized == text 287 | ), "If there's nothing disallowed, we should get the exact same string." 288 | 289 | 290 | def test_sanitize_text_all_disallowed(): 291 | """ 292 | If the allowed set is very small, basically everything should get replaced or removed. 293 | """ 294 | # Suppose we only allow 'A' and 'B' 295 | minimal_allowed = set("AB") 296 | text = "Hello, world! Café Ⅵ" 297 | # 'H', 'e', 'l', 'o', etc. are not in minimal_allowed 298 | # closest_ascii attempts might degrade them. 299 | # But eventually, if 'h' -> 'h' is not allowed, it becomes '' 300 | # because there's no further decomposition that leads to A/B/C. 301 | # We might still get partial decompositions for e.g. "é" => "e" => still not allowed => "" 302 | # "Ⅵ" => "VI" => 'V' not allowed => '', 'I' is not allowed => '' => total => '' 303 | sanitized = sanitize_text( 304 | text, allowed_characters=minimal_allowed, interactive=False 305 | ) 306 | assert ( 307 | sanitized == "" 308 | ), f"With a minimal allowed set, everything gets removed or replaced with ''. Got: {sanitized}" 309 | 310 | 311 | # ------------------------------------------------------------------- 312 | # Testing interactive mode 313 | # ------------------------------------------------------------------- 314 | def test_sanitize_text_interactive(monkeypatch, ascii_allowed): 315 | """ 316 | Mock user input for interactive decisions: 317 | - keep 'é' 318 | - remove 'ø' 319 | - replace '☯' with '?' 320 | """ 321 | # We'll contrive a text with exactly three distinct disallowed characters: é, ☯, and ø 322 | text = "Hello é, ø, and ☯!" 323 | 324 | # A queue of user responses: 325 | # 1) 'y' => keep 'é' 326 | # 2) 'n' => remove 'ø' 327 | # 3) 'r' => replace '☯' with '?' 328 | inputs = iter(["y", "n", "r", "?"]) 329 | 330 | def fake_input(prompt): 331 | return next(inputs) 332 | 333 | # Use monkeypatch to replace 'input' 334 | monkeypatch.setattr("builtins.input", fake_input) 335 | 336 | sanitized = sanitize_text(text, allowed_characters=ascii_allowed, interactive=True) 337 | # We expect: "Hello é, , and ?!" 338 | # Because 'é' was kept, 'ø' was removed, '☯' was replaced with '?' 339 | assert sanitized == "Hello é, , and ?!" 340 | 341 | 342 | def test_sanitize_text_interactive_repeated_characters(monkeypatch, ascii_allowed): 343 | """ 344 | Mock user input for interactive decisions: 345 | - é => Replace with "!" 346 | No need to ask again, even though é appears 3 times because its fate 347 | has been decided 348 | """ 349 | # We'll contrive a text with exactly three distinct disallowed characters: é, ☯, and ø 350 | text = "Hello é, é, and é!" 351 | 352 | # 'y' => replace 'é' with '!' 353 | inputs = iter(["r", "!"]) 354 | 355 | def fake_input(prompt): 356 | return next(inputs) 357 | 358 | # Use monkeypatch to replace 'input' 359 | monkeypatch.setattr("builtins.input", fake_input) 360 | 361 | sanitized = sanitize_text(text, allowed_characters=ascii_allowed, interactive=True) 362 | # We expect: "Hello !, !, and !!" 363 | # Because all 'é' were replaced with '!' 364 | assert sanitized == "Hello !, !, and !!" 365 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py39, py310, py311, py312 3 | 4 | [testenv] 5 | deps = pytest 6 | commands = pytest tests/ --------------------------------------------------------------------------------