├── .pre-commit-config.yaml ├── .github └── workflows │ ├── publish.yml │ └── CI.yml ├── pyproject.toml ├── LICENSE.txt ├── .gitignore ├── cologne_phonetics.py ├── test.py ├── README.rst └── poetry.lock /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - repo: https://github.com/psf/black 8 | rev: 22.10.0 9 | hooks: 10 | - id: black 11 | - repo: https://github.com/astral-sh/ruff-pre-commit 12 | rev: v0.1.13 13 | hooks: 14 | - id: ruff 15 | args: [ --fix ] 16 | - repo: https://github.com/pre-commit/mirrors-mypy 17 | rev: 'v1.8.0' 18 | hooks: 19 | - id: mypy 20 | args: ["."] 21 | pass_filenames: false 22 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | publish-release: 10 | name: upload release to PyPI 11 | runs-on: ubuntu-latest 12 | permissions: 13 | id-token: write 14 | environment: release 15 | steps: 16 | - name: Check out repository 17 | uses: actions/checkout@v4 18 | 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.12" 22 | 23 | - name: Install and configure Poetry 24 | uses: snok/install-poetry@v1 25 | with: 26 | virtualenvs-create: true 27 | virtualenvs-in-project: false 28 | installer-parallel: true 29 | 30 | - name: Build package 31 | run: poetry build 32 | 33 | - name: Publish package distributions to PyPI 34 | uses: pypa/gh-action-pypi-publish@release/v1 35 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: 3.8 18 | - name: Install pre-commit 19 | run: pip install pre-commit 20 | - name: Run pre-commit 21 | run: pre-commit run --all-files 22 | 23 | test: 24 | strategy: 25 | matrix: 26 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"] 27 | os: [ubuntu-latest] 28 | runs-on: ${{ matrix.os }} 29 | steps: 30 | - uses: actions/checkout@v4 31 | - name: Set up Python ${{ matrix.python-version }} 32 | uses: actions/setup-python@v5 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | 36 | - name: Run tests 37 | run: python test.py 38 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "cologne_phonetics" 3 | version = "2.0.0" 4 | description = "Python implementation of the cologne-phonetics algorithm" 5 | authors = ["Janek Nouvertné"] 6 | license = "MIT" 7 | readme = "README.rst" 8 | classifiers = [ 9 | "Programming Language :: Python :: 3.8", 10 | "Programming Language :: Python :: 3.9", 11 | "Programming Language :: Python :: 3.10", 12 | "Programming Language :: Python :: 3.11", 13 | "Programming Language :: Python :: 3.12" 14 | ] 15 | 16 | [tool.poetry.dependencies] 17 | python = ">=3.8" 18 | 19 | 20 | [tool.poetry.group.dev.dependencies] 21 | pre-commit = "<3.6.0" 22 | 23 | 24 | [tool.mypy] 25 | strict = true 26 | exclude = ["test.py"] 27 | 28 | [tool.ruff] 29 | line-length = 88 30 | indent-width = 4 31 | target-version = "py38" 32 | select = ["F", "E", "W", "I", "UP", "TCH", "FA"] 33 | include = ["cologne_phonetics.py"] 34 | 35 | 36 | [build-system] 37 | requires = ["poetry-core>=1.0.0"] 38 | build-backend = "poetry.core.masonry.api" 39 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | .idea/ 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | -------------------------------------------------------------------------------- /cologne_phonetics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Cologne_phonetics is a Python implementation of the cologne-phonetics, a phonetic 4 | algorithm similar to soundex but optimized for the german language 5 | 6 | Documentation can be found at https://github.com/provinzkraut/cologne_phonetics 7 | 8 | A detailed explanation of the cologne phonetics can be found at: 9 | https://en.wikipedia.org/wiki/Cologne_phonetics 10 | """ 11 | from __future__ import annotations 12 | 13 | __author__ = "Janek Nouvertné" 14 | __version__ = "2.0.0" 15 | __license__ = "MIT" 16 | 17 | import re 18 | import sys 19 | import unicodedata 20 | from argparse import ArgumentParser 21 | from typing import Iterable, Pattern 22 | 23 | RGX_SPECIAL_CHARS = re.compile(r"[äüöß]") 24 | 25 | RGX_SPECIAL_CHAR_REPLACEMENTS = [ 26 | (re.compile(r"ä"), "ae"), 27 | (re.compile(r"ö"), "oe"), 28 | (re.compile(r"ü"), "ue"), 29 | (re.compile(r"ß"), "s"), 30 | ] 31 | 32 | RGX_RULES = [ 33 | # ignore special characters that have not been replaced at this point 34 | (re.compile(r"[^a-z]"), ""), 35 | # d,t replacements 36 | # not before c,s,z 37 | (re.compile(r"[dt](?![csz])"), "2"), 38 | # before c,s,z 39 | (re.compile(r"[dt](?=[csz])"), "8"), 40 | # x replacements 41 | # not after c,k,q 42 | (re.compile(r"(? str: 71 | # https://stackoverflow.com/a/518232 72 | return "".join( 73 | c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn" 74 | ) 75 | 76 | 77 | def _replace_by_rules(rules: list[tuple[Pattern[str], str]], s: str) -> str: 78 | for rule in rules: 79 | s = rule[0].sub(rule[1], s) 80 | return s 81 | 82 | 83 | def encode(data: str, concat: bool = False) -> list[tuple[str, str]]: 84 | """ 85 | :param data: Input to be encoded. Whitespace characters will be 86 | interpreted as a wordbreak 87 | :param concat: The intended behaviour of the cologne-phonetics 88 | is to ignore special characters. This leads to concatenation for strings 89 | with hyphens. If ``concat`` is set to ``True``, hyphenated string will be 90 | treated as separate words 91 | 92 | :return: Return a list of tuples containing sanitised input / encoded substring 93 | pairs 94 | 95 | :note: Contrary to many other implementations, in the final pass only 96 | repeated **digits** are removed, not repeated **numbers**. Resulting e.g. 97 | in ``xx`` being encoded as `4848` and not `48`` 98 | """ 99 | 100 | if not concat: 101 | data = data.replace("-", " ") 102 | data = data.lower() 103 | 104 | words_encoded = [] 105 | for word in data.split(" "): 106 | word_clean = _remove_diacritics( 107 | _replace_by_rules(RGX_SPECIAL_CHAR_REPLACEMENTS, word) 108 | ) 109 | word_encoded = _replace_by_rules(RGX_RULES, word_clean) 110 | words_encoded.append((word_clean, word_encoded)) 111 | return words_encoded 112 | 113 | 114 | def compare(*data: str, concat: bool = False) -> bool: 115 | """ 116 | Encode and compare strings. 117 | 118 | :param data: Data to compare. Either at last 2 positional arguments or an iterable 119 | :param concat: Passed to ``encode()`` 120 | 121 | :returns: A boolean, indicating whether all passed data is equal after encoding 122 | :raises: ValueError if only one input string is given 123 | """ 124 | 125 | if ( 126 | not isinstance(data[0], str) 127 | and isinstance(data[0], Iterable) 128 | and len(data) == 1 129 | ): 130 | data = data[0] 131 | 132 | if len(data) == 1: 133 | raise ValueError('Compare called with only one value: "%s"' % data[0]) 134 | 135 | last = None 136 | for s in data: 137 | res = [r[1] for r in encode(s, concat=concat)] 138 | if last and res != last: 139 | return False 140 | else: 141 | last = res 142 | else: 143 | return True 144 | 145 | 146 | def cli(args: list[str] | None = None) -> None: 147 | parser = ArgumentParser(description=__doc__) 148 | parser.add_argument("data", help="string to be encoded") 149 | parser.add_argument( 150 | "-c", 151 | "--concat", 152 | action="store_true", 153 | help="treat words connected by hyphens as separate words", 154 | ) 155 | parser.add_argument( 156 | "-v", "--verbose", action="store_true", help="show detailed information" 157 | ) 158 | parser.add_argument( 159 | "-p", 160 | "--pretty", 161 | action="store_true", 162 | help="use in combination with --verbose to format output nicely", 163 | ) 164 | parsed_args = parser.parse_args(args) 165 | res = encode(parsed_args.data, concat=parsed_args.concat) 166 | if parsed_args.verbose: 167 | sep = "\n" if parsed_args.pretty else ", " 168 | out = sep.join([r[0] + ": " + r[1] for r in res]) 169 | else: 170 | out = ", ".join([r[1] for r in res]) 171 | print(out) 172 | 173 | 174 | if __name__ == "__main__": # pragma: no cover 175 | cli(sys.argv) 176 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest import mock 3 | 4 | import cologne_phonetics 5 | from cologne_phonetics import cli, compare, encode 6 | 7 | 8 | def enc_first(val, **kwargs): 9 | return encode(val, **kwargs)[0][1] 10 | 11 | 12 | class TestEncode(unittest.TestCase): 13 | def multiple_before(self, char=None, before=None, exp=None): 14 | for b in before: 15 | self.assertEqual(encode(char + b), exp) 16 | 17 | def multiple_after(self, char=None, after=None, exp=None): 18 | for b in after: 19 | self.assertEqual(enc_first(b + char), exp) 20 | 21 | def fuzz(self, char, exp, alt_exp="None", fuzzer="h"): 22 | for stmt in (char, fuzzer + char, char + fuzzer): 23 | try: 24 | self.assertEqual(enc_first(stmt), exp) 25 | except AssertionError as e: 26 | if alt_exp == "None": 27 | raise e 28 | self.assertEqual(enc_first(stmt), alt_exp) 29 | 30 | def test_aeijouy(self): 31 | chars = ["a", "ä", "á", "à", "e", "é", "è", "i", "j", "o", "ö", "u", "ü", "y"] 32 | for c in chars: 33 | self.fuzz(c, "0") 34 | 35 | def test_h(self): 36 | self.assertEqual(enc_first("h"), "") 37 | self.fuzz("h", "0", alt_exp="", fuzzer="a") 38 | 39 | def test_h_in_context(self): 40 | self.assertEqual(enc_first("aha"), "0") 41 | self.assertEqual(enc_first("ha"), "0") 42 | self.assertEqual(enc_first("ah"), "0") 43 | 44 | def test_b(self): 45 | self.fuzz("b", "1") 46 | 47 | def test_p_not_before_h(self): 48 | self.assertEqual(enc_first("apa"), "01") 49 | self.assertEqual(enc_first("pa"), "1") 50 | self.assertNotEqual(enc_first("ph"), "1") 51 | 52 | def test_p_before_h(self): 53 | self.assertEqual(enc_first("ph"), "3") 54 | self.assertEqual(enc_first("aph"), "03") 55 | 56 | def test_dt_not_before_csz(self): 57 | self.assertEqual(enc_first("da"), "2") 58 | self.assertNotEqual(enc_first("dc"), "2") 59 | self.assertEqual(enc_first("ta"), "2") 60 | self.assertNotEqual(enc_first("tc"), "2") 61 | 62 | def test_dt_before_csz(self): 63 | self.assertNotEqual(enc_first("da"), "8") 64 | self.assertEqual(enc_first("dc"), "8") 65 | self.assertNotEqual(enc_first("ta"), "8") 66 | self.assertEqual(enc_first("tc"), "8") 67 | 68 | def test_fvw(self): 69 | self.assertEqual(enc_first("fvw"), "3") 70 | self.assertEqual(enc_first("af"), "03") 71 | 72 | def test_gkq(self): 73 | self.assertEqual(enc_first("gkq"), "4") 74 | self.assertEqual(enc_first("ag"), "04") 75 | 76 | def test_c_init_before_ahkloqrux(self): 77 | self.assertEqual(enc_first("ca"), "4") 78 | self.assertNotEqual(enc_first("ac"), "04") 79 | self.assertNotEqual(enc_first("cm"), "4") 80 | 81 | def test_c_before_ahkoqux_not_after_sz(self): 82 | self.assertEqual(enc_first("ch"), "4") 83 | self.assertFalse(enc_first("sc").endswith("4")) 84 | self.assertFalse(enc_first("zc").endswith("4")) 85 | 86 | def test_x(self): 87 | self.fuzz("x", "48") 88 | self.multiple_after(char="x", after="ckq", exp="48") 89 | 90 | def test_l(self): 91 | self.fuzz("l", "5") 92 | 93 | def test_mn(self): 94 | self.fuzz("m", "6") 95 | self.fuzz("n", "6") 96 | 97 | def test_r(self): 98 | self.fuzz("r", "7") 99 | 100 | def test_sz(self): 101 | self.fuzz("s", "8") 102 | self.fuzz("z", "8") 103 | 104 | def test_special_chars(self): 105 | special_chars = { 106 | "ä": "ae", 107 | "á": "a", 108 | "à": "a", 109 | "ü": "ue", 110 | "ö": "oe", 111 | "é": "e", 112 | "è": "e", 113 | "ß": "s", 114 | "ç": "c", 115 | "š": "s", 116 | } 117 | for char, repl in special_chars.items(): 118 | self.assertEqual(enc_first(char), enc_first(repl)) 119 | 120 | def test_ignore_invalid(self): 121 | self.assertEqual(enc_first("ah"), enc_first("ahø")) 122 | 123 | def test_concatenation(self): 124 | self.assertTrue(encode("a-a") == encode("a a")) 125 | self.assertEqual(encode("a-a", concat=True), [("a-a", "0")]) 126 | self.assertEqual(encode("a a", concat=True), [("a", "0"), ("a", "0")]) 127 | 128 | def test_case_insensitive(self): 129 | self.assertEqual(encode("foo"), encode("FoO")) 130 | 131 | def test_returns_altered(self): 132 | self.assertEqual(encode("bäTes")[0][0], "baetes") 133 | 134 | 135 | class TestCompare(unittest.TestCase): 136 | def test_input(self): 137 | self.assertEqual(compare(["a", "b", "c"]), compare("a", "b", "c")) 138 | 139 | def test_iterinput(self): 140 | for i in (["a", "b"], ("a", "b"), {"a", "b"}): 141 | self.assertFalse(compare(i)) 142 | 143 | def test_case_insensitive(self): 144 | self.assertTrue(compare("foo", "FoO")) 145 | 146 | def test_compare(self): 147 | self.assertTrue(compare("a", "a")) 148 | self.assertFalse(compare("a", "x")) 149 | self.assertTrue(compare("foo", "fuh")) 150 | self.assertTrue(compare("foo-foo", "foo-fuh")) 151 | self.assertTrue(compare("foo foo", "foo-fuh")) 152 | self.assertFalse(compare("foo-foo", "foo-fuh", "foo bar")) 153 | 154 | def test_raises_on_one_value(self): 155 | with self.assertRaises(ValueError): 156 | compare("foo") 157 | with self.assertRaises(ValueError): 158 | compare(["foo"]) 159 | with self.assertRaises(ValueError): 160 | compare("f") 161 | with self.assertRaises(ValueError): 162 | compare(["f"]) 163 | 164 | 165 | class TestCLI(unittest.TestCase): 166 | def setUp(self): 167 | cologne_phonetics.print = mock.MagicMock() 168 | self.mock_print = cologne_phonetics.print 169 | 170 | def tearDown(self): 171 | cologne_phonetics.sys.argv = ["test_cologne_phonetics.py"] 172 | 173 | def compare_enc_call(self, mocked, data): 174 | encoded = enc_first(data) 175 | mocked.assert_called_with(encoded) 176 | 177 | def test_encode(self): 178 | cli(["foo"]) 179 | self.mock_print.assert_called_with(enc_first("foo")) 180 | 181 | @mock.patch("cologne_phonetics.encode") 182 | def test_concat(self, mock_encode): 183 | cli(["foo", "-c"]) 184 | mock_encode.assert_called_with("foo", concat=True) 185 | 186 | def test_verbose(self): 187 | cli(["foo", "-v"]) 188 | self.mock_print.assert_called_with("foo: 3") 189 | 190 | def test_pretty(self): 191 | cli(["foo-bar", "-vp"]) 192 | self.mock_print.assert_called_with("foo: 3\nbar: 17") 193 | 194 | 195 | if __name__ == "__main__": 196 | unittest.main() 197 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. _`PyPi`: https://pypi.org/project/cologne-phonetics/ 2 | 3 | 4 | ================= 5 | Cologne-phonetics 6 | ================= 7 | 8 | 9 | .. image:: https://img.shields.io/pypi/pyversions/cologne-phonetics.svg 10 | :alt: PyPI version 11 | 12 | 13 | Contents 14 | ======== 15 | 16 | - `Cologne-phonetics`_ 17 | 18 | - `Introduction`_ 19 | - `Examples`_ 20 | - `Installation`_ 21 | - `Usage`_ 22 | 23 | - `Module contents`_ 24 | 25 | - `encode`_ 26 | - `compare`_ 27 | - `Examples`_ 28 | - `Command line interface`_ 29 | - `Special characters`_ 30 | 31 | - `Word breaks and hyphens`_ 32 | - `Umlaut and special character replacement`_ 33 | 34 | - `Changelog`_ 35 | 36 | - `1.2.0`_ 37 | - `1.2.1`_ 38 | - `1.2.2`_ 39 | - `1.2.3`_ 40 | - `1.2.4`_ 41 | - `1.3.0`_ 42 | - `1.3.1`_ 43 | - `2.0.0`_ 44 | 45 | 46 | 47 | Introduction 48 | ============ 49 | 50 | Cologne-phonetics is a phonetic algorithm similar to Soundex, wich encodes words 51 | into a phonetic code, making it possible to compare how they *sound* rather than how they're *written*. 52 | It was developed by Hans Postel and contrary to Soundex, it's designed specific 53 | for the german language. 54 | 55 | It involves three steps: 56 | 57 | - Generate a code by representing every letter from left to right with a digit, according to a conversion table 58 | - Remove double digits 59 | - Remove every occurrence of '0', except as a leading digit 60 | 61 | The module itself is quite simple and consists only of the `encode`_ and `compare`_ functions 62 | and a simple command line interface. 63 | 64 | 65 | Examples 66 | ======== 67 | 68 | .. code-block:: bash 69 | 70 | $ cologne_phonetics.py "peter pédter" 71 | 127, 127 72 | $ cologne_phonetics.py "umwelt umhwält" 73 | 06352, 06352 74 | $ cologne_phonetics.py "urlaub uhrlaup" 75 | 0751, 0751 76 | 77 | As you can see, similar sounding names produce the same result, with respect to the *correct* pronunciation. 78 | 79 | .. code-block:: bash 80 | 81 | $ cologne_phonetics.py "peter peta" 82 | 127, 12 83 | 84 | This does not give the same result for each word because they may *look* similar, 85 | but (when pronounced correctly) don't really *sound* alike. 86 | 87 | 88 | ============ 89 | Installation 90 | ============ 91 | 92 | cologne_phonetics runs with Python 3.4+ or PyPy 3.5. 93 | It is available on `PyPi`_ and can be installed it via pip: 94 | 95 | .. code-block:: bash 96 | 97 | pip install cologne_phonetics 98 | 99 | 100 | ===== 101 | Usage 102 | ===== 103 | 104 | Module contents 105 | =============== 106 | 107 | .. _encode: 108 | 109 | encode(data, *concat=False*) 110 | Return a list of result tuples. 111 | 112 | Each tuple consists of the string that was encoded and its result. 113 | 114 | If the input string is altered in any way before encoding, the tuple will 115 | contain the altered version. 116 | 117 | .. code-block:: python 118 | 119 | >>> cologne_phonetics.encode("bäteS") 120 | >>> [('baetes', '128')] 121 | 122 | If ``concat=True`` is passed, words connected with hyphens will be treated as 123 | a single words. 124 | 125 | Most of the time, the list will be ``len(result_list) == 1``. Only if the input string 126 | contains a space character or a hyphen it is splitted into substrings and each 127 | substring will be encoded seperately. 128 | 129 | .. _compare: 130 | 131 | compare(\*data, *concat=False*) 132 | Parameter 133 | \*data. Either at last 2 positional arguments or an iterable 134 | Returns 135 | `True` if all encoded strings are equal, else `False` 136 | Raises 137 | `ValueError`. 138 | If only one value is submitted or the submitted Iterable is of lenght 1. 139 | 140 | 141 | Command line interface 142 | ====================== 143 | 144 | .. code-block:: bash 145 | 146 | $ cologne_phonetics.py hello 147 | 05 148 | $ cologne_phonetics.py hello world 149 | 05, 3752 150 | 151 | 152 | Optional arguments 153 | ~~~~~~~~~~~~~~~~~~~~ 154 | 155 | -h, --help 156 | show this help message and exit 157 | -c, --concat 158 | treat words connected by hyphens as seperate words 159 | -v, --verbose 160 | show detailed information 161 | -p, --pretty 162 | format output nicely 163 | 164 | 165 | 166 | =================== 167 | Special characters 168 | =================== 169 | 170 | Special characters are all characters that are not ascii-characters between A and Z. 171 | Most special characters are simply ignored, but even within the set of special characters, 172 | there are some that are even *more* special. 173 | 174 | 175 | Word breaks and hyphens 176 | ======================== 177 | 178 | By default, words connected by hyphens, e.g. ``meier-lüdenscheid`` are seperated. 179 | So ``meier-lüdenscheid`` would become ``'67', '52682'``. If you 180 | want it to be treated as a single word, you can pass a ``concat=True`` 181 | to the encode functions. 182 | 183 | While at first this doesn't seem to make a difference in the result, other than it being split 184 | into a list of strings, in some cases it can make a difference. 185 | 186 | .. code-block:: python 187 | 188 | >>> cologne_phonetics.encode("weiss-chemie") 189 | >>> [('weiss', '38'), ('chemie', '46')] 190 | >>> cologne_phonetics.encode("weiss-chemie", concat=True) 191 | >>> [('weiss-chemie', '386')] 192 | 193 | As you can see, a ``4`` got lost here. 194 | In case you *really* want to compare the concatenated words you may use this option, 195 | but in general there's not much use to it. 196 | 197 | 198 | Umlaut and special character replacement 199 | ========================================= 200 | 201 | Umlaute and some other special characters are converted to their non-special equivalent. 202 | 203 | ====== ========== 204 | Umlaut conversion 205 | ====== ========== 206 | ü ue 207 | ö oe 208 | ä ae 209 | ß s 210 | é e 211 | è e 212 | á a 213 | à a 214 | ====== ========== 215 | 216 | 217 | ========= 218 | Changelog 219 | ========= 220 | 221 | 1.2.0 222 | ===== 223 | 224 | - Removed `encode_many()` 225 | - `encode()` now allways returns a list of result tuples 226 | - Added `--verbose` and `--pretty` options to CLI 227 | - New function: `compare()` 228 | 229 | 1.2.1 230 | ===== 231 | 232 | - Fixed an error that would lead to case sensitive comparison in `compare`_ 233 | 234 | 1.2.2 235 | ===== 236 | 237 | - Another error in `compare`_ was found (and fixed); Compare didn't actually compare output. It compared input. This was due to bad tests and introduced in 1.2.0, with the change that made `encode`_ always return a tuple as a result 238 | 239 | 1.2.3 240 | ===== 241 | 242 | - PyPy 3.5 is now officially supported 243 | - A bug was fixed that would lead `encode`_ to sometimes an preprocessed rather than the altered string in the result tuple 244 | 245 | 246 | 1.2.4 247 | ===== 248 | 249 | - Drop support for Python 3.4 and 3.5 250 | - Add tests for Python 3.8 and 3.9 251 | - Remove deprecated ``Iterable`` import. See #1 252 | 253 | 254 | 1.3.0 255 | ===== 256 | 257 | - Add more robust replacement of diacritic using ``unicodedata`` (provided by `Tobias Bengfort `_ ) 258 | - Add type hints 259 | - Fix issue where ``concat`` parameter of `compare`_ wasn't passed to `encode`_ 260 | 261 | 262 | 1.3.1 263 | ===== 264 | 265 | - Run tests against Python 3.10 266 | - Add missing Readme to pyproject.toml 267 | - Drop Python 3.6 support 268 | 269 | 270 | 2.0.0 271 | ===== 272 | 273 | - Drop Python 3.7 support 274 | - Test against Python 3.11 and 3.12 275 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 2 | 3 | [[package]] 4 | name = "cfgv" 5 | version = "3.4.0" 6 | description = "Validate configuration and produce human readable error messages." 7 | optional = false 8 | python-versions = ">=3.8" 9 | files = [ 10 | {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, 11 | {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, 12 | ] 13 | 14 | [[package]] 15 | name = "distlib" 16 | version = "0.3.8" 17 | description = "Distribution utilities" 18 | optional = false 19 | python-versions = "*" 20 | files = [ 21 | {file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"}, 22 | {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"}, 23 | ] 24 | 25 | [[package]] 26 | name = "filelock" 27 | version = "3.13.1" 28 | description = "A platform independent file lock." 29 | optional = false 30 | python-versions = ">=3.8" 31 | files = [ 32 | {file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"}, 33 | {file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"}, 34 | ] 35 | 36 | [package.extras] 37 | docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"] 38 | testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] 39 | typing = ["typing-extensions (>=4.8)"] 40 | 41 | [[package]] 42 | name = "identify" 43 | version = "2.5.33" 44 | description = "File identification library for Python" 45 | optional = false 46 | python-versions = ">=3.8" 47 | files = [ 48 | {file = "identify-2.5.33-py2.py3-none-any.whl", hash = "sha256:d40ce5fcd762817627670da8a7d8d8e65f24342d14539c59488dc603bf662e34"}, 49 | {file = "identify-2.5.33.tar.gz", hash = "sha256:161558f9fe4559e1557e1bff323e8631f6a0e4837f7497767c1782832f16b62d"}, 50 | ] 51 | 52 | [package.extras] 53 | license = ["ukkonen"] 54 | 55 | [[package]] 56 | name = "nodeenv" 57 | version = "1.8.0" 58 | description = "Node.js virtual environment builder" 59 | optional = false 60 | python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" 61 | files = [ 62 | {file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"}, 63 | {file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"}, 64 | ] 65 | 66 | [package.dependencies] 67 | setuptools = "*" 68 | 69 | [[package]] 70 | name = "platformdirs" 71 | version = "4.1.0" 72 | description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 73 | optional = false 74 | python-versions = ">=3.8" 75 | files = [ 76 | {file = "platformdirs-4.1.0-py3-none-any.whl", hash = "sha256:11c8f37bcca40db96d8144522d925583bdb7a31f7b0e37e3ed4318400a8e2380"}, 77 | {file = "platformdirs-4.1.0.tar.gz", hash = "sha256:906d548203468492d432bcb294d4bc2fff751bf84971fbb2c10918cc206ee420"}, 78 | ] 79 | 80 | [package.extras] 81 | docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] 82 | test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] 83 | 84 | [[package]] 85 | name = "pre-commit" 86 | version = "3.5.0" 87 | description = "A framework for managing and maintaining multi-language pre-commit hooks." 88 | optional = false 89 | python-versions = ">=3.8" 90 | files = [ 91 | {file = "pre_commit-3.5.0-py2.py3-none-any.whl", hash = "sha256:841dc9aef25daba9a0238cd27984041fa0467b4199fc4852e27950664919f660"}, 92 | {file = "pre_commit-3.5.0.tar.gz", hash = "sha256:5804465c675b659b0862f07907f96295d490822a450c4c40e747d0b1c6ebcb32"}, 93 | ] 94 | 95 | [package.dependencies] 96 | cfgv = ">=2.0.0" 97 | identify = ">=1.0.0" 98 | nodeenv = ">=0.11.1" 99 | pyyaml = ">=5.1" 100 | virtualenv = ">=20.10.0" 101 | 102 | [[package]] 103 | name = "pyyaml" 104 | version = "6.0.1" 105 | description = "YAML parser and emitter for Python" 106 | optional = false 107 | python-versions = ">=3.6" 108 | files = [ 109 | {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, 110 | {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, 111 | {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, 112 | {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, 113 | {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, 114 | {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, 115 | {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, 116 | {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, 117 | {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, 118 | {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, 119 | {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, 120 | {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, 121 | {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, 122 | {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, 123 | {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, 124 | {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, 125 | {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, 126 | {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, 127 | {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, 128 | {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, 129 | {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, 130 | {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, 131 | {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, 132 | {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, 133 | {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, 134 | {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, 135 | {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, 136 | {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, 137 | {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, 138 | {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, 139 | {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, 140 | {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, 141 | {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, 142 | {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, 143 | {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, 144 | {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, 145 | {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, 146 | {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, 147 | {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, 148 | {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, 149 | ] 150 | 151 | [[package]] 152 | name = "setuptools" 153 | version = "69.0.3" 154 | description = "Easily download, build, install, upgrade, and uninstall Python packages" 155 | optional = false 156 | python-versions = ">=3.8" 157 | files = [ 158 | {file = "setuptools-69.0.3-py3-none-any.whl", hash = "sha256:385eb4edd9c9d5c17540511303e39a147ce2fc04bc55289c322b9e5904fe2c05"}, 159 | {file = "setuptools-69.0.3.tar.gz", hash = "sha256:be1af57fc409f93647f2e8e4573a142ed38724b8cdd389706a867bb4efcf1e78"}, 160 | ] 161 | 162 | [package.extras] 163 | docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] 164 | testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] 165 | testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] 166 | 167 | [[package]] 168 | name = "virtualenv" 169 | version = "20.25.0" 170 | description = "Virtual Python Environment builder" 171 | optional = false 172 | python-versions = ">=3.7" 173 | files = [ 174 | {file = "virtualenv-20.25.0-py3-none-any.whl", hash = "sha256:4238949c5ffe6876362d9c0180fc6c3a824a7b12b80604eeb8085f2ed7460de3"}, 175 | {file = "virtualenv-20.25.0.tar.gz", hash = "sha256:bf51c0d9c7dd63ea8e44086fa1e4fb1093a31e963b86959257378aef020e1f1b"}, 176 | ] 177 | 178 | [package.dependencies] 179 | distlib = ">=0.3.7,<1" 180 | filelock = ">=3.12.2,<4" 181 | platformdirs = ">=3.9.1,<5" 182 | 183 | [package.extras] 184 | docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] 185 | test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] 186 | 187 | [metadata] 188 | lock-version = "2.0" 189 | python-versions = ">=3.8" 190 | content-hash = "a243f9ddba47b494286ed0ee2c5767edf905cf0d1dc3e7a52d70bc31296b458f" 191 | --------------------------------------------------------------------------------