├── .gitignore ├── LICENSE ├── README.md ├── anaphones.json ├── anaphones.py ├── anaphones_nontrivial_unique_pronunciation.json ├── anaphones_unique_pronunciation.json ├── ipa-dict-en_US.json └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include 
Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 MCODING, LLC 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Anaphones 2 | 3 | Anaphones are like anagrams but for sounds (phonemes). 4 | Examples include: salami-awesomely, atari-tiara, and beefy-phoebe. 5 | Anaphones can be anagrams, like atari-tiara, but they don't have to be, and most anagrams are not anaphones. 6 | Anaphones also include homophones, like their-there-they're, 7 | but the interesting anaphones are ones that have distinct pronunciations. 8 | 9 | I've compiled a dictionary of them because why not? 10 | The relevant files are: 11 | 12 | - ipa-dict-en_US.json: the phonetic dictionary used to find the anaphones. 13 | - anaphones.json: the complete dictionary of all anaphones I found. 14 | - anaphones_unique_pronunciation.json: 15 | same as anaphones.json but deduplicates pronunciations, so homophones like "there, their, they're" will only have one representative. 16 | 17 | - anaphones_nontrivial_unique_pronunciation.json: additionally filters out words that do not have any anaphones that are not homophones. 18 | - anaphones.py: the Python code used to generate the dictionaries. 
# /anaphones.py
"""Build dictionaries of anaphones (phonetic anagrams) from an IPA dictionary.

Two words are treated as anaphones when their normalized pronunciations
contain exactly the same characters, in any order.  Running the module as a
script reads ``ipa-dict-<lang>.json`` and writes three JSON files:

- ``anaphones.json``
- ``anaphones_unique_pronunciation.json``
- ``anaphones_nontrivial_unique_pronunciation.json``

Requires the third-party ``attrs`` package (imported as ``attr``).
"""
import json
import sys
from collections import defaultdict, UserDict
from typing import Dict, List, Iterator, Iterable, Union, TypeVar, Optional, Callable

import attr

if sys.version_info < (3, 9):
    raise RuntimeError("This module requires Python 3.9 or higher")

# Characters dropped in addition to everything that fails ``str.isalpha()``.
# The two IPA stress marks are modifier letters, which ``str.isalpha()``
# accepts, so they have to be excluded explicitly.
_EXCLUDED_CHARS = 'ˈˌ.-"\''

T = TypeVar('T')


def clean_str_to_alphanum(word: str) -> str:
    """Return *word* reduced to its alphabetic characters.

    Despite the name, digits are removed as well: only characters for which
    ``str.isalpha()`` is true are kept, minus those in ``_EXCLUDED_CHARS``.
    """
    return "".join(c for c in word if c.isalpha() and c not in _EXCLUDED_CHARS)


def normalize_pronunciation(pronunciation: str) -> str:
    """Strip the surrounding ``/`` delimiters and clean the pronunciation."""
    return clean_str_to_alphanum(pronunciation.strip('/'))


def sorted_str(word: str) -> str:
    """Return the characters of *word* in sorted order, as a string."""
    return "".join(sorted(word))


@attr.s(frozen=True, slots=True)
class Spelling:
    """The written form of a word; cleaned by ``clean_str_to_alphanum``."""

    s: str = attr.ib(converter=clean_str_to_alphanum)

    def __str__(self) -> str:
        return self.s

    def sorted(self) -> 'Spelling':
        """Return a ``Spelling`` whose letters are in sorted order."""
        return Spelling(sorted_str(self.s))

    def is_anagram_of(self, other: 'Spelling') -> bool:
        """Return True if both spellings use exactly the same letters."""
        return self.sorted() == other.sorted()


@attr.s(frozen=True, slots=True)
class Pronunciation:
    """A pronunciation string; cleaned by ``normalize_pronunciation``."""

    s: str = attr.ib(converter=normalize_pronunciation)

    def __str__(self) -> str:
        return self.s

    def sorted(self) -> 'Pronunciation':
        """Return a ``Pronunciation`` whose symbols are in sorted order."""
        return Pronunciation(sorted_str(self.s))

    def is_anagram_of(self, other: 'Pronunciation') -> bool:
        """Return True if both pronunciations use exactly the same symbols."""
        return self.sorted() == other.sorted()


@attr.s(frozen=True, slots=True)
class PronouncedWord:
    """A spelling paired with one of its pronunciations."""

    spelling: Spelling = attr.ib()
    pronunciation: Pronunciation = attr.ib()

    def __str__(self) -> str:
        return f'({self.spelling}, {self.pronunciation})'

    def is_anagram_of(self, other: 'PronouncedWord') -> bool:
        """Return True if the two spellings are anagrams."""
        return self.spelling.is_anagram_of(other.spelling)

    def is_phonetic_anagram_of(self, other: 'PronouncedWord') -> bool:
        """Return True if the two pronunciations are anagrams."""
        return self.pronunciation.is_anagram_of(other.pronunciation)

    def json_safe_str(self) -> str:
        """Return ``'<spelling>:<pronunciation>'`` for use in the JSON output."""
        return f'{self.spelling}:{self.pronunciation}'


def deduplicate_pronunciations(words: Iterable[PronouncedWord]) -> List[PronouncedWord]:
    """Keep only the first word seen for each distinct pronunciation.

    The relative order of the surviving words is preserved.
    """
    seen_pronunciations = set()
    unique = []
    for word in words:
        if word.pronunciation not in seen_pronunciations:
            unique.append(word)
            seen_pronunciations.add(word.pronunciation)
    return unique


class IPADict(UserDict[Spelling, List[Pronunciation]]):
    """Mapping from a spelling to the list of its distinct pronunciations."""

    @classmethod
    def from_file(cls, filename: str, language: str) -> 'IPADict':
        """Load the dictionary for *language* from the JSON file *filename*.

        The file's top-level object is indexed by *language*; that entry is
        unpacked as a one-element sequence holding a mapping from word to a
        ``', '``-separated string of pronunciations.

        Raises:
            KeyError: if *language* is not a key of the file's top-level object.
            ValueError: if the entry does not hold exactly one element.
        """
        with open(filename, encoding='utf-8') as fp:
            (d,) = json.load(fp)[language]
        # dict.fromkeys removes duplicates while keeping first-seen order.
        # A set would order the pronunciations by their (randomized) string
        # hashes, making the generated files differ from run to run.
        # NOTE(review): keys that clean to the same Spelling (e.g. differing
        # only in punctuation) overwrite one another in this comprehension.
        return cls({
            Spelling(k): list(dict.fromkeys(Pronunciation(x) for x in v.split(', ')))
            for k, v in d.items()
        })

    def flatten_to_pronounced_words(self) -> Iterator[PronouncedWord]:
        """Yield one ``PronouncedWord`` per (spelling, pronunciation) pair."""
        for spelling, pronunciations in self.data.items():
            for pronunciation in pronunciations:
                yield PronouncedWord(spelling, pronunciation)


def find_anagrams(spellings: Iterable[T], key: Optional[Callable[[T], str]] = None) -> Dict[Pronunciation, List[T]]:
    """Group items whose key strings are anagrams of one another.

    Args:
        spellings: the items to group.
        key: maps an item to the string to compare; when omitted, every item
            must itself be a ``str``.

    Returns:
        A dict from the sorted key string (wrapped in ``Pronunciation``) to
        the items sharing it, in the order they were encountered.

    Raises:
        TypeError: if *key* is None and an item is not a ``str``.
    """
    groups: Dict[str, List[T]] = defaultdict(list)
    for w in spellings:
        s: str
        if key is not None:
            s = key(w)
        elif isinstance(w, str):
            s = w
        else:
            raise TypeError(
                f"expected str items when no key is given, got {type(w).__name__}"
            )
        groups[sorted_str(s)].append(w)
    return {Pronunciation(letters): word_list for letters, word_list in groups.items()}


class PhoneticAnagramDict(UserDict[Pronunciation, List[PronouncedWord]]):
    """Mapping from a sorted pronunciation to every word that matches it."""

    @classmethod
    def from_IPADict(cls, ipa_dict: IPADict) -> 'PhoneticAnagramDict':
        """Group all pronounced words of *ipa_dict* by sorted pronunciation."""
        def get_pronunciation_str(x: PronouncedWord) -> str:
            return x.pronunciation.s

        phonetic_anagrams = find_anagrams(ipa_dict.flatten_to_pronounced_words(),
                                          key=get_pronunciation_str)
        return cls(phonetic_anagrams)

    def __getitem__(self, pronunciation: Union[str, Pronunciation]) -> List[PronouncedWord]:
        """Look up the anaphones of *pronunciation*.

        The argument does not need to be sorted; it is normalized and sorted
        before the lookup.  Raises ``KeyError`` when nothing matches.
        """
        if not isinstance(pronunciation, Pronunciation):
            pronunciation = Pronunciation(pronunciation)
        return self.data[pronunciation.sorted()]

    def __str__(self) -> str:
        return str(self.data)


def main(lang: str = 'en_US') -> None:
    """Generate the three anaphone JSON files in the current directory.

    Args:
        lang: language code selecting ``ipa-dict-<lang>.json``; feel free to
            download other languages or dictionaries.
    """
    dict_filename = f"ipa-dict-{lang}.json"
    spelling_to_pronunciations = IPADict.from_file(dict_filename, lang)

    phonetic_anagrams_dict = PhoneticAnagramDict.from_IPADict(spelling_to_pronunciations)

    anaphones: Dict[str, str] = {}
    anaphones_unique_pronunciation: Dict[str, str] = {}
    anaphones_nontrivial_unique_pronunciation: Dict[str, str] = {}

    for pronounced_word in spelling_to_pronunciations.flatten_to_pronounced_words():
        anas = phonetic_anagrams_dict[pronounced_word.pronunciation]
        key_str = pronounced_word.json_safe_str()
        anaphones[key_str] = ', '.join([x.json_safe_str() for x in anas])

        anas = deduplicate_pronunciations(anas)
        joined_anas = ', '.join([x.json_safe_str() for x in anas])
        anaphones_unique_pronunciation[key_str] = joined_anas

        # More than one distinct pronunciation means the word has at least
        # one anaphone that is not merely a homophone.
        if len(anas) > 1:
            anaphones_nontrivial_unique_pronunciation[key_str] = joined_anas

    dicts_files = [(anaphones, "anaphones.json"),
                   (anaphones_unique_pronunciation, "anaphones_unique_pronunciation.json"),
                   (anaphones_nontrivial_unique_pronunciation, "anaphones_nontrivial_unique_pronunciation.json")]

    for d, filename in dicts_files:
        with open(filename, "w", encoding='utf-8') as f:
            print(f"writing file {filename}")
            json.dump(d, f, indent="\t", sort_keys=True, ensure_ascii=False)


if __name__ == '__main__':
    import time

    start = time.perf_counter()
    main()
    end = time.perf_counter()
    elapsed = end - start
    print(f'finished in {elapsed:.02f}s')

# /requirements.txt:
#   attrs
#   mypy