├── .gitignore
├── LICENSE
├── README.md
├── anaphones.json
├── anaphones.py
├── anaphones_nontrivial_unique_pronunciation.json
├── anaphones_unique_pronunciation.json
├── ipa-dict-en_US.json
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 
134 | # pytype static type analyzer
135 | .pytype/
136 | 
137 | # Cython debug symbols
138 | cython_debug/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2021 MCODING, LLC
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Anaphones
 2 | 
 3 | Anaphones are like anagrams but for sounds (phonemes).
 4 | Examples include: salami-awesomely, atari-tiara, and beefy-phoebe.
 5 | Anaphones can be anagrams, like atari-tiara, but they don't have to be, and most anagrams are not anaphones.
 6 | Anaphones also include homophones, like their-there-they're,
 7 | but the interesting anaphones are ones that have distinct pronunciations.
 8 | 
 9 | I've compiled a dictionary of them because why not?
10 | The relevant files are:
11 | 
12 | - ipa-dict-en_US.json: the phonetic dictionary used to find the anaphones.
13 | - anaphones.json: the complete dictionary of all anaphones I found.
 14 | - anaphones_unique_pronunciation.json:
 15 |   same as anaphones.json, but each entry's list is deduplicated by pronunciation, so a group of homophones like "there, their, they're" has only one representative.
16 |     
 17 | - anaphones_nontrivial_unique_pronunciation.json: additionally drops words whose only anaphones are homophones, i.e. it keeps only words that have at least one anaphone with a different pronunciation.
18 | - anaphones.py: the Python code used to generate the dictionaries.


--------------------------------------------------------------------------------
/anaphones.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from collections import defaultdict, UserDict
  4 | from typing import Dict, List, Iterator, Iterable, Union, TypeVar, Optional, Callable
  5 | 
  6 | import attr
  7 | 
# Fail fast with a readable error on interpreters older than Python 3.9.
# NOTE(review): presumably required because the classes in this module
# subscript ``UserDict[...]`` at class-definition time -- confirm.
if sys.version_info < (3, 9):
    raise RuntimeError("This module requires Python 3.9 or higher")
 10 | 
 11 | 
 12 | def clean_str_to_alphanum(word: str) -> str:
 13 |     return "".join(c for c in word if c.isalpha() and c not in 'ˈˌ.-"\'')
 14 | 
 15 | 
 16 | def normalize_pronunciation(pronunciation: str) -> str:
 17 |     return clean_str_to_alphanum(pronunciation.strip('/'))
 18 | 
 19 | 
 20 | @attr.s(frozen=True, slots=True)
 21 | class Spelling:
 22 |     s: str = attr.ib(converter=clean_str_to_alphanum)
 23 | 
 24 |     def __str__(self) -> str:
 25 |         return self.s
 26 | 
 27 |     def sorted(self) -> 'Spelling':
 28 |         return Spelling(sorted_str(self.s))
 29 | 
 30 |     def is_anagram_of(self, other: 'Spelling') -> bool:
 31 |         return self.sorted() == other.sorted()
 32 | 
 33 | 
 34 | @attr.s(frozen=True, slots=True)
 35 | class Pronunciation:
 36 |     s: str = attr.ib(converter=normalize_pronunciation)
 37 | 
 38 |     def __str__(self) -> str:
 39 |         return self.s
 40 | 
 41 |     def sorted(self) -> 'Pronunciation':
 42 |         return Pronunciation(sorted_str(self.s))
 43 | 
 44 |     def is_anagram_of(self, other: 'Pronunciation') -> bool:
 45 |         return self.sorted() == other.sorted()
 46 | 
 47 | 
 48 | @attr.s(frozen=True, slots=True)
 49 | class PronouncedWord:
 50 |     spelling: Spelling = attr.ib()
 51 |     pronunciation: Pronunciation = attr.ib()
 52 | 
 53 |     def __str__(self) -> str:
 54 |         return f'({str(self.spelling)}, {str(self.pronunciation)})'
 55 | 
 56 |     def is_anagram_of(self, other: 'PronouncedWord') -> bool:
 57 |         return self.spelling.is_anagram_of(other.spelling)
 58 | 
 59 |     def is_phonetic_anagram_of(self, other: 'PronouncedWord') -> bool:
 60 |         return self.pronunciation.is_anagram_of(other.pronunciation)
 61 | 
 62 |     def json_safe_str(self) -> str:
 63 |         return f'{self.spelling}:{self.pronunciation}'
 64 | 
 65 | 
 66 | def deduplicate_pronunciations(words: Iterable[PronouncedWord]) -> List[PronouncedWord]:
 67 |     seen_pronunciations = set()
 68 |     unique = []
 69 |     for word in words:
 70 |         if word.pronunciation not in seen_pronunciations:
 71 |             unique.append(word)
 72 |             seen_pronunciations.add(word.pronunciation)
 73 |     return unique
 74 | 
 75 | 
 76 | class IPADict(UserDict[Spelling, List[Pronunciation]]):
 77 | 
 78 |     @staticmethod
 79 |     def from_file(filename: str, language: str) -> 'IPADict':
 80 |         with open(filename, encoding='utf-8') as fp:
 81 |             (d,) = json.load(fp)[language]
 82 |         return IPADict({
 83 |             Spelling(k): list(set(Pronunciation(x) for x in v.split(', ')))
 84 |             for k, v in d.items()
 85 |         })
 86 | 
 87 |     def flatten_to_pronounced_words(self) -> Iterator[PronouncedWord]:
 88 |         for spelling, pronunciations in self.data.items():
 89 |             for pronunciation in pronunciations:
 90 |                 yield PronouncedWord(spelling, pronunciation)
 91 | 
 92 | 
 93 | def sorted_str(word: str) -> str:
 94 |     return "".join(sorted(word))
 95 | 
 96 | 
# Generic element type of the items grouped by find_anagrams.
T = TypeVar('T')
 98 | 
 99 | 
def find_anagrams(spellings: Iterable[T], key: Optional[Callable[[T], str]] = None) -> Dict[Pronunciation, List[T]]:
    """Group items whose key strings are permutations of one another.

    Args:
        spellings: Items to group.
        key: Optional function mapping an item to the string whose characters
            decide the grouping. When omitted, every item must itself be a
            ``str``.

    Returns:
        A dict mapping ``Pronunciation(sorted characters)`` to the items that
        share those characters, in input order.

    Raises:
        TypeError: If *key* is omitted and an item is not a ``str``.
    """
    by_sorted_letters: Dict[str, List[T]] = defaultdict(list)
    for w in spellings:
        if key is not None:
            s = key(w)
        elif isinstance(w, str):
            s = w
        else:
            raise TypeError(
                f"expected str items when no key is given, got {type(w).__name__}"
            )
        by_sorted_letters[sorted_str(s)].append(w)

    # ``Pronunciation`` normalizes its argument, so two different sorted
    # strings can become the same key. Merge such groups rather than letting
    # the later one silently replace the earlier one.
    groups: Dict[Pronunciation, List[T]] = {}
    for letters, word_list in by_sorted_letters.items():
        groups.setdefault(Pronunciation(letters), []).extend(word_list)
    return groups
112 | 
113 | 
class PhoneticAnagramDict(UserDict[Pronunciation, List[PronouncedWord]]):
    """Lookup from a sorted pronunciation to the words sharing those sounds."""

    @staticmethod
    def from_IPADict(ipa_dict: IPADict) -> 'PhoneticAnagramDict':
        """Group every pronounced word in *ipa_dict* by its sorted pronunciation."""
        pronounced_words = ipa_dict.flatten_to_pronounced_words()
        grouped = find_anagrams(pronounced_words,
                                key=lambda word: word.pronunciation.s)
        return PhoneticAnagramDict(grouped)

    def __getitem__(self, pronunciation: Union[str, Pronunciation]) -> List[PronouncedWord]:
        """Return the words stored under the sorted form of *pronunciation*.

        A plain string is first wrapped in ``Pronunciation``; the lookup then
        uses the sorted form, so callers may pass an unsorted pronunciation.
        """
        if not isinstance(pronunciation, Pronunciation):
            pronunciation = Pronunciation(pronunciation)
        return self.data[pronunciation.sorted()]

    def __str__(self) -> str:
        """Return the string form of the underlying dict."""
        underlying = self.data
        return str(underlying)
133 | 
134 | 
def main():
    """Generate the three anaphone JSON files from the IPA dictionary.

    Reads ``ipa-dict-<lang>.json`` from the current working directory and
    writes ``anaphones.json``, ``anaphones_unique_pronunciation.json`` and
    ``anaphones_nontrivial_unique_pronunciation.json`` next to it.
    """
    lang = 'en_US'  # feel free to download other languages or dictionaries
    ipa_dict = IPADict.from_file(f"ipa-dict-{lang}.json", lang)
    anagram_lookup = PhoneticAnagramDict.from_IPADict(ipa_dict)

    all_anaphones = {}
    unique_anaphones = {}
    nontrivial_anaphones = {}

    for entry in ipa_dict.flatten_to_pronounced_words():
        matches = anagram_lookup[entry.pronunciation]
        entry_key = entry.json_safe_str()
        all_anaphones[entry_key] = ', '.join(m.json_safe_str() for m in matches)

        # Keep one representative per pronunciation so homophones collapse.
        distinct = deduplicate_pronunciations(matches)
        distinct_joined = ', '.join(m.json_safe_str() for m in distinct)
        unique_anaphones[entry_key] = distinct_joined

        # More than one distinct pronunciation means the word has at least
        # one anaphone that is not merely a homophone.
        if len(distinct) > 1:
            nontrivial_anaphones[entry_key] = distinct_joined

    outputs = {
        "anaphones.json": all_anaphones,
        "anaphones_unique_pronunciation.json": unique_anaphones,
        "anaphones_nontrivial_unique_pronunciation.json": nontrivial_anaphones,
    }

    for filename, mapping in outputs.items():
        with open(filename, "w", encoding='utf-8') as f:
            print(f"writing file {filename}")
            json.dump(mapping, f, indent="\t", sort_keys=True, ensure_ascii=False)
166 | 
167 | 
if __name__ == '__main__':
    import time

    # Time the whole generation run and report it once main() returns.
    started_at = time.perf_counter()
    main()
    elapsed = time.perf_counter() - started_at
    print(f'finished in {elapsed:.02f}s')
175 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | attrs
2 | mypy


--------------------------------------------------------------------------------