├── .gitignore ├── LICENSE ├── README.md ├── assets ├── dictionary.png └── dictionary_myoutput.png ├── extract.py └── reverse_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # More 132 | cache*.pkl 133 | .idea/ 134 | 135 | # Test files, old files 136 | lookup/ 137 | old/ 138 | test_files/ 139 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 fab-jul 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # parse_dictionaries 2 | 3 | 4 | The following blog post contains some background 5 | about this repo: 6 | [Reverse-Engineering Apple Dictionary](https://fmentzer.github.io/posts/2020/dictionary/). 7 | 8 | ## Parsing with `reverse_data.py` 9 | 10 | 11 | Parses the great Apple Dictionary 12 | (for now tested with the New Oxford American Dictionary). 13 | 14 | Here is what the built-in Dictionary app gives for "handle": 15 | 16 |
17 | <img src="assets/dictionary.png" alt="Dictionary.app entry for handle"/> 18 |
19 | 20 | And here is what this script gives (on a Mac), with: 21 | 22 | ```bash 23 | # 24 | # NOTE: might be at a different location for you! 25 | # If so, you can find it using a glob (see also the Python sketch below), e.g., 26 | # ls /System/Library/AssetsV2/ \ 27 | # com_apple_MobileAsset_DictionaryServices_dictionaryOSX/*/* 28 | # then take the one that has `New Oxford American Dictionary`! 29 | # 30 | # New Oxford American Dictionary 31 | NOAD='/System/Library/AssetsV2/'\ 32 | 'com_apple_MobileAsset_DictionaryServices_dictionaryOSX/'\ 33 | '4094df88727a054b658681dfb74f23702d3c985e.asset/'\ 34 | 'AssetData/'\ 35 | 'New Oxford American Dictionary.dictionary/'\ 36 | 'Contents/Resources/Body.data' 37 | 38 | python reverse_data.py \ 39 | --dictionary_path "$NOAD" --lookup handle --output_path lookup/lookup.html 40 | ``` 41 | 42 |
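If the asset hash above does not exist on your machine, the following is a minimal Python sketch (standard library only, not part of the repo) that applies the glob idea from the NOTE; the `AssetsV2` layout is an assumption and may differ between macOS versions:

```python
# Sketch: locate Body.data for the New Oxford American Dictionary.
# Assumes the AssetsV2 layout mentioned in the NOTE above.
import glob

pattern = ('/System/Library/AssetsV2/'
           'com_apple_MobileAsset_DictionaryServices_dictionaryOSX/'
           '*/AssetData/New Oxford American Dictionary.dictionary/'
           'Contents/Resources/Body.data')
for path in glob.glob(pattern):
    print(path)
```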
43 | <img src="assets/dictionary_myoutput.png" alt="Output of reverse_data.py for handle"/> 44 |
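The same lookup can also be done from Python instead of the CLI. This is a minimal sketch built only on functions that exist in `reverse_data.py` (`WordDictionary.from_file`, indexing, `get_xml_tree`); note that the first run parses all of `Body.data` and caches the result in `cache_parse.pkl`, so it takes a while:

```python
# Sketch: programmatic lookup with reverse_data.py.
# reverse_data.NOAD must point at a valid Body.data file (see above).
import lxml.etree as etree

import reverse_data

word_dict = reverse_data.WordDictionary.from_file(reverse_data.NOAD)
entry = word_dict['handle']  # Raises KeyError for unknown words.
print(etree.tostring(entry.get_xml_tree(), pretty_print=True).decode())
```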
45 | 46 | ## Extracting words and definitions from a book with `extract.py` 47 | 48 | If you want to split a book into all its words and look them all up, 49 | you can use `extract.py`. This relies on `nltk` to properly get definitions, 50 | e.g., to turn "he builds houses" into `["he", "build", "house"]`. 51 | 52 | ```bash 53 | python extract.py PATH_TO_BOOK.txt PATH_TO_OUTPUT.zip 54 | ``` 55 | 56 | The resulting zip file contains a single file `master.json`, which contains 57 | three keys. Example: 58 | 59 | ```json 60 | { 61 | "definitions": { 62 | "cozen": " str: 73 | if input_encoding: 74 | input_encodings = [input_encoding] 75 | else: 76 | input_encodings = ['utf-8', 'ISO-8859-1', 'ascii'] 77 | 78 | for encoding in input_encodings: 79 | try: 80 | with codecs.open(input_path, mode='r', encoding=encoding) as f: 81 | print(f'Decoded file with {encoding}.') 82 | return f.read() 83 | except UnicodeDecodeError as e: 84 | print(f'Caught {e}') 85 | 86 | raise ValueError("Hmm, file has unknown encoding.") 87 | 88 | 89 | def _write_filtered_dict(words: set, 90 | word_dict: reverse_data.WordDictionary, 91 | scores: Dict[str, float], 92 | text: str, 93 | output_path: str): 94 | filtered_word_dict = word_dict.filtered(words) 95 | dict_of_str = {key: entry.content for key, entry in filtered_word_dict.items()} 96 | master_object = { 97 | 'definitions': dict_of_str, 98 | 'links': filtered_word_dict.links, 99 | 'scores': scores} 100 | 101 | # Write definitions as JSON 102 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 103 | with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf: 104 | zf.writestr('master.json', json.dumps(master_object).encode('utf-8')) 105 | zf.writestr('fulltext.txt', text.encode('utf-8')) 106 | 107 | 108 | def _get_scores(word_counts: Dict[str, int], 109 | word_dict: reverse_data.WordDictionary) -> Dict[str, float]: 110 | """Very crude way to get scores from word_counts and a dictionary. 111 | 112 | Algorithm for a word `w`: 113 | 1. Set score := how often does w occur in the text 114 | (as given by `word_counts`). 115 | 2. If the word is said to be "literary" in the dict, divide score by 2. 116 | 117 | Overall, lower score means "rarer" words. 118 | """ 119 | scores = {word: count for word, count in word_counts.items()} 120 | for word in word_counts: 121 | if 'literary' in word_dict[word].get_info(): 122 | scores[word] /= 2 123 | return scores 124 | 125 | 126 | def _get_word_counts(text: str, 127 | word_dict: reverse_data.WordDictionary): 128 | """Given a text and a dictionary, split the text into words and return counts. 129 | 130 | Done by: 131 | 1. Sanitize text by doing lower case and removing newlines. 132 | 2. Use NLTK's tokenizer 133 | 3. Try to find the base by using NLTK's lemmatizer (i.e. houses -> house), 134 | to increase chances of finding a word in the dictionary 135 | 4. Count the occurences of words. 136 | """ 137 | text = text.lower() 138 | 139 | print('Collapsing newlines...') 140 | text = re.sub('(.)\n', r'\1 ', text) 141 | 142 | print('Tokenizing...') 143 | words = tokenize.word_tokenize(text) 144 | 145 | print('Pruning...') 146 | 147 | # Remove punctuation in tokens, as ntlk tokenizes for example "they'll" as 148 | # [they, 'll]. The resulting "ll" will be ditched in a later stage. 149 | # Also removes tokens that are just quotes, which turn into empty tokens, 150 | # removed at the MIN_WORD_LEN stage below. 151 | words = (w.strip("'.-`\"") for w in words) 152 | # Ditches some genitives and third person singulars. 
In Python 3.9 this 153 | # should be `removesuffix` but the `replace` works well enough in this context. 154 | words = (w.replace("'s", '') for w in words) 155 | # Removes abbreviations such as "e.g." 156 | words = (w for w in words if '.' not in w) 157 | # Removes most punctuation from the list, such as ",", ":", etc., 158 | # also removes empty tokens. 159 | words = (w for w in words if len(w) > MIN_WORD_LEN) 160 | # Removes all numbers 161 | words = (w for w in words if w and not all(c.isdigit() for c in w)) 162 | 163 | print('Counting...') 164 | word_counts = collections.Counter(words) 165 | 166 | print('Lemmatizing...') 167 | lemma = WordNetLemmatizer() 168 | word_counts_lemmad = collections.defaultdict(int) 169 | 170 | # We create a map from word_as_it_appears_in_book to the lemmad 171 | # words to simplify lookup later. Note that it's not exactly 172 | # word_as_it_appears_in_book due to the preprocessing above but oh well. 173 | links = {} 174 | 175 | # Note: assume we have `word_counts` = {"belongs": 4 "belonging":3} 176 | # This results in sth like {"belong": 7, "belonging": 7} in the following. 177 | for w, count in word_counts.items(): 178 | possible_words = [] 179 | if w in word_dict: 180 | possible_words.append(w) 181 | word_is_in_dict = True 182 | else: 183 | word_is_in_dict = False 184 | 185 | for t in wordnet.POS_LIST: 186 | w_lemmad = lemma.lemmatize(w, pos=t) 187 | if w_lemmad != w and w_lemmad in word_dict: 188 | possible_words.append(w_lemmad) 189 | 190 | # Neither the input word nor any lemmad forms are in the dictionary. 191 | if not possible_words: 192 | continue 193 | 194 | # Input word is not in dictionary but some lemmad form is. 195 | # We pick any random of the lemmad forms to make a link, 196 | # hoping it's a good one. 197 | if not word_is_in_dict: 198 | links[w] = next(iter(possible_words)) 199 | 200 | # Build counts dict. 201 | for possible_w in possible_words: 202 | word_counts_lemmad[possible_w] += count 203 | 204 | return word_counts_lemmad, links 205 | 206 | 207 | if __name__ == '__main__': 208 | main() 209 | -------------------------------------------------------------------------------- /reverse_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parse Apple dictionaries given as Body.data files. 3 | 4 | The function that does the heavy lifting is _parse. Overview: 5 | 6 | - The files are just ZIPs of XML entries concatenated with some headers 7 | inbetween 8 | - We greedily try to find the ZIPs and extract the XML 9 | - Some XML parsing is implemented to find interesting stuff (derivatives for 10 | example). 11 | 12 | """ 13 | import argparse 14 | import collections 15 | import contextlib 16 | import itertools 17 | import os 18 | import pickle 19 | import shutil 20 | import zlib 21 | from typing import Dict, List, Tuple, Set 22 | 23 | import lxml.etree as etree 24 | 25 | # New Oxford American Dictionary 26 | NOAD = '/System/Library/AssetsV2/' \ 27 | 'com_apple_MobileAsset_DictionaryServices_dictionaryOSX/' \ 28 | '4094df88727a054b658681dfb74f23702d3c985e.asset/' \ 29 | 'AssetData/' \ 30 | 'New Oxford American Dictionary.dictionary/' \ 31 | 'Contents/Resources/Body.data' 32 | 33 | 34 | # Matches spans that give some meta info, like "literary", "informal", etc. 35 | XPATH_INFO = '//span[@class="lg"]/span[@class="reg"]' 36 | 37 | # This matches the bold words in the definitions. 
For an example, 38 | # see "vital", which contains "noun (vitals)" 39 | XPATH_OTHER_WORDS = '//span[@class="fg"]/span[@class="f"]' 40 | 41 | # This matches the derivatives at the end of definition. 42 | XPATH_DERIVATIVES = '//span[contains(@class, "t_derivatives")]//' \ 43 | 'span[contains(@class, "x_xoh")]/' \ 44 | 'span[@role="text"]' 45 | 46 | OUTPUT_HTML_HEADER = """ 47 | 48 | 49 | 50 | Words 51 | 52 | 53 | 54 | """ 55 | 56 | CUSTOM_CSS = """ 57 | .div-entry { 58 | border-top: 2px solid black; 59 | padding-bottom: 50px; 60 | } 61 | """ 62 | 63 | 64 | def main(): 65 | p = argparse.ArgumentParser() 66 | p.add_argument('--dictionary_path', default=NOAD, 67 | help=f"path to a body.data file. defaults to {NOAD}") 68 | p.add_argument('--lookup', nargs='+', 69 | default=['vital', 'house', 'cozen'], 70 | help='words to lookup') 71 | p.add_argument('--output_path', default='lookup/lookup.html', 72 | help='where to save the words.') 73 | 74 | flags = p.parse_args() 75 | save_definitions(flags.dictionary_path, 76 | flags.lookup, 77 | flags.output_path) 78 | 79 | 80 | def save_definitions(dictionary_path, lookup_words, output_path): 81 | if not dictionary_path.endswith('Body.data'): 82 | raise ValueError(f'Expected a Body.data file, got {dictionary_path}') 83 | 84 | word_dict = WordDictionary.from_file(dictionary_path) 85 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 86 | 87 | with open(output_path, 'w') as f: 88 | f.write(OUTPUT_HTML_HEADER) 89 | with wrap_in_tag(f, 'body'): 90 | for target in lookup_words: 91 | entry = word_dict[target] 92 | t = entry.get_xml_tree() 93 | with wrap_in_tag(f, 'div', attr='class="div-entry"'): 94 | f.write(etree.tostring(t, pretty_print=True).decode()) 95 | 96 | print(f'Saved {len(lookup_words)} definitions at {output_path}.') 97 | 98 | # Copy default and custom CSS to output 99 | css_path = dictionary_path.replace('Body.data', 'DefaultStyle.css') 100 | if not os.path.isfile(css_path): 101 | print(f'WARN: CSS not found at expected path {css_path}') 102 | css_path_out = os.path.join(os.path.dirname(output_path), 103 | os.path.basename(css_path)) 104 | shutil.copy(css_path, css_path_out) 105 | custom_css_path_out = os.path.join(os.path.dirname(output_path), 106 | 'CustomStyle.css') 107 | with open(custom_css_path_out, 'w') as f: 108 | f.write(CUSTOM_CSS) 109 | 110 | 111 | class WordDictionary: 112 | """Rrepresents a dictionary.""" 113 | 114 | @staticmethod 115 | def from_file(p): 116 | d, links = parse(p) 117 | return WordDictionary(d, links) 118 | 119 | def __init__(self, d: Dict[str, 'Entry'], links: Dict[str, str]): 120 | """Constructor. 121 | 122 | :param d: The dictionary, as a dict mapping words to Entry instances. 123 | :param links: Special links, as a dict mapping words to words. Words `w` in 124 | this dict have a definition at `links[w]`. 125 | """ 126 | self.d, self.links = d, links 127 | 128 | def items(self): 129 | return self.d.items() 130 | 131 | def add_links(self, links: Dict[str, str]): 132 | for w, linked_w in links.items(): 133 | # Word already linked, so we should be able to find it. 134 | if w in self.links: 135 | continue 136 | assert linked_w in self 137 | self.links[w] = linked_w 138 | 139 | def filtered(self, words) -> 'WordDictionary': 140 | filtered_dict = {} 141 | filtered_links = {} 142 | for w in words: 143 | filtered_dict[w] = self[w] # May raise! 
144 | if w in self.links: 145 | filtered_links[w] = self.links[w] 146 | return WordDictionary(filtered_dict, filtered_links) 147 | 148 | def __getitem__(self, w) -> 'Entry': 149 | if w in self.d: 150 | return self.d[w] 151 | if w in self.links: 152 | return self.d[self.links[w]] 153 | raise KeyError(w) 154 | 155 | def __contains__(self, w): 156 | return w in self.d or w in self.links 157 | 158 | def __str__(self): 159 | return f'WordDcitionary({len(self.d)} definitions, ' \ 160 | f'{len(self.links)} links)' 161 | 162 | 163 | @contextlib.contextmanager 164 | def wrap_in_tag(f, tag, attr=None): 165 | if attr: 166 | f.write(f'<{tag} {attr}>') 167 | else: 168 | f.write(f'<{tag}>') 169 | yield 170 | f.write(f'') 171 | 172 | 173 | def parse(dictionary_path): 174 | print(f"Parsing {dictionary_path}...") 175 | entries_tuples = _parse(dictionary_path) 176 | print('Augmenting...') 177 | # Some definitions have multiple entries (for example foil in NOAD). 178 | # Merge them here. 179 | entries = merge_same_keys(entries_tuples) 180 | links = _get_links(dictionary_path, entries) 181 | print(f'Links: {len(links)}') 182 | return entries, links 183 | 184 | 185 | def merge_same_keys(entries_tuples: List[Tuple[str, str]]) -> Dict[str, 'Entry']: 186 | entries = {} 187 | for k, e in entries_tuples: 188 | if k in entries: 189 | entries[k].append_definition(e) 190 | else: 191 | entries[k] = Entry(k, e) 192 | return entries 193 | 194 | 195 | def _pickle_cache(p): 196 | """Little helper decorator to store stuff in a pickle cache, used below.""" 197 | def decorator(func): 198 | if os.path.isfile(p): 199 | with open(p, 'rb') as f: 200 | cache = pickle.load(f) 201 | else: 202 | cache = {} 203 | 204 | def new_func(*args, **kwargs): 205 | key = args[0] 206 | if key not in cache: 207 | res = func(*args, **kwargs) 208 | cache[key] = res 209 | with open(p, 'wb') as f: 210 | pickle.dump(cache, f) 211 | else: 212 | print(f'Cached in {p}: {key}') 213 | return cache[key] 214 | 215 | return new_func 216 | return decorator 217 | 218 | 219 | @_pickle_cache('cache_links.pkl') 220 | def _get_links(p, entries): 221 | del p # Only used for cache 222 | links = {} 223 | print('Getting links...') 224 | for i, (key, entry) in enumerate(entries.items()): 225 | if i % 1000 == 0: 226 | progress = i / len(entries) 227 | print(f'\rGetting links: {progress * 100:.1f}%', end='', flush=True) 228 | for w in entry.get_words_and_derivaties(): 229 | if w in entries: 230 | continue 231 | # Word is not in dictionary, add to links 232 | if w in links: 233 | continue 234 | links[w] = key 235 | return links 236 | 237 | 238 | @_pickle_cache('cache_parse.pkl') 239 | def _parse(dictionary_path) -> List[Tuple[str, str]]: 240 | """Parse Body.data into a list of entries given as key, definition tuples.""" 241 | with open(dictionary_path, 'rb') as f: 242 | content_bytes = f.read() 243 | total_bytes = len(content_bytes) 244 | 245 | # The first zip file starts at ~100 bytes: 246 | content_bytes = content_bytes[100:] 247 | 248 | first = True 249 | entries = [] 250 | for i in itertools.count(): 251 | if not content_bytes: # Backup condition in case stop is never True. 252 | break 253 | try: 254 | d = zlib.decompressobj() 255 | res = d.decompress(content_bytes) 256 | new_entries, stop = _split(res, verbose=first) 257 | entries += new_entries 258 | if stop: 259 | break 260 | if i % 10 == 0: 261 | bytes_left = len(content_bytes) # Approximately... 
262 | progress = 1 - bytes_left / total_bytes 263 | print(f'{progress * 100:.1f}% // ' 264 | f'{len(entries)} entries parsed // ' 265 | f'Latest entry: {entries[-1][0]}') 266 | first = False 267 | 268 | # Set content_bytes to the unused data so we can start the search for the 269 | # next zip file. 270 | content_bytes = d.unused_data 271 | 272 | except zlib.error: # Current content_bytes is not a zipfile -> skip a byte. 273 | content_bytes = content_bytes[1:] 274 | 275 | return entries 276 | 277 | 278 | def _split(input_bytes, verbose) -> Tuple[List[Tuple[str, str]], 279 | bool]: 280 | """Split `input_bytes` into a list of tuples (name, definition).""" 281 | printv = print if verbose else lambda *a, **k: ... 282 | 283 | # The first four bytes are always not UTF-8 (not sure why?) 284 | input_bytes = input_bytes[4:] 285 | 286 | printv('Splitting...') 287 | printv(f'{"index": <10}', f'{"bytes": <30}', f'{"as chars"}', 288 | '-' * 50, sep='\n') 289 | 290 | entries = [] 291 | total_offset = 0 292 | stop_further_parsing = False 293 | 294 | while True: 295 | # Find the next newline, which delimits the current entry. 296 | try: 297 | next_offset = input_bytes.index('\n'.encode('utf-8')) 298 | except ValueError: # No more new-lines -> no more entries! 299 | break 300 | 301 | entry_text = input_bytes[:next_offset].decode('utf-8') 302 | 303 | # The final part of the dictionary contains some meta info, which we skip. 304 | # TODO: might only be for the NOAD, so check other dictionaries. 305 | if 'fbm_AdvisoryBoard' in entry_text[:1000]: 306 | print('fbm_AdvisoryBoard detected, stopping...') 307 | stop_further_parsing = True 308 | break 309 | 310 | # Make sure we have a valid entry. 311 | assert (entry_text.startswith('')), \ 313 | f'ENTRY: {entry_text} \n REM: {input_bytes}' 314 | 315 | # The name of the definition is stored in the "d:title" attribute, 316 | # where "d" is the current domain, which we get from the nsmap - the 317 | # actual attribute will be "{com.apple.blabla}title" (including the 318 | # curly brackets). 319 | xml_entry = etree.fromstring(entry_text) 320 | domain = xml_entry.nsmap['d'] 321 | key = '{%s}title' % domain 322 | name = xml_entry.get(key) # Lookup the attribute in the tree. 323 | 324 | entries.append((name, entry_text)) 325 | 326 | printv(f'{next_offset + total_offset: 10d}', 327 | f'{str(input_bytes[next_offset + 1:next_offset + 5]): <30}', 328 | xml_entry.get(key)) 329 | 330 | # There is always 4 bytes of chibberish between entries. Skip them 331 | # and the new lines (for a total of 5 bytes). 332 | input_bytes = input_bytes[next_offset + 5:] 333 | total_offset += next_offset 334 | return entries, stop_further_parsing 335 | 336 | 337 | class Entry: 338 | def __init__(self, key, content): 339 | self.key = key 340 | self.content = content 341 | 342 | # Set to true on the first call to `append_definition`. 343 | # Used in get_xml_tree. 344 | self._multi_definition = False 345 | 346 | # These are lazily populated as they take a while. 347 | self._xml = None 348 | self._info = None 349 | self._words_and_derivatives = None 350 | 351 | def append_definition(self, content): 352 | """Extend self.content with more XML. 353 | 354 | The key here is to make sure the overall content is still valid XML 355 | by wrapping the whole thing in a
<div>, which is handled in `get_xml_tree`; 356 | here we just set _multi_definition. 357 | """ 358 | self._multi_definition = True 359 | self.content += content 360 | 361 | def get_xml_tree(self): 362 | content = self.content 363 | if self._multi_definition: 364 | content = '<div>' + self.content + '</div>
' 365 | return etree.fromstring(content) 366 | 367 | def get_special(self, xpath, replace=None): 368 | matches = self.get_xml().xpath(xpath) 369 | if not matches: 370 | return [] 371 | # Note: May be empty. 372 | texts = [el.text for el in matches if el.text] 373 | if replace: 374 | for r_in, r_out in replace: 375 | texts = [t.replace(r_in, r_out) for t in texts] 376 | texts = [t.strip() for t in texts] 377 | return texts 378 | 379 | def get_xml(self): 380 | if self._xml is None: 381 | self._xml = self.get_xml_tree() 382 | return self._xml 383 | 384 | def get_words_and_derivaties(self): 385 | def _make(): 386 | derivatives = set(self.get_special(XPATH_DERIVATIVES)) 387 | other_words = set(self.get_special(XPATH_OTHER_WORDS, [("the", "")])) 388 | return (derivatives | other_words) - {self.key} 389 | 390 | return _lazy(self, "_words_and_derivatives", _make) 391 | 392 | def get_info(self): 393 | return _lazy(self, "_info", lambda: set(self.get_special(XPATH_INFO))) 394 | 395 | def __str__(self): 396 | return f'Entry({self.key})' 397 | 398 | 399 | def _lazy(obj, ivar, creator): 400 | if getattr(obj, ivar) is None: 401 | setattr(obj, ivar, creator()) 402 | return getattr(obj, ivar) 403 | 404 | 405 | if __name__ == '__main__': 406 | main() 407 | --------------------------------------------------------------------------------
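For working with the archive that `extract.py` writes, the following is a minimal sketch (standard library only) matching the files produced by `_write_filtered_dict` above, i.e. `master.json` with the keys `definitions`, `links`, and `scores`, plus `fulltext.txt`; `PATH_TO_OUTPUT.zip` stands for whatever output path you passed to `extract.py`:

```python
# Sketch: read back the archive written by extract.py.
import json
import zipfile

with zipfile.ZipFile('PATH_TO_OUTPUT.zip') as zf:
    master = json.loads(zf.read('master.json').decode('utf-8'))

print(sorted(master))  # ['definitions', 'links', 'scores']

# Lower score means rarer (see _get_scores); print the ten rarest words.
scores = master['scores']
for word in sorted(scores, key=scores.get)[:10]:
    print(word, scores[word])
```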