├── .gitignore
├── LICENSE
├── README.md
├── assets
│   ├── dictionary.png
│   └── dictionary_myoutput.png
├── extract.py
└── reverse_data.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # More
132 | cache*.pkl
133 | .idea/
134 |
135 | # Test files, old files
136 | lookup/
137 | old/
138 | test_files/
139 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 fab-jul
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # parse_dictionaries
2 |
3 |
4 | The following blog post contains some background
5 | about this repo:
6 | [Reverse-Engineering Apple Dictionary](https://fmentzer.github.io/posts/2020/dictionary/).
7 |
8 | ## Parsing with `reverse_data.py`
9 |
10 |
11 | Parses the great Apple Dictionary
12 | (for now tested with the New Oxford American Dictionary).
13 |
14 | Here is what the built-in Dictionary app gives for "handle":
15 |
16 | ![What Dictionary.app shows for "handle"](assets/dictionary.png)
17 |
18 |
19 |
20 | And here is what this script gives (on a Mac), with
21 |
22 | ```bash
23 | #
24 | # NOTE: the dictionary might be at a different location for you!
25 | # If so, you can find it using a glob, e.g.,
26 | #   ls /System/Library/AssetsV2/\
27 | #     com_apple_MobileAsset_DictionaryServices_dictionaryOSX/*/*
28 | # and take the asset that contains `New Oxford American Dictionary`!
29 | #
30 | # New Oxford American Dictionary
31 | NOAD='/System/Library/AssetsV2/'\
32 | 'com_apple_MobileAsset_DictionaryServices_dictionaryOSX/'\
33 | '4094df88727a054b658681dfb74f23702d3c985e.asset/'\
34 | 'AssetData/'\
35 | 'New Oxford American Dictionary.dictionary/'\
36 | 'Contents/Resources/Body.data'
37 |
38 | python reverse_data.py \
39 |   --dictionary_path "$NOAD" --lookup handle --output_path lookup/lookup.html
40 | ```
41 |
42 | ![What this script produces for "handle"](assets/dictionary_myoutput.png)
43 |
44 |
45 |
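You can also drive it from Python. A minimal sketch using the module's own API
(the first parse takes a while; results are cached in `cache_parse.pkl` /
`cache_links.pkl` in the working directory):

```python
import reverse_data

# Parse Body.data into a WordDictionary (definitions plus links).
word_dict = reverse_data.WordDictionary.from_file(reverse_data.NOAD)

# Look up a single word; `content` is the raw <d:entry ...> XML of the definition.
print(word_dict['handle'].content[:200])

# Or write styled HTML for a few words, like the CLI call above.
reverse_data.save_definitions(reverse_data.NOAD, ['handle', 'cozen'],
                              'lookup/lookup.html')
```
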
46 | ## Extracting words and definitions from a book with `extract.py`
47 |
48 | If you want to split a book into all its words and look them all up,
49 | you can use `extract.py`. It relies on `nltk` to tokenize and lemmatize the text,
50 | e.g., to turn "he builds houses" into `["he", "build", "house"]`.
51 |
52 | ```bash
53 | python extract.py PATH_TO_BOOK.txt PATH_TO_OUTPUT.zip
54 | ```
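
Under the hood this is plain NLTK. A rough sketch of the tokenize/lemmatize step
(assumes the relevant NLTK data, e.g. `punkt` and `wordnet`, has been downloaded):

```python
from nltk import tokenize
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()
for word in tokenize.word_tokenize("he builds houses"):
  # Try each part of speech (noun, verb, adjective, adverb) and collect the
  # possible base forms, similar to what extract.py does with wordnet.POS_LIST.
  base_forms = {lemma.lemmatize(word, pos=pos) for pos in ('n', 'v', 'a', 'r')}
  print(word, base_forms)
```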
55 |
56 | The resulting zip file contains `master.json` (plus the full text as
57 | `fulltext.txt`). `master.json` contains three keys. Example:
58 |
59 | ```json
60 | {
61 | "definitions": {
62 | "cozen": " str:
73 | if input_encoding:
74 | input_encodings = [input_encoding]
75 | else:
76 | input_encodings = ['utf-8', 'ISO-8859-1', 'ascii']
77 |
78 | for encoding in input_encodings:
79 | try:
80 | with codecs.open(input_path, mode='r', encoding=encoding) as f:
81 | print(f'Decoded file with {encoding}.')
82 | return f.read()
83 | except UnicodeDecodeError as e:
84 | print(f'Caught {e}')
85 |
86 | raise ValueError("Hmm, file has unknown encoding.")
87 |
88 |
89 | def _write_filtered_dict(words: set,
90 | word_dict: reverse_data.WordDictionary,
91 | scores: Dict[str, float],
92 | text: str,
93 | output_path: str):
94 | filtered_word_dict = word_dict.filtered(words)
95 | dict_of_str = {key: entry.content for key, entry in filtered_word_dict.items()}
96 | master_object = {
97 | 'definitions': dict_of_str,
98 | 'links': filtered_word_dict.links,
99 | 'scores': scores}
100 |
101 | # Write definitions as JSON
102 | os.makedirs(os.path.dirname(output_path), exist_ok=True)
103 | with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
104 | zf.writestr('master.json', json.dumps(master_object).encode('utf-8'))
105 | zf.writestr('fulltext.txt', text.encode('utf-8'))
106 |
107 |
108 | def _get_scores(word_counts: Dict[str, int],
109 | word_dict: reverse_data.WordDictionary) -> Dict[str, float]:
110 | """Very crude way to get scores from word_counts and a dictionary.
111 |
112 | Algorithm for a word `w`:
113 | 1. Set score := how often does w occur in the text
114 | (as given by `word_counts`).
115 | 2. If the word is said to be "literary" in the dict, divide score by 2.
116 |
117 | Overall, lower score means "rarer" words.
118 | """
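  # Example: a word that occurs 6 times and is tagged "literary" in the
  # dictionary ends up with a score of 3.0.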
119 | scores = {word: count for word, count in word_counts.items()}
120 | for word in word_counts:
121 | if 'literary' in word_dict[word].get_info():
122 | scores[word] /= 2
123 | return scores
124 |
125 |
126 | def _get_word_counts(text: str,
127 | word_dict: reverse_data.WordDictionary):
128 | """Given a text and a dictionary, split the text into words and return counts.
129 |
130 | Done by:
131 | 1. Sanitize text by doing lower case and removing newlines.
132 | 2. Use NLTK's tokenizer
133 |   3. Try to find the base form using NLTK's lemmatizer (e.g. houses -> house),
134 |      to increase the chances of finding a word in the dictionary.
135 |   4. Count the occurrences of words.
136 | """
137 | text = text.lower()
138 |
139 | print('Collapsing newlines...')
140 | text = re.sub('(.)\n', r'\1 ', text)
141 |
142 | print('Tokenizing...')
143 | words = tokenize.word_tokenize(text)
144 |
145 | print('Pruning...')
146 |
147 |   # Remove punctuation in tokens, as nltk tokenizes for example "they'll" as
148 | # [they, 'll]. The resulting "ll" will be ditched in a later stage.
149 | # Also removes tokens that are just quotes, which turn into empty tokens,
150 | # removed at the MIN_WORD_LEN stage below.
151 | words = (w.strip("'.-`\"") for w in words)
152 | # Ditches some genitives and third person singulars. In Python 3.9 this
153 | # should be `removesuffix` but the `replace` works well enough in this context.
154 | words = (w.replace("'s", '') for w in words)
155 | # Removes abbreviations such as "e.g."
156 | words = (w for w in words if '.' not in w)
157 | # Removes most punctuation from the list, such as ",", ":", etc.,
158 | # also removes empty tokens.
159 | words = (w for w in words if len(w) > MIN_WORD_LEN)
160 | # Removes all numbers
161 | words = (w for w in words if w and not all(c.isdigit() for c in w))
162 |
163 | print('Counting...')
164 | word_counts = collections.Counter(words)
165 |
166 | print('Lemmatizing...')
167 | lemma = WordNetLemmatizer()
168 | word_counts_lemmad = collections.defaultdict(int)
169 |
170 | # We create a map from word_as_it_appears_in_book to the lemmad
171 | # words to simplify lookup later. Note that it's not exactly
172 | # word_as_it_appears_in_book due to the preprocessing above but oh well.
173 | links = {}
174 |
175 |   # Note: assume we have `word_counts` = {"belongs": 4, "belonging": 3}
176 | # This results in sth like {"belong": 7, "belonging": 7} in the following.
177 | for w, count in word_counts.items():
178 | possible_words = []
179 | if w in word_dict:
180 | possible_words.append(w)
181 | word_is_in_dict = True
182 | else:
183 | word_is_in_dict = False
184 |
185 | for t in wordnet.POS_LIST:
186 | w_lemmad = lemma.lemmatize(w, pos=t)
187 | if w_lemmad != w and w_lemmad in word_dict:
188 | possible_words.append(w_lemmad)
189 |
190 | # Neither the input word nor any lemmad forms are in the dictionary.
191 | if not possible_words:
192 | continue
193 |
194 | # Input word is not in dictionary but some lemmad form is.
195 |     # We pick an arbitrary one of the lemmad forms to make a link,
196 | # hoping it's a good one.
197 | if not word_is_in_dict:
198 | links[w] = next(iter(possible_words))
199 |
200 | # Build counts dict.
201 | for possible_w in possible_words:
202 | word_counts_lemmad[possible_w] += count
203 |
204 | return word_counts_lemmad, links
205 |
206 |
207 | if __name__ == '__main__':
208 | main()
209 |
--------------------------------------------------------------------------------
/reverse_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Parse Apple dictionaries given as Body.data files.
3 |
4 | The function that does the heavy lifting is _parse. Overview:
5 |
6 | - The files are just ZIPs of XML entries concatenated with some headers
7 |   in between.
8 | - We greedily try to find the ZIPs and extract the XML
9 | - Some XML parsing is implemented to find interesting stuff (derivatives for
10 | example).
11 |
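Roughly, Body.data is laid out as follows (byte counts are approximate; this is
what _parse and _split below recover):

    [~100 header bytes][zlib stream][a few bytes][zlib stream][a few bytes]...

where each decompressed stream is a sequence of lines, one
'<d:entry ...>...</d:entry>' XML record per line, with 4 bytes of framing
between records.
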
12 | """
13 | import argparse
14 | import collections
15 | import contextlib
16 | import itertools
17 | import os
18 | import pickle
19 | import shutil
20 | import zlib
21 | from typing import Dict, List, Tuple, Set
22 |
23 | import lxml.etree as etree
24 |
25 | # New Oxford American Dictionary
26 | NOAD = '/System/Library/AssetsV2/' \
27 | 'com_apple_MobileAsset_DictionaryServices_dictionaryOSX/' \
28 | '4094df88727a054b658681dfb74f23702d3c985e.asset/' \
29 | 'AssetData/' \
30 | 'New Oxford American Dictionary.dictionary/' \
31 | 'Contents/Resources/Body.data'
32 |
33 |
34 | # Matches spans that give some meta info, like "literary", "informal", etc.
35 | XPATH_INFO = '//span[@class="lg"]/span[@class="reg"]'
36 |
37 | # This matches the bold words in the definitions. For an example,
38 | # see "vital", which contains "noun (vitals)"
39 | XPATH_OTHER_WORDS = '//span[@class="fg"]/span[@class="f"]'
40 |
41 | # This matches the derivatives at the end of definition.
42 | XPATH_DERIVATIVES = '//span[contains(@class, "t_derivatives")]//' \
43 | 'span[contains(@class, "x_xoh")]/' \
44 | 'span[@role="text"]'
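# Schematically, the markup these XPaths target looks like this (simplified):
#   <span class="lg"><span class="reg">literary</span></span>          (XPATH_INFO)
#   <span class="fg"><span class="f">vitals</span></span>              (XPATH_OTHER_WORDS)
#   <span class="t_derivatives"> ...
#     <span class="x_xoh"><span role="text">vitality</span></span>     (XPATH_DERIVATIVES)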
45 |
46 | OUTPUT_HTML_HEADER = """
47 | <html>
48 | <head>
49 | <link rel="stylesheet" href="DefaultStyle.css">
50 | <title>Words</title>
51 | <link rel="stylesheet" href="CustomStyle.css">
52 | </head>
53 |
54 | """
55 |
56 | CUSTOM_CSS = """
57 | .div-entry {
58 | border-top: 2px solid black;
59 | padding-bottom: 50px;
60 | }
61 | """
62 |
63 |
64 | def main():
65 | p = argparse.ArgumentParser()
66 | p.add_argument('--dictionary_path', default=NOAD,
67 | help=f"path to a body.data file. defaults to {NOAD}")
68 | p.add_argument('--lookup', nargs='+',
69 | default=['vital', 'house', 'cozen'],
70 | help='words to lookup')
71 | p.add_argument('--output_path', default='lookup/lookup.html',
72 | help='where to save the words.')
73 |
74 | flags = p.parse_args()
75 | save_definitions(flags.dictionary_path,
76 | flags.lookup,
77 | flags.output_path)
78 |
79 |
80 | def save_definitions(dictionary_path, lookup_words, output_path):
81 | if not dictionary_path.endswith('Body.data'):
82 | raise ValueError(f'Expected a Body.data file, got {dictionary_path}')
83 |
84 | word_dict = WordDictionary.from_file(dictionary_path)
85 | os.makedirs(os.path.dirname(output_path), exist_ok=True)
86 |
87 | with open(output_path, 'w') as f:
88 | f.write(OUTPUT_HTML_HEADER)
89 | with wrap_in_tag(f, 'body'):
90 | for target in lookup_words:
91 | entry = word_dict[target]
92 | t = entry.get_xml_tree()
93 | with wrap_in_tag(f, 'div', attr='class="div-entry"'):
94 | f.write(etree.tostring(t, pretty_print=True).decode())
95 |
96 | print(f'Saved {len(lookup_words)} definitions at {output_path}.')
97 |
98 | # Copy default and custom CSS to output
99 | css_path = dictionary_path.replace('Body.data', 'DefaultStyle.css')
100 |   if not os.path.isfile(css_path):
101 |     print(f'WARN: CSS not found at expected path {css_path}')
102 |   else:
103 |     css_path_out = os.path.join(os.path.dirname(output_path), os.path.basename(css_path))
104 |     shutil.copy(css_path, css_path_out)
105 | custom_css_path_out = os.path.join(os.path.dirname(output_path),
106 | 'CustomStyle.css')
107 | with open(custom_css_path_out, 'w') as f:
108 | f.write(CUSTOM_CSS)
109 |
110 |
111 | class WordDictionary:
112 | """Rrepresents a dictionary."""
113 |
114 | @staticmethod
115 | def from_file(p):
116 | d, links = parse(p)
117 | return WordDictionary(d, links)
118 |
119 | def __init__(self, d: Dict[str, 'Entry'], links: Dict[str, str]):
120 | """Constructor.
121 |
122 | :param d: The dictionary, as a dict mapping words to Entry instances.
123 | :param links: Special links, as a dict mapping words to words. Words `w` in
124 | this dict have a definition at `links[w]`.
125 | """
126 | self.d, self.links = d, links
127 |
128 | def items(self):
129 | return self.d.items()
130 |
131 | def add_links(self, links: Dict[str, str]):
132 | for w, linked_w in links.items():
133 | # Word already linked, so we should be able to find it.
134 | if w in self.links:
135 | continue
136 | assert linked_w in self
137 | self.links[w] = linked_w
138 |
139 | def filtered(self, words) -> 'WordDictionary':
140 | filtered_dict = {}
141 | filtered_links = {}
142 | for w in words:
143 | filtered_dict[w] = self[w] # May raise!
144 | if w in self.links:
145 | filtered_links[w] = self.links[w]
146 | return WordDictionary(filtered_dict, filtered_links)
147 |
148 | def __getitem__(self, w) -> 'Entry':
149 | if w in self.d:
150 | return self.d[w]
151 | if w in self.links:
152 | return self.d[self.links[w]]
153 | raise KeyError(w)
154 |
155 | def __contains__(self, w):
156 | return w in self.d or w in self.links
157 |
158 | def __str__(self):
159 |     return f'WordDictionary({len(self.d)} definitions, ' \
160 | f'{len(self.links)} links)'
161 |
162 |
163 | @contextlib.contextmanager
164 | def wrap_in_tag(f, tag, attr=None):
165 | if attr:
166 | f.write(f'<{tag} {attr}>')
167 | else:
168 | f.write(f'<{tag}>')
169 | yield
170 |   f.write(f'</{tag}>')
171 |
172 |
173 | def parse(dictionary_path):
174 | print(f"Parsing {dictionary_path}...")
175 | entries_tuples = _parse(dictionary_path)
176 | print('Augmenting...')
177 | # Some definitions have multiple entries (for example foil in NOAD).
178 | # Merge them here.
179 | entries = merge_same_keys(entries_tuples)
180 | links = _get_links(dictionary_path, entries)
181 | print(f'Links: {len(links)}')
182 | return entries, links
183 |
184 |
185 | def merge_same_keys(entries_tuples: List[Tuple[str, str]]) -> Dict[str, 'Entry']:
186 | entries = {}
187 | for k, e in entries_tuples:
188 | if k in entries:
189 | entries[k].append_definition(e)
190 | else:
191 | entries[k] = Entry(k, e)
192 | return entries
193 |
194 |
195 | def _pickle_cache(p):
196 | """Little helper decorator to store stuff in a pickle cache, used below."""
197 | def decorator(func):
198 | if os.path.isfile(p):
199 | with open(p, 'rb') as f:
200 | cache = pickle.load(f)
201 | else:
202 | cache = {}
203 |
204 | def new_func(*args, **kwargs):
205 | key = args[0]
206 | if key not in cache:
207 | res = func(*args, **kwargs)
208 | cache[key] = res
209 | with open(p, 'wb') as f:
210 | pickle.dump(cache, f)
211 | else:
212 | print(f'Cached in {p}: {key}')
213 | return cache[key]
214 |
215 | return new_func
216 | return decorator
217 |
218 |
219 | @_pickle_cache('cache_links.pkl')
220 | def _get_links(p, entries):
221 | del p # Only used for cache
222 | links = {}
223 | print('Getting links...')
224 | for i, (key, entry) in enumerate(entries.items()):
225 | if i % 1000 == 0:
226 | progress = i / len(entries)
227 | print(f'\rGetting links: {progress * 100:.1f}%', end='', flush=True)
228 | for w in entry.get_words_and_derivaties():
229 | if w in entries:
230 | continue
231 | # Word is not in dictionary, add to links
232 | if w in links:
233 | continue
234 | links[w] = key
235 | return links
236 |
237 |
238 | @_pickle_cache('cache_parse.pkl')
239 | def _parse(dictionary_path) -> List[Tuple[str, str]]:
240 | """Parse Body.data into a list of entries given as key, definition tuples."""
241 | with open(dictionary_path, 'rb') as f:
242 | content_bytes = f.read()
243 | total_bytes = len(content_bytes)
244 |
245 | # The first zip file starts at ~100 bytes:
246 | content_bytes = content_bytes[100:]
247 |
248 | first = True
249 | entries = []
250 | for i in itertools.count():
251 | if not content_bytes: # Backup condition in case stop is never True.
252 | break
253 | try:
254 | d = zlib.decompressobj()
255 | res = d.decompress(content_bytes)
256 | new_entries, stop = _split(res, verbose=first)
257 | entries += new_entries
258 | if stop:
259 | break
260 | if i % 10 == 0:
261 | bytes_left = len(content_bytes) # Approximately...
262 | progress = 1 - bytes_left / total_bytes
263 | print(f'{progress * 100:.1f}% // '
264 | f'{len(entries)} entries parsed // '
265 | f'Latest entry: {entries[-1][0]}')
266 | first = False
267 |
268 | # Set content_bytes to the unused data so we can start the search for the
269 | # next zip file.
270 | content_bytes = d.unused_data
271 |
272 | except zlib.error: # Current content_bytes is not a zipfile -> skip a byte.
273 | content_bytes = content_bytes[1:]
274 |
275 | return entries
276 |
277 |
278 | def _split(input_bytes, verbose) -> Tuple[List[Tuple[str, str]],
279 | bool]:
280 | """Split `input_bytes` into a list of tuples (name, definition)."""
281 | printv = print if verbose else lambda *a, **k: ...
282 |
283 | # The first four bytes are always not UTF-8 (not sure why?)
284 | input_bytes = input_bytes[4:]
285 |
286 | printv('Splitting...')
287 | printv(f'{"index": <10}', f'{"bytes": <30}', f'{"as chars"}',
288 | '-' * 50, sep='\n')
289 |
290 | entries = []
291 | total_offset = 0
292 | stop_further_parsing = False
293 |
294 | while True:
295 | # Find the next newline, which delimits the current entry.
296 | try:
297 | next_offset = input_bytes.index('\n'.encode('utf-8'))
298 | except ValueError: # No more new-lines -> no more entries!
299 | break
300 |
301 | entry_text = input_bytes[:next_offset].decode('utf-8')
302 |
303 | # The final part of the dictionary contains some meta info, which we skip.
304 | # TODO: might only be for the NOAD, so check other dictionaries.
305 | if 'fbm_AdvisoryBoard' in entry_text[:1000]:
306 | print('fbm_AdvisoryBoard detected, stopping...')
307 | stop_further_parsing = True
308 | break
309 |
310 | # Make sure we have a valid entry.
311 |     assert (entry_text.startswith('<d:entry') and
312 |             entry_text.endswith('</d:entry>')), \
313 |       f'ENTRY: {entry_text} \n REM: {input_bytes}'
314 |
315 | # The name of the definition is stored in the "d:title" attribute,
316 | # where "d" is the current domain, which we get from the nsmap - the
317 | # actual attribute will be "{com.apple.blabla}title" (including the
318 | # curly brackets).
319 | xml_entry = etree.fromstring(entry_text)
320 | domain = xml_entry.nsmap['d']
321 | key = '{%s}title' % domain
322 | name = xml_entry.get(key) # Lookup the attribute in the tree.
323 |
324 | entries.append((name, entry_text))
325 |
326 | printv(f'{next_offset + total_offset: 10d}',
327 | f'{str(input_bytes[next_offset + 1:next_offset + 5]): <30}',
328 | xml_entry.get(key))
329 |
330 |     # There are always 4 bytes of gibberish between entries. Skip them
331 |     # and the newline (for a total of 5 bytes).
332 | input_bytes = input_bytes[next_offset + 5:]
333 | total_offset += next_offset
334 | return entries, stop_further_parsing
335 |
336 |
337 | class Entry:
338 | def __init__(self, key, content):
339 | self.key = key
340 | self.content = content
341 |
342 | # Set to true on the first call to `append_definition`.
343 | # Used in get_xml_tree.
344 | self._multi_definition = False
345 |
346 | # These are lazily populated as they take a while.
347 | self._xml = None
348 | self._info = None
349 | self._words_and_derivatives = None
350 |
351 | def append_definition(self, content):
352 | """Extend self.content with more XML.
353 |
354 | The key here is to make sure the overall content is still valid XML
354 |     by wrapping the whole thing in a <div>, which is handled in
355 |     `get_xml_tree`; here we just set _multi_definition.
357 | """
358 | self._multi_definition = True
359 | self.content += content
360 |
361 | def get_xml_tree(self):
362 | content = self.content
363 | if self._multi_definition:
364 |       content = '<div>' + self.content + '</div>'
365 | return etree.fromstring(content)
366 |
367 | def get_special(self, xpath, replace=None):
368 | matches = self.get_xml().xpath(xpath)
369 | if not matches:
370 | return []
371 | # Note: May be empty.
372 | texts = [el.text for el in matches if el.text]
373 | if replace:
374 | for r_in, r_out in replace:
375 | texts = [t.replace(r_in, r_out) for t in texts]
376 | texts = [t.strip() for t in texts]
377 | return texts
378 |
379 | def get_xml(self):
380 | if self._xml is None:
381 | self._xml = self.get_xml_tree()
382 | return self._xml
383 |
384 | def get_words_and_derivaties(self):
385 | def _make():
386 | derivatives = set(self.get_special(XPATH_DERIVATIVES))
387 | other_words = set(self.get_special(XPATH_OTHER_WORDS, [("the", "")]))
388 | return (derivatives | other_words) - {self.key}
389 |
390 | return _lazy(self, "_words_and_derivatives", _make)
391 |
392 | def get_info(self):
393 | return _lazy(self, "_info", lambda: set(self.get_special(XPATH_INFO)))
394 |
395 | def __str__(self):
396 | return f'Entry({self.key})'
397 |
398 |
399 | def _lazy(obj, ivar, creator):
400 | if getattr(obj, ivar) is None:
401 | setattr(obj, ivar, creator())
402 | return getattr(obj, ivar)
403 |
404 |
405 | if __name__ == '__main__':
406 | main()
407 |
--------------------------------------------------------------------------------