├── .gitignore ├── LICENSE ├── README.md ├── assets ├── dictionary.png └── dictionary_myoutput.png ├── extract.py └── reverse_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # More 132 | cache*.pkl 133 | .idea/ 134 | 135 | # Test files, old files 136 | lookup/ 137 | old/ 138 | test_files/ 139 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 fab-jul 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # parse_dictionaries 2 | 3 | 4 | The following blog post contains some background 5 | about this repo: 6 | [Reverse-Engineering Apple Dictionary](https://fmentzer.github.io/posts/2020/dictionary/). 7 | 8 | ## Parsing with `reverse_data.py` 9 | 10 | 11 | Parses the great Apple Dictionary 12 | (for now tested with the New Oxford American Dictionary). 13 | 14 | Here is what the built-in Dictionary app gives for "handle": 15 | 16 |
17 | <img src="assets/dictionary.png" alt="Dictionary.app entry for handle"/> 18 |
19 | 20 | And here is what this script gives (on a Mac), with: 21 | 22 | ```bash 23 | # 24 | # NOTE: might be at a different location for you! 25 | # If so, you can find it using a glob (see also the Python sketch below), e.g., 26 | # ls /System/Library/AssetsV2/ \ 27 | # com_apple_MobileAsset_DictionaryServices_dictionaryOSX/*/* 28 | # then take the one that has `New Oxford American Dictionary`! 29 | # 30 | # New Oxford American Dictionary 31 | NOAD='/System/Library/AssetsV2/'\ 32 | 'com_apple_MobileAsset_DictionaryServices_dictionaryOSX/'\ 33 | '4094df88727a054b658681dfb74f23702d3c985e.asset/'\ 34 | 'AssetData/'\ 35 | 'New Oxford American Dictionary.dictionary/'\ 36 | 'Contents/Resources/Body.data' 37 | 38 | python reverse_data.py \ 39 | --dictionary_path "$NOAD" --lookup handle --output_path lookup/lookup.html 40 | ``` 41 | 42 |
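If the asset hash above does not exist on your machine, the following is a minimal Python sketch (standard library only, not part of the repo) that applies the glob idea from the NOTE; the `AssetsV2` layout is an assumption and may differ between macOS versions:

```python
# Sketch: locate Body.data for the New Oxford American Dictionary.
# Assumes the AssetsV2 layout mentioned in the NOTE above.
import glob

pattern = ('/System/Library/AssetsV2/'
           'com_apple_MobileAsset_DictionaryServices_dictionaryOSX/'
           '*/AssetData/New Oxford American Dictionary.dictionary/'
           'Contents/Resources/Body.data')
for path in glob.glob(pattern):
    print(path)
```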
43 | <img src="assets/dictionary_myoutput.png" alt="Output of reverse_data.py for handle"/> 44 |
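The same lookup can also be done from Python instead of the CLI. This is a minimal sketch built only on functions that exist in `reverse_data.py` (`WordDictionary.from_file`, indexing, `get_xml_tree`); note that the first run parses all of `Body.data` and caches the result in `cache_parse.pkl`, so it takes a while:

```python
# Sketch: programmatic lookup with reverse_data.py.
# reverse_data.NOAD must point at a valid Body.data file (see above).
import lxml.etree as etree

import reverse_data

word_dict = reverse_data.WordDictionary.from_file(reverse_data.NOAD)
entry = word_dict['handle']  # Raises KeyError for unknown words.
print(etree.tostring(entry.get_xml_tree(), pretty_print=True).decode())
```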
45 | 46 | ## Extracting words and definitions from a book with `extract.py` 47 | 48 | If you want to split a book into all its words and look them all up, 49 | you can use `extract.py`. This relies on `nltk` to properly get definitions, 50 | e.g., to turn "he builds houses" into `["he", "build", "house"]`. 51 | 52 | ```bash 53 | python extract.py PATH_TO_BOOK.txt PATH_TO_OUTPUT.zip 54 | ``` 55 | 56 | The resulting zip file contains a single file `master.json`, which contains 57 | three keys. Example: 58 | 59 | ```json 60 | { 61 | "definitions": { 62 | "cozen": " str: 73 | if input_encoding: 74 | input_encodings = [input_encoding] 75 | else: 76 | input_encodings = ['utf-8', 'ISO-8859-1', 'ascii'] 77 | 78 | for encoding in input_encodings: 79 | try: 80 | with codecs.open(input_path, mode='r', encoding=encoding) as f: 81 | print(f'Decoded file with {encoding}.') 82 | return f.read() 83 | except UnicodeDecodeError as e: 84 | print(f'Caught {e}') 85 | 86 | raise ValueError("Hmm, file has unknown encoding.") 87 | 88 | 89 | def _write_filtered_dict(words: set, 90 | word_dict: reverse_data.WordDictionary, 91 | scores: Dict[str, float], 92 | text: str, 93 | output_path: str): 94 | filtered_word_dict = word_dict.filtered(words) 95 | dict_of_str = {key: entry.content for key, entry in filtered_word_dict.items()} 96 | master_object = { 97 | 'definitions': dict_of_str, 98 | 'links': filtered_word_dict.links, 99 | 'scores': scores} 100 | 101 | # Write definitions as JSON 102 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 103 | with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf: 104 | zf.writestr('master.json', json.dumps(master_object).encode('utf-8')) 105 | zf.writestr('fulltext.txt', text.encode('utf-8')) 106 | 107 | 108 | def _get_scores(word_counts: Dict[str, int], 109 | word_dict: reverse_data.WordDictionary) -> Dict[str, float]: 110 | """Very crude way to get scores from word_counts and a dictionary. 111 | 112 | Algorithm for a word `w`: 113 | 1. Set score := how often does w occur in the text 114 | (as given by `word_counts`). 115 | 2. If the word is said to be "literary" in the dict, divide score by 2. 116 | 117 | Overall, lower score means "rarer" words. 118 | """ 119 | scores = {word: count for word, count in word_counts.items()} 120 | for word in word_counts: 121 | if 'literary' in word_dict[word].get_info(): 122 | scores[word] /= 2 123 | return scores 124 | 125 | 126 | def _get_word_counts(text: str, 127 | word_dict: reverse_data.WordDictionary): 128 | """Given a text and a dictionary, split the text into words and return counts. 129 | 130 | Done by: 131 | 1. Sanitize text by doing lower case and removing newlines. 132 | 2. Use NLTK's tokenizer 133 | 3. Try to find the base by using NLTK's lemmatizer (i.e. houses -> house), 134 | to increase chances of finding a word in the dictionary 135 | 4. Count the occurences of words. 136 | """ 137 | text = text.lower() 138 | 139 | print('Collapsing newlines...') 140 | text = re.sub('(.)\n', r'\1 ', text) 141 | 142 | print('Tokenizing...') 143 | words = tokenize.word_tokenize(text) 144 | 145 | print('Pruning...') 146 | 147 | # Remove punctuation in tokens, as ntlk tokenizes for example "they'll" as 148 | # [they, 'll]. The resulting "ll" will be ditched in a later stage. 149 | # Also removes tokens that are just quotes, which turn into empty tokens, 150 | # removed at the MIN_WORD_LEN stage below. 151 | words = (w.strip("'.-`\"") for w in words) 152 | # Ditches some genitives and third person singulars. 
In Python 3.9 this 153 | # should be `removesuffix` but the `replace` works well enough in this context. 154 | words = (w.replace("'s", '') for w in words) 155 | # Removes abbreviations such as "e.g." 156 | words = (w for w in words if '.' not in w) 157 | # Removes most punctuation from the list, such as ",", ":", etc., 158 | # also removes empty tokens. 159 | words = (w for w in words if len(w) > MIN_WORD_LEN) 160 | # Removes all numbers 161 | words = (w for w in words if w and not all(c.isdigit() for c in w)) 162 | 163 | print('Counting...') 164 | word_counts = collections.Counter(words) 165 | 166 | print('Lemmatizing...') 167 | lemma = WordNetLemmatizer() 168 | word_counts_lemmad = collections.defaultdict(int) 169 | 170 | # We create a map from word_as_it_appears_in_book to the lemmad 171 | # words to simplify lookup later. Note that it's not exactly 172 | # word_as_it_appears_in_book due to the preprocessing above but oh well. 173 | links = {} 174 | 175 | # Note: assume we have `word_counts` = {"belongs": 4 "belonging":3} 176 | # This results in sth like {"belong": 7, "belonging": 7} in the following. 177 | for w, count in word_counts.items(): 178 | possible_words = [] 179 | if w in word_dict: 180 | possible_words.append(w) 181 | word_is_in_dict = True 182 | else: 183 | word_is_in_dict = False 184 | 185 | for t in wordnet.POS_LIST: 186 | w_lemmad = lemma.lemmatize(w, pos=t) 187 | if w_lemmad != w and w_lemmad in word_dict: 188 | possible_words.append(w_lemmad) 189 | 190 | # Neither the input word nor any lemmad forms are in the dictionary. 191 | if not possible_words: 192 | continue 193 | 194 | # Input word is not in dictionary but some lemmad form is. 195 | # We pick any random of the lemmad forms to make a link, 196 | # hoping it's a good one. 197 | if not word_is_in_dict: 198 | links[w] = next(iter(possible_words)) 199 | 200 | # Build counts dict. 201 | for possible_w in possible_words: 202 | word_counts_lemmad[possible_w] += count 203 | 204 | return word_counts_lemmad, links 205 | 206 | 207 | if __name__ == '__main__': 208 | main() 209 | -------------------------------------------------------------------------------- /reverse_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parse Apple dictionaries given as Body.data files. 3 | 4 | The function that does the heavy lifting is _parse. Overview: 5 | 6 | - The files are just ZIPs of XML entries concatenated with some headers 7 | inbetween 8 | - We greedily try to find the ZIPs and extract the XML 9 | - Some XML parsing is implemented to find interesting stuff (derivatives for 10 | example). 11 | 12 | """ 13 | import argparse 14 | import collections 15 | import contextlib 16 | import itertools 17 | import os 18 | import pickle 19 | import shutil 20 | import zlib 21 | from typing import Dict, List, Tuple, Set 22 | 23 | import lxml.etree as etree 24 | 25 | # New Oxford American Dictionary 26 | NOAD = '/System/Library/AssetsV2/' \ 27 | 'com_apple_MobileAsset_DictionaryServices_dictionaryOSX/' \ 28 | '4094df88727a054b658681dfb74f23702d3c985e.asset/' \ 29 | 'AssetData/' \ 30 | 'New Oxford American Dictionary.dictionary/' \ 31 | 'Contents/Resources/Body.data' 32 | 33 | 34 | # Matches spans that give some meta info, like "literary", "informal", etc. 35 | XPATH_INFO = '//span[@class="lg"]/span[@class="reg"]' 36 | 37 | # This matches the bold words in the definitions. 
For an example, 38 | # see "vital", which contains "noun (vitals)" 39 | XPATH_OTHER_WORDS = '//span[@class="fg"]/span[@class="f"]' 40 | 41 | # This matches the derivatives at the end of definition. 42 | XPATH_DERIVATIVES = '//span[contains(@class, "t_derivatives")]//' \ 43 | 'span[contains(@class, "x_xoh")]/' \ 44 | 'span[@role="text"]' 45 | 46 | OUTPUT_HTML_HEADER = """ 47 | 48 | 49 | 50 | Words 51 | 52 | 53 | 54 | """ 55 | 56 | CUSTOM_CSS = """ 57 | .div-entry { 58 | border-top: 2px solid black; 59 | padding-bottom: 50px; 60 | } 61 | """ 62 | 63 | 64 | def main(): 65 | p = argparse.ArgumentParser() 66 | p.add_argument('--dictionary_path', default=NOAD, 67 | help=f"path to a body.data file. defaults to {NOAD}") 68 | p.add_argument('--lookup', nargs='+', 69 | default=['vital', 'house', 'cozen'], 70 | help='words to lookup') 71 | p.add_argument('--output_path', default='lookup/lookup.html', 72 | help='where to save the words.') 73 | 74 | flags = p.parse_args() 75 | save_definitions(flags.dictionary_path, 76 | flags.lookup, 77 | flags.output_path) 78 | 79 | 80 | def save_definitions(dictionary_path, lookup_words, output_path): 81 | if not dictionary_path.endswith('Body.data'): 82 | raise ValueError(f'Expected a Body.data file, got {dictionary_path}') 83 | 84 | word_dict = WordDictionary.from_file(dictionary_path) 85 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 86 | 87 | with open(output_path, 'w') as f: 88 | f.write(OUTPUT_HTML_HEADER) 89 | with wrap_in_tag(f, 'body'): 90 | for target in lookup_words: 91 | entry = word_dict[target] 92 | t = entry.get_xml_tree() 93 | with wrap_in_tag(f, 'div', attr='class="div-entry"'): 94 | f.write(etree.tostring(t, pretty_print=True).decode()) 95 | 96 | print(f'Saved {len(lookup_words)} definitions at {output_path}.') 97 | 98 | # Copy default and custom CSS to output 99 | css_path = dictionary_path.replace('Body.data', 'DefaultStyle.css') 100 | if not os.path.isfile(css_path): 101 | print(f'WARN: CSS not found at expected path {css_path}') 102 | css_path_out = os.path.join(os.path.dirname(output_path), 103 | os.path.basename(css_path)) 104 | shutil.copy(css_path, css_path_out) 105 | custom_css_path_out = os.path.join(os.path.dirname(output_path), 106 | 'CustomStyle.css') 107 | with open(custom_css_path_out, 'w') as f: 108 | f.write(CUSTOM_CSS) 109 | 110 | 111 | class WordDictionary: 112 | """Rrepresents a dictionary.""" 113 | 114 | @staticmethod 115 | def from_file(p): 116 | d, links = parse(p) 117 | return WordDictionary(d, links) 118 | 119 | def __init__(self, d: Dict[str, 'Entry'], links: Dict[str, str]): 120 | """Constructor. 121 | 122 | :param d: The dictionary, as a dict mapping words to Entry instances. 123 | :param links: Special links, as a dict mapping words to words. Words `w` in 124 | this dict have a definition at `links[w]`. 125 | """ 126 | self.d, self.links = d, links 127 | 128 | def items(self): 129 | return self.d.items() 130 | 131 | def add_links(self, links: Dict[str, str]): 132 | for w, linked_w in links.items(): 133 | # Word already linked, so we should be able to find it. 134 | if w in self.links: 135 | continue 136 | assert linked_w in self 137 | self.links[w] = linked_w 138 | 139 | def filtered(self, words) -> 'WordDictionary': 140 | filtered_dict = {} 141 | filtered_links = {} 142 | for w in words: 143 | filtered_dict[w] = self[w] # May raise! 
144 | if w in self.links: 145 | filtered_links[w] = self.links[w] 146 | return WordDictionary(filtered_dict, filtered_links) 147 | 148 | def __getitem__(self, w) -> 'Entry': 149 | if w in self.d: 150 | return self.d[w] 151 | if w in self.links: 152 | return self.d[self.links[w]] 153 | raise KeyError(w) 154 | 155 | def __contains__(self, w): 156 | return w in self.d or w in self.links 157 | 158 | def __str__(self): 159 | return f'WordDcitionary({len(self.d)} definitions, ' \ 160 | f'{len(self.links)} links)' 161 | 162 | 163 | @contextlib.contextmanager 164 | def wrap_in_tag(f, tag, attr=None): 165 | if attr: 166 | f.write(f'<{tag} {attr}>') 167 | else: 168 | f.write(f'<{tag}>') 169 | yield 170 | f.write(f'') 171 | 172 | 173 | def parse(dictionary_path): 174 | print(f"Parsing {dictionary_path}...") 175 | entries_tuples = _parse(dictionary_path) 176 | print('Augmenting...') 177 | # Some definitions have multiple entries (for example foil in NOAD). 178 | # Merge them here. 179 | entries = merge_same_keys(entries_tuples) 180 | links = _get_links(dictionary_path, entries) 181 | print(f'Links: {len(links)}') 182 | return entries, links 183 | 184 | 185 | def merge_same_keys(entries_tuples: List[Tuple[str, str]]) -> Dict[str, 'Entry']: 186 | entries = {} 187 | for k, e in entries_tuples: 188 | if k in entries: 189 | entries[k].append_definition(e) 190 | else: 191 | entries[k] = Entry(k, e) 192 | return entries 193 | 194 | 195 | def _pickle_cache(p): 196 | """Little helper decorator to store stuff in a pickle cache, used below.""" 197 | def decorator(func): 198 | if os.path.isfile(p): 199 | with open(p, 'rb') as f: 200 | cache = pickle.load(f) 201 | else: 202 | cache = {} 203 | 204 | def new_func(*args, **kwargs): 205 | key = args[0] 206 | if key not in cache: 207 | res = func(*args, **kwargs) 208 | cache[key] = res 209 | with open(p, 'wb') as f: 210 | pickle.dump(cache, f) 211 | else: 212 | print(f'Cached in {p}: {key}') 213 | return cache[key] 214 | 215 | return new_func 216 | return decorator 217 | 218 | 219 | @_pickle_cache('cache_links.pkl') 220 | def _get_links(p, entries): 221 | del p # Only used for cache 222 | links = {} 223 | print('Getting links...') 224 | for i, (key, entry) in enumerate(entries.items()): 225 | if i % 1000 == 0: 226 | progress = i / len(entries) 227 | print(f'\rGetting links: {progress * 100:.1f}%', end='', flush=True) 228 | for w in entry.get_words_and_derivaties(): 229 | if w in entries: 230 | continue 231 | # Word is not in dictionary, add to links 232 | if w in links: 233 | continue 234 | links[w] = key 235 | return links 236 | 237 | 238 | @_pickle_cache('cache_parse.pkl') 239 | def _parse(dictionary_path) -> List[Tuple[str, str]]: 240 | """Parse Body.data into a list of entries given as key, definition tuples.""" 241 | with open(dictionary_path, 'rb') as f: 242 | content_bytes = f.read() 243 | total_bytes = len(content_bytes) 244 | 245 | # The first zip file starts at ~100 bytes: 246 | content_bytes = content_bytes[100:] 247 | 248 | first = True 249 | entries = [] 250 | for i in itertools.count(): 251 | if not content_bytes: # Backup condition in case stop is never True. 252 | break 253 | try: 254 | d = zlib.decompressobj() 255 | res = d.decompress(content_bytes) 256 | new_entries, stop = _split(res, verbose=first) 257 | entries += new_entries 258 | if stop: 259 | break 260 | if i % 10 == 0: 261 | bytes_left = len(content_bytes) # Approximately... 
262 | progress = 1 - bytes_left / total_bytes 263 | print(f'{progress * 100:.1f}% // ' 264 | f'{len(entries)} entries parsed // ' 265 | f'Latest entry: {entries[-1][0]}') 266 | first = False 267 | 268 | # Set content_bytes to the unused data so we can start the search for the 269 | # next zip file. 270 | content_bytes = d.unused_data 271 | 272 | except zlib.error: # Current content_bytes is not a zipfile -> skip a byte. 273 | content_bytes = content_bytes[1:] 274 | 275 | return entries 276 | 277 | 278 | def _split(input_bytes, verbose) -> Tuple[List[Tuple[str, str]], 279 | bool]: 280 | """Split `input_bytes` into a list of tuples (name, definition).""" 281 | printv = print if verbose else lambda *a, **k: ... 282 | 283 | # The first four bytes are always not UTF-8 (not sure why?) 284 | input_bytes = input_bytes[4:] 285 | 286 | printv('Splitting...') 287 | printv(f'{"index": <10}', f'{"bytes": <30}', f'{"as chars"}', 288 | '-' * 50, sep='\n') 289 | 290 | entries = [] 291 | total_offset = 0 292 | stop_further_parsing = False 293 | 294 | while True: 295 | # Find the next newline, which delimits the current entry. 296 | try: 297 | next_offset = input_bytes.index('\n'.encode('utf-8')) 298 | except ValueError: # No more new-lines -> no more entries! 299 | break 300 | 301 | entry_text = input_bytes[:next_offset].decode('utf-8') 302 | 303 | # The final part of the dictionary contains some meta info, which we skip. 304 | # TODO: might only be for the NOAD, so check other dictionaries. 305 | if 'fbm_AdvisoryBoard' in entry_text[:1000]: 306 | print('fbm_AdvisoryBoard detected, stopping...') 307 | stop_further_parsing = True 308 | break 309 | 310 | # Make sure we have a valid entry. 311 | assert (entry_text.startswith('')), \ 313 | f'ENTRY: {entry_text} \n REM: {input_bytes}' 314 | 315 | # The name of the definition is stored in the "d:title" attribute, 316 | # where "d" is the current domain, which we get from the nsmap - the 317 | # actual attribute will be "{com.apple.blabla}title" (including the 318 | # curly brackets). 319 | xml_entry = etree.fromstring(entry_text) 320 | domain = xml_entry.nsmap['d'] 321 | key = '{%s}title' % domain 322 | name = xml_entry.get(key) # Lookup the attribute in the tree. 323 | 324 | entries.append((name, entry_text)) 325 | 326 | printv(f'{next_offset + total_offset: 10d}', 327 | f'{str(input_bytes[next_offset + 1:next_offset + 5]): <30}', 328 | xml_entry.get(key)) 329 | 330 | # There is always 4 bytes of chibberish between entries. Skip them 331 | # and the new lines (for a total of 5 bytes). 332 | input_bytes = input_bytes[next_offset + 5:] 333 | total_offset += next_offset 334 | return entries, stop_further_parsing 335 | 336 | 337 | class Entry: 338 | def __init__(self, key, content): 339 | self.key = key 340 | self.content = content 341 | 342 | # Set to true on the first call to `append_definition`. 343 | # Used in get_xml_tree. 344 | self._multi_definition = False 345 | 346 | # These are lazily populated as they take a while. 347 | self._xml = None 348 | self._info = None 349 | self._words_and_derivatives = None 350 | 351 | def append_definition(self, content): 352 | """Extend self.content with more XML. 353 | 354 | The key here is to make sure the overall content is still valid XML 355 | by wrapping the whole thing in a
<div>, which is handled in `get_xml_tree`; 356 | here we just set _multi_definition. 357 | """ 358 | self._multi_definition = True 359 | self.content += content 360 | 361 | def get_xml_tree(self): 362 | content = self.content 363 | if self._multi_definition: 364 | content = '<div>' + self.content + '</div>
' 365 | return etree.fromstring(content) 366 | 367 | def get_special(self, xpath, replace=None): 368 | matches = self.get_xml().xpath(xpath) 369 | if not matches: 370 | return [] 371 | # Note: May be empty. 372 | texts = [el.text for el in matches if el.text] 373 | if replace: 374 | for r_in, r_out in replace: 375 | texts = [t.replace(r_in, r_out) for t in texts] 376 | texts = [t.strip() for t in texts] 377 | return texts 378 | 379 | def get_xml(self): 380 | if self._xml is None: 381 | self._xml = self.get_xml_tree() 382 | return self._xml 383 | 384 | def get_words_and_derivaties(self): 385 | def _make(): 386 | derivatives = set(self.get_special(XPATH_DERIVATIVES)) 387 | other_words = set(self.get_special(XPATH_OTHER_WORDS, [("the", "")])) 388 | return (derivatives | other_words) - {self.key} 389 | 390 | return _lazy(self, "_words_and_derivatives", _make) 391 | 392 | def get_info(self): 393 | return _lazy(self, "_info", lambda: set(self.get_special(XPATH_INFO))) 394 | 395 | def __str__(self): 396 | return f'Entry({self.key})' 397 | 398 | 399 | def _lazy(obj, ivar, creator): 400 | if getattr(obj, ivar) is None: 401 | setattr(obj, ivar, creator()) 402 | return getattr(obj, ivar) 403 | 404 | 405 | if __name__ == '__main__': 406 | main() 407 | --------------------------------------------------------------------------------
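For working with the archive that `extract.py` writes, the following is a minimal sketch (standard library only) matching the files produced by `_write_filtered_dict` above, i.e. `master.json` with the keys `definitions`, `links`, and `scores`, plus `fulltext.txt`; `PATH_TO_OUTPUT.zip` stands for whatever output path you passed to `extract.py`:

```python
# Sketch: read back the archive written by extract.py.
import json
import zipfile

with zipfile.ZipFile('PATH_TO_OUTPUT.zip') as zf:
    master = json.loads(zf.read('master.json').decode('utf-8'))

print(sorted(master))  # ['definitions', 'links', 'scores']

# Lower score means rarer (see _get_scores); print the ten rarest words.
scores = master['scores']
for word in sorted(scores, key=scores.get)[:10]:
    print(word, scores[word])
```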