├── MANIFEST.in
├── epub_conversion
    ├── __init__.py
    ├── converter.py
    ├── utils.py
    └── wiki_decoder.py
├── upload.sh
├── .gitignore
├── setup.py
└── README.md


/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md


--------------------------------------------------------------------------------
/epub_conversion/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Epub and wiki dump conversion module. Provides utilities
 3 | for taking large xml files and extracting pages or tokenized
 4 | sentences and words.
 5 | 
 6 | """
 7 | 
 8 | from .converter import Converter
 9 | from .wiki_decoder import convert_wiki_to_lines, convert_wiki_to_corpus
10 | 
11 | __all__ = ["Converter", "convert_wiki_to_lines", "convert_wiki_to_corpus"]


--------------------------------------------------------------------------------
/upload.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # stop script on error and print it
 4 | set -e
 5 | # inform me of undefined variables
 6 | set -u
 7 | # handle cascading failures well
 8 | set -o pipefail
 9 | 
10 | SCRIPT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
11 | 
12 | rm ${SCRIPT_DIR}/dist/*
13 | python3 setup.py clean
14 | python3 setup.py sdist
15 | twine upload ${SCRIPT_DIR}/dist/* --verbose
16 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | <<<<<<< HEAD
 2 | # Compiled source #
 3 | ###################
 4 | *.com
 5 | *.class
 6 | *.dll
 7 | *.exe
 8 | *.o
 9 | *.so
10 | 
11 | # Packages #
12 | ############
13 | # it's better to unpack these files and commit the raw source
14 | # git has its own built in compression methods
15 | *.7z
16 | *.dmg
17 | *.gz
18 | *.iso
19 | *.jar
20 | *.rar
21 | *.tar
22 | *.zip
23 | *.gem
24 | *.pem
25 | dist/
26 | build/
27 | 
28 | # Saves #
29 | #########
30 | saves/*
31 | imported_saves/*
32 | pvdm_snapshots/*
33 | sentiment_data/*
34 | *.npy
35 | *.mat
36 | *.vocab
37 | *.svocab
38 | text8
39 | __pycache__/*
40 | *.pyc
41 | *.egg-info
42 | 
43 | # Logs and databases #
44 | ######################
45 | *.log
46 | *.sql
47 | *.sqlite
48 | 
49 | # OS generated files #
50 | ######################
51 | .DS_Store
52 | .DS_Store?
53 | ._*
54 | .Spotlight-V100
55 | .Trashes
56 | ehthumbs.db
57 | Thumbs.db
58 | =======
59 | .DS_Store
60 | >>>>>>> 9ced40381de6f9e6c2b02fc8ba7bb993203c6d62


--------------------------------------------------------------------------------
/epub_conversion/converter.py:
--------------------------------------------------------------------------------
 1 | from .utils import get_files_from_path, convert_epub_to_lines, convert_lines_to_text, open_book
 2 | import gzip
 3 | 
 4 | 
 5 | class Converter(object):
 6 |     """
 7 |     Convert a folder of epubs to raw text for corpus
 8 |     learning.
 9 |     """
10 | 
11 |     def __init__(self, path):
12 |         self.path = path
13 | 
14 |     def convert(self, target_path):
15 |         epub_paths = get_files_from_path(".epub", self.path)
16 | 
17 |         with gzip.open(target_path, "wb") as file:
18 |             for (epub_path, epub_name) in epub_paths:
19 |                 book = open_book(epub_path)
20 |                 if book is not None:
21 |                     for sentence in convert_lines_to_text(convert_epub_to_lines(book)):
22 |                         file.write(sentence.encode("utf-8"))
23 |                     print("Wrote \"%s\" to disk" % (epub_name))
24 |                 else:
25 |                     print("Couldn't open \"%s\"." % (epub_name))
26 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from setuptools import setup, find_packages
 3 | 
 4 | 
 5 | def readfile(fname):
 6 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 7 | 
 8 | 
 9 | setup(
10 |     name='epub-conversion',
11 |     version='1.0.9',
12 |     description='Python package for converting xml and epubs to text files',
13 |     long_description=readfile('README.md'),
14 |     long_description_content_type="text/markdown",
15 |     ext_modules=[],
16 |     packages=find_packages(),
17 |     py_modules=[],
18 |     author='Jonathan Raiman',
19 |     author_email='jonathanraiman@gmail.com',
20 |     url='https://github.com/JonathanRaiman/epub_conversion',
21 |     download_url='https://github.com/JonathanRaiman/epub_conversion',
22 |     keywords='XML, epub, tokenization, NLP',
23 |     license='MIT',
24 |     platforms='any',
25 |     zip_safe=False,
26 |     classifiers=[
27 |         'Intended Audience :: Science/Research',
28 |         'Operating System :: OS Independent',
29 |         'Programming Language :: Python :: 3.3',
30 |         'Topic :: Text Processing :: Linguistic',
31 |     ],
32 |     setup_requires=[],
33 |     install_requires=[
34 |         'bz2file',
35 |         'epub',
36 |         'ciseau'
37 |     ],
38 |     include_package_data=True,
39 | )
40 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | epub conversion
 2 | ---------------
 3 | 
 4 | Create text corpuses using epubs and wiki dumps.
 5 | This is a python package with a Converter for epub and xml (wiki dumps) to text, lines, or Python generators.
 6 | 
 7 | Usage:
 8 | ------
 9 | 
10 | ### Epub usage
11 | 
12 | #### Book by book
13 | 
14 | To convert epubs to text files, usage is straightforward. First create a converter object:
15 | 
16 | 	converter = Converter("my_ebooks_folder/")
17 | 
18 | Then using this converter let's concatenate all the text within the ebooks into a single mega text file:
19 | 
20 | 	converter.convert("my_succinct_text_file.gz")
21 | 
22 | #### Line by line
23 | 
24 | You can also proceed line by line:
25 | 
26 | 	from epub_conversion.utils import open_book, convert_epub_to_lines
27 | 
28 | 	book = open_book("twilight.epub")
29 | 
30 | 	lines = convert_epub_to_lines(book)
31 | 
32 | ### Wikidump usage
33 | 
34 | #### Redirections
35 | 
36 | Suppose you are interested in all redirections in a given Wikipedia dump file
37 | that is still compressed, then you can access the dump as follows:
38 | 
39 | 
40 | 	wiki = epub_conversion.wiki_decoder.almost_smart_open("enwiki.bz2")
41 | 
42 | 
43 | Taking this dump as our **input** let us now use a generator to output all pairs of `title` and `redirection title` in this dump:
44 | 
45 | 	redirections = {redirect_from:redirect_to
46 | 		for redirect_from, redirect_to in epub_conversion.wiki_decoder.get_redirection_list(wiki)
47 | 	}
48 | 
49 | #### Page text
50 | 
51 | Suppose you are interested in the lines within each page's text section only, then:
52 | 
53 | 
54 | 	for line in epub_conversion.wiki_decoder.convert_wiki_to_lines(wiki):
55 | 		process_line( line )
56 | 
57 | 
58 | See Also:
59 | ---------
60 | 
61 | * [Wikipedia NER](https://github.com/JonathanRaiman/wikipedia_ner) a Python module that uses `epub_conversion` to process Wikipedia dumps and output only the lines that contain page to page links, with the link anchor texts extracted, and all markup removed.


--------------------------------------------------------------------------------
/epub_conversion/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from xml_cleaner import to_raw_text
 3 | from epub import open_epub, BadEpubFile
 4 | from zipfile import BadZipfile
 5 | 
 6 | 
 7 | def get_files_from_path(filetype, path):
 8 |     """
 9 |     Recursively returns files matching a filetype from
10 |     a path (e.g. return a list of paths from a folder
11 |     of epub files).
12 |     """
13 |     paths = []
14 |     for subdir in os.listdir(path):
15 |         joined_path = os.path.join(path, subdir)
16 |         if subdir.endswith(filetype):
17 |             paths.append((joined_path, subdir))
18 |         elif os.path.isdir(joined_path):
19 |             paths.extend(get_files_from_path(filetype, joined_path))
20 |     return paths
21 | 
22 | 
23 | def try_utf8(data):
24 |     "Returns a Unicode object on success, or None on failure"
25 |     try:
26 |         return data.decode('utf-8')
27 |     except UnicodeDecodeError:
28 |         return None
29 | 
30 | 
31 | def try_decode(ebook, item):
32 |     try:
33 |         return try_utf8(ebook.read_item(item))
34 |     except KeyError:
35 |         return None
36 | 
37 | 
38 | def open_book(path):
39 |     try:
40 |         return open_epub(path)
41 |     except (BadEpubFile, BadZipfile, KeyError, IndexError):
42 |         return None
43 | 
44 | 
45 | def convert_xml_element_to_lines(data, boundary):
46 |     start_boundary = "<%s" % (boundary)
47 |     end_boundary = "</%s>" % (boundary)
48 |     data = data.replace("\xa0", " ")
49 |     multi_line = data.split("\n")
50 |     lines = []
51 |     in_book = False
52 |     for line in multi_line:
53 |         if line.find(start_boundary) != -1:
54 |             in_book = True
55 |             line_end = line.find(">")
56 |             sliced_line = line[line_end + 1:]
57 |             if len(sliced_line) > 0:
58 |                 lines.append(sliced_line)
59 |             continue
60 |         if line.endswith(end_boundary):
61 |             in_book = False
62 |             line_end = line.find("<")
63 |             sliced_line = line[:line_end]
64 |             if len(sliced_line) > 0:
65 |                 lines.append(sliced_line)
66 |             continue
67 |         if in_book:
68 |             lines.append(line)
69 |     return lines
70 | 
71 | 
72 | def convert_epub_to_lines(ebook):
73 |     lines = []
74 |     for item in ebook.opf.manifest.values():
75 |         # read the content
76 |         data = try_decode(ebook, item)
77 |         if data is not None:
78 |             lines.extend(convert_xml_element_to_lines(data, "body"))
79 |     return lines
80 | 
81 | 
82 | def convert_lines_to_text(lines):
83 |     for sentence in to_raw_text(lines):
84 |         yield " ".join(sentence) + "\n"
85 | 


--------------------------------------------------------------------------------
/epub_conversion/wiki_decoder.py:
--------------------------------------------------------------------------------
  1 | from .utils import convert_lines_to_text
  2 | from xml.etree.cElementTree import fromstring
  3 | import gzip
  4 | import time
  5 | import re
  6 | from contextlib import closing
  7 | try:
  8 |     from IPython.display import clear_output as clear_output_ipython
  9 | except ImportError:
 10 |     def clear_output_ipython(*args, **kwargs):
 11 |         pass
 12 | from bz2file import BZ2File
 13 | from os import path
 14 | 
 15 | Namespaces = [
 16 |     "WP", "Aide", "Help", "Talk", "User", "Template", "Wikipedia",
 17 |     "File", "Book", "Portal", "Portail", "TimedText", "Module",
 18 |     "MediaWiki", "Special", "Spécial", "Media", "Category",
 19 |     "Catégorie", "[^:]+"
 20 | ]
 21 | 
 22 | Disambiguation = [
 23 |     "disambiguation", "homonymie", "значения", "disambigua", "peker",
 24 |     "ujednoznacznienie", "olika betydelser", "Begriffsklärung", "desambiguación"
 25 | ]
 26 | 
 27 | namespace_matcher = re.compile(
 28 |     "(?P<namespace>(?:" + "|".join(Namespaces) + "|[^|+])):.+",
 29 |     re.IGNORECASE)
 30 | disambiguation_matcher = re.compile(
 31 |     ".+ \((?:" + "|".join(Disambiguation) + ")\)",
 32 |     re.IGNORECASE)
 33 | 
 34 | ends_with_templator = re.compile("([\|}]})$")
 35 | 
 36 | 
 37 | def line_is_agreeable(line):
 38 |     return not (line.startswith("|") or
 39 |                 line.startswith("!") or
 40 |                 line.startswith("{{") or
 41 |                 line.startswith("{|") or
 42 |                 ends_with_templator.search(line) is not None)
 43 | 
 44 | 
 45 | class XMLNode(object):
 46 |     @staticmethod
 47 |     def parse_node(text):
 48 |         node = fromstring(text)
 49 |         return (node.tag, node.text)
 50 | 
 51 |     def __init__(self, text):
 52 |         self.tag, self.text = self.parse_node(text)
 53 | 
 54 | 
 55 | class TitleXMLNode(XMLNode):
 56 |     def is_disambiguation_page(self):
 57 |         return disambiguation_matcher.match(self.text) is not None
 58 | 
 59 |     def matches_special_namespaces(self):
 60 |         return namespace_matcher.match(self.text) is not None
 61 | 
 62 |     def is_special_page(self):
 63 |         return self.matches_special_namespaces() or self.is_disambiguation_page()
 64 | 
 65 | 
 66 | def smart_open(fname, mode='r'):
 67 |     _, ext = path.splitext(fname)
 68 |     if ext == '.bz2':
 69 |         return closing(BZ2File(fname, mode))
 70 |     if ext == '.gz':
 71 |         return closing(gzip.open(fname, mode))
 72 |     return open(fname, mode)
 73 | 
 74 | 
 75 | def almost_smart_open(fname, mode='r'):
 76 |     _, ext = path.splitext(fname)
 77 |     if ext == '.bz2':
 78 |         return BZ2File(fname, mode)
 79 |     if ext == '.gz':
 80 |         return gzip.open(fname, mode)
 81 |     return open(fname, mode)
 82 | 
 83 | 
 84 | def convert_wiki_to_corpus(path, target_path, target_mode="wb", *args, **kwargs):
 85 |     try:
 86 |         with gzip.open(target_path, target_mode) as file:
 87 |             origin_file = almost_smart_open(path, "rb")
 88 |             for sentence in convert_wiki_to_lines(origin_file, *args, **kwargs):
 89 |                 file.write(sentence.encode("utf-8"))
 90 |             origin_file.close()
 91 |     except KeyboardInterrupt:
 92 |         return origin_file
 93 | 
 94 | 
 95 | class WikiReaderState:
 96 |     """
 97 |     Stores the state of the reader
 98 |     as it sequentially discovers the
 99 |     contents of an xml dump line by line
100 |     """
101 | 
102 |     def __init__(self, file, report_every=100, clear_output=True):
103 |         # parameters & input
104 |         self.file = file
105 |         self.report_every = report_every
106 |         self.clear_output = clear_output
107 | 
108 |         # state
109 |         self.reset_state()
110 | 
111 |         # counters:
112 |         self.articles_seen = 0
113 |         self.filtered_articles_seen = 0
114 |         self.lines_seen = 0
115 | 
116 |         # clock:
117 |         self.start_time = time.time()
118 | 
119 |     def is_special(self):
120 |         """
121 |         Check whether the page is special:
122 |         is it a redirection, a namespace, or a
123 |         disambiguation_page.
124 |         """
125 |         return self.disambiguation_page or self.namespace_page or self.redirection_page
126 | 
127 |     def mark_redirection(self):
128 |         """
129 |         Tell state that a redirection was observed
130 |         """
131 |         self.redirection_page = True
132 | 
133 |     def enter_page(self):
134 |         """
135 |         Mark that reader is inside a page
136 |         """
137 |         self.in_page = True
138 |         self.articles_seen += 1
139 | 
140 |     def enter_text(self):
141 |         """
142 |         Mark that reader is inside the text portion of a page
143 |         """
144 |         self.inside_text = True
145 | 
146 |     def enter_line(self):
147 |         self.lines_seen += 1
148 | 
149 |     def mark_seen_filtered_article(self):
150 |         self.filtered_articles_seen += 1
151 |         if self.filtered_articles_seen % self.report_every == 0:
152 |             freq = self.filtered_articles_seen / (time.time() - self.start_time)
153 |             if self.clear_output:
154 |                 clear_output_ipython(wait=True)
155 |             print("%d articles seen so far. Processing %.3f articles / s : position %r" % (
156 |                 self.filtered_articles_seen, freq, self.file.tell()))
157 | 
158 |     def reset_state(self):
159 |         """
160 |         Resets all boolean observations in the state
161 |         """
162 |         self.in_page = False
163 |         self.inside_text = False
164 |         self.disambiguation_page = False
165 |         self.redirection_page = False
166 |         self.namespace_page = False
167 |         self.current_title = None
168 | 
169 |     def exit_page(self):
170 |         """
171 |         Mark that reader exits a page.
172 |         Also modifies state to reflect new knowledge.
173 |         """
174 |         self.reset_state()
175 | 
176 |     def exit_text(self):
177 |         self.inside_text = False
178 | 
179 |     def observe_title_line(self, line):
180 |         """
181 |         Observe and mark updates to state given
182 |         a line with <title> in it
183 |         """
184 |         title_node = TitleXMLNode(line)
185 |         self.current_title = title_node.text
186 |         self.disambiguation_page = title_node.is_disambiguation_page()
187 |         self.namespace_page = title_node.matches_special_namespaces()
188 | 
189 |     def print_state(self):
190 |         print("title          '%s'" % (self.current_title))
191 |         print("redirect       %r" % (self.redirection_page))
192 |         print("disambiguation %r" % (self.disambiguation_page))
193 |         print("special_page   %r" % (self.namespace_page))
194 | 
195 | 
196 | def get_redirection_list(wiki,
197 |                          encoding="utf-8",
198 |                          element="page",
199 |                          max_articles=9999999999999999,
200 |                          maxlines=9999999999999999,
201 |                          offset=0):
202 | 
203 |     state = WikiReaderState(wiki, report_every=100000000, clear_output=False)
204 | 
205 |     start_element_node = "<%s" % (element)
206 |     end_element_node = "</%s>" % (element)
207 | 
208 |     redirect_to = None
209 | 
210 |     for line in wiki:
211 |         line = line.decode(encoding)
212 |         state.enter_line()
213 | 
214 |         if state.lines_seen > maxlines:
215 |             break
216 | 
217 |         if line.find("<redirect") != -1:
218 |             state.mark_redirection()
219 |             redirect_to = line.split('"')[1]
220 |             continue
221 | 
222 |         if line.find(start_element_node) != -1:
223 |             state.enter_page()
224 |             if state.filtered_articles_seen >= max_articles:
225 |                 break
226 |             continue
227 | 
228 |         if state.in_page and line.find("<title>") != -1:
229 |             state.observe_title_line(line)
230 |             continue
231 | 
232 |         if line.find(end_element_node) != -1:
233 |             if state.redirection_page:
234 |                 state.mark_seen_filtered_article()
235 |                 yield (state.current_title, redirect_to)
236 |             redirect_to = None
237 |             state.exit_page()
238 |             continue
239 | 
240 | 
241 | def convert_wiki_to_lines(wiki,
242 |                           skip_cdata=False,
243 |                           line_converter=convert_lines_to_text,
244 |                           encoding="utf-8",
245 |                           inner_element="text",
246 |                           element="page",
247 |                           report_every=100,
248 |                           clear_output=True,
249 |                           parse_special_pages=False,
250 |                           skip_templated_lines=True,
251 |                           max_articles=9999999999999999,
252 |                           maxlines=9999999999999999,
253 |                           offset=0):
254 | 
255 |     state = WikiReaderState(wiki, report_every=report_every, clear_output=clear_output)
256 | 
257 |     current_article = ''
258 |     start_element_node = "<%s" % (element)
259 |     start_inner_element_node = "<%s" % (inner_element)
260 |     end_inner_element_node = "</%s>" % (inner_element)
261 |     end_element_node = "</%s>" % (element)
262 | 
263 |     for line in wiki:
264 |         line = line.decode(encoding)
265 |         state.enter_line()
266 | 
267 |         if state.lines_seen > maxlines:
268 |             break
269 | 
270 |         if skip_cdata:
271 |             if line.find("<![CDATA") != -1:
272 |                 continue
273 | 
274 |         if line.find("<redirect") != -1:
275 |             state.mark_redirection()
276 |             continue
277 | 
278 |         if line.find(start_element_node) != -1:
279 |             state.enter_page()
280 |             if state.filtered_articles_seen >= max_articles:
281 |                 break
282 |             continue
283 | 
284 |         if state.in_page and line.find("<title>") != -1:
285 |             state.observe_title_line(line)
286 |             continue
287 | 
288 |         if (parse_special_pages or not state.is_special()):
289 |             start_pos = line.find(start_inner_element_node)
290 |             if start_pos != -1:
291 |                 state.enter_text()
292 | 
293 |                 line = line[start_pos + len(start_inner_element_node):]
294 |                 endpos = line.find(">")
295 |                 line = line[endpos + 1:]
296 | 
297 |         if line.find(end_element_node) != -1:
298 |             if state.articles_seen > offset and (parse_special_pages or not state.is_special()):
299 |                 state.mark_seen_filtered_article()
300 |                 for subline in line_converter(current_article, state.current_title):
301 |                     yield subline
302 |             current_article = ''
303 |             state.exit_page()
304 |             continue
305 | 
306 |         if state.inside_text and (not skip_templated_lines or line_is_agreeable(line)):
307 |             endpos = line.find(end_inner_element_node)
308 |             if endpos != -1:
309 |                 line = line[:endpos]
310 |             current_article += (line.replace("\xa0", " ")
311 |                                     .replace("&quot;", '"')
312 |                                     .replace("&gt;", ">")
313 |                                     .replace("&lt;", "<")
314 |                                     .replace("&amp;nbsp;", " ")
315 |                                     .replace("&amp;", "&"))
316 |             if endpos != -1:
317 |                 state.exit_text()
318 |             continue
319 | 
320 |         if state.inside_text and line.find(end_inner_element_node) != -1:
321 |             state.exit_text()
322 | 


--------------------------------------------------------------------------------