├── MANIFEST.in ├── epub_conversion ├── __init__.py ├── converter.py ├── utils.py └── wiki_decoder.py ├── upload.sh ├── .gitignore ├── setup.py └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md -------------------------------------------------------------------------------- /epub_conversion/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Epub and wiki dump conversion module. Provides utilities 3 | for taking large xml files and extracting pages or tokenized 4 | sentences and words. 5 | 6 | """ 7 | 8 | from .converter import Converter 9 | from .wiki_decoder import convert_wiki_to_lines, convert_wiki_to_corpus 10 | 11 | __all__ = ["Converter", "convert_wiki_to_lines", "convert_wiki_to_corpus"] -------------------------------------------------------------------------------- /upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # stop script on error and print it 4 | set -e 5 | # inform me of undefined variables 6 | set -u 7 | # handle cascading failures well 8 | set -o pipefail 9 | 10 | SCRIPT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 11 | 12 | rm ${SCRIPT_DIR}/dist/* 13 | python3 setup.py clean 14 | python3 setup.py sdist 15 | twine upload ${SCRIPT_DIR}/dist/* --verbose 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | # Compiled source # 3 | ################### 4 | *.com 5 | *.class 6 | *.dll 7 | *.exe 8 | *.o 9 | *.so 10 | 11 | # Packages # 12 | ############ 13 | # it's better to unpack these files and commit the raw source 14 | # git has its own built in compression methods 15 | *.7z 16 | *.dmg 17 | *.gz 18 | *.iso 19 | *.jar 20 | *.rar 21 | *.tar 22 | *.zip 23 | *.gem 24 | *.pem 25 | dist/ 26 | build/ 27 | 28 | # Saves # 29 | ######### 30 | saves/* 31 | imported_saves/* 32 | pvdm_snapshots/* 33 | sentiment_data/* 34 | *.npy 35 | *.mat 36 | *.vocab 37 | *.svocab 38 | text8 39 | __pycache__/* 40 | *.pyc 41 | *.egg-info 42 | 43 | # Logs and databases # 44 | ###################### 45 | *.log 46 | *.sql 47 | *.sqlite 48 | 49 | # OS generated files # 50 | ###################### 51 | .DS_Store 52 | .DS_Store? 53 | ._* 54 | .Spotlight-V100 55 | .Trashes 56 | ehthumbs.db 57 | Thumbs.db 58 | ======= 59 | .DS_Store 60 | >>>>>>> 9ced40381de6f9e6c2b02fc8ba7bb993203c6d62 -------------------------------------------------------------------------------- /epub_conversion/converter.py: -------------------------------------------------------------------------------- 1 | from .utils import get_files_from_path, convert_epub_to_lines, convert_lines_to_text, open_book 2 | import gzip 3 | 4 | 5 | class Converter(object): 6 | """ 7 | Convert a folder of epubs to raw text for corpus 8 | learning. 9 | """ 10 | 11 | def __init__(self, path): 12 | self.path = path 13 | 14 | def convert(self, target_path): 15 | epub_paths = get_files_from_path(".epub", self.path) 16 | 17 | with gzip.open(target_path, "wb") as file: 18 | for (epub_path, epub_name) in epub_paths: 19 | book = open_book(epub_path) 20 | if book is not None: 21 | for sentence in convert_lines_to_text(convert_epub_to_lines(book)): 22 | file.write(sentence.encode("utf-8")) 23 | print("Wrote \"%s\" to disk" % (epub_name)) 24 | else: 25 | print("Couldn't open \"%s\"." % (epub_name)) 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | 5 | def readfile(fname): 6 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 7 | 8 | 9 | setup( 10 | name='epub-conversion', 11 | version='1.0.9', 12 | description='Python package for converting xml and epubs to text files', 13 | long_description=readfile('README.md'), 14 | long_description_content_type="text/markdown", 15 | ext_modules=[], 16 | packages=find_packages(), 17 | py_modules=[], 18 | author='Jonathan Raiman', 19 | author_email='jonathanraiman@gmail.com', 20 | url='https://github.com/JonathanRaiman/epub_conversion', 21 | download_url='https://github.com/JonathanRaiman/epub_conversion', 22 | keywords='XML, epub, tokenization, NLP', 23 | license='MIT', 24 | platforms='any', 25 | zip_safe=False, 26 | classifiers=[ 27 | 'Intended Audience :: Science/Research', 28 | 'Operating System :: OS Independent', 29 | 'Programming Language :: Python :: 3.3', 30 | 'Topic :: Text Processing :: Linguistic', 31 | ], 32 | setup_requires=[], 33 | install_requires=[ 34 | 'bz2file', 35 | 'epub', 36 | 'ciseau' 37 | ], 38 | include_package_data=True, 39 | ) 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | epub conversion 2 | --------------- 3 | 4 | Create text corpuses using epubs and wiki dumps. 5 | This is a python package with a Converter for epub and xml (wiki dumps) to text, lines, or Python generators. 6 | 7 | Usage: 8 | ------ 9 | 10 | ### Epub usage 11 | 12 | #### Book by book 13 | 14 | To convert epubs to text files, usage is straightforward. First create a converter object: 15 | 16 | converter = Converter("my_ebooks_folder/") 17 | 18 | Then using this converter let's concatenate all the text within the ebooks into a single mega text file: 19 | 20 | converter.convert("my_succinct_text_file.gz") 21 | 22 | #### Line by line 23 | 24 | You can also proceed line by line: 25 | 26 | from epub_conversion.utils import open_book, convert_epub_to_lines 27 | 28 | book = open_book("twilight.epub") 29 | 30 | lines = convert_epub_to_lines(book) 31 | 32 | ### Wikidump usage 33 | 34 | #### Redirections 35 | 36 | Suppose you are interested in all redirections in a given Wikipedia dump file 37 | that is still compressed, then you can access the dump as follows: 38 | 39 | 40 | wiki = epub_conversion.wiki_decoder.almost_smart_open("enwiki.bz2") 41 | 42 | 43 | Taking this dump as our **input** let us now use a generator to output all pairs of `title` and `redirection title` in this dump: 44 | 45 | redirections = {redirect_from:redirect_to 46 | for redirect_from, redirect_to in epub_conversion.wiki_decoder.get_redirection_list(wiki) 47 | } 48 | 49 | #### Page text 50 | 51 | Suppose you are interested in the lines within each page's text section only, then: 52 | 53 | 54 | for line in epub_conversion.wiki_decoder.convert_wiki_to_lines(wiki): 55 | process_line( line ) 56 | 57 | 58 | See Also: 59 | --------- 60 | 61 | * [Wikipedia NER](https://github.com/JonathanRaiman/wikipedia_ner) a Python module that uses `epub_conversion` to process Wikipedia dumps and output only the lines that contain page to page links, with the link anchor texts extracted, and all markup removed. -------------------------------------------------------------------------------- /epub_conversion/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from xml_cleaner import to_raw_text 3 | from epub import open_epub, BadEpubFile 4 | from zipfile import BadZipfile 5 | 6 | 7 | def get_files_from_path(filetype, path): 8 | """ 9 | Recursively returns files matching a filetype from 10 | a path (e.g. return a list of paths from a folder 11 | of epub files). 12 | """ 13 | paths = [] 14 | for subdir in os.listdir(path): 15 | joined_path = os.path.join(path, subdir) 16 | if subdir.endswith(filetype): 17 | paths.append((joined_path, subdir)) 18 | elif os.path.isdir(joined_path): 19 | paths.extend(get_files_from_path(filetype, joined_path)) 20 | return paths 21 | 22 | 23 | def try_utf8(data): 24 | "Returns a Unicode object on success, or None on failure" 25 | try: 26 | return data.decode('utf-8') 27 | except UnicodeDecodeError: 28 | return None 29 | 30 | 31 | def try_decode(ebook, item): 32 | try: 33 | return try_utf8(ebook.read_item(item)) 34 | except KeyError: 35 | return None 36 | 37 | 38 | def open_book(path): 39 | try: 40 | return open_epub(path) 41 | except (BadEpubFile, BadZipfile, KeyError, IndexError): 42 | return None 43 | 44 | 45 | def convert_xml_element_to_lines(data, boundary): 46 | start_boundary = "<%s" % (boundary) 47 | end_boundary = "" % (boundary) 48 | data = data.replace("\xa0", " ") 49 | multi_line = data.split("\n") 50 | lines = [] 51 | in_book = False 52 | for line in multi_line: 53 | if line.find(start_boundary) != -1: 54 | in_book = True 55 | line_end = line.find(">") 56 | sliced_line = line[line_end + 1:] 57 | if len(sliced_line) > 0: 58 | lines.append(sliced_line) 59 | continue 60 | if line.endswith(end_boundary): 61 | in_book = False 62 | line_end = line.find("<") 63 | sliced_line = line[:line_end] 64 | if len(sliced_line) > 0: 65 | lines.append(sliced_line) 66 | continue 67 | if in_book: 68 | lines.append(line) 69 | return lines 70 | 71 | 72 | def convert_epub_to_lines(ebook): 73 | lines = [] 74 | for item in ebook.opf.manifest.values(): 75 | # read the content 76 | data = try_decode(ebook, item) 77 | if data is not None: 78 | lines.extend(convert_xml_element_to_lines(data, "body")) 79 | return lines 80 | 81 | 82 | def convert_lines_to_text(lines): 83 | for sentence in to_raw_text(lines): 84 | yield " ".join(sentence) + "\n" 85 | -------------------------------------------------------------------------------- /epub_conversion/wiki_decoder.py: -------------------------------------------------------------------------------- 1 | from .utils import convert_lines_to_text 2 | from xml.etree.cElementTree import fromstring 3 | import gzip 4 | import time 5 | import re 6 | from contextlib import closing 7 | try: 8 | from IPython.display import clear_output as clear_output_ipython 9 | except ImportError: 10 | def clear_output_ipython(*args, **kwargs): 11 | pass 12 | from bz2file import BZ2File 13 | from os import path 14 | 15 | Namespaces = [ 16 | "WP", "Aide", "Help", "Talk", "User", "Template", "Wikipedia", 17 | "File", "Book", "Portal", "Portail", "TimedText", "Module", 18 | "MediaWiki", "Special", "Spécial", "Media", "Category", 19 | "Catégorie", "[^:]+" 20 | ] 21 | 22 | Disambiguation = [ 23 | "disambiguation", "homonymie", "значения", "disambigua", "peker", 24 | "ujednoznacznienie", "olika betydelser", "Begriffsklärung", "desambiguación" 25 | ] 26 | 27 | namespace_matcher = re.compile( 28 | "(?P(?:" + "|".join(Namespaces) + "|[^|+])):.+", 29 | re.IGNORECASE) 30 | disambiguation_matcher = re.compile( 31 | ".+ \((?:" + "|".join(Disambiguation) + ")\)", 32 | re.IGNORECASE) 33 | 34 | ends_with_templator = re.compile("([\|}]})$") 35 | 36 | 37 | def line_is_agreeable(line): 38 | return not (line.startswith("|") or 39 | line.startswith("!") or 40 | line.startswith("{{") or 41 | line.startswith("{|") or 42 | ends_with_templator.search(line) is not None) 43 | 44 | 45 | class XMLNode(object): 46 | @staticmethod 47 | def parse_node(text): 48 | node = fromstring(text) 49 | return (node.tag, node.text) 50 | 51 | def __init__(self, text): 52 | self.tag, self.text = self.parse_node(text) 53 | 54 | 55 | class TitleXMLNode(XMLNode): 56 | def is_disambiguation_page(self): 57 | return disambiguation_matcher.match(self.text) is not None 58 | 59 | def matches_special_namespaces(self): 60 | return namespace_matcher.match(self.text) is not None 61 | 62 | def is_special_page(self): 63 | return self.matches_special_namespaces() or self.is_disambiguation_page() 64 | 65 | 66 | def smart_open(fname, mode='r'): 67 | _, ext = path.splitext(fname) 68 | if ext == '.bz2': 69 | return closing(BZ2File(fname, mode)) 70 | if ext == '.gz': 71 | return closing(gzip.open(fname, mode)) 72 | return open(fname, mode) 73 | 74 | 75 | def almost_smart_open(fname, mode='r'): 76 | _, ext = path.splitext(fname) 77 | if ext == '.bz2': 78 | return BZ2File(fname, mode) 79 | if ext == '.gz': 80 | return gzip.open(fname, mode) 81 | return open(fname, mode) 82 | 83 | 84 | def convert_wiki_to_corpus(path, target_path, target_mode="wb", *args, **kwargs): 85 | try: 86 | with gzip.open(target_path, target_mode) as file: 87 | origin_file = almost_smart_open(path, "rb") 88 | for sentence in convert_wiki_to_lines(origin_file, *args, **kwargs): 89 | file.write(sentence.encode("utf-8")) 90 | origin_file.close() 91 | except KeyboardInterrupt: 92 | return origin_file 93 | 94 | 95 | class WikiReaderState: 96 | """ 97 | Stores the state of the reader 98 | as it sequentially discovers the 99 | contents of an xml dump line by line 100 | """ 101 | 102 | def __init__(self, file, report_every=100, clear_output=True): 103 | # parameters & input 104 | self.file = file 105 | self.report_every = report_every 106 | self.clear_output = clear_output 107 | 108 | # state 109 | self.reset_state() 110 | 111 | # counters: 112 | self.articles_seen = 0 113 | self.filtered_articles_seen = 0 114 | self.lines_seen = 0 115 | 116 | # clock: 117 | self.start_time = time.time() 118 | 119 | def is_special(self): 120 | """ 121 | Check whether the page is special: 122 | is it a redirection, a namespace, or a 123 | disambiguation_page. 124 | """ 125 | return self.disambiguation_page or self.namespace_page or self.redirection_page 126 | 127 | def mark_redirection(self): 128 | """ 129 | Tell state that a redirection was observed 130 | """ 131 | self.redirection_page = True 132 | 133 | def enter_page(self): 134 | """ 135 | Mark that reader is inside a page 136 | """ 137 | self.in_page = True 138 | self.articles_seen += 1 139 | 140 | def enter_text(self): 141 | """ 142 | Mark that reader is inside the text portion of a page 143 | """ 144 | self.inside_text = True 145 | 146 | def enter_line(self): 147 | self.lines_seen += 1 148 | 149 | def mark_seen_filtered_article(self): 150 | self.filtered_articles_seen += 1 151 | if self.filtered_articles_seen % self.report_every == 0: 152 | freq = self.filtered_articles_seen / (time.time() - self.start_time) 153 | if self.clear_output: 154 | clear_output_ipython(wait=True) 155 | print("%d articles seen so far. Processing %.3f articles / s : position %r" % ( 156 | self.filtered_articles_seen, freq, self.file.tell())) 157 | 158 | def reset_state(self): 159 | """ 160 | Resets all boolean observations in the state 161 | """ 162 | self.in_page = False 163 | self.inside_text = False 164 | self.disambiguation_page = False 165 | self.redirection_page = False 166 | self.namespace_page = False 167 | self.current_title = None 168 | 169 | def exit_page(self): 170 | """ 171 | Mark that reader exits a page. 172 | Also modifies state to reflect new knowledge. 173 | """ 174 | self.reset_state() 175 | 176 | def exit_text(self): 177 | self.inside_text = False 178 | 179 | def observe_title_line(self, line): 180 | """ 181 | Observe and mark updates to state given 182 | a line with in it 183 | """ 184 | title_node = TitleXMLNode(line) 185 | self.current_title = title_node.text 186 | self.disambiguation_page = title_node.is_disambiguation_page() 187 | self.namespace_page = title_node.matches_special_namespaces() 188 | 189 | def print_state(self): 190 | print("title '%s'" % (self.current_title)) 191 | print("redirect %r" % (self.redirection_page)) 192 | print("disambiguation %r" % (self.disambiguation_page)) 193 | print("special_page %r" % (self.namespace_page)) 194 | 195 | 196 | def get_redirection_list(wiki, 197 | encoding="utf-8", 198 | element="page", 199 | max_articles=9999999999999999, 200 | maxlines=9999999999999999, 201 | offset=0): 202 | 203 | state = WikiReaderState(wiki, report_every=100000000, clear_output=False) 204 | 205 | start_element_node = "<%s" % (element) 206 | end_element_node = "</%s>" % (element) 207 | 208 | redirect_to = None 209 | 210 | for line in wiki: 211 | line = line.decode(encoding) 212 | state.enter_line() 213 | 214 | if state.lines_seen > maxlines: 215 | break 216 | 217 | if line.find("<redirect") != -1: 218 | state.mark_redirection() 219 | redirect_to = line.split('"')[1] 220 | continue 221 | 222 | if line.find(start_element_node) != -1: 223 | state.enter_page() 224 | if state.filtered_articles_seen >= max_articles: 225 | break 226 | continue 227 | 228 | if state.in_page and line.find("<title>") != -1: 229 | state.observe_title_line(line) 230 | continue 231 | 232 | if line.find(end_element_node) != -1: 233 | if state.redirection_page: 234 | state.mark_seen_filtered_article() 235 | yield (state.current_title, redirect_to) 236 | redirect_to = None 237 | state.exit_page() 238 | continue 239 | 240 | 241 | def convert_wiki_to_lines(wiki, 242 | skip_cdata=False, 243 | line_converter=convert_lines_to_text, 244 | encoding="utf-8", 245 | inner_element="text", 246 | element="page", 247 | report_every=100, 248 | clear_output=True, 249 | parse_special_pages=False, 250 | skip_templated_lines=True, 251 | max_articles=9999999999999999, 252 | maxlines=9999999999999999, 253 | offset=0): 254 | 255 | state = WikiReaderState(wiki, report_every=report_every, clear_output=clear_output) 256 | 257 | current_article = '' 258 | start_element_node = "<%s" % (element) 259 | start_inner_element_node = "<%s" % (inner_element) 260 | end_inner_element_node = "</%s>" % (inner_element) 261 | end_element_node = "</%s>" % (element) 262 | 263 | for line in wiki: 264 | line = line.decode(encoding) 265 | state.enter_line() 266 | 267 | if state.lines_seen > maxlines: 268 | break 269 | 270 | if skip_cdata: 271 | if line.find("<![CDATA") != -1: 272 | continue 273 | 274 | if line.find("<redirect") != -1: 275 | state.mark_redirection() 276 | continue 277 | 278 | if line.find(start_element_node) != -1: 279 | state.enter_page() 280 | if state.filtered_articles_seen >= max_articles: 281 | break 282 | continue 283 | 284 | if state.in_page and line.find("<title>") != -1: 285 | state.observe_title_line(line) 286 | continue 287 | 288 | if (parse_special_pages or not state.is_special()): 289 | start_pos = line.find(start_inner_element_node) 290 | if start_pos != -1: 291 | state.enter_text() 292 | 293 | line = line[start_pos + len(start_inner_element_node):] 294 | endpos = line.find(">") 295 | line = line[endpos + 1:] 296 | 297 | if line.find(end_element_node) != -1: 298 | if state.articles_seen > offset and (parse_special_pages or not state.is_special()): 299 | state.mark_seen_filtered_article() 300 | for subline in line_converter(current_article, state.current_title): 301 | yield subline 302 | current_article = '' 303 | state.exit_page() 304 | continue 305 | 306 | if state.inside_text and (not skip_templated_lines or line_is_agreeable(line)): 307 | endpos = line.find(end_inner_element_node) 308 | if endpos != -1: 309 | line = line[:endpos] 310 | current_article += (line.replace("\xa0", " ") 311 | .replace(""", '"') 312 | .replace(">", ">") 313 | .replace("<", "<") 314 | .replace("&nbsp;", " ") 315 | .replace("&", "&")) 316 | if endpos != -1: 317 | state.exit_text() 318 | continue 319 | 320 | if state.inside_text and line.find(end_inner_element_node) != -1: 321 | state.exit_text() 322 | --------------------------------------------------------------------------------