├── tests
├── gutenbergpy
    ├── __init__.py
    ├── caches
    │   ├── __init__.py
    │   ├── cache.py
    │   ├── gutenbergindex_indices.db.sql
    │   ├── gutenbergindex.db.sql
    │   ├── mongodbcache.py
    │   └── sqlitecache.py
    ├── parse
    │   ├── __init__.py
    │   ├── rdfparseresults.py
    │   ├── cachefields.py
    │   ├── book.py
    │   ├── parseitem.py
    │   ├── parseitemtitles.py
    │   ├── parseitemfile.py
    │   └── rdfparser.py
    ├── gutenbergcachesettings.py
    ├── gutenbergcache.py
    ├── utils.py
    ├── orderedset.py
    └── textget.py
├── dblogos.png
├── mongodb.png
├── sqlite.png
├── sqlitecheme.png
├── pyproject.toml
├── setup.py
├── .gitignore
├── test.py
├── setup.cfg
├── LICENSE.txt
└── README.md


/tests:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/gutenbergpy/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/gutenbergpy/caches/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/gutenbergpy/parse/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dblogos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raduangelescu/gutenbergpy/HEAD/dblogos.png


--------------------------------------------------------------------------------
/mongodb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raduangelescu/gutenbergpy/HEAD/mongodb.png


--------------------------------------------------------------------------------
/sqlite.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raduangelescu/gutenbergpy/HEAD/sqlite.png


--------------------------------------------------------------------------------
/sqlitecheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raduangelescu/gutenbergpy/HEAD/sqlitecheme.png


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=42",
4 |     "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"


--------------------------------------------------------------------------------
/gutenbergpy/parse/rdfparseresults.py:
--------------------------------------------------------------------------------
1 | ##
2 | # Used to hold the parser results
3 | class RDFParseResults:
4 |     def __init__(self):
5 |         self.field_sets = []
6 |         self.books = []
7 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from os.path import isfile
 2 | from sys import version_info
 3 | 
 4 | from setuptools import find_packages
 5 | from setuptools import setup
 6 | 
 7 | setup(
 8 |     packages=find_packages(),
 9 |     package_data={'gutenbergpy.caches': ['*.sql']},
10 |     include_package_data=True)


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | build/
 3 | GutenbergPy.egg-info/
 4 | dist/
 5 | .idea/gutenbergpy.iml
 6 | .idea/inspectionProfiles/profiles_settings.xml
 7 | .idea/misc.xml
 8 | .idea/modules.xml
 9 | .idea/vcs.xml
10 | .idea/workspace.xml
11 | texts/
12 | gutenbergindex.db
13 | cache/
14 | rdf-files.tar.bz2
15 | memory.pickle
16 | memusage.txt
17 | 


--------------------------------------------------------------------------------
/gutenbergpy/parse/cachefields.py:
--------------------------------------------------------------------------------
 1 | ##
 2 | # The main fields of a book
 3 | # noinspection PyClassHasNoInit
 4 | class Fields:  # Fields we will have in the caches db for a book entry
 5 |     TITLE = 0
 6 |     SUBJECT = 1
 7 |     TYPE = 2
 8 |     LANGUAGE = 3
 9 |     AUTHOR = 4
10 |     BOOKSHELF = 5
11 |     FILES = 6
12 |     PUBLISHER = 7
13 |     RIGHTS = 8
14 |     FIELD_COUNT = 9
15 | 


--------------------------------------------------------------------------------
/gutenbergpy/caches/cache.py:
--------------------------------------------------------------------------------
 1 | ##
 2 | # Base class for any kind of cache
 3 | class Cache:
 4 |     def __init__(self):
 5 |         pass
 6 | 
 7 |     def create_cache(self, parse_results):
 8 |         raise NotImplementedError("Please implement the create_cache function")
 9 | 
10 |     def query(self, **kwargs):
11 |         raise NotImplementedError("Please implement the query function")
12 | 
13 |     def native_query(self, sql_query):
14 |         raise NotImplementedError("Please implement the native_query function")
15 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import gutenbergpy.textget
 3 | from gutenbergpy.gutenbergcache import GutenbergCache
 4 | 
 5 | # create cache from scratchfrom scratch
 6 | GutenbergCache.create(refresh=True, download=True, unpack=True, parse=True, cache=True, deleteTemp=True)
 7 | # get the default cache (SQLite)
 8 | cache = GutenbergCache.get_cache()
 9 | # For the query function you can use the following fields: languages authors types titles subjects publishers bookshelves
10 | print(cache.query(downloadtype=['application/plain', 'text/plain', 'text/html; charset=utf-8']))
11 | # Print stripped text
12 | print(gutenbergpy.textget.strip_headers(gutenbergpy.textget.get_text_by_id(1000)))
13 | 


--------------------------------------------------------------------------------
/gutenbergpy/parse/book.py:
--------------------------------------------------------------------------------
 1 | ##
 2 | # Used to hold a book in parse results after parsing
 3 | class Book:
 4 |     def __init__(self, publisher_id, rights_id, language_id,
 5 |                  bookshelf_id, gutenberg_book_id,
 6 |                  date_issued, num_downloads,
 7 |                  titles_id, subjects_id, type_id, authors_id, files_id):
 8 |         self.publisher_id = publisher_id
 9 |         self.rights_id = rights_id
10 |         self.language_id = language_id
11 |         self.bookshelf_id = bookshelf_id
12 |         self.gutenberg_book_id = gutenberg_book_id
13 |         self.date_issued = date_issued
14 |         self.num_downloads = num_downloads
15 |         self.titles_id = titles_id
16 |         self.subjects_id = subjects_id
17 |         self.authors_id = authors_id
18 |         self.files_id = files_id
19 |         self.type_id = type_id
20 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name= gutenbergpy
 3 | author= Radu Angelescu
 4 | author_email = raduangelescu@gmail.com
 5 | version = 0.3.6
 6 | description = Library to create and interogate local cache for Project Gutenberg
 7 | long_description = file: README.md
 8 | long_description_content_type = text/markdown
 9 | url = https://github.com/raduangelescu/gutenbergpy
10 | download_url = http://pypi.python.org/pypi/gutenbergpy
11 | project_urls =
12 |     Bug Tracker = http://pypi.python.org/pypi/gutenbergpy/issues
13 | classifiers =
14 |     Programming Language :: Python :: 3
15 |     License :: OSI Approved :: MIT License
16 |     Operating System :: OS Independent
17 | license = LICENSE.txt
18 | 
19 | [options]
20 | install_requires =
21 |         future>=0.15.2
22 |         httpsproxy-urllib2
23 |         lxml>=3.2.0
24 |         pymongo
25 |         setuptools>=18.5
26 |         chardet
27 | python_requires = >=3.6
28 | 


--------------------------------------------------------------------------------
/gutenbergpy/caches/gutenbergindex_indices.db.sql:
--------------------------------------------------------------------------------
 1 | BEGIN TRANSACTION;
 2 | 
 3 | CREATE INDEX `books_dateissued_idx` ON `books` (`dateissued` ASC);
 4 | CREATE INDEX `books_numdownloads_idx` ON `books` (`numdownloads` ASC);
 5 | CREATE INDEX `gutenbergbookid_idx` ON `books` (`gutenbergbookid` ASC);
 6 | 
 7 | CREATE INDEX `authors_name_idx` ON `authors` (`name` ASC);
 8 | 
 9 | CREATE INDEX `types_name_idx` ON `types` (`name` ASC);
10 | 
11 | CREATE INDEX `titles_name_idx` ON `titles` (`name` ASC);
12 | 
13 | CREATE INDEX `subjects_name_idx` ON `subjects` (`name` ASC);
14 | 
15 | CREATE INDEX `rights_name_idx` ON `rights` (`name` ASC);
16 | 
17 | CREATE INDEX `publishers_name_idx` ON `publishers` (`name` ASC);
18 | 
19 | CREATE INDEX `languages_name_idx` ON `languages` (`name` ASC);
20 | 
21 | CREATE INDEX `bookshelves_name_idx` ON `bookshelves` (`name` ASC);
22 | 
23 | CREATE INDEX `downloadlinks_name_idx` ON `downloadlinks` (`name` ASC);
24 | 
25 | 
26 | COMMIT;
27 | 


--------------------------------------------------------------------------------
/gutenbergpy/parse/parseitem.py:
--------------------------------------------------------------------------------
 1 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
 2 | from gutenbergpy.orderedset import OrderedSet
 3 | 
 4 | 
 5 | ##
 6 | # Helper for parsing a rdf file item
 7 | class ParseItem:
 8 |     def __init__(self, xpath):
 9 |         self.xPath = xpath
10 |         self.set = OrderedSet()
11 | 
12 |     def needs_book_id(self):
13 |         return False
14 | 
15 |     def add_to_set_internal(self, xpathResults, ret):
16 |         if len(xpathResults) > 0:
17 |             for el in xpathResults:
18 |                 ret.append(self.set.add(el.replace("\"", "'")))
19 | 
20 |     def add_to_set(self, xpathResults, ret):
21 |         self.add_to_set_internal(xpathResults, ret)
22 | 
23 |     def do(self, doc):
24 |         tmp = []
25 |         for xpth in self.xPath:
26 |             xpathResults = doc.xpath(xpth, namespaces=GutenbergCacheSettings.NS)
27 |             self.add_to_set(xpathResults, tmp)
28 |         return tmp
29 | 


--------------------------------------------------------------------------------
/gutenbergpy/parse/parseitemtitles.py:
--------------------------------------------------------------------------------
 1 | from gutenbergpy.parse.parseitem import ParseItem
 2 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
 3 | 
 4 | 
 5 | ##
 6 | # Parser helper for title items
 7 | # noinspection PyMethodOverriding
 8 | class ParseItemTitles(ParseItem):
 9 |     def needs_book_id(self):
10 |         return True
11 | 
12 |     def att_to_set_book_id(self, xpathResults, ret, book_id):
13 |         if len(xpathResults) > 0:
14 |             for el in xpathResults:
15 |                 text = el.replace("\"", "'")
16 |                 index = self.set.index(text)
17 |                 if index != -1:
18 |                     self.set[text][1] = book_id
19 |                 else:
20 |                     new_index = self.set.add((text, book_id))
21 |                     ret.append(new_index)
22 | 
23 |     def do(self, doc, book_id):
24 |         tmp = []
25 |         for xpth in self.xPath:
26 |             xpathResults = doc.xpath(xpth, namespaces=GutenbergCacheSettings.NS)
27 |             self.att_to_set_book_id(xpathResults, tmp, book_id)
28 |         return tmp
29 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016 Radu Angelescu.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a
 4 | copy of this software and associated documentation files (the "Software"),
 5 | to deal in the Software without restriction, including without limitation
 6 | the rights to use, copy, modify, merge, publish, distribute, sublicense,
 7 | and/or sell copies of the Software, and to permit persons to whom the
 8 | Software is furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
19 | DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/gutenbergpy/caches/gutenbergindex.db.sql:
--------------------------------------------------------------------------------
 1 | BEGIN TRANSACTION;
 2 | CREATE TABLE `types` (
 3 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
 4 | 	`name`	TEXT
 5 | );
 6 | CREATE TABLE `titles` (
 7 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
 8 | 	`name`	TEXT,
 9 | 	`bookid` INTEGER
10 | );
11 | CREATE TABLE `subjects` (
12 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
13 | 	`name`	TEXT
14 | );
15 | CREATE TABLE `rights` (
16 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
17 | 	`name`	TEXT
18 | );
19 | CREATE TABLE `publishers` (
20 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
21 | 	`name`	TEXT
22 | );
23 | CREATE TABLE `languages` (
24 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
25 | 	`name`	TEXT
26 | );
27 | 
28 | CREATE TABLE `downloadlinkstype` (
29 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
30 | 	`name`	TEXT
31 | );
32 | 
33 | CREATE TABLE `downloadlinks` (
34 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
35 | 	`name`	TEXT,
36 | 	`downloadtypeid`	INTEGER,
37 | 	`bookid` INTEGER
38 | );
39 | CREATE TABLE `bookshelves` (
40 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
41 | 	`name`	TEXT
42 | );
43 | CREATE TABLE "books" (
44 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
45 | 	`publisherid` INTEGER,
46 | 	`dateissued` DATE,
47 | 	`rightsid` INTEGER,
48 | 	`numdownloads` INTEGER,
49 | 	`languageid` INTEGER,
50 | 	`bookshelveid` INTEGER,
51 | 	`gutenbergbookid` INTEGER,
52 | 	`typeid` INTEGER
53 | );
54 | CREATE TABLE `book_subjects` (
55 | 	`bookid`	INTEGER,
56 | 	`subjectid`	INTEGER
57 | );
58 | CREATE TABLE `book_authors` (
59 | 	`bookid`	INTEGER,
60 | 	`authorid`	INTEGER
61 | );
62 | CREATE TABLE `authors` (
63 | 	`id`	INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
64 | 	`name`	TEXT
65 | );
66 | 
67 | 
68 | 
69 | COMMIT;
70 | 


--------------------------------------------------------------------------------
/gutenbergpy/parse/parseitemfile.py:
--------------------------------------------------------------------------------
 1 | from gutenbergpy.parse.parseitem import ParseItem
 2 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
 3 | from gutenbergpy.orderedset import OrderedSet
 4 | 
 5 | 
 6 | ##
 7 | # Parser helper for download links
 8 | # noinspection PyMethodOverriding
 9 | class ParseItemFiles(ParseItem):
10 | 
11 |     def __init__(self, xpath):
12 |         ParseItem.__init__(self, xpath)
13 |         self.xPath = xpath
14 |         self.setTypes = OrderedSet()
15 |         self.setLinks = OrderedSet()
16 | 
17 |     def needs_book_id(self):
18 |         return True
19 | 
20 |     @staticmethod
21 |     def add(theset, xpath_result, book_id, type_id):
22 |         text = xpath_result.replace("\"", "'")
23 |         index = theset.index(text)
24 |         if index != -1:
25 |             theset[text][1] = book_id
26 |         else:
27 |             index = theset.add((text, book_id,type_id))
28 |         return index
29 | 
30 |     @staticmethod
31 |     def add_simple(the_set, xpath_result):
32 |         text = xpath_result.replace("\"", "'")
33 |         return the_set.add(text)
34 | 
35 |     def do(self,doc,book_id):
36 |         arr =[]
37 |         for xpth in self.xPath:
38 |             book_files = doc.xpath(xpth, namespaces=GutenbergCacheSettings.NS)
39 |             for bk in book_files:
40 | 
41 |                 xpath_results_type = bk.xpath('.//dcterms:format/rdf:Description/rdf:value/text()', namespaces=GutenbergCacheSettings.NS)
42 |                 xpath_results_link = bk.xpath('.//pgterms:file/@rdf:about', namespaces=GutenbergCacheSettings.NS)
43 | 
44 |                 if xpath_results_link and xpath_results_type:
45 |                     type_id = self.add_simple( self.setTypes, xpath_results_type[0])
46 |                     link_id = self.add(self.setLinks, xpath_results_link[0], book_id, type_id)
47 | 
48 |                     arr.append(link_id)
49 |         return arr
50 | 


--------------------------------------------------------------------------------
/gutenbergpy/gutenbergcachesettings.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | # noinspection PyClassHasNoInit
 4 | class GutenbergCacheSettings:
 5 |     # name of the gutenberg link for the rdf arhive (should not change)
 6 |     CACHE_RDF_DOWNLOAD_LINK = 'https://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
 7 |     # name of the caches file name (sqlite db)
 8 |     CACHE_FILENAME = 'gutenbergindex.db'
 9 |     # name of the rdf unpack directory (this will be used when unpacking the rdf tar)
10 |     CACHE_RDF_UNPACK_DIRECTORY = os.path.join('cache','epub')
11 |     # name of the downloaded rdf arhive
12 |     CACHE_RDF_ARCHIVE_NAME = 'rdf-files.tar.bz2'
13 |     # number of #'s shown in loading bar (common to all loading bars)
14 |     DOWNLOAD_NUM_DIVS = 20
15 |     # text files cache folder
16 |     TEXT_FILES_CACHE_FOLDER = 'texts'
17 |     # mongo db connection server
18 |     MONGO_DB_CONNECTION_SERVER ="mongodb://localhost:27017"
19 |     ##########READONLY VARIABLES (please put readonly variables here)
20 |     # namespace used for the rds parsing (should not change)
21 |     NS = {
22 |         'cc': "http://web.resource.org/cc/",
23 |         'dcam': "http://purl.org/dc/dcam/",
24 |         'dcterms': "http://purl.org/dc/terms/",
25 |         'rdfs': "http://www.w3.org/2000/01/rdf-schema#",
26 |         'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
27 |         'pgterms': "http://www.gutenberg.org/2009/pgterms/"}
28 | 
29 |     ##########END OF READONLY VARIABLES
30 | 
31 |     ##
32 |     # Used to set the settings global variables
33 |     @staticmethod
34 |     def set(**kwargs):
35 |         if 'CacheFilename' in kwargs:
36 |             GutenbergCacheSettings.CACHE_FILENAME = kwargs['CacheFilename']
37 |         if 'CacheUnpackDir' in kwargs:
38 |             GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY = kwargs['CacheUnpackDir']
39 |         if 'CacheArchiveName' in kwargs:
40 |             GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME = kwargs['CacheArchiveName']
41 |         if 'ProgressBarMaxLength' in kwargs:
42 |             GutenbergCacheSettings.DOWNLOAD_NUM_DIVS = kwargs['ProgressBarMaxLength']
43 |         if 'CacheRDFDownloadLink' in kwargs:
44 |             GutenbergCacheSettings.CACHE_RDF_DOWNLOAD_LINK = kwargs['CacheRDFDownloadLink']
45 |         if 'TextFilesCacheFolder' in kwargs:
46 |             GutenbergCacheSettings.TEXT_FILES_CACHE_FOLDER = kwargs['TextFilesCacheFolder']
47 |         if 'MongoDBCacheServer' in kwargs:
48 |             GutenbergCacheSettings.MONGO_DB_CONNECTION_SERVER = kwargs['MongoDBCacheServer']


--------------------------------------------------------------------------------
/gutenbergpy/gutenbergcache.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from os import path
 3 | import time
 4 | from gutenbergpy.utils import Utils
 5 | 
 6 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
 7 | from gutenbergpy.parse.rdfparser import RdfParser
 8 | from gutenbergpy.caches.sqlitecache import SQLiteCache
 9 | from gutenbergpy.caches.mongodbcache import MongodbCache
10 | 
11 | ##
12 | # Cache types
13 | # noinspection PyClassHasNoInit
14 | class GutenbergCacheTypes:
15 |     CACHE_TYPE_SQLITE  = 0
16 |     CACHE_TYPE_MONGODB = 1
17 | 
18 | 
19 | ##
20 | # The main class (only this should be used to interface the cache)
21 | class GutenbergCache:
22 |     ##
23 |     # Get the cache by type
24 |     @staticmethod
25 |     def get_cache(type=GutenbergCacheTypes.CACHE_TYPE_SQLITE):
26 |             if type == GutenbergCacheTypes.CACHE_TYPE_SQLITE:
27 |                 return SQLiteCache()
28 |             elif type == GutenbergCacheTypes.CACHE_TYPE_MONGODB:
29 |                 return MongodbCache()
30 |             print("CACHE TYPE UNKNOWN")
31 |             return None
32 | 
33 |     ##
34 |     # Create the cache
35 |     @staticmethod
36 |     def create(**kwargs):
37 |         cache_type  = GutenbergCacheTypes.CACHE_TYPE_SQLITE if 'type' not in kwargs else kwargs['type']
38 |         refresh     = True  if 'refresh'  not in kwargs else kwargs['refresh']
39 |         download    = True  if 'download' not in kwargs else kwargs['download']
40 |         unpack      = True  if 'unpack'   not in kwargs else kwargs['unpack']
41 |         parse       = True  if 'parse'    not in kwargs else kwargs['parse']
42 |         cache       = True  if 'cache'    not in kwargs else kwargs['cache']
43 |         deleteTmp   = True  if 'deleteTemp' not in kwargs else kwargs['deleteTemp']
44 | 
45 |         if path.isfile(GutenbergCacheSettings.CACHE_FILENAME) and refresh and cache_type == GutenbergCacheTypes.CACHE_TYPE_SQLITE:
46 |             print('Cache already exists')
47 |             return
48 | 
49 |         if refresh:
50 |             print('Deleting old files')
51 |             Utils.delete_tmp_files(True)
52 | 
53 |         if download:
54 |             Utils.download_file()
55 | 
56 |         if unpack:
57 |             Utils.unpack_tarbz2()
58 | 
59 |         if parse:
60 |             t0 = time.time()
61 |             parser = RdfParser()
62 |             result = parser.do()
63 |             print('RDF PARSING took ' + str(time.time() - t0))
64 | 
65 |             if cache:
66 |                 t0 = time.time()
67 |                 cache = GutenbergCache.get_cache(cache_type)
68 |                 cache.create_cache(result)
69 |                 print('sql took %f' % (time.time() - t0))
70 | 
71 |         if deleteTmp:
72 |             print('Deleting temporary files')
73 |             Utils.delete_tmp_files()
74 | 
75 |         print('Done')
76 | 
77 |     ##
78 |     # Method to check if the cache exists
79 |     @staticmethod
80 |     def exists():
81 |         return path.isfile(GutenbergCacheSettings.CACHE_FILENAME)
82 | 


--------------------------------------------------------------------------------
/gutenbergpy/utils.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import sys
 3 | import os
 4 | import urllib
 5 | import tarfile
 6 | from future.standard_library import install_aliases
 7 | install_aliases()
 8 | import urllib.request
 9 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
10 | 
11 | 
12 | ##
13 | # Wrapper for project used utils
14 | 
15 | # noinspection PyClassHasNoInit
16 | class Utils:
17 |     ##
18 |     # Deletes the temp files resulted in the cache process
19 |     @staticmethod
20 |     def delete_tmp_files(delete_sqlite=False):
21 |         if delete_sqlite:
22 |             try:
23 |                 os.remove(GutenbergCacheSettings.CACHE_FILENAME)
24 |             except OSError:
25 |                 pass
26 |         try:
27 |             os.remove(GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME)
28 |         except OSError:
29 |             pass
30 |         try:
31 |             for root, dirs, files in os.walk(GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY, topdown=False):
32 |                 for name in files:
33 |                     os.remove(os.path.join(root, name))
34 |                 for name in dirs:
35 |                     os.rmdir(os.path.join(root, name))
36 |         except OSError:
37 |             pass
38 | 
39 |     ##
40 |     # Updates the visual progress bar
41 | 
42 |     @staticmethod
43 |     def update_progress_bar(type, progress, total_progress,
44 |                             force_update=True):  # used to update the progress bar display
45 |         if total_progress % GutenbergCacheSettings.DOWNLOAD_NUM_DIVS == 0 or force_update == True or progress == 0:
46 |             dv = total_progress / GutenbergCacheSettings.DOWNLOAD_NUM_DIVS
47 |             num_of_sharp = int(progress / dv)
48 |             num_of_space = int((total_progress - progress) / dv)
49 | 
50 |             sys.stdout.write("\r %s : [%s%s]" % (type, '#' * num_of_sharp, ' ' * num_of_space))
51 |             sys.stdout.flush()
52 | 
53 |     download_progress = 0
54 | 
55 |     ##
56 |     # Callback to report downloaded data
57 | 
58 |     @staticmethod
59 |     def __report(block_no,block_size, file_size):  # callback called on download update
60 |         Utils.download_progress += block_size
61 |         type = 'Downloading %s' % GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME
62 |         Utils.update_progress_bar(type, Utils.download_progress, file_size, True)
63 | 
64 |     ##
65 |     # Download the RDF file function
66 | 
67 |     @staticmethod
68 |     def download_file():  # used to download the rdf tar file
69 |         start = time.time()
70 |         urllib.request.urlretrieve(GutenbergCacheSettings.CACHE_RDF_DOWNLOAD_LINK,
71 |                                    GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME, Utils.__report)
72 | 
73 |         print ('took %f' % (time.time() - start))
74 |         Utils.download_progress = 0
75 | 
76 |     ##
77 |     # Unpack the tar file
78 | 
79 |     @staticmethod
80 |     def unpack_tarbz2():  # used to unpack the rdf tar file
81 |         start = time.time()
82 |         tar = tarfile.open(GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME)
83 |         total_num = len(tar.getmembers())
84 |         type = 'Extracting  %s' % GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME
85 |         for idx, member in enumerate(tar.getmembers()):
86 |             Utils.update_progress_bar(type, idx, total_num)
87 |             tar.extract(member)
88 |         tar.close()
89 | 
90 |         print('took %f' % (time.time() - start))
91 | 


--------------------------------------------------------------------------------
/gutenbergpy/caches/mongodbcache.py:
--------------------------------------------------------------------------------
 1 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
 2 | from gutenbergpy.caches.cache import Cache
 3 | from gutenbergpy.utils import Utils
 4 | from gutenbergpy.parse.cachefields import Fields
 5 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
 6 | 
 7 | 
 8 | from pymongo import MongoClient
 9 | 
10 | 
11 | ##
12 | # SQLite cache implementation
13 | class MongodbCache(Cache):
14 | 
15 |     def __init__(self):
16 |         self.client = MongoClient(GutenbergCacheSettings.MONGO_DB_CONNECTION_SERVER)
17 |         self.db = self.client.mongodbgutenbergcache
18 | 
19 |     def __get_book_json(self,parseItem,fields):
20 |         book_dict ={}
21 | 
22 |         book_dict['publisher']      = fields[Fields.PUBLISHER].set[parseItem.publisher_id-1] if parseItem.publisher_id and parseItem.publisher_id != -1 else 'None'
23 |         book_dict['rights']         = fields[Fields.RIGHTS].set[parseItem.rights_id-1] if parseItem.rights_id and parseItem.rights_id != -1 else 'None'
24 |         book_dict['language']       = fields[Fields.LANGUAGE].set[parseItem.language_id-1] if parseItem.language_id and parseItem.language_id != -1 else 'None'
25 |         book_dict['book_shelf']     = fields[Fields.BOOKSHELF].set[parseItem.bookshelf_id-1] if parseItem.bookshelf_id and parseItem.bookshelf_id != -1 else 'None'
26 |         book_dict['gutenberg_book_id'] = parseItem.gutenberg_book_id
27 |         book_dict['date_issued']    = parseItem.date_issued
28 |         book_dict['num_downloads']  = parseItem.num_downloads
29 | 
30 |         book_dict['titles']         = map(lambda x: x[0] , fields[Fields.TITLE].set[[x - 1 for x in parseItem.titles_id]])  if parseItem.titles_id and parseItem.titles_id != -1 else ['None']
31 |         book_dict['subjects']       = map(lambda x: x , fields[Fields.SUBJECT].set[[x - 1 for x in parseItem.subjects_id]]) if parseItem.subjects_id and parseItem.subjects_id != -1 else ['None']
32 |         book_dict['authors']        = map(lambda x: x , fields[Fields.AUTHOR].set[[x - 1 for x in parseItem.authors_id]]) if parseItem.authors_id and parseItem.authors_id != -1 else ['None']
33 |         book_dict['files']          = map(lambda x: x[0] , fields[Fields.FILES].setLinks[[x - 1 for x in parseItem.files_id]]) if parseItem.files_id and parseItem.files_id != -1 else ['None']
34 |         book_dict['type']           = fields[Fields.TYPE].set[parseItem.type_id-1]  if parseItem.type_id and parseItem.type_id != -1 else 'None'
35 |         return book_dict
36 |     ##
37 |     # Create the MongoDB cache
38 |     def create_cache(self, parse_results):
39 |         self.db.books.drop()
40 |         book_collection = self.db.books
41 |         total = len (parse_results.books)
42 | 
43 |         for idx,book in enumerate(parse_results.books):
44 |             Utils.update_progress_bar("MongoDB progress", idx, total)
45 |             json = self.__get_book_json(book,parse_results.field_sets)
46 |             self.db.books.insert_one(json)
47 | 
48 |     def create_or_dict(self,name,newname, dt,out):
49 |         if dt.has_key(name):
50 |             dict = {}
51 |             lst = []
52 |             for e in dt[name]:
53 |                 lst.append ({name:e})
54 |             out.extend(lst)
55 | 
56 |     def query(self,**kwargs):
57 |         query = {}
58 |         lst = []
59 |         self.create_or_dict('languages','languages',kwargs,lst)
60 |         self.create_or_dict('authors','authors', kwargs, lst)
61 |         self.create_or_dict('types','type', kwargs, lst)
62 |         self.create_or_dict('titles', 'titles', kwargs, lst)
63 |         self.create_or_dict('subjects', 'subjects', kwargs, lst)
64 |         self.create_or_dict('publishers', 'publisher', kwargs, lst)
65 |         self.create_or_dict('bookshelves', 'bookshelves', kwargs, lst)
66 |         self.create_or_dict('gutenberg_book_id', 'gutenberg_book_id', kwargs, lst)
67 |         query['$or'] = lst
68 |         lst =[]
69 |         for res in  self.native_query(query):
70 |             lst.append(res["gutenberg_book_id"])
71 |         return lst
72 |     ##
73 |     # Native query function implementation
74 |     def native_query(self,mongodb_query):
75 |         return self.db.books.find(mongodb_query)


--------------------------------------------------------------------------------
/gutenbergpy/parse/rdfparser.py:
--------------------------------------------------------------------------------
 1 | from os   import listdir
 2 | from os   import path
 3 | from lxml import etree
 4 | 
 5 | from gutenbergpy.parse.cachefields      import Fields
 6 | from gutenbergpy.parse.book             import Book
 7 | from gutenbergpy.parse.parseitem        import ParseItem
 8 | from gutenbergpy.parse.parseitemfile    import ParseItemFiles
 9 | from gutenbergpy.parse.parseitemtitles  import ParseItemTitles
10 | from gutenbergpy.parse.rdfparseresults  import RDFParseResults
11 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
12 | from gutenbergpy.utils                  import Utils
13 | 
14 | 
15 | ##
16 | # The rdf parser
17 | # noinspection PyClassHasNoInit
18 | class RdfParser:
19 |     ##
20 |     # The main funciton, this actually parses the rdf files from the downloaded cache
21 |     @staticmethod
22 |     def do():
23 |         result = RDFParseResults()
24 | 
25 |         result.field_sets = Fields.FIELD_COUNT * [None]
26 |         result.field_sets[Fields.TITLE]     = ParseItemTitles(xpath=['//dcterms:title/text()','//dcterms:alternative/text()'])
27 |         result.field_sets[Fields.SUBJECT]   = ParseItem(xpath =['//dcterms:subject/rdf:Description/rdf:value/text()'])
28 |         result.field_sets[Fields.TYPE]      = ParseItem(xpath =['//dcterms:type/rdf:Description/rdf:value/text()'])
29 |         result.field_sets[Fields.LANGUAGE]  = ParseItem(xpath =['//dcterms:language/rdf:Description/rdf:value/text()'])
30 |         result.field_sets[Fields.AUTHOR]    = ParseItem(xpath =['//dcterms:creator/pgterms:agent/pgterms:alias/text()','//dcterms:creator/pgterms:agent/pgterms:name/text()'])
31 |         result.field_sets[Fields.BOOKSHELF] = ParseItem(xpath =['//pgterms:bookshelf/rdf:Description/rdf:value/text()'])
32 |         result.field_sets[Fields.FILES]     = ParseItemFiles(xpath =['//dcterms:hasFormat'])
33 |         result.field_sets[Fields.PUBLISHER] = ParseItem(xpath =['//dcterms:publisher/text()'])
34 |         result.field_sets[Fields.RIGHTS]    = ParseItem( xpath =['//dcterms:rights/text()'])
35 | 
36 | 
37 |         dirs  =  [d for d in listdir(GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY) if not d.startswith("DELETE")]
38 |         total = len(dirs)
39 | 
40 |         for idx, dir in enumerate(dirs):
41 |             if not str(dir).isdigit():
42 |                 continue
43 |             processing_str = "Processing progress: %d / %d" % (idx,total)
44 |             Utils.update_progress_bar(processing_str,idx,total)
45 |             file_path = path.join(GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY,dir,'pg%s.rdf'%(dir))
46 |             doc = etree.parse(file_path,etree.ETCompatXMLParser())
47 | 
48 |             book_id = len(result.books)+1
49 |             res = Fields.FIELD_COUNT * [-1]
50 |             for idx_field, pt in enumerate(result.field_sets):
51 |                 if not pt.needs_book_id():
52 |                     res[idx_field] = pt.do(doc)
53 |                 else:
54 |                     res[idx_field] = pt.do(doc,book_id)
55 | 
56 |             gutenberg_book_id = int(dir)
57 | 
58 |             date_issued_x   = doc.xpath('//dcterms:issued/text()', namespaces=GutenbergCacheSettings.NS)
59 |             num_downloads_x = doc.xpath('//pgterms:downloads/text()',namespaces=GutenbergCacheSettings.NS)
60 | 
61 |             date_issued       = '1000-10-10' if not date_issued_x or date_issued_x[0] =='None' else str(date_issued_x[0])
62 |             num_downloads     =  -1 if not num_downloads_x else int(num_downloads_x[0])
63 |             publisher_id      =  -1 if not res[Fields.PUBLISHER] else res[Fields.PUBLISHER][0]
64 |             rights_id         =  -1 if not res[Fields.RIGHTS]    else res[Fields.RIGHTS][0]
65 |             language_id       =  -1 if not res[Fields.LANGUAGE] else res[Fields.LANGUAGE][0]
66 |             bookshelf_id      =  -1 if not res[Fields.BOOKSHELF] else res[Fields.BOOKSHELF][0]
67 |             type_id           =  -1 if not  res[Fields.TYPE]    else  res[Fields.TYPE][0]
68 | 
69 |             newbook = Book(publisher_id, rights_id, language_id, bookshelf_id,
70 |                            gutenberg_book_id, date_issued, num_downloads, res[Fields.TITLE],
71 |                            res[Fields.SUBJECT], type_id, res[Fields.AUTHOR], res[Fields.FILES])
72 | 
73 |             result.books.append(newbook)
74 | 
75 |         return result


--------------------------------------------------------------------------------
/gutenbergpy/orderedset.py:
--------------------------------------------------------------------------------
  1 | # THIS CODE IS ADAPTED FROM https://github.com/LuminosoInsight/ordered-set
  2 | SLICE_ALL = slice(None)
  3 | __version__ = '2.0.1'
  4 | 
  5 | import collections
  6 | try:
  7 |     from collections.abc import MutableSet
  8 | except:
  9 |     from collections import MutableSet
 10 | 
 11 | 
 12 | def is_iterable(obj):
 13 |     """
 14 |     Are we being asked to look up a list of things, instead of a single thing?
 15 |     We check for the `__iter__` attribute so that this can cover types that
 16 |     don't have to be known by this module, such as NumPy arrays.
 17 | 
 18 |     Strings, however, should be considered as atomic values to look up, not
 19 |     iterables. The same goes for tuples, since they are immutable and therefore
 20 |     valid entries.
 21 | 
 22 |     We don't need to check for the Python 2 `unicode` type, because it doesn't
 23 |     have an `__iter__` attribute anyway.
 24 |     """
 25 |     return hasattr(obj, '__iter__') and not isinstance(obj, str) and not isinstance(obj, tuple)
 26 | 
 27 | 
 28 | class OrderedSet(MutableSet):
 29 |     """
 30 |     An OrderedSet is a custom MutableSet that remembers its order, so that
 31 |     every entry has an index that can be looked up.
 32 |     """
 33 | 
 34 |     def __init__(self, iterable=None):
 35 |         self.items = []
 36 |         self.map = {}
 37 |         if iterable is not None:
 38 |             self |= iterable
 39 | 
 40 |     def __len__(self):
 41 |         return len(self.items)
 42 | 
 43 |     def __getitem__(self, index):
 44 |         """
 45 |         Get the item at a given index.
 46 | 
 47 |         If `index` is a slice, you will get back that slice of items. If it's
 48 |         the slice [:], exactly the same object is returned. (If you want an
 49 |         independent copy of an OrderedSet, use `OrderedSet.copy()`.)
 50 | 
 51 |         If `index` is an iterable, you'll get the OrderedSet of items
 52 |         corresponding to those indices. This is similar to NumPy's
 53 |         "fancy indexing".
 54 |         """
 55 |         if index == SLICE_ALL:
 56 |             return self
 57 |         elif hasattr(index, '__index__') or isinstance(index, slice):
 58 |             result = self.items[index]
 59 |             if isinstance(result, list):
 60 |                 return OrderedSet(result)
 61 |             else:
 62 |                 return result
 63 |         elif is_iterable(index):
 64 |             return OrderedSet([self.items[i] for i in index])
 65 |         else:
 66 |             raise TypeError("Don't know how to index an OrderedSet by %r" %
 67 |                             index)
 68 | 
 69 |     def copy(self):
 70 |         return OrderedSet(self)
 71 | 
 72 |     def __getstate__(self):
 73 |         if len(self) == 0:
 74 |             # The state can't be an empty list.
 75 |             # We need to return a truthy value, or else __setstate__ won't be run.
 76 |             #
 77 |             # This could have been done more gracefully by always putting the state
 78 |             # in a tuple, but this way is backwards- and forwards- compatible with
 79 |             # previous versions of OrderedSet.
 80 |             return None,
 81 |         else:
 82 |             return list(self)
 83 | 
 84 |     def __setstate__(self, state):
 85 |         if state == (None,):
 86 |             self.__init__([])
 87 |         else:
 88 |             self.__init__(state)
 89 | 
 90 |     def __contains__(self, key):
 91 |         return key in self.map
 92 | 
 93 |     def add(self, key):
 94 |         """
 95 |         Add `key` as an item to this OrderedSet, then return its index.
 96 | 
 97 |         If `key` is already in the OrderedSet, return the index it already
 98 |         had.
 99 |         """
100 |         if key not in self.map:
101 |             self.map[key] = len(self.items)
102 |             self.items.append(key)
103 |         return self.map[key] + 1
104 | 
105 |     append = add
106 | 
107 |     def update(self, sequence):
108 |         """
109 |         Update the set with the given iterable sequence, then return the index
110 |         of the last element inserted.
111 |         """
112 |         item_index = None
113 |         try:
114 |             for item in sequence:
115 |                 item_index = self.add(item)
116 |         except TypeError:
117 |             raise ValueError('Argument needs to be an iterable, got %s' % type(sequence))
118 |         return item_index
119 | 
120 |     def index(self, key):
121 |         """
122 |         Get the index of a given entry, raising an IndexError if it's not
123 |         present.
124 | 
125 |         `key` can be an iterable of entries that is not a string, in which case
126 |         this returns a list of indices.
127 |         """
128 |         if is_iterable(key):
129 |             return [self.index(subkey) for subkey in key]
130 | 
131 |         if key not in self.map:
132 |             return -1
133 | 
134 |         return self.map[key]
135 | 
136 |     def pop(self):
137 |         """
138 |         Remove and return the last element from the set.
139 | 
140 |         Raises KeyError if the set is empty.
141 |         """
142 |         if not self.items:
143 |             raise KeyError('Set is empty')
144 | 
145 |         elem = self.items[-1]
146 |         del self.items[-1]
147 |         del self.map[elem]
148 |         return elem
149 | 
150 |     def discard(self, key):
151 |         """
152 |         Remove an element.  Do not raise an exception if absent.
153 | 
154 |         The MutableSet mixin uses this to implement the .remove() method, which
155 |         *does* raise an error when asked to remove a non-existent item.
156 |         """
157 |         if key in self:
158 |             i = self.items.index(key)
159 |             del self.items[i]
160 |             del self.map[key]
161 |             for k, v in self.map.items():
162 |                 if v >= i:
163 |                     self.map[k] = v - 1
164 | 
165 |     def clear(self):
166 |         """
167 |         Remove all items from this OrderedSet.
168 |         """
169 |         del self.items[:]
170 |         self.map.clear()
171 | 
172 |     def __iter__(self):
173 |         return iter(self.items)
174 | 
175 |     def __reversed__(self):
176 |         return reversed(self.items)
177 | 
178 |     def __repr__(self):
179 |         if not self:
180 |             return '%s()' % (self.__class__.__name__,)
181 |         return '%s(%r)' % (self.__class__.__name__, list(self))
182 | 
183 |     def __eq__(self, other):
184 |         if isinstance(other, OrderedSet):
185 |             return len(self) == len(other) and self.items == other.items
186 |         try:
187 |             other_as_set = set(other)
188 |         except TypeError:
189 |             # If `other` can't be converted into a set, it's not equal.
190 |             return False
191 |         else:
192 |             return set(self) == other_as_set
193 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | GutenbergPy
  2 | ========
  3 | 
  4 | ![image](https://github.com/raduangelescu/gutenbergpy/blob/master/dblogos.png?raw=true)
  5 | 
  6 | This package makes filtering and getting information from [Project Gutenberg](http://www.gutenberg.org) easier from python.
  7 | 
  8 | It's target audience is machine learning guys that need data for their project, but may be freely used by anybody.
  9 | 
 10 | The package:
 11 | 
 12 | -   Generates a local cache (of all gutenberg informations) that you can interogate to get book ids. The Local cache may be sqlite (default) or mongodb (for wich you need to have installed the pymongodb packet)
 13 | -   Downloads and cleans raw text from gutenberg books
 14 | 
 15 | The package has been tested with Python 3.6 on both Windows and Linux It is faster, smaller and less third-party intensive alternative to <https://github.com/c-w/Gutenberg>
 16 | 
 17 | About development: <http://www.raduangelescu.com/gutenbergpy.html>
 18 | 
 19 | Installation
 20 | ============
 21 | 
 22 | ```
 23 | pip install gutenbergpy
 24 | ```
 25 | 
 26 | or just install it from source (it's all just python code):
 27 | 
 28 | ```
 29 | git clone https://github.com/raduangelescu/gutenbergpy
 30 | python setup.py install
 31 | ```
 32 | 
 33 | Usage
 34 | =====
 35 | 
 36 | Downloading a text
 37 | ------------------
 38 | ```
 39 | import gutenbergpy.textget
 40 | 
 41 | ```
 42 | After importing our module, we can download a text from gutenberg.
 43 | 
 44 | ```python
 45 | def usage_example():
 46 |     # This gets a book by its gutenberg id number
 47 |     raw_book = gutenbergpy.textget.get_text_by_id(2701) # with headers
 48 |     clean_book = gutenbergpy.textget.strip_headers(raw_book) # without headers
 49 |     return clean_book, raw_book
 50 | ```
 51 | The code above can easily be used without the function declaration, this is simply for illustration.
 52 | 
 53 | ```python
 54 | cleaned_book, raw_book = usage_example()
 55 | 
 56 | # Cleaned Book
 57 | print(f'Example phrase from the cleaned book: {" ".join(str(cleaned_book[3000:3050]).split(" "))}')
 58 | # Raw Book
 59 | print(f'Example phrase from the raw book: {" ".join(str(raw_book[3000:3050]).split(" "))}')
 60 | 
 61 | ```
 62 | The output of the code above is:
 63 | ```
 64 | b'rgris.\n\nCHAPTER 93. The Castaway.\n\nCHAPTER 94. A S'
 65 | b'\n\n\n\nMOBY-DICK;\n\nor, THE WHALE.\n\nBy Herman Melville\n\n\n\nCONTENTS\n\nETYMOLOGY.\n\nEXTRACTS (Supplied by a Sub-Sub-Librarian).\n\nCHAPTER 1. Loomings.\n\nCHAPTER 2. The Carpet-Bag.\n\nCHAPTER 3. The Spouter-Inn.\n\nCHAPTER 4. The Counterpane.\n\nCHAPTER 5. Breakfast.\n\nCHAPTER 6. The Street.\n\nCHAPTER 7. The Chapel.\n\nCHAPTER 8. The Pulpit.\n\nCHAPTER 9. The Sermon.\n\nCHAPTER 10. A Bosom Friend.\n\nCHAPTER 11. Nightgown.\n\nCHAPTER 12. Biographical.\n\nCHAPTER 13. Wheelbarrow.\n\nCHAPTER 14. Nantucket.\n\nCHAPTER 15. Chowder.\n\nCHAPTER 16. The Ship.\n\nCHAPTER 17. The Ramadan.\n\nCHAPTER 18. His Mark.\n\nCHAPTER 19. The Prophet.\n\nCHAPTER 20. All Astir.\n\nCHAPTER 21. Going Aboard.\n\nCHAPTER 22. Merry Christmas.\n\nCHAPTER 23. The Lee Shore.\n\nCHAPTER 24. The Advocate.\n\nCHAPTER 25. Postscript.\n\nCHAPTER 26. Knights and Squires.\n\nCHAPTER 27. Knights and Squires.\n\nCHAPTER 28. Ahab.\n\nCHAPTER 29. Enter Ahab; to Him, Stubb.\n\nCHAPTER 30. The Pipe.\n\nCHAPTER 31. Queen Mab.\n\nCHAPTER 32. Cetology.\n\nCHAPTER 33. The Specksnyder.\n\nCHAPTER 34. Th'
 66 | ```
 67 | They are both pretty messy, and will need to be cleaned prior to use for NLP etc.
 68 | 
 69 | The Raw book:
 70 | ```output
 71 | b'b\xe2\x80\x99s Supper.\r\n\r\nCHAPTER 65. The Whale as a Dish.\r'
 72 | b'\n\n\n\nMOBY-DICK;\n\nor, THE WHALE.\n\nBy Herman Melville\n\n\n\nCONTENTS\n\nETYMOLOGY.\n\nEXTRACTS (Supplied by a Sub-Sub-Librarian).\n\nCHAPTER 1. Loomings.\n\nCHAPTER 2. The Carpet-Bag.\n\nCHAPTER 3. The Spouter-Inn.\n\nCHAPTER 4. The Counterpane.\n\nCHAPTER 5. Breakfast.\n\nCHAPTER 6. The Street.\n\nCHAPTER 7. The Chapel.\n\nCHAPTER 8. The Pulpit.\n\nCHAPTER 9. The Sermon.\n\nCHAPTER 10. A Bosom Friend.\n\nCHAPTER 11. Nightgown.\n\nCHAPTER 12. Biographical.\n\nCHAPTER 13. Wheelbarrow.\n\nCHAPTER 14. Nantucket.\n\nCHAPTER 15. Chowder.\n\nCHAPTER 16. The Ship.\n\nCHAPTER 17. The Ramadan.\n\nCHAPTER 18. His Mark.\n\nCHAPTER 19. The Prophet.\n\nCHAPTER 20. All Astir.\n\nCHAPTER 21. Going Aboard.\n\nCHAPTER 22. Merry Christmas.\n\nCHAPTER 23. The Lee Shore.\n\nCHAPTER 24. The Advocate.\n\nCHAPTER 25. Postscript.\n\nCHAPTER 26. Knights and Squires.\n\nCHAPTER 27. Knights and Squires.\n\nCHAPTER 28. Ahab.\n\nCHAPTER 29. Enter Ahab; to Him, Stubb.\n\nCHAPTER 30. The Pipe.\n\nCHAPTER 31. Queen Mab.\n\nCHAPTER 32. Cetology.\n\nCHAPTER 33. The Specksnyder.\n\nCHAPTER 34. Th'
 73 | 
 74 | ```
 75 | Query the cache
 76 | ---------------
 77 | 
 78 | To do this you first need to create the cache (this is a one time thing per os, until you decide to redo it)
 79 | 
 80 | ```
 81 | from gutenbergpy.gutenbergcache import GutenbergCache
 82 | #for sqlite
 83 | GutenbergCache.create()
 84 | #for mongodb
 85 | GutenbergCache.create(type=GutenbergCacheTypes.CACHE_TYPE_MONGODB)
 86 | ```
 87 | 
 88 | for debugging/better control you have these boolean options on create
 89 | 
 90 | > -   *refresh* deletes the old cache
 91 | > -   *download* property downloads the rdf file from the gutenberg project
 92 | > -   *unpack* unpacks it
 93 | > -   *parse* parses it in memory
 94 | > -   *cache* writes the cache
 95 | 
 96 | ```
 97 | GutenbergCache.create(refresh=True, download=True, unpack=True, parse=True, cache=True, deleteTemp=True)
 98 | ```
 99 | 
100 | for even better control you may set the GutenbergCacheSettings
101 | -   *CacheFilename*
102 | -   *CacheUnpackDir*
103 | -   *CacheArchiveName*
104 | -   *ProgressBarMaxLength*
105 | -   *CacheRDFDownloadLink*
106 | -   *TextFilesCacheFolder*
107 | -   *MongoDBCacheServer*
108 | 
109 | ```
110 | GutenbergCacheSettings.set( CacheFilename="", CacheUnpackDir="",
111 |     CacheArchiveName="", ProgressBarMaxLength="", CacheRDFDownloadLink="", TextFilesCacheFolder="", MongoDBCacheServer="")
112 | ```
113 | 
114 | After doing a `create` go grab a coffee, it will be over in about 5 minutes, depending on your internet speed and computer power (On a i7 with gigabit connection and ssd it finishes in about 1 minute)
115 | 
116 | Get the cache
117 | ```
118 | #for mongodb
119 | cache = GutenbergCache.get_cache(GutenbergCacheTypes.CACHE_TYPE_MONGODB)
120 | #for sqlite
121 | cache  = GutenbergCache.get_cache()
122 | ```
123 | Now you can do queries
124 | 
125 | Get the book Gutenberg unique indices by using this query function
126 | 
127 | Standard query fields:
128 | -   languages
129 | -   authors
130 | -   types
131 | -   titles
132 | -   subjects
133 | -   publishers
134 | -   bookshelves
135 | -   downloadtype
136 | ```
137 | print(cache.query(downloadtype=['application/plain','text/plain','text/html; charset=utf-8']))
138 | ```
139 | Or do a native query on the sqlite database
140 | ```
141 | #python
142 | cache.native_query("SELECT * FROM books")
143 | #mongodb
144 | cache.native_query({type:'Text'}}
145 | ```
146 | For SQLITE custom queries, take a look at the SQLITE database scheme:
147 | 
148 | ![image](https://github.com/raduangelescu/gutenbergpy/blob/master/sqlitecheme.png?raw=true)
149 | 
150 | For MongoDB queries, you have all the books collection. Each book with the following fields:
151 | 
152 | > -   book(publisher, rights, language, book\_shelf, gutenberg\_book\_id, date\_issued, num\_downloads, titles, subjects, authors, files ,type)
153 | 


--------------------------------------------------------------------------------
/gutenbergpy/caches/sqlitecache.py:
--------------------------------------------------------------------------------
  1 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
  2 | from gutenbergpy.caches.cache import Cache
  3 | from gutenbergpy.utils import Utils
  4 | from gutenbergpy.parse.cachefields import Fields
  5 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
  6 | 
  7 | import sqlite3
  8 | import os
  9 | 
 10 | 
 11 | ##
 12 | # SQLite cache implementation
 13 | class SQLiteCache(Cache):
 14 | 
 15 | 
 16 |     def __init__(self):
 17 |         self.cursor     = None
 18 |         self.connection = None
 19 |         self.table_map = [None] * Fields.FIELD_COUNT
 20 | 
 21 |         self.table_map[Fields.TITLE]    =   'titles'
 22 |         self.table_map[Fields.SUBJECT]  =   'subjects'
 23 |         self.table_map[Fields.TYPE]     =   'types'
 24 |         self.table_map[Fields.LANGUAGE] =   'languages'
 25 |         self.table_map[Fields.AUTHOR]   =   'authors'
 26 |         self.table_map[Fields.BOOKSHELF]=   'bookshelves'
 27 |         self.table_map[Fields.FILES]    =   '/---/'
 28 |         self.table_map[Fields.PUBLISHER]=   'publishers'
 29 |         self.table_map[Fields.RIGHTS]   =   'rights'
 30 |         ##
 31 |         # Files are package data
 32 |         SQLiteCache.DB_CREATE_CACHE_FILENAME         = 'gutenbergindex.db.sql'
 33 |         SQLiteCache.DB_CREATE_CACHE_INDICES_FILENAME = 'gutenbergindex_indices.db.sql'
 34 | 
 35 |         this_dir, this_filename = os.path.split(__file__)
 36 |         SQLiteCache.DB_CREATE_CACHE_FILENAME = os.path.join(this_dir, SQLiteCache.DB_CREATE_CACHE_FILENAME)
 37 |         SQLiteCache.DB_CREATE_CACHE_INDICES_FILENAME = os.path.join(this_dir, SQLiteCache.DB_CREATE_CACHE_INDICES_FILENAME)
 38 | 
 39 |     ##
 40 |     # Insert many helper function
 41 |     def __insert_many_field(self, table, field, theSet):
 42 |         if len(theSet):
 43 |             query = 'INSERT OR IGNORE INTO %s(%s) VALUES (?)' % (table,field)
 44 |             self.cursor.executemany(query,map(lambda x: (x,) , theSet))
 45 | 
 46 |     ##
 47 |     # Insert many 2 fields helper function
 48 |     def __insert_many_field_id(self, table, field1, field2, theSet):
 49 |         if len(theSet):
 50 |             query = 'INSERT OR IGNORE INTO %s(%s, %s) VALUES (?,?)' % (table,field1,field2)
 51 |             insert_array = map(lambda x: (x[0],x[1]) , theSet)
 52 |             self.cursor.executemany(query,insert_array)
 53 | 
 54 |     ##
 55 |     # Insert in link table
 56 |     def __insertLinks(self,ids,tablename,link1name,link2name):
 57 |         if len(list(ids)):
 58 |             query = "INSERT INTO %s(%s,%s) VALUES (?,?)" % (tablename,link1name,link2name)
 59 |             self.cursor.executemany(query, ids)
 60 | 
 61 |     ##
 62 |     # Create the SQL cache
 63 |     def create_cache(self, parse_results):
 64 |         self.connection = sqlite3.connect(GutenbergCacheSettings.CACHE_FILENAME)
 65 |         self.cursor     = self.connection.cursor()
 66 | 
 67 |         # noinspection PyUnresolvedReferences
 68 |         create_query = open(SQLiteCache.DB_CREATE_CACHE_FILENAME, 'r').read()
 69 |         self.cursor.executescript(create_query)
 70 |         self.connection.commit()
 71 | 
 72 |         for idx,pt in enumerate(parse_results.field_sets):
 73 |             if idx == Fields.FILES:
 74 |                 self.__insert_many_field('downloadlinkstype', 'name', pt.setTypes)
 75 |                 self.cursor.executemany(
 76 |                     'INSERT OR IGNORE INTO downloadlinks(name,bookid,downloadtypeid) VALUES (?,?,?)'
 77 |                     , map(lambda x: (x[0], x[1], x[2]), parse_results.field_sets[Fields.FILES].setLinks))
 78 | 
 79 |             elif pt.needs_book_id():
 80 |                 self.__insert_many_field_id(self.table_map[idx], 'name', 'bookid', pt.set)
 81 |             else:
 82 |                 self.__insert_many_field(self.table_map[idx], 'name', pt.set)
 83 | 
 84 | 
 85 |         total = len(parse_results.books)
 86 | 
 87 |         for idx, book in enumerate(parse_results.books):
 88 |             Utils.update_progress_bar("SQLite progress" ,idx,total)
 89 |             book_id = idx +1
 90 |             self.__insertLinks(list(map(lambda x: (x,book_id) , book.authors_id)),'book_authors','authorid','bookid')
 91 |             self.__insertLinks(list(map(lambda x: (x,book_id) , book.subjects_id)),'book_subjects','subjectid','bookid')
 92 | 
 93 |             self.cursor.execute("INSERT OR IGNORE INTO books(publisherid,dateissued,rightsid,numdownloads,languageid,bookshelveid,gutenbergbookid,typeid) "
 94 |                                 "VALUES (?,?,?,?,?,?,?,?)" , (book.publisher_id, book.date_issued, book.rights_id,
 95 |                                                 book.num_downloads,book.language_id,book.bookshelf_id,book.gutenberg_book_id,book.type_id))
 96 | 
 97 |         self.connection.commit()
 98 | 
 99 |         # noinspection PyUnresolvedReferences
100 |         create_indices_query = open(SQLiteCache.DB_CREATE_CACHE_INDICES_FILENAME, 'r').read()
101 |         self.cursor.executescript(create_indices_query)
102 |         self.connection.commit()
103 | 
104 | 
105 |         self.connection.close()
106 | 
107 |     ##
108 |     # Query function implementation
109 |     def query(self,**kwargs):
110 |         class HelperQuery:
111 |             def __init__(self, tables, query_struct):
112 |                 self.tables         = tables
113 |                 self.query_struct   = query_struct
114 |         helpers=[
115 |             HelperQuery(['languages'], ('languages.id = books.languageid', 'languages.name',
116 |                         kwargs['languages'] if 'languages' in kwargs else None)),
117 |             HelperQuery(['authors', 'book_authors'],
118 |                         ('authors.id = book_authors.authorid and books.id = book_authors.bookid', 'authors.name',
119 |                          kwargs['authors'] if 'authors' in kwargs else None)),
120 |             HelperQuery(['types'],('books.typeid = types.id', 'types.name',
121 |                          kwargs['types'] if 'types' in kwargs else None)),
122 |             HelperQuery(['titles'],('titles.bookid = books.id', 'titles.name',
123 |                          kwargs['titles'] if 'titles' in kwargs else None)),
124 |             HelperQuery(['subjects', 'book_subjects'],
125 |                         ('subjects.id = book_subjects.bookid and books.id = book_subjects.subjectid ', 'subjects.name',
126 |                          kwargs['subjects'] if 'subjects' in kwargs else None)),
127 |             HelperQuery(['publishers'],
128 |                         ('publishers.id = books.publisherid', 'publishers.name',
129 |                          kwargs['publishers'] if 'publishers' in kwargs else None)),
130 |             HelperQuery(['bookshelves'],
131 |                         ('bookshelves.id = books.bookshelveid', 'bookshelves.name',
132 |                          kwargs['bookshelves'] if 'bookshelves' in kwargs else None)),
133 |             HelperQuery(['downloadlinks','downloadlinkstype'],
134 |                         ('downloadlinks.downloadtypeid =  downloadlinkstype.id and downloadlinks.bookid = books.id', 'downloadlinkstype.name',
135 |                          kwargs['downloadtype'] if 'downloadtype' in kwargs else None))
136 |         ]
137 |         runtime  = list(filter(lambda x: x.query_struct[2] , helpers))
138 | 
139 |         query = "SELECT DISTINCT books.gutenbergbookid FROM books"
140 |         for q in runtime:
141 |             query = "%s,%s"% (query ,','.join(map(str,  q.tables)))
142 |         query = "%s WHERE " % query
143 | 
144 |         for idx,q in enumerate(runtime):
145 |             query = "%s %s and %s in (%s) " % (query,q.query_struct[0],q.query_struct[1],','.join(map(lambda x: "'%s'"%(str(x)), q.query_struct[2])))
146 |             if idx != len(runtime) -1:
147 |                 query = "%s and " % query
148 | 
149 |         res = []
150 |         for row in self.native_query(query):
151 |             res.append(int(row[0]))
152 | 
153 |         return res
154 |     ##
155 |     # Native query function implementation
156 |     def native_query(self,sql_query):
157 |         if self.cursor is None or self.connection is None:
158 |             self.connection = sqlite3.connect(GutenbergCacheSettings.CACHE_FILENAME)
159 |             self.cursor     = self.connection.cursor()
160 | 
161 |         return self.cursor.execute(sql_query)
162 | 


--------------------------------------------------------------------------------
/gutenbergpy/textget.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # encoding=utf8 
  3 | from __future__ import absolute_import, unicode_literals
  4 | 
  5 | import gzip
  6 | import os
  7 | import errno
  8 | import http.client
  9 | from contextlib import closing
 10 | from future.standard_library import install_aliases
 11 | install_aliases()
 12 | from urllib.request import urlopen
 13 | from urllib.parse import urlparse
 14 | import chardet
 15 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings
 16 | 
 17 | class UnknownDownloadUri(Exception):
 18 |     """location cannot be found for a text"""
 19 | 
 20 | ##
 21 | # MARKERS ARE FROM https://github.com/c-w/Gutenberg/blob/master/gutenberg/_domain_model/text.py
 22 | 
 23 | TEXT_START_MARKERS = frozenset((
 24 |     "*END*THE SMALL PRINT",
 25 |     "*** START OF THE PROJECT GUTENBERG",
 26 |     "*** START OF THIS PROJECT GUTENBERG",
 27 |     "This etext was prepared by",
 28 |     "E-text prepared by",
 29 |     "Produced by",
 30 |     "Distributed Proofreading Team",
 31 |     "Proofreading Team at http://www.pgdp.net",
 32 |     "http://gallica.bnf.fr)",
 33 |     "      http://archive.org/details/",
 34 |     "http://www.pgdp.net",
 35 |     "by The Internet Archive)",
 36 |     "by The Internet Archive/Canadian Libraries",
 37 |     "by The Internet Archive/American Libraries",
 38 |     "public domain material from the Internet Archive",
 39 |     "Internet Archive)",
 40 |     "Internet Archive/Canadian Libraries",
 41 |     "Internet Archive/American Libraries",
 42 |     "material from the Google Print project",
 43 |     "*END THE SMALL PRINT",
 44 |     "***START OF THE PROJECT GUTENBERG",
 45 |     "This etext was produced by",
 46 |     "*** START OF THE COPYRIGHTED",
 47 |     "The Project Gutenberg",
 48 |     "http://gutenberg.spiegel.de/ erreichbar.",
 49 |     "Project Runeberg publishes",
 50 |     "Beginning of this Project Gutenberg",
 51 |     "Project Gutenberg Online Distributed",
 52 |     "Gutenberg Online Distributed",
 53 |     "the Project Gutenberg Online Distributed",
 54 |     "Project Gutenberg TEI",
 55 |     "This eBook was prepared by",
 56 |     "http://gutenberg2000.de erreichbar.",
 57 |     "This Etext was prepared by",
 58 |     "This Project Gutenberg Etext was prepared by",
 59 |     "Gutenberg Distributed Proofreaders",
 60 |     "Project Gutenberg Distributed Proofreaders",
 61 |     "the Project Gutenberg Online Distributed Proofreading Team",
 62 |     "**The Project Gutenberg",
 63 |     "*SMALL PRINT!",
 64 |     "More information about this book is at the top of this file.",
 65 |     "tells you about restrictions in how the file may be used.",
 66 |     "l'authorization à les utilizer pour preparer ce texte.",
 67 |     "of the etext through OCR.",
 68 |     "*****These eBooks Were Prepared By Thousands of Volunteers!*****",
 69 |     "We need your donations more than ever!",
 70 |     " *** START OF THIS PROJECT GUTENBERG",
 71 |     "****     SMALL PRINT!",
 72 |     '["Small Print" V.',
 73 |     '      (http://www.ibiblio.org/gutenberg/',
 74 |     'and the Project Gutenberg Online Distributed Proofreading Team',
 75 |     'Mary Meehan, and the Project Gutenberg Online Distributed Proofreading',
 76 |     '                this Project Gutenberg edition.',
 77 | ))
 78 | 
 79 | TEXT_END_MARKERS = frozenset((
 80 |     "*** END OF THE PROJECT GUTENBERG",
 81 |     "*** END OF THIS PROJECT GUTENBERG",
 82 |     "***END OF THE PROJECT GUTENBERG",
 83 |     "End of the Project Gutenberg",
 84 |     "End of The Project Gutenberg",
 85 |     "Ende dieses Project Gutenberg",
 86 |     "by Project Gutenberg",
 87 |     "End of Project Gutenberg",
 88 |     "End of this Project Gutenberg",
 89 |     "Ende dieses Projekt Gutenberg",
 90 |     "        ***END OF THE PROJECT GUTENBERG",
 91 |     "*** END OF THE COPYRIGHTED",
 92 |     "End of this is COPYRIGHTED",
 93 |     "Ende dieses Etextes ",
 94 |     "Ende dieses Project Gutenber",
 95 |     "Ende diese Project Gutenberg",
 96 |     "**This is a COPYRIGHTED Project Gutenberg Etext, Details Above**",
 97 |     "Fin de Project Gutenberg",
 98 |     "The Project Gutenberg Etext of ",
 99 |     "Ce document fut presente en lecture",
100 |     "Ce document fut présenté en lecture",
101 |     "More information about this book is at the top of this file.",
102 |     "We need your donations more than ever!",
103 |     "END OF PROJECT GUTENBERG",
104 |     " End of the Project Gutenberg",
105 |     " *** END OF THIS PROJECT GUTENBERG",
106 | ))
107 | 
108 | LEGALESE_START_MARKERS = frozenset(("<<THIS ELECTRONIC VERSION OF",))
109 | LEGALESE_END_MARKERS = frozenset(("SERVICE THAT CHARGES FOR DOWNLOAD",))
110 | 
111 | 
112 | ##
113 | # adapted from https://github.com/c-w/Gutenberg/blob/master/gutenberg/acquire/text.py
114 | def get_text_dir_from_index(index):
115 |     return f"files/{index}"
116 | 
117 | 
118 | ##
119 | # adapted from https://github.com/c-w/Gutenberg/blob/master/gutenberg/acquire/text.py
120 | def _format_download_uri(index):
121 |     """Returns the download location on the Project Gutenberg servers for a
122 |     given text.
123 |     Raises:
124 |         UnknownDownloadUri: If no download location can be found for the text.
125 |     """
126 |     uri_root = r'https://www.gutenberg.org'
127 |     extensions = ('.txt', '-8.txt', '-0.txt')
128 |     path = get_text_dir_from_index(index)
129 |     for extension in extensions:
130 |         uri = '{root}/{path}/{etextno}{extension}'.format(
131 |             root=uri_root,
132 |             path=path,
133 |             etextno=index,
134 |             extension=extension)
135 |         p = urlparse(uri)
136 |         conn = http.client.HTTPSConnection(p.netloc)
137 |         conn.request('HEAD', p.path)
138 |         resp = conn.getresponse()
139 |         if resp.status < 400:
140 |             return uri
141 |     raise UnknownDownloadUri("location not found for text at path: %s" % path)
142 | 
143 | 
144 | ##
145 | # adapted from https://github.com/c-w/Gutenberg/blob/master/gutenberg/acquire/text.py
146 | def get_text_by_id(index):
147 |     file_cache_location = os.path.join(GutenbergCacheSettings.TEXT_FILES_CACHE_FOLDER, str(index)+'.txt.gz')
148 |     if not os.path.exists(file_cache_location):
149 |         try:
150 |             os.makedirs(os.path.dirname(file_cache_location))
151 |         except OSError as ex:
152 |             if ex.errno != errno.EEXIST:
153 |                 raise
154 |         download_uri = _format_download_uri(index)
155 | 
156 |         text_bytes = urlopen(download_uri).read()
157 |         encoding = chardet.detect(text_bytes)["encoding"]
158 |         text = text_bytes.decode(encoding)
159 | 
160 |         with closing(gzip.open(file_cache_location, 'w')) as cache:
161 |             cache.write(text.encode('utf-8'))
162 | 
163 |     with closing(gzip.open(file_cache_location, 'r')) as cache:
164 |         text = cache.read().decode('utf-8')
165 |     return text.encode('utf-8')
166 | 
167 | 
168 | ##
169 | # this function is 100% from https://github.com/c-w/Gutenberg/blob/master/gutenberg/cleanup/strip_headers.py
170 | def strip_headers(text):
171 |     lines = text.splitlines()
172 |     sep = os.linesep
173 |     sep = sep.encode('utf-8')
174 |     out = []
175 |     i = 0
176 |     footer_found = False
177 |     ignore_section = False
178 | 
179 |     for line in lines:
180 |         reset = False
181 | 
182 |         if i <= 600:
183 |             # Check if the header ends here
184 |             if any(line.startswith(token.encode('utf-8')) for token in TEXT_START_MARKERS):
185 |                 reset = True
186 | 
187 |             # If it's the end of the header, delete the output produced so far.
188 |             # May be done several times, if multiple lines occur indicating the
189 |             # end of the header
190 |             if reset:
191 |                 out = []
192 |                 continue
193 | 
194 |         if i >= 100:
195 |             # Check if the footer begins here
196 |             if any(line.startswith(token.encode('utf-8')) for token in TEXT_END_MARKERS):
197 |                 footer_found = True
198 | 
199 |             # If it's the beginning of the footer, stop output
200 |             if footer_found:
201 |                 break
202 | 
203 |         if any(line.startswith(token.encode('utf-8')) for token in LEGALESE_START_MARKERS):
204 |             ignore_section = True
205 |             continue
206 |         elif any(line.startswith(token.encode('utf-8')) for token in LEGALESE_END_MARKERS):
207 |             ignore_section = False
208 |             continue
209 | 
210 |         if not ignore_section:
211 |             stripline = line.rstrip(sep) 
212 |             out.append(stripline)
213 |             i += 1
214 | 
215 |     return sep.join(out)
216 | 


--------------------------------------------------------------------------------