├── tests ├── gutenbergpy ├── __init__.py ├── caches │ ├── __init__.py │ ├── cache.py │ ├── gutenbergindex_indices.db.sql │ ├── gutenbergindex.db.sql │ ├── mongodbcache.py │ └── sqlitecache.py ├── parse │ ├── __init__.py │ ├── rdfparseresults.py │ ├── cachefields.py │ ├── book.py │ ├── parseitem.py │ ├── parseitemtitles.py │ ├── parseitemfile.py │ └── rdfparser.py ├── gutenbergcachesettings.py ├── gutenbergcache.py ├── utils.py ├── orderedset.py └── textget.py ├── dblogos.png ├── mongodb.png ├── sqlite.png ├── sqlitecheme.png ├── pyproject.toml ├── setup.py ├── .gitignore ├── test.py ├── setup.cfg ├── LICENSE.txt └── README.md /tests: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gutenbergpy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gutenbergpy/caches/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gutenbergpy/parse/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dblogos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raduangelescu/gutenbergpy/HEAD/dblogos.png -------------------------------------------------------------------------------- /mongodb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raduangelescu/gutenbergpy/HEAD/mongodb.png -------------------------------------------------------------------------------- /sqlite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raduangelescu/gutenbergpy/HEAD/sqlite.png -------------------------------------------------------------------------------- /sqlitecheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raduangelescu/gutenbergpy/HEAD/sqlitecheme.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /gutenbergpy/parse/rdfparseresults.py: -------------------------------------------------------------------------------- 1 | ## 2 | # Used to hold the parser results 3 | class RDFParseResults: 4 | def __init__(self): 5 | self.field_sets = [] 6 | self.books = [] 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os.path import isfile 2 | from sys import version_info 3 | 4 | from setuptools import find_packages 5 | from setuptools import setup 6 | 7 | setup( 8 | packages=find_packages(), 9 | package_data={'gutenbergpy.caches': ['*.sql']}, 10 | include_package_data=True) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build/ 3 | GutenbergPy.egg-info/ 4 | dist/ 5 | .idea/gutenbergpy.iml 6 | .idea/inspectionProfiles/profiles_settings.xml 7 | .idea/misc.xml 8 | .idea/modules.xml 9 | .idea/vcs.xml 10 | .idea/workspace.xml 11 | texts/ 12 | gutenbergindex.db 13 | cache/ 14 | rdf-files.tar.bz2 15 | memory.pickle 16 | memusage.txt 17 | -------------------------------------------------------------------------------- /gutenbergpy/parse/cachefields.py: -------------------------------------------------------------------------------- 1 | ## 2 | # The main fields of a book 3 | # noinspection PyClassHasNoInit 4 | class Fields: # Fields we will have in the caches db for a book entry 5 | TITLE = 0 6 | SUBJECT = 1 7 | TYPE = 2 8 | LANGUAGE = 3 9 | AUTHOR = 4 10 | BOOKSHELF = 5 11 | FILES = 6 12 | PUBLISHER = 7 13 | RIGHTS = 8 14 | FIELD_COUNT = 9 15 | -------------------------------------------------------------------------------- /gutenbergpy/caches/cache.py: -------------------------------------------------------------------------------- 1 | ## 2 | # Base class for any kind of cache 3 | class Cache: 4 | def __init__(self): 5 | pass 6 | 7 | def create_cache(self, parse_results): 8 | raise NotImplementedError("Please implement the create_cache function") 9 | 10 | def query(self, **kwargs): 11 | raise NotImplementedError("Please implement the query function") 12 | 13 | def native_query(self, sql_query): 14 | raise NotImplementedError("Please implement the native_query function") 15 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import gutenbergpy.textget 3 | from gutenbergpy.gutenbergcache import GutenbergCache 4 | 5 | # create cache from scratchfrom scratch 6 | GutenbergCache.create(refresh=True, download=True, unpack=True, parse=True, cache=True, deleteTemp=True) 7 | # get the default cache (SQLite) 8 | cache = GutenbergCache.get_cache() 9 | # For the query function you can use the following fields: languages authors types titles subjects publishers bookshelves 10 | print(cache.query(downloadtype=['application/plain', 'text/plain', 'text/html; charset=utf-8'])) 11 | # Print stripped text 12 | print(gutenbergpy.textget.strip_headers(gutenbergpy.textget.get_text_by_id(1000))) 13 | -------------------------------------------------------------------------------- /gutenbergpy/parse/book.py: -------------------------------------------------------------------------------- 1 | ## 2 | # Used to hold a book in parse results after parsing 3 | class Book: 4 | def __init__(self, publisher_id, rights_id, language_id, 5 | bookshelf_id, gutenberg_book_id, 6 | date_issued, num_downloads, 7 | titles_id, subjects_id, type_id, authors_id, files_id): 8 | self.publisher_id = publisher_id 9 | self.rights_id = rights_id 10 | self.language_id = language_id 11 | self.bookshelf_id = bookshelf_id 12 | self.gutenberg_book_id = gutenberg_book_id 13 | self.date_issued = date_issued 14 | self.num_downloads = num_downloads 15 | self.titles_id = titles_id 16 | self.subjects_id = subjects_id 17 | self.authors_id = authors_id 18 | self.files_id = files_id 19 | self.type_id = type_id 20 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name= gutenbergpy 3 | author= Radu Angelescu 4 | author_email = raduangelescu@gmail.com 5 | version = 0.3.6 6 | description = Library to create and interogate local cache for Project Gutenberg 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/raduangelescu/gutenbergpy 10 | download_url = http://pypi.python.org/pypi/gutenbergpy 11 | project_urls = 12 | Bug Tracker = http://pypi.python.org/pypi/gutenbergpy/issues 13 | classifiers = 14 | Programming Language :: Python :: 3 15 | License :: OSI Approved :: MIT License 16 | Operating System :: OS Independent 17 | license = LICENSE.txt 18 | 19 | [options] 20 | install_requires = 21 | future>=0.15.2 22 | httpsproxy-urllib2 23 | lxml>=3.2.0 24 | pymongo 25 | setuptools>=18.5 26 | chardet 27 | python_requires = >=3.6 28 | -------------------------------------------------------------------------------- /gutenbergpy/caches/gutenbergindex_indices.db.sql: -------------------------------------------------------------------------------- 1 | BEGIN TRANSACTION; 2 | 3 | CREATE INDEX `books_dateissued_idx` ON `books` (`dateissued` ASC); 4 | CREATE INDEX `books_numdownloads_idx` ON `books` (`numdownloads` ASC); 5 | CREATE INDEX `gutenbergbookid_idx` ON `books` (`gutenbergbookid` ASC); 6 | 7 | CREATE INDEX `authors_name_idx` ON `authors` (`name` ASC); 8 | 9 | CREATE INDEX `types_name_idx` ON `types` (`name` ASC); 10 | 11 | CREATE INDEX `titles_name_idx` ON `titles` (`name` ASC); 12 | 13 | CREATE INDEX `subjects_name_idx` ON `subjects` (`name` ASC); 14 | 15 | CREATE INDEX `rights_name_idx` ON `rights` (`name` ASC); 16 | 17 | CREATE INDEX `publishers_name_idx` ON `publishers` (`name` ASC); 18 | 19 | CREATE INDEX `languages_name_idx` ON `languages` (`name` ASC); 20 | 21 | CREATE INDEX `bookshelves_name_idx` ON `bookshelves` (`name` ASC); 22 | 23 | CREATE INDEX `downloadlinks_name_idx` ON `downloadlinks` (`name` ASC); 24 | 25 | 26 | COMMIT; 27 | -------------------------------------------------------------------------------- /gutenbergpy/parse/parseitem.py: -------------------------------------------------------------------------------- 1 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 2 | from gutenbergpy.orderedset import OrderedSet 3 | 4 | 5 | ## 6 | # Helper for parsing a rdf file item 7 | class ParseItem: 8 | def __init__(self, xpath): 9 | self.xPath = xpath 10 | self.set = OrderedSet() 11 | 12 | def needs_book_id(self): 13 | return False 14 | 15 | def add_to_set_internal(self, xpathResults, ret): 16 | if len(xpathResults) > 0: 17 | for el in xpathResults: 18 | ret.append(self.set.add(el.replace("\"", "'"))) 19 | 20 | def add_to_set(self, xpathResults, ret): 21 | self.add_to_set_internal(xpathResults, ret) 22 | 23 | def do(self, doc): 24 | tmp = [] 25 | for xpth in self.xPath: 26 | xpathResults = doc.xpath(xpth, namespaces=GutenbergCacheSettings.NS) 27 | self.add_to_set(xpathResults, tmp) 28 | return tmp 29 | -------------------------------------------------------------------------------- /gutenbergpy/parse/parseitemtitles.py: -------------------------------------------------------------------------------- 1 | from gutenbergpy.parse.parseitem import ParseItem 2 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 3 | 4 | 5 | ## 6 | # Parser helper for title items 7 | # noinspection PyMethodOverriding 8 | class ParseItemTitles(ParseItem): 9 | def needs_book_id(self): 10 | return True 11 | 12 | def att_to_set_book_id(self, xpathResults, ret, book_id): 13 | if len(xpathResults) > 0: 14 | for el in xpathResults: 15 | text = el.replace("\"", "'") 16 | index = self.set.index(text) 17 | if index != -1: 18 | self.set[text][1] = book_id 19 | else: 20 | new_index = self.set.add((text, book_id)) 21 | ret.append(new_index) 22 | 23 | def do(self, doc, book_id): 24 | tmp = [] 25 | for xpth in self.xPath: 26 | xpathResults = doc.xpath(xpth, namespaces=GutenbergCacheSettings.NS) 27 | self.att_to_set_book_id(xpathResults, tmp, book_id) 28 | return tmp 29 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Radu Angelescu. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of this software and associated documentation files (the "Software"), 5 | to deal in the Software without restriction, including without limitation 6 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | and/or sell copies of the Software, and to permit persons to whom the 8 | Software is furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /gutenbergpy/caches/gutenbergindex.db.sql: -------------------------------------------------------------------------------- 1 | BEGIN TRANSACTION; 2 | CREATE TABLE `types` ( 3 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 4 | `name` TEXT 5 | ); 6 | CREATE TABLE `titles` ( 7 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 8 | `name` TEXT, 9 | `bookid` INTEGER 10 | ); 11 | CREATE TABLE `subjects` ( 12 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 13 | `name` TEXT 14 | ); 15 | CREATE TABLE `rights` ( 16 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 17 | `name` TEXT 18 | ); 19 | CREATE TABLE `publishers` ( 20 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 21 | `name` TEXT 22 | ); 23 | CREATE TABLE `languages` ( 24 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 25 | `name` TEXT 26 | ); 27 | 28 | CREATE TABLE `downloadlinkstype` ( 29 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 30 | `name` TEXT 31 | ); 32 | 33 | CREATE TABLE `downloadlinks` ( 34 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 35 | `name` TEXT, 36 | `downloadtypeid` INTEGER, 37 | `bookid` INTEGER 38 | ); 39 | CREATE TABLE `bookshelves` ( 40 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 41 | `name` TEXT 42 | ); 43 | CREATE TABLE "books" ( 44 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 45 | `publisherid` INTEGER, 46 | `dateissued` DATE, 47 | `rightsid` INTEGER, 48 | `numdownloads` INTEGER, 49 | `languageid` INTEGER, 50 | `bookshelveid` INTEGER, 51 | `gutenbergbookid` INTEGER, 52 | `typeid` INTEGER 53 | ); 54 | CREATE TABLE `book_subjects` ( 55 | `bookid` INTEGER, 56 | `subjectid` INTEGER 57 | ); 58 | CREATE TABLE `book_authors` ( 59 | `bookid` INTEGER, 60 | `authorid` INTEGER 61 | ); 62 | CREATE TABLE `authors` ( 63 | `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, 64 | `name` TEXT 65 | ); 66 | 67 | 68 | 69 | COMMIT; 70 | -------------------------------------------------------------------------------- /gutenbergpy/parse/parseitemfile.py: -------------------------------------------------------------------------------- 1 | from gutenbergpy.parse.parseitem import ParseItem 2 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 3 | from gutenbergpy.orderedset import OrderedSet 4 | 5 | 6 | ## 7 | # Parser helper for download links 8 | # noinspection PyMethodOverriding 9 | class ParseItemFiles(ParseItem): 10 | 11 | def __init__(self, xpath): 12 | ParseItem.__init__(self, xpath) 13 | self.xPath = xpath 14 | self.setTypes = OrderedSet() 15 | self.setLinks = OrderedSet() 16 | 17 | def needs_book_id(self): 18 | return True 19 | 20 | @staticmethod 21 | def add(theset, xpath_result, book_id, type_id): 22 | text = xpath_result.replace("\"", "'") 23 | index = theset.index(text) 24 | if index != -1: 25 | theset[text][1] = book_id 26 | else: 27 | index = theset.add((text, book_id,type_id)) 28 | return index 29 | 30 | @staticmethod 31 | def add_simple(the_set, xpath_result): 32 | text = xpath_result.replace("\"", "'") 33 | return the_set.add(text) 34 | 35 | def do(self,doc,book_id): 36 | arr =[] 37 | for xpth in self.xPath: 38 | book_files = doc.xpath(xpth, namespaces=GutenbergCacheSettings.NS) 39 | for bk in book_files: 40 | 41 | xpath_results_type = bk.xpath('.//dcterms:format/rdf:Description/rdf:value/text()', namespaces=GutenbergCacheSettings.NS) 42 | xpath_results_link = bk.xpath('.//pgterms:file/@rdf:about', namespaces=GutenbergCacheSettings.NS) 43 | 44 | if xpath_results_link and xpath_results_type: 45 | type_id = self.add_simple( self.setTypes, xpath_results_type[0]) 46 | link_id = self.add(self.setLinks, xpath_results_link[0], book_id, type_id) 47 | 48 | arr.append(link_id) 49 | return arr 50 | -------------------------------------------------------------------------------- /gutenbergpy/gutenbergcachesettings.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # noinspection PyClassHasNoInit 4 | class GutenbergCacheSettings: 5 | # name of the gutenberg link for the rdf arhive (should not change) 6 | CACHE_RDF_DOWNLOAD_LINK = 'https://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2' 7 | # name of the caches file name (sqlite db) 8 | CACHE_FILENAME = 'gutenbergindex.db' 9 | # name of the rdf unpack directory (this will be used when unpacking the rdf tar) 10 | CACHE_RDF_UNPACK_DIRECTORY = os.path.join('cache','epub') 11 | # name of the downloaded rdf arhive 12 | CACHE_RDF_ARCHIVE_NAME = 'rdf-files.tar.bz2' 13 | # number of #'s shown in loading bar (common to all loading bars) 14 | DOWNLOAD_NUM_DIVS = 20 15 | # text files cache folder 16 | TEXT_FILES_CACHE_FOLDER = 'texts' 17 | # mongo db connection server 18 | MONGO_DB_CONNECTION_SERVER ="mongodb://localhost:27017" 19 | ##########READONLY VARIABLES (please put readonly variables here) 20 | # namespace used for the rds parsing (should not change) 21 | NS = { 22 | 'cc': "http://web.resource.org/cc/", 23 | 'dcam': "http://purl.org/dc/dcam/", 24 | 'dcterms': "http://purl.org/dc/terms/", 25 | 'rdfs': "http://www.w3.org/2000/01/rdf-schema#", 26 | 'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#", 27 | 'pgterms': "http://www.gutenberg.org/2009/pgterms/"} 28 | 29 | ##########END OF READONLY VARIABLES 30 | 31 | ## 32 | # Used to set the settings global variables 33 | @staticmethod 34 | def set(**kwargs): 35 | if 'CacheFilename' in kwargs: 36 | GutenbergCacheSettings.CACHE_FILENAME = kwargs['CacheFilename'] 37 | if 'CacheUnpackDir' in kwargs: 38 | GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY = kwargs['CacheUnpackDir'] 39 | if 'CacheArchiveName' in kwargs: 40 | GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME = kwargs['CacheArchiveName'] 41 | if 'ProgressBarMaxLength' in kwargs: 42 | GutenbergCacheSettings.DOWNLOAD_NUM_DIVS = kwargs['ProgressBarMaxLength'] 43 | if 'CacheRDFDownloadLink' in kwargs: 44 | GutenbergCacheSettings.CACHE_RDF_DOWNLOAD_LINK = kwargs['CacheRDFDownloadLink'] 45 | if 'TextFilesCacheFolder' in kwargs: 46 | GutenbergCacheSettings.TEXT_FILES_CACHE_FOLDER = kwargs['TextFilesCacheFolder'] 47 | if 'MongoDBCacheServer' in kwargs: 48 | GutenbergCacheSettings.MONGO_DB_CONNECTION_SERVER = kwargs['MongoDBCacheServer'] -------------------------------------------------------------------------------- /gutenbergpy/gutenbergcache.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from os import path 3 | import time 4 | from gutenbergpy.utils import Utils 5 | 6 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 7 | from gutenbergpy.parse.rdfparser import RdfParser 8 | from gutenbergpy.caches.sqlitecache import SQLiteCache 9 | from gutenbergpy.caches.mongodbcache import MongodbCache 10 | 11 | ## 12 | # Cache types 13 | # noinspection PyClassHasNoInit 14 | class GutenbergCacheTypes: 15 | CACHE_TYPE_SQLITE = 0 16 | CACHE_TYPE_MONGODB = 1 17 | 18 | 19 | ## 20 | # The main class (only this should be used to interface the cache) 21 | class GutenbergCache: 22 | ## 23 | # Get the cache by type 24 | @staticmethod 25 | def get_cache(type=GutenbergCacheTypes.CACHE_TYPE_SQLITE): 26 | if type == GutenbergCacheTypes.CACHE_TYPE_SQLITE: 27 | return SQLiteCache() 28 | elif type == GutenbergCacheTypes.CACHE_TYPE_MONGODB: 29 | return MongodbCache() 30 | print("CACHE TYPE UNKNOWN") 31 | return None 32 | 33 | ## 34 | # Create the cache 35 | @staticmethod 36 | def create(**kwargs): 37 | cache_type = GutenbergCacheTypes.CACHE_TYPE_SQLITE if 'type' not in kwargs else kwargs['type'] 38 | refresh = True if 'refresh' not in kwargs else kwargs['refresh'] 39 | download = True if 'download' not in kwargs else kwargs['download'] 40 | unpack = True if 'unpack' not in kwargs else kwargs['unpack'] 41 | parse = True if 'parse' not in kwargs else kwargs['parse'] 42 | cache = True if 'cache' not in kwargs else kwargs['cache'] 43 | deleteTmp = True if 'deleteTemp' not in kwargs else kwargs['deleteTemp'] 44 | 45 | if path.isfile(GutenbergCacheSettings.CACHE_FILENAME) and refresh and cache_type == GutenbergCacheTypes.CACHE_TYPE_SQLITE: 46 | print('Cache already exists') 47 | return 48 | 49 | if refresh: 50 | print('Deleting old files') 51 | Utils.delete_tmp_files(True) 52 | 53 | if download: 54 | Utils.download_file() 55 | 56 | if unpack: 57 | Utils.unpack_tarbz2() 58 | 59 | if parse: 60 | t0 = time.time() 61 | parser = RdfParser() 62 | result = parser.do() 63 | print('RDF PARSING took ' + str(time.time() - t0)) 64 | 65 | if cache: 66 | t0 = time.time() 67 | cache = GutenbergCache.get_cache(cache_type) 68 | cache.create_cache(result) 69 | print('sql took %f' % (time.time() - t0)) 70 | 71 | if deleteTmp: 72 | print('Deleting temporary files') 73 | Utils.delete_tmp_files() 74 | 75 | print('Done') 76 | 77 | ## 78 | # Method to check if the cache exists 79 | @staticmethod 80 | def exists(): 81 | return path.isfile(GutenbergCacheSettings.CACHE_FILENAME) 82 | -------------------------------------------------------------------------------- /gutenbergpy/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import os 4 | import urllib 5 | import tarfile 6 | from future.standard_library import install_aliases 7 | install_aliases() 8 | import urllib.request 9 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 10 | 11 | 12 | ## 13 | # Wrapper for project used utils 14 | 15 | # noinspection PyClassHasNoInit 16 | class Utils: 17 | ## 18 | # Deletes the temp files resulted in the cache process 19 | @staticmethod 20 | def delete_tmp_files(delete_sqlite=False): 21 | if delete_sqlite: 22 | try: 23 | os.remove(GutenbergCacheSettings.CACHE_FILENAME) 24 | except OSError: 25 | pass 26 | try: 27 | os.remove(GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME) 28 | except OSError: 29 | pass 30 | try: 31 | for root, dirs, files in os.walk(GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY, topdown=False): 32 | for name in files: 33 | os.remove(os.path.join(root, name)) 34 | for name in dirs: 35 | os.rmdir(os.path.join(root, name)) 36 | except OSError: 37 | pass 38 | 39 | ## 40 | # Updates the visual progress bar 41 | 42 | @staticmethod 43 | def update_progress_bar(type, progress, total_progress, 44 | force_update=True): # used to update the progress bar display 45 | if total_progress % GutenbergCacheSettings.DOWNLOAD_NUM_DIVS == 0 or force_update == True or progress == 0: 46 | dv = total_progress / GutenbergCacheSettings.DOWNLOAD_NUM_DIVS 47 | num_of_sharp = int(progress / dv) 48 | num_of_space = int((total_progress - progress) / dv) 49 | 50 | sys.stdout.write("\r %s : [%s%s]" % (type, '#' * num_of_sharp, ' ' * num_of_space)) 51 | sys.stdout.flush() 52 | 53 | download_progress = 0 54 | 55 | ## 56 | # Callback to report downloaded data 57 | 58 | @staticmethod 59 | def __report(block_no,block_size, file_size): # callback called on download update 60 | Utils.download_progress += block_size 61 | type = 'Downloading %s' % GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME 62 | Utils.update_progress_bar(type, Utils.download_progress, file_size, True) 63 | 64 | ## 65 | # Download the RDF file function 66 | 67 | @staticmethod 68 | def download_file(): # used to download the rdf tar file 69 | start = time.time() 70 | urllib.request.urlretrieve(GutenbergCacheSettings.CACHE_RDF_DOWNLOAD_LINK, 71 | GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME, Utils.__report) 72 | 73 | print ('took %f' % (time.time() - start)) 74 | Utils.download_progress = 0 75 | 76 | ## 77 | # Unpack the tar file 78 | 79 | @staticmethod 80 | def unpack_tarbz2(): # used to unpack the rdf tar file 81 | start = time.time() 82 | tar = tarfile.open(GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME) 83 | total_num = len(tar.getmembers()) 84 | type = 'Extracting %s' % GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME 85 | for idx, member in enumerate(tar.getmembers()): 86 | Utils.update_progress_bar(type, idx, total_num) 87 | tar.extract(member) 88 | tar.close() 89 | 90 | print('took %f' % (time.time() - start)) 91 | -------------------------------------------------------------------------------- /gutenbergpy/caches/mongodbcache.py: -------------------------------------------------------------------------------- 1 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 2 | from gutenbergpy.caches.cache import Cache 3 | from gutenbergpy.utils import Utils 4 | from gutenbergpy.parse.cachefields import Fields 5 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 6 | 7 | 8 | from pymongo import MongoClient 9 | 10 | 11 | ## 12 | # SQLite cache implementation 13 | class MongodbCache(Cache): 14 | 15 | def __init__(self): 16 | self.client = MongoClient(GutenbergCacheSettings.MONGO_DB_CONNECTION_SERVER) 17 | self.db = self.client.mongodbgutenbergcache 18 | 19 | def __get_book_json(self,parseItem,fields): 20 | book_dict ={} 21 | 22 | book_dict['publisher'] = fields[Fields.PUBLISHER].set[parseItem.publisher_id-1] if parseItem.publisher_id and parseItem.publisher_id != -1 else 'None' 23 | book_dict['rights'] = fields[Fields.RIGHTS].set[parseItem.rights_id-1] if parseItem.rights_id and parseItem.rights_id != -1 else 'None' 24 | book_dict['language'] = fields[Fields.LANGUAGE].set[parseItem.language_id-1] if parseItem.language_id and parseItem.language_id != -1 else 'None' 25 | book_dict['book_shelf'] = fields[Fields.BOOKSHELF].set[parseItem.bookshelf_id-1] if parseItem.bookshelf_id and parseItem.bookshelf_id != -1 else 'None' 26 | book_dict['gutenberg_book_id'] = parseItem.gutenberg_book_id 27 | book_dict['date_issued'] = parseItem.date_issued 28 | book_dict['num_downloads'] = parseItem.num_downloads 29 | 30 | book_dict['titles'] = map(lambda x: x[0] , fields[Fields.TITLE].set[[x - 1 for x in parseItem.titles_id]]) if parseItem.titles_id and parseItem.titles_id != -1 else ['None'] 31 | book_dict['subjects'] = map(lambda x: x , fields[Fields.SUBJECT].set[[x - 1 for x in parseItem.subjects_id]]) if parseItem.subjects_id and parseItem.subjects_id != -1 else ['None'] 32 | book_dict['authors'] = map(lambda x: x , fields[Fields.AUTHOR].set[[x - 1 for x in parseItem.authors_id]]) if parseItem.authors_id and parseItem.authors_id != -1 else ['None'] 33 | book_dict['files'] = map(lambda x: x[0] , fields[Fields.FILES].setLinks[[x - 1 for x in parseItem.files_id]]) if parseItem.files_id and parseItem.files_id != -1 else ['None'] 34 | book_dict['type'] = fields[Fields.TYPE].set[parseItem.type_id-1] if parseItem.type_id and parseItem.type_id != -1 else 'None' 35 | return book_dict 36 | ## 37 | # Create the MongoDB cache 38 | def create_cache(self, parse_results): 39 | self.db.books.drop() 40 | book_collection = self.db.books 41 | total = len (parse_results.books) 42 | 43 | for idx,book in enumerate(parse_results.books): 44 | Utils.update_progress_bar("MongoDB progress", idx, total) 45 | json = self.__get_book_json(book,parse_results.field_sets) 46 | self.db.books.insert_one(json) 47 | 48 | def create_or_dict(self,name,newname, dt,out): 49 | if dt.has_key(name): 50 | dict = {} 51 | lst = [] 52 | for e in dt[name]: 53 | lst.append ({name:e}) 54 | out.extend(lst) 55 | 56 | def query(self,**kwargs): 57 | query = {} 58 | lst = [] 59 | self.create_or_dict('languages','languages',kwargs,lst) 60 | self.create_or_dict('authors','authors', kwargs, lst) 61 | self.create_or_dict('types','type', kwargs, lst) 62 | self.create_or_dict('titles', 'titles', kwargs, lst) 63 | self.create_or_dict('subjects', 'subjects', kwargs, lst) 64 | self.create_or_dict('publishers', 'publisher', kwargs, lst) 65 | self.create_or_dict('bookshelves', 'bookshelves', kwargs, lst) 66 | self.create_or_dict('gutenberg_book_id', 'gutenberg_book_id', kwargs, lst) 67 | query['$or'] = lst 68 | lst =[] 69 | for res in self.native_query(query): 70 | lst.append(res["gutenberg_book_id"]) 71 | return lst 72 | ## 73 | # Native query function implementation 74 | def native_query(self,mongodb_query): 75 | return self.db.books.find(mongodb_query) -------------------------------------------------------------------------------- /gutenbergpy/parse/rdfparser.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | from os import path 3 | from lxml import etree 4 | 5 | from gutenbergpy.parse.cachefields import Fields 6 | from gutenbergpy.parse.book import Book 7 | from gutenbergpy.parse.parseitem import ParseItem 8 | from gutenbergpy.parse.parseitemfile import ParseItemFiles 9 | from gutenbergpy.parse.parseitemtitles import ParseItemTitles 10 | from gutenbergpy.parse.rdfparseresults import RDFParseResults 11 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 12 | from gutenbergpy.utils import Utils 13 | 14 | 15 | ## 16 | # The rdf parser 17 | # noinspection PyClassHasNoInit 18 | class RdfParser: 19 | ## 20 | # The main funciton, this actually parses the rdf files from the downloaded cache 21 | @staticmethod 22 | def do(): 23 | result = RDFParseResults() 24 | 25 | result.field_sets = Fields.FIELD_COUNT * [None] 26 | result.field_sets[Fields.TITLE] = ParseItemTitles(xpath=['//dcterms:title/text()','//dcterms:alternative/text()']) 27 | result.field_sets[Fields.SUBJECT] = ParseItem(xpath =['//dcterms:subject/rdf:Description/rdf:value/text()']) 28 | result.field_sets[Fields.TYPE] = ParseItem(xpath =['//dcterms:type/rdf:Description/rdf:value/text()']) 29 | result.field_sets[Fields.LANGUAGE] = ParseItem(xpath =['//dcterms:language/rdf:Description/rdf:value/text()']) 30 | result.field_sets[Fields.AUTHOR] = ParseItem(xpath =['//dcterms:creator/pgterms:agent/pgterms:alias/text()','//dcterms:creator/pgterms:agent/pgterms:name/text()']) 31 | result.field_sets[Fields.BOOKSHELF] = ParseItem(xpath =['//pgterms:bookshelf/rdf:Description/rdf:value/text()']) 32 | result.field_sets[Fields.FILES] = ParseItemFiles(xpath =['//dcterms:hasFormat']) 33 | result.field_sets[Fields.PUBLISHER] = ParseItem(xpath =['//dcterms:publisher/text()']) 34 | result.field_sets[Fields.RIGHTS] = ParseItem( xpath =['//dcterms:rights/text()']) 35 | 36 | 37 | dirs = [d for d in listdir(GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY) if not d.startswith("DELETE")] 38 | total = len(dirs) 39 | 40 | for idx, dir in enumerate(dirs): 41 | if not str(dir).isdigit(): 42 | continue 43 | processing_str = "Processing progress: %d / %d" % (idx,total) 44 | Utils.update_progress_bar(processing_str,idx,total) 45 | file_path = path.join(GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY,dir,'pg%s.rdf'%(dir)) 46 | doc = etree.parse(file_path,etree.ETCompatXMLParser()) 47 | 48 | book_id = len(result.books)+1 49 | res = Fields.FIELD_COUNT * [-1] 50 | for idx_field, pt in enumerate(result.field_sets): 51 | if not pt.needs_book_id(): 52 | res[idx_field] = pt.do(doc) 53 | else: 54 | res[idx_field] = pt.do(doc,book_id) 55 | 56 | gutenberg_book_id = int(dir) 57 | 58 | date_issued_x = doc.xpath('//dcterms:issued/text()', namespaces=GutenbergCacheSettings.NS) 59 | num_downloads_x = doc.xpath('//pgterms:downloads/text()',namespaces=GutenbergCacheSettings.NS) 60 | 61 | date_issued = '1000-10-10' if not date_issued_x or date_issued_x[0] =='None' else str(date_issued_x[0]) 62 | num_downloads = -1 if not num_downloads_x else int(num_downloads_x[0]) 63 | publisher_id = -1 if not res[Fields.PUBLISHER] else res[Fields.PUBLISHER][0] 64 | rights_id = -1 if not res[Fields.RIGHTS] else res[Fields.RIGHTS][0] 65 | language_id = -1 if not res[Fields.LANGUAGE] else res[Fields.LANGUAGE][0] 66 | bookshelf_id = -1 if not res[Fields.BOOKSHELF] else res[Fields.BOOKSHELF][0] 67 | type_id = -1 if not res[Fields.TYPE] else res[Fields.TYPE][0] 68 | 69 | newbook = Book(publisher_id, rights_id, language_id, bookshelf_id, 70 | gutenberg_book_id, date_issued, num_downloads, res[Fields.TITLE], 71 | res[Fields.SUBJECT], type_id, res[Fields.AUTHOR], res[Fields.FILES]) 72 | 73 | result.books.append(newbook) 74 | 75 | return result -------------------------------------------------------------------------------- /gutenbergpy/orderedset.py: -------------------------------------------------------------------------------- 1 | # THIS CODE IS ADAPTED FROM https://github.com/LuminosoInsight/ordered-set 2 | SLICE_ALL = slice(None) 3 | __version__ = '2.0.1' 4 | 5 | import collections 6 | try: 7 | from collections.abc import MutableSet 8 | except: 9 | from collections import MutableSet 10 | 11 | 12 | def is_iterable(obj): 13 | """ 14 | Are we being asked to look up a list of things, instead of a single thing? 15 | We check for the `__iter__` attribute so that this can cover types that 16 | don't have to be known by this module, such as NumPy arrays. 17 | 18 | Strings, however, should be considered as atomic values to look up, not 19 | iterables. The same goes for tuples, since they are immutable and therefore 20 | valid entries. 21 | 22 | We don't need to check for the Python 2 `unicode` type, because it doesn't 23 | have an `__iter__` attribute anyway. 24 | """ 25 | return hasattr(obj, '__iter__') and not isinstance(obj, str) and not isinstance(obj, tuple) 26 | 27 | 28 | class OrderedSet(MutableSet): 29 | """ 30 | An OrderedSet is a custom MutableSet that remembers its order, so that 31 | every entry has an index that can be looked up. 32 | """ 33 | 34 | def __init__(self, iterable=None): 35 | self.items = [] 36 | self.map = {} 37 | if iterable is not None: 38 | self |= iterable 39 | 40 | def __len__(self): 41 | return len(self.items) 42 | 43 | def __getitem__(self, index): 44 | """ 45 | Get the item at a given index. 46 | 47 | If `index` is a slice, you will get back that slice of items. If it's 48 | the slice [:], exactly the same object is returned. (If you want an 49 | independent copy of an OrderedSet, use `OrderedSet.copy()`.) 50 | 51 | If `index` is an iterable, you'll get the OrderedSet of items 52 | corresponding to those indices. This is similar to NumPy's 53 | "fancy indexing". 54 | """ 55 | if index == SLICE_ALL: 56 | return self 57 | elif hasattr(index, '__index__') or isinstance(index, slice): 58 | result = self.items[index] 59 | if isinstance(result, list): 60 | return OrderedSet(result) 61 | else: 62 | return result 63 | elif is_iterable(index): 64 | return OrderedSet([self.items[i] for i in index]) 65 | else: 66 | raise TypeError("Don't know how to index an OrderedSet by %r" % 67 | index) 68 | 69 | def copy(self): 70 | return OrderedSet(self) 71 | 72 | def __getstate__(self): 73 | if len(self) == 0: 74 | # The state can't be an empty list. 75 | # We need to return a truthy value, or else __setstate__ won't be run. 76 | # 77 | # This could have been done more gracefully by always putting the state 78 | # in a tuple, but this way is backwards- and forwards- compatible with 79 | # previous versions of OrderedSet. 80 | return None, 81 | else: 82 | return list(self) 83 | 84 | def __setstate__(self, state): 85 | if state == (None,): 86 | self.__init__([]) 87 | else: 88 | self.__init__(state) 89 | 90 | def __contains__(self, key): 91 | return key in self.map 92 | 93 | def add(self, key): 94 | """ 95 | Add `key` as an item to this OrderedSet, then return its index. 96 | 97 | If `key` is already in the OrderedSet, return the index it already 98 | had. 99 | """ 100 | if key not in self.map: 101 | self.map[key] = len(self.items) 102 | self.items.append(key) 103 | return self.map[key] + 1 104 | 105 | append = add 106 | 107 | def update(self, sequence): 108 | """ 109 | Update the set with the given iterable sequence, then return the index 110 | of the last element inserted. 111 | """ 112 | item_index = None 113 | try: 114 | for item in sequence: 115 | item_index = self.add(item) 116 | except TypeError: 117 | raise ValueError('Argument needs to be an iterable, got %s' % type(sequence)) 118 | return item_index 119 | 120 | def index(self, key): 121 | """ 122 | Get the index of a given entry, raising an IndexError if it's not 123 | present. 124 | 125 | `key` can be an iterable of entries that is not a string, in which case 126 | this returns a list of indices. 127 | """ 128 | if is_iterable(key): 129 | return [self.index(subkey) for subkey in key] 130 | 131 | if key not in self.map: 132 | return -1 133 | 134 | return self.map[key] 135 | 136 | def pop(self): 137 | """ 138 | Remove and return the last element from the set. 139 | 140 | Raises KeyError if the set is empty. 141 | """ 142 | if not self.items: 143 | raise KeyError('Set is empty') 144 | 145 | elem = self.items[-1] 146 | del self.items[-1] 147 | del self.map[elem] 148 | return elem 149 | 150 | def discard(self, key): 151 | """ 152 | Remove an element. Do not raise an exception if absent. 153 | 154 | The MutableSet mixin uses this to implement the .remove() method, which 155 | *does* raise an error when asked to remove a non-existent item. 156 | """ 157 | if key in self: 158 | i = self.items.index(key) 159 | del self.items[i] 160 | del self.map[key] 161 | for k, v in self.map.items(): 162 | if v >= i: 163 | self.map[k] = v - 1 164 | 165 | def clear(self): 166 | """ 167 | Remove all items from this OrderedSet. 168 | """ 169 | del self.items[:] 170 | self.map.clear() 171 | 172 | def __iter__(self): 173 | return iter(self.items) 174 | 175 | def __reversed__(self): 176 | return reversed(self.items) 177 | 178 | def __repr__(self): 179 | if not self: 180 | return '%s()' % (self.__class__.__name__,) 181 | return '%s(%r)' % (self.__class__.__name__, list(self)) 182 | 183 | def __eq__(self, other): 184 | if isinstance(other, OrderedSet): 185 | return len(self) == len(other) and self.items == other.items 186 | try: 187 | other_as_set = set(other) 188 | except TypeError: 189 | # If `other` can't be converted into a set, it's not equal. 190 | return False 191 | else: 192 | return set(self) == other_as_set 193 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | GutenbergPy 2 | ======== 3 | 4 | ![image](https://github.com/raduangelescu/gutenbergpy/blob/master/dblogos.png?raw=true) 5 | 6 | This package makes filtering and getting information from [Project Gutenberg](http://www.gutenberg.org) easier from python. 7 | 8 | It's target audience is machine learning guys that need data for their project, but may be freely used by anybody. 9 | 10 | The package: 11 | 12 | - Generates a local cache (of all gutenberg informations) that you can interogate to get book ids. The Local cache may be sqlite (default) or mongodb (for wich you need to have installed the pymongodb packet) 13 | - Downloads and cleans raw text from gutenberg books 14 | 15 | The package has been tested with Python 3.6 on both Windows and Linux It is faster, smaller and less third-party intensive alternative to 16 | 17 | About development: 18 | 19 | Installation 20 | ============ 21 | 22 | ``` 23 | pip install gutenbergpy 24 | ``` 25 | 26 | or just install it from source (it's all just python code): 27 | 28 | ``` 29 | git clone https://github.com/raduangelescu/gutenbergpy 30 | python setup.py install 31 | ``` 32 | 33 | Usage 34 | ===== 35 | 36 | Downloading a text 37 | ------------------ 38 | ``` 39 | import gutenbergpy.textget 40 | 41 | ``` 42 | After importing our module, we can download a text from gutenberg. 43 | 44 | ```python 45 | def usage_example(): 46 | # This gets a book by its gutenberg id number 47 | raw_book = gutenbergpy.textget.get_text_by_id(2701) # with headers 48 | clean_book = gutenbergpy.textget.strip_headers(raw_book) # without headers 49 | return clean_book, raw_book 50 | ``` 51 | The code above can easily be used without the function declaration, this is simply for illustration. 52 | 53 | ```python 54 | cleaned_book, raw_book = usage_example() 55 | 56 | # Cleaned Book 57 | print(f'Example phrase from the cleaned book: {" ".join(str(cleaned_book[3000:3050]).split(" "))}') 58 | # Raw Book 59 | print(f'Example phrase from the raw book: {" ".join(str(raw_book[3000:3050]).split(" "))}') 60 | 61 | ``` 62 | The output of the code above is: 63 | ``` 64 | b'rgris.\n\nCHAPTER 93. The Castaway.\n\nCHAPTER 94. A S' 65 | b'\n\n\n\nMOBY-DICK;\n\nor, THE WHALE.\n\nBy Herman Melville\n\n\n\nCONTENTS\n\nETYMOLOGY.\n\nEXTRACTS (Supplied by a Sub-Sub-Librarian).\n\nCHAPTER 1. Loomings.\n\nCHAPTER 2. The Carpet-Bag.\n\nCHAPTER 3. The Spouter-Inn.\n\nCHAPTER 4. The Counterpane.\n\nCHAPTER 5. Breakfast.\n\nCHAPTER 6. The Street.\n\nCHAPTER 7. The Chapel.\n\nCHAPTER 8. The Pulpit.\n\nCHAPTER 9. The Sermon.\n\nCHAPTER 10. A Bosom Friend.\n\nCHAPTER 11. Nightgown.\n\nCHAPTER 12. Biographical.\n\nCHAPTER 13. Wheelbarrow.\n\nCHAPTER 14. Nantucket.\n\nCHAPTER 15. Chowder.\n\nCHAPTER 16. The Ship.\n\nCHAPTER 17. The Ramadan.\n\nCHAPTER 18. His Mark.\n\nCHAPTER 19. The Prophet.\n\nCHAPTER 20. All Astir.\n\nCHAPTER 21. Going Aboard.\n\nCHAPTER 22. Merry Christmas.\n\nCHAPTER 23. The Lee Shore.\n\nCHAPTER 24. The Advocate.\n\nCHAPTER 25. Postscript.\n\nCHAPTER 26. Knights and Squires.\n\nCHAPTER 27. Knights and Squires.\n\nCHAPTER 28. Ahab.\n\nCHAPTER 29. Enter Ahab; to Him, Stubb.\n\nCHAPTER 30. The Pipe.\n\nCHAPTER 31. Queen Mab.\n\nCHAPTER 32. Cetology.\n\nCHAPTER 33. The Specksnyder.\n\nCHAPTER 34. Th' 66 | ``` 67 | They are both pretty messy, and will need to be cleaned prior to use for NLP etc. 68 | 69 | The Raw book: 70 | ```output 71 | b'b\xe2\x80\x99s Supper.\r\n\r\nCHAPTER 65. The Whale as a Dish.\r' 72 | b'\n\n\n\nMOBY-DICK;\n\nor, THE WHALE.\n\nBy Herman Melville\n\n\n\nCONTENTS\n\nETYMOLOGY.\n\nEXTRACTS (Supplied by a Sub-Sub-Librarian).\n\nCHAPTER 1. Loomings.\n\nCHAPTER 2. The Carpet-Bag.\n\nCHAPTER 3. The Spouter-Inn.\n\nCHAPTER 4. The Counterpane.\n\nCHAPTER 5. Breakfast.\n\nCHAPTER 6. The Street.\n\nCHAPTER 7. The Chapel.\n\nCHAPTER 8. The Pulpit.\n\nCHAPTER 9. The Sermon.\n\nCHAPTER 10. A Bosom Friend.\n\nCHAPTER 11. Nightgown.\n\nCHAPTER 12. Biographical.\n\nCHAPTER 13. Wheelbarrow.\n\nCHAPTER 14. Nantucket.\n\nCHAPTER 15. Chowder.\n\nCHAPTER 16. The Ship.\n\nCHAPTER 17. The Ramadan.\n\nCHAPTER 18. His Mark.\n\nCHAPTER 19. The Prophet.\n\nCHAPTER 20. All Astir.\n\nCHAPTER 21. Going Aboard.\n\nCHAPTER 22. Merry Christmas.\n\nCHAPTER 23. The Lee Shore.\n\nCHAPTER 24. The Advocate.\n\nCHAPTER 25. Postscript.\n\nCHAPTER 26. Knights and Squires.\n\nCHAPTER 27. Knights and Squires.\n\nCHAPTER 28. Ahab.\n\nCHAPTER 29. Enter Ahab; to Him, Stubb.\n\nCHAPTER 30. The Pipe.\n\nCHAPTER 31. Queen Mab.\n\nCHAPTER 32. Cetology.\n\nCHAPTER 33. The Specksnyder.\n\nCHAPTER 34. Th' 73 | 74 | ``` 75 | Query the cache 76 | --------------- 77 | 78 | To do this you first need to create the cache (this is a one time thing per os, until you decide to redo it) 79 | 80 | ``` 81 | from gutenbergpy.gutenbergcache import GutenbergCache 82 | #for sqlite 83 | GutenbergCache.create() 84 | #for mongodb 85 | GutenbergCache.create(type=GutenbergCacheTypes.CACHE_TYPE_MONGODB) 86 | ``` 87 | 88 | for debugging/better control you have these boolean options on create 89 | 90 | > - *refresh* deletes the old cache 91 | > - *download* property downloads the rdf file from the gutenberg project 92 | > - *unpack* unpacks it 93 | > - *parse* parses it in memory 94 | > - *cache* writes the cache 95 | 96 | ``` 97 | GutenbergCache.create(refresh=True, download=True, unpack=True, parse=True, cache=True, deleteTemp=True) 98 | ``` 99 | 100 | for even better control you may set the GutenbergCacheSettings 101 | - *CacheFilename* 102 | - *CacheUnpackDir* 103 | - *CacheArchiveName* 104 | - *ProgressBarMaxLength* 105 | - *CacheRDFDownloadLink* 106 | - *TextFilesCacheFolder* 107 | - *MongoDBCacheServer* 108 | 109 | ``` 110 | GutenbergCacheSettings.set( CacheFilename="", CacheUnpackDir="", 111 | CacheArchiveName="", ProgressBarMaxLength="", CacheRDFDownloadLink="", TextFilesCacheFolder="", MongoDBCacheServer="") 112 | ``` 113 | 114 | After doing a `create` go grab a coffee, it will be over in about 5 minutes, depending on your internet speed and computer power (On a i7 with gigabit connection and ssd it finishes in about 1 minute) 115 | 116 | Get the cache 117 | ``` 118 | #for mongodb 119 | cache = GutenbergCache.get_cache(GutenbergCacheTypes.CACHE_TYPE_MONGODB) 120 | #for sqlite 121 | cache = GutenbergCache.get_cache() 122 | ``` 123 | Now you can do queries 124 | 125 | Get the book Gutenberg unique indices by using this query function 126 | 127 | Standard query fields: 128 | - languages 129 | - authors 130 | - types 131 | - titles 132 | - subjects 133 | - publishers 134 | - bookshelves 135 | - downloadtype 136 | ``` 137 | print(cache.query(downloadtype=['application/plain','text/plain','text/html; charset=utf-8'])) 138 | ``` 139 | Or do a native query on the sqlite database 140 | ``` 141 | #python 142 | cache.native_query("SELECT * FROM books") 143 | #mongodb 144 | cache.native_query({type:'Text'}} 145 | ``` 146 | For SQLITE custom queries, take a look at the SQLITE database scheme: 147 | 148 | ![image](https://github.com/raduangelescu/gutenbergpy/blob/master/sqlitecheme.png?raw=true) 149 | 150 | For MongoDB queries, you have all the books collection. Each book with the following fields: 151 | 152 | > - book(publisher, rights, language, book\_shelf, gutenberg\_book\_id, date\_issued, num\_downloads, titles, subjects, authors, files ,type) 153 | -------------------------------------------------------------------------------- /gutenbergpy/caches/sqlitecache.py: -------------------------------------------------------------------------------- 1 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 2 | from gutenbergpy.caches.cache import Cache 3 | from gutenbergpy.utils import Utils 4 | from gutenbergpy.parse.cachefields import Fields 5 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 6 | 7 | import sqlite3 8 | import os 9 | 10 | 11 | ## 12 | # SQLite cache implementation 13 | class SQLiteCache(Cache): 14 | 15 | 16 | def __init__(self): 17 | self.cursor = None 18 | self.connection = None 19 | self.table_map = [None] * Fields.FIELD_COUNT 20 | 21 | self.table_map[Fields.TITLE] = 'titles' 22 | self.table_map[Fields.SUBJECT] = 'subjects' 23 | self.table_map[Fields.TYPE] = 'types' 24 | self.table_map[Fields.LANGUAGE] = 'languages' 25 | self.table_map[Fields.AUTHOR] = 'authors' 26 | self.table_map[Fields.BOOKSHELF]= 'bookshelves' 27 | self.table_map[Fields.FILES] = '/---/' 28 | self.table_map[Fields.PUBLISHER]= 'publishers' 29 | self.table_map[Fields.RIGHTS] = 'rights' 30 | ## 31 | # Files are package data 32 | SQLiteCache.DB_CREATE_CACHE_FILENAME = 'gutenbergindex.db.sql' 33 | SQLiteCache.DB_CREATE_CACHE_INDICES_FILENAME = 'gutenbergindex_indices.db.sql' 34 | 35 | this_dir, this_filename = os.path.split(__file__) 36 | SQLiteCache.DB_CREATE_CACHE_FILENAME = os.path.join(this_dir, SQLiteCache.DB_CREATE_CACHE_FILENAME) 37 | SQLiteCache.DB_CREATE_CACHE_INDICES_FILENAME = os.path.join(this_dir, SQLiteCache.DB_CREATE_CACHE_INDICES_FILENAME) 38 | 39 | ## 40 | # Insert many helper function 41 | def __insert_many_field(self, table, field, theSet): 42 | if len(theSet): 43 | query = 'INSERT OR IGNORE INTO %s(%s) VALUES (?)' % (table,field) 44 | self.cursor.executemany(query,map(lambda x: (x,) , theSet)) 45 | 46 | ## 47 | # Insert many 2 fields helper function 48 | def __insert_many_field_id(self, table, field1, field2, theSet): 49 | if len(theSet): 50 | query = 'INSERT OR IGNORE INTO %s(%s, %s) VALUES (?,?)' % (table,field1,field2) 51 | insert_array = map(lambda x: (x[0],x[1]) , theSet) 52 | self.cursor.executemany(query,insert_array) 53 | 54 | ## 55 | # Insert in link table 56 | def __insertLinks(self,ids,tablename,link1name,link2name): 57 | if len(list(ids)): 58 | query = "INSERT INTO %s(%s,%s) VALUES (?,?)" % (tablename,link1name,link2name) 59 | self.cursor.executemany(query, ids) 60 | 61 | ## 62 | # Create the SQL cache 63 | def create_cache(self, parse_results): 64 | self.connection = sqlite3.connect(GutenbergCacheSettings.CACHE_FILENAME) 65 | self.cursor = self.connection.cursor() 66 | 67 | # noinspection PyUnresolvedReferences 68 | create_query = open(SQLiteCache.DB_CREATE_CACHE_FILENAME, 'r').read() 69 | self.cursor.executescript(create_query) 70 | self.connection.commit() 71 | 72 | for idx,pt in enumerate(parse_results.field_sets): 73 | if idx == Fields.FILES: 74 | self.__insert_many_field('downloadlinkstype', 'name', pt.setTypes) 75 | self.cursor.executemany( 76 | 'INSERT OR IGNORE INTO downloadlinks(name,bookid,downloadtypeid) VALUES (?,?,?)' 77 | , map(lambda x: (x[0], x[1], x[2]), parse_results.field_sets[Fields.FILES].setLinks)) 78 | 79 | elif pt.needs_book_id(): 80 | self.__insert_many_field_id(self.table_map[idx], 'name', 'bookid', pt.set) 81 | else: 82 | self.__insert_many_field(self.table_map[idx], 'name', pt.set) 83 | 84 | 85 | total = len(parse_results.books) 86 | 87 | for idx, book in enumerate(parse_results.books): 88 | Utils.update_progress_bar("SQLite progress" ,idx,total) 89 | book_id = idx +1 90 | self.__insertLinks(list(map(lambda x: (x,book_id) , book.authors_id)),'book_authors','authorid','bookid') 91 | self.__insertLinks(list(map(lambda x: (x,book_id) , book.subjects_id)),'book_subjects','subjectid','bookid') 92 | 93 | self.cursor.execute("INSERT OR IGNORE INTO books(publisherid,dateissued,rightsid,numdownloads,languageid,bookshelveid,gutenbergbookid,typeid) " 94 | "VALUES (?,?,?,?,?,?,?,?)" , (book.publisher_id, book.date_issued, book.rights_id, 95 | book.num_downloads,book.language_id,book.bookshelf_id,book.gutenberg_book_id,book.type_id)) 96 | 97 | self.connection.commit() 98 | 99 | # noinspection PyUnresolvedReferences 100 | create_indices_query = open(SQLiteCache.DB_CREATE_CACHE_INDICES_FILENAME, 'r').read() 101 | self.cursor.executescript(create_indices_query) 102 | self.connection.commit() 103 | 104 | 105 | self.connection.close() 106 | 107 | ## 108 | # Query function implementation 109 | def query(self,**kwargs): 110 | class HelperQuery: 111 | def __init__(self, tables, query_struct): 112 | self.tables = tables 113 | self.query_struct = query_struct 114 | helpers=[ 115 | HelperQuery(['languages'], ('languages.id = books.languageid', 'languages.name', 116 | kwargs['languages'] if 'languages' in kwargs else None)), 117 | HelperQuery(['authors', 'book_authors'], 118 | ('authors.id = book_authors.authorid and books.id = book_authors.bookid', 'authors.name', 119 | kwargs['authors'] if 'authors' in kwargs else None)), 120 | HelperQuery(['types'],('books.typeid = types.id', 'types.name', 121 | kwargs['types'] if 'types' in kwargs else None)), 122 | HelperQuery(['titles'],('titles.bookid = books.id', 'titles.name', 123 | kwargs['titles'] if 'titles' in kwargs else None)), 124 | HelperQuery(['subjects', 'book_subjects'], 125 | ('subjects.id = book_subjects.bookid and books.id = book_subjects.subjectid ', 'subjects.name', 126 | kwargs['subjects'] if 'subjects' in kwargs else None)), 127 | HelperQuery(['publishers'], 128 | ('publishers.id = books.publisherid', 'publishers.name', 129 | kwargs['publishers'] if 'publishers' in kwargs else None)), 130 | HelperQuery(['bookshelves'], 131 | ('bookshelves.id = books.bookshelveid', 'bookshelves.name', 132 | kwargs['bookshelves'] if 'bookshelves' in kwargs else None)), 133 | HelperQuery(['downloadlinks','downloadlinkstype'], 134 | ('downloadlinks.downloadtypeid = downloadlinkstype.id and downloadlinks.bookid = books.id', 'downloadlinkstype.name', 135 | kwargs['downloadtype'] if 'downloadtype' in kwargs else None)) 136 | ] 137 | runtime = list(filter(lambda x: x.query_struct[2] , helpers)) 138 | 139 | query = "SELECT DISTINCT books.gutenbergbookid FROM books" 140 | for q in runtime: 141 | query = "%s,%s"% (query ,','.join(map(str, q.tables))) 142 | query = "%s WHERE " % query 143 | 144 | for idx,q in enumerate(runtime): 145 | query = "%s %s and %s in (%s) " % (query,q.query_struct[0],q.query_struct[1],','.join(map(lambda x: "'%s'"%(str(x)), q.query_struct[2]))) 146 | if idx != len(runtime) -1: 147 | query = "%s and " % query 148 | 149 | res = [] 150 | for row in self.native_query(query): 151 | res.append(int(row[0])) 152 | 153 | return res 154 | ## 155 | # Native query function implementation 156 | def native_query(self,sql_query): 157 | if self.cursor is None or self.connection is None: 158 | self.connection = sqlite3.connect(GutenbergCacheSettings.CACHE_FILENAME) 159 | self.cursor = self.connection.cursor() 160 | 161 | return self.cursor.execute(sql_query) 162 | -------------------------------------------------------------------------------- /gutenbergpy/textget.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # encoding=utf8 3 | from __future__ import absolute_import, unicode_literals 4 | 5 | import gzip 6 | import os 7 | import errno 8 | import http.client 9 | from contextlib import closing 10 | from future.standard_library import install_aliases 11 | install_aliases() 12 | from urllib.request import urlopen 13 | from urllib.parse import urlparse 14 | import chardet 15 | from gutenbergpy.gutenbergcachesettings import GutenbergCacheSettings 16 | 17 | class UnknownDownloadUri(Exception): 18 | """location cannot be found for a text""" 19 | 20 | ## 21 | # MARKERS ARE FROM https://github.com/c-w/Gutenberg/blob/master/gutenberg/_domain_model/text.py 22 | 23 | TEXT_START_MARKERS = frozenset(( 24 | "*END*THE SMALL PRINT", 25 | "*** START OF THE PROJECT GUTENBERG", 26 | "*** START OF THIS PROJECT GUTENBERG", 27 | "This etext was prepared by", 28 | "E-text prepared by", 29 | "Produced by", 30 | "Distributed Proofreading Team", 31 | "Proofreading Team at http://www.pgdp.net", 32 | "http://gallica.bnf.fr)", 33 | " http://archive.org/details/", 34 | "http://www.pgdp.net", 35 | "by The Internet Archive)", 36 | "by The Internet Archive/Canadian Libraries", 37 | "by The Internet Archive/American Libraries", 38 | "public domain material from the Internet Archive", 39 | "Internet Archive)", 40 | "Internet Archive/Canadian Libraries", 41 | "Internet Archive/American Libraries", 42 | "material from the Google Print project", 43 | "*END THE SMALL PRINT", 44 | "***START OF THE PROJECT GUTENBERG", 45 | "This etext was produced by", 46 | "*** START OF THE COPYRIGHTED", 47 | "The Project Gutenberg", 48 | "http://gutenberg.spiegel.de/ erreichbar.", 49 | "Project Runeberg publishes", 50 | "Beginning of this Project Gutenberg", 51 | "Project Gutenberg Online Distributed", 52 | "Gutenberg Online Distributed", 53 | "the Project Gutenberg Online Distributed", 54 | "Project Gutenberg TEI", 55 | "This eBook was prepared by", 56 | "http://gutenberg2000.de erreichbar.", 57 | "This Etext was prepared by", 58 | "This Project Gutenberg Etext was prepared by", 59 | "Gutenberg Distributed Proofreaders", 60 | "Project Gutenberg Distributed Proofreaders", 61 | "the Project Gutenberg Online Distributed Proofreading Team", 62 | "**The Project Gutenberg", 63 | "*SMALL PRINT!", 64 | "More information about this book is at the top of this file.", 65 | "tells you about restrictions in how the file may be used.", 66 | "l'authorization à les utilizer pour preparer ce texte.", 67 | "of the etext through OCR.", 68 | "*****These eBooks Were Prepared By Thousands of Volunteers!*****", 69 | "We need your donations more than ever!", 70 | " *** START OF THIS PROJECT GUTENBERG", 71 | "**** SMALL PRINT!", 72 | '["Small Print" V.', 73 | ' (http://www.ibiblio.org/gutenberg/', 74 | 'and the Project Gutenberg Online Distributed Proofreading Team', 75 | 'Mary Meehan, and the Project Gutenberg Online Distributed Proofreading', 76 | ' this Project Gutenberg edition.', 77 | )) 78 | 79 | TEXT_END_MARKERS = frozenset(( 80 | "*** END OF THE PROJECT GUTENBERG", 81 | "*** END OF THIS PROJECT GUTENBERG", 82 | "***END OF THE PROJECT GUTENBERG", 83 | "End of the Project Gutenberg", 84 | "End of The Project Gutenberg", 85 | "Ende dieses Project Gutenberg", 86 | "by Project Gutenberg", 87 | "End of Project Gutenberg", 88 | "End of this Project Gutenberg", 89 | "Ende dieses Projekt Gutenberg", 90 | " ***END OF THE PROJECT GUTENBERG", 91 | "*** END OF THE COPYRIGHTED", 92 | "End of this is COPYRIGHTED", 93 | "Ende dieses Etextes ", 94 | "Ende dieses Project Gutenber", 95 | "Ende diese Project Gutenberg", 96 | "**This is a COPYRIGHTED Project Gutenberg Etext, Details Above**", 97 | "Fin de Project Gutenberg", 98 | "The Project Gutenberg Etext of ", 99 | "Ce document fut presente en lecture", 100 | "Ce document fut présenté en lecture", 101 | "More information about this book is at the top of this file.", 102 | "We need your donations more than ever!", 103 | "END OF PROJECT GUTENBERG", 104 | " End of the Project Gutenberg", 105 | " *** END OF THIS PROJECT GUTENBERG", 106 | )) 107 | 108 | LEGALESE_START_MARKERS = frozenset(("<= 100: 195 | # Check if the footer begins here 196 | if any(line.startswith(token.encode('utf-8')) for token in TEXT_END_MARKERS): 197 | footer_found = True 198 | 199 | # If it's the beginning of the footer, stop output 200 | if footer_found: 201 | break 202 | 203 | if any(line.startswith(token.encode('utf-8')) for token in LEGALESE_START_MARKERS): 204 | ignore_section = True 205 | continue 206 | elif any(line.startswith(token.encode('utf-8')) for token in LEGALESE_END_MARKERS): 207 | ignore_section = False 208 | continue 209 | 210 | if not ignore_section: 211 | stripline = line.rstrip(sep) 212 | out.append(stripline) 213 | i += 1 214 | 215 | return sep.join(out) 216 | --------------------------------------------------------------------------------