├── .gitattributes ├── .gitignore ├── .python-version ├── README.md ├── bookworm.py ├── eloquent_bookworm.py ├── requirements.txt └── style └── style.css /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Source files 5 | # ============ 6 | *.pxd text 7 | *.py text 8 | *.py3 text 9 | *.pyw text 10 | *.pyx text 11 | 12 | # Binary files 13 | # ============ 14 | *.db binary 15 | *.p binary 16 | *.pkl binary 17 | *.pyc binary 18 | *.pyd binary 19 | *.pyo binary 20 | 21 | # Note: .db, .p, and .pkl files are associated 22 | # with the python modules ``pickle``, ``dbm.*``, 23 | # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` 24 | # (among others). 25 | 26 | # Standard to msysgit 27 | *.doc diff=astextplain 28 | *.DOC diff=astextplain 29 | *.docx diff=astextplain 30 | *.DOCX diff=astextplain 31 | *.dot diff=astextplain 32 | *.DOT diff=astextplain 33 | *.pdf diff=astextplain 34 | *.PDF diff=astextplain 35 | *.rtf diff=astextplain 36 | *.RTF diff=astextplain 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files that must not be commited 2 | # - Local html files 3 | *.html 4 | *.htm 5 | 6 | # Epub files should not be commited 7 | *.epub 8 | 9 | #folders to ignore 10 | backup/ 11 | 12 | # Windows image file caches 13 | Thumbs.db 14 | ehthumbs.db 15 | 16 | # Folder config file 17 | Desktop.ini 18 | 19 | # Recycle Bin used on file shares 20 | $RECYCLE.BIN/ 21 | 22 | # Windows Installer files 23 | *.cab 24 | *.msi 25 | *.msm 26 | *.msp 27 | 28 | # Windows shortcuts 29 | *.lnk 30 | 31 | # books' HTML files 32 | downloads/ 33 | 34 | # ========================= 35 | # Operating System Files 36 | # ========================= 37 | 38 | # OSX 39 | # ========================= 40 | 41 | .DS_Store 42 | .AppleDouble 43 | 
.LSOverride 44 | 45 | # Thumbnails 46 | ._* 47 | 48 | # Files that might appear in the root of a volume 49 | .DocumentRevisions-V100 50 | .fseventsd 51 | .Spotlight-V100 52 | .TemporaryItems 53 | .Trashes 54 | .VolumeIcon.icns 55 | 56 | # Directories potentially created on remote AFP share 57 | .AppleDB 58 | .AppleDesktop 59 | Network Trash Folder 60 | Temporary Items 61 | .apdisk 62 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 2.7.11 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # BookCreator3 - BookCreator for Python 3.5 3 | # BookCreator 4 | A scraper that takes an online book from O'Reilly and turns it into an EPUB book, because I want to read them on my Nook, away from my computer, and since I know Python I figured it could not be that hard... 5 | 6 | # HOW TO INSTALL! 7 | 8 | * Dependencies: 9 | - LXML 10 | - EbookLib 11 | - BeautifulSoup [bs4] 12 | 13 | It is a bit tricky... you need to install LXML; if you are using Windows I suggest you use the precompiled module [ www.lfd.uci.edu/~gohlke/pythonlibs/#lxml ] 14 | To install EbookLib you will have to install it from the project on GitHub. Very easy: just download the master file [ https://github.com/aerkalov/ebooklib ], open the zip file and run your favorite installer; my tool of the trade is pip, so I run "pip install setup.py". 15 | And finally, run "pip install bs4" and that is all. 16 | 17 | ## It is quite simple actually 18 | The script is "bookworm.py"; it uses urllib2 and bs4 to look up the HTML file on the web, download only the important part of it, capture the chapter name, and sum it all up to create an epub file using ebooklib (awesome lib, this one! 
I'll probably create one that reads docstrings and creates the documentation of a python script with it!) 19 | 20 | The script itself depends on the html file you want to read, and it must be online, it can work the chapters with local files but if a chapter is missing it will automatically copy the html and write it locally with the same name ofthe html it was reading. 21 | 22 | ##Features 23 | * Easy to use 24 | * Fast to modify to download other books 25 | * Can be used to turn blogs into books! (requires a little bit of tinkering, contact me if you need help) 26 | * Reliable 27 | * Python 3.5! <3 28 | -------------------------------------------------------------------------------- /bookworm.py: -------------------------------------------------------------------------------- 1 | # BookCreator Script "BookWorm" 2 | # 3 | # Created by Lucas Scoppio 4 | # Twitter @developercoppio 5 | # 6 | # Do what you want cuz a pirate is free! YARRR!!! 7 | # But don't forget to be nice, the world is lacking nice people 8 | # 9 | 10 | from bs4 import BeautifulSoup 11 | from ebooklib import epub 12 | from requests import get 13 | import time 14 | import argparse 15 | import os 16 | 17 | """ Slugfy imports """ 18 | import re 19 | import unicodedata 20 | 21 | 22 | BASE_PATH = os.path.dirname(os.path.abspath(__file__)) 23 | DOWNLOADS_PATH = os.path.join(BASE_PATH, 'downloads') 24 | if not os.path.isdir(DOWNLOADS_PATH): 25 | print( 'CREATING DOWNLOADS_PATH ({})'.format(DOWNLOADS_PATH)) 26 | os.mkdir(DOWNLOADS_PATH) 27 | 28 | STYLE_PATH = os.path.join(BASE_PATH, 'style') 29 | if not os.path.isdir(STYLE_PATH): 30 | print( 'CREATING STYLE_PATH ({})'.format(STYLE_PATH)) 31 | os.mkdir(STYLE_PATH) 32 | 33 | VERBOSE = False 34 | LANGUAGE = "en" 35 | _style_ = "style.css" 36 | 37 | 38 | def slugify(string): 39 | """A generic slugifier utility (currently only for Latin-based scripts). 40 | 41 | Slugify a unicode string. 
42 | 43 | Example: 44 | 45 | >>> slugify(u"Héllø Wörld") 46 | u"hello-world" 47 | 48 | """ 49 | 50 | return re.sub(r'[-\s]+', '-', 51 | (re.sub(r'[^\w\s-]', '',string).strip().lower())) 52 | 53 | 54 | def getDataForEbook(url): 55 | """ 56 | For now the url must be of the index of an oreilly internet ebook 57 | I plan to create a template file that will allow this script to read from just about 58 | any blog or website and turn it into an ebook. 59 | with the URL the script will look for the webpage and load it into memory to create 60 | the book Table of Contents, and after that it will create each chapter separately in its 61 | own folder, and to finish it up, it will wrap all into a single epub file. 62 | 63 | chapters type: array[str] 64 | var: It will hold the information of all the chapters of the book 65 | May in the future become a problem if the amount of data is too large 66 | for it to handle 67 | 68 | authors type: array[str] 69 | var: Keeps the names of the authors 70 | 71 | links type: array[str] 72 | var: holds the links of every chapter for the ebook 73 | 74 | book type: set{} 75 | var: Container for many important metadata for the ebook 76 | 77 | book_slug type: unicode 78 | var: slugify the url 79 | 80 | book_download_path 81 | type: str 82 | var: the path of the download folder for the book to be created 83 | 84 | eBook type: ebooklib 85 | var: constructor of the ebook 86 | """ 87 | #creation of the variables necessary to create the ebook 88 | chapters = [''] 89 | authors = [] 90 | links = [] 91 | book = {} 92 | 93 | # first it will drop "http[s]://" and "index.html", if present: 94 | simplified_url = url.split('://')[-1].split('index.html')[0] 95 | if VERBOSE: 96 | print( 'simplified url:', simplified_url) 97 | #then we will create the book folder... 
turns out it has to be unicode, so we fix that here 98 | book_slug = slugify(simplified_url)#slugify(unicode(simplified_url, "utf-8")) 99 | book_download_path = os.path.join(DOWNLOADS_PATH, book_slug) 100 | #in case the book folder is not present, it will create one. 101 | if not os.path.isdir(book_download_path): 102 | os.mkdir(book_download_path) 103 | if VERBOSE: 104 | print( 'CREATING book_download_path ({})'.format(book_download_path)) 105 | 106 | #Creating eBook creator 107 | eBook = epub.EpubBook() 108 | #Capturing the url to run BS4 on it 109 | resp = get_page(url) 110 | soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8") 111 | 112 | #url_root is the root of the book, where you find the table of contents (the link for all the chapters) 113 | url_root = url[:url.index("index")] 114 | #now we need to find the title of the book, usually it is an h1 with class "title" 115 | 116 | book["Title"] = soup.find('h1', class_="title").getText() 117 | 118 | #capture the authors of the book and put all to the authors ina variable to put into the metadata 119 | for author in soup.find_all("h3", class_="author"): 120 | authors.append(author.getText()) 121 | #this is the metadata 122 | book["Authors"] = authors 123 | #load the whole section "table of contents" (toc) into the container 124 | book["TOC"] = str(soup.find('div', class_="toc")) 125 | 126 | #creates the TOC.html of the book 127 | with open(os.path.join(book_download_path, "TOC.html"), "w") as text_file: 128 | text_file.write("\n") 129 | text_file.write(book["TOC"]) 130 | 131 | #to select the chapters it will look inside the TOC for links for chapters 132 | #those are prepared to capture only the chapters without the # markups and 133 | #only following the ORilley chapter names. 
134 | for link in soup.find('div', class_="toc").find_all('a', href=True): 135 | if "#" not in link['href']: 136 | if 'pr' in link['href']: 137 | links.append(link['href']) 138 | 139 | if 'ch' in link['href']: 140 | links.append(link['href']) 141 | 142 | #setup the metadata 143 | eBook.set_identifier(book["Title"]) 144 | eBook.set_title(book["Title"]) 145 | eBook.set_language(LANGUAGE) 146 | #adding the authors into ebook metadata 147 | for author in book["Authors"]: 148 | eBook.add_author(author) 149 | 150 | #look for the files inside the book downloaded path 151 | f_ = os.listdir(book_download_path) 152 | #and then run the links looking for each one inside the local path looking for files missing. 153 | for link in links: 154 | if link in f_: 155 | print( "Local file found:", link) 156 | with open(os.path.join(book_download_path, link), "r") as text_file: 157 | resp = text_file.read() 158 | else: 159 | print( "Downloading file:", link) 160 | resp = get_page(url_root + link) 161 | 162 | soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8") 163 | 164 | try: 165 | c = epub.EpubHtml(title=soup.find('h1', class_="title").getText(), file_name=link, lang='en') 166 | c.content = createChapter(url_root, link, book_download_path, resp) 167 | chapters.append(c) 168 | eBook.add_item(c) 169 | except AttributeError: 170 | c = epub.EpubHtml(title=soup.find('h2', class_="title").getText(), file_name=link, lang='en') 171 | c.content = createChapter(url_root, link, book_download_path, resp) 172 | chapters.append(c) 173 | eBook.add_item(c) 174 | 175 | eBook.toc = chapters 176 | 177 | eBook.add_item(epub.EpubNcx()) 178 | eBook.add_item(epub.EpubNav()) 179 | 180 | # define css style 181 | style = "" 182 | with open(os.path.join(STYLE_PATH, _style_), "r") as text_file: 183 | style = text_file.read() 184 | 185 | if VERBOSE: 186 | print( "Applying style", _style_) 187 | # add css file 188 | nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", 
content=style) 189 | eBook.add_item(nav_css) 190 | 191 | # create spine 192 | eBook.spine = chapters 193 | time_elapsed = time.time() 194 | if VERBOSE: 195 | print( "Starting book creation...") 196 | # create epub file 197 | epub.write_epub(os.path.join( DOWNLOADS_PATH, book["Title"] + '.epub'), eBook, {}) 198 | print( "Done,", os.path.join( DOWNLOADS_PATH, book["Title"] + '.epub'), "created!") 199 | print( "Time elapsed", time.time() - time_elapsed) 200 | 201 | def get_page(url): 202 | """ loads a webpage into a string, reading it in chunks """ 203 | src = '' 204 | #req = urllib2.Request(url) 205 | if VERBOSE: 206 | print( "GET PAGE:", url) 207 | #we have to try it, there is no other way to do it properly, because we may have timeouts 208 | try: 209 | result = get(url) 210 | src = result.content 211 | except IOError: 212 | #in case of url error we throw the error message 213 | print( 'can\'t open', url) 214 | except Exception as e: 215 | import traceback 216 | print('generic exception:', e) 217 | 218 | #return the content in src 219 | 220 | return src 221 | 222 | 223 | def createChapter(url, chapter, book_download_path, response): 224 | """ 225 | The url root is received, the name of the chapter is also taken to reach the correct page. 226 | Chapter is both the chapter and file extension - ex.: chapter01.html 227 | 228 | book_download_path is the path for the folder for the specific book. 
229 | """ 230 | 231 | #after loading it to response, the data is turn into string and loaded to webContent 232 | webContent = str(response) 233 | #Unfortunatelly bs4 and urllib2 are causing problems to load only stuff inside the special markups 234 | #the best way to deal with it is directly manipulating the strings with native string operations 235 | # 236 | a = webContent.index(TAG) - 1 237 | b = webContent.index("") + (len(TAG)+3) 238 | chunk = webContent[a:b] 239 | chunk = chunk.replace(url, "") 240 | # 241 | if VERBOSE: 242 | print( "Writing", chapter, "to memory") 243 | #After all of that, the juicy stuff is written to a file that can later be modified 244 | with open(os.path.join(book_download_path, chapter), "w") as text_file: 245 | text_file.write(chunk) 246 | #Return the juice to the caller 247 | return chunk 248 | 249 | if __name__ == '__main__': 250 | #EXAMPLE LINKS TO USE 251 | # http://eloquentjavascript.net/index.html next try 252 | # http://chimera.labs.oreilly.com/books/1234000000754/index.html 253 | # http://chimera.labs.oreilly.com/books/1230000000393/index.html 254 | # http://chimera.labs.oreilly.com/books/1230000000393/index.html 255 | parser = argparse.ArgumentParser() 256 | parser.add_argument("--url", help="url of the webpage to be converted into ebook", 257 | default="http://chimera.labs.oreilly.com/books/1234000000754/index.html") 258 | parser.add_argument("-l", "--language", help="select ebook language, defaulte is 'en'", 259 | default="en") 260 | parser.add_argument("-v", "--verbose", help="increase output verbosity", 261 | action="store_true", default=True) 262 | parser.add_argument("-t", "--tag", help="Tag used in the webpage to define the start/end of the text of interest", 263 | default="section") 264 | parser.add_argument("-css", "--style", help="css file inside style folder to change the ebook formating style", 265 | default="style.css") 266 | args = parser.parse_args() 267 | 268 | _style_ = "style.css" #args['style'] 269 | TAG = 
"section" # args['tag'] 270 | LANGUAGE = "en" # args['language'] 271 | VERBOSE = True # args['verbose'] 272 | 273 | if VERBOSE: 274 | print( "verbosity turned on") 275 | print( "Style loaded:", _style_) 276 | print( "Language loaded:", LANGUAGE) 277 | print( "Tag for section is", TAG) 278 | 279 | print( "Running", os.path.basename(__file__)) 280 | getDataForEbook("http://chimera.labs.oreilly.com/books/1234000000754/index.html") # args['url'] -------------------------------------------------------------------------------- /eloquent_bookworm.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from ebooklib import epub 3 | from slugify import slugify 4 | import urllib2 5 | import time 6 | import argparse 7 | import os 8 | 9 | BASE_PATH = os.path.dirname(os.path.abspath(__file__)) 10 | DOWNLOADS_PATH = os.path.join(BASE_PATH, 'downloads') 11 | if not os.path.isdir(DOWNLOADS_PATH): 12 | print 'CREATING DOWNLOADS_PATH ({})'.format(DOWNLOADS_PATH) 13 | os.mkdir(DOWNLOADS_PATH) 14 | 15 | STYLE_PATH = os.path.join(BASE_PATH, 'style') 16 | if not os.path.isdir(STYLE_PATH): 17 | print 'CREATING STYLE_PATH ({})'.format(STYLE_PATH) 18 | os.mkdir(STYLE_PATH) 19 | 20 | VERBOSE = False 21 | LANGUAGE = "en" 22 | STYLE = "style.css" 23 | 24 | def getDataForEbook(url): 25 | """ 26 | For now the url must be of the index of an oreilly internet ebook 27 | I plan to create a template file that will allow this script to read from just about 28 | any blog or website and turn it into an ebook. 29 | with the URL the script will look for the webpage and load it into memory to create 30 | the book Table of Contents, and after that it will create each chapter separately in its 31 | own folder, and to finish it up, it will wrap all into a single epub file. 
32 | 33 | chapters type: array[str] 34 | var: It will hold the information of all the chapters of the book 35 | May in the future become a problem if the amount of data is too large 36 | for it to handle 37 | 38 | authors type: array[str] 39 | var: Keeps the names of the authors 40 | 41 | links type: array[str] 42 | var: holds the links of every chapter for the ebook 43 | 44 | book type: set{} 45 | var: Container for many important metadata for the ebook 46 | 47 | book_slug type: unicode 48 | var: slugify the url 49 | 50 | book_download_path 51 | type: str 52 | var: the path of the download folder for the book to be created 53 | 54 | eBook type: ebooklib 55 | var: constructor of the ebook 56 | """ 57 | #creation of the variables necessary to create the ebook 58 | chapters = [''] 59 | authors = [] 60 | links = [] 61 | book = {} 62 | 63 | # first it will drop "http[s]://" and "index.html", if present: 64 | simplified_url = url.split('://')[-1].split('index.html')[0] 65 | if VERBOSE: 66 | print 'simplified url:', simplified_url 67 | #then we will create the book folder... turns out it has to be unicode, so we fix that here 68 | book_slug = slugify(unicode(simplified_url, "utf-8")) 69 | book_download_path = os.path.join(DOWNLOADS_PATH, book_slug) 70 | #in case the book folder is not present, it will create one. 
71 | if not os.path.isdir(book_download_path): 72 | os.mkdir(book_download_path) 73 | if VERBOSE: 74 | print 'CREATING book_download_path ({})'.format(book_download_path) 75 | 76 | #Creating eBook creator 77 | eBook = epub.EpubBook() 78 | #Capturing the url to run BS4 on it 79 | resp = get_page(url) 80 | soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8") 81 | 82 | #url_root is the root of the book, where you find the table of contents (the link for all the chapters) 83 | url_root = url[:url.index("index")] 84 | #now we need to find the title of the book, usually it is an h1 with class "title" 85 | book["Title"] = soup.find('h1').getText() 86 | #this is the metadata 87 | book["Authors"] = "Marijn Haverbeke" 88 | #load the whole section "table of contents" (toc) into the container 89 | book["TOC"] = str(soup.find('ol', class_="toc")) 90 | 91 | #creates the TOC.html of the book 92 | with open(os.path.join(book_download_path, "TOC.html"), "w") as text_file: 93 | text_file.write("\n") 94 | text_file.write(book["TOC"]) 95 | 96 | #to select the chapters it will look inside the TOC for links for chapters 97 | #those are prepared to capture only the chapters without the # markups and 98 | #only following the ORilley chapter names. 99 | for link in soup.find('ol', class_="toc").find_all('a', href=True): 100 | links.append(link['href']) 101 | 102 | #setup the metadata 103 | eBook.set_identifier(book["Title"]) 104 | eBook.set_title(book["Title"]) 105 | eBook.set_language(LANGUAGE) 106 | #adding the authors into ebook metadata 107 | for author in book["Authors"]: 108 | eBook.add_author(author) 109 | 110 | #look for the files inside the book downloaded path 111 | f_ = os.listdir(book_download_path) 112 | #and then run the links looking for each one inside the local path looking for files missing. 
113 | for link in links: 114 | if link in f_: 115 | print "Local file found:", link 116 | with open(os.path.join(book_download_path, link), "r") as text_file: 117 | resp = text_file.read() 118 | else: 119 | print "Downloading file:", link 120 | resp = get_page(url_root + link) 121 | 122 | soup = BeautifulSoup(resp, "lxml", from_encoding="UTF-8") 123 | 124 | try: 125 | c = epub.EpubHtml(title=soup.find('h1').getText(), file_name=link, lang='en') 126 | c.content = createChapter(url_root, link, book_download_path, resp) 127 | chapters.append(c) 128 | eBook.add_item(c) 129 | except AttributeError: 130 | c = epub.EpubHtml(title=soup.find('h2').getText(), file_name=link, lang='en') 131 | c.content = createChapter(url_root, link, book_download_path, resp) 132 | chapters.append(c) 133 | eBook.add_item(c) 134 | 135 | eBook.toc = chapters 136 | 137 | eBook.add_item(epub.EpubNcx()) 138 | eBook.add_item(epub.EpubNav()) 139 | 140 | # define css style 141 | style = "" 142 | with open(os.path.join(STYLE_PATH, STYLE), "r") as text_file: 143 | style = text_file.read() 144 | 145 | if VERBOSE: 146 | print "Applying style", STYLE 147 | # add css file 148 | nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style) 149 | eBook.add_item(nav_css) 150 | 151 | # create spine 152 | eBook.spine = chapters 153 | time_elapsed = time.time() 154 | if VERBOSE: 155 | print "Starting book creation..." 156 | # create epub file 157 | epub.write_epub(book["Title"] + '.epub', eBook, {}) 158 | print "Done,", book["Title"] + '.epub', "created!" 
159 | print "Time elapsed", time.time() - time_elapsed 160 | 161 | def get_page(url): 162 | """ loads a webpage into a string, reading it in chunks """ 163 | src = '' 164 | req = urllib2.Request(url) 165 | #we have to try it, there is no other way to do it properly, because we may have timeouts 166 | if VERBOSE: 167 | print "GET PAGE", url 168 | 169 | try: 170 | response = urllib2.urlopen(req) 171 | chunk = True 172 | #chunk becomes true and than becomes "1024" characters with response.read 173 | while chunk: 174 | chunk = response.read(1024) 175 | #after that it will read and "delete" response until it runs out and become False 176 | #and keeps putting the data into src 177 | src += chunk 178 | #use Close to stop the connection to the web 179 | response.close() 180 | except IOError: 181 | #in case of url error we throw the error message 182 | print 'can\'t open', url 183 | 184 | #return the content in src 185 | return src 186 | 187 | 188 | def createChapter(url, chapter, book_download_path, resp): 189 | """ 190 | The url root is received, the name of the chapter is also taken to reach the correct page. 191 | Chapter is both the chapter and file extension - ex.: chapter01.html 192 | 193 | book_download_path is the path for the folder for the specific book. 
194 | """ 195 | #The chapter will be read by urllib2 and the "juice" will be written to response 196 | response = resp 197 | #response = get_page(url+chapter) 198 | #after loading it to response, the data is turn into string and loaded to webContent 199 | webContent = str(response) 200 | #Unfortunatelly bs4 and urllib2 are causing problems to load only stuff inside the special markups 201 | #the best way to deal with it is directly manipulating the strings with native string operations 202 | # 203 | a = webContent.index("article") - 1 204 | b = webContent.index("") + 10 205 | chunk = webContent[a:b] 206 | chunk = chunk.replace("http://eloquentjavascript.net/", "") 207 | # 208 | if VERBOSE: 209 | print "Writing", chapter, "to memory" 210 | #After all of that, the juicy stuff is written to a file that can later be modified 211 | with open(os.path.join(book_download_path, chapter), "w") as text_file: 212 | text_file.write(chunk) 213 | #Return the juice to the caller 214 | return chunk 215 | 216 | 217 | if __name__ == '__main__': 218 | #EXAMPLE LINKS TO USE 219 | # http://eloquentjavascript.net/index.html next try 220 | # http://chimera.labs.oreilly.com/books/1234000000754/index.html 221 | # http://chimera.labs.oreilly.com/books/1230000000393/index.html 222 | 223 | parser = argparse.ArgumentParser() 224 | parser.add_argument("--url", help="url of the webpage to be converted into ebook", 225 | default="http://eloquentjavascript.net/index.html") 226 | parser.add_argument("-l", "--language", help="select ebook language, defaulte is 'en'", 227 | default="en") 228 | parser.add_argument("-v", "--verbose", help="increase output verbosity", 229 | action="store_true", default=True) 230 | parser.add_argument("-t", "--tag", help="Tag used in the webpage to define the start/end of the text of interest", 231 | default="article") 232 | parser.add_argument("-css", "--style", help="css file inside style folder to change the ebook formating style", 233 | default="style.css") 234 | args 
= parser.parse_args() 235 | 236 | STYLE = args.style 237 | TAG = args.tag 238 | LANGUAGE = args.language 239 | VERBOSE = args.verbose 240 | if VERBOSE: 241 | print "verbosity turned on" 242 | print "Style loaded:", STYLE 243 | print "Language loaded:", LANGUAGE 244 | print "Tag for section is", TAG 245 | 246 | print "Running", os.path.basename(__file__) 247 | getDataForEbook(args.url) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bs4>=0.0.1 2 | EbookLib>=0.15 3 | python-slugify>=1.2.0 4 | -------------------------------------------------------------------------------- /style/style.css: -------------------------------------------------------------------------------- 1 | @namespace epub "http://www.idpf.org/2007/ops"; 2 | body { 3 | font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif; 4 | } 5 | h2 { 6 | text-align: left; 7 | text-transform: uppercase; 8 | font-weight: 200; 9 | } 10 | ol { 11 | list-style-type: none; 12 | } 13 | ol > li:first-child { 14 | margin-top: 0.3em; 15 | } 16 | nav[epub|type~='toc'] > ol > li > ol { 17 | list-style-type:square; 18 | } 19 | nav[epub|type~='toc'] > ol > li > ol > li { 20 | margin-top: 0.3em; 21 | } --------------------------------------------------------------------------------