├── .gitignore ├── README.md ├── getpy.py ├── main.py ├── manage.py ├── pdfpy.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.epub 3 | *.pdf 4 | venv/* 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EpubToPdf 2 | 3 | This program converts epub documents to pdf documents. 4 | 5 | 6 | **Installation** 7 | 8 | Clone this repository to your machine. 9 | 10 | 11 | **Requirements** 12 | 13 | To install the requirements, run the following command: 14 | 15 | ```pip install -r requirements.txt``` 16 | 17 | After this, you install the _wkhtmltopdf_ which is a dependency of the _pdfkit_ module. 18 | 19 | 20 | wkhtmltopdf file can be downloaded [here](https://wkhtmltopdf.org/downloads.html) 21 | 22 | Make sure to add wkhtlmtopdf as an executable in Windows environment paths. 23 | 24 | For Debian/Ubuntu users: 25 | 26 | 27 | ```$ sudo apt-get install wkhtmltopdf``` 28 | 29 | should do the trick. 30 | 31 | If you get a `QXcbConnection: Could not connect to display` error, check [this issue](https://github.com/JazzCore/python-pdfkit/issues/82). 32 | 33 | 34 | 35 | **Usage** 36 | 37 | Copy the epub file inside the repository folder when forked. 38 | 39 | Run the main.py file, adding the name of the epub file as a commandline argument. 40 | 41 | As shown below: 42 | 43 | ```python main.py epub-file-name``` 44 | -------------------------------------------------------------------------------- /getpy.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup as bs 2 | import os 3 | import re 4 | import ntpath 5 | 6 | 7 | class GetEngine(object): 8 | 9 | """ 10 | 11 | This class contains the methods needed to get the files, 12 | to help make the pdf file. 13 | 14 | The class contains the following methods: 15 | 16 | get_html() --- Which gets the html file names. 17 | 18 | get_pdf() --- Which gets the pdf file names. 19 | 20 | get_css() --- Which gets the css file names. 21 | 22 | get_images() --- Which gets the image file names. 23 | 24 | 25 | To create an instance of this object, pass in the name of the directory 26 | that stores all the extracted files from the epub file. 27 | 28 | 29 | """ 30 | 31 | def __init__(self, directory): 32 | self.html_files = [] 33 | self.css_files = [] 34 | self.image_files = [] 35 | self.directory = directory 36 | self.files = [] 37 | self.pdf_files = [] 38 | 39 | def get_html(self): 40 | 41 | for file in self.files: 42 | if file.endswith(".xhtml") or file.endswith(".html"): 43 | self.html_files.append(file) 44 | 45 | def get_pdf(self): 46 | 47 | for file in self.html_files: 48 | self.pdf_files.append("{}.pdf".format(self.html_files.index(file))) 49 | 50 | def get_css(self): 51 | 52 | for file in self.files: 53 | if file.endswith(".css"): 54 | self.css_files.append(file) 55 | 56 | def get_images(self): 57 | 58 | for file in self.files: 59 | if file.endswith((".png", ".jpg", ".gif")): 60 | self.image_files.append(file) 61 | 62 | def get_all(self): 63 | file = None 64 | directory_paths = [] 65 | for root, dirs, files in os.walk(self.directory): 66 | #This traverses the directory passed in as an argument, 67 | #returns the current directory, the sub directories and all the files 68 | directory_paths.append(root) 69 | if file: 70 | continue 71 | for each in files: 72 | if each.endswith(".opf"): 73 | file = os.path.join(root, each) 74 | continue 75 | if not file: 76 | return 77 | 78 | xml_content = open(file, "r").read() 79 | 80 | xml_tree = bs(xml_content, features = "xml") 81 | 82 | file_names = xml_tree.package.manifest.findAll('item') 83 | 84 | # Gets the name of all the documents in order 85 | # from the opf file, then saves the file name with its path 86 | # The file path in the opf file can't be relied upon 87 | # Hence, the need to extract file name and get its path 88 | 89 | for file in file_names: 90 | file_path_match = re.match(r'.+\.[a-zA-Z]+', file.get('href', '')) 91 | if not file_path_match: 92 | continue 93 | file_name = ntpath.basename(file_path_match.group()) 94 | for path in directory_paths: 95 | filepath = path + '/' + file_name 96 | if os.path.exists(filepath): 97 | self.files.append(filepath) 98 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from manage import FileManager 3 | from getpy import GetEngine 4 | from pdfpy import PdfEngine 5 | 6 | def process(): 7 | 8 | if sys.argv[1].endswith(".epub"): 9 | 10 | print('--- Epub to PDF conversion started') 11 | 12 | epub_file = sys.argv[1] 13 | file = FileManager(epub_file) 14 | file.epub_to_zip() 15 | file.get_directory() 16 | file.extract_zip() 17 | engine = GetEngine(file.directory) 18 | engine.get_all() 19 | engine.get_html() 20 | engine.get_pdf() 21 | engine.get_css() 22 | engine.get_images() 23 | pdf = PdfEngine(engine.html_files, engine.css_files, 24 | engine.pdf_files, file.directory) 25 | pdf.convert() 26 | pdf.combine() 27 | pdf.del_pdf() 28 | file.zip_to_epub() 29 | file.del_directory() 30 | 31 | print('--- Epub to PDF conversion successful') 32 | 33 | else: 34 | 35 | print("File is not an epub file") 36 | 37 | 38 | if __name__ == "__main__": 39 | process() 40 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | import shutil 4 | 5 | 6 | class FileManager(object): 7 | 8 | 9 | """ 10 | 11 | This class is used for file interactions. 12 | 13 | It has the following methods: 14 | 15 | epub_to_zip() --- Which converts the epub file to a zip file 16 | 17 | extract_zip() --- Which extracts the content of the zip file 18 | 19 | get_directory() --- Which gets the directory name where content of 20 | zip file was extracted 21 | 22 | zip_to_epub() --- Which converts the zip file back to epub 23 | 24 | del_directory() --- Which deletes the directory where zip files 25 | were extracted 26 | 27 | del_pdf() --- Which deletes the pdf files created by 28 | 29 | 30 | """ 31 | 32 | def __init__(self, epub_file): 33 | self.epub_file = epub_file 34 | self.zip_file = "{}.zip".format(epub_file.split(".epub")[0]) 35 | self.directory = "" 36 | 37 | 38 | def epub_to_zip(self): 39 | os.rename(self.epub_file, self.zip_file) 40 | 41 | 42 | def extract_zip(self): 43 | extracted_files = zipfile.ZipFile(self.zip_file) 44 | extracted_files.extractall(self.directory) 45 | extracted_files.close() 46 | 47 | def get_directory(self): 48 | minus_open_paren = self.epub_file.split(".epub")[0].replace("(", "") 49 | minus_close_paren = minus_open_paren.replace(")", "") 50 | self.directory = minus_close_paren.replace(" ", "") 51 | 52 | 53 | def zip_to_epub(self): 54 | os.rename(self.zip_file, self.epub_file) 55 | 56 | 57 | def del_directory(self): 58 | shutil.rmtree(self.directory) 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /pdfpy.py: -------------------------------------------------------------------------------- 1 | import pdfkit 2 | import os 3 | from PyPDF2 import PdfFileMerger 4 | from PyPDF2.utils import PdfReadError 5 | 6 | 7 | class PdfEngine(object): 8 | 9 | """ 10 | This class carries operations on pdf files. 11 | 12 | It has the following methods: 13 | 14 | convert() --- Which converts each of the markup file 15 | passed in to pdf. Markup file should be html 16 | 17 | combine() --- Which merges all of the pdf files created by 18 | the convert method, creating a new file. 19 | 20 | del_pdf() --- Which deletes all the pdf files created by 21 | the convert method. 22 | 23 | """ 24 | 25 | def __init__(self, markup_files, style_files, pdf_files, directory): 26 | self.markup_files = markup_files 27 | self.style_files = style_files 28 | self.pdf_files = pdf_files 29 | self.directory = directory 30 | 31 | def convert(self): 32 | for each in self.markup_files: 33 | 34 | # Prevent conversion process from showing terminal updates 35 | options = {"enable-local-file-access": None, "quiet": ""} 36 | pdfkit.from_file(each, "{}.pdf".format(self.markup_files.index(each)), 37 | options=options) 38 | 39 | print('--- Sections converted to pdf') 40 | 41 | def combine(self): 42 | 43 | merger = PdfFileMerger() 44 | 45 | for pdf in self.pdf_files: 46 | try: 47 | merger.append(pdf, import_bookmarks=False) 48 | except PdfReadError: 49 | pass 50 | 51 | merger.write("{}.pdf".format(self.directory)) 52 | 53 | print('--- Sections combined together in a single pdf file') 54 | 55 | merger.close() 56 | 57 | def del_pdf(self): 58 | for each in self.pdf_files: 59 | os.remove(each) 60 | print('--- Individual pdf files deleted from directory') 61 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.5.3 2 | lxml==4.6.3 3 | pdfkit==0.6.1 4 | PyPDF2==1.26.0 5 | --------------------------------------------------------------------------------