├── .gitignore
├── README.md
├── getpy.py
├── main.py
├── manage.py
├── pdfpy.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.epub
3 | *.pdf
4 | venv/*
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # EpubToPdf
 2 | 
 3 | This program converts epub documents to pdf documents.
 4 | 
 5 | 
 6 | **Installation**
 7 | 
 8 | Clone this repository to your machine.
 9 | 
10 | 
11 | **Requirements**
12 | 
13 | To install the requirements, run the following command:
14 | 
15 | ```pip install -r requirements.txt```
16 | 
17 | After this, install _wkhtmltopdf_, which is a dependency of the _pdfkit_ module.
18 | 
19 | 
20 | The wkhtmltopdf installer can be downloaded [here](https://wkhtmltopdf.org/downloads.html).
21 | 
22 | On Windows, make sure the folder containing the wkhtmltopdf executable is added to the PATH environment variable.
23 | 
24 | For Debian/Ubuntu users:
25 | 
26 | 
27 | ```$ sudo apt-get install wkhtmltopdf```
28 | 
29 | should do the trick.
30 | 
31 | If you get a `QXcbConnection: Could not connect to display` error, check [this issue](https://github.com/JazzCore/python-pdfkit/issues/82).
32 | 
33 | 
34 | 
35 | **Usage**
36 | 
37 | Copy the epub file into the cloned repository folder.
38 | 
39 | Run the main.py file, passing the name of the epub file as a command-line argument.
40 | 
41 | As shown below:
42 | 
43 | ```python main.py epub-file-name```
44 | 


--------------------------------------------------------------------------------
/getpy.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup as bs
 2 | import os
 3 | import re
 4 | import ntpath
 5 | 
 6 | 
 7 | class GetEngine(object):
 8 | 
 9 | 	"""
10 | 		
11 | 		This class contains the methods needed to get the files,
12 | 		to help make the pdf file.
13 | 
14 | 		The class contains the following methods:
15 | 
16 | 		get_html() --- Which gets the html file names.
17 | 
18 | 		get_pdf() --- Which gets the pdf file names.
19 | 
20 | 		get_css() --- Which gets the css file names.
21 | 
22 | 		get_images() --- Which gets the image file names.
23 | 
24 | 
25 | 		To create an instance of this object, pass in the name of the directory
26 | 		that stores all the extracted files from the epub file.
27 | 
28 | 
29 | 	"""
30 | 
31 | 	def __init__(self, directory):
32 | 		self.html_files = []
33 | 		self.css_files = []
34 | 		self.image_files = []
35 | 		self.directory = directory
36 | 		self.files = []
37 | 		self.pdf_files = []
38 | 
39 | 	def get_html(self):
40 | 
41 | 		for file in self.files:
42 | 			if file.endswith(".xhtml") or file.endswith(".html"):
43 | 				self.html_files.append(file)
44 | 
45 | 	def get_pdf(self):
46 | 
47 | 		for file in self.html_files:
48 | 			self.pdf_files.append("{}.pdf".format(self.html_files.index(file)))
49 | 
50 | 	def get_css(self):
51 | 
52 | 		for file in self.files:
53 | 			if file.endswith(".css"):
54 | 				self.css_files.append(file)
55 | 
56 | 	def get_images(self):
57 | 
58 | 		for file in self.files:
59 | 			if file.endswith((".png", ".jpg", ".gif")):
60 | 				self.image_files.append(file)
61 | 
62 | 	def get_all(self):
63 | 		file = None
64 | 		directory_paths = []
65 | 		for root, dirs, files in os.walk(self.directory):
66 | 			#This traverses the directory passed in as an argument,
67 | 			#returns the current directory, the sub directories and all the files
68 | 			directory_paths.append(root)
69 | 			if file:
70 | 				continue
71 | 			for each in files:
72 | 				if each.endswith(".opf"):
73 | 					file = os.path.join(root, each)
74 | 					continue
75 | 		if not file:
76 | 			return
77 | 
78 | 		xml_content = open(file, "r").read()
79 | 
80 | 		xml_tree = bs(xml_content, features = "xml")
81 | 
82 | 		file_names = xml_tree.package.manifest.findAll('item')
83 | 
84 | 		# Gets the name of all the documents in order
85 | 		# from the opf file, then saves the file name with its path
86 | 		# The file path in the opf file can't be relied upon
87 | 		# Hence, the need to extract file name and get its path
88 | 
89 | 		for file in file_names:
90 | 			file_path_match = re.match(r'.+\.[a-zA-Z]+', file.get('href', ''))
91 | 			if not file_path_match:
92 | 				continue
93 | 			file_name = ntpath.basename(file_path_match.group())
94 | 			for path in directory_paths:
95 | 				filepath = path + '/' + file_name
96 | 				if os.path.exists(filepath):
97 | 					self.files.append(filepath)
98 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from manage import FileManager
 3 | from getpy import GetEngine
 4 | from pdfpy import PdfEngine
 5 | 
 6 | def process():
 7 | 
 8 | 	if sys.argv[1].endswith(".epub"):
 9 | 
10 | 		print('--- Epub to PDF conversion started')
11 | 
12 | 		epub_file = sys.argv[1]
13 | 		file = FileManager(epub_file)
14 | 		file.epub_to_zip()
15 | 		file.get_directory()
16 | 		file.extract_zip()
17 | 		engine = GetEngine(file.directory)
18 | 		engine.get_all()
19 | 		engine.get_html()
20 | 		engine.get_pdf()
21 | 		engine.get_css()
22 | 		engine.get_images()
23 | 		pdf = PdfEngine(engine.html_files, engine.css_files,
24 | 						engine.pdf_files, file.directory)
25 | 		pdf.convert()
26 | 		pdf.combine()
27 | 		pdf.del_pdf()
28 | 		file.zip_to_epub()
29 | 		file.del_directory()
30 | 
31 | 		print('--- Epub to PDF conversion successful')
32 | 
33 | 	else:
34 | 
35 | 		print("File is not an epub file")
36 | 
37 | 
38 | if __name__ == "__main__":
39 | 	process()
40 | 


--------------------------------------------------------------------------------
/manage.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import zipfile
 3 | import shutil
 4 | 
 5 | 
 6 | class FileManager(object):
 7 | 
 8 | 
 9 | 	"""
10 | 		
11 | 		This class is used for file interactions.
12 | 
13 | 		It has the following methods:
14 | 
15 | 		epub_to_zip() --- Which converts the epub file to a zip file
16 | 
17 | 		extract_zip() --- Which extracts the content of the zip file
18 | 
19 | 		get_directory() --- Which gets the directory name where content of
20 | 		 					zip file was extracted
21 | 
22 | 		zip_to_epub() --- Which converts the zip file back to epub
23 | 
24 | 		del_directory() --- Which deletes the directory where zip files
25 | 							were extracted
26 | 
27 | 		del_pdf() --- Which deletes the pdf files created by 
28 | 
29 | 
30 | 	"""
31 | 
32 | 	def __init__(self, epub_file):
33 | 		self.epub_file = epub_file
34 | 		self.zip_file = "{}.zip".format(epub_file.split(".epub")[0])
35 | 		self.directory = ""
36 | 
37 | 
38 | 	def epub_to_zip(self):
39 | 		os.rename(self.epub_file, self.zip_file)
40 | 
41 | 
42 | 	def extract_zip(self):
43 | 		extracted_files = zipfile.ZipFile(self.zip_file)
44 | 		extracted_files.extractall(self.directory)
45 | 		extracted_files.close()
46 | 
47 | 	def get_directory(self):
48 | 		minus_open_paren = self.epub_file.split(".epub")[0].replace("(", "")
49 | 		minus_close_paren = minus_open_paren.replace(")", "")
50 | 		self.directory = minus_close_paren.replace(" ", "")
51 | 		
52 | 
53 | 	def zip_to_epub(self):
54 | 		os.rename(self.zip_file, self.epub_file)
55 | 
56 | 
57 | 	def del_directory(self):
58 | 		shutil.rmtree(self.directory)
59 | 
60 | 
61 | 
62 | 


--------------------------------------------------------------------------------
/pdfpy.py:
--------------------------------------------------------------------------------
 1 | import pdfkit
 2 | import os
 3 | from PyPDF2 import PdfFileMerger
 4 | from PyPDF2.utils import PdfReadError
 5 | 
 6 | 
 7 | class PdfEngine(object):
 8 | 
 9 | 	"""
10 | 		This class carries operations on pdf files.
11 | 
12 | 		It has the following methods:
13 | 
14 | 		convert() --- Which converts each of the markup file
15 | 		passed in to pdf. Markup file should be html
16 | 
17 | 		combine() --- Which merges all of the pdf files created by
18 | 		the convert method, creating a new file.
19 | 
20 | 		del_pdf() --- Which deletes all the pdf files created by
21 | 		the convert method.
22 | 
23 | 	"""
24 | 
25 | 	def __init__(self, markup_files, style_files, pdf_files, directory):
26 | 		self.markup_files = markup_files
27 | 		self.style_files = style_files
28 | 		self.pdf_files = pdf_files
29 | 		self.directory = directory
30 | 
31 | 	def convert(self):
32 | 		for each in self.markup_files:
33 | 
34 | 			# Prevent conversion process from showing terminal updates
35 | 			options = {"enable-local-file-access": None, "quiet": ""}
36 | 			pdfkit.from_file(each, "{}.pdf".format(self.markup_files.index(each)),
37 | 							 options=options)
38 | 
39 | 		print('--- Sections converted to pdf')
40 | 
41 | 	def combine(self):
42 | 
43 | 		merger = PdfFileMerger()
44 | 
45 | 		for pdf in self.pdf_files:
46 | 			try:
47 | 				merger.append(pdf, import_bookmarks=False)
48 | 			except PdfReadError:
49 | 				pass
50 | 
51 | 		merger.write("{}.pdf".format(self.directory))
52 | 
53 | 		print('--- Sections combined together in a single pdf file')
54 | 
55 | 		merger.close()
56 | 
57 | 	def del_pdf(self):
58 | 			for each in self.pdf_files:
59 | 				os.remove(each)
60 | 			print('--- Individual pdf files deleted from directory')
61 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.5.3
2 | lxml==4.6.3
3 | pdfkit==0.6.1
4 | PyPDF2==1.26.0
5 | 


--------------------------------------------------------------------------------