├── icon.ico ├── src ├── UI │ ├── __init__.py │ └── Interface.py ├── __init__.py ├── Processor │ ├── __init__.py │ ├── Crawler.py │ └── Handler.py └── Tools │ ├── __init__.py │ ├── Config.py │ ├── Logger.py │ └── Tools.py ├── main.py ├── LICENSE ├── README.md └── .gitignore /icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeerChen/Science_Reading_Book_Downloader/HEAD/icon.ico -------------------------------------------------------------------------------- /src/UI/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 3 | Author: Senkita 4 | Date: 2022-02-19 15:42:57 5 | LastEditors: Senkita 6 | LastEditTime: 2022-02-19 15:42:58 7 | ''' 8 | __all__ = ['Interface'] 9 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 包说明 3 | Author: Senkita 4 | Date: 2021-12-20 23:41:42 5 | LastEditors: Senkita 6 | LastEditTime: 2022-02-19 16:20:49 7 | ''' 8 | __all__ = ['Processor', 'Tools', 'UI'] 9 | -------------------------------------------------------------------------------- /src/Processor/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 3 | Author: Senkita 4 | Date: 2022-02-19 15:44:02 5 | LastEditors: Senkita 6 | LastEditTime: 2022-02-19 15:44:28 7 | ''' 8 | __all__ = ['Crawler', 'Handler'] 9 | -------------------------------------------------------------------------------- /src/Tools/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 3 | Author: Senkita 4 | Date: 2022-02-19 15:48:56 5 | LastEditors: Senkita 6 | LastEditTime: 2022-02-19 15:50:08 7 | ''' 8 | __all__ = ['Config', 'Logger', 'Tools'] 9 | -------------------------------------------------------------------------------- /src/Tools/Config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 一些配置项 3 | Author: Senkita 4 | Date: 2022-02-19 15:44:55 5 | LastEditors: Senkita 6 | LastEditTime: 2022-02-19 15:44:56 7 | ''' 8 | headers: dict = {'Connection': 'close'} 9 | time_break: int = 2 10 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 主入口 3 | Author: Senkita 4 | Date: 2021-12-20 23:40:59 5 | LastEditors: Senkita 6 | LastEditTime: 2022-03-17 23:03:39 7 | ''' 8 | import os 9 | from src.Processor.Handler import Handler 10 | from src.UI.Interface import Interface 11 | 12 | 13 | def main() -> None: 14 | # 命令行运行 15 | # from src.Tools import get_args 16 | # book_id = get_args() 17 | 18 | # GUI版 19 | ui = Interface() 20 | try: 21 | book_id, scaling, keep_pic_folder = ui.display() 22 | except Exception as e: 23 | print(e) 24 | os._exit(0) 25 | 26 | if book_id: 27 | handler: Handler = Handler(book_id, scaling, keep_pic_folder) 28 | handler.run() 29 | 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Senkita 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/Tools/Logger.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 自定义日志类 3 | Author: Senkita 4 | Date: 2021-12-22 09:36:07 5 | LastEditors: Senkita 6 | LastEditTime: 2021-12-22 20:21:28 7 | ''' 8 | import logging 9 | 10 | 11 | # 日志配置 12 | class LoggerConfig: 13 | def __init__(self, book_id: str) -> None: 14 | self.logger: logging.Logger = logging.getLogger() 15 | self.formatter: logging.Formatter = logging.Formatter( 16 | fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | ) 19 | 20 | # self.stream_handler: logging.StreamHandler = logging.StreamHandler() 21 | # self.stream_handler.setFormatter(self.formatter) 22 | # self.stream_handler.setLevel(logging.ERROR) 23 | 24 | self.file_handler: logging.FileHandler = logging.FileHandler( 25 | filename='{}.log'.format(book_id), 26 | mode='a', 27 | ) 28 | self.file_handler.setFormatter(self.formatter) 29 | self.file_handler.setLevel(logging.DEBUG) 30 | 31 | # self.logger.addHandler(self.stream_handler) 32 | self.logger.addHandler(self.file_handler) 33 | 34 | 35 | # 日志类 36 | class Logger(LoggerConfig): 37 | def __new__(cls: logging.Logger, book_id: str) -> logging.Logger: 38 | super(Logger, cls).__init__(cls, book_id) 39 | return cls.logger 40 | -------------------------------------------------------------------------------- /src/Tools/Tools.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 一些独立函数 3 | Author: Senkita 4 | Date: 2021-12-20 23:44:20 5 | LastEditors: Senkita 6 | LastEditTime: 2022-02-19 19:49:48 7 | ''' 8 | import re 9 | import argparse 10 | from typing import Tuple, Union 11 | from src.Processor.Crawler import Crawler 12 | 13 | 14 | # 目录分级 15 | def catalog_grading(catalog_list: list) -> tuple: 16 | pid_dict: dict = {} 17 | catalog_dict: dict = {} 18 | 19 | level: int = 1 20 | 21 | for pid, title, page_num in catalog_list: 22 | if pid == '0': 23 | catalog_dict[title] = {'level': level, 'page_num': int(page_num) - 1} 24 | pid_dict[pid] = level 25 | level = 0 26 | elif pid in pid_dict: 27 | catalog_dict[title] = { 28 | 'level': pid_dict[pid], 29 | 'page_num': int(page_num) - 1, 30 | } 31 | level = pid_dict[pid] 32 | else: 33 | level += 1 34 | catalog_dict[title] = {'level': level, 'page_num': int(page_num) - 1} 35 | pid_dict[pid] = level 36 | 37 | return [(v['level'], k, v['page_num']) for k, v in catalog_dict.items()] 38 | 39 | 40 | # 参数校验 41 | def verification(book_id: str) -> bool: 42 | if re.match(r'^[A-Z0-9]{36}$', book_id) and Crawler.get_uuid( 43 | Crawler.get_user_info()[0], book_id 44 | ): 45 | return True 46 | return False 47 | 48 | 49 | # 命令行参数解析 50 | def get_args() -> Union[Tuple[str, int], Exception]: 51 | parser: argparse.ArgumentParser = argparse.ArgumentParser() 52 | parser.add_argument("book_id", type=str, help="科学文库电子书ID") 53 | 54 | args: argparse.Namespace(str, int) = parser.parse_args() 55 | if verification(args.book_id): 56 | return args.book_id 57 | else: 58 | raise Exception('参数错误') 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Science_Reading_Book_Downloader](https://socialify.git.ci/Senkita/Science_Reading_Book_Downloader/image?description=1&font=Bitter&language=1&owner=1&pattern=Solid&theme=Light) 2 | 3 | ## Introduction 4 | 5 | > 前情提要:[[Python] 顺着前文思路,借机水一段小爬虫](https://www.52pojie.cn/thread-1562830-1-1.html) 6 | 7 | 自用爬虫,用于下载科学文库电子书。 8 | 9 | 支持正版,请勿传播,谢谢。 10 | 11 | > [已失效] 网站管理员太过勤奋,而这只是个水贴项目,且个人暂无索书需求,故先弃之,溜了溜了。 12 | 13 | ## Features 14 | 15 | 1. 根据 book_id 自动获取电子书总页数 16 | 2. 对 book_id 做基本判别 17 | 3. 对页面图片下载有误的情况进行修复 18 | 4. 任务进度使用进度条可视化 19 | 5. 整编图片为 PDF 20 | 6. 支持命令行脚本和 GUI 两版 21 | 7. 为下载图书添加书签 22 | 8. 文件名显示为书名 23 | 9. 支持下载清晰度选择 24 | 10. 支持保留下载图片文件夹 25 | 26 | ## Installation 27 | 28 | ```bash 29 | # 依赖项 30 | pip install requests pillow rich pysimplegui pyinstaller pycrypto beautifulsoup4 pypdf2 lxml 31 | ``` 32 | 33 | - [Requests](https://github.com/psf/Requests)用于爬虫请求 34 | - [Pillow](https://github.com/Python-Pillow/Pillow)用于 PDF 生成 35 | - [Rich](https://github.com/willmcgugan/Rich)用于命令行进度条展示 36 | - [PySimpleGUI](https://github.com/PySimpleGUI/PySimpleGUI)用于 GUI 界面 37 | - [PyInstaller](https://github.com/PyInstaller/PyInstaller)用于打包成 exe 38 | - [PyCrypto](https://github.com/PyCrypto/PyCrypto)用于 PyInstaller 打包加密 39 | - [BeautifulSoup4](https://www.crummy.com/software/BeautifulSoup)用于网页解析 40 | - [PyPDF2](https://github.com/mstamy2/PyPDF2)用于 PDF 添加书签 41 | - [lxml](https://lxml.de)用于解析 XPath 42 | 43 | ## Usage 44 | 45 | ```bash 46 | # 命令行脚本直接运行 47 | python main.py 48 | 49 | # 打包成GUI程序 50 | pyinstaller -F -w --key 'passwd' --hidden-import pillow --hidden-import requests --hidden-import pysimplegui --hidden-import beautifulsoup4 --hidden-import pypdf2 --hidden-import lxml -n 科学文库电子书下载器 -i icon.ico --clean --win-private-assemblies -y main.py 51 | ``` 52 | 53 | ### Q&A 54 | 55 | 1. Q: ![KeyError: 'docinfo'](https://karasu.oss-cn-chengdu.aliyuncs.com/Senkita/报错.png) 56 | A: Try again. 57 | 58 | ## Maintainers 59 | 60 | [Senkita](https://github.com/Senkita) 61 | 62 | ## License 63 | 64 | [MIT](LICENSE) © [Senkita](https://github.com/Senkita) 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Node.js 132 | node_modules/ 133 | package*.json -------------------------------------------------------------------------------- /src/UI/Interface.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 界面 3 | Author: Senkita 4 | Date: 2021-12-22 12:17:30 5 | LastEditors: Senkita 6 | LastEditTime: 2022-03-17 20:28:01 7 | ''' 8 | from types import FunctionType 9 | from typing import Tuple, Union 10 | import PySimpleGUI as sg 11 | from src.Tools.Tools import verification 12 | 13 | 14 | class Interface: 15 | def __init__(self) -> None: 16 | self.notice_layout = [ 17 | [sg.T('书籍版权归科学文库(https://book.sciencereading.cn/)所有!')], 18 | [sg.T('此脚本仅供学习交流使用,不得用于商业用途,请支持正版!')], 19 | [sg.T('如果您不幸得到了该脚本,请低调使用,切勿传播!')], 20 | [sg.T('爬虫是个与服务器管理员斗智斗勇的游戏,因此具有时效性,失效不补!')], 21 | [sg.Submit('朕已阅!'), sg.Cancel('我不听!')], 22 | ] 23 | 24 | # 主体窗口 25 | def main_display(self) -> Union[Tuple[str, str], None]: 26 | main_layout = [ 27 | [ 28 | [sg.T('请输入book_id:', tooltip='book_id请在书籍页地址栏中查找'), sg.I()], 29 | [ 30 | sg.T('请选择缩放比:', tooltip='缩放比越大,则图页越清晰,但书籍体积也相应越大,爬取时间对应增长'), 31 | sg.Combo([100, 150], default_value=150), 32 | sg.T('%'), 33 | ], 34 | [ 35 | sg.Radio( 36 | text='保留图片文件夹', 37 | group_id='keep_pic_folder', 38 | default=False, 39 | ), 40 | sg.Radio( 41 | text='删除图片文件夹', 42 | group_id='keep_pic_folder', 43 | default=True, 44 | ), 45 | ], 46 | [sg.Submit('下载'), sg.Cancel('退出')], 47 | ] 48 | ] 49 | main_window: sg.Window = sg.Window('下载科学文库电子书', main_layout) 50 | event, value = main_window.read() 51 | if event == '下载': 52 | main_window.close() 53 | if verification(value[0]): 54 | return value[0], value[1], value[2] 55 | else: 56 | sg.Popup('输入有误,请重新输入!') 57 | self.main_display() 58 | else: 59 | main_window.close() 60 | 61 | # 告知窗体 62 | def notice_display(self, fn: FunctionType) -> Union[FunctionType, None]: 63 | notice_window = sg.Window('用前须知', self.notice_layout) 64 | 65 | notice_event, _ = notice_window.read() 66 | if notice_event == '朕已阅!': 67 | notice_window.close() 68 | return fn() 69 | else: 70 | notice_window.close() 71 | 72 | # 用户界面 73 | def display(self) -> Union[Tuple[str, str], None]: 74 | return self.notice_display(self.main_display) 75 | 76 | # 进度条 77 | @staticmethod 78 | def progress_display(total: int) -> sg.Window: 79 | progress_layout = [ 80 | [ 81 | sg.ProgressBar( 82 | total, orientation='h', size=(40, 10), key='progress_bar' 83 | ), 84 | sg.T('', key='percentage'), 85 | ], 86 | [sg.Cancel('取消')], 87 | ] 88 | return sg.Window('任务进度', progress_layout) 89 | -------------------------------------------------------------------------------- /src/Processor/Crawler.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: 爬虫主体 3 | Author: Senkita 4 | Date: 2021-12-20 23:41:21 5 | LastEditors: Senkita 6 | LastEditTime: 2022-03-17 23:06:25 7 | ''' 8 | import re 9 | import time 10 | import json 11 | import requests 12 | from lxml import etree 13 | from typing import Tuple 14 | from string import Template 15 | from bs4 import BeautifulSoup 16 | from src.Tools.Logger import Logger 17 | from src.Tools.Config import headers, time_break 18 | 19 | 20 | class Crawler: 21 | def __init__(self, book_id: str, scaling: int = 150) -> None: 22 | self.logger: Logger = Logger(book_id) 23 | self.dir_name: str = "./{}".format(book_id) 24 | 25 | self.user_id, self.accessToken = self.get_user_info() 26 | self.uuid: str = self.get_uuid(self.user_id, book_id) 27 | 28 | self.scaling: int = scaling 29 | 30 | # 获取页数 31 | def get_page_num(self) -> int: 32 | url: str = 'https://wkobwp.sciencereading.cn/asserts/{}/manifest?language=zh-CN'.format( 33 | self.uuid 34 | ) 35 | return int( 36 | json.loads( 37 | json.loads(requests.get(url, headers=headers).content.decode('UTF-8'))[ 38 | 'docinfo' 39 | ] 40 | )['PageCount'] 41 | ) 42 | 43 | # 下载页面图片 44 | def download_png(self, page_no: int) -> None: 45 | url: str = "https://wkobwp.sciencereading.cn/asserts/{}/image/{}/{}?accessToken={}".format( 46 | self.uuid, page_no, self.scaling, self.accessToken 47 | ) 48 | try: 49 | response: requests.Response = requests.get(url, headers=headers) 50 | except Exception as e: 51 | self.logger.warning(e) 52 | time.sleep(time_break) 53 | self.download_png(page_no) 54 | 55 | if b'{"error":-1}' in response.content: 56 | time.sleep(time_break) 57 | self.download_png(page_no) 58 | 59 | with open("{}/{}.png".format(self.dir_name, page_no), "wb") as f: 60 | f.write(response.content) 61 | time.sleep(time_break) 62 | 63 | # 获取用户ID和accessToken 64 | @staticmethod 65 | def get_user_info() -> Tuple[str, str]: 66 | user_id_url: str = "https://wkobwp.sciencereading.cn/api/systemuser/info" 67 | 68 | params: dict = {"params": '{"heads":{"defaultuser":null}}'} 69 | response: str = requests.get(user_id_url, params=params).content.decode("UTF-8") 70 | 71 | try: 72 | resultBody: dict = json.loads(response)["resultBody"] 73 | 74 | user_id: str = resultBody["id"] 75 | accessToken: str = resultBody["accessToken"] 76 | return user_id, accessToken 77 | except Exception as e: 78 | raise Exception(e) 79 | 80 | # 获取uuid 81 | @staticmethod 82 | def get_uuid(user_id: str, book_id: str) -> str: 83 | uuid: str = None 84 | uuid_url: str = "https://wkobwp.sciencereading.cn/api/file/add" 85 | params: Template = Template( 86 | '{"params": {"userId": "$user_id","file": "http://159.226.241.32:81/$book_id.pdf"}}' 87 | ) 88 | data: dict = {"params": params.substitute(user_id=user_id, book_id=book_id)} 89 | response: str = requests.post( 90 | uuid_url, data=data, headers=headers 91 | ).content.decode("UTF-8") 92 | if response != '': 93 | result: str = json.loads(response)["result"] 94 | if result != 'OutOfFileSizeLimit': 95 | uuid = result 96 | return uuid 97 | 98 | # 获取书名、ISBN及目录 99 | @staticmethod 100 | def get_book_info(book_id: str) -> Tuple[str, int, list]: 101 | book_name: str = None 102 | catalog_list: list = [] 103 | book_name_url: str = ( 104 | "https://book.sciencereading.cn/shop/book/Booksimple/show.do?id={}".format( 105 | book_id 106 | ) 107 | ) 108 | response: str = requests.get(book_name_url, headers=headers).content.decode( 109 | 'UTF-8' 110 | ) 111 | if response != '': 112 | soup: BeautifulSoup = BeautifulSoup(response, 'html.parser') 113 | 114 | book_name: str = soup.select( 115 | 'body > div:nth-child(3) > div > div > div > div.row > div.col-md-8.col-sm-7 > div.book_detail_title > span > b:nth-child(1)' 116 | )[0].text 117 | 118 | book_ISBN: int = int( 119 | etree.HTML(str(soup)) 120 | .xpath( 121 | "/html/body/div[1]/div/div/div/div[1]/div[2]/div[3]/div[2]/div[2]/span" 122 | )[0] 123 | .text 124 | ) 125 | 126 | pattern: re.Pattern = re.compile( 127 | r'"pId":"(.*?)".*?"name":"(.*?)".*?bookPageNum=(\d+)' 128 | ) 129 | catalog_list = re.findall(pattern, response) 130 | 131 | symbol_pattern: re.Pattern = re.compile(r'\W+') 132 | book_name = re.sub(symbol_pattern, '_', book_name) 133 | 134 | return book_name, book_ISBN, catalog_list 135 | -------------------------------------------------------------------------------- /src/Processor/Handler.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: PDF处理相关 3 | Author: Senkita 4 | Date: 2022-02-18 21:49:30 5 | LastEditors: Senkita 6 | LastEditTime: 2022-03-17 23:06:46 7 | ''' 8 | import os 9 | import time 10 | import shutil 11 | from PIL import Image 12 | from PyPDF2 import PdfFileReader as reader, PdfFileWriter as writer 13 | from src.UI.Interface import Interface 14 | from src.Processor.Crawler import Crawler 15 | from src.Tools.Logger import Logger 16 | from src.Tools.Config import time_break 17 | from src.Tools.Tools import catalog_grading 18 | 19 | # 这个rich库需要自己装一下,用于进度条显示 20 | # from rich.progress import track 21 | 22 | 23 | class Handler: 24 | def __init__( 25 | self, book_id: str, scaling: int = 150, keep_pic_folder: bool = False 26 | ) -> None: 27 | self.keep_pic_folder: bool = keep_pic_folder 28 | 29 | self.spider: Crawler = Crawler(book_id, scaling) 30 | 31 | self.logger: Logger = Logger(book_id) 32 | 33 | self.file_name_list: list = [] 34 | 35 | self.book_name, self.book_ISBN, self.catalog_list = self.spider.get_book_info( 36 | book_id 37 | ) 38 | self.dir_name: str = "./{}".format(book_id) 39 | self.pic_list: list = [] 40 | 41 | self.page_num: int = self.spider.get_page_num() 42 | self.progress_window: Interface = Interface().progress_display( 43 | self.page_num * 2 44 | ) 45 | self.progress_bar = self.progress_window['progress_bar'] 46 | self.percentage = self.progress_window['percentage'] 47 | 48 | # 文件名排序 49 | def list_file(self) -> None: 50 | for file_name in os.listdir(self.dir_name): 51 | if file_name[-4:] == ".png": 52 | self.file_name_list.append(file_name[:-4]) 53 | 54 | self.file_name_list.sort(key=lambda ele: int(ele)) 55 | 56 | # 拼接为PDF 57 | def generate_pdf(self) -> None: 58 | try: 59 | pdf: Image.Image = Image.open( 60 | "{}/{}.png".format(self.dir_name, self.file_name_list[0]) 61 | ) 62 | except Exception: 63 | self.logger.warning("首页下载有误,重试中...") 64 | time.sleep(time_break) 65 | self.spider.download_png(0) 66 | return self.generate_pdf() 67 | 68 | self.file_name_list.pop(0) 69 | self.progress_bar.update_bar(self.page_num + 1) 70 | 71 | # for pic_no in track(self.file_name_list, description="生成PDF中,请稍候..."): 72 | for pic_no in self.file_name_list: 73 | progress_event, _ = self.progress_window.read(timeout=time_break) 74 | if progress_event == '取消' or progress_event is None: 75 | self.progress_window.close() 76 | os._exit(0) 77 | 78 | self.add_png(pic_no) 79 | 80 | progress: int = self.page_num + self.file_name_list.index(pic_no) + 2 81 | self.progress_bar.UpdateBar(progress) 82 | self.percentage.update( 83 | '{:.3}%'.format(progress / (self.page_num * 2) * 100) 84 | ) 85 | 86 | pdf.save( 87 | "./{}.pdf".format(self.book_ISBN), 88 | "PDF", 89 | resolution=100.0, 90 | quality=100, 91 | subsampling=0, 92 | save_all=True, 93 | append_images=self.pic_list, 94 | ) 95 | 96 | # 添加页面 97 | def add_png(self, pic_no: int) -> None: 98 | try: 99 | img: Image.Image = Image.open("{}/{}.png".format(self.dir_name, pic_no)) 100 | if img.mode == "RGBA": 101 | img = img.convert("RGB") 102 | self.pic_list.append(img) 103 | except Exception: 104 | self.logger.warning("图片{}.png下载有误,重试中...".format(pic_no)) 105 | time.sleep(time_break) 106 | self.spider.download_png(pic_no) 107 | return self.add_png(pic_no) 108 | 109 | # 添加书签 110 | def add_bookmark(self) -> None: 111 | input_pdf: reader = reader("./{}.pdf".format(self.book_ISBN)) 112 | output_pdf: writer = writer() 113 | 114 | for i in range(input_pdf.getNumPages()): 115 | output_pdf.addPage(input_pdf.getPage(i)) 116 | 117 | parent_set = {} 118 | for bookmark in catalog_grading(self.catalog_list): 119 | parent = output_pdf.addBookmark( 120 | bookmark[1], 121 | bookmark[2], 122 | parent=parent_set.get(bookmark[0] - 1), 123 | ) 124 | parent_set[bookmark[0]] = parent 125 | 126 | with open('./{}.pdf'.format(self.book_name), 'wb') as f: 127 | output_pdf.write(f) 128 | 129 | def run(self) -> None: 130 | os.makedirs(self.dir_name, exist_ok=True) 131 | 132 | # for page_no in track(range(self.page_num), description="下载中,请稍候..."): 133 | for page_no in range(self.page_num): 134 | progress_event, _ = self.progress_window.read(timeout=time_break) 135 | if progress_event == '取消' or progress_event is None: 136 | self.progress_window.close() 137 | os._exit(0) 138 | self.spider.download_png(page_no) 139 | 140 | progress: int = page_no + 1 141 | self.progress_bar.update_bar(progress) 142 | self.percentage.update( 143 | '{:.3}%'.format(progress / (self.page_num * 2) * 100) 144 | ) 145 | 146 | self.list_file() 147 | self.generate_pdf() 148 | 149 | self.add_bookmark() 150 | 151 | # 清理 152 | if not self.keep_pic_folder: 153 | shutil.rmtree(self.dir_name) 154 | os.remove("./{}.pdf".format(self.book_ISBN)) 155 | self.progress_window.close() 156 | --------------------------------------------------------------------------------