├── icon.ico
├── src
│   ├── UI
│   │   ├── __init__.py
│   │   └── Interface.py
│   ├── __init__.py
│   ├── Processor
│   │   ├── __init__.py
│   │   ├── Crawler.py
│   │   └── Handler.py
│   └── Tools
│       ├── __init__.py
│       ├── Config.py
│       ├── Logger.py
│       └── Tools.py
├── main.py
├── LICENSE
├── README.md
└── .gitignore


/icon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeerChen/Science_Reading_Book_Downloader/HEAD/icon.ico


--------------------------------------------------------------------------------
/src/UI/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Description: 
3 | Author: Senkita
4 | Date: 2022-02-19 15:42:57
5 | LastEditors: Senkita
6 | LastEditTime: 2022-02-19 15:42:58
7 | '''
# Submodule names exported by `from src.UI import *`.
__all__ = ['Interface']
9 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Description: 包说明
3 | Author: Senkita
4 | Date: 2021-12-20 23:41:42
5 | LastEditors: Senkita
6 | LastEditTime: 2022-02-19 16:20:49
7 | '''
# Sub-package names exported by `from src import *`.
__all__ = ['Processor', 'Tools', 'UI']
9 | 


--------------------------------------------------------------------------------
/src/Processor/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Description: 
3 | Author: Senkita
4 | Date: 2022-02-19 15:44:02
5 | LastEditors: Senkita
6 | LastEditTime: 2022-02-19 15:44:28
7 | '''
# Submodule names exported by `from src.Processor import *`.
__all__ = ['Crawler', 'Handler']
9 | 


--------------------------------------------------------------------------------
/src/Tools/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Description: 
3 | Author: Senkita
4 | Date: 2022-02-19 15:48:56
5 | LastEditors: Senkita
6 | LastEditTime: 2022-02-19 15:50:08
7 | '''
# Submodule names exported by `from src.Tools import *`.
__all__ = ['Config', 'Logger', 'Tools']
9 | 


--------------------------------------------------------------------------------
/src/Tools/Config.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Description: 一些配置项
 3 | Author: Senkita
 4 | Date: 2022-02-19 15:44:55
 5 | LastEditors: Senkita
 6 | LastEditTime: 2022-02-19 15:44:56
 7 | '''
# HTTP headers the crawler passes as `headers=` to its `requests` calls.
# 'Connection: close' asks the server not to keep the connection alive.
headers: dict = {'Connection': 'close'}
# Shared pause value: handed to `time.sleep()` between requests/retries and
# also used as the `timeout=` argument when polling the progress window.
# NOTE(review): those two APIs may not use the same time unit -- confirm.
time_break: int = 2
10 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Description: 主入口
 3 | Author: Senkita
 4 | Date: 2021-12-20 23:40:59
 5 | LastEditors: Senkita
 6 | LastEditTime: 2022-03-17 23:03:39
 7 | '''
 8 | import os
 9 | from src.Processor.Handler import Handler
10 | from src.UI.Interface import Interface
11 | 
12 | 
def main() -> None:
    """Run the GUI flow: ask the user what to download, then download it.

    Nothing is downloaded when the dialogs yield no selection (for example
    because the user dismissed one of them).
    """
    # Command-line variant (kept for reference)
    # from src.Tools import get_args
    # book_id = get_args()

    # GUI variant
    ui = Interface()
    try:
        selection = ui.display()
    except Exception as e:
        # Report the failure and terminate the process immediately.
        print(e)
        os._exit(0)

    # `display()` returns None when no selection was made. Unpacking None
    # would raise a TypeError with a misleading message, so stop quietly here.
    if selection is None:
        return

    book_id, scaling, keep_pic_folder = selection
    if book_id:
        handler: Handler = Handler(book_id, scaling, keep_pic_folder)
        handler.run()


if __name__ == "__main__":
    main()
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Senkita
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/Tools/Logger.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Description: 自定义日志类
 3 | Author: Senkita
 4 | Date: 2021-12-22 09:36:07
 5 | LastEditors: Senkita
 6 | LastEditTime: 2021-12-22 20:21:28
 7 | '''
 8 | import logging
 9 | 
10 | 
11 | # 日志配置
class LoggerConfig:
    """Attach a per-book log file to the root logger.

    After construction the object exposes:

    * ``logger``       -- the root logger (``logging.getLogger()``),
    * ``formatter``    -- the formatter used for the log file,
    * ``file_handler`` -- the handler writing to ``<book_id>.log``.

    Constructing the configuration several times for the same log file
    reuses the handler that is already attached, so a record is written to
    the file only once.
    """

    def __init__(self, book_id: str) -> None:
        self.logger: logging.Logger = logging.getLogger()
        self.formatter: logging.Formatter = logging.Formatter(
            fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S',
        )

        # self.stream_handler: logging.StreamHandler = logging.StreamHandler()
        # self.stream_handler.setFormatter(self.formatter)
        # self.stream_handler.setLevel(logging.ERROR)

        candidate: logging.FileHandler = logging.FileHandler(
            filename='{}.log'.format(book_id),
            mode='a',
        )

        # A handler for this very file may already be attached by an earlier
        # construction; adding a second one would duplicate every log line.
        for existing in self.logger.handlers:
            if (
                isinstance(existing, logging.FileHandler)
                and existing.baseFilename == candidate.baseFilename
            ):
                candidate.close()
                self.file_handler: logging.FileHandler = existing
                return

        candidate.setFormatter(self.formatter)
        candidate.setLevel(logging.DEBUG)
        self.file_handler = candidate

        # self.logger.addHandler(self.stream_handler)
        self.logger.addHandler(self.file_handler)
33 | 
34 | 
35 | # 日志类
class Logger(LoggerConfig):
    """Factory-style wrapper: ``Logger(book_id)`` evaluates to a ``logging.Logger``.

    ``__new__`` returns ``cls.logger`` rather than an instance of this class,
    so callers receive a plain ``logging.Logger`` object.
    """

    def __new__(cls: type, book_id: str) -> logging.Logger:
        # Run the parent initialiser with the class object standing in for
        # `self`; the attributes it assigns therefore land on the class itself.
        super(Logger, cls).__init__(cls, book_id)
        # Hand back the logger that the initialiser stored on the class.
        return cls.logger
40 | 


--------------------------------------------------------------------------------
/src/Tools/Tools.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Description: 一些独立函数
 3 | Author: Senkita
 4 | Date: 2021-12-20 23:44:20
 5 | LastEditors: Senkita
 6 | LastEditTime: 2022-02-19 19:49:48
 7 | '''
 8 | import re
 9 | import argparse
10 | from typing import Tuple, Union
11 | from src.Processor.Crawler import Crawler
12 | 
13 | 
14 | # 目录分级
def catalog_grading(catalog_list: list) -> list:
    """Assign a nesting level to every catalogue entry.

    Args:
        catalog_list: iterable of ``(pid, title, page_num)`` triples, where
            ``pid`` is the parent id of the entry and ``page_num`` is a
            1-based page number (anything ``int()`` accepts).

    Returns:
        A list of ``(level, title, page_index)`` tuples in input order, with
        ``page_index == int(page_num) - 1``. Every input entry yields exactly
        one output tuple, including entries that share a title.

    Level rules (a running ``level`` starts at 1):
        * ``pid == '0'``: the entry takes the running level, which is then
          reset to 0.
        * ``pid`` seen before: the entry takes the level recorded for that
          pid, which also becomes the running level.
        * unseen ``pid``: the running level is incremented, used for the
          entry and recorded for that pid.

    NOTE(review): with several ``'0'`` entries the level given to the later
    ones depends on what preceded them -- confirm this is intended.
    """
    pid_level: dict = {}
    # A list (not a dict keyed by title) so that repeated titles such as a
    # per-chapter "Exercises" heading are all preserved.
    graded: list = []

    level: int = 1

    for pid, title, page_num in catalog_list:
        page_index: int = int(page_num) - 1
        if pid == '0':
            entry_level = level
            pid_level[pid] = level
            level = 0
        elif pid in pid_level:
            entry_level = level = pid_level[pid]
        else:
            level += 1
            entry_level = level
            pid_level[pid] = level
        graded.append((entry_level, title, page_index))

    return graded
38 | 
39 | 
40 | # 参数校验
def verification(book_id: str) -> bool:
    """Tell whether ``book_id`` is usable.

    The id must match the 36-character pattern of upper-case letters and
    digits; only then is the remote service asked for a uuid, and the id is
    accepted when that lookup yields a truthy value.
    """
    well_formed = re.match(r'^[A-Z0-9]{36}$', book_id)
    if not well_formed:
        # Malformed ids are rejected without any network traffic.
        return False

    user_id = Crawler.get_user_info()[0]
    return bool(Crawler.get_uuid(user_id, book_id))
47 | 
48 | 
49 | # 命令行参数解析
def get_args() -> str:
    """Read the book id from the command line and validate it.

    Returns:
        The ``book_id`` positional argument, once ``verification`` accepts it.

    Raises:
        Exception: when ``verification`` rejects the supplied id.
    """
    parser: argparse.ArgumentParser = argparse.ArgumentParser()
    parser.add_argument("book_id", type=str, help="科学文库电子书ID")

    args: argparse.Namespace = parser.parse_args()
    if not verification(args.book_id):
        raise Exception('参数错误')
    return args.book_id
59 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![Science_Reading_Book_Downloader](https://socialify.git.ci/Senkita/Science_Reading_Book_Downloader/image?description=1&font=Bitter&language=1&owner=1&pattern=Solid&theme=Light)
 2 | 
 3 | ## Introduction
 4 | 
 5 | > 前情提要：[[Python] 顺着前文思路，借机水一段小爬虫](https://www.52pojie.cn/thread-1562830-1-1.html)
 6 | 
 7 | 自用爬虫，用于下载科学文库电子书。
 8 | 
 9 | 支持正版，请勿传播，谢谢。
10 | 
11 | > [已失效] 网站管理员太过勤奋，而这只是个水贴项目，且个人暂无索书需求，故先弃之，溜了溜了。
12 | 
13 | ## Features
14 | 
15 | 1. 根据 book_id 自动获取电子书总页数
16 | 2. 对 book_id 做基本判别
17 | 3. 对页面图片下载有误的情况进行修复
18 | 4. 任务进度使用进度条可视化
19 | 5. 整编图片为 PDF
20 | 6. 支持命令行脚本和 GUI 两版
21 | 7. 为下载图书添加书签
22 | 8. 文件名显示为书名
23 | 9. 支持下载清晰度选择
24 | 10. 支持保留下载图片文件夹
25 | 
26 | ## Installation
27 | 
28 | ```bash
29 | # 依赖项
30 | pip install requests pillow rich pysimplegui pyinstaller pycrypto beautifulsoup4 pypdf2 lxml
31 | ```
32 | 
33 | -   [Requests](https://github.com/psf/Requests)用于爬虫请求
34 | -   [Pillow](https://github.com/Python-Pillow/Pillow)用于 PDF 生成
35 | -   [Rich](https://github.com/willmcgugan/Rich)用于命令行进度条展示
36 | -   [PySimpleGUI](https://github.com/PySimpleGUI/PySimpleGUI)用于 GUI 界面
37 | -   [PyInstaller](https://github.com/PyInstaller/PyInstaller)用于打包成 exe
38 | -   [PyCrypto](https://github.com/PyCrypto/PyCrypto)用于 PyInstaller 打包加密
39 | -   [BeautifulSoup4](https://www.crummy.com/software/BeautifulSoup)用于网页解析
40 | -   [PyPDF2](https://github.com/mstamy2/PyPDF2)用于 PDF 添加书签
41 | -   [lxml](https://lxml.de)用于解析 XPath
42 | 
43 | ## Usage
44 | 
45 | ```bash
46 | # 命令行脚本直接运行
47 | python main.py
48 | 
49 | # 打包成GUI程序
50 | pyinstaller -F -w --key 'passwd' --hidden-import pillow --hidden-import requests --hidden-import pysimplegui --hidden-import beautifulsoup4 --hidden-import pypdf2 --hidden-import lxml -n 科学文库电子书下载器 -i icon.ico --clean --win-private-assemblies -y  main.py
51 | ```
52 | 
53 | ### Q&A
54 | 
55 | 1. Q: ![KeyError: 'docinfo'](https://karasu.oss-cn-chengdu.aliyuncs.com/Senkita/报错.png)
56 |    A: Try again.
57 | 
58 | ## Maintainers
59 | 
60 | [Senkita](https://github.com/Senkita)
61 | 
62 | ## License
63 | 
64 | [MIT](LICENSE) © [Senkita](https://github.com/Senkita)
65 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # Node.js
132 | node_modules/
133 | package*.json


--------------------------------------------------------------------------------
/src/UI/Interface.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Description: 界面
 3 | Author: Senkita
 4 | Date: 2021-12-22 12:17:30
 5 | LastEditors: Senkita
 6 | LastEditTime: 2022-03-17 20:28:01
 7 | '''
 8 | from types import FunctionType
 9 | from typing import Tuple, Union
10 | import PySimpleGUI as sg
11 | from src.Tools.Tools import verification
12 | 
13 | 
class Interface:
    """PySimpleGUI front end: disclaimer dialog, input dialog, progress window."""

    def __init__(self) -> None:
        # Rows of the disclaimer dialog shown before anything else.
        self.notice_layout = [
            [sg.T('书籍版权归科学文库(https://book.sciencereading.cn/)所有！')],
            [sg.T('此脚本仅供学习交流使用，不得用于商业用途，请支持正版！')],
            [sg.T('如果您不幸得到了该脚本，请低调使用，切勿传播！')],
            [sg.T('爬虫是个与服务器管理员斗智斗勇的游戏，因此具有时效性，失效不补！')],
            [sg.Submit('朕已阅！'), sg.Cancel('我不听！')],
        ]

    # Main window
    def main_display(self) -> Union[Tuple[str, int, bool], None]:
        """Ask for the book id, the scaling and the clean-up preference.

        Returns:
            ``(value[0], value[1], value[2])`` from the dialog once the book
            id passes ``verification``; ``None`` when the dialog is left in
            any other way than the download button.
            NOTE(review): presumably value[0] is the id input, value[1] the
            scaling combo and value[2] the "keep folder" radio button --
            confirm against PySimpleGUI's automatic key numbering.

        After an invalid id a popup is shown and the dialog is presented
        again; the result of that new attempt is returned.
        """
        main_layout = [
            [
                [sg.T('请输入book_id：', tooltip='book_id请在书籍页地址栏中查找'), sg.I()],
                [
                    sg.T('请选择缩放比：', tooltip='缩放比越大，则图页越清晰，但书籍体积也相应越大，爬取时间对应增长'),
                    sg.Combo([100, 150], default_value=150),
                    sg.T('%'),
                ],
                [
                    sg.Radio(
                        text='保留图片文件夹',
                        group_id='keep_pic_folder',
                        default=False,
                    ),
                    sg.Radio(
                        text='删除图片文件夹',
                        group_id='keep_pic_folder',
                        default=True,
                    ),
                ],
                [sg.Submit('下载'), sg.Cancel('退出')],
            ]
        ]
        main_window: sg.Window = sg.Window('下载科学文库电子书', main_layout)
        event, value = main_window.read()
        main_window.close()

        if event != '下载':
            return None

        if verification(value[0]):
            return value[0], value[1], value[2]

        sg.Popup('输入有误，请重新输入！')
        # Propagate the outcome of the retry; without the `return` a corrected
        # input would be discarded and the caller would receive None.
        return self.main_display()

    # Notice window
    def notice_display(self, fn: FunctionType) -> Union[FunctionType, None]:
        """Show the disclaimer; call ``fn`` and return its result if accepted.

        Returns ``None`` when the disclaimer is not accepted.
        """
        notice_window = sg.Window('用前须知', self.notice_layout)

        notice_event, _ = notice_window.read()
        notice_window.close()
        if notice_event == '朕已阅！':
            return fn()
        return None

    # User interface entry point
    def display(self) -> Union[Tuple[str, int, bool], None]:
        """Show the disclaimer followed by the main dialog; see ``main_display``."""
        return self.notice_display(self.main_display)

    # Progress bar
    @staticmethod
    def progress_display(total: int) -> sg.Window:
        """Build (without showing) the progress window.

        Args:
            total: value passed to the progress bar as its maximum.

        Returns:
            A window holding a progress bar keyed ``'progress_bar'``, a text
            element keyed ``'percentage'`` and a cancel button.
        """
        progress_layout = [
            [
                sg.ProgressBar(
                    total, orientation='h', size=(40, 10), key='progress_bar'
                ),
                sg.T('', key='percentage'),
            ],
            [sg.Cancel('取消')],
        ]
        return sg.Window('任务进度', progress_layout)
89 | 


--------------------------------------------------------------------------------
/src/Processor/Crawler.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Description: 爬虫主体
  3 | Author: Senkita
  4 | Date: 2021-12-20 23:41:21
  5 | LastEditors: Senkita
  6 | LastEditTime: 2022-03-17 23:06:25
  7 | '''
  8 | import re
  9 | import time
 10 | import json
 11 | import requests
 12 | from lxml import etree
 13 | from typing import Tuple
 14 | from string import Template
 15 | from bs4 import BeautifulSoup
 16 | from src.Tools.Logger import Logger
 17 | from src.Tools.Config import headers, time_break
 18 | 
 19 | 
 20 | class Crawler:
 21 |     def __init__(self, book_id: str, scaling: int = 150) -> None:
 22 |         self.logger: Logger = Logger(book_id)
 23 |         self.dir_name: str = "./{}".format(book_id)
 24 | 
 25 |         self.user_id, self.accessToken = self.get_user_info()
 26 |         self.uuid: str = self.get_uuid(self.user_id, book_id)
 27 | 
 28 |         self.scaling: int = scaling
 29 | 
 30 |     # 获取页数
 31 |     def get_page_num(self) -> int:
 32 |         url: str = 'https://wkobwp.sciencereading.cn/asserts/{}/manifest?language=zh-CN'.format(
 33 |             self.uuid
 34 |         )
 35 |         return int(
 36 |             json.loads(
 37 |                 json.loads(requests.get(url, headers=headers).content.decode('UTF-8'))[
 38 |                     'docinfo'
 39 |                 ]
 40 |             )['PageCount']
 41 |         )
 42 | 
 43 |     # 下载页面图片
 44 |     def download_png(self, page_no: int) -> None:
 45 |         url: str = "https://wkobwp.sciencereading.cn/asserts/{}/image/{}/{}?accessToken={}".format(
 46 |             self.uuid, page_no, self.scaling, self.accessToken
 47 |         )
 48 |         try:
 49 |             response: requests.Response = requests.get(url, headers=headers)
 50 |         except Exception as e:
 51 |             self.logger.warning(e)
 52 |             time.sleep(time_break)
 53 |             self.download_png(page_no)
 54 | 
 55 |         if b'{"error":-1}' in response.content:
 56 |             time.sleep(time_break)
 57 |             self.download_png(page_no)
 58 | 
 59 |         with open("{}/{}.png".format(self.dir_name, page_no), "wb") as f:
 60 |             f.write(response.content)
 61 |         time.sleep(time_break)
 62 | 
 63 |     # 获取用户ID和accessToken
 64 |     @staticmethod
 65 |     def get_user_info() -> Tuple[str, str]:
 66 |         user_id_url: str = "https://wkobwp.sciencereading.cn/api/systemuser/info"
 67 | 
 68 |         params: dict = {"params": '{"heads":{"defaultuser":null}}'}
 69 |         response: str = requests.get(user_id_url, params=params).content.decode("UTF-8")
 70 | 
 71 |         try:
 72 |             resultBody: dict = json.loads(response)["resultBody"]
 73 | 
 74 |             user_id: str = resultBody["id"]
 75 |             accessToken: str = resultBody["accessToken"]
 76 |             return user_id, accessToken
 77 |         except Exception as e:
 78 |             raise Exception(e)
 79 | 
 80 |     # 获取uuid
 81 |     @staticmethod
 82 |     def get_uuid(user_id: str, book_id: str) -> str:
 83 |         uuid: str = None
 84 |         uuid_url: str = "https://wkobwp.sciencereading.cn/api/file/add"
 85 |         params: Template = Template(
 86 |             '{"params": {"userId": "$user_id","file": "http://159.226.241.32:81/$book_id.pdf"}}'
 87 |         )
 88 |         data: dict = {"params": params.substitute(user_id=user_id, book_id=book_id)}
 89 |         response: str = requests.post(
 90 |             uuid_url, data=data, headers=headers
 91 |         ).content.decode("UTF-8")
 92 |         if response != '':
 93 |             result: str = json.loads(response)["result"]
 94 |             if result != 'OutOfFileSizeLimit':
 95 |                 uuid = result
 96 |         return uuid
 97 | 
 98 |     # 获取书名、ISBN及目录
 99 |     @staticmethod
100 |     def get_book_info(book_id: str) -> Tuple[str, int, list]:
101 |         book_name: str = None
102 |         catalog_list: list = []
103 |         book_name_url: str = (
104 |             "https://book.sciencereading.cn/shop/book/Booksimple/show.do?id={}".format(
105 |                 book_id
106 |             )
107 |         )
108 |         response: str = requests.get(book_name_url, headers=headers).content.decode(
109 |             'UTF-8'
110 |         )
111 |         if response != '':
112 |             soup: BeautifulSoup = BeautifulSoup(response, 'html.parser')
113 | 
114 |             book_name: str = soup.select(
115 |                 'body > div:nth-child(3) > div > div > div > div.row > div.col-md-8.col-sm-7 > div.book_detail_title > span > b:nth-child(1)'
116 |             )[0].text
117 | 
118 |             book_ISBN: int = int(
119 |                 etree.HTML(str(soup))
120 |                 .xpath(
121 |                     "/html/body/div[1]/div/div/div/div[1]/div[2]/div[3]/div[2]/div[2]/span"
122 |                 )[0]
123 |                 .text
124 |             )
125 | 
126 |             pattern: re.Pattern = re.compile(
127 |                 r'"pId":"(.*?)".*?"name":"(.*?)".*?bookPageNum=(\d+)'
128 |             )
129 |             catalog_list = re.findall(pattern, response)
130 | 
131 |             symbol_pattern: re.Pattern = re.compile(r'\W+')
132 |             book_name = re.sub(symbol_pattern, '_', book_name)
133 | 
134 |         return book_name, book_ISBN, catalog_list
135 | 


--------------------------------------------------------------------------------
/src/Processor/Handler.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Description: PDF处理相关
  3 | Author: Senkita
  4 | Date: 2022-02-18 21:49:30
  5 | LastEditors: Senkita
  6 | LastEditTime: 2022-03-17 23:06:46
  7 | '''
  8 | import os
  9 | import time
 10 | import shutil
 11 | from PIL import Image
 12 | from PyPDF2 import PdfFileReader as reader, PdfFileWriter as writer
 13 | from src.UI.Interface import Interface
 14 | from src.Processor.Crawler import Crawler
 15 | from src.Tools.Logger import Logger
 16 | from src.Tools.Config import time_break
 17 | from src.Tools.Tools import catalog_grading
 18 | 
 19 | # 这个rich库需要自己装一下，用于进度条显示
 20 | # from rich.progress import track
 21 | 
 22 | 
 23 | class Handler:
 24 |     def __init__(
 25 |         self, book_id: str, scaling: int = 150, keep_pic_folder: bool = False
 26 |     ) -> None:
 27 |         self.keep_pic_folder: bool = keep_pic_folder
 28 | 
 29 |         self.spider: Crawler = Crawler(book_id, scaling)
 30 | 
 31 |         self.logger: Logger = Logger(book_id)
 32 | 
 33 |         self.file_name_list: list = []
 34 | 
 35 |         self.book_name, self.book_ISBN, self.catalog_list = self.spider.get_book_info(
 36 |             book_id
 37 |         )
 38 |         self.dir_name: str = "./{}".format(book_id)
 39 |         self.pic_list: list = []
 40 | 
 41 |         self.page_num: int = self.spider.get_page_num()
 42 |         self.progress_window: Interface = Interface().progress_display(
 43 |             self.page_num * 2
 44 |         )
 45 |         self.progress_bar = self.progress_window['progress_bar']
 46 |         self.percentage = self.progress_window['percentage']
 47 | 
 48 |     # 文件名排序
 49 |     def list_file(self) -> None:
 50 |         for file_name in os.listdir(self.dir_name):
 51 |             if file_name[-4:] == ".png":
 52 |                 self.file_name_list.append(file_name[:-4])
 53 | 
 54 |         self.file_name_list.sort(key=lambda ele: int(ele))
 55 | 
 56 |     # 拼接为PDF
 57 |     def generate_pdf(self) -> None:
 58 |         try:
 59 |             pdf: Image.Image = Image.open(
 60 |                 "{}/{}.png".format(self.dir_name, self.file_name_list[0])
 61 |             )
 62 |         except Exception:
 63 |             self.logger.warning("首页下载有误，重试中...")
 64 |             time.sleep(time_break)
 65 |             self.spider.download_png(0)
 66 |             return self.generate_pdf()
 67 | 
 68 |         self.file_name_list.pop(0)
 69 |         self.progress_bar.update_bar(self.page_num + 1)
 70 | 
 71 |         # for pic_no in track(self.file_name_list, description="生成PDF中，请稍候..."):
 72 |         for pic_no in self.file_name_list:
 73 |             progress_event, _ = self.progress_window.read(timeout=time_break)
 74 |             if progress_event == '取消' or progress_event is None:
 75 |                 self.progress_window.close()
 76 |                 os._exit(0)
 77 | 
 78 |             self.add_png(pic_no)
 79 | 
 80 |             progress: int = self.page_num + self.file_name_list.index(pic_no) + 2
 81 |             self.progress_bar.UpdateBar(progress)
 82 |             self.percentage.update(
 83 |                 '{:.3}%'.format(progress / (self.page_num * 2) * 100)
 84 |             )
 85 | 
 86 |         pdf.save(
 87 |             "./{}.pdf".format(self.book_ISBN),
 88 |             "PDF",
 89 |             resolution=100.0,
 90 |             quality=100,
 91 |             subsampling=0,
 92 |             save_all=True,
 93 |             append_images=self.pic_list,
 94 |         )
 95 | 
 96 |     # 添加页面
 97 |     def add_png(self, pic_no: int) -> None:
 98 |         try:
 99 |             img: Image.Image = Image.open("{}/{}.png".format(self.dir_name, pic_no))
100 |             if img.mode == "RGBA":
101 |                 img = img.convert("RGB")
102 |             self.pic_list.append(img)
103 |         except Exception:
104 |             self.logger.warning("图片{}.png下载有误，重试中...".format(pic_no))
105 |             time.sleep(time_break)
106 |             self.spider.download_png(pic_no)
107 |             return self.add_png(pic_no)
108 | 
109 |     # 添加书签
110 |     def add_bookmark(self) -> None:
111 |         input_pdf: reader = reader("./{}.pdf".format(self.book_ISBN))
112 |         output_pdf: writer = writer()
113 | 
114 |         for i in range(input_pdf.getNumPages()):
115 |             output_pdf.addPage(input_pdf.getPage(i))
116 | 
117 |         parent_set = {}
118 |         for bookmark in catalog_grading(self.catalog_list):
119 |             parent = output_pdf.addBookmark(
120 |                 bookmark[1],
121 |                 bookmark[2],
122 |                 parent=parent_set.get(bookmark[0] - 1),
123 |             )
124 |             parent_set[bookmark[0]] = parent
125 | 
126 |         with open('./{}.pdf'.format(self.book_name), 'wb') as f:
127 |             output_pdf.write(f)
128 | 
129 |     def run(self) -> None:
130 |         os.makedirs(self.dir_name, exist_ok=True)
131 | 
132 |         # for page_no in track(range(self.page_num), description="下载中，请稍候..."):
133 |         for page_no in range(self.page_num):
134 |             progress_event, _ = self.progress_window.read(timeout=time_break)
135 |             if progress_event == '取消' or progress_event is None:
136 |                 self.progress_window.close()
137 |                 os._exit(0)
138 |             self.spider.download_png(page_no)
139 | 
140 |             progress: int = page_no + 1
141 |             self.progress_bar.update_bar(progress)
142 |             self.percentage.update(
143 |                 '{:.3}%'.format(progress / (self.page_num * 2) * 100)
144 |             )
145 | 
146 |         self.list_file()
147 |         self.generate_pdf()
148 | 
149 |         self.add_bookmark()
150 | 
151 |         # 清理
152 |         if not self.keep_pic_folder:
153 |             shutil.rmtree(self.dir_name)
154 |         os.remove("./{}.pdf".format(self.book_ISBN))
155 |         self.progress_window.close()
156 | 


--------------------------------------------------------------------------------