├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── main.py
├── pyproject.toml
└── temp.css

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Weber Snake

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ZSXQ-Spider

Crawls the content of a Zhishi Xingqiu (知识星球, [https://www.zsxq.com/](https://www.zsxq.com/)) group and turns it into a PDF e-book.

The code is adapted from [zsxq-spider](https://github.com/wbsabc/zsxq-spider).

> [!CAUTION]
> The latest API now uses Signature verification. This project has not been adapted to it, and there is currently no plan to do so.


## Features

* Downloads images and writes them into the PDF.
* Shows links inside the PDF.
* Downloads comments.
* Can be limited to digest (featured) posts or download everything.
* Supports downloading by date range.
* Uses the latest API (v2).
* Saves the output of each run in its own folder.
* Supports splitting the output into multiple PDF files.

## Environment

* Tested with Python 3.8.
* Install [wkhtmltopdf](https://wkhtmltopdf.org/downloads.html), then add its bin directory to the PATH environment variable (see the quick check below).
* Install the PDF dependency: pip install pdfkit
* Install BeautifulSoup: pip install BeautifulSoup4
* Install Requests: pip install requests
* Or use poetry install instead.
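A missing or unreachable wkhtmltopdf binary is the most common setup problem. The snippet below is only a quick sanity check (it is not part of this project, and the output file name is arbitrary): it fails immediately if pdfkit cannot find wkhtmltopdf on the PATH.

```python
# Optional sanity check (not part of main.py): raises an error if the
# wkhtmltopdf binary cannot be found on the PATH.
import pdfkit

pdfkit.from_string('<h1>wkhtmltopdf is reachable</h1>', 'wkhtmltopdf_check.pdf')
```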
## Usage

Use the following configuration as a reference:
```python
ZSXQ_ACCESS_TOKEN = '00000000-0000-0000-0000-D09322903A59_6DF24A4ED3558CD4'  # Token from the cookie after logging in (must be changed)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'  # User-Agent used when logging in (must be changed)
GROUP_ID = '123456789123'  # ID of the group on Zhishi Xingqiu
PDF_FILE_NAME = 'outfile'  # Name of the generated PDF file, without extension
PDF_MAX_PAGE_NUM = 500  # Maximum number of pages per PDF file. On Windows, generation fails above a certain page count, so adjust this value if needed
DOWNLOAD_PICS = True  # Whether to download images (True | False); downloading slows the program down
DOWNLOAD_COMMENTS = True  # Whether to download comments
ONLY_DIGESTS = False  # True - digests only | False - everything
FROM_DATE_TO_DATE = False  # Download by date range
EARLY_DATE = '2017-05-25T00:00:00.000+0800'  # Earliest date; only used when FROM_DATE_TO_DATE=True; empty means no limit; format like '2017-05-25T00:00:00.000+0800'
LATE_DATE = '2018-05-25T00:00:00.000+0800'  # Latest date; only used when FROM_DATE_TO_DATE=True; empty means no limit; format like '2017-05-25T00:00:00.000+0800'
COUNTS_PER = 30  # Number of topics to load per request; 30 is the maximum
DEBUG = False  # Debug switch
DEBUG_NUM = 120  # In debug mode, stop after this many items; consider together with COUNTS_PER
SLEEP_FLAG = True  # Whether to sleep between requests to avoid hitting the server too often
SLEEP_SEC = 5  # Number of seconds to sleep; only used when SLEEP_FLAG=True
```

Modify the corresponding parameters in main.py:
`Spider('token from the cookie after logging in', 'User-Agent used when logging in', 'ID of the group')`
and then run main.py.
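For reference, a minimal sketch of that wiring is shown below. Only the constructor signature is taken from main.py; the final, commented-out line is a placeholder, because the method that actually starts the crawl is defined further down in main.py and its name may differ.

```python
# Minimal sketch: construct the Spider with your own credentials and group ID.
from main import Spider

spider = Spider(
    access_token='token from the cookie after logging in',
    user_agent='User-Agent used when logging in',
    group_id='ID of the group',
)
# spider.run()  # placeholder name; call whatever entry method main.py defines
```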
## Notes

1. Please use this code responsibly: do not casually redistribute the generated PDFs, and respect the legitimate rights of the site and of the content authors.
2. Crawling puts extra load on the site, so do not run it frequently; use it sensibly and only when necessary. Everyone is there to learn, so be considerate of Boss Wu (the site's operator).
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import re
import requests
import json
import os
import pdfkit
import datetime
import base64
import time
import traceback
import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import quote
from urllib.parse import unquote
from urllib.error import ContentTooShortError

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
</head>
<body>
<h1>{title}</h1>
<p>{text}</p>
<p>{author}</p>
<p>{cretime}</p>
</body>
</html>
"""


class Spider:
    ZSXQ_ACCESS_TOKEN = ''  # Token from the cookie after logging in (must be changed)
    USER_AGENT = ''  # User-Agent used when logging in (must be changed)
    GROUP_ID = ''  # ID of the group on Zhishi Xingqiu
    PDF_FILE_NAME = 'output'  # Name of the generated PDF file, without extension
    PDF_MAX_PAGE_NUM = 500  # Maximum number of pages per PDF file. On Windows, generation fails above a certain page count, so adjust this value if needed
    DOWNLOAD_PICS = True  # Whether to download images (True | False); downloading slows the program down
    DOWNLOAD_COMMENTS = True  # Whether to download comments
    ONLY_DIGESTS = False  # True - digests only | False - everything
    FROM_DATE_TO_DATE = False  # Download by date range
    EARLY_DATE = ''  # Earliest date; only used when FROM_DATE_TO_DATE=True; empty means no limit; format like '2017-05-25T00:00:00.000+0800'
    LATE_DATE = ''  # Latest date; only used when FROM_DATE_TO_DATE=True; empty means no limit; format like '2017-05-25T00:00:00.000+0800'
    COUNTS_PER = 30  # Number of topics to load per request; 30 is the maximum
    DEBUG = False  # Debug switch
    DEBUG_NUM = 120  # In debug mode, stop after this many items; consider together with COUNTS_PER
    SLEEP_FLAG = True  # Whether to sleep between requests to avoid hitting the server too often
    SLEEP_SEC = 5  # Number of seconds to sleep; only used when SLEEP_FLAG=True

    OVER_DATE_BREAK = False
    htmls_file = []
    num = 1
    output_dir = ''
    html_output_dir = ''
    image_output_dir = ''
    data_output_dir = ''
    start_url = ''
    headers = {}
    pdf_options = None

    def __init__(self, access_token=None, user_agent=None, group_id=None):
        self.ZSXQ_ACCESS_TOKEN = access_token or self.ZSXQ_ACCESS_TOKEN
        self.USER_AGENT = user_agent or self.USER_AGENT
        self.GROUP_ID = group_id or self.GROUP_ID
        self.headers = {
            'Cookie': 'abtest_env=product;zsxq_access_token=' + self.ZSXQ_ACCESS_TOKEN,
            'User-Agent': self.USER_AGENT,
            'accept': 'application/json, text/plain, */*',
            'sec-ch-ua-platform': '"Windows"',
            'origin': 'https://wx.zsxq.com',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
            'referer': 'https://wx.zsxq.com/',
            'dnt': '1',
        }
        self.pdf_options = {
            "page-size": "A4",
            "margin-top": "0.35in",
            "margin-right": "0.65in",
            "margin-bottom": "0.35in",
            "margin-left": "0.65in",
            "encoding": "UTF-8",
            "custom-header": [("Accept-Encoding", "gzip")],
            "cookie": [],
            "outline-depth": 10,
        }

    def get_url_data(self, url):
        rsp = requests.get(url, headers=self.headers)
        rsp_data = rsp.json()

        if not rsp_data.get('succeeded'):
            # Error code 1059 means the requests are too frequent; back off and retry
            if rsp_data.get('code') == 1059:
                if self.SLEEP_FLAG:
                    time.sleep(self.SLEEP_SEC)
                return self.get_url_data(url)
            raise Exception('访问错误:\n' + json.dumps(rsp_data, indent=2, ensure_ascii=False))
        else:
            return rsp_data.get('resp_data')

    def get_data(self, url):
        rsp_data = self.get_url_data(url)
        self.save_data_json(self.COUNTS_PER, self.num, rsp_data)
        topics = rsp_data.get('topics')
        for topic in topics:
            # Stop once topics become older than EARLY_DATE when a date range is configured
            if self.FROM_DATE_TO_DATE and self.EARLY_DATE.strip():
                if topic.get('create_time') < self.EARLY_DATE.strip():
                    self.OVER_DATE_BREAK = True
                    break

            content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution'))))

            anonymous = content.get('anonymous')
            if anonymous:
                author = '匿名用户'
            else:
                author = content.get('owner').get('name')

            cretime = (topic.get('create_time')[:23]).replace('T', ' ')

            text = content.get('text', '')
            # Skip unwanted posts
            # if text.strip().startswith(u'') or text.find(u'') != -1:
            #     continue
            text = self.handle_link(text)
            title = str(self.num) + '_' + cretime[:16]
            if topic.get('digested') == True:
                title += ' {精华}'

            if self.DOWNLOAD_PICS and content.get('images'):
                soup = BeautifulSoup(HTML_TEMPLATE, 'html.parser')
                images_index = 0
                _images = content.get('images')
                print(f'Crawling images: {len(_images)}')
                for img in _images:
                    url = img.get('large').get('url')
                    local_url = os.path.join(self.image_output_dir, f'{self.num}_{images_index}.jpg')
                    images_index += 1
                    self.download_image(url, local_url)
                    # img_tag = soup.new_tag('img', src=local_url)
                    # Writing the local path directly may fail to embed the image in the PDF,
                    # so the encoded image data (see encode_image) is written instead
                    img_tag = soup.new_tag('img', src=self.encode_image(local_url))
                    soup.body.append(img_tag)
                html_img = str(soup)
                html = html_img.format(title=title, text=text, author=author, cretime=cretime)
            else:
                html = HTML_TEMPLATE.format(title=title, text=text, author=author, cretime=cretime)

            if topic.get('question'):
                answer_author = topic.get('answer').get('owner').get('name', '')
                answer = topic.get('answer').get('text', "")
                answer = self.handle_link(answer)

                soup = BeautifulSoup(html, 'html.parser')
                answer_tag = soup.new_tag('p')

                answer = '【' + answer_author + '】 回答: