├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── main.py ├── pyproject.toml └── temp.css /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 | dmypy.json 112 | 113 | # Pyre type checker 114 | .pyre/ 115 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Weber Snake 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZSXQ-Spider 2 | 爬取知识星球内容,并制作成PDF电子书。 3 | 4 | 代码修改自:[zsxq-spider](https://github.com/wbsabc/zsxq-spider) 5 | 6 | 爬取知识星球,并制作 PDF 电子书。[https://www.zsxq.com/](https://www.zsxq.com/) 7 | 8 | > [!CAUTION] 9 | > 最新接口已使用Signature验证,暂未适配,暂无适配计划 10 | 11 | 12 | ## 功能 13 | 14 | * 支持下载图片并写入 PDF。 15 | * 支持 PDF 中显示链接。 16 | * 支持下载评论。 17 | * 可控制只下载精华内容或下载全部内容。 18 | * 支持按时间区间下载。 19 | * ![New](https://via.placeholder.com/10/f03c15/000000?text=+) 使用最新接口(v2)。 20 | * ![New](https://via.placeholder.com/10/f03c15/000000?text=+) 每次运行结果保存为单独文件夹。 21 | * ![New](https://via.placeholder.com/10/f03c15/000000?text=+) 支持分片输出PDF。 22 | 23 | ## 环境 24 | 25 | * Python 3.8 测试通过。 26 | * 安装 [wkhtmltopdf](https://wkhtmltopdf.org/downloads.html) ,安装后将 bin 目录加入到环境变量。 27 | * 安装相应依赖:pip install pdfkit 28 | * 安装 BeautifulSoup:pip install BeautifulSoup4 29 | * 安装 Requests:pip install requests 30 | * ![New](https://via.placeholder.com/10/f03c15/000000?text=+) 或者使用poetry install 31 | 32 | 33 | ## 用法 34 | 35 | 参考以下配置内容 36 | ```python 37 | ZSXQ_ACCESS_TOKEN = '00000000-0000-0000-0000-D09322903A59_6DF24A4ED3558CD4' # 登录后Cookie中的Token(必须修改) 38 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0' # 登录时使用的User-Agent(必须修改) 39 | GROUP_ID = '123456789123' # 知识星球中的小组ID 40 | PDF_FILE_NAME = 'outfile' # 生成的PDF文件名,不带后缀 41 | PDF_MAX_PAGE_NUM = 500 # 单个PDF文件最大的页面数。windows下超过一定数量的页面会生成失败,所以需要调整此值 42 | DOWNLOAD_PICS = True # 是否下载图片 True | False 下载会导致程序变慢 43 | DOWNLOAD_COMMENTS = True # 是否下载评论 44 | ONLY_DIGESTS = False # True-只精华 | False-全部 45 | FROM_DATE_TO_DATE = False # 按时间区间下载 46 | EARLY_DATE = '2017-05-25T00:00:00.000+0800' # 最早时间 当FROM_DATE_TO_DATE=True时生效 为空表示不限制 形如'2017-05-25T00:00:00.000+0800' 47 | LATE_DATE = '2018-05-25T00:00:00.000+0800' # 最晚时间 当FROM_DATE_TO_DATE=True时生效 为空表示不限制 形如'2017-05-25T00:00:00.000+0800' 48 | COUNTS_PER_TIME = 30 # 每次请求加载几个主题 最大可设置为30 49 | DEBUG = False # DEBUG开关 50 | DEBUG_NUM = 120 # DEBUG时 跑多少条数据后停止 需与COUNTS_PER_TIME结合考虑 51 | SLEEP_FLAG = True # 请求之间是否SLEEP避免请求过于频繁 52 | SLEEP_SEC = 5 # SLEEP秒数 SLEEP_FLAG=True时生效 53 | ``` 54 | 55 | 修改main.py文件中的相应参数 56 | `Spider('登录后Cookie中的Token', '登录时使用的User-Agent', '知识星球中的小组ID')` 57 | 然后运行main.py。 58 | 59 | ## 说明 60 | 61 | 1. 请大家合理使用本代码,不要随意传播生成的PDF,保护网站及作者的合法权益。 62 | 2. 爬虫会对网站性能造成一定影响,请勿频繁使用,在必要时合理使用,大家都是去学习知识的,体谅一下吴老板。 63 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import json 4 | import os 5 | import pdfkit 6 | import datetime 7 | import base64 8 | import time 9 | import traceback 10 | import urllib.request 11 | from bs4 import BeautifulSoup 12 | from urllib.parse import quote 13 | from urllib.parse import unquote 14 | from urllib.error import ContentTooShortError 15 | 16 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 17 | HTML_TEMPLATE = """ 18 | 19 | 20 | 21 | 22 |

<h1>{title}</h1> 23 | <p>{author} - {cretime}</p> 24 | <p>{text}</p>
25 | 26 | 27 | """ 28 | 29 | 30 | class Spider: 31 | ZSXQ_ACCESS_TOKEN = '' # 登录后Cookie中的Token(必须修改) 32 | USER_AGENT = '' # 登录时使用的User-Agent(必须修改) 33 | GROUP_ID = '' # 知识星球中的小组ID 34 | PDF_FILE_NAME = 'output' # 生成的PDF文件名,不带后缀 35 | PDF_MAX_PAGE_NUM = 500 # 单个PDF文件最大的页面数。windows下超过一定数量的页面会生成失败,所以需要调整此值 36 | DOWNLOAD_PICS = True # 是否下载图片 True | False ;下载会导致程序变慢 37 | DOWNLOAD_COMMENTS = True # 是否下载评论 38 | ONLY_DIGESTS = False # True-只精华 | False-全部 39 | FROM_DATE_TO_DATE = False # 按时间区间下载 40 | EARLY_DATE = '' # 最早时间 当FROM_DATE_TO_DATE=True时生效 为空表示不限制 形如'2017-05-25T00:00:00.000+0800' 41 | LATE_DATE = '' # 最晚时间 当FROM_DATE_TO_DATE=True时生效 为空表示不限制 形如'2017-05-25T00:00:00.000+0800' 42 | COUNTS_PER = 30 # 每次请求加载几个主题 最大可设置为30 43 | DEBUG = False # DEBUG开关 44 | DEBUG_NUM = 120 # DEBUG时 跑多少条数据后停止 需与COUNTS_PER结合考虑 45 | SLEEP_FLAG = True # 请求之间是否SLEEP避免请求过于频繁 46 | SLEEP_SEC = 5 # SLEEP秒数 SLEEP_FLAG=True时生效 47 | 48 | OVER_DATE_BREAK = False 49 | htmls_file = [] 50 | num = 1 51 | output_dir = '' 52 | html_output_dir = '' 53 | image_output_dir = '' 54 | data_output_dir = '' 55 | start_url = '' 56 | headers = {} 57 | pdf_options = None 58 | 59 | def __init__(self, access_token=None, user_agent=None, group_id=None): 60 | self.ZSXQ_ACCESS_TOKEN = access_token or self.ZSXQ_ACCESS_TOKEN 61 | self.USER_AGENT = user_agent or self.USER_AGENT 62 | self.GROUP_ID = group_id or self.GROUP_ID 63 | self.headers = { 64 | 'Cookie': 'abtest_env=product;zsxq_access_token=' + self.ZSXQ_ACCESS_TOKEN, 65 | 'User-Agent': self.USER_AGENT, 66 | 'accept': 'application/json, text/plain, */*', 67 | 'sec-ch-ua-platform': '"Windows"', 68 | 'origin': 'https://wx.zsxq.com', 69 | 'sec-fetch-site': 'same-site', 70 | 'sec-fetch-mode': 'cors', 71 | 'sec-fetch-dest': 'empty', 72 | 'sec-ch-ua-mobile': '?0', 73 | 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"', 74 | 'referer': 'https://wx.zsxq.com/', 75 | 'dnt': '1', 76 | } 77 | self.pdf_options = { 78 | "page-size": "A4", 79 | "margin-top": "0.35in", 80 | "margin-right": "0.65in", 81 | "margin-bottom": "0.35in", 82 | "margin-left": "0.65in", 83 | "encoding": "UTF-8", 84 | "custom-header": [("Accept-Encoding", "gzip")], 85 | "cookie": [], 86 | "outline-depth": 10, 87 | } 88 | 89 | def get_url_data(self, url): 90 | rsp = requests.get(url, headers=self.headers) 91 | rsp_data = rsp.json() 92 | 93 | if not rsp_data.get('succeeded'): 94 | if rsp_data.get('code') == 1059: 95 | if self.SLEEP_FLAG: 96 | time.sleep(self.SLEEP_SEC) 97 | return self.get_url_data(url) 98 | raise Exception('访问错误:\n' + json.dumps(rsp_data, indent=2, ensure_ascii=False)) 99 | else: 100 | return rsp_data.get('resp_data') 101 | 102 | def get_data(self, url): 103 | rsp_data = self.get_url_data(url) 104 | self.save_data_json(self.COUNTS_PER, self.num, rsp_data) 105 | topics = rsp_data.get('topics') 106 | for topic in topics: 107 | if self.FROM_DATE_TO_DATE and self.EARLY_DATE.strip(): 108 | if topic.get('create_time') < self.EARLY_DATE.strip(): 109 | self.OVER_DATE_BREAK = True 110 | break 111 | 112 | content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution')))) 113 | 114 | anonymous = content.get('anonymous') 115 | if anonymous: 116 | author = '匿名用户' 117 | else: 118 | author = content.get('owner').get('name') 119 | 120 | cretime = (topic.get('create_time')[:23]).replace('T', ' ') 121 | 122 | text = content.get('text', '') 123 | # 排除不需要的文章 124 | # if text.strip().startswith(u'') or text.find(u'') != -1: 125 | # continue 126 | text = self.handle_link(text) 127 | title = 
str(self.num) + '_' + cretime[:16] 128 | if topic.get('digested') == True: 129 | title += ' {精华}' 130 | 131 | if self.DOWNLOAD_PICS and content.get('images'): 132 | soup = BeautifulSoup(HTML_TEMPLATE, 'html.parser') 133 | images_index = 0 134 | _images = content.get('images') 135 | print(f'Crawling images: {len(_images)}') 136 | for img in _images: 137 | url = img.get('large').get('url') 138 | local_url = os.path.join(self.image_output_dir, f'{self.num}_{images_index}.jpg') 139 | images_index += 1 140 | self.download_image(url, local_url) 141 | # img_tag = soup.new_tag('img', src=local_url) 142 | # 直接写入路径可能无法正常将图片写入PDF,此处写入转码后的图片数据 143 | img_tag = soup.new_tag('img', src=self.encode_image(local_url)) 144 | soup.body.append(img_tag) 145 | html_img = str(soup) 146 | html = html_img.format(title=title, text=text, author=author, cretime=cretime) 147 | else: 148 | html = HTML_TEMPLATE.format(title=title, text=text, author=author, cretime=cretime) 149 | 150 | if topic.get('question'): 151 | answer_author = topic.get('answer').get('owner').get('name', '') 152 | answer = topic.get('answer').get('text', "") 153 | answer = self.handle_link(answer) 154 | 155 | soup = BeautifulSoup(html, 'html.parser') 156 | answer_tag = soup.new_tag('p') 157 | 158 | answer = '【' + answer_author + '】 回答:
<br>' + answer 159 | soup_temp = BeautifulSoup(answer, 'html.parser') 160 | answer_tag.append(soup_temp) 161 | 162 | soup.body.append(answer_tag) 163 | html = str(soup) 164 | 165 | files = content.get('files') 166 | if files: 167 | files_content = '文件列表(需访问网站下载) :
<br>' 168 | for f in files: 169 | files_content += f.get('name') + '
<br>' 170 | files_content += '<br>
' 171 | soup = BeautifulSoup(html, 'html.parser') 172 | files_tag = soup.new_tag('p') 173 | soup_temp = BeautifulSoup(files_content, 'html.parser') 174 | files_tag.append(soup_temp) 175 | soup.body.append(files_tag) 176 | html = str(soup) 177 | 178 | comments = topic.get('show_comments') 179 | if self.DOWNLOAD_COMMENTS and comments: 180 | soup = BeautifulSoup(html, 'html.parser') 181 | hr_tag = soup.new_tag('hr') 182 | soup.body.append(hr_tag) 183 | for comment in comments: 184 | if comment.get('repliee'): 185 | comment_str = '[' + comment.get('owner').get('name') + ' 回复 ' + comment.get('repliee').get('name') + '] : ' + self.handle_link(comment.get('text')) 186 | else: 187 | comment_str = '[' + comment.get('owner').get('name') + '] : ' + self.handle_link(comment.get('text')) 188 | 189 | comment_tag = soup.new_tag('p') 190 | soup_temp = BeautifulSoup(comment_str, 'html.parser') 191 | comment_tag.append(soup_temp) 192 | soup.body.append(comment_tag) 193 | html = str(soup) 194 | 195 | file_name = self.save_html(self.num, html) 196 | self.num += 1 197 | self.htmls_file.append(file_name) 198 | 199 | # DEBUG 仅导出部分数据时使用 200 | if self.DEBUG and self.num >= self.DEBUG_NUM: 201 | return self.htmls_file 202 | 203 | if self.OVER_DATE_BREAK: 204 | return self.htmls_file 205 | 206 | if topics: 207 | create_time = topics[-1].get('create_time') 208 | if create_time[20:23] == "000": 209 | end_time = create_time[:20] + "999" + create_time[23:] 210 | str_date_time = end_time[:19] 211 | delta = datetime.timedelta(seconds=1) 212 | date_time = datetime.datetime.strptime(str_date_time, '%Y-%m-%dT%H:%M:%S') 213 | date_time = date_time - delta 214 | str_date_time = date_time.strftime('%Y-%m-%dT%H:%M:%S') 215 | end_time = str_date_time + end_time[19:] 216 | else: 217 | res = int(create_time[20:23]) - 1 218 | end_time = create_time[:20] + str(res).zfill(3) + create_time[23:] # zfill 函数补足结果前面的零,始终为3位数 219 | end_time = quote(end_time) 220 | if len(end_time) == 33: 221 | end_time = end_time[:24] + '0' + end_time[24:] 222 | next_url = self.start_url + '&end_time=' + end_time 223 | if self.SLEEP_FLAG: 224 | time.sleep(self.SLEEP_SEC) 225 | print(f'Next url: {next_url}') 226 | self.get_data(next_url) 227 | 228 | return self.htmls_file 229 | 230 | def encode_image(self, image_url): 231 | with open(image_url, "rb") as image_file: 232 | encoded_string = base64.b64encode(image_file.read()) 233 | return 'data:image/png;base64,' + encoded_string.decode('utf-8') 234 | 235 | def download_image(self, url, local_url): 236 | try: 237 | urllib.request.urlretrieve(url, local_url) 238 | except ContentTooShortError: 239 | print('Network not good. 
Reloading ' + url) 240 | self.download_image(url, local_url) 241 | 242 | def handle_link(self, text): 243 | soup = BeautifulSoup(text, "html.parser") 244 | 245 | mention = soup.find_all('e', attrs={'type': 'mention'}) 246 | if len(mention): 247 | for m in mention: 248 | mention_name = m.attrs['title'] 249 | new_tag = soup.new_tag('span') 250 | new_tag.string = mention_name 251 | m.replace_with(new_tag) 252 | 253 | hashtag = soup.find_all('e', attrs={'type': 'hashtag'}) 254 | if len(hashtag): 255 | for tag in hashtag: 256 | tag_name = unquote(tag.attrs['title']) 257 | new_tag = soup.new_tag('span') 258 | new_tag.string = tag_name 259 | tag.replace_with(new_tag) 260 | 261 | links = soup.find_all('e', attrs={'type': 'web'}) 262 | if len(links): 263 | for link in links: 264 | title = unquote(link.attrs['title']) 265 | href = unquote(link.attrs['href']) 266 | new_a_tag = soup.new_tag('a', href=href) 267 | new_a_tag.string = title 268 | link.replace_with(new_a_tag) 269 | 270 | text = str(soup) 271 | text = re.sub(r'<e [^>]*>', '', text).strip() 272 | text = text.replace('\n', '<br>
') 273 | return text 274 | 275 | def _make_pdf(self, html_files): 276 | if len(html_files) > self.PDF_MAX_PAGE_NUM: 277 | _html_files = html_files 278 | html_files = [_html_files[i:i + self.PDF_MAX_PAGE_NUM] for i in range(0, len(_html_files), self.PDF_MAX_PAGE_NUM)] 279 | else: 280 | html_files = [html_files] 281 | self.pdf_options['user-style-sheet'] = str(self.get_dir_path('temp.css')) 282 | try: 283 | for i, files in enumerate(html_files, start=1): 284 | pdfkit.from_file(files, os.path.join(self.output_dir, f'{self.PDF_FILE_NAME}_{i}.pdf'), options=self.pdf_options, verbose=True) 285 | print("电子书生成成功!") 286 | except Exception as e: 287 | print("电子书生成失败:\n" + traceback.format_exc()) 288 | 289 | def generate_pdf(self, html_files): 290 | self._make_pdf(html_files) 291 | 292 | def regenerate_pdf(self, dir_name): 293 | self.output_dir = self.get_dir_path(dir_name) 294 | html_dir = os.path.join(self.output_dir, 'html') 295 | html_files = os.listdir(html_dir) 296 | html_files.sort(key=lambda x: int(x[:-5])) 297 | html_files = [os.path.join(html_dir, i) for i in html_files if i.endswith('.html')] 298 | self._make_pdf(html_files) 299 | 300 | def generate_merge_pdf(self, dir_name): 301 | # 多个html合并成一个html后,再生成PDF,生成时间会比较长,会分页混乱 302 | output_dir = self.get_dir_path(dir_name) 303 | html_dir = os.path.join(output_dir, 'html') 304 | html_files = os.listdir(html_dir) 305 | html_files.sort(key=lambda x: int(x[:-5])) 306 | html_files = [os.path.join(html_dir, i) for i in html_files if i.endswith('.html')] 307 | single_html = os.path.join(output_dir, 'single.html') 308 | with open(single_html, 'w+', encoding='utf-8') as f: 309 | for i in html_files: 310 | with open(i, 'r+', encoding='utf-8') as fr: 311 | f.write(fr.read() + '\n\n') 312 | try: 313 | pdfkit.from_file(single_html, os.path.join(output_dir, self.PDF_FILE_NAME + '.pdf'), options=self.pdf_options, css=self.get_dir_path('temp.css'), verbose=True) 314 | print("电子书生成成功!") 315 | except Exception as e: 316 | print("电子书生成失败:\n" + traceback.format_exc()) 317 | 318 | def save_html(self, num, data): 319 | file_name = os.path.join(self.html_output_dir, f'{num}.html') 320 | with open(file_name, 'w+', encoding='utf-8') as f: 321 | f.write(data) 322 | return file_name 323 | 324 | def save_data_json(self, counts_per, num, data, url=None): 325 | url = f'# {url}\n\n' if url else '' 326 | with open(os.path.join(self.data_output_dir, f'{num}_{counts_per}.json'), 'w+', encoding='utf-8') as f: 327 | f.write(url + json.dumps(data, indent=2, ensure_ascii=False)) 328 | 329 | def get_dir_path(self, *paths): 330 | return os.path.join(BASE_DIR, *paths) 331 | 332 | def mkdir(self): 333 | self.output_dir = self.get_dir_path(time.strftime("%Y-%m-%d.%H%M%S", time.localtime())) 334 | self.html_output_dir = os.path.join(self.output_dir, 'html') 335 | self.image_output_dir = os.path.join(self.output_dir, 'image') 336 | self.data_output_dir = os.path.join(self.output_dir, 'data') 337 | os.mkdir(self.output_dir) 338 | os.mkdir(self.html_output_dir) 339 | os.mkdir(self.image_output_dir) 340 | os.mkdir(self.data_output_dir) 341 | 342 | def run(self): 343 | self.htmls_file = [] 344 | self.num = 1 345 | self.mkdir() 346 | if self.ONLY_DIGESTS: 347 | self.start_url = 'https://api.zsxq.com/v2/groups/' + self.GROUP_ID + '/topics?scope=digests&count=' + str(self.COUNTS_PER) 348 | else: 349 | self.start_url = 'https://api.zsxq.com/v2/groups/' + self.GROUP_ID + '/topics?scope=all&count=' + str(self.COUNTS_PER) 350 | 351 | url = self.start_url 352 | if self.FROM_DATE_TO_DATE and 
self.LATE_DATE.strip(): 353 | url = self.start_url + '&end_time=' + quote(self.LATE_DATE.strip()) 354 | print(f'Start Url: {url}') 355 | self.get_data(url) 356 | print(f'Generating PDF...') 357 | self.generate_pdf(self.htmls_file) 358 | 359 | 360 | if __name__ == '__main__': 361 | _ = Spider('登录后Cookie中的Token', '登录时使用的User-Agent', '知识星球中的小组ID') 362 | _.run() 363 | # _.regenerate_pdf('2022-02-01.000000') 364 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "zsxqproject" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Doscript"] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.8" 9 | requests = "^2.27.1" 10 | pdfkit = "^1.0.0" 11 | beautifulsoup4 = "^4.10.0" 12 | 13 | [tool.poetry.dev-dependencies] 14 | 15 | [build-system] 16 | requires = ["poetry-core>=1.0.0"] 17 | build-backend = "poetry.core.masonry.api" 18 | -------------------------------------------------------------------------------- /temp.css: -------------------------------------------------------------------------------- 1 | body, p, img { 2 | page-break-inside: avoid; 3 | } 4 | h1 { 5 | font-size: 32px; 6 | color: red; 7 | text-align: center; 8 | } 9 | 10 | p { 11 | font-size: 20px; 12 | } 13 | 14 | img { 15 | max-width: 95%; 16 | margin: 20px auto; 17 | height: auto; 18 | border: 0; 19 | outline: 0; 20 | -webkit-box-shadow: 1px 1px 18px 4px #b4b4b4; 21 | -moz-box-shadow: 1px 1px 18px 4px #b4b4b4; 22 | box-shadow: 1px 1px 18px 4px #b4b4b4; 23 | /*set the images aligned*/ 24 | display: block; 25 | } --------------------------------------------------------------------------------
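Usage sketch (not part of the repository): the usage (用法) section of the README says to edit the `Spider('登录后Cookie中的Token', ...)` call at the bottom of main.py and run that file. A minimal alternative, assuming main.py is on the import path, is to drive the class from a small runner script. The file name `run_spider.py`, the token, User-Agent, group ID, dates, and the timestamped folder name below are all placeholders, not real values.

```python
# run_spider.py - hypothetical runner for the Spider class defined in main.py.
# All credential-looking values below are placeholders and must be replaced.
from main import Spider

spider = Spider(
    access_token='00000000-0000-0000-0000-000000000000_XXXXXXXXXXXXXXXX',  # zsxq_access_token from the logged-in cookie
    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) placeholder UA',  # User-Agent used when logging in
    group_id='123456789123',                                                # group ID of the target planet
)

# Optional: override the class-level settings documented in the README.
spider.PDF_FILE_NAME = 'my_planet'   # output PDF name, without extension
spider.ONLY_DIGESTS = True           # crawl digest (精华) topics only
spider.FROM_DATE_TO_DATE = True      # restrict crawling to a date window
spider.EARLY_DATE = '2022-01-01T00:00:00.000+0800'  # stop once topics are older than this
spider.LATE_DATE = ''                # empty string means no upper bound

# Crawl, save the html/image/data folders under a timestamped directory, then build the PDF(s).
spider.run()

# To rebuild the PDF from an earlier run without re-crawling, pass that run's folder name:
# spider.regenerate_pdf('2022-02-01.000000')
```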