├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── main.py
├── pyproject.toml
└── temp.css

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Weber Snake

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ZSXQ-Spider

Crawls the content of a Zhishi Xingqiu (知识星球, [https://www.zsxq.com/](https://www.zsxq.com/)) group and turns it into a PDF e-book.

The code is adapted from [zsxq-spider](https://github.com/wbsabc/zsxq-spider).

> [!CAUTION]
> The latest API now uses Signature verification. This project has not been adapted to it, and there is currently no plan to do so.


## Features

* Downloads images and writes them into the PDF.
* Shows links inside the PDF.
* Downloads comments.
* Can be limited to digest (featured) posts or download everything.
* Supports downloading by date range.
* Uses the latest API (v2).
* Saves the output of each run in its own folder.
* Supports splitting the output into multiple PDF files.

## Environment

* Tested with Python 3.8.
* Install [wkhtmltopdf](https://wkhtmltopdf.org/downloads.html), then add its bin directory to the PATH environment variable (see the quick check below).
* Install the PDF dependency: pip install pdfkit
* Install BeautifulSoup: pip install BeautifulSoup4
* Install Requests: pip install requests
* Or use poetry install instead.
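A missing or unreachable wkhtmltopdf binary is the most common setup problem. The snippet below is only a quick sanity check (it is not part of this project, and the output file name is arbitrary): it fails immediately if pdfkit cannot find wkhtmltopdf on the PATH.

```python
# Optional sanity check (not part of main.py): raises an error if the
# wkhtmltopdf binary cannot be found on the PATH.
import pdfkit

pdfkit.from_string('<h1>wkhtmltopdf is reachable</h1>', 'wkhtmltopdf_check.pdf')
```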
## Usage

Use the following configuration as a reference:
```python
ZSXQ_ACCESS_TOKEN = '00000000-0000-0000-0000-D09322903A59_6DF24A4ED3558CD4'  # Token from the cookie after logging in (must be changed)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'  # User-Agent used when logging in (must be changed)
GROUP_ID = '123456789123'  # ID of the group on Zhishi Xingqiu
PDF_FILE_NAME = 'outfile'  # Name of the generated PDF file, without extension
PDF_MAX_PAGE_NUM = 500  # Maximum number of pages per PDF file. On Windows, generation fails above a certain page count, so adjust this value if needed
DOWNLOAD_PICS = True  # Whether to download images (True | False); downloading slows the program down
DOWNLOAD_COMMENTS = True  # Whether to download comments
ONLY_DIGESTS = False  # True - digests only | False - everything
FROM_DATE_TO_DATE = False  # Download by date range
EARLY_DATE = '2017-05-25T00:00:00.000+0800'  # Earliest date; only used when FROM_DATE_TO_DATE=True; empty means no limit; format like '2017-05-25T00:00:00.000+0800'
LATE_DATE = '2018-05-25T00:00:00.000+0800'  # Latest date; only used when FROM_DATE_TO_DATE=True; empty means no limit; format like '2017-05-25T00:00:00.000+0800'
COUNTS_PER = 30  # Number of topics to load per request; 30 is the maximum
DEBUG = False  # Debug switch
DEBUG_NUM = 120  # In debug mode, stop after this many items; consider together with COUNTS_PER
SLEEP_FLAG = True  # Whether to sleep between requests to avoid hitting the server too often
SLEEP_SEC = 5  # Number of seconds to sleep; only used when SLEEP_FLAG=True
```

Modify the corresponding parameters in main.py:
`Spider('token from the cookie after logging in', 'User-Agent used when logging in', 'ID of the group')`
and then run main.py.
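For reference, a minimal sketch of that wiring is shown below. Only the constructor signature is taken from main.py; the final, commented-out line is a placeholder, because the method that actually starts the crawl is defined further down in main.py and its name may differ.

```python
# Minimal sketch: construct the Spider with your own credentials and group ID.
from main import Spider

spider = Spider(
    access_token='token from the cookie after logging in',
    user_agent='User-Agent used when logging in',
    group_id='ID of the group',
)
# spider.run()  # placeholder name; call whatever entry method main.py defines
```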
## Notes

1. Please use this code responsibly: do not casually redistribute the generated PDFs, and respect the legitimate rights of the site and of the content authors.
2. Crawling puts extra load on the site, so do not run it frequently; use it sensibly and only when necessary. Everyone is there to learn, so be considerate of Boss Wu (the site's operator).
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import re
import requests
import json
import os
import pdfkit
import datetime
import base64
import time
import traceback
import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import quote
from urllib.parse import unquote
from urllib.error import ContentTooShortError

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
</head>
<body>
<h1>{title}</h1>
<p>{text}</p>
<p>{author}</p>
<p>{cretime}</p>
</body>
</html>
"""


class Spider:
    ZSXQ_ACCESS_TOKEN = ''  # Token from the cookie after logging in (must be changed)
    USER_AGENT = ''  # User-Agent used when logging in (must be changed)
    GROUP_ID = ''  # ID of the group on Zhishi Xingqiu
    PDF_FILE_NAME = 'output'  # Name of the generated PDF file, without extension
    PDF_MAX_PAGE_NUM = 500  # Maximum number of pages per PDF file. On Windows, generation fails above a certain page count, so adjust this value if needed
    DOWNLOAD_PICS = True  # Whether to download images (True | False); downloading slows the program down
    DOWNLOAD_COMMENTS = True  # Whether to download comments
    ONLY_DIGESTS = False  # True - digests only | False - everything
    FROM_DATE_TO_DATE = False  # Download by date range
    EARLY_DATE = ''  # Earliest date; only used when FROM_DATE_TO_DATE=True; empty means no limit; format like '2017-05-25T00:00:00.000+0800'
    LATE_DATE = ''  # Latest date; only used when FROM_DATE_TO_DATE=True; empty means no limit; format like '2017-05-25T00:00:00.000+0800'
    COUNTS_PER = 30  # Number of topics to load per request; 30 is the maximum
    DEBUG = False  # Debug switch
    DEBUG_NUM = 120  # In debug mode, stop after this many items; consider together with COUNTS_PER
    SLEEP_FLAG = True  # Whether to sleep between requests to avoid hitting the server too often
    SLEEP_SEC = 5  # Number of seconds to sleep; only used when SLEEP_FLAG=True

    OVER_DATE_BREAK = False
    htmls_file = []
    num = 1
    output_dir = ''
    html_output_dir = ''
    image_output_dir = ''
    data_output_dir = ''
    start_url = ''
    headers = {}
    pdf_options = None

    def __init__(self, access_token=None, user_agent=None, group_id=None):
        self.ZSXQ_ACCESS_TOKEN = access_token or self.ZSXQ_ACCESS_TOKEN
        self.USER_AGENT = user_agent or self.USER_AGENT
        self.GROUP_ID = group_id or self.GROUP_ID
        self.headers = {
            'Cookie': 'abtest_env=product;zsxq_access_token=' + self.ZSXQ_ACCESS_TOKEN,
            'User-Agent': self.USER_AGENT,
            'accept': 'application/json, text/plain, */*',
            'sec-ch-ua-platform': '"Windows"',
            'origin': 'https://wx.zsxq.com',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
            'referer': 'https://wx.zsxq.com/',
            'dnt': '1',
        }
        self.pdf_options = {
            "page-size": "A4",
            "margin-top": "0.35in",
            "margin-right": "0.65in",
            "margin-bottom": "0.35in",
            "margin-left": "0.65in",
            "encoding": "UTF-8",
            "custom-header": [("Accept-Encoding", "gzip")],
            "cookie": [],
            "outline-depth": 10,
        }

    def get_url_data(self, url):
        rsp = requests.get(url, headers=self.headers)
        rsp_data = rsp.json()

        if not rsp_data.get('succeeded'):
            # Error code 1059 means the requests are too frequent; back off and retry
            if rsp_data.get('code') == 1059:
                if self.SLEEP_FLAG:
                    time.sleep(self.SLEEP_SEC)
                return self.get_url_data(url)
            raise Exception('访问错误:\n' + json.dumps(rsp_data, indent=2, ensure_ascii=False))
        else:
            return rsp_data.get('resp_data')

    def get_data(self, url):
        rsp_data = self.get_url_data(url)
        self.save_data_json(self.COUNTS_PER, self.num, rsp_data)
        topics = rsp_data.get('topics')
        for topic in topics:
            # Stop once topics become older than EARLY_DATE when a date range is configured
            if self.FROM_DATE_TO_DATE and self.EARLY_DATE.strip():
                if topic.get('create_time') < self.EARLY_DATE.strip():
                    self.OVER_DATE_BREAK = True
                    break

            content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution'))))

            anonymous = content.get('anonymous')
            if anonymous:
                author = '匿名用户'
            else:
                author = content.get('owner').get('name')

            cretime = (topic.get('create_time')[:23]).replace('T', ' ')

            text = content.get('text', '')
            # Skip unwanted posts
            # if text.strip().startswith(u'') or text.find(u'') != -1:
            #     continue
            text = self.handle_link(text)
            title = str(self.num) + '_' + cretime[:16]
            if topic.get('digested') == True:
                title += ' {精华}'

            if self.DOWNLOAD_PICS and content.get('images'):
                soup = BeautifulSoup(HTML_TEMPLATE, 'html.parser')
                images_index = 0
                _images = content.get('images')
                print(f'Crawling images: {len(_images)}')
                for img in _images:
                    url = img.get('large').get('url')
                    local_url = os.path.join(self.image_output_dir, f'{self.num}_{images_index}.jpg')
                    images_index += 1
                    self.download_image(url, local_url)
                    # img_tag = soup.new_tag('img', src=local_url)
                    # Writing the local path directly may fail to embed the image in the PDF,
                    # so the encoded image data (see encode_image) is written instead
                    img_tag = soup.new_tag('img', src=self.encode_image(local_url))
                    soup.body.append(img_tag)
                html_img = str(soup)
                html = html_img.format(title=title, text=text, author=author, cretime=cretime)
            else:
                html = HTML_TEMPLATE.format(title=title, text=text, author=author, cretime=cretime)

            if topic.get('question'):
                answer_author = topic.get('answer').get('owner').get('name', '')
                answer = topic.get('answer').get('text', "")
                answer = self.handle_link(answer)

                soup = BeautifulSoup(html, 'html.parser')
                answer_tag = soup.new_tag('p')

                answer = '【' + answer_author + '】 回答: