├── .gitignore
├── assets
│   └── image-20210623210003992.png
├── default.css
├── README.md
├── LICENSE
└── crawl.py

/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 |
3 | .vscode/
4 |
5 | *.pdf
6 |
7 | temp_content.json
--------------------------------------------------------------------------------
/assets/image-20210623210003992.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xueshanlinghu/zsxq_to_pdf/HEAD/assets/image-20210623210003992.png
--------------------------------------------------------------------------------
/default.css:
--------------------------------------------------------------------------------
1 | h1 {font-size:40px; color:red; text-align:center;}
2 | p {font-size:30px;}
3 | img{
4 |     max-width:100%;
5 |     margin:20px auto;
6 |     height:auto;
7 |     border:0;
8 |     outline:0;
9 |     -webkit-box-shadow: 1px 4px 16px 8px #5CA2BE;
10 |     -moz-box-shadow: 1px 4px 16px 8px #5CA2BE;
11 |     box-shadow: 1px 4px 16px 8px #5CA2BE;
12 |     /* center the images */
13 |     display: block;
14 |     margin-left: auto;
15 |     margin-right: auto;
16 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Crawl a Zhishi Xingqiu (知识星球) group and turn it into a PDF e-book
2 |
3 |
4 | ## Features
5 |
6 | Crawls the digest (精华) area of a Zhishi Xingqiu group and compiles it into a PDF e-book.
7 |
8 | ## Screenshot
9 |
10 | ![image-20210623210003992](assets/image-20210623210003992.png)
11 |
12 | ## Usage
13 |
14 | First install the required packages (any packages they depend on are installed automatically):
15 |
16 | ```bash
17 | pip install requests beautifulsoup4 pdfkit
18 | ```
19 |
20 | Install wkhtmltox from [https://wkhtmltopdf.org/downloads.html](https://wkhtmltopdf.org/downloads.html), then add the `bin` directory of the install location to your PATH environment variable. This provides wkhtmltopdf, which is used to convert HTML to PDF (see the sketch after this README if you prefer to point pdfkit at the binary instead of editing PATH).
21 |
22 | Next, open `crawl.py` and edit the configuration.
23 |
24 | Log in with an account that has access to the group you want to export and open that group's page. Refresh the page in Chrome and, in the `Network` panel, find a request named like `topics?...`; its response is `json`.
25 |
26 | Copy the `cookie` of that request into the `Cookie` field of the `headers` section in the code, and copy the request `url` (the one whose domain starts with `api.zsxq.com`) into `start_url`. Pay attention to the `scope` parameter: `topics?scope=by_owner&count=20` fetches only the group owner's posts, while `scope=all` fetches every post in the group. Further down in the code you can choose between fetching the whole group at once or generating one PDF per time period; adjust that part as you see fit.
27 |
28 | Once configured, run the script:
29 |
30 | ```bash
31 | python crawl.py
32 | ```
33 |
34 | ## Acknowledgements
35 |
36 | This repository was not originally created by me; thanks go to the previous authors. On top of their work I added a number of anti-hang safeguards and log output, and changed the script to support fetching by time period. This project is for learning and exchange only; do not use it for commercial purposes!
--------------------------------------------------------------------------------
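If the wkhtmltopdf `bin` directory is not on PATH, pdfkit can also be pointed at the binary explicitly. A minimal sketch, assuming a Windows-style install path (the path below is a placeholder, not something this repository ships):

```python
import pdfkit

# Placeholder path -- replace with wherever wkhtmltopdf.exe (or wkhtmltopdf on Linux/macOS) actually lives.
WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"

config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)

# Same kind of call that make_pdf() in crawl.py issues, but with an explicit configuration object.
pdfkit.from_file(["0.html"], "test.pdf", options={"encoding": "UTF-8"}, configuration=config)
```

crawl.py itself calls `pdfkit.from_file()` without a `configuration=` argument, so it relies on wkhtmltopdf being discoverable on PATH; using the approach above would require a small edit to `make_pdf()`.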
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 96chh, xingstarx, 雪山凌狐
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/crawl.py:
--------------------------------------------------------------------------------
1 | import re
2 | import requests
3 | import json
4 | import os
5 | import pdfkit
6 | from bs4 import BeautifulSoup
7 | from urllib.parse import quote
8 | from time import sleep
9 | import random
10 | import datetime
11 |
12 |
13 | def get_data(url, headers, before=None, after=None):
14 |     """
15 |     before defaults to None; otherwise pass a string such as '2021-06-30 21:00' -- only posts created at or before that time are fetched
16 |     after defaults to None; otherwise pass a string such as '2021-05-27 20:00' -- only posts created at or after that time are fetched
17 |     """
18 |     global htmls, num
19 |
20 |     i = 0
21 |     while i < 10:
22 |         rsp = requests.get(url, headers=headers)
23 |         if rsp.json().get("succeeded") == False:
24 |             sleep(0.01)
25 |             print("Request failed, retrying...")
26 |             rsp = requests.get(url, headers=headers)
27 |             i += 1
28 |         else:
29 |             break
30 |
31 |     with open('temp_content.json', 'w', encoding='utf-8') as f:  # dump the raw response to temp_content.json for easy inspection
32 |         f.write(json.dumps(rsp.json(), indent=2, ensure_ascii=False))
33 |
34 |     with open('temp_content.json', encoding='utf-8') as f:
35 |         all_contents = json.loads(f.read())
36 |     contents = all_contents.get('resp_data').get('topics')
37 |     if contents is not None:
38 |         for topic in contents:
39 |             create_time = topic.get("create_time", "")
40 |             if create_time != "":
41 |                 create_time = create_time[:16].replace("T", " ")
42 |                 create_time_time = datetime.datetime.strptime(create_time, '%Y-%m-%d %H:%M')
43 |                 if after is not None:
44 |                     after_time = datetime.datetime.strptime(after, '%Y-%m-%d %H:%M')
45 |                     if after_time > create_time_time: continue
46 |                 if before is not None:
47 |                     before_time = datetime.datetime.strptime(before, '%Y-%m-%d %H:%M')
48 |                     if create_time_time > before_time: continue
49 |
50 |             content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution'))))
51 |             # print(content)
52 |             text = content.get('text', '')
53 |             text = re.sub(r'<[^>]*>', '', text).strip()
54 |             text = text.replace('\n', '<br>')
55 |             if text != "":
56 |                 pos = text.find("<br>")
57 |                 title = str(num) + " " + text[:pos]
58 |             else:
59 |                 title = str(num) + "Error: no content found"
60 |
61 |             if content.get('images'):
62 |                 soup = BeautifulSoup(html_template, 'html.parser')
63 |                 for img in content.get('images'):
64 |                     url = img.get('large').get('url')
65 |                     img_tag = soup.new_tag('img', src=url)
66 |                     soup.body.append(img_tag)
67 |                 html_img = str(soup)
68 |                 html = html_img.format(title=title, text=text, create_time=create_time)
69 |             else:
70 |                 html = html_template.format(title=title, text=text, create_time=create_time)
71 |
72 |             if topic.get('question'):
73 |                 answer = topic.get('answer').get('text', "")
74 |                 soup = BeautifulSoup(html, 'html.parser')
75 |                 answer_tag = soup.new_tag('p')
76 |                 answer_tag.string = answer
77 |                 soup.body.append(answer_tag)
78 |                 html_answer = str(soup)
79 |                 html = html_answer.format(title=title, text=text, create_time=create_time)
80 |
81 |             htmls.append(html)
82 |
83 |             num += 1
84 |     else:
85 |         print("*" * 16, "request failed", "*" * 16)
86 |         print("failed url:", url)
87 |         print(all_contents)
88 |         print(rsp.status_code)
89 |         print("*" * 40)
90 |
91 |     next_page = rsp.json().get('resp_data').get('topics')
92 |     if next_page:
93 |         create_time = next_page[-1].get('create_time')  # paginate: request topics older than the last one on this page
94 |         if create_time[20:23] == "000":
95 |             end_time = create_time[:20] + "999" + create_time[23:]
96 |         else:
97 |             res = int(create_time[20:23]) - 1
98 |             end_time = create_time[:20] + str(res).zfill(3) + create_time[23:]  # zfill pads with leading zeros so the millisecond part always has 3 digits
99 |         end_time = quote(end_time)
100 |         if len(end_time) == 33:
101 |             end_time = end_time[:24] + '0' + end_time[24:]
102 |         next_url = start_url + '&end_time=' + end_time
103 |         print("next_url:", next_url)
104 |         sleep(random.randint(1, 5) / 100)
105 |         get_data(next_url, headers, before, after)
106 |
107 |     return htmls
108 |
109 | def make_pdf(htmls, pdf_filepath="电子书.pdf"):
110 |     html_files = []
111 |     for index, html in enumerate(htmls):
112 |         file = str(index) + ".html"
113 |         html_files.append(file)
114 |         with open(file, "w", encoding="utf-8") as f:
115 |             f.write(html)
116 |
117 |     options = {
118 |         "user-style-sheet": "default.css",
119 |         "page-size": "Letter",
120 |         "margin-top": "0.75in",
121 |         "margin-right": "0.75in",
122 |         "margin-bottom": "0.75in",
123 |         "margin-left": "0.75in",
124 |         "encoding": "UTF-8",
125 |         "custom-header": [("Accept-Encoding", "gzip")],
126 |         "cookie": [
127 |             ("cookie-name1", "cookie-value1"), ("cookie-name2", "cookie-value2")
128 |         ],
129 |         "outline-depth": 10,
130 |     }
131 |     try:
132 |         print("Generating the PDF file, please be patient...")
133 |         if os.path.exists(pdf_filepath): os.remove(pdf_filepath)
134 |         pdfkit.from_file(html_files, pdf_filepath, options=options)
135 |     except Exception as e:
136 |         print("Error while generating the PDF")
137 |         print(e)
138 |
139 |     for i in html_files:
140 |         os.remove(i)
141 |
142 |     print("The e-book has been created in the current directory!")
143 |
144 |
145 | if __name__ == '__main__':
146 |     # this is the default template; no need to modify it
147 |     html_template = """
148 | <!DOCTYPE html>
149 | <html lang="en">
150 | <head>
151 |     <meta charset="UTF-8">
152 | </head>
153 | <body>
154 | <h1>{title}</h1>
155 | <p>{create_time}</p>
156 | <p>{text}</p>
157 | </body>
158 | </html>
159 | """
160 |
161 |     # Log in with an account that has access to the group you want to export and open the group's page
162 |     # Refresh the page in Chrome and, in the Network panel, find a request like topics?... whose response is JSON
163 |     # Copy that request's cookie into the Cookie field of headers below
164 |     # Copy the request url (the one whose domain starts with api.zsxq.com) into start_url below
165 |     headers = {
166 |         'Cookie': 'abtest_env=product; zsxq_access_token=EB72127D-2A94-A794-46FE-8E1D0F151F40_C348130420D15229; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22414445544118248%22%2C%22first_id%22%3A%2217a3722c1df65-0d10035f06881c8-e726559-2073600-17a3722c1e0421%22%2C%22props%22%3A%7B%7D%2C%22%24device_id%22%3A%2217a3722c1df65-0d10035f06881c8-e726559-2073600-17a3722c1e0421%22%7D',
167 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0'
168 |     }
169 |     start_url = 'https://api.zsxq.com/v2/groups/551212824514/topics?scope=by_owner&count=20'
170 |
171 |     # Only posts with a timestamp >= after and <= before are fetched; omit both parameters to fetch the entire history
172 |     # Below I demonstrate splitting the export into time periods, generating one PDF per period -- useful for groups with a long history and a lot of content; adjust as needed
173 |     time_period = [
174 |         ("2021-04-01 00:00", "2021-06-30 23:59"),
175 |         ("2021-01-01 00:00", "2021-03-31 23:59"),
176 |         ("2020-10-01 00:00", "2020-12-31 23:59"),
177 |         ("2020-07-01 00:00", "2020-09-30 23:59"),
178 |         ("2020-04-01 00:00", "2020-06-30 23:59"),
179 |         ("2020-01-01 00:00", "2020-03-31 23:59"),
180 |         ("2019-10-01 00:00", "2019-12-31 23:59"),
181 |         ("2019-07-01 00:00", "2019-09-30 23:59"),
182 |         ("2019-04-01 00:00", "2019-06-30 23:59"),
183 |         ("2019-01-01 00:00", "2019-03-31 23:59"),
184 |         ("2018-10-01 00:00", "2018-12-31 23:59"),
185 |         ("2018-07-01 00:00", "2018-09-30 23:59"),
186 |         ("2018-04-01 00:00", "2018-06-30 23:59"),
187 |     ]
188 |     for period in time_period:
189 |         pdf_filepath = "你的知识星球%s-%s.pdf" % (period[0][:10].replace("-", ""), period[1][:10].replace("-", ""))
190 |         htmls = []
191 |         num = 1
192 |         make_pdf(get_data(start_url, headers, before=period[1], after=period[0]), pdf_filepath=pdf_filepath)
193 |
194 |     # If you want everything in the group, use these lines instead; note that PDF generation becomes very slow when there is a lot of content
195 |     # htmls = []
196 |     # num = 1
197 |     # make_pdf(get_data(start_url, headers))
198 |
--------------------------------------------------------------------------------
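The hard-coded `time_period` list in crawl.py can also be generated instead of typed out by hand. A minimal sketch; the `quarter_periods` helper below is illustrative only and not part of the repository:

```python
import datetime

def quarter_periods(first_year, last_year):
    """Return (after, before) string pairs, one per calendar quarter, newest first."""
    periods = []
    for year in range(first_year, last_year + 1):
        for month in (1, 4, 7, 10):
            start = datetime.datetime(year, month, 1)
            # First minute of the next quarter, stepped back one minute to land on "23:59" of the last day.
            if month == 10:
                next_quarter = datetime.datetime(year + 1, 1, 1)
            else:
                next_quarter = datetime.datetime(year, month + 3, 1)
            end = next_quarter - datetime.timedelta(minutes=1)
            periods.append((start.strftime("%Y-%m-%d %H:%M"), end.strftime("%Y-%m-%d %H:%M")))
    return sorted(periods, reverse=True)

# Example: roughly the same ranges as the list hard-coded in crawl.py.
for after, before in quarter_periods(2018, 2021):
    print(after, "->", before)
```

The resulting strings use the same `'%Y-%m-%d %H:%M'` format that `get_data()` parses, so they plug straight into its `before=` and `after=` arguments.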