├── .gitignore
├── assets
│   └── image-20210623210003992.png
├── default.css
├── README.md
├── LICENSE
└── crawl.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 |
3 | .vscode/
4 |
5 | *.pdf
6 |
7 | temp_content.json
--------------------------------------------------------------------------------
/assets/image-20210623210003992.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xueshanlinghu/zsxq_to_pdf/HEAD/assets/image-20210623210003992.png
--------------------------------------------------------------------------------
/default.css:
--------------------------------------------------------------------------------
1 | h1 {font-size:40px; color:red; text-align:center;}
2 | p {font-size:30px;}
3 | img{
4 | max-width:100%;
5 | margin:20px auto;
6 | height:auto;
7 | border:0;
8 | outline:0;
9 | -webkit-box-shadow: 1px 4px 16px 8px #5CA2BE;
10 | -moz-box-shadow: 1px 4px 16px 8px #5CA2BE;
11 | box-shadow: 1px 4px 16px 8px #5CA2BE;
12 | /* center the images */
13 | display: block;
14 | margin-left: auto;
15 | margin-right: auto;
16 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Crawl a Zsxq (知识星球) group and build a PDF ebook
2 |
3 |
4 | ## Features
5 |
6 | Crawls the highlights section (精华区) of a Zsxq (知识星球) group and compiles it into a PDF ebook.
7 |
8 | ## Sample output
9 |
10 | 
11 |
12 | ## Usage
13 |
14 | First install the required packages (the dependencies they need are installed automatically):
15 |
16 | ```bash
17 | pip install requests beautifulsoup4 pdfkit
18 | ```
19 |
20 | Install wkhtmltox from [https://wkhtmltopdf.org/downloads.html](https://wkhtmltopdf.org/downloads.html), then add the `bin` directory of the installation to your PATH environment variable. It is the tool that actually converts HTML into PDF.
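If you would rather not touch PATH, pdfkit can also be pointed at the wkhtmltopdf binary directly. Below is a minimal sketch, assuming a default Windows install location (adjust the path to your machine); note that `crawl.py` as shipped simply expects the binary to be on PATH, so you would have to pass this `configuration` object into the `pdfkit.from_file` call in `make_pdf` yourself:

```python
import pdfkit

# Assumed install path: change this to wherever wkhtmltopdf actually lives on your machine
config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")

# Convert one or more HTML files into a single PDF using that explicit binary
pdfkit.from_file(["0.html"], "out.pdf", configuration=config)
```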
21 |
22 | Next, open `crawl.py` and edit its configuration.
23 |
24 | Log in to an account that has access to the target group, open that group's page, and refresh it in Chrome. In the DevTools `Network` panel, find a request of the form `topics?...`; its response body is JSON.
25 |
26 | Copy that request's `cookie` into the `Cookie` field of the `headers` dict in the code, and copy the request `url` (the one whose domain starts with `api.zsxq.com`) into `start_url`. Pay attention to the `scope` parameter: `topics?scope=by_owner&count=20` fetches only the group owner's posts, while `scope=all` fetches every post in the group. Further down, the script lets you choose between fetching everything in one batch or generating one PDF per time period; adjust that part as you like (see the sketch below).
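For reference, the configuration block near the bottom of `crawl.py` looks roughly like this; the cookie and group id below are placeholders, so substitute the values from your own captured request:

```python
headers = {
    # Paste the Cookie header of the captured topics?... request here
    'Cookie': '<your zsxq cookie>',
    'User-Agent': 'Mozilla/5.0 ...',
}
# scope=by_owner fetches only the group owner's posts; scope=all fetches every post
start_url = 'https://api.zsxq.com/v2/groups/<your group id>/topics?scope=by_owner&count=20'

# One (after, before) pair per PDF; the script loops over this list and writes one file per period
time_period = [
    ("2021-04-01 00:00", "2021-06-30 23:59"),
]
```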
27 |
28 | Once that is done, just run the script:
29 |
30 | ```bash
31 | python crawl.py
32 | ```
33 |
34 | ## Acknowledgements
35 |
36 | This repository did not start with me; thanks go to the earlier authors. On top of their work I added a number of anti-stall safeguards and log output, and adapted the script to fetch by time period. This project is for learning and exchange only; do not use it for commercial purposes!
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 96chh, xingstarx, 雪山凌狐
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/crawl.py:
--------------------------------------------------------------------------------
1 | import re
2 | import requests
3 | import json
4 | import os
5 | import pdfkit
6 | from bs4 import BeautifulSoup
7 | from urllib.parse import quote
8 | from time import sleep
9 | import random
10 | import datetime
11 |
12 |
13 | def get_data(url, headers, before=None, after=None):
14 | """
15 | before 默认为None,否则请填入内容,格式为:'2021-06-31 21:00',所有小于等于该时间的才会被获取
16 | after 默认为None,否则请填入内容,格式为:'2021-05-27 20:00',所有大于等于该时间的才会被获取
17 | """
18 | global htmls, num
19 |
20 | i = 0
21 | while i < 10:
22 | rsp = requests.get(url, headers=headers)
23 | if rsp.json().get("succeeded") == False:
24 | sleep(0.01)
25 | print("访问失败,重来一遍...")
26 | rsp = requests.get(url, headers=headers)
27 | i += 1
28 | else:
29 | break
30 |
31 | with open('temp_content.json', 'w', encoding='utf-8') as f:  # write the response to temp_content.json for easy inspection
32 | f.write(json.dumps(rsp.json(), indent=2, ensure_ascii=False))
33 |
34 | with open('temp_content.json', encoding='utf-8') as f:
35 | all_contents = json.loads(f.read())
36 | contents = all_contents.get('resp_data').get('topics')
37 | if contents is not None:
38 | for topic in contents:
39 | create_time = topic.get("create_time", "")
40 | if create_time != "":
41 | create_time = create_time[:16].replace("T", " ")
42 | create_time_time = datetime.datetime.strptime(create_time, '%Y-%m-%d %H:%M')
43 | if after is not None:
44 | after_time = datetime.datetime.strptime(after, '%Y-%m-%d %H:%M')
45 | if after_time > create_time_time: continue
46 | if before is not None:
47 | before_time = datetime.datetime.strptime(before, '%Y-%m-%d %H:%M')
48 | if create_time_time > before_time: continue
49 |
50 | content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution'))))
51 | # print(content)
52 | text = content.get('text', '')
53 | text = re.sub(r'<[^>]*>', '', text).strip()
54 | text = text.replace('\n', '<br>')
55 | if text != "":
56 | pos = text.find("
")
57 | title = str(num) + " " + text[:pos]
58 | else:
59 | title = str(num) + "Error: 找不到内容"
60 |
61 | if content.get('images'):
62 | soup = BeautifulSoup(html_template, 'html.parser')
63 | for img in content.get('images'):
64 | url = img.get('large').get('url')
65 | img_tag = soup.new_tag('img', src=url)
66 | soup.body.append(img_tag)
67 | html_img = str(soup)
68 | html = html_img.format(title=title, text=text, create_time=create_time)
69 | else:
70 | html = html_template.format(title=title, text=text, create_time=create_time)
71 |
72 | if topic.get('question'):
73 | answer = topic.get('answer').get('text', "")
74 | soup = BeautifulSoup(html, 'html.parser')
75 | answer_tag = soup.new_tag('p')
76 | answer_tag.string = answer
77 | soup.body.append(answer_tag)
78 | html_answer = str(soup)
79 | html = html_answer.format(title=title, text=text, create_time=create_time)
80 |
81 | htmls.append(html)
82 |
83 | num += 1
84 | else:
85 | print("*" * 16, "访问失败", "*" * 16)
86 | print("失败url:", url)
87 | print(all_contents)
88 | print(rsp.status_code)
89 | print("*" * 40)
90 |
91 | next_page = rsp.json().get('resp_data').get('topics')
92 | if next_page:
93 | create_time = next_page[-1].get('create_time')
94 | if create_time[20:23] == "000":
95 | end_time = create_time[:20] + "999" + create_time[23:]
96 | else:
97 | res = int(create_time[20:23])-1
98 | end_time = create_time[:20] + str(res).zfill(3) + create_time[23:]  # zfill pads with leading zeros so the result is always 3 digits
99 | end_time = quote(end_time)
100 | if len(end_time) == 33:
101 | end_time = end_time[:24] + '0' + end_time[24:]
102 | next_url = start_url + '&end_time=' + end_time
103 | print("next_url:", next_url)
104 | sleep(random.randint(1, 5) / 100)
105 | get_data(next_url, headers, before, after)
106 |
107 | return htmls
108 |
109 | def make_pdf(htmls, pdf_filepath="电子书.pdf"):
110 | html_files = []
111 | for index, html in enumerate(htmls):
112 | file = str(index) + ".html"
113 | html_files.append(file)
114 | with open(file, "w", encoding="utf-8") as f:
115 | f.write(html)
116 |
117 | options = {
118 | "user-style-sheet": "default.css",
119 | "page-size": "Letter",
120 | "margin-top": "0.75in",
121 | "margin-right": "0.75in",
122 | "margin-bottom": "0.75in",
123 | "margin-left": "0.75in",
124 | "encoding": "UTF-8",
125 | "custom-header": [("Accept-Encoding", "gzip")],
126 | "cookie": [
127 | ("cookie-name1", "cookie-value1"), ("cookie-name2", "cookie-value2")
128 | ],
129 | "outline-depth": 10,
130 | }
131 | try:
132 | print("生成PDF文件中,请耐心等待...")
133 | if os.path.exists(pdf_filepath): os.remove(pdf_filepath)
134 | pdfkit.from_file(html_files, pdf_filepath, options=options)
135 | except Exception as e:
136 | print("生成pdf报错")
137 | print(e)
138 |
139 | for i in html_files:
140 | os.remove(i)
141 |
142 | print("已制作电子书在当前目录!")
143 |
144 |
145 | if __name__ == '__main__':
146 | # This template is the default; there is no need to modify it
147 | html_template = """
148 |
149 |
150 |
{create_time}
156 |{text}
157 | 158 | 159 | """ 160 | 161 | # 请先登录你有权限查看的星球的账号,进入该星球页面 162 | # 请使用谷歌浏览器刷新页面,在 Network 面板的抓包内容中找到 topics?... 这样的请求,返回的是 json 内容 163 | # 将这个包的 cookie 部分复制到 headers 部分的 Cookie 一栏 164 | # 将这个请求的 url,域名为 api.zsxq.com 开头的,复制到下面 start_url 的部分 165 | headers = { 166 | 'Cookie':'abtest_env=product; zsxq_access_token=EB72127D-2A94-A794-46FE-8E1D0F151F40_C348130420D15229; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22414445544118248%22%2C%22first_id%22%3A%2217a3722c1df65-0d10035f06881c8-e726559-2073600-17a3722c1e0421%22%2C%22props%22%3A%7B%7D%2C%22%24device_id%22%3A%2217a3722c1df65-0d10035f06881c8-e726559-2073600-17a3722c1e0421%22%7D', 167 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0' 168 | } 169 | start_url = 'https://api.zsxq.com/v2/groups/551212824514/topics?scope=by_owner&count=20' 170 | 171 | # 只取大于等于 after ,小于等于 before 的日期时间的文章,可以省略这俩参数,获取所有的历史文章 172 | # 下面这里我演示的是一段时间的拆分获取 pdf,可以批量生成多个,用于内容跨度时间长,内容非常多的星球,你可以自己看着改 173 | time_period = [ 174 | ("2021-04-01 00:00", "2021-06-30 23:59"), 175 | ("2021-01-01 00:00", "2021-03-31 23:59"), 176 | ("2020-10-01 00:00", "2020-12-31 23:59"), 177 | ("2020-07-01 00:00", "2020-09-30 23:59"), 178 | ("2020-04-01 00:00", "2020-06-30 23:59"), 179 | ("2020-01-01 00:00", "2020-03-31 23:59"), 180 | ("2019-10-01 00:00", "2019-12-31 23:59"), 181 | ("2019-07-01 00:00", "2019-09-30 23:59"), 182 | ("2019-04-01 00:00", "2019-06-30 23:59"), 183 | ("2019-01-01 00:00", "2019-03-31 23:59"), 184 | ("2018-10-01 00:00", "2018-12-31 23:59"), 185 | ("2018-07-01 00:00", "2018-09-30 23:59"), 186 | ("2018-04-01 00:00", "2018-06-30 23:59"), 187 | ] 188 | for period in time_period: 189 | pdf_filepath = "你的知识星球%s-%s.pdf" % (period[0][:10].replace("-",""), period[1][:10].replace("-","")) 190 | htmls = [] 191 | num = 1 192 | make_pdf(get_data(start_url, headers, before=period[1], after=period[0]), pdf_filepath=pdf_filepath) 193 | 194 | # 如果你想获取该星球的所有内容,请用这几句代码,但当内容较多的时候,生成 pdf 会极慢 195 | # htmls = [] 196 | # num = 1 197 | # make_pdf(get_data(start_url, headers)) 198 | --------------------------------------------------------------------------------