├── README.md
├── crawler.py
├── weixin.csv
├── weixin.py
└── zsxq.py

/README.md:
--------------------------------------------------------------------------------
# Crawler
Crawl cnblogs (博客园) blog posts / batch-export WeChat Official Account articles / export Zhishi Xingqiu (知识星球) digest topics

We often want to save the articles of a WeChat Official Account we follow as PDF files, both for offline reading and so that posts remain available even if they are later deleted. To do that, this project batch-exports the account's articles to PDF, implemented in Python.

### Export the account's article links and other metadata to a CSV file

First install the Web Scraper extension for Chrome and use it to crawl the articles of the Official Account you are interested in, exporting the result as a CSV file. The file stores each article's title, publish time and link. Look up the extension's usage details yourself.

### Batch-export the articles in the CSV from the first step to PDF

First install the wkhtmltopdf command-line tool, then run a script that reads the CSV file from the previous step and exports each article to PDF. Because WeChat protects its images against hotlinking, converting an article URL to PDF directly produces files with missing images, so the fetched HTML has to be processed (the lazy-loading `data-src` attributes are rewritten to `src`) before it is converted. The core code is as follows:
```python
def process(item):
    url = str(item[2])
    name = item[1] + item[0] + '.pdf'
    response = requests.get(url)
    html = response.text
    # WeChat lazy-loads images via data-src; rewrite to src so wkhtmltopdf can fetch them
    html = html.replace('data-src', 'src')

    try:
        pdfkit.from_string(html, name)
    except Exception:
        pass

with open("weixin.csv", "r", encoding="gbk") as f:
    f_csv = csv.reader(f)
    next(f_csv)  # skip the header row
    pool = ThreadPool(processes=20)
    pool.map(process, (i for i in f_csv))
    pool.close()
```

A thread pool is used to speed up the PDF generation; a local test exported 90+ articles per minute.
* [Blog post](https://www.cnblogs.com/wzf-Learning/p/11153963.html)
* ![3c9095308e58b7c90aace2fe02a1e90](https://user-images.githubusercontent.com/16174175/223343159-551f8954-3153-4c96-9aae-dd8fbcdfca97.jpg)

--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
import re
import urllib.request

def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read().decode("utf8")
    return html

def getArticleUrl(html):
    # match post URLs belonging to the given blogger, e.g. https://www.cnblogs.com/<name>/p/12345.html
    reg = r'(https://www\.cnblogs\.com/' + blog_name + r'/p/[0-9]+\.html)'
    articleUrl = re.findall(reg, html)
    return articleUrl

blog_name = input("Enter the blogger's cnblogs nickname: ")
article = []
for i in range(1, 10):
    html = getHtml("http://www.cnblogs.com/" + blog_name + "/default.html?page=" + str(i))
    articleUrl = getArticleUrl(html)
    if len(articleUrl) == 0:
        break
    article += articleUrl

article = list(set(article))
print(article)
--------------------------------------------------------------------------------
/weixin.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leehomfans/Crawler/ff5bf12196369d63dfb4ae05df98fcfd6f161824/weixin.csv
--------------------------------------------------------------------------------
/weixin.py:
--------------------------------------------------------------------------------
import csv
import pdfkit
import requests
from multiprocessing.pool import ThreadPool

def process(item):
    # item is one row of weixin.csv; item[2] holds the article url,
    # item[1] + item[0] is used as the output file name
    url = str(item[2])
    name = item[1] + item[0] + '.pdf'
    response = requests.get(url)
    print(response.status_code)
    html = response.text
    # WeChat lazy-loads images via data-src; rewrite to src so the images survive conversion
    html = html.replace('data-src', 'src')

    try:
        pdfkit.from_string(html, name)
    except Exception:
        pass

with open("weixin.csv", "r", encoding="gbk") as f:
    f_csv = csv.reader(f)
    next(f_csv)  # skip the header row
    pool = ThreadPool(processes=20)  # convert up to 20 articles concurrently
    pool.map(process, (i for i in f_csv))
    pool.close()
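
# Optional helper -- a minimal sketch of how process() above could pass an
# explicit wkhtmltopdf location and a couple of converter options to pdfkit.
# pdfkit drives the external wkhtmltopdf binary that the README asks you to
# install; if it is not on PATH, pdfkit raises an OSError complaining that no
# wkhtmltopdf executable was found. The Windows path used as the default
# argument below is only an assumption for illustration -- point it at your
# own installation before using this.
def to_pdf_with_config(html, name,
                       wkhtmltopdf_path=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"):
    config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)  # explicit binary path
    options = {"encoding": "UTF-8", "quiet": ""}                 # force UTF-8, suppress converter output
    pdfkit.from_string(html, name, configuration=config, options=options)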
--------------------------------------------------------------------------------
/zsxq.py:
--------------------------------------------------------------------------------
import re
import requests
import json
import os
import pdfkit
from bs4 import BeautifulSoup
from urllib.parse import quote

# HTML skeleton each topic is rendered into; {title} and {text} are filled in below
html_template = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<h1>{title}</h1>
<p>{text}</p>
</body>
</html>
"""

htmls = []
num = 0

def get_data(url):

    global htmls, num

    headers = {
        'Cookie': 'zsxq_access_token=F59F4329-5D05-D087-724E-A424A7DD3814',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    rsp = requests.get(url, headers=headers)
    rsp.encoding = 'gb2312'
    jsonContent = json.dumps(rsp.json(), indent=2, ensure_ascii=False)
    # print(jsonContent)
    with open('test.json', 'w', encoding='utf-8') as f:  # dump the raw response to test.json for inspection
        f.writelines(jsonContent)

    with open('test.json', encoding='utf-8') as f:
        for topic in json.loads(f.read()).get('resp_data').get('topics'):
            content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution'))))
            # print(content)
            text = content.get('text', '')
            text = re.sub(r'<[^>]*>', '', text).strip()
            text = text.replace('\n', '<br>')
            title = str(num) + text[:9]
            num += 1

            if content.get('images'):
                soup = BeautifulSoup(html_template, 'html.parser')
                for img in content.get('images'):
                    url = img.get('large').get('url')
                    img_tag = soup.new_tag('img', src=url)
                    soup.body.append(img_tag)
                html_img = str(soup)
                html = html_img.format(title=title, text=text)
            else:
                html = html_template.format(title=title, text=text)

            if topic.get('question'):
                answer = topic.get('answer').get('text', "")
                soup = BeautifulSoup(html, 'html.parser')
                answer_tag = soup.new_tag('p')
                answer_tag.string = answer
                soup.body.append(answer_tag)
                html_answer = str(soup)
                html = html_answer.format(title=title, text=text)

            htmls.append(html)

    # paginate: request the next batch of topics strictly older than the last one returned,
    # by setting end_time to the last create_time minus one millisecond
    next_page = rsp.json().get('resp_data').get('topics')
    if next_page:
        create_time = next_page[-1].get('create_time')
        if create_time[20:23] == "000":
            end_time = create_time[:20] + "999" + create_time[23:]
        else:
            res = int(create_time[20:23]) - 1
            end_time = create_time[:20] + str(res).zfill(3) + create_time[23:]  # zfill pads the milliseconds back to 3 digits
        end_time = quote(end_time)
        if len(end_time) == 33:
            end_time = end_time[:24] + '0' + end_time[24:]
        next_url = start_url + '&end_time=' + end_time
        print(next_url)
        get_data(next_url)

    return htmls

def make_pdf(htmls):
    html_files = []
    for index, html in enumerate(htmls):
        if html.strip():
            file = str(index) + ".html"
            html_files.append(file)
            with open(file, "w", encoding="utf-8") as f:
                f.write(html)

    options = {
        "user-style-sheet": "test.css",
        "page-size": "Letter",
        "margin-top": "0.75in",
        "margin-right": "0.75in",
        "margin-bottom": "0.75in",
        "margin-left": "0.75in",
        "encoding": "UTF-8",
        "custom-header": [("Accept-Encoding", "gzip")],
        "cookie": [
            ("cookie-name1", "cookie-value1"), ("cookie-name2", "cookie-value2")
        ],
        "outline-depth": 10,
    }

    try:
        # convert the generated HTML files in batches of 1000
        count = len(html_files) // 1000
        for num in range(count + 1):
            if num