├── README.md
├── crawler.py
├── weixin.csv
├── weixin.py
└── zsxq.py

/README.md:
--------------------------------------------------------------------------------
# Crawler
Crawl cnblogs (博客园) articles / batch-export WeChat Official Account (公众号) articles to PDF / export ZSXQ (知识星球) best-of topics.

We often want to save articles from a WeChat Official Account we follow as PDF files, both for offline reading and so that articles remain available even if they are later deleted. This project uses Python to batch-export an account's articles to PDF.

### Export the account's article titles, dates, and links to a CSV file

Install the Web Scraper extension for Chrome and use it to crawl the target account's article list, exporting a CSV file that stores each article's title, publication time, and link. See the extension's own documentation for usage details.

### Batch-export the articles in that CSV to PDF

Install the wkhtmltopdf command-line tool, then run a script that reads the CSV from the previous step and converts each article to PDF. Because WeChat lazy-loads images as an anti-hotlinking measure (the real image URL sits in `data-src`), converting an article URL directly produces PDFs with missing images, so the fetched HTML is rewritten before conversion. The core code:

```python
def process(item):
    """Convert one CSV row (title, time, link) into a PDF file."""
    url = str(item[2])
    name = item[1] + item[0] + '.pdf'
    response = requests.get(url)
    html = response.text
    # WeChat keeps the real image URL in data-src; switch it to src
    # so wkhtmltopdf can download the images when rendering.
    html = html.replace('data-src', 'src')

    try:
        pdfkit.from_string(html, name)
    except Exception:
        pass

with open("weixin.csv", "r", encoding="gbk") as f:
    f_csv = csv.reader(f)
    next(f_csv)  # skip the header row
    pool = ThreadPool(processes=20)
    pool.map(process, f_csv)
    pool.close()
```

A thread pool speeds up PDF generation; in local testing it exports 90+ articles per minute. If wkhtmltopdf is not on the PATH, pdfkit has to be pointed at the binary explicitly; a configuration sketch follows weixin.py below.

* [Blog post](https://www.cnblogs.com/wzf-Learning/p/11153963.html)

--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
import re
import urllib.request


def getHtml(url):
    """Download a page and return its HTML as text."""
    page = urllib.request.urlopen(url)
    html = page.read().decode("utf8")
    return html


def getArticleUrl(html):
    """Extract all of the blogger's article URLs from a listing page."""
    reg = r'(https://www\.cnblogs\.com/' + blog_name + r'/p/[0-9]+\.html)'
    return re.findall(reg, html)


blog_name = input("Enter the blogger's name: ")
article = []
for i in range(1, 10):
    # Listing pages are numbered ?page=1, ?page=2, ...; stop at the first empty page.
    html = getHtml("http://www.cnblogs.com/" + blog_name + "/default.html?page=" + str(i))
    articleUrl = getArticleUrl(html)
    if len(articleUrl) == 0:
        break
    article += articleUrl

article = list(set(article))  # de-duplicate
print(article)

--------------------------------------------------------------------------------
/weixin.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leehomfans/Crawler/ff5bf12196369d63dfb4ae05df98fcfd6f161824/weixin.csv

--------------------------------------------------------------------------------
/weixin.py:
--------------------------------------------------------------------------------
import csv
import pdfkit
import requests
from multiprocessing.pool import ThreadPool


def process(item):
    """Convert one CSV row (title, time, link) into a PDF file."""
    url = str(item[2])
    name = item[1] + item[0] + '.pdf'
    response = requests.get(url)
    print(response.status_code)
    html = response.text
    # WeChat keeps the real image URL in data-src; switch it to src
    # so wkhtmltopdf can download the images when rendering.
    html = html.replace('data-src', 'src')

    try:
        pdfkit.from_string(html, name)
    except Exception:
        pass


with open("weixin.csv", "r", encoding="gbk") as f:
    f_csv = csv.reader(f)
    next(f_csv)  # skip the header row
    pool = ThreadPool(processes=20)
    pool.map(process, f_csv)
    pool.close()
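Both the README snippet and weixin.py index CSV rows positionally, so the column order in weixin.csv matters: per the README, the columns are title, publication time, and link, with a header row that gets skipped. The row below is a hypothetical illustration of that layout, not data from the actual file.

```python
# Hypothetical row matching the column order the scripts assume
# (title, publication time, article link); the header row is skipped by next(f_csv).
row = ["示例文章标题", "2019-07-10", "https://mp.weixin.qq.com/s/xxxxxxxx"]

title, publish_time, link = row           # item[0], item[1], item[2] in process()
pdf_name = publish_time + title + '.pdf'  # matches name = item[1] + item[0] + '.pdf'
```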
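The scripts above assume the wkhtmltopdf binary is on the PATH. If pdfkit cannot find it, the binary location can be supplied explicitly; a minimal sketch, with a hypothetical install path that must be adjusted to the local installation:

```python
import pdfkit

# Hypothetical location of the wkhtmltopdf binary -- adjust to your installation.
config = pdfkit.configuration(wkhtmltopdf=r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe')

# Pass the configuration (and any wkhtmltopdf options) on each conversion call.
pdfkit.from_string('<h1>hello</h1>', 'hello.pdf',
                   configuration=config,
                   options={'encoding': 'UTF-8'})
```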
--------------------------------------------------------------------------------
/zsxq.py:
--------------------------------------------------------------------------------
import re
import requests
import json
import os
import pdfkit
from bs4 import BeautifulSoup
from urllib.parse import quote

# Minimal HTML wrapper used to render each topic before PDF conversion.
html_template = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<h2>{text}</h2>
</body>
</html>
"""

htmls = []
num = 0


def get_data(url):
    """Fetch one page of topics from the ZSXQ API and render each topic to HTML."""
    global htmls, num

    headers = {
        'Cookie': 'zsxq_access_token=F59F4329-5D05-D087-724E-A424A7DD3814',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    rsp = requests.get(url, headers=headers)
    rsp.encoding = 'gb2312'
    jsonContent = json.dumps(rsp.json(), indent=2, ensure_ascii=False)
    # print(jsonContent)
    with open('test.json', 'w', encoding='utf-8') as f:  # dump the response to test.json for inspection
        f.writelines(jsonContent)

    with open('test.json', encoding='utf-8') as f:
        for topic in json.loads(f.read()).get('resp_data').get('topics'):
            # A topic holds its body under one of: question, talk, task, solution.
            content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution'))))
            # print(content)
            text = content.get('text', '')
            text = re.sub(r'<[^>]*>', '', text).strip()
            text = text.replace('\n', '