├── README.md
├── crawler.py
├── weixin.csv
├── weixin.py
└── zsxq.py

/README.md:
--------------------------------------------------------------------------------
# Crawler
Crawl cnblogs (博客园) blog posts / batch-export WeChat Official Account articles / export Zhishi Xingqiu (知识星球) digest topics

We often want to save the articles of a WeChat Official Account we follow as PDF files, both for offline reading and so that posts remain available even if they are later deleted. To do that, this project batch-exports the account's articles to PDF, implemented in Python.

### Export the account's article links and other metadata to a CSV file

First install the Web Scraper extension for Chrome and use it to crawl the articles of the Official Account you are interested in, exporting the result as a CSV file. The file stores each article's title, publish time and link. Look up the extension's usage details yourself.

### Batch-export the articles in the CSV from the first step to PDF

First install the wkhtmltopdf command-line tool, then run a script that reads the CSV file from the previous step and exports each article to PDF. Because WeChat protects its images against hotlinking, converting an article URL to PDF directly produces files with missing images, so the fetched HTML has to be processed (the lazy-loading `data-src` attributes are rewritten to `src`) before it is converted. The core code is as follows:
```python
def process(item):
    url = str(item[2])
    name = item[1] + item[0] + '.pdf'
    response = requests.get(url)
    html = response.text
    # WeChat lazy-loads images via data-src; rewrite to src so wkhtmltopdf can fetch them
    html = html.replace('data-src', 'src')

    try:
        pdfkit.from_string(html, name)
    except Exception:
        pass

with open("weixin.csv", "r", encoding="gbk") as f:
    f_csv = csv.reader(f)
    next(f_csv)  # skip the header row
    pool = ThreadPool(processes=20)
    pool.map(process, (i for i in f_csv))
    pool.close()
```

A thread pool is used to speed up the PDF generation; a local test exported 90+ articles per minute.
* [Blog post](https://www.cnblogs.com/wzf-Learning/p/11153963.html)
* ![3c9095308e58b7c90aace2fe02a1e90](https://user-images.githubusercontent.com/16174175/223343159-551f8954-3153-4c96-9aae-dd8fbcdfca97.jpg)

--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
import re
import urllib.request

def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read().decode("utf8")
    return html

def getArticleUrl(html):
    # match post URLs belonging to the given blogger, e.g. https://www.cnblogs.com/<name>/p/12345.html
    reg = r'(https://www\.cnblogs\.com/' + blog_name + r'/p/[0-9]+\.html)'
    articleUrl = re.findall(reg, html)
    return articleUrl

blog_name = input("Enter the blogger's cnblogs nickname: ")
article = []
for i in range(1, 10):
    html = getHtml("http://www.cnblogs.com/" + blog_name + "/default.html?page=" + str(i))
    articleUrl = getArticleUrl(html)
    if len(articleUrl) == 0:
        break
    article += articleUrl

article = list(set(article))
print(article)
--------------------------------------------------------------------------------
/weixin.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leehomfans/Crawler/ff5bf12196369d63dfb4ae05df98fcfd6f161824/weixin.csv
--------------------------------------------------------------------------------
/weixin.py:
--------------------------------------------------------------------------------
import csv
import pdfkit
import requests
from multiprocessing.pool import ThreadPool

def process(item):
    # item is one row of weixin.csv; item[2] holds the article url,
    # item[1] + item[0] is used as the output file name
    url = str(item[2])
    name = item[1] + item[0] + '.pdf'
    response = requests.get(url)
    print(response.status_code)
    html = response.text
    # WeChat lazy-loads images via data-src; rewrite to src so the images survive conversion
    html = html.replace('data-src', 'src')

    try:
        pdfkit.from_string(html, name)
    except Exception:
        pass

with open("weixin.csv", "r", encoding="gbk") as f:
    f_csv = csv.reader(f)
    next(f_csv)  # skip the header row
    pool = ThreadPool(processes=20)  # convert up to 20 articles concurrently
    pool.map(process, (i for i in f_csv))
    pool.close()
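
# Optional helper -- a minimal sketch of how process() above could pass an
# explicit wkhtmltopdf location and a couple of converter options to pdfkit.
# pdfkit drives the external wkhtmltopdf binary that the README asks you to
# install; if it is not on PATH, pdfkit raises an OSError complaining that no
# wkhtmltopdf executable was found. The Windows path used as the default
# argument below is only an assumption for illustration -- point it at your
# own installation before using this.
def to_pdf_with_config(html, name,
                       wkhtmltopdf_path=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"):
    config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)  # explicit binary path
    options = {"encoding": "UTF-8", "quiet": ""}                 # force UTF-8, suppress converter output
    pdfkit.from_string(html, name, configuration=config, options=options)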
--------------------------------------------------------------------------------
/zsxq.py:
--------------------------------------------------------------------------------
import re
import requests
import json
import os
import pdfkit
from bs4 import BeautifulSoup
from urllib.parse import quote

# HTML skeleton each topic is rendered into; {title} and {text} are filled in below
html_template = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<h1>{title}</h1>
<p>{text}</p>
</body>
</html>
"""

htmls = []
num = 0

def get_data(url):

    global htmls, num

    headers = {
        'Cookie': 'zsxq_access_token=F59F4329-5D05-D087-724E-A424A7DD3814',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    rsp = requests.get(url, headers=headers)
    rsp.encoding = 'gb2312'
    jsonContent = json.dumps(rsp.json(), indent=2, ensure_ascii=False)
    # print(jsonContent)
    with open('test.json', 'w', encoding='utf-8') as f:  # dump the raw response to test.json for inspection
        f.writelines(jsonContent)

    with open('test.json', encoding='utf-8') as f:
        for topic in json.loads(f.read()).get('resp_data').get('topics'):
            content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution'))))
            # print(content)
            text = content.get('text', '')
            text = re.sub(r'<[^>]*>', '', text).strip()
            text = text.replace('\n', '<br>')
            title = str(num) + text[:9]
            num += 1

            if content.get('images'):
                soup = BeautifulSoup(html_template, 'html.parser')
                for img in content.get('images'):
                    url = img.get('large').get('url')
                    img_tag = soup.new_tag('img', src=url)
                    soup.body.append(img_tag)
                html_img = str(soup)
                html = html_img.format(title=title, text=text)
            else:
                html = html_template.format(title=title, text=text)

            if topic.get('question'):
                answer = topic.get('answer').get('text', "")
                soup = BeautifulSoup(html, 'html.parser')
                answer_tag = soup.new_tag('p')
                answer_tag.string = answer
                soup.body.append(answer_tag)
                html_answer = str(soup)
                html = html_answer.format(title=title, text=text)

            htmls.append(html)

    # paginate: request the next batch of topics strictly older than the last one returned,
    # by setting end_time to the last create_time minus one millisecond
    next_page = rsp.json().get('resp_data').get('topics')
    if next_page:
        create_time = next_page[-1].get('create_time')
        if create_time[20:23] == "000":
            end_time = create_time[:20] + "999" + create_time[23:]
        else:
            res = int(create_time[20:23]) - 1
            end_time = create_time[:20] + str(res).zfill(3) + create_time[23:]  # zfill pads the milliseconds back to 3 digits
        end_time = quote(end_time)
        if len(end_time) == 33:
            end_time = end_time[:24] + '0' + end_time[24:]
        next_url = start_url + '&end_time=' + end_time
        print(next_url)
        get_data(next_url)

    return htmls

def make_pdf(htmls):
    html_files = []
    for index, html in enumerate(htmls):
        if html.strip():
            file = str(index) + ".html"
            html_files.append(file)
            with open(file, "w", encoding="utf-8") as f:
                f.write(html)

    options = {
        "user-style-sheet": "test.css",
        "page-size": "Letter",
        "margin-top": "0.75in",
        "margin-right": "0.75in",
        "margin-bottom": "0.75in",
        "margin-left": "0.75in",
        "encoding": "UTF-8",
        "custom-header": [("Accept-Encoding", "gzip")],
        "cookie": [
            ("cookie-name1", "cookie-value1"), ("cookie-name2", "cookie-value2")
        ],
        "outline-depth": 10,
    }

    try:
        # convert the generated HTML files in batches of 1000
        count = len(html_files) // 1000
        for num in range(count + 1):
            if num