├── README.md
├── WeChatUrlCrawler.py
└── WeixinSpider.py

/README.md:
--------------------------------------------------------------------------------
## WeChatCrawler

WeChatUrlCrawler is a crawler dedicated to collecting the article URLs of a WeChat official account; the URLs it collects are saved to a file or a database.

#### Environment setup:

Set up a Python 3.6+ environment and install the required libraries (the two scripts use requests, pandas, lxml and pymysql). If you get the error

`ModuleNotFoundError: No module named 'XXX'`, just run `pip install XXX`.

#### For the detailed usage steps, search for the official account "程序员修炼" on WeChat, or scan the QR code below

![程序员修炼](https://www.jingyoushui.cn/image/e6ea2efe-d0d7-487a-b7a5-c8bf0ee7d005.jpg)

#### If you have any questions, you can also add me on WeChat:

![](https://www.jingyoushui.cn/image/2faaefa1-b2f5-4260-9ac0-87be8cb708ce.jpg)
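
#### Before running

WeChatUrlCrawler.py reads its request credentials straight from constants at the top of the script. A minimal sketch of what has to be replaced before running it (every value below is an illustrative placeholder, not a working credential):

```python
# Values WeChatUrlCrawler.py expects you to fill in (placeholders only):
cookie = "<Cookie header copied from a logged-in mp.weixin.qq.com session>"
token = "<token query parameter visible in the mp.weixin.qq.com admin URL>"
fakeid = "<fakeid of the official account whose articles you want to crawl>"
```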
--------------------------------------------------------------------------------

/WeChatUrlCrawler.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
import requests
import time
import pandas as pd
import math
import random

# Pool of User-Agent strings to rotate between requests
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Mobile Safari/537.36",
]

# Target URL: the article-list endpoint of the official account platform
url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
cookie = "appmsglist_action_3211828736=card; ua_id=xf1IJPpCxrOkzx7HAAAAAEfpoLhrY0vf20LrgWcLYec=; wxuin=86999637334436; mm_lang=zh_CN; sig_login=h01d1e20db4fb703b033225cbbdba00bbe3cdc4ec6e572a379ee6538c8469a1e7fb7fa70fbab1502f6e; poc_sid=HKiMpWSj06-7f7D34kNWA1XMhli1chB19UDerP16; rewardsn=; wxtokenkey=777; _clck=3211828736|1|fd4|0; uuid=f96f4e9ebfb016ffe047415406c68016; rand_info=CAESIFj8UW/i916e5Wjx44FVtePMf/z26P4BzVg7WkY8+mZ3; slave_bizuin=3211828736; data_bizuin=3239826583; bizuin=3211828736; data_ticket=MY7BzVOs/eOR0o7KVMAz6kU7axsoln5E9yHzB8UFfr4OuWzLWc6wCq84S58drkZe; slave_sid=Nk9ibUw5QjVCT2czNFBEMDNyY3NVZlFIaEc0TjJ2SFdqVzhDaG9SNnpiZk5EMUJIOVNIeGRocUNiaFAyVFBqMWVfNWV4WVpwN0Z1aUk1WE43Y3UycUhBVm5ZYWM0UGhTWkVueFlFRk5PMTB2M0RPWlI2b0FRaVpCcEJYaW9Wcjd4TmluOGZwSVVDdnpkd0tE; slave_user=gh_2d4656f66685; xid=0c74682e70906d561039ca6e24d8ae8b; _clsk=o58sgn|1688813027677|1|1|mp.weixin.qq.com/weheat-agent/payload/record"

# Use the Cookie to skip the login step

data = {
    "token": "20884314",
    "lang": "zh_CN",
    "f": "json",
    "ajax": "1",
    "action": "list_ex",
    "begin": "0",
    "count": "5",
    "query": "",
    "fakeid": "REPLACE_WITH_FAKEID",  # fakeid of the target official account
    "type": "9",
}
headers = {
    "Cookie": cookie,
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Mobile Safari/537.36",
}

# First request: read the total article count and work out how many pages to fetch
content_json = requests.get(url, headers=headers, params=data).json()
count = int(content_json["app_msg_cnt"])
print(count)
page = int(math.ceil(count / 5))
print(page)

content_list = []
# Crawl every page of results and collect each article's title, link and create_time
for i in range(page):
    data["begin"] = i * 5
    user_agent = random.choice(user_agent_list)
    headers = {
        "Cookie": cookie,
        "User-Agent": user_agent,
    }
    # Submit the request with GET; the response is a JSON document with one page of articles
    content_json = requests.get(url, headers=headers, params=data).json()
    for item in content_json["app_msg_list"]:
        # Extract each article's title, URL and publish time
        items = []
        items.append(item["title"])
        items.append(item["link"])
        t = time.localtime(item["create_time"])
        items.append(time.strftime("%Y-%m-%d %H:%M:%S", t))
        content_list.append(items)
    print(i)
    if (i > 0) and (i % 10 == 0):
        # Flush the collected rows to CSV every 10 pages, then back off a little longer
        name = ['title', 'link', 'create_time']
        test = pd.DataFrame(columns=name, data=content_list)
        test.to_csv("url.csv", mode='a', encoding='utf-8')
        print("Save #" + str(i) + " succeeded")
        content_list = []
        time.sleep(random.randint(60, 90))
    else:
        time.sleep(random.randint(15, 25))

# Write whatever is left once the loop finishes
name = ['title', 'link', 'create_time']
test = pd.DataFrame(columns=name, data=content_list)
test.to_csv("url.csv", mode='a', encoding='utf-8')
print("Final save succeeded")
--------------------------------------------------------------------------------
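
WeChatUrlCrawler.py appends its rows to `url.csv`, while `WeixinSpider.get_url_list()` below reads a file named `it.csv` with `link` and `create_time` columns. A minimal sketch of the hand-off, assuming you simply want to feed the crawler's output to the spider (this glue code is not part of the repo):

```python
# Assumed glue step: turn the crawler's url.csv into the it.csv that WeixinSpider expects.
import pandas as pd

df = pd.read_csv("url.csv")
# to_csv(mode='a') above re-writes the header row on every flush, so drop those repeats
df = df[df["link"] != "link"]
df.to_csv("it.csv", index=False, encoding="utf-8")
```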
/WeixinSpider.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
import requests
from lxml import etree
import json
import re
import pandas as pd
import pymysql


class WeixinSpider:
    def __init__(self):
        self.url_temp = ""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

    def parse_url(self, url):  # Send the request and return the decoded response body
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):  # Extract the fields we care about from the page
        html = etree.HTML(html_str)
        content_list = []
        item = {}
        item["title"] = html.xpath("//*[@id=\"activity-name\"]/text()")
        # item["title"] = [i.replace("\n", "").replace(" ", "") for i in item["title"]]
        item["laiyuan"] = html.xpath("//*[@id=\"js_name\"]/text()")
        # item["laiyuan"] = [i.replace("\n", "").replace(" ", "") for i in item["laiyuan"]]
        item["other"] = html.xpath("//*[@id=\"js_content\"]//text()")
        print(item)
        content_list.append(item)

        return content_list

    def save_html(self, html_str, page_name):  # Save the raw HTML string to disk
        file_path = "it/{}.html".format(page_name)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    def get_url_list(self):
        file_path = "it.csv"
        df = pd.read_csv(file_path)
        temp_list = df["link"].str.split("!").tolist()  # [[], [], []]
        num_list = list(set([i for j in temp_list for i in j]))
        num_list_new = [i for i in num_list]

        time_list = df["create_time"].tolist()
        return num_list_new, time_list

    def run(self):  # Main logic
        # 1. Build the URL list
        # 2. Iterate over it, sending requests and parsing the responses
        url_list, time_list = self.get_url_list()

        # Open the database connection (host / user / password / database name)
        db = pymysql.connect(host="localhost", user="root", password="root", database="weixin_database")

        # Create a cursor object with the cursor() method
        cursor = db.cursor()

        for num, url in enumerate(url_list):
            print(num)
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            title = ''.join(content_list[0]["title"])
            laiyuan = ''.join(content_list[0]["laiyuan"])
            other = '\n'.join(content_list[0]["other"])
            create_time = time_list[num]
            # Keep only the <div id="js_content"> block that holds the article body
            p = re.compile('<div[^>]*id="js_content".*?</div>', re.S)
            match = re.search(p, html_str)
            if match:
                html = match.group().replace("\n", "")
            else:
                html = html_str.replace("\n", "")
            sql = """INSERT INTO weixin_table(title, url, other, html, create_time, type_id)
                     VALUES (%s, %s, %s, %s, %s, %s)"""
            try:
                # Execute the SQL statement with bound parameters
                cursor.execute(sql, (title, url, other, html, create_time, 1))
                # Commit the transaction
                db.commit()
            except Exception:
                print("Insert failed for record " + str(num))
                # Roll back on error
                db.rollback()

            # 3. Save the raw HTML
            page_name = title
            self.save_html(html_str, page_name)
        # Close the database connection
        db.close()


if __name__ == '__main__':
    weixin_spider = WeixinSpider()
    weixin_spider.run()
--------------------------------------------------------------------------------
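
The INSERT in `WeixinSpider.run()` assumes a `weixin_database` database containing a `weixin_table` table. A minimal one-off setup sketch that matches the six inserted columns (the `id` column, the types and the lengths are assumptions, not taken from the repo):

```python
# Assumed setup sketch: create the table WeixinSpider.run() inserts into.
import pymysql

db = pymysql.connect(host="localhost", user="root", password="root", database="weixin_database")
with db.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS weixin_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            url VARCHAR(1024),
            other LONGTEXT,
            html LONGTEXT,
            create_time VARCHAR(32),
            type_id INT
        ) DEFAULT CHARSET=utf8mb4
    """)
db.commit()
db.close()
```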