├── README.md
├── WeChatUrlCrawler.py
└── WeixinSpider.py

/README.md:
--------------------------------------------------------------------------------
## WeChatCrawler

WeChatUrlCrawler is a crawler dedicated to collecting the article URLs of a WeChat official account; the URLs it collects are saved to a file or a database.

#### Environment setup:

Set up a Python 3.6+ environment and install the required libraries (the two scripts use requests, pandas, lxml and pymysql). If you get the error

`ModuleNotFoundError: No module named 'XXX'`, just run `pip install XXX`.

#### For the detailed usage steps, search for the official account "程序员修炼" on WeChat, or scan the QR code below

![程序员修炼](https://www.jingyoushui.cn/image/e6ea2efe-d0d7-487a-b7a5-c8bf0ee7d005.jpg)

#### If you have any questions, you can also add me on WeChat:

![](https://www.jingyoushui.cn/image/2faaefa1-b2f5-4260-9ac0-87be8cb708ce.jpg)
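
#### Before running

WeChatUrlCrawler.py reads its request credentials straight from constants at the top of the script. A minimal sketch of what has to be replaced before running it (every value below is an illustrative placeholder, not a working credential):

```python
# Values WeChatUrlCrawler.py expects you to fill in (placeholders only):
cookie = "<Cookie header copied from a logged-in mp.weixin.qq.com session>"
token = "<token query parameter visible in the mp.weixin.qq.com admin URL>"
fakeid = "<fakeid of the official account whose articles you want to crawl>"
```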
--------------------------------------------------------------------------------

/WeChatUrlCrawler.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
import requests
import time
import pandas as pd
import math
import random

# Pool of User-Agent strings to rotate between requests
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Mobile Safari/537.36",
]

# Target URL: the article-list endpoint of the official account platform
url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
cookie = "appmsglist_action_3211828736=card; ua_id=xf1IJPpCxrOkzx7HAAAAAEfpoLhrY0vf20LrgWcLYec=; wxuin=86999637334436; mm_lang=zh_CN; sig_login=h01d1e20db4fb703b033225cbbdba00bbe3cdc4ec6e572a379ee6538c8469a1e7fb7fa70fbab1502f6e; poc_sid=HKiMpWSj06-7f7D34kNWA1XMhli1chB19UDerP16; rewardsn=; wxtokenkey=777; _clck=3211828736|1|fd4|0; uuid=f96f4e9ebfb016ffe047415406c68016; rand_info=CAESIFj8UW/i916e5Wjx44FVtePMf/z26P4BzVg7WkY8+mZ3; slave_bizuin=3211828736; data_bizuin=3239826583; bizuin=3211828736; data_ticket=MY7BzVOs/eOR0o7KVMAz6kU7axsoln5E9yHzB8UFfr4OuWzLWc6wCq84S58drkZe; slave_sid=Nk9ibUw5QjVCT2czNFBEMDNyY3NVZlFIaEc0TjJ2SFdqVzhDaG9SNnpiZk5EMUJIOVNIeGRocUNiaFAyVFBqMWVfNWV4WVpwN0Z1aUk1WE43Y3UycUhBVm5ZYWM0UGhTWkVueFlFRk5PMTB2M0RPWlI2b0FRaVpCcEJYaW9Wcjd4TmluOGZwSVVDdnpkd0tE; slave_user=gh_2d4656f66685; xid=0c74682e70906d561039ca6e24d8ae8b; _clsk=o58sgn|1688813027677|1|1|mp.weixin.qq.com/weheat-agent/payload/record"

# Use the Cookie to skip the login step

data = {
    "token": "20884314",
    "lang": "zh_CN",
    "f": "json",
    "ajax": "1",
    "action": "list_ex",
    "begin": "0",
    "count": "5",
    "query": "",
    "fakeid": "REPLACE_WITH_FAKEID",  # fakeid of the target official account
    "type": "9",
}
headers = {
    "Cookie": cookie,
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Mobile Safari/537.36",
}

# First request: read the total article count and work out how many pages to fetch
content_json = requests.get(url, headers=headers, params=data).json()
count = int(content_json["app_msg_cnt"])
print(count)
page = int(math.ceil(count / 5))
print(page)

content_list = []
# Crawl every page of results and collect each article's title, link and create_time
for i in range(page):
    data["begin"] = i * 5
    user_agent = random.choice(user_agent_list)
    headers = {
        "Cookie": cookie,
        "User-Agent": user_agent,
    }
    # Submit the request with GET; the response is a JSON document with one page of articles
    content_json = requests.get(url, headers=headers, params=data).json()
    for item in content_json["app_msg_list"]:
        # Extract each article's title, URL and publish time
        items = []
        items.append(item["title"])
        items.append(item["link"])
        t = time.localtime(item["create_time"])
        items.append(time.strftime("%Y-%m-%d %H:%M:%S", t))
        content_list.append(items)
    print(i)
    if (i > 0) and (i % 10 == 0):
        # Flush the collected rows to CSV every 10 pages, then back off a little longer
        name = ['title', 'link', 'create_time']
        test = pd.DataFrame(columns=name, data=content_list)
        test.to_csv("url.csv", mode='a', encoding='utf-8')
        print("Save #" + str(i) + " succeeded")
        content_list = []
        time.sleep(random.randint(60, 90))
    else:
        time.sleep(random.randint(15, 25))

# Write whatever is left once the loop finishes
name = ['title', 'link', 'create_time']
test = pd.DataFrame(columns=name, data=content_list)
test.to_csv("url.csv", mode='a', encoding='utf-8')
print("Final save succeeded")
--------------------------------------------------------------------------------
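
WeChatUrlCrawler.py appends its rows to `url.csv`, while `WeixinSpider.get_url_list()` below reads a file named `it.csv` with `link` and `create_time` columns. A minimal sketch of the hand-off, assuming you simply want to feed the crawler's output to the spider (this glue code is not part of the repo):

```python
# Assumed glue step: turn the crawler's url.csv into the it.csv that WeixinSpider expects.
import pandas as pd

df = pd.read_csv("url.csv")
# to_csv(mode='a') above re-writes the header row on every flush, so drop those repeats
df = df[df["link"] != "link"]
df.to_csv("it.csv", index=False, encoding="utf-8")
```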
/WeixinSpider.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
import requests
from lxml import etree
import json
import re
import pandas as pd
import pymysql


class WeixinSpider:
    def __init__(self):
        self.url_temp = ""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

    def parse_url(self, url):  # Send the request and return the decoded response body
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):  # Extract the fields we care about from the page
        html = etree.HTML(html_str)
        content_list = []
        item = {}
        item["title"] = html.xpath("//*[@id=\"activity-name\"]/text()")
        # item["title"] = [i.replace("\n", "").replace(" ", "") for i in item["title"]]
        item["laiyuan"] = html.xpath("//*[@id=\"js_name\"]/text()")
        # item["laiyuan"] = [i.replace("\n", "").replace(" ", "") for i in item["laiyuan"]]
        item["other"] = html.xpath("//*[@id=\"js_content\"]//text()")
        print(item)
        content_list.append(item)

        return content_list

    def save_html(self, html_str, page_name):  # Save the raw HTML string to disk
        file_path = "it/{}.html".format(page_name)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    def get_url_list(self):
        file_path = "it.csv"
        df = pd.read_csv(file_path)
        temp_list = df["link"].str.split("!").tolist()  # [[], [], []]
        num_list = list(set([i for j in temp_list for i in j]))
        num_list_new = [i for i in num_list]

        time_list = df["create_time"].tolist()
        return num_list_new, time_list

    def run(self):  # Main logic
        # 1. Build the URL list
        # 2. Iterate over it, sending requests and parsing the responses
        url_list, time_list = self.get_url_list()

        # Open the database connection (host / user / password / database name)
        db = pymysql.connect(host="localhost", user="root", password="root", database="weixin_database")

        # Create a cursor object with the cursor() method
        cursor = db.cursor()

        for num, url in enumerate(url_list):
            print(num)
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            title = ''.join(content_list[0]["title"])
            laiyuan = ''.join(content_list[0]["laiyuan"])
            other = '\n'.join(content_list[0]["other"])
            create_time = time_list[num]
            # Keep only the <div id="js_content"> block that holds the article body
            p = re.compile('<div[^>]*id="js_content".*?</div>', re.S)
            match = re.search(p, html_str)
            if match:
                html = match.group().replace("\n", "")
            else:
                html = html_str.replace("\n", "")
            sql = """INSERT INTO weixin_table(title, url, other, html, create_time, type_id)
                     VALUES (%s, %s, %s, %s, %s, %s)"""
            try:
                # Execute the SQL statement with bound parameters
                cursor.execute(sql, (title, url, other, html, create_time, 1))
                # Commit the transaction
                db.commit()
            except Exception:
                print("Insert failed for record " + str(num))
                # Roll back on error
                db.rollback()

            # 3. Save the raw HTML
            page_name = title
            self.save_html(html_str, page_name)
        # Close the database connection
        db.close()


if __name__ == '__main__':
    weixin_spider = WeixinSpider()
    weixin_spider.run()
--------------------------------------------------------------------------------
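
The INSERT in `WeixinSpider.run()` assumes a `weixin_database` database containing a `weixin_table` table. A minimal one-off setup sketch that matches the six inserted columns (the `id` column, the types and the lengths are assumptions, not taken from the repo):

```python
# Assumed setup sketch: create the table WeixinSpider.run() inserts into.
import pymysql

db = pymysql.connect(host="localhost", user="root", password="root", database="weixin_database")
with db.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS weixin_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            url VARCHAR(1024),
            other LONGTEXT,
            html LONGTEXT,
            create_time VARCHAR(32),
            type_id INT
        ) DEFAULT CHARSET=utf8mb4
    """)
db.commit()
db.close()
```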