├── README.markdown
├── img.png
├── img_1.png
├── main.py
├── requirements.txt
└── setting.py
/README.markdown:
--------------------------------------------------------------------------------
# **WeChat Official Account Article Crawler --- Python**
## Overview:
### Calls an interface of the WeChat Official Accounts Platform to fetch the articles of any official account. You need a personal official account of your own; because logging in to the platform requires scanning a QR code, the script cannot obtain the token automatically.
## Features:
### Crawls official account articles; multiple accounts are supported and are configured in setting.py.
### Note: *Before using this script you must have your own subscription account and be able to log in to the WeChat Official Accounts Platform to obtain the corresponding cookies and token.*
## Usage:
### 1. First fill in the parameters in setting.py:
#### wxgeturl_cookies:
How to obtain: log in to the WeChat Official Accounts Platform and create a new article draft

![img.png](img.png)

#### wx_token: a token for the WeChat Official Accounts Platform; it has to be obtained by logging in manually

![img_1.png](img_1.png)

#### wechat_accounts_name: the names of the official accounts to crawl (use the full, exact names)
#### passagenum: the number of articles to crawl from each account; e.g. 3 means the three most recent articles
#### proxies: the proxy address used for the requests
### 2. Once everything is configured, run main.py
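
For reference, a filled-in `setting.py` might look like the sketch below. All values are placeholders: the cookie names and every value shown are only illustrative and must be replaced with the ones from your own logged-in session.

```python
# setting.py -- illustrative values only, replace them with your own
wxgeturl_cookies = {
    # cookies copied from the browser after logging in to mp.weixin.qq.com
    # (the cookie names below are examples, not a definitive list)
    'slave_sid': '<your slave_sid cookie>',
    'slave_user': '<your slave_user cookie>',
}

# full, exact names of the official accounts to crawl
wechat_accounts_name = ['莞博社区']

# number of most recent articles to fetch per account
passagenum = 3

# the "token=..." query parameter visible in the platform URL after login
wx_token = '1406298924'

# proxy for outgoing requests; None means connect directly
proxies = None  # e.g. {'http': '112.80.248.73'}
```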
--------------------------------------------------------------------------------
/img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ysy-xmy/wx_spider/e23246c855926566f1096e95803a5aa63426e4f3/img.png
--------------------------------------------------------------------------------
/img_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ysy-xmy/wx_spider/e23246c855926566f1096e95803a5aa63426e4f3/img_1.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import json
import time
import warnings

import requests
from bs4 import BeautifulSoup

import setting
from setting import passagenum, proxies

warnings.filterwarnings("ignore")

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


# The "publish_page" field of the response is itself a JSON string;
# decode it and return the list of publish records it contains.
def dealspring(prostring):
    return json.loads(prostring["publish_page"])['publish_list']


# Fetch the article links of the configured official accounts
def geturl():
    wechat_accounts_fakeid = {}
    # Resolve every configured account name to its fakeid
    for item in setting.wechat_accounts_name:
        params1 = {
            'action': 'search_biz',
            'begin': '0',
            'count': '1',
            'query': item,
            'token': setting.wx_token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
        }
        try:
            response = requests.get('https://mp.weixin.qq.com/cgi-bin/searchbiz', verify=False,
                                    params=params1, cookies=setting.wxgeturl_cookies, headers=headers, proxies=proxies)
            result = json.loads(response.text)
            if result['base_resp']['ret'] == 200040:
                print('The WeChat Official Accounts Platform token has expired')
                return 0
            wechat_accounts_fakeid[item] = result['list'][0]['fakeid']
        except requests.exceptions.ConnectionError as e:
            # Connection dropped: wait ten seconds and start over
            print("Connection lost, retrying in 10 seconds:", e)
            time.sleep(10)
            geturl()
            return 0

    # Fetch the articles of each account; every fakeid identifies one official
    # account, and the accounts to crawl are configured in setting.py
    for key in wechat_accounts_fakeid:
        params2 = {
            'begin': '0',
            'count': passagenum,
            'query': '',
            'fakeid': wechat_accounts_fakeid[key],
            'type': '101_1',
            'free_publish_type': '1',
            # 'sub_action': 'list_ex',
            'token': setting.wx_token,  # the token has to be refreshed periodically
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
        }
        try:
            response = requests.get('https://mp.weixin.qq.com/cgi-bin/appmsgpublish', verify=False,
                                    params=params2, cookies=setting.wxgeturl_cookies, headers=headers, proxies=proxies)
            if json.loads(response.text) == {"base_resp": {"ret": 200003, "err_msg": "invalid session"}}:
                print('The WeChat Official Accounts Platform token has expired')
                return 0
        except requests.exceptions.ConnectionError as e:
            # Connection dropped: wait ten seconds and start over
            print("Connection lost, retrying in 10 seconds:", e)
            time.sleep(10)
            geturl()
            return 0

        passages = []  # article links of the current account
        for i in range(passagenum):
            # Each publish record carries another JSON string with the article metadata
            appmsg_list = json.loads(dealspring(json.loads(response.text))[i]['publish_info'])['appmsg_info']
            for passage in appmsg_list:
                passages.append({'title': passage['title'], 'url': passage['content_url']})
        response.close()

        print("Currently crawling the official account:", key)
        for article in passages:
            print(article['title'], article['url'])
            response = requests.get(article['url'], headers=headers, proxies=proxies, verify=False)
            soup = BeautifulSoup(response.text, 'lxml')
            onlytext = soup.text.replace(" ", "").replace("\n", "")
            print(f'Text extracted from the article: {onlytext}')
            print()
            print('----------------- separator 😎😎😎 -------------------')
            print()
        # Take a short break, then move on to the next account after three seconds 😊
        time.sleep(3)


geturl()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.12.2
lxml
requests==2.31.0
--------------------------------------------------------------------------------
/setting.py:
--------------------------------------------------------------------------------
# Global configuration

# Cookies copied from a logged-in WeChat Official Accounts Platform session
wxgeturl_cookies = {
    # fill in your cookies here
}

# Full, exact names of the official accounts to crawl
wechat_accounts_name = [
    # e.g. '莞博社区',
]

# Number of most recent articles to crawl per account
passagenum = 3

# Token of the WeChat Official Accounts Platform, obtained after logging in manually
wx_token = '1406298924'

# Proxy used for the requests; None means connect directly
proxies = None
# e.g. proxies = {'http': '112.80.248.73'}
--------------------------------------------------------------------------------