├── README.markdown
├── img.png
├── img_1.png
├── main.py
├── requirements.txt
└── setting.py

--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
# **WeChat Official Account Article Crawler --- Python**
## Overview:
### This script calls an API of the WeChat Official Accounts Platform (mp.weixin.qq.com) to fetch the articles of any official account. You need a personal official account of your own; because logging in to the platform requires scanning a QR code, the token cannot be fetched automatically.
## Features:
### Crawls official-account articles; multiple accounts are supported and are configured in setting.py.
### Note: *Using this script requires your own subscription account, so that you can log in to the WeChat Official Accounts Platform and obtain the corresponding cookies and token.*
## Usage:
### 1. First, configure setting.py and fill in the following parameters:
#### wxgeturl_cookies:
How to obtain: log in to the WeChat Official Accounts Platform and create a new article draft.
![img.png](img.png)
#### wx_token: a token of the WeChat Official Accounts Platform; obtain it by logging in manually.
![img_1.png](img_1.png)
#### wechat_accounts_name: the names of the official accounts to crawl (full and exactly correct).
#### passagenum: how many articles to crawl per account; e.g. 3 crawls the three most recent articles.
#### proxies: the proxy address used for requests.
### 2. Once configured, run main.py.
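
For reference, a minimal sketch of what a filled-in setting.py might look like. Every value below is an illustrative placeholder (the cookie keys, token, account name, and proxy are examples, not working credentials):

```python
# setting.py -- illustrative placeholder values, not working credentials
wxgeturl_cookies = {
    # copy every cookie your browser sends to mp.weixin.qq.com, e.g.:
    'slave_sid': '<value from the browser>',
    'slave_user': '<value from the browser>',
}

wechat_accounts_name = ['莞博社区']   # full, exact account names

passagenum = 3                        # crawl the 3 most recent articles per account

wx_token = '123456789'                # read from the mp.weixin.qq.com URL after login

proxies = None                        # or e.g. {'http': 'http://112.80.248.73:80'}
```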
--------------------------------------------------------------------------------
/img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ysy-xmy/wx_spider/e23246c855926566f1096e95803a5aa63426e4f3/img.png
--------------------------------------------------------------------------------
/img_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ysy-xmy/wx_spider/e23246c855926566f1096e95803a5aa63426e4f3/img_1.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import time
import json
import warnings

import requests
from bs4 import BeautifulSoup

import setting
from setting import proxies, passagenum

warnings.filterwarnings("ignore")

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


# Unwrap the response: its 'publish_page' field is itself a JSON string
# whose 'publish_list' holds one entry per published post.
def dealspring(prostring):
    return json.loads(prostring["publish_page"])['publish_list']


# Fetch and print the article links of every configured official account.
def geturl():
    # Resolve each account name to its fakeid via the searchbiz endpoint.
    wechat_accounts_fakeid = {}
    for item in setting.wechat_accounts_name:
        params1 = {
            'action': 'search_biz',
            'begin': '0',
            'count': '1',
            'query': item,
            'token': setting.wx_token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
        }
        try:
            response = requests.get('https://mp.weixin.qq.com/cgi-bin/searchbiz', verify=False,
                                    params=params1, cookies=setting.wxgeturl_cookies,
                                    headers=headers, proxies=proxies)
            if json.loads(response.text)['base_resp']['ret'] == 200040:
                print('The WeChat Official Accounts Platform token has expired')
                return 0
            fakeid = json.loads(response.text)['list'][0]['fakeid']
            wechat_accounts_fakeid[item] = fakeid
        except requests.exceptions.ConnectionError as e:
            # On a connection error, wait and restart the whole run.
            print('Connection dropped, retrying in 10 seconds:', e)
            time.sleep(10)
            geturl()
            return 0

    # One fakeid per official account; the accounts to crawl are configured in setting.py.
    for key in wechat_accounts_fakeid:
        params2 = {
            'begin': '0',
            'count': passagenum,
            'query': '',
            'fakeid': wechat_accounts_fakeid[key],
            'type': '101_1',
            'free_publish_type': '1',
            # 'sub_action': 'list_ex',
            'token': setting.wx_token,  # the token has to be refreshed periodically
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
        }
        try:
            response = requests.get('https://mp.weixin.qq.com/cgi-bin/appmsgpublish', verify=False,
                                    params=params2, cookies=setting.wxgeturl_cookies,
                                    headers=headers, proxies=proxies)
            # An expired session comes back as
            # {"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
            if json.loads(response.text).get('base_resp', {}).get('ret') == 200003:
                print('The WeChat Official Accounts Platform token has expired')
                return 0
        except requests.exceptions.ConnectionError as e:
            # On a connection error, wait and restart the whole run.
            print('Connection dropped, retrying in 10 seconds:', e)
            time.sleep(10)
            geturl()
            return 0

        passages = []  # the article links of the current account
        publish_list = dealspring(json.loads(response.text))
        for entry in publish_list[:passagenum]:
            # 'publish_info' is another JSON string; its 'appmsg_info' lists the articles.
            for passage in json.loads(entry['publish_info'])['appmsg_info']:
                passages.append({'title': passage['title'], 'url': passage['content_url']})
        response.close()

        print('Currently crawling account:', key)
        for i in passages:
            # TODO: validate the article link here
            print(i['title'], i['url'])
            response = requests.get(i['url'], headers=headers, proxies=proxies, verify=False)
            soup = BeautifulSoup(response.text, 'lxml')
            onlytext = soup.text.replace(' ', '').replace('\n', '')
            print(f'Text extracted from the article: {onlytext}')
            print()
            print('-----------------separator 😎😎😎-------------------')
            print()
        # Phew, take a short break; the next account follows in three seconds 😊😊😊
        time.sleep(3)


geturl()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.12.2
Requests==2.31.0
lxml  # needed for BeautifulSoup's 'lxml' parser
--------------------------------------------------------------------------------
/setting.py:
--------------------------------------------------------------------------------
# Global configuration

# Cookies copied from mp.weixin.qq.com after logging in
wxgeturl_cookies = {
    # fill in the cookies here
}

# Names of the official accounts to crawl (full, exact names)
wechat_accounts_name = [
    # e.g. '莞博社区',
]

# Account-name-to-fakeid mapping (main.py resolves fakeids itself)
wechat_accounts_fakeid = {
    # e.g. '莞博社区': 'Mzk0NTQ3Mzk5Nw==',
}

# Number of the most recent articles to crawl per account
passagenum = 3

wx_token = '1406298924'

# Proxy used for all requests; None means a direct connection
proxies = None
# proxies = {'http': '112.80.248.73'}
--------------------------------------------------------------------------------