# -*- coding: utf-8 -*-
"""Weixin: log in to the WeChat Official Accounts Platform and list articles.

Repository layout::

    ├── README.md
    └── weixin.py

README (translated): Implements a simulated login to the WeChat Official
Accounts Platform and crawls the article history of official accounts.
However, possibly because the engine expired, the crawl did not ultimately
succeed.

Flow: ``weChat_login`` drives a Chrome browser through the login form and
saves the session cookies to ``cookie.txt``; ``get_content`` reloads those
cookies and pages through the article list of one account, appending
"title / link" pairs to ``<query>.txt``.
"""
import io
import json
import random
import re
import time

import requests
from selenium import webdriver

# NOTE(review): credentials are hard-coded in source control; consider loading
# them from the environment or a local, untracked config file instead.
user="fcscucs@sina.com"
password="fcbayern07"
# Official accounts to crawl.
gzlist=['新世相']

# Number of articles requested per page of the article list.
_PAGE_SIZE = 5

# Absolute XPaths of the login form controls on the landing page.
_ACCOUNT_INPUT_XPATH = "//*[@id='header']/div[2]/div/div/form/div[1]/div[1]/div/span/input"
_PASSWORD_INPUT_XPATH = "//*[@id='header']/div[2]/div/div/form/div[1]/div[2]/div/span/input"
_LOGIN_BUTTON_XPATH = "//*[@id='header']/div[2]/div/div/form/div[4]/a"


def weChat_login():
    """Log in through a Chrome window and save the session cookies.

    Fills in ``user`` and ``password`` on https://mp.weixin.qq.com/, clicks
    the login link, waits for the operator to scan the QR code, then writes
    all browser cookies as a JSON object (name -> value) to ``cookie.txt``.

    The browser is always closed before returning, even if a step fails.
    """
    print("启动浏览器,打开微信公众号登录界面")
    # NOTE(review): `executable_path` and `find_element_by_xpath` are the
    # older Selenium call forms; confirm they match the installed version.
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
    try:
        driver.get('https://mp.weixin.qq.com/')
        time.sleep(5)
        print("正在输入微信公众号登录账号和密码...")
        driver.find_element_by_xpath(_ACCOUNT_INPUT_XPATH).send_keys(user)
        driver.find_element_by_xpath(_PASSWORD_INPUT_XPATH).send_keys(password)
        print("请在登录界面点击:记住账号")
        time.sleep(5)
        driver.find_element_by_xpath(_LOGIN_BUTTON_XPATH).click()
        print("请拿手机扫码二维码登录公众号")
        time.sleep(5)
        # NOTE(review): success is reported after a fixed wait; nothing here
        # verifies that the QR code was actually scanned.
        print("登录成功")
        driver.get('https://mp.weixin.qq.com/')
        cookie_items = driver.get_cookies()
    finally:
        # Release the browser and chromedriver processes in every case.
        driver.quit()

    post = {item['name']: item['value'] for item in cookie_items}
    cookie_str = json.dumps(post, ensure_ascii=False)
    # json.dumps may return a byte string on Python 2; io.open in text mode
    # needs text. On Python 3 the result is already text and has no .decode.
    if isinstance(cookie_str, bytes):
        cookie_str = cookie_str.decode("utf-8")
    with io.open('cookie.txt', 'w+', encoding='utf-8') as f:
        f.write(cookie_str)
    print("cookies信息已保存到本地")


def _appmsg_params(token, fakeid, begin):
    """Build the query parameters for one page of an account's article list."""
    return {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'action': 'list_ex',
        'begin': str(begin),  # page offset; grows by the page size per page
        'count': str(_PAGE_SIZE),
        'query': '',
        'fakeid': fakeid,
        'type': '9',
    }


def get_content(query):
    """Append the title and link of every article of one account to a file.

    Reads the cookies saved in ``cookie.txt``, extracts the session token
    from the URL the home page redirects to, searches for ``query`` and
    takes the first hit, then pages through that account's article list.
    Each article is appended to ``<query>.txt`` as ``title:\\nlink\\n``.

    Args:
        query: Name of the official account to search for.

    Raises:
        RuntimeError: If no token can be found in the redirected URL, the
            search returns no account, or the article count is missing
            from the response.
    """
    url = 'https://mp.weixin.qq.com'
    header = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
    with io.open('cookie.txt', 'r', encoding ='utf-8') as f:
        cookies = json.loads(f.read())

    response = requests.get(url=url, cookies=cookies)
    # Use a single token string rather than the list re.findall returns, and
    # fail early when the URL carries no token at all.
    token_match = re.search(r'token=(\d+)', str(response.url))
    if token_match is None:
        raise RuntimeError(
            "no token found in %r; the saved cookies may be invalid" % str(response.url))
    token = token_match.group(1)

    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    search_params = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': query,
        'begin': '0',
        'count': '5',
    }
    search_response = requests.get(search_url, cookies=cookies, headers=header, params=search_params)
    accounts = search_response.json().get('list')
    if not accounts:
        raise RuntimeError("account search returned no results for %r" % (query,))
    fakeid = accounts[0].get('fakeid')

    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    # The first request is only used to learn the total number of articles.
    appmsg_response = requests.get(
        appmsg_url, cookies=cookies, headers=header,
        params=_appmsg_params(token, fakeid, 0))
    max_num = appmsg_response.json().get('app_msg_cnt')
    if max_num is None:
        raise RuntimeError("article list response carries no 'app_msg_cnt'")

    # One page more than the whole-page count, so a trailing partial page is
    # fetched as well.
    page_count = int(max_num) // _PAGE_SIZE + 1
    fileName = query + '.txt'
    for begin in range(0, page_count * _PAGE_SIZE, _PAGE_SIZE):
        print('正在翻页:--------------', begin)

        page_response = requests.get(
            appmsg_url, cookies=cookies, headers=header,
            params=_appmsg_params(token, fakeid, begin))
        # A response without the key yields None; treat it as an empty page.
        articles = page_response.json().get('app_msg_list') or []
        if articles:
            # Open the output once per page instead of once per article.
            with io.open(fileName, 'a', encoding='utf-8') as fh:
                for item in articles:
                    content_link = item.get('link')
                    content_title = item.get('title')
                    fh.write(content_title + ":\n" + content_link + "\n")
        # Pause between pages.
        time.sleep(2)


if __name__=='__main__':
    weChat_login()
    for query in gzlist:
        print("开始爬取公众号:"+query)
        get_content(query)
        print("爬取完成")