# coding=utf-8
"""Sogou WeChat (weixin.sogou.com) article-search scraper.

From the "Python-spiders" series README: this series is for learning and
exchange only. It does not target any website, software, or individual, makes
no bulk requests to the target site, and accepts no liability.

Series index:
    Sogou WeChat collection -- Python spider series, part 1
    https://www.cnblogs.com/hyonline/p/11812977.html
"""

import json
import random
import re
from urllib import parse

import requests
from lxml import etree

# Seconds to wait on each HTTP request. Without a timeout, requests may block
# indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10

# Accept-Language value shared by the cookie-gathering and redirect requests.
_ACCEPT_LANGUAGE = "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2"


def _cookie_value(response, name):
    """Return the value of cookie *name* from *response*'s Set-Cookie header.

    Raises:
        KeyError: the response carries no ``Set-Cookie`` header.
        IndexError: the header does not contain ``<name>=<value>;``.
    """
    set_cookie = response.headers['Set-Cookie']
    matches = re.findall(name + '=(.*?);', set_cookie, re.S)
    if not matches:
        raise IndexError(
            'cookie {!r} not found in Set-Cookie of {}'.format(name, response.url)
        )
    return matches[0]


def get_cookie(response1, uigs_para, UserAgent):
    """Gather the cookie values later sent with the approve/redirect requests.

    Reads ABTEST, SNUID, IPLOC and SUID from ``response1``'s Set-Cookie header,
    then issues three further GET requests to obtain SUID (overwriting the
    first value), JSESSIONID and SUV.

    Args:
        response1: response of the search-result page request.
        uigs_para: dict sent as the query string of the ``pv.gif`` request.
        UserAgent: User-Agent header value used for every request.

    Returns:
        dict with keys ABTEST, SNUID, IPLOC, SUID, JSESSIONID and SUV.

    Raises:
        KeyError: a response has no ``Set-Cookie`` header.
        IndexError: an expected cookie is missing from a ``Set-Cookie`` header.
    """
    cookie_params = {
        name: _cookie_value(response1, name)
        for name in ("ABTEST", "SNUID", "IPLOC", "SUID")
    }

    # Stylesheet request on www.sogou.com; its response replaces SUID.
    response2 = requests.get(
        "https://www.sogou.com/sug/css/m3.min.v.7.css",
        headers={
            "Accept": "text/css,*/*;q=0.1",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": _ACCEPT_LANGUAGE,
            "Connection": "keep-alive",
            "Cookie": "SNUID={SNUID}; IPLOC={IPLOC}".format(**cookie_params),
            "Host": "www.sogou.com",
            "Referer": "https://weixin.sogou.com/",
            "User-Agent": UserAgent,
        },
        timeout=REQUEST_TIMEOUT,
    )
    cookie_params['SUID'] = _cookie_value(response2, 'SUID')

    # Profile endpoint on weixin.sogou.com; its response supplies JSESSIONID.
    response3 = requests.get(
        "https://weixin.sogou.com/websearch/wexinurlenc_sogou_profile.jsp",
        headers={
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": _ACCEPT_LANGUAGE,
            "Connection": "keep-alive",
            "Cookie": "ABTEST={ABTEST}; SNUID={SNUID}; IPLOC={IPLOC}; SUID={SUID}".format(**cookie_params),
            "Host": "weixin.sogou.com",
            "Referer": response1.url,
            "User-Agent": UserAgent,
        },
        timeout=REQUEST_TIMEOUT,
    )
    cookie_params['JSESSIONID'] = _cookie_value(response3, 'JSESSIONID')

    # pv.gif request carrying uigs_para as query string; its response supplies SUV.
    response4 = requests.get(
        "https://pb.sogou.com/pv.gif",
        headers={
            "Accept": "image/webp,*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": _ACCEPT_LANGUAGE,
            "Connection": "keep-alive",
            "Cookie": "SNUID={SNUID}; IPLOC={IPLOC}; SUID={SUID}".format(**cookie_params),
            "Host": "pb.sogou.com",
            "Referer": "https://weixin.sogou.com/",
            "User-Agent": UserAgent,
        },
        params=uigs_para,
        timeout=REQUEST_TIMEOUT,
    )
    cookie_params['SUV'] = _cookie_value(response4, 'SUV')

    return cookie_params


def get_k_h(url):
    """Append the ``k`` and ``h`` query parameters to a result link.

    ``k`` is a random integer in 1..100 and ``h`` is the single character of
    *url* located ``4 + 21 + k`` positions after the start of ``"url="``
    (empty when that position is past the end of the string).

    Args:
        url: result link expected to contain ``"url="``.

    Returns:
        *url* with ``&k=<k>&h=<h>`` appended, or *url* unchanged when it does
        not contain ``"url="`` (there is nothing to derive ``h`` from).
    """
    marker = url.find("url=")
    if marker == -1:
        # Offsets computed from -1 would pick an unrelated character.
        return url
    k = int(random.random() * 100) + 1
    pos = marker + 4 + 21 + k
    return url + "&k=" + str(k) + "&h=" + url[pos:pos + 1]


def get_uigs_para(response):
    """Parse the ``uigs_para`` JavaScript object embedded in the result page.

    Args:
        response: response whose ``text`` holds ``var uigs_para = {...};`` and
            ``uigs_para.exp_id = "...";``.

    Returns:
        dict decoded from the object literal, with ``right`` set to
        ``'right0_0'`` and ``exp_id`` set to the page's exp_id minus its last
        character.

    Raises:
        IndexError: either JavaScript assignment is missing from the page.
        ValueError: the object literal is not valid JSON after substitution.
    """
    raw = re.findall('var uigs_para = (.*?);', response.text, re.S)[0]
    # The literal may embed a JS ternary, which is not JSON; substitute the
    # constant 0 for it. replace() is a no-op when the ternary is absent.
    raw = raw.replace('passportUserId ? "1" : "0"', '0')
    uigs_para = json.loads(raw)
    exp_id = re.findall('uigs_para.exp_id = "(.*?)";', response.text, re.S)[0]
    uigs_para['right'] = 'right0_0'
    # NOTE(review): the last character is dropped, presumably a trailing
    # separator -- confirm against a live page.
    uigs_para['exp_id'] = exp_id[:-1]
    return uigs_para


def main_v4(list_url, UserAgent):
    """Fetch a search-result page and print status and title of each article.

    For every result link on the page: call the ``approve`` endpoint, request
    the link with ``k``/``h`` appended, rebuild the article URL from the
    ``url += '...'`` fragments in that response, fetch the article, and print
    its HTTP status code followed by its ``og:title``.

    Args:
        list_url: URL of the weixin.sogou.com search-result page.
        UserAgent: User-Agent header value used for every request.
    """
    headers1 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Host": "weixin.sogou.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": UserAgent,
    }
    response1 = requests.get(list_url, headers=headers1, timeout=REQUEST_TIMEOUT)
    html = etree.HTML(response1.text)
    urls = [
        'https://weixin.sogou.com' + href
        for href in html.xpath('//div[@class="img-box"]/a/@href')
    ]

    uigs_para = get_uigs_para(response1)
    params = get_cookie(response1, uigs_para, UserAgent)
    approve_url = 'https://weixin.sogou.com/approve?uuid={}'.format(uigs_para['uuid'])

    # None of the header sets below depend on the individual result link, so
    # they are built once rather than on every iteration.
    headers2 = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": _ACCEPT_LANGUAGE,
        "Connection": "keep-alive",
        "Cookie": "ABTEST={ABTEST}; IPLOC={IPLOC}; SUID={SUID}; SUV={SUV}; SNUID={SNUID}; JSESSIONID={JSESSIONID};".format(**params),
        "Host": "weixin.sogou.com",
        "Referer": response1.url,
        "User-Agent": UserAgent,
        "X-Requested-With": "XMLHttpRequest",
    }
    headers3 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": _ACCEPT_LANGUAGE,
        "Connection": "keep-alive",
        "Cookie": "ABTEST={ABTEST}; SNUID={SNUID}; IPLOC={IPLOC}; SUID={SUID}; JSESSIONID={JSESSIONID}; SUV={SUV}".format(**params),
        "Host": "weixin.sogou.com",
        "Referer": list_url,
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": UserAgent,
    }
    headers4 = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
        "cache-control": "max-age=0",
        "user-agent": UserAgent,
    }

    for url in urls:
        # The approve response is never read; the call is made once per link.
        # NOTE(review): presumably the server expects it before the link is
        # followed -- confirm.
        requests.get(approve_url, headers=headers2, timeout=REQUEST_TIMEOUT)

        response3 = requests.get(get_k_h(url), headers=headers3, timeout=REQUEST_TIMEOUT)

        # The response assembles the article URL piecewise as url += '...'.
        itemurl = ''.join(re.findall(r"url \+= '(.*?)'", response3.text, re.S))
        if not itemurl:
            # No fragments in the response: skip this link instead of letting
            # requests fail on an empty URL and abort the remaining links.
            print('no article url found for {}'.format(url))
            continue

        # Fetch the article body from the article URL.
        response4 = requests.get(itemurl, headers=headers4, timeout=REQUEST_TIMEOUT)
        print(response4.status_code)
        titles = etree.HTML(response4.text).xpath('//meta[@property="og:title"]/@content')
        print(titles[0] if titles else '')


if __name__ == "__main__":
    key = "咸蛋超人"
    url = 'https://weixin.sogou.com/weixin?type=2&s_from=input&query={}&_sug_=n&_sug_type_=&page=1'.format(parse.quote(key))
    UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0"
    main_v4(url, UserAgent)