├── README.md
└── sougou_weixin.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Python-spiders
 2 | 
本系列仅用于学习交流，并不针对任何网站、软件、个人，未对目标网站进行大批量访问，不承担任何责任。
 4 | 
 5 | 
 6 | 
 7 | ## 系列目录
 8 | 
 9 | - [搜狗微信采集 —— python爬虫系列一](https://www.cnblogs.com/hyonline/p/11812977.html)
10 | 
11 |   
12 | 
13 | 


--------------------------------------------------------------------------------
/sougou_weixin.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | import requests
  4 | from lxml import etree
  5 | import re
  6 | import random
  7 | import json
  8 | from urllib import parse
  9 | 
 10 | 
 11 | def get_cookie(response1, uigs_para, UserAgent):
 12 |     SetCookie = response1.headers['Set-Cookie']
 13 |     cookie_params = {
 14 |         "ABTEST": re.findall('ABTEST=(.*?);', SetCookie, re.S)[0],
 15 |         "SNUID": re.findall('SNUID=(.*?);', SetCookie, re.S)[0],
 16 |         "IPLOC": re.findall('IPLOC=(.*?);', SetCookie, re.S)[0],
 17 |         "SUID": re.findall('SUID=(.*?);', SetCookie, re.S)[0]
 18 |     }
 19 |     
 20 |     url = "https://www.sogou.com/sug/css/m3.min.v.7.css"
 21 |     headers = {
 22 |         "Accept": "text/css,*/*;q=0.1",
 23 |         "Accept-Encoding": "gzip, deflate, br",
 24 |         "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
 25 |         "Connection": "keep-alive",
 26 |         "Cookie": "SNUID={}; IPLOC={}".format(cookie_params['SNUID'], cookie_params['IPLOC']),
 27 |         "Host": "www.sogou.com",
 28 |         "Referer": "https://weixin.sogou.com/",
 29 |         "User-Agent": UserAgent
 30 |     }
 31 |     response2 = requests.get(url, headers=headers)
 32 |     SetCookie = response2.headers['Set-Cookie']
 33 |     cookie_params['SUID'] = re.findall('SUID=(.*?);', SetCookie, re.S)[0]
 34 |     
 35 |     url = "https://weixin.sogou.com/websearch/wexinurlenc_sogou_profile.jsp"
 36 |     headers = {
 37 |         "Accept": "*/*",
 38 |         "Accept-Encoding": "gzip, deflate, br",
 39 |         "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
 40 |         "Connection": "keep-alive",
 41 |         "Cookie": "ABTEST={}; SNUID={}; IPLOC={}; SUID={}".format(cookie_params['ABTEST'], cookie_params['SNUID'], cookie_params['IPLOC'],
 42 |                                                                   cookie_params['SUID']),
 43 |         "Host": "weixin.sogou.com",
 44 |         "Referer": response1.url,
 45 |         "User-Agent": UserAgent
 46 |     }
 47 |     response3 = requests.get(url, headers=headers)
 48 |     SetCookie = response3.headers['Set-Cookie']
 49 |     cookie_params['JSESSIONID'] = re.findall('JSESSIONID=(.*?);', SetCookie, re.S)[0]
 50 |     
 51 |     url = "https://pb.sogou.com/pv.gif"
 52 |     headers = {
 53 |         "Accept": "image/webp,*/*",
 54 |         "Accept-Encoding": "gzip, deflate, br",
 55 |         "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
 56 |         "Connection": "keep-alive",
 57 |         "Cookie": "SNUID={}; IPLOC={}; SUID={}".format(cookie_params['SNUID'], cookie_params['IPLOC'], cookie_params['SUID']),
 58 |         "Host": "pb.sogou.com",
 59 |         "Referer": "https://weixin.sogou.com/",
 60 |         "User-Agent": UserAgent
 61 |     }
 62 |     response4 = requests.get(url, headers=headers, params=uigs_para)
 63 |     SetCookie = response4.headers['Set-Cookie']
 64 |     cookie_params['SUV'] = re.findall('SUV=(.*?);', SetCookie, re.S)[0]
 65 |     
 66 |     return cookie_params
 67 | 
 68 | 
 69 | def get_k_h(url):
 70 |     b = int(random.random() * 100) + 1
 71 |     a = url.find("url=")
 72 |     url = url + "&k=" + str(b) + "&h=" + url[a + 4 + 21 + b: a + 4 + 21 + b + 1]
 73 |     return url
 74 | 
 75 | 
 76 | def get_uigs_para(response):
 77 |     uigs_para = re.findall('var uigs_para = (.*?);', response.text, re.S)[0]
 78 |     if 'passportUserId ? "1" : "0"' in uigs_para:
 79 |         uigs_para = uigs_para.replace('passportUserId ? "1" : "0"', '0')
 80 |     uigs_para = json.loads(uigs_para)
 81 |     exp_id = re.findall('uigs_para.exp_id = "(.*?)";', response.text, re.S)[0]
 82 |     uigs_para['right'] = 'right0_0'
 83 |     uigs_para['exp_id'] = exp_id[:-1]
 84 |     return uigs_para
 85 | 
 86 | 
 87 | def main_v4(list_url, UserAgent):
 88 |     headers1 = {
 89 |         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
 90 |         "Accept-Encoding": "gzip, deflate, br",
 91 |         "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
 92 |         "Connection": "keep-alive",
 93 |         "Host": "weixin.sogou.com",
 94 |         "Upgrade-Insecure-Requests": "1",
 95 |         "User-Agent": UserAgent,
 96 |     }
 97 |     response1 = requests.get(list_url, headers=headers1)
 98 |     html = etree.HTML(response1.text)
 99 |     urls = ['https://weixin.sogou.com' + i for i in html.xpath('//div[@class="img-box"]/a/@href')]
100 |     
101 |     uigs_para = get_uigs_para(response1)
102 |     params = get_cookie(response1, uigs_para, UserAgent)
103 |     approve_url = 'https://weixin.sogou.com/approve?uuid={}'.format(uigs_para['uuid'])
104 |     headers2 = {
105 |         "Accept": "*/*",
106 |         "Accept-Encoding": "gzip, deflate, br",
107 |         "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
108 |         "Connection": "keep-alive",
109 |         "Cookie": "ABTEST={}; IPLOC={}; SUID={}; SUV={}; SNUID={}; JSESSIONID={};".format(params['ABTEST'], params['IPLOC'],
110 |                                                                                           params['SUID'], params['SUV'], params['SNUID'],
111 |                                                                                           params['JSESSIONID']),
112 |         "Host": "weixin.sogou.com",
113 |         "Referer": response1.url,
114 |         "User-Agent": UserAgent,
115 |         "X-Requested-With": "XMLHttpRequest"
116 |     }
117 |     for url in urls:
118 |         response2 = requests.get(approve_url, headers=headers2)
119 |         url = get_k_h(url)
120 |         headers3 = {
121 |             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
122 |             "Accept-Encoding": "gzip, deflate, br",
123 |             "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
124 |             "Connection": "keep-alive",
125 |             "Cookie": "ABTEST={}; SNUID={}; IPLOC={}; SUID={}; JSESSIONID={}; SUV={}".format(params['ABTEST'], params['SNUID'],
126 |                                                                                              params['IPLOC'], params['SUID'],
127 |                                                                                              params['JSESSIONID'],
128 |                                                                                              params['SUV']),
129 |             "Host": "weixin.sogou.com",
130 |             "Referer": list_url,
131 |             "Upgrade-Insecure-Requests": "1",
132 |             "User-Agent": UserAgent
133 |         }
134 |         response3 = requests.get(url, headers=headers3)
135 |         
136 |         fragments = re.findall("url \+= '(.*?)'", response3.text, re.S)
137 |         itemurl = ''
138 |         for i in fragments:
139 |             itemurl += i
140 |         
141 |         # 文章url拿正文
142 |         headers4 = {
143 |             "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
144 |             "accept-encoding": "gzip, deflate, br",
145 |             "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
146 |             "cache-control": "max-age=0",
147 |             "user-agent": UserAgent
148 |         }
149 |         response4 = requests.get(itemurl, headers=headers4)
150 |         html = etree.HTML(response4.text)
151 |         print(response4.status_code)
152 |         print(html.xpath('//meta[@property="og:title"]/@content')[0])
153 | 
154 | 
if __name__ == "__main__":
    # Search keyword; it is percent-encoded before being placed in the query.
    keyword = "咸蛋超人"
    encoded_keyword = parse.quote(keyword)
    # First page (page=1) of the search results for the keyword.
    search_url = 'https://weixin.sogou.com/weixin?type=2&s_from=input&query={}&_sug_=n&_sug_type_=&page=1'.format(encoded_keyword)
    browser_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0"
    main_v4(search_url, browser_agent)
160 | 


--------------------------------------------------------------------------------