├── Login
│   ├── demo
│   ├── encryptPwd.html
│   ├── encryptPwd.js
│   ├── getToken.html
│   └── 美团.html
├── README.md
├── config.py
├── creat_token.py
├── demo.py
├── get_uuid.py
├── mongodb.py
├── parse.py
└── view
    ├── db.pinglun.png.png
    ├── db_dianpu.png.png
    └── pacharm_pinglun.png.png

/Login/demo:
--------------------------------------------------------------------------------
import asyncio
import re

import execjs
from bs4 import BeautifulSoup
from pyppeteer import launch
from requests import Session


class MeituanLogin:
    def __init__(self, Account, Password):
        self.Account = Account
        self.Password = Password
        self.session = Session()
        self.session.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
        }

    async def geth5Fingerprint(self):
        """
        Render the two local helper pages in a headless browser and pull out
        the h5Fingerprint parameter and the RSA-encrypted password.
        :return: (h5Fingerprint, encryptPwd)
        """
        browser = await launch({'args': ['--no-sandbox', '--disable-infobars']}, userDataDir=r'D:\拉勾\userdata')
        page = await browser.newPage()
        await page.setJavaScriptEnabled(enabled=True)
        await page.setUserAgent(
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36')

        await page.goto('file:///D:/%E7%BE%8E%E5%9B%A2%E7%99%BB%E5%BD%95/%E7%BE%8E%E5%9B%A2.html')

        await asyncio.sleep(1)

        html = await page.content()
        # extract the fingerprint value that 美团.html renders into the page
        h5Fingerprint = re.search('(.*?)', html).group(1)
        print('h5 fingerprint', h5Fingerprint)

        await asyncio.sleep(1)

        await page.goto('file:///D:/%E7%BE%8E%E5%9B%A2%E7%99%BB%E5%BD%95/encryptPwd.html')

        await asyncio.sleep(1)

        html = await page.content()
        # extract the RSA-encrypted password that encryptPwd.html renders
        encryptPwd = re.search('(.*?)', html).group(1)
        print('RSA-encrypted password', encryptPwd)

        await browser.close()
        return h5Fingerprint, encryptPwd

    def getCsrf(self):
        """
        Fetch the CSRF token that authenticates the login request.
        :return: csrf
        """
        url = 'https://passport.meituan.com/account/unitivelogin?service=www&continue=https%3A%2F%2Fwww.meituan.com%2Faccount%2Fsettoken%3Fcontinue%3Dhttp%253A%252F%252Fcd.meituan.com%252F'

        r = self.session.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        csrf = soup.select('input[name="csrf"]')[0]['value']
        print('csrf', csrf)
        return csrf

    def encryptPwd(self):
        """
        RSA-encrypt the password with execjs, as an alternative to rendering
        encryptPwd.html in the browser.
        :return: the encrypted password
        """
        with open('encryptPwd.js', 'rb') as f:
            js = f.read().decode()

        ctx = execjs.compile(js)
        encryptPwd = ctx.call('encrypt', self.Password)
        print(encryptPwd)
        return encryptPwd

    def login(self):
        """
        Log in with the account and the RSA-encrypted password.
        :return:
        """
        loginAPI = 'https://passport.meituan.com/account/unitivelogin?risk_partner=0&risk_platform=1&risk_app=-1&uuid=34ebfd8f4ae642cea831.1560995483.1.0.0&service=www&continue=https%3A%2F%2Fwww.meituan.com%2Faccount%2Fsettoken%3Fcontinue%3Dhttp%253A%252F%252Fcd.meituan.com%252F'
        loop = asyncio.get_event_loop()
        h5Fingerprint, encryptPwd = loop.run_until_complete(self.geth5Fingerprint())
        csrf = self.getCsrf()
        # encryptPwd = self.encryptPwd()
        self.session.headers.update({
            'Referer': 'https://passport.meituan.com/account/unitivelogin?service=www&continue=https%3A%2F%2Fwww.meituan.com%2Faccount%2Fsettoken%3Fcontinue%3Dhttp%253A%252F%252Fcd.meituan.com%252F',
            'X-CSRF-Token': csrf,
            'X-Client': 'javascript',
            'X-Requested-With': 'XMLHttpRequest',
            # 'Cookie': '__mta=150821513.1558451397565.1561008520641.1561008570304.18; _lxsdk_cuid=16aedc3059e43-0a001c8235bd0e-7a1b34-100200-16aedc3059fc8; iuuid=8928289956E0DEDC81258FF113EE55E612C6AE864E76B21E40DF562EBEE60133; _lxsdk=8928289956E0DEDC81258FF113EE55E612C6AE864E76B21E40DF562EBEE60133; webp=1; __utmz=74597006.1558767512.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=74597006.426765488.1558767512.1558767512.1558953786.2; cityname=%E6%B7%B1%E5%9C%B3; i_extend=H__a100001__b2; ci=59; rvct=59%2C20%2C30%2C1%2C70%2C10%2C57%2C55%2C50; _hc.v=8aacd23f-d612-8bae-0082-9cea2ad85705.1558954306; uuid=34ebfd8f4ae642cea831.1560665036.1.0.0; mtcdn=K; lsu=; _lx_utm=utm_source%3Dso.com%26utm_medium%3Dorganic; _lxsdk_s=16b73539370-e55-13e-897%7C%7C17; SERV=www; LREF=aHR0cHM6Ly93d3cubWVpdHVhbi5jb20vYWNjb3VudC9zZXR0b2tlbj9jb250aW51ZT1odHRwJTNBJTJGJTJGY2QubWVpdHVhbi5jb20lMkY%3D; passport.sid=hnPB6A9LG_Djjn4azr3fdZ1Cb_8drWJt; passport.sid.sig=Wkwg3yhwzZxYDXr-dyw4EE0wxRU'
        })
        data = {
            'countrycode': '86',
            'email': self.Account,
            'password': encryptPwd,
            'origin': 'account-login',
            'csrf': csrf,
            'requestCode': '',
            'responseCode': '',
            'h5Fingerprint': h5Fingerprint
        }

        r = self.session.post(loginAPI, data=data).json()
        print(r)


if __name__ == '__main__':
    spider = MeituanLogin('phone number', 'password')
    spider.login()
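The README below notes that cookies harvested without logging in go stale after roughly five pages, so one natural use of this login class is to mint fresh cookies for the crawler. A minimal sketch of persisting the authenticated session, assuming pickle-based storage (save_cookies, load_cookies, and cookies.pkl are illustrative names, not part of the project):

import pickle

from requests import Session


def save_cookies(session: Session, path: str = 'cookies.pkl') -> None:
    # persist the logged-in session's cookie jar to disk
    with open(path, 'wb') as f:
        pickle.dump(session.cookies, f)


def load_cookies(session: Session, path: str = 'cookies.pkl') -> None:
    # restore a previously saved cookie jar into a fresh session
    with open(path, 'rb') as f:
        session.cookies.update(pickle.load(f))

After spider.login() succeeds, save_cookies(spider.session) would capture whatever cookies Meituan set during login.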
--------------------------------------------------------------------------------
/Login/美团.html:
--------------------------------------------------------------------------------
美团_h5Fingerprint破解

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Meituan Restaurant Crawler
==========

Dependencies
---------
Python 3.7.2

Crawl flow
--------
Starting from the first shop-listing page of a chosen city on Meituan, crawl every page of the shop list, extract each shop's id to build the shop-comment API URL, and then crawl all of that shop's comments.

    max_page = input('Enter the maximum page number to crawl: ')
    for page in range(1, int(max_page) + 1):
        fetch(page)

Challenges
---------
Anti-crawling measures observed on Meituan during testing:

- ordinary links returning 404 (the shop-detail page URL serves a 404 page)
- the _token parameter
- CAPTCHAs (the usual four-character mix of Chinese and English)
- cookies

The key to beating the anti-crawling is constructing the _token parameter. Cookies are also short-lived: testing shows that cookies obtained without logging in go stale after crawling roughly five pages of shops, so they have to be swapped constantly.

Countermeasures
--------
Constructing the _token parameter:

Decoding:
The '=' at the end of a captured _token suggests base64, and base64-decoding it yields a bytes string; zlib-decompressing that yields the dictionary from which _token is generated. Two important changing fields are ts and cts: ts is a 13-digit millisecond timestamp, and cts is ts + 100 * 1000. There is also a sign field with the same shape as _token; applying the same decoding to sign yields a string whose uuid can be regex-matched out of the listing page's source.

Encoding:
So _token is built with two rounds of zlib compression plus base64 encoding. The first round encodes the sign parameter; the second encodes the dictionary that generates _token. Once that dictionary is assembled, applying the same encoding to it yields _token.
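The decoding described above takes only a few lines to reproduce. A minimal sketch (decode_token is an illustrative name, and captured_token stands in for a _token value copied from a real request):

    import base64
    import json
    import zlib

    def decode_token(token):
        # reverse the encoding: base64-decode, then zlib-decompress
        return json.loads(zlib.decompress(base64.b64decode(token)))

    token_dict = decode_token(captured_token)
    # sign is encoded the same way; decoding it exposes the uuid
    sign = zlib.decompress(base64.b64decode(token_dict['sign'])).decode('utf-8')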
Notes
--------
Cookie switching is not implemented; once crawling starts failing you must replace the cookies yourself. (You could build a cookie pool, or page through with Selenium and save the harvested cookies locally; a minimal rotation sketch follows config.py below.)

Running
--------
From the project root:

    python demo.py

Results
---------
![image](https://github.com/xzh0723/meituan/blob/master/view/db_dianpu.png.png)
![image](https://github.com/xzh0723/meituan/blob/master/view/db.pinglun.png.png)
![image](https://github.com/xzh0723/meituan/blob/master/view/pacharm_pinglun.png.png)

Notice
=========
This code is for learning and exchange only. Do not use it for commercial purposes; you bear any consequences yourself.

--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
HEADERS = {
    'Cookie': '_lxsdk_cuid=16acb846703c8-008366d74aeda5-7a1b34-100200-16acb846703c8; client-id=918956e6-105b-4ca5-8968-89e93fb32be0; _hc.v=1910edeb-2eb7-a3c5-41ff-2a64d0eaf25e.1558451373; mtcdn=K; iuuid=49894D73803B6C7A56246D5EA9CE5D36334A7B19F721BDD32DA94E4F042CA7EC; _lxsdk=49894D73803B6C7A56246D5EA9CE5D36334A7B19F721BDD32DA94E4F042CA7EC; webp=1; __utmz=74597006.1558621239.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); uuid=4c097d9a35874d9bb55b.1558633426.1.0.0; rvct=20%2C70%2C45; __utma=74597006.1501988102.1558621239.1558621239.1558633443.2; latlng=28.266892,113.070056,1558633442917; ci=70; cityname=%E9%95%BF%E6%B2%99; i_extend=H__a100001__b2; _lx_utm=utm_source%3Dso.com%26utm_medium%3Dorganic; __mta=150257289.1558192631632.1558619209734.1558663121852.7; _lxsdk_s=16ae78fc7cc-688-efb-c45%7C%7C4',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Mobile Safari/537.36'
}

type_dict = {
    '代金券': '393', '蛋糕甜点': '11', '火锅': '17', '自助餐': '40', '川菜': '30', '湘菜': '510', '小吃快餐': '36',
    '其他美食': '24', '日韩料理': '28', '东北菜': '20003', '聚餐宴请': '395', '西餐': '35', '香锅烤鱼': '20004',
    '烧烤烤肉': '54', '江浙菜': '56', '中式烧烤/烤串': '400', '粤菜': '57', '咖啡酒吧': '41', '西北菜': '58', '京菜鲁菜': '59',
    '云贵菜': '60', '东南亚菜': '62', '海鲜': '63', '素食': '217', '台湾/客家菜': '227', '创意菜': '228', '汤/粥/炖菜': '229',
    '蒙餐': '232', '新疆菜': '233'
}

# Popular cities only; there are too many cities to list them all
city_dict = {
    '长沙': 'chs', '成都': 'cd', '重庆': 'cq', '杭州': 'hz', '上海': 'sh',
    '南京': 'nj', '武汉': 'wh', '北京': 'bj', '广州': 'gz', '深圳': 'sz'
}

cityName = input('Enter the city to query (a key of city_dict, e.g. 长沙): ')
cityId = city_dict[cityName]

_type = input('Enter the food category to query (a key of type_dict, e.g. 火锅): ')
type_ = type_dict[_type]

collection = f'{cityName}/{_type}'

originUrl = f'http://{cityId}.meituan.com/meishi/c{type_}/'
base_url = f'https://{cityId}.meituan.com/meishi/api/poi/getPoiList?'
# Comment-list API consumed by demo.py (assumed endpoint; adjust if Meituan's API differs)
comment_url = 'https://www.meituan.com/meishi/api/poi/getMerchantComment?'

MONGO_URI = 'localhost'
MONGO_PORT = 27017
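The README's Notes section suggests a cookie pool as the fix for expiring cookies. A minimal rotation sketch under that suggestion (COOKIE_POOL and next_headers are hypothetical; the pool contents must be harvested yourself, e.g. with the Login demo or Selenium, and importing config triggers its interactive prompts):

import itertools

from config import HEADERS

# hypothetical pool of harvested cookie strings; fill in real values
COOKIE_POOL = [
    # '_lxsdk_cuid=...; uuid=...; _hc.v=...',
]
_cookie_cycle = itertools.cycle(COOKIE_POOL)


def next_headers():
    # copy HEADERS and swap in the next cookie from the pool
    headers = dict(HEADERS)
    headers['Cookie'] = next(_cookie_cycle)
    return headers

Each request would then use requests.get(url, params=params, headers=next_headers()) instead of the fixed HEADERS.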
--------------------------------------------------------------------------------
/creat_token.py:
--------------------------------------------------------------------------------
import base64
import json
import zlib
from datetime import datetime

from config import *
from get_uuid import get_uuid


class CreatToken:
    def __init__(self, page):
        self.page = page
        self.url = originUrl + 'pn{}/'.format(self.page)

    def get_sign(self):
        uuid = get_uuid()
        # cateId must match the cateId actually sent in the request params (type_)
        sign = f"areaId=0&cateId={type_}&cityName={cityName}&dinnerCountAttrId=&optimusCode=1&originUrl={self.url}&page={self.page}&partner=126&platform=1&riskLevel=1&sort=&userId=&uuid={uuid}"
        # first round: zlib-compress the sign string, then base64-encode it
        sign_ = zlib.compress(bytes(json.dumps(sign, ensure_ascii=False), encoding='utf-8'))
        sign_ = str(base64.b64encode(sign_), encoding='utf-8')
        # print(sign_)
        return sign_

    def get_token(self):
        sign = self.get_sign()
        ts = int(datetime.now().timestamp() * 1000)  # 13-digit millisecond timestamp
        # print(ts)
        data = {
            'rId': 100900,
            'ver': '1.0.6',
            'ts': ts,
            'cts': ts + 100 * 1000,
            'brVD': [1326, 538],
            'brR': [[1326, 538], [1326, 538], 24, 24],
            'bI': [f'https://{cityId}.meituan.com/meishi/c{type_}/pn{self.page}/',
                   f'https://{cityId}.meituan.com/meishi/c{type_}/pn{self.page - 1}/'],
            'mT': [],
            'kT': [],
            'aT': [],
            'tT': [],
            'aM': '',
            'sign': sign
        }
        # second round: zlib-compress the JSON dictionary, then base64-encode it
        token_decode = zlib.compress(
            bytes(json.dumps(data, separators=(',', ':'), ensure_ascii=False), encoding="utf8"))
        token = str(base64.b64encode(token_decode), encoding="utf8")
        # print(token)
        return token


if __name__ == '__main__':
    pass
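A quick sanity check on get_token() is to round-trip it through base64 and zlib (a sketch; network access is required because get_sign() fetches a uuid from the listing page, and importing creat_token triggers the interactive prompts in config.py):

import base64
import json
import zlib

from creat_token import CreatToken

token = CreatToken(page=1).get_token()
# reverse the second encoding round to recover the token dictionary
decoded = json.loads(zlib.decompress(base64.b64decode(token)))
assert decoded['ver'] == '1.0.6'
assert decoded['cts'] - decoded['ts'] == 100 * 1000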
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

"""
Updated at 14:20, May 24, 2019
@title: Meituan shop-info crawler
@author: xzh0723
"""

import json
import math

import requests

from config import *
from creat_token import CreatToken
from get_uuid import get_uuid
from mongodb import MongoDB
from parse import *


def fetch(page):
    db = MongoDB()

    uuid = get_uuid()
    token = CreatToken(page).get_token()
    params = {
        'cityName': cityName,
        'cateId': type_,
        'areaId': '0',
        'sort': '',
        'dinnerCountAttrId': '',
        'page': page,
        'userId': '',
        'uuid': uuid,
        'platform': '1',
        'partner': '126',
        'originUrl': originUrl + 'pn{}/'.format(page),
        'riskLevel': '1',
        'optimusCode': '1',
        '_token': token
    }

    res = requests.get(base_url, params=params, headers=HEADERS)
    result = json.loads(res.text)
    items = result['data']['poiInfos']
    for item in items:
        store = parse_store(item)
        # print(store)
        # db.save(store)  # deferred until the comments are attached below

        poiId = store['poiId']
        commentCount = store['allCommentNum']
        # 10 comments per page, so round the page count up
        max_page = math.ceil(int(commentCount) / 10)
        comment_list = []
        for offset in range(max_page):
            comment_params = {
                'uuid': get_uuid(),
                'id': poiId,
                'userId': '2490983615',
                'offset': offset * 10,
                'pageSize': '10',
            }

            resp = requests.get(comment_url, params=comment_params, headers=HEADERS)
            # print(resp.text)
            comment_result = json.loads(resp.text)
            # distinct names so the outer items/item loop is not shadowed
            for comment_item in comment_result['data']['comments']:
                comment = parse_comment(comment_item)
                print(comment)
                comment_list.append(comment)
        store['comment'] = comment_list
        print(store)
        db.save(store)


if __name__ == '__main__':
    max_page = input('Enter the maximum page number to crawl: ')
    for page in range(1, int(max_page) + 1):
        fetch(page)

--------------------------------------------------------------------------------
/get_uuid.py:
--------------------------------------------------------------------------------
import re

import requests

from config import *


def get_uuid():
    """Regex-match the uuid out of the listing page's source."""
    res = requests.get(originUrl, headers=HEADERS)
    uuid = re.search("uuid: '(.*?)',", res.text, re.S).group(1)
    # print(uuid)
    return uuid

--------------------------------------------------------------------------------
/mongodb.py:
--------------------------------------------------------------------------------
import pymongo

from config import *


class MongoDB:

    def __init__(self):
        self.client = pymongo.MongoClient(host=MONGO_URI, port=MONGO_PORT)
        self.db = self.client.meituan

    def save(self, item):
        try:
            if self.db[collection].insert_one(dict(item)):
                print('Inserted into the database')
        except Exception as e:
            print('Database insert failed:', e.args)


if __name__ == '__main__':
    pass

--------------------------------------------------------------------------------
/parse.py:
--------------------------------------------------------------------------------
import time


def parse_store(item):
    """Pick the store fields we keep out of a poiInfos entry."""
    store = {}
    field_map = {
        'title': 'title', 'address': 'address', 'frontImg': 'frontImg',
        'allCommentNum': 'allCommentNum', 'poiId': 'poiId', 'avgPrice': 'avgPrice', 'avgScore': 'avgScore'
    }
    for field, attr in field_map.items():
        store[field] = item[attr]
    return store


def parse_comment(item):
    """Pick the user and comment fields we keep out of a comments entry."""
    comment = {}
    user = {}
    field_map = {
        'userName': 'userName', 'userId': 'userId', 'userUrl': 'userUrl', 'userLevel': 'userLevel'
    }
    for field, attr in field_map.items():
        user[field] = item[attr]
    comment['user'] = user
    field_map_ = {
        'comment': 'comment', 'menu': 'menu', 'merchantComment': 'merchantComment', 'readCnt': 'readCnt',
        'replyCnt': 'replyCnt', 'zanCnt': 'zanCnt', 'avgPrice': 'avgPrice',
    }
    for field, attr in field_map_.items():
        comment[field] = item[attr]
    # commentTime: 13-digit millisecond timestamp -> local datetime string
    timestamp = int(item['commentTime']) / 1000
    commentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
    comment['commentTime'] = commentTime
    # dealEndtime: 10-digit second timestamp -> local datetime string
    if item['dealEndtime']:
        timestamp = int(item['dealEndtime'])
        dealEndtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
        comment['dealEndtime'] = dealEndtime
    return comment
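parse.py handles two timestamp widths, milliseconds for commentTime and seconds for dealEndtime. A worked example of the difference (the values are illustrative, not taken from real data):

import time

ms_ts = 1558633442917  # 13 digits: milliseconds, so divide by 1000 first
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ms_ts / 1000)))

s_ts = 1558633442      # 10 digits: seconds, usable as-is
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(s_ts)))

Both lines print the same local datetime, since the two values denote the same instant at different precisions.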
--------------------------------------------------------------------------------
/view/db.pinglun.png.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xzh0723/MeiTuan/aee136d883fa2d301117178ffb9c00b1e653b388/view/db.pinglun.png.png

--------------------------------------------------------------------------------
/view/db_dianpu.png.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xzh0723/MeiTuan/aee136d883fa2d301117178ffb9c00b1e653b388/view/db_dianpu.png.png

--------------------------------------------------------------------------------
/view/pacharm_pinglun.png.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xzh0723/MeiTuan/aee136d883fa2d301117178ffb9c00b1e653b388/view/pacharm_pinglun.png.png
--------------------------------------------------------------------------------