├── .gitignore ├── README.md ├── comment_crawl.py ├── config.ini ├── poi_crawl.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .idea/ 3 | __pycache__/ 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ctrip_Crawler 携程景点爬虫 2 | 3 | 4 | 5 | ![image-20210313235926448](https://irimskyblog.oss-cn-beijing.aliyuncs.com/content/20210313235929.png) 6 | 7 | 8 | 9 | ![image-20210314000428177](https://irimskyblog.oss-cn-beijing.aliyuncs.com/content/20210314000432.png) 10 | 11 | 12 | 13 | - 爬取的是 [**携程移动端**](https://m.ctrip.com/webapp/you/gspoi/sight/1.html?seo=1) 的数据(景点数据以及评论) 14 | 15 | - 修改`config.ini`中的配置可以改变**目标城市**(默认北京)以及**爬取模式** 16 | 17 | ![](https://irimskyblog.oss-cn-beijing.aliyuncs.com/content/20210507163603.png) 18 | 19 | 20 | 21 | - 爬取结果有两部分:`data/poi.csv`为**景点数据**,`data/comment/{id}.csv`为对应ID的景点的**评论数据** 22 | 23 | - 评论内容的爬取有两种方法: 24 | - 将`config.ini`中的`isCrawlComment`置为1,运行`poi_crawl.py`文件,在爬取 景点数据 的过程中爬取 评论数据 25 | - 将`config.ini`中的`isCrawlComment`置为0,运行`poi_crawl.py`文件,在爬取 景点数据 结束后运行再运行`comment_crawl.py`文件,获取 景点数据 中的所有景点的评论 26 | 27 | - 每次运行前都会在同一文件夹下复制一份上一次爬取的景点结果的备份,名为`back.csv` 28 | 29 | - 数据中 **价格**、**最低价格**为response中的数据,暂无参考价值 30 | 31 | - 后面四种人群门票价格为**预估的销量加权平均价格**,如果有不同需求可以修改 `GetTicketPrice` 函数。(返回的数据为所有的门票价格) 32 | 33 | - 景点数据中的**开放时间**与**优惠政策** 数据的格式为json格式 34 | 35 | - 爬取的 评论数据 格式为: 36 | 37 | - **用户ID** 38 | - **评论文本** 39 | - **发送时间戳** 40 | - **赞同数** 41 | 42 | TODO: 43 | 44 | 后续可能会支持: 45 | 46 | 输入城市名称自动获取城市编号 (√) 47 | 如果上次爬取过程中断可以从断点处开始爬取 (√) -------------------------------------------------------------------------------- /comment_crawl.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from requests import post 4 | import csv 5 | 6 | URL = 
'https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList' # 获取评论数据列表的URL 7 | SizePerPage = 20 # 每页的数据量,最好不好随意改变 8 | data = {"arg": {"resourceId": 229, "resourceType": 11, "pageIndex": 1, "pageSize": SizePerPage, "sortType": 3, "commentTagId": "0", 9 | "collapseType": 1, "channelType": 7, "videoImageSize": "700_392", "starType": 0}, 10 | "head": {"cid": "09031065211914680477", "ctok": "", "cver": "1.0", "lang": "01", "sid": "8888", "syscode": "09", 11 | "auth": None, "extension": [{"name": "protocal", "value": "https"}]}, "contentType": "json"} 12 | 13 | 14 | def GetComments(Id, total): 15 | f = open(f'data/comments/{Id}.csv', 'w', encoding='utf-8') 16 | DATA = data.copy() 17 | DATA['arg']['resourceId'] = Id 18 | wr = csv.writer(f) 19 | times = total // SizePerPage 20 | for i in range(times): 21 | DATA['arg']['pageIndex'] = i + 1 22 | resp = post(URL, json=DATA) 23 | comments = resp.json()['result']['items'] 24 | if not comments: 25 | print(resp.json()) 26 | break 27 | for comment in comments: 28 | if comment.get('languageType', '') != "zh-cn" or len(comment['content']) < 10: 29 | continue 30 | userId = comment.get('userInfo') 31 | if userId: 32 | userId = userId.get('userId', 'null') 33 | 34 | rrr = [userId, comment['content'], comment['publishTime'], comment['usefulCount']] 35 | wr.writerow(rrr) 36 | print(comment['content']) 37 | 38 | time.sleep(1) 39 | resp.close() 40 | 41 | f.close() 42 | 43 | 44 | if __name__ == '__main__': 45 | with open('data/pois.csv', 'r', encoding='utf-8') as f: 46 | rd = csv.reader(f) 47 | cnt = 0 48 | flag = 0 49 | for row in rd: 50 | if cnt == 0: 51 | cnt = 1 52 | continue 53 | ID = int(row[2]) 54 | print(ID, row[0]) 55 | GetComments(ID, int(row[11])) 56 | -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [poi] 2 | ; 城市 3 | city = 南京 4 | ;是否为中断后续写,1为是 5 | isRestart = 0 6 | ;是否在爬取过程中爬取评论,1为是 7 | 
import os
import time
import configparser
from requests import post
import csv
import logging
import json
import shutil
from bs4 import BeautifulSoup
from comment_crawl import GetComments

# Ctrip mobile-site REST endpoints.
URL = 'https://m.ctrip.com/restapi/soa2/13342/json/getSightRecreationList'      # sight list
DetailURL = 'https://m.ctrip.com/restapi/soa2/18254/json/getPoiMoreDetail'      # sight details
TicketURL = 'https://m.ctrip.com/restapi/soa2/12530/json/getProductShelf'       # ticket prices
CityURL = 'https://m.ctrip.com/restapi/soa2/13342/json/SearchSightRecreation'   # city-id lookup

isRestart = 0       # 1 = resume an interrupted crawl (overridden by config.ini [poi] isRestart)
isCrawlComment = 0  # 1 = crawl comments while crawling sights (overridden by config.ini [poi] isCrawlComment)
history = set()     # sight ids already written to data/pois.csv; used to skip rows when resuming

# POST payload template for the sight-list endpoint.  'districtId' selects the
# target city and 'index' the page number; both are patched at runtime.
data = {
    'fromChannel': 2,
    'index': 1,
    'count': 20,
    'districtId': 9,  # patched with getCityID() result to change the crawled city
    'sortType': 0,
    'categoryId': 0,
    'lat': 0,
    'lon': 0,
    'showNewVersion': True,
    'locationFilterDistance': 300,
    'locationDistrictId': 0,
    'themeId': 0,
    'level2ThemeId': 0,
    'locationFilterId': 0,
    'locationFilterType': 0,
    'sightLevels': [],
    'ticketType': None,
    'commentScore': None,
    'showAgg': True,
    'fromNearby': '',
    'sourceFrom': 'sightlist',
    'themeName': '',
    'scene': '',
    'hiderank': '',
    'isLibertinism': False,
    'hideTop': False,
    'head': {
        'cid': '09031065211914680477',
        'ctok': '',
        'cver': '1.0',
        'lang': '01',
        'sid': '8888',
        'syscode': '09',
        'auth': '',
        'xsid': '',
        'extension': [],
    },
}

# POST payload template for the detail endpoint; 'poiId' is patched per sight.
detail_data = {
    'poiId': 87211,
    'scene': 'basic',
    'head': {
        'cid': '09031065211914680477',
        'ctok': '',
        'cver': '1.0',
        'lang': '01',
        'sid': '8888',
        'syscode': '09',
        'auth': '',
        'xsid': '',
        'extension': [],
    },
}
ticket_data = { 73 | 'head': {'cid': '09031065211914680477', 74 | 'syscode': '09', 75 | 'extension': [{'name': 'needNewStructureV2', 'value': 'true'}, 76 | {'name': 'crawlerKey', 77 | 'value': '0bd0f473f984aaf20ece34b437cce49e51d55d4baefe0ca56ac3559956a72691'}, 78 | {'name': 'fingerprintKeys', 79 | 'value': 'N1mwb6YgTeUNE4HEsYM9iqhWLYdte05EQLjd9WHYFsrb3IsNvUkj8Yo6jLMWgkvb1jfYUtEADwFmizljDYX7eGmw9ZvMcjhbvs1eoZYBgjO4yXY5XyN7vGDYDlwMpjQ3edSiFTYkYlYMNr9cEQ4wtTxZqYSajk1jLhW0YzYXYfoYnoiFTikZiMhj4Y9cEodiDlYldwO8yfpyFpYDzwhYpARsSwoTRfHEDcRtfyfoyahwT0JdfEtFJhOEgXR0YqoKaGymbi9kJnOJcBjkmRLbWPPxXOraSR18wq5E5fi0AecPjSYZNR1gwhLWm6wlgwNbRgzv8oYTOWA1JtHJFMjUSEgYN8jMgwQPvGSjsYzPEFDJ4GEk5eg5ykaW3AYnY0nE64JgMEZDe6OyStWOdY8YU7RdGwmHRUSENGRfoyoXyOLwc6J4BEBoESMwcqvTYQ0E5ojNSWgDWPbWaZYz4YbTYsUR90YTQWo6YdlYl0Ypoj4Uet1E1bW0Qeo9wbdeU8jPMYNsy8lEZbj65ElarpUj6zwZs'}, 80 | {'name': 'H5', 'value': 'H5'}]}, 81 | 'debug': False, 82 | 'pageid': '214070', 83 | 'contentType': 'json', 84 | 'clientInfo': {'pageId': '214070', 85 | 'platformId': None, 86 | 'crnVersion': '2021-02-03 20:08:05', 87 | 'location': {'lat': '', 88 | 'lon': '', 89 | 'cityId': None, 90 | 'locatedCityId': None, 91 | 'districtId': None, 92 | 'locatedDistrictId': None}, 93 | 'locale': 'zh-CN', 94 | 'currency': 'CNY'}, 95 | 'spotid': 229, 96 | 'poiId': 75595, 97 | 'locale': 'zh-CN', 98 | 'currency': 'CNY', 99 | 'platformId': None, 100 | 'needFilter': True, 101 | 'resourceLimit': True} 102 | 103 | city_data = { 104 | 'KeyWord': '', 105 | 'DistrictId': 1, 106 | 'CategoryId': 0, 107 | 'head': { 108 | 'cid': '09031065211914680477', 109 | 'ctok': '', 110 | 'cver': '1.0', 111 | 'lang': '01', 112 | 'sid': '8888', 113 | 'syscode': '09', 114 | 'auth': '', 115 | 'xsid': '', 116 | 'extension': [] 117 | } 118 | } 119 | 120 | 121 | def getCityID(city_name): 122 | city_data['KeyWord'] = city_name 123 | city_res = post(CityURL, json=city_data).json()['districtResult'] 124 | if len(city_res) == 0: 125 | logging.error('城市名错误!无结果') 126 | exit(0) 127 | 
128 | elif len(city_res) >= 1: 129 | cityID = city_res[0]['districtId'] 130 | if len(city_res) > 1: 131 | logging.warning(f'多个相似城市名结果,注意确认\n') 132 | print([i["districtName"] + "-" + i["name"] for i in city_res]) 133 | 134 | print(f'目标城市:{city_res[0]["districtName"] + "-" + city_res[0]["name"]}\n') 135 | return cityID 136 | 137 | 138 | def CalPrice(kv): 139 | sumSale = 0 140 | avg = 0 141 | for k, v in kv: 142 | sumSale += k 143 | for k, v in kv: 144 | try: 145 | avg += k / sumSale * v 146 | except: 147 | continue 148 | return avg 149 | 150 | 151 | def GetTicketPrice(spotid, poiId): 152 | tdata = ticket_data.copy() 153 | tdata['spotid'] = spotid 154 | tdata['poiId'] = poiId 155 | ticket_res = post(TicketURL, json=tdata) 156 | dataa = ticket_res.json().get('data') 157 | if not dataa: 158 | return [0, 0, 0, 0] 159 | shelfGroup = dataa.get('shelfGroups') 160 | if not shelfGroup: 161 | return [0, 0, 0, 0] 162 | 163 | chengr = [] 164 | laor = [] 165 | xues = [] 166 | ertong = [] 167 | lr = 0 168 | xs = 0 169 | cr = 0 170 | et = 0 171 | tt = 0 172 | overall = [] 173 | maxsales = 0 174 | for i in shelfGroup: 175 | ticketGroups = i.get('ticketGroups') 176 | if ticketGroups: 177 | for j in ticketGroups: 178 | sales = j.get('yearlySale') 179 | maxsales = max(sales, maxsales) 180 | if j.get('mainTicket', False): 181 | tt = 1 182 | subTickets = j.get('subTicketGroups') 183 | if subTickets: 184 | for sub in subTickets: 185 | if not sub.get('subTicketGroupInfo') or not sub['subTicketGroupInfo'].get('priceInfo') or not sub['subTicketGroupInfo']['priceInfo'].get('price'): 186 | continue 187 | 188 | if '成人' in sub['subTicketGroupInfo']['name'] + sub['subTicketGroupInfo'].get('subName', ''): 189 | chengr.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 190 | cr = 1 191 | if '老人' in sub['subTicketGroupInfo']['name'] + sub['subTicketGroupInfo'].get('subName', ''): 192 | laor.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 193 | lr = 1 194 | if '学生' in 
sub['subTicketGroupInfo']['name'] + sub['subTicketGroupInfo'].get('subName', ''): 195 | xues.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 196 | xs = 1 197 | if '儿童' in sub['subTicketGroupInfo']['name'] + sub['subTicketGroupInfo'].get('subName', ''): 198 | ertong.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 199 | et = 1 200 | overall.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 201 | 202 | if tt == 0: 203 | for i in shelfGroup: 204 | ticketGroups = i.get('ticketGroups') 205 | if ticketGroups: 206 | for j in ticketGroups: 207 | sales = j.get('yearlySale') 208 | if not sales == maxsales: 209 | continue 210 | 211 | subTickets = j.get('subTicketGroups') 212 | if subTickets: 213 | for sub in subTickets: 214 | if not sub.get('subTicketGroupInfo') or not sub['subTicketGroupInfo'].get('priceInfo') or not sub['subTicketGroupInfo']['priceInfo'].get('price'): 215 | continue 216 | 217 | if '票' not in sub['subTicketGroupInfo']['name'] + sub['subTicketGroupInfo'].get('subName', 218 | '') \ 219 | or sub['subTicketGroupInfo']['priceInfo']['price'] > 300: 220 | continue 221 | 222 | if '成人' in sub['subTicketGroupInfo']['name'] + sub['subTicketGroupInfo'].get('subName', 223 | ''): 224 | chengr.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 225 | cr = 1 226 | if '老人' in sub['subTicketGroupInfo']['name'] + sub['subTicketGroupInfo'].get('subName', 227 | ''): 228 | laor.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 229 | lr = 1 230 | if '学生' in sub['subTicketGroupInfo']['name'] + sub['subTicketGroupInfo'].get('subName', 231 | ''): 232 | xues.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 233 | xs = 1 234 | if '儿童' in sub['subTicketGroupInfo']['name'] + sub['subTicketGroupInfo'].get('subName', 235 | ''): 236 | ertong.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 237 | et = 1 238 | overall.append((sales, sub['subTicketGroupInfo']['priceInfo']['price'])) 239 | 
else: 240 | logging.warning('无sub') 241 | 242 | if cr == 0: 243 | chengr = overall.copy() 244 | if lr == 0: 245 | laor = chengr.copy() 246 | if xs == 0: 247 | xues = chengr.copy() 248 | if et == 0: 249 | ertong = chengr.copy() 250 | 251 | crp = CalPrice(chengr) 252 | xsp = CalPrice(xues) 253 | lrp = CalPrice(laor) 254 | etp = CalPrice(ertong) 255 | 256 | ticket_res.close() 257 | return [crp, lrp, xsp, etp] 258 | 259 | 260 | def GetDetail(poiId): 261 | ddata = detail_data.copy() 262 | ddata['poiId'] = poiId 263 | detail_res = post(DetailURL, json=ddata) 264 | templateList = detail_res.json().get('templateList') 265 | spendTime = '' 266 | opentime = '' 267 | desc = '' 268 | preferential = {} 269 | 270 | if not templateList: 271 | return [spendTime, opentime, desc, preferential] 272 | 273 | for i in templateList: 274 | if i.get('templateName') == '温馨提示': 275 | moduleList = i.get('moduleList') 276 | if moduleList: 277 | for j in moduleList: 278 | if j.get('moduleName') == '开放时间': 279 | mod = j.get('poiOpenModule') 280 | spendTime = mod.get('playSpendTime') 281 | opentime = str(mod) 282 | elif j.get('moduleName') == '优待政策': 283 | mod = j.get('preferentialModule').get('policyInfoList') 284 | if mod: 285 | for l in mod: 286 | cus = l.get('customDesc') 287 | preferential[cus] = [] 288 | for k in l.get('policyDetail'): 289 | lst = [k.get('limitation'), k.get('policyDesc')] 290 | preferential[cus].append(lst) 291 | 292 | 293 | elif i.get('templateName') == '信息介绍': 294 | moduleList = i.get('moduleList') 295 | if moduleList: 296 | for j in moduleList: 297 | if j.get('moduleName') == '图文详情': 298 | mod = j.get('introductionModule') 299 | desc = mod.get('introduction') 300 | soup = BeautifulSoup(desc, 'lxml') 301 | desc = soup.text 302 | 303 | detail_res.close() 304 | return [spendTime, opentime, desc, preferential] 305 | 306 | 307 | if __name__ == '__main__': 308 | if not os.path.exists('data/'): 309 | os.mkdir('data') 310 | if not os.path.exists('data/comments'): 311 | 
if __name__ == '__main__':
    # Make sure the output directories exist before any file is written.
    if not os.path.exists('data/'):
        os.mkdir('data')
    if not os.path.exists('data/comments'):
        os.mkdir('data/comments')

    conf = configparser.ConfigParser()
    conf.read('config.ini', encoding="utf-8")

    try:
        city = conf.get('poi', 'city')
    except (configparser.NoSectionError, configparser.NoOptionError):
        city = '北京'  # default target city

    # Best-effort: options may be missing and there may be no previous result
    # file to back up — neither is fatal.
    try:
        isRestart = int(conf.get('poi', 'isRestart'))
        isCrawlComment = int(conf.get('poi', 'isCrawlComment'))
        shutil.copyfile('data/pois.csv', 'data/back.csv')  # back up last run before (over)writing
    except Exception:
        pass

    data['districtId'] = getCityID(city)  # resolve city name to its district id

    if isRestart == 1:
        # Resume mode: remember every sight id already in the csv, then append.
        try:
            with open('data/pois.csv', 'r', encoding='utf-8') as rf:
                rd = csv.reader(rf)
                cnt = 0
                for r in rd:
                    cnt += 1
                    if cnt == 1:
                        continue  # skip header row
                    history.add(int(r[2]))

            f = open('data/pois.csv', 'a', encoding='utf-8', newline='')

        except FileNotFoundError:
            logging.error('无法续写,文件不存在')
            # Fix: must fall back to a FRESH crawl (isRestart = 0) so the header
            # row below is written; the original set it to 1 and produced a
            # headerless csv that comment_crawl.py could not read correctly.
            isRestart = 0
            f = open('data/pois.csv', 'w', encoding='utf-8', newline='')

    else:
        f = open('data/pois.csv', 'w', encoding='utf-8', newline='')

    wr = csv.writer(f)
    if isRestart == 0:
        wr.writerow(['名称', '英文名', 'id', 'poiID', '经度', '维度', '标签', '特色', '价格', '最低价格', '评价分数',
                     '评论数量', '封面图片', '成人票价格', '老人票价格', '学生票价格', '儿童票价格', '建议游玩', '开放时间', '介绍', '优待政策'])

    # Hard upper bound on pages; the loop breaks as soon as a page is empty.
    for page in range(1, 5000):
        print(f'开始爬取第{page}页')
        data['index'] = page
        poiListRes = post(URL, json=data)
        if not poiListRes.json().get('result'):
            print(poiListRes.json())
            break
        poiList = poiListRes.json()['result']['sightRecreationList']
        if len(poiList) == 0:  # empty page: crawl finished (or the API errored)
            break
        for poi in poiList:
            row = []
            ID = poi.get('id', '')
            print(poi.get('name'), end='')
            if isRestart and int(ID) in history:
                print(' 已存在')  # already crawled in the interrupted run
                continue
            print()
            row.append(poi.get('name', ''))          # name
            row.append(poi.get('eName', ''))         # English name
            row.append(ID)                           # sight id
            poiID = poi.get('poiId', '')
            row.append(poiID)                        # poi id
            row.append(poi['coordInfo']['gDLat'])    # latitude
            row.append(poi['coordInfo']['gDLon'])    # longitude

            # Merge the three tag lists, deduplicated, '|'-separated.
            tagSet = set()
            tagSet.update(poi.get('resourceTags', []))
            tagSet.update(poi.get('tagNameList', []))
            tagSet.update(poi.get('themeTags', []))
            row.append('|'.join(tagSet))             # tags

            row.append('|'.join(poi.get('shortFeatures', [])))  # features

            row.append(poi.get('price', 0))              # price (raw, unreliable)
            row.append(poi.get('displayMinPrice', 0))    # minimum price (raw, unreliable)

            row.append(poi.get('commentScore') or 0.0)   # review score
            commentCount = poi.get('commentCount') or 0
            row.append(commentCount)                     # review count

            row.append(poi.get('coverImageUrl', ''))     # cover image
            row += GetTicketPrice(spotid=ID, poiId=poiID)  # adult/senior/student/child prices
            row += GetDetail(poiId=poiID)                  # play time / open hours / intro / policies

            # If the child policy says "免费" (free) and never half-price or
            # discounted, zero the estimated child price (row[-5]).
            ertong = row[-1].get('儿童')
            half_or_discount = 0
            is_free = 0
            if ertong:
                for policy in ertong:
                    if '半价' in policy[1] or '优惠' in policy[1]:
                        half_or_discount = 1
                    elif policy[1] == '免费':
                        is_free = 1
                if half_or_discount == 0 and is_free == 1:
                    row[-5] = 0  # child ticket is free

            wr.writerow(row)

            time.sleep(1)  # throttle requests

            if isCrawlComment == 1:  # optionally crawl this sight's comments too
                try:
                    print(f'开始爬取{poi.get("name", "")}评论')
                    GetComments(ID, commentCount)
                except Exception as e:
                    print(e)
                    logging.error(f'爬取{poi.get("name", "")}评论错误!')

                time.sleep(2)  # extra delay after a comment crawl

    f.close()