├── .gitattributes ├── README.md ├── __pycache__ └── ctrip_funcs.cpython-36.pyc ├── cookie └── ticket.csv ├── ctrip_funcs.py ├── gen_ticket.py ├── js └── get_callback.js └── main.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrape Learning (ctrip) 2 | **使用时先跑'gen_ticket.py'生成ctrip_ticket，再跑'main.py'**
3 | 破解eleven参数后，无需调用selenium即可请求到详细的价格信息，从而提升获取速度、减少资源占用。
4 | 该爬虫可爬取： 5 | * 每个酒店的名称、评分 6 | * 每个酒店在不同预订时间下每种房型的价格 7 | * 每个酒店在不同预订时间下每种房型的满意度 8 | * 每个酒店在不同预订时间下每种房型的剩余可订数
9 | ## 1. 获得房间价格的重要参数"eleven"的生成方法 10 | * "eleven"由一个混淆js文件-"oceanball"生成，该文件具有随机性 11 | * 生成方法:ctrip_funcs.py中的"get_oceanball"、"get_eleven"两个函数，需要配合js代码和python共同生成 12 | ## 2. 发现获得剩余可订房间仅需要Cookie: ctrip_ticket 13 | * 这个cookie疑似使用“Http-only”Flag 发送，无法直接读取，只能通过浏览器获得。([reference](https://stackoverflow.com/questions/1022112/why-doesnt-document-cookie-show-all-the-cookie-for-the-site)) 14 | * 该cookie时效较长，可以用selenium每30分钟提取一次 15 | -------------------------------------------------------------------------------- /__pycache__/ctrip_funcs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/evanleungc/ctrip_spider/385d5fdab97a587766839985609d5434c7f996c0/__pycache__/ctrip_funcs.cpython-36.pyc -------------------------------------------------------------------------------- /cookie/ticket.csv: -------------------------------------------------------------------------------- 1 | ticket 2 | uoeOwviAJ6VQEgTNwLuTqSV9j/bS+aOP3Riia1P+kyQbgkQZsD2gieu3X0He46SZnHsPRr5UJ+5Fy9rQNa99C7UfsQg+vjmHBpzo4BXP5+/3LcLZ5QCh272+0j2OYuw54AMxB3zvMDC6XcW/gPiercK41+LSQpoMwCiyPRRRls6no29Tb5bejnmwATYgFv2mxZUe3s7LctKONKONqLgKfD+omhC70RR0/TpE7DhQlxz2YJxpwXBh2UDOSUeGXLqMDuMiNpDxLxKsogBdMce0+HFPxSaIhjPCnxjMqUedbOgsa6QYlxNWt+JmpOsyGpcJ 3 | -------------------------------------------------------------------------------- /ctrip_funcs.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | import PyV8 4 | import re 5 | from selenium import webdriver 6 | import time 7 | import os 8 | import pandas as pd 9 | 10 | JS_PATH = '/Users/apple/Documents/ctrip_spider/js/' 11 | COOKIE_PATH = '/Users/apple/Documents/ctrip_spider/cookie/' 12 | 13 | def usere(regex, getcontent): #regex 14 | pattern = re.compile(regex) 15 | content = re.findall(pattern, getcontent) 16 | return content 17 | 18 | def modify(string): 19 | ''' 20 | add escape to special signs 21 | ''' 22 | 
string = string.replace('(', '\(') 23 | string = string.replace(')', '\)') 24 | return string 25 | 26 | def get_city(hotel_url): 27 | ''' 28 | Parameters 29 | ---------- 30 | hotel_url: string 31 | url of the hotel brand 32 | Return 33 | ------ 34 | city: list 35 | city id list of the hotel brand 36 | ''' 37 | html = requests.get(hotel_url).content.decode('utf8') 38 | selector = etree.HTML(html) 39 | regex = '

([\s\S]*?)

' 40 | city = usere(regex, html)[0] 41 | regex2 = 'hotel/(.+?)/' 42 | city = usere(regex2, city) 43 | return city 44 | 45 | def get_max_page(city_hotel_url): 46 | ''' 47 | Parameters 48 | ---------- 49 | city_hotel_url: string 50 | url of the hotel in the specific city 51 | Return 52 | ------ 53 | max_page: int 54 | the maximum page of the hotel in the specific city 55 | ''' 56 | html = requests.get(city_hotel_url).content.decode('utf8') 57 | regex = 'layoutfix([\s\S]*)下一页' 58 | pages = usere(regex, html)[0] 59 | regex = 'data-value="(\d+?)"' 60 | pages = usere(regex, pages) 61 | if pages == []: 62 | max_page = 1 63 | else: 64 | max_page = int(pages[-1]) 65 | return max_page 66 | 67 | def get_hotel_info(page_url): 68 | ''' 69 | Parameters 70 | ---------- 71 | page_url: string 72 | url of a single page 73 | 74 | Return 75 | ------ 76 | infodict: dict 77 | brief info of the hotels in that single page 78 | ''' 79 | infodict = {} 80 | infodict.setdefault('title', []) 81 | infodict.setdefault('id', []) 82 | infodict.setdefault('score', []) 83 | #title: hotel name || id: hotel id(further used in detail info) || score: customers' score 84 | titlelist = [] 85 | idlist = [] 86 | scorelist = [] 87 | html = requests.get(page_url).content.decode('utf8') 88 | regex = '