├── .gitignore
├── README.md
├── hotel_ids
│   ├── ids_empty.txt
│   ├── ids_got.txt
│   └── ids_total.txt
├── main.py
├── requirements.txt
├── sentiment_ctrip.marshal.3
├── src
│   ├── __init__.py
│   ├── config.py
│   ├── get_hotels_id.py
│   ├── get_proxy.py
│   ├── save_comment.py
│   ├── save_comment_selenium.py
│   ├── sentiment_analysis.py
│   ├── set_record.py
│   └── set_sentiment_score.py
└── 流程图.png

/.gitignore:
--------------------------------------------------------------------------------
.idea/
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# What it does
Crawls hotel reviews from Ctrip and runs sentiment analysis on each review.
***
# Directory layout
```
.
├── hotel_ids
│   ├── ids_empty.txt  # hotel ids with no review data
│   ├── ids_got.txt    # hotel ids already crawled
│   └── ids_total.txt  # all hotel ids
├── log                # log directory; each run writes a log file here
├── main.py            # entry point
├── README.md
├── requirements.txt   # dependencies
└── src                # source code
    ├── config.py                 # MongoDB, log paths, etc.
    ├── get_hotels_id.py          # fetch hotel ids
    ├── get_proxy.py              # scrape free proxies from [xici](http://www.xicidaili.com/)
    ├── __init__.py
    ├── save_comment.py           # crawl review data
    ├── save_comment_selenium.py  # crawl with a simulated browser
    ├── sentiment_analysis.py     # thin wrapper around [snownlp](https://github.com/isnowfy/snownlp)
    ├── set_record.py             # write a batch record when a run finishes
    └── set_sentiment_score.py    # compute a sentiment score for each review
```
***
# Flow chart
![flow chart](流程图.png)
***
# Run
```
python main.py
```
--------------------------------------------------------------------------------
/hotel_ids/ids_empty.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/hotel_ids/ids_empty.txt
--------------------------------------------------------------------------------
/hotel_ids/ids_got.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/hotel_ids/ids_got.txt
--------------------------------------------------------------------------------
/hotel_ids/ids_total.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/hotel_ids/ids_total.txt
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import time
from src.get_proxy import ProxyGetting
# from src.save_comment import CtripComment
from src.save_comment_selenium import CtripComment
from src.get_hotels_id import HotelIdGetting
from src.set_sentiment_score import SentimentScoreSetting
from src.set_record import RecordSetting


def start():
    HotelIdGetting().get_hids_remain()
    start_time_stamp = time.time()
    proxy, comment, score = [], [], []
    for i in range(1):
        p = ProxyGetting()
        proxy.append(p)
        print('starting proxy thread')
        p.start()

    time.sleep(60)  # give the proxy thread time to fill the pool

    for j in range(4):
        c = CtripComment()
        comment.append(c)
        print('starting crawler thread {}'.format(j))
        c.start()
        time.sleep(5)

    time.sleep(300)
    for k in range(4):
        s = SentimentScoreSetting()
        score.append(s)
        s.start()

    for i in proxy:
        i.join()
        print('proxy thread exited')

    for j in comment:
        j.join()
        print('crawler thread exited')

    for k in score:
        k.join()
        print('analysis thread exited')

    end_time_stamp = time.time()
    HotelIdGetting().ids_file_del()  # every id has been crawled; clear the crawled-id lists so the next run starts fresh
    RecordSetting().set_record(start_time_stamp, end_time_stamp)  # insert the batch record


if __name__ == '__main__':
    start()
--------------------------------------------------------------------------------
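main.py wires the three thread groups together through two shared queues: `ids_remain` (filled by src/get_hotels_id.py) and `ids_got` (filled by the crawler, drained by the analysis threads). A minimal sketch of that producer/consumer shape — `crawl()` and `analyse()` are invented stand-ins for the real `CtripComment` and `SentimentScoreSetting` thread classes:

```python
# Sketch only: crawl()/analyse() are placeholders for the real thread classes.
import queue
import threading

ids_remain = queue.Queue()  # hotel ids waiting to be crawled
ids_got = queue.Queue()     # hotel ids crawled successfully

def crawl():
    while not ids_remain.empty():
        hotel_id = ids_remain.get()
        # ... fetch and store the reviews for hotel_id ...
        ids_got.put(hotel_id)

def analyse():
    while True:
        try:
            hotel_id = ids_got.get(timeout=60)
        except queue.Empty:
            break  # crawlers are done and the queue stayed empty
        # ... score the stored reviews for hotel_id ...

for hid in ('429541', '433280'):
    ids_remain.put(hid)
threads = [threading.Thread(target=f) for f in (crawl, crawl, analyse)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```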
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.5.1
bs4==0.0.1
lxml  # parser requested by BeautifulSoup(..., 'lxml') in get_proxy.py and save_comment_selenium.py
pymongo==3.3.0
requests==2.11.1
selenium==2.53.6
snownlp==0.12.3
tqdm==4.8.4
--------------------------------------------------------------------------------
/sentiment_ctrip.marshal.3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/sentiment_ctrip.marshal.3
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/src/__init__.py
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import datetime
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
ctrip_comment = client['ctrip_comment']

'''one review document: hotel_id, comment_id, comment_date, comment_text, score, sentiment_score'''
comment_detail = ctrip_comment['comment_detail']

'''summary of one hotel's reviews: hotel_id, comment_num, available_comment_num, score, recommend_rate, sentiment_score, deadline'''
comment_basic = ctrip_comment['comment_basic']

'''batch records'''
comment_batch = ctrip_comment['comment_batch']
# comment_batch = client['ctrip_0811']['orderlist']

log_file = 'log/log_{}.txt'.format(str(datetime.date.today()))
ids_total_file = 'hotel_ids/ids_total.txt'  # all ids
ids_got_file = 'hotel_ids/ids_got.txt'      # ids crawled successfully
ids_empty_file = 'hotel_ids/ids_empty.txt'  # ids with no review data
--------------------------------------------------------------------------------
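The docstrings above spell out the document shapes of the three collections. For illustration, a round-trip against `comment_detail` — every field value below is invented:

```python
# Illustration only: invented values, field names from the docstrings in src/config.py.
from src.config import comment_detail

comment_detail.insert_one({
    'hotel_id': '429541',
    'comment_id': '42954112345678',
    'comment_text': 'Clean room, friendly staff.',
    'comment_score': '4.5',
    'comment_date': '2016-08-01',
    # 'sentiment_score' is filled in later by src/set_sentiment_score.py
})
newest = comment_detail.find_one({'hotel_id': '429541'},
                                 sort=[('comment_date', -1)])
print(newest['comment_date'])
```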
/src/get_hotels_id.py:
--------------------------------------------------------------------------------
import requests
from queue import Queue
from .config import log_file, ids_total_file, ids_got_file, ids_empty_file

ids_remain = Queue()  # ids waiting to be crawled

'''fetch hotel ids'''


class HotelIdGetting:
    def __init__(self):
        pass

    '''fetch hotel ids from the server'''
    def get_hids_new(self):
        url = ''  # the id-service endpoint is left blank in the repository
        try:
            response = requests.get(url, timeout=300)
        except Exception:
            print('failed to fetch hotel ids')
            with open(log_file, 'a+') as f:
                f.write('failed to fetch hotel ids\n')
        else:
            ids = response.json()['hids']
            ids = set([id['hotelid'].strip() for id in ids])
            with open(ids_total_file, 'r+') as f:  # all ids
                ids_total = f.readlines()
                ids_total = set(id.strip() for id in ids_total)
                ids_new = ids - ids_total
                if ids_new:
                    for id_new in ids_new:
                        f.write(id_new + '\n')
                    print('{} new hotel ids'.format(len(ids_new)))
                    with open(log_file, 'a+') as f:
                        f.write('{} new hotel ids\n'.format(len(ids_new)))

    def get_hids_remain(self):
        self.get_hids_new()
        with open(ids_total_file, 'r') as f:  # all ids
            ids_total = f.readlines()
            ids_total = set(id.strip() for id in ids_total)
        with open(ids_got_file, 'r') as f:  # ids already crawled
            ids_got = f.readlines()
            ids_got = set(id.strip() for id in ids_got)
        with open(ids_empty_file, 'r') as f:  # ids with no review data
            ids_empty = f.readlines()
            ids_empty = set(id.strip() for id in ids_empty)
        ids = ids_total - ids_got - ids_empty  # ids still to crawl
        for id in ids:
            ids_remain.put({'hotel_id': id, 'start_page': 1})
        with open(log_file, 'a+') as f:
            f.write('{} ids in total: {} crawled, {} still to crawl\n'.format(len(ids_total), len(ids_got), ids_remain.qsize()))
        print('{} ids in total: {} crawled, {} still to crawl\n'.format(len(ids_total), len(ids_got), ids_remain.qsize()))
        return ids_remain

    '''once everything has been crawled, clear the crawled-id files so the next run starts from scratch'''
    def ids_file_del(self):
        print('clearing the crawled-id list and the empty-id list')
        with open(ids_got_file, 'w') as f:
            f.truncate()
        with open(ids_empty_file, 'w') as f:
            f.truncate()


if __name__ == '__main__':
    get_ids = HotelIdGetting()
    get_ids.get_hids_remain()
--------------------------------------------------------------------------------
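Entries on `ids_remain` are dicts rather than bare id strings, so a failed hotel can be requeued with its resume state (`handle_error` in src/save_comment.py adds `start_page` and `deadline`). With invented values:

```python
from src.get_hotels_id import ids_remain

ids_remain.put({'hotel_id': '429541', 'start_page': 1})  # fresh id
ids_remain.put({'hotel_id': '433280', 'start_page': 7,   # requeued after failing on page 7,
                'deadline': '2016-02-15'})                # keeping the already-resolved cutoff date
record = ids_remain.get()
print(record['hotel_id'], record['start_page'], record.get('deadline'))
```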
/src/get_proxy.py:
--------------------------------------------------------------------------------
import requests
import requests.adapters
import time
from threading import Thread
from queue import Queue
from bs4 import BeautifulSoup
from .get_hotels_id import ids_remain

proxy_list = Queue()


class ProxyGetting(Thread):
    def __init__(self):
        self.session = requests.session()
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        self.session.mount('https://', adapter)
        self.session.mount('http://', adapter)
        Thread.__init__(self)

    def get_xici(self, page=1):
        url = 'http://www.xicidaili.com/nn/{}'.format(page)
        headers_xici = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.xicidaili.com',
            'Upgrade-Insecure-Requests': '1',  # header values must be strings, not ints
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'
        }
        print('crawling page {}'.format(page))
        response = self.session.get(url, headers=headers_xici)
        soup = BeautifulSoup(response.text, 'lxml')
        ip_list = soup.find('table', id='ip_list').select('tr')[1:]
        for i in ip_list:
            ip = i.select('td')[1].text.lower()
            port = i.select('td')[2].text
            proxy_type = i.select('td')[4].text
            protocol = i.select('td')[5].text.lower()
            print(ip, port, proxy_type, protocol)
            proxies = {protocol: protocol + '://' + ip + ':' + str(port)}
            proxy_list.put(proxies)

    def get_kuaidaili(self, page=1):
        s = requests.session()
        url = 'http://www.kuaidaili.com/free/inha/{}/'.format(page)
        headers_kuaidaili = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.kuaidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'
        }
        print('crawling page {}'.format(page))
        response = s.get(url, headers=headers_kuaidaili)
        soup = BeautifulSoup(response.text, 'lxml')
        ip_list = soup.table.select('tr')[1:]
        for i in ip_list:
            ip = i.select('td')[0].text.lower()
            port = i.select('td')[1].text
            proxy_type = i.select('td')[2].text
            protocol = i.select('td')[3].text.lower()
            print(ip, port, proxy_type, protocol)
            proxies = {protocol: protocol + '://' + ip + ':' + str(port)}
            proxy_list.put(proxies)

    '''take one proxy off proxy_list'''
    def get_one_proxy(self):
        if not proxy_list.empty():
            return proxy_list.get()
        return None  # None means "no proxy": requests then connects directly

    def run(self):
        while True:
            if not ids_remain.empty():
                if proxy_list.qsize() < 10:
                    self.get_xici()
                else:
                    time.sleep(10)
            else:
                time.sleep(300)  # failed ids may still be requeued, so wait 300s before exiting
                break


if __name__ == '__main__':
    proxy_get = ProxyGetting()
    proxy_get.get_xici()
    print(proxy_get.get_one_proxy())
    print(type(proxy_get.get_one_proxy()))
    # proxy_get.run()
--------------------------------------------------------------------------------
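Each entry on `proxy_list` is a one-key dict mapping protocol to proxy URL — exactly the shape `requests` takes for its `proxies` argument. A standalone check (the proxy address is invented):

```python
import requests

proxies = {'http': 'http://123.57.190.51:8080'}  # invented address, same shape as proxy_list entries
response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
print(response.json())  # should report the proxy's address, not yours
```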
/src/save_comment.py:
--------------------------------------------------------------------------------
import requests
import requests.adapters
import json
import datetime
import time
import pymongo
from tqdm import *
from threading import Thread
from queue import Queue
from .config import comment_basic, comment_detail, log_file, ids_got_file, ids_empty_file
from .get_proxy import ProxyGetting, proxy_list
from .get_hotels_id import ids_remain

MAX_RETRIES = 3
ids_empty = set()  # ids with no review data
ids_got = Queue()  # ids crawled successfully


class CtripComment(Thread):

    def __init__(self, hotel_id=''):
        Thread.__init__(self)
        self.hotel_id = hotel_id
        self.start_page = 1
        self.proxy = ProxyGetting().get_one_proxy()
        self.session = requests.session()
        adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)
        self.session.mount('https://', adapter)
        self.session.mount('http://', adapter)

    def get_data(self, page):
        url = "http://m.ctrip.com/restapi/soa2/10935/hotel/booking/commentgroupsearch?_fxpcqlniredt=09031020210316541274"
        headers = {
            'Host': 'm.ctrip.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
            'Accept': 'application/json',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate, br',
            'X-Requested-With': 'XMLHttpRequest',
            'Content-Type': 'application/json',
            'Referer': 'http://m.ctrip.com/webapp/hotel/hoteldetail/dianping/{}.html?roomtype=&opr=&fr=detail&daylater=0&days=1'.format(self.hotel_id),
            'Connection': 'keep-alive'
        }
        params = {"flag":1,"id":self.hotel_id,"htype":1,"sort":{"idx":1,"size":10,"sort":1,"ord":1},"search":{"kword":"","gtype":4,"opr":0,"ctrl":14,"filters":[]},"alliance":{"aid":"66672","sid":"508668","ouid":"","ishybrid":0},"Key":"f83e59228064b4ede9b33bc4325eb3d9","head":{"cid":"09031020210316541274","ctok":"","cver":"1.0","lang":"01","sid":"8888","syscode":"09","auth":"","extension":[{"name":"pageid","value":"228032"},{"name":"webp","value":0},{"name":"referrer","value":""},{"name":"protocal","value":"https"}]},"contentType":"json"}
        print('using proxy:', self.proxy)
        params['sort']['idx'] = page  # set the page number
        try:
            response = self.session.post(url, data=json.dumps(params), headers=headers, proxies=self.proxy, timeout=60, stream=False)
            print(response.status_code)
        except Exception as e:
            print('error:', e)
            self.handle_error(page)
            return None
        else:
            if response.status_code != 200:
                self.handle_error(page)
                return None
            response = response.json()
            if response['rc'] != 200:
                self.handle_error(page)
                return None
            if not response.get('hcsi'):
                self.handle_error(page)
                return None
            if int(response['hcsi']['total']) > 0:  # compare the count numerically, not as a string
                return response
            print('{} has no review data'.format(self.hotel_id))
            ids_empty.add(self.hotel_id)
            with open(ids_empty_file, 'a+') as f:
                f.write(self.hotel_id + '\n')
            with open(log_file, 'a+') as f:
                f.write('{} has no review data, current proxy {}\n'.format(self.hotel_id, self.proxy))
            return None
    def get_comment(self):
        page = pages = self.start_page
        start_time = time.time()
        while page <= pages:
            response = self.get_data(page)
            if response:
                if page == pages:  # first successful page of this run: read the summary info
                    total_pages = response['groups'][0]['pages']
                    count = response['groups'][0]['count']
                    score = response['hcsi']['avgpts']['all']
                    recommend_rate = response['hcsi']['recmd']
                    pages = total_pages
                    comment_basic.find_one_and_update(
                        {'hotel_id': self.hotel_id},
                        {'$set': {'score': score, 'recommend_rate': recommend_rate, 'comment_total': count}},
                        upsert=True
                    )  # the summary may change between runs, hence find_one_and_update with upsert
                    if not comment_detail.find({'hotel_id': self.hotel_id}).count():
                        self.deadline = str(datetime.date.today() - datetime.timedelta(days=180))  # first crawl of this hotel: cut off at six months ago
                    else:
                        self.deadline = comment_detail.find_one(
                            {'hotel_id': self.hotel_id},
                            sort=[('comment_date', pymongo.DESCENDING)]
                        )['comment_date']  # otherwise cut off at the newest review date from the previous crawl
                    print('deadline:', self.deadline)
                    print('{} has {} reviews on {} pages, score {}, recommend rate {}'.format(self.hotel_id, count, pages, score, recommend_rate))
                print('{} has {} reviews on {} pages, now crawling page {}'.format(self.hotel_id, count, pages, page))
                with open(log_file, 'a+') as f:
                    f.write('{} has {} reviews on {} pages, now crawling page {}, current proxy {}\n'.format(self.hotel_id, count, pages, page, self.proxy))
                comments = response['groups'][0]['comments']
                comment_date = ''
                for comment in comments:
                    comment_score = comment['rats']['all']
                    comment_date = comment['date'].strip().split(' ')[0]
                    comment_id = self.hotel_id + str(comment['comid'])
                    comment_text = comment['text'].strip()
                    comment_dict = {
                        'hotel_id': self.hotel_id,
                        'comment_id': comment_id,
                        'comment_text': comment_text,
                        'comment_score': comment_score,
                        'comment_date': comment_date
                    }
                    if comment_date > self.deadline:
                        if comment_detail.find({'comment_id': comment_id}).count():
                            print('review already stored')
                        else:
                            comment_detail.insert_one(comment_dict)
                    else:
                        print(comment_date)
                        print('reached the cutoff date, {} crawled successfully!'.format(self.hotel_id))
                        ids_got.put(self.hotel_id)  # successfully crawled ids go into ids_got
                        with open(ids_got_file, 'a+') as f:
                            f.write(self.hotel_id + '\n')
                        with open(log_file, 'a+') as f:
                            f.write('{} crawled successfully!\n'.format(self.hotel_id))
                        break
                if comment_date <= self.deadline:
                    break
                page += 1
            else:
                print('{} crawl failed'.format(self.hotel_id))
                break
        end_time = time.time()
        print('{} took {:.1f}s'.format(self.hotel_id, end_time - start_time))

    def handle_error(self, page):
        if page == 1:
            ids_remain.put({'hotel_id': self.hotel_id, 'start_page': page})  # failed ids go back into the queue
        else:
            ids_remain.put({'hotel_id': self.hotel_id, 'start_page': page, 'deadline': self.deadline})  # keep the resolved cutoff so the retry can resume mid-hotel
        self.proxy = ProxyGetting().get_one_proxy()  # switch to a fresh proxy

    def run(self):
        start_time = time.time()
        while not ids_remain.empty():
            record = ids_remain.get()
            self.hotel_id = record['hotel_id']
            self.start_page = record['start_page']
            self.deadline = record.get('deadline')
            ids_remain.task_done()
            print('crawling hotel {} from page {}, {} hotels remaining'.format(self.hotel_id, self.start_page, ids_remain.qsize()))
            with open(log_file, 'a+') as f:
                f.write('crawling hotel {} from page {}, {} hotels remaining\n'.format(self.hotel_id, self.start_page, ids_remain.qsize()))
            self.get_comment()
            print('{} ids left in the queue'.format(ids_remain.qsize()))
        end_time = time.time()
        print('all hotels crawled, took:', end_time - start_time)


def print_run_time(func):
    """decorator: print the run time"""
    def wrapper(*args, **kw):
        start_time = time.time()
        func(*args, **kw)
        print('run time is {:.2f}'.format(time.time() - start_time))
    return wrapper


@print_run_time
def start():
    comment = []
    for j in range(1):
        c = CtripComment()
        comment.append(c)
        c.start()

    for j in comment:
        j.join()


if __name__ == '__main__':
    ctrip_comment = CtripComment()
    ctrip_comment.hotel_id = '429541'
    ctrip_comment.get_comment()
    print(ids_empty)
--------------------------------------------------------------------------------
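The crawler walks reviews newest-first and stops at `self.deadline`, so a rerun only fetches what appeared since the previous crawl. The cutoff rule in isolation, with invented dates:

```python
import datetime

today = datetime.date.today()
stored_newest = None  # newest comment_date already stored for this hotel, if any
deadline = stored_newest or str(today - datetime.timedelta(days=180))

# Pages arrive newest-first; ISO date strings compare correctly as plain strings.
for comment_date in (str(today - datetime.timedelta(days=d)) for d in (3, 40, 200)):
    if comment_date > deadline:
        print('store', comment_date)
    else:
        print('cutoff reached at', comment_date)  # everything older is already stored
        break
```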
/src/save_comment_selenium.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import time
import datetime
import re
import random
import codecs
import pymongo
from tqdm import *
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.proxy import *
from bs4 import BeautifulSoup
from threading import Thread
from queue import Queue
from .config import comment_basic, comment_detail, log_file, ids_got_file, ids_empty_file
from .get_proxy import ProxyGetting, proxy_list
from .get_hotels_id import ids_remain

ids_empty = set()  # ids with no review data
ids_got = Queue()  # ids crawled successfully
n = 0              # hotels processed so far, shared by all crawler threads


# fetch reviews from Ctrip and save them to the database
class CtripComment(Thread):

    def __init__(self, hotel_id=''):
        self.hotel_id = hotel_id
        self.start_page = 1
        self.proxy = ProxyGetting().get_one_proxy()
        Thread.__init__(self)

    # crawl up to six months of a hotel's reviews, good and bad alike
    def save_comments_all_pages(self, page=1):
        start = time.time()
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
        )  # only used by the (commented-out) PhantomJS driver below
        print('using proxy:', self.proxy)
        if self.proxy:
            p = ''.join(self.proxy.values()).split('/')[-1]  # strip the scheme, keep host:port
            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': p,
                'noProxy': ''
            })
            driver = webdriver.Firefox(proxy=proxy)
        else:
            driver = webdriver.Firefox()  # no proxy available yet: connect directly
        # driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(30)
        url = 'http://hotels.ctrip.com/hotel/dianping/{}.html'.format(self.hotel_id)
        try:
            driver.get(url)
            time.sleep(3)
        except Exception as e:
            print('error:', e)
            with open(log_file, 'a+') as f:
                f.write('{} failed to open, will retry later!\n'.format(self.hotel_id))
            ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})  # requeue in the record format run() expects
        else:
            print(driver.current_url)

            if driver.current_url != url or '此酒店暂无点评' in driver.page_source:
                print('this hotel has no review data!')
                with open(log_file, 'a+') as f:
                    f.write('{} has no review data!\n'.format(self.hotel_id))
                with open(ids_empty_file, 'a+') as f:
                    f.write(self.hotel_id + '\n')
                ids_empty.add(self.hotel_id)
            else:
                if u'您访问的太快了, 休息一下吧。 或者输入验证码继续访问' in driver.page_source:
                    print('blocked for requesting too fast, requeueing')
                    ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})
                else:
                    try:
                        select = Select(driver.find_element_by_class_name('select_sort'))
                        time.sleep(1)
                        select.select_by_value('1')  # use the dropdown to sort reviews by time
                        time.sleep(3)
                    except Exception as e:
                        print('error:', e)
                        ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})
                        with open(log_file, 'a+') as f:
                            f.write("{}'s reviews cannot be sorted by time, will retry later!\n".format(self.hotel_id))
                    else:
                        last_page_comment = ''
                        while True:
                            soup = BeautifulSoup(driver.page_source, 'lxml')
                            if page == 1:
                                score = soup.find('div', class_="comment_total_score").find('span', class_='score').span.text.strip()  # rating
                                recommend_rate = soup.find('span', class_='rec').span.text.strip('%')  # recommend rate
                                comment_total_tmp = soup.find('span', id='All_Comment').text.strip()  # number of reviews
                                comment_total = re.search('\d+', comment_total_tmp).group(0)
                                print('hotel {} has {} reviews, score {}, recommend rate {}'.format(self.hotel_id, comment_total, score, recommend_rate))
                                comment_basic.find_one_and_update(
                                    {'hotel_id': self.hotel_id},
                                    {'$set': {'score': score, 'recommend_rate': recommend_rate, 'comment_total': comment_total}},
                                    upsert=True
                                )  # the summary may change between runs, hence find_one_and_update with upsert
                                if not comment_detail.find({'hotel_id': self.hotel_id}).count():
                                    self.deadline = str(datetime.date.today() - datetime.timedelta(days=180))  # first crawl of this hotel: cut off at six months ago
                                else:
                                    self.deadline = comment_detail.find_one(
                                        {'hotel_id': self.hotel_id},
                                        sort=[('comment_date', pymongo.DESCENDING)]
                                    )['comment_date']  # otherwise cut off at the newest review date from the previous crawl
                                print('deadline:', self.deadline)
                            print('crawling page {} of hotel {}'.format(page, self.hotel_id))
                            with open(log_file, 'a+') as f:
                                f.write('crawling page {} of hotel {}\n'.format(page, self.hotel_id))
                            comments = soup.findAll('div', {'class': 'comment_block J_asyncCmt'})
                            comment_date = ''
                            if comments == last_page_comment:
                                print('same data as the previous page, skipping!')
                            else:
                                last_page_comment = comments
                                comment_num = len(comments)
                                print('page {} of hotel {} has {} reviews'.format(page, self.hotel_id, comment_num))
                                for i in comments:
                                    comment_text = i.find('div', class_="J_commentDetail").text.replace('\n', ' ')  # flatten multi-line reviews into one line
                                    comment_date = i.select('span[class="time"]')[0].text
                                    comment_date = re.sub("[\u4e00-\u9fa5()]+", '', comment_date)
                                    try:
                                        comment_score = i.select('span[class="score"]')[0].span.text.strip()
                                    except Exception:
                                        comment_score = ''
                                    if comment_date > self.deadline:  # keep only reviews newer than the cutoff
                                        comment_dict = {
                                            'hotel_id': self.hotel_id,
                                            'comment_text': comment_text,
                                            'comment_score': comment_score,
                                            'comment_date': comment_date
                                        }
                                        print(comment_dict)
                                        comment_detail.insert_one(comment_dict)
                                    else:
                                        print(comment_date, comment_text)
                                        print('reached the cutoff date!')
                                        ids_got.put(self.hotel_id)  # successfully crawled ids go into ids_got
                                        with open(log_file, 'a+') as f:
                                            f.write('{} crawled successfully!\n'.format(self.hotel_id))
                                        with open(ids_got_file, 'a+') as f:
                                            f.write(self.hotel_id + '\n')
                                        break  # leave the for loop first
                                if comment_date <= self.deadline:
                                    break
                            '''look for the next page'''
                            try:
                                next = driver.find_element_by_class_name('c_down')
                            except Exception as e:
                                print('error:', e)
                                ids_got.put(self.hotel_id)  # no next-page button: the last page has been reached
                                with open(log_file, 'a+') as f:
                                    f.write('{} crawled successfully!\n'.format(self.hotel_id))
                                with open(ids_got_file, 'a+') as f:
                                    f.write(self.hotel_id + '\n')
                                break
                            else:
                                next_page = int(soup.find('a', class_='c_down')['value'])  # compare page numbers as ints, not str vs int
                                if page == next_page:  # the last click did not take effect: click again and wait longer
                                    print('did not advance to page {}, still on page {}'.format(page, page - 1))
                                    try:
                                        next.click()
                                    except Exception as e:
                                        print('error:', e)
                                        comment_detail.remove({'hotel_id': self.hotel_id, 'comment_date': {'$gt': self.deadline}})  # paging failed: drop only this hotel's data from this round and retry later
                                        ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})
                                        break
                                else:
                                    page = next_page
                                    try:
                                        next.click()
                                    except Exception as e:
                                        print('error:', e)
                                        comment_detail.remove({'hotel_id': self.hotel_id, 'comment_date': {'$gt': self.deadline}})  # paging failed: drop only this hotel's data from this round and retry later
                                        ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})
                                        break
                                time.sleep(5)

        driver.quit()
        print('hotel {} took:'.format(self.hotel_id), time.time() - start)

    def run(self):
        global n
        start_time_stamp = time.time()
        while not ids_remain.empty():
            n += 1
            record = ids_remain.get()
            self.hotel_id = record['hotel_id']
            ids_remain.task_done()
            print('crawling hotel #{} (id {}), {} hotels remaining'.format(n, self.hotel_id, ids_remain.qsize()))
            with open(log_file, 'a+') as f:
                f.write('crawling hotel {}, {} hotels remaining\n'.format(self.hotel_id, ids_remain.qsize()))
            self.save_comments_all_pages()
            if n % 50 == 0:
                time.sleep(300)  # take a long break every 50 hotels to avoid the rate limit
            else:
                time.sleep(random.random() * 10)
        end_time_stamp = time.time()
        print('all hotels crawled, took:', end_time_stamp - start_time_stamp)


def print_run_time(func):
    """decorator: print the run time"""
    def wrapper(*args, **kw):
        start_time = time.time()
        func(*args, **kw)
        print('run time is {:.2f}'.format(time.time() - start_time))
    return wrapper


@print_run_time
def start():
    producers = []
    for i in range(2):
        p = CtripComment()
        producers.append(p)
        p.start()
        time.sleep(10)

    for i in producers:
        i.join()


if __name__ == '__main__':
    ctrip_comment = CtripComment()
    ctrip_comment.save_comments_all_pages()
    print(ids_empty)
--------------------------------------------------------------------------------
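The `dcap` capabilities dict built at the top of `save_comments_all_pages` is only consumed by the PhantomJS path, which is commented out; the Firefox driver ignores it. If you switch back to PhantomJS, it would presumably be passed like this (an untested sketch against selenium 2.x):

```python
# Hypothetical: headless PhantomJS with the mobile user agent from dcap.
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.set_page_load_timeout(30)
```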
/src/sentiment_analysis.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from snownlp import SnowNLP


# thin wrapper around snownlp's features
class SentimentAnalyse:

    def __init__(self, comment):
        self.s = SnowNLP(comment)

    def get_comment_label(self):
        pass

    def get_sentiment_score(self):
        return self.s.sentiments

    def get_sentences(self):
        return self.s.sentences

    def get_words(self):
        return self.s.words
--------------------------------------------------------------------------------
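`sentiment_ctrip.marshal.3` at the repository root looks like a snownlp sentiment model retrained on hotel reviews (snownlp appends `.3` when saving under Python 3), but nothing in src/ loads it, so the guess is that it replaces snownlp's bundled model by hand. A sketch of how such a model is produced and used with snownlp's own API — `neg.txt`/`pos.txt` are hypothetical files of labelled reviews, one per line:

```python
from snownlp import sentiment, SnowNLP

sentiment.train('neg.txt', 'pos.txt')        # hypothetical labelled corpora
sentiment.save('sentiment_ctrip.marshal')    # writes sentiment_ctrip.marshal.3 on Python 3

sentiment.load('sentiment_ctrip.marshal')    # picks the '.3' file back up on Python 3
print(SnowNLP('房间很干净,服务也不错').sentiments)  # scored by the retrained model
```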
/src/set_record.py:
--------------------------------------------------------------------------------
import time
import hashlib
from .config import comment_batch, log_file
from .save_comment import ids_empty  # NB: save_comment_selenium keeps its own, separate ids_empty set


class RecordSetting:
    def __init__(self):
        pass

    def set_record(self, start_time_stamp, end_time_stamp):
        record = {
            'orderid': hashlib.md5(str(start_time_stamp).encode('utf8')).hexdigest(),  # md5 of the start time
            'tag': 1,
            'inserttime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time_stamp)),
            'type': 'S',
            'ispull': 0,
            'endtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time_stamp)),
            'day': time.strftime('%Y-%m-%d', time.localtime(start_time_stamp))
        }
        comment_batch.insert_one(record)
        print('all hotels crawled, took:', end_time_stamp - start_time_stamp)
        with open(log_file, 'a+') as f:
            f.write('all hotels crawled, took: {}\n'.format(str(end_time_stamp - start_time_stamp)))
            f.write('hotels with no review data: {}\n'.format(ids_empty))
--------------------------------------------------------------------------------
/src/set_sentiment_score.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import time
import datetime
import re
from .config import comment_basic, comment_detail, log_file
from .sentiment_analysis import SentimentAnalyse
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from .save_comment import ids_got  # NB: save_comment_selenium keeps its own, separate ids_got queue
from .get_hotels_id import ids_remain

pattern_zh = re.compile(u'[\u4e00-\u9fa5]+')


class SentimentScoreSetting(Thread):

    def __init__(self):
        Thread.__init__(self)

    # store a sentiment score on a single review
    def set_sentiment_score(self, _id, score):
        comment_detail.update(
            {'_id': _id},
            {'$set': {'sentiment_score': score}},
        )

    # store the average sentiment score over a hotel's reviews
    def set_sentiment_score_average(self, hotel_id, sum_score, comment_num):
        score_avg = sum_score / comment_num
        score_avg = round(score_avg * 5, 1)  # convert to a five-point scale, rounded to one decimal place
        comment_basic.update(
            {'hotel_id': hotel_id},
            {'$set': {'sentiment_score_avg': score_avg}},
        )

    def get_sentiment_score(self, comment):
        sentiment_analysis = SentimentAnalyse(comment)
        return sentiment_analysis.get_sentiment_score()

    def main(self, hotel_id):
        today = datetime.date.today()
        deadline = str(today - datetime.timedelta(days=180))  # cutoff: six months ago
        result = comment_detail.find({'hotel_id': hotel_id, 'comment_date': {'$gte': deadline}, 'comment_text': {'$regex': u'[\u4e00-\u9fa5]'}})  # reviews from the last six months that contain Chinese text
        if result.count():
            sum_score = 0
            for res in result:
                _id, comment = res['_id'], re.sub('\n+', '', res['comment_text'])
                sentiment_score = res.get('sentiment_score')
                if not sentiment_score:  # only score reviews that do not have a sentiment value yet
                    sentiment_score = self.get_sentiment_score(comment)
                sum_score += sentiment_score
                self.set_sentiment_score(_id, sentiment_score)
            if result.count() > 10:  # with 10 reviews or fewer there is too little data to average
                self.set_sentiment_score_average(hotel_id, sum_score, result.count())
                with open(log_file, 'a+') as f:
                    f.write('{} has {} reviews, {} were given a sentiment score, average {}\n'.format(hotel_id, comment_detail.count({'hotel_id': hotel_id}), result.count(), sum_score / result.count()))
                print('{} has {} reviews, {} were given a sentiment score, average {}'.format(hotel_id, comment_detail.count({'hotel_id': hotel_id}), result.count(), sum_score / result.count()))
            else:
                print('{} has too few reviews, skipping the average score'.format(hotel_id))
                with open(log_file, 'a+') as f:
                    f.write('{} has too few reviews, skipping the average score\n'.format(hotel_id))
        else:
            print('no matching reviews found for {}!'.format(hotel_id))

    def run(self):
        while True:
            if not ids_got.empty():
                hotel_id = ids_got.get()
                self.main(hotel_id)
            else:
                if not ids_remain.empty():
                    print('queue empty, analysis thread waiting......')
                    time.sleep(300)
                else:
                    break
        print('ids_got is empty; every crawled id has been analysed')


def start():
    sentiments = []
    for i in range(2):
        p = SentimentScoreSetting()
        sentiments.append(p)
        p.start()

    for i in sentiments:
        i.join()


if __name__ == '__main__':
    sentiment_setting = SentimentScoreSetting()
    hotel_ids = comment_detail.distinct('hotel_id')  # hotel ids already crawled
    print('{} hotels in total'.format(len(hotel_ids)))
    pool = ThreadPool(processes=4)
    start_time = time.time()
    pool.map(sentiment_setting.main, hotel_ids)
    pool.close()
    pool.join()
    print('every crawled id has been analysed')
    print('took:', time.time() - start_time)
--------------------------------------------------------------------------------
/流程图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/流程图.png
--------------------------------------------------------------------------------