├── .gitignore
├── README.md
├── hotel_ids
│   ├── ids_empty.txt
│   ├── ids_got.txt
│   └── ids_total.txt
├── main.py
├── requirements.txt
├── sentiment_ctrip.marshal.3
├── src
│   ├── __init__.py
│   ├── config.py
│   ├── get_hotels_id.py
│   ├── get_proxy.py
│   ├── save_comment.py
│   ├── save_comment_selenium.py
│   ├── sentiment_analysis.py
│   ├── set_record.py
│   └── set_sentiment_score.py
└── 流程图.png

/.gitignore:
--------------------------------------------------------------------------------
.idea/
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# What it does
Crawls hotel reviews from Ctrip and runs sentiment analysis on each review.
***
# Directory layout
```
.
├── hotel_ids
│   ├── ids_empty.txt  # hotel ids with no review data
│   ├── ids_got.txt    # hotel ids already crawled
│   └── ids_total.txt  # all hotel ids
├── log                # log directory; each run writes a log file here
├── main.py            # entry point
├── README.md
├── requirements.txt   # dependencies
└── src                # source code
    ├── config.py                 # MongoDB, log paths, etc.
    ├── get_hotels_id.py          # fetch hotel ids
    ├── get_proxy.py              # scrape free proxies from [xici](http://www.xicidaili.com/)
    ├── __init__.py
    ├── save_comment.py           # crawl review data
    ├── save_comment_selenium.py  # crawl with a simulated browser
    ├── sentiment_analysis.py     # thin wrapper around [snownlp](https://github.com/isnowfy/snownlp)
    ├── set_record.py             # write a batch record when a run finishes
    └── set_sentiment_score.py    # compute a sentiment score for each review
```
***
# Flow chart
![flow chart](流程图.png)
***
# Run
```
python main.py
```
--------------------------------------------------------------------------------
/hotel_ids/ids_empty.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/hotel_ids/ids_empty.txt
--------------------------------------------------------------------------------
/hotel_ids/ids_got.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/hotel_ids/ids_got.txt
--------------------------------------------------------------------------------
/hotel_ids/ids_total.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/hotel_ids/ids_total.txt
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import time
from src.get_proxy import ProxyGetting
# from src.save_comment import CtripComment
from src.save_comment_selenium import CtripComment
from src.get_hotels_id import HotelIdGetting
from src.set_sentiment_score import SentimentScoreSetting
from src.set_record import RecordSetting


def start():
    HotelIdGetting().get_hids_remain()
    start_time_stamp = time.time()
    proxy, comment, score = [], [], []
    for i in range(1):
        p = ProxyGetting()
        proxy.append(p)
        print('starting proxy thread')
        p.start()

    time.sleep(60)  # give the proxy thread time to fill the pool

    for j in range(4):
        c = CtripComment()
        comment.append(c)
        print('starting crawler thread {}'.format(j))
        c.start()
        time.sleep(5)

    time.sleep(300)
    for k in range(4):
        s = SentimentScoreSetting()
        score.append(s)
        s.start()

    for i in proxy:
        i.join()
        print('proxy thread exited')

    for j in comment:
        j.join()
        print('crawler thread exited')

    for k in score:
        k.join()
        print('analysis thread exited')

    end_time_stamp = time.time()
    HotelIdGetting().ids_file_del()  # every id has been crawled; clear the crawled-id lists so the next run starts fresh
    RecordSetting().set_record(start_time_stamp, end_time_stamp)  # insert the batch record


if __name__ == '__main__':
    start()
--------------------------------------------------------------------------------
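main.py wires the three thread groups together through two shared queues: `ids_remain` (filled by src/get_hotels_id.py) and `ids_got` (filled by the crawler, drained by the analysis threads). A minimal sketch of that producer/consumer shape — `crawl()` and `analyse()` are invented stand-ins for the real `CtripComment` and `SentimentScoreSetting` thread classes:

```python
# Sketch only: crawl()/analyse() are placeholders for the real thread classes.
import queue
import threading

ids_remain = queue.Queue()  # hotel ids waiting to be crawled
ids_got = queue.Queue()     # hotel ids crawled successfully

def crawl():
    while not ids_remain.empty():
        hotel_id = ids_remain.get()
        # ... fetch and store the reviews for hotel_id ...
        ids_got.put(hotel_id)

def analyse():
    while True:
        try:
            hotel_id = ids_got.get(timeout=60)
        except queue.Empty:
            break  # crawlers are done and the queue stayed empty
        # ... score the stored reviews for hotel_id ...

for hid in ('429541', '433280'):
    ids_remain.put(hid)
threads = [threading.Thread(target=f) for f in (crawl, crawl, analyse)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```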
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.5.1
bs4==0.0.1
lxml  # parser requested by BeautifulSoup(..., 'lxml') in get_proxy.py and save_comment_selenium.py
pymongo==3.3.0
requests==2.11.1
selenium==2.53.6
snownlp==0.12.3
tqdm==4.8.4
--------------------------------------------------------------------------------
/sentiment_ctrip.marshal.3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/sentiment_ctrip.marshal.3
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/src/__init__.py
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import datetime
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
ctrip_comment = client['ctrip_comment']

'''one review document: hotel_id, comment_id, comment_date, comment_text, score, sentiment_score'''
comment_detail = ctrip_comment['comment_detail']

'''summary of one hotel's reviews: hotel_id, comment_num, available_comment_num, score, recommend_rate, sentiment_score, deadline'''
comment_basic = ctrip_comment['comment_basic']

'''batch records'''
comment_batch = ctrip_comment['comment_batch']
# comment_batch = client['ctrip_0811']['orderlist']

log_file = 'log/log_{}.txt'.format(str(datetime.date.today()))
ids_total_file = 'hotel_ids/ids_total.txt'  # all ids
ids_got_file = 'hotel_ids/ids_got.txt'      # ids crawled successfully
ids_empty_file = 'hotel_ids/ids_empty.txt'  # ids with no review data
--------------------------------------------------------------------------------
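The docstrings above spell out the document shapes of the three collections. For illustration, a round-trip against `comment_detail` — every field value below is invented:

```python
# Illustration only: invented values, field names from the docstrings in src/config.py.
from src.config import comment_detail

comment_detail.insert_one({
    'hotel_id': '429541',
    'comment_id': '42954112345678',
    'comment_text': 'Clean room, friendly staff.',
    'comment_score': '4.5',
    'comment_date': '2016-08-01',
    # 'sentiment_score' is filled in later by src/set_sentiment_score.py
})
newest = comment_detail.find_one({'hotel_id': '429541'},
                                 sort=[('comment_date', -1)])
print(newest['comment_date'])
```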
/src/get_hotels_id.py:
--------------------------------------------------------------------------------
import requests
from queue import Queue
from .config import log_file, ids_total_file, ids_got_file, ids_empty_file

ids_remain = Queue()  # ids waiting to be crawled

'''fetch hotel ids'''


class HotelIdGetting:
    def __init__(self):
        pass

    '''fetch hotel ids from the server'''
    def get_hids_new(self):
        url = ''  # the id-service endpoint is left blank in the repository
        try:
            response = requests.get(url, timeout=300)
        except Exception:
            print('failed to fetch hotel ids')
            with open(log_file, 'a+') as f:
                f.write('failed to fetch hotel ids\n')
        else:
            ids = response.json()['hids']
            ids = set([id['hotelid'].strip() for id in ids])
            with open(ids_total_file, 'r+') as f:  # all ids
                ids_total = f.readlines()
                ids_total = set(id.strip() for id in ids_total)
                ids_new = ids - ids_total
                if ids_new:
                    for id_new in ids_new:
                        f.write(id_new + '\n')
                    print('{} new hotel ids'.format(len(ids_new)))
                    with open(log_file, 'a+') as f:
                        f.write('{} new hotel ids\n'.format(len(ids_new)))

    def get_hids_remain(self):
        self.get_hids_new()
        with open(ids_total_file, 'r') as f:  # all ids
            ids_total = f.readlines()
            ids_total = set(id.strip() for id in ids_total)
        with open(ids_got_file, 'r') as f:  # ids already crawled
            ids_got = f.readlines()
            ids_got = set(id.strip() for id in ids_got)
        with open(ids_empty_file, 'r') as f:  # ids with no review data
            ids_empty = f.readlines()
            ids_empty = set(id.strip() for id in ids_empty)
        ids = ids_total - ids_got - ids_empty  # ids still to crawl
        for id in ids:
            ids_remain.put({'hotel_id': id, 'start_page': 1})
        with open(log_file, 'a+') as f:
            f.write('{} ids in total: {} crawled, {} still to crawl\n'.format(len(ids_total), len(ids_got), ids_remain.qsize()))
        print('{} ids in total: {} crawled, {} still to crawl\n'.format(len(ids_total), len(ids_got), ids_remain.qsize()))
        return ids_remain

    '''once everything has been crawled, clear the crawled-id files so the next run starts from scratch'''
    def ids_file_del(self):
        print('clearing the crawled-id list and the empty-id list')
        with open(ids_got_file, 'w') as f:
            f.truncate()
        with open(ids_empty_file, 'w') as f:
            f.truncate()


if __name__ == '__main__':
    get_ids = HotelIdGetting()
    get_ids.get_hids_remain()
--------------------------------------------------------------------------------
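Entries on `ids_remain` are dicts rather than bare id strings, so a failed hotel can be requeued with its resume state (`handle_error` in src/save_comment.py adds `start_page` and `deadline`). With invented values:

```python
from src.get_hotels_id import ids_remain

ids_remain.put({'hotel_id': '429541', 'start_page': 1})  # fresh id
ids_remain.put({'hotel_id': '433280', 'start_page': 7,   # requeued after failing on page 7,
                'deadline': '2016-02-15'})                # keeping the already-resolved cutoff date
record = ids_remain.get()
print(record['hotel_id'], record['start_page'], record.get('deadline'))
```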
/src/get_proxy.py:
--------------------------------------------------------------------------------
import requests
import requests.adapters
import time
from threading import Thread
from queue import Queue
from bs4 import BeautifulSoup
from .get_hotels_id import ids_remain

proxy_list = Queue()


class ProxyGetting(Thread):
    def __init__(self):
        self.session = requests.session()
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        self.session.mount('https://', adapter)
        self.session.mount('http://', adapter)
        Thread.__init__(self)

    def get_xici(self, page=1):
        url = 'http://www.xicidaili.com/nn/{}'.format(page)
        headers_xici = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.xicidaili.com',
            'Upgrade-Insecure-Requests': '1',  # header values must be strings, not ints
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'
        }
        print('crawling page {}'.format(page))
        response = self.session.get(url, headers=headers_xici)
        soup = BeautifulSoup(response.text, 'lxml')
        ip_list = soup.find('table', id='ip_list').select('tr')[1:]
        for i in ip_list:
            ip = i.select('td')[1].text.lower()
            port = i.select('td')[2].text
            proxy_type = i.select('td')[4].text
            protocol = i.select('td')[5].text.lower()
            print(ip, port, proxy_type, protocol)
            proxies = {protocol: protocol + '://' + ip + ':' + str(port)}
            proxy_list.put(proxies)

    def get_kuaidaili(self, page=1):
        s = requests.session()
        url = 'http://www.kuaidaili.com/free/inha/{}/'.format(page)
        headers_kuaidaili = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.kuaidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'
        }
        print('crawling page {}'.format(page))
        response = s.get(url, headers=headers_kuaidaili)
        soup = BeautifulSoup(response.text, 'lxml')
        ip_list = soup.table.select('tr')[1:]
        for i in ip_list:
            ip = i.select('td')[0].text.lower()
            port = i.select('td')[1].text
            proxy_type = i.select('td')[2].text
            protocol = i.select('td')[3].text.lower()
            print(ip, port, proxy_type, protocol)
            proxies = {protocol: protocol + '://' + ip + ':' + str(port)}
            proxy_list.put(proxies)

    '''take one proxy off proxy_list'''
    def get_one_proxy(self):
        if not proxy_list.empty():
            return proxy_list.get()
        return None  # None means "no proxy": requests then connects directly

    def run(self):
        while True:
            if not ids_remain.empty():
                if proxy_list.qsize() < 10:
                    self.get_xici()
                else:
                    time.sleep(10)
            else:
                time.sleep(300)  # failed ids may still be requeued, so wait 300s before exiting
                break


if __name__ == '__main__':
    proxy_get = ProxyGetting()
    proxy_get.get_xici()
    print(proxy_get.get_one_proxy())
    print(type(proxy_get.get_one_proxy()))
    # proxy_get.run()
--------------------------------------------------------------------------------
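Each entry on `proxy_list` is a one-key dict mapping protocol to proxy URL — exactly the shape `requests` takes for its `proxies` argument. A standalone check (the proxy address is invented):

```python
import requests

proxies = {'http': 'http://123.57.190.51:8080'}  # invented address, same shape as proxy_list entries
response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
print(response.json())  # should report the proxy's address, not yours
```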
/src/save_comment.py:
--------------------------------------------------------------------------------
import requests
import requests.adapters
import json
import datetime
import time
import pymongo
from tqdm import *
from threading import Thread
from queue import Queue
from .config import comment_basic, comment_detail, log_file, ids_got_file, ids_empty_file
from .get_proxy import ProxyGetting, proxy_list
from .get_hotels_id import ids_remain

MAX_RETRIES = 3
ids_empty = set()  # ids with no review data
ids_got = Queue()  # ids crawled successfully


class CtripComment(Thread):

    def __init__(self, hotel_id=''):
        Thread.__init__(self)
        self.hotel_id = hotel_id
        self.start_page = 1
        self.proxy = ProxyGetting().get_one_proxy()
        self.session = requests.session()
        adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)
        self.session.mount('https://', adapter)
        self.session.mount('http://', adapter)

    def get_data(self, page):
        url = "http://m.ctrip.com/restapi/soa2/10935/hotel/booking/commentgroupsearch?_fxpcqlniredt=09031020210316541274"
        headers = {
            'Host': 'm.ctrip.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
            'Accept': 'application/json',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate, br',
            'X-Requested-With': 'XMLHttpRequest',
            'Content-Type': 'application/json',
            'Referer': 'http://m.ctrip.com/webapp/hotel/hoteldetail/dianping/{}.html?roomtype=&opr=&fr=detail&daylater=0&days=1'.format(self.hotel_id),
            'Connection': 'keep-alive'
        }
        params = {"flag":1,"id":self.hotel_id,"htype":1,"sort":{"idx":1,"size":10,"sort":1,"ord":1},"search":{"kword":"","gtype":4,"opr":0,"ctrl":14,"filters":[]},"alliance":{"aid":"66672","sid":"508668","ouid":"","ishybrid":0},"Key":"f83e59228064b4ede9b33bc4325eb3d9","head":{"cid":"09031020210316541274","ctok":"","cver":"1.0","lang":"01","sid":"8888","syscode":"09","auth":"","extension":[{"name":"pageid","value":"228032"},{"name":"webp","value":0},{"name":"referrer","value":""},{"name":"protocal","value":"https"}]},"contentType":"json"}
        print('using proxy:', self.proxy)
        params['sort']['idx'] = page  # set the page number
        try:
            response = self.session.post(url, data=json.dumps(params), headers=headers, proxies=self.proxy, timeout=60, stream=False)
            print(response.status_code)
        except Exception as e:
            print('error:', e)
            self.handle_error(page)
            return None
        else:
            if response.status_code != 200:
                self.handle_error(page)
                return None
            response = response.json()
            if response['rc'] != 200:
                self.handle_error(page)
                return None
            if not response.get('hcsi'):
                self.handle_error(page)
                return None
            if int(response['hcsi']['total']) > 0:  # compare the count numerically, not as a string
                return response
            print('{} has no review data'.format(self.hotel_id))
            ids_empty.add(self.hotel_id)
            with open(ids_empty_file, 'a+') as f:
                f.write(self.hotel_id + '\n')
            with open(log_file, 'a+') as f:
                f.write('{} has no review data, current proxy {}\n'.format(self.hotel_id, self.proxy))
            return None
    def get_comment(self):
        page = pages = self.start_page
        start_time = time.time()
        while page <= pages:
            response = self.get_data(page)
            if response:
                if page == pages:  # first successful page of this run: read the summary info
                    total_pages = response['groups'][0]['pages']
                    count = response['groups'][0]['count']
                    score = response['hcsi']['avgpts']['all']
                    recommend_rate = response['hcsi']['recmd']
                    pages = total_pages
                    comment_basic.find_one_and_update(
                        {'hotel_id': self.hotel_id},
                        {'$set': {'score': score, 'recommend_rate': recommend_rate, 'comment_total': count}},
                        upsert=True
                    )  # the summary may change between runs, hence find_one_and_update with upsert
                    if not comment_detail.find({'hotel_id': self.hotel_id}).count():
                        self.deadline = str(datetime.date.today() - datetime.timedelta(days=180))  # first crawl of this hotel: cut off at six months ago
                    else:
                        self.deadline = comment_detail.find_one(
                            {'hotel_id': self.hotel_id},
                            sort=[('comment_date', pymongo.DESCENDING)]
                        )['comment_date']  # otherwise cut off at the newest review date from the previous crawl
                    print('deadline:', self.deadline)
                    print('{} has {} reviews on {} pages, score {}, recommend rate {}'.format(self.hotel_id, count, pages, score, recommend_rate))
                print('{} has {} reviews on {} pages, now crawling page {}'.format(self.hotel_id, count, pages, page))
                with open(log_file, 'a+') as f:
                    f.write('{} has {} reviews on {} pages, now crawling page {}, current proxy {}\n'.format(self.hotel_id, count, pages, page, self.proxy))
                comments = response['groups'][0]['comments']
                comment_date = ''
                for comment in comments:
                    comment_score = comment['rats']['all']
                    comment_date = comment['date'].strip().split(' ')[0]
                    comment_id = self.hotel_id + str(comment['comid'])
                    comment_text = comment['text'].strip()
                    comment_dict = {
                        'hotel_id': self.hotel_id,
                        'comment_id': comment_id,
                        'comment_text': comment_text,
                        'comment_score': comment_score,
                        'comment_date': comment_date
                    }
                    if comment_date > self.deadline:
                        if comment_detail.find({'comment_id': comment_id}).count():
                            print('review already stored')
                        else:
                            comment_detail.insert_one(comment_dict)
                    else:
                        print(comment_date)
                        print('reached the cutoff date, {} crawled successfully!'.format(self.hotel_id))
                        ids_got.put(self.hotel_id)  # successfully crawled ids go into ids_got
                        with open(ids_got_file, 'a+') as f:
                            f.write(self.hotel_id + '\n')
                        with open(log_file, 'a+') as f:
                            f.write('{} crawled successfully!\n'.format(self.hotel_id))
                        break
                if comment_date <= self.deadline:
                    break
                page += 1
            else:
                print('{} crawl failed'.format(self.hotel_id))
                break
        end_time = time.time()
        print('{} took {:.1f}s'.format(self.hotel_id, end_time - start_time))

    def handle_error(self, page):
        if page == 1:
            ids_remain.put({'hotel_id': self.hotel_id, 'start_page': page})  # failed ids go back into the queue
        else:
            ids_remain.put({'hotel_id': self.hotel_id, 'start_page': page, 'deadline': self.deadline})  # keep the resolved cutoff so the retry can resume mid-hotel
        self.proxy = ProxyGetting().get_one_proxy()  # switch to a fresh proxy

    def run(self):
        start_time = time.time()
        while not ids_remain.empty():
            record = ids_remain.get()
            self.hotel_id = record['hotel_id']
            self.start_page = record['start_page']
            self.deadline = record.get('deadline')
            ids_remain.task_done()
            print('crawling hotel {} from page {}, {} hotels remaining'.format(self.hotel_id, self.start_page, ids_remain.qsize()))
            with open(log_file, 'a+') as f:
                f.write('crawling hotel {} from page {}, {} hotels remaining\n'.format(self.hotel_id, self.start_page, ids_remain.qsize()))
            self.get_comment()
            print('{} ids left in the queue'.format(ids_remain.qsize()))
        end_time = time.time()
        print('all hotels crawled, took:', end_time - start_time)


def print_run_time(func):
    """decorator: print the run time"""
    def wrapper(*args, **kw):
        start_time = time.time()
        func(*args, **kw)
        print('run time is {:.2f}'.format(time.time() - start_time))
    return wrapper


@print_run_time
def start():
    comment = []
    for j in range(1):
        c = CtripComment()
        comment.append(c)
        c.start()

    for j in comment:
        j.join()


if __name__ == '__main__':
    ctrip_comment = CtripComment()
    ctrip_comment.hotel_id = '429541'
    ctrip_comment.get_comment()
    print(ids_empty)
--------------------------------------------------------------------------------
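The crawler walks reviews newest-first and stops at `self.deadline`, so a rerun only fetches what appeared since the previous crawl. The cutoff rule in isolation, with invented dates:

```python
import datetime

today = datetime.date.today()
stored_newest = None  # newest comment_date already stored for this hotel, if any
deadline = stored_newest or str(today - datetime.timedelta(days=180))

# Pages arrive newest-first; ISO date strings compare correctly as plain strings.
for comment_date in (str(today - datetime.timedelta(days=d)) for d in (3, 40, 200)):
    if comment_date > deadline:
        print('store', comment_date)
    else:
        print('cutoff reached at', comment_date)  # everything older is already stored
        break
```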
/src/save_comment_selenium.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import time
import datetime
import re
import random
import codecs
import pymongo
from tqdm import *
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.proxy import *
from bs4 import BeautifulSoup
from threading import Thread
from queue import Queue
from .config import comment_basic, comment_detail, log_file, ids_got_file, ids_empty_file
from .get_proxy import ProxyGetting, proxy_list
from .get_hotels_id import ids_remain

ids_empty = set()  # ids with no review data
ids_got = Queue()  # ids crawled successfully
n = 0              # hotels processed so far, shared by all crawler threads


# fetch reviews from Ctrip and save them to the database
class CtripComment(Thread):

    def __init__(self, hotel_id=''):
        self.hotel_id = hotel_id
        self.start_page = 1
        self.proxy = ProxyGetting().get_one_proxy()
        Thread.__init__(self)

    # crawl up to six months of a hotel's reviews, good and bad alike
    def save_comments_all_pages(self, page=1):
        start = time.time()
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
        )  # only used by the (commented-out) PhantomJS driver below
        print('using proxy:', self.proxy)
        if self.proxy:
            p = ''.join(self.proxy.values()).split('/')[-1]  # strip the scheme, keep host:port
            proxy = Proxy({
                'proxyType': ProxyType.MANUAL,
                'httpProxy': p,
                'noProxy': ''
            })
            driver = webdriver.Firefox(proxy=proxy)
        else:
            driver = webdriver.Firefox()  # no proxy available yet: connect directly
        # driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(30)
        url = 'http://hotels.ctrip.com/hotel/dianping/{}.html'.format(self.hotel_id)
        try:
            driver.get(url)
            time.sleep(3)
        except Exception as e:
            print('error:', e)
            with open(log_file, 'a+') as f:
                f.write('{} failed to open, will retry later!\n'.format(self.hotel_id))
            ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})  # requeue in the record format run() expects
        else:
            print(driver.current_url)

            if driver.current_url != url or '此酒店暂无点评' in driver.page_source:
                print('this hotel has no review data!')
                with open(log_file, 'a+') as f:
                    f.write('{} has no review data!\n'.format(self.hotel_id))
                with open(ids_empty_file, 'a+') as f:
                    f.write(self.hotel_id + '\n')
                ids_empty.add(self.hotel_id)
            else:
                if u'您访问的太快了, 休息一下吧。 或者输入验证码继续访问' in driver.page_source:
                    print('blocked for requesting too fast, requeueing')
                    ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})
                else:
                    try:
                        select = Select(driver.find_element_by_class_name('select_sort'))
                        time.sleep(1)
                        select.select_by_value('1')  # use the dropdown to sort reviews by time
                        time.sleep(3)
                    except Exception as e:
                        print('error:', e)
                        ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})
                        with open(log_file, 'a+') as f:
                            f.write("{}'s reviews cannot be sorted by time, will retry later!\n".format(self.hotel_id))
                    else:
                        last_page_comment = ''
                        while True:
                            soup = BeautifulSoup(driver.page_source, 'lxml')
                            if page == 1:
                                score = soup.find('div', class_="comment_total_score").find('span', class_='score').span.text.strip()  # rating
                                recommend_rate = soup.find('span', class_='rec').span.text.strip('%')  # recommend rate
                                comment_total_tmp = soup.find('span', id='All_Comment').text.strip()  # number of reviews
                                comment_total = re.search('\d+', comment_total_tmp).group(0)
                                print('hotel {} has {} reviews, score {}, recommend rate {}'.format(self.hotel_id, comment_total, score, recommend_rate))
                                comment_basic.find_one_and_update(
                                    {'hotel_id': self.hotel_id},
                                    {'$set': {'score': score, 'recommend_rate': recommend_rate, 'comment_total': comment_total}},
                                    upsert=True
                                )  # the summary may change between runs, hence find_one_and_update with upsert
                                if not comment_detail.find({'hotel_id': self.hotel_id}).count():
                                    self.deadline = str(datetime.date.today() - datetime.timedelta(days=180))  # first crawl of this hotel: cut off at six months ago
                                else:
                                    self.deadline = comment_detail.find_one(
                                        {'hotel_id': self.hotel_id},
                                        sort=[('comment_date', pymongo.DESCENDING)]
                                    )['comment_date']  # otherwise cut off at the newest review date from the previous crawl
                                print('deadline:', self.deadline)
                            print('crawling page {} of hotel {}'.format(page, self.hotel_id))
                            with open(log_file, 'a+') as f:
                                f.write('crawling page {} of hotel {}\n'.format(page, self.hotel_id))
                            comments = soup.findAll('div', {'class': 'comment_block J_asyncCmt'})
                            comment_date = ''
                            if comments == last_page_comment:
                                print('same data as the previous page, skipping!')
                            else:
                                last_page_comment = comments
                                comment_num = len(comments)
                                print('page {} of hotel {} has {} reviews'.format(page, self.hotel_id, comment_num))
                                for i in comments:
                                    comment_text = i.find('div', class_="J_commentDetail").text.replace('\n', ' ')  # flatten multi-line reviews into one line
                                    comment_date = i.select('span[class="time"]')[0].text
                                    comment_date = re.sub("[\u4e00-\u9fa5()]+", '', comment_date)
                                    try:
                                        comment_score = i.select('span[class="score"]')[0].span.text.strip()
                                    except Exception:
                                        comment_score = ''
                                    if comment_date > self.deadline:  # keep only reviews newer than the cutoff
                                        comment_dict = {
                                            'hotel_id': self.hotel_id,
                                            'comment_text': comment_text,
                                            'comment_score': comment_score,
                                            'comment_date': comment_date
                                        }
                                        print(comment_dict)
                                        comment_detail.insert_one(comment_dict)
                                    else:
                                        print(comment_date, comment_text)
                                        print('reached the cutoff date!')
                                        ids_got.put(self.hotel_id)  # successfully crawled ids go into ids_got
                                        with open(log_file, 'a+') as f:
                                            f.write('{} crawled successfully!\n'.format(self.hotel_id))
                                        with open(ids_got_file, 'a+') as f:
                                            f.write(self.hotel_id + '\n')
                                        break  # leave the for loop first
                                if comment_date <= self.deadline:
                                    break
                            '''look for the next page'''
                            try:
                                next = driver.find_element_by_class_name('c_down')
                            except Exception as e:
                                print('error:', e)
                                ids_got.put(self.hotel_id)  # no next-page button: the last page has been reached
                                with open(log_file, 'a+') as f:
                                    f.write('{} crawled successfully!\n'.format(self.hotel_id))
                                with open(ids_got_file, 'a+') as f:
                                    f.write(self.hotel_id + '\n')
                                break
                            else:
                                next_page = int(soup.find('a', class_='c_down')['value'])  # compare page numbers as ints, not str vs int
                                if page == next_page:  # the last click did not take effect: click again and wait longer
                                    print('did not advance to page {}, still on page {}'.format(page, page - 1))
                                    try:
                                        next.click()
                                    except Exception as e:
                                        print('error:', e)
                                        comment_detail.remove({'hotel_id': self.hotel_id, 'comment_date': {'$gt': self.deadline}})  # paging failed: drop only this hotel's data from this round and retry later
                                        ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})
                                        break
                                else:
                                    page = next_page
                                    try:
                                        next.click()
                                    except Exception as e:
                                        print('error:', e)
                                        comment_detail.remove({'hotel_id': self.hotel_id, 'comment_date': {'$gt': self.deadline}})  # paging failed: drop only this hotel's data from this round and retry later
                                        ids_remain.put({'hotel_id': self.hotel_id, 'start_page': 1})
                                        break
                                time.sleep(5)

        driver.quit()
        print('hotel {} took:'.format(self.hotel_id), time.time() - start)

    def run(self):
        global n
        start_time_stamp = time.time()
        while not ids_remain.empty():
            n += 1
            record = ids_remain.get()
            self.hotel_id = record['hotel_id']
            ids_remain.task_done()
            print('crawling hotel #{} (id {}), {} hotels remaining'.format(n, self.hotel_id, ids_remain.qsize()))
            with open(log_file, 'a+') as f:
                f.write('crawling hotel {}, {} hotels remaining\n'.format(self.hotel_id, ids_remain.qsize()))
            self.save_comments_all_pages()
            if n % 50 == 0:
                time.sleep(300)  # take a long break every 50 hotels to avoid the rate limit
            else:
                time.sleep(random.random() * 10)
        end_time_stamp = time.time()
        print('all hotels crawled, took:', end_time_stamp - start_time_stamp)


def print_run_time(func):
    """decorator: print the run time"""
    def wrapper(*args, **kw):
        start_time = time.time()
        func(*args, **kw)
        print('run time is {:.2f}'.format(time.time() - start_time))
    return wrapper


@print_run_time
def start():
    producers = []
    for i in range(2):
        p = CtripComment()
        producers.append(p)
        p.start()
        time.sleep(10)

    for i in producers:
        i.join()


if __name__ == '__main__':
    ctrip_comment = CtripComment()
    ctrip_comment.save_comments_all_pages()
    print(ids_empty)
--------------------------------------------------------------------------------
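The `dcap` capabilities dict built at the top of `save_comments_all_pages` is only consumed by the PhantomJS path, which is commented out; the Firefox driver ignores it. If you switch back to PhantomJS, it would presumably be passed like this (an untested sketch against selenium 2.x):

```python
# Hypothetical: headless PhantomJS with the mobile user agent from dcap.
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.set_page_load_timeout(30)
```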
/src/sentiment_analysis.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from snownlp import SnowNLP


# thin wrapper around snownlp's features
class SentimentAnalyse:

    def __init__(self, comment):
        self.s = SnowNLP(comment)

    def get_comment_label(self):
        pass

    def get_sentiment_score(self):
        return self.s.sentiments

    def get_sentences(self):
        return self.s.sentences

    def get_words(self):
        return self.s.words
--------------------------------------------------------------------------------
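`sentiment_ctrip.marshal.3` at the repository root looks like a snownlp sentiment model retrained on hotel reviews (snownlp appends `.3` when saving under Python 3), but nothing in src/ loads it, so the guess is that it replaces snownlp's bundled model by hand. A sketch of how such a model is produced and used with snownlp's own API — `neg.txt`/`pos.txt` are hypothetical files of labelled reviews, one per line:

```python
from snownlp import sentiment, SnowNLP

sentiment.train('neg.txt', 'pos.txt')        # hypothetical labelled corpora
sentiment.save('sentiment_ctrip.marshal')    # writes sentiment_ctrip.marshal.3 on Python 3

sentiment.load('sentiment_ctrip.marshal')    # picks the '.3' file back up on Python 3
print(SnowNLP('房间很干净,服务也不错').sentiments)  # scored by the retrained model
```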
/src/set_record.py:
--------------------------------------------------------------------------------
import time
import hashlib
from .config import comment_batch, log_file
from .save_comment import ids_empty  # NB: save_comment_selenium keeps its own, separate ids_empty set


class RecordSetting:
    def __init__(self):
        pass

    def set_record(self, start_time_stamp, end_time_stamp):
        record = {
            'orderid': hashlib.md5(str(start_time_stamp).encode('utf8')).hexdigest(),  # md5 of the start time
            'tag': 1,
            'inserttime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time_stamp)),
            'type': 'S',
            'ispull': 0,
            'endtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time_stamp)),
            'day': time.strftime('%Y-%m-%d', time.localtime(start_time_stamp))
        }
        comment_batch.insert_one(record)
        print('all hotels crawled, took:', end_time_stamp - start_time_stamp)
        with open(log_file, 'a+') as f:
            f.write('all hotels crawled, took: {}\n'.format(str(end_time_stamp - start_time_stamp)))
            f.write('hotels with no review data: {}\n'.format(ids_empty))
--------------------------------------------------------------------------------
/src/set_sentiment_score.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import time
import datetime
import re
from .config import comment_basic, comment_detail, log_file
from .sentiment_analysis import SentimentAnalyse
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from .save_comment import ids_got  # NB: save_comment_selenium keeps its own, separate ids_got queue
from .get_hotels_id import ids_remain

pattern_zh = re.compile(u'[\u4e00-\u9fa5]+')


class SentimentScoreSetting(Thread):

    def __init__(self):
        Thread.__init__(self)

    # store a sentiment score on a single review
    def set_sentiment_score(self, _id, score):
        comment_detail.update(
            {'_id': _id},
            {'$set': {'sentiment_score': score}},
        )

    # store the average sentiment score over a hotel's reviews
    def set_sentiment_score_average(self, hotel_id, sum_score, comment_num):
        score_avg = sum_score / comment_num
        score_avg = round(score_avg * 5, 1)  # convert to a five-point scale, rounded to one decimal place
        comment_basic.update(
            {'hotel_id': hotel_id},
            {'$set': {'sentiment_score_avg': score_avg}},
        )

    def get_sentiment_score(self, comment):
        sentiment_analysis = SentimentAnalyse(comment)
        return sentiment_analysis.get_sentiment_score()

    def main(self, hotel_id):
        today = datetime.date.today()
        deadline = str(today - datetime.timedelta(days=180))  # cutoff: six months ago
        result = comment_detail.find({'hotel_id': hotel_id, 'comment_date': {'$gte': deadline}, 'comment_text': {'$regex': u'[\u4e00-\u9fa5]'}})  # reviews from the last six months that contain Chinese text
        if result.count():
            sum_score = 0
            for res in result:
                _id, comment = res['_id'], re.sub('\n+', '', res['comment_text'])
                sentiment_score = res.get('sentiment_score')
                if not sentiment_score:  # only score reviews that do not have a sentiment value yet
                    sentiment_score = self.get_sentiment_score(comment)
                sum_score += sentiment_score
                self.set_sentiment_score(_id, sentiment_score)
            if result.count() > 10:  # with 10 reviews or fewer there is too little data to average
                self.set_sentiment_score_average(hotel_id, sum_score, result.count())
                with open(log_file, 'a+') as f:
                    f.write('{} has {} reviews, {} were given a sentiment score, average {}\n'.format(hotel_id, comment_detail.count({'hotel_id': hotel_id}), result.count(), sum_score / result.count()))
                print('{} has {} reviews, {} were given a sentiment score, average {}'.format(hotel_id, comment_detail.count({'hotel_id': hotel_id}), result.count(), sum_score / result.count()))
            else:
                print('{} has too few reviews, skipping the average score'.format(hotel_id))
                with open(log_file, 'a+') as f:
                    f.write('{} has too few reviews, skipping the average score\n'.format(hotel_id))
        else:
            print('no matching reviews found for {}!'.format(hotel_id))

    def run(self):
        while True:
            if not ids_got.empty():
                hotel_id = ids_got.get()
                self.main(hotel_id)
            else:
                if not ids_remain.empty():
                    print('queue empty, analysis thread waiting......')
                    time.sleep(300)
                else:
                    break
        print('ids_got is empty; every crawled id has been analysed')


def start():
    sentiments = []
    for i in range(2):
        p = SentimentScoreSetting()
        sentiments.append(p)
        p.start()

    for i in sentiments:
        i.join()


if __name__ == '__main__':
    sentiment_setting = SentimentScoreSetting()
    hotel_ids = comment_detail.distinct('hotel_id')  # hotel ids already crawled
    print('{} hotels in total'.format(len(hotel_ids)))
    pool = ThreadPool(processes=4)
    start_time = time.time()
    pool.map(sentiment_setting.main, hotel_ids)
    pool.close()
    pool.join()
    print('every crawled id has been analysed')
    print('took:', time.time() - start_time)
--------------------------------------------------------------------------------
/流程图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jllan/comment_sentiment_analysis/e151f9d9ebcca175047da8289bb35aec4b2324e0/流程图.png
--------------------------------------------------------------------------------