├── .idea
│   ├── dictionaries
│   │   └── Mr_Li.xml
│   ├── vcs.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── dazhongdianping.iml
│   └── workspace.xml
├── __pycache__
│   └── test.cpython-38.pyc
├── README.md
├── .gitignore
└── dzdp.py
/.idea/dictionaries/Mr_Li.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/__pycache__/test.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saltedfish666/dazhongdianping/HEAD/__pycache__/test.cpython-38.pyc
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/dazhongdianping.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 大众点评 (Dianping) Review Scraper
2 |
3 | ## Note: this project is for learning purposes only; commercial use and any other illegal use are strictly prohibited
4 |
5 | Program file: dzdp.py
6 | The output is a single reviews.txt file.
7 |
8 | **Updated: 2020/02/29**
9 | Changes:
10 | 1. Fixed known bugs.
11 | 2. Added formatting to the output.
12 |
13 | **Updated: 2020/02/27**
14 | Notes:
15 | 1. The program mainly tackles Dianping's CSS-based text obfuscation (see the sketch at the end of this README). To reduce the risk of IP bans it sleeps 10-25 seconds between pages. CAPTCHAs are not handled automatically, but you can solve them by hand, adjust the relevant code, and still crawl all pages.
16 | 2. It only crawls the full set of reviews of a single shop on Dianping.
17 | 3. The number of review pages is read off the website by hand and filled into the code; there is no automatic page-count detection.
18 | 4. Dianping requires a login cookie, which you have to capture yourself and put into the request headers.
19 | 5. Only the review text is saved; nicknames, user IDs, pictures and similar information are not.
20 | 6. The saved reviews have not been formatted yet.
21 |
22 | If you find a mistake or have a better idea, feel free to point it out so we can discuss and learn together.
23 |
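24 | The obfuscation mentioned in note 1 replaces review characters with empty tags whose CSS class points at a background offset into an SVG sheet of characters; dzdp.py resolves each offset back to a character. Below is a minimal sketch of that mapping with invented sample data (the real class names, offsets, glyph size and SVG rows are all fetched from the page at runtime):
25 |
26 | ```python
27 | # Sketch of the CSS/SVG de-obfuscation idea used in dzdp.py; all values are made-up samples.
28 | font_size = 14                   # glyph width in px, parsed from the SVG's CSS
29 | y_rows = [23, 37, 51]            # row baselines parsed from <path d="M0 y H600"> elements
30 | rows = ["店里环境很好", "菜品味道不错", "服务态度一般"]  # text content of each SVG row
31 |
32 | def decode(offset_x, offset_y):
33 |     """Map a CSS background offset (positive px values) to one character."""
34 |     row_idx = 0
35 |     for row_idx, y in enumerate(y_rows):  # pick the first row baseline below the y offset
36 |         if offset_y < y:
37 |             break
38 |     col_idx = offset_x // font_size       # column index = x offset / glyph width
39 |     return rows[row_idx][col_idx]
40 |
41 | # A CSS rule like ".abc1de{background:-28.0px -30.0px;}" yields x=28, y=30:
42 | print(decode(28, 30))  # second row, third character -> '味'
43 | ```
44 |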
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/dzdp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | @project: dazhongdianping
6 | @file: dzdp.py
7 | @ide: PyCharm
8 | @time: 2020-02-24 18:30:41
9 | @author: Mr.Li
10 | Copyright © 2020—2020 Mr.Li. All rights reserved.
11 | """
12 |
13 | import requests
14 | import re  # the re library parses strings, while lxml can parse XML/HTML documents
15 | from lxml import etree
16 | from fake_useragent import UserAgent
17 | import time    # pause the program to avoid getting the account banned
18 | import random  # generate random delay lengths
19 |
20 | def get_url_list():
21 |     url_list = []
22 |     for i in range(72):  # the reviews scraped here span 72 pages; automatic page-count detection is not implemented yet
23 |         url_list.append('http://www.dianping.com/shop/18335920/review_all/p' + str(i + 1))
24 |     return url_list
25 |
26 | def get_css_content(html, headers):
27 |     print('------begin to get css content------')
28 |     # NOTE: the original pattern was lost in this dump; reconstructed to capture the href of the svgtextcss stylesheet
29 |     css_l = re.search(r'<link rel="stylesheet" type="text/css" href="(//[^"]*?svgtextcss[^"]*?\.css)">', html)
30 |     css_link = 'http:' + css_l.group(1)
31 |     html_css = requests.get(css_link, headers=headers).text
32 |     return html_css
33 |
34 | def get_font_dic(css_content):
35 |     print('------begin to get font dictionary------')
36 |     # fetch the SVG link and the SVG page source
37 |     svg_l = re.search(r'svgmtsi.*?(//s3plus.sankuai.com.*?svg)\);', css_content)
38 |     svg_link = 'http:' + svg_l.group(1)
39 |     svg_html = requests.get(svg_link).text
40 |     # build the dictionary mapping "x,y" offsets to characters
41 |     y_list = re.findall('d="M0 (.*?) H600"', svg_html)  # elements of y_list are str
42 |     font_dic = {}
43 |     j = 0  # j indexes the current SVG text row
44 |     font_size = int(re.findall(r'font-size:(.*?)px;fill:#333;}', svg_html)[0])
45 |     for y in y_list:
46 |         # NOTE: the tag markup of the original pattern was lost in this dump; reconstructed as the <textPath> row referenced by the path with id j+1
47 |         font_l = re.findall(r'<textPath xlink:href="#' + str(j + 1) + '"[^>]*>(.*?)</textPath>', svg_html)
48 |         font_list = re.findall(r'.{1}', font_l[0])  # split the row into single characters
49 |         for x in range(len(font_list)):  # x is the character's index within the row
50 |             font_dic[str(x * font_size) + ',' + y] = font_list[x]
51 |         j += 1
52 |     return font_dic, y_list
53 |
54 | def get_html_full_review(html, css_content, font_dic, y_list):
55 |     # NOTE: the original pattern was lost in this dump; reconstructed to collect the class names of the obfuscated <svgmtsi> tags
56 |     font_key_list = re.findall(r'<svgmtsi class="(.*?)"></svgmtsi>', html)
57 |     # print(len(font_key_list))
58 |     for font_key in font_key_list:
59 |         pos_key = re.findall(r'.' + font_key + '{background:-(.*?).0px -(.*?).0px;}', css_content)
60 |         pos_x = pos_key[0][0]
61 |         pos_y_original = pos_key[0][1]
62 |         for y in y_list:
63 |             if int(pos_y_original) < int(y):
64 |                 pos_y = y
65 |                 break
66 |         # replace the obfuscated tag with the decoded character (replacement target reconstructed to match the pattern above)
67 |         html = html.replace('<svgmtsi class="' + font_key + '"></svgmtsi>', font_dic[pos_x + ',' + pos_y])
68 |     return html
69 |
70 | def reviews_output(html_full_review, flag):
71 |     print('------开始提取评论并写入文件------')
72 |     html = etree.HTML(html_full_review)
73 |     reviews_items = html.xpath("//div[@class='reviews-items']/ul/li")
74 |     for i in reviews_items:
75 |         r = i.xpath("./div/div[@class='review-words Hide']/text()")
76 |         if not r:
77 |             r = i.xpath("./div/div[@class='review-words']/text()")  # short reviews that need no "expand" link
78 |         flag += 1
79 |         # print(r)
80 |         # print('第' + str(flag) + '条评论:\n' + r[0].strip())
81 |         with open('reviews.txt', 'a+', encoding='UTF-8') as f:
82 |             f.write('第' + str(flag) + '条评论:\n' + r[0].strip() + '\n\n')
83 |     print('------写入完成,延迟10-25秒------')
84 |     time.sleep(10 + 15 * random.random())
85 |
86 | if __name__ == '__main__':
87 |     url_list = get_url_list()
88 |     flag = 0  # running count of reviews written so far
89 |     # url = 'http://www.dianping.com/shop/18335920/review_all/p1'
90 |     headers = {
91 |         'Cookie': '自己的cookie',
92 |         'host': 'www.dianping.com',
93 |         'Upgrade-Insecure-Requests': '1',
94 |         'User-Agent': UserAgent().random
95 |     }
96 |     res = requests.get(url_list[0], headers=headers)
97 |     # fetch the obfuscation CSS
98 |     css_content = get_css_content(res.text, headers)
99 |     # build the font dictionary
100 |     font_dic, y_list = get_font_dic(css_content)
101 |     # parse the first page
102 |     print('------开始解析第1个网页------')
103 |     html_full_review = get_html_full_review(res.text, css_content, font_dic, y_list)
104 |     reviews_output(html_full_review, flag)
105 |     flag += 15  # each review page holds 15 reviews
106 |     # parse every page from the second one onwards
107 |     for n in range(len(url_list) - 1):
108 |         print('------开始解析第' + str(n + 2) + '个网页------')
109 |         res = requests.get(url_list[n + 1], headers=headers)
110 |         if res:
111 |             html_full_review = get_html_full_review(res.text, css_content, font_dic, y_list)
112 |             reviews_output(html_full_review, flag)
113 |             flag += 15
114 |         else:
115 |             print('无法请求网页')
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------