├── .idea
│   ├── dictionaries
│   │   └── Mr_Li.xml
│   ├── vcs.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── dazhongdianping.iml
│   └── workspace.xml
├── __pycache__
│   └── test.cpython-38.pyc
├── README.md
├── .gitignore
└── dzdp.py

/.idea/dictionaries/Mr_Li.xml:
--------------------------------------------------------------------------------
/__pycache__/test.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saltedfish666/dazhongdianping/HEAD/__pycache__/test.cpython-38.pyc
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
/.idea/dazhongdianping.iml:
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Dianping (大众点评) Review Crawler

## Note: this project is for learning purposes only; commercial use or any other unlawful use is strictly prohibited.

Program file: dzdp.py
The output is a single reviews.txt file.

**Updated: 2020/02/29**
Changes:
1. Fixed known bugs.
2. Added formatting to the output.

**Updated: 2020/02/27**
Notes:
1. This program mainly deals with Dianping's CSS font obfuscation. IP bans are mitigated by sleeping 10-25 seconds between pages. CAPTCHAs are not handled automatically, but you can solve them by hand, adjust the relevant code, and still crawl every page.
2. The program only crawls all the reviews of a single shop on Dianping.
3. The number of review pages is read off the website and hard-coded; fetching the page count automatically is not implemented.
4. Dianping requires a cookie to log in, so capture your own cookie and put it into the headers.
5. Only the review text is captured; nicknames, user IDs, images and other information are not saved.
6. The captured reviews are not yet formatted.

If you find mistakes or have better ideas, feel free to point them out so we can discuss and learn together.
--------------------------------------------------------------------------------
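The CSS obfuscation mentioned in the README works roughly like this: each hidden character in a review is an empty tag whose CSS class resolves to a background offset into a large SVG sprite of glyphs, and dividing that offset by the font size gives the column and row of the real character. Below is a minimal, self-contained sketch of that lookup only; every class name, offset and glyph row in it is invented for illustration and is not Dianping's real data. The full crawler is in dzdp.py further down.

# Sketch of the CSS-position -> SVG-glyph lookup; all values are made up.

FONT_SIZE = 14  # px, as declared in the obfuscation stylesheet

# row y-coordinate in the SVG -> the row of real characters
svg_rows = {
    '23': '很好吃环境不错服务态度',
    '47': '一般般性价比高值得推荐',
}

# CSS class -> (x offset, y offset), parsed from rules like
# .abcd{background:-42.0px -15.0px;}
css_positions = {
    'abcd': (42, 15),
    'efgh': (28, 40),
}

def decode(css_class):
    x, y = css_positions[css_class]
    # pick the first SVG row whose y-coordinate lies below the offset
    row_y = next(r for r in sorted(svg_rows, key=int) if y < int(r))
    return svg_rows[row_y][x // FONT_SIZE]

if __name__ == '__main__':
    print(decode('abcd'))  # column 3 of the first row -> 环
    print(decode('efgh'))  # column 2 of the second row -> 般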
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
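One thing the source below does not show: the README says CAPTCHA challenges are solved by hand. Here is a minimal sketch of how a manual pause could be wired around requests.get(); the check for a 'verify' fragment in the redirected URL and the 403 status are assumptions about how Dianping signals a verification page, not behaviour taken from the original code.

# Hypothetical wrapper that pauses the crawl for manual CAPTCHA solving.
import requests

def fetch_with_manual_captcha(url, headers):
    res = requests.get(url, headers=headers)
    # 'verify' in the final URL and HTTP 403 are assumed signals of a CAPTCHA page
    while 'verify' in res.url or res.status_code == 403:
        input('Verification page detected - solve the CAPTCHA in a browser, '
              'update the Cookie header if needed, then press Enter to retry...')
        res = requests.get(url, headers=headers)
    return res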
/dzdp.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
@project: dazhongdianping
@file: dzdp.py
@ide: PyCharm
@time: 2020-02-24 18:30:41
@author: Mr.Li
Copyright © 2020—2020 Mr.Li. All rights reserved.
"""

import requests
import re  # re parses strings with regular expressions; lxml parses HTML/XML documents
from lxml import etree
from fake_useragent import UserAgent
import time  # used to pause the program and avoid getting blocked
import random  # used to generate random delay values


def get_url_list():
    url_list = []
    for i in range(72):  # the shop I crawled has 72 review pages; fetching the page count automatically is not implemented yet
        url_list.append('http://www.dianping.com/shop/18335920/review_all/p' + str(i + 1))
    return url_list


def get_css_content(html, headers):
    print('------begin to get css content------')
    # NOTE: the original pattern was stripped from this copy of the file; it has to
    # capture the protocol-relative URL of the obfuscation stylesheet from the
    # page's <link> tag, e.g. something like
    # r'<link rel="stylesheet" type="text/css" href="(//s3plus[^"]+?\.css)"'.
    css_l = re.search(r'', html)
    css_link = 'http:' + css_l.group(1)
    # pass headers as a keyword argument so requests sends them as request headers
    html_css = requests.get(css_link, headers=headers).text
    return html_css


def get_font_dic(css_content):
    print('------begin to get font dictionary------')
    # get the svg link and the svg page's source
    svg_l = re.search(r'svgmtsi.*?(//s3plus.sankuai.com.*?svg)\);', css_content)
    svg_link = 'http:' + svg_l.group(1)
    svg_html = requests.get(svg_link).text
    # build the dictionary: "x,y" position -> character
    y_list = re.findall('d="M0 (.*?) H600"', svg_html)  # the elements of y_list are str
    font_dic = {}
    j = 0  # j is the row index
    font_size = int(re.findall(r'font-size:(.*?)px;fill:#333;}', svg_html)[0])
    for y in y_list:
        # NOTE: the original pattern was stripped from this copy; it has to capture
        # the characters of the j-th glyph row of the SVG (the text of the j-th
        # <text>/<textPath> element).
        font_l = re.findall(r'(.*?)', svg_html)
        font_list = re.findall(r'.{1}', font_l[0])
        for x in range(len(font_list)):  # x is the column index within the row
            font_dic[str(x * font_size) + ',' + y] = font_list[x]
        j += 1
    return font_dic, y_list


def get_html_full_review(html, css_content, font_dic, y_list):
    # NOTE: the original pattern was stripped from this copy; it has to capture the
    # CSS class of every obfuscated character tag, e.g. r'<svgmtsi class="(.*?)">'.
    font_key_list = re.findall(r'', html)
    # print(len(font_key_list))
    for font_key in font_key_list:
        pos_key = re.findall(r'.' + font_key + '{background:-(.*?).0px -(.*?).0px;}', css_content)
        pos_x = pos_key[0][0]
        pos_y_original = pos_key[0][1]
        for y in y_list:
            if int(pos_y_original) < int(y):
                pos_y = y
                break
        # NOTE: the first argument of replace() was also stripped from this copy;
        # it has to be the full obfuscated tag for font_key, so that the tag is
        # replaced by the real character looked up in font_dic.
        html = html.replace('', font_dic[pos_x + ',' + pos_y])
    return html


def reviews_output(html_full_review, flag):
    print('------start extracting reviews and writing them to file------')
    html = etree.HTML(html_full_review)
    reviews_items = html.xpath("//div[@class='reviews-items']/ul/li")
    for i in reviews_items:
        r = i.xpath("./div/div[@class='review-words Hide']/text()")
        if not r:
            # short reviews that do not need to be expanded
            r = i.xpath("./div/div[@class='review-words']/text()")
        flag += 1
        # print(r)
        # print('Review No.' + str(flag) + ':\n' + r[0].strip())
        with open('reviews.txt', 'a+', encoding='UTF-8') as f:
            f.write('Review No.' + str(flag) + ':\n' + r[0].strip() + '\n\n')
    print('------done writing, sleeping for 10-25 seconds------')
    time.sleep(10 + 15 * random.random())


if __name__ == '__main__':
    url_list = get_url_list()
    flag = 0  # review counter; 15 is added per page below, assuming 15 reviews per page
    # url = 'http://www.dianping.com/shop/18335920/review_all/p1'
    headers = {
        'Cookie': 'your own cookie',
        'host': 'www.dianping.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': UserAgent().random
    }
    res = requests.get(url_list[0], headers=headers)
    # get the contents of the obfuscation css file
    css_content = get_css_content(res.text, headers)
    # get the font dictionary
    font_dic, y_list = get_font_dic(css_content)
    # parse the first page
    print('------start parsing page 1------')
    html_full_review = get_html_full_review(res.text, css_content, font_dic, y_list)
    reviews_output(html_full_review, flag)
    flag += 15
    # parse every page from the second one onwards
    for n in range(len(url_list) - 1):
        print('------start parsing page ' + str(n + 2) + '------')
        res = requests.get(url_list[n + 1], headers=headers)
        if res:
            html_full_review = get_html_full_review(res.text, css_content, font_dic, y_list)
            reviews_output(html_full_review, flag)
            flag += 15
        else:
            print('failed to request the page')
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
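Both the README and the comment in get_url_list() note that the number of review pages (72) is hard-coded. The following is a sketch of how the count might instead be read from the pager of the first page; the 'reviews-pages' class in the XPath is an assumption about Dianping's markup, not a verified selector, and get_page_count()/the reworked get_url_list() are hypothetical helpers, not part of the original code.

# Hypothetical replacement for the hard-coded range(72) in get_url_list().
from lxml import etree

def get_page_count(first_page_html, default=1):
    html = etree.HTML(first_page_html)
    # assumed pager markup: <div class="reviews-pages"> ... <a>1</a> <a>2</a> ...
    texts = html.xpath("//div[@class='reviews-pages']//a/text()")
    numbers = [int(t.strip()) for t in texts if t.strip().isdigit()]
    return max(numbers) if numbers else default

def get_url_list(page_count, shop_id='18335920'):
    base = 'http://www.dianping.com/shop/' + shop_id + '/review_all/p'
    return [base + str(i + 1) for i in range(page_count)]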