├── .gitignore ├── LICENSE ├── README.md ├── analysis ├── comments_per_day.py ├── images │ ├── 15天评论量分布图.png │ ├── 15天评论量走势图.png │ └── wc.png ├── vote_top_10.py └── word_cloud_generator.py ├── crawler ├── downloader.py ├── main.py ├── manager.py ├── parser.py └── processor.py └── example ├── __jieba.py ├── __word_cloud.py ├── dicts └── my_dict.txt ├── fonts └── FZXingKai-S04S.TTF └── images ├── __background.jpg ├── __wc_1.png └── __wc_2.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea 107 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python-crawler-douban-movie 2 | 豆瓣电影(短评)爬虫 3 | 4 | 5 | #### 爬虫 6 | ``` 7 | # 爬取到的数据(这里只展示部分数据) 8 | > db.getCollection('movie_26752088_comments').find({}).sort({'date':-1}).limit(3) 9 | 10 | /* 1 */ 11 | { 12 | "_id" : ObjectId("5b477a3d14170f331411152d"), 13 | "author" : "呵呵", 14 | "date" : ISODate("2018-07-12T08:00:00.000+08:00"), 15 | "star" : null, 16 | "vote" : 0, 17 | "comment" : "想看", 18 | "is_visit" : false 19 | } 20 | 21 | /* 2 */ 22 | { 23 | "_id" : ObjectId("5b477a3d14170f3314111560"), 24 | "author" : "爱的物语", 25 | "date" : ISODate("2018-07-12T08:00:00.000+08:00"), 26 | "star" : null, 27 | "vote" : 0, 28 | "comment" : "小病自我診斷\n大病自我了斷\n社會初級階段\n別給國家添亂", 29 | "is_visit" : false 30 | } 31 | 32 | /* 3 */ 33 | { 34 | "_id" : ObjectId("5b477a4014170f33141115d7"), 35 | "author" : "小小书童", 36 | "date" : ISODate("2018-07-12T08:00:00.000+08:00"), 37 | "star" : null, 38 | "vote" : 0, 39 | "comment" : "海报跟 辩护人 迷之相似~", 40 | "is_visit" : false 41 | } 42 | ``` 43 | 44 | #### 分析 45 | - 点赞Top10 46 | - 短评日期分布、走势 47 | - 词频统计(中文分词),生成词云 48 | - 评分阶梯分布、日期分布、日期走势 49 | - 每日评论数走势 50 | 51 | ###### 词云(词频统计) 52 | ![](./analysis/images/wc.png) 53 | 54 | ###### 15天短评量分布图 55 | ![](./analysis/images/15天评论量分布图.png) 56 | 57 | ###### 15天短评量走势图 58 | ![](./analysis/images/15天评论量走势图.png) -------------------------------------------------------------------------------- /analysis/comments_per_day.py: -------------------------------------------------------------------------------- 1 | # 每日评论数,基于此生成日期评论走势图(用Excel生成的 ^_^) 2 | from datetime import datetime 3 | 4 | import pymongo 5 | from bson import Code 6 | 7 | # http://api.mongodb.com/python/current/ 8 | # http://api.mongodb.com/python/current/api/pymongo/collection.html#pymongo.collection.Collection.map_reduce 9 | 10 | with pymongo.MongoClient(host='192.168.0.105') as client: 11 | comments = client.douban.movie_26752088_comments 12 | 13 | fn_map = Code(""" 14 | function () { 15 | if (this.date != null) { 16 | emit(this.date, 1); 17 | } 18 | } 19 | """) 20 | 21 | fn_reduce = Code(""" 22 | function (key, values) { 23 | var total = 0; 24 | for (var i = 0; i < values.length; i++) { 25 | total += values[i]; 26 | } 27 | return total; 28 | } 29 | """) 30 | 31 | # pymongo.collection.Collection 32 | results = comments.map_reduce(fn_map, fn_reduce, out="mr_results") 33 | # 取最近15天数据 34 | for col in results.find().sort([('_id', -1)]).limit(15): 35 | # 格式化输出 36 | print(col['_id'].strftime('%Y-%m-%d'), '\t', int(col['value'])) 37 | 38 | # 删除生成的结果集合 39 | client.douban.mr_results.drop() 40 | 41 | # 取最近15天评论量分布及走势 42 | # ./images/15天评论量分布图.png 43 | # ./images/15天评论量走势图.png 44 | -------------------------------------------------------------------------------- /analysis/images/15天评论量分布图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/analysis/images/15天评论量分布图.png -------------------------------------------------------------------------------- /analysis/images/15天评论量走势图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/analysis/images/15天评论量走势图.png -------------------------------------------------------------------------------- /analysis/images/wc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/analysis/images/wc.png -------------------------------------------------------------------------------- /analysis/vote_top_10.py: -------------------------------------------------------------------------------- 1 | # 遍历点赞数Top 10短评数据 2 | 3 | import pymongo 4 | 5 | # 取点赞最多的前10条短评 6 | with pymongo.MongoClient(host='192.168.0.105') as client: 7 | comments = client.douban.movie_26752088_comments 8 | 9 | for doc in comments.find().sort([('vote', -1)]).limit(10): 10 | print('author = {}, date = {}, vote = {}, comment = {}'.format( 11 | doc.get('author'), 12 | doc.get('date'), 13 | doc.get('vote'), 14 | doc.get('comment') 15 | )) 16 | 17 | # author = 忻钰坤, date = 2018-07-04 00:00:00, vote = 28129, comment = “你敢保证你一辈子不得病?”纯粹、直接、有力!常常感叹:电影只能是电影。但每看到这样的佳作,又感慨:电影不只是电影!由衷的希望这部电影大卖!成为话题!成为榜样!成为国产电影最该有的可能。 18 | # author = 沐子荒, date = 2018-07-03 00:00:00, vote = 27237, comment = 王传君所有不被外人理解的坚持,都在这一刻得到了完美释放。他不是关谷神奇,他是王传君。 19 | # 你看,即使依旧烂片如云,只要还有哪怕极少的人坚持,中国影视也终于还是从中生出了茁壮的根。 20 | # 我不是药神,治不好这世界。但能改变一点,总归是会好的。 21 | # author = 凌睿, date = 2018-06-30 00:00:00, vote = 18304, comment = 别说这是“中国版《达拉斯买家俱乐部》”了,这是中国的真实事件改编的中国电影,是属于我们自己的电影。不知道就去百度一下“陆勇”,他卖印度抗癌药的时候《达拉斯买家俱乐部》还没上映呢。所以别提《达拉斯买家俱乐部》了,只会显得你无知。(别私信我了,我800年前就知道《达拉斯》也是真事改编) 22 | # author = 徐若风, date = 2018-06-06 00:00:00, vote = 16426, comment = 放豆瓣语境下,是部时至今日终于拍出来的国产“高分韩国电影”——拿现实题材拍商业类型片,社会性意义摆在那,群戏也处理得相当不错。对我们国家而言,这样的电影多一部是一部,走一步是一步。 23 | # author = 桃桃淘电影, date = 2018-06-19 00:00:00, vote = 13337, comment = 最大的病,其实是穷病。真的被感动了,整体都很成熟,也有些许韩片的影子。几个演员表演都非常出色。可看性和内在的表达都不错。这个世界最荒诞在于,越贴近真实,真实越荒诞。人这一生,太不易了。最后,王传君,加油哦! 24 | # author = 远世祖, date = 2018-06-30 00:00:00, vote = 9102, comment = 文牧野眼睛太毒了,观众的笑点、泪点、痛点被他牢牢抓住,徐峥现在不拼演技开始掏心炸肺放脱自我了,药物在中国绝对是个“不可说”,但这个电影说了它能说的,也不显山不漏水的说了它所不能说的,讲的是现实,但看过电影之后才会明白其实是超现实,2018最佳! 25 | # author = 影志, date = 2018-06-19 00:00:00, vote = 7076, comment = “今后都会越来越好吧,希望这一天早点来”口罩成为符号,不是雾霾,而是人性的仪式,结尾竟然看到《辛德勒名单》一样的救赎。通俗感人,上海电影节首映哭倒一片,基于真实事件改编的社会意义加分,或许《我不是药神》之于中国,就像《摔跤吧爸爸》之于印度吧…能看到就不错。“其实只有一种病:穷病” 26 | # author = Noodles, date = 2018-07-03 00:00:00, vote = 6926, comment = 人生建议:别买零食,吃不下的。 27 | # author = 哪吒男, date = 2018-06-25 00:00:00, vote = 6211, comment = 最喜欢王传君的表演啊,几乎所有泪点都给他了!!而他曾经的同伴们,下月继续拿《爱情公寓》电影版面对观众。这个圈子里还是有不爱赚快钱的年轻演员,真好。 28 | # author = 开开kergelen, date = 2018-07-04 00:00:00, vote = 5549, comment = 小时候路过一家药店,门口的对联写着“只愿世间无疾病,何愁架上药染尘” 29 | -------------------------------------------------------------------------------- /analysis/word_cloud_generator.py: -------------------------------------------------------------------------------- 1 | # 读取Mongo中短评数据,对其进行中文分词,并生成词云 2 | 3 | # 读取Mongo中的短评数据 4 | import pymongo 5 | import jieba 6 | from jieba import analyse 7 | 8 | # https://pypi.org/project/pymongo/ 9 | # http://github.com/mongodb/mongo-python-driver 10 | from matplotlib import pyplot 11 | from wordcloud import WordCloud 12 | 13 | text = None 14 | 15 | with pymongo.MongoClient(host='192.168.0.105', port=27017) as client: 16 | # 获取集合 17 | comments = client.douban.movie_26752088_comments 18 | 19 | # 不知道为什么爬虫只取到了1000条评论~ 20 | print('count:', comments.estimated_document_count()) 21 | 22 | # pymongo.cursor.Cursor 23 | cursor = comments.find() 24 | 25 | # 遍历数据,这里只遍历短评数据(好在数据量并不太大) 26 | text = ''.join(map(lambda doc: doc.get('comment'), cursor)) 27 | 28 | # 对短语数据文本进行分词 29 | # 添加自定义分词 30 | [jieba.add_word(k) for k in []] 31 | 32 | # 取Top50的词生成词云 33 | # https://github.com/fxsjy/jieba#基于-tf-idf-算法的关键词抽取 34 | tags = analyse.extract_tags(text, topK=50, withWeight=False) 35 | new_text = ' '.join(tags) 36 | print(new_text) 37 | 38 | # 对分词文本生成词云 39 | # 生成词云,需要指定支持中文的字体,否则无法生成中文词云 40 | wc = WordCloud( 41 | # 设置词云图片背景色,默认黑色 42 | # background_color='white', 43 | # 设置词云最大单词数 44 | max_words=200, 45 | # 设置词云中字号最大值 46 | # max_font_size=80, 47 | # 设置词云图片宽、高 48 | width=768, 49 | height=1024, 50 | # 设置词云文字字体(美化和解决中文乱码问题) 51 | font_path=r'../example/fonts/FZXingKai-S04S.TTF' 52 | ).generate(new_text) 53 | 54 | # 绘图(标准长方形图) 55 | pyplot.imshow(wc, interpolation='bilinear') 56 | pyplot.figure() 57 | pyplot.axis('off') 58 | # 将图片输出到文件 59 | wc.to_file(r'./images/wc.png') 60 | -------------------------------------------------------------------------------- /crawler/downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | HEADERS = { 4 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', 5 | 'Cookie': '' 6 | } 7 | 8 | 9 | def download(url): 10 | try: 11 | # 如果不登录抓取的数据可能会很有限(未证实),这里简化处理认证部分逻辑,直接把我的cookie信息复制过来 12 | resp = requests.get(url, 13 | headers=HEADERS, 14 | timeout=3.0) 15 | resp.raise_for_status() 16 | return resp.text 17 | except requests.RequestException as e: 18 | print(e) 19 | except Exception as e: 20 | print(e) -------------------------------------------------------------------------------- /crawler/main.py: -------------------------------------------------------------------------------- 1 | # 爬虫启动入口 2 | 3 | from crawler.manager import Manager 4 | from crawler.downloader import download 5 | from crawler.parser import parse 6 | from crawler.processor import Processor 7 | 8 | movie_id = 26752088 9 | base_url = 'https://movie.douban.com/subject/{}/comments'.format(movie_id) 10 | 11 | 12 | class Crawler(object): 13 | 14 | def __init__(self): 15 | self._manager = Manager(base_url) 16 | self._processor = Processor(host='192.168.0.105', 17 | collection='movie_{}_comments'.format(movie_id)) 18 | 19 | def start(self, urls): 20 | """ 21 | 启动爬虫方法 22 | :param urls: 启动URL 23 | :return: 抓取的URL数量 24 | """ 25 | number = 0 26 | self._manager.append_new_urls(urls) 27 | while self._manager.has_new_url(): 28 | number += 1 29 | new_url = self._manager.get_new_url() 30 | print('开始下载第{:03}个URL:{}'.format(number, new_url)) 31 | html = download(new_url) 32 | if html is None: 33 | # print('html is empty .') 34 | continue 35 | links, results = parse(html, new_url) 36 | if len(links) > 0: 37 | self._manager.append_new_urls(links) 38 | if len(results) > 0: 39 | self._processor.process(results) 40 | return number 41 | 42 | 43 | if __name__ == "__main__": 44 | crawler = Crawler() 45 | # 同时抓取看过和未看过的链接,两者区别在于status查询参数上 46 | root_urls = ['?'.join([base_url, 'start=0&limit=20&sort=new_score&status=P']), 47 | '?'.join([base_url, 'start=0&limit=20&sort=new_score&status=F'])] 48 | nums = crawler.start(root_urls) 49 | print('爬虫执行完成,共抓取{}个URL'.format(nums)) 50 | -------------------------------------------------------------------------------- /crawler/manager.py: -------------------------------------------------------------------------------- 1 | class Manager(object): 2 | 3 | def __init__(self, base_url=None): 4 | self.base_url = base_url 5 | self.new_urls = [] 6 | self.old_urls = [] 7 | 8 | def append_new_urls(self, urls): 9 | if len(urls) == 0: 10 | return 11 | for url in urls: 12 | # 过滤非目标URL 13 | if self.base_url not in url: 14 | continue 15 | # 排序倒序数据,避免重复抓取 16 | if '&limit=-20' in url: 17 | continue 18 | # 去掉多余查询参数 19 | if '&percent_type=' in url: 20 | url = url.replace('&percent_type=', '') 21 | # URL重复检查 22 | if url not in self.new_urls and url not in self.old_urls: 23 | self.new_urls.append(url) 24 | 25 | def has_new_url(self): 26 | return len(self.new_urls) > 0 27 | 28 | def get_new_url(self): 29 | """ 30 | 获取一个新的URL,内部隐含了URL抓取过后加入已抓取队列操作(所以这里不考虑实际抓取过程中的失败情况) 31 | :return: 32 | """ 33 | url = self.new_urls.pop() 34 | self.old_urls.append(url) 35 | return url 36 | -------------------------------------------------------------------------------- /crawler/parser.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import urllib.parse 3 | 4 | from bs4 import BeautifulSoup 5 | 6 | 7 | def parse(html, url): 8 | soup = BeautifulSoup(html, 'html.parser') 9 | 10 | # 超链接列表 11 | links = [] 12 | for a in soup.select('#paginator > a'): 13 | links.append(urllib.parse.urljoin(url, a.get('href'))) 14 | 15 | # 数据列表 16 | results = [] 17 | # 根据 status 参数判断用户是否看过 18 | is_visit = ('status=P' in url) 19 | for div in soup.select('#comments > div.comment-item'): 20 | author = div.select_one('h3 > span.comment-info > a').get_text(strip=True) 21 | date = div.select_one('h3 > span.comment-info > span.comment-time').get_text(strip=True) 22 | rating = div.select_one('h3 > span.comment-info > span.rating') 23 | star = None 24 | if rating is not None: 25 | star = rating.get('class')[0].replace('allstar', '') 26 | vote = div.select_one('h3 > span.comment-vote > span.votes').get_text(strip=True) 27 | comment = div.select_one('div.comment > p').get_text(strip=True) 28 | results.append({ 29 | 'author': author, 30 | 'date': datetime.datetime.strptime(date, '%Y-%m-%d'), 31 | 'star': star, 32 | 'vote': int(vote), 33 | 'comment': comment, 34 | 'is_visit': is_visit 35 | }) 36 | 37 | return links, results 38 | -------------------------------------------------------------------------------- /crawler/processor.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | 3 | 4 | class Processor(object): 5 | 6 | def __init__(self, host=None, port=27017, database='douban', collection='comments'): 7 | self.client = pymongo.MongoClient(host=host, port=port) 8 | self.database = database 9 | self.collection = collection 10 | 11 | def __del__(self): 12 | self.client.close() 13 | 14 | def process(self, results): 15 | # print(results) 16 | crawler = self.client.get_database(self.database).get_collection(self.collection) 17 | return crawler.insert_many(results) 18 | -------------------------------------------------------------------------------- /example/__jieba.py: -------------------------------------------------------------------------------- 1 | # Python 中文分词模块 2 | # https://github.com/fxsjy/jieba 3 | # https://pypi.org/project/jieba/ 4 | 5 | import jieba 6 | 7 | 8 | def print_iterator(_iter): 9 | print('/'.join(_iter)) 10 | 11 | 12 | # 全模式 13 | # 我/来到/北京/清华/清华大学/华大/大学 14 | print_iterator(jieba.cut("我来到北京清华大学", cut_all=True)) 15 | 16 | # 精确模式 17 | # 我/来到/北京/清华大学 18 | print_iterator(jieba.cut("我来到北京清华大学", cut_all=False)) 19 | 20 | # 默认是精确模式 21 | # 他/来到/了/网易/杭研/大厦 22 | print_iterator(jieba.cut("他来到了网易杭研大厦")) 23 | 24 | # 搜索引擎模式 25 | # 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造 26 | print_iterator(jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")) 27 | 28 | # 自定义词典:一个词占一行,每行分三列,词语、词频(可省略)、词性(可省略),使用空格分隔,文件使用UTF-8编码 29 | # jieba.load_userdict(r'./dicts/my_dict.txt') 30 | 31 | 32 | # 载入自定义字典前 33 | # 她/是/市/创新/办/主任/,/也/是/云/计算/方面/的/专家 34 | print_iterator(jieba.cut('她是市创新办主任,也是云计算方面的专家')) 35 | 36 | # 载入自定义字典 37 | jieba.load_userdict(r'./dicts/my_dict.txt') 38 | 39 | # 载入自定义字典后 40 | # 她/是/市/创新办/主任/,/也是/云计算/方面/的/专家 41 | print_iterator(jieba.cut('她是市创新办主任,也是云计算方面的专家')) 42 | 43 | # 调整词典 44 | 45 | # 使用 add_word(word, freq=None, tag=None) 和 del_word(word) 可在程序中动态修改词典 46 | jieba.add_word('市创新办') 47 | # 她/是/市创新办/主任/,/也是/云计算/方面/的/专家 48 | print_iterator(jieba.cut('她是市创新办主任,也是云计算方面的专家')) 49 | jieba.del_word('也是') 50 | # 她/是/市创新办/主任/,/也/是/云计算/方面/的/专家 51 | print_iterator(jieba.cut('她是市创新办主任,也是云计算方面的专家')) 52 | 53 | # 使用 suggest_freq(segment, tune=True) 可调节单个词语的词频,使其能(或不能)被分出来 54 | # 如果/放到/post/中将/出错/。 55 | print_iterator(jieba.cut('如果放到post中将出错。', HMM=False)) 56 | # 494 57 | print(jieba.suggest_freq(('中', '将'), True)) 58 | # 如果/放到/post/中/将/出错/。 59 | print_iterator(jieba.cut('如果放到post中将出错。', HMM=False)) 60 | -------------------------------------------------------------------------------- /example/__word_cloud.py: -------------------------------------------------------------------------------- 1 | # Python 词云库测试 2 | # https://amueller.github.io/word_cloud/index.html 3 | # https://amueller.github.io/word_cloud/auto_examples/simple.html#sphx-glr-auto-examples-simple-py 4 | import jieba as jieba 5 | from wordcloud import WordCloud, ImageColorGenerator 6 | from matplotlib import pyplot 7 | 8 | # 读取文本文件 9 | # text = open('./__word_cloud.py', 'r', encoding='utf-8').read() 10 | # __file__ 表示当前文件 11 | text = open(__file__, 'r', encoding='utf-8').read() 12 | 13 | # 使用 jieba 分词 14 | # 添加自定义分词 15 | [jieba.add_word(k) for k in ['文本', '文件']] 16 | 17 | # 对文本进行分词处理 18 | seg_list = jieba.cut(text) 19 | new_text = ' '.join(seg_list) 20 | 21 | # 生成词云,需要指定支持中文的字体,否则无法生成中文词云 22 | wc = WordCloud( 23 | # 设置词云图片背景色,默认黑色 24 | # background_color='white', 25 | # 设置词云最大单词数 26 | max_words=200, 27 | # 设置词云中字号最大值 28 | # max_font_size=80, 29 | # 设置词云图片宽、高 30 | width=1024, 31 | height=768, 32 | # 设置词云文字字体(美化和解决中文乱码问题) 33 | font_path=r'./fonts/FZXingKai-S04S.TTF' 34 | ).generate(new_text) 35 | 36 | # 绘图(标准长方形图) 37 | pyplot.imshow(wc, interpolation='bilinear') 38 | # pyplot.figure() 39 | pyplot.axis('off') 40 | # 直接打印图片 41 | # pyplot.show() 42 | # 将图片输出到文件 43 | wc.to_file(r'./images/__wc_1.png') 44 | 45 | # 基于图像着色 46 | background_image = pyplot.imread(r'./images/__background.jpg') 47 | wc = WordCloud( 48 | font_path=r'./fonts/FZXingKai-S04S.TTF', 49 | mask=background_image, 50 | # random_state=32, 51 | # max_font_size=64, 52 | width=1000, 53 | height=833 54 | ).generate(new_text) 55 | # 从背景图片生成颜色值 56 | pyplot.imshow(wc.recolor(color_func=ImageColorGenerator(background_image))) 57 | pyplot.axis('off') 58 | # 绘制背景图片为底色的图片 59 | pyplot.figure() 60 | pyplot.imshow(background_image, cmap=pyplot.cm.gray) 61 | pyplot.axis('off') 62 | # pyplot.show() 63 | wc.to_file(r'./images/__wc_2.png') 64 | -------------------------------------------------------------------------------- /example/dicts/my_dict.txt: -------------------------------------------------------------------------------- 1 | 创新办 2 | 云计算 5 3 | 也是 -------------------------------------------------------------------------------- /example/fonts/FZXingKai-S04S.TTF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/example/fonts/FZXingKai-S04S.TTF -------------------------------------------------------------------------------- /example/images/__background.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/example/images/__background.jpg -------------------------------------------------------------------------------- /example/images/__wc_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/example/images/__wc_1.png -------------------------------------------------------------------------------- /example/images/__wc_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/example/images/__wc_2.png --------------------------------------------------------------------------------