├── .gitignore
├── LICENSE
├── README.md
├── analysis
    ├── comments_per_day.py
    ├── images
    │   ├── 15天评论量分布图.png
    │   ├── 15天评论量走势图.png
    │   └── wc.png
    ├── vote_top_10.py
    └── word_cloud_generator.py
├── crawler
    ├── downloader.py
    ├── main.py
    ├── manager.py
    ├── parser.py
    └── processor.py
└── example
    ├── __jieba.py
    ├── __word_cloud.py
    ├── dicts
        └── my_dict.txt
    ├── fonts
        └── FZXingKai-S04S.TTF
    └── images
        ├── __background.jpg
        ├── __wc_1.png
        └── __wc_2.png


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | .idea
107 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # python-crawler-douban-movie
 2 | 豆瓣电影（短评）爬虫
 3 | 
 4 | 
 5 | #### 爬虫
 6 | ```
 7 | # 爬取到的数据（这里只展示部分数据）
 8 | > db.getCollection('movie_26752088_comments').find({}).sort({'date':-1}).limit(3)
 9 | 
10 | /* 1 */
11 | {
12 |     "_id" : ObjectId("5b477a3d14170f331411152d"),
13 |     "author" : "呵呵",
14 |     "date" : ISODate("2018-07-12T08:00:00.000+08:00"),
15 |     "star" : null,
16 |     "vote" : 0,
17 |     "comment" : "想看",
18 |     "is_visit" : false
19 | }
20 | 
21 | /* 2 */
22 | {
23 |     "_id" : ObjectId("5b477a3d14170f3314111560"),
24 |     "author" : "爱的物语",
25 |     "date" : ISODate("2018-07-12T08:00:00.000+08:00"),
26 |     "star" : null,
27 |     "vote" : 0,
28 |     "comment" : "小病自我診斷\n大病自我了斷\n社會初級階段\n別給國家添亂",
29 |     "is_visit" : false
30 | }
31 | 
32 | /* 3 */
33 | {
34 |     "_id" : ObjectId("5b477a4014170f33141115d7"),
35 |     "author" : "小小书童",
36 |     "date" : ISODate("2018-07-12T08:00:00.000+08:00"),
37 |     "star" : null,
38 |     "vote" : 0,
39 |     "comment" : "海报跟 辩护人 迷之相似～",
40 |     "is_visit" : false
41 | }
42 | ```
43 | 
44 | #### 分析
45 | - 点赞Top10
46 | - 短评日期分布、走势
47 | - 词频统计（中文分词），生成词云
48 | - 评分阶梯分布、日期分布、日期走势
49 | - 每日评论数走势
50 | 
51 | ###### 词云（词频统计）
52 | ![](./analysis/images/wc.png)
53 | 
54 | ###### 15天短评量分布图
55 | ![](./analysis/images/15天评论量分布图.png)
56 | 
57 | ###### 15天短评量走势图
58 | ![](./analysis/images/15天评论量走势图.png)


--------------------------------------------------------------------------------
/analysis/comments_per_day.py:
--------------------------------------------------------------------------------
 1 | # 每日评论数，基于此生成日期评论走势图(用Excel生成的 ^_^)
 2 | from datetime import datetime
 3 | 
 4 | import pymongo
 5 | from bson import Code
 6 | 
 7 | # http://api.mongodb.com/python/current/
 8 | # http://api.mongodb.com/python/current/api/pymongo/collection.html#pymongo.collection.Collection.map_reduce
 9 | 
with pymongo.MongoClient(host='192.168.0.105') as client:
    comments = client.douban.movie_26752088_comments

    # Count comments per day with the aggregation framework instead of
    # Collection.map_reduce: map_reduce was removed in PyMongo 4, and it
    # required a temporary output collection that leaked whenever the
    # printing loop failed before the explicit drop().
    pipeline = [
        # Ignore documents whose date is null or missing (same filter as the
        # old map function's `this.date != null`).
        {'$match': {'date': {'$ne': None}}},
        # One bucket per distinct date; 'value' keeps the field name the old
        # map-reduce output used.
        {'$group': {'_id': '$date', 'value': {'$sum': 1}}},
        # Most recent 15 days first.
        {'$sort': {'_id': -1}},
        {'$limit': 15},
    ]

    for col in comments.aggregate(pipeline):
        # Tab-separated "YYYY-MM-DD <count>" rows, ready to paste into a spreadsheet.
        print(col['_id'].strftime('%Y-%m-%d'), '\t', int(col['value']))
40 | 
41 |     # 取最近15天评论量分布及走势
42 |     # ./images/15天评论量分布图.png
43 |     # ./images/15天评论量走势图.png
44 | 


--------------------------------------------------------------------------------
/analysis/images/15天评论量分布图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/analysis/images/15天评论量分布图.png


--------------------------------------------------------------------------------
/analysis/images/15天评论量走势图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/analysis/images/15天评论量走势图.png


--------------------------------------------------------------------------------
/analysis/images/wc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/analysis/images/wc.png


--------------------------------------------------------------------------------
/analysis/vote_top_10.py:
--------------------------------------------------------------------------------
 1 | # 遍历点赞数Top 10短评数据
 2 | 
 3 | import pymongo
 4 | 
 5 | # 取点赞最多的前10条短评
 6 | with pymongo.MongoClient(host='192.168.0.105') as client:
 7 |     comments = client.douban.movie_26752088_comments
 8 | 
 9 |     for doc in comments.find().sort([('vote', -1)]).limit(10):
10 |         print('author = {}, date = {}, vote = {}, comment = {}'.format(
11 |             doc.get('author'),
12 |             doc.get('date'),
13 |             doc.get('vote'),
14 |             doc.get('comment')
15 |         ))
16 | 
17 | # author = 忻钰坤, date = 2018-07-04 00:00:00, vote = 28129, comment = “你敢保证你一辈子不得病？”纯粹、直接、有力！常常感叹：电影只能是电影。但每看到这样的佳作，又感慨：电影不只是电影！由衷的希望这部电影大卖！成为话题！成为榜样！成为国产电影最该有的可能。
18 | # author = 沐子荒, date = 2018-07-03 00:00:00, vote = 27237, comment = 王传君所有不被外人理解的坚持，都在这一刻得到了完美释放。他不是关谷神奇，他是王传君。
19 | # 你看，即使依旧烂片如云，只要还有哪怕极少的人坚持，中国影视也终于还是从中生出了茁壮的根。
20 | # 我不是药神，治不好这世界。但能改变一点，总归是会好的。
21 | # author = 凌睿, date = 2018-06-30 00:00:00, vote = 18304, comment = 别说这是“中国版《达拉斯买家俱乐部》”了，这是中国的真实事件改编的中国电影，是属于我们自己的电影。不知道就去百度一下“陆勇”，他卖印度抗癌药的时候《达拉斯买家俱乐部》还没上映呢。所以别提《达拉斯买家俱乐部》了，只会显得你无知。（别私信我了，我800年前就知道《达拉斯》也是真事改编）
22 | # author = 徐若风, date = 2018-06-06 00:00:00, vote = 16426, comment = 放豆瓣语境下，是部时至今日终于拍出来的国产“高分韩国电影”——拿现实题材拍商业类型片，社会性意义摆在那，群戏也处理得相当不错。对我们国家而言，这样的电影多一部是一部，走一步是一步。
23 | # author = 桃桃淘电影, date = 2018-06-19 00:00:00, vote = 13337, comment = 最大的病，其实是穷病。真的被感动了，整体都很成熟，也有些许韩片的影子。几个演员表演都非常出色。可看性和内在的表达都不错。这个世界最荒诞在于，越贴近真实，真实越荒诞。人这一生，太不易了。最后，王传君，加油哦！
24 | # author = 远世祖, date = 2018-06-30 00:00:00, vote = 9102, comment = 文牧野眼睛太毒了，观众的笑点、泪点、痛点被他牢牢抓住，徐峥现在不拼演技开始掏心炸肺放脱自我了，药物在中国绝对是个“不可说”，但这个电影说了它能说的，也不显山不漏水的说了它所不能说的，讲的是现实，但看过电影之后才会明白其实是超现实，2018最佳!
25 | # author = 影志, date = 2018-06-19 00:00:00, vote = 7076, comment = “今后都会越来越好吧，希望这一天早点来”口罩成为符号，不是雾霾，而是人性的仪式，结尾竟然看到《辛德勒名单》一样的救赎。通俗感人，上海电影节首映哭倒一片，基于真实事件改编的社会意义加分，或许《我不是药神》之于中国，就像《摔跤吧爸爸》之于印度吧…能看到就不错。“其实只有一种病：穷病”
26 | # author = Noodles, date = 2018-07-03 00:00:00, vote = 6926, comment = 人生建议：别买零食，吃不下的。
27 | # author = 哪吒男, date = 2018-06-25 00:00:00, vote = 6211, comment = 最喜欢王传君的表演啊，几乎所有泪点都给他了！！而他曾经的同伴们，下月继续拿《爱情公寓》电影版面对观众。这个圈子里还是有不爱赚快钱的年轻演员，真好。
28 | # author = 开开kergelen, date = 2018-07-04 00:00:00, vote = 5549, comment = 小时候路过一家药店，门口的对联写着“只愿世间无疾病，何愁架上药染尘”
29 | 


--------------------------------------------------------------------------------
/analysis/word_cloud_generator.py:
--------------------------------------------------------------------------------
 1 | # 读取Mongo中短评数据，对其进行中文分词，并生成词云
 2 | 
 3 | # 读取Mongo中的短评数据
 4 | import pymongo
 5 | import jieba
 6 | from jieba import analyse
 7 | 
 8 | # https://pypi.org/project/pymongo/
 9 | # http://github.com/mongodb/mongo-python-driver
10 | from matplotlib import pyplot
11 | from wordcloud import WordCloud
12 | 
text = None

with pymongo.MongoClient(host='192.168.0.105', port=27017) as client:
    # Collection holding the crawled short comments.
    comments = client.douban.movie_26752088_comments

    # Original author's note: for some unknown reason the crawler only
    # collected about 1000 comments.
    print('count:', comments.estimated_document_count())

    # Concatenate the comment bodies only. Documents without a comment are
    # skipped (joining None raises TypeError), and a newline separator keeps
    # the last word of one comment from fusing with the first word of the next.
    text = '\n'.join(doc['comment'] for doc in comments.find() if doc.get('comment'))

# Register custom words with the tokenizer before extracting keywords.
# The list is empty by default; add domain-specific terms here.
custom_words = []
for word in custom_words:
    jieba.add_word(word)

# Keep the 50 highest-ranked keywords (TF-IDF based extraction).
# https://github.com/fxsjy/jieba#基于-tf-idf-算法的关键词抽取
tags = analyse.extract_tags(text, topK=50, withWeight=False)
new_text = ' '.join(tags)
print(new_text)

# Build the word cloud. A font that contains CJK glyphs must be supplied,
# otherwise Chinese words cannot be rendered.
wc = WordCloud(
    # Background colour of the image (library default is black)
    # background_color='white',
    # Maximum number of words in the cloud
    max_words=200,
    # Largest font size used
    # max_font_size=80,
    # Image width and height
    width=768,
    height=1024,
    # Font used for the words (looks nicer and fixes garbled Chinese text)
    font_path=r'../example/fonts/FZXingKai-S04S.TTF'
).generate(new_text)

# Draw the cloud as a plain rectangular image. No extra pyplot.figure() call
# here: it would open a second, empty figure and axis('off') would then apply
# to that empty figure instead of the word cloud.
pyplot.imshow(wc, interpolation='bilinear')
pyplot.axis('off')
# Write the image straight to a file.
wc.to_file(r'./images/wc.png')
60 | 


--------------------------------------------------------------------------------
/crawler/downloader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | HEADERS = {
 4 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
 5 |     'Cookie': ''
 6 | }
 7 | 
 8 | 
 9 | def download(url):
10 |     try:
11 |         # 如果不登录抓取的数据可能会很有限（未证实），这里简化处理认证部分逻辑，直接把我的cookie信息复制过来
12 |         resp = requests.get(url,
13 |                             headers=HEADERS,
14 |                             timeout=3.0)
15 |         resp.raise_for_status()
16 |         return resp.text
17 |     except requests.RequestException as e:
18 |         print(e)
19 |     except Exception as e:
20 |         print(e)


--------------------------------------------------------------------------------
/crawler/main.py:
--------------------------------------------------------------------------------
 1 | # 爬虫启动入口
 2 | 
 3 | from crawler.manager import Manager
 4 | from crawler.downloader import download
 5 | from crawler.parser import parse
 6 | from crawler.processor import Processor
 7 | 
 8 | movie_id = 26752088
 9 | base_url = 'https://movie.douban.com/subject/{}/comments'.format(movie_id)
10 | 
11 | 
class Crawler(object):
    """Drives the crawl loop: take a URL, download it, parse it, store the results."""

    def __init__(self):
        collection_name = 'movie_{}_comments'.format(movie_id)
        self._manager = Manager(base_url)
        self._processor = Processor(host='192.168.0.105',
                                    collection=collection_name)

    def start(self, urls):
        """
        Run the crawler until the URL manager has nothing left to hand out.

        :param urls: seed URLs
        :return: number of URLs taken from the manager (failed downloads included)
        """
        self._manager.append_new_urls(urls)
        number = 0
        while self._manager.has_new_url():
            number += 1
            new_url = self._manager.get_new_url()
            print('开始下载第{:03}个URL：{}'.format(number, new_url))
            html = download(new_url)
            if html is not None:
                links, results = parse(html, new_url)
                # Queue newly discovered pagination links.
                if links:
                    self._manager.append_new_urls(links)
                # Persist the parsed comments.
                if results:
                    self._processor.process(results)
        return number
41 | 
42 | 
if __name__ == "__main__":
    crawler = Crawler()
    # Crawl both listings: status=P and status=F differ only in that query
    # parameter (watched vs. not-yet-watched comments).
    query_template = 'start=0&limit=20&sort=new_score&status={}'
    root_urls = [base_url + '?' + query_template.format(status) for status in ('P', 'F')]
    nums = crawler.start(root_urls)
    print('爬虫执行完成，共抓取{}个URL'.format(nums))
50 | 


--------------------------------------------------------------------------------
/crawler/manager.py:
--------------------------------------------------------------------------------
 1 | class Manager(object):
 2 | 
 3 |     def __init__(self, base_url=None):
 4 |         self.base_url = base_url
 5 |         self.new_urls = []
 6 |         self.old_urls = []
 7 | 
 8 |     def append_new_urls(self, urls):
 9 |         if len(urls) == 0:
10 |             return
11 |         for url in urls:
12 |             # 过滤非目标URL
13 |             if self.base_url not in url:
14 |                 continue
15 |             # 排序倒序数据，避免重复抓取
16 |             if '&limit=-20' in url:
17 |                 continue
18 |             # 去掉多余查询参数
19 |             if '&percent_type=' in url:
20 |                 url = url.replace('&percent_type=', '')
21 |             # URL重复检查
22 |             if url not in self.new_urls and url not in self.old_urls:
23 |                 self.new_urls.append(url)
24 | 
25 |     def has_new_url(self):
26 |         return len(self.new_urls) > 0
27 | 
28 |     def get_new_url(self):
29 |         """
30 |         获取一个新的URL，内部隐含了URL抓取过后加入已抓取队列操作(所以这里不考虑实际抓取过程中的失败情况)
31 |         :return:
32 |         """
33 |         url = self.new_urls.pop()
34 |         self.old_urls.append(url)
35 |         return url
36 | 


--------------------------------------------------------------------------------
/crawler/parser.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import urllib.parse
 3 | 
 4 | from bs4 import BeautifulSoup
 5 | 
 6 | 
 7 | def parse(html, url):
 8 |     soup = BeautifulSoup(html, 'html.parser')
 9 | 
10 |     # 超链接列表
11 |     links = []
12 |     for a in soup.select('#paginator > a'):
13 |         links.append(urllib.parse.urljoin(url, a.get('href')))
14 | 
15 |     # 数据列表
16 |     results = []
17 |     # 根据 status 参数判断用户是否看过
18 |     is_visit = ('status=P' in url)
19 |     for div in soup.select('#comments > div.comment-item'):
20 |         author = div.select_one('h3 > span.comment-info > a').get_text(strip=True)
21 |         date = div.select_one('h3 > span.comment-info > span.comment-time').get_text(strip=True)
22 |         rating = div.select_one('h3 > span.comment-info > span.rating')
23 |         star = None
24 |         if rating is not None:
25 |             star = rating.get('class')[0].replace('allstar', '')
26 |         vote = div.select_one('h3 > span.comment-vote > span.votes').get_text(strip=True)
27 |         comment = div.select_one('div.comment > p').get_text(strip=True)
28 |         results.append({
29 |             'author': author,
30 |             'date': datetime.datetime.strptime(date, '%Y-%m-%d'),
31 |             'star': star,
32 |             'vote': int(vote),
33 |             'comment': comment,
34 |             'is_visit': is_visit
35 |         })
36 | 
37 |     return links, results
38 | 


--------------------------------------------------------------------------------
/crawler/processor.py:
--------------------------------------------------------------------------------
 1 | import pymongo
 2 | 
 3 | 
 4 | class Processor(object):
 5 | 
 6 |     def __init__(self, host=None, port=27017, database='douban', collection='comments'):
 7 |         self.client = pymongo.MongoClient(host=host, port=port)
 8 |         self.database = database
 9 |         self.collection = collection
10 | 
11 |     def __del__(self):
12 |         self.client.close()
13 | 
14 |     def process(self, results):
15 |         # print(results)
16 |         crawler = self.client.get_database(self.database).get_collection(self.collection)
17 |         return crawler.insert_many(results)
18 | 


--------------------------------------------------------------------------------
/example/__jieba.py:
--------------------------------------------------------------------------------
 1 | # Python 中文分词模块
 2 | # https://github.com/fxsjy/jieba
 3 | # https://pypi.org/project/jieba/
 4 | 
 5 | import jieba
 6 | 
 7 | 
 8 | def print_iterator(_iter):
 9 |     print('/'.join(_iter))
10 | 
11 | 
# Sample sentences reused by the demonstrations below.
_UNIVERSITY_SENTENCE = "我来到北京清华大学"
_EXPERT_SENTENCE = '她是市创新办主任，也是云计算方面的专家'
_POST_SENTENCE = '如果放到post中将出错。'

# Full mode
# 我/来到/北京/清华/清华大学/华大/大学
print_iterator(jieba.cut(_UNIVERSITY_SENTENCE, cut_all=True))

# Accurate mode
# 我/来到/北京/清华大学
print_iterator(jieba.cut(_UNIVERSITY_SENTENCE, cut_all=False))

# Accurate mode is the default
# 他/来到/了/网易/杭研/大厦
print_iterator(jieba.cut("他来到了网易杭研大厦"))

# Search-engine mode
# 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/，/后/在/日本/京都/大学/日本京都大学/深造
print_iterator(jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造"))

# Custom dictionary: one word per line with three space-separated columns -
# word, frequency (optional), part of speech (optional); the file is UTF-8 encoded.
# jieba.load_userdict(r'./dicts/my_dict.txt')


# Before loading the custom dictionary
# 她/是/市/创新/办/主任/，/也/是/云/计算/方面/的/专家
print_iterator(jieba.cut(_EXPERT_SENTENCE))

# Load the custom dictionary
jieba.load_userdict(r'./dicts/my_dict.txt')

# After loading the custom dictionary
# 她/是/市/创新办/主任/，/也是/云计算/方面/的/专家
print_iterator(jieba.cut(_EXPERT_SENTENCE))

# Adjusting the dictionary

# add_word(word, freq=None, tag=None) and del_word(word) modify the dictionary at run time
jieba.add_word('市创新办')
# 她/是/市创新办/主任/，/也是/云计算/方面/的/专家
print_iterator(jieba.cut(_EXPERT_SENTENCE))
jieba.del_word('也是')
# 她/是/市创新办/主任/，/也/是/云计算/方面/的/专家
print_iterator(jieba.cut(_EXPERT_SENTENCE))

# suggest_freq(segment, tune=True) tunes the frequency of a single word so that
# it can (or cannot) be split out
# 如果/放到/post/中将/出错/。
print_iterator(jieba.cut(_POST_SENTENCE, HMM=False))
# 494
suggested_frequency = jieba.suggest_freq(('中', '将'), True)
print(suggested_frequency)
# 如果/放到/post/中/将/出错/。
print_iterator(jieba.cut(_POST_SENTENCE, HMM=False))
60 | 


--------------------------------------------------------------------------------
/example/__word_cloud.py:
--------------------------------------------------------------------------------
 1 | # Python 词云库测试
 2 | # https://amueller.github.io/word_cloud/index.html
 3 | # https://amueller.github.io/word_cloud/auto_examples/simple.html#sphx-glr-auto-examples-simple-py
 4 | import jieba as jieba
 5 | from wordcloud import WordCloud, ImageColorGenerator
 6 | from matplotlib import pyplot
 7 | 
 8 | # 读取文本文件
 9 | # text = open('./__word_cloud.py', 'r', encoding='utf-8').read()
10 | # __file__ 表示当前文件
# NOTE: this script reads its own source as the word-cloud input, so the
# original Chinese comments below are kept on purpose - they are the data.
# The file is opened in a with-block so the handle is closed after reading.
with open(__file__, 'r', encoding='utf-8') as source_file:
    text = source_file.read()

# 使用 jieba 分词
# 添加自定义分词
# (register custom words; a plain loop, since add_word is called only for its side effect)
for custom_word in ['文本', '文件']:
    jieba.add_word(custom_word)

# 对文本进行分词处理
seg_list = jieba.cut(text)
new_text = ' '.join(seg_list)

# 生成词云，需要指定支持中文的字体，否则无法生成中文词云
# (a font containing CJK glyphs is required to render Chinese words)
wc = WordCloud(
    # 设置词云图片背景色，默认黑色
    # background_color='white',
    # 设置词云最大单词数
    max_words=200,
    # 设置词云中字号最大值
    # max_font_size=80,
    # 设置词云图片宽、高
    width=1024,
    height=768,
    # 设置词云文字字体(美化和解决中文乱码问题)
    font_path=r'./fonts/FZXingKai-S04S.TTF'
).generate(new_text)

# 绘图(标准长方形图)
pyplot.imshow(wc, interpolation='bilinear')
# pyplot.figure()
pyplot.axis('off')
# 直接打印图片
# pyplot.show()
# 将图片输出到文件
wc.to_file(r'./images/__wc_1.png')

# 基于图像着色
# (second cloud: shaped by and coloured from a background image)
background_image = pyplot.imread(r'./images/__background.jpg')
wc = WordCloud(
    font_path=r'./fonts/FZXingKai-S04S.TTF',
    mask=background_image,
    # random_state=32,
    # max_font_size=64,
    width=1000,
    height=833
).generate(new_text)
# 从背景图片生成颜色值
pyplot.imshow(wc.recolor(color_func=ImageColorGenerator(background_image)))
pyplot.axis('off')
# 绘制背景图片为底色的图片
pyplot.figure()
pyplot.imshow(background_image, cmap=pyplot.cm.gray)
pyplot.axis('off')
# pyplot.show()
wc.to_file(r'./images/__wc_2.png')
64 | 


--------------------------------------------------------------------------------
/example/dicts/my_dict.txt:
--------------------------------------------------------------------------------
1 | 创新办
2 | 云计算 5
3 | 也是


--------------------------------------------------------------------------------
/example/fonts/FZXingKai-S04S.TTF:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/example/fonts/FZXingKai-S04S.TTF


--------------------------------------------------------------------------------
/example/images/__background.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/example/images/__background.jpg


--------------------------------------------------------------------------------
/example/images/__wc_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/example/images/__wc_1.png


--------------------------------------------------------------------------------
/example/images/__wc_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zlikun/python-crawler-douban-movie/5ba05291473db540f2c8539170ebd95f5dd379af/example/images/__wc_2.png


--------------------------------------------------------------------------------