├── .gitignore ├── LICENSE ├── README.md ├── comment_analysis ├── .idea │ ├── comment_analysis.iml │ ├── dataSources.local.xml │ ├── dataSources.xml │ ├── dataSources │ │ └── 45308517-00ac-4b16-8c84-9590de05cab2.xml │ ├── dictionaries │ │ └── ASUS.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── analysis.py ├── dict │ ├── adverb_dict.txt │ ├── conjunction_dict.txt │ ├── denial_dict.txt │ ├── negative_dict.txt │ ├── phrase_dict.txt │ ├── positive_dict.txt │ ├── punctuation_dict.txt │ └── user_dict.txt ├── judge_polarity.py └── log.txt ├── comment_spider ├── .idea │ ├── comment_spider.iml │ ├── dataSources.local.xml │ ├── dataSources.xml │ ├── dataSources │ │ └── 56b1a419-7a84-4661-a69b-42b7df13cf8b.xml │ ├── dictionaries │ │ └── ASUS.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── comment_spider │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── commentspider.py ├── run_spider.py ├── scrapy.cfg └── tools │ ├── jieba_content.py │ ├── sql_tools.py │ ├── stop_words.txt │ └── user_words.txt ├── computer_analysis ├── .idea │ ├── computer_analysis.iml │ ├── dataSources.local.xml │ ├── dataSources.xml │ ├── dataSources │ │ └── 6ce9a5dc-7a40-4f48-a105-d36e592d5e6e.xml │ ├── dictionaries │ │ └── ASUS.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── api │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── migrations │ │ ├── 0001_initial.py │ │ └── __init__.py │ ├── models.py │ ├── search_indexes.py │ ├── tests.py │ └── views.py ├── computer_analysis │ ├── __init__.py │ ├── settings.py │ ├── urls.py │ └── wsgi.py ├── manage.py ├── static │ ├── bar │ │ ├── 100002368328.svg │ │ ├── 11528184498.svg │ │ ├── 11547179910.svg │ │ ├── 15019918741.svg │ │ ├── 20167878769.svg │ │ ├── 3714545.svg │ │ ├── 39987003288.svg │ │ └── 5520838.svg │ ├── jieba_top10_bar │ │ ├── 100002368328.svg │ │ ├── 11528184498.svg │ │ ├── 11547179910.svg │ │ ├── 15019918741.svg │ │ ├── 20167878769.svg │ │ ├── 3714545.svg │ │ ├── 39987003288.svg │ │ └── 5520838.svg │ ├── pie │ │ ├── 100002368328.svg │ │ ├── 11528184498.svg │ │ ├── 11547179910.svg │ │ ├── 15019918741.svg │ │ ├── 20167878769.svg │ │ ├── 3714545.svg │ │ ├── 39987003288.svg │ │ └── 5520838.svg │ └── wordcloud │ │ ├── 100002368328.png │ │ ├── 11528184498.png │ │ ├── 11547179910.png │ │ ├── 15019918741.png │ │ ├── 20167878769.png │ │ ├── 3714545.png │ │ ├── 39987003288.png │ │ └── 5520838.png ├── templates │ └── search │ │ └── indexes │ │ └── api │ │ └── computer_text.txt ├── tools │ ├── STXINGKA.TTF │ ├── decorator.py │ ├── orm2json.py │ ├── pygal_process.py │ └── searchresult2json.py └── whoosh_index │ ├── MAIN_6vchd7acq93n4dv5.seg │ ├── MAIN_WRITELOCK │ ├── MAIN_agm2z7e75evl86bh.seg │ └── _MAIN_2.toc ├── computer_spider ├── .idea │ ├── computer_spider.iml │ ├── dataSources.local.xml │ ├── dataSources.xml │ ├── dataSources │ │ └── 42a1e165-b6b3-4fce-94e9-b34a7f891498.xml │ ├── dictionaries │ │ └── ASUS.xml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── computer_spider │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── cpt_spider.py ├── run_spider.py └── scrapy.cfg └── html ├── .idea ├── dictionaries │ └── ASUS.xml ├── html.iml ├── misc.xml ├── modules.xml └── workspace.xml ├── css ├── bootstrap-theme.css ├── bootstrap-theme.css.map ├── bootstrap-theme.min.css ├── bootstrap-theme.min.css.map ├── bootstrap.css ├── bootstrap.css.map ├── bootstrap.min.css ├── 
bootstrap.min.css.map └── detail.css ├── detail.html ├── fonts ├── glyphicons-halflings-regular.eot ├── glyphicons-halflings-regular.svg ├── glyphicons-halflings-regular.ttf ├── glyphicons-halflings-regular.woff └── glyphicons-halflings-regular.woff2 ├── index.html └── js ├── bootstrap.js ├── bootstrap.min.js ├── detail.js ├── index.js ├── jquery-3.3.1.min.js └── npm.js /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jd-spider 2 | 毕业设计京东商品评论爬虫分析 3 | ### 4 | 京东商品评论爬虫,并以图云的形式展示 5 | 后台数据抛出采用了Django框架进行数据抛出 6 | 前台采用同ajax数据请求 7 | 需要数据库请与我联系QQ614303219 8 | -------------------------------------------------------------------------------- /comment_analysis/.idea/comment_analysis.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /comment_analysis/.idea/dataSources.local.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #@ 7 | ` 8 | 9 | 10 | master_key 11 | root 12 | *:jd_computer 13 | 14 | 15 | -------------------------------------------------------------------------------- /comment_analysis/.idea/dataSources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mysql 6 | true 7 | com.mysql.jdbc.Driver 8 | jdbc:mysql://localhost:3309/jd_computer 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /comment_analysis/.idea/dictionaries/ASUS.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /comment_analysis/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /comment_analysis/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /comment_analysis/analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: taiyc 5 | @file: comment_analysis 6 | @time: 2019/4/2 21:00 7 | ''' 8 | 9 | import pymysql 10 | from judge_polarity import DictClassifier 11 | 12 | class Analysis(object): 13 | def __init__(self): 14 | self.connect = None 15 | self.cursor = None 16 | self.score = 0 17 | self.total_score = 0 18 | 19 | def connect_sql(self): 20 | """ 21 | 连接数据库 22 | :return: 23 | """ 24 | self.connect = pymysql.connect( 25 | host='localhost', 26 | port=3309, 27 | user='root', 28 | password='123456', 29 | db='jd_computer' 30 | ) 31 | self.cursor = self.connect.cursor() 32 | 33 | def close_sql(self): 34 | """ 35 | 关闭数据库 36 | :return: 37 | """ 38 | self.connect.commit() 39 | self.connect.close() 40 | self.cursor.close() 41 | 42 | def get_goods_id(self): 43 | """ 44 | 获取computer表中的computer_id 45 | :return: 46 | """ 47 | self.connect_sql() 48 | sql = 'SELECT computer_id FROM computer' 49 | self.cursor.execute(sql) 50 | results = self.cursor.fetchall() 51 | self.close_sql() 52 | for goods_id in results: 53 | yield goods_id[0] 54 | 55 | def select_comment(self, computer_id): 56 | """ 57 | 查询评论表内所有computer_id的评论 58 | :param good_id: 59 | :return: 60 | """ 61 | self.connect_sql() 62 | sql = f'SELECT content FROM computer_comment WHERE computer_id={computer_id}' 63 | self.cursor.execute(sql) 64 | # 取出goods_id的content结果集 65 | content_data = self.cursor.fetchall() 66 | # 
content数大于99条的进行数据分析 67 | # print(content_data) 68 | if len(content_data) > 99: 69 | self.close_sql() 70 | return content_data 71 | 72 | else: 73 | print(f'------{computer_id}评论不足,仅有{len(content_data)}------') 74 | del_tags_sql = f'DELETE FROM computer_tag WHERE computer_id={computer_id}' 75 | self.cursor.execute(del_tags_sql) 76 | print('删除便签成功') 77 | del_comment_sql = f'DELETE FROM computer_comment WHERE computer_id={computer_id}' 78 | self.cursor.execute(del_comment_sql) 79 | print('删除评论成功') 80 | del_computer_sql = f'DELETE FROM computer WHERE computer_id={computer_id}' 81 | self.cursor.execute(del_computer_sql) 82 | print(f'---------删除笔记本{computer_id}所有信息成功。---------') 83 | self.close_sql() 84 | return None 85 | # 小于则放弃分析 86 | # else: 87 | # self.close_sql() 88 | # return None 89 | 90 | 91 | def analysis_content(self, content): 92 | """ 93 | 分析评论获取评论情感好评率 94 | :param content: 95 | :return: 96 | """ 97 | obj = DictClassifier() 98 | result = obj.analyse_sentence(content) 99 | print(f'{result}: {content}') 100 | self.total_score += result 101 | 102 | def save_result(self, goods_id, good_rate): 103 | """ 104 | :param goods_id: 105 | :return: 106 | """ 107 | print(good_rate) 108 | if int(good_rate) == 1: 109 | good_rate = 0.99 110 | self.connect_sql() 111 | print(f'{goods_id}评论情感分析好评率:{good_rate}') 112 | sql = f'UPDATE computer SET good_rate={good_rate} WHERE computer_id={goods_id}' 113 | self.cursor.execute(sql) 114 | self.close_sql() 115 | 116 | 117 | if __name__ == '__main__': 118 | obj = Analysis() 119 | for good_id in obj.get_goods_id(): 120 | content_data = obj.select_comment(good_id) 121 | # if content_data: 122 | # total_num = len(content_data) 123 | # obj.total_score = 0 124 | # for content in content_data: 125 | # content = content[0] 126 | # obj.analysis_content(content) 127 | # good_rate = round(obj.total_score / total_num, 4) 128 | # obj.save_result(good_id, good_rate) 129 | -------------------------------------------------------------------------------- /comment_analysis/dict/adverb_dict.txt: -------------------------------------------------------------------------------- 1 | 超级 2 2 | 超 2 3 | 都 1.75 4 | 还 1.5 5 | 实在 1.75 6 | 越来越 2 7 | 再也 2 8 | 完全 2 9 | 真是 1.75 10 | 足足 1.75 11 | 大大的 1.75 12 | 巨 2 13 | 最最 2 14 | 老是 1.75 15 | 压根 1.75 16 | 明显 1.5 17 | 18 | 好//1.75 19 | 老//1.75 20 | 过//2 21 | 22 | 最 2 23 | 最为 2 24 | 太 2 25 | 极 2 26 | 极为 2 27 | 极其 2 28 | 极度 2 29 | 极端 2 30 | 至 2 31 | 至为 2 32 | 顶 2 33 | 过于 2 34 | 过分 2 35 | 分外 2 36 | 万分 2 37 | 根本 2 38 | 更 1.75 39 | 更加 1.75 40 | 更其 1.75 41 | 越 1.75 42 | 越发 1.75 43 | 备加 1.75 44 | 愈 1.75 45 | 愈加 1.75 46 | 愈发 1.75 47 | 愈为 1.75 48 | 愈益 1.75 49 | 越加 1.75 50 | 格外 1.75 51 | 益发 1.75 52 | 很 1.75 53 | 挺 1.75 54 | 怪 1.75 55 | 非常 1.75 56 | 特别 1.75 57 | 相当 1.75 58 | 十分 1.75 59 | 好不 1.75 60 | 甚 1.75 61 | 甚为 1.75 62 | 颇 1.75 63 | 颇为 1.75 64 | 异常 1.75 65 | 深为 1.75 66 | 满 1.75 67 | 蛮 1.75 68 | 够 1.75 69 | 多 1.75 70 | 多么 1.75 71 | 殊特 1.75 72 | 大 1.75 73 | 大为 1.75 74 | 何等 1.75 75 | 何其 1.75 76 | 尤其 1.75 77 | 无比尤为 1.75 78 | 不胜 1.75 79 | 较 1.5 80 | 蛮 1.5 81 | 比较 1.5 82 | 较比 1.5 83 | 较为 1.5 84 | 不大 1.5 85 | 不太 1.5 86 | 不很 1.5 87 | 不甚 1.5 88 | 稍 0.8 89 | 稍稍 0.8 90 | 稍微 0.8 91 | 稍为 0.8 92 | 稍许 0.8 93 | 略 0.8 94 | 略略 0.8 95 | 略微 0.8 96 | 略为 0.8 97 | 些微 0.8 98 | 多少 0.8 99 | 有点 0.8 100 | 有些 0.8 -------------------------------------------------------------------------------- /comment_analysis/dict/conjunction_dict.txt: -------------------------------------------------------------------------------- 1 | 并 1.2 2 | 且 1.2 3 | 而 1.2 4 | 虽然 1.2 5 | 不过 1.2 6 | 至于 1.2 7 | 
致 1.2 8 | 不料 1.2 9 | 岂知 1.2 10 | 11 | 也 1.5 12 | 不但 1.5 13 | 其次 1.5 14 | 不仅 1.5 15 | 就是 1.5 16 | 17 | 但是 2 18 | 偏偏 2 19 | 而且 2 20 | 何况 2 21 | 况且 2 22 | 乃至 2 23 | 但 2 24 | 却 2 25 | 然而 2 26 | 只是 2 27 | 28 | 甚至 3 29 | 尤其 3 30 | 居然 3 31 | -------------------------------------------------------------------------------- /comment_analysis/dict/denial_dict.txt: -------------------------------------------------------------------------------- 1 | 没敢 1 2 | 不是 1 3 | 4 | 不 1 5 | 没 1 6 | 无 1 7 | 非 1 8 | 莫 1 9 | 弗 1 10 | 毋 1 11 | 勿 1 12 | 未 1 13 | 否 1 14 | 别 1 15 | 休 1 16 | 無 1 17 | 不曾 1 18 | 未必 1 19 | 没有 1 20 | 不要 1 21 | 难以 1 22 | 未曾 1 23 | 并非 1 24 | 绝不 1 25 | 不可 1 26 | -------------------------------------------------------------------------------- /comment_analysis/dict/phrase_dict.txt: -------------------------------------------------------------------------------- 1 | 电脑……差 -2 2 | 虽然……但 -1.5 3 | 希望……提高……质量 -1 4 | ……蓝屏 -2 5 | 再也不…… -3 6 | 不会再…… -3 7 | 8 | 没有……特点 -1 9 | 像素……好 -1 10 | 没……好 -1 start:1 end:6 11 | 没……便宜 -1 start:1 end:6 12 | 13 | 比……贵 -1 14 | 没……值 -1 15 | 没……实用 -1 16 | 玩……卡 -2 17 | 玩……黑屏 -2 18 | 等……很久 -2 19 | 20 | (和|跟)……不(同|一样) -1 21 | 22 | 物流……快 2 23 | 物流……慢 -2 -------------------------------------------------------------------------------- /comment_analysis/dict/punctuation_dict.txt: -------------------------------------------------------------------------------- 1 | ! 2 2 | ! 2 3 | ~ 1.2 4 | ~ 1.2 5 | … 1.2 6 | .. 1.1 7 | ... 1.1 8 | .... 1.2 9 | ..... 1.2 10 | ...... 1.2 11 | ....... 1.2 12 | ........ 1.2 13 | ......... 1.3 14 | .......... 1.3 15 | ........... 1.3 16 | ............ 1.3 17 | ............. 1.3 18 | .............. 1.3 19 | ............... 1.3 20 | ................ 1.3 -------------------------------------------------------------------------------- /comment_analysis/dict/user_dict.txt: -------------------------------------------------------------------------------- 1 | 超薄 2 | 携带方便 3 | 特别喜欢 4 | 系统稳定 5 | 轻薄小巧 6 | 携带方便 7 | verygood 8 | 还好 9 | 还可以 10 | 色彩逼真 11 | 音质无敌 12 | 轻薄 13 | 物流好 14 | 五星好评 15 | 神机 16 | 美美哒 17 | 流畅度佳 18 | 小巧玲珑 19 | 方便携带 20 | 轻薄精巧 21 | 功能齐全 22 | 十分流畅 23 | 质量上乘 24 | 稳定可靠 25 | 反应灵敏 26 | 十分结实 27 | 电量充足 28 | 画质清晰 29 | 运行稳定 30 | 反应灵敏 31 | 方便快捷 32 | 时尚大气 33 | 运行超快 34 | 尺寸合适 35 | 极其省电 36 | 不好 37 | 好 38 | 不好用 39 | 散热差 40 | 散热好 41 | 真垃圾 42 | 垃圾 43 | 电脑不错 44 | 耐用 45 | 挺耐用 46 | 真耐用 47 | 手感好 48 | 屏幕大 49 | 手感极差 50 | 手感差 51 | 太贵了 52 | 贵 53 | 买贵了 54 | 不值 55 | 后悔 56 | 不后悔 57 | 太坑了 58 | 难看 59 | 高兴 60 | 上档次 61 | 颜值高 62 | 亏 63 | 真亏 64 | 太卡了 65 | 太卡 66 | 满意 67 | 物流快 68 | 颜值爆炸 69 | 超好玩 70 | 好评 71 | 易发烫 72 | 操作流畅 73 | 反应快速 74 | 游戏不卡顿 75 | 游戏卡顿 76 | 跑分也不低 77 | 跑分不低 78 | 质量有保证 79 | 售后服务好 80 | 颜值无敌 81 | 手感很好 82 | 一点都不卡 83 | 质量不错 84 | 手感极佳 85 | 好喜欢 86 | 很好用 87 | 高大上 88 | 好高大上 89 | 好惊喜 90 | 不会卡 91 | 科技感 92 | 一流 93 | 科技感一流 94 | 物超所值 95 | 有品位 96 | 低调奢华 97 | 做工好 98 | 很时尚 99 | 性价比真的高 100 | 性价比高 101 | 性价比低 102 | 掉漆 103 | 真棒 104 | 独一无二 105 | 别具匠心 106 | 发热严重 107 | 超级精美 108 | 蛮好的 109 | 特别帅 110 | 帅炸了 111 | 最满意 112 | 最喜欢 113 | 非常高清 114 | 最牛逼 115 | 非常好用 116 | 可以指纹 117 | 便宜 118 | 外观好看 119 | 没有卡顿 120 | 很轻 121 | 爱不释手 122 | 功能强大 123 | 配置高 124 | 分辨率低 125 | 分辨率高 126 | 反应也很快 127 | 屏幕艳丽 128 | 屏幕顺滑 129 | 大小合适 130 | 好太多 131 | 很灵活 132 | 音质效果很好 133 | 音质效果好 134 | 音质效果很不好 135 | 音质效果差 136 | 很溜 137 | 屏幕色彩艳丽 138 | 性价比很高 139 | 很值得 140 | 很不错 141 | 完美 142 | 产品质量高 143 | 价格合理 144 | 民族品牌 145 | 产品不错 146 | 值得信赖 147 | 有点卡 148 | 挺不错 149 | 一切满意 150 | 网速快 151 | 网速慢 152 | 有质感 153 | 有手感 154 | 续航很不错 155 | 操作很流畅 156 | 值得购买 157 | 玩游戏爽 158 | 玩游戏不行 159 | 反应很快 160 | 屏幕很大 161 | 值得购买 162 | 非常优越 163 | 使用顺手 164 | 非常满意 165 | 正品 166 | 
吃鸡无压力 167 | 都能满足 168 | 性能好 169 | 屏幕够大 170 | 非常完美 171 | 性能不错 172 | 一点小瑕疵 173 | 有瑕疵 174 | 有点小瑕疵 175 | 稳定 176 | 很稳定 177 | 很快 178 | 不错的选择 179 | 超级好 180 | 反应超快 181 | 快递给力 182 | 不好 183 | 好 184 | 十分不错 185 | 送货速度 186 | 实用性强 187 | 特别流畅 188 | 特方便 189 | 很好上手 190 | 质量挺好 191 | 真心不错 192 | 价格实惠 193 | 质量超级好 194 | 快递速度 195 | 快递给力 196 | 性能优良 197 | 非常不错 198 | 好评 199 | 必须好评 200 | 容易上手 201 | 没问题 202 | 杠杠地 203 | 无瑕疵 204 | 特别快 205 | 非常流畅 206 | 帧数超高 207 | 得心应手 208 | 很惊喜 209 | 很失望 210 | 还行 211 | 商品不错 212 | 差评 213 | 挺好的 214 | 挺好 215 | 很棒 216 | 很惊艳 217 | 惊艳 218 | 超级棒 219 | 可以 220 | 使用流畅 221 | 价钱可以 222 | 太喜欢了 223 | 太好了 224 | 耐用 225 | 实用 226 | 大品牌 227 | 228 | -------------------------------------------------------------------------------- /comment_analysis/log.txt: -------------------------------------------------------------------------------- 1 | 电脑很快就收到了, 2 | [pair('电脑', 'n'), pair('很快', 'd'), pair('就', 'd'), pair('收到', 'v'), pair('了', 'ul'), pair(',', 'x')] 3 | 性价比不错, 4 | [pair('性价比', 'n'), pair('不错', 'a'), pair(',', 'x')] 5 | 非常好用的一个电脑, 6 | [pair('非常好用', 'x'), pair('的', 'uj'), pair('一个', 'm'), pair('电脑', 'n'), pair(',', 'x')] 7 | 整体效果也挺好, 8 | [pair('整体', 'n'), pair('效果', 'n'), pair('也', 'd'), pair('挺好', 'x'), pair(',', 'x')] 9 | 颜值高! 10 | [pair('颜值高', 'x'), pair('!', 'x')] 11 | 12 | 电脑很快就收到了,性价比不错,非常好用的一个电脑,整体效果也挺好,颜值高! 13 | Score:10.375 14 | Sub-clause0: positive:很快 15 | Sub-clause1: positive:不错 16 | Sub-clause2: positive:非常好用 17 | Sub-clause3: conjunction:也 positive:挺好 18 | Sub-clause4: positive:颜值高 punctuation:! 19 | {'score': 10.375, 'su-clause0': {'score': 1.75, 'positive': [{'key': '很快', 'adverb': [], 'denial': [], 'value': 1.75, 'score': 1.75}], 'negative': [], 'conjunction': [], 'punctuation': [], 'pattern': []}, 'su-clause1': {'score': 1.0, 'positive': [{'key': '不错', 'adverb': [], 'denial': [], 'value': 1.0, 'score': 1.0}], 'negative': [], 'conjunction': [], 'punctuation': [], 'pattern': []}, 'su-clause2': {'score': 2.0, 'positive': [{'key': '非常好用', 'adverb': [], 'denial': [], 'value': 2.0, 'score': 2.0}], 'negative': [], 'conjunction': [], 'punctuation': [], 'pattern': []}, 'su-clause3': {'score': 2.625, 'positive': [{'key': '挺好', 'adverb': [], 'denial': [], 'value': 1.75, 'score': 1.75}], 'negative': [], 'conjunction': [{'key': '也', 'value': 1.5}], 'punctuation': [], 'pattern': []}, 'su-clause4': {'score': 3.0, 'positive': [{'key': '颜值高', 'adverb': [], 'denial': [], 'value': 1.5, 'score': 1.5}], 'negative': [], 'conjunction': [], 'punctuation': [{'key': '!', 'value': 2.0}], 'pattern': []}} 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /comment_spider/.idea/comment_spider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /comment_spider/.idea/dataSources.local.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #@ 7 | ` 8 | 9 | 10 | master_key 11 | root 12 | *:jd_computer 13 | 14 | 15 | -------------------------------------------------------------------------------- /comment_spider/.idea/dataSources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mysql 6 | true 7 | com.mysql.jdbc.Driver 8 | jdbc:mysql://localhost:3309/jd_computer 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- 
/comment_spider/.idea/dictionaries/ASUS.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /comment_spider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /comment_spider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /comment_spider/comment_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/comment_spider/comment_spider/__init__.py -------------------------------------------------------------------------------- /comment_spider/comment_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CommentSpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /comment_spider/comment_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CommentSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class CommentSpiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /comment_spider/comment_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class CommentSpiderPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /comment_spider/comment_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for comment_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | from tools.sql_tools import get_exists_comments 12 | BOT_NAME = 'comment_spider' 13 | 14 | SPIDER_MODULES = ['comment_spider.spiders'] 15 | NEWSPIDER_MODULE = 'comment_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'comment_spider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 5 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 0.1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36' 46 | } 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'comment_spider.middlewares.CommentSpiderSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'comment_spider.middlewares.CommentSpiderDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'comment_spider.pipelines.CommentSpiderPipeline': 1, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | 94 
| 95 | 96 | 97 | 98 | EXISTS_CONMENTS = get_exists_comments() 99 | -------------------------------------------------------------------------------- /comment_spider/comment_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /comment_spider/comment_spider/spiders/commentspider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import json, re 4 | from tools.sql_tools import * 5 | from tools.jieba_content import get_jieba_comment 6 | from ..items import CommentSpiderItem 7 | from ..settings import EXISTS_CONMENTS 8 | 9 | class CommentspiderSpider(scrapy.Spider): 10 | name = 'commentspider' 11 | allowed_domains = ['jd.com'] 12 | start_urls = get_start_urls() 13 | 14 | 15 | def parse(self, response): 16 | if response.text: 17 | json_obj = json.loads(response.text) 18 | if json_obj: 19 | tag_data = json_obj['hotCommentTagStatistics'] 20 | tags = '|'.join([tag['name'] for tag in tag_data]) 21 | count = '|'.join([str(tag['count']) for tag in tag_data]) 22 | url = response._url 23 | page_num = int(url.split('&')[4].split('=')[1]) 24 | computer_id = int(url.split('&')[1].split('=')[1]) 25 | comments = json_obj['comments'] 26 | # 保存数据 27 | if page_num == 1: 28 | save_tags(tags, count, computer_id) 29 | if 0 < len(comments) < 10: 30 | for comment in comments: 31 | comment_id = str(computer_id) + str(comment['id']) 32 | content = re.sub(r"…|\.| |~|'", '', comment['content']) 33 | print(content) 34 | jieba_content = get_jieba_comment(content) 35 | print(jieba_content) 36 | create_time = comment['creationTime'] 37 | score = comment['score'] 38 | print(comment_id, content, jieba_content, score, create_time, computer_id) 39 | if comment_id in EXISTS_CONMENTS: 40 | print(f'{comment_id} 评论已存在') 41 | else: 42 | save_comment(comment_id, content, jieba_content, score, create_time, computer_id) 43 | # 该商品评论爬取完成更新if_spider字段 44 | update_if_spider(computer_id) 45 | 46 | elif len(comments) == 10: 47 | for comment in comments: 48 | comment_id = str(computer_id) + str(comment['id']) 49 | content = comment['content'].replace(' ', '') 50 | jieba_content = get_jieba_comment(content) 51 | create_time = comment['creationTime'] 52 | score = comment['score'] 53 | print(comment_id, content, jieba_content, score, create_time, computer_id) 54 | if comment_id in EXISTS_CONMENTS: 55 | print(f'{comment_id} 评论已存在') 56 | else: 57 | save_comment(comment_id, content, jieba_content, score, create_time, computer_id) 58 | page_num += 1 59 | if page_num == 101: 60 | # 该商品评论爬取完成更新if_spider字段 61 | update_if_spider(computer_id) 62 | # 找下一页 63 | if page_num < 101: 64 | next_url = f'https://club.jd.com/comment/skuProductPageComments.action?&productId={computer_id}&score=0&sortType=5&page={page_num}&pageSize=10&isShadowSku=0&rid=0&fold=1%27' 65 | yield scrapy.Request(url=next_url, callback=self.parse) 66 | else: 67 | update_if_spider(computer_id) 68 | # 进行下一个商品评论收集 69 | yield CommentspiderSpider() 70 | -------------------------------------------------------------------------------- /comment_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 
4 | @author: taiyc 5 | @file: run_spider 6 | @time: 2019/3/29 21:44 7 | ''' 8 | 9 | from scrapy.cmdline import execute 10 | 11 | execute(['scrapy', 'crawl', 'commentspider']) -------------------------------------------------------------------------------- /comment_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = comment_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = comment_spider 12 | -------------------------------------------------------------------------------- /comment_spider/tools/jieba_content.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: taiyc 5 | @file: jieba_content 6 | @time: 2019/3/30 10:30 7 | ''' 8 | 9 | import jieba 10 | 11 | # 加载用户字典 12 | jieba.load_userdict('tools\\user_words.txt') 13 | 14 | 15 | # 获取stop_words字典 16 | def get_stop_words(): 17 | words = [] 18 | with open('tools\stop_words.txt', 'r', encoding='utf-8') as f: 19 | for line in f.readlines(): 20 | words.append(line.strip()) 21 | return words 22 | 23 | 24 | # 获取评论分词 25 | def get_jieba_comment(string): 26 | result = jieba.cut(string) 27 | jieba_content = '' 28 | stop_words = get_stop_words() 29 | for x in result: 30 | if x not in stop_words: 31 | jieba_content += " " + x 32 | return jieba_content -------------------------------------------------------------------------------- /comment_spider/tools/sql_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: taiyc 5 | @file: sql_tools 6 | @time: 2019/3/29 22:22 7 | ''' 8 | 9 | import pymysql 10 | 11 | def connect_mysql(): 12 | """ 13 | 连接数据库 14 | :return: connect, cursor 15 | """ 16 | connect = pymysql.Connect(host='localhost', user='root', password='123456', port=3309, 17 | db='jd_computer' 18 | ) 19 | cursor = connect.cursor() 20 | return connect, cursor 21 | 22 | def close_mysql(connect, cursor): 23 | """ 24 | 关闭数据库 25 | :param connect: 26 | :return: 27 | """ 28 | connect.commit() 29 | cursor.close() 30 | connect.close() 31 | 32 | # 获取start_urls 33 | def get_start_urls(): 34 | """ 35 | 获取start_urls 36 | :return: 37 | """ 38 | connect, cursor = connect_mysql() 39 | select_sql = 'SELECT computer_id FROM computer WHERE if_spider=0' 40 | cursor.execute(select_sql) 41 | result = cursor.fetchall() 42 | close_mysql(connect, cursor) 43 | print(len(result)) 44 | return [f'https://club.jd.com/comment/skuProductPageComments.action?&productId={url[0]}&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&rid=0&fold=1%27' for url in result] 45 | 46 | # 保存标签 47 | def save_tags(*args): 48 | """ 49 | 保存标签数据to computer_tag 50 | :param args: 51 | :return: 52 | """ 53 | connect, cursor = connect_mysql() 54 | try: 55 | insert_sql = 'INSERT INTO computer_tag(tags, count, computer_id) VALUES (%s, %s, %s)' 56 | if_exists_sql = 'SELECT COUNT(*) FROM computer_tag WHERE computer_id={}'.format(args[2]) 57 | cursor.execute(if_exists_sql) 58 | if not cursor.fetchall()[0][0]: 59 | cursor.execute(insert_sql, args) 60 | print(f'{args[2]}的评论标签数据添加成功。') 61 | else: 62 | print(f'{args[2]}的评论标签数据已存在!') 63 | except Exception as e: 64 | print(f'添加{args[2]}评论标签数据时数据库出现错误!!') 65 | print(e) 66 | finally: 67 
| close_mysql(connect, cursor) 68 | 69 | # 某个商品评论爬取完成将computer表 这个商品的if_spider设为1 70 | def update_if_spider(computer_id): 71 | connect, cursor = connect_mysql() 72 | updata_sql = f'UPDATE computer SET if_spider=1 WHERE computer_id={computer_id}' 73 | cursor.execute(updata_sql) 74 | print(f'{computer_id} 评论爬取完成!') 75 | close_mysql(connect, cursor) 76 | 77 | # 保存评论 78 | def save_comment(*args): 79 | connect, cursor = connect_mysql() 80 | # if_exists_sql = 'SELECT COUNT(*) FROM computer_comment WHERE comment_id={}'.format(args[0]) 81 | insert_sql = 'INSERT INTO computer_comment(comment_id, content, jieba_content, score, create_time, computer_id) VALUES (%s, %s,%s,%s,%s,%s)' 82 | # cursor.execute(if_exists_sql) 83 | # if cursor.fetchall()[0][0]: 84 | # update_sql = 'UPDATE computer_comment SET content=%s,jieba_content=%s WHERE comment_id = %s' 85 | # cursor.execute(update_sql, (args[1], args[2], args[0])) 86 | # close_mysql(connect, cursor) 87 | # print(f'{args[0]}评论已更新!') 88 | # else: 89 | cursor.execute(insert_sql, args) 90 | close_mysql(connect, cursor) 91 | print(f'{args[0]}评论添加成功!') 92 | 93 | def get_exists_comments(): 94 | connect, cursor = connect_mysql() 95 | select_sql = 'SELECT comment_id FROM computer_comment' 96 | cursor.execute(select_sql) 97 | exists_comments = cursor.fetchall() 98 | close_mysql(connect, cursor) 99 | return [x[0] for x in exists_comments] 100 | 101 | -------------------------------------------------------------------------------- /comment_spider/tools/user_words.txt: -------------------------------------------------------------------------------- 1 | 超薄 2 | 携带方便 3 | 特别喜欢 4 | 系统稳定 5 | 轻薄小巧 6 | 携带方便 7 | verygood 8 | 还好 9 | 还可以 10 | 色彩逼真 11 | 音质无敌 12 | 轻薄 13 | 物流好 14 | 五星好评 15 | 神机 16 | 美美哒 17 | 流畅度佳 18 | 小巧玲珑 19 | 方便携带 20 | 轻薄精巧 21 | 功能齐全 22 | 十分流畅 23 | 质量上乘 24 | 稳定可靠 25 | 反应灵敏 26 | 十分结实 27 | 电量充足 28 | 画质清晰 29 | 运行稳定 30 | 反应灵敏 31 | 方便快捷 32 | 时尚大气 33 | 运行超快 34 | 尺寸合适 35 | 极其省电 36 | 不好 37 | 好 38 | 不好用 39 | 散热差 40 | 散热好 41 | 真垃圾 42 | 垃圾 43 | 电脑不错 44 | 耐用 45 | 挺耐用 46 | 真耐用 47 | 手感好 48 | 屏幕大 49 | 手感极差 50 | 手感差 51 | 太贵了 52 | 贵 53 | 买贵了 54 | 不值 55 | 后悔 56 | 不后悔 57 | 太坑了 58 | 难看 59 | 高兴 60 | 上档次 61 | 颜值高 62 | 亏 63 | 真亏 64 | 太卡了 65 | 太卡 66 | 满意 67 | 物流快 68 | 颜值爆炸 69 | 超好玩 70 | 好评 71 | 易发烫 72 | 操作流畅 73 | 反应快速 74 | 游戏不卡顿 75 | 游戏卡顿 76 | 跑分也不低 77 | 跑分不低 78 | 质量有保证 79 | 售后服务好 80 | 颜值无敌 81 | 手感很好 82 | 一点都不卡 83 | 质量不错 84 | 手感极佳 85 | 好喜欢 86 | 很好用 87 | 高大上 88 | 好高大上 89 | 好惊喜 90 | 不会卡 91 | 科技感 92 | 一流 93 | 科技感一流 94 | 物超所值 95 | 有品位 96 | 低调奢华 97 | 做工好 98 | 很时尚 99 | 性价比真的高 100 | 性价比高 101 | 性价比低 102 | 掉漆 103 | 独一无二 104 | 别具匠心 105 | 发热严重 106 | 超级精美 107 | 蛮好的 108 | 特别帅 109 | 帅炸了 110 | 最满意 111 | 最喜欢 112 | 非常高清 113 | 最牛逼 114 | 非常好用 115 | 可以指纹 116 | 便宜 117 | 外观好看 118 | 没有卡顿 119 | 很轻 120 | 爱不释手 121 | 功能强大 122 | 配置高 123 | 分辨率低 124 | 分辨率高 125 | 反应也很快 126 | 屏幕艳丽 127 | 屏幕顺滑 128 | 大小合适 129 | 好太多 130 | 很灵活 131 | 音质效果很好 132 | 音质效果好 133 | 音质效果很不好 134 | 音质效果差 135 | 很溜 136 | 屏幕色彩艳丽 137 | 性价比很高 138 | 很值得 139 | 很不错 140 | 完美 141 | 产品质量高 142 | 价格合理 143 | 民族品牌 144 | 产品不错 145 | 值得信赖 146 | 有点卡 147 | 挺不错 148 | 一切满意 149 | 网速快 150 | 网速慢 151 | 有质感 152 | 有手感 153 | 续航很不错 154 | 操作很流畅 155 | 值得购买 156 | 玩游戏爽 157 | 玩游戏不行 158 | 反应很快 159 | 屏幕很大 160 | 值得购买 161 | 非常优越 162 | 使用顺手 163 | 非常满意 164 | 正品 165 | 吃鸡无压力 166 | 都能满足 167 | 性能好 168 | 屏幕够大 169 | 非常完美 170 | 性能不错 171 | 一点小瑕疵 172 | 有瑕疵 173 | 有点小瑕疵 174 | 稳定 175 | 很稳定 176 | 很快 177 | 不错的选择 178 | 超级好 179 | 反应超快 180 | 快递给力 181 | 不好 182 | 好 183 | 十分不错 184 | 送货速度 185 | 实用性强 186 | 特别流畅 187 | 特方便 188 | 很好上手 189 | 质量挺好 190 | 真心不错 191 | 价格实惠 192 | 质量超级好 193 | 快递速度 194 | 快递给力 195 | 性能优良 
196 | 非常不错 197 | 好评 198 | 必须好评 199 | 容易上手 200 | 没问题 201 | 杠杠地 202 | 无瑕疵 203 | 特别快 204 | 非常流畅 205 | 帧数超高 206 | 得心应手 207 | 很惊喜 208 | 很失望 209 | 还行 210 | 商品不错 211 | 差评 212 | 挺好的 213 | 挺好 214 | 很棒 215 | 很惊艳 216 | 惊艳 217 | 超级棒 218 | 可以 219 | 使用流畅 220 | 价钱可以 221 | 太喜欢了 222 | 太好了 223 | 耐用 224 | 实用 225 | 大品牌 226 | 227 | -------------------------------------------------------------------------------- /computer_analysis/.idea/computer_analysis.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 23 | 24 | 27 | -------------------------------------------------------------------------------- /computer_analysis/.idea/dataSources.local.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #@ 7 | ` 8 | 9 | 10 | master_key 11 | root 12 | *:jd_computer 13 | 14 | 15 | -------------------------------------------------------------------------------- /computer_analysis/.idea/dataSources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mysql 6 | true 7 | com.mysql.jdbc.Driver 8 | jdbc:mysql://localhost:3309/jd_computer 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /computer_analysis/.idea/dictionaries/ASUS.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /computer_analysis/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /computer_analysis/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /computer_analysis/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/api/__init__.py -------------------------------------------------------------------------------- /computer_analysis/api/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 
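# --- Hedged sketch (added by the editor, not part of the original repository) ---
# admin.py is left as the default Django stub. If the scraped tables should be
# browsable in the Django admin, the models declared in api/models.py
# (Computer, Tag, Comment) could be registered roughly as shown below; the
# `admin` name is the import already present at the top of this file.
from .models import Comment, Computer, Tag

admin.site.register(Computer)
admin.site.register(Tag)
admin.site.register(Comment)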
4 | -------------------------------------------------------------------------------- /computer_analysis/api/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class CptAnalysisConfig(AppConfig): 5 | name = 'api' 6 | -------------------------------------------------------------------------------- /computer_analysis/api/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.1.4 on 2019-03-29 05:14 2 | 3 | from django.db import migrations, models 4 | import django.db.models.deletion 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | initial = True 10 | 11 | dependencies = [ 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name='Comment', 17 | fields=[ 18 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 19 | ('comment_id', models.CharField(max_length=40, unique=True)), 20 | ('content', models.TextField()), 21 | ('jieba_content', models.TextField()), 22 | ('score', models.BigIntegerField()), 23 | ('create_time', models.CharField(max_length=30)), 24 | ], 25 | options={ 26 | 'db_table': 'computer_comment', 27 | }, 28 | ), 29 | migrations.CreateModel( 30 | name='Computer', 31 | fields=[ 32 | ('computer_id', models.BigIntegerField(primary_key=True, serialize=False)), 33 | ('brand', models.CharField(max_length=30)), 34 | ('title', models.CharField(max_length=255)), 35 | ('price', models.IntegerField()), 36 | ('img_url', models.CharField(max_length=255)), 37 | ('param', models.TextField()), 38 | ('if_spider', models.BooleanField(default=0)), 39 | ('good_rate', models.FloatField(default=0.0)), 40 | ], 41 | options={ 42 | 'db_table': 'computer', 43 | }, 44 | ), 45 | migrations.CreateModel( 46 | name='Tag', 47 | fields=[ 48 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 49 | ('tags', models.CharField(max_length=255)), 50 | ('count', models.CharField(max_length=200)), 51 | ('computer', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to='api.Computer')), 52 | ], 53 | options={ 54 | 'db_table': 'computer_tag', 55 | }, 56 | ), 57 | migrations.AddField( 58 | model_name='comment', 59 | name='computer', 60 | field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='api.Computer'), 61 | ), 62 | ] 63 | -------------------------------------------------------------------------------- /computer_analysis/api/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/api/migrations/__init__.py -------------------------------------------------------------------------------- /computer_analysis/api/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | 4 | # Create your models here. 
5 | 6 | # 电脑信息表 7 | class Computer(models.Model): 8 | computer_id = models.BigIntegerField(primary_key=True) 9 | brand = models.CharField(max_length=30, null=False) 10 | title = models.CharField(max_length=255) 11 | price = models.IntegerField() 12 | img_url = models.CharField(max_length=255) 13 | param = models.TextField() 14 | if_spider = models.BooleanField(default=False) 15 | good_rate = models.FloatField(default=0.0) 16 | 17 | class Meta: 18 | db_table = 'computer' 19 | 20 | # 评论标签 21 | class Tag(models.Model): 22 | tags = models.CharField(max_length=255, null=False) 23 | count = models.CharField(max_length=200, null=False) 24 | computer = models.OneToOneField(Computer, on_delete=models.CASCADE) 25 | 26 | class Meta: 27 | db_table = 'computer_tag' 28 | 29 | 30 | # 评论内容 31 | class Comment(models.Model): 32 | comment_id = models.CharField(max_length=40, unique=True) 33 | content = models.TextField() 34 | jieba_content = models.TextField() 35 | score = models.BigIntegerField(null=False) 36 | create_time = models.CharField(max_length=30) 37 | computer = models.ForeignKey(Computer, on_delete=models.CASCADE) 38 | 39 | class Meta: 40 | db_table = 'computer_comment' 41 | -------------------------------------------------------------------------------- /computer_analysis/api/search_indexes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: taiyc 5 | @file: search_indexes 6 | @time: 2019/4/3 13:40 7 | ''' 8 | from haystack import indexes 9 | from api.models import Computer 10 | 11 | 12 | class GoodsInfoIndex(indexes.SearchIndex, indexes.Indexable): 13 | text = indexes.CharField(document=True, use_template=True) 14 | 15 | good_rate = indexes.FloatField(model_attr='good_rate') 16 | computer_id = indexes.CharField(model_attr='computer_id') 17 | price = indexes.CharField(model_attr='price') 18 | img_url = indexes.CharField(model_attr='img_url') 19 | title = indexes.CharField(model_attr='title') 20 | 21 | def get_model(self): 22 | return Computer -------------------------------------------------------------------------------- /computer_analysis/api/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
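tests.py is left empty. As a quick, hypothetical sketch of how the three models above relate, and of the query ComputerView issues further down (filter by brand, order by good_rate descending), something like the following could serve as a starting point; the model and field names come from api/models.py, every value is invented:

# Hypothetical test sketch, not part of the repository.
from django.test import TestCase
from api.models import Computer, Comment


class ComputerQueryTest(TestCase):
    def setUp(self):
        self.pc = Computer.objects.create(
            computer_id=100002368328, brand='huawei', title='HUAWEI MateBook',
            price=5699, img_url='https://example.invalid/matebook.jpg',
            param='', good_rate=0.97)
        Comment.objects.create(
            comment_id='c-1', content='很棒', jieba_content='很棒',
            score=5, create_time='2019-04-03 12:00', computer=self.pc)

    def test_brand_filter_ordered_by_good_rate(self):
        # The same queryset the list API builds before paginating.
        qs = Computer.objects.filter(brand='huawei').order_by('-good_rate')
        self.assertEqual(qs.first().computer_id, 100002368328)
        self.assertEqual(qs.first().comment_set.count(), 1)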
4 | -------------------------------------------------------------------------------- /computer_analysis/api/views.py: -------------------------------------------------------------------------------- 1 | import os 2 | from django.views.generic import View 3 | from tools.decorator import allow_origin 4 | from django.utils.decorators import method_decorator 5 | from tools.pygal_process import create_wordcloud, create_pie, create_bar, jieba_top10_bar 6 | # 导入分页模块 7 | from django.core.paginator import Paginator, PageNotAnInteger, InvalidPage, EmptyPage 8 | # 导入表 9 | from .models import Computer 10 | from tools.orm2json import object_to_json 11 | # 导入haystack搜索框架SearchView 12 | from haystack.views import SearchView 13 | from django.http import JsonResponse 14 | 15 | from tools.searchresult2json import sea_result2json 16 | 17 | class ComputerView(View): 18 | @method_decorator(allow_origin) 19 | def get(self, request): 20 | # ?&brand=huawei&page=1&page_size=5 21 | # 1、获取请请求参数 22 | brand = request.GET.get('brand', 'huawei') 23 | page = request.GET.get('page', 1) 24 | page_size = request.GET.get('page_size', 5) 25 | results = Computer.objects.filter(brand=brand).order_by('-good_rate') 26 | # isdigit():判断page、page_size是否纯数字组成 27 | page = int(page) if page.isdigit() else 1 28 | page_size = int(page_size) if page_size.isdigit() else 5 29 | # print(page_size) 30 | # 2、开始分页 31 | paginator = Paginator(results, page_size) 32 | error = 0 33 | try: 34 | # 获取当前页对象 35 | current_page = paginator.page(page) 36 | print(current_page.object_list) 37 | except (EmptyPage, InvalidPage, PageNotAnInteger) as e: 38 | error = '请求参数异常,默认已返回最后一页' 39 | current_page = paginator.page(paginator.num_pages) 40 | # 同时将page改为最后一页 41 | page = paginator.num_pages 42 | # 3、上一页、下一页链接以及总页数 43 | # next_url = f'http://127.0.0.1:8000/phone/?brand={brand}&page={page+1}&page_size={page_size}' if current_page.has_next() else "" 44 | # pre_url = f'http://127.0.0.1:8000/phone/?brand={brand}&page={page-1}&page_size={page_size}' if current_page.has_previous() else "" 45 | total_page = paginator.num_pages 46 | # 4、计算页码 47 | page_numbers = [] 48 | if total_page <= 5: 49 | page_numbers = [x for x in range(1, total_page+1)] 50 | else: 51 | # 移动 52 | if 3 < page < total_page-3: 53 | page_numbers = [x for x in range(page-2, page+3)] 54 | # 不移动 55 | elif page <= 3: 56 | page_numbers = [x for x in range(1, 6)] 57 | # 不移动 58 | elif page >= total_page-3: 59 | page_numbers = [x for x in range(page-5, total_page+1)] 60 | data = { 61 | 'status': 1, 62 | 'error': error, 63 | 'if_has_pre_page': current_page.has_previous(), 64 | 'if_has_next_page': current_page.has_next(), 65 | 'page_numbers': page_numbers, 66 | 'current_page_data': object_to_json(current_page.object_list) 67 | } 68 | # response = JsonResponse(data) 69 | # response['Access-Control-Allow-Origin'] = '*' 70 | return data 71 | 72 | 73 | class DetailView(View): 74 | @method_decorator(allow_origin) 75 | def get(self, request): 76 | computer_id = request.GET.get('computer_id', '') 77 | computer = 0 78 | status = 1 79 | wordcloud_path = 0 80 | pie_path = 0 81 | bar_path = 0 82 | top10_bar_path = 0 83 | try: 84 | computer_obj = Computer.objects.filter(computer_id=computer_id) 85 | computer = object_to_json(computer_obj)[0] 86 | # 词云图 87 | if not os.path.exists(f'static\wordcloud\{computer_id}.png'): 88 | if_success = create_wordcloud(computer_id) 89 | if if_success: 90 | wordcloud_path = f'..\computer_analysis\static\wordcloud\{computer_id}.png' 91 | else: 92 | wordcloud_path = 
f'..\computer_analysis\static\wordcloud\{computer_id}.png' 93 | # 饼状图 94 | if not os.path.exists(f'..\static\pie\{computer_id}.svg'): 95 | if_success = create_pie(computer_id) 96 | if if_success: 97 | pie_path = f'..\computer_analysis\static\pie\{computer_id}.svg' 98 | else: 99 | pie_path = f'..\computer_analysis\static\pie\{computer_id}.svg' 100 | # 柱状图 101 | if not os.path.exists(f'..\static\\bar\{computer_id}.svg'): 102 | if_success = create_bar(computer_id) 103 | if if_success: 104 | bar_path = f'..\computer_analysis\static\\bar\{computer_id}.svg' 105 | else: 106 | bar_path = f'..\computer_analysis\static\\bar\{computer_id}.svg' 107 | # jieba_top10_bar柱图 108 | if not os.path.exists(f'..\static\jieba_top10_bar\{computer_id}.svg'): 109 | if_success = jieba_top10_bar(computer_id) 110 | if if_success: 111 | top10_bar_path = f'..\computer_analysis\static\jieba_top10_bar\{computer_id}.svg' 112 | else: 113 | bar_path = f'..\computer_analysis\static\jieba_top10_bar\{computer_id}.svg' 114 | except Exception as e: 115 | print(e) 116 | status = 0 117 | data = { 118 | 'status': status, 119 | 'computer': computer, 120 | 'wordcloud': wordcloud_path, 121 | 'pie': pie_path, 122 | 'bar': bar_path, 123 | 'top10_bar': top10_bar_path 124 | } 125 | # response = JsonResponse(data) 126 | # response['Access-Control-Allow-Origin'] = '*' 127 | return data 128 | 129 | 130 | 131 | class ComputerSearchView(SearchView): 132 | 133 | def extra_context(self): 134 | context = super(ComputerSearchView, self).extra_context() 135 | key = self.request.GET.get('q') 136 | context['q'] = key 137 | return context 138 | 139 | def build_page(self): 140 | data = { 141 | 'status': 1, 142 | 'error': 0, 143 | 'if_has_pre_page': 0, 144 | 'if_has_next_page': 0, 145 | 'page_numbers': 0, 146 | 'current_page_data': 0 147 | } 148 | try: 149 | page = int(self.request.GET.get('page', 1)) 150 | except (TypeError, ValueError): 151 | data['error'] = '页码值非法!' 152 | page = 0 153 | data['status'] = 0 154 | if page < 1: 155 | data['error'] = '页码值非法!' 
156 | data['status'] = 0 157 | # start_offset = (page - 1) * self.results_per_page 158 | if data['status']: 159 | self.results[:] 160 | self.results = self.results.order_by('-good_rate') 161 | paginator = Paginator(self.results, self.results_per_page) 162 | # self.results[start_offset: start_offset + self.results_per_page] 163 | # self.results在经过上面的切片代码self.results[start_offset:start_offset + self.results_per_page]之后,才有值。在此之前是没有值的,所有排序的order_by必须设置在这句代码的后面 164 | # haystack对搜索结果的分页,并不是将所有的搜索结果全部分页。 165 | # paginator = Paginator(self.results, self.results_per_page) 166 | try: 167 | # 获取当前页对象 168 | current_page = paginator.page(page) 169 | data['current_page_data'] = sea_result2json(current_page.object_list) 170 | data['if_has_next_page'] = current_page.has_next() 171 | data['if_has_pre_page'] = current_page.has_previous() 172 | except (EmptyPage, InvalidPage, PageNotAnInteger) as e: 173 | data['error'] = '请求页码超出范围,默认返回最后一页' 174 | data['current_page_data'] = sea_result2json(paginator.page(paginator.num_pages).object_list) 175 | # 同时将page改为最后一页 176 | page = paginator.num_pages 177 | data['if_has_pre_page'] = True 178 | total_page = paginator.num_pages 179 | # 计算页码 180 | page_numbers = [] 181 | if total_page <= 5: 182 | page_numbers = [x for x in range(1, total_page + 1)] 183 | else: 184 | # 移动 185 | if 3 < page < total_page - 3: 186 | page_numbers = [x for x in range(page - 2, page + 3)] 187 | # 不移动 188 | elif page <= 3: 189 | page_numbers = [x for x in range(1, 6)] 190 | # 不移动 191 | elif page >= total_page - 3: 192 | page_numbers = [x for x in range(page - 5, total_page + 1)] 193 | data['page_numbers'] = page_numbers 194 | return data 195 | return data 196 | 197 | def get_context(self): 198 | data = self.build_page() 199 | response = JsonResponse(data) 200 | # 解决跨域 201 | response['Access-Control-Allow-Origin'] = '*' 202 | return response 203 | 204 | def create_response(self): 205 | response = self.get_context() 206 | return response 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /computer_analysis/computer_analysis/__init__.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | 3 | pymysql.install_as_MySQLdb() -------------------------------------------------------------------------------- /computer_analysis/computer_analysis/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for computer_analysis project. 3 | 4 | Generated by 'django-admin startproject' using Django 2.1.4. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/2.1/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/2.1/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/2.1/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = 'f213_o#bb=_g+j(2r&d!0&%e%b(u4a!w94b2!tp8(wt!$w$mld' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 
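The DATABASES block below stores the MySQL host under the key 'HOME', while Django reads 'HOST'. A corrected sketch of that entry, reusing the credentials the spiders and chart tools use elsewhere in the project (root/123456 on localhost:3309, database jd_computer):

# Sketch of the DATABASES entry using the key Django actually reads ('HOST', not 'HOME').
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'NAME': 'jd_computer',
        'HOST': 'localhost',
        'USER': 'root',
        'PASSWORD': '123456',
        'PORT': 3309,
    }
}

With the typo, Django falls back to its default empty host, which still resolves to a local connection, so the mistake is easy to miss in development.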
26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | 'api', 41 | 'haystack' 42 | ] 43 | 44 | MIDDLEWARE = [ 45 | 'django.middleware.security.SecurityMiddleware', 46 | 'django.contrib.sessions.middleware.SessionMiddleware', 47 | 'django.middleware.common.CommonMiddleware', 48 | 'django.middleware.csrf.CsrfViewMiddleware', 49 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 50 | 'django.contrib.messages.middleware.MessageMiddleware', 51 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 52 | ] 53 | 54 | ROOT_URLCONF = 'computer_analysis.urls' 55 | 56 | TEMPLATES = [ 57 | { 58 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 59 | 'DIRS': ['templates'], 60 | 'APP_DIRS': True, 61 | 'OPTIONS': { 62 | 'context_processors': [ 63 | 'django.template.context_processors.debug', 64 | 'django.template.context_processors.request', 65 | 'django.contrib.auth.context_processors.auth', 66 | 'django.contrib.messages.context_processors.messages', 67 | ], 68 | }, 69 | }, 70 | ] 71 | 72 | WSGI_APPLICATION = 'computer_analysis.wsgi.application' 73 | 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/2.1/ref/settings/#databases 77 | 78 | DATABASES = { 79 | 'default': { 80 | 'ENGINE': 'django.db.backends.mysql', 81 | 'NAME': 'jd_computer', 82 | 'HOME': 'localhost', 83 | 'USER': 'root', 84 | 'PASSWORD': '123456', 85 | 'PORT': 3309, 86 | } 87 | } 88 | 89 | 90 | # Password validation 91 | # https://docs.djangoproject.com/en/2.1/ref/settings/#auth-password-validators 92 | 93 | AUTH_PASSWORD_VALIDATORS = [ 94 | { 95 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 96 | }, 97 | { 98 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 99 | }, 100 | { 101 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 102 | }, 103 | { 104 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 105 | }, 106 | ] 107 | 108 | 109 | # Internationalization 110 | # https://docs.djangoproject.com/en/2.1/topics/i18n/ 111 | 112 | LANGUAGE_CODE = 'zh-hans' 113 | 114 | TIME_ZONE = 'Asia/Shanghai' 115 | 116 | USE_I18N = True 117 | 118 | USE_L10N = True 119 | 120 | USE_TZ = True 121 | 122 | 123 | # Static files (CSS, JavaScript, Images) 124 | # https://docs.djangoproject.com/en/2.1/howto/static-files/ 125 | 126 | STATIC_URL = '/static/' 127 | # 配置haystack这个框架所搭配的搜索引擎 128 | HAYSTACK_CONNECTIONS = { 129 | 'default': { 130 | 'ENGINE': 'haystack.backends.whoosh_cn_backend.WhooshEngine', 131 | 'PATH': os.path.join(BASE_DIR, 'whoosh_index'), 132 | }, 133 | } 134 | HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor' 135 | 136 | HAYSTACK_SEARCH_RESULTS_PER_PAGE = 5 -------------------------------------------------------------------------------- /computer_analysis/computer_analysis/urls.py: -------------------------------------------------------------------------------- 1 | """computer_analysis URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/2.1/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 9 | Class-based views 10 | 1. 
Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.urls import include, path 14 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 15 | """ 16 | from django.contrib import admin 17 | from django.urls import path 18 | from api.views import * 19 | 20 | urlpatterns = [ 21 | path('admin/', admin.site.urls), 22 | path('api/v1/computer/list/', ComputerView.as_view()), 23 | path('api/v1/computer/detail/', DetailView.as_view()), 24 | path('api/v1/computer/search/', ComputerSearchView()), 25 | ] 26 | -------------------------------------------------------------------------------- /computer_analysis/computer_analysis/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for computer_analysis project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'computer_analysis.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /computer_analysis/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == '__main__': 6 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'computer_analysis.settings') 7 | try: 8 | from django.core.management import execute_from_command_line 9 | except ImportError as exc: 10 | raise ImportError( 11 | "Couldn't import Django. Are you sure it's installed and " 12 | "available on your PYTHONPATH environment variable? Did you " 13 | "forget to activate a virtual environment?" 
14 | ) from exc 15 | execute_from_command_line(sys.argv) 16 | -------------------------------------------------------------------------------- /computer_analysis/static/pie/100002368328.svg: -------------------------------------------------------------------------------- 1 | 2 | [pygal pie SVG: "990位买家评分分析饼状图(%)", single series "5分" (100)] -------------------------------------------------------------------------------- /computer_analysis/static/pie/11547179910.svg: -------------------------------------------------------------------------------- 1 | 2 | [pygal pie SVG: "918位买家评分分析饼状图(%)", single series "5分" (100)] -------------------------------------------------------------------------------- /computer_analysis/static/pie/15019918741.svg: -------------------------------------------------------------------------------- 1 | 2 | [pygal pie SVG: "450位买家评分分析饼状图(%)", single series "5分" (100)] -------------------------------------------------------------------------------- /computer_analysis/static/pie/20167878769.svg: -------------------------------------------------------------------------------- 1 | 2 | [pygal pie SVG: "144位买家评分分析饼状图(%)", single series "5分" (100)] -------------------------------------------------------------------------------- /computer_analysis/static/pie/3714545.svg: -------------------------------------------------------------------------------- 1 | 2 | [pygal pie SVG: "990位买家评分分析饼状图(%)", single series "5分" (100)] -------------------------------------------------------------------------------- /computer_analysis/static/pie/39987003288.svg: -------------------------------------------------------------------------------- 1 | 2 | [pygal pie SVG: "161位买家评分分析饼状图(%)", single series "5分" (100)] -------------------------------------------------------------------------------- /computer_analysis/static/wordcloud/100002368328.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/static/wordcloud/100002368328.png -------------------------------------------------------------------------------- /computer_analysis/static/wordcloud/11528184498.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/static/wordcloud/11528184498.png -------------------------------------------------------------------------------- /computer_analysis/static/wordcloud/11547179910.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/static/wordcloud/11547179910.png -------------------------------------------------------------------------------- /computer_analysis/static/wordcloud/15019918741.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/static/wordcloud/15019918741.png -------------------------------------------------------------------------------- /computer_analysis/static/wordcloud/20167878769.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/static/wordcloud/20167878769.png -------------------------------------------------------------------------------- /computer_analysis/static/wordcloud/3714545.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/static/wordcloud/3714545.png -------------------------------------------------------------------------------- /computer_analysis/static/wordcloud/39987003288.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/static/wordcloud/39987003288.png -------------------------------------------------------------------------------- /computer_analysis/static/wordcloud/5520838.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/static/wordcloud/5520838.png -------------------------------------------------------------------------------- /computer_analysis/templates/search/indexes/api/computer_text.txt: -------------------------------------------------------------------------------- 1 | {{ object.title }} 2 | {{ object.computer_id }} 3 | {{ object.brand }} -------------------------------------------------------------------------------- /computer_analysis/tools/STXINGKA.TTF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/tools/STXINGKA.TTF -------------------------------------------------------------------------------- /computer_analysis/tools/decorator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: taiyc 5 | @file: decorator 6 | @time: 2019/3/14 16:00 7 | ''' 8 | 9 | from django.http import JsonResponse 10 | 11 | def allow_origin(func): 12 | def _func(*args, **kwargs): 13 | data = func(*args, **kwargs) 14 | response = JsonResponse(data) 15 | response['Access-Control-Allow-Origin'] = '*' 16 | return response 17 | return _func -------------------------------------------------------------------------------- /computer_analysis/tools/orm2json.py: -------------------------------------------------------------------------------- 1 | from django.db.models.query import QuerySet 2 | 3 | 4 | def object_to_json(model, ignore=None): 5 | if ignore is None: 6 | ignore = [] 7 | if type(model) in [QuerySet, list]: 8 | json = [] 9 | for element in model: 10 | json.append(_django_single_object_to_json(element, ignore)) 11 | return json 12 | else: 13 | return _django_single_object_to_json(model, ignore) 14 | 15 | 16 | def _django_single_object_to_json(element, ignore=None): 17 | return dict([(attr, getattr(element, attr)) for attr in [f.name for f in element._meta.fields if f not in ignore]]) -------------------------------------------------------------------------------- /computer_analysis/tools/pygal_process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: taiyc 5 | @file: pygal_process 6 | @time: 2019/4/3 13:55 7 | ''' 8 | 9 | 
import pygal, pymysql 10 | from pygal.style import BlueStyle 11 | from wordcloud import WordCloud 12 | 13 | 14 | def connect_sql(): 15 | connect = pymysql.connect( 16 | user='root', 17 | password='123456', 18 | host='localhost', 19 | port=3309, 20 | db='jd_computer' 21 | ) 22 | cursor = connect.cursor() 23 | return connect, cursor 24 | 25 | 26 | def close_sql(connect, cursor): 27 | connect.commit() 28 | cursor.close() 29 | connect.close() 30 | 31 | 32 | def create_wordcloud(goods_id): 33 | """ 34 | 生成产品评论词云图 35 | :param goods_id: 36 | :return: 37 | """ 38 | connect, cursor = connect_sql() 39 | sql = f'SELECT jieba_content FROM computer_comment WHERE computer_id={goods_id}' 40 | cursor.execute(sql) 41 | result = cursor.fetchall() 42 | close_sql(connect, cursor) 43 | # 评论数大于50才生成词云图 44 | if len(result) >= 50: 45 | jieba_str = ''.join([x[0] for x in result]).replace(' ', '').replace('\n', '') 46 | wc = WordCloud(width=500, 47 | height=500, 48 | background_color='white', # 背景颜色 49 | max_words=50, # 最大词数 50 | max_font_size=100, # 显示字体的最大值 51 | font_path='STXINGKA.TTF', 52 | random_state=200, # 为每个词返回一个PIL颜色 53 | ) 54 | wc.generate(jieba_str) 55 | wc.to_file(f'static\wordcloud\{goods_id}.png') 56 | return True 57 | else: 58 | return False 59 | 60 | 61 | def create_pie(goods_id): 62 | """ 63 | 生成用户评分饼状图 64 | :param goods_id: 65 | :return: 66 | """ 67 | pie_chart = pygal.Pie(style=BlueStyle) 68 | connect, cursor = connect_sql() 69 | sql = f'SELECT score,count(score) FROM computer_comment WHERE computer_id={goods_id} GROUP BY score' 70 | cursor.execute(sql) 71 | result = cursor.fetchall() 72 | close_sql(connect, cursor) 73 | total = sum([x[1] for x in result]) 74 | if total >= 5: 75 | pie_chart.title = f'{total}位买家评分分析饼状图(%)' 76 | for score_group in result: 77 | pie_chart.add(str(score_group[0])+'分', round(score_group[1]/total, 4)*100) 78 | pie_chart.render_to_file(f'static\pie\{goods_id}.svg') 79 | return True 80 | else: 81 | return False 82 | 83 | 84 | def create_bar(goods_id): 85 | """ 86 | 生成标签柱状图 87 | :param goods_id: 88 | :return: 89 | """ 90 | bar_chart = pygal.HorizontalBar(style=BlueStyle) 91 | bar_chart.title = '标签分析条形图' 92 | connect, cursor = connect_sql() 93 | sql = f'SELECT tags,count FROM computer_tag WHERE computer_id={goods_id}' 94 | cursor.execute(sql) 95 | result = cursor.fetchall() 96 | close_sql(connect, cursor) 97 | # 判断是否存在标签 98 | if result: 99 | tags = result[0][0].split('|') 100 | count = result[0][1].split('|') 101 | for index in range(len(tags)): 102 | bar_chart.add(tags[index], int(count[index])) 103 | bar_chart.render_to_file(f'static\\bar\{goods_id}.svg') 104 | return True 105 | # 标签不小于3个的生成图 106 | # if len(tags) >= 3: 107 | # for index in range(len(tags)): 108 | # bar_chart.add(tags[index], int(count[index])) 109 | # bar_chart.render_to_file(f'..\static\bar\{good_id}.svg') 110 | # return 'success' 111 | # else: 112 | # # 返回标签字典 113 | # tags_dict = {} 114 | # for index in range(len(tags)): 115 | # tags_dict[tags[index]] = count[index] 116 | # print(tags_dict) 117 | # return tags_dict 118 | else: 119 | return False 120 | 121 | 122 | def jieba_top10_bar(goods_id): 123 | bar_chart = pygal.Bar(style=BlueStyle) 124 | bar_chart.title = '分词比重top10' 125 | connect, cursor = connect_sql() 126 | sql = f'SELECT jieba_content FROM computer_comment WHERE computer_id={goods_id}' 127 | cursor.execute(sql) 128 | result = cursor.fetchall() 129 | close_sql(connect, cursor) 130 | # 评论数大于50才生成 131 | if len(result) > 50: 132 | jieba_list = ''.join([x[0] for x in result]).replace(' ', 
'').replace('\n', '').split(' ') 133 | topn_dict = {} 134 | for word in jieba_list: 135 | if word: 136 | if word not in topn_dict: 137 | topn_dict[word] = 1 138 | else: 139 | topn_dict[word] = topn_dict[word] + 1 140 | stop = ['物流', '运行', '电脑', '客服', '收到', '开机', '东西', '质量', '购物'] 141 | for s in stop: 142 | if s in topn_dict: 143 | topn_dict.pop(s) 144 | top10_list = sorted(topn_dict.items(), key=lambda item: item[1], reverse=True)[1:11] 145 | for heat_word in top10_list: 146 | bar_chart.add(heat_word[0], int(heat_word[1])) 147 | bar_chart.render_to_file(f'static\jieba_top10_bar\{goods_id}.svg') 148 | return True 149 | else: 150 | return False 151 | 152 | 153 | 154 | # create_bar(6099496) 155 | # print(create_bar(6099496)) 156 | # jieba_top10_bar(29196113704) 157 | -------------------------------------------------------------------------------- /computer_analysis/tools/searchresult2json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: taiyc 5 | @file: searchresult2json 6 | @time: 2019/3/28 21:17 7 | ''' 8 | # from haystack.models import SearchResult 9 | 10 | def sea_result2json(list_obj): 11 | json = [sr.get_additional_fields() for sr in list_obj] 12 | return json -------------------------------------------------------------------------------- /computer_analysis/whoosh_index/MAIN_6vchd7acq93n4dv5.seg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/whoosh_index/MAIN_6vchd7acq93n4dv5.seg -------------------------------------------------------------------------------- /computer_analysis/whoosh_index/MAIN_WRITELOCK: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/whoosh_index/MAIN_WRITELOCK -------------------------------------------------------------------------------- /computer_analysis/whoosh_index/MAIN_agm2z7e75evl86bh.seg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/whoosh_index/MAIN_agm2z7e75evl86bh.seg -------------------------------------------------------------------------------- /computer_analysis/whoosh_index/_MAIN_2.toc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_analysis/whoosh_index/_MAIN_2.toc -------------------------------------------------------------------------------- /computer_spider/.idea/computer_spider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /computer_spider/.idea/dataSources.local.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #@ 7 | ` 8 | 9 | 10 | master_key 11 | root 12 | *:jd_computer 13 | 14 | 15 | -------------------------------------------------------------------------------- /computer_spider/.idea/dataSources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mysql 6 | true 7 | com.mysql.jdbc.Driver 8 | 
jdbc:mysql://localhost:3309/jd_computer 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /computer_spider/.idea/dictionaries/ASUS.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /computer_spider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /computer_spider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /computer_spider/computer_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/computer_spider/computer_spider/__init__.py -------------------------------------------------------------------------------- /computer_spider/computer_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ComputerSpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | goods_id = scrapy.Field() 15 | brand = scrapy.Field() 16 | title = scrapy.Field() 17 | price = scrapy.Field() 18 | img_url = scrapy.Field() 19 | param = scrapy.Field() 20 | -------------------------------------------------------------------------------- /computer_spider/computer_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ComputerSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ComputerSpiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /computer_spider/computer_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | 9 | 10 | class ComputerSpiderPipeline(object): 11 | def __init__(self): 12 | self.connect = pymysql.connect( 13 | host='localhost', 14 | port=3309, 15 | user='root', 16 | password='123456', 17 | db='jd_computer' 18 | ) 19 | self.cursor = self.connect.cursor() 20 | 21 | def process_item(self, item, spider): 22 | sql = 'INSERT INTO computer(computer_id, brand, title, price, img_url, param) values (%s, %s, %s, %s, %s, %s)' 23 | if_exists_sql = f'SELECT count(*) FROM computer WHERE computer_id={item["goods_id"]}' 24 | self.cursor.execute(if_exists_sql) 25 | if self.cursor.fetchall()[0][0]: 26 | print(f'{item["goods_id"]}已存在') 27 | print(item['price']) 28 | elif round(float(item['price']), 2) < 1000: 29 | print(f'价格获取有误!放弃收集该产品!') 30 | else: 31 | self.cursor.execute(sql, ( 32 | item['goods_id'], item['brand'], item['title'], item['price'], item['img_url'], item['param'])) 33 | self.connect.commit() 34 | print(f'{item["goods_id"]}入库成功。') 35 | return item 36 | -------------------------------------------------------------------------------- /computer_spider/computer_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for computer_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'computer_spider' 13 | 14 | SPIDER_MODULES = ['computer_spider.spiders'] 15 | NEWSPIDER_MODULE = 'computer_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'computer_spider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36' 46 | } 47 | 48 | # Enable or disable spider middlewares 49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'computer_spider.middlewares.ComputerSpiderSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'computer_spider.middlewares.ComputerSpiderDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'computer_spider.pipelines.ComputerSpiderPipeline': 1, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
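An aside on ComputerSpiderPipeline a few files back: it formats goods_id straight into the existence-check SQL with an f-string, while the INSERT next to it already uses pymysql parameter binding. A hypothetical helper showing the check in the same parameterized style (a sketch, not the project's code):

# Hypothetical helper: the pipeline's existence check with a bound parameter
# instead of string formatting; pymysql uses %s placeholders.
def computer_exists(cursor, goods_id):
    cursor.execute('SELECT count(*) FROM computer WHERE computer_id=%s', (goods_id,))
    return cursor.fetchone()[0] > 0

goods_id is parsed out of a JD product URL, so the practical risk is small; the gain is consistency between the two queries and no manual quoting to worry about.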
-------------------------------------------------------------------------------- /computer_spider/computer_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /computer_spider/computer_spider/spiders/cpt_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import json, re 4 | from ..items import ComputerSpiderItem 5 | 6 | class CptSpiderSpider(scrapy.Spider): 7 | name = 'cpt_spider' 8 | allowed_domains = ['jd.com', 'p.3.cn'] 9 | start_urls = ['https://list.jd.com/list.html?cat=670%2C671%2C672'] 10 | 11 | def parse(self, response): 12 | """ 13 | 获取京东笔记本各大品牌的url 14 | :param response: 15 | :return: 16 | """ 17 | brand_urls = response.xpath('//ul[@id="brandsArea"]/li/a/@href').extract() 18 | brands = response.xpath('//ul[@id="brandsArea"]/li/a/@title').extract() 19 | print(len(brand_urls), len(brands)) 20 | for index, brand_url in enumerate(brand_urls): 21 | brand_url = 'https://list.jd.com' + brand_url 22 | brand = brands[index] 23 | print(brand) 24 | if '华为' in brand: 25 | brand = 'huawei' 26 | elif '联想' in brand: 27 | brand = 'Lenovo' 28 | elif 'ThinkPad' in brand: 29 | brand = 'ThinkPad' 30 | elif 'Apple' in brand: 31 | brand = 'apple' 32 | elif '戴尔' in brand: 33 | brand = 'DELL' 34 | elif '三星' in brand: 35 | brand = 'samsung' 36 | elif '华硕' in brand: 37 | brand = 'ASUS' 38 | elif '惠普' in brand: 39 | brand = 'HP' 40 | elif '宏碁' in brand: 41 | brand = 'acer' 42 | elif '小米' in brand: 43 | brand = 'xiaomi' 44 | elif '微软' in brand: 45 | brand = 'Microsoft' 46 | elif '外星人' in brand: 47 | brand = 'Alienware' 48 | elif '机械革命' in brand: 49 | brand = 'MECHREVO' 50 | elif '神舟' in brand: 51 | brand = 'HASEE' 52 | elif '微星' in brand: 53 | brand = 'MSI' 54 | elif '雷蛇' in brand: 55 | brand = 'Razer' 56 | elif '戴睿' in brand: 57 | brand = 'dere' 58 | elif '海尔' in brand: 59 | brand = 'Haier' 60 | response.meta['brand'] = brand 61 | print(brand) 62 | yield scrapy.Request(url=brand_url, callback=self.parse_list_page, meta=response.meta) 63 | 64 | def parse_list_page(self, response): 65 | """ 66 | 解析列表页 67 | :param response: 68 | :return: 69 | """ 70 | detail_urls = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a/@href').extract() 71 | src_img_urls = response.xpath( 72 | '//li[@class="gl-item"]/div/div[@class="p-img"]/a[@target="_blank"]/img/@src').extract() 73 | data_lazy_img_urls = response.xpath( 74 | '//li[@class="gl-item"]/div/div[@class="p-img"]/a[@target="_blank"]/img/@data-lazy-img').extract() 75 | img_urls = src_img_urls + data_lazy_img_urls 76 | next_url = response.xpath('//a[contains(text(), "下一页")]/@href').extract_first() 77 | print(len(detail_urls), len(img_urls)) 78 | for index, detail_url in enumerate(detail_urls): 79 | detail_url = 'https:' + detail_url 80 | goods_id = detail_url.split('/')[-1].split('.')[0] 81 | response.meta['goods_id'] = goods_id 82 | img_url = 'https:' + img_urls[index] 83 | response.meta['img_url'] = img_url 84 | yield scrapy.Request(url=detail_url, callback=self.parse_detail_page, meta=response.meta) 85 | if next_url: 86 | next_url = 'https://list.jd.com' + next_url 87 | yield scrapy.Request(url=next_url, callback=self.parse_list_page, 
meta=response.meta) 88 | 89 | def parse_detail_page(self, response): 90 | """ 91 | 解析详情页 92 | :param response: 93 | :return: 94 | """ 95 | title = response.xpath('//title/text()').extract_first() 96 | title = re.sub('【.*?】|-京东', '', title.strip()) 97 | print(title) 98 | param = response.xpath('//div[@class="Ptable"]').extract() 99 | if len(param): 100 | param = param[0] 101 | else: 102 | param = response.xpath('//table[@class="Ptable"]').extract()[0] 103 | # 处理参数 104 | param = re.sub(' |\t|\r|\n', '', param) 105 | # print(param) 106 | response.meta['title'] = title 107 | response.meta['param'] = param 108 | price_url = 'https://p.3.cn/prices/mgets?skuIds=J_' + response.meta['goods_id'] 109 | yield scrapy.Request(url=price_url, callback=self.get_goods_price, meta=response.meta) 110 | 111 | def get_goods_price(self, response): 112 | """ 113 | 获取商品价格 114 | :param response: 115 | :return: 116 | """ 117 | json_obj = json.loads(response.text) 118 | price = json_obj[0]['p'] 119 | response.meta['price'] = price 120 | item = ComputerSpiderItem() 121 | item['goods_id'] = int(response.meta['goods_id']) 122 | item['brand'] = response.meta['brand'] 123 | item['title'] = response.meta['title'] 124 | item['img_url'] = response.meta['img_url'] 125 | item['param'] = response.meta['param'] 126 | item['price'] = response.meta['price'] 127 | yield item 128 | -------------------------------------------------------------------------------- /computer_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: taiyc 5 | @file: run_spider 6 | @time: 2019/3/29 13:56 7 | ''' 8 | from scrapy.cmdline import execute 9 | 10 | execute(['scrapy', 'crawl', 'cpt_spider']) -------------------------------------------------------------------------------- /computer_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = computer_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = computer_spider 12 | -------------------------------------------------------------------------------- /html/.idea/dictionaries/ASUS.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /html/.idea/html.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /html/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /html/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /html/css/detail.css: -------------------------------------------------------------------------------- 1 | 2 | #goods { 3 | margin-left: 35%; 4 | margin-top: 5%; 5 | } 6 | 7 | .self-dl{ 8 | display: flex; 9 | margin-top: 37%; 10 | font-size: 20px 11 | } 12 | 13 | .self-dl > dd{ 14 | color: red; 15 | } 16 | #wordcloud{ 
17 | margin-top: 5%; 18 | text-align: center; 19 | } 20 | .svg{ 21 | margin-top: 5%; 22 | text-align: center; 23 | } 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | #param{ 37 | margin-top: 5%; 38 | /*background-color: beige;*/ 39 | } 40 | 41 | .Ptable { 42 | margin: 10px 43 | } 44 | 45 | .Ptable-item { 46 | padding-top: 0px; 47 | padding: 12px; 48 | line-height: 220%; 49 | color: #999; 50 | font-size: 12px; 51 | } 52 | 53 | .Ptable-item:after { 54 | content: ""; 55 | height: 0; 56 | visibility: hidden; 57 | display: block; 58 | clear: both 59 | } 60 | 61 | .Ptable-item h3 { 62 | align: center; 63 | padding-top:0px; 64 | padding-bottom:0px; 65 | margin-top: 0px; 66 | width: 150px; 67 | text-align: center 68 | } 69 | 70 | .Ptable-item dl { 71 | margin-left: 110px; 72 | margin-bottom:0px; 73 | } 74 | 75 | .Ptable-item dt { 76 | width: 370px; 77 | float: left; 78 | text-align: right; 79 | padding-right: 5px 80 | } 81 | 82 | .Ptable-item dd { 83 | margin-left: 210px 84 | } 85 | 86 | .Ptable-item .Ptable-tips { 87 | position: relative; 88 | float: left; 89 | width: auto; 90 | margin-left: 0 91 | } 92 | 93 | .Ptable-item .Ptable-tips:hover { 94 | z-index: 2 95 | } 96 | 97 | .Ptable-item .Ptable-sprite-question { 98 | display: inline-block; 99 | margin-left: 4px; 100 | width: 16px; 101 | height: 16px; 102 | vertical-align: -3px; 103 | } 104 | 105 | .Ptable-tips .tips { 106 | display: none; 107 | position: absolute; 108 | left: -10px; 109 | top: 27px; 110 | width: 300px 111 | } 112 | 113 | .Ptable-tips:hover .tips { 114 | display: block 115 | } 116 | 117 | .Ptable-tips .content { 118 | padding: 8px 10px; 119 | background: #fff; 120 | border: 1px solid #cecbce; 121 | box-shadow: 0 0 2px 2px #eee 122 | } 123 | 124 | .Ptable-tips p { 125 | font-family: "microsoft yahei"; 126 | color: #999; 127 | line-height: 160%; 128 | text-align: left 129 | } 130 | 131 | .Ptable-tips .Ptable-sprite-arrow { 132 | position: absolute; 133 | overflow: hidden; 134 | left: 15px; 135 | top: -5px; 136 | width: 11px; 137 | height: 6px; 138 | } 139 | 140 | 141 | 142 | .Ptable td,.Ptable th { 143 | font-size: 12px 144 | } 145 | 146 | .Ptable th { 147 | background: #EEF7FE; 148 | text-align: right; 149 | padding: 5px 150 | } 151 | 152 | .Ptable td { 153 | padding: 2px 5px; 154 | background: #fff 155 | } 156 | 157 | .Ptable th.tdTitle { 158 | text-align: center 159 | } 160 | 161 | .Ptable .tdTitle { 162 | text-align: right; 163 | width: 500px; 164 | background: #F5FAFE 165 | } 166 | .Ptable tr { 167 | text-align: center 168 | } 169 | 170 | 171 | .Ptable-tips { 172 | display: inline-block; 173 | position: relative; 174 | *display: inline; 175 | *zoom:1} 176 | 177 | .Ptable-tips:hover { 178 | z-index: 2 179 | } 180 | 181 | .Ptable-sprite-question { 182 | display: inline-block; 183 | margin-left: 4px; 184 | width: 16px; 185 | height: 16px; 186 | vertical-align: -3px; 187 | } 188 | 189 | .Ptable-tips .tips { 190 | display: none; 191 | position: absolute; 192 | left: -10px; 193 | top: 27px; 194 | width: 215px 195 | } 196 | 197 | .Ptable-tips:hover .tips { 198 | display: block 199 | } 200 | 201 | .Ptable-tips .content { 202 | padding: 8px 10px; 203 | background: #fff; 204 | border: 1px solid #cecbce; 205 | box-shadow: 0 0 2px 2px #eee 206 | } 207 | 208 | .Ptable-tips p { 209 | font-family: "microsoft yahei"; 210 | color: #999; 211 | line-height: 160%; 212 | text-align: left 213 | } 214 | 215 | .Ptable-tips .Ptable-sprite-arrow { 216 | position: absolute; 217 | overflow: hidden; 218 | left: 15px; 219 | top: -5px; 220 
| width: 11px; 221 | height: 6px; 222 | } 223 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /html/detail.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 详情页 6 | 7 | 8 | 9 | 10 |
11 |
12 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /html/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/html/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /html/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/html/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /html/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/html/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /html/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moxi255/jd-spider/8f92f7e7a9a27c2f83f6432f17e1f0bb920b4213/html/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 首页 6 | 7 | 8 | 9 |
10 |

京东笔记本商品分析

11 | 19 |
20 |
21 |
22 | 华为 23 | 联想 24 | ThinkPad 25 | Apple 26 | 戴尔 27 | 三星 28 | 华硕 29 | 惠普 30 | 宏碁 31 | 小米 32 | 微软 33 | 外星人 34 | 机械革命 35 | 神舟 36 | 微星 37 | 雷蛇 38 | 戴睿 39 | 海尔 40 |
41 |
42 |
43 |
44 | 45 |
46 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /html/js/detail.js: -------------------------------------------------------------------------------- 1 | function get_computer_id() { 2 | var loc = location.href;//获取整个跳转地址内容,其实就是你传过来的整个地址字符串console.log("我的地址"+loc); 3 | var n1 = loc.length;//地址的总长 4 | var n2 = loc.indexOf("?");//取得=号的位置 5 | var parameter = decodeURI(loc.substr(n2+1, n1-n2));//截取从?号后面的内容,也就是参数列表,因为传过来的路径是加了码的,所以要解码 6 | var parameters = parameter.split("&");//从&处拆分,返回字符串数组 7 | var paValue = new Array();//创建一个用于保存具体值得数组 8 | for (var i = 0; i < parameters.length; i++) { 9 | var m1 = parameters[i].length;//获得每个键值对的长度 10 | var m2 = parameters[i].indexOf("=");//获得每个键值对=号的位置 11 | var value = parameters[i].substr(m2+1, m1-m2);//获取每个键值对=号后面具体的值 12 | paValue[i] = value; 13 | } 14 | return paValue[0] 15 | } 16 | 17 | 18 | function loaddetail(computer_id) { 19 | var detail_url = 'http://127.0.0.1:8000/api/v1/computer/detail/?computer_id=' + computer_id 20 | $.get(detail_url, function (data) { 21 | if (data.computer) { 22 | var computer = data.computer; 23 | var good_rate = computer.good_rate * 100; 24 | if (good_rate === 0) { 25 | good_rate = '评论过少' 26 | } else { 27 | good_rate = good_rate.toFixed(2) + '%' 28 | } 29 | $('header').append($('

').text(computer.title).attr({ 30 | 'class': 'text-center text-success', 31 | })) 32 | $('main').append($('
').attr({ 33 | 'id': 'goods', 34 | 'style': 'display:flex; flex-direction:row; ' 35 | }).append($('').attr({ 36 | 'src': computer.img_url, 37 | 'width': 300, 38 | 'height': 300, 39 | })).append($('
').append($('
').attr({ 40 | 'class': 'self-dl' 41 | }).append($('
').text('参考价:')).append($('
').text('¥' + computer.price))) 42 | .append($('
').attr({'class': 'self-dl'}).append($('
').text('评论积极情感比重:')).append($('
').text(good_rate))))) 43 | if (data.wordcloud) { 44 | $('main').append($('
')) 45 | $('main').append($('
').attr({ 46 | 'id': 'wordcloud' 47 | }).append($('').attr({ 48 | 'src': data.wordcloud 49 | }))) 50 | } 51 | if (data.pie) { 52 | $('main').append($('
')) 53 | $('main').append($('
').attr({ 54 | 'class': 'svg' 55 | }).append($('