├── .gitignore ├── LICENSE ├── README.md ├── examples ├── model_classifier_demo.py ├── rule_classifier_demo.py ├── user_sentiment_dict.txt └── userdict_demo.py ├── pysenti ├── __init__.py ├── __main__.py ├── bayes.py ├── compat.py ├── data │ ├── adverb_dict.txt │ ├── conjunction_dict.txt │ ├── denial_dict.txt │ ├── neg_sentences.txt │ ├── pos_sentences.txt │ ├── process.py │ ├── sentiment_dict.txt │ ├── sentiment_model.pkl │ └── stopwords.txt ├── frequency.py ├── model_classifier.py ├── rule_classfier.py ├── tokenizer.py ├── train.py └── utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![PyPI version](https://badge.fury.io/py/pysenti.svg)](https://badge.fury.io/py/pysenti)
2 | [![Downloads](https://static.pepy.tech/badge/pysenti)](https://pepy.tech/project/pysenti)
3 | [![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/shibing624/pysenti/blob/master/LICENSE)
4 | ![Language](https://img.shields.io/badge/Language-Python-blue.svg)
5 |
6 |
7 | # pysenti
8 |
9 | Chinese Sentiment Classification Tool for Python.
10 |
11 | **pysenti** implements rule- and dictionary-based sentiment polarity analysis. It is highly extensible and works well as a baseline method for research.
12 |
13 | ## Question
14 | How do you analyze the sentiment polarity (orientation) of a text?
15 |
16 | ## Solution
17 | ### Rule-based approach
18 | 1. Split the text into clauses, segment each clause into words, and label each word's polarity (positive, neutral, or negative) with a sentiment dictionary.
19 | 2. Use sentence structure (conjunctions, negation words, adverbs, punctuation, etc.) to weight each sentiment word, then take the weighted sum as the text's sentiment score.
20 | 3. Pros: generalizes well, the rules are easy to extend, and the method works in any domain.
21 | 4. Cons: collecting rule dictionaries is laborious, expert-tuned weights have limits, and single-domain accuracy is lower than model-based methods.
22 |
23 | ### Model-based approach
24 | 1. Any common [NLP text classification model](https://github.com/shibing624/text-classifier) works, from classic classifiers (LR, SVM, XGBoost, etc.) to deep models (TextCNN, Bi-LSTM, BERT, etc.).
25 | 2. Pros: high precision and recall within a single domain.
26 | 3. Cons: does not transfer across domains, labeled samples are hard to collect, and extensibility is weak.
27 |
28 | ## Feature
29 | ### Rules
30 | * The [sentiment dictionary](https://github.com/shibing624/pysenti/tree/master/pysenti/data) merges the HowNet sentiment dictionary, the Tsinghua University Li Jun sentiment dictionary, the [BosonNLP sentiment dictionary](https://bosonnlp.com/dev/resource), and a negation-word dictionary.
31 |
32 | ### Model
33 | * Naive Bayes text classification model.
34 | * The [sample data](https://github.com/shibing624/pysenti/tree/master/pysenti/data) comes from product reviews, labeled positive or negative.
35 |
36 | ## Demo
37 |
38 | Official Demo: https://www.mulanai.com/product/sentiment_classify/
39 |
40 | ## Install
41 | * fully automatic: pip install pysenti
42 | * semi-automatic:
43 | ```shell
44 | git clone https://github.com/shibing624/pysenti.git
45 | cd pysenti
46 | python3 setup.py install
47 | ```
48 |
49 | ## Usage
50 | ### Rule-based method
51 | example: [examples/rule_classifier_demo.py](examples/rule_classifier_demo.py)
52 |
53 | ```python
54 | import pysenti
55 |
56 | texts = ["苹果是一家伟大的公司",
57 |          "土豆丝很好吃",
58 |          "土豆丝很难吃"]
59 | for i in texts:
60 |     r = pysenti.classify(i)
61 |     print(i, r['score'], r)
62 |
63 | ```
64 |
65 | output:
66 | ```shell
67 | 苹果是一家伟大的公司 3.4346924811096997 {'score': 3.4346924811096997, 'sub_clause0': {'score': 3.4346924811096997, 'sentiment': [{'key': '苹果', 'adverb': [], 'denial': [], 'value': 1.37846341627, 'score': 1.37846341627}, {'key': '是', 'adverb': [], 'denial': [], 'value': -0.252600480826, 'score': -0.252600480826}, {'key': '一家', 'adverb': [], 'denial': [], 'value': 1.48470161748, 'score': 1.48470161748}, {'key': '伟大', 'adverb': [], 'denial': [], 'value': 1.14925252286, 'score': 1.14925252286}, {'key': '的', 'adverb': [], 'denial': [], 'value': 0.0353323193687, 'score': 0.0353323193687}, {'key': '公司', 'adverb': [], 'denial': [], 'value': -0.360456914043, 'score': -0.360456914043}], 'conjunction': []}}
68 | 土豆丝很好吃 2.294311221077 {'score': 2.294311221077, 'sub_clause0': {'score': 2.294311221077, 'sentiment': [{'key': '土豆丝', 'adverb': [], 'denial': [], 'value': 0.294892711165, 'score': 0.294892711165}, {'key': '很', 'adverb': [], 'denial': [], 'value': 0.530242664632, 'score': 0.530242664632}, {'key': '好吃', 'adverb': [], 'denial': [], 'value': 1.46917584528, 'score': 1.46917584528}], 'conjunction': []}}
69 | 土豆丝很难吃 -2.381874203563 {'score': -2.381874203563, 'sub_clause0': {'score': -2.381874203563, 'sentiment': [{'key': '土豆丝', 'adverb': [], 'denial': [], 'value': 0.294892711165, 'score': 0.294892711165}, {'key': '很', 'adverb': [], 'denial': [], 'value': 0.530242664632, 'score': 0.530242664632}, {'key': '难吃', 'adverb': [], 'denial': [], 'value': -3.20700957936, 'score': -3.20700957936}], 'conjunction': []}}
70 | ```
71 | > score: a positive value means positive sentiment; a negative value means negative sentiment.
72 |
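The result dict carries per-clause detail alongside the total: every key other than `score` is a `sub_clauseN` entry with its own score, matched sentiment words, and conjunctions. A minimal sketch for drilling into that structure (assuming only the fields shown in the output above):

```python
import pysenti

r = pysenti.classify("土豆丝很好吃,但是有点咸")
print("total:", r["score"])
for key, clause in r.items():
    if key == "score":
        continue  # every other key is a per-clause analysis dict
    words = [(w["key"], w["score"]) for w in clause["sentiment"]]
    print(key, clause["score"], words)
```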
73 | ### Model-based method
74 | example: [examples/model_classifier_demo.py](examples/model_classifier_demo.py)
75 |
76 | ```python
77 | from pysenti import ModelClassifier
78 |
79 | texts = ["苹果是一家伟大的公司",
80 |          "土豆丝很好吃",
81 |          "垃圾,在酒店中应该是很差的!",
82 |          "我们刚走过一个烧烤店",
83 |          "土豆丝很难吃"]
84 |
85 | m = ModelClassifier()
86 | for i in texts:
87 |     r = m.classify(i)
88 |     print(i, r)
89 | ```
90 |
91 | output:
92 | ```shell
93 | 苹果是一家伟大的公司 {'positive_prob': 0.682, 'negative_prob': 0.318}
94 | 土豆丝很好吃 {'positive_prob': 0.601, 'negative_prob': 0.399}
95 | 土豆丝很难吃 {'positive_prob': 0.283, 'negative_prob': 0.717}
96 | ```
97 |
98 | ### Lazy loading
99 |
100 | pysenti loads lazily: neither `import pysenti` nor `from pysenti import rule_classifier` triggers dictionary loading; the dictionaries are loaded only when first needed. If you prefer, you can also initialize pysenti by hand:
101 | ```python
102 | import pysenti
103 | pysenti.rule_classifier.init()  # manual initialization (optional)
104 | ```
105 |
106 | You can also use a custom sentiment dictionary:
107 | ```python
108 | pysenti.rule_classifier.init('user_sentiment_dict.txt')
109 | ```
110 | The sentiment dictionary `user_sentiment_dict.txt` has the following format:
111 | ```shell
112 | 难吃 -10
113 | 好吃 10
114 | ```
115 | Fields are space-separated: the first is the word, the second its score. A positive score means positive sentiment, a negative score means negative sentiment, and a larger absolute value means stronger sentiment.
116 |
117 | ### Command line
118 |
119 | Usage example: python -m pysenti news.txt > news_result.txt
120 |
121 | `--help` output:
122 | ```shell
123 | $> python -m pysenti --help
124 |
125 | usage: python3 -m pysenti [options] filename
126 |
127 | pysenti command line interface.
128 |
129 | positional arguments:
130 |   filename              input file
131 |
132 | optional arguments:
133 |   -h, --help            show this help message and exit
134 |   -d DICT, --dict DICT  use DICT as dictionary
135 |   -u USER_DICT, --user-dict USER_DICT
136 |                         use USER_DICT together with the default dictionary or
137 |                         DICT (if specified)
138 |   -a, --output-all      output text sentiment score and word sentiment info
139 |   -V, --version         show program's version number and exit
140 |
141 | If no filename specified, use STDIN instead.
142 | ```
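Driving the same pipeline from Python takes only a few lines. A minimal sketch of what `python -m pysenti` does internally (`news.txt` here is just a placeholder input file):

```python
from pysenti import RuleClassifier

classifier = RuleClassifier()
classifier.init()  # load the bundled dictionaries up front

with open('news.txt', encoding='utf-8') as fin:
    for line in fin:
        line = line.strip()
        if line:
            print(line, classifier.classify(line)['score'])
```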
143 |
144 | ## Reference
145 |
146 | - snownlp
147 | - SentimentPolarityAnalysis
--------------------------------------------------------------------------------
/examples/model_classifier_demo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: demo of the Bayes model-based sentiment classifier
5 | """
6 | from pysenti import ModelClassifier
7 |
8 | texts = ["苹果是一家伟大的公司",
9 |          "土豆丝很好吃",
10 |          "垃圾,在酒店中应该是很差的!",
11 |          "我们刚走过一个烧烤店",
12 |          "土豆丝很难吃"]
13 |
14 | if __name__ == '__main__':
15 |     m = ModelClassifier()
16 |     for i in texts:
17 |         r = m.classify(i)
18 |         print(i, r)
--------------------------------------------------------------------------------
/examples/rule_classifier_demo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: demo of the rule- and dictionary-based sentiment classifier
5 | """
6 | import pysenti
7 |
8 | if __name__ == '__main__':
9 |     texts = [
10 |         "这个电影真心很难看,大多数人都不喜欢,被骂很久,早就习惯了。",
11 |         "苹果是一家伟大的公司",
12 |         "土豆丝很好吃",
13 |         "土豆丝很难吃"
14 |     ]
15 |     for i in texts:
16 |         r = pysenti.classify(i)
17 |         print(i, r['score'])
18 |         print(r)
--------------------------------------------------------------------------------
/examples/user_sentiment_dict.txt:
--------------------------------------------------------------------------------
1 | 很硬 -10
2 | 难吃 -10
3 | fuck -5
4 | 好吃 10
5 | 很香 10
6 | 甜美 5
--------------------------------------------------------------------------------
/examples/userdict_demo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: demo of loading a custom user sentiment dictionary
5 | """
6 |
7 | from pysenti import RuleClassifier
8 |
9 | if __name__ == '__main__':
10 |     m = RuleClassifier()
11 |     m.load_user_sentiment_dict('user_sentiment_dict.txt')
12 |     print(m.user_sentiment_dict)
13 |
14 |     a_sentences = ['剁椒鸡蛋好难吃。绝对没人受得了',
15 |                    '土豆丝很好吃', '土豆丝很难吃',
16 |                    '这笔钱是个天文数字',
17 |                    '啥也不是',
18 |                    '我一会儿出去玩了,你吃啥?给你带,然而你不知道']
19 |     for i in a_sentences:
20 |         r = m.classify(i)
21 |         print(i, r['score'])
--------------------------------------------------------------------------------
/pysenti/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: package entry point: exposes RuleClassifier, ModelClassifier and a module-level classify
5 | """
6 |
7 | from pysenti.compat import strdecode
8 | from pysenti.model_classifier import ModelClassifier
9 | from pysenti.rule_classfier import RuleClassifier
10 |
11 | __version__ = '0.1.9'
12 |
13 | rule_classifier = RuleClassifier()
14 | classify = rule_classifier.classify
15 |
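The package surface is deliberately small: the module-level `classify` above delegates to a shared, lazily initialized `RuleClassifier`. A short usage sketch tying the pieces together (the user-dict path assumes you run from the repository root):

```python
import pysenti
from pysenti import ModelClassifier

# rule-based: a signed score; dictionaries load on first use of the shared classifier
print(pysenti.classify("土豆丝很好吃")["score"])

# optionally extend the rule dictionary before classifying
pysenti.rule_classifier.load_user_sentiment_dict("examples/user_sentiment_dict.txt")

# model-based: probability estimates instead of a raw score
print(ModelClassifier().classify("土豆丝很难吃"))
```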
--------------------------------------------------------------------------------
/pysenti/__main__.py:
--------------------------------------------------------------------------------
1 | """pysenti command line interface."""
2 |
3 | import sys
4 | from argparse import ArgumentParser
5 |
6 | from pysenti import RuleClassifier, __version__
7 | from pysenti.compat import PY2, default_encoding
8 |
9 |
10 | def main(args):
11 |     rule_classifier = RuleClassifier()
12 |     fp = open(args.filename, 'r') if args.filename else sys.stdin
13 |
14 |     if args.dict:
15 |         rule_classifier.init(sentiment_dict_path=args.dict)
16 |     else:
17 |         rule_classifier.init()
18 |     if args.user_dict:
19 |         rule_classifier.load_user_sentiment_dict(args.user_dict)
20 |
21 |     ln = fp.readline()
22 |     while ln:
23 |         r = rule_classifier.classify(ln.rstrip())
24 |         if args.output_all:
25 |             result = str(r)
26 |         else:
27 |             result = str(r['score'])
28 |         if PY2:
29 |             result = result.encode(default_encoding)
30 |         print(result)
31 |         ln = fp.readline()
32 |
33 |     fp.close()
34 |
35 |
36 | if __name__ == '__main__':
37 |     parser = ArgumentParser(usage="%s -m pysenti [options] filename" % sys.executable,
38 |                             description="pysenti command line interface.",
39 |                             epilog="If no filename specified, use STDIN instead.")
40 |     parser.add_argument("-d", "--dict", help="use DICT as dictionary")
41 |     parser.add_argument("-u", "--user-dict",
42 |                         help="use USER_DICT together with the default dictionary or DICT (if specified)")
43 |     parser.add_argument("-a", "--output-all",
44 |                         action="store_true", default=False,
45 |                         help="output text sentiment score and word sentiment info")
46 |     parser.add_argument("-V", '--version', action='version',
47 |                         version="pysenti " + __version__)
48 |     parser.add_argument("filename", nargs='?', help="input file")
49 |     args = parser.parse_args()
50 |
51 |     main(args)
--------------------------------------------------------------------------------
/pysenti/bayes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: naive Bayes text classifier with add-one smoothing
5 | """
6 | from math import log, exp
7 |
8 | from pysenti.frequency import AddOneProb
9 | from pysenti.utils import dump_pkl, load_pkl
10 |
11 |
12 | class Bayes(object):
13 |     def __init__(self):
14 |         # label -> AddOneProb table of word counts
15 |         self.d = {}
16 |         self.total = 0
17 |
18 |     def save(self, fname):
19 |         d = {'total': self.total, 'd': {}}
20 |         for k, v in self.d.items():
21 |             d['d'][k] = v.__dict__
22 |         dump_pkl(d, fname)
23 |
24 |     def load(self, fname):
25 |         d = load_pkl(fname)
26 |         self.total = d['total']
27 |         self.d = {}
28 |         for k, v in d['d'].items():
29 |             self.d[k] = AddOneProb()
30 |             self.d[k].__dict__ = v
31 |
32 |     def train(self, data):
33 |         # data: iterable of (word_list, label) pairs; counts accumulate
34 |         for d in data:
35 |             c = d[1]
36 |             if c not in self.d:
37 |                 self.d[c] = AddOneProb()
38 |             for word in d[0]:
39 |                 self.d[c].add(word, 1)
40 |         self.total = sum(map(lambda x: self.d[x].getsum(), self.d.keys()))
41 |
42 |     def classify(self, x):
43 |         # log prior + log likelihood of each label
44 |         tmp = {}
45 |         for k in self.d:
46 |             tmp[k] = log(self.d[k].getsum()) - log(self.total)
47 |             for word in x:
48 |                 tmp[k] += log(self.d[k].freq(word))
49 |         # convert to probabilities in log space: prob(k) = 1 / sum_j exp(tmp[j] - tmp[k])
50 |         ret, prob = 0, 0
51 |         for k in self.d:
52 |             now = 0
53 |             try:
54 |                 for otherk in self.d:
55 |                     now += exp(tmp[otherk] - tmp[k])
56 |                 now = 1 / now
57 |             except OverflowError:
58 |                 now = 0
59 |             if now > prob:
60 |                 ret, prob = k, now
61 |         return ret, prob
--------------------------------------------------------------------------------
/pysenti/compat.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: Python 2/3 compatibility helpers
5 | """
6 | import os
7 | import sys
8 |
9 | try:
10 |     import pkg_resources
11 |
12 |     get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
13 |                                                                 os.path.join(*res))
14 | except ImportError:
15 |     get_module_res = lambda *res: open(os.path.normpath(os.path.join(
16 |         os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
17 |
18 | PY2 = sys.version_info[0] == 2
19 |
20 | default_encoding = sys.getfilesystemencoding()
21 |
22 | if PY2:
23 |     text_type = unicode
24 |     string_types = (str, unicode)
25 |
26 |     iterkeys = lambda d: d.iterkeys()
27 |     itervalues = lambda d: d.itervalues()
28 |     iteritems = lambda d: d.iteritems()
29 |
30 | else:
31 | text_type = str 32 | string_types = (str,) 33 | xrange = range 34 | 35 | iterkeys = lambda d: iter(d.keys()) 36 | itervalues = lambda d: iter(d.values()) 37 | iteritems = lambda d: iter(d.items()) 38 | 39 | 40 | def strdecode(sentence): 41 | if not isinstance(sentence, text_type): 42 | try: 43 | sentence = sentence.decode('utf-8') 44 | except UnicodeDecodeError: 45 | sentence = sentence.decode('gbk', 'ignore') 46 | return sentence 47 | 48 | 49 | def resolve_filename(f): 50 | try: 51 | return f.name 52 | except AttributeError: 53 | return repr(f) 54 | -------------------------------------------------------------------------------- /pysenti/data/adverb_dict.txt: -------------------------------------------------------------------------------- 1 | 超级 2 2 | 超 2 3 | 都 1.75 4 | 还 1.5 5 | 实在 1.75 6 | 越来越 2 7 | 再也 2 8 | 完全 2 9 | 真是 1.75 10 | 足足 1.75 11 | 大大的 1.75 12 | 巨 2 13 | 最最 2 14 | 老是 1.75 15 | 压根 1.75 16 | 明显 1.5 17 | 最 2 18 | 最为 2 19 | 太 2 20 | 极 2 21 | 极为 2 22 | 极其 2 23 | 极度 2 24 | 极端 2 25 | 至 2 26 | 至为 2 27 | 顶 2 28 | 过于 2 29 | 过分 2 30 | 分外 2 31 | 万分 2 32 | 根本 2 33 | 百分之百 2 34 | 倍加 2 35 | 备至 2 36 | 不得了 2 37 | 不堪 2 38 | 不可开交 2 39 | 不亦乐乎 2 40 | 不折不扣 2 41 | 彻头彻尾 2 42 | 充分 2 43 | 到头 2 44 | 地地道道 2 45 | 非常 2 46 | 极 2 47 | 极度 2 48 | 极端 2 49 | 极其 2 50 | 极为 2 51 | 截然 2 52 | 尽 2 53 | 惊人地 2 54 | 绝 2 55 | 绝顶 2 56 | 绝对 2 57 | 绝对化 2 58 | 刻骨 2 59 | 酷 2 60 | 满 2 61 | 满贯 2 62 | 满心 2 63 | 莫大 2 64 | 奇 2 65 | 入骨 2 66 | 甚为 2 67 | 十二分 2 68 | 十分 2 69 | 十足 2 70 | 死 2 71 | 滔天 2 72 | 痛 2 73 | 透 2 74 | 完全 2 75 | 完完全全 2 76 | 万 2 77 | 万般 2 78 | 万分 2 79 | 万万 2 80 | 无比 2 81 | 无度 2 82 | 无可估量 2 83 | 无以复加 2 84 | 无以伦比 2 85 | 要命 2 86 | 要死 2 87 | 已极 2 88 | 已甚 2 89 | 异常 2 90 | 逾常 2 91 | 贼 2 92 | 之极 2 93 | 之至 2 94 | 至极 2 95 | 卓绝 2 96 | 最为 2 97 | 佼佼 2 98 | 最 2 99 | 更 1.75 100 | 更加 1.75 101 | 更其 1.75 102 | 越 1.75 103 | 越发 1.75 104 | 备加 1.75 105 | 愈 1.75 106 | 愈加 1.75 107 | 愈发 1.75 108 | 愈为 1.75 109 | 愈益 1.75 110 | 越加 1.75 111 | 格外 1.75 112 | 益发 1.75 113 | 很 1.75 114 | 挺 1.75 115 | 怪 1.75 116 | 非常 1.75 117 | 特别 1.75 118 | 相当 1.75 119 | 十分 1.75 120 | 好不 1.75 121 | 甚 1.75 122 | 甚为 1.75 123 | 颇 1.75 124 | 颇为 1.75 125 | 异常 1.75 126 | 深为 1.75 127 | 满 1.75 128 | 蛮 1.75 129 | 够 1.75 130 | 多 1.75 131 | 多么 1.75 132 | 殊特 1.75 133 | 大 1.75 134 | 大为 1.75 135 | 何等 1.75 136 | 何其 1.75 137 | 尤其 1.75 138 | 无比尤为 1.75 139 | 不胜 1.75 140 | 不过 1.75 141 | 不少 1.75 142 | 不胜 1.75 143 | 惨 1.75 144 | 沉 1.75 145 | 沉沉 1.75 146 | 出奇 1.75 147 | 大为 1.75 148 | 多 1.75 149 | 多多 1.75 150 | 多加 1.75 151 | 多么 1.75 152 | 分外 1.75 153 | 格外 1.75 154 | 够瞧的 1.75 155 | 够戗 1.75 156 | 好 1.75 157 | 好不 1.75 158 | 何等 1.75 159 | 很 1.75 160 | 很是 1.75 161 | 坏 1.75 162 | 可 1.75 163 | 老 1.75 164 | 老大 1.75 165 | 良 1.75 166 | 颇 1.75 167 | 颇为 1.75 168 | 甚 1.75 169 | 实在 1.75 170 | 太 1.75 171 | 太甚 1.75 172 | 特 1.75 173 | 特别 1.75 174 | 尤 1.75 175 | 尤其 1.75 176 | 尤为 1.75 177 | 尤以 1.75 178 | 远 1.75 179 | 着实 1.75 180 | 较 1.5 181 | 蛮 1.5 182 | 比较 1.5 183 | 较比 1.5 184 | 较为 1.5 185 | 不大 1.5 186 | 不太 1.5 187 | 不很 1.5 188 | 不甚 1.5 189 | 大不了 1.5 190 | 多 1.5 191 | 更 1.5 192 | 更加 1.5 193 | 更进一步 1.5 194 | 更为 1.5 195 | 还 1.5 196 | 还要 1.5 197 | 较 1.5 198 | 较比 1.5 199 | 较为 1.5 200 | 进一步 1.5 201 | 那般 1.5 202 | 那么 1.5 203 | 那样 1.5 204 | 强 1.5 205 | 如斯 1.5 206 | 益 1.5 207 | 益发 1.5 208 | 尤甚 1.5 209 | 逾 1.5 210 | 愈 1.5 211 | 愈发 1.5 212 | 愈加 1.5 213 | 愈来愈 1.5 214 | 愈益 1.5 215 | 远远 1.5 216 | 越发 1.5 217 | 越加 1.5 218 | 越来越 1.5 219 | 越是 1.5 220 | 这般 1.5 221 | 这样 1.5 222 | 足 1.5 223 | 足足 1.5 224 | 不为过 1 225 | 超 1 226 | 超额 1 227 | 超外差 1 228 | 超微结构 1 229 | 超物质 1 230 | 出头 1 231 | 多 1 232 | 浮 1 233 | 过 1 234 | 过度 1 
235 | 过分 1 236 | 过火 1 237 | 过劲 1 238 | 过了头 1 239 | 过猛 1 240 | 过热 1 241 | 过甚 1 242 | 过头 1 243 | 过于 1 244 | 过逾 1 245 | 何止 1 246 | 何啻 1 247 | 开外 1 248 | 苦 1 249 | 老 1 250 | 偏 1 251 | 强 1 252 | 溢 1 253 | 忒 1 254 | 稍 0.8 255 | 稍稍 0.8 256 | 稍微 0.8 257 | 稍为 0.8 258 | 稍许 0.8 259 | 略 0.8 260 | 略略 0.8 261 | 略微 0.8 262 | 略为 0.8 263 | 些微 0.8 264 | 多少 0.8 265 | 有点 0.8 266 | 有些 0.8 267 | 点点滴滴 0.8 268 | 多多少少 0.8 269 | 怪 0.8 270 | 好生 0.8 271 | 还 0.8 272 | 或多或少 0.8 273 | 略 0.8 274 | 略加 0.8 275 | 略略 0.8 276 | 略微 0.8 277 | 略为 0.8 278 | 蛮 0.8 279 | 稍 0.8 280 | 稍稍 0.8 281 | 稍微 0.8 282 | 稍为 0.8 283 | 稍许 0.8 284 | 挺 0.8 285 | 未免 0.8 286 | 相当 0.8 287 | 些 0.8 288 | 些微 0.8 289 | 些小 0.8 290 | 一点 0.8 291 | 一点儿 0.8 292 | 一些 0.8 293 | 有点 0.8 294 | 有点儿 0.8 295 | 有些 0.8 296 | 半点 0.6 297 | 不大 0.6 298 | 不丁点儿 0.6 299 | 不甚 0.6 300 | 不怎么 0.6 301 | 聊 0.6 302 | 没怎么 0.6 303 | 轻度 0.6 304 | 弱 0.6 305 | 丝毫 0.6 306 | 微 0.6 307 | 相对 0.6 -------------------------------------------------------------------------------- /pysenti/data/conjunction_dict.txt: -------------------------------------------------------------------------------- 1 | 并 1.2 2 | 且 1.2 3 | 而 1.2 4 | 虽然 1.2 5 | 不过 1.2 6 | 至于 1.2 7 | 致 1.2 8 | 不料 1.2 9 | 岂知 1.2 10 | 也 1.5 11 | 不但 1.5 12 | 其次 1.5 13 | 不仅 1.5 14 | 就是 1.5 15 | 但是 2 16 | 偏偏 2 17 | 而且 2 18 | 何况 2 19 | 况且 2 20 | 乃至 2 21 | 但 2 22 | 却 2 23 | 然而 2 24 | 只是 2 25 | 甚至 3 26 | 尤其 3 27 | 居然 3 28 | -------------------------------------------------------------------------------- /pysenti/data/denial_dict.txt: -------------------------------------------------------------------------------- 1 | 没敢 1 2 | 不是 1 3 | 不 1 4 | 没 1 5 | 无 1 6 | 非 1 7 | 莫 1 8 | 弗 1 9 | 毋 1 10 | 勿 1 11 | 未 1 12 | 否 1 13 | 别 1 14 | 休 1 15 | 無 1 16 | 不曾 1 17 | 未必 1 18 | 没有 1 19 | 不要 1 20 | 难以 1 21 | 未曾 1 22 | 并非 1 23 | 绝不 1 24 | 不可 1 25 | -------------------------------------------------------------------------------- /pysenti/data/process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author:XuMing(xuming624@qq.com) 4 | @description: 5 | """ 6 | from codecs import open 7 | 8 | r = set() 9 | with open('pos_dict.txt', 'r', encoding='utf-8') as f: 10 | for line in f: 11 | line = line.strip() 12 | r.add(line) 13 | 14 | sentiments = set() 15 | with open('sentiment_dict.txt', 'r', encoding='utf-8') as f: 16 | for line in f: 17 | line = line.strip().split() 18 | w = line[0] 19 | sentiments.add(w) 20 | 21 | with open('pos', 'w', encoding='utf-8') as f: 22 | for i in r: 23 | if i not in sentiments: 24 | f.write(i + ' 2' + '\n') 25 | -------------------------------------------------------------------------------- /pysenti/data/sentiment_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shibing624/pysenti/47fec82a5ab155e311c5321eb1d237c2d8d79010/pysenti/data/sentiment_model.pkl -------------------------------------------------------------------------------- /pysenti/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | -- 2 | ? 3 | < 4 | > 5 | ! 6 | , 7 | . 8 | " 9 | / 10 | ~ 11 | ` 12 | - 13 | = 14 | + 15 | ( 16 | ) 17 | * 18 | : 19 | ; 20 | -- 21 | 、 22 | 。 23 | “ 24 | ” 25 | 《 26 | 》 27 | ( 28 | ) 29 | 【 30 | 】 31 | [ 32 | ] 33 | ! 34 | , 35 | : 36 | ; 37 | ? 
38 | able 39 | about 40 | above 41 | according 42 | accordingly 43 | across 44 | actually 45 | after 46 | afterwards 47 | again 48 | against 49 | ain't 50 | all 51 | allow 52 | allows 53 | almost 54 | alone 55 | along 56 | already 57 | also 58 | although 59 | always 60 | am 61 | among 62 | amongst 63 | an 64 | and 65 | another 66 | any 67 | anybody 68 | anyhow 69 | anyone 70 | anything 71 | anyway 72 | anyways 73 | anywhere 74 | apart 75 | appear 76 | appreciate 77 | appropriate 78 | are 79 | aren't 80 | around 81 | as 82 | a's 83 | aside 84 | ask 85 | asking 86 | associated 87 | at 88 | available 89 | away 90 | awfully 91 | be 92 | became 93 | because 94 | become 95 | becomes 96 | becoming 97 | been 98 | before 99 | beforehand 100 | behind 101 | being 102 | believe 103 | below 104 | beside 105 | besides 106 | best 107 | better 108 | between 109 | beyond 110 | both 111 | brief 112 | but 113 | by 114 | came 115 | can 116 | cannot 117 | cant 118 | can't 119 | cause 120 | causes 121 | certain 122 | certainly 123 | changes 124 | clearly 125 | c'mon 126 | co 127 | com 128 | come 129 | comes 130 | concerning 131 | consequently 132 | consider 133 | considering 134 | contain 135 | containing 136 | contains 137 | corresponding 138 | could 139 | couldn't 140 | course 141 | c's 142 | currently 143 | definitely 144 | described 145 | despite 146 | did 147 | didn't 148 | different 149 | do 150 | does 151 | doesn't 152 | doing 153 | done 154 | don't 155 | down 156 | downwards 157 | during 158 | each 159 | edu 160 | eg 161 | eight 162 | either 163 | else 164 | elsewhere 165 | enough 166 | entirely 167 | especially 168 | et 169 | etc 170 | even 171 | ever 172 | every 173 | everybody 174 | everyone 175 | everything 176 | everywhere 177 | ex 178 | exactly 179 | example 180 | except 181 | far 182 | few 183 | fifth 184 | first 185 | five 186 | followed 187 | following 188 | follows 189 | for 190 | former 191 | formerly 192 | forth 193 | four 194 | from 195 | further 196 | furthermore 197 | get 198 | gets 199 | getting 200 | given 201 | gives 202 | go 203 | goes 204 | going 205 | gone 206 | got 207 | gotten 208 | greetings 209 | had 210 | hadn't 211 | happens 212 | hardly 213 | has 214 | hasn't 215 | have 216 | haven't 217 | having 218 | he 219 | hello 220 | help 221 | hence 222 | her 223 | here 224 | hereafter 225 | hereby 226 | herein 227 | here's 228 | hereupon 229 | hers 230 | herself 231 | he's 232 | hi 233 | him 234 | himself 235 | his 236 | hither 237 | hopefully 238 | how 239 | howbeit 240 | however 241 | i'd 242 | ie 243 | if 244 | ignored 245 | i'll 246 | i'm 247 | immediate 248 | in 249 | inasmuch 250 | inc 251 | indeed 252 | indicate 253 | indicated 254 | indicates 255 | inner 256 | insofar 257 | instead 258 | into 259 | inward 260 | is 261 | isn't 262 | it 263 | it'd 264 | it'll 265 | its 266 | it's 267 | itself 268 | i've 269 | just 270 | keep 271 | keeps 272 | kept 273 | know 274 | known 275 | knows 276 | last 277 | lately 278 | later 279 | latter 280 | latterly 281 | least 282 | less 283 | lest 284 | let 285 | let's 286 | like 287 | liked 288 | likely 289 | little 290 | look 291 | looking 292 | looks 293 | ltd 294 | mainly 295 | many 296 | may 297 | maybe 298 | me 299 | mean 300 | meanwhile 301 | merely 302 | might 303 | more 304 | moreover 305 | most 306 | mostly 307 | much 308 | must 309 | my 310 | myself 311 | name 312 | namely 313 | nd 314 | near 315 | nearly 316 | necessary 317 | need 318 | needs 319 | neither 320 | never 321 | nevertheless 322 | new 323 | next 324 | nine 325 | no 326 | 
nobody 327 | non 328 | none 329 | noone 330 | nor 331 | normally 332 | not 333 | nothing 334 | novel 335 | now 336 | nowhere 337 | obviously 338 | of 339 | off 340 | often 341 | oh 342 | ok 343 | okay 344 | old 345 | on 346 | once 347 | one 348 | ones 349 | only 350 | onto 351 | or 352 | other 353 | others 354 | otherwise 355 | ought 356 | our 357 | ours 358 | ourselves 359 | out 360 | outside 361 | over 362 | overall 363 | own 364 | particular 365 | particularly 366 | per 367 | perhaps 368 | placed 369 | please 370 | plus 371 | possible 372 | presumably 373 | probably 374 | provides 375 | que 376 | quite 377 | qv 378 | rather 379 | rd 380 | re 381 | really 382 | reasonably 383 | regarding 384 | regardless 385 | regards 386 | relatively 387 | respectively 388 | right 389 | said 390 | same 391 | saw 392 | say 393 | saying 394 | says 395 | second 396 | secondly 397 | see 398 | seeing 399 | seem 400 | seemed 401 | seeming 402 | seems 403 | seen 404 | self 405 | selves 406 | sensible 407 | sent 408 | serious 409 | seriously 410 | seven 411 | several 412 | shall 413 | she 414 | should 415 | shouldn't 416 | since 417 | six 418 | so 419 | some 420 | somebody 421 | somehow 422 | someone 423 | something 424 | sometime 425 | sometimes 426 | somewhat 427 | somewhere 428 | soon 429 | sorry 430 | specified 431 | specify 432 | specifying 433 | still 434 | sub 435 | such 436 | sup 437 | sure 438 | take 439 | taken 440 | tell 441 | tends 442 | th 443 | than 444 | thank 445 | thanks 446 | thanx 447 | that 448 | thats 449 | that's 450 | the 451 | their 452 | theirs 453 | them 454 | themselves 455 | then 456 | thence 457 | there 458 | thereafter 459 | thereby 460 | therefore 461 | therein 462 | theres 463 | there's 464 | thereupon 465 | these 466 | they 467 | they'd 468 | they'll 469 | they're 470 | they've 471 | think 472 | third 473 | this 474 | thorough 475 | thoroughly 476 | those 477 | though 478 | three 479 | through 480 | throughout 481 | thru 482 | thus 483 | to 484 | together 485 | too 486 | took 487 | toward 488 | towards 489 | tried 490 | tries 491 | truly 492 | try 493 | trying 494 | t's 495 | twice 496 | two 497 | un 498 | under 499 | unfortunately 500 | unless 501 | unlikely 502 | until 503 | unto 504 | up 505 | upon 506 | us 507 | use 508 | used 509 | useful 510 | uses 511 | using 512 | usually 513 | value 514 | various 515 | very 516 | via 517 | viz 518 | vs 519 | want 520 | wants 521 | was 522 | wasn't 523 | way 524 | we 525 | we'd 526 | welcome 527 | well 528 | we'll 529 | went 530 | were 531 | we're 532 | weren't 533 | we've 534 | what 535 | whatever 536 | what's 537 | when 538 | whence 539 | whenever 540 | where 541 | whereafter 542 | whereas 543 | whereby 544 | wherein 545 | where's 546 | whereupon 547 | wherever 548 | whether 549 | which 550 | while 551 | whither 552 | who 553 | whoever 554 | whole 555 | whom 556 | who's 557 | whose 558 | why 559 | will 560 | willing 561 | wish 562 | with 563 | within 564 | without 565 | wonder 566 | won't 567 | would 568 | wouldn't 569 | yes 570 | yet 571 | you 572 | you'd 573 | you'll 574 | your 575 | you're 576 | yours 577 | yourself 578 | yourselves 579 | you've 580 | zero 581 | zt 582 | ZT 583 | zz 584 | ZZ 585 | 一 586 | 一下 587 | 一些 588 | 一切 589 | 一则 590 | 一天 591 | 一定 592 | 一方面 593 | 一旦 594 | 一时 595 | 一来 596 | 一样 597 | 一次 598 | 一片 599 | 一直 600 | 一致 601 | 一般 602 | 一起 603 | 一边 604 | 一面 605 | 万一 606 | 上下 607 | 上升 608 | 上去 609 | 上来 610 | 上述 611 | 上面 612 | 下列 613 | 下去 614 | 下来 615 | 下面 616 | 不一 617 | 不久 618 | 不仅 619 | 不会 620 | 不但 621 | 不光 622 | 不单 623 | 
不变 624 | 不只 625 | 不可 626 | 不同 627 | 不够 628 | 不如 629 | 不得 630 | 不怕 631 | 不惟 632 | 不成 633 | 不拘 634 | 不敢 635 | 不断 636 | 不是 637 | 不比 638 | 不然 639 | 不特 640 | 不独 641 | 不管 642 | 不能 643 | 不要 644 | 不论 645 | 不足 646 | 不过 647 | 不问 648 | 与 649 | 与其 650 | 与否 651 | 与此同时 652 | 专门 653 | 且 654 | 两者 655 | 严格 656 | 严重 657 | 个 658 | 个人 659 | 个别 660 | 中小 661 | 中间 662 | 丰富 663 | 临 664 | 为 665 | 为主 666 | 为了 667 | 为什么 668 | 为什麽 669 | 为何 670 | 为着 671 | 主张 672 | 主要 673 | 举行 674 | 乃 675 | 乃至 676 | 么 677 | 之 678 | 之一 679 | 之前 680 | 之后 681 | 之後 682 | 之所以 683 | 之类 684 | 乌乎 685 | 乎 686 | 乘 687 | 也 688 | 也好 689 | 也是 690 | 也罢 691 | 了 692 | 了解 693 | 争取 694 | 于 695 | 于是 696 | 于是乎 697 | 云云 698 | 互相 699 | 产生 700 | 人们 701 | 人家 702 | 什么 703 | 什么样 704 | 什麽 705 | 今后 706 | 今天 707 | 今年 708 | 今後 709 | 仍然 710 | 从 711 | 从事 712 | 从而 713 | 他 714 | 他人 715 | 他们 716 | 他的 717 | 代替 718 | 以 719 | 以上 720 | 以下 721 | 以为 722 | 以便 723 | 以免 724 | 以前 725 | 以及 726 | 以后 727 | 以外 728 | 以後 729 | 以来 730 | 以至 731 | 以至于 732 | 以致 733 | 们 734 | 任 735 | 任何 736 | 任凭 737 | 任务 738 | 企图 739 | 伟大 740 | 似乎 741 | 似的 742 | 但 743 | 但是 744 | 何 745 | 何况 746 | 何处 747 | 何时 748 | 作为 749 | 你 750 | 你们 751 | 你的 752 | 使得 753 | 使用 754 | 例如 755 | 依 756 | 依照 757 | 依靠 758 | 促进 759 | 保持 760 | 俺 761 | 俺们 762 | 倘 763 | 倘使 764 | 倘或 765 | 倘然 766 | 倘若 767 | 假使 768 | 假如 769 | 假若 770 | 做到 771 | 像 772 | 允许 773 | 充分 774 | 先后 775 | 先後 776 | 先生 777 | 全部 778 | 全面 779 | 兮 780 | 共同 781 | 关于 782 | 其 783 | 其一 784 | 其中 785 | 其二 786 | 其他 787 | 其余 788 | 其它 789 | 其实 790 | 其次 791 | 具体 792 | 具体地说 793 | 具体说来 794 | 具有 795 | 再者 796 | 再说 797 | 冒 798 | 冲 799 | 决定 800 | 况且 801 | 准备 802 | 几 803 | 几乎 804 | 几时 805 | 凭 806 | 凭借 807 | 出去 808 | 出来 809 | 出现 810 | 分别 811 | 则 812 | 别 813 | 别的 814 | 别说 815 | 到 816 | 前后 817 | 前者 818 | 前进 819 | 前面 820 | 加之 821 | 加以 822 | 加入 823 | 加强 824 | 十分 825 | 即 826 | 即令 827 | 即使 828 | 即便 829 | 即或 830 | 即若 831 | 却不 832 | 原来 833 | 又 834 | 及 835 | 及其 836 | 及时 837 | 及至 838 | 双方 839 | 反之 840 | 反应 841 | 反映 842 | 反过来 843 | 反过来说 844 | 取得 845 | 受到 846 | 变成 847 | 另 848 | 另一方面 849 | 另外 850 | 只是 851 | 只有 852 | 只要 853 | 只限 854 | 叫 855 | 叫做 856 | 召开 857 | 叮咚 858 | 可 859 | 可以 860 | 可是 861 | 可能 862 | 可见 863 | 各 864 | 各个 865 | 各人 866 | 各位 867 | 各地 868 | 各种 869 | 各级 870 | 各自 871 | 合理 872 | 同 873 | 同一 874 | 同时 875 | 同样 876 | 后来 877 | 后面 878 | 向 879 | 向着 880 | 吓 881 | 吗 882 | 否则 883 | 吧 884 | 吧哒 885 | 吱 886 | 呀 887 | 呃 888 | 呕 889 | 呗 890 | 呜 891 | 呜呼 892 | 呢 893 | 周围 894 | 呵 895 | 呸 896 | 呼哧 897 | 咋 898 | 和 899 | 咚 900 | 咦 901 | 咱 902 | 咱们 903 | 咳 904 | 哇 905 | 哈 906 | 哈哈 907 | 哉 908 | 哎 909 | 哎呀 910 | 哎哟 911 | 哗 912 | 哟 913 | 哦 914 | 哩 915 | 哪 916 | 哪个 917 | 哪些 918 | 哪儿 919 | 哪天 920 | 哪年 921 | 哪怕 922 | 哪样 923 | 哪边 924 | 哪里 925 | 哼 926 | 哼唷 927 | 唉 928 | 啊 929 | 啐 930 | 啥 931 | 啦 932 | 啪达 933 | 喂 934 | 喏 935 | 喔唷 936 | 嗡嗡 937 | 嗬 938 | 嗯 939 | 嗳 940 | 嘎 941 | 嘎登 942 | 嘘 943 | 嘛 944 | 嘻 945 | 嘿 946 | 因 947 | 因为 948 | 因此 949 | 因而 950 | 固然 951 | 在 952 | 在下 953 | 地 954 | 坚决 955 | 坚持 956 | 基本 957 | 处理 958 | 复杂 959 | 多 960 | 多少 961 | 多数 962 | 多次 963 | 大力 964 | 大多数 965 | 大大 966 | 大家 967 | 大批 968 | 大约 969 | 大量 970 | 失去 971 | 她 972 | 她们 973 | 她的 974 | 好的 975 | 好象 976 | 如 977 | 如上所述 978 | 如下 979 | 如何 980 | 如其 981 | 如果 982 | 如此 983 | 如若 984 | 存在 985 | 宁 986 | 宁可 987 | 宁愿 988 | 宁肯 989 | 它 990 | 它们 991 | 它们的 992 | 它的 993 | 安全 994 | 完全 995 | 完成 996 | 实现 997 | 实际 998 | 宣布 999 | 容易 1000 | 密切 1001 | 对 1002 | 对于 1003 | 对应 1004 | 将 1005 | 少数 1006 | 尔后 1007 | 尚且 1008 | 尤其 1009 | 就 1010 | 就是 1011 | 就是说 1012 | 尽 1013 | 尽管 1014 | 属于 1015 | 岂但 1016 | 左右 1017 | 巨大 1018 | 巩固 1019 | 己 1020 | 已经 1021 | 帮助 1022 | 常常 1023 | 并 1024 
| 并不 1025 | 并不是 1026 | 并且 1027 | 并没有 1028 | 广大 1029 | 广泛 1030 | 应当 1031 | 应用 1032 | 应该 1033 | 开外 1034 | 开始 1035 | 开展 1036 | 引起 1037 | 强烈 1038 | 强调 1039 | 归 1040 | 当 1041 | 当前 1042 | 当时 1043 | 当然 1044 | 当着 1045 | 形成 1046 | 彻底 1047 | 彼 1048 | 彼此 1049 | 往 1050 | 往往 1051 | 待 1052 | 後来 1053 | 後面 1054 | 得 1055 | 得出 1056 | 得到 1057 | 心里 1058 | 必然 1059 | 必要 1060 | 必须 1061 | 怎 1062 | 怎么 1063 | 怎么办 1064 | 怎么样 1065 | 怎样 1066 | 怎麽 1067 | 总之 1068 | 总是 1069 | 总的来看 1070 | 总的来说 1071 | 总的说来 1072 | 总结 1073 | 总而言之 1074 | 恰恰相反 1075 | 您 1076 | 意思 1077 | 愿意 1078 | 慢说 1079 | 成为 1080 | 我 1081 | 我们 1082 | 我的 1083 | 或 1084 | 或是 1085 | 或者 1086 | 战斗 1087 | 所 1088 | 所以 1089 | 所有 1090 | 所谓 1091 | 打 1092 | 扩大 1093 | 把 1094 | 抑或 1095 | 拿 1096 | 按 1097 | 按照 1098 | 换句话说 1099 | 换言之 1100 | 据 1101 | 掌握 1102 | 接着 1103 | 接著 1104 | 故 1105 | 故此 1106 | 整个 1107 | 方便 1108 | 方面 1109 | 旁人 1110 | 无宁 1111 | 无法 1112 | 无论 1113 | 既 1114 | 既是 1115 | 既然 1116 | 时候 1117 | 明显 1118 | 明确 1119 | 是 1120 | 是否 1121 | 是的 1122 | 显然 1123 | 显著 1124 | 普通 1125 | 普遍 1126 | 更加 1127 | 曾经 1128 | 替 1129 | 最后 1130 | 最大 1131 | 最好 1132 | 最後 1133 | 最近 1134 | 最高 1135 | 有 1136 | 有些 1137 | 有关 1138 | 有利 1139 | 有力 1140 | 有所 1141 | 有效 1142 | 有时 1143 | 有点 1144 | 有的 1145 | 有着 1146 | 有著 1147 | 望 1148 | 朝 1149 | 朝着 1150 | 本 1151 | 本着 1152 | 来 1153 | 来着 1154 | 极了 1155 | 构成 1156 | 果然 1157 | 果真 1158 | 某 1159 | 某个 1160 | 某些 1161 | 根据 1162 | 根本 1163 | 欢迎 1164 | 正在 1165 | 正如 1166 | 正常 1167 | 此 1168 | 此外 1169 | 此时 1170 | 此间 1171 | 毋宁 1172 | 每 1173 | 每个 1174 | 每天 1175 | 每年 1176 | 每当 1177 | 比 1178 | 比如 1179 | 比方 1180 | 比较 1181 | 沿 1182 | 沿着 1183 | 注意 1184 | 深入 1185 | 清楚 1186 | 满足 1187 | 漫说 1188 | 焉 1189 | 然则 1190 | 然后 1191 | 然後 1192 | 然而 1193 | 照 1194 | 照着 1195 | 特别是 1196 | 特殊 1197 | 特点 1198 | 现代 1199 | 现在 1200 | 甚么 1201 | 甚而 1202 | 甚至 1203 | 用 1204 | 由 1205 | 由于 1206 | 由此可见 1207 | 的 1208 | 的话 1209 | 目前 1210 | 直到 1211 | 直接 1212 | 相似 1213 | 相信 1214 | 相反 1215 | 相同 1216 | 相对 1217 | 相对而言 1218 | 相应 1219 | 相当 1220 | 相等 1221 | 省得 1222 | 看出 1223 | 看到 1224 | 看来 1225 | 看看 1226 | 看见 1227 | 真是 1228 | 真正 1229 | 着 1230 | 着呢 1231 | 矣 1232 | 知道 1233 | 确定 1234 | 离 1235 | 积极 1236 | 移动 1237 | 突出 1238 | 突然 1239 | 立即 1240 | 第 1241 | 等 1242 | 等等 1243 | 管 1244 | 紧接着 1245 | 纵 1246 | 纵令 1247 | 纵使 1248 | 纵然 1249 | 练习 1250 | 组成 1251 | 经 1252 | 经常 1253 | 经过 1254 | 结合 1255 | 结果 1256 | 给 1257 | 绝对 1258 | 继续 1259 | 继而 1260 | 维持 1261 | 综上所述 1262 | 罢了 1263 | 考虑 1264 | 者 1265 | 而 1266 | 而且 1267 | 而况 1268 | 而外 1269 | 而已 1270 | 而是 1271 | 而言 1272 | 联系 1273 | 能 1274 | 能否 1275 | 能够 1276 | 腾 1277 | 自 1278 | 自个儿 1279 | 自从 1280 | 自各儿 1281 | 自家 1282 | 自己 1283 | 自身 1284 | 至 1285 | 至于 1286 | 良好 1287 | 若 1288 | 若是 1289 | 若非 1290 | 范围 1291 | 莫若 1292 | 获得 1293 | 虽 1294 | 虽则 1295 | 虽然 1296 | 虽说 1297 | 行为 1298 | 行动 1299 | 表明 1300 | 表示 1301 | 被 1302 | 要 1303 | 要不 1304 | 要不是 1305 | 要不然 1306 | 要么 1307 | 要是 1308 | 要求 1309 | 规定 1310 | 觉得 1311 | 认为 1312 | 认真 1313 | 认识 1314 | 让 1315 | 许多 1316 | 论 1317 | 设使 1318 | 设若 1319 | 该 1320 | 说明 1321 | 诸位 1322 | 谁 1323 | 谁知 1324 | 赶 1325 | 起 1326 | 起来 1327 | 起见 1328 | 趁 1329 | 趁着 1330 | 越是 1331 | 跟 1332 | 转动 1333 | 转变 1334 | 转贴 1335 | 较 1336 | 较之 1337 | 边 1338 | 达到 1339 | 迅速 1340 | 过 1341 | 过去 1342 | 过来 1343 | 运用 1344 | 还是 1345 | 还有 1346 | 这 1347 | 这个 1348 | 这么 1349 | 这么些 1350 | 这么样 1351 | 这么点儿 1352 | 这些 1353 | 这会儿 1354 | 这儿 1355 | 这就是说 1356 | 这时 1357 | 这样 1358 | 这点 1359 | 这种 1360 | 这边 1361 | 这里 1362 | 这麽 1363 | 进入 1364 | 进步 1365 | 进而 1366 | 进行 1367 | 连 1368 | 连同 1369 | 适应 1370 | 适当 1371 | 适用 1372 | 逐步 1373 | 逐渐 1374 | 通常 1375 | 通过 1376 | 造成 1377 | 遇到 1378 | 遭到 1379 | 避免 1380 | 那 1381 | 那个 1382 | 那么 
1383 | 那么些
1384 | 那么样
1385 | 那些
1386 | 那会儿
1387 | 那儿
1388 | 那时
1389 | 那样
1390 | 那边
1391 | 那里
1392 | 那麽
1393 | 部分
1394 | 鄙人
1395 | 采取
1396 | 里面
1397 | 重大
1398 | 重新
1399 | 重要
1400 | 鉴于
1401 | 问题
1402 | 防止
1403 | 阿
1404 | 附近
1405 | 限制
1406 | 除
1407 | 除了
1408 | 除此之外
1409 | 除非
1410 | 随
1411 | 随着
1412 | 随著
1413 | 集中
1414 | 需要
1415 | 非但
1416 | 非常
1417 | 非徒
1418 | 靠
1419 | 顺
1420 | 顺着
1421 | 首先
1422 | 高兴
1423 | 是不是
1424 | 说说
1425 |
--------------------------------------------------------------------------------
/pysenti/frequency.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: word-frequency tables used by the Bayes classifier
5 | """
6 |
7 |
8 | class BaseProb(object):
9 |     def __init__(self):
10 |         self.d = {}
11 |         self.total = 0.0
12 |         self.none = 0
13 |
14 |     def exists(self, key):
15 |         return key in self.d
16 |
17 |     def getsum(self):
18 |         return self.total
19 |
20 |     def get(self, key):
21 |         if not self.exists(key):
22 |             return False, self.none
23 |         return True, self.d[key]
24 |
25 |     def freq(self, key):
26 |         return float(self.get(key)[1]) / self.total
27 |
28 |     def samples(self):
29 |         return self.d.keys()
30 |
31 |
32 | class AddOneProb(BaseProb):
33 |     def __init__(self):
34 |         self.d = {}
35 |         self.total = 0.0
36 |         self.none = 1  # unseen keys fall back to a count of 1 (add-one smoothing)
37 |
38 |     def add(self, key, value):
39 |         self.total += value
40 |         if not self.exists(key):
41 |             # first sighting: seed the count with the smoothing mass of 1
42 |             self.d[key] = 1
43 |             self.total += 1
44 |         self.d[key] += value
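A tiny illustration of the smoothing bookkeeping above (a sketch; the numbers follow directly from `add` and `freq` as defined):

```python
from pysenti.frequency import AddOneProb

p = AddOneProb()
p.add('好吃', 2)        # new key: d['好吃'] = 1 + 2 = 3, total = 0 + 2 + 1 = 3
p.add('难吃', 1)        # new key: d['难吃'] = 1 + 1 = 2, total = 3 + 1 + 1 = 5
print(p.freq('好吃'))   # 3 / 5 = 0.6
print(p.freq('未见过'))  # unseen key uses none=1: 1 / 5 = 0.2
```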
--------------------------------------------------------------------------------
/pysenti/model_classifier.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: Bayes model-based sentiment classifier
5 | """
6 | import os
7 | from pysenti.bayes import Bayes
8 | from pysenti.compat import strdecode
9 | from pysenti.tokenizer import segment
10 | from pysenti.utils import filter_stop, load_set
11 |
12 | pwd_path = os.path.abspath(os.path.dirname(__file__))
13 | default_sentiment_model_path = os.path.join(pwd_path, 'data/sentiment_model.pkl')
14 | # stop words
15 | default_stopwords_path = os.path.join(pwd_path, 'data/stopwords.txt')
16 |
17 |
18 | class ModelClassifier:
19 |     def __init__(self, model_path=default_sentiment_model_path, stopwords_path=default_stopwords_path):
20 |         self.classifier = Bayes()
21 |         self.model_path = model_path
22 |         self.stopwords = load_set(stopwords_path)
23 |         if model_path:
24 |             self.classifier.load(self.model_path)
25 |
26 |     def save(self):
27 |         self.classifier.save(self.model_path)
28 |
29 |     def handle(self, doc):
30 |         # segment the document and drop stop words
31 |         words = segment(doc)
32 |         words = filter_stop(words, self.stopwords)
33 |         return words
34 |
35 |     def train(self, neg_docs, pos_docs):
36 |         data = []
37 |         for sent in neg_docs:
38 |             data.append([self.handle(sent), 'neg'])
39 |         for sent in pos_docs:
40 |             data.append([self.handle(sent), 'pos'])
41 |         self.classifier.train(data)
42 |
43 |     def classify(self, text):
44 |         """
45 |         Sentiment classification of a text.
46 |         :param text: str
47 |         :return: dict, {"positive_prob": float, "negative_prob": float}
48 |         """
49 |         result = {"positive_prob": 0.0, "negative_prob": 0.0}
50 |         text = strdecode(text)
51 |         ret, prob = self.classifier.classify(self.handle(text))
52 |         if ret == 'pos':
53 |             result["positive_prob"] = prob
54 |             result["negative_prob"] = 1 - prob
55 |         else:
56 |             result["negative_prob"] = prob
57 |             result["positive_prob"] = 1 - prob
58 |         return result
59 |
60 |
61 | if __name__ == '__main__':
62 |     model = ModelClassifier()
63 |     a_sentence = ['剁椒鸡蛋好难吃。绝对没人受得了',
64 |                   '土豆丝很好吃', '土豆丝很难吃',
65 |                   '这笔钱是个天文数字',
66 |                   '我一会儿出去玩了,你吃啥?给你带']
67 |     for i in a_sentence:
68 |         r = model.classify(i)
69 |         print(i, r)
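Retraining follows the pattern in pysenti/train.py further below: the constructor loads the pickle at `model_path`, `train` adds your word counts on top of the loaded model, and `save` writes it back. A hedged sketch with toy in-memory samples (the four example sentences are illustrative only):

```python
from pysenti.model_classifier import ModelClassifier, default_sentiment_model_path

# point at an existing model file: __init__ loads it whenever model_path is truthy
m = ModelClassifier(model_path=default_sentiment_model_path)

neg_docs = ['土豆丝很难吃', '这个酒店真差劲']        # toy negative samples
pos_docs = ['土豆丝很好吃', '苹果是一家伟大的公司']  # toy positive samples
m.train(neg_docs, pos_docs)  # counts accumulate on top of the bundled model

print(m.classify('味道不错'))
# m.save()  # would overwrite the bundled pickle; only do this deliberately
```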
--------------------------------------------------------------------------------
/pysenti/rule_classfier.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: rule- and dictionary-based sentiment classifier
5 | """
6 | import os
7 | from codecs import open
8 | from loguru import logger
9 | from pysenti import tokenizer
10 | from pysenti.compat import strdecode
11 | from pysenti.utils import split_sentence
12 |
13 | pwd_path = os.path.abspath(os.path.dirname(__file__))
14 |
15 | # sentiment dictionary, covering positive and negative words
16 | sentiment_dict_path = os.path.join(pwd_path, 'data/sentiment_dict.txt')
17 | # conjunction dictionary
18 | conjunction_dict_path = os.path.join(pwd_path, 'data/conjunction_dict.txt')
19 | # adverb dictionary
20 | adverb_dict_path = os.path.join(pwd_path, 'data/adverb_dict.txt')
21 | # negation-word dictionary
22 | denial_dict_path = os.path.join(pwd_path, 'data/denial_dict.txt')
23 |
24 |
25 | class RuleClassifier:
26 |     def __init__(self):
27 |         self.name = "rule_classifier"
28 |         self.sentiment_dict = {}
29 |         self.conjunction_dict = {}
30 |         self.adverb_dict = {}
31 |         self.denial_dict = {}
32 |         self.user_sentiment_dict = {}
33 |         self.inited = False
34 |
35 |     def init(self, sentiment_dict_path=sentiment_dict_path):
36 |         # load the dictionaries
37 |         self.sentiment_dict = self._get_dict(sentiment_dict_path)
38 |         self.conjunction_dict = self._get_dict(conjunction_dict_path)  # conjunctions
39 |         self.adverb_dict = self._get_dict(adverb_dict_path)  # adverbs
40 |         self.denial_dict = self._get_dict(denial_dict_path)
41 |         self.inited = True
42 |
43 |     def load_user_sentiment_dict(self, path):
44 |         if not self.inited:
45 |             self.init()
46 |         self.user_sentiment_dict = self._get_dict(path)
47 |         self.sentiment_dict.update(self.user_sentiment_dict)
48 |
49 |     def classify(self, text):
50 |         if not self.inited:
51 |             self.init()
52 |         # overall result structure for the analysis
53 |         result = {"score": 0}
54 |         text = strdecode(text)
55 |         # split into clauses
56 |         clauses = split_sentence(text)
57 |         # analyze each clause separately
58 |         for i in range(len(clauses)):
59 |             # per-clause analysis structure
60 |             sub_clause = self._analyse_clause(clauses[i])
61 |
62 |             # merge the clause result into the overall structure
63 |             result["sub_clause" + str(i)] = sub_clause
64 |             result["score"] += sub_clause["score"]
65 |
66 |         return result
67 |
68 |     def _analyse_clause(self, clause):
69 |         sub_clause = {"score": 0, "sentiment": [], "conjunction": []}
70 |         seg_result = tokenizer.segment(clause, pos=False)
71 |
72 |         # inspect each token
73 |         for index, word in enumerate(seg_result):
74 |             # conjunction?
75 |             r = self._is_word_conjunction(word)
76 |             if r:
77 |                 sub_clause["conjunction"].append(r)
78 |
79 |             # sentiment word? pass the index so the look-back windows work
80 |             r = self._is_word_sentiment(word, seg_result, index)
81 |             if r:
82 |                 sub_clause["sentiment"].append(r)
83 |                 sub_clause["score"] += r["score"]
84 |
85 |         # scale the clause score by its conjunctions
86 |         for a_conjunction in sub_clause["conjunction"]:
87 |             sub_clause["score"] *= a_conjunction["value"]
88 |
89 |         return sub_clause
90 |
91 |     def _is_word_conjunction(self, the_word):
92 |         r = {}
93 |         if the_word in self.conjunction_dict:
94 |             r = {"key": the_word, "value": self.conjunction_dict[the_word]}
95 |         return r
96 |
97 |     def _is_word_sentiment(self, the_word, seg_result, index=-1):
98 |         r = {}
99 |         # check whether the token is in the sentiment dictionary
100 |         if the_word in self.sentiment_dict:
101 |             # if so, build a dict structure centered on the sentiment word
102 |             r = self._emotional_word_analysis(the_word, self.sentiment_dict[the_word], seg_result, index)
103 |         # otherwise r stays empty
104 |         return r
105 |
106 |     def _emotional_word_analysis(self, core_word, value, segments, index):
107 |         # build a dict structure centered on the sentiment word
108 |         orientation = {"key": core_word, "adverb": [], "denial": [], "value": value}
109 |         orientation_score = value
110 |
111 |         # look back over a three-token window for negations and adverbs
112 |         view_window = index - 1
113 |         if view_window > -1:  # still in bounds
114 |             # stop if the previous token is itself a sentiment word
115 |             if segments[view_window] in self.sentiment_dict:
116 |                 orientation["score"] = orientation_score
117 |                 return orientation
118 |             # adverb?
119 |             if segments[view_window] in self.adverb_dict:
120 |                 # build the adverb dict structure
121 |                 adverb = {"key": segments[view_window], "sentiment": 1,
122 |                           "value": self.adverb_dict[segments[view_window]]}
123 |                 orientation["adverb"].append(adverb)
124 |                 orientation_score *= self.adverb_dict[segments[view_window]]
125 |             # negation?
126 |             elif segments[view_window] in self.denial_dict:
127 |                 # build the negation dict structure
128 |                 denial = {"key": segments[view_window], "sentiment": 1,
129 |                           "value": self.denial_dict[segments[view_window]]}
130 |                 orientation["denial"].append(denial)
131 |                 orientation_score *= -1
132 |         view_window = index - 2
133 |         if view_window > -1:
134 |             # stop if the token two back is itself a sentiment word
135 |             if segments[view_window] in self.sentiment_dict:
136 |                 orientation['score'] = orientation_score
137 |                 return orientation
138 |             if segments[view_window] in self.adverb_dict:
139 |                 adverb = {"key": segments[view_window], "sentiment": 2,
140 |                           "value": self.adverb_dict[segments[view_window]]}
141 |                 orientation_score *= self.adverb_dict[segments[view_window]]
142 |                 orientation["adverb"].insert(0, adverb)
143 |             elif segments[view_window] in self.denial_dict:
144 |                 denial = {"key": segments[view_window], "sentiment": 2,
145 |                           "value": self.denial_dict[segments[view_window]]}
146 |                 orientation["denial"].insert(0, denial)
147 |                 orientation_score *= -1
148 |                 # detect the "不是很好" pattern ("not very good", as opposed to "很不好")
149 |                 if len(orientation["adverb"]) > 0:
150 |                     # if matched, damp the score by a tuning factor of 0.3
151 |                     orientation_score *= 0.3
152 |         view_window = index - 3
153 |         if view_window > -1:
154 |             # stop if the token three back is itself a sentiment word
155 |             if segments[view_window] in self.sentiment_dict:
156 |                 orientation["score"] = orientation_score
157 |                 return orientation
158 |             if segments[view_window] in self.adverb_dict:
159 |                 adverb = {"key": segments[view_window], "sentiment": 3,
160 |                           "value": self.adverb_dict[segments[view_window]]}
161 |                 orientation_score *= self.adverb_dict[segments[view_window]]
162 |                 orientation["adverb"].insert(0, adverb)
163 |             elif segments[view_window] in self.denial_dict:
164 |                 denial = {"key": segments[view_window], "sentiment": 3,
165 |                           "value": self.denial_dict[segments[view_window]]}
166 |                 orientation["denial"].insert(0, denial)
167 |                 orientation_score *= -1
168 |                 # detect the "不是很好" pattern ("not very good", as opposed to "很不好")
169 |                 if len(orientation["adverb"]) > 0 and len(orientation["denial"]) == 0:
170 |                     orientation_score *= 0.3
171 |         # attach the final sentiment score
172 |         orientation["score"] = orientation_score
173 |         # return the assembled structure
174 |         return orientation
175 |
176 |     @staticmethod
177 |     def _get_dict(path, encoding="utf-8"):
178 |         """
179 |         Build a word->weight dict from a dictionary file.
180 |         :param path: dictionary file, one "word weight" pair per line
181 |         :param encoding: file encoding
182 |         :return: dict
183 |         """
184 |         sentiment_dict = {}
185 |         with open(path, 'r', encoding=encoding) as f:
186 |             c = 0
187 |             for line in f:
188 |                 parts = line.strip().split()
189 |                 c += 1
190 |                 if len(parts) == 2:
191 |                     sentiment_dict[parts[0]] = float(parts[1])
192 |                 else:
193 |                     logger.error(f"num: {c}, {line}")
194 |             return sentiment_dict
195 |
196 |
197 | if __name__ == '__main__':
198 |     d = RuleClassifier()
199 |
200 |     a_sentence = ['剁椒鸡蛋好难吃。绝对没人受得了',
201 |                   '土豆丝很好吃', '土豆丝很难吃',
202 |                   '这笔钱是个天文数字',
203 |                   '我一会儿出去玩了,你吃啥?给你带,然而你不知道']
204 |     for i in a_sentence:
205 |         r = d.classify(i)
206 |         print(i, r)
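The 0.3 damping above encodes the difference between "不是很好" ("not very good", a weak negative) and "很不好" ("very bad", a strong negative): a negation found two or three tokens before the sentiment word, with an adverb already collected in between, shrinks the score instead of merely flipping its sign. A quick probe of that behavior (a sketch; exact scores depend on the shipped dictionaries and on jieba's segmentation, so none are asserted here):

```python
from pysenti.rule_classfier import RuleClassifier

m = RuleClassifier()
for text in ['很好', '很不好', '不是很好']:
    print(text, round(m.classify(text)['score'], 3))
# expected shape: '不是很好' should land closer to zero than '很不好'
```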
205 |                   '这笔钱是个天文数字',
206 |                   '我一会儿出去玩了,你吃啥?给你带,然而你不知道']
207 |     for i in a_sentence:
208 |         r = d.classify(i)
209 |         print(i, r)
210 | 
--------------------------------------------------------------------------------
/pysenti/tokenizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: tokenizer
5 | """
6 | import logging
7 | 
8 | import jieba
9 | from jieba import posseg
10 | 
11 | from .compat import strdecode
12 | 
13 | jieba.default_logger.setLevel(logging.ERROR)
14 | 
15 | 
16 | def segment(sentence, cut_type='word', pos=False):
17 |     """
18 |     Segment a sentence into tokens.
19 |     :param sentence: input text
20 |     :param cut_type: 'word' uses jieba.lcut; 'char' uses list(sentence)
21 |     :param pos: enable POS tagging
22 |     :return: list of tokens, or (tokens, pos_tags) for cut_type='char' with pos=True
23 |     """
24 |     sentence = strdecode(sentence)
25 |     if pos:
26 |         if cut_type == 'word':
27 |             return posseg.lcut(sentence)
28 |         elif cut_type == 'char':
29 |             word_seq = list(sentence)
30 |             pos_seq = []
31 |             for w in word_seq:
32 |                 w_p = posseg.lcut(w)
33 |                 pos_seq.append(w_p[0].flag)
34 |             return word_seq, pos_seq
35 |     else:
36 |         if cut_type == 'word':
37 |             return jieba.lcut(sentence)
38 |         elif cut_type == 'char':
39 |             return list(sentence)
--------------------------------------------------------------------------------
/pysenti/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: train the Bayes sentiment model
5 | """
6 | import os
7 | from codecs import open
8 | 
9 | from pysenti.model_classifier import ModelClassifier
10 | 
11 | 
12 | def train(neg_file, pos_file, model_path):
13 |     neg = open(neg_file, 'r', 'utf-8').readlines()
14 |     pos = open(pos_file, 'r', 'utf-8').readlines()
15 |     neg_docs = []
16 |     pos_docs = []
17 |     for line in neg:
18 |         neg_docs.append(line.rstrip("\r\n"))
19 |     for line in pos:
20 |         pos_docs.append(line.rstrip("\r\n"))
21 |     global classifier
22 |     classifier = ModelClassifier(model_path)
23 |     classifier.train(neg_docs, pos_docs)
24 | 
25 | 
26 | def save():
27 |     classifier.save()
28 | 
29 | 
30 | def classify(sent):
31 |     return classifier.classify(sent)
32 | 
33 | 
34 | if __name__ == '__main__':
35 |     pwd_path = os.path.abspath(os.path.dirname(__file__))
36 |     default_sentiment_model_path = os.path.join(pwd_path, 'data/sentiment_model.pkl')
37 | 
38 |     # Resolve the corpus paths relative to the package dir so the
39 |     # script works from any working directory
40 |     train(os.path.join(pwd_path, 'data/neg_sentences.txt'),
41 |           os.path.join(pwd_path, 'data/pos_sentences.txt'),
42 |           default_sentiment_model_path)
43 |     save()
44 |     txt = "苹果是一家伟大的公司"
45 |     print(txt, ' prob: ', classify(txt))
--------------------------------------------------------------------------------
/pysenti/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description: shared helpers
5 | """
6 | import os
7 | import pickle
8 | import re
9 | from codecs import open
10 | 
11 | 
12 | def load_set(path):
13 |     words = set()
14 |     with open(path, 'r', 'utf-8') as f:
15 |         for line in f:
16 |             words.add(line.strip())
17 |     return words
18 | 
19 | 
20 | re_zh = re.compile('([\u4E00-\u9FA5]+)')
21 | 
22 | 
23 | def filter_stop(words, stopwords):
24 |     return list(filter(lambda x: x not in stopwords, words))
25 | 
26 | 
27 | def load_pkl(pkl_path):
28 |     """
29 |     Load a pickled object from file.
30 |     :param pkl_path:
31 |     :return:
32 |     """
33 |     with open(pkl_path, 'rb') as f:
34 |         result = pickle.load(f)
35 |     return result
36 | 
37 | 
38 | def dump_pkl(vocab, pkl_path, overwrite=True):
39 |     """
40 |     Pickle an object to file.
41 |     :param vocab: object to pickle
42 |     :param pkl_path:
43 |     :param overwrite:
44 |     :return:
45 |     """
46 |     if pkl_path and os.path.exists(pkl_path) and not overwrite:
47 |         return
48 |     if pkl_path:
49 |         with open(pkl_path, 'wb') as f:
50 |             pickle.dump(vocab, f, protocol=0)
51 |         print("save %s ok." % pkl_path)
52 |     else:
53 |         raise IOError("pkl_path is empty: %s" % pkl_path)
54 | 
55 | 
56 | def split_sentence(sentence):
57 |     # Split on Chinese and ASCII clause punctuation, dropping empty pieces
58 |     pattern = re.compile(u"[,。%、!!??,;~~.… ]+")
59 |     clauses = [i for i in pattern.split(sentence.strip()) if i]
60 |     return clauses
61 | 
62 | 
63 | if __name__ == '__main__':
64 |     sent = "nihao,我是警察,你站起来。我要问你话!好不。"
65 |     k = split_sentence(sent)
66 |     print(k)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jieba
2 | loguru
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author:XuMing(xuming624@qq.com)
4 | @description:
5 | """
6 | from setuptools import setup, find_packages
7 | 
8 | with open('README.md', 'r', encoding='utf-8') as f:
9 |     readme = f.read()
10 | 
11 | setup(
12 |     name='pysenti',
13 |     version='0.1.9',
14 |     description='Chinese Sentiment Classifier',
15 |     long_description=readme,
16 |     long_description_content_type='text/markdown',
17 |     author='XuMing',
18 |     author_email='xuming624@qq.com',
19 |     url='https://github.com/shibing624/pysenti',
20 |     license="Apache 2.0",
21 |     classifiers=[
22 |         'Intended Audience :: Developers',
23 |         'Operating System :: OS Independent',
24 |         'Natural Language :: Chinese (Simplified)',
25 |         'Natural Language :: Chinese (Traditional)',
26 |         'Programming Language :: Python',
27 |         'Programming Language :: Python :: 3',
28 |         'Topic :: Text Processing',
29 |         'Topic :: Text Processing :: Indexing',
30 |         'Topic :: Text Processing :: Linguistic',
31 |     ],
32 |     keywords='NLP,sentiment-classifier,sentiment-classification,pysenti',
33 |     install_requires=['jieba', 'loguru'],
34 |     packages=find_packages(),
35 |     package_dir={'pysenti': 'pysenti'},
36 |     package_data={'pysenti': ['*.*', 'data/*']}
37 | )
--------------------------------------------------------------------------------
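
A minimal end-to-end usage sketch for the two classifiers above, assuming the package is installed with its bundled data. The path 'my_dict.txt' is a hypothetical user dictionary in the two-column "word score" format that RuleClassifier._get_dict parses; everything else follows the __main__ demos in this repository.

# -*- coding: utf-8 -*-
from pysenti.model_classifier import ModelClassifier
from pysenti.rule_classfier import RuleClassifier

rule = RuleClassifier()
# Optional: merge extra entries into the built-in sentiment dictionary.
# 'my_dict.txt' is a hypothetical path, one "word score" pair per line.
# rule.load_user_sentiment_dict('my_dict.txt')

model = ModelClassifier()  # no-arg constructor, as in the model_classifier demo

text = '土豆丝很好吃'
print(rule.classify(text)["score"])  # signed rule score; > 0 reads as positive
print(model.classify(text))          # probability output, labeled 'prob' in train.py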