├── .gitignore
├── LICENSE
├── README.md
├── examples
    ├── model_classifier_demo.py
    ├── rule_classifier_demo.py
    ├── user_sentiment_dict.txt
    └── userdict_demo.py
├── pysenti
    ├── __init__.py
    ├── __main__.py
    ├── bayes.py
    ├── compat.py
    ├── data
    │   ├── adverb_dict.txt
    │   ├── conjunction_dict.txt
    │   ├── denial_dict.txt
    │   ├── neg_sentences.txt
    │   ├── pos_sentences.txt
    │   ├── process.py
    │   ├── sentiment_dict.txt
    │   ├── sentiment_model.pkl
    │   └── stopwords.txt
    ├── frequency.py
    ├── model_classifier.py
    ├── rule_classfier.py
    ├── tokenizer.py
    ├── train.py
    └── utils.py
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![PyPI version](https://badge.fury.io/py/pysenti.svg)](https://badge.fury.io/py/pysenti)
  2 | [![Downloads](https://static.pepy.tech/badge/pysenti)](https://pepy.tech/project/pysenti)
  3 | [![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/shibing624/pysenti/LICENSE)
  4 | ![Language](https://img.shields.io/badge/Language-Python-blue.svg)
  5 | 
  6 | 
  7 | # pysenti
  8 | 
  9 | Chinese Sentiment Classification Tool for Python. 中文情感极性分析工具。
 10 | 
 11 | **pysenti**基于规则词典的情感极性分析，扩展性强，可作为调研用的基准方法。
 12 | 
 13 | ## Question
 14 | 文本情感极性（倾向）分析咋做？
 15 | 
 16 | ## Solution
 17 | ### 规则的解决思路
 18 | 1. 中文情感极性分析，文本切分为段落，再切词，通过情感词标识出各个词语的情感极性，包括积极、中立、消极。
 19 | 2. 结合句子结构（包括连词、否定词、副词、标点等）给各情感词语的情感极性赋予权重，然后加权求和得到文本的情感极性得分。
 20 | 3. 优点：泛化性好，规则可扩展性强，所有领域通用。
 21 | 4. 缺点：规则词典收集困难，专家系统的权重设定有局限，单一领域准确率相比模型方法低。
 22 | 
 23 | ### 模型的解决思路
 24 | 1. 常见的[NLP文本分类模型](https://github.com/shibing624/text-classifier)均可，包括经典文本分类模型（LR、SVM、Xgboost等）和深度文本分类模型（TextCNN、Bi-LSTM、BERT等）。
 25 | 2. 优点：单一领域准召率高。
 26 | 3. 缺点：不通用，有标注数据的样本收集困难，扩展性弱。
 27 | 
 28 | ## Feature
 29 | ### 规则
 30 | * [情感词典](https://github.com/shibing624/pysenti/tree/master/pysenti/data)整合了`知网情感词典`、`清华大学李军情感词典`、[BosonNLP情感词典](https://bosonnlp.com/dev/resource)、`否定词词典`。
 31 | 
 32 | ### 模型
 33 | * bayes 文本分类模型
 34 | * [样本数据](https://github.com/shibing624/pysenti/tree/master/pysenti/data)来自商品评论数据，分为积极、消极两类。
 35 | 
 36 | ## Demo
 37 | 
 38 | Official Demo: https://www.mulanai.com/product/sentiment_classify/
 39 | 
 40 | ## Install
 41 | * 全自动安装：pip install pysenti
 42 | * 半自动安装：
 43 | ```shell
 44 | git clone https://github.com/shibing624/pysenti.git
 45 | cd pysenti
 46 | python3 setup.py install
 47 | ```
 48 | 
 49 | ## Usage
 50 | ### 规则方法
 51 | example: [examples/rule_classifier_demo.py](examples/rule_classifier_demo.py)
 52 | 
 53 | ```python
 54 | import pysenti
 55 | 
 56 | texts = ["苹果是一家伟大的公司",
 57 |          "土豆丝很好吃",
 58 |          "土豆丝很难吃"]
 59 | for i in texts:
 60 |     r = pysenti.classify(i)
 61 |     print(i, r['score'], r)
 62 | 
 63 | ```
 64 | 
 65 | output:
 66 | ```shell
 67 | 苹果是一家伟大的公司 3.4346924811096997 {'score': 3.4346924811096997, 'sub_clause0': {'score': 3.4346924811096997, 'sentiment': [{'key': '苹果', 'adverb': [], 'denial': [], 'value': 1.37846341627, 'score': 1.37846341627}, {'key': '是', 'adverb': [], 'denial': [], 'value': -0.252600480826, 'score': -0.252600480826}, {'key': '一家', 'adverb': [], 'denial': [], 'value': 1.48470161748, 'score': 1.48470161748}, {'key': '伟大', 'adverb': [], 'denial': [], 'value': 1.14925252286, 'score': 1.14925252286}, {'key': '的', 'adverb': [], 'denial': [], 'value': 0.0353323193687, 'score': 0.0353323193687}, {'key': '公司', 'adverb': [], 'denial': [], 'value': -0.360456914043, 'score': -0.360456914043}], 'conjunction': []}}
 68 | 土豆丝很好吃 2.294311221077 {'score': 2.294311221077, 'sub_clause0': {'score': 2.294311221077, 'sentiment': [{'key': '土豆丝', 'adverb': [], 'denial': [], 'value': 0.294892711165, 'score': 0.294892711165}, {'key': '很', 'adverb': [], 'denial': [], 'value': 0.530242664632, 'score': 0.530242664632}, {'key': '好吃', 'adverb': [], 'denial': [], 'value': 1.46917584528, 'score': 1.46917584528}], 'conjunction': []}}
 69 | 土豆丝很难吃 -2.381874203563 {'score': -2.381874203563, 'sub_clause0': {'score': -2.381874203563, 'sentiment': [{'key': '土豆丝', 'adverb': [], 'denial': [], 'value': 0.294892711165, 'score': 0.294892711165}, {'key': '很', 'adverb': [], 'denial': [], 'value': 0.530242664632, 'score': 0.530242664632}, {'key': '难吃', 'adverb': [], 'denial': [], 'value': -3.20700957936, 'score': -3.20700957936}], 'conjunction': []}}
 70 | ```
 71 | > score: 正值是积极情感；负值是消极情感。
 72 | 
 73 | ### 模型方法
 74 | example: [examples/model_classifier_demo.py](examples/model_classifier_demo.py)
 75 | 
 76 | ```python
 77 | from pysenti import ModelClassifier
 78 | 
 79 | texts = ["苹果是一家伟大的公司",
 80 |          "土豆丝很好吃",
 81 |          "垃圾，在酒店中应该是很差的！",
 82 |          "我们刚走过一个烧烤店",
 83 |          "土豆丝很难吃"]
 84 | 
 85 | m = ModelClassifier()
 86 | for i in texts:
 87 |     r = m.classify(i)
 88 |     print(i, r)
 89 | ```
 90 | 
 91 | output：
 92 | ```shell
 93 | 苹果是一家伟大的公司 {'positive_prob': 0.682, 'negative_prob': 0.318}
 94 | 土豆丝很好吃 {'positive_prob': 0.601, 'negative_prob': 0.399}
 95 | 土豆丝很难吃 {'positive_prob': 0.283, 'negative_prob': 0.717}
 96 | ```
 97 | 
 98 | ### 延迟加载机制
 99 | 
100 | pysenti 采用延迟加载，`import pysenti` 和 `from pysenti import rule_classifier` 不会立即触发词典的加载，一旦有必要才开始加载词典。如果你想手工初始 pysenti，也可以手动初始化。
101 | ```python
102 | import pysenti
103 | pysenti.rule_classifier.init()  # 手动初始化（可选）
104 | ```
105 | 
106 | 你还可以使用自定义情感词典:
107 | ```python
108 | pysenti.rule_classifier.init('user_sentiment_dict.txt')
109 | ```
110 | 情感词典`user_sentiment_dict.txt`的格式如下：
111 | ```shell
112 | 难吃 -10
113 | 好吃 10
114 | ```
115 | 空格间隔，第一个是词，第二个是分值：正值代表积极情感，负值代表消极情感，越大情感越强烈。
116 | 
117 | ### 命令行
118 | 
119 | 使用示例： python -m pysenti news.txt > news_result.txt
120 | 
121 | 命令行选项（翻译）：
122 | ```shell
123 | 使用: python -m pysenti [options] filename
124 | 
125 | 命令行界面
126 | 
127 | 固定参数:
128 |   filename              输入文件
129 | 
130 | 可选参数:
131 |   -h, --help            显示此帮助信息并退出
132 |   -d DICT, --dict DICT  使用 DICT 代替默认词典
133 |   -u USER_DICT, --user-dict USER_DICT
134 |                         使用 USER_DICT 作为附加词典，与默认词典或自定义词典配合使用
135 |   -a, --output-all      输出句子及词级别情感分析详细信息
136 |   -V, --version         显示版本信息并退出
137 | 
138 | 如果没有指定文件名，则使用标准输入。
139 | ```
140 | 
141 | 
142 | `--help`选项输出：
143 | ```shell
144 | $> python -m pysenti --help
145 | 
146 | usage: python3 -m pysenti [options] filename
147 | 
148 | pysenti command line interface.
149 | 
150 | positional arguments:
151 |   filename              input file
152 | 
153 | optional arguments:
154 |   -h, --help            show this help message and exit
155 |   -d DICT, --dict DICT  use DICT as dictionary
156 |   -u USER_DICT, --user-dict USER_DICT
157 |                         use USER_DICT together with the default dictionary or
158 |                         DICT (if specified)
159 |   -a, --output-all      output text sentiment score and word sentiment info
160 |   -V, --version         show program's version number and exit
161 | 
162 | If no filename specified, use STDIN instead.
163 | ```
164 | 
165 | ## Reference
166 | 
167 | - snownlp
168 | - SentimentPolarityAnalysis
169 | 


--------------------------------------------------------------------------------
/examples/model_classifier_demo.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description: 
 5 | """
 6 | from pysenti import ModelClassifier
 7 | 
 8 | texts = ["苹果是一家伟大的公司",
 9 |          "土豆丝很好吃",
10 |          "垃圾，在酒店中应该是很差的！",
11 |          "我们刚走过一个烧烤店",
12 |          "土豆丝很难吃"]
13 | 
14 | if __name__ == '__main__':
15 |     m = ModelClassifier()
16 |     for i in texts:
17 |         r = m.classify(i)
18 |         print(i, r)
19 | 


--------------------------------------------------------------------------------
/examples/rule_classifier_demo.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description:
 5 | """
 6 | import pysenti
 7 | 
 8 | if __name__ == '__main__':
 9 |     texts = [
10 |         "这个电影真心很难看，大多数人都不喜欢，被骂很久，早就习惯了。",
11 |         "苹果是一家伟大的公司",
12 |         "土豆丝很好吃",
13 |         "土豆丝很难吃"
14 |     ]
15 |     for i in texts:
16 |         r = pysenti.classify(i)
17 |         print(i, r['score'])
18 |         print(r)
19 | 


--------------------------------------------------------------------------------
/examples/user_sentiment_dict.txt:
--------------------------------------------------------------------------------
1 | 很硬 -10
2 | 难吃 -10
3 | fuck -5
4 | 好吃 10
5 | 很香 10
6 | 甜美 5


--------------------------------------------------------------------------------
/examples/userdict_demo.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description: 
 5 | """
 6 | 
 7 | from pysenti import RuleClassifier
 8 | 
 9 | if __name__ == '__main__':
10 |     m = RuleClassifier()
11 |     m.load_user_sentiment_dict('user_sentiment_dict.txt')
12 |     print(m.user_sentiment_dict)
13 | 
14 |     a_sentences = ['剁椒鸡蛋好难吃。绝对没人受得了',
15 |                    '土豆丝很好吃', '土豆丝很难吃',
16 |                    '这笔钱是个天文数字',
17 |                    '啥也不是',
18 |                    '我一会儿出去玩了，你吃啥？给你带,然而你不知道']
19 |     for i in a_sentences:
20 |         r = m.classify(i)
21 |         print(i, r['score'])
22 | 


--------------------------------------------------------------------------------
/pysenti/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description: 
 5 | """
 6 | 
 7 | from pysenti.compat import strdecode
 8 | from pysenti.model_classifier import ModelClassifier
 9 | from pysenti.rule_classfier import RuleClassifier
10 | 
11 | __version__ = '0.1.9'
12 | 
13 | rule_classifier = RuleClassifier()
14 | classify = rule_classifier.classify
15 | 
16 | 


--------------------------------------------------------------------------------
/pysenti/__main__.py:
--------------------------------------------------------------------------------
 1 | """pysenti command line interface."""
 2 | 
 3 | import sys
 4 | from argparse import ArgumentParser
 5 | 
 6 | from pysenti import RuleClassifier, __version__
 7 | from pysenti.compat import PY2, default_encoding
 8 | 
 9 | 
10 | def main(args):
11 |     rule_classifier = RuleClassifier()
12 |     fp = open(args.filename, 'r') if args.filename else sys.stdin
13 | 
14 |     if args.dict:
15 |         rule_classifier.init(sentiment_dict_path=args.dict)
16 |     else:
17 |         rule_classifier.init()
18 |     if args.user_dict:
19 |         rule_classifier.load_user_sentiment_dict(args.user_dict)
20 | 
21 |     ln = fp.readline()
22 |     while ln:
23 |         r = rule_classifier.classify(ln.rstrip())
24 |         if args.output_all:
25 |             result = str(r)
26 |         else:
27 |             result = str(r['score'])
28 |         if PY2:
29 |             result = result.encode(default_encoding)
30 |         print(result)
31 |         ln = fp.readline()
32 | 
33 |     fp.close()
34 | 
35 | 
36 | if __name__ == '__main__':
37 |     parser = ArgumentParser(usage="%s -m pysenti [options] filename" % sys.executable,
38 |                             description="pysenti command line interface.",
39 |                             epilog="If no filename specified, use STDIN instead.")
40 |     parser.add_argument("-d", "--dict", help="use DICT as dictionary")
41 |     parser.add_argument("-u", "--user-dict",
42 |                         help="use USER_DICT together with the default dictionary or DICT (if specified)")
43 |     parser.add_argument("-a", "--output-all",
44 |                         action="store_true", default=False,
45 |                         help="output text sentiment score and word sentiment info")
46 |     parser.add_argument("-V", '--version', action='version',
47 |                         version="pysenti " + __version__)
48 |     parser.add_argument("filename", nargs='?', help="input file")
49 |     args = parser.parse_args()
50 | 
51 |     main(args)
52 | 


--------------------------------------------------------------------------------
/pysenti/bayes.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description:
 5 | """
 6 | from math import log, exp
 7 | 
 8 | from pysenti.frequency import AddOneProb
 9 | from pysenti.utils import dump_pkl, load_pkl
10 | 
11 | 
12 | class Bayes(object):
13 |     def __init__(self):
14 |         self.d = {}
15 |         self.total = 0
16 | 
17 |     def save(self, fname):
18 |         d = {'total': self.total, 'd': {}}
19 |         for k, v in self.d.items():
20 |             d['d'][k] = v.__dict__
21 |         dump_pkl(d, fname)
22 | 
23 |     def load(self, fname):
24 |         d = load_pkl(fname)
25 |         self.total = d['total']
26 |         self.d = {}
27 |         for k, v in d['d'].items():
28 |             self.d[k] = AddOneProb()
29 |             self.d[k].__dict__ = v
30 | 
31 |     def train(self, data):
32 |         for d in data:
33 |             c = d[1]
34 |             if c not in self.d:
35 |                 self.d[c] = AddOneProb()
36 |             for word in d[0]:
37 |                 self.d[c].add(word, 1)
38 |         self.total = sum(map(lambda x: self.d[x].getsum(), self.d.keys()))
39 | 
40 |     def classify(self, x):
41 |         tmp = {}
42 |         for k in self.d:
43 |             tmp[k] = log(self.d[k].getsum()) - log(self.total)
44 |             for word in x:
45 |                 tmp[k] += log(self.d[k].freq(word))
46 |         ret, prob = 0, 0
47 |         for k in self.d:
48 |             now = 0
49 |             try:
50 |                 for otherk in self.d:
51 |                     now += exp(tmp[otherk] - tmp[k])
52 |                 now = 1 / now
53 |             except OverflowError:
54 |                 now = 0
55 |             if now > prob:
56 |                 ret, prob = k, now
57 |         return ret, prob
58 | 


--------------------------------------------------------------------------------
/pysenti/compat.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description:
 5 | """
 6 | import os
 7 | import sys
 8 | 
 9 | try:
10 |     import pkg_resources
11 | 
12 |     get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
13 |                                                                 os.path.join(*res))
14 | except ImportError:
15 |     get_module_res = lambda *res: open(os.path.normpath(os.path.join(
16 |         os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
17 | 
18 | PY2 = sys.version_info[0] == 2
19 | 
20 | default_encoding = sys.getfilesystemencoding()
21 | 
22 | if PY2:
23 |     text_type = unicode
24 |     string_types = (str, unicode)
25 | 
26 |     iterkeys = lambda d: d.iterkeys()
27 |     itervalues = lambda d: d.itervalues()
28 |     iteritems = lambda d: d.iteritems()
29 | 
30 | else:
31 |     text_type = str
32 |     string_types = (str,)
33 |     xrange = range
34 | 
35 |     iterkeys = lambda d: iter(d.keys())
36 |     itervalues = lambda d: iter(d.values())
37 |     iteritems = lambda d: iter(d.items())
38 | 
39 | 
40 | def strdecode(sentence):
41 |     if not isinstance(sentence, text_type):
42 |         try:
43 |             sentence = sentence.decode('utf-8')
44 |         except UnicodeDecodeError:
45 |             sentence = sentence.decode('gbk', 'ignore')
46 |     return sentence
47 | 
48 | 
49 | def resolve_filename(f):
50 |     try:
51 |         return f.name
52 |     except AttributeError:
53 |         return repr(f)
54 | 


--------------------------------------------------------------------------------
/pysenti/data/adverb_dict.txt:
--------------------------------------------------------------------------------
  1 | 超级  2
  2 | 超  2
  3 | 都   1.75
  4 | 还   1.5
  5 | 实在  1.75
  6 | 越来越  2
  7 | 再也  2
  8 | 完全  2
  9 | 真是  1.75
 10 | 足足  1.75
 11 | 大大的  1.75
 12 | 巨  2
 13 | 最最  2
 14 | 老是  1.75
 15 | 压根  1.75
 16 | 明显  1.5
 17 | 最   2
 18 | 最为  2
 19 | 太   2
 20 | 极   2
 21 | 极为  2
 22 | 极其  2
 23 | 极度  2
 24 | 极端  2
 25 | 至   2
 26 | 至为  2
 27 | 顶   2
 28 | 过于  2
 29 | 过分  2
 30 | 分外  2
 31 | 万分  2
 32 | 根本  2
 33 | 百分之百 2
 34 | 倍加 2
 35 | 备至 2
 36 | 不得了 2
 37 | 不堪 2
 38 | 不可开交 2
 39 | 不亦乐乎 2
 40 | 不折不扣 2
 41 | 彻头彻尾 2
 42 | 充分 2
 43 | 到头 2
 44 | 地地道道 2
 45 | 非常 2
 46 | 极 2
 47 | 极度 2
 48 | 极端 2
 49 | 极其 2
 50 | 极为 2
 51 | 截然 2
 52 | 尽 2
 53 | 惊人地 2
 54 | 绝 2
 55 | 绝顶 2
 56 | 绝对 2
 57 | 绝对化 2
 58 | 刻骨 2
 59 | 酷 2
 60 | 满 2
 61 | 满贯 2
 62 | 满心 2
 63 | 莫大 2
 64 | 奇 2
 65 | 入骨 2
 66 | 甚为 2
 67 | 十二分 2
 68 | 十分 2
 69 | 十足 2
 70 | 死 2
 71 | 滔天 2
 72 | 痛 2
 73 | 透 2
 74 | 完全 2
 75 | 完完全全 2
 76 | 万 2
 77 | 万般 2
 78 | 万分 2
 79 | 万万 2
 80 | 无比 2
 81 | 无度 2
 82 | 无可估量 2
 83 | 无以复加 2
 84 | 无以伦比 2
 85 | 要命 2
 86 | 要死 2
 87 | 已极 2
 88 | 已甚 2
 89 | 异常 2
 90 | 逾常 2
 91 | 贼 2
 92 | 之极 2
 93 | 之至 2
 94 | 至极 2
 95 | 卓绝 2
 96 | 最为 2
 97 | 佼佼 2
 98 | 最 2
 99 | 更 1.75
100 | 更加  1.75
101 | 更其  1.75
102 | 越   1.75
103 | 越发  1.75
104 | 备加  1.75
105 | 愈   1.75
106 | 愈加  1.75
107 | 愈发  1.75
108 | 愈为  1.75
109 | 愈益  1.75
110 | 越加  1.75
111 | 格外  1.75
112 | 益发  1.75
113 | 很   1.75
114 | 挺   1.75
115 | 怪   1.75
116 | 非常  1.75
117 | 特别  1.75
118 | 相当  1.75
119 | 十分  1.75
120 | 好不  1.75
121 | 甚  1.75
122 | 甚为  1.75
123 | 颇  1.75
124 | 颇为 1.75
125 | 异常  1.75
126 | 深为  1.75
127 | 满  1.75
128 | 蛮  1.75
129 | 够  1.75
130 | 多  1.75
131 | 多么  1.75
132 | 殊特  1.75
133 | 大  1.75
134 | 大为  1.75
135 | 何等  1.75
136 | 何其  1.75
137 | 尤其  1.75
138 | 无比尤为  1.75
139 | 不胜  1.75
140 | 不过  1.75
141 | 不少  1.75
142 | 不胜  1.75
143 | 惨  1.75
144 | 沉  1.75
145 | 沉沉  1.75
146 | 出奇  1.75
147 | 大为  1.75
148 | 多  1.75
149 | 多多  1.75
150 | 多加  1.75
151 | 多么  1.75
152 | 分外  1.75
153 | 格外  1.75
154 | 够瞧的  1.75
155 | 够戗  1.75
156 | 好  1.75
157 | 好不  1.75
158 | 何等  1.75
159 | 很  1.75
160 | 很是  1.75
161 | 坏  1.75
162 | 可  1.75
163 | 老  1.75
164 | 老大  1.75
165 | 良  1.75
166 | 颇  1.75
167 | 颇为  1.75
168 | 甚  1.75
169 | 实在  1.75
170 | 太  1.75
171 | 太甚  1.75
172 | 特  1.75
173 | 特别  1.75
174 | 尤  1.75
175 | 尤其  1.75
176 | 尤为  1.75
177 | 尤以  1.75
178 | 远  1.75
179 | 着实  1.75
180 | 较  1.5
181 | 蛮  1.5
182 | 比较  1.5
183 | 较比  1.5
184 | 较为  1.5
185 | 不大  1.5
186 | 不太  1.5
187 | 不很  1.5
188 | 不甚  1.5
189 | 大不了  1.5
190 | 多  1.5
191 | 更  1.5
192 | 更加  1.5
193 | 更进一步  1.5
194 | 更为  1.5
195 | 还  1.5
196 | 还要  1.5
197 | 较  1.5
198 | 较比  1.5
199 | 较为  1.5
200 | 进一步  1.5
201 | 那般  1.5
202 | 那么  1.5
203 | 那样  1.5
204 | 强  1.5
205 | 如斯  1.5
206 | 益  1.5
207 | 益发  1.5
208 | 尤甚  1.5
209 | 逾  1.5
210 | 愈  1.5
211 | 愈发  1.5
212 | 愈加  1.5
213 | 愈来愈  1.5
214 | 愈益  1.5
215 | 远远  1.5
216 | 越发  1.5
217 | 越加  1.5
218 | 越来越  1.5
219 | 越是  1.5
220 | 这般  1.5
221 | 这样  1.5
222 | 足  1.5
223 | 足足  1.5
224 | 不为过  1
225 | 超  1
226 | 超额  1
227 | 超外差  1
228 | 超微结构  1
229 | 超物质  1
230 | 出头  1
231 | 多  1
232 | 浮  1
233 | 过  1
234 | 过度  1
235 | 过分  1
236 | 过火  1
237 | 过劲  1
238 | 过了头  1
239 | 过猛  1
240 | 过热  1
241 | 过甚  1
242 | 过头  1
243 | 过于  1
244 | 过逾  1
245 | 何止  1
246 | 何啻  1
247 | 开外  1
248 | 苦  1
249 | 老  1
250 | 偏  1
251 | 强  1
252 | 溢  1
253 | 忒  1
254 | 稍  0.8
255 | 稍稍  0.8
256 | 稍微  0.8
257 | 稍为  0.8
258 | 稍许  0.8
259 | 略  0.8
260 | 略略  0.8
261 | 略微  0.8
262 | 略为  0.8
263 | 些微  0.8
264 | 多少 0.8
265 | 有点 0.8
266 | 有些 0.8
267 | 点点滴滴 0.8
268 | 多多少少 0.8
269 | 怪 0.8
270 | 好生 0.8
271 | 还 0.8
272 | 或多或少 0.8
273 | 略 0.8
274 | 略加 0.8
275 | 略略 0.8
276 | 略微 0.8
277 | 略为 0.8
278 | 蛮 0.8
279 | 稍 0.8
280 | 稍稍 0.8
281 | 稍微 0.8
282 | 稍为 0.8
283 | 稍许 0.8
284 | 挺 0.8
285 | 未免 0.8
286 | 相当 0.8
287 | 些 0.8
288 | 些微 0.8
289 | 些小 0.8
290 | 一点 0.8
291 | 一点儿 0.8
292 | 一些 0.8
293 | 有点 0.8
294 | 有点儿 0.8
295 | 有些 0.8
296 | 半点 0.6
297 | 不大 0.6
298 | 不丁点儿 0.6
299 | 不甚 0.6
300 | 不怎么 0.6
301 | 聊 0.6
302 | 没怎么 0.6
303 | 轻度 0.6
304 | 弱 0.6
305 | 丝毫 0.6
306 | 微 0.6
307 | 相对 0.6


--------------------------------------------------------------------------------
/pysenti/data/conjunction_dict.txt:
--------------------------------------------------------------------------------
 1 | 并   1.2
 2 | 且   1.2
 3 | 而   1.2
 4 | 虽然  1.2
 5 | 不过  1.2
 6 | 至于  1.2
 7 | 致   1.2
 8 | 不料  1.2
 9 | 岂知  1.2
10 | 也   1.5
11 | 不但  1.5
12 | 其次  1.5
13 | 不仅  1.5
14 | 就是  1.5
15 | 但是  2
16 | 偏偏  2
17 | 而且  2
18 | 何况  2
19 | 况且  2
20 | 乃至  2
21 | 但  2
22 | 却   2
23 | 然而  2
24 | 只是  2
25 | 甚至  3
26 | 尤其  3
27 | 居然  3
28 | 


--------------------------------------------------------------------------------
/pysenti/data/denial_dict.txt:
--------------------------------------------------------------------------------
 1 | 没敢 1
 2 | 不是  1
 3 | 不   1
 4 | 没   1
 5 | 无   1
 6 | 非   1
 7 | 莫   1
 8 | 弗   1
 9 | 毋   1
10 | 勿   1
11 | 未   1
12 | 否   1
13 | 别   1
14 | 休   1
15 | 無   1
16 | 不曾  1
17 | 未必  1
18 | 没有  1
19 | 不要  1
20 | 难以  1
21 | 未曾  1
22 | 并非  1
23 | 绝不  1
24 | 不可  1
25 | 


--------------------------------------------------------------------------------
/pysenti/data/process.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description: 
 5 | """
 6 | from codecs import open
 7 | 
 8 | r = set()
 9 | with open('pos_dict.txt', 'r', encoding='utf-8') as f:
10 |     for line in f:
11 |         line = line.strip()
12 |         r.add(line)
13 | 
14 | sentiments = set()
15 | with open('sentiment_dict.txt', 'r', encoding='utf-8') as f:
16 |     for line in f:
17 |         line = line.strip().split()
18 |         w = line[0]
19 |         sentiments.add(w)
20 | 
21 | with open('pos', 'w', encoding='utf-8') as f:
22 |     for i in r:
23 |         if i not in sentiments:
24 |             f.write(i + ' 2' + '\n')
25 | 


--------------------------------------------------------------------------------
/pysenti/data/sentiment_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shibing624/pysenti/47fec82a5ab155e311c5321eb1d237c2d8d79010/pysenti/data/sentiment_model.pkl


--------------------------------------------------------------------------------
/pysenti/data/stopwords.txt:
--------------------------------------------------------------------------------
   1 | --
   2 | ?
   3 | <
   4 | >
   5 | !
   6 | ,
   7 | .
   8 | "
   9 | /
  10 | ~
  11 | `
  12 | -
  13 | =
  14 | +
  15 | (
  16 | )
  17 | *
  18 | :
  19 | ;
  20 | －－ 
  21 | 、 
  22 | 。 
  23 | “ 
  24 | ” 
  25 | 《 
  26 | 》 
  27 | （
  28 | ）
  29 | 【
  30 | 】
  31 | [
  32 | ]
  33 | ！ 
  34 | ， 
  35 | ： 
  36 | ； 
  37 | ？ 
  38 | able
  39 | about
  40 | above
  41 | according
  42 | accordingly
  43 | across
  44 | actually
  45 | after
  46 | afterwards
  47 | again
  48 | against
  49 | ain't
  50 | all
  51 | allow
  52 | allows
  53 | almost
  54 | alone
  55 | along
  56 | already
  57 | also
  58 | although
  59 | always
  60 | am
  61 | among
  62 | amongst
  63 | an
  64 | and
  65 | another
  66 | any
  67 | anybody
  68 | anyhow
  69 | anyone
  70 | anything
  71 | anyway
  72 | anyways
  73 | anywhere
  74 | apart
  75 | appear
  76 | appreciate
  77 | appropriate
  78 | are
  79 | aren't
  80 | around
  81 | as
  82 | a's
  83 | aside
  84 | ask
  85 | asking
  86 | associated
  87 | at
  88 | available
  89 | away
  90 | awfully
  91 | be
  92 | became
  93 | because
  94 | become
  95 | becomes
  96 | becoming
  97 | been
  98 | before
  99 | beforehand
 100 | behind
 101 | being
 102 | believe
 103 | below
 104 | beside
 105 | besides
 106 | best
 107 | better
 108 | between
 109 | beyond
 110 | both
 111 | brief
 112 | but
 113 | by
 114 | came
 115 | can
 116 | cannot
 117 | cant
 118 | can't
 119 | cause
 120 | causes
 121 | certain
 122 | certainly
 123 | changes
 124 | clearly
 125 | c'mon
 126 | co
 127 | com
 128 | come
 129 | comes
 130 | concerning
 131 | consequently
 132 | consider
 133 | considering
 134 | contain
 135 | containing
 136 | contains
 137 | corresponding
 138 | could
 139 | couldn't
 140 | course
 141 | c's
 142 | currently
 143 | definitely
 144 | described
 145 | despite
 146 | did
 147 | didn't
 148 | different
 149 | do
 150 | does
 151 | doesn't
 152 | doing
 153 | done
 154 | don't
 155 | down
 156 | downwards
 157 | during
 158 | each
 159 | edu
 160 | eg
 161 | eight
 162 | either
 163 | else
 164 | elsewhere
 165 | enough
 166 | entirely
 167 | especially
 168 | et
 169 | etc
 170 | even
 171 | ever
 172 | every
 173 | everybody
 174 | everyone
 175 | everything
 176 | everywhere
 177 | ex
 178 | exactly
 179 | example
 180 | except
 181 | far
 182 | few
 183 | fifth
 184 | first
 185 | five
 186 | followed
 187 | following
 188 | follows
 189 | for
 190 | former
 191 | formerly
 192 | forth
 193 | four
 194 | from
 195 | further
 196 | furthermore
 197 | get
 198 | gets
 199 | getting
 200 | given
 201 | gives
 202 | go
 203 | goes
 204 | going
 205 | gone
 206 | got
 207 | gotten
 208 | greetings
 209 | had
 210 | hadn't
 211 | happens
 212 | hardly
 213 | has
 214 | hasn't
 215 | have
 216 | haven't
 217 | having
 218 | he
 219 | hello
 220 | help
 221 | hence
 222 | her
 223 | here
 224 | hereafter
 225 | hereby
 226 | herein
 227 | here's
 228 | hereupon
 229 | hers
 230 | herself
 231 | he's
 232 | hi
 233 | him
 234 | himself
 235 | his
 236 | hither
 237 | hopefully
 238 | how
 239 | howbeit
 240 | however
 241 | i'd
 242 | ie
 243 | if
 244 | ignored
 245 | i'll
 246 | i'm
 247 | immediate
 248 | in
 249 | inasmuch
 250 | inc
 251 | indeed
 252 | indicate
 253 | indicated
 254 | indicates
 255 | inner
 256 | insofar
 257 | instead
 258 | into
 259 | inward
 260 | is
 261 | isn't
 262 | it
 263 | it'd
 264 | it'll
 265 | its
 266 | it's
 267 | itself
 268 | i've
 269 | just
 270 | keep
 271 | keeps
 272 | kept
 273 | know
 274 | known
 275 | knows
 276 | last
 277 | lately
 278 | later
 279 | latter
 280 | latterly
 281 | least
 282 | less
 283 | lest
 284 | let
 285 | let's
 286 | like
 287 | liked
 288 | likely
 289 | little
 290 | look
 291 | looking
 292 | looks
 293 | ltd
 294 | mainly
 295 | many
 296 | may
 297 | maybe
 298 | me
 299 | mean
 300 | meanwhile
 301 | merely
 302 | might
 303 | more
 304 | moreover
 305 | most
 306 | mostly
 307 | much
 308 | must
 309 | my
 310 | myself
 311 | name
 312 | namely
 313 | nd
 314 | near
 315 | nearly
 316 | necessary
 317 | need
 318 | needs
 319 | neither
 320 | never
 321 | nevertheless
 322 | new
 323 | next
 324 | nine
 325 | no
 326 | nobody
 327 | non
 328 | none
 329 | noone
 330 | nor
 331 | normally
 332 | not
 333 | nothing
 334 | novel
 335 | now
 336 | nowhere
 337 | obviously
 338 | of
 339 | off
 340 | often
 341 | oh
 342 | ok
 343 | okay
 344 | old
 345 | on
 346 | once
 347 | one
 348 | ones
 349 | only
 350 | onto
 351 | or
 352 | other
 353 | others
 354 | otherwise
 355 | ought
 356 | our
 357 | ours
 358 | ourselves
 359 | out
 360 | outside
 361 | over
 362 | overall
 363 | own
 364 | particular
 365 | particularly
 366 | per
 367 | perhaps
 368 | placed
 369 | please
 370 | plus
 371 | possible
 372 | presumably
 373 | probably
 374 | provides
 375 | que
 376 | quite
 377 | qv
 378 | rather
 379 | rd
 380 | re
 381 | really
 382 | reasonably
 383 | regarding
 384 | regardless
 385 | regards
 386 | relatively
 387 | respectively
 388 | right
 389 | said
 390 | same
 391 | saw
 392 | say
 393 | saying
 394 | says
 395 | second
 396 | secondly
 397 | see
 398 | seeing
 399 | seem
 400 | seemed
 401 | seeming
 402 | seems
 403 | seen
 404 | self
 405 | selves
 406 | sensible
 407 | sent
 408 | serious
 409 | seriously
 410 | seven
 411 | several
 412 | shall
 413 | she
 414 | should
 415 | shouldn't
 416 | since
 417 | six
 418 | so
 419 | some
 420 | somebody
 421 | somehow
 422 | someone
 423 | something
 424 | sometime
 425 | sometimes
 426 | somewhat
 427 | somewhere
 428 | soon
 429 | sorry
 430 | specified
 431 | specify
 432 | specifying
 433 | still
 434 | sub
 435 | such
 436 | sup
 437 | sure
 438 | take
 439 | taken
 440 | tell
 441 | tends
 442 | th
 443 | than
 444 | thank
 445 | thanks
 446 | thanx
 447 | that
 448 | thats
 449 | that's
 450 | the
 451 | their
 452 | theirs
 453 | them
 454 | themselves
 455 | then
 456 | thence
 457 | there
 458 | thereafter
 459 | thereby
 460 | therefore
 461 | therein
 462 | theres
 463 | there's
 464 | thereupon
 465 | these
 466 | they
 467 | they'd
 468 | they'll
 469 | they're
 470 | they've
 471 | think
 472 | third
 473 | this
 474 | thorough
 475 | thoroughly
 476 | those
 477 | though
 478 | three
 479 | through
 480 | throughout
 481 | thru
 482 | thus
 483 | to
 484 | together
 485 | too
 486 | took
 487 | toward
 488 | towards
 489 | tried
 490 | tries
 491 | truly
 492 | try
 493 | trying
 494 | t's
 495 | twice
 496 | two
 497 | un
 498 | under
 499 | unfortunately
 500 | unless
 501 | unlikely
 502 | until
 503 | unto
 504 | up
 505 | upon
 506 | us
 507 | use
 508 | used
 509 | useful
 510 | uses
 511 | using
 512 | usually
 513 | value
 514 | various
 515 | very
 516 | via
 517 | viz
 518 | vs
 519 | want
 520 | wants
 521 | was
 522 | wasn't
 523 | way
 524 | we
 525 | we'd
 526 | welcome
 527 | well
 528 | we'll
 529 | went
 530 | were
 531 | we're
 532 | weren't
 533 | we've
 534 | what
 535 | whatever
 536 | what's
 537 | when
 538 | whence
 539 | whenever
 540 | where
 541 | whereafter
 542 | whereas
 543 | whereby
 544 | wherein
 545 | where's
 546 | whereupon
 547 | wherever
 548 | whether
 549 | which
 550 | while
 551 | whither
 552 | who
 553 | whoever
 554 | whole
 555 | whom
 556 | who's
 557 | whose
 558 | why
 559 | will
 560 | willing
 561 | wish
 562 | with
 563 | within
 564 | without
 565 | wonder
 566 | won't
 567 | would
 568 | wouldn't
 569 | yes
 570 | yet
 571 | you
 572 | you'd
 573 | you'll
 574 | your
 575 | you're
 576 | yours
 577 | yourself
 578 | yourselves
 579 | you've
 580 | zero
 581 | zt
 582 | ZT
 583 | zz
 584 | ZZ
 585 | 一
 586 | 一下
 587 | 一些
 588 | 一切
 589 | 一则
 590 | 一天
 591 | 一定
 592 | 一方面
 593 | 一旦
 594 | 一时
 595 | 一来
 596 | 一样
 597 | 一次
 598 | 一片
 599 | 一直
 600 | 一致
 601 | 一般
 602 | 一起
 603 | 一边
 604 | 一面
 605 | 万一
 606 | 上下
 607 | 上升
 608 | 上去
 609 | 上来
 610 | 上述
 611 | 上面
 612 | 下列
 613 | 下去
 614 | 下来
 615 | 下面
 616 | 不一
 617 | 不久
 618 | 不仅
 619 | 不会
 620 | 不但
 621 | 不光
 622 | 不单
 623 | 不变
 624 | 不只
 625 | 不可
 626 | 不同
 627 | 不够
 628 | 不如
 629 | 不得
 630 | 不怕
 631 | 不惟
 632 | 不成
 633 | 不拘
 634 | 不敢
 635 | 不断
 636 | 不是
 637 | 不比
 638 | 不然
 639 | 不特
 640 | 不独
 641 | 不管
 642 | 不能
 643 | 不要
 644 | 不论
 645 | 不足
 646 | 不过
 647 | 不问
 648 | 与
 649 | 与其
 650 | 与否
 651 | 与此同时
 652 | 专门
 653 | 且
 654 | 两者
 655 | 严格
 656 | 严重
 657 | 个
 658 | 个人
 659 | 个别
 660 | 中小
 661 | 中间
 662 | 丰富
 663 | 临
 664 | 为
 665 | 为主
 666 | 为了
 667 | 为什么
 668 | 为什麽
 669 | 为何
 670 | 为着
 671 | 主张
 672 | 主要
 673 | 举行
 674 | 乃
 675 | 乃至
 676 | 么
 677 | 之
 678 | 之一
 679 | 之前
 680 | 之后
 681 | 之後
 682 | 之所以
 683 | 之类
 684 | 乌乎
 685 | 乎
 686 | 乘
 687 | 也
 688 | 也好
 689 | 也是
 690 | 也罢
 691 | 了
 692 | 了解
 693 | 争取
 694 | 于
 695 | 于是
 696 | 于是乎
 697 | 云云
 698 | 互相
 699 | 产生
 700 | 人们
 701 | 人家
 702 | 什么
 703 | 什么样
 704 | 什麽
 705 | 今后
 706 | 今天
 707 | 今年
 708 | 今後
 709 | 仍然
 710 | 从
 711 | 从事
 712 | 从而
 713 | 他
 714 | 他人
 715 | 他们
 716 | 他的
 717 | 代替
 718 | 以
 719 | 以上
 720 | 以下
 721 | 以为
 722 | 以便
 723 | 以免
 724 | 以前
 725 | 以及
 726 | 以后
 727 | 以外
 728 | 以後
 729 | 以来
 730 | 以至
 731 | 以至于
 732 | 以致
 733 | 们
 734 | 任
 735 | 任何
 736 | 任凭
 737 | 任务
 738 | 企图
 739 | 伟大
 740 | 似乎
 741 | 似的
 742 | 但
 743 | 但是
 744 | 何
 745 | 何况
 746 | 何处
 747 | 何时
 748 | 作为
 749 | 你
 750 | 你们
 751 | 你的
 752 | 使得
 753 | 使用
 754 | 例如
 755 | 依
 756 | 依照
 757 | 依靠
 758 | 促进
 759 | 保持
 760 | 俺
 761 | 俺们
 762 | 倘
 763 | 倘使
 764 | 倘或
 765 | 倘然
 766 | 倘若
 767 | 假使
 768 | 假如
 769 | 假若
 770 | 做到
 771 | 像
 772 | 允许
 773 | 充分
 774 | 先后
 775 | 先後
 776 | 先生
 777 | 全部
 778 | 全面
 779 | 兮
 780 | 共同
 781 | 关于
 782 | 其
 783 | 其一
 784 | 其中
 785 | 其二
 786 | 其他
 787 | 其余
 788 | 其它
 789 | 其实
 790 | 其次
 791 | 具体
 792 | 具体地说
 793 | 具体说来
 794 | 具有
 795 | 再者
 796 | 再说
 797 | 冒
 798 | 冲
 799 | 决定
 800 | 况且
 801 | 准备
 802 | 几
 803 | 几乎
 804 | 几时
 805 | 凭
 806 | 凭借
 807 | 出去
 808 | 出来
 809 | 出现
 810 | 分别
 811 | 则
 812 | 别
 813 | 别的
 814 | 别说
 815 | 到
 816 | 前后
 817 | 前者
 818 | 前进
 819 | 前面
 820 | 加之
 821 | 加以
 822 | 加入
 823 | 加强
 824 | 十分
 825 | 即
 826 | 即令
 827 | 即使
 828 | 即便
 829 | 即或
 830 | 即若
 831 | 却不
 832 | 原来
 833 | 又
 834 | 及
 835 | 及其
 836 | 及时
 837 | 及至
 838 | 双方
 839 | 反之
 840 | 反应
 841 | 反映
 842 | 反过来
 843 | 反过来说
 844 | 取得
 845 | 受到
 846 | 变成
 847 | 另
 848 | 另一方面
 849 | 另外
 850 | 只是
 851 | 只有
 852 | 只要
 853 | 只限
 854 | 叫
 855 | 叫做
 856 | 召开
 857 | 叮咚
 858 | 可
 859 | 可以
 860 | 可是
 861 | 可能
 862 | 可见
 863 | 各
 864 | 各个
 865 | 各人
 866 | 各位
 867 | 各地
 868 | 各种
 869 | 各级
 870 | 各自
 871 | 合理
 872 | 同
 873 | 同一
 874 | 同时
 875 | 同样
 876 | 后来
 877 | 后面
 878 | 向
 879 | 向着
 880 | 吓
 881 | 吗
 882 | 否则
 883 | 吧
 884 | 吧哒
 885 | 吱
 886 | 呀
 887 | 呃
 888 | 呕
 889 | 呗
 890 | 呜
 891 | 呜呼
 892 | 呢
 893 | 周围
 894 | 呵
 895 | 呸
 896 | 呼哧
 897 | 咋
 898 | 和
 899 | 咚
 900 | 咦
 901 | 咱
 902 | 咱们
 903 | 咳
 904 | 哇
 905 | 哈
 906 | 哈哈
 907 | 哉
 908 | 哎
 909 | 哎呀
 910 | 哎哟
 911 | 哗
 912 | 哟
 913 | 哦
 914 | 哩
 915 | 哪
 916 | 哪个
 917 | 哪些
 918 | 哪儿
 919 | 哪天
 920 | 哪年
 921 | 哪怕
 922 | 哪样
 923 | 哪边
 924 | 哪里
 925 | 哼
 926 | 哼唷
 927 | 唉
 928 | 啊
 929 | 啐
 930 | 啥
 931 | 啦
 932 | 啪达
 933 | 喂
 934 | 喏
 935 | 喔唷
 936 | 嗡嗡
 937 | 嗬
 938 | 嗯
 939 | 嗳
 940 | 嘎
 941 | 嘎登
 942 | 嘘
 943 | 嘛
 944 | 嘻
 945 | 嘿
 946 | 因
 947 | 因为
 948 | 因此
 949 | 因而
 950 | 固然
 951 | 在
 952 | 在下
 953 | 地
 954 | 坚决
 955 | 坚持
 956 | 基本
 957 | 处理
 958 | 复杂
 959 | 多
 960 | 多少
 961 | 多数
 962 | 多次
 963 | 大力
 964 | 大多数
 965 | 大大
 966 | 大家
 967 | 大批
 968 | 大约
 969 | 大量
 970 | 失去
 971 | 她
 972 | 她们
 973 | 她的
 974 | 好的
 975 | 好象
 976 | 如
 977 | 如上所述
 978 | 如下
 979 | 如何
 980 | 如其
 981 | 如果
 982 | 如此
 983 | 如若
 984 | 存在
 985 | 宁
 986 | 宁可
 987 | 宁愿
 988 | 宁肯
 989 | 它
 990 | 它们
 991 | 它们的
 992 | 它的
 993 | 安全
 994 | 完全
 995 | 完成
 996 | 实现
 997 | 实际
 998 | 宣布
 999 | 容易
1000 | 密切
1001 | 对
1002 | 对于
1003 | 对应
1004 | 将
1005 | 少数
1006 | 尔后
1007 | 尚且
1008 | 尤其
1009 | 就
1010 | 就是
1011 | 就是说
1012 | 尽
1013 | 尽管
1014 | 属于
1015 | 岂但
1016 | 左右
1017 | 巨大
1018 | 巩固
1019 | 己
1020 | 已经
1021 | 帮助
1022 | 常常
1023 | 并
1024 | 并不
1025 | 并不是
1026 | 并且
1027 | 并没有
1028 | 广大
1029 | 广泛
1030 | 应当
1031 | 应用
1032 | 应该
1033 | 开外
1034 | 开始
1035 | 开展
1036 | 引起
1037 | 强烈
1038 | 强调
1039 | 归
1040 | 当
1041 | 当前
1042 | 当时
1043 | 当然
1044 | 当着
1045 | 形成
1046 | 彻底
1047 | 彼
1048 | 彼此
1049 | 往
1050 | 往往
1051 | 待
1052 | 後来
1053 | 後面
1054 | 得
1055 | 得出
1056 | 得到
1057 | 心里
1058 | 必然
1059 | 必要
1060 | 必须
1061 | 怎
1062 | 怎么
1063 | 怎么办
1064 | 怎么样
1065 | 怎样
1066 | 怎麽
1067 | 总之
1068 | 总是
1069 | 总的来看
1070 | 总的来说
1071 | 总的说来
1072 | 总结
1073 | 总而言之
1074 | 恰恰相反
1075 | 您
1076 | 意思
1077 | 愿意
1078 | 慢说
1079 | 成为
1080 | 我
1081 | 我们
1082 | 我的
1083 | 或
1084 | 或是
1085 | 或者
1086 | 战斗
1087 | 所
1088 | 所以
1089 | 所有
1090 | 所谓
1091 | 打
1092 | 扩大
1093 | 把
1094 | 抑或
1095 | 拿
1096 | 按
1097 | 按照
1098 | 换句话说
1099 | 换言之
1100 | 据
1101 | 掌握
1102 | 接着
1103 | 接著
1104 | 故
1105 | 故此
1106 | 整个
1107 | 方便
1108 | 方面
1109 | 旁人
1110 | 无宁
1111 | 无法
1112 | 无论
1113 | 既
1114 | 既是
1115 | 既然
1116 | 时候
1117 | 明显
1118 | 明确
1119 | 是
1120 | 是否
1121 | 是的
1122 | 显然
1123 | 显著
1124 | 普通
1125 | 普遍
1126 | 更加
1127 | 曾经
1128 | 替
1129 | 最后
1130 | 最大
1131 | 最好
1132 | 最後
1133 | 最近
1134 | 最高
1135 | 有
1136 | 有些
1137 | 有关
1138 | 有利
1139 | 有力
1140 | 有所
1141 | 有效
1142 | 有时
1143 | 有点
1144 | 有的
1145 | 有着
1146 | 有著
1147 | 望
1148 | 朝
1149 | 朝着
1150 | 本
1151 | 本着
1152 | 来
1153 | 来着
1154 | 极了
1155 | 构成
1156 | 果然
1157 | 果真
1158 | 某
1159 | 某个
1160 | 某些
1161 | 根据
1162 | 根本
1163 | 欢迎
1164 | 正在
1165 | 正如
1166 | 正常
1167 | 此
1168 | 此外
1169 | 此时
1170 | 此间
1171 | 毋宁
1172 | 每
1173 | 每个
1174 | 每天
1175 | 每年
1176 | 每当
1177 | 比
1178 | 比如
1179 | 比方
1180 | 比较
1181 | 沿
1182 | 沿着
1183 | 注意
1184 | 深入
1185 | 清楚
1186 | 满足
1187 | 漫说
1188 | 焉
1189 | 然则
1190 | 然后
1191 | 然後
1192 | 然而
1193 | 照
1194 | 照着
1195 | 特别是
1196 | 特殊
1197 | 特点
1198 | 现代
1199 | 现在
1200 | 甚么
1201 | 甚而
1202 | 甚至
1203 | 用
1204 | 由
1205 | 由于
1206 | 由此可见
1207 | 的
1208 | 的话
1209 | 目前
1210 | 直到
1211 | 直接
1212 | 相似
1213 | 相信
1214 | 相反
1215 | 相同
1216 | 相对
1217 | 相对而言
1218 | 相应
1219 | 相当
1220 | 相等
1221 | 省得
1222 | 看出
1223 | 看到
1224 | 看来
1225 | 看看
1226 | 看见
1227 | 真是
1228 | 真正
1229 | 着
1230 | 着呢
1231 | 矣
1232 | 知道
1233 | 确定
1234 | 离
1235 | 积极
1236 | 移动
1237 | 突出
1238 | 突然
1239 | 立即
1240 | 第
1241 | 等
1242 | 等等
1243 | 管
1244 | 紧接着
1245 | 纵
1246 | 纵令
1247 | 纵使
1248 | 纵然
1249 | 练习
1250 | 组成
1251 | 经
1252 | 经常
1253 | 经过
1254 | 结合
1255 | 结果
1256 | 给
1257 | 绝对
1258 | 继续
1259 | 继而
1260 | 维持
1261 | 综上所述
1262 | 罢了
1263 | 考虑
1264 | 者
1265 | 而
1266 | 而且
1267 | 而况
1268 | 而外
1269 | 而已
1270 | 而是
1271 | 而言
1272 | 联系
1273 | 能
1274 | 能否
1275 | 能够
1276 | 腾
1277 | 自
1278 | 自个儿
1279 | 自从
1280 | 自各儿
1281 | 自家
1282 | 自己
1283 | 自身
1284 | 至
1285 | 至于
1286 | 良好
1287 | 若
1288 | 若是
1289 | 若非
1290 | 范围
1291 | 莫若
1292 | 获得
1293 | 虽
1294 | 虽则
1295 | 虽然
1296 | 虽说
1297 | 行为
1298 | 行动
1299 | 表明
1300 | 表示
1301 | 被
1302 | 要
1303 | 要不
1304 | 要不是
1305 | 要不然
1306 | 要么
1307 | 要是
1308 | 要求
1309 | 规定
1310 | 觉得
1311 | 认为
1312 | 认真
1313 | 认识
1314 | 让
1315 | 许多
1316 | 论
1317 | 设使
1318 | 设若
1319 | 该
1320 | 说明
1321 | 诸位
1322 | 谁
1323 | 谁知
1324 | 赶
1325 | 起
1326 | 起来
1327 | 起见
1328 | 趁
1329 | 趁着
1330 | 越是
1331 | 跟
1332 | 转动
1333 | 转变
1334 | 转贴
1335 | 较
1336 | 较之
1337 | 边
1338 | 达到
1339 | 迅速
1340 | 过
1341 | 过去
1342 | 过来
1343 | 运用
1344 | 还是
1345 | 还有
1346 | 这
1347 | 这个
1348 | 这么
1349 | 这么些
1350 | 这么样
1351 | 这么点儿
1352 | 这些
1353 | 这会儿
1354 | 这儿
1355 | 这就是说
1356 | 这时
1357 | 这样
1358 | 这点
1359 | 这种
1360 | 这边
1361 | 这里
1362 | 这麽
1363 | 进入
1364 | 进步
1365 | 进而
1366 | 进行
1367 | 连
1368 | 连同
1369 | 适应
1370 | 适当
1371 | 适用
1372 | 逐步
1373 | 逐渐
1374 | 通常
1375 | 通过
1376 | 造成
1377 | 遇到
1378 | 遭到
1379 | 避免
1380 | 那
1381 | 那个
1382 | 那么
1383 | 那么些
1384 | 那么样
1385 | 那些
1386 | 那会儿
1387 | 那儿
1388 | 那时
1389 | 那样
1390 | 那边
1391 | 那里
1392 | 那麽
1393 | 部分
1394 | 鄙人
1395 | 采取
1396 | 里面
1397 | 重大
1398 | 重新
1399 | 重要
1400 | 鉴于
1401 | 问题
1402 | 防止
1403 | 阿
1404 | 附近
1405 | 限制
1406 | 除
1407 | 除了
1408 | 除此之外
1409 | 除非
1410 | 随
1411 | 随着
1412 | 随著
1413 | 集中
1414 | 需要
1415 | 非但
1416 | 非常
1417 | 非徒
1418 | 靠
1419 | 顺
1420 | 顺着
1421 | 首先
1422 | 高兴
1423 | 是不是
1424 | 说说
1425 | 


--------------------------------------------------------------------------------
/pysenti/frequency.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description:
 5 | """
 6 | 
 7 | 
 8 | class BaseProb(object):
 9 |     def __init__(self):
10 |         self.d = {}
11 |         self.total = 0.0
12 |         self.none = 0
13 | 
14 |     def exists(self, key):
15 |         return key in self.d
16 | 
17 |     def getsum(self):
18 |         return self.total
19 | 
20 |     def get(self, key):
21 |         if not self.exists(key):
22 |             return False, self.none
23 |         return True, self.d[key]
24 | 
25 |     def freq(self, key):
26 |         return float(self.get(key)[1]) / self.total
27 | 
28 |     def samples(self):
29 |         return self.d.keys()
30 | 
31 | 
32 | class AddOneProb(BaseProb):
33 |     def __init__(self):
34 |         self.d = {}
35 |         self.total = 0.0
36 |         self.none = 1
37 | 
38 |     def add(self, key, value):
39 |         self.total += value
40 |         if not self.exists(key):
41 |             self.d[key] = 1
42 |             self.total += 1
43 |         self.d[key] += value
44 | 


--------------------------------------------------------------------------------
/pysenti/model_classifier.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description: 
 5 | """
 6 | import os
 7 | from pysenti.bayes import Bayes
 8 | from pysenti.compat import strdecode
 9 | from pysenti.tokenizer import segment
10 | from pysenti.utils import filter_stop, load_set
11 | 
12 | pwd_path = os.path.abspath(os.path.dirname(__file__))
13 | default_sentiment_model_path = os.path.join(pwd_path, 'data/sentiment_model.pkl')
14 | # 停用词
15 | default_stopwords_path = os.path.join(pwd_path, 'data/stopwords.txt')
16 | 
17 | 
18 | class ModelClassifier:
19 |     def __init__(self, model_path=default_sentiment_model_path, stopwords_path=default_stopwords_path):
20 |         self.classifier = Bayes()
21 |         self.model_path = model_path
22 |         self.stopwords = load_set(stopwords_path)
23 |         if model_path:
24 |             self.classifier.load(self.model_path)
25 | 
26 |     def save(self):
27 |         self.classifier.save(self.model_path)
28 | 
29 |     def handle(self, doc):
30 |         words = segment(doc)
31 |         words = filter_stop(words, self.stopwords)
32 |         return words
33 | 
34 |     def train(self, neg_docs, pos_docs):
35 |         data = []
36 |         for sent in neg_docs:
37 |             data.append([self.handle(sent), 'neg'])
38 |         for sent in pos_docs:
39 |             data.append([self.handle(sent), 'pos'])
40 |         self.classifier.train(data)
41 | 
42 |     def classify(self, text):
43 |         """
44 |         sentiment classification text
45 |         :param text: text, str
46 |         :return:    "positive_prob": 0.0,
47 |                     "negative_prob": 0.0
48 |                     dict
49 |         """
50 |         result = {"positive_prob": 0.0, "negative_prob": 0.0}
51 |         text = strdecode(text)
52 |         ret, prob = self.classifier.classify(self.handle(text))
53 |         if ret == 'pos':
54 |             result["positive_prob"] = prob
55 |             result["negative_prob"] = 1 - prob
56 |         else:
57 |             result["negative_prob"] = prob
58 |             result["positive_prob"] = 1 - prob
59 |         return result
60 | 
61 | 
62 | if __name__ == '__main__':
63 |     model = ModelClassifier()
64 |     a_sentence = ['剁椒鸡蛋好难吃。绝对没人受得了',
65 |                   '土豆丝很好吃', '土豆丝很难吃',
66 |                   '这笔钱是个天文数字',
67 |                   '我一会儿出去玩了，你吃啥？给你带']
68 |     for i in a_sentence:
69 |         r = model.classify(i)
70 |         print(i, r)
71 | 


--------------------------------------------------------------------------------
/pysenti/rule_classfier.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author:XuMing(xuming624@qq.com)
  4 | @description: 
  5 | """
  6 | import os
  7 | from codecs import open
  8 | from loguru import logger
  9 | from pysenti import tokenizer
 10 | from pysenti.compat import strdecode
 11 | from pysenti.utils import split_sentence
 12 | 
 13 | pwd_path = os.path.abspath(os.path.dirname(__file__))
 14 | 
 15 | # 情感词典，包括积极词典、消极词典
 16 | sentiment_dict_path = os.path.join(pwd_path, 'data/sentiment_dict.txt')
 17 | # 连词词典
 18 | conjunction_dict_path = os.path.join(pwd_path, 'data/conjunction_dict.txt')
 19 | # 副词词典
 20 | adverb_dict_path = os.path.join(pwd_path, 'data/adverb_dict.txt')
 21 | # 否定词典
 22 | denial_dict_path = os.path.join(pwd_path, 'data/denial_dict.txt')
 23 | 
 24 | 
 25 | class RuleClassifier:
 26 |     def __init__(self):
 27 |         self.name = "rule_classifier"
 28 |         self.sentiment_dict = {}
 29 |         self.conjunction_dict = {}
 30 |         self.adverb_dict = {}
 31 |         self.denial_dict = {}
 32 |         self.user_sentiment_dict = {}
 33 |         self.inited = False
 34 | 
 35 |     def init(self, sentiment_dict_path=sentiment_dict_path):
 36 |         # 加载情感词典词典
 37 |         self.sentiment_dict = self._get_dict(sentiment_dict_path)
 38 |         self.conjunction_dict = self._get_dict(conjunction_dict_path)  # 连词
 39 |         self.adverb_dict = self._get_dict(adverb_dict_path)  # 副词
 40 |         self.denial_dict = self._get_dict(denial_dict_path)
 41 |         self.inited = True
 42 | 
 43 |     def load_user_sentiment_dict(self, path):
 44 |         if not self.inited:
 45 |             self.init()
 46 |         self.user_sentiment_dict = self._get_dict(path)
 47 |         self.sentiment_dict.update(self.user_sentiment_dict)
 48 | 
 49 |     def classify(self, text):
 50 |         if not self.inited:
 51 |             self.init()
 52 |         # 情感分析整体数据结构
 53 |         result = {"score": 0}
 54 |         text = strdecode(text)
 55 |         # 分句
 56 |         clauses = split_sentence(text)
 57 |         # 对每分句进行情感分析
 58 |         for i in range(len(clauses)):
 59 |             # 情感分析子句的数据结构
 60 |             sub_clause = self._analyse_clause(clauses[i])
 61 | 
 62 |             # 将子句分析的数据结果添加到整体数据结构中
 63 |             result["sub_clause" + str(i)] = sub_clause
 64 |             result["score"] += sub_clause["score"]
 65 | 
 66 |         return result
 67 | 
 68 |     def _analyse_clause(self, clause):
 69 |         sub_clause = {"score": 0, "sentiment": [], "conjunction": []}
 70 |         seg_result = tokenizer.segment(clause, pos=False)
 71 | 
 72 |         # 逐个分析分词
 73 |         for word in seg_result:
 74 |             # 判断是否是连词
 75 |             r = self._is_word_conjunction(word)
 76 |             if r:
 77 |                 sub_clause["conjunction"].append(r)
 78 | 
 79 |             # 判断是否是情感词
 80 |             r = self._is_word_sentiment(word, seg_result)
 81 |             if r:
 82 |                 sub_clause["sentiment"].append(r)
 83 |                 sub_clause["score"] += r["score"]
 84 | 
 85 |         # 综合连词的情感值
 86 |         for a_conjunction in sub_clause["conjunction"]:
 87 |             sub_clause["score"] *= a_conjunction["value"]
 88 | 
 89 |         return sub_clause
 90 | 
 91 |     def _is_word_conjunction(self, the_word):
 92 |         r = {}
 93 |         if the_word in self.conjunction_dict:
 94 |             r = {"key": the_word, "value": self.conjunction_dict[the_word]}
 95 |         return r
 96 | 
 97 |     def _is_word_sentiment(self, the_word, seg_result, index=-1):
 98 |         r = {}
 99 |         # 判断分词是否在情感词典内
100 |         if the_word in self.sentiment_dict:
101 |             # 在情感词典内，则构建一个以情感词为中心的字典数据结构
102 |             r = self._emotional_word_analysis(the_word, self.sentiment_dict[the_word], seg_result, index)
103 |         # 不在情感词典内，则返回空
104 |         return r
105 | 
106 |     def _emotional_word_analysis(self, core_word, value, segments, index):
107 |         # 在情感词典内，则构建一个以情感词为中心的字典数据结构
108 |         orientation = {"key": core_word, "adverb": [], "denial": [], "value": value}
109 |         orientation_score = value
110 | 
111 |         # 在三个前视窗内，判断是否有否定词、副词
112 |         view_window = index - 1
113 |         if view_window > -1:  # 无越界
114 |             # 判断前一个词是否是情感词
115 |             if segments[view_window] in self.sentiment_dict:
116 |                 orientation["score"] = orientation_score
117 |                 return orientation
118 |             # 判断是否是副词
119 |             if segments[view_window] in self.adverb_dict:
120 |                 # 构建副词字典数据结构
121 |                 adverb = {"key": segments[view_window], "sentiment": 1,
122 |                           "value": self.adverb_dict[segments[view_window]]}
123 |                 orientation["adverb"].append(adverb)
124 |                 orientation_score *= self.adverb_dict[segments[view_window]]
125 |             # 判断是否是否定词
126 |             elif segments[view_window] in self.denial_dict:
127 |                 # 构建否定词字典数据结构
128 |                 denial = {"key": segments[view_window], "sentiment": 1,
129 |                           "value": self.denial_dict[segments[view_window]]}
130 |                 orientation["denial"].append(denial)
131 |                 orientation_score *= -1
132 |         view_window = index - 2
133 |         if view_window > -1:
134 |             # 判断前一个词是否是情感词
135 |             if segments[view_window] in self.sentiment_dict:
136 |                 orientation['score'] = orientation_score
137 |                 return orientation
138 |             if segments[view_window] in self.adverb_dict:
139 |                 adverb = {"key": segments[view_window], "sentiment": 2,
140 |                           "value": self.adverb_dict[segments[view_window]]}
141 |                 orientation_score *= self.adverb_dict[segments[view_window]]
142 |                 orientation["adverb"].insert(0, adverb)
143 |             elif segments[view_window] in self.denial_dict:
144 |                 denial = {"key": segments[view_window], "sentiment": 2,
145 |                           "value": self.denial_dict[segments[view_window]]}
146 |                 orientation["denial"].insert(0, denial)
147 |                 orientation_score *= -1
148 |                 # 判断是否是“不是很好”的结构（区别于“很不好”）
149 |                 if len(orientation["adverb"]) > 0:
150 |                     # 是，则引入调节阈值，0.3
151 |                     orientation_score *= 0.3
152 |         view_window = index - 3
153 |         if view_window > -1:
154 |             # 判断前一个词是否是情感词
155 |             if segments[view_window] in self.sentiment_dict:
156 |                 orientation["score"] = orientation_score
157 |                 return orientation
158 |             if segments[view_window] in self.adverb_dict:
159 |                 adverb = {"key": segments[view_window], "sentiment": 3,
160 |                           "value": self.adverb_dict[segments[view_window]]}
161 |                 orientation_score *= self.adverb_dict[segments[view_window]]
162 |                 orientation["adverb"].insert(0, adverb)
163 |             elif segments[view_window] in self.denial_dict:
164 |                 denial = {"key": segments[view_window], "sentiment": 3,
165 |                           "value": self.denial_dict[segments[view_window]]}
166 |                 orientation["denial"].insert(0, denial)
167 |                 orientation_score *= -1
168 |                 # 判断是否是“不是很好”的结构（区别于“很不好”）
169 |                 if len(orientation["adverb"]) > 0 and len(orientation["denial"]) == 0:
170 |                     orientation_score *= 0.3
171 |         # 添加情感分析值
172 |         orientation["score"] = orientation_score
173 |         # 返回的数据结构
174 |         return orientation
175 | 
176 |     @staticmethod
177 |     def _get_dict(path, encoding="utf-8"):
178 |         """
179 |         情感词典的构建
180 |         :param path:
181 |         :param encoding:
182 |         :return:
183 |         """
184 |         sentiment_dict = {}
185 |         with open(path, 'r', encoding=encoding) as f:
186 |             c = 0
187 |             for line in f:
188 |                 parts = line.strip().split()
189 |                 c += 1
190 |                 if len(parts) == 2:
191 |                     sentiment_dict[parts[0]] = float(parts[1])
192 |                 else:
193 |                     logger.error(f"num: {c}, {line}")
194 |         return sentiment_dict
195 | 
196 | 
197 | if __name__ == '__main__':
198 |     d = RuleClassifier()
199 | 
200 |     a_sentence = ['剁椒鸡蛋好难吃。绝对没人受得了',
201 |                   '土豆丝很好吃', '土豆丝很难吃',
202 |                   '这笔钱是个天文数字',
203 |                   '我一会儿出去玩了，你吃啥？给你带,然而你不知道']
204 |     for i in a_sentence:
205 |         r = d.classify(i)
206 |         print(i, r)
207 | 


--------------------------------------------------------------------------------
/pysenti/tokenizer.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description: 切词器
 5 | """
 6 | import logging
 7 | 
 8 | import jieba
 9 | from jieba import posseg
10 | 
11 | from .compat import strdecode
12 | 
13 | jieba.default_logger.setLevel(logging.ERROR)
14 | 
15 | 
16 | def segment(sentence, cut_type='word', pos=False):
17 |     """
18 |     切词
19 |     :param sentence:
20 |     :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
21 |     :param pos: enable POS
22 |     :return: list
23 |     """
24 |     sentence = strdecode(sentence)
25 |     if pos:
26 |         if cut_type == 'word':
27 |             return posseg.lcut(sentence)
28 |         elif cut_type == 'char':
29 |             word_seq = list(sentence)
30 |             pos_seq = []
31 |             for w in word_seq:
32 |                 w_p = posseg.lcut(w)
33 |                 pos_seq.append(w_p[0].flag)
34 |             return word_seq, pos_seq
35 |     else:
36 |         if cut_type == 'word':
37 |             return jieba.lcut(sentence)
38 |         elif cut_type == 'char':
39 |             return list(sentence)
40 | 


--------------------------------------------------------------------------------
/pysenti/train.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description: 
 5 | """
 6 | import os
 7 | from codecs import open
 8 | 
 9 | from pysenti.model_classifier import ModelClassifier
10 | 
11 | 
12 | def train(neg_file, pos_file, model_path):
13 |     neg = open(neg_file, 'r', 'utf-8').readlines()
14 |     pos = open(pos_file, 'r', 'utf-8').readlines()
15 |     neg_docs = []
16 |     pos_docs = []
17 |     for line in neg:
18 |         neg_docs.append(line.rstrip("\r\n"))
19 |     for line in pos:
20 |         pos_docs.append(line.rstrip("\r\n"))
21 |     global classifier
22 |     classifier = ModelClassifier(model_path)
23 |     classifier.train(neg_docs, pos_docs)
24 | 
25 | 
26 | def save():
27 |     classifier.save()
28 | 
29 | 
30 | def classify(sent):
31 |     return classifier.classify(sent)
32 | 
33 | 
34 | if __name__ == '__main__':
35 |     pwd_path = os.path.abspath(os.path.dirname(__file__))
36 |     default_sentiment_model_path = os.path.join(pwd_path, 'data/sentiment_model.pkl')
37 | 
38 |     train('data/neg_sentences.txt', 'data/pos_sentences.txt', default_sentiment_model_path)
39 |     save()
40 |     txt = "苹果是一家伟大的公司"
41 |     print(txt, ' prob: ', classify(txt))
42 | 


--------------------------------------------------------------------------------
/pysenti/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description: 
 5 | """
 6 | import os
 7 | import pickle
 8 | import re
 9 | from codecs import open
10 | 
11 | 
12 | def load_set(path):
13 |     words = set()
14 |     with open(path, 'r', 'utf-8') as f:
15 |         for line in f:
16 |             words.add(line.strip())
17 |     return words
18 | 
19 | 
20 | re_zh = re.compile('([\u4E00-\u9FA5]+)')
21 | 
22 | 
23 | def filter_stop(words, stopwords):
24 |     return list(filter(lambda x: x not in stopwords, words))
25 | 
26 | 
27 | def load_pkl(pkl_path):
28 |     """
29 |     加载词典文件
30 |     :param pkl_path:
31 |     :return:
32 |     """
33 |     with open(pkl_path, 'rb') as f:
34 |         result = pickle.load(f)
35 |     return result
36 | 
37 | 
38 | def dump_pkl(vocab, pkl_path, overwrite=True):
39 |     """
40 |     存储文件
41 |     :param pkl_path:
42 |     :param overwrite:
43 |     :return:
44 |     """
45 |     if pkl_path and os.path.exists(pkl_path) and not overwrite:
46 |         return
47 |     if pkl_path:
48 |         with open(pkl_path, 'wb') as f:
49 |             pickle.dump(vocab, f, protocol=0)
50 |         print("save %s ok." % pkl_path)
51 |     else:
52 |         raise IOError("no file: %s" % pkl_path)
53 | 
54 | 
55 | def split_sentence(sentence):
56 |     pattern = re.compile(u"[，。%、！!？?,；～~.… ]+")
57 |     clauses = [i for i in pattern.split(sentence.strip()) if i]
58 |     return clauses
59 | 
60 | 
61 | if __name__ == '__main__':
62 |     sent = "nihao,我是警察，你站起来。我要问你话！好不。"
63 |     k = split_sentence(sent)
64 |     print(k)
65 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jieba


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author:XuMing(xuming624@qq.com)
 4 | @description:
 5 | """
 6 | from setuptools import setup, find_packages
 7 | 
 8 | with open('README.md', 'r', encoding='utf-8') as f:
 9 |     readme = f.read()
10 | 
11 | setup(
12 |     name='pysenti',
13 |     version='0.1.9',
14 |     description='Chinese Sentiment Classifier',
15 |     long_description=readme,
16 |     long_description_content_type='text/markdown',
17 |     author='XuMing',
18 |     author_email='xuming624@qq.com',
19 |     url='https://github.com/shibing624/pysenti',
20 |     license="Apache 2.0",
21 |     classifiers=[
22 |         'Intended Audience :: Developers',
23 |         'Operating System :: OS Independent',
24 |         'Natural Language :: Chinese (Simplified)',
25 |         'Natural Language :: Chinese (Traditional)',
26 |         'Programming Language :: Python',
27 |         'Programming Language :: Python :: 2',
28 |         'Programming Language :: Python :: 2.7',
29 |         'Programming Language :: Python :: 3',
30 |         'Topic :: Text Processing',
31 |         'Topic :: Text Processing :: Indexing',
32 |         'Topic :: Text Processing :: Linguistic',
33 |     ],
34 |     keywords='NLP,sentiment-classifier,sentiment-classification,pysenti',
35 |     install_requires=['jieba'],
36 |     packages=find_packages(),
37 |     package_dir={'pysenti': 'pysenti'},
38 |     package_data={'pysenti': ['*.*', 'data/*']}
39 | )
40 | 


--------------------------------------------------------------------------------