├── .gitignore
├── LICENSE
├── README.md
├── example
├── .gitignore
├── README.md
├── __init__.py
├── data
│ └── hotel_comment.csv
├── docker-compose.yml
├── model
│ └── __init__.py
├── model_predict.py
├── model_train.py
├── sa_client.py
├── sa_model2tf_serving_model.py
├── sa_server.py
├── sa_tf_serving_api_client.py
├── sa_ui.py
└── tf_model
│ └── __init__.py
├── litNlp
├── .gitignore
├── __init__.py
├── model_structure
│ ├── BiLSTM.py
│ ├── GRU.py
│ ├── Model_TextCNN.py
│ ├── TextCNN.py
│ ├── TextCNN_m.py
│ └── __init__.py
├── predict.py
└── train.py
├── pic
├── auc_2poch.png
├── logo.png
├── server.png
├── tools.png
└── ui.png
├── requirement.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .idea
6 | Customer_Satisfaction_Analysis/.idea
7 | model/__pycache__
8 | # C extensions
9 | *.so
10 | dist
11 | litNlp.egg-info
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # pyenv
79 | .python-version
80 |
81 | # celery beat schedule file
82 | celerybeat-schedule
83 |
84 | # SageMath parsed files
85 | *.sage.py
86 |
87 | # Environments
88 | .env
89 | .venv
90 | env/
91 | venv/
92 | ENV/
93 | env.bak/
94 | venv.bak/
95 |
96 | # Spyder project settings
97 | .spyderproject
98 | .spyproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # mkdocs documentation
104 | /site
105 |
106 | # mypy
107 | .mypy_cache/
108 | ### Example user template template
109 | ### Example user template
110 |
111 | # IntelliJ project files
112 | .idea
113 | *.iml
114 | out
115 | gen
116 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |

3 |
4 |
5 | -----------------
6 | ## litNlp: A Fast Tool for Sentiment Analysis with Tensorflow2
7 | [](https://996.icu)
8 | [](https://pypi.org/project/litNlp/)
9 | [](https://pepy.tech/project/litnlp)
10 | [](https://pepy.tech/project/litnlp/month)
11 | [](https://pepy.tech/project/litnlp/week)
12 |
13 |
14 | # litNlp 简介
15 |
16 | litNlp 是兼容最新版 Tensorflow 2.0 实现的一个轻量级的深度情感极性推理模型,使用字符级代替词语级进一步提升训练和推理速度,可以实现细粒度的多级别情感极性训练和预测,TF2 下 GPU 和 CPU 平台都能直接安装运行,是搭建 NLP 情感分析和分类模型 Baseline 的快速方案。
17 |
18 | 1. 内置深度学习情感分析模型。
19 | 2. 直接提供模型训练,默认 Text-CNN 字符级卷积网络作为 baseline ,自带早停操作,使用少的参数即可开始训练多分类模型。
20 | 3. 使用 Streamlit 快速对模型进行 UI 演示。
21 | 4. 增加 TF Serving 的转化和部署。
22 | 5. 增加 docker-compose up 的启动方式
23 |
24 |
25 | ## 直接使用 example/sa_ui.py 进行前端 ui 展示效果
26 |
27 | ```python
28 | # 安装 streamlit 之后直接运行脚本
29 | streamlit run sa_ui.py
30 | ```
31 |
32 |
33 |
34 | ## 使用方法
35 | > 1. pip install litNlp
36 | > 2. 模型需要先通过训练,保存在 sa_model 里面,然后就可以批预测,具体的使用见 example 文件内容
37 |
38 | ```python
39 | from litNlp.predict import SA_Model_Predict
40 | import numpy as np
41 |
42 | # 加载模型的字典项
43 | tokenize_path = 'model/tokenizer.pickle'
44 | # train_method : 模型训练方式,默认 textcnn ,可选:bilstm , gru
45 | train_method = 'textcnn'
46 | # 模型的保存位置,后续用于推理
47 | sa_model_path_m = 'model/{}.h5'.format(train_method)
48 | # 开始输入待测样例
49 | predict_text = ['这个我不喜欢', '这个我喜欢不']
50 | # 加载模型
51 | model = SA_Model_Predict(tokenize_path, sa_model_path_m, max_len=100)
52 | # 开始推理
53 | sa_score = model.predict(predict_text)
54 | # 情感极性概率
55 | print(np.asarray(sa_score)[:,1])
56 | # 情感label输出
57 | print(np.argmax(np.asarray(sa_score), axis=1))
58 |
59 | ```
60 |
61 | ## 参数解释
62 | ```python
63 | # 最大句子长度
64 | maxlen = 100
65 | # 最大的tokenizer字典长度
66 | max_words = 1000
67 | # 设置embedding大小
68 | embedding_dim = 300
69 | # 模型的保存位置,后续用于推理
70 | sa_model_path_m = 'sa_model/c_cnn_m.h5'
71 | # 离线保存tokenizer
72 | tokenize_path ='sa_model/tokenizer.pickle'
73 | # 分类的类别数
74 | num_classes = 2
75 | # train_method : 模型训练方式,默认textcnn,可选:bilstm, gru
76 | train_method = 'textcnn'
77 | ```
78 |
79 | ## 2 个 epoch 的二分类性能
80 |
81 |
82 |
83 | ## jupyter 实验
84 |
85 | > 情感分析,优化语义的情感推理
86 |
87 |
88 | ## Flask Gunicorn 模型部署
89 | python sa_server.py 即可对训练的情感分析模型进行部署,模型首次推理需要预热,后续推理耗时在 200ms 之内。
90 |
91 |
92 |
93 | ## Tensorflow Serving 模型部署
94 |
95 | 利用 python example/sa_model2tf_serving_model.py 进行模型转换之后即可直接进行 TF Serving 的服务部署。
96 |
97 | 首先拉取对应版本的 TF Serving Docker
98 |
99 | docker pull tensorflow/serving:2.3.0
100 |
101 | 直接利用 Docker 加载转换之后的模型即可完成模型部署,TensorFlow Serving 会自动选择版本号最大的模型进行载入。
102 |
103 | Docker 命令行的 Dev 启动模式
104 |
105 | docker run -t --rm -p 9500:8500 -p 9501:8501 \
106 | -v "$(pwd)/tf_model/:/models/textcnn" \
107 | -e MODEL_NAME=textcnn \
108 | tensorflow/serving:2.3.0 --tensorflow_inter_op_parallelism=4
109 |
110 |
111 | Docker 命令行的 Pro 启动模式
112 |
113 | docker run -d --rm -p 9500:8500 -p 9501:8501 \
114 | -v "$(pwd)/tf_model/:/models/textcnn" \
115 | -e MODEL_NAME=textcnn \
116 | tensorflow/serving:2.3.0 --tensorflow_inter_op_parallelism=4
117 |
118 | 或者在 yml 所在的文件夹下增加直接使用 docker-compose up 进行服务的启动。
119 |
120 | 服务请求:
121 | 部署之后使用 python sa_tf_serving_api_client.py 进行 TF serving 服务的调用。
122 |
123 |
--------------------------------------------------------------------------------
/example/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .idea
6 | Customer_Satisfaction_Analysis/.idea
7 | model/__pycache__
8 | # C extensions
9 | *.so
10 | dist
11 | litNlp.egg-info
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # pyenv
79 | .python-version
80 |
81 | # celery beat schedule file
82 | celerybeat-schedule
83 |
84 | # SageMath parsed files
85 | *.sage.py
86 |
87 | # Environments
88 | .env
89 | .venv
90 | env/
91 | venv/
92 | ENV/
93 | env.bak/
94 | venv.bak/
95 |
96 | # Spyder project settings
97 | .spyderproject
98 | .spyproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # mkdocs documentation
104 | /site
105 |
106 | # mypy
107 | .mypy_cache/
108 | ### Example user template template
109 | ### Example user template
110 |
111 | # IntelliJ project files
112 | .idea
113 | *.iml
114 | out
115 | gen
116 |
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
1 | ### litNlp Demo
1. 电商情感极性输出
2 |
3 |
2. 酒店情感极性输出
--------------------------------------------------------------------------------
/example/__init__.py:
--------------------------------------------------------------------------------
1 | name = "example_litNlp"
--------------------------------------------------------------------------------
/example/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 |
3 | services:
4 | tfserving_predict:
5 | image: tensorflow/serving:2.3.0
6 | environment:
7 | MODEL_NAME: textcnn
8 | ports:
9 | - 9500:8500
10 | - 9501:8501
11 | volumes:
12 | - ./tf_model/:/models/textcnn
13 | restart: on-failure
14 | entrypoint:
15 | - /usr/bin/tf_serving_entrypoint.sh
16 | - --tensorflow_inter_op_parallelism=2
17 |
--------------------------------------------------------------------------------
/example/model/__init__.py:
--------------------------------------------------------------------------------
1 | name = "example_litNlp"
--------------------------------------------------------------------------------
/example/model_predict.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Demo: batch sentiment inference with a previously trained litNlp model."""

from litNlp.predict import SA_Model_Predict
import numpy as np

# Tokenizer dictionary persisted during training.
tokenize_path = 'model/tokenizer.pickle'
# Training method of the saved model: 'textcnn' (default), 'bilstm' or 'gru'.
train_method = 'textcnn'
# Trained model weights used for inference.
sa_model_path_m = 'model/{}.h5'.format(train_method)
# Sample sentences to score.
samples = ['这个我不喜欢', '这个我喜欢不']
# Load the predictor; max_len must match the length used at training time.
predictor = SA_Model_Predict(tokenize_path, sa_model_path_m, max_len=100)
# Run inference and keep the class-probability matrix as an array.
scores = np.asarray(predictor.predict(samples))
# Probability of the positive class for each sample.
print(scores[:, 1])
# Hard sentiment labels (argmax over class probabilities).
print(np.argmax(scores, axis=1))
--------------------------------------------------------------------------------
/example/model_train.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Time: 2020/6/21 0021 20:46
"""Demo: train a litNlp sentiment classifier on hotel review data."""

import pandas as pd
from litNlp.train import SA_Model_Train

# E-commerce corpus (alternative dataset):
# train_data = pd.read_csv('data/ebusiness_comment.csv')
# Hotel review corpus.
corpus = pd.read_csv('data/hotel_comment.csv')
# Character-level preprocessing: split each text into space-joined characters.
corpus['text_cut'] = corpus['text'].apply(lambda t: " ".join(list(t)))
# Maximum sentence length (characters).
maxlen = 100
# Tokenizer vocabulary size.
max_words = 1000
# Dimension of the randomly initialised embeddings.
embedding_dim = 300
# Training method: 'textcnn' (default), 'bilstm' or 'gru'.
train_method = 'textcnn'
# Where the trained model is saved for later inference.
sa_model_path_m = 'model/{}.h5'.format(train_method)
# Where the fitted tokenizer is persisted.
tokenize_path = 'model/tokenizer.pickle'
# evaluate=True runs evaluation automatically once training finishes.
labels = corpus['label']
texts = corpus['text_cut']
trainer = SA_Model_Train(max_words, embedding_dim, maxlen, tokenize_path, sa_model_path_m, train_method)
# Two-polarity sentiment labels -> 2 output classes; parameters are tunable.
trainer.train(texts, labels, num_classes=2, batch_size=256, epochs=2, verbose=1, evaluate=True)
--------------------------------------------------------------------------------
/example/sa_client.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-
import requests
import time
import json


def sa_api_request(content, timeout=10):
    """POST *content* to the local sentiment-analysis API and print the result.

    Args:
        content: text to score.
        timeout: request timeout in seconds. Added so a dead or unreachable
            server fails fast instead of blocking forever (requests has no
            default timeout).
    """
    st = time.time()
    api_url = 'http://127.0.0.1:5021/sa_api'
    para = {"content": content}
    # The server parses the raw body with json.loads, so a JSON-encoded
    # payload passed via `data` is exactly what it expects.
    model_result = requests.post(api_url, data=json.dumps(para), timeout=timeout).json()
    print(model_result)
    print('request time used:{}'.format(time.time() - st))


if __name__ == '__main__':
    content = '这家酒店真的不错'
    # Single request against the running server.
    sa_api_request(content)
20 |
--------------------------------------------------------------------------------
/example/sa_model2tf_serving_model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Convert a trained Keras .h5 sentiment model into a TF Serving SavedModel."""

import tensorflow as tf

# Which trained model to convert: 'textcnn' (default), 'bilstm' or 'gru'.
train_method = 'textcnn'
# Path of the trained Keras model on disk.
sa_model_path_m = 'model/{}.h5'.format(train_method)
# Load the trained model.
model = tf.keras.models.load_model(sa_model_path_m)
# TF Serving hot-reloads the model with the highest version tag, so the
# export directory is named after an integer version number.
tag = 1
save_path = "tf_model/{}/".format(tag)
# Export in the SavedModel format that TF Serving loads.
model.save(save_path, save_format='tf')
18 |
19 |
--------------------------------------------------------------------------------
/example/sa_server.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-
"""Flask + flask-restful deployment of the litNlp sentiment model."""

from flask_restful import Resource, Api, request
from litNlp.predict import SA_Model_Predict
from flask import Flask
import json

app = Flask(__name__)
api = Api(app)

# Model initialisation; the first prediction warms the model up.
tokenize_path = 'model/tokenizer.pickle'
# NOTE(review): model_train.py saves 'model/textcnn.h5' — confirm that a
# 'model/model.h5' file actually exists, or align the two paths.
sa_model_path_m = 'model/model.h5'
# Load the predictor once at startup so all requests share it.
model = SA_Model_Predict(tokenize_path, sa_model_path_m, max_len=100)


class sa_post_api(Resource):
    """POST endpoint expecting a JSON body of the form {"content": <text>}."""

    def post(self):
        # Parse the raw request body as JSON.
        payload = json.loads(request.get_data())
        content = str(payload['content'])
        # Positive-class probability, rounded to 5 decimals.
        sa_score = round(float(model.predict([content])[0][1]), 5)
        show_data = {'sa_score': sa_score, 'status': 1}
        if sa_score > 0.5:
            show_data['label'] = '积极'
        elif sa_score < 0.5:
            show_data['label'] = '消极'
        else:
            show_data['label'] = '中性'
        return show_data


# Register the POST route.
api.add_resource(sa_post_api, '/sa_api')

if __name__ == '__main__':
    app.run(port='5021')
41 |
--------------------------------------------------------------------------------
/example/sa_tf_serving_api_client.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @USER: CarryChang
"""Client demo: query the TF Serving REST endpoint with character-level input."""

from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import requests
import pickle
import json

# Maximum sentence length used at training time.
maxlen = 100
# Tokenizer fitted during training.
tokenize_path = 'model/tokenizer.pickle'

predict_text = ['这个环境不喜欢', '这个环境喜欢不']

# Restore the persisted tokenizer.
with open(tokenize_path, 'rb') as tokenize_save:
    tokenizer_load = pickle.load(tokenize_save)

# Character-level split of each sample.
char_lists = [list(text) for text in predict_text]
# Map to index sequences and pad/truncate to maxlen.
test_text = pad_sequences(tokenizer_load.texts_to_sequences(char_lists), maxlen)

# Batch payload for the TF Serving REST API (port 9501).
data = {'instances': test_text.tolist()}
predict_url = 'http://localhost:9501/v1/models/textcnn:predict'
r = requests.post(predict_url, data=json.dumps(data))
# Column 1 of the prediction matrix holds the positive-class probability.
print("待测样例的情感值是:")
print(np.array(r.json()['predictions'])[:, 1])
33 |
--------------------------------------------------------------------------------
/example/sa_ui.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Streamlit UI for interactive sentiment analysis."""

from litNlp.predict import SA_Model_Predict
import streamlit as st

# Model initialisation (loaded once when the script starts).
tokenize_path = 'model/tokenizer.pickle'
# NOTE(review): model_train.py saves 'model/textcnn.h5' — confirm this path.
sa_model_path_m = 'model/model.h5'
model = SA_Model_Predict(tokenize_path, sa_model_path_m, max_len=100)

st.subheader('文本情感分析')
# st.write('文本情感分析')
# Text box for the sentence to analyse.
comment_input = st.text_input('请输入一行测试文本: ')

# Only run inference once the user typed something.
if comment_input != '':
    comment = str(comment_input).strip()
    # Show a spinner while the model predicts.
    with st.spinner('Predicting...'):
        sa_score = float(model.predict([comment])[0][1])
        show_data = {'status': 1, 'sa_score': sa_score}
        if sa_score > 0.5:
            show_data['label'] = '积极'
        elif sa_score < 0.5:
            show_data['label'] = '消极'
        else:
            show_data['label'] = '中性'
    # Render the result.
    st.write('分析结果: ')
    st.write(show_data)
33 |
--------------------------------------------------------------------------------
/example/tf_model/__init__.py:
--------------------------------------------------------------------------------
1 | name = "tf_serving"
--------------------------------------------------------------------------------
/litNlp/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Example user template template
3 | ### Example user template
4 |
5 | # IntelliJ project files
6 | .idea
7 | *.iml
8 | out
9 | gen
10 | __pycache__/
11 | model/__pycache__
--------------------------------------------------------------------------------
/litNlp/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time: 2020/6/22 0022 12:08
--------------------------------------------------------------------------------
/litNlp/model_structure/BiLSTM.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time: 2020/6/20 0020 10:38
3 | from tensorflow.keras.layers import Dense, Embedding, Flatten,Dropout,Convolution1D,BatchNormalization
4 | from tensorflow.keras.optimizers import Adam,RMSprop
5 | from tensorflow.keras.models import Sequential
6 | from tensorflow.keras.layers import Bidirectional,Activation,LSTM
class BILSTM_Model:
    """Bidirectional-LSTM sentiment classifier builder."""

    def create_model(self, max_words, embedding_dim, maxlen, n_class=2):
        """Build and compile the BiLSTM model.

        Args:
            max_words: tokenizer vocabulary size (embedding input dimension).
            embedding_dim: size of the trainable embedding vectors.
            maxlen: fixed input sequence length.
            n_class: number of output classes (default 2).

        Returns:
            A compiled tf.keras Sequential model.
        """
        model = Sequential()
        # Trainable embedding layer.
        model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
        # BiLSTM encoder followed by a second, unidirectional LSTM.
        model.add(Bidirectional(LSTM(units=32, return_sequences=True)))
        model.add(LSTM(units=16, return_sequences=False))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(n_class, activation='softmax'))
        # `learning_rate` replaces the deprecated `lr` alias in TF2 optimizers.
        # NOTE(review): categorical_crossentropy expects one-hot labels, while
        # Model_TextCNN.py compiles with sparse_categorical_crossentropy —
        # confirm the training pipeline encodes labels accordingly.
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(learning_rate=1e-3),
                      metrics=['accuracy'])
        return model
--------------------------------------------------------------------------------
/litNlp/model_structure/GRU.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time: 2020/6/20 0020 10:38
3 | from tensorflow.keras.layers import Dense, Embedding, Flatten,Dropout,Convolution1D,BatchNormalization
4 | from tensorflow.keras.optimizers import Adam,RMSprop
5 | from tensorflow.keras.models import Sequential
6 | from tensorflow.keras.layers import Activation,MaxPool1D,Input,GRU
class GRU_Model:
    """Stacked-GRU sentiment classifier builder."""

    def create_model(self, max_words, embedding_dim, maxlen, n_class=2):
        """Build and compile the GRU model.

        Args:
            max_words: tokenizer vocabulary size (embedding input dimension).
            embedding_dim: size of the trainable embedding vectors.
            maxlen: fixed input sequence length.
            n_class: number of output classes (default 2).

        Returns:
            A compiled tf.keras Sequential model.
        """
        model = Sequential()
        # Trainable embedding layer.
        model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
        # Two stacked GRU layers; the second collapses the sequence.
        model.add(GRU(units=32, return_sequences=True))
        model.add(GRU(units=16, return_sequences=False))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(n_class, activation='softmax'))
        # `learning_rate` replaces the deprecated `lr` alias in TF2 optimizers.
        # NOTE(review): categorical_crossentropy expects one-hot labels, while
        # Model_TextCNN.py compiles with sparse_categorical_crossentropy —
        # confirm the training pipeline encodes labels accordingly.
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(learning_rate=1e-3),
                      metrics=['accuracy'])
        return model
20 |
--------------------------------------------------------------------------------
/litNlp/model_structure/Model_TextCNN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time: 2020/6/20 0020 0:46
3 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
4 | from tensorflow.keras.layers import Dense, Embedding, Flatten,Dropout,Convolution1D,BatchNormalization
5 | from tensorflow.keras.preprocessing.sequence import pad_sequences
6 | from tensorflow.keras.optimizers import Adam,RMSprop
7 | from tensorflow.keras.models import Sequential, Model
8 | from sklearn.model_selection import train_test_split
9 | from tensorflow.keras.preprocessing.text import Tokenizer
10 | from tensorflow.keras.layers import concatenate,GlobalAveragePooling1D,Activation,MaxPool1D,Input
class sa_model:
    """Multi-window TextCNN (functional API): parallel 3/4/5-gram conv branches."""
    def create_model(self, max_words, embedding_dim, maxlen, n_class=2):
        """Build and compile the multi-branch TextCNN.

        Args:
            max_words: vocabulary size (embedding input dim is max_words + 1).
            embedding_dim: dimensionality of the embedding vectors.
            maxlen: fixed input sequence length.
            n_class: number of output classes (softmax units).

        Returns:
            A compiled tf.keras functional Model.
        """
        # Functional (Model) API so the three conv branches can share the embedding.
        main_input = Input(shape=(maxlen,), dtype='float64')
        embedder = Embedding(max_words + 1, embedding_dim, input_length=maxlen)
        embed = embedder(main_input)
        # Parallel convolutions with window sizes 3, 4 and 5.
        cnn1 = Convolution1D(256, 3, padding='same', strides=1, activation='relu')(embed)
        cnn1 = MaxPool1D(pool_size=4)(cnn1)
        cnn2 = Convolution1D(256, 4, padding='same', strides=1, activation='relu')(embed)
        cnn2 = MaxPool1D(pool_size=4)(cnn2)
        cnn3 = Convolution1D(256, 5, padding='same', strides=1, activation='relu')(embed)
        cnn3 = MaxPool1D(pool_size=4)(cnn3)
        # Concatenate the branch feature maps along the channel axis.
        cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
        flat = Flatten()(cnn)
        drop = Dropout(0.5)(flat)
        main_output = Dense(n_class, activation='softmax')(drop)
        model = Model(inputs=main_input, outputs=main_output)
        # sparse_categorical_crossentropy: expects integer (not one-hot) labels.
        # 'lr' is a deprecated alias removed in newer Keras; use learning_rate.
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=Adam(learning_rate=1e-3),
                      metrics=['accuracy'])
        return model
32 |
--------------------------------------------------------------------------------
/litNlp/model_structure/TextCNN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time: 2020/6/20 0020 10:38
3 | from tensorflow.keras.layers import Dense, Embedding, Flatten,Dropout,Convolution1D,BatchNormalization
4 | from tensorflow.keras.optimizers import Adam,RMSprop
5 | from tensorflow.keras.models import Sequential, Model
6 | from tensorflow.keras.layers import concatenate,GlobalAveragePooling1D,Activation,MaxPooling1D,Input
class sa_model:
    """Single-branch binary TextCNN: Embedding -> Conv1D -> MaxPool -> Dense(sigmoid)."""
    def create_model(self, max_words, embedding_dim, maxlen):
        """Build and compile a binary (sigmoid) TextCNN classifier.

        Args:
            max_words: vocabulary size for the embedding layer.
            embedding_dim: dimensionality of the embedding vectors.
            maxlen: fixed input sequence length.

        Returns:
            A compiled tf.keras Sequential model with a single sigmoid output.
        """
        model = Sequential()
        # embedding layer
        model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
        model.add(Convolution1D(64, 3, input_shape=(-1, embedding_dim)))
        model.add(Activation('relu'))
        model.add(MaxPooling1D(2, 2))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.5))
        # Single sigmoid unit -> binary classification with binary_crossentropy.
        model.add(Dense(1, activation='sigmoid'))
        # 'lr' is a deprecated alias removed in newer Keras; use learning_rate.
        model.compile(loss='binary_crossentropy',
                      optimizer=Adam(learning_rate=1e-3),
                      metrics=['accuracy'])
        return model
21 |
--------------------------------------------------------------------------------
/litNlp/model_structure/TextCNN_m.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time: 2020/6/20 0020 10:38
3 | from tensorflow.keras.layers import Dense, Embedding, Flatten,Dropout,Convolution1D,BatchNormalization
4 | from tensorflow.keras.optimizers import Adam,RMSprop
5 | from tensorflow.keras.models import Sequential, Model
6 | from tensorflow.keras.layers import GlobalAveragePooling1D,Activation,MaxPooling1D,Input
class TextCNN_m:
    """Multi-class TextCNN: Embedding -> Conv1D -> MaxPool -> Dense(softmax)."""
    def create_model(self, max_words, embedding_dim, maxlen, n_class=2):
        """Build and compile the multi-class TextCNN.

        Args:
            max_words: vocabulary size for the embedding layer.
            embedding_dim: dimensionality of the embedding vectors.
            maxlen: fixed input sequence length.
            n_class: number of output classes (softmax units).

        Returns:
            A compiled tf.keras Sequential model.
        """
        model = Sequential()
        # embedding layer
        model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
        model.add(Convolution1D(64, 3, input_shape=(-1, embedding_dim)))
        model.add(Activation('relu'))
        model.add(MaxPooling1D(2, 2))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(n_class, activation='softmax'))
        # categorical_crossentropy: expects one-hot labels (see to_categorical in train.py).
        # 'lr' is a deprecated alias removed in newer Keras; use learning_rate.
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(learning_rate=1e-3),
                      metrics=['accuracy'])
        return model
21 |
--------------------------------------------------------------------------------
/litNlp/model_structure/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CarryChang/litNlp/53bc4df6168fcfdc109afbc4acd0e4838c6899dc/litNlp/model_structure/__init__.py
--------------------------------------------------------------------------------
/litNlp/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time: 2020/6/20 0020 0:55
3 | from tensorflow.keras.preprocessing.sequence import pad_sequences
4 | from tensorflow.keras.models import load_model
5 | import pickle
class SA_Model_Predict:
    """Batch sentiment predictor: loads a saved tokenizer + Keras model once,
    then scores lists of raw text strings."""
    def __init__(self, tokenize_path, sa_model_path_m, max_len=100):
        """Load the pickled tokenizer and the trained model.

        Args:
            tokenize_path: path to the pickled Keras Tokenizer.
            sa_model_path_m: path to the saved Keras model file.
            max_len: padding length for input sequences.
        """
        with open(tokenize_path, 'rb') as tokenize_save:
            self.tokenizer_load = pickle.load(tokenize_save)
        self.max_len = max_len
        self.sa_model_path_m = sa_model_path_m
        # Fix: load the model ONCE here. The original reloaded it from disk
        # on every predict() call, which is very slow for repeated batches.
        self.model = load_model(sa_model_path_m)
    def predict(self, predict_text):
        """Score a list of texts.

        Args:
            predict_text: iterable of raw strings; each is split into characters.

        Returns:
            Per-class probability array from model.predict.
        """
        # Character-level tokenization: each text becomes a list of characters.
        tk_list = [list(text) for text in predict_text]
        test_text = pad_sequences(self.tokenizer_load.texts_to_sequences(tk_list), self.max_len)
        test_proba_list = self.model.predict(test_text)
        return test_proba_list
18 | # if __name__ == '__main__':
19 | # # 内置参数,批处理文本
20 | # predict_text = ['这个我不喜欢', '这个我喜欢不']
21 | # # 初始化模型
22 | # model = SA_Model_Predict(tokenize_path=tokenizer_path, sa_model_path_m=sa_model_path)
23 | # sa_score = model.predict(predict_text)
24 | # # 多分类模型输出
25 | # print([i[1] for i in sa_score])
--------------------------------------------------------------------------------
/litNlp/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time: 2020/6/20 0020 0:46
3 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
4 | from tensorflow.keras.preprocessing.sequence import pad_sequences
5 | from sklearn.model_selection import train_test_split
6 | from tensorflow.keras.preprocessing.text import Tokenizer
7 | from tensorflow.keras.utils import to_categorical
8 | from .model_structure.BiLSTM import BILSTM_Model
9 | from .model_structure.GRU import GRU_Model
10 | from .model_structure.TextCNN_m import TextCNN_m
11 | from sklearn import metrics
12 | import numpy as np
13 | import pickle
class SA_Model_Train:
    """Train a character-level sentiment classifier (TextCNN / GRU / BiLSTM)."""
    def __init__(self, max_words, embedding_dim, maxlen, tokenize_path, sa_model_path_m, train_method=''):
        """Select and build the network architecture.

        Args:
            max_words: vocabulary size for the tokenizer and embedding.
            embedding_dim: dimensionality of the embedding vectors.
            maxlen: padding length for input sequences.
            tokenize_path: where to persist the fitted Tokenizer (pickle).
            sa_model_path_m: where to save the trained model.
            train_method: 'gru', 'bilstm', or anything else for the default TextCNN.
        """
        if train_method == 'gru':
            self.init_model = GRU_Model()
        elif train_method == 'bilstm':
            self.init_model = BILSTM_Model()
        else:
            # Default architecture: TextCNN
            self.init_model = TextCNN_m()
        self.max_words = max_words
        self.tokenize_path = tokenize_path
        self.embedding_dim = embedding_dim
        self.maxlen = maxlen
        self.sa_model_path_m = sa_model_path_m
        # NOTE(review): the model is built with its default n_class (2); calling
        # train(..., num_classes=k) with k != 2 will mismatch the output layer.
        self.model = self.init_model.create_model(self.max_words, self.embedding_dim, self.maxlen)
    def train_tk(self, train_data):
        """Fit a Tokenizer on train_data, pickle it to tokenize_path, return it."""
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', num_words=self.max_words)
        tokenizer.fit_on_texts(train_data)
        with open(self.tokenize_path, 'wb') as tokenize:
            pickle.dump(tokenizer, tokenize)
        return tokenizer
    def train(self, train_data, label, num_classes, batch_size=256, epochs=10, verbose=1, evaluate=True):
        """Tokenize, split, fit, save, and (optionally) evaluate the model.

        Args:
            train_data: iterable of pre-segmented text strings.
            label: integer class labels aligned with train_data.
            num_classes: number of classes for one-hot encoding.
            batch_size, epochs, verbose: passed to model.fit.
            evaluate: when True, print a classification report on the held-out split.
        """
        # One-hot encode the labels.
        targets_values = to_categorical(label, num_classes=num_classes)
        # Fix: train_test_split returns (X_train, X_test, y_train, y_test); the
        # original unpacked it into swapped names (x_train, y_train, x_test,
        # y_test) and then compensated downstream, which was very error-prone.
        x_train, x_test, y_train, y_test = train_test_split(train_data, targets_values, test_size=0.2, random_state=1)
        tokenizer = self.train_tk(train_data)
        # Texts -> integer sequences, padded/truncated to maxlen.
        x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), self.maxlen)
        x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), self.maxlen)
        y_train, y_test = np.array(y_train), np.array(y_test)
        self.model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=verbose,
                       validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])
        self.model.save(self.sa_model_path_m)
        # predict() returns per-class probabilities on all tf.keras models, so
        # the original bare-except fallback to the removed predict_classes()
        # (which then argmax'ed a 1-D array and would have raised) is dropped.
        pre_result = self.model.predict(x_test, batch_size=256, verbose=0)
        if evaluate:
            result = [np.argmax(i) for i in pre_result]
            y_true = [np.argmax(i) for i in y_test]
            report = metrics.classification_report(y_true, result)
            acc = metrics.accuracy_score(y_true, result)
            # roc_auc_score on hard label predictions is only valid for the binary case.
            auc = metrics.roc_auc_score(y_true, result)
            print(report)
            print('acc: {} auc: {}'.format(acc, auc))
60 | # if __name__ == '__main__':
61 | # # C-CNN-SA(字符级卷积网络)
62 | # train_data = pd.read_csv('data/sa_data_train.csv')
63 | # # list sentence
64 | # train_data['text_cut'] = train_data['text'].apply(lambda x: " ".join(list(x)))
65 | # model = SA()
66 | # tokenizer = model.train_tk()
67 | # # 2-8的分割数据,固定测试数据
68 | # targets_values = to_categorical(train_data['label'], num_classes=num_classes)
69 | # x_train, y_train, x_test, y_test = train_test_split(train_data['text_cut'],targets_values, test_size=0.2, random_state=1)
70 | # # pad_sequences
71 | # x_train, x_test = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen), np.array(x_test)
72 | # y_train, y_test = pad_sequences(tokenizer.texts_to_sequences(y_train), maxlen), np.array(y_test)
73 | # # train
74 | # pre_result = model.train(x_train, x_test)
75 | # # evaluate
76 | # model.evaluate(pre_result, y_test)
77 |
78 |
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/pic/auc_2poch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CarryChang/litNlp/53bc4df6168fcfdc109afbc4acd0e4838c6899dc/pic/auc_2poch.png
--------------------------------------------------------------------------------
/pic/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CarryChang/litNlp/53bc4df6168fcfdc109afbc4acd0e4838c6899dc/pic/logo.png
--------------------------------------------------------------------------------
/pic/server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CarryChang/litNlp/53bc4df6168fcfdc109afbc4acd0e4838c6899dc/pic/server.png
--------------------------------------------------------------------------------
/pic/tools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CarryChang/litNlp/53bc4df6168fcfdc109afbc4acd0e4838c6899dc/pic/tools.png
--------------------------------------------------------------------------------
/pic/ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CarryChang/litNlp/53bc4df6168fcfdc109afbc4acd0e4838c6899dc/pic/ui.png
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=2.0.1
2 | streamlit
3 | docker-compose
4 | scikit-learn
5 | pandas
6 | numpy
7 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Packaging script for litNlp (pip install litNlp)."""
import setuptools
import io

# Use the README as the PyPI long description.
with io.open('README.md', 'r', encoding='utf-8') as f:
    long_description = f.read()

# tensorflow itself is provided through the cpu/gpu extras below.
REQUIRED_PACKAGES = [
    'h5py', 'requests'
]

setuptools.setup(
    name="litNlp",
    version="0.8.5",
    packages=['litNlp', 'litNlp.model_structure'],
    author="CarryChang",
    author_email="coolcahng@gmail.com",
    url='https://github.com/CarryChang/litNlp',
    license='https://www.apache.org/licenses/LICENSE-2.0',
    include_package_data=True,
    description='A fast tool for sentiment analysis model with tensorflow2.0 ',
    long_description=long_description,
    long_description_content_type='text/markdown',
    install_requires=REQUIRED_PACKAGES,
    python_requires=">=3.5",
    zip_safe=True,
    # setuptools convention: classifiers as a list.
    classifiers=[
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        'Intended Audience :: Developers',
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Topic :: Scientific/Engineering',
        'Topic :: Software Development',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
    # Let users choose the TF build: pip install litNlp[cpu] or litNlp[gpu].
    extras_require={
        "cpu": ["tensorflow>=2.0.1"],
        "gpu": ["tensorflow-gpu>=2.0.1"],
    },
    entry_points={
    },
    keywords=['text classification', 'nlp', 'batch predict',
              'deep learning', 'tensorflow', 'ml'],
)
--------------------------------------------------------------------------------