├── .gitignore ├── LICENSE ├── README.md ├── example ├── .gitignore ├── README.md ├── __init__.py ├── data │ └── hotel_comment.csv ├── docker-compose.yml ├── model │ └── __init__.py ├── model_predict.py ├── model_train.py ├── sa_client.py ├── sa_model2tf_serving_model.py ├── sa_server.py ├── sa_tf_serving_api_client.py ├── sa_ui.py └── tf_model │ └── __init__.py ├── litNlp ├── .gitignore ├── __init__.py ├── model_structure │ ├── BiLSTM.py │ ├── GRU.py │ ├── Model_TextCNN.py │ ├── TextCNN.py │ ├── TextCNN_m.py │ └── __init__.py ├── predict.py └── train.py ├── pic ├── auc_2poch.png ├── logo.png ├── server.png ├── tools.png └── ui.png ├── requirement.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .idea 6 | Customer_Satisfaction_Analysis/.idea 7 | model/__pycache__ 8 | # C extensions 9 | *.so 10 | dist 11 | litNlp.egg-info 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | ### Example user template template 109 | ### Example user template 110 | 111 | # IntelliJ project files 112 | .idea 113 | *.iml 114 | out 115 | gen 116 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 | ----------------- 6 | ## litNlp: A Fast Tool for Sentiment Analysis with Tensorflow2 7 | [![996.icu](https://img.shields.io/badge/link-996.icu-red.svg)](https://996.icu) 8 | [![PyPI Latest Release](https://img.shields.io/pypi/v/litNlp.svg)](https://pypi.org/project/litNlp/) 9 | [![Downloads](https://pepy.tech/badge/litnlp)](https://pepy.tech/project/litnlp) 10 | [![Downloads](https://pepy.tech/badge/litnlp/month)](https://pepy.tech/project/litnlp/month) 11 | [![Downloads](https://pepy.tech/badge/litnlp/week)](https://pepy.tech/project/litnlp/week) 12 | 13 | 14 | # litNlp 简介 15 | 16 | litNlp 是兼容最新版 Tensorflow 2.0 实现的一个轻量级的深度情感极性推理模型,使用字符级代替词语级进一步提升训练和推理速度,可以实现细粒度的多级别情感极性训练和预测,TF2 下 GPU 和 CPU 平台都能直接安装运行,是搭建 NLP 情感分析和分类模型 Baseline 的快速方案。 17 | 18 | 1. 内置深度学习情感分析模型。 19 | 2. 直接提供模型训练,默认 Text-CNN 字符级卷积网络作为 baseline ,自带早停操作,使用少的参数即可开始训练多分类模型。 20 | 3. 使用 Streamlit 快速对模型进行 UI 演示。 21 | 4. 增加 TF Serving 的转化和部署。 22 | 5. 增加 docker-compose up 的启动方式 23 | 24 | 25 | ## 直接使用 emample/sa_ui.py 进行前端 ui 展示效果 26 | 27 | ```python 28 | # 安装 streamlit 之后直接运行脚本 29 | streamlit run sa_ui.py 30 | ``` 31 | 32 |
33 | 34 | ## 使用方法 35 | > 1. pip install litNlp 36 | > 2. 模型需要先通过训练,保存在 sa_model 里面,然后就可以批预测,具体的使用见 example 文件内容 37 | 38 | ```python 39 | from litNlp.predict import SA_Model_Predict 40 | import numpy as np 41 | 42 | # 加载模型的字典项 43 | tokenize_path = 'model/tokenizer.pickle' 44 | # train_method : 模型训练方式,默认 textcnn ,可选:bilstm , gru 45 | train_method = 'textcnn' 46 | # 模型的保存位置,后续用于推理 47 | sa_model_path_m = 'model/{}.h5'.format(train_method) 48 | # 开始输入待测样例 49 | predict_text = ['这个我不喜欢', '这个我喜欢不'] 50 | # 加载模型 51 | model = SA_Model_Predict(tokenize_path, sa_model_path_m, max_len=100) 52 | # 开始推理 53 | sa_score = model.predict(predict_text) 54 | # 情感极性概率 55 | print(np.asarray(sa_score)[:,1]) 56 | # 情感label输出 57 | print(np.argmax(np.asarray(sa_score), axis=1)) 58 | 59 | ``` 60 | 61 | ## 参数解释 62 | ```python 63 | # 最大句子长度 64 | maxlen = 100 65 | # 最大的tokenizer字典长度 66 | max_words = 1000 67 | # 设置embedding大小 68 | embedding_dim = 300 69 | # 模型的保存位置,后续用于推理 70 | sa_model_path_m = 'sa_model/c_cnn_m.h5' 71 | # 离线保存tokenizer 72 | tokenize_path ='sa_model/tokenizer.pickle' 73 | # 分类的类别数 74 | num_classes = 2 75 | # train_method : 模型训练方式,默认textcnn,可选:bilstm, gru 76 | train_method = 'textcnn' 77 | ``` 78 | 79 | ## 2 个 epoch 的二分类性能 80 | 81 |
82 | 83 | ## jupyter 实验 84 | 85 | > 情感分析,优化语义的情感推理 86 |
87 | 88 | ## Flask Gunicorn 模型部署 89 | python sa_server.py 即可对训练的情感分析模型进行部署,模型首次推理需要预热,后续推理耗时在 200ms 之内。 90 | 91 |
92 | 93 | ## Tensorflow Serving 模型部署 94 | 95 | 利用 python example/sa_model2tf_serving_model.py 进行模型转换之后即可直接进行 TF Serving 的服务部署。 96 | 97 | 首先拉取对应版本的 TF Serving Docker 98 | 99 | docker pull tensorflow/serving:2.3.0 100 | 101 | 直接利用 Docker 加载转换之后的模型即可完成模型部署,TensorFlow Serving 会自动选择版本号最大的模型进行载入。 102 | 103 | Docker 命令行的 Dev 启动模式 104 | 105 | docker run -t --rm -p 9500:8500 -p:9501:8501 \ 106 | -v "$(pwd)/tf_model/:/models/textcnn" \ 107 | -e MODEL_NAME=textcnn -tensorflow_inter_op_parallelism=4 \ 108 | tensorflow/serving:2.3.0 109 | 110 | 111 | Docker 命令行的 Pro 启动模式 112 | 113 | docker run -d --rm -p 9500:8500 -p:9501:8501 \ 114 | -v "$(pwd)/tf_model/:/models/textcnn" \ 115 | -e MODEL_NAME=textcnn -tensorflow_inter_op_parallelism=4 \ 116 | tensorflow/serving:2.3.0 117 | 118 | 或者在 yml 所在的文件夹下增加直接使用 docker-compose up 进行服务的启动。 119 | 120 | 服务请求: 121 | 部署之后使用 python sa_tf_serving_api_client.py 进行 TF serving 服务的调用。 122 | 123 | -------------------------------------------------------------------------------- /example/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .idea 6 | Customer_Satisfaction_Analysis/.idea 7 | model/__pycache__ 8 | # C extensions 9 | *.so 10 | dist 11 | litNlp.egg-info 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | ### Example user template template 109 | ### Example user template 110 | 111 | # IntelliJ project files 112 | .idea 113 | *.iml 114 | out 115 | gen 116 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | ### litNlp Demo 1. 电商情感极性输出 2 | 3 | 2. 
酒店情感极性输出 -------------------------------------------------------------------------------- /example/__init__.py: -------------------------------------------------------------------------------- 1 | name = "example_litNlp" -------------------------------------------------------------------------------- /example/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | tfserving_predict: 5 | image: tensorflow/serving:2.3.0 6 | environment: 7 | MODEL_NAME: textcnn 8 | ports: 9 | - 9500:8500 10 | - 9501:8501 11 | volumes: 12 | - ./tf_model/:/models/textcnn 13 | restart: on-failure 14 | entrypoint: 15 | - /usr/bin/tf_serving_entrypoint.sh 16 | - --tensorflow_inter_op_parallelism=2 17 | -------------------------------------------------------------------------------- /example/model/__init__.py: -------------------------------------------------------------------------------- 1 | name = "example_litNlp" -------------------------------------------------------------------------------- /example/model_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from litNlp.predict import SA_Model_Predict 5 | import numpy as np 6 | 7 | # 加载模型的字典项 8 | tokenize_path = 'model/tokenizer.pickle' 9 | # train_method : 模型训练方式,默认 textcnn ,可选:bilstm , gru 10 | train_method = 'textcnn' 11 | # 模型的保存位置,后续用于推理 12 | sa_model_path_m = 'model/{}.h5'.format(train_method) 13 | # 开始输入待测样例 14 | predict_text = ['这个我不喜欢', '这个我喜欢不'] 15 | # 加载模型 16 | model = SA_Model_Predict(tokenize_path, sa_model_path_m, max_len=100) 17 | # 开始推理 18 | sa_score = model.predict(predict_text) 19 | # 情感极性概率 20 | print(np.asarray(sa_score)[:,1]) 21 | # 情感label输出 22 | print(np.argmax(np.asarray(sa_score), axis=1)) -------------------------------------------------------------------------------- /example/model_train.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time: 2020/6/21 0021 20:46 3 | import pandas as pd 4 | from litNlp.train import SA_Model_Train 5 | # e_comment 6 | # train_data = pd.read_csv('data/ebusiness_comment.csv') 7 | # hotel 8 | train_data = pd.read_csv('data/hotel_comment.csv') 9 | # 进行字符级处理 10 | train_data['text_cut'] = train_data['text'].apply(lambda x: " ".join(list(x))) 11 | # 最大句子长度 12 | maxlen = 100 13 | # 设置 tokenizer 字典大小 14 | max_words = 1000 15 | # 设置随机 embedding 大小 16 | embedding_dim = 300 17 | # train_method : 模型训练方式,默认 textcnn ,可选:bilstm , gru 18 | train_method = 'textcnn' 19 | # 模型的保存位置,后续用于推理 20 | sa_model_path_m = 'model/{}.h5'.format(train_method) 21 | # 离线保存 tokenizer 22 | tokenize_path ='model/tokenizer.pickle' 23 | # train: evaluate默认在训练完毕之后开启计算 24 | label = train_data['label'] 25 | train_data = train_data['text_cut'] 26 | model = SA_Model_Train(max_words, embedding_dim, maxlen, tokenize_path, sa_model_path_m, train_method) 27 | # 模型使用两极情感标注,定义 2 类标签类别,参数可以调节 28 | model.train(train_data, label, num_classes=2, batch_size=256, epochs=2, verbose=1, evaluate=True) -------------------------------------------------------------------------------- /example/sa_client.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | import requests 3 | import time 4 | import json 5 | 6 | 7 | def sa_api_request(content): 8 | st = time.time() 9 | api_url = 'http://127.0.0.1:5021/sa_api' 10 | para = {"content": content} 11 | model_result = requests.post(api_url, data=json.dumps(para)).json() 12 | print(model_result) 13 | print('request time used:{}'.format(time.time() - st)) 14 | 15 | 16 | if __name__ == '__main__': 17 | content = '这家酒店真的不错' 18 | # 接口请求 19 | sa_api_request(content) 20 | -------------------------------------------------------------------------------- /example/sa_model2tf_serving_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | # 待转化的模型,默认 textcnn ,可选:bilstm , gru 7 | train_method = 'textcnn' 8 | # 模型的保存位置,后续用于推理 9 | sa_model_path_m = 'model/{}.h5'.format(train_method) 10 | # 模型加载 11 | model = tf.keras.models.load_model(sa_model_path_m) 12 | # TF Serving 按照最大的 tag 进行模型的热更新,设置模型的tag 13 | tag = 1 14 | # 转化之后的模型路径 15 | save_path = "tf_model/{}/".format(tag) 16 | # 保存为 tf serving 加载的 model 形式 17 | model.save(save_path, save_format='tf') 18 | 19 | -------------------------------------------------------------------------------- /example/sa_server.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | from flask_restful import Resource, Api, request 3 | from litNlp.predict import SA_Model_Predict 4 | from flask import Flask 5 | import json 6 | 7 | app = Flask(__name__) 8 | api = Api(app) 9 | 10 | # 初始化模型,第一次推理需要预热 11 | tokenize_path = 'model/tokenizer.pickle' 12 | sa_model_path_m = 'model/model.h5' 13 | # 模型加载 14 | model = SA_Model_Predict(tokenize_path, sa_model_path_m, max_len=100) 15 | 16 | 17 | class sa_post_api(Resource): 18 | def post(self): 19 | # 接收对象 20 | parser = json.loads(request.get_data()) 21 | content = str(parser['content']) 22 | sa_score = round(float(model.predict([content])[0][1]), 5) 23 | show_data = dict() 24 | show_data['sa_score'] = sa_score 25 | show_data['status'] = 1 26 | if sa_score > 0.5: 27 | show_data['label'] = '积极' 28 | elif sa_score < 0.5: 29 | show_data['label'] = '消极' 30 | else: 31 | show_data['label'] = '中性' 32 | # print(show_data) 33 | return show_data 34 | 35 | 36 | # 定义 POST 接口的请求信息 37 | api.add_resource(sa_post_api, '/sa_api') 38 | 39 | if __name__ == '__main__': 40 | app.run(port='5021') 41 | -------------------------------------------------------------------------------- /example/sa_tf_serving_api_client.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @USER: CarryChang 3 | 4 | from tensorflow.keras.preprocessing.sequence import pad_sequences 5 | import numpy as np 6 | import requests 7 | import pickle 8 | import json 9 | 10 | # 设置单个用户评论的最大句子长度 11 | maxlen = 100 12 | # 保存向量字典 13 | tokenize_path = 'model/tokenizer.pickle' 14 | 15 | predict_text = ['这个环境不喜欢', '这个环境喜欢不'] 16 | # 特征处理 17 | with open(tokenize_path, 'rb') as tokenize_save: 18 | tokenizer_load = pickle.load(tokenize_save) 19 | 20 | # 字符级 21 | tk_list = [list(text) for text in predict_text] 22 | # 字符填充 23 | test_text = pad_sequences(tokenizer_load.texts_to_sequences(tk_list), maxlen) 24 | 25 | # 多个评论进行推理 26 | data = {'instances': test_text.tolist()} 27 | # 
tf_model_textcnn 模型部署,REST 的访问端口为 9501 28 | predict_url = 'http://localhost:9501/v1/models/textcnn:predict' 29 | r = requests.post(predict_url, data=json.dumps(data)) 30 | # 直接提取矩阵中积极的情感 31 | print("待测样例的情感值是:") 32 | print(np.array(r.json()['predictions'])[:, 1]) 33 | -------------------------------------------------------------------------------- /example/sa_ui.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from litNlp.predict import SA_Model_Predict 3 | import streamlit as st 4 | 5 | # 初始化模型 6 | tokenize_path = 'model/tokenizer.pickle' 7 | sa_model_path_m = 'model/model.h5' 8 | model = SA_Model_Predict(tokenize_path, sa_model_path_m, max_len=100) 9 | # 不用项目自动重启 10 | st.subheader('文本情感分析') 11 | # st.write('文本情感分析') 12 | # 接受前端的内容显示 13 | comment_input = st.text_input('请输入一行测试文本: ') 14 | # 开始处理内容 15 | if comment_input != '': 16 | # 文本处理 17 | comment = str(comment_input).strip() 18 | # 添加等待,并开始预测 19 | with st.spinner('Predicting...'): 20 | sa_score = float(model.predict([comment])[0][1]) 21 | show_data = dict() 22 | show_data['status'] = 1 23 | show_data['sa_score'] = sa_score 24 | if sa_score > 0.5: 25 | show_data['label'] = '积极' 26 | elif sa_score < 0.5: 27 | show_data['label'] = '消极' 28 | else: 29 | show_data['label'] = '中性' 30 | # 最后展示内容 31 | st.write('分析结果: ') 32 | st.write(show_data) 33 | -------------------------------------------------------------------------------- /example/tf_model/__init__.py: -------------------------------------------------------------------------------- 1 | name = "tf_serving" -------------------------------------------------------------------------------- /litNlp/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Example user template template 3 | ### Example user template 4 | 5 | # IntelliJ project files 6 | .idea 7 | *.iml 8 | out 9 | gen 10 | __pycache__/ 11 | 
model/__pycache__ -------------------------------------------------------------------------------- /litNlp/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time: 2020/6/22 0022 12:08 -------------------------------------------------------------------------------- /litNlp/model_structure/BiLSTM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time: 2020/6/20 0020 10:38 3 | from tensorflow.keras.layers import Dense, Embedding, Flatten,Dropout,Convolution1D,BatchNormalization 4 | from tensorflow.keras.optimizers import Adam,RMSprop 5 | from tensorflow.keras.models import Sequential 6 | from tensorflow.keras.layers import Bidirectional,Activation,LSTM 7 | class BILSTM_Model: 8 | def create_model(self, max_words,embedding_dim, maxlen, n_class=2): 9 | model = Sequential() 10 | # embedding layer 11 | model.add(Embedding(max_words, embedding_dim, input_length=maxlen)) 12 | # BiLSTM 13 | model.add(Bidirectional(LSTM(units=32, return_sequences=True))) 14 | model.add(LSTM(units=16, return_sequences=False)) 15 | model.add(Dense(512, activation='relu')) 16 | model.add(Dropout(0.5)) 17 | model.add(Dense(n_class, activation='softmax')) 18 | model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy']) 19 | return model -------------------------------------------------------------------------------- /litNlp/model_structure/GRU.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time: 2020/6/20 0020 10:38 3 | from tensorflow.keras.layers import Dense, Embedding, Flatten,Dropout,Convolution1D,BatchNormalization 4 | from tensorflow.keras.optimizers import Adam,RMSprop 5 | from tensorflow.keras.models import Sequential 6 | from tensorflow.keras.layers import Activation,MaxPool1D,Input,GRU 7 | class GRU_Model: 8 | def create_model(self, 
max_words, embedding_dim, maxlen, n_class=2): 9 | model = Sequential() 10 | # embedding layer 11 | model.add(Embedding(max_words, embedding_dim, input_length=maxlen)) 12 | # GRU 13 | model.add(GRU(units=32, return_sequences=True)) 14 | model.add(GRU(units=16, return_sequences=False)) 15 | model.add(Dense(512, activation='relu')) 16 | model.add(Dropout(0.5)) 17 | model.add(Dense(n_class, activation='softmax')) 18 | model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy']) 19 | return model 20 | -------------------------------------------------------------------------------- /litNlp/model_structure/Model_TextCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time: 2020/6/20 0020 0:46 3 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau 4 | from tensorflow.keras.layers import Dense, Embedding, Flatten,Dropout,Convolution1D,BatchNormalization 5 | from tensorflow.keras.preprocessing.sequence import pad_sequences 6 | from tensorflow.keras.optimizers import Adam,RMSprop 7 | from tensorflow.keras.models import Sequential, Model 8 | from sklearn.model_selection import train_test_split 9 | from tensorflow.keras.preprocessing.text import Tokenizer 10 | from tensorflow.keras.layers import concatenate,GlobalAveragePooling1D,Activation,MaxPool1D,Input 11 | class sa_model: 12 | def create_model(self, max_words, embedding_dim, maxlen, n_class=2): 13 | # 使用model模式 14 | main_input = Input(shape=(maxlen,), dtype='float64') 15 | embedder = Embedding(max_words + 1, embedding_dim, input_length=maxlen) 16 | embed = embedder(main_input) 17 | # 3,4,5 windows 18 | cnn1 = Convolution1D(256, 3, padding='same', strides=1, activation='relu')(embed) 19 | cnn1 = MaxPool1D(pool_size=4)(cnn1) 20 | cnn2 = Convolution1D(256, 4, padding='same', strides=1, activation='relu')(embed) 21 | cnn2 = MaxPool1D(pool_size=4)(cnn2) 22 | cnn3 = 
# -*- coding: utf-8 -*-
# @Time: 2020/6/20 0020 10:38
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout, Convolution1D, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import concatenate, GlobalAveragePooling1D, Activation, MaxPooling1D, Input


class sa_model:
    """Builder for a simple single-kernel TextCNN binary classifier."""

    def create_model(self, max_words, embedding_dim, maxlen):
        """Build and compile the binary TextCNN classifier.

        Args:
            max_words: vocabulary size for the Embedding layer.
            embedding_dim: dimensionality of the learned token embeddings.
            maxlen: fixed input sequence length.

        Returns:
            A compiled ``tf.keras`` Sequential model with a single sigmoid output.
        """
        model = Sequential()
        # Embedding layer: maps integer token ids to dense vectors.
        model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
        # Fix: the original passed input_shape=(-1, embedding_dim) here, but
        # input_shape on a non-first Sequential layer is ignored by Keras --
        # the dead argument has been dropped.
        model.add(Convolution1D(64, 3))
        model.add(Activation('relu'))
        model.add(MaxPooling1D(2, 2))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.5))
        # Single sigmoid unit: binary (positive/negative) output.
        model.add(Dense(1, activation='sigmoid'))
        # Fix: `lr` is deprecated (and removed in recent Keras); use `learning_rate`.
        model.compile(loss='binary_crossentropy',
                      optimizer=Adam(learning_rate=1e-3),
                      metrics=['accuracy'])
        return model
# -*- coding: utf-8 -*-
# @Time: 2020/6/20 0020 10:38
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout, Convolution1D, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalAveragePooling1D, Activation, MaxPooling1D, Input


class TextCNN_m:
    """Builder for the multi-class TextCNN classifier (the package's default architecture)."""

    def create_model(self, max_words, embedding_dim, maxlen, n_class=2):
        """Build and compile the multi-class TextCNN classifier.

        Args:
            max_words: vocabulary size for the Embedding layer.
            embedding_dim: dimensionality of the learned token embeddings.
            maxlen: fixed input sequence length.
            n_class: number of output classes (default 2).

        Returns:
            A compiled ``tf.keras`` Sequential model with a softmax output.
        """
        model = Sequential()
        # Embedding layer: maps integer token ids to dense vectors.
        model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
        # NOTE(review): input_shape on a non-first Sequential layer is ignored
        # by Keras; kept here only to preserve the original call signature.
        model.add(Convolution1D(64, 3, input_shape=(-1, embedding_dim)))
        model.add(Activation('relu'))
        model.add(MaxPooling1D(2, 2))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(n_class, activation='softmax'))
        # Fix: `lr` is deprecated (and removed in recent Keras); use `learning_rate`.
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(learning_rate=1e-3),
                      metrics=['accuracy'])
        return model
# -*- coding: utf-8 -*-
# @Time: 2020/6/20 0020 0:55
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import pickle


class SA_Model_Predict:
    """Batch sentiment predictor backed by a pickled Tokenizer and a saved Keras model."""

    def __init__(self, tokenize_path, sa_model_path_m, max_len=100):
        """Load the fitted tokenizer; the model itself is loaded lazily.

        Args:
            tokenize_path: path to the pickled Keras Tokenizer.
            sa_model_path_m: path to the saved Keras model.
            max_len: padding length for input sequences (default 100).
        """
        with open(tokenize_path, 'rb') as tokenize_save:
            self.tokenizer_load = pickle.load(tokenize_save)
        self.max_len = max_len
        self.sa_model_path_m = sa_model_path_m
        # Cache for the loaded model (filled on first predict()).
        self._model = None

    def predict(self, predict_text):
        """Return per-class probabilities for a batch of texts.

        Args:
            predict_text: iterable of strings; each string is tokenized
                character by character (the package trains char-level models).

        Returns:
            The model's prediction array, one probability row per input text.
        """
        # Char-level tokenization: split each text into its characters.
        tk_list = [list(text) for text in predict_text]
        test_text = pad_sequences(self.tokenizer_load.texts_to_sequences(tk_list), self.max_len)
        # Fix: the original called load_model() on EVERY predict() call,
        # re-reading the model from disk each time; load once and cache.
        if self._model is None:
            self._model = load_model(self.sa_model_path_m)
        return self._model.predict(test_text)
# -*- coding: utf-8 -*-
# @Time: 2020/6/20 0020 0:46
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from .model_structure.BiLSTM import BILSTM_Model
from .model_structure.GRU import GRU_Model
from .model_structure.TextCNN_m import TextCNN_m
from sklearn import metrics
import numpy as np
import pickle


class SA_Model_Train:
    """Train a character-level sentiment classifier (TextCNN / GRU / BiLSTM)."""

    def __init__(self, max_words, embedding_dim, maxlen, tokenize_path, sa_model_path_m, train_method=''):
        """Select an architecture and build the (uncompiled-data) model.

        Args:
            max_words: vocabulary size for the tokenizer and embedding.
            embedding_dim: dimensionality of the token embeddings.
            maxlen: padding length for input sequences.
            tokenize_path: where to pickle the fitted Tokenizer.
            sa_model_path_m: where to save the trained model.
            train_method: 'gru', 'bilstm', or anything else for TextCNN (default).
        """
        if train_method == 'gru':
            self.init_model = GRU_Model()
        elif train_method == 'bilstm':
            self.init_model = BILSTM_Model()
        else:
            # Default architecture: TextCNN.
            self.init_model = TextCNN_m()
        self.max_words = max_words
        self.tokenize_path = tokenize_path
        self.embedding_dim = embedding_dim
        self.maxlen = maxlen
        self.sa_model_path_m = sa_model_path_m
        # NOTE(review): the model is built with the architecture's default
        # n_class (2); train()'s num_classes only shapes the labels. Confirm
        # they agree before training multi-class data.
        self.model = self.init_model.create_model(self.max_words, self.embedding_dim, self.maxlen)

    def train_tk(self, train_data):
        """Fit a Tokenizer on the training texts, pickle it, and return it."""
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', num_words=self.max_words)
        tokenizer.fit_on_texts(train_data)
        with open(self.tokenize_path, 'wb') as tokenize:
            pickle.dump(tokenizer, tokenize)
        return tokenizer

    def train(self, train_data, label, num_classes, batch_size=256, epochs=10, verbose=1, evaluate=True):
        """Train the model, save it, and optionally print held-out metrics.

        Args:
            train_data: iterable of texts (already space/char segmented).
            label: integer class labels aligned with ``train_data``.
            num_classes: number of label classes for one-hot encoding.
            batch_size, epochs, verbose: passed through to ``model.fit``.
            evaluate: when True, print a classification report on the 20% split.
        """
        targets_values = to_categorical(label, num_classes=num_classes)
        # Fix: the original unpacked train_test_split in the wrong order
        # (x_train, y_train, x_test, y_test), so "y_train" held test TEXTS and
        # "x_test" held train LABELS. The correct return order is
        # (X_train, X_test, y_train, y_test).
        x_train, x_test, y_train, y_test = train_test_split(
            train_data, targets_values, test_size=0.2, random_state=1)
        tokenizer = self.train_tk(train_data)
        # Pad both text splits; labels become plain arrays.
        x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), self.maxlen)
        x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), self.maxlen)
        y_train, y_test = np.array(y_train), np.array(y_test)
        self.model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=verbose,
                       validation_split=0.2,
                       callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])
        self.model.save(self.sa_model_path_m)
        if evaluate:
            # Fix: predict() already returns probabilities; the original's bare
            # `except:` fallback argmax-ed predict_classes() output (which is
            # already class indices) -- a double-decode bug that also hid any
            # real error. Prediction is now done only when evaluating.
            proba = self.model.predict(x_test, batch_size=256, verbose=0)
            result = np.argmax(proba, axis=1)
            y_true = np.argmax(y_test, axis=1)
            report = metrics.classification_report(y_true, result)
            acc = metrics.accuracy_score(y_true, result)
            # Fix: roc_auc_score on hard labels is only defined for the binary
            # case; report NaN for multi-class instead of raising.
            auc = metrics.roc_auc_score(y_true, result) if num_classes == 2 else float('nan')
            print(report)
            print('acc: {} auc: {}'.format(acc, auc))
# -*- coding: utf-8 -*-
"""Packaging script for litNlp: a lightweight TF2 sentiment-analysis toolkit."""
import setuptools
import io
import os
import platform
import subprocess
import sys

# Long description comes straight from the README (rendered on PyPI).
with io.open('README.md', 'r', encoding='utf-8') as f:
    long_description = f.read()

# Core runtime requirements; tensorflow itself is selected via extras below.
REQUIRED_PACKAGES = [
    'h5py', 'requests'
]

setuptools.setup(
    name="litNlp",
    version="0.8.5",
    packages=['litNlp', 'litNlp.model_structure'],
    author="CarryChang",
    author_email="coolcahng@gmail.com",
    url='https://github.com/CarryChang/litNlp',
    license='https://www.apache.org/licenses/LICENSE-2.0',
    include_package_data=True,
    description='A fast tool for sentiment analysis model with tensorflow2.0 ',
    long_description=long_description,
    long_description_content_type='text/markdown',
    install_requires=REQUIRED_PACKAGES,
    python_requires=">=3.5",
    zip_safe=True,
    # Fix: classifiers must be a list -- newer setuptools rejects a tuple here.
    classifiers=[
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        'Intended Audience :: Developers',
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Topic :: Scientific/Engineering',
        'Topic :: Software Development',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
    # Let the user pick the CPU or GPU tensorflow build explicitly.
    extras_require={
        "cpu": ["tensorflow>=2.0.1"],
        "gpu": ["tensorflow-gpu>=2.0.1"],
    },
    entry_points={
    },
    keywords=['text classification', 'nlp', 'batch predict',
              'deep learning', 'tensorflow', 'ml'],
)