├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── README_EN.md
├── pnlp
│   ├── __init__.py
│   ├── pcut.py
│   ├── penh.py
│   ├── piop.py
│   ├── pmag.py
│   ├── pnorm.py
│   ├── ptrans.py
│   ├── ptxt.py
│   ├── stopwords
│   │   ├── ReadMe.md
│   │   ├── __init__.py
│   │   ├── chinese_stopwords.txt
│   │   └── english_stopwords.txt
│   └── utils.py
├── setup.py
└── tests
    ├── __init__.py
    ├── piop_data
    │   ├── a.md
    │   ├── b.txt
    │   ├── c.data
    │   ├── csv.csv
    │   ├── first
    │   │   ├── fa.md
    │   │   ├── fb.txt
    │   │   ├── fc.data
    │   │   └── second
    │   │       ├── sa.md
    │   │       ├── sb.txt
    │   │       └── sc.data
    │   ├── json.json
    │   ├── list_dict.json
    │   ├── outfile.file
    │   ├── outfile.listdict
    │   ├── outjson.json
    │   └── yaml.yaml
    ├── test_pcut.py
    ├── test_penh.py
    ├── test_piop.py
    ├── test_pmag.py
    ├── test_pnorm.py
    ├── test_ptrans.py
    ├── test_ptxt.py
    ├── test_stopwords.py
    └── test_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | notebook.ipynb
2 | .*
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | .PHONY: build 3 | build: 4 | python3.11 setup.py sdist bdist_wheel 5 | 6 | .PHONY: upload 7 | upload: 8 | python3.11 -m twine upload -r pypi dist/* 9 | 10 | .PHONY: clean 11 | clean: 12 | rm -rf ./dist/ ./build/ ./pnlp.egg-info/ 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* 4 | 5 | - [功能特性](#%E5%8A%9F%E8%83%BD%E7%89%B9%E6%80%A7) 6 | - [安装](#%E5%AE%89%E8%A3%85) 7 | - [使用](#%E4%BD%BF%E7%94%A8) 8 | - [文本IO](#%E6%96%87%E6%9C%ACio) 9 | - [IO 处理](#io-%E5%A4%84%E7%90%86) 10 | - [内置方法](#%E5%86%85%E7%BD%AE%E6%96%B9%E6%B3%95) 11 | - [文本处理](#%E6%96%87%E6%9C%AC%E5%A4%84%E7%90%86) 12 | - [清理和提取](#%E6%B8%85%E7%90%86%E5%92%8C%E6%8F%90%E5%8F%96) 13 | - [内置正则](#%E5%86%85%E7%BD%AE%E6%AD%A3%E5%88%99) 14 | - [文本切分](#%E6%96%87%E6%9C%AC%E5%88%87%E5%88%86) 15 | - [任意部分切分](#%E4%BB%BB%E6%84%8F%E9%83%A8%E5%88%86%E5%88%87%E5%88%86) 16 | - [分句](#%E5%88%86%E5%8F%A5) 17 | - [分子句并按一个阈值合并子句](#%E5%88%86%E5%AD%90%E5%8F%A5%E5%B9%B6%E6%8C%89%E4%B8%80%E4%B8%AA%E9%98%88%E5%80%BC%E5%90%88%E5%B9%B6%E5%AD%90%E5%8F%A5) 18 | - [中文字符切分](#%E4%B8%AD%E6%96%87%E5%AD%97%E7%AC%A6%E5%88%87%E5%88%86) 19 | - [句子分组](#%E5%8F%A5%E5%AD%90%E5%88%86%E7%BB%84) 20 | - [文本增强](#%E6%96%87%E6%9C%AC%E5%A2%9E%E5%BC%BA) 21 | - [Token级别](#token%E7%BA%A7%E5%88%AB) 22 | - [句子级别](#%E5%8F%A5%E5%AD%90%E7%BA%A7%E5%88%AB) 23 | - [文本归一化](#%E6%96%87%E6%9C%AC%E5%BD%92%E4%B8%80%E5%8C%96) 24 | - [中文数字](#%E4%B8%AD%E6%96%87%E6%95%B0%E5%AD%97) 25 | - [格式转换](#%E6%A0%BC%E5%BC%8F%E8%BD%AC%E6%8D%A2) 26 | - [BIO转实体](#bio%E8%BD%AC%E5%AE%9E%E4%BD%93) 27 | - [任意参数转UUID](#%E4%BB%BB%E6%84%8F%E5%8F%82%E6%95%B0%E8%BD%ACuuid) 28 | - [内置词典](#%E5%86%85%E7%BD%AE%E8%AF%8D%E5%85%B8) 29 | - [停用词](#%E5%81%9C%E7%94%A8%E8%AF%8D) 30 | - [文本长度](#%E6%96%87%E6%9C%AC%E9%95%BF%E5%BA%A6) 31 | - [魔术方法](#%E9%AD%94%E6%9C%AF%E6%96%B9%E6%B3%95) 32 | - [并行处理](#%E5%B9%B6%E8%A1%8C%E5%A4%84%E7%90%86) 33 | - [测试](#%E6%B5%8B%E8%AF%95) 34 | - [更新日志](#%E6%9B%B4%E6%96%B0%E6%97%A5%E5%BF%97) 35 | 36 | 37 | 38 | NLP 预/后处理工具。 39 | 40 | ## 功能特性 41 | 42 | - 专为文本 IO 设计的灵活的 Pipeline 43 | - 灵活的文本清理/提取工具 44 | - 文本增强 45 | - 按句切分或按中文字符切分文本 46 | - 文本分桶 47 | - 中文字符归一化 48 | - 文本各种长度计算 49 | - 中英文常用停用词 50 | - 预处理魔术方法 51 | - 并发、批量化、实体 BIO 转实体 52 | 53 | ## 安装 54 | 55 | 需要 Python3.7+。 56 | 57 | `pip install pnlp` 58 | 59 | ## 使用 60 | 61 | ### 文本IO 62 | 63 | #### IO 处理 64 | 65 | ```bash 66 | tree tests/piop_data/ 67 | ├── a.md 68 | ├── b.txt 69 | ├── c.data 70 | ├── first 71 | │   ├── fa.md 72 | │   ├── fb.txt 73 | │   ├── fc.data 74 | │   └── second 75 | │   ├── sa.md 76 | │   ├── sb.txt 77 | │   └── sc.data 78 | ├── json.json 79 | ├── outfile.file 80 | ├── outjson.json 81 | └── yml.yml 82 | ``` 83 | 84 | ```python 85 | import os 86 | from pnlp import Reader 87 | 88 | DATA_PATH = 
"./pnlp/tests/piop_data/" 89 | pattern = '*.md' # 可以是 '*.txt', 'f*.*' 等,支持正则 90 | reader = Reader(pattern, use_regex=True) 91 | 92 | # 获取所有文件的行,输出行文本、行索引和所在的文件名 93 | for line in reader(DATA_FOLDER_PATH): 94 | print(line.lid, line.fname, line.text) 95 | """ 96 | 0 a.md line 1 in a. 97 | 1 a.md line 2 in a. 98 | 2 a.md line 3 in a. 99 | 0 fa.md line 1 in fa. 100 | 1 fa.md line 2 in fa 101 | ... 102 | """ 103 | 104 | # 获取某个文件的所有行,输出行文本、行索引和所在文件名,此时由于指定了文件名 pattern 无效 105 | for line in reader(os.path.join(DATA_FOLDER_PATH, "a.md")): 106 | print(line.lid, line.fname, line.text) 107 | """ 108 | 0 a.md line 1 in a. 109 | 1 a.md line 2 in a. 110 | 2 a.md line 3 in a. 111 | """ 112 | 113 | 114 | 115 | # 获取目录下的所有文件路径 116 | for path in Reader.gen_files(DATA_PATH, pattern, use_regex: True): 117 | print(path) 118 | """ 119 | pnlp/tests/piop_data/a.md 120 | pnlp/tests/piop_data/first/fa.md 121 | pnlp/tests/piop_data/first/second/sa.md 122 | """ 123 | 124 | # 获取一个目录下所有文件名和它们的内容 125 | paths = Reader.gen_files(DATA_PATH, pattern) 126 | articles = Reader.gen_articles(paths) 127 | for article in articles: 128 | print(article.fname) 129 | print(article.f.read()) 130 | """ 131 | a.md 132 | line 1 in a. 133 | line 2 in a. 134 | line 3 in a. 135 | ... 136 | """ 137 | 138 | # 同前两个例子 139 | paths = Reader.gen_files(DATA_PATH, pattern) 140 | articles = Reader.gen_articles(paths) 141 | for line in Reader.gen_flines(articles, strip="\n"): 142 | print(line.lid, line.fname, line.text) 143 | ``` 144 | 145 | #### 内置方法 146 | 147 | ```python 148 | import pnlp 149 | 150 | # Read 151 | file_string = pnlp.read_file(file_path) 152 | file_list = pnlp.read_lines(file_path) 153 | file_json = pnlp.read_json(file_path) 154 | file_yaml = pnlp.read_yaml(file_path) 155 | file_csv = pnlp.read_csv(file_path) 156 | file_pickle = pnlp.read_pickle(file_path) 157 | list_dict = pnlp.read_file_to_list_dict(file_path) 158 | 159 | # Write 160 | pnlp.write_json(file_path, data, indent=2) 161 | pnlp.write_file(file_path, data) 162 | pnlp.write_pickle(file_path, data) 163 | pnlp.write_list_dict_to_file(file_path, data) 164 | 165 | # Others 166 | pnlp.check_dir(dirname) # 如果目录不存在会创建 167 | ``` 168 | 169 | ### 文本处理 170 | 171 | #### 清理和提取 172 | 173 | ```python 174 | import re 175 | from pnlp import Text 176 | 177 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233." 178 | pattern = re.compile(r'\d+') 179 | 180 | # pattern 是 re.Pattern 类型或 str 类型 181 | # 默认为空字符串:'', 表示不使用任何 pattern(实际是 re.compile(r'.+')),此时 clean 返回空(全部被清了),extract 返回原始文本。 182 | # pattern 支持以下字符串类型(实际为正则): 183 | # 'chi': 中文字符 184 | # 'pun': 标点 185 | # 'whi': 空白 186 | # 'nwh': 非空白 187 | # 'wnb': 字母(含中文字符)或数字 188 | # 'nwn': 非字母(含中文字符)或数字 189 | # 'eng': 英文字符 190 | # 'num': 数字 191 | # 'pic': 图片 192 | # 'lnk': 链接 193 | # 'emj': 表情 194 | 195 | pt = Text(['chi', pattern]) 196 | 197 | # 提取所有符合 pattern 的文本和它们的位置 198 | res = pt.extract(text) 199 | print(res) 200 | """ 201 | {'text': '这是长度测试233', 'mats': ['这是', '长度测试', '233'], 'locs': [(0, 2), (22, 26), (60, 63)]} 202 | """ 203 | # 支持用「点」获取key属性 204 | print(res.text, res.mats, res.locs) 205 | """ 206 | '这是长度测试' ['这是', '长度测试'] [(0, 2), (22, 26)] 207 | """ 208 | 209 | # 返回指定 pattern 清理后的文本 210 | print(pt.clean(text)) 211 | """ 212 | https://www.yam.gift,《 》*)FSJfdsjf😁![](http://xx.jpg)。233. 
168 |
169 | ### 文本处理
170 |
171 | #### 清理和提取
172 |
173 | ```python
174 | import re
175 | from pnlp import Text
176 |
177 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
178 | pattern = re.compile(r'\d+')
179 |
180 | # pattern 是 re.Pattern 类型或 str 类型
181 | # 默认为空字符串:'', 表示不使用任何 pattern(实际是 re.compile(r'.+')),此时 clean 返回空(全部被清了),extract 返回原始文本。
182 | # pattern 支持以下字符串类型(实际为正则):
183 | # 'chi': 中文字符
184 | # 'pun': 标点
185 | # 'whi': 空白
186 | # 'nwh': 非空白
187 | # 'wnb': 字母(含中文字符)或数字
188 | # 'nwn': 非字母(含中文字符)或数字
189 | # 'eng': 英文字符
190 | # 'num': 数字
191 | # 'pic': 图片
192 | # 'lnk': 链接
193 | # 'emj': 表情
194 |
195 | pt = Text(['chi', pattern])
196 |
197 | # 提取所有符合 pattern 的文本和它们的位置
198 | res = pt.extract(text)
199 | print(res)
200 | """
201 | {'text': '这是长度测试233', 'mats': ['这是', '长度测试', '233'], 'locs': [(0, 2), (22, 26), (60, 63)]}
202 | """
203 | # 支持用「点」获取key属性
204 | print(res.text, res.mats, res.locs)
205 | """
206 | '这是长度测试' ['这是', '长度测试'] [(0, 2), (22, 26)]
207 | """
208 |
209 | # 返回指定 pattern 清理后的文本
210 | print(pt.clean(text))
211 | """
212 | https://www.yam.gift,《 》*)FSJfdsjf😁![](http://xx.jpg)。233.
213 | """
214 |
215 | # 可以指定多个 pattern,注意先后顺序可能会影响结果哦
216 | pt = Text(['pic', 'lnk'])
217 | # 提取到的
218 | res = pt.extract(text)
219 | print(res.mats)
220 | """
221 | ['https://www.yam.gif',
222 | '![](http://xx.jpg)',
223 | 'https://www.yam.gift',
224 | 'http://xx.jpg']
225 | """
226 | # 清理后的
227 | print(pt.clean(text))
228 | """
229 | 这是t长度测试,《 》*)FSJfdsjf😁。233.
230 | """
231 | ```
232 |
233 | #### 内置正则
234 |
235 | ```python
236 | # USE Regex
237 | from pnlp import reg
238 | def clean_text(text: str) -> str:
239 |     text = reg.pwhi.sub("", text)
240 |     text = reg.pemj.sub("", text)
241 |     text = reg.ppic.sub("", text)
242 |     text = reg.plnk.sub("", text)
243 |     return text
244 | ```
245 |
246 | ### 文本切分
247 |
248 | #### 任意部分切分
249 |
250 | ```python
251 | import re
252 | from pnlp import cut_part, psent
253 | text = "你好!欢迎使用。"
254 | sent_list = cut_part(text, psent, with_spliter=True, with_offset=False)
255 | print(sent_list)
256 | """
257 | ['你好!', '欢迎使用。']
258 | """
259 | pcustom_sent = re.compile(r'[。!]')
260 | sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=False)
261 | print(sent_list)
262 | """
263 | ['你好', '欢迎使用']
264 | """
265 | sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=True)
266 | print(sent_list)
267 | """
268 | [('你好', 0, 3), ('欢迎使用', 3, 8)]
269 | """
270 | ```
271 |
272 | #### 分句
273 |
274 | ```python
275 | # Cut Sentence
276 | from pnlp import cut_sentence as pcs
277 | text = "你好!欢迎使用。"
278 | sent_list = pcs(text)
279 | print(sent_list)
280 | """
281 | ['你好!', '欢迎使用。']
282 | """
283 | ```
284 |
285 | #### 分子句并按一个阈值合并子句
286 |
287 | ```python
288 | from pnlp import cut_sub_sentence as pcss
289 | text = "你好!你好。你好?你坏~欢迎使用。"
290 | sent_list = pcss(text)
291 | print(sent_list)
292 | """
293 | ['你好!', '你好。', '你好?', '你坏~', '欢迎使用。']
294 | """
295 | sent_list = pcss(text, 6)
296 | print(sent_list)
297 | """
298 | ['你好!你好。', '你好?你坏~', '欢迎使用。']
299 | """
300 | sent_list = pcss(text, 12)
301 | print(sent_list)
302 | """
303 | ['你好!你好。你好?你坏~', '欢迎使用。']
304 | """
305 | ```
306 |
307 | 这个功能在很多场合非常有用;)懂的都懂:D
308 |
309 | #### 中文字符切分
310 |
311 | ```python
312 | # 中文字符切分
313 | from pnlp import cut_zhchar
314 | text = "你好,hello, 520 i love u. 
= ”我爱你“。" 315 | char_list = cut_zhchar(text) 316 | print(char_list) 317 | """ 318 | ['你', '好', ',', 'hello', ',', ' ', '520', ' ', 'i', ' ', 'love', ' ', 'u', '.', ' ', '=', ' ', '”', '我', '爱', '你', '“', '。'] 319 | """ 320 | char_list = cut_zhchar(text, remove_blank=True) 321 | print(char_list) 322 | """ 323 | ['你', '好', ',', 'hello', ',', '520', 'i', 'love', 'u', '.', '=', '”', '我', '爱', '你', '“', '。'] 324 | """ 325 | ``` 326 | 327 | #### 句子分组 328 | 329 | ```python 330 | from pnlp import combine_bucket 331 | parts = [ 332 | "先生,那夜,我因胸中纳闷,无法入睡,", 333 | "折腾得比那铐了脚镣的叛变水手还更难过;", 334 | "那时,我就冲动的 ——", 335 | "好在有那一时之念,", 336 | "因为有时我们在无意中所做的事能够圆满……" 337 | ] 338 | buckets = combine_bucket(parts.copy(), 10, truncate=True, keep_remain=True) 339 | print(buckets) 340 | """ 341 | ['先生,那夜,我因胸中', 342 | '纳闷,无法入睡,', 343 | '折腾得比那铐了脚镣的', 344 | '叛变水手还更难过;', 345 | '那时,我就冲动的 —', 346 | '—', 347 | '好在有那一时之念,', 348 | '因为有时我们在无意中', 349 | '所做的事能够圆满……'] 350 | """ 351 | ``` 352 | 353 | ### 文本增强 354 | 355 | 采样器支持删除、交换、插入操作,所有的操作不会跨越标点。 356 | 357 | #### Token级别 358 | 359 | - 默认 Tokenizer 360 | - 中文:字符级 Tokenizer(见上) 361 | - 英文:空白符切分 Tokenizer 362 | - Tokenizer 可以任意指定,但它的输出应该是一个 List 的 Token 或一个 List 的 Tuple,每个 Tuple 包含一个 Token 和一个词性。 363 | - 对字符级增强,默认并不会操作所有字或词。可以自定义要操作的词或词性。 364 | - 默认 Token 是「停用词」 365 | - 默认词性(当 Tokenizer 输出带词性时)是「功能词」:副词、介词、连词、助词、其他虚词(标记为 d p c u xc) 366 | 367 | ```python 368 | # 【】内的为改变的 369 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 370 | # 字符粒度 371 | from pnlp import TokenLevelSampler 372 | tls = TokenLevelSampler() 373 | tls.make_samples(text) 374 | """ 375 | {'delete': '人为什么活着?生而为人必须要【有】梦想!还要有尽可能多的精神体验。', 376 | 'swap': '【为】【人】什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。', 377 | 'insert': '人为什么活着?生而为人必须要有梦想!【还】还要有尽可能多的精神体验。', 378 | 'together': '人什么着着活?生而必为为须要有梦想!还要有尽可能多的精神体验。'} 379 | """ 380 | # 支持自定义 tokenizer 381 | tls.make_samples(text, jieba.lcut) 382 | """ 383 | {'delete': '人为什么活着?生而为人【必须】要有梦想!还要有尽可能多的精神体验。', 384 | 'swap': '【为什么】【人】活着?生而为人必须要有梦想!还要有尽可能多的精神体验。', 385 | 'insert': '人为什么活着?生而为人必须要有梦想!【还要】还要有尽可能多的精神体验。', 386 | 'together': '人为什么活着?生而为人人要有梦想!还要有多尽可能的精神体验。'} 387 | """ 388 | # 自定义 389 | tls = TokenLevelSampler( 390 | rate=替换比例, # 默认 5% 391 | types=["delete", "swap", "insert"], # 默认三个 392 | sample_words=["词1", "词2"], # 默认停用词 393 | sample_pos=["词性1", "词性2"], # 默认功能词 394 | ) 395 | ``` 396 | 397 | #### 句子级别 398 | 399 | ```python 400 | from pnlp import SentenceLevelSampler 401 | sls = SentenceLevelSampler() 402 | sls.make_samples(text) 403 | """ 404 | {'delete': '生而为人必须要有梦想!还要有尽可能多的精神体验。', 405 | 'swap': '人为什么活着?还要有尽可能多的精神体验。生而为人必须要有梦想!', 406 | 'insert': '人为什么活着?还要有尽可能多的精神体验。生而为人必须要有梦想!生而为人必须要有梦想!', 407 | 'together': '生而为人必须要有梦想!人为什么活着?人为什么活着?'} 408 | """ 409 | # 自定义 410 | sls = SentenceLevelSampler(types=["delete", "swap", "insert"]) # 默认三个 411 | ``` 412 | 413 | ### 文本归一化 414 | 415 | #### 中文数字 416 | 417 | ```python 418 | from pnlp import num_norm 419 | num_norm.num2zh(1024) == "一千零二十四" 420 | num_norm.num2zh(1024).to_money() == "壹仟零贰拾肆" 421 | num_norm.zh2num("一千零二十四") == 1024 422 | ``` 423 | 424 | ### 格式转换 425 | 426 | #### BIO转实体 427 | 428 | ```python 429 | # 实体 BIO Token 转实体 430 | from pnlp import pick_entity_from_bio_labels 431 | pairs = [('天', 'B-LOC'), ('安', 'I-LOC'), ('门', 'I-LOC'), ('有', 'O'), ('毛', 'B-PER'), ('主', 'I-PER'), ('席', 'I-PER')] 432 | pick_entity_from_bio_labels(pairs) 433 | """ 434 | [('天安门', 'LOC'), ('毛主席', 'PER')] 435 | """ 436 | pick_entity_from_bio_labels(pairs, with_offset=True) 437 | """ 438 | [('天安门', 'LOC', 0, 3), ('毛主席', 'PER', 4, 7)] 439 | """ 440 | ``` 441 | 442 | #### 
任意参数转UUID
443 |
444 | ```python
445 | from pnlp import generate_uuid
446 |
447 | uid1 = generate_uuid("a", 1, 0.02)
448 | uid2 = generate_uuid("a", 1)
449 | """
450 | uid1 == 3fbc8b70d05b5abdb5badca1d26e1dbd
451 | uid2 == f7b0ffc589e453e88d4faf66eb92f669
452 | """
453 | ```
454 |
455 | ### 内置词典
456 |
457 | #### 停用词
458 |
459 | ```python
460 | from pnlp import StopWords, chinese_stopwords, english_stopwords
461 |
462 | csw = StopWords("/path/to/custom/stopwords.txt")
463 | csw.stopwords # a set of the custom stopwords
464 |
465 | csw.zh == chinese_stopwords # Chinese stopwords
466 | csw.en == english_stopwords # English stopwords
467 | ```
468 |
469 |
470 | ### 文本长度
471 |
472 | ```python
473 | from pnlp import Length
474 |
475 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
476 |
477 | pl = Length(text)
478 | # 注意:即使使用了 pattern,长度都是基于原始文本
479 | # 长度基于字符计数(不是整词)
480 | print("Length of all characters: ", pl.len_all)
481 | print("Length of all non-white characters: ", pl.len_nwh)
482 | print("Length of all Chinese characters: ", pl.len_chi)
483 | print("Length of all words and numbers: ", pl.len_wnb)
484 | print("Length of all punctuations: ", pl.len_pun)
485 | print("Length of all English characters: ", pl.len_eng)
486 | print("Length of all numbers: ", pl.len_num)
487 |
488 | """
489 | Length of all characters: 64
490 | Length of all non-white characters: 63
491 | Length of all Chinese characters: 6
492 | Length of all words and numbers: 41
493 | Length of all punctuations: 14
494 | Length of all English characters: 32
495 | Length of all numbers: 3
496 | """
497 | ```
498 |
499 | ### 魔术方法
500 |
501 | #### 字典
502 |
503 | ```python
504 | from pnlp import MagicDict
505 |
506 | # 嵌套字典
507 | pmd = MagicDict()
508 | pmd['a']['b']['c'] = 2
509 | print(pmd)
510 |
511 | """
512 | {'a': {'b': {'c': 2}}}
513 | """
514 |
515 | # 当字典被翻转时,保留所有的重复 value-keys
516 | dx = {1: 'a',
517 |       2: 'a',
518 |       3: 'a',
519 |       4: 'b' }
520 | print(MagicDict.reverse(dx))
521 |
522 | """
523 | {'a': [1, 2, 3], 'b': 4}
524 | """
525 | ```
526 |
527 | #### 获取唯一文件名
528 |
529 | ```python
530 | from pnlp import get_unique_fn
531 |
532 | get_unique_fn("a/b/c.md") == "a_b_c.md"
533 | ```
534 |
535 | ### 并行处理
536 |
537 | 支持四种并行处理方式:
538 |
539 | - 线程池:`thread_pool`
540 | - 进程池:`process_pool`
541 | - 线程 Executor:`thread_executor`,默认使用
542 | - 线程:`thread`
543 |
544 | 注意:惰性处理,返回的是 Generator。
545 |
546 | ```python
547 | import math
548 | def is_prime(x):
549 |     if x < 2:
550 |         return False
551 |     for i in range(2, int(math.sqrt(x)) + 1):
552 |         if x % i == 0:
553 |             return False
554 |     return True
555 |
556 | from pnlp import concurring
557 |
558 | # max_workers 默认为 4
559 | @concurring
560 | def get_primes(lst):
561 |     res = []
562 |     for i in lst:
563 |         if is_prime(i):
564 |             res.append(i)
565 |     return res
566 |
567 | @concurring(type="thread_pool", max_workers=10)
568 | def get_primes(lst):
569 |     pass
570 | ```
571 |
572 | `concurring` 装饰器让你的迭代函数并行。
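下面是一个消费返回结果的最小示意(沿用上面被装饰的 `get_primes`;这里假设 Generator 每次产出一个 worker 返回的列表,故将其摊平):

```python
# 惰性消费:迭代 Generator 并摊平各个 worker 的结果(示意)
primes = []
for chunk in get_primes(list(range(100))):
    primes.extend(chunk)
```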
573 |
574 | ### 后台处理
575 |
576 | ```python
577 | from pnlp import run_in_new_thread
578 |
579 | def func(file, a, b, c):
580 |     background_task()
581 |
582 | run_in_new_thread(func, file, 1, 2, 3)
583 | ```
584 |
585 | ## 测试
586 |
587 | Clone 仓库后执行:
588 |
589 | ```bash
590 | $ python -m pytest
591 | ```
592 |
593 | ## 更新日志
594 |
595 | 见英文版 README。
596 |
597 |
598 |
599 |
--------------------------------------------------------------------------------
/README_EN.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
4 |
5 | - [Features](#features)
6 | - [Install](#install)
7 | - [Usage](#usage)
8 |   - [Iopipe](#iopipe)
9 |     - [IO process](#io-process)
10 |     - [Built-in Method](#built-in-method)
11 |   - [Text](#text)
12 |     - [Clean and Extract](#clean-and-extract)
13 |     - [Regex](#regex)
14 |   - [Cut](#cut)
15 |     - [AnypartCut](#anypartcut)
16 |     - [SentenceCut](#sentencecut)
17 |     - [SubSentenceCut and Combine by threshold](#subsentencecut-and-combine-by-threshold)
18 |     - [ChineseCharCut](#chinesecharcut)
19 |     - [CombineBucket](#combinebucket)
20 |   - [Enhancement](#enhancement)
21 |     - [TokenLevel](#tokenlevel)
22 |     - [SentenceLevel](#sentencelevel)
23 |   - [Normalization](#normalization)
24 |     - [ChineseNumber](#chinesenumber)
25 |   - [Transformation](#transformation)
26 |     - [BIO2Entity](#bio2entity)
27 |     - [Parameters2uuid](#parameters2uuid)
28 |   - [Built-in Dicts](#built-in-dicts)
29 |     - [StopWords](#stopwords)
30 |   - [Length](#length)
31 |   - [Magic](#magic)
32 |   - [Concurring](#concurring)
33 | - [Test](#test)
34 | - [ChangeLog](#changelog)
35 |
36 |
37 |
38 | This is a pre/post-processing tool for NLP.
39 |
40 | ## Features
41 |
42 | - A flexible pipeline for text IO
43 | - A flexible tool for text cleaning and extraction
44 | - Text enhancement
45 | - Sentence cut and Chinese character cut
46 | - Text bucket
47 | - Chinese character normalization
48 | - Several kinds of length calculation
49 | - Stopwords
50 | - Some magic usage in pre-processing
51 | - Tools like concurring and batch generation
52 |
53 | ## Install
54 |
55 | Needs Python 3.7+.
56 |
57 | `pip install pnlp`
58 |
59 | ## Usage
60 |
61 | ### Iopipe
62 |
63 | #### IO process
64 |
65 | ```bash
66 | tree tests/piop_data/
67 | ├── a.md
68 | ├── b.txt
69 | ├── c.data
70 | ├── first
71 | │   ├── fa.md
72 | │   ├── fb.txt
73 | │   ├── fc.data
74 | │   └── second
75 | │       ├── sa.md
76 | │       ├── sb.txt
77 | │       └── sc.data
78 | ├── json.json
79 | ├── outfile.file
80 | ├── outjson.json
81 | └── yml.yml
82 | ```
83 |
84 | ```python
85 | import os
86 | from pnlp import Reader
87 |
88 | DATA_PATH = "./pnlp/tests/piop_data/"
89 | pattern = '*.md' # could also be '*.txt', 'f*.*', etc.; regex is supported
90 | reader = Reader(pattern, use_regex=True)
91 |
92 | # Get lines of all files in one directory with line index and file name
93 | for line in reader(DATA_PATH):
94 |     print(line.lid, line.fname, line.text)
95 | """
96 | 0 a.md line 1 in a.
97 | 1 a.md line 2 in a.
98 | 2 a.md line 3 in a.
99 | 0 fa.md line 1 in fa.
100 | 1 fa.md line 2 in fa
101 | ...
102 | """
103 |
104 | # Get lines of one file with line index and file name
105 | # if a file is read, the `pattern` is not effective
106 | for line in reader(os.path.join(DATA_PATH, "a.md")):
107 |     print(line.lid, line.fname, line.text)
108 | """
109 | 0 a.md line 1 in a.
110 | 1 a.md line 2 in a.
111 | 2 a.md line 3 in a.
112 | """
113 |
114 |
115 |
116 | # Get all filepaths in one directory
117 | for path in Reader.gen_files(DATA_PATH, pattern):
118 |     print(path)
119 | """
120 | pnlp/tests/piop_data/a.md
121 | pnlp/tests/piop_data/first/fa.md
122 | pnlp/tests/piop_data/first/second/sa.md
123 | """
124 |
125 | # Get content (article) of all files in one directory with file name
126 | paths = Reader.gen_files(DATA_PATH, pattern)
127 | articles = Reader.gen_articles(paths)
128 | for article in articles:
129 |     print(article.fname)
130 |     print(article.f.read())
131 | """
132 | a.md
133 | line 1 in a.
134 | line 2 in a.
135 | line 3 in a.
136 |
137 | """
138 |
139 | # Get lines of all files in one directory with line index and file name
140 | # the same as reader(DATA_PATH) above
141 | paths = Reader.gen_files(DATA_PATH, pattern)
142 | articles = Reader.gen_articles(paths)
143 | for line in Reader.gen_flines(articles):
144 |     print(line.lid, line.fname, line.text)
145 | ```
146 |
147 | #### Built-in Method
148 |
149 | ```python
150 | import pnlp
151 |
152 | # Read
153 | file_string = pnlp.read_file(file_path)
154 | file_list = pnlp.read_lines(file_path)
155 | file_json = pnlp.read_json(file_path)
156 | file_yaml = pnlp.read_yaml(file_path)
157 | file_csv = pnlp.read_csv(file_path)
158 | file_pickle = pnlp.read_pickle(file_path)
159 | list_dict = pnlp.read_file_to_list_dict(file_path)
160 |
161 | # Write
162 | pnlp.write_json(file_path, data, indent=2)
163 | pnlp.write_file(file_path, data)
164 | pnlp.write_pickle(file_path, data)
165 | pnlp.write_list_dict_to_file(file_path, data)
166 |
167 | # Others
168 | pnlp.check_dir(dirname) # will make dirname if not exist
169 | ```
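A minimal round-trip sketch of the read/write pair above (the `./tmp.json` path is only an assumption for illustration):

```python
import pnlp

# Write a dict as JSON, then read it back (illustrative only)
data = {"name": "pnlp", "tags": ["nlp", "tool"]}
pnlp.write_json("./tmp.json", data, indent=2)
assert pnlp.read_json("./tmp.json") == data
```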
170 |
171 | ### Text
172 |
173 | #### Clean and Extract
174 |
175 | ```python
176 | import re
177 |
178 | # Use Text
179 | from pnlp import Text
180 |
181 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
182 | pattern = re.compile(r'\d+')
183 |
184 | # pattern is re.Pattern or str type
185 | # Default is '', meaning no pattern is used (actually re.compile(r'.+')); in this case clean returns nothing and extract returns the origin.
186 | # If pattern is a string, a built-in pattern will be used; there are 11 types:
187 | # 'chi': Chinese character
188 | # 'pun': Punctuations
189 | # 'whi': White space
190 | # 'nwh': Non White space
191 | # 'wnb': Word and number
192 | # 'nwn': Non word and number
193 | # 'eng': English character
194 | # 'num': Number
195 | # 'pic': Pictures
196 | # 'lnk': Links
197 | # 'emj': Emojis
198 |
199 | pt = Text(['chi', pattern])
200 | # pt.extract will return matches and their locations
201 | res = pt.extract(text)
202 | print(res)
203 | """
204 | {'text': '这是长度测试233', 'mats': ['这是', '长度测试', '233'], 'locs': [(0, 2), (22, 26), (60, 63)]}
205 | """
206 | # support use dot to get the key field
207 | print(res.text, res.mats, res.locs)
208 | """
209 | '这是长度测试' ['这是', '长度测试'] [(0, 2), (22, 26)]
210 | """
211 | # pt.clean will return cleaned text using the pattern
212 | print(pt.clean(text))
213 | """
214 | https://www.yam.gift,《 》*)FSJfdsjf😁![](http://xx.jpg)。233.
215 | """
216 |
217 | pt = Text(['pic', 'lnk'])
218 | res = pt.extract(text)
219 | print(res.mats)
220 | """
221 | ['https://www.yam.gif',
222 | '![](http://xx.jpg)',
223 | 'https://www.yam.gift',
224 | 'http://xx.jpg']
225 | """
226 | print(pt.clean(text))
227 | """
228 | 这是t长度测试,《 》*)FSJfdsjf😁。233.
229 | """
230 | ```
231 |
232 | #### Regex
233 |
234 | ```python
235 | # USE Regex
236 | from pnlp import reg
237 | def clean_text(text: str) -> str:
238 |     text = reg.pwhi.sub("", text)
239 |     text = reg.pemj.sub("", text)
240 |     text = reg.ppic.sub("", text)
241 |     text = reg.plnk.sub("", text)
242 |     return text
243 | ```
244 |
245 | ### Cut
246 |
247 | #### AnypartCut
248 |
249 | ```python
250 | import re
251 | from pnlp import cut_part, psent
252 | text = "你好!欢迎使用。"
253 | sent_list = cut_part(text, psent, with_spliter=True, with_offset=False)
254 | print(sent_list)
255 | """
256 | ['你好!', '欢迎使用。']
257 | """
258 | pcustom_sent = re.compile(r'[。!]')
259 | sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=False)
260 | print(sent_list)
261 | """
262 | ['你好', '欢迎使用']
263 | """
264 | sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=True)
265 | print(sent_list)
266 | """
267 | [('你好', 0, 3), ('欢迎使用', 3, 8)]
268 | """
269 | ```
270 |
271 | #### SentenceCut
272 |
273 | ```python
274 | # Cut Sentence
275 | from pnlp import cut_sentence as pcs
276 | text = "你好!欢迎使用。"
277 | sent_list = pcs(text)
278 | print(sent_list)
279 | """
280 | ['你好!', '欢迎使用。']
281 | """
282 | ```
283 |
284 | #### SubSentenceCut and Combine by threshold
285 |
286 | ```python
287 | from pnlp import cut_sub_sentence as pcss
288 | text = "你好!你好。你好?你坏~欢迎使用。"
289 | sent_list = pcss(text)
290 | print(sent_list)
291 | """
292 | ['你好!', '你好。', '你好?', '你坏~', '欢迎使用。']
293 | """
294 | sent_list = pcss(text, 6)
295 | print(sent_list)
296 | """
297 | ['你好!你好。', '你好?你坏~', '欢迎使用。']
298 | """
299 | sent_list = pcss(text, 12)
300 | print(sent_list)
301 | """
302 | ['你好!你好。你好?你坏~', '欢迎使用。']
303 | """
304 | ```
305 |
306 | This is very useful in some places, you know ;)
307 |
308 | #### ChineseCharCut
309 |
310 | ```python
311 | # Cut to Chinese chars
312 | from pnlp import cut_zhchar
313 | text = "你好,hello, 520 i love u. = ”我爱你“。"
314 | char_list = cut_zhchar(text)
315 | print(char_list)
316 | """
317 | ['你', '好', ',', 'hello', ',', ' ', '520', ' ', 'i', ' ', 'love', ' ', 'u', '.', ' ', '=', ' ', '”', '我', '爱', '你', '“', '。']
318 | """
319 | char_list = cut_zhchar(text, remove_blank=True)
320 | print(char_list)
321 | """
322 | ['你', '好', ',', 'hello', ',', '520', 'i', 'love', 'u', '.', '=', '”', '我', '爱', '你', '“', '。']
323 | """
324 | ```
325 |
326 | #### CombineBucket
327 |
328 | ```python
329 | from pnlp import combine_bucket
330 | parts = [
331 |     "先生,那夜,我因胸中纳闷,无法入睡,",
332 |     "折腾得比那铐了脚镣的叛变水手还更难过;",
333 |     "那时,我就冲动的 ——",
334 |     "好在有那一时之念,",
335 |     "因为有时我们在无意中所做的事能够圆满……"
336 | ]
337 | buckets = combine_bucket(parts.copy(), 10, truncate=True, keep_remain=True)
338 | print(buckets)
339 | """
340 | ['先生,那夜,我因胸中',
341 | '纳闷,无法入睡,',
342 | '折腾得比那铐了脚镣的',
343 | '叛变水手还更难过;',
344 | '那时,我就冲动的 —',
345 | '—',
346 | '好在有那一时之念,',
347 | '因为有时我们在无意中',
348 | '所做的事能够圆满……']
349 | """
350 | ```
351 |
352 | ### Enhancement
353 |
354 | The samplers support delete, swap, and insert operations; no operation crosses punctuation.
355 |
356 | #### TokenLevel
357 |
358 | - It uses a default tokenizer for Chinese (Chinese Char Tokenizer) and English (Simple Whitespace Tokenizer).
359 | - The tokenizer could be any one you like, but the output should be a list of tokens or a list of tuple pairs, each pair including a token and a part-of-speech.
360 | - It uses `stopwords` as the default sample words and function part-of-speech as the default sample POS. This means we only sample tokens that are in the sample words, or whose POS is in the sample POS (if they have one). You could customize both as you like; a sketch of a POS-aware tokenizer follows the example below.
361 |
362 | ```python
363 | # tokens in 【】 are the ones operated on
364 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。"
365 | # TokenLevel
366 | from pnlp import TokenLevelSampler
367 | tls = TokenLevelSampler()
368 | tls.make_samples(text)
369 | """
370 | {'delete': '人为什么活着?生而为人必须要【有】梦想!还要有尽可能多的精神体验。',
371 | 'swap': '【为】【人】什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。',
372 | 'insert': '人为什么活着?生而为人必须要有梦想!【还】还要有尽可能多的精神体验。',
373 | 'together': '人什么着着活?生而必为为须要有梦想!还要有尽可能多的精神体验。'}
374 | """
375 | # tokenizer is supported
376 | tls.make_samples(text, jieba.lcut)
377 | """
378 | {'delete': '人为什么活着?生而为人【必须】要有梦想!还要有尽可能多的精神体验。',
379 | 'swap': '【为什么】【人】活着?生而为人必须要有梦想!还要有尽可能多的精神体验。',
380 | 'insert': '人为什么活着?生而为人必须要有梦想!【还要】还要有尽可能多的精神体验。',
381 | 'together': '人为什么活着?生而为人人要有梦想!还要有多尽可能的精神体验。'}
382 | """
383 | # customize the sampler
384 | tls = TokenLevelSampler(
385 |     rate=replace_rate, # default 5%
386 |     types=["delete", "swap", "insert"], # the default three
387 |     sample_words=["w1", "w2"], # default is the stopwords
388 |     sample_pos=["pos1", "pos2"], # default is the functional POS (d p c u xc): adverbs, prepositions, conjunctions, auxiliaries, and other function words
389 | )
390 | ```
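As a sketch of the `(token, pos)` contract mentioned above, here is a POS-aware tokenizer built on `jieba.posseg` (this assumes `jieba` is installed; `pos_tokenizer` is our own illustrative name, not part of pnlp):

```python
import jieba.posseg as pseg

def pos_tokenizer(text):
    # Output contract: a list of (token, pos_flag) tuples.
    return [(pair.word, pair.flag) for pair in pseg.lcut(text)]

# With POS pairs, sampling follows `sample_pos` (d p c u xc by default).
tls.make_samples(text, pos_tokenizer)
```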
391 |
392 | #### SentenceLevel
393 |
394 | ```python
395 | from pnlp import SentenceLevelSampler
396 | sls = SentenceLevelSampler()
397 | sls.make_samples(text)
398 | """
399 | {'delete': '生而为人必须要有梦想!还要有尽可能多的精神体验。',
400 | 'swap': '人为什么活着?还要有尽可能多的精神体验。生而为人必须要有梦想!',
401 | 'insert': '人为什么活着?还要有尽可能多的精神体验。生而为人必须要有梦想!生而为人必须要有梦想!',
402 | 'together': '生而为人必须要有梦想!人为什么活着?人为什么活着?'}
403 | """
404 | # customize the sampler
405 | sls = SentenceLevelSampler(types=["delete", "swap", "insert"]) # the default three
406 | ```
407 |
408 | ### Normalization
409 |
410 | #### ChineseNumber
411 |
412 | ```python
413 | from pnlp import num_norm
414 | num_norm.num2zh(1024) == "一千零二十四"
415 | num_norm.num2zh(1024).to_money() == "壹仟零贰拾肆"
416 | num_norm.zh2num("一千零二十四") == 1024
417 | ```
418 |
419 | ### Transformation
420 |
421 | #### BIO2Entity
422 |
423 | ```python
424 | # BIO tokens to entities
425 | from pnlp import pick_entity_from_bio_labels
426 | pairs = [('天', 'B-LOC'), ('安', 'I-LOC'), ('门', 'I-LOC'), ('有', 'O'), ('毛', 'B-PER'), ('主', 'I-PER'), ('席', 'I-PER')]
427 | pick_entity_from_bio_labels(pairs)
428 | """
429 | [('天安门', 'LOC'), ('毛主席', 'PER')]
430 | """
431 | pick_entity_from_bio_labels(pairs, with_offset=True)
432 | """
433 | [('天安门', 'LOC', 0, 3), ('毛主席', 'PER', 4, 7)]
434 | """
435 | ```
436 |
437 | #### Parameters2uuid
438 |
439 | ```python
440 | from pnlp import generate_uuid
441 |
442 | uid1 = generate_uuid("a", 1, 0.02)
443 | uid2 = generate_uuid("a", 1)
444 | """
445 | uid1 == 3fbc8b70d05b5abdb5badca1d26e1dbd
446 | uid2 == f7b0ffc589e453e88d4faf66eb92f669
447 | """
448 | ```
449 |
450 | ### Built-in Dicts
451 |
452 | #### StopWords
453 |
454 | ```python
455 | from pnlp import StopWords, chinese_stopwords, english_stopwords
456 |
457 | csw = StopWords("/path/to/custom/stopwords.txt")
458 | csw.stopwords # a set of the custom stopwords
459 |
460 | csw.zh == chinese_stopwords # Chinese stopwords
461 | csw.en == english_stopwords # English stopwords
462 | ```
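A minimal sketch of a typical use, dropping stopwords from a token list (the token list here is made up for illustration):

```python
from pnlp import chinese_stopwords

tokens = ["我", "喜欢", "这个", "工具"]
# Keep only tokens that are not in the built-in stopword set.
content_tokens = [t for t in tokens if t not in chinese_stopwords]
```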
463 |
464 |
465 | ### Length
466 |
467 | ```python
468 | from pnlp import Length
469 |
470 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
471 |
472 | pl = Length(text)
473 | # Note that even if a pattern is used, the length is always based on the raw text.
474 | # Length is counted by character, not by entire words or numbers.
475 | print("Length of all characters: ", pl.len_all)
476 | print("Length of all non-white characters: ", pl.len_nwh)
477 | print("Length of all Chinese characters: ", pl.len_chi)
478 | print("Length of all words and numbers: ", pl.len_wnb)
479 | print("Length of all punctuations: ", pl.len_pun)
480 | print("Length of all English characters: ", pl.len_eng)
481 | print("Length of all numbers: ", pl.len_num)
482 |
483 | """
484 | Length of all characters: 64
485 | Length of all non-white characters: 63
486 | Length of all Chinese characters: 6
487 | Length of all words and numbers: 41
488 | Length of all punctuations: 14
489 | Length of all English characters: 32
490 | Length of all numbers: 3
491 | """
492 | ```
493 |
494 | ### Magic
495 |
496 | #### MagicDict
497 |
498 | ```python
499 | from pnlp import MagicDict
500 |
501 | # Nest dict
502 | pmd = MagicDict()
503 | pmd['a']['b']['c'] = 2
504 | print(pmd)
505 |
506 | """
507 | {'a': {'b': {'c': 2}}}
508 | """
509 |
510 | # Preserve all repeated value-keys when a Dict is reversed.
511 | dx = {1: 'a',
512 |       2: 'a',
513 |       3: 'a',
514 |       4: 'b' }
515 | print(MagicDict.reverse(dx))
516 |
517 | """
518 | {'a': [1, 2, 3], 'b': 4}
519 | """
520 | ```
521 |
522 | #### GetUniqueFileName
523 |
524 | ```python
525 | from pnlp import get_unique_fn
526 |
527 | get_unique_fn("a/b/c.md") == "a_b_c.md"
528 | ```
529 |
530 | ### Concurring
531 |
532 | Support 4 types of concurring:
533 |
534 | - `thread_pool`
535 | - `process_pool`
536 | - `thread_executor`, the default
537 | - `thread`
538 |
539 | Note that processing is lazy: generators are returned.
540 |
541 | ```python
542 | import math
543 | def is_prime(x):
544 |     if x < 2:
545 |         return False
546 |     for i in range(2, int(math.sqrt(x)) + 1):
547 |         if x % i == 0:
548 |             return False
549 |     return True
550 |
551 | from pnlp import concurring
552 |
553 | # the default value of `max_workers` is 4
554 | @concurring
555 | def get_primes(lst):
556 |     res = []
557 |     for i in lst:
558 |         if is_prime(i):
559 |             res.append(i)
560 |     return res
561 |
562 | @concurring(type="thread", max_workers=10)
563 | def get_primes(lst):
564 |     pass
565 | ```
566 |
567 | The `concurring` wrapper simply makes your original function run concurrently.
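A minimal sketch of consuming the lazy result (reusing the decorated `get_primes` above, and assuming the generator yields each worker's returned list):

```python
# Iterate the generator and flatten the per-worker results.
primes = []
for chunk in get_primes(list(range(100))):
    primes.extend(chunk)
```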
568 |
569 | ### Background
570 |
571 | ```python
572 | from pnlp import run_in_new_thread
573 |
574 | def func(file, a, b, c):
575 |     background_task()
576 |
577 | run_in_new_thread(func, file, 1, 2, 3)
578 | ```
579 |
580 | ## Test
581 |
582 | Clone the repo and run:
583 |
584 | ```bash
585 | $ python -m pytest
586 | ```
587 |
588 | ## ChangeLog
589 |
590 | **v0.4.16**
591 |
592 | Fix: `read_json` defaults to `UTF-8`.
593 |
594 | **v0.4.14-15**
595 |
596 | Fix: number as part of a sub-sentence.
597 |
598 | **v0.4.13**
599 |
600 | Feat: background task and a magic helper to get a unique file name from a file path.
601 |
602 | **v0.4.12**
603 |
604 | Feat: sub-sentence cut and combine with a given threshold.
605 |
606 | **v0.4.10**
607 |
608 | Fix: Chinese stopwords reading, `piop.gen_files` regex.
609 |
610 | **v0.4.9**
611 |
612 | Add: `generate_uuid` from arbitrary parameters.
613 |
614 | **v0.4.8**
615 |
616 | Opt: `read_lines` API can read only `count` lines of a text file.
617 |
618 | **v0.4.7**
619 |
620 | Add `write_list_dict_to_file` and `read_file_to_list_dict`.
621 |
622 | **v0.4.6**
623 |
624 | Fix regex: `-` should be escaped when used as a string literal.
625 |
626 | **v0.4.5**
627 |
628 | Add loc to bio label => entity.
629 |
630 | **v0.4.3**
631 |
632 | Adjust `Reader` init parameters.
633 |
634 | **v0.4.2**
635 |
636 | Add bio label => entity.
637 |
638 | **v0.4.1**
639 |
640 | Remove annotation `re.Pattern`.
641 |
642 | **v0.4.0**
643 |
644 | Use dataclasses the right way.
645 |
646 |
647 | **v0.3.11**
648 |
649 | Adjust `MagicDict` and `check_dir`.
650 |
651 | **v0.3.10**
652 |
653 | Fix piop `strip`.
654 |
655 | **v0.3.9**
656 |
657 | `Reader` supports regex.
658 |
659 | **v0.3.8**
660 |
661 | Fix `concurring` for multiprocessing.
662 |
663 | **v0.3.7**
664 |
665 | Add concurring and batch generators.
666 |
667 | **v0.3.5**
668 |
669 | Add text enhancement.
670 |
671 | **v0.3.3/4**
672 |
673 | Fix URL link and picture `Regex` patterns.
674 |
675 | **v0.3.2**
676 |
677 | Fix `cut_part` for sentences ending with a white space and a full stop.
678 |
679 | **v0.3.1**
680 |
681 | Add `cut_part` to cut text into parts by a given regex pattern; add `combine_bucket` to combine parts into buckets by a given threshold (length).
682 |
683 | **v0.3.0**
684 |
685 | Update `cut_sentence`; add `NumNorm`.
686 |
687 | **v0.28-29**
688 |
689 | Update `cut_zhchar`.
690 |
691 | **v0.27**
692 |
693 | Add `cut_zhchar`.
694 |
695 | **v0.26**
696 |
697 | Add `read_csv`; remove `;` as a sentence delimiter.
698 |
699 | **v0.25**
700 |
701 | Add `stop_words`.
702 |
703 | **v0.24**
704 |
705 | Fix `read_json`.
706 |
707 | **v0.23**
708 |
709 | Fix `Text` default rule.
710 |
711 | **v0.22**
712 |
713 | Make `Text` more convenient to use.
714 |
715 | **v0.21**
716 |
717 | Add `cut_sentence` method.
718 |
719 | **v0.20**
720 |
721 | Optimize several interfaces and make `Text` accept a list of regular expression patterns.
722 | 723 | -------------------------------------------------------------------------------- /pnlp/__init__.py: -------------------------------------------------------------------------------- 1 | from pnlp.piop import read_file, read_lines, read_json, read_yaml, read_csv, read_pickle, read_file_to_list_dict 2 | from pnlp.piop import write_file, write_json, write_pickle, check_dir, write_list_dict_to_file 3 | from pnlp.pcut import cut_sentence, cut_sub_sentence, cut_zhchar, cut_part, combine_bucket 4 | from pnlp.pcut import psent, psubsent 5 | 6 | from pnlp.piop import Reader, Dict 7 | from pnlp.ptxt import Regex, Text, Length 8 | from pnlp.pnorm import NumNorm 9 | from pnlp.penh import TokenLevelSampler, SentenceLevelSampler 10 | from pnlp.ptrans import pick_entity_from_bio_labels, generate_uuid 11 | from pnlp.pmag import MagicDict, get_unique_fn 12 | from pnlp.stopwords import StopWords 13 | from pnlp.stopwords import chinese_stopwords, english_stopwords 14 | 15 | from pnlp.utils import pstr, concurring, divide2int, run_in_new_thread 16 | from pnlp.utils import generate_batches_by_num, generate_batches_by_size 17 | 18 | 19 | num_norm = NumNorm() 20 | reg = Regex() 21 | reader = Reader() 22 | tlsampler = TokenLevelSampler() 23 | slsampler = SentenceLevelSampler() 24 | 25 | 26 | __title__ = "pnlp" 27 | __version__ = "0.4.16" 28 | __author__ = "Yam" 29 | __license__ = "Apache-2.0" 30 | __copyright__ = "Copyright 2019, 2020, 2021, 2022, 2023, 2024 Yam" 31 | __all__ = [ 32 | "Reader", 33 | "Text", "Regex", "Length", 34 | "MagicDict", 35 | "NumNorm", 36 | "StopWords", 37 | "TokenLevelSampler", "SentenceLevelSampler" 38 | ] 39 | -------------------------------------------------------------------------------- /pnlp/pcut.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import re 3 | from pnlp.ptxt import Regex 4 | from pnlp.utils import pstr 5 | 6 | psent = re.compile( 7 | r''' 8 | \n+ 9 | | 10 | [。.!!??…]+[”][。.!!??…~~]? 11 | | 12 | (?<=[ \u3000a-zA-Z"”》))〉〕】>」』\u4e00-\u9fa5])[。.!!??…~~]+ 13 | ''', re.UNICODE | re.VERBOSE) 14 | psubsent = re.compile( 15 | r''' 16 | \n+ 17 | | 18 | [。.!!??…]+[”][。.!!??…~~]? 19 | | 20 | (?<=[ \u3000a-zA-Z"”》))〉〕】>」』\u4e00-\u9fa5])[,,、::;;。.!!??…~~]+ 21 | | 22 | \d+[,,]+ 23 | ''', re.UNICODE | re.VERBOSE) 24 | # referenced from jieba 25 | punzh = pstr(Regex.pun_zh) - "-" # for minus number eg -2 26 | punen = pstr(Regex.pun_en) - "." 
27 | pun = punzh + punen
28 | pzh = re.compile(rf"([\u4E00-\u9FD5{pun}+#&])", re.UNICODE)
29 | pen = re.compile(r"([a-zA-Z]+)", re.UNICODE)
30 | pskip = re.compile(r"(\s)", re.UNICODE)
31 | pspecial = re.compile(r"([\-.])")  # split to single
32 | pnum = re.compile(
33 |     r"""
34 |     ([\-]?\d{1,}[.]?\d{0,}%)
35 |     |
36 |     ([\-]?\d{1,}[./]?\d{0,})
37 |     """, re.UNICODE | re.VERBOSE)
38 |
39 |
40 | def cut_zhchar(text: str, remove_blank: bool = False) -> list:
41 |     lst = []
42 |     blocks = pzh.split(text)
43 |     for block in blocks:
44 |         if not block:
45 |             continue
46 |         if pzh.match(block):
47 |             for char in block:
48 |                 lst.append(char)
49 |         else:
50 |             skips = pskip.split(block)
51 |             for skip in skips:
52 |                 if pen.search(skip):
53 |                     for en_part in pen.split(skip):
54 |                         if en_part:
55 |                             spe = pspecial.search(en_part)
56 |                             if not spe:
57 |                                 lst.append(en_part)
58 |                             else:
59 |                                 for spe_part in pspecial.split(en_part):
60 |                                     if spe_part:
61 |                                         lst.append(spe_part)
62 |                 elif pnum.search(skip):
63 |                     if skip[-1] != ".":
64 |                         lst.append(skip)
65 |                     else:
66 |                         i = 0
67 |                         while skip[-1] == ".":
68 |                             i += 1
69 |                             skip = skip[:-1]
70 |                         lst.append(skip)
71 |                         for _ in range(i):
72 |                             lst.append(".")
73 |                 else:
74 |                     if remove_blank:
75 |                         skip = pskip.sub("", skip)
76 |                     if skip:
77 |                         lst.append(skip)
78 |     return lst
79 |
80 |
81 | def cut_part(text: str,
82 |              split_pattern,
83 |              with_spliter: bool = True,
84 |              with_offset: bool = False) -> list:
85 |     """
86 |     Cut text to parts by the given Regex Pattern.
87 |
88 |     Parameters
89 |     ----------
90 |     text: raw text.
91 |     split_pattern: how to split text.
92 |     with_spliter: whether the parts contain spliters.
93 |     with_offset: whether the parts contain offsets.
94 |
95 |     Returns
96 |     --------
97 |     out: cut parts.
98 |     """
99 |     spliters = split_pattern.findall(text)
100 |     length = len(spliters)
101 |     lst = []
102 |     start = 0
103 |     for i, part in enumerate(split_pattern.split(text)):
104 |         if i < length:
105 |             if with_spliter:
106 |                 part = part + spliters[i]
107 |                 len_spliter = 0
108 |             else:
109 |                 len_spliter = len(spliters[i])
110 |         else:
111 |             len_spliter = 0
112 |         end = start + len(part) + len_spliter
113 |         if part:
114 |             if with_offset:
115 |                 item = (part, start, end)
116 |             else:
117 |                 item = part
118 |             lst.append(item)
119 |         start = end
120 |     return lst
121 |
122 |
123 | def combine_bucket(parts: list,
124 |                    threshold: int,
125 |                    truncate: bool = False,
126 |                    keep_remain: bool = False) -> list:
127 |     """
128 |     Convert parts to buckets with a given length (threshold).
129 |
130 |     Parameters
131 |     ----------
132 |     parts: the given parts.
133 |     threshold: bucket length.
134 |     truncate: whether to truncate those whose length is bigger than threshold.
135 |     keep_remain: when truncate=True, whether to keep the remaining parts.
136 |
137 |     Returns
138 |     -------
139 |     out: list of buckets.
140 |     """
141 |
142 |     def deal_long_part(part: str) -> list:
143 |         result = []
144 |         if truncate:
145 |             if keep_remain:
146 |                 len_subparts = len(part) // threshold + 1
147 |                 for i in range(len_subparts):
148 |                     sub_part = part[i * threshold:(i + 1) * threshold]
149 |                     if sub_part:
150 |                         result.append(sub_part)
151 |             else:
152 |                 result.append(part[:threshold])
153 |         else:
154 |             result.append(part)
155 |         return result
156 |
157 |     buckets = []
158 |     while parts:
159 |         part = parts.pop(0)
160 |         # directly add to buckets when a part is longer than threshold
161 |         if len(part) > threshold:
162 |             sub_parts = deal_long_part(part)
163 |             buckets.append(sub_parts)
164 |         else:
165 |             while parts and len(part) < threshold:
166 |                 another = parts[0]
167 |                 if len(part + another) > threshold:
168 |                     break
169 |                 else:
170 |                     part += parts.pop(0)
171 |             buckets.append([part])
172 |     result = list(itertools.chain(*buckets))
173 |     return result
174 |
175 |
176 | def cut_sentence(text: str) -> list:
177 |     return cut_part(text, psent, True, False)
178 |
179 |
180 | def cut_sub_sentence(text: str, threshold: int = 0) -> list:
181 |     parts = cut_part(text, psubsent, True, False)
182 |     res = combine_bucket(parts, threshold, False, False)
183 |     return res
--------------------------------------------------------------------------------
/pnlp/penh.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from typing import List, Dict, Callable, Tuple, Optional
3 | from itertools import chain
4 |
5 | import numpy as np
6 |
7 | from pnlp.pcut import cut_zhchar, cut_part, psent, psubsent
8 | from pnlp.ptxt import Regex
9 | from pnlp.stopwords import chinese_stopwords, english_stopwords
10 |
11 | reg = Regex()
12 | STOPWORDS = list(english_stopwords | chinese_stopwords)
13 | # 主要对功能词(与实词对应)采样处理
14 | SAMPLE_WORDS = [w for w in STOPWORDS if reg.pwnb.search(w)]
15 | # 副词、介词、连词、助词、其他虚词
16 | SAMPLE_POS = ["d", "p", "c", "u", "xc"]
17 |
18 |
19 | def swap(lst: list, index: int, start: int, end: int) -> list:
20 |     """Randomly swap the element at `index` with its previous or next neighbour."""
21 |     assert start <= index <= end <= len(lst) - 1
22 |     if (index == start or np.random.rand() < 0.5) and index != end:
23 |         lst[index], lst[index + 1] = lst[index + 1], lst[index]
24 |     else:
25 |         lst[index], lst[index - 1] = lst[index - 1], lst[index]
26 |     return lst
27 |
28 |
29 | class Sampler:
30 |
31 |     def check_types(self):
32 |         default_types = set(("delete", "swap", "insert"))
33 |         for typ in self.types:
34 |             if typ not in default_types:
35 |                 raise ValueError(
36 |                     "pnlp: Type {} is not a valid type.".format(typ))
37 |
38 |
39 | class TokenLevelSampler(Sampler):
40 |     """
41 |     Randomly choose an index, then:
42 |     - Insert a copy of the token. Usually a function word.
43 |     - Delete a token.
44 |     - Swap it with the previous or the next token.
45 |
46 |     Parameters
47 |     -------------
48 |     rate: The sampling rate (each type respectively).
49 |     types: Sampling methods. You should take care of the order.
50 |     sample_words: Words to be used in sampling. Usually stopwords.
51 |     sample_pos: Part-of-speech tags to be used in sampling. Usually function-word POS.
52 |
53 |     Note
54 |     ------
55 |     1. We mainly use stopwords (usually function words) as sample words to do the sampling.
56 |     If you want to sample other kinds of words, set `sample_pos` to what you need;
57 |     your input should then include POS flags.
58 |     2. The order of the `types` will influence the output of `dependent_sampling`.
59 | """ 60 | 61 | def __init__( 62 | self, 63 | rate: float = 0.05, 64 | types: List[str] = ["delete", "swap", "insert"], 65 | sample_words: List[str] = SAMPLE_WORDS, 66 | sample_pos: List[str] = SAMPLE_POS, 67 | ): 68 | self.rate = rate 69 | self.types = types 70 | self.sample_words = sample_words 71 | self.sample_pos = sample_pos 72 | self.len_types = len(types) 73 | self.check_types() 74 | assert self.rate <= 0.1 75 | 76 | def filter_sample_idx( 77 | self, token_list: List[str or Tuple[str, str]]) -> List[int]: 78 | if not token_list: 79 | return [] 80 | if type(token_list[0]) == str: 81 | can_deal_idx = [ 82 | i for (i, w) in enumerate(token_list) if w in self.sample_words 83 | ] 84 | else: 85 | can_deal_idx = [ 86 | i for (i, (w, f)) in enumerate(token_list) 87 | if f in self.sample_pos 88 | ] 89 | return can_deal_idx 90 | 91 | def choose_sample_idx(self, len_parts: int, sample_count: int) -> List[int]: 92 | size = min(len_parts, sample_count) 93 | sample_part_idx = np.random.choice(len_parts, size, 94 | replace=False).tolist() 95 | return sample_part_idx 96 | 97 | def delete_sampling(self, token_list: List[str or Tuple[str, str]], 98 | sample_idx: List[int]) -> List[str or Tuple[str, str]]: 99 | """Simple delete sampling. Delete the tokens in the given indexes.""" 100 | result = [] 101 | for i, token in enumerate(token_list): 102 | if i in sample_idx: 103 | continue 104 | result.append(token) 105 | return result 106 | 107 | def insert_sampling(self, token_list: List[str or Tuple[str, str]], 108 | sample_idx: List[int]) -> List[str or Tuple[str, str]]: 109 | """Simple insert sampling. Insert the tokens in the given indexes.""" 110 | result = [] 111 | for i, token in enumerate(token_list): 112 | if i in sample_idx: 113 | result.append(token) 114 | result.append(token) 115 | return result 116 | 117 | def swap_sampling(self, token_list: List[str or Tuple[str, str]], 118 | sample_idx: List[int]) -> List[str or Tuple[str, str]]: 119 | """Simple swap sampling. Swap the tokens in the given indexes. 120 | DONOT swap start and end. 121 | """ 122 | result = copy.deepcopy(token_list) 123 | end = len(token_list) - 1 124 | for idx in sample_idx: 125 | swap(result, idx, 0, end) 126 | return result 127 | 128 | def _sampling( 129 | self, 130 | type: str, 131 | parts: List[List[str] or List[Tuple[str, str]]], 132 | sample_idx: List[int], 133 | ) -> List[List[str] or List[Tuple[str, str]]]: 134 | """ 135 | Sampling by part, each time deal with a part instead of a token. 
136 | """ 137 | cp_parts = copy.deepcopy(parts) 138 | for j, part in enumerate(cp_parts): 139 | if j not in sample_idx: 140 | continue 141 | can_deal_idx = self.filter_sample_idx(part) 142 | if not can_deal_idx: 143 | continue 144 | deal_idx = np.random.choice(can_deal_idx, 1).tolist()[0] 145 | if type == "delete": 146 | part.remove(part[deal_idx]) 147 | elif type == "insert": 148 | part.insert(deal_idx, part[deal_idx]) 149 | else: 150 | swap(part, deal_idx, can_deal_idx[0], can_deal_idx[-1]) 151 | return cp_parts 152 | 153 | def independent_sampling( 154 | self, token_list: List[str or Tuple[str, str]] 155 | ) -> List[List[str] or List[Tuple[str, str]]]: 156 | result = [] 157 | parts = self.convert_tokens_to_parts_by_nonword(token_list) 158 | len_parts = len(parts) 159 | len_tokens = len(token_list) 160 | sample_count = round(len_tokens * self.rate) * self.len_types 161 | sample_count = max(sample_count, 1) 162 | # 一次采样到位,之后只是分别操作,操作原则上互相不依赖 163 | sample_part_idx = self.choose_sample_idx(len_parts, sample_count) 164 | each_count = len(sample_part_idx) // self.len_types 165 | 166 | for i, typ in enumerate(self.types): 167 | sample_idx = sample_part_idx[i * each_count:(i + 1) * each_count] 168 | new_parts = self._sampling(typ, parts, sample_idx) 169 | sample = list(chain(*new_parts)) 170 | result.append(sample) 171 | return result 172 | 173 | def dependent_sampling( 174 | self, 175 | token_list: List[str or 176 | Tuple[str, str]]) -> List[str or Tuple[str, str]]: 177 | parts = self.convert_tokens_to_parts_by_nonword(token_list) 178 | len_parts = len(parts) 179 | len_tokens = len(token_list) 180 | sample_count = round(len_tokens * self.rate) 181 | sample_count = max(sample_count, 1) 182 | for i, typ in enumerate(self.types): 183 | # 每次重新采样,后面的操作可能会与前面的操作重叠 184 | sample_idx = self.choose_sample_idx(len_parts, sample_count) 185 | parts = self._sampling(typ, parts, sample_idx) 186 | return list(chain(*parts)) 187 | 188 | def convert_tokens_to_parts_by_nonword( 189 | self, token_list: List[str or Tuple[str, str]] 190 | ) -> List[List[str] or List[Tuple[str, str]]]: 191 | parts = [] 192 | tmp = [] 193 | for token in token_list: 194 | tmp.append(token) 195 | word = self.__get_word_from_token(token) 196 | if reg.pnwn.search(word): 197 | parts.append(tmp) 198 | tmp = [] 199 | return parts 200 | 201 | def __get_word_from_token(self, token: str or Tuple[str, str]) -> str: 202 | if type(token) == str: 203 | return token 204 | else: 205 | tup = tuple(token) 206 | return tup[0] 207 | 208 | def __join_tokens(self, token_list: List[str or Tuple[str, str]]) -> str: 209 | result = [] 210 | for token in token_list: 211 | word = self.__get_word_from_token(token) 212 | result.append(word) 213 | return "".join(result) 214 | 215 | def make_samples( 216 | self, 217 | text: str, 218 | tokenizer: Optional[Callable[[str], List[str or 219 | Tuple[str, str]]]] = None, 220 | ) -> Dict[str, str]: 221 | """ 222 | Make negative samples. 223 | 224 | Parameters 225 | ----------- 226 | text: The given text. Usually a sentence. 227 | tokenizer: Input a text, output a List of tokens. A token is a word or a (word, flag) tuple. 228 | 229 | Returns 230 | -------- 231 | output: A dict of different kinds of negative samples. 
232 | """ 233 | if self.len_types == 0: 234 | return {} 235 | if not tokenizer: 236 | if reg.pchi.search(text): 237 | tokenizer = cut_zhchar 238 | else: 239 | 240 | def tokenizer(x): 241 | return x.split() 242 | 243 | tokens = tokenizer(text) 244 | if len(tokens) == 0: 245 | return {} 246 | result = {} 247 | indep_samples = self.independent_sampling(tokens) 248 | dep_sample = self.dependent_sampling(tokens) 249 | if indep_samples: 250 | for i, typ in enumerate(self.types): 251 | new_tokens = indep_samples[i] 252 | result[typ] = self.__join_tokens(new_tokens) 253 | result["together"] = self.__join_tokens(dep_sample) 254 | return result 255 | 256 | 257 | class SentenceLevelSampler(Sampler): 258 | """ 259 | Random choose an index. 260 | - Insert a copy. 261 | - Delete. 262 | - Swap with the prev or the next one. 263 | 264 | We only deal with ONE sentence once. 265 | So you'd better use a paragraph as input. 266 | 267 | Parameters 268 | ----------- 269 | types: Sampling methods. You should take care of the order. 270 | """ 271 | 272 | def __init__(self, types: List[str] = ["delete", "swap", "insert"]): 273 | self.types = types 274 | self.check_types() 275 | 276 | def independent_sampling(self, text_list: List[str]) -> List[List[str]]: 277 | result = [] 278 | text_list = copy.deepcopy(text_list) 279 | length = len(text_list) 280 | for i, typ in enumerate(self.types): 281 | idx = np.random.choice(length, 1).tolist()[0] 282 | if typ == "insert": 283 | new = text_list[:idx] + [text_list[idx]] + text_list[idx:] 284 | elif typ == "delete": 285 | new = [s for (i, s) in enumerate(text_list) if i != idx] 286 | else: 287 | new = swap(text_list, idx, 0, length - 1) 288 | result.append(new) 289 | return result 290 | 291 | def dependent_sampling(self, text_list: List[str]) -> List[str]: 292 | text_list = copy.deepcopy(text_list) 293 | for i, typ in enumerate(self.types): 294 | # 每次重新更新长度 295 | length = len(text_list) 296 | if length == 0: 297 | continue 298 | idx = np.random.choice(length, 1).tolist()[0] 299 | if typ == "insert": 300 | text_list = text_list[:idx] + \ 301 | [text_list[idx]] + text_list[idx:] 302 | elif typ == "delete": 303 | text_list = [s for (i, s) in enumerate(text_list) if i != idx] 304 | else: 305 | text_list = swap(text_list, idx, 0, length - 1) 306 | return text_list 307 | 308 | def make_samples(self, text: str, level: str = "sent") -> List[str]: 309 | """ 310 | Parameters 311 | ------------- 312 | text: The given text. Always a paragraph. 313 | level: The sampling level. Could be one of {"sent", "subsent"}. 314 | 315 | Returns 316 | -------- 317 | output: A dict of different kinds of negative samples. 
318 | """ 319 | if level == "sent": 320 | text_list = cut_part(text, psent) 321 | else: 322 | text_list = cut_part(text, psubsent) 323 | if len(text_list) == 0: 324 | return {} 325 | result = {} 326 | indep_samples = self.independent_sampling(text_list) 327 | dep_sample = self.dependent_sampling(text_list) 328 | if indep_samples: 329 | for i, typ in enumerate(self.types): 330 | new = indep_samples[i] 331 | result[typ] = "".join(new) 332 | result["together"] = "".join(dep_sample) 333 | return result 334 | -------------------------------------------------------------------------------- /pnlp/piop.py: -------------------------------------------------------------------------------- 1 | # from __future__ import annotations 2 | 3 | from typing import List, Dict, Union 4 | from addict import Dict as AdDict 5 | import json 6 | import pickle 7 | import os 8 | import re 9 | import csv 10 | import pathlib 11 | import yaml 12 | 13 | 14 | class Reader: 15 | """ 16 | Parameters 17 | ----------- 18 | pattern: path pattern, support Regex. default "*.*" 19 | use_regex: whether to use Regex to compile the string pattern. default False 20 | """ 21 | 22 | def __init__( 23 | self, 24 | pattern: str = "*.*", 25 | use_regex: bool = False, 26 | strip: str = "\n"): 27 | self.pattern = pattern 28 | self.use_regex = use_regex 29 | self.strip = strip 30 | 31 | def __repr__(self) -> str: 32 | return "Reader(pattern=%r)" % (self.pattern) 33 | 34 | @staticmethod 35 | def gen_files(dirname: str, pattern: str = "*.*", use_regex: bool = False): 36 | """ 37 | Find all filenames in a directory tree that match the filepattern. 38 | If filepath is a file, yield the filepath directly. 39 | """ 40 | if os.path.isfile(dirname): 41 | fpath = pathlib.Path(dirname) 42 | yield fpath 43 | if use_regex: 44 | try: 45 | pat = re.compile(pattern) 46 | except Exception: 47 | raise ValueError("pnlp: invalid pattern: {}".format(pattern)) 48 | 49 | for fpath in pathlib.Path(dirname).rglob("*"): 50 | if pat.search(fpath.name): 51 | yield fpath 52 | else: 53 | for fpath in pathlib.Path(dirname).rglob(pattern): 54 | yield fpath 55 | 56 | @staticmethod 57 | def gen_articles(fpaths: list): 58 | for fpath in fpaths: 59 | with open(fpath, encoding="utf8") as f: 60 | article = AdDict() 61 | article.fname = fpath.name 62 | article.f = f 63 | yield article 64 | 65 | @staticmethod 66 | def gen_flines(articles: list, strip: str = "\n"): 67 | """ 68 | Process each file to lines when io.TextIOWrapper is given. 69 | """ 70 | for article in articles: 71 | lid = 0 72 | for line_content in article.f: 73 | line_content = line_content.strip(strip) 74 | if len(line_content) == 0: 75 | continue 76 | line = AdDict() 77 | line.lid = lid 78 | line.fname = article.fname 79 | line.text = line_content 80 | lid += 1 81 | yield line 82 | 83 | @staticmethod 84 | def gen_plines(fpath: str, strip: str = "\n"): 85 | """ 86 | Process each file to lines when fpath is given. 87 | """ 88 | with open(fpath, encoding="utf8") as f: 89 | for line in f: 90 | line = line.strip(strip) 91 | if len(line) == 0: 92 | continue 93 | yield line 94 | 95 | def __call__(self, dirname: str): 96 | fpaths = Reader.gen_files(dirname, self.pattern, self.use_regex) 97 | articles = Reader.gen_articles(fpaths) 98 | flines = Reader.gen_flines(articles, self.strip) 99 | for line in flines: 100 | yield line 101 | 102 | 103 | def read_file(fpath: str, encoding="utf-8", **kwargs) -> str: 104 | """ 105 | Read file from file path. 106 | 107 | Parameters 108 | ----------- 109 | fpath: str 110 | File path. 
111 | kwargs: optional 112 | Other `open` support params. 113 | 114 | Returns 115 | -------- 116 | data string of the file. 117 | """ 118 | with open(fpath, encoding="utf-8", **kwargs) as f: 119 | data = f.read() 120 | return data 121 | 122 | 123 | def read_lines( 124 | fpath: str, 125 | strip: str = "\n", 126 | count: int = -1, 127 | **kwargs 128 | ) -> List[str]: 129 | """ 130 | Read file with `open` from file path. 131 | 132 | Parameters 133 | ---------- 134 | fpath: str 135 | File path. 136 | strip: str 137 | Strip method, could be strip string or None, default is "\n". 138 | count: int 139 | How many lines to read, default is -1 (all). 140 | kwargs: optional 141 | Other `open` support params. 142 | 143 | Returns 144 | ------- 145 | Lines of the file. 146 | 147 | Notes 148 | ----- 149 | Blank line is ignored as default. 150 | """ 151 | res = [] 152 | i = 0 153 | with open(fpath, **kwargs) as f: 154 | for line in f: 155 | if count >= 0 and i >= count: 156 | break 157 | line = line.strip(strip) 158 | if not line: 159 | continue 160 | res.append(line) 161 | i += 1 162 | return res 163 | 164 | 165 | def read_csv(fpath: str, delimiter: str = ",") -> List: 166 | data = [] 167 | with open(fpath, "r") as f: 168 | fcsv = csv.reader(f, delimiter=delimiter) 169 | for row in fcsv: 170 | data.append(row) 171 | return data 172 | 173 | 174 | def read_json(fpath: str, **kwargs) -> Union[List, Dict]: 175 | with open(fpath, "r", encoding="utf-8") as fin: 176 | data = json.load(fin, **kwargs) 177 | return data 178 | 179 | 180 | def read_yaml(fpath: str) -> Dict: 181 | with open(fpath, "r") as fin: 182 | data = yaml.load(fin, Loader=yaml.SafeLoader) 183 | return data 184 | 185 | 186 | def read_pickle(fpath: str, **kwargs) -> Union[List, Dict]: 187 | with open(fpath, "rb") as f: 188 | data = pickle.load(f, **kwargs) 189 | return data 190 | 191 | 192 | def read_file_to_list_dict(inp_file: str) -> List[Dict]: 193 | res = [] 194 | for line in read_lines(inp_file): 195 | item = json.loads(line.strip()) 196 | res.append(item) 197 | return res 198 | 199 | 200 | def write_list_dict_to_file(out_file: str, data: List[Dict]) -> None: 201 | fo = open(out_file, "w") 202 | for item in data: 203 | line = json.dumps(item, ensure_ascii=False) 204 | fo.write(line) 205 | fo.write("\n") 206 | 207 | 208 | def write_json(fpath: str, data, **kwargs) -> None: 209 | fout = open(fpath, "w") 210 | kwargs["ensure_ascii"] = False 211 | json.dump(data, fout, **kwargs) 212 | fout.close() 213 | 214 | 215 | 216 | def write_file(fpath: str, data, **kwargs) -> None: 217 | with open(fpath, "w", **kwargs) as fout: 218 | for line in data: 219 | fout.write(line + "\n") 220 | 221 | 222 | def write_pickle(fpath: str, data, **kwargs) -> None: 223 | with open(fpath, "wb") as f: 224 | pickle.dump(data, f, **kwargs) 225 | 226 | 227 | def check_dir(*args) -> None: 228 | for dirname in args: 229 | if os.path.exists(dirname): 230 | pass 231 | else: 232 | os.makedirs(dirname) 233 | -------------------------------------------------------------------------------- /pnlp/pmag.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from addict import Dict 3 | from pathlib import Path 4 | 5 | 6 | class MagicDict(Dict): 7 | 8 | def __getitem__(self, item): 9 | try: 10 | return dict.__getitem__(self, item) 11 | except KeyError: 12 | # create a self instance 13 | value = self[item] = type(self)() 14 | return value 15 | 16 | @staticmethod 17 | def reverse(dic): 18 | """ 19 | Preserve all 
repeated value-keys when a Dict is reversed. 20 | 21 | Parameters 22 | ---------- 23 | dic: dict 24 | A dict where several keys have the same values. 25 | 26 | Returns 27 | ------- 28 | Reversed dict of the given dict, but preserve all key-values. 29 | 30 | Example 31 | ------- 32 | dx = { 1: 'a', 33 | 2: 'a', 34 | 3: 'a', 35 | 4: 'b' } 36 | reversedx = { 'a': [1, 2, 3], 37 | 'b': 4 } 38 | """ 39 | d1 = dict(zip(dic.values(), [[] for i in range(len(dic))])) 40 | d2 = dict([ 41 | (y, d1[y].append(x)) 42 | if y in 43 | [w for (w, f) in Counter(dic.values()).items() if f > 1] 44 | else (y, x) 45 | for (x, y) in dic.items()]) 46 | reversdict = dict([(x, d1[x]) if len(d1[x]) != 0 47 | else (x, d2[x]) 48 | for x in d1.keys()]) 49 | return reversdict 50 | 51 | 52 | def get_unique_fn(file_path: str, level=0): 53 | fp = Path(file_path) 54 | fn = fp.name 55 | 56 | file_path = str(file_path).strip("/") 57 | tmp = file_path.split("/")[:-1] 58 | length = len(tmp) 59 | if level > length: 60 | level = length 61 | 62 | if length == 0: 63 | return fn 64 | 65 | path = "_".join(tmp[-level:]) 66 | return "_".join([path, fn]) -------------------------------------------------------------------------------- /pnlp/pnorm.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | 4 | T = TypeVar('T', str, float, int) 5 | 6 | ZH_NUM = { 7 | '〇': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, 8 | '六': 6, '七': 7, '八': 8, '九': 9, '零': 0, 9 | '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, 10 | '陆': 6, '柒': 7, '捌': 8, '玖': 9, '貮': 2, '两': 2, 11 | } 12 | 13 | ZH_UNIT = { 14 | '十': 10, 15 | '拾': 10, 16 | '百': 100, 17 | '佰': 100, 18 | '千': 1000, 19 | '仟': 1000, 20 | '万': 10000, 21 | '萬': 10000, 22 | '亿': 100000000, 23 | '億': 100000000, 24 | '兆': 10000000000000, 25 | } 26 | 27 | 28 | ARB_NUM = { 29 | 0: "零", 30 | 1: "一", 31 | 2: "二", 32 | 3: "三", 33 | 4: "四", 34 | 5: "五", 35 | 6: "六", 36 | 7: "七", 37 | 8: "八", 38 | 9: "九", 39 | 10: "十", 40 | 100: "百", 41 | 1000: "千", 42 | 10000: "万", 43 | 100000000: "亿", 44 | 10000000000000: "兆" 45 | } 46 | 47 | ZH2MONEY = { 48 | "一": "壹", 49 | "二": "贰", 50 | "三": "叁", 51 | "四": "肆", 52 | "五": "伍", 53 | "六": "陆", 54 | "七": "柒", 55 | "八": "捌", 56 | "九": "玖", 57 | "十": "拾", 58 | "百": "佰", 59 | "千": "仟", 60 | "万": "萬", 61 | "亿": "億" 62 | } 63 | 64 | 65 | class pnumstr(str): 66 | 67 | def to_money(self): 68 | for c in self: 69 | mc = ZH2MONEY.get(c) 70 | if mc: 71 | self = self.replace(c, mc) 72 | return self 73 | 74 | 75 | class NumNorm: 76 | """ 77 | Chinese_to_Arabic 78 | modifed from https://github.com/bamtercelboo/corpus_process_script/blob/master/cn_to_arabic/cn_to_arabic.py 79 | """ 80 | @staticmethod 81 | def num_len(num: int) -> int: 82 | if num == 0: 83 | return 1 84 | if num < 0: 85 | num = -num 86 | i = 0 87 | while num != 0: 88 | num //= 10 89 | i += 1 90 | return i 91 | 92 | def num2zh(self, num: int) -> str: 93 | def get_base(num): 94 | zh = ARB_NUM.get(num) 95 | if num < 10: 96 | return zh 97 | else: 98 | return "一" + zh 99 | 100 | def get_less_than_10w(num): 101 | res = "" 102 | while num != 0: 103 | if num < 10: 104 | res += ARB_NUM.get(num) 105 | break 106 | length = NumNorm.num_len(num) 107 | divider = 10 ** (length - 1) 108 | high = num // divider 109 | res += ARB_NUM.get(high) 110 | res += ARB_NUM.get(divider) 111 | num = num % divider 112 | new_len = NumNorm.num_len(num) 113 | if length - new_len > 1 and num != 0: 114 | res += "零" 115 | return res 116 | 117 | def get_interval(num: int, lower: int, unit: str): 118 | res 
= "" 119 | length = NumNorm.num_len(num) 120 | divider = lower / 10 121 | high = num // divider 122 | res = get_less_than_10w(high) 123 | high_len = NumNorm.num_len(high) 124 | res += unit 125 | num -= high * divider 126 | new_len = NumNorm.num_len(num) 127 | if length - high_len - new_len > 0 and num != 0: 128 | res += "零" 129 | return res, num 130 | 131 | def get_10w_to_1y(num): 132 | res, num = get_interval(num, 10**5, "万") 133 | if 0 < num < 100000: 134 | res += get_less_than_10w(num) 135 | return res 136 | 137 | def get_1y_to_1z(num): 138 | res, num = get_interval(num, 10**9, "亿") 139 | if 0 < num < 100000000: 140 | res += get_10w_to_1y(num) 141 | return res 142 | 143 | if num in ARB_NUM: 144 | result = get_base(num) 145 | return pnumstr(result) 146 | # 十万 147 | if num < 10**5: 148 | result = get_less_than_10w(num) 149 | # 一亿 150 | elif num < 10**8: 151 | result = get_10w_to_1y(num) 152 | # 一兆 153 | elif num < 10**13: 154 | result = get_1y_to_1z(num) 155 | else: 156 | result = "超大" 157 | return pnumstr(result) 158 | 159 | def zh2num(self, zh: str) -> T: 160 | unit = 0 161 | digit_list = [] 162 | for zhdigit in reversed(zh): 163 | if zhdigit in ZH_UNIT: 164 | unit = ZH_UNIT.get(zhdigit) 165 | if unit == 10000 or unit == 100000000: 166 | digit_list.append(unit) 167 | unit = 1 168 | else: 169 | digit = ZH_NUM.get(zhdigit) 170 | if unit: 171 | digit *= unit 172 | unit = 0 173 | digit_list.append(digit) 174 | if unit == 10: 175 | digit_list.append(10) 176 | val, tmp = 0, 0 177 | for x in reversed(digit_list): 178 | if x == 10000 or x == 100000000: 179 | val += tmp * x 180 | tmp = 0 181 | else: 182 | tmp += x 183 | val += tmp 184 | if val == 0 and zh != "零": 185 | return zh 186 | else: 187 | return val 188 | -------------------------------------------------------------------------------- /pnlp/ptrans.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import uuid 3 | 4 | 5 | def generate_uuid(*args) -> str: 6 | s = uuid.uuid5( 7 | uuid.NAMESPACE_URL, 8 | " ".join(map(str, args)) 9 | ) 10 | return s.hex 11 | 12 | 13 | def pick_entity_from_bio_labels( 14 | pairs: List[Tuple[str, str]], 15 | with_offset: bool = False 16 | ) -> List[Tuple[str, str]]: 17 | """ 18 | Parameters 19 | ---------- 20 | pairs: List of tuple pairs, each pair contains a token and a bio tag 21 | with_offset: whether to return locations for the entities 22 | 23 | Returns 24 | ------- 25 | List of entity pairs, each pair contains an entity and entity type (migtht also a start and end index) 26 | """ 27 | 28 | def collect(span: List[Tuple[str, str]]): 29 | res = [] 30 | for c, t in span: 31 | if t.endswith("O"): 32 | break 33 | res.append(c) 34 | return "".join(res), span[0][1].split("-")[-1] 35 | 36 | without_lasto = False 37 | if pairs and pairs[-1][1] != "O": 38 | without_lasto = True 39 | pairs.append(("#", "O")) 40 | bidx = [] 41 | for i, (c, t) in enumerate(pairs): 42 | if t.startswith("B-"): 43 | bidx.append(i) 44 | bidx.append(len(pairs) - 1) 45 | res = [] 46 | for i in range(len(bidx) - 1): 47 | start, end = bidx[i], bidx[i + 1] 48 | span = pairs[start: end] 49 | word, tag = collect(span) 50 | if with_offset: 51 | tup = (word, tag, start, start + len(word)) 52 | else: 53 | tup = (word, tag) 54 | res.append(tup) 55 | if without_lasto: 56 | pairs.pop() 57 | return res 58 | -------------------------------------------------------------------------------- /pnlp/ptxt.py: -------------------------------------------------------------------------------- 
1 | from addict import Dict 2 | import re 3 | 4 | 5 | class Regex: 6 | 7 | """ 8 | All kinds of Regular patterns. 9 | """ 10 | 11 | patnames = ["chi", "pun", 12 | "whi", "nwh", 13 | "wnb", "nwn", 14 | "eng", "num", 15 | "pic", "lnk", "emj"] 16 | 17 | pun_zh = r",。;、?!:“”‘’()「」『』〔〕【】《》〈〉…——\-—~~·" 18 | pun_en = r",.;?!\(\)\[\]\{\}<>_" 19 | 20 | @property 21 | def pchi(self): 22 | """ 23 | Chinese char pattern. 24 | """ 25 | _pchi = re.compile(r'[\u4E00-\u9FD5]+') # from jieba 26 | return _pchi 27 | 28 | @property 29 | def ppun(self): 30 | """ 31 | Punctuation pattern. 32 | """ 33 | _ppun = re.compile(rf'[{self.pun_en + self.pun_zh}]+') 34 | return _ppun 35 | 36 | @property 37 | def pwhi(self): 38 | """ 39 | White space pattern. 40 | """ 41 | _pwhi = re.compile(r'\s+') 42 | return _pwhi 43 | 44 | @property 45 | def pnwh(self): 46 | """ 47 | Non-white space pattern. 48 | """ 49 | _pnwh = re.compile(r'\S+') 50 | return _pnwh 51 | 52 | @property 53 | def pwnb(self): 54 | """ 55 | Word and num pattern. 56 | """ 57 | _pwnb = re.compile(r'\w+') 58 | return _pwnb 59 | 60 | @property 61 | def pnwn(self): 62 | """ 63 | Non-alphanumeric char pattern. 64 | """ 65 | _pnwn = re.compile(r'\W+') 66 | return _pnwn 67 | 68 | @property 69 | def peng(self): 70 | """ 71 | English char pattern. 72 | """ 73 | _peng = re.compile(r'[a-zA-Z]+') 74 | return _peng 75 | 76 | @property 77 | def pnum(self): 78 | """ 79 | Number pattern. 80 | 81 | Example 82 | ------- 83 | 2, +2, -2, 2.1, -2.2, 1/5, 2:3, -2/5, 2%, 2.5% 84 | """ 85 | _pnum = re.compile(r''' 86 | [+\-.]?\d+[.:/]?[\d%]+ 87 | | 88 | [+\-.]?\d+(?!\.\w+) 89 | ''', re.UNICODE | re.VERBOSE) 90 | return _pnum 91 | 92 | @property 93 | def ppic(self): 94 | """ 95 | Picture pattern. 96 | """ 97 | _ppic = re.compile(r''' 98 | !\[.*?\]\(.*?\.?(jpeg|png|jpg|gif)?\) 99 | | 100 | https?:\/\/(www\.)?[\-a-zA-Z0-9@:%._\+~#=]{0,256}\.(jpeg|png|jpg|gif) 101 | ''', re.UNICODE | re.VERBOSE) 102 | return _ppic 103 | 104 | @property 105 | def plnk(self): 106 | """ 107 | Link pattern. 108 | """ 109 | _plink = re.compile(r''' 110 | \[.+?\]\(https?:\/\/(www\.)?[\-a-zA-Z0-9@:%._\+~#=]{0,256}\.[a-z]{2,6}\b([\-a-zA-Z0-9@:%_\+.~#?&//=]*)\) 111 | | 112 | https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{0,256}\.[a-z]{2,6}\b([\-a-zA-Z0-9@:%_\+.~#?&//=]*) 113 | | 114 | (https?:\/\/)?www\.[-a-zA-Z0-9@:%._\+~#=]{0,256}\.[a-z]{2,6}\b([\-a-zA-Z0-9@:%_\+.~#?&//=]*) 115 | ''', re.UNICODE | re.VERBOSE) 116 | return _plink 117 | 118 | @property 119 | def pemj(self): 120 | """ 121 | Emoj pattern. 122 | """ 123 | _pemj = re.compile( 124 | r'['u'\U0001F300-\U0001F64F' 125 | u'\U0001F680-\U0001F6FF' 126 | u'\u2600-\u2B55]+') 127 | return _pemj 128 | 129 | @property 130 | def patdict(self): 131 | """ 132 | All patterns. 133 | """ 134 | patterns = [self.pchi, self.ppun, 135 | self.pwhi, self.pnwh, 136 | self.pwnb, self.pnwn, 137 | self.peng, self.pnum, 138 | self.ppic, self.plnk, self.pemj] 139 | _patdict = dict(zip(self.patnames, patterns)) 140 | return _patdict 141 | 142 | 143 | class Text(Regex): 144 | 145 | """ 146 | Text clean, extract and length. 147 | 148 | Parameters 149 | ---------- 150 | pat: list of patterns 151 | Support custom re.Pattern. 152 | Default is re.compile(r'.+'). 
153 | Other str patterns are built-in, including: 154 | 'chi': Chinese character 155 | 'pun': Punctuations 156 | 'whi': White space 157 | 'nwh': Non White space 158 | 'wnb': Word and number 159 | 'nwn': Non word and number 160 | 'eng': English character 161 | 'num': Number 162 | 'pic': Pictures 163 | 'lnk': Links 164 | 'emj': Emojis 165 | 166 | Returns 167 | ------- 168 | A text object. 169 | 170 | Notes 171 | ------ 172 | The pattern order is important: patterns listed first are executed earlier. 173 | """ 174 | 175 | def __init__(self, pattern_list: list = []): 176 | self.pats = [] 177 | for pat in pattern_list: 178 | if isinstance(pat, str): 179 | built_in_pat = self.patdict.get(pat) 180 | if built_in_pat: 181 | self.pats.append(built_in_pat) 182 | else: 183 | raise ValueError( 184 | "pnlp: {} \ 185 | is not a valid built-in pattern.".format(pat)) 186 | elif isinstance(pat, re.Pattern): 187 | self.pats.append(pat) 188 | else: 189 | raise ValueError( 190 | "pnlp: {} is not a valid RE pattern.".format(pat)) 191 | 192 | def __repr__(self) -> str: 193 | return "Text(pattern=%r)" % str(self.pats) 194 | 195 | def clean(self, text: str): 196 | """ 197 | Clean text with the given patterns. 198 | 199 | Returns 200 | ------- 201 | Cleaned text. 202 | """ 203 | for pat in self.pats: 204 | text = pat.sub("", text) 205 | return text 206 | 207 | def extract(self, text: str): 208 | """ 209 | Extract pattern-matching items. 210 | 211 | Returns 212 | ------- 213 | Extracted items and their locations. 214 | """ 215 | mats, locs = [], [] 216 | for pat in self.pats: 217 | for mat in pat.finditer(text): 218 | mats.append(mat.group()) 219 | locs.append(mat.span()) 220 | ext = Dict() 221 | ext.text = "".join(mats) 222 | ext.mats = mats 223 | ext.locs = locs 224 | return ext 225 | 226 | 227 | class Length(Regex): 228 | 229 | def __init__(self, text: str): 230 | self.text = text 231 | 232 | def _len(self, pat): 233 | lst = pat.findall(self.text) 234 | return len("".join(lst)) 235 | 236 | @property 237 | def len_all(self): 238 | """ 239 | Length of all characters. 240 | """ 241 | return len(self.text) 242 | 243 | @property 244 | def len_nwh(self): 245 | """ 246 | Length of non-white characters. 247 | """ 248 | return self._len(self.pnwh) 249 | 250 | @property 251 | def len_chi(self): 252 | """ 253 | Length of pure Chinese characters. 254 | """ 255 | return self._len(self.pchi) 256 | 257 | @property 258 | def len_wnb(self): 259 | """ 260 | Length of characters and numbers. 261 | """ 262 | return self._len(self.pwnb) 263 | 264 | @property 265 | def len_pun(self): 266 | """ 267 | Length of all punctuations. 268 | """ 269 | return self._len(self.ppun) 270 | 271 | @property 272 | def len_eng(self): 273 | """ 274 | Length of English characters. 275 | """ 276 | return self._len(self.peng) 277 | 278 | @property 279 | def len_num(self): 280 | """ 281 | Length of all numbers.
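Example (sketch): Length("大约3.5%的人").len_num == 4, the character count of the matched number "3.5%".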
282 | """ 283 | return self._len(self.pnum) 284 | -------------------------------------------------------------------------------- /pnlp/stopwords/ReadMe.md: -------------------------------------------------------------------------------- 1 | ## English Stopwords 2 | 3 | From https://gist.github.com/sebleier/554280 4 | 5 | ## Chinese Stopwords 6 | 7 | 8 | - 综合多个词表(结巴、哈工大、川大),不包含百度停用词,百度停用词中的词是搜索相关的,有些实词和有意义的词也会在里面,所以准确的角度考虑百度停用词最好不使用。 9 | - 人工进行了部分修改(见下表)。 10 | 11 | ```json 12 | # 删除 13 | 风雨无阻 14 | 奋勇 15 | 16 | # 增加 17 | 即是 18 | 来到 19 | 见到 20 | 异于 21 | 何谓 22 | 没什么 23 | 赶到 24 | 没啥 25 | 123 26 | 只剩 27 | 途中 28 | 只能 29 | 所谓 30 | 看到 31 | 只好 32 | 丢下 33 | 撇下 34 | 看不到 35 | 记得 36 | 任何理由 37 | 最大 38 | ``` 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /pnlp/stopwords/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pnlp.piop import read_lines 3 | 4 | root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 5 | 6 | 7 | chinese_stopwords_file = os.path.join(root, "stopwords/chinese_stopwords.txt") 8 | english_stopwords_file = os.path.join(root, "stopwords/english_stopwords.txt") 9 | 10 | 11 | chinese_stopwords = set(read_lines(chinese_stopwords_file)) 12 | english_stopwords = set(read_lines(english_stopwords_file)) 13 | 14 | 15 | class StopWords: 16 | 17 | def __init__(self, path: str = ""): 18 | self._chinese_stopwords = chinese_stopwords 19 | self._english_stopwords = english_stopwords 20 | if path: 21 | self._stopwords = set(read_lines(path)) 22 | else: 23 | self._stopwords = set() 24 | 25 | @property 26 | def zh(self): 27 | return self._chinese_stopwords 28 | 29 | @property 30 | def zh_len(self): 31 | return len(self._chinese_stopwords) 32 | 33 | @property 34 | def en(self): 35 | return self._english_stopwords 36 | 37 | @property 38 | def en_len(self): 39 | return len(self._english_stopwords) 40 | 41 | @property 42 | def stopwords(self): 43 | return self._stopwords 44 | -------------------------------------------------------------------------------- /pnlp/stopwords/chinese_stopwords.txt: -------------------------------------------------------------------------------- 1 | 2 | ! 3 | " 4 | # 5 | $ 6 | & 7 | ' 8 | ( 9 | ) 10 | * 11 | + 12 | , 13 | - 14 | -- 15 | . 16 | ... 17 | ...... 18 | ................... 19 | ./ 20 | .一 21 | .数 22 | .日 23 | / 24 | // 25 | 0 26 | 1 27 | 2 28 | 3 29 | 4 30 | 5 31 | 6 32 | 7 33 | 8 34 | 9 35 | : 36 | :// 37 | :: 38 | ; 39 | < 40 | = 41 | > 42 | ? 43 | @ 44 | Lex 45 | [ 46 | ] 47 | _ 48 | } 49 | ~~~~ 50 | · 51 | × 52 | ××× 53 | Δ 54 | Ψ 55 | γ 56 | μ 57 | φ 58 | φ. 59 | В 60 | — 61 | —— 62 | ——— 63 | ‘ 64 | ’ 65 | ’‘ 66 | “ 67 | ” 68 | ”, 69 | … 70 | …… 71 | …………………………………………………③ 72 | ′∈ 73 | ′| 74 | ℃ 75 | Ⅲ 76 | ↑ 77 | → 78 | ∈[ 79 | ∪φ∈ 80 | ≈ 81 | ① 82 | ② 83 | ②c 84 | ③ 85 | ③] 86 | ④ 87 | ⑤ 88 | ⑥ 89 | ⑦ 90 | ⑧ 91 | ⑨ 92 | ⑩ 93 | ── 94 | ■ 95 | ▲ 96 | 、 97 | 。 98 | 〉 99 | 《 100 | 》 101 | 》), 102 | 「 103 | 」 104 | 『 105 | 』 106 | 【 107 | 】 108 | 〔 109 | 〕 110 | 〕〔 111 | ㈧ 112 | 一 113 | 一. 
114 | 一一 115 | 一个 116 | 一些 117 | 一何 118 | 一個 119 | 一切 120 | 一则 121 | 一则通过 122 | 一方面 123 | 一旦 124 | 一来 125 | 一样 126 | 一番 127 | 一直 128 | 一般 129 | 一转眼 130 | 万一 131 | 三天两头 132 | 三番两次 133 | 三番五次 134 | 上 135 | 上下 136 | 上去 137 | 上来 138 | 下 139 | 不 140 | 不下 141 | 不了 142 | 不亦乐乎 143 | 不仅 144 | 不仅仅 145 | 不仅仅是 146 | 不会 147 | 不但 148 | 不光 149 | 不免 150 | 不再 151 | 不力 152 | 不单 153 | 不只 154 | 不可开交 155 | 不可抗拒 156 | 不同 157 | 不外 158 | 不外乎 159 | 不大 160 | 不如 161 | 不妨 162 | 不定 163 | 不对 164 | 不少 165 | 不尽 166 | 不尽然 167 | 不巧 168 | 不已 169 | 不常 170 | 不得 171 | 不得不 172 | 不得了 173 | 不得已 174 | 不必 175 | 不怎么 176 | 不怕 177 | 不惟 178 | 不成 179 | 不拘 180 | 不择手段 181 | 不料 182 | 不日 183 | 不时 184 | 不是 185 | 不曾 186 | 不止 187 | 不止一次 188 | 不比 189 | 不消 190 | 不满 191 | 不然 192 | 不然的话 193 | 不特 194 | 不独 195 | 不由得 196 | 不知不觉 197 | 不管 198 | 不管怎样 199 | 不经意 200 | 不胜 201 | 不能 202 | 不能不 203 | 不至于 204 | 不若 205 | 不要 206 | 不论 207 | 不起 208 | 不过 209 | 不迭 210 | 不问 211 | 不限 212 | 与 213 | 与其 214 | 与其说 215 | 与否 216 | 与此同时 217 | 且 218 | 且不说 219 | 且说 220 | 两者 221 | 个 222 | 个人 223 | 个别 224 | 临 225 | 临到 226 | 为 227 | 为了 228 | 为什么 229 | 为何 230 | 为止 231 | 为此 232 | 为着 233 | 举凡 234 | 乃 235 | 乃至 236 | 乃至于 237 | 么 238 | 之 239 | 之一 240 | 之所以 241 | 之类 242 | 乌乎 243 | 乎 244 | 乘 245 | 乘势 246 | 乘机 247 | 乘虚 248 | 乘隙 249 | 也 250 | 也好 251 | 也就是说 252 | 也罢 253 | 了 254 | 二来 255 | 二话不说 256 | 二话没说 257 | 于 258 | 于是 259 | 于是乎 260 | 云云 261 | 云尔 262 | 互相 263 | 些 264 | 交口 265 | 亦 266 | 亲口 267 | 亲手 268 | 亲眼 269 | 亲自 270 | 亲身 271 | 人 272 | 人人 273 | 人们 274 | 人家 275 | 什么 276 | 什么样 277 | 今 278 | 介于 279 | 仍 280 | 仍旧 281 | 仍然 282 | 从 283 | 从不 284 | 从严 285 | 从中 286 | 从今以后 287 | 从优 288 | 从古到今 289 | 从古至今 290 | 从头 291 | 从宽 292 | 从小 293 | 从新 294 | 从无到有 295 | 从早到晚 296 | 从未 297 | 从来 298 | 从此 299 | 从此以后 300 | 从而 301 | 从轻 302 | 从速 303 | 从重 304 | 他 305 | 他人 306 | 他们 307 | 他們 308 | 他是 309 | 以 310 | 以上 311 | 以为 312 | 以便 313 | 以免 314 | 以及 315 | 以故 316 | 以期 317 | 以来 318 | 以至 319 | 以至于 320 | 以致 321 | 们 322 | 任 323 | 任何 324 | 任凭 325 | 伙同 326 | 会 327 | 传说 328 | 传闻 329 | 似的 330 | 但 331 | 但凡 332 | 但愿 333 | 但是 334 | 何 335 | 何乐而不为 336 | 何以 337 | 何况 338 | 何处 339 | 何妨 340 | 何尝 341 | 何必 342 | 何时 343 | 何止 344 | 何苦 345 | 何须 346 | 余外 347 | 作为 348 | 你 349 | 你们 350 | 你們 351 | 你是 352 | 使 353 | 使得 354 | 例如 355 | 依 356 | 依据 357 | 依照 358 | 便于 359 | 俺 360 | 俺们 361 | 倍加 362 | 倍感 363 | 倒不如 364 | 倒不如说 365 | 倒是 366 | 倘 367 | 倘使 368 | 倘或 369 | 倘然 370 | 倘若 371 | 借 372 | 借以 373 | 借此 374 | 假使 375 | 假如 376 | 假若 377 | 偏偏 378 | 偶尔 379 | 偶而 380 | 傥然 381 | 像 382 | 儿 383 | 元/吨 384 | 充其极 385 | 充其量 386 | 充分 387 | 先不先 388 | 光是 389 | 全体 390 | 全力 391 | 全年 392 | 全然 393 | 全身心 394 | 全部 395 | 全都 396 | 八成 397 | 公然 398 | 兮 399 | 共总 400 | 关于 401 | 其 402 | 其一 403 | 其中 404 | 其二 405 | 其他 406 | 其余 407 | 其后 408 | 其它 409 | 其实 410 | 其次 411 | 具体地说 412 | 具体来说 413 | 具体说来 414 | 兼之 415 | 内 416 | 再 417 | 再其次 418 | 再则 419 | 再有 420 | 再次 421 | 再者 422 | 再者说 423 | 再说 424 | 冒 425 | 冲 426 | 决不 427 | 决非 428 | 况且 429 | 凑巧 430 | 凝神 431 | 几 432 | 几乎 433 | 几度 434 | 几时 435 | 几番 436 | 几经 437 | 凡 438 | 凡是 439 | 凭 440 | 凭借 441 | 出于 442 | 出去 443 | 出来 444 | 分别 445 | 分头 446 | 分期分批 447 | 切不可 448 | 切切 449 | 切勿 450 | 切莫 451 | 则 452 | 则甚 453 | 刚好 454 | 刚巧 455 | 刚才 456 | 别 457 | 别人 458 | 别处 459 | 别是 460 | 别的 461 | 别管 462 | 别说 463 | 到 464 | 到了儿 465 | 到处 466 | 到头 467 | 到头来 468 | 到底 469 | 到目前为止 470 | 前后 471 | 前此 472 | 前者 473 | 加上 474 | 加之 475 | 加以 476 | 动不动 477 | 动辄 478 | 勃然 479 | 匆匆 480 | 千万千万 481 | 单单 482 | 单纯 483 | 即 484 | 即令 485 | 即使 486 | 即便 487 | 即刻 488 | 即如 489 | 即将 490 | 即或 491 | 即是说 492 | 即若 493 | 却 494 | 去 495 | 又 496 | 又及 497 | 及 498 | 及其 499 | 及至 500 | 反之 501 | 反之亦然 502 | 反之则 503 | 
反倒 504 | 反倒是 505 | 反手 506 | 反而 507 | 反过来 508 | 反过来说 509 | 取道 510 | 受到 511 | 另 512 | 另一个 513 | 另一方面 514 | 另外 515 | 另悉 516 | 另方面 517 | 另行 518 | 只 519 | 只当 520 | 只怕 521 | 只是 522 | 只有 523 | 只消 524 | 只要 525 | 只限 526 | 叫 527 | 叮咚 528 | 可 529 | 可以 530 | 可好 531 | 可是 532 | 可能 533 | 可见 534 | 各 535 | 各个 536 | 各位 537 | 各式 538 | 各种 539 | 各自 540 | 同 541 | 同时 542 | 后 543 | 后来 544 | 后者 545 | 向 546 | 向使 547 | 向着 548 | 吓 549 | 吗 550 | 否则 551 | 吧 552 | 吧哒 553 | 吱 554 | 呀 555 | 呃 556 | 呆呆地 557 | 呕 558 | 呗 559 | 呜 560 | 呜呼 561 | 呢 562 | 呵 563 | 呵呵 564 | 呸 565 | 呼哧 566 | 呼啦 567 | 咋 568 | 和 569 | 咚 570 | 咦 571 | 咧 572 | 咱 573 | 咱们 574 | 咳 575 | 哇 576 | 哈 577 | 哈哈 578 | 哉 579 | 哎 580 | 哎呀 581 | 哎哟 582 | 哗 583 | 哗啦 584 | 哟 585 | 哦 586 | 哩 587 | 哪 588 | 哪个 589 | 哪些 590 | 哪儿 591 | 哪天 592 | 哪年 593 | 哪怕 594 | 哪样 595 | 哪边 596 | 哪里 597 | 哼 598 | 哼唷 599 | 唉 600 | 唯有 601 | 啊 602 | 啊呀 603 | 啊哈 604 | 啊哟 605 | 啐 606 | 啥 607 | 啦 608 | 啪达 609 | 啷当 610 | 喂 611 | 喏 612 | 喔唷 613 | 喽 614 | 嗡 615 | 嗡嗡 616 | 嗬 617 | 嗯 618 | 嗳 619 | 嘎 620 | 嘎嘎 621 | 嘎登 622 | 嘘 623 | 嘛 624 | 嘻 625 | 嘿 626 | 嘿嘿 627 | 因 628 | 因为 629 | 因了 630 | 因此 631 | 因着 632 | 因而 633 | 固然 634 | 在 635 | 在下 636 | 在于 637 | 地 638 | 基于 639 | 基本 640 | 基本上 641 | 处在 642 | 处处 643 | 多 644 | 多么 645 | 多亏 646 | 多多 647 | 多多少少 648 | 多多益善 649 | 多少 650 | 多年前 651 | 多年来 652 | 多次 653 | 够瞧的 654 | 大 655 | 大不了 656 | 大举 657 | 大体上 658 | 大凡 659 | 大多 660 | 大大 661 | 大家 662 | 大张旗鼓 663 | 大抵 664 | 大概 665 | 大略 666 | 大约 667 | 大致 668 | 大都 669 | 大面儿上 670 | 她 671 | 她们 672 | 她們 673 | 她是 674 | 好 675 | 好在 676 | 如 677 | 如上 678 | 如上所述 679 | 如下 680 | 如今 681 | 如何 682 | 如其 683 | 如前所述 684 | 如同 685 | 如常 686 | 如是 687 | 如期 688 | 如果 689 | 如次 690 | 如此 691 | 如此等等 692 | 如若 693 | 妳們 694 | 始而 695 | 姑且 696 | 存心 697 | 孰料 698 | 孰知 699 | 宁 700 | 宁可 701 | 宁愿 702 | 宁肯 703 | 它 704 | 它们 705 | 它是 706 | 对 707 | 对于 708 | 对待 709 | 对方 710 | 对比 711 | 将 712 | 将才 713 | 将要 714 | 将近 715 | 小 716 | 尔 717 | 尔后 718 | 尔尔 719 | 尔等 720 | 尚且 721 | 就 722 | 就地 723 | 就是 724 | 就是了 725 | 就是说 726 | 就此 727 | 就算 728 | 就要 729 | 尽 730 | 尽可能 731 | 尽如人意 732 | 尽心尽力 733 | 尽心竭力 734 | 尽快 735 | 尽早 736 | 尽然 737 | 尽管 738 | 尽管如此 739 | 尽量 740 | 局外 741 | 居然 742 | 届时 743 | 屡屡 744 | 屡次 745 | 屡次三番 746 | 岂但 747 | 岂止 748 | 岂非 749 | 川流不息 750 | 差一点 751 | 差不多 752 | 己 753 | 已 754 | 已矣 755 | 巴 756 | 巴巴 757 | 常言说 758 | 常言说得好 759 | 常言道 760 | 平素 761 | 年复一年 762 | 并 763 | 并且 764 | 并排 765 | 并无 766 | 并没 767 | 并没有 768 | 并肩 769 | 并非 770 | 庶乎 771 | 庶几 772 | 开外 773 | 开始 774 | 弹指之间 775 | 归 776 | 归根到底 777 | 归根结底 778 | 归齐 779 | 当 780 | 当下 781 | 当中 782 | 当儿 783 | 当即 784 | 当口儿 785 | 当地 786 | 当场 787 | 当头 788 | 当庭 789 | 当然 790 | 当真 791 | 当着 792 | 彻夜 793 | 彼 794 | 彼时 795 | 彼此 796 | 往 797 | 待 798 | 待到 799 | 很 800 | 很多 801 | 很少 802 | 得 803 | 得了 804 | 得天独厚 805 | 得起 806 | 必定 807 | 必将 808 | 必须 809 | 快要 810 | 忽地 811 | 忽然 812 | 怎 813 | 怎么 814 | 怎么办 815 | 怎么样 816 | 怎奈 817 | 怎样 818 | 急匆匆 819 | 怪不得 820 | 总之 821 | 总的来看 822 | 总的来说 823 | 总的说来 824 | 总而言之 825 | 恍然 826 | 恐怕 827 | 恰似 828 | 恰好 829 | 恰如 830 | 恰巧 831 | 恰恰 832 | 恰恰相反 833 | 恰逢 834 | 您 835 | 您们 836 | 您是 837 | 惟其 838 | 惯常 839 | 愤然 840 | 慢说 841 | 成年累月 842 | 成心 843 | 我 844 | 我们 845 | 我們 846 | 我是 847 | 或 848 | 或则 849 | 或多或少 850 | 或是 851 | 或曰 852 | 或者 853 | 或许 854 | 截然 855 | 截至 856 | 所 857 | 所以 858 | 所在 859 | 所幸 860 | 所有 861 | 才 862 | 才能 863 | 扑通 864 | 打 865 | 打从 866 | 打开天窗说亮话 867 | 把 868 | 抑或 869 | 抽冷子 870 | 拦腰 871 | 拿 872 | 按 873 | 按时 874 | 按期 875 | 按照 876 | 按理 877 | 按说 878 | 挨个 879 | 挨家挨户 880 | 挨次 881 | 挨着 882 | 挨门挨户 883 | 挨门逐户 884 | 换句话说 885 | 换言之 886 | 据 887 | 据实 888 | 据悉 889 | 据我所知 890 | 据此 891 | 据称 892 | 据说 893 | 接下来 894 | 接着 895 | 接连不断 896 | 故 897 | 
故意 898 | 故此 899 | 故而 900 | 敞开儿 901 | 敢于 902 | 敢情 903 | 数/ 904 | 断然 905 | 方才 906 | 方能 907 | 旁人 908 | 无 909 | 无宁 910 | 无论 911 | 既 912 | 既往 913 | 既是 914 | 既然 915 | 日复一日 916 | 日渐 917 | 日益 918 | 日臻 919 | 日见 920 | 时候 921 | 昂然 922 | 是 923 | 是以 924 | 是否 925 | 是的 926 | 暗中 927 | 暗地里 928 | 暗自 929 | 更为 930 | 更加 931 | 更进一步 932 | 曾 933 | 替 934 | 替代 935 | 最 936 | 最后 937 | 有 938 | 有些 939 | 有关 940 | 有及 941 | 有时 942 | 有的 943 | 有的是 944 | 望 945 | 朝 946 | 朝着 947 | 末##末 948 | 本 949 | 本人 950 | 本地 951 | 本着 952 | 本身 953 | 权时 954 | 来 955 | 来不及 956 | 来得及 957 | 来看 958 | 来着 959 | 来自 960 | 来讲 961 | 来说 962 | 极为 963 | 极了 964 | 极其 965 | 极力 966 | 极大 967 | 极度 968 | 极端 969 | 果然 970 | 果真 971 | 某 972 | 某个 973 | 某些 974 | 某某 975 | 根据 976 | 格外 977 | 次第 978 | 欤 979 | 正值 980 | 正如 981 | 正巧 982 | 正是 983 | 此 984 | 此中 985 | 此后 986 | 此地 987 | 此处 988 | 此外 989 | 此时 990 | 此次 991 | 此间 992 | 毋宁 993 | 每 994 | 每当 995 | 每时每刻 996 | 每每 997 | 每逢 998 | 比 999 | 比及 1000 | 比如 1001 | 比如说 1002 | 比方 1003 | 比照 1004 | 比起 1005 | 毕竟 1006 | 毫不 1007 | 毫无 1008 | 毫无例外 1009 | 毫无保留地 1010 | 沒有 1011 | 沙沙 1012 | 没奈何 1013 | 没有 1014 | 沿 1015 | 沿着 1016 | 漫说 1017 | 焉 1018 | 然则 1019 | 然后 1020 | 然而 1021 | 照 1022 | 照着 1023 | 牢牢 1024 | 犹且 1025 | 犹自 1026 | 独自 1027 | 猛然 1028 | 猛然间 1029 | 率尔 1030 | 率然 1031 | 理应 1032 | 理当 1033 | 理该 1034 | 瑟瑟 1035 | 甚且 1036 | 甚么 1037 | 甚或 1038 | 甚而 1039 | 甚至 1040 | 甚至于 1041 | 用 1042 | 用来 1043 | 甭 1044 | 由 1045 | 由于 1046 | 由是 1047 | 由此 1048 | 由此可见 1049 | 略为 1050 | 略加 1051 | 略微 1052 | 的 1053 | 的确 1054 | 的话 1055 | 皆可 1056 | 直到 1057 | 相对而言 1058 | 省得 1059 | 看 1060 | 看上去 1061 | 看来 1062 | 看样子 1063 | 看起来 1064 | 眨眼 1065 | 着 1066 | 着呢 1067 | 矣 1068 | 矣乎 1069 | 矣哉 1070 | 砰 1071 | 碰巧 1072 | 离 1073 | 种 1074 | 究竟 1075 | 穷年累月 1076 | 立刻 1077 | 立地 1078 | 立时 1079 | 立马 1080 | 竟然 1081 | 竟而 1082 | 第 1083 | 第二 1084 | 等 1085 | 等到 1086 | 等等 1087 | 策略地 1088 | 简直 1089 | 简而言之 1090 | 简言之 1091 | 管 1092 | 类如 1093 | 精光 1094 | 紧接着 1095 | 累年 1096 | 累次 1097 | 纯粹 1098 | 纵 1099 | 纵令 1100 | 纵使 1101 | 纵然 1102 | 经 1103 | 经常 1104 | 经过 1105 | 结果 1106 | 给 1107 | 绝不 1108 | 绝对 1109 | 绝非 1110 | 绝顶 1111 | 继之 1112 | 继后 1113 | 继而 1114 | 综上所述 1115 | 缕缕 1116 | 罢了 1117 | 老是 1118 | 老老实实 1119 | 者 1120 | 而 1121 | 而且 1122 | 而况 1123 | 而又 1124 | 而后 1125 | 而外 1126 | 而已 1127 | 而是 1128 | 而言 1129 | 而论 1130 | 联袂 1131 | 背地里 1132 | 背靠背 1133 | 能 1134 | 能否 1135 | 腾 1136 | 自 1137 | 自个儿 1138 | 自从 1139 | 自各儿 1140 | 自后 1141 | 自家 1142 | 自己 1143 | 自打 1144 | 自身 1145 | 至 1146 | 至于 1147 | 至今 1148 | 至若 1149 | 致 1150 | 與 1151 | 般的 1152 | 若 1153 | 若夫 1154 | 若是 1155 | 若果 1156 | 若非 1157 | 莫不 1158 | 莫不然 1159 | 莫如 1160 | 莫若 1161 | 莫非 1162 | 著 1163 | 藉以 1164 | 虽 1165 | 虽则 1166 | 虽然 1167 | 虽说 1168 | 被 1169 | 要 1170 | 要不 1171 | 要不是 1172 | 要不然 1173 | 要么 1174 | 要是 1175 | 譬喻 1176 | 譬如 1177 | 让 1178 | 许多 1179 | 论 1180 | 论说 1181 | 设使 1182 | 设或 1183 | 设若 1184 | 诚如 1185 | 诚然 1186 | 话说 1187 | 该 1188 | 该当 1189 | 说来 1190 | 请勿 1191 | 诸 1192 | 诸位 1193 | 诸如 1194 | 谁 1195 | 谁人 1196 | 谁料 1197 | 谁知 1198 | 豁然 1199 | 贼死 1200 | 赖以 1201 | 赶 1202 | 赶快 1203 | 赶早不赶晚 1204 | 起 1205 | 起先 1206 | 起初 1207 | 起头 1208 | 起来 1209 | 起见 1210 | 起首 1211 | 趁 1212 | 趁便 1213 | 趁势 1214 | 趁早 1215 | 趁机 1216 | 趁热 1217 | 趁着 1218 | 越是 1219 | 距 1220 | 跟 1221 | 路经 1222 | 轰然 1223 | 较 1224 | 较为 1225 | 较之 1226 | 较比 1227 | 边 1228 | 达旦 1229 | 过 1230 | 过于 1231 | 近几年来 1232 | 近年来 1233 | 近来 1234 | 还 1235 | 还是 1236 | 还有 1237 | 还要 1238 | 这 1239 | 这一来 1240 | 这个 1241 | 这么 1242 | 这么些 1243 | 这么样 1244 | 这么点儿 1245 | 这些 1246 | 这会儿 1247 | 这儿 1248 | 这就是说 1249 | 这时 1250 | 这样 1251 | 这次 1252 | 这般 1253 | 这边 1254 | 这里 1255 | 进去 1256 | 进来 1257 | 进而 1258 | 连 1259 | 连同 1260 | 连声 1261 | 连日 1262 | 连日来 1263 
| 连袂 1264 | 连连 1265 | 迟早 1266 | 迫于 1267 | 逐步 1268 | 通过 1269 | 遵循 1270 | 遵照 1271 | 那 1272 | 那个 1273 | 那么 1274 | 那么些 1275 | 那么样 1276 | 那些 1277 | 那会儿 1278 | 那儿 1279 | 那时 1280 | 那末 1281 | 那样 1282 | 那般 1283 | 那边 1284 | 那里 1285 | 都 1286 | 鄙人 1287 | 鉴于 1288 | 针对 1289 | 长期以来 1290 | 长此下去 1291 | 长话短说 1292 | 间或 1293 | 阿 1294 | 陡然 1295 | 除 1296 | 除了 1297 | 除却 1298 | 除去 1299 | 除外 1300 | 除开 1301 | 除此 1302 | 除此之外 1303 | 除此以外 1304 | 除此而外 1305 | 除非 1306 | 随 1307 | 随后 1308 | 随时 1309 | 随着 1310 | 隔夜 1311 | 隔日 1312 | 难得 1313 | 难怪 1314 | 难说 1315 | 难道 1316 | 难道说 1317 | 非但 1318 | 非常 1319 | 非徒 1320 | 非得 1321 | 非特 1322 | 非独 1323 | 靠 1324 | 顶多 1325 | 顷刻 1326 | 顷刻之间 1327 | 顷刻间 1328 | 顺 1329 | 顺着 1330 | 顿时 1331 | 首先 1332 | 马上 1333 | 高低 1334 | 默然 1335 | 默默地 1336 | ! 1337 | # 1338 | % 1339 | & 1340 | ' 1341 | ( 1342 | ) 1343 | )÷(1- 1344 | )、 1345 | * 1346 | + 1347 | +ξ 1348 | ++ 1349 | , 1350 | ,也 1351 | - 1352 | -β 1353 | -- 1354 | -[*]- 1355 | . 1356 | / 1357 | 0:2 1358 | 1. 1359 | 12% 1360 | 2.3% 1361 | 5:0 1362 | : 1363 | ; 1364 | < 1365 | <± 1366 | <Δ 1367 | <λ 1368 | <φ 1369 | << 1370 | = 1371 | =″ 1372 | =☆ 1373 | =( 1374 | =- 1375 | =[ 1376 | ={ 1377 | > 1378 | >λ 1379 | ? 1380 | A 1381 | LI 1382 | R.L. 1383 | ZXFITL 1384 | [ 1385 | [①①] 1386 | [①②] 1387 | [①③] 1388 | [①④] 1389 | [①⑤] 1390 | [①⑥] 1391 | [①⑦] 1392 | [①⑧] 1393 | [①⑨] 1394 | [①A] 1395 | [①B] 1396 | [①C] 1397 | [①D] 1398 | [①E] 1399 | [①] 1400 | [①a] 1401 | [①c] 1402 | [①d] 1403 | [①e] 1404 | [①f] 1405 | [①g] 1406 | [①h] 1407 | [①i] 1408 | [①o] 1409 | [② 1410 | [②①] 1411 | [②②] 1412 | [②③] 1413 | [②④ 1414 | [②⑤] 1415 | [②⑥] 1416 | [②⑦] 1417 | [②⑧] 1418 | [②⑩] 1419 | [②B] 1420 | [②G] 1421 | [②] 1422 | [②a] 1423 | [②b] 1424 | [②c] 1425 | [②d] 1426 | [②e] 1427 | [②f] 1428 | [②g] 1429 | [②h] 1430 | [②i] 1431 | [②j] 1432 | [③①] 1433 | [③⑩] 1434 | [③F] 1435 | [③] 1436 | [③a] 1437 | [③b] 1438 | [③c] 1439 | [③d] 1440 | [③e] 1441 | [③g] 1442 | [③h] 1443 | [④] 1444 | [④a] 1445 | [④b] 1446 | [④c] 1447 | [④d] 1448 | [④e] 1449 | [⑤] 1450 | [⑤]] 1451 | [⑤a] 1452 | [⑤b] 1453 | [⑤d] 1454 | [⑤e] 1455 | [⑤f] 1456 | [⑥] 1457 | [⑦] 1458 | [⑧] 1459 | [⑨] 1460 | [⑩] 1461 | [*] 1462 | [- 1463 | [] 1464 | ] 1465 | ]∧′=[ 1466 | ][ 1467 | _ 1468 | a] 1469 | b] 1470 | c] 1471 | e] 1472 | f] 1473 | ng昉 1474 | {- 1475 | } 1476 | }> 1477 | ~ 1478 | ~± 1479 | ~+ 1480 | 只剩 1481 | 所谓 1482 | 异于 1483 | 何谓 1484 | 即是 1485 | 来到 1486 | 赶到 1487 | 看不到 1488 | 看到 1489 | 只能 1490 | 只好 1491 | 没啥 1492 | 没什么 1493 | 见到 1494 | 记得 1495 | 123 1496 | 任何理由 1497 | 丢下 1498 | 撇下 1499 | 途中 1500 | 最大 1501 | 1502 | 未 1503 | 快 1504 | 登时 1505 | 无所 1506 | 妄 1507 | 无论如何 1508 | 难免 1509 | 未必 1510 | 一定 1511 | 已经 1512 | 好不 1513 | 太 1514 | 必然 1515 | 越 1516 | 久而久之 1517 | 倒 1518 | 尤其 1519 | 总是 1520 | 原本 1521 | 一不小心 1522 | 真 1523 | 有点 1524 | 起码 1525 | 实际上 1526 | 无非 1527 | 永远 1528 | 顺便 1529 | 一手 1530 | 就这样 1531 | 更 1532 | 常 1533 | 最好 1534 | 或者说 1535 | 没黑 1536 | 乱 1537 | 相 1538 | 先 1539 | 终于 1540 | 十分 1541 | 总 1542 | 不够 1543 | 有一天 1544 | 放声 1545 | 比较 1546 | 老 1547 | 好像 1548 | 不管怎么说 1549 | 仿佛 1550 | 极 1551 | 正 1552 | 非 1553 | 未免 1554 | 生来 1555 | 正在 1556 | 完全 1557 | 光 1558 | 刚 1559 | 似 1560 | 相当 1561 | 真是 1562 | 成天 1563 | 确实 1564 | 原来 1565 | 肯定 1566 | 没 1567 | 曾经 1568 | 反正 1569 | 实在 1570 | 同样 1571 | 足 1572 | 并不 1573 | 常常 1574 | 慢慢 1575 | 绝 1576 | 也许 1577 | 往往 1578 | 猛 1579 | 古往今来 1580 | 大体 1581 | 刚刚 1582 | 越来越 1583 | 早 1584 | 以此 1585 | 和你 1586 | 稍 1587 | 决 1588 | 再加上 1589 | 初 1590 | 尚 1591 | 至高 1592 | 事实上 1593 | 更何况 1594 | 全盘 1595 | 在此 1596 | 早就 1597 | 足以 1598 | 一心 1599 | 就是我 1600 | 一闪 1601 | 难以 1602 | 对此 1603 | 特别 1604 | 
在内 1605 | 该不该 1606 | 似乎 1607 | 总算 1608 | 相比之下 1609 | 不 1610 | -------------------------------------------------------------------------------- /pnlp/stopwords/english_stopwords.txt: -------------------------------------------------------------------------------- 1 | 0o 2 | 0s 3 | 3a 4 | 3b 5 | 3d 6 | 6b 7 | 6o 8 | a 9 | a1 10 | a2 11 | a3 12 | a4 13 | ab 14 | able 15 | about 16 | above 17 | abst 18 | ac 19 | accordance 20 | according 21 | accordingly 22 | across 23 | act 24 | actually 25 | ad 26 | added 27 | adj 28 | ae 29 | af 30 | affected 31 | affecting 32 | affects 33 | after 34 | afterwards 35 | ag 36 | again 37 | against 38 | ah 39 | ain 40 | ain't 41 | aj 42 | al 43 | all 44 | allow 45 | allows 46 | almost 47 | alone 48 | along 49 | already 50 | also 51 | although 52 | always 53 | am 54 | among 55 | amongst 56 | amoungst 57 | amount 58 | an 59 | and 60 | announce 61 | another 62 | any 63 | anybody 64 | anyhow 65 | anymore 66 | anyone 67 | anything 68 | anyway 69 | anyways 70 | anywhere 71 | ao 72 | ap 73 | apart 74 | apparently 75 | appear 76 | appreciate 77 | appropriate 78 | approximately 79 | ar 80 | are 81 | aren 82 | arent 83 | aren't 84 | arise 85 | around 86 | as 87 | a's 88 | aside 89 | ask 90 | asking 91 | associated 92 | at 93 | au 94 | auth 95 | av 96 | available 97 | aw 98 | away 99 | awfully 100 | ax 101 | ay 102 | az 103 | b 104 | b1 105 | b2 106 | b3 107 | ba 108 | back 109 | bc 110 | bd 111 | be 112 | became 113 | because 114 | become 115 | becomes 116 | becoming 117 | been 118 | before 119 | beforehand 120 | begin 121 | beginning 122 | beginnings 123 | begins 124 | behind 125 | being 126 | believe 127 | below 128 | beside 129 | besides 130 | best 131 | better 132 | between 133 | beyond 134 | bi 135 | bill 136 | biol 137 | bj 138 | bk 139 | bl 140 | bn 141 | both 142 | bottom 143 | bp 144 | br 145 | brief 146 | briefly 147 | bs 148 | bt 149 | bu 150 | but 151 | bx 152 | by 153 | c 154 | c1 155 | c2 156 | c3 157 | ca 158 | call 159 | came 160 | can 161 | cannot 162 | cant 163 | can't 164 | cause 165 | causes 166 | cc 167 | cd 168 | ce 169 | certain 170 | certainly 171 | cf 172 | cg 173 | ch 174 | changes 175 | ci 176 | cit 177 | cj 178 | cl 179 | clearly 180 | cm 181 | c'mon 182 | cn 183 | co 184 | com 185 | come 186 | comes 187 | con 188 | concerning 189 | consequently 190 | consider 191 | considering 192 | contain 193 | containing 194 | contains 195 | corresponding 196 | could 197 | couldn 198 | couldnt 199 | couldn't 200 | course 201 | cp 202 | cq 203 | cr 204 | cry 205 | cs 206 | c's 207 | ct 208 | cu 209 | currently 210 | cv 211 | cx 212 | cy 213 | cz 214 | d 215 | d2 216 | da 217 | date 218 | dc 219 | dd 220 | de 221 | definitely 222 | describe 223 | described 224 | despite 225 | detail 226 | df 227 | di 228 | did 229 | didn 230 | didn't 231 | different 232 | dj 233 | dk 234 | dl 235 | do 236 | does 237 | doesn 238 | doesn't 239 | doing 240 | don 241 | done 242 | don't 243 | down 244 | downwards 245 | dp 246 | dr 247 | ds 248 | dt 249 | du 250 | due 251 | during 252 | dx 253 | dy 254 | e 255 | e2 256 | e3 257 | ea 258 | each 259 | ec 260 | ed 261 | edu 262 | ee 263 | ef 264 | effect 265 | eg 266 | ei 267 | eight 268 | eighty 269 | either 270 | ej 271 | el 272 | eleven 273 | else 274 | elsewhere 275 | em 276 | empty 277 | en 278 | end 279 | ending 280 | enough 281 | entirely 282 | eo 283 | ep 284 | eq 285 | er 286 | es 287 | especially 288 | est 289 | et 290 | et-al 291 | etc 292 | eu 293 | ev 294 | even 295 | ever 296 | every 297 | everybody 298 | everyone 299 | 
everything 300 | everywhere 301 | ex 302 | exactly 303 | example 304 | except 305 | ey 306 | f 307 | f2 308 | fa 309 | far 310 | fc 311 | few 312 | ff 313 | fi 314 | fifteen 315 | fifth 316 | fify 317 | fill 318 | find 319 | fire 320 | first 321 | five 322 | fix 323 | fj 324 | fl 325 | fn 326 | fo 327 | followed 328 | following 329 | follows 330 | for 331 | former 332 | formerly 333 | forth 334 | forty 335 | found 336 | four 337 | fr 338 | from 339 | front 340 | fs 341 | ft 342 | fu 343 | full 344 | further 345 | furthermore 346 | fy 347 | g 348 | ga 349 | gave 350 | ge 351 | get 352 | gets 353 | getting 354 | gi 355 | give 356 | given 357 | gives 358 | giving 359 | gj 360 | gl 361 | go 362 | goes 363 | going 364 | gone 365 | got 366 | gotten 367 | gr 368 | greetings 369 | gs 370 | gy 371 | h 372 | h2 373 | h3 374 | had 375 | hadn 376 | hadn't 377 | happens 378 | hardly 379 | has 380 | hasn 381 | hasnt 382 | hasn't 383 | have 384 | haven 385 | haven't 386 | having 387 | he 388 | hed 389 | he'd 390 | he'll 391 | hello 392 | help 393 | hence 394 | her 395 | here 396 | hereafter 397 | hereby 398 | herein 399 | heres 400 | here's 401 | hereupon 402 | hers 403 | herself 404 | hes 405 | he's 406 | hh 407 | hi 408 | hid 409 | him 410 | himself 411 | his 412 | hither 413 | hj 414 | ho 415 | home 416 | hopefully 417 | how 418 | howbeit 419 | however 420 | how's 421 | hr 422 | hs 423 | http 424 | hu 425 | hundred 426 | hy 427 | i 428 | i2 429 | i3 430 | i4 431 | i6 432 | i7 433 | i8 434 | ia 435 | ib 436 | ibid 437 | ic 438 | id 439 | i'd 440 | ie 441 | if 442 | ig 443 | ignored 444 | ih 445 | ii 446 | ij 447 | il 448 | i'll 449 | im 450 | i'm 451 | immediate 452 | immediately 453 | importance 454 | important 455 | in 456 | inasmuch 457 | inc 458 | indeed 459 | index 460 | indicate 461 | indicated 462 | indicates 463 | information 464 | inner 465 | insofar 466 | instead 467 | interest 468 | into 469 | invention 470 | inward 471 | io 472 | ip 473 | iq 474 | ir 475 | is 476 | isn 477 | isn't 478 | it 479 | itd 480 | it'd 481 | it'll 482 | its 483 | it's 484 | itself 485 | iv 486 | i've 487 | ix 488 | iy 489 | iz 490 | j 491 | jj 492 | jr 493 | js 494 | jt 495 | ju 496 | just 497 | k 498 | ke 499 | keep 500 | keeps 501 | kept 502 | kg 503 | kj 504 | km 505 | know 506 | known 507 | knows 508 | ko 509 | l 510 | l2 511 | la 512 | largely 513 | last 514 | lately 515 | later 516 | latter 517 | latterly 518 | lb 519 | lc 520 | le 521 | least 522 | les 523 | less 524 | lest 525 | let 526 | lets 527 | let's 528 | lf 529 | like 530 | liked 531 | likely 532 | line 533 | little 534 | lj 535 | ll 536 | ll 537 | ln 538 | lo 539 | look 540 | looking 541 | looks 542 | los 543 | lr 544 | ls 545 | lt 546 | ltd 547 | m 548 | m2 549 | ma 550 | made 551 | mainly 552 | make 553 | makes 554 | many 555 | may 556 | maybe 557 | me 558 | mean 559 | means 560 | meantime 561 | meanwhile 562 | merely 563 | mg 564 | might 565 | mightn 566 | mightn't 567 | mill 568 | million 569 | mine 570 | miss 571 | ml 572 | mn 573 | mo 574 | more 575 | moreover 576 | most 577 | mostly 578 | move 579 | mr 580 | mrs 581 | ms 582 | mt 583 | mu 584 | much 585 | mug 586 | must 587 | mustn 588 | mustn't 589 | my 590 | myself 591 | n 592 | n2 593 | na 594 | name 595 | namely 596 | nay 597 | nc 598 | nd 599 | ne 600 | near 601 | nearly 602 | necessarily 603 | necessary 604 | need 605 | needn 606 | needn't 607 | needs 608 | neither 609 | never 610 | nevertheless 611 | new 612 | next 613 | ng 614 | ni 615 | nine 616 | ninety 617 | nj 618 | nl 619 | nn 620 
| no 621 | nobody 622 | non 623 | none 624 | nonetheless 625 | noone 626 | nor 627 | normally 628 | nos 629 | not 630 | noted 631 | nothing 632 | novel 633 | now 634 | nowhere 635 | nr 636 | ns 637 | nt 638 | ny 639 | o 640 | oa 641 | ob 642 | obtain 643 | obtained 644 | obviously 645 | oc 646 | od 647 | of 648 | off 649 | often 650 | og 651 | oh 652 | oi 653 | oj 654 | ok 655 | okay 656 | ol 657 | old 658 | om 659 | omitted 660 | on 661 | once 662 | one 663 | ones 664 | only 665 | onto 666 | oo 667 | op 668 | oq 669 | or 670 | ord 671 | os 672 | ot 673 | other 674 | others 675 | otherwise 676 | ou 677 | ought 678 | our 679 | ours 680 | ourselves 681 | out 682 | outside 683 | over 684 | overall 685 | ow 686 | owing 687 | own 688 | ox 689 | oz 690 | p 691 | p1 692 | p2 693 | p3 694 | page 695 | pagecount 696 | pages 697 | par 698 | part 699 | particular 700 | particularly 701 | pas 702 | past 703 | pc 704 | pd 705 | pe 706 | per 707 | perhaps 708 | pf 709 | ph 710 | pi 711 | pj 712 | pk 713 | pl 714 | placed 715 | please 716 | plus 717 | pm 718 | pn 719 | po 720 | poorly 721 | possible 722 | possibly 723 | potentially 724 | pp 725 | pq 726 | pr 727 | predominantly 728 | present 729 | presumably 730 | previously 731 | primarily 732 | probably 733 | promptly 734 | proud 735 | provides 736 | ps 737 | pt 738 | pu 739 | put 740 | py 741 | q 742 | qj 743 | qu 744 | que 745 | quickly 746 | quite 747 | qv 748 | r 749 | r2 750 | ra 751 | ran 752 | rather 753 | rc 754 | rd 755 | re 756 | readily 757 | really 758 | reasonably 759 | recent 760 | recently 761 | ref 762 | refs 763 | regarding 764 | regardless 765 | regards 766 | related 767 | relatively 768 | research 769 | research-articl 770 | respectively 771 | resulted 772 | resulting 773 | results 774 | rf 775 | rh 776 | ri 777 | right 778 | rj 779 | rl 780 | rm 781 | rn 782 | ro 783 | rq 784 | rr 785 | rs 786 | rt 787 | ru 788 | run 789 | rv 790 | ry 791 | s 792 | s2 793 | sa 794 | said 795 | same 796 | saw 797 | say 798 | saying 799 | says 800 | sc 801 | sd 802 | se 803 | sec 804 | second 805 | secondly 806 | section 807 | see 808 | seeing 809 | seem 810 | seemed 811 | seeming 812 | seems 813 | seen 814 | self 815 | selves 816 | sensible 817 | sent 818 | serious 819 | seriously 820 | seven 821 | several 822 | sf 823 | shall 824 | shan 825 | shan't 826 | she 827 | shed 828 | she'd 829 | she'll 830 | shes 831 | she's 832 | should 833 | shouldn 834 | shouldn't 835 | should've 836 | show 837 | showed 838 | shown 839 | showns 840 | shows 841 | si 842 | side 843 | significant 844 | significantly 845 | similar 846 | similarly 847 | since 848 | sincere 849 | six 850 | sixty 851 | sj 852 | sl 853 | slightly 854 | sm 855 | sn 856 | so 857 | some 858 | somebody 859 | somehow 860 | someone 861 | somethan 862 | something 863 | sometime 864 | sometimes 865 | somewhat 866 | somewhere 867 | soon 868 | sorry 869 | sp 870 | specifically 871 | specified 872 | specify 873 | specifying 874 | sq 875 | sr 876 | ss 877 | st 878 | still 879 | stop 880 | strongly 881 | sub 882 | substantially 883 | successfully 884 | such 885 | sufficiently 886 | suggest 887 | sup 888 | sure 889 | sy 890 | system 891 | sz 892 | t 893 | t1 894 | t2 895 | t3 896 | take 897 | taken 898 | taking 899 | tb 900 | tc 901 | td 902 | te 903 | tell 904 | ten 905 | tends 906 | tf 907 | th 908 | than 909 | thank 910 | thanks 911 | thanx 912 | that 913 | that'll 914 | thats 915 | that's 916 | that've 917 | the 918 | their 919 | theirs 920 | them 921 | themselves 922 | then 923 | thence 924 | there 925 | 
thereafter 926 | thereby 927 | thered 928 | therefore 929 | therein 930 | there'll 931 | thereof 932 | therere 933 | theres 934 | there's 935 | thereto 936 | thereupon 937 | there've 938 | these 939 | they 940 | theyd 941 | they'd 942 | they'll 943 | theyre 944 | they're 945 | they've 946 | thickv 947 | thin 948 | think 949 | third 950 | this 951 | thorough 952 | thoroughly 953 | those 954 | thou 955 | though 956 | thoughh 957 | thousand 958 | three 959 | throug 960 | through 961 | throughout 962 | thru 963 | thus 964 | ti 965 | til 966 | tip 967 | tj 968 | tl 969 | tm 970 | tn 971 | to 972 | together 973 | too 974 | took 975 | top 976 | toward 977 | towards 978 | tp 979 | tq 980 | tr 981 | tried 982 | tries 983 | truly 984 | try 985 | trying 986 | ts 987 | t's 988 | tt 989 | tv 990 | twelve 991 | twenty 992 | twice 993 | two 994 | tx 995 | u 996 | u201d 997 | ue 998 | ui 999 | uj 1000 | uk 1001 | um 1002 | un 1003 | under 1004 | unfortunately 1005 | unless 1006 | unlike 1007 | unlikely 1008 | until 1009 | unto 1010 | uo 1011 | up 1012 | upon 1013 | ups 1014 | ur 1015 | us 1016 | use 1017 | used 1018 | useful 1019 | usefully 1020 | usefulness 1021 | uses 1022 | using 1023 | usually 1024 | ut 1025 | v 1026 | va 1027 | value 1028 | various 1029 | vd 1030 | ve 1031 | ve 1032 | very 1033 | via 1034 | viz 1035 | vj 1036 | vo 1037 | vol 1038 | vols 1039 | volumtype 1040 | vq 1041 | vs 1042 | vt 1043 | vu 1044 | w 1045 | wa 1046 | want 1047 | wants 1048 | was 1049 | wasn 1050 | wasnt 1051 | wasn't 1052 | way 1053 | we 1054 | wed 1055 | we'd 1056 | welcome 1057 | well 1058 | we'll 1059 | well-b 1060 | went 1061 | were 1062 | we're 1063 | weren 1064 | werent 1065 | weren't 1066 | we've 1067 | what 1068 | whatever 1069 | what'll 1070 | whats 1071 | what's 1072 | when 1073 | whence 1074 | whenever 1075 | when's 1076 | where 1077 | whereafter 1078 | whereas 1079 | whereby 1080 | wherein 1081 | wheres 1082 | where's 1083 | whereupon 1084 | wherever 1085 | whether 1086 | which 1087 | while 1088 | whim 1089 | whither 1090 | who 1091 | whod 1092 | whoever 1093 | whole 1094 | who'll 1095 | whom 1096 | whomever 1097 | whos 1098 | who's 1099 | whose 1100 | why 1101 | why's 1102 | wi 1103 | widely 1104 | will 1105 | willing 1106 | wish 1107 | with 1108 | within 1109 | without 1110 | wo 1111 | won 1112 | wonder 1113 | wont 1114 | won't 1115 | words 1116 | world 1117 | would 1118 | wouldn 1119 | wouldnt 1120 | wouldn't 1121 | www 1122 | x 1123 | x1 1124 | x2 1125 | x3 1126 | xf 1127 | xi 1128 | xj 1129 | xk 1130 | xl 1131 | xn 1132 | xo 1133 | xs 1134 | xt 1135 | xv 1136 | xx 1137 | y 1138 | y2 1139 | yes 1140 | yet 1141 | yj 1142 | yl 1143 | you 1144 | youd 1145 | you'd 1146 | you'll 1147 | your 1148 | youre 1149 | you're 1150 | yours 1151 | yourself 1152 | yourselves 1153 | you've 1154 | yr 1155 | ys 1156 | yt 1157 | z 1158 | zero 1159 | zi 1160 | zz 1161 | -------------------------------------------------------------------------------- /pnlp/utils.py: -------------------------------------------------------------------------------- 1 | from functools import wraps, partial 2 | from typing import Any, List, Generator, Callable 3 | import multiprocessing as mp 4 | from multiprocessing import Pool 5 | from multiprocessing.pool import ThreadPool 6 | from concurrent.futures import ThreadPoolExecutor 7 | from threading import Thread 8 | import numpy as np 9 | import dill 10 | 11 | 12 | class pstr(str): 13 | def __sub__(self, other) -> str: 14 | result = [] 15 | for c in self: 16 | if c in other: 17 | continue 18 | 
result.append(c) 19 | return "".join(result) 20 | 21 | 22 | class ThreadWithReturnValue(Thread): 23 | """ 24 | referenced from https://stackoverflow.com/questions/6893968/how-to-get-the-return-value-from-a-thread-in-python 25 | """ 26 | 27 | def __init__( 28 | self, 29 | group=None, 30 | target=None, 31 | name=None, 32 | args=(), 33 | kwargs=None  # avoid a mutable default; Thread maps None to {} 34 | ): 35 | Thread.__init__(self, group, target, name, args, kwargs) 36 | self._return = None 37 | 38 | def run(self): 39 | if self._target is not None: 40 | self._return = self._target(*self._args, **self._kwargs) 41 | 42 | def join(self, *args): 43 | Thread.join(self, *args) 44 | return self._return 45 | 46 | 47 | def divide2int1(y: int, x: int) -> int:  # ceiling division, pure Python 48 | res = y // x 49 | if y % x != 0: 50 | res += 1 51 | return res 52 | 53 | 54 | def divide2int(y: int, x: int) -> int:  # ceiling division via numpy 55 | return np.ceil(y / x).astype(np.int_) 56 | 57 | 58 | def generate_batches_by_size(lst: List[Any], batch_size: int 59 | ) -> Generator[List[Any], None, None]: 60 | batch_num = divide2int(len(lst), batch_size) 61 | for i in range(batch_num): 62 | yield lst[i * batch_size: (i + 1) * batch_size] 63 | 64 | 65 | def generate_batches_by_num(lst: List[Any], batch_num: int 66 | ) -> Generator[List[Any], None, None]: 67 | batch_size = divide2int(len(lst), batch_num) 68 | return generate_batches_by_size(lst, batch_size) 69 | 70 | 71 | # referenced from: 72 | # https://izziswift.com/python-multiprocessing-picklingerror-cant-pickle/ 73 | def run_dill_encoded(payload): 74 | fun, args, kwargs = dill.loads(payload) 75 | return fun(*args, **kwargs) 76 | 77 | 78 | def apply_async(pool, fun, args, kwargs): 79 | payload = dill.dumps((fun, args, kwargs)) 80 | return pool.apply_async(run_dill_encoded, (payload,)) 81 | 82 | 83 | def concurring( 84 | func=None, 85 | type: str = "thread_executor", 86 | max_workers: int = mp.cpu_count() 87 | ) -> Callable: 88 | """ 89 | decorator for running a function concurrently over batches. 90 | 91 | Parameters 92 | ----------- 93 | type: one of thread_pool, process_pool, thread_executor, thread 94 | these are different implementations of the same behavior.
95 | max_workers: number of workers 96 | """ 97 | 98 | if func is None: 99 | return partial(concurring, type=type, max_workers=max_workers) 100 | 101 | if max_workers <= 0: 102 | raise ValueError("pnlp: max_workers must be > 0") 103 | 104 | def _thread(engine, func, batches, **kwargs): 105 | jobs = [] 106 | for batch in batches: 107 | job = engine(target=func, args=(batch, ), kwargs=kwargs) 108 | jobs.append(job) 109 | job.start() 110 | for job in jobs: 111 | yield job.join() 112 | 113 | def _pool(engine, func, batches, max_workers, **kwargs): 114 | with engine(processes=max_workers) as pool: 115 | jobs = [apply_async(pool, func, (batch, ), kwargs) 116 | for batch in batches] 117 | for job in jobs: 118 | yield job.get() 119 | 120 | def _executor(engine, func, batches, max_workers, **kwargs): 121 | with engine(max_workers=max_workers) as executor: 122 | jobs = [executor.submit(func, batch, **kwargs) 123 | for batch in batches] 124 | for f in jobs: 125 | yield f.result() 126 | 127 | @wraps(func) 128 | def wrapper(lst: List[Any], *args, **kwargs): 129 | batches = generate_batches_by_num(lst, max_workers) 130 | if type == "thread_pool": 131 | return _pool( 132 | ThreadPool, func, batches, max_workers, **kwargs) 133 | elif type == "process_pool": 134 | return _pool( 135 | Pool, func, batches, max_workers, **kwargs) 136 | elif type == "thread_executor": 137 | return _executor( 138 | ThreadPoolExecutor, func, batches, max_workers, **kwargs) 139 | elif type == "thread": 140 | return _thread( 141 | ThreadWithReturnValue, func, batches, **kwargs) 142 | else: 143 | err_info = f"pnlp: does not support type {type}, use one of " 144 | err_info += "thread_pool, process_pool, thread_executor, thread" 145 | raise ValueError(err_info) 146 | 147 | return wrapper 148 | 149 | 150 | def run_in_new_thread( 151 | func: Callable, *args, **kwargs 152 | ): 153 | if kwargs: 154 | func = partial(func, **kwargs) 155 | t = Thread(target=func, name="BackgroundRun", args=args) 156 | t.start() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="pnlp", 8 | version="0.4.16", 9 | author="Yam", 10 | author_email="haoshaochun@gmail.com", 11 | description="A pre/post-processing tool for NLP.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/hscspring/pnlp", 15 | include_package_data=True, 16 | # packages are found relative to setup.py by default, so no `package_dir` is needed 17 | # if the sources lived in another directory, it would be declared via `package_dir` 18 | packages=setuptools.find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 19 | install_requires=[ 20 | 'addict', 21 | 'pyyaml', 22 | 'dill', 23 | 'numpy' 24 | ], 25 | package_data={ 26 | 'pnlp': ["stopwords/*"], 27 | }, 28 | classifiers=[ 29 | "Programming Language :: Python :: 3", 30 | "License :: OSI Approved :: Apache Software License", 31 | "Operating System :: OS Independent", 32 | ], 33 | ) 34 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hscspring/pnlp/87178634727231ba16663bb99ec40fa3668b226e/tests/__init__.py --------------------------------------------------------------------------------
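A minimal usage sketch for the `concurring` decorator defined in pnlp/utils.py above; `square_all` and the sample list are illustrative assumptions, not part of the package. The wrapped function receives one batch of the input list per worker, and the call returns a generator with one result per batch:

from pnlp.utils import concurring

@concurring(type="thread_executor", max_workers=4)
def square_all(batch):
    # each worker squares its own slice of the input list
    return [x * x for x in batch]

gen = square_all(list(range(10)))           # generator: one item per batch
flat = [x for batch in gen for x in batch]  # flatten the per-batch results
assert sorted(flat) == [x * x for x in range(10)]

Because `generate_batches_by_num` splits the list into `max_workers` batches up front, throughput depends on the batches being roughly even; `type="process_pool"` trades thread-safety concerns for dill pickling overhead.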
/tests/piop_data/a.md: -------------------------------------------------------------------------------- 1 | line 1 in a. 2 | line 2 in a. 3 | line 3 in a. -------------------------------------------------------------------------------- /tests/piop_data/b.txt: -------------------------------------------------------------------------------- 1 | line 1 in b. 2 | line 2 in b. 3 | line 3 in b. -------------------------------------------------------------------------------- /tests/piop_data/c.data: -------------------------------------------------------------------------------- 1 | line 1 in c. 2 | line 2 in c. 3 | line 3 in c. -------------------------------------------------------------------------------- /tests/piop_data/csv.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 1,title1 3 | 2,title2 -------------------------------------------------------------------------------- /tests/piop_data/first/fa.md: -------------------------------------------------------------------------------- 1 | line 1 in fa. 2 | line 2 in fa. 3 | line 3 in fa. -------------------------------------------------------------------------------- /tests/piop_data/first/fb.txt: -------------------------------------------------------------------------------- 1 | line 1 in fb. 2 | line 2 in fb. 3 | line 3 in fb. -------------------------------------------------------------------------------- /tests/piop_data/first/fc.data: -------------------------------------------------------------------------------- 1 | line 1 in fc. 2 | line 2 in fc. 3 | line 3 in fc. -------------------------------------------------------------------------------- /tests/piop_data/first/second/sa.md: -------------------------------------------------------------------------------- 1 | line 1 in sa. 2 | line 2 in sa. 3 | line 3 in sa. -------------------------------------------------------------------------------- /tests/piop_data/first/second/sb.txt: -------------------------------------------------------------------------------- 1 | line 1 in sb. 2 | line 2 in sb. 3 | line 3 in sb. -------------------------------------------------------------------------------- /tests/piop_data/first/second/sc.data: -------------------------------------------------------------------------------- 1 | line 1 in sc. 2 | line 2 in sc. 3 | line 3 in sc. -------------------------------------------------------------------------------- /tests/piop_data/json.json: -------------------------------------------------------------------------------- 1 | { 2 | "json1": "this is line 1", 3 | "json2": "这是第二行。" 4 | } -------------------------------------------------------------------------------- /tests/piop_data/list_dict.json: -------------------------------------------------------------------------------- 1 | {"name": "Yam", "age": 20} 2 | {"name": "May", "age": 21} 3 | -------------------------------------------------------------------------------- /tests/piop_data/outfile.file: -------------------------------------------------------------------------------- 1 | line 1 of outfile. 
2 | 这是 outfile 的第二行。 3 | -------------------------------------------------------------------------------- /tests/piop_data/outfile.listdict: -------------------------------------------------------------------------------- 1 | {"name": "Yam", "age": 20} 2 | -------------------------------------------------------------------------------- /tests/piop_data/outjson.json: -------------------------------------------------------------------------------- 1 | { 2 | "outjson1": "this is line 1.", 3 | "outjson2": "这是第二行。" 4 | } -------------------------------------------------------------------------------- /tests/piop_data/yaml.yaml: -------------------------------------------------------------------------------- 1 | 元旦: 2 | - 新年快乐 3 | - 元旦快乐 4 | - 节日快乐 5 | 周末: 6 | - 周末快乐! 7 | - 周末愉快! -------------------------------------------------------------------------------- /tests/test_pcut.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pytest 3 | 4 | from pnlp.pcut import cut_sentence, cut_zhchar, combine_bucket, cut_sub_sentence 5 | 6 | 7 | def test_text2zhchar1(): 8 | text = "我喜欢你。" 9 | ret = cut_zhchar(text) 10 | assert ret == ["我", "喜", "欢", "你", "。"] 11 | 12 | 13 | def test_text2zhchar2(): 14 | text = "我 喜欢 你。" 15 | ret = cut_zhchar(text) 16 | assert ret == ["我", " ", "喜", "欢", " ", "你", "。"] 17 | 18 | 19 | def test_text2zhchar3(): 20 | text = "我喜欢like你。" 21 | ret = cut_zhchar(text) 22 | assert ret == ["我", "喜", "欢", "like", "你", "。"] 23 | 24 | 25 | def test_text2zhchar4(): 26 | text = "我喜欢你233。" 27 | ret = cut_zhchar(text) 28 | assert ret == ["我", "喜", "欢", "你", "233", "。"] 29 | 30 | 31 | def test_text2zhchar5(): 32 | text = "我喜欢你3.14。" 33 | ret = cut_zhchar(text) 34 | assert ret == ["我", "喜", "欢", "你", "3.14", "。"] 35 | 36 | 37 | def test_text2zhchar6(): 38 | text = "我喜欢你100%。" 39 | ret = cut_zhchar(text) 40 | assert ret == ["我", "喜", "欢", "你", "100%", "。"] 41 | 42 | 43 | def test_text2zhchar7(): 44 | text = "我喜欢你2/3。" 45 | ret = cut_zhchar(text) 46 | assert ret == ["我", "喜", "欢", "你", "2/3", "。"] 47 | 48 | 49 | def test_text2zhchar8(): 50 | text = "我喜欢你-2。" 51 | ret = cut_zhchar(text) 52 | assert ret == ["我", "喜", "欢", "你", "-2", "。"] 53 | 54 | 55 | def test_text2zhchar9(): 56 | text = "我喜欢你2、3。" 57 | ret = cut_zhchar(text) 58 | assert ret == ["我", "喜", "欢", "你", "2", "、", "3", "。"] 59 | 60 | 61 | def test_text2zhchar10(): 62 | text = "我喜欢你。。" 63 | ret = cut_zhchar(text) 64 | assert ret == ["我", "喜", "欢", "你", "。", "。"] 65 | 66 | 67 | def test_text2zhchar11(): 68 | text = "我喜欢你A-B。" 69 | ret = cut_zhchar(text) 70 | assert ret == ["我", "喜", "欢", "你", "A", "-", "B", "。"] 71 | 72 | 73 | def test_text2zhchar12(): 74 | text = "我喜欢你A_B。" 75 | ret = cut_zhchar(text) 76 | assert ret == ["我", "喜", "欢", "你", "A", "_", "B", "。"] 77 | 78 | 79 | def test_text2zhchar13(): 80 | text = "我喜欢你C++。" 81 | ret = cut_zhchar(text) 82 | assert ret == ["我", "喜", "欢", "你", "C", "+", "+", "。"] 83 | 84 | 85 | def test_text2zhchar14(): 86 | text = "我喜欢你R&B。" 87 | ret = cut_zhchar(text) 88 | assert ret == ["我", "喜", "欢", "你", "R", "&", "B", "。"] 89 | 90 | 91 | def test_text2zhchar15(): 92 | text = "#我喜欢你。" 93 | ret = cut_zhchar(text) 94 | assert ret == ["#", "我", "喜", "欢", "你", "。"] 95 | 96 | 97 | def test_text2zhchar16(): 98 | text = "我 喜欢 你。" 99 | ret = cut_zhchar(text, remove_blank=True) 100 | assert ret == ["我", "喜", "欢", "你", "。"] 101 | 102 | 103 | def test_text2zhchar17(): 104 | text = "我 love you." 
105 | ret = cut_zhchar(text, remove_blank=True) 106 | assert ret == ["我", "love", "you", "."] 107 | 108 | 109 | def test_text2zhchar18(): 110 | text = "lo-ve." 111 | ret = cut_zhchar(text, remove_blank=True) 112 | assert ret == ['lo', '-', 've', '.'] 113 | 114 | 115 | def test_text2zhchar19(): 116 | text = "v-.f " 117 | ret = cut_zhchar(text, remove_blank=True) 118 | assert ret == ['v', '-', '.', 'f'] 119 | 120 | 121 | def test_text2zhchar20(): 122 | text = "-1.2." 123 | ret = cut_zhchar(text) 124 | assert ret == ['-1.2', '.'] 125 | 126 | 127 | def test_text2zhchar21(): 128 | text = "1-2-3-" 129 | ret = cut_zhchar(text) 130 | assert ret == ['1-2-3-'] 131 | 132 | 133 | def test_text2zhchar22(): 134 | text = "-1-2-3" 135 | ret = cut_zhchar(text) 136 | assert ret == ['-1-2-3'] 137 | 138 | 139 | def test_text2zhchar23(): 140 | text = "1.2.3" 141 | ret = cut_zhchar(text) 142 | assert ret == ['1.2.3'] 143 | 144 | 145 | def test_text2zhchar24(): 146 | text = "1..2" 147 | ret = cut_zhchar(text) 148 | assert ret == ['1..2'] 149 | 150 | 151 | def test_text2zhchar25(): 152 | text = "1.2..." 153 | ret = cut_zhchar(text) 154 | assert ret == ['1.2', '.', '.', '.'] 155 | 156 | 157 | def test_text2zhchar26(): 158 | text = "1...2..." 159 | ret = cut_zhchar(text) 160 | assert ret == ['1...2', '.', '.', '.'] 161 | 162 | 163 | def test_text2zhchar27(): 164 | text = """ 165 | x..x 1.2, -1.23 lo-.ve.. -1-2-3- 2-2. -1.2. 3/5 1.2.3 1..2 2% 3.5% -2.0% 166 | """ 167 | ret = cut_zhchar(text, remove_blank=True) 168 | assert ret == [ 169 | 'x', '.', '.', 'x', 170 | '1.2', ',', '-1.23', 171 | 'lo', '-', '.', 've', '.', '.', 172 | '-1-2-3-', '2-2', '.', 173 | '-1.2', '.', '3/5', 174 | '1.2.3', '1..2', 175 | '2%', '3.5%', '-2.0%' 176 | ] 177 | 178 | 179 | def test_text2sent1(): 180 | text = "我喜欢你,你呢?哈哈,我不告诉你。" 181 | ret = cut_sentence(text) 182 | assert len(ret) == 2 183 | assert ret[0] == "我喜欢你,你呢?" 184 | assert ret[1] == "哈哈,我不告诉你。" 185 | 186 | 187 | def test_text2sent2(): 188 | text = "我喜欢你,你呢!哈哈,我不告诉你" 189 | ret = cut_sentence(text) 190 | assert len(ret) == 2 191 | assert ret[0] == "我喜欢你,你呢!" 192 | assert ret[1] == "哈哈,我不告诉你" 193 | 194 | 195 | def test_text2sent3(): 196 | text = "我喜欢你,「哈哈」。我不告诉你~~~" 197 | ret = cut_sentence(text) 198 | assert len(ret) == 2 199 | assert ret[0] == "我喜欢你,「哈哈」。" 200 | assert ret[1] == "我不告诉你~~~" 201 | 202 | 203 | def test_text2sent4(): 204 | text = "我喜欢你,“哈哈”.我不告诉你……" 205 | ret = cut_sentence(text) 206 | assert len(ret) == 2 207 | assert ret[0] == "我喜欢你,“哈哈”." 208 | assert ret[1] == "我不告诉你……" 209 | 210 | 211 | def test_text2sent5(): 212 | text = "我喜欢你,“哈哈” 我不告诉你;" 213 | ret = cut_sentence(text) 214 | assert len(ret) == 1 215 | assert ret[0] == "我喜欢你,“哈哈” 我不告诉你;" 216 | 217 | 218 | def test_text2sent6(): 219 | text = "我喜欢你,“哈哈。” 我不告诉你!" 220 | ret = cut_sentence(text) 221 | assert len(ret) == 2 222 | assert ret[0] == "我喜欢你,“哈哈。”" 223 | assert ret[1] == " 我不告诉你!" 224 | 225 | 226 | def test_text2sent7(): 227 | text = "我喜欢你(haha). 我不告诉你~" 228 | ret = cut_sentence(text) 229 | assert len(ret) == 2 230 | assert ret[0] == "我喜欢你(haha)." 
231 | assert ret[1] == " 我不告诉你~" 232 | 233 | 234 | def test_text2sent8(): 235 | text = "我喜欢你, “哈哈……”。“我不告诉你.”" 236 | ret = cut_sentence(text) 237 | assert len(ret) == 2 238 | assert ret[0] == "我喜欢你, “哈哈……”。" 239 | assert ret[1] == "“我不告诉你.”" 240 | 241 | 242 | def test_text2sent9(): 243 | text = "我喜欢你&“哈哈?”“我不告诉你”" 244 | ret = cut_sentence(text) 245 | assert len(ret) == 2 246 | assert ret[0] == "我喜欢你&“哈哈?”" 247 | assert ret[1] == "“我不告诉你”" 248 | 249 | 250 | def test_text2sent10(): 251 | text = "我喜欢你," 252 | ret = cut_sentence(text) 253 | assert len(ret) == 1 254 | assert ret[0] == "我喜欢你," 255 | 256 | 257 | def test_text2sent11(): 258 | text = "我喜欢你" 259 | ret = cut_sentence(text) 260 | assert len(ret) == 1 261 | assert ret[0] == "我喜欢你" 262 | 263 | 264 | def test_text2sent12(): 265 | text = "我喜欢\n你" 266 | ret = cut_sentence(text) 267 | assert len(ret) == 2 268 | assert ret == ["我喜欢\n", "你"] 269 | 270 | 271 | def test_text2sent13(): 272 | text = "我喜欢。\n你" 273 | ret = cut_sentence(text) 274 | assert len(ret) == 3 275 | assert ret == ["我喜欢。", "\n", "你"] 276 | 277 | 278 | def test_text2sent14(): 279 | text = "我喜欢。\n.你" 280 | ret = cut_sentence(text) 281 | assert len(ret) == 3 282 | assert ret == ["我喜欢。", "\n", ".你"] 283 | 284 | 285 | def test_text2sent15(): 286 | text = "我喜欢\n.你" 287 | ret = cut_sentence(text) 288 | assert len(ret) == 2 289 | assert ret == ["我喜欢\n", ".你"] 290 | 291 | 292 | def test_text2sent16(): 293 | text = "我喜欢 .你" 294 | ret = cut_sentence(text) 295 | assert len(ret) == 2 296 | assert ret == ["我喜欢 .", "你"] 297 | 298 | 299 | def test_text2sent17(): 300 | text = "我喜欢 .你" 301 | ret = cut_sentence(text) 302 | assert len(ret) == 2 303 | assert ret == ["我喜欢 .", "你"] 304 | 305 | 306 | @pytest.fixture 307 | def parts(): 308 | return [ 309 | '习近平指出', 310 | '中方不仅维护中国人民生命安全和身体健康', 311 | '也维护世界人民生命安全和身体健康', 312 | '我们本着公开', 313 | '透明', 314 | ] 315 | 316 | 317 | def test_combine_bucket1(parts): 318 | ret = combine_bucket(parts.copy(), 5) 319 | assert ret == parts 320 | ret = combine_bucket(parts.copy(), 10) 321 | assert ret == [ 322 | '习近平指出', 323 | '中方不仅维护中国人民生命安全和身体健康', 324 | '也维护世界人民生命安全和身体健康', 325 | '我们本着公开透明', 326 | ] 327 | 328 | 329 | def test_combine_bucket2(parts): 330 | ret = combine_bucket(parts.copy(), 5, truncate=True) 331 | assert ret == [ 332 | '习近平指出', 333 | '中方不仅维', 334 | '也维护世界', 335 | '我们本着公', 336 | '透明' 337 | ] 338 | ret = combine_bucket(parts.copy(), 10, truncate=True) 339 | assert ret == [ 340 | '习近平指出', 341 | '中方不仅维护中国人民', 342 | '也维护世界人民生命安', 343 | '我们本着公开透明', 344 | ] 345 | 346 | 347 | def test_combine_bucket3(parts): 348 | ret = combine_bucket(parts.copy(), 5, truncate=True, keep_remain=True) 349 | assert ret == [ 350 | '习近平指出', 351 | '中方不仅维', 352 | '护中国人民', 353 | '生命安全和', 354 | '身体健康', 355 | '也维护世界', 356 | '人民生命安', 357 | '全和身体健', 358 | '康', 359 | '我们本着公', 360 | '开', 361 | '透明' 362 | ] 363 | ret = combine_bucket(parts.copy(), 10, truncate=True, keep_remain=True) 364 | assert ret == [ 365 | '习近平指出', 366 | '中方不仅维护中国人民', 367 | '生命安全和身体健康', 368 | '也维护世界人民生命安', 369 | '全和身体健康', 370 | '我们本着公开透明', 371 | ] 372 | 373 | 374 | 375 | @pytest.mark.parametrize("inp, expected", [ 376 | ("1,2,3,4", ["1,", "2,", "3,", "4"]), 377 | ("2/5是0.4。", ["2/5是0.4。"]), 378 | ("2/5是0.4.", ["2/5是0.4."]), 379 | ("2除以8等于0.25。", ["2除以8等于0.25。"]), 380 | ]) 381 | def test_cut_subsent(inp, expected): 382 | res = cut_sub_sentence(inp) 383 | assert res == expected -------------------------------------------------------------------------------- /tests/test_penh.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from pnlp.penh import swap, SentenceLevelSampler, TokenLevelSampler 3 | from pnlp.pcut import psubsent, cut_part, cut_zhchar 4 | 5 | 6 | def test_swap_middle(): 7 | lst = [1, 2, 3, 4, 5] 8 | new = swap(lst, 2, 0, 4) 9 | assert new == [1, 3, 2, 4, 5] or new == [1, 2, 4, 3, 5] 10 | 11 | 12 | def test_swap_start(): 13 | lst = [1, 2, 3] 14 | new = swap(lst, 0, 0, 2) 15 | assert new == [2, 1, 3] 16 | 17 | 18 | def test_swap_end(): 19 | lst = [1, 2, 3] 20 | new = swap(lst, 2, 0, 2) 21 | assert new == [1, 3, 2] 22 | 23 | 24 | def cut_words(text: str) -> list: 25 | return [ 26 | '人', '为什么', '活着', '?', 27 | '生而为', '人', '必须', '要', '有', '梦想', '!', 28 | '还要', '有', '尽可能', '多', '的', '精神', '体验', '。'] 29 | 30 | 31 | def cut_wps(text: str) -> list: 32 | return [ 33 | ('人', 'n'), ('为什么', 'r'), ('活着', 'v'), ('?', 'w'), 34 | ('生而为人', 'v'), ('必须', 'd'), ('要', 'v'), 35 | ('有', 'v'), ('梦想', 'n'), ('!', 'w'), 36 | ('还要', 'v'), ('有', 'v'), ('尽可能', 'd'), ('多', 'a'), 37 | ('的', 'u'), ('精神', 'n'), ('体验', 'vn'), ('。', 'w') 38 | ] 39 | 40 | 41 | def test_token_level_sampler(): 42 | tls = TokenLevelSampler() 43 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 44 | res = tls.make_samples(text) 45 | assert type(res) == dict 46 | assert len(res) == 4 47 | 48 | 49 | def test_token_level_sampler_none(): 50 | tls = TokenLevelSampler(types=[]) 51 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 52 | res = tls.make_samples(text) 53 | assert res == {} 54 | 55 | 56 | def test_token_level_sampler_none_text(): 57 | tls = TokenLevelSampler() 58 | text = "" 59 | res = tls.make_samples(text) 60 | assert res == {} 61 | 62 | 63 | def test_token_level_sampler_single_sent(): 64 | tls = TokenLevelSampler() 65 | text = "人为什么活着?" 
66 | res = tls.make_samples(text) 67 | assert len(res) == 4 68 | 69 | 70 | def test_token_level_sampler_independent_sampling(): 71 | tls = TokenLevelSampler() 72 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 73 | tokens = cut_zhchar(text) 74 | res = tls.independent_sampling(tokens) 75 | assert type(res) == list 76 | assert len(res) == 3 77 | 78 | 79 | def test_token_level_sampler_dependent_sampling(): 80 | tls = TokenLevelSampler() 81 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 82 | tokens = cut_zhchar(text) 83 | res = tls.dependent_sampling(tokens) 84 | assert type(res) == list 85 | assert type(res[0]) == str 86 | 87 | 88 | def test_token_level_sampler_delete(): 89 | tls = TokenLevelSampler(types=["delete"]) 90 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 91 | res = tls.make_samples(text) 92 | assert type(res) == dict 93 | assert len(res) == 2 94 | 95 | 96 | def test_token_level_sampler_swap(): 97 | tls = TokenLevelSampler(types=["swap"]) 98 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 99 | res = tls.make_samples(text) 100 | assert type(res) == dict 101 | assert len(res) == 2 102 | 103 | 104 | def test_token_level_sampler_insert(): 105 | tls = TokenLevelSampler(types=["insert"]) 106 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 107 | res = tls.make_samples(text) 108 | assert type(res) == dict 109 | assert len(res) == 2 110 | 111 | 112 | def test_token_level_sampler_token_spliter(): 113 | tls = TokenLevelSampler() 114 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 115 | res = tls.make_samples(text, cut_words) 116 | assert len(res) == 4 117 | 118 | 119 | def test_token_level_sampler_token_pos_spliter(): 120 | tls = TokenLevelSampler() 121 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 122 | res = tls.make_samples(text, cut_wps) 123 | assert len(res) == 4 124 | 125 | 126 | def test_token_level_sampler_delete_sampling(): 127 | tls = TokenLevelSampler() 128 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 129 | tokens = cut_words(text) 130 | res = tls.delete_sampling(tokens, [2]) 131 | assert type(res) == list 132 | assert len(res) + 1 == len(tokens) 133 | 134 | 135 | def test_token_level_sampler_insert_sampling(): 136 | tls = TokenLevelSampler() 137 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 138 | tokens = cut_words(text) 139 | res = tls.insert_sampling(tokens, [2, 6]) 140 | assert type(res) == list 141 | assert len(res) - 2 == len(tokens) 142 | 143 | 144 | def test_token_level_sampler_swap_sampling(): 145 | tls = TokenLevelSampler() 146 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 147 | tokens = cut_words(text) 148 | res = tls.swap_sampling(tokens, [5, 8]) 149 | assert type(res) == list 150 | assert len(res) == len(tokens) 151 | 152 | 153 | def test_sentence_level_sampler(): 154 | sls = SentenceLevelSampler() 155 | text = "我爱你。你爱我。" 156 | res = sls.make_samples(text) 157 | assert type(res) == dict 158 | assert len(res) == 4 159 | 160 | 161 | def test_sentence_level_sampler_none(): 162 | sls = SentenceLevelSampler([]) 163 | text = "我爱你。你爱我。" 164 | assert sls.make_samples(text) == {} 165 | 166 | 167 | def test_sentence_level_sampler_single_sent(): 168 | sls = SentenceLevelSampler() 169 | text = "我爱你。" 170 | assert len(sls.make_samples(text)) == 4 171 | 172 | 173 | def test_sentence_level_sampler_none_text(): 174 | sls = SentenceLevelSampler() 175 | text = "" 176 | assert sls.make_samples(text) == {} 177 | 178 | 179 | def test_sentence_level_sampler_independent_sampling(): 180 | sls = SentenceLevelSampler() 181 | text = "写代码。写好代码。" 182 | text_list = cut_part(text, psubsent) 183 | res = sls.independent_sampling(text_list) 
184 | assert type(res) == list 185 | assert len(res) == 3 186 | assert len(res[0]) == 1 187 | assert len(res[1]) == 2 188 | assert len(res[2]) == 3 189 | 190 | 191 | def test_sentence_level_sampler_dependent_sampling(): 192 | sls = SentenceLevelSampler() 193 | text = "写代码。多写代码。写好代码。" 194 | text_list = cut_part(text, psubsent) 195 | res = sls.dependent_sampling(text_list) 196 | assert type(res) == list 197 | assert len(res) == 3 198 | 199 | 200 | def test_sentence_level_sampler_insert(): 201 | sls = SentenceLevelSampler(types=["insert"]) 202 | text = "我爱你。你爱我。NLP 很有意思。简洁最重要。" 203 | res = sls.make_samples(text) 204 | assert len(res) == 2 205 | 206 | 207 | def test_sentence_level_sampler_delete(): 208 | sls = SentenceLevelSampler(types=["delete"]) 209 | text = "我爱你。你爱我。NLP 很有意思。简洁最重要。" 210 | res = sls.make_samples(text) 211 | assert len(res) == 2 212 | 213 | 214 | def test_sentence_level_sampler_swap(): 215 | sls = SentenceLevelSampler(types=["swap"]) 216 | text = "我爱你。你爱我。NLP 很有意思。简洁最重要。" 217 | res = sls.make_samples(text) 218 | assert len(res) == 2 219 | -------------------------------------------------------------------------------- /tests/test_piop.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import types 4 | 5 | from pnlp.piop import write_json, write_file 6 | from pnlp.piop import Reader, read_file, read_lines, read_json, read_yaml, read_csv 7 | from pnlp.piop import read_file_to_list_dict, write_list_dict_to_file 8 | from pnlp.piop import check_dir 9 | 10 | DATA_PATH = os.path.join('tests', 'piop_data') 11 | 12 | 13 | @pytest.fixture(params=['*.md', '*.txt', '*.data', 'f*.*', '*c.*']) 14 | def get_Reader_path_match_res(request): 15 | res = [] 16 | reader = Reader(request.param) 17 | for line in reader(DATA_PATH): 18 | res.append(line) 19 | return res 20 | 21 | 22 | def test_Reader_path_match(get_Reader_path_match_res): 23 | assert len(get_Reader_path_match_res) == 9 24 | assert get_Reader_path_match_res[0].lid == 0 25 | assert get_Reader_path_match_res[-1].lid == 2 26 | 27 | 28 | def test_Reader_file(): 29 | res = [] 30 | reader = Reader() 31 | for line in reader(os.path.join(DATA_PATH, 'a.md')): 32 | res.append(line) 33 | assert len(res) == 3 34 | assert res[0].text == 'line 1 in a.' 
35 | 36 | 37 | def test_Reader_gen_files(): 38 | paths = Reader.gen_files(DATA_PATH, '*.md') 39 | assert isinstance(paths, types.GeneratorType) 40 | assert len(list(paths)) == 3 41 | 42 | 43 | def test_Reader_gen_files_with_regex(): 44 | paths = Reader.gen_files(DATA_PATH, "(md)|(txt)", True) 45 | assert isinstance(paths, types.GeneratorType) 46 | assert len(list(paths)) == 6 47 | 48 | 49 | def test_Reader_gen_articles(): 50 | paths = Reader.gen_files(DATA_PATH, '*.txt') 51 | articles = Reader.gen_articles(paths) 52 | assert isinstance(articles, types.GeneratorType) 53 | assert len(list(articles)) == 3 54 | 55 | 56 | def test_Reader_gen_flines(): 57 | paths = Reader.gen_files(DATA_PATH, '*.txt') 58 | articles = Reader.gen_articles(paths) 59 | lines = Reader.gen_flines(articles) 60 | assert isinstance(lines, types.GeneratorType) 61 | assert len(list(lines)) == 9 62 | 63 | 64 | def test_Reader_gen_plines(): 65 | lines = Reader.gen_plines(os.path.join(DATA_PATH, 'b.txt')) 66 | assert isinstance(lines, types.GeneratorType) 67 | assert len(list(lines)) == 3 68 | 69 | 70 | @pytest.fixture 71 | def get_read_data(): 72 | return os.path.join(DATA_PATH, 'c.data') 73 | 74 | 75 | def test_read_file(get_read_data): 76 | data = read_file(get_read_data) 77 | assert data == 'line 1 in c.\nline 2 in c.\nline 3 in c.' 78 | assert type(data) == str 79 | 80 | 81 | @pytest.mark.parametrize("count", [0, 1, 2, -1]) 82 | def test_read_lines(get_read_data, count): 83 | data = read_lines(get_read_data, count=count) 84 | if count != -1: 85 | assert len(data) == count 86 | else: 87 | assert data == ['line 1 in c.', 'line 2 in c.', 'line 3 in c.'] 88 | assert type(data) == list 89 | 90 | 91 | def test_read_json(): 92 | data = read_json(os.path.join(DATA_PATH, 'json.json')) 93 | assert type(data) == dict 94 | assert data == { 95 | "json1": "this is line 1", 96 | "json2": "这是第二行。" 97 | } 98 | 99 | 100 | def test_read_yaml(): 101 | data = read_yaml(os.path.join(DATA_PATH, 'yaml.yaml')) 102 | assert type(data) == dict 103 | assert data == {'元旦': ['新年快乐', '元旦快乐', '节日快乐'], 104 | '周末': ['周末快乐!', '周末愉快!']} 105 | 106 | 107 | def test_read_csv(): 108 | data = read_csv(os.path.join(DATA_PATH, 'csv.csv')) 109 | assert type(data) == list 110 | assert data == [['id', 'title'], ['1', 'title1'], ['2', 'title2']] 111 | 112 | 113 | def test_read_file_to_list_dict(): 114 | data = read_file_to_list_dict(os.path.join(DATA_PATH, "list_dict.json")) 115 | assert type(data) == list 116 | assert type(data[0]) == dict 117 | 118 | 119 | def test_write_json(): 120 | data = {"outjson1": "this is line 1.", 121 | "outjson2": "这是第二行。"} 122 | write_json(os.path.join(DATA_PATH, 'outjson.json'), 123 | data, indent=4, ensure_ascii=False) 124 | 125 | 126 | def test_write_file(): 127 | data = ['line 1 of outfile.', '这是 outfile 的第二行。'] 128 | write_file(os.path.join(DATA_PATH, 'outfile.file'), data) 129 | 130 | 131 | def test_write_list_dict_to_file(): 132 | data = [{"name": "Yam", "age": 20}] 133 | write_list_dict_to_file(os.path.join(DATA_PATH, "outfile.listdict"), data) 134 | 135 | 136 | def test_check_dir(): 137 | assert check_dir(DATA_PATH) is None 138 | -------------------------------------------------------------------------------- /tests/test_pmag.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pnlp.pmag import MagicDict, get_unique_fn 3 | 4 | 5 | def test_magic(): 6 | tmd = MagicDict() 7 | tmd["a"]["b"]["c"] = 1 8 | assert tmd["a"]["b"]["c"] == 1 9 | 10 | 11 | def test_magic_set_get(): 
12 | d = MagicDict() 13 | d["a"]["b"] = 2 14 | assert d.a.b == 2 15 | 16 | 17 | def test_magic_reverse(): 18 | dx = {1: "a", 2: "a", 3: "a", 4: "b"} 19 | assert MagicDict.reverse(dx) == {"a": [1, 2, 3], "b": 4} 20 | 21 | 22 | 23 | @pytest.mark.parametrize("inp,oup,level", [ 24 | ("a.md", "a.md", 0), 25 | ("a.md", "a.md", 1), 26 | ("a.md", "a.md", 10), 27 | ("a/b.md", "a_b.md", 0), 28 | ("a/b.md", "a_b.md", 1), 29 | ("a/b.md", "a_b.md", 10), 30 | ("a/b/c.md", "a_b_c.md", 0), 31 | ("a/b/c.md", "b_c.md", 1), 32 | ("a/b/c.md", "a_b_c.md", 10), 33 | ("/a/b/c.md", "a_b_c.md", 0), 34 | ("/a/b/c.md", "b_c.md", 1), 35 | ("/a/b/c.md", "a_b_c.md", 10), 36 | ]) 37 | def test_get_unique_fn(inp, level, oup): 38 | res = get_unique_fn(inp, level) 39 | assert res == oup 40 | -------------------------------------------------------------------------------- /tests/test_pnorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pnlp.pnorm import NumNorm 4 | 5 | c2a = NumNorm() 6 | 7 | 8 | def test_chinese2arabic1(): 9 | s = "一亿三千万" 10 | assert c2a.zh2num(s) == 130000000 11 | 12 | 13 | def test_chinese2arabic2(): 14 | s = "一万五千六百三十八" 15 | assert c2a.zh2num(s) == 15638 16 | 17 | 18 | def test_chinese2arabic3(): 19 | s = "壹仟两百" 20 | assert c2a.zh2num(s) == 1200 21 | 22 | 23 | def test_chinese2arabic4(): 24 | s = "壹仟两百零三" 25 | assert c2a.zh2num(s) == 1203 26 | 27 | 28 | def test_chinese2arabic5(): 29 | s = "壹仟两百一十五" 30 | assert c2a.zh2num(s) == 1215 31 | 32 | 33 | def test_chinese2arabic6(): 34 | s = "壹仟两百九十" 35 | assert c2a.zh2num(s) == 1290 36 | 37 | 38 | def test_chinese2arabic7(): 39 | s = "十一" 40 | assert c2a.zh2num(s) == 11 41 | 42 | 43 | def test_chinese2arabic8(): 44 | s = "八十八" 45 | assert c2a.zh2num(s) == 88 46 | 47 | 48 | def test_chinese2arabic9(): 49 | s = "三" 50 | assert c2a.zh2num(s) == 3 51 | 52 | 53 | def test_chinese2arabic10(): 54 | s = "两百五十" 55 | assert c2a.zh2num(s) == 250 56 | 57 | 58 | def test_chinese2arabic11(): 59 | s = "两百" 60 | assert c2a.zh2num(s) == 200 61 | 62 | 63 | def test_chinese2arabic12(): 64 | s = "两百零五" 65 | assert c2a.zh2num(s) == 205 66 | 67 | 68 | def test_chinese2arabic13(): 69 | s = "两百二十五" 70 | assert c2a.zh2num(s) == 225 71 | 72 | 73 | def test_chinese2arabic14(): 74 | s = "二十万五千" 75 | assert c2a.zh2num(s) == 205000 76 | 77 | 78 | def test_chinese2arabic15(): 79 | s = "两百三十九万四千八百二十三" 80 | assert c2a.zh2num(s) == 2394823 81 | 82 | 83 | def test_chinese2arabic16(): 84 | s = "一千三百万" 85 | assert c2a.zh2num(s) == 13000000 86 | 87 | 88 | def test_chinese2arabic17(): 89 | s = "万" 90 | assert c2a.zh2num(s) == "万" 91 | 92 | 93 | def test_chinese2arabic18(): 94 | s = "亿" 95 | assert c2a.zh2num(s) == "亿" 96 | 97 | 98 | def test_chinese2arabic19(): 99 | s = "千" 100 | assert c2a.zh2num(s) == "千" 101 | 102 | 103 | def test_chinese2arabic20(): 104 | s = "百" 105 | assert c2a.zh2num(s) == "百" 106 | 107 | 108 | def test_chinese2arabic21(): 109 | s = "零" 110 | assert c2a.zh2num(s) == 0 111 | 112 | 113 | def test_arabic2chinese1(): 114 | num = 0 115 | assert c2a.num2zh(num) == "零" 116 | 117 | 118 | def test_arabic2chinese2(): 119 | num = 1 120 | assert c2a.num2zh(num) == "一" 121 | 122 | 123 | def test_arabic2chinese3(): 124 | num = 10 125 | assert c2a.num2zh(num) == "一十" 126 | 127 | 128 | def test_arabic2chinese4(): 129 | num = 12 130 | assert c2a.num2zh(num) == "一十二" 131 | 132 | 133 | def test_arabic2chinese5(): 134 | num = 22 135 | assert c2a.num2zh(num) == "二十二" 136 | 137 | 138 | def test_arabic2chinese6(): 139 | num = 100 
140 | assert c2a.num2zh(num) == "一百" 141 | 142 | 143 | def test_arabic2chinese7(): 144 | num = 101 145 | assert c2a.num2zh(num) == "一百零一" 146 | 147 | 148 | def test_arabic2chinese8(): 149 | num = 110 150 | assert c2a.num2zh(num) == "一百一十" 151 | 152 | 153 | def test_arabic2chinese9(): 154 | num = 112 155 | assert c2a.num2zh(num) == "一百一十二" 156 | 157 | 158 | def test_arabic2chinese10(): 159 | num = 1000 160 | assert c2a.num2zh(num) == "一千" 161 | 162 | 163 | def test_arabic2chinese11(): 164 | num = 1001 165 | assert c2a.num2zh(num) == "一千零一" 166 | 167 | 168 | def test_arabic2chinese12(): 169 | num = 1011 170 | assert c2a.num2zh(num) == "一千零一十一" 171 | 172 | 173 | def test_arabic2chinese13(): 174 | num = 1101 175 | assert c2a.num2zh(num) == "一千一百零一" 176 | 177 | 178 | def test_arabic2chinese14(): 179 | num = 1010 180 | assert c2a.num2zh(num) == "一千零一十" 181 | 182 | 183 | def test_arabic2chinese15(): 184 | num = 1100 185 | assert c2a.num2zh(num) == "一千一百" 186 | 187 | 188 | def test_arabic2chinese16(): 189 | num = 1110 190 | assert c2a.num2zh(num) == "一千一百一十" 191 | 192 | 193 | def test_arabic2chinese17(): 194 | num = 1111 195 | assert c2a.num2zh(num) == "一千一百一十一" 196 | 197 | 198 | def test_arabic2chinese18(): 199 | num = 100000 200 | assert c2a.num2zh(num) == "一十万" 201 | 202 | 203 | def test_arabic2chinese19(): 204 | num = 110000 205 | assert c2a.num2zh(num) == "一十一万" 206 | 207 | 208 | def test_arabic2chinese20(): 209 | num = 1000000 210 | assert c2a.num2zh(num) == "一百万" 211 | 212 | 213 | def test_arabic2chinese21(): 214 | num = 1010000 215 | assert c2a.num2zh(num) == "一百零一万" 216 | 217 | 218 | def test_arabic2chinese22(): 219 | num = 1100000 220 | assert c2a.num2zh(num) == "一百一十万" 221 | 222 | 223 | def test_arabic2chinese23(): 224 | num = 1110000 225 | assert c2a.num2zh(num) == "一百一十一万" 226 | 227 | 228 | def test_arabic2chinese24(): 229 | num = 100000000 230 | assert c2a.num2zh(num) == "一亿" 231 | 232 | 233 | def test_arabic2chinese25(): 234 | num = 110000000 235 | assert c2a.num2zh(num) == "一亿一千万" 236 | 237 | 238 | def test_arabic2chinese26(): 239 | num = 111000000 240 | assert c2a.num2zh(num) == "一亿一千一百万" 241 | 242 | 243 | def test_arabic2chinese27(): 244 | num = 101000000 245 | assert c2a.num2zh(num) == "一亿零一百万" 246 | 247 | 248 | def test_arabic2chinese28(): 249 | num = 1000000000000 250 | assert c2a.num2zh(num) == "一万亿" 251 | 252 | 253 | def test_arabic2chinese29(): 254 | num = 1100000000000 255 | assert c2a.num2zh(num) == "一万一千亿" 256 | 257 | 258 | def test_arabic2chinese30(): 259 | # 一兆一亿 260 | num = 11000000000000 261 | assert c2a.num2zh(num) == "超大" 262 | 263 | 264 | def test_arabic2chinese31(): 265 | num = 1110011 266 | assert c2a.num2zh(num) == "一百一十一万零一十一" 267 | 268 | 269 | def test_arabic2chinese_money1(): 270 | num = 112 271 | assert c2a.num2zh(num).to_money() == "壹佰壹拾贰" 272 | 273 | 274 | def test_arabic2chinese_money2(): 275 | num = 1111 276 | assert c2a.num2zh(num).to_money() == "壹仟壹佰壹拾壹" 277 | 278 | 279 | def test_arabic2chinese_money3(): 280 | num = 1010000 281 | assert c2a.num2zh(num).to_money() == "壹佰零壹萬" 282 | 283 | -------------------------------------------------------------------------------- /tests/test_ptrans.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pnlp.ptrans import pick_entity_from_bio_labels, generate_uuid 4 | 5 | 6 | @pytest.mark.parametrize("pairs,result", [ 7 | ([('v1', 'B-1')], [("v1", "1")]), 8 | ([('v0', 'O'), ('vo', 'O')], []), 9 | ([('v1', 'B-1'), ('v2', 'I-1')], [("v1v2", "1")]), 10 | 
([('v1', 'B-1'), ('v2', 'I-1'), ('v0', 'O')], [("v1v2", "1")]), 11 | ([('v1', 'O'), 12 | ('v2', 'B-2'), 13 | ('v3', 'B-3'), 14 | ('v4', 'I-3'), 15 | ('v5', 'B-5'), 16 | ('v6', 'B-6'), 17 | ('v0', 'O'), 18 | ('v0', 'O'), 19 | ('v7', 'B-7'), 20 | ('v8', 'I-7'), 21 | ('v9', 'B-9')], 22 | [("v2", "2"), ("v3v4", "3"), ("v5", "5"), ("v6", "6"), ("v7v8", "7"), ("v9", "9")] 23 | ), 24 | ]) 25 | def test_pick_entity_from_bio_labels(pairs, result): 26 | entities = pick_entity_from_bio_labels(pairs) 27 | assert entities == result 28 | 29 | 30 | @pytest.mark.parametrize("pairs,result", [ 31 | ([("我", "O"), ("国", "O"), ("北", "B-LOC"), ("京", "I-LOC")], [("北京", "LOC", 2, 4)]), 32 | ([("我", "O"), ("国", "O"), ("北", "B-LOC"), ("京", "I-LOC"), ("。", "O")], [("北京", "LOC", 2, 4)]), 33 | ([("我", "O"), ("国", "O"), ("北", "B-LOC"), ("京", "I-LOC"), ("天", "B-LOC"), ("安", "I-LOC"), ("门", "I-LOC")], [("北京", "LOC", 2, 4), ("天安门", "LOC", 4, 7)]), 34 | ([("我", "O"), ("国", "O"), ("北", "B-LOC"), ("京", "I-LOC"), ("的", "O"), ("天", "B-LOC"), ("安", "I-LOC"), ("门", "I-LOC")], [("北京", "LOC", 2, 4), ("天安门", "LOC", 5, 8)]), 35 | ([("北", "B-LOC"), ("京", "I-LOC"), ("天", "B-LOC"), ("安", "I-LOC"), ("门", "I-LOC")], [("北京", "LOC", 0, 2), ("天安门", "LOC", 2, 5)]), 36 | ([("北", "B-LOC"), ("京", "I-LOC"), ("天", "B-LOC"), ("安", "I-LOC"), ("门", "I-LOC"), ("。", "O")], [("北京", "LOC", 0, 2), ("天安门", "LOC", 2, 5)]), 37 | ([("北", "B-ORG"), ("大", "I-ORG"), ("蔡", "B-PER"), ("元", "I-PER"), ("培", "I-PER")], [("北大", "ORG", 0, 2), ("蔡元培", "PER", 2, 5)]), 38 | ([("说", "O"), ("北", "B-ORG"), ("大", "I-ORG"), ("蔡", "B-PER"), ("元", "I-PER"), ("培", "I-PER")], [("北大", "ORG", 1, 3), ("蔡元培", "PER", 3, 6)]), 39 | ([("北", "B-ORG"), ("大", "I-ORG"), ("蔡", "B-PER"), ("元", "I-PER"), ("培", "I-PER"), ("啊", "O")], [("北大", "ORG", 0, 2), ("蔡元培", "PER", 2, 5)]), 40 | ([("北", "B-LOC"), ("京", "I-LOC"), ("的", "O"), ("安", "I-LOC")], [("北京", "LOC", 0, 2)]), 41 | ([("北", "B-LOC"), ("京", "I-LOC"), ("的", "O"), ("安", "B-LOC")], [("北京", "LOC", 0, 2), ("安", "LOC", 3, 4)]), 42 | ]) 43 | def test_pick_entity_from_bio_labels_with_offset(pairs, result): 44 | entities = pick_entity_from_bio_labels(pairs, True) 45 | assert entities == result 46 | 47 | 48 | @pytest.mark.parametrize("inp", [ 49 | (("a", 1, 0.5)), 50 | (("好", 1, 0.5)), 51 | ]) 52 | def test_generate_uuid(inp): 53 | uid = generate_uuid(*inp) 54 | assert type(uid) == str 55 | assert len(uid) == 32 56 | -------------------------------------------------------------------------------- /tests/test_ptxt.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pytest 3 | 4 | from pnlp.ptxt import Text, Regex, Length 5 | reg = Regex() 6 | 7 | 8 | @pytest.mark.parametrize("inp, expected", [ 9 | ("1.t", ""), 10 | ("1,t", "1"), 11 | (",1", "1"), 12 | ]) 13 | def test_regex_pnum(inp, expected): 14 | match = reg.pnum.search(inp) 15 | if match: 16 | res = match.group() 17 | else: 18 | res = "" 19 | assert res == expected 20 | 21 | 22 | @pytest.fixture(params=reg.patnames) 23 | def get_regex(request): 24 | return reg.patdict[request.param] 25 | 26 | 27 | def test_regex_well(get_regex): 28 | assert isinstance(get_regex, re.Pattern) 29 | 30 | 31 | @pytest.fixture(params=reg.patnames) 32 | def get_patten(request): 33 | return [request.param] 34 | 35 | 36 | def test_Text_extract(get_patten): 37 | text = "这是,测试fdsf234*(&( 返回类型的文本。" 38 | res = Text(get_patten).extract(text) 39 | assert isinstance(res, dict) 40 | assert isinstance(res.mats, list) 41 | assert isinstance(res.locs, list) 42 | 43 | 44 | 
def test_Text_clean(get_patten): 45 | text = "这是,测试fdsf234*(&( 返回类型的文本。" 46 | res = Text(get_patten).clean(text) 47 | assert isinstance(res, str) 48 | 49 | 50 | def test_pattern_string_invalid(): 51 | try: 52 | Text(["XX"]) 53 | except Exception as e: 54 | assert "built-in" in str(e) 55 | 56 | 57 | def test_pattern_invalid(): 58 | try: 59 | Text([lambda x: x]) 60 | except Exception as e: 61 | assert "RE" in str(e) 62 | 63 | 64 | @pytest.fixture 65 | def text_chi(): 66 | text = "你好。jefj*(&-1)这是中文测试!" 67 | return text 68 | 69 | 70 | def test_Text_extract_chi(text_chi): 71 | res = Text(['chi']).extract(text_chi) 72 | assert "".join(res.mats) == "你好这是中文测试" 73 | assert res.text == "你好这是中文测试" 74 | 75 | 76 | def test_Text_clean_chi(text_chi): 77 | res = Text(['chi']).clean(text_chi) 78 | assert res == "。jefj*(&-1)!" 79 | 80 | 81 | @pytest.fixture 82 | def text_pun(): 83 | text = "你好,这是标点,.!;<>()符号测试。" 84 | return text 85 | 86 | 87 | def test_Text_extract_pun(text_pun): 88 | res = Text(['nwn']).extract(text_pun) 89 | assert "".join(res.mats) == ",,.!;<>()。" 90 | assert res.text == ",,.!;<>()。" 91 | 92 | 93 | def test_Text_clean_pun(text_pun): 94 | res = Text(['nwn']).clean(text_pun) 95 | assert res == "你好这是标点符号测试" 96 | 97 | 98 | @pytest.fixture 99 | def text_whi(): 100 | text = "你好,这是空白 \t\n符号测试。" 101 | return text 102 | 103 | 104 | def test_Text_extract_whi(text_whi): 105 | res = Text(['whi']).extract(text_whi) 106 | assert "".join(res.mats) == " \t\n" 107 | assert res.text == " \t\n" 108 | 109 | 110 | def test_Text_clean_whi(text_whi): 111 | res = Text(['whi']).clean(text_whi) 112 | assert res == "你好,这是空白符号测试。" 113 | 114 | 115 | @pytest.fixture 116 | def text_nwh(): 117 | text = "你好,这是非空白 \t\n符号测试。" 118 | return text 119 | 120 | 121 | def test_Text_extract_nwh(text_nwh): 122 | res = Text(['nwh']).extract(text_nwh) 123 | assert "".join(res.mats) == "你好,这是非空白符号测试。" 124 | assert res.text == "你好,这是非空白符号测试。" 125 | 126 | 127 | def test_Text_clean_nwh(text_nwh): 128 | res = Text(['nwh']).clean(text_nwh) 129 | assert res == " \t\n" 130 | 131 | 132 | @pytest.fixture 133 | def text_wnb(): 134 | text = "你好,这是词与word数字number测试。" 135 | return text 136 | 137 | 138 | def test_Text_extract_wnb(text_wnb): 139 | res = Text(['wnb']).extract(text_wnb) 140 | assert "".join(res.mats) == "你好这是词与word数字number测试" 141 | assert res.text == "你好这是词与word数字number测试" 142 | 143 | 144 | def test_Text_clean_wnb(text_wnb): 145 | res = Text(['wnb']).clean(text_wnb) 146 | assert res == ",。" 147 | 148 | 149 | @pytest.fixture 150 | def text_nwn(): 151 | text = "你好,这是非词或word数字number测试。" 152 | return text 153 | 154 | 155 | def test_Text_extract_nwn(text_nwn): 156 | res = Text(['nwn']).extract(text_nwn) 157 | assert "".join(res.mats) == ",。" 158 | assert res.text == ",。" 159 | 160 | 161 | def test_Text_clean_nwn(text_nwn): 162 | res = Text(['nwn']).clean(text_nwn) 163 | assert res == "你好这是非词或word数字number测试" 164 | 165 | 166 | @pytest.fixture 167 | def text_eng(): 168 | text = "你好,这#¥是英文English测试。" 169 | return text 170 | 171 | 172 | def test_Text_extract_eng(text_eng): 173 | res = Text(['eng']).extract(text_eng) 174 | assert "".join(res.mats) == "English" 175 | assert res.text == "English" 176 | 177 | 178 | def test_Text_clean_eng(text_eng): 179 | res = Text(['eng']).clean(text_eng) 180 | assert res == "你好,这#¥是英文测试。" 181 | 182 | 183 | @pytest.fixture 184 | def text_num(): 185 | text = "你好,这#¥是数字2, +2, -2, 2.1, -2.2, 1/5, 2:3, -2/5, 2%, 2.5%测试。" 186 | return text 187 | 188 | 189 | def test_Text_extract_num(text_num): 190 | res = 
Text(['num']).extract(text_num) 191 | assert "".join(res.mats) == "2+2-22.1-2.21/52:3-2/52%2.5%" 192 | assert res.text == "2+2-22.1-2.21/52:3-2/52%2.5%" 193 | 194 | 195 | def test_Text_clean_num(text_num): 196 | res = Text(['num']).clean(text_num) 197 | assert res == "你好,这#¥是数字, , , , , , , , , 测试。" 198 | 199 | 200 | @pytest.fixture 201 | def text_pic(): 202 | text = "你好,这#¥是![p1](https://xxx.jpeg)图片![](yyy.png)测试https://z.jpg。" 203 | return text 204 | 205 | 206 | def test_Text_extract_pic(text_pic): 207 | res = Text(['pic']).extract(text_pic) 208 | assert "".join( 209 | res.mats) == "![p1](https://xxx.jpeg)![](yyy.png)https://z.jpg" 210 | assert res.text == "![p1](https://xxx.jpeg)![](yyy.png)https://z.jpg" 211 | 212 | 213 | def test_Text_clean_pic(text_pic): 214 | res = Text(['pic']).clean(text_pic) 215 | assert res == "你好,这#¥是图片测试。" 216 | 217 | 218 | @pytest.fixture 219 | def text_lnk(): 220 | text = "你好,www.g.com,这#¥是链接[link](https://yam.gift)测试http://yam.gift。" 221 | return text 222 | 223 | 224 | def test_Text_extract_lnk(text_lnk): 225 | res = Text(['lnk']).extract(text_lnk) 226 | assert "".join( 227 | res.mats) == "www.g.com[link](https://yam.gift)http://yam.gift" 228 | assert res.text == "www.g.com[link](https://yam.gift)http://yam.gift" 229 | 230 | 231 | def test_Text_clean_lnk(text_lnk): 232 | res = Text(['lnk']).clean(text_lnk) 233 | assert res == "你好,,这#¥是链接测试。" 234 | 235 | 236 | def test_markdown_link_with_whitespace_in_title(): 237 | text = """啊[Download | View](https://www.altova.com/xmlspy-xml-editor/download/)""" 238 | res = Text(['lnk']).clean(text) 239 | assert res == "啊" 240 | 241 | 242 | @pytest.fixture 243 | def text_emj(): 244 | text = "你好,这#¥是表情😁测试😜🌹。" 245 | return text 246 | 247 | 248 | def test_Text_extract_emj(text_emj): 249 | res = Text(['emj']).extract(text_emj) 250 | assert "".join(res.mats) == "😁😜🌹" 251 | assert res.text == "😁😜🌹" 252 | 253 | 254 | def test_Text_clean_emj(text_emj): 255 | res = Text(['emj']).clean(text_emj) 256 | assert res == "你好,这#¥是表情测试。" 257 | 258 | 259 | @pytest.fixture 260 | def text_len(): 261 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233." 
262 | return text 263 | 264 | 265 | def test_Text_len_all(text_len): 266 | res = Length(text_len).len_all 267 | assert res == 64 268 | 269 | 270 | def test_Text_len_nwh(text_len): 271 | res = Length(text_len).len_nwh 272 | assert res == 63 273 | 274 | 275 | def test_Text_len_chi(text_len): 276 | res = Length(text_len).len_chi 277 | assert res == 6 278 | 279 | 280 | def test_Text_len_wnb(text_len): 281 | res = Length(text_len).len_wnb 282 | assert res == 41 283 | 284 | 285 | def test_Text_len_pun(text_len): 286 | res = Length(text_len).len_pun 287 | assert res == 14 288 | 289 | 290 | def test_Text_len_eng(text_len): 291 | res = Length(text_len).len_eng 292 | assert res == 32 293 | 294 | 295 | def test_Text_len_num(text_len): 296 | res = Length(text_len).len_num 297 | assert res == 3 298 | 299 | 300 | if __name__ == '__main__': 301 | print(reg.patnames) 302 | -------------------------------------------------------------------------------- /tests/test_stopwords.py: -------------------------------------------------------------------------------- 1 | from pnlp.stopwords import StopWords 2 | 3 | 4 | 5 | def test_stopwords(): 6 | sw = StopWords() 7 | assert type(sw.zh) == set 8 | assert type(sw.en) == set 9 | assert sw.zh_len > 0 10 | assert sw.en_len > 0 11 | 12 | 13 | def test_custom_stopwords(): 14 | sw = StopWords("tests/piop_data/b.txt") 15 | assert type(sw.stopwords) == set 16 | assert len(sw.stopwords) == 3 -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from functools import partial 4 | import math 5 | import itertools 6 | import pytest 7 | import multiprocessing as mp 8 | 9 | from pnlp.utils import pstr, concurring, generate_batches_by_num 10 | from pnlp.utils import run_in_new_thread 11 | from pnlp.piop import read_file, write_file 12 | 13 | 14 | def test_pstr1(): 15 | s1 = pstr("123") 16 | s2 = "1" 17 | assert s1 - s2 == "23" 18 | 19 | 20 | def test_pstr2(): 21 | s1 = pstr("123") 22 | s2 = "123" 23 | assert s1 - s2 == "" 24 | 25 | 26 | def test_pstr3(): 27 | s1 = pstr("123") 28 | s2 = "234" 29 | assert s1 - s2 == "1" 30 | 31 | 32 | def test_pstr4(): 33 | s1 = pstr("123") 34 | s2 = "456" 35 | assert s1 - s2 == "123" 36 | 37 | 38 | def test_pstr5(): 39 | s1 = pstr("") 40 | s2 = "456" 41 | assert s1 - s2 == "" 42 | 43 | 44 | def test_generate_batches(): 45 | lst = range(100) 46 | res = list(generate_batches_by_num(lst, 10)) 47 | assert len(res) == 10 48 | assert len(res[0]) == 10 49 | 50 | 51 | def is_prime(x): 52 | if x < 2: 53 | return False 54 | for i in range(2, int(math.sqrt(x)) + 1): 55 | if x % i == 0: 56 | return False 57 | return True 58 | 59 | 60 | def test_concurring_default(): 61 | 62 | @concurring 63 | def get_primes(lst): 64 | res = [] 65 | for i in lst: 66 | if is_prime(i): 67 | res.append(i) 68 | return res 69 | lst = list(range(100)) 70 | res = get_primes(lst) 71 | res = list(res) 72 | assert len(res) == mp.cpu_count() 73 | res = list(itertools.chain(*res)) 74 | assert len(res) == 25 75 | 76 | 77 | @pytest.mark.parametrize( 78 | "type", 79 | ["thread_pool", "process_pool", "thread_executor", "thread"]) 80 | @pytest.mark.parametrize("max_workers", [1, 2, 4, 7, 10]) 81 | def test_concurring_with_parameters(type, max_workers): 82 | 83 | @concurring(type=type, max_workers=max_workers) 84 | def get_primes(lst): 85 | res = [] 86 | for i in lst: 87 | if is_prime(i): 88 | res.append(i) 89 | return res 90 | lst = 
list(range(100)) 91 | res = get_primes(lst) 92 | res = list(res) 93 | assert len(res) == max_workers 94 | res = list(itertools.chain(*res)) 95 | assert len(res) == 25 96 | 97 | 98 | def test_concurring_invalid_type(): 99 | 100 | @concurring(type="invalid") 101 | def get_primes(lst): 102 | res = [] 103 | for i in lst: 104 | if is_prime(i): 105 | res.append(i) 106 | return res 107 | lst = list(range(100)) 108 | try: 109 | res = get_primes(lst) 110 | except Exception as err: 111 | assert "invalid" in str(err) 112 | 113 | 114 | def test_concurring_invalid_workers(): 115 | 116 | try: 117 | @concurring(max_workers=0) 118 | def get_primes(lst): 119 | res = [] 120 | for i in lst: 121 | if is_prime(i): 122 | res.append(i) 123 | return res 124 | except Exception as err: 125 | assert "0" in str(err) 126 | 127 | 128 | 129 | def test_run_in_thread(): 130 | file = "run_in_new_thread.txt" 131 | 132 | def func(file, a, b, c): 133 | write_file(file, list(map(str, [a, b, c]))) 134 | 135 | run_in_new_thread(func, file, 1, 2, 3) 136 | import time 137 | time.sleep(1) 138 | 139 | assert os.path.exists(file) 140 | os.remove(file) 141 | 142 | 143 | def test_run_in_thread_kwargs(): 144 | kwargs = { 145 | "b": 2, 146 | "c": 3, 147 | } 148 | 149 | def func(a, b, c): 150 | return a + b + c 151 | 152 | func = partial(func, **kwargs) 153 | 154 | assert 6 == func(1) 155 | 156 | --------------------------------------------------------------------------------
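As a final sketch, the `pstr` helper from pnlp/utils.py overloads subtraction to drop every character of the right-hand operand from the left-hand string, as the tests above exercise; the example strings are illustrative assumptions:

from pnlp.utils import pstr

s = pstr("hello, world")
assert s - "lo" == "he, wrd"         # every 'l' and 'o' is removed
assert pstr("abc") - "xyz" == "abc"  # no shared characters, nothing removed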