├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── README_EN.md
├── pnlp
│   ├── __init__.py
│   ├── pcut.py
│   ├── penh.py
│   ├── piop.py
│   ├── pmag.py
│   ├── pnorm.py
│   ├── ptrans.py
│   ├── ptxt.py
│   ├── stopwords
│   │   ├── ReadMe.md
│   │   ├── __init__.py
│   │   ├── chinese_stopwords.txt
│   │   └── english_stopwords.txt
│   └── utils.py
├── setup.py
└── tests
    ├── __init__.py
    ├── piop_data
    │   ├── a.md
    │   ├── b.txt
    │   ├── c.data
    │   ├── csv.csv
    │   ├── first
    │   │   ├── fa.md
    │   │   ├── fb.txt
    │   │   ├── fc.data
    │   │   └── second
    │   │       ├── sa.md
    │   │       ├── sb.txt
    │   │       └── sc.data
    │   ├── json.json
    │   ├── list_dict.json
    │   ├── outfile.file
    │   ├── outfile.listdict
    │   ├── outjson.json
    │   └── yaml.yaml
    ├── test_pcut.py
    ├── test_penh.py
    ├── test_piop.py
    ├── test_pmag.py
    ├── test_pnorm.py
    ├── test_ptrans.py
    ├── test_ptxt.py
    ├── test_stopwords.py
    └── test_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | notebook.ipynb
2 | .*
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | .PHONY: build 3 | build: 4 | python3.11 setup.py sdist bdist_wheel 5 | 6 | .PHONY: upload 7 | upload: 8 | python3.11 -m twine upload -r pypi dist/* 9 | 10 | .PHONY: clean 11 | clean: 12 | rm -rf ./dist/ ./build/ ./pnlp.egg-info/ 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* 4 | 5 | - [功能特性](#%E5%8A%9F%E8%83%BD%E7%89%B9%E6%80%A7) 6 | - [安装](#%E5%AE%89%E8%A3%85) 7 | - [使用](#%E4%BD%BF%E7%94%A8) 8 | - [文本IO](#%E6%96%87%E6%9C%ACio) 9 | - [IO 处理](#io-%E5%A4%84%E7%90%86) 10 | - [内置方法](#%E5%86%85%E7%BD%AE%E6%96%B9%E6%B3%95) 11 | - [文本处理](#%E6%96%87%E6%9C%AC%E5%A4%84%E7%90%86) 12 | - [清理和提取](#%E6%B8%85%E7%90%86%E5%92%8C%E6%8F%90%E5%8F%96) 13 | - [内置正则](#%E5%86%85%E7%BD%AE%E6%AD%A3%E5%88%99) 14 | - [文本切分](#%E6%96%87%E6%9C%AC%E5%88%87%E5%88%86) 15 | - [任意部分切分](#%E4%BB%BB%E6%84%8F%E9%83%A8%E5%88%86%E5%88%87%E5%88%86) 16 | - [分句](#%E5%88%86%E5%8F%A5) 17 | - [分子句并按一个阈值合并子句](#%E5%88%86%E5%AD%90%E5%8F%A5%E5%B9%B6%E6%8C%89%E4%B8%80%E4%B8%AA%E9%98%88%E5%80%BC%E5%90%88%E5%B9%B6%E5%AD%90%E5%8F%A5) 18 | - [中文字符切分](#%E4%B8%AD%E6%96%87%E5%AD%97%E7%AC%A6%E5%88%87%E5%88%86) 19 | - [句子分组](#%E5%8F%A5%E5%AD%90%E5%88%86%E7%BB%84) 20 | - [文本增强](#%E6%96%87%E6%9C%AC%E5%A2%9E%E5%BC%BA) 21 | - [Token级别](#token%E7%BA%A7%E5%88%AB) 22 | - [句子级别](#%E5%8F%A5%E5%AD%90%E7%BA%A7%E5%88%AB) 23 | - [文本归一化](#%E6%96%87%E6%9C%AC%E5%BD%92%E4%B8%80%E5%8C%96) 24 | - [中文数字](#%E4%B8%AD%E6%96%87%E6%95%B0%E5%AD%97) 25 | - [格式转换](#%E6%A0%BC%E5%BC%8F%E8%BD%AC%E6%8D%A2) 26 | - [BIO转实体](#bio%E8%BD%AC%E5%AE%9E%E4%BD%93) 27 | - [任意参数转UUID](#%E4%BB%BB%E6%84%8F%E5%8F%82%E6%95%B0%E8%BD%ACuuid) 28 | - [内置词典](#%E5%86%85%E7%BD%AE%E8%AF%8D%E5%85%B8) 29 | - [停用词](#%E5%81%9C%E7%94%A8%E8%AF%8D) 30 | - [文本长度](#%E6%96%87%E6%9C%AC%E9%95%BF%E5%BA%A6) 31 | - [魔术方法](#%E9%AD%94%E6%9C%AF%E6%96%B9%E6%B3%95) 32 | - [并行处理](#%E5%B9%B6%E8%A1%8C%E5%A4%84%E7%90%86) 33 | - [测试](#%E6%B5%8B%E8%AF%95) 34 | - [更新日志](#%E6%9B%B4%E6%96%B0%E6%97%A5%E5%BF%97) 35 | 36 | 37 | 38 | NLP 预/后处理工具。 39 | 40 | ## 功能特性 41 | 42 | - 专为文本 IO 设计的灵活的 Pipeline 43 | - 灵活的文本清理/提取工具 44 | - 文本增强 45 | - 按句切分或按中文字符切分文本 46 | - 文本分桶 47 | - 中文字符归一化 48 | - 文本各种长度计算 49 | - 中英文常用停用词 50 | - 预处理魔术方法 51 | - 并发、批量化、实体 BIO 转实体 52 | 53 | ## 安装 54 | 55 | 需要 Python3.7+。 56 | 57 | `pip install pnlp` 58 | 59 | ## 使用 60 | 61 | ### 文本IO 62 | 63 | #### IO 处理 64 | 65 | ```bash 66 | tree tests/piop_data/ 67 | ├── a.md 68 | ├── b.txt 69 | ├── c.data 70 | ├── first 71 | │   ├── fa.md 72 | │   ├── fb.txt 73 | │   ├── fc.data 74 | │   └── second 75 | │   ├── sa.md 76 | │   ├── sb.txt 77 | │   └── sc.data 78 | ├── json.json 79 | ├── outfile.file 80 | ├── outjson.json 81 | └── yml.yml 82 | ``` 83 | 84 | ```python 85 | import os 86 | from pnlp import Reader 87 | 88 | DATA_PATH = 
"./pnlp/tests/piop_data/" 89 | pattern = '*.md' # 可以是 '*.txt', 'f*.*' 等,支持正则 90 | reader = Reader(pattern, use_regex=True) 91 | 92 | # 获取所有文件的行,输出行文本、行索引和所在的文件名 93 | for line in reader(DATA_FOLDER_PATH): 94 | print(line.lid, line.fname, line.text) 95 | """ 96 | 0 a.md line 1 in a. 97 | 1 a.md line 2 in a. 98 | 2 a.md line 3 in a. 99 | 0 fa.md line 1 in fa. 100 | 1 fa.md line 2 in fa 101 | ... 102 | """ 103 | 104 | # 获取某个文件的所有行,输出行文本、行索引和所在文件名,此时由于指定了文件名 pattern 无效 105 | for line in reader(os.path.join(DATA_FOLDER_PATH, "a.md")): 106 | print(line.lid, line.fname, line.text) 107 | """ 108 | 0 a.md line 1 in a. 109 | 1 a.md line 2 in a. 110 | 2 a.md line 3 in a. 111 | """ 112 | 113 | 114 | 115 | # 获取目录下的所有文件路径 116 | for path in Reader.gen_files(DATA_PATH, pattern, use_regex: True): 117 | print(path) 118 | """ 119 | pnlp/tests/piop_data/a.md 120 | pnlp/tests/piop_data/first/fa.md 121 | pnlp/tests/piop_data/first/second/sa.md 122 | """ 123 | 124 | # 获取一个目录下所有文件名和它们的内容 125 | paths = Reader.gen_files(DATA_PATH, pattern) 126 | articles = Reader.gen_articles(paths) 127 | for article in articles: 128 | print(article.fname) 129 | print(article.f.read()) 130 | """ 131 | a.md 132 | line 1 in a. 133 | line 2 in a. 134 | line 3 in a. 135 | ... 136 | """ 137 | 138 | # 同前两个例子 139 | paths = Reader.gen_files(DATA_PATH, pattern) 140 | articles = Reader.gen_articles(paths) 141 | for line in Reader.gen_flines(articles, strip="\n"): 142 | print(line.lid, line.fname, line.text) 143 | ``` 144 | 145 | #### 内置方法 146 | 147 | ```python 148 | import pnlp 149 | 150 | # Read 151 | file_string = pnlp.read_file(file_path) 152 | file_list = pnlp.read_lines(file_path) 153 | file_json = pnlp.read_json(file_path) 154 | file_yaml = pnlp.read_yaml(file_path) 155 | file_csv = pnlp.read_csv(file_path) 156 | file_pickle = pnlp.read_pickle(file_path) 157 | list_dict = pnlp.read_file_to_list_dict(file_path) 158 | 159 | # Write 160 | pnlp.write_json(file_path, data, indent=2) 161 | pnlp.write_file(file_path, data) 162 | pnlp.write_pickle(file_path, data) 163 | pnlp.write_list_dict_to_file(file_path, data) 164 | 165 | # Others 166 | pnlp.check_dir(dirname) # 如果目录不存在会创建 167 | ``` 168 | 169 | ### 文本处理 170 | 171 | #### 清理和提取 172 | 173 | ```python 174 | import re 175 | from pnlp import Text 176 | 177 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233." 178 | pattern = re.compile(r'\d+') 179 | 180 | # pattern 是 re.Pattern 类型或 str 类型 181 | # 默认为空字符串:'', 表示不使用任何 pattern(实际是 re.compile(r'.+')),此时 clean 返回空(全部被清了),extract 返回原始文本。 182 | # pattern 支持以下字符串类型(实际为正则): 183 | # 'chi': 中文字符 184 | # 'pun': 标点 185 | # 'whi': 空白 186 | # 'nwh': 非空白 187 | # 'wnb': 字母(含中文字符)或数字 188 | # 'nwn': 非字母(含中文字符)或数字 189 | # 'eng': 英文字符 190 | # 'num': 数字 191 | # 'pic': 图片 192 | # 'lnk': 链接 193 | # 'emj': 表情 194 | 195 | pt = Text(['chi', pattern]) 196 | 197 | # 提取所有符合 pattern 的文本和它们的位置 198 | res = pt.extract(text) 199 | print(res) 200 | """ 201 | {'text': '这是长度测试233', 'mats': ['这是', '长度测试', '233'], 'locs': [(0, 2), (22, 26), (60, 63)]} 202 | """ 203 | # 支持用「点」获取key属性 204 | print(res.text, res.mats, res.locs) 205 | """ 206 | '这是长度测试' ['这是', '长度测试'] [(0, 2), (22, 26)] 207 | """ 208 | 209 | # 返回指定 pattern 清理后的文本 210 | print(pt.clean(text)) 211 | """ 212 | https://www.yam.gift,《 》*)FSJfdsjf😁![](http://xx.jpg)。233. 
168 |
169 | ### 文本处理
170 |
171 | #### 清理和提取
172 |
173 | ```python
174 | import re
175 | from pnlp import Text
176 |
177 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
178 | pattern = re.compile(r'\d+')
179 |
180 | # pattern 是 re.Pattern 类型或 str 类型
181 | # 默认为空字符串:'', 表示不使用任何 pattern(实际是 re.compile(r'.+')),此时 clean 返回空(全部被清了),extract 返回原始文本。
182 | # pattern 支持以下字符串类型(实际为正则):
183 | # 'chi': 中文字符
184 | # 'pun': 标点
185 | # 'whi': 空白
186 | # 'nwh': 非空白
187 | # 'wnb': 字母(含中文字符)或数字
188 | # 'nwn': 非字母(含中文字符)或数字
189 | # 'eng': 英文字符
190 | # 'num': 数字
191 | # 'pic': 图片
192 | # 'lnk': 链接
193 | # 'emj': 表情
194 |
195 | pt = Text(['chi', pattern])
196 |
197 | # 提取所有符合 pattern 的文本和它们的位置
198 | res = pt.extract(text)
199 | print(res)
200 | """
201 | {'text': '这是长度测试233', 'mats': ['这是', '长度测试', '233'], 'locs': [(0, 2), (22, 26), (60, 63)]}
202 | """
203 | # 支持用「点」获取key属性
204 | print(res.text, res.mats, res.locs)
205 | """
206 | '这是长度测试' ['这是', '长度测试'] [(0, 2), (22, 26)]
207 | """
208 |
209 | # 返回指定 pattern 清理后的文本
210 | print(pt.clean(text))
211 | """
212 | https://www.yam.gift,《 》*)FSJfdsjf😁![](http://xx.jpg)。233.
213 | """
214 |
215 | # 可以指定多个 pattern,注意先后顺序可能会影响结果哦
216 | pt = Text(['pic', 'lnk'])
217 | # 提取到的
218 | res = pt.extract(text)
219 | print(res.mats)
220 | """
221 | ['https://www.yam.gif',
222 | '![](http://xx.jpg)',
223 | 'https://www.yam.gift',
224 | 'http://xx.jpg']
225 | """
226 | # 清理后的
227 | print(pt.clean(text))
228 | """
229 | 这是t长度测试,《 》*)FSJfdsjf😁。233.
230 | """
231 | ```
232 |
233 | #### 内置正则
234 |
235 | ```python
236 | # USE Regex
237 | from pnlp import reg
238 | def clean_text(text: str) -> str:
239 |     text = reg.pwhi.sub("", text)
240 |     text = reg.pemj.sub("", text)
241 |     text = reg.ppic.sub("", text)
242 |     text = reg.plnk.sub("", text)
243 |     return text
244 | ```
245 |
246 | ### 文本切分
247 |
248 | #### 任意部分切分
249 |
250 | ```python
251 | import re
252 | from pnlp import cut_part, psent
253 | text = "你好!欢迎使用。"
254 | sent_list = cut_part(text, psent, with_spliter=True, with_offset=False)
255 | print(sent_list)
256 | """
257 | ['你好!', '欢迎使用。']
258 | """
259 | pcustom_sent = re.compile(r'[。!]')
260 | sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=False)
261 | print(sent_list)
262 | """
263 | ['你好', '欢迎使用']
264 | """
265 | sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=True)
266 | print(sent_list)
267 | """
268 | [('你好', 0, 3), ('欢迎使用', 3, 8)]
269 | """
270 | ```
271 |
272 | #### 分句
273 |
274 | ```python
275 | # Cut Sentence
276 | from pnlp import cut_sentence as pcs
277 | text = "你好!欢迎使用。"
278 | sent_list = pcs(text)
279 | print(sent_list)
280 | """
281 | ['你好!', '欢迎使用。']
282 | """
283 | ```
284 |
285 | #### 分子句并按一个阈值合并子句
286 |
287 | ```python
288 | from pnlp import cut_sub_sentence as pcss
289 | text = "你好!你好。你好?你坏~欢迎使用。"
290 | sent_list = pcss(text)
291 | print(sent_list)
292 | """
293 | ['你好!', '你好。', '你好?', '你坏~', '欢迎使用。']
294 | """
295 | sent_list = pcss(text, 6)
296 | print(sent_list)
297 | """
298 | ['你好!你好。', '你好?你坏~', '欢迎使用。']
299 | """
300 | sent_list = pcss(text, 12)
301 | print(sent_list)
302 | """
303 | ['你好!你好。你好?你坏~', '欢迎使用。']
304 | """
305 | ```
306 |
307 | 这个功能在很多场合非常有用;)懂的都懂:D
308 |
309 | #### 中文字符切分
310 |
311 | ```python
312 | # 中文字符切分
313 | from pnlp import cut_zhchar
314 | text = "你好,hello, 520 i love u. 
= ”我爱你“。" 315 | char_list = cut_zhchar(text) 316 | print(char_list) 317 | """ 318 | ['你', '好', ',', 'hello', ',', ' ', '520', ' ', 'i', ' ', 'love', ' ', 'u', '.', ' ', '=', ' ', '”', '我', '爱', '你', '“', '。'] 319 | """ 320 | char_list = cut_zhchar(text, remove_blank=True) 321 | print(char_list) 322 | """ 323 | ['你', '好', ',', 'hello', ',', '520', 'i', 'love', 'u', '.', '=', '”', '我', '爱', '你', '“', '。'] 324 | """ 325 | ``` 326 | 327 | #### 句子分组 328 | 329 | ```python 330 | from pnlp import combine_bucket 331 | parts = [ 332 | "先生,那夜,我因胸中纳闷,无法入睡,", 333 | "折腾得比那铐了脚镣的叛变水手还更难过;", 334 | "那时,我就冲动的 ——", 335 | "好在有那一时之念,", 336 | "因为有时我们在无意中所做的事能够圆满……" 337 | ] 338 | buckets = combine_bucket(parts.copy(), 10, truncate=True, keep_remain=True) 339 | print(buckets) 340 | """ 341 | ['先生,那夜,我因胸中', 342 | '纳闷,无法入睡,', 343 | '折腾得比那铐了脚镣的', 344 | '叛变水手还更难过;', 345 | '那时,我就冲动的 —', 346 | '—', 347 | '好在有那一时之念,', 348 | '因为有时我们在无意中', 349 | '所做的事能够圆满……'] 350 | """ 351 | ``` 352 | 353 | ### 文本增强 354 | 355 | 采样器支持删除、交换、插入操作,所有的操作不会跨越标点。 356 | 357 | #### Token级别 358 | 359 | - 默认 Tokenizer 360 | - 中文:字符级 Tokenizer(见上) 361 | - 英文:空白符切分 Tokenizer 362 | - Tokenizer 可以任意指定,但它的输出应该是一个 List 的 Token 或一个 List 的 Tuple,每个 Tuple 包含一个 Token 和一个词性。 363 | - 对字符级增强,默认并不会操作所有字或词。可以自定义要操作的词或词性。 364 | - 默认 Token 是「停用词」 365 | - 默认词性(当 Tokenizer 输出带词性时)是「功能词」:副词、介词、连词、助词、其他虚词(标记为 d p c u xc) 366 | 367 | ```python 368 | # 【】内的为改变的 369 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 370 | # 字符粒度 371 | from pnlp import TokenLevelSampler 372 | tls = TokenLevelSampler() 373 | tls.make_samples(text) 374 | """ 375 | {'delete': '人为什么活着?生而为人必须要【有】梦想!还要有尽可能多的精神体验。', 376 | 'swap': '【为】【人】什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。', 377 | 'insert': '人为什么活着?生而为人必须要有梦想!【还】还要有尽可能多的精神体验。', 378 | 'together': '人什么着着活?生而必为为须要有梦想!还要有尽可能多的精神体验。'} 379 | """ 380 | # 支持自定义 tokenizer 381 | tls.make_samples(text, jieba.lcut) 382 | """ 383 | {'delete': '人为什么活着?生而为人【必须】要有梦想!还要有尽可能多的精神体验。', 384 | 'swap': '【为什么】【人】活着?生而为人必须要有梦想!还要有尽可能多的精神体验。', 385 | 'insert': '人为什么活着?生而为人必须要有梦想!【还要】还要有尽可能多的精神体验。', 386 | 'together': '人为什么活着?生而为人人要有梦想!还要有多尽可能的精神体验。'} 387 | """ 388 | # 自定义 389 | tls = TokenLevelSampler( 390 | rate=替换比例, # 默认 5% 391 | types=["delete", "swap", "insert"], # 默认三个 392 | sample_words=["词1", "词2"], # 默认停用词 393 | sample_pos=["词性1", "词性2"], # 默认功能词 394 | ) 395 | ``` 396 | 397 | #### 句子级别 398 | 399 | ```python 400 | from pnlp import SentenceLevelSampler 401 | sls = SentenceLevelSampler() 402 | sls.make_samples(text) 403 | """ 404 | {'delete': '生而为人必须要有梦想!还要有尽可能多的精神体验。', 405 | 'swap': '人为什么活着?还要有尽可能多的精神体验。生而为人必须要有梦想!', 406 | 'insert': '人为什么活着?还要有尽可能多的精神体验。生而为人必须要有梦想!生而为人必须要有梦想!', 407 | 'together': '生而为人必须要有梦想!人为什么活着?人为什么活着?'} 408 | """ 409 | # 自定义 410 | sls = SentenceLevelSampler(types=["delete", "swap", "insert"]) # 默认三个 411 | ``` 412 | 413 | ### 文本归一化 414 | 415 | #### 中文数字 416 | 417 | ```python 418 | from pnlp import num_norm 419 | num_norm.num2zh(1024) == "一千零二十四" 420 | num_norm.num2zh(1024).to_money() == "壹仟零贰拾肆" 421 | num_norm.zh2num("一千零二十四") == 1024 422 | ``` 423 | 424 | ### 格式转换 425 | 426 | #### BIO转实体 427 | 428 | ```python 429 | # 实体 BIO Token 转实体 430 | from pnlp import pick_entity_from_bio_labels 431 | pairs = [('天', 'B-LOC'), ('安', 'I-LOC'), ('门', 'I-LOC'), ('有', 'O'), ('毛', 'B-PER'), ('主', 'I-PER'), ('席', 'I-PER')] 432 | pick_entity_from_bio_labels(pairs) 433 | """ 434 | [('天安门', 'LOC'), ('毛主席', 'PER')] 435 | """ 436 | pick_entity_from_bio_labels(pairs, with_offset=True) 437 | """ 438 | [('天安门', 'LOC', 0, 3), ('毛主席', 'PER', 4, 7)] 439 | """ 440 | ``` 441 | 442 | #### 
任意参数转UUID
443 |
444 | ```python
445 | from pnlp import generate_uuid
446 |
447 | uid1 = generate_uuid("a", 1, 0.02)
448 | uid2 = generate_uuid("a", 1)
449 | """
450 | uid1 == 3fbc8b70d05b5abdb5badca1d26e1dbd
451 | uid2 == f7b0ffc589e453e88d4faf66eb92f669
452 | """
453 | ```
454 |
455 | ### 内置词典
456 |
457 | #### 停用词
458 |
459 | ```python
460 | from pnlp import StopWords, chinese_stopwords, english_stopwords
461 |
462 | csw = StopWords("/path/to/custom/stopwords.txt")
463 | csw.stopwords # a set of the custom stopwords
464 |
465 | csw.zh == chinese_stopwords # Chinese stopwords
466 | csw.en == english_stopwords # English stopwords
467 | ```
468 |
469 |
470 | ### 文本长度
471 |
472 | ```python
473 | from pnlp import Length
474 |
475 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
476 |
477 | pl = Length(text)
478 | # 注意:即使使用了 pattern,长度都是基于原始文本
479 | # 长度基于字符计数(不是整词)
480 | print("Length of all characters: ", pl.len_all)
481 | print("Length of all non-white characters: ", pl.len_nwh)
482 | print("Length of all Chinese characters: ", pl.len_chi)
483 | print("Length of all words and numbers: ", pl.len_wnb)
484 | print("Length of all punctuations: ", pl.len_pun)
485 | print("Length of all English characters: ", pl.len_eng)
486 | print("Length of all numbers: ", pl.len_num)
487 |
488 | """
489 | Length of all characters: 64
490 | Length of all non-white characters: 63
491 | Length of all Chinese characters: 6
492 | Length of all words and numbers: 41
493 | Length of all punctuations: 14
494 | Length of all English characters: 32
495 | Length of all numbers: 3
496 | """
497 | ```
498 |
499 | ### 魔术方法
500 |
501 | #### 字典
502 |
503 | ```python
504 | from pnlp import MagicDict
505 |
506 | # 嵌套字典
507 | pmd = MagicDict()
508 | pmd['a']['b']['c'] = 2
509 | print(pmd)
510 |
511 | """
512 | {'a': {'b': {'c': 2}}}
513 | """
514 |
515 | # 当字典被翻转时,保留所有的重复 value-keys
516 | dx = {1: 'a',
517 |       2: 'a',
518 |       3: 'a',
519 |       4: 'b' }
520 | print(MagicDict.reverse(dx))
521 |
522 | """
523 | {'a': [1, 2, 3], 'b': 4}
524 | """
525 | ```
526 |
527 | #### 获取唯一文件名
528 |
529 | ```python
530 | from pnlp import get_unique_fn
531 |
532 | get_unique_fn("a/b/c.md") == "a_b_c.md"
533 | ```
534 |
535 | ### 并行处理
536 |
537 | 支持四种并行处理方式:
538 |
539 | - 线程池:`thread_pool`
540 | - 进程池:`process_pool`
541 | - 线程 Executor:`thread_executor`,默认使用
542 | - 线程:`thread`
543 |
544 | 注意:惰性处理,返回的是 Generator。
545 |
546 | ```python
547 | import math
548 | def is_prime(x):
549 |     if x < 2:
550 |         return False
551 |     for i in range(2, int(math.sqrt(x)) + 1):
552 |         if x % i == 0:
553 |             return False
554 |     return True
555 |
556 | from pnlp import concurring
557 |
558 | # max_workers 默认为 4
559 | @concurring
560 | def get_primes(lst):
561 |     res = []
562 |     for i in lst:
563 |         if is_prime(i):
564 |             res.append(i)
565 |     return res
566 |
567 | @concurring(type="thread_pool", max_workers=10)
568 | def get_primes(lst):
569 |     pass
570 | ```
571 |
572 | `concurring` 装饰器让你的迭代函数并行。
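下面是一个消费返回结果的最小示意(沿用上面被装饰的 `get_primes`;这里假设 Generator 每次产出一个 worker 返回的列表,故将其摊平):

```python
# 惰性消费:迭代 Generator 并摊平各个 worker 的结果(示意)
primes = []
for chunk in get_primes(list(range(100))):
    primes.extend(chunk)
```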
573 |
574 | ### 后台处理
575 |
576 | ```python
577 | from pnlp import run_in_new_thread
578 |
579 | def func(file, a, b, c):
580 |     background_task()
581 |
582 | run_in_new_thread(func, file, 1, 2, 3)
583 | ```
584 |
585 | ## 测试
586 |
587 | Clone 仓库后执行:
588 |
589 | ```bash
590 | $ python -m pytest
591 | ```
592 |
593 | ## 更新日志
594 |
595 | 见英文版 README。
596 |
597 |
598 |
599 |
--------------------------------------------------------------------------------
/README_EN.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
4 |
5 | - [Features](#features)
6 | - [Install](#install)
7 | - [Usage](#usage)
8 |   - [Iopipe](#iopipe)
9 |     - [IO process](#io-process)
10 |     - [Built-in Method](#built-in-method)
11 |   - [Text](#text)
12 |     - [Clean and Extract](#clean-and-extract)
13 |     - [Regex](#regex)
14 |   - [Cut](#cut)
15 |     - [AnypartCut](#anypartcut)
16 |     - [SentenceCut](#sentencecut)
17 |     - [SubSentenceCut and Combine by threshold](#subsentencecut-and-combine-by-threshold)
18 |     - [ChineseCharCut](#chinesecharcut)
19 |     - [CombineBucket](#combinebucket)
20 |   - [Enhancement](#enhancement)
21 |     - [TokenLevel](#tokenlevel)
22 |     - [SentenceLevel](#sentencelevel)
23 |   - [Normalization](#normalization)
24 |     - [ChineseNumber](#chinesenumber)
25 |   - [Transformation](#transformation)
26 |     - [BIO2Entity](#bio2entity)
27 |     - [Parameters2uuid](#parameters2uuid)
28 |   - [Built-in Dicts](#built-in-dicts)
29 |     - [StopWords](#stopwords)
30 |   - [Length](#length)
31 |   - [Magic](#magic)
32 |   - [Concurring](#concurring)
33 | - [Test](#test)
34 | - [ChangeLog](#changelog)
35 |
36 |
37 |
38 | This is a pre/post-processing tool for NLP.
39 |
40 | ## Features
41 |
42 | - A flexible pipeline for text IO
43 | - A flexible tool for text cleaning and extraction
44 | - Text enhancement
45 | - Sentence cut and Chinese character cut
46 | - Text bucket
47 | - Chinese character normalization
48 | - Several kinds of length calculation
49 | - Stopwords
50 | - Some magic usage in pre-processing
51 | - Tools like concurring and batch generation
52 |
53 | ## Install
54 |
55 | Needs Python 3.7+.
56 |
57 | `pip install pnlp`
58 |
59 | ## Usage
60 |
61 | ### Iopipe
62 |
63 | #### IO process
64 |
65 | ```bash
66 | tree tests/piop_data/
67 | ├── a.md
68 | ├── b.txt
69 | ├── c.data
70 | ├── first
71 | │   ├── fa.md
72 | │   ├── fb.txt
73 | │   ├── fc.data
74 | │   └── second
75 | │       ├── sa.md
76 | │       ├── sb.txt
77 | │       └── sc.data
78 | ├── json.json
79 | ├── outfile.file
80 | ├── outjson.json
81 | └── yml.yml
82 | ```
83 |
84 | ```python
85 | import os
86 | from pnlp import Reader
87 |
88 | DATA_PATH = "./pnlp/tests/piop_data/"
89 | pattern = '*.md' # could also be '*.txt', 'f*.*', etc.; regex is supported
90 | reader = Reader(pattern, use_regex=True)
91 |
92 | # Get lines of all files in one directory with line index and file name
93 | for line in reader(DATA_PATH):
94 |     print(line.lid, line.fname, line.text)
95 | """
96 | 0 a.md line 1 in a.
97 | 1 a.md line 2 in a.
98 | 2 a.md line 3 in a.
99 | 0 fa.md line 1 in fa.
100 | 1 fa.md line 2 in fa
101 | ...
102 | """
103 |
104 | # Get lines of one file with line index and file name
105 | # if a file is read, the `pattern` is not effective
106 | for line in reader(os.path.join(DATA_PATH, "a.md")):
107 |     print(line.lid, line.fname, line.text)
108 | """
109 | 0 a.md line 1 in a.
110 | 1 a.md line 2 in a.
111 | 2 a.md line 3 in a.
112 | """
113 |
114 |
115 |
116 | # Get all filepaths in one directory
117 | for path in Reader.gen_files(DATA_PATH, pattern):
118 |     print(path)
119 | """
120 | pnlp/tests/piop_data/a.md
121 | pnlp/tests/piop_data/first/fa.md
122 | pnlp/tests/piop_data/first/second/sa.md
123 | """
124 |
125 | # Get content (article) of all files in one directory with file name
126 | paths = Reader.gen_files(DATA_PATH, pattern)
127 | articles = Reader.gen_articles(paths)
128 | for article in articles:
129 |     print(article.fname)
130 |     print(article.f.read())
131 | """
132 | a.md
133 | line 1 in a.
134 | line 2 in a.
135 | line 3 in a.
136 |
137 | """
138 |
139 | # Get lines of all files in one directory with line index and file name
140 | # the same as reader(DATA_PATH) above
141 | paths = Reader.gen_files(DATA_PATH, pattern)
142 | articles = Reader.gen_articles(paths)
143 | for line in Reader.gen_flines(articles):
144 |     print(line.lid, line.fname, line.text)
145 | ```
146 |
147 | #### Built-in Method
148 |
149 | ```python
150 | import pnlp
151 |
152 | # Read
153 | file_string = pnlp.read_file(file_path)
154 | file_list = pnlp.read_lines(file_path)
155 | file_json = pnlp.read_json(file_path)
156 | file_yaml = pnlp.read_yaml(file_path)
157 | file_csv = pnlp.read_csv(file_path)
158 | file_pickle = pnlp.read_pickle(file_path)
159 | list_dict = pnlp.read_file_to_list_dict(file_path)
160 |
161 | # Write
162 | pnlp.write_json(file_path, data, indent=2)
163 | pnlp.write_file(file_path, data)
164 | pnlp.write_pickle(file_path, data)
165 | pnlp.write_list_dict_to_file(file_path, data)
166 |
167 | # Others
168 | pnlp.check_dir(dirname) # will make dirname if not exist
169 | ```
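A minimal round-trip sketch of the read/write pair above (the `./tmp.json` path is only an assumption for illustration):

```python
import pnlp

# Write a dict as JSON, then read it back (illustrative only)
data = {"name": "pnlp", "tags": ["nlp", "tool"]}
pnlp.write_json("./tmp.json", data, indent=2)
assert pnlp.read_json("./tmp.json") == data
```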
170 |
171 | ### Text
172 |
173 | #### Clean and Extract
174 |
175 | ```python
176 | import re
177 |
178 | # Use Text
179 | from pnlp import Text
180 |
181 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
182 | pattern = re.compile(r'\d+')
183 |
184 | # pattern is re.Pattern or str type
185 | # Default is '', meaning no pattern is used (actually re.compile(r'.+')); in this case clean returns nothing and extract returns the origin.
186 | # If pattern is a string, a built-in pattern will be used; there are 11 types:
187 | # 'chi': Chinese character
188 | # 'pun': Punctuations
189 | # 'whi': White space
190 | # 'nwh': Non White space
191 | # 'wnb': Word and number
192 | # 'nwn': Non word and number
193 | # 'eng': English character
194 | # 'num': Number
195 | # 'pic': Pictures
196 | # 'lnk': Links
197 | # 'emj': Emojis
198 |
199 | pt = Text(['chi', pattern])
200 | # pt.extract will return matches and their locations
201 | res = pt.extract(text)
202 | print(res)
203 | """
204 | {'text': '这是长度测试233', 'mats': ['这是', '长度测试', '233'], 'locs': [(0, 2), (22, 26), (60, 63)]}
205 | """
206 | # support use dot to get the key field
207 | print(res.text, res.mats, res.locs)
208 | """
209 | '这是长度测试' ['这是', '长度测试'] [(0, 2), (22, 26)]
210 | """
211 | # pt.clean will return cleaned text using the pattern
212 | print(pt.clean(text))
213 | """
214 | https://www.yam.gift,《 》*)FSJfdsjf😁![](http://xx.jpg)。233.
215 | """
216 |
217 | pt = Text(['pic', 'lnk'])
218 | res = pt.extract(text)
219 | print(res.mats)
220 | """
221 | ['https://www.yam.gif',
222 | '![](http://xx.jpg)',
223 | 'https://www.yam.gift',
224 | 'http://xx.jpg']
225 | """
226 | print(pt.clean(text))
227 | """
228 | 这是t长度测试,《 》*)FSJfdsjf😁。233.
229 | """
230 | ```
231 |
232 | #### Regex
233 |
234 | ```python
235 | # USE Regex
236 | from pnlp import reg
237 | def clean_text(text: str) -> str:
238 |     text = reg.pwhi.sub("", text)
239 |     text = reg.pemj.sub("", text)
240 |     text = reg.ppic.sub("", text)
241 |     text = reg.plnk.sub("", text)
242 |     return text
243 | ```
244 |
245 | ### Cut
246 |
247 | #### AnypartCut
248 |
249 | ```python
250 | import re
251 | from pnlp import cut_part, psent
252 | text = "你好!欢迎使用。"
253 | sent_list = cut_part(text, psent, with_spliter=True, with_offset=False)
254 | print(sent_list)
255 | """
256 | ['你好!', '欢迎使用。']
257 | """
258 | pcustom_sent = re.compile(r'[。!]')
259 | sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=False)
260 | print(sent_list)
261 | """
262 | ['你好', '欢迎使用']
263 | """
264 | sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=True)
265 | print(sent_list)
266 | """
267 | [('你好', 0, 3), ('欢迎使用', 3, 8)]
268 | """
269 | ```
270 |
271 | #### SentenceCut
272 |
273 | ```python
274 | # Cut Sentence
275 | from pnlp import cut_sentence as pcs
276 | text = "你好!欢迎使用。"
277 | sent_list = pcs(text)
278 | print(sent_list)
279 | """
280 | ['你好!', '欢迎使用。']
281 | """
282 | ```
283 |
284 | #### SubSentenceCut and Combine by threshold
285 |
286 | ```python
287 | from pnlp import cut_sub_sentence as pcss
288 | text = "你好!你好。你好?你坏~欢迎使用。"
289 | sent_list = pcss(text)
290 | print(sent_list)
291 | """
292 | ['你好!', '你好。', '你好?', '你坏~', '欢迎使用。']
293 | """
294 | sent_list = pcss(text, 6)
295 | print(sent_list)
296 | """
297 | ['你好!你好。', '你好?你坏~', '欢迎使用。']
298 | """
299 | sent_list = pcss(text, 12)
300 | print(sent_list)
301 | """
302 | ['你好!你好。你好?你坏~', '欢迎使用。']
303 | """
304 | ```
305 |
306 | This is very useful in some places, you know ;)
307 |
308 | #### ChineseCharCut
309 |
310 | ```python
311 | # Cut to Chinese chars
312 | from pnlp import cut_zhchar
313 | text = "你好,hello, 520 i love u. = ”我爱你“。"
314 | char_list = cut_zhchar(text)
315 | print(char_list)
316 | """
317 | ['你', '好', ',', 'hello', ',', ' ', '520', ' ', 'i', ' ', 'love', ' ', 'u', '.', ' ', '=', ' ', '”', '我', '爱', '你', '“', '。']
318 | """
319 | char_list = cut_zhchar(text, remove_blank=True)
320 | print(char_list)
321 | """
322 | ['你', '好', ',', 'hello', ',', '520', 'i', 'love', 'u', '.', '=', '”', '我', '爱', '你', '“', '。']
323 | """
324 | ```
325 |
326 | #### CombineBucket
327 |
328 | ```python
329 | from pnlp import combine_bucket
330 | parts = [
331 |     "先生,那夜,我因胸中纳闷,无法入睡,",
332 |     "折腾得比那铐了脚镣的叛变水手还更难过;",
333 |     "那时,我就冲动的 ——",
334 |     "好在有那一时之念,",
335 |     "因为有时我们在无意中所做的事能够圆满……"
336 | ]
337 | buckets = combine_bucket(parts.copy(), 10, truncate=True, keep_remain=True)
338 | print(buckets)
339 | """
340 | ['先生,那夜,我因胸中',
341 | '纳闷,无法入睡,',
342 | '折腾得比那铐了脚镣的',
343 | '叛变水手还更难过;',
344 | '那时,我就冲动的 —',
345 | '—',
346 | '好在有那一时之念,',
347 | '因为有时我们在无意中',
348 | '所做的事能够圆满……']
349 | """
350 | ```
351 |
352 | ### Enhancement
353 |
354 | The samplers support delete, swap, and insert operations; no operation crosses punctuation.
355 |
356 | #### TokenLevel
357 |
358 | - It uses a default tokenizer for Chinese (Chinese Char Tokenizer) and English (Simple Whitespace Tokenizer).
359 | - The tokenizer could be any one you like, but the output should be a list of tokens or a list of tuple pairs, each pair including a token and a part-of-speech.
360 | - It uses `stopwords` as the default sample words and function part-of-speech as the default sample POS. This means we only sample tokens that are in the sample words, or whose POS is in the sample POS (if they have one). You could customize both as you like; a sketch of a POS-aware tokenizer follows the example below.
361 |
362 | ```python
363 | # tokens in 【】 are the ones operated on
364 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。"
365 | # TokenLevel
366 | from pnlp import TokenLevelSampler
367 | tls = TokenLevelSampler()
368 | tls.make_samples(text)
369 | """
370 | {'delete': '人为什么活着?生而为人必须要【有】梦想!还要有尽可能多的精神体验。',
371 | 'swap': '【为】【人】什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。',
372 | 'insert': '人为什么活着?生而为人必须要有梦想!【还】还要有尽可能多的精神体验。',
373 | 'together': '人什么着着活?生而必为为须要有梦想!还要有尽可能多的精神体验。'}
374 | """
375 | # tokenizer is supported
376 | tls.make_samples(text, jieba.lcut)
377 | """
378 | {'delete': '人为什么活着?生而为人【必须】要有梦想!还要有尽可能多的精神体验。',
379 | 'swap': '【为什么】【人】活着?生而为人必须要有梦想!还要有尽可能多的精神体验。',
380 | 'insert': '人为什么活着?生而为人必须要有梦想!【还要】还要有尽可能多的精神体验。',
381 | 'together': '人为什么活着?生而为人人要有梦想!还要有多尽可能的精神体验。'}
382 | """
383 | # customize the sampler
384 | tls = TokenLevelSampler(
385 |     rate=replace_rate, # default 5%
386 |     types=["delete", "swap", "insert"], # the default three
387 |     sample_words=["w1", "w2"], # default is the stopwords
388 |     sample_pos=["pos1", "pos2"], # default is the functional POS (d p c u xc): adverbs, prepositions, conjunctions, auxiliaries, and other function words
389 | )
390 | ```
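As a sketch of the `(token, pos)` contract mentioned above, here is a POS-aware tokenizer built on `jieba.posseg` (this assumes `jieba` is installed; `pos_tokenizer` is our own illustrative name, not part of pnlp):

```python
import jieba.posseg as pseg

def pos_tokenizer(text):
    # Output contract: a list of (token, pos_flag) tuples.
    return [(pair.word, pair.flag) for pair in pseg.lcut(text)]

# With POS pairs, sampling follows `sample_pos` (d p c u xc by default).
tls.make_samples(text, pos_tokenizer)
```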
391 |
392 | #### SentenceLevel
393 |
394 | ```python
395 | from pnlp import SentenceLevelSampler
396 | sls = SentenceLevelSampler()
397 | sls.make_samples(text)
398 | """
399 | {'delete': '生而为人必须要有梦想!还要有尽可能多的精神体验。',
400 | 'swap': '人为什么活着?还要有尽可能多的精神体验。生而为人必须要有梦想!',
401 | 'insert': '人为什么活着?还要有尽可能多的精神体验。生而为人必须要有梦想!生而为人必须要有梦想!',
402 | 'together': '生而为人必须要有梦想!人为什么活着?人为什么活着?'}
403 | """
404 | # customize the sampler
405 | sls = SentenceLevelSampler(types=["delete", "swap", "insert"]) # the default three
406 | ```
407 |
408 | ### Normalization
409 |
410 | #### ChineseNumber
411 |
412 | ```python
413 | from pnlp import num_norm
414 | num_norm.num2zh(1024) == "一千零二十四"
415 | num_norm.num2zh(1024).to_money() == "壹仟零贰拾肆"
416 | num_norm.zh2num("一千零二十四") == 1024
417 | ```
418 |
419 | ### Transformation
420 |
421 | #### BIO2Entity
422 |
423 | ```python
424 | # BIO tokens to entities
425 | from pnlp import pick_entity_from_bio_labels
426 | pairs = [('天', 'B-LOC'), ('安', 'I-LOC'), ('门', 'I-LOC'), ('有', 'O'), ('毛', 'B-PER'), ('主', 'I-PER'), ('席', 'I-PER')]
427 | pick_entity_from_bio_labels(pairs)
428 | """
429 | [('天安门', 'LOC'), ('毛主席', 'PER')]
430 | """
431 | pick_entity_from_bio_labels(pairs, with_offset=True)
432 | """
433 | [('天安门', 'LOC', 0, 3), ('毛主席', 'PER', 4, 7)]
434 | """
435 | ```
436 |
437 | #### Parameters2uuid
438 |
439 | ```python
440 | from pnlp import generate_uuid
441 |
442 | uid1 = generate_uuid("a", 1, 0.02)
443 | uid2 = generate_uuid("a", 1)
444 | """
445 | uid1 == 3fbc8b70d05b5abdb5badca1d26e1dbd
446 | uid2 == f7b0ffc589e453e88d4faf66eb92f669
447 | """
448 | ```
449 |
450 | ### Built-in Dicts
451 |
452 | #### StopWords
453 |
454 | ```python
455 | from pnlp import StopWords, chinese_stopwords, english_stopwords
456 |
457 | csw = StopWords("/path/to/custom/stopwords.txt")
458 | csw.stopwords # a set of the custom stopwords
459 |
460 | csw.zh == chinese_stopwords # Chinese stopwords
461 | csw.en == english_stopwords # English stopwords
462 | ```
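A minimal sketch of a typical use, dropping stopwords from a token list (the token list here is made up for illustration):

```python
from pnlp import chinese_stopwords

tokens = ["我", "喜欢", "这个", "工具"]
# Keep only tokens that are not in the built-in stopword set.
content_tokens = [t for t in tokens if t not in chinese_stopwords]
```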
463 |
464 |
465 | ### Length
466 |
467 | ```python
468 | from pnlp import Length
469 |
470 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
471 |
472 | pl = Length(text)
473 | # Note that even if a pattern is used, the length is always based on the raw text.
474 | # Length is counted by character, not by entire words or numbers.
475 | print("Length of all characters: ", pl.len_all)
476 | print("Length of all non-white characters: ", pl.len_nwh)
477 | print("Length of all Chinese characters: ", pl.len_chi)
478 | print("Length of all words and numbers: ", pl.len_wnb)
479 | print("Length of all punctuations: ", pl.len_pun)
480 | print("Length of all English characters: ", pl.len_eng)
481 | print("Length of all numbers: ", pl.len_num)
482 |
483 | """
484 | Length of all characters: 64
485 | Length of all non-white characters: 63
486 | Length of all Chinese characters: 6
487 | Length of all words and numbers: 41
488 | Length of all punctuations: 14
489 | Length of all English characters: 32
490 | Length of all numbers: 3
491 | """
492 | ```
493 |
494 | ### Magic
495 |
496 | #### MagicDict
497 |
498 | ```python
499 | from pnlp import MagicDict
500 |
501 | # Nest dict
502 | pmd = MagicDict()
503 | pmd['a']['b']['c'] = 2
504 | print(pmd)
505 |
506 | """
507 | {'a': {'b': {'c': 2}}}
508 | """
509 |
510 | # Preserve all repeated value-keys when a Dict is reversed.
511 | dx = {1: 'a',
512 |       2: 'a',
513 |       3: 'a',
514 |       4: 'b' }
515 | print(MagicDict.reverse(dx))
516 |
517 | """
518 | {'a': [1, 2, 3], 'b': 4}
519 | """
520 | ```
521 |
522 | #### GetUniqueFileName
523 |
524 | ```python
525 | from pnlp import get_unique_fn
526 |
527 | get_unique_fn("a/b/c.md") == "a_b_c.md"
528 | ```
529 |
530 | ### Concurring
531 |
532 | Support 4 types of concurring:
533 |
534 | - `thread_pool`
535 | - `process_pool`
536 | - `thread_executor`, the default
537 | - `thread`
538 |
539 | Note that processing is lazy: generators are returned.
540 |
541 | ```python
542 | import math
543 | def is_prime(x):
544 |     if x < 2:
545 |         return False
546 |     for i in range(2, int(math.sqrt(x)) + 1):
547 |         if x % i == 0:
548 |             return False
549 |     return True
550 |
551 | from pnlp import concurring
552 |
553 | # the default value of `max_workers` is 4
554 | @concurring
555 | def get_primes(lst):
556 |     res = []
557 |     for i in lst:
558 |         if is_prime(i):
559 |             res.append(i)
560 |     return res
561 |
562 | @concurring(type="thread", max_workers=10)
563 | def get_primes(lst):
564 |     pass
565 | ```
566 |
567 | The `concurring` wrapper simply makes your original function run concurrently.
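A minimal sketch of consuming the lazy result (reusing the decorated `get_primes` above, and assuming the generator yields each worker's returned list):

```python
# Iterate the generator and flatten the per-worker results.
primes = []
for chunk in get_primes(list(range(100))):
    primes.extend(chunk)
```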
568 |
569 | ### Background
570 |
571 | ```python
572 | from pnlp import run_in_new_thread
573 |
574 | def func(file, a, b, c):
575 |     background_task()
576 |
577 | run_in_new_thread(func, file, 1, 2, 3)
578 | ```
579 |
580 | ## Test
581 |
582 | Clone the repo and run:
583 |
584 | ```bash
585 | $ python -m pytest
586 | ```
587 |
588 | ## ChangeLog
589 |
590 | **v0.4.16**
591 |
592 | Fix: `read_json` defaults to `UTF-8`.
593 |
594 | **v0.4.14-15**
595 |
596 | Fix: number as part of a sub-sentence.
597 |
598 | **v0.4.13**
599 |
600 | Feat: background task and a magic helper to get a unique file name from a file path.
601 |
602 | **v0.4.12**
603 |
604 | Feat: sub-sentence cut and combine with a given threshold.
605 |
606 | **v0.4.10**
607 |
608 | Fix: Chinese stopwords reading, `piop.gen_files` regex.
609 |
610 | **v0.4.9**
611 |
612 | Add: `generate_uuid` from arbitrary parameters.
613 |
614 | **v0.4.8**
615 |
616 | Opt: `read_lines` API can read only `count` lines of a text file.
617 |
618 | **v0.4.7**
619 |
620 | Add `write_list_dict_to_file` and `read_file_to_list_dict`.
621 |
622 | **v0.4.6**
623 |
624 | Fix regex: `-` should be escaped when used as a string literal.
625 |
626 | **v0.4.5**
627 |
628 | Add loc to bio label => entity.
629 |
630 | **v0.4.3**
631 |
632 | Adjust `Reader` init parameters.
633 |
634 | **v0.4.2**
635 |
636 | Add bio label => entity.
637 |
638 | **v0.4.1**
639 |
640 | Remove annotation `re.Pattern`.
641 |
642 | **v0.4.0**
643 |
644 | Use dataclasses the right way.
645 |
646 |
647 | **v0.3.11**
648 |
649 | Adjust `MagicDict` and `check_dir`.
650 |
651 | **v0.3.10**
652 |
653 | Fix piop `strip`.
654 |
655 | **v0.3.9**
656 |
657 | `Reader` supports regex.
658 |
659 | **v0.3.8**
660 |
661 | Fix `concurring` for multiprocessing.
662 |
663 | **v0.3.7**
664 |
665 | Add concurring and batch generators.
666 |
667 | **v0.3.5**
668 |
669 | Add text enhancement.
670 |
671 | **v0.3.3/4**
672 |
673 | Fix URL link and picture `Regex` patterns.
674 |
675 | **v0.3.2**
676 |
677 | Fix `cut_part` for sentences ending with a white space and a full stop.
678 |
679 | **v0.3.1**
680 |
681 | Add `cut_part` to cut text into parts by a given regex pattern; add `combine_bucket` to combine parts into buckets by a given threshold (length).
682 |
683 | **v0.3.0**
684 |
685 | Update `cut_sentence`; add `NumNorm`.
686 |
687 | **v0.28-29**
688 |
689 | Update `cut_zhchar`.
690 |
691 | **v0.27**
692 |
693 | Add `cut_zhchar`.
694 |
695 | **v0.26**
696 |
697 | Add `read_csv`; remove `;` as a sentence delimiter.
698 |
699 | **v0.25**
700 |
701 | Add `stop_words`.
702 |
703 | **v0.24**
704 |
705 | Fix `read_json`.
706 |
707 | **v0.23**
708 |
709 | Fix `Text` default rule.
710 |
711 | **v0.22**
712 |
713 | Make `Text` more convenient to use.
714 |
715 | **v0.21**
716 |
717 | Add `cut_sentence` method.
718 |
719 | **v0.20**
720 |
721 | Optimize several interfaces and make `Text` accept a list of regular expression patterns.
722 | 723 | -------------------------------------------------------------------------------- /pnlp/__init__.py: -------------------------------------------------------------------------------- 1 | from pnlp.piop import read_file, read_lines, read_json, read_yaml, read_csv, read_pickle, read_file_to_list_dict 2 | from pnlp.piop import write_file, write_json, write_pickle, check_dir, write_list_dict_to_file 3 | from pnlp.pcut import cut_sentence, cut_sub_sentence, cut_zhchar, cut_part, combine_bucket 4 | from pnlp.pcut import psent, psubsent 5 | 6 | from pnlp.piop import Reader, Dict 7 | from pnlp.ptxt import Regex, Text, Length 8 | from pnlp.pnorm import NumNorm 9 | from pnlp.penh import TokenLevelSampler, SentenceLevelSampler 10 | from pnlp.ptrans import pick_entity_from_bio_labels, generate_uuid 11 | from pnlp.pmag import MagicDict, get_unique_fn 12 | from pnlp.stopwords import StopWords 13 | from pnlp.stopwords import chinese_stopwords, english_stopwords 14 | 15 | from pnlp.utils import pstr, concurring, divide2int, run_in_new_thread 16 | from pnlp.utils import generate_batches_by_num, generate_batches_by_size 17 | 18 | 19 | num_norm = NumNorm() 20 | reg = Regex() 21 | reader = Reader() 22 | tlsampler = TokenLevelSampler() 23 | slsampler = SentenceLevelSampler() 24 | 25 | 26 | __title__ = "pnlp" 27 | __version__ = "0.4.16" 28 | __author__ = "Yam" 29 | __license__ = "Apache-2.0" 30 | __copyright__ = "Copyright 2019, 2020, 2021, 2022, 2023, 2024 Yam" 31 | __all__ = [ 32 | "Reader", 33 | "Text", "Regex", "Length", 34 | "MagicDict", 35 | "NumNorm", 36 | "StopWords", 37 | "TokenLevelSampler", "SentenceLevelSampler" 38 | ] 39 | -------------------------------------------------------------------------------- /pnlp/pcut.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import re 3 | from pnlp.ptxt import Regex 4 | from pnlp.utils import pstr 5 | 6 | psent = re.compile( 7 | r''' 8 | \n+ 9 | | 10 | [。.!!??…]+[”][。.!!??…~~]? 11 | | 12 | (?<=[ \u3000a-zA-Z"”》))〉〕】>」』\u4e00-\u9fa5])[。.!!??…~~]+ 13 | ''', re.UNICODE | re.VERBOSE) 14 | psubsent = re.compile( 15 | r''' 16 | \n+ 17 | | 18 | [。.!!??…]+[”][。.!!??…~~]? 19 | | 20 | (?<=[ \u3000a-zA-Z"”》))〉〕】>」』\u4e00-\u9fa5])[,,、::;;。.!!??…~~]+ 21 | | 22 | \d+[,,]+ 23 | ''', re.UNICODE | re.VERBOSE) 24 | # referenced from jieba 25 | punzh = pstr(Regex.pun_zh) - "-" # for minus number eg -2 26 | punen = pstr(Regex.pun_en) - "." 
27 | pun = punzh + punen
28 | pzh = re.compile(rf"([\u4E00-\u9FD5{pun}+#&])", re.UNICODE)
29 | pen = re.compile(r"([a-zA-Z]+)", re.UNICODE)
30 | pskip = re.compile(r"(\s)", re.UNICODE)
31 | pspecial = re.compile(r"([\-.])")  # split to single
32 | pnum = re.compile(
33 |     r"""
34 |     ([\-]?\d{1,}[.]?\d{0,}%)
35 |     |
36 |     ([\-]?\d{1,}[./]?\d{0,})
37 |     """, re.UNICODE | re.VERBOSE)
38 |
39 |
40 | def cut_zhchar(text: str, remove_blank: bool = False) -> list:
41 |     lst = []
42 |     blocks = pzh.split(text)
43 |     for block in blocks:
44 |         if not block:
45 |             continue
46 |         if pzh.match(block):
47 |             for char in block:
48 |                 lst.append(char)
49 |         else:
50 |             skips = pskip.split(block)
51 |             for skip in skips:
52 |                 if pen.search(skip):
53 |                     for en_part in pen.split(skip):
54 |                         if en_part:
55 |                             spe = pspecial.search(en_part)
56 |                             if not spe:
57 |                                 lst.append(en_part)
58 |                             else:
59 |                                 for spe_part in pspecial.split(en_part):
60 |                                     if spe_part:
61 |                                         lst.append(spe_part)
62 |                 elif pnum.search(skip):
63 |                     if skip[-1] != ".":
64 |                         lst.append(skip)
65 |                     else:
66 |                         i = 0
67 |                         while skip[-1] == ".":
68 |                             i += 1
69 |                             skip = skip[:-1]
70 |                         lst.append(skip)
71 |                         for _ in range(i):
72 |                             lst.append(".")
73 |                 else:
74 |                     if remove_blank:
75 |                         skip = pskip.sub("", skip)
76 |                     if skip:
77 |                         lst.append(skip)
78 |     return lst
79 |
80 |
81 | def cut_part(text: str,
82 |              split_pattern,
83 |              with_spliter: bool = True,
84 |              with_offset: bool = False) -> list:
85 |     """
86 |     Cut text to parts by the given Regex Pattern.
87 |
88 |     Parameters
89 |     ----------
90 |     text: raw text.
91 |     split_pattern: how to split text.
92 |     with_spliter: whether the parts contain spliters.
93 |     with_offset: whether the parts contain offsets.
94 |
95 |     Returns
96 |     --------
97 |     out: cut parts.
98 |     """
99 |     spliters = split_pattern.findall(text)
100 |     length = len(spliters)
101 |     lst = []
102 |     start = 0
103 |     for i, part in enumerate(split_pattern.split(text)):
104 |         if i < length:
105 |             if with_spliter:
106 |                 part = part + spliters[i]
107 |                 len_spliter = 0
108 |             else:
109 |                 len_spliter = len(spliters[i])
110 |         else:
111 |             len_spliter = 0
112 |         end = start + len(part) + len_spliter
113 |         if part:
114 |             if with_offset:
115 |                 item = (part, start, end)
116 |             else:
117 |                 item = part
118 |             lst.append(item)
119 |         start = end
120 |     return lst
121 |
122 |
123 | def combine_bucket(parts: list,
124 |                    threshold: int,
125 |                    truncate: bool = False,
126 |                    keep_remain: bool = False) -> list:
127 |     """
128 |     Convert parts to buckets with a given length (threshold).
129 |
130 |     Parameters
131 |     ----------
132 |     parts: the given parts.
133 |     threshold: bucket length.
134 |     truncate: whether to truncate those whose length is bigger than threshold.
135 |     keep_remain: when truncate=True, whether to keep the remaining parts.
136 |
137 |     Returns
138 |     -------
139 |     out: list of buckets.
140 |     """
141 |
142 |     def deal_long_part(part: str) -> list:
143 |         result = []
144 |         if truncate:
145 |             if keep_remain:
146 |                 len_subparts = len(part) // threshold + 1
147 |                 for i in range(len_subparts):
148 |                     sub_part = part[i * threshold:(i + 1) * threshold]
149 |                     if sub_part:
150 |                         result.append(sub_part)
151 |             else:
152 |                 result.append(part[:threshold])
153 |         else:
154 |             result.append(part)
155 |         return result
156 |
157 |     buckets = []
158 |     while parts:
159 |         part = parts.pop(0)
160 |         # directly add to buckets when a part is longer than threshold
161 |         if len(part) > threshold:
162 |             sub_parts = deal_long_part(part)
163 |             buckets.append(sub_parts)
164 |         else:
165 |             while parts and len(part) < threshold:
166 |                 another = parts[0]
167 |                 if len(part + another) > threshold:
168 |                     break
169 |                 else:
170 |                     part += parts.pop(0)
171 |             buckets.append([part])
172 |     result = list(itertools.chain(*buckets))
173 |     return result
174 |
175 |
176 | def cut_sentence(text: str) -> list:
177 |     return cut_part(text, psent, True, False)
178 |
179 |
180 | def cut_sub_sentence(text: str, threshold: int = 0) -> list:
181 |     parts = cut_part(text, psubsent, True, False)
182 |     res = combine_bucket(parts, threshold, False, False)
183 |     return res
--------------------------------------------------------------------------------
/pnlp/penh.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from typing import List, Dict, Callable, Tuple, Optional
3 | from itertools import chain
4 |
5 | import numpy as np
6 |
7 | from pnlp.pcut import cut_zhchar, cut_part, psent, psubsent
8 | from pnlp.ptxt import Regex
9 | from pnlp.stopwords import chinese_stopwords, english_stopwords
10 |
11 | reg = Regex()
12 | STOPWORDS = list(english_stopwords | chinese_stopwords)
13 | # 主要对功能词(与实词对应)采样处理
14 | SAMPLE_WORDS = [w for w in STOPWORDS if reg.pwnb.search(w)]
15 | # 副词、介词、连词、助词、其他虚词
16 | SAMPLE_POS = ["d", "p", "c", "u", "xc"]
17 |
18 |
19 | def swap(lst: list, index: int, start: int, end: int) -> list:
20 |     """Randomly swap the element at `index` with its previous or next neighbour."""
21 |     assert start <= index <= end <= len(lst) - 1
22 |     if (index == start or np.random.rand() < 0.5) and index != end:
23 |         lst[index], lst[index + 1] = lst[index + 1], lst[index]
24 |     else:
25 |         lst[index], lst[index - 1] = lst[index - 1], lst[index]
26 |     return lst
27 |
28 |
29 | class Sampler:
30 |
31 |     def check_types(self):
32 |         default_types = set(("delete", "swap", "insert"))
33 |         for typ in self.types:
34 |             if typ not in default_types:
35 |                 raise ValueError(
36 |                     "pnlp: Type {} is not a valid type.".format(typ))
37 |
38 |
39 | class TokenLevelSampler(Sampler):
40 |     """
41 |     Randomly choose an index, then:
42 |     - Insert a copy of the token. Usually a function word.
43 |     - Delete a token.
44 |     - Swap it with the previous or the next token.
45 |
46 |     Parameters
47 |     -------------
48 |     rate: The sampling rate (each type respectively).
49 |     types: Sampling methods. You should take care of the order.
50 |     sample_words: Words to be used in sampling. Usually stopwords.
51 |     sample_pos: Part-of-speech tags to be used in sampling. Usually function-word POS.
52 |
53 |     Note
54 |     ------
55 |     1. We mainly use stopwords (usually function words) as sample words to do the sampling.
56 |     If you want to sample other kinds of words, set `sample_pos` to what you need;
57 |     your input should then include POS flags.
58 |     2. The order of the `types` will influence the output of `dependent_sampling`.
59 | """ 60 | 61 | def __init__( 62 | self, 63 | rate: float = 0.05, 64 | types: List[str] = ["delete", "swap", "insert"], 65 | sample_words: List[str] = SAMPLE_WORDS, 66 | sample_pos: List[str] = SAMPLE_POS, 67 | ): 68 | self.rate = rate 69 | self.types = types 70 | self.sample_words = sample_words 71 | self.sample_pos = sample_pos 72 | self.len_types = len(types) 73 | self.check_types() 74 | assert self.rate <= 0.1 75 | 76 | def filter_sample_idx( 77 | self, token_list: List[str or Tuple[str, str]]) -> List[int]: 78 | if not token_list: 79 | return [] 80 | if type(token_list[0]) == str: 81 | can_deal_idx = [ 82 | i for (i, w) in enumerate(token_list) if w in self.sample_words 83 | ] 84 | else: 85 | can_deal_idx = [ 86 | i for (i, (w, f)) in enumerate(token_list) 87 | if f in self.sample_pos 88 | ] 89 | return can_deal_idx 90 | 91 | def choose_sample_idx(self, len_parts: int, sample_count: int) -> List[int]: 92 | size = min(len_parts, sample_count) 93 | sample_part_idx = np.random.choice(len_parts, size, 94 | replace=False).tolist() 95 | return sample_part_idx 96 | 97 | def delete_sampling(self, token_list: List[str or Tuple[str, str]], 98 | sample_idx: List[int]) -> List[str or Tuple[str, str]]: 99 | """Simple delete sampling. Delete the tokens in the given indexes.""" 100 | result = [] 101 | for i, token in enumerate(token_list): 102 | if i in sample_idx: 103 | continue 104 | result.append(token) 105 | return result 106 | 107 | def insert_sampling(self, token_list: List[str or Tuple[str, str]], 108 | sample_idx: List[int]) -> List[str or Tuple[str, str]]: 109 | """Simple insert sampling. Insert the tokens in the given indexes.""" 110 | result = [] 111 | for i, token in enumerate(token_list): 112 | if i in sample_idx: 113 | result.append(token) 114 | result.append(token) 115 | return result 116 | 117 | def swap_sampling(self, token_list: List[str or Tuple[str, str]], 118 | sample_idx: List[int]) -> List[str or Tuple[str, str]]: 119 | """Simple swap sampling. Swap the tokens in the given indexes. 120 | DONOT swap start and end. 121 | """ 122 | result = copy.deepcopy(token_list) 123 | end = len(token_list) - 1 124 | for idx in sample_idx: 125 | swap(result, idx, 0, end) 126 | return result 127 | 128 | def _sampling( 129 | self, 130 | type: str, 131 | parts: List[List[str] or List[Tuple[str, str]]], 132 | sample_idx: List[int], 133 | ) -> List[List[str] or List[Tuple[str, str]]]: 134 | """ 135 | Sampling by part, each time deal with a part instead of a token. 
136 | """ 137 | cp_parts = copy.deepcopy(parts) 138 | for j, part in enumerate(cp_parts): 139 | if j not in sample_idx: 140 | continue 141 | can_deal_idx = self.filter_sample_idx(part) 142 | if not can_deal_idx: 143 | continue 144 | deal_idx = np.random.choice(can_deal_idx, 1).tolist()[0] 145 | if type == "delete": 146 | part.remove(part[deal_idx]) 147 | elif type == "insert": 148 | part.insert(deal_idx, part[deal_idx]) 149 | else: 150 | swap(part, deal_idx, can_deal_idx[0], can_deal_idx[-1]) 151 | return cp_parts 152 | 153 | def independent_sampling( 154 | self, token_list: List[str or Tuple[str, str]] 155 | ) -> List[List[str] or List[Tuple[str, str]]]: 156 | result = [] 157 | parts = self.convert_tokens_to_parts_by_nonword(token_list) 158 | len_parts = len(parts) 159 | len_tokens = len(token_list) 160 | sample_count = round(len_tokens * self.rate) * self.len_types 161 | sample_count = max(sample_count, 1) 162 | # 一次采样到位,之后只是分别操作,操作原则上互相不依赖 163 | sample_part_idx = self.choose_sample_idx(len_parts, sample_count) 164 | each_count = len(sample_part_idx) // self.len_types 165 | 166 | for i, typ in enumerate(self.types): 167 | sample_idx = sample_part_idx[i * each_count:(i + 1) * each_count] 168 | new_parts = self._sampling(typ, parts, sample_idx) 169 | sample = list(chain(*new_parts)) 170 | result.append(sample) 171 | return result 172 | 173 | def dependent_sampling( 174 | self, 175 | token_list: List[str or 176 | Tuple[str, str]]) -> List[str or Tuple[str, str]]: 177 | parts = self.convert_tokens_to_parts_by_nonword(token_list) 178 | len_parts = len(parts) 179 | len_tokens = len(token_list) 180 | sample_count = round(len_tokens * self.rate) 181 | sample_count = max(sample_count, 1) 182 | for i, typ in enumerate(self.types): 183 | # 每次重新采样,后面的操作可能会与前面的操作重叠 184 | sample_idx = self.choose_sample_idx(len_parts, sample_count) 185 | parts = self._sampling(typ, parts, sample_idx) 186 | return list(chain(*parts)) 187 | 188 | def convert_tokens_to_parts_by_nonword( 189 | self, token_list: List[str or Tuple[str, str]] 190 | ) -> List[List[str] or List[Tuple[str, str]]]: 191 | parts = [] 192 | tmp = [] 193 | for token in token_list: 194 | tmp.append(token) 195 | word = self.__get_word_from_token(token) 196 | if reg.pnwn.search(word): 197 | parts.append(tmp) 198 | tmp = [] 199 | return parts 200 | 201 | def __get_word_from_token(self, token: str or Tuple[str, str]) -> str: 202 | if type(token) == str: 203 | return token 204 | else: 205 | tup = tuple(token) 206 | return tup[0] 207 | 208 | def __join_tokens(self, token_list: List[str or Tuple[str, str]]) -> str: 209 | result = [] 210 | for token in token_list: 211 | word = self.__get_word_from_token(token) 212 | result.append(word) 213 | return "".join(result) 214 | 215 | def make_samples( 216 | self, 217 | text: str, 218 | tokenizer: Optional[Callable[[str], List[str or 219 | Tuple[str, str]]]] = None, 220 | ) -> Dict[str, str]: 221 | """ 222 | Make negative samples. 223 | 224 | Parameters 225 | ----------- 226 | text: The given text. Usually a sentence. 227 | tokenizer: Input a text, output a List of tokens. A token is a word or a (word, flag) tuple. 228 | 229 | Returns 230 | -------- 231 | output: A dict of different kinds of negative samples. 
232 | """ 233 | if self.len_types == 0: 234 | return {} 235 | if not tokenizer: 236 | if reg.pchi.search(text): 237 | tokenizer = cut_zhchar 238 | else: 239 | 240 | def tokenizer(x): 241 | return x.split() 242 | 243 | tokens = tokenizer(text) 244 | if len(tokens) == 0: 245 | return {} 246 | result = {} 247 | indep_samples = self.independent_sampling(tokens) 248 | dep_sample = self.dependent_sampling(tokens) 249 | if indep_samples: 250 | for i, typ in enumerate(self.types): 251 | new_tokens = indep_samples[i] 252 | result[typ] = self.__join_tokens(new_tokens) 253 | result["together"] = self.__join_tokens(dep_sample) 254 | return result 255 | 256 | 257 | class SentenceLevelSampler(Sampler): 258 | """ 259 | Random choose an index. 260 | - Insert a copy. 261 | - Delete. 262 | - Swap with the prev or the next one. 263 | 264 | We only deal with ONE sentence once. 265 | So you'd better use a paragraph as input. 266 | 267 | Parameters 268 | ----------- 269 | types: Sampling methods. You should take care of the order. 270 | """ 271 | 272 | def __init__(self, types: List[str] = ["delete", "swap", "insert"]): 273 | self.types = types 274 | self.check_types() 275 | 276 | def independent_sampling(self, text_list: List[str]) -> List[List[str]]: 277 | result = [] 278 | text_list = copy.deepcopy(text_list) 279 | length = len(text_list) 280 | for i, typ in enumerate(self.types): 281 | idx = np.random.choice(length, 1).tolist()[0] 282 | if typ == "insert": 283 | new = text_list[:idx] + [text_list[idx]] + text_list[idx:] 284 | elif typ == "delete": 285 | new = [s for (i, s) in enumerate(text_list) if i != idx] 286 | else: 287 | new = swap(text_list, idx, 0, length - 1) 288 | result.append(new) 289 | return result 290 | 291 | def dependent_sampling(self, text_list: List[str]) -> List[str]: 292 | text_list = copy.deepcopy(text_list) 293 | for i, typ in enumerate(self.types): 294 | # 每次重新更新长度 295 | length = len(text_list) 296 | if length == 0: 297 | continue 298 | idx = np.random.choice(length, 1).tolist()[0] 299 | if typ == "insert": 300 | text_list = text_list[:idx] + \ 301 | [text_list[idx]] + text_list[idx:] 302 | elif typ == "delete": 303 | text_list = [s for (i, s) in enumerate(text_list) if i != idx] 304 | else: 305 | text_list = swap(text_list, idx, 0, length - 1) 306 | return text_list 307 | 308 | def make_samples(self, text: str, level: str = "sent") -> List[str]: 309 | """ 310 | Parameters 311 | ------------- 312 | text: The given text. Always a paragraph. 313 | level: The sampling level. Could be one of {"sent", "subsent"}. 314 | 315 | Returns 316 | -------- 317 | output: A dict of different kinds of negative samples. 
318 | """ 319 | if level == "sent": 320 | text_list = cut_part(text, psent) 321 | else: 322 | text_list = cut_part(text, psubsent) 323 | if len(text_list) == 0: 324 | return {} 325 | result = {} 326 | indep_samples = self.independent_sampling(text_list) 327 | dep_sample = self.dependent_sampling(text_list) 328 | if indep_samples: 329 | for i, typ in enumerate(self.types): 330 | new = indep_samples[i] 331 | result[typ] = "".join(new) 332 | result["together"] = "".join(dep_sample) 333 | return result 334 | -------------------------------------------------------------------------------- /pnlp/piop.py: -------------------------------------------------------------------------------- 1 | # from __future__ import annotations 2 | 3 | from typing import List, Dict, Union 4 | from addict import Dict as AdDict 5 | import json 6 | import pickle 7 | import os 8 | import re 9 | import csv 10 | import pathlib 11 | import yaml 12 | 13 | 14 | class Reader: 15 | """ 16 | Parameters 17 | ----------- 18 | pattern: path pattern, support Regex. default "*.*" 19 | use_regex: whether to use Regex to compile the string pattern. default False 20 | """ 21 | 22 | def __init__( 23 | self, 24 | pattern: str = "*.*", 25 | use_regex: bool = False, 26 | strip: str = "\n"): 27 | self.pattern = pattern 28 | self.use_regex = use_regex 29 | self.strip = strip 30 | 31 | def __repr__(self) -> str: 32 | return "Reader(pattern=%r)" % (self.pattern) 33 | 34 | @staticmethod 35 | def gen_files(dirname: str, pattern: str = "*.*", use_regex: bool = False): 36 | """ 37 | Find all filenames in a directory tree that match the filepattern. 38 | If filepath is a file, yield the filepath directly. 39 | """ 40 | if os.path.isfile(dirname): 41 | fpath = pathlib.Path(dirname) 42 | yield fpath 43 | if use_regex: 44 | try: 45 | pat = re.compile(pattern) 46 | except Exception: 47 | raise ValueError("pnlp: invalid pattern: {}".format(pattern)) 48 | 49 | for fpath in pathlib.Path(dirname).rglob("*"): 50 | if pat.search(fpath.name): 51 | yield fpath 52 | else: 53 | for fpath in pathlib.Path(dirname).rglob(pattern): 54 | yield fpath 55 | 56 | @staticmethod 57 | def gen_articles(fpaths: list): 58 | for fpath in fpaths: 59 | with open(fpath, encoding="utf8") as f: 60 | article = AdDict() 61 | article.fname = fpath.name 62 | article.f = f 63 | yield article 64 | 65 | @staticmethod 66 | def gen_flines(articles: list, strip: str = "\n"): 67 | """ 68 | Process each file to lines when io.TextIOWrapper is given. 69 | """ 70 | for article in articles: 71 | lid = 0 72 | for line_content in article.f: 73 | line_content = line_content.strip(strip) 74 | if len(line_content) == 0: 75 | continue 76 | line = AdDict() 77 | line.lid = lid 78 | line.fname = article.fname 79 | line.text = line_content 80 | lid += 1 81 | yield line 82 | 83 | @staticmethod 84 | def gen_plines(fpath: str, strip: str = "\n"): 85 | """ 86 | Process each file to lines when fpath is given. 87 | """ 88 | with open(fpath, encoding="utf8") as f: 89 | for line in f: 90 | line = line.strip(strip) 91 | if len(line) == 0: 92 | continue 93 | yield line 94 | 95 | def __call__(self, dirname: str): 96 | fpaths = Reader.gen_files(dirname, self.pattern, self.use_regex) 97 | articles = Reader.gen_articles(fpaths) 98 | flines = Reader.gen_flines(articles, self.strip) 99 | for line in flines: 100 | yield line 101 | 102 | 103 | def read_file(fpath: str, encoding="utf-8", **kwargs) -> str: 104 | """ 105 | Read file from file path. 106 | 107 | Parameters 108 | ----------- 109 | fpath: str 110 | File path. 
111 | kwargs: optional 112 | Other `open` support params. 113 | 114 | Returns 115 | -------- 116 | data string of the file. 117 | """ 118 | with open(fpath, encoding="utf-8", **kwargs) as f: 119 | data = f.read() 120 | return data 121 | 122 | 123 | def read_lines( 124 | fpath: str, 125 | strip: str = "\n", 126 | count: int = -1, 127 | **kwargs 128 | ) -> List[str]: 129 | """ 130 | Read file with `open` from file path. 131 | 132 | Parameters 133 | ---------- 134 | fpath: str 135 | File path. 136 | strip: str 137 | Strip method, could be strip string or None, default is "\n". 138 | count: int 139 | How many lines to read, default is -1 (all). 140 | kwargs: optional 141 | Other `open` support params. 142 | 143 | Returns 144 | ------- 145 | Lines of the file. 146 | 147 | Notes 148 | ----- 149 | Blank line is ignored as default. 150 | """ 151 | res = [] 152 | i = 0 153 | with open(fpath, **kwargs) as f: 154 | for line in f: 155 | if count >= 0 and i >= count: 156 | break 157 | line = line.strip(strip) 158 | if not line: 159 | continue 160 | res.append(line) 161 | i += 1 162 | return res 163 | 164 | 165 | def read_csv(fpath: str, delimiter: str = ",") -> List: 166 | data = [] 167 | with open(fpath, "r") as f: 168 | fcsv = csv.reader(f, delimiter=delimiter) 169 | for row in fcsv: 170 | data.append(row) 171 | return data 172 | 173 | 174 | def read_json(fpath: str, **kwargs) -> Union[List, Dict]: 175 | with open(fpath, "r", encoding="utf-8") as fin: 176 | data = json.load(fin, **kwargs) 177 | return data 178 | 179 | 180 | def read_yaml(fpath: str) -> Dict: 181 | with open(fpath, "r") as fin: 182 | data = yaml.load(fin, Loader=yaml.SafeLoader) 183 | return data 184 | 185 | 186 | def read_pickle(fpath: str, **kwargs) -> Union[List, Dict]: 187 | with open(fpath, "rb") as f: 188 | data = pickle.load(f, **kwargs) 189 | return data 190 | 191 | 192 | def read_file_to_list_dict(inp_file: str) -> List[Dict]: 193 | res = [] 194 | for line in read_lines(inp_file): 195 | item = json.loads(line.strip()) 196 | res.append(item) 197 | return res 198 | 199 | 200 | def write_list_dict_to_file(out_file: str, data: List[Dict]) -> None: 201 | fo = open(out_file, "w") 202 | for item in data: 203 | line = json.dumps(item, ensure_ascii=False) 204 | fo.write(line) 205 | fo.write("\n") 206 | 207 | 208 | def write_json(fpath: str, data, **kwargs) -> None: 209 | fout = open(fpath, "w") 210 | kwargs["ensure_ascii"] = False 211 | json.dump(data, fout, **kwargs) 212 | fout.close() 213 | 214 | 215 | 216 | def write_file(fpath: str, data, **kwargs) -> None: 217 | with open(fpath, "w", **kwargs) as fout: 218 | for line in data: 219 | fout.write(line + "\n") 220 | 221 | 222 | def write_pickle(fpath: str, data, **kwargs) -> None: 223 | with open(fpath, "wb") as f: 224 | pickle.dump(data, f, **kwargs) 225 | 226 | 227 | def check_dir(*args) -> None: 228 | for dirname in args: 229 | if os.path.exists(dirname): 230 | pass 231 | else: 232 | os.makedirs(dirname) 233 | -------------------------------------------------------------------------------- /pnlp/pmag.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from addict import Dict 3 | from pathlib import Path 4 | 5 | 6 | class MagicDict(Dict): 7 | 8 | def __getitem__(self, item): 9 | try: 10 | return dict.__getitem__(self, item) 11 | except KeyError: 12 | # create a self instance 13 | value = self[item] = type(self)() 14 | return value 15 | 16 | @staticmethod 17 | def reverse(dic): 18 | """ 19 | Preserve all 
repeated value-keys when a Dict is reversed. 20 | 21 | Parameters 22 | ---------- 23 | dic: dict 24 | A dict where several keys have the same values. 25 | 26 | Returns 27 | ------- 28 | Reversed dict of the given dict, but preserve all key-values. 29 | 30 | Example 31 | ------- 32 | dx = { 1: 'a', 33 | 2: 'a', 34 | 3: 'a', 35 | 4: 'b' } 36 | reversedx = { 'a': [1, 2, 3], 37 | 'b': 4 } 38 | """ 39 | d1 = dict(zip(dic.values(), [[] for i in range(len(dic))])) 40 | d2 = dict([ 41 | (y, d1[y].append(x)) 42 | if y in 43 | [w for (w, f) in Counter(dic.values()).items() if f > 1] 44 | else (y, x) 45 | for (x, y) in dic.items()]) 46 | reversdict = dict([(x, d1[x]) if len(d1[x]) != 0 47 | else (x, d2[x]) 48 | for x in d1.keys()]) 49 | return reversdict 50 | 51 | 52 | def get_unique_fn(file_path: str, level=0): 53 | fp = Path(file_path) 54 | fn = fp.name 55 | 56 | file_path = str(file_path).strip("/") 57 | tmp = file_path.split("/")[:-1] 58 | length = len(tmp) 59 | if level > length: 60 | level = length 61 | 62 | if length == 0: 63 | return fn 64 | 65 | path = "_".join(tmp[-level:]) 66 | return "_".join([path, fn]) -------------------------------------------------------------------------------- /pnlp/pnorm.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | 4 | T = TypeVar('T', str, float, int) 5 | 6 | ZH_NUM = { 7 | '〇': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, 8 | '六': 6, '七': 7, '八': 8, '九': 9, '零': 0, 9 | '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, 10 | '陆': 6, '柒': 7, '捌': 8, '玖': 9, '貮': 2, '两': 2, 11 | } 12 | 13 | ZH_UNIT = { 14 | '十': 10, 15 | '拾': 10, 16 | '百': 100, 17 | '佰': 100, 18 | '千': 1000, 19 | '仟': 1000, 20 | '万': 10000, 21 | '萬': 10000, 22 | '亿': 100000000, 23 | '億': 100000000, 24 | '兆': 10000000000000, 25 | } 26 | 27 | 28 | ARB_NUM = { 29 | 0: "零", 30 | 1: "一", 31 | 2: "二", 32 | 3: "三", 33 | 4: "四", 34 | 5: "五", 35 | 6: "六", 36 | 7: "七", 37 | 8: "八", 38 | 9: "九", 39 | 10: "十", 40 | 100: "百", 41 | 1000: "千", 42 | 10000: "万", 43 | 100000000: "亿", 44 | 10000000000000: "兆" 45 | } 46 | 47 | ZH2MONEY = { 48 | "一": "壹", 49 | "二": "贰", 50 | "三": "叁", 51 | "四": "肆", 52 | "五": "伍", 53 | "六": "陆", 54 | "七": "柒", 55 | "八": "捌", 56 | "九": "玖", 57 | "十": "拾", 58 | "百": "佰", 59 | "千": "仟", 60 | "万": "萬", 61 | "亿": "億" 62 | } 63 | 64 | 65 | class pnumstr(str): 66 | 67 | def to_money(self): 68 | for c in self: 69 | mc = ZH2MONEY.get(c) 70 | if mc: 71 | self = self.replace(c, mc) 72 | return self 73 | 74 | 75 | class NumNorm: 76 | """ 77 | Chinese_to_Arabic 78 | modifed from https://github.com/bamtercelboo/corpus_process_script/blob/master/cn_to_arabic/cn_to_arabic.py 79 | """ 80 | @staticmethod 81 | def num_len(num: int) -> int: 82 | if num == 0: 83 | return 1 84 | if num < 0: 85 | num = -num 86 | i = 0 87 | while num != 0: 88 | num //= 10 89 | i += 1 90 | return i 91 | 92 | def num2zh(self, num: int) -> str: 93 | def get_base(num): 94 | zh = ARB_NUM.get(num) 95 | if num < 10: 96 | return zh 97 | else: 98 | return "一" + zh 99 | 100 | def get_less_than_10w(num): 101 | res = "" 102 | while num != 0: 103 | if num < 10: 104 | res += ARB_NUM.get(num) 105 | break 106 | length = NumNorm.num_len(num) 107 | divider = 10 ** (length - 1) 108 | high = num // divider 109 | res += ARB_NUM.get(high) 110 | res += ARB_NUM.get(divider) 111 | num = num % divider 112 | new_len = NumNorm.num_len(num) 113 | if length - new_len > 1 and num != 0: 114 | res += "零" 115 | return res 116 | 117 | def get_interval(num: int, lower: int, unit: str): 118 | res 
= "" 119 | length = NumNorm.num_len(num) 120 | divider = lower / 10 121 | high = num // divider 122 | res = get_less_than_10w(high) 123 | high_len = NumNorm.num_len(high) 124 | res += unit 125 | num -= high * divider 126 | new_len = NumNorm.num_len(num) 127 | if length - high_len - new_len > 0 and num != 0: 128 | res += "零" 129 | return res, num 130 | 131 | def get_10w_to_1y(num): 132 | res, num = get_interval(num, 10**5, "万") 133 | if 0 < num < 100000: 134 | res += get_less_than_10w(num) 135 | return res 136 | 137 | def get_1y_to_1z(num): 138 | res, num = get_interval(num, 10**9, "亿") 139 | if 0 < num < 100000000: 140 | res += get_10w_to_1y(num) 141 | return res 142 | 143 | if num in ARB_NUM: 144 | result = get_base(num) 145 | return pnumstr(result) 146 | # 十万 147 | if num < 10**5: 148 | result = get_less_than_10w(num) 149 | # 一亿 150 | elif num < 10**8: 151 | result = get_10w_to_1y(num) 152 | # 一兆 153 | elif num < 10**13: 154 | result = get_1y_to_1z(num) 155 | else: 156 | result = "超大" 157 | return pnumstr(result) 158 | 159 | def zh2num(self, zh: str) -> T: 160 | unit = 0 161 | digit_list = [] 162 | for zhdigit in reversed(zh): 163 | if zhdigit in ZH_UNIT: 164 | unit = ZH_UNIT.get(zhdigit) 165 | if unit == 10000 or unit == 100000000: 166 | digit_list.append(unit) 167 | unit = 1 168 | else: 169 | digit = ZH_NUM.get(zhdigit) 170 | if unit: 171 | digit *= unit 172 | unit = 0 173 | digit_list.append(digit) 174 | if unit == 10: 175 | digit_list.append(10) 176 | val, tmp = 0, 0 177 | for x in reversed(digit_list): 178 | if x == 10000 or x == 100000000: 179 | val += tmp * x 180 | tmp = 0 181 | else: 182 | tmp += x 183 | val += tmp 184 | if val == 0 and zh != "零": 185 | return zh 186 | else: 187 | return val 188 | -------------------------------------------------------------------------------- /pnlp/ptrans.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import uuid 3 | 4 | 5 | def generate_uuid(*args) -> str: 6 | s = uuid.uuid5( 7 | uuid.NAMESPACE_URL, 8 | " ".join(map(str, args)) 9 | ) 10 | return s.hex 11 | 12 | 13 | def pick_entity_from_bio_labels( 14 | pairs: List[Tuple[str, str]], 15 | with_offset: bool = False 16 | ) -> List[Tuple[str, str]]: 17 | """ 18 | Parameters 19 | ---------- 20 | pairs: List of tuple pairs, each pair contains a token and a bio tag 21 | with_offset: whether to return locations for the entities 22 | 23 | Returns 24 | ------- 25 | List of entity pairs, each pair contains an entity and entity type (migtht also a start and end index) 26 | """ 27 | 28 | def collect(span: List[Tuple[str, str]]): 29 | res = [] 30 | for c, t in span: 31 | if t.endswith("O"): 32 | break 33 | res.append(c) 34 | return "".join(res), span[0][1].split("-")[-1] 35 | 36 | without_lasto = False 37 | if pairs and pairs[-1][1] != "O": 38 | without_lasto = True 39 | pairs.append(("#", "O")) 40 | bidx = [] 41 | for i, (c, t) in enumerate(pairs): 42 | if t.startswith("B-"): 43 | bidx.append(i) 44 | bidx.append(len(pairs) - 1) 45 | res = [] 46 | for i in range(len(bidx) - 1): 47 | start, end = bidx[i], bidx[i + 1] 48 | span = pairs[start: end] 49 | word, tag = collect(span) 50 | if with_offset: 51 | tup = (word, tag, start, start + len(word)) 52 | else: 53 | tup = (word, tag) 54 | res.append(tup) 55 | if without_lasto: 56 | pairs.pop() 57 | return res 58 | -------------------------------------------------------------------------------- /pnlp/ptxt.py: -------------------------------------------------------------------------------- 
1 | from addict import Dict 2 | import re 3 | 4 | 5 | class Regex: 6 | 7 | """ 8 | All kinds of Regular patterns. 9 | """ 10 | 11 | patnames = ["chi", "pun", 12 | "whi", "nwh", 13 | "wnb", "nwn", 14 | "eng", "num", 15 | "pic", "lnk", "emj"] 16 | 17 | pun_zh = r",。;、?!:“”‘’()「」『』〔〕【】《》〈〉…——\-—~~·" 18 | pun_en = r",.;?!\(\)\[\]\{\}<>_" 19 | 20 | @property 21 | def pchi(self): 22 | """ 23 | Chinese char pattern. 24 | """ 25 | _pchi = re.compile(r'[\u4E00-\u9FD5]+') # from jieba 26 | return _pchi 27 | 28 | @property 29 | def ppun(self): 30 | """ 31 | Punctuation pattern. 32 | """ 33 | _ppun = re.compile(rf'[{self.pun_en + self.pun_zh}]+') 34 | return _ppun 35 | 36 | @property 37 | def pwhi(self): 38 | """ 39 | White space pattern. 40 | """ 41 | _pwhi = re.compile(r'\s+') 42 | return _pwhi 43 | 44 | @property 45 | def pnwh(self): 46 | """ 47 | Non-white space pattern. 48 | """ 49 | _pnwh = re.compile(r'\S+') 50 | return _pnwh 51 | 52 | @property 53 | def pwnb(self): 54 | """ 55 | Word and num pattern. 56 | """ 57 | _pwnb = re.compile(r'\w+') 58 | return _pwnb 59 | 60 | @property 61 | def pnwn(self): 62 | """ 63 | Non-alphanumeric char pattern. 64 | """ 65 | _pnwn = re.compile(r'\W+') 66 | return _pnwn 67 | 68 | @property 69 | def peng(self): 70 | """ 71 | English char pattern. 72 | """ 73 | _peng = re.compile(r'[a-zA-Z]+') 74 | return _peng 75 | 76 | @property 77 | def pnum(self): 78 | """ 79 | Number pattern. 80 | 81 | Example 82 | ------- 83 | 2, +2, -2, 2.1, -2.2, 1/5, 2:3, -2/5, 2%, 2.5% 84 | """ 85 | _pnum = re.compile(r''' 86 | [+\-.]?\d+[.:/]?[\d%]+ 87 | | 88 | [+\-.]?\d+(?!\.\w+) 89 | ''', re.UNICODE | re.VERBOSE) 90 | return _pnum 91 | 92 | @property 93 | def ppic(self): 94 | """ 95 | Picture pattern. 96 | """ 97 | _ppic = re.compile(r''' 98 | !\[.*?\]\(.*?\.?(jpeg|png|jpg|gif)?\) 99 | | 100 | https?:\/\/(www\.)?[\-a-zA-Z0-9@:%._\+~#=]{0,256}\.(jpeg|png|jpg|gif) 101 | ''', re.UNICODE | re.VERBOSE) 102 | return _ppic 103 | 104 | @property 105 | def plnk(self): 106 | """ 107 | Link pattern. 108 | """ 109 | _plink = re.compile(r''' 110 | \[.+?\]\(https?:\/\/(www\.)?[\-a-zA-Z0-9@:%._\+~#=]{0,256}\.[a-z]{2,6}\b([\-a-zA-Z0-9@:%_\+.~#?&//=]*)\) 111 | | 112 | https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{0,256}\.[a-z]{2,6}\b([\-a-zA-Z0-9@:%_\+.~#?&//=]*) 113 | | 114 | (https?:\/\/)?www\.[-a-zA-Z0-9@:%._\+~#=]{0,256}\.[a-z]{2,6}\b([\-a-zA-Z0-9@:%_\+.~#?&//=]*) 115 | ''', re.UNICODE | re.VERBOSE) 116 | return _plink 117 | 118 | @property 119 | def pemj(self): 120 | """ 121 | Emoj pattern. 122 | """ 123 | _pemj = re.compile( 124 | r'['u'\U0001F300-\U0001F64F' 125 | u'\U0001F680-\U0001F6FF' 126 | u'\u2600-\u2B55]+') 127 | return _pemj 128 | 129 | @property 130 | def patdict(self): 131 | """ 132 | All patterns. 133 | """ 134 | patterns = [self.pchi, self.ppun, 135 | self.pwhi, self.pnwh, 136 | self.pwnb, self.pnwn, 137 | self.peng, self.pnum, 138 | self.ppic, self.plnk, self.pemj] 139 | _patdict = dict(zip(self.patnames, patterns)) 140 | return _patdict 141 | 142 | 143 | class Text(Regex): 144 | 145 | """ 146 | Text clean, extract and length. 147 | 148 | Parameters 149 | ---------- 150 | pat: list of patterns 151 | Support custom re.Pattern. 152 | Default is re.compile(r'.+'). 
153 | Other str patterns are built-in, including: 154 | 'chi': Chinese character 155 | 'pun': Punctuations 156 | 'whi': White space 157 | 'nwh': Non White space 158 | 'wnb': Word and number 159 | 'nwn': Non word and number 160 | 'eng': English character 161 | 'num': Number 162 | 'pic': Pictures 163 | 'lnk': Links 164 | 'emj': Emojis 165 | 166 | Returns 167 | ------- 168 | A text object. 169 | 170 | Notes 171 | ------ 172 | The pattern order is important: patterns listed first are executed earlier. 173 | """ 174 | 175 | def __init__(self, pattern_list: list = []): 176 | self.pats = [] 177 | for pat in pattern_list: 178 | if isinstance(pat, str): 179 | built_in_pat = self.patdict.get(pat) 180 | if built_in_pat: 181 | self.pats.append(built_in_pat) 182 | else: 183 | raise ValueError( 184 | "pnlp: {} \ 185 | is not a valid built-in pattern.".format(pat)) 186 | elif isinstance(pat, re.Pattern): 187 | self.pats.append(pat) 188 | else: 189 | raise ValueError( 190 | "pnlp: {} is not a valid RE pattern.".format(pat)) 191 | 192 | def __repr__(self) -> str: 193 | return "Text(pattern=%r)" % str(self.pats) 194 | 195 | def clean(self, text: str): 196 | """ 197 | Clean text with the given patterns. 198 | 199 | Returns 200 | ------- 201 | Cleaned text. 202 | """ 203 | for pat in self.pats: 204 | text = pat.sub("", text) 205 | return text 206 | 207 | def extract(self, text: str): 208 | """ 209 | Extract pattern-matching items. 210 | 211 | Returns 212 | ------- 213 | Extracted items and their locations. 214 | """ 215 | mats, locs = [], [] 216 | for pat in self.pats: 217 | for mat in pat.finditer(text): 218 | mats.append(mat.group()) 219 | locs.append(mat.span()) 220 | ext = Dict() 221 | ext.text = "".join(mats) 222 | ext.mats = mats 223 | ext.locs = locs 224 | return ext 225 | 226 | 227 | class Length(Regex): 228 | 229 | def __init__(self, text: str): 230 | self.text = text 231 | 232 | def _len(self, pat): 233 | lst = pat.findall(self.text) 234 | return len("".join(lst)) 235 | 236 | @property 237 | def len_all(self): 238 | """ 239 | Length of all characters. 240 | """ 241 | return len(self.text) 242 | 243 | @property 244 | def len_nwh(self): 245 | """ 246 | Length of non-white characters. 247 | """ 248 | return self._len(self.pnwh) 249 | 250 | @property 251 | def len_chi(self): 252 | """ 253 | Length of pure Chinese characters. 254 | """ 255 | return self._len(self.pchi) 256 | 257 | @property 258 | def len_wnb(self): 259 | """ 260 | Length of characters and numbers. 261 | """ 262 | return self._len(self.pwnb) 263 | 264 | @property 265 | def len_pun(self): 266 | """ 267 | Length of all punctuations. 268 | """ 269 | return self._len(self.ppun) 270 | 271 | @property 272 | def len_eng(self): 273 | """ 274 | Length of English characters. 275 | """ 276 | return self._len(self.peng) 277 | 278 | @property 279 | def len_num(self): 280 | """ 281 | Length of all numbers.
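Example (sketch): Length("大约3.5%的人").len_num == 4, the character count of the matched number "3.5%".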
282 | """ 283 | return self._len(self.pnum) 284 | -------------------------------------------------------------------------------- /pnlp/stopwords/ReadMe.md: -------------------------------------------------------------------------------- 1 | ## English Stopwords 2 | 3 | From https://gist.github.com/sebleier/554280 4 | 5 | ## Chinese Stopwords 6 | 7 | 8 | - 综合多个词表(结巴、哈工大、川大),不包含百度停用词,百度停用词中的词是搜索相关的,有些实词和有意义的词也会在里面,所以准确的角度考虑百度停用词最好不使用。 9 | - 人工进行了部分修改(见下表)。 10 | 11 | ```json 12 | # 删除 13 | 风雨无阻 14 | 奋勇 15 | 16 | # 增加 17 | 即是 18 | 来到 19 | 见到 20 | 异于 21 | 何谓 22 | 没什么 23 | 赶到 24 | 没啥 25 | 123 26 | 只剩 27 | 途中 28 | 只能 29 | 所谓 30 | 看到 31 | 只好 32 | 丢下 33 | 撇下 34 | 看不到 35 | 记得 36 | 任何理由 37 | 最大 38 | ``` 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /pnlp/stopwords/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pnlp.piop import read_lines 3 | 4 | root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 5 | 6 | 7 | chinese_stopwords_file = os.path.join(root, "stopwords/chinese_stopwords.txt") 8 | english_stopwords_file = os.path.join(root, "stopwords/english_stopwords.txt") 9 | 10 | 11 | chinese_stopwords = set(read_lines(chinese_stopwords_file)) 12 | english_stopwords = set(read_lines(english_stopwords_file)) 13 | 14 | 15 | class StopWords: 16 | 17 | def __init__(self, path: str = ""): 18 | self._chinese_stopwords = chinese_stopwords 19 | self._english_stopwords = english_stopwords 20 | if path: 21 | self._stopwords = set(read_lines(path)) 22 | else: 23 | self._stopwords = set() 24 | 25 | @property 26 | def zh(self): 27 | return self._chinese_stopwords 28 | 29 | @property 30 | def zh_len(self): 31 | return len(self._chinese_stopwords) 32 | 33 | @property 34 | def en(self): 35 | return self._english_stopwords 36 | 37 | @property 38 | def en_len(self): 39 | return len(self._english_stopwords) 40 | 41 | @property 42 | def stopwords(self): 43 | return self._stopwords 44 | -------------------------------------------------------------------------------- /pnlp/stopwords/chinese_stopwords.txt: -------------------------------------------------------------------------------- 1 | 2 | ! 3 | " 4 | # 5 | $ 6 | & 7 | ' 8 | ( 9 | ) 10 | * 11 | + 12 | , 13 | - 14 | -- 15 | . 16 | ... 17 | ...... 18 | ................... 19 | ./ 20 | .一 21 | .数 22 | .日 23 | / 24 | // 25 | 0 26 | 1 27 | 2 28 | 3 29 | 4 30 | 5 31 | 6 32 | 7 33 | 8 34 | 9 35 | : 36 | :// 37 | :: 38 | ; 39 | < 40 | = 41 | > 42 | ? 43 | @ 44 | Lex 45 | [ 46 | ] 47 | _ 48 | } 49 | ~~~~ 50 | · 51 | × 52 | ××× 53 | Δ 54 | Ψ 55 | γ 56 | μ 57 | φ 58 | φ. 59 | В 60 | — 61 | —— 62 | ——— 63 | ‘ 64 | ’ 65 | ’‘ 66 | “ 67 | ” 68 | ”, 69 | … 70 | …… 71 | …………………………………………………③ 72 | ′∈ 73 | ′| 74 | ℃ 75 | Ⅲ 76 | ↑ 77 | → 78 | ∈[ 79 | ∪φ∈ 80 | ≈ 81 | ① 82 | ② 83 | ②c 84 | ③ 85 | ③] 86 | ④ 87 | ⑤ 88 | ⑥ 89 | ⑦ 90 | ⑧ 91 | ⑨ 92 | ⑩ 93 | ── 94 | ■ 95 | ▲ 96 | 、 97 | 。 98 | 〉 99 | 《 100 | 》 101 | 》), 102 | 「 103 | 」 104 | 『 105 | 』 106 | 【 107 | 】 108 | 〔 109 | 〕 110 | 〕〔 111 | ㈧ 112 | 一 113 | 一. 
114 | 一一 115 | 一个 116 | 一些 117 | 一何 118 | 一個 119 | 一切 120 | 一则 121 | 一则通过 122 | 一方面 123 | 一旦 124 | 一来 125 | 一样 126 | 一番 127 | 一直 128 | 一般 129 | 一转眼 130 | 万一 131 | 三天两头 132 | 三番两次 133 | 三番五次 134 | 上 135 | 上下 136 | 上去 137 | 上来 138 | 下 139 | 不 140 | 不下 141 | 不了 142 | 不亦乐乎 143 | 不仅 144 | 不仅仅 145 | 不仅仅是 146 | 不会 147 | 不但 148 | 不光 149 | 不免 150 | 不再 151 | 不力 152 | 不单 153 | 不只 154 | 不可开交 155 | 不可抗拒 156 | 不同 157 | 不外 158 | 不外乎 159 | 不大 160 | 不如 161 | 不妨 162 | 不定 163 | 不对 164 | 不少 165 | 不尽 166 | 不尽然 167 | 不巧 168 | 不已 169 | 不常 170 | 不得 171 | 不得不 172 | 不得了 173 | 不得已 174 | 不必 175 | 不怎么 176 | 不怕 177 | 不惟 178 | 不成 179 | 不拘 180 | 不择手段 181 | 不料 182 | 不日 183 | 不时 184 | 不是 185 | 不曾 186 | 不止 187 | 不止一次 188 | 不比 189 | 不消 190 | 不满 191 | 不然 192 | 不然的话 193 | 不特 194 | 不独 195 | 不由得 196 | 不知不觉 197 | 不管 198 | 不管怎样 199 | 不经意 200 | 不胜 201 | 不能 202 | 不能不 203 | 不至于 204 | 不若 205 | 不要 206 | 不论 207 | 不起 208 | 不过 209 | 不迭 210 | 不问 211 | 不限 212 | 与 213 | 与其 214 | 与其说 215 | 与否 216 | 与此同时 217 | 且 218 | 且不说 219 | 且说 220 | 两者 221 | 个 222 | 个人 223 | 个别 224 | 临 225 | 临到 226 | 为 227 | 为了 228 | 为什么 229 | 为何 230 | 为止 231 | 为此 232 | 为着 233 | 举凡 234 | 乃 235 | 乃至 236 | 乃至于 237 | 么 238 | 之 239 | 之一 240 | 之所以 241 | 之类 242 | 乌乎 243 | 乎 244 | 乘 245 | 乘势 246 | 乘机 247 | 乘虚 248 | 乘隙 249 | 也 250 | 也好 251 | 也就是说 252 | 也罢 253 | 了 254 | 二来 255 | 二话不说 256 | 二话没说 257 | 于 258 | 于是 259 | 于是乎 260 | 云云 261 | 云尔 262 | 互相 263 | 些 264 | 交口 265 | 亦 266 | 亲口 267 | 亲手 268 | 亲眼 269 | 亲自 270 | 亲身 271 | 人 272 | 人人 273 | 人们 274 | 人家 275 | 什么 276 | 什么样 277 | 今 278 | 介于 279 | 仍 280 | 仍旧 281 | 仍然 282 | 从 283 | 从不 284 | 从严 285 | 从中 286 | 从今以后 287 | 从优 288 | 从古到今 289 | 从古至今 290 | 从头 291 | 从宽 292 | 从小 293 | 从新 294 | 从无到有 295 | 从早到晚 296 | 从未 297 | 从来 298 | 从此 299 | 从此以后 300 | 从而 301 | 从轻 302 | 从速 303 | 从重 304 | 他 305 | 他人 306 | 他们 307 | 他們 308 | 他是 309 | 以 310 | 以上 311 | 以为 312 | 以便 313 | 以免 314 | 以及 315 | 以故 316 | 以期 317 | 以来 318 | 以至 319 | 以至于 320 | 以致 321 | 们 322 | 任 323 | 任何 324 | 任凭 325 | 伙同 326 | 会 327 | 传说 328 | 传闻 329 | 似的 330 | 但 331 | 但凡 332 | 但愿 333 | 但是 334 | 何 335 | 何乐而不为 336 | 何以 337 | 何况 338 | 何处 339 | 何妨 340 | 何尝 341 | 何必 342 | 何时 343 | 何止 344 | 何苦 345 | 何须 346 | 余外 347 | 作为 348 | 你 349 | 你们 350 | 你們 351 | 你是 352 | 使 353 | 使得 354 | 例如 355 | 依 356 | 依据 357 | 依照 358 | 便于 359 | 俺 360 | 俺们 361 | 倍加 362 | 倍感 363 | 倒不如 364 | 倒不如说 365 | 倒是 366 | 倘 367 | 倘使 368 | 倘或 369 | 倘然 370 | 倘若 371 | 借 372 | 借以 373 | 借此 374 | 假使 375 | 假如 376 | 假若 377 | 偏偏 378 | 偶尔 379 | 偶而 380 | 傥然 381 | 像 382 | 儿 383 | 元/吨 384 | 充其极 385 | 充其量 386 | 充分 387 | 先不先 388 | 光是 389 | 全体 390 | 全力 391 | 全年 392 | 全然 393 | 全身心 394 | 全部 395 | 全都 396 | 八成 397 | 公然 398 | 兮 399 | 共总 400 | 关于 401 | 其 402 | 其一 403 | 其中 404 | 其二 405 | 其他 406 | 其余 407 | 其后 408 | 其它 409 | 其实 410 | 其次 411 | 具体地说 412 | 具体来说 413 | 具体说来 414 | 兼之 415 | 内 416 | 再 417 | 再其次 418 | 再则 419 | 再有 420 | 再次 421 | 再者 422 | 再者说 423 | 再说 424 | 冒 425 | 冲 426 | 决不 427 | 决非 428 | 况且 429 | 凑巧 430 | 凝神 431 | 几 432 | 几乎 433 | 几度 434 | 几时 435 | 几番 436 | 几经 437 | 凡 438 | 凡是 439 | 凭 440 | 凭借 441 | 出于 442 | 出去 443 | 出来 444 | 分别 445 | 分头 446 | 分期分批 447 | 切不可 448 | 切切 449 | 切勿 450 | 切莫 451 | 则 452 | 则甚 453 | 刚好 454 | 刚巧 455 | 刚才 456 | 别 457 | 别人 458 | 别处 459 | 别是 460 | 别的 461 | 别管 462 | 别说 463 | 到 464 | 到了儿 465 | 到处 466 | 到头 467 | 到头来 468 | 到底 469 | 到目前为止 470 | 前后 471 | 前此 472 | 前者 473 | 加上 474 | 加之 475 | 加以 476 | 动不动 477 | 动辄 478 | 勃然 479 | 匆匆 480 | 千万千万 481 | 单单 482 | 单纯 483 | 即 484 | 即令 485 | 即使 486 | 即便 487 | 即刻 488 | 即如 489 | 即将 490 | 即或 491 | 即是说 492 | 即若 493 | 却 494 | 去 495 | 又 496 | 又及 497 | 及 498 | 及其 499 | 及至 500 | 反之 501 | 反之亦然 502 | 反之则 503 | 
反倒 504 | 反倒是 505 | 反手 506 | 反而 507 | 反过来 508 | 反过来说 509 | 取道 510 | 受到 511 | 另 512 | 另一个 513 | 另一方面 514 | 另外 515 | 另悉 516 | 另方面 517 | 另行 518 | 只 519 | 只当 520 | 只怕 521 | 只是 522 | 只有 523 | 只消 524 | 只要 525 | 只限 526 | 叫 527 | 叮咚 528 | 可 529 | 可以 530 | 可好 531 | 可是 532 | 可能 533 | 可见 534 | 各 535 | 各个 536 | 各位 537 | 各式 538 | 各种 539 | 各自 540 | 同 541 | 同时 542 | 后 543 | 后来 544 | 后者 545 | 向 546 | 向使 547 | 向着 548 | 吓 549 | 吗 550 | 否则 551 | 吧 552 | 吧哒 553 | 吱 554 | 呀 555 | 呃 556 | 呆呆地 557 | 呕 558 | 呗 559 | 呜 560 | 呜呼 561 | 呢 562 | 呵 563 | 呵呵 564 | 呸 565 | 呼哧 566 | 呼啦 567 | 咋 568 | 和 569 | 咚 570 | 咦 571 | 咧 572 | 咱 573 | 咱们 574 | 咳 575 | 哇 576 | 哈 577 | 哈哈 578 | 哉 579 | 哎 580 | 哎呀 581 | 哎哟 582 | 哗 583 | 哗啦 584 | 哟 585 | 哦 586 | 哩 587 | 哪 588 | 哪个 589 | 哪些 590 | 哪儿 591 | 哪天 592 | 哪年 593 | 哪怕 594 | 哪样 595 | 哪边 596 | 哪里 597 | 哼 598 | 哼唷 599 | 唉 600 | 唯有 601 | 啊 602 | 啊呀 603 | 啊哈 604 | 啊哟 605 | 啐 606 | 啥 607 | 啦 608 | 啪达 609 | 啷当 610 | 喂 611 | 喏 612 | 喔唷 613 | 喽 614 | 嗡 615 | 嗡嗡 616 | 嗬 617 | 嗯 618 | 嗳 619 | 嘎 620 | 嘎嘎 621 | 嘎登 622 | 嘘 623 | 嘛 624 | 嘻 625 | 嘿 626 | 嘿嘿 627 | 因 628 | 因为 629 | 因了 630 | 因此 631 | 因着 632 | 因而 633 | 固然 634 | 在 635 | 在下 636 | 在于 637 | 地 638 | 基于 639 | 基本 640 | 基本上 641 | 处在 642 | 处处 643 | 多 644 | 多么 645 | 多亏 646 | 多多 647 | 多多少少 648 | 多多益善 649 | 多少 650 | 多年前 651 | 多年来 652 | 多次 653 | 够瞧的 654 | 大 655 | 大不了 656 | 大举 657 | 大体上 658 | 大凡 659 | 大多 660 | 大大 661 | 大家 662 | 大张旗鼓 663 | 大抵 664 | 大概 665 | 大略 666 | 大约 667 | 大致 668 | 大都 669 | 大面儿上 670 | 她 671 | 她们 672 | 她們 673 | 她是 674 | 好 675 | 好在 676 | 如 677 | 如上 678 | 如上所述 679 | 如下 680 | 如今 681 | 如何 682 | 如其 683 | 如前所述 684 | 如同 685 | 如常 686 | 如是 687 | 如期 688 | 如果 689 | 如次 690 | 如此 691 | 如此等等 692 | 如若 693 | 妳們 694 | 始而 695 | 姑且 696 | 存心 697 | 孰料 698 | 孰知 699 | 宁 700 | 宁可 701 | 宁愿 702 | 宁肯 703 | 它 704 | 它们 705 | 它是 706 | 对 707 | 对于 708 | 对待 709 | 对方 710 | 对比 711 | 将 712 | 将才 713 | 将要 714 | 将近 715 | 小 716 | 尔 717 | 尔后 718 | 尔尔 719 | 尔等 720 | 尚且 721 | 就 722 | 就地 723 | 就是 724 | 就是了 725 | 就是说 726 | 就此 727 | 就算 728 | 就要 729 | 尽 730 | 尽可能 731 | 尽如人意 732 | 尽心尽力 733 | 尽心竭力 734 | 尽快 735 | 尽早 736 | 尽然 737 | 尽管 738 | 尽管如此 739 | 尽量 740 | 局外 741 | 居然 742 | 届时 743 | 屡屡 744 | 屡次 745 | 屡次三番 746 | 岂但 747 | 岂止 748 | 岂非 749 | 川流不息 750 | 差一点 751 | 差不多 752 | 己 753 | 已 754 | 已矣 755 | 巴 756 | 巴巴 757 | 常言说 758 | 常言说得好 759 | 常言道 760 | 平素 761 | 年复一年 762 | 并 763 | 并且 764 | 并排 765 | 并无 766 | 并没 767 | 并没有 768 | 并肩 769 | 并非 770 | 庶乎 771 | 庶几 772 | 开外 773 | 开始 774 | 弹指之间 775 | 归 776 | 归根到底 777 | 归根结底 778 | 归齐 779 | 当 780 | 当下 781 | 当中 782 | 当儿 783 | 当即 784 | 当口儿 785 | 当地 786 | 当场 787 | 当头 788 | 当庭 789 | 当然 790 | 当真 791 | 当着 792 | 彻夜 793 | 彼 794 | 彼时 795 | 彼此 796 | 往 797 | 待 798 | 待到 799 | 很 800 | 很多 801 | 很少 802 | 得 803 | 得了 804 | 得天独厚 805 | 得起 806 | 必定 807 | 必将 808 | 必须 809 | 快要 810 | 忽地 811 | 忽然 812 | 怎 813 | 怎么 814 | 怎么办 815 | 怎么样 816 | 怎奈 817 | 怎样 818 | 急匆匆 819 | 怪不得 820 | 总之 821 | 总的来看 822 | 总的来说 823 | 总的说来 824 | 总而言之 825 | 恍然 826 | 恐怕 827 | 恰似 828 | 恰好 829 | 恰如 830 | 恰巧 831 | 恰恰 832 | 恰恰相反 833 | 恰逢 834 | 您 835 | 您们 836 | 您是 837 | 惟其 838 | 惯常 839 | 愤然 840 | 慢说 841 | 成年累月 842 | 成心 843 | 我 844 | 我们 845 | 我們 846 | 我是 847 | 或 848 | 或则 849 | 或多或少 850 | 或是 851 | 或曰 852 | 或者 853 | 或许 854 | 截然 855 | 截至 856 | 所 857 | 所以 858 | 所在 859 | 所幸 860 | 所有 861 | 才 862 | 才能 863 | 扑通 864 | 打 865 | 打从 866 | 打开天窗说亮话 867 | 把 868 | 抑或 869 | 抽冷子 870 | 拦腰 871 | 拿 872 | 按 873 | 按时 874 | 按期 875 | 按照 876 | 按理 877 | 按说 878 | 挨个 879 | 挨家挨户 880 | 挨次 881 | 挨着 882 | 挨门挨户 883 | 挨门逐户 884 | 换句话说 885 | 换言之 886 | 据 887 | 据实 888 | 据悉 889 | 据我所知 890 | 据此 891 | 据称 892 | 据说 893 | 接下来 894 | 接着 895 | 接连不断 896 | 故 897 | 
故意 898 | 故此 899 | 故而 900 | 敞开儿 901 | 敢于 902 | 敢情 903 | 数/ 904 | 断然 905 | 方才 906 | 方能 907 | 旁人 908 | 无 909 | 无宁 910 | 无论 911 | 既 912 | 既往 913 | 既是 914 | 既然 915 | 日复一日 916 | 日渐 917 | 日益 918 | 日臻 919 | 日见 920 | 时候 921 | 昂然 922 | 是 923 | 是以 924 | 是否 925 | 是的 926 | 暗中 927 | 暗地里 928 | 暗自 929 | 更为 930 | 更加 931 | 更进一步 932 | 曾 933 | 替 934 | 替代 935 | 最 936 | 最后 937 | 有 938 | 有些 939 | 有关 940 | 有及 941 | 有时 942 | 有的 943 | 有的是 944 | 望 945 | 朝 946 | 朝着 947 | 末##末 948 | 本 949 | 本人 950 | 本地 951 | 本着 952 | 本身 953 | 权时 954 | 来 955 | 来不及 956 | 来得及 957 | 来看 958 | 来着 959 | 来自 960 | 来讲 961 | 来说 962 | 极为 963 | 极了 964 | 极其 965 | 极力 966 | 极大 967 | 极度 968 | 极端 969 | 果然 970 | 果真 971 | 某 972 | 某个 973 | 某些 974 | 某某 975 | 根据 976 | 格外 977 | 次第 978 | 欤 979 | 正值 980 | 正如 981 | 正巧 982 | 正是 983 | 此 984 | 此中 985 | 此后 986 | 此地 987 | 此处 988 | 此外 989 | 此时 990 | 此次 991 | 此间 992 | 毋宁 993 | 每 994 | 每当 995 | 每时每刻 996 | 每每 997 | 每逢 998 | 比 999 | 比及 1000 | 比如 1001 | 比如说 1002 | 比方 1003 | 比照 1004 | 比起 1005 | 毕竟 1006 | 毫不 1007 | 毫无 1008 | 毫无例外 1009 | 毫无保留地 1010 | 沒有 1011 | 沙沙 1012 | 没奈何 1013 | 没有 1014 | 沿 1015 | 沿着 1016 | 漫说 1017 | 焉 1018 | 然则 1019 | 然后 1020 | 然而 1021 | 照 1022 | 照着 1023 | 牢牢 1024 | 犹且 1025 | 犹自 1026 | 独自 1027 | 猛然 1028 | 猛然间 1029 | 率尔 1030 | 率然 1031 | 理应 1032 | 理当 1033 | 理该 1034 | 瑟瑟 1035 | 甚且 1036 | 甚么 1037 | 甚或 1038 | 甚而 1039 | 甚至 1040 | 甚至于 1041 | 用 1042 | 用来 1043 | 甭 1044 | 由 1045 | 由于 1046 | 由是 1047 | 由此 1048 | 由此可见 1049 | 略为 1050 | 略加 1051 | 略微 1052 | 的 1053 | 的确 1054 | 的话 1055 | 皆可 1056 | 直到 1057 | 相对而言 1058 | 省得 1059 | 看 1060 | 看上去 1061 | 看来 1062 | 看样子 1063 | 看起来 1064 | 眨眼 1065 | 着 1066 | 着呢 1067 | 矣 1068 | 矣乎 1069 | 矣哉 1070 | 砰 1071 | 碰巧 1072 | 离 1073 | 种 1074 | 究竟 1075 | 穷年累月 1076 | 立刻 1077 | 立地 1078 | 立时 1079 | 立马 1080 | 竟然 1081 | 竟而 1082 | 第 1083 | 第二 1084 | 等 1085 | 等到 1086 | 等等 1087 | 策略地 1088 | 简直 1089 | 简而言之 1090 | 简言之 1091 | 管 1092 | 类如 1093 | 精光 1094 | 紧接着 1095 | 累年 1096 | 累次 1097 | 纯粹 1098 | 纵 1099 | 纵令 1100 | 纵使 1101 | 纵然 1102 | 经 1103 | 经常 1104 | 经过 1105 | 结果 1106 | 给 1107 | 绝不 1108 | 绝对 1109 | 绝非 1110 | 绝顶 1111 | 继之 1112 | 继后 1113 | 继而 1114 | 综上所述 1115 | 缕缕 1116 | 罢了 1117 | 老是 1118 | 老老实实 1119 | 者 1120 | 而 1121 | 而且 1122 | 而况 1123 | 而又 1124 | 而后 1125 | 而外 1126 | 而已 1127 | 而是 1128 | 而言 1129 | 而论 1130 | 联袂 1131 | 背地里 1132 | 背靠背 1133 | 能 1134 | 能否 1135 | 腾 1136 | 自 1137 | 自个儿 1138 | 自从 1139 | 自各儿 1140 | 自后 1141 | 自家 1142 | 自己 1143 | 自打 1144 | 自身 1145 | 至 1146 | 至于 1147 | 至今 1148 | 至若 1149 | 致 1150 | 與 1151 | 般的 1152 | 若 1153 | 若夫 1154 | 若是 1155 | 若果 1156 | 若非 1157 | 莫不 1158 | 莫不然 1159 | 莫如 1160 | 莫若 1161 | 莫非 1162 | 著 1163 | 藉以 1164 | 虽 1165 | 虽则 1166 | 虽然 1167 | 虽说 1168 | 被 1169 | 要 1170 | 要不 1171 | 要不是 1172 | 要不然 1173 | 要么 1174 | 要是 1175 | 譬喻 1176 | 譬如 1177 | 让 1178 | 许多 1179 | 论 1180 | 论说 1181 | 设使 1182 | 设或 1183 | 设若 1184 | 诚如 1185 | 诚然 1186 | 话说 1187 | 该 1188 | 该当 1189 | 说来 1190 | 请勿 1191 | 诸 1192 | 诸位 1193 | 诸如 1194 | 谁 1195 | 谁人 1196 | 谁料 1197 | 谁知 1198 | 豁然 1199 | 贼死 1200 | 赖以 1201 | 赶 1202 | 赶快 1203 | 赶早不赶晚 1204 | 起 1205 | 起先 1206 | 起初 1207 | 起头 1208 | 起来 1209 | 起见 1210 | 起首 1211 | 趁 1212 | 趁便 1213 | 趁势 1214 | 趁早 1215 | 趁机 1216 | 趁热 1217 | 趁着 1218 | 越是 1219 | 距 1220 | 跟 1221 | 路经 1222 | 轰然 1223 | 较 1224 | 较为 1225 | 较之 1226 | 较比 1227 | 边 1228 | 达旦 1229 | 过 1230 | 过于 1231 | 近几年来 1232 | 近年来 1233 | 近来 1234 | 还 1235 | 还是 1236 | 还有 1237 | 还要 1238 | 这 1239 | 这一来 1240 | 这个 1241 | 这么 1242 | 这么些 1243 | 这么样 1244 | 这么点儿 1245 | 这些 1246 | 这会儿 1247 | 这儿 1248 | 这就是说 1249 | 这时 1250 | 这样 1251 | 这次 1252 | 这般 1253 | 这边 1254 | 这里 1255 | 进去 1256 | 进来 1257 | 进而 1258 | 连 1259 | 连同 1260 | 连声 1261 | 连日 1262 | 连日来 1263 
| 连袂 1264 | 连连 1265 | 迟早 1266 | 迫于 1267 | 逐步 1268 | 通过 1269 | 遵循 1270 | 遵照 1271 | 那 1272 | 那个 1273 | 那么 1274 | 那么些 1275 | 那么样 1276 | 那些 1277 | 那会儿 1278 | 那儿 1279 | 那时 1280 | 那末 1281 | 那样 1282 | 那般 1283 | 那边 1284 | 那里 1285 | 都 1286 | 鄙人 1287 | 鉴于 1288 | 针对 1289 | 长期以来 1290 | 长此下去 1291 | 长话短说 1292 | 间或 1293 | 阿 1294 | 陡然 1295 | 除 1296 | 除了 1297 | 除却 1298 | 除去 1299 | 除外 1300 | 除开 1301 | 除此 1302 | 除此之外 1303 | 除此以外 1304 | 除此而外 1305 | 除非 1306 | 随 1307 | 随后 1308 | 随时 1309 | 随着 1310 | 隔夜 1311 | 隔日 1312 | 难得 1313 | 难怪 1314 | 难说 1315 | 难道 1316 | 难道说 1317 | 非但 1318 | 非常 1319 | 非徒 1320 | 非得 1321 | 非特 1322 | 非独 1323 | 靠 1324 | 顶多 1325 | 顷刻 1326 | 顷刻之间 1327 | 顷刻间 1328 | 顺 1329 | 顺着 1330 | 顿时 1331 | 首先 1332 | 马上 1333 | 高低 1334 | 默然 1335 | 默默地 1336 | ! 1337 | # 1338 | % 1339 | & 1340 | ' 1341 | ( 1342 | ) 1343 | )÷(1- 1344 | )、 1345 | * 1346 | + 1347 | +ξ 1348 | ++ 1349 | , 1350 | ,也 1351 | - 1352 | -β 1353 | -- 1354 | -[*]- 1355 | . 1356 | / 1357 | 0:2 1358 | 1. 1359 | 12% 1360 | 2.3% 1361 | 5:0 1362 | : 1363 | ; 1364 | < 1365 | <± 1366 | <Δ 1367 | <λ 1368 | <φ 1369 | << 1370 | = 1371 | =″ 1372 | =☆ 1373 | =( 1374 | =- 1375 | =[ 1376 | ={ 1377 | > 1378 | >λ 1379 | ? 1380 | A 1381 | LI 1382 | R.L. 1383 | ZXFITL 1384 | [ 1385 | [①①] 1386 | [①②] 1387 | [①③] 1388 | [①④] 1389 | [①⑤] 1390 | [①⑥] 1391 | [①⑦] 1392 | [①⑧] 1393 | [①⑨] 1394 | [①A] 1395 | [①B] 1396 | [①C] 1397 | [①D] 1398 | [①E] 1399 | [①] 1400 | [①a] 1401 | [①c] 1402 | [①d] 1403 | [①e] 1404 | [①f] 1405 | [①g] 1406 | [①h] 1407 | [①i] 1408 | [①o] 1409 | [② 1410 | [②①] 1411 | [②②] 1412 | [②③] 1413 | [②④ 1414 | [②⑤] 1415 | [②⑥] 1416 | [②⑦] 1417 | [②⑧] 1418 | [②⑩] 1419 | [②B] 1420 | [②G] 1421 | [②] 1422 | [②a] 1423 | [②b] 1424 | [②c] 1425 | [②d] 1426 | [②e] 1427 | [②f] 1428 | [②g] 1429 | [②h] 1430 | [②i] 1431 | [②j] 1432 | [③①] 1433 | [③⑩] 1434 | [③F] 1435 | [③] 1436 | [③a] 1437 | [③b] 1438 | [③c] 1439 | [③d] 1440 | [③e] 1441 | [③g] 1442 | [③h] 1443 | [④] 1444 | [④a] 1445 | [④b] 1446 | [④c] 1447 | [④d] 1448 | [④e] 1449 | [⑤] 1450 | [⑤]] 1451 | [⑤a] 1452 | [⑤b] 1453 | [⑤d] 1454 | [⑤e] 1455 | [⑤f] 1456 | [⑥] 1457 | [⑦] 1458 | [⑧] 1459 | [⑨] 1460 | [⑩] 1461 | [*] 1462 | [- 1463 | [] 1464 | ] 1465 | ]∧′=[ 1466 | ][ 1467 | _ 1468 | a] 1469 | b] 1470 | c] 1471 | e] 1472 | f] 1473 | ng昉 1474 | {- 1475 | } 1476 | }> 1477 | ~ 1478 | ~± 1479 | ~+ 1480 | 只剩 1481 | 所谓 1482 | 异于 1483 | 何谓 1484 | 即是 1485 | 来到 1486 | 赶到 1487 | 看不到 1488 | 看到 1489 | 只能 1490 | 只好 1491 | 没啥 1492 | 没什么 1493 | 见到 1494 | 记得 1495 | 123 1496 | 任何理由 1497 | 丢下 1498 | 撇下 1499 | 途中 1500 | 最大 1501 | 1502 | 未 1503 | 快 1504 | 登时 1505 | 无所 1506 | 妄 1507 | 无论如何 1508 | 难免 1509 | 未必 1510 | 一定 1511 | 已经 1512 | 好不 1513 | 太 1514 | 必然 1515 | 越 1516 | 久而久之 1517 | 倒 1518 | 尤其 1519 | 总是 1520 | 原本 1521 | 一不小心 1522 | 真 1523 | 有点 1524 | 起码 1525 | 实际上 1526 | 无非 1527 | 永远 1528 | 顺便 1529 | 一手 1530 | 就这样 1531 | 更 1532 | 常 1533 | 最好 1534 | 或者说 1535 | 没黑 1536 | 乱 1537 | 相 1538 | 先 1539 | 终于 1540 | 十分 1541 | 总 1542 | 不够 1543 | 有一天 1544 | 放声 1545 | 比较 1546 | 老 1547 | 好像 1548 | 不管怎么说 1549 | 仿佛 1550 | 极 1551 | 正 1552 | 非 1553 | 未免 1554 | 生来 1555 | 正在 1556 | 完全 1557 | 光 1558 | 刚 1559 | 似 1560 | 相当 1561 | 真是 1562 | 成天 1563 | 确实 1564 | 原来 1565 | 肯定 1566 | 没 1567 | 曾经 1568 | 反正 1569 | 实在 1570 | 同样 1571 | 足 1572 | 并不 1573 | 常常 1574 | 慢慢 1575 | 绝 1576 | 也许 1577 | 往往 1578 | 猛 1579 | 古往今来 1580 | 大体 1581 | 刚刚 1582 | 越来越 1583 | 早 1584 | 以此 1585 | 和你 1586 | 稍 1587 | 决 1588 | 再加上 1589 | 初 1590 | 尚 1591 | 至高 1592 | 事实上 1593 | 更何况 1594 | 全盘 1595 | 在此 1596 | 早就 1597 | 足以 1598 | 一心 1599 | 就是我 1600 | 一闪 1601 | 难以 1602 | 对此 1603 | 特别 1604 | 
在内 1605 | 该不该 1606 | 似乎 1607 | 总算 1608 | 相比之下 1609 | 不 1610 | -------------------------------------------------------------------------------- /pnlp/stopwords/english_stopwords.txt: -------------------------------------------------------------------------------- 1 | 0o 2 | 0s 3 | 3a 4 | 3b 5 | 3d 6 | 6b 7 | 6o 8 | a 9 | a1 10 | a2 11 | a3 12 | a4 13 | ab 14 | able 15 | about 16 | above 17 | abst 18 | ac 19 | accordance 20 | according 21 | accordingly 22 | across 23 | act 24 | actually 25 | ad 26 | added 27 | adj 28 | ae 29 | af 30 | affected 31 | affecting 32 | affects 33 | after 34 | afterwards 35 | ag 36 | again 37 | against 38 | ah 39 | ain 40 | ain't 41 | aj 42 | al 43 | all 44 | allow 45 | allows 46 | almost 47 | alone 48 | along 49 | already 50 | also 51 | although 52 | always 53 | am 54 | among 55 | amongst 56 | amoungst 57 | amount 58 | an 59 | and 60 | announce 61 | another 62 | any 63 | anybody 64 | anyhow 65 | anymore 66 | anyone 67 | anything 68 | anyway 69 | anyways 70 | anywhere 71 | ao 72 | ap 73 | apart 74 | apparently 75 | appear 76 | appreciate 77 | appropriate 78 | approximately 79 | ar 80 | are 81 | aren 82 | arent 83 | aren't 84 | arise 85 | around 86 | as 87 | a's 88 | aside 89 | ask 90 | asking 91 | associated 92 | at 93 | au 94 | auth 95 | av 96 | available 97 | aw 98 | away 99 | awfully 100 | ax 101 | ay 102 | az 103 | b 104 | b1 105 | b2 106 | b3 107 | ba 108 | back 109 | bc 110 | bd 111 | be 112 | became 113 | because 114 | become 115 | becomes 116 | becoming 117 | been 118 | before 119 | beforehand 120 | begin 121 | beginning 122 | beginnings 123 | begins 124 | behind 125 | being 126 | believe 127 | below 128 | beside 129 | besides 130 | best 131 | better 132 | between 133 | beyond 134 | bi 135 | bill 136 | biol 137 | bj 138 | bk 139 | bl 140 | bn 141 | both 142 | bottom 143 | bp 144 | br 145 | brief 146 | briefly 147 | bs 148 | bt 149 | bu 150 | but 151 | bx 152 | by 153 | c 154 | c1 155 | c2 156 | c3 157 | ca 158 | call 159 | came 160 | can 161 | cannot 162 | cant 163 | can't 164 | cause 165 | causes 166 | cc 167 | cd 168 | ce 169 | certain 170 | certainly 171 | cf 172 | cg 173 | ch 174 | changes 175 | ci 176 | cit 177 | cj 178 | cl 179 | clearly 180 | cm 181 | c'mon 182 | cn 183 | co 184 | com 185 | come 186 | comes 187 | con 188 | concerning 189 | consequently 190 | consider 191 | considering 192 | contain 193 | containing 194 | contains 195 | corresponding 196 | could 197 | couldn 198 | couldnt 199 | couldn't 200 | course 201 | cp 202 | cq 203 | cr 204 | cry 205 | cs 206 | c's 207 | ct 208 | cu 209 | currently 210 | cv 211 | cx 212 | cy 213 | cz 214 | d 215 | d2 216 | da 217 | date 218 | dc 219 | dd 220 | de 221 | definitely 222 | describe 223 | described 224 | despite 225 | detail 226 | df 227 | di 228 | did 229 | didn 230 | didn't 231 | different 232 | dj 233 | dk 234 | dl 235 | do 236 | does 237 | doesn 238 | doesn't 239 | doing 240 | don 241 | done 242 | don't 243 | down 244 | downwards 245 | dp 246 | dr 247 | ds 248 | dt 249 | du 250 | due 251 | during 252 | dx 253 | dy 254 | e 255 | e2 256 | e3 257 | ea 258 | each 259 | ec 260 | ed 261 | edu 262 | ee 263 | ef 264 | effect 265 | eg 266 | ei 267 | eight 268 | eighty 269 | either 270 | ej 271 | el 272 | eleven 273 | else 274 | elsewhere 275 | em 276 | empty 277 | en 278 | end 279 | ending 280 | enough 281 | entirely 282 | eo 283 | ep 284 | eq 285 | er 286 | es 287 | especially 288 | est 289 | et 290 | et-al 291 | etc 292 | eu 293 | ev 294 | even 295 | ever 296 | every 297 | everybody 298 | everyone 299 | 
everything 300 | everywhere 301 | ex 302 | exactly 303 | example 304 | except 305 | ey 306 | f 307 | f2 308 | fa 309 | far 310 | fc 311 | few 312 | ff 313 | fi 314 | fifteen 315 | fifth 316 | fify 317 | fill 318 | find 319 | fire 320 | first 321 | five 322 | fix 323 | fj 324 | fl 325 | fn 326 | fo 327 | followed 328 | following 329 | follows 330 | for 331 | former 332 | formerly 333 | forth 334 | forty 335 | found 336 | four 337 | fr 338 | from 339 | front 340 | fs 341 | ft 342 | fu 343 | full 344 | further 345 | furthermore 346 | fy 347 | g 348 | ga 349 | gave 350 | ge 351 | get 352 | gets 353 | getting 354 | gi 355 | give 356 | given 357 | gives 358 | giving 359 | gj 360 | gl 361 | go 362 | goes 363 | going 364 | gone 365 | got 366 | gotten 367 | gr 368 | greetings 369 | gs 370 | gy 371 | h 372 | h2 373 | h3 374 | had 375 | hadn 376 | hadn't 377 | happens 378 | hardly 379 | has 380 | hasn 381 | hasnt 382 | hasn't 383 | have 384 | haven 385 | haven't 386 | having 387 | he 388 | hed 389 | he'd 390 | he'll 391 | hello 392 | help 393 | hence 394 | her 395 | here 396 | hereafter 397 | hereby 398 | herein 399 | heres 400 | here's 401 | hereupon 402 | hers 403 | herself 404 | hes 405 | he's 406 | hh 407 | hi 408 | hid 409 | him 410 | himself 411 | his 412 | hither 413 | hj 414 | ho 415 | home 416 | hopefully 417 | how 418 | howbeit 419 | however 420 | how's 421 | hr 422 | hs 423 | http 424 | hu 425 | hundred 426 | hy 427 | i 428 | i2 429 | i3 430 | i4 431 | i6 432 | i7 433 | i8 434 | ia 435 | ib 436 | ibid 437 | ic 438 | id 439 | i'd 440 | ie 441 | if 442 | ig 443 | ignored 444 | ih 445 | ii 446 | ij 447 | il 448 | i'll 449 | im 450 | i'm 451 | immediate 452 | immediately 453 | importance 454 | important 455 | in 456 | inasmuch 457 | inc 458 | indeed 459 | index 460 | indicate 461 | indicated 462 | indicates 463 | information 464 | inner 465 | insofar 466 | instead 467 | interest 468 | into 469 | invention 470 | inward 471 | io 472 | ip 473 | iq 474 | ir 475 | is 476 | isn 477 | isn't 478 | it 479 | itd 480 | it'd 481 | it'll 482 | its 483 | it's 484 | itself 485 | iv 486 | i've 487 | ix 488 | iy 489 | iz 490 | j 491 | jj 492 | jr 493 | js 494 | jt 495 | ju 496 | just 497 | k 498 | ke 499 | keep 500 | keeps 501 | kept 502 | kg 503 | kj 504 | km 505 | know 506 | known 507 | knows 508 | ko 509 | l 510 | l2 511 | la 512 | largely 513 | last 514 | lately 515 | later 516 | latter 517 | latterly 518 | lb 519 | lc 520 | le 521 | least 522 | les 523 | less 524 | lest 525 | let 526 | lets 527 | let's 528 | lf 529 | like 530 | liked 531 | likely 532 | line 533 | little 534 | lj 535 | ll 536 | ll 537 | ln 538 | lo 539 | look 540 | looking 541 | looks 542 | los 543 | lr 544 | ls 545 | lt 546 | ltd 547 | m 548 | m2 549 | ma 550 | made 551 | mainly 552 | make 553 | makes 554 | many 555 | may 556 | maybe 557 | me 558 | mean 559 | means 560 | meantime 561 | meanwhile 562 | merely 563 | mg 564 | might 565 | mightn 566 | mightn't 567 | mill 568 | million 569 | mine 570 | miss 571 | ml 572 | mn 573 | mo 574 | more 575 | moreover 576 | most 577 | mostly 578 | move 579 | mr 580 | mrs 581 | ms 582 | mt 583 | mu 584 | much 585 | mug 586 | must 587 | mustn 588 | mustn't 589 | my 590 | myself 591 | n 592 | n2 593 | na 594 | name 595 | namely 596 | nay 597 | nc 598 | nd 599 | ne 600 | near 601 | nearly 602 | necessarily 603 | necessary 604 | need 605 | needn 606 | needn't 607 | needs 608 | neither 609 | never 610 | nevertheless 611 | new 612 | next 613 | ng 614 | ni 615 | nine 616 | ninety 617 | nj 618 | nl 619 | nn 620 
| no 621 | nobody 622 | non 623 | none 624 | nonetheless 625 | noone 626 | nor 627 | normally 628 | nos 629 | not 630 | noted 631 | nothing 632 | novel 633 | now 634 | nowhere 635 | nr 636 | ns 637 | nt 638 | ny 639 | o 640 | oa 641 | ob 642 | obtain 643 | obtained 644 | obviously 645 | oc 646 | od 647 | of 648 | off 649 | often 650 | og 651 | oh 652 | oi 653 | oj 654 | ok 655 | okay 656 | ol 657 | old 658 | om 659 | omitted 660 | on 661 | once 662 | one 663 | ones 664 | only 665 | onto 666 | oo 667 | op 668 | oq 669 | or 670 | ord 671 | os 672 | ot 673 | other 674 | others 675 | otherwise 676 | ou 677 | ought 678 | our 679 | ours 680 | ourselves 681 | out 682 | outside 683 | over 684 | overall 685 | ow 686 | owing 687 | own 688 | ox 689 | oz 690 | p 691 | p1 692 | p2 693 | p3 694 | page 695 | pagecount 696 | pages 697 | par 698 | part 699 | particular 700 | particularly 701 | pas 702 | past 703 | pc 704 | pd 705 | pe 706 | per 707 | perhaps 708 | pf 709 | ph 710 | pi 711 | pj 712 | pk 713 | pl 714 | placed 715 | please 716 | plus 717 | pm 718 | pn 719 | po 720 | poorly 721 | possible 722 | possibly 723 | potentially 724 | pp 725 | pq 726 | pr 727 | predominantly 728 | present 729 | presumably 730 | previously 731 | primarily 732 | probably 733 | promptly 734 | proud 735 | provides 736 | ps 737 | pt 738 | pu 739 | put 740 | py 741 | q 742 | qj 743 | qu 744 | que 745 | quickly 746 | quite 747 | qv 748 | r 749 | r2 750 | ra 751 | ran 752 | rather 753 | rc 754 | rd 755 | re 756 | readily 757 | really 758 | reasonably 759 | recent 760 | recently 761 | ref 762 | refs 763 | regarding 764 | regardless 765 | regards 766 | related 767 | relatively 768 | research 769 | research-articl 770 | respectively 771 | resulted 772 | resulting 773 | results 774 | rf 775 | rh 776 | ri 777 | right 778 | rj 779 | rl 780 | rm 781 | rn 782 | ro 783 | rq 784 | rr 785 | rs 786 | rt 787 | ru 788 | run 789 | rv 790 | ry 791 | s 792 | s2 793 | sa 794 | said 795 | same 796 | saw 797 | say 798 | saying 799 | says 800 | sc 801 | sd 802 | se 803 | sec 804 | second 805 | secondly 806 | section 807 | see 808 | seeing 809 | seem 810 | seemed 811 | seeming 812 | seems 813 | seen 814 | self 815 | selves 816 | sensible 817 | sent 818 | serious 819 | seriously 820 | seven 821 | several 822 | sf 823 | shall 824 | shan 825 | shan't 826 | she 827 | shed 828 | she'd 829 | she'll 830 | shes 831 | she's 832 | should 833 | shouldn 834 | shouldn't 835 | should've 836 | show 837 | showed 838 | shown 839 | showns 840 | shows 841 | si 842 | side 843 | significant 844 | significantly 845 | similar 846 | similarly 847 | since 848 | sincere 849 | six 850 | sixty 851 | sj 852 | sl 853 | slightly 854 | sm 855 | sn 856 | so 857 | some 858 | somebody 859 | somehow 860 | someone 861 | somethan 862 | something 863 | sometime 864 | sometimes 865 | somewhat 866 | somewhere 867 | soon 868 | sorry 869 | sp 870 | specifically 871 | specified 872 | specify 873 | specifying 874 | sq 875 | sr 876 | ss 877 | st 878 | still 879 | stop 880 | strongly 881 | sub 882 | substantially 883 | successfully 884 | such 885 | sufficiently 886 | suggest 887 | sup 888 | sure 889 | sy 890 | system 891 | sz 892 | t 893 | t1 894 | t2 895 | t3 896 | take 897 | taken 898 | taking 899 | tb 900 | tc 901 | td 902 | te 903 | tell 904 | ten 905 | tends 906 | tf 907 | th 908 | than 909 | thank 910 | thanks 911 | thanx 912 | that 913 | that'll 914 | thats 915 | that's 916 | that've 917 | the 918 | their 919 | theirs 920 | them 921 | themselves 922 | then 923 | thence 924 | there 925 | 
thereafter 926 | thereby 927 | thered 928 | therefore 929 | therein 930 | there'll 931 | thereof 932 | therere 933 | theres 934 | there's 935 | thereto 936 | thereupon 937 | there've 938 | these 939 | they 940 | theyd 941 | they'd 942 | they'll 943 | theyre 944 | they're 945 | they've 946 | thickv 947 | thin 948 | think 949 | third 950 | this 951 | thorough 952 | thoroughly 953 | those 954 | thou 955 | though 956 | thoughh 957 | thousand 958 | three 959 | throug 960 | through 961 | throughout 962 | thru 963 | thus 964 | ti 965 | til 966 | tip 967 | tj 968 | tl 969 | tm 970 | tn 971 | to 972 | together 973 | too 974 | took 975 | top 976 | toward 977 | towards 978 | tp 979 | tq 980 | tr 981 | tried 982 | tries 983 | truly 984 | try 985 | trying 986 | ts 987 | t's 988 | tt 989 | tv 990 | twelve 991 | twenty 992 | twice 993 | two 994 | tx 995 | u 996 | u201d 997 | ue 998 | ui 999 | uj 1000 | uk 1001 | um 1002 | un 1003 | under 1004 | unfortunately 1005 | unless 1006 | unlike 1007 | unlikely 1008 | until 1009 | unto 1010 | uo 1011 | up 1012 | upon 1013 | ups 1014 | ur 1015 | us 1016 | use 1017 | used 1018 | useful 1019 | usefully 1020 | usefulness 1021 | uses 1022 | using 1023 | usually 1024 | ut 1025 | v 1026 | va 1027 | value 1028 | various 1029 | vd 1030 | ve 1031 | ve 1032 | very 1033 | via 1034 | viz 1035 | vj 1036 | vo 1037 | vol 1038 | vols 1039 | volumtype 1040 | vq 1041 | vs 1042 | vt 1043 | vu 1044 | w 1045 | wa 1046 | want 1047 | wants 1048 | was 1049 | wasn 1050 | wasnt 1051 | wasn't 1052 | way 1053 | we 1054 | wed 1055 | we'd 1056 | welcome 1057 | well 1058 | we'll 1059 | well-b 1060 | went 1061 | were 1062 | we're 1063 | weren 1064 | werent 1065 | weren't 1066 | we've 1067 | what 1068 | whatever 1069 | what'll 1070 | whats 1071 | what's 1072 | when 1073 | whence 1074 | whenever 1075 | when's 1076 | where 1077 | whereafter 1078 | whereas 1079 | whereby 1080 | wherein 1081 | wheres 1082 | where's 1083 | whereupon 1084 | wherever 1085 | whether 1086 | which 1087 | while 1088 | whim 1089 | whither 1090 | who 1091 | whod 1092 | whoever 1093 | whole 1094 | who'll 1095 | whom 1096 | whomever 1097 | whos 1098 | who's 1099 | whose 1100 | why 1101 | why's 1102 | wi 1103 | widely 1104 | will 1105 | willing 1106 | wish 1107 | with 1108 | within 1109 | without 1110 | wo 1111 | won 1112 | wonder 1113 | wont 1114 | won't 1115 | words 1116 | world 1117 | would 1118 | wouldn 1119 | wouldnt 1120 | wouldn't 1121 | www 1122 | x 1123 | x1 1124 | x2 1125 | x3 1126 | xf 1127 | xi 1128 | xj 1129 | xk 1130 | xl 1131 | xn 1132 | xo 1133 | xs 1134 | xt 1135 | xv 1136 | xx 1137 | y 1138 | y2 1139 | yes 1140 | yet 1141 | yj 1142 | yl 1143 | you 1144 | youd 1145 | you'd 1146 | you'll 1147 | your 1148 | youre 1149 | you're 1150 | yours 1151 | yourself 1152 | yourselves 1153 | you've 1154 | yr 1155 | ys 1156 | yt 1157 | z 1158 | zero 1159 | zi 1160 | zz 1161 | -------------------------------------------------------------------------------- /pnlp/utils.py: -------------------------------------------------------------------------------- 1 | from functools import wraps, partial 2 | from typing import Any, List, Generator, Callable 3 | import multiprocessing as mp 4 | from multiprocessing import Pool 5 | from multiprocessing.pool import ThreadPool 6 | from concurrent.futures import ThreadPoolExecutor 7 | from threading import Thread 8 | import numpy as np 9 | import dill 10 | 11 | 12 | class pstr(str): 13 | def __sub__(self, other) -> str: 14 | result = [] 15 | for c in self: 16 | if c in other: 17 | continue 18 | 
result.append(c) 19 | return "".join(result) 20 | 21 | 22 | class ThreadWithReturnValue(Thread): 23 | """ 24 | referenced from https://stackoverflow.com/questions/6893968/how-to-get-the-return-value-from-a-thread-in-python 25 | """ 26 | 27 | def __init__( 28 | self, 29 | group=None, 30 | target=None, 31 | name=None, 32 | args=(), 33 | kwargs=None  # avoid a mutable default; Thread maps None to {} 34 | ): 35 | Thread.__init__(self, group, target, name, args, kwargs) 36 | self._return = None 37 | 38 | def run(self): 39 | if self._target is not None: 40 | self._return = self._target(*self._args, **self._kwargs) 41 | 42 | def join(self, *args): 43 | Thread.join(self, *args) 44 | return self._return 45 | 46 | 47 | def divide2int1(y: int, x: int) -> int:  # ceiling division, pure Python 48 | res = y // x 49 | if y % x != 0: 50 | res += 1 51 | return res 52 | 53 | 54 | def divide2int(y: int, x: int) -> int:  # ceiling division via numpy 55 | return np.ceil(y / x).astype(np.int_) 56 | 57 | 58 | def generate_batches_by_size(lst: List[Any], batch_size: int 59 | ) -> Generator[List[Any], None, None]: 60 | batch_num = divide2int(len(lst), batch_size) 61 | for i in range(batch_num): 62 | yield lst[i * batch_size: (i + 1) * batch_size] 63 | 64 | 65 | def generate_batches_by_num(lst: List[Any], batch_num: int 66 | ) -> Generator[List[Any], None, None]: 67 | batch_size = divide2int(len(lst), batch_num) 68 | return generate_batches_by_size(lst, batch_size) 69 | 70 | 71 | # referenced from: 72 | # https://izziswift.com/python-multiprocessing-picklingerror-cant-pickle/ 73 | def run_dill_encoded(payload): 74 | fun, args, kwargs = dill.loads(payload) 75 | return fun(*args, **kwargs) 76 | 77 | 78 | def apply_async(pool, fun, args, kwargs): 79 | payload = dill.dumps((fun, args, kwargs)) 80 | return pool.apply_async(run_dill_encoded, (payload,)) 81 | 82 | 83 | def concurring( 84 | func=None, 85 | type: str = "thread_executor", 86 | max_workers: int = mp.cpu_count() 87 | ) -> Callable: 88 | """ 89 | decorator for running a function concurrently over batches. 90 | 91 | Parameters 92 | ----------- 93 | type: one of thread_pool, process_pool, thread_executor, thread 94 | these are different implementations of the same behavior.
95 | max_workers: number of workers 96 | """ 97 | 98 | if func is None: 99 | return partial(concurring, type=type, max_workers=max_workers) 100 | 101 | if max_workers <= 0: 102 | raise ValueError("pnlp: max_workers must be > 0") 103 | 104 | def _thread(engine, func, batches, **kwargs): 105 | jobs = [] 106 | for batch in batches: 107 | job = engine(target=func, args=(batch, ), kwargs=kwargs) 108 | jobs.append(job) 109 | job.start() 110 | for job in jobs: 111 | yield job.join() 112 | 113 | def _pool(engine, func, batches, max_workers, **kwargs): 114 | with engine(processes=max_workers) as pool: 115 | jobs = [apply_async(pool, func, (batch, ), kwargs) 116 | for batch in batches] 117 | for job in jobs: 118 | yield job.get() 119 | 120 | def _executor(engine, func, batches, max_workers, **kwargs): 121 | with engine(max_workers=max_workers) as executor: 122 | jobs = [executor.submit(func, batch, **kwargs) 123 | for batch in batches] 124 | for f in jobs: 125 | yield f.result() 126 | 127 | @wraps(func) 128 | def wrapper(lst: List[Any], *args, **kwargs): 129 | batches = generate_batches_by_num(lst, max_workers) 130 | if type == "thread_pool": 131 | return _pool( 132 | ThreadPool, func, batches, max_workers, **kwargs) 133 | elif type == "process_pool": 134 | return _pool( 135 | Pool, func, batches, max_workers, **kwargs) 136 | elif type == "thread_executor": 137 | return _executor( 138 | ThreadPoolExecutor, func, batches, max_workers, **kwargs) 139 | elif type == "thread": 140 | return _thread( 141 | ThreadWithReturnValue, func, batches, **kwargs) 142 | else: 143 | err_info = f"pnlp: does not support type {type}, use one of " 144 | err_info += "thread_pool, process_pool, thread_executor, thread" 145 | raise ValueError(err_info) 146 | 147 | return wrapper 148 | 149 | 150 | def run_in_new_thread( 151 | func: Callable, *args, **kwargs 152 | ): 153 | if kwargs: 154 | func = partial(func, **kwargs) 155 | t = Thread(target=func, name="BackgroundRun", args=args) 156 | t.start() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="pnlp", 8 | version="0.4.16", 9 | author="Yam", 10 | author_email="haoshaochun@gmail.com", 11 | description="A pre/post-processing tool for NLP.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/hscspring/pnlp", 15 | include_package_data=True, 16 | # packages are found relative to setup.py by default, so no `package_dir` is needed 17 | # if the sources lived in another directory, it would be declared via `package_dir` 18 | packages=setuptools.find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 19 | install_requires=[ 20 | 'addict', 21 | 'pyyaml', 22 | 'dill', 23 | 'numpy' 24 | ], 25 | package_data={ 26 | 'pnlp': ["stopwords/*"], 27 | }, 28 | classifiers=[ 29 | "Programming Language :: Python :: 3", 30 | "License :: OSI Approved :: Apache Software License", 31 | "Operating System :: OS Independent", 32 | ], 33 | ) 34 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hscspring/pnlp/87178634727231ba16663bb99ec40fa3668b226e/tests/__init__.py --------------------------------------------------------------------------------
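A minimal usage sketch for the `concurring` decorator defined in pnlp/utils.py above; `square_all` and the sample list are illustrative assumptions, not part of the package. The wrapped function receives one batch of the input list per worker, and the call returns a generator with one result per batch:

from pnlp.utils import concurring

@concurring(type="thread_executor", max_workers=4)
def square_all(batch):
    # each worker squares its own slice of the input list
    return [x * x for x in batch]

gen = square_all(list(range(10)))           # generator: one item per batch
flat = [x for batch in gen for x in batch]  # flatten the per-batch results
assert sorted(flat) == [x * x for x in range(10)]

Because `generate_batches_by_num` splits the list into `max_workers` batches up front, throughput depends on the batches being roughly even; `type="process_pool"` trades thread-safety concerns for dill pickling overhead.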
/tests/piop_data/a.md: -------------------------------------------------------------------------------- 1 | line 1 in a. 2 | line 2 in a. 3 | line 3 in a. -------------------------------------------------------------------------------- /tests/piop_data/b.txt: -------------------------------------------------------------------------------- 1 | line 1 in b. 2 | line 2 in b. 3 | line 3 in b. -------------------------------------------------------------------------------- /tests/piop_data/c.data: -------------------------------------------------------------------------------- 1 | line 1 in c. 2 | line 2 in c. 3 | line 3 in c. -------------------------------------------------------------------------------- /tests/piop_data/csv.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 1,title1 3 | 2,title2 -------------------------------------------------------------------------------- /tests/piop_data/first/fa.md: -------------------------------------------------------------------------------- 1 | line 1 in fa. 2 | line 2 in fa. 3 | line 3 in fa. -------------------------------------------------------------------------------- /tests/piop_data/first/fb.txt: -------------------------------------------------------------------------------- 1 | line 1 in fb. 2 | line 2 in fb. 3 | line 3 in fb. -------------------------------------------------------------------------------- /tests/piop_data/first/fc.data: -------------------------------------------------------------------------------- 1 | line 1 in fc. 2 | line 2 in fc. 3 | line 3 in fc. -------------------------------------------------------------------------------- /tests/piop_data/first/second/sa.md: -------------------------------------------------------------------------------- 1 | line 1 in sa. 2 | line 2 in sa. 3 | line 3 in sa. -------------------------------------------------------------------------------- /tests/piop_data/first/second/sb.txt: -------------------------------------------------------------------------------- 1 | line 1 in sb. 2 | line 2 in sb. 3 | line 3 in sb. -------------------------------------------------------------------------------- /tests/piop_data/first/second/sc.data: -------------------------------------------------------------------------------- 1 | line 1 in sc. 2 | line 2 in sc. 3 | line 3 in sc. -------------------------------------------------------------------------------- /tests/piop_data/json.json: -------------------------------------------------------------------------------- 1 | { 2 | "json1": "this is line 1", 3 | "json2": "这是第二行。" 4 | } -------------------------------------------------------------------------------- /tests/piop_data/list_dict.json: -------------------------------------------------------------------------------- 1 | {"name": "Yam", "age": 20} 2 | {"name": "May", "age": 21} 3 | -------------------------------------------------------------------------------- /tests/piop_data/outfile.file: -------------------------------------------------------------------------------- 1 | line 1 of outfile. 
2 | 这是 outfile 的第二行。 3 | -------------------------------------------------------------------------------- /tests/piop_data/outfile.listdict: -------------------------------------------------------------------------------- 1 | {"name": "Yam", "age": 20} 2 | -------------------------------------------------------------------------------- /tests/piop_data/outjson.json: -------------------------------------------------------------------------------- 1 | { 2 | "outjson1": "this is line 1.", 3 | "outjson2": "这是第二行。" 4 | } -------------------------------------------------------------------------------- /tests/piop_data/yaml.yaml: -------------------------------------------------------------------------------- 1 | 元旦: 2 | - 新年快乐 3 | - 元旦快乐 4 | - 节日快乐 5 | 周末: 6 | - 周末快乐! 7 | - 周末愉快! -------------------------------------------------------------------------------- /tests/test_pcut.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pytest 3 | 4 | from pnlp.pcut import cut_sentence, cut_zhchar, combine_bucket, cut_sub_sentence 5 | 6 | 7 | def test_text2zhchar1(): 8 | text = "我喜欢你。" 9 | ret = cut_zhchar(text) 10 | assert ret == ["我", "喜", "欢", "你", "。"] 11 | 12 | 13 | def test_text2zhchar2(): 14 | text = "我 喜欢 你。" 15 | ret = cut_zhchar(text) 16 | assert ret == ["我", " ", "喜", "欢", " ", "你", "。"] 17 | 18 | 19 | def test_text2zhchar3(): 20 | text = "我喜欢like你。" 21 | ret = cut_zhchar(text) 22 | assert ret == ["我", "喜", "欢", "like", "你", "。"] 23 | 24 | 25 | def test_text2zhchar4(): 26 | text = "我喜欢你233。" 27 | ret = cut_zhchar(text) 28 | assert ret == ["我", "喜", "欢", "你", "233", "。"] 29 | 30 | 31 | def test_text2zhchar5(): 32 | text = "我喜欢你3.14。" 33 | ret = cut_zhchar(text) 34 | assert ret == ["我", "喜", "欢", "你", "3.14", "。"] 35 | 36 | 37 | def test_text2zhchar6(): 38 | text = "我喜欢你100%。" 39 | ret = cut_zhchar(text) 40 | assert ret == ["我", "喜", "欢", "你", "100%", "。"] 41 | 42 | 43 | def test_text2zhchar7(): 44 | text = "我喜欢你2/3。" 45 | ret = cut_zhchar(text) 46 | assert ret == ["我", "喜", "欢", "你", "2/3", "。"] 47 | 48 | 49 | def test_text2zhchar8(): 50 | text = "我喜欢你-2。" 51 | ret = cut_zhchar(text) 52 | assert ret == ["我", "喜", "欢", "你", "-2", "。"] 53 | 54 | 55 | def test_text2zhchar9(): 56 | text = "我喜欢你2、3。" 57 | ret = cut_zhchar(text) 58 | assert ret == ["我", "喜", "欢", "你", "2", "、", "3", "。"] 59 | 60 | 61 | def test_text2zhchar10(): 62 | text = "我喜欢你。。" 63 | ret = cut_zhchar(text) 64 | assert ret == ["我", "喜", "欢", "你", "。", "。"] 65 | 66 | 67 | def test_text2zhchar11(): 68 | text = "我喜欢你A-B。" 69 | ret = cut_zhchar(text) 70 | assert ret == ["我", "喜", "欢", "你", "A", "-", "B", "。"] 71 | 72 | 73 | def test_text2zhchar12(): 74 | text = "我喜欢你A_B。" 75 | ret = cut_zhchar(text) 76 | assert ret == ["我", "喜", "欢", "你", "A", "_", "B", "。"] 77 | 78 | 79 | def test_text2zhchar13(): 80 | text = "我喜欢你C++。" 81 | ret = cut_zhchar(text) 82 | assert ret == ["我", "喜", "欢", "你", "C", "+", "+", "。"] 83 | 84 | 85 | def test_text2zhchar14(): 86 | text = "我喜欢你R&B。" 87 | ret = cut_zhchar(text) 88 | assert ret == ["我", "喜", "欢", "你", "R", "&", "B", "。"] 89 | 90 | 91 | def test_text2zhchar15(): 92 | text = "#我喜欢你。" 93 | ret = cut_zhchar(text) 94 | assert ret == ["#", "我", "喜", "欢", "你", "。"] 95 | 96 | 97 | def test_text2zhchar16(): 98 | text = "我 喜欢 你。" 99 | ret = cut_zhchar(text, remove_blank=True) 100 | assert ret == ["我", "喜", "欢", "你", "。"] 101 | 102 | 103 | def test_text2zhchar17(): 104 | text = "我 love you." 
105 | ret = cut_zhchar(text, remove_blank=True) 106 | assert ret == ["我", "love", "you", "."] 107 | 108 | 109 | def test_text2zhchar18(): 110 | text = "lo-ve." 111 | ret = cut_zhchar(text, remove_blank=True) 112 | assert ret == ['lo', '-', 've', '.'] 113 | 114 | 115 | def test_text2zhchar19(): 116 | text = "v-.f " 117 | ret = cut_zhchar(text, remove_blank=True) 118 | assert ret == ['v', '-', '.', 'f'] 119 | 120 | 121 | def test_text2zhchar20(): 122 | text = "-1.2." 123 | ret = cut_zhchar(text) 124 | assert ret == ['-1.2', '.'] 125 | 126 | 127 | def test_text2zhchar21(): 128 | text = "1-2-3-" 129 | ret = cut_zhchar(text) 130 | assert ret == ['1-2-3-'] 131 | 132 | 133 | def test_text2zhchar22(): 134 | text = "-1-2-3" 135 | ret = cut_zhchar(text) 136 | assert ret == ['-1-2-3'] 137 | 138 | 139 | def test_text2zhchar23(): 140 | text = "1.2.3" 141 | ret = cut_zhchar(text) 142 | assert ret == ['1.2.3'] 143 | 144 | 145 | def test_text2zhchar24(): 146 | text = "1..2" 147 | ret = cut_zhchar(text) 148 | assert ret == ['1..2'] 149 | 150 | 151 | def test_text2zhchar25(): 152 | text = "1.2..." 153 | ret = cut_zhchar(text) 154 | assert ret == ['1.2', '.', '.', '.'] 155 | 156 | 157 | def test_text2zhchar26(): 158 | text = "1...2..." 159 | ret = cut_zhchar(text) 160 | assert ret == ['1...2', '.', '.', '.'] 161 | 162 | 163 | def test_text2zhchar27(): 164 | text = """ 165 | x..x 1.2, -1.23 lo-.ve.. -1-2-3- 2-2. -1.2. 3/5 1.2.3 1..2 2% 3.5% -2.0% 166 | """ 167 | ret = cut_zhchar(text, remove_blank=True) 168 | assert ret == [ 169 | 'x', '.', '.', 'x', 170 | '1.2', ',', '-1.23', 171 | 'lo', '-', '.', 've', '.', '.', 172 | '-1-2-3-', '2-2', '.', 173 | '-1.2', '.', '3/5', 174 | '1.2.3', '1..2', 175 | '2%', '3.5%', '-2.0%' 176 | ] 177 | 178 | 179 | def test_text2sent1(): 180 | text = "我喜欢你,你呢?哈哈,我不告诉你。" 181 | ret = cut_sentence(text) 182 | assert len(ret) == 2 183 | assert ret[0] == "我喜欢你,你呢?" 184 | assert ret[1] == "哈哈,我不告诉你。" 185 | 186 | 187 | def test_text2sent2(): 188 | text = "我喜欢你,你呢!哈哈,我不告诉你" 189 | ret = cut_sentence(text) 190 | assert len(ret) == 2 191 | assert ret[0] == "我喜欢你,你呢!" 192 | assert ret[1] == "哈哈,我不告诉你" 193 | 194 | 195 | def test_text2sent3(): 196 | text = "我喜欢你,「哈哈」。我不告诉你~~~" 197 | ret = cut_sentence(text) 198 | assert len(ret) == 2 199 | assert ret[0] == "我喜欢你,「哈哈」。" 200 | assert ret[1] == "我不告诉你~~~" 201 | 202 | 203 | def test_text2sent4(): 204 | text = "我喜欢你,“哈哈”.我不告诉你……" 205 | ret = cut_sentence(text) 206 | assert len(ret) == 2 207 | assert ret[0] == "我喜欢你,“哈哈”." 208 | assert ret[1] == "我不告诉你……" 209 | 210 | 211 | def test_text2sent5(): 212 | text = "我喜欢你,“哈哈” 我不告诉你;" 213 | ret = cut_sentence(text) 214 | assert len(ret) == 1 215 | assert ret[0] == "我喜欢你,“哈哈” 我不告诉你;" 216 | 217 | 218 | def test_text2sent6(): 219 | text = "我喜欢你,“哈哈。” 我不告诉你!" 220 | ret = cut_sentence(text) 221 | assert len(ret) == 2 222 | assert ret[0] == "我喜欢你,“哈哈。”" 223 | assert ret[1] == " 我不告诉你!" 224 | 225 | 226 | def test_text2sent7(): 227 | text = "我喜欢你(haha). 我不告诉你~" 228 | ret = cut_sentence(text) 229 | assert len(ret) == 2 230 | assert ret[0] == "我喜欢你(haha)." 
231 | assert ret[1] == " 我不告诉你~" 232 | 233 | 234 | def test_text2sent8(): 235 | text = "我喜欢你, “哈哈……”。“我不告诉你.”" 236 | ret = cut_sentence(text) 237 | assert len(ret) == 2 238 | assert ret[0] == "我喜欢你, “哈哈……”。" 239 | assert ret[1] == "“我不告诉你.”" 240 | 241 | 242 | def test_text2sent9(): 243 | text = "我喜欢你&“哈哈?”“我不告诉你”" 244 | ret = cut_sentence(text) 245 | assert len(ret) == 2 246 | assert ret[0] == "我喜欢你&“哈哈?”" 247 | assert ret[1] == "“我不告诉你”" 248 | 249 | 250 | def test_text2sent10(): 251 | text = "我喜欢你," 252 | ret = cut_sentence(text) 253 | assert len(ret) == 1 254 | assert ret[0] == "我喜欢你," 255 | 256 | 257 | def test_text2sent11(): 258 | text = "我喜欢你" 259 | ret = cut_sentence(text) 260 | assert len(ret) == 1 261 | assert ret[0] == "我喜欢你" 262 | 263 | 264 | def test_text2sent12(): 265 | text = "我喜欢\n你" 266 | ret = cut_sentence(text) 267 | assert len(ret) == 2 268 | assert ret == ["我喜欢\n", "你"] 269 | 270 | 271 | def test_text2sent13(): 272 | text = "我喜欢。\n你" 273 | ret = cut_sentence(text) 274 | assert len(ret) == 3 275 | assert ret == ["我喜欢。", "\n", "你"] 276 | 277 | 278 | def test_text2sent14(): 279 | text = "我喜欢。\n.你" 280 | ret = cut_sentence(text) 281 | assert len(ret) == 3 282 | assert ret == ["我喜欢。", "\n", ".你"] 283 | 284 | 285 | def test_text2sent15(): 286 | text = "我喜欢\n.你" 287 | ret = cut_sentence(text) 288 | assert len(ret) == 2 289 | assert ret == ["我喜欢\n", ".你"] 290 | 291 | 292 | def test_text2sent16(): 293 | text = "我喜欢 .你" 294 | ret = cut_sentence(text) 295 | assert len(ret) == 2 296 | assert ret == ["我喜欢 .", "你"] 297 | 298 | 299 | def test_text2sent17(): 300 | text = "我喜欢 .你" 301 | ret = cut_sentence(text) 302 | assert len(ret) == 2 303 | assert ret == ["我喜欢 .", "你"] 304 | 305 | 306 | @pytest.fixture 307 | def parts(): 308 | return [ 309 | '习近平指出', 310 | '中方不仅维护中国人民生命安全和身体健康', 311 | '也维护世界人民生命安全和身体健康', 312 | '我们本着公开', 313 | '透明', 314 | ] 315 | 316 | 317 | def test_combine_bucket1(parts): 318 | ret = combine_bucket(parts.copy(), 5) 319 | assert ret == parts 320 | ret = combine_bucket(parts.copy(), 10) 321 | assert ret == [ 322 | '习近平指出', 323 | '中方不仅维护中国人民生命安全和身体健康', 324 | '也维护世界人民生命安全和身体健康', 325 | '我们本着公开透明', 326 | ] 327 | 328 | 329 | def test_combine_bucket2(parts): 330 | ret = combine_bucket(parts.copy(), 5, truncate=True) 331 | assert ret == [ 332 | '习近平指出', 333 | '中方不仅维', 334 | '也维护世界', 335 | '我们本着公', 336 | '透明' 337 | ] 338 | ret = combine_bucket(parts.copy(), 10, truncate=True) 339 | assert ret == [ 340 | '习近平指出', 341 | '中方不仅维护中国人民', 342 | '也维护世界人民生命安', 343 | '我们本着公开透明', 344 | ] 345 | 346 | 347 | def test_combine_bucket3(parts): 348 | ret = combine_bucket(parts.copy(), 5, truncate=True, keep_remain=True) 349 | assert ret == [ 350 | '习近平指出', 351 | '中方不仅维', 352 | '护中国人民', 353 | '生命安全和', 354 | '身体健康', 355 | '也维护世界', 356 | '人民生命安', 357 | '全和身体健', 358 | '康', 359 | '我们本着公', 360 | '开', 361 | '透明' 362 | ] 363 | ret = combine_bucket(parts.copy(), 10, truncate=True, keep_remain=True) 364 | assert ret == [ 365 | '习近平指出', 366 | '中方不仅维护中国人民', 367 | '生命安全和身体健康', 368 | '也维护世界人民生命安', 369 | '全和身体健康', 370 | '我们本着公开透明', 371 | ] 372 | 373 | 374 | 375 | @pytest.mark.parametrize("inp, expected", [ 376 | ("1,2,3,4", ["1,", "2,", "3,", "4"]), 377 | ("2/5是0.4。", ["2/5是0.4。"]), 378 | ("2/5是0.4.", ["2/5是0.4."]), 379 | ("2除以8等于0.25。", ["2除以8等于0.25。"]), 380 | ]) 381 | def test_cut_subsent(inp, expected): 382 | res = cut_sub_sentence(inp) 383 | assert res == expected -------------------------------------------------------------------------------- /tests/test_penh.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from pnlp.penh import swap, SentenceLevelSampler, TokenLevelSampler 3 | from pnlp.pcut import psubsent, cut_part, cut_zhchar 4 | 5 | 6 | def test_swap_middle(): 7 | lst = [1, 2, 3, 4, 5] 8 | new = swap(lst, 2, 0, 4) 9 | assert new == [1, 3, 2, 4, 5] or new == [1, 2, 4, 3, 5] 10 | 11 | 12 | def test_swap_start(): 13 | lst = [1, 2, 3] 14 | new = swap(lst, 0, 0, 2) 15 | assert new == [2, 1, 3] 16 | 17 | 18 | def test_swap_end(): 19 | lst = [1, 2, 3] 20 | new = swap(lst, 2, 0, 2) 21 | assert new == [1, 3, 2] 22 | 23 | 24 | def cut_words(text: str) -> list: 25 | return [ 26 | '人', '为什么', '活着', '?', 27 | '生而为', '人', '必须', '要', '有', '梦想', '!', 28 | '还要', '有', '尽可能', '多', '的', '精神', '体验', '。'] 29 | 30 | 31 | def cut_wps(text: str) -> list: 32 | return [ 33 | ('人', 'n'), ('为什么', 'r'), ('活着', 'v'), ('?', 'w'), 34 | ('生而为人', 'v'), ('必须', 'd'), ('要', 'v'), 35 | ('有', 'v'), ('梦想', 'n'), ('!', 'w'), 36 | ('还要', 'v'), ('有', 'v'), ('尽可能', 'd'), ('多', 'a'), 37 | ('的', 'u'), ('精神', 'n'), ('体验', 'vn'), ('。', 'w') 38 | ] 39 | 40 | 41 | def test_token_level_sampler(): 42 | tls = TokenLevelSampler() 43 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 44 | res = tls.make_samples(text) 45 | assert type(res) == dict 46 | assert len(res) == 4 47 | 48 | 49 | def test_token_level_sampler_none(): 50 | tls = TokenLevelSampler(types=[]) 51 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 52 | res = tls.make_samples(text) 53 | assert res == {} 54 | 55 | 56 | def test_token_level_sampler_none_text(): 57 | tls = TokenLevelSampler() 58 | text = "" 59 | res = tls.make_samples(text) 60 | assert res == {} 61 | 62 | 63 | def test_token_level_sampler_single_sent(): 64 | tls = TokenLevelSampler() 65 | text = "人为什么活着?" 
66 | res = tls.make_samples(text) 67 | assert len(res) == 4 68 | 69 | 70 | def test_token_level_sampler_independent_sampling(): 71 | tls = TokenLevelSampler() 72 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 73 | tokens = cut_zhchar(text) 74 | res = tls.independent_sampling(tokens) 75 | assert type(res) == list 76 | assert len(res) == 3 77 | 78 | 79 | def test_token_level_sampler_dependent_sampling(): 80 | tls = TokenLevelSampler() 81 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 82 | tokens = cut_zhchar(text) 83 | res = tls.dependent_sampling(tokens) 84 | assert type(res) == list 85 | assert type(res[0]) == str 86 | 87 | 88 | def test_token_level_sampler_delete(): 89 | tls = TokenLevelSampler(types=["delete"]) 90 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 91 | res = tls.make_samples(text) 92 | assert type(res) == dict 93 | assert len(res) == 2 94 | 95 | 96 | def test_token_level_sampler_swap(): 97 | tls = TokenLevelSampler(types=["swap"]) 98 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 99 | res = tls.make_samples(text) 100 | assert type(res) == dict 101 | assert len(res) == 2 102 | 103 | 104 | def test_token_level_sampler_insert(): 105 | tls = TokenLevelSampler(types=["insert"]) 106 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 107 | res = tls.make_samples(text) 108 | assert type(res) == dict 109 | assert len(res) == 2 110 | 111 | 112 | def test_token_level_sampler_token_spliter(): 113 | tls = TokenLevelSampler() 114 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 115 | res = tls.make_samples(text, cut_words) 116 | assert len(res) == 4 117 | 118 | 119 | def test_token_level_sampler_token_pos_spliter(): 120 | tls = TokenLevelSampler() 121 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 122 | res = tls.make_samples(text, cut_wps) 123 | assert len(res) == 4 124 | 125 | 126 | def test_token_level_sampler_delete_sampling(): 127 | tls = TokenLevelSampler() 128 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 129 | tokens = cut_words(text) 130 | res = tls.delete_sampling(tokens, [2]) 131 | assert type(res) == list 132 | assert len(res) + 1 == len(tokens) 133 | 134 | 135 | def test_token_level_sampler_insert_sampling(): 136 | tls = TokenLevelSampler() 137 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 138 | tokens = cut_words(text) 139 | res = tls.insert_sampling(tokens, [2, 6]) 140 | assert type(res) == list 141 | assert len(res) - 2 == len(tokens) 142 | 143 | 144 | def test_token_level_sampler_swap_sampling(): 145 | tls = TokenLevelSampler() 146 | text = "人为什么活着?生而为人必须要有梦想!还要有尽可能多的精神体验。" 147 | tokens = cut_words(text) 148 | res = tls.swap_sampling(tokens, [5, 8]) 149 | assert type(res) == list 150 | assert len(res) == len(tokens) 151 | 152 | 153 | def test_sentence_level_sampler(): 154 | sls = SentenceLevelSampler() 155 | text = "我爱你。你爱我。" 156 | res = sls.make_samples(text) 157 | assert type(res) == dict 158 | assert len(res) == 4 159 | 160 | 161 | def test_sentence_level_sampler_none(): 162 | sls = SentenceLevelSampler([]) 163 | text = "我爱你。你爱我。" 164 | assert sls.make_samples(text) == {} 165 | 166 | 167 | def test_sentence_level_sampler_single_sent(): 168 | sls = SentenceLevelSampler() 169 | text = "我爱你。" 170 | assert len(sls.make_samples(text)) == 4 171 | 172 | 173 | def test_sentence_level_sampler_none_text(): 174 | sls = SentenceLevelSampler() 175 | text = "" 176 | assert sls.make_samples(text) == {} 177 | 178 | 179 | def test_sentence_level_sampler_independent_sampling(): 180 | sls = SentenceLevelSampler() 181 | text = "写代码。写好代码。" 182 | text_list = cut_part(text, psubsent) 183 | res = sls.independent_sampling(text_list) 
184 | assert type(res) == list 185 | assert len(res) == 3 186 | assert len(res[0]) == 1 187 | assert len(res[1]) == 2 188 | assert len(res[2]) == 3 189 | 190 | 191 | def test_sentence_level_sampler_dependent_sampling(): 192 | sls = SentenceLevelSampler() 193 | text = "写代码。多写代码。写好代码。" 194 | text_list = cut_part(text, psubsent) 195 | res = sls.dependent_sampling(text_list) 196 | assert type(res) == list 197 | assert len(res) == 3 198 | 199 | 200 | def test_sentence_level_sampler_insert(): 201 | sls = SentenceLevelSampler(types=["insert"]) 202 | text = "我爱你。你爱我。NLP 很有意思。简洁最重要。" 203 | res = sls.make_samples(text) 204 | assert len(res) == 2 205 | 206 | 207 | def test_sentence_level_sampler_delete(): 208 | sls = SentenceLevelSampler(types=["delete"]) 209 | text = "我爱你。你爱我。NLP 很有意思。简洁最重要。" 210 | res = sls.make_samples(text) 211 | assert len(res) == 2 212 | 213 | 214 | def test_sentence_level_sampler_swap(): 215 | sls = SentenceLevelSampler(types=["swap"]) 216 | text = "我爱你。你爱我。NLP 很有意思。简洁最重要。" 217 | res = sls.make_samples(text) 218 | assert len(res) == 2 219 | -------------------------------------------------------------------------------- /tests/test_piop.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import types 4 | 5 | from pnlp.piop import write_json, write_file 6 | from pnlp.piop import Reader, read_file, read_lines, read_json, read_yaml, read_csv 7 | from pnlp.piop import read_file_to_list_dict, write_list_dict_to_file 8 | from pnlp.piop import check_dir 9 | 10 | DATA_PATH = os.path.join('tests', 'piop_data') 11 | 12 | 13 | @pytest.fixture(params=['*.md', '*.txt', '*.data', 'f*.*', '*c.*']) 14 | def get_Reader_path_match_res(request): 15 | res = [] 16 | reader = Reader(request.param) 17 | for line in reader(DATA_PATH): 18 | res.append(line) 19 | return res 20 | 21 | 22 | def test_Reader_path_match(get_Reader_path_match_res): 23 | assert len(get_Reader_path_match_res) == 9 24 | assert get_Reader_path_match_res[0].lid == 0 25 | assert get_Reader_path_match_res[-1].lid == 2 26 | 27 | 28 | def test_Reader_file(): 29 | res = [] 30 | reader = Reader() 31 | for line in reader(os.path.join(DATA_PATH, 'a.md')): 32 | res.append(line) 33 | assert len(res) == 3 34 | assert res[0].text == 'line 1 in a.' 
35 | 36 | 37 | def test_Reader_gen_files(): 38 | paths = Reader.gen_files(DATA_PATH, '*.md') 39 | assert isinstance(paths, types.GeneratorType) 40 | assert len(list(paths)) == 3 41 | 42 | 43 | def test_Reader_gen_files_with_regex(): 44 | paths = Reader.gen_files(DATA_PATH, "(md)|(txt)", True) 45 | assert isinstance(paths, types.GeneratorType) 46 | assert len(list(paths)) == 6 47 | 48 | 49 | def test_Reader_gen_articles(): 50 | paths = Reader.gen_files(DATA_PATH, '*.txt') 51 | articles = Reader.gen_articles(paths) 52 | assert isinstance(articles, types.GeneratorType) 53 | assert len(list(articles)) == 3 54 | 55 | 56 | def test_Reader_gen_flines(): 57 | paths = Reader.gen_files(DATA_PATH, '*.txt') 58 | articles = Reader.gen_articles(paths) 59 | lines = Reader.gen_flines(articles) 60 | assert isinstance(lines, types.GeneratorType) 61 | assert len(list(lines)) == 9 62 | 63 | 64 | def test_Reader_gen_plines(): 65 | lines = Reader.gen_plines(os.path.join(DATA_PATH, 'b.txt')) 66 | assert isinstance(lines, types.GeneratorType) 67 | assert len(list(lines)) == 3 68 | 69 | 70 | @pytest.fixture 71 | def get_read_data(): 72 | return os.path.join(DATA_PATH, 'c.data') 73 | 74 | 75 | def test_read_file(get_read_data): 76 | data = read_file(get_read_data) 77 | assert data == 'line 1 in c.\nline 2 in c.\nline 3 in c.' 78 | assert type(data) == str 79 | 80 | 81 | @pytest.mark.parametrize("count", [0, 1, 2, -1]) 82 | def test_read_lines(get_read_data, count): 83 | data = read_lines(get_read_data, count=count) 84 | if count != -1: 85 | assert len(data) == count 86 | else: 87 | assert data == ['line 1 in c.', 'line 2 in c.', 'line 3 in c.'] 88 | assert type(data) == list 89 | 90 | 91 | def test_read_json(): 92 | data = read_json(os.path.join(DATA_PATH, 'json.json')) 93 | assert type(data) == dict 94 | assert data == { 95 | "json1": "this is line 1", 96 | "json2": "这是第二行。" 97 | } 98 | 99 | 100 | def test_read_yaml(): 101 | data = read_yaml(os.path.join(DATA_PATH, 'yaml.yaml')) 102 | assert type(data) == dict 103 | assert data == {'元旦': ['新年快乐', '元旦快乐', '节日快乐'], 104 | '周末': ['周末快乐!', '周末愉快!']} 105 | 106 | 107 | def test_read_csv(): 108 | data = read_csv(os.path.join(DATA_PATH, 'csv.csv')) 109 | assert type(data) == list 110 | assert data == [['id', 'title'], ['1', 'title1'], ['2', 'title2']] 111 | 112 | 113 | def test_read_file_to_list_dict(): 114 | data = read_file_to_list_dict(os.path.join(DATA_PATH, "list_dict.json")) 115 | assert type(data) == list 116 | assert type(data[0]) == dict 117 | 118 | 119 | def test_write_json(): 120 | data = {"outjson1": "this is line 1.", 121 | "outjson2": "这是第二行。"} 122 | write_json(os.path.join(DATA_PATH, 'outjson.json'), 123 | data, indent=4, ensure_ascii=False) 124 | 125 | 126 | def test_write_file(): 127 | data = ['line 1 of outfile.', '这是 outfile 的第二行。'] 128 | write_file(os.path.join(DATA_PATH, 'outfile.file'), data) 129 | 130 | 131 | def test_write_list_dict_to_file(): 132 | data = [{"name": "Yam", "age": 20}] 133 | write_list_dict_to_file(os.path.join(DATA_PATH, "outfile.listdict"), data) 134 | 135 | 136 | def test_check_dir(): 137 | assert check_dir(DATA_PATH) is None 138 | -------------------------------------------------------------------------------- /tests/test_pmag.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pnlp.pmag import MagicDict, get_unique_fn 3 | 4 | 5 | def test_magic(): 6 | tmd = MagicDict() 7 | tmd["a"]["b"]["c"] = 1 8 | assert tmd["a"]["b"]["c"] == 1 9 | 10 | 11 | def test_magic_set_get(): 
12 | d = MagicDict() 13 | d["a"]["b"] = 2 14 | assert d.a.b == 2 15 | 16 | 17 | def test_magic_reverse(): 18 | dx = {1: "a", 2: "a", 3: "a", 4: "b"} 19 | assert MagicDict.reverse(dx) == {"a": [1, 2, 3], "b": 4} 20 | 21 | 22 | 23 | @pytest.mark.parametrize("inp,oup,level", [ 24 | ("a.md", "a.md", 0), 25 | ("a.md", "a.md", 1), 26 | ("a.md", "a.md", 10), 27 | ("a/b.md", "a_b.md", 0), 28 | ("a/b.md", "a_b.md", 1), 29 | ("a/b.md", "a_b.md", 10), 30 | ("a/b/c.md", "a_b_c.md", 0), 31 | ("a/b/c.md", "b_c.md", 1), 32 | ("a/b/c.md", "a_b_c.md", 10), 33 | ("/a/b/c.md", "a_b_c.md", 0), 34 | ("/a/b/c.md", "b_c.md", 1), 35 | ("/a/b/c.md", "a_b_c.md", 10), 36 | ]) 37 | def test_get_unique_fn(inp, level, oup): 38 | res = get_unique_fn(inp, level) 39 | assert res == oup 40 | -------------------------------------------------------------------------------- /tests/test_pnorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pnlp.pnorm import NumNorm 4 | 5 | c2a = NumNorm() 6 | 7 | 8 | def test_chinese2arabic1(): 9 | s = "一亿三千万" 10 | assert c2a.zh2num(s) == 130000000 11 | 12 | 13 | def test_chinese2arabic2(): 14 | s = "一万五千六百三十八" 15 | assert c2a.zh2num(s) == 15638 16 | 17 | 18 | def test_chinese2arabic3(): 19 | s = "壹仟两百" 20 | assert c2a.zh2num(s) == 1200 21 | 22 | 23 | def test_chinese2arabic4(): 24 | s = "壹仟两百零三" 25 | assert c2a.zh2num(s) == 1203 26 | 27 | 28 | def test_chinese2arabic5(): 29 | s = "壹仟两百一十五" 30 | assert c2a.zh2num(s) == 1215 31 | 32 | 33 | def test_chinese2arabic6(): 34 | s = "壹仟两百九十" 35 | assert c2a.zh2num(s) == 1290 36 | 37 | 38 | def test_chinese2arabic7(): 39 | s = "十一" 40 | assert c2a.zh2num(s) == 11 41 | 42 | 43 | def test_chinese2arabic8(): 44 | s = "八十八" 45 | assert c2a.zh2num(s) == 88 46 | 47 | 48 | def test_chinese2arabic9(): 49 | s = "三" 50 | assert c2a.zh2num(s) == 3 51 | 52 | 53 | def test_chinese2arabic10(): 54 | s = "两百五十" 55 | assert c2a.zh2num(s) == 250 56 | 57 | 58 | def test_chinese2arabic11(): 59 | s = "两百" 60 | assert c2a.zh2num(s) == 200 61 | 62 | 63 | def test_chinese2arabic12(): 64 | s = "两百零五" 65 | assert c2a.zh2num(s) == 205 66 | 67 | 68 | def test_chinese2arabic13(): 69 | s = "两百二十五" 70 | assert c2a.zh2num(s) == 225 71 | 72 | 73 | def test_chinese2arabic14(): 74 | s = "二十万五千" 75 | assert c2a.zh2num(s) == 205000 76 | 77 | 78 | def test_chinese2arabic15(): 79 | s = "两百三十九万四千八百二十三" 80 | assert c2a.zh2num(s) == 2394823 81 | 82 | 83 | def test_chinese2arabic16(): 84 | s = "一千三百万" 85 | assert c2a.zh2num(s) == 13000000 86 | 87 | 88 | def test_chinese2arabic17(): 89 | s = "万" 90 | assert c2a.zh2num(s) == "万" 91 | 92 | 93 | def test_chinese2arabic18(): 94 | s = "亿" 95 | assert c2a.zh2num(s) == "亿" 96 | 97 | 98 | def test_chinese2arabic19(): 99 | s = "千" 100 | assert c2a.zh2num(s) == "千" 101 | 102 | 103 | def test_chinese2arabic20(): 104 | s = "百" 105 | assert c2a.zh2num(s) == "百" 106 | 107 | 108 | def test_chinese2arabic21(): 109 | s = "零" 110 | assert c2a.zh2num(s) == 0 111 | 112 | 113 | def test_arabic2chinese1(): 114 | num = 0 115 | assert c2a.num2zh(num) == "零" 116 | 117 | 118 | def test_arabic2chinese2(): 119 | num = 1 120 | assert c2a.num2zh(num) == "一" 121 | 122 | 123 | def test_arabic2chinese3(): 124 | num = 10 125 | assert c2a.num2zh(num) == "一十" 126 | 127 | 128 | def test_arabic2chinese4(): 129 | num = 12 130 | assert c2a.num2zh(num) == "一十二" 131 | 132 | 133 | def test_arabic2chinese5(): 134 | num = 22 135 | assert c2a.num2zh(num) == "二十二" 136 | 137 | 138 | def test_arabic2chinese6(): 139 | num = 100 
140 | assert c2a.num2zh(num) == "一百" 141 | 142 | 143 | def test_arabic2chinese7(): 144 | num = 101 145 | assert c2a.num2zh(num) == "一百零一" 146 | 147 | 148 | def test_arabic2chinese8(): 149 | num = 110 150 | assert c2a.num2zh(num) == "一百一十" 151 | 152 | 153 | def test_arabic2chinese9(): 154 | num = 112 155 | assert c2a.num2zh(num) == "一百一十二" 156 | 157 | 158 | def test_arabic2chinese10(): 159 | num = 1000 160 | assert c2a.num2zh(num) == "一千" 161 | 162 | 163 | def test_arabic2chinese11(): 164 | num = 1001 165 | assert c2a.num2zh(num) == "一千零一" 166 | 167 | 168 | def test_arabic2chinese12(): 169 | num = 1011 170 | assert c2a.num2zh(num) == "一千零一十一" 171 | 172 | 173 | def test_arabic2chinese13(): 174 | num = 1101 175 | assert c2a.num2zh(num) == "一千一百零一" 176 | 177 | 178 | def test_arabic2chinese14(): 179 | num = 1010 180 | assert c2a.num2zh(num) == "一千零一十" 181 | 182 | 183 | def test_arabic2chinese15(): 184 | num = 1100 185 | assert c2a.num2zh(num) == "一千一百" 186 | 187 | 188 | def test_arabic2chinese16(): 189 | num = 1110 190 | assert c2a.num2zh(num) == "一千一百一十" 191 | 192 | 193 | def test_arabic2chinese17(): 194 | num = 1111 195 | assert c2a.num2zh(num) == "一千一百一十一" 196 | 197 | 198 | def test_arabic2chinese18(): 199 | num = 100000 200 | assert c2a.num2zh(num) == "一十万" 201 | 202 | 203 | def test_arabic2chinese19(): 204 | num = 110000 205 | assert c2a.num2zh(num) == "一十一万" 206 | 207 | 208 | def test_arabic2chinese20(): 209 | num = 1000000 210 | assert c2a.num2zh(num) == "一百万" 211 | 212 | 213 | def test_arabic2chinese21(): 214 | num = 1010000 215 | assert c2a.num2zh(num) == "一百零一万" 216 | 217 | 218 | def test_arabic2chinese22(): 219 | num = 1100000 220 | assert c2a.num2zh(num) == "一百一十万" 221 | 222 | 223 | def test_arabic2chinese23(): 224 | num = 1110000 225 | assert c2a.num2zh(num) == "一百一十一万" 226 | 227 | 228 | def test_arabic2chinese24(): 229 | num = 100000000 230 | assert c2a.num2zh(num) == "一亿" 231 | 232 | 233 | def test_arabic2chinese25(): 234 | num = 110000000 235 | assert c2a.num2zh(num) == "一亿一千万" 236 | 237 | 238 | def test_arabic2chinese26(): 239 | num = 111000000 240 | assert c2a.num2zh(num) == "一亿一千一百万" 241 | 242 | 243 | def test_arabic2chinese27(): 244 | num = 101000000 245 | assert c2a.num2zh(num) == "一亿零一百万" 246 | 247 | 248 | def test_arabic2chinese28(): 249 | num = 1000000000000 250 | assert c2a.num2zh(num) == "一万亿" 251 | 252 | 253 | def test_arabic2chinese29(): 254 | num = 1100000000000 255 | assert c2a.num2zh(num) == "一万一千亿" 256 | 257 | 258 | def test_arabic2chinese30(): 259 | # 一兆一亿 260 | num = 11000000000000 261 | assert c2a.num2zh(num) == "超大" 262 | 263 | 264 | def test_arabic2chinese31(): 265 | num = 1110011 266 | assert c2a.num2zh(num) == "一百一十一万零一十一" 267 | 268 | 269 | def test_arabic2chinese_money1(): 270 | num = 112 271 | assert c2a.num2zh(num).to_money() == "壹佰壹拾贰" 272 | 273 | 274 | def test_arabic2chinese_money2(): 275 | num = 1111 276 | assert c2a.num2zh(num).to_money() == "壹仟壹佰壹拾壹" 277 | 278 | 279 | def test_arabic2chinese_money3(): 280 | num = 1010000 281 | assert c2a.num2zh(num).to_money() == "壹佰零壹萬" 282 | 283 | -------------------------------------------------------------------------------- /tests/test_ptrans.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pnlp.ptrans import pick_entity_from_bio_labels, generate_uuid 4 | 5 | 6 | @pytest.mark.parametrize("pairs,result", [ 7 | ([('v1', 'B-1')], [("v1", "1")]), 8 | ([('v0', 'O'), ('vo', 'O')], []), 9 | ([('v1', 'B-1'), ('v2', 'I-1')], [("v1v2", "1")]), 10 | 
([('v1', 'B-1'), ('v2', 'I-1'), ('v0', 'O')], [("v1v2", "1")]), 11 | ([('v1', 'O'), 12 | ('v2', 'B-2'), 13 | ('v3', 'B-3'), 14 | ('v4', 'I-3'), 15 | ('v5', 'B-5'), 16 | ('v6', 'B-6'), 17 | ('v0', 'O'), 18 | ('v0', 'O'), 19 | ('v7', 'B-7'), 20 | ('v8', 'I-7'), 21 | ('v9', 'B-9')], 22 | [("v2", "2"), ("v3v4", "3"), ("v5", "5"), ("v6", "6"), ("v7v8", "7"), ("v9", "9")] 23 | ), 24 | ]) 25 | def test_pick_entity_from_bio_labels(pairs, result): 26 | entities = pick_entity_from_bio_labels(pairs) 27 | assert entities == result 28 | 29 | 30 | @pytest.mark.parametrize("pairs,result", [ 31 | ([("我", "O"), ("国", "O"), ("北", "B-LOC"), ("京", "I-LOC")], [("北京", "LOC", 2, 4)]), 32 | ([("我", "O"), ("国", "O"), ("北", "B-LOC"), ("京", "I-LOC"), ("。", "O")], [("北京", "LOC", 2, 4)]), 33 | ([("我", "O"), ("国", "O"), ("北", "B-LOC"), ("京", "I-LOC"), ("天", "B-LOC"), ("安", "I-LOC"), ("门", "I-LOC")], [("北京", "LOC", 2, 4), ("天安门", "LOC", 4, 7)]), 34 | ([("我", "O"), ("国", "O"), ("北", "B-LOC"), ("京", "I-LOC"), ("的", "O"), ("天", "B-LOC"), ("安", "I-LOC"), ("门", "I-LOC")], [("北京", "LOC", 2, 4), ("天安门", "LOC", 5, 8)]), 35 | ([("北", "B-LOC"), ("京", "I-LOC"), ("天", "B-LOC"), ("安", "I-LOC"), ("门", "I-LOC")], [("北京", "LOC", 0, 2), ("天安门", "LOC", 2, 5)]), 36 | ([("北", "B-LOC"), ("京", "I-LOC"), ("天", "B-LOC"), ("安", "I-LOC"), ("门", "I-LOC"), ("。", "O")], [("北京", "LOC", 0, 2), ("天安门", "LOC", 2, 5)]), 37 | ([("北", "B-ORG"), ("大", "I-ORG"), ("蔡", "B-PER"), ("元", "I-PER"), ("培", "I-PER")], [("北大", "ORG", 0, 2), ("蔡元培", "PER", 2, 5)]), 38 | ([("说", "O"), ("北", "B-ORG"), ("大", "I-ORG"), ("蔡", "B-PER"), ("元", "I-PER"), ("培", "I-PER")], [("北大", "ORG", 1, 3), ("蔡元培", "PER", 3, 6)]), 39 | ([("北", "B-ORG"), ("大", "I-ORG"), ("蔡", "B-PER"), ("元", "I-PER"), ("培", "I-PER"), ("啊", "O")], [("北大", "ORG", 0, 2), ("蔡元培", "PER", 2, 5)]), 40 | ([("北", "B-LOC"), ("京", "I-LOC"), ("的", "O"), ("安", "I-LOC")], [("北京", "LOC", 0, 2)]), 41 | ([("北", "B-LOC"), ("京", "I-LOC"), ("的", "O"), ("安", "B-LOC")], [("北京", "LOC", 0, 2), ("安", "LOC", 3, 4)]), 42 | ]) 43 | def test_pick_entity_from_bio_labels_with_offset(pairs, result): 44 | entities = pick_entity_from_bio_labels(pairs, True) 45 | assert entities == result 46 | 47 | 48 | @pytest.mark.parametrize("inp", [ 49 | (("a", 1, 0.5)), 50 | (("好", 1, 0.5)), 51 | ]) 52 | def test_generate_uuid(inp): 53 | uid = generate_uuid(*inp) 54 | assert type(uid) == str 55 | assert len(uid) == 32 56 | -------------------------------------------------------------------------------- /tests/test_ptxt.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pytest 3 | 4 | from pnlp.ptxt import Text, Regex, Length 5 | reg = Regex() 6 | 7 | 8 | @pytest.mark.parametrize("inp, expected", [ 9 | ("1.t", ""), 10 | ("1,t", "1"), 11 | (",1", "1"), 12 | ]) 13 | def test_regex_pnum(inp, expected): 14 | match = reg.pnum.search(inp) 15 | if match: 16 | res = match.group() 17 | else: 18 | res = "" 19 | assert res == expected 20 | 21 | 22 | @pytest.fixture(params=reg.patnames) 23 | def get_regex(request): 24 | return reg.patdict[request.param] 25 | 26 | 27 | def test_regex_well(get_regex): 28 | assert isinstance(get_regex, re.Pattern) 29 | 30 | 31 | @pytest.fixture(params=reg.patnames) 32 | def get_patten(request): 33 | return [request.param] 34 | 35 | 36 | def test_Text_extract(get_patten): 37 | text = "这是,测试fdsf234*(&( 返回类型的文本。" 38 | res = Text(get_patten).extract(text) 39 | assert isinstance(res, dict) 40 | assert isinstance(res.mats, list) 41 | assert isinstance(res.locs, list) 42 | 43 | 44 | 
def test_Text_clean(get_patten): 45 | text = "这是,测试fdsf234*(&( 返回类型的文本。" 46 | res = Text(get_patten).clean(text) 47 | assert isinstance(res, str) 48 | 49 | 50 | def test_pattern_string_invalid(): 51 | try: 52 | Text(["XX"]) 53 | except Exception as e: 54 | assert "built-in" in str(e) 55 | 56 | 57 | def test_pattern_invalid(): 58 | try: 59 | Text([lambda x: x]) 60 | except Exception as e: 61 | assert "RE" in str(e) 62 | 63 | 64 | @pytest.fixture 65 | def text_chi(): 66 | text = "你好。jefj*(&-1)这是中文测试!" 67 | return text 68 | 69 | 70 | def test_Text_extract_chi(text_chi): 71 | res = Text(['chi']).extract(text_chi) 72 | assert "".join(res.mats) == "你好这是中文测试" 73 | assert res.text == "你好这是中文测试" 74 | 75 | 76 | def test_Text_clean_chi(text_chi): 77 | res = Text(['chi']).clean(text_chi) 78 | assert res == "。jefj*(&-1)!" 79 | 80 | 81 | @pytest.fixture 82 | def text_pun(): 83 | text = "你好,这是标点,.!;<>()符号测试。" 84 | return text 85 | 86 | 87 | def test_Text_extract_pun(text_pun): 88 | res = Text(['nwn']).extract(text_pun) 89 | assert "".join(res.mats) == ",,.!;<>()。" 90 | assert res.text == ",,.!;<>()。" 91 | 92 | 93 | def test_Text_clean_pun(text_pun): 94 | res = Text(['nwn']).clean(text_pun) 95 | assert res == "你好这是标点符号测试" 96 | 97 | 98 | @pytest.fixture 99 | def text_whi(): 100 | text = "你好,这是空白 \t\n符号测试。" 101 | return text 102 | 103 | 104 | def test_Text_extract_whi(text_whi): 105 | res = Text(['whi']).extract(text_whi) 106 | assert "".join(res.mats) == " \t\n" 107 | assert res.text == " \t\n" 108 | 109 | 110 | def test_Text_clean_whi(text_whi): 111 | res = Text(['whi']).clean(text_whi) 112 | assert res == "你好,这是空白符号测试。" 113 | 114 | 115 | @pytest.fixture 116 | def text_nwh(): 117 | text = "你好,这是非空白 \t\n符号测试。" 118 | return text 119 | 120 | 121 | def test_Text_extract_nwh(text_nwh): 122 | res = Text(['nwh']).extract(text_nwh) 123 | assert "".join(res.mats) == "你好,这是非空白符号测试。" 124 | assert res.text == "你好,这是非空白符号测试。" 125 | 126 | 127 | def test_Text_clean_nwh(text_nwh): 128 | res = Text(['nwh']).clean(text_nwh) 129 | assert res == " \t\n" 130 | 131 | 132 | @pytest.fixture 133 | def text_wnb(): 134 | text = "你好,这是词与word数字number测试。" 135 | return text 136 | 137 | 138 | def test_Text_extract_wnb(text_wnb): 139 | res = Text(['wnb']).extract(text_wnb) 140 | assert "".join(res.mats) == "你好这是词与word数字number测试" 141 | assert res.text == "你好这是词与word数字number测试" 142 | 143 | 144 | def test_Text_clean_wnb(text_wnb): 145 | res = Text(['wnb']).clean(text_wnb) 146 | assert res == ",。" 147 | 148 | 149 | @pytest.fixture 150 | def text_nwn(): 151 | text = "你好,这是非词或word数字number测试。" 152 | return text 153 | 154 | 155 | def test_Text_extract_nwn(text_nwn): 156 | res = Text(['nwn']).extract(text_nwn) 157 | assert "".join(res.mats) == ",。" 158 | assert res.text == ",。" 159 | 160 | 161 | def test_Text_clean_nwn(text_nwn): 162 | res = Text(['nwn']).clean(text_nwn) 163 | assert res == "你好这是非词或word数字number测试" 164 | 165 | 166 | @pytest.fixture 167 | def text_eng(): 168 | text = "你好,这#¥是英文English测试。" 169 | return text 170 | 171 | 172 | def test_Text_extract_eng(text_eng): 173 | res = Text(['eng']).extract(text_eng) 174 | assert "".join(res.mats) == "English" 175 | assert res.text == "English" 176 | 177 | 178 | def test_Text_clean_eng(text_eng): 179 | res = Text(['eng']).clean(text_eng) 180 | assert res == "你好,这#¥是英文测试。" 181 | 182 | 183 | @pytest.fixture 184 | def text_num(): 185 | text = "你好,这#¥是数字2, +2, -2, 2.1, -2.2, 1/5, 2:3, -2/5, 2%, 2.5%测试。" 186 | return text 187 | 188 | 189 | def test_Text_extract_num(text_num): 190 | res = 
Text(['num']).extract(text_num) 191 | assert "".join(res.mats) == "2+2-22.1-2.21/52:3-2/52%2.5%" 192 | assert res.text == "2+2-22.1-2.21/52:3-2/52%2.5%" 193 | 194 | 195 | def test_Text_clean_num(text_num): 196 | res = Text(['num']).clean(text_num) 197 | assert res == "你好,这#¥是数字, , , , , , , , , 测试。" 198 | 199 | 200 | @pytest.fixture 201 | def text_pic(): 202 | text = "你好,这#¥是![p1](https://xxx.jpeg)图片![](yyy.png)测试https://z.jpg。" 203 | return text 204 | 205 | 206 | def test_Text_extract_pic(text_pic): 207 | res = Text(['pic']).extract(text_pic) 208 | assert "".join( 209 | res.mats) == "![p1](https://xxx.jpeg)![](yyy.png)https://z.jpg" 210 | assert res.text == "![p1](https://xxx.jpeg)![](yyy.png)https://z.jpg" 211 | 212 | 213 | def test_Text_clean_pic(text_pic): 214 | res = Text(['pic']).clean(text_pic) 215 | assert res == "你好,这#¥是图片测试。" 216 | 217 | 218 | @pytest.fixture 219 | def text_lnk(): 220 | text = "你好,www.g.com,这#¥是链接[link](https://yam.gift)测试http://yam.gift。" 221 | return text 222 | 223 | 224 | def test_Text_extract_lnk(text_lnk): 225 | res = Text(['lnk']).extract(text_lnk) 226 | assert "".join( 227 | res.mats) == "www.g.com[link](https://yam.gift)http://yam.gift" 228 | assert res.text == "www.g.com[link](https://yam.gift)http://yam.gift" 229 | 230 | 231 | def test_Text_clean_lnk(text_lnk): 232 | res = Text(['lnk']).clean(text_lnk) 233 | assert res == "你好,,这#¥是链接测试。" 234 | 235 | 236 | def test_markdown_link_with_whitespace_in_title(): 237 | text = """啊[Download | View](https://www.altova.com/xmlspy-xml-editor/download/)""" 238 | res = Text(['lnk']).clean(text) 239 | assert res == "啊" 240 | 241 | 242 | @pytest.fixture 243 | def text_emj(): 244 | text = "你好,这#¥是表情😁测试😜🌹。" 245 | return text 246 | 247 | 248 | def test_Text_extract_emj(text_emj): 249 | res = Text(['emj']).extract(text_emj) 250 | assert "".join(res.mats) == "😁😜🌹" 251 | assert res.text == "😁😜🌹" 252 | 253 | 254 | def test_Text_clean_emj(text_emj): 255 | res = Text(['emj']).clean(text_emj) 256 | assert res == "你好,这#¥是表情测试。" 257 | 258 | 259 | @pytest.fixture 260 | def text_len(): 261 | text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233." 
262 | return text 263 | 264 | 265 | def test_Text_len_all(text_len): 266 | res = Length(text_len).len_all 267 | assert res == 64 268 | 269 | 270 | def test_Text_len_nwh(text_len): 271 | res = Length(text_len).len_nwh 272 | assert res == 63 273 | 274 | 275 | def test_Text_len_chi(text_len): 276 | res = Length(text_len).len_chi 277 | assert res == 6 278 | 279 | 280 | def test_Text_len_wnb(text_len): 281 | res = Length(text_len).len_wnb 282 | assert res == 41 283 | 284 | 285 | def test_Text_len_pun(text_len): 286 | res = Length(text_len).len_pun 287 | assert res == 14 288 | 289 | 290 | def test_Text_len_eng(text_len): 291 | res = Length(text_len).len_eng 292 | assert res == 32 293 | 294 | 295 | def test_Text_len_num(text_len): 296 | res = Length(text_len).len_num 297 | assert res == 3 298 | 299 | 300 | if __name__ == '__main__': 301 | print(reg.patnames) 302 | -------------------------------------------------------------------------------- /tests/test_stopwords.py: -------------------------------------------------------------------------------- 1 | from pnlp.stopwords import StopWords 2 | 3 | 4 | 5 | def test_stopwords(): 6 | sw = StopWords() 7 | assert type(sw.zh) == set 8 | assert type(sw.en) == set 9 | assert sw.zh_len > 0 10 | assert sw.en_len > 0 11 | 12 | 13 | def test_custom_stopwords(): 14 | sw = StopWords("tests/piop_data/b.txt") 15 | assert type(sw.stopwords) == set 16 | assert len(sw.stopwords) == 3 -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from functools import partial 4 | import math 5 | import itertools 6 | import pytest 7 | import multiprocessing as mp 8 | 9 | from pnlp.utils import pstr, concurring, generate_batches_by_num 10 | from pnlp.utils import run_in_new_thread 11 | from pnlp.piop import read_file, write_file 12 | 13 | 14 | def test_pstr1(): 15 | s1 = pstr("123") 16 | s2 = "1" 17 | assert s1 - s2 == "23" 18 | 19 | 20 | def test_pstr2(): 21 | s1 = pstr("123") 22 | s2 = "123" 23 | assert s1 - s2 == "" 24 | 25 | 26 | def test_pstr3(): 27 | s1 = pstr("123") 28 | s2 = "234" 29 | assert s1 - s2 == "1" 30 | 31 | 32 | def test_pstr4(): 33 | s1 = pstr("123") 34 | s2 = "456" 35 | assert s1 - s2 == "123" 36 | 37 | 38 | def test_pstr5(): 39 | s1 = pstr("") 40 | s2 = "456" 41 | assert s1 - s2 == "" 42 | 43 | 44 | def test_generate_batches(): 45 | lst = range(100) 46 | res = list(generate_batches_by_num(lst, 10)) 47 | assert len(res) == 10 48 | assert len(res[0]) == 10 49 | 50 | 51 | def is_prime(x): 52 | if x < 2: 53 | return False 54 | for i in range(2, int(math.sqrt(x)) + 1): 55 | if x % i == 0: 56 | return False 57 | return True 58 | 59 | 60 | def test_concurring_default(): 61 | 62 | @concurring 63 | def get_primes(lst): 64 | res = [] 65 | for i in lst: 66 | if is_prime(i): 67 | res.append(i) 68 | return res 69 | lst = list(range(100)) 70 | res = get_primes(lst) 71 | res = list(res) 72 | assert len(res) == mp.cpu_count() 73 | res = list(itertools.chain(*res)) 74 | assert len(res) == 25 75 | 76 | 77 | @pytest.mark.parametrize( 78 | "type", 79 | ["thread_pool", "process_pool", "thread_executor", "thread"]) 80 | @pytest.mark.parametrize("max_workers", [1, 2, 4, 7, 10]) 81 | def test_concurring_with_parameters(type, max_workers): 82 | 83 | @concurring(type=type, max_workers=max_workers) 84 | def get_primes(lst): 85 | res = [] 86 | for i in lst: 87 | if is_prime(i): 88 | res.append(i) 89 | return res 90 | lst = 
list(range(100)) 91 | res = get_primes(lst) 92 | res = list(res) 93 | assert len(res) == max_workers 94 | res = list(itertools.chain(*res)) 95 | assert len(res) == 25 96 | 97 | 98 | def test_concurring_invalid_type(): 99 | 100 | @concurring(type="invalid") 101 | def get_primes(lst): 102 | res = [] 103 | for i in lst: 104 | if is_prime(i): 105 | res.append(i) 106 | return res 107 | lst = list(range(100)) 108 | try: 109 | res = get_primes(lst) 110 | except Exception as err: 111 | assert "invalid" in str(err) 112 | 113 | 114 | def test_concurring_invalid_workers(): 115 | 116 | try: 117 | @concurring(max_workers=0) 118 | def get_primes(lst): 119 | res = [] 120 | for i in lst: 121 | if is_prime(i): 122 | res.append(i) 123 | return res 124 | except Exception as err: 125 | assert "0" in str(err) 126 | 127 | 128 | 129 | def test_run_in_thread(): 130 | file = "run_in_new_thread.txt" 131 | 132 | def func(file, a, b, c): 133 | write_file(file, list(map(str, [a, b, c]))) 134 | 135 | run_in_new_thread(func, file, 1, 2, 3) 136 | import time 137 | time.sleep(1) 138 | 139 | assert os.path.exists(file) 140 | os.remove(file) 141 | 142 | 143 | def test_run_in_thread_kwargs(): 144 | kwargs = { 145 | "b": 2, 146 | "c": 3, 147 | } 148 | 149 | def func(a, b, c): 150 | return a + b + c 151 | 152 | func = partial(func, **kwargs) 153 | 154 | assert 6 == func(1) 155 | 156 | --------------------------------------------------------------------------------
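As a final sketch, the `pstr` helper from pnlp/utils.py overloads subtraction to drop every character of the right-hand operand from the left-hand string, as the tests above exercise; the example strings are illustrative assumptions:

from pnlp.utils import pstr

s = pstr("hello, world")
assert s - "lo" == "he, wrd"         # every 'l' and 'o' is removed
assert pstr("abc") - "xyz" == "abc"  # no shared characters, nothing removed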