├── .gitignore ├── LICENSE ├── README.md ├── examples ├── batch_keyword_search.py ├── keyword_search.py ├── new_word.py └── new_word │ ├── test.txt │ ├── test_new_word.txt │ ├── test_new_word2.txt │ ├── test_new_word3.txt │ └── test_new_word4.txt ├── lighttext ├── __init__.py ├── component │ ├── __init__.py │ ├── article.py │ ├── character.py │ ├── dictionary.py │ ├── paragraph.py │ ├── sentence.py │ ├── vocabulary.py │ └── word.py ├── core │ ├── __init__.py │ └── utils.py ├── data │ ├── ban.txt │ ├── dict.txt │ ├── post_filter_pos.txt │ ├── pre_filter_pos.txt │ └── stopwords.txt ├── doc_search │ ├── __init__.py │ ├── basic.py │ ├── bloom.py │ └── searcher.py ├── inverted_index │ ├── __init__.py │ └── inv_idx.py ├── string_search │ ├── __init__.py │ ├── keyword.py │ └── ner.py ├── text_mining │ ├── __init__.py │ └── new_words_detection │ │ ├── __init__.py │ │ ├── detector.py │ │ └── utils.py ├── text_similarity │ ├── __init__.py │ └── sim.py └── utils │ ├── __init__.py │ ├── sen_split.py │ └── tokenize.py ├── requirements.txt ├── setup.py ├── test ├── doc.txt ├── doc_search │ ├── test_bloom.py │ └── test_searcher.py ├── fuzzy_test.py ├── test_article.py ├── test_gensim.py ├── test_paragraph.py ├── test_sen_split.py ├── test_sentence.py ├── test_sim_model.py └── test_vocabulary.py └── version.txt /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | dist 3 | lightText.egg-info 4 | 5 | # python 6 | *__pycache__* 7 | 8 | # pycharm 9 | *.idea* 10 | 11 | # test 12 | test/_trial_temp 13 | test/sim.bin 14 | test/saves 15 | 16 | # scripts 17 | scripts 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lightText 2 | 文本处理相关库,目前包括新词发现、字符串提取等功能。 3 | 4 | ## 功能 5 | 6 | 1. 新词发现 7 | 2. 
字符串提取 8 | 9 | ## 安装 10 | 11 | ```bash 12 | pip install lightText 13 | ``` 14 | 建议使用国内源来安装,如使用以下命令: 15 | 16 | ```bash 17 | pip install -i https://pypi.douban.com/simple/ lightText 18 | ``` 19 | 20 | ## 使用 21 | 22 | ### 1.新词发现 23 | 24 | 目前主要是根据统计指标判别两个词是否可以连结成一个新词,并没有考虑三个词组词的情况。主要原理是互信息和左右熵。 25 | 26 | #### 使用示例 27 | 28 | ```python 29 | from lighttext import NewWordDetector 30 | 31 | if __name__ == '__main__': 32 | detector = NewWordDetector() 33 | detector.load_file('new_word/test_new_word3.txt') 34 | print(detector.get_top_k(5)) 35 | ``` 36 | 37 | 其中,文本内容如下: 38 | 39 | ```text 40 | 知识图谱(Knowledge Graph),在图书情报界称为知识域可视化或知识领域映射地图,是显示知识发展进程与结构关系的一系列各种不同的图形,用可视化技术描述知识资源及其载体,挖掘、分析、构建、绘制和显示知识及它们之间的相互联系。 41 | 知识图谱是通过将应用数学、图形学、信息可视化技术、信息科学等学科的理论与方法与计量学引文分析、共现分析等方法结合,并利用可视化的图谱形象地展示学科的核心结构、发展历史、前沿领域以及整体知识架构达到多学科融合目的的现代理论。它能为学科研究提供切实的、有价值的参考。 42 | 具体来说,知识图谱是通过将应用数学、图形学、信息可视化技术、信息科学等学科的理论与方法与计量学引文分析、共现分析等方法结合,并利用可视化的图谱形象地展示学科的核心结构、发展历史、前沿领域以及整体知识架构达到多学科融合目的的现代理论。它把复杂的知识领域通过数据挖掘、信息处理、知识计量和图形绘制而显示出来,揭示知识领域的动态发展规律,为学科研究提供切实的、有价值的参考。迄今为止,其实际应用在发达国家已经逐步拓展并取得了较好的效果,但它在我国仍属研究的起步阶段。 43 | ``` 44 | 45 | #### 运行结果 46 | 47 | ```text 48 | [('知识_图谱', 0.4759861448031701), ('可视化_技术', 0.43589887951973), ('知识_领域', 0.4213569981012666), ('共现_分析', 0.38939972966154035), ('计量学_引文', 0.3790459908198307)] 49 | ``` 50 | 51 | ### 2.字符串提取 52 | 53 | 基于AC自动机的高效的字符串匹配提取工具,可以直接得到覆盖最多字的提取结果。主要原理是AC自动机、字典树、动态规划算法。 54 | 55 | #### 使用示例 56 | 57 | ```python 58 | from lighttext import KeywordProcessor 59 | 60 | 61 | if __name__ == '__main__': 62 | kp = KeywordProcessor() 63 | kp.add_keyword("曹操") 64 | kp.add_keyword("曹丕") 65 | kp.add_keyword("司马懿") 66 | kp.add_keyword("司马") 67 | stn = "曹操、曹丕和司马懿一起去吃大盘鸡。" 68 | 69 | print(kp.extract_keywords(stn)) 70 | ``` 71 | 72 | #### 运行结果 73 | 74 | ```text 75 | ['曹操', '曹丕', '司马懿'] 76 | ``` 77 | 78 | ## 参考 79 | 80 | ### NLP 81 | 82 | 1. [基于互信息和左右信息熵的短语提取识别-码农场](https://www.hankcs.com/nlp/extraction-and-identification-of-mutual-information-about-the-phrase-based-on-information-entropy.html) 83 | 2. [互联网时代的社会语言学:基于SNS的文本数据挖掘 | Matrix67: The Aha Moments](http://www.matrix67.com/blog/archives/5044) 84 | 3. [python3实现互信息和左右熵的新词发现 - 简书](https://www.jianshu.com/p/e9313fd692ef) 85 | 86 | ### 源码 87 | 88 | 1. [xylander23/New-Word-Detection: 新词发现算法(NewWordDetection)](https://github.com/xylander23/New-Word-Detection) 89 | 2. [zhanzecheng/Chinese_segment_augment: python3实现互信息和左右熵的新词发现](https://github.com/zhanzecheng/Chinese_segment_augment) 90 | 3. [vi3k6i5/flashtext: Extract Keywords from sentence or Replace keywords in sentences.](https://github.com/vi3k6i5/flashtext) 91 | 92 | ### Python 93 | 94 | 1. [Can't pickle local object 'DataLoader.__init__..' - vision - PyTorch Forums](https://discuss.pytorch.org/t/cant-pickle-local-object-dataloader-init-locals-lambda/31857) 95 | 2. [python3.X中pickle类的用法(cPickle模块移除了)_python,pickle_lanqiu5ge的专栏-CSDN博客](https://blog.csdn.net/lanqiu5ge/article/details/25136909) 96 | 3. [python - copy.deepcopy vs pickle - Stack Overflow](https://stackoverflow.com/questions/1410615/copy-deepcopy-vs-pickle) 97 | 4. [Python中collections.defaultdict()使用 - 简书](https://www.jianshu.com/p/26df28b3bfc8) 98 | 99 | ### 数据结构 100 | 101 | 1. 
[Trie树(字典树)](https://github.com/zhanzecheng/The-Art-Of-Programming-By-July/blob/master/ebook/zh/06.09.md) 102 | 103 | ## 打赏 104 | 105 | 如果该项目对您有所帮助,欢迎打赏~ 106 | 107 | ![UTOOLS1578660899400.jpg](https://lightsmile-img.oss-cn-beijing.aliyuncs.com/UTOOLS1578660899400.jpg) 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /examples/batch_keyword_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020/7/12 18:22 3 | # @Author : lightsmile 4 | # @Software: PyCharm 5 | 6 | import time 7 | 8 | from tqdm import tqdm 9 | from lightutils import read_json_line, cutoff_iter 10 | from lighttext import KeywordProcessor 11 | 12 | data_path = r"D:\Data\NLP\corpus\baike_info_1_to_10000\result.json" 13 | data_path2 = r"D:\Data\NLP\corpus\baike_info_10000_to_100000\result.json" 14 | 15 | if __name__ == '__main__': 16 | word_list = [] 17 | for line in tqdm(read_json_line(data_path)): 18 | if 'info' in line: 19 | word_list.append(line['info']['word']) 20 | 21 | for line in tqdm(read_json_line(data_path2)): 22 | if 'info' in line: 23 | word_list.append(line['info']['word']) 24 | 25 | word_set = set(word_list) 26 | 27 | print(len(word_list)) 28 | print(len(word_set)) 29 | 30 | for word in cutoff_iter(word_set): 31 | print(word) 32 | 33 | a = time.time() 34 | kp = KeywordProcessor() 35 | kp.add_keyword("张居正", "人物") 36 | b = time.time() 37 | for word in tqdm(word_set): 38 | if len(word) > 1: 39 | kp.add_keyword(word) 40 | c = time.time() 41 | print(c - b, b - a) 42 | print(len(kp)) 43 | 44 | sentence = """东汉末年,天下大乱,曹操以汉朝天子刘协的名义征讨四方,对内消灭二袁、吕布、刘表、马超、韩遂等割据势力,对外降服南匈奴、乌桓、鲜卑等,统一了中国北方,并实行一系列政策恢复经济生产和社会秩序,扩大屯田、兴修水利、奖励农桑、重视手工业、安置流亡人口、实行“租调制”,从而使中原社会渐趋稳定、经济出现转机。 """ 45 | sentence = """张居正入选庶吉士,教习中有内阁重臣徐阶。徐阶重视经邦济世的学问,在其引导下,张居正努力钻研朝章国故,为他日后走上政治舞台打下了坚实的基础。明初为了加强君主专制,废丞相,设内阁,其职能相当于皇帝的秘书厅。首席内阁学士称首辅,张居正入翰林院学习的时候,内阁中正在进行着一场激烈的政治斗争。内阁大学士只有夏言、严嵩二人,二人争夺首辅职位,夏言夺得首辅之后被严嵩进谗而被杀,严嵩为内阁首辅。""" 46 | print(kp.extract_keywords(sentence)) 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /examples/keyword_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020/7/2 15:16 3 | # @Author : lightsmile 4 | # @Software: PyCharm 5 | 6 | from lighttext import KeywordProcessor 7 | 8 | 9 | if __name__ == '__main__': 10 | kp = KeywordProcessor() 11 | kp.add_keyword("曹操") 12 | kp.add_keyword("曹丕") 13 | kp.add_keyword("司马懿") 14 | kp.add_keyword("司马") 15 | stn = "曹操、曹丕和司马懿一起去吃大盘鸡。" 16 | 17 | print(kp.extract_keywords(stn)) 18 | -------------------------------------------------------------------------------- /examples/new_word.py: -------------------------------------------------------------------------------- 1 | from lighttext import NewWordDetector 2 | 3 | if __name__ == '__main__': 4 | detector = NewWordDetector() 5 | detector.load_file('new_word/test_new_word3.txt') 6 | print(detector.get_top_k(5)) 7 | -------------------------------------------------------------------------------- /examples/new_word/test.txt: -------------------------------------------------------------------------------- 1 | 第一章 陨落的天才 2 | 3 | “斗之力,三段!” 4 | 望着测验魔石碑上面闪亮得甚至有些刺眼的五个大字,少年面无表情,唇角有着一抹自嘲,紧握的手掌,因为大力,而导致略微尖锐的指甲深深的刺进了掌心之中,带来一阵阵钻心的疼痛…… 5 | “萧炎,斗之力,三段!级别:低级!”测验魔石碑之旁,一位中年男子,看了一眼碑上所显示出来的信息,语气漠然的将之公布了出来…… 6 | 中年男子话刚刚脱口,便是不出意外的在人头汹涌的广场上带起了一阵嘲讽的骚动。 7 | “三段?嘿嘿,果然不出我所料,这个“天才”这一年又是在原地踏步!” 8 | “哎,这废物真是把家族的脸都给丢光了。” 9 | 
“要不是族长是他的父亲,这种废物,早就被驱赶出家族,任其自生自灭了,哪还有机会待在家族中白吃白喝。” 10 | “唉,昔年那名闻乌坦城的天才少年,如今怎么落魄成这般模样了啊?” 11 | “谁知道呢,或许做了什么亏心事,惹得神灵降怒了吧……” 12 | 周围传来的不屑嘲笑以及惋惜轻叹,落在那如木桩待在原地的少年耳中,恍如一根根利刺狠狠的扎在心脏一般,让得少年呼吸微微急促。 13 | 少年缓缓抬起头来,露出一张有些清秀的稚嫩脸庞,漆黑的眸子木然的在周围那些嘲讽的同龄人身上扫过,少年嘴角的自嘲,似乎变得更加苦涩了。 14 | “这些人,都如此刻薄势力吗?或许是因为三年前他们曾经在自己面前露出过最谦卑的笑容,所以,如今想要讨还回去吧……”苦涩的一笑,萧炎落寞的转身,安静地回到了队伍的最后一排,孤单的身影,与周围的世界,有些格格不入。 15 | “下一个,萧媚!” 16 | 听着测验人的喊声,一名少女快速的人群中跑出,少女刚刚出场,附近的议论声便是小了许多,一双双略微火热的目光,牢牢的锁定着少女的脸颊…… 17 | 少女年龄不过十四左右,虽然并算不上绝色,不过那张稚气未脱的小脸,却是蕴含着淡淡的妩媚,清纯与妩媚,矛盾的集合,让得她成功的成为了全场瞩目的焦点…… 18 | 少女快步上前,小手轻车熟路的触摸着漆黑的魔石碑,然后缓缓闭上眼睛…… 19 | 在少女闭眼片刻之后,漆黑的魔石碑之上再次亮起了光芒…… 20 | “斗之气:七段!” 21 | “萧媚,斗之气:七段!级别:高级!” 22 | “耶!”听着测验员所喊出的成绩,少女脸颊扬起了得意的笑容…… 23 | “啧啧,七段斗之气,真了不起,按这进度,恐怕顶多只需要三年时间,她就能称为一名真正的斗者了吧……” 24 | “不愧是家族中种子级别的人物啊……” 25 | 听着人群中传来的一阵阵羡慕声,少女脸颊上的笑容更是多了几分,虚荣心,这是很多女孩都无法抗拒的诱惑…… 26 | 与平日里的几个姐妹互相笑谈着,萧媚的视线,忽然的透过周围的人群,停在了人群外的那一道孤单身影上…… 27 | 皱眉思虑了瞬间,萧媚还是打消了过去的念头,现在的两人,已经不在同一个阶层之上,以萧炎最近几年的表现,成年后,顶多只能作为家族中的下层人员,而天赋优秀的她,则将会成为家族重点培养的强者,前途可以说是不可限量。 28 | “唉……”莫名的轻叹了一口气,萧媚脑中忽然浮现出三年前那意气风发的少年,四岁练气,十岁拥有九段斗之气,十一岁突破十段斗之气,成功凝聚斗之气旋,一跃成为家族百年之内最年轻的斗者! 29 | 当初的少年,自信而且潜力无可估量,不知让得多少少女对其春心荡漾,当然,这也包括以前的萧媚。 30 | 然而天才的道路,貌似总是曲折的,三年之前,这名声望达到巅峰的天才少年,却是突兀的接受到了有生以来最残酷的打击,不仅辛辛苦苦修炼十数载方才凝聚的斗之气旋,一夜之间,化为乌有,而且体内的斗之气,也是随着时间的流逝,变得诡异的越来越少。 31 | 斗之气消失的直接结果,便是导致其实力不断地后退。 32 | 从天才的神坛,一夜跌落到了连普通人都不如的地步,这种打击,让得少年从此失魂落魄,天才之名,也是逐渐的被不屑与嘲讽所替代。 33 | 站的越高,摔得越狠,这次的跌落,或许就再也没有爬起的机会。 34 | “下一个,萧薰儿!” 35 | 喧闹的人群中,测试员的声音,再次响了起来。 36 | 随着这有些清雅的名字响起,人群忽然的安静了下来,所有的视线,豁然转移。 37 | 在众人视线汇聚之处,一位身着紫色衣裙的少女,正淡雅的站立,平静的稚嫩俏脸,并未因为众人的注目而改变分毫。 38 | 少女清冷淡然的气质,犹如清莲初绽,小小年纪,却已初具脱俗气质,难以想象,日后若是长大,少女将会如何的倾国倾城…… 39 | 这名紫裙少女,论起美貌与气质来,比先前的萧媚,无疑还要更胜上几分,也难怪在场的众人都是这般动作。 40 | 莲步微移,名为萧薰儿的少女行到魔石碑之前,小手伸出,镶着黑金丝的紫袖滑落而下,露出一截雪白娇嫩的皓腕,然后轻触着石碑…… 41 | 微微沉静,石碑之上,刺眼的光芒再次绽放。 42 | “斗之气:九段!级别:高级!” 43 | 望着石碑之上的字体,场中陷入了一阵寂静。 44 | “……竟然到九段了,真是恐怖!家族中年轻一辈的第一人,恐怕非薰儿小姐莫属了。”寂静过后,周围的少年,都是不由自主的咽了一口唾沫,眼神充满敬畏…… 45 | 斗之气,每位斗者的必经之路,初阶斗之气分一至十段,当体内斗之气到达十段之时,便能凝聚斗之气旋,成为一名受人尊重的斗者! 46 | 人群中,萧媚皱着浅眉盯着石碑前的紫裙少女,脸颊上闪过一抹嫉妒…… 47 | 望着石碑上的信息,一旁的中年测验员漠然的脸庞上竟然也是罕见的露出了一丝笑意,对着少女略微恭声道:“薰儿小姐,半年之后,你应该便能凝聚斗气之旋,如果你成功的话,那么以十四岁年龄成为一名真正的斗者,你是萧家百年内的第二人!” 48 | 是的,第二人,那位第一人,便是褪去了天才光环的萧炎。 49 | “谢谢。”少女微微点了点头,平淡的小脸并未因为他的夸奖而出现喜悦,安静地回转过身,然后在众人炽热的注目中,缓缓地行到了人群最后面的那颓废少年面前…… 50 | “萧炎哥哥。”在经过少年身旁时,少女顿下了脚步,对着萧炎恭敬地弯了弯腰,美丽的俏脸上,居然露出了让周围少女为之嫉妒的清雅笑容。 51 | “我现在还有资格让你怎么叫么?”望着面前这颗已经成长为家族中最璀璨的明珠,萧炎苦涩的道,她是在自己落魄后,极为少数还对自己依旧保持着尊敬的人。 52 | “萧炎哥哥,以前你曾经与薰儿说过,要能放下,才能拿起,提放自如,是自在人!”萧薰儿微笑着柔声道,略微稚嫩的嗓音,却是暖人心肺。 53 | “呵呵,自在人?我也只会说而已,你看我现在的模样,象自在人吗?而且……这世界,本来就不属于我。”萧炎自嘲的一笑,意兴阑珊地道。 54 | 面对着萧炎的颓废,萧薰儿纤细的眉毛微微皱了皱,认真的道:“萧炎哥哥,虽然并不知道你究竟是怎么回事,不过,薰儿相信,你会重新站起来,取回属于你的荣耀与尊严……”话到此处,微顿了顿,少女白皙的俏脸,头一次露出淡淡的绯红:“当年的萧炎哥哥,的确很吸引人……” 55 | “呵呵……”面对着少女毫不掩饰的坦率话语,少年尴尬的笑了一声,可却未再说什么,人不风流枉少年,可现在的他,实在没这资格与心情,落寞的回转过身,对着广场之外缓缓行去…… 56 | 站在原地望着少年那恍如与世隔绝的孤独背影,萧薰儿踌躇了一会,然后在身后一干嫉妒的狼嚎声中,快步追了上去,与少年并肩而行…… 57 | 58 | 第二章 斗气大陆 59 | 60 | 月如银盘,漫天繁星。 61 | 山崖之颠,萧炎斜躺在草地之上,嘴中叼中一根青草,微微嚼动,任由那淡淡的苦涩在嘴中弥漫开来…… 62 | 举起有些白皙的手掌,挡在眼前,目光透过手指缝隙,遥望着天空上那轮巨大的银月。 63 | “唉……”想起下午的测试,萧炎轻叹了一口气,懒懒地抽回手掌,双手枕着脑袋,眼神有些恍惚…… 64 | “十五年了呢……”低低的自喃声,忽然毫无边际的从少年嘴中轻吐了出来。 65 | 在萧炎的心中,有一个仅有他自己知道的秘密:他并不是这个世界的人,或者说,萧炎的灵魂,并不属于这个世界,他来自一个名叫地球的蔚蓝星球,至于为什么会来到这里,这种离奇经过,他也无法解释,不过在生活了一段时间之后,他还是后知后觉的明白了过来:他穿越了! 66 | 随着年龄的增长,对这块大陆,萧炎也是有了些模糊的了解…… 67 | 大陆名为斗气大陆,大陆上并没有小说中常见的各系魔法,而斗气,才是大陆的唯一主调! 68 | 在这片大陆上,斗气的修炼,几乎已经在无数代人的努力之下,发展到了巅峰地步,而且由于斗气的不断繁衍,最后甚至扩散到了民间之中,这也导致,斗气,与人类的日常生活,变得息息相关,如此,斗气在大陆中的重要性,更是变得无可替代! 69 | 因为斗气的极端繁衍,同时也导致从这条主线中分化出了无数条斗气修炼之法,所谓手有长短,分化出来的斗气修炼之法,自然也是有强有弱。 70 | 经过归纳统计,斗气大陆将斗气功法的等级,由高到低分为四阶十二级:天、地、玄、黄! 71 | 而每一阶,又分初,中,高三级! 
72 | 修炼的斗气功法等级的高低,也是决定日后成就高低的关键,比如修炼玄阶中级功法的人,自然要比修炼黄阶高级功法的同等级的人要强上几分。 73 | 斗气大陆,分辩强弱,取决于三种条件。 74 | 首先,最重要的,当然是自身的实力,如果本身实力只有一星斗者级别,那就算你修炼的是天阶高级的稀世功法,那也难以战胜一名修炼黄阶功法的斗师。 75 | 其次,便是功法!同等级的强者,如果你的功法等级较之对方要高级许多,那么在比试之时,种种优势,一触既知。 76 | 最后一种,名叫斗技! 77 | 顾名思义,这是一种发挥斗气的特殊技能,斗技在大陆之上,也有着等级之分,总的说来,同样也是分为天地玄黄四级。 78 | 斗气大陆斗技数不胜数,不过一般流传出来的大众斗技,大多都只是黄级左右,想要获得更高深的斗技,便必须加入宗派,或者大陆上的斗气学院。 79 | 当然,一些依靠奇遇所得到前人遗留而下的功法,或者有着自己相配套的斗技,这种由功法衍变而出的斗技,互相配合起来,威力要更强上一些。 80 | 依靠这三种条件,方才能判出究竟孰强孰弱,总的说来,如果能够拥有等级偏高的斗气功法,日后的好处,不言而喻…… 81 | 不过高级斗气修炼功法常人很难得到,流传在普通阶层的功法,顶多只是黄阶功法,一些比较强大的家族或者中小宗派,应该有玄阶的修炼之法,比如萧炎所在的家族,最为顶层的功法,便是只有族长才有资格修炼的:狂狮怒罡,这是一种风属性,并且是玄阶中级的斗气功法。 82 | 玄阶之上,便是地阶了,不过这种高深功法,或许便只有那些超然势力与大帝国,方才可能拥有…… 83 | 至于天阶……已经几百年未曾出现了。 84 | 从理论上来说,常人想要获得高级功法,基本上是难如登天,然而事无绝对,斗气大陆地域辽阔,万族林立,大陆之北,有号称力大无穷,可与兽魂合体的蛮族,大陆之南,也有各种智商奇高的高级魔兽家族,更有那以诡异阴狠而著名的黑暗种族等等…… 85 | 由于地域的辽阔,也有很多不为人知的无名隐士,在生命走到尽头之后,性子孤僻的他们,或许会将平生所创功法隐于某处,等待有缘人取之,在斗气大陆上,流传一句话:如果某日,你摔落悬崖,掉落山洞,不要惊慌,往前走两步,或许,你,将成为强者! 86 | 此话,并不属假,大陆近千年历史中,并不泛这种依靠奇遇而成为强者的故事。 87 | 这个故事所造成的后果,便是造就了大批每天等在悬崖边,准备跳崖得绝世功法的怀梦之人,当然了,这些人大多都是以断胳膊断腿归来…… 88 | 总之,这是一片充满奇迹,以及创造奇迹的大陆! 89 | 当然,想要修炼斗气秘籍,至少需要成为一名真正的斗者之后,方才够资格,而现在的萧炎隔那段距离,似乎还很是遥远…… 90 | “呸。”吐出嘴中的草根,萧炎忽然跳起身来,脸庞狰狞,对着夜空失态地咆哮道:“我操你奶奶的,把老子穿过来当废物玩吗?草!” 91 | 在前世,萧炎只是庸碌众生中极其平凡的一员,金钱,美人,这些东西与他根本就是两条平行线,永远没有交叉点,然而,当来到这片斗气大陆之后,萧炎却是惊喜的发现,因为两世的经验,他的灵魂,竟然比常人要强上许多! 92 | 要知道,在斗气大陆,灵魂是天生的,或许它能随着年龄的增长而稍稍变强,可却从没有什么功法能够单独修炼灵魂,就算是天阶功法,也不可能!这是斗气大陆的常识。 93 | 灵魂的强化,也造就出萧炎的修炼天赋,同样,也造就了他的天才之名。 94 | 当一个平凡庸碌之人,在知道他有成为无数人瞩目的本钱之后,若是没有足够的定力,很难能够把握本心,很显然的,前世仅仅是普通人的萧炎,并没有这种超人般的定力,所以,在他开始修炼斗之气后,他选择了成为受人瞩目的天才之路,而并非是在安静中逐渐成长! 95 | 若是没有意外发生的话,萧炎或许还真能够顶着天才的名头越长越大,不过,很可惜,在十一岁那年,天才之名,逐渐被突如其来的变故剥夺而去,而天才,也是在一夜间,沦落成了路人口中嘲笑的废物! 96 | …… 97 | 在咆哮了几嗓子之后,萧炎的情绪也是缓缓的平息了下来,脸庞再次回复了平日的落寞,事与至此,不管他如何暴怒,也是挽不回辛苦修炼而来的斗之气旋。 98 | 苦涩地摇了摇头,萧炎心中其实有些委屈,毕竟他对自己身体究竟发生了什么事,也是一概不知,平日检查,却没有发现丝毫不对劲的地方,灵魂,随着年龄的增加,也是越来越强大,而且吸收斗之气的速度,比几年前最巅峰的状态还要强盛上几分,这种种条件,都说明自己的天赋从不曾减弱,可那些进入体内的斗之气,却都是无一例外的消失得干干净净,诡异的情形,让得萧炎黯然神伤…… 99 | 黯然地叹了口气,萧炎抬起手掌,手指上有一颗黑色戒指,戒指很是古朴,不知是何材料所铸,其上还绘有些模糊的纹路,这是母亲临死前送给他的唯一礼物,从四岁开始,他已经佩戴了十年,母亲的遗物,让得萧炎对它也是有着一份眷恋,手指轻轻地抚摸着戒指,萧炎苦笑道:“这几年,还真是辜负母亲的期望了……” 100 | 深深的吐了一口气,萧炎忽然回转过头,对着漆黑的树林温暖地笑道:“父亲,您来了?” 101 | 虽然斗之气只有三段,不过萧炎的灵魂感知,却是比一名五星斗者都要敏锐许多,在先前说起母亲的时候,他便察觉到了树林中的一丝动静。 102 | “呵呵,炎儿,这么晚了,怎么还待在这上面呢?”树林中,在静了片刻后,传出男子的关切笑声。 103 | 树枝一阵摇摆,一位中年人跃了出来,脸庞上带着笑意,凝视着自己那站在月光下的儿子。 104 | 中年人身着华贵的灰色衣衫,龙行虎步间颇有几分威严,脸上一对粗眉更是为其添了几分豪气,他便是萧家现任族长,同时也是萧炎的父亲,五星大斗师,萧战! 105 | “父亲,您不也还没休息么?”望着中年男子,萧炎脸庞上的笑容更浓了一分,虽然自己有着前世的记忆,不过自出生以来,面前这位父亲便是对自己百般宠爱,在自己落魄之后,宠爱不减反增,如此行径,却是让得萧炎甘心叫他一声父亲。 106 | “炎儿,还在想下午测验的事呢?”大步上前,萧战笑道。 107 | “呵呵,有什么好想的,意料之中而已。”萧炎少年老成的摇了摇头,笑容却是有些勉强。 108 | “唉……”望着萧炎那依旧有些稚嫩的清秀脸庞,萧战叹了一口气,沉默了片刻,忽然道:“炎儿,你十五岁了吧?” 109 | “嗯,父亲。” 110 | “再有一年,似乎……就该进行成年仪式了……”萧战苦笑道。 111 | “是的,父亲,还有一年!”手掌微微一紧,萧炎平静地回道,成年仪式代表什么,他自然非常清楚,只要度过了成年仪式,那么没有修炼潜力的他,便将会被取消进入斗气阁寻找斗气功法的资格,从而被分配到家族的各处产业之中,为家族打理一些普通事物,这是家族的族规,就算他的父亲是族长,那也不可能改变! 112 | 毕竟,若是在二十五岁之前没有成为一名斗者,那将不会被家族所认可! 
113 | “对不起了,炎儿,如果在一年后你的斗之气达不到七段,那么父亲也只得忍痛把你分配到家族的产业中去,毕竟,这个家族,还并不是父亲一人说了算,那几个老家伙,可随时等着父亲犯错呢……”望着平静的萧炎,萧战有些歉疚地叹道。 114 | “父亲,我会努力的,一年后,我一定会到达七段斗之气的!”萧炎微笑着安慰道。 115 | “一年,四段?呵呵,如果是以前,或许还有可能吧,不过现在……基本没半点机会……”虽然口中在安慰着父亲,不过萧炎心中却是自嘲的苦笑了起来。 116 | 同样非常清楚萧炎底细的萧战,也只得叹息着应了一声,他知道一年修炼四段斗之气有多困难,轻拍了拍他的脑袋,忽然笑道:“不早了,回去休息吧,明天,家族中有贵客,你可别失了礼。” 117 | “贵客?谁啊?”萧炎好奇的问道。 118 | “明天就知道了。”对着萧炎挤了挤眼睛,萧战大笑而去,留下无奈的萧炎。 119 | “放心吧,父亲,我会尽力的!”抚摸着手指上的古朴戒指,萧炎抬头喃喃道。 120 | 在萧炎抬头的那一刹,手指中的黑色古戒,却是忽然亮起了一抹极其微弱的诡异毫光,毫光眨眼便逝,没有引起任何人的察觉…… -------------------------------------------------------------------------------- /examples/new_word/test_new_word.txt: -------------------------------------------------------------------------------- 1 | 自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。 2 | 它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。 3 | 自然语言处理是一门融语言学、计算机科学、数学于一体的科学。 4 | 因此,这一领域的研究将涉及自然语言,即人们日常使用的语言,所以它与语言学的研究有着密切的联系,但又有重要的区别。 5 | 自然语言处理并不是一般地研究自然语言,而在于研制能有效地实现自然语言通信的计算机系统,特别是其中的软件系统。 6 | 因而它是计算机科学的一部分。 7 | 自然语言处理(NLP)是计算机科学,人工智能,语言学关注计算机和人类(自然)语言之间的相互作用的领域。 8 | 实现人机间自然语言通信意味着要使计算机既能理解自然语言文本的意义,也能以自然语言文本来表达给定的意图、思想等。 9 | 前者称为自然语言理解,后者称为自然语言生成。 10 | 因此,自然语言处理大体包括了自然语言理解和自然语言生成两个部分。 11 | 历史上对自然语言理解研究得较多,而对自然语言生成研究得较少。 12 | 但这种状况已有所改变。 13 | 无论实现自然语言理解,还是自然语言生成,都远不如人们原来想象的那么简单,而是十分困难的。 14 | 从现有的理论和技术现状看,通用的、高质量的自然语言处理系统,仍然是较长期的努力目标,但是针对一定应用,具有相当自然语言处理能力的实用系统已经出现,有些已商品化,甚至开始产业化。 15 | 典型的例子有:多语种数据库和专家系统的自然语言接口、各种机器翻译系统、全文信息检索系统、自动文摘系统等。 16 | 语言是人类区别其他动物的本质特性。在所有生物中,只有人类才具有语言能力。人类的多种智能都与语言有着密切的关系。人类的逻辑思维以语言为形式,人类的绝大部分知识也是以语言文字的形式记载和流传下来的。因而,它也是人工智能的一个重要,甚至核心部分。 17 | 用自然语言与计算机进行通信,这是人们长期以来所追求的。因为它既有明显的实际意义,同时也有重要的理论意义:人们可以用自己最习惯的语言来使用计算机,而无需再花大量的时间和精力去学习不很自然和习惯的各种计算机语言;人们也可通过它进一步了解人类的语言能力和智能的机制。 18 | 实现人机间自然语言通信意味着要使计算机既能理解自然语言文本的意义,也能以自然语言文本来表达给定的意图、思想等。前者称为自然语言理解,后者称为自然语言生成。因此,自然语言处理大体包括了自然语言理解和自然语言生成两个部分。历史上对自然语言理解研究得较多,而对自然语言生成研究得较少。但这种状况已有所改变。 19 | 无论实现自然语言理解,还是自然语言生成,都远不如人们原来想象的那么简单,而是十分困难的。从现有的理论和技术现状看,通用的、高质量的自然语言处理系统,仍然是较长期的努力目标,但是针对一定应用,具有相当自然语言处理能力的实用系统已经出现,有些已商品化,甚至开始产业化。典型的例子有:多语种数据库和专家系统的自然语言接口、各种机器翻译系统、全文信息检索系统、自动文摘系统等。 20 | 自然语言处理,即实现人机间自然语言通信,或实现自然语言理解和自然语言生成是十分困难的。造成困难的根本原因是自然语言文本和对话的各个层次上广泛存在的各种各样的歧义性或多义性(ambiguity)。 21 | 一个中文文本从形式上看是由汉字(包括标点符号等)组成的一个字符串。由字可组成词,由词可组成词组,由词组可组成句子,进而由一些句子组成段、节、章、篇。无论在上述的各种层次:字(符)、词、词组、句子、段,……还是在下一层次向上一层次转变中都存在着歧义和多义现象,即形式上一样的一段字符串,在不同的场景或不同的语境下,可以理解成不同的词串、词组串等,并有不同的意义。一般情况下,它们中的大多数都是可以根据相应的语境和场景的规定而得到解决的。也就是说,从总体上说,并不存在歧义。这也就是我们平时并不感到自然语言歧义,和能用自然语言进行正确交流的原因。但是一方面,我们也看到,为了消解歧义,是需要极其大量的知识和进行推理的。如何将这些知识较完整地加以收集和整理出来;又如何找到合适的形式,将它们存入计算机系统中去;以及如何有效地利用它们来消除歧义,都是工作量极大且十分困难的工作。这不是少数人短时期内可以完成的,还有待长期的、系统的工作。 22 | 以上说的是,一个中文文本或一个汉字(含标点符号等)串可能有多个含义。它是自然语言理解中的主要困难和障碍。反过来,一个相同或相近的意义同样可以用多个中文文本或多个汉字串来表示。 23 | 因此,自然语言的形式(字符串)与其意义之间是一种多对多的关系。其实这也正是自然语言的魅力所在。但从计算机处理的角度看,我们必须消除歧义,而且有人认为它正是自然语言理解中的中心问题,即要把带有潜在歧义的自然语言输入转换成某种无歧义的计算机内部表示。 24 | 歧义现象的广泛存在使得消除它们需要大量的知识和推理,这就给基于语言学的方法、基于知识的方法带来了巨大的困难,因而以这些方法为主流的自然语言处理研究几十年来一方面在理论和方法方面取得了很多成就,但在能处理大规模真实文本的系统研制方面,成绩并不显著。研制的一些系统大多数是小规模的、研究性的演示系统。 25 | 目前存在的问题有两个方面:一方面,迄今为止的语法都限于分析一个孤立的句子,上下文关系和谈话环境对本句的约束和影响还缺乏系统的研究,因此分析歧义、词语省略、代词所指、同一句话在不同场合或由不同的人说出来所具有的不同含义等问题,尚无明确规律可循,需要加强语用学的研究才能逐步解决。另一方面,人理解一个句子不是单凭语法,还运用了大量的有关知识,包括生活知识和专门知识,这些知识无法全部贮存在计算机里。因此一个书面理解系统只能建立在有限的词汇、句型和特定的主题范围内;计算机的贮存量和运转速度大大提高之后,才有可能适当扩大范围. 
26 | 以上存在的问题成为自然语言理解在机器翻译应用中的主要难题,这也就是当今机器翻译系统的译文质量离理想目标仍相差甚远的原因之一;而译文质量是机译系统成败的关键。中国数学家、语言学家周海中教授曾在经典论文《机器翻译五十年》中指出:要提高机译的质量,首先要解决的是语言本身问题而不是程序设计问题;单靠若干程序来做机译系统,肯定是无法提高机译质量的;另外在人类尚未明了大脑是如何进行语言的模糊识别和逻辑判断的情况下,机译要想达到“信、达、雅”的程度是不可能的。 -------------------------------------------------------------------------------- /examples/new_word/test_new_word2.txt: -------------------------------------------------------------------------------- 1 | 台湾“中时电子报”26日报道称,蔡英文今日一早会见“世卫行动团”,她称,台湾虽然无法参加WHA(世界卫生大会),但“还是要有贡献”。于是,她表示要捐100万美元给WHO对抗埃博拉病毒。 2 | 3 |   对于台湾为何不能,蔡英文又一次惯性“甩锅”,宣称“中国对台湾的外交打压已无所不用其极”。 4 | 5 |   随后,蔡英文还在“世卫行动团”面前自诩,台湾是个“模范生”。对于台湾无法参会,蔡英文自有一套“道理”:世卫组织“不应该把台湾排除在WHO之外,台湾的健保在世界是有目共睹,然而WHO秘书长却因为政治因素,把台湾这个模范生排除在外”“但外在情势的艰难,只会成为台湾的动力,台湾用公卫医疗的成就,向世界证明,我们可以在世界扮演重要的角色”。 6 | 7 |   随后,蔡英文也不忘感谢在世卫大会上替台“发声”的几个“友邦”:“这次WHO,台湾得到‘邦交国’的支持和伸援,我们感谢这些国家支持,也感谢(台湾)‘卫福部’和相关部门的努力”。 8 | 9 |   不过环环想提醒一句,此次大会上,确实有多个台湾“友邦”受台当局邀请,向大会提案“邀请台湾作为观察员参加WHA”,然而结果是,立即被大会否决…… 10 | 11 |   中国国家卫生健康委员会主任马晓伟20日曾对此表示,2009年至2016年,台湾地区连续8年以“中华台北”名义和观察员身份参加了世界卫生大会。这是在两岸均坚持体现一个中国原则的“九二共识”基础上,通过两岸协商做出的特殊安排。由于民进党当局迄不承认体现一个中国原则的“九二共识”,破坏了台湾地区参加世界卫生大会的政治基础。今年台湾地区收不到参会邀请,责任完全在民进党当局。 12 | 13 |   而对于蔡英文此次颇费心机出此招数,希望再在WHA上刷一波存在感,岛内网友先看不下去了,有网友直指“别白忙活了,忙正事吧”,还有网友笑道“典型的狂躁症又发作了”,也有网友表示蔡英文这一次“又是花纳税人的钱,真是惨!” 14 | 台湾卫福部门负责人陈时中 15 | 16 | 一家日本媒体近日用日文刊发了陈时中所谓的“台湾要求参与世界卫生大会”一文。陈时中在文中称,2017年的第70届世界卫生大会未邀请台湾以观察员身份出席,对此,他公然妄言,世界卫生组织(WHO)不仅违背了其宪章,更忽视了所谓的“支持”台湾的声音。他还叫嚣,台湾将“专业、务实地”争取“参与”2018年第71届世界卫生大会,世界卫生组织及相关各方应承认台湾以观察员身份参与世界卫生大会。 17 | 18 | 对于陈时中在日本媒体告洋状的做法,众多台湾网友纷纷留言表示看不下去了。有网友痛批,“台当局官员投书日本媒体,把台湾人的脸都丢光了”、“尽干蠢事”、“殖民地三脚仔(三脚仔是台湾民众对“皇民化”的本岛人的称呼)。而对于台当局打算在会场外安排谈话的“蹭会方式”,有台湾网友也留言讽刺,“和主人谈了这么多,结果还是拿不到门票啊。” 19 | 20 | 国台办发言人:台湾无缘世卫大会责任在民进党当局 21 | 22 | 国台办发言人安峰山7日就世界卫生大会有关涉台问题接受记者采访时表示,导致台湾地区今年仍不能参加世界卫生大会的原因是民进党当局迄今拒不承认体现一个中国原则的“九二共识”,台湾地区参加世卫大会的政治基础不存在,不能参会的责任完全在民进党当局 23 | 就在美国总统特朗普在白宫签下退出伊核协议的那一刻起,中东地区就已经注定了会再次混乱起来,伊朗似乎早就料到了特朗普会退群,所以伊朗做好了充分的准备。今年5月中旬,伊朗从世界多个国家招募了近十万的兵力,准备开赴叙利亚,这些士兵大多来自利比亚,伊拉克和阿富汗等地,我们可以发现,这些国家都是曾被美国“欺负”过的国家,同美国有着深仇大恨,所以伊朗这十万大军从气势上来看就显得很足,看来伊朗这次是要来真的了。 24 | 25 | 26 | -------------------------------------------------------------------------------- /examples/new_word/test_new_word3.txt: -------------------------------------------------------------------------------- 1 | 知识图谱(Knowledge Graph),在图书情报界称为知识域可视化或知识领域映射地图,是显示知识发展进程与结构关系的一系列各种不同的图形,用可视化技术描述知识资源及其载体,挖掘、分析、构建、绘制和显示知识及它们之间的相互联系。 2 | 知识图谱是通过将应用数学、图形学、信息可视化技术、信息科学等学科的理论与方法与计量学引文分析、共现分析等方法结合,并利用可视化的图谱形象地展示学科的核心结构、发展历史、前沿领域以及整体知识架构达到多学科融合目的的现代理论。它能为学科研究提供切实的、有价值的参考。 3 | 具体来说,知识图谱是通过将应用数学、图形学、信息可视化技术、信息科学等学科的理论与方法与计量学引文分析、共现分析等方法结合,并利用可视化的图谱形象地展示学科的核心结构、发展历史、前沿领域以及整体知识架构达到多学科融合目的的现代理论。它把复杂的知识领域通过数据挖掘、信息处理、知识计量和图形绘制而显示出来,揭示知识领域的动态发展规律,为学科研究提供切实的、有价值的参考。迄今为止,其实际应用在发达国家已经逐步拓展并取得了较好的效果,但它在我国仍属研究的起步阶段。 -------------------------------------------------------------------------------- /examples/new_word/test_new_word4.txt: -------------------------------------------------------------------------------- 1 | 深度学习(DL, Deep Learning)是机器学习(ML, Machine Learning)领域中一个新的研究方向,它被引入机器学习使其更接近于最初的目标——人工智能(AI, Artificial Intelligence)。 2 | 深度学习是学习样本数据的内在规律和表示层次,这些学习过程中获得的信息对诸如文字,图像和声音等数据的解释有很大的帮助。它的最终目标是让机器能够像人一样具有分析学习能力,能够识别文字、图像和声音等数据。 深度学习是一个复杂的机器学习算法,在语音和图像识别方面取得的效果,远远超过先前相关技术。 3 | 深度学习在搜索技术,数据挖掘,机器学习,机器翻译,自然语言处理,多媒体学习,语音,推荐和个性化技术,以及其他相关领域都取得了很多成果。深度学习使机器模仿视听和思考等人类的活动,解决了很多复杂的模式识别难题,使得人工智能相关技术取得了很大进步。 -------------------------------------------------------------------------------- /lighttext/__init__.py: -------------------------------------------------------------------------------- 1 | from .component import * 2 | from 
.text_mining import * 3 | from .string_search import * 4 | -------------------------------------------------------------------------------- /lighttext/component/__init__.py: -------------------------------------------------------------------------------- 1 | from .article import Article 2 | from .character import Character 3 | from .paragraph import Paragraph 4 | from .sentence import Sentence 5 | from .vocabulary import Vocabulary 6 | from .word import Word 7 | __all__ = ['Article', 'Character', 'Paragraph', 'Sentence', 'Vocabulary', 'Word'] 8 | -------------------------------------------------------------------------------- /lighttext/component/article.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | from .sentence import Sentence 5 | from .paragraph import Paragraph 6 | 7 | para_spliter = re.compile('\n') 8 | 9 | 10 | class Article: 11 | def __init__(self, title, content): 12 | assert type(title) == str 13 | assert type(content) == str 14 | self.raw_title = title.strip() 15 | self.raw_content = content.strip() 16 | self.title = Sentence(self.raw_title) 17 | self.content = [Paragraph(x) for x in para_spliter.split(self.raw_content) if x] 18 | 19 | def __str__(self): 20 | return json.dumps({ 21 | 'title': self.raw_title, 22 | 'content': self.raw_content 23 | }, ensure_ascii=False, indent=1) 24 | -------------------------------------------------------------------------------- /lighttext/component/character.py: -------------------------------------------------------------------------------- 1 | class Character: 2 | def __init__(self, pattern): 3 | assert type(pattern) == str 4 | assert len(pattern) == 1 5 | self.pattern = pattern 6 | 7 | def __eq__(self, other): 8 | if type(other) == str: 9 | return self.pattern == other 10 | if type(other) == Character: 11 | assert type(other) == Character 12 | return self.pattern == other.pattern 13 | 14 | def __str__(self): 15 | return self.pattern 16 | 17 | -------------------------------------------------------------------------------- /lighttext/component/dictionary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from typing import List 3 | from collections import defaultdict, Counter 4 | 5 | 6 | class Dictionary: 7 | def __init__(self): 8 | self._dict = dict() 9 | 10 | def set(self, word: str, obj: object): 11 | self._dict[word] = obj 12 | 13 | def get(self, word: str): 14 | if word in self._dict: 15 | return self._dict[word] 16 | return None 17 | 18 | def __contains__(self, item): 19 | return item in self._dict 20 | 21 | def __str__(self): 22 | return str(self._dict) 23 | -------------------------------------------------------------------------------- /lighttext/component/paragraph.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .sentence import Sentence 4 | 5 | sen_spliter = re.compile('[!~。?;\s.?!;]') 6 | 7 | 8 | class Paragraph: 9 | def __init__(self, text): 10 | assert type(text) == str 11 | self.text = text 12 | 13 | def split(self): 14 | return [x for x in sen_spliter.split(self.text) if x] 15 | 16 | @property 17 | def sentences(self): 18 | return [Sentence(x) for x in self.split()] 19 | 20 | def __str__(self): 21 | return self.text 22 | -------------------------------------------------------------------------------- /lighttext/component/sentence.py: -------------------------------------------------------------------------------- 1 | import 
jieba 2 | 3 | from .word import Word 4 | 5 | 6 | class Sentence: 7 | def __init__(self, text): 8 | assert type(text) == str 9 | self.text = text 10 | 11 | def split(self): 12 | return list(jieba.cut(self.text)) 13 | 14 | @property 15 | def words(self): 16 | return [Word(x) for x in self.split()] 17 | 18 | def __str__(self): 19 | return self.text 20 | -------------------------------------------------------------------------------- /lighttext/component/vocabulary.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import List, Dict 3 | from collections import Counter 4 | from lightutils import logger 5 | 6 | 7 | class Vocabulary: 8 | """ 9 | 词汇表结构,做word和id之间的映射 10 | """ 11 | def __init__(self, padding: str = '', unknown: str = ''): 12 | self._word2idx: Dict = dict() 13 | self._idx2word: Dict = dict() 14 | self._word_count: Counter = Counter() 15 | self.padding = padding 16 | self.unknown = unknown 17 | self._init() 18 | 19 | def _init(self): 20 | self._word2idx = {self.padding: 0, self.unknown: 1} 21 | self._idx2word = {0: self.padding, 1: self.unknown} 22 | 23 | def build_from_corpora(self, corpora: List[List[str]]) -> None: 24 | """ 25 | 从corpora中构造词典 26 | Args: 27 | corpora: 分词后的语料 28 | 29 | Returns: 30 | 31 | """ 32 | self._word_count.update(itertools.chain(*corpora)) 33 | offset = len(self.special_words) 34 | self._word2idx.update({word: idx + offset for idx, word in enumerate(self._word_count)}) 35 | self._idx2word.update({idx + offset: word for idx, word in enumerate(self._word_count)}) 36 | 37 | def __len__(self) -> int: 38 | assert len(self._word2idx) == len(self._idx2word) 39 | return len(self._word2idx) 40 | 41 | def add_word(self, word: str): 42 | self._word_count[word] += 1 43 | idx = len(self) 44 | if word not in self._word2idx: 45 | self._word2idx[word] = idx 46 | if idx not in self._idx2word: 47 | self._idx2word[idx] = word 48 | 49 | def update(self, word_lst: List[str]): 50 | for word in word_lst: 51 | self.add_word(word) 52 | 53 | def to_word(self, idx: int): 54 | return self._idx2word[idx] 55 | 56 | def to_idx(self, word: str): 57 | word = word if word in self else self.unknown 58 | if word == self.unknown: 59 | logger.error('{} not in vocab'.format(word)) 60 | return self[word] 61 | 62 | def __contains__(self, item): 63 | return item in self._word2idx 64 | 65 | def has_word(self, word: str): 66 | return word in self 67 | 68 | def __getitem__(self, item): 69 | return self._word2idx[item] 70 | 71 | def __str__(self): 72 | return str(self._word2idx) 73 | 74 | @property 75 | def word2idx(self): 76 | return self._word2idx 77 | 78 | @property 79 | def idx2word(self): 80 | return self._idx2word 81 | 82 | @property 83 | def word_count(self): 84 | return self._word_count 85 | 86 | @property 87 | def special_words(self): 88 | return {self.padding: 0, self.unknown: 1} 89 | 90 | -------------------------------------------------------------------------------- /lighttext/component/word.py: -------------------------------------------------------------------------------- 1 | class Word: 2 | def __init__(self, pattern): 3 | assert type(pattern) == str 4 | assert len(pattern) >= 1 5 | self.pattern = pattern 6 | 7 | def __eq__(self, other): 8 | if type(other) == str: 9 | return self.pattern == other 10 | if type(other) == Word: 11 | assert type(other) == Word 12 | return self.pattern == other.pattern 13 | 14 | def __str__(self): 15 | return self.pattern 16 | 
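A minimal usage sketch for the Vocabulary component defined above (illustrative only — not a file in this repository; it assumes the package and its lightutils dependency are installed, so that `from lighttext import Vocabulary` resolves through the component package's `__all__`):

```python
from lighttext import Vocabulary

# tokenized corpora: one list of words per document
corpora = [["自然", "语言", "处理"], ["语言", "模型"]]

vocab = Vocabulary()
vocab.build_from_corpora(corpora)   # counts words and assigns ids after the two special tokens

print(len(vocab))            # 2 special tokens + 4 distinct words = 6
print(vocab.to_idx("语言"))   # id of a known word
print(vocab.to_word(2))      # maps an id back to its word
print(vocab.word_count)      # Counter with raw word frequencies
```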
-------------------------------------------------------------------------------- /lighttext/core/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /lighttext/core/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # copied from https://gitee.com/fastnlp/fastNLP/blob/master/fastNLP/core/utils.py line:233 4 | class Option(dict): 5 | """a dict can treat keys as attributes""" 6 | 7 | def __getattr__(self, item): 8 | try: 9 | return self.__getitem__(item) 10 | except KeyError: 11 | raise AttributeError(item) 12 | 13 | def __setattr__(self, key, value): 14 | if key.startswith('__') and key.endswith('__'): 15 | raise AttributeError(key) 16 | self.__setitem__(key, value) 17 | 18 | def __delattr__(self, item): 19 | try: 20 | self.pop(item) 21 | except KeyError: 22 | raise AttributeError(item) 23 | 24 | def __getstate__(self): 25 | return self 26 | 27 | def __setstate__(self, state): 28 | self.update(state) 29 | -------------------------------------------------------------------------------- /lighttext/data/ban.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smilelight/lightText/b015d0e3524722fb5a8ee5ea83b7fbbd7408f797/lighttext/data/ban.txt -------------------------------------------------------------------------------- /lighttext/data/post_filter_pos.txt: -------------------------------------------------------------------------------- 1 | a 2 | v 3 | c 4 | f 5 | d 6 | m 7 | r 8 | vd 9 | a 10 | u -------------------------------------------------------------------------------- /lighttext/data/pre_filter_pos.txt: -------------------------------------------------------------------------------- 1 | v 2 | p 3 | q 4 | c 5 | f 6 | d 7 | m 8 | r 9 | vd 10 | a 11 | u -------------------------------------------------------------------------------- /lighttext/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | 2 | ⇙ 3 | ▼ 4 | ! 5 | " 6 | # 7 | $ 8 | % 9 | & 10 | ' 11 | ( 12 | ) 13 | * 14 | + 15 | , 16 | - 17 | -- 18 | . 19 | .. 20 | ... 21 | ...... 22 | ................... 23 | ./ 24 | .一 25 | .数 26 | .日 27 | / 28 | // 29 | 0 30 | 1 31 | 2 32 | 3 33 | 4 34 | 5 35 | 6 36 | 7 37 | 8 38 | 9 39 | : 40 | :// 41 | :: 42 | ; 43 | < 44 | = 45 | > 46 | >> 47 | ? 48 | @ 49 | A 50 | Lex 51 | [ 52 | \ 53 | ] 54 | ^ 55 | _ 56 | ` 57 | exp 58 | sub 59 | sup 60 | | 61 | } 62 | ~ 63 | ~~~ 64 | · 65 | × 66 | ××× 67 | Δ 68 | Ψ 69 | γ 70 | μ 71 | φ 72 | φ. 73 | В 74 | — 75 | —— 76 | ——— 77 | ‘ 78 | ’ 79 | ’‘ 80 | “ 81 | ” 82 | ”, 83 | … 84 | …… 85 | …………………………………………………③ 86 | ′∈ 87 | ′| 88 | ℃ 89 | Ⅲ 90 | ↑ 91 | → 92 | ∈[ 93 | ∪φ∈ 94 | ≈ 95 | ① 96 | ② 97 | 的 98 | 我 99 | 你 100 | 了 101 | 呢 102 | ②c 103 | ③ 104 | ③] 105 | ④ 106 | ⑤ 107 | ⑥ 108 | ⑦ 109 | ⑧ 110 | ⑨ 111 | ⑩ 112 | ── 113 | ■ 114 | ▲ 115 |   116 | 、 117 | 。 118 | 〈 119 | 〉 120 | 《 121 | 》 122 | 》), 123 | 」 124 | 『 125 | 』 126 | 【 127 | 】 128 | 〔 129 | 〕 130 | 〕〔 131 | ㈧ 132 | ︿ 133 | ! 134 | # 135 | $ 136 | % 137 | & 138 | ' 139 | ( 140 | ) 141 | )÷(1- 142 | )、 143 | * 144 | + 145 | +ξ 146 | ++ 147 | , 148 | ,也 149 | - 150 | -β 151 | -- 152 | -[*]- 153 | . 154 | / 155 | 0 156 | 0:2 157 | 1 158 | 1. 
159 | 12% 160 | 2 161 | 2.3% 162 | 3 163 | 4 164 | 5 165 | 5:0 166 | 6 167 | 7 168 | 8 169 | 9 170 | : 171 | ; 172 | < 173 | <± 174 | <Δ 175 | <λ 176 | <φ 177 | << 178 | = 179 | =″ 180 | =☆ 181 | =( 182 | =- 183 | =[ 184 | ={ 185 | > 186 | >λ 187 | ? 188 | @ 189 | A 190 | LI 191 | R.L. 192 | ZXFITL 193 | [ 194 | [①①] 195 | [①②] 196 | [①③] 197 | [①④] 198 | [①⑤] 199 | [①⑥] 200 | [①⑦] 201 | [①⑧] 202 | [①⑨] 203 | [①A] 204 | [①B] 205 | [①C] 206 | [①D] 207 | [①E] 208 | [①] 209 | [①a] 210 | [①c] 211 | [①d] 212 | [①e] 213 | [①f] 214 | [①g] 215 | [①h] 216 | [①i] 217 | [①o] 218 | [② 219 | [②①] 220 | [②②] 221 | [②③] 222 | [②④ 223 | [②⑤] 224 | [②⑥] 225 | [②⑦] 226 | [②⑧] 227 | [②⑩] 228 | [②B] 229 | [②G] 230 | [②] 231 | [②a] 232 | [②b] 233 | [②c] 234 | [②d] 235 | [②e] 236 | [②f] 237 | [②g] 238 | [②h] 239 | [②i] 240 | [②j] 241 | [③①] 242 | [③⑩] 243 | [③F] 244 | [③] 245 | [③a] 246 | [③b] 247 | [③c] 248 | [③d] 249 | [③e] 250 | [③g] 251 | [③h] 252 | [④] 253 | [④a] 254 | [④b] 255 | [④c] 256 | [④d] 257 | [④e] 258 | [⑤] 259 | [⑤]] 260 | [⑤a] 261 | [⑤b] 262 | [⑤d] 263 | [⑤e] 264 | [⑤f] 265 | [⑥] 266 | [⑦] 267 | [⑧] 268 | [⑨] 269 | [⑩] 270 | [*] 271 | [- 272 | [] 273 | ] 274 | ]∧′=[ 275 | ][ 276 | _ 277 | a] 278 | b] 279 | c] 280 | e] 281 | f] 282 | ng昉 283 | { 284 | {- 285 | | 286 | } 287 | }> 288 | ~ 289 | ~± 290 | ~+ 291 | ¥ 292 | -------------------------------------------------------------------------------- /lighttext/doc_search/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /lighttext/doc_search/basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /lighttext/doc_search/bloom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class BloomFilter(object): 4 | 5 | def __init__(self, size: int): 6 | self.values = [False] * size 7 | self.size = size 8 | 9 | def hash_value(self, value): 10 | return hash(value) % self.size 11 | 12 | def add_value(self, value): 13 | h = self.hash_value(value) 14 | self.values[h] = True 15 | 16 | def might_contain(self, value): 17 | h = self.hash_value(value) 18 | return self.values[h] 19 | 20 | @property 21 | def contents(self): 22 | return self.values 23 | -------------------------------------------------------------------------------- /lighttext/doc_search/searcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from typing import List 3 | from collections import defaultdict 4 | 5 | from .bloom import BloomFilter 6 | from ..utils.tokenize import token_split 7 | 8 | 9 | class Searcher: 10 | def __init__(self): 11 | self.bf = BloomFilter(256) 12 | self.terms = defaultdict(set) 13 | self.events = [] 14 | 15 | def add_events(self, event: str): 16 | event_id = len(self.events) 17 | self.events.append(event) 18 | 19 | for term in token_split(event): 20 | self.bf.add_value(term) 21 | 22 | self.terms[term].add(event_id) 23 | 24 | def search(self, term: str): 25 | if not self.bf.might_contain(term): 26 | return 27 | 28 | if term not in self.terms: 29 | return 30 | 31 | for event_id in sorted(self.terms[term]): 32 | yield self.events[event_id] 33 | 34 | def search_all(self, terms: List[str]): 35 | results = set(range(len(self.events))) 36 | for 
term in terms: 37 | if not self.bf.might_contain(term): 38 | return 39 | if term not in self.terms: 40 | return 41 | 42 | results = results.intersection(self.terms[term]) 43 | 44 | for event_id in sorted(results): 45 | yield self.events[event_id] 46 | -------------------------------------------------------------------------------- /lighttext/inverted_index/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /lighttext/inverted_index/inv_idx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from typing import List 3 | from collections import defaultdict, Counter 4 | 5 | 6 | class InvertedIndex: 7 | def __init__(self): 8 | self._inverted_index = defaultdict(Counter) 9 | self._documents = dict() 10 | 11 | def build_from_corpus(self, corpora: List[List[str]]): 12 | for idx, corpus in enumerate(corpora): 13 | for word in corpus: 14 | self._inverted_index[word][idx] += 1 15 | 16 | -------------------------------------------------------------------------------- /lighttext/string_search/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2020/7/2 8:55 3 | # @Author : lightsmile 4 | # @Software: PyCharm 5 | 6 | from .keyword import KeywordProcessor 7 | from .ner import NER 8 | 9 | __all__ = ['KeywordProcessor', 'NER'] 10 | -------------------------------------------------------------------------------- /lighttext/string_search/keyword.py: -------------------------------------------------------------------------------- 1 | """ 2 | module(keyword) - 关键词提取,参考代码:https://github.com/vi3k6i5/flashtext. 3 | 4 | Main members: 5 | 6 | # KeywordProcessor - 关键词匹配工具类. 7 | """ 8 | __all__ = [ 9 | 'KeywordProcessor' 10 | ] 11 | 12 | import os 13 | import io 14 | 15 | 16 | class KeywordProcessor(object): 17 | """KeywordProcessor 18 | Attributes: 19 | _keyword (str): Used as key to store keywords in trie dictionary. 20 | Defaults to '_keyword_' 21 | non_word_boundaries (set(str)): Characters that will determine if the word is continuing. 22 | Defaults to set([A-Za-z0-9_]) 23 | keyword_trie_dict (dict): Trie dict built character by character, that is used for lookup 24 | Defaults to empty dictionary 25 | case_sensitive (boolean): if the search algorithm should be case sensitive or not. 26 | Defaults to False 27 | Note: 28 | * loosely based on `Aho-Corasick algorithm `_. 29 | * Idea came from this `Stack Overflow Question `_. 30 | """ 31 | 32 | def __init__(self, case_sensitive=False, white_space_chars=['\t', '\n', '\a', ' ', ','], non_word_boundaries=[]): 33 | """ 34 | Args: 35 | case_sensitive (boolean): Keyword search should be case sensitive set or not.
36 | Defaults to False 37 | """ 38 | self._keyword = '_keyword_' 39 | # self._white_space_chars = set(['.', '\t', '\n', '\a', ' ', ',']) 40 | self._white_space_chars = set(white_space_chars) 41 | try: 42 | # python 2.x 43 | self.non_word_boundaries = set(non_word_boundaries) 44 | # self.non_word_boundaries = set(string.digits + string.letters + '_') 45 | except AttributeError: 46 | # python 3.x 47 | self.non_word_boundaries = set(non_word_boundaries) 48 | # self.non_word_boundaries = set(string.digits + string.ascii_letters + '_') 49 | self.keyword_trie_dict = dict() 50 | self.case_sensitive = case_sensitive 51 | self._terms_in_trie = 0 52 | 53 | def __len__(self): 54 | """Number of terms present in the keyword_trie_dict 55 | Returns: 56 | length : int 57 | Count of number of distinct terms in trie dictionary. 58 | """ 59 | return self._terms_in_trie 60 | 61 | def __contains__(self, word): 62 | """To check if word is present in the keyword_trie_dict 63 | Args: 64 | word : string 65 | word that you want to check 66 | Returns: 67 | status : bool 68 | If word is present as it is in keyword_trie_dict then we return True, else False 69 | """ 70 | if not self.case_sensitive: 71 | word = word.lower() 72 | current_dict = self.keyword_trie_dict 73 | len_covered = 0 74 | for char in word: 75 | if char in current_dict: 76 | current_dict = current_dict[char] 77 | len_covered += 1 78 | else: 79 | break 80 | return self._keyword in current_dict and len_covered == len(word) 81 | 82 | def __getitem__(self, word): 83 | """if word is present in keyword_trie_dict return the clean name for it. 84 | Args: 85 | word : string 86 | word that you want to check 87 | Returns: 88 | keyword : string 89 | If word is present as it is in keyword_trie_dict then we return keyword mapped to it. 90 | """ 91 | if not self.case_sensitive: 92 | word = word.lower() 93 | current_dict = self.keyword_trie_dict 94 | len_covered = 0 95 | for char in word: 96 | if char in current_dict: 97 | current_dict = current_dict[char] 98 | len_covered += 1 99 | else: 100 | break 101 | if self._keyword in current_dict and len_covered == len(word): 102 | return current_dict[self._keyword] 103 | 104 | def __setitem__(self, keyword, clean_name=None): 105 | """To add keyword to the dictionary 106 | pass the keyword and the clean name it maps to. 107 | Args: 108 | keyword : string 109 | keyword that you want to identify 110 | clean_name : string 111 | clean term for that keyword that you would want to get back in return or replace 112 | if not provided, keyword will be used as the clean name also. 113 | """ 114 | status = False 115 | if not clean_name and keyword: 116 | clean_name = keyword 117 | 118 | if keyword and clean_name: 119 | if not self.case_sensitive: 120 | keyword = keyword.lower() 121 | current_dict = self.keyword_trie_dict 122 | for letter in keyword: 123 | current_dict = current_dict.setdefault(letter, {}) 124 | if self._keyword not in current_dict: 125 | status = True 126 | self._terms_in_trie += 1 127 | current_dict[self._keyword] = clean_name 128 | return status 129 | 130 | def __delitem__(self, keyword): 131 | """To remove keyword from the dictionary 132 | pass the keyword and the clean name it maps to. 
133 | Args: 134 | keyword : string 135 | keyword that you want to remove if it's present 136 | """ 137 | status = False 138 | if keyword: 139 | if not self.case_sensitive: 140 | keyword = keyword.lower() 141 | current_dict = self.keyword_trie_dict 142 | character_trie_list = [] 143 | for letter in keyword: 144 | if letter in current_dict: 145 | character_trie_list.append((letter, current_dict)) 146 | current_dict = current_dict[letter] 147 | else: 148 | # if character is not found, break out of the loop 149 | current_dict = None 150 | break 151 | # remove the characters from trie dict if there are no other keywords with them 152 | if current_dict and self._keyword in current_dict: 153 | # we found a complete match for input keyword. 154 | character_trie_list.append((self._keyword, current_dict)) 155 | character_trie_list.reverse() 156 | 157 | for key_to_remove, dict_pointer in character_trie_list: 158 | if len(dict_pointer.keys()) == 1: 159 | dict_pointer.pop(key_to_remove) 160 | else: 161 | # more than one key means more than 1 path. 162 | # Delete not required path and keep the other 163 | dict_pointer.pop(key_to_remove) 164 | break 165 | # successfully removed keyword 166 | status = True 167 | self._terms_in_trie -= 1 168 | return status 169 | 170 | def __iter__(self): 171 | """Disabled iteration as get_all_keywords() is the right way to iterate 172 | """ 173 | raise NotImplementedError("Please use get_all_keywords() instead") 174 | 175 | def set_non_word_boundaries(self, non_word_boundaries): 176 | """set of characters that will be considered as part of word. 177 | Args: 178 | non_word_boundaries (set(str)): 179 | Set of characters that will be considered as part of word. 180 | """ 181 | self.non_word_boundaries = non_word_boundaries 182 | 183 | def add_non_word_boundary(self, character): 184 | """add a character that will be considered as part of word. 185 | Args: 186 | character (char): 187 | Character that will be considered as part of word. 188 | """ 189 | self.non_word_boundaries.add(character) 190 | 191 | def add_keyword(self, keyword, clean_name=None): 192 | """To add one or more keywords to the dictionary 193 | pass the keyword and the clean name it maps to. 194 | Args: 195 | keyword : string 196 | keyword that you want to identify 197 | clean_name : string 198 | clean term for that keyword that you would want to get back in return or replace 199 | if not provided, keyword will be used as the clean name also. 200 | Returns: 201 | status : bool 202 | The return value. True for success, False otherwise. 203 | """ 204 | return self.__setitem__(keyword, clean_name) 205 | 206 | def remove_keyword(self, keyword): 207 | """To remove one or more keywords from the dictionary 208 | pass the keyword and the clean name it maps to. 209 | Args: 210 | keyword : string 211 | keyword that you want to remove if it's present 212 | Returns: 213 | status : bool 214 | The return value. True for success, False otherwise. 215 | """ 216 | return self.__delitem__(keyword) 217 | 218 | def get_keyword(self, word): 219 | """if word is present in keyword_trie_dict return the clean name for it. 220 | Args: 221 | word : string 222 | word that you want to check 223 | Returns: 224 | keyword : string 225 | If word is present as it is in keyword_trie_dict then we return keyword mapped to it. 
226 | """ 227 | return self.__getitem__(word) 228 | 229 | def add_keyword_from_file(self, keyword_file, encoding="utf-8"): 230 | """To add keywords from a file 231 | Args: 232 | keyword_file : path to keywords file 233 | encoding : specify the encoding of the file 234 | Raises: 235 | IOError: If `keyword_file` path is not valid 236 | """ 237 | if not os.path.isfile(keyword_file): 238 | raise IOError("Invalid file path {}".format(keyword_file)) 239 | with io.open(keyword_file, encoding=encoding) as f: 240 | for line in f: 241 | if '=>' in line: 242 | keyword, clean_name = line.split('=>') 243 | self.add_keyword(keyword, clean_name.strip()) 244 | else: 245 | keyword = line.strip() 246 | self.add_keyword(keyword) 247 | 248 | def add_keywords_from_dict(self, keyword_dict): 249 | """To add keywords from a dictionary 250 | Args: 251 | keyword_dict (dict): A dictionary with `str` key and (list `str`) as value 252 | Raises: 253 | AttributeError: If value for a key in `keyword_dict` is not a list. 254 | """ 255 | for clean_name, keywords in keyword_dict.items(): 256 | if not isinstance(keywords, list): 257 | raise AttributeError("Value of key {} should be a list".format(clean_name)) 258 | 259 | for keyword in keywords: 260 | self.add_keyword(keyword, clean_name) 261 | 262 | def remove_keywords_from_dict(self, keyword_dict): 263 | """To remove keywords from a dictionary 264 | Args: 265 | keyword_dict (dict): A dictionary with `str` key and (list `str`) as value 266 | Examples: 267 | >>> keyword_dict = { 268 | "java": ["java_2e", "java programing"], 269 | "product management": ["PM", "product manager"] 270 | } 271 | >>> keyword_processor.remove_keywords_from_dict(keyword_dict) 272 | Raises: 273 | AttributeError: If value for a key in `keyword_dict` is not a list. 274 | """ 275 | for clean_name, keywords in keyword_dict.items(): 276 | if not isinstance(keywords, list): 277 | raise AttributeError("Value of key {} should be a list".format(clean_name)) 278 | 279 | for keyword in keywords: 280 | self.remove_keyword(keyword) 281 | 282 | def add_keywords_from_list(self, keyword_list): 283 | """To add keywords from a list 284 | Args: 285 | keyword_list (list(str)): List of keywords to add 286 | Examples: 287 | >>> keyword_processor.add_keywords_from_list(["java", "python"]}) 288 | Raises: 289 | AttributeError: If `keyword_list` is not a list. 290 | """ 291 | if not isinstance(keyword_list, list): 292 | raise AttributeError("keyword_list should be a list") 293 | 294 | for keyword in keyword_list: 295 | self.add_keyword(keyword) 296 | 297 | def remove_keywords_from_list(self, keyword_list): 298 | """To remove keywords present in list 299 | Args: 300 | keyword_list (list(str)): List of keywords to remove 301 | Examples: 302 | >>> keyword_processor.remove_keywords_from_list(["java", "python"]}) 303 | Raises: 304 | AttributeError: If `keyword_list` is not a list. 305 | """ 306 | if not isinstance(keyword_list, list): 307 | raise AttributeError("keyword_list should be a list") 308 | for keyword in keyword_list: 309 | self.remove_keyword(keyword) 310 | 311 | def get_all_keywords(self, term_so_far='', current_dict=None): 312 | """Recursively builds a dictionary of keywords present in the dictionary 313 | And the clean name mapped to those keywords. 
314 | Args: 315 | term_so_far : string 316 | term built so far by adding all previous characters 317 | current_dict : dict 318 | current recursive position in dictionary 319 | Returns: 320 | terms_present : dict 321 | A map of key and value where each key is a term in the keyword_trie_dict. 322 | And value mapped to it is the clean name mapped to it. 323 | """ 324 | terms_present = {} 325 | if not term_so_far: 326 | term_so_far = '' 327 | if current_dict is None: 328 | current_dict = self.keyword_trie_dict 329 | for key in current_dict: 330 | if key == '_keyword_': 331 | terms_present[term_so_far] = current_dict[key] 332 | else: 333 | sub_values = self.get_all_keywords(term_so_far + key, current_dict[key]) 334 | for key in sub_values: 335 | terms_present[key] = sub_values[key] 336 | return terms_present 337 | 338 | def extract_keywords(self, sentence, span_info=False, max_cost=0): 339 | """Searches in the string for all keywords present in corpus. 340 | Keywords present are added to a list `keywords_extracted` and returned. 341 | Args: 342 | sentence (str): Line of text where we will search for keywords 343 | span_info (bool): True if you need to span the boundaries where the extraction has been performed 344 | max_cost (int): maximum levensthein distance to accept when extracting keywords 345 | Returns: 346 | keywords_extracted (list(str)): List of terms/keywords found in sentence that match our corpus 347 | """ 348 | keywords_extracted = [] 349 | if not sentence: 350 | # if sentence is empty or none just return empty list 351 | return keywords_extracted 352 | if not self.case_sensitive: 353 | sentence = sentence.lower() 354 | current_dict = self.keyword_trie_dict 355 | sequence_start_pos = 0 356 | sequence_end_pos = 0 357 | reset_current_dict = False 358 | idx = 0 359 | sentence_len = len(sentence) 360 | curr_cost = max_cost 361 | while idx < sentence_len: 362 | char = sentence[idx] 363 | # when we reach a character that might denote word end 364 | if char not in self.non_word_boundaries: 365 | 366 | # if end is present in current_dict 367 | if self._keyword in current_dict or char in current_dict: 368 | # update longest sequence found 369 | sequence_found = None 370 | longest_sequence_found = None 371 | is_longer_seq_found = False 372 | if self._keyword in current_dict: 373 | sequence_found = current_dict[self._keyword] 374 | longest_sequence_found = current_dict[self._keyword] 375 | sequence_end_pos = idx 376 | 377 | # re look for longest_sequence from this position 378 | if char in current_dict: 379 | current_dict_continued = current_dict[char] 380 | 381 | idy = idx + 1 382 | while idy < sentence_len: 383 | inner_char = sentence[idy] 384 | if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued: 385 | # update longest sequence found 386 | longest_sequence_found = current_dict_continued[self._keyword] 387 | sequence_end_pos = idy 388 | is_longer_seq_found = True 389 | if inner_char in current_dict_continued: 390 | current_dict_continued = current_dict_continued[inner_char] 391 | elif curr_cost > 0: 392 | next_word = self.get_next_word(sentence[idy:]) 393 | current_dict_continued, cost, _ = next( 394 | self.levensthein(next_word, max_cost=curr_cost, start_node=current_dict_continued), 395 | ({}, 0, 0), 396 | ) # current_dict_continued to empty dict by default, so next iteration goes to a `break` 397 | curr_cost -= cost 398 | idy += len(next_word) - 1 399 | if not current_dict_continued: 400 | break 401 | else: 402 | break 403 | idy += 1 404 | else: 405 
| # end of sentence reached. 406 | if self._keyword in current_dict_continued: 407 | # update longest sequence found 408 | longest_sequence_found = current_dict_continued[self._keyword] 409 | sequence_end_pos = idy 410 | is_longer_seq_found = True 411 | if is_longer_seq_found: 412 | # idx = sequence_end_pos 413 | idx = sequence_end_pos - 1 # fix: otherwise a keyword that immediately follows another keyword is not recognized 414 | current_dict = self.keyword_trie_dict 415 | if longest_sequence_found: 416 | # keywords_extracted.append((longest_sequence_found, sequence_start_pos, idx)) 417 | keywords_extracted.append((longest_sequence_found, sequence_start_pos, idx+1)) 418 | curr_cost = max_cost 419 | reset_current_dict = True 420 | else: 421 | # we reset current_dict 422 | current_dict = self.keyword_trie_dict 423 | reset_current_dict = True 424 | elif char in current_dict: 425 | # we can continue from this char 426 | current_dict = current_dict[char] 427 | elif curr_cost > 0: 428 | next_word = self.get_next_word(sentence[idx:]) 429 | current_dict, cost, _ = next( 430 | self.levensthein(next_word, max_cost=curr_cost, start_node=current_dict), 431 | (self.keyword_trie_dict, 0, 0) 432 | ) 433 | curr_cost -= cost 434 | idx += len(next_word) - 1 435 | else: 436 | # we reset current_dict 437 | current_dict = self.keyword_trie_dict 438 | reset_current_dict = True 439 | # skip to end of word 440 | idy = idx + 1 441 | while idy < sentence_len: 442 | char = sentence[idy] 443 | if char not in self.non_word_boundaries: 444 | break 445 | idy += 1 446 | idx = idy 447 | # if we are at the end of the sentence and have a sequence discovered 448 | if idx + 1 >= sentence_len: 449 | if self._keyword in current_dict: 450 | sequence_found = current_dict[self._keyword] 451 | keywords_extracted.append((sequence_found, sequence_start_pos, sentence_len)) 452 | idx += 1 453 | if reset_current_dict: 454 | reset_current_dict = False 455 | sequence_start_pos = idx 456 | if span_info: 457 | return keywords_extracted 458 | return [value[0] for value in keywords_extracted] 459 | 460 | def replace_keywords(self, sentence, max_cost=0): 461 | """Searches in the string for all keywords present in corpus. 462 | Keywords present are replaced by the clean name and a new string is returned. 463 | Args: 464 | sentence (str): Line of text where we will replace keywords max_cost (int): maximum levenshtein distance to accept when replacing keywords 465 | Returns: 466 | new_sentence (str): Line of text with replaced keywords 467 | """ 468 | if not sentence: 469 | # if sentence is empty or none just return the same.
470 | return sentence 471 | new_sentence = [] 472 | orig_sentence = sentence 473 | if not self.case_sensitive: 474 | sentence = sentence.lower() 475 | current_word = '' 476 | current_dict = self.keyword_trie_dict 477 | current_white_space = '' 478 | sequence_end_pos = 0 479 | idx = 0 480 | sentence_len = len(sentence) 481 | curr_cost = max_cost 482 | while idx < sentence_len: 483 | char = sentence[idx] 484 | # when we reach whitespace 485 | if char not in self.non_word_boundaries: 486 | current_word += orig_sentence[idx] 487 | current_white_space = char 488 | # if end is present in current_dict 489 | if self._keyword in current_dict or char in current_dict: 490 | # update longest sequence found 491 | sequence_found = None 492 | longest_sequence_found = None 493 | is_longer_seq_found = False 494 | if self._keyword in current_dict: 495 | sequence_found = current_dict[self._keyword] 496 | longest_sequence_found = current_dict[self._keyword] 497 | sequence_end_pos = idx 498 | 499 | # re look for longest_sequence from this position 500 | if char in current_dict: 501 | current_dict_continued = current_dict[char] 502 | current_word_continued = current_word 503 | idy = idx + 1 504 | while idy < sentence_len: 505 | inner_char = sentence[idy] 506 | if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued: 507 | current_word_continued += orig_sentence[idy] 508 | # update longest sequence found 509 | current_white_space = inner_char 510 | longest_sequence_found = current_dict_continued[self._keyword] 511 | sequence_end_pos = idy 512 | is_longer_seq_found = True 513 | if inner_char in current_dict_continued: 514 | current_word_continued += orig_sentence[idy] 515 | current_dict_continued = current_dict_continued[inner_char] 516 | elif curr_cost > 0: 517 | next_word = self.get_next_word(sentence[idy:]) 518 | current_dict_continued, cost, _ = next( 519 | self.levensthein(next_word, max_cost=curr_cost, start_node=current_dict_continued), 520 | ({}, 0, 0) 521 | ) 522 | idy += len(next_word) - 1 523 | curr_cost -= cost 524 | current_word_continued += next_word # just in case of a no match at the end 525 | if not current_dict_continued: 526 | break 527 | else: 528 | break 529 | idy += 1 530 | else: 531 | # end of sentence reached. 
532 | if self._keyword in current_dict_continued: 533 | # update longest sequence found 534 | current_white_space = '' 535 | longest_sequence_found = current_dict_continued[self._keyword] 536 | sequence_end_pos = idy 537 | is_longer_seq_found = True 538 | if is_longer_seq_found: 539 | # idx = sequence_end_pos 540 | idx = sequence_end_pos - 1 # 修复相邻关键词,后面的识别不出来的bug 541 | current_word = current_word_continued 542 | current_dict = self.keyword_trie_dict 543 | if longest_sequence_found: 544 | curr_cost = max_cost 545 | if is_longer_seq_found: # 修复相邻关键词,后面的识别不出来的bug 546 | new_sentence.append(longest_sequence_found) 547 | else: 548 | new_sentence.append(longest_sequence_found + current_white_space) 549 | current_word = '' 550 | current_white_space = '' 551 | else: 552 | new_sentence.append(current_word) 553 | current_word = '' 554 | current_white_space = '' 555 | else: 556 | # we reset current_dict 557 | current_dict = self.keyword_trie_dict 558 | new_sentence.append(current_word) 559 | current_word = '' 560 | current_white_space = '' 561 | elif char in current_dict: 562 | # we can continue from this char 563 | current_word += orig_sentence[idx] 564 | current_dict = current_dict[char] 565 | elif curr_cost > 0: 566 | next_orig_word = self.get_next_word(orig_sentence[idx:]) 567 | next_word = next_orig_word if self.case_sensitive else str.lower(next_orig_word) 568 | current_dict, cost, _ = next( 569 | self.levensthein(next_word, max_cost=curr_cost, start_node=current_dict), 570 | (self.keyword_trie_dict, 0, 0) 571 | ) 572 | idx += len(next_word) - 1 573 | curr_cost -= cost 574 | current_word += next_orig_word # just in case of a no match at the end 575 | else: 576 | current_word += orig_sentence[idx] 577 | # we reset current_dict 578 | current_dict = self.keyword_trie_dict 579 | # skip to end of word 580 | idy = idx + 1 581 | while idy < sentence_len: 582 | char = sentence[idy] 583 | current_word += orig_sentence[idy] 584 | if char not in self.non_word_boundaries: 585 | break 586 | idy += 1 587 | idx = idy 588 | new_sentence.append(current_word) 589 | current_word = '' 590 | current_white_space = '' 591 | # if we are end of sentence and have a sequence discovered 592 | if idx + 1 >= sentence_len: 593 | if self._keyword in current_dict: 594 | sequence_found = current_dict[self._keyword] 595 | new_sentence.append(sequence_found) 596 | else: 597 | new_sentence.append(current_word) 598 | idx += 1 599 | return "".join(new_sentence) 600 | 601 | def get_next_word(self, sentence): 602 | """ 603 | Retrieve the next word in the sequence 604 | Iterate in the string until finding the first char not in non_word_boundaries 605 | Args: 606 | sentence (str): Line of text where we will look for the next word 607 | Returns: 608 | next_word (str): The next word in the sentence 609 | """ 610 | next_word = str() 611 | for char in sentence: 612 | if char not in self.non_word_boundaries: 613 | break 614 | next_word += char 615 | return next_word 616 | 617 | def levensthein(self, word, max_cost=2, start_node=None): 618 | """ 619 | Retrieve the nodes where there is a fuzzy match, 620 | via levenshtein distance, and with respect to max_cost 621 | Args: 622 | word (str): word to find a fuzzy match for 623 | max_cost (int): maximum levenshtein distance when performing the fuzzy match 624 | start_node (dict): Trie node from which the search is performed 625 | Yields: 626 | node, cost, depth (tuple): A tuple containing the final node, 627 | the cost (i.e the distance), and the depth in the trie 628 | """ 629 | start_node = 
start_node or self.keyword_trie_dict 630 | rows = range(len(word) + 1) 631 | 632 | for char, node in start_node.items(): 633 | yield from self._levenshtein_rec(char, node, word, rows, max_cost, depth=1) 634 | 635 | def _levenshtein_rec(self, char, node, word, rows, max_cost, depth=0): 636 | n_columns = len(word) + 1 637 | new_rows = [rows[0] + 1] 638 | cost = 0 639 | 640 | for col in range(1, n_columns): 641 | insert_cost = new_rows[col - 1] + 1 642 | delete_cost = rows[col] + 1 643 | replace_cost = rows[col - 1] + int(word[col - 1] != char) 644 | cost = min((insert_cost, delete_cost, replace_cost)) 645 | new_rows.append(cost) 646 | 647 | stop_crit = isinstance(node, dict) and node.keys() & (self._white_space_chars | {self._keyword}) 648 | if new_rows[-1] <= max_cost and stop_crit: 649 | yield node, cost, depth 650 | 651 | elif isinstance(node, dict) and min(new_rows) <= max_cost: 652 | for new_char, new_node in node.items(): 653 | yield from self._levenshtein_rec(new_char, new_node, word, new_rows, max_cost, depth=depth + 1) 654 | -------------------------------------------------------------------------------- /lighttext/string_search/ner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import csv 4 | import pickle 5 | from collections import defaultdict 6 | 7 | from tqdm import tqdm 8 | from lightutils import logger, get_file_name, check_file 9 | 10 | from .keyword import KeywordProcessor 11 | 12 | 13 | def default_type(): 14 | return None 15 | 16 | 17 | class NER: 18 | def __init__(self): 19 | self._kp = KeywordProcessor() 20 | self._type_dict = defaultdict(default_type) 21 | 22 | def build_from_txt(self, file_path: str, type_name: str = None): 23 | check_file(file_path, 'txt') 24 | file_name = get_file_name(file_path) 25 | if not type_name: 26 | type_name = file_name 27 | file_data = open(file_path, encoding='utf8').read().split('\n') 28 | logger.info("正在从{}中导入词表,共计{}条数据".format(file_path, len(file_data))) 29 | self._kp.add_keywords_from_list(file_data) 30 | for word in tqdm(file_data): 31 | self._type_dict[word] = type_name 32 | 33 | def build_from_csv(self, file_path: str, column: int, type_name: str = None): 34 | check_file(file_path, 'csv') 35 | file_name = get_file_name(file_path) 36 | if not type_name: 37 | type_name = file_name 38 | file_data = [] 39 | with open(file_path, encoding='utf8') as file: 40 | csv_reader = csv.reader(file) 41 | headers = next(csv_reader) 42 | assert column < len(headers) 43 | logger.info("headers:{}".format(','.join(headers))) 44 | for line in csv_reader: 45 | file_data.append(line[column]) 46 | logger.info("正在从{}中导入词表,共计{}条数据".format(file_path, len(file_data))) 47 | self._kp.add_keywords_from_list(file_data) 48 | for word in tqdm(file_data): 49 | self._type_dict[word] = type_name 50 | 51 | def build_from_dir(self, file_dir, type_name: str = None): 52 | for file_path in os.listdir(file_dir): 53 | file_full_path = os.path.join(file_dir, file_path) 54 | file_name = get_file_name(file_full_path) 55 | if not type_name: 56 | type_name = file_name 57 | if file_path.endswith('csv'): 58 | file_data = [] 59 | with open(file_full_path, encoding='utf8') as file: 60 | csv_reader = csv.reader(file) 61 | headers = next(csv_reader) 62 | logger.info("headers:{}".format(','.join(headers))) 63 | for line in csv_reader: 64 | file_data.append(line[1]) 65 | else: # default txt format 66 | file_data = open(file_full_path, encoding='utf8').read().split('\n') 67 | 68 | 
logger.info("正在从{}中导入词表,共计{}条数据".format(file_path, len(file_data))) 69 | self._kp.add_keywords_from_list(file_data) 70 | for word in tqdm(file_data): 71 | self._type_dict[word] = type_name 72 | 73 | def save(self, save_path: str = 'ner.pt'): 74 | logger.info("将模型保存至{}中".format(save_path)) 75 | with open(save_path, 'wb') as file: 76 | pickle.dump(self._kp, file) 77 | pickle.dump(self._type_dict, file) 78 | logger.info("成功将模型保存至{}中".format(save_path)) 79 | 80 | def load(self, save_path: str = 'ner.pt'): 81 | logger.info("从{}中加载模型中".format(save_path)) 82 | with open(save_path, 'rb') as file: 83 | self._kp = pickle.load(file) 84 | self._type_dict = pickle.load(file) 85 | logger.info("成功从{}中加载模型".format(save_path)) 86 | 87 | def extract(self, sentence: str): 88 | keywords = self._kp.extract_keywords(sentence, span_info=True) 89 | # return keywords 90 | return [(x[0], self._type_dict[x[0]], x[1], x[2]) for x in keywords] 91 | -------------------------------------------------------------------------------- /lighttext/text_mining/__init__.py: -------------------------------------------------------------------------------- 1 | from .new_words_detection.detector import NewWordDetector 2 | __all__ = ['NewWordDetector'] 3 | -------------------------------------------------------------------------------- /lighttext/text_mining/new_words_detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smilelight/lightText/b015d0e3524722fb5a8ee5ea83b7fbbd7408f797/lighttext/text_mining/new_words_detection/__init__.py -------------------------------------------------------------------------------- /lighttext/text_mining/new_words_detection/detector.py: -------------------------------------------------------------------------------- 1 | import math 2 | import gc 3 | from collections import defaultdict 4 | 5 | from .utils import gen_ngram, get_stopwords, load_data, load_dictionary, return_zero, get_post_list, get_pre_list, \ 6 | get_ban_list 7 | 8 | DEFAULT_DICT = load_dictionary() 9 | DEFAULT_UNI_SUM = sum([x[0] for x in DEFAULT_DICT.values()]) 10 | DEFAULT_STOP_WORDS = get_stopwords() 11 | pre_list = get_pre_list() 12 | post_list = get_post_list() 13 | ban_list = get_ban_list() 14 | 15 | 16 | class WordNode: 17 | """ 18 | 字结点 19 | """ 20 | def __init__(self, word): 21 | self._word = word 22 | self._count = 0 23 | self.freq = 0 24 | self.pmi = 0 25 | self._left = defaultdict(return_zero) 26 | self._right = defaultdict(return_zero) 27 | self._child = {} 28 | self.left_entropy = 0 29 | self.right_entropy = 0 30 | 31 | def __getitem__(self, item): 32 | return self._child[item] 33 | 34 | def __contains__(self, item): 35 | return item in self._child 36 | 37 | def __setitem__(self, key, value): 38 | self._child[key] = value 39 | 40 | def values(self): 41 | return self._child.values() 42 | 43 | def items(self): 44 | return self._child.items() 45 | 46 | def add_count(self, num=1): 47 | self._count += num 48 | 49 | def get_count(self): 50 | return self._count 51 | 52 | def get_freq(self): 53 | return self.freq 54 | 55 | def update_freq(self, words_sum): 56 | self.freq = self._count / words_sum 57 | 58 | def add_left_nbr(self, word): 59 | self._left[word] += 1 60 | 61 | def add_right_nbr(self, word): 62 | self._right[word] += 1 63 | 64 | def update_pmi(self, pmi): 65 | self.pmi = pmi 66 | 67 | def compute_entropy(self): 68 | left_length = sum(self._left.values()) 69 | self.left_entropy = sum(map(lambda x: -x/left_length * math.log(x/left_length, 
2), self._left.values())) 70 | right_length = sum(self._right.values()) 71 | self.right_entropy = sum(map(lambda x: -x / right_length * math.log(x / right_length, 2), self._right.values())) 72 | 73 | def __str__(self): 74 | return self._word 75 | 76 | 77 | class NewWordDetector: 78 | """ 79 | 字字典 80 | """ 81 | def __init__(self, dic_file=None, accumulate=False): 82 | """ 83 | 新词发现检测程序 84 | :param dic_file: 自定义词典文件,每行格式如:`知无不谈 3 i`,分别表示:词语,频次,词性(这里是最常用词性) 85 | :param accumulate: 是否开启累积模式,默认为False,即每次调用`load_file`函数都会对模型进行初始化 86 | """ 87 | self.root = {} # interesting! 88 | self._pmi = False 89 | self._entropy = False 90 | self._dictionary = None 91 | self._uni_sum = 0 92 | self._accumulate = accumulate 93 | if dic_file: 94 | self._dictionary = load_dictionary(dic_file) 95 | self._uni_sum = sum([x[0] for x in self._dictionary.values()]) 96 | else: 97 | self._dictionary = DEFAULT_DICT 98 | self._uni_sum = DEFAULT_UNI_SUM 99 | 100 | def add(self, word): 101 | word_node = self.get_node(word) 102 | word_node.add_count() 103 | if len(word) > 1: 104 | left_sub_node = self.get_node(word[:-1]) 105 | left_sub_node.add_right_nbr(word[-1]) 106 | right_sub_node = self.get_node(word[1:]) 107 | right_sub_node.add_left_nbr(word[0]) 108 | 109 | def add_node(self, word): 110 | node = self.root 111 | for char in word: 112 | if char not in node: 113 | new_node = WordNode(char) 114 | node[char] = new_node 115 | node = new_node 116 | else: 117 | node = node[char] 118 | return node 119 | 120 | def search_node(self, word): 121 | node = self.root 122 | for char in word: 123 | if char in node: 124 | node = node[char] 125 | else: 126 | return None 127 | return node 128 | 129 | def get_node(self, word): 130 | node = self.search_node(word) 131 | if node: 132 | return node 133 | else: 134 | return self.add_node(word) 135 | 136 | def build(self, data): 137 | stop_words = DEFAULT_STOP_WORDS 138 | if not self._accumulate: 139 | self.clear() 140 | for word_list in data: 141 | ngrams = gen_ngram(word_list, 3) 142 | for d in ngrams: 143 | if set(d) & stop_words: 144 | continue 145 | else: 146 | self.add(d) 147 | 148 | def clear(self): 149 | self.root = {} 150 | self._pmi = False 151 | self._entropy = False 152 | gc.collect() 153 | 154 | def load_file(self, file_name): 155 | data = load_data(file_name) 156 | self.build(data) 157 | 158 | def update_freq(self): 159 | uni_sum, bi_sum = self.count_sum() 160 | for word, child in self.root.items(): 161 | if word in self._dictionary: 162 | child.add_count(self._dictionary[word][0]) 163 | child.update_freq(uni_sum) 164 | for child in self.root.values(): 165 | for descendant in child.values(): 166 | descendant.update_freq(bi_sum) 167 | 168 | def count_sum(self): 169 | uni_sum = 0 170 | bi_sum = 0 171 | for word in self.root.values(): 172 | count = word.get_count() 173 | uni_sum += count 174 | for child in self.root.values(): 175 | for descendant in child.values(): 176 | bi_sum += descendant.get_count() 177 | return uni_sum + self._uni_sum, bi_sum 178 | 179 | def count_pmi(self): 180 | self.update_freq() 181 | for child in self.root.values(): 182 | for word, descendant in child.items(): 183 | descendant.update_pmi( 184 | math.log(descendant.get_freq(), 2) - 185 | math.log(child.get_freq(), 2) - 186 | math.log(self.root[word].get_freq(), 2) 187 | ) 188 | self._pmi = True 189 | 190 | def compute_entropy(self): 191 | for child in self.root.values(): 192 | for descendant in child.values(): 193 | descendant.compute_entropy() 194 | self._entropy = True 195 | 196 | def get_top_k(self, k=5, 
debug=False, threshold=0.1): 197 | if not self._pmi: 198 | self.count_pmi() 199 | if not self._entropy: 200 | self.compute_entropy() 201 | result = {} 202 | for ch_word, child in self.root.items(): 203 | for des_word, descendant in child.items(): 204 | pmi = descendant.pmi 205 | freq = descendant.freq 206 | left_entropy = descendant.left_entropy 207 | right_entropy = descendant.right_entropy 208 | result[ch_word + '_' + des_word] = { 209 | 'pmi': descendant.pmi, 210 | 'freq': descendant.freq, 211 | 'left_entropy': descendant.left_entropy, 212 | 'right_entropy': descendant.right_entropy, 213 | 'score': (pmi + min(left_entropy, right_entropy)) * freq 214 | } 215 | result = {word: info for word, info in result.items() if self.filter_word(word)} 216 | result = sorted(result.items(), key=lambda x: x[1]['score'], reverse=True) 217 | if threshold: 218 | result = [x for x in result if x[1]['score'] >= threshold] 219 | if debug: 220 | return result[:k] 221 | return [(x[0], x[1]['score']) for x in result][:k] 222 | 223 | def filter_word(self, word): 224 | words = word.split('_') 225 | 226 | if self._dictionary: 227 | if words[0] in self._dictionary and self._dictionary[words[0]][1] in \ 228 | pre_list: 229 | return False 230 | if words[-1] in self._dictionary and self._dictionary[words[-1]][1] in post_list: 231 | return False 232 | if words[0] not in self._dictionary and len(words[0]) > 2: # 禁止造词,和jieba分出词典外的词 233 | return False 234 | if words[1] not in self._dictionary and len(words[1]) > 2: # 禁止造词,和jieba分出词典外的词 235 | return False 236 | for word in words: 237 | if word in ban_list: 238 | return False 239 | return True 240 | -------------------------------------------------------------------------------- /lighttext/text_mining/new_words_detection/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import jieba 5 | 6 | # 词性规范详见:https://github.com/NLPchina/ansj_seg/wiki/%E8%AF%8D%E6%80%A7%E6%A0%87%E6%B3%A8%E8%A7%84%E8%8C%83 7 | STOP_WORDS_FILE = os.path.dirname(__file__) + '/../../data/stopwords.txt' 8 | DICT_FILE = os.path.dirname(__file__) + '/../../data/dict.txt' 9 | PRE_FILTER_POS = os.path.dirname(__file__) + '/../../data/pre_filter_pos.txt' 10 | POST_FILTER_POS = os.path.dirname(__file__) + '/../../data/post_filter_pos.txt' 11 | BAN_FILE = os.path.dirname(__file__) + '/../../data/ban.txt' 12 | spliter = re.compile('[\s]+') 13 | 14 | 15 | def return_zero(): 16 | return 0 17 | 18 | 19 | def get_stopwords(): 20 | with open(STOP_WORDS_FILE, 'r', encoding='utf-8') as f: 21 | stopwords = [line.strip() for line in f] 22 | return set(stopwords) 23 | 24 | 25 | def gen_ngram(input_list, n): 26 | result = [] 27 | for i in range(1, n + 1): 28 | result.extend(zip(*[input_list[j:] for j in range(i)])) 29 | return result 30 | 31 | 32 | def load_dictionary(filename=DICT_FILE): 33 | """ 34 | 加载外部词频记录 35 | :param filename: 36 | :return: 37 | """ 38 | word_freq = {} 39 | with open(filename, encoding='utf-8') as f: 40 | for line in f: 41 | try: 42 | line_list = spliter.split(line.strip()) 43 | # 规定最少词频 44 | if int(line_list[1]) > 2: 45 | word_freq[line_list[0]] = (int(line_list[1]), line_list[2]) # 词频,词性 46 | except IndexError as e: 47 | print(line) 48 | continue 49 | return word_freq 50 | 51 | 52 | def load_data(filename): 53 | """ 54 | 55 | :param filename: 56 | :return: 二维数组,[[句子1分词list], [句子2分词list],...,[句子n分词list]] 57 | """ 58 | data = [] 59 | with open(filename, encoding='utf-8') as f: 60 | for line in f: 61 | word_list = [x for x in 
jieba.cut(line.strip(), cut_all=False)] 62 | data.append(word_list) 63 | return data 64 | 65 | 66 | def get_pre_list(): 67 | pre_list = [] 68 | with open(PRE_FILTER_POS, encoding='utf-8') as f: 69 | pre_list.extend([x.strip() for x in f.readlines()]) 70 | return pre_list 71 | 72 | 73 | def get_post_list(): 74 | post_list = [] 75 | with open(POST_FILTER_POS, encoding='utf-8') as f: 76 | post_list.extend([x.strip() for x in f.readlines()]) 77 | return post_list 78 | 79 | 80 | def get_ban_list(): 81 | ban_list = [' '] 82 | with open(BAN_FILE, encoding='utf-8') as f: 83 | ban_list.extend([x.strip() for x in f.readlines()]) 84 | return ban_list 85 | 86 | 87 | if __name__ == '__main__': 88 | print(get_stopwords()) 89 | data = ['它', '是', '小', '狗'] 90 | print(gen_ngram(data, 4)) 91 | print(spliter.split('与 2510 p')) 92 | -------------------------------------------------------------------------------- /lighttext/text_similarity/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /lighttext/text_similarity/sim.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from typing import List, Callable, Union 3 | from enum import Enum 4 | import pickle 5 | 6 | import jieba.posseg as pseg 7 | from gensim import corpora, models, similarities 8 | 9 | from ..utils.sen_split import split_sentence 10 | 11 | NUM_TOPICS = 300 12 | NUM_FEATURES = 300 13 | MODEL_PATH = 'saves/model' 14 | DIC_PATH = 'saves/dic' 15 | 16 | 17 | class ModelType(Enum): 18 | TFIDF = 1 19 | LDA = 2 20 | LSI = 3 21 | 22 | 23 | type_dict = { 24 | "tfidf": ModelType.TFIDF, 25 | "lda": ModelType.LDA, 26 | "lsi": ModelType.LSI 27 | } 28 | 29 | 30 | def tokenize(text: str) -> List[str]: 31 | # {标点符号、连词、助词、副词、介词、时语素、‘的’、数词、方位词、代词} 32 | # {'x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r'} 33 | stop_flags = {'x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r'} 34 | stop_words = {'nbsp', '\u3000', '\xa0'} 35 | words = pseg.cut(text) 36 | return [word for word, flag in words if flag not in stop_flags and word not in stop_words] 37 | 38 | 39 | class SimModel: 40 | def __init__(self, model_type: str = 'lda', token_fn: Callable = tokenize, split_fn: Callable = split_sentence): 41 | if model_type not in type_dict: 42 | raise Exception('undefined model_type: {}, must be one of {}'.format(model_type, list(type_dict.keys()))) 43 | self._model_type = model_type 44 | self._token_fn = token_fn 45 | self._split_fn = split_fn 46 | self._dic = corpora.Dictionary() 47 | self._models = { 48 | 'tfidf': None, 49 | 'lsi': None, 50 | 'lda': None 51 | } 52 | self._mtx: similarities.MatrixSimilarity = None 53 | self._documents: List[str] = [] 54 | 55 | @property 56 | def dic(self): 57 | return self._dic 58 | 59 | @property 60 | def token_fn(self): 61 | return self._token_fn 62 | 63 | @property 64 | def split_fn(self): 65 | return self._split_fn 66 | 67 | def add_vocab_from_documents(self, documents: List[str]): 68 | """ 69 | 构造dic词典 70 | Args: 71 | documents: 原始文本列表 72 | 73 | Returns: 74 | 75 | """ 76 | texts = [self._token_fn(doc) for doc in documents] 77 | self._dic.add_documents(texts) 78 | 79 | def tokenize_texts(self, texts: List[str]): 80 | """ 81 | 对文本列表进行分词处理 82 | Args: 83 | texts: 文本列表 84 | 85 | Returns: 86 | 87 | """ 88 | return [self._token_fn(doc) for doc in texts] 89 | 90 | def build_vocab(self, documents: List[str]): 91 | """ 92 | 从文本中构造dic词表 
93 | Args: 94 | documents: 构造dic词典 95 | 96 | Returns: 97 | 98 | """ 99 | self.add_vocab_from_documents(documents) 100 | 101 | def get_corpus(self, texts: List[str]): 102 | """ 103 | 得到文本列表对应的倒排索引表 104 | Args: 105 | texts: 原始文本列表 106 | 107 | Returns: 108 | 109 | """ 110 | texts = self.tokenize_texts(texts) 111 | return [self._dic.doc2bow(text) for text in texts] 112 | 113 | def _init_tfidf_model(self, corpus): 114 | self._models["tfidf"] = models.TfidfModel(corpus, id2word=self._dic) 115 | 116 | def _init_lsi_model(self, corpus): 117 | self._models["lsi"] = models.LsiModel(corpus, id2word=self._dic, num_topics=len(self._dic.token2id)) 118 | 119 | def _init_lda_model(self, corpus): 120 | self._models['lda'] = models.LdaModel(corpus, id2word=self._dic, num_topics=len(self._dic.token2id)) 121 | 122 | def _init_model(self, corpus): 123 | if self._model_type == 'tfidf': 124 | self._init_tfidf_model(corpus) 125 | elif self._model_type == 'lsi': 126 | self._init_lsi_model(corpus) 127 | else: 128 | self._init_lda_model(corpus) 129 | 130 | def build_model(self, corpus): 131 | """ 132 | 创建模型 133 | Args: 134 | corpus: 135 | 136 | Returns: 137 | 138 | """ 139 | self._init_model(corpus) 140 | 141 | def _check_model(self): 142 | model = self._models[self._model_type] 143 | if not model: 144 | raise Exception('model must be initialized') 145 | 146 | def build_from_documents(self, documents: List[str]): 147 | """ 148 | 从文本列表中构造词表并构造模型 149 | Args: 150 | documents: 文本列表 151 | 152 | Returns: 153 | 154 | """ 155 | self.build_vocab(documents) 156 | self.build_model(self.get_corpus(documents)) 157 | 158 | def build_from_txt(self, path: str): 159 | """ 160 | 从txt文本中构造词表并构造模型 161 | Args: 162 | path: 文本路径 163 | 164 | Returns: 165 | 166 | """ 167 | documents = [] 168 | with open(path, encoding='utf8') as f: 169 | for line in f: 170 | documents.extend(self._split_fn(line)) 171 | self.build_from_documents(documents) 172 | 173 | def process(self, text: str, documents: List[str]): 174 | vec = self._dic.doc2bow(self._token_fn(text)) 175 | model = self._models[self._model_type] 176 | if not model: 177 | raise Exception('model must be initialized') 178 | corpus = self.get_corpus(documents) 179 | mtx = similarities.MatrixSimilarity(model[corpus], num_features=len(self._dic.token2id)) 180 | scores = mtx[model[vec]] 181 | return sorted(zip(scores, documents), key=lambda x: x[0], reverse=True) 182 | 183 | def set_documents(self, documents: List[str]): 184 | """ 185 | 设置待检索的文档 186 | Args: 187 | documents: 188 | 189 | Returns: 190 | 191 | """ 192 | self._documents = documents 193 | corpus = self.get_corpus(self._documents) 194 | model = self._models[self._model_type] 195 | if not model: 196 | raise Exception('model must be initialized') 197 | self._mtx = similarities.MatrixSimilarity(model[corpus], num_features=len(self._dic.token2id)) 198 | 199 | def search(self, text: str): 200 | """ 201 | 根据text检索最相关的文档片段 202 | Args: 203 | text: 待检索的文本 204 | 205 | Returns: 文档结果及分数 206 | 207 | """ 208 | vec = self._dic.doc2bow(self._token_fn(text)) 209 | model = self._models[self._model_type] 210 | if not model: 211 | raise Exception('model must be initialized') 212 | scores = self._mtx[model[vec]] 213 | return sorted(zip(scores, self._documents), key=lambda x: x[0], reverse=True) 214 | 215 | def save(self, path: str): 216 | with open(path, 'wb') as f: 217 | pickle.dump(self, f) 218 | 219 | @classmethod 220 | def load(cls, path: str): 221 | with open(path, 'rb') as f: 222 | model = pickle.load(f) 223 | return model 224 | 
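A minimal usage sketch of the SimModel API defined above, assuming the package is importable as lighttext; the sentences are the same illustrative ones used in test/test_sim_model.py further down, not part of the library:

    from lighttext.text_similarity.sim import SimModel

    docs = [
        '乔布斯极力推崇自己家的苹果手机',
        '这苹果又大又圆又甜,还便宜',
        '这年头,谁还用安卓手机,要么是苹果,要么是鸿蒙',
    ]
    model = SimModel(model_type='lsi')  # 'tfidf', 'lda' and 'lsi' are the accepted model types
    model.build_from_documents(docs)    # builds the dictionary and the underlying gensim model
    model.set_documents(docs)           # pre-computes the similarity matrix used by search()
    print(model.search('苹果手机还是小米手机呢?'))  # [(score, document), ...] sorted best-first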
-------------------------------------------------------------------------------- /lighttext/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smilelight/lightText/b015d0e3524722fb5a8ee5ea83b7fbbd7408f797/lighttext/utils/__init__.py -------------------------------------------------------------------------------- /lighttext/utils/sen_split.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | split_re = re.compile('([﹒﹔﹖﹗.;。!?]["’”」』]{0,2}|:(?=["‘“「『]{1,2}|$))') 5 | 6 | 7 | def split_sentence(sentence: str): 8 | lst = [] 9 | for i in split_re.split(sentence): 10 | if split_re.match(i) and lst: 11 | lst[-1] += i 12 | elif i: 13 | lst.append(i) 14 | return lst 15 | -------------------------------------------------------------------------------- /lighttext/utils/tokenize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from typing import List 3 | import jieba 4 | 5 | 6 | def token_split(text: str) -> List[str]: 7 | return list(jieba.cut(text)) 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lightUtils>=0.1.10.0 2 | tqdm>=4.28.1 3 | jieba>=0.39 4 | fuzzywuzzy 5 | python-Levenshtein 6 | gensim 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | import setuptools 3 | 4 | with open('./version.txt', 'r', encoding='utf8') as f: 5 | version = f.read() 6 | 7 | with open('./README.md', 'r', encoding='utf8') as f: 8 | long_description = f.read() 9 | 10 | with open('./requirements.txt', 'r', encoding='utf8') as f: 11 | install_requires = list(map(lambda x: x.strip(), f.readlines())) 12 | 13 | setup( 14 | name='lightText', 15 | version=version, 16 | description="lightsmile's text library", 17 | author='lightsmile', 18 | author_email='iamlightsmile@gmail.com', 19 | url='https://github.com/smilelight/lightText', 20 | packages=setuptools.find_packages(), 21 | install_requires=install_requires, 22 | long_description=long_description, 23 | include_package_data=True, 24 | long_description_content_type='text/markdown', 25 | license='Apache-2.0', 26 | classifiers=[ 27 | 'Development Status :: 4 - Beta', 28 | 'Operating System :: OS Independent', 29 | 'Intended Audience :: Developers', 30 | 'Programming Language :: Python', 31 | 'Programming Language :: Python :: 3', 32 | 'Programming Language :: Python :: 3.4', 33 | 'Programming Language :: Python :: 3.5', 34 | 'Programming Language :: Python :: 3.6', 35 | 'Programming Language :: Python :: 3.7', 36 | 'Topic :: Software Development :: Libraries' 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /test/doc.txt: -------------------------------------------------------------------------------- 1 | 语言是人类区别其他动物的本质特性。在所有生物中,只有人类才具有语言能力。人类的多种智能都与语言有着密切的关系。人类的逻辑思维以语言为形式,人类的绝大部分知识也是以语言文字的形式记载和流传下来的。因而,它也是人工智能的一个重要,甚至核心部分。 2 | 用自然语言与计算机进行通信,这是人们长期以来所追求的。因为它既有明显的实际意义,同时也有重要的理论意义:人们可以用自己最习惯的语言来使用计算机,而无需再花大量的时间和精力去学习不很自然和习惯的各种计算机语言;人们也可通过它进一步了解人类的语言能力和智能的机制。 3 | 
自然语言处理是指利用人类交流所使用的自然语言与机器进行交互通讯的技术。通过人为的对自然语言的处理,使得计算机对其能够可读并理解。自然语言处理的相关研究始于人类对机器翻译的探索。虽然自然语言处理涉及语音、语法、语义、语用等多维度的操作,但简单而言,自然语言处理的基本任务是基于本体词典、词频统计、上下文语义分析等方式对待处理语料进行分词,形成以最小词性为单位,且富含语义的词项单元。 [3] 4 | 自然语言处理( Natural Language Processing, NLP)以语言为对象,利用计算机技术来分析、理解和处理自然语言的一门学科,即把计算机作为语言研究的强大工具,在计算机的支持下对语言信息进行定量化的研究,并提供可供人与计算机之间能共同使用的语言描写。包括自然语言理解( NaturalLanguage Understanding, NLU)和自然语言生成( Natural LanguageGeneration, NLG)两部分。它是典型边缘交叉学科,涉及到语言科学、计算机科学、数学、认知学、逻辑学等,关注计算机和人类(自然)语言之间的相互作用的领域。人们把用计算机处理自然语言的过程在不同时期或侧重点不同时又称为自然语言理解( Natural Language Understanding, NLU)、人类语言技术( Human Language Technology, HLT)、计算语言学Hl(Computational Linguistics)、计量语言学( QuantitativeLinguistics)、数理语言学( Mathematical Linguistics) [1] 。 5 | 实现人机间自然语言通信意味着要使计算机既能理解自然语言文本的意义,也能以自然语言文本来表达给定的意图、思想等。前者称为自然语言理解,后者称为自然语言生成。因此,自然语言处理大体包括了自然语言理解和自然语言生成两个部分。历史上对自然语言理解研究得较多,而对自然语言生成研究得较少。但这种状况已有所改变。 6 | 无论实现自然语言理解,还是自然语言生成,都远不如人们原来想象的那么简单,而是十分困难的。从现有的理论和技术现状看,通用的、高质量的自然语言处理系统,仍然是较长期的努力目标,但是针对一定应用,具有相当自然语言处理能力的实用系统已经出现,有些已商品化,甚至开始产业化。典型的例子有:多语种数据库和专家系统的自然语言接口、各种机器翻译系统、全文信息检索系统、自动文摘系统等。 7 | 自然语言处理,即实现人机间自然语言通信,或实现自然语言理解和自然语言生成是十分困难的。造成困难的根本原因是自然语言文本和对话的各个层次上广泛存在的各种各样的歧义性或多义性(ambiguity)。 8 | 自然语言的形式(字符串)与其意义之间是一种多对多的关系。其实这也正是自然语言的魅力所在。但从计算机处理的角度看,我们必须消除歧义,而且有人认为它正是自然语言理解中的中心问题,即要把带有潜在歧义的自然语言输入转换成某种无歧义的计算机内部表示。 9 | 歧义现象的广泛存在使得消除它们需要大量的知识和推理,这就给基于语言学的方法、基于知识的方法带来了巨大的困难,因而以这些方法为主流的自然语言处理研究几十年来一方面在理论和方法方面取得了很多成就,但在能处理大规模真实文本的系统研制方面,成绩并不显著。研制的一些系统大多数是小规模的、研究性的演示系统。 10 | 目前存在的问题有两个方面:一方面,迄今为止的语法都限于分析一个孤立的句子,上下文关系和谈话环境对本句的约束和影响还缺乏系统的研究,因此分析歧义、词语省略、代词所指、同一句话在不同场合或由不同的人说出来所具有的不同含义等问题,尚无明确规律可循,需要加强语用学的研究才能逐步解决。另一方面,人理解一个句子不是单凭语法,还运用了大量的有关知识,包括生活知识和专门知识,这些知识无法全部贮存在计算机里。因此一个书面理解系统只能建立在有限的词汇、句型和特定的主题范围内;计算机的贮存量和运转速度大大提高之后,才有可能适当扩大范围. 11 | 以上存在的问题成为自然语言理解在机器翻译应用中的主要难题,这也就是当今机器翻译系统的译文质量离理想目标仍相差甚远的原因之一;而译文质量是机译系统成败的关键。中国数学家、语言学家周海中教授曾在经典论文《机器翻译五十年》中指出:要提高机译的质量,首先要解决的是语言本身问题而不是程序设计问题;单靠若干程序来做机译系统,肯定是无法提高机译质量的;另外在人类尚未明了大脑是如何进行语言的模糊识别和逻辑判断的情况下,机译要想达到“信、达、雅”的程度是不可能的。 12 | 乔布斯极力推崇自己家的苹果手机 13 | 这苹果又大又圆又甜,还便宜 14 | 这年头,谁还用安卓手机,要么是苹果,要么是鸿蒙 -------------------------------------------------------------------------------- /test/doc_search/test_bloom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from lighttext.doc_search.bloom import BloomFilter 3 | 4 | 5 | if __name__ == '__main__': 6 | bf = BloomFilter(10) 7 | bf.add_value('dog') 8 | bf.add_value('fish') 9 | bf.add_value('cat') 10 | print(bf.contents) 11 | -------------------------------------------------------------------------------- /test/doc_search/test_searcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from lighttext.doc_search.searcher import Searcher 3 | 4 | 5 | if __name__ == '__main__': 6 | searcher = Searcher() 7 | searcher.add_events('曹操和刘备去赶集') 8 | searcher.add_events('人生苦短,我用Python') 9 | print(list(searcher.search('曹操'))) 10 | -------------------------------------------------------------------------------- /test/fuzzy_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from fuzzywuzzy import fuzz 3 | 4 | if __name__ == '__main__': 5 | a = 'asdf' 6 | b = 'sdfa' 7 | print(fuzz.partial_ratio(a, b)) 8 | -------------------------------------------------------------------------------- /test/test_article.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from lighttext import Article 4 | 5 | 6 | class TestArticle(unittest.TestCase): 7 | 8 
| def test_str(self): 9 | title = '月亮和六便士' 10 | content = """ 11 | 《月亮和六便士》是英国小说家威廉· 萨默赛特·毛姆的创作的长篇小说,成书于1919年。 12 | 作品以法国印象派画家保罗·高更的生平为素材,描述了一个原本平凡的伦敦证券经纪人思特里克兰德,突然着了艺术的魔,抛妻弃子,绝弃了旁人看来优裕美满的生活,奔赴南太平洋的塔希提岛,用圆笔谱写出自己光辉灿烂的生命,把生命的价值全部注入绚烂的画布的故事。 13 | 贫穷的纠缠,病魔的折磨他毫不在意,只是后悔从来没有光顾过他的意识。作品表现了天才、个性与物质文明以及现代婚姻、家庭生活之间的矛盾,有着广阔的生命视角,用散发着消毒水味道的手术刀对皮囊包裹下的人性进行了犀利地解剖,混合着看客讪笑的幽默和残忍的目光。 14 | """ 15 | article = Article(title=title, content=content) 16 | print(article) 17 | -------------------------------------------------------------------------------- /test/test_gensim.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from typing import List 3 | import jieba.posseg as pseg 4 | 5 | from gensim import corpora, models, similarities 6 | 7 | NUM_TOPICS = 350 8 | MODEL_PATH = 'saves/model' 9 | DIC_PATH = 'saves/dic' 10 | 11 | def tokenize(text: str) -> List[str]: 12 | # {标点符号、连词、助词、副词、介词、时语素、‘的’、数词、方位词、代词} 13 | # {'x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r'} 14 | stop_flags = {'x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r'} 15 | stop_words = {'nbsp', '\u3000', '\xa0'} 16 | words = pseg.cut(text) 17 | return [word for word, flag in words if flag not in stop_flags and word not in stop_words] 18 | 19 | 20 | def get_dic_corpus(contents: List[str]): 21 | texts = [tokenize(content) for content in contents] 22 | 23 | dic = corpora.Dictionary(texts) 24 | corpus = [dic.doc2bow(text) for text in texts] 25 | return dic, corpus 26 | 27 | 28 | def get_lsi_model(corpus, dic, num_topics: int = NUM_TOPICS): 29 | lsi = models.LsiModel(corpus, id2word=dic, num_topics=len(dic.token2id)) 30 | index = similarities.MatrixSimilarity(lsi[corpus]) 31 | return lsi, index 32 | 33 | 34 | def get_tfidf_model(corpus, dic): 35 | model = models.TfidfModel(corpus, id2word=dic) 36 | print(dic.token2id, type(dic.token2id)) 37 | index = similarities.MatrixSimilarity(model[corpus], num_features=len(dic.token2id)) 38 | return model, index 39 | 40 | 41 | def get_lda_model(corpus, dic, num_topics: int = NUM_TOPICS): 42 | model = models.LdaModel(corpus, id2word=dic, num_topics=len(dic.token2id)) 43 | index = similarities.MatrixSimilarity(model[corpus], num_features=len(dic.token2id)) 44 | return model, index 45 | 46 | 47 | def get_test_mtx(texts: List[str], dic, model): 48 | corpus = [dic.doc2bow(tokenize(text)) for text in texts] 49 | idx = similarities.MatrixSimilarity(model[corpus], num_features=len(dic.token2id)) 50 | return idx 51 | 52 | 53 | 54 | if __name__ == '__main__': 55 | text = "测试曹操去东北,然后hello world!" 56 | print(tokenize(text)) 57 | 58 | contents = [ 59 | '乔布斯极力推崇自己家的苹果手机', 60 | '这苹果又大又圆又甜,还便宜', 61 | '这年头,谁还用安卓手机,要么是苹果,要么是鸿蒙' 62 | ] 63 | 64 | others = [ 65 | '许多超市里都有卖苹果的', 66 | '比尔盖茨打算收购乔布斯的苹果手机' 67 | ] 68 | 69 | dic, corpus = get_dic_corpus(contents) 70 | 71 | text = '苹果手机还是小米手机呢?' 
72 | text_vec = dic.doc2bow(tokenize(text)) 73 | 74 | print(text_vec) 75 | 76 | # 获取tfidf模型 77 | # model, idx = get_tfidf_model(corpus, dic) 78 | 79 | # 获取lsi模型 80 | # model, idx = get_lsi_model(corpus, dic) 81 | # print(model.print_topics()) 82 | 83 | # 获取lda模型 84 | model, idx = get_lda_model(corpus, dic) 85 | print(model.print_topics()) 86 | 87 | model.save(MODEL_PATH) 88 | dic.save(DIC_PATH) 89 | model = models.LdaModel.load(MODEL_PATH) 90 | dic = corpora.Dictionary.load(DIC_PATH) 91 | 92 | test_mtx = get_test_mtx(others, dic, model) 93 | 94 | # sims = idx[model[text_vec]] 95 | sims = test_mtx[model[text_vec]] 96 | 97 | print(list(enumerate(sims))) -------------------------------------------------------------------------------- /test/test_paragraph.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from lighttext import Paragraph 4 | 5 | 6 | class TestParagraph(unittest.TestCase): 7 | 8 | def test_str(self): 9 | text = '桥上的恋人入对出双,桥边红药叹夜太漫长。月也摇晃,人也彷徨,乌蓬里传来了一曲离殇。' 10 | para = Paragraph(text) 11 | self.assertEqual(str(para), text) 12 | for x in para.sentences: 13 | print(type(x)) 14 | print(x) 15 | print(para.split()) 16 | -------------------------------------------------------------------------------- /test/test_sen_split.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from lighttext.utils.sen_split import split_sentence 3 | 4 | if __name__ == '__main__': 5 | text = "自然语言处理( Natural Language Processing, NLP)是计算机科学领域与人工智能领域中的一个重要方向。 它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。 自然语言处理是一门融语言学、计算机科学、数学于一体的科学。 ... 因而它是计算机科学的一部分 。" 6 | lst = split_sentence(text) 7 | # print(lst) 8 | [print(x) for x in lst] 9 | 10 | -------------------------------------------------------------------------------- /test/test_sentence.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from lighttext import Sentence 4 | 5 | 6 | class TestSentence(unittest.TestCase): 7 | 8 | def test_str(self): 9 | text = '桥上的恋人入对出双,桥边红药叹夜太漫长。' 10 | sen = Sentence(text) 11 | self.assertEqual(str(sen), text) 12 | 13 | def test_split(self): 14 | import jieba 15 | 16 | text = '桥上的恋人入对出双,桥边红药叹夜太漫长。' 17 | sen = Sentence(text) 18 | self.assertEqual(sen.split(), list(jieba.cut(text))) 19 | 20 | def test_words(self): 21 | from lighttext import Word 22 | text = '桥上的恋人入对出双,桥边红药叹夜太漫长。' 23 | sen = Sentence(text) 24 | self.assertTrue(type(sen.words[0]) == Word) 25 | -------------------------------------------------------------------------------- /test/test_sim_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from lighttext.text_similarity.sim import SimModel 3 | 4 | if __name__ == '__main__': 5 | documents = [ 6 | '乔布斯极力推崇自己家的苹果手机', 7 | '这苹果又大又圆又甜,还便宜', 8 | '这年头,谁还用安卓手机,要么是苹果,要么是鸿蒙' 9 | ] 10 | 11 | text = '苹果手机还是小米手机呢?' 
12 | 13 | corpus = [ 14 | '许多超市里都有卖苹果的', 15 | '比尔盖茨打算收购乔布斯的苹果手机' 16 | ] 17 | 18 | path = "sim.bin" 19 | doc_path = "doc.txt" 20 | 21 | model = SimModel(model_type='lsi') 22 | 23 | # 从documents中初始化模型 24 | # model.build_from_documents(documents) 25 | 26 | model.build_from_txt(doc_path) 27 | 28 | model.save(path) 29 | model: SimModel = SimModel.load(path) 30 | model.set_documents(corpus) 31 | print(model.dic.id2token) 32 | print(model.search(text)) 33 | print(model.process(text, corpus)) 34 | -------------------------------------------------------------------------------- /test/test_vocabulary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from lighttext.component.vocabulary import Vocabulary 3 | from lighttext.utils.tokenize import token_split 4 | 5 | if __name__ == '__main__': 6 | corpora = [ 7 | '我们都是中国人', 8 | '谁才是最可爱的人' 9 | ] 10 | 11 | vocab = Vocabulary() 12 | print(vocab.special_words) 13 | vocab.build_from_corpora([token_split(text) for text in corpora]) 14 | print(vocab.word_count) 15 | print(vocab.idx2word) 16 | print(vocab.word2idx) 17 | 18 | vocab.add_word('曹操') 19 | vocab.update(['刘备', '司马懿', '关羽']) 20 | print(vocab.word_count) 21 | print(vocab.idx2word) 22 | print(vocab.word2idx) 23 | 24 | print(vocab.has_word('张飞')) 25 | print(vocab['刘备']) 26 | print('赵云' in vocab) 27 | print(vocab.to_idx('徐庶')) 28 | print(vocab.to_word(2)) 29 | 30 | 31 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.2.3.1 --------------------------------------------------------------------------------
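For reference, a minimal sketch of how the trie-based KeywordProcessor from lighttext/string_search/keyword.py above can be exercised, including the Levenshtein-based fuzzy lookup enabled by max_cost; the keywords and sentences here are illustrative only, and exact spans depend on the non-word-boundary configuration:

    from lighttext.string_search.keyword import KeywordProcessor

    kp = KeywordProcessor()
    kp.add_keyword('python', 'Python')             # keyword -> clean name
    kp.add_keywords_from_list(['java', 'golang'])  # clean name defaults to the keyword itself
    # exact matching, returning (clean_name, start, end) tuples
    print(kp.extract_keywords('i use python and java', span_info=True))
    # allow up to one edit, so the misspelling 'pythom' can still resolve to 'Python'
    print(kp.extract_keywords('i use pythom daily', max_cost=1))
    # replace matched keywords with their clean names
    print(kp.replace_keywords('i use java and golang'))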