├── .gitignore ├── LICENSE ├── README.md ├── demo └── demo_segment.py ├── setup.py ├── test ├── __init__.py ├── data │ ├── test.ngram.txt │ └── test.txt ├── test_bigramtable.py ├── test_dict.py ├── test_enum.py ├── test_hmm.py ├── test_organization_recognition.py ├── test_person_recognition.py ├── test_place_recognition.py ├── test_role_tag.py ├── test_segment.py ├── test_traditionalChineseDict.py ├── test_trie.py ├── test_viterbi_segment.py └── test_wordnet.py └── yaya ├── __init__.py ├── collection ├── __init__.py ├── bigram.py ├── dict.py ├── hmm.py └── trie.py ├── common ├── __init__.py ├── enum.py ├── nature.py ├── nr.py ├── ns.py └── nt.py ├── config.py ├── const.py ├── dictionary ├── __init__.py ├── chinese_traditional_dict.py ├── org_dict.py ├── person_dict.py └── place_dict.py ├── recognition ├── __init__.py ├── organization_recognition.py ├── person_recognition.py ├── place_recognition.py └── recognition.py ├── seg ├── __init__.py ├── segment.py ├── viterbi.py └── wordnet.py └── utility ├── __init__.py ├── bytearray.py ├── chartype.py ├── persistence.py └── singleton.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.bin 2 | *.ya 3 | *.dat 4 | *.pyc 5 | /.idea 6 | /data 7 | /.project 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. 
For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "{}" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright 2015 TonyWang 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # YaYaNLP: Chinese Language Processing
2 | YaYaNLP is a Chinese natural language processing toolkit written in pure Python. The name comes from "牙牙学语" ("babbling", as a toddler learning to talk).
3 | YaYaNLP provides:
4 | - Chinese word segmentation
5 | - Part-of-speech tagging
6 | - Named entity recognition
7 |     * person name recognition
8 |     * place name recognition
9 |     * organization name recognition
10 | - Simplified/Traditional Chinese conversion
11 | 
12 | ## Project
13 | 
14 | Project home: [https://github.com/Tony-Wang/YaYaNLP](https://github.com/Tony-Wang/YaYaNLP)
15 | 
16 | Author's home page: [www.huangyong.me](http://www.huangyong.me)
17 | 
18 | ## Installation
19 | 
20 | ### Download the source package, unpack it, and run
21 | 
22 | ``` bash
23 | python setup.py install
24 | ```
25 | 
26 | ### Download the dictionary and model files
27 | 
28 | YaYaNLP uses dictionary data compatible with HanLP; compiled dictionaries are saved with the .ya extension.
29 | The data can be downloaded directly from the HanLP project: [data-for-1.2.4.zip](http://pan.baidu.com/s/1gd1vo8j)
30 | 
31 | ### Configure the data path
32 | 
33 | Point **yaya/config.py** at your own data directory:
34 | ``` python
35 | DATA_ROOT = "/your/data/path"
36 | ```
37 | 
38 | ## Features
39 | 
40 | ### Person name recognition
41 | 
42 | ```
43 | # Recognize person names
44 | text = u"签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。"
45 | terms = segment.seg(text)
46 | print_terms(terms)
47 | ```
48 | 
49 | ```
50 | 签约/vi
51 | 仪式/n
52 | 前/f
53 | ,/w
54 | 秦光荣/nr
55 | 、/w
56 | 李纪恒/nr
57 | 、/w
58 | 仇和/nr
59 | 等/udeng
60 | 一同/d
61 | 会见/v
62 | 了/ule
63 | 参加/v
64 | 签约/vi
65 | 的/ude1
66 | 企业家/nnt
67 | 。/w
68 | ```
69 | 
70 | 
71 | ### Disambiguation
72 | 
73 | ```
74 | # Resolve segmentation ambiguity
75 | text = u"龚学平等领导说,邓颖超生前杜绝超生"
76 | terms = segment.seg(text)
77 | print_terms(terms)
78 | ```
79 | 
80 | ```
81 | 龚学平/nr
82 | 等/udeng
83 | 领导/n
84 | 说/v
85 | ,/w
86 | 邓颖超/nr
87 | 生前/t
88 | 杜绝/v
89 | 超生/vi
90 | ```
91 | 
92 | ### Place name recognition
93 | 
94 | ```
95 | # Recognize place names
96 | text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"
97 | terms = segment.seg(text)
98 | print_terms(terms)
99 | ```
100 | 
101 | ```
102 | 蓝翔/nt
103 | 给/p
104 | 宁夏/ns
105 | 固原市/ns
106 | 彭阳县/ns
107 | 红河镇/ns
108 | 黑牛沟村/ns
109 | 捐赠/v
110 | 了/ule
111 | 挖掘机/n
112 | ```
113 | 
114 | ### Organization name recognition
115 | 
116 | ```
117 | # Recognize organization names
118 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业"
119 | terms = segment.seg(text)
120 | print_terms(terms)
121 | ```
122 | 
123 | ```
124 | 济南杨铭宇餐饮管理有限公司/nt
125 | 是/vshi
126 | 由/p
127 | 杨先生/nr
128 | 创办/v
129 | 的/ude1
130 | 餐饮企业/nz
131 | ```
132 | 
133 | ### Simplified/Traditional conversion
134 | 
135 | ```
136 | # Simplified to Traditional
137 | text = u"以后等你当上皇后,就能买草莓庆祝了"
138 | print segment.simplified_to_traditional(text)
139 | ```
140 | 
141 | ```
142 | 以後等妳當上皇后,就能買士多啤梨慶祝了
143 | ```
144 | 
145 | ```
146 | # Traditional to Simplified
147 | text = u"用筆記簿型電腦寫程式HelloWorld"
148 | print segment.traditional_to_simplified(text)
149 | ```
150 | 
151 | ```
152 | 用笔记本电脑写程序HelloWorld
153 | ```
154 | 
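155 | ## Quick start
156 | 
157 | The feature snippets above assume `segment` has been imported and a `print_terms` helper is defined. A minimal complete script, adapted from `demo/demo_segment.py`, looks like this; each element of `terms` is a `(word, part-of-speech, offset)` tuple (see `test/test_segment.py`):
158 | 
159 | ``` python
160 | # coding=utf-8
161 | from yaya.seg import segment
162 | 
163 | text = u"签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。"
164 | for term in segment.seg(text):
165 |     print "%s/%s" % (term[0], term[1])  # word / part-of-speech tag
166 | ```
167 | 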
168 | ## Acknowledgements
169 | This project draws on the implementation ideas of [hankcs/HanLP](https://github.com/hankcs/HanLP/) and uses that project's dictionary and model files.
170 | 
171 | 
172 | ## License
173 | * Apache License Version 2.0
174 | * Any project, product, article, or other work that uses any part of YaYaNLP (its features, dictionaries, or models) must clearly credit YaYaNLP and link to this project page.
-------------------------------------------------------------------------------- /demo/demo_segment.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from yaya.seg import segment
3 | 
4 | __author__ = 'tony'
5 | 
6 | 
7 | def print_terms(terms):
8 |     for i, v in enumerate(terms):
9 |         print "%s/%s" % (v[0], v[1])
10 | 
11 | 
12 | def main():
13 | 
14 |     # Resolve segmentation ambiguity
15 |     text = u"龚学平、张晓辉等领导说,邓颖超生前杜绝超生"
16 |     terms = segment.seg(text)
17 |     print_terms(terms)
18 | 
19 |     # Recognize person names
20 |     text = u"签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。"
21 |     terms = segment.seg(text)
22 |     print_terms(terms)
23 | 
24 |     # Recognize place names
25 |     text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机"
26 |     terms = segment.seg(text)
27 |     print_terms(terms)
28 | 
29 |     # Recognize organization names
30 |     text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业"
31 |     terms = segment.seg(text)
32 |     print_terms(terms)
33 | 
34 |     # Simplified to Traditional
35 |     text = u"以后等你当上皇后,就能买草莓庆祝了"
36 |     print segment.simplified_to_traditional(text)
37 | 
38 |     # Traditional to Simplified
39 |     text = u"用筆記簿型電腦寫程式HelloWorld"
40 |     print segment.traditional_to_simplified(text)
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     main()
45 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | PACKAGE = "yaya"
4 | NAME = "YaYaNLP"
5 | DESCRIPTION = "YaYaNLP: Chinese Language Processing"
6 | AUTHOR = "tony huang"
7 | AUTHOR_EMAIL = "tony@huangyong.me"
8 | URL = "http://www.huangyong.me"
9 | 
10 | VERSION = __import__(PACKAGE).__version__
11 | 
12 | setup(
13 |     name=NAME,
14 |     version=VERSION,
15 |     description=DESCRIPTION,
16 |     author=AUTHOR,
17 |     author_email=AUTHOR_EMAIL,
18 |     license="Apache",
19 |     url=URL,
20 |     packages=find_packages(exclude=["test*", "data*"]),
21 |     classifiers=[
22 |         'Development Status :: 3 - Alpha',
23 |         'Intended Audience :: Developers',
24 |         'License :: OSI Approved :: Apache Software License',
25 |         'Programming Language :: Python',
26 |     ],
27 | 
28 |     zip_safe=False,
29 | )
30 | 
-------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tony-Wang/YaYaNLP/d75780290926877e55759fb64e1440f809d653ed/test/__init__.py --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- /test/data/test.ngram.txt: --------------------------------------------------------------------------------
1 | 中华@骨髓 6
2 | 中华@骨髓库 40
3 | 中华@魂 1
4 | 中华@鳖精 2
5 | 中华@鸟类 4
6 | 中华@鸟龙 7
7 | 中华@, 15
8 | 中华人民共和国@不可 1
9 | 中华人民共和国@与 2
10 | 中华人民共和国@中央 1
11 | 中华人民共和国@中央政府 1
12 | 中华人民共和国@主席 2
13 | 中华人民共和国@主席令 2
14 | 中华人民共和国@主管 1
-------------------------------------------------------------------------------- /test/data/test.txt: --------------------------------------------------------------------------------
1 | 一举 n 1
2 | 一举成名 n 1
3 | 一举成名天下知 n 1
4 | 成名 n 1
5 | 天下 n 1
6 | 法兰西 n 1
7 | 注册机 n 1
-------------------------------------------------------------------------------- /test/test_bigramtable.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 | from yaya.collection.bigram import *
4 | 
5 | __author__ = 'tony'
6 | 
7 | 
8 | class TestBiGramTable(TestCase):
9 |     def test_build(self):
10 |         filename = "./data/test.ngram.txt"
11 |         table = BiGramTable.build(filename)
12 |         self.assertEqual(table.get_bifreq(u"中华", u"鸟类"), 4)
13 |         self.assertEqual(table.get_bifreq(u"中华", u"鸟龙"), 7)
14 | 
15 |     def test_get_bifreq(self):
16 |         self.assertEqual(CoreBiGramTable().table.get_bifreq(u"中华", u"鸟类"), 4)
17 |         self.assertEqual(CoreBiGramTable().table.get_bifreq(u"中华", u"鸟龙"), 7)
-------------------------------------------------------------------------------- /test/test_dict.py: --------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from __future__ import absolute_import, unicode_literals
3 | import os
4 | from unittest import TestCase
5 | 
6 | from yaya.collection.dict import *
7 | import yaya.config
8 | from yaya.dictionary.person_dict import PersonDict
9 | 
10 | __author__ = 'tony'
11 | 
12 | 
13 | class TestDoubleArrayTrie(TestCase):
14 |     def test_fetch(self):
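        # Build a small double-array trie from a sorted key list and check that
        # exact_match_search returns a non-negative index for every inserted key.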
15 |         trie = DoubleArrayTrie()
16 |         words = []
17 |         words.append(u"一举")
18 |         words.append(u"一举一动")
19 |         words.append(u"一举成名")
20 |         words.append(u"一举成名天下知")
21 |         words.append(u"啊")
22 |         words.append(u"埃及")
23 |         words.append(u"阿拉伯")
24 |         words.append(u"阿拉伯人")
25 |         words.append(u"阿根廷")
26 |         words.append(u"阿胶")
27 |         words.sort()
28 |         trie.build(key=words, v=['一', '一', '一', '一', '二', '三', '四', '四', '四', '四'])
29 |         self.assertGreater(trie.exact_match_search(u"一举一动"), 0)
30 |         self.assertGreater(trie.exact_match_search(u"阿拉伯"), 0)
31 |         self.assertGreater(trie.exact_match_search(u"阿拉伯人"), 0)
32 | 
33 |     def test_build(self):
34 |         trie = DoubleArrayTrie()
35 |         words = []
36 |         words.append(u"一举 n 1")
37 |         words.append(u"一举一动 n 1")
38 |         words.append(u"一举成名 n 1")
39 |         words.append(u"一举成名天下知 n 1")
40 |         words.append(u"啊 n 1")
41 |         words.append(u"埃及 n 1")
42 |         words.append(u"阿拉伯 n 1")
43 |         words.append(u"阿拉伯人 n 1")
44 |         words.append(u"阿根廷 n 1")
45 |         words.append(u"阿胶 n 1")
46 |         words.sort()
47 |         trie = DoubleArrayTrie.load_from_list(words)
48 |         self.assertEqual(trie.get(u"一举")[1].nature, NATURE.n)
49 |         self.assertEqual(trie.get(u"一举一动")[1].nature, NATURE.n)
50 |         self.assertEqual(trie.get(u"一举成名")[1].nature, NATURE.n)
51 |         self.assertEqual(trie.get(u"一举成名天下知")[1].nature, NATURE.n)
52 |         self.assertEqual(trie.get(u"啊")[1].nature, NATURE.n)
53 |         self.assertEqual(trie.get(u"埃及")[1].nature, NATURE.n)
54 |         self.assertEqual(trie.get(u"阿拉伯")[1].nature, NATURE.n)
55 | 
56 |     def test_load_dict(self):
57 |         new_trie = DoubleArrayTrie.load_dict_file(os.path.join("data", "test.txt"))
58 |         self.assertGreater(new_trie.exact_match_search(u"注册机"), 0)
59 | 
60 |     def test_load_big(self):
61 |         trie = DoubleArrayTrie.load(yaya.config.CORE_DICT_NAME)
62 |         self.assertGreater(trie.exact_match_search(u"法兰西斯"), 0)
63 |         self.assertIsNotNone(trie.get(u"法兰西")[1].nature, u"核心字典里的value字段不应该为None")
64 | 
65 | 
66 |     def test_search(self):
67 |         trie = DoubleArrayTrie.load(os.path.join("data", "test.txt"))
68 |         self.assertGreaterEqual(trie.exact_match_search(u"一举"), 0, u"词典中含有")
69 |         self.assertGreaterEqual(trie.exact_match_search(u"一举成名"), 0, u"词典中含有")
70 |         self.assertGreaterEqual(trie.exact_match_search(u"一举成名天下知"), 0, u"词典中含有")
71 |         search = trie.search(u"一举成名天下知", 0)
72 |         while search.next():
73 |             print(search.value)
74 | 
75 |     def test_searcher_generator(self):
76 |         trie = DoubleArrayTrie.load(os.path.join("data", "test.txt"))
77 |         self.assertGreaterEqual(trie.exact_match_search(u"一举"), 0, u"词典中含有")
78 |         self.assertGreaterEqual(trie.exact_match_search(u"一举成名"), 0, u"词典中含有")
79 |         self.assertGreaterEqual(trie.exact_match_search(u"一举成名天下知"), 0, u"词典中含有")
80 |         search = trie.search(u"一举成名天下知", 0)
81 |         terms = []
82 |         for i, k, v in search.search_all_words():
83 |             terms.append((i, k, v))
84 |             self.assertEqual(v.nature, NATURE.n)
85 |             self.assertEqual(len(v), 1)
86 |             self.assertEqual(v.to_tuple()[1], 1)
87 |         self.assertEqual(len(terms), 5, u"搜索生成器,查找出所有词典里有的词")
88 | 
89 | 
90 | 
91 |     def test_custom_dict(self):
92 |         self.assertGreaterEqual(CustomDict().trie.exact_match_search(u"黄勇"), 0)
93 | 
94 |     def test_dat_transition(self):
95 |         trie = DoubleArrayTrie.load(os.path.join("data", "test.txt"))
96 |         self.assertNotEqual(trie.transition(u"法兰西", 1), -1)
97 |         self.assertEqual(trie.transition(u"法兰东", 1), -1)
98 |         p = trie.transition(u"法兰", 1)
99 |         self.assertNotEqual(trie.transition(u"西", p), -1)
100 |         self.assertEqual(trie.transition(u"东", p), -1)
101 | 
102 |     def test_dat_output(self):
103 |         dat = DoubleArrayTrie()
104 |         dat.build(key=[u"江河湖海"], v=[u"江河湖海 n 1"])
105 |         state = dat.transition(u'江河湖海', 1)
106 |         self.assertGreater(state, -1)
107 |         self.assertIsNotNone(dat.output(state))
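        # output() returns the Attribute stored at an accepting state, so it should
        # agree with what get() returns for the same key: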
self.assertEqual(dat.output(state), dat.get(u"江河湖海")[1]) 109 | 110 | # state = CoreDict().trie.transition(u"大海", 1) 111 | # self.assertGreater(state, -1) 112 | # self.assertEqual(CoreDict().trie.output(state), CoreDict().trie.get(u'大海')[1]) 113 | 114 | 115 | 116 | class TestAttribute(TestCase): 117 | def test_total_freq(self): 118 | text = "测试 n 10 nz 3 p 4" 119 | attr = Attribute(attr=text.split()[1:]) 120 | self.assertEqual(attr.total_frequency, 17) 121 | # self.assertEqual(attr.get_nature_frequency('n'), 10) 122 | self.assertEqual(attr.get_nature_frequency(NATURE.n), 10) 123 | self.assertEqual(attr.get_nature_frequency(NATURE.nz), 3) 124 | self.assertEqual(attr.get_nature_frequency(NATURE.p), 4) 125 | 126 | 127 | class TestAllDict(TestCase): 128 | def test_PersonDict(self): 129 | self.assertNotEqual(PersonDict().trie.exact_match_search(u"籍"), -1) 130 | -------------------------------------------------------------------------------- /test/test_enum.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from yaya.common.enum import Enum, EnumValue 4 | from yaya.common.nr import NR 5 | from yaya.common.nature import NATURE 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | class TestEnum(TestCase): 11 | def test_nr(self): 12 | self.assertEqual(NR.A.index, 14) 13 | 14 | def test_nature(self): 15 | self.assertEqual(NATURE.n.index, 13) 16 | 17 | def test_nature_key_to_index(self): 18 | self.assertEqual(type(NATURE.n), EnumValue) 19 | 20 | def test_nature_key_to_str(self): 21 | self.assertEqual(str(NATURE.n), 'n') 22 | 23 | def test_enum(self, ): 24 | E1 = Enum('a', 'b', enum_name='E1') 25 | self.assertTrue(str(E1.b) == 'b') 26 | self.assertEqual(E1['b'].index, 1) 27 | 28 | def test_demo(self): 29 | # char => int 30 | E1 = Enum('a', 'b', enum_name='E1') 31 | self.assertTrue(str(E1.b) == 'b') 32 | self.assertEqual(E1['b'].index, 1) 33 | self.assertTrue(str(E1[1]) == 'b' ) -------------------------------------------------------------------------------- /test/test_hmm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | from yaya.collection.hmm import PersonTranMatrix 4 | 5 | __author__ = 'tony' 6 | 7 | 8 | class TestHMMMatrix(TestCase): 9 | def test_load(self): 10 | self.assertIsNotNone(PersonTranMatrix().hmm, u"加载人名识别HMM转换矩阵") 11 | self.assertNotEqual(PersonTranMatrix().hmm.matrix.__len__(), 0) 12 | self.assertEqual(PersonTranMatrix().hmm.total_freq, 43938702) 13 | -------------------------------------------------------------------------------- /test/test_organization_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'tony' 3 | from unittest import TestCase 4 | 5 | from yaya.config import Config 6 | from yaya.recognition import person_recognition 7 | from yaya.recognition import place_recognition 8 | from yaya.recognition import organization_recognition 9 | from yaya.seg.viterbi import viterbi 10 | from yaya.seg.wordnet import WordNet, Vertex, gen_word_net, dump_vertexs 11 | from yaya.seg.segment import traditional_to_simplified 12 | 13 | class TestOrgRecognition(TestCase): 14 | def gen_word(self, text): 15 | self.text = text 16 | self.word_net = WordNet(self.text) 17 | # 粗分词网 18 | gen_word_net(self.text, self.word_net) 19 | # 维特比 20 | self.vertexs = viterbi(self.word_net.vertexs) 21 | self.word_net_optimum = WordNet(self.text, vertexs=self.vertexs) 22 | 23 | def 
test_recognition_1_level(self): 24 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业" 25 | self.gen_word(text) 26 | # vertexs = persion_recognition.recognition(vertexs, word_net_optimum, word_net) 27 | # word_net_optimum = WordNet(text, vertexs=vertexs) 28 | organization_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 29 | vertexs = viterbi(self.word_net_optimum.vertexs) 30 | self.assertIn(Vertex(u"济南杨铭宇餐饮管理有限公司", attribute=u"nt 1"), vertexs) 31 | 32 | def test_recognition_2_level(self): 33 | text = u"济南杨铭宇餐饮管理有限公司是由杨先生创办的餐饮企业" 34 | self.gen_word(text) 35 | person_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 36 | place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 37 | word_net_optimum = WordNet(self.text, vertexs=self.vertexs) 38 | vertexs = organization_recognition.recognition(self.vertexs, word_net_optimum, self.word_net) 39 | # viterbi(word_net_optimum.vertexs) 40 | dump_vertexs(vertexs) 41 | self.assertIn(Vertex(u"济南杨铭宇餐饮管理有限公司", attribute=u"nt 1"), vertexs) 42 | 43 | def test_organization_recognition(self): 44 | text = traditional_to_simplified(u"馬總統上午前往陸軍航空601旅,") 45 | Config.debug = True 46 | self.gen_word(text) 47 | person_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 48 | place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 49 | word_net_optimum = WordNet(self.text, vertexs=self.vertexs) 50 | vertexs = organization_recognition.recognition(self.vertexs, word_net_optimum, self.word_net) 51 | dump_vertexs(vertexs) 52 | self.assertIn(Vertex(u"陆军航空601旅", attribute=u"nt 1"), vertexs) 53 | 54 | -------------------------------------------------------------------------------- /test/test_person_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya.seg import segment 5 | from yaya.seg.viterbi import viterbi 6 | from yaya.seg.wordnet import WordNet, gen_word_net, Vertex 7 | from yaya.recognition import person_recognition 8 | 9 | __author__ = 'tony' 10 | 11 | 12 | class TestPersonRecognition(TestCase): 13 | def test_recognition(self): 14 | text = u"签约仪式前,秦光荣、李纪恒、仇和、王春桂、张晓辉等一同会见了参加签约的企业家。" 15 | word_net = WordNet(text) 16 | 17 | # 粗分词网 18 | gen_word_net(text, word_net) 19 | 20 | # 维特比 21 | vertexs = viterbi(word_net.vertexs) 22 | word_net_optimum = WordNet(text, vertexs=vertexs) 23 | person_recognition.recognition(vertexs, word_net_optimum, word_net) 24 | vertexs = viterbi(word_net_optimum.vertexs) 25 | self.assertIn(Vertex(u"秦光荣", attribute=u"nr 1"), vertexs) 26 | self.assertIn(Vertex(u"李纪恒", attribute=u"nr 1"), vertexs) 27 | self.assertIn(Vertex(u"仇和", attribute=u"nr 1"), vertexs) 28 | self.assertIn(Vertex(u"王春桂", attribute=u"nr 1"), vertexs) 29 | self.assertIn(Vertex(u"张晓辉", attribute=u"nr 1"), vertexs) 30 | print(vertexs) 31 | 32 | def test_person_name_V_should_split_to_EL_DL(self): 33 | text = u"龚学平、张晓辉等领导说,邓颖超生前杜绝超生" 34 | vertexs = segment.seg_to_vertexs(text) 35 | terms = segment.vertexs_to_terms(vertexs, True) 36 | self.assertIn(u"龚学平", terms) 37 | self.assertIn(u"张晓辉", terms) 38 | self.assertIn(u"邓颖超", terms) 39 | 40 | -------------------------------------------------------------------------------- /test/test_place_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'tony' 3 | 4 | from unittest import TestCase 5 | 6 | from yaya.recognition import 
place_recognition 7 | from yaya.collection.dict import CustomDict, Attribute 8 | from yaya.seg import segment 9 | from yaya.seg.viterbi import viterbi 10 | from yaya.seg.wordnet import WordNet, gen_word_net, Vertex, combine_by_custom_dict 11 | 12 | 13 | class TestPlaceRecognition(TestCase): 14 | def setUp(self): 15 | self.text = u"蓝翔给宁夏固原市彭阳县红河镇黑牛沟村捐赠了挖掘机" 16 | self.word_net = WordNet(self.text) 17 | # 粗分词网 18 | gen_word_net(self.text, self.word_net) 19 | # 维特比 20 | self.vertexs = viterbi(self.word_net.vertexs) 21 | self.vertexs = combine_by_custom_dict(self.vertexs, CustomDict().trie) 22 | self.word_net_optimum = WordNet(self.text, vertexs=self.vertexs) 23 | 24 | def test_recognition(self): 25 | place_recognition.recognition(self.vertexs, self.word_net_optimum, self.word_net) 26 | vertexs = viterbi(self.word_net_optimum.vertexs) 27 | self.assertIn(Vertex(u"宁夏"), vertexs) 28 | self.assertIn(Vertex(u"固原市"), vertexs) 29 | self.assertIn(Vertex(u"彭阳县", attribute=u"ns 1"), vertexs) 30 | self.assertIn(Vertex(u"红河镇", attribute=u"ns 1"), vertexs) 31 | self.assertIn(Vertex(u"黑牛沟村", attribute=u"ns 1"), vertexs) 32 | -------------------------------------------------------------------------------- /test/test_role_tag.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya.collection.dict import Attribute, DoubleArrayTrie 5 | from yaya.collection.hmm import PersonTranMatrix 6 | from yaya.common.nr import NR, NRPattern 7 | from yaya.const import * 8 | from yaya.recognition.person_recognition import role_tag 9 | from yaya.seg.viterbi import viterbi_roletag 10 | from yaya.seg.wordnet import new_tag_vertex, Vertex 11 | 12 | __author__ = 'tony' 13 | 14 | 15 | class TestRole_tag(TestCase): 16 | def test_role_tag(self): 17 | word_seg_list = [ 18 | new_tag_vertex(TAG_BIGIN), 19 | Vertex(u"秦", attribute=Attribute(u'n 1')), 20 | Vertex(u"光荣", attribute=Attribute(u'n 1')), 21 | Vertex(u"同志", attribute=Attribute(u'n 1')), 22 | new_tag_vertex(TAG_END), 23 | ] 24 | taglist = role_tag(word_seg_list) 25 | 26 | self.assertTrue(isinstance(taglist, list)) 27 | self.assertEqual(taglist[2].to_tuple(), (NR.Z, 29, NR.L, 2)) 28 | 29 | tag_index_list = viterbi_roletag(taglist, PersonTranMatrix().hmm) 30 | self.assertEqual(tag_index_list[0], NR.A, u"人名识别,第一个标识应该为TAG_BAGIN") 31 | self.assertEqual(tag_index_list[1], NR.B) 32 | self.assertEqual(tag_index_list[2], NR.Z) 33 | self.assertEqual(tag_index_list[3], NR.L) 34 | self.assertEqual(tag_index_list[4], NR.A) 35 | 36 | def test_NRPattern(self): 37 | """ 38 | 39 | 40 | """ 41 | trie = DoubleArrayTrie() 42 | NRPattern.sort() 43 | trie.build(key=NRPattern) 44 | self.assertTrue(trie.exact_match_search("BCD") != -1) 45 | self.assertTrue(trie.exact_match_search("BBCD") != -1) 46 | self.assertTrue(trie.exact_match_search("BG") != -1) 47 | self.assertTrue(trie.exact_match_search("DG") != -1) 48 | self.assertTrue(trie.exact_match_search("CD") == -1) 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /test/test_segment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya.collection.dict import DoubleArrayTrie 5 | from yaya.seg import segment 6 | from yaya.seg.segment import traditional_seg 7 | from yaya.seg.wordnet import atom_seg, WordNet, gen_word_net, combine_by_custom_dict 8 | from 
yaya.utility.chartype import * 9 | 10 | __author__ = 'tony' 11 | 12 | 13 | class TestAtomSegment(TestCase): 14 | def test_char_type(self): 15 | self.assertEqual(get('a'), CT_SINGLE) 16 | self.assertEqual(get('1'), CT_NUM) 17 | self.assertEqual(get(u'中'), CT_CHINESE) 18 | 19 | def test_atom_seg(self): 20 | text = '12341' 21 | node_list = atom_seg(text, 0, text.__len__()) 22 | self.assertEqual(node_list.__len__(), 1) 23 | self.assertEqual(node_list[0].pos, CT_NUM) 24 | text = '123.41' 25 | node_list = atom_seg(text, 0, text.__len__()) 26 | self.assertEqual(node_list.__len__(), 1) 27 | self.assertEqual(node_list[0].pos, CT_NUM) 28 | text = 'abc' 29 | node_list = atom_seg(text, 0, text.__len__()) 30 | self.assertEqual(node_list.__len__(), 1) 31 | self.assertEqual(node_list[0].pos, CT_SINGLE) 32 | 33 | 34 | class TestSegment(TestCase): 35 | def test_seg_find_nr(self): 36 | text = u"签约仪式前,秦光荣、李纪恒、仇和、王春桂等一同会见了参加签约的企业家。" 37 | terms = segment.seg(text) 38 | self.assertIn((u"秦光荣", 'nr', 6), terms, u"测试是否找出人名") 39 | self.assertIn((u"李纪恒", 'nr', 10), terms, u"测试是否找出人名") 40 | self.assertIn((u"仇和", 'nr', 14), terms, u"测试是否找出人名") 41 | 42 | def test_combin_by_dict(self): 43 | dat = DoubleArrayTrie() 44 | dat.build([u"江", u"河", u"湖", "海"]) 45 | text = u"江河湖海" 46 | word_net = WordNet(text) 47 | gen_word_net(text, word_net, dat) 48 | vertexs = [v[0] for v in word_net.vertexs] 49 | self.assertEqual(len(word_net), 6, u"自定义字典分词") 50 | 51 | combin_dat = DoubleArrayTrie() 52 | combin_dat.build(key=[u"江河湖海"], v=[u"江河湖海 n 1"]) 53 | vertexs = combine_by_custom_dict(vertexs, combin_dat) 54 | self.assertEqual(len(vertexs), 3, u"合并完成后应该只有前尾加中间词") 55 | 56 | def test_traditional_seg(self): 57 | text = u"記者羅吉訓/新竹報導 雙方合作的主要內容包括,希望能夠促成太陽能設備安裝維修人才培養;結合推廣教育由綠野集團引薦國外學生來臺就讀;與觀光及餐飲系合作觀光休閒產業,提供來臺遊客入住大華科大樂群會館,並導覽參訪張學良故居等臺灣各知名景點。 訂閱聯絡電話:02-23222722-814 瀏覽器建議使用IE 9.0以上版本 最佳觀看解析度1024x768 網站更新日期:2015/12/13 " 58 | print traditional_seg(text) 59 | -------------------------------------------------------------------------------- /test/test_traditionalChineseDict.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya import config 5 | from yaya.dictionary.chinese_traditional_dict import TraditionalChineseDict, SimplifiedChineseDict 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | class TestTraditionalChineseDict(TestCase): 11 | def test_convert_simplified_to_traditional(self): 12 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(u"用筆記簿型電腦寫程式HelloWorld") 13 | self.assertEqual(simplified, u"用笔记本电脑写程序HelloWorld") 14 | 15 | def test_convert_traditional_to_simplified(self): 16 | config.Config.debug = True 17 | traditional = SimplifiedChineseDict().convert_simplified_to_traditional(u"用笔记本电脑写程序HelloWorld") 18 | self.assertEqual(traditional, u"用筆記簿型電腦寫程式HelloWorld") 19 | 20 | def test_traditional_chinese_dict_search_all_words(self): 21 | searcher = TraditionalChineseDict().trie.search(u"用筆記簿型電腦寫程式HelloWorld") 22 | for i, k, v in searcher.search_all_words(): 23 | print i, k, v 24 | 25 | def test_demo1(self): 26 | text = u"記者羅吉訓/新竹報導 雙方合作的主要內容包括,希望能夠促成太陽能設備安裝維修人才培養;" \ 27 | u"結合推廣教育由綠野集團引薦國外學生來臺就讀;與觀光及餐飲系合作觀光休閒產業," \ 28 | u"提供來臺遊客入住大華科大樂群會館,並導覽參訪張學良故居等臺灣各知名景點。 " \ 29 | u"訂閱聯絡電話:02-23222722-814 瀏覽器建議使用IE 9.0以上版本 最佳觀看解析度1024x768 " \ 30 | u"網站更新日期:2015/12/13 " 31 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(text) 32 | print(simplified) 33 | text = u"媒體詢問對目前選戰看法?朱立倫說最重要是要把沉默的大眾喚出來," \ 34 | 
u"為了台灣安定、兩岸和平及經濟發展,拜託大家在最後關頭全力團結及共同支持。 " \ 35 | u"今晚黨內重量級人士到齊,媒體詢問等於是最高規格的選戰會議," \ 36 | u"是否會向總統當面拜託總統夫人周美青出來?朱立倫馬上向身旁的馬總統說," \ 37 | u"「對呀,請馬學長拜託周學姐出來輔選」,總統笑著說「我一定轉達」。 " \ 38 | u"朱立倫表示,今晚餐敘不是輔選會報,但不管是馬總統、吳副總統、王金平及行政院長毛治國," \ 39 | u"大家都是同心協力,求團結勝選 。 他強調,最近到各地陸續見到好多民眾展現熱情," \ 40 | u"希望最後一個月不斷加溫,直到明年1月16日勝選。1041217 這裡有個好粉絲團,需要你關注!" 41 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(text) 42 | print(simplified) 43 | -------------------------------------------------------------------------------- /test/test_trie.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /test/test_viterbi_segment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from yaya.collection.hmm import OrgTranMatrix 5 | from yaya.common.nt import NT 6 | from yaya.seg.segment import vertexs_to_terms 7 | from yaya.seg.viterbi import * 8 | from yaya.seg.wordnet import * 9 | 10 | __author__ = 'tony' 11 | 12 | 13 | class TestViterbiSegment(TestCase): 14 | def test_viterbi(self): 15 | text = u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" 16 | # text = u"商品23和服务" 17 | word_net = WordNet(text) 18 | gen_word_net(text, word_net) 19 | vertex_list = vertexs_to_terms(viterbi(word_net.vertexs), True) 20 | self.assertTrue(u"工信处" in vertex_list) 21 | self.assertTrue(u"女" in vertex_list) 22 | self.assertTrue(u"干事" in vertex_list) 23 | self.assertTrue(u"每月" in vertex_list) 24 | self.assertTrue(u"经过" in vertex_list) 25 | self.assertTrue(u"下属" in vertex_list) 26 | self.assertTrue(u"科室" in vertex_list) 27 | self.assertTrue(u"都" in vertex_list) 28 | self.assertTrue(u"要" in vertex_list) 29 | self.assertTrue(u"亲口" in vertex_list) 30 | self.assertTrue(u"交代" in vertex_list) 31 | self.assertTrue(u"24" in vertex_list) 32 | self.assertTrue(u"口" in vertex_list) 33 | self.assertTrue(u"交换机" in vertex_list) 34 | self.assertTrue(u"等" in vertex_list) 35 | self.assertTrue(u"技术性" in vertex_list) 36 | self.assertTrue(u"器件" in vertex_list) 37 | self.assertTrue(u"的" in vertex_list) 38 | self.assertTrue(u"安装" in vertex_list) 39 | self.assertTrue(u"工作" in vertex_list) 40 | 41 | def test_custom_dict(self): 42 | text = u"黄勇今天来上班了" 43 | word_net = WordNet(text) 44 | gen_word_net(text, word_net) 45 | vertex_list = viterbi(word_net.vertexs) 46 | vertex_list = combine_by_custom_dict(vertex_list) 47 | self.assertEqual(vertex_list[1].real_word, u"黄勇") 48 | 49 | 50 | class TestViterbi(TestCase): 51 | def test_computer(self): 52 | node_list = [] 53 | node_list.append(Attribute((NT.S, 19800))) 54 | node_list.append(Attribute((NT.K, 1000, NT.D, 1000))) 55 | node_list.append(Attribute((NT.C, 1000, NT.B, 1000))) 56 | node_list.append(Attribute((NT.M, 1000))) 57 | node_list.append(Attribute((NT.P, 12, NT.D, 1))) 58 | node_list.append(Attribute((NT.B, 19800))) 59 | tag_list = viterbi_standard(node_list, hmm=OrgTranMatrix().hmm) 60 | self.assertEquals(6, len(tag_list)) 61 | self.assertEqual(NT.K, tag_list[1]) 62 | self.assertEqual(NT.C, tag_list[2]) 63 | self.assertEqual(NT.M, tag_list[3]) 64 | self.assertEqual(NT.D, tag_list[4]) 65 | -------------------------------------------------------------------------------- /test/test_wordnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from __future__ import absolute_import 3 | from unittest import TestCase 4 | 5 | from 
yaya.const import TAG_BIGIN, TAG_END 6 | from yaya.seg.wordnet import WordNet, gen_word_net, Vertex, new_tag_vertex 7 | 8 | __author__ = 'tony' 9 | 10 | 11 | class TestWordNet(TestCase): 12 | def test_gen_word_net(self): 13 | text = u"一举成名天下知" 14 | word_net = WordNet(text) 15 | gen_word_net(text, word_net) 16 | self.assertEqual(word_net.vertexs.__len__(), text.__len__() + 2) 17 | # 一举 一举成名 18 | # 举 19 | # 成 成名 20 | # 名 21 | # 天 天下 22 | # 下 23 | # 知 24 | self.assertEqual(word_net.vertexs[1].__len__(), 2) 25 | self.assertEqual(word_net.vertexs[2].__len__(), 1) 26 | self.assertEqual(word_net.vertexs[3].__len__(), 2) 27 | self.assertEqual(word_net.vertexs[4].__len__(), 1) 28 | self.assertEqual(word_net.vertexs[5].__len__(), 2) 29 | self.assertEqual(word_net.vertexs[6].__len__(), 1) 30 | self.assertEqual(word_net.vertexs[7].__len__(), 1) 31 | 32 | def test_gen_word_net_include_num(self): 33 | text = u"123456" 34 | word_net = WordNet(text) 35 | gen_word_net(text, word_net) 36 | self.assertEqual(word_net.vertexs.__len__(), 6 + 2) 37 | self.assertTrue([] not in word_net.vertexs, u"原始词网,不能可能有空节点") 38 | 39 | def test_vector(self): 40 | v1 = Vertex("test", attribute="nr 1") 41 | v2 = Vertex("test", attribute="nr 1") 42 | v3 = Vertex("test", attribute="nr1 1") 43 | self.assertEqual(v1, v2) 44 | self.assertNotEqual(v1, v3) 45 | self.assertIn(v1, [v2]) 46 | self.assertNotIn(v1, [v3]) 47 | 48 | def test_tag_vector_real_word_len_should_eq_0(self): 49 | # 标识词的real_word不能为空,否则在字典里无法表示 50 | self.assertEqual(new_tag_vertex(TAG_BIGIN).real_word, chr(32)) 51 | self.assertEqual(new_tag_vertex(TAG_END).real_word, chr(32)) 52 | 53 | def test_word_net_insert(self): 54 | text = u"1234567890" 55 | word_net_all = WordNet(text) 56 | for i, c in enumerate(text): 57 | word_net_all.add(i + 1, Vertex(c)) 58 | -------------------------------------------------------------------------------- /yaya/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | __version__ = "0.1.1" 3 | -------------------------------------------------------------------------------- /yaya/collection/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /yaya/collection/bigram.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import absolute_import 3 | 4 | import time 5 | 6 | from yaya import config 7 | from yaya.collection.dict import CoreDict 8 | from yaya.const import logger 9 | from yaya.utility.singleton import singleton 10 | 11 | __author__ = 'tony' 12 | 13 | 14 | class BiGramTable: 15 | def __init__(self): 16 | self.start = [] 17 | self.pair = [] 18 | 19 | def get_bifreq(self, pre_word, next_word): 20 | pre_word_id = pre_word if type(pre_word) is int else CoreDict().trie.exact_match_search(pre_word) 21 | if pre_word_id == -1: 22 | return 0 23 | next_word_id = next_word if type(next_word) is int else CoreDict().trie.exact_match_search(next_word) 24 | if next_word_id == -1: 25 | return 0 26 | index = binary_search(self.pair, self.start[pre_word_id], 27 | self.start[pre_word_id + 1] - self.start[pre_word_id], 28 | next_word_id) 29 | if index < 0: 30 | return 0 31 | index <<= 1 32 | return self.pair[index + 1] 33 | 34 | @staticmethod 35 | def load(filename=config.CORE_BIGRAM_NAME): 36 | start = time.time() 37 | logger.info(u"开始加载核心二元语法词表") 38 | import os 39 | if 
os.path.exists(filename + config.DICT_BIN_EXT):
40 |             table = BiGramTable.load_bin(filename + config.DICT_BIN_EXT)
41 |         else:
42 |             table = BiGramTable.build(filename)
43 |             import cPickle as Pickle
44 |             with open(filename + config.DICT_BIN_EXT, 'w') as f:
45 |                 Pickle.dump(table, f)
46 |         logger.info(u"加载核心二元语法词表完毕,耗时%s", time.time() - start)
47 |         return table
48 | 
49 |     @staticmethod
50 |     def load_bin(filename):
51 |         import cPickle as Pickle
52 |         with open(filename, 'r') as f:
53 |             bigram = Pickle.load(f)
54 |             f.close()
55 |         return bigram
56 | 
57 |     @staticmethod
58 |     def build(filename):
59 |         import codecs
60 |         f = codecs.open(filename, 'r', 'utf-8')
61 |         pre_word_map = {}
62 |         max_word_id = CoreDict().trie.word_size()
63 |         total = 0
64 |         while True:
65 |             line = f.readline()
66 |             if not line:
67 |                 break
68 |             params = line.split()
69 |             if params.__len__() != 2:
70 |                 continue
71 |             two_word = params[0].split('@', 2)
72 |             if two_word.__len__() != 2:
73 |                 continue
74 | 
75 |             pre_word_id = CoreDict().trie.exact_match_search(two_word[0])
76 |             if pre_word_id == -1:
77 |                 continue
78 |             next_word_id = CoreDict().trie.exact_match_search(two_word[1])
79 |             if next_word_id == -1:
80 |                 continue
81 |             if pre_word_id not in pre_word_map:
82 |                 pre_word_map[pre_word_id] = {}
83 |             next_word_map = pre_word_map.get(pre_word_id)
84 |             next_word_map[next_word_id] = int(params[1])
85 |             total += 2
86 |         f.close()
87 | 
88 |         table = BiGramTable()
89 |         table.start = [0] * (max_word_id + 1)
90 |         table.pair = [0] * total
91 |         offset = 0
92 |         for i in range(max_word_id):
93 |             next_word_map = pre_word_map.get(i, None)
94 |             if next_word_map is not None:
95 |                 key_list = next_word_map.keys()
96 |                 key_list.sort()
97 |                 for k in key_list:
98 |                     index = offset << 1
99 |                     table.pair[index] = k
100 |                     table.pair[index + 1] = next_word_map[k]
101 |                     offset += 1
102 |             table.start[i + 1] = offset
103 |         return table
104 | 
105 | 
106 | def binary_search(a, from_index, length, key):
107 |     low = from_index
108 |     high = from_index + length - 1
109 |     while low <= high:
110 |         mid = (low + high) >> 1
111 |         mid_val = a[mid << 1]
112 |         if mid_val < key:
113 |             low = mid + 1
114 |         elif mid_val > key:
115 |             high = mid - 1
116 |         else:
117 |             return mid
118 |     return -(low + 1)
119 | 
120 | 
121 | @singleton
122 | class CoreBiGramTable:
123 |     def __init__(self):
124 |         self.table = BiGramTable.load()
125 | 
126 | 
127 | CORE_BIG_RAM_TABLE = CoreBiGramTable()
-------------------------------------------------------------------------------- /yaya/collection/dict.py: --------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | try:
3 |     import cPickle as Pickle
4 | except:
5 |     import pickle as Pickle
6 | try:
7 |     range = xrange
8 | except NameError:
9 |     pass
10 | 
11 | from collections import OrderedDict
12 | 
13 | from yaya.const import *
14 | from yaya import config
15 | from yaya.utility.singleton import singleton
16 | from yaya.common.nature import NATURE
17 | 
18 | ATTRIBUTE_MAIN_NATURE_INDEX = 0
19 | 
20 | 
21 | class Node(object):
22 |     def __init__(self, code=0, depth=0, left=0, right=0):
23 |         self.code = code
24 |         self.depth = depth
25 |         self.left = left
26 |         self.right = right
27 | 
28 | 
29 | class Attribute(object):
30 |     def __init__(self, attr, cls=NATURE):
31 |         self.cls = cls
32 |         self.total = 0
33 |         if not isinstance(attr, tuple):
34 |             self.data = ()
35 |             if attr is not None:
36 |                 attr = attr if isinstance(attr, list) else attr.split(' ')
37 |                 nature = []
38 |                 for i in range(0, attr.__len__(), 2):
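                    # `attr` alternates part-of-speech tag and frequency:
                    # [tag0, freq0, tag1, freq1, ...], so walk it in pairs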
nature.append(cls[attr[i]]) 40 | nature.append(int(attr[i + 1])) 41 | self.total += int(attr[i + 1]) 42 | self.data = tuple(nature) 43 | else: 44 | self.data = attr 45 | for i in range(len(self.data)): 46 | if i % 2 == 1: 47 | self.total += self.data[i] 48 | 49 | def to_tuple(self): 50 | return self.data 51 | 52 | def __str__(self): 53 | return ' '.join([str(x) for x in self.data]) 54 | 55 | def __repr__(self): 56 | return u"Attribute(%s)" % self.__str__() 57 | 58 | def __len__(self): 59 | return len(self.data) / 2 60 | 61 | def __eq__(self, other): 62 | return str(self) == str(other) 63 | 64 | def get_nature_frequency(self, nature): 65 | try: 66 | return self.data[self.data.index(nature) + 1] 67 | except: 68 | return 0 69 | 70 | @property 71 | def natures(self): 72 | for i in range(0, len(self.data), 2): 73 | yield i / 2, self.data[i], self.data[i + 1] 74 | # return self.data 75 | 76 | @property 77 | def nature(self): 78 | if self.data.__len__() != 0: 79 | return self.data[ATTRIBUTE_MAIN_NATURE_INDEX] 80 | else: 81 | return None 82 | 83 | @property 84 | def total_frequency(self): 85 | return self.total 86 | 87 | 88 | class FastArray: 89 | def __init__(self, default_value=0): 90 | self.default_value = 0 91 | self.data = {} 92 | self._max_key = 0 93 | pass 94 | 95 | def __getitem__(self, item): 96 | return self.data.get(item, 0) 97 | 98 | def __setitem__(self, key, value): 99 | self.data[key] = value 100 | self._max_key = max(self._max_key, key) 101 | 102 | @property 103 | def max_key(self): 104 | return self._max_key 105 | 106 | def extend(self, size): 107 | pass 108 | 109 | 110 | class DoubleArrayTrie: 111 | def __init__(self, enum_cls=NATURE): 112 | self.alloc_size = 0 113 | self.check = [] 114 | self.base = [] 115 | self.enum_cls = enum_cls 116 | self.used = [] 117 | self.size = 0 118 | self.key = [] 119 | self.key_size = 0 120 | self.length = None 121 | self.value = [] 122 | self.v = None 123 | self.progress = 0 124 | self.next_check_pos = 0 125 | self.error_ = 0 126 | 127 | def word_size(self): 128 | if self.v is None: 129 | return 0 130 | else: 131 | return self.v.__len__() 132 | 133 | def resize(self, newsize): 134 | offsize = newsize - self.alloc_size 135 | self.base.extend([0] * offsize) 136 | self.check.extend([0] * offsize) 137 | self.used.extend([0] * offsize) 138 | self.alloc_size = newsize 139 | 140 | def fetch(self, parent, siblings): 141 | if self.error_ < 0: 142 | return 0 143 | prev = 0 144 | for i in range(parent.left, parent.right): 145 | if parent.depth > (self.length[i] if self.length is not None else self.key[i].__len__()): 146 | continue 147 | tmp = self.key[i] 148 | cur = 0 149 | if (self.length[i] if self.length is not None else tmp.__len__()) != parent.depth: 150 | cur = ord(tmp[parent.depth]) + 1 151 | 152 | # 检测是不是字典序 153 | if prev > cur: 154 | return 0 155 | 156 | if cur != prev or siblings.__len__() is 0: 157 | tmp_node = Node(depth=parent.depth + 1, code=cur, left=i, right=0) 158 | if siblings.__len__() != 0: 159 | siblings[-1].right = i 160 | siblings.append(tmp_node) 161 | prev = cur 162 | 163 | if siblings.__len__() != 0: 164 | siblings[-1].right = parent.right 165 | 166 | return siblings.__len__() 167 | 168 | def insert(self, siblings): 169 | if self.error_ < 0: 170 | return 0 171 | 172 | begin = 0 173 | pos = (siblings[0].code + 1 if (siblings[0].code + 1 > self.next_check_pos) else self.next_check_pos) - 1 174 | nonzero_num = 0 175 | first = 0 176 | 177 | if self.alloc_size <= pos: 178 | self.resize(pos + 1) 179 | 180 | while 1: 181 | pos += 1 182 | 
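            # linear probe: keep advancing `pos` (growing the arrays on demand)
            # until a base offset is found whose required slots are all free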
183 | if self.alloc_size <= pos: 184 | self.resize(pos + 1) 185 | 186 | if self.check[pos] != 0: 187 | nonzero_num += 1 188 | continue 189 | elif first is 0: 190 | self.next_check_pos = pos 191 | first = 1 192 | 193 | begin = pos - siblings[0].code 194 | 195 | if self.alloc_size <= (begin + siblings[-1].code): 196 | if 1.05 > 1.0 * self.key_size / (self.progress + 1): 197 | l = 1.05 198 | else: 199 | l = 1.0 * self.key_size / (self.progress + 1) 200 | self.resize(int(self.alloc_size * l)) 201 | 202 | if self.used[begin]: 203 | continue 204 | 205 | find = True 206 | for i in range(siblings.__len__()): 207 | if self.check[begin + siblings[i].code] != 0: 208 | find = False 209 | break 210 | if not find: 211 | continue 212 | break 213 | 214 | if 1.0 * nonzero_num / (pos - self.next_check_pos + 1) >= 0.95: 215 | self.next_check_pos = pos 216 | 217 | self.used[begin] = True 218 | self.size = self.size if (self.size > begin + siblings[-1].code + 1) else \ 219 | begin + siblings[-1].code + 1 220 | 221 | for i in range(siblings.__len__()): 222 | self.check[begin + siblings[i].code] = begin 223 | 224 | for i in range(siblings.__len__()): 225 | new_siblings = [] 226 | 227 | if self.fetch(siblings[i], new_siblings) is 0: 228 | self.base[begin + siblings[i].code] = -self.value[siblings[i].left] - 1 if ( 229 | self.value is not None) else (-siblings[i].left - 1) 230 | 231 | if self.value is not None and -self.value[siblings[i].left] - 1 >= 0: 232 | self.error_ = -2 233 | return 0 234 | 235 | self.progress += 1 236 | else: 237 | h = self.insert(new_siblings) 238 | self.base[begin + siblings[i].code] = h 239 | 240 | return begin 241 | 242 | def build(self, key=None, length=None, key_size=None, v=None): 243 | if key is None: 244 | return 0 245 | if key_size is not None and key_size > key.__len__(): 246 | return 0 247 | self.key = key 248 | self.length = length 249 | self.key_size = key_size if key_size is not None else key.__len__() 250 | self.value = None 251 | self.v = v if v is not None else key 252 | self.progress = 0 253 | 254 | self.resize(65536 * 32) 255 | 256 | self.base[0] = 1 257 | self.next_check_pos = 0 258 | 259 | root_node = Node(left=0, right=self.key_size, depth=0, code=0) 260 | 261 | siblings = [] 262 | self.fetch(root_node, siblings) 263 | self.insert(siblings) 264 | 265 | self.key = None 266 | 267 | return self.error_ 268 | 269 | def exact_match_search(self, key, pos=0, keylen=0, nodepos=0): 270 | if key is None: 271 | return -1 272 | if keylen <= 0: 273 | keylen = key.__len__() 274 | if nodepos <= 0: 275 | nodepos = 0 276 | 277 | result = -1 278 | b = self.base[nodepos] 279 | 280 | for i in range(pos, keylen): 281 | p = b + ord(key[i]) + 1 282 | if b == self.check[p]: 283 | b = self.base[p] 284 | else: 285 | return result 286 | 287 | p = b 288 | n = self.base[p] 289 | if b == self.check[p] and n < 0: 290 | result = -n - 1 291 | return result 292 | 293 | def get(self, word): 294 | index = self.exact_match_search(word) 295 | if index >= 0: 296 | return index, self.get_attr(self.v[index]) 297 | else: 298 | return index, None 299 | 300 | def get_attr(self, value): 301 | if isinstance(value, unicode) or isinstance(value, str): 302 | return Attribute(value.split(chr(32))[1:], cls=self.enum_cls) 303 | elif isinstance(value, list): 304 | return Attribute(value[1:], cls=self.enum_cls) 305 | raise Exception("异常的字典值类型:%s" % type(value)) 306 | 307 | def transition(self, path, state_from): 308 | b = state_from 309 | for i in range(len(path)): 310 | p = b + ord(path[i]) + 1 311 | if b == 
self.check[p]: 312 | b = self.base[p] 313 | else: 314 | return -1 315 | p = b 316 | return p 317 | 318 | def output(self, state): 319 | if state < 0: 320 | return None 321 | n = self.base[state] 322 | if state == self.check[state] and n < 0: 323 | return self.get_attr(self.v[-n - 1]) 324 | return None 325 | 326 | def dump(self): 327 | for i in range(self.size): 328 | print("i: %s [%s,%s]" % (i, self.base[i], self.check[i])) 329 | 330 | def compress(self): 331 | last = self.alloc_size - 1 332 | while self.used[last] == 0: 333 | last -= 1 334 | self.base = self.base[:last + 1] 335 | self.check = self.check[:last + 1] 336 | self.alloc_size = len(self.base) 337 | 338 | @staticmethod 339 | def save_to_ya(trie, filename): 340 | # trie.compress() 341 | import cPickle as Pickle 342 | with open(filename, 'wb') as f: 343 | Pickle.dump(trie, f, protocol=Pickle.HIGHEST_PROTOCOL) 344 | f.close() 345 | 346 | @staticmethod 347 | def save_to_yaf(trie, filename): 348 | pass 349 | 350 | @staticmethod 351 | def load_bin(filename): 352 | with open(filename, 'rb') as f: 353 | trie = Pickle.load(f) 354 | return trie 355 | 356 | @staticmethod 357 | def load_dict_file(filenames, key_func=None, value_func=None, enum_cls=NATURE): 358 | import codecs 359 | k, v, dict_list = [], [], [] 360 | if not isinstance(filenames, list): 361 | filenames = [filenames] 362 | 363 | for filename in filenames: 364 | with codecs.open(filename, 'rb', 'utf-8') as f: 365 | dict_list += f.read().splitlines() 366 | 367 | return DoubleArrayTrie.load_from_list(dict_list, key_func, value_func, enum_cls) 368 | 369 | @staticmethod 370 | def load_from_list(dict_list, key_func=None, value_func=None, enum_cls=NATURE): 371 | key_func = key_func or (lambda i: i.split()[0]) 372 | value_func = value_func or (lambda i: i) 373 | # sort 374 | dict_map = {} 375 | for i in dict_list: 376 | try: 377 | i = i.replace('\t', chr(32)) 378 | dict_map[key_func(i)] = value_func(i) # 此处需要解开成列表,viterbi会直接用到 379 | except: 380 | logger.error(u"字典项:[ %s ]格式异常。" % i) 381 | continue 382 | dict_map = OrderedDict(sorted(dict_map.items())) 383 | trie = DoubleArrayTrie(enum_cls=enum_cls) 384 | trie.build(key=dict_map.keys(), v=dict_map.values()) 385 | return trie 386 | 387 | def search(self, key, offset=0): 388 | return Searcher(self, key, offset) 389 | 390 | @staticmethod 391 | def load(filenames, key_func=None, value_func=None, 392 | dict_bin_ext=config.DICT_BIN_EXT, enum_cls=NATURE): 393 | import os 394 | # 考虑用户自定义宝典输入为列表的情况 395 | filename = filenames[0] if type(filenames) is list else filenames 396 | if config.Config.use_dict_cache and os.path.exists(filename + dict_bin_ext): 397 | return DoubleArrayTrie.load_bin(filename + dict_bin_ext) 398 | trie = DoubleArrayTrie.load_dict_file(filenames, key_func, value_func, enum_cls) 399 | DoubleArrayTrie.save_to_ya(trie, filename + dict_bin_ext) 400 | return trie 401 | 402 | @staticmethod 403 | def buildcoredictsearcher(key, offset=0): 404 | return DoubleArrayTrie().load(config.CORE_DICT_NAME).search(key, offset) 405 | 406 | 407 | class Searcher: 408 | def __init__(self, trie, chararray, offset=0): 409 | # key的起点 410 | self.begin = 0 411 | # key的长度 412 | self.length = 0 413 | # key的字典序坐标 414 | self.index = 0 415 | self.key = None 416 | 417 | # key对应的value 418 | self.value = None 419 | 420 | # 传入的字符数组 421 | self.code_array = [ord(c) for c in chararray] 422 | 423 | self.char_array = chararray 424 | 425 | # 上一个node位置 426 | self.trie = trie 427 | self.last = trie.base[0] 428 | 429 | # charArray的长度,效率起见,开个变量 430 | self.array_length = 
407 | class Searcher: 408 | def __init__(self, trie, chararray, offset=0): 409 | # start offset of the key 410 | self.begin = 0 411 | # length of the key 412 | self.length = 0 413 | # lexicographic index of the key 414 | self.index = 0 415 | self.key = None 416 | 417 | # value associated with the key 418 | self.value = None 419 | 420 | # the character array being searched 421 | self.code_array = [ord(c) for c in chararray] 422 | 423 | self.char_array = chararray 424 | 425 | # position of the previous node 426 | self.trie = trie 427 | self.last = trie.base[0] 428 | 429 | # length of charArray, kept in a variable for efficiency 430 | self.array_length = len(chararray) 431 | 432 | # index of the previous character 433 | self.i = offset - 1 434 | # A trick: if the text length is 0, calling next() would otherwise run out of bounds. 435 | self.begin = -1 if (self.array_length == 0) else offset 436 | 437 | # Whether there was a hit. False means the search is over; otherwise read the public members for the details of the match. 438 | def next(self): 439 | b = self.last 440 | while 1: 441 | self.i += 1 442 | if self.i == self.array_length: # pointer reached the end; move the start forward one, restart, reset the state 443 | self.begin += 1 444 | if self.begin == self.array_length: 445 | break 446 | self.i = self.begin 447 | b = self.trie.base[0] 448 | 449 | p = b + self.code_array[self.i] + 1 # state transition p = base[char[i-1]] + char[i] + 1 450 | if b == self.trie.check[p]: # base[char[i-1]] == check[base[char[i-1]] + char[i] + 1] 451 | b = self.trie.base[p] # transition succeeded 452 | else: 453 | self.i = self.begin # transition failed; move the start forward one, restart, reset the state 454 | self.begin += 1 455 | if self.begin == self.array_length: 456 | break 457 | b = self.trie.base[0] 458 | continue 459 | p = b 460 | n = self.trie.base[p] 461 | if b == self.trie.check[p] and n < 0: # base[p] == check[p] && base[p] < 0: found a word 462 | self.length = self.i - self.begin + 1 463 | self.index = -n - 1 464 | self.key = self.char_array[self.begin:self.begin + self.length] 465 | self.value = self.trie.get_attr(self.trie.v[self.index]) 466 | self.last = b 467 | return True 468 | return False 469 | 470 | def search_all_words(self): 471 | b = self.last 472 | while 1: 473 | self.i += 1 474 | if self.i == self.array_length: # pointer reached the end; move the start forward one, restart, reset the state 475 | self.begin += 1 476 | if self.begin == self.array_length: 477 | break 478 | self.i = self.begin 479 | b = self.trie.base[0] 480 | 481 | p = b + self.code_array[self.i] + 1 # state transition p = base[char[i-1]] + char[i] + 1 482 | if b == self.trie.check[p]: # base[char[i-1]] == check[base[char[i-1]] + char[i] + 1] 483 | b = self.trie.base[p] # transition succeeded 484 | else: 485 | self.i = self.begin # transition failed; move the start forward one, restart, reset the state 486 | self.begin += 1 487 | if self.begin == self.array_length: 488 | break 489 | b = self.trie.base[0] 490 | continue 491 | p = b 492 | n = self.trie.base[p] 493 | if b == self.trie.check[p] and n < 0: # base[p] == check[p] && base[p] < 0: found a word 494 | self.length = self.i - self.begin + 1 495 | self.index = -n - 1 496 | self.key = self.char_array[self.begin:self.begin + self.length] 497 | self.value = self.trie.get_attr(self.trie.v[self.index]) 498 | self.last = b 499 | yield self.begin, self.key, self.value 500 | return 501 | 502 | 503 | 504 | 505 | # def seek(self,index): 506 | # self.i = index -1 507 | # self.begin = index 508 | # self.last = self.trie.base[0] 509 | 510 | 511 | # class MaxSearcher: 512 | # def __init__(self, trie, chararray, offset=0): 513 | # self.searcher = trie.search(chararray) 514 | # self.textbegin = 0 515 | # self.textend = 0 516 | # 517 | # def next(self): 518 | # prekey = None 519 | # preindex = None 520 | # prebegin = None 521 | # preend = None 522 | # 523 | # while self.searcher.next(): 524 | # if prekey == None or prekey == self.searcher.key[:len(prekey)] : 525 | # prekey = self.searcher.key 526 | # preindex = self.searcher.index 527 | # prebegin = self.searcher.begin 528 | # preend = self.searcher.begin+self.searcher.length 529 | # continue 530 | # else: 531 | # self.key = prekey 532 | # self.value = self.searcher.trie.v[preindex] 533 | # self.textbegin = prebegin 534 | # self.textend = preend 535 | # # move the start to just past the found word 536 | # self.searcher.seek(self.textend) 537 | # return True 538 | # return False 539 | 540 | 541 | 542 |
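# Illustrative sketch (not part of the source file): Searcher scans a text for
# every dictionary word. next() advances to the next hit and exposes it through
# the public members; search_all_words() yields (begin, key, value) triples.
#
#   trie = DoubleArrayTrie.load_from_list([u'ab n 1', u'bc n 1'])
#   for begin, key, value in trie.search(u'abc').search_all_words():
#       print(begin, key)  # finds u'ab' at 0 and u'bc' at 1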
543 | @singleton 544 | class CoreDict: 545 | def __init__(self): 546 | self.trie = DoubleArrayTrie.load(config.CORE_DICT_NAME) 547 | 548 | 549 | def __split_id_attribute(item): 550 | index = item[0] 551 | value = item[1] 552 | if isinstance(value, str): 553 | value = value.split() 554 | if isinstance(value, list): 555 | value = value[1:] 556 | return index, value 557 | 558 | 559 | PERSON_WORD_ID, PERSON_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_PEOPLE)) 560 | PLACE_WORD_ID, PLACE_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_PLACE)) 561 | ORG_WORD_ID, ORG_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_GROUP)) 562 | PROPER_WORD_ID, PROPER_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_PROPER)) 563 | TIME_WORD_ID, TIME_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_TIME)) 564 | NUMBER_WORD_ID, NUMBER_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_NUMBER)) 565 | CLUSTER_WORD_ID, CLUSTER_ATTRIBUTE = __split_id_attribute(CoreDict().trie.get(TAG_CLUSTER)) 566 | 567 | @singleton 568 | class CustomDict: 569 | def __init__(self): 570 | self.trie = DoubleArrayTrie.load(config.CUSTOM_DICT_NAME) 571 | -------------------------------------------------------------------------------- /yaya/collection/hmm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals 3 | import math 4 | 5 | from yaya.common.ns import NS 6 | from yaya import config 7 | from yaya.common.nr import NR 8 | from yaya.common.nt import NT 9 | from yaya.utility.singleton import singleton 10 | 11 | __author__ = 'tony' 12 | 13 | 14 | class HMMMatrix: 15 | def __init__(self): 16 | self.matrix = [] 17 | self.total = None 18 | self.total_freq = 0 19 | self.start_prob = None 20 | self.trans_prob = None 21 | 22 | def get_total_freq(self, nature): 23 | return self.total[nature.index] 24 | 25 | @staticmethod 26 | def load(filename, cls): 27 | with open(filename, 'r') as f: 28 | flist = f.read().splitlines() 29 | labels = flist[0].split(',')[1:] 30 | ord_array = [0] * len(labels) 31 | ord_max = 0 32 | for i in range(len(ord_array)): 33 | ord_array[i] = cls[labels[i]].index 34 | ord_max = max(ord_max, ord_array[i]) 35 | # ord_max is the largest enum index; +1 gives the table size 36 | ord_max += 1 37 | hmm = HMMMatrix() 38 | hmm.matrix = [[0 for col in range(ord_max)] for row in range(ord_max)] 39 | for row in flist[1:]: 40 | params = row.split(',') 41 | cur_ord = cls[params[0]].index 42 | for i in range(len(ord_array)): 43 | hmm.matrix[cur_ord][ord_array[i]] = int(params[1 + i]) 44 | 45 | hmm.total = [0] * ord_max 46 | for j in range(ord_max): 47 | hmm.total[j] = 0 48 | for i in range(ord_max): 49 | hmm.total[j] += hmm.matrix[i][j] 50 | 51 | for j in range(ord_max): 52 | hmm.total[j] += hmm.matrix[j][j] 53 | 54 | for j in range(ord_max): 55 | hmm.total_freq += hmm.total[j] 56 | 57 | # compute the HMM quadruple 58 | states = ord_array 59 | hmm.start_prob = [0] * ord_max 60 | for s in ord_array: 61 | freq = hmm.total[s] + 1e-8 62 | hmm.start_prob[s] = -math.log(freq / hmm.total_freq) 63 | 64 | hmm.trans_prob = [[0 for col in range(ord_max)] for row in range(ord_max)] 65 | for f in ord_array: 66 | for t in ord_array: 67 | freq = hmm.matrix[f][t] + 1e-8 68 | hmm.trans_prob[f][t] = -math.log(freq / hmm.total_freq) 69 | return hmm 70 | 71 | 72 | @singleton 73 | class PersonTranMatrix: 74 | def __init__(self): 75 | self.hmm = HMMMatrix.load(config.PERSON_TR_PATH, NR) 76 | 77 | 78 | @singleton 79 | class OrgTranMatrix: 80 | def __init__(self): 81 | self.hmm = HMMMatrix.load(config.ORG_TR_PATH, NT) 82 | 83 | @singleton 84 | class PlaceTranMatrix: 85 | def __init__(self): 86 | self.hmm = HMMMatrix.load(config.PLACE_TR_PATH, NS) 87 |
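# Illustrative sketch (not part of the source file): start_prob and trans_prob
# hold negative log probabilities, so a role-tag viterbi decoder scores a tag
# sequence by adding costs; the smaller the summed cost, the likelier the sequence.
#
#   hmm = PersonTranMatrix().hmm
#   cost = hmm.start_prob[NR.B.index] + hmm.trans_prob[NR.B.index][NR.C.index]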
-------------------------------------------------------------------------------- /yaya/collection/trie.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | __all__ = ['Trie', 'StringTrie', 'SortedTrie', 'SortedStringTrie', 'Node'] 3 | 4 | import sys 5 | from copy import copy 6 | from operator import itemgetter 7 | try: from collections.abc import MutableMapping 8 | except ImportError: from collections import MutableMapping 9 | # Python 3 interoperability 10 | PY3 = sys.version_info[0] == 3 11 | if PY3: 12 | def itervalues(d): 13 | return d.values() 14 | 15 | 16 | def iteritems(d): 17 | return d.items() 18 | else: 19 | def itervalues(d): 20 | return d.itervalues() 21 | 22 | 23 | def iteritems(d): 24 | return d.iteritems() 25 | 26 | 27 | # Singleton sentinel - works with pickling 28 | class NULL(object): 29 | pass 30 | 31 | 32 | class Node(object): 33 | '''Trie node class. 34 | 35 | Subclasses may extend it to replace :attr:`ChildrenFactory` with a different 36 | mapping class (e.g. `sorteddict `_). 37 | 38 | :ivar value: The value of the key corresponding to this node or :const:`NULL` 39 | if there is no such key. 40 | :ivar children: A ``{key-part : child-node}`` mapping. 41 | ''' 42 | __slots__ = ('value', 'children') 43 | 44 | #: A callable for creating a new :attr:`children` mapping. 45 | ChildrenFactory = dict 46 | 47 | def __init__(self, value=NULL): 48 | self.value = value 49 | self.children = self.ChildrenFactory() 50 | 51 | def numkeys(self): 52 | '''Return the number of keys in the subtree rooted at this node.''' 53 | return (int(self.value is not NULL) + 54 | sum(child.numkeys() for child in itervalues(self.children))) 55 | 56 | def __repr__(self): 57 | return '(%s, {%s})' % ( 58 | self.value is NULL and 'NULL' or repr(self.value), 59 | ', '.join('%r: %r' % t for t in iteritems(self.children))) 60 | 61 | def __copy__(self): 62 | clone = self.__class__(self.value) 63 | clone_children = clone.children 64 | for key, child in iteritems(self.children): 65 | clone_children[key] = child.__copy__() 66 | return clone 67 | 68 | def __getstate__(self): 69 | return (self.value, self.children) 70 | 71 | def __setstate__(self, state): 72 | self.value, self.children = state 73 | 74 | 75 | class Trie(MutableMapping): 76 | '''Base trie class. 77 | 78 | As with regular dicts, keys are not necessarily returned sorted. Use 79 | :class:`SortedTrie` if sorting is required. 80 | ''' 81 | 82 | #: Callable for forming a key from its parts. 83 | KeyFactory = tuple 84 | 85 | #: Callable for creating new trie nodes. 86 | NodeFactory = Node 87 | 88 | def __init__(self, *args, **kwargs): 89 | '''Create a new trie. 90 | 91 | Parameters are the same as ``dict()``. 92 | ''' 93 | self._root = self.NodeFactory() 94 | 95 | self.update(*args, **kwargs) 96 | 97 | @classmethod 98 | def fromkeys(cls, iterable, value=None): 99 | '''Create a new trie with keys from ``iterable`` and values set to ``value``. 100 | 101 | Parameters are the same as ``dict.fromkeys()``. 102 | ''' 103 | d = cls() 104 | for key in iterable: 105 | d[key] = value 106 | return d 107 | 108 | # ----- trie-specific methods ----------------------------------------------- 109 | 110 | def longest_prefix(self, key, default=NULL): 111 | '''Return the longest key in this trie that is a prefix of ``key``.
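For example, in a trie holding the keys ``an`` and ``ant``, ``longest_prefix('antonym')`` is ``'ant'``.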
112 | 113 | If the trie doesn't contain any prefix of ``key``: 114 | - if ``default`` is given, return it 115 | - otherwise raise ``KeyError`` 116 | ''' 117 | try: 118 | return self.longest_prefix_item(key)[0] 119 | except KeyError: 120 | if default is not NULL: 121 | return default 122 | raise 123 | 124 | def longest_prefix_value(self, key, default=NULL): 125 | '''Return the value associated with the longest key in this trie that is 126 | a prefix of ``key``. 127 | 128 | If the trie doesn't contain any prefix of ``key``: 129 | - if ``default`` is given, return it 130 | - otherwise raise ``KeyError`` 131 | ''' 132 | current = self._root 133 | longest_prefix_value = NULL 134 | for part in key: 135 | current = current.children.get(part) 136 | if current is None: 137 | break 138 | value = current.value 139 | if value is not NULL: 140 | longest_prefix_value = value 141 | if longest_prefix_value is not NULL: 142 | return longest_prefix_value 143 | elif default is not NULL: 144 | return default 145 | else: 146 | raise KeyError 147 | 148 | def longest_prefix_item(self, key, default=NULL): 149 | '''Return the item (``(key,value)`` tuple) associated with the longest 150 | key in this trie that is a prefix of ``key``. 151 | 152 | If the trie doesn't contain any prefix of ``key``: 153 | - if ``default`` is given, return it 154 | - otherwise raise ``KeyError`` 155 | ''' 156 | prefix = [] 157 | append = prefix.append 158 | current = self._root 159 | longest_prefix_value = NULL 160 | max_non_null_index = -1 161 | for i, part in enumerate(key): 162 | current = current.children.get(part) 163 | if current is None: 164 | break 165 | append(part) 166 | value = current.value 167 | if value is not NULL: 168 | longest_prefix_value = value 169 | max_non_null_index = i 170 | if longest_prefix_value is not NULL: 171 | del prefix[max_non_null_index + 1:] 172 | return (self.KeyFactory(prefix), longest_prefix_value) 173 | elif default is not NULL: 174 | return default 175 | else: 176 | raise KeyError 177 | 178 | def iter_prefixes(self, key): 179 | 'Return an iterator over the keys of this trie that are prefixes of ``key``.' 180 | key_factory = self.KeyFactory 181 | prefix = [] 182 | append = prefix.append 183 | node = self._root 184 | for part in key: 185 | node = node.children.get(part) 186 | if node is None: 187 | break 188 | append(part) 189 | if node.value is not NULL: 190 | yield key_factory(prefix) 191 | 192 | def iter_prefix_values(self, key): 193 | '''Return an iterator over the values of this trie that are associated 194 | with keys that are prefixes of ``key``. 195 | ''' 196 | node = self._root 197 | for part in key: 198 | node = node.children.get(part) 199 | if node is None: 200 | break 201 | if node.value is not NULL: 202 | yield node.value 203 | 204 | def iter_prefix_items(self, key): 205 | '''Return an iterator over the items (``(key,value)`` tuples) of this 206 | trie that are associated with keys that are prefixes of ``key``. 207 | ''' 208 | key_factory = self.KeyFactory 209 | prefix = [] 210 | append = prefix.append 211 | node = self._root 212 | for part in key: 213 | node = node.children.get(part) 214 | if node is None: 215 | break 216 | append(part) 217 | if node.value is not NULL: 218 | yield (key_factory(prefix), node.value) 219 | 220 | # ----- extended mapping API methods ---------------------------------------- 221 | 222 | def keys(self, prefix=None): 223 | '''Return a list of this trie's keys. 224 | 225 | :param prefix: If not None, return only the keys prefixed by ``prefix``. 
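Example (illustrative):

>>> SortedStringTrie(an=1, ant=2, all=3).keys(prefix='an')
['an', 'ant']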
226 | ''' 227 | return list(self.iterkeys(prefix)) 228 | 229 | def values(self, prefix=None): 230 | '''Return a list of this trie's values. 231 | 232 | :param prefix: If not None, return only the values associated with keys 233 | prefixed by ``prefix``. 234 | ''' 235 | return list(self.itervalues(prefix)) 236 | 237 | def items(self, prefix=None): 238 | '''Return a list of this trie's items (``(key,value)`` tuples). 239 | 240 | :param prefix: If not None, return only the items associated with keys 241 | prefixed by ``prefix``. 242 | ''' 243 | return list(self.iteritems(prefix)) 244 | 245 | def iterkeys(self, prefix=None): 246 | '''Return an iterator over this trie's keys. 247 | 248 | :param prefix: If not None, yield only the keys prefixed by ``prefix``. 249 | ''' 250 | return (key for key, value in self.iteritems(prefix)) 251 | 252 | def itervalues(self, prefix=None): 253 | '''Return an iterator over this trie's values. 254 | 255 | :param prefix: If not None, yield only the values associated with keys 256 | prefixed by ``prefix``. 257 | ''' 258 | 259 | def generator(node, NULL=NULL): 260 | if node.value is not NULL: 261 | yield node.value 262 | for part, child in iteritems(node.children): 263 | for subresult in generator(child): 264 | yield subresult 265 | 266 | if prefix is None: 267 | node = self._root 268 | else: 269 | node = self._find(prefix) 270 | if node is None: 271 | node = self.NodeFactory() 272 | return generator(node) 273 | 274 | def iteritems(self, prefix=None): 275 | '''Return an iterator over this trie's items (``(key,value)`` tuples). 276 | 277 | :param prefix: If not None, yield only the items associated with keys 278 | prefixed by ``prefix``. 279 | ''' 280 | parts = [] 281 | append = parts.append 282 | 283 | def generator(node, key_factory=self.KeyFactory, parts=parts, 284 | append=append, NULL=NULL): 285 | if node.value is not NULL: 286 | yield (key_factory(parts), node.value) 287 | for part, child in iteritems(node.children): 288 | append(part) 289 | for subresult in generator(child): 290 | yield subresult 291 | del parts[-1] 292 | 293 | node = self._root 294 | if prefix is not None: 295 | for part in prefix: 296 | append(part) 297 | node = node.children.get(part) 298 | if node is None: 299 | node = self.NodeFactory() 300 | break 301 | return generator(node) 302 | 303 | # ----- original mapping API methods ---------------------------------------- 304 | 305 | def __len__(self): 306 | return self._root.numkeys() 307 | 308 | def __iter__(self): 309 | return self.iterkeys() 310 | 311 | def __contains__(self, key): 312 | node = self._find(key) 313 | return node is not None and node.value is not NULL 314 | 315 | def has_key(self, key): 316 | return key in self 317 | 318 | def __getitem__(self, key): 319 | node = self._find(key) 320 | if node is None or node.value is NULL: 321 | raise KeyError 322 | return node.value 323 | 324 | def __setitem__(self, key, value): 325 | node = self._root 326 | Node = self.NodeFactory 327 | for part in key: 328 | next = node.children.get(part) 329 | if next is None: 330 | node = node.children.setdefault(part, Node()) 331 | else: 332 | node = next 333 | node.value = value 334 | 335 | def __delitem__(self, key): 336 | nodes_parts = [] 337 | append = nodes_parts.append 338 | node = self._root 339 | for part in key: 340 | append((node, part)) 341 | node = node.children.get(part) 342 | if node is None: 343 | break 344 | if node is None or node.value is NULL: 345 | raise KeyError 346 | node.value = NULL 347 | pop = nodes_parts.pop 348 | while 
node.value is NULL and not node.children and nodes_parts: 349 | node, part = pop() 350 | del node.children[part] 351 | 352 | def clear(self): 353 | self._root.children.clear() 354 | 355 | def copy(self): 356 | clone = copy(super(Trie, self)) 357 | clone._root = copy(self._root) 358 | return clone 359 | 360 | def __repr__(self): 361 | return '%s({%s})' % ( 362 | self.__class__.__name__, 363 | ', '.join('%r: %r' % t for t in self.iteritems())) 364 | 365 | def _find(self, key): 366 | node = self._root 367 | for part in key: 368 | node = node.children.get(part) 369 | if node is None: 370 | break 371 | return node 372 | 373 | 374 | class StringTrie(Trie): 375 | '''A :class:`Trie` more appropriate for string keys.''' 376 | KeyFactory = ''.join 377 | 378 | 379 | # XXX: quick & dirty sorted dict 380 | # currently only iteritems() (for Python 2) or items() (for Python 3) has to be 381 | # overridden. However this is an implementation detail that may change in the future 382 | class _SortedDict(dict): 383 | if PY3: 384 | def items(self): 385 | return iter(sorted(dict.items(self), key=itemgetter(0))) 386 | else: 387 | def iteritems(self): 388 | return iter(sorted(dict.iteritems(self), key=itemgetter(0))) 389 | 390 | 391 | class _SortedNode(Node): 392 | ChildrenFactory = _SortedDict 393 | 394 | 395 | class SortedTrie(Trie): 396 | '''A :class:`Trie` that returns its keys (and associated values/items) sorted. 397 | 398 | .. note:: 399 | This implementation does not keep the keys sorted internally; instead it 400 | sorts them every time a method returning a list or iterator (e.g. 401 | :meth:`keys`) is called. In cases where a trie is relatively stable 402 | (few inserts/deletes) and is iterated often, it is probably more efficient 403 | to use a :attr:`NodeFactory` based on a sorted dict such as 404 | `sorteddict `_. 405 | ''' 406 | NodeFactory = _SortedNode 407 | 408 | 409 | class SortedStringTrie(SortedTrie, StringTrie): 410 | 'A :class:`Trie` that is both a :class:`StringTrie` and a :class:`SortedTrie`.' 411 | 412 | 413 | if __name__ == '__main__': 414 | import doctest 415 | 416 | doctest.testmod() 417 | -------------------------------------------------------------------------------- /yaya/common/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /yaya/common/enum.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | class EnumException(Exception): 3 | """ Base class for all exceptions in this module. """ 4 | 5 | def __init__(self, *args, **kwargs): 6 | if self.__class__ is EnumException: 7 | raise NotImplementedError( 8 | "%s is an abstract base class" % self.__class__.__name__) 9 | super(EnumException, self).__init__(*args, **kwargs) 10 | 11 | 12 | class EnumEmptyError(AssertionError, EnumException): 13 | """ Raised when attempting to create an empty enumeration. """ 14 | 15 | def __str__(self): 16 | return "Enumerations cannot be empty" 17 | 18 | 19 | class EnumBadKeyError(TypeError, EnumException): 20 | """ Raised when creating an Enum with non-string keys. """ 21 | 22 | def __init__(self, key): 23 | self.key = key 24 | 25 | def __str__(self): 26 | return "Enumeration keys must be strings: %(key)r" % vars(self) 27 | 28 | 29 | class EnumImmutableError(TypeError, EnumException): 30 | """ Raised when attempting to modify an Enum.
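For example, both ``enum_instance.new_key = 1`` and ``del enum_instance[0]`` raise this error.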
""" 31 | 32 | def __init__(self, *args): 33 | self.args = args 34 | 35 | def __str__(self): 36 | return "Enumeration does not allow modification" 37 | 38 | 39 | def _comparator(func): 40 | """ Decorator for EnumValue rich comparison methods. """ 41 | 42 | def comparator_wrapper(self, other): 43 | try: 44 | assert self.enumtype == other.enumtype 45 | result = func(self.index, other.index) 46 | except (AssertionError, AttributeError): 47 | result = NotImplemented 48 | 49 | return result 50 | 51 | comparator_wrapper.__name__ = func.__name__ 52 | comparator_wrapper.__doc__ = getattr(float, func.__name__).__doc__ 53 | return comparator_wrapper 54 | 55 | 56 | class EnumValue(object): 57 | """ A specific value of an enumerated type. """ 58 | 59 | def __init__(self, enumtype, index, key): 60 | """ Set up a new instance. """ 61 | self._enumtype = enumtype.enum_name 62 | self._index = index 63 | self._key = key 64 | 65 | @property 66 | def enumtype(self): 67 | return self._enumtype 68 | 69 | @property 70 | def key(self): 71 | return self._key 72 | 73 | def __str__(self): 74 | return str(self.key) 75 | 76 | @property 77 | def index(self): 78 | return self._index 79 | 80 | def __repr__(self): 81 | return "EnumValue(%(_enumtype)r, %(_index)r, %(_key)r)" % vars(self) 82 | 83 | def __hash__(self): 84 | return hash(self._index) 85 | 86 | @_comparator 87 | def __eq__(self, other): 88 | return self == other 89 | 90 | @_comparator 91 | def __ne__(self, other): 92 | return self != other 93 | 94 | @_comparator 95 | def __lt__(self, other): 96 | return self < other 97 | 98 | @_comparator 99 | def __le__(self, other): 100 | return self <= other 101 | 102 | @_comparator 103 | def __gt__(self, other): 104 | return self > other 105 | 106 | @_comparator 107 | def __ge__(self, other): 108 | return self >= other 109 | 110 | 111 | class Enum(object): 112 | """ Enumerated type. """ 113 | 114 | def __init__(self, *keys, **kwargs): 115 | """ Create an enumeration instance. 
""" 116 | 117 | value_type = kwargs.get('value_type', EnumValue) 118 | enum_name = kwargs.get('enum_name', None) 119 | assert enum_name is not None 120 | self.__dict__['enum_name'] = enum_name 121 | if not keys: 122 | raise EnumEmptyError() 123 | 124 | keys = tuple(keys) 125 | values = [None] * len(keys) 126 | 127 | for i, key in enumerate(keys): 128 | value = value_type(self, i, key) 129 | values[i] = value 130 | try: 131 | super(Enum, self).__setattr__(key, value) 132 | except TypeError: 133 | raise EnumBadKeyError(key) 134 | 135 | self.__dict__['_keys'] = keys 136 | self.__dict__['_values'] = values 137 | 138 | def __setattr__(self, name, value): 139 | raise EnumImmutableError(name) 140 | 141 | def __delattr__(self, name): 142 | raise EnumImmutableError(name) 143 | 144 | def __len__(self): 145 | return len(self._values) 146 | 147 | def __getitem__(self, index): 148 | # tony 添加,添加从字符型枚举名到变量值的转换 149 | if isinstance(index, str) or isinstance(index, unicode) : 150 | return self.__getattribute__(index) 151 | else: 152 | return self._values[index] 153 | 154 | def __setitem__(self, index, value): 155 | raise EnumImmutableError(index) 156 | 157 | def __delitem__(self, index): 158 | raise EnumImmutableError(index) 159 | 160 | def __iter__(self): 161 | return iter(self._values) 162 | 163 | def __contains__(self, value): 164 | is_member = False 165 | if isinstance(value, basestring): 166 | is_member = (value in self._keys) 167 | else: 168 | is_member = (value in self._values) 169 | return is_member 170 | -------------------------------------------------------------------------------- /yaya/common/nature.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.common.enum import Enum 3 | 4 | __author__ = 'tony' 5 | # 词性 6 | 7 | NATURE = Enum( 8 | "bg", # u"区别语素", 9 | "mg", # u"数语素", 10 | "nl", # u"名词性惯用语", 11 | "nx", # u"字母专名", 12 | "qg", # u"量词语素", 13 | "ud", # u"助词", 14 | "uj", # u"助词", 15 | "uz", # u"着", 16 | "ug", # u"过", 17 | "ul", # u"连词", 18 | "uv", # u"连词", 19 | "yg", # u"语气语素", 20 | "zg", # u"状态词", 21 | "n", # u"名词", 22 | "nr", # u"人名", 23 | "nrj", # u"日语人名", 24 | "nrf", # u"音译人名", 25 | "nr1", # u"复姓", 26 | "nr2", # u"蒙古姓名", 27 | "ns", # u"地名", 28 | "nsf", # u"音译地名", 29 | "nt", # u"机构团体名", 30 | "ntc", # u"公司名", 31 | "ntcf", # u"工厂", 32 | "ntcb", # u"银行", 33 | "ntch", # u"酒店宾馆", 34 | "nto", # u"政府机构", 35 | "ntu", # u"大学", 36 | "nts", # u"中小学", 37 | "nth", # u"医院", 38 | "nh", # u"医药疾病等健康相关名词", 39 | "nhm", # u"药品", 40 | "nhd", # u"疾病", 41 | "nn", # u"工作相关名词", 42 | "nnt", # u"职务职称", 43 | "nnd", # u"职业", 44 | "ng", # u"名词性语素", 45 | "nf", # u"食品", 46 | "ni", # u"机构相关", 47 | "nit", # u"教育相关机构", 48 | "nic", # u"下属机构", 49 | "nis", # u"机构后缀", 50 | "nm", # u"物品名", 51 | "nmc", # u"化学品名", 52 | "nb", # u"生物名", 53 | "nba", # u"动物名", 54 | "nbc", # u"动物纲目", 55 | "nbp", # u"植物名", 56 | "nz", # u"其他专名", 57 | "g", # u"学术词汇", 58 | "gm", # u"数学相关词汇", 59 | "gp", # u"物理相关词汇", 60 | "gc", # u"化学相关词汇", 61 | "gb", # u"生物相关词汇", 62 | "gbc", # u"生物类别", 63 | "gg", # u"地理地质相关词汇", 64 | "gi", # u"计算机相关词汇", 65 | "j", # u"简称略语", 66 | "i", # u"成语", 67 | "l", # u"习用语", 68 | "t", # u"时间词", 69 | "tg", # u"时间词性语素", 70 | "s", # u"处所词", 71 | "f", # u"方位词", 72 | "v", # u"动词", 73 | "vd", # u"副动词", 74 | "vn", # u"名动词", 75 | "vshi", # u"动词", 76 | "vyou", # u"动词", 77 | "vf", # u"趋向动词", 78 | "vx", # u"形式动词", 79 | "vi", # u"不及物动词", 80 | "vl", # u"动词性惯用语", 81 | "vg", # u"动词性语素", 82 | "a", # u"形容词", 83 | "ad", # u"副形词", 84 | "an", # u"名形词", 85 | "ag", # u"形容词性语素", 86 | "al", # 
u"形容词性惯用语", 87 | "b", # u"区别词", 88 | "bl", # u"区别词性惯用语", 89 | "z", # u"状态词", 90 | "r", # u"代词", 91 | "rr", # u"人称代词", 92 | "rz", # u"指示代词", 93 | "rzt", # u"时间指示代词", 94 | "rzs", # u"处所指示代词", 95 | "rzv", # u"谓词性指示代词", 96 | "ry", # u"疑问代词", 97 | "ryt", # u"时间疑问代词", 98 | "rys", # u"处所疑问代词", 99 | "ryv", # u"谓词性疑问代词", 100 | "rg", # u"代词性语素", 101 | "Rg", # u"古汉语代词性语素", 102 | "m", # u"数词", 103 | "mq", # u"数量词", 104 | "Mg", # u"甲乙丙丁之类的数词", 105 | "q", # u"量词", 106 | "qv", # u"动量词", 107 | "qt", # u"时量词", 108 | "d", # u"副词", 109 | "dg", # u"辄", 110 | "dl", # u"连语", 111 | "p", # u"介词", 112 | "pba", # u"介词", 113 | "pbei", # u"介词", 114 | "c", # u"连词", 115 | "cc", # u"并列连词", 116 | "u", # u"助词", 117 | "uzhe", # u"着", 118 | "ule", # u"了 ", 119 | "uguo", # u"过", 120 | "ude1", # u"的 ", 121 | "ude2", # u"地", 122 | "ude3", # u"得", 123 | "usuo", # u"所", 124 | "udeng", # u"等 ", 125 | "uyy", # u"一样 ", 126 | "udh", # u"的话", 127 | "uls", # u"来讲 ", 128 | "uzhi", # u"之", 129 | "ulian", # u"连 ", 130 | "e", # u"叹词", 131 | "y", # u"语气词", 132 | "o", # u"拟声词", 133 | "h", # u"前缀", 134 | "k", # u"后缀", 135 | "x", # u"字符串", 136 | "xx", # u"非语素字", 137 | "xu", # u"网址", 138 | "w", # u"标点符号", 139 | "wkz", # u"左括号", 140 | "wky", # u"右括号", 141 | "wyz", # u"左引号", 142 | "wyy", # u"右引号", 143 | "wj", # u"句号", 144 | "ww", # u"问号", 145 | "wt", # u"叹号", 146 | "wd", # u"逗号", 147 | "wf", # u"分号", 148 | "wn", # u"顿号", 149 | "wm", # u"冒号", 150 | "ws", # u"省略号", 151 | "wp", # u"破折号", 152 | "wb", # u"百分号千分号", 153 | "wh", # u"单位符号", 154 | "end", # u"仅用于始", 155 | "begin", # u"仅用于终" 156 | enum_name="NATURE" # 如果不指定,enum持久化时按id来判断类型的相等 157 | ) 158 | 159 | -------------------------------------------------------------------------------- /yaya/common/nr.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals 3 | from yaya.common.enum import Enum 4 | 5 | __author__ = 'tony' 6 | 7 | NR = Enum( 8 | 9 | # Pf 姓氏 【张】华平先生 10 | 11 | 'B', 12 | 13 | # Pm 双名的首字 张【华】平先生 14 | 15 | 'C', 16 | 17 | # Pt 双名的末字 张华【平】先生 18 | 19 | 'D', 20 | 21 | # Ps 单名 张【浩】说:“我是一个好人” 22 | 23 | 'E', 24 | 25 | # Ppf 前缀 【老】刘、【小】李 26 | 27 | 'F', 28 | 29 | # Plf 后缀 王【总】、刘【老】、肖【氏】、吴【妈】、叶【帅】 30 | 31 | 'G', 32 | 33 | # Pp 人名的上文 又【来到】于洪洋的家。 34 | 35 | 'K', 36 | 37 | # Pn 人名的下文 新华社记者黄文【摄】 38 | 39 | 'L', 40 | 41 | # Ppn 两个中国人名之间的成分 编剧邵钧林【和】稽道青说 42 | 43 | 'M', 44 | 45 | # Ppf 人名的上文和姓成词 这里【有关】天培的壮烈 46 | 47 | 'U', 48 | 49 | # Pnw 三字人名的末字和下文成词 龚学平等领导, 邓颖【超生】前 50 | 51 | 'V', 52 | 53 | # Pfm 姓与双名的首字成词 【王国】维、 54 | 55 | 'X', 56 | 57 | # Pfs 姓与单名成词 【高峰】、【汪洋】 58 | 59 | 'Y', 60 | 61 | # Pmt 双名本身成词 张【朝阳】 62 | 63 | 'Z', 64 | 65 | # Po 以上之外其他的角色 66 | 67 | 'A', 68 | 69 | # 句子的开头 70 | 71 | 'S', 72 | enum_name="NR" 73 | 74 | ) 75 | 76 | NRPattern = [ 77 | 'BBCD', 78 | 'BBE', 79 | 'BBZ', 80 | 'BCD', 81 | 'BEE', 82 | 'BE', 83 | 'BC', 84 | 'BEC', 85 | 'BG', 86 | 'DG', 87 | 'EG', 88 | 'BXD', 89 | 'BZ', 90 | 'EE', 91 | 'FE', 92 | 'FC', 93 | 'FB', 94 | 'FG', 95 | 'Y', 96 | 'XD', 97 | 'GD', 98 | ] -------------------------------------------------------------------------------- /yaya/common/ns.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals 3 | from yaya.common.enum import Enum 4 | 5 | __author__ = 'tony' 6 | 7 | NS = Enum( 8 | 'A', # 地名的上文 我【来到】中关园 9 | 'B', # 地名的下文刘家村/和/下岸村/相邻 10 | 'C', # 中国地名的第一个字 11 | 'D', # 中国地名的第二个字 12 | 'E', # 中国地名的第三个字 13 | 'G', # 其他整个的地名 14 | 'H', # 中国地名的后缀海/淀区 15 | 'X', # 连接词刘家村/和/下岸村/相邻 16 | 'Z', # 其它非地名成分 17 
| 'S', # 句子的开头 18 | enum_name="NS" 19 | ) 20 | 21 | NSPattern = [ 22 | "CH", 23 | "CDH", 24 | "CDEH", 25 | "GH" 26 | ] 27 | -------------------------------------------------------------------------------- /yaya/common/nt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals 3 | from yaya.common.enum import Enum 4 | 5 | __author__ = 'tony' 6 | 7 | NT = Enum( 8 | 'A', # 上文 [参与]亚太经合组织的活动 9 | 'B', # 下文 中央电视台[报道] 10 | 'X', # 连接词 北京电视台[和]天津电视台 11 | 'C', # 特征词的一般性前缀 北京[电影]学院 12 | 'F', # 特征词的译名性前缀 美国[摩托罗拉]公司 13 | 'G', # 特征词的地名性前缀 交通银行[北京]分行 14 | 'H', # 特征词的机构名前缀 [中共中央]顾问委员会 15 | 'I', # 特征词的特殊性前缀 [华谊]医院 16 | 'J', # 特征词的简称性前缀 [巴]政府 17 | 'K', # 整个机构 [麦当劳] 18 | 'L', # 方位词 19 | 'M', # 数词 公交集团[五]分公司 20 | 'P', # 单字碎片 21 | 'W', # 符号 22 | 'D', # 机构名的特征词 国务院侨务[办公室] 23 | 'Z', # 非机构名成份 24 | 'S', # 句子的开头 25 | enum_name="NT" 26 | ) 27 | 28 | NTPattern = [ 29 | "CCCCCCCCD", 30 | "CCCCCCCD", 31 | "CCCCCCD", 32 | "CCCCCCGD", 33 | "CCCCCCICCCCD", 34 | "CCCCCCPD", 35 | "CCCCCD", 36 | "CCCCCDD", 37 | "CCCCCGCCD", 38 | "CCCCCICCCCCD", 39 | "CCCCCPCCD", 40 | "CCCCCWDWD", 41 | "CCCCD", 42 | "CCCCDCCD", 43 | "CCCCDCD", 44 | "CCCCDD", 45 | "CCCCID", 46 | "CCCCPCD", 47 | "CCCD", 48 | "CCCDCCCD", 49 | "CCCDCCD", 50 | "CCCDCD", 51 | "CCCDD", 52 | "CCCDICFPD", 53 | "CCCFCFFCD", 54 | "CCCGD", 55 | "CCCGID", 56 | "CCCGJCD", 57 | "CCCID", 58 | "CCCJCCD", 59 | "CCCJD", 60 | "CCCLGCD", 61 | "CCCMD", 62 | "CCCPCCCD", 63 | "CCCPCCD", 64 | "CCCPCD", 65 | "CCCPD", 66 | "CCD", 67 | "CCDCCCCCCD", 68 | "CCDCCCCD", 69 | "CCDCCCD", 70 | "CCDCCCDD", 71 | "CCDCCD", 72 | "CCDCD", 73 | "CCDCDD", 74 | "CCDCGCD", 75 | "CCDCGCDID", 76 | "CCDCGCDPD", 77 | "CCDCGGDD", 78 | "CCDCID", 79 | "CCDCJCCD", 80 | "CCDCJCCDD", 81 | "CCDD", 82 | "CCDDD", 83 | "CCDFIDGD", 84 | "CCDGCCD", 85 | "CCDICD", 86 | "CCDID", 87 | "CCDJCD", 88 | "CCDPCD", 89 | "CCDPJD", 90 | "CCFCCD", 91 | "CCFD", 92 | "CCGCCCD", 93 | "CCGCCD", 94 | "CCGCD", 95 | "CCGCDCD", 96 | "CCGCDCMD", 97 | "CCGD", 98 | "CCGGCD", 99 | "CCGID", 100 | "CCGIDD", 101 | "CCGJD", 102 | "CCGWGWD", 103 | "CCICCD", 104 | "CCICD", 105 | "CCICIFD", 106 | "CCICJPD", 107 | "CCID", 108 | "CCIDCD", 109 | "CCIDD", 110 | "CCIID", 111 | "CCJCCCD", 112 | "CCJCCD", 113 | "CCJCD", 114 | "CCJCFD", 115 | "CCJD", 116 | "CCJID", 117 | "CCJJMJD", 118 | "CCKID", 119 | "CCLD", 120 | "CCMD", 121 | "CCMMPDCD", 122 | "CCPCCD", 123 | "CCPCD", 124 | "CCPD", 125 | "CCPDCD", 126 | "CCPPD", 127 | "CCWCWD", 128 | "CCWGWCCD", 129 | "CCWGWD", 130 | "CD", 131 | "CDCCCCCCD", 132 | "CDCCCCD", 133 | "CDCCCD", 134 | "CDCCD", 135 | "CDCCDD", 136 | "CDCCJD", 137 | "CDCD", 138 | "CDCDD", 139 | "CDCGD", 140 | "CDCGPCCD", 141 | "CDCJD", 142 | "CDCLD", 143 | "CDCWIWD", 144 | "CDD", 145 | "CDDCCD", 146 | "CDDCCDD", 147 | "CDDCD", 148 | "CDDD", 149 | "CDFD", 150 | "CDFPCCD", 151 | "CDGCD", 152 | "CDGCICD", 153 | "CDGD", 154 | "CDICD", 155 | "CDID", 156 | "CDILLCCD", 157 | "CDJCCD", 158 | "CDJCD", 159 | "CDJD", 160 | "CDJLD", 161 | "CDLGCD", 162 | "CDLJD", 163 | "CDMCD", 164 | "CDPCCCCD", 165 | "CDPCCD", 166 | "CDPD", 167 | "CDPPD", 168 | "CFCCD", 169 | "CFCPD", 170 | "CFD", 171 | "CFPD", 172 | "CGCCCD", 173 | "CGCCD", 174 | "CGCD", 175 | "CGCDCD", 176 | "CGCDD", 177 | "CGD", 178 | "CGDCD", 179 | "CGDD", 180 | "CGDDCCD", 181 | "CGDDD", 182 | "CGDDID", 183 | "CGDJD", 184 | "CGDMD", 185 | "CGFD", 186 | "CGGCCCD", 187 | "CGGCCD", 188 | "CGGCD", 189 | "CGGD", 190 | "CGGGD", 191 | "CGGGDD", 192 | "CGGICD", 193 | "CGGJD", 194 | "CGICD", 195 | "CGID", 196 | 
"CGIJD", 197 | "CGJD", 198 | "CGMD", 199 | "CGPJD", 200 | "CICCCCD", 201 | "CICCD", 202 | "CICD", 203 | "CICDCD", 204 | "CICDD", 205 | "CICWGWD", 206 | "CID", 207 | "CIDD", 208 | "CIGCD", 209 | "CIGD", 210 | "CIID", 211 | "CILCD", 212 | "CIMD", 213 | "CJCCCCCD", 214 | "CJCCCD", 215 | "CJCCCDD", 216 | "CJCCD", 217 | "CJCCMD", 218 | "CJCD", 219 | "CJCDD", 220 | "CJCGCCD", 221 | "CJCGPJD", 222 | "CJCMD", 223 | "CJCPCCCD", 224 | "CJCPD", 225 | "CJD", 226 | "CJDCCCCD", 227 | "CJDCCJD", 228 | "CJDCD", 229 | "CJDD", 230 | "CJDFD", 231 | "CJDPD", 232 | "CJFCD", 233 | "CJFD", 234 | "CJGD", 235 | "CJGLD", 236 | "CJGPCJD", 237 | "CJID", 238 | "CJJCCD", 239 | "CJJD", 240 | "CJJJD", 241 | "CJJLD", 242 | "CJKD", 243 | "CJLCCD", 244 | "CJMCD", 245 | "CJMD", 246 | "CJPD", 247 | "CJWCCWCGJD", 248 | "CJWD", 249 | "CJWPMWCGD", 250 | "CKCD", 251 | "CKD", 252 | "CKJCDCD", 253 | "CKJPD", 254 | "CLCCCD", 255 | "CLCCD", 256 | "CLCCGCD", 257 | "CLCD", 258 | "CLD", 259 | "CLDFD", 260 | "CLID", 261 | "CLPCD", 262 | "CMCD", 263 | "CMCDD", 264 | "CMCGD", 265 | "CMD", 266 | "CMDCD", 267 | "CMDD", 268 | "CMMD", 269 | "CMMDCCD", 270 | "CMPD", 271 | "CPCCCCCCCD", 272 | "CPCCCCD", 273 | "CPCCCD", 274 | "CPCCD", 275 | "CPCD", 276 | "CPCDD", 277 | "CPCPD", 278 | "CPD", 279 | "CPDCCD", 280 | "CPDCD", 281 | "CPDD", 282 | "CPDGD", 283 | "CPDWGWD", 284 | "CPGCD", 285 | "CPGD", 286 | "CPID", 287 | "CPJCD", 288 | "CPJD", 289 | "CPJPD", 290 | "CPMD", 291 | "CPPD", 292 | "CWCD", 293 | "CWCGWCCD", 294 | "CWCWD", 295 | "CWDWDD", 296 | "CWGWCCD", 297 | "CWGWCD", 298 | "CWPWD", 299 | "DCCCCCD", 300 | "DCCCCD", 301 | "DCCCCDCCD", 302 | "DCCCD", 303 | "DCCD", 304 | "DCD", 305 | "DCDD", 306 | "DCGCD", 307 | "DCJD", 308 | "DCPD", 309 | "DD", 310 | "DDCCD", 311 | "DDCD", 312 | "DDD", 313 | "DDICCD", 314 | "DFD", 315 | "DGCCD", 316 | "DGCD", 317 | "DGD", 318 | "DGDCD", 319 | "DGDD", 320 | "DGDPD", 321 | "DGGD", 322 | "DICCCD", 323 | "DICD", 324 | "DID", 325 | "DIICD", 326 | "DJCCD", 327 | "DJCD", 328 | "DJD", 329 | "DLCCD", 330 | "DLCD", 331 | "DLD", 332 | "DMCD", 333 | "DMD", 334 | "DMMCD", 335 | "DPD", 336 | "DPMMCCD", 337 | "FCCCCCD", 338 | "FCCCCD", 339 | "FCCCD", 340 | "FCCCPCD", 341 | "FCCD", 342 | "FCCGD", 343 | "FCCID", 344 | "FCCPD", 345 | "FCCWGWD", 346 | "FCD", 347 | "FCDCD", 348 | "FCDD", 349 | "FCDFD", 350 | "FCFCD", 351 | "FCFPD", 352 | "FCGCCD", 353 | "FCGCD", 354 | "FCGD", 355 | "FCID", 356 | "FCIJJD", 357 | "FCJCD", 358 | "FCJD", 359 | "FCPD", 360 | "FCPGCD", 361 | "FCWGWD", 362 | "FD", 363 | "FDCD", 364 | "FDD", 365 | "FDFD", 366 | "FDGCCD", 367 | "FDID", 368 | "FDLCD", 369 | "FFCCD", 370 | "FFCD", 371 | "FFCKFCCD", 372 | "FFCLLD", 373 | "FFD", 374 | "FFFD", 375 | "FFGCCD", 376 | "FFGD", 377 | "FFJCD", 378 | "FFJD", 379 | "FFJPCD", 380 | "FFPD", 381 | "FGCCD", 382 | "FGCD", 383 | "FGCGCGCJCD", 384 | "FGD", 385 | "FGDD", 386 | "FGFD", 387 | "FGJCCD", 388 | "FICCD", 389 | "FICD", 390 | "FICDD", 391 | "FICGD", 392 | "FICID", 393 | "FID", 394 | "FIDCD", 395 | "FIDD", 396 | "FIFPD", 397 | "FIID", 398 | "FIJCD", 399 | "FIJD", 400 | "FJCCD", 401 | "FJCD", 402 | "FJCDD", 403 | "FJD", 404 | "FJDCD", 405 | "FJDD", 406 | "FJGD", 407 | "FJJCCD", 408 | "FJJCD", 409 | "FJJCLCD", 410 | "FJJD", 411 | "FJJJCCD", 412 | "FJJJD", 413 | "FJJJICCD", 414 | "FJJLJLCD", 415 | "FJPJD", 416 | "FKCD", 417 | "FKCJD", 418 | "FLD", 419 | "FLPCD", 420 | "FMD", 421 | "FPCCCD", 422 | "FPCD", 423 | "FPD", 424 | "FPFD", 425 | "FPFDD", 426 | "FPID", 427 | "FPJCCD", 428 | "FPJCD", 429 | "FPPCD", 430 | "FPPD", 431 | "FPPDLD", 432 | "FWCCCWCD", 433 | "FWCCCWD", 
434 | "FWDWD", 435 | "FWFD", 436 | "FWFWCCCWD", 437 | "FWGJCD", 438 | "FWGWCD", 439 | "GCCCCCCCD", 440 | "GCCCCCCD", 441 | "GCCCCCD", 442 | "GCCCCCDCD", 443 | "GCCCCCDD", 444 | "GCCCCD", 445 | "GCCCCDCCD", 446 | "GCCCCDD", 447 | "GCCCCGD", 448 | "GCCCCJD", 449 | "GCCCCPD", 450 | "GCCCCWDWD", 451 | "GCCCD", 452 | "GCCCDCCCD", 453 | "GCCCDCCCDD", 454 | "GCCCDCCD", 455 | "GCCCDCD", 456 | "GCCCDD", 457 | "GCCCDDJD", 458 | "GCCCDID", 459 | "GCCCDMCD", 460 | "GCCCDPD", 461 | "GCCCDWGCDWD", 462 | "GCCCFCD", 463 | "GCCCGD", 464 | "GCCCICD", 465 | "GCCCID", 466 | "GCCCJCD", 467 | "GCCCJD", 468 | "GCCCJGD", 469 | "GCCCLD", 470 | "GCCCMD", 471 | "GCCCPCCD", 472 | "GCCCWDWD", 473 | "GCCD", 474 | "GCCDCCCCD", 475 | "GCCDCCCD", 476 | "GCCDCCCDCD", 477 | "GCCDCCD", 478 | "GCCDCD", 479 | "GCCDCID", 480 | "GCCDCJCD", 481 | "GCCDCPCD", 482 | "GCCDD", 483 | "GCCDDCCCD", 484 | "GCCDDCCD", 485 | "GCCDDD", 486 | "GCCDFD", 487 | "GCCDGCCD", 488 | "GCCDGD", 489 | "GCCDGGDCD", 490 | "GCCDID", 491 | "GCCDJCD", 492 | "GCCDJD", 493 | "GCCDLDD", 494 | "GCCDLJCD", 495 | "GCCDMJD", 496 | "GCCDMJMMCD", 497 | "GCCDMJMMD", 498 | "GCCDMMD", 499 | "GCCDPD", 500 | "GCCFCD", 501 | "GCCFDD", 502 | "GCCFJPD", 503 | "GCCFPD", 504 | "GCCGCCCD", 505 | "GCCGCCD", 506 | "GCCGCD", 507 | "GCCGCDD", 508 | "GCCGD", 509 | "GCCGGCGD", 510 | "GCCGGDD", 511 | "GCCICCDCCD", 512 | "GCCICD", 513 | "GCCID", 514 | "GCCIDD", 515 | "GCCJCCCD", 516 | "GCCJCCCID", 517 | "GCCJCCD", 518 | "GCCJCD", 519 | "GCCJCJD", 520 | "GCCJD", 521 | "GCCJICD", 522 | "GCCJID", 523 | "GCCJPCD", 524 | "GCCJPD", 525 | "GCCKD", 526 | "GCCLCCD", 527 | "GCCLCD", 528 | "GCCLCGCD", 529 | "GCCLD", 530 | "GCCMCD", 531 | "GCCMD", 532 | "GCCMPD", 533 | "GCCPCCCCD", 534 | "GCCPCCCID", 535 | "GCCPCCD", 536 | "GCCPCD", 537 | "GCCPD", 538 | "GCCPDD", 539 | "GCCPFWCJD", 540 | "GCCPJD", 541 | "GCCWCCWCD", 542 | "GCCWCDWCD", 543 | "GCCWDWCCD", 544 | "GCCWDWD", 545 | "GCD", 546 | "GCDCCCCD", 547 | "GCDCCCCPD", 548 | "GCDCCCD", 549 | "GCDCCD", 550 | "GCDCCDCD", 551 | "GCDCCDD", 552 | "GCDCCDID", 553 | "GCDCCJCD", 554 | "GCDCCJD", 555 | "GCDCD", 556 | "GCDCDD", 557 | "GCDCDICD", 558 | "GCDCGCD", 559 | "GCDCGD", 560 | "GCDCGMCD", 561 | "GCDCID", 562 | "GCDCJCD", 563 | "GCDCJD", 564 | "GCDCLDD", 565 | "GCDCMCD", 566 | "GCDCMD", 567 | "GCDCMDCD", 568 | "GCDCMDD", 569 | "GCDCMDID", 570 | "GCDCPD", 571 | "GCDD", 572 | "GCDDCD", 573 | "GCDDD", 574 | "GCDDMCD", 575 | "GCDFD", 576 | "GCDFGCD", 577 | "GCDFWFD", 578 | "GCDGCCCCCD", 579 | "GCDGCCD", 580 | "GCDGCD", 581 | "GCDGD", 582 | "GCDGDD", 583 | "GCDGGD", 584 | "GCDGLCCD", 585 | "GCDGLJPCD", 586 | "GCDICCCCD", 587 | "GCDICCD", 588 | "GCDICD", 589 | "GCDID", 590 | "GCDIDD", 591 | "GCDJCCD", 592 | "GCDJCD", 593 | "GCDJCDGPD", 594 | "GCDJD", 595 | "GCDJJD", 596 | "GCDKCDCD", 597 | "GCDLCCCD", 598 | "GCDLD", 599 | "GCDLGCCCCD", 600 | "GCDLGCD", 601 | "GCDLPD", 602 | "GCDMCD", 603 | "GCDMCDD", 604 | "GCDMD", 605 | "GCDMDD", 606 | "GCDMJD", 607 | "GCDPCD", 608 | "GCDPD", 609 | "GCDWFWD", 610 | "GCDWGWCD", 611 | "GCDWGWD", 612 | "GCFCCD", 613 | "GCFCCJFGDD", 614 | "GCFCD", 615 | "GCFD", 616 | "GCFDD", 617 | "GCFFD", 618 | "GCFID", 619 | "GCFJCCD", 620 | "GCFPCD", 621 | "GCFPD", 622 | "GCFWGCCD", 623 | "GCFWGCCDD", 624 | "GCFWGJCD", 625 | "GCGCCCD", 626 | "GCGCCD", 627 | "GCGCD", 628 | "GCGCID", 629 | "GCGCLD", 630 | "GCGCPPCCD", 631 | "GCGD", 632 | "GCGDD", 633 | "GCGGCD", 634 | "GCGGCGD", 635 | "GCGGD", 636 | "GCGICD", 637 | "GCGID", 638 | "GCGJCCD", 639 | "GCGPCCD", 640 | "GCICCCCD", 641 | "GCICCCD", 642 | "GCICCD", 643 | "GCICD", 644 | "GCICDD", 
645 | "GCID", 646 | "GCIDD", 647 | "GCIDID", 648 | "GCIFCCD", 649 | "GCIID", 650 | "GCIJCD", 651 | "GCIJD", 652 | "GCIJICD", 653 | "GCIPCD", 654 | "GCIPD", 655 | "GCIWGIIWD", 656 | "GCJCCCCD", 657 | "GCJCCCD", 658 | "GCJCCD", 659 | "GCJCD", 660 | "GCJCGD", 661 | "GCJCID", 662 | "GCJCIID", 663 | "GCJCPD", 664 | "GCJD", 665 | "GCJDCCD", 666 | "GCJDCD", 667 | "GCJDD", 668 | "GCJDID", 669 | "GCJFD", 670 | "GCJGD", 671 | "GCJICD", 672 | "GCJID", 673 | "GCJJCCD", 674 | "GCJJCD", 675 | "GCJJD", 676 | "GCJJGD", 677 | "GCJKCD", 678 | "GCJLCCD", 679 | "GCJMD", 680 | "GCJPCCGJLFD", 681 | "GCJPD", 682 | "GCJWCCJCD", 683 | "GCKCCD", 684 | "GCKD", 685 | "GCLCCCD", 686 | "GCLCCD", 687 | "GCLCD", 688 | "GCLD", 689 | "GCLDD", 690 | "GCLGGCD", 691 | "GCMCCD", 692 | "GCMCD", 693 | "GCMD", 694 | "GCMDD", 695 | "GCMPCD", 696 | "GCMPMD", 697 | "GCPCCCCD", 698 | "GCPCCCD", 699 | "GCPCCD", 700 | "GCPCCDD", 701 | "GCPCD", 702 | "GCPCDD", 703 | "GCPCKCD", 704 | "GCPD", 705 | "GCPDCCD", 706 | "GCPDD", 707 | "GCPFD", 708 | "GCPICCCD", 709 | "GCPJCCD", 710 | "GCPJCD", 711 | "GCPJD", 712 | "GCPJDCD", 713 | "GCPJJCD", 714 | "GCPJJDD", 715 | "GCPJPD", 716 | "GCPPCCD", 717 | "GCPPD", 718 | "GCPPPD", 719 | "GCWCWCJD", 720 | "GCWCWD", 721 | "GCWDWCDD", 722 | "GCWDWD", 723 | "GCWGWDD", 724 | "GD", 725 | "GDCCCCCCD", 726 | "GDCCCCCD", 727 | "GDCCCCD", 728 | "GDCCCCPD", 729 | "GDCCCD", 730 | "GDCCCDD", 731 | "GDCCCGCCD", 732 | "GDCCCJCD", 733 | "GDCCCJD", 734 | "GDCCCJDCD", 735 | "GDCCD", 736 | "GDCCDCD", 737 | "GDCCDCDD", 738 | "GDCCDD", 739 | "GDCCID", 740 | "GDCCJD", 741 | "GDCCPCD", 742 | "GDCD", 743 | "GDCDCCD", 744 | "GDCDCD", 745 | "GDCDD", 746 | "GDCDICD", 747 | "GDCDPD", 748 | "GDCFD", 749 | "GDCGCCD", 750 | "GDCGD", 751 | "GDCGPPCCD", 752 | "GDCID", 753 | "GDCIDD", 754 | "GDCJCCD", 755 | "GDCJD", 756 | "GDCLD", 757 | "GDCMD", 758 | "GDCPD", 759 | "GDCPID", 760 | "GDCPJD", 761 | "GDD", 762 | "GDDCCCCD", 763 | "GDDCCCD", 764 | "GDDCCD", 765 | "GDDCD", 766 | "GDDCDD", 767 | "GDDCFD", 768 | "GDDCFDCD", 769 | "GDDCMD", 770 | "GDDD", 771 | "GDDDCD", 772 | "GDDID", 773 | "GDDPPD", 774 | "GDDPPLD", 775 | "GDFCCD", 776 | "GDFCD", 777 | "GDFD", 778 | "GDFFD", 779 | "GDFGD", 780 | "GDGCCCD", 781 | "GDGCCD", 782 | "GDGCD", 783 | "GDGD", 784 | "GDGDCD", 785 | "GDGDD", 786 | "GDGDFID", 787 | "GDGJCCD", 788 | "GDGMD", 789 | "GDICCD", 790 | "GDICD", 791 | "GDID", 792 | "GDIDCD", 793 | "GDIDD", 794 | "GDIGCD", 795 | "GDIID", 796 | "GDIPCD", 797 | "GDJCCCD", 798 | "GDJCCD", 799 | "GDJCD", 800 | "GDJD", 801 | "GDJICD", 802 | "GDJJD", 803 | "GDJJJD", 804 | "GDJPCD", 805 | "GDJPDD", 806 | "GDLCCCCCD", 807 | "GDLCID", 808 | "GDLD", 809 | "GDLJD", 810 | "GDLJDD", 811 | "GDMCD", 812 | "GDMD", 813 | "GDMDCD", 814 | "GDMDD", 815 | "GDMJD", 816 | "GDMJMMD", 817 | "GDMPD", 818 | "GDPCCCCCD", 819 | "GDPCCD", 820 | "GDPCD", 821 | "GDPD", 822 | "GDPGCD", 823 | "GDPID", 824 | "GDPJCD", 825 | "GDPJD", 826 | "GDPPD", 827 | "GDPPJD", 828 | "GDWDWCCD", 829 | "GDWDWCCDD", 830 | "GDWDWD", 831 | "GDWFWD", 832 | "GDWGWD", 833 | "GFCCCCCD", 834 | "GFCCCCD", 835 | "GFCCCCJD", 836 | "GFCCCD", 837 | "GFCCCID", 838 | "GFCCD", 839 | "GFCCDD", 840 | "GFCCFCD", 841 | "GFCCPD", 842 | "GFCCPGD", 843 | "GFCD", 844 | "GFCDCD", 845 | "GFCDD", 846 | "GFCID", 847 | "GFCJCD", 848 | "GFCJD", 849 | "GFCPCCD", 850 | "GFCPCD", 851 | "GFCPD", 852 | "GFCPJD", 853 | "GFCPJPD", 854 | "GFD", 855 | "GFDCCCD", 856 | "GFDCD", 857 | "GFDD", 858 | "GFFCCD", 859 | "GFFCD", 860 | "GFFD", 861 | "GFFPCGCD", 862 | "GFGCD", 863 | "GFGCID", 864 | "GFGD", 865 | "GFGJCD", 866 | "GFICCD", 867 | 
"GFICD", 868 | "GFID", 869 | "GFIICD", 870 | "GFJCCCD", 871 | "GFJCCD", 872 | "GFJCD", 873 | "GFJCDCD", 874 | "GFJD", 875 | "GFJJCCD", 876 | "GFJJD", 877 | "GFJJJCCD", 878 | "GFJJLJCLCD", 879 | "GFLD", 880 | "GFLPD", 881 | "GFMCD", 882 | "GFPCD", 883 | "GFPD", 884 | "GFPJCD", 885 | "GFPJD", 886 | "GFPJPD", 887 | "GFPPCCCD", 888 | "GFPPD", 889 | "GFWCJCPCCCWCCD", 890 | "GFWGWCD", 891 | "GGCCCCCD", 892 | "GGCCCCD", 893 | "GGCCCD", 894 | "GGCCCICD", 895 | "GGCCCID", 896 | "GGCCCWDWD", 897 | "GGCCD", 898 | "GGCCDCD", 899 | "GGCCDD", 900 | "GGCCGCD", 901 | "GGCCGD", 902 | "GGCCGJD", 903 | "GGCCJCD", 904 | "GGCCJD", 905 | "GGCD", 906 | "GGCDCCCCCD", 907 | "GGCDCCD", 908 | "GGCDCD", 909 | "GGCDD", 910 | "GGCDJD", 911 | "GGCFCCFCPD", 912 | "GGCFD", 913 | "GGCFJD", 914 | "GGCGCCCD", 915 | "GGCGCD", 916 | "GGCGD", 917 | "GGCGGD", 918 | "GGCICLCD", 919 | "GGCID", 920 | "GGCIJCD", 921 | "GGCJCCD", 922 | "GGCJCD", 923 | "GGCJD", 924 | "GGCJDDCD", 925 | "GGCJJCCD", 926 | "GGCJJD", 927 | "GGCJPCICCCD", 928 | "GGCJPD", 929 | "GGCLCD", 930 | "GGCLD", 931 | "GGCMD", 932 | "GGCPCCD", 933 | "GGCPCD", 934 | "GGCPD", 935 | "GGD", 936 | "GGDCCCD", 937 | "GGDCCD", 938 | "GGDCD", 939 | "GGDD", 940 | "GGDDCCD", 941 | "GGDDCD", 942 | "GGDDD", 943 | "GGDFCD", 944 | "GGDFD", 945 | "GGDGD", 946 | "GGDID", 947 | "GGDJCD", 948 | "GGDJD", 949 | "GGDJJD", 950 | "GGDPPJD", 951 | "GGFCCCD", 952 | "GGFCCD", 953 | "GGFCD", 954 | "GGFD", 955 | "GGFDD", 956 | "GGFFCD", 957 | "GGFFD", 958 | "GGFFDCD", 959 | "GGFFDD", 960 | "GGFGD", 961 | "GGFJCCD", 962 | "GGFJD", 963 | "GGFJDD", 964 | "GGFJJD", 965 | "GGFLD", 966 | "GGFPCFPCD", 967 | "GGGCCCCD", 968 | "GGGCCCD", 969 | "GGGCCD", 970 | "GGGCD", 971 | "GGGCDD", 972 | "GGGCGCD", 973 | "GGGCGD", 974 | "GGGCID", 975 | "GGGCJD", 976 | "GGGD", 977 | "GGGDCD", 978 | "GGGDD", 979 | "GGGFD", 980 | "GGGGCD", 981 | "GGGGD", 982 | "GGGGFJD", 983 | "GGGGICD", 984 | "GGGGJD", 985 | "GGGGJPD", 986 | "GGGGLD", 987 | "GGGGPCD", 988 | "GGGGPPD", 989 | "GGGICD", 990 | "GGGID", 991 | "GGGIDID", 992 | "GGGIGCJD", 993 | "GGGIJD", 994 | "GGGJCD", 995 | "GGGJD", 996 | "GGGJJCJD", 997 | "GGGJJD", 998 | "GGGJPCCD", 999 | "GGGLD", 1000 | "GGGMD", 1001 | "GGGPJD", 1002 | "GGGWICWD", 1003 | "GGICCCCD", 1004 | "GGICCCD", 1005 | "GGICCD", 1006 | "GGICCGD", 1007 | "GGICCLD", 1008 | "GGICCPCCD", 1009 | "GGICD", 1010 | "GGICGCCCD", 1011 | "GGICID", 1012 | "GGICJD", 1013 | "GGID", 1014 | "GGIDCD", 1015 | "GGIDD", 1016 | "GGIFD", 1017 | "GGIFJCD", 1018 | "GGIFPD", 1019 | "GGIGCCD", 1020 | "GGIGD", 1021 | "GGIICD", 1022 | "GGIID", 1023 | "GGIIPID", 1024 | "GGIJCCD", 1025 | "GGIJD", 1026 | "GGIPCD", 1027 | "GGIPD", 1028 | "GGIPDD", 1029 | "GGJCCCD", 1030 | "GGJCCD", 1031 | "GGJCCPCJCCD", 1032 | "GGJCD", 1033 | "GGJCWDWD", 1034 | "GGJD", 1035 | "GGJGCCCD", 1036 | "GGJGCCD", 1037 | "GGJGD", 1038 | "GGJJD", 1039 | "GGJJPCD", 1040 | "GGJLD", 1041 | "GGJPD", 1042 | "GGJPDD", 1043 | "GGKD", 1044 | "GGKGD", 1045 | "GGLCCCD", 1046 | "GGLCD", 1047 | "GGLCDD", 1048 | "GGLCJD", 1049 | "GGLCPD", 1050 | "GGLD", 1051 | "GGLFD", 1052 | "GGLID", 1053 | "GGLJD", 1054 | "GGLLFD", 1055 | "GGLPD", 1056 | "GGMCD", 1057 | "GGMCDD", 1058 | "GGMD", 1059 | "GGMJCD", 1060 | "GGMLD", 1061 | "GGMPCCD", 1062 | "GGPCCCD", 1063 | "GGPCCD", 1064 | "GGPCD", 1065 | "GGPCJCD", 1066 | "GGPD", 1067 | "GGPFD", 1068 | "GGPICD", 1069 | "GGPJCCCCD", 1070 | "GGPJCD", 1071 | "GGPJCDD", 1072 | "GGPJD", 1073 | "GGPLD", 1074 | "GGPPCCD", 1075 | "GGPPCD", 1076 | "GGPPD", 1077 | "GGPPJJD", 1078 | "GGPPPCD", 1079 | "GGWPCGWPJD", 1080 | "GICCCCCCD", 1081 | "GICCCCCD", 
1082 | "GICCCCD", 1083 | "GICCCD", 1084 | "GICCCDD", 1085 | "GICCCJCD", 1086 | "GICCD", 1087 | "GICCDD", 1088 | "GICCJD", 1089 | "GICCLDD", 1090 | "GICCPD", 1091 | "GICD", 1092 | "GICDCCCCD", 1093 | "GICDCCD", 1094 | "GICDCD", 1095 | "GICDD", 1096 | "GICDLPD", 1097 | "GICDWCCWD", 1098 | "GICGCCCCD", 1099 | "GICGCCD", 1100 | "GICGCJICD", 1101 | "GICGD", 1102 | "GICGGD", 1103 | "GICGMMD", 1104 | "GICGPCJD", 1105 | "GICICCD", 1106 | "GICICD", 1107 | "GICID", 1108 | "GICIGD", 1109 | "GICIID", 1110 | "GICJCCD", 1111 | "GICJCD", 1112 | "GICJD", 1113 | "GICPCCCCD", 1114 | "GICPD", 1115 | "GICPICD", 1116 | "GICPJD", 1117 | "GID", 1118 | "GIDCCCJCD", 1119 | "GIDCCD", 1120 | "GIDCD", 1121 | "GIDD", 1122 | "GIDDD", 1123 | "GIDICCD", 1124 | "GIDID", 1125 | "GIDLPCD", 1126 | "GIFCCD", 1127 | "GIFD", 1128 | "GIFICD", 1129 | "GIFWFD", 1130 | "GIGCCD", 1131 | "GIGCD", 1132 | "GIGCGCD", 1133 | "GIGCJD", 1134 | "GIGCPD", 1135 | "GIGD", 1136 | "GIGGD", 1137 | "GIGICD", 1138 | "GIGID", 1139 | "GIGJPCD", 1140 | "GIICCCCD", 1141 | "GIICCD", 1142 | "GIICD", 1143 | "GIID", 1144 | "GIIGD", 1145 | "GIIID", 1146 | "GIIJCCCD", 1147 | "GIIJCD", 1148 | "GIJCCCCCD", 1149 | "GIJCCCCD", 1150 | "GIJCCCD", 1151 | "GIJCCD", 1152 | "GIJCD", 1153 | "GIJCPD", 1154 | "GIJD", 1155 | "GIJDD", 1156 | "GIJID", 1157 | "GIJJCCD", 1158 | "GIJJCD", 1159 | "GIJLD", 1160 | "GIJPD", 1161 | "GIJPDCD", 1162 | "GIKD", 1163 | "GILCCCCDD", 1164 | "GILCCD", 1165 | "GILCD", 1166 | "GILD", 1167 | "GILID", 1168 | "GILPMD", 1169 | "GIMCCD", 1170 | "GIMCD", 1171 | "GIMD", 1172 | "GIMJCD", 1173 | "GIMJD", 1174 | "GIMPCCD", 1175 | "GIPCCCCD", 1176 | "GIPCCCD", 1177 | "GIPCCD", 1178 | "GIPCD", 1179 | "GIPCMD", 1180 | "GIPD", 1181 | "GIPDCD", 1182 | "GIPDD", 1183 | "GIPICD", 1184 | "GIPJCCD", 1185 | "GIPJCD", 1186 | "GIPPCD", 1187 | "GIPPD", 1188 | "GIWDCCWCD", 1189 | "GIWDWD", 1190 | "GIWGWCD", 1191 | "GJCCCCCD", 1192 | "GJCCCCD", 1193 | "GJCCCD", 1194 | "GJCCCDCDCD", 1195 | "GJCCCDD", 1196 | "GJCCD", 1197 | "GJCCDCD", 1198 | "GJCCDD", 1199 | "GJCCFD", 1200 | "GJCCGJPD", 1201 | "GJCCICCD", 1202 | "GJCCJCD", 1203 | "GJCCJD", 1204 | "GJCD", 1205 | "GJCDCCD", 1206 | "GJCDCJCCD", 1207 | "GJCDD", 1208 | "GJCDJCD", 1209 | "GJCDPD", 1210 | "GJCGCD", 1211 | "GJCGD", 1212 | "GJCGPJCCD", 1213 | "GJCICCCD", 1214 | "GJCICD", 1215 | "GJCID", 1216 | "GJCJCCD", 1217 | "GJCJCD", 1218 | "GJCJD", 1219 | "GJCJJCCCCD", 1220 | "GJCJJCD", 1221 | "GJCJPD", 1222 | "GJCJPPCD", 1223 | "GJCLD", 1224 | "GJCLJCCCD", 1225 | "GJCMD", 1226 | "GJCPD", 1227 | "GJCPJD", 1228 | "GJCPPD", 1229 | "GJD", 1230 | "GJDCCCD", 1231 | "GJDCCD", 1232 | "GJDCD", 1233 | "GJDD", 1234 | "GJDICD", 1235 | "GJDID", 1236 | "GJDLCD", 1237 | "GJDPCD", 1238 | "GJFCCD", 1239 | "GJFCD", 1240 | "GJFD", 1241 | "GJFFD", 1242 | "GJFGD", 1243 | "GJFICD", 1244 | "GJGCD", 1245 | "GJGD", 1246 | "GJGPCD", 1247 | "GJICCCD", 1248 | "GJICCD", 1249 | "GJICD", 1250 | "GJID", 1251 | "GJIID", 1252 | "GJJCCCD", 1253 | "GJJCCD", 1254 | "GJJCCDD", 1255 | "GJJCD", 1256 | "GJJCJCCCD", 1257 | "GJJCJCCD", 1258 | "GJJCPCD", 1259 | "GJJD", 1260 | "GJJDCD", 1261 | "GJJDD", 1262 | "GJJFCCD", 1263 | "GJJFD", 1264 | "GJJGD", 1265 | "GJJJCD", 1266 | "GJJJD", 1267 | "GJJJICD", 1268 | "GJJJJCCD", 1269 | "GJJJJD", 1270 | "GJJPCCCD", 1271 | "GJJPCCD", 1272 | "GJJPCID", 1273 | "GJJPPD", 1274 | "GJLCCCCD", 1275 | "GJLCD", 1276 | "GJLCDD", 1277 | "GJLD", 1278 | "GJMCCD", 1279 | "GJMD", 1280 | "GJPCCCCD", 1281 | "GJPCCCD", 1282 | "GJPCCD", 1283 | "GJPCD", 1284 | "GJPCDD", 1285 | "GJPCJCD", 1286 | "GJPCLCD", 1287 | "GJPCMD", 1288 | "GJPD", 1289 | 
"GJPDD", 1290 | "GJPGCCD", 1291 | "GJPGD", 1292 | "GJPICCD", 1293 | "GJPICD", 1294 | "GJPICDD", 1295 | "GJPJCCD", 1296 | "GJPJD", 1297 | "GJPJPD", 1298 | "GJPLCD", 1299 | "GJPPJD", 1300 | "GKCCCD", 1301 | "GKCCD", 1302 | "GKCCPD", 1303 | "GKCD", 1304 | "GKCDCD", 1305 | "GKCDD", 1306 | "GKCDJCD", 1307 | "GKCJCD", 1308 | "GKCMD", 1309 | "GKD", 1310 | "GKDD", 1311 | "GKJJD", 1312 | "GLCCCCCCD", 1313 | "GLCCCCD", 1314 | "GLCCCD", 1315 | "GLCCD", 1316 | "GLCCDD", 1317 | "GLCCJCCCD", 1318 | "GLCCJCCD", 1319 | "GLCD", 1320 | "GLCDD", 1321 | "GLCDGCCD", 1322 | "GLCGCJCD", 1323 | "GLCGD", 1324 | "GLCGDD", 1325 | "GLCJD", 1326 | "GLCJJCCCCCD", 1327 | "GLCLD", 1328 | "GLCMD", 1329 | "GLCPCCD", 1330 | "GLCPD", 1331 | "GLD", 1332 | "GLDCD", 1333 | "GLDCMD", 1334 | "GLDCMDCD", 1335 | "GLDCMDD", 1336 | "GLDD", 1337 | "GLDDCKCD", 1338 | "GLFCD", 1339 | "GLFCFD", 1340 | "GLFGCD", 1341 | "GLGCD", 1342 | "GLGD", 1343 | "GLGPJD", 1344 | "GLICCD", 1345 | "GLICD", 1346 | "GLID", 1347 | "GLJCCCD", 1348 | "GLJCCD", 1349 | "GLJCD", 1350 | "GLJCICCD", 1351 | "GLJD", 1352 | "GLJFCD", 1353 | "GLJGD", 1354 | "GLJICCD", 1355 | "GLJID", 1356 | "GLJJD", 1357 | "GLJPCCD", 1358 | "GLJPCICD", 1359 | "GLJPJCCD", 1360 | "GLJWGWCD", 1361 | "GLLCCCD", 1362 | "GLLCID", 1363 | "GLPCCCD", 1364 | "GLPCCD", 1365 | "GLPCD", 1366 | "GLPCDD", 1367 | "GLPCPCCD", 1368 | "GLPD", 1369 | "GLPDD", 1370 | "GLPGCD", 1371 | "GLPJD", 1372 | "GLPLJCCCD", 1373 | "GLPLJCD", 1374 | "GLPPCCCCD", 1375 | "GLPPCCD", 1376 | "GLPPCD", 1377 | "GMCCCCD", 1378 | "GMCCCD", 1379 | "GMCCD", 1380 | "GMCCID", 1381 | "GMCD", 1382 | "GMCDCCCD", 1383 | "GMCDCCD", 1384 | "GMCDCD", 1385 | "GMCDD", 1386 | "GMCDMCD", 1387 | "GMCGD", 1388 | "GMCJCD", 1389 | "GMCMD", 1390 | "GMCMJD", 1391 | "GMD", 1392 | "GMDCD", 1393 | "GMDD", 1394 | "GMDICD", 1395 | "GMDID", 1396 | "GMGJCD", 1397 | "GMGJJD", 1398 | "GMICD", 1399 | "GMID", 1400 | "GMIPJCCD", 1401 | "GMJCCD", 1402 | "GMJCD", 1403 | "GMJD", 1404 | "GMJDD", 1405 | "GMJICCCD", 1406 | "GMJMJFCD", 1407 | "GMJPCD", 1408 | "GMJPLCCD", 1409 | "GMLD", 1410 | "GMLDCD", 1411 | "GMLGCD", 1412 | "GMLID", 1413 | "GMLLD", 1414 | "GMMCCCD", 1415 | "GMMD", 1416 | "GMMGD", 1417 | "GMMLCCD", 1418 | "GMMPCD", 1419 | "GMMPD", 1420 | "GMPCCD", 1421 | "GMPCD", 1422 | "GMPD", 1423 | "GMPDCD", 1424 | "GMPDD", 1425 | "GMPJCD", 1426 | "GPCCCCCCD", 1427 | "GPCCCCD", 1428 | "GPCCCCID", 1429 | "GPCCCD", 1430 | "GPCCD", 1431 | "GPCCDCCD", 1432 | "GPCCDD", 1433 | "GPCCDDD", 1434 | "GPCD", 1435 | "GPCDCCD", 1436 | "GPCDCD", 1437 | "GPCDD", 1438 | "GPCFDCCD", 1439 | "GPCFDD", 1440 | "GPCGD", 1441 | "GPCICCD", 1442 | "GPCID", 1443 | "GPCIJD", 1444 | "GPCJCCCD", 1445 | "GPCJCCD", 1446 | "GPCJCD", 1447 | "GPCPID", 1448 | "GPCWDWCD", 1449 | "GPD", 1450 | "GPDCCD", 1451 | "GPDCD", 1452 | "GPDD", 1453 | "GPFCCD", 1454 | "GPFCD", 1455 | "GPFD", 1456 | "GPFFCD", 1457 | "GPGCCCD", 1458 | "GPGD", 1459 | "GPGJCJCCCCD", 1460 | "GPGPJD", 1461 | "GPICCCCD", 1462 | "GPICCCD", 1463 | "GPICCD", 1464 | "GPICD", 1465 | "GPID", 1466 | "GPIDCD", 1467 | "GPIDD", 1468 | "GPJCCCCCD", 1469 | "GPJCCCD", 1470 | "GPJCCD", 1471 | "GPJCD", 1472 | "GPJCDD", 1473 | "GPJCJCCD", 1474 | "GPJD", 1475 | "GPJDCCD", 1476 | "GPJDCD", 1477 | "GPJDD", 1478 | "GPJFICD", 1479 | "GPJFID", 1480 | "GPJGD", 1481 | "GPJJCCD", 1482 | "GPJJCD", 1483 | "GPJLCD", 1484 | "GPJWDWD", 1485 | "GPLCWCWCWD", 1486 | "GPLD", 1487 | "GPLJCCD", 1488 | "GPMJCGD", 1489 | "GPMMD", 1490 | "GPMPCCD", 1491 | "GPPCCCCD", 1492 | "GPPCCCD", 1493 | "GPPCCD", 1494 | "GPPCD", 1495 | "GPPCDCCD", 1496 | "GPPCDD", 1497 | "GPPCLD", 
1498 | "GPPD", 1499 | "GPPDCD", 1500 | "GPPDCDD", 1501 | "GPPDD", 1502 | "GPPGCD", 1503 | "GPPICCD", 1504 | "GPPID", 1505 | "GPPJCD", 1506 | "GPPJD", 1507 | "GPPJDD", 1508 | "GPPJJCCCCD", 1509 | "GPPLD", 1510 | "GPPPCCD", 1511 | "GPPPCKCCD", 1512 | "GPPPPCCD", 1513 | "GWCPWD", 1514 | "GWCWCCCD", 1515 | "GWCWCD", 1516 | "GWCWD", 1517 | "GWCWPJCD", 1518 | "GWD", 1519 | "GWFCD", 1520 | "GWGCCCD", 1521 | "GWGCCD", 1522 | "GWGCCWCD", 1523 | "GWGCD", 1524 | "GWGCWD", 1525 | "GWGD", 1526 | "GWGID", 1527 | "GWGWCCCCD", 1528 | "GWGWCCCD", 1529 | "GWGWCD", 1530 | "GWGWICD", 1531 | "GWGWLCD", 1532 | "GWICD", 1533 | "GWICWD", 1534 | "GWIWD", 1535 | "GWJWD", 1536 | "GWLJWCD", 1537 | "GWPD", 1538 | "GWPJD", 1539 | "ICCCCCCD", 1540 | "ICCCCCD", 1541 | "ICCCCD", 1542 | "ICCCCDD", 1543 | "ICCCD", 1544 | "ICCD", 1545 | "ICCDCCD", 1546 | "ICCDCD", 1547 | "ICCDD", 1548 | "ICCGCCD", 1549 | "ICCGCIPD", 1550 | "ICCGD", 1551 | "ICCJD", 1552 | "ICCPD", 1553 | "ICCWDWCD", 1554 | "ICD", 1555 | "ICDD", 1556 | "ICDID", 1557 | "ICFD", 1558 | "ICGCCCD", 1559 | "ICGCD", 1560 | "ICGFD", 1561 | "ICGGCD", 1562 | "ICGLCMD", 1563 | "ICICD", 1564 | "ICID", 1565 | "ICIGD", 1566 | "ICJCD", 1567 | "ICJD", 1568 | "ICJJD", 1569 | "ICLJCD", 1570 | "ICMCCCCD", 1571 | "ICMD", 1572 | "ICPCD", 1573 | "ICPD", 1574 | "ICPPD", 1575 | "ICWGWCD", 1576 | "ICWGWD", 1577 | "ICWGWDCD", 1578 | "ID", 1579 | "IDCCCCD", 1580 | "IDCCCD", 1581 | "IDCCD", 1582 | "IDCCGJID", 1583 | "IDCCICD", 1584 | "IDCCICDID", 1585 | "IDCD", 1586 | "IDCDCD", 1587 | "IDCDD", 1588 | "IDCFCD", 1589 | "IDCGD", 1590 | "IDCICD", 1591 | "IDCID", 1592 | "IDCJD", 1593 | "IDCPCCCCCCD", 1594 | "IDD", 1595 | "IDGCCCD", 1596 | "IDGCD", 1597 | "IDID", 1598 | "IDIDD", 1599 | "IDJCD", 1600 | "IDKCD", 1601 | "IDPD", 1602 | "IDWCWCCDD", 1603 | "IFD", 1604 | "IFWGWCD", 1605 | "IGCCCD", 1606 | "IGCCCDD", 1607 | "IGCCD", 1608 | "IGCD", 1609 | "IGCDCD", 1610 | "IGCDD", 1611 | "IGCGCCD", 1612 | "IGCGCD", 1613 | "IGCID", 1614 | "IGCJD", 1615 | "IGCPD", 1616 | "IGCWJWD", 1617 | "IGD", 1618 | "IGDD", 1619 | "IGFCCD", 1620 | "IGFCD", 1621 | "IGFD", 1622 | "IGGCD", 1623 | "IGID", 1624 | "IGJD", 1625 | "IGLCD", 1626 | "IGLD", 1627 | "IGPCD", 1628 | "IGPCDD", 1629 | "IICCCD", 1630 | "IICCD", 1631 | "IICD", 1632 | "IICGD", 1633 | "IID", 1634 | "IIGD", 1635 | "IIGJCJCD", 1636 | "IIIGCD", 1637 | "IIPCD", 1638 | "IJCCCCD", 1639 | "IJCCCD", 1640 | "IJCCD", 1641 | "IJCD", 1642 | "IJD", 1643 | "IJDCCD", 1644 | "IJGCD", 1645 | "IJGD", 1646 | "IJJCD", 1647 | "IJJD", 1648 | "IJJJCD", 1649 | "IJPCDD", 1650 | "IJWCFIWGD", 1651 | "IJWCFWD", 1652 | "IJWCPWGD", 1653 | "IKCCCD", 1654 | "ILCD", 1655 | "ILD", 1656 | "ILPCD", 1657 | "ILPMD", 1658 | "IMCCD", 1659 | "IMCD", 1660 | "IMD", 1661 | "IMPD", 1662 | "IPCCCD", 1663 | "IPCCD", 1664 | "IPCCID", 1665 | "IPCCJD", 1666 | "IPCD", 1667 | "IPCID", 1668 | "IPCJD", 1669 | "IPCPD", 1670 | "IPD", 1671 | "IPFCD", 1672 | "IPID", 1673 | "IPIJD", 1674 | "IPJCGD", 1675 | "IPJD", 1676 | "IPPCD", 1677 | "JCCCCCCD", 1678 | "JCCCCCD", 1679 | "JCCCCD", 1680 | "JCCCD", 1681 | "JCCCJCD", 1682 | "JCCD", 1683 | "JCCID", 1684 | "JCCJD", 1685 | "JCCMCD", 1686 | "JCD", 1687 | "JCDCCD", 1688 | "JCDCD", 1689 | "JCDD", 1690 | "JCDID", 1691 | "JCFCD", 1692 | "JCGCCCCD", 1693 | "JCGCCCD", 1694 | "JCGCCD", 1695 | "JCGCD", 1696 | "JCGD", 1697 | "JCGJGD", 1698 | "JCICCCD", 1699 | "JCID", 1700 | "JCIDD", 1701 | "JCJCCCD", 1702 | "JCJCCD", 1703 | "JCJCD", 1704 | "JCJD", 1705 | "JCJDD", 1706 | "JCJFD", 1707 | "JCJJPCD", 1708 | "JCJPID", 1709 | "JCJWGWD", 1710 | "JCLD", 1711 | "JCMD", 1712 | 
"JCMPD", 1713 | "JCPJCID", 1714 | "JCPJJCD", 1715 | "JCPPCCCD", 1716 | "JD", 1717 | "JDCD", 1718 | "JDCMD", 1719 | "JDD", 1720 | "JDGD", 1721 | "JDID", 1722 | "JDJD", 1723 | "JDMD", 1724 | "JFCD", 1725 | "JFD", 1726 | "JGCCCD", 1727 | "JGCD", 1728 | "JGD", 1729 | "JGDCJD", 1730 | "JGGD", 1731 | "JGPD", 1732 | "JICCCD", 1733 | "JICD", 1734 | "JID", 1735 | "JIDD", 1736 | "JIID", 1737 | "JIJD", 1738 | "JILD", 1739 | "JJCCCD", 1740 | "JJCCD", 1741 | "JJCCPGD", 1742 | "JJCD", 1743 | "JJD", 1744 | "JJDCJD", 1745 | "JJDD", 1746 | "JJGCCD", 1747 | "JJGD", 1748 | "JJICD", 1749 | "JJID", 1750 | "JJJCCCD", 1751 | "JJJCD", 1752 | "JJJCFCCCD", 1753 | "JJJD", 1754 | "JJJGD", 1755 | "JJMCID", 1756 | "JJPCD", 1757 | "JJPD", 1758 | "JJPPJLCD", 1759 | "JJWFWCCJJD", 1760 | "JJWGWCD", 1761 | "JJWGWCDD", 1762 | "JKCD", 1763 | "JKD", 1764 | "JLCCD", 1765 | "JLCCDD", 1766 | "JLCCJD", 1767 | "JLCD", 1768 | "JLCDD", 1769 | "JLCMD", 1770 | "JLCMDD", 1771 | "JLD", 1772 | "JLDD", 1773 | "JLGCJD", 1774 | "JLGJCCCJD", 1775 | "JLJD", 1776 | "JMCD", 1777 | "JMD", 1778 | "JMJD", 1779 | "JMPD", 1780 | "JPCCD", 1781 | "JPCD", 1782 | "JPCMD", 1783 | "JPCMDPD", 1784 | "JPD", 1785 | "JPDCCCD", 1786 | "JPDD", 1787 | "JPDGCD", 1788 | "JPFCCD", 1789 | "JPFD", 1790 | "JPICD", 1791 | "JPID", 1792 | "JPIID", 1793 | "JPJD", 1794 | "JPJJCCCFPCD", 1795 | "JPMD", 1796 | "JPMDCCD", 1797 | "JPMDD", 1798 | "JPPJD", 1799 | "JPPJLCD", 1800 | "KCCCCCD", 1801 | "KCCCCD", 1802 | "KCCCCDCD", 1803 | "KCCCD", 1804 | "KCCCDCD", 1805 | "KCCCDD", 1806 | "KCCCDDCCCD", 1807 | "KCCCGD", 1808 | "KCCD", 1809 | "KCCDCCD", 1810 | "KCCDCD", 1811 | "KCCJD", 1812 | "KCCJDID", 1813 | "KCCPD", 1814 | "KCD", 1815 | "KCDCCCCD", 1816 | "KCDCCD", 1817 | "KCDCD", 1818 | "KCDD", 1819 | "KCDICD", 1820 | "KCDJD", 1821 | "KCGCCCD", 1822 | "KCGCCCDD", 1823 | "KCGCCD", 1824 | "KCGCD", 1825 | "KCGD", 1826 | "KCGGGD", 1827 | "KCICD", 1828 | "KCID", 1829 | "KCIDCD", 1830 | "KCJCD", 1831 | "KCJD", 1832 | "KCKCD", 1833 | "KCMD", 1834 | "KCMDCD", 1835 | "KCPD", 1836 | "KCWGWD", 1837 | "KD", 1838 | "KDCCCD", 1839 | "KDCD", 1840 | "KDD", 1841 | "KDICD", 1842 | "KDLCCPD", 1843 | "KFCD", 1844 | "KFCDD", 1845 | "KFD", 1846 | "KFWFD", 1847 | "KGCCCD", 1848 | "KGCCD", 1849 | "KGCD", 1850 | "KGCDCCD", 1851 | "KGD", 1852 | "KGDD", 1853 | "KGGD", 1854 | "KGJPD", 1855 | "KICCD", 1856 | "KICD", 1857 | "KICDD", 1858 | "KID", 1859 | "KIDCCD", 1860 | "KIDJCD", 1861 | "KIGID", 1862 | "KIMCD", 1863 | "KIMD", 1864 | "KIWGWD", 1865 | "KJCCD", 1866 | "KJCD", 1867 | "KJD", 1868 | "KJDD", 1869 | "KJICCD", 1870 | "KJJD", 1871 | "KJJDCD", 1872 | "KJJJD", 1873 | "KJPD", 1874 | "KLCCD", 1875 | "KLD", 1876 | "KMCCJCCD", 1877 | "KMCD", 1878 | "KMCDD", 1879 | "KMD", 1880 | "KMDCD", 1881 | "KMDD", 1882 | "KMMD", 1883 | "KMMMD", 1884 | "KPCCCD", 1885 | "KPCCD", 1886 | "KPCD", 1887 | "KPD", 1888 | "KPDD", 1889 | "LCCCCD", 1890 | "LCCCD", 1891 | "LCCD", 1892 | "LCCDD", 1893 | "LCCDJCCD", 1894 | "LCCGD", 1895 | "LCCGID", 1896 | "LCCID", 1897 | "LCCPCD", 1898 | "LCCWGWD", 1899 | "LCD", 1900 | "LCDCCD", 1901 | "LCDCD", 1902 | "LCDCDD", 1903 | "LCDCDIGCD", 1904 | "LCDD", 1905 | "LCDFD", 1906 | "LCDGDD", 1907 | "LCDGID", 1908 | "LCDID", 1909 | "LCDLD", 1910 | "LCDLDCD", 1911 | "LCDLDD", 1912 | "LCDMCDD", 1913 | "LCDPD", 1914 | "LCGD", 1915 | "LCGDD", 1916 | "LCICCWGWD", 1917 | "LCID", 1918 | "LCIGD", 1919 | "LCJCD", 1920 | "LCJD", 1921 | "LCLD", 1922 | "LCMCCD", 1923 | "LCMCDD", 1924 | "LCMCID", 1925 | "LCMCMD", 1926 | "LCMD", 1927 | "LCMJCICD", 1928 | "LCMJD", 1929 | "LCPCJCD", 1930 | "LCPD", 1931 | "LCPMD", 1932 | 
"LCPPCD", 1933 | "LD", 1934 | "LDCCD", 1935 | "LDCD", 1936 | "LDCLCD", 1937 | "LDCLCDCD", 1938 | "LDCPD", 1939 | "LDD", 1940 | "LDDD", 1941 | "LDLCCCCD", 1942 | "LFCD", 1943 | "LFCFD", 1944 | "LFD", 1945 | "LFPPPCCD", 1946 | "LGCD", 1947 | "LGD", 1948 | "LGGCCCD", 1949 | "LGGCD", 1950 | "LGJCD", 1951 | "LGJLCD", 1952 | "LGJLD", 1953 | "LICCCD", 1954 | "LICCD", 1955 | "LICD", 1956 | "LICLD", 1957 | "LID", 1958 | "LIGD", 1959 | "LIPCCCD", 1960 | "LIWGWCCCD", 1961 | "LJCCCCD", 1962 | "LJCCCCWGWD", 1963 | "LJCCCD", 1964 | "LJCCD", 1965 | "LJCCDCCCD", 1966 | "LJCCDCCD", 1967 | "LJCCDCD", 1968 | "LJCCDID", 1969 | "LJCCDJCD", 1970 | "LJCD", 1971 | "LJCDD", 1972 | "LJCGD", 1973 | "LJCJJD", 1974 | "LJCWCWJWCWJD", 1975 | "LJD", 1976 | "LJDCCD", 1977 | "LJDCD", 1978 | "LJDD", 1979 | "LJDJPD", 1980 | "LJDJPDD", 1981 | "LJDJPDID", 1982 | "LJDJPMDD", 1983 | "LJFJJCLCD", 1984 | "LJGD", 1985 | "LJID", 1986 | "LJJCD", 1987 | "LJJD", 1988 | "LJLD", 1989 | "LJMD", 1990 | "LJPCD", 1991 | "LKCD", 1992 | "LLCD", 1993 | "LLD", 1994 | "LLPD", 1995 | "LMCCFCCD", 1996 | "LMCD", 1997 | "LMD", 1998 | "LMID", 1999 | "LPCCCCCD", 2000 | "LPCCCD", 2001 | "LPCCD", 2002 | "LPCD", 2003 | "LPCDD", 2004 | "LPCFPPD", 2005 | "LPCGCCCD", 2006 | "LPCGCCD", 2007 | "LPCGCCDCCD", 2008 | "LPCGD", 2009 | "LPCGDDPD", 2010 | "LPD", 2011 | "LPDD", 2012 | "LPDDD", 2013 | "LPICD", 2014 | "LPID", 2015 | "LPJD", 2016 | "LPMDCCD", 2017 | "LPPJD", 2018 | "MCCCD", 2019 | "MCCD", 2020 | "MCCPD", 2021 | "MCD", 2022 | "MCDCCD", 2023 | "MCDCCDCD", 2024 | "MCDCCDD", 2025 | "MCDCD", 2026 | "MCDCGD", 2027 | "MCDD", 2028 | "MCDFD", 2029 | "MCDFDD", 2030 | "MCDLCD", 2031 | "MCDPPD", 2032 | "MCGCD", 2033 | "MCICD", 2034 | "MCID", 2035 | "MCIDWGWD", 2036 | "MCJD", 2037 | "MCLD", 2038 | "MCPD", 2039 | "MD", 2040 | "MDD", 2041 | "MFD", 2042 | "MGD", 2043 | "MGJD", 2044 | "MGJJD", 2045 | "MICCD", 2046 | "MICD", 2047 | "MID", 2048 | "MIDCCD", 2049 | "MJCCD", 2050 | "MJCD", 2051 | "MJD", 2052 | "MJDD", 2053 | "MLCD", 2054 | "MLD", 2055 | "MLGD", 2056 | "MLGGD", 2057 | "MMCCD", 2058 | "MMCD", 2059 | "MMD", 2060 | "MMMD", 2061 | "MMPD", 2062 | "MPCCD", 2063 | "MPCD", 2064 | "MPD", 2065 | "MPDCD", 2066 | "MPJPD", 2067 | "MPPD", 2068 | "PCCCCCCD", 2069 | "PCCCCCD", 2070 | "PCCCCD", 2071 | "PCCCD", 2072 | "PCCCDD", 2073 | "PCCD", 2074 | "PCCDD", 2075 | "PCCGJGD", 2076 | "PCCID", 2077 | "PCCIDD", 2078 | "PCD", 2079 | "PCDCD", 2080 | "PCDCJCD", 2081 | "PCDD", 2082 | "PCDFCCCD", 2083 | "PCDID", 2084 | "PCGCCD", 2085 | "PCGCD", 2086 | "PCGD", 2087 | "PCID", 2088 | "PCJCD", 2089 | "PCJGD", 2090 | "PCPCCD", 2091 | "PCPD", 2092 | "PD", 2093 | "PDCCD", 2094 | "PDD", 2095 | "PDDD", 2096 | "PFCCD", 2097 | "PFCDD", 2098 | "PFCJCD", 2099 | "PFD", 2100 | "PFFCD", 2101 | "PFPCD", 2102 | "PGCD", 2103 | "PGCJD", 2104 | "PGD", 2105 | "PGDCICD", 2106 | "PGJD", 2107 | "PICCD", 2108 | "PICD", 2109 | "PICDD", 2110 | "PID", 2111 | "PIFD", 2112 | "PIJCCD", 2113 | "PIJD", 2114 | "PJCCCDD", 2115 | "PJCCD", 2116 | "PJCD", 2117 | "PJD", 2118 | "PJDCD", 2119 | "PJDD", 2120 | "PJFD", 2121 | "PJGD", 2122 | "PJICCCPCD", 2123 | "PJID", 2124 | "PJJD", 2125 | "PJJDD", 2126 | "PJJPD", 2127 | "PJLPCD", 2128 | "PJPCD", 2129 | "PJPD", 2130 | "PLD", 2131 | "PLPCD", 2132 | "PMJCD", 2133 | "PPCCCDCD", 2134 | "PPCD", 2135 | "PPCJCCD", 2136 | "PPD", 2137 | "PPDCD", 2138 | "PPFCCD", 2139 | "PPFCD", 2140 | "PPGCID", 2141 | "PPGD", 2142 | "PPGJCCD", 2143 | "PPICCD", 2144 | "PPIGD", 2145 | "PPJCD", 2146 | "PPJD", 2147 | "PPJJD", 2148 | "PPMD", 2149 | "PPPCPD", 2150 | "PPPD", 2151 | "PPPWGWCCD", 2152 | 
"CCCCDID", 2153 | "CCCDFGD", 2154 | "CCCDGCD", 2155 | "CCCDGDD", 2156 | "CCCDWD", 2157 | "CCCGCCD", 2158 | "CCCGCD", 2159 | "CCCWCWD", 2160 | "CCCWGWCCD", 2161 | "CCCWGWCCDWD", 2162 | "CCCWGWD", 2163 | "CCDDGCD", 2164 | "CCDPCCD", 2165 | "CCDWD", 2166 | "CCFGCCCCCD", 2167 | "CCFGFCCCD", 2168 | "CCFPCD", 2169 | "CCGDD", 2170 | "CCGGCCD", 2171 | "CCIDGD", 2172 | "CCKD", 2173 | "CCMIDGCD", 2174 | "CCWD", 2175 | "CCWGWCCCD", 2176 | "CCWGWCD", 2177 | "CCWGWDD", 2178 | "CDWGWDGD", 2179 | "CFCCGWD", 2180 | "CFCD", 2181 | "CFCWGWD", 2182 | "CFGFGFGFGJID", 2183 | "CFJD", 2184 | "CFWGWCCDGCD", 2185 | "CFWGWCJCD", 2186 | "CGCCCCD", 2187 | "CGCCID", 2188 | "CGCCJCCCD", 2189 | "CGCDCCD", 2190 | "CGCFCCD", 2191 | "CGCGCD", 2192 | "CGCID", 2193 | "CGFCCD", 2194 | "CGFCD", 2195 | "CGFDID", 2196 | "CGGCICD", 2197 | "CGGJPD", 2198 | "CGICDGCD", 2199 | "CGICDID", 2200 | "CGIID", 2201 | "CGJCCCD", 2202 | "CGJCCD", 2203 | "CGJCD", 2204 | "CGJCDGD", 2205 | "CGJCDWD", 2206 | "CGJCJCD", 2207 | "CGJDD", 2208 | "CGJDDCCD", 2209 | "CGJGCD", 2210 | "CGJID", 2211 | "CGLCCD", 2212 | "CGPCCD", 2213 | "CGPCD", 2214 | "CGPD", 2215 | "CGPFCCD", 2216 | "CGPICD", 2217 | "CGPID", 2218 | "CGPJCDD", 2219 | "CGPJJJCD", 2220 | "CICCDGD", 2221 | "CICFJGD", 2222 | "CICGFID", 2223 | "CIDCD", 2224 | "CIDGD", 2225 | "CIFID", 2226 | "CIGCCD", 2227 | "CIGMCD", 2228 | "CIICCD", 2229 | "CIICD", 2230 | "CIJCWGWCD", 2231 | "CIJD", 2232 | "CIJWD", 2233 | "CIPCCD", 2234 | "CJCCDFD", 2235 | "CJCGD", 2236 | "CJCID", 2237 | "CJCWCCCD", 2238 | "CJCWGWD", 2239 | "CJGCCCD", 2240 | "CJICD", 2241 | "CJIDD", 2242 | "CJJCD", 2243 | "CJWGCD", 2244 | "CJWGWID", 2245 | "CPCCDGJD", 2246 | "CPCDCCD", 2247 | "CPDFCD", 2248 | "CPGID", 2249 | "CPICD", 2250 | "CPIWGWD", 2251 | "CPJGD", 2252 | "CPPCD", 2253 | "CPWGWDGD", 2254 | "D", 2255 | "FCCCCCCCD", 2256 | "FCCCCGD", 2257 | "FCCCDGD", 2258 | "FCCCWGWD", 2259 | "FCCDD", 2260 | "FCCDFCGD", 2261 | "FCCDGD", 2262 | "FCCDIPD", 2263 | "FCCDWGWD", 2264 | "FCCPCD", 2265 | "FCCWGWDD", 2266 | "FCDGD", 2267 | "FCDWD", 2268 | "FCDWGD", 2269 | "FCFWGWD", 2270 | "FCICCD", 2271 | "FCICDGD", 2272 | "FCIWGWDD", 2273 | "FCPCD", 2274 | "FCPCPD", 2275 | "FCPDGD", 2276 | "FCPPGD", 2277 | "FCWGWCD", 2278 | "FCWGWDD", 2279 | "FDDD", 2280 | "FDGD", 2281 | "FDGJCCD", 2282 | "FDWGWD", 2283 | "FFCCWGWD", 2284 | "FFFFD", 2285 | "FFFFFWWFD", 2286 | "FFFFWWD", 2287 | "FFFWD", 2288 | "FFFWWD", 2289 | "FFFWWFD", 2290 | "FFWWD", 2291 | "FGFPCCD", 2292 | "FGJWGWD", 2293 | "FICCCD", 2294 | "FICDGD", 2295 | "FICGWD", 2296 | "FICJD", 2297 | "FIICD", 2298 | "FIWGWCDD", 2299 | "FIWGWD", 2300 | "FIWGWDD", 2301 | "FJCCDD", 2302 | "FJGPCD", 2303 | "FJID", 2304 | "FJJGD", 2305 | "FMJD", 2306 | "FPCCD", 2307 | "FPCDD", 2308 | "FPDD", 2309 | "FPIDGD", 2310 | "FPWCWD", 2311 | "FWFWFD", 2312 | "FWGCD", 2313 | "FWGWCCD", 2314 | "FWGWCDGCD", 2315 | "FWGWCDGD", 2316 | "FWGWGD", 2317 | "FWJD", 2318 | "GCCCCCCDCD", 2319 | "GCCCCCDGD", 2320 | "GCCCCCID", 2321 | "GCCCCCKFD", 2322 | "GCCCCDCD", 2323 | "GCCCCDGCD", 2324 | "GCCCCDGCIJD", 2325 | "GCCCCDGDGDDDD", 2326 | "GCCCCDWFCCD", 2327 | "GCCCCDWGD", 2328 | "GCCCCFCCCCD", 2329 | "GCCCCID", 2330 | "GCCCDCPD", 2331 | "GCCCDDGCD", 2332 | "GCCCDDGD", 2333 | "GCCCDFCD", 2334 | "GCCCDGD", 2335 | "GCCCDGID", 2336 | "GCCCDICD", 2337 | "GCCCDMD", 2338 | "GCCCDWGCDWFCCD", 2339 | "GCCCDWGD", 2340 | "GCCCDWGWD", 2341 | "GCCCDWID", 2342 | "GCCCGPD", 2343 | "GCCCIJD", 2344 | "GCCCJCCD", 2345 | "GCCCJJCD", 2346 | "GCCCMCD", 2347 | "GCCCWD", 2348 | "GCCDCCMD", 2349 | "GCCDDWD", 2350 | "GCCDFCCD", 2351 | "GCCDGCD", 2352 | 
"GCCDGCGD", 2353 | "GCCDGDGCD", 2354 | "GCCDGJD", 2355 | "GCCDPPCD", 2356 | "GCCDWD", 2357 | "GCCFCCD", 2358 | "GCCFID", 2359 | "GCCFJCD", 2360 | "GCCFWCWCD", 2361 | "GCCGDCD", 2362 | "GCCGFD", 2363 | "GCCGFICD", 2364 | "GCCGID", 2365 | "GCCGIID", 2366 | "GCCICCD", 2367 | "GCCICDCD", 2368 | "GCCICWDD", 2369 | "GCCIDWDCD", 2370 | "GCCIID", 2371 | "GCCIJD", 2372 | "GCCJCDD", 2373 | "GCCJCGCD", 2374 | "GCCJDD", 2375 | "GCCJIDCD", 2376 | "GCCKDGD", 2377 | "GCCMJCD", 2378 | "GCCMJJCD", 2379 | "GCCWD", 2380 | "GCDCCCDGD", 2381 | "GCDCWDWD", 2382 | "GCDDDD", 2383 | "GCDDJCD", 2384 | "GCDFCD", 2385 | "GCDFID", 2386 | "GCDFJD", 2387 | "GCDGCGD", 2388 | "GCDGGGCD", 2389 | "GCDGIID", 2390 | "GCDIID", 2391 | "GCDKD", 2392 | "GCDMDFD", 2393 | "GCDPGD", 2394 | "GCDWD", 2395 | "GCDWDWD", 2396 | "GCFCCCD", 2397 | "GCFCCCDGD", 2398 | "GCFCDICD", 2399 | "GCFCDWGD", 2400 | "GCFCIFD", 2401 | "GCFCJD", 2402 | "GCFDDCID", 2403 | "GCFFJD", 2404 | "GCFGJPCD", 2405 | "GCFICD", 2406 | "GCFIDFD", 2407 | "GCFJD", 2408 | "GCFJDD", 2409 | "GCFJPD", 2410 | "GCFPCCCD", 2411 | "GCFPDD", 2412 | "GCFPID", 2413 | "GCGCCCCD", 2414 | "GCGCCCID", 2415 | "GCGCCCIDD", 2416 | "GCGCCDD", 2417 | "GCGCCDFD", 2418 | "GCGCCID", 2419 | "GCGCCJCD", 2420 | "GCGCCPD", 2421 | "GCGCDCCCD", 2422 | "GCGCDCD", 2423 | "GCGCDCID", 2424 | "GCGCDD", 2425 | "GCGCFCCD", 2426 | "GCGCFCD", 2427 | "GCGCFGCD", 2428 | "GCGCGCCD", 2429 | "GCGCGCD", 2430 | "GCGCGCPCCD", 2431 | "GCGCGD", 2432 | "GCGCGID", 2433 | "GCGCGPD", 2434 | "GCGCICCCD", 2435 | "GCGCICDDFCCCD", 2436 | "GCGCIDD", 2437 | "GCGCIID", 2438 | "GCGCJCCD", 2439 | "GCGCJD", 2440 | "GCGCJGWD", 2441 | "GCGCJJD", 2442 | "GCGCLCCD", 2443 | "GCGCPCCD", 2444 | "GCGCPCCID", 2445 | "GCGCPCD", 2446 | "GCGCPCJCCD", 2447 | "GCGDCCICCD", 2448 | "GCGDCD", 2449 | "GCGDIMD", 2450 | "GCGFCCD", 2451 | "GCGFCD", 2452 | "GCGFCJD", 2453 | "GCGFCMJD", 2454 | "GCGFD", 2455 | "GCGFDD", 2456 | "GCGFFCD", 2457 | "GCGFFD", 2458 | "GCGFID", 2459 | "GCGFIDD", 2460 | "GCGFJD", 2461 | "GCGGCCD", 2462 | "GCGGGJCD", 2463 | "GCGGJCID", 2464 | "GCGGJCJD", 2465 | "GCGICCCD", 2466 | "GCGICCD", 2467 | "GCGICCJD", 2468 | "GCGICDMD", 2469 | "GCGICICCD", 2470 | "GCGICJCDD", 2471 | "GCGICJD", 2472 | "GCGICJJD", 2473 | "GCGIDCGD", 2474 | "GCGIDD", 2475 | "GCGIDGD", 2476 | "GCGIGCCD", 2477 | "GCGIICD", 2478 | "GCGIID", 2479 | "GCGIMCCD", 2480 | "GCGIMJD", 2481 | "GCGIPCCD", 2482 | "GCGIPD", 2483 | "GCGJCCCCDD", 2484 | "GCGJCCCD", 2485 | "GCGJCCDD", 2486 | "GCGJCD", 2487 | "GCGJCID", 2488 | "GCGJD", 2489 | "GCGJDD", 2490 | "GCGJGICD", 2491 | "GCGJICD", 2492 | "GCGJID", 2493 | "GCGJIFCD", 2494 | "GCGJJCD", 2495 | "GCGJPCCD", 2496 | "GCGJPCD", 2497 | "GCGKCD", 2498 | "GCGKD", 2499 | "GCGLCDCCD", 2500 | "GCGLCJD", 2501 | "GCGLGCCD", 2502 | "GCGLGPCCID", 2503 | "GCGLIPJD", 2504 | "GCGLJJID", 2505 | "GCGMCD", 2506 | "GCGMD", 2507 | "GCGPCCCCCD", 2508 | "GCGPCCCD", 2509 | "GCGPCD", 2510 | "GCGPCFCCD", 2511 | "GCGPCID", 2512 | "GCGPCPD", 2513 | "GCGPD", 2514 | "GCGPFCD", 2515 | "GCGPGCD", 2516 | "GCGPIID", 2517 | "GCGPJCCD", 2518 | "GCGPJCD", 2519 | "GCGPJD", 2520 | "GCGPJGCD", 2521 | "GCGPJID", 2522 | "GCGPLICD", 2523 | "GCGPLID", 2524 | "GCGPPCCD", 2525 | "GCGPPCD", 2526 | "GCGPPD", 2527 | "GCGPPID", 2528 | "GCGPPJD", 2529 | "GCGWPFCD", 2530 | "GCICCCDD", 2531 | "GCICCDFD", 2532 | "GCICCJD", 2533 | "GCICCWDWDCGD", 2534 | "GCICDFCD", 2535 | "GCICPD", 2536 | "GCIDCD", 2537 | "GCIDCGD", 2538 | "GCIDDGD", 2539 | "GCIDPCCD", 2540 | "GCIICD", 2541 | "GCIJCCD", 2542 | "GCIJCCDMD", 2543 | "GCIJCID", 2544 | "GCIKD", 2545 | "GCIPCCD", 2546 | "GCIPCPD", 
2547 | "GCJCCCCCD", 2548 | "GCJCCDCD", 2549 | "GCJCCDGD", 2550 | "GCJCCDMD", 2551 | "GCJCCICD", 2552 | "GCJCDD", 2553 | "GCJCICD", 2554 | "GCJCKDD", 2555 | "GCJDCDCD", 2556 | "GCJDDCD", 2557 | "GCJGCD", 2558 | "GCJICCCD", 2559 | "GCJICGD", 2560 | "GCJIDCD", 2561 | "GCJIDD", 2562 | "GCJJCDD", 2563 | "GCJJCJCD", 2564 | "GCJJDD", 2565 | "GCJMCID", 2566 | "GCJPCCCD", 2567 | "GCJPCCD", 2568 | "GCJPCD", 2569 | "GCJPCDMD", 2570 | "GCJPID", 2571 | "GCJPJD", 2572 | "GCJWCPWD", 2573 | "GCKCCCD", 2574 | "GCKCD", 2575 | "GCKDGD", 2576 | "GCKGD", 2577 | "GCKICD", 2578 | "GCKJCCD", 2579 | "GCKPD", 2580 | "GCLCID", 2581 | "GCLGIJCD", 2582 | "GCLID", 2583 | "GCMCCDFD", 2584 | "GCMCCKGD", 2585 | "GCMCJCCD", 2586 | "GCMCPD", 2587 | "GCMDCGCD", 2588 | "GCMFCDGD", 2589 | "GCMID", 2590 | "GCMJCD", 2591 | "GCMJCDD", 2592 | "GCMJCID", 2593 | "GCMJID", 2594 | "GCMJPCCCCD", 2595 | "GCMKD", 2596 | "GCMKGD", 2597 | "GCMPCCD", 2598 | "GCMPJD", 2599 | "GCMPPCCD", 2600 | "GCPCCCMD", 2601 | "GCPCCDCD", 2602 | "GCPCCDMD", 2603 | "GCPCCDWD", 2604 | "GCPCCWGCWD", 2605 | "GCPCDCD", 2606 | "GCPCDGD", 2607 | "GCPCDWD", 2608 | "GCPCICDWGD", 2609 | "GCPCIICFD", 2610 | "GCPCJCFD", 2611 | "GCPCJD", 2612 | "GCPDGD", 2613 | "GCPGGCD", 2614 | "GCPICCCDGD", 2615 | "GCPICCD", 2616 | "GCPICD", 2617 | "GCPICID", 2618 | "GCPID", 2619 | "GCPIJCCD", 2620 | "GCPJCDD", 2621 | "GCPJPDD", 2622 | "GCPKD", 2623 | "GCPMCCD", 2624 | "GCPMJCD", 2625 | "GCPPCD", 2626 | "GCPPID", 2627 | "GCPPWCWID", 2628 | "GCPWCWCD", 2629 | "GCPWDWDCCD", 2630 | "GCWDWDCCD", 2631 | "GCWGWCCD", 2632 | "GCWGWD", 2633 | "GCWGWJD", 2634 | "GCWJCCD", 2635 | "GDCCCCFCD", 2636 | "GDCCCFCPD", 2637 | "GDCCPCCD", 2638 | "GDCDGCD", 2639 | "GDCDJD", 2640 | "GDCKGCD", 2641 | "GDDGCD", 2642 | "GDDGD", 2643 | "GDDMD", 2644 | "GDICCCD", 2645 | "GDIPD", 2646 | "GDJCICD", 2647 | "GDLCCD", 2648 | "GFCCCCCCD", 2649 | "GFCCCCFD", 2650 | "GFCCCDD", 2651 | "GFCCCDDD", 2652 | "GFCCCDFCD", 2653 | "GFCCCDFDD", 2654 | "GFCCCDGCD", 2655 | "GFCCCDGD", 2656 | "GFCCCDID", 2657 | "GFCCDCCD", 2658 | "GFCCDCD", 2659 | "GFCCDCFD", 2660 | "GFCCDDD", 2661 | "GFCCDFCD", 2662 | "GFCCDFDMD", 2663 | "GFCCDFFCD", 2664 | "GFCCDFFD", 2665 | "GFCCDFGD", 2666 | "GFCCDGCCD", 2667 | "GFCCDGCD", 2668 | "GFCCDGD", 2669 | "GFCCDGGID", 2670 | "GFCCDGICDJICD", 2671 | "GFCCDID", 2672 | "GFCCDLD", 2673 | "GFCCDMD", 2674 | "GFCCDWCD", 2675 | "GFCCDWD", 2676 | "GFCCDWFDD", 2677 | "GFCCDWGCD", 2678 | "GFCCDWGD", 2679 | "GFCCID", 2680 | "GFCCJD", 2681 | "GFCCPCD", 2682 | "GFCDDCCCD", 2683 | "GFCDFCCD", 2684 | "GFCDFCD", 2685 | "GFCDGCD", 2686 | "GFCDGD", 2687 | "GFCDGDD", 2688 | "GFCDGFCD", 2689 | "GFCDGGCD", 2690 | "GFCDGGD", 2691 | "GFCDGPD", 2692 | "GFCDID", 2693 | "GFCDMIDMD", 2694 | "GFCDWCD", 2695 | "GFCDWD", 2696 | "GFCDWGD", 2697 | "GFCFCD", 2698 | "GFCGCD", 2699 | "GFCGD", 2700 | "GFCICD", 2701 | "GFCIDCGD", 2702 | "GFCIDWD", 2703 | "GFCJCCCD", 2704 | "GFCJCCD", 2705 | "GFCJCCDD", 2706 | "GFCJCJD", 2707 | "GFCJDD", 2708 | "GFCJID", 2709 | "GFCKD", 2710 | "GFCLCD", 2711 | "GFCMCCD", 2712 | "GFCMJCDWD", 2713 | "GFCPDGD", 2714 | "GFCPPCD", 2715 | "GFCWCD", 2716 | "GFCWCWCD", 2717 | "GFCWFWFCCD", 2718 | "GFCWGWD", 2719 | "GFDCCD", 2720 | "GFDCDCDD", 2721 | "GFDCDD", 2722 | "GFDCDGD", 2723 | "GFDCID", 2724 | "GFDDCCD", 2725 | "GFDDCGD", 2726 | "GFDDD", 2727 | "GFDDGD", 2728 | "GFDDPD", 2729 | "GFDGCD", 2730 | "GFDGD", 2731 | "GFDICD", 2732 | "GFDICPCD", 2733 | "GFDID", 2734 | "GFDJPCD", 2735 | "GFDWD", 2736 | "GFDWDWD", 2737 | "GFFCCCD", 2738 | "GFFCJD", 2739 | "GFFDD", 2740 | "GFFJJDGD", 2741 | "GFFPDGD", 2742 | 
"GFGCCCDD", 2743 | "GFGCCD", 2744 | "GFGCCDGPD", 2745 | "GFGFICD", 2746 | "GFGMPD", 2747 | "GFICCDCD", 2748 | "GFICCDD", 2749 | "GFICDCD", 2750 | "GFICDCJD", 2751 | "GFICDD", 2752 | "GFICDGD", 2753 | "GFICJD", 2754 | "GFICKD", 2755 | "GFIDD", 2756 | "GFIDFGD", 2757 | "GFIDGCD", 2758 | "GFIDGD", 2759 | "GFIDPCPCD", 2760 | "GFIGD", 2761 | "GFIID", 2762 | "GFIIDFCD", 2763 | "GFIIGD", 2764 | "GFIJCCD", 2765 | "GFIJD", 2766 | "GFJCCCCD", 2767 | "GFJCCDD", 2768 | "GFJCDD", 2769 | "GFJCDGD", 2770 | "GFJCDWD", 2771 | "GFJCJD", 2772 | "GFJDD", 2773 | "GFJDGCD", 2774 | "GFJDGFCD", 2775 | "GFJDWD", 2776 | "GFJDWFICGD", 2777 | "GFJFD", 2778 | "GFJICD", 2779 | "GFJICDGD", 2780 | "GFJID", 2781 | "GFJJCD", 2782 | "GFJJDWGD", 2783 | "GFKD", 2784 | "GFKDGD", 2785 | "GFLCD", 2786 | "GFMJCD", 2787 | "GFPCCCD", 2788 | "GFPCCD", 2789 | "GFPCDCD", 2790 | "GFPCDD", 2791 | "GFPCJD", 2792 | "GFPDCD", 2793 | "GFPDD", 2794 | "GFPDID", 2795 | "GFPICD", 2796 | "GFPIJD", 2797 | "GFPJIDD", 2798 | "GFPKD", 2799 | "GFPPCCD", 2800 | "GFPPCD", 2801 | "GFWCWID", 2802 | "GFWDWD", 2803 | "GFWJD", 2804 | "GGCCCCCCD", 2805 | "GGCCCCJCD", 2806 | "GGCCCDD", 2807 | "GGCCCDDD", 2808 | "GGCCCDGD", 2809 | "GGCCCDWGCD", 2810 | "GGCCDCDGD", 2811 | "GGCCDFGCD", 2812 | "GGCCDGD", 2813 | "GGCCDGDCD", 2814 | "GGCCDID", 2815 | "GGCCDMCD", 2816 | "GGCCDWGD", 2817 | "GGCCFCD", 2818 | "GGCCFD", 2819 | "GGCCGCCD", 2820 | "GGCCICDD", 2821 | "GGCCID", 2822 | "GGCCJCCDD", 2823 | "GGCCLCD", 2824 | "GGCCPCD", 2825 | "GGCCPJD", 2826 | "GGCDCCDGD", 2827 | "GGCDCDD", 2828 | "GGCDCDGD", 2829 | "GGCDCGD", 2830 | "GGCDDCCD", 2831 | "GGCDGD", 2832 | "GGCDGPGCD", 2833 | "GGCDID", 2834 | "GGCDMD", 2835 | "GGCFCCD", 2836 | "GGCFCD", 2837 | "GGCFID", 2838 | "GGCGCCCCD", 2839 | "GGCGCCD", 2840 | "GGCGCGCCD", 2841 | "GGCGCGCD", 2842 | "GGCGCGFCD", 2843 | "GGCGDGCD", 2844 | "GGCGFD", 2845 | "GGCGFID", 2846 | "GGCGGCD", 2847 | "GGCGGGD", 2848 | "GGCGGJD", 2849 | "GGCGICCD", 2850 | "GGCGIICD", 2851 | "GGCGILICD", 2852 | "GGCGJID", 2853 | "GGCGJIJCD", 2854 | "GGCGPCCD", 2855 | "GGCGPCD", 2856 | "GGCGPJCCD", 2857 | "GGCGPJCD", 2858 | "GGCGPJD", 2859 | "GGCGPPD", 2860 | "GGCICCD", 2861 | "GGCICCID", 2862 | "GGCICD", 2863 | "GGCIDD", 2864 | "GGCIICD", 2865 | "GGCIIJD", 2866 | "GGCIPCICCD", 2867 | "GGCIPD", 2868 | "GGCJCDD", 2869 | "GGCJCJD", 2870 | "GGCJDD", 2871 | "GGCJID", 2872 | "GGCKLCD", 2873 | "GGCLCCD", 2874 | "GGCMCIJD", 2875 | "GGCMID", 2876 | "GGCPCCCD", 2877 | "GGCPCCJCCCWD", 2878 | "GGCPCDD", 2879 | "GGCPDD", 2880 | "GGCPGGCID", 2881 | "GGCPICD", 2882 | "GGCPICDD", 2883 | "GGCPID", 2884 | "GGCPJCD", 2885 | "GGCPPCCD", 2886 | "GGCPPD", 2887 | "GGCWDWCCDGCD", 2888 | "GGCWGD", 2889 | "GGDCDCCD", 2890 | "GGDCJD", 2891 | "GGDDFD", 2892 | "GGDGCCGCD", 2893 | "GGDGCDGD", 2894 | "GGDLGD", 2895 | "GGFCCCCD", 2896 | "GGFCDID", 2897 | "GGFCFCDD", 2898 | "GGFCID", 2899 | "GGFCJD", 2900 | "GGFCMCCD", 2901 | "GGFDCD", 2902 | "GGFDDD", 2903 | "GGFICD", 2904 | "GGFICDD", 2905 | "GGFID", 2906 | "GGFJCD", 2907 | "GGFJID", 2908 | "GGFJMD", 2909 | "GGFKID", 2910 | "GGFMJDD", 2911 | "GGFPCD", 2912 | "GGFPD", 2913 | "GGFWID", 2914 | "GGGCCCDGD", 2915 | "GGGCCCICD", 2916 | "GGGCCDGD", 2917 | "GGGCCID", 2918 | "GGGCGGD", 2919 | "GGGCJCD", 2920 | "GGGCPD", 2921 | "GGGCPFCPCD", 2922 | "GGGCPJD", 2923 | "GGGFCCD", 2924 | "GGGFCCID", 2925 | "GGGFCD", 2926 | "GGGFCJD", 2927 | "GGGFID", 2928 | "GGGGPJD", 2929 | "GGGICCD", 2930 | "GGGICJD", 2931 | "GGGIDGID", 2932 | "GGGIICD", 2933 | "GGGJCCD", 2934 | "GGGJGID", 2935 | "GGGKCD", 2936 | "GGGKDJD", 2937 | "GGGLJCD", 2938 | "GGGMCD", 2939 | 
"GGGPCD", 2940 | "GGGPFIDWD", 2941 | "GGGPIICD", 2942 | "GGGPIPD", 2943 | "GGGPPID", 2944 | "GGICCGCD", 2945 | "GGICCID", 2946 | "GGICDD", 2947 | "GGICFID", 2948 | "GGICJCD", 2949 | "GGICJDD", 2950 | "GGICPCCID", 2951 | "GGICPD", 2952 | "GGIDID", 2953 | "GGIDWGD", 2954 | "GGIFCCD", 2955 | "GGIFCD", 2956 | "GGIFCJD", 2957 | "GGIFICD", 2958 | "GGIFID", 2959 | "GGIFIDDD", 2960 | "GGIFJD", 2961 | "GGIFMID", 2962 | "GGIGPFD", 2963 | "GGIICCD", 2964 | "GGIJCD", 2965 | "GGIJCID", 2966 | "GGIJDD", 2967 | "GGIJICD", 2968 | "GGIPCCD", 2969 | "GGIPDCCD", 2970 | "GGIPICD", 2971 | "GGIPMICD", 2972 | "GGJCCCCCD", 2973 | "GGJCCCCD", 2974 | "GGJCCICD", 2975 | "GGJCDD", 2976 | "GGJCGCD", 2977 | "GGJCICCD", 2978 | "GGJCICD", 2979 | "GGJGCD", 2980 | "GGJGCICD", 2981 | "GGJGCLCGCD", 2982 | "GGJICCD", 2983 | "GGJICJD", 2984 | "GGJICPCCD", 2985 | "GGJID", 2986 | "GGJIID", 2987 | "GGJJCD", 2988 | "GGJJCDD", 2989 | "GGJJCKD", 2990 | "GGJJID", 2991 | "GGJMID", 2992 | "GGJPCCCCD", 2993 | "GGJPCCD", 2994 | "GGJPCD", 2995 | "GGJPCJCD", 2996 | "GGJPCJPJCD", 2997 | "GGJPID", 2998 | "GGJPJD", 2999 | "GGKCCCD", 3000 | "GGKCD", 3001 | "GGKDD", 3002 | "GGLCCD", 3003 | "GGLCCPJD", 3004 | "GGLFCCCD", 3005 | "GGLGCJD", 3006 | "GGLGFID", 3007 | "GGLGPCD", 3008 | "GGLJCCD", 3009 | "GGLJCID", 3010 | "GGMFJD", 3011 | "GGMJCDGD", 3012 | "GGMPJD", 3013 | "GGPCCDD", 3014 | "GGPCDD", 3015 | "GGPCICD", 3016 | "GGPCID", 3017 | "GGPFCCD", 3018 | "GGPFCD", 3019 | "GGPFCID", 3020 | "GGPFJD", 3021 | "GGPGCD", 3022 | "GGPGID", 3023 | "GGPICFCD", 3024 | "GGPID", 3025 | "GGPIDD", 3026 | "GGPIID", 3027 | "GGPJCCCD", 3028 | "GGPJCCD", 3029 | "GGPJCCID", 3030 | "GGPJCJMD", 3031 | "GGPJID", 3032 | "GGPJKCCD", 3033 | "GGPJPCD", 3034 | "GGPPCID", 3035 | "GGPPDD", 3036 | "GGPPFCCD", 3037 | "GGPPICD", 3038 | "GGPPJCD", 3039 | "GGWCJD", 3040 | "GGWGWID", 3041 | "GGWIWCCD", 3042 | "GICCCCCCCD", 3043 | "GICCCDGD", 3044 | "GICCDDD", 3045 | "GICCDGCD", 3046 | "GICCDGD", 3047 | "GICCDWD", 3048 | "GICCDWGD", 3049 | "GICCFCCD", 3050 | "GICCICCD", 3051 | "GICCICD", 3052 | "GICCID", 3053 | "GICCJCCD", 3054 | "GICDCCCD", 3055 | "GICDDWGD", 3056 | "GICDGD", 3057 | "GICDGJCD", 3058 | "GICDID", 3059 | "GICDWD", 3060 | "GICFD", 3061 | "GICFID", 3062 | "GICGCD", 3063 | "GICICCCCCCCCPD", 3064 | "GICICDDGD", 3065 | "GICICDFD", 3066 | "GICIDGD", 3067 | "GICIFD", 3068 | "GICIIFID", 3069 | "GICJDD", 3070 | "GICJDGD", 3071 | "GICJJD", 3072 | "GICKD", 3073 | "GICPCD", 3074 | "GICPID", 3075 | "GICPIDD", 3076 | "GICWCWCWD", 3077 | "GIDCDD", 3078 | "GIDDCD", 3079 | "GIDDGD", 3080 | "GIDDWGD", 3081 | "GIDFCD", 3082 | "GIDGDCD", 3083 | "GIDJJD", 3084 | "GIFCCCD", 3085 | "GIFCD", 3086 | "GIFCJD", 3087 | "GIFFFWFWD", 3088 | "GIFGD", 3089 | "GIFICCCD", 3090 | "GIFID", 3091 | "GIFIDCD", 3092 | "GIFJD", 3093 | "GIFPD", 3094 | "GIFPDCD", 3095 | "GIGCCDMD", 3096 | "GIGGCD", 3097 | "GIGJD", 3098 | "GIGMD", 3099 | "GIICCCD", 3100 | "GIICCDD", 3101 | "GIICCDGD", 3102 | "GIICCDMCD", 3103 | "GIICDD", 3104 | "GIICID", 3105 | "GIIDFCD", 3106 | "GIIDGD", 3107 | "GIIDJCD", 3108 | "GIIFICD", 3109 | "GIIICCD", 3110 | "GIIJD", 3111 | "GIIPCD", 3112 | "GIIPD", 3113 | "GIJCCDD", 3114 | "GIJCCICD", 3115 | "GIJCCJD", 3116 | "GIJCDCD", 3117 | "GIJCDGD", 3118 | "GIJCDWCFD", 3119 | "GIJDCD", 3120 | "GIJICCD", 3121 | "GIJICD", 3122 | "GIJICDGD", 3123 | "GIJIDD", 3124 | "GIJJD", 3125 | "GIJJICJD", 3126 | "GIJPCD", 3127 | "GIJPID", 3128 | "GILGCD", 3129 | "GIMCID", 3130 | "GIMCPD", 3131 | "GIPCCCCCD", 3132 | "GIPCCCDGD", 3133 | "GIPCCDD", 3134 | "GIPDCCCCD", 3135 | "GIPDGD", 3136 | "GIPDWCCD", 3137 | 
"GIPFD", 3138 | "GIPID", 3139 | "GIPJCDD", 3140 | "GIPJD", 3141 | "GIWGFWDGD", 3142 | "GIWGWDD", 3143 | "GJCCCCCDCD", 3144 | "GJCCCCCDD", 3145 | "GJCCCCDD", 3146 | "GJCCCDCD", 3147 | "GJCCCDGCD", 3148 | "GJCCCDGD", 3149 | "GJCCCDLD", 3150 | "GJCCCDWD", 3151 | "GJCCCPD", 3152 | "GJCCDFDD", 3153 | "GJCCID", 3154 | "GJCCPD", 3155 | "GJCCWKWCD", 3156 | "GJCDMCCD", 3157 | "GJCICCD", 3158 | "GJCIWGWCCD", 3159 | "GJCJCDID", 3160 | "GJCKD", 3161 | "GJCLCCCD", 3162 | "GJCMIGD", 3163 | "GJCMWD", 3164 | "GJCPDCCD", 3165 | "GJDCDCD", 3166 | "GJFCCCD", 3167 | "GJFCCDWGCGD", 3168 | "GJFCDCD", 3169 | "GJFCDD", 3170 | "GJFCDWFCD", 3171 | "GJFID", 3172 | "GJGCCCCD", 3173 | "GJGCCCD", 3174 | "GJGCCD", 3175 | "GJGCCDD", 3176 | "GJGCCDID", 3177 | "GJGCMJD", 3178 | "GJGFJCD", 3179 | "GJGJCD", 3180 | "GJGJCJCD", 3181 | "GJGJCKDGD", 3182 | "GJGMCCD", 3183 | "GJGPCCCD", 3184 | "GJGPD", 3185 | "GJICCCCD", 3186 | "GJICCDCD", 3187 | "GJICDD", 3188 | "GJICJD", 3189 | "GJIDD", 3190 | "GJIDWCCCWD", 3191 | "GJIICD", 3192 | "GJIIID", 3193 | "GJIJCCD", 3194 | "GJIJCD", 3195 | "GJIJD", 3196 | "GJIPD", 3197 | "GJJCCDGD", 3198 | "GJJCDCCD", 3199 | "GJJCDCD", 3200 | "GJJCDD", 3201 | "GJJCID", 3202 | "GJJCPD", 3203 | "GJJDDCCD", 3204 | "GJJGCCCCD", 3205 | "GJJICCCCD", 3206 | "GJJICCDCD", 3207 | "GJJICD", 3208 | "GJJID", 3209 | "GJJIPD", 3210 | "GJJPD", 3211 | "GJKJCD", 3212 | "GJKPD", 3213 | "GJMCCCCD", 3214 | "GJMCCCD", 3215 | "GJMCD", 3216 | "GJMICD", 3217 | "GJMJCCCCD", 3218 | "GJMJCD", 3219 | "GJMPCDD", 3220 | "GJPCCCDGD", 3221 | "GJPCCDD", 3222 | "GJPCCDGD", 3223 | "GJPCCDID", 3224 | "GJPCID", 3225 | "GJPDCCD", 3226 | "GJPDDGD", 3227 | "GJPDGD", 3228 | "GJPID", 3229 | "GJPIDID", 3230 | "GJPJCD", 3231 | "GJPJMJCD", 3232 | "GJPPCCD", 3233 | "GJWGWCD", 3234 | "GKCCCCD", 3235 | "GKCCDCD", 3236 | "GKCCDD", 3237 | "GKCCDWD", 3238 | "GKCDDICD", 3239 | "GKCID", 3240 | "GKCJDDWD", 3241 | "GKCWWD", 3242 | "GKDCCD", 3243 | "GKDID", 3244 | "GKGDGCDD", 3245 | "GKGPDD", 3246 | "GKICD", 3247 | "GKJCCDGCD", 3248 | "GKJCCID", 3249 | "GKJCD", 3250 | "GKJD", 3251 | "GKJID", 3252 | "GKJIPCD", 3253 | "GKPCDFJD", 3254 | "GKPCJD", 3255 | "GKWD", 3256 | "GLCCCCCID", 3257 | "GLCCCDD", 3258 | "GLCDCGCD", 3259 | "GLCICD", 3260 | "GLCID", 3261 | "GLCJID", 3262 | "GLCMCICD", 3263 | "GLCPCCDGD", 3264 | "GLCWGWDGCD", 3265 | "GLFCCD", 3266 | "GLFD", 3267 | "GLFJJCD", 3268 | "GLFKCCCD", 3269 | "GLGCCD", 3270 | "GLGCDDFCD", 3271 | "GLGJCCD", 3272 | "GLICCCCD", 3273 | "GLICCCD", 3274 | "GLICCDD", 3275 | "GLIFD", 3276 | "GLIID", 3277 | "GLIJD", 3278 | "GLIWDWD", 3279 | "GLJCCPCJD", 3280 | "GLJCDD", 3281 | "GLJCID", 3282 | "GLJDCD", 3283 | "GLJGCCD", 3284 | "GLJGCDGD", 3285 | "GLKCIID", 3286 | "GLMCCDD", 3287 | "GLMCD", 3288 | "GLPCCCCD", 3289 | "GLPCCDD", 3290 | "GLPCCDDFD", 3291 | "GLPICD", 3292 | "GLPID", 3293 | "GLPJCD", 3294 | "GLPPCCDD", 3295 | "GLPWIWD", 3296 | "GMCCCDD", 3297 | "GMCCDCCCD", 3298 | "GMCCDCMD", 3299 | "GMCCDMD", 3300 | "GMCCDWD", 3301 | "GMCID", 3302 | "GMCIID", 3303 | "GMCKCD", 3304 | "GMDPDID", 3305 | "GMFCCD", 3306 | "GMFCD", 3307 | "GMFCDCDGD", 3308 | "GMFICD", 3309 | "GMGCD", 3310 | "GMGCDCD", 3311 | "GMICID", 3312 | "GMIID", 3313 | "GMJCCCD", 3314 | "GMMCDGDD", 3315 | "GMMICCD", 3316 | "GMPCCCD", 3317 | "GMPDGCD", 3318 | "GMPFD", 3319 | "GMPICD", 3320 | "GMPID", 3321 | "GMPMCCCD", 3322 | "GPCCCCCD", 3323 | "GPCCCCDGD", 3324 | "GPCCCDGD", 3325 | "GPCCCDLD", 3326 | "GPCCCID", 3327 | "GPCCCPD", 3328 | "GPCCDGD", 3329 | "GPCCDWD", 3330 | "GPCCGD", 3331 | "GPCCICD", 3332 | "GPCCID", 3333 | "GPCCJD", 3334 | "GPCCWCWCD", 3335 | 
"GPCDCJD", 3336 | "GPCDDCD", 3337 | "GPCDGFCD", 3338 | "GPCDIID", 3339 | "GPCFCCCDGD", 3340 | "GPCFD", 3341 | "GPCGCD", 3342 | "GPCICD", 3343 | "GPCIDD", 3344 | "GPCIDWCWD", 3345 | "GPCIID", 3346 | "GPCIJCD", 3347 | "GPCIPD", 3348 | "GPCJD", 3349 | "GPCKD", 3350 | "GPCPCD", 3351 | "GPCPCDD", 3352 | "GPCPD", 3353 | "GPCWWD", 3354 | "GPFCCDWD", 3355 | "GPFCCWD", 3356 | "GPFCDD", 3357 | "GPFCWFWD", 3358 | "GPFDD", 3359 | "GPFICD", 3360 | "GPFIGGPCD", 3361 | "GPFJCD", 3362 | "GPFJD", 3363 | "GPFJDCD", 3364 | "GPFPCDCD", 3365 | "GPFPD", 3366 | "GPGCCCCD", 3367 | "GPGCCD", 3368 | "GPGCD", 3369 | "GPGCDWGD", 3370 | "GPGCGCD", 3371 | "GPGCIICD", 3372 | "GPGCPCCD", 3373 | "GPGFFCD", 3374 | "GPGICCD", 3375 | "GPGICD", 3376 | "GPGID", 3377 | "GPGJCD", 3378 | "GPGJD", 3379 | "GPGPCGD", 3380 | "GPGPDD", 3381 | "GPGPGJCCD", 3382 | "GPGPICD", 3383 | "GPICCCCDGD", 3384 | "GPICDD", 3385 | "GPICDGD", 3386 | "GPICICD", 3387 | "GPIDFGD", 3388 | "GPIICD", 3389 | "GPIICDGD", 3390 | "GPIID", 3391 | "GPIJCCD", 3392 | "GPIJCD", 3393 | "GPIJCDD", 3394 | "GPIPCCCD", 3395 | "GPIPCCD", 3396 | "GPJCCCCD", 3397 | "GPJCCDD", 3398 | "GPJCDCPD", 3399 | "GPJCDFD", 3400 | "GPJCDGCD", 3401 | "GPJCDGD", 3402 | "GPJCID", 3403 | "GPJCIDD", 3404 | "GPJCJCD", 3405 | "GPJCPCCD", 3406 | "GPJFCCD", 3407 | "GPJFIDD", 3408 | "GPJICD", 3409 | "GPJID", 3410 | "GPJPCD", 3411 | "GPJPD", 3412 | "GPKD", 3413 | "GPLCCCCD", 3414 | "GPLCCD", 3415 | "GPLCD", 3416 | "GPLICD", 3417 | "GPLID", 3418 | "GPMCCCD", 3419 | "GPMCCD", 3420 | "GPMCD", 3421 | "GPMD", 3422 | "GPMDJCD", 3423 | "GPPCCDD", 3424 | "GPPCCDWD", 3425 | "GPPCDWCCCD", 3426 | "GPPDDD", 3427 | "GPPFD", 3428 | "GPPGCCCD", 3429 | "GPPGD", 3430 | "GPPGDGD", 3431 | "GPPICD", 3432 | "GPPIID", 3433 | "GPPIJD", 3434 | "GPPJCCD", 3435 | "GPPJCDFCD", 3436 | "GPPJDCD", 3437 | "GPPMJCD", 3438 | "GPPPCD", 3439 | "GPPPICD", 3440 | "GPWCCD", 3441 | "GPWGWCDGD", 3442 | "GWCCCCD", 3443 | "GWCWJD", 3444 | "GWGPPD", 3445 | "GWGWCCD", 3446 | "GWICCD", 3447 | "GWJD", 3448 | "GWPCPWD", 3449 | "ICCCDFPCCFGCCD", 3450 | "ICCCDGD", 3451 | "ICCCDGJD", 3452 | "ICCCID", 3453 | "ICCCWGWD", 3454 | "ICCCWGWDGD", 3455 | "ICCFJCWGFJCCD", 3456 | "ICCICCD", 3457 | "ICCWGWD", 3458 | "ICDCCCCD", 3459 | "ICDCDCD", 3460 | "ICDGD", 3461 | "ICDWGD", 3462 | "ICFCD", 3463 | "ICFCJCD", 3464 | "ICFDID", 3465 | "ICGD", 3466 | "ICGGD", 3467 | "ICICCD", 3468 | "ICIDWID", 3469 | "ICIWD", 3470 | "ICJCCD", 3471 | "ICJWGWCCD", 3472 | "ICWGWDD", 3473 | "IDDGCD", 3474 | "IDGD", 3475 | "IFCD", 3476 | "IFCICCDGD", 3477 | "IFGD", 3478 | "IFICCD", 3479 | "IFIDWGD", 3480 | "IFKD", 3481 | "IGCCCCDCCD", 3482 | "IGFGJCGDD", 3483 | "IGGDFCD", 3484 | "IGGPCGCD", 3485 | "IGICCD", 3486 | "IIGCD", 3487 | "IIICD", 3488 | "IIID", 3489 | "IIJD", 3490 | "IIWGWCD", 3491 | "IIWGWD", 3492 | "IJCCDGD", 3493 | "IJCCWGWD", 3494 | "IJCDCCD", 3495 | "IJCDD", 3496 | "IJDD", 3497 | "IJGCCD", 3498 | "IJPCCD", 3499 | "IJPCD", 3500 | "IKCD", 3501 | "IMCCCCD", 3502 | "IPCCCCD", 3503 | "IPCGD", 3504 | "IPFD", 3505 | "IPJCD", 3506 | "IPPD", 3507 | "IPPDCD", 3508 | "IPPJCD", 3509 | "IPWGWCCD", 3510 | "IPWGWCD", 3511 | "IWGMFCCDGD", 3512 | "IWGWCCD", 3513 | "IWGWCD", 3514 | "IWGWD", 3515 | "IWGWID", 3516 | "JCCCWFWD", 3517 | "JCCGD", 3518 | "JCCPD", 3519 | "JCCWGWD", 3520 | "JCDCWMWMWCCCD", 3521 | "JCDDWD", 3522 | "JCGDWCPWD", 3523 | "JCIDGD", 3524 | "JCIWGWD", 3525 | "JCJGCD", 3526 | "JCPCD", 3527 | "JCWGWCDWD", 3528 | "JDGCCD", 3529 | "JDWGWCD", 3530 | "JFCCD", 3531 | "JFCCWGWDD", 3532 | "JFID", 3533 | "JFJCCD", 3534 | "JGCCD", 3535 | "JGCCGCD", 3536 | "JGCDFCD", 3537 | 
"JGCFCCCD", 3538 | "JGFD", 3539 | "JGICD", 3540 | "JGID", 3541 | "JGJDCD", 3542 | "JGMCCD", 3543 | "JGPCCD", 3544 | "JICCD", 3545 | "JIGPCD", 3546 | "JIICD", 3547 | "JIPCD", 3548 | "JJCCCDGD", 3549 | "JJCCJD", 3550 | "JJCGD", 3551 | "JJCID", 3552 | "JJCWGWCCDFGCD", 3553 | "JJCWGWD", 3554 | "JJLWGWCD", 3555 | "JJPJJD", 3556 | "JJPPCD", 3557 | "JPCCCCWGWD", 3558 | "JPCCCD", 3559 | "JPIWGWID", 3560 | "JPJJD", 3561 | "JPWWD", 3562 | "JWWFWCD", 3563 | "KCCCCDD", 3564 | "KCCDGD", 3565 | "KCDGCD", 3566 | "KCDGD", 3567 | "KCFCD", 3568 | "KCICCD", 3569 | "KCJCCCDGD", 3570 | "KFCCD", 3571 | "KGCDCCCD", 3572 | "KGCDD", 3573 | "KGCDGD", 3574 | "KGCGCD", 3575 | "KGGCDID", 3576 | "KIDJCCD", 3577 | "KIICD", 3578 | "KJGCD", 3579 | "KLPCCD", 3580 | "KPFCCD", 3581 | "KPGCD", 3582 | "KPICD", 3583 | "KPKICD", 3584 | "LCCCDGCD", 3585 | "LCGFCD", 3586 | "LCWGWCFDD", 3587 | "LCWGWICD", 3588 | "LFCCD", 3589 | "LFCWGWCD", 3590 | "LFJD", 3591 | "LGCCCD", 3592 | "LGCCD", 3593 | "LGCICD", 3594 | "LGCPJCD", 3595 | "LGFCCD", 3596 | "LGFCD", 3597 | "LGFD", 3598 | "LGFICD", 3599 | "LGFID", 3600 | "LGFJD", 3601 | "LGFPD", 3602 | "LGICCDGCD", 3603 | "LGICD", 3604 | "LGID", 3605 | "LGIICD", 3606 | "LGJCCD", 3607 | "LGJCID", 3608 | "LGLIDD", 3609 | "LGPCCD", 3610 | "LGPCD", 3611 | "LGPDDWD", 3612 | "LGPJPD", 3613 | "LIJD", 3614 | "LIWGWD", 3615 | "LJFCCD", 3616 | "LJFGCD", 3617 | "LKGPD", 3618 | "LKPJCCD", 3619 | "LMPJIPCCD", 3620 | "LPCGDGD", 3621 | "LPPWCCD", 3622 | "MCCID", 3623 | "MCCWGWD", 3624 | "MCJCCD", 3625 | "MCWD", 3626 | "MJCWGWD", 3627 | "MPCFCD", 3628 | "MPCGD", 3629 | "PCCGGCCCCD", 3630 | "PCCKD", 3631 | "PCDDDCCD", 3632 | "PCJD", 3633 | "PCJJDD", 3634 | "PCLGCCD", 3635 | "PCWGWCD", 3636 | "PCWGWID", 3637 | "PCWGWPDWD", 3638 | "PFCCCWGWD", 3639 | "PFCCJICCD", 3640 | "PFCD", 3641 | "PFCPCDCD", 3642 | "PFCWGWCD", 3643 | "PFID", 3644 | "PFPID", 3645 | "PGCCD", 3646 | "PGCCDGD", 3647 | "PGCID", 3648 | "PGCJCJD", 3649 | "PGFCD", 3650 | "PGJCCD", 3651 | "PGJICD", 3652 | "PGPCCCJD", 3653 | "PGPCCD", 3654 | "PGPD", 3655 | "PGPID", 3656 | "PIGFCD", 3657 | "PIWGWCCD", 3658 | "PJCCCD", 3659 | "PJCDD", 3660 | "PJCWGWDID", 3661 | "PJDFFD", 3662 | "PLDWGWCDCGD", 3663 | "PPCCCDD", 3664 | "PPCCD", 3665 | "PPCCDGD", 3666 | "PPCCGWD", 3667 | "PPCCWGWD", 3668 | "PPCID", 3669 | "PPCPWCCCWCDD", 3670 | "PPCWGWD", 3671 | "PPDD", 3672 | "PPFWGWCD", 3673 | "PPICD", 3674 | "PPIDD", 3675 | "PPLWGWCCD", 3676 | "PPWGWCD", 3677 | "PPWGWD", 3678 | "PWCCCD", 3679 | "PWCWCD", 3680 | "PWFWCD", 3681 | "PWWJWGWD", 3682 | "WCCCD", 3683 | "WCJWD", 3684 | "WFFPCWJD", 3685 | "WFPWPWCCD", 3686 | "WFWD", 3687 | "WGCJDWFCCCD", 3688 | "WJCCD", 3689 | "WPWJD", 3690 | "WWIWWCWD", 3691 | ] 3692 | -------------------------------------------------------------------------------- /yaya/config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'tony' 3 | import os 4 | 5 | DICT_BIN_EXT = '.ya' 6 | DICT_BIN_REVERSE_EXT = '.reverse.ya' 7 | DATA_ROOT = "/home/tony/MyProject/YaYaNLP/data" 8 | 9 | CUSTOM_DICT_NAME = [os.path.join(DATA_ROOT, "dictionary", "custom", f) for f in [ 10 | u"CustomDictionary.txt", 11 | u"上海地名.txt", 12 | u"人名词典.txt", 13 | u"全国地名大全.txt", 14 | u"机构名词典.txt", 15 | u"现代汉语补充词库.txt"]] 16 | 17 | CORE_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "CoreNatureDictionary.txt") 18 | CORE_BIGRAM_NAME = os.path.join(DATA_ROOT, "dictionary", "CoreNatureDictionary.ngram.txt") 19 | CORE_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "person", "CoreNatureDictionary.tr.txt") 20 | 21 | 
CHAR_TYPE_PATH = os.path.join(DATA_ROOT, "dictionary", "other", "CharType.dat.yes") 22 | 23 | PERSON_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "person", "nr.tr.txt") 24 | PERSON_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "person", "nr.txt") 25 | 26 | ORG_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "organization", "nt.tr.txt") 27 | ORG_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "organization", "nt.txt") 28 | 29 | PLACE_TR_PATH = os.path.join(DATA_ROOT, "dictionary", "place", "ns.tr.txt") 30 | PLACE_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "place", "ns.txt") 31 | 32 | TRADITIONAL_CHINESE_DICT_NAME = os.path.join(DATA_ROOT, "dictionary", "tc", "TraditionalChinese.txt") 33 | 34 | # Global configuration 35 | class _Config: 36 | # Whether to prefer the cached binary dictionaries 37 | use_dict_cache = True 38 | 39 | # Whether to use the custom user dictionaries 40 | use_custom_dict = True 41 | 42 | # Chinese person name recognition 43 | name_recognize = True 44 | 45 | # Place name recognition 46 | place_recognize = True 47 | 48 | # Organization name recognition 49 | org_recognize = True 50 | 51 | debug = True 52 | 53 | 54 | Config = _Config() 55 | -------------------------------------------------------------------------------- /yaya/const.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | 4 | __author__ = 'tony' 5 | 6 | logger = logging.getLogger("YaYaNLP") 7 | 8 | # Arithmetic constants 9 | DOUBLE_MAX = 1.7976931348623157e+308 10 | 11 | # Predefined tag constants 12 | TAG_PLACE = u"未##地" 13 | TAG_BIGIN = u"始##始" 14 | TAG_OTHER = u"未##它" 15 | TAG_GROUP = u"未##团" 16 | TAG_NUMBER = u"未##数" 17 | TAG_QUANTIFIER = u"未##量" 18 | TAG_PROPER = u"未##专" 19 | TAG_TIME = u"未##时" 20 | TAG_CLUSTER = u"未##串" 21 | TAG_END = u"末##末" 22 | TAG_PEOPLE = u"未##人" 23 | 24 | # Total word frequency of the core dictionary 25 | MAX_FREQUENCY = 25146057 26 | SMOOTHING_FACTOR = 1.0 / MAX_FREQUENCY + 0.00001 27 | SMOOTHING_PARAM = 0.1 28 | -------------------------------------------------------------------------------- /yaya/dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /yaya/dictionary/chinese_traditional_dict.py: -------------------------------------------------------------------------------- 1 | from yaya.collection.dict import DoubleArrayTrie 2 | from yaya import config 3 | from yaya.utility.singleton import singleton 4 | 5 | __author__ = 'tony' 6 | 7 | 8 | class ChinseTraditionalBaseDict: 9 | def convert_key_to_value(self, text): 10 | search = self.trie.search(text) 11 | wordnet = [None] * search.array_length 12 | lennet = [0] * search.array_length 13 | for i, k, v in search.search_all_words(): 14 | if len(v[1]) > lennet[i]: 15 | wordnet[i] = v[1] 16 | lennet[i] = len(k) 17 | offset = 0 18 | valuetext = [] 19 | while offset < search.array_length: 20 | if wordnet[offset] is None: 21 | valuetext.append(search.char_array[offset]) 22 | offset += 1 23 | else: 24 | valuetext.append(wordnet[offset]) 25 | offset += lennet[offset] 26 | return "".join(valuetext) 27 | 28 | 29 | @singleton 30 | class SimplifiedChineseDict(ChinseTraditionalBaseDict): 31 | def __init__(self): 32 | self.trie = DoubleArrayTrie.load(config.TRADITIONAL_CHINESE_DICT_NAME, 33 | lambda i: i[i.find(u'=') + 1:], 34 | lambda i: i.split('=')[::-1], 35 | dict_bin_ext=config.DICT_BIN_REVERSE_EXT) 36 | self.trie.get_attr = lambda v: v 37 | 38 | def convert_simplified_to_traditional(self, text): 39 | return self.convert_key_to_value(text) 40 | 41 | 42 | @singleton 43 | class 
TraditionalChineseDict(ChinseTraditionalBaseDict): 44 | def __init__(self): 45 | self.trie = DoubleArrayTrie.load(config.TRADITIONAL_CHINESE_DICT_NAME, 46 | lambda i: i[:i.find(u'=')], 47 | lambda i: i.split('=')) 48 | self.trie.get_attr = lambda v: v 49 | 50 | def convert_traditional_to_simplified(self, text): 51 | return self.convert_key_to_value(text) 52 | -------------------------------------------------------------------------------- /yaya/dictionary/org_dict.py: -------------------------------------------------------------------------------- 1 | from yaya import config 2 | from yaya.collection.dict import DoubleArrayTrie 3 | from yaya.collection.hmm import HMMMatrix 4 | from yaya.common.nt import NTPattern, NT 5 | from yaya.utility.singleton import singleton 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | @singleton 11 | class OrgDict: 12 | def __init__(self): 13 | self.trie = DoubleArrayTrie.load(config.ORG_DICT_NAME, enum_cls=NT) 14 | self.matrix = HMMMatrix.load(config.ORG_TR_PATH, NT) 15 | 16 | 17 | @singleton 18 | class NTPatternDict: 19 | def __init__(self): 20 | self.trie = DoubleArrayTrie() 21 | NTPattern.sort() 22 | self.trie.build(key=NTPattern) 23 | -------------------------------------------------------------------------------- /yaya/dictionary/person_dict.py: -------------------------------------------------------------------------------- 1 | from yaya import config 2 | from yaya.collection.dict import DoubleArrayTrie 3 | from yaya.collection.hmm import HMMMatrix 4 | from yaya.common.nr import NRPattern, NR 5 | from yaya.utility.singleton import singleton 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | @singleton 11 | class PersonDict: 12 | def __init__(self): 13 | self.trie = DoubleArrayTrie.load(config.PERSON_DICT_NAME, enum_cls=NR) 14 | self.matrix = HMMMatrix.load(config.PERSON_TR_PATH, NR) 15 | 16 | 17 | @singleton 18 | class NRPatternDict: 19 | def __init__(self): 20 | self.trie = DoubleArrayTrie() 21 | NRPattern.sort() 22 | self.trie.build(key=NRPattern) -------------------------------------------------------------------------------- /yaya/dictionary/place_dict.py: -------------------------------------------------------------------------------- 1 | from yaya.common.ns import NS, NSPattern 2 | from yaya import config 3 | from yaya.collection.dict import DoubleArrayTrie 4 | from yaya.collection.hmm import HMMMatrix 5 | from yaya.utility.singleton import singleton 6 | 7 | __author__ = 'tony' 8 | 9 | 10 | @singleton 11 | class PlaceDict: 12 | def __init__(self): 13 | self.trie = DoubleArrayTrie.load(config.PLACE_DICT_NAME, enum_cls=NS) 14 | self.matrix = HMMMatrix.load(config.PLACE_TR_PATH, NS) 15 | 16 | 17 | @singleton 18 | class NSPatternDict: 19 | def __init__(self): 20 | self.trie = DoubleArrayTrie() 21 | NSPattern.sort() 22 | self.trie.build(key=NSPattern) 23 | -------------------------------------------------------------------------------- /yaya/recognition/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | 3 | -------------------------------------------------------------------------------- /yaya/recognition/organization_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.collection.dict import Attribute, ORG_ATTRIBUTE 3 | from yaya.collection.hmm import OrgTranMatrix 4 | from yaya.common.nature import NATURE 5 | from yaya.common.nt import NT 6 | from yaya.dictionary.org_dict import NTPatternDict, OrgDict 7 | from 
yaya.recognition.recognition import role_viterbi 8 | from yaya.seg.viterbi import viterbi_standard 9 | 10 | __author__ = 'tony' 11 | 12 | 13 | def recognition(vertexs, wordnet_optimum, wordnet_all): 14 | # Tag the organization roles, then run one Viterbi pass over the role tags 15 | return role_viterbi(vertexs, wordnet_optimum, 16 | hmm=OrgTranMatrix().hmm, 17 | trie=NTPatternDict().trie, 18 | recognition_attr=ORG_ATTRIBUTE, 19 | tag_func=role_tag, 20 | viterbi_fun=viterbi_standard 21 | ) 22 | 23 | def role_tag(word_seg_list): 24 | tag_index_list = [] 25 | for vertex in word_seg_list: 26 | nature = vertex.nature 27 | if nature == NATURE.nz: 28 | if vertex.attribute.total_frequency <= 1000: 29 | tag_index_list.append(Attribute([str(NT.F), 1000], cls=NT)) 30 | else: 31 | break 32 | continue 33 | elif nature in [NATURE.ni, 34 | NATURE.nic, 35 | NATURE.nis, 36 | NATURE.nit]: 37 | tag_index_list.append(Attribute([str(NT.K), 1000, str(NT.D), 1000], cls=NT)) 38 | continue 39 | elif nature == NATURE.m: 40 | tag_index_list.append(Attribute([str(NT.M), 1000], cls=NT)) 41 | continue 42 | 43 | index, value = OrgDict().trie.get(vertex.word) 44 | if value is None: 45 | value = Attribute([str(NT.Z), OrgDict().matrix.get_total_freq(NT.Z)], cls=NT) 46 | 47 | tag_index_list.append(value) 48 | 49 | return tag_index_list 50 | --------------------------------------------------------------------------------
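The NT role tags produced by role_tag above are joined into a single string and matched against the pattern trie (presumably built from the long NTPattern list earlier in this dump, which does contain an entry "GPJD"). A minimal illustration of that matching step, with hypothetical tags:

# Illustrative only: role_viterbi (see recognition.py below) concatenates
# one tag per vertex, e.g. tag_str = "GPJDW" for five vertexes, and scans
# it with a Searcher over NTPatternDict().trie. A hit on the pattern
# "GPJD" starting at vertex 0 merges vertexes 0..3 into a single
# organization vertex carrying ORG_ATTRIBUTE.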
/yaya/recognition/person_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.collection.dict import Attribute, PERSON_ATTRIBUTE 3 | from yaya.collection.hmm import PersonTranMatrix 4 | from yaya.common.nature import NATURE 5 | from yaya.common.nr import NR 6 | from yaya.dictionary.person_dict import PersonDict, NRPatternDict 7 | from yaya.recognition.recognition import role_viterbi 8 | from yaya.seg.wordnet import Vertex 9 | 10 | __author__ = 'tony' 11 | 12 | def recognition(vertexs, wordnet_optimum, wordnet_all): 13 | return role_viterbi(vertexs, wordnet_optimum, 14 | hmm=PersonTranMatrix().hmm, 15 | trie=NRPatternDict().trie, 16 | recognition_attr=PERSON_ATTRIBUTE, 17 | tag_func=role_tag 18 | ) 19 | 20 | 21 | def role_tag(word_seg_list): 22 | tag_index_list = [] 23 | for vertex in word_seg_list: 24 | if vertex.nature == NATURE.nr and vertex.attribute.total_frequency <= 1000: 25 | if len(vertex.real_word) == 2: 26 | tag_index_list.append(Attribute(attr=(NR.X, 1, NR.G, 1), cls=NR)) 27 | continue 28 | 29 | index, value = PersonDict().trie.get(vertex.real_word) 30 | 31 | if value is None: 32 | value = Attribute([str(NR.A), PersonDict().matrix.get_total_freq(NR.A)], cls=NR) 33 | 34 | tag_index_list.append(value) 35 | return tag_index_list 36 | 37 | 38 | def parse_pattern(tag_str, vertexs, wordnet_optimum, wordnet_all): 39 | new_tag_list = [] 40 | new_vertexs = [] 41 | for i, t in enumerate(tag_str): 42 | if t == str(NR.U): 43 | new_tag_list.append(str(NR.K)) 44 | new_tag_list.append(str(NR.B)) 45 | word_K = vertexs[i].real_word[:-1] 46 | word_B = vertexs[i].real_word[-1] 47 | new_vertexs.append(Vertex(word_K)) 48 | new_vertexs.append(Vertex(word_B)) 49 | elif t == str(NR.V): 50 | if tag_str[i - 1] == str(NR.B): 51 | new_tag_list.append(str(NR.E)) 52 | else: 53 | new_tag_list.append(str(NR.D)) 54 | new_tag_list.append(str(NR.L)) 55 | word_ED = vertexs[i].real_word[:-1] 56 | word_L = vertexs[i].real_word[-1] 57 | new_vertexs.append(Vertex(word_ED)) 58 | new_vertexs.append(Vertex(word_L)) 59 | else: 60 | new_tag_list.append(t) 61 | new_vertexs.append(vertexs[i]) 62 | return "".join(new_tag_list), new_vertexs 63 | --------------------------------------------------------------------------------
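A worked illustration of the U/V rewriting in parse_pattern above (the tags and the word are hypothetical):

# Given tag_str = "KUL" where vertexs[1].real_word == u"XY" is tagged U
# (a K role and a B role fused inside one word), parse_pattern returns
# tag_str = "KKBL": u"XY" is split into Vertex(u"X") tagged K and
# Vertex(u"Y") tagged B, so an NR pattern can now match at the boundary
# inside the original word. The V branch splits an E/D role plus an L
# role the same way.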
/yaya/recognition/place_recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.collection.dict import Attribute, PLACE_ATTRIBUTE 3 | from yaya.collection.hmm import PlaceTranMatrix 4 | from yaya.common.nature import NATURE 5 | from yaya.common.ns import NS 6 | from yaya.dictionary.place_dict import NSPatternDict, PlaceDict 7 | from yaya.recognition.recognition import role_viterbi 8 | 9 | __author__ = 'tony' 10 | 11 | 12 | def recognition(vertexs, wordnet_optimum, wordnet_all): 13 | return role_viterbi(vertexs, wordnet_optimum, 14 | hmm=PlaceTranMatrix().hmm, 15 | trie=NSPatternDict().trie, 16 | recognition_attr=PLACE_ATTRIBUTE, 17 | tag_func=role_tag 18 | ) 19 | 20 | 21 | def role_tag(word_seg_list): 22 | tag_index_list = [] 23 | for vertex in word_seg_list: 24 | if vertex.nature == NATURE.ns and vertex.attribute.total_frequency <= 1000: 25 | if len(vertex.real_word) < 3: 26 | tag_index_list.append(Attribute("%s 1 %s 1" % (NS.H, NS.G), NS)) 27 | continue 28 | index, value = PlaceDict().trie.get(vertex.real_word) 29 | if value is None: 30 | value = Attribute([str(NS.Z), PlaceDict().matrix.get_total_freq(NS.Z)], cls=NS) 31 | tag_index_list.append(value) 32 | return tag_index_list 33 | -------------------------------------------------------------------------------- /yaya/recognition/recognition.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.collection.dict import Searcher 3 | from yaya.seg.viterbi import viterbi, viterbi_template 4 | from yaya.seg.wordnet import Vertex 5 | from yaya.config import Config 6 | __author__ = 'tony' 7 | 8 | 9 | def role_viterbi(vertexs, wordnet_optimum, hmm, trie, recognition_attr, tag_func, viterbi_fun=viterbi_template): 10 | tag_list = tag_func(vertexs) 11 | if Config.debug: 12 | sb = [] 13 | for i, tag in enumerate(tag_list): 14 | sb.append(u"[ %s %s ]" % (vertexs[i].real_word, tag)) 15 | print u"role observations: %s" % u"".join(sb) 16 | 17 | tag_list = viterbi_fun(tag_list, hmm) 18 | if Config.debug: 19 | sb = [] 20 | for i, tag in enumerate(tag_list): 21 | sb.append(u"%s/%s" % (vertexs[i].real_word, tag)) 22 | print(u"role tagging: [%s]" % u", ".join(sb)) 23 | 24 | tag_str = [str(x) for x in tag_list] 25 | tag_str = ''.join(tag_str) 26 | search = Searcher(trie, tag_str) 27 | vertexs_offset = [0] * len(vertexs) 28 | offset = 1 29 | # skip the head and tail sentinel vertexes 30 | for i, v in enumerate(vertexs[1:-1]): 31 | vertexs_offset[i + 1] = offset 32 | offset += len(vertexs[i + 1].real_word) 33 | while search.next(): 34 | name_str = "" 35 | for i in range(search.begin, search.begin + len(search.key)): 36 | name_str += vertexs[i].real_word 37 | 38 | # insert the merged entity into the word net 39 | vertex = Vertex(name_str, attribute=recognition_attr) 40 | wordnet_optimum.add(vertexs_offset[search.begin], vertex) 41 | vertexs = viterbi(wordnet_optimum.vertexs) 42 | return vertexs 43 | --------------------------------------------------------------------------------
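A worked example of the offset bookkeeping in role_viterbi (the vertexes are hypothetical): with real words [u"始##始", u"北京", u"大学", u"末##末"], the loop over vertexs[1:-1] produces vertexs_offset == [0, 1, 3, 0]:

# offset starts at 1 because line 0 of a WordNet holds the head sentinel;
# u"北京" begins at line 1, and u"大学" at line 1 + len(u"北京") == 3.
# A pattern hit with search.begin == 2 therefore adds the merged vertex
# at line vertexs_offset[2] == 3 of wordnet_optimum.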
/yaya/seg/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tony' 2 | -------------------------------------------------------------------------------- /yaya/seg/segment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from yaya.dictionary.chinese_traditional_dict import SimplifiedChineseDict, TraditionalChineseDict 3 | from yaya.recognition import place_recognition 4 | from yaya.config import Config 5 | from yaya.recognition import person_recognition 6 | from yaya.recognition import organization_recognition 7 | from yaya.seg.viterbi import viterbi 8 | from yaya.seg.wordnet import WordNet, gen_word_net, combine_by_custom_dict 9 | 10 | __author__ = 'tony' 11 | 12 | 13 | def vertexs_to_terms(vertexs, word_only=False): 14 | terms = [] 15 | offset = 0 16 | if word_only: 17 | terms = [v.real_word for v in vertexs] 18 | else: 19 | for v in vertexs[1:-1]: 20 | terms.append((v.real_word, str(v.nature), offset)) 21 | offset += len(v.real_word) 22 | return terms 23 | 24 | 25 | def seg_to_vertexs(text): 26 | word_net = WordNet(text) 27 | 28 | # build the coarse word net 29 | gen_word_net(text, word_net) 30 | 31 | if Config.debug: 32 | print(u"coarse word net:") 33 | print(unicode(word_net)) 34 | 35 | # first Viterbi pass 36 | vertexs = viterbi(word_net.vertexs) 37 | if Config.use_custom_dict: 38 | vertexs = combine_by_custom_dict(vertexs) 39 | word_net_optimum = WordNet(text, vertexs=vertexs) 40 | 41 | if Config.name_recognize: 42 | person_recognition.recognition(vertexs, word_net_optimum, word_net) 43 | 44 | if Config.place_recognize: 45 | place_recognition.recognition(vertexs, word_net_optimum, word_net) 46 | 47 | if Config.debug: 48 | print(u"word net after person and place recognition:") 49 | print(unicode(word_net_optimum)) 50 | 51 | vertexs = viterbi(word_net_optimum.vertexs) 52 | 53 | if Config.org_recognize: 54 | word_net_optimum = WordNet(text, vertexs=vertexs) 55 | vertexs = organization_recognition.recognition(vertexs, word_net_optimum, word_net) 56 | 57 | if Config.debug: 58 | print(u"word net after organization recognition:") 59 | print(unicode(word_net_optimum)) 60 | return vertexs 61 | 62 | 63 | def seg(text): 64 | return vertexs_to_terms(seg_to_vertexs(text)) 65 | 66 | 67 | def traditional_seg(text): 68 | simplified = TraditionalChineseDict().convert_traditional_to_simplified(text) 69 | return seg(simplified) 70 | 71 | def simplified_to_traditional(text): 72 | return SimplifiedChineseDict().convert_simplified_to_traditional(text) 73 | 74 | 75 | def traditional_to_simplified(text): 76 | return TraditionalChineseDict().convert_traditional_to_simplified(text) 77 | --------------------------------------------------------------------------------
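A quick smoke test of the pipeline above, in the spirit of demo/demo_segment.py (the sample sentence is arbitrary, and the printed natures depend on the dictionaries under config.DATA_ROOT):

# coding=utf-8
from yaya.seg.segment import seg

if __name__ == '__main__':
    # seg() returns (word, nature, offset) triples, per vertexs_to_terms
    for word, nature, offset in seg(u"我们去北京大学"):
        print word, nature, offset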
/yaya/seg/viterbi.py: -------------------------------------------------------------------------------- 1 | # -*- encoding:utf-8 -*- 2 | from __future__ import unicode_literals 3 | import math 4 | 5 | from yaya.const import DOUBLE_MAX 6 | from yaya.config import Config 7 | 8 | __author__ = 'tony' 9 | 10 | 11 | class Viterbi: 12 | @staticmethod 13 | def computer(obs, states, start_p, trans_p, emit_p): 14 | # all scores are costs (negative log probabilities), so smaller is better 15 | max_states_value = 0 16 | for s in states: 17 | max_states_value = max(max_states_value, s) 18 | max_states_value += 1 19 | 20 | # V[t][y] is the minimal cost of any path ending in state y at time t 21 | V = [[DOUBLE_MAX for col in range(max_states_value)] for row in range(len(obs))] 22 | path = {} 23 | 24 | for y in states: 25 | V[0][y] = start_p[y] + emit_p[y][obs[0]] 26 | path[y] = [y] 27 | 28 | for t in range(1, len(obs)): 29 | new_path = {} 30 | for y in states: 31 | prob = DOUBLE_MAX 32 | state = y 33 | for y0 in states: 34 | nprob = V[t - 1][y0] + trans_p[y0][y] + emit_p[y][obs[t]] 35 | if nprob < prob: 36 | prob = nprob 37 | state = y0 38 | V[t][y] = prob 39 | new_path[y] = path[state] + [y] 40 | path = new_path 41 | 42 | prob = DOUBLE_MAX 43 | state = states[0] 44 | for y in states: 45 | if V[-1][y] < prob: 46 | prob = V[-1][y] 47 | state = y 48 | return path[state] 49 | 50 | 51 | def viterbi(vertexs): 52 | for v in vertexs[1]: 53 | v.update_from(vertexs[0][0]) 54 | for i in range(1, len(vertexs) - 1): 55 | node_array = vertexs[i] 56 | if node_array is None: 57 | continue 58 | for node in node_array: 59 | if node.vertex_from is None: 60 | continue 61 | for node_to in vertexs[i + len(node.real_word)]: 62 | node_to.update_from(node) 63 | vertex_from = vertexs[-1][0] 64 | vertex_list = [] 65 | while vertex_from is not None: 66 | vertex_list.insert(0, vertex_from) 67 | vertex_from = vertex_from.vertex_from 68 | return vertex_list 69 | 70 | def viterbi_roletag(roletaglist, hmm): 71 | _length = len(roletaglist) 72 | taglist = [] 73 | # seed with the nature of the first element's first tag 74 | _pre_nature = roletaglist[0].nature 75 | _perfect_nature = _pre_nature 76 | taglist.append(_pre_nature) 77 | for i in xrange(1, _length): 78 | perfect_cost = DOUBLE_MAX 79 | item = roletaglist[i] 80 | for _, nature, freq in item.natures: 81 | _now = hmm.trans_prob[_pre_nature.index][nature.index] - math.log((item.get_nature_frequency(nature)+1e-8) / hmm.get_total_freq(nature)) 82 | if perfect_cost > _now: 83 | perfect_cost = _now 84 | _perfect_nature = nature 85 | _pre_nature = _perfect_nature 86 | taglist.append(_pre_nature) 87 | return taglist 88 | 89 | def viterbi_template(node_list, hmm, init_cost=DOUBLE_MAX): 90 | node_count = len(node_list) 91 | taglist = [] 92 | # seed with the nature of the first element's first tag 93 | _pre_nature = node_list[0].nature 94 | _perfect_nature = _pre_nature 95 | taglist.append(_pre_nature) 96 | for i, cur_node in enumerate(node_list[1:]): 97 | perfect_cost = init_cost 98 | for j, vertex, freq in cur_node.natures: 99 | _now = hmm.trans_prob[_pre_nature.index][vertex.index] - math.log( 100 | (cur_node.get_nature_frequency(vertex) + 1e-8) / hmm.get_total_freq(vertex)) 101 | if perfect_cost > _now: 102 | perfect_cost = _now 103 | _perfect_nature = vertex 104 | _pre_nature = _perfect_nature 105 | taglist.append(_pre_nature) 106 | return taglist 107 | 108 | 109 | def viterbi_standard(node_list, hmm, init_cost=DOUBLE_MAX): 110 | node_count = len(node_list) 111 | taglist = [] 112 | # seed with the nature of the first element's first tag 113 | route_cost = [] 114 | _pre_nature = node_list[0].nature 115 | _perfect_nature = _pre_nature 116 | taglist.append(_pre_nature) 117 | 118 | # costs for the second element 119 | current_line = node_list[1] 120 | for i, vertex, freq in current_line.natures: 121 | _now = hmm.trans_prob[_pre_nature.index][vertex.index] - math.log( 122 | (current_line.get_nature_frequency(vertex) + 1e-8) / hmm.get_total_freq(vertex)) 123 | route_cost.append(_now) 124 | pre_line = current_line 125 | 126 | # elements from the third one onward 127 | for i, current_line in enumerate(node_list[2:]): 128 | new_route_cost = [] 129 | perfect_pre_nature = None 130 | perfect_cost = init_cost 131 | for k, cur_nature, cur_freq in current_line.natures: 132 | new_route_cost.append(init_cost) 133 | for j, pre_nature, pre_freq in pre_line.natures: 134 | assert j < len(route_cost) 135 | 136 | _now = route_cost[j] + hmm.trans_prob[pre_nature.index][cur_nature.index] - math.log( 137 | (current_line.get_nature_frequency(cur_nature) + 1e-8) / hmm.get_total_freq(cur_nature)) 138 | 139 | if new_route_cost[k] > _now: 140 | new_route_cost[k] = _now 141 | if perfect_cost > _now: 142 | perfect_cost = _now 143 | perfect_pre_nature = pre_nature 144 | 145 | pre_line = current_line 146 | route_cost = new_route_cost 147 | if Config.debug: 148 | print new_route_cost 149 | taglist.append(perfect_pre_nature) 150 | taglist.append(cur_nature) 151 | return taglist 152 | 153 | --------------------------------------------------------------------------------
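A toy check for Viterbi.computer above; every number is an illustrative cost (a negative log probability), so smaller is better:

from yaya.seg.viterbi import Viterbi

obs = [0, 1, 0]
states = [0, 1]
start_p = [0.7, 1.2]
trans_p = [[0.5, 1.6], [1.4, 0.6]]
emit_p = [[0.4, 1.8], [1.5, 0.5]]
# the cheapest path stays in state 0 throughout
print Viterbi.computer(obs, states, start_p, trans_p, emit_p)  # [0, 0, 0]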

--------------------------------------------------------------------------------
/yaya/seg/wordnet.py:
--------------------------------------------------------------------------------
# coding=utf-8
from __future__ import absolute_import
import math
import copy

from yaya.collection.dict import *
from yaya.common.nature import NATURE
from yaya.utility.chartype import *
from yaya.collection.bigram import CORE_BIG_RAM_TABLE
from yaya.const import *

__author__ = 'tony'


class AtomNode:
    def __init__(self, word, pos):
        self.word = word
        self.pos = pos

    def __str__(self):
        return "AtomNode{ word='%s', nature='%s' }" % (self.word, self.pos)


class Vertex:
    def __init__(self, real_word, *args, **kwargs):
        if 'attribute' in kwargs:
            attribute = kwargs.get('attribute')
        else:
            index, attribute = CoreDict().trie.get(real_word)
        self.attribute = attribute if isinstance(attribute, Attribute) else Attribute(attribute)

        self.word_id = kwargs.get('word_id', -1)
        self.real_word = real_word
        word = kwargs.get('word', None)
        self.word = word if word is not None else self.compile_real_word(self.real_word, self.attribute)
        self.vertex_from = None
        self.weight = 0

    def __unicode__(self):
        return u"%s/%s" % (self.real_word, self.word)

    def __repr__(self):
        return u"Vertex(%(real_word)r, %(attribute)r )" % vars(self)

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return self.real_word == other.real_word and self.nature == other.nature

    @property
    def nature(self):
        return self.attribute.nature

    @nature.setter
    def nature(self, value):
        self.attribute.nature = value

    def update_from(self, vertex_from):
        # Relax this vertex: keep whichever predecessor yields the smaller
        # accumulated weight
        weight = vertex_from.weight + Vertex.calc_weight(vertex_from, self)
        if self.vertex_from is None or self.weight > weight:
            self.vertex_from = vertex_from
            self.weight = weight

    @staticmethod
    def calc_weight(vertex_p, vertex_n):
        freq = vertex_p.attribute.total_frequency
        if freq == 0:
            freq = 1
        two_word_freq = CORE_BIG_RAM_TABLE.table.get_bifreq(vertex_p.word_id, vertex_n.word_id)
        value = -math.log(SMOOTHING_PARAM * freq / MAX_FREQUENCY + (1 - SMOOTHING_PARAM) *
                          ((1 - SMOOTHING_FACTOR) * two_word_freq / freq + SMOOTHING_FACTOR))
        if value < 0:
            value = -value
        return value

    def compile_real_word(self, real_word, attribute):
        if len(attribute) >= 1:
            if attribute.nature in [NATURE.nr,
                                    NATURE.nr1,
                                    NATURE.nr2,
                                    NATURE.nrf,
                                    NATURE.nrj]:
                self.word_id = PERSON_WORD_ID
                return TAG_PEOPLE
            elif attribute.nature in [NATURE.ns, NATURE.nsf]:
                self.word_id = PLACE_WORD_ID
                return TAG_PLACE
            elif attribute.nature in [NATURE.nz, NATURE.nx]:
                self.word_id = PROPER_WORD_ID
                return TAG_PROPER
            elif attribute.nature in [NATURE.nt,
                                      NATURE.ntc,
                                      NATURE.ntcf,
                                      NATURE.ntcb,
                                      NATURE.ntch,
                                      NATURE.nto,
                                      NATURE.ntu,
                                      NATURE.nts,
                                      NATURE.nth,
                                      NATURE.nit]:
                self.word_id = PLACE_WORD_ID
                return TAG_GROUP
            elif attribute.nature in [NATURE.m, NATURE.mq]:
                self.word_id = NUMBER_WORD_ID
                return TAG_NUMBER
            elif attribute.nature == NATURE.x:
                self.word_id = CLUSTER_WORD_ID
                return TAG_CLUSTER
            elif attribute.nature in [NATURE.t]:
                self.word_id = TIME_WORD_ID
                return TAG_TIME
        return real_word
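

# calc_weight in symbols (restating the expression above): for an edge from
# vertex A to vertex B,
#
#   weight(A, B) = -log( lam * f(A) / MAX_FREQUENCY
#                        + (1 - lam) * ((1 - mu) * f(A, B) / f(A) + mu) )
#
# where f(A) is A's unigram frequency, f(A, B) the bigram frequency from
# CORE_BIG_RAM_TABLE, and lam/mu stand for SMOOTHING_PARAM and
# SMOOTHING_FACTOR in yaya.const. Lower weight means a more plausible
# transition, which is why viterbi() keeps the minimal accumulated weight.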


def atom_seg(text, begin, end):
    node_list = []
    offset = begin
    pre_type = get(text[offset])
    offset += 1
    while offset < end:
        cur_type = get(text[offset])
        if cur_type != pre_type:
            # Handle floating point numbers: after a '.', keep consuming digits
            if text[offset] == '.' and pre_type == CT_NUM:
                offset += 1
                while offset < end:
                    cur_type = get(text[offset])
                    if cur_type != CT_NUM:
                        break
                    else:
                        offset += 1
            node_list.append(AtomNode(text[begin:offset], pre_type))
            begin = offset
            pre_type = cur_type
        offset += 1

    if offset == end:
        node_list.append(AtomNode(text[begin:offset], pre_type))

    return node_list


def combine_by_custom_dict(vertexs, dat=CustomDict().trie):
    for i in range(len(vertexs)):
        state = 1
        if vertexs[i] is None:
            continue
        state = dat.transition(vertexs[i].real_word, state)
        value = None
        if state > 0:
            start = i
            to = i + 1
            end = -1
            for to in range(to, len(vertexs)):
                state = dat.transition(vertexs[to].real_word, state)
                if state < 0:
                    break
                output = dat.output(state)
                if output is not None:
                    value = output
                    end = to + 1

        if value is not None:
            word = ""
            for j in range(start, end):
                word += vertexs[j].real_word
                vertexs[j] = None
            vertexs[i] = Vertex(real_word=word, attribute=value)

    # todo: consider supporting a dynamic user dictionary
    return [v for v in vertexs if v is not None]


def dump_vertexs(vertexs):
    logger.info("=" * 30)
    for i, v in enumerate(vertexs):
        logger.info("[%d] %s %s %s" % (i, v.real_word, v.word, v.nature))


class WordNet:
    def __init__(self, text=None, vertexs=None):
        self.vertexs = [[] for i in range(len(text) + 2)]
        self.size = 2
        if vertexs is not None:
            i = 1
            for v in vertexs[1:-1]:
                v.vertex_from = None
                self.vertexs[i] = [v]
                i += len(v.real_word)
            self.vertexs[0] = [vertexs[0]]
            self.vertexs[-1] = [vertexs[-1]]
        else:
            self.vertexs[0] = [new_tag_vertex(TAG_BIGIN)]
            self.vertexs[-1] = [new_tag_vertex(TAG_END)]

    def get_first(self, line):
        if len(self.vertexs[line]) > 0:
            return self.vertexs[line][0]
        else:
            return None

    def get(self, line, word_length=None):
        if word_length is None:
            return self.vertexs[line]
        for v in self.vertexs[line]:
            if len(v.real_word) == word_length:
                return v
        return None

    def add(self, line, vertex):
        for v in self.vertexs[line]:
            if len(v.real_word) == len(vertex.real_word):
                return
        if len(self.vertexs[line]) == 0:
            self.vertexs[line] = [vertex]
        else:
            self.vertexs[line].append(vertex)
        self.size += 1

    def insert(self, line, vertex, word_net):
        self.add(line, vertex)
        # Keep the lattice connected to the left of the new vertex ...
        for l in range(line - 1, 1, -1):
            if self.get(l, 1) is None:
                first = word_net.get_first(l)
                if first is None:
                    return
                self.vertexs[l].append(copy.deepcopy(first))
                self.size += 1
                if len(self.vertexs[l]) > 1:
                    break
            else:
                break
        # ... and to the right
        l = line + len(vertex.real_word)
        if len(self.get(l)) == 0:
            target_line = word_net.get(l)
            if target_line is None or len(target_line) == 0:
                return
            self.vertexs[l] = copy.deepcopy(target_line)
            self.size += len(self.vertexs[l])

        for l in range(l, len(self.vertexs)):
            if len(self.get(l)) == 0:
                first = word_net.get_first(l)
                if first is None:
                    break
                self.vertexs[l].append(copy.deepcopy(first))
                self.size += 1
                if len(self.vertexs[l]) > 1:
                    break
            else:
                break
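
    # Row convention used throughout this class (a sketch inferred from the
    # code above, not an authoritative spec): for a text of n characters,
    # self.vertexs has n + 2 rows. Row 0 holds the begin tag, row i holds
    # the vertices whose real_word starts at character i - 1, and row n + 1
    # holds the end tag. E.g. for the two-character text "AB" read as one word:
    #
    #   row 0: [begin-tag vertex]
    #   row 1: [Vertex("AB")]     # covers characters 0..1
    #   row 2: []                 # no word starts at character 1
    #   row 3: [end-tag vertex]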

    def add_atoms(self, line, atom_list):
        offset = 0
        for atom_node in atom_list:
            word = atom_node.word
            nature = NATURE.n
            if atom_node.pos in [CT_INDEX, CT_NUM]:
                nature = NATURE.m
                word = TAG_NUMBER
            elif atom_node.pos in [CT_DELIMITER]:
                nature = NATURE.w
            elif atom_node.pos in [CT_LETTER, CT_SINGLE]:
                nature = NATURE.nx
                word = TAG_CLUSTER
            self.add(line + offset, Vertex(word=word,
                                           real_word=atom_node.word,
                                           attribute=Attribute([str(nature), '1']),
                                           word_id=-1))
            # Advance past this atom so the next one lands on its own row
            offset += len(atom_node.word)

    def __len__(self):
        return len(self.vertexs)

    def __unicode__(self):
        sb = []
        sb.append("=" * 30)
        for i, vl in enumerate(self.vertexs):
            sb.append(u"[%d]:[%s]" % (i, u",".join([v.real_word for v in vl])))
        sb.append("=" * 30)
        return u"\n".join(sb)


def gen_word_net(text, word_net, dat=CoreDict().trie):
    # First add every core-dictionary word found in the text
    searcher = dat.buildcoredictsearcher(text)
    while searcher.next():
        word_net.add(searcher.begin + 1, Vertex(real_word=searcher.key,
                                                attribute=searcher.value,
                                                word_id=searcher.index))
    # Then fill the empty rows with atomic nodes. A while loop is required:
    # the index must jump past multi-character words, and reassigning the
    # variable of a "for i in range(...)" loop has no effect in Python.
    i = 0
    while i < len(word_net.vertexs):
        if len(word_net.vertexs[i]) == 0:
            j = i + 1
            for j in range(i + 1, len(word_net.vertexs) - 1):
                if len(word_net.vertexs[j]) != 0:
                    break
            word_net.add_atoms(i, atom_seg(text, i - 1, j - 1))
            i = j
        else:
            i += len(word_net.vertexs[i][-1].real_word)


def new_tag_vertex(tag):
    word_id, attribute = CoreDict().trie.get(tag)
    if word_id > 0:
        vertex = Vertex(chr(32), attribute=attribute, word=tag, word_id=word_id)
        return vertex
    else:
        logger.error(u"从核心字典加载%s信息时出错", tag)
        import sys
        sys.exit(-1)
--------------------------------------------------------------------------------
/yaya/utility/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'tony'
--------------------------------------------------------------------------------
/yaya/utility/bytearray.py:
--------------------------------------------------------------------------------
import struct
from io import FileIO

__author__ = 'tony'


class ByteArray:
    @staticmethod
    def load_from_file(filename):
        f = FileIO(filename, 'rb')
        data = f.readall()
        f.close()
        return ByteArray(data)

    def __init__(self, data):
        self.data = data
        self.offset = 0

    def has_more(self):
        return self.offset < len(self.data)

    def next_ushort(self):
        # '!H': big-endian unsigned short, so values above 0x7FFF stay positive
        data = struct.unpack_from('!H', self.data, self.offset)
        self.offset += 2
        return data[0]

    def next_uchar(self):
        data = struct.unpack_from('!B', self.data, self.offset)
        self.offset += 1
        return data[0]
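

# A minimal usage sketch (the file name is illustrative). The char-type
# table loader in yaya/utility/chartype.py consumes records of exactly this
# shape: two unsigned shorts (range begin/end) followed by one unsigned
# char (the type code):
#
#   ba = ByteArray.load_from_file('data/chartype.bin')
#   while ba.has_more():
#       begin, end, type_code = ba.next_ushort(), ba.next_ushort(), ba.next_uchar()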
--------------------------------------------------------------------------------
/yaya/utility/chartype.py:
--------------------------------------------------------------------------------
# coding=utf-8
import time

from yaya import config
from yaya.const import logger
from yaya.utility.bytearray import ByteArray

__author__ = 'tony'

CT_SINGLE = 5                 # single-byte character
CT_DELIMITER = CT_SINGLE + 1  # delimiter: "!,.?()[]{}+=
CT_CHINESE = CT_SINGLE + 2    # Chinese character
CT_LETTER = CT_SINGLE + 3     # letter
CT_NUM = CT_SINGLE + 4        # digit
CT_INDEX = CT_SINGLE + 5      # ordinal/index character
CT_OTHER = CT_SINGLE + 12     # other

# One type code per UTF-16 code point; filled in by __init__() below
char_type = [0] * 65536


def __init__():
    logger.info("字符类型对应表开始加载 %s", config.CHAR_TYPE_PATH)
    start = time.time()
    byte_array = ByteArray.load_from_file(config.CHAR_TYPE_PATH)
    if byte_array is None:
        import sys
        logger.error("字符类型对应表加载失败:" + config.CHAR_TYPE_PATH)
        sys.exit(-1)
    else:
        while byte_array.has_more():
            b = byte_array.next_ushort()
            e = byte_array.next_ushort()
            t = byte_array.next_uchar()
            for i in range(b, e + 1):
                char_type[i] = t
        logger.info("字符类型对应表加载成功,耗时 %s s", (time.time() - start))


def get(c):
    # Accept either a single character or an integer code point
    if type(c) is not int:
        return char_type[ord(c)]
    else:
        return char_type[c]


__init__()
--------------------------------------------------------------------------------
/yaya/utility/persistence.py:
--------------------------------------------------------------------------------
__author__ = 'tony'
--------------------------------------------------------------------------------
/yaya/utility/singleton.py:
--------------------------------------------------------------------------------
__author__ = 'tony'


def singleton(class_):
    instances = {}

    def get_instance(*args, **kwargs):
        if class_ not in instances:
            instances[class_] = class_(*args, **kwargs)
        return instances[class_]

    return get_instance
--------------------------------------------------------------------------------