├── .gitignore ├── LICENSE ├── README.md ├── build.sh ├── config ├── texsmart-remote.xml └── texsmart.properties ├── lib ├── .DS_Store ├── jna.jar ├── libtencent_ai_texsmart.so ├── tencent.ai.texsmart.jar ├── tencent_ai_texsmart.dll ├── tencent_ai_texsmart.lib ├── tencent_ai_texsmart.py └── tencent_ai_texsmart.pyc ├── pom.xml ├── settings.xml └── src └── main ├── assemblies └── plugin.xml ├── java ├── com │ └── texsmart │ │ ├── TexSmart.java │ │ ├── cfg │ │ └── Configuration.java │ │ ├── dic │ │ ├── Dictionary.java │ │ ├── DictionaryFile.java │ │ ├── ExtMonitor.java │ │ ├── RemoteMonitor.java │ │ ├── cache │ │ │ └── DictionaryFileCache.java │ │ ├── config │ │ │ ├── RemoteDictConfig.java │ │ │ └── TexSmartConfig.java │ │ └── stopword │ │ │ ├── Filter.java │ │ │ └── FilterStopWord.java │ │ ├── help │ │ ├── ESPluginLoggerFactory.java │ │ └── PrefixPluginLogger.java │ │ ├── lucene │ │ ├── PorterStemmer.java │ │ ├── SegmentWrapper.java │ │ ├── TexSmartAnalyzer.java │ │ ├── TexSmartIndexAnalyzer.java │ │ ├── TexSmartStandardAnalyzer.java │ │ ├── TexSmartTokenizer.java │ │ └── TokenizerBuilder.java │ │ ├── seg │ │ ├── Config.java │ │ ├── Segment.java │ │ └── TexSmartBasicSegment.java │ │ ├── tokenizer │ │ └── StandardTokenizer.java │ │ └── utility │ │ └── TextUtility.java ├── es-plugin.properties ├── org │ └── elasticsearch │ │ ├── index │ │ └── analysis │ │ │ ├── NerAlgType.java │ │ │ ├── PosAlgType.java │ │ │ ├── TexSmartAnalyzerProvider.java │ │ │ ├── TexSmartTokenizerFactory.java │ │ │ └── TexSmartType.java │ │ └── plugin │ │ └── analysis │ │ └── texsmart │ │ └── AnalysisTexSmartPlugin.java └── tencent │ └── ai │ └── texsmart │ ├── CLib.java │ ├── NluEngine.java │ └── NluOutput.java └── resources ├── plugin-descriptor.properties ├── plugin-security.policy ├── texsmart-remote.xml └── texsmart.properties /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | *.iws 4 | *.iml 5 | *.ipr 6 | target/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# elasticsearch-analysis-texsmart
TexSmart Analyzer for Elasticsearch

This plugin provides Chinese analysis for Elasticsearch, built on the Tencent AI Lab [TexSmart](https://ai.tencent.com/ailab/nlp/texsmart) Chinese segmentation library.

🚩 Changelog:
1. Adapted to Elasticsearch 7.x and TexSmart 0.1.3

----------

Version mapping
----------

| Plugin version | Elastic version |
| :------------- | :-------------- |
| master         | 7.x             |
| 7.6.2          | 7.6.2           |


Installation
----------

### 1. Download and install the plugin release matching your Elasticsearch version

Installation options:

Option 1

a. Download the matching release package; the latest release can be downloaded from GitHub (link: https://github.com/koios-sh/elasticsearch-analysis-texsmart/releases/download/v7.6.2/elasticsearch-analysis-texsmart-7.6.2.zip)

b. Install it with the following command, where PATH is the absolute path to the plugin package:

`./bin/elasticsearch-plugin install file://${PATH}`

Option 2

a. Install directly with the elasticsearch-plugin script:

`./bin/elasticsearch-plugin install https://github.com/koios-sh/elasticsearch-analysis-texsmart/releases/download/v7.6.2/elasticsearch-analysis-texsmart-7.6.2.zip`

Option 3

a. Build from source: sh build.sh
b. Install it with the following command, where PATH is the absolute path to the plugin package:

`./bin/elasticsearch-plugin install file://${PATH}`

### 2. Install the data package

The release package does not include the TexSmart data package. To download the full data package, see [TexSmart Release](https://ai.tencent.com/ailab/nlp/texsmart/zh/download.html).

Data directory: /etc/elasticsearch/texsmart/data
The data location can be adjusted by changing the `path` value in config/texsmart.properties.

### 3. Install libtencent_ai_texsmart.so

cp libtencent_ai_texsmart.so /usr/lib64 && chmod 777 /usr/lib64/libtencent_ai_texsmart.so

**Note: the steps above are required on every node.**

Provided analyzers
----------

texsmart: TexSmart default segmentation

texsmart_standard: standard segmentation

texsmart_index: index-mode segmentation

Example
----------

```text
POST http://localhost:9200/test/_analyze
{
    "text": "2020年,空调市场“冷风吹过”",
    "tokenizer": "texsmart_standard"
}
```

```json
{
    "tokens": [
        {
            "token": "2020",
            "start_offset": 0,
            "end_offset": 4,
            "type": "CD",
            "position": 0
        },
        {
            "token": "年",
            "start_offset": 4,
            "end_offset": 5,
            "type": "M",
            "position": 1
        },
        {
            "token": ",",
            "start_offset": 5,
            "end_offset": 6,
            "type": "PU",
            "position": 2
        },
        {
            "token": "空调",
            "start_offset": 6,
            "end_offset": 8,
            "type": "NN",
            "position": 3
        },
        {
            "token": "市场",
            "start_offset": 8,
            "end_offset": 10,
            "type": "NN",
            "position": 4
        },
        {
            "token": "“",
            "start_offset": 10,
            "end_offset": 11,
            "type": "PU",
            "position": 5
        },
        {
            "token": "冷风",
            "start_offset": 11,
            "end_offset": 13,
            "type": "NN",
            "position": 6
        },
        {
            "token": "吹过",
            "start_offset": 13,
            "end_offset": 15,
            "type": "VV",
            "position": 7
        },
        {
            "token": "”",
            "start_offset": 15,
            "end_offset": 16,
            "type": "PU",
            "position": 8
        }
    ]
}
```

- Make sure custom dictionaries are UTF-8 encoded
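An illustrative mapping example (the index and field names `my_index` and `content` are hypothetical, and this assumes the analyzers are registered under the same names as the tokenizers, as the bundled `TexSmartAnalyzerProvider` suggests): a common pattern is to segment finely at index time and use standard segmentation at search time:

```text
PUT my_index
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "texsmart_index",
        "search_analyzer": "texsmart_standard"
      }
    }
  }
}
```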
Custom tokenizer configuration
----------

Beyond the analyzers above, TexSmart exposes a set of segmentation options, and the plugin maps them to tokenizer settings, so a custom tokenizer can be defined with the following configuration:

| Config                 | Description                                             |
| :--------------------- | :------------------------------------------------------ |
| enable_index_mode      | whether to segment in index mode                        |
| enable_stop_dictionary | whether to enable the stop-word dictionary              |
| enable_offset          | whether to compute offsets                              |
| enable_pos_alg         | POS-tagging algorithm (log_linear (default), crf, dnn)  |
| enable_ner_alg         | NER algorithm (crf (default), dnn)                      |

Note: for the configuration above to filter Chinese and English punctuation, enable_stop_dictionary must be set to true.

For example:
```text
PUT test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_texsmart_analyzer": {
          "tokenizer": "my_texsmart"
        }
      },
      "tokenizer": {
        "my_texsmart": {
          "type": "texsmart",
          "enable_stop_dictionary": true,
          "enable_pos_alg": "log_linear",
          "enable_ner_alg": "crf"
        }
      }
    }
  }
}
```

```text
POST test/_analyze
{
  "text": "2020年,空调市场“冷风吹过”",
  "analyzer": "my_texsmart_analyzer"
}
```

Result:
```json
{
    "tokens": [
        {
            "token": "2020",
            "start_offset": 0,
            "end_offset": 4,
            "type": "CD",
            "position": 0
        },
        {
            "token": "年",
            "start_offset": 4,
            "end_offset": 5,
            "type": "M",
            "position": 1
        },
        {
            "token": "空调",
            "start_offset": 6,
            "end_offset": 8,
            "type": "NN",
            "position": 2
        },
        {
            "token": "市场",
            "start_offset": 8,
            "end_offset": 10,
            "type": "NN",
            "position": 3
        },
        {
            "token": "冷风",
            "start_offset": 11,
            "end_offset": 13,
            "type": "NN",
            "position": 4
        },
        {
            "token": "吹过",
            "start_offset": 13,
            "end_offset": 15,
            "type": "VV",
            "position": 5
        }
    ]
}
```

# Special notes
1. TexSmart does not yet officially support hot loading and updating of user words; reportedly the next release will.
Following the analysis-hanlp plugin, the code already integrates remote dictionaries and dynamic word reloading;
this feature will be enabled once Tencent ships an official release that supports it.

🚩 References:
[TexSmart](https://ai.tencent.com/ailab/nlp/texsmart)
[analysis-hanlp](https://github.com/KennFalcon/elasticsearch-analysis-hanlp)
-------------------------------------------------------------------------------- /build.sh: --------------------------------------------------------------------------------
mvn --settings=settings.xml -Dmaven.test.skip=true clean install
-------------------------------------------------------------------------------- /config/texsmart-remote.xml: --------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>TexSmart Analyzer extension configuration</comment>
    <!-- configure remote extension dictionaries here, e.g.: -->
    <!-- <entry key="remote_ext_dict">words_location</entry> -->
    <!-- configure remote extension stop-word dictionaries here, e.g.: -->
    <!-- <entry key="remote_ext_stopwords">stop_words_location</entry> -->
</properties>
-------------------------------------------------------------------------------- /config/texsmart.properties: --------------------------------------------------------------------------------
1 | root=.
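# Note: `path` below is the key the plugin actually reads at startup
# (TexSmart.java passes TexSmartConfig.getConfig().getProperty("path") to NluEngine.init),
# so it must point at the unpacked TexSmart data, e.g. /etc/elasticsearch/texsmart/data/nlu/kb/.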
2 | CoreDictionaryPath=data/nlu/kb/ 3 | CustomDictionaryPath=data/nlu/kb/customization/ 4 | 5 | path=/etc/elasticsearch/texsmart/data/nlu/kb/ -------------------------------------------------------------------------------- /lib/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/.DS_Store -------------------------------------------------------------------------------- /lib/jna.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/jna.jar -------------------------------------------------------------------------------- /lib/libtencent_ai_texsmart.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/libtencent_ai_texsmart.so -------------------------------------------------------------------------------- /lib/tencent.ai.texsmart.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent.ai.texsmart.jar -------------------------------------------------------------------------------- /lib/tencent_ai_texsmart.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent_ai_texsmart.dll -------------------------------------------------------------------------------- /lib/tencent_ai_texsmart.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent_ai_texsmart.lib -------------------------------------------------------------------------------- /lib/tencent_ai_texsmart.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from ctypes import * 3 | import os 4 | import sys 5 | 6 | my_dir_path = os.path.dirname(os.path.realpath(__file__)) + '/' 7 | dll_name = 'libtencent_ai_texsmart.so' 8 | if sys.platform.startswith("win"): 9 | dll_name = 'tencent_ai_texsmart.dll' 10 | elif sys.platform == "cygwin": 11 | dll_name = "tencent_ai_texsmart.dll" 12 | lib = cdll.LoadLibrary(my_dir_path + dll_name) 13 | 14 | class NluToken(Structure): 15 | _fields_ = [ 16 | ('str', c_wchar_p), 17 | ('offset', c_uint32), 18 | ('type', c_uint32), 19 | ] 20 | 21 | class NluTerm(Structure): 22 | _fields_ = [ 23 | ('str', c_wchar_p), 24 | ('offset', c_uint32), 25 | ('len', c_uint32), 26 | ('start_token', c_uint32), 27 | ('token_count', c_uint32), 28 | ('tag', c_wchar_p), 29 | ('tag_id', c_uint32), 30 | ] 31 | 32 | class NluEntityType(Structure): 33 | _fields_ = [ 34 | ('name', c_wchar_p), 35 | ('i18n', c_wchar_p), 36 | ('flag', c_uint32), 37 | ('path', c_wchar_p), 38 | ] 39 | 40 | class NluEntityTypeArray(Structure): 41 | _fields_ = [ 42 | ('size', c_uint32), 43 | ('items', POINTER(NluEntityType)), 44 | ] 45 | 46 | class NluEntity(Structure): 47 | _fields_ = [ 48 | ('str', c_wchar_p), 49 | ('offset', c_uint32), 50 | ('len', c_uint32), 51 | ('start_token', 
c_uint32), 52 | ('token_count', c_uint32), 53 | ('type', NluEntityType), 54 | ('alt_types', NluEntityTypeArray), 55 | ('meaning', c_wchar_p), 56 | ] 57 | 58 | class _NluTokenArray(Structure): 59 | _fields_ = [ 60 | ('size', c_uint32), 61 | ('items', POINTER(NluToken)), 62 | ] 63 | 64 | class _NluTermArray(Structure): 65 | _fields_ = [ 66 | ('size', c_uint32), 67 | ('items', POINTER(NluTerm)), 68 | ] 69 | 70 | class _NluEntityArray(Structure): 71 | _fields_ = [ 72 | ('size', c_uint32), 73 | ('items', POINTER(NluEntity)), 74 | ] 75 | 76 | lib.Nlu_CreateEngine.restype = c_void_p 77 | lib.Nlu_CreateEngine.argtypes = [c_char_p, c_int] 78 | lib.Nlu_DestroyEngine.argtypes = [c_void_p] 79 | lib.Nlu_ParseText.restype = c_void_p 80 | lib.Nlu_ParseText.argtypes = [c_void_p, c_wchar_p, c_int] 81 | lib.Nlu_ParseTextExt.restype = c_void_p 82 | lib.Nlu_ParseTextExt.argtypes = [c_void_p, c_wchar_p, c_int, c_wchar_p] 83 | lib.Nlu_DestroyOutput.argtypes = [c_void_p] 84 | lib.Nlu_GetNormText.restype = c_wchar_p 85 | lib.Nlu_GetNormText.argtypes = [c_void_p, POINTER(c_int)] 86 | lib.Nlu_GetTokens.restype = _NluTokenArray 87 | lib.Nlu_GetTokens.argtypes = [c_void_p] 88 | lib.Nlu_GetWords.restype = _NluTermArray 89 | lib.Nlu_GetWords.argtypes = [c_void_p] 90 | lib.Nlu_GetPhrases.restype = _NluTermArray 91 | lib.Nlu_GetPhrases.argtypes = [c_void_p] 92 | lib.Nlu_GetEntities.restype = _NluEntityArray 93 | lib.Nlu_GetEntities.argtypes = [c_void_p] 94 | 95 | class NluOutput(object): 96 | def __init__(self, ptr): 97 | self.obj = ptr 98 | def __del__(self): 99 | if(self.obj is not None): 100 | lib.Nlu_DestroyOutput(self.obj) 101 | self.obj = None 102 | def norm_text(self): 103 | ret = lib.Nlu_GetNormText(self.obj, None) 104 | return ret 105 | def tokens(self): 106 | arr = [] 107 | item_list = lib.Nlu_GetTokens(self.obj) 108 | for idx in range(item_list.size): 109 | arr.append(item_list.items[idx]) 110 | return arr 111 | def words(self): 112 | arr = [] 113 | item_list = lib.Nlu_GetWords(self.obj) 114 | for idx in range(item_list.size): 115 | arr.append(item_list.items[idx]) 116 | return arr 117 | def phrases(self): 118 | arr = [] 119 | item_list = lib.Nlu_GetPhrases(self.obj) 120 | for idx in range(item_list.size): 121 | arr.append(item_list.items[idx]) 122 | return arr 123 | def entities(self): 124 | arr = [] 125 | #count = lib.Nlu_GetEntityCount(self.obj) 126 | #for idx in range(count): 127 | # arr.append(lib.Nlu_GetEntity(slef.obj, idx)) 128 | item_list = lib.Nlu_GetEntities(self.obj) 129 | for idx in range(item_list.size): 130 | arr.append(item_list.items[idx]) 131 | return arr 132 | 133 | class NluEngine(object): 134 | def __init__(self, data_dir, worker_count): 135 | self.obj = lib.Nlu_CreateEngine(data_dir.encode('utf-8'), worker_count) 136 | def __del__(self): 137 | if self.obj is not None: 138 | lib.Nlu_DestroyEngine(self.obj) 139 | self.obj = None 140 | def parse_text(self, input_str): 141 | output_handle = lib.Nlu_ParseText(self.obj, c_wchar_p(input_str), len(input_str)) 142 | return NluOutput(output_handle) 143 | def parse_text_ext(self, input_str, options_str): 144 | output_handle = lib.Nlu_ParseTextExt(self.obj, c_wchar_p(input_str), len(input_str), c_wchar_p(options_str)) 145 | return NluOutput(output_handle) 146 | -------------------------------------------------------------------------------- /lib/tencent_ai_texsmart.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent_ai_texsmart.pyc -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | elasticsearch-analysis-texsmart 6 | org.elasticsearch 7 | elasticsearch-analysis-texsmart 8 | ${elasticsearch.version} 9 | jar 10 | TexSmart Analyzer for ElasticSearch 11 | 12 | 13 | 7.6.2 14 | 1.8 15 | UTF-8 16 | ${project.basedir}/src/main/assemblies/plugin.xml 17 | analysis-texsmart 18 | org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin 19 | true 20 | false 21 | true 22 | sdk-0.1.3 23 | 12 24 | 12 25 | 26 | 27 | 28 | 29 | org.elasticsearch 30 | elasticsearch 31 | ${elasticsearch.version} 32 | compile 33 | 34 | 35 | org.apache.httpcomponents 36 | httpclient 37 | 4.5.6 38 | 39 | 40 | org.apache.logging.log4j 41 | log4j-api 42 | 2.3 43 | compile 44 | 45 | 46 | org.hamcrest 47 | hamcrest-core 48 | 1.3.RC2 49 | test 50 | 51 | 52 | org.hamcrest 53 | hamcrest-library 54 | 1.3.RC2 55 | test 56 | 57 | 58 | junit 59 | junit 60 | 4.11 61 | test 62 | 63 | 64 | com.sun.jna 65 | com.sun.jna 66 | 1.0 67 | system 68 | ${project.basedir}/lib/jna.jar 69 | 70 | 71 | 72 | 73 | 74 | 75 | org.apache.maven.plugins 76 | maven-compiler-plugin 77 | 3.5.1 78 | 79 | ${maven.compiler.target} 80 | ${maven.compiler.target} 81 | 82 | 83 | 84 | org.apache.maven.plugins 85 | maven-surefire-plugin 86 | 2.11 87 | 88 | 89 | **/*Tests.java 90 | 91 | 92 | 93 | 94 | org.apache.maven.plugins 95 | maven-source-plugin 96 | 2.1.2 97 | 98 | 99 | attach-sources 100 | 101 | jar 102 | 103 | 104 | 105 | 106 | 107 | maven-assembly-plugin 108 | 109 | 110 | false 111 | ${project.build.directory}/releases/ 112 | 113 | ${basedir}/src/main/assemblies/plugin.xml 114 | 115 | 116 | 117 | fully.qualified.MainClass 118 | 119 | 120 | 121 | 122 | 123 | package 124 | 125 | single 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | lib 134 | BOOT-INF/lib/ 135 | 136 | **/*.jar 137 | 138 | 139 | 140 | 141 | 142 | 143 | disable-java8-doclint 144 | 145 | [1.8,) 146 | 147 | 148 | -Xdoclint:none 149 | 150 | 151 | 152 | release 153 | 154 | 155 | 156 | org.apache.maven.plugins 157 | maven-jar-plugin 158 | 3.1.2 159 | 160 | 161 | texsmart.properties 162 | 163 | 164 | 165 | 166 | org.apache.maven.plugins 167 | maven-compiler-plugin 168 | 3.8.0 169 | 170 | ${maven.compiler.target} 171 | ${maven.compiler.target} 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | aliyunmaven 7 | * 8 | 阿里云公共仓库 9 | https://maven.aliyun.com/repository/public 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | analysis-texsmart-release 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/config 11 | /config 12 | 13 | 14 | 15 | 16 | ${project.basedir}/src/main/resources/plugin-descriptor.properties 17 | / 18 | true 19 | 20 | 21 | ${project.basedir}/src/main/resources/plugin-security.policy 22 | / 23 | true 24 | 25 | 26 | ${project.basedir}/config/texsmart-remote.xml 27 | / 28 | true 29 | 30 | 31 | ${project.basedir}/config/texsmart.properties 32 | / 33 | true 34 | 35 | 
36 | 37 | 38 | / 39 | true 40 | true 41 | 42 | org.elasticsearch:elasticsearch 43 | 44 | 45 | 46 | / 47 | true 48 | true 49 | 50 | ${pom.basedir}/lib/jna.jar 51 | 52 | 53 | org.apache.lucene:lucene-core 54 | org.apache.lucene:lucene-analyzers-common 55 | org.apache.lucene:lucene-queryparser 56 | org.apache.lucene:lucene-sandbox 57 | 58 | 59 | 60 | / 61 | true 62 | true 63 | 64 | com.fasterxml.jackson.core:jackson-databind 65 | com.fasterxml.jackson.core:jackson-annotations 66 | 67 | 68 | com.fasterxml.jackson.core:jackson-core 69 | 70 | 71 | 72 | / 73 | true 74 | true 75 | 76 | org.apache.httpcomponents:httpclient 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/TexSmart.java: -------------------------------------------------------------------------------- 1 | package com.texsmart; 2 | 3 | import com.texsmart.dic.config.TexSmartConfig; 4 | import com.texsmart.help.ESPluginLoggerFactory; 5 | import com.texsmart.seg.Segment; 6 | import com.texsmart.seg.TexSmartBasicSegment; 7 | import com.texsmart.tokenizer.StandardTokenizer; 8 | import org.apache.logging.log4j.Logger; 9 | import tencent.ai.texsmart.NluEngine; 10 | import tencent.ai.texsmart.NluOutput.Term; 11 | 12 | import java.util.List; 13 | 14 | public class TexSmart { 15 | 16 | private static final Logger logger = ESPluginLoggerFactory.getLogger(TexSmart.class.getName()); 17 | 18 | public static NluEngine TEX_ENGINE; 19 | 20 | static { 21 | TEX_ENGINE = new NluEngine(); 22 | int workerCount = Runtime.getRuntime().availableProcessors(); 23 | logger.info("texsmart analysis is initializing"); 24 | boolean ret = TEX_ENGINE.init(TexSmartConfig.getConfig().getProperty("path"), workerCount); 25 | if (!ret) { 26 | logger.info("texsmart analysis load failed"); 27 | } else { 28 | logger.info("texsmart analysis load success"); 29 | } 30 | } 31 | 32 | private TexSmart() { 33 | } 34 | 35 | public static List segment(String text) { 36 | return StandardTokenizer.segment(text); 37 | } 38 | 39 | public static Segment newSegment() { 40 | return new TexSmartBasicSegment(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/cfg/Configuration.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.cfg; 2 | 3 | import com.texsmart.dic.Dictionary; 4 | import org.elasticsearch.common.inject.Inject; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.analysis.NerAlgType; 8 | import org.elasticsearch.index.analysis.PosAlgType; 9 | 10 | /** 11 | * @project: elasticsearch-analysis-texsmart 12 | * @description: 配置信息 13 | * @author: wei_liu 14 | * @create: 2020-09-09 15:10 15 | */ 16 | public class Configuration { 17 | 18 | private Environment environment; 19 | 20 | private Settings settings; 21 | 22 | private boolean enablePorterStemming; 23 | 24 | private boolean enableIndexMode; 25 | 26 | private boolean enableCustomDictionary; 27 | 28 | private boolean enableRemoteDict; 29 | 30 | private boolean enableNormalization; 31 | 32 | private boolean enableOffset; 33 | 34 | private boolean enableCustomConfig; 35 | 36 | private boolean enableStopDictionary; 37 | 38 | private PosAlgType enablePosAlg; 39 | private NerAlgType enableNerAlg; 40 | 41 | @Inject 42 | public Configuration(Environment env, Settings settings) { 43 | this.environment = env; 44 | this.settings = settings; 
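// The settings read below are the tokenizer options documented in the README:
// the boolean flags (enable_index_mode, enable_stop_dictionary, enable_offset, ...)
// are compared against "true", while enable_pos_alg / enable_ner_alg are mapped onto
// their enums, falling back to LOG_LINEAR / CRF when the value is absent or invalid.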
45 |         this.enablePorterStemming = settings.get("enable_porter_stemming", "false").equals("true");
46 |         this.enableIndexMode = settings.get("enable_index_mode", "false").equals("true");
47 |         this.enableCustomDictionary = settings.get("enable_custom_dictionary", "true").equals("true");
48 |         this.enableStopDictionary = settings.get("enable_stop_dictionary", "false").equals("true");
49 |         this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
50 |         this.enableNormalization = settings.get("enable_normalization", "false").equals("true");
51 |         this.enableOffset = settings.get("enable_offset", "true").equals("true");
52 |         this.enableCustomConfig = settings.get("enable_custom_config", "false").equals("true");
53 |         try {
54 |             this.enablePosAlg = PosAlgType.valueOf(settings.get("enable_pos_alg", "log_linear").toUpperCase()); // settings use lowercase values; enum constants are uppercase
55 |             this.enableNerAlg = NerAlgType.valueOf(settings.get("enable_ner_alg", "crf").toUpperCase());
56 |         } catch (IllegalArgumentException e) {
57 |             this.enablePosAlg = PosAlgType.LOG_LINEAR;
58 |             this.enableNerAlg = NerAlgType.CRF;
59 |         }
60 |         Dictionary.initial(this);
61 |     }
62 | 
63 |     public Environment getEnvironment() {
64 |         return this.environment;
65 |     }
66 | 
67 |     public Settings getSettings() {
68 |         return this.settings;
69 |     }
70 | 
71 |     public boolean isEnablePorterStemming() {
72 |         return this.enablePorterStemming;
73 |     }
74 | 
75 |     public Configuration enablePorterStemming(boolean enablePorterStemming) {
76 |         this.enablePorterStemming = enablePorterStemming;
77 |         return this;
78 |     }
79 | 
80 |     public boolean isEnableStopDictionary() {
81 |         return this.enableStopDictionary;
82 |     }
83 | 
84 |     public boolean isEnableIndexMode() {
85 |         return this.enableIndexMode;
86 |     }
87 | 
88 |     public Configuration enableIndexMode(boolean enableIndexMode) {
89 |         this.enableIndexMode = enableIndexMode;
90 |         return this;
91 |     }
92 | 
93 |     public boolean isEnableCustomDictionary() {
94 |         return this.enableCustomDictionary;
95 |     }
96 | 
97 |     public Configuration enableCustomDictionary(boolean enableCustomDictionary) {
98 |         this.enableCustomDictionary = enableCustomDictionary;
99 |         return this;
100 |     }
101 | 
102 |     public boolean isEnableRemoteDict() {
103 |         return enableRemoteDict;
104 |     }
105 | 
106 |     public Configuration enableRemoteDict(boolean enableRemoteDict) {
107 |         this.enableRemoteDict = enableRemoteDict;
108 |         return this;
109 |     }
110 | 
111 |     public boolean isEnableNormalization() {
112 |         return enableNormalization;
113 |     }
114 | 
115 |     public Configuration enableNormalization(boolean enableNormalization) {
116 |         this.enableNormalization = enableNormalization;
117 |         return this;
118 |     }
119 | 
120 |     public boolean isEnableOffset() {
121 |         return enableOffset;
122 |     }
123 | 
124 |     public Configuration enableOffset(boolean enableOffset) {
125 |         this.enableOffset = enableOffset;
126 |         return this;
127 |     }
128 | 
129 |     public boolean isEnableCustomConfig() {
130 |         return enableCustomConfig;
131 |     }
132 | 
133 |     public Configuration enableCustomConfig(boolean enableCustomConfig) {
134 |         this.enableCustomConfig = enableCustomConfig;
135 |         return this;
136 |     }
137 | 
138 |     public PosAlgType getEnablePosAlg() {
139 |         return this.enablePosAlg;
140 |     }
141 | 
142 |     public Configuration enablePosAlg(PosAlgType enablePosAlg) {
143 |         this.enablePosAlg = enablePosAlg;
144 |         return this;
145 |     }
146 | 
147 |     public NerAlgType getEnableNerAlg() {
148 |         return this.enableNerAlg;
149 |     }
150 | 
151 |     public Configuration enableNerAlg(NerAlgType enableNerAlg) {
152 |         this.enableNerAlg = enableNerAlg;
153 |         return this;
154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/Dictionary.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.dic; 2 | 3 | import com.texsmart.cfg.Configuration; 4 | import com.texsmart.dic.cache.DictionaryFileCache; 5 | import com.texsmart.dic.config.RemoteDictConfig; 6 | import org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin; 7 | 8 | import java.nio.file.Path; 9 | import java.util.concurrent.Executors; 10 | import java.util.concurrent.ScheduledExecutorService; 11 | import java.util.concurrent.TimeUnit; 12 | 13 | /** 14 | * @project: elasticsearch-analysis-texsmart 15 | * @description: 词典类 16 | * @author: wei_liu 17 | * @create: 2020-09-09 15:10 18 | */ 19 | public class Dictionary { 20 | /** 21 | * 词典单子实例 22 | */ 23 | private static Dictionary singleton; 24 | /** 25 | * TexSmart配置文件名 26 | */ 27 | public static final String CONFIG_FILE_NAME = "texsmart.properties"; 28 | /** 29 | * TexSmart远程词典配置文件名 30 | */ 31 | private static final String REMOTE_CONFIG_FILE_NAME = "texsmart-remote.xml"; 32 | 33 | private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1); 34 | 35 | private Dictionary(Configuration configuration) { 36 | Path configDir = configuration.getEnvironment().configFile().resolve(AnalysisTexSmartPlugin.PLUGIN_NAME); 37 | DictionaryFileCache.configCachePath(configuration); 38 | DictionaryFileCache.loadCache(); 39 | RemoteDictConfig.initial(configDir.resolve(REMOTE_CONFIG_FILE_NAME).toString()); 40 | } 41 | 42 | public static synchronized Dictionary initial(Configuration configuration) { 43 | if (singleton == null) { 44 | synchronized (Dictionary.class) { 45 | if (singleton == null) { 46 | singleton = new Dictionary(configuration); 47 | pool.scheduleAtFixedRate(new ExtMonitor(), 10, 60, TimeUnit.SECONDS); 48 | if (configuration.isEnableRemoteDict()) { 49 | for (String location : RemoteDictConfig.getSingleton().getRemoteExtDictionarys()) { 50 | pool.scheduleAtFixedRate(new RemoteMonitor(location, "custom"), 10, 60, TimeUnit.SECONDS); 51 | } 52 | 53 | for (String location : RemoteDictConfig.getSingleton().getRemoteExtStopWordDictionarys()) { 54 | pool.scheduleAtFixedRate(new RemoteMonitor(location, "stop"), 10, 60, TimeUnit.SECONDS); 55 | } 56 | } 57 | return singleton; 58 | } 59 | } 60 | } 61 | return singleton; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/DictionaryFile.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.dic; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.DataOutputStream; 5 | import java.io.IOException; 6 | import java.nio.charset.StandardCharsets; 7 | import java.util.Objects; 8 | 9 | /** 10 | * @project: elasticsearch-analysis-texsmart 11 | * @description: 自定义词典文件信息 12 | * @author: wei_liu 13 | * @create: 2020-09-09 15:10 14 | */ 15 | public class DictionaryFile { 16 | 17 | private String path; 18 | 19 | private String type; 20 | 21 | private long lastModified; 22 | 23 | public DictionaryFile() { 24 | } 25 | 26 | DictionaryFile(String path, long lastModified) { 27 | this.path = path; 28 | this.lastModified = lastModified; 29 | } 30 | 31 | DictionaryFile(String path, String type, long lastModified) { 32 | this(path, lastModified); 33 | this.type = type; 34 | } 35 | 36 | public String getPath() { 37 | return path; 38 | } 39 
| 40 | public void setPath(String path) { 41 | this.path = path; 42 | } 43 | 44 | public String getType() { 45 | return type; 46 | } 47 | 48 | public void setType(String type) { 49 | this.type = type; 50 | } 51 | 52 | public long getLastModified() { 53 | return lastModified; 54 | } 55 | 56 | public void setLastModified(long lastModified) { 57 | this.lastModified = lastModified; 58 | } 59 | 60 | public void write(DataOutputStream out) throws IOException { 61 | if (path != null && path.length() != 0) { 62 | byte[] bytes = path.getBytes(StandardCharsets.UTF_8); 63 | out.writeInt(bytes.length); 64 | out.write(bytes); 65 | } else { 66 | out.writeInt(0); 67 | } 68 | if (type != null && type.length() != 0) { 69 | byte[] bytes = type.getBytes(StandardCharsets.UTF_8); 70 | out.writeInt(bytes.length); 71 | out.write(bytes); 72 | } else { 73 | out.writeInt(0); 74 | } 75 | out.writeLong(lastModified); 76 | } 77 | 78 | public void read(DataInputStream in) throws IOException { 79 | int pathLength = in.readInt(); 80 | if (pathLength != 0) { 81 | byte[] bytes = new byte[pathLength]; 82 | in.read(bytes); 83 | path = new String(bytes, StandardCharsets.UTF_8); 84 | } 85 | 86 | int typeLength = in.readInt(); 87 | if (typeLength != 0) { 88 | byte[] bytes = new byte[typeLength]; 89 | in.read(bytes); 90 | type = new String(bytes, StandardCharsets.UTF_8); 91 | } 92 | lastModified = in.readLong(); 93 | } 94 | 95 | @Override 96 | public boolean equals(Object o) { 97 | if (this == o) { 98 | return true; 99 | } 100 | if (o == null || getClass() != o.getClass()) { 101 | return false; 102 | } 103 | DictionaryFile that = (DictionaryFile) o; 104 | return lastModified == that.lastModified && 105 | Objects.equals(path, that.path) && 106 | Objects.equals(type, that.type); 107 | } 108 | 109 | @Override 110 | public int hashCode() { 111 | return Objects.hash(path, type, lastModified); 112 | } 113 | 114 | @Override 115 | public String toString() { 116 | return "DictionaryFile{" + 117 | "path='" + path + '\'' + 118 | ", lastModified=" + lastModified + 119 | '}'; 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/ExtMonitor.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.dic; 2 | 3 | import com.texsmart.TexSmart; 4 | import com.texsmart.dic.cache.DictionaryFileCache; 5 | import com.texsmart.help.ESPluginLoggerFactory; 6 | import org.apache.logging.log4j.Logger; 7 | import org.elasticsearch.SpecialPermission; 8 | 9 | import java.io.File; 10 | import java.io.FileInputStream; 11 | import java.io.InputStreamReader; 12 | import java.security.AccessController; 13 | import java.security.PrivilegedAction; 14 | import java.util.ArrayList; 15 | import java.util.Arrays; 16 | import java.util.List; 17 | import java.util.Properties; 18 | 19 | /** 20 | * @project: elasticsearch-analysis-hanlp 21 | * @description: 自定义词典监控线程 22 | * @author: Kenn 23 | * @create: 2018-12-14 15:10 24 | */ 25 | public class ExtMonitor implements Runnable { 26 | 27 | private static final Logger logger = ESPluginLoggerFactory.getLogger(ExtMonitor.class.getName()); 28 | 29 | ExtMonitor() { 30 | SecurityManager sm = System.getSecurityManager(); 31 | if (sm != null) { 32 | sm.checkPermission(new SpecialPermission()); 33 | } 34 | } 35 | 36 | @Override 37 | public void run() { 38 | // List originalDictionaryFileList = DictionaryFileCache.getCustomDictionaryFileList(); 39 | // logger.debug("hanlp original custom dictionary: 
{}", Arrays.toString(originalDictionaryFileList.toArray())); 40 | // reloadProperty(); 41 | // List currentDictironaryFileList = getCurrentDictionaryFileList(TexSmart.Config.CustomDictionaryPath); 42 | // logger.debug("hanlp current custom dictionary: {}", Arrays.toString(currentDictironaryFileList.toArray())); 43 | // boolean isModified = false; 44 | // for (DictionaryFile currentDictionaryFile : currentDictironaryFileList) { 45 | // if (!originalDictionaryFileList.contains(currentDictionaryFile)) { 46 | // isModified = true; 47 | // break; 48 | // } 49 | // } 50 | // if (isModified) { 51 | // logger.info("reloading hanlp custom dictionary"); 52 | // try { 53 | // AccessController.doPrivileged((PrivilegedAction) CustomDictionaryUtility::reload); 54 | // } catch (Exception e) { 55 | // logger.error("can not reload hanlp custom dictionary", e); 56 | // } 57 | // DictionaryFileCache.setCustomDictionaryFileList(currentDictironaryFileList); 58 | // DictionaryFileCache.writeCache(); 59 | // logger.info("finish reload hanlp custom dictionary"); 60 | // } else { 61 | // logger.info("hanlp custom dictionary isn't modified, so no need reload"); 62 | // } 63 | } 64 | 65 | private void reloadProperty() { 66 | // Properties p = new Properties(); 67 | // try { 68 | // ClassLoader loader = AccessController.doPrivileged((PrivilegedAction) () -> Thread.currentThread().getContextClassLoader()); 69 | // if (loader == null) { 70 | // loader = HanLP.Config.class.getClassLoader(); 71 | // } 72 | // p.load(new InputStreamReader(Predefine.HANLP_PROPERTIES_PATH == null ? loader.getResourceAsStream("hanlp.properties") : new FileInputStream(Predefine.HANLP_PROPERTIES_PATH), "UTF-8")); 73 | // String root = p.getProperty("root", "").replaceAll("\\\\", "/"); 74 | // if (root.length() > 0 && !root.endsWith("/")) { 75 | // root += "/"; 76 | // } 77 | // String[] pathArray = p.getProperty("CustomDictionaryPath", "data/dictionary/custom/CustomDictionary.txt").split(";"); 78 | // String prePath = root; 79 | // for (int i = 0; i < pathArray.length; ++i) { 80 | // if (pathArray[i].startsWith(" ")) { 81 | // pathArray[i] = prePath + pathArray[i].trim(); 82 | // } else { 83 | // pathArray[i] = root + pathArray[i]; 84 | // int lastSplash = pathArray[i].lastIndexOf('/'); 85 | // if (lastSplash != -1) { 86 | // prePath = pathArray[i].substring(0, lastSplash + 1); 87 | // } 88 | // } 89 | // } 90 | // AccessController.doPrivileged((PrivilegedAction) () -> HanLP.Config.CustomDictionaryPath = pathArray); 91 | // } catch (Exception e) { 92 | // logger.error("can not find hanlp.properties", e); 93 | // } 94 | // } 95 | // 96 | // private List getCurrentDictionaryFileList(String[] customDictionaryPaths) { 97 | // List dictionaryFileList = new ArrayList<>(); 98 | // for (String customDictionaryPath : customDictionaryPaths) { 99 | // String[] customDictionaryPathTuple = customDictionaryPath.split(" "); 100 | // String path = customDictionaryPathTuple[0].trim(); 101 | // logger.debug("hanlp custom path: {}", path); 102 | // File file = new File(path); 103 | // AccessController.doPrivileged((PrivilegedAction) () -> { 104 | // if (file.exists()) { 105 | // if (customDictionaryPathTuple.length > 1) { 106 | // if (customDictionaryPathTuple[1] == null || customDictionaryPathTuple[1].length() == 0) { 107 | // dictionaryFileList.add(new DictionaryFile(path, file.lastModified())); 108 | // } else { 109 | // dictionaryFileList.add(new DictionaryFile(path, customDictionaryPathTuple[1].trim(), file.lastModified())); 110 | // } 111 | // } else { 112 
| // dictionaryFileList.add(new DictionaryFile(path, file.lastModified())); 113 | // } 114 | // } 115 | // return null; 116 | // }); 117 | // } 118 | // return dictionaryFileList; 119 | } 120 | } 121 | 122 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/RemoteMonitor.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.dic; 2 | 3 | import com.texsmart.help.ESPluginLoggerFactory; 4 | import org.apache.http.HttpStatus; 5 | import org.apache.http.client.config.RequestConfig; 6 | import org.apache.http.client.methods.CloseableHttpResponse; 7 | import org.apache.http.client.methods.HttpGet; 8 | import org.apache.http.client.methods.HttpHead; 9 | import org.apache.http.impl.client.CloseableHttpClient; 10 | import org.apache.http.impl.client.HttpClients; 11 | import org.apache.logging.log4j.Logger; 12 | import org.elasticsearch.SpecialPermission; 13 | import org.elasticsearch.common.collect.Tuple; 14 | import org.elasticsearch.core.internal.io.IOUtils; 15 | 16 | import java.io.BufferedReader; 17 | import java.io.IOException; 18 | import java.io.InputStreamReader; 19 | import java.nio.charset.Charset; 20 | import java.nio.charset.StandardCharsets; 21 | import java.security.AccessController; 22 | import java.security.PrivilegedAction; 23 | 24 | /** 25 | * @project: elasticsearch-analysis-hanlp 26 | * @description: 自定义远程词典监控线程 27 | * @author: Kenn 28 | * @create: 2018-12-14 15:10 29 | */ 30 | public class RemoteMonitor implements Runnable { 31 | 32 | private static final Logger logger = ESPluginLoggerFactory.getLogger(RemoteMonitor.class.getName()); 33 | 34 | private static CloseableHttpClient httpclient = HttpClients.createDefault(); 35 | /** 36 | * 上次更改时间 37 | */ 38 | private String last_modified; 39 | /** 40 | * 资源属性 41 | */ 42 | private String eTags; 43 | /** 44 | * 请求地址 45 | */ 46 | private String location; 47 | /** 48 | * 数据类型 49 | */ 50 | private String type; 51 | 52 | private static final String SPLITTER = "\\s"; 53 | 54 | public RemoteMonitor(String location, String type) { 55 | this.location = location; 56 | this.type = type; 57 | this.last_modified = null; 58 | this.eTags = null; 59 | } 60 | 61 | @Override 62 | public void run() { 63 | SpecialPermission.check(); 64 | AccessController.doPrivileged((PrivilegedAction) () -> { 65 | runUnprivileged(); 66 | return null; 67 | }); 68 | } 69 | 70 | /** 71 | * 监控流程: 72 | * ①向词库服务器发送Head请求 73 | * ②从响应中获取Last-Modify、ETags字段值,判断是否变化 74 | * ③如果未变化,休眠1min,返回第①步 75 | * ④如果有变化,重新加载词典 76 | * ⑤休眠1min,返回第①步 77 | */ 78 | 79 | private void runUnprivileged() { 80 | String path = location.split(SPLITTER)[0]; 81 | 82 | HttpHead head = new HttpHead(path); 83 | // head.setConfig(buildRequestConfig()); 84 | 85 | // 设置请求头 86 | if (last_modified != null) { 87 | head.setHeader("If-Modified-Since", last_modified); 88 | } 89 | if (eTags != null) { 90 | head.setHeader("If-None-Match", eTags); 91 | } 92 | 93 | CloseableHttpResponse response = null; 94 | try { 95 | response = httpclient.execute(head); 96 | if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { 97 | if ((response.getLastHeader("Last-Modified") != null) && !response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified)) { 98 | loadRemoteCustomWords(response); 99 | } else if ((response.getLastHeader("ETag") != null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags)) { 100 | loadRemoteCustomWords(response); 101 | } 102 | } else if 
(response.getStatusLine().getStatusCode() == HttpStatus.SC_NOT_MODIFIED) { 103 | logger.info("remote_ext_dict {} is without modified", location); 104 | } else { 105 | logger.info("remote_ext_dict {} return bad code {}", location, response.getStatusLine().getStatusCode()); 106 | } 107 | } catch (Exception e) { 108 | e.printStackTrace(); 109 | logger.error("remote_ext_dict {} error!", e, location); 110 | } finally { 111 | try { 112 | if (response != null) { 113 | response.close(); 114 | } 115 | } catch (IOException e) { 116 | logger.error(e.getMessage(), e); 117 | } 118 | } 119 | } 120 | 121 | /** 122 | * 加载远程自定义词典 123 | * 124 | * @param response header响应 125 | */ 126 | private void loadRemoteCustomWords(CloseableHttpResponse response) { 127 | switch (type) { 128 | case "custom": 129 | logger.info("load hanlp remote custom dict path: {}", location); 130 | loadRemoteWordsUnprivileged(location); 131 | logger.info("finish load hanlp remote custom dict path: {}", location); 132 | break; 133 | case "stop": 134 | logger.info("load hanlp remote stop words path: {}", location); 135 | // loadRemoteStopWordsUnprivileged(location); 136 | logger.info("finish load hanlp remote stop words path: {}", location); 137 | break; 138 | default: 139 | return; 140 | } 141 | last_modified = response.getLastHeader("Last-Modified") == null ? null : response.getLastHeader("Last-Modified").getValue(); 142 | eTags = response.getLastHeader("ETag") == null ? null : response.getLastHeader("ETag").getValue(); 143 | } 144 | 145 | /** 146 | * 从远程服务器上下载自定义词条 147 | * 148 | * @param location 配置条目 149 | */ 150 | private void loadRemoteWordsUnprivileged(String location) { 151 | // Tuple defaultInfo = analysisDefaultInfo(location); 152 | // CloseableHttpClient httpclient = HttpClients.createDefault(); 153 | // CloseableHttpResponse response = null; 154 | // BufferedReader in = null; 155 | // HttpGet get = new HttpGet(defaultInfo.v1()); 156 | // get.setConfig(buildRequestConfig()); 157 | // try { 158 | // response = httpclient.execute(get); 159 | // if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { 160 | // in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), analysisDefaultCharset(response))); 161 | // String line; 162 | // boolean firstLine = true; 163 | // while ((line = in.readLine()) != null) { 164 | // if (firstLine) { 165 | // line = IOUtil.removeUTF8BOM(line); 166 | // firstLine = false; 167 | // } 168 | // 169 | // // 切分 170 | // String[] param = line.split(SPLITTER); 171 | // String word = param[0]; 172 | // 173 | // // 排除空行 174 | // if (word.length() == 0) { 175 | // continue; 176 | // } 177 | // 178 | // // 正规化 179 | // if (HanLP.Config.Normalization) { 180 | // word = CharTable.convert(word); 181 | // } 182 | // logger.debug("hanlp remote custom word: {}", word); 183 | // CustomDictionary.insert(word, analysisNatureWithFrequency(defaultInfo.v2(), param)); 184 | // } 185 | // in.close(); 186 | // response.close(); 187 | // } 188 | // response.close(); 189 | // } catch (IllegalStateException | IOException e) { 190 | // logger.error("get remote words {} error", e, location); 191 | // } finally { 192 | // try { 193 | // IOUtils.close(in); 194 | // IOUtils.close(response); 195 | // } catch (Exception e) { 196 | // e.printStackTrace(); 197 | // } 198 | // } 199 | // } 200 | // 201 | // /** 202 | // * 从远程服务器上下载停止词词条 203 | // * 204 | // * @param location 配置条目 205 | // */ 206 | // private void loadRemoteStopWordsUnprivileged(String location) { 207 | // CloseableHttpClient httpclient 
= HttpClients.createDefault(); 208 | // CloseableHttpResponse response = null; 209 | // BufferedReader in = null; 210 | // HttpGet get = new HttpGet(location); 211 | // get.setConfig(buildRequestConfig()); 212 | // try { 213 | // response = httpclient.execute(get); 214 | // if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { 215 | // in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), analysisDefaultCharset(response))); 216 | // String line; 217 | // boolean firstLine = true; 218 | // while ((line = in.readLine()) != null) { 219 | // if (firstLine) { 220 | // line = IOUtil.removeUTF8BOM(line); 221 | // firstLine = false; 222 | // } 223 | // logger.debug("hanlp remote stop word: {}", line); 224 | // CoreStopWordDictionary.add(line); 225 | // } 226 | // in.close(); 227 | // response.close(); 228 | // } 229 | // response.close(); 230 | // } catch (IllegalStateException | IOException e) { 231 | // logger.error("get remote words {} error", e, location); 232 | // } finally { 233 | // try { 234 | // IOUtils.close(in); 235 | // IOUtils.close(response); 236 | // } catch (Exception e) { 237 | // e.printStackTrace(); 238 | // } 239 | // } 240 | // } 241 | // 242 | // private RequestConfig buildRequestConfig() { 243 | // return RequestConfig.custom() 244 | // .setConnectionRequestTimeout(10 * 1000) 245 | // .setConnectTimeout(10 * 1000) 246 | // .setSocketTimeout(60 * 1000) 247 | // .build(); 248 | // } 249 | // 250 | // /** 251 | // * 分析默认编码 252 | // * 253 | // * @param response 响应 254 | // * @return 返回编码 255 | // */ 256 | // private Charset analysisDefaultCharset(CloseableHttpResponse response) { 257 | // Charset charset = StandardCharsets.UTF_8; 258 | // // 获取编码,默认为utf-8 259 | // if (response.getEntity().getContentType().getValue().contains("charset=")) { 260 | // String contentType = response.getEntity().getContentType().getValue(); 261 | // charset = Charset.forName(contentType.substring(contentType.lastIndexOf("=") + 1)); 262 | // } 263 | // return charset; 264 | // } 265 | // 266 | // /** 267 | // * 解析默认信息 268 | // * 269 | // * @param location 配置路径 270 | // * @return 返回new Tuple<路径, 默认词性> 271 | // */ 272 | // private Tuple analysisDefaultInfo(String location) { 273 | // Nature defaultNature = Nature.n; 274 | // String path = location; 275 | // int cut = location.indexOf(' '); 276 | // if (cut > 0) { 277 | // // 有默认词性 278 | // String nature = location.substring(cut + 1); 279 | // path = location.substring(0, cut); 280 | // defaultNature = LexiconUtility.convertStringToNature(nature); 281 | // } 282 | // return Tuple.tuple(path, defaultNature); 283 | // } 284 | // 285 | // /** 286 | // * 分析词性和频次 287 | // * 288 | // * @param defaultNature 默认词性 289 | // * @param param 行数据 290 | // * @return 返回[单词] [词性A] [A的频次] [词性B] [B的频次] ... 
291 | // */
292 | // private String analysisNatureWithFrequency(Nature defaultNature, String[] param) {
293 | // int natureCount = (param.length - 1) / 2;
294 | // StringBuilder builder = new StringBuilder();
295 | // if (natureCount == 0) {
296 | // builder.append(defaultNature).append(" ").append(1000);
297 | // } else {
298 | // for (int i = 0; i < natureCount; ++i) {
299 | // Nature nature = LexiconUtility.convertStringToNature(param[1 + 2 * i]);
300 | // int frequency = Integer.parseInt(param[2 + 2 * i]);
301 | // builder.append(nature).append(" ").append(frequency);
302 | // if (i != natureCount - 1) {
303 | // builder.append(" ");
304 | // }
305 | // }
306 | // }
307 | // return builder.toString();
308 | }
309 | }
310 |
311 |
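The monitor above drives its reload decision off the Last-Modified and ETag response headers: a 200 means the dictionary changed, a 304 means nothing to do. A minimal sketch of that conditional-request handshake with Apache HttpClient follows; the URL is a placeholder, and a HEAD request is used only to keep the probe cheap (the monitor would fetch and reload the body on 200):

    import org.apache.http.HttpStatus;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpHead;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;

    public class ConditionalRequestSketch {
        public static void main(String[] args) throws Exception {
            String location = "http://example.com/remote_ext_dict.txt"; // hypothetical URL
            String lastModified = null;                                 // cached from the previous poll
            String eTag = null;
            try (CloseableHttpClient client = HttpClients.createDefault()) {
                HttpHead head = new HttpHead(location);
                if (lastModified != null) head.setHeader("If-Modified-Since", lastModified);
                if (eTag != null) head.setHeader("If-None-Match", eTag);
                try (CloseableHttpResponse response = client.execute(head)) {
                    int code = response.getStatusLine().getStatusCode();
                    if (code == HttpStatus.SC_OK) {
                        // 200: dictionary changed; reload it and remember the new validators
                        lastModified = response.getLastHeader("Last-Modified") == null
                                ? null : response.getLastHeader("Last-Modified").getValue();
                        eTag = response.getLastHeader("ETag") == null
                                ? null : response.getLastHeader("ETag").getValue();
                    } else if (code == HttpStatus.SC_NOT_MODIFIED) {
                        // 304: unchanged; wait for the next poll
                    }
                }
            }
        }
    }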
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/cache/DictionaryFileCache.java: --------------------------------------------------------------------------------
1 | package com.texsmart.dic.cache;
2 |
3 | import com.texsmart.cfg.Configuration;
4 | import com.texsmart.dic.DictionaryFile;
5 | import com.texsmart.help.ESPluginLoggerFactory;
6 | import org.apache.logging.log4j.Logger;
7 | import org.elasticsearch.core.internal.io.IOUtils;
8 | import org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin;
9 |
10 | import java.io.*;
11 | import java.nio.file.Path;
12 | import java.security.AccessController;
13 | import java.security.PrivilegedAction;
14 | import java.util.ArrayList;
15 | import java.util.Arrays;
16 | import java.util.List;
17 |
18 | public class DictionaryFileCache {
19 |
20 | private static final Logger logger = ESPluginLoggerFactory.getLogger(DictionaryFileCache.class.getName());
21 |
22 | private static Path cachePath = null;
23 |
24 | private static final String DICTIONARY_FILE_CACHE_RECORD_FILE = "hanlp.cache";
25 |
26 | private static List<DictionaryFile> customDictionaryFileList = new ArrayList<>();
27 |
28 | public static synchronized void configCachePath(Configuration configuration) {
29 | cachePath = configuration.getEnvironment().pluginsFile().resolve(AnalysisTexSmartPlugin.PLUGIN_NAME).resolve(DICTIONARY_FILE_CACHE_RECORD_FILE);
30 | }
31 |
32 | public static void loadCache() {
33 | File file = cachePath.toFile();
34 | if (!file.exists()) {
35 | return;
36 | }
37 | List<DictionaryFile> dictionaryFiles = AccessController.doPrivileged((PrivilegedAction<List<DictionaryFile>>) () -> {
38 | List<DictionaryFile> dictionaryFileList = new ArrayList<>();
39 | DataInputStream in = null;
40 | try {
41 | in = new DataInputStream(new FileInputStream(file));
42 | int size = in.readInt();
43 | for (int i = 0; i < size; i++) {
44 | DictionaryFile dictionaryFile = new DictionaryFile();
45 | dictionaryFile.read(in);
46 | dictionaryFileList.add(dictionaryFile);
47 | }
48 | } catch (IOException e) {
49 | logger.debug("can not load custom dictionary cache file", e);
50 | } finally {
51 | try {
52 | IOUtils.close(in);
53 | } catch (IOException e) {
54 | e.printStackTrace();
55 | }
56 | }
57 | return dictionaryFileList;
58 | });
59 | setCustomDictionaryFileList(dictionaryFiles);
60 | }
61 |
62 | public static void writeCache() {
63 | AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
64 | DataOutputStream out = null;
65 | try {
66 | logger.info("begin writing texsmart custom dictionary file cache, file path: {}, custom dictionary file list: {}", cachePath.toFile().getAbsolutePath(), Arrays.toString(customDictionaryFileList.toArray()));
67 | out = new DataOutputStream(new FileOutputStream(cachePath.toFile()));
68 | out.writeInt(customDictionaryFileList.size());
69 | for (DictionaryFile dictionaryFile : customDictionaryFileList) {
70 | dictionaryFile.write(out);
71 | }
72 | logger.info("wrote texsmart custom dictionary file cache successfully");
73 | } catch (IOException e) {
74 | logger.debug("can not write texsmart custom dictionary file cache", e);
75 | } finally {
76 | try {
77 | IOUtils.close(out);
78 | } catch (IOException e) {
79 | e.printStackTrace();
80 | }
81 | }
82 | return null;
83 | });
84 | }
85 |
86 | public static List<DictionaryFile> getCustomDictionaryFileList() {
87 | return customDictionaryFileList;
88 | }
89 |
90 | public static synchronized void setCustomDictionaryFileList(List<DictionaryFile> customDictionaryFileList) {
91 | DictionaryFileCache.customDictionaryFileList = customDictionaryFileList;
92 | }
93 | }
94 |
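The cache file written above is a plain DataOutputStream record: an int entry count followed by each entry, with DictionaryFile's own read/write (not shown in this section) handling the per-entry fields. A self-contained sketch of the same count-prefixed layout, using strings in place of DictionaryFile for illustration:

    import java.io.*;
    import java.util.Arrays;
    import java.util.List;

    public class CountPrefixedRecordSketch {
        public static void main(String[] args) throws IOException {
            File cache = File.createTempFile("texsmart", ".cache");
            List<String> entries = Arrays.asList("dict_a.txt", "dict_b.txt");
            try (DataOutputStream out = new DataOutputStream(new FileOutputStream(cache))) {
                out.writeInt(entries.size());             // entry count first
                for (String e : entries) out.writeUTF(e); // then each entry
            }
            try (DataInputStream in = new DataInputStream(new FileInputStream(cache))) {
                int size = in.readInt();                  // mirrors loadCache()
                for (int i = 0; i < size; i++) System.out.println(in.readUTF());
            }
        }
    }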
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/config/RemoteDictConfig.java: --------------------------------------------------------------------------------
1 | package com.texsmart.dic.config;
2 |
3 | import com.texsmart.dic.Dictionary;
4 | import com.texsmart.help.ESPluginLoggerFactory;
5 | import org.apache.logging.log4j.Logger;
6 | import org.elasticsearch.core.internal.io.IOUtils;
7 |
8 | import java.io.FileInputStream;
9 | import java.io.FileNotFoundException;
10 | import java.io.IOException;
11 | import java.io.InputStream;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | import java.util.Properties;
15 |
16 | /**
17 | * @project: elasticsearch-analysis-texsmart
18 | * @description: Remote dictionary configuration
19 | * @author: Kenn
20 | * @create: 2018-12-18 15:23
21 | */
22 | public class RemoteDictConfig {
23 |
24 | /**
25 | * Singleton instance of the remote dictionary configuration
26 | */
27 | private static RemoteDictConfig singleton;
28 |
29 | private static final Logger logger = ESPluginLoggerFactory.getLogger(RemoteDictConfig.class.getName());
30 |
31 | private static final String REMOTE_EXT_DICT = "remote_ext_dict";
32 |
33 | private static final String REMOTE_EXT_STOP = "remote_ext_stopwords";
34 |
35 | private Properties props;
36 |
37 | private String configFile;
38 |
39 | private RemoteDictConfig(String configFile) {
40 | this.configFile = configFile;
41 | this.props = new Properties();
42 | loadConfig();
43 | }
44 |
45 | public static synchronized RemoteDictConfig initial(String configFile) {
46 | if (singleton == null) {
47 | synchronized (Dictionary.class) {
48 | if (singleton == null) {
49 | singleton = new RemoteDictConfig(configFile);
50 | }
51 | }
52 | }
53 | return singleton;
54 | }
55 |
56 | public boolean loadConfig() {
57 | InputStream input = null;
58 | try {
59 | logger.info("try to load remote texsmart config from {}", configFile);
60 | input = new FileInputStream(configFile);
61 | props.loadFromXML(input);
62 | } catch (FileNotFoundException e) {
63 | logger.error("remote texsmart config does not exist", e);
64 | return false;
65 | } catch (Exception e) {
66 | logger.error("can not load remote texsmart config", e);
67 | return false;
68 | } finally {
69 | try {
70 | IOUtils.close(input);
71 | } catch (IOException e) {
72 | e.printStackTrace();
73 | }
74 | }
75 | return true;
76 | }
77 |
78 | public List<String> getRemoteExtDictionarys() {
79 | return getRemoteExtFiles(REMOTE_EXT_DICT);
80 | }
81 |
82 | public List<String> getRemoteExtStopWordDictionarys() {
83 | return getRemoteExtFiles(REMOTE_EXT_STOP);
84 | }
85 |
86 | private List<String> getRemoteExtFiles(String key) {
87 | List<String> remoteExtFiles = new ArrayList<>(2);
88 | String remoteExtCfg = getProperty(key);
89 | if (remoteExtCfg != null) {
90 | String[] filePaths = remoteExtCfg.split(";");
91 | for (String filePath : filePaths) {
92 | if (filePath != null && !"".equals(filePath.trim())) {
93 | remoteExtFiles.add(filePath);
94 | }
95 | }
96 | }
97 | return remoteExtFiles;
98 | }
99 |
100 | private String getProperty(String key) {
101 | if (props != null) {
102 | return props.getProperty(key);
103 | }
104 | return null;
105 | }
106 |
107 | /**
108 | * Get the remote dictionary configuration instance
109 | *
110 | * @return the RemoteDictConfig singleton
111 | */
112 | public static RemoteDictConfig getSingleton() {
113 | if (singleton == null) {
114 | throw new IllegalStateException("Remote dictionary configuration has not been initialized; call initial() first");
115 | }
116 | return singleton;
117 | }
118 | }
119 |
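loadConfig() reads a java.util.Properties XML document (texsmart-remote.xml in this repo), and getRemoteExtFiles() splits each value on ";". A sketch of that round trip against an inline document; the URLs are placeholders:

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.Properties;

    public class RemoteDictConfigSketch {
        public static void main(String[] args) throws Exception {
            String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                    + "<!DOCTYPE properties SYSTEM \"http://java.sun.com/dtd/properties.dtd\">\n"
                    + "<properties>\n"
                    + "  <entry key=\"remote_ext_dict\">http://example.com/dict_a.txt;http://example.com/dict_b.txt</entry>\n"
                    + "  <entry key=\"remote_ext_stopwords\"></entry>\n"
                    + "</properties>\n";
            Properties props = new Properties();
            props.loadFromXML(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
            // Same semicolon-separated convention as getRemoteExtFiles()
            for (String path : props.getProperty("remote_ext_dict").split(";")) {
                if (!path.trim().isEmpty()) System.out.println(path);
            }
        }
    }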
term.tag : "空"; 11 | return nature.equals("PU"); 12 | }; 13 | 14 | public static FilterStopWord getInstance() { 15 | return ourInstance; 16 | } 17 | 18 | private FilterStopWord() {} 19 | 20 | public static boolean beRemove(Term term) { 21 | return FILTER.beRemove(term); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/help/ESPluginLoggerFactory.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.help; 2 | 3 | import org.apache.logging.log4j.LogManager; 4 | import org.apache.logging.log4j.Logger; 5 | import org.apache.logging.log4j.spi.ExtendedLogger; 6 | 7 | /** 8 | * @project: elasticsearch-analysis-texsmart 9 | * @description: logger 10 | * @author: wei_liu 11 | * @create: 2020-09-09 15:10 12 | */ 13 | public class ESPluginLoggerFactory { 14 | 15 | private ESPluginLoggerFactory() { 16 | } 17 | 18 | static public Logger getLogger(String name) { 19 | return getLogger("", LogManager.getLogger(name)); 20 | } 21 | 22 | static public Logger getLogger(String prefix, String name) { 23 | return getLogger(prefix, LogManager.getLogger(name)); 24 | } 25 | 26 | static public Logger getLogger(String prefix, Class clazz) { 27 | return getLogger(prefix, LogManager.getLogger(clazz.getName())); 28 | } 29 | 30 | static public Logger getLogger(String prefix, Logger logger) { 31 | return (Logger)(prefix != null && prefix.length() != 0 ? new PrefixPluginLogger((ExtendedLogger)logger, logger.getName(), prefix) : logger); 32 | } 33 | } 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/help/PrefixPluginLogger.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.help; 2 | 3 | import org.apache.logging.log4j.Level; 4 | import org.apache.logging.log4j.Marker; 5 | import org.apache.logging.log4j.MarkerManager; 6 | import org.apache.logging.log4j.message.Message; 7 | import org.apache.logging.log4j.spi.ExtendedLogger; 8 | import org.apache.logging.log4j.spi.ExtendedLoggerWrapper; 9 | 10 | import java.util.WeakHashMap; 11 | 12 | /** 13 | * @project: elasticsearch-analysis-texsmart 14 | * @description: logger wrapper 15 | * @author: wei_liu 16 | * @create: 2020-09-09 15:10 17 | */ 18 | public class PrefixPluginLogger extends ExtendedLoggerWrapper { 19 | 20 | private static final WeakHashMap MARKERS = new WeakHashMap<>(); 21 | 22 | private final Marker marker; 23 | 24 | static int markersSize() { 25 | return MARKERS.size(); 26 | } 27 | 28 | public String prefix() { 29 | return this.marker.getName(); 30 | } 31 | 32 | PrefixPluginLogger(ExtendedLogger logger, String name, String prefix) { 33 | super(logger, name, null); 34 | String actualPrefix = prefix == null ? 
"" : prefix; 35 | MarkerManager.Log4jMarker actualMarker; 36 | synchronized (MARKERS) { 37 | MarkerManager.Log4jMarker maybeMarker = (MarkerManager.Log4jMarker)MARKERS.get(actualPrefix); 38 | if (maybeMarker == null) { 39 | actualMarker = new MarkerManager.Log4jMarker(actualPrefix); 40 | MARKERS.put(actualPrefix, actualMarker); 41 | } else { 42 | actualMarker = maybeMarker; 43 | } 44 | } 45 | this.marker = actualMarker; 46 | } 47 | 48 | @Override 49 | public void logMessage(String fqcn, Level level, Marker marker, Message message, Throwable t) { 50 | assert marker == null; 51 | super.logMessage(fqcn, level, this.marker, message, t); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/PorterStemmer.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.lucene; 2 | 3 | import org.apache.lucene.util.ArrayUtil; 4 | 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | 9 | /** 10 | * 抄袭lucene的英文处理 11 | * Stemmer, implementing the Porter Stemming Algorithm 12 | *

13 | * The Stemmer class transforms a word into its root form. The input word can be 14 | * provided a character at time (by calling add()), or at once by calling one of 15 | * the various stem(something) methods. 16 | */ 17 | 18 | public class PorterStemmer 19 | { 20 | private char[] b; 21 | private int i, /* offset into b */ 22 | j, k, k0; 23 | private boolean dirty = false; 24 | private static final int INITIAL_SIZE = 50; 25 | 26 | public PorterStemmer() 27 | { 28 | b = new char[INITIAL_SIZE]; 29 | i = 0; 30 | } 31 | 32 | /** 33 | * reset() resets the stemmer so it can stem another word. If you invoke the 34 | * stemmer by calling add(char) and then stem(), you must call reset() 35 | * before starting another word. 36 | */ 37 | public void reset() 38 | { 39 | i = 0; 40 | dirty = false; 41 | } 42 | 43 | /** 44 | * Add a character to the word being stemmed. When you are finished adding 45 | * characters, you can call stem(void) to process the word. 46 | */ 47 | public void add(char ch) 48 | { 49 | if (b.length <= i) 50 | { 51 | b = ArrayUtil.grow(b, i + 1); 52 | } 53 | b[i++] = ch; 54 | } 55 | 56 | /** 57 | * After a word has been stemmed, it can be retrieved by toString(), or a 58 | * reference to the internal buffer can be retrieved by getResultBuffer and 59 | * getResultLength (which is generally more efficient.) 60 | */ 61 | @Override 62 | public String toString() 63 | { 64 | return new String(b, 0, i); 65 | } 66 | 67 | /** 68 | * Returns the length of the word resulting from the stemming process. 69 | */ 70 | public int getResultLength() 71 | { 72 | return i; 73 | } 74 | 75 | /** 76 | * Returns a reference to a character buffer containing the results of the 77 | * stemming process. You also need to consult getResultLength() to determine 78 | * the length of the result. 79 | */ 80 | public char[] getResultBuffer() 81 | { 82 | return b; 83 | } 84 | 85 | /* cons(i) is true <=> b[i] is a consonant. */ 86 | 87 | private final boolean cons(int i) 88 | { 89 | switch (b[i]) 90 | { 91 | case 'a': 92 | case 'e': 93 | case 'i': 94 | case 'o': 95 | case 'u': 96 | return false; 97 | case 'y': 98 | return (i == k0) ? true : !cons(i - 1); 99 | default: 100 | return true; 101 | } 102 | } 103 | 104 | /* 105 | * m() measures the number of consonant sequences between k0 and j. if c is 106 | * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary 107 | * presence, 108 | * 109 | * gives 0 vc gives 1 vcvc gives 2 vcvcvc gives 3 110 | * .... 111 | */ 112 | 113 | private final int m() 114 | { 115 | int n = 0; 116 | int i = k0; 117 | while (true) 118 | { 119 | if (i > j) 120 | return n; 121 | if (!cons(i)) 122 | break; 123 | i++; 124 | } 125 | i++; 126 | while (true) 127 | { 128 | while (true) 129 | { 130 | if (i > j) 131 | return n; 132 | if (cons(i)) 133 | break; 134 | i++; 135 | } 136 | i++; 137 | n++; 138 | while (true) 139 | { 140 | if (i > j) 141 | return n; 142 | if (!cons(i)) 143 | break; 144 | i++; 145 | } 146 | i++; 147 | } 148 | } 149 | 150 | /* vowelinstem() is true <=> k0,...j contains a vowel */ 151 | 152 | private final boolean vowelinstem() 153 | { 154 | int i; 155 | for (i = k0; i <= j; i++) 156 | if (!cons(i)) 157 | return true; 158 | return false; 159 | } 160 | 161 | /* doublec(j) is true <=> j,(j-1) contain a double consonant. 
*/ 162 | 163 | private final boolean doublec(int j) 164 | { 165 | if (j < k0 + 1) 166 | return false; 167 | if (b[j] != b[j - 1]) 168 | return false; 169 | return cons(j); 170 | } 171 | 172 | /* 173 | * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant 174 | * and also if the second c is not w,x or y. this is used when trying to 175 | * restore an e at the end of a short word. e.g. 176 | * 177 | * cav(e), lov(e), hop(e), crim(e), but snow, box, tray. 178 | */ 179 | 180 | private final boolean cvc(int i) 181 | { 182 | if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) 183 | return false; 184 | else 185 | { 186 | int ch = b[i]; 187 | if (ch == 'w' || ch == 'x' || ch == 'y') 188 | return false; 189 | } 190 | return true; 191 | } 192 | 193 | private final boolean ends(String s) 194 | { 195 | int l = s.length(); 196 | int o = k - l + 1; 197 | if (o < k0) 198 | return false; 199 | for (int i = 0; i < l; i++) 200 | if (b[o + i] != s.charAt(i)) 201 | return false; 202 | j = k - l; 203 | return true; 204 | } 205 | 206 | /* 207 | * setto(s) sets (j+1),...k to the characters in the string s, readjusting 208 | * k. 209 | */ 210 | 211 | void setto(String s) 212 | { 213 | int l = s.length(); 214 | int o = j + 1; 215 | for (int i = 0; i < l; i++) 216 | b[o + i] = s.charAt(i); 217 | k = j + l; 218 | dirty = true; 219 | } 220 | 221 | /* r(s) is used further down. */ 222 | 223 | void r(String s) 224 | { 225 | if (m() > 0) 226 | setto(s); 227 | } 228 | 229 | /* 230 | * step1() gets rid of plurals and -ed or -ing. e.g. 231 | * 232 | * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat 233 | * 234 | * feed -> feed agreed -> agree disabled -> disable 235 | * 236 | * matting -> mat mating -> mate meeting -> meet milling -> mill messing -> 237 | * mess 238 | * 239 | * meetings -> meet 240 | */ 241 | 242 | private final void step1() 243 | { 244 | if (b[k] == 's') 245 | { 246 | if (ends("sses")) 247 | k -= 2; 248 | else if (ends("ies")) 249 | setto("i"); 250 | else if (b[k - 1] != 's') 251 | k--; 252 | } 253 | if (ends("eed")) 254 | { 255 | if (m() > 0) 256 | k--; 257 | } 258 | else if ((ends("ed") || ends("ing")) && vowelinstem()) 259 | { 260 | k = j; 261 | if (ends("at")) 262 | setto("ate"); 263 | else if (ends("bl")) 264 | setto("ble"); 265 | else if (ends("iz")) 266 | setto("ize"); 267 | else if (doublec(k)) 268 | { 269 | int ch = b[k--]; 270 | if (ch == 'l' || ch == 's' || ch == 'z') 271 | k++; 272 | } 273 | else if (m() == 1 && cvc(k)) 274 | setto("e"); 275 | } 276 | } 277 | 278 | /* step2() turns terminal y to i when there is another vowel in the stem. */ 279 | 280 | private final void step2() 281 | { 282 | if (ends("y") && vowelinstem()) 283 | { 284 | b[k] = 'i'; 285 | dirty = true; 286 | } 287 | } 288 | 289 | /* 290 | * step3() maps double suffices to single ones. so -ization ( = -ize plus 291 | * -ation) maps to -ize etc. note that the string before the suffix must 292 | * give m() > 0. 
293 | */ 294 | 295 | private final void step3() 296 | { 297 | if (k == k0) 298 | return; /* For Bug 1 */ 299 | switch (b[k - 1]) 300 | { 301 | case 'a': 302 | if (ends("ational")) 303 | { 304 | r("ate"); 305 | break; 306 | } 307 | if (ends("tional")) 308 | { 309 | r("tion"); 310 | break; 311 | } 312 | break; 313 | case 'c': 314 | if (ends("enci")) 315 | { 316 | r("ence"); 317 | break; 318 | } 319 | if (ends("anci")) 320 | { 321 | r("ance"); 322 | break; 323 | } 324 | break; 325 | case 'e': 326 | if (ends("izer")) 327 | { 328 | r("ize"); 329 | break; 330 | } 331 | break; 332 | case 'l': 333 | if (ends("bli")) 334 | { 335 | r("ble"); 336 | break; 337 | } 338 | if (ends("alli")) 339 | { 340 | r("al"); 341 | break; 342 | } 343 | if (ends("entli")) 344 | { 345 | r("ent"); 346 | break; 347 | } 348 | if (ends("eli")) 349 | { 350 | r("e"); 351 | break; 352 | } 353 | if (ends("ousli")) 354 | { 355 | r("ous"); 356 | break; 357 | } 358 | break; 359 | case 'o': 360 | if (ends("ization")) 361 | { 362 | r("ize"); 363 | break; 364 | } 365 | if (ends("ation")) 366 | { 367 | r("ate"); 368 | break; 369 | } 370 | if (ends("ator")) 371 | { 372 | r("ate"); 373 | break; 374 | } 375 | break; 376 | case 's': 377 | if (ends("alism")) 378 | { 379 | r("al"); 380 | break; 381 | } 382 | if (ends("iveness")) 383 | { 384 | r("ive"); 385 | break; 386 | } 387 | if (ends("fulness")) 388 | { 389 | r("ful"); 390 | break; 391 | } 392 | if (ends("ousness")) 393 | { 394 | r("ous"); 395 | break; 396 | } 397 | break; 398 | case 't': 399 | if (ends("aliti")) 400 | { 401 | r("al"); 402 | break; 403 | } 404 | if (ends("iviti")) 405 | { 406 | r("ive"); 407 | break; 408 | } 409 | if (ends("biliti")) 410 | { 411 | r("ble"); 412 | break; 413 | } 414 | break; 415 | case 'g': 416 | if (ends("logi")) 417 | { 418 | r("log"); 419 | break; 420 | } 421 | } 422 | } 423 | 424 | /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ 425 | 426 | private final void step4() 427 | { 428 | switch (b[k]) 429 | { 430 | case 'e': 431 | if (ends("icate")) 432 | { 433 | r("ic"); 434 | break; 435 | } 436 | if (ends("ative")) 437 | { 438 | r(""); 439 | break; 440 | } 441 | if (ends("alize")) 442 | { 443 | r("al"); 444 | break; 445 | } 446 | break; 447 | case 'i': 448 | if (ends("iciti")) 449 | { 450 | r("ic"); 451 | break; 452 | } 453 | break; 454 | case 'l': 455 | if (ends("ical")) 456 | { 457 | r("ic"); 458 | break; 459 | } 460 | if (ends("ful")) 461 | { 462 | r(""); 463 | break; 464 | } 465 | break; 466 | case 's': 467 | if (ends("ness")) 468 | { 469 | r(""); 470 | break; 471 | } 472 | break; 473 | } 474 | } 475 | 476 | /* step5() takes off -ant, -ence etc., in context vcvc. */ 477 | 478 | private final void step5() 479 | { 480 | if (k == k0) 481 | return; /* for Bug 1 */ 482 | switch (b[k - 1]) 483 | { 484 | case 'a': 485 | if (ends("al")) 486 | break; 487 | return; 488 | case 'c': 489 | if (ends("ance")) 490 | break; 491 | if (ends("ence")) 492 | break; 493 | return; 494 | case 'e': 495 | if (ends("er")) 496 | break; 497 | return; 498 | case 'i': 499 | if (ends("ic")) 500 | break; 501 | return; 502 | case 'l': 503 | if (ends("able")) 504 | break; 505 | if (ends("ible")) 506 | break; 507 | return; 508 | case 'n': 509 | if (ends("ant")) 510 | break; 511 | if (ends("ement")) 512 | break; 513 | if (ends("ment")) 514 | break; 515 | /* element etc. 
not stripped before the m */ 516 | if (ends("ent")) 517 | break; 518 | return; 519 | case 'o': 520 | if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) 521 | break; 522 | /* j >= 0 fixes Bug 2 */ 523 | if (ends("ou")) 524 | break; 525 | return; 526 | /* takes care of -ous */ 527 | case 's': 528 | if (ends("ism")) 529 | break; 530 | return; 531 | case 't': 532 | if (ends("ate")) 533 | break; 534 | if (ends("iti")) 535 | break; 536 | return; 537 | case 'u': 538 | if (ends("ous")) 539 | break; 540 | return; 541 | case 'v': 542 | if (ends("ive")) 543 | break; 544 | return; 545 | case 'z': 546 | if (ends("ize")) 547 | break; 548 | return; 549 | default: 550 | return; 551 | } 552 | if (m() > 1) 553 | k = j; 554 | } 555 | 556 | /* step6() removes a final -e if m() > 1. */ 557 | 558 | private final void step6() 559 | { 560 | j = k; 561 | if (b[k] == 'e') 562 | { 563 | int a = m(); 564 | if (a > 1 || a == 1 && !cvc(k - 1)) 565 | k--; 566 | } 567 | if (b[k] == 'l' && doublec(k) && m() > 1) 568 | k--; 569 | } 570 | 571 | /** 572 | * Stem a word provided as a String. Returns the result as a String. 573 | */ 574 | public String stem(String s) 575 | { 576 | if (stem(s.toCharArray(), s.length())) 577 | return toString(); 578 | else 579 | return s; 580 | } 581 | 582 | /** 583 | * Stem a word contained in a char[]. Returns true if the stemming process 584 | * resulted in a word different from the input. You can retrieve the result 585 | * with getResultLength()/getResultBuffer() or toString(). 586 | */ 587 | public boolean stem(char[] word) 588 | { 589 | return stem(word, word.length); 590 | } 591 | 592 | /** 593 | * Stem a word contained in a portion of a char[] array. Returns true if the 594 | * stemming process resulted in a word different from the input. You can 595 | * retrieve the result with getResultLength()/getResultBuffer() or 596 | * toString(). 597 | */ 598 | public boolean stem(char[] wordBuffer, int offset, int wordLen) 599 | { 600 | reset(); 601 | if (b.length < wordLen) 602 | { 603 | b = new char[ArrayUtil.oversize(wordLen, Character.BYTES)]; 604 | } 605 | System.arraycopy(wordBuffer, offset, b, 0, wordLen); 606 | i = wordLen; 607 | return stem(0); 608 | } 609 | 610 | /** 611 | * Stem a word contained in a leading portion of a char[] array. Returns 612 | * true if the stemming process resulted in a word different from the input. 613 | * You can retrieve the result with getResultLength()/getResultBuffer() or 614 | * toString(). 615 | */ 616 | public boolean stem(char[] word, int wordLen) 617 | { 618 | return stem(word, 0, wordLen); 619 | } 620 | 621 | /** 622 | * Stem the word placed into the Stemmer buffer through calls to add(). 623 | * Returns true if the stemming process resulted in a word different from 624 | * the input. You can retrieve the result with 625 | * getResultLength()/getResultBuffer() or toString(). 626 | */ 627 | public boolean stem() 628 | { 629 | return stem(0); 630 | } 631 | 632 | public boolean stem(int i0) 633 | { 634 | k = i - 1; 635 | k0 = i0; 636 | if (k > k0 + 1) 637 | { 638 | step1(); 639 | step2(); 640 | step3(); 641 | step4(); 642 | step5(); 643 | step6(); 644 | } 645 | // Also, a word is considered dirty if we lopped off letters 646 | // Thanks to Ifigenia Vairelles for pointing this out. 647 | if (i != k + 1) 648 | dirty = true; 649 | i = k + 1; 650 | return dirty; 651 | } 652 | 653 | /** 654 | * Test program for demonstrating the Stemmer. It reads a file and stems 655 | * each word, writing the result to standard out. 
Usage: Stemmer file-name
656 | */
657 | public static void main(String[] args)
658 | {
659 | PorterStemmer s = new PorterStemmer();
660 |
661 | for (int i = 0; i < args.length; i++)
662 | {
663 | try
664 | {
665 | InputStream in = new FileInputStream(args[i]);
666 | byte[] buffer = new byte[1024];
667 | int bufferLen, offset, ch;
668 |
669 | bufferLen = in.read(buffer);
670 | offset = 0;
671 | s.reset();
672 |
673 | while (true)
674 | {
675 | if (offset < bufferLen)
676 | ch = buffer[offset++];
677 | else
678 | {
679 | bufferLen = in.read(buffer);
680 | offset = 0;
681 | if (bufferLen < 0)
682 | ch = -1;
683 | else
684 | ch = buffer[offset++];
685 | }
686 |
687 | if (Character.isLetter((char) ch))
688 | {
689 | s.add(Character.toLowerCase((char) ch));
690 | }
691 | else
692 | {
693 | s.stem();
694 | System.out.print(s.toString());
695 | s.reset();
696 | if (ch < 0)
697 | break;
698 | else
699 | {
700 | System.out.print((char) ch);
701 | }
702 | }
703 | }
704 |
705 | in.close();
706 | }
707 | catch (IOException e)
708 | {
709 | System.out.println("error reading " + args[i]);
710 | }
711 | }
712 | }
713 |
714 | }
715 |
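A quick usage sketch of the stemmer class above; the expected outputs follow the examples listed in the step1() comment:

    import com.texsmart.lucene.PorterStemmer;

    public class StemDemo {
        public static void main(String[] args) {
            PorterStemmer stemmer = new PorterStemmer();
            System.out.println(stemmer.stem("meetings")); // -> meet
            System.out.println(stemmer.stem("ponies"));   // -> poni
            System.out.println(stemmer.stem("caresses")); // -> caress
        }
    }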
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/SegmentWrapper.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.cfg.Configuration;
4 | import com.texsmart.seg.Segment;
5 | import tencent.ai.texsmart.NluOutput.Term;
6 |
7 | import java.io.Reader;
8 | import java.security.AccessController;
9 | import java.security.PrivilegedAction;
10 | import java.util.List;
11 | import java.util.Scanner;
12 |
13 | public class SegmentWrapper {
14 |
15 | private Scanner scanner;
16 |
17 | private Segment segment;
18 | /**
19 | * next() emits one term at a time, so the terms of the current line are cached here
20 | */
21 | private Term[] termArray;
22 | /**
23 | * Current index into termArray
24 | */
25 | private int index;
26 | /**
27 | * Offset of the current line; the wrapper reads line by line, so term.offset must be corrected by this value
28 | */
29 | int offset;
30 |
31 | Configuration configuration;
32 |
33 | public SegmentWrapper(Reader reader, Segment segment, Configuration configuration) {
34 | scanner = createScanner(reader);
35 | this.segment = segment;
36 | this.configuration = configuration;
37 | }
38 |
39 | public SegmentWrapper(Reader reader, Segment segment) {
40 | scanner = createScanner(reader);
41 | this.segment = segment;
42 | }
43 |
44 | /**
45 | * Reset the wrapper for a new reader
46 | *
47 | * @param reader the new input
48 | */
49 | public void reset(Reader reader) {
50 | scanner = createScanner(reader);
51 | termArray = null;
52 | index = 0;
53 | offset = 0;
54 | }
55 |
56 | public Term next() {
57 | if (termArray != null && index < termArray.length) {
58 | return termArray[index++];
59 | }
60 | if (!scanner.hasNextLine()) {
61 | return null;
62 | }
63 | String line = scanner.nextLine();
64 | while (isBlank(line)) {
65 | offset += line.length() + 1;
66 | if (scanner.hasNextLine()) {
67 | line = scanner.nextLine();
68 | } else {
69 | return null;
70 | }
71 | }
72 |
73 | final String lineNeedSeg = line;
74 | List<Term> termList = AccessController.doPrivileged((PrivilegedAction<List<Term>>) () -> {
75 | // char[] text = lineNeedSeg.toCharArray();
76 | if (configuration != null && configuration.isEnableNormalization()) {
77 | // AccessController.doPrivileged((PrivilegedAction) () -> {
78 | // CharTable.normalization(text);
79 | // return null;
80 | // });
81 | }
82 | return segment.seg(lineNeedSeg);
83 | });
84 |
85 | if (termList.size() == 0) {
86 | return null;
87 | }
88 | termArray = termList.toArray(new Term[0]);
89 |
90 | for (Term term : termArray) {
91 | term.offset = term.offset + offset;
92 | }
93 | if (scanner.hasNextLine()) {
94 | offset += line.length() + 1;
95 | } else {
96 | offset += line.length();
97 | }
98 | index = 0;
99 | return termArray[index++];
100 | }
101 |
102 | /**
103 | * Check whether a string is blank (null or whitespace only)
104 | *
105 | * @param cs the string
106 | * @return true if blank
107 | */
108 | private static boolean isBlank(CharSequence cs) {
109 | int strLen;
110 | if (cs == null || (strLen = cs.length()) == 0) {
111 | return true;
112 | }
113 | for (int i = 0; i < strLen; i++) {
114 | if (!Character.isWhitespace(cs.charAt(i))) {
115 | return false;
116 | }
117 | }
118 | return true;
119 | }
120 |
121 | private static Scanner createScanner(Reader reader) {
122 | return new Scanner(reader).useDelimiter("\n");
123 | }
124 | }
125 |
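SegmentWrapper pulls the reader line by line, caches each line's Term[], and shifts every term.offset by the running line offset so offsets are document-global. A sketch of draining it; TexSmart.newSegment() is the factory used elsewhere in this plugin, and the native engine and its data files must be available for this to run:

    import com.texsmart.TexSmart;
    import com.texsmart.lucene.SegmentWrapper;
    import tencent.ai.texsmart.NluOutput.Term;

    import java.io.StringReader;

    public class SegmentWrapperSketch {
        public static void main(String[] args) {
            SegmentWrapper wrapper = new SegmentWrapper(
                    new StringReader("上海 自来水\n来自 海上"), TexSmart.newSegment());
            for (Term term = wrapper.next(); term != null; term = wrapper.next()) {
                // offset is already corrected to be document-global, not line-local
                System.out.println(term.str + "\t" + term.tag + "\t" + term.offset);
            }
        }
    }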
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TexSmartAnalyzer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.cfg.Configuration;
5 | import org.apache.lucene.analysis.Analyzer;
6 |
7 | /**
8 | * @project: elasticsearch-analysis-texsmart
9 | * @description: Default analyzer
10 | * @author: wei_liu
11 | * @create: 2020-09-09 15:10
12 | */
13 | public class TexSmartAnalyzer extends Analyzer {
14 | /**
15 | * Analysis configuration
16 | */
17 | private Configuration configuration;
18 |
19 | public TexSmartAnalyzer(Configuration configuration) {
20 | this.configuration = configuration;
21 | }
22 |
23 | public TexSmartAnalyzer() {
24 | super();
25 | }
26 |
27 | @Override
28 | protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
29 | return new Analyzer.TokenStreamComponents(
30 | TokenizerBuilder.tokenizer(TexSmart.newSegment(), configuration));
31 | }
32 | }
33 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TexSmartIndexAnalyzer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.cfg.Configuration;
5 | import org.apache.lucene.analysis.Analyzer;
6 |
7 | public class TexSmartIndexAnalyzer extends Analyzer {
8 | /**
9 | * Analysis configuration
10 | */
11 | private Configuration configuration;
12 |
13 | public TexSmartIndexAnalyzer(Configuration configuration) {
14 | this.configuration = configuration;
15 | this.configuration.enableIndexMode(true);
16 | }
17 |
18 | public TexSmartIndexAnalyzer() {
19 | super();
20 | }
21 |
22 | @Override
23 | protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
24 | return new Analyzer.TokenStreamComponents(
25 | TokenizerBuilder.tokenizer(TexSmart.newSegment().enableIndexMode(true), configuration));
26 | }
27 | }
28 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TexSmartStandardAnalyzer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.cfg.Configuration;
5 | import org.apache.lucene.analysis.Analyzer;
6 |
7 | /**
8 | * @project: elasticsearch-analysis-texsmart
9 | * @description: Basic Chinese analyzer
10 | * @author: wei_liu
11 | * @create: 2020-09-09 15:10
12 | */
13 | public class TexSmartStandardAnalyzer extends Analyzer {
14 | /**
15 | * Analysis configuration
16 | */
17 | private Configuration configuration;
18 |
19 | public TexSmartStandardAnalyzer(Configuration configuration) {
20 | this.configuration = configuration;
21 | }
22 |
23 | public TexSmartStandardAnalyzer() {
24 | super();
25 | }
26 |
27 | @Override
28 | protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
29 | return new Analyzer.TokenStreamComponents(
30 | TokenizerBuilder.tokenizer(TexSmart.newSegment(), configuration));
31 | }
32 | }
33 |
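All three classes plug the same TexSmart tokenizer into Lucene's Analyzer contract, differing only in how the segment is configured. A sketch of consuming any of them through the standard TokenStream loop; note that the no-arg constructors leave configuration null, so a real call site passes a Configuration built from the plugin's Environment and Settings:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class AnalyzerSketch {
        // e.g. printTokens(new TexSmartAnalyzer(configuration), "text to analyze")
        static void printTokens(Analyzer analyzer, String text) throws Exception {
            try (TokenStream ts = analyzer.tokenStream("content", text)) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                TypeAttribute type = ts.addAttribute(TypeAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(term.toString() + "/" + type.type());
                }
                ts.end();
            }
        }
    }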
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TexSmartTokenizer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.dic.stopword.FilterStopWord;
4 | import com.texsmart.seg.Segment;
5 | import com.texsmart.utility.TextUtility;
6 | import tencent.ai.texsmart.NluOutput.Term;
7 | import com.texsmart.cfg.Configuration;
8 | import org.apache.lucene.analysis.Tokenizer;
9 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
10 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
11 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
13 |
14 | import java.io.BufferedReader;
15 | import java.io.IOException;
16 |
17 |
18 | public class TexSmartTokenizer extends Tokenizer {
19 | /**
20 | * Current term
21 | */
22 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
23 | /**
24 | * Character offsets
25 | */
26 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
27 | /**
28 | * Position increment
29 | */
30 | private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
31 | /**
32 | * POS tag
33 | */
34 | private TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
35 | /**
36 | * Configuration
37 | */
38 | private Configuration configuration;
39 | /**
40 | * Segmenter
41 | */
42 | private SegmentWrapper segment;
43 | /**
44 | * Porter stemmer for English tokens
45 | */
46 | private final PorterStemmer stemmer = new PorterStemmer();
47 | /**
48 | * Running offset within the current document; not cleared on reset() (switching values of a multi-value field), cleared on end() (switching fields)
49 | */
50 | private int totalOffset = 0;
51 |
52 | /**
53 | * @param segment one of the TexSmart segmenters
54 | * @param configuration analysis configuration
55 | */
56 | public TexSmartTokenizer(Segment segment, Configuration configuration) {
57 | this.configuration = configuration;
58 | this.segment = new SegmentWrapper(this.input, segment, configuration);
59 | }
60 |
61 | @Override
62 | final public boolean incrementToken() throws IOException {
63 | clearAttributes();
64 | int position = 0;
65 | Term term;
66 | boolean unIncreased = true;
67 | do {
68 | term = segment.next();
69 | if (term == null) {
70 | totalOffset += segment.offset;
71 | return false;
72 | }
73 | if (TextUtility.isBlank(term.str)) {
74 | totalOffset += term.length();
75 | continue;
76 | }
77 | if (configuration.isEnablePorterStemming() && "nx".equals(term.tag)) {
78 | term.str = stemmer.stem(term.str);
79 | }
80 |
81 | final Term copyTerm = term;
82 | if (!this.configuration.isEnableStopDictionary() || !FilterStopWord.beRemove(copyTerm)) {
83 | position++;
84 | unIncreased = false;
85 | } else {
86 | totalOffset += term.length();
87 | }
88 | }
89 | while (unIncreased);
90 |
91 | positionAttr.setPositionIncrement(position);
92 | termAtt.setEmpty().append(term.str);
93 | offsetAtt.setOffset(correctOffset(term.offset), correctOffset(term.offset + term.str.length()));
94 | typeAtt.setType(term.tag == null ? "null" : term.tag);
95 | totalOffset += term.length();
96 | return true;
97 | }
98 |
99 | @Override
100 | public void end() throws IOException {
101 | super.end();
102 | offsetAtt.setOffset(totalOffset, totalOffset);
103 | totalOffset = 0;
104 | }
105 |
106 | /**
107 | * This override is required; without it, bulk indexing of files fails
108 | */
109 | @Override
110 | public void reset() throws IOException {
111 | super.reset();
112 | segment.reset(new BufferedReader(this.input));
113 | }
114 | }
115 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TokenizerBuilder.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.cfg.Configuration;
4 | import com.texsmart.seg.Segment;
5 | import org.apache.lucene.analysis.Tokenizer;
6 |
7 | /**
8 | * @project: elasticsearch-analysis-texsmart
9 | * @description: Tokenizer builder
10 | * @author: wei_liu
11 | * @create: 2020-09-09 09:47
12 | */
13 | public class TokenizerBuilder {
14 |
15 | /**
16 | * Build a Tokenizer
17 | *
18 | * @param segment the original segment
19 | * @param configuration configuration
20 | * @return the tokenizer
21 | */
22 | public static Tokenizer tokenizer(Segment segment, Configuration configuration) {
23 | Segment seg = segment(segment, configuration);
24 | return new TexSmartTokenizer(seg, configuration);
25 | }
26 |
27 | /**
28 | * Configure the segment according to the configuration
29 | *
30 | * @param segment the original segment
31 | * @param configuration configuration
32 | * @return the configured segment
33 | */
34 | private static Segment segment(Segment segment, Configuration configuration) {
35 | if (!configuration.isEnableCustomConfig()) {
36 | return segment.enableOffset(true);
37 | }
38 | segment.enableIndexMode(configuration.isEnableIndexMode())
39 | .enableOffset(configuration.isEnableOffset())
40 | .enableStopDictionary(configuration.isEnableStopDictionary())
41 | .setPosAlgType(configuration.getEnablePosAlg())
42 | .setNerAlgType(configuration.getEnableNerAlg());
43 | return segment;
44 | }
45 | }
46 |
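TokenizerBuilder maps Configuration flags onto the fluent Segment API defined below (see Segment.java after this sketch). Wired by hand, the equivalent of an index-mode configuration looks like:

    import com.texsmart.TexSmart;
    import com.texsmart.seg.Segment;
    import org.elasticsearch.index.analysis.NerAlgType;
    import org.elasticsearch.index.analysis.PosAlgType;

    public class SegmentConfigSketch {
        public static Segment indexSegment() {
            return TexSmart.newSegment()
                    .enableIndexMode(true)          // emit fine-grained words for indexing
                    .enableOffset(true)             // keep character offsets
                    .enableStopDictionary(false)    // leave stop-word filtering to the tokenizer
                    .setPosAlgType(PosAlgType.LOG_LINEAR)
                    .setNerAlgType(NerAlgType.CRF);
        }
    }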
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/seg/Config.java: --------------------------------------------------------------------------------
1 | package com.texsmart.seg;
2 |
3 | import org.elasticsearch.index.analysis.NerAlgType;
4 | import org.elasticsearch.index.analysis.PosAlgType;
5 |
6 | public class Config {
7 | public int indexMode = 0;
8 | public boolean useCustomDictionary = true;
9 | public boolean forceEntName = true;
10 | public boolean ner = true;
11 | public boolean offset = false;
12 | public boolean enableStopDictionary = false;
13 | public PosAlgType posAlgType = PosAlgType.LOG_LINEAR;
14 | public NerAlgType nerAlgType = NerAlgType.CRF;
15 |
16 | public Config() {
17 | }
18 |
19 | public boolean isIndexMode() { return this.indexMode > 0; }
20 |
21 | public String getPosAlgType() {
22 | return this.posAlgType.getAlg();
23 | }
24 |
25 | public String getNerAlgType() {
26 | return this.nerAlgType.getAlg();
27 | }
28 | }
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/seg/Segment.java: --------------------------------------------------------------------------------
1 | package com.texsmart.seg;
2 |
3 | import org.elasticsearch.index.analysis.NerAlgType;
4 | import org.elasticsearch.index.analysis.PosAlgType;
5 | import tencent.ai.texsmart.NluOutput.Term;
6 |
7 | import java.util.List;
8 |
9 | public abstract class Segment {
10 | protected Config config = new Config();
11 |
12 | public Segment() {
13 | }
14 |
15 | public List<Term> seg(String text) {
16 | return segSentence(text);
17 | }
18 |
19 | protected abstract List<Term> segSentence(String text);
20 |
21 | public Segment enableOffset(boolean enable) {
22 | this.config.offset = enable;
23 | return this;
24 | }
25 |
26 | public Segment enableIndexMode(boolean enable) {
27 | this.config.indexMode = enable ? 2 : 0;
28 | return this;
29 | }
30 |
31 | public Segment enableIndexMode(int minimalLength) {
32 | if (minimalLength < 1) {
33 | throw new IllegalArgumentException("minimalLength must be at least 1");
34 | } else {
35 | this.config.indexMode = minimalLength;
36 | return this;
37 | }
38 | }
39 |
40 | public Segment enableStopDictionary(boolean enable) {
41 | this.config.enableStopDictionary = enable;
42 | return this;
43 | }
44 |
45 | public Segment setPosAlgType(PosAlgType posAlgType) {
46 | this.config.posAlgType = posAlgType;
47 | return this;
48 | }
49 |
50 | public Segment setNerAlgType(NerAlgType nerAlgType) {
51 | this.config.nerAlgType = nerAlgType;
52 | return this;
53 | }
54 | }
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/seg/TexSmartBasicSegment.java: --------------------------------------------------------------------------------
1 | package com.texsmart.seg;
2 |
3 | import com.texsmart.TexSmart;
4 | import tencent.ai.texsmart.NluOutput;
5 | import tencent.ai.texsmart.NluOutput.Term;
6 |
7 | import java.util.List;
8 |
9 | public class TexSmartBasicSegment extends Segment {
10 |
11 | private static String formatOptions = "{" +
12 | " \"input_spec\":{\"lang\":\"auto\"}," +
13 | " \"word_seg\":{\"enable\":true},\n" +
14 | " \"pos_tagging\":{\"enable\":true,\"alg\":\"%s\"}," +
15 | " \"ner\":{\"enable\":true,\"alg\":\"%s\",\"fine_grained\":false}," +
16 | " \"syntactic_parsing\":{\"enable\":false}," +
17 | " \"srl\":{\"enable\":false}" +
18 | " }";
19 |
20 | public TexSmartBasicSegment() {
21 | }
22 |
23 | @Override
24 | protected List<Term> segSentence(String text) {
25 | NluOutput output = TexSmart.TEX_ENGINE.parseText(text, String.format(
26 | formatOptions, config.getPosAlgType(), config.getNerAlgType()));
27 |
28 | if (null == output) return null;
29 | if (config.isIndexMode()) {
30 | return output.words();
31 | } else {
32 | return output.phrases();
33 | }
34 | }
35 | }
36 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/tokenizer/StandardTokenizer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.tokenizer;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.seg.Segment;
5 | import tencent.ai.texsmart.NluOutput.Term;
6 |
7 | import java.util.List;
8 |
9 | public class StandardTokenizer {
10 | public static final Segment SEGMENT = TexSmart.newSegment();
11 |
12 | public StandardTokenizer() {
13 | }
14 |
15 | public static List<Term> segment(String text) {
16 | return SEGMENT.seg(text);
17 | }
18 | }
19 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/utility/TextUtility.java: --------------------------------------------------------------------------------
1 | package com.texsmart.utility;
2 |
3 | //
4 | // Source code recreated from a .class file by IntelliJ IDEA
5 | // (powered by Fernflower decompiler)
6 | //
7 |
8 | import java.io.DataOutputStream;
9 | import java.io.IOException;
10 | import java.io.PrintWriter;
11 | import 
java.io.StringWriter; 12 | import java.io.UnsupportedEncodingException; 13 | import java.util.Collection; 14 | import java.util.Iterator; 15 | import java.util.List; 16 | 17 | public class TextUtility { 18 | public TextUtility() { 19 | } 20 | 21 | public static int charType(char c) { 22 | return charType(String.valueOf(c)); 23 | } 24 | 25 | public static int charType(String str) { 26 | if (str != null && str.length() > 0) { 27 | if ("零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟".contains(str)) { 28 | return 11; 29 | } 30 | 31 | byte[] b; 32 | try { 33 | b = str.getBytes("GBK"); 34 | } catch (UnsupportedEncodingException var6) { 35 | b = str.getBytes(); 36 | var6.printStackTrace(); 37 | } 38 | 39 | byte b1 = b[0]; 40 | byte b2 = b.length > 1 ? b[1] : 0; 41 | int ub1 = getUnsigned(b1); 42 | int ub2 = getUnsigned(b2); 43 | if (ub1 < 128) { 44 | if (ub1 <= 32) { 45 | return 17; 46 | } 47 | 48 | if ("*\"!,.?()[]{}+=/\\;:|".indexOf((char)b1) != -1) { 49 | return 6; 50 | } 51 | 52 | if ("0123456789".indexOf((char)b1) != -1) { 53 | return 9; 54 | } 55 | 56 | return 5; 57 | } 58 | 59 | if (ub1 == 162) { 60 | return 10; 61 | } 62 | 63 | if (ub1 == 163 && ub2 > 175 && ub2 < 186) { 64 | return 9; 65 | } 66 | 67 | if (ub1 == 163 && (ub2 >= 193 && ub2 <= 218 || ub2 >= 225 && ub2 <= 250)) { 68 | return 8; 69 | } 70 | 71 | if (ub1 == 161 || ub1 == 163) { 72 | return 6; 73 | } 74 | 75 | if (ub1 >= 176 && ub1 <= 247) { 76 | return 7; 77 | } 78 | } 79 | 80 | return 17; 81 | } 82 | 83 | public static boolean isAllChinese(String str) { 84 | return str.matches("[\\u4E00-\\u9FA5]+"); 85 | } 86 | 87 | public static boolean isAllNonChinese(byte[] sString) { 88 | int nLen = sString.length; 89 | int i = 0; 90 | 91 | while(i < nLen) { 92 | if (getUnsigned(sString[i]) < 248 && getUnsigned(sString[i]) > 175) { 93 | return false; 94 | } 95 | 96 | if (sString[i] < 0) { 97 | i += 2; 98 | } else { 99 | ++i; 100 | } 101 | } 102 | 103 | return true; 104 | } 105 | 106 | public static boolean isAllSingleByte(String str) { 107 | assert str != null; 108 | 109 | for(int i = 0; i < str.length(); ++i) { 110 | if (str.charAt(i) > 128) { 111 | return false; 112 | } 113 | } 114 | 115 | return true; 116 | } 117 | 118 | public static int cint(String str) { 119 | if (str != null) { 120 | try { 121 | int i = new Integer(str); 122 | return i; 123 | } catch (NumberFormatException var2) { 124 | } 125 | } 126 | 127 | return -1; 128 | } 129 | 130 | public static boolean isAllNum(String str) { 131 | if (str == null) { 132 | return false; 133 | } else { 134 | int i = 0; 135 | if ("±+-+-—".indexOf(str.charAt(0)) != -1) { 136 | ++i; 137 | } 138 | 139 | while(i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) { 140 | ++i; 141 | } 142 | 143 | char ch; 144 | if (i > 0 && i < str.length()) { 145 | ch = str.charAt(i); 146 | if ("·∶:,,..//".indexOf(ch) != -1) { 147 | ++i; 148 | 149 | while(i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) { 150 | ++i; 151 | } 152 | } 153 | } 154 | 155 | if (i >= str.length()) { 156 | return true; 157 | } else { 158 | while(i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) { 159 | ++i; 160 | } 161 | 162 | if (i > 0 && i < str.length()) { 163 | ch = str.charAt(i); 164 | if (',' == ch || '.' 
== ch || '/' == ch || ':' == ch || "∶·,./".indexOf(ch) != -1) { 165 | ++i; 166 | 167 | while(i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) { 168 | ++i; 169 | } 170 | } 171 | } 172 | 173 | if (i < str.length() && "百千万亿佰仟%%‰".indexOf(str.charAt(i)) != -1) { 174 | ++i; 175 | } 176 | 177 | return i >= str.length(); 178 | } 179 | } 180 | } 181 | 182 | public static boolean isAllIndex(byte[] sString) { 183 | int nLen = sString.length; 184 | 185 | int i; 186 | for(i = 0; i < nLen - 1 && getUnsigned(sString[i]) == 162; i += 2) { 187 | } 188 | 189 | if (i >= nLen) { 190 | return true; 191 | } else { 192 | while(i < nLen && sString[i] > 64 && sString[i] < 91 || sString[i] > 96 && sString[i] < 123) { 193 | ++i; 194 | } 195 | 196 | return i >= nLen; 197 | } 198 | } 199 | 200 | public static boolean isAllLetter(String text) { 201 | for(int i = 0; i < text.length(); ++i) { 202 | char c = text.charAt(i); 203 | if ((c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) { 204 | return false; 205 | } 206 | } 207 | 208 | return true; 209 | } 210 | 211 | public static boolean isAllLetterOrNum(String text) { 212 | for(int i = 0; i < text.length(); ++i) { 213 | char c = text.charAt(i); 214 | if ((c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && (c < '0' || c > '9')) { 215 | return false; 216 | } 217 | } 218 | 219 | return true; 220 | } 221 | 222 | public static boolean isAllDelimiter(byte[] sString) { 223 | int nLen = sString.length; 224 | 225 | int i; 226 | for(i = 0; i < nLen - 1 && (getUnsigned(sString[i]) == 161 || getUnsigned(sString[i]) == 163); i += 2) { 227 | } 228 | 229 | return i >= nLen; 230 | } 231 | 232 | public static boolean isAllChineseNum(String word) { 233 | String chineseNum = "零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点"; 234 | String prefix = "几数上第"; 235 | String surfix = "几多余来成倍"; 236 | boolean round = false; 237 | if (word == null) { 238 | return false; 239 | } else { 240 | char[] temp = word.toCharArray(); 241 | 242 | for(int i = 0; i < temp.length; ++i) { 243 | if (word.startsWith("分之", i)) { 244 | ++i; 245 | } else { 246 | char tchar = temp[i]; 247 | if (i == 0 && prefix.indexOf(tchar) != -1) { 248 | round = true; 249 | } else if (i == temp.length - 1 && !round && surfix.indexOf(tchar) != -1) { 250 | round = true; 251 | } else if (chineseNum.indexOf(tchar) == -1) { 252 | return false; 253 | } 254 | } 255 | } 256 | 257 | return true; 258 | } 259 | } 260 | 261 | public static int getCharCount(String charSet, String word) { 262 | int nCount = 0; 263 | if (word != null) { 264 | String temp = word + " "; 265 | 266 | for(int i = 0; i < word.length(); ++i) { 267 | String s = temp.substring(i, i + 1); 268 | if (charSet.indexOf(s) != -1) { 269 | ++nCount; 270 | } 271 | } 272 | } 273 | 274 | return nCount; 275 | } 276 | 277 | public static int getUnsigned(byte b) { 278 | return b > 0 ? 
b : b & 255; 279 | } 280 | 281 | public static boolean isYearTime(String snum) { 282 | if (snum != null) { 283 | int len = snum.length(); 284 | String first = snum.substring(0, 1); 285 | if (isAllSingleByte(snum) && (len == 4 || len == 2 && (cint(first) > 4 || cint(first) == 0))) { 286 | return true; 287 | } 288 | 289 | if (isAllNum(snum) && (len >= 3 || len == 2 && "056789".indexOf(first) != -1)) { 290 | return true; 291 | } 292 | 293 | if (getCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", snum) == len && len >= 2) { 294 | return true; 295 | } 296 | 297 | if (len == 4 && getCharCount("千仟零○", snum) == 2) { 298 | return true; 299 | } 300 | 301 | if (len == 1 && getCharCount("千仟", snum) == 1) { 302 | return true; 303 | } 304 | 305 | if (len == 2 && getCharCount("甲乙丙丁戊己庚辛壬癸", snum) == 1 && getCharCount("子丑寅卯辰巳午未申酉戌亥", snum.substring(1)) == 1) { 306 | return true; 307 | } 308 | } 309 | 310 | return false; 311 | } 312 | 313 | public static boolean isInAggregate(String aggr, String str) { 314 | if (aggr != null && str != null) { 315 | str = str + "1"; 316 | 317 | for(int i = 0; i < str.length(); ++i) { 318 | String s = str.substring(i, i + 1); 319 | if (aggr.indexOf(s) == -1) { 320 | return false; 321 | } 322 | } 323 | 324 | return true; 325 | } else { 326 | return false; 327 | } 328 | } 329 | 330 | public static boolean isDBCCase(String str) { 331 | if (str == null) { 332 | return false; 333 | } else { 334 | str = str + " "; 335 | 336 | for(int i = 0; i < str.length(); ++i) { 337 | String s = str.substring(i, i + 1); 338 | boolean var3 = false; 339 | 340 | int length; 341 | try { 342 | length = s.getBytes("GBK").length; 343 | } catch (UnsupportedEncodingException var5) { 344 | var5.printStackTrace(); 345 | length = s.getBytes().length; 346 | } 347 | 348 | if (length != 1) { 349 | return false; 350 | } 351 | } 352 | 353 | return true; 354 | } 355 | } 356 | 357 | public static boolean isSBCCase(String str) { 358 | if (str == null) { 359 | return false; 360 | } else { 361 | str = str + " "; 362 | 363 | for(int i = 0; i < str.length(); ++i) { 364 | String s = str.substring(i, i + 1); 365 | boolean var3 = false; 366 | 367 | int length; 368 | try { 369 | length = s.getBytes("GBK").length; 370 | } catch (UnsupportedEncodingException var5) { 371 | var5.printStackTrace(); 372 | length = s.getBytes().length; 373 | } 374 | 375 | if (length != 2) { 376 | return false; 377 | } 378 | } 379 | 380 | return true; 381 | } 382 | } 383 | 384 | public static boolean isDelimiter(String str) { 385 | return str != null && ("-".equals(str) || "-".equals(str)); 386 | } 387 | 388 | public static boolean isUnknownWord(String word) { 389 | return word != null && word.indexOf("未##") == 0; 390 | } 391 | 392 | public static double nonZero(double frequency) { 393 | return frequency == 0.0D ? 
0.001D : frequency; 394 | } 395 | 396 | public static char[] long2char(long x) { 397 | char[] c = new char[]{(char)((int)(x >> 48)), (char)((int)(x >> 32)), (char)((int)(x >> 16)), (char)((int)x)}; 398 | return c; 399 | } 400 | 401 | public static String long2String(long x) { 402 | char[] cArray = long2char(x); 403 | StringBuilder sbResult = new StringBuilder(cArray.length); 404 | char[] var4 = cArray; 405 | int var5 = cArray.length; 406 | 407 | for(int var6 = 0; var6 < var5; ++var6) { 408 | char c = var4[var6]; 409 | sbResult.append(c); 410 | } 411 | 412 | return sbResult.toString(); 413 | } 414 | 415 | public static String exceptionToString(Exception e) { 416 | StringWriter sw = new StringWriter(); 417 | PrintWriter pw = new PrintWriter(sw); 418 | e.printStackTrace(pw); 419 | return sw.toString(); 420 | } 421 | 422 | public static boolean isChinese(char c) { 423 | String regex = "[\\u4e00-\\u9fa5]"; 424 | return String.valueOf(c).matches(regex); 425 | } 426 | 427 | public static int count(String keyword, String srcText) { 428 | int count = 0; 429 | int leng = srcText.length(); 430 | int j = 0; 431 | 432 | for(int i = 0; i < leng; ++i) { 433 | if (srcText.charAt(i) == keyword.charAt(j)) { 434 | ++j; 435 | if (j == keyword.length()) { 436 | ++count; 437 | j = 0; 438 | } 439 | } else { 440 | i -= j; 441 | j = 0; 442 | } 443 | } 444 | 445 | return count; 446 | } 447 | 448 | public static void writeString(String s, DataOutputStream out) throws IOException { 449 | out.writeInt(s.length()); 450 | char[] var2 = s.toCharArray(); 451 | int var3 = var2.length; 452 | 453 | for(int var4 = 0; var4 < var3; ++var4) { 454 | char c = var2[var4]; 455 | out.writeChar(c); 456 | } 457 | 458 | } 459 | 460 | public static boolean isBlank(CharSequence cs) { 461 | int strLen; 462 | if (cs != null && (strLen = cs.length()) != 0) { 463 | for(int i = 0; i < strLen; ++i) { 464 | if (!Character.isWhitespace(cs.charAt(i))) { 465 | return false; 466 | } 467 | } 468 | 469 | return true; 470 | } else { 471 | return true; 472 | } 473 | } 474 | 475 | public static String join(String delimiter, Collection stringCollection) { 476 | StringBuilder sb = new StringBuilder(stringCollection.size() * (16 + delimiter.length())); 477 | Iterator var3 = stringCollection.iterator(); 478 | 479 | while(var3.hasNext()) { 480 | String str = (String)var3.next(); 481 | sb.append(str).append(delimiter); 482 | } 483 | 484 | return sb.toString(); 485 | } 486 | 487 | public static String combine(String... 
termArray) {
488 | StringBuilder sbSentence = new StringBuilder();
489 | for (String word : termArray) {
490 | sbSentence.append(word);
491 | }
492 | return sbSentence.toString();
493 | }
494 |
495 | public static String join(Iterable<? extends CharSequence> s, String delimiter) {
496 | Iterator<? extends CharSequence> iter = s.iterator();
497 | if (!iter.hasNext()) {
498 | return "";
499 | }
500 | StringBuilder buffer = new StringBuilder(iter.next());
501 | while (iter.hasNext()) {
502 | buffer.append(delimiter).append(iter.next());
503 | }
504 | return buffer.toString();
505 | }
506 | }
507 |
-------------------------------------------------------------------------------- /src/main/java/es-plugin.properties: --------------------------------------------------------------------------------
1 | plugin=org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin
-------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/NerAlgType.java: --------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | public enum NerAlgType {
4 | CRF("crf"), DNN("dnn");
5 |
6 | private String alg;
7 |
8 | NerAlgType(String alg) {
9 | this.alg = alg;
10 | }
11 |
12 | public String getAlg() {
13 | return alg;
14 | }
15 | }
16 |
-------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PosAlgType.java: --------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | public enum PosAlgType {
4 | CRF("crf"), DNN("dnn"), LOG_LINEAR("log_linear");
5 |
6 | private String alg;
7 |
8 | PosAlgType(String alg) {
9 | this.alg = alg;
10 | }
11 |
12 | public String getAlg() {
13 | return alg;
14 | }
15 | }
16 |
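These enum values end up as the "alg" fields of TexSmart's JSON options; the template lives in TexSmartBasicSegment above. Rendering it by hand, with a trimmed-down options string for illustration:

    import org.elasticsearch.index.analysis.NerAlgType;
    import org.elasticsearch.index.analysis.PosAlgType;

    public class OptionsSketch {
        public static void main(String[] args) {
            String options = String.format(
                    "{\"pos_tagging\":{\"enable\":true,\"alg\":\"%s\"},"
                            + "\"ner\":{\"enable\":true,\"alg\":\"%s\"}}",
                    PosAlgType.LOG_LINEAR.getAlg(), NerAlgType.CRF.getAlg());
            System.out.println(options); // alg strings are passed verbatim to the engine
        }
    }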
TexSmartAnalyzerProvider getTexSmartAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
40 | return new TexSmartAnalyzerProvider(indexSettings, env, name, settings, TexSmartType.TEXSMART);
41 | }
42 |
43 | public static TexSmartAnalyzerProvider getTexSmartStandardAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
44 | return new TexSmartAnalyzerProvider(indexSettings, env, name, settings, TexSmartType.STANDARD);
45 | }
46 |
47 | public static TexSmartAnalyzerProvider getTexSmartIndexAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
48 | return new TexSmartAnalyzerProvider(indexSettings, env, name, settings, TexSmartType.SINGLE);
49 | }
50 |
51 | @Override
52 | public Analyzer get() {
53 | return this.analyzer;
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/TexSmartTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.cfg.Configuration;
5 | import com.texsmart.lucene.TokenizerBuilder;
6 | import org.apache.lucene.analysis.Tokenizer;
7 | import org.elasticsearch.common.settings.Settings;
8 | import org.elasticsearch.env.Environment;
9 | import org.elasticsearch.index.IndexSettings;
10 |
11 | /**
12 | * @project: elasticsearch-analysis-texsmart
13 | * @description: TexSmart tokenizer factory
14 | * @author: wei_liu
15 | * @create: 2020-09-09 15:10
16 | */
17 | public class TexSmartTokenizerFactory extends AbstractTokenizerFactory {
18 | /**
19 | * Tokenizer type
20 | */
21 | private TexSmartType texSmartType;
22 | /**
23 | * Tokenizer configuration
24 | */
25 | private Configuration configuration;
26 |
27 | public TexSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings, TexSmartType texSmartType) {
28 | super(indexSettings, settings, name);
29 | this.texSmartType = texSmartType;
30 | this.configuration = new Configuration(env, settings);
31 | }
32 |
33 | public static TexSmartTokenizerFactory getTexSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
34 | return new TexSmartTokenizerFactory(indexSettings, env, name, settings, TexSmartType.TEXSMART);
35 | }
36 |
37 | public static TexSmartTokenizerFactory getTexSmartStandardTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
38 | return new TexSmartTokenizerFactory(indexSettings, env, name, settings, TexSmartType.STANDARD);
39 | }
40 |
41 | public static TexSmartTokenizerFactory getTexSmartIndexTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
42 | return new TexSmartTokenizerFactory(indexSettings, env, name, settings, TexSmartType.SINGLE);
43 | }
44 |
45 | @Override
46 | public Tokenizer create() {
47 | switch (this.texSmartType) {
48 | case SINGLE:
49 | configuration.enableIndexMode(true);
50 | return TokenizerBuilder.tokenizer(TexSmart.newSegment().enableIndexMode(true), configuration);
51 | case STANDARD:
52 | default: // TEXSMART and STANDARD both use the default segment
53 | return TokenizerBuilder.tokenizer(TexSmart.newSegment(), configuration);
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/TexSmartType.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | /**
4 | * @project: elasticsearch-analysis-texsmart
5 | * @description: TexSmart tokenization types
6 | * @author: wei_liu
7 | * @create: 2020-09-09 15:10
8 | */
9 | public enum TexSmartType {
10 | /**
11 | * Default tokenization
12 | */
13 | TEXSMART,
14 | /**
15 | * Basic Chinese tokenization
16 | */
17 | STANDARD,
18 | /**
19 | * Single-word Chinese tokenization (index mode)
20 | */
21 | SINGLE
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/texsmart/AnalysisTexSmartPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis.texsmart;
2 |
3 | import org.apache.lucene.analysis.Analyzer;
4 | import org.elasticsearch.SpecialPermission;
5 | import org.elasticsearch.index.analysis.AnalyzerProvider;
6 | import org.elasticsearch.index.analysis.TexSmartAnalyzerProvider;
7 | import org.elasticsearch.index.analysis.TexSmartTokenizerFactory;
8 | import org.elasticsearch.index.analysis.TokenizerFactory;
9 | import org.elasticsearch.indices.analysis.AnalysisModule;
10 | import org.elasticsearch.plugins.AnalysisPlugin;
11 | import org.elasticsearch.plugins.Plugin;
12 |
13 | import java.security.AccessController;
14 | import java.security.PrivilegedAction;
15 | import java.util.HashMap;
16 | import java.util.Map;
17 |
18 | /**
19 | * @project: elasticsearch-analysis-texsmart
20 | * @description: TexSmart analysis plugin
21 | * @author: wei_liu
22 | * @create: 2018-12-14 15:10
23 | */
24 | public class AnalysisTexSmartPlugin extends Plugin implements AnalysisPlugin {
25 | public static String PLUGIN_NAME = "analysis-texsmart";
26 |
27 | static {
28 | SecurityManager sm = System.getSecurityManager();
29 | if (sm != null) {
30 | // unprivileged code such as scripts do not have SpecialPermission
31 | sm.checkPermission(new SpecialPermission());
32 | }
33 | }
34 |
35 | @Override
36 | public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
37 | return AccessController.doPrivileged((PrivilegedAction<Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>>>) () -> {
38 | Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
39 |
40 | extra.put("texsmart", TexSmartTokenizerFactory::getTexSmartTokenizerFactory);
41 | extra.put("texsmart_standard", TexSmartTokenizerFactory::getTexSmartStandardTokenizerFactory);
42 | extra.put("texsmart_index", TexSmartTokenizerFactory::getTexSmartIndexTokenizerFactory);
43 |
44 | return extra;
45 | });
46 | }
47 |
48 | @Override
49 | public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
50 | return AccessController.doPrivileged((PrivilegedAction<Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>>>) () -> {
51 | Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
52 |
53 | extra.put("texsmart", TexSmartAnalyzerProvider::getTexSmartAnalyzerProvider);
54 | extra.put("texsmart_standard", TexSmartAnalyzerProvider::getTexSmartStandardAnalyzerProvider);
55 | extra.put("texsmart_index", TexSmartAnalyzerProvider::getTexSmartIndexAnalyzerProvider);
56 |
57 | return extra;
58 | });
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/java/tencent/ai/texsmart/CLib.java:
--------------------------------------------------------------------------------
1 | package tencent.ai.texsmart;
2 |
3 | import com.sun.jna.Library;
4 | import com.sun.jna.Native;
5 | import com.sun.jna.Pointer;
6 | import com.sun.jna.WString;
7 | /** JNA bindings for the native tencent_ai_texsmart library bundled under lib/. */
8 | public interface CLib extends Library {
9 | CLib INSTANCE = Native.loadLibrary("tencent_ai_texsmart", CLib.class);
10 |
11 | Pointer Nlu_CreateEngine(String data_dir, int worker_count);
12 | void 
Nlu_DestroyEngine(Pointer engine);
13 |
14 | Pointer Nlu_ParseTextExt(Pointer engine, WString text, int text_len, WString options);
15 | Pointer Nlu_ParseUtf8TextExt(Pointer engine, String text, int text_len, String options);
16 | void Nlu_DestroyOutput(Pointer result);
17 |
18 | Pointer Nlu_GetNormText(Pointer result, Pointer len);
19 |
20 | int Nlu_GetWordCount(Pointer result);
21 | Pointer Nlu_GetWord(Pointer result, int idx);
22 | int Nlu_GetPhraseCount(Pointer result);
23 | Pointer Nlu_GetPhrase(Pointer result, int idx);
24 | Pointer Nlu_TermStr(Pointer term);
25 | int Nlu_TermOffset(Pointer term);
26 | int Nlu_TermLen(Pointer term);
27 | Pointer Nlu_TermTag(Pointer term);
28 |
29 | int Nlu_GetEntityCount(Pointer result);
30 | Pointer Nlu_GetEntity(Pointer result, int idx);
31 | Pointer Nlu_EntityStr(Pointer entity);
32 | int Nlu_EntityOffset(Pointer entity);
33 | int Nlu_EntityLen(Pointer entity);
34 | Pointer Nlu_EntityType(Pointer entity);
35 | Pointer Nlu_EntityMeaning(Pointer entity);
36 | Pointer Nlu_EntityTypeName(Pointer entityType);
37 | Pointer Nlu_EntityTypeI18n(Pointer entityType);
38 | int Nlu_EntityTypeFlag(Pointer entityType);
39 | Pointer Nlu_EntityTypePath(Pointer entityType);
40 | }
41 |
42 |
--------------------------------------------------------------------------------
/src/main/java/tencent/ai/texsmart/NluEngine.java:
--------------------------------------------------------------------------------
1 | package tencent.ai.texsmart;
2 |
3 | import com.sun.jna.Pointer;
4 | import com.sun.jna.WString;
5 |
6 | /** Thin Java wrapper around the native TexSmart NLU engine. */
7 | public class NluEngine {
8 |
9 | /** Loads the engine from dataDir with workerCount worker threads; returns false on failure. */
10 | public boolean init(String dataDir, int workerCount) {
11 | enginePointer = CLib.INSTANCE.Nlu_CreateEngine(dataDir, workerCount);
12 | return enginePointer != null;
13 | }
14 |
15 | /**
16 | * Analyze text and get parsing results (word segmentation, POS tagging, NER, semantic expansion, etc.)
17 | * @param text: The input natural language text
18 | * @return Parsing results
19 | */
20 | public NluOutput parseText(String text) {
21 | WString options = null; // no extra parsing options
22 | WString text_wstr = new WString(text);
23 | Pointer result = CLib.INSTANCE.Nlu_ParseTextExt(enginePointer, text_wstr, text_wstr.length(), options);
24 | NluOutput output = new NluOutput();
25 | output.dataPointer = result;
26 | return output;
27 | }
28 |
29 | /**
30 | * Analyze text and get parsing results (word segmentation, POS tagging, NER, semantic expansion, etc.)
31 | * @param text: The input natural language text
32 | * @param options: Parsing options, in JSON format
33 | * @return Parsing results
34 | */
35 | public NluOutput parseText(String text, String options) {
36 | WString text_wstr = new WString(text);
37 | WString options_wstr = new WString(options);
38 | Pointer result = CLib.INSTANCE.Nlu_ParseTextExt(enginePointer, text_wstr, text_wstr.length(), options_wstr);
39 | NluOutput output = new NluOutput();
40 | output.dataPointer = result;
41 | return output;
42 | }
43 |
44 | // Best-effort release of the native engine when this wrapper is garbage-collected.
45 | protected void finalize() {
46 | if(enginePointer != null) {
47 | CLib.INSTANCE.Nlu_DestroyEngine(enginePointer);
48 | enginePointer = null;
49 | }
50 | }
51 |
52 | protected Pointer enginePointer = null;
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/java/tencent/ai/texsmart/NluOutput.java:
--------------------------------------------------------------------------------
1 | package tencent.ai.texsmart;
2 |
3 | import com.sun.jna.Pointer;
4 |
5 | import java.util.ArrayList;
6 |
7 | public class NluOutput {
8 |
9 | /** A word or a phrase */
10 | public class Term {
11 | public String str;
12 | public int offset = 0;
13 | public int len = 0;
14 | public String tag;
15 |
16 | public String toString() {
17 | return "str:" + str + " offset:" + offset + " len:" + len + " tag:" + tag;
18 | }
19 |
20 | public int length() { return this.str.length(); }
21 |
22 | /** Two terms are considered equal when they carry the same tag. */
23 | public boolean equals(Object obj) {
24 | if (obj instanceof Term) {
25 | Term term = (Term)obj;
26 | return this.tag != null && this.tag.equals(term.tag);
27 | }
28 | return super.equals(obj);
29 | }
30 | }
31 |
32 | /** Entity type information */
33 | public class EntityType {
34 | public String name;
35 | public String i18n;
36 | public int flag = 0;
37 | public String path;
38 | }
39 |
40 | /** Entity information */
41 | public class Entity {
42 | public String str;
43 | public int offset = 0;
44 | public int len = 0;
45 | public EntityType type;
46 | public String meaning;
47 | }
48 |
49 | /**
50 | * Get the normalized text
51 | * @return The normalized text
52 | */
53 | public String normText() {
54 | Pointer len = null; // the length output parameter is not needed here
55 | Pointer ptr = CLib.INSTANCE.Nlu_GetNormText(dataPointer, len);
56 | return getWideStr(ptr);
57 | }
58 |
59 | /**
60 | * Get words from the parsing result.
61 | * @return A list of words
62 | */
63 | public ArrayList<Term> words() {
64 | ArrayList<Term> termList = new ArrayList<Term>();
65 | int count = CLib.INSTANCE.Nlu_GetWordCount(dataPointer);
66 | for(int idx = 0; idx < count; idx++) {
67 | Pointer termPtr = CLib.INSTANCE.Nlu_GetWord(dataPointer, idx);
68 | Term newTerm = new Term();
69 | newTerm.str = getWideStr(CLib.INSTANCE.Nlu_TermStr(termPtr));
70 | newTerm.offset = CLib.INSTANCE.Nlu_TermOffset(termPtr);
71 | newTerm.len = CLib.INSTANCE.Nlu_TermLen(termPtr);
72 | newTerm.tag = getWideStr(CLib.INSTANCE.Nlu_TermTag(termPtr));
73 | newTerm.tag = newTerm.tag.equals("") ? "NN" : newTerm.tag; // default POS tag
74 | termList.add(newTerm);
75 | }
76 | return termList;
77 | }
78 |
79 | /**
80 | * Get phrases from the parsing result.
81 | * @return A list of phrases
82 | */
83 | public ArrayList<Term> phrases() {
84 | ArrayList<Term> termList = new ArrayList<Term>();
85 | int count = CLib.INSTANCE.Nlu_GetPhraseCount(dataPointer);
86 | for(int idx = 0; idx < count; idx++) {
87 | Pointer termPtr = CLib.INSTANCE.Nlu_GetPhrase(dataPointer, idx);
88 | Term newTerm = new Term();
89 | newTerm.str = getWideStr(CLib.INSTANCE.Nlu_TermStr(termPtr));
90 | newTerm.offset = CLib.INSTANCE.Nlu_TermOffset(termPtr);
91 | newTerm.len = CLib.INSTANCE.Nlu_TermLen(termPtr);
92 | newTerm.tag = getWideStr(CLib.INSTANCE.Nlu_TermTag(termPtr));
93 | newTerm.tag = newTerm.tag.equals("") ? "NN" : newTerm.tag;
94 | termList.add(newTerm);
95 | }
96 | return termList;
97 | }
98 |
99 | /**
100 | * Get entities from the parsing result.
101 | * @return A list of entities
102 | */
103 | public ArrayList<Entity> entities() {
104 | ArrayList<Entity> entityList = new ArrayList<Entity>();
105 | int count = CLib.INSTANCE.Nlu_GetEntityCount(dataPointer);
106 | for(int idx = 0; idx < count; idx++) {
107 | Pointer entityPtr = CLib.INSTANCE.Nlu_GetEntity(dataPointer, idx);
108 | Entity newEntity = new Entity();
109 | newEntity.str = getWideStr(CLib.INSTANCE.Nlu_EntityStr(entityPtr));
110 | newEntity.offset = CLib.INSTANCE.Nlu_EntityOffset(entityPtr);
111 | newEntity.len = CLib.INSTANCE.Nlu_EntityLen(entityPtr);
112 | newEntity.type = new EntityType();
113 | Pointer typePtr = CLib.INSTANCE.Nlu_EntityType(entityPtr);
114 | newEntity.type.name = getWideStr(CLib.INSTANCE.Nlu_EntityTypeName(typePtr));
115 | newEntity.type.i18n = getWideStr(CLib.INSTANCE.Nlu_EntityTypeI18n(typePtr));
116 | newEntity.type.flag = CLib.INSTANCE.Nlu_EntityTypeFlag(typePtr);
117 | newEntity.type.path = getWideStr(CLib.INSTANCE.Nlu_EntityTypePath(typePtr));
118 | newEntity.meaning = getWideStr(CLib.INSTANCE.Nlu_EntityMeaning(entityPtr));
119 | entityList.add(newEntity);
120 | }
121 | return entityList;
122 | }
123 |
124 | /** Releases the underlying native parsing result. */
125 | public void close() {
126 | if(dataPointer != null) {
127 | CLib.INSTANCE.Nlu_DestroyOutput(dataPointer);
128 | dataPointer = null;
129 | }
130 | }
131 |
132 | // A utility function
133 | protected String getWideStr(Pointer strPtr) {
134 | if(strPtr == null) {
135 | return null;
136 | }
137 |
138 | return strPtr.getWideString(0);
139 | }
140 |
141 | protected void finalize() {
142 | close();
143 | }
144 |
145 | protected Pointer dataPointer;
146 | }
147 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | # Elasticsearch plugin descriptor file
2 | # This file must exist as 'plugin-descriptor.properties' at
3 | # the root directory of all plugins.
4 | #
5 | # A plugin can be 'site', 'jvm', or both.
6 | #
7 | ### example site plugin for "foo":
8 | #
9 | # foo.zip <-- zip file for the plugin, with this structure:
10 | #   _site/ <-- the contents that will be served
11 | #   plugin-descriptor.properties <-- example contents below:
12 | #
13 | # site=true
14 | # description=My cool plugin
15 | # version=1.0
16 | #
17 | ### example jvm plugin for "foo"
18 | #
19 | # foo.zip <-- zip file for the plugin, with this structure:
20 | #   <arbitrary name1>.jar <-- classes, resources, dependencies
21 | #   <arbitrary nameN>.jar <-- any number of jars
22 | #   plugin-descriptor.properties <-- example contents below:
23 | #
24 | # jvm=true
25 | # classname=foo.bar.BazPlugin
26 | # description=My cool plugin
27 | # version=2.0.0-rc1
28 | # elasticsearch.version=2.0
29 | # java.version=1.7
30 | #
31 | ### mandatory elements for all plugins:
32 | #
33 | # 'description': simple summary of the plugin
34 | description=${project.description}
35 | #
36 | # 'version': plugin's version
37 | version=${project.version}
38 | #
39 | # 'name': the plugin name
40 | name=${elasticsearch.plugin.name}
41 | #
42 | # 'classname': the name of the class to load, fully-qualified.
43 | classname=${elasticsearch.plugin.classname}
44 | #
45 | # 'java.version' version of java the code is built against
46 | # use the system property java.specification.version
47 | # version string must be a sequence of nonnegative decimal integers
48 | # separated by "."'s and may have leading zeros
49 | java.version=${maven.compiler.target}
50 | #
51 | # 'elasticsearch.version' version of elasticsearch compiled against
52 | # You will have to release a new version of the plugin for each new
53 | # elasticsearch release. This version is checked when the plugin
54 | # is loaded so Elasticsearch will refuse to start in the presence of
55 | # plugins with the incorrect elasticsearch.version.
56 | elasticsearch.version=${elasticsearch.version}
57 | #
--------------------------------------------------------------------------------
/src/main/resources/plugin-security.policy:
--------------------------------------------------------------------------------
1 | grant {
2 |     // texsmart data directories
3 |     permission java.io.FilePermission "<<ALL FILES>>", "read,write,delete";
4 |
5 |     // needed because of problems in unbound LDAP library
6 |     permission java.util.PropertyPermission "*", "read,write";
7 |
8 |     // classloader
9 |     permission java.lang.RuntimePermission "setContextClassLoader";
10 |     permission java.lang.RuntimePermission "getClassLoader";
11 |     permission java.lang.RuntimePermission "accessClassInPackage.jdk.internal.loader";
12 |
13 |     // socket
14 |     permission java.net.SocketPermission "*", "connect,resolve";
15 |
16 |     permission java.lang.reflect.ReflectPermission "suppressAccessChecks";
17 |     permission java.lang.reflect.ReflectPermission "newProxyInPackage.tencent.ai.texsmart";
18 | };
--------------------------------------------------------------------------------
/src/main/resources/texsmart-remote.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
3 | <properties>
4 |     <comment>TexSmart Analyzer extension configuration</comment>
5 |     <!-- Configure a remote extension dictionary here -->
6 |     <!-- <entry key="remote_ext_dict">words_location</entry> -->
7 |     <!-- Configure a remote extension stop-word dictionary here -->
8 |     <!-- <entry key="remote_ext_stopwords">stop_words_location</entry> -->
9 | </properties>
10 |
11 |
--------------------------------------------------------------------------------
/src/main/resources/texsmart.properties:
--------------------------------------------------------------------------------
1 | root=.
2 | CoreDictionaryPath=data/nlu/kb/
3 | CustomDictionaryPath=data/nlu/kb/customization/
4 |
5 | path=/etc/elasticsearch/texsmart/data/nlu/kb/
--------------------------------------------------------------------------------
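
For reference, a minimal usage sketch of the bundled JNA wrapper (NluEngine / NluOutput under src/main/java/tencent/ai/texsmart). The data directory path, class name, and sample sentence below are illustrative assumptions, not part of the repository; point the first argument of init() at whatever directory actually holds TexSmart's data/nlu/kb resources (cf. texsmart.properties above).

// TexSmartDemo.java -- illustrative sketch only; "./data/nlu/kb" is an assumed path.
import tencent.ai.texsmart.NluEngine;
import tencent.ai.texsmart.NluOutput;

public class TexSmartDemo {
    public static void main(String[] args) {
        NluEngine engine = new NluEngine();
        // One worker per CPU core; init() returns false if the native library
        // or the knowledge-base directory cannot be loaded.
        if (!engine.init("./data/nlu/kb", Runtime.getRuntime().availableProcessors())) {
            System.err.println("TexSmart engine initialization failed");
            return;
        }
        NluOutput output = engine.parseText("上个月他在深圳看了流浪地球。");
        for (NluOutput.Term word : output.words()) {
            System.out.println(word);                        // str/offset/len/tag
        }
        for (NluOutput.Entity entity : output.entities()) {
            System.out.println(entity.str + " -> " + entity.type.name);
        }
        output.close();                                      // frees the native result
    }
}

Inside Elasticsearch the engine is not driven directly like this: AnalysisTexSmartPlugin registers the "texsmart", "texsmart_standard", and "texsmart_index" analyzers and tokenizers, which reach the same native calls through the Lucene wrapper classes under com/texsmart/lucene.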