├── .gitignore
├── LICENSE
├── README.md
├── build.sh
├── config
│   ├── texsmart-remote.xml
│   └── texsmart.properties
├── lib
│   ├── .DS_Store
│   ├── jna.jar
│   ├── libtencent_ai_texsmart.so
│   ├── tencent.ai.texsmart.jar
│   ├── tencent_ai_texsmart.dll
│   ├── tencent_ai_texsmart.lib
│   ├── tencent_ai_texsmart.py
│   └── tencent_ai_texsmart.pyc
├── pom.xml
├── settings.xml
└── src
    └── main
        ├── assemblies
        │   └── plugin.xml
        ├── java
        │   ├── com
        │   │   └── texsmart
        │   │       ├── TexSmart.java
        │   │       ├── cfg
        │   │       │   └── Configuration.java
        │   │       ├── dic
        │   │       │   ├── Dictionary.java
        │   │       │   ├── DictionaryFile.java
        │   │       │   ├── ExtMonitor.java
        │   │       │   ├── RemoteMonitor.java
        │   │       │   ├── cache
        │   │       │   │   └── DictionaryFileCache.java
        │   │       │   ├── config
        │   │       │   │   ├── RemoteDictConfig.java
        │   │       │   │   └── TexSmartConfig.java
        │   │       │   └── stopword
        │   │       │       ├── Filter.java
        │   │       │       └── FilterStopWord.java
        │   │       ├── help
        │   │       │   ├── ESPluginLoggerFactory.java
        │   │       │   └── PrefixPluginLogger.java
        │   │       ├── lucene
        │   │       │   ├── PorterStemmer.java
        │   │       │   ├── SegmentWrapper.java
        │   │       │   ├── TexSmartAnalyzer.java
        │   │       │   ├── TexSmartIndexAnalyzer.java
        │   │       │   ├── TexSmartStandardAnalyzer.java
        │   │       │   ├── TexSmartTokenizer.java
        │   │       │   └── TokenizerBuilder.java
        │   │       ├── seg
        │   │       │   ├── Config.java
        │   │       │   ├── Segment.java
        │   │       │   └── TexSmartBasicSegment.java
        │   │       ├── tokenizer
        │   │       │   └── StandardTokenizer.java
        │   │       └── utility
        │   │           └── TextUtility.java
        │   ├── es-plugin.properties
        │   ├── org
        │   │   └── elasticsearch
        │   │       ├── index
        │   │       │   └── analysis
        │   │       │       ├── NerAlgType.java
        │   │       │       ├── PosAlgType.java
        │   │       │       ├── TexSmartAnalyzerProvider.java
        │   │       │       ├── TexSmartTokenizerFactory.java
        │   │       │       └── TexSmartType.java
        │   │       └── plugin
        │   │           └── analysis
        │   │               └── texsmart
        │   │                   └── AnalysisTexSmartPlugin.java
        │   └── tencent
        │       └── ai
        │           └── texsmart
        │               ├── CLib.java
        │               ├── NluEngine.java
        │               └── NluOutput.java
        └── resources
            ├── plugin-descriptor.properties
            ├── plugin-security.policy
            ├── texsmart-remote.xml
            └── texsmart.properties
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .DS_Store
3 | *.iws
4 | *.iml
5 | *.ipr
6 | target/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # elasticsearch-analysis-texsmart
2 | TexSmart Analyzer for ElasticSearch
3 |
4 | This analyzer is based on the [TexSmart Chinese NLP toolkit](https://ai.tencent.com/ailab/nlp/texsmart) from Tencent AI Lab and provides a Chinese analysis plugin for Elasticsearch.
5 |
6 | 🚩 Changelog:
7 | 1. Compatible with Elasticsearch 7.x and TexSmart 0.1.3
8 |
9 | ----------
10 |
11 | Version mapping
12 | ----------
13 |
14 | | Plugin version | Elastic version |
15 | | :------------- | :-------------- |
16 | | master | 7.x |
17 | | 7.6.2 | 7.6.2 |
18 |
19 |
20 | Installation
21 | ----------
22 |
23 | ### 1. Download and install the plugin release matching your Elasticsearch version
24 |
25 | Installation options:
26 | 
27 | Option 1
28 | 
29 | a. Download the matching release package; the latest release can be downloaded from GitHub (link: https://github.com/koios-sh/elasticsearch-analysis-texsmart/releases/download/v7.6.2/elasticsearch-analysis-texsmart-7.6.2.zip)
30 | 
31 | b. Install with the following command, where PATH is the absolute path of the plugin package:
32 |
33 | `./bin/elasticsearch-plugin install file://${PATH}`
34 |
35 | Option 2
36 | 
37 | a. Install directly with the elasticsearch-plugin script:
38 |
39 | `./bin/elasticsearch-plugin install https://github.com/koios-sh/elasticsearch-analysis-texsmart/releases/download/v7.6.2/elasticsearch-analysis-texsmart-7.6.2.zip`
40 |
41 | Option 3
42 | 
43 | a. Build the plugin: sh build.sh
44 | b. Install with the following command, where PATH is the absolute path of the plugin package:
45 |
46 | `./bin/elasticsearch-plugin install file://${PATH}`
47 |
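Whichever option you choose, the installation can be verified afterwards with the standard plugin command:

`./bin/elasticsearch-plugin list`
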
48 | ### 2. Install the data package
49 | 
50 | The release package does not include the TexSmart data package. To download the full data package, see [TexSmart Release](https://ai.tencent.com/ailab/nlp/texsmart/zh/download.html).
51 | 
52 | Data package directory: /etc/elasticsearch/texsmart/data
53 | The data path can be adjusted by changing the path value in config/texsmart.properties.
54 |
55 | ### 3. Install libtencent_ai_texsmart.so
56 |
57 | cp libtencent_ai_texsmart.so /usr/lib64 && chmod 777 /usr/lib64/libtencent_ai_texsmart.so
58 |
59 | **Note: the steps above must be performed on every node.**
60 |
61 | Provided segmentation modes
62 | ----------
63 |
64 | texsmart: default TexSmart segmentation
65 | 
66 | texsmart_standard: standard segmentation
67 | 
68 | texsmart_index: index-mode segmentation
69 |
70 | Example
71 | ----------
72 |
73 | ```text
74 | POST http://localhost:9200/test/_analyze
75 | {
76 | "text": "2020年,空调市场“冷风吹过”",
77 | "tokenizer": "texsmart_standard"
78 | }
79 | ```
80 |
81 | ```json
82 | {
83 | "tokens": [
84 | {
85 | "token": "2020",
86 | "start_offset": 0,
87 | "end_offset": 4,
88 | "type": "CD",
89 | "position": 0
90 | },
91 | {
92 | "token": "年",
93 | "start_offset": 4,
94 | "end_offset": 5,
95 | "type": "M",
96 | "position": 1
97 | },
98 | {
99 | "token": ",",
100 | "start_offset": 5,
101 | "end_offset": 6,
102 | "type": "PU",
103 | "position": 2
104 | },
105 | {
106 | "token": "空调",
107 | "start_offset": 6,
108 | "end_offset": 8,
109 | "type": "NN",
110 | "position": 3
111 | },
112 | {
113 | "token": "市场",
114 | "start_offset": 8,
115 | "end_offset": 10,
116 | "type": "NN",
117 | "position": 4
118 | },
119 | {
120 | "token": "“",
121 | "start_offset": 10,
122 | "end_offset": 11,
123 | "type": "PU",
124 | "position": 5
125 | },
126 | {
127 | "token": "冷风",
128 | "start_offset": 11,
129 | "end_offset": 13,
130 | "type": "NN",
131 | "position": 6
132 | },
133 | {
134 | "token": "吹过",
135 | "start_offset": 13,
136 | "end_offset": 15,
137 | "type": "VV",
138 | "position": 7
139 | },
140 | {
141 | "token": "”",
142 | "start_offset": 15,
143 | "end_offset": 16,
144 | "type": "PU",
145 | "position": 8
146 | }
147 | ]
148 | }
149 | ```
150 |
151 | - Make sure dictionary files are encoded in UTF-8
152 |
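A mapping sketch that combines these analyzers on a field (assuming the plugin registers analyzers under the same names as the tokenizers; the index and field names here are only examples):

```text
PUT demo
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "texsmart_index",
        "search_analyzer": "texsmart_standard"
      }
    }
  }
}
```
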
153 | Custom analyzer configuration
154 | ----------
155 |
156 | On top of its different segmentation modes, TexSmart provides a series of segmentation options, and this plugin exposes the corresponding settings. A custom tokenizer can be defined with the following configuration:
157 |
158 | | Config                 | Description                                            |
159 | | :--------------------- | :----------------------------------------------------- |
160 | | enable_index_mode      | whether to use index-mode segmentation                  |
161 | | enable_stop_dictionary | whether to enable the stop-word dictionary              |
162 | | enable_offset          | whether to compute offsets                              |
163 | | enable_pos_alg         | POS tagging algorithm (log_linear (default), crf, dnn)  |
164 | | enable_ner_alg         | NER algorithm (crf (default), dnn)                      |
165 |
166 | Note: to filter Chinese and English punctuation with the configuration above, enable_stop_dictionary must be set to true.
167 |
168 | For example:
169 | ```text
170 | PUT test
171 | {
172 | "settings": {
173 | "analysis": {
174 | "analyzer": {
175 | "my_texsmart_analyzer": {
176 | "tokenizer": "my_texsmart"
177 | }
178 | },
179 | "tokenizer": {
180 | "my_texsmart": {
181 | "type": "texsmart",
182 | "enable_stop_dictionary": true,
183 | "enable_pos_alg": "log_linear",
184 | "enable_ner_alg": "crf"
185 | }
186 | }
187 | }
188 | }
189 | }
190 | ```
191 |
192 | ```text
193 | POST test/_analyze
194 | {
195 | "text": "2020年,空调市场“冷风吹过”",
196 | "analyzer": "my_texsmart_analyzer"
197 | }
198 | ```
199 |
200 | Result:
201 | ```json
202 | {
203 | "tokens": [
204 | {
205 | "token": "2020",
206 | "start_offset": 0,
207 | "end_offset": 4,
208 | "type": "CD",
209 | "position": 0
210 | },
211 | {
212 | "token": "年",
213 | "start_offset": 4,
214 | "end_offset": 5,
215 | "type": "M",
216 | "position": 1
217 | },
218 | {
219 | "token": "空调",
220 | "start_offset": 6,
221 | "end_offset": 8,
222 | "type": "NN",
223 | "position": 2
224 | },
225 | {
226 | "token": "市场",
227 | "start_offset": 8,
228 | "end_offset": 10,
229 | "type": "NN",
230 | "position": 3
231 | },
232 | {
233 | "token": "冷风",
234 | "start_offset": 11,
235 | "end_offset": 13,
236 | "type": "NN",
237 | "position": 4
238 | },
239 | {
240 | "token": "吹过",
241 | "start_offset": 13,
242 | "end_offset": 15,
243 | "type": "VV",
244 | "position": 5
245 | }
246 | ]
247 | }
248 |
249 | ```
250 |
251 | # Notes
252 | 1. TexSmart does not yet officially support hot loading of custom words; support is reportedly planned for the next release.
253 | Following the analysis-hanlp plugin, this code base already integrates remote dictionaries and dynamic dictionary updates.
254 | The feature will be enabled once Tencent releases official support.
255 |
256 | 🚩 References:
257 | [TexSmart](https://ai.tencent.com/ailab/nlp/texsmart)
258 | [analysis-hanlp](https://github.com/KennFalcon/elasticsearch-analysis-hanlp)
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
1 | mvn --settings=settings.xml -Dmaven.test.skip=true clean install
2 |
--------------------------------------------------------------------------------
/config/texsmart-remote.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
3 | <properties>
4 |     <comment>TexSmart Analyzer extension configuration</comment>
5 |     <!-- Remote extension dictionaries can be configured here -->
6 |     <!-- <entry key="remote_ext_dict">words_location</entry> -->
7 |     <!-- Remote extension stop-word dictionaries can be configured here -->
8 |     <!-- <entry key="remote_ext_stopwords">stop_words_location</entry> -->
9 | </properties>
10 | 
--------------------------------------------------------------------------------
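(For each location configured in texsmart-remote.xml, Dictionary.java schedules a RemoteMonitor that polls the URL every 60 seconds with an HTTP HEAD request and reloads the dictionary when the Last-Modified or ETag value changes.)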
/config/texsmart.properties:
--------------------------------------------------------------------------------
1 | root=.
2 | CoreDictionaryPath=data/nlu/kb/
3 | CustomDictionaryPath=data/nlu/kb/customization/
4 |
5 | path=/etc/elasticsearch/texsmart/data/nlu/kb/
--------------------------------------------------------------------------------
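Of the properties above, `path` is the one read at engine startup (`TexSmartConfig.getConfig().getProperty("path")` in TexSmart.java). A minimal override sketch, assuming the data package was unpacked under /data/texsmart instead of the default location:

```properties
# assumption: TexSmart data package unpacked under /data/texsmart
path=/data/texsmart/data/nlu/kb/
```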
/lib/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/.DS_Store
--------------------------------------------------------------------------------
/lib/jna.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/jna.jar
--------------------------------------------------------------------------------
/lib/libtencent_ai_texsmart.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/libtencent_ai_texsmart.so
--------------------------------------------------------------------------------
/lib/tencent.ai.texsmart.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent.ai.texsmart.jar
--------------------------------------------------------------------------------
/lib/tencent_ai_texsmart.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent_ai_texsmart.dll
--------------------------------------------------------------------------------
/lib/tencent_ai_texsmart.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent_ai_texsmart.lib
--------------------------------------------------------------------------------
/lib/tencent_ai_texsmart.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from ctypes import *
3 | import os
4 | import sys
5 |
6 | my_dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
7 | dll_name = 'libtencent_ai_texsmart.so'
8 | if sys.platform.startswith("win"):
9 | dll_name = 'tencent_ai_texsmart.dll'
10 | elif sys.platform == "cygwin":
11 | dll_name = "tencent_ai_texsmart.dll"
12 | lib = cdll.LoadLibrary(my_dir_path + dll_name)
13 |
14 | class NluToken(Structure):
15 | _fields_ = [
16 | ('str', c_wchar_p),
17 | ('offset', c_uint32),
18 | ('type', c_uint32),
19 | ]
20 |
21 | class NluTerm(Structure):
22 | _fields_ = [
23 | ('str', c_wchar_p),
24 | ('offset', c_uint32),
25 | ('len', c_uint32),
26 | ('start_token', c_uint32),
27 | ('token_count', c_uint32),
28 | ('tag', c_wchar_p),
29 | ('tag_id', c_uint32),
30 | ]
31 |
32 | class NluEntityType(Structure):
33 | _fields_ = [
34 | ('name', c_wchar_p),
35 | ('i18n', c_wchar_p),
36 | ('flag', c_uint32),
37 | ('path', c_wchar_p),
38 | ]
39 |
40 | class NluEntityTypeArray(Structure):
41 | _fields_ = [
42 | ('size', c_uint32),
43 | ('items', POINTER(NluEntityType)),
44 | ]
45 |
46 | class NluEntity(Structure):
47 | _fields_ = [
48 | ('str', c_wchar_p),
49 | ('offset', c_uint32),
50 | ('len', c_uint32),
51 | ('start_token', c_uint32),
52 | ('token_count', c_uint32),
53 | ('type', NluEntityType),
54 | ('alt_types', NluEntityTypeArray),
55 | ('meaning', c_wchar_p),
56 | ]
57 |
58 | class _NluTokenArray(Structure):
59 | _fields_ = [
60 | ('size', c_uint32),
61 | ('items', POINTER(NluToken)),
62 | ]
63 |
64 | class _NluTermArray(Structure):
65 | _fields_ = [
66 | ('size', c_uint32),
67 | ('items', POINTER(NluTerm)),
68 | ]
69 |
70 | class _NluEntityArray(Structure):
71 | _fields_ = [
72 | ('size', c_uint32),
73 | ('items', POINTER(NluEntity)),
74 | ]
75 |
76 | lib.Nlu_CreateEngine.restype = c_void_p
77 | lib.Nlu_CreateEngine.argtypes = [c_char_p, c_int]
78 | lib.Nlu_DestroyEngine.argtypes = [c_void_p]
79 | lib.Nlu_ParseText.restype = c_void_p
80 | lib.Nlu_ParseText.argtypes = [c_void_p, c_wchar_p, c_int]
81 | lib.Nlu_ParseTextExt.restype = c_void_p
82 | lib.Nlu_ParseTextExt.argtypes = [c_void_p, c_wchar_p, c_int, c_wchar_p]
83 | lib.Nlu_DestroyOutput.argtypes = [c_void_p]
84 | lib.Nlu_GetNormText.restype = c_wchar_p
85 | lib.Nlu_GetNormText.argtypes = [c_void_p, POINTER(c_int)]
86 | lib.Nlu_GetTokens.restype = _NluTokenArray
87 | lib.Nlu_GetTokens.argtypes = [c_void_p]
88 | lib.Nlu_GetWords.restype = _NluTermArray
89 | lib.Nlu_GetWords.argtypes = [c_void_p]
90 | lib.Nlu_GetPhrases.restype = _NluTermArray
91 | lib.Nlu_GetPhrases.argtypes = [c_void_p]
92 | lib.Nlu_GetEntities.restype = _NluEntityArray
93 | lib.Nlu_GetEntities.argtypes = [c_void_p]
94 |
95 | class NluOutput(object):
96 | def __init__(self, ptr):
97 | self.obj = ptr
98 | def __del__(self):
99 | if(self.obj is not None):
100 | lib.Nlu_DestroyOutput(self.obj)
101 | self.obj = None
102 | def norm_text(self):
103 | ret = lib.Nlu_GetNormText(self.obj, None)
104 | return ret
105 | def tokens(self):
106 | arr = []
107 | item_list = lib.Nlu_GetTokens(self.obj)
108 | for idx in range(item_list.size):
109 | arr.append(item_list.items[idx])
110 | return arr
111 | def words(self):
112 | arr = []
113 | item_list = lib.Nlu_GetWords(self.obj)
114 | for idx in range(item_list.size):
115 | arr.append(item_list.items[idx])
116 | return arr
117 | def phrases(self):
118 | arr = []
119 | item_list = lib.Nlu_GetPhrases(self.obj)
120 | for idx in range(item_list.size):
121 | arr.append(item_list.items[idx])
122 | return arr
123 | def entities(self):
124 | arr = []
125 | #count = lib.Nlu_GetEntityCount(self.obj)
126 | #for idx in range(count):
127 | #    arr.append(lib.Nlu_GetEntity(self.obj, idx))
128 | item_list = lib.Nlu_GetEntities(self.obj)
129 | for idx in range(item_list.size):
130 | arr.append(item_list.items[idx])
131 | return arr
132 |
133 | class NluEngine(object):
134 | def __init__(self, data_dir, worker_count):
135 | self.obj = lib.Nlu_CreateEngine(data_dir.encode('utf-8'), worker_count)
136 | def __del__(self):
137 | if self.obj is not None:
138 | lib.Nlu_DestroyEngine(self.obj)
139 | self.obj = None
140 | def parse_text(self, input_str):
141 | output_handle = lib.Nlu_ParseText(self.obj, c_wchar_p(input_str), len(input_str))
142 | return NluOutput(output_handle)
143 | def parse_text_ext(self, input_str, options_str):
144 | output_handle = lib.Nlu_ParseTextExt(self.obj, c_wchar_p(input_str), len(input_str), c_wchar_p(options_str))
145 | return NluOutput(output_handle)
146 |
--------------------------------------------------------------------------------
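For reference, a minimal usage sketch of the binding above (the data directory path is an assumption; the calls mirror the NluEngine/NluOutput definitions in tencent_ai_texsmart.py):

```python
# -*- coding: utf-8 -*-
# Minimal sketch: create an engine from a TexSmart data package
# (./data/nlu/kb/ is an assumed path) with 2 worker threads.
from tencent_ai_texsmart import NluEngine

engine = NluEngine('./data/nlu/kb/', 2)
output = engine.parse_text(u'2020年,空调市场冷风吹过')
for term in output.words():   # each item is an NluTerm structure
    print(term.str, term.offset, term.len, term.tag)
```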
/lib/tencent_ai_texsmart.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent_ai_texsmart.pyc
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |     <modelVersion>4.0.0</modelVersion>
4 | 
5 |     <name>elasticsearch-analysis-texsmart</name>
6 |     <groupId>org.elasticsearch</groupId>
7 |     <artifactId>elasticsearch-analysis-texsmart</artifactId>
8 |     <version>${elasticsearch.version}</version>
9 |     <packaging>jar</packaging>
10 |     <description>TexSmart Analyzer for ElasticSearch</description>
11 | 
12 |     <properties>
13 |         <elasticsearch.version>7.6.2</elasticsearch.version>
14 |         <maven.compiler.target>1.8</maven.compiler.target>
15 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16 |         <elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
17 |         <elasticsearch.plugin.name>analysis-texsmart</elasticsearch.plugin.name>
18 |         <elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin</elasticsearch.plugin.classname>
19 |         <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
20 |         <tests.rest.load_packaged>false</tests.rest.load_packaged>
21 |         <skip.unit.tests>true</skip.unit.tests>
22 |         <texsmart.version>sdk-0.1.3</texsmart.version>
23 |         <maven.compiler.source>12</maven.compiler.source>
24 |         <maven.compiler.target>12</maven.compiler.target>
25 |     </properties>
26 | 
27 |     <dependencies>
28 |         <dependency>
29 |             <groupId>org.elasticsearch</groupId>
30 |             <artifactId>elasticsearch</artifactId>
31 |             <version>${elasticsearch.version}</version>
32 |             <scope>compile</scope>
33 |         </dependency>
34 |         <dependency>
35 |             <groupId>org.apache.httpcomponents</groupId>
36 |             <artifactId>httpclient</artifactId>
37 |             <version>4.5.6</version>
38 |         </dependency>
39 |         <dependency>
40 |             <groupId>org.apache.logging.log4j</groupId>
41 |             <artifactId>log4j-api</artifactId>
42 |             <version>2.3</version>
43 |             <scope>compile</scope>
44 |         </dependency>
45 |         <dependency>
46 |             <groupId>org.hamcrest</groupId>
47 |             <artifactId>hamcrest-core</artifactId>
48 |             <version>1.3.RC2</version>
49 |             <scope>test</scope>
50 |         </dependency>
51 |         <dependency>
52 |             <groupId>org.hamcrest</groupId>
53 |             <artifactId>hamcrest-library</artifactId>
54 |             <version>1.3.RC2</version>
55 |             <scope>test</scope>
56 |         </dependency>
57 |         <dependency>
58 |             <groupId>junit</groupId>
59 |             <artifactId>junit</artifactId>
60 |             <version>4.11</version>
61 |             <scope>test</scope>
62 |         </dependency>
63 |         <dependency>
64 |             <groupId>com.sun.jna</groupId>
65 |             <artifactId>com.sun.jna</artifactId>
66 |             <version>1.0</version>
67 |             <scope>system</scope>
68 |             <systemPath>${project.basedir}/lib/jna.jar</systemPath>
69 |         </dependency>
70 |     </dependencies>
71 | 
72 |     <build>
73 |         <plugins>
74 |             <plugin>
75 |                 <groupId>org.apache.maven.plugins</groupId>
76 |                 <artifactId>maven-compiler-plugin</artifactId>
77 |                 <version>3.5.1</version>
78 |                 <configuration>
79 |                     <source>${maven.compiler.target}</source>
80 |                     <target>${maven.compiler.target}</target>
81 |                 </configuration>
82 |             </plugin>
83 |             <plugin>
84 |                 <groupId>org.apache.maven.plugins</groupId>
85 |                 <artifactId>maven-surefire-plugin</artifactId>
86 |                 <version>2.11</version>
87 |                 <configuration>
88 |                     <includes>
89 |                         <include>**/*Tests.java</include>
90 |                     </includes>
91 |                 </configuration>
92 |             </plugin>
93 |             <plugin>
94 |                 <groupId>org.apache.maven.plugins</groupId>
95 |                 <artifactId>maven-source-plugin</artifactId>
96 |                 <version>2.1.2</version>
97 |                 <executions>
98 |                     <execution>
99 |                         <id>attach-sources</id>
100 |                         <goals>
101 |                             <goal>jar</goal>
102 |                         </goals>
103 |                     </execution>
104 |                 </executions>
105 |             </plugin>
106 |             <plugin>
107 |                 <artifactId>maven-assembly-plugin</artifactId>
108 |                 <configuration>
109 |                     <appendAssemblyId>false</appendAssemblyId>
110 |                     <outputDirectory>${project.build.directory}/releases/</outputDirectory>
111 |                     <descriptors>
112 |                         <descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
113 |                     </descriptors>
114 |                     <archive>
115 |                         <manifest>
116 |                             <mainClass>fully.qualified.MainClass</mainClass>
117 |                         </manifest>
118 |                     </archive>
119 |                 </configuration>
120 |                 <executions>
121 |                     <execution>
122 |                         <phase>package</phase>
123 |                         <goals>
124 |                             <goal>single</goal>
125 |                         </goals>
126 |                     </execution>
127 |                 </executions>
128 |             </plugin>
129 |         </plugins>
130 |         <resources>
131 |             <resource>
132 |                 <directory>lib</directory>
133 |                 <targetPath>BOOT-INF/lib/</targetPath>
134 |                 <includes>
135 |                     <include>**/*.jar</include>
136 |                 </includes>
137 |             </resource>
138 |         </resources>
139 |     </build>
140 | 
141 |     <profiles>
142 |         <profile>
143 |             <id>disable-java8-doclint</id>
144 |             <activation>
145 |                 <jdk>[1.8,)</jdk>
146 |             </activation>
147 |             <properties>
148 |                 <additionalparam>-Xdoclint:none</additionalparam>
149 |             </properties>
150 |         </profile>
151 |         <profile>
152 |             <id>release</id>
153 |             <build>
154 |                 <plugins>
155 |                     <plugin>
156 |                         <groupId>org.apache.maven.plugins</groupId>
157 |                         <artifactId>maven-jar-plugin</artifactId>
158 |                         <version>3.1.2</version>
159 |                         <configuration>
160 |                             <excludes>
161 |                                 <exclude>texsmart.properties</exclude>
162 |                             </excludes>
163 |                         </configuration>
164 |                     </plugin>
165 |                     <plugin>
166 |                         <groupId>org.apache.maven.plugins</groupId>
167 |                         <artifactId>maven-compiler-plugin</artifactId>
168 |                         <version>3.8.0</version>
169 |                         <configuration>
170 |                             <source>${maven.compiler.target}</source>
171 |                             <target>${maven.compiler.target}</target>
172 |                         </configuration>
173 |                     </plugin>
174 |                 </plugins>
175 |             </build>
176 |         </profile>
177 |     </profiles>
178 | </project>
--------------------------------------------------------------------------------
/settings.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
3 |           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |           xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
5 |     <mirrors>
6 |         <mirror>
7 |             <id>aliyunmaven</id>
8 |             <mirrorOf>*</mirrorOf>
9 |             <name>Aliyun public repository</name>
10 |             <url>https://maven.aliyun.com/repository/public</url>
11 |         </mirror>
12 |     </mirrors>
13 | </settings>
--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <assembly>
3 |     <id>analysis-texsmart-release</id>
4 |     <formats>
5 |         <format>zip</format>
6 |     </formats>
7 |     <includeBaseDirectory>false</includeBaseDirectory>
8 |     <fileSets>
9 |         <fileSet>
10 |             <directory>${project.basedir}/config</directory>
11 |             <outputDirectory>/config</outputDirectory>
12 |         </fileSet>
13 |     </fileSets>
14 |     <files>
15 |         <file>
16 |             <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
17 |             <outputDirectory>/</outputDirectory>
18 |             <filtered>true</filtered>
19 |         </file>
20 |         <file>
21 |             <source>${project.basedir}/src/main/resources/plugin-security.policy</source>
22 |             <outputDirectory>/</outputDirectory>
23 |             <filtered>true</filtered>
24 |         </file>
25 |         <file>
26 |             <source>${project.basedir}/config/texsmart-remote.xml</source>
27 |             <outputDirectory>/</outputDirectory>
28 |             <filtered>true</filtered>
29 |         </file>
30 |         <file>
31 |             <source>${project.basedir}/config/texsmart.properties</source>
32 |             <outputDirectory>/</outputDirectory>
33 |             <filtered>true</filtered>
34 |         </file>
35 |     </files>
36 |     <dependencySets>
37 |         <dependencySet>
38 |             <outputDirectory>/</outputDirectory>
39 |             <useProjectArtifact>true</useProjectArtifact>
40 |             <useTransitiveFiltering>true</useTransitiveFiltering>
41 |             <excludes>
42 |                 <exclude>org.elasticsearch:elasticsearch</exclude>
43 |             </excludes>
44 |         </dependencySet>
45 |         <dependencySet>
46 |             <outputDirectory>/</outputDirectory>
47 |             <useProjectArtifact>true</useProjectArtifact>
48 |             <useTransitiveFiltering>true</useTransitiveFiltering>
49 |             <includes>
50 |                 <include>${pom.basedir}/lib/jna.jar</include>
51 |             </includes>
52 |             <excludes>
53 |                 <exclude>org.apache.lucene:lucene-core</exclude>
54 |                 <exclude>org.apache.lucene:lucene-analyzers-common</exclude>
55 |                 <exclude>org.apache.lucene:lucene-queryparser</exclude>
56 |                 <exclude>org.apache.lucene:lucene-sandbox</exclude>
57 |             </excludes>
58 |         </dependencySet>
59 |         <dependencySet>
60 |             <outputDirectory>/</outputDirectory>
61 |             <useProjectArtifact>true</useProjectArtifact>
62 |             <useTransitiveFiltering>true</useTransitiveFiltering>
63 |             <includes>
64 |                 <include>com.fasterxml.jackson.core:jackson-databind</include>
65 |                 <include>com.fasterxml.jackson.core:jackson-annotations</include>
66 |             </includes>
67 |             <excludes>
68 |                 <exclude>com.fasterxml.jackson.core:jackson-core</exclude>
69 |             </excludes>
70 |         </dependencySet>
71 |         <dependencySet>
72 |             <outputDirectory>/</outputDirectory>
73 |             <useProjectArtifact>true</useProjectArtifact>
74 |             <useTransitiveFiltering>true</useTransitiveFiltering>
75 |             <includes>
76 |                 <include>org.apache.httpcomponents:httpclient</include>
77 |             </includes>
78 |         </dependencySet>
79 |     </dependencySets>
80 | </assembly>
--------------------------------------------------------------------------------
/src/main/java/com/texsmart/TexSmart.java:
--------------------------------------------------------------------------------
1 | package com.texsmart;
2 |
3 | import com.texsmart.dic.config.TexSmartConfig;
4 | import com.texsmart.help.ESPluginLoggerFactory;
5 | import com.texsmart.seg.Segment;
6 | import com.texsmart.seg.TexSmartBasicSegment;
7 | import com.texsmart.tokenizer.StandardTokenizer;
8 | import org.apache.logging.log4j.Logger;
9 | import tencent.ai.texsmart.NluEngine;
10 | import tencent.ai.texsmart.NluOutput.Term;
11 |
12 | import java.util.List;
13 |
14 | public class TexSmart {
15 |
16 | private static final Logger logger = ESPluginLoggerFactory.getLogger(TexSmart.class.getName());
17 |
18 | public static NluEngine TEX_ENGINE;
19 |
20 | static {
21 | TEX_ENGINE = new NluEngine();
22 | int workerCount = Runtime.getRuntime().availableProcessors();
23 | logger.info("texsmart analysis is initializing");
24 | boolean ret = TEX_ENGINE.init(TexSmartConfig.getConfig().getProperty("path"), workerCount);
25 | if (!ret) {
26 | logger.info("texsmart analysis load failed");
27 | } else {
28 | logger.info("texsmart analysis load success");
29 | }
30 | }
31 |
32 | private TexSmart() {
33 | }
34 |
35 | public static List<Term> segment(String text) {
36 | return StandardTokenizer.segment(text);
37 | }
38 |
39 | public static Segment newSegment() {
40 | return new TexSmartBasicSegment();
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/java/com/texsmart/cfg/Configuration.java:
--------------------------------------------------------------------------------
1 | package com.texsmart.cfg;
2 |
3 | import com.texsmart.dic.Dictionary;
4 | import org.elasticsearch.common.inject.Inject;
5 | import org.elasticsearch.common.settings.Settings;
6 | import org.elasticsearch.env.Environment;
7 | import org.elasticsearch.index.analysis.NerAlgType;
8 | import org.elasticsearch.index.analysis.PosAlgType;
9 |
10 | /**
11 | * @project: elasticsearch-analysis-texsmart
12 | * @description: Plugin configuration
13 | * @author: wei_liu
14 | * @create: 2020-09-09 15:10
15 | */
16 | public class Configuration {
17 |
18 | private Environment environment;
19 |
20 | private Settings settings;
21 |
22 | private boolean enablePorterStemming;
23 |
24 | private boolean enableIndexMode;
25 |
26 | private boolean enableCustomDictionary;
27 |
28 | private boolean enableRemoteDict;
29 |
30 | private boolean enableNormalization;
31 |
32 | private boolean enableOffset;
33 |
34 | private boolean enableCustomConfig;
35 |
36 | private boolean enableStopDictionary;
37 |
38 | private PosAlgType enablePosAlg;
39 | private NerAlgType enableNerAlg;
40 |
41 | @Inject
42 | public Configuration(Environment env, Settings settings) {
43 | this.environment = env;
44 | this.settings = settings;
45 | this.enablePorterStemming = settings.get("enable_porter_stemming", "false").equals("true");
46 | this.enableIndexMode = settings.get("enable_index_mode", "false").equals("true");
47 | this.enableCustomDictionary = settings.get("enable_custom_dictionary", "true").equals("true");
48 | this.enableStopDictionary = settings.get("enable_stop_dictionary", "false").equals("true");
49 | this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
50 | this.enableNormalization = settings.get("enable_normalization", "false").equals("true");
51 | this.enableOffset = settings.get("enable_offset", "true").equals("true");
52 | this.enableCustomConfig = settings.get("enable_custom_config", "false").equals("true");
53 | try {
54 | this.enablePosAlg = PosAlgType.valueOf(settings.get("enable_pos_alg", "log_linear").toUpperCase());
55 | this.enableNerAlg = NerAlgType.valueOf(settings.get("enable_ner_alg", "crf").toUpperCase());
56 | } catch (IllegalArgumentException e) {
57 | this.enablePosAlg = PosAlgType.LOG_LINEAR;
58 | this.enableNerAlg = NerAlgType.CRF;
59 | }
60 | Dictionary.initial(this);
61 | }
62 |
63 | public Environment getEnvironment() {
64 | return this.environment;
65 | }
66 |
67 | public Settings getSettings() {
68 | return this.settings;
69 | }
70 |
71 | public boolean isEnablePorterStemming() {
72 | return this.enablePorterStemming;
73 | }
74 |
75 | public Configuration enablePorterStemming(boolean enablePorterStemming) {
76 | this.enablePorterStemming = enablePorterStemming;
77 | return this;
78 | }
79 |
80 | public boolean isEnableStopDictionary() {
81 | return this.enableStopDictionary;
82 | }
83 |
84 | public boolean isEnableIndexMode() {
85 | return this.enableIndexMode;
86 | }
87 |
88 | public Configuration enableIndexMode(boolean enableIndexMode) {
89 | this.enableIndexMode = enableIndexMode;
90 | return this;
91 | }
92 |
93 | public boolean isEnableCustomDictionary() {
94 | return this.enableCustomDictionary;
95 | }
96 |
97 | public Configuration enableCustomDictionary(boolean enableCustomDictionary) {
98 | this.enableCustomDictionary = enableCustomDictionary;
99 | return this;
100 | }
101 |
102 | public boolean isEnableRemoteDict() {
103 | return enableRemoteDict;
104 | }
105 |
106 | public Configuration enableRemoteDict(boolean enableRemoteDict) {
107 | this.enableRemoteDict = enableRemoteDict;
108 | return this;
109 | }
110 |
111 | public boolean isEnableNormalization() {
112 | return enableNormalization;
113 | }
114 |
115 | public Configuration enableNormalization(boolean enableNormalization) {
116 | this.enableNormalization = enableNormalization;
117 | return this;
118 | }
119 |
120 | public boolean isEnableOffset() {
121 | return enableOffset;
122 | }
123 |
124 | public Configuration enableOffset(boolean enableOffset) {
125 | this.enableOffset = enableOffset;
126 | return this;
127 | }
128 |
129 | public boolean isEnableCustomConfig() {
130 | return enableCustomConfig;
131 | }
132 |
133 | public Configuration enableCustomConfig(boolean enableCustomConfig) {
134 | this.enableCustomConfig = enableCustomConfig;
135 | return this;
136 | }
137 |
138 | public PosAlgType getEnablePosAlg() {
139 | return this.enablePosAlg;
140 | }
141 |
142 | public Configuration enablePosAlg(PosAlgType enablePosAlg) {
143 | this.enablePosAlg = enablePosAlg;
144 | return this;
145 | }
146 |
147 | public NerAlgType getEnableNerAlg() {
148 | return this.enableNerAlg;
149 | }
150 |
151 | public Configuration enableNerAlg(NerAlgType enableNerAlg) {
152 | this.enableNerAlg = enableNerAlg;
153 | return this;
154 | }
155 | }
156 |
--------------------------------------------------------------------------------
/src/main/java/com/texsmart/dic/Dictionary.java:
--------------------------------------------------------------------------------
1 | package com.texsmart.dic;
2 |
3 | import com.texsmart.cfg.Configuration;
4 | import com.texsmart.dic.cache.DictionaryFileCache;
5 | import com.texsmart.dic.config.RemoteDictConfig;
6 | import org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin;
7 |
8 | import java.nio.file.Path;
9 | import java.util.concurrent.Executors;
10 | import java.util.concurrent.ScheduledExecutorService;
11 | import java.util.concurrent.TimeUnit;
12 |
13 | /**
14 | * @project: elasticsearch-analysis-texsmart
15 | * @description: Dictionary class
16 | * @author: wei_liu
17 | * @create: 2020-09-09 15:10
18 | */
19 | public class Dictionary {
20 | /**
21 | * Dictionary singleton instance
22 | */
23 | private static Dictionary singleton;
24 | /**
25 | * TexSmart configuration file name
26 | */
27 | public static final String CONFIG_FILE_NAME = "texsmart.properties";
28 | /**
29 | * TexSmart remote dictionary configuration file name
30 | */
31 | private static final String REMOTE_CONFIG_FILE_NAME = "texsmart-remote.xml";
32 |
33 | private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
34 |
35 | private Dictionary(Configuration configuration) {
36 | Path configDir = configuration.getEnvironment().configFile().resolve(AnalysisTexSmartPlugin.PLUGIN_NAME);
37 | DictionaryFileCache.configCachePath(configuration);
38 | DictionaryFileCache.loadCache();
39 | RemoteDictConfig.initial(configDir.resolve(REMOTE_CONFIG_FILE_NAME).toString());
40 | }
41 |
42 | public static synchronized Dictionary initial(Configuration configuration) {
43 | if (singleton == null) {
44 | synchronized (Dictionary.class) {
45 | if (singleton == null) {
46 | singleton = new Dictionary(configuration);
47 | pool.scheduleAtFixedRate(new ExtMonitor(), 10, 60, TimeUnit.SECONDS);
48 | if (configuration.isEnableRemoteDict()) {
49 | for (String location : RemoteDictConfig.getSingleton().getRemoteExtDictionarys()) {
50 | pool.scheduleAtFixedRate(new RemoteMonitor(location, "custom"), 10, 60, TimeUnit.SECONDS);
51 | }
52 |
53 | for (String location : RemoteDictConfig.getSingleton().getRemoteExtStopWordDictionarys()) {
54 | pool.scheduleAtFixedRate(new RemoteMonitor(location, "stop"), 10, 60, TimeUnit.SECONDS);
55 | }
56 | }
57 | return singleton;
58 | }
59 | }
60 | }
61 | return singleton;
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/com/texsmart/dic/DictionaryFile.java:
--------------------------------------------------------------------------------
1 | package com.texsmart.dic;
2 |
3 | import java.io.DataInputStream;
4 | import java.io.DataOutputStream;
5 | import java.io.IOException;
6 | import java.nio.charset.StandardCharsets;
7 | import java.util.Objects;
8 |
9 | /**
10 | * @project: elasticsearch-analysis-texsmart
11 | * @description: Custom dictionary file metadata
12 | * @author: wei_liu
13 | * @create: 2020-09-09 15:10
14 | */
15 | public class DictionaryFile {
16 |
17 | private String path;
18 |
19 | private String type;
20 |
21 | private long lastModified;
22 |
23 | public DictionaryFile() {
24 | }
25 |
26 | DictionaryFile(String path, long lastModified) {
27 | this.path = path;
28 | this.lastModified = lastModified;
29 | }
30 |
31 | DictionaryFile(String path, String type, long lastModified) {
32 | this(path, lastModified);
33 | this.type = type;
34 | }
35 |
36 | public String getPath() {
37 | return path;
38 | }
39 |
40 | public void setPath(String path) {
41 | this.path = path;
42 | }
43 |
44 | public String getType() {
45 | return type;
46 | }
47 |
48 | public void setType(String type) {
49 | this.type = type;
50 | }
51 |
52 | public long getLastModified() {
53 | return lastModified;
54 | }
55 |
56 | public void setLastModified(long lastModified) {
57 | this.lastModified = lastModified;
58 | }
59 |
60 | public void write(DataOutputStream out) throws IOException {
61 | if (path != null && path.length() != 0) {
62 | byte[] bytes = path.getBytes(StandardCharsets.UTF_8);
63 | out.writeInt(bytes.length);
64 | out.write(bytes);
65 | } else {
66 | out.writeInt(0);
67 | }
68 | if (type != null && type.length() != 0) {
69 | byte[] bytes = type.getBytes(StandardCharsets.UTF_8);
70 | out.writeInt(bytes.length);
71 | out.write(bytes);
72 | } else {
73 | out.writeInt(0);
74 | }
75 | out.writeLong(lastModified);
76 | }
77 |
78 | public void read(DataInputStream in) throws IOException {
79 | int pathLength = in.readInt();
80 | if (pathLength != 0) {
81 | byte[] bytes = new byte[pathLength];
82 | in.readFully(bytes);
83 | path = new String(bytes, StandardCharsets.UTF_8);
84 | }
85 |
86 | int typeLength = in.readInt();
87 | if (typeLength != 0) {
88 | byte[] bytes = new byte[typeLength];
89 | in.readFully(bytes);
90 | type = new String(bytes, StandardCharsets.UTF_8);
91 | }
92 | lastModified = in.readLong();
93 | }
94 |
95 | @Override
96 | public boolean equals(Object o) {
97 | if (this == o) {
98 | return true;
99 | }
100 | if (o == null || getClass() != o.getClass()) {
101 | return false;
102 | }
103 | DictionaryFile that = (DictionaryFile) o;
104 | return lastModified == that.lastModified &&
105 | Objects.equals(path, that.path) &&
106 | Objects.equals(type, that.type);
107 | }
108 |
109 | @Override
110 | public int hashCode() {
111 | return Objects.hash(path, type, lastModified);
112 | }
113 |
114 | @Override
115 | public String toString() {
116 | return "DictionaryFile{" +
117 | "path='" + path + '\'' +
118 | ", lastModified=" + lastModified +
119 | '}';
120 | }
121 | }
122 |
--------------------------------------------------------------------------------
/src/main/java/com/texsmart/dic/ExtMonitor.java:
--------------------------------------------------------------------------------
1 | package com.texsmart.dic;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.dic.cache.DictionaryFileCache;
5 | import com.texsmart.help.ESPluginLoggerFactory;
6 | import org.apache.logging.log4j.Logger;
7 | import org.elasticsearch.SpecialPermission;
8 |
9 | import java.io.File;
10 | import java.io.FileInputStream;
11 | import java.io.InputStreamReader;
12 | import java.security.AccessController;
13 | import java.security.PrivilegedAction;
14 | import java.util.ArrayList;
15 | import java.util.Arrays;
16 | import java.util.List;
17 | import java.util.Properties;
18 |
19 | /**
20 | * @project: elasticsearch-analysis-hanlp
21 | * @description: Custom dictionary monitor thread
22 | * @author: Kenn
23 | * @create: 2018-12-14 15:10
24 | */
25 | public class ExtMonitor implements Runnable {
26 |
27 | private static final Logger logger = ESPluginLoggerFactory.getLogger(ExtMonitor.class.getName());
28 |
29 | ExtMonitor() {
30 | SecurityManager sm = System.getSecurityManager();
31 | if (sm != null) {
32 | sm.checkPermission(new SpecialPermission());
33 | }
34 | }
35 |
36 | @Override
37 | public void run() {
38 | // List originalDictionaryFileList = DictionaryFileCache.getCustomDictionaryFileList();
39 | // logger.debug("hanlp original custom dictionary: {}", Arrays.toString(originalDictionaryFileList.toArray()));
40 | // reloadProperty();
41 | // List currentDictironaryFileList = getCurrentDictionaryFileList(TexSmart.Config.CustomDictionaryPath);
42 | // logger.debug("hanlp current custom dictionary: {}", Arrays.toString(currentDictironaryFileList.toArray()));
43 | // boolean isModified = false;
44 | // for (DictionaryFile currentDictionaryFile : currentDictironaryFileList) {
45 | // if (!originalDictionaryFileList.contains(currentDictionaryFile)) {
46 | // isModified = true;
47 | // break;
48 | // }
49 | // }
50 | // if (isModified) {
51 | // logger.info("reloading hanlp custom dictionary");
52 | // try {
53 | // AccessController.doPrivileged((PrivilegedAction) CustomDictionaryUtility::reload);
54 | // } catch (Exception e) {
55 | // logger.error("can not reload hanlp custom dictionary", e);
56 | // }
57 | // DictionaryFileCache.setCustomDictionaryFileList(currentDictironaryFileList);
58 | // DictionaryFileCache.writeCache();
59 | // logger.info("finish reload hanlp custom dictionary");
60 | // } else {
61 | // logger.info("hanlp custom dictionary isn't modified, so no need reload");
62 | // }
63 | }
64 |
65 | private void reloadProperty() {
66 | // Properties p = new Properties();
67 | // try {
68 | // ClassLoader loader = AccessController.doPrivileged((PrivilegedAction) () -> Thread.currentThread().getContextClassLoader());
69 | // if (loader == null) {
70 | // loader = HanLP.Config.class.getClassLoader();
71 | // }
72 | // p.load(new InputStreamReader(Predefine.HANLP_PROPERTIES_PATH == null ? loader.getResourceAsStream("hanlp.properties") : new FileInputStream(Predefine.HANLP_PROPERTIES_PATH), "UTF-8"));
73 | // String root = p.getProperty("root", "").replaceAll("\\\\", "/");
74 | // if (root.length() > 0 && !root.endsWith("/")) {
75 | // root += "/";
76 | // }
77 | // String[] pathArray = p.getProperty("CustomDictionaryPath", "data/dictionary/custom/CustomDictionary.txt").split(";");
78 | // String prePath = root;
79 | // for (int i = 0; i < pathArray.length; ++i) {
80 | // if (pathArray[i].startsWith(" ")) {
81 | // pathArray[i] = prePath + pathArray[i].trim();
82 | // } else {
83 | // pathArray[i] = root + pathArray[i];
84 | // int lastSplash = pathArray[i].lastIndexOf('/');
85 | // if (lastSplash != -1) {
86 | // prePath = pathArray[i].substring(0, lastSplash + 1);
87 | // }
88 | // }
89 | // }
90 | // AccessController.doPrivileged((PrivilegedAction) () -> HanLP.Config.CustomDictionaryPath = pathArray);
91 | // } catch (Exception e) {
92 | // logger.error("can not find hanlp.properties", e);
93 | // }
94 | // }
95 | //
96 | // private List getCurrentDictionaryFileList(String[] customDictionaryPaths) {
97 | // List dictionaryFileList = new ArrayList<>();
98 | // for (String customDictionaryPath : customDictionaryPaths) {
99 | // String[] customDictionaryPathTuple = customDictionaryPath.split(" ");
100 | // String path = customDictionaryPathTuple[0].trim();
101 | // logger.debug("hanlp custom path: {}", path);
102 | // File file = new File(path);
103 | // AccessController.doPrivileged((PrivilegedAction) () -> {
104 | // if (file.exists()) {
105 | // if (customDictionaryPathTuple.length > 1) {
106 | // if (customDictionaryPathTuple[1] == null || customDictionaryPathTuple[1].length() == 0) {
107 | // dictionaryFileList.add(new DictionaryFile(path, file.lastModified()));
108 | // } else {
109 | // dictionaryFileList.add(new DictionaryFile(path, customDictionaryPathTuple[1].trim(), file.lastModified()));
110 | // }
111 | // } else {
112 | // dictionaryFileList.add(new DictionaryFile(path, file.lastModified()));
113 | // }
114 | // }
115 | // return null;
116 | // });
117 | // }
118 | // return dictionaryFileList;
119 | }
120 | }
121 |
122 |
--------------------------------------------------------------------------------
/src/main/java/com/texsmart/dic/RemoteMonitor.java:
--------------------------------------------------------------------------------
1 | package com.texsmart.dic;
2 |
3 | import com.texsmart.help.ESPluginLoggerFactory;
4 | import org.apache.http.HttpStatus;
5 | import org.apache.http.client.config.RequestConfig;
6 | import org.apache.http.client.methods.CloseableHttpResponse;
7 | import org.apache.http.client.methods.HttpGet;
8 | import org.apache.http.client.methods.HttpHead;
9 | import org.apache.http.impl.client.CloseableHttpClient;
10 | import org.apache.http.impl.client.HttpClients;
11 | import org.apache.logging.log4j.Logger;
12 | import org.elasticsearch.SpecialPermission;
13 | import org.elasticsearch.common.collect.Tuple;
14 | import org.elasticsearch.core.internal.io.IOUtils;
15 |
16 | import java.io.BufferedReader;
17 | import java.io.IOException;
18 | import java.io.InputStreamReader;
19 | import java.nio.charset.Charset;
20 | import java.nio.charset.StandardCharsets;
21 | import java.security.AccessController;
22 | import java.security.PrivilegedAction;
23 |
24 | /**
25 | * @project: elasticsearch-analysis-hanlp
26 | * @description: Remote custom dictionary monitor thread
27 | * @author: Kenn
28 | * @create: 2018-12-14 15:10
29 | */
30 | public class RemoteMonitor implements Runnable {
31 |
32 | private static final Logger logger = ESPluginLoggerFactory.getLogger(RemoteMonitor.class.getName());
33 |
34 | private static CloseableHttpClient httpclient = HttpClients.createDefault();
35 | /**
36 | * Last-Modified value from the previous response
37 | */
38 | private String last_modified;
39 | /**
40 | * ETag value from the previous response
41 | */
42 | private String eTags;
43 | /**
44 | * Request URL
45 | */
46 | private String location;
47 | /**
48 | * Dictionary type
49 | */
50 | private String type;
51 |
52 | private static final String SPLITTER = "\\s";
53 |
54 | public RemoteMonitor(String location, String type) {
55 | this.location = location;
56 | this.type = type;
57 | this.last_modified = null;
58 | this.eTags = null;
59 | }
60 |
61 | @Override
62 | public void run() {
63 | SpecialPermission.check();
64 | AccessController.doPrivileged((PrivilegedAction) () -> {
65 | runUnprivileged();
66 | return null;
67 | });
68 | }
69 |
70 | /**
71 | * Monitoring flow:
72 | * (1) Send a HEAD request to the dictionary server
73 | * (2) Read the Last-Modified / ETag header values from the response and check whether they changed
74 | * (3) If unchanged, sleep for 1 min and return to step (1)
75 | * (4) If changed, reload the dictionary
76 | * (5) Sleep for 1 min and return to step (1)
77 | */
78 |
79 | private void runUnprivileged() {
80 | String path = location.split(SPLITTER)[0];
81 |
82 | HttpHead head = new HttpHead(path);
83 | // head.setConfig(buildRequestConfig());
84 |
85 | // set the conditional request headers
86 | if (last_modified != null) {
87 | head.setHeader("If-Modified-Since", last_modified);
88 | }
89 | if (eTags != null) {
90 | head.setHeader("If-None-Match", eTags);
91 | }
92 |
93 | CloseableHttpResponse response = null;
94 | try {
95 | response = httpclient.execute(head);
96 | if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
97 | if ((response.getLastHeader("Last-Modified") != null) && !response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified)) {
98 | loadRemoteCustomWords(response);
99 | } else if ((response.getLastHeader("ETag") != null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags)) {
100 | loadRemoteCustomWords(response);
101 | }
102 | } else if (response.getStatusLine().getStatusCode() == HttpStatus.SC_NOT_MODIFIED) {
103 | logger.info("remote_ext_dict {} is without modified", location);
104 | } else {
105 | logger.info("remote_ext_dict {} return bad code {}", location, response.getStatusLine().getStatusCode());
106 | }
107 | } catch (Exception e) {
108 | e.printStackTrace();
109 | logger.error("remote_ext_dict {} error!", location, e);
110 | } finally {
111 | try {
112 | if (response != null) {
113 | response.close();
114 | }
115 | } catch (IOException e) {
116 | logger.error(e.getMessage(), e);
117 | }
118 | }
119 | }
120 |
121 | /**
122 | * Load the remote custom dictionary
123 | *
124 | * @param response HEAD response
125 | */
126 | private void loadRemoteCustomWords(CloseableHttpResponse response) {
127 | switch (type) {
128 | case "custom":
129 | logger.info("load hanlp remote custom dict path: {}", location);
130 | loadRemoteWordsUnprivileged(location);
131 | logger.info("finish load hanlp remote custom dict path: {}", location);
132 | break;
133 | case "stop":
134 | logger.info("load hanlp remote stop words path: {}", location);
135 | // loadRemoteStopWordsUnprivileged(location);
136 | logger.info("finish load hanlp remote stop words path: {}", location);
137 | break;
138 | default:
139 | return;
140 | }
141 | last_modified = response.getLastHeader("Last-Modified") == null ? null : response.getLastHeader("Last-Modified").getValue();
142 | eTags = response.getLastHeader("ETag") == null ? null : response.getLastHeader("ETag").getValue();
143 | }
144 |
145 | /**
146 | * Download custom entries from the remote server
147 | *
148 | * @param location configuration entry
149 | */
150 | private void loadRemoteWordsUnprivileged(String location) {
151 | // Tuple defaultInfo = analysisDefaultInfo(location);
152 | // CloseableHttpClient httpclient = HttpClients.createDefault();
153 | // CloseableHttpResponse response = null;
154 | // BufferedReader in = null;
155 | // HttpGet get = new HttpGet(defaultInfo.v1());
156 | // get.setConfig(buildRequestConfig());
157 | // try {
158 | // response = httpclient.execute(get);
159 | // if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
160 | // in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), analysisDefaultCharset(response)));
161 | // String line;
162 | // boolean firstLine = true;
163 | // while ((line = in.readLine()) != null) {
164 | // if (firstLine) {
165 | // line = IOUtil.removeUTF8BOM(line);
166 | // firstLine = false;
167 | // }
168 | //
169 | //             // split the line into fields
170 | // String[] param = line.split(SPLITTER);
171 | // String word = param[0];
172 | //
173 | //             // skip empty lines
174 | // if (word.length() == 0) {
175 | // continue;
176 | // }
177 | //
178 | //             // normalization
179 | // if (HanLP.Config.Normalization) {
180 | // word = CharTable.convert(word);
181 | // }
182 | // logger.debug("hanlp remote custom word: {}", word);
183 | // CustomDictionary.insert(word, analysisNatureWithFrequency(defaultInfo.v2(), param));
184 | // }
185 | // in.close();
186 | // response.close();
187 | // }
188 | // response.close();
189 | // } catch (IllegalStateException | IOException e) {
190 | // logger.error("get remote words {} error", e, location);
191 | // } finally {
192 | // try {
193 | // IOUtils.close(in);
194 | // IOUtils.close(response);
195 | // } catch (Exception e) {
196 | // e.printStackTrace();
197 | // }
198 | // }
199 | // }
200 | //
201 | // /**
202 | //      * Download stop-word entries from the remote server
203 | //      *
204 | //      * @param location configuration entry
205 | // */
206 | // private void loadRemoteStopWordsUnprivileged(String location) {
207 | // CloseableHttpClient httpclient = HttpClients.createDefault();
208 | // CloseableHttpResponse response = null;
209 | // BufferedReader in = null;
210 | // HttpGet get = new HttpGet(location);
211 | // get.setConfig(buildRequestConfig());
212 | // try {
213 | // response = httpclient.execute(get);
214 | // if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
215 | // in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), analysisDefaultCharset(response)));
216 | // String line;
217 | // boolean firstLine = true;
218 | // while ((line = in.readLine()) != null) {
219 | // if (firstLine) {
220 | // line = IOUtil.removeUTF8BOM(line);
221 | // firstLine = false;
222 | // }
223 | // logger.debug("hanlp remote stop word: {}", line);
224 | // CoreStopWordDictionary.add(line);
225 | // }
226 | // in.close();
227 | // response.close();
228 | // }
229 | // response.close();
230 | // } catch (IllegalStateException | IOException e) {
231 | // logger.error("get remote words {} error", e, location);
232 | // } finally {
233 | // try {
234 | // IOUtils.close(in);
235 | // IOUtils.close(response);
236 | // } catch (Exception e) {
237 | // e.printStackTrace();
238 | // }
239 | // }
240 | // }
241 | //
242 | // private RequestConfig buildRequestConfig() {
243 | // return RequestConfig.custom()
244 | // .setConnectionRequestTimeout(10 * 1000)
245 | // .setConnectTimeout(10 * 1000)
246 | // .setSocketTimeout(60 * 1000)
247 | // .build();
248 | // }
249 | //
250 | // /**
251 | //      * Detect the default charset
252 | //      *
253 | //      * @param response the response
254 | //      * @return the charset
255 | // */
256 | // private Charset analysisDefaultCharset(CloseableHttpResponse response) {
257 | // Charset charset = StandardCharsets.UTF_8;
258 | //         // read the charset from Content-Type, defaulting to utf-8
259 | // if (response.getEntity().getContentType().getValue().contains("charset=")) {
260 | // String contentType = response.getEntity().getContentType().getValue();
261 | // charset = Charset.forName(contentType.substring(contentType.lastIndexOf("=") + 1));
262 | // }
263 | // return charset;
264 | // }
265 | //
266 | // /**
267 | //      * Parse the default info
268 | //      *
269 | //      * @param location configured path
270 | //      * @return new Tuple<path, default nature>
271 | // */
272 | // private Tuple analysisDefaultInfo(String location) {
273 | // Nature defaultNature = Nature.n;
274 | // String path = location;
275 | // int cut = location.indexOf(' ');
276 | // if (cut > 0) {
277 | //             // a default nature is present
278 | // String nature = location.substring(cut + 1);
279 | // path = location.substring(0, cut);
280 | // defaultNature = LexiconUtility.convertStringToNature(nature);
281 | // }
282 | // return Tuple.tuple(path, defaultNature);
283 | // }
284 | //
285 | // /**
286 | //      * Parse natures and frequencies
287 | //      *
288 | //      * @param defaultNature default nature
289 | //      * @param param line fields
290 | //      * @return [word] [nature A] [freq of A] [nature B] [freq of B] ...
291 | // */
292 | // private String analysisNatureWithFrequency(Nature defaultNature, String[] param) {
293 | // int natureCount = (param.length - 1) / 2;
294 | // StringBuilder builder = new StringBuilder();
295 | // if (natureCount == 0) {
296 | // builder.append(defaultNature).append(" ").append(1000);
297 | // } else {
298 | // for (int i = 0; i < natureCount; ++i) {
299 | // Nature nature = LexiconUtility.convertStringToNature(param[1 + 2 * i]);
300 | // int frequency = Integer.parseInt(param[2 + 2 * i]);
301 | // builder.append(nature).append(" ").append(frequency);
302 | // if (i != natureCount - 1) {
303 | // builder.append(" ");
304 | // }
305 | // }
306 | // }
307 | // return builder.toString();
308 | }
309 | }
310 |
311 |
--------------------------------------------------------------------------------
/src/main/java/com/texsmart/dic/cache/DictionaryFileCache.java:
--------------------------------------------------------------------------------
1 | package com.texsmart.dic.cache;
2 |
3 | import com.texsmart.cfg.Configuration;
4 | import com.texsmart.dic.DictionaryFile;
5 | import com.texsmart.help.ESPluginLoggerFactory;
6 | import org.apache.logging.log4j.Logger;
7 | import org.elasticsearch.core.internal.io.IOUtils;
8 | import org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin;
9 |
10 | import java.io.*;
11 | import java.nio.file.Path;
12 | import java.security.AccessController;
13 | import java.security.PrivilegedAction;
14 | import java.util.ArrayList;
15 | import java.util.Arrays;
16 | import java.util.List;
17 |
18 | public class DictionaryFileCache {
19 |
20 | private static final Logger logger = ESPluginLoggerFactory.getLogger(DictionaryFileCache.class.getName());
21 |
22 | private static Path cachePath = null;
23 |
24 | private static final String DICTIONARY_FILE_CACHE_RECORD_FILE = "hanlp.cache";
25 |
26 | private static List<DictionaryFile> customDictionaryFileList = new ArrayList<>();
27 |
28 | public static synchronized void configCachePath(Configuration configuration) {
29 | cachePath = configuration.getEnvironment().pluginsFile().resolve(AnalysisTexSmartPlugin.PLUGIN_NAME).resolve(DICTIONARY_FILE_CACHE_RECORD_FILE);
30 | }
31 |
32 | public static void loadCache() {
33 | File file = cachePath.toFile();
34 | if (!file.exists()) {
35 | return;
36 | }
37 | List<DictionaryFile> dictionaryFiles = AccessController.doPrivileged((PrivilegedAction<List<DictionaryFile>>) () -> {
38 | List<DictionaryFile> dictionaryFileList = new ArrayList<>();
39 | DataInputStream in = null;
40 | try {
41 | in = new DataInputStream(new FileInputStream(file));
42 | int size = in.readInt();
43 | for (int i = 0; i < size; i++) {
44 | DictionaryFile dictionaryFile = new DictionaryFile();
45 | dictionaryFile.read(in);
46 | dictionaryFileList.add(dictionaryFile);
47 | }
48 | } catch (IOException e) {
49 | logger.debug("can not load custom dictionary cache file", e);
50 | } finally {
51 | try {
52 | IOUtils.close(in);
53 | } catch (IOException e) {
54 | e.printStackTrace();
55 | }
56 | }
57 | return dictionaryFileList;
58 | });
59 | setCustomDictionaryFileList(dictionaryFiles);
60 | }
61 |
62 | public static void writeCache() {
63 | AccessController.doPrivileged((PrivilegedAction