├── .gitignore ├── LICENSE ├── README.md ├── build.sh ├── config ├── texsmart-remote.xml └── texsmart.properties ├── lib ├── .DS_Store ├── jna.jar ├── libtencent_ai_texsmart.so ├── tencent.ai.texsmart.jar ├── tencent_ai_texsmart.dll ├── tencent_ai_texsmart.lib ├── tencent_ai_texsmart.py └── tencent_ai_texsmart.pyc ├── pom.xml ├── settings.xml └── src └── main ├── assemblies └── plugin.xml ├── java ├── com │ └── texsmart │ │ ├── TexSmart.java │ │ ├── cfg │ │ └── Configuration.java │ │ ├── dic │ │ ├── Dictionary.java │ │ ├── DictionaryFile.java │ │ ├── ExtMonitor.java │ │ ├── RemoteMonitor.java │ │ ├── cache │ │ │ └── DictionaryFileCache.java │ │ ├── config │ │ │ ├── RemoteDictConfig.java │ │ │ └── TexSmartConfig.java │ │ └── stopword │ │ │ ├── Filter.java │ │ │ └── FilterStopWord.java │ │ ├── help │ │ ├── ESPluginLoggerFactory.java │ │ └── PrefixPluginLogger.java │ │ ├── lucene │ │ ├── PorterStemmer.java │ │ ├── SegmentWrapper.java │ │ ├── TexSmartAnalyzer.java │ │ ├── TexSmartIndexAnalyzer.java │ │ ├── TexSmartStandardAnalyzer.java │ │ ├── TexSmartTokenizer.java │ │ └── TokenizerBuilder.java │ │ ├── seg │ │ ├── Config.java │ │ ├── Segment.java │ │ └── TexSmartBasicSegment.java │ │ ├── tokenizer │ │ └── StandardTokenizer.java │ │ └── utility │ │ └── TextUtility.java ├── es-plugin.properties ├── org │ └── elasticsearch │ │ ├── index │ │ └── analysis │ │ │ ├── NerAlgType.java │ │ │ ├── PosAlgType.java │ │ │ ├── TexSmartAnalyzerProvider.java │ │ │ ├── TexSmartTokenizerFactory.java │ │ │ └── TexSmartType.java │ │ └── plugin │ │ └── analysis │ │ └── texsmart │ │ └── AnalysisTexSmartPlugin.java └── tencent │ └── ai │ └── texsmart │ ├── CLib.java │ ├── NluEngine.java │ └── NluOutput.java └── resources ├── plugin-descriptor.properties ├── plugin-security.policy ├── texsmart-remote.xml └── texsmart.properties /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | *.iws 4 | *.iml 5 | *.ipr 6 | target/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# elasticsearch-analysis-texsmart
TexSmart Analyzer for Elasticsearch

This plugin provides Chinese analysis for Elasticsearch, built on the Tencent AI Lab [TexSmart](https://ai.tencent.com/ailab/nlp/texsmart) Chinese segmentation library.

🚩 Changelog:
1. Adapted to Elasticsearch 7.x and TexSmart 0.1.3

----------

Version mapping
----------

| Plugin version | Elastic version |
| :------------- | :-------------- |
| master         | 7.x             |
| 7.6.2          | 7.6.2           |


Installation
----------

### 1. Download and install the plugin release matching your Elasticsearch version

Installation options:

Option 1

a. Download the matching release package; the latest release can be downloaded from GitHub (link: https://github.com/koios-sh/elasticsearch-analysis-texsmart/releases/download/v7.6.2/elasticsearch-analysis-texsmart-7.6.2.zip)

b. Install it with the following command, where PATH is the absolute path to the plugin package:

`./bin/elasticsearch-plugin install file://${PATH}`

Option 2

a. Install directly with the elasticsearch-plugin script:

`./bin/elasticsearch-plugin install https://github.com/koios-sh/elasticsearch-analysis-texsmart/releases/download/v7.6.2/elasticsearch-analysis-texsmart-7.6.2.zip`

Option 3

a. Build from source: sh build.sh
b. Install it with the following command, where PATH is the absolute path to the plugin package:

`./bin/elasticsearch-plugin install file://${PATH}`

### 2. Install the data package

The release package does not include the TexSmart data package. To download the full data package, see [TexSmart Release](https://ai.tencent.com/ailab/nlp/texsmart/zh/download.html).

Data directory: /etc/elasticsearch/texsmart/data
The data location can be adjusted by changing the `path` value in config/texsmart.properties.

### 3. Install libtencent_ai_texsmart.so

cp libtencent_ai_texsmart.so /usr/lib64 && chmod 777 /usr/lib64/libtencent_ai_texsmart.so

**Note: the steps above are required on every node.**

Provided analyzers
----------

texsmart: TexSmart default segmentation

texsmart_standard: standard segmentation

texsmart_index: index-mode segmentation

Example
----------

```text
POST http://localhost:9200/test/_analyze
{
    "text": "2020年,空调市场“冷风吹过”",
    "tokenizer": "texsmart_standard"
}
```

```json
{
    "tokens": [
        {
            "token": "2020",
            "start_offset": 0,
            "end_offset": 4,
            "type": "CD",
            "position": 0
        },
        {
            "token": "年",
            "start_offset": 4,
            "end_offset": 5,
            "type": "M",
            "position": 1
        },
        {
            "token": ",",
            "start_offset": 5,
            "end_offset": 6,
            "type": "PU",
            "position": 2
        },
        {
            "token": "空调",
            "start_offset": 6,
            "end_offset": 8,
            "type": "NN",
            "position": 3
        },
        {
            "token": "市场",
            "start_offset": 8,
            "end_offset": 10,
            "type": "NN",
            "position": 4
        },
        {
            "token": "“",
            "start_offset": 10,
            "end_offset": 11,
            "type": "PU",
            "position": 5
        },
        {
            "token": "冷风",
            "start_offset": 11,
            "end_offset": 13,
            "type": "NN",
            "position": 6
        },
        {
            "token": "吹过",
            "start_offset": 13,
            "end_offset": 15,
            "type": "VV",
            "position": 7
        },
        {
            "token": "”",
            "start_offset": 15,
            "end_offset": 16,
            "type": "PU",
            "position": 8
        }
    ]
}
```

- Make sure custom dictionaries are UTF-8 encoded
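An illustrative mapping example (the index and field names `my_index` and `content` are hypothetical, and this assumes the analyzers are registered under the same names as the tokenizers, as the bundled `TexSmartAnalyzerProvider` suggests): a common pattern is to segment finely at index time and use standard segmentation at search time:

```text
PUT my_index
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "texsmart_index",
        "search_analyzer": "texsmart_standard"
      }
    }
  }
}
```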
Custom tokenizer configuration
----------

Beyond the analyzers above, TexSmart exposes a set of segmentation options, and the plugin maps them to tokenizer settings, so a custom tokenizer can be defined with the following configuration:

| Config                 | Description                                             |
| :--------------------- | :------------------------------------------------------ |
| enable_index_mode      | whether to segment in index mode                        |
| enable_stop_dictionary | whether to enable the stop-word dictionary              |
| enable_offset          | whether to compute offsets                              |
| enable_pos_alg         | POS-tagging algorithm (log_linear (default), crf, dnn)  |
| enable_ner_alg         | NER algorithm (crf (default), dnn)                      |

Note: for the configuration above to filter Chinese and English punctuation, enable_stop_dictionary must be set to true.

For example:
```text
PUT test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_texsmart_analyzer": {
          "tokenizer": "my_texsmart"
        }
      },
      "tokenizer": {
        "my_texsmart": {
          "type": "texsmart",
          "enable_stop_dictionary": true,
          "enable_pos_alg": "log_linear",
          "enable_ner_alg": "crf"
        }
      }
    }
  }
}
```

```text
POST test/_analyze
{
  "text": "2020年,空调市场“冷风吹过”",
  "analyzer": "my_texsmart_analyzer"
}
```

Result:
```json
{
    "tokens": [
        {
            "token": "2020",
            "start_offset": 0,
            "end_offset": 4,
            "type": "CD",
            "position": 0
        },
        {
            "token": "年",
            "start_offset": 4,
            "end_offset": 5,
            "type": "M",
            "position": 1
        },
        {
            "token": "空调",
            "start_offset": 6,
            "end_offset": 8,
            "type": "NN",
            "position": 2
        },
        {
            "token": "市场",
            "start_offset": 8,
            "end_offset": 10,
            "type": "NN",
            "position": 3
        },
        {
            "token": "冷风",
            "start_offset": 11,
            "end_offset": 13,
            "type": "NN",
            "position": 4
        },
        {
            "token": "吹过",
            "start_offset": 13,
            "end_offset": 15,
            "type": "VV",
            "position": 5
        }
    ]
}
```

# Special notes
1. TexSmart does not yet officially support hot loading and updating of user words; reportedly the next release will.
Following the analysis-hanlp plugin, the code already integrates remote dictionaries and dynamic word reloading;
this feature will be enabled once Tencent ships an official release that supports it.

🚩 References:
[TexSmart](https://ai.tencent.com/ailab/nlp/texsmart)
[analysis-hanlp](https://github.com/KennFalcon/elasticsearch-analysis-hanlp)
-------------------------------------------------------------------------------- /build.sh: --------------------------------------------------------------------------------
mvn --settings=settings.xml -Dmaven.test.skip=true clean install
-------------------------------------------------------------------------------- /config/texsmart-remote.xml: --------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>TexSmart Analyzer extension configuration</comment>
    <!-- configure remote extension dictionaries here, e.g.: -->
    <!-- <entry key="remote_ext_dict">words_location</entry> -->
    <!-- configure remote extension stop-word dictionaries here, e.g.: -->
    <!-- <entry key="remote_ext_stopwords">stop_words_location</entry> -->
</properties>
-------------------------------------------------------------------------------- /config/texsmart.properties: --------------------------------------------------------------------------------
1 | root=.
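# Note: `path` below is the key the plugin actually reads at startup
# (TexSmart.java passes TexSmartConfig.getConfig().getProperty("path") to NluEngine.init),
# so it must point at the unpacked TexSmart data, e.g. /etc/elasticsearch/texsmart/data/nlu/kb/.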
2 | CoreDictionaryPath=data/nlu/kb/ 3 | CustomDictionaryPath=data/nlu/kb/customization/ 4 | 5 | path=/etc/elasticsearch/texsmart/data/nlu/kb/ -------------------------------------------------------------------------------- /lib/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/.DS_Store -------------------------------------------------------------------------------- /lib/jna.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/jna.jar -------------------------------------------------------------------------------- /lib/libtencent_ai_texsmart.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/libtencent_ai_texsmart.so -------------------------------------------------------------------------------- /lib/tencent.ai.texsmart.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent.ai.texsmart.jar -------------------------------------------------------------------------------- /lib/tencent_ai_texsmart.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent_ai_texsmart.dll -------------------------------------------------------------------------------- /lib/tencent_ai_texsmart.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent_ai_texsmart.lib -------------------------------------------------------------------------------- /lib/tencent_ai_texsmart.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from ctypes import * 3 | import os 4 | import sys 5 | 6 | my_dir_path = os.path.dirname(os.path.realpath(__file__)) + '/' 7 | dll_name = 'libtencent_ai_texsmart.so' 8 | if sys.platform.startswith("win"): 9 | dll_name = 'tencent_ai_texsmart.dll' 10 | elif sys.platform == "cygwin": 11 | dll_name = "tencent_ai_texsmart.dll" 12 | lib = cdll.LoadLibrary(my_dir_path + dll_name) 13 | 14 | class NluToken(Structure): 15 | _fields_ = [ 16 | ('str', c_wchar_p), 17 | ('offset', c_uint32), 18 | ('type', c_uint32), 19 | ] 20 | 21 | class NluTerm(Structure): 22 | _fields_ = [ 23 | ('str', c_wchar_p), 24 | ('offset', c_uint32), 25 | ('len', c_uint32), 26 | ('start_token', c_uint32), 27 | ('token_count', c_uint32), 28 | ('tag', c_wchar_p), 29 | ('tag_id', c_uint32), 30 | ] 31 | 32 | class NluEntityType(Structure): 33 | _fields_ = [ 34 | ('name', c_wchar_p), 35 | ('i18n', c_wchar_p), 36 | ('flag', c_uint32), 37 | ('path', c_wchar_p), 38 | ] 39 | 40 | class NluEntityTypeArray(Structure): 41 | _fields_ = [ 42 | ('size', c_uint32), 43 | ('items', POINTER(NluEntityType)), 44 | ] 45 | 46 | class NluEntity(Structure): 47 | _fields_ = [ 48 | ('str', c_wchar_p), 49 | ('offset', c_uint32), 50 | ('len', c_uint32), 51 | ('start_token', 
c_uint32), 52 | ('token_count', c_uint32), 53 | ('type', NluEntityType), 54 | ('alt_types', NluEntityTypeArray), 55 | ('meaning', c_wchar_p), 56 | ] 57 | 58 | class _NluTokenArray(Structure): 59 | _fields_ = [ 60 | ('size', c_uint32), 61 | ('items', POINTER(NluToken)), 62 | ] 63 | 64 | class _NluTermArray(Structure): 65 | _fields_ = [ 66 | ('size', c_uint32), 67 | ('items', POINTER(NluTerm)), 68 | ] 69 | 70 | class _NluEntityArray(Structure): 71 | _fields_ = [ 72 | ('size', c_uint32), 73 | ('items', POINTER(NluEntity)), 74 | ] 75 | 76 | lib.Nlu_CreateEngine.restype = c_void_p 77 | lib.Nlu_CreateEngine.argtypes = [c_char_p, c_int] 78 | lib.Nlu_DestroyEngine.argtypes = [c_void_p] 79 | lib.Nlu_ParseText.restype = c_void_p 80 | lib.Nlu_ParseText.argtypes = [c_void_p, c_wchar_p, c_int] 81 | lib.Nlu_ParseTextExt.restype = c_void_p 82 | lib.Nlu_ParseTextExt.argtypes = [c_void_p, c_wchar_p, c_int, c_wchar_p] 83 | lib.Nlu_DestroyOutput.argtypes = [c_void_p] 84 | lib.Nlu_GetNormText.restype = c_wchar_p 85 | lib.Nlu_GetNormText.argtypes = [c_void_p, POINTER(c_int)] 86 | lib.Nlu_GetTokens.restype = _NluTokenArray 87 | lib.Nlu_GetTokens.argtypes = [c_void_p] 88 | lib.Nlu_GetWords.restype = _NluTermArray 89 | lib.Nlu_GetWords.argtypes = [c_void_p] 90 | lib.Nlu_GetPhrases.restype = _NluTermArray 91 | lib.Nlu_GetPhrases.argtypes = [c_void_p] 92 | lib.Nlu_GetEntities.restype = _NluEntityArray 93 | lib.Nlu_GetEntities.argtypes = [c_void_p] 94 | 95 | class NluOutput(object): 96 | def __init__(self, ptr): 97 | self.obj = ptr 98 | def __del__(self): 99 | if(self.obj is not None): 100 | lib.Nlu_DestroyOutput(self.obj) 101 | self.obj = None 102 | def norm_text(self): 103 | ret = lib.Nlu_GetNormText(self.obj, None) 104 | return ret 105 | def tokens(self): 106 | arr = [] 107 | item_list = lib.Nlu_GetTokens(self.obj) 108 | for idx in range(item_list.size): 109 | arr.append(item_list.items[idx]) 110 | return arr 111 | def words(self): 112 | arr = [] 113 | item_list = lib.Nlu_GetWords(self.obj) 114 | for idx in range(item_list.size): 115 | arr.append(item_list.items[idx]) 116 | return arr 117 | def phrases(self): 118 | arr = [] 119 | item_list = lib.Nlu_GetPhrases(self.obj) 120 | for idx in range(item_list.size): 121 | arr.append(item_list.items[idx]) 122 | return arr 123 | def entities(self): 124 | arr = [] 125 | #count = lib.Nlu_GetEntityCount(self.obj) 126 | #for idx in range(count): 127 | # arr.append(lib.Nlu_GetEntity(slef.obj, idx)) 128 | item_list = lib.Nlu_GetEntities(self.obj) 129 | for idx in range(item_list.size): 130 | arr.append(item_list.items[idx]) 131 | return arr 132 | 133 | class NluEngine(object): 134 | def __init__(self, data_dir, worker_count): 135 | self.obj = lib.Nlu_CreateEngine(data_dir.encode('utf-8'), worker_count) 136 | def __del__(self): 137 | if self.obj is not None: 138 | lib.Nlu_DestroyEngine(self.obj) 139 | self.obj = None 140 | def parse_text(self, input_str): 141 | output_handle = lib.Nlu_ParseText(self.obj, c_wchar_p(input_str), len(input_str)) 142 | return NluOutput(output_handle) 143 | def parse_text_ext(self, input_str, options_str): 144 | output_handle = lib.Nlu_ParseTextExt(self.obj, c_wchar_p(input_str), len(input_str), c_wchar_p(options_str)) 145 | return NluOutput(output_handle) 146 | -------------------------------------------------------------------------------- /lib/tencent_ai_texsmart.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/koios-sh/elasticsearch-analysis-texsmart/29f1b109b9f78aaabf2bf81a2406ba81c9314d3b/lib/tencent_ai_texsmart.pyc -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | elasticsearch-analysis-texsmart 6 | org.elasticsearch 7 | elasticsearch-analysis-texsmart 8 | ${elasticsearch.version} 9 | jar 10 | TexSmart Analyzer for ElasticSearch 11 | 12 | 13 | 7.6.2 14 | 1.8 15 | UTF-8 16 | ${project.basedir}/src/main/assemblies/plugin.xml 17 | analysis-texsmart 18 | org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin 19 | true 20 | false 21 | true 22 | sdk-0.1.3 23 | 12 24 | 12 25 | 26 | 27 | 28 | 29 | org.elasticsearch 30 | elasticsearch 31 | ${elasticsearch.version} 32 | compile 33 | 34 | 35 | org.apache.httpcomponents 36 | httpclient 37 | 4.5.6 38 | 39 | 40 | org.apache.logging.log4j 41 | log4j-api 42 | 2.3 43 | compile 44 | 45 | 46 | org.hamcrest 47 | hamcrest-core 48 | 1.3.RC2 49 | test 50 | 51 | 52 | org.hamcrest 53 | hamcrest-library 54 | 1.3.RC2 55 | test 56 | 57 | 58 | junit 59 | junit 60 | 4.11 61 | test 62 | 63 | 64 | com.sun.jna 65 | com.sun.jna 66 | 1.0 67 | system 68 | ${project.basedir}/lib/jna.jar 69 | 70 | 71 | 72 | 73 | 74 | 75 | org.apache.maven.plugins 76 | maven-compiler-plugin 77 | 3.5.1 78 | 79 | ${maven.compiler.target} 80 | ${maven.compiler.target} 81 | 82 | 83 | 84 | org.apache.maven.plugins 85 | maven-surefire-plugin 86 | 2.11 87 | 88 | 89 | **/*Tests.java 90 | 91 | 92 | 93 | 94 | org.apache.maven.plugins 95 | maven-source-plugin 96 | 2.1.2 97 | 98 | 99 | attach-sources 100 | 101 | jar 102 | 103 | 104 | 105 | 106 | 107 | maven-assembly-plugin 108 | 109 | 110 | false 111 | ${project.build.directory}/releases/ 112 | 113 | ${basedir}/src/main/assemblies/plugin.xml 114 | 115 | 116 | 117 | fully.qualified.MainClass 118 | 119 | 120 | 121 | 122 | 123 | package 124 | 125 | single 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | lib 134 | BOOT-INF/lib/ 135 | 136 | **/*.jar 137 | 138 | 139 | 140 | 141 | 142 | 143 | disable-java8-doclint 144 | 145 | [1.8,) 146 | 147 | 148 | -Xdoclint:none 149 | 150 | 151 | 152 | release 153 | 154 | 155 | 156 | org.apache.maven.plugins 157 | maven-jar-plugin 158 | 3.1.2 159 | 160 | 161 | texsmart.properties 162 | 163 | 164 | 165 | 166 | org.apache.maven.plugins 167 | maven-compiler-plugin 168 | 3.8.0 169 | 170 | ${maven.compiler.target} 171 | ${maven.compiler.target} 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | aliyunmaven 7 | * 8 | 阿里云公共仓库 9 | https://maven.aliyun.com/repository/public 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | analysis-texsmart-release 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/config 11 | /config 12 | 13 | 14 | 15 | 16 | ${project.basedir}/src/main/resources/plugin-descriptor.properties 17 | / 18 | true 19 | 20 | 21 | ${project.basedir}/src/main/resources/plugin-security.policy 22 | / 23 | true 24 | 25 | 26 | ${project.basedir}/config/texsmart-remote.xml 27 | / 28 | true 29 | 30 | 31 | ${project.basedir}/config/texsmart.properties 32 | / 33 | true 34 | 35 | 
36 | 37 | 38 | / 39 | true 40 | true 41 | 42 | org.elasticsearch:elasticsearch 43 | 44 | 45 | 46 | / 47 | true 48 | true 49 | 50 | ${pom.basedir}/lib/jna.jar 51 | 52 | 53 | org.apache.lucene:lucene-core 54 | org.apache.lucene:lucene-analyzers-common 55 | org.apache.lucene:lucene-queryparser 56 | org.apache.lucene:lucene-sandbox 57 | 58 | 59 | 60 | / 61 | true 62 | true 63 | 64 | com.fasterxml.jackson.core:jackson-databind 65 | com.fasterxml.jackson.core:jackson-annotations 66 | 67 | 68 | com.fasterxml.jackson.core:jackson-core 69 | 70 | 71 | 72 | / 73 | true 74 | true 75 | 76 | org.apache.httpcomponents:httpclient 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/TexSmart.java: -------------------------------------------------------------------------------- 1 | package com.texsmart; 2 | 3 | import com.texsmart.dic.config.TexSmartConfig; 4 | import com.texsmart.help.ESPluginLoggerFactory; 5 | import com.texsmart.seg.Segment; 6 | import com.texsmart.seg.TexSmartBasicSegment; 7 | import com.texsmart.tokenizer.StandardTokenizer; 8 | import org.apache.logging.log4j.Logger; 9 | import tencent.ai.texsmart.NluEngine; 10 | import tencent.ai.texsmart.NluOutput.Term; 11 | 12 | import java.util.List; 13 | 14 | public class TexSmart { 15 | 16 | private static final Logger logger = ESPluginLoggerFactory.getLogger(TexSmart.class.getName()); 17 | 18 | public static NluEngine TEX_ENGINE; 19 | 20 | static { 21 | TEX_ENGINE = new NluEngine(); 22 | int workerCount = Runtime.getRuntime().availableProcessors(); 23 | logger.info("texsmart analysis is initializing"); 24 | boolean ret = TEX_ENGINE.init(TexSmartConfig.getConfig().getProperty("path"), workerCount); 25 | if (!ret) { 26 | logger.info("texsmart analysis load failed"); 27 | } else { 28 | logger.info("texsmart analysis load success"); 29 | } 30 | } 31 | 32 | private TexSmart() { 33 | } 34 | 35 | public static List segment(String text) { 36 | return StandardTokenizer.segment(text); 37 | } 38 | 39 | public static Segment newSegment() { 40 | return new TexSmartBasicSegment(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/cfg/Configuration.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.cfg; 2 | 3 | import com.texsmart.dic.Dictionary; 4 | import org.elasticsearch.common.inject.Inject; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.analysis.NerAlgType; 8 | import org.elasticsearch.index.analysis.PosAlgType; 9 | 10 | /** 11 | * @project: elasticsearch-analysis-texsmart 12 | * @description: 配置信息 13 | * @author: wei_liu 14 | * @create: 2020-09-09 15:10 15 | */ 16 | public class Configuration { 17 | 18 | private Environment environment; 19 | 20 | private Settings settings; 21 | 22 | private boolean enablePorterStemming; 23 | 24 | private boolean enableIndexMode; 25 | 26 | private boolean enableCustomDictionary; 27 | 28 | private boolean enableRemoteDict; 29 | 30 | private boolean enableNormalization; 31 | 32 | private boolean enableOffset; 33 | 34 | private boolean enableCustomConfig; 35 | 36 | private boolean enableStopDictionary; 37 | 38 | private PosAlgType enablePosAlg; 39 | private NerAlgType enableNerAlg; 40 | 41 | @Inject 42 | public Configuration(Environment env, Settings settings) { 43 | this.environment = env; 44 | this.settings = settings; 
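// The settings read below are the tokenizer options documented in the README:
// the boolean flags (enable_index_mode, enable_stop_dictionary, enable_offset, ...)
// are compared against "true", while enable_pos_alg / enable_ner_alg are mapped onto
// their enums, falling back to LOG_LINEAR / CRF when the value is absent or invalid.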
45 |         this.enablePorterStemming = settings.get("enable_porter_stemming", "false").equals("true");
46 |         this.enableIndexMode = settings.get("enable_index_mode", "false").equals("true");
47 |         this.enableCustomDictionary = settings.get("enable_custom_dictionary", "true").equals("true");
48 |         this.enableStopDictionary = settings.get("enable_stop_dictionary", "false").equals("true");
49 |         this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
50 |         this.enableNormalization = settings.get("enable_normalization", "false").equals("true");
51 |         this.enableOffset = settings.get("enable_offset", "true").equals("true");
52 |         this.enableCustomConfig = settings.get("enable_custom_config", "false").equals("true");
53 |         try {
54 |             this.enablePosAlg = PosAlgType.valueOf(settings.get("enable_pos_alg", "log_linear").toUpperCase()); // settings use lowercase values; enum constants are uppercase
55 |             this.enableNerAlg = NerAlgType.valueOf(settings.get("enable_ner_alg", "crf").toUpperCase());
56 |         } catch (IllegalArgumentException e) {
57 |             this.enablePosAlg = PosAlgType.LOG_LINEAR;
58 |             this.enableNerAlg = NerAlgType.CRF;
59 |         }
60 |         Dictionary.initial(this);
61 |     }
62 | 
63 |     public Environment getEnvironment() {
64 |         return this.environment;
65 |     }
66 | 
67 |     public Settings getSettings() {
68 |         return this.settings;
69 |     }
70 | 
71 |     public boolean isEnablePorterStemming() {
72 |         return this.enablePorterStemming;
73 |     }
74 | 
75 |     public Configuration enablePorterStemming(boolean enablePorterStemming) {
76 |         this.enablePorterStemming = enablePorterStemming;
77 |         return this;
78 |     }
79 | 
80 |     public boolean isEnableStopDictionary() {
81 |         return this.enableStopDictionary;
82 |     }
83 | 
84 |     public boolean isEnableIndexMode() {
85 |         return this.enableIndexMode;
86 |     }
87 | 
88 |     public Configuration enableIndexMode(boolean enableIndexMode) {
89 |         this.enableIndexMode = enableIndexMode;
90 |         return this;
91 |     }
92 | 
93 |     public boolean isEnableCustomDictionary() {
94 |         return this.enableCustomDictionary;
95 |     }
96 | 
97 |     public Configuration enableCustomDictionary(boolean enableCustomDictionary) {
98 |         this.enableCustomDictionary = enableCustomDictionary;
99 |         return this;
100 |     }
101 | 
102 |     public boolean isEnableRemoteDict() {
103 |         return enableRemoteDict;
104 |     }
105 | 
106 |     public Configuration enableRemoteDict(boolean enableRemoteDict) {
107 |         this.enableRemoteDict = enableRemoteDict;
108 |         return this;
109 |     }
110 | 
111 |     public boolean isEnableNormalization() {
112 |         return enableNormalization;
113 |     }
114 | 
115 |     public Configuration enableNormalization(boolean enableNormalization) {
116 |         this.enableNormalization = enableNormalization;
117 |         return this;
118 |     }
119 | 
120 |     public boolean isEnableOffset() {
121 |         return enableOffset;
122 |     }
123 | 
124 |     public Configuration enableOffset(boolean enableOffset) {
125 |         this.enableOffset = enableOffset;
126 |         return this;
127 |     }
128 | 
129 |     public boolean isEnableCustomConfig() {
130 |         return enableCustomConfig;
131 |     }
132 | 
133 |     public Configuration enableCustomConfig(boolean enableCustomConfig) {
134 |         this.enableCustomConfig = enableCustomConfig;
135 |         return this;
136 |     }
137 | 
138 |     public PosAlgType getEnablePosAlg() {
139 |         return this.enablePosAlg;
140 |     }
141 | 
142 |     public Configuration enablePosAlg(PosAlgType enablePosAlg) {
143 |         this.enablePosAlg = enablePosAlg;
144 |         return this;
145 |     }
146 | 
147 |     public NerAlgType getEnableNerAlg() {
148 |         return this.enableNerAlg;
149 |     }
150 | 
151 |     public Configuration enableNerAlg(NerAlgType enableNerAlg) {
152 |         this.enableNerAlg = enableNerAlg;
153 |         return this;
154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/Dictionary.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.dic; 2 | 3 | import com.texsmart.cfg.Configuration; 4 | import com.texsmart.dic.cache.DictionaryFileCache; 5 | import com.texsmart.dic.config.RemoteDictConfig; 6 | import org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin; 7 | 8 | import java.nio.file.Path; 9 | import java.util.concurrent.Executors; 10 | import java.util.concurrent.ScheduledExecutorService; 11 | import java.util.concurrent.TimeUnit; 12 | 13 | /** 14 | * @project: elasticsearch-analysis-texsmart 15 | * @description: 词典类 16 | * @author: wei_liu 17 | * @create: 2020-09-09 15:10 18 | */ 19 | public class Dictionary { 20 | /** 21 | * 词典单子实例 22 | */ 23 | private static Dictionary singleton; 24 | /** 25 | * TexSmart配置文件名 26 | */ 27 | public static final String CONFIG_FILE_NAME = "texsmart.properties"; 28 | /** 29 | * TexSmart远程词典配置文件名 30 | */ 31 | private static final String REMOTE_CONFIG_FILE_NAME = "texsmart-remote.xml"; 32 | 33 | private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1); 34 | 35 | private Dictionary(Configuration configuration) { 36 | Path configDir = configuration.getEnvironment().configFile().resolve(AnalysisTexSmartPlugin.PLUGIN_NAME); 37 | DictionaryFileCache.configCachePath(configuration); 38 | DictionaryFileCache.loadCache(); 39 | RemoteDictConfig.initial(configDir.resolve(REMOTE_CONFIG_FILE_NAME).toString()); 40 | } 41 | 42 | public static synchronized Dictionary initial(Configuration configuration) { 43 | if (singleton == null) { 44 | synchronized (Dictionary.class) { 45 | if (singleton == null) { 46 | singleton = new Dictionary(configuration); 47 | pool.scheduleAtFixedRate(new ExtMonitor(), 10, 60, TimeUnit.SECONDS); 48 | if (configuration.isEnableRemoteDict()) { 49 | for (String location : RemoteDictConfig.getSingleton().getRemoteExtDictionarys()) { 50 | pool.scheduleAtFixedRate(new RemoteMonitor(location, "custom"), 10, 60, TimeUnit.SECONDS); 51 | } 52 | 53 | for (String location : RemoteDictConfig.getSingleton().getRemoteExtStopWordDictionarys()) { 54 | pool.scheduleAtFixedRate(new RemoteMonitor(location, "stop"), 10, 60, TimeUnit.SECONDS); 55 | } 56 | } 57 | return singleton; 58 | } 59 | } 60 | } 61 | return singleton; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/DictionaryFile.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.dic; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.DataOutputStream; 5 | import java.io.IOException; 6 | import java.nio.charset.StandardCharsets; 7 | import java.util.Objects; 8 | 9 | /** 10 | * @project: elasticsearch-analysis-texsmart 11 | * @description: 自定义词典文件信息 12 | * @author: wei_liu 13 | * @create: 2020-09-09 15:10 14 | */ 15 | public class DictionaryFile { 16 | 17 | private String path; 18 | 19 | private String type; 20 | 21 | private long lastModified; 22 | 23 | public DictionaryFile() { 24 | } 25 | 26 | DictionaryFile(String path, long lastModified) { 27 | this.path = path; 28 | this.lastModified = lastModified; 29 | } 30 | 31 | DictionaryFile(String path, String type, long lastModified) { 32 | this(path, lastModified); 33 | this.type = type; 34 | } 35 | 36 | public String getPath() { 37 | return path; 38 | } 39 
| 40 | public void setPath(String path) { 41 | this.path = path; 42 | } 43 | 44 | public String getType() { 45 | return type; 46 | } 47 | 48 | public void setType(String type) { 49 | this.type = type; 50 | } 51 | 52 | public long getLastModified() { 53 | return lastModified; 54 | } 55 | 56 | public void setLastModified(long lastModified) { 57 | this.lastModified = lastModified; 58 | } 59 | 60 | public void write(DataOutputStream out) throws IOException { 61 | if (path != null && path.length() != 0) { 62 | byte[] bytes = path.getBytes(StandardCharsets.UTF_8); 63 | out.writeInt(bytes.length); 64 | out.write(bytes); 65 | } else { 66 | out.writeInt(0); 67 | } 68 | if (type != null && type.length() != 0) { 69 | byte[] bytes = type.getBytes(StandardCharsets.UTF_8); 70 | out.writeInt(bytes.length); 71 | out.write(bytes); 72 | } else { 73 | out.writeInt(0); 74 | } 75 | out.writeLong(lastModified); 76 | } 77 | 78 | public void read(DataInputStream in) throws IOException { 79 | int pathLength = in.readInt(); 80 | if (pathLength != 0) { 81 | byte[] bytes = new byte[pathLength]; 82 | in.read(bytes); 83 | path = new String(bytes, StandardCharsets.UTF_8); 84 | } 85 | 86 | int typeLength = in.readInt(); 87 | if (typeLength != 0) { 88 | byte[] bytes = new byte[typeLength]; 89 | in.read(bytes); 90 | type = new String(bytes, StandardCharsets.UTF_8); 91 | } 92 | lastModified = in.readLong(); 93 | } 94 | 95 | @Override 96 | public boolean equals(Object o) { 97 | if (this == o) { 98 | return true; 99 | } 100 | if (o == null || getClass() != o.getClass()) { 101 | return false; 102 | } 103 | DictionaryFile that = (DictionaryFile) o; 104 | return lastModified == that.lastModified && 105 | Objects.equals(path, that.path) && 106 | Objects.equals(type, that.type); 107 | } 108 | 109 | @Override 110 | public int hashCode() { 111 | return Objects.hash(path, type, lastModified); 112 | } 113 | 114 | @Override 115 | public String toString() { 116 | return "DictionaryFile{" + 117 | "path='" + path + '\'' + 118 | ", lastModified=" + lastModified + 119 | '}'; 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/ExtMonitor.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.dic; 2 | 3 | import com.texsmart.TexSmart; 4 | import com.texsmart.dic.cache.DictionaryFileCache; 5 | import com.texsmart.help.ESPluginLoggerFactory; 6 | import org.apache.logging.log4j.Logger; 7 | import org.elasticsearch.SpecialPermission; 8 | 9 | import java.io.File; 10 | import java.io.FileInputStream; 11 | import java.io.InputStreamReader; 12 | import java.security.AccessController; 13 | import java.security.PrivilegedAction; 14 | import java.util.ArrayList; 15 | import java.util.Arrays; 16 | import java.util.List; 17 | import java.util.Properties; 18 | 19 | /** 20 | * @project: elasticsearch-analysis-hanlp 21 | * @description: 自定义词典监控线程 22 | * @author: Kenn 23 | * @create: 2018-12-14 15:10 24 | */ 25 | public class ExtMonitor implements Runnable { 26 | 27 | private static final Logger logger = ESPluginLoggerFactory.getLogger(ExtMonitor.class.getName()); 28 | 29 | ExtMonitor() { 30 | SecurityManager sm = System.getSecurityManager(); 31 | if (sm != null) { 32 | sm.checkPermission(new SpecialPermission()); 33 | } 34 | } 35 | 36 | @Override 37 | public void run() { 38 | // List originalDictionaryFileList = DictionaryFileCache.getCustomDictionaryFileList(); 39 | // logger.debug("hanlp original custom dictionary: 
{}", Arrays.toString(originalDictionaryFileList.toArray())); 40 | // reloadProperty(); 41 | // List currentDictironaryFileList = getCurrentDictionaryFileList(TexSmart.Config.CustomDictionaryPath); 42 | // logger.debug("hanlp current custom dictionary: {}", Arrays.toString(currentDictironaryFileList.toArray())); 43 | // boolean isModified = false; 44 | // for (DictionaryFile currentDictionaryFile : currentDictironaryFileList) { 45 | // if (!originalDictionaryFileList.contains(currentDictionaryFile)) { 46 | // isModified = true; 47 | // break; 48 | // } 49 | // } 50 | // if (isModified) { 51 | // logger.info("reloading hanlp custom dictionary"); 52 | // try { 53 | // AccessController.doPrivileged((PrivilegedAction) CustomDictionaryUtility::reload); 54 | // } catch (Exception e) { 55 | // logger.error("can not reload hanlp custom dictionary", e); 56 | // } 57 | // DictionaryFileCache.setCustomDictionaryFileList(currentDictironaryFileList); 58 | // DictionaryFileCache.writeCache(); 59 | // logger.info("finish reload hanlp custom dictionary"); 60 | // } else { 61 | // logger.info("hanlp custom dictionary isn't modified, so no need reload"); 62 | // } 63 | } 64 | 65 | private void reloadProperty() { 66 | // Properties p = new Properties(); 67 | // try { 68 | // ClassLoader loader = AccessController.doPrivileged((PrivilegedAction) () -> Thread.currentThread().getContextClassLoader()); 69 | // if (loader == null) { 70 | // loader = HanLP.Config.class.getClassLoader(); 71 | // } 72 | // p.load(new InputStreamReader(Predefine.HANLP_PROPERTIES_PATH == null ? loader.getResourceAsStream("hanlp.properties") : new FileInputStream(Predefine.HANLP_PROPERTIES_PATH), "UTF-8")); 73 | // String root = p.getProperty("root", "").replaceAll("\\\\", "/"); 74 | // if (root.length() > 0 && !root.endsWith("/")) { 75 | // root += "/"; 76 | // } 77 | // String[] pathArray = p.getProperty("CustomDictionaryPath", "data/dictionary/custom/CustomDictionary.txt").split(";"); 78 | // String prePath = root; 79 | // for (int i = 0; i < pathArray.length; ++i) { 80 | // if (pathArray[i].startsWith(" ")) { 81 | // pathArray[i] = prePath + pathArray[i].trim(); 82 | // } else { 83 | // pathArray[i] = root + pathArray[i]; 84 | // int lastSplash = pathArray[i].lastIndexOf('/'); 85 | // if (lastSplash != -1) { 86 | // prePath = pathArray[i].substring(0, lastSplash + 1); 87 | // } 88 | // } 89 | // } 90 | // AccessController.doPrivileged((PrivilegedAction) () -> HanLP.Config.CustomDictionaryPath = pathArray); 91 | // } catch (Exception e) { 92 | // logger.error("can not find hanlp.properties", e); 93 | // } 94 | // } 95 | // 96 | // private List getCurrentDictionaryFileList(String[] customDictionaryPaths) { 97 | // List dictionaryFileList = new ArrayList<>(); 98 | // for (String customDictionaryPath : customDictionaryPaths) { 99 | // String[] customDictionaryPathTuple = customDictionaryPath.split(" "); 100 | // String path = customDictionaryPathTuple[0].trim(); 101 | // logger.debug("hanlp custom path: {}", path); 102 | // File file = new File(path); 103 | // AccessController.doPrivileged((PrivilegedAction) () -> { 104 | // if (file.exists()) { 105 | // if (customDictionaryPathTuple.length > 1) { 106 | // if (customDictionaryPathTuple[1] == null || customDictionaryPathTuple[1].length() == 0) { 107 | // dictionaryFileList.add(new DictionaryFile(path, file.lastModified())); 108 | // } else { 109 | // dictionaryFileList.add(new DictionaryFile(path, customDictionaryPathTuple[1].trim(), file.lastModified())); 110 | // } 111 | // } else { 112 
| // dictionaryFileList.add(new DictionaryFile(path, file.lastModified())); 113 | // } 114 | // } 115 | // return null; 116 | // }); 117 | // } 118 | // return dictionaryFileList; 119 | } 120 | } 121 | 122 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/RemoteMonitor.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.dic; 2 | 3 | import com.texsmart.help.ESPluginLoggerFactory; 4 | import org.apache.http.HttpStatus; 5 | import org.apache.http.client.config.RequestConfig; 6 | import org.apache.http.client.methods.CloseableHttpResponse; 7 | import org.apache.http.client.methods.HttpGet; 8 | import org.apache.http.client.methods.HttpHead; 9 | import org.apache.http.impl.client.CloseableHttpClient; 10 | import org.apache.http.impl.client.HttpClients; 11 | import org.apache.logging.log4j.Logger; 12 | import org.elasticsearch.SpecialPermission; 13 | import org.elasticsearch.common.collect.Tuple; 14 | import org.elasticsearch.core.internal.io.IOUtils; 15 | 16 | import java.io.BufferedReader; 17 | import java.io.IOException; 18 | import java.io.InputStreamReader; 19 | import java.nio.charset.Charset; 20 | import java.nio.charset.StandardCharsets; 21 | import java.security.AccessController; 22 | import java.security.PrivilegedAction; 23 | 24 | /** 25 | * @project: elasticsearch-analysis-hanlp 26 | * @description: 自定义远程词典监控线程 27 | * @author: Kenn 28 | * @create: 2018-12-14 15:10 29 | */ 30 | public class RemoteMonitor implements Runnable { 31 | 32 | private static final Logger logger = ESPluginLoggerFactory.getLogger(RemoteMonitor.class.getName()); 33 | 34 | private static CloseableHttpClient httpclient = HttpClients.createDefault(); 35 | /** 36 | * 上次更改时间 37 | */ 38 | private String last_modified; 39 | /** 40 | * 资源属性 41 | */ 42 | private String eTags; 43 | /** 44 | * 请求地址 45 | */ 46 | private String location; 47 | /** 48 | * 数据类型 49 | */ 50 | private String type; 51 | 52 | private static final String SPLITTER = "\\s"; 53 | 54 | public RemoteMonitor(String location, String type) { 55 | this.location = location; 56 | this.type = type; 57 | this.last_modified = null; 58 | this.eTags = null; 59 | } 60 | 61 | @Override 62 | public void run() { 63 | SpecialPermission.check(); 64 | AccessController.doPrivileged((PrivilegedAction) () -> { 65 | runUnprivileged(); 66 | return null; 67 | }); 68 | } 69 | 70 | /** 71 | * 监控流程: 72 | * ①向词库服务器发送Head请求 73 | * ②从响应中获取Last-Modify、ETags字段值,判断是否变化 74 | * ③如果未变化,休眠1min,返回第①步 75 | * ④如果有变化,重新加载词典 76 | * ⑤休眠1min,返回第①步 77 | */ 78 | 79 | private void runUnprivileged() { 80 | String path = location.split(SPLITTER)[0]; 81 | 82 | HttpHead head = new HttpHead(path); 83 | // head.setConfig(buildRequestConfig()); 84 | 85 | // 设置请求头 86 | if (last_modified != null) { 87 | head.setHeader("If-Modified-Since", last_modified); 88 | } 89 | if (eTags != null) { 90 | head.setHeader("If-None-Match", eTags); 91 | } 92 | 93 | CloseableHttpResponse response = null; 94 | try { 95 | response = httpclient.execute(head); 96 | if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { 97 | if ((response.getLastHeader("Last-Modified") != null) && !response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified)) { 98 | loadRemoteCustomWords(response); 99 | } else if ((response.getLastHeader("ETag") != null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags)) { 100 | loadRemoteCustomWords(response); 101 | } 102 | } else if 
(response.getStatusLine().getStatusCode() == HttpStatus.SC_NOT_MODIFIED) { 103 | logger.info("remote_ext_dict {} is without modified", location); 104 | } else { 105 | logger.info("remote_ext_dict {} return bad code {}", location, response.getStatusLine().getStatusCode()); 106 | } 107 | } catch (Exception e) { 108 | e.printStackTrace(); 109 | logger.error("remote_ext_dict {} error!", e, location); 110 | } finally { 111 | try { 112 | if (response != null) { 113 | response.close(); 114 | } 115 | } catch (IOException e) { 116 | logger.error(e.getMessage(), e); 117 | } 118 | } 119 | } 120 | 121 | /** 122 | * 加载远程自定义词典 123 | * 124 | * @param response header响应 125 | */ 126 | private void loadRemoteCustomWords(CloseableHttpResponse response) { 127 | switch (type) { 128 | case "custom": 129 | logger.info("load hanlp remote custom dict path: {}", location); 130 | loadRemoteWordsUnprivileged(location); 131 | logger.info("finish load hanlp remote custom dict path: {}", location); 132 | break; 133 | case "stop": 134 | logger.info("load hanlp remote stop words path: {}", location); 135 | // loadRemoteStopWordsUnprivileged(location); 136 | logger.info("finish load hanlp remote stop words path: {}", location); 137 | break; 138 | default: 139 | return; 140 | } 141 | last_modified = response.getLastHeader("Last-Modified") == null ? null : response.getLastHeader("Last-Modified").getValue(); 142 | eTags = response.getLastHeader("ETag") == null ? null : response.getLastHeader("ETag").getValue(); 143 | } 144 | 145 | /** 146 | * 从远程服务器上下载自定义词条 147 | * 148 | * @param location 配置条目 149 | */ 150 | private void loadRemoteWordsUnprivileged(String location) { 151 | // Tuple defaultInfo = analysisDefaultInfo(location); 152 | // CloseableHttpClient httpclient = HttpClients.createDefault(); 153 | // CloseableHttpResponse response = null; 154 | // BufferedReader in = null; 155 | // HttpGet get = new HttpGet(defaultInfo.v1()); 156 | // get.setConfig(buildRequestConfig()); 157 | // try { 158 | // response = httpclient.execute(get); 159 | // if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { 160 | // in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), analysisDefaultCharset(response))); 161 | // String line; 162 | // boolean firstLine = true; 163 | // while ((line = in.readLine()) != null) { 164 | // if (firstLine) { 165 | // line = IOUtil.removeUTF8BOM(line); 166 | // firstLine = false; 167 | // } 168 | // 169 | // // 切分 170 | // String[] param = line.split(SPLITTER); 171 | // String word = param[0]; 172 | // 173 | // // 排除空行 174 | // if (word.length() == 0) { 175 | // continue; 176 | // } 177 | // 178 | // // 正规化 179 | // if (HanLP.Config.Normalization) { 180 | // word = CharTable.convert(word); 181 | // } 182 | // logger.debug("hanlp remote custom word: {}", word); 183 | // CustomDictionary.insert(word, analysisNatureWithFrequency(defaultInfo.v2(), param)); 184 | // } 185 | // in.close(); 186 | // response.close(); 187 | // } 188 | // response.close(); 189 | // } catch (IllegalStateException | IOException e) { 190 | // logger.error("get remote words {} error", e, location); 191 | // } finally { 192 | // try { 193 | // IOUtils.close(in); 194 | // IOUtils.close(response); 195 | // } catch (Exception e) { 196 | // e.printStackTrace(); 197 | // } 198 | // } 199 | // } 200 | // 201 | // /** 202 | // * 从远程服务器上下载停止词词条 203 | // * 204 | // * @param location 配置条目 205 | // */ 206 | // private void loadRemoteStopWordsUnprivileged(String location) { 207 | // CloseableHttpClient httpclient 
= HttpClients.createDefault(); 208 | // CloseableHttpResponse response = null; 209 | // BufferedReader in = null; 210 | // HttpGet get = new HttpGet(location); 211 | // get.setConfig(buildRequestConfig()); 212 | // try { 213 | // response = httpclient.execute(get); 214 | // if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { 215 | // in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), analysisDefaultCharset(response))); 216 | // String line; 217 | // boolean firstLine = true; 218 | // while ((line = in.readLine()) != null) { 219 | // if (firstLine) { 220 | // line = IOUtil.removeUTF8BOM(line); 221 | // firstLine = false; 222 | // } 223 | // logger.debug("hanlp remote stop word: {}", line); 224 | // CoreStopWordDictionary.add(line); 225 | // } 226 | // in.close(); 227 | // response.close(); 228 | // } 229 | // response.close(); 230 | // } catch (IllegalStateException | IOException e) { 231 | // logger.error("get remote words {} error", e, location); 232 | // } finally { 233 | // try { 234 | // IOUtils.close(in); 235 | // IOUtils.close(response); 236 | // } catch (Exception e) { 237 | // e.printStackTrace(); 238 | // } 239 | // } 240 | // } 241 | // 242 | // private RequestConfig buildRequestConfig() { 243 | // return RequestConfig.custom() 244 | // .setConnectionRequestTimeout(10 * 1000) 245 | // .setConnectTimeout(10 * 1000) 246 | // .setSocketTimeout(60 * 1000) 247 | // .build(); 248 | // } 249 | // 250 | // /** 251 | // * 分析默认编码 252 | // * 253 | // * @param response 响应 254 | // * @return 返回编码 255 | // */ 256 | // private Charset analysisDefaultCharset(CloseableHttpResponse response) { 257 | // Charset charset = StandardCharsets.UTF_8; 258 | // // 获取编码,默认为utf-8 259 | // if (response.getEntity().getContentType().getValue().contains("charset=")) { 260 | // String contentType = response.getEntity().getContentType().getValue(); 261 | // charset = Charset.forName(contentType.substring(contentType.lastIndexOf("=") + 1)); 262 | // } 263 | // return charset; 264 | // } 265 | // 266 | // /** 267 | // * 解析默认信息 268 | // * 269 | // * @param location 配置路径 270 | // * @return 返回new Tuple<路径, 默认词性> 271 | // */ 272 | // private Tuple analysisDefaultInfo(String location) { 273 | // Nature defaultNature = Nature.n; 274 | // String path = location; 275 | // int cut = location.indexOf(' '); 276 | // if (cut > 0) { 277 | // // 有默认词性 278 | // String nature = location.substring(cut + 1); 279 | // path = location.substring(0, cut); 280 | // defaultNature = LexiconUtility.convertStringToNature(nature); 281 | // } 282 | // return Tuple.tuple(path, defaultNature); 283 | // } 284 | // 285 | // /** 286 | // * 分析词性和频次 287 | // * 288 | // * @param defaultNature 默认词性 289 | // * @param param 行数据 290 | // * @return 返回[单词] [词性A] [A的频次] [词性B] [B的频次] ... 
291 | // */
292 | // private String analysisNatureWithFrequency(Nature defaultNature, String[] param) {
293 | // int natureCount = (param.length - 1) / 2;
294 | // StringBuilder builder = new StringBuilder();
295 | // if (natureCount == 0) {
296 | // builder.append(defaultNature).append(" ").append(1000);
297 | // } else {
298 | // for (int i = 0; i < natureCount; ++i) {
299 | // Nature nature = LexiconUtility.convertStringToNature(param[1 + 2 * i]);
300 | // int frequency = Integer.parseInt(param[2 + 2 * i]);
301 | // builder.append(nature).append(" ").append(frequency);
302 | // if (i != natureCount - 1) {
303 | // builder.append(" ");
304 | // }
305 | // }
306 | // }
307 | // return builder.toString();
308 | }
309 | }
310 |
311 |
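The monitor above drives its reload decision off the Last-Modified and ETag response headers: a 200 means the dictionary changed, a 304 means nothing to do. A minimal sketch of that conditional-request handshake with Apache HttpClient follows; the URL is a placeholder, and a HEAD request is used only to keep the probe cheap (the monitor would fetch and reload the body on 200):

    import org.apache.http.HttpStatus;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpHead;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;

    public class ConditionalRequestSketch {
        public static void main(String[] args) throws Exception {
            String location = "http://example.com/remote_ext_dict.txt"; // hypothetical URL
            String lastModified = null;                                 // cached from the previous poll
            String eTag = null;
            try (CloseableHttpClient client = HttpClients.createDefault()) {
                HttpHead head = new HttpHead(location);
                if (lastModified != null) head.setHeader("If-Modified-Since", lastModified);
                if (eTag != null) head.setHeader("If-None-Match", eTag);
                try (CloseableHttpResponse response = client.execute(head)) {
                    int code = response.getStatusLine().getStatusCode();
                    if (code == HttpStatus.SC_OK) {
                        // 200: dictionary changed; reload it and remember the new validators
                        lastModified = response.getLastHeader("Last-Modified") == null
                                ? null : response.getLastHeader("Last-Modified").getValue();
                        eTag = response.getLastHeader("ETag") == null
                                ? null : response.getLastHeader("ETag").getValue();
                    } else if (code == HttpStatus.SC_NOT_MODIFIED) {
                        // 304: unchanged; wait for the next poll
                    }
                }
            }
        }
    }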
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/cache/DictionaryFileCache.java: --------------------------------------------------------------------------------
1 | package com.texsmart.dic.cache;
2 |
3 | import com.texsmart.cfg.Configuration;
4 | import com.texsmart.dic.DictionaryFile;
5 | import com.texsmart.help.ESPluginLoggerFactory;
6 | import org.apache.logging.log4j.Logger;
7 | import org.elasticsearch.core.internal.io.IOUtils;
8 | import org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin;
9 |
10 | import java.io.*;
11 | import java.nio.file.Path;
12 | import java.security.AccessController;
13 | import java.security.PrivilegedAction;
14 | import java.util.ArrayList;
15 | import java.util.Arrays;
16 | import java.util.List;
17 |
18 | public class DictionaryFileCache {
19 |
20 | private static final Logger logger = ESPluginLoggerFactory.getLogger(DictionaryFileCache.class.getName());
21 |
22 | private static Path cachePath = null;
23 |
24 | private static final String DICTIONARY_FILE_CACHE_RECORD_FILE = "hanlp.cache";
25 |
26 | private static List<DictionaryFile> customDictionaryFileList = new ArrayList<>();
27 |
28 | public static synchronized void configCachePath(Configuration configuration) {
29 | cachePath = configuration.getEnvironment().pluginsFile().resolve(AnalysisTexSmartPlugin.PLUGIN_NAME).resolve(DICTIONARY_FILE_CACHE_RECORD_FILE);
30 | }
31 |
32 | public static void loadCache() {
33 | File file = cachePath.toFile();
34 | if (!file.exists()) {
35 | return;
36 | }
37 | List<DictionaryFile> dictionaryFiles = AccessController.doPrivileged((PrivilegedAction<List<DictionaryFile>>) () -> {
38 | List<DictionaryFile> dictionaryFileList = new ArrayList<>();
39 | DataInputStream in = null;
40 | try {
41 | in = new DataInputStream(new FileInputStream(file));
42 | int size = in.readInt();
43 | for (int i = 0; i < size; i++) {
44 | DictionaryFile dictionaryFile = new DictionaryFile();
45 | dictionaryFile.read(in);
46 | dictionaryFileList.add(dictionaryFile);
47 | }
48 | } catch (IOException e) {
49 | logger.debug("can not load custom dictionary cache file", e);
50 | } finally {
51 | try {
52 | IOUtils.close(in);
53 | } catch (IOException e) {
54 | e.printStackTrace();
55 | }
56 | }
57 | return dictionaryFileList;
58 | });
59 | setCustomDictionaryFileList(dictionaryFiles);
60 | }
61 |
62 | public static void writeCache() {
63 | AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
64 | DataOutputStream out = null;
65 | try {
66 | logger.info("begin writing texsmart custom dictionary file cache, file path: {}, custom dictionary file list: {}", cachePath.toFile().getAbsolutePath(), Arrays.toString(customDictionaryFileList.toArray()));
67 | out = new DataOutputStream(new FileOutputStream(cachePath.toFile()));
68 | out.writeInt(customDictionaryFileList.size());
69 | for (DictionaryFile dictionaryFile : customDictionaryFileList) {
70 | dictionaryFile.write(out);
71 | }
72 | logger.info("wrote texsmart custom dictionary file cache successfully");
73 | } catch (IOException e) {
74 | logger.debug("can not write texsmart custom dictionary file cache", e);
75 | } finally {
76 | try {
77 | IOUtils.close(out);
78 | } catch (IOException e) {
79 | e.printStackTrace();
80 | }
81 | }
82 | return null;
83 | });
84 | }
85 |
86 | public static List<DictionaryFile> getCustomDictionaryFileList() {
87 | return customDictionaryFileList;
88 | }
89 |
90 | public static synchronized void setCustomDictionaryFileList(List<DictionaryFile> customDictionaryFileList) {
91 | DictionaryFileCache.customDictionaryFileList = customDictionaryFileList;
92 | }
93 | }
94 |
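The cache file written above is a plain DataOutputStream record: an int entry count followed by each entry, with DictionaryFile's own read/write (not shown in this section) handling the per-entry fields. A self-contained sketch of the same count-prefixed layout, using strings in place of DictionaryFile for illustration:

    import java.io.*;
    import java.util.Arrays;
    import java.util.List;

    public class CountPrefixedRecordSketch {
        public static void main(String[] args) throws IOException {
            File cache = File.createTempFile("texsmart", ".cache");
            List<String> entries = Arrays.asList("dict_a.txt", "dict_b.txt");
            try (DataOutputStream out = new DataOutputStream(new FileOutputStream(cache))) {
                out.writeInt(entries.size());             // entry count first
                for (String e : entries) out.writeUTF(e); // then each entry
            }
            try (DataInputStream in = new DataInputStream(new FileInputStream(cache))) {
                int size = in.readInt();                  // mirrors loadCache()
                for (int i = 0; i < size; i++) System.out.println(in.readUTF());
            }
        }
    }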
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/dic/config/RemoteDictConfig.java: --------------------------------------------------------------------------------
1 | package com.texsmart.dic.config;
2 |
3 | import com.texsmart.dic.Dictionary;
4 | import com.texsmart.help.ESPluginLoggerFactory;
5 | import org.apache.logging.log4j.Logger;
6 | import org.elasticsearch.core.internal.io.IOUtils;
7 |
8 | import java.io.FileInputStream;
9 | import java.io.FileNotFoundException;
10 | import java.io.IOException;
11 | import java.io.InputStream;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | import java.util.Properties;
15 |
16 | /**
17 | * @project: elasticsearch-analysis-texsmart
18 | * @description: Remote dictionary configuration
19 | * @author: Kenn
20 | * @create: 2018-12-18 15:23
21 | */
22 | public class RemoteDictConfig {
23 |
24 | /**
25 | * Singleton instance of the remote dictionary configuration
26 | */
27 | private static RemoteDictConfig singleton;
28 |
29 | private static final Logger logger = ESPluginLoggerFactory.getLogger(RemoteDictConfig.class.getName());
30 |
31 | private static final String REMOTE_EXT_DICT = "remote_ext_dict";
32 |
33 | private static final String REMOTE_EXT_STOP = "remote_ext_stopwords";
34 |
35 | private Properties props;
36 |
37 | private String configFile;
38 |
39 | private RemoteDictConfig(String configFile) {
40 | this.configFile = configFile;
41 | this.props = new Properties();
42 | loadConfig();
43 | }
44 |
45 | public static synchronized RemoteDictConfig initial(String configFile) {
46 | if (singleton == null) {
47 | synchronized (Dictionary.class) {
48 | if (singleton == null) {
49 | singleton = new RemoteDictConfig(configFile);
50 | }
51 | }
52 | }
53 | return singleton;
54 | }
55 |
56 | public boolean loadConfig() {
57 | InputStream input = null;
58 | try {
59 | logger.info("try to load remote texsmart config from {}", configFile);
60 | input = new FileInputStream(configFile);
61 | props.loadFromXML(input);
62 | } catch (FileNotFoundException e) {
63 | logger.error("remote texsmart config does not exist", e);
64 | return false;
65 | } catch (Exception e) {
66 | logger.error("can not load remote texsmart config", e);
67 | return false;
68 | } finally {
69 | try {
70 | IOUtils.close(input);
71 | } catch (IOException e) {
72 | e.printStackTrace();
73 | }
74 | }
75 | return true;
76 | }
77 |
78 | public List<String> getRemoteExtDictionarys() {
79 | return getRemoteExtFiles(REMOTE_EXT_DICT);
80 | }
81 |
82 | public List<String> getRemoteExtStopWordDictionarys() {
83 | return getRemoteExtFiles(REMOTE_EXT_STOP);
84 | }
85 |
86 | private List<String> getRemoteExtFiles(String key) {
87 | List<String> remoteExtFiles = new ArrayList<>(2);
88 | String remoteExtCfg = getProperty(key);
89 | if (remoteExtCfg != null) {
90 | String[] filePaths = remoteExtCfg.split(";");
91 | for (String filePath : filePaths) {
92 | if (filePath != null && !"".equals(filePath.trim())) {
93 | remoteExtFiles.add(filePath);
94 | }
95 | }
96 | }
97 | return remoteExtFiles;
98 | }
99 |
100 | private String getProperty(String key) {
101 | if (props != null) {
102 | return props.getProperty(key);
103 | }
104 | return null;
105 | }
106 |
107 | /**
108 | * Get the remote dictionary configuration instance
109 | *
110 | * @return the RemoteDictConfig singleton
111 | */
112 | public static RemoteDictConfig getSingleton() {
113 | if (singleton == null) {
114 | throw new IllegalStateException("Remote dictionary configuration has not been initialized; call initial() first");
115 | }
116 | return singleton;
117 | }
118 | }
119 |
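loadConfig() reads a java.util.Properties XML document (texsmart-remote.xml in this repo), and getRemoteExtFiles() splits each value on ";". A sketch of that round trip against an inline document; the URLs are placeholders:

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.Properties;

    public class RemoteDictConfigSketch {
        public static void main(String[] args) throws Exception {
            String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                    + "<!DOCTYPE properties SYSTEM \"http://java.sun.com/dtd/properties.dtd\">\n"
                    + "<properties>\n"
                    + "  <entry key=\"remote_ext_dict\">http://example.com/dict_a.txt;http://example.com/dict_b.txt</entry>\n"
                    + "  <entry key=\"remote_ext_stopwords\"></entry>\n"
                    + "</properties>\n";
            Properties props = new Properties();
            props.loadFromXML(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
            // Same semicolon-separated convention as getRemoteExtFiles()
            for (String path : props.getProperty("remote_ext_dict").split(";")) {
                if (!path.trim().isEmpty()) System.out.println(path);
            }
        }
    }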
term.tag : "空"; 11 | return nature.equals("PU"); 12 | }; 13 | 14 | public static FilterStopWord getInstance() { 15 | return ourInstance; 16 | } 17 | 18 | private FilterStopWord() {} 19 | 20 | public static boolean beRemove(Term term) { 21 | return FILTER.beRemove(term); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/help/ESPluginLoggerFactory.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.help; 2 | 3 | import org.apache.logging.log4j.LogManager; 4 | import org.apache.logging.log4j.Logger; 5 | import org.apache.logging.log4j.spi.ExtendedLogger; 6 | 7 | /** 8 | * @project: elasticsearch-analysis-texsmart 9 | * @description: logger 10 | * @author: wei_liu 11 | * @create: 2020-09-09 15:10 12 | */ 13 | public class ESPluginLoggerFactory { 14 | 15 | private ESPluginLoggerFactory() { 16 | } 17 | 18 | static public Logger getLogger(String name) { 19 | return getLogger("", LogManager.getLogger(name)); 20 | } 21 | 22 | static public Logger getLogger(String prefix, String name) { 23 | return getLogger(prefix, LogManager.getLogger(name)); 24 | } 25 | 26 | static public Logger getLogger(String prefix, Class clazz) { 27 | return getLogger(prefix, LogManager.getLogger(clazz.getName())); 28 | } 29 | 30 | static public Logger getLogger(String prefix, Logger logger) { 31 | return (Logger)(prefix != null && prefix.length() != 0 ? new PrefixPluginLogger((ExtendedLogger)logger, logger.getName(), prefix) : logger); 32 | } 33 | } 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/help/PrefixPluginLogger.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.help; 2 | 3 | import org.apache.logging.log4j.Level; 4 | import org.apache.logging.log4j.Marker; 5 | import org.apache.logging.log4j.MarkerManager; 6 | import org.apache.logging.log4j.message.Message; 7 | import org.apache.logging.log4j.spi.ExtendedLogger; 8 | import org.apache.logging.log4j.spi.ExtendedLoggerWrapper; 9 | 10 | import java.util.WeakHashMap; 11 | 12 | /** 13 | * @project: elasticsearch-analysis-texsmart 14 | * @description: logger wrapper 15 | * @author: wei_liu 16 | * @create: 2020-09-09 15:10 17 | */ 18 | public class PrefixPluginLogger extends ExtendedLoggerWrapper { 19 | 20 | private static final WeakHashMap MARKERS = new WeakHashMap<>(); 21 | 22 | private final Marker marker; 23 | 24 | static int markersSize() { 25 | return MARKERS.size(); 26 | } 27 | 28 | public String prefix() { 29 | return this.marker.getName(); 30 | } 31 | 32 | PrefixPluginLogger(ExtendedLogger logger, String name, String prefix) { 33 | super(logger, name, null); 34 | String actualPrefix = prefix == null ? 
"" : prefix; 35 | MarkerManager.Log4jMarker actualMarker; 36 | synchronized (MARKERS) { 37 | MarkerManager.Log4jMarker maybeMarker = (MarkerManager.Log4jMarker)MARKERS.get(actualPrefix); 38 | if (maybeMarker == null) { 39 | actualMarker = new MarkerManager.Log4jMarker(actualPrefix); 40 | MARKERS.put(actualPrefix, actualMarker); 41 | } else { 42 | actualMarker = maybeMarker; 43 | } 44 | } 45 | this.marker = actualMarker; 46 | } 47 | 48 | @Override 49 | public void logMessage(String fqcn, Level level, Marker marker, Message message, Throwable t) { 50 | assert marker == null; 51 | super.logMessage(fqcn, level, this.marker, message, t); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/PorterStemmer.java: -------------------------------------------------------------------------------- 1 | package com.texsmart.lucene; 2 | 3 | import org.apache.lucene.util.ArrayUtil; 4 | 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | 9 | /** 10 | * 抄袭lucene的英文处理 11 | * Stemmer, implementing the Porter Stemming Algorithm 12 | *

13 | * The Stemmer class transforms a word into its root form. The input word can be 14 | * provided a character at time (by calling add()), or at once by calling one of 15 | * the various stem(something) methods. 16 | */ 17 | 18 | public class PorterStemmer 19 | { 20 | private char[] b; 21 | private int i, /* offset into b */ 22 | j, k, k0; 23 | private boolean dirty = false; 24 | private static final int INITIAL_SIZE = 50; 25 | 26 | public PorterStemmer() 27 | { 28 | b = new char[INITIAL_SIZE]; 29 | i = 0; 30 | } 31 | 32 | /** 33 | * reset() resets the stemmer so it can stem another word. If you invoke the 34 | * stemmer by calling add(char) and then stem(), you must call reset() 35 | * before starting another word. 36 | */ 37 | public void reset() 38 | { 39 | i = 0; 40 | dirty = false; 41 | } 42 | 43 | /** 44 | * Add a character to the word being stemmed. When you are finished adding 45 | * characters, you can call stem(void) to process the word. 46 | */ 47 | public void add(char ch) 48 | { 49 | if (b.length <= i) 50 | { 51 | b = ArrayUtil.grow(b, i + 1); 52 | } 53 | b[i++] = ch; 54 | } 55 | 56 | /** 57 | * After a word has been stemmed, it can be retrieved by toString(), or a 58 | * reference to the internal buffer can be retrieved by getResultBuffer and 59 | * getResultLength (which is generally more efficient.) 60 | */ 61 | @Override 62 | public String toString() 63 | { 64 | return new String(b, 0, i); 65 | } 66 | 67 | /** 68 | * Returns the length of the word resulting from the stemming process. 69 | */ 70 | public int getResultLength() 71 | { 72 | return i; 73 | } 74 | 75 | /** 76 | * Returns a reference to a character buffer containing the results of the 77 | * stemming process. You also need to consult getResultLength() to determine 78 | * the length of the result. 79 | */ 80 | public char[] getResultBuffer() 81 | { 82 | return b; 83 | } 84 | 85 | /* cons(i) is true <=> b[i] is a consonant. */ 86 | 87 | private final boolean cons(int i) 88 | { 89 | switch (b[i]) 90 | { 91 | case 'a': 92 | case 'e': 93 | case 'i': 94 | case 'o': 95 | case 'u': 96 | return false; 97 | case 'y': 98 | return (i == k0) ? true : !cons(i - 1); 99 | default: 100 | return true; 101 | } 102 | } 103 | 104 | /* 105 | * m() measures the number of consonant sequences between k0 and j. if c is 106 | * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary 107 | * presence, 108 | * 109 | * gives 0 vc gives 1 vcvc gives 2 vcvcvc gives 3 110 | * .... 111 | */ 112 | 113 | private final int m() 114 | { 115 | int n = 0; 116 | int i = k0; 117 | while (true) 118 | { 119 | if (i > j) 120 | return n; 121 | if (!cons(i)) 122 | break; 123 | i++; 124 | } 125 | i++; 126 | while (true) 127 | { 128 | while (true) 129 | { 130 | if (i > j) 131 | return n; 132 | if (cons(i)) 133 | break; 134 | i++; 135 | } 136 | i++; 137 | n++; 138 | while (true) 139 | { 140 | if (i > j) 141 | return n; 142 | if (!cons(i)) 143 | break; 144 | i++; 145 | } 146 | i++; 147 | } 148 | } 149 | 150 | /* vowelinstem() is true <=> k0,...j contains a vowel */ 151 | 152 | private final boolean vowelinstem() 153 | { 154 | int i; 155 | for (i = k0; i <= j; i++) 156 | if (!cons(i)) 157 | return true; 158 | return false; 159 | } 160 | 161 | /* doublec(j) is true <=> j,(j-1) contain a double consonant. 
*/ 162 | 163 | private final boolean doublec(int j) 164 | { 165 | if (j < k0 + 1) 166 | return false; 167 | if (b[j] != b[j - 1]) 168 | return false; 169 | return cons(j); 170 | } 171 | 172 | /* 173 | * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant 174 | * and also if the second c is not w,x or y. this is used when trying to 175 | * restore an e at the end of a short word. e.g. 176 | * 177 | * cav(e), lov(e), hop(e), crim(e), but snow, box, tray. 178 | */ 179 | 180 | private final boolean cvc(int i) 181 | { 182 | if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) 183 | return false; 184 | else 185 | { 186 | int ch = b[i]; 187 | if (ch == 'w' || ch == 'x' || ch == 'y') 188 | return false; 189 | } 190 | return true; 191 | } 192 | 193 | private final boolean ends(String s) 194 | { 195 | int l = s.length(); 196 | int o = k - l + 1; 197 | if (o < k0) 198 | return false; 199 | for (int i = 0; i < l; i++) 200 | if (b[o + i] != s.charAt(i)) 201 | return false; 202 | j = k - l; 203 | return true; 204 | } 205 | 206 | /* 207 | * setto(s) sets (j+1),...k to the characters in the string s, readjusting 208 | * k. 209 | */ 210 | 211 | void setto(String s) 212 | { 213 | int l = s.length(); 214 | int o = j + 1; 215 | for (int i = 0; i < l; i++) 216 | b[o + i] = s.charAt(i); 217 | k = j + l; 218 | dirty = true; 219 | } 220 | 221 | /* r(s) is used further down. */ 222 | 223 | void r(String s) 224 | { 225 | if (m() > 0) 226 | setto(s); 227 | } 228 | 229 | /* 230 | * step1() gets rid of plurals and -ed or -ing. e.g. 231 | * 232 | * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat 233 | * 234 | * feed -> feed agreed -> agree disabled -> disable 235 | * 236 | * matting -> mat mating -> mate meeting -> meet milling -> mill messing -> 237 | * mess 238 | * 239 | * meetings -> meet 240 | */ 241 | 242 | private final void step1() 243 | { 244 | if (b[k] == 's') 245 | { 246 | if (ends("sses")) 247 | k -= 2; 248 | else if (ends("ies")) 249 | setto("i"); 250 | else if (b[k - 1] != 's') 251 | k--; 252 | } 253 | if (ends("eed")) 254 | { 255 | if (m() > 0) 256 | k--; 257 | } 258 | else if ((ends("ed") || ends("ing")) && vowelinstem()) 259 | { 260 | k = j; 261 | if (ends("at")) 262 | setto("ate"); 263 | else if (ends("bl")) 264 | setto("ble"); 265 | else if (ends("iz")) 266 | setto("ize"); 267 | else if (doublec(k)) 268 | { 269 | int ch = b[k--]; 270 | if (ch == 'l' || ch == 's' || ch == 'z') 271 | k++; 272 | } 273 | else if (m() == 1 && cvc(k)) 274 | setto("e"); 275 | } 276 | } 277 | 278 | /* step2() turns terminal y to i when there is another vowel in the stem. */ 279 | 280 | private final void step2() 281 | { 282 | if (ends("y") && vowelinstem()) 283 | { 284 | b[k] = 'i'; 285 | dirty = true; 286 | } 287 | } 288 | 289 | /* 290 | * step3() maps double suffices to single ones. so -ization ( = -ize plus 291 | * -ation) maps to -ize etc. note that the string before the suffix must 292 | * give m() > 0. 
293 | */ 294 | 295 | private final void step3() 296 | { 297 | if (k == k0) 298 | return; /* For Bug 1 */ 299 | switch (b[k - 1]) 300 | { 301 | case 'a': 302 | if (ends("ational")) 303 | { 304 | r("ate"); 305 | break; 306 | } 307 | if (ends("tional")) 308 | { 309 | r("tion"); 310 | break; 311 | } 312 | break; 313 | case 'c': 314 | if (ends("enci")) 315 | { 316 | r("ence"); 317 | break; 318 | } 319 | if (ends("anci")) 320 | { 321 | r("ance"); 322 | break; 323 | } 324 | break; 325 | case 'e': 326 | if (ends("izer")) 327 | { 328 | r("ize"); 329 | break; 330 | } 331 | break; 332 | case 'l': 333 | if (ends("bli")) 334 | { 335 | r("ble"); 336 | break; 337 | } 338 | if (ends("alli")) 339 | { 340 | r("al"); 341 | break; 342 | } 343 | if (ends("entli")) 344 | { 345 | r("ent"); 346 | break; 347 | } 348 | if (ends("eli")) 349 | { 350 | r("e"); 351 | break; 352 | } 353 | if (ends("ousli")) 354 | { 355 | r("ous"); 356 | break; 357 | } 358 | break; 359 | case 'o': 360 | if (ends("ization")) 361 | { 362 | r("ize"); 363 | break; 364 | } 365 | if (ends("ation")) 366 | { 367 | r("ate"); 368 | break; 369 | } 370 | if (ends("ator")) 371 | { 372 | r("ate"); 373 | break; 374 | } 375 | break; 376 | case 's': 377 | if (ends("alism")) 378 | { 379 | r("al"); 380 | break; 381 | } 382 | if (ends("iveness")) 383 | { 384 | r("ive"); 385 | break; 386 | } 387 | if (ends("fulness")) 388 | { 389 | r("ful"); 390 | break; 391 | } 392 | if (ends("ousness")) 393 | { 394 | r("ous"); 395 | break; 396 | } 397 | break; 398 | case 't': 399 | if (ends("aliti")) 400 | { 401 | r("al"); 402 | break; 403 | } 404 | if (ends("iviti")) 405 | { 406 | r("ive"); 407 | break; 408 | } 409 | if (ends("biliti")) 410 | { 411 | r("ble"); 412 | break; 413 | } 414 | break; 415 | case 'g': 416 | if (ends("logi")) 417 | { 418 | r("log"); 419 | break; 420 | } 421 | } 422 | } 423 | 424 | /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ 425 | 426 | private final void step4() 427 | { 428 | switch (b[k]) 429 | { 430 | case 'e': 431 | if (ends("icate")) 432 | { 433 | r("ic"); 434 | break; 435 | } 436 | if (ends("ative")) 437 | { 438 | r(""); 439 | break; 440 | } 441 | if (ends("alize")) 442 | { 443 | r("al"); 444 | break; 445 | } 446 | break; 447 | case 'i': 448 | if (ends("iciti")) 449 | { 450 | r("ic"); 451 | break; 452 | } 453 | break; 454 | case 'l': 455 | if (ends("ical")) 456 | { 457 | r("ic"); 458 | break; 459 | } 460 | if (ends("ful")) 461 | { 462 | r(""); 463 | break; 464 | } 465 | break; 466 | case 's': 467 | if (ends("ness")) 468 | { 469 | r(""); 470 | break; 471 | } 472 | break; 473 | } 474 | } 475 | 476 | /* step5() takes off -ant, -ence etc., in context vcvc. */ 477 | 478 | private final void step5() 479 | { 480 | if (k == k0) 481 | return; /* for Bug 1 */ 482 | switch (b[k - 1]) 483 | { 484 | case 'a': 485 | if (ends("al")) 486 | break; 487 | return; 488 | case 'c': 489 | if (ends("ance")) 490 | break; 491 | if (ends("ence")) 492 | break; 493 | return; 494 | case 'e': 495 | if (ends("er")) 496 | break; 497 | return; 498 | case 'i': 499 | if (ends("ic")) 500 | break; 501 | return; 502 | case 'l': 503 | if (ends("able")) 504 | break; 505 | if (ends("ible")) 506 | break; 507 | return; 508 | case 'n': 509 | if (ends("ant")) 510 | break; 511 | if (ends("ement")) 512 | break; 513 | if (ends("ment")) 514 | break; 515 | /* element etc. 
not stripped before the m */ 516 | if (ends("ent")) 517 | break; 518 | return; 519 | case 'o': 520 | if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) 521 | break; 522 | /* j >= 0 fixes Bug 2 */ 523 | if (ends("ou")) 524 | break; 525 | return; 526 | /* takes care of -ous */ 527 | case 's': 528 | if (ends("ism")) 529 | break; 530 | return; 531 | case 't': 532 | if (ends("ate")) 533 | break; 534 | if (ends("iti")) 535 | break; 536 | return; 537 | case 'u': 538 | if (ends("ous")) 539 | break; 540 | return; 541 | case 'v': 542 | if (ends("ive")) 543 | break; 544 | return; 545 | case 'z': 546 | if (ends("ize")) 547 | break; 548 | return; 549 | default: 550 | return; 551 | } 552 | if (m() > 1) 553 | k = j; 554 | } 555 | 556 | /* step6() removes a final -e if m() > 1. */ 557 | 558 | private final void step6() 559 | { 560 | j = k; 561 | if (b[k] == 'e') 562 | { 563 | int a = m(); 564 | if (a > 1 || a == 1 && !cvc(k - 1)) 565 | k--; 566 | } 567 | if (b[k] == 'l' && doublec(k) && m() > 1) 568 | k--; 569 | } 570 | 571 | /** 572 | * Stem a word provided as a String. Returns the result as a String. 573 | */ 574 | public String stem(String s) 575 | { 576 | if (stem(s.toCharArray(), s.length())) 577 | return toString(); 578 | else 579 | return s; 580 | } 581 | 582 | /** 583 | * Stem a word contained in a char[]. Returns true if the stemming process 584 | * resulted in a word different from the input. You can retrieve the result 585 | * with getResultLength()/getResultBuffer() or toString(). 586 | */ 587 | public boolean stem(char[] word) 588 | { 589 | return stem(word, word.length); 590 | } 591 | 592 | /** 593 | * Stem a word contained in a portion of a char[] array. Returns true if the 594 | * stemming process resulted in a word different from the input. You can 595 | * retrieve the result with getResultLength()/getResultBuffer() or 596 | * toString(). 597 | */ 598 | public boolean stem(char[] wordBuffer, int offset, int wordLen) 599 | { 600 | reset(); 601 | if (b.length < wordLen) 602 | { 603 | b = new char[ArrayUtil.oversize(wordLen, Character.BYTES)]; 604 | } 605 | System.arraycopy(wordBuffer, offset, b, 0, wordLen); 606 | i = wordLen; 607 | return stem(0); 608 | } 609 | 610 | /** 611 | * Stem a word contained in a leading portion of a char[] array. Returns 612 | * true if the stemming process resulted in a word different from the input. 613 | * You can retrieve the result with getResultLength()/getResultBuffer() or 614 | * toString(). 615 | */ 616 | public boolean stem(char[] word, int wordLen) 617 | { 618 | return stem(word, 0, wordLen); 619 | } 620 | 621 | /** 622 | * Stem the word placed into the Stemmer buffer through calls to add(). 623 | * Returns true if the stemming process resulted in a word different from 624 | * the input. You can retrieve the result with 625 | * getResultLength()/getResultBuffer() or toString(). 626 | */ 627 | public boolean stem() 628 | { 629 | return stem(0); 630 | } 631 | 632 | public boolean stem(int i0) 633 | { 634 | k = i - 1; 635 | k0 = i0; 636 | if (k > k0 + 1) 637 | { 638 | step1(); 639 | step2(); 640 | step3(); 641 | step4(); 642 | step5(); 643 | step6(); 644 | } 645 | // Also, a word is considered dirty if we lopped off letters 646 | // Thanks to Ifigenia Vairelles for pointing this out. 647 | if (i != k + 1) 648 | dirty = true; 649 | i = k + 1; 650 | return dirty; 651 | } 652 | 653 | /** 654 | * Test program for demonstrating the Stemmer. It reads a file and stems 655 | * each word, writing the result to standard out. 
Usage: Stemmer file-name
656 | */
657 | public static void main(String[] args)
658 | {
659 | PorterStemmer s = new PorterStemmer();
660 |
661 | for (int i = 0; i < args.length; i++)
662 | {
663 | try
664 | {
665 | InputStream in = new FileInputStream(args[i]);
666 | byte[] buffer = new byte[1024];
667 | int bufferLen, offset, ch;
668 |
669 | bufferLen = in.read(buffer);
670 | offset = 0;
671 | s.reset();
672 |
673 | while (true)
674 | {
675 | if (offset < bufferLen)
676 | ch = buffer[offset++];
677 | else
678 | {
679 | bufferLen = in.read(buffer);
680 | offset = 0;
681 | if (bufferLen < 0)
682 | ch = -1;
683 | else
684 | ch = buffer[offset++];
685 | }
686 |
687 | if (Character.isLetter((char) ch))
688 | {
689 | s.add(Character.toLowerCase((char) ch));
690 | }
691 | else
692 | {
693 | s.stem();
694 | System.out.print(s.toString());
695 | s.reset();
696 | if (ch < 0)
697 | break;
698 | else
699 | {
700 | System.out.print((char) ch);
701 | }
702 | }
703 | }
704 |
705 | in.close();
706 | }
707 | catch (IOException e)
708 | {
709 | System.out.println("error reading " + args[i]);
710 | }
711 | }
712 | }
713 |
714 | }
715 |
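A quick usage sketch of the stemmer class above; the expected outputs follow the examples listed in the step1() comment:

    import com.texsmart.lucene.PorterStemmer;

    public class StemDemo {
        public static void main(String[] args) {
            PorterStemmer stemmer = new PorterStemmer();
            System.out.println(stemmer.stem("meetings")); // -> meet
            System.out.println(stemmer.stem("ponies"));   // -> poni
            System.out.println(stemmer.stem("caresses")); // -> caress
        }
    }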
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/SegmentWrapper.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.cfg.Configuration;
4 | import com.texsmart.seg.Segment;
5 | import tencent.ai.texsmart.NluOutput.Term;
6 |
7 | import java.io.Reader;
8 | import java.security.AccessController;
9 | import java.security.PrivilegedAction;
10 | import java.util.List;
11 | import java.util.Scanner;
12 |
13 | public class SegmentWrapper {
14 |
15 | private Scanner scanner;
16 |
17 | private Segment segment;
18 | /**
19 | * next() emits one term at a time, so the terms of the current line are cached here
20 | */
21 | private Term[] termArray;
22 | /**
23 | * Current index into termArray
24 | */
25 | private int index;
26 | /**
27 | * Offset of the current line; the wrapper reads line by line, so term.offset must be corrected by this value
28 | */
29 | int offset;
30 |
31 | Configuration configuration;
32 |
33 | public SegmentWrapper(Reader reader, Segment segment, Configuration configuration) {
34 | scanner = createScanner(reader);
35 | this.segment = segment;
36 | this.configuration = configuration;
37 | }
38 |
39 | public SegmentWrapper(Reader reader, Segment segment) {
40 | scanner = createScanner(reader);
41 | this.segment = segment;
42 | }
43 |
44 | /**
45 | * Reset the wrapper for a new reader
46 | *
47 | * @param reader the new input
48 | */
49 | public void reset(Reader reader) {
50 | scanner = createScanner(reader);
51 | termArray = null;
52 | index = 0;
53 | offset = 0;
54 | }
55 |
56 | public Term next() {
57 | if (termArray != null && index < termArray.length) {
58 | return termArray[index++];
59 | }
60 | if (!scanner.hasNextLine()) {
61 | return null;
62 | }
63 | String line = scanner.nextLine();
64 | while (isBlank(line)) {
65 | offset += line.length() + 1;
66 | if (scanner.hasNextLine()) {
67 | line = scanner.nextLine();
68 | } else {
69 | return null;
70 | }
71 | }
72 |
73 | final String lineNeedSeg = line;
74 | List<Term> termList = AccessController.doPrivileged((PrivilegedAction<List<Term>>) () -> {
75 | // char[] text = lineNeedSeg.toCharArray();
76 | if (configuration != null && configuration.isEnableNormalization()) {
77 | // AccessController.doPrivileged((PrivilegedAction) () -> {
78 | // CharTable.normalization(text);
79 | // return null;
80 | // });
81 | }
82 | return segment.seg(lineNeedSeg);
83 | });
84 |
85 | if (termList.size() == 0) {
86 | return null;
87 | }
88 | termArray = termList.toArray(new Term[0]);
89 |
90 | for (Term term : termArray) {
91 | term.offset = term.offset + offset;
92 | }
93 | if (scanner.hasNextLine()) {
94 | offset += line.length() + 1;
95 | } else {
96 | offset += line.length();
97 | }
98 | index = 0;
99 | return termArray[index++];
100 | }
101 |
102 | /**
103 | * Check whether a string is blank (null or whitespace only)
104 | *
105 | * @param cs the string
106 | * @return true if blank
107 | */
108 | private static boolean isBlank(CharSequence cs) {
109 | int strLen;
110 | if (cs == null || (strLen = cs.length()) == 0) {
111 | return true;
112 | }
113 | for (int i = 0; i < strLen; i++) {
114 | if (!Character.isWhitespace(cs.charAt(i))) {
115 | return false;
116 | }
117 | }
118 | return true;
119 | }
120 |
121 | private static Scanner createScanner(Reader reader) {
122 | return new Scanner(reader).useDelimiter("\n");
123 | }
124 | }
125 |
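SegmentWrapper pulls the reader line by line, caches each line's Term[], and shifts every term.offset by the running line offset so offsets are document-global. A sketch of draining it; TexSmart.newSegment() is the factory used elsewhere in this plugin, and the native engine and its data files must be available for this to run:

    import com.texsmart.TexSmart;
    import com.texsmart.lucene.SegmentWrapper;
    import tencent.ai.texsmart.NluOutput.Term;

    import java.io.StringReader;

    public class SegmentWrapperSketch {
        public static void main(String[] args) {
            SegmentWrapper wrapper = new SegmentWrapper(
                    new StringReader("上海 自来水\n来自 海上"), TexSmart.newSegment());
            for (Term term = wrapper.next(); term != null; term = wrapper.next()) {
                // offset is already corrected to be document-global, not line-local
                System.out.println(term.str + "\t" + term.tag + "\t" + term.offset);
            }
        }
    }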
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TexSmartAnalyzer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.cfg.Configuration;
5 | import org.apache.lucene.analysis.Analyzer;
6 |
7 | /**
8 | * @project: elasticsearch-analysis-texsmart
9 | * @description: Default analyzer
10 | * @author: wei_liu
11 | * @create: 2020-09-09 15:10
12 | */
13 | public class TexSmartAnalyzer extends Analyzer {
14 | /**
15 | * Analysis configuration
16 | */
17 | private Configuration configuration;
18 |
19 | public TexSmartAnalyzer(Configuration configuration) {
20 | this.configuration = configuration;
21 | }
22 |
23 | public TexSmartAnalyzer() {
24 | super();
25 | }
26 |
27 | @Override
28 | protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
29 | return new Analyzer.TokenStreamComponents(
30 | TokenizerBuilder.tokenizer(TexSmart.newSegment(), configuration));
31 | }
32 | }
33 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TexSmartIndexAnalyzer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.cfg.Configuration;
5 | import org.apache.lucene.analysis.Analyzer;
6 |
7 | public class TexSmartIndexAnalyzer extends Analyzer {
8 | /**
9 | * Analysis configuration
10 | */
11 | private Configuration configuration;
12 |
13 | public TexSmartIndexAnalyzer(Configuration configuration) {
14 | this.configuration = configuration;
15 | this.configuration.enableIndexMode(true);
16 | }
17 |
18 | public TexSmartIndexAnalyzer() {
19 | super();
20 | }
21 |
22 | @Override
23 | protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
24 | return new Analyzer.TokenStreamComponents(
25 | TokenizerBuilder.tokenizer(TexSmart.newSegment().enableIndexMode(true), configuration));
26 | }
27 | }
28 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TexSmartStandardAnalyzer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.cfg.Configuration;
5 | import org.apache.lucene.analysis.Analyzer;
6 |
7 | /**
8 | * @project: elasticsearch-analysis-texsmart
9 | * @description: Basic Chinese analyzer
10 | * @author: wei_liu
11 | * @create: 2020-09-09 15:10
12 | */
13 | public class TexSmartStandardAnalyzer extends Analyzer {
14 | /**
15 | * Analysis configuration
16 | */
17 | private Configuration configuration;
18 |
19 | public TexSmartStandardAnalyzer(Configuration configuration) {
20 | this.configuration = configuration;
21 | }
22 |
23 | public TexSmartStandardAnalyzer() {
24 | super();
25 | }
26 |
27 | @Override
28 | protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
29 | return new Analyzer.TokenStreamComponents(
30 | TokenizerBuilder.tokenizer(TexSmart.newSegment(), configuration));
31 | }
32 | }
33 |
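All three classes plug the same TexSmart tokenizer into Lucene's Analyzer contract, differing only in how the segment is configured. A sketch of consuming any of them through the standard TokenStream loop; note that the no-arg constructors leave configuration null, so a real call site passes a Configuration built from the plugin's Environment and Settings:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class AnalyzerSketch {
        // e.g. printTokens(new TexSmartAnalyzer(configuration), "text to analyze")
        static void printTokens(Analyzer analyzer, String text) throws Exception {
            try (TokenStream ts = analyzer.tokenStream("content", text)) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                TypeAttribute type = ts.addAttribute(TypeAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(term.toString() + "/" + type.type());
                }
                ts.end();
            }
        }
    }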
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TexSmartTokenizer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.dic.stopword.FilterStopWord;
4 | import com.texsmart.seg.Segment;
5 | import com.texsmart.utility.TextUtility;
6 | import tencent.ai.texsmart.NluOutput.Term;
7 | import com.texsmart.cfg.Configuration;
8 | import org.apache.lucene.analysis.Tokenizer;
9 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
10 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
11 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
13 |
14 | import java.io.BufferedReader;
15 | import java.io.IOException;
16 |
17 |
18 | public class TexSmartTokenizer extends Tokenizer {
19 | /**
20 | * Current term
21 | */
22 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
23 | /**
24 | * Character offsets
25 | */
26 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
27 | /**
28 | * Position increment
29 | */
30 | private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
31 | /**
32 | * POS tag
33 | */
34 | private TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
35 | /**
36 | * Configuration
37 | */
38 | private Configuration configuration;
39 | /**
40 | * Segmenter
41 | */
42 | private SegmentWrapper segment;
43 | /**
44 | * Porter stemmer for English tokens
45 | */
46 | private final PorterStemmer stemmer = new PorterStemmer();
47 | /**
48 | * Running offset within the current document; not cleared on reset() (switching values of a multi-value field), cleared on end() (switching fields)
49 | */
50 | private int totalOffset = 0;
51 |
52 | /**
53 | * @param segment one of the TexSmart segmenters
54 | * @param configuration analysis configuration
55 | */
56 | public TexSmartTokenizer(Segment segment, Configuration configuration) {
57 | this.configuration = configuration;
58 | this.segment = new SegmentWrapper(this.input, segment, configuration);
59 | }
60 |
61 | @Override
62 | final public boolean incrementToken() throws IOException {
63 | clearAttributes();
64 | int position = 0;
65 | Term term;
66 | boolean unIncreased = true;
67 | do {
68 | term = segment.next();
69 | if (term == null) {
70 | totalOffset += segment.offset;
71 | return false;
72 | }
73 | if (TextUtility.isBlank(term.str)) {
74 | totalOffset += term.length();
75 | continue;
76 | }
77 | if (configuration.isEnablePorterStemming() && "nx".equals(term.tag)) {
78 | term.str = stemmer.stem(term.str);
79 | }
80 |
81 | final Term copyTerm = term;
82 | if (!this.configuration.isEnableStopDictionary() || !FilterStopWord.beRemove(copyTerm)) {
83 | position++;
84 | unIncreased = false;
85 | } else {
86 | totalOffset += term.length();
87 | }
88 | }
89 | while (unIncreased);
90 |
91 | positionAttr.setPositionIncrement(position);
92 | termAtt.setEmpty().append(term.str);
93 | offsetAtt.setOffset(correctOffset(term.offset), correctOffset(term.offset + term.str.length()));
94 | typeAtt.setType(term.tag == null ? "null" : term.tag);
95 | totalOffset += term.length();
96 | return true;
97 | }
98 |
99 | @Override
100 | public void end() throws IOException {
101 | super.end();
102 | offsetAtt.setOffset(totalOffset, totalOffset);
103 | totalOffset = 0;
104 | }
105 |
106 | /**
107 | * This override is required; without it, bulk indexing of files fails
108 | */
109 | @Override
110 | public void reset() throws IOException {
111 | super.reset();
112 | segment.reset(new BufferedReader(this.input));
113 | }
114 | }
115 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/lucene/TokenizerBuilder.java: --------------------------------------------------------------------------------
1 | package com.texsmart.lucene;
2 |
3 | import com.texsmart.cfg.Configuration;
4 | import com.texsmart.seg.Segment;
5 | import org.apache.lucene.analysis.Tokenizer;
6 |
7 | /**
8 | * @project: elasticsearch-analysis-texsmart
9 | * @description: Tokenizer builder
10 | * @author: wei_liu
11 | * @create: 2020-09-09 09:47
12 | */
13 | public class TokenizerBuilder {
14 |
15 | /**
16 | * Build a Tokenizer
17 | *
18 | * @param segment the original segment
19 | * @param configuration configuration
20 | * @return the tokenizer
21 | */
22 | public static Tokenizer tokenizer(Segment segment, Configuration configuration) {
23 | Segment seg = segment(segment, configuration);
24 | return new TexSmartTokenizer(seg, configuration);
25 | }
26 |
27 | /**
28 | * Configure the segment according to the configuration
29 | *
30 | * @param segment the original segment
31 | * @param configuration configuration
32 | * @return the configured segment
33 | */
34 | private static Segment segment(Segment segment, Configuration configuration) {
35 | if (!configuration.isEnableCustomConfig()) {
36 | return segment.enableOffset(true);
37 | }
38 | segment.enableIndexMode(configuration.isEnableIndexMode())
39 | .enableOffset(configuration.isEnableOffset())
40 | .enableStopDictionary(configuration.isEnableStopDictionary())
41 | .setPosAlgType(configuration.getEnablePosAlg())
42 | .setNerAlgType(configuration.getEnableNerAlg());
43 | return segment;
44 | }
45 | }
46 |
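TokenizerBuilder maps Configuration flags onto the fluent Segment API defined below (see Segment.java after this sketch). Wired by hand, the equivalent of an index-mode configuration looks like:

    import com.texsmart.TexSmart;
    import com.texsmart.seg.Segment;
    import org.elasticsearch.index.analysis.NerAlgType;
    import org.elasticsearch.index.analysis.PosAlgType;

    public class SegmentConfigSketch {
        public static Segment indexSegment() {
            return TexSmart.newSegment()
                    .enableIndexMode(true)          // emit fine-grained words for indexing
                    .enableOffset(true)             // keep character offsets
                    .enableStopDictionary(false)    // leave stop-word filtering to the tokenizer
                    .setPosAlgType(PosAlgType.LOG_LINEAR)
                    .setNerAlgType(NerAlgType.CRF);
        }
    }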
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/seg/Config.java: --------------------------------------------------------------------------------
1 | package com.texsmart.seg;
2 |
3 | import org.elasticsearch.index.analysis.NerAlgType;
4 | import org.elasticsearch.index.analysis.PosAlgType;
5 |
6 | public class Config {
7 | public int indexMode = 0;
8 | public boolean useCustomDictionary = true;
9 | public boolean forceEntName = true;
10 | public boolean ner = true;
11 | public boolean offset = false;
12 | public boolean enableStopDictionary = false;
13 | public PosAlgType posAlgType = PosAlgType.LOG_LINEAR;
14 | public NerAlgType nerAlgType = NerAlgType.CRF;
15 |
16 | public Config() {
17 | }
18 |
19 | public boolean isIndexMode() { return this.indexMode > 0; }
20 |
21 | public String getPosAlgType() {
22 | return this.posAlgType.getAlg();
23 | }
24 |
25 | public String getNerAlgType() {
26 | return this.nerAlgType.getAlg();
27 | }
28 | }
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/seg/Segment.java: --------------------------------------------------------------------------------
1 | package com.texsmart.seg;
2 |
3 | import org.elasticsearch.index.analysis.NerAlgType;
4 | import org.elasticsearch.index.analysis.PosAlgType;
5 | import tencent.ai.texsmart.NluOutput.Term;
6 |
7 | import java.util.List;
8 |
9 | public abstract class Segment {
10 | protected Config config = new Config();
11 |
12 | public Segment() {
13 | }
14 |
15 | public List<Term> seg(String text) {
16 | return segSentence(text);
17 | }
18 |
19 | protected abstract List<Term> segSentence(String text);
20 |
21 | public Segment enableOffset(boolean enable) {
22 | this.config.offset = enable;
23 | return this;
24 | }
25 |
26 | public Segment enableIndexMode(boolean enable) {
27 | this.config.indexMode = enable ? 2 : 0;
28 | return this;
29 | }
30 |
31 | public Segment enableIndexMode(int minimalLength) {
32 | if (minimalLength < 1) {
33 | throw new IllegalArgumentException("minimalLength must be at least 1");
34 | } else {
35 | this.config.indexMode = minimalLength;
36 | return this;
37 | }
38 | }
39 |
40 | public Segment enableStopDictionary(boolean enable) {
41 | this.config.enableStopDictionary = enable;
42 | return this;
43 | }
44 |
45 | public Segment setPosAlgType(PosAlgType posAlgType) {
46 | this.config.posAlgType = posAlgType;
47 | return this;
48 | }
49 |
50 | public Segment setNerAlgType(NerAlgType nerAlgType) {
51 | this.config.nerAlgType = nerAlgType;
52 | return this;
53 | }
54 | }
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/seg/TexSmartBasicSegment.java: --------------------------------------------------------------------------------
1 | package com.texsmart.seg;
2 |
3 | import com.texsmart.TexSmart;
4 | import tencent.ai.texsmart.NluOutput;
5 | import tencent.ai.texsmart.NluOutput.Term;
6 |
7 | import java.util.List;
8 |
9 | public class TexSmartBasicSegment extends Segment {
10 |
11 | private static String formatOptions = "{" +
12 | " \"input_spec\":{\"lang\":\"auto\"}," +
13 | " \"word_seg\":{\"enable\":true},\n" +
14 | " \"pos_tagging\":{\"enable\":true,\"alg\":\"%s\"}," +
15 | " \"ner\":{\"enable\":true,\"alg\":\"%s\",\"fine_grained\":false}," +
16 | " \"syntactic_parsing\":{\"enable\":false}," +
17 | " \"srl\":{\"enable\":false}" +
18 | " }";
19 |
20 | public TexSmartBasicSegment() {
21 | }
22 |
23 | @Override
24 | protected List<Term> segSentence(String text) {
25 | NluOutput output = TexSmart.TEX_ENGINE.parseText(text, String.format(
26 | formatOptions, config.getPosAlgType(), config.getNerAlgType()));
27 |
28 | if (null == output) return null;
29 | if (config.isIndexMode()) {
30 | return output.words();
31 | } else {
32 | return output.phrases();
33 | }
34 | }
35 | }
36 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/tokenizer/StandardTokenizer.java: --------------------------------------------------------------------------------
1 | package com.texsmart.tokenizer;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.seg.Segment;
5 | import tencent.ai.texsmart.NluOutput.Term;
6 |
7 | import java.util.List;
8 |
9 | public class StandardTokenizer {
10 | public static final Segment SEGMENT = TexSmart.newSegment();
11 |
12 | public StandardTokenizer() {
13 | }
14 |
15 | public static List<Term> segment(String text) {
16 | return SEGMENT.seg(text);
17 | }
18 | }
19 |
-------------------------------------------------------------------------------- /src/main/java/com/texsmart/utility/TextUtility.java: --------------------------------------------------------------------------------
1 | package com.texsmart.utility;
2 |
3 | //
4 | // Source code recreated from a .class file by IntelliJ IDEA
5 | // (powered by Fernflower decompiler)
6 | //
7 |
8 | import java.io.DataOutputStream;
9 | import java.io.IOException;
10 | import java.io.PrintWriter;
11 | import 
java.io.StringWriter; 12 | import java.io.UnsupportedEncodingException; 13 | import java.util.Collection; 14 | import java.util.Iterator; 15 | import java.util.List; 16 | 17 | public class TextUtility { 18 | public TextUtility() { 19 | } 20 | 21 | public static int charType(char c) { 22 | return charType(String.valueOf(c)); 23 | } 24 | 25 | public static int charType(String str) { 26 | if (str != null && str.length() > 0) { 27 | if ("零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟".contains(str)) { 28 | return 11; 29 | } 30 | 31 | byte[] b; 32 | try { 33 | b = str.getBytes("GBK"); 34 | } catch (UnsupportedEncodingException var6) { 35 | b = str.getBytes(); 36 | var6.printStackTrace(); 37 | } 38 | 39 | byte b1 = b[0]; 40 | byte b2 = b.length > 1 ? b[1] : 0; 41 | int ub1 = getUnsigned(b1); 42 | int ub2 = getUnsigned(b2); 43 | if (ub1 < 128) { 44 | if (ub1 <= 32) { 45 | return 17; 46 | } 47 | 48 | if ("*\"!,.?()[]{}+=/\\;:|".indexOf((char)b1) != -1) { 49 | return 6; 50 | } 51 | 52 | if ("0123456789".indexOf((char)b1) != -1) { 53 | return 9; 54 | } 55 | 56 | return 5; 57 | } 58 | 59 | if (ub1 == 162) { 60 | return 10; 61 | } 62 | 63 | if (ub1 == 163 && ub2 > 175 && ub2 < 186) { 64 | return 9; 65 | } 66 | 67 | if (ub1 == 163 && (ub2 >= 193 && ub2 <= 218 || ub2 >= 225 && ub2 <= 250)) { 68 | return 8; 69 | } 70 | 71 | if (ub1 == 161 || ub1 == 163) { 72 | return 6; 73 | } 74 | 75 | if (ub1 >= 176 && ub1 <= 247) { 76 | return 7; 77 | } 78 | } 79 | 80 | return 17; 81 | } 82 | 83 | public static boolean isAllChinese(String str) { 84 | return str.matches("[\\u4E00-\\u9FA5]+"); 85 | } 86 | 87 | public static boolean isAllNonChinese(byte[] sString) { 88 | int nLen = sString.length; 89 | int i = 0; 90 | 91 | while(i < nLen) { 92 | if (getUnsigned(sString[i]) < 248 && getUnsigned(sString[i]) > 175) { 93 | return false; 94 | } 95 | 96 | if (sString[i] < 0) { 97 | i += 2; 98 | } else { 99 | ++i; 100 | } 101 | } 102 | 103 | return true; 104 | } 105 | 106 | public static boolean isAllSingleByte(String str) { 107 | assert str != null; 108 | 109 | for(int i = 0; i < str.length(); ++i) { 110 | if (str.charAt(i) > 128) { 111 | return false; 112 | } 113 | } 114 | 115 | return true; 116 | } 117 | 118 | public static int cint(String str) { 119 | if (str != null) { 120 | try { 121 | int i = new Integer(str); 122 | return i; 123 | } catch (NumberFormatException var2) { 124 | } 125 | } 126 | 127 | return -1; 128 | } 129 | 130 | public static boolean isAllNum(String str) { 131 | if (str == null) { 132 | return false; 133 | } else { 134 | int i = 0; 135 | if ("±+-+-—".indexOf(str.charAt(0)) != -1) { 136 | ++i; 137 | } 138 | 139 | while(i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) { 140 | ++i; 141 | } 142 | 143 | char ch; 144 | if (i > 0 && i < str.length()) { 145 | ch = str.charAt(i); 146 | if ("·∶:,,..//".indexOf(ch) != -1) { 147 | ++i; 148 | 149 | while(i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) { 150 | ++i; 151 | } 152 | } 153 | } 154 | 155 | if (i >= str.length()) { 156 | return true; 157 | } else { 158 | while(i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) { 159 | ++i; 160 | } 161 | 162 | if (i > 0 && i < str.length()) { 163 | ch = str.charAt(i); 164 | if (',' == ch || '.' 
== ch || '/' == ch || ':' == ch || "∶·,./".indexOf(ch) != -1) { 165 | ++i; 166 | 167 | while(i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) { 168 | ++i; 169 | } 170 | } 171 | } 172 | 173 | if (i < str.length() && "百千万亿佰仟%%‰".indexOf(str.charAt(i)) != -1) { 174 | ++i; 175 | } 176 | 177 | return i >= str.length(); 178 | } 179 | } 180 | } 181 | 182 | public static boolean isAllIndex(byte[] sString) { 183 | int nLen = sString.length; 184 | 185 | int i; 186 | for(i = 0; i < nLen - 1 && getUnsigned(sString[i]) == 162; i += 2) { 187 | } 188 | 189 | if (i >= nLen) { 190 | return true; 191 | } else { 192 | while(i < nLen && sString[i] > 64 && sString[i] < 91 || sString[i] > 96 && sString[i] < 123) { 193 | ++i; 194 | } 195 | 196 | return i >= nLen; 197 | } 198 | } 199 | 200 | public static boolean isAllLetter(String text) { 201 | for(int i = 0; i < text.length(); ++i) { 202 | char c = text.charAt(i); 203 | if ((c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) { 204 | return false; 205 | } 206 | } 207 | 208 | return true; 209 | } 210 | 211 | public static boolean isAllLetterOrNum(String text) { 212 | for(int i = 0; i < text.length(); ++i) { 213 | char c = text.charAt(i); 214 | if ((c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && (c < '0' || c > '9')) { 215 | return false; 216 | } 217 | } 218 | 219 | return true; 220 | } 221 | 222 | public static boolean isAllDelimiter(byte[] sString) { 223 | int nLen = sString.length; 224 | 225 | int i; 226 | for(i = 0; i < nLen - 1 && (getUnsigned(sString[i]) == 161 || getUnsigned(sString[i]) == 163); i += 2) { 227 | } 228 | 229 | return i >= nLen; 230 | } 231 | 232 | public static boolean isAllChineseNum(String word) { 233 | String chineseNum = "零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·./点"; 234 | String prefix = "几数上第"; 235 | String surfix = "几多余来成倍"; 236 | boolean round = false; 237 | if (word == null) { 238 | return false; 239 | } else { 240 | char[] temp = word.toCharArray(); 241 | 242 | for(int i = 0; i < temp.length; ++i) { 243 | if (word.startsWith("分之", i)) { 244 | ++i; 245 | } else { 246 | char tchar = temp[i]; 247 | if (i == 0 && prefix.indexOf(tchar) != -1) { 248 | round = true; 249 | } else if (i == temp.length - 1 && !round && surfix.indexOf(tchar) != -1) { 250 | round = true; 251 | } else if (chineseNum.indexOf(tchar) == -1) { 252 | return false; 253 | } 254 | } 255 | } 256 | 257 | return true; 258 | } 259 | } 260 | 261 | public static int getCharCount(String charSet, String word) { 262 | int nCount = 0; 263 | if (word != null) { 264 | String temp = word + " "; 265 | 266 | for(int i = 0; i < word.length(); ++i) { 267 | String s = temp.substring(i, i + 1); 268 | if (charSet.indexOf(s) != -1) { 269 | ++nCount; 270 | } 271 | } 272 | } 273 | 274 | return nCount; 275 | } 276 | 277 | public static int getUnsigned(byte b) { 278 | return b > 0 ? 
b : b & 255; 279 | } 280 | 281 | public static boolean isYearTime(String snum) { 282 | if (snum != null) { 283 | int len = snum.length(); 284 | String first = snum.substring(0, 1); 285 | if (isAllSingleByte(snum) && (len == 4 || len == 2 && (cint(first) > 4 || cint(first) == 0))) { 286 | return true; 287 | } 288 | 289 | if (isAllNum(snum) && (len >= 3 || len == 2 && "056789".indexOf(first) != -1)) { 290 | return true; 291 | } 292 | 293 | if (getCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", snum) == len && len >= 2) { 294 | return true; 295 | } 296 | 297 | if (len == 4 && getCharCount("千仟零○", snum) == 2) { 298 | return true; 299 | } 300 | 301 | if (len == 1 && getCharCount("千仟", snum) == 1) { 302 | return true; 303 | } 304 | 305 | if (len == 2 && getCharCount("甲乙丙丁戊己庚辛壬癸", snum) == 1 && getCharCount("子丑寅卯辰巳午未申酉戌亥", snum.substring(1)) == 1) { 306 | return true; 307 | } 308 | } 309 | 310 | return false; 311 | } 312 | 313 | public static boolean isInAggregate(String aggr, String str) { 314 | if (aggr != null && str != null) { 315 | str = str + "1"; 316 | 317 | for(int i = 0; i < str.length(); ++i) { 318 | String s = str.substring(i, i + 1); 319 | if (aggr.indexOf(s) == -1) { 320 | return false; 321 | } 322 | } 323 | 324 | return true; 325 | } else { 326 | return false; 327 | } 328 | } 329 | 330 | public static boolean isDBCCase(String str) { 331 | if (str == null) { 332 | return false; 333 | } else { 334 | str = str + " "; 335 | 336 | for(int i = 0; i < str.length(); ++i) { 337 | String s = str.substring(i, i + 1); 338 | boolean var3 = false; 339 | 340 | int length; 341 | try { 342 | length = s.getBytes("GBK").length; 343 | } catch (UnsupportedEncodingException var5) { 344 | var5.printStackTrace(); 345 | length = s.getBytes().length; 346 | } 347 | 348 | if (length != 1) { 349 | return false; 350 | } 351 | } 352 | 353 | return true; 354 | } 355 | } 356 | 357 | public static boolean isSBCCase(String str) { 358 | if (str == null) { 359 | return false; 360 | } else { 361 | str = str + " "; 362 | 363 | for(int i = 0; i < str.length(); ++i) { 364 | String s = str.substring(i, i + 1); 365 | boolean var3 = false; 366 | 367 | int length; 368 | try { 369 | length = s.getBytes("GBK").length; 370 | } catch (UnsupportedEncodingException var5) { 371 | var5.printStackTrace(); 372 | length = s.getBytes().length; 373 | } 374 | 375 | if (length != 2) { 376 | return false; 377 | } 378 | } 379 | 380 | return true; 381 | } 382 | } 383 | 384 | public static boolean isDelimiter(String str) { 385 | return str != null && ("-".equals(str) || "-".equals(str)); 386 | } 387 | 388 | public static boolean isUnknownWord(String word) { 389 | return word != null && word.indexOf("未##") == 0; 390 | } 391 | 392 | public static double nonZero(double frequency) { 393 | return frequency == 0.0D ? 
0.001D : frequency; 394 | } 395 | 396 | public static char[] long2char(long x) { 397 | char[] c = new char[]{(char)((int)(x >> 48)), (char)((int)(x >> 32)), (char)((int)(x >> 16)), (char)((int)x)}; 398 | return c; 399 | } 400 | 401 | public static String long2String(long x) { 402 | char[] cArray = long2char(x); 403 | StringBuilder sbResult = new StringBuilder(cArray.length); 404 | char[] var4 = cArray; 405 | int var5 = cArray.length; 406 | 407 | for(int var6 = 0; var6 < var5; ++var6) { 408 | char c = var4[var6]; 409 | sbResult.append(c); 410 | } 411 | 412 | return sbResult.toString(); 413 | } 414 | 415 | public static String exceptionToString(Exception e) { 416 | StringWriter sw = new StringWriter(); 417 | PrintWriter pw = new PrintWriter(sw); 418 | e.printStackTrace(pw); 419 | return sw.toString(); 420 | } 421 | 422 | public static boolean isChinese(char c) { 423 | String regex = "[\\u4e00-\\u9fa5]"; 424 | return String.valueOf(c).matches(regex); 425 | } 426 | 427 | public static int count(String keyword, String srcText) { 428 | int count = 0; 429 | int leng = srcText.length(); 430 | int j = 0; 431 | 432 | for(int i = 0; i < leng; ++i) { 433 | if (srcText.charAt(i) == keyword.charAt(j)) { 434 | ++j; 435 | if (j == keyword.length()) { 436 | ++count; 437 | j = 0; 438 | } 439 | } else { 440 | i -= j; 441 | j = 0; 442 | } 443 | } 444 | 445 | return count; 446 | } 447 | 448 | public static void writeString(String s, DataOutputStream out) throws IOException { 449 | out.writeInt(s.length()); 450 | char[] var2 = s.toCharArray(); 451 | int var3 = var2.length; 452 | 453 | for(int var4 = 0; var4 < var3; ++var4) { 454 | char c = var2[var4]; 455 | out.writeChar(c); 456 | } 457 | 458 | } 459 | 460 | public static boolean isBlank(CharSequence cs) { 461 | int strLen; 462 | if (cs != null && (strLen = cs.length()) != 0) { 463 | for(int i = 0; i < strLen; ++i) { 464 | if (!Character.isWhitespace(cs.charAt(i))) { 465 | return false; 466 | } 467 | } 468 | 469 | return true; 470 | } else { 471 | return true; 472 | } 473 | } 474 | 475 | public static String join(String delimiter, Collection stringCollection) { 476 | StringBuilder sb = new StringBuilder(stringCollection.size() * (16 + delimiter.length())); 477 | Iterator var3 = stringCollection.iterator(); 478 | 479 | while(var3.hasNext()) { 480 | String str = (String)var3.next(); 481 | sb.append(str).append(delimiter); 482 | } 483 | 484 | return sb.toString(); 485 | } 486 | 487 | public static String combine(String... 
termArray) {
488 | StringBuilder sbSentence = new StringBuilder();
489 | for (String word : termArray) {
490 | sbSentence.append(word);
491 | }
492 | return sbSentence.toString();
493 | }
494 |
495 | public static String join(Iterable<? extends CharSequence> s, String delimiter) {
496 | Iterator<? extends CharSequence> iter = s.iterator();
497 | if (!iter.hasNext()) {
498 | return "";
499 | }
500 | StringBuilder buffer = new StringBuilder(iter.next());
501 | while (iter.hasNext()) {
502 | buffer.append(delimiter).append(iter.next());
503 | }
504 | return buffer.toString();
505 | }
506 | }
507 |
-------------------------------------------------------------------------------- /src/main/java/es-plugin.properties: --------------------------------------------------------------------------------
1 | plugin=org.elasticsearch.plugin.analysis.texsmart.AnalysisTexSmartPlugin
-------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/NerAlgType.java: --------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | public enum NerAlgType {
4 | CRF("crf"), DNN("dnn");
5 |
6 | private String alg;
7 |
8 | NerAlgType(String alg) {
9 | this.alg = alg;
10 | }
11 |
12 | public String getAlg() {
13 | return alg;
14 | }
15 | }
16 |
-------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/PosAlgType.java: --------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | public enum PosAlgType {
4 | CRF("crf"), DNN("dnn"), LOG_LINEAR("log_linear");
5 |
6 | private String alg;
7 |
8 | PosAlgType(String alg) {
9 | this.alg = alg;
10 | }
11 |
12 | public String getAlg() {
13 | return alg;
14 | }
15 | }
16 |
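These enum values end up as the "alg" fields of TexSmart's JSON options; the template lives in TexSmartBasicSegment above. Rendering it by hand, with a trimmed-down options string for illustration:

    import org.elasticsearch.index.analysis.NerAlgType;
    import org.elasticsearch.index.analysis.PosAlgType;

    public class OptionsSketch {
        public static void main(String[] args) {
            String options = String.format(
                    "{\"pos_tagging\":{\"enable\":true,\"alg\":\"%s\"},"
                            + "\"ner\":{\"enable\":true,\"alg\":\"%s\"}}",
                    PosAlgType.LOG_LINEAR.getAlg(), NerAlgType.CRF.getAlg());
            System.out.println(options); // alg strings are passed verbatim to the engine
        }
    }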
TexSmartAnalyzerProvider getTexSmartAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
40 | return new TexSmartAnalyzerProvider(indexSettings, env, name, settings, TexSmartType.TEXSMART);
41 | }
42 |
43 | public static TexSmartAnalyzerProvider getTexSmartStandardAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
44 | return new TexSmartAnalyzerProvider(indexSettings, env, name, settings, TexSmartType.STANDARD);
45 | }
46 |
47 | public static TexSmartAnalyzerProvider getTexSmartIndexAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
48 | return new TexSmartAnalyzerProvider(indexSettings, env, name, settings, TexSmartType.SINGLE);
49 | }
50 |
51 | @Override
52 | public Analyzer get() {
53 | return this.analyzer;
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/TexSmartTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import com.texsmart.TexSmart;
4 | import com.texsmart.cfg.Configuration;
5 | import com.texsmart.lucene.TokenizerBuilder;
6 | import org.apache.lucene.analysis.Tokenizer;
7 | import org.elasticsearch.common.settings.Settings;
8 | import org.elasticsearch.env.Environment;
9 | import org.elasticsearch.index.IndexSettings;
10 |
11 | /**
12 | * @project: elasticsearch-analysis-texsmart
13 | * @description: TexSmart tokenizer factory
14 | * @author: wei_liu
15 | * @create: 2020-09-09 15:10
16 | */
17 | public class TexSmartTokenizerFactory extends AbstractTokenizerFactory {
18 | /**
19 | * Tokenizer type
20 | */
21 | private TexSmartType texSmartType;
22 | /**
23 | * Tokenizer configuration
24 | */
25 | private Configuration configuration;
26 |
27 | public TexSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings, TexSmartType texSmartType) {
28 | super(indexSettings, settings, name);
29 | this.texSmartType = texSmartType;
30 | this.configuration = new Configuration(env, settings);
31 | }
32 |
33 | public static TexSmartTokenizerFactory getTexSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
34 | return new TexSmartTokenizerFactory(indexSettings, env, name, settings, TexSmartType.TEXSMART);
35 | }
36 |
37 | public static TexSmartTokenizerFactory getTexSmartStandardTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
38 | return new TexSmartTokenizerFactory(indexSettings, env, name, settings, TexSmartType.STANDARD);
39 | }
40 |
41 | public static TexSmartTokenizerFactory getTexSmartIndexTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
42 | return new TexSmartTokenizerFactory(indexSettings, env, name, settings, TexSmartType.SINGLE);
43 | }
44 |
45 | @Override
46 | public Tokenizer create() {
47 | switch (this.texSmartType) {
48 | case SINGLE:
49 | configuration.enableIndexMode(true);
50 | return TokenizerBuilder.tokenizer(TexSmart.newSegment().enableIndexMode(true), configuration);
51 | case STANDARD:
52 | default: // TEXSMART and STANDARD both use the default segment
53 | return TokenizerBuilder.tokenizer(TexSmart.newSegment(), configuration);
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/TexSmartType.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | /**
4 | * @project: elasticsearch-analysis-texsmart
5 | * @description: TexSmart tokenization types
6 | * @author: wei_liu
7 | * @create: 2020-09-09 15:10
8 | */
9 | public enum TexSmartType {
10 | /**
11 | * Default tokenization
12 | */
13 | TEXSMART,
14 | /**
15 | * Basic Chinese tokenization
16 | */
17 | STANDARD,
18 | /**
19 | * Single-word Chinese tokenization (index mode)
20 | */
21 | SINGLE
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/texsmart/AnalysisTexSmartPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis.texsmart;
2 |
3 | import org.apache.lucene.analysis.Analyzer;
4 | import org.elasticsearch.SpecialPermission;
5 | import org.elasticsearch.index.analysis.AnalyzerProvider;
6 | import org.elasticsearch.index.analysis.TexSmartAnalyzerProvider;
7 | import org.elasticsearch.index.analysis.TexSmartTokenizerFactory;
8 | import org.elasticsearch.index.analysis.TokenizerFactory;
9 | import org.elasticsearch.indices.analysis.AnalysisModule;
10 | import org.elasticsearch.plugins.AnalysisPlugin;
11 | import org.elasticsearch.plugins.Plugin;
12 |
13 | import java.security.AccessController;
14 | import java.security.PrivilegedAction;
15 | import java.util.HashMap;
16 | import java.util.Map;
17 |
18 | /**
19 | * @project: elasticsearch-analysis-texsmart
20 | * @description: TexSmart analysis plugin
21 | * @author: wei_liu
22 | * @create: 2018-12-14 15:10
23 | */
24 | public class AnalysisTexSmartPlugin extends Plugin implements AnalysisPlugin {
25 | public static String PLUGIN_NAME = "analysis-texsmart";
26 |
27 | static {
28 | SecurityManager sm = System.getSecurityManager();
29 | if (sm != null) {
30 | // unprivileged code such as scripts do not have SpecialPermission
31 | sm.checkPermission(new SpecialPermission());
32 | }
33 | }
34 |
35 | @Override
36 | public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
37 | return AccessController.doPrivileged((PrivilegedAction<Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>>>) () -> {
38 | Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
39 |
40 | extra.put("texsmart", TexSmartTokenizerFactory::getTexSmartTokenizerFactory);
41 | extra.put("texsmart_standard", TexSmartTokenizerFactory::getTexSmartStandardTokenizerFactory);
42 | extra.put("texsmart_index", TexSmartTokenizerFactory::getTexSmartIndexTokenizerFactory);
43 |
44 | return extra;
45 | });
46 | }
47 |
48 | @Override
49 | public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
50 | return AccessController.doPrivileged((PrivilegedAction<Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>>>) () -> {
51 | Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
52 |
53 | extra.put("texsmart", TexSmartAnalyzerProvider::getTexSmartAnalyzerProvider);
54 | extra.put("texsmart_standard", TexSmartAnalyzerProvider::getTexSmartStandardAnalyzerProvider);
55 | extra.put("texsmart_index", TexSmartAnalyzerProvider::getTexSmartIndexAnalyzerProvider);
56 |
57 | return extra;
58 | });
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/java/tencent/ai/texsmart/CLib.java:
--------------------------------------------------------------------------------
1 | package tencent.ai.texsmart;
2 |
3 | import com.sun.jna.Library;
4 | import com.sun.jna.Native;
5 | import com.sun.jna.Pointer;
6 | import com.sun.jna.WString;
7 | /** JNA bindings for the native tencent_ai_texsmart library bundled under lib/. */
8 | public interface CLib extends Library {
9 | CLib INSTANCE = Native.loadLibrary("tencent_ai_texsmart", CLib.class);
10 |
11 | Pointer Nlu_CreateEngine(String data_dir, int worker_count);
12 | void 
Nlu_DestroyEngine(Pointer engine);
13 |
14 | Pointer Nlu_ParseTextExt(Pointer engine, WString text, int text_len, WString options);
15 | Pointer Nlu_ParseUtf8TextExt(Pointer engine, String text, int text_len, String options);
16 | void Nlu_DestroyOutput(Pointer result);
17 |
18 | Pointer Nlu_GetNormText(Pointer result, Pointer len);
19 |
20 | int Nlu_GetWordCount(Pointer result);
21 | Pointer Nlu_GetWord(Pointer result, int idx);
22 | int Nlu_GetPhraseCount(Pointer result);
23 | Pointer Nlu_GetPhrase(Pointer result, int idx);
24 | Pointer Nlu_TermStr(Pointer term);
25 | int Nlu_TermOffset(Pointer term);
26 | int Nlu_TermLen(Pointer term);
27 | Pointer Nlu_TermTag(Pointer term);
28 |
29 | int Nlu_GetEntityCount(Pointer result);
30 | Pointer Nlu_GetEntity(Pointer result, int idx);
31 | Pointer Nlu_EntityStr(Pointer entity);
32 | int Nlu_EntityOffset(Pointer entity);
33 | int Nlu_EntityLen(Pointer entity);
34 | Pointer Nlu_EntityType(Pointer entity);
35 | Pointer Nlu_EntityMeaning(Pointer entity);
36 | Pointer Nlu_EntityTypeName(Pointer entityType);
37 | Pointer Nlu_EntityTypeI18n(Pointer entityType);
38 | int Nlu_EntityTypeFlag(Pointer entityType);
39 | Pointer Nlu_EntityTypePath(Pointer entityType);
40 | }
41 |
42 |
--------------------------------------------------------------------------------
/src/main/java/tencent/ai/texsmart/NluEngine.java:
--------------------------------------------------------------------------------
1 | package tencent.ai.texsmart;
2 |
3 | import com.sun.jna.Pointer;
4 | import com.sun.jna.WString;
5 |
6 | /** Thin Java wrapper around the native TexSmart NLU engine. */
7 | public class NluEngine {
8 |
9 | /** Loads the engine from dataDir with workerCount worker threads; returns false on failure. */
10 | public boolean init(String dataDir, int workerCount) {
11 | enginePointer = CLib.INSTANCE.Nlu_CreateEngine(dataDir, workerCount);
12 | return enginePointer != null;
13 | }
14 |
15 | /**
16 | * Analyze text and get parsing results (word segmentation, POS tagging, NER, semantic expansion, etc.)
17 | * @param text: The input natural language text
18 | * @return Parsing results
19 | */
20 | public NluOutput parseText(String text) {
21 | WString options = null; // no extra parsing options
22 | WString text_wstr = new WString(text);
23 | Pointer result = CLib.INSTANCE.Nlu_ParseTextExt(enginePointer, text_wstr, text_wstr.length(), options);
24 | NluOutput output = new NluOutput();
25 | output.dataPointer = result;
26 | return output;
27 | }
28 |
29 | /**
30 | * Analyze text and get parsing results (word segmentation, POS tagging, NER, semantic expansion, etc.)
31 | * @param text: The input natural language text
32 | * @param options: Parsing options, in JSON format
33 | * @return Parsing results
34 | */
35 | public NluOutput parseText(String text, String options) {
36 | WString text_wstr = new WString(text);
37 | WString options_wstr = new WString(options);
38 | Pointer result = CLib.INSTANCE.Nlu_ParseTextExt(enginePointer, text_wstr, text_wstr.length(), options_wstr);
39 | NluOutput output = new NluOutput();
40 | output.dataPointer = result;
41 | return output;
42 | }
43 |
44 | // Best-effort release of the native engine when this wrapper is garbage-collected.
45 | protected void finalize() {
46 | if(enginePointer != null) {
47 | CLib.INSTANCE.Nlu_DestroyEngine(enginePointer);
48 | enginePointer = null;
49 | }
50 | }
51 |
52 | protected Pointer enginePointer = null;
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/java/tencent/ai/texsmart/NluOutput.java:
--------------------------------------------------------------------------------
1 | package tencent.ai.texsmart;
2 |
3 | import com.sun.jna.Pointer;
4 |
5 | import java.util.ArrayList;
6 |
7 | public class NluOutput {
8 |
9 | /** A word or a phrase */
10 | public class Term {
11 | public String str;
12 | public int offset = 0;
13 | public int len = 0;
14 | public String tag;
15 |
16 | public String toString() {
17 | return "str:" + str + " offset:" + offset + " len:" + len + " tag:" + tag;
18 | }
19 |
20 | public int length() { return this.str.length(); }
21 |
22 | /** Two terms are considered equal when they carry the same tag. */
23 | public boolean equals(Object obj) {
24 | if (obj instanceof Term) {
25 | Term term = (Term)obj;
26 | return this.tag != null && this.tag.equals(term.tag);
27 | }
28 | return super.equals(obj);
29 | }
30 | }
31 |
32 | /** Entity type information */
33 | public class EntityType {
34 | public String name;
35 | public String i18n;
36 | public int flag = 0;
37 | public String path;
38 | }
39 |
40 | /** Entity information */
41 | public class Entity {
42 | public String str;
43 | public int offset = 0;
44 | public int len = 0;
45 | public EntityType type;
46 | public String meaning;
47 | }
48 |
49 | /**
50 | * Get the normalized text
51 | * @return The normalized text
52 | */
53 | public String normText() {
54 | Pointer len = null; // the length output parameter is not needed here
55 | Pointer ptr = CLib.INSTANCE.Nlu_GetNormText(dataPointer, len);
56 | return getWideStr(ptr);
57 | }
58 |
59 | /**
60 | * Get words from the parsing result.
61 | * @return A list of words
62 | */
63 | public ArrayList<Term> words() {
64 | ArrayList<Term> termList = new ArrayList<Term>();
65 | int count = CLib.INSTANCE.Nlu_GetWordCount(dataPointer);
66 | for(int idx = 0; idx < count; idx++) {
67 | Pointer termPtr = CLib.INSTANCE.Nlu_GetWord(dataPointer, idx);
68 | Term newTerm = new Term();
69 | newTerm.str = getWideStr(CLib.INSTANCE.Nlu_TermStr(termPtr));
70 | newTerm.offset = CLib.INSTANCE.Nlu_TermOffset(termPtr);
71 | newTerm.len = CLib.INSTANCE.Nlu_TermLen(termPtr);
72 | newTerm.tag = getWideStr(CLib.INSTANCE.Nlu_TermTag(termPtr));
73 | newTerm.tag = newTerm.tag.equals("") ? "NN" : newTerm.tag; // default POS tag
74 | termList.add(newTerm);
75 | }
76 | return termList;
77 | }
78 |
79 | /**
80 | * Get phrases from the parsing result.
81 | * @return A list of phrases
82 | */
83 | public ArrayList<Term> phrases() {
84 | ArrayList<Term> termList = new ArrayList<Term>();
85 | int count = CLib.INSTANCE.Nlu_GetPhraseCount(dataPointer);
86 | for(int idx = 0; idx < count; idx++) {
87 | Pointer termPtr = CLib.INSTANCE.Nlu_GetPhrase(dataPointer, idx);
88 | Term newTerm = new Term();
89 | newTerm.str = getWideStr(CLib.INSTANCE.Nlu_TermStr(termPtr));
90 | newTerm.offset = CLib.INSTANCE.Nlu_TermOffset(termPtr);
91 | newTerm.len = CLib.INSTANCE.Nlu_TermLen(termPtr);
92 | newTerm.tag = getWideStr(CLib.INSTANCE.Nlu_TermTag(termPtr));
93 | newTerm.tag = newTerm.tag.equals("") ? "NN" : newTerm.tag;
94 | termList.add(newTerm);
95 | }
96 | return termList;
97 | }
98 |
99 | /**
100 | * Get entities from the parsing result.
101 | * @return A list of entities
102 | */
103 | public ArrayList<Entity> entities() {
104 | ArrayList<Entity> entityList = new ArrayList<Entity>();
105 | int count = CLib.INSTANCE.Nlu_GetEntityCount(dataPointer);
106 | for(int idx = 0; idx < count; idx++) {
107 | Pointer entityPtr = CLib.INSTANCE.Nlu_GetEntity(dataPointer, idx);
108 | Entity newEntity = new Entity();
109 | newEntity.str = getWideStr(CLib.INSTANCE.Nlu_EntityStr(entityPtr));
110 | newEntity.offset = CLib.INSTANCE.Nlu_EntityOffset(entityPtr);
111 | newEntity.len = CLib.INSTANCE.Nlu_EntityLen(entityPtr);
112 | newEntity.type = new EntityType();
113 | Pointer typePtr = CLib.INSTANCE.Nlu_EntityType(entityPtr);
114 | newEntity.type.name = getWideStr(CLib.INSTANCE.Nlu_EntityTypeName(typePtr));
115 | newEntity.type.i18n = getWideStr(CLib.INSTANCE.Nlu_EntityTypeI18n(typePtr));
116 | newEntity.type.flag = CLib.INSTANCE.Nlu_EntityTypeFlag(typePtr);
117 | newEntity.type.path = getWideStr(CLib.INSTANCE.Nlu_EntityTypePath(typePtr));
118 | newEntity.meaning = getWideStr(CLib.INSTANCE.Nlu_EntityMeaning(entityPtr));
119 | entityList.add(newEntity);
120 | }
121 | return entityList;
122 | }
123 |
124 | /** Releases the underlying native parsing result. */
125 | public void close() {
126 | if(dataPointer != null) {
127 | CLib.INSTANCE.Nlu_DestroyOutput(dataPointer);
128 | dataPointer = null;
129 | }
130 | }
131 |
132 | // A utility function
133 | protected String getWideStr(Pointer strPtr) {
134 | if(strPtr == null) {
135 | return null;
136 | }
137 |
138 | return strPtr.getWideString(0);
139 | }
140 |
141 | protected void finalize() {
142 | close();
143 | }
144 |
145 | protected Pointer dataPointer;
146 | }
147 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | # Elasticsearch plugin descriptor file
2 | # This file must exist as 'plugin-descriptor.properties' at
3 | # the root directory of all plugins.
4 | #
5 | # A plugin can be 'site', 'jvm', or both.
6 | #
7 | ### example site plugin for "foo":
8 | #
9 | # foo.zip <-- zip file for the plugin, with this structure:
10 | #   _site/ <-- the contents that will be served
11 | #   plugin-descriptor.properties <-- example contents below:
12 | #
13 | # site=true
14 | # description=My cool plugin
15 | # version=1.0
16 | #
17 | ### example jvm plugin for "foo"
18 | #
19 | # foo.zip <-- zip file for the plugin, with this structure:
20 | #   <arbitrary name1>.jar <-- classes, resources, dependencies
21 | #   <arbitrary nameN>.jar <-- any number of jars
22 | #   plugin-descriptor.properties <-- example contents below:
23 | #
24 | # jvm=true
25 | # classname=foo.bar.BazPlugin
26 | # description=My cool plugin
27 | # version=2.0.0-rc1
28 | # elasticsearch.version=2.0
29 | # java.version=1.7
30 | #
31 | ### mandatory elements for all plugins:
32 | #
33 | # 'description': simple summary of the plugin
34 | description=${project.description}
35 | #
36 | # 'version': plugin's version
37 | version=${project.version}
38 | #
39 | # 'name': the plugin name
40 | name=${elasticsearch.plugin.name}
41 | #
42 | # 'classname': the name of the class to load, fully-qualified.
43 | classname=${elasticsearch.plugin.classname}
44 | #
45 | # 'java.version' version of java the code is built against
46 | # use the system property java.specification.version
47 | # version string must be a sequence of nonnegative decimal integers
48 | # separated by "."'s and may have leading zeros
49 | java.version=${maven.compiler.target}
50 | #
51 | # 'elasticsearch.version' version of elasticsearch compiled against
52 | # You will have to release a new version of the plugin for each new
53 | # elasticsearch release. This version is checked when the plugin
54 | # is loaded so Elasticsearch will refuse to start in the presence of
55 | # plugins with the incorrect elasticsearch.version.
56 | elasticsearch.version=${elasticsearch.version}
57 | #
--------------------------------------------------------------------------------
/src/main/resources/plugin-security.policy:
--------------------------------------------------------------------------------
1 | grant {
2 |     // texsmart data directories
3 |     permission java.io.FilePermission "<<ALL FILES>>", "read,write,delete";
4 |
5 |     // needed because of problems in unbound LDAP library
6 |     permission java.util.PropertyPermission "*", "read,write";
7 |
8 |     // classloader
9 |     permission java.lang.RuntimePermission "setContextClassLoader";
10 |     permission java.lang.RuntimePermission "getClassLoader";
11 |     permission java.lang.RuntimePermission "accessClassInPackage.jdk.internal.loader";
12 |
13 |     // socket
14 |     permission java.net.SocketPermission "*", "connect,resolve";
15 |
16 |     permission java.lang.reflect.ReflectPermission "suppressAccessChecks";
17 |     permission java.lang.reflect.ReflectPermission "newProxyInPackage.tencent.ai.texsmart";
18 | };
--------------------------------------------------------------------------------
/src/main/resources/texsmart-remote.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
3 | <properties>
4 |     <comment>TexSmart Analyzer extension configuration</comment>
5 |     <!-- Configure a remote extension dictionary here -->
6 |     <!-- <entry key="remote_ext_dict">words_location</entry> -->
7 |     <!-- Configure a remote extension stop-word dictionary here -->
8 |     <!-- <entry key="remote_ext_stopwords">stop_words_location</entry> -->
9 | </properties>
10 |
11 |
--------------------------------------------------------------------------------
/src/main/resources/texsmart.properties:
--------------------------------------------------------------------------------
1 | root=.
2 | CoreDictionaryPath=data/nlu/kb/
3 | CustomDictionaryPath=data/nlu/kb/customization/
4 |
5 | path=/etc/elasticsearch/texsmart/data/nlu/kb/
--------------------------------------------------------------------------------
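
For reference, a minimal usage sketch of the bundled JNA wrapper (NluEngine / NluOutput under src/main/java/tencent/ai/texsmart). The data directory path, class name, and sample sentence below are illustrative assumptions, not part of the repository; point the first argument of init() at whatever directory actually holds TexSmart's data/nlu/kb resources (cf. texsmart.properties above).

// TexSmartDemo.java -- illustrative sketch only; "./data/nlu/kb" is an assumed path.
import tencent.ai.texsmart.NluEngine;
import tencent.ai.texsmart.NluOutput;

public class TexSmartDemo {
    public static void main(String[] args) {
        NluEngine engine = new NluEngine();
        // One worker per CPU core; init() returns false if the native library
        // or the knowledge-base directory cannot be loaded.
        if (!engine.init("./data/nlu/kb", Runtime.getRuntime().availableProcessors())) {
            System.err.println("TexSmart engine initialization failed");
            return;
        }
        NluOutput output = engine.parseText("上个月他在深圳看了流浪地球。");
        for (NluOutput.Term word : output.words()) {
            System.out.println(word);                        // str/offset/len/tag
        }
        for (NluOutput.Entity entity : output.entities()) {
            System.out.println(entity.str + " -> " + entity.type.name);
        }
        output.close();                                      // frees the native result
    }
}

Inside Elasticsearch the engine is not driven directly like this: AnalysisTexSmartPlugin registers the "texsmart", "texsmart_standard", and "texsmart_index" analyzers and tokenizers, which reach the same native calls through the Lucene wrapper classes under com/texsmart/lucene.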