├── README.md
├── licenses
├── lucene-LICENSE.txt
└── lucene-NOTICE.txt
├── pom.xml
└── src
└── main
├── assemblies
└── plugin.xml
├── java
└── org
│ ├── analyzer
│ └── lucene
│ │ ├── SDAnalyzer.java
│ │ └── SDTokenizer.java
│ └── elasticsearch
│ ├── index
│ └── analysis
│ │ ├── SDAnalyzerProvider.java
│ │ └── SDTokenizerFactory.java
│ └── plugin
│ └── analysis
│ └── AnalysisSDPlugin.java
└── resources
├── plugin-descriptor.properties
└── plugin-security.policy
/README.md:
--------------------------------------------------------------------------------
1 | # Stanford Core NLP Analyzer for Elasticsearch
2 |
3 | This project builds an Elasticsearch plugin that integrates the Stanford CoreNLP analyzer.
4 |
5 | ## Stanford NLP?
6 |
7 | Stanford CoreNLP is an open-source project (licensed under the GNU General Public License v3 or later)\
8 | that provides a set of human language technology tools.\
9 | As its name suggests, the project is maintained by the Stanford NLP Group.\
10 | ref:\
11 | [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html)\
12 | [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP)
13 |
14 | ## Why stanford core NLP?
15 |
16 | Yes, there are already several open-source analyzers for ES, including the popular IK and Jieba, as well as
17 | other NLP analyzers provided by various groups and companies.
18 | And yes, each of them has its advantages.
19 |
20 | But after comparing a batch of test cases across these analyzers, both open-source and commercial,
21 | we found that Stanford CoreNLP is the best fit for our project, as we need not only sentence segmentation
22 | but also sentiment analysis.
23 |
24 | ## Why this project?
25 |
26 | I searched both Google and GitHub and asked for help on a professional forum
27 | ([elasticsearch China](https://elasticsearch.cn/)),
28 | but could not find a stable project that meets this requirement.
29 |
30 | So the only choice left for me was to build a suitable plugin myself. :)
31 |
32 | ## Usage
33 |
34 | ### Install from source (git)
35 | 1. Make sure your machine has Java JDK 8, Maven, Git, etc. installed
36 | 2. git clone this project
37 | 3. build the project with Maven: `mvn clean install -e -U`
38 | 4. copy the following jar packages into the ES plugin folder:
39 | 1. the project jar, of course
40 | 1. stanford-corenlp-3.9.2.jar
41 | 1. stanford-chinese-corenlp-models-current.jar
42 | 1. commons-logging-1.2.jar
43 | 5. restart ES and enjoy
44 |
45 | ### Install from the archive file
46 | 1. download the release file
47 | 2. untar it into the ES plugin folder
48 | 3. copy the stanford-*.jar files into this folder
49 | 4. also download stanford-chinese-corenlp-models-current.jar into this folder, as it is too large to upload with the release
50 | 5. restart ES and enjoy
51 |
52 | ## Quick Example
53 | 1. Create an index
54 | ```bash
55 | curl -XPUT http://localhost:9200/index -H 'Content-Type:application/json' -d'
56 | {
57 | "settings": {
58 | "number_of_replicas": 0,
59 | "number_of_shards": 1
60 | },
61 | "mappings": {
62 | "_doc": {
63 | "properties": {
64 | "id": {
65 | "type": "integer"
66 | },
67 | "text": {
68 | "type": "text",
69 | "analyzer": "stanford-core-nlp",
70 | "search_analyzer": "stanford-core-nlp"
71 | }
72 | }
73 | }
74 | }
75 | }
76 | '
77 | ```
78 |
79 | 2. Index some docs
80 | ```bash
81 | curl -XPOST http://localhost:9200/index/_doc/1 -H 'Content-Type:application/json' -d'
82 | {"id":1, "text":"美国政府瘫痪"}
83 | '
84 | ```
85 |
86 | ```bash
87 | curl -XPOST http://localhost:9200/index/_doc/2 -H 'Content-Type:application/json' -d'
88 | {"id":2, "text":"美国和墨西哥重新签订美墨贸易协定"}
89 | '
90 | ```
91 |
92 | ```bash
93 | curl -XPOST http://localhost:9200/index/_doc/3 -H 'Content-Type:application/json' -d'
94 | {"id":3, "text":"知乎裁员意味着互联网寒冬的到来"}
95 | '
96 | ```
97 |
98 | ```bash
99 | curl -XPOST http://localhost:9200/index/_doc/4 -H 'Content-Type:application/json' -d'
100 | {"id":4, "text":"比亚迪公开被撕拖欠款项,声称欠款方伪造公章"}
101 | '
102 | ```
103 |
104 | 3. Query with highlighting
105 |
106 | ```bash
107 | curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
108 | {
109 | "query" : { "match" : { "text" : "美国" }},
110 | "highlight" : {
111 | "pre_tags" : ["", ""],
112 | "post_tags" : ["", ""],
113 | "fields" : {
114 | "text" : {}
115 | }
116 | }
117 | }
118 | '
119 | ```
120 |
121 | Result
122 | ```json
123 | {
124 | "took": 192,
125 | "timed_out": false,
126 | "_shards": {
127 | "total": 1,
128 | "successful": 1,
129 | "skipped": 0,
130 | "failed": 0
131 | },
132 | "hits": {
133 | "total": 2,
134 | "max_score": 0.92510056,
135 | "hits": [
136 | {
137 | "_index": "index",
138 | "_type": "_doc",
139 | "_id": "1",
140 | "_score": 0.92510056,
141 | "_source": {
142 | "id": 1,
143 | "text": "美国政府瘫痪"
144 | },
145 | "highlight": {
146 | "text": [
147 | "美国政府瘫痪"
148 | ]
149 | }
150 | },
151 | {
152 | "_index": "index",
153 | "_type": "_doc",
154 | "_id": "2",
155 | "_score": 0.65024257,
156 | "_source": {
157 | "id": 2,
158 | "text": "美国和墨西哥重新签订美墨贸易协定"
159 | },
160 | "highlight": {
161 | "text": [
162 | "美国和墨西哥重新签订美墨贸易协定"
163 | ]
164 | }
165 | }
166 | ]
167 | }
168 | }
169 | ```
170 |
171 | ## Troubleshooting
172 | 1. An "access denied" error (e.g. java.lang.RuntimePermission xxxx)?\
173 | -> Add these lines to plugin-security.policy
174 | * permission java.lang.RuntimePermission "*";
175 | * permission java.lang.reflect.ReflectPermission "*";
176 | -> Edit `jvm.options`
177 | * add the following option
178 | -Djava.security.policy=file://${dir of this}/plugin-security.policy
179 |
180 | 2. A "GC overhead limit exceeded" error?\
181 | -> Edit `jvm.options`
182 | * increase the Xms and Xmx values
183 |
184 |
185 |
--------------------------------------------------------------------------------
/licenses/lucene-LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
204 |
205 |
206 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
207 | derived from unicode conversion examples available at
208 | http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright
209 | from those sources:
210 |
211 | /*
212 | * Copyright 2001-2004 Unicode, Inc.
213 | *
214 | * Disclaimer
215 | *
216 | * This source code is provided as is by Unicode, Inc. No claims are
217 | * made as to fitness for any particular purpose. No warranties of any
218 | * kind are expressed or implied. The recipient agrees to determine
219 | * applicability of information provided. If this file has been
220 | * purchased on magnetic or optical media from Unicode, Inc., the
221 | * sole remedy for any claim will be exchange of defective media
222 | * within 90 days of receipt.
223 | *
224 | * Limitations on Rights to Redistribute This Code
225 | *
226 | * Unicode, Inc. hereby grants the right to freely use the information
227 | * supplied in this file in the creation of products supporting the
228 | * Unicode Standard, and to make copies of this file in any form
229 | * for internal or external distribution as long as this notice
230 | * remains attached.
231 | */
232 |
233 |
234 | Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was
235 | derived from Python 2.4.2 sources available at
236 | http://www.python.org. Full license is here:
237 |
238 | http://www.python.org/download/releases/2.4.2/license/
239 |
240 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
241 | derived from Python 3.1.2 sources available at
242 | http://www.python.org. Full license is here:
243 |
244 | http://www.python.org/download/releases/3.1.2/license/
245 |
246 | Some code in core/src/java/org/apache/lucene/util/automaton was
247 | derived from Brics automaton sources available at
248 | www.brics.dk/automaton/. Here is the copyright from those sources:
249 |
250 | /*
251 | * Copyright (c) 2001-2009 Anders Moeller
252 | * All rights reserved.
253 | *
254 | * Redistribution and use in source and binary forms, with or without
255 | * modification, are permitted provided that the following conditions
256 | * are met:
257 | * 1. Redistributions of source code must retain the above copyright
258 | * notice, this list of conditions and the following disclaimer.
259 | * 2. Redistributions in binary form must reproduce the above copyright
260 | * notice, this list of conditions and the following disclaimer in the
261 | * documentation and/or other materials provided with the distribution.
262 | * 3. The name of the author may not be used to endorse or promote products
263 | * derived from this software without specific prior written permission.
264 | *
265 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
266 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
267 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
268 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
269 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
270 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
271 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
272 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
273 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
274 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
275 | */
276 |
277 | The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton
278 | were automatically generated with the moman/finenight FSA package.
279 | Here is the copyright for those sources:
280 |
281 | # Copyright (c) 2010, Jean-Philippe Barrette-LaPierre,
282 | #
283 | # Permission is hereby granted, free of charge, to any person
284 | # obtaining a copy of this software and associated documentation
285 | # files (the "Software"), to deal in the Software without
286 | # restriction, including without limitation the rights to use,
287 | # copy, modify, merge, publish, distribute, sublicense, and/or sell
288 | # copies of the Software, and to permit persons to whom the
289 | # Software is furnished to do so, subject to the following
290 | # conditions:
291 | #
292 | # The above copyright notice and this permission notice shall be
293 | # included in all copies or substantial portions of the Software.
294 | #
295 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
296 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
297 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
298 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
299 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
300 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
301 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
302 | # OTHER DEALINGS IN THE SOFTWARE.
303 |
304 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
305 | derived from ICU (http://www.icu-project.org)
306 | The full license is available here:
307 | http://source.icu-project.org/repos/icu/icu/trunk/license.html
308 |
309 | /*
310 | * Copyright (C) 1999-2010, International Business Machines
311 | * Corporation and others. All Rights Reserved.
312 | *
313 | * Permission is hereby granted, free of charge, to any person obtaining a copy
314 | * of this software and associated documentation files (the "Software"), to deal
315 | * in the Software without restriction, including without limitation the rights
316 | * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
317 | * Software, and to permit persons to whom the Software is furnished to do so,
318 | * provided that the above copyright notice(s) and this permission notice appear
319 | * in all copies of the Software and that both the above copyright notice(s) and
320 | * this permission notice appear in supporting documentation.
321 | *
322 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
323 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
324 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
325 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
326 | * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
327 | * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
328 | * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
329 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
330 | *
331 | * Except as contained in this notice, the name of a copyright holder shall not
332 | * be used in advertising or otherwise to promote the sale, use or other
333 | * dealings in this Software without prior written authorization of the
334 | * copyright holder.
335 | */
336 |
337 | The following license applies to the Snowball stemmers:
338 |
339 | Copyright (c) 2001, Dr Martin Porter
340 | Copyright (c) 2002, Richard Boulton
341 | All rights reserved.
342 |
343 | Redistribution and use in source and binary forms, with or without
344 | modification, are permitted provided that the following conditions are met:
345 |
346 | * Redistributions of source code must retain the above copyright notice,
347 | * this list of conditions and the following disclaimer.
348 | * Redistributions in binary form must reproduce the above copyright
349 | * notice, this list of conditions and the following disclaimer in the
350 | * documentation and/or other materials provided with the distribution.
351 | * Neither the name of the copyright holders nor the names of its contributors
352 | * may be used to endorse or promote products derived from this software
353 | * without specific prior written permission.
354 |
355 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
356 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
357 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
358 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
359 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
360 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
361 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
362 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
363 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
364 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
365 |
366 | The following license applies to the KStemmer:
367 |
368 | Copyright © 2003,
369 | Center for Intelligent Information Retrieval,
370 | University of Massachusetts, Amherst.
371 | All rights reserved.
372 |
373 | Redistribution and use in source and binary forms, with or without modification,
374 | are permitted provided that the following conditions are met:
375 |
376 | 1. Redistributions of source code must retain the above copyright notice, this
377 | list of conditions and the following disclaimer.
378 |
379 | 2. Redistributions in binary form must reproduce the above copyright notice,
380 | this list of conditions and the following disclaimer in the documentation
381 | and/or other materials provided with the distribution.
382 |
383 | 3. The names "Center for Intelligent Information Retrieval" and
384 | "University of Massachusetts" must not be used to endorse or promote products
385 | derived from this software without prior written permission. To obtain
386 | permission, contact info@ciir.cs.umass.edu.
387 |
388 | THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
389 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
390 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
391 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
392 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
393 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
394 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
395 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
396 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
397 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
398 | SUCH DAMAGE.
399 |
400 | The following license applies to the Morfologik project:
401 |
402 | Copyright (c) 2006 Dawid Weiss
403 | Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
404 | All rights reserved.
405 |
406 | Redistribution and use in source and binary forms, with or without modification,
407 | are permitted provided that the following conditions are met:
408 |
409 | * Redistributions of source code must retain the above copyright notice,
410 | this list of conditions and the following disclaimer.
411 |
412 | * Redistributions in binary form must reproduce the above copyright notice,
413 | this list of conditions and the following disclaimer in the documentation
414 | and/or other materials provided with the distribution.
415 |
416 | * Neither the name of Morfologik nor the names of its contributors
417 | may be used to endorse or promote products derived from this software
418 | without specific prior written permission.
419 |
420 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
421 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
422 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
423 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
424 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
425 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
426 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
427 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
428 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
429 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
430 |
431 | ---
432 |
433 | The dictionary comes from Morfologik project. Morfologik uses data from
434 | Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and
435 | is licenced on the terms of (inter alia) LGPL and Creative Commons
436 | ShareAlike. The part-of-speech tags were added in Morfologik project and
437 | are not found in the data from sjp.pl. The tagset is similar to IPI PAN
438 | tagset.
439 |
440 | ---
441 |
442 | The following license applies to the Morfeusz project,
443 | used by org.apache.lucene.analysis.morfologik.
444 |
445 | BSD-licensed dictionary of Polish (SGJP)
446 | http://sgjp.pl/morfeusz/
447 |
448 | Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
449 | Marcin Woliński, Robert Wołosz
450 |
451 | All rights reserved.
452 |
453 | Redistribution and use in source and binary forms, with or without
454 | modification, are permitted provided that the following conditions are
455 | met:
456 |
457 | 1. Redistributions of source code must retain the above copyright
458 | notice, this list of conditions and the following disclaimer.
459 |
460 | 2. Redistributions in binary form must reproduce the above copyright
461 | notice, this list of conditions and the following disclaimer in the
462 | documentation and/or other materials provided with the
463 | distribution.
464 |
465 | THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
466 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
467 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
468 | DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
469 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
470 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
471 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
472 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
473 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
474 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
475 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
476 |
--------------------------------------------------------------------------------
/licenses/lucene-NOTICE.txt:
--------------------------------------------------------------------------------
1 | Apache Lucene
2 | Copyright 2014 The Apache Software Foundation
3 |
4 | This product includes software developed at
5 | The Apache Software Foundation (http://www.apache.org/).
6 |
7 | Includes software from other Apache Software Foundation projects,
8 | including, but not limited to:
9 | - Apache Ant
10 | - Apache Jakarta Regexp
11 | - Apache Commons
12 | - Apache Xerces
13 |
14 | ICU4J, (under analysis/icu) is licensed under an MIT styles license
15 | and Copyright (c) 1995-2008 International Business Machines Corporation and others
16 |
17 | Some data files (under analysis/icu/src/data) are derived from Unicode data such
18 | as the Unicode Character Database. See http://unicode.org/copyright.html for more
19 | details.
20 |
21 | Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
22 | BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/
23 |
24 | The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were
25 | automatically generated with the moman/finenight FSA library, created by
26 | Jean-Philippe Barrette-LaPierre. This library is available under an MIT license,
27 | see http://sites.google.com/site/rrettesite/moman and
28 | http://bitbucket.org/jpbarrette/moman/overview/
29 |
30 | The class org.apache.lucene.util.WeakIdentityMap was derived from
31 | the Apache CXF project and is Apache License 2.0.
32 |
33 | The Google Code Prettify is Apache License 2.0.
34 | See http://code.google.com/p/google-code-prettify/
35 |
36 | JUnit (junit-4.10) is licensed under the Common Public License v. 1.0
37 | See http://junit.sourceforge.net/cpl-v10.html
38 |
39 | This product includes code (JaspellTernarySearchTrie) from Java Spelling Checking
40 | Package (jaspell): http://jaspell.sourceforge.net/
41 | License: The BSD License (http://www.opensource.org/licenses/bsd-license.php)
42 |
43 | The snowball stemmers in
44 | analysis/common/src/java/net/sf/snowball
45 | were developed by Martin Porter and Richard Boulton.
46 | The snowball stopword lists in
47 | analysis/common/src/resources/org/apache/lucene/analysis/snowball
48 | were developed by Martin Porter and Richard Boulton.
49 | The full snowball package is available from
50 | http://snowball.tartarus.org/
51 |
52 | The KStem stemmer in
53 | analysis/common/src/org/apache/lucene/analysis/en
54 | was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
55 | under the BSD-license.
56 |
57 | The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
58 | stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
59 | analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
60 | analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
61 | analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
62 | analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
63 | analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
64 | See http://members.unine.ch/jacques.savoy/clef/index.html.
65 |
66 | The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
67 | (common) are based on BSD-licensed reference implementations created by Jacques Savoy and
68 | Ljiljana Dolamic. These files reside in:
69 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java
70 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java
71 | analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java
72 | analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java
73 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java
74 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java
75 | analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
76 | analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
77 | analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
78 | analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
79 | analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
80 |
81 | The Stempel analyzer (stempel) includes BSD-licensed software developed
82 | by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
83 | and Edmond Nolan.
84 |
85 | The Polish analyzer (stempel) comes with a default
86 | stopword list that is BSD-licensed created by the Carrot2 project. The file resides
87 | in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
88 | See http://project.carrot2.org/license.html.
89 |
90 | The SmartChineseAnalyzer source code (smartcn) was
91 | provided by Xiaoping Gao and copyright 2009 by www.imdict.net.
92 |
93 | WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
94 | is derived from Unicode data such as the Unicode Character Database.
95 | See http://unicode.org/copyright.html for more details.
96 |
97 | The Morfologik analyzer (morfologik) includes BSD-licensed software
98 | developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
99 |
100 | Morfologik uses data from Polish ispell/myspell dictionary
101 | (http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
102 | LGPL and Creative Commons ShareAlike.
103 |
104 | Morfologik includes data from BSD-licensed dictionary of Polish (SGJP)
105 | (http://sgjp.pl/morfeusz/)
106 |
107 | Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original
108 | source code for this can be found at http://www.eclipse.org/jetty/downloads.php
109 |
110 | ===========================================================================
111 | Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
112 | ===========================================================================
113 |
114 | This software includes a binary and/or source version of data from
115 |
116 | mecab-ipadic-2.7.0-20070801
117 |
118 | which can be obtained from
119 |
120 | http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz
121 |
122 | or
123 |
124 | http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
125 |
126 | ===========================================================================
127 | mecab-ipadic-2.7.0-20070801 Notice
128 | ===========================================================================
129 |
130 | Nara Institute of Science and Technology (NAIST),
131 | the copyright holders, disclaims all warranties with regard to this
132 | software, including all implied warranties of merchantability and
133 | fitness, in no event shall NAIST be liable for
134 | any special, indirect or consequential damages or any damages
135 | whatsoever resulting from loss of use, data or profits, whether in an
136 | action of contract, negligence or other tortuous action, arising out
137 | of or in connection with the use or performance of this software.
138 |
139 | A large portion of the dictionary entries
140 | originate from ICOT Free Software. The following conditions for ICOT
141 | Free Software applies to the current dictionary as well.
142 |
143 | Each User may also freely distribute the Program, whether in its
144 | original form or modified, to any third party or parties, PROVIDED
145 | that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
146 | on, or be attached to, the Program, which is distributed substantially
147 | in the same form as set out herein and that such intended
148 | distribution, if actually made, will neither violate or otherwise
149 | contravene any of the laws and regulations of the countries having
150 | jurisdiction over the User or the intended distribution itself.
151 |
152 | NO WARRANTY
153 |
154 | The program was produced on an experimental basis in the course of the
155 | research and development conducted during the project and is provided
156 | to users as so produced on an experimental basis. Accordingly, the
157 | program is provided without any warranty whatsoever, whether express,
158 | implied, statutory or otherwise. The term "warranty" used herein
159 | includes, but is not limited to, any warranty of the quality,
160 | performance, merchantability and fitness for a particular purpose of
161 | the program and the nonexistence of any infringement or violation of
162 | any right of any third party.
163 |
164 | Each user of the program will agree and understand, and be deemed to
165 | have agreed and understood, that there is no warranty whatsoever for
166 | the program and, accordingly, the entire risk arising from or
167 | otherwise connected with the program is assumed by the user.
168 |
169 | Therefore, neither ICOT, the copyright holder, or any other
170 | organization that participated in or was otherwise related to the
171 | development of the program and their respective officials, directors,
172 | officers and other employees shall be held liable for any and all
173 | damages, including, without limitation, general, special, incidental
174 | and consequential damages, arising out of or otherwise in connection
175 | with the use or inability to use the program or any product, material
176 | or result produced or otherwise obtained by using the program,
177 | regardless of whether they have been advised of, or otherwise had
178 | knowledge of, the possibility of such damages at any time during the
179 | project or thereafter. Each user will be deemed to have agreed to the
180 | foregoing by his or her commencement of use of the program. The term
181 | "use" as used herein includes, but is not limited to, the use,
182 | modification, copying and distribution of the program and the
183 | production of secondary products from the program.
184 |
185 | In the case where the program, whether in its original form or
186 | modified, was distributed or delivered to or received by a user from
187 | any person, organization or entity other than ICOT, unless it makes or
188 | grants independently of ICOT any specific warranty to the user in
189 | writing, such person, organization or entity, will also be exempted
190 | from and not be held liable to the user for any such damages as noted
191 | above as far as the program is concerned.
192 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 | godLockin
7 | esStanfordNLPAnalyzer
8 | ${elasticsearch.version}
9 | jar
10 | Stanford Core NLP Analyzer for Elasticsearch
11 | 2018
12 |
13 |
14 |
15 | The Apache Software License, Version 2.0
16 | http://www.apache.org/licenses/LICENSE-2.0.txt
17 | repo
18 |
19 |
20 |
21 |
22 |
23 | Steven
24 | stevenchenworking@gmail.com
25 | godLockin
26 |
27 |
28 |
29 |
30 | scm:git:git@github.com:godlockin/esStanfordNLPAnalyzer.git
31 | scm:git:git@github.com:godlockin/esStanfordNLPAnalyzer.git
32 | https://github.com/godlockin/esStanfordNLPAnalyzer
33 |
34 |
35 |
36 | 1.8
37 | 1.8
38 | 6.5.0
39 | ${project.basedir}/src/main/assemblies/plugin.xml
40 | analysis-stanford-nlp
41 | org.elasticsearch.plugin.analysis.AnalysisSDPlugin
42 | true
43 |
44 | 3.9.2
45 | 3.5.1
46 | 2.8
47 |
48 |
49 |
50 |
51 | org.elasticsearch
52 | elasticsearch
53 | ${elasticsearch.version}
54 | compile
55 |
56 |
57 |
58 | edu.stanford.nlp
59 | stanford-corenlp
60 | ${stanfordNLP.version}
61 |
62 |
63 | edu.stanford.nlp
64 | stanford-corenlp
65 | ${stanfordNLP.version}
66 | models
67 |
68 |
69 | edu.stanford.nlp
70 | stanford-corenlp
71 | ${stanfordNLP.version}
72 | models-chinese
73 |
74 |
75 |
76 |
77 |
78 |
79 | org.apache.maven.plugins
80 | maven-compiler-plugin
81 | ${org.apache.maven.compiler.plugin.version}
82 |
83 | ${maven.compiler.target}
84 | ${maven.compiler.target}
85 |
86 |
87 |
88 | org.apache.maven.plugins
89 | maven-dependency-plugin
90 | ${org.apache.maven.dependency.plugin.version}
91 |
92 |
93 | copy
94 | package
95 |
96 | copy
97 |
98 |
99 |
100 |
101 | edu.stanford.nlp
102 | stanford-corenlp
103 | ${stanfordNLP.version}
104 | models
105 | jar
106 | true
107 | ${basedir}/target
108 | stanford-corenlp-models-${stanfordNLP.version}.jar
109 |
110 |
111 | false
112 | true
113 |
114 |
115 |
116 |
117 |
118 |
119 |
--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | analysis-stanfordNLP
4 |
5 | zip
6 |
7 | false
8 |
9 |
10 | ${project.basedir}/config
11 | config
12 |
13 |
14 |
15 |
16 |
17 | ${project.basedir}/src/main/resources/plugin-descriptor.properties
18 |
19 | true
20 |
21 |
22 | ${project.basedir}/src/main/resources/plugin-security.policy
23 |
24 | true
25 |
26 |
27 |
28 |
29 |
30 | true
31 | true
32 |
33 | org.elasticsearch:elasticsearch
34 |
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/src/main/java/org/analyzer/lucene/SDAnalyzer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Stanford Core NLP 中文分词器 版本 1.0
3 | * Stanford Core NLP Analyzer Release 1.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供
21 | * provided by Steven Chen
22 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html)
23 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP)
24 | */
25 | package org.analyzer.lucene;
26 |
27 | import org.apache.lucene.analysis.Analyzer;
28 |
29 | public final class SDAnalyzer extends Analyzer {
30 |
31 | public SDAnalyzer(){ }
32 |
33 | @Override
34 | protected TokenStreamComponents createComponents(String fieldName) {
35 | return new TokenStreamComponents(new SDTokenizer());
36 | }
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/org/analyzer/lucene/SDTokenizer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Stanford Core NLP 中文分词器 版本 1.0
3 | * Stanford Core NLP Analyzer Release 1.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供
21 | * provided by Steven Chen
22 | * Ref: Stanford Core NLP project
23 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html)
24 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP)
25 | */
26 | package org.analyzer.lucene;
27 |
28 | import edu.stanford.nlp.ling.CoreAnnotations;
29 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
30 | import org.apache.logging.log4j.LogManager;
31 | import org.apache.logging.log4j.Logger;
32 | import org.apache.lucene.analysis.Tokenizer;
33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
35 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
36 | import org.elasticsearch.SpecialPermission;
37 |
38 | import java.io.BufferedReader;
39 | import java.io.IOException;
40 | import java.security.AccessController;
41 | import java.security.PrivilegedAction;
42 | import java.util.*;
43 | import java.util.stream.Collectors;
44 | import java.util.stream.Stream;
45 |
46 | public final class SDTokenizer extends Tokenizer {
47 |
48 | private final CharTermAttribute termAtt;
49 | private final OffsetAttribute offsetAtt;
50 | private int endPosition;
51 | private Iterator wordsIter = Collections.emptyIterator();
52 |
53 | private Set ignoreSymbols = new HashSet(Stream.of(("囧 ⊙ ● ○ ⊕ ◎ Θ ⊙ ¤ ㈱ ㊣ ★ ☆ ♀ ◆ ◇ ◣ ◢ ◥ ▲ ▼ △ ▽ ⊿ ◤ ◥ ▂ ▃ " +
54 | "▄ ▅ ▆ ▇ █ █ ■ ▓ □ 〓 ≡ ╝ ╚ ╔ ╗ ╬ ═ ╓ ╩ ┠ ┨ ┯ ┷ ┏ ┓ ┗ ┛ ┳ ⊥ 『 』 ┌ ┐ └ ┘ ∟ 「 」 ↑ ↓ → ← ↘ ↙ ♀ ♂ ┇ " +
55 | "┅ ﹉ ﹊ ﹍ ﹎ ╭ ╮ ╰ ╯ *^_^* ^*^ ^-^ ^_^ ^(^ ∵ ∴ ‖ | | ︴ ﹏ ﹋ ﹌ ( ) 〔 〕 【 】 〖 〗 @ : ! / \\ \" " +
56 | "_ < > ` , · 。 ≈ { } ~ ~ ( ) _ -『 』 √ $ @ * & # ※ 卐 々 ∞ Ψ ∪ ∩ ∈ ∏ の ℡ ぁ § ∮ ” 〃 ミ 灬 ξ № ∑ ⌒ ξ ζ ω * " +
57 | "\uE7E7 \uE7F3 ㄨ ≮ ≯ + - × ÷ + - ± / = ∫ ∮ ∝ ∞ ∧ ∨ ∑ ∏ ‖ ∠ ≌ ∽ ≤ ≥ ≈ < > じ ☆ ↑ ↓ ⊙ ● ★ ☆ ■ ♀ 『 』 Ψ" +
58 | " ※ → № ← ㊣ ∑ ⌒ 〖 〗 @ ξ ζ ω □ ∮ 〓 ※ ∴ ぷ ∏ 卐 【 】 △ √ ∩ ¤ 々 ♀ ♂ ∞ ① ㄨ ≡ ↘ ↙ ┗ ┛ ╰ ☆ ╮ ① ② ③ ④ ⑤ ⑥ ⑦ ⑧ ⑨ ⑩ " +
59 | "⑴ ⑵ ⑶ ⑷ ⑸ ⑹ ⑺ ⑻ ⑼ ⑽ ⑾ ⑿ ⒀ ⒁ ⒂ ⒃ ⒄ ⒅ ⒆ ⒇ 丨 丩 丬 丶 丷 丿 乀 乙 乂 乄 乆 乛 亅 亠 亻 冂 冫 冖 凵" +
60 | "\uE81C \uE81D \uE815 \uE816 \uE817 \uE818 \uE819 \uE81E \uE822 \uE823 \uE82B \uE82C\uE830 \uE831 \uE832 \uE833 \uE836 \uE838 \uE839 \uE83A \uE83B \uE83E \uE848 \uE81A \uE81B" +
61 | " 、 。 . ? ! ~ $ % @ & # * ? ; ∶ … ¨ , · ˙ ? ‘ ’ “ ” ” 〃 ‘ ′ 〃 ↑ ↓ ← → ↖ ↗ ↙ ↘ ㊣ ◎ ○ ● ⊕ ⊙ ○ ● △ ▲ ☆ ★ ◇ ◆ □ ■ ▽ ▼ § ¥ 〒" +
62 | " ¢ £ ※ ♀ ♂ α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ σ τ υ φ χ ψ ω C").split(" ")).filter(x -> !(null == x || "".equals(x.trim()))).collect(Collectors.toList()));
63 | private PositionIncrementAttribute posIncrAtt;
64 |
65 | private int increment = 0;
66 | private StanfordCoreNLP pipeline;
67 |
68 | public SDTokenizer(){
69 | super();
70 |
71 | offsetAtt = addAttribute(OffsetAttribute.class);
72 | termAtt = addAttribute(CharTermAttribute.class);
73 | posIncrAtt = addAttribute(PositionIncrementAttribute.class);
74 | }
75 |
76 | private StanfordCoreNLP instance() {
77 | if (null == pipeline) {
78 | synchronized (SDTokenizer.class) {
79 | if (null == pipeline) {
80 | Properties props = new Properties();
81 | props.setProperty("annotators", "tokenize, ssplit");
82 | props.setProperty("tokenize.language", "zh");
83 | props.setProperty("segment.model", "edu/stanford/nlp/models/segmenter/chinese/ctb.gz");
84 | props.setProperty("segment.sighanCorporaDict", "edu/stanford/nlp/models/segmenter/chinese");
85 | props.setProperty("segment.serDictionary", "edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz");
86 | props.setProperty("segment.sighanPostProcessing", "true");
87 | props.setProperty("ssplit.boundaryTokenRegex", "[.。]|[!?!?]+");
88 |
89 | SpecialPermission.check();
90 | pipeline = AccessController.doPrivileged((PrivilegedAction) () -> new StanfordCoreNLP(props));
91 | return pipeline;
92 | }
93 | }
94 | }
95 | return pipeline;
96 | }
97 |
98 | @Override
99 | public boolean incrementToken() {
100 | clearAttributes();
101 |
102 | if(wordsIter.hasNext()){
103 | String word = wordsIter.next();
104 | int wordLength = word.length();
105 | posIncrAtt.setPositionIncrement(increment + 1);
106 | termAtt.append(word);
107 | termAtt.setLength(wordLength);
108 | offsetAtt.setOffset(endPosition + 1, endPosition + 1 + wordLength);
109 | endPosition += wordLength;
110 | return true;
111 | }
112 | return false;
113 | }
114 |
115 | @Override
116 | public void reset() throws IOException {
117 | super.reset();
118 | // reset the input content
119 | endPosition = -1;
120 | increment = 0;
121 |
122 | List words = new ArrayList<>();
123 | try (BufferedReader br = new BufferedReader(input)) {
124 | String temp;
125 | StringBuilder stringBuilder = new StringBuilder();
126 | while ((temp = br.readLine()) != null) {
127 | stringBuilder.append(temp.trim());
128 | }
129 |
130 | words = instance().process(stringBuilder.toString().trim())
131 | .get(CoreAnnotations.TokensAnnotation.class)
132 | .stream().map(x -> x.get(CoreAnnotations.TextAnnotation.class))
133 | .filter(x -> !ignoreSymbols.contains(x))
134 | .collect(Collectors.toList());
135 | } catch (IOException e) {
136 | e.printStackTrace();
137 | } finally {
138 | wordsIter = words.iterator();
139 | }
140 | }
141 |
142 | @Override
143 | public final void end() throws IOException {
144 | super.end();
145 |
146 | // set final offset
147 | int finalOffset = correctOffset(this.endPosition);
148 | offsetAtt.setOffset(finalOffset, finalOffset);
149 | posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + increment);
150 | }
151 | }
152 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/SDAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Stanford Core NLP 中文分词器 版本 1.0
3 | * Stanford Core NLP Analyzer Release 1.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供
21 | * provided by Steven Chen
22 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html)
23 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP)
24 | */
25 | package org.elasticsearch.index.analysis;
26 |
27 | import org.elasticsearch.common.settings.Settings;
28 | import org.elasticsearch.env.Environment;
29 | import org.elasticsearch.index.IndexSettings;
30 | import org.analyzer.lucene.SDAnalyzer;
31 |
32 | public class SDAnalyzerProvider extends AbstractIndexAnalyzerProvider {
33 |
34 | public SDAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
35 | super(indexSettings, name, settings);
36 | }
37 |
38 | @Override
39 | public SDAnalyzer get() {
40 | return new SDAnalyzer();
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/SDTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Stanford Core NLP 中文分词器 版本 1.0
3 | * Stanford Core NLP Analyzer Release 1.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供
21 | * provided by Steven Chen
22 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html)
23 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP)
24 | */
25 | package org.elasticsearch.index.analysis;
26 |
27 | import org.apache.lucene.analysis.Tokenizer;
28 | import org.elasticsearch.common.settings.Settings;
29 | import org.elasticsearch.env.Environment;
30 | import org.elasticsearch.index.IndexSettings;
31 | import org.analyzer.lucene.SDTokenizer;
32 |
33 | public class SDTokenizerFactory extends AbstractTokenizerFactory {
34 |
35 | public SDTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
36 | super(indexSettings, name, settings);
37 | }
38 |
39 | @Override
40 | public Tokenizer create() {
41 | return new SDTokenizer();
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/AnalysisSDPlugin.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Stanford Core NLP 中文分词器 版本 1.0
3 | * Stanford Core NLP Analyzer Release 1.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供
21 | * provided by Steven Chen
22 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html)
23 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP)
24 | */
25 | package org.elasticsearch.plugin.analysis;
26 |
27 | import org.apache.lucene.analysis.Analyzer;
28 | import org.elasticsearch.index.analysis.AnalyzerProvider;
29 | import org.elasticsearch.index.analysis.SDAnalyzerProvider;
30 | import org.elasticsearch.index.analysis.SDTokenizerFactory;
31 | import org.elasticsearch.index.analysis.TokenizerFactory;
32 | import org.elasticsearch.indices.analysis.AnalysisModule;
33 | import org.elasticsearch.plugins.AnalysisPlugin;
34 | import org.elasticsearch.plugins.Plugin;
35 |
36 | import java.util.HashMap;
37 | import java.util.Map;
38 |
39 | public class AnalysisSDPlugin extends Plugin implements AnalysisPlugin {
40 |
41 | private static final String PLUGIN_NAME = "stanford-core-nlp";
42 |
43 | @Override
44 | public Map> getTokenizers() {
45 | Map> extra = new HashMap<>();
46 | extra.put(PLUGIN_NAME, SDTokenizerFactory::new);
47 | return extra;
48 | }
49 |
50 | @Override
51 | public Map>> getAnalyzers() {
52 | Map>> extra = new HashMap<>();
53 | extra.put(PLUGIN_NAME, SDAnalyzerProvider::new);
54 | return extra;
55 | }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | # Elasticsearch plugin descriptor file
2 | # This file must exist as 'plugin-descriptor.properties' at
3 | # the root directory of all plugins.
4 | #
5 | # A plugin can be 'site', 'jvm', or both.
6 | #
7 | ### example site plugin for "foo":
8 | #
9 | # foo.zip <-- zip file for the plugin, with this structure:
10 | # _site/ <-- the contents that will be served
11 | # plugin-descriptor.properties <-- example contents below:
12 | #
13 | # site=true
14 | # description=My cool plugin
15 | # version=1.0
16 | #
17 | ### example jvm plugin for "foo"
18 | #
19 | # foo.zip <-- zip file for the plugin, with this structure:
20 | # .jar <-- classes, resources, dependencies
21 | # .jar <-- any number of jars
22 | # plugin-descriptor.properties <-- example contents below:
23 | #
24 | # jvm=true
25 | # classname=foo.bar.BazPlugin
26 | # description=My cool plugin
27 | # version=2.0.0-rc1
28 | # elasticsearch.version=2.0
29 | # java.version=1.7
30 | #
31 | ### mandatory elements for all plugins:
32 | #
33 | # 'description': simple summary of the plugin
34 | description=${project.description}
35 | #
36 | # 'version': plugin's version
37 | version=6.5.4
38 | #${project.version}
39 | #
40 | # 'name': the plugin name
41 | name=stanford-core-nlp
42 | #${elasticsearch.plugin.name}
43 | #
44 | # 'classname': the name of the class to load, fully-qualified.
45 | classname=org.elasticsearch.plugin.analysis.AnalysisSDPlugin
46 | #${elasticsearch.plugin.classname}
47 | #
48 | # 'java.version' version of java the code is built against
49 | # use the system property java.specification.version
50 | # version string must be a sequence of nonnegative decimal integers
51 | # separated by "."'s and may have leading zeros
52 | java.version=1.8
53 | #${maven.compiler.target}
54 | #
55 | # 'elasticsearch.version' version of elasticsearch compiled against
56 | # You will have to release a new version of the plugin for each new
57 | # elasticsearch release. This version is checked when the plugin
58 | # is loaded so Elasticsearch will refuse to start in the presence of
59 | # plugins with the incorrect elasticsearch.version.
60 | elasticsearch.version=6.5.4
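61 | #${elasticsearch.version} -- NOTE(review): pom.xml appears to set elasticsearch.version to 6.5.0 while this file hard-codes 6.5.4; confirm which one is intended and keep them in sync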
62 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-security.policy:
--------------------------------------------------------------------------------
1 | grant {
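2 | // NOTE(review): the original comment cited "hot reload functionality", which this plugin does not appear to implement; these wildcard grants presumably serve the privileged StanfordCoreNLP construction in SDTokenizer -- confirm and narrow them if possible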
3 | permission java.lang.RuntimePermission "*";
4 | permission java.lang.reflect.ReflectPermission "*";
5 | };
--------------------------------------------------------------------------------